author | Eiji Ota <Eiji.Ota@Sun.COM> | 2010-06-22 11:08:53 -0700
committer | Eiji Ota <Eiji.Ota@Sun.COM> | 2010-06-22 11:08:53 -0700
commit | 5d5562f583b2b6affe19bdce0b3c8b1840d667a4 (patch)
tree | 47a1b89fee7e60eb9c7dbae60a1b38c86c95818b /usr/src
parent | 5d41001a9500cedff204767de21831d30c455733 (diff)
download | illumos-gate-5d5562f583b2b6affe19bdce0b3c8b1840d667a4.tar.gz
6928074 Need to improve interrupt to tasklet handover mechanism in Solaris RDSv3
6947377 Need to bind receive tasklet thread to multiple CPUS.
6947384 Multiple taskq threads required for rdsv3 worker
6949013 Need FMR pooling to improve the performance of rdsv3_ib_free_mr
6950897 Need to optimize rdsv3_poll_cq() for performance
6952827 upgrade to Linux rds 1.5.1-dev
6954762 convert event processing worker threads from taskq's to kthread's
6958691 RDSv3 package has the wrong category
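
Several of these fixes (6928074, 6947377, 6954762) revolve around one mechanism: instead of processing completions in interrupt context or on shared taskqs, the interrupt handler now just fires a dedicated worker thread via rdsv3_af_thr_fire(). The new rdsv3_af_thr.c added to Makefile.files below implements that facility, but its body is not part of this diff, so the following is only a minimal userland pthreads sketch of the handover pattern; af_thr_t, af_thr_loop and the field names are invented for illustration.

#include <pthread.h>
#include <stdint.h>

typedef struct af_thr {
	pthread_mutex_t	at_lock;
	pthread_cond_t	at_cv;
	uint32_t	at_pending;	/* fires not yet drained */
	int		at_exit;
	void		(*at_drain)(void *);	/* e.g. poll a CQ */
	void		*at_arg;
	pthread_t	at_tid;
} af_thr_t;

static void *
af_thr_loop(void *arg)
{
	af_thr_t *thr = arg;

	pthread_mutex_lock(&thr->at_lock);
	for (;;) {
		while (thr->at_pending == 0 && !thr->at_exit)
			pthread_cond_wait(&thr->at_cv, &thr->at_lock);
		if (thr->at_exit)
			break;
		thr->at_pending = 0;
		pthread_mutex_unlock(&thr->at_lock);
		thr->at_drain(thr->at_arg);	/* heavy work, lock dropped */
		pthread_mutex_lock(&thr->at_lock);
	}
	pthread_mutex_unlock(&thr->at_lock);
	return (NULL);
}

int
af_thr_create(af_thr_t *thr, void (*drain)(void *), void *arg)
{
	pthread_mutex_init(&thr->at_lock, NULL);
	pthread_cond_init(&thr->at_cv, NULL);
	thr->at_pending = 0;
	thr->at_exit = 0;
	thr->at_drain = drain;
	thr->at_arg = arg;
	return (pthread_create(&thr->at_tid, NULL, af_thr_loop, thr));
}

/* Called from the "interrupt" path: cheap, never blocks on the drain work. */
void
af_thr_fire(af_thr_t *thr)
{
	pthread_mutex_lock(&thr->at_lock);
	thr->at_pending++;
	pthread_cond_signal(&thr->at_cv);
	pthread_mutex_unlock(&thr->at_lock);
}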
Diffstat (limited to 'usr/src')
30 files changed, 1519 insertions, 691 deletions
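
Before the diff itself, a few of its recurring patterns, sketched as userland C models rather than the kernel code. First, af_rds.c drops the single global rdsv3_poll_waitq in favor of a per-socket rs_congested_lock/rs_congested_cv pair, so a congested socket no longer wakes, or is woken by, every other poller in the system. A sketch of that pattern, with a simplified stand-in for struct rdsv3_sock:

#include <pthread.h>

struct sock_cong {
	pthread_mutex_t	rs_congested_lock;
	pthread_cond_t	rs_congested_cv;
	int		rs_seen_congestion;
};

/* poll path: block only this socket's pollers */
void
cong_wait(struct sock_cong *rs)
{
	pthread_mutex_lock(&rs->rs_congested_lock);
	while (rs->rs_seen_congestion)
		pthread_cond_wait(&rs->rs_congested_cv,
		    &rs->rs_congested_lock);
	pthread_mutex_unlock(&rs->rs_congested_lock);
}

/* wakeup path: clear the flag and wake waiters on this socket only */
void
cong_clear(struct sock_cong *rs)
{
	pthread_mutex_lock(&rs->rs_congested_lock);
	rs->rs_seen_congestion = 0;
	pthread_cond_broadcast(&rs->rs_congested_cv);
	pthread_mutex_unlock(&rs->rs_congested_lock);
}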
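Second, the new rdsv3_conn_shutdown() in connection.c quiesces senders by counting them (conn->c_senders) instead of inferring activity from mutex_tryenter() on c_send_lock, as the deleted rdsv3_conn_is_sending() did. A userland model of the drain loop; the kernel code uses atomics for the count and delay(1) where this uses usleep():

#include <pthread.h>
#include <unistd.h>

struct conn {
	pthread_mutex_t	c_send_lock;
	int		c_senders;	/* threads currently in send_xmit */
};

void
conn_quiesce_senders(struct conn *conn)
{
	pthread_mutex_lock(&conn->c_send_lock);
	while (conn->c_senders != 0) {
		/* drop the lock so senders can finish and decrement */
		pthread_mutex_unlock(&conn->c_send_lock);
		usleep(1000);
		pthread_mutex_lock(&conn->c_send_lock);
	}
	/* holding c_send_lock with c_senders == 0: safe to reset state */
	pthread_mutex_unlock(&conn->c_send_lock);
}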
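Third, the new rdsv3_ib_tasklet_fn() in ib_cm.c polls the combined CQ in a loop, re-arms the completion notification, then polls once more; the second drain closes the window in which a completion lands after the first drain but before the re-arm and would otherwise never raise an event. A stub model, with hypothetical cq_poll_one()/cq_arm() standing in for ibt_poll_cq()/ibt_enable_cq_notify():

struct cq {
	int cq_pending;		/* completions queued but unpolled */
};

/* stand-in for ibt_poll_cq(): consume one entry if present */
static int
cq_poll_one(struct cq *cq)
{
	if (cq->cq_pending == 0)
		return (0);
	cq->cq_pending--;
	return (1);
}

/* stand-in for ibt_enable_cq_notify(); the real call re-arms the interrupt */
static void
cq_arm(struct cq *cq)
{
	(void) cq;
}

void
tasklet(struct cq *cq, void (*handle)(void))
{
	/* 1. drain whatever is already queued */
	while (cq_poll_one(cq))
		handle();
	/* 2. re-arm; a completion arriving before this point is silent */
	cq_arm(cq);
	/* 3. drain again to catch anything that slipped in before the re-arm */
	while (cq_poll_one(cq))
		handle();
}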
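Finally, for 6949013, rdsv3_ib_free_mr() no longer deregisters and unmaps inline: it links the MR onto the device's rdsv3_fmr_pool free list and fires the fmr_soft_cq thread, which does the expensive teardown in bulk (rdsv3_ib_drain_mrlist_fn()); incoming fragments get the same treatment through rdsv3_inc_pool. Note the diff's own caveat, reproduced in this model: anything needed from the object must be saved locally before the insert, since the drain thread may free it the moment the lock drops. The list handling and free() below stand in for the IBTF deregister/unmap calls:

#include <pthread.h>
#include <stdlib.h>

struct af_thr;				/* from the handover sketch above */
extern void af_thr_fire(struct af_thr *);

struct mr;

struct fmr_pool {
	pthread_mutex_t	f_lock;		/* protects f_list, f_listcnt */
	struct mr	*f_list;	/* MRs waiting to be torn down */
	int		f_listcnt;
	struct af_thr	*f_thr;		/* drain thread to fire */
};

struct mr {
	struct mr	*m_next;
	struct fmr_pool	*m_pool;
	int		m_inval;
};

/* hot path: constant-time; no deregister/unmap here */
void
mr_free(struct mr *mr, int invalidate)
{
	struct fmr_pool *pool = mr->m_pool;
	/*
	 * Save the thread pointer before the insert: in the real code it
	 * is reached through the MR (ibmr->m_device->fmr_soft_cq), and the
	 * drain thread may free the MR as soon as we drop f_lock.
	 */
	struct af_thr *thr = pool->f_thr;

	mr->m_inval = invalidate;
	pthread_mutex_lock(&pool->f_lock);
	mr->m_next = pool->f_list;
	pool->f_list = mr;
	pool->f_listcnt++;
	pthread_mutex_unlock(&pool->f_lock);

	af_thr_fire(thr);
}

/* drain thread body: pop until empty; slow teardown runs outside f_lock */
void
mr_drain(struct fmr_pool *pool)
{
	struct mr *mr;

	for (;;) {
		pthread_mutex_lock(&pool->f_lock);
		if ((mr = pool->f_list) != NULL) {
			pool->f_list = mr->m_next;
			pool->f_listcnt--;
		}
		pthread_mutex_unlock(&pool->f_lock);
		if (mr == NULL)
			break;
		free(mr);	/* stands in for deregister/unmap/unlock */
	}
}

The diff of the commit follows, verbatim.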
diff --git a/usr/src/pkg/manifests/driver-network-rdsv3.mf b/usr/src/pkg/manifests/driver-network-rdsv3.mf index e65ee79b44..7685eb53f2 100644 --- a/usr/src/pkg/manifests/driver-network-rdsv3.mf +++ b/usr/src/pkg/manifests/driver-network-rdsv3.mf @@ -33,7 +33,7 @@ set name=pkg.fmri value=pkg:/driver/network/rdsv3@$(PKGVERS) set name=pkg.description \ value="The RDS driver is an implementation of the Reliable Datagram Sockets API. It provides reliable, in-order datagram and RDMA data delivery between sockets." set name=pkg.summary value="Solaris Reliable Datagram Sockets" -set name=info.classification value=org.opensolaris.category.2008:System/Core +set name=info.classification value=org.opensolaris.category.2008:System/Hardware set name=variant.arch value=$(ARCH) dir path=kernel group=sys dir path=kernel/drv group=sys diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 74c19a8f1f..7c17dc19f8 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -601,7 +601,7 @@ RDSV3_OBJS += af_rds.o rdsv3_ddi.o bind.o loop.o threads.o connection.o \ transport.o cong.o sysctl.o message.o rds_recv.o send.o \ stats.o info.o page.o rdma_transport.o ib_ring.o ib_rdma.o \ ib_recv.o ib.o ib_send.o ib_sysctl.o ib_stats.o ib_cm.o \ - rdsv3_sc.o rdsv3_debug.o rdsv3_impl.o rdma.o + rdsv3_sc.o rdsv3_debug.o rdsv3_impl.o rdma.o rdsv3_af_thr.o ISER_OBJS += iser.o iser_cm.o iser_cq.o iser_ib.o iser_idm.o \ iser_resource.o iser_xfer.o diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/af_rds.c b/usr/src/uts/common/io/ib/clients/rdsv3/af_rds.c index f2d246709d..e221e5d515 100644 --- a/usr/src/uts/common/io/ib/clients/rdsv3/af_rds.c +++ b/usr/src/uts/common/io/ib/clients/rdsv3/af_rds.c @@ -85,7 +85,6 @@ extern struct rdma_cm_id *rdsv3_rdma_listen_id; kmutex_t rdsv3_sock_lock; static unsigned long rdsv3_sock_count; list_t rdsv3_sock_list; -rdsv3_wait_queue_t rdsv3_poll_waitq; /* * This is called as the final descriptor referencing this socket is closed. @@ -103,7 +102,7 @@ rdsv3_release(sock_lower_handle_t proto_handle, int flgs, cred_t *cr) struct rsock *sk = (struct rsock *)proto_handle; struct rdsv3_sock *rs; - if (sk == NULL) + if (!sk) goto out; rs = rdsv3_sk_to_rs(sk); @@ -112,10 +111,15 @@ rdsv3_release(sock_lower_handle_t proto_handle, int flgs, cred_t *cr) rdsv3_sk_sock_orphan(sk); rdsv3_cong_remove_socket(rs); rdsv3_remove_bound(rs); + /* * Note - rdsv3_clear_recv_queue grabs rs_recv_lock, so * that ensures the recv path has completed messing * with the socket. + * + * Note2 - rdsv3_clear_recv_queue(rs) should be called first + * to prevent some race conditions, which is different from + * the Linux code. 
*/ rdsv3_clear_recv_queue(rs); rdsv3_send_drop_to(rs, NULL); @@ -224,12 +228,12 @@ rdsv3_poll(sock_lower_handle_t proto_handle, short events, int anyyet, * if (rs->rs_seen_congestion) * poll_wait(file, &rds_poll_waitq, wait); */ - mutex_enter(&rdsv3_poll_waitq.waitq_mutex); + mutex_enter(&rs->rs_congested_lock); while (rs->rs_seen_congestion) { - cv_wait(&rdsv3_poll_waitq.waitq_cv, - &rdsv3_poll_waitq.waitq_mutex); + cv_wait(&rs->rs_congested_cv, + &rs->rs_congested_lock); } - mutex_exit(&rdsv3_poll_waitq.waitq_mutex); + mutex_exit(&rs->rs_congested_lock); rw_enter(&rs->rs_recv_lock, RW_READER); if (!rs->rs_cong_monitor) { @@ -251,15 +255,16 @@ rdsv3_poll(sock_lower_handle_t proto_handle, short events, int anyyet, mask |= (POLLIN | POLLRDNORM); if (rs->rs_snd_bytes < rdsv3_sk_sndbuf(rs)) mask |= (POLLOUT | POLLWRNORM); - rw_exit(&rs->rs_recv_lock); /* clear state any time we wake a seen-congested socket */ if (mask) { - mutex_enter(&rdsv3_poll_waitq.waitq_mutex); + mutex_enter(&rs->rs_congested_lock); rs->rs_seen_congestion = 0; - mutex_exit(&rdsv3_poll_waitq.waitq_mutex); + mutex_exit(&rs->rs_congested_lock); } + rw_exit(&rs->rs_recv_lock); + #if 0 RDSV3_DPRINTF4("rdsv3_poll", "return(%p %x)", rs, mask); #endif @@ -840,6 +845,8 @@ rdsv3_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, avl_create(&rs->rs_rdma_keys, rdsv3_mr_compare, sizeof (struct rdsv3_mr), offsetof(struct rdsv3_mr, r_rb_node)); mutex_init(&rs->rs_conn_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&rs->rs_congested_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&rs->rs_congested_cv, NULL, CV_DRIVER, NULL); rs->rs_cred = credp; rs->rs_zoneid = getzoneid(); crhold(credp); diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/bind.c b/usr/src/uts/common/io/ib/clients/rdsv3/bind.c index 233157ea64..8eb4d57450 100644 --- a/usr/src/uts/common/io/ib/clients/rdsv3/bind.c +++ b/usr/src/uts/common/io/ib/clients/rdsv3/bind.c @@ -109,6 +109,7 @@ rdsv3_add_bound(struct rdsv3_sock *rs, uint32_be_t addr, uint16_be_t *port) do { if (rover == 0) rover++; + if (rdsv3_bind_tree_walk(addr, htons(rover), rs) == NULL) { *port = htons(rover); ret = 0; @@ -190,7 +191,7 @@ rdsv3_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, } rs->rs_transport = rdsv3_trans_get_preferred(sin->sin_addr.s_addr); - if (rs->rs_transport == NULL) { + if (!rs->rs_transport) { rdsv3_remove_bound(rs); if (rdsv3_printk_ratelimit()) { RDSV3_DPRINTF1("rdsv3_bind", diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/cong.c b/usr/src/uts/common/io/ib/clients/rdsv3/cong.c index 813930803f..aaa05c2068 100644 --- a/usr/src/uts/common/io/ib/clients/rdsv3/cong.c +++ b/usr/src/uts/common/io/ib/clients/rdsv3/cong.c @@ -160,7 +160,7 @@ rdsv3_cong_from_addr(uint32_be_t addr) RDSV3_DPRINTF4("rdsv3_cong_from_addr", "Enter(addr: %x)", ntohl(addr)); map = kmem_zalloc(sizeof (struct rdsv3_cong_map), KM_NOSLEEP); - if (map == NULL) + if (!map) return (NULL); map->m_addr = addr; @@ -179,7 +179,7 @@ rdsv3_cong_from_addr(uint32_be_t addr) ret = rdsv3_cong_tree_walk(addr, map); mutex_exit(&rdsv3_cong_lock); - if (ret == NULL) { + if (!ret) { ret = map; map = NULL; } @@ -236,7 +236,7 @@ rdsv3_cong_get_maps(struct rdsv3_connection *conn) conn->c_lcong = rdsv3_cong_from_addr(conn->c_laddr); conn->c_fcong = rdsv3_cong_from_addr(conn->c_faddr); - if (conn->c_lcong == NULL || conn->c_fcong == NULL) + if (!(conn->c_lcong && conn->c_fcong)) return (-ENOMEM); return (0); @@ -254,7 +254,7 @@ rdsv3_cong_queue_updates(struct rdsv3_cong_map *map) 
RDSV3_FOR_EACH_LIST_NODE(conn, &map->m_conn_list, c_map_item) { if (!test_and_set_bit(0, &conn->c_map_queued)) { rdsv3_stats_inc(s_cong_update_queued); - rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0); + (void) rdsv3_send_xmit(conn); } } diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/connection.c b/usr/src/uts/common/io/ib/clients/rdsv3/connection.c index de0ebb562c..65a9a59b77 100644 --- a/usr/src/uts/common/io/ib/clients/rdsv3/connection.c +++ b/usr/src/uts/common/io/ib/clients/rdsv3/connection.c @@ -73,18 +73,6 @@ static struct kmem_cache *rdsv3_conn_slab = NULL; var |= RDSV3_INFO_CONNECTION_FLAG_##suffix; \ } while (0) -static inline int -rdsv3_conn_is_sending(struct rdsv3_connection *conn) -{ - int ret = 0; - - if (!mutex_tryenter(&conn->c_send_lock)) - ret = 1; - else - mutex_exit(&conn->c_send_lock); - - return (ret); -} static struct rdsv3_connection * rdsv3_conn_lookup(uint32_be_t laddr, uint32_be_t faddr, avl_index_t *pos) @@ -143,8 +131,7 @@ rdsv3_conn_reset(struct rdsv3_connection *conn) */ static struct rdsv3_connection * __rdsv3_conn_create(uint32_be_t laddr, uint32_be_t faddr, - struct rdsv3_transport *trans, int gfp, - int is_outgoing) + struct rdsv3_transport *trans, int gfp, int is_outgoing) { struct rdsv3_connection *conn, *parent = NULL; avl_index_t pos; @@ -173,7 +160,7 @@ __rdsv3_conn_create(uint32_be_t laddr, uint32_be_t faddr, ntohl(laddr), ntohl(faddr)); conn = kmem_cache_alloc(rdsv3_conn_slab, gfp); - if (conn == NULL) { + if (!conn) { conn = ERR_PTR(-ENOMEM); goto out; } @@ -220,6 +207,7 @@ __rdsv3_conn_create(uint32_be_t laddr, uint32_be_t faddr, RDSV3_INIT_DELAYED_WORK(&conn->c_send_w, rdsv3_send_worker); RDSV3_INIT_DELAYED_WORK(&conn->c_recv_w, rdsv3_recv_worker); RDSV3_INIT_DELAYED_WORK(&conn->c_conn_w, rdsv3_connect_worker); + RDSV3_INIT_DELAYED_WORK(&conn->c_reap_w, rdsv3_reaper_worker); RDSV3_INIT_WORK(&conn->c_down_w, rdsv3_shutdown_worker); mutex_init(&conn->c_cm_lock, NULL, MUTEX_DRIVER, NULL); conn->c_flags = 0; @@ -261,6 +249,8 @@ __rdsv3_conn_create(uint32_be_t laddr, uint32_be_t faddr, } else { avl_insert(&rdsv3_conn_hash, conn, pos); rdsv3_cong_add_conn(conn); + rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_reap_w, + RDSV3_REAPER_WAIT_JIFFIES); rdsv3_conn_count++; } } @@ -287,10 +277,95 @@ rdsv3_conn_create_outgoing(uint32_be_t laddr, uint32_be_t faddr, return (__rdsv3_conn_create(laddr, faddr, trans, gfp, 1)); } +extern struct avl_tree rdsv3_conn_hash; + +void +rdsv3_conn_shutdown(struct rdsv3_connection *conn) +{ + RDSV3_DPRINTF2("rdsv3_conn_shutdown", "Enter(conn: %p)", conn); + + /* shut it down unless it's down already */ + if (!rdsv3_conn_transition(conn, RDSV3_CONN_DOWN, RDSV3_CONN_DOWN)) { + /* + * Quiesce the connection mgmt handlers before we start tearing + * things down. We don't hold the mutex for the entire + * duration of the shutdown operation, else we may be + * deadlocking with the CM handler. 
Instead, the CM event + * handler is supposed to check for state DISCONNECTING + */ + mutex_enter(&conn->c_cm_lock); + if (!rdsv3_conn_transition(conn, RDSV3_CONN_UP, + RDSV3_CONN_DISCONNECTING) && + !rdsv3_conn_transition(conn, RDSV3_CONN_ERROR, + RDSV3_CONN_DISCONNECTING)) { + RDSV3_DPRINTF2("rdsv3_conn_shutdown", + "shutdown called in state %d", + atomic_get(&conn->c_state)); + rdsv3_conn_drop(conn); + mutex_exit(&conn->c_cm_lock); + return; + } + mutex_exit(&conn->c_cm_lock); + + /* verify everybody's out of rds_send_xmit() */ + mutex_enter(&conn->c_send_lock); + while (atomic_get(&conn->c_senders)) { + mutex_exit(&conn->c_send_lock); + delay(1); + mutex_enter(&conn->c_send_lock); + } + + conn->c_trans->conn_shutdown(conn); + rdsv3_conn_reset(conn); + mutex_exit(&conn->c_send_lock); + + if (!rdsv3_conn_transition(conn, RDSV3_CONN_DISCONNECTING, + RDSV3_CONN_DOWN)) { + /* + * This can happen - eg when we're in the middle of + * tearing down the connection, and someone unloads + * the rds module. + * Quite reproduceable with loopback connections. + * Mostly harmless. + */ +#ifndef __lock_lint + RDSV3_DPRINTF2("rdsv3_conn_shutdown", + "failed to transition to state DOWN, " + "current statis is: %d", + atomic_get(&conn->c_state)); + rdsv3_conn_drop(conn); +#endif + return; + } + } + + /* + * Then reconnect if it's still live. + * The passive side of an IB loopback connection is never added + * to the conn hash, so we never trigger a reconnect on this + * conn - the reconnect is always triggered by the active peer. + */ + rdsv3_cancel_delayed_work(&conn->c_conn_w); + + { + struct rdsv3_conn_info_s conn_info; + + conn_info.c_laddr = conn->c_laddr; + conn_info.c_faddr = conn->c_faddr; + if (avl_find(&rdsv3_conn_hash, &conn_info, NULL) == conn) + rdsv3_queue_reconnect(conn); + } + RDSV3_DPRINTF2("rdsv3_conn_shutdown", "Exit"); +} + +/* + * Stop and free a connection. 
+ */ void rdsv3_conn_destroy(struct rdsv3_connection *conn) { struct rdsv3_message *rm, *rtmp; + list_t to_be_dropped; RDSV3_DPRINTF4("rdsv3_conn_destroy", "freeing conn %p for %u.%u.%u.%u -> %u.%u.%u.%u", @@ -298,22 +373,34 @@ rdsv3_conn_destroy(struct rdsv3_connection *conn) avl_remove(&rdsv3_conn_hash, conn); - /* wait for the rds thread to shut it down */ - conn->c_state = RDSV3_CONN_ERROR; - rdsv3_cancel_delayed_work(&conn->c_conn_w); + rdsv3_cancel_delayed_work(&conn->c_reap_w); rdsv3_cancel_delayed_work(&conn->c_send_w); rdsv3_cancel_delayed_work(&conn->c_recv_w); - rdsv3_shutdown_worker(&conn->c_down_w); - rdsv3_flush_workqueue(rdsv3_wq); + + rdsv3_conn_shutdown(conn); /* tear down queued messages */ - RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, rtmp, - &conn->c_send_queue, + + list_create(&to_be_dropped, sizeof (struct rdsv3_message), + offsetof(struct rdsv3_message, m_conn_item)); + + RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, rtmp, &conn->c_retrans, m_conn_item) { + list_remove_node(&rm->m_conn_item); + list_insert_tail(&to_be_dropped, rm); + } + + RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, rtmp, &conn->c_send_queue, m_conn_item) { list_remove_node(&rm->m_conn_item); - ASSERT(!list_link_active(&rm->m_sock_item)); + list_insert_tail(&to_be_dropped, rm); + } + + RDSV3_FOR_EACH_LIST_NODE_SAFE(rm, rtmp, &to_be_dropped, m_conn_item) { + clear_bit(RDSV3_MSG_ON_CONN, &rm->m_flags); + list_remove_node(&rm->m_conn_item); rdsv3_message_put(rm); } + if (conn->c_xmit_rm) rdsv3_message_put(conn->c_xmit_rm); @@ -378,7 +465,6 @@ rdsv3_conn_message_info(struct rsock *sock, unsigned int len, conn = AVL_NEXT(&rdsv3_conn_hash, conn); } while (conn != NULL); - rw_exit(&rdsv3_conn_lock); lens->nr = total; @@ -450,7 +536,6 @@ rdsv3_for_each_conn_info(struct rsock *sock, unsigned int len, } conn = AVL_NEXT(&rdsv3_conn_hash, conn); } while (conn != NULL); - rw_exit(&rdsv3_conn_lock); kmem_free(buffer, item_len + 8); @@ -470,7 +555,8 @@ rdsv3_conn_info_visitor(struct rdsv3_connection *conn, void *buffer) cinfo->flags = 0; rdsv3_conn_info_set(cinfo->flags, - rdsv3_conn_is_sending(conn), SENDING); + MUTEX_HELD(&conn->c_send_lock), SENDING); + /* XXX Future: return the state rather than these funky bits */ rdsv3_conn_info_set(cinfo->flags, atomic_get(&conn->c_state) == RDSV3_CONN_CONNECTING, @@ -497,10 +583,10 @@ rdsv3_conn_init() rdsv3_conn_slab = kmem_cache_create("rdsv3_connection", sizeof (struct rdsv3_connection), 0, rdsv3_conn_constructor, rdsv3_conn_destructor, NULL, NULL, NULL, 0); - if (rdsv3_conn_slab == NULL) { + if (!rdsv3_conn_slab) { RDSV3_DPRINTF2("rdsv3_conn_init", "kmem_cache_create(rdsv3_conn_slab) failed"); - return (-1); + return (-ENOMEM); } avl_create(&rdsv3_conn_hash, rdsv3_conn_compare, diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/ib.c b/usr/src/uts/common/io/ib/clients/rdsv3/ib.c index 7eaef82c06..0dfc74d08d 100644 --- a/usr/src/uts/common/io/ib/clients/rdsv3/ib.c +++ b/usr/src/uts/common/io/ib/clients/rdsv3/ib.c @@ -109,11 +109,19 @@ rdsv3_ib_add_one(ib_device_t *device) if (!rds_ibdev) goto free_attr; + rds_ibdev->ibt_hca_hdl = ib_get_ibt_hca_hdl(device); + rds_ibdev->hca_attr = *dev_attr; + + rw_init(&rds_ibdev->rwlock, NULL, RW_DRIVER, NULL); mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL); rds_ibdev->max_wrs = dev_attr->hca_max_chan_sz; rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE); + rds_ibdev->max_initiator_depth = (uint_t)dev_attr->hca_max_rdma_in_qp; + rds_ibdev->max_responder_resources = + (uint_t)dev_attr->hca_max_rdma_in_qp; + rds_ibdev->dev = device; 
rds_ibdev->pd = ib_alloc_pd(device); if (IS_ERR(rds_ibdev->pd)) @@ -123,6 +131,11 @@ rdsv3_ib_add_one(ib_device_t *device) goto free_dev; } + if (rdsv3_ib_create_inc_pool(rds_ibdev) != 0) { + rdsv3_ib_destroy_mr_pool(rds_ibdev); + goto free_dev; + } + (void) snprintf(name, 64, "RDSV3_IB_FRAG_%llx", (longlong_t)htonll(dev_attr->hca_node_guid)); rds_ibdev->ib_frag_slab = kmem_cache_create(name, @@ -133,9 +146,40 @@ rdsv3_ib_add_one(ib_device_t *device) "kmem_cache_create for ib_frag_slab failed for device: %s", device->name); rdsv3_ib_destroy_mr_pool(rds_ibdev); + rdsv3_ib_destroy_inc_pool(rds_ibdev); + goto free_dev; + } + + rds_ibdev->aft_hcagp = rdsv3_af_grp_create(rds_ibdev->ibt_hca_hdl, + (uint64_t)rds_ibdev->hca_attr.hca_node_guid); + if (rds_ibdev->aft_hcagp == NULL) { + rdsv3_ib_destroy_mr_pool(rds_ibdev); + rdsv3_ib_destroy_inc_pool(rds_ibdev); + kmem_cache_destroy(rds_ibdev->ib_frag_slab); + goto free_dev; + } + rds_ibdev->fmr_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_mrlist_fn, + (void *)rds_ibdev->fmr_pool, SCQ_HCA_BIND_CPU, + rds_ibdev->aft_hcagp); + if (rds_ibdev->fmr_soft_cq == NULL) { + rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp); + rdsv3_ib_destroy_mr_pool(rds_ibdev); + rdsv3_ib_destroy_inc_pool(rds_ibdev); + kmem_cache_destroy(rds_ibdev->ib_frag_slab); goto free_dev; } + rds_ibdev->inc_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_inclist, + (void *)rds_ibdev->inc_pool, SCQ_HCA_BIND_CPU, + rds_ibdev->aft_hcagp); + if (rds_ibdev->inc_soft_cq == NULL) { + rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq); + rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp); + rdsv3_ib_destroy_mr_pool(rds_ibdev); + rdsv3_ib_destroy_inc_pool(rds_ibdev); + kmem_cache_destroy(rds_ibdev->ib_frag_slab); + goto free_dev; + } list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr), offsetof(struct rdsv3_ib_ipaddr, list)); @@ -153,6 +197,8 @@ rdsv3_ib_add_one(ib_device_t *device) err_pd: (void) ib_dealloc_pd(rds_ibdev->pd); free_dev: + mutex_destroy(&rds_ibdev->spinlock); + rw_destroy(&rds_ibdev->rwlock); kmem_free(rds_ibdev, sizeof (*rds_ibdev)); free_attr: kmem_free(dev_attr, sizeof (*dev_attr)); @@ -178,10 +224,18 @@ rdsv3_ib_remove_one(struct ib_device *device) rdsv3_ib_destroy_conns(rds_ibdev); + if (rds_ibdev->fmr_soft_cq) + rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq); + if (rds_ibdev->inc_soft_cq) + rdsv3_af_thr_destroy(rds_ibdev->inc_soft_cq); + rdsv3_ib_destroy_mr_pool(rds_ibdev); + rdsv3_ib_destroy_inc_pool(rds_ibdev); kmem_cache_destroy(rds_ibdev->ib_frag_slab); + rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp); + #if 0 while (ib_dealloc_pd(rds_ibdev->pd)) { #ifndef __lock_lint @@ -203,6 +257,8 @@ rdsv3_ib_remove_one(struct ib_device *device) list_destroy(&rds_ibdev->ipaddr_list); list_destroy(&rds_ibdev->conn_list); list_remove_node(&rds_ibdev->list); + mutex_destroy(&rds_ibdev->spinlock); + rw_destroy(&rds_ibdev->rwlock); kmem_free(rds_ibdev, sizeof (*rds_ibdev)); RDSV3_DPRINTF2("rdsv3_ib_remove_one", "Return: device: %p", device); @@ -362,7 +418,6 @@ struct rdsv3_transport rdsv3_ib_transport = { .conn_connect = rdsv3_ib_conn_connect, .conn_shutdown = rdsv3_ib_conn_shutdown, .inc_copy_to_user = rdsv3_ib_inc_copy_to_user, - .inc_purge = rdsv3_ib_inc_purge, .inc_free = rdsv3_ib_inc_free, .cm_initiate_connect = rdsv3_ib_cm_initiate_connect, .cm_handle_connect = rdsv3_ib_cm_handle_connect, diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/ib_cm.c b/usr/src/uts/common/io/ib/clients/rdsv3/ib_cm.c index 4d7b28feeb..eb390875c1 100644 --- a/usr/src/uts/common/io/ib/clients/rdsv3/ib_cm.c +++ 
b/usr/src/uts/common/io/ib/clients/rdsv3/ib_cm.c @@ -64,7 +64,7 @@ #include <sys/ib/clients/rdsv3/ib.h> #include <sys/ib/clients/rdsv3/rdsv3_debug.h> -extern ddi_taskq_t *rdsv3_taskq; +extern int rdsv3_enable_snd_cq; /* * Set the selected protocol version @@ -140,7 +140,8 @@ rdsv3_ib_cm_connect_complete(struct rdsv3_connection *conn, { const struct rdsv3_ib_connect_private *dp = NULL; struct rdsv3_ib_connection *ic = conn->c_transport_data; - struct rdsv3_ib_device *rds_ibdev; + struct rdsv3_ib_device *rds_ibdev = + ib_get_client_data(ic->i_cm_id->device, &rdsv3_ib_client); struct ib_qp_attr qp_attr; int err; @@ -160,12 +161,41 @@ rdsv3_ib_cm_connect_complete(struct rdsv3_connection *conn, } } - RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete", - "RDS/IB: connected to %u.%u.%u.%u version %u.%u%s", - NIPQUAD(conn->c_faddr), - RDS_PROTOCOL_MAJOR(conn->c_version), - RDS_PROTOCOL_MINOR(conn->c_version), - ic->i_flowctl ? ", flow control" : ""); + if (conn->c_version < RDS_PROTOCOL(3, 1)) { + RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete", + "RDS/IB: Connection to %u.%u.%u.%u version %u.%u failed", + NIPQUAD(conn->c_faddr), + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version)); + rdsv3_conn_destroy(conn); + return; + } else { + RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete", + "RDS/IB: connected to %u.%u.%u.%u version %u.%u%s", + NIPQUAD(conn->c_faddr), + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version), + ic->i_flowctl ? ", flow control" : ""); + } + + ASSERT(ic->i_soft_cq == NULL); + ic->i_soft_cq = rdsv3_af_intr_thr_create(rdsv3_ib_tasklet_fn, + (void *)ic, SCQ_INTR_BIND_CPU, rds_ibdev->aft_hcagp, + ic->i_cq->ibt_cq); + if (rdsv3_enable_snd_cq) { + ic->i_snd_soft_cq = rdsv3_af_intr_thr_create( + rdsv3_ib_snd_tasklet_fn, + (void *)ic, SCQ_INTR_BIND_CPU, rds_ibdev->aft_hcagp, + ic->i_snd_cq->ibt_cq); + } + ic->i_refill_rq = rdsv3_af_thr_create(rdsv3_ib_refill_fn, (void *)conn, + SCQ_WRK_BIND_CPU, rds_ibdev->aft_hcagp); + rdsv3_af_grp_draw(rds_ibdev->aft_hcagp); + + (void) ib_req_notify_cq(ic->i_cq, IB_CQ_SOLICITED); + if (rdsv3_enable_snd_cq) { + (void) ib_req_notify_cq(ic->i_snd_cq, IB_CQ_NEXT_COMP); + } /* * Init rings and fill recv. this needs to wait until protocol @@ -178,7 +208,7 @@ rdsv3_ib_cm_connect_complete(struct rdsv3_connection *conn, * Post receive buffers - as a side effect, this will update * the posted credit count. */ - (void) rdsv3_ib_recv_refill(conn, KM_NOSLEEP, 1); + (void) rdsv3_ib_recv_refill(conn, 1); /* Tune RNR behavior */ rdsv3_ib_tune_rnr(ic, &qp_attr); @@ -190,7 +220,6 @@ rdsv3_ib_cm_connect_complete(struct rdsv3_connection *conn, "ib_modify_qp(IB_QP_STATE, RTS): err=%d", err); /* update ib_device with this local ipaddr & conn */ - rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rdsv3_ib_client); err = rdsv3_ib_update_ipaddr(rds_ibdev, conn->c_laddr); if (err) RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete", @@ -215,22 +244,29 @@ static void rdsv3_ib_cm_fill_conn_param(struct rdsv3_connection *conn, struct rdma_conn_param *conn_param, struct rdsv3_ib_connect_private *dp, - uint32_t protocol_version) + uint32_t protocol_version, + uint32_t max_responder_resources, + uint32_t max_initiator_depth) { + struct rdsv3_ib_connection *ic = conn->c_transport_data; + struct rdsv3_ib_device *rds_ibdev; + RDSV3_DPRINTF2("rdsv3_ib_cm_fill_conn_param", "Enter conn: %p conn_param: %p private: %p version: %d", conn, conn_param, dp, protocol_version); (void) memset(conn_param, 0, sizeof (struct rdma_conn_param)); - /* XXX tune these? 
*/ - conn_param->responder_resources = 1; - conn_param->initiator_depth = 1; + + rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rdsv3_ib_client); + + conn_param->responder_resources = + MIN(rds_ibdev->max_responder_resources, max_responder_resources); + conn_param->initiator_depth = + MIN(rds_ibdev->max_initiator_depth, max_initiator_depth); conn_param->retry_count = min(rdsv3_ib_retry_count, 7); conn_param->rnr_retry_count = 7; if (dp) { - struct rdsv3_ib_connection *ic = conn->c_transport_data; - (void) memset(dp, 0, sizeof (*dp)); dp->dp_saddr = conn->c_laddr; dp->dp_daddr = conn->c_faddr; @@ -268,6 +304,122 @@ rdsv3_ib_cq_event_handler(struct ib_event *event, void *data) } static void +rdsv3_ib_snd_cq_comp_handler(struct ib_cq *cq, void *context) +{ + struct rdsv3_connection *conn = context; + struct rdsv3_ib_connection *ic = conn->c_transport_data; + + RDSV3_DPRINTF4("rdsv3_ib_snd_cq_comp_handler", + "Enter(conn: %p ic: %p cq: %p)", conn, ic, cq); + + rdsv3_af_thr_fire(ic->i_snd_soft_cq); +} + +void +rdsv3_ib_snd_tasklet_fn(void *data) +{ + struct rdsv3_ib_connection *ic = (struct rdsv3_ib_connection *)data; + struct rdsv3_connection *conn = ic->conn; + struct rdsv3_ib_ack_state ack_state = { 0, }; + ibt_wc_t wc; + uint_t polled; + + RDSV3_DPRINTF4("rdsv3_ib_snd_tasklet_fn", + "Enter(conn: %p ic: %p)", conn, ic); + + /* + * Poll in a loop before and after enabling the next event + */ + while (ibt_poll_cq(RDSV3_CQ2CQHDL(ic->i_snd_cq), &wc, 1, &polled) == + IBT_SUCCESS) { + RDSV3_DPRINTF4("rdsv3_ib_tasklet_fn", + "wc_id 0x%llx type %d status %u byte_len %u imm_data %u\n", + (unsigned long long)wc.wc_id, wc.wc_type, wc.wc_status, + wc.wc_bytes_xfer, ntohl(wc.wc_immed_data)); + + ASSERT(wc.wc_id & RDSV3_IB_SEND_OP); + rdsv3_ib_send_cqe_handler(ic, &wc); + } + (void) ibt_enable_cq_notify(RDSV3_CQ2CQHDL(ic->i_snd_cq), + IBT_NEXT_COMPLETION); + if (ibt_poll_cq(RDSV3_CQ2CQHDL(ic->i_snd_cq), &wc, 1, &polled) == + IBT_SUCCESS) { + ASSERT(wc.wc_id & RDSV3_IB_SEND_OP); + rdsv3_ib_send_cqe_handler(ic, &wc); + } +} + +static void +rdsv3_ib_cq_comp_handler(struct ib_cq *cq, void *context) +{ + struct rdsv3_connection *conn = context; + struct rdsv3_ib_connection *ic = conn->c_transport_data; + + RDSV3_DPRINTF4("rdsv3_ib_cq_comp_handler", + "Enter(conn: %p cq: %p)", conn, cq); + + rdsv3_ib_stats_inc(s_ib_evt_handler_call); + + rdsv3_af_thr_fire(ic->i_soft_cq); +} + +void +rdsv3_ib_refill_fn(void *data) +{ + struct rdsv3_connection *conn = (struct rdsv3_connection *)data; + + (void) rdsv3_ib_recv_refill(conn, 0); +} + +void +rdsv3_ib_tasklet_fn(void *data) +{ + struct rdsv3_ib_connection *ic = (struct rdsv3_ib_connection *)data; + struct rdsv3_connection *conn = ic->conn; + struct rdsv3_ib_ack_state ack_state = { 0, }; + ibt_wc_t wc; + uint_t polled; + + RDSV3_DPRINTF4("rdsv3_ib_tasklet_fn", + "Enter(conn: %p ic: %p)", conn, ic); + + rdsv3_ib_stats_inc(s_ib_tasklet_call); + + /* + * Poll in a loop before and after enabling the next event + */ + while (ibt_poll_cq(RDSV3_CQ2CQHDL(ic->i_cq), &wc, 1, &polled) == + IBT_SUCCESS) { + RDSV3_DPRINTF4("rdsv3_ib_tasklet_fn", + "wc_id 0x%llx type %d status %u byte_len %u imm_data %u\n", + (unsigned long long)wc.wc_id, wc.wc_type, wc.wc_status, + wc.wc_bytes_xfer, ntohl(wc.wc_immed_data)); + + if (wc.wc_id & RDSV3_IB_SEND_OP) { + rdsv3_ib_send_cqe_handler(ic, &wc); + } else { + rdsv3_ib_recv_cqe_handler(ic, &wc, &ack_state); + } + } + (void) ibt_enable_cq_notify(RDSV3_CQ2CQHDL(ic->i_cq), + IBT_NEXT_SOLICITED); + + if (ack_state.ack_next_valid) { + 
rdsv3_ib_set_ack(ic, ack_state.ack_next, + ack_state.ack_required); + } + if (ack_state.ack_recv_valid && ack_state.ack_recv > ic->i_ack_recv) { + rdsv3_send_drop_acked(conn, ack_state.ack_recv, NULL); + ic->i_ack_recv = ack_state.ack_recv; + } + if (rdsv3_conn_up(conn)) { + if (!test_bit(RDSV3_LL_SEND_FULL, &conn->c_flags)) + (void) rdsv3_send_xmit(ic->conn); + rdsv3_ib_attempt_ack(ic); + } +} + +static void rdsv3_ib_qp_event_handler(struct ib_event *event, void *data) { struct rdsv3_connection *conn = data; @@ -330,7 +482,7 @@ rdsv3_ib_setup_qp(struct rdsv3_connection *conn) * the rds_ibdev at all. */ rds_ibdev = ib_get_client_data(dev, &rdsv3_ib_client); - if (rds_ibdev == NULL) { + if (!rds_ibdev) { RDSV3_DPRINTF2("rdsv3_ib_setup_qp", "RDS/IB: No client_data for device %s", dev->name); return (-EOPNOTSUPP); @@ -350,47 +502,30 @@ rdsv3_ib_setup_qp(struct rdsv3_connection *conn) * not implmeneted in Hermon yet, but we can pass it to ib_create_cq() * anyway. */ - ic->i_send_cq = ib_create_cq(dev, rdsv3_ib_send_cq_comp_handler, + ic->i_cq = ib_create_cq(dev, rdsv3_ib_cq_comp_handler, rdsv3_ib_cq_event_handler, conn, - ic->i_send_ring.w_nr + 1, - IB_CQ_VECTOR_LEAST_ATTACHED); - if (IS_ERR(ic->i_send_cq)) { - ret = PTR_ERR(ic->i_send_cq); - ic->i_send_cq = NULL; - RDSV3_DPRINTF2("rdsv3_ib_setup_qp", - "ib_create_cq send failed: %d", ret); - goto out; - } - - /* - * IB_CQ_VECTOR_LEAST_ATTACHED and/or the corresponding feature is - * not implmeneted in Hermon yet, but we can pass it to ib_create_cq() - * anyway. - */ - ic->i_recv_cq = ib_create_cq(dev, rdsv3_ib_recv_cq_comp_handler, - rdsv3_ib_cq_event_handler, conn, - ic->i_recv_ring.w_nr, - IB_CQ_VECTOR_LEAST_ATTACHED); - if (IS_ERR(ic->i_recv_cq)) { - ret = PTR_ERR(ic->i_recv_cq); - ic->i_recv_cq = NULL; - RDSV3_DPRINTF2("rdsv3_ib_setup_qp", - "ib_create_cq recv failed: %d", ret); - goto out; - } - - ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); - if (ret) { + ic->i_recv_ring.w_nr + ic->i_send_ring.w_nr + 1, + (intptr_t)rdsv3_af_grp_get_sched(ic->rds_ibdev->aft_hcagp)); + if (IS_ERR(ic->i_cq)) { + ret = PTR_ERR(ic->i_cq); + ic->i_cq = NULL; RDSV3_DPRINTF2("rdsv3_ib_setup_qp", - "ib_req_notify_cq send failed: %d", ret); + "ib_create_cq failed: %d", ret); goto out; } - - ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); - if (ret) { - RDSV3_DPRINTF2("rdsv3_ib_setup_qp", - "ib_req_notify_cq recv failed: %d", ret); - goto out; + if (rdsv3_enable_snd_cq) { + ic->i_snd_cq = ib_create_cq(dev, rdsv3_ib_snd_cq_comp_handler, + rdsv3_ib_cq_event_handler, conn, ic->i_send_ring.w_nr + 1, + (intptr_t)rdsv3_af_grp_get_sched(ic->rds_ibdev->aft_hcagp)); + if (IS_ERR(ic->i_snd_cq)) { + ret = PTR_ERR(ic->i_snd_cq); + (void) ib_destroy_cq(ic->i_cq); + ic->i_cq = NULL; + ic->i_snd_cq = NULL; + RDSV3_DPRINTF2("rdsv3_ib_setup_qp", + "ib_create_cq send cq failed: %d", ret); + goto out; + } } /* XXX negotiate max send/recv with remote? */ @@ -404,8 +539,12 @@ rdsv3_ib_setup_qp(struct rdsv3_connection *conn) attr.cap.max_recv_sge = RDSV3_IB_RECV_SGE; attr.sq_sig_type = IB_SIGNAL_REQ_WR; attr.qp_type = IB_QPT_RC; - attr.send_cq = ic->i_send_cq; - attr.recv_cq = ic->i_recv_cq; + if (rdsv3_enable_snd_cq) { + attr.send_cq = ic->i_snd_cq; + } else { + attr.send_cq = ic->i_cq; + } + attr.recv_cq = ic->i_cq; /* * XXX this can fail if max_*_wr is too large? 
Are we supposed @@ -476,8 +615,8 @@ rdsv3_ib_setup_qp(struct rdsv3_connection *conn) rdsv3_ib_recv_init_ack(ic); - RDSV3_DPRINTF2("rdsv3_ib_setup_qp", "conn %p pd %p mr %p cq %p %p", - conn, ic->i_pd, ic->i_mr, ic->i_send_cq, ic->i_recv_cq); + RDSV3_DPRINTF2("rdsv3_ib_setup_qp", "conn %p pd %p mr %p cq %p", + conn, ic->i_pd, ic->i_mr, ic->i_cq); out: return (ret); @@ -649,7 +788,9 @@ rdsv3_ib_cm_handle_connect(struct rdma_cm_id *cm_id, goto out; } - rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); + rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version, + event->param.conn.responder_resources, + event->param.conn.initiator_depth); /* rdma_accept() calls rdma_reject() internally if it fails */ err = rdma_accept(cm_id, &conn_param); @@ -700,8 +841,8 @@ rdsv3_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) goto out; } - (void) rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp, - RDS_PROTOCOL_VERSION); + rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp, + RDS_PROTOCOL_VERSION, UINT_MAX, UINT_MAX); ret = rdma_connect(cm_id, &conn_param); if (ret) { @@ -798,9 +939,8 @@ rdsv3_ib_conn_shutdown(struct rdsv3_connection *conn) int err = 0; RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown", - "cm %p pd %p cq %p %p qp %p", ic->i_cm_id, - ic->i_pd, ic->i_send_cq, ic->i_recv_cq, - ic->i_cm_id ? ic->i_cm_id->qp : NULL); + "cm %p pd %p cq %p qp %p", ic->i_cm_id, + ic->i_pd, ic->i_cq, ic->i_cm_id ? ic->i_cm_id->qp : NULL); if (ic->i_cm_id) { struct ib_device *dev = ic->i_cm_id->device; @@ -821,15 +961,38 @@ rdsv3_ib_conn_shutdown(struct rdsv3_connection *conn) if (ic->i_cm_id->qp) { (void) ibt_flush_qp( ib_get_ibt_channel_hdl(ic->i_cm_id)); - - /* wait until all WRs are flushed */ - rdsv3_wait_event(&rdsv3_ib_ring_empty_wait, - rdsv3_ib_ring_empty(&ic->i_send_ring) && + /* + * Don't wait for the send ring to be empty -- there + * may be completed non-signaled entries sitting on + * there. We unmap these below. + */ + rdsv3_wait_event(&ic->i_recv_ring.w_empty_wait, rdsv3_ib_ring_empty(&ic->i_recv_ring)); - + /* + * Note that Linux original code calls + * rdma_destroy_qp() after rdsv3_ib_recv_clear_ring(ic). 
+ */ rdma_destroy_qp(ic->i_cm_id); } + if (rdsv3_enable_snd_cq) { + if (ic->i_snd_soft_cq) { + rdsv3_af_thr_destroy(ic->i_snd_soft_cq); + ic->i_snd_soft_cq = NULL; + } + if (ic->i_snd_cq) + (void) ib_destroy_cq(ic->i_snd_cq); + } + if (ic->i_soft_cq) { + rdsv3_af_thr_destroy(ic->i_soft_cq); + ic->i_soft_cq = NULL; + } + if (ic->i_refill_rq) { + rdsv3_af_thr_destroy(ic->i_refill_rq); + ic->i_refill_rq = NULL; + } + if (ic->i_cq) + (void) ib_destroy_cq(ic->i_cq); if (ic->i_mr) rdsv3_ib_free_hdrs(dev, ic); @@ -839,10 +1002,6 @@ rdsv3_ib_conn_shutdown(struct rdsv3_connection *conn) if (ic->i_recvs) rdsv3_ib_recv_clear_ring(ic); - if (ic->i_send_cq) - (void) ib_destroy_cq(ic->i_send_cq); - if (ic->i_recv_cq) - (void) ib_destroy_cq(ic->i_recv_cq); rdma_destroy_id(ic->i_cm_id); /* @@ -854,13 +1013,12 @@ rdsv3_ib_conn_shutdown(struct rdsv3_connection *conn) ic->i_cm_id = NULL; ic->i_pd = NULL; ic->i_mr = NULL; - ic->i_send_cq = NULL; - ic->i_recv_cq = NULL; + ic->i_cq = NULL; + ic->i_snd_cq = NULL; ic->i_send_hdrs = NULL; ic->i_recv_hdrs = NULL; ic->i_ack = NULL; } - ASSERT(!ic->i_on_dev_list); /* Clear pending transmit */ @@ -902,6 +1060,11 @@ rdsv3_ib_conn_shutdown(struct rdsv3_connection *conn) ic->i_recv_ring.w_nr * sizeof (struct rdsv3_ib_recv_work)); ic->i_recvs = NULL; } + if (ic->i_recv_wrs) { + kmem_free(ic->i_recv_wrs, ic->i_recv_ring.w_nr * + (sizeof (ibt_recv_wr_t))); + ic->i_recv_wrs = NULL; + } RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown", "Return conn: %p", conn); } @@ -923,21 +1086,15 @@ int rdsv3_ib_conn_alloc(struct rdsv3_connection *conn, int gfp) { struct rdsv3_ib_connection *ic; - char tq_name[TASKQ_NAMELEN]; RDSV3_DPRINTF2("rdsv3_ib_conn_alloc", "conn: %p", conn); /* XXX too lazy? */ ic = kmem_zalloc(sizeof (struct rdsv3_ib_connection), gfp); - if (ic == NULL) + if (!ic) return (-ENOMEM); list_link_init(&ic->ib_node); - (void) snprintf(tq_name, TASKQ_NAMELEN, "RDSV3_CONN_to_%x:%u", - htonl(conn->c_faddr), conn_cnt++ % 100); - ic->i_recv_tasklet = - ddi_taskq_create(NULL, tq_name, 1, TASKQ_DEFAULTPRI, 0); - mutex_init(&ic->i_recv_mutex, NULL, MUTEX_DRIVER, NULL); mutex_init(&ic->i_ack_lock, NULL, MUTEX_DRIVER, NULL); @@ -956,7 +1113,6 @@ rdsv3_ib_conn_alloc(struct rdsv3_connection *conn, int gfp) list_insert_tail(&ib_nodev_conns, ic); mutex_exit(&ib_nodev_conns_lock); - RDSV3_DPRINTF2("rdsv3_ib_conn_alloc", "conn %p conn ic %p", conn, conn->c_transport_data); return (0); @@ -986,8 +1142,6 @@ rdsv3_ib_conn_free(void *arg) list_remove_node(&ic->ib_node); mutex_exit(lock_ptr); #endif - - ddi_taskq_destroy(ic->i_recv_tasklet); kmem_free(ic, sizeof (*ic)); } diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/ib_rdma.c b/usr/src/uts/common/io/ib/clients/rdsv3/ib_rdma.c index 68d5a635a8..938d4b72c2 100644 --- a/usr/src/uts/common/io/ib/clients/rdsv3/ib_rdma.c +++ b/usr/src/uts/common/io/ib/clients/rdsv3/ib_rdma.c @@ -66,11 +66,10 @@ * This is stored as mr->r_trans_private. 
*/ struct rdsv3_ib_mr { - struct rdsv3_ib_device *device; - struct rdsv3_ib_mr_pool *pool; - struct ib_fmr *fmr; - struct list list; - unsigned int remap_count; + list_node_t m_obj; /* list obj of rdsv3_fmr_pool list */ + struct rdsv3_ib_device *m_device; + struct rdsv3_fmr_pool *m_pool; /* hca fmr pool */ + unsigned int m_inval:1; struct rdsv3_scatterlist *sg; unsigned int sg_len; @@ -80,6 +79,7 @@ struct rdsv3_ib_mr { /* DDI pinned memory */ ddi_umem_cookie_t umem_cookie; /* IBTF type definitions */ + ibt_hca_hdl_t rc_hca_hdl; ibt_fmr_pool_hdl_t fmr_pool_hdl; ibt_ma_hdl_t rc_ma_hdl; ibt_mr_hdl_t rc_fmr_hdl; @@ -87,23 +87,12 @@ struct rdsv3_ib_mr { }; /* - * Our own little FMR pool + * delayed freed fmr's */ -struct rdsv3_ib_mr_pool { - struct mutex flush_lock; /* serialize fmr invalidate */ - struct rdsv3_work_s flush_worker; /* flush worker */ - - kmutex_t list_lock; /* protect variables below */ - atomic_t item_count; /* total # of MRs */ - atomic_t dirty_count; /* # dirty of MRs */ - /* MRs that have reached their max_maps limit */ - struct list drop_list; - struct list free_list; /* unused MRs */ - struct list clean_list; /* unused & unamapped MRs */ - atomic_t free_pinned; /* memory pinned by free MRs */ - unsigned long max_items; - unsigned long max_items_soft; - unsigned long max_free_pinned; +struct rdsv3_fmr_pool { + list_t f_list; /* list of freed mr */ + kmutex_t f_lock; /* lock of fmr pool */ + int32_t f_listcnt; }; static int rdsv3_ib_flush_mr_pool(struct rdsv3_ib_device *rds_ibdev, @@ -124,15 +113,15 @@ rdsv3_ib_get_device(uint32_be_t ipaddr) RDSV3_DPRINTF4("rdsv3_ib_get_device", "Enter: ipaddr: 0x%x", ipaddr); RDSV3_FOR_EACH_LIST_NODE(rds_ibdev, &rdsv3_ib_devices, list) { - mutex_enter(&rds_ibdev->spinlock); + rw_enter(&rds_ibdev->rwlock, RW_READER); RDSV3_FOR_EACH_LIST_NODE(i_ipaddr, &rds_ibdev->ipaddr_list, list) { if (i_ipaddr->ipaddr == ipaddr) { - mutex_exit(&rds_ibdev->spinlock); + rw_exit(&rds_ibdev->rwlock); return (rds_ibdev); } } - mutex_exit(&rds_ibdev->spinlock); + rw_exit(&rds_ibdev->rwlock); } RDSV3_DPRINTF4("rdsv3_ib_get_device", "Return: ipaddr: 0x%x", ipaddr); @@ -154,9 +143,9 @@ rdsv3_ib_add_ipaddr(struct rdsv3_ib_device *rds_ibdev, uint32_be_t ipaddr) i_ipaddr->ipaddr = ipaddr; - mutex_enter(&rds_ibdev->spinlock); + rw_enter(&rds_ibdev->rwlock, RW_WRITER); list_insert_tail(&rds_ibdev->ipaddr_list, i_ipaddr); - mutex_exit(&rds_ibdev->spinlock); + rw_exit(&rds_ibdev->rwlock); return (0); } @@ -165,20 +154,25 @@ static void rdsv3_ib_remove_ipaddr(struct rdsv3_ib_device *rds_ibdev, uint32_be_t ipaddr) { struct rdsv3_ib_ipaddr *i_ipaddr, *next; + struct rdsv3_ib_ipaddr *to_free = NULL; RDSV3_DPRINTF4("rdsv3_ib_remove_ipaddr", "rds_ibdev: %p, ipaddr: %x", rds_ibdev, ipaddr); - mutex_enter(&rds_ibdev->spinlock); + rw_enter(&rds_ibdev->rwlock, RW_WRITER); RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) { if (i_ipaddr->ipaddr == ipaddr) { list_remove_node(&i_ipaddr->list); - kmem_free(i_ipaddr, sizeof (*i_ipaddr)); + to_free = i_ipaddr; break; } } - mutex_exit(&rds_ibdev->spinlock); + rw_exit(&rds_ibdev->rwlock); + + if (to_free) { + kmem_free(i_ipaddr, sizeof (*i_ipaddr)); + } RDSV3_DPRINTF4("rdsv3_ib_remove_ipaddr", "Return: rds_ibdev: %p, ipaddr: %x", rds_ibdev, ipaddr); @@ -270,12 +264,19 @@ __rdsv3_ib_destroy_conns(struct list *list, kmutex_t *list_lock) void rdsv3_ib_destroy_mr_pool(struct rdsv3_ib_device *rds_ibdev) { + struct rdsv3_fmr_pool *pool = rds_ibdev->fmr_pool; + RDSV3_DPRINTF4("rdsv3_ib_destroy_mr_pool", "Enter: 
ibdev: %p", rds_ibdev); if (rds_ibdev->fmr_pool_hdl == NULL) return; + if (pool) { + list_destroy(&pool->f_list); + kmem_free((void *) pool, sizeof (*pool)); + } + (void) rdsv3_ib_flush_mr_pool(rds_ibdev, rds_ibdev->fmr_pool_hdl, 1); (void) ibt_destroy_fmr_pool(ib_get_ibt_hca_hdl(rds_ibdev->dev), rds_ibdev->fmr_pool_hdl); @@ -288,15 +289,13 @@ rdsv3_ib_create_mr_pool(struct rdsv3_ib_device *rds_ibdev) uint_t h_page_sz; ibt_fmr_pool_attr_t fmr_attr; ibt_status_t ibt_status; - ibt_hca_hdl_t hca_hdl; + struct rdsv3_fmr_pool *pool; RDSV3_DPRINTF4("rdsv3_ib_create_mr_pool", "Enter: ibdev: %p", rds_ibdev); - hca_hdl = ib_get_ibt_hca_hdl(rds_ibdev->dev); - /* get hca attributes */ - ibt_status = ibt_query_hca(hca_hdl, &rds_ibdev->hca_attr); - if (ibt_status != IBT_SUCCESS) { + pool = (struct rdsv3_fmr_pool *)kmem_zalloc(sizeof (*pool), KM_NOSLEEP); + if (pool == NULL) { return (-ENOMEM); } @@ -314,13 +313,23 @@ rdsv3_ib_create_mr_pool(struct rdsv3_ib_device *rds_ibdev) fmr_attr.fmr_func_arg = (void *) NULL; /* create the FMR pool */ - ibt_status = ibt_create_fmr_pool(hca_hdl, rds_ibdev->pd->ibt_pd, - &fmr_attr, &rds_ibdev->fmr_pool_hdl); + ibt_status = ibt_create_fmr_pool(rds_ibdev->ibt_hca_hdl, + rds_ibdev->pd->ibt_pd, &fmr_attr, &rds_ibdev->fmr_pool_hdl); if (ibt_status != IBT_SUCCESS) { + kmem_free((void *) pool, sizeof (*pool)); + rds_ibdev->fmr_pool = NULL; return (-ENOMEM); } + + list_create(&pool->f_list, sizeof (struct rdsv3_ib_mr), + offsetof(struct rdsv3_ib_mr, m_obj)); + mutex_init(&pool->f_lock, NULL, MUTEX_DRIVER, NULL); + rds_ibdev->fmr_pool = pool; rds_ibdev->max_fmrs = fmr_attr.fmr_pool_size; rds_ibdev->fmr_message_size = fmr_attr.fmr_max_pages_per_fmr; + + RDSV3_DPRINTF2("rdsv3_ib_create_mr_pool", + "Exit: ibdev: %p fmr_pool: %p", rds_ibdev, pool); return (0); } @@ -377,7 +386,8 @@ rdsv3_ib_get_mr(struct rdsv3_iovec *args, unsigned long nents, if (ret == 0) { ibmr->umem_cookie = umem_cookie; *key_ret = (uint32_t)ibmr->rc_mem_desc.pmd_rkey; - ibmr->device = rds_ibdev; + ibmr->m_device = rds_ibdev; + ibmr->m_pool = rds_ibdev->fmr_pool; RDSV3_DPRINTF4("rdsv3_ib_get_mr", "Return: ibmr: %p umem_cookie %p", ibmr, ibmr->umem_cookie); return (ibmr); @@ -400,6 +410,7 @@ rdsv3_ib_alloc_fmr(struct rdsv3_ib_device *rds_ibdev) if (rds_ibdev->fmr_pool_hdl) { ibmr = (struct rdsv3_ib_mr *)kmem_zalloc(sizeof (*ibmr), KM_SLEEP); + ibmr->rc_hca_hdl = ib_get_ibt_hca_hdl(rds_ibdev->dev); ibmr->fmr_pool_hdl = rds_ibdev->fmr_pool_hdl; return (ibmr); } @@ -430,18 +441,20 @@ rdsv3_ib_map_fmr(struct rdsv3_ib_device *rds_ibdev, struct rdsv3_ib_mr *ibmr, paddr_list_len = (bp->b_bcount / page_sz) + 2; /* start + end pg */ /* map user buffer to HCA address */ - ibt_status = ibt_map_mem_area(ib_get_ibt_hca_hdl(rds_ibdev->dev), + ibt_status = ibt_map_mem_area(ibmr->rc_hca_hdl, &va_attr, paddr_list_len, ®_req, &ibmr->rc_ma_hdl); if (ibt_status != IBT_SUCCESS) { return (-ENOMEM); } /* use a free entry from FMR pool to register the specified memory */ - ibt_status = ibt_register_physical_fmr( - ib_get_ibt_hca_hdl(rds_ibdev->dev), ibmr->fmr_pool_hdl, + ibt_status = ibt_register_physical_fmr(ibmr->rc_hca_hdl, + ibmr->fmr_pool_hdl, ®_req.fn_arg, &ibmr->rc_fmr_hdl, &ibmr->rc_mem_desc); if (ibt_status != IBT_SUCCESS) { - (void) ibt_unmap_mem_area(ib_get_ibt_hca_hdl(rds_ibdev->dev), + RDSV3_DPRINTF2("rdsv3_ib_map_fmr", "reg_phy_fmr failed %d", + ibt_status); + (void) ibt_unmap_mem_area(ibmr->rc_hca_hdl, ibmr->rc_ma_hdl); if (ibt_status == IBT_INSUFF_RESOURCE) { return (-ENOBUFS); @@ -482,37 +495,67 @@ 
rdsv3_ib_flush_mrs(void) } static void -__rdsv3_ib_teardown_mr(struct rdsv3_ib_mr *ibmr) +rdsv3_ib_drop_mr(struct rdsv3_ib_mr *ibmr) { - RDSV3_DPRINTF4("__rdsv3_ib_teardown_mr", - "Enter: ibmr: %p umem_cookie %p", ibmr, ibmr->umem_cookie); - - /* unpin memory pages */ + /* return the fmr to the IBTF pool */ + (void) ibt_deregister_fmr(ibmr->rc_hca_hdl, ibmr->rc_fmr_hdl); + (void) ibt_unmap_mem_area(ibmr->rc_hca_hdl, ibmr->rc_ma_hdl); (void) ddi_umem_unlock(ibmr->umem_cookie); + kmem_free((void *) ibmr, sizeof (*ibmr)); +} + +void +rdsv3_ib_drain_mrlist_fn(void *data) +{ + struct rdsv3_fmr_pool *pool = (struct rdsv3_fmr_pool *)data; + ibt_hca_hdl_t hca_hdl; + ibt_fmr_pool_hdl_t fmr_pool_hdl; + unsigned int inval; + struct rdsv3_ib_mr *ibmr; + list_t *listp = &pool->f_list; + kmutex_t *lockp = &pool->f_lock; + int i; + + inval = 0; + i = 0; + for (;;) { + mutex_enter(lockp); + ibmr = (struct rdsv3_ib_mr *)list_remove_head(listp); + if (ibmr) + pool->f_listcnt--; + mutex_exit(lockp); + if (!ibmr) + break; + if ((inval == 0) && ibmr->m_inval) { + inval = 1; + hca_hdl = ibmr->rc_hca_hdl; + fmr_pool_hdl = ibmr->fmr_pool_hdl; + } + i++; + rdsv3_ib_drop_mr(ibmr); + } + if (inval) + (void) ibt_flush_fmr_pool(hca_hdl, fmr_pool_hdl); } void rdsv3_ib_free_mr(void *trans_private, int invalidate) { struct rdsv3_ib_mr *ibmr = trans_private; - struct rdsv3_ib_device *rds_ibdev = ibmr->device; + rdsv3_af_thr_t *af_thr; RDSV3_DPRINTF4("rdsv3_ib_free_mr", "Enter: ibmr: %p inv: %d", ibmr, invalidate); - /* return the fmr to the IBTF pool */ - /* the final punch will come from the ibt_flush_fmr_pool() */ - (void) ibt_deregister_fmr(ib_get_ibt_hca_hdl(rds_ibdev->dev), - ibmr->rc_fmr_hdl); - (void) ibt_unmap_mem_area(ib_get_ibt_hca_hdl(rds_ibdev->dev), - ibmr->rc_ma_hdl); - __rdsv3_ib_teardown_mr(ibmr); - if (invalidate) { - rds_ibdev = ibmr->device; - (void) rdsv3_ib_flush_mr_pool(rds_ibdev, - rds_ibdev->fmr_pool_hdl, 0); - } - kmem_free((void *) ibmr, sizeof (*ibmr)); + /* save af_thr at local as ibmr might be freed at mutex_exit */ + af_thr = ibmr->m_device->fmr_soft_cq; + ibmr->m_inval = (unsigned int) invalidate; + mutex_enter(&ibmr->m_pool->f_lock); + list_insert_tail(&ibmr->m_pool->f_list, ibmr); + ibmr->m_pool->f_listcnt++; + mutex_exit(&ibmr->m_pool->f_lock); + + rdsv3_af_thr_fire(af_thr); } static int diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/ib_recv.c b/usr/src/uts/common/io/ib/clients/rdsv3/ib_recv.c index 6099671256..36bc553173 100644 --- a/usr/src/uts/common/io/ib/clients/rdsv3/ib_recv.c +++ b/usr/src/uts/common/io/ib/clients/rdsv3/ib_recv.c @@ -98,6 +98,7 @@ rdsv3_ib_recv_clear_one(struct rdsv3_ib_connection *ic, rdsv3_inc_put(&recv->r_ibinc->ii_inc); recv->r_ibinc = NULL; } + if (recv->r_frag) { kmem_cache_free(ic->rds_ibdev->ib_frag_slab, recv->r_frag); recv->r_frag = NULL; @@ -122,7 +123,7 @@ extern int atomic_add_unless(atomic_t *, uint_t, ulong_t); static int rdsv3_ib_recv_refill_one(struct rdsv3_connection *conn, - struct rdsv3_ib_recv_work *recv, int kmflags) + struct rdsv3_ib_recv_work *recv) { struct rdsv3_ib_connection *ic = conn->c_transport_data; ibt_mi_hdl_t mi_hdl; @@ -132,25 +133,27 @@ rdsv3_ib_recv_refill_one(struct rdsv3_connection *conn, RDSV3_DPRINTF5("rdsv3_ib_recv_refill_one", "conn: %p, recv: %p", conn, recv); - if (recv->r_ibinc == NULL) { + if (!recv->r_ibinc) { if (!atomic_add_unless(&rdsv3_ib_allocation, 1, rdsv3_ib_sysctl_max_recv_allocation)) { rdsv3_ib_stats_inc(s_ib_rx_alloc_limit); goto out; } recv->r_ibinc = kmem_cache_alloc(rdsv3_ib_incoming_slab, - 
kmflags); + KM_NOSLEEP); if (recv->r_ibinc == NULL) { atomic_add_32(&rdsv3_ib_allocation, -1); goto out; } rdsv3_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr); + recv->r_ibinc->ii_ibdev = ic->rds_ibdev; + recv->r_ibinc->ii_pool = ic->rds_ibdev->inc_pool; } - if (recv->r_frag == NULL) { + if (!recv->r_frag) { recv->r_frag = kmem_cache_alloc(ic->rds_ibdev->ib_frag_slab, - kmflags); - if (recv->r_frag == NULL) + KM_NOSLEEP); + if (!recv->r_frag) goto out; } @@ -162,6 +165,11 @@ rdsv3_ib_recv_refill_one(struct rdsv3_connection *conn, return (0); out: + if (recv->r_ibinc) { + kmem_cache_free(rdsv3_ib_incoming_slab, recv->r_ibinc); + atomic_add_32(&rdsv3_ib_allocation, -1); + recv->r_ibinc = NULL; + } return (-ENOMEM); } @@ -174,7 +182,7 @@ out: * -1 is returned if posting fails due to temporary resource exhaustion. */ int -rdsv3_ib_recv_refill(struct rdsv3_connection *conn, int kmflags, int prefill) +rdsv3_ib_recv_refill(struct rdsv3_connection *conn, int prefill) { struct rdsv3_ib_connection *ic = conn->c_transport_data; struct rdsv3_ib_recv_work *recv; @@ -199,13 +207,13 @@ rdsv3_ib_recv_refill(struct rdsv3_connection *conn, int kmflags, int prefill) /* populate the WRs */ for (i = 0; i < avail; i++) { recv = &ic->i_recvs[pos]; - ret = rdsv3_ib_recv_refill_one(conn, recv, kmflags); + ret = rdsv3_ib_recv_refill_one(conn, recv); if (ret) { rdsv3_ib_ring_unalloc(&ic->i_recv_ring, avail - i); break; } - ic->i_recv_wrs[i].wr_id = (ibt_wrid_t)(uintptr_t)recv; + ic->i_recv_wrs[i].wr_id = (ibt_wrid_t)pos; ic->i_recv_wrs[i].wr_nds = RDSV3_IB_RECV_SGE; ic->i_recv_wrs[i].wr_sgl = &recv->r_sge[0]; @@ -239,46 +247,98 @@ rdsv3_ib_recv_refill(struct rdsv3_connection *conn, int kmflags, int prefill) return (ret); } +/* + * delayed freed incoming's + */ +struct rdsv3_inc_pool { + list_t f_list; /* list of freed incoming */ + kmutex_t f_lock; /* lock of fmr pool */ + int32_t f_listcnt; +}; + void -rdsv3_ib_inc_purge(struct rdsv3_incoming *inc) +rdsv3_ib_destroy_inc_pool(struct rdsv3_ib_device *rds_ibdev) { - struct rdsv3_ib_incoming *ibinc; - struct rdsv3_page_frag *frag; - struct rdsv3_page_frag *pos; - struct rdsv3_ib_connection *ic = - (struct rdsv3_ib_connection *)inc->i_conn->c_transport_data; + struct rdsv3_inc_pool *pool = rds_ibdev->inc_pool; + + if (pool) { + list_destroy(&pool->f_list); + kmem_free((void *) pool, sizeof (*pool)); + } +} - RDSV3_DPRINTF4("rdsv3_ib_inc_purge", "inc: %p", inc); +int +rdsv3_ib_create_inc_pool(struct rdsv3_ib_device *rds_ibdev) +{ + struct rdsv3_inc_pool *pool; - ibinc = container_of(inc, struct rdsv3_ib_incoming, ii_inc); - RDSV3_DPRINTF5("rdsv3_ib_inc_purge", - "purging ibinc %p inc %p\n", ibinc, inc); + pool = (struct rdsv3_inc_pool *)kmem_zalloc(sizeof (*pool), KM_NOSLEEP); + if (pool == NULL) { + return (-ENOMEM); + } + list_create(&pool->f_list, sizeof (struct rdsv3_ib_incoming), + offsetof(struct rdsv3_ib_incoming, ii_obj)); + mutex_init(&pool->f_lock, NULL, MUTEX_DRIVER, NULL); + rds_ibdev->inc_pool = pool; + return (0); +} + +static void +rdsv3_ib_inc_drop(struct rdsv3_ib_incoming *ibinc) +{ + struct rdsv3_page_frag *frag; + struct rdsv3_page_frag *pos; RDSV3_FOR_EACH_LIST_NODE_SAFE(frag, pos, &ibinc->ii_frags, f_item) { list_remove_node(&frag->f_item); - kmem_cache_free(ic->rds_ibdev->ib_frag_slab, frag); + kmem_cache_free(ibinc->ii_ibdev->ib_frag_slab, frag); } - RDSV3_DPRINTF4("rdsv3_ib_inc_purge", "Return: inc: %p", inc); + ASSERT(list_is_empty(&ibinc->ii_frags)); + kmem_cache_free(rdsv3_ib_incoming_slab, ibinc); + 
atomic_dec_uint(&rdsv3_ib_allocation); +} + +void +rdsv3_ib_drain_inclist(void *data) +{ + struct rdsv3_inc_pool *pool = (struct rdsv3_inc_pool *)data; + struct rdsv3_ib_incoming *ibinc; + list_t *listp = &pool->f_list; + kmutex_t *lockp = &pool->f_lock; + int i = 0; + + for (;;) { + mutex_enter(lockp); + ibinc = (struct rdsv3_ib_incoming *)list_remove_head(listp); + if (ibinc) + pool->f_listcnt--; + mutex_exit(lockp); + if (!ibinc) + break; + i++; + rdsv3_ib_inc_drop(ibinc); + } } void rdsv3_ib_inc_free(struct rdsv3_incoming *inc) { struct rdsv3_ib_incoming *ibinc; + rdsv3_af_thr_t *af_thr; RDSV3_DPRINTF4("rdsv3_ib_inc_free", "inc: %p", inc); ibinc = container_of(inc, struct rdsv3_ib_incoming, ii_inc); + /* save af_thr in a local as ib_inc might be freed at mutex_exit */ + af_thr = ibinc->ii_ibdev->inc_soft_cq; - rdsv3_ib_inc_purge(inc); - RDSV3_DPRINTF5("rdsv3_ib_inc_free", "freeing ibinc %p inc %p", - ibinc, inc); - ASSERT(list_is_empty(&ibinc->ii_frags)); - kmem_cache_free(rdsv3_ib_incoming_slab, ibinc); - atomic_dec_uint(&rdsv3_ib_allocation); + mutex_enter(&ibinc->ii_pool->f_lock); + list_insert_tail(&ibinc->ii_pool->f_list, ibinc); + ibinc->ii_pool->f_listcnt++; + mutex_exit(&ibinc->ii_pool->f_lock); - RDSV3_DPRINTF4("rdsv3_ib_inc_free", "Return: inc: %p", inc); + rdsv3_af_thr_fire(af_thr); } int @@ -375,7 +435,7 @@ rdsv3_ib_recv_init_ack(struct rdsv3_ib_connection *ic) * room for it beyond the ring size. Send completion notices its special * wr_id and avoids working with the ring in that case. */ -static void +void rdsv3_ib_set_ack(struct rdsv3_ib_connection *ic, uint64_t seq, int ack_required) { @@ -536,42 +596,6 @@ rdsv3_ib_piggyb_ack(struct rdsv3_ib_connection *ic) return (rdsv3_ib_get_ack(ic)); } -static struct rdsv3_header * -rdsv3_ib_get_header(struct rdsv3_connection *conn, - struct rdsv3_ib_recv_work *recv, - uint32_t data_len) -{ - struct rdsv3_ib_connection *ic = conn->c_transport_data; - void *hdr_buff = &ic->i_recv_hdrs[recv - ic->i_recvs]; - - RDSV3_DPRINTF4("rdsv3_ib_get_header", "conn: %p, recv: %p len: %d", - conn, recv, data_len); - - /* - * Support header at the front (RDS 3.1+) as well as header-at-end. - * - * Cases: - * 1) header all in header buff (great!) - * 2) header all in data page (copy all to header buff) - * 3) header split across hdr buf + data page - * (move bit in hdr buff to end before copying other bit from - * data page) - */ - if (conn->c_version > RDS_PROTOCOL_3_0 || data_len == RDSV3_FRAG_SIZE) - return (hdr_buff); - /* - * XXX - Need to discuss the support for version < RDS_PROTOCOL_3_1. - */ - if (conn->c_version == RDS_PROTOCOL_3_0) - return (hdr_buff); - - /* version < RDS_PROTOCOL_3_0 */ - RDSV3_DPRINTF2("rdsv3_ib_get_header", - "NULL header (version: 0x%x, data_len: %d)", conn->c_version, - data_len); - return (NULL); -} - /* * It's kind of lame that we're copying from the posted receive pages into * long-lived bitmaps. We could have posted the bitmaps and rdma written into @@ -661,20 +685,6 @@ XXX conn, ibinc); } -/* - * Rings are posted with all the allocations they'll need to queue the - * incoming message to the receiving socket so this can't fail. - * All fragments start with a header, so we can make sure we're not receiving - * garbage, and we can tell a small 8 byte fragment from an ACK frame. 
- */ -struct rdsv3_ib_ack_state { - uint64_t ack_next; - uint64_t ack_recv; - unsigned int ack_required:1; - unsigned int ack_next_valid:1; - unsigned int ack_recv_valid:1; -}; - static void rdsv3_ib_process_recv(struct rdsv3_connection *conn, struct rdsv3_ib_recv_work *recv, uint32_t data_len, @@ -699,15 +709,7 @@ rdsv3_ib_process_recv(struct rdsv3_connection *conn, } data_len -= sizeof (struct rdsv3_header); - if ((ihdr = rdsv3_ib_get_header(conn, recv, data_len)) == NULL) { - RDSV3_DPRINTF2("rdsv3_ib_process_recv", "incoming message " - "from %u.%u.%u.%u didn't have a proper version (0x%x) or" - "data_len (0x%x), disconnecting and " - "reconnecting", - NIPQUAD(conn->c_faddr), conn->c_version, data_len); - rdsv3_conn_drop(conn); - return; - } + ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; /* Validate the checksum. */ if (!rdsv3_message_verify_checksum(ihdr)) { @@ -735,7 +737,6 @@ rdsv3_ib_process_recv(struct rdsv3_connection *conn, * were rather special beasts. */ rdsv3_ib_stats_inc(s_ib_ack_received); - return; } @@ -745,7 +746,7 @@ rdsv3_ib_process_recv(struct rdsv3_connection *conn, * into the inc and save the inc so we can hang upcoming fragments * off its list. */ - if (ibinc == NULL) { + if (!ibinc) { ibinc = recv->r_ibinc; recv->r_ibinc = NULL; ic->i_ibinc = ibinc; @@ -810,131 +811,57 @@ rdsv3_ib_process_recv(struct rdsv3_connection *conn, conn, recv, data_len, state); } -/* - * Plucking the oldest entry from the ring can be done concurrently with - * the thread refilling the ring. Each ring operation is protected by - * spinlocks and the transient state of refilling doesn't change the - * recording of which entry is oldest. - * - * This relies on IB only calling one cq comp_handler for each cq so that - * there will only be one caller of rdsv3_recv_incoming() per RDS connection. - */ - void -rdsv3_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context) -{ - struct rdsv3_connection *conn = context; - struct rdsv3_ib_connection *ic = conn->c_transport_data; - - RDSV3_DPRINTF4("rdsv3_ib_recv_cq_comp_handler", - "Enter(conn: %p cq: %p)", conn, cq); - - rdsv3_ib_stats_inc(s_ib_rx_cq_call); - - (void) ddi_taskq_dispatch(ic->i_recv_tasklet, rdsv3_ib_recv_tasklet_fn, - (void *)ic, DDI_SLEEP); -} - -static inline void -rdsv3_poll_cq(struct rdsv3_ib_connection *ic, struct rdsv3_ib_ack_state *state) +rdsv3_ib_recv_cqe_handler(struct rdsv3_ib_connection *ic, ibt_wc_t *wc, + struct rdsv3_ib_ack_state *state) { struct rdsv3_connection *conn = ic->conn; - ibt_wc_t wc; struct rdsv3_ib_recv_work *recv; - uint_t polled; + struct rdsv3_ib_work_ring *recv_ringp = &ic->i_recv_ring; - while (ibt_poll_cq(RDSV3_CQ2CQHDL(ic->i_recv_cq), &wc, 1, &polled) == - IBT_SUCCESS) { - RDSV3_DPRINTF5("rdsv3_ib_recv_cq_comp_handler", - "rwc wr_id 0x%llx status %u byte_len %u imm_data %u\n", - (unsigned long long)wc.wc_id, wc.wc_status, - wc.wc_bytes_xfer, ntohl(wc.wc_immed_data)); - rdsv3_ib_stats_inc(s_ib_rx_cq_event); + RDSV3_DPRINTF4("rdsv3_ib_recv_cqe_handler", + "rwc wc_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc->wc_id, wc->wc_status, + wc->wc_bytes_xfer, ntohl(wc->wc_immed_data)); - recv = (struct rdsv3_ib_recv_work *)(uintptr_t)wc.wc_id; - - /* - * Also process recvs in connecting state because it is possible - * to get a recv completion _before_ the rdmacm ESTABLISHED - * event is processed. 
- */ - if (rdsv3_conn_up(conn) || rdsv3_conn_connecting(conn)) { - /* - * We expect errors as the qp is drained during - * shutdown - */ - if (wc.wc_status == IBT_WC_SUCCESS) { - rdsv3_ib_process_recv(conn, recv, - wc.wc_bytes_xfer, state); - } else { - RDSV3_DPRINTF2("rdsv3_ib_recv_cq_comp_handler", - "recv completion on " - "%u.%u.%u.%u had status %u, " - "disconnecting and reconnecting\n", - NIPQUAD(conn->c_faddr), - wc.wc_status); - rdsv3_conn_drop(conn); - } - } + rdsv3_ib_stats_inc(s_ib_rx_cq_event); - rdsv3_ib_ring_free(&ic->i_recv_ring, 1); - } -} + recv = &ic->i_recvs[rdsv3_ib_ring_oldest(recv_ringp)]; -static processorid_t rdsv3_taskq_bind_cpuid = 0; -void -rdsv3_ib_recv_tasklet_fn(void *data) -{ - struct rdsv3_ib_connection *ic = (struct rdsv3_ib_connection *)data; - struct rdsv3_connection *conn = ic->conn; - struct rdsv3_ib_ack_state state = { 0, }; - cpu_t *cp; - - RDSV3_DPRINTF4("rdsv3_ib_recv_tasklet_fn", "Enter: ic: %p", ic); - - /* If not already bound, bind this thread to a CPU */ - if (ic->i_recv_tasklet_cpuid != rdsv3_taskq_bind_cpuid) { - cp = cpu[rdsv3_taskq_bind_cpuid]; - mutex_enter(&cpu_lock); - if (cpu_is_online(cp)) { - if (ic->i_recv_tasklet_cpuid >= 0) - thread_affinity_clear(curthread); - thread_affinity_set(curthread, rdsv3_taskq_bind_cpuid); - ic->i_recv_tasklet_cpuid = rdsv3_taskq_bind_cpuid; + /* + * Also process recvs in connecting state because it is possible + * to get a recv completion _before_ the rdmacm ESTABLISHED + * event is processed. + */ + if (rdsv3_conn_up(conn) || rdsv3_conn_connecting(conn)) { + /* We expect errors as the qp is drained during shutdown */ + if (wc->wc_status == IBT_WC_SUCCESS) { + rdsv3_ib_process_recv(conn, recv, + wc->wc_bytes_xfer, state); + } else { + RDSV3_DPRINTF2("rdsv3_ib_recv_cqe_handler", + "recv completion on " + "%u.%u.%u.%u had status %u, " + "disconnecting and reconnecting\n", + NIPQUAD(conn->c_faddr), + wc->wc_status); + rdsv3_conn_drop(conn); } - mutex_exit(&cpu_lock); } - rdsv3_poll_cq(ic, &state); - (void) ibt_enable_cq_notify(RDSV3_CQ2CQHDL(ic->i_recv_cq), - IBT_NEXT_SOLICITED); - rdsv3_poll_cq(ic, &state); - - if (state.ack_next_valid) - rdsv3_ib_set_ack(ic, state.ack_next, state.ack_required); - if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { - rdsv3_send_drop_acked(conn, state.ack_recv, NULL); - ic->i_ack_recv = state.ack_recv; - } - if (rdsv3_conn_up(conn)) - rdsv3_ib_attempt_ack(ic); + rdsv3_ib_ring_free(recv_ringp, 1); /* * If we ever end up with a really empty receive ring, we're * in deep trouble, as the sender will definitely see RNR * timeouts. */ - if (rdsv3_ib_ring_empty(&ic->i_recv_ring)) + if (rdsv3_ib_ring_empty(recv_ringp)) rdsv3_ib_stats_inc(s_ib_rx_ring_empty); - /* - * If the ring is running low, then schedule the thread to refill. - */ - if (rdsv3_ib_ring_low(&ic->i_recv_ring) && - (rdsv3_conn_up(conn) || rdsv3_conn_connecting(conn))) - rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 0); - - RDSV3_DPRINTF4("rdsv3_ib_recv_tasklet_fn", "Return: ic: %p", ic); + if (rdsv3_ib_ring_low(recv_ringp)) { + rdsv3_af_thr_fire(ic->i_refill_rq); + } } int @@ -945,17 +872,6 @@ rdsv3_ib_recv(struct rdsv3_connection *conn) RDSV3_DPRINTF4("rdsv3_ib_recv", "conn %p\n", conn); - /* - * If we get a temporary posting failure in this context then - * we're really low and we want the caller to back off for a bit. 
- */ - mutex_enter(&ic->i_recv_mutex); - if (rdsv3_ib_recv_refill(conn, KM_NOSLEEP, 0)) - ret = -ENOMEM; - else - rdsv3_ib_stats_inc(s_ib_rx_refill_from_thread); - mutex_exit(&ic->i_recv_mutex); - if (rdsv3_conn_up(conn)) rdsv3_ib_attempt_ack(ic); @@ -975,7 +891,7 @@ rdsv3_ib_recv_init(void) rdsv3_ib_incoming_slab = kmem_cache_create("rdsv3_ib_incoming", sizeof (struct rdsv3_ib_incoming), 0, rdsv3_ib_inc_constructor, rdsv3_ib_inc_destructor, NULL, NULL, NULL, 0); - if (rdsv3_ib_incoming_slab == NULL) { + if (!rdsv3_ib_incoming_slab) { RDSV3_DPRINTF2("rdsv3_ib_recv_init", "kmem_cache_create " "failed"); return (-ENOMEM); diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/ib_ring.c b/usr/src/uts/common/io/ib/clients/rdsv3/ib_ring.c index d37d1f0eb2..1dfa761354 100644 --- a/usr/src/uts/common/io/ib/clients/rdsv3/ib_ring.c +++ b/usr/src/uts/common/io/ib/clients/rdsv3/ib_ring.c @@ -83,11 +83,6 @@ * more entries. */ -/* - * This only happens on shutdown. - */ -rdsv3_wait_queue_t rdsv3_ib_ring_empty_wait; - void rdsv3_ib_ring_init(struct rdsv3_ib_work_ring *ring, uint32_t nr) { @@ -155,7 +150,7 @@ rdsv3_ib_ring_free(struct rdsv3_ib_work_ring *ring, uint32_t val) atomic_add_32(&ring->w_free_ctr, val); if (__rdsv3_ib_ring_empty(ring)) - rdsv3_wake_up(&rdsv3_ib_ring_empty_wait); + rdsv3_wake_up(&ring->w_empty_wait); } void diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/ib_send.c b/usr/src/uts/common/io/ib/clients/rdsv3/ib_send.c index 9f49ab1da3..44c39c648c 100644 --- a/usr/src/uts/common/io/ib/clients/rdsv3/ib_send.c +++ b/usr/src/uts/common/io/ib/clients/rdsv3/ib_send.c @@ -122,8 +122,13 @@ rdsv3_ib_send_unmap_rm(struct rdsv3_ib_connection *ic, RDSV3_DPRINTF4("rdsv3_ib_send_unmap_rm", "ic %p send %p rm %p\n", ic, send, rm); - rdsv3_ib_dma_unmap_sg(ic->i_cm_id->device, - rm->m_sg, rm->m_nents); + mutex_enter(&rm->m_rs_lock); + if (rm->m_count) { + rdsv3_ib_dma_unmap_sg(ic->i_cm_id->device, + rm->m_sg, rm->m_count); + rm->m_count = 0; + } + mutex_exit(&rm->m_rs_lock); if (rm->m_rdma_op != NULL) { rdsv3_ib_send_unmap_rdma(ic, rm->m_rdma_op); @@ -213,128 +218,110 @@ rdsv3_ib_send_clear_ring(struct rdsv3_ib_connection *ic) * the next to be freed, which is what this is concerned with. 
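The ib_ring.c hunk above replaces the single global rdsv3_ib_ring_empty_wait with a w_empty_wait embedded in each work ring, so draining one connection's ring at shutdown wakes only that ring's waiters. A pthread sketch of the per-object waitqueue pattern (illustrative names; the driver's waitqueue wrappers are not shown):

#include <pthread.h>

struct ring {
    unsigned        used;
    pthread_mutex_t lock;
    pthread_cond_t  empty_wait;     /* per-ring, not global */
};

void
ring_free_entries(struct ring *r, unsigned n)
{
    pthread_mutex_lock(&r->lock);
    r->used -= n;
    if (r->used == 0)               /* wake only this ring's waiters */
        pthread_cond_broadcast(&r->empty_wait);
    pthread_mutex_unlock(&r->lock);
}

void
ring_wait_empty(struct ring *r)     /* shutdown-side wait */
{
    pthread_mutex_lock(&r->lock);
    while (r->used != 0)
        pthread_cond_wait(&r->empty_wait, &r->lock);
    pthread_mutex_unlock(&r->lock);
}

int
main(void)
{
    struct ring r = { 0, PTHREAD_MUTEX_INITIALIZER,
        PTHREAD_COND_INITIALIZER };

    ring_free_entries(&r, 0);
    ring_wait_empty(&r);            /* returns immediately: already empty */
    return (0);
}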
*/ void -rdsv3_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) +rdsv3_ib_send_cqe_handler(struct rdsv3_ib_connection *ic, ibt_wc_t *wc) { - struct rdsv3_connection *conn = context; - struct rdsv3_ib_connection *ic = conn->c_transport_data; - ibt_wc_t wc; + struct rdsv3_connection *conn = ic->conn; struct rdsv3_ib_send_work *send; uint32_t completed, polled; uint32_t oldest; uint32_t i = 0; int ret; - RDSV3_DPRINTF4("rdsv3_ib_send_cq_comp_handler", "conn: %p cq: %p", - conn, cq); - - rdsv3_ib_stats_inc(s_ib_tx_cq_call); - ret = ibt_enable_cq_notify(RDSV3_CQ2CQHDL(cq), IBT_NEXT_COMPLETION); - if (ret) - RDSV3_DPRINTF2("rdsv3_ib_send_cq_comp_handler", - "ib_req_notify_cq send failed: %d", ret); - - while (ibt_poll_cq(RDSV3_CQ2CQHDL(cq), &wc, 1, &polled) == - IBT_SUCCESS) { - RDSV3_DPRINTF5("rdsv3_ib_send_cq_comp_handler", - "swc wr_id 0x%llx status %u byte_len %u imm_data %u\n", - (unsigned long long)wc.wc_id, wc.wc_status, - wc.wc_bytes_xfer, ntohl(wc.wc_immed_data)); - rdsv3_ib_stats_inc(s_ib_tx_cq_event); - - if (wc.wc_id == RDSV3_IB_ACK_WR_ID) { - if (ic->i_ack_queued + HZ/2 < jiffies) - rdsv3_ib_stats_inc(s_ib_tx_stalled); - rdsv3_ib_ack_send_complete(ic); - continue; - } + RDSV3_DPRINTF4("rdsv3_ib_send_cqe_handler", + "wc wc_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc->wc_id, wc->wc_status, + wc->wc_bytes_xfer, ntohl(wc->wc_immed_data)); - oldest = rdsv3_ib_ring_oldest(&ic->i_send_ring); + rdsv3_ib_stats_inc(s_ib_tx_cq_event); + + if (wc->wc_id == RDSV3_IB_ACK_WR_ID) { + if (ic->i_ack_queued + HZ/2 < jiffies) + rdsv3_ib_stats_inc(s_ib_tx_stalled); + rdsv3_ib_ack_send_complete(ic); + return; + } - completed = rdsv3_ib_ring_completed(&ic->i_send_ring, - wc.wc_id, oldest); + oldest = rdsv3_ib_ring_oldest(&ic->i_send_ring); - for (i = 0; i < completed; i++) { - send = &ic->i_sends[oldest]; + completed = rdsv3_ib_ring_completed(&ic->i_send_ring, + (wc->wc_id & ~RDSV3_IB_SEND_OP), oldest); + for (i = 0; i < completed; i++) { + send = &ic->i_sends[oldest]; + + /* + * In the error case, wc->opcode sometimes contains + * garbage + */ + switch (send->s_opcode) { + case IBT_WRC_SEND: + if (send->s_rm) + rdsv3_ib_send_unmap_rm(ic, send, + wc->wc_status); + break; + case IBT_WRC_RDMAW: + case IBT_WRC_RDMAR: /* - * In the error case, wc.opcode sometimes contains - * garbage + * Nothing to be done - the SG list will + * be unmapped + * when the SEND completes. */ - switch (send->s_opcode) { - case IBT_WRC_SEND: - if (send->s_rm) - rdsv3_ib_send_unmap_rm(ic, send, - wc.wc_status); - break; - case IBT_WRC_RDMAW: - case IBT_WRC_RDMAR: - /* - * Nothing to be done - the SG list will - * be unmapped - * when the SEND completes. - */ - break; - default: + break; + default: #ifndef __lock_lint - RDSV3_DPRINTF2("rdsv3_ib_send_cq_comp_handler", - "RDS/IB: %s: unexpected opcode " - "0x%x in WR!", - __func__, send->s_opcode); + RDSV3_DPRINTF2("rdsv3_ib_send_cq_comp_handler", + "RDS/IB: %s: unexpected opcode " + "0x%x in WR!", + __func__, send->s_opcode); #endif - break; - } + break; + } - send->s_opcode = 0xdd; - if (send->s_queued + HZ/2 < jiffies) - rdsv3_ib_stats_inc(s_ib_tx_stalled); + send->s_opcode = 0xdd; + if (send->s_queued + HZ/2 < jiffies) + rdsv3_ib_stats_inc(s_ib_tx_stalled); - /* - * If a RDMA operation produced an error, signal - * this right - * away. If we don't, the subsequent SEND that goes - * with this - * RDMA will be canceled with ERR_WFLUSH, and the - * application - * never learn that the RDMA failed. 
- */ - if (wc.wc_status == - IBT_WC_REMOTE_ACCESS_ERR && send->s_op) { - struct rdsv3_message *rm; - - rm = rdsv3_send_get_message(conn, send->s_op); - if (rm) { - if (rm->m_rdma_op != NULL) - rdsv3_ib_send_unmap_rdma(ic, - rm->m_rdma_op); - rdsv3_ib_send_rdma_complete(rm, - wc.wc_status); - rdsv3_message_put(rm); - } + /* + * If a RDMA operation produced an error, signal + * this right + * away. If we don't, the subsequent SEND that goes + * with this + * RDMA will be canceled with ERR_WFLUSH, and the + * application + * never learn that the RDMA failed. + */ + if (wc->wc_status == + IBT_WC_REMOTE_ACCESS_ERR && send->s_op) { + struct rdsv3_message *rm; + + rm = rdsv3_send_get_message(conn, send->s_op); + if (rm) { + if (rm->m_rdma_op != NULL) + rdsv3_ib_send_unmap_rdma(ic, + rm->m_rdma_op); + rdsv3_ib_send_rdma_complete(rm, + wc->wc_status); + rdsv3_message_put(rm); } - - oldest = (oldest + 1) % ic->i_send_ring.w_nr; } - RDSV3_DPRINTF4("rdsv3_ib_send_cq_comp_handler", "compl: %d", - completed); - rdsv3_ib_ring_free(&ic->i_send_ring, completed); + oldest = (oldest + 1) % ic->i_send_ring.w_nr; + } - if (test_and_clear_bit(RDSV3_LL_SEND_FULL, &conn->c_flags) || - test_bit(0, &conn->c_map_queued)) - rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0); + rdsv3_ib_ring_free(&ic->i_send_ring, completed); - /* We expect errors as the qp is drained during shutdown */ - if (wc.wc_status != IBT_WC_SUCCESS && rdsv3_conn_up(conn)) { - RDSV3_DPRINTF2("rdsv3_ib_send_cq_comp_handler", - "send completion on %u.%u.%u.%u " - "had status %u, disconnecting and reconnecting\n", - NIPQUAD(conn->c_faddr), wc.wc_status); - rdsv3_conn_drop(conn); - } + clear_bit(RDSV3_LL_SEND_FULL, &conn->c_flags); + + /* We expect errors as the qp is drained during shutdown */ + if (wc->wc_status != IBT_WC_SUCCESS && rdsv3_conn_up(conn)) { + RDSV3_DPRINTF2("rdsv3_ib_send_cqe_handler", + "send completion on %u.%u.%u.%u " + "had status %u, disconnecting and reconnecting\n", + NIPQUAD(conn->c_faddr), wc->wc_status); + rdsv3_conn_drop(conn); } - RDSV3_DPRINTF4("rdsv3_ib_send_cq_comp_handler", - "Return: conn: %p, cq: %p", conn, cq); + RDSV3_DPRINTF4("rdsv3_ib_send_cqe_handler", "Return: conn: %p", ic); } /* @@ -512,7 +499,7 @@ rdsv3_ib_xmit_populate_wr(struct rdsv3_ib_connection *ic, "ic: %p, wr: %p scat: %p %d %d %d %d", ic, wr, scat, pos, off, length, send_flags); - wr->wr_id = pos; + wr->wr_id = pos | RDSV3_IB_SEND_OP; wr->wr_trans = IBT_RC_SRV; wr->wr_flags = send_flags; wr->wr_opcode = IBT_WRC_SEND; @@ -622,7 +609,8 @@ rdsv3_ib_xmit(struct rdsv3_connection *conn, struct rdsv3_message *rm, #endif work_alloc = rdsv3_ib_ring_alloc(&ic->i_send_ring, i, &pos); - if (work_alloc == 0) { + if (work_alloc != i) { + rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc); set_bit(RDSV3_LL_SEND_FULL, &conn->c_flags); rdsv3_ib_stats_inc(s_ib_tx_ring_full); ret = -ENOMEM; @@ -886,6 +874,7 @@ add_header: } RDSV3_DPRINTF2("rdsv3_ib_xmit", "ibt_post_send failed\n"); rdsv3_conn_drop(ic->conn); + ret = -EAGAIN; goto out; } @@ -1052,6 +1041,7 @@ rdsv3_ib_xmit_rdma(struct rdsv3_connection *conn, struct rdsv3_rdma_op *op) return (-ENOMEM); } + RDSV3_DPRINTF4("rdsv3_ib_xmit_rdma", "pos %u cnt %u", pos, op->r_count); /* * take the scatter list and transpose into a list of * send wr's each with a scatter list of RDSV3_IB_MAX_SGE @@ -1071,7 +1061,7 @@ rdsv3_ib_xmit_rdma(struct rdsv3_connection *conn, struct rdsv3_rdma_op *op) wr = &ic->i_send_wrs[k]; wr->wr_flags = 0; - wr->wr_id = pos; + wr->wr_id = pos | RDSV3_IB_SEND_OP; wr->wr_trans = IBT_RC_SRV; 
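The wr_id changes in this file cooperate with the move to a shared completion queue: every data send and RDMA work request gets RDSV3_IB_SEND_OP (bit 63) ORed into its id, the ACK pseudo-send keeps its all-ones magic id, and the completion handler masks the tag back off before computing ring positions (wc->wc_id & ~RDSV3_IB_SEND_OP above). A small self-contained check of that tag-and-mask scheme:

#include <assert.h>
#include <stdint.h>

#define SEND_OP_TAG  (1ULL << 63)       /* high bit marks a send-ring id */
#define ACK_WR_ID    (~(uint64_t)0)     /* magic id, never a ring index */

static uint64_t
make_wr_id(uint32_t ring_pos)
{
    return ((uint64_t)ring_pos | SEND_OP_TAG);
}

static uint32_t
wr_id_to_pos(uint64_t wr_id)
{
    assert(wr_id != ACK_WR_ID);         /* ACKs are handled earlier */
    return ((uint32_t)(wr_id & ~SEND_OP_TAG));
}

int
main(void)
{
    assert(wr_id_to_pos(make_wr_id(42)) == 42);
    return (0);
}

Because the ACK id is all ones, it can never collide with a tagged ring position, which is what lets the handler test it before masking.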
wr->wr_opcode = op->r_write ? IBT_WRC_RDMAW : IBT_WRC_RDMAR; @@ -1093,11 +1083,11 @@ rdsv3_ib_xmit_rdma(struct rdsv3_connection *conn, struct rdsv3_rdma_op *op) remote_addr += scat[i].swr.wr_sgl[idx].ds_len; sent += scat[i].swr.wr_sgl[idx].ds_len; idx++; - RDSV3_DPRINTF4("xmit_rdma", + RDSV3_DPRINTF5("xmit_rdma", "send_wrs[%d]sgl[%d] va %llx len %x", k, j, sge->ds_va, sge->ds_len); } - RDSV3_DPRINTF4("rdsv3_ib_xmit_rdma", + RDSV3_DPRINTF5("rdsv3_ib_xmit_rdma", "wr[%d] %p key: %x code: %d tlen: %d", k, wr, wr->wr.rc.rcwr.rdma.rdma_rkey, wr->wr_opcode, sent); @@ -1125,6 +1115,7 @@ rdsv3_ib_xmit_rdma(struct rdsv3_connection *conn, struct rdsv3_rdma_op *op) "returned %d", NIPQUAD(conn->c_faddr), status); rdsv3_ib_ring_unalloc(&ic->i_send_ring, work_alloc); } + RDSV3_DPRINTF4("rdsv3_ib_xmit_rdma", "Ret: %p", ic); return (status); } diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/info.c b/usr/src/uts/common/io/ib/clients/rdsv3/info.c index f516e7c93a..cfb4899cc5 100644 --- a/usr/src/uts/common/io/ib/clients/rdsv3/info.c +++ b/usr/src/uts/common/io/ib/clients/rdsv3/info.c @@ -93,6 +93,7 @@ rdsv3_info_register_func(int optname, rdsv3_info_func func) ASSERT(optname >= RDSV3_INFO_FIRST && optname <= RDSV3_INFO_LAST); mutex_enter(&rdsv3_info_lock); + ASSERT(!rdsv3_info_funcs[offset]); rdsv3_info_funcs[offset] = func; mutex_exit(&rdsv3_info_lock); } diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/loop.c b/usr/src/uts/common/io/ib/clients/rdsv3/loop.c index b625af893b..dae15b5132 100644 --- a/usr/src/uts/common/io/ib/clients/rdsv3/loop.c +++ b/usr/src/uts/common/io/ib/clients/rdsv3/loop.c @@ -85,12 +85,18 @@ rdsv3_loop_xmit(struct rdsv3_connection *conn, struct rdsv3_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off) { + /* Do not send cong updates to loopback */ + if (rm->m_inc.i_hdr.h_flags & RDSV3_FLAG_CONG_BITMAP) { + rdsv3_cong_map_updated(conn->c_fcong, ~(uint64_t)0); + return (sizeof (struct rdsv3_header) + RDSV3_CONG_MAP_BYTES); + } ASSERT(!(hdr_off || sg || off)); RDSV3_DPRINTF4("rdsv3_loop_xmit", "Enter(conn: %p, rm: %p)", conn, rm); rdsv3_inc_init(&rm->m_inc, conn, conn->c_laddr); - rdsv3_message_addref(rm); /* for the inc */ + /* For the embedded inc. Matching put is in loop_inc_free() */ + rdsv3_message_addref(rm); rdsv3_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc, KM_NOSLEEP); @@ -106,6 +112,18 @@ rdsv3_loop_xmit(struct rdsv3_connection *conn, struct rdsv3_message *rm, ntohl(rm->m_inc.i_hdr.h_len)); } +/* + * See rds_loop_xmit(). Since our inc is embedded in the rm, we + * make sure the rm lives at least until the inc is done. 
+ */ +static void +rdsv3_loop_inc_free(struct rdsv3_incoming *inc) +{ + struct rdsv3_message *rm = container_of(inc, struct rdsv3_message, + m_inc); + rdsv3_message_put(rm); +} + static int rdsv3_loop_xmit_cong_map(struct rdsv3_connection *conn, struct rdsv3_cong_map *map, @@ -151,7 +169,7 @@ rdsv3_loop_conn_alloc(struct rdsv3_connection *conn, int gfp) RDSV3_DPRINTF4("rdsv3_loop_conn_alloc", "Enter(conn: %p)", conn); lc = kmem_zalloc(sizeof (struct rdsv3_loop_connection), KM_NOSLEEP); - if (lc == NULL) + if (!lc) return (-ENOMEM); list_link_init(&lc->loop_node); @@ -232,8 +250,7 @@ struct rdsv3_transport rdsv3_loop_transport = { .conn_connect = rdsv3_loop_conn_connect, .conn_shutdown = rdsv3_loop_conn_shutdown, .inc_copy_to_user = rdsv3_message_inc_copy_to_user, - .inc_purge = rdsv3_message_inc_purge, - .inc_free = rdsv3_message_inc_free, + .inc_free = rdsv3_loop_inc_free, .t_name = "loopback", }; #else diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/message.c b/usr/src/uts/common/io/ib/clients/rdsv3/message.c index 9a266c76f0..f2349821cc 100644 --- a/usr/src/uts/common/io/ib/clients/rdsv3/message.c +++ b/usr/src/uts/common/io/ib/clients/rdsv3/message.c @@ -60,8 +60,6 @@ #include <sys/ib/clients/rdsv3/rdma.h> #include <sys/ib/clients/rdsv3/rdsv3_debug.h> -static rdsv3_wait_queue_t rdsv3_message_flush_waitq; - #ifndef __lock_lint static unsigned int rdsv3_exthdr_size[__RDSV3_EXTHDR_MAX] = { [RDSV3_EXTHDR_NONE] = 0, @@ -132,14 +130,6 @@ rdsv3_message_purge(struct rdsv3_message *rm) } void -rdsv3_message_inc_purge(struct rdsv3_incoming *inc) -{ - struct rdsv3_message *rm = - container_of(inc, struct rdsv3_message, m_inc); - rdsv3_message_purge(rm); -} - -void rdsv3_message_put(struct rdsv3_message *rm) { RDSV3_DPRINTF5("rdsv3_message_put", @@ -313,6 +303,7 @@ rdsv3_message_alloc(unsigned int nents, int gfp) list_link_init(&rm->m_sock_item); list_link_init(&rm->m_conn_item); mutex_init(&rm->m_rs_lock, NULL, MUTEX_DRIVER, NULL); + rdsv3_init_waitqueue(&rm->m_flush_wait); RDSV3_DPRINTF4("rdsv3_message_alloc", "Return(rm: %p)", rm); out: @@ -399,7 +390,6 @@ rdsv3_message_copy_from_user(struct uio *uiop, total_len -= rdsv3_sg_len(sg); sg++; } - ret = 0; out: if (ret) { @@ -462,10 +452,11 @@ rdsv3_message_inc_copy_to_user(struct rdsv3_incoming *inc, * If the message is still on the send queue, wait until the transport * is done with it. This is particularly important for RDMA operations. */ +/* ARGSUSED */ void rdsv3_message_wait(struct rdsv3_message *rm) { - rdsv3_wait_event(&rdsv3_message_flush_waitq, + rdsv3_wait_event(&rm->m_flush_wait, !test_bit(RDSV3_MSG_MAPPED, &rm->m_flags)); } @@ -473,5 +464,5 @@ void rdsv3_message_unmapped(struct rdsv3_message *rm) { clear_bit(RDSV3_MSG_MAPPED, &rm->m_flags); - rdsv3_wake_up_all(&rdsv3_message_flush_waitq); + rdsv3_wake_up_all(&rm->m_flush_wait); } diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/page.c b/usr/src/uts/common/io/ib/clients/rdsv3/page.c index 356917c711..c07a6cdffc 100644 --- a/usr/src/uts/common/io/ib/clients/rdsv3/page.c +++ b/usr/src/uts/common/io/ib/clients/rdsv3/page.c @@ -75,7 +75,7 @@ rdsv3_page_remainder_alloc(struct rdsv3_scatterlist *scat, unsigned long bytes, if (bytes >= PAGE_SIZE) { page = kmem_alloc(PAGE_SIZE, gfp); - if (page == NULL) { + if (!page) { ret = -ENOMEM; } else { rdsv3_sg_set_page(scat, page, PAGE_SIZE, 0); @@ -88,7 +88,7 @@ rdsv3_page_remainder_alloc(struct rdsv3_scatterlist *scat, unsigned long bytes, * XXX - This is not same as linux. 
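rdsv3_loop_inc_free() above works because the rdsv3_incoming is embedded inside the rdsv3_message, so container_of() recovers the owning message and drops the reference taken in rdsv3_loop_xmit(). A compact user-space rendering of the idiom (simplified, non-atomic refcount for illustration only):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct incoming { int seq; };

struct message {
    int             refcount;
    struct incoming m_inc;      /* embedded, not pointed-to */
};

static void
inc_free(struct incoming *inc)
{
    struct message *rm = container_of(inc, struct message, m_inc);

    if (--rm->refcount == 0)
        printf("message %p released\n", (void *)rm);
}

int
main(void)
{
    struct message m = { .refcount = 1 };

    inc_free(&m.m_inc);
    return (0);
}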
*/ page = kmem_alloc(bytes, KM_NOSLEEP); - if (page == NULL) { + if (!page) { ret = -ENOMEM; goto out; } diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/rdma.c b/usr/src/uts/common/io/ib/clients/rdsv3/rdma.c index 30bf6e7f30..4d4c6f5db2 100644 --- a/usr/src/uts/common/io/ib/clients/rdsv3/rdma.c +++ b/usr/src/uts/common/io/ib/clients/rdsv3/rdma.c @@ -176,24 +176,6 @@ rdsv3_rdma_drop_keys(struct rdsv3_sock *rs) rs->rs_transport->flush_mrs(); } -/* - * Helper function to pin user pages. - */ -#if 0 -static int -rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, - struct page **pages, int write) -{ - unsigned long l_user_addr = user_addr; - unsigned int l_nr_pages = nr_pages; - struct page **l_pages = pages; - int l_write = write; - - /* memory pin in rds_ib_get_mr() */ - return (0); -} -#endif - static int __rdsv3_rdma_map(struct rdsv3_sock *rs, struct rdsv3_get_mr_args *args, uint64_t *cookie_ret, struct rdsv3_mr **mr_ret) @@ -209,13 +191,13 @@ __rdsv3_rdma_map(struct rdsv3_sock *rs, struct rdsv3_get_mr_args *args, goto out; } - if (rs->rs_transport->get_mr == NULL) { + if (!rs->rs_transport->get_mr) { ret = -EOPNOTSUPP; goto out; } mr = kmem_zalloc(sizeof (struct rdsv3_mr), KM_NOSLEEP); - if (mr == NULL) { + if (!mr) { ret = -ENOMEM; goto out; } @@ -418,12 +400,18 @@ rdsv3_rdma_unuse(struct rdsv3_sock *rs, uint32_t r_key, int force) mutex_enter(&rs->rs_rdma_lock); mr = rdsv3_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); - if (mr && (mr->r_use_once || force)) { + if (!mr) { + RDSV3_DPRINTF4("rdsv3_rdma_unuse", + "rdsv3: trying to unuse MR with unknown r_key %u!", r_key); + mutex_exit(&rs->rs_rdma_lock); + return; + } + + if (mr->r_use_once || force) { avl_remove(&rs->rs_rdma_keys, &mr->r_rb_node); RB_CLEAR_NODE(&mr->r_rb_node); zot_me = 1; - } else if (mr) - atomic_add_32(&mr->r_refcount, 1); + } mutex_exit(&rs->rs_rdma_lock); /* @@ -431,21 +419,16 @@ rdsv3_rdma_unuse(struct rdsv3_sock *rs, uint32_t r_key, int force) * Note we could avoid this if the operation was a RDMA READ, * but at this point we can't tell. */ - if (mr != NULL) { - RDSV3_DPRINTF4("rdsv3_rdma_unuse", "mr: %p zot_me %d", - mr, zot_me); - if (mr->r_trans->sync_mr) - mr->r_trans->sync_mr(mr->r_trans_private, - DMA_FROM_DEVICE); + if (mr->r_trans->sync_mr) + mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); - /* - * If the MR was marked as invalidate, this will - * trigger an async flush. - */ - if (zot_me) - rdsv3_destroy_mr(mr); - rdsv3_mr_put(mr); - } + /* + * If the MR was marked as invalidate, this will + * trigger an async flush. 
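The restructured rdsv3_rdma_unuse() above returns early for an unknown r_key, removes a use-once (or force-flushed) MR from the key tree while holding the lock, and defers the sync and final put until after the lock is dropped. A much-simplified model of that lookup/conditionally-remove/put shape, with the AVL lookup abstracted into a parameter (illustrative only):

#include <pthread.h>
#include <stdio.h>

struct mr {
    int refs;
    int use_once;
    int in_tree;
};

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

static void
mr_put(struct mr *mr)
{
    if (--mr->refs == 0)
        printf("mr freed\n");
}

static void
mr_unuse(struct mr *mr, int force)  /* mr == NULL models unknown r_key */
{
    int zot_me = 0;

    pthread_mutex_lock(&tree_lock);
    if (mr == NULL) {               /* early return: nothing to do */
        pthread_mutex_unlock(&tree_lock);
        return;
    }
    if (mr->use_once || force) {    /* drop the tree's reference */
        mr->in_tree = 0;
        zot_me = 1;
    }
    pthread_mutex_unlock(&tree_lock);

    /* sync/teardown happens outside the lock */
    if (zot_me)
        mr_put(mr);
}

int
main(void)
{
    struct mr m = { .refs = 1, .use_once = 1, .in_tree = 1 };

    mr_unuse(&m, 0);
    return (0);
}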
+ */ + if (zot_me) + rdsv3_destroy_mr(mr); + rdsv3_mr_put(mr); RDSV3_DPRINTF4("rdsv3_rdma_unuse", "Return"); } @@ -455,7 +438,6 @@ rdsv3_rdma_free_op(struct rdsv3_rdma_op *ro) unsigned int i; /* deallocate RDMA resources on rdsv3_message */ - for (i = 0; i < ro->r_nents; i++) { ddi_umem_unlock(ro->r_rdma_sg[i].umem_cookie); } @@ -660,7 +642,7 @@ rdsv3_cmsg_rdma_dest(struct rdsv3_sock *rs, struct rdsv3_message *rm, mutex_enter(&rs->rs_rdma_lock); mr = rdsv3_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); - if (mr == NULL) + if (!mr) err = -EINVAL; /* invalid r_key */ else atomic_add_32(&mr->r_refcount, 1); diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/rds_recv.c b/usr/src/uts/common/io/ib/clients/rdsv3/rds_recv.c index 123e612252..18980d2a6a 100644 --- a/usr/src/uts/common/io/ib/clients/rdsv3/rds_recv.c +++ b/usr/src/uts/common/io/ib/clients/rdsv3/rds_recv.c @@ -256,7 +256,7 @@ rdsv3_recv_incoming(struct rdsv3_connection *conn, uint32_be_t saddr, } rs = rdsv3_find_bound(daddr, inc->i_hdr.h_dport); - if (rs == NULL) { + if (!rs) { rdsv3_stats_inc(s_recv_drop_no_sock); goto out; } @@ -308,7 +308,7 @@ out: static int rdsv3_next_incoming(struct rdsv3_sock *rs, struct rdsv3_incoming **inc) { - if (*inc == NULL) { + if (!*inc) { rw_enter(&rs->rs_recv_lock, RW_READER); if (!list_is_empty(&rs->rs_recv_queue)) { *inc = list_head(&rs->rs_recv_queue); diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3.conf b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3.conf index c17689cf40..f8cfea1948 100644 --- a/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3.conf +++ b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3.conf @@ -23,3 +23,17 @@ # name="rdsv3" parent="ib" unit-address="0"; ddi-forceattach=1; +# +# this needs to be set for maximum performance +# use echo '::interrupts -d' | mdb -k | grep hermon +# and enter the list of assigned CPUs +#HcaMsix=13,14; +# +# allow separate CQs for the SendQ and ReceiveQ +# the default is to use a shared CQ +#EnableSendCQ=0; +# +# allow the interrupt and the event completion af_thr to use the same CPU +# default is exclusive mode, MSI-x interrupt and the event completion af_thr +# use two separate CPUs +#IntrLineUpMode=0; diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_af_thr.c b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_af_thr.c new file mode 100644 index 0000000000..6639211d3f --- /dev/null +++ b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_af_thr.c @@ -0,0 +1,389 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */ +#include <sys/ib/clients/rdsv3/ib.h> +#include <sys/ib/clients/rdsv3/rdsv3_af_thr_impl.h> +#include <sys/ib/clients/rdsv3/rdsv3_debug.h> + +extern pri_t maxclsyspri; +extern kmutex_t cpu_lock; + +int rdsv3_enable_snd_cq = 0; +int rdsv3_intr_line_up_mode = 0; +static kmutex_t rdsv3_cpuid_pool_lock; + +void +rdsv3_af_init(dev_info_t *dip) +{ + int i; + cpu_t *cp; + int *msix; + uint_t nmsix; + extern int ncpus; + + mutex_init(&rdsv3_cpuid_pool_lock, NULL, MUTEX_DEFAULT, NULL); + if (ncpus < RDSV3_CPUID_POOL_MAX) + rdsv3_cpuid_pool_cnt = ncpus; + else + rdsv3_cpuid_pool_cnt = RDSV3_CPUID_POOL_MAX; + + /* hold cpu_lock before calling cpu_get and cpu_is_online */ + mutex_enter(&cpu_lock); + for (i = 0; i < rdsv3_cpuid_pool_cnt; i++) { + cp = cpu_get((processorid_t)i); + if (cp == NULL || !cpu_is_online(cp)) + rdsv3_cpuid_pool[i] = RDSV3_CPUFLAGS_OFF; + else + rdsv3_cpuid_pool[i] = RDSV3_CPUFLAGS_ON; + } + mutex_exit(&cpu_lock); + + if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "HcaMsix", (int **)&msix, &nmsix) == DDI_PROP_SUCCESS) { + /* remove the hca MSI-x interrupt cpu's */ + for (i = 0; i < nmsix; i++) { + rdsv3_cpuid_pool[msix[i]] |= RDSV3_CPUFLAGS_INTR; + rdsv3_msix_pool[i] = msix[i]; + } + rdsv3_msix_pool_cnt = nmsix; + ddi_prop_free(msix); + } + rdsv3_enable_snd_cq = ddi_prop_get_int(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS, "EnableSendCQ", 0); + rdsv3_intr_line_up_mode = ddi_prop_get_int(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS, "IntrLineUpMode", 0); +} + +static void +rdsv3_af_cpu_assign(rdsv3_af_grp_t *hcagp) +{ + int i, j, k, idx; + + RDSV3_DPRINTF2("rdsv3_af_cpu_assign", "hcagp %p", hcagp); + + mutex_enter(&rdsv3_cpuid_pool_lock); + for (i = 0; i < rdsv3_cpuid_pool_cnt; i++) { + if (!(rdsv3_cpuid_pool[i] & (RDSV3_CPUFLAGS_UNAVAIL | + RDSV3_CPUFLAGS_ASSIGNED | RDSV3_CPUFLAGS_HCA))) { + rdsv3_cpuid_pool[i] |= RDSV3_CPUFLAGS_HCA; + hcagp->g_hca_cpuid = i; + break; + } + /* share an assigned cpu */ + for (j = 0; j < rdsv3_cpuid_pool_cnt; j++) { + if (!(rdsv3_cpuid_pool[j] & (RDSV3_CPUFLAGS_UNAVAIL | + RDSV3_CPUFLAGS_HCA))) { + hcagp->g_hca_cpuid = j; + break; + } + } + /* if the code comes down here, cpu 0 will be used */ + } + + for (j = 0; j < RDSV3_AFT_CONN_CPU_POOL; j++) { + /* initialize to be an out-of-bound cpuid, no binding */ + hcagp->g_conn_cpuid_pool[j] = rdsv3_cpuid_pool_cnt; + for (i = 0; i < rdsv3_cpuid_pool_cnt; i++) { + if (!(rdsv3_cpuid_pool[i] & (RDSV3_CPUFLAGS_UNAVAIL | + RDSV3_CPUFLAGS_ASSIGNED | RDSV3_CPUFLAGS_HCA))) { + rdsv3_cpuid_pool[i] |= RDSV3_CPUFLAGS_ASSIGNED; + hcagp->g_conn_cpuid_pool[j] = i; + break; + } + } + if (i >= rdsv3_cpuid_pool_cnt) + break; + } + if (j >= RDSV3_AFT_CONN_CPU_POOL) { + mutex_exit(&rdsv3_cpuid_pool_lock); + return; + } + /* avoid the primary group */ + for (k = 0, idx = 0; k < 2; k++) { + /* search to the start of an hca group */ + for (i = idx; i < rdsv3_cpuid_pool_cnt; i++) { + if (rdsv3_cpuid_pool[i] & RDSV3_CPUFLAGS_HCA) { + idx = i + 1; + break; + } + } + } + /* share an assigned cpu */ + for (; j < RDSV3_AFT_CONN_CPU_POOL; j++) { + for (i = idx; i < rdsv3_cpuid_pool_cnt; i++) { + if (!(rdsv3_cpuid_pool[i] & (RDSV3_CPUFLAGS_UNAVAIL | + RDSV3_CPUFLAGS_HCA))) { + hcagp->g_conn_cpuid_pool[j] = i; + idx = i + 1; + break; + } + } + } + mutex_exit(&rdsv3_cpuid_pool_lock); +} + +rdsv3_af_grp_t * +rdsv3_af_grp_create(ibt_hca_hdl_t hca, uint64_t id) +{ + char name[128]; + ibt_cq_sched_attr_t cq_sched_attr; + ibt_status_t status; + rdsv3_af_grp_t *hcagp; + uint64_t l_id = id; + + hcagp = kmem_zalloc(sizeof 
(*hcagp), KM_NOSLEEP); + if (!hcagp) + return (NULL); + hcagp->g_hca_hdl = hca; + + rdsv3_af_cpu_assign(hcagp); + return (hcagp); +} + +void +rdsv3_af_grp_destroy(rdsv3_af_grp_t *hcagp) +{ + if (hcagp == NULL) + return; + + kmem_free(hcagp, sizeof (*hcagp)); +} + +void +rdsv3_af_grp_draw(rdsv3_af_grp_t *hcagp) +{ + rdsv3_af_grp_t *l_hcagp = hcagp; +} + +ibt_sched_hdl_t +rdsv3_af_grp_get_sched(rdsv3_af_grp_t *hcagp) +{ + return (hcagp->g_sched_hdl); +} + +rdsv3_af_thr_t * +rdsv3_af_intr_thr_create(rdsv3_af_thr_drain_func_t fn, void *data, uint_t flag, + rdsv3_af_grp_t *hcagp, ibt_cq_hdl_t ibt_cq_hdl) +{ + rdsv3_af_thr_t *ringp; + processorid_t cpuid; + + if (ibt_cq_hdl == NULL) + return (NULL); + ringp = rdsv3_af_thr_create(fn, data, flag, hcagp); + if (ringp == NULL) + return (NULL); + + mutex_enter(&cpu_lock); + if (hcagp->g_conn_cpuid_idx >= RDSV3_AFT_CONN_CPU_POOL) + hcagp->g_conn_cpuid_idx = 0; + cpuid = hcagp->g_conn_cpuid_pool[hcagp->g_conn_cpuid_idx++]; + (void) rdsv3_af_thr_bind(ringp, cpuid); + mutex_exit(&cpu_lock); + + if (ringp->aft_intr) { + if (rdsv3_intr_line_up_mode) { + (void) ddi_intr_set_affinity(ringp->aft_intr, cpuid); + } else { + (void) ddi_intr_set_affinity(ringp->aft_intr, + rdsv3_msix_pool[0]); + } + } + return (ringp); +} + +rdsv3_af_thr_t * +rdsv3_af_thr_create(rdsv3_af_thr_drain_func_t fn, void *data, uint_t flag, + rdsv3_af_grp_t *hcagp) +{ + rdsv3_af_thr_t *ringp; + pri_t pri; + uint_t l_flags = flag; + rdsv3_af_grp_t *l_hcagp = hcagp; + + ringp = kmem_zalloc(sizeof (rdsv3_af_thr_t), KM_NOSLEEP); + if (ringp == NULL) + return (NULL); + + ringp->aft_grp = hcagp; + mutex_init(&ringp->aft_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&ringp->aft_async, NULL, CV_DEFAULT, NULL); + if (flag & SCQ_WRK_BIND_CPU) + pri = maxclsyspri; + else + pri = maxclsyspri; + ringp->aft_worker = thread_create(NULL, 0, + rdsv3_af_thr_worker, ringp, 0, &p0, TS_RUN, pri); + ringp->aft_data = data; + ringp->aft_drain_func = (rdsv3_af_thr_drain_func_t)fn; + + /* set the bind CPU to -1 to indicate no thread affinity set */ + ringp->aft_cpuid = -1; + ringp->aft_state = 0; + ringp->aft_cflag = flag; + + if (flag & SCQ_BIND_CPU) { + mutex_enter(&cpu_lock); + if (flag & SCQ_HCA_BIND_CPU) { + (void) rdsv3_af_thr_bind(ringp, hcagp->g_hca_cpuid); + } else if (flag & SCQ_WRK_BIND_CPU) { + (void) rdsv3_af_thr_bind(ringp, hcagp->g_hca_cpuid); + } + mutex_exit(&cpu_lock); + } + + RDSV3_DPRINTF2("rdsv3_af_thr_create", "af_thr %p ic %p", ringp, data); + return (ringp); +} + +void +rdsv3_af_thr_destroy(rdsv3_af_thr_t *ringp) +{ + RDSV3_DPRINTF2("rdsv3_af_thr_destroy", "af_thr %p", ringp); + + /* wait until the af_thr has gone to sleep */ + mutex_enter(&ringp->aft_lock); + while (ringp->aft_state & AFT_PROC) { + mutex_exit(&ringp->aft_lock); + delay(drv_usectohz(1000)); + mutex_enter(&ringp->aft_lock); + } + ringp->aft_state |= AFT_CONDEMNED; + if (!(ringp->aft_state & AFT_PROC)) { + cv_signal(&ringp->aft_async); + } + mutex_exit(&ringp->aft_lock); +} + +void +rdsv3_af_thr_fire(rdsv3_af_thr_t *ringp) +{ + mutex_enter(&ringp->aft_lock); + ringp->aft_state |= AFT_ARMED; + if (!(ringp->aft_state & AFT_PROC)) { + cv_signal(&ringp->aft_async); + } + mutex_exit(&ringp->aft_lock); +} + +static void +rdsv3_af_thr_worker(rdsv3_af_thr_t *ringp) +{ + kmutex_t *lock = &ringp->aft_lock; + kcondvar_t *async = &ringp->aft_async; + callb_cpr_t cprinfo; + + RDSV3_DPRINTF4("rdsv3_af_thr_worker", "Enter af_thr %p", ringp); + + CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "rdsv3_af_thr"); + mutex_enter(lock); + for 
(;;) { + while (!(ringp->aft_state & (AFT_ARMED | AFT_CONDEMNED))) { + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(async, lock); + CALLB_CPR_SAFE_END(&cprinfo, lock); + } + ringp->aft_state &= ~AFT_ARMED; + + /* + * Either we have work to do, or we have been asked to + * shut down + */ + if (ringp->aft_state & AFT_CONDEMNED) + goto done; + ASSERT(!(ringp->aft_state & AFT_PROC)); + ringp->aft_state |= AFT_PROC; + mutex_exit(&ringp->aft_lock); + + ringp->aft_drain_func(ringp->aft_data); + + mutex_enter(&ringp->aft_lock); + ringp->aft_state &= ~AFT_PROC; + } +done: + CALLB_CPR_EXIT(&cprinfo); + RDSV3_DPRINTF2("rdsv3_af_thr_worker", "Exit af_thr %p", ringp); + cv_destroy(&ringp->aft_async); + mutex_destroy(&ringp->aft_lock); + kmem_free(ringp, sizeof (rdsv3_af_thr_t)); + thread_exit(); +} + + +int rdsv3_af_thr_thread_bind = 1; + +/* + * Bind a soft ring worker thread to the supplied CPU. + */ +cpu_t * +rdsv3_af_thr_bind(rdsv3_af_thr_t *ringp, processorid_t cpuid) +{ + cpu_t *cp; + boolean_t clear = B_FALSE; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + if (rdsv3_af_thr_thread_bind == 0) { + return (NULL); + } + + cp = cpu_get(cpuid); + if (cp == NULL || !cpu_is_online(cp)) + return (NULL); + + mutex_enter(&ringp->aft_lock); + ringp->aft_state |= AFT_BOUND; + if (ringp->aft_cpuid != -1) + clear = B_TRUE; + ringp->aft_cpuid = cpuid; + mutex_exit(&ringp->aft_lock); + + if (clear) + thread_affinity_clear(ringp->aft_worker); + + RDSV3_DPRINTF4("rdsv3_af_thr_bind", "Bound af_thr %p to cpu %d", + ringp, cpuid); + thread_affinity_set(ringp->aft_worker, cpuid); + return (cp); +} + +/* + * Unbind a soft ring worker thread. + */ +static void +rdsv3_af_thr_unbind(rdsv3_af_thr_t *ringp) +{ + ASSERT(MUTEX_HELD(&cpu_lock)); + + mutex_enter(&ringp->aft_lock); + if (!(ringp->aft_state & AFT_BOUND)) { + ASSERT(ringp->aft_cpuid == -1); + mutex_exit(&ringp->aft_lock); + return; + } + + ringp->aft_cpuid = -1; + ringp->aft_state &= ~AFT_BOUND; + thread_affinity_clear(ringp->aft_worker); + mutex_exit(&ringp->aft_lock); +} diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_ddi.c b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_ddi.c index 877e0247fc..7128c5a58c 100644 --- a/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_ddi.c +++ b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_ddi.c @@ -34,6 +34,7 @@ #include <sys/ib/ibtl/ibti.h> #include <sys/ib/clients/rdsv3/rdsv3.h> #include <sys/ib/clients/rdsv3/rdsv3_debug.h> +#include <sys/ib/clients/rdsv3/rdsv3_af_thr.h> extern int rdsv3_init(void); extern void rdsv3_exit(void); @@ -116,6 +117,7 @@ rdsv3_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) mutex_init(&rdsv3_rdma_listen_id_lock, NULL, MUTEX_DRIVER, NULL); rdsv3_rdma_listen_id = NULL; + rdsv3_af_init(dip); rdsv3_trans_init(); ret = rdsv3_init(); if (ret) { diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_impl.c b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_impl.c index 328b58f00f..3c13ca2226 100644 --- a/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_impl.c +++ b/usr/src/uts/common/io/ib/clients/rdsv3/rdsv3_impl.c @@ -763,7 +763,7 @@ rdsv3_rdma_init_worker(struct rdsv3_work_s *work) rdsv3_rdma_init(); } -#define RDSV3_NUM_TASKQ_THREADS 4 +#define RDSV3_NUM_TASKQ_THREADS 1 rdsv3_workqueue_struct_t * rdsv3_create_task_workqueue(char *name) { @@ -825,6 +825,10 @@ rdsv3_sock_exit_data(struct rsock *sk) mutex_destroy(&rs->rs_rdma_lock); avl_destroy(&rs->rs_rdma_keys); + mutex_destroy(&rs->rs_conn_lock); + mutex_destroy(&rs->rs_congested_lock); + cv_destroy(&rs->rs_congested_cv); + rdsv3_exit_waitqueue(sk->sk_sleep);
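rdsv3_af_thr_worker() above is a classic "armed worker": rdsv3_af_thr_fire() sets AFT_ARMED and signals, the worker clears ARMED and holds AFT_PROC while it drains (re-running immediately if it was re-armed mid-drain), and AFT_CONDEMNED tears it down. A pthread model of the same state machine, leaving out the CPR callbacks and CPU binding:

#include <pthread.h>

#define ARMED     0x1
#define PROC      0x2
#define CONDEMNED 0x4

struct af_thr {
    pthread_mutex_t lock;
    pthread_cond_t  cv;
    unsigned        state;
    void          (*drain)(void *);
    void           *arg;
};

void
af_thr_fire(struct af_thr *t)
{
    pthread_mutex_lock(&t->lock);
    t->state |= ARMED;
    if (!(t->state & PROC))         /* worker mid-drain will loop anyway */
        pthread_cond_signal(&t->cv);
    pthread_mutex_unlock(&t->lock);
}

void *
af_thr_worker(void *a)
{
    struct af_thr *t = a;

    pthread_mutex_lock(&t->lock);
    for (;;) {
        while (!(t->state & (ARMED | CONDEMNED)))
            pthread_cond_wait(&t->cv, &t->lock);
        t->state &= ~ARMED;
        if (t->state & CONDEMNED)
            break;
        t->state |= PROC;           /* drain without holding the lock */
        pthread_mutex_unlock(&t->lock);
        t->drain(t->arg);
        pthread_mutex_lock(&t->lock);
        t->state &= ~PROC;
    }
    pthread_mutex_unlock(&t->lock);
    return (NULL);
}

static void
drain(void *arg)
{
    (void) arg;                     /* stand-in for a CQ drain function */
}

int
main(void)
{
    struct af_thr t = { PTHREAD_MUTEX_INITIALIZER,
        PTHREAD_COND_INITIALIZER, 0, drain, NULL };
    pthread_t tid;

    pthread_create(&tid, NULL, af_thr_worker, &t);
    af_thr_fire(&t);                /* one drain pass */
    pthread_mutex_lock(&t.lock);
    t.state |= CONDEMNED;           /* tear down */
    pthread_cond_signal(&t.cv);
    pthread_mutex_unlock(&t.lock);
    pthread_join(tid, NULL);
    return (0);
}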
kmem_free(sk->sk_sleep, sizeof (rdsv3_wait_queue_t)); mutex_destroy(&sk->sk_lock); @@ -881,6 +885,9 @@ rdsv3_conn_constructor(void *buf, void *arg, int kmflags) conn->c_next_tx_seq = 1; mutex_init(&conn->c_lock, NULL, MUTEX_DRIVER, NULL); mutex_init(&conn->c_send_lock, NULL, MUTEX_DRIVER, NULL); + conn->c_send_generation = 1; + conn->c_senders = 0; + list_create(&conn->c_send_queue, sizeof (struct rdsv3_message), offsetof(struct rdsv3_message, m_conn_item)); list_create(&conn->c_retrans, sizeof (struct rdsv3_message), @@ -1317,7 +1324,6 @@ rdsv3_ib_alloc_hdrs(ib_device_t *dev, struct rdsv3_ib_connection *ic) (ic->i_send_ring.w_nr * sizeof (struct rdsv3_header))); ic->i_recv_hdrs_dma = (uint64_t)(uintptr_t)(addr + (ic->i_send_ring.w_nr * sizeof (struct rdsv3_header))); - ic->i_recv_tasklet_cpuid = -1; ic->i_ack = (struct rdsv3_header *)(addr + ((ic->i_send_ring.w_nr + ic->i_recv_ring.w_nr) * diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/send.c b/usr/src/uts/common/io/ib/clients/rdsv3/send.c index 73d800b2f7..e345238f82 100644 --- a/usr/src/uts/common/io/ib/clients/rdsv3/send.c +++ b/usr/src/uts/common/io/ib/clients/rdsv3/send.c @@ -90,6 +90,8 @@ rdsv3_send_reset(struct rdsv3_connection *conn) RDSV3_DPRINTF4("rdsv3_send_reset", "Enter(conn: %p)", conn); + ASSERT(MUTEX_HELD(&conn->c_send_lock)); + if (conn->c_xmit_rm) { rm = conn->c_xmit_rm; ro = rm->m_rdma_op; @@ -111,11 +113,11 @@ rdsv3_send_reset(struct rdsv3_connection *conn) rdsv3_message_put(conn->c_xmit_rm); conn->c_xmit_rm = NULL; } + conn->c_xmit_sg = 0; conn->c_xmit_hdr_off = 0; conn->c_xmit_data_off = 0; conn->c_xmit_rdma_sent = 0; - conn->c_map_queued = 0; conn->c_unacked_packets = rdsv3_sysctl_max_unacked_packets; @@ -164,6 +166,10 @@ rdsv3_send_xmit(struct rdsv3_connection *conn) int was_empty = 0; list_t to_be_dropped; +restart: + if (!rdsv3_conn_up(conn)) + goto out; + RDSV3_DPRINTF4("rdsv3_send_xmit", "Enter(conn: %p)", conn); list_create(&to_be_dropped, sizeof (struct rdsv3_message), @@ -175,10 +181,6 @@ rdsv3_send_xmit(struct rdsv3_connection *conn) * another thread is already feeding the queue then we back off. This * avoids blocking the caller and trading per-connection data between * caches per message. - * - * The sem holder will issue a retry if they notice that someone queued - * a message after they stopped walking the send queue but before they - * dropped the sem. */ if (!mutex_tryenter(&conn->c_send_lock)) { RDSV3_DPRINTF4("rdsv3_send_xmit", @@ -187,13 +189,14 @@ rdsv3_send_xmit(struct rdsv3_connection *conn) ret = -ENOMEM; goto out; } + atomic_add_32(&conn->c_senders, 1); if (conn->c_trans->xmit_prepare) conn->c_trans->xmit_prepare(conn); /* * spin trying to push headers and data down the connection until - * the connection doens't make forward progress. + * the connection doesn't make forward progress. 
*/ while (--send_quota) { /* @@ -406,6 +409,8 @@ rdsv3_send_xmit(struct rdsv3_connection *conn) ret = -EAGAIN; } + atomic_dec_32(&conn->c_senders); + if (ret == 0 && was_empty) { /* * A simple bit test would be way faster than taking the @@ -508,7 +513,6 @@ rdsv3_rdma_send_complete(struct rdsv3_message *rm, int status) mutex_enter(&rs->rs_lock); list_insert_tail(&rs->rs_notify_queue, notifier); mutex_exit(&rs->rs_lock); - ro->r_notifier = NULL; } @@ -734,7 +738,6 @@ rdsv3_send_drop_to(struct rdsv3_sock *rs, struct sockaddr_in *dest) if (dest && (dest->sin_addr.s_addr != rm->m_daddr || dest->sin_port != rm->m_inc.i_hdr.h_dport)) continue; - wake = 1; list_remove(&rs->rs_send_queue, rm); list_insert_tail(&list, rm); @@ -1029,10 +1032,10 @@ rdsv3_sendmsg(struct rdsv3_sock *rs, uio_t *uio, struct nmsghdr *msg, ret = rdsv3_cong_wait(conn->c_fcong, dport, nonblock, rs); if (ret) { - mutex_enter(&rdsv3_poll_waitq.waitq_mutex); + mutex_enter(&rs->rs_congested_lock); rs->rs_seen_congestion = 1; - cv_signal(&rdsv3_poll_waitq.waitq_cv); - mutex_exit(&rdsv3_poll_waitq.waitq_mutex); + cv_signal(&rs->rs_congested_cv); + mutex_exit(&rs->rs_congested_lock); RDSV3_DPRINTF2("rdsv3_sendmsg", "rdsv3_cong_wait (dport: %d) returned: %d", dport, ret); @@ -1105,7 +1108,7 @@ rdsv3_sendmsg(struct rdsv3_sock *rs, uio_t *uio, struct nmsghdr *msg, rdsv3_stats_inc(s_send_queued); if (!test_bit(RDSV3_LL_SEND_FULL, &conn->c_flags)) - rdsv3_send_worker(&conn->c_send_w.work); + (void) rdsv3_send_xmit(conn); rdsv3_message_put(rm); RDSV3_DPRINTF4("rdsv3_sendmsg", "Return(rs: %p, len: %d)", @@ -1139,7 +1142,7 @@ rdsv3_send_pong(struct rdsv3_connection *conn, uint16_be_t dport) RDSV3_DPRINTF4("rdsv3_send_pong", "Enter(conn: %p)", conn); rm = rdsv3_message_alloc(0, KM_NOSLEEP); - if (rm == NULL) { + if (!rm) { ret = -ENOMEM; goto out; } @@ -1173,7 +1176,9 @@ rdsv3_send_pong(struct rdsv3_connection *conn, uint16_be_t dport) rdsv3_stats_inc(s_send_queued); rdsv3_stats_inc(s_send_pong); - rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0); + if (!test_bit(RDSV3_LL_SEND_FULL, &conn->c_flags)) + (void) rdsv3_send_xmit(conn); + rdsv3_message_put(rm); RDSV3_DPRINTF4("rdsv3_send_pong", "Return(conn: %p)", conn); diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/threads.c b/usr/src/uts/common/io/ib/clients/rdsv3/threads.c index 67715480f5..0651659ec3 100644 --- a/usr/src/uts/common/io/ib/clients/rdsv3/threads.c +++ b/usr/src/uts/common/io/ib/clients/rdsv3/threads.c @@ -119,6 +119,8 @@ rdsv3_connect_complete(struct rdsv3_connection *conn) conn, NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr)); conn->c_reconnect_jiffies = 0; + conn->c_last_connect_jiffies = ddi_get_lbolt(); + set_bit(0, &conn->c_map_queued); rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_send_w, 0); rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_recv_w, 0); @@ -144,7 +146,7 @@ rdsv3_connect_complete(struct rdsv3_connection *conn) * We should *always* start with a random backoff; otherwise a broken connection * will always take several iterations to be re-established. 
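The send-path hunks above trade the old worker handoff for direct transmission: rdsv3_send_xmit() takes the send lock with mutex_tryenter() so a second caller backs off with -ENOMEM rather than blocking, and an atomic c_senders count tracks threads inside the send path. A user-space sketch of that tryenter-and-count structure (simplified; the real function also restarts from the top when the connection drops):

#include <errno.h>
#include <pthread.h>
#include <stdatomic.h>

static pthread_mutex_t send_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int      senders;

static void
drain_send_queue(void)
{
    /* push queued messages down the transport */
}

int
send_xmit(void)
{
    /* Another thread is already feeding the queue: back off. */
    if (pthread_mutex_trylock(&send_lock) != 0)
        return (-ENOMEM);           /* mirrors the driver's choice above */

    atomic_fetch_add(&senders, 1);
    drain_send_queue();
    atomic_fetch_sub(&senders, 1);

    pthread_mutex_unlock(&send_lock);
    return (0);
}

int
main(void)
{
    return (send_xmit() == 0 ? 0 : 1);
}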
*/ -static void +void rdsv3_queue_reconnect(struct rdsv3_connection *conn) { unsigned long rand; @@ -213,80 +215,6 @@ rdsv3_connect_worker(struct rdsv3_work_s *work) RDSV3_DPRINTF2("rdsv3_connect_worker", "Return(work: %p)", work); } -extern struct avl_tree rdsv3_conn_hash; - -void -rdsv3_shutdown_worker(struct rdsv3_work_s *work) -{ - struct rdsv3_connection *conn = container_of(work, - struct rdsv3_connection, c_down_w); - struct rdsv3_conn_info_s conn_info; - - RDSV3_DPRINTF2("rdsv3_shutdown_worker", "Enter(work: %p)", work); - - /* shut it down unless it's down already */ - if (!rdsv3_conn_transition(conn, RDSV3_CONN_DOWN, RDSV3_CONN_DOWN)) { - /* - * Quiesce the connection mgmt handlers before we start tearing - * things down. We don't hold the mutex for the entire - * duration of the shutdown operation, else we may be - * deadlocking with the CM handler. Instead, the CM event - * handler is supposed to check for state DISCONNECTING - */ - mutex_enter(&conn->c_cm_lock); - if (!rdsv3_conn_transition(conn, RDSV3_CONN_UP, - RDSV3_CONN_DISCONNECTING) && - !rdsv3_conn_transition(conn, RDSV3_CONN_ERROR, - RDSV3_CONN_DISCONNECTING)) { - RDSV3_DPRINTF2("rdsv3_shutdown_worker", - "RDS: connect failed: conn: %p, state: %d", - conn, atomic_get(&conn->c_state)); - rdsv3_conn_drop(conn); - mutex_exit(&conn->c_cm_lock); - return; - } - mutex_exit(&conn->c_cm_lock); - - mutex_enter(&conn->c_send_lock); - conn->c_trans->conn_shutdown(conn); - rdsv3_conn_reset(conn); - mutex_exit(&conn->c_send_lock); - - if (!rdsv3_conn_transition(conn, RDSV3_CONN_DISCONNECTING, - RDSV3_CONN_DOWN)) { - /* - * This can happen - eg when we're in the middle of - * tearing down the connection, and someone unloads - * the rds module. Quite reproduceable with loopback - * connections. Mostly harmless. - */ -#ifndef __lock_lint - RDSV3_DPRINTF2("rdsv3_shutdown_worker", - "failed to transition to state DOWN, " - "current statis is: %d conn: %p", - atomic_get(&conn->c_state), conn); - rdsv3_conn_drop(conn); -#endif - return; - } - } - - /* - * Then reconnect if it's still live. - * The passive side of an IB loopback connection is never added - * to the conn hash, so we never trigger a reconnect on this - * conn - the reconnect is always triggered by the active peer. 
- */ - rdsv3_cancel_delayed_work(&conn->c_conn_w); - - conn_info.c_laddr = conn->c_laddr; - conn_info.c_faddr = conn->c_faddr; - if (avl_find(&rdsv3_conn_hash, &conn_info, NULL) == conn) - rdsv3_queue_reconnect(conn); - - RDSV3_DPRINTF2("rdsv3_shutdown_worker", "Return(work: %p)", work); -} - void rdsv3_send_worker(struct rdsv3_work_s *work) { @@ -344,6 +272,32 @@ rdsv3_recv_worker(struct rdsv3_work_s *work) } void +rdsv3_shutdown_worker(struct rdsv3_work_s *work) +{ + struct rdsv3_connection *conn = container_of(work, + struct rdsv3_connection, c_down_w); + rdsv3_conn_shutdown(conn); +} + +#define time_after(a, b) ((long)(b) - (long)(a) < 0) + +void +rdsv3_reaper_worker(struct rdsv3_work_s *work) +{ + struct rdsv3_connection *conn = container_of(work, + struct rdsv3_connection, c_reap_w.work); + + if (rdsv3_conn_state(conn) != RDSV3_CONN_UP && + !time_after(conn->c_last_connect_jiffies, + ddi_get_lbolt() - RDSV3_REAPER_WAIT_JIFFIES)) { + rdsv3_conn_destroy(conn); + } else { + rdsv3_queue_delayed_work(rdsv3_wq, &conn->c_reap_w, + RDSV3_REAPER_WAIT_JIFFIES); + } +} + +void rdsv3_threads_exit(void) { rdsv3_destroy_task_workqueue(rdsv3_wq); @@ -353,7 +307,7 @@ int rdsv3_threads_init(void) { rdsv3_wq = rdsv3_create_task_workqueue("krdsd"); - if (rdsv3_wq == NULL) + if (!rdsv3_wq) return (-ENOMEM); return (0); diff --git a/usr/src/uts/common/io/ib/clients/rdsv3/transport.c b/usr/src/uts/common/io/ib/clients/rdsv3/transport.c index e3a363523b..802e86f834 100644 --- a/usr/src/uts/common/io/ib/clients/rdsv3/transport.c +++ b/usr/src/uts/common/io/ib/clients/rdsv3/transport.c @@ -121,7 +121,8 @@ rdsv3_trans_get_preferred(uint32_be_t addr) rw_enter(&trans_sem, RW_READER); for (i = 0; i < RDS_TRANS_COUNT; i++) { - if (transports[i] && (transports[i]->laddr_check(addr) == 0)) { + if (transports[i] && + transports[i]->laddr_check(addr) == 0) { ret = transports[i]; break; } diff --git a/usr/src/uts/common/sys/ib/clients/rdsv3/ib.h b/usr/src/uts/common/sys/ib/clients/rdsv3/ib.h index ca900e6972..9f19384626 100644 --- a/usr/src/uts/common/sys/ib/clients/rdsv3/ib.h +++ b/usr/src/uts/common/sys/ib/clients/rdsv3/ib.h @@ -29,6 +29,7 @@ #include <sys/ib/clients/rdsv3/rdsv3.h> #include <sys/ib/clients/rdsv3/rdma_transport.h> +#include <sys/ib/clients/rdsv3/rdsv3_af_thr.h> #define RDSV3_FMR_SIZE 256 #define RDSV3_FMR_POOL_SIZE (12 * 1024) @@ -65,8 +66,11 @@ struct rdsv3_page_frag { }; struct rdsv3_ib_incoming { + list_node_t ii_obj; /* list obj of rdsv3_inc_pool list */ struct list ii_frags; struct rdsv3_incoming ii_inc; + struct rdsv3_inc_pool *ii_pool; + struct rdsv3_ib_device *ii_ibdev; }; struct rdsv3_ib_connect_private { @@ -95,11 +99,26 @@ struct rdsv3_ib_recv_work { }; struct rdsv3_ib_work_ring { - uint32_t w_nr; - uint32_t w_alloc_ptr; - uint32_t w_alloc_ctr; - uint32_t w_free_ptr; - atomic_t w_free_ctr; + uint32_t w_nr; + uint32_t w_alloc_ptr; + uint32_t w_alloc_ctr; + uint32_t w_free_ptr; + atomic_t w_free_ctr; + rdsv3_wait_queue_t w_empty_wait; +}; + +/* + * Rings are posted with all the allocations they'll need to queue the + * incoming message to the receiving socket so this can't fail. + * All fragments start with a header, so we can make sure we're not receiving + * garbage, and we can tell a small 8 byte fragment from an ACK frame. 
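The new reaper worker above retires connections that have stayed down past RDSV3_REAPER_WAIT_JIFFIES, using a time_after() macro whose signed subtraction stays correct even when the tick counter wraps. A standalone demonstration of why the comparison is written that way:

#include <assert.h>
#include <limits.h>

#define time_after(a, b) ((long)(b) - (long)(a) < 0)

int
main(void)
{
    unsigned long t = ULONG_MAX - 5;    /* just before wraparound */

    /* 10 ticks later the counter has wrapped to a small value... */
    unsigned long later = t + 10;

    /* ...but signed subtraction still orders the two correctly. */
    assert(time_after(later, t));
    assert(!time_after(t, later));
    return (0);
}

A naive unsigned comparison (later > t) would give the wrong answer here, which is why the driver adopts the Linux-style macro instead.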
+ */ +struct rdsv3_ib_ack_state { + uint64_t ack_next; + uint64_t ack_recv; + unsigned int ack_required:1; + unsigned int ack_next_valid:1; + unsigned int ack_recv_valid:1; }; struct rdsv3_ib_device; @@ -115,8 +134,8 @@ struct rdsv3_ib_connection { struct rdma_cm_id *i_cm_id; struct ib_pd *i_pd; struct rdsv3_hdrs_mr *i_mr; - struct ib_cq *i_send_cq; - struct ib_cq *i_recv_cq; + struct ib_cq *i_cq; + struct ib_cq *i_snd_cq; /* tx */ struct rdsv3_ib_work_ring i_send_ring; @@ -126,8 +145,12 @@ struct rdsv3_ib_connection { struct rdsv3_ib_send_work *i_sends; ibt_send_wr_t *i_send_wrs; + /* soft CQ */ + rdsv3_af_thr_t *i_soft_cq; + rdsv3_af_thr_t *i_snd_soft_cq; + rdsv3_af_thr_t *i_refill_rq; + /* rx */ - ddi_taskq_t *i_recv_tasklet; struct mutex i_recv_mutex; struct rdsv3_ib_work_ring i_recv_ring; struct rdsv3_ib_incoming *i_ibinc; @@ -138,8 +161,6 @@ struct rdsv3_ib_connection { ibt_recv_wr_t *i_recv_wrs; struct rdsv3_page_frag i_frag; uint64_t i_ack_recv; /* last ACK received */ - processorid_t i_recv_tasklet_cpuid; - /* CPU to which the tasklet taskq should be bound */ /* sending acks */ unsigned long i_ack_flags; @@ -192,35 +213,45 @@ struct rdsv3_ib_device { ib_device_t *dev; struct ib_pd *pd; struct kmem_cache *ib_frag_slab; - struct rds_ib_mr_pool *mr_pool; + kmutex_t spinlock; /* protect the above */ + krwlock_t rwlock; /* protect paddr_list */ unsigned int fmr_max_remaps; unsigned int max_fmrs; unsigned int fmr_message_size; int max_sge; unsigned int max_wrs; + unsigned int max_initiator_depth; + unsigned int max_responder_resources; + struct rdsv3_fmr_pool *fmr_pool; + struct rdsv3_inc_pool *inc_pool; ibt_fmr_pool_hdl_t fmr_pool_hdl; - kmutex_t spinlock; /* protect the above */ ibt_hca_attr_t hca_attr; + rdsv3_af_thr_t *fmr_soft_cq; + rdsv3_af_thr_t *inc_soft_cq; + ibt_hca_hdl_t ibt_hca_hdl; + rdsv3_af_grp_t *aft_hcagp; }; /* bits for i_ack_flags */ #define IB_ACK_IN_FLIGHT 0 #define IB_ACK_REQUESTED 1 +#define RDSV3_IB_SEND_OP (1ULL << 63) + /* Magic WR_ID for ACKs */ #define RDSV3_IB_ACK_WR_ID (~(uint64_t)0) struct rdsv3_ib_statistics { uint64_t s_ib_connect_raced; uint64_t s_ib_listen_closed_stale; - uint64_t s_ib_tx_cq_call; + uint64_t s_ib_evt_handler_call; + uint64_t s_ib_tasklet_call; uint64_t s_ib_tx_cq_event; uint64_t s_ib_tx_ring_full; uint64_t s_ib_tx_throttle; uint64_t s_ib_tx_sg_mapping_failure; uint64_t s_ib_tx_stalled; uint64_t s_ib_tx_credit_updates; - uint64_t s_ib_rx_cq_call; uint64_t s_ib_rx_cq_event; uint64_t s_ib_rx_ring_empty; uint64_t s_ib_rx_refill_from_cq; @@ -266,6 +297,9 @@ int rdsv3_ib_cm_handle_connect(struct rdma_cm_id *cm_id, int rdsv3_ib_cm_initiate_connect(struct rdma_cm_id *cm_id); void rdsv3_ib_cm_connect_complete(struct rdsv3_connection *conn, struct rdma_cm_event *event); +void rdsv3_ib_tasklet_fn(void *data); +void rdsv3_ib_snd_tasklet_fn(void *data); +void rdsv3_ib_refill_fn(void *data); /* ib_rdma.c */ int rdsv3_ib_update_ipaddr(struct rdsv3_ib_device *rds_ibdev, @@ -293,25 +327,29 @@ void *rdsv3_ib_get_mr(struct rdsv3_iovec *args, unsigned long nents, void rdsv3_ib_sync_mr(void *trans_private, int dir); void rdsv3_ib_free_mr(void *trans_private, int invalidate); void rdsv3_ib_flush_mrs(void); +void rdsv3_ib_drain_mrlist_fn(void *data); /* ib_recv.c */ int rdsv3_ib_recv_init(void); void rdsv3_ib_recv_exit(void); int rdsv3_ib_recv(struct rdsv3_connection *conn); -int rdsv3_ib_recv_refill(struct rdsv3_connection *conn, int kmflags, - int prefill); -void rdsv3_ib_inc_purge(struct rdsv3_incoming *inc); +int rdsv3_ib_recv_refill(struct 
rdsv3_connection *conn, int prefill); void rdsv3_ib_inc_free(struct rdsv3_incoming *inc); int rdsv3_ib_inc_copy_to_user(struct rdsv3_incoming *inc, uio_t *uiop, size_t size); -void rdsv3_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context); -void rdsv3_ib_recv_tasklet_fn(void *data); +void rdsv3_ib_recv_cqe_handler(struct rdsv3_ib_connection *ic, ibt_wc_t *wc, + struct rdsv3_ib_ack_state *state); void rdsv3_ib_recv_init_ring(struct rdsv3_ib_connection *ic); void rdsv3_ib_recv_clear_ring(struct rdsv3_ib_connection *ic); void rdsv3_ib_recv_init_ack(struct rdsv3_ib_connection *ic); void rdsv3_ib_attempt_ack(struct rdsv3_ib_connection *ic); void rdsv3_ib_ack_send_complete(struct rdsv3_ib_connection *ic); uint64_t rdsv3_ib_piggyb_ack(struct rdsv3_ib_connection *ic); +void rdsv3_ib_set_ack(struct rdsv3_ib_connection *ic, uint64_t seq, + int ack_required); +int rdsv3_ib_create_inc_pool(struct rdsv3_ib_device *); +void rdsv3_ib_destroy_inc_pool(struct rdsv3_ib_device *); +void rdsv3_ib_drain_inclist(void *); /* ib_ring.c */ void rdsv3_ib_ring_init(struct rdsv3_ib_work_ring *ring, uint32_t nr); @@ -325,13 +363,12 @@ int rdsv3_ib_ring_low(struct rdsv3_ib_work_ring *ring); uint32_t rdsv3_ib_ring_oldest(struct rdsv3_ib_work_ring *ring); uint32_t rdsv3_ib_ring_completed(struct rdsv3_ib_work_ring *ring, uint32_t wr_id, uint32_t oldest); -extern rdsv3_wait_queue_t rdsv3_ib_ring_empty_wait; /* ib_send.c */ void rdsv3_ib_xmit_complete(struct rdsv3_connection *conn); int rdsv3_ib_xmit(struct rdsv3_connection *conn, struct rdsv3_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); -void rdsv3_ib_send_cq_comp_handler(struct ib_cq *cq, void *context); +void rdsv3_ib_send_cqe_handler(struct rdsv3_ib_connection *ic, ibt_wc_t *wc); void rdsv3_ib_send_init_ring(struct rdsv3_ib_connection *ic); void rdsv3_ib_send_clear_ring(struct rdsv3_ib_connection *ic); int rdsv3_ib_xmit_rdma(struct rdsv3_connection *conn, struct rdsv3_rdma_op *op); diff --git a/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3.h b/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3.h index 0c78f40d2a..a45febea96 100644 --- a/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3.h +++ b/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3.h @@ -42,6 +42,7 @@ extern "C" { #include <inet/ip.h> #include <sys/avl.h> #include <sys/param.h> +#include <sys/time.h> #include <sys/rds.h> #include <sys/ib/ibtl/ibti.h> @@ -77,6 +78,9 @@ extern "C" { */ #define RDSV3_PORT 18634 +#define RDSV3_REAPER_WAIT_SECS (5*60) +#define RDSV3_REAPER_WAIT_JIFFIES SEC_TO_TICK(RDSV3_REAPER_WAIT_SECS) + /* * This is the sad making. 
Some kernels have a bug in the per_cpu() api which * makes DEFINE_PER_CPU trigger an oops on insmod because the per-cpu section @@ -139,6 +143,9 @@ struct rdsv3_connection { struct rdsv3_cong_map *c_fcong; struct mutex c_send_lock; /* protect send ring */ + atomic_t c_send_generation; + atomic_t c_senders; + struct rdsv3_message *c_xmit_rm; unsigned long c_xmit_sg; unsigned int c_xmit_hdr_off; @@ -158,9 +165,12 @@ struct rdsv3_connection { atomic_t c_state; unsigned long c_flags; unsigned long c_reconnect_jiffies; + clock_t c_last_connect_jiffies; + struct rdsv3_delayed_work_s c_send_w; struct rdsv3_delayed_work_s c_recv_w; struct rdsv3_delayed_work_s c_conn_w; + struct rdsv3_delayed_work_s c_reap_w; struct rdsv3_work_s c_down_w; struct mutex c_cm_lock; /* protect conn state & cm */ @@ -301,6 +311,8 @@ struct rdsv3_message { * -> rs->rs_lock */ kmutex_t m_rs_lock; + rdsv3_wait_queue_t m_flush_wait; + struct rdsv3_sock *m_rs; struct rdsv3_rdma_op *m_rdma_op; rdsv3_rdma_cookie_t m_rdma_cookie; @@ -381,7 +393,6 @@ struct rdsv3_transport { int (*recv)(struct rdsv3_connection *conn); int (*inc_copy_to_user)(struct rdsv3_incoming *inc, uio_t *uio, size_t size); - void (*inc_purge)(struct rdsv3_incoming *inc); void (*inc_free)(struct rdsv3_incoming *inc); int (*cm_handle_connect)(struct rdma_cm_id *cm_id, @@ -433,6 +444,8 @@ struct rdsv3_sock { int rs_congested; /* seen congestion (ENOBUFS) when sending? */ int rs_seen_congestion; + kmutex_t rs_congested_lock; + kcondvar_t rs_congested_cv; /* rs_lock protects all these adjacent members before the newline */ kmutex_t rs_lock; @@ -548,8 +561,6 @@ void rdsv3_sock_put(struct rdsv3_sock *rs); void rdsv3_wake_sk_sleep(struct rdsv3_sock *rs); void __rdsv3_wake_sk_sleep(struct rsock *sk); -extern rdsv3_wait_queue_t rdsv3_poll_waitq; - /* bind.c */ int rdsv3_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, socklen_t len, cred_t *cr); @@ -564,6 +575,7 @@ struct rdsv3_connection *rdsv3_conn_create(uint32_be_t laddr, uint32_be_t faddr, struct rdsv3_connection *rdsv3_conn_create_outgoing(uint32_be_t laddr, uint32_be_t faddr, struct rdsv3_transport *trans, int gfp); +void rdsv3_conn_shutdown(struct rdsv3_connection *conn); void rdsv3_conn_destroy(struct rdsv3_connection *conn); void rdsv3_conn_reset(struct rdsv3_connection *conn); void rdsv3_conn_drop(struct rdsv3_connection *conn); @@ -690,10 +702,12 @@ extern unsigned int rdsv3_sysctl_trace_level; int rdsv3_threads_init(); void rdsv3_threads_exit(void); extern struct rdsv3_workqueue_struct_s *rdsv3_wq; +void rdsv3_queue_reconnect(struct rdsv3_connection *conn); void rdsv3_connect_worker(struct rdsv3_work_s *); void rdsv3_shutdown_worker(struct rdsv3_work_s *); void rdsv3_send_worker(struct rdsv3_work_s *); void rdsv3_recv_worker(struct rdsv3_work_s *); +void rdsv3_reaper_worker(struct rdsv3_work_s *); void rdsv3_connect_complete(struct rdsv3_connection *conn); /* transport.c */ @@ -724,7 +738,6 @@ int rdsv3_message_add_rdma_dest_extension(struct rdsv3_header *hdr, uint32_t r_key, uint32_t offset); int rdsv3_message_inc_copy_to_user(struct rdsv3_incoming *inc, uio_t *uio, size_t size); -void rdsv3_message_inc_purge(struct rdsv3_incoming *inc); void rdsv3_message_inc_free(struct rdsv3_incoming *inc); void rdsv3_message_addref(struct rdsv3_message *rm); void rdsv3_message_put(struct rdsv3_message *rm); diff --git a/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_af_thr.h b/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_af_thr.h new file mode 100644 index 0000000000..81d7a65a32 --- /dev/null +++ 
b/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_af_thr.h @@ -0,0 +1,73 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _RDSV3_AF_THR_H +#define _RDSV3_AF_THR_H + +/* + * This file is only present in Solaris + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <sys/sunddi.h> +#include <sys/callb.h> +#include <sys/cpuvar.h> +#include <sys/cpupart.h> +#include <sys/processor.h> + +typedef struct rdsv3_af_grp_s rdsv3_af_grp_t; +typedef struct rdsv3_af_thr_s rdsv3_af_thr_t; +typedef void (*rdsv3_af_thr_drain_func_t)(void *); + +void rdsv3_af_init(dev_info_t *dip); +/* + * create flags. + */ +#define SCQ_DEFAULT 0x0000 +#define SCQ_HCA_BIND_CPU 0x0001 /* bind hca to a cpu */ +#define SCQ_INTR_BIND_CPU 0x0002 /* bind soft cq to a cpu */ +#define SCQ_WRK_BIND_CPU 0x0004 /* bind worker to a cpu */ + +rdsv3_af_grp_t *rdsv3_af_grp_create(ibt_hca_hdl_t hca, uint64_t id); +void rdsv3_af_grp_destroy(rdsv3_af_grp_t *hcagp); +void rdsv3_af_grp_draw(rdsv3_af_grp_t *hcagp); +ibt_sched_hdl_t rdsv3_af_grp_get_sched(rdsv3_af_grp_t *hcagp); + +rdsv3_af_thr_t *rdsv3_af_thr_create(rdsv3_af_thr_drain_func_t fn, void *data, + uint_t flag, rdsv3_af_grp_t *hcagp); +rdsv3_af_thr_t *rdsv3_af_intr_thr_create(rdsv3_af_thr_drain_func_t fn, + void *data, uint_t flag, rdsv3_af_grp_t *hcagp, ibt_cq_hdl_t ibt_cq_hdl); + +void rdsv3_af_thr_destroy(rdsv3_af_thr_t *ringp); +void rdsv3_af_thr_fire(rdsv3_af_thr_t *ringp); + +#ifdef __cplusplus +} +#endif + +#endif /* _RDSV3_AF_THR_H */ diff --git a/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_af_thr_impl.h b/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_af_thr_impl.h new file mode 100644 index 0000000000..1d6a6c83b8 --- /dev/null +++ b/usr/src/uts/common/sys/ib/clients/rdsv3/rdsv3_af_thr_impl.h @@ -0,0 +1,97 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _RDSV3_AF_THR_IMPL_H +#define _RDSV3_AF_THR_IMPL_H + +/* + * This file is only present in Solaris + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define ddi_intr_set_affinity set_intr_affinity +#include <sys/ib/clients/rdsv3/rdsv3_af_thr.h> +#define SCQ_BIND_CPU (SCQ_HCA_BIND_CPU | SCQ_WRK_BIND_CPU) + +#define RDSV3_AFT_MAX_CONN 4 +#define RDSV3_AFT_PER_CONN_CPU 1 +#define RDSV3_AFT_CONN_CPU_POOL (RDSV3_AFT_MAX_CONN * RDSV3_AFT_PER_CONN_CPU) + +#define RDSV3_CPUID_POOL_MAX 128 +static uint32_t rdsv3_cpuid_pool[RDSV3_CPUID_POOL_MAX]; +static int rdsv3_cpuid_pool_cnt; +#define RDSV3_MSIX_POOL_MAX 128 +static uint32_t rdsv3_msix_pool[RDSV3_MSIX_POOL_MAX]; +static int rdsv3_msix_pool_cnt; + +#define RDSV3_CPUFLAGS_ON 0x0001 +#define RDSV3_CPUFLAGS_OFF 0x0002 +#define RDSV3_CPUFLAGS_ASSIGNED 0x0004 +#define RDSV3_CPUFLAGS_INTR 0x0008 +#define RDSV3_CPUFLAGS_HCA 0x0010 + +#define RDSV3_CPUFLAGS_UNAVAIL (RDSV3_CPUFLAGS_OFF | RDSV3_CPUFLAGS_INTR) + +struct rdsv3_af_grp_s { + ibt_hca_hdl_t g_hca_hdl; + ibt_sched_hdl_t g_sched_hdl; + processorid_t g_hca_cpuid; + processorid_t g_conn_cpuid_pool[RDSV3_AFT_CONN_CPU_POOL]; + int g_conn_cpuid_idx; +}; + +struct rdsv3_af_thr_s { + /* Keep the most used members 64bytes cache aligned */ + kmutex_t aft_lock; /* lock before using any member */ + kcondvar_t aft_async; /* async thread blocks on */ + kthread_t *aft_worker; /* kernel thread id */ + void *aft_data; /* argument of cq_drain_func */ + processorid_t aft_cpuid; /* processor to bind to */ + uint16_t aft_state; /* state flags */ + uint16_t aft_cflag; /* creation flags */ + rdsv3_af_thr_drain_func_t aft_drain_func; + rdsv3_af_grp_t *aft_grp; + ddi_intr_handle_t aft_intr; /* intr cookie */ +}; + +/* + * State flags. + */ +#define AFT_PROC 0x0001 /* being processed */ +#define AFT_BOUND 0x0002 /* Worker thread is bound to a cpu */ +#define AFT_ARMED 0x0004 /* armed worker thread */ +#define AFT_CONDEMNED 0x0100 /* Being torn down */ + +static void rdsv3_af_thr_worker(rdsv3_af_thr_t *ringp); +static cpu_t *rdsv3_af_thr_bind(rdsv3_af_thr_t *ringp, processorid_t cpuid); +static void rdsv3_af_thr_unbind(rdsv3_af_thr_t *ringp); + +#ifdef __cplusplus +} +#endif + +#endif /* _RDSV3_AF_THR_IMPL_H */ diff --git a/usr/src/uts/common/sys/rds.h b/usr/src/uts/common/sys/rds.h index 0d375d82d7..dc1d281751 100644 --- a/usr/src/uts/common/sys/rds.h +++ b/usr/src/uts/common/sys/rds.h @@ -67,6 +67,7 @@ #define _RDSV3_RDS_H #include <sys/types.h> +#include <sys/socket.h> #ifdef __cplusplus extern "C" { @@ -341,15 +342,12 @@ struct rdsv3_get_mr_args { uint64_t flags; }; -#if 1 /* 1 at 1.5.1 */ -#include <sys/socket_impl.h> struct rdsv3_get_mr_for_dest_args { struct sockaddr_storage dest_addr; struct rdsv3_iovec vec; uint64_t cookie_addr; uint64_t flags; }; -#endif struct rdsv3_free_mr_args { rdsv3_rdma_cookie_t cookie; |
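Tying the new headers back to rdsv3_af_cpu_assign(): each CPU id in the pool carries ON/OFF/ASSIGNED/INTR/HCA flags, offline CPUs and CPUs claimed by MSI-X vectors count as unavailable, and allocation prefers an unclaimed CPU before falling back to sharing one. A compact model of that first-free-else-share scan (the flag values mirror the header above; the rest is illustrative):

#include <stdio.h>

#define F_OFF      0x02
#define F_ASSIGNED 0x04
#define F_INTR     0x08
#define F_UNAVAIL  (F_OFF | F_INTR)

#define NCPU 8
static unsigned cpuflags[NCPU];

/* Prefer an unclaimed CPU; otherwise share the first usable one. */
static int
pick_cpu(void)
{
    int i;

    for (i = 0; i < NCPU; i++) {
        if (!(cpuflags[i] & (F_UNAVAIL | F_ASSIGNED))) {
            cpuflags[i] |= F_ASSIGNED;
            return (i);
        }
    }
    for (i = 0; i < NCPU; i++) {
        if (!(cpuflags[i] & F_UNAVAIL))
            return (i);             /* shared, not marked ASSIGNED */
    }
    return (0);                     /* last resort: CPU 0 */
}

int
main(void)
{
    cpuflags[0] = F_INTR;           /* taken by an MSI-X vector */
    cpuflags[1] = F_OFF;            /* offline */
    printf("first pick: cpu %d\n", pick_cpu());   /* cpu 2 */
    printf("second pick: cpu %d\n", pick_cpu());  /* cpu 3 */
    return (0);
}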