author | Faramarz Jalalian - Sun Microsystems - Irvine United States <Faramarz.Jalalian@Sun.COM> | 2009-05-26 21:50:26 -0700
committer | Faramarz Jalalian - Sun Microsystems - Irvine United States <Faramarz.Jalalian@Sun.COM> | 2009-05-26 21:50:26 -0700
commit | 7f379ad161f54e78771cca76492a1bf75c316284
tree | 2a09ea192102b2d747825bbf90a49d78829253c9
parent | 189680041ed64164883a8097c9d6ebcf9559e0c7
6802643 rpcib: Add support for using multiple HCAs (tag: onnv_116)
6818112 rpc/rdma needs to select the same local source address on retries
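
The second fix (6818112) works by having the RDMA client handle cache the local address of the connection it actually obtained: the diff adds a `cku_srcaddr` netbuf to `cku_private` in clnt_rdma.c and copies `conn->c_laddr` into it after each successful `RDMA_GET_CONN`, so that a retry can request a connection from the same source. Below is a minimal user-space sketch of that grow-and-copy pattern; `addrbuf_t`, `client_handle_t`, and `remember_source()` are stand-ins invented for the example, not the kernel structures, and error handling is omitted.

```c
#include <stdlib.h>
#include <string.h>

typedef struct {
	char	*buf;
	size_t	len;
	size_t	maxlen;
} addrbuf_t;

typedef struct {
	addrbuf_t	srcaddr;	/* local address cached after connect */
	addrbuf_t	dstaddr;	/* server (destination) address */
} client_handle_t;

/*
 * After a connection succeeds, cache its local address in the client
 * handle so a later retry can ask for the same source address.
 * Mirrors the cku_srcaddr handling added in clnt_rdma.c; allocation
 * failure handling is omitted for brevity.
 */
static void
remember_source(client_handle_t *h, const addrbuf_t *laddr)
{
	if (h->srcaddr.maxlen < laddr->len) {
		free(h->srcaddr.buf);
		h->srcaddr.buf = calloc(1, laddr->maxlen);
		h->srcaddr.maxlen = laddr->maxlen;
	}
	h->srcaddr.len = laddr->len;
	memcpy(h->srcaddr.buf, laddr->buf, laddr->len);
}
```

On a retry the cached source address is passed alongside the destination: the patch changes `RDMA_GET_CONN` (and `rib_conn_get()`) to take both a source and a destination netbuf, so connection lookup can prefer an existing connection with the same local endpoint.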
-rw-r--r-- | usr/src/uts/common/rpc/clnt_rdma.c | 23
-rw-r--r-- | usr/src/uts/common/rpc/ib.h | 62
-rw-r--r-- | usr/src/uts/common/rpc/rpc_rdma.h | 8
-rw-r--r-- | usr/src/uts/common/rpc/rpcib.c | 972
4 files changed, 688 insertions, 377 deletions
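
For the multi-HCA part (6802643), rpcib stops keeping a single `rib_stat->hca` pointer and instead maintains a linked list `rib_stat->hcas_list`, protected by `hcas_list_lock`; lookups such as `rpcib_find_hca()` simply walk the list by GUID. The sketch below shows that lookup pattern in isolation; `hca_node_t` and `find_hca()` are simplified stand-ins, not the kernel definitions.

```c
#include <stddef.h>
#include <stdint.h>

typedef struct hca_node {
	uint64_t		guid;	/* stands in for ib_guid_t */
	struct hca_node		*next;
} hca_node_t;

/*
 * Find an already-opened HCA by GUID.  As with rpcib_find_hca() in the
 * patch, the caller is expected to hold the list lock.
 */
static hca_node_t *
find_hca(hca_node_t *list, uint64_t guid)
{
	hca_node_t *hca = list;

	while (hca != NULL && hca->guid != guid)
		hca = hca->next;
	return (hca);
}
```

The same walk-the-list-under-a-reader-lock pattern recurs throughout the diff below: `rib_listen()`, `rib_ping_srv()`, `rib_conn_get()`, and the kstat update all iterate over `hcas_list` instead of touching one global HCA.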
diff --git a/usr/src/uts/common/rpc/clnt_rdma.c b/usr/src/uts/common/rpc/clnt_rdma.c index 7efa276ab3..207b4cbf21 100644 --- a/usr/src/uts/common/rpc/clnt_rdma.c +++ b/usr/src/uts/common/rpc/clnt_rdma.c @@ -116,6 +116,7 @@ typedef struct cku_private { CLIENT cku_client; /* client handle */ rdma_mod_t *cku_rd_mod; /* underlying RDMA mod */ void *cku_rd_handle; /* underlying RDMA device */ + struct netbuf cku_srcaddr; /* source address for retries */ struct netbuf cku_addr; /* remote netbuf address */ int cku_addrfmly; /* for finding addr_type */ struct rpc_err cku_err; /* error status */ @@ -267,6 +268,9 @@ clnt_rdma_kcreate(char *proto, void *handle, struct netbuf *raddr, int family, * Set up the rpc information */ p->cku_cred = cred; + p->cku_srcaddr.buf = kmem_zalloc(raddr->maxlen, KM_SLEEP); + p->cku_srcaddr.maxlen = raddr->maxlen; + p->cku_srcaddr.len = 0; p->cku_addr.buf = kmem_zalloc(raddr->maxlen, KM_SLEEP); p->cku_addr.maxlen = raddr->maxlen; p->cku_addr.len = raddr->len; @@ -282,6 +286,7 @@ clnt_rdma_kdestroy(CLIENT *h) { struct cku_private *p = htop(h); + kmem_free(p->cku_srcaddr.buf, p->cku_srcaddr.maxlen); kmem_free(p->cku_addr.buf, p->cku_addr.maxlen); kmem_free(p, sizeof (*p)); } @@ -325,6 +330,8 @@ clnt_rdma_kinit(CLIENT *h, char *proto, void *handle, struct netbuf *raddr, p->cku_addr.maxlen = raddr->maxlen; } + p->cku_srcaddr.len = 0; + p->cku_addr.len = raddr->len; bcopy(raddr->buf, p->cku_addr.buf, raddr->len); h->cl_ops = &rdma_clnt_ops; @@ -646,8 +653,8 @@ call_again: if (p->cku_xid == 0) p->cku_xid = alloc_xid(); - status = RDMA_GET_CONN(p->cku_rd_mod->rdma_ops, &p->cku_addr, - p->cku_addrfmly, p->cku_rd_handle, &conn); + status = RDMA_GET_CONN(p->cku_rd_mod->rdma_ops, &p->cku_srcaddr, + &p->cku_addr, p->cku_addrfmly, p->cku_rd_handle, &conn); rw_exit(&rdma_lock); /* @@ -703,6 +710,18 @@ call_again: return (p->cku_err.re_status); } + if (p->cku_srcaddr.maxlen < conn->c_laddr.len) { + if ((p->cku_srcaddr.maxlen != 0) && + (p->cku_srcaddr.buf != NULL)) + kmem_free(p->cku_srcaddr.buf, p->cku_srcaddr.maxlen); + p->cku_srcaddr.buf = kmem_zalloc(conn->c_laddr.maxlen, + KM_SLEEP); + p->cku_srcaddr.maxlen = conn->c_laddr.maxlen; + } + + p->cku_srcaddr.len = conn->c_laddr.len; + bcopy(conn->c_laddr.buf, p->cku_srcaddr.buf, conn->c_laddr.len); + clnt_check_credit(conn); status = CLNT_RDMA_FAIL; diff --git a/usr/src/uts/common/rpc/ib.h b/usr/src/uts/common/rpc/ib.h index 938dec5a25..971bfead98 100644 --- a/usr/src/uts/common/rpc/ib.h +++ b/usr/src/uts/common/rpc/ib.h @@ -168,35 +168,35 @@ struct rib_cq_s { }; /* + * Each registered service's data structure. + */ +typedef struct rib_service_s rib_service_t; +struct rib_service_s { + uint32_t srv_type; /* i.e, NFS, NLM, v4CBD */ + ibt_srv_hdl_t srv_hdl; /* from ibt_register call */ + ib_svc_id_t srv_id; + rib_service_t *next; +}; + +/* * RPCIB plugin state */ typedef struct rpcib_state { ibt_clnt_hdl_t ibt_clnt_hdl; uint32_t hca_count; uint32_t nhca_inited; - ib_guid_t *hca_guids; - rib_hca_t *hcas; + rib_hca_t *hcas_list; + krwlock_t hcas_list_lock; /* protects hcas_list */ int refcount; kmutex_t open_hca_lock; - rib_hca_t *hca; /* the hca being used */ queue_t *q; /* up queue for a serv_type */ - uint32_t service_type; /* NFS, NLM, etc */ void *private; + rib_service_t *service_list; + krwlock_t service_list_lock; + kmutex_t listen_lock; } rpcib_state_t; /* - * Each registered service's data structure. - * Each HCA has a list of these structures, which are the registered - * services on this HCA. 
- */ -typedef struct rib_service rib_service_t; -struct rib_service { - uint32_t srv_type; /* i.e, NFS, NLM, v4CBD */ - ibt_srv_hdl_t srv_hdl; /* from ibt_register call */ - rib_service_t *srv_next; -}; - -/* * Connection lists */ typedef struct { @@ -209,6 +209,14 @@ enum hca_state { HCA_INITED, /* hca in up and running state */ }; +typedef struct rib_hca_service_s rib_hca_service_t; +struct rib_hca_service_s { + ib_svc_id_t srv_id; + ib_gid_t gid; + ibt_sbind_hdl_t sbind_hdl; + rib_hca_service_t *next; +}; + /* * RPCIB per HCA structure */ @@ -221,6 +229,8 @@ struct rib_hca_s { ibt_hca_hdl_t hca_hdl; /* HCA handle */ ibt_hca_attr_t hca_attrs; /* HCA attributes */ ibt_pd_hdl_t pd_hdl; + rib_hca_service_t *bound_services; + krwlock_t bound_services_lock; ib_guid_t hca_guid; uint32_t hca_nports; ibt_hca_portinfo_t *hca_ports; @@ -229,15 +239,6 @@ struct rib_hca_s { krwlock_t state_lock; /* protects state field */ bool_t inuse; /* indicates HCA usage */ kmutex_t inuse_lock; /* protects inuse field */ - /* - * List of services registered on all ports available - * on this HCA. Only one consumer of KRPC can register - * its services at one time or tear them down at one - * time. - */ - rib_service_t *service_list; - krwlock_t service_list_lock; - rib_conn_list_t cl_conn_list; /* client conn list */ rib_conn_list_t srv_conn_list; /* server conn list */ @@ -259,11 +260,20 @@ struct rib_hca_s { kmutex_t avl_lock; krwlock_t avl_rw_lock; volatile bool_t avl_init; - kmutex_t cache_allocation; + kmutex_t cache_allocation_lock; ddi_taskq_t *cleanup_helper; ib_svc_id_t srv_id; ibt_srv_hdl_t srv_hdl; uint_t reg_state; + + volatile uint64_t cache_allocation; + uint64_t cache_hits; + uint64_t cache_misses; + uint64_t cache_cold_misses; + uint64_t cache_hot_misses; + uint64_t cache_misses_above_the_limit; + + struct rib_hca_s *next; }; diff --git a/usr/src/uts/common/rpc/rpc_rdma.h b/usr/src/uts/common/rpc/rpc_rdma.h index daa64a9122..cb35888be0 100644 --- a/usr/src/uts/common/rpc/rpc_rdma.h +++ b/usr/src/uts/common/rpc/rpc_rdma.h @@ -406,8 +406,8 @@ typedef struct rdmaops { rdma_stat (*rdma_reachable)(int addr_type, struct netbuf *, void **handle); /* Connection */ - rdma_stat (*rdma_get_conn)(struct netbuf *, int addr_type, - void *, CONN **); + rdma_stat (*rdma_get_conn)(struct netbuf *, struct netbuf *, + int addr_type, void *, CONN **); rdma_stat (*rdma_rel_conn)(CONN *); /* Server side listner start and stop routines */ void (*rdma_svc_listen)(struct rdma_svc_data *); @@ -452,8 +452,8 @@ extern rdma_svc_wait_t rdma_wait; #define RDMA_REACHABLE(rdma_ops, addr_type, addr, handle) \ (*(rdma_ops)->rdma_reachable)(addr_type, addr, handle) -#define RDMA_GET_CONN(rdma_ops, addr, addr_type, handle, conn) \ - (*(rdma_ops)->rdma_get_conn)(addr, addr_type, handle, conn) +#define RDMA_GET_CONN(rdma_ops, saddr, daddr, addr_type, handle, conn) \ + (*(rdma_ops)->rdma_get_conn)(saddr, daddr, addr_type, handle, conn) #define RDMA_REL_CONN(conn) \ (*(conn)->c_rdmamod->rdma_ops->rdma_rel_conn)(conn) diff --git a/usr/src/uts/common/rpc/rpcib.c b/usr/src/uts/common/rpc/rpcib.c index bbd38f8f9d..88f1a87e4c 100644 --- a/usr/src/uts/common/rpc/rpcib.c +++ b/usr/src/uts/common/rpc/rpcib.c @@ -114,6 +114,10 @@ static int rpcib_do_ip_ioctl(int, int, void *); static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *); static int rpcib_cache_kstat_update(kstat_t *, int); static void rib_force_cleanup(void *); +static void rib_stop_hca_services(rib_hca_t *); +static void rib_attach_hca(void); +static int 
rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr, + struct netbuf *d_svcaddr, CONN **conn); struct { kstat_named_t cache_limit; @@ -206,15 +210,8 @@ typedef struct cache_struct { avl_node_t avl_link; } cache_avl_struct_t; -static uint64_t rib_total_buffers = 0; uint64_t cache_limit = 100 * 1024 * 1024; -static volatile uint64_t cache_allocation = 0; static uint64_t cache_watermark = 80 * 1024 * 1024; -static uint64_t cache_hits = 0; -static uint64_t cache_misses = 0; -static uint64_t cache_cold_misses = 0; -static uint64_t cache_hot_misses = 0; -static uint64_t cache_misses_above_the_limit = 0; static bool_t stats_enabled = FALSE; static uint64_t max_unsignaled_rws = 5; @@ -302,7 +299,8 @@ static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid); static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait); static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait); static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *); -static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **); +static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *, + int addr_type, void *, CONN **); static rdma_stat rib_conn_release(CONN *conn); static rdma_stat rib_getinfo(rdma_info_t *info); @@ -357,7 +355,7 @@ static rdma_mod_t rib_mod = { &rib_ops, /* rdma op vector for ibtf */ }; -static rdma_stat open_hcas(rpcib_state_t *); +static rdma_stat rpcib_open_hcas(rpcib_state_t *); static rdma_stat rib_qp_init(rib_qp_t *, int); static void rib_svc_scq_handler(ibt_cq_hdl_t, void *); static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *); @@ -494,6 +492,65 @@ rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) return (ret); } +static void +rpcib_free_hca_list() +{ + rib_hca_t *hca, *hcap; + + rw_enter(&rib_stat->hcas_list_lock, RW_WRITER); + hca = rib_stat->hcas_list; + rib_stat->hcas_list = NULL; + rw_exit(&rib_stat->hcas_list_lock); + while (hca != NULL) { + rw_enter(&hca->state_lock, RW_WRITER); + hcap = hca; + hca = hca->next; + rib_stat->nhca_inited--; + rib_mod.rdma_count--; + hcap->state = HCA_DETACHED; + rw_exit(&hcap->state_lock); + rib_stop_hca_services(hcap); + + kmem_free(hcap, sizeof (*hcap)); + } +} + +static rdma_stat +rpcib_free_service_list() +{ + rib_service_t *service; + ibt_status_t ret; + + rw_enter(&rib_stat->service_list_lock, RW_WRITER); + while (rib_stat->service_list != NULL) { + service = rib_stat->service_list; + ret = ibt_unbind_all_services(service->srv_hdl); + if (ret != IBT_SUCCESS) { + rw_exit(&rib_stat->service_list_lock); +#ifdef DEBUG + cmn_err(CE_NOTE, "rpcib_free_service_list: " + "ibt_unbind_all_services failed (%d)\n", (int)ret); +#endif + return (RDMA_FAILED); + } + ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl, + service->srv_hdl); + if (ret != IBT_SUCCESS) { + rw_exit(&rib_stat->service_list_lock); +#ifdef DEBUG + cmn_err(CE_NOTE, "rpcib_free_service_list: " + "ibt_deregister_service failed (%d)\n", (int)ret); +#endif + return (RDMA_FAILED); + } + rib_stat->service_list = service->next; + kmem_free(service, sizeof (rib_service_t)); + } + rw_exit(&rib_stat->service_list_lock); + + return (RDMA_SUCCESS); +} + static int rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { @@ -530,10 +587,14 @@ rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) if (rib_stat == NULL) { rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP); mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL); + rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL); + 
mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL); } - rib_stat->hca_count = ibt_get_hca_list(&rib_stat->hca_guids); + rib_stat->hca_count = ibt_get_hca_list(NULL); if (rib_stat->hca_count < 1) { + mutex_destroy(&rib_stat->listen_lock); + rw_destroy(&rib_stat->hcas_list_lock); mutex_destroy(&rib_stat->open_hca_lock); kmem_free(rib_stat, sizeof (*rib_stat)); rib_stat = NULL; @@ -544,15 +605,18 @@ rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) (void *)rib_stat, &rib_stat->ibt_clnt_hdl); if (ibt_status != IBT_SUCCESS) { - ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count); + mutex_destroy(&rib_stat->listen_lock); + rw_destroy(&rib_stat->hcas_list_lock); mutex_destroy(&rib_stat->open_hca_lock); kmem_free(rib_stat, sizeof (*rib_stat)); rib_stat = NULL; return (DDI_FAILURE); } + rib_stat->service_list = NULL; + rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL); mutex_enter(&rib_stat->open_hca_lock); - if (open_hcas(rib_stat) != RDMA_SUCCESS) { + if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) { mutex_exit(&rib_stat->open_hca_lock); goto open_fail; } @@ -568,7 +632,6 @@ rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) /* * Register with rdmatf */ - rib_mod.rdma_count = rib_stat->nhca_inited; r_status = rdma_register_mod(&rib_mod); if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) { cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, " @@ -579,11 +642,15 @@ rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) return (DDI_SUCCESS); register_fail: - rib_detach_hca(rib_stat->hca); + open_fail: - ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count); (void) ibt_detach(rib_stat->ibt_clnt_hdl); + rpcib_free_hca_list(); + (void) rpcib_free_service_list(); + mutex_destroy(&rib_stat->listen_lock); + rw_destroy(&rib_stat->hcas_list_lock); mutex_destroy(&rib_stat->open_hca_lock); + rw_destroy(&rib_stat->service_list_lock); kmem_free(rib_stat, sizeof (*rib_stat)); rib_stat = NULL; return (DDI_FAILURE); @@ -609,15 +676,17 @@ rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) mutex_enter(&plugin_state_lock); plugin_state = NO_ACCEPT; mutex_exit(&plugin_state_lock); - rib_detach_hca(rib_stat->hca); - ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count); + + if (rpcib_free_service_list() != RDMA_SUCCESS) + return (DDI_FAILURE); + rpcib_free_hca_list(); + (void) ibt_detach(rib_stat->ibt_clnt_hdl); + mutex_destroy(&rib_stat->listen_lock); + rw_destroy(&rib_stat->hcas_list_lock); mutex_destroy(&rib_stat->open_hca_lock); - if (rib_stat->hcas) { - kmem_free(rib_stat->hcas, rib_stat->hca_count * - sizeof (rib_hca_t)); - rib_stat->hcas = NULL; - } + rw_destroy(&rib_stat->service_list_lock); + kmem_free(rib_stat, sizeof (*rib_stat)); rib_stat = NULL; @@ -644,7 +713,7 @@ static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *); */ static rdma_stat rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler, - rib_cq_t **cqp, rpcib_state_t *ribstat) + rib_cq_t **cqp) { rib_cq_t *cq; ibt_cq_attr_t cq_attr; @@ -664,7 +733,7 @@ rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler, error = RDMA_FAILED; goto fail; } - ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, ribstat); + ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca); /* * Enable CQ callbacks. CQ Callbacks are single shot @@ -689,8 +758,25 @@ fail: return (error); } +/* + * rpcib_find_hca + * + * Caller should have already locked the hcas_lock before calling + * this function. 
+ */ +static rib_hca_t * +rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid) +{ + rib_hca_t *hca = ribstat->hcas_list; + + while (hca && hca->hca_guid != guid) + hca = hca->next; + + return (hca); +} + static rdma_stat -open_hcas(rpcib_state_t *ribstat) +rpcib_open_hcas(rpcib_state_t *ribstat) { rib_hca_t *hca; ibt_status_t ibt_status; @@ -702,25 +788,31 @@ open_hcas(rpcib_state_t *ribstat) kstat_t *ksp; cache_avl_struct_t example_avl_node; char rssc_name[32]; + int old_nhca_inited = ribstat->nhca_inited; + ib_guid_t *hca_guids; ASSERT(MUTEX_HELD(&ribstat->open_hca_lock)); - if (ribstat->hcas == NULL) - ribstat->hcas = kmem_zalloc(ribstat->hca_count * - sizeof (rib_hca_t), KM_SLEEP); + ribstat->hca_count = ibt_get_hca_list(&hca_guids); + if (ribstat->hca_count == 0) + return (RDMA_FAILED); + rw_enter(&ribstat->hcas_list_lock, RW_WRITER); /* * Open a hca and setup for RDMA */ for (i = 0; i < ribstat->hca_count; i++) { + if (rpcib_find_hca(ribstat, hca_guids[i])) + continue; + hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP); + ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl, - ribstat->hca_guids[i], - &ribstat->hcas[i].hca_hdl); + hca_guids[i], &hca->hca_hdl); if (ibt_status != IBT_SUCCESS) { + kmem_free(hca, sizeof (rib_hca_t)); continue; } - ribstat->hcas[i].hca_guid = ribstat->hca_guids[i]; - hca = &(ribstat->hcas[i]); + hca->hca_guid = hca_guids[i]; hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl; hca->state = HCA_INITED; @@ -763,25 +855,25 @@ open_hcas(rpcib_state_t *ribstat) * cq's will be needed. */ status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler, - &hca->svc_rcq, ribstat); + &hca->svc_rcq); if (status != RDMA_SUCCESS) { goto fail3; } status = rib_create_cq(hca, cq_size, rib_svc_scq_handler, - &hca->svc_scq, ribstat); + &hca->svc_scq); if (status != RDMA_SUCCESS) { goto fail3; } status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler, - &hca->clnt_rcq, ribstat); + &hca->clnt_rcq); if (status != RDMA_SUCCESS) { goto fail3; } status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler, - &hca->clnt_scq, ribstat); + &hca->clnt_scq); if (status != RDMA_SUCCESS) { goto fail3; } @@ -805,7 +897,8 @@ open_hcas(rpcib_state_t *ribstat) if (hca->server_side_cache == NULL) { (void) sprintf(rssc_name, - "rib_server_side_cache_%04d", i); + "rib_srvr_cache_%llx", + (long long unsigned int) hca->hca_guid); hca->server_side_cache = kmem_cache_create( rssc_name, sizeof (cache_avl_struct_t), 0, @@ -821,9 +914,12 @@ open_hcas(rpcib_state_t *ribstat) (uint_t)(uintptr_t)&example_avl_node.avl_link- (uint_t)(uintptr_t)&example_avl_node); + rw_init(&hca->bound_services_lock, NULL, RW_DRIVER, + hca->iblock); + rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock); rw_init(&hca->avl_rw_lock, NULL, RW_DRIVER, hca->iblock); - mutex_init(&hca->cache_allocation, + mutex_init(&hca->cache_allocation_lock, NULL, MUTEX_DRIVER, NULL); hca->avl_init = TRUE; @@ -844,34 +940,28 @@ open_hcas(rpcib_state_t *ribstat) } } if (hca->cleanup_helper == NULL) { + char tq_name[sizeof (hca->hca_guid) * 2 + 1]; + + (void) snprintf(tq_name, sizeof (tq_name), "%llX", + (unsigned long long int) hca->hca_guid); hca->cleanup_helper = ddi_taskq_create(NULL, - "CLEANUP_HELPER", 1, TASKQ_DEFAULTPRI, 0); + tq_name, 1, TASKQ_DEFAULTPRI, 0); } - /* - * Initialize the registered service list and - * the lock - */ - hca->service_list = NULL; - rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock); - mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL); 
rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER, hca->iblock); rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER, hca->iblock); - rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock); mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock); hca->inuse = TRUE; - /* - * XXX One hca only. Add multi-hca functionality if needed - * later. - */ - ribstat->hca = hca; + + hca->next = ribstat->hcas_list; + ribstat->hcas_list = hca; ribstat->nhca_inited++; ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); - break; + continue; fail3: ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); @@ -879,9 +969,16 @@ fail2: (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); fail1: (void) ibt_close_hca(hca->hca_hdl); - + kmem_free(hca, sizeof (rib_hca_t)); } - if (ribstat->hca != NULL) + rw_exit(&ribstat->hcas_list_lock); + ibt_free_hca_list(hca_guids, ribstat->hca_count); + rib_mod.rdma_count = rib_stat->nhca_inited; + + /* + * return success if at least one new hca has been configured. + */ + if (ribstat->nhca_inited != old_nhca_inited) return (RDMA_SUCCESS); else return (RDMA_FAILED); @@ -1335,6 +1432,15 @@ rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) } } +static void +rib_attach_hca() +{ + mutex_enter(&rib_stat->open_hca_lock); + rpcib_open_hcas(rib_stat); + rib_listen(NULL); + mutex_exit(&rib_stat->open_hca_lock); +} + /* * Handles DR event of IBT_HCA_DETACH_EVENT. */ @@ -1343,20 +1449,46 @@ static void rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, ibt_async_code_t code, ibt_async_event_t *event) { - switch (code) { case IBT_HCA_ATTACH_EVENT: - /* ignore */ + rib_attach_hca(); break; case IBT_HCA_DETACH_EVENT: { - ASSERT(rib_stat->hca->hca_hdl == hca_hdl); - rib_detach_hca(rib_stat->hca); + rib_hca_t *hca; + + rw_enter(&rib_stat->hcas_list_lock, RW_READER); + for (hca = rib_stat->hcas_list; hca; hca = hca->next) { + rw_enter(&hca->state_lock, RW_READER); + if ((hca->state != HCA_DETACHED) && + (hca->hca_hdl == hca_hdl)) { + rw_exit(&hca->state_lock); + break; + } + rw_exit(&hca->state_lock); + } + rw_exit(&rib_stat->hcas_list_lock); + + if (hca == NULL) + return; + ASSERT(hca->hca_hdl == hca_hdl); + rib_detach_hca(hca); #ifdef DEBUG cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n"); #endif break; } + case IBT_EVENT_PORT_UP: + /* + * A port is up. We should call rib_listen() since there is + * a chance that rib_listen() may have failed during + * rib_attach_hca() because the port had not been up yet. 
+ */ + rib_listen(NULL); +#ifdef DEBUG + cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n"); +#endif + break; #ifdef DEBUG case IBT_EVENT_PATH_MIGRATED: cmn_err(CE_NOTE, "rib_async_handler(): " @@ -1390,9 +1522,6 @@ rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, case IBT_ERROR_PORT_DOWN: cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n"); break; - case IBT_EVENT_PORT_UP: - cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n"); - break; case IBT_ASYNC_OPAQUE1: cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n"); break; @@ -1420,18 +1549,8 @@ rib_reachable(int addr_type, struct netbuf *raddr, void **handle) rdma_stat status; rpcib_ping_t rpt; - /* - * First check if a hca is still attached - */ - rw_enter(&rib_stat->hca->state_lock, RW_READER); - if (rib_stat->hca->state != HCA_INITED) { - rw_exit(&rib_stat->hca->state_lock); - return (RDMA_FAILED); - } - bzero(&rpt, sizeof (rpcib_ping_t)); status = rib_ping_srv(addr_type, raddr, &rpt); - rw_exit(&rib_stat->hca->state_lock); if (status == RDMA_SUCCESS) { *handle = (void *)rpt.hca; @@ -1589,11 +1708,9 @@ rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event, ibt_cm_return_args_t *ret_args, void *priv_data, ibt_priv_data_len_t len) { - rpcib_state_t *ribstat; rib_hca_t *hca; - ribstat = (rpcib_state_t *)clnt_hdl; - hca = (rib_hca_t *)ribstat->hca; + hca = (rib_hca_t *)clnt_hdl; switch (event->cm_type) { @@ -1643,8 +1760,11 @@ rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event, */ conn->c_state = C_DISCONN_PEND; mutex_exit(&conn->c_lock); - (void) rib_disconnect_channel(conn, - &hca->cl_conn_list); + rw_enter(&hca->state_lock, RW_READER); + if (hca->state != HCA_DETACHED) + (void) rib_disconnect_channel(conn, + &hca->cl_conn_list); + rw_exit(&hca->state_lock); } else { /* * conn will be freed when c_ref goes to 0. 
@@ -1736,7 +1856,7 @@ rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp) chan_args.oc_path = &rptp->path; chan_args.oc_cm_handler = rib_clnt_cm_handler; - chan_args.oc_cm_clnt_private = (void *)rib_stat; + chan_args.oc_cm_clnt_private = (void *)hca; chan_args.oc_rdma_ra_out = 4; chan_args.oc_rdma_ra_in = 4; chan_args.oc_path_retry_cnt = 2; @@ -1799,7 +1919,7 @@ refresh: rdma_stat rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp) { - uint_t i; + uint_t i, addr_count; ibt_status_t ibt_status; uint8_t num_paths_p; ibt_ip_path_attr_t ipattr; @@ -1808,8 +1928,11 @@ rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp) rpcib_ipaddrs_t addrs6; struct sockaddr_in *sinp; struct sockaddr_in6 *sin6p; - rdma_stat retval = RDMA_SUCCESS; + rdma_stat retval = RDMA_FAILED; + rib_hca_t *hca; + if ((addr_type != AF_INET) && (addr_type != AF_INET6)) + return (RDMA_INVAL); ASSERT(raddr->buf != NULL); bzero(&ipattr, sizeof (ibt_ip_path_attr_t)); @@ -1817,57 +1940,45 @@ rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp) if (!rpcib_get_ib_addresses(&addrs4, &addrs6) || (addrs4.ri_count == 0 && addrs6.ri_count == 0)) { retval = RDMA_FAILED; - goto done; + goto done2; } - /* Prep the destination address */ - switch (addr_type) { - case AF_INET: + if (addr_type == AF_INET) { + addr_count = addrs4.ri_count; sinp = (struct sockaddr_in *)raddr->buf; rptp->dstip.family = AF_INET; rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr; sinp = addrs4.ri_list; - - ipattr.ipa_dst_ip = &rptp->dstip; - ipattr.ipa_hca_guid = rib_stat->hca->hca_guid; - ipattr.ipa_ndst = 1; - ipattr.ipa_max_paths = 1; - ipattr.ipa_src_ip.family = rptp->dstip.family; - for (i = 0; i < addrs4.ri_count; i++) { - num_paths_p = 0; - ipattr.ipa_src_ip.un.ip4addr = sinp[i].sin_addr.s_addr; - bzero(&srcip, sizeof (ibt_path_ip_src_t)); - - ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, - IBT_PATH_NO_FLAGS, &ipattr, &rptp->path, - &num_paths_p, &srcip); - if (ibt_status == IBT_SUCCESS && - num_paths_p != 0 && - rptp->path.pi_hca_guid == rib_stat->hca->hca_guid) { - rptp->hca = rib_stat->hca; - rptp->srcip.family = AF_INET; - rptp->srcip.un.ip4addr = - srcip.ip_primary.un.ip4addr; - goto done; - } - } - retval = RDMA_FAILED; - break; - - case AF_INET6: + } else { + addr_count = addrs6.ri_count; sin6p = (struct sockaddr_in6 *)raddr->buf; rptp->dstip.family = AF_INET6; rptp->dstip.un.ip6addr = sin6p->sin6_addr; sin6p = addrs6.ri_list; + } + + rw_enter(&rib_stat->hcas_list_lock, RW_READER); + for (hca = rib_stat->hcas_list; hca; hca = hca->next) { + rw_enter(&hca->state_lock, RW_READER); + if (hca->state == HCA_DETACHED) { + rw_exit(&hca->state_lock); + continue; + } ipattr.ipa_dst_ip = &rptp->dstip; - ipattr.ipa_hca_guid = rib_stat->hca->hca_guid; + ipattr.ipa_hca_guid = hca->hca_guid; ipattr.ipa_ndst = 1; ipattr.ipa_max_paths = 1; ipattr.ipa_src_ip.family = rptp->dstip.family; - for (i = 0; i < addrs6.ri_count; i++) { + for (i = 0; i < addr_count; i++) { num_paths_p = 0; - ipattr.ipa_src_ip.un.ip6addr = sin6p[i].sin6_addr; + if (addr_type == AF_INET) { + ipattr.ipa_src_ip.un.ip4addr = + sinp[i].sin_addr.s_addr; + } else { + ipattr.ipa_src_ip.un.ip6addr = + sin6p[i].sin6_addr; + } bzero(&srcip, sizeof (ibt_path_ip_src_t)); ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, @@ -1875,23 +1986,28 @@ rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp) &num_paths_p, &srcip); if (ibt_status == IBT_SUCCESS && num_paths_p != 0 && - rptp->path.pi_hca_guid == 
rib_stat->hca->hca_guid) { - rptp->hca = rib_stat->hca; - rptp->srcip.family = AF_INET6; - rptp->srcip.un.ip6addr = - srcip.ip_primary.un.ip6addr; - goto done; + rptp->path.pi_hca_guid == hca->hca_guid) { + rptp->hca = hca; + rw_exit(&hca->state_lock); + if (addr_type == AF_INET) { + rptp->srcip.family = AF_INET; + rptp->srcip.un.ip4addr = + srcip.ip_primary.un.ip4addr; + } else { + rptp->srcip.family = AF_INET6; + rptp->srcip.un.ip6addr = + srcip.ip_primary.un.ip6addr; + + } + retval = RDMA_SUCCESS; + goto done1; } } - retval = RDMA_FAILED; - break; - - default: - retval = RDMA_INVAL; - break; + rw_exit(&hca->state_lock); } -done: - +done1: + rw_exit(&rib_stat->hcas_list_lock); +done2: if (addrs4.ri_size > 0) kmem_free(addrs4.ri_list, addrs4.ri_size); if (addrs6.ri_size > 0) @@ -2802,7 +2918,6 @@ rib_srv_cm_handler(void *any, ibt_cm_event_t *event, { queue_t *q; rib_qp_t *qp; - rpcib_state_t *ribstat; rib_hca_t *hca; rdma_stat status = RDMA_SUCCESS; int i; @@ -2820,9 +2935,7 @@ rib_srv_cm_handler(void *any, ibt_cm_event_t *event, ASSERT(any != NULL); ASSERT(event != NULL); - ribstat = (rpcib_state_t *)any; - hca = (rib_hca_t *)ribstat->hca; - ASSERT(hca != NULL); + hca = (rib_hca_t *)any; /* got a connection request */ switch (event->cm_type) { @@ -3051,7 +3164,8 @@ rib_srv_cm_handler(void *any, ibt_cm_event_t *event, } static rdma_stat -rib_register_service(rib_hca_t *hca, int service_type) +rib_register_service(rib_hca_t *hca, int service_type, + uint8_t protocol_num, in_port_t dst_port) { ibt_srv_desc_t sdesc; ibt_hca_portinfo_t *port_infop; @@ -3060,7 +3174,7 @@ rib_register_service(rib_hca_t *hca, int service_type) uint_t port_size; uint_t pki, i, num_ports, nbinds; ibt_status_t ibt_status; - rib_service_t *new_service; + rib_service_t *service; ib_pkey_t pkey; /* @@ -3099,75 +3213,123 @@ rib_register_service(rib_hca_t *hca, int service_type) * IP addresses as its different names. For now the only * type of service we support in RPCIB is NFS. */ - rw_enter(&hca->service_list_lock, RW_WRITER); + rw_enter(&rib_stat->service_list_lock, RW_WRITER); /* * Start registering and binding service to active * on active ports on this HCA. */ nbinds = 0; - new_service = NULL; + for (service = rib_stat->service_list; + service && (service->srv_type != service_type); + service = service->next) + ; - /* - * We use IP addresses as the service names for - * service registration. Register each of them - * with CM to obtain a svc_id and svc_hdl. We do not - * register the service with machine's loopback address. - */ - (void) bzero(&srv_id, sizeof (ib_svc_id_t)); - (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t)); - (void) bzero(&sdesc, sizeof (ibt_srv_desc_t)); + if (service == NULL) { + /* + * We use IP addresses as the service names for + * service registration. Register each of them + * with CM to obtain a svc_id and svc_hdl. We do not + * register the service with machine's loopback address. 
+ */ + (void) bzero(&srv_id, sizeof (ib_svc_id_t)); + (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t)); + (void) bzero(&sdesc, sizeof (ibt_srv_desc_t)); + sdesc.sd_handler = rib_srv_cm_handler; + sdesc.sd_flags = 0; + ibt_status = ibt_register_service(hca->ibt_clnt_hdl, + &sdesc, ibt_get_ip_sid(protocol_num, dst_port), + 1, &srv_hdl, &srv_id); + if ((ibt_status != IBT_SUCCESS) && + (ibt_status != IBT_CM_SERVICE_EXISTS)) { + rw_exit(&rib_stat->service_list_lock); + DTRACE_PROBE1(rpcib__i__regservice__ibtres, + int, ibt_status); + ibt_free_portinfo(port_infop, port_size); + return (RDMA_FAILED); + } - sdesc.sd_handler = rib_srv_cm_handler; - sdesc.sd_flags = 0; - ibt_status = ibt_register_service(hca->ibt_clnt_hdl, - &sdesc, ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port), - 1, &srv_hdl, &srv_id); + /* + * Allocate and prepare a service entry + */ + service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP); + + service->srv_type = service_type; + service->srv_hdl = srv_hdl; + service->srv_id = srv_id; + + service->next = rib_stat->service_list; + rib_stat->service_list = service; + DTRACE_PROBE1(rpcib__i__regservice__new__service, + int, service->srv_type); + } else { + srv_hdl = service->srv_hdl; + srv_id = service->srv_id; + DTRACE_PROBE1(rpcib__i__regservice__existing__service, + int, service->srv_type); + } for (i = 0; i < num_ports; i++) { + ibt_sbind_hdl_t sbp; + rib_hca_service_t *hca_srv; + ib_gid_t gid; + if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) continue; for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) { pkey = port_infop[i].p_pkey_tbl[pki]; - if ((pkey & IBSRM_HB) && - (pkey != IB_PKEY_INVALID_FULL)) { + rw_enter(&hca->bound_services_lock, RW_READER); + gid = port_infop[i].p_sgid_tbl[0]; + for (hca_srv = hca->bound_services; hca_srv; + hca_srv = hca_srv->next) { + if ((hca_srv->srv_id == service->srv_id) && + (hca_srv->gid.gid_prefix == + gid.gid_prefix) && + (hca_srv->gid.gid_guid == gid.gid_guid)) + break; + } + rw_exit(&hca->bound_services_lock); + if (hca_srv != NULL) { /* - * Allocate and prepare a service entry + * port is alreay bound the the service */ - new_service = - kmem_zalloc(1 * sizeof (rib_service_t), - KM_SLEEP); + DTRACE_PROBE1( + rpcib__i__regservice__already__bound, + int, i+1); + nbinds++; + continue; + } - new_service->srv_type = service_type; - new_service->srv_hdl = srv_hdl; - new_service->srv_next = NULL; + if ((pkey & IBSRM_HB) && + (pkey != IB_PKEY_INVALID_FULL)) { + sbp = NULL; ibt_status = ibt_bind_service(srv_hdl, - port_infop[i].p_sgid_tbl[0], - NULL, rib_stat, NULL); + gid, NULL, hca, &sbp); + + if (ibt_status == IBT_SUCCESS) { + hca_srv = kmem_zalloc( + sizeof (rib_hca_service_t), + KM_SLEEP); + hca_srv->srv_id = srv_id; + hca_srv->gid = gid; + hca_srv->sbind_hdl = sbp; + + rw_enter(&hca->bound_services_lock, + RW_WRITER); + hca_srv->next = hca->bound_services; + hca->bound_services = hca_srv; + rw_exit(&hca->bound_services_lock); + nbinds++; + } DTRACE_PROBE1(rpcib__i__regservice__bindres, int, ibt_status); - - if (ibt_status != IBT_SUCCESS) { - kmem_free(new_service, - sizeof (rib_service_t)); - new_service = NULL; - continue; - } - - /* - * Add to the service list for this HCA - */ - new_service->srv_next = hca->service_list; - hca->service_list = new_service; - new_service = NULL; - nbinds++; } } } - rw_exit(&hca->service_list_lock); + rw_exit(&rib_stat->service_list_lock); ibt_free_portinfo(port_infop, port_size); @@ -3188,39 +3350,64 @@ rib_register_service(rib_hca_t *hca, int service_type) void rib_listen(struct rdma_svc_data *rd) { - 
rdma_stat status = RDMA_SUCCESS; - - rd->active = 0; - rd->err_code = RDMA_FAILED; + rdma_stat status; + int n_listening = 0; + rib_hca_t *hca; + mutex_enter(&rib_stat->listen_lock); /* - * First check if a hca is still attached + * if rd parameter is NULL then it means that rib_stat->q is + * already initialized by a call from RDMA and we just want to + * add a newly attached HCA to the same listening state as other + * HCAs. */ - rw_enter(&rib_stat->hca->state_lock, RW_READER); - if (rib_stat->hca->state != HCA_INITED) { - rw_exit(&rib_stat->hca->state_lock); - return; + if (rd == NULL) { + if (rib_stat->q == NULL) { + mutex_exit(&rib_stat->listen_lock); + return; + } + } else { + rib_stat->q = &rd->q; } - rw_exit(&rib_stat->hca->state_lock); + rw_enter(&rib_stat->hcas_list_lock, RW_READER); + for (hca = rib_stat->hcas_list; hca; hca = hca->next) { + /* + * First check if a hca is still attached + */ + rw_enter(&hca->state_lock, RW_READER); + if (hca->state != HCA_INITED) { + rw_exit(&hca->state_lock); + continue; + } + rw_exit(&hca->state_lock); - rib_stat->q = &rd->q; - /* - * Right now the only service type is NFS. Hence force feed this - * value. Ideally to communicate the service type it should be - * passed down in rdma_svc_data. - */ - rib_stat->service_type = NFS; - status = rib_register_service(rib_stat->hca, NFS); - if (status != RDMA_SUCCESS) { - rd->err_code = status; - return; + /* + * Right now the only service type is NFS. Hence + * force feed this value. Ideally to communicate + * the service type it should be passed down in + * rdma_svc_data. + */ + status = rib_register_service(hca, NFS, + IPPROTO_TCP, nfs_rdma_port); + if (status == RDMA_SUCCESS) + n_listening++; } + rw_exit(&rib_stat->hcas_list_lock); + /* * Service active on an HCA, check rd->err_code for more * explainable errors. */ - rd->active = 1; - rd->err_code = status; + if (rd) { + if (n_listening > 0) { + rd->active = 1; + rd->err_code = RDMA_SUCCESS; + } else { + rd->active = 0; + rd->err_code = RDMA_FAILED; + } + } + mutex_exit(&rib_stat->listen_lock); } /* XXXX */ @@ -3230,6 +3417,7 @@ rib_listen_stop(struct rdma_svc_data *svcdata) { rib_hca_t *hca; + mutex_enter(&rib_stat->listen_lock); /* * KRPC called the RDMATF to stop the listeners, this means * stop sending incomming or recieved requests to KRPC master @@ -3242,64 +3430,72 @@ rib_listen_stop(struct rdma_svc_data *svcdata) svcdata->active = 0; mutex_exit(&plugin_state_lock); - /* - * First check if a hca is still attached - */ - hca = rib_stat->hca; - rw_enter(&hca->state_lock, RW_READER); - if (hca->state != HCA_INITED) { + rw_enter(&rib_stat->hcas_list_lock, RW_READER); + for (hca = rib_stat->hcas_list; hca; hca = hca->next) { + /* + * First check if a hca is still attached + */ + rw_enter(&hca->state_lock, RW_READER); + if (hca->state == HCA_DETACHED) { + rw_exit(&hca->state_lock); + continue; + } + rib_close_channels(&hca->srv_conn_list); + rib_stop_services(hca); rw_exit(&hca->state_lock); - return; } - rib_close_channels(&hca->srv_conn_list); - rib_stop_services(hca); - rw_exit(&hca->state_lock); + rw_exit(&rib_stat->hcas_list_lock); + + /* + * Avoid rib_listen() using the stale q field. + * This could happen if a port goes up after all services + * are already unregistered. + */ + rib_stat->q = NULL; + mutex_exit(&rib_stat->listen_lock); } /* * Traverse the HCA's service list to unbind and deregister services. 
- * Instead of unbinding the service for a service handle by - * calling ibt_unbind_service() for each port/pkey, we unbind - * all the services for the service handle by making only one - * call to ibt_unbind_all_services(). Then, we deregister the - * service for the service handle. - * - * When traversing the entries in service_list, we compare the - * srv_hdl of the current entry with that of the next. If they - * are different or if the next entry is NULL, the current entry - * marks the last binding of the service handle. In this case, - * call ibt_unbind_all_services() and deregister the service for - * the service handle. If they are the same, the current and the - * next entries are bound to the same service handle. In this - * case, move on to the next entry. + * For each bound service of HCA to be removed, first find the corresponding + * service handle (srv_hdl) and then unbind the service by calling + * ibt_unbind_service(). */ static void rib_stop_services(rib_hca_t *hca) { - rib_service_t *srv_list, *to_remove; + rib_hca_service_t *srv_list, *to_remove; /* * unbind and deregister the services for this service type. * Right now there is only one service type. In future it will * be passed down to this function. */ - rw_enter(&hca->service_list_lock, RW_WRITER); - srv_list = hca->service_list; - while (srv_list != NULL) { - to_remove = srv_list; - srv_list = to_remove->srv_next; - if (srv_list == NULL || bcmp(to_remove->srv_hdl, - srv_list->srv_hdl, sizeof (ibt_srv_hdl_t))) { + rw_enter(&hca->bound_services_lock, RW_READER); + srv_list = hca->bound_services; + hca->bound_services = NULL; + rw_exit(&hca->bound_services_lock); - (void) ibt_unbind_all_services(to_remove->srv_hdl); - (void) ibt_deregister_service(hca->ibt_clnt_hdl, - to_remove->srv_hdl); - } + while (srv_list != NULL) { + rib_service_t *sc; - kmem_free(to_remove, sizeof (rib_service_t)); + to_remove = srv_list; + srv_list = to_remove->next; + rw_enter(&rib_stat->service_list_lock, RW_READER); + for (sc = rib_stat->service_list; + sc && (sc->srv_id != to_remove->srv_id); + sc = sc->next) + ; + /* + * if sc is NULL then the service doesn't exist anymore, + * probably just removed completely through rib_stat. + */ + if (sc != NULL) + (void) ibt_unbind_service(sc->srv_hdl, + to_remove->sbind_hdl); + rw_exit(&rib_stat->service_list_lock); + kmem_free(to_remove, sizeof (rib_hca_service_t)); } - hca->service_list = NULL; - rw_exit(&hca->service_list_lock); } static struct svc_recv * @@ -3436,7 +3632,7 @@ rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size, IBT_MR_ENABLE_WINDOW_BIND | spec; rw_enter(&hca->state_lock, RW_READER); - if (hca->state == HCA_INITED) { + if (hca->state != HCA_DETACHED) { ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl, &mem_attr, mr_hdlp, mr_descp); rw_exit(&hca->state_lock); @@ -3555,7 +3751,7 @@ rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf, mr_segment.ms_flags = IBT_SYNC_READ; } rw_enter(&hca->state_lock, RW_READER); - if (hca->state == HCA_INITED) { + if (hca->state != HCA_DETACHED) { status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1); rw_exit(&hca->state_lock); } else { @@ -3629,7 +3825,7 @@ rib_rbufpool_create(rib_hca_t *hca, int ptype, int num) sizeof (ibt_mr_desc_t), KM_SLEEP); rw_enter(&hca->state_lock, RW_READER); - if (hca->state != HCA_INITED) { + if (hca->state == HCA_DETACHED) { rw_exit(&hca->state_lock); goto fail; } @@ -3939,43 +4135,20 @@ rib_rm_conn(CONN *cn, rib_conn_list_t *connlist) } /* - * Connection management. 
- * IBTF does not support recycling of channels. So connections are only - * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or - * C_DISCONN_PEND state. No C_IDLE state. - * C_CONN_PEND state: Connection establishment in progress to the server. - * C_CONNECTED state: A connection when created is in C_CONNECTED state. - * It has an RC channel associated with it. ibt_post_send/recv are allowed - * only in this state. - * C_ERROR_CONN state: A connection transitions to this state when WRs on the - * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event - * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA. - * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when - * c_ref drops to 0 (this indicates that RPC has no more references to this - * connection), the connection should be destroyed. A connection transitions - * into this state when it is being destroyed. + * rib_find_hca_connection + * + * if there is an existing connection to the specified address then + * it will be returned in conn, otherwise conn will be set to NULL. + * Also cleans up any connection that is in error state. */ -/* ARGSUSED */ -static rdma_stat -rib_conn_get(struct netbuf *svcaddr, int addr_type, void *handle, CONN **conn) +static int +rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr, + struct netbuf *d_svcaddr, CONN **conn) { CONN *cn; - int status = RDMA_SUCCESS; - rib_hca_t *hca = rib_stat->hca; - rib_qp_t *qp; clock_t cv_stat, timout; - rpcib_ping_t rpt; - - if (hca == NULL) - return (RDMA_FAILED); - - rw_enter(&rib_stat->hca->state_lock, RW_READER); - if (hca->state == HCA_DETACHED) { - rw_exit(&rib_stat->hca->state_lock); - return (RDMA_FAILED); - } - rw_exit(&rib_stat->hca->state_lock); + *conn = NULL; again: rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); cn = hca->cl_conn_list.conn_hd; @@ -4004,8 +4177,18 @@ again: cn = cn->c_next; continue; } - if ((cn->c_raddr.len == svcaddr->len) && - bcmp(svcaddr->buf, cn->c_raddr.buf, svcaddr->len) == 0) { + + /* + * source address is only checked for if there is one, + * this is the case for retries. + */ + if ((cn->c_raddr.len == d_svcaddr->len) && + (bcmp(d_svcaddr->buf, cn->c_raddr.buf, + d_svcaddr->len) == 0) && + ((s_svcaddr->len == 0) || + ((cn->c_laddr.len == s_svcaddr->len) && + (bcmp(s_svcaddr->buf, cn->c_laddr.buf, + s_svcaddr->len) == 0)))) { /* * Our connection. Give up conn list lock * as we are done traversing the list. @@ -4015,7 +4198,7 @@ again: cn->c_ref++; /* sharing a conn */ mutex_exit(&cn->c_lock); *conn = cn; - return (status); + return (RDMA_SUCCESS); } if (cn->c_state == C_CONN_PEND) { /* @@ -4042,7 +4225,7 @@ again: if (cn->c_state == C_CONNECTED) { *conn = cn; mutex_exit(&cn->c_lock); - return (status); + return (RDMA_SUCCESS); } else { cn->c_ref--; mutex_exit(&cn->c_lock); @@ -4054,24 +4237,90 @@ again: cn = cn->c_next; } rw_exit(&hca->cl_conn_list.conn_lock); + *conn = NULL; + return (RDMA_FAILED); +} + +/* + * Connection management. + * IBTF does not support recycling of channels. So connections are only + * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or + * C_DISCONN_PEND state. No C_IDLE state. + * C_CONN_PEND state: Connection establishment in progress to the server. + * C_CONNECTED state: A connection when created is in C_CONNECTED state. + * It has an RC channel associated with it. ibt_post_send/recv are allowed + * only in this state. 
+ * C_ERROR_CONN state: A connection transitions to this state when WRs on the + * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event + * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA. + * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when + * c_ref drops to 0 (this indicates that RPC has no more references to this + * connection), the connection should be destroyed. A connection transitions + * into this state when it is being destroyed. + */ +/* ARGSUSED */ +static rdma_stat +rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr, + int addr_type, void *handle, CONN **conn) +{ + CONN *cn; + int status; + rib_hca_t *hca; + rib_qp_t *qp; + rpcib_ping_t rpt; + int s_addr_len; + char *s_addr_buf; + rw_enter(&rib_stat->hcas_list_lock, RW_READER); + for (hca = rib_stat->hcas_list; hca; hca = hca->next) { + rw_enter(&hca->state_lock, RW_READER); + if (hca->state != HCA_DETACHED) { + status = rib_find_hca_connection(hca, s_svcaddr, + d_svcaddr, conn); + rw_exit(&hca->state_lock); + if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) { + rw_exit(&rib_stat->hcas_list_lock); + return (status); + } + } else + rw_exit(&hca->state_lock); + } + rw_exit(&rib_stat->hcas_list_lock); + + /* + * No existing connection found, establish a new connection. + */ bzero(&rpt, sizeof (rpcib_ping_t)); - status = rib_ping_srv(addr_type, svcaddr, &rpt); + status = rib_ping_srv(addr_type, d_svcaddr, &rpt); if (status != RDMA_SUCCESS) { return (RDMA_FAILED); } + hca = rpt.hca; + + if (rpt.srcip.family == AF_INET) { + s_addr_len = sizeof (rpt.srcip.un.ip4addr); + s_addr_buf = (char *)&rpt.srcip.un.ip4addr; + } else if (rpt.srcip.family == AF_INET6) { + s_addr_len = sizeof (rpt.srcip.un.ip6addr); + s_addr_buf = (char *)&rpt.srcip.un.ip6addr; + } else + return (RDMA_FAILED); /* * Channel to server doesn't exist yet, create one. */ - if (rib_clnt_create_chan(hca, svcaddr, &qp) != RDMA_SUCCESS) { + if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) { return (RDMA_FAILED); } cn = qptoc(qp); cn->c_state = C_CONN_PEND; cn->c_ref = 1; + cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP); + bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len); + cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len; + /* * Add to conn list. * We had given up the READER lock. 
In the time since then, @@ -4432,47 +4681,21 @@ rib_free_hca(rib_hca_t *hca) hca->hca_hdl = NULL; } -/* - * Cleans and closes up all uses of the HCA - */ + static void -rib_detach_hca(rib_hca_t *hca) +rib_stop_hca_services(rib_hca_t *hca) { - - /* - * Stop all services on the HCA - * Go through cl_conn_list and close all rc_channels - * Go through svr_conn_list and close all rc_channels - * Free connections whose c_ref has dropped to 0 - * Destroy all CQs - * Deregister and released all buffer pool memory after all - * connections are destroyed - * Free the protection domain - * ibt_close_hca() - */ - rw_enter(&hca->state_lock, RW_WRITER); - if (hca->state == HCA_DETACHED) { - rw_exit(&hca->state_lock); - return; - } - - hca->state = HCA_DETACHED; - rib_stat->nhca_inited--; - rib_stop_services(hca); rib_close_channels(&hca->cl_conn_list); rib_close_channels(&hca->srv_conn_list); - rib_mod.rdma_count--; - - rw_exit(&hca->state_lock); - rib_purge_connlist(&hca->cl_conn_list); rib_purge_connlist(&hca->srv_conn_list); - if (stats_enabled) { + if ((rib_stat->hcas_list == NULL) && stats_enabled) { kstat_delete_byname_zone("unix", 0, "rpcib_cache", GLOBAL_ZONEID); + stats_enabled = FALSE; } rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); @@ -4496,6 +4719,7 @@ rib_detach_hca(rib_hca_t *hca) rib_free_hca(hca); } + rw_destroy(&hca->bound_services_lock); if (hca->cleanup_helper != NULL) { ddi_taskq_destroy(hca->cleanup_helper); @@ -4503,6 +4727,48 @@ rib_detach_hca(rib_hca_t *hca) } } +/* + * Cleans and closes up all uses of the HCA + */ +static void +rib_detach_hca(rib_hca_t *hca) +{ + rib_hca_t **hcap; + + /* + * Stop all services on the HCA + * Go through cl_conn_list and close all rc_channels + * Go through svr_conn_list and close all rc_channels + * Free connections whose c_ref has dropped to 0 + * Destroy all CQs + * Deregister and released all buffer pool memory after all + * connections are destroyed + * Free the protection domain + * ibt_close_hca() + */ + rw_enter(&hca->state_lock, RW_WRITER); + if (hca->state == HCA_DETACHED) { + rw_exit(&hca->state_lock); + return; + } + + hca->state = HCA_DETACHED; + rw_enter(&rib_stat->hcas_list_lock, RW_WRITER); + for (hcap = &rib_stat->hcas_list; *hcap && (*hcap != hca); + hcap = &(*hcap)->next) + ; + ASSERT(*hcap == hca); + *hcap = hca->next; + rib_stat->nhca_inited--; + rib_mod.rdma_count--; + rw_exit(&rib_stat->hcas_list_lock); + rw_exit(&hca->state_lock); + + rib_stop_hca_services(hca); + + kmem_free(hca, sizeof (*hca)); +} + static void rib_server_side_cache_reclaim(void *argp) { @@ -4518,13 +4784,13 @@ rib_server_side_cache_reclaim(void *argp) while (rcas != NULL) { while (rcas->r.forw != &rcas->r) { rcas->elements--; - rib_total_buffers --; rb = rcas->r.forw; remque(rb); if (rb->registered) (void) rib_deregistermem_via_hca(hca, rb->lrc_buf, rb->lrc_mhandle); - cache_allocation -= rb->lrc_len; + + hca->cache_allocation -= rb->lrc_len; kmem_free(rb->lrc_buf, rb->lrc_len); kmem_free(rb, sizeof (rib_lrc_entry_t)); } @@ -4544,12 +4810,12 @@ rib_server_side_cache_cleanup(void *argp) rib_lrc_entry_t *rb; rib_hca_t *hca = (rib_hca_t *)argp; - rw_enter(&hca->avl_rw_lock, RW_READER); - if (cache_allocation < cache_limit) { - rw_exit(&hca->avl_rw_lock); + mutex_enter(&hca->cache_allocation_lock); + if (hca->cache_allocation < cache_limit) { + mutex_exit(&hca->cache_allocation_lock); return; } - rw_exit(&hca->avl_rw_lock); + mutex_exit(&hca->cache_allocation_lock); rw_enter(&hca->avl_rw_lock, RW_WRITER); rcas = avl_last(&hca->avl_tree); @@ -4559,13 
+4825,14 @@ rib_server_side_cache_cleanup(void *argp) while (rcas != NULL) { while (rcas->r.forw != &rcas->r) { rcas->elements--; - rib_total_buffers --; rb = rcas->r.forw; remque(rb); if (rb->registered) (void) rib_deregistermem_via_hca(hca, rb->lrc_buf, rb->lrc_mhandle); - cache_allocation -= rb->lrc_len; + + hca->cache_allocation -= rb->lrc_len; + kmem_free(rb->lrc_buf, rb->lrc_len); kmem_free(rb, sizeof (rib_lrc_entry_t)); } @@ -4573,7 +4840,8 @@ rib_server_side_cache_cleanup(void *argp) if (hca->server_side_cache) { kmem_cache_free(hca->server_side_cache, rcas); } - if ((cache_allocation) < cache_limit) { + + if (hca->cache_allocation < cache_limit) { rw_exit(&hca->avl_rw_lock); return; } @@ -4607,7 +4875,7 @@ rib_destroy_cache(rib_hca_t *hca) hca->server_side_cache = NULL; } avl_destroy(&hca->avl_tree); - mutex_destroy(&hca->cache_allocation); + mutex_destroy(&hca->cache_allocation_lock); rw_destroy(&hca->avl_rw_lock); } hca->avl_init = FALSE; @@ -4639,9 +4907,9 @@ rib_get_cache_buf(CONN *conn, uint32_t len) rw_enter(&hca->avl_rw_lock, RW_READER); - mutex_enter(&hca->cache_allocation); - c_alloc = cache_allocation; - mutex_exit(&hca->cache_allocation); + mutex_enter(&hca->cache_allocation_lock); + c_alloc = hca->cache_allocation; + mutex_exit(&hca->cache_allocation_lock); if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas, &where)) == NULL) { @@ -4649,7 +4917,9 @@ rib_get_cache_buf(CONN *conn, uint32_t len) if ((c_alloc + len) >= cache_limit) { rib_force_cleanup((void *)hca); rw_exit(&hca->avl_rw_lock); - cache_misses_above_the_limit ++; + mutex_enter(&hca->cache_allocation_lock); + hca->cache_misses_above_the_limit ++; + mutex_exit(&hca->cache_allocation_lock); /* Allocate and register the buffer directly */ goto error_alloc; @@ -4678,28 +4948,33 @@ rib_get_cache_buf(CONN *conn, uint32_t len) mutex_enter(&rcas->node_lock); if (rcas->r.forw != &rcas->r && rcas->elements > 0) { - rib_total_buffers--; - cache_hits++; reply_buf = rcas->r.forw; remque(reply_buf); rcas->elements--; mutex_exit(&rcas->node_lock); rw_exit(&hca->avl_rw_lock); - mutex_enter(&hca->cache_allocation); - cache_allocation -= len; - mutex_exit(&hca->cache_allocation); + + mutex_enter(&hca->cache_allocation_lock); + hca->cache_hits++; + hca->cache_allocation -= len; + mutex_exit(&hca->cache_allocation_lock); } else { /* Am I above the cache limit */ mutex_exit(&rcas->node_lock); if ((c_alloc + len) >= cache_limit) { rib_force_cleanup((void *)hca); rw_exit(&hca->avl_rw_lock); - cache_misses_above_the_limit ++; + + mutex_enter(&hca->cache_allocation_lock); + hca->cache_misses_above_the_limit++; + mutex_exit(&hca->cache_allocation_lock); /* Allocate and register the buffer directly */ goto error_alloc; } rw_exit(&hca->avl_rw_lock); - cache_misses ++; + mutex_enter(&hca->cache_allocation_lock); + hca->cache_misses++; + mutex_exit(&hca->cache_allocation_lock); /* Allocate a reply_buf entry */ reply_buf = (rib_lrc_entry_t *) kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP); @@ -4746,16 +5021,15 @@ rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf) rw_exit(&hca->avl_rw_lock); goto error_free; } else { - rib_total_buffers ++; cas.len = reg_buf->lrc_len; mutex_enter(&rcas->node_lock); insque(reg_buf, &rcas->r); rcas->elements ++; mutex_exit(&rcas->node_lock); rw_exit(&hca->avl_rw_lock); - mutex_enter(&hca->cache_allocation); - cache_allocation += cas.len; - mutex_exit(&hca->cache_allocation); + mutex_enter(&hca->cache_allocation_lock); + hca->cache_allocation += cas.len; + 
mutex_exit(&hca->cache_allocation_lock); } return; @@ -4956,20 +5230,28 @@ rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6) } /* ARGSUSED */ -static int rpcib_cache_kstat_update(kstat_t *ksp, int rw) { +static int +rpcib_cache_kstat_update(kstat_t *ksp, int rw) +{ + rib_hca_t *hca; if (KSTAT_WRITE == rw) { return (EACCES); } + rpcib_kstat.cache_limit.value.ui64 = (uint64_t)cache_limit; - rpcib_kstat.cache_allocation.value.ui64 = - (uint64_t)cache_allocation; - rpcib_kstat.cache_hits.value.ui64 = - (uint64_t)cache_hits; - rpcib_kstat.cache_misses.value.ui64 = - (uint64_t)cache_misses; - rpcib_kstat.cache_misses_above_the_limit.value.ui64 = - (uint64_t)cache_misses_above_the_limit; + rw_enter(&rib_stat->hcas_list_lock, RW_READER); + for (hca = rib_stat->hcas_list; hca; hca = hca->next) { + rpcib_kstat.cache_allocation.value.ui64 += + (uint64_t)hca->cache_allocation; + rpcib_kstat.cache_hits.value.ui64 += + (uint64_t)hca->cache_hits; + rpcib_kstat.cache_misses.value.ui64 += + (uint64_t)hca->cache_misses; + rpcib_kstat.cache_misses_above_the_limit.value.ui64 += + (uint64_t)hca->cache_misses_above_the_limit; + } + rw_exit(&rib_stat->hcas_list_lock); return (0); } |
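
One consequence of the per-HCA split is visible at the end of the diff: the buffer-cache counters (`cache_allocation`, `cache_hits`, `cache_misses`, and so on) move from file-scope globals into `rib_hca_t`, and `rpcib_cache_kstat_update()` now sums them across `hcas_list`. A simplified, self-contained version of that aggregation is sketched below; the struct and function names are illustrative only.

```c
#include <stdint.h>
#include <stddef.h>

typedef struct hca_stats {
	uint64_t		cache_allocation;
	uint64_t		cache_hits;
	uint64_t		cache_misses;
	struct hca_stats	*next;
} hca_stats_t;

typedef struct {
	uint64_t	cache_allocation;
	uint64_t	cache_hits;
	uint64_t	cache_misses;
} cache_totals_t;

/*
 * Sum the per-HCA cache counters into one set of totals, in the spirit
 * of the patched rpcib_cache_kstat_update() walking hcas_list.  The
 * caller holds the list lock; totals are zeroed first so each call
 * reports a snapshot of the current per-HCA values.
 */
static void
sum_cache_stats(const hca_stats_t *list, cache_totals_t *out)
{
	const hca_stats_t *hca;

	out->cache_allocation = 0;
	out->cache_hits = 0;
	out->cache_misses = 0;
	for (hca = list; hca != NULL; hca = hca->next) {
		out->cache_allocation += hca->cache_allocation;
		out->cache_hits += hca->cache_hits;
		out->cache_misses += hca->cache_misses;
	}
}
```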