diff options
author | Pavan Chandrashekar <Pavan.Chandrashekar@Sun.COM> | 2009-08-03 09:33:32 -0700 |
---|---|---|
committer | Pavan Chandrashekar <Pavan.Chandrashekar@Sun.COM> | 2009-08-03 09:33:32 -0700 |
commit | bd2ee4f4d736b3a98de7cb84206a8cd8d65ccdda (patch) | |
tree | 65b367f231cd6d9470f948a48554cd0e277c9daa | |
parent | 23524732d002da91177f82bdfa44378749661577 (diff) | |
download | illumos-gate-bd2ee4f4d736b3a98de7cb84206a8cd8d65ccdda.tar.gz |
6833247 IPonIB should create the MCG if it does not exist
6850975 Removal of a partition on the subnet manager is not reflected in dladm
-rw-r--r-- | usr/src/uts/common/io/ib/clients/ibd/ibd.c | 321 | ||||
-rw-r--r-- | usr/src/uts/common/sys/ib/clients/ibd/ibd.h | 1 |
2 files changed, 236 insertions, 86 deletions
diff --git a/usr/src/uts/common/io/ib/clients/ibd/ibd.c b/usr/src/uts/common/io/ib/clients/ibd/ibd.c index 21b6cf0c54..f2cd48606c 100644 --- a/usr/src/uts/common/io/ib/clients/ibd/ibd.c +++ b/usr/src/uts/common/io/ib/clients/ibd/ibd.c @@ -135,6 +135,7 @@ uint_t ibd_separate_cqs = 1; uint_t ibd_txcomp_poll = 0; uint_t ibd_rx_softintr = 1; uint_t ibd_tx_softintr = 1; +uint_t ibd_create_broadcast_group = 1; #ifdef IBD_LOGGING uint_t ibd_log_sz = 0x20000; #endif @@ -261,6 +262,7 @@ static uint_t ibd_rxcomp_usec = 10; #define IBD_RECV 1 #define IB_MGID_IPV4_LOWGRP_MASK 0xFFFFFFFF #define IBD_DEF_MAX_SDU 2044 +#define IBD_DEFAULT_QKEY 0xB1B #ifdef IBD_LOGGING #define IBD_DMAX_LINE 100 #endif @@ -420,6 +422,7 @@ static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int); static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *); static boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t); static void ibd_link_mod(ibd_state_t *, ibt_async_code_t); +static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *); /* * Helpers for attach/start routines @@ -1957,6 +1960,28 @@ ibd_async_link(ibd_state_t *state, ibd_req_t *req) } /* + * Check the pkey table to see if we can find the pkey we're looking for. + * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on + * failure. + */ +static int +ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey, + uint16_t *pkix) +{ + uint16_t ndx; + + ASSERT(pkix != NULL); + + for (ndx = 0; ndx < pkey_tbl_sz; ndx++) { + if (pkey_tbl[ndx] == pkey) { + *pkix = ndx; + return (0); + } + } + return (-1); +} + +/* * When the link is notified up, we need to do a few things, based * on the port's current p_init_type_reply claiming a reinit has been * done or not. The reinit steps are: @@ -1973,11 +1998,14 @@ ibd_async_link(ibd_state_t *state, ibd_req_t *req) static void ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) { - ibt_hca_portinfo_t *port_infop; + ibt_hca_portinfo_t *port_infop = NULL; ibt_status_t ibt_status; uint_t psize, port_infosz; ibd_link_op_t opcode; ibd_req_t *req; + link_state_t new_link_state = LINK_STATE_UP; + uint8_t itreply; + uint16_t pkix; /* * Do not send a request to the async daemon if it has not @@ -2002,85 +2030,117 @@ ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) return; } - if ((code == IBT_EVENT_PORT_UP) || (code == IBT_CLNT_REREG_EVENT) || - (code == IBT_PORT_CHANGE_EVENT)) { - uint8_t itreply; - boolean_t badup = B_FALSE; + /* + * If this routine was called in response to a port down event, + * we just need to see if this should be informed. + */ + if (code == IBT_ERROR_PORT_DOWN) { + new_link_state = LINK_STATE_DOWN; + goto update_link_state; + } - ibt_status = ibt_query_hca_ports(state->id_hca_hdl, - state->id_port, &port_infop, &psize, &port_infosz); - if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { - mutex_exit(&state->id_link_mutex); - DPRINT(10, "ibd_link_up : failed in" - " ibt_query_port()\n"); - return; - } + /* + * If it's not a port down event we've received, try to get the port + * attributes first. If we fail here, the port is as good as down. + * Otherwise, if the link went down by the time the handler gets + * here, give up - we cannot even validate the pkey/gid since those + * are not valid and this is as bad as a port down anyway. + */ + ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, + &port_infop, &psize, &port_infosz); + if ((ibt_status != IBT_SUCCESS) || (psize != 1) || + (port_infop->p_linkstate != IBT_PORT_ACTIVE)) { + new_link_state = LINK_STATE_DOWN; + goto update_link_state; + } + /* + * Check the SM InitTypeReply flags. If both NoLoadReply and + * PreserveContentReply are 0, we don't know anything about the + * data loaded into the port attributes, so we need to verify + * if gid0 and pkey are still valid. + */ + itreply = port_infop->p_init_type_reply; + if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && + ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) { /* - * If the link already went down by the time the handler gets - * here, give up; we can not even validate pkey/gid since those - * are not valid. + * Check to see if the subnet part of GID0 has changed. If + * not, check the simple case first to see if the pkey + * index is the same as before; finally check to see if the + * pkey has been relocated to a different index in the table. */ - if (port_infop->p_linkstate != IBT_PORT_ACTIVE) - badup = B_TRUE; + if (bcmp(port_infop->p_sgid_tbl, + &state->id_sgid, sizeof (ib_gid_t)) != 0) { - itreply = port_infop->p_init_type_reply; + new_link_state = LINK_STATE_DOWN; - /* - * In InitTypeReply, check if NoLoadReply == - * PreserveContentReply == 0, in which case, verify Pkey/GID0. - */ - if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && - ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0) && - (!badup)) { - /* - * Check that the subnet part of GID0 has not changed. - */ - if (bcmp(port_infop->p_sgid_tbl, &state->id_sgid, - sizeof (ib_gid_t)) != 0) - badup = B_TRUE; - - /* - * Check that Pkey/index mapping is still valid. - */ - if ((port_infop->p_pkey_tbl_sz <= state->id_pkix) || - (port_infop->p_pkey_tbl[state->id_pkix] != - state->id_pkey)) - badup = B_TRUE; - } + } else if (port_infop->p_pkey_tbl[state->id_pkix] == + state->id_pkey) { - /* - * In InitTypeReply, if PreservePresenceReply indicates the SM - * has ensured that the port's presence in mcg, traps etc is - * intact, nothing more to do. - */ - opcode = IBD_LINK_UP_ABSENT; - if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == - SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) - opcode = IBD_LINK_UP; + new_link_state = LINK_STATE_UP; - ibt_free_portinfo(port_infop, port_infosz); + } else if (ibd_locate_pkey(port_infop->p_pkey_tbl, + port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) { - if (badup) { - code = IBT_ERROR_PORT_DOWN; - } else if (code == IBT_PORT_CHANGE_EVENT) { + ibt_free_portinfo(port_infop, port_infosz); mutex_exit(&state->id_link_mutex); + + ibd_m_stop(state); + if ((ibt_status = ibd_m_start(state)) != IBT_SUCCESS) { + DPRINT(10, "link_mod: cannot " + "restart, ret=%d", ibt_status); + } return; + } else { + new_link_state = LINK_STATE_DOWN; } } +update_link_state: + if (port_infop) { + ibt_free_portinfo(port_infop, port_infosz); + } + + /* + * If the old state is the same as the new state, nothing to do + */ + if (state->id_link_state == new_link_state) { + mutex_exit(&state->id_link_mutex); + return; + } + + /* + * Ok, so there was a link state change; see if it's safe to ask + * the async thread to do the work + */ if (!ibd_async_safe(state)) { - state->id_link_state = (((code == IBT_EVENT_PORT_UP) || - (code == IBT_CLNT_REREG_EVENT)) ? LINK_STATE_UP : - LINK_STATE_DOWN); + state->id_link_state = new_link_state; mutex_exit(&state->id_link_mutex); return; } + mutex_exit(&state->id_link_mutex); - if (code == IBT_ERROR_PORT_DOWN) + /* + * If we're reporting a link up, check InitTypeReply to see if + * the SM has ensured that the port's presence in mcg, traps, + * etc. is intact. + */ + if (new_link_state == LINK_STATE_DOWN) { opcode = IBD_LINK_DOWN; + } else { + if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == + SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) { + opcode = IBD_LINK_UP; + } else { + opcode = IBD_LINK_UP_ABSENT; + } + } + /* + * Queue up a request for ibd_async_link() to handle this link + * state change event + */ req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); req->rq_ptr = (void *)opcode; ibd_queue_work_slot(state, req, IBD_ASYNC_LINK); @@ -2559,10 +2619,6 @@ ibd_state_init(ibd_state_t *state, dev_info_t *dip) state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0); -#ifdef IBD_LOGGING - mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL); -#endif - return (DDI_SUCCESS); } @@ -2586,10 +2642,6 @@ ibd_state_fini(ibd_state_t *state) cv_destroy(&state->id_trap_cv); mutex_destroy(&state->id_trap_lock); mutex_destroy(&state->id_link_mutex); - -#ifdef IBD_LOGGING - mutex_destroy(&ibd_lbuf_lock); -#endif } /* @@ -2964,8 +3016,9 @@ ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) * In case we are handling a mcg trap, we might not find * the mcg in the non list. */ - if (mce == NULL) + if (mce == NULL) { return; + } } else { mce = IBD_MCACHE_FIND_FULL(state, mgid); @@ -2978,8 +3031,9 @@ ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) */ if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) { if ((mce == NULL) || (mce->mc_jstate == - IB_MC_JSTATE_FULL)) + IB_MC_JSTATE_FULL)) { return; + } } else { ASSERT(jstate == IB_MC_JSTATE_FULL); @@ -2988,8 +3042,9 @@ ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) * This is because in GLDv3 driver, set multicast * will always return success. */ - if (mce == NULL) + if (mce == NULL) { return; + } mce->mc_fullreap = B_TRUE; } @@ -3034,7 +3089,12 @@ ibd_find_bgroup(ibd_state_t *state) IB_MC_SCOPE_GLOBAL }; int i, mcgmtu; boolean_t found = B_FALSE; + int ret; + ibt_mcg_info_t mcg_info; + state->id_bgroup_created = B_FALSE; + +query_bcast_grp: bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); mcg_attr.mc_pkey = state->id_pkey; state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK; @@ -3055,12 +3115,51 @@ ibd_find_bgroup(ibd_state_t *state) found = B_TRUE; break; } - } if (!found) { - ibd_print_warn(state, "IPoIB broadcast group absent"); - return (IBT_FAILURE); + if (ibd_create_broadcast_group) { + /* + * If we created the broadcast group, but failed to + * find it, we can't do anything except leave the + * one we created and return failure. + */ + if (state->id_bgroup_created) { + ibd_print_warn(state, "IPoIB broadcast group " + "absent. Unable to query after create."); + goto find_bgroup_fail; + } + + /* + * Create the ipoib broadcast group if it didn't exist + */ + bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); + mcg_attr.mc_qkey = IBD_DEFAULT_QKEY; + mcg_attr.mc_join_state = IB_MC_JSTATE_FULL; + mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL; + mcg_attr.mc_pkey = state->id_pkey; + mcg_attr.mc_flow = 0; + mcg_attr.mc_sl = 0; + mcg_attr.mc_tclass = 0; + state->id_mgid.gid_prefix = + (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | + ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) | + ((uint32_t)(state->id_pkey << 16))); + mcg_attr.mc_mgid = state->id_mgid; + + if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr, + &mcg_info, NULL, NULL)) != IBT_SUCCESS) { + ibd_print_warn(state, "IPoIB broadcast group " + "absent, create failed: ret = %d\n", ret); + state->id_bgroup_created = B_FALSE; + return (IBT_FAILURE); + } + state->id_bgroup_created = B_TRUE; + goto query_bcast_grp; + } else { + ibd_print_warn(state, "IPoIB broadcast group absent"); + return (IBT_FAILURE); + } } /* @@ -3071,11 +3170,21 @@ ibd_find_bgroup(ibd_state_t *state) ibd_print_warn(state, "IPoIB broadcast group MTU %d " "greater than port's maximum MTU %d", mcgmtu, state->id_mtu); - return (IBT_FAILURE); + ibt_free_mcg_info(state->id_mcinfo, 1); + goto find_bgroup_fail; } state->id_mtu = mcgmtu; return (IBT_SUCCESS); + +find_bgroup_fail: + if (state->id_bgroup_created) { + (void) ibt_leave_mcg(state->id_sgid, + mcg_info.mc_adds_vect.av_dgid, state->id_sgid, + IB_MC_JSTATE_FULL); + } + + return (IBT_FAILURE); } static int @@ -4185,7 +4294,9 @@ ibd_undo_m_start(ibd_state_t *state) * to be returned and give up after 5 seconds. */ if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) { + ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); + attempts = 50; while (state->id_rx_list.dl_bufs_outstanding > 0) { delay(drv_usectohz(100000)); @@ -4229,8 +4340,11 @@ ibd_undo_m_start(ibd_state_t *state) * This call is guaranteed to return successfully for * UD QPNs. */ - ret = ibt_flush_channel(state->id_chnl_hdl); - ASSERT(ret == IBT_SUCCESS); + if ((ret = ibt_flush_channel(state->id_chnl_hdl)) != + IBT_SUCCESS) { + DPRINT(10, "undo_m_start: flush_channel " + "failed, ret=%d", ret); + } /* * Turn off Tx interrupts and poll. By the time the polling @@ -4239,8 +4353,9 @@ ibd_undo_m_start(ibd_state_t *state) * ibt_set_cq_handler() returns, the old handler is * guaranteed not to be invoked anymore. */ - if (ibd_separate_cqs == 1) + if (ibd_separate_cqs == 1) { ibt_set_cq_handler(state->id_scq_hdl, 0, 0); + } ibd_poll_compq(state, state->id_scq_hdl); state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED); @@ -4301,7 +4416,12 @@ ibd_undo_m_start(ibd_state_t *state) } if (progress & IBD_DRV_UD_CHANNEL_SETUP) { - (void) ibt_free_channel(state->id_chnl_hdl); + if ((ret = ibt_free_channel(state->id_chnl_hdl)) != + IBT_SUCCESS) { + DPRINT(10, "undo_m_start: free_channel " + "failed, ret=%d", ret); + } + state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP); } @@ -4309,12 +4429,19 @@ ibd_undo_m_start(ibd_state_t *state) if (ibd_separate_cqs == 1) { kmem_free(state->id_txwcs, sizeof (ibt_wc_t) * state->id_txwcs_size); - (void) ibt_free_cq(state->id_scq_hdl); + if ((ret = ibt_free_cq(state->id_scq_hdl)) != + IBT_SUCCESS) { + DPRINT(10, "undo_m_start: free_cq(scq) " + "failed, ret=%d", ret); + } } kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * state->id_rxwcs_size); - (void) ibt_free_cq(state->id_rcq_hdl); + if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) { + DPRINT(10, "undo_m_start: free_cq(rcq) failed, " + "ret=%d", ret); + } state->id_txwcs = NULL; state->id_rxwcs = NULL; @@ -4332,7 +4459,18 @@ ibd_undo_m_start(ibd_state_t *state) } if (progress & IBD_DRV_BCAST_GROUP_FOUND) { + /* + * If we'd created the ipoib broadcast group and had + * successfully joined it, leave it now + */ + if (state->id_bgroup_created) { + mgid = state->id_mcinfo->mc_adds_vect.av_dgid; + jstate = IB_MC_JSTATE_FULL; + (void) ibt_leave_mcg(state->id_sgid, mgid, + state->id_sgid, jstate); + } ibt_free_mcg_info(state->id_mcinfo, 1); + state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND); } @@ -4349,6 +4487,7 @@ ibd_m_start(void *arg) ibd_state_t *state = arg; kthread_t *kht; int err; + ibt_status_t ret; if (state->id_mac_state & IBD_DRV_STARTED) return (DDI_SUCCESS); @@ -4419,10 +4558,10 @@ ibd_m_start(void *arg) */ if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) { ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); - if (ibt_enable_cq_notify(state->id_scq_hdl, - IBT_NEXT_COMPLETION) != IBT_SUCCESS) { - DPRINT(10, - "ibd_m_start: ibt_enable_cq_notify(scq) failed"); + if ((ret = ibt_enable_cq_notify(state->id_scq_hdl, + IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { + DPRINT(10, "ibd_m_start: ibt_enable_cq_notify(scq) " + "failed, ret=%d", ret); err = EINVAL; goto m_start_fail; } @@ -4476,9 +4615,10 @@ ibd_m_start(void *arg) * Setup the receive cq handler */ ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); - if (ibt_enable_cq_notify(state->id_rcq_hdl, - IBT_NEXT_COMPLETION) != IBT_SUCCESS) { - DPRINT(10, "ibd_m_start: ibt_enable_cq_notify(rcq) failed"); + if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl, + IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { + DPRINT(10, "ibd_m_start: ibt_enable_cq_notify(rcq) " + "failed, ret=%d", ret); err = EINVAL; goto m_start_fail; } @@ -5597,6 +5737,11 @@ ibd_m_tx(void *arg, mblk_t *mp) ibd_state_t *state = (ibd_state_t *)arg; mblk_t *next; + if (state->id_link_state != LINK_STATE_UP) { + freemsgchain(mp); + mp = NULL; + } + while (mp != NULL) { next = mp->b_next; mp->b_next = NULL; @@ -6161,6 +6306,8 @@ ibd_log_init(void) { ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP); ibd_lbuf_ndx = 0; + + mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL); } static void @@ -6170,6 +6317,8 @@ ibd_log_fini(void) kmem_free(ibd_lbuf, IBD_LOG_SZ); ibd_lbuf_ndx = 0; ibd_lbuf = NULL; + + mutex_destroy(&ibd_lbuf_lock); } static void diff --git a/usr/src/uts/common/sys/ib/clients/ibd/ibd.h b/usr/src/uts/common/sys/ib/clients/ibd/ibd.h index 9dad155de1..f3e819ee68 100644 --- a/usr/src/uts/common/sys/ib/clients/ibd/ibd.h +++ b/usr/src/uts/common/sys/ib/clients/ibd/ibd.h @@ -297,6 +297,7 @@ typedef struct ibd_state_s { uint16_t id_pkix; uint8_t id_port; ibt_mcg_info_t *id_mcinfo; + boolean_t id_bgroup_created; mac_handle_t id_mh; mac_resource_handle_t id_rh; |