| author | Bill Taylor <Bill.Taylor@Sun.COM> | 2009-10-23 15:06:39 -0700 |
| --- | --- | --- |
| committer | Bill Taylor <Bill.Taylor@Sun.COM> | 2009-10-23 15:06:39 -0700 |
| commit | 71be8d8f808a6f8b1a1bbb502fb01c7ccdb8512d (patch) | |
| tree | 0eb9d362434de933d1f1e6975888aa54ebdcfde5 /usr/src | |
| parent | 35494a3df11fa3df46e5b76d62ec74e60a048a1e (diff) | |
| download | illumos-gate-71be8d8f808a6f8b1a1bbb502fb01c7ccdb8512d.tar.gz | |
6858031 ibd: one receive memory region is required to fix performance and scaling problems
6884097 ibt_map_mem_iov() fails (hermon reverses SLEEP and NOSLEEP)
6886372 hermon should support 4K IB MTU
6894485 ibd is not lint clean
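The 6884097 fix comes down to an inverted flag test in hermon_ci_map_mem_iov(): the old code selected blocking allocations exactly when the caller passed IBT_IOV_NOSLEEP, and non-blocking ones otherwise. Below is a minimal sketch of the corrected selection; only the test and the blocking branch are visible in the hermon_ci.c hunk further down, so the KM_NOSLEEP/DDI_DMA_DONTWAIT values in the else branch are an assumption based on the usual DDI pairing.

```c
/*
 * Sketch of the corrected flag selection (bug 6884097).  The test and
 * the blocking branch mirror the hermon_ci.c hunk below; the
 * non-blocking branch assumes the conventional DDI counterparts.
 */
if ((iov_attr->iov_flags & IBT_IOV_NOSLEEP) == 0) {
	/* caller may block: pick sleeping kmem and DMA allocations */
	kmflag = KM_SLEEP;
	callback = DDI_DMA_SLEEP;
} else {
	/* caller asked for no-sleep semantics */
	kmflag = KM_NOSLEEP;
	callback = DDI_DMA_DONTWAIT;
}
```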
Diffstat (limited to 'usr/src')
| mode | path | lines changed |
| --- | --- | --- |
| -rw-r--r-- | usr/src/uts/common/io/ib/adapters/hermon/hermon.c | 8 |
| -rw-r--r-- | usr/src/uts/common/io/ib/adapters/hermon/hermon_ci.c | 20 |
| -rw-r--r-- | usr/src/uts/common/io/ib/clients/ibd/ibd.c | 1733 |
| -rw-r--r-- | usr/src/uts/common/sys/ib/adapters/hermon/hermon_hw.h | 20 |
| -rw-r--r-- | usr/src/uts/common/sys/ib/clients/ibd/ibd.h | 83 |
5 files changed, 1001 insertions, 863 deletions
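Of the hunks that follow, the first one in hermon.c implements the 4K MTU support (6886372) by programming mtu_cap during port init and cutting vl_cap down to vl0-vl3 to go with it. IB expresses MTU as a small enum rather than a byte count; the ibd.c side converts it with `128 << enum`, so the value 5 written here corresponds to 4096 bytes. A standalone illustration of that encoding (userland, for demonstration only):

```c
/*
 * Illustration only: the mtu_cap field is an IB MTU enum, not a byte
 * count.  ibd.c converts the enum with (128 << enum), so enum 5, the
 * value programmed in the hermon.c hunk below, means 4096 bytes.
 */
#include <stdio.h>

int
main(void)
{
	unsigned int mtu_enum;

	for (mtu_enum = 1; mtu_enum <= 5; mtu_enum++)
		printf("mtu_cap %u -> %u bytes\n", mtu_enum, 128u << mtu_enum);
	return (0);
}
```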
diff --git a/usr/src/uts/common/io/ib/adapters/hermon/hermon.c b/usr/src/uts/common/io/ib/adapters/hermon/hermon.c index a5ccb57ac7..6026386473 100644 --- a/usr/src/uts/common/io/ib/adapters/hermon/hermon.c +++ b/usr/src/uts/common/io/ib/adapters/hermon/hermon.c @@ -3374,6 +3374,10 @@ hermon_hca_port_init(hermon_state_t *state) goto init_ports_fail; } + /* Set mtu_cap to 4096 bytes */ + initport->mmc = 1; /* set the change bit */ + initport->mtu_cap = 5; /* for 4096 bytes */ + /* Validate the max port width */ maxval = state->hs_queryport.ib_port_wid; val = cfgprof->cp_max_port_width; @@ -3388,6 +3392,10 @@ hermon_hca_port_init(hermon_state_t *state) goto init_ports_fail; } + /* Since we're doing mtu_cap, cut vl_cap down */ + initport->mvc = 1; /* set this change bit */ + initport->vl_cap = 3; /* 3 means vl0-vl3, 4 total */ + /* Validate max GID table size */ maxval = ((uint64_t)1 << state->hs_queryport.log_max_gid); val = ((uint64_t)1 << cfgprof->cp_log_max_gidtbl); diff --git a/usr/src/uts/common/io/ib/adapters/hermon/hermon_ci.c b/usr/src/uts/common/io/ib/adapters/hermon/hermon_ci.c index 20f9aa7403..b5284feba2 100644 --- a/usr/src/uts/common/io/ib/adapters/hermon/hermon_ci.c +++ b/usr/src/uts/common/io/ib/adapters/hermon/hermon_ci.c @@ -2399,7 +2399,7 @@ hermon_ci_map_mem_iov(ibc_hca_hdl_t hca, ibt_iov_attr_t *iov_attr, ibt_all_wr_t *wr, ibc_mi_hdl_t *mi_hdl_p) { int status; - int i, nds, max_nds; + int i, j, nds, max_nds; uint_t len; ibt_status_t ibt_status; ddi_dma_handle_t dmahdl; @@ -2431,7 +2431,7 @@ hermon_ci_map_mem_iov(ibc_hca_hdl_t hca, ibt_iov_attr_t *iov_attr, max_nds -= (iov_attr->iov_lso_hdr_sz + sizeof (uint32_t) + 0xf) >> 4; /* 0xf is for rounding up to a multiple of 16 */ rsvd_lkey = state->hs_devlim.rsv_lkey; - if (iov_attr->iov_flags & IBT_IOV_NOSLEEP) { + if ((iov_attr->iov_flags & IBT_IOV_NOSLEEP) == 0) { kmflag = KM_SLEEP; callback = DDI_DMA_SLEEP; } else { @@ -2490,11 +2490,19 @@ hermon_ci_map_mem_iov(ibc_hca_hdl_t hca, ibt_iov_attr_t *iov_attr, _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*sgl)) len = iov_attr->iov_list_len; + for (i = 0, j = 0; j < len; j++) { + if (iov_attr->iov[j].iov_len == 0) + continue; + i++; + } mi_hdl = kmem_alloc(sizeof (*mi_hdl) + - (len - 1) * sizeof (ddi_dma_handle_t), kmflag); + (i - 1) * sizeof (ddi_dma_handle_t), kmflag); if (mi_hdl == NULL) return (IBT_INSUFF_RESOURCE); - for (i = 0; i < len; i++) { + mi_hdl->imh_len = i; + for (i = 0, j = 0; j < len; j++) { + if (iov_attr->iov[j].iov_len == 0) + continue; status = ddi_dma_alloc_handle(state->hs_dip, &dma_attr, callback, NULL, &dmahdl); if (status != DDI_SUCCESS) { @@ -2502,7 +2510,7 @@ hermon_ci_map_mem_iov(ibc_hca_hdl_t hca, ibt_iov_attr_t *iov_attr, goto fail2; } status = ddi_dma_addr_bind_handle(dmahdl, iov_attr->iov_as, - iov_attr->iov[i].iov_addr, iov_attr->iov[i].iov_len, + iov_attr->iov[j].iov_addr, iov_attr->iov[j].iov_len, DDI_DMA_RDWR | DDI_DMA_CONSISTENT, callback, NULL, &dmacookie, &cookie_cnt); if (status != DDI_DMA_MAPPED) { @@ -2522,13 +2530,13 @@ hermon_ci_map_mem_iov(ibc_hca_hdl_t hca, ibt_iov_attr_t *iov_attr, ddi_dma_nextcookie(dmahdl, &dmacookie); } mi_hdl->imh_dmahandle[i] = dmahdl; + i++; } if (iov_attr->iov_flags & IBT_IOV_RECV) wr->recv.wr_nds = nds; else wr->send.wr_nds = nds; - mi_hdl->imh_len = len; *mi_hdl_p = mi_hdl; return (IBT_SUCCESS); diff --git a/usr/src/uts/common/io/ib/clients/ibd/ibd.c b/usr/src/uts/common/io/ib/clients/ibd/ibd.c index b181d97af8..6e26e1a82f 100644 --- a/usr/src/uts/common/io/ib/clients/ibd/ibd.c +++ 
b/usr/src/uts/common/io/ib/clients/ibd/ibd.c @@ -64,7 +64,7 @@ #include <sys/ib/mgt/ibmf/ibmf.h> /* for ibd_get_portspeed */ /* - * Per-interface tunables + * Per-interface tunables (for developers) * * ibd_tx_copy_thresh * This sets the threshold at which ibd will attempt to do a bcopy of the @@ -102,17 +102,6 @@ * ibd_hash_size * Hash table size for the active AH list * - * ibd_separate_cqs - * ibd_txcomp_poll - * These boolean variables (1 or 0) may be used to tune the behavior of - * ibd in managing the send and receive completion queues and in deciding - * whether or not transmit completions should be polled or interrupt - * driven (when the completion queues are separate). If both the completion - * queues are interrupt driven, it may not be possible for the handlers to - * be invoked concurrently, depending on how the interrupts are tied on - * the PCI intr line. Note that some combination of these two parameters - * may not be meaningful (and therefore not allowed). - * * ibd_tx_softintr * ibd_rx_softintr * The softintr mechanism allows ibd to avoid event queue overflows if @@ -130,8 +119,6 @@ uint_t ibd_num_rwqe = 4000; uint_t ibd_num_lso_bufs = 0x400; uint_t ibd_num_ah = 64; uint_t ibd_hash_size = 32; -uint_t ibd_separate_cqs = 1; -uint_t ibd_txcomp_poll = 0; uint_t ibd_rx_softintr = 1; uint_t ibd_tx_softintr = 1; uint_t ibd_create_broadcast_group = 1; @@ -151,16 +138,16 @@ uint_t ibd_log_sz = 0x20000; #endif /* - * Receive CQ moderation parameters: NOT tunables + * Receive CQ moderation parameters: tunable (for developers) */ -static uint_t ibd_rxcomp_count = 4; -static uint_t ibd_rxcomp_usec = 10; +uint_t ibd_rxcomp_count = 4; +uint_t ibd_rxcomp_usec = 10; /* - * Send CQ moderation parameters: NOT tunables + * Send CQ moderation parameters: tunable (for developers) */ -#define IBD_TXCOMP_COUNT 10 -#define IBD_TXCOMP_USEC 300 +uint_t ibd_txcomp_count = 16; +uint_t ibd_txcomp_usec = 300; /* * Thresholds @@ -176,13 +163,23 @@ static uint_t ibd_rxcomp_usec = 10; #define IBD_TX_POLL_THRESH 80 /* - * When doing multiple-send-wr or multiple-recv-wr posts, this value - * determines how many to do at a time (in a single ibt_post_send/recv). + * When doing multiple-send-wr, this value determines how many to do at + * a time (in a single ibt_post_send). */ -#define IBD_MAX_POST_MULTIPLE 4 +#define IBD_MAX_TX_POST_MULTIPLE 4 + +/* Post IBD_RX_POST_CNT receive work requests at a time. */ +#define IBD_RX_POST_CNT 16 + +/* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */ +#define IBD_LOG_RX_POST 3 + +/* Minimum number of receive work requests driver needs to always have */ +#define IBD_RWQE_MIN ((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4) /* - * Maximum length for returning chained mps back to crossbow + * Maximum length for returning chained mps back to crossbow. + * Also used as the maximum number of rx wc's polled at a time. 
*/ #define IBD_MAX_RX_MP_LEN 16 @@ -196,10 +193,8 @@ static uint_t ibd_rxcomp_usec = 10; /* * Completion queue polling control */ -#define IBD_RX_CQ_POLLING 0x1 -#define IBD_TX_CQ_POLLING 0x2 -#define IBD_REDO_RX_CQ_POLLING 0x4 -#define IBD_REDO_TX_CQ_POLLING 0x8 +#define IBD_CQ_POLLING 0x1 +#define IBD_REDO_CQ_POLLING 0x2 /* * Flag bits for resources to reap @@ -337,6 +332,7 @@ static void ibd_state_fini(ibd_state_t *); static void ibd_fini_txlist(ibd_state_t *); static void ibd_fini_rxlist(ibd_state_t *); static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *); +static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *); static void ibd_acache_fini(ibd_state_t *); #ifdef IBD_LOGGING static void ibd_log_fini(void); @@ -345,23 +341,21 @@ static void ibd_log_fini(void); /* * Allocation/acquire/map routines */ -static int ibd_alloc_swqe(ibd_state_t *, ibd_swqe_t **, int, ibt_lkey_t); -static int ibd_alloc_rwqe(ibd_state_t *, ibd_rwqe_t **); static int ibd_alloc_tx_copybufs(ibd_state_t *); +static int ibd_alloc_rx_copybufs(ibd_state_t *); static int ibd_alloc_tx_lsobufs(ibd_state_t *); -static int ibd_acquire_swqe(ibd_state_t *, ibd_swqe_t **); +static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *); static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *, uint32_t *); /* * Free/release/unmap routines */ -static void ibd_free_swqe(ibd_state_t *, ibd_swqe_t *); static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *); -static void ibd_delete_rwqe(ibd_state_t *, ibd_rwqe_t *); static void ibd_free_tx_copybufs(ibd_state_t *); +static void ibd_free_rx_copybufs(ibd_state_t *); static void ibd_free_tx_lsobufs(ibd_state_t *); -static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *); +static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int); static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t); static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *); static void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *); @@ -369,12 +363,14 @@ static void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *); /* * Handlers/callback routines */ -static uint_t ibd_intr(char *); -static uint_t ibd_tx_recycle(char *); +static uint_t ibd_intr(caddr_t); +static uint_t ibd_tx_recycle(caddr_t); static void ibd_rcq_handler(ibt_cq_hdl_t, void *); static void ibd_scq_handler(ibt_cq_hdl_t, void *); -static void ibd_poll_compq(ibd_state_t *, ibt_cq_hdl_t); -static uint_t ibd_drain_cq(ibd_state_t *, ibt_cq_hdl_t, ibt_wc_t *, uint_t); +static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t); +static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t); +static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t); +static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t); static void ibd_freemsg_cb(char *); static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *); @@ -386,9 +382,8 @@ static void ibd_snet_notices_handler(void *, ib_gid_t, */ static boolean_t ibd_send(ibd_state_t *, mblk_t *); static void ibd_post_send(ibd_state_t *, ibd_swqe_t *); -static int ibd_post_recv(ibd_state_t *, ibd_rwqe_t *, boolean_t); -static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *); -static void ibd_flush_rx(ibd_state_t *, mblk_t *); +static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *); +static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *); /* * Threads @@ -428,6 +423,7 @@ static ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int); static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int); static ibd_ace_t 
*ibd_acache_get_unref(ibd_state_t *); static boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t); +static void ibd_dec_ref_ace(ibd_state_t *, ibd_ace_t *); static void ibd_link_mod(ibd_state_t *, ibt_async_code_t); static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *); @@ -451,7 +447,7 @@ static void ibd_clr_mac_progress(ibd_state_t *, uint_t); */ static int ibd_sched_poll(ibd_state_t *, int, int); static void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int); -static int ibd_resume_transmission(ibd_state_t *); +static void ibd_resume_transmission(ibd_state_t *); static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t); static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t); static void *list_get_head(list_t *); @@ -542,7 +538,7 @@ debug_print(int l, char *fmt, ...) } #define DPRINT debug_print #else -#define DPRINT +#define DPRINT 0 && #endif /* @@ -584,13 +580,14 @@ ibd_print_warn(ibd_state_t *state, char *fmt, ...) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock, ibd_state_t::id_lso)) _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso)) +_NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy)) _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree)) /* - * id_cq_poll_lock + * id_scq_poll_lock */ -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_cq_poll_lock, - ibd_state_t::id_cq_poll_busy)) +_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock, + ibd_state_t::id_scq_poll_busy)) /* * id_txpost_lock @@ -599,18 +596,6 @@ _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, ibd_state_t::id_tx_head)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, ibd_state_t::id_tx_busy)) -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, - ibd_state_t::id_tx_tailp)) - -/* - * id_rxpost_lock - */ -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock, - ibd_state_t::id_rx_head)) -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock, - ibd_state_t::id_rx_busy)) -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock, - ibd_state_t::id_rx_tailp)) /* * id_acache_req_lock @@ -619,6 +604,8 @@ _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, ibd_state_t::id_acache_req_cv)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, ibd_state_t::id_req_list)) +_NOTE(SCHEME_PROTECTS_DATA("atomic", + ibd_acache_s::ac_ref)) /* * id_ac_mutex @@ -640,6 +627,8 @@ _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this", ibd_state_t::id_ah_op)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, ibd_state_t::id_ah_error)) +_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, + ibd_state_t::id_ac_hot_ace)) _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error)) /* @@ -680,26 +669,21 @@ _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state)) _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start", ibd_state_t::id_link_speed)) +_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid)) /* * id_tx_list.dl_mutex */ -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, +_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, ibd_state_t::id_tx_list.dl_head)) -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, - ibd_state_t::id_tx_list.dl_tail)) -_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", +_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, ibd_state_t::id_tx_list.dl_pending_sends)) -_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", +_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 
ibd_state_t::id_tx_list.dl_cnt)) /* * id_rx_list.dl_mutex */ -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex, - ibd_state_t::id_rx_list.dl_head)) -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex, - ibd_state_t::id_rx_list.dl_tail)) _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", ibd_state_t::id_rx_list.dl_bufs_outstanding)) _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", @@ -743,24 +727,39 @@ _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded", mac_capab_lso_s msgb::b_next msgb::b_rptr - msgb::b_wptr)) + msgb::b_wptr + ibd_state_s::id_bgroup_created + ibd_state_s::id_mac_state + ibd_state_s::id_mtu + ibd_state_s::id_num_rwqe + ibd_state_s::id_num_swqe + ibd_state_s::id_qpnum + ibd_state_s::id_rcq_hdl + ibd_state_s::id_rx_buf_sz + ibd_state_s::id_rx_bufs + ibd_state_s::id_rx_mr_hdl + ibd_state_s::id_rx_wqes + ibd_state_s::id_rxwcs + ibd_state_s::id_rxwcs_size + ibd_state_s::id_rx_nqueues + ibd_state_s::id_rx_queues + ibd_state_s::id_scope + ibd_state_s::id_scq_hdl + ibd_state_s::id_tx_buf_sz + ibd_state_s::id_tx_bufs + ibd_state_s::id_tx_mr_hdl + ibd_state_s::id_tx_rel_list.dl_cnt + ibd_state_s::id_tx_wqes + ibd_state_s::id_txwcs + ibd_state_s::id_txwcs_size)) int _init() { int status; - /* - * Sanity check some parameter settings. Tx completion polling - * only makes sense with separate CQs for Tx and Rx. - */ - if ((ibd_txcomp_poll == 1) && (ibd_separate_cqs == 0)) { - cmn_err(CE_NOTE, "!ibd: %s", - "Setting ibd_txcomp_poll = 0 for combined CQ"); - ibd_txcomp_poll = 0; - } - - status = ddi_soft_state_init(&ibd_list, sizeof (ibd_state_t), 0); + status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t), + PAGESIZE), 0); if (status != 0) { DPRINT(10, "_init:failed in ddi_soft_state_init()"); return (status); @@ -957,9 +956,12 @@ ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac, _ret_ = mod_hash_insert(state->id_ah_active_hash, \ (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ ASSERT(_ret_ == 0); \ + state->id_ac_hot_ace = ce; \ } #define IBD_ACACHE_PULLOUT_ACTIVE(state, ce) { \ list_remove(&state->id_ah_active, ce); \ + if (state->id_ac_hot_ace == ce) \ + state->id_ac_hot_ace = NULL; \ (void) mod_hash_remove(state->id_ah_active_hash, \ (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ } @@ -982,7 +984,7 @@ ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac, * of membership must be present before initiating the transmit. * This list is also emptied during driver detach, since sendonly * membership acquired during transmit is dropped at detach time - * alongwith ipv4 broadcast full membership. Insert/deletes to + * along with ipv4 broadcast full membership. Insert/deletes to * this list are done only by the async thread, but it is also * searched in program context (see multicast disable case), thus * the id_mc_mutex protects the list. The driver detach path also @@ -1094,7 +1096,7 @@ ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac, * trap delivery. Querying the SA to establish presence/absence of the * mcg is also racy at best. Thus, the driver just prints a warning * message when it can not rejoin after receiving a create trap, although - * this might be (on rare occassions) a mis-warning if the create trap is + * this might be (on rare occasions) a mis-warning if the create trap is * received after the mcg was deleted. 
*/ @@ -1353,6 +1355,7 @@ ibd_acache_init(ibd_state_t *state) mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL); mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL); + mutex_enter(&state->id_ac_mutex); list_create(&state->id_ah_free, sizeof (ibd_ace_t), offsetof(ibd_ace_t, ac_list)); list_create(&state->id_ah_active, sizeof (ibd_ace_t), @@ -1366,12 +1369,14 @@ ibd_acache_init(ibd_state_t *state) offsetof(ibd_mce_t, mc_list)); list_create(&state->id_req_list, sizeof (ibd_req_t), offsetof(ibd_req_t, rq_list)); + state->id_ac_hot_ace = NULL; state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) * IBD_NUM_AH, KM_SLEEP); for (i = 0; i < IBD_NUM_AH; i++, ce++) { if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS, state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) { + mutex_exit(&state->id_ac_mutex); ibd_acache_fini(state); return (DDI_FAILURE); } else { @@ -1380,6 +1385,7 @@ ibd_acache_init(ibd_state_t *state) IBD_ACACHE_INSERT_FREE(state, ce); } } + mutex_exit(&state->id_ac_mutex); return (DDI_SUCCESS); } @@ -1463,7 +1469,14 @@ ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) mutex_enter(&state->id_ac_mutex); - if ((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL) { + if (((ptr = state->id_ac_hot_ace) != NULL) && + (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) { + INC_REF(ptr, numwqe); + mutex_exit(&state->id_ac_mutex); + return (ptr); + } + if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) { + state->id_ac_hot_ace = ptr; mutex_exit(&state->id_ac_mutex); return (ptr); } @@ -1869,7 +1882,7 @@ ibd_async_link(ibd_state_t *state, ibd_req_t *req) * this on a link down, since we will be unable to do SA operations, * defaulting to the lowest speed. Also notice that we update our * notion of speed before calling mac_link_update(), which will do - * neccesary higher level notifications for speed changes. + * necessary higher level notifications for speed changes. */ if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) { _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) @@ -2074,6 +2087,7 @@ ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) * index is the same as before; finally check to see if the * pkey has been relocated to a different index in the table. */ + _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) if (bcmp(port_infop->p_sgid_tbl, &state->id_sgid, sizeof (ib_gid_t)) != 0) { @@ -2098,7 +2112,7 @@ ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) * marked both the start and stop 'in-progress' flags, * so it is ok to go ahead and do this restart. */ - ibd_undo_start(state, LINK_STATE_DOWN); + (void) ibd_undo_start(state, LINK_STATE_DOWN); if ((ret = ibd_start(state)) != 0) { DPRINT(10, "ibd_restart: cannot restart, " "ret=%d", ret); @@ -2108,6 +2122,7 @@ ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) } else { new_link_state = LINK_STATE_DOWN; } + _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) } update_link_state: @@ -2284,6 +2299,8 @@ ibd_record_capab(ibd_state_t *state, dev_info_t *dip) ibt_hca_attr_t hca_attrs; ibt_status_t ibt_status; + _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) + /* * Query the HCA and fetch its attributes */ @@ -2344,6 +2361,14 @@ ibd_record_capab(ibd_state_t *state, dev_info_t *dip) } /* + * Translating the virtual address regions into physical regions + * for using the Reserved LKey feature results in a wr sgl that + * is a little longer. 
Since failing ibt_map_mem_iov() is costly, + * we'll fix a high-water mark (65%) for when we should stop. + */ + state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100; + + /* * 5. Set number of recv and send wqes after checking hca maximum * channel size */ @@ -2352,11 +2377,13 @@ ibd_record_capab(ibd_state_t *state, dev_info_t *dip) } else { state->id_num_rwqe = IBD_NUM_RWQE; } + state->id_rx_bufs_outstanding_limit = state->id_num_rwqe - IBD_RWQE_MIN; if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) { state->id_num_swqe = hca_attrs.hca_max_chan_sz; } else { state->id_num_swqe = IBD_NUM_SWQE; } + _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) return (DDI_SUCCESS); } @@ -2563,6 +2590,7 @@ ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) attach_fail: (void) ibd_unattach(state, dip); + _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) return (DDI_FAILURE); } @@ -2613,26 +2641,32 @@ ibd_state_init(ibd_state_t *state, dev_info_t *dip) state->id_trap_stop = B_TRUE; state->id_trap_inprog = 0; - mutex_init(&state->id_cq_poll_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL); state->id_dip = dip; mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); + mutex_enter(&state->id_tx_list.dl_mutex); state->id_tx_list.dl_head = NULL; - state->id_tx_list.dl_tail = NULL; state->id_tx_list.dl_pending_sends = B_FALSE; state->id_tx_list.dl_cnt = 0; - mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); + mutex_exit(&state->id_tx_list.dl_mutex); + mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); + mutex_enter(&state->id_tx_rel_list.dl_mutex); + state->id_tx_rel_list.dl_head = NULL; + state->id_tx_rel_list.dl_pending_sends = B_FALSE; + state->id_tx_rel_list.dl_cnt = 0; + mutex_exit(&state->id_tx_rel_list.dl_mutex); mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL); state->id_tx_busy = 0; + mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL); - state->id_rx_list.dl_head = NULL; - state->id_rx_list.dl_tail = NULL; state->id_rx_list.dl_bufs_outstanding = 0; state->id_rx_list.dl_cnt = 0; mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); - mutex_init(&state->id_rxpost_lock, NULL, MUTEX_DRIVER, NULL); - + mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); (void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip)); state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0); @@ -2654,14 +2688,17 @@ ibd_state_fini(ibd_state_t *state) kmem_cache_destroy(state->id_req_kmc); - mutex_destroy(&state->id_rxpost_lock); mutex_destroy(&state->id_rx_list.dl_mutex); + mutex_destroy(&state->id_rx_free_list.dl_mutex); mutex_destroy(&state->id_txpost_lock); mutex_destroy(&state->id_tx_list.dl_mutex); + mutex_destroy(&state->id_tx_rel_list.dl_mutex); + mutex_destroy(&state->id_lso_lock); mutex_destroy(&state->id_sched_lock); - mutex_destroy(&state->id_cq_poll_lock); + mutex_destroy(&state->id_scq_poll_lock); + mutex_destroy(&state->id_rcq_poll_lock); cv_destroy(&state->id_trap_cv); mutex_destroy(&state->id_trap_lock); @@ -2955,7 +2992,7 @@ ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) /* * This code handles delayed Tx completion cleanups for mcg's to which * disable_multicast has been issued, regular mcg related cleanups during - * disable_multicast, disable_promiscous and mcg traps, as well as + * disable_multicast, 
disable_promiscuous and mcg traps, as well as * cleanups during driver detach time. Depending on the join state, * it deletes the mce from the appropriate list and issues the IBA * leave/detach; except in the disable_multicast case when the mce @@ -3121,7 +3158,9 @@ ibd_find_bgroup(ibd_state_t *state) query_bcast_grp: bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); mcg_attr.mc_pkey = state->id_pkey; + _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK; + _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { state->id_scope = mcg_attr.mc_scope = scopes[i]; @@ -3129,11 +3168,13 @@ query_bcast_grp: /* * Look for the IPoIB broadcast group. */ + _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) state->id_mgid.gid_prefix = (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | ((uint64_t)state->id_scope << 48) | ((uint32_t)(state->id_pkey << 16))); mcg_attr.mc_mgid = state->id_mgid; + _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, &state->id_mcinfo, &numg) == IBT_SUCCESS) { found = B_TRUE; @@ -3165,11 +3206,13 @@ query_bcast_grp: mcg_attr.mc_flow = 0; mcg_attr.mc_sl = 0; mcg_attr.mc_tclass = 0; + _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) state->id_mgid.gid_prefix = (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) | ((uint32_t)(state->id_pkey << 16))); mcg_attr.mc_mgid = state->id_mgid; + _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr, &mcg_info, NULL, NULL)) != IBT_SUCCESS) { @@ -3228,6 +3271,9 @@ ibd_alloc_tx_copybufs(ibd_state_t *state) state->id_tx_bufs = kmem_zalloc(state->id_num_swqe * state->id_tx_buf_sz, KM_SLEEP); + state->id_tx_wqes = kmem_zalloc(state->id_num_swqe * + sizeof (ibd_swqe_t), KM_SLEEP); + /* * Do one memory registration on the entire txbuf area */ @@ -3238,6 +3284,8 @@ ibd_alloc_tx_copybufs(ibd_state_t *state) if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) { DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed"); + kmem_free(state->id_tx_wqes, + state->id_num_swqe * sizeof (ibd_swqe_t)); kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz); state->id_tx_bufs = NULL; @@ -3283,6 +3331,8 @@ ibd_alloc_tx_lsobufs(ibd_state_t *state) return (DDI_FAILURE); } + mutex_enter(&state->id_lso_lock); + /* * Now allocate the buflist. 
Note that the elements in the buflist and * the buffers in the lso memory have a permanent 1-1 relation, so we @@ -3319,6 +3369,7 @@ ibd_alloc_tx_lsobufs(ibd_state_t *state) bktp->bkt_nfree = bktp->bkt_nelem; state->id_lso = bktp; + mutex_exit(&state->id_lso_lock); return (DDI_SUCCESS); } @@ -3332,6 +3383,8 @@ ibd_init_txlist(ibd_state_t *state) ibd_swqe_t *swqe; ibt_lkey_t lkey; int i; + uint_t len; + uint8_t *bufaddr; if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS) return (DDI_FAILURE); @@ -3345,27 +3398,35 @@ ibd_init_txlist(ibd_state_t *state) * Allocate and setup the swqe list */ lkey = state->id_tx_mr_desc.md_lkey; - for (i = 0; i < state->id_num_swqe; i++) { - if (ibd_alloc_swqe(state, &swqe, i, lkey) != DDI_SUCCESS) { - DPRINT(10, "ibd_init_txlist: ibd_alloc_swqe failed"); - ibd_fini_txlist(state); - return (DDI_FAILURE); - } + bufaddr = state->id_tx_bufs; + len = state->id_tx_buf_sz; + swqe = state->id_tx_wqes; + mutex_enter(&state->id_tx_list.dl_mutex); + for (i = 0; i < state->id_num_swqe; i++, swqe++, bufaddr += len) { + swqe->swqe_type = IBD_WQE_SEND; + swqe->swqe_next = NULL; + swqe->swqe_im_mblk = NULL; + + swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t) + bufaddr; + swqe->swqe_copybuf.ic_sgl.ds_key = lkey; + swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ + + swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; + swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS; + swqe->w_swr.wr_trans = IBT_UD_SRV; + + /* These are set in send */ + swqe->w_swr.wr_nds = 0; + swqe->w_swr.wr_sgl = NULL; + swqe->w_swr.wr_opcode = IBT_WRC_SEND; /* add to list */ state->id_tx_list.dl_cnt++; - if (state->id_tx_list.dl_head == NULL) { - swqe->swqe_prev = NULL; - swqe->swqe_next = NULL; - state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); - state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); - } else { - swqe->swqe_prev = state->id_tx_list.dl_tail; - swqe->swqe_next = NULL; - state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); - state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); - } + swqe->swqe_next = state->id_tx_list.dl_head; + state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); } + mutex_exit(&state->id_tx_list.dl_mutex); return (DDI_SUCCESS); } @@ -3503,7 +3564,9 @@ ibd_free_tx_copybufs(ibd_state_t *state) /* * Free txbuf memory */ + kmem_free(state->id_tx_wqes, state->id_num_swqe * sizeof (ibd_swqe_t)); kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz); + state->id_tx_wqes = NULL; state->id_tx_bufs = NULL; } @@ -3563,124 +3626,175 @@ ibd_fini_txlist(ibd_state_t *state) state->id_tx_list.dl_head = node->swqe_next; ASSERT(state->id_tx_list.dl_cnt > 0); state->id_tx_list.dl_cnt--; - ibd_free_swqe(state, node); } + ASSERT(state->id_tx_list.dl_cnt == 0); mutex_exit(&state->id_tx_list.dl_mutex); + mutex_enter(&state->id_tx_rel_list.dl_mutex); + while (state->id_tx_rel_list.dl_head != NULL) { + node = WQE_TO_SWQE(state->id_tx_rel_list.dl_head); + state->id_tx_rel_list.dl_head = node->swqe_next; + ASSERT(state->id_tx_rel_list.dl_cnt > 0); + state->id_tx_rel_list.dl_cnt--; + } + ASSERT(state->id_tx_rel_list.dl_cnt == 0); + mutex_exit(&state->id_tx_rel_list.dl_mutex); ibd_free_tx_lsobufs(state); ibd_free_tx_copybufs(state); } -/* - * Allocate a single send wqe and register it so it is almost - * ready to be posted to the hardware. 
- */ -static int -ibd_alloc_swqe(ibd_state_t *state, ibd_swqe_t **wqe, int ndx, ibt_lkey_t lkey) +static void +ibd_post_recv_task(ibd_rwqe_t *rwqe, ibd_rwqe_t *tail) { - ibd_swqe_t *swqe; - - swqe = kmem_zalloc(sizeof (ibd_swqe_t), KM_SLEEP); - *wqe = swqe; - - swqe->swqe_type = IBD_WQE_SEND; - swqe->swqe_next = NULL; - swqe->swqe_prev = NULL; - swqe->swqe_im_mblk = NULL; - - swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t) - (state->id_tx_bufs + ndx * state->id_tx_buf_sz); - swqe->swqe_copybuf.ic_sgl.ds_key = lkey; - swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ - - swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; - swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL; - swqe->w_swr.wr_trans = IBT_UD_SRV; - - /* These are set in send */ - swqe->w_swr.wr_nds = 0; - swqe->w_swr.wr_sgl = NULL; - swqe->w_swr.wr_opcode = IBT_WRC_SEND; - - return (DDI_SUCCESS); + uint_t i; + uint_t num_posted; + ibt_status_t ibt_status; + ibt_recv_wr_t wrs[IBD_RX_POST_CNT]; + ibd_state_t *state = rwqe->w_state; + + mutex_enter(&state->id_rx_post_lock); + if (state->id_rx_post_busy) { + tail->rwqe_next = state->id_rx_post_head; + state->id_rx_post_head = RWQE_TO_WQE(rwqe); + mutex_exit(&state->id_rx_post_lock); + return; + } + state->id_rx_post_busy = 1; + mutex_exit(&state->id_rx_post_lock); + +loop: + /* Post the IBD_RX_POST_CNT receive work requests pointed to by arg. */ + for (i = 0; i < IBD_RX_POST_CNT; i++) { + wrs[i] = rwqe->w_rwr; + rwqe = WQE_TO_RWQE(rwqe->rwqe_next); + } + + /* + * If posting fails for some reason, we'll never receive + * completion intimation, so we'll need to cleanup. But + * we need to make sure we don't clean up nodes whose + * wrs have been successfully posted. We assume that the + * hca driver returns on the first failure to post and + * therefore the first 'num_posted' entries don't need + * cleanup here. + */ + atomic_add_32(&state->id_rx_list.dl_cnt, IBD_RX_POST_CNT); + + num_posted = 0; + ibt_status = ibt_post_recv(state->id_chnl_hdl, + wrs, IBD_RX_POST_CNT, &num_posted); + if (ibt_status != IBT_SUCCESS) { + ibd_print_warn(state, "ibd_post_recv: FATAL: " + "posting multiple wrs failed: " + "requested=%d, done=%d, ret=%d", + IBD_RX_POST_CNT, num_posted, ibt_status); + atomic_add_32(&state->id_rx_list.dl_cnt, + -(IBD_RX_POST_CNT - num_posted)); + /* This cannot happen! */ + } + if (rwqe != NULL) /* more rwqes on our list? */ + goto loop; + + /* check if we have a new list */ + mutex_enter(&state->id_rx_post_lock); + if ((rwqe = WQE_TO_RWQE(state->id_rx_post_head)) != NULL) { + state->id_rx_post_head = NULL; + mutex_exit(&state->id_rx_post_lock); + goto loop; + } + state->id_rx_post_busy = 0; + mutex_exit(&state->id_rx_post_lock); } +/* macro explained below */ +#define RX_QUEUE_HASH(rwqe) \ + (((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1)) + /* - * Free an allocated send wqe. + * Add a rwqe to one of the the Rx lists. If the list is large enough + * (exactly IBD_RX_POST_CNT), post the list to the hardware. + * + * Note: one of 2^N lists is chosen via a hash. This is done + * because using one list is contentious. If the first list is busy + * (mutex_tryenter fails), use a second list (just call mutex_enter). + * + * The number 8 in RX_QUEUE_HASH is a random choice that provides + * even distribution of mapping rwqes to the 2^N queues. 
*/ -/*ARGSUSED*/ static void -ibd_free_swqe(ibd_state_t *state, ibd_swqe_t *swqe) +ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe) { - kmem_free(swqe, sizeof (ibd_swqe_t)); + ibd_rx_queue_t *rxp; + ibd_rwqe_t *tail; + + rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe); + + if (!mutex_tryenter(&rxp->rx_post_lock)) { + /* Failed. Try a different queue ("ptr + 16" ensures that). */ + rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16); + mutex_enter(&rxp->rx_post_lock); + } + rwqe->rwqe_next = rxp->rx_head; + if (rxp->rx_cnt == 0) + rxp->rx_tail = RWQE_TO_WQE(rwqe); + if (++rxp->rx_cnt == IBD_RX_POST_CNT) { + rxp->rx_head = NULL; + tail = WQE_TO_RWQE(rxp->rx_tail); + rxp->rx_cnt = 0; + } else { + rxp->rx_head = RWQE_TO_WQE(rwqe); + rwqe = NULL; + } + rxp->rx_stat++; + mutex_exit(&rxp->rx_post_lock); + if (rwqe) { + ibd_post_recv_task(rwqe, tail); + } } -/* - * Post a rwqe to the hardware and add it to the Rx list. The - * "recycle" parameter indicates whether an old rwqe is being - * recycled, or this is a new one. - */ static int -ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle) +ibd_alloc_rx_copybufs(ibd_state_t *state) { - ibt_status_t ibt_status; + ibt_mr_attr_t mem_attr; + int i; - if (recycle == B_FALSE) { - mutex_enter(&state->id_rx_list.dl_mutex); - if (state->id_rx_list.dl_head == NULL) { - rwqe->rwqe_prev = NULL; - rwqe->rwqe_next = NULL; - state->id_rx_list.dl_head = RWQE_TO_WQE(rwqe); - state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); - } else { - rwqe->rwqe_prev = state->id_rx_list.dl_tail; - rwqe->rwqe_next = NULL; - state->id_rx_list.dl_tail->w_next = RWQE_TO_WQE(rwqe); - state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); - } - mutex_exit(&state->id_rx_list.dl_mutex); - } + /* + * Allocate one big chunk for all regular rx copy bufs + */ + state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE; - mutex_enter(&state->id_rxpost_lock); - if (state->id_rx_busy) { - rwqe->w_post_link = NULL; - if (state->id_rx_head) - *(state->id_rx_tailp) = (ibd_wqe_t *)rwqe; - else - state->id_rx_head = rwqe; - state->id_rx_tailp = &(rwqe->w_post_link); - } else { - state->id_rx_busy = 1; - do { - mutex_exit(&state->id_rxpost_lock); + state->id_rx_bufs = kmem_zalloc(state->id_num_rwqe * + state->id_rx_buf_sz, KM_SLEEP); - /* - * Here we should add dl_cnt before post recv, because - * we would have to make sure dl_cnt is updated before - * the corresponding ibd_process_rx() is called. 
- */ - atomic_add_32(&state->id_rx_list.dl_cnt, 1); - - ibt_status = ibt_post_recv(state->id_chnl_hdl, - &rwqe->w_rwr, 1, NULL); - if (ibt_status != IBT_SUCCESS) { - (void) atomic_add_32_nv( - &state->id_rx_list.dl_cnt, -1); - ibd_print_warn(state, "ibd_post_recv: " - "posting failed, ret=%d", ibt_status); - return (DDI_FAILURE); - } + state->id_rx_wqes = kmem_zalloc(state->id_num_rwqe * + sizeof (ibd_rwqe_t), KM_SLEEP); - mutex_enter(&state->id_rxpost_lock); - rwqe = state->id_rx_head; - if (rwqe) { - state->id_rx_head = - (ibd_rwqe_t *)(rwqe->w_post_link); - } - } while (rwqe); - state->id_rx_busy = 0; + state->id_rx_nqueues = 1 << IBD_LOG_RX_POST; + state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues * + sizeof (ibd_rx_queue_t), KM_SLEEP); + for (i = 0; i < state->id_rx_nqueues; i++) { + ibd_rx_queue_t *rxp = state->id_rx_queues + i; + mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL); + } + + /* + * Do one memory registration on the entire rxbuf area + */ + mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs; + mem_attr.mr_len = state->id_num_rwqe * state->id_rx_buf_sz; + mem_attr.mr_as = NULL; + mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; + if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, + &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) { + DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed"); + kmem_free(state->id_rx_wqes, + state->id_num_rwqe * sizeof (ibd_rwqe_t)); + kmem_free(state->id_rx_bufs, + state->id_num_rwqe * state->id_rx_buf_sz); + state->id_rx_bufs = NULL; + state->id_rx_wqes = NULL; + return (DDI_FAILURE); } - mutex_exit(&state->id_rxpost_lock); return (DDI_SUCCESS); } @@ -3692,166 +3806,131 @@ static int ibd_init_rxlist(ibd_state_t *state) { ibd_rwqe_t *rwqe; + ibt_lkey_t lkey; int i; + uint_t len; + uint8_t *bufaddr; - for (i = 0; i < state->id_num_rwqe; i++) { - if (ibd_alloc_rwqe(state, &rwqe) != DDI_SUCCESS) { - ibd_fini_rxlist(state); - return (DDI_FAILURE); - } + if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS) + return (DDI_FAILURE); + + /* + * Allocate and setup the rwqe list + */ + lkey = state->id_rx_mr_desc.md_lkey; + rwqe = state->id_rx_wqes; + bufaddr = state->id_rx_bufs; + len = state->id_rx_buf_sz; + for (i = 0; i < state->id_num_rwqe; i++, rwqe++, bufaddr += len) { + rwqe->rwqe_type = IBD_WQE_RECV; + rwqe->w_state = state; + rwqe->w_freeing_wqe = B_FALSE; + rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; + rwqe->w_freemsg_cb.free_arg = (char *)rwqe; - if (ibd_post_recv(state, rwqe, B_FALSE) == DDI_FAILURE) { - ibd_free_rwqe(state, rwqe); + rwqe->rwqe_copybuf.ic_bufaddr = bufaddr; + + if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0, + &rwqe->w_freemsg_cb)) == NULL) { + DPRINT(10, "ibd_init_rxlist : failed in desballoc()"); + rwqe->rwqe_copybuf.ic_bufaddr = NULL; ibd_fini_rxlist(state); return (DDI_FAILURE); } + + rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey; + rwqe->rwqe_copybuf.ic_sgl.ds_va = + (ib_vaddr_t)(uintptr_t)bufaddr; + rwqe->rwqe_copybuf.ic_sgl.ds_len = len; + rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; + rwqe->w_rwr.wr_nds = 1; + rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; + + ibd_post_recv(state, rwqe); } return (DDI_SUCCESS); } -/* - * Free the statically allocated Rx buffer list. 
- * - */ static void -ibd_fini_rxlist(ibd_state_t *state) +ibd_free_rx_copybufs(ibd_state_t *state) { - ibd_rwqe_t *node; + int i; - mutex_enter(&state->id_rx_list.dl_mutex); - while (state->id_rx_list.dl_head != NULL) { - node = WQE_TO_RWQE(state->id_rx_list.dl_head); - state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next; - ASSERT(state->id_rx_list.dl_cnt > 0); - state->id_rx_list.dl_cnt--; + /* + * Unregister rxbuf mr + */ + if (ibt_deregister_mr(state->id_hca_hdl, + state->id_rx_mr_hdl) != IBT_SUCCESS) { + DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed"); + } + state->id_rx_mr_hdl = NULL; - ibd_free_rwqe(state, node); + /* + * Free rxbuf memory + */ + for (i = 0; i < state->id_rx_nqueues; i++) { + ibd_rx_queue_t *rxp = state->id_rx_queues + i; + mutex_destroy(&rxp->rx_post_lock); } - mutex_exit(&state->id_rx_list.dl_mutex); + kmem_free(state->id_rx_queues, state->id_rx_nqueues * + sizeof (ibd_rx_queue_t)); + kmem_free(state->id_rx_wqes, state->id_num_rwqe * sizeof (ibd_rwqe_t)); + kmem_free(state->id_rx_bufs, state->id_num_rwqe * state->id_rx_buf_sz); + state->id_rx_queues = NULL; + state->id_rx_wqes = NULL; + state->id_rx_bufs = NULL; } /* - * Allocate a single recv wqe and register it so it is almost - * ready to be posted to the hardware. + * Free the statically allocated Rx buffer list. + * */ -static int -ibd_alloc_rwqe(ibd_state_t *state, ibd_rwqe_t **wqe) +static void +ibd_fini_rxlist(ibd_state_t *state) { - ibt_mr_attr_t mem_attr; ibd_rwqe_t *rwqe; + int i; - if ((rwqe = kmem_zalloc(sizeof (ibd_rwqe_t), KM_NOSLEEP)) == NULL) { - DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc"); - return (DDI_FAILURE); - } - *wqe = rwqe; - rwqe->rwqe_type = IBD_WQE_RECV; - rwqe->w_state = state; - rwqe->rwqe_next = NULL; - rwqe->rwqe_prev = NULL; - rwqe->w_freeing_wqe = B_FALSE; - rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; - rwqe->w_freemsg_cb.free_arg = (char *)rwqe; - - rwqe->rwqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu + - IPOIB_GRH_SIZE, KM_NOSLEEP); - if (rwqe->rwqe_copybuf.ic_bufaddr == NULL) { - DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc"); - kmem_free(rwqe, sizeof (ibd_rwqe_t)); - return (DDI_FAILURE); - } - - if ((rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, - state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb)) == - NULL) { - DPRINT(10, "ibd_alloc_rwqe : failed in desballoc()"); - kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, - state->id_mtu + IPOIB_GRH_SIZE); - rwqe->rwqe_copybuf.ic_bufaddr = NULL; - kmem_free(rwqe, sizeof (ibd_rwqe_t)); - return (DDI_FAILURE); - } - - mem_attr.mr_vaddr = (uint64_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; - mem_attr.mr_len = state->id_mtu + IPOIB_GRH_SIZE; - mem_attr.mr_as = NULL; - mem_attr.mr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE; - if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, - &rwqe->rwqe_copybuf.ic_mr_hdl, &rwqe->rwqe_copybuf.ic_mr_desc) != - IBT_SUCCESS) { - DPRINT(10, "ibd_alloc_rwqe : failed in ibt_register_mem()"); - rwqe->w_freeing_wqe = B_TRUE; - freemsg(rwqe->rwqe_im_mblk); - kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, - state->id_mtu + IPOIB_GRH_SIZE); - rwqe->rwqe_copybuf.ic_bufaddr = NULL; - kmem_free(rwqe, sizeof (ibd_rwqe_t)); - return (DDI_FAILURE); + mutex_enter(&state->id_rx_list.dl_mutex); + rwqe = state->id_rx_wqes; + for (i = 0; i < state->id_num_rwqe; i++, rwqe++) { + if (rwqe->rwqe_im_mblk != NULL) { + rwqe->w_freeing_wqe = B_TRUE; + freemsg(rwqe->rwqe_im_mblk); + } } + mutex_exit(&state->id_rx_list.dl_mutex); - 
rwqe->rwqe_copybuf.ic_sgl.ds_va = - (ib_vaddr_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; - rwqe->rwqe_copybuf.ic_sgl.ds_key = - rwqe->rwqe_copybuf.ic_mr_desc.md_lkey; - rwqe->rwqe_copybuf.ic_sgl.ds_len = state->id_mtu + IPOIB_GRH_SIZE; - rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; - rwqe->w_rwr.wr_nds = 1; - rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; - - return (DDI_SUCCESS); + ibd_free_rx_copybufs(state); } /* * Free an allocated recv wqe. */ +/* ARGSUSED */ static void ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) { - if (ibt_deregister_mr(state->id_hca_hdl, - rwqe->rwqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) { - DPRINT(10, "ibd_free_rwqe: failed in ibt_deregister_mr()"); - return; - } - /* - * Indicate to the callback function that this rwqe/mblk - * should not be recycled. The freemsg() will invoke - * ibd_freemsg_cb(). - */ - if (rwqe->rwqe_im_mblk != NULL) { - rwqe->w_freeing_wqe = B_TRUE; - freemsg(rwqe->rwqe_im_mblk); - } - kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, - state->id_mtu + IPOIB_GRH_SIZE); - rwqe->rwqe_copybuf.ic_bufaddr = NULL; - kmem_free(rwqe, sizeof (ibd_rwqe_t)); -} - -/* - * Delete the rwqe being freed from the rx list. - */ -static void -ibd_delete_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) -{ - mutex_enter(&state->id_rx_list.dl_mutex); - if (state->id_rx_list.dl_head == RWQE_TO_WQE(rwqe)) - state->id_rx_list.dl_head = rwqe->rwqe_next; - else - rwqe->rwqe_prev->w_next = rwqe->rwqe_next; - if (state->id_rx_list.dl_tail == RWQE_TO_WQE(rwqe)) - state->id_rx_list.dl_tail = rwqe->rwqe_prev; - else - rwqe->rwqe_next->w_prev = rwqe->rwqe_prev; - mutex_exit(&state->id_rx_list.dl_mutex); + * desballoc() failed (no memory). + * + * This rwqe is placed on a free list so that it + * can be reinstated when memory is available. + * + * NOTE: no code currently exists to reinstate + * these "lost" rwqes. + */ + mutex_enter(&state->id_rx_free_list.dl_mutex); + state->id_rx_free_list.dl_cnt++; + rwqe->rwqe_next = state->id_rx_free_list.dl_head; + state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe); + mutex_exit(&state->id_rx_free_list.dl_mutex); } /* - * IBA Rx/Tx completion queue handler. Guaranteed to be single - * threaded and nonreentrant for this CQ. When using combined CQ, - * this handles Tx and Rx completions. With separate CQs, this handles - * only Rx completions. + * IBA Rx completion queue handler. Guaranteed to be single + * threaded and nonreentrant for this CQ. */ /* ARGSUSED */ static void @@ -3861,14 +3940,22 @@ ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) atomic_add_64(&state->id_num_intrs, 1); - if (ibd_rx_softintr == 1) - ddi_trigger_softintr(state->id_rx); - else - (void) ibd_intr((char *)state); + if (ibd_rx_softintr == 1) { + mutex_enter(&state->id_rcq_poll_lock); + if (state->id_rcq_poll_busy & IBD_CQ_POLLING) { + state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING; + mutex_exit(&state->id_rcq_poll_lock); + return; + } else { + mutex_exit(&state->id_rcq_poll_lock); + ddi_trigger_softintr(state->id_rx); + } + } else + (void) ibd_intr((caddr_t)state); } /* - * Separate CQ handler for Tx completions, when the Tx CQ is in + * CQ handler for Tx completions, when the Tx CQ is in * interrupt driven mode. 
*/ /* ARGSUSED */ @@ -3879,10 +3966,18 @@ ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) atomic_add_64(&state->id_num_intrs, 1); - if (ibd_tx_softintr == 1) - ddi_trigger_softintr(state->id_tx); - else - (void) ibd_tx_recycle((char *)state); + if (ibd_tx_softintr == 1) { + mutex_enter(&state->id_scq_poll_lock); + if (state->id_scq_poll_busy & IBD_CQ_POLLING) { + state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING; + mutex_exit(&state->id_scq_poll_lock); + return; + } else { + mutex_exit(&state->id_scq_poll_lock); + ddi_trigger_softintr(state->id_tx); + } + } else + (void) ibd_tx_recycle((caddr_t)state); } /* @@ -3901,14 +3996,16 @@ ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code, /* * The trap handler will get invoked once for every event for - * evert port. The input "gid" is the GID0 of the port the + * every port. The input "gid" is the GID0 of the port the * trap came in on; we just need to act on traps that came * to our port, meaning the port on which the ipoib interface * resides. Since ipoib uses GID0 of the port, we just match * the gids to check whether we need to handle the trap. */ + _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0) return; + _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) DPRINT(10, "ibd_notices_handler : %d\n", code); @@ -4101,7 +4198,9 @@ ibd_get_port_details(ibd_state_t *state) } state->id_mtu = (128 << port_infop->p_mtu); + _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) state->id_sgid = *port_infop->p_sgid_tbl; + _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) state->id_link_state = LINK_STATE_UP; mutex_exit(&state->id_link_mutex); @@ -4129,7 +4228,7 @@ ibd_alloc_cqs(ibd_state_t *state) /* * Allocate Rx/combined CQ: * Theoretically, there is no point in having more than #rwqe - * plus #swqe cqe's, except that the CQ will be signalled for + * plus #swqe cqe's, except that the CQ will be signaled for * overflow when the last wqe completes, if none of the previous * cqe's have been polled. Thus, we allocate just a few less wqe's * to make sure such overflow does not occur. @@ -4137,94 +4236,63 @@ ibd_alloc_cqs(ibd_state_t *state) cq_attr.cq_sched = NULL; cq_attr.cq_flags = IBT_CQ_NO_FLAGS; - if (ibd_separate_cqs == 1) { - /* - * Allocate Receive CQ. - */ - if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) { - cq_attr.cq_size = state->id_num_rwqe + 1; - } else { - cq_attr.cq_size = hca_attrs.hca_max_cq_sz; - state->id_num_rwqe = cq_attr.cq_size - 1; - } - - if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, - &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { - DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) " - "failed, ret=%d\n", ret); - return (DDI_FAILURE); - } + /* + * Allocate Receive CQ. 
+ */ + if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) { + cq_attr.cq_size = state->id_num_rwqe + 1; + } else { + cq_attr.cq_size = hca_attrs.hca_max_cq_sz; + state->id_num_rwqe = cq_attr.cq_size - 1; + } - if ((ret = ibt_modify_cq(state->id_rcq_hdl, - ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) { - DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt " - "moderation failed, ret=%d\n", ret); - } + if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, + &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { + DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) " + "failed, ret=%d\n", ret); + return (DDI_FAILURE); + } - state->id_rxwcs_size = state->id_num_rwqe + 1; - state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * - state->id_rxwcs_size, KM_SLEEP); + if ((ret = ibt_modify_cq(state->id_rcq_hdl, + ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) { + DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt " + "moderation failed, ret=%d\n", ret); + } - /* - * Allocate Send CQ. - */ - if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) { - cq_attr.cq_size = state->id_num_swqe + 1; - } else { - cq_attr.cq_size = hca_attrs.hca_max_cq_sz; - state->id_num_swqe = cq_attr.cq_size - 1; - } + /* make the #rx wc's the same as max rx chain size */ + state->id_rxwcs_size = IBD_MAX_RX_MP_LEN; + state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * + state->id_rxwcs_size, KM_SLEEP); - if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, - &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) { - DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) " - "failed, ret=%d\n", ret); - kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * - state->id_rxwcs_size); - (void) ibt_free_cq(state->id_rcq_hdl); - return (DDI_FAILURE); - } - if ((ret = ibt_modify_cq(state->id_scq_hdl, - IBD_TXCOMP_COUNT, IBD_TXCOMP_USEC, 0)) != IBT_SUCCESS) { - DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt " - "moderation failed, ret=%d\n", ret); - } - - state->id_txwcs_size = state->id_num_swqe + 1; - state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * - state->id_txwcs_size, KM_SLEEP); + /* + * Allocate Send CQ. + */ + if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) { + cq_attr.cq_size = state->id_num_swqe + 1; } else { - /* - * Allocate combined Send/Receive CQ. 
- */ - if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + - state->id_num_swqe + 1)) { - cq_attr.cq_size = state->id_num_rwqe + - state->id_num_swqe + 1; - } else { - cq_attr.cq_size = hca_attrs.hca_max_cq_sz; - state->id_num_rwqe = ((cq_attr.cq_size - 1) * - state->id_num_rwqe) / (state->id_num_rwqe + - state->id_num_swqe); - state->id_num_swqe = cq_attr.cq_size - 1 - - state->id_num_rwqe; - } - - state->id_rxwcs_size = cq_attr.cq_size; - state->id_txwcs_size = state->id_rxwcs_size; + cq_attr.cq_size = hca_attrs.hca_max_cq_sz; + state->id_num_swqe = cq_attr.cq_size - 1; + } - if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, - &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { - DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rscq) " - "failed, ret=%d\n", ret); - return (DDI_FAILURE); - } - state->id_scq_hdl = state->id_rcq_hdl; - state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * - state->id_rxwcs_size, KM_SLEEP); - state->id_txwcs = state->id_rxwcs; + if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, + &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) { + DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) " + "failed, ret=%d\n", ret); + kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * + state->id_rxwcs_size); + (void) ibt_free_cq(state->id_rcq_hdl); + return (DDI_FAILURE); + } + if ((ret = ibt_modify_cq(state->id_scq_hdl, + ibd_txcomp_count, ibd_txcomp_usec, 0)) != IBT_SUCCESS) { + DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt " + "moderation failed, ret=%d\n", ret); } + state->id_txwcs_size = IBD_TX_POLL_THRESH; + state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * + state->id_txwcs_size, KM_SLEEP); + /* * Print message in case we could not allocate as many wqe's * as was requested. @@ -4248,7 +4316,7 @@ ibd_setup_ud_channel(ibd_state_t *state) ibt_ud_chan_query_attr_t ud_chan_attr; ibt_status_t ret; - ud_alloc_attr.ud_flags = IBT_WR_SIGNALED; + ud_alloc_attr.ud_flags = IBT_ALL_SIGNALED; if (state->id_hca_res_lkey_capab) ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY; if (state->id_lso_policy && state->id_lso_capable) @@ -4341,7 +4409,7 @@ ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state) */ DPRINT(2, "ibd_undo_start: " "reclaiming failed"); - ibd_poll_compq(state, state->id_rcq_hdl); + ibd_poll_rcq(state, state->id_rcq_hdl); ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); return (DDI_FAILURE); @@ -4383,10 +4451,8 @@ ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state) * ibt_set_cq_handler() returns, the old handler is * guaranteed not to be invoked anymore. 
*/ - if (ibd_separate_cqs == 1) { - ibt_set_cq_handler(state->id_scq_hdl, 0, 0); - } - ibd_poll_compq(state, state->id_scq_hdl); + ibt_set_cq_handler(state->id_scq_hdl, 0, 0); + ibd_poll_scq(state, state->id_scq_hdl); state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED); } @@ -4456,14 +4522,12 @@ ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state) } if (progress & IBD_DRV_CQS_ALLOCD) { - if (ibd_separate_cqs == 1) { - kmem_free(state->id_txwcs, - sizeof (ibt_wc_t) * state->id_txwcs_size); - if ((ret = ibt_free_cq(state->id_scq_hdl)) != - IBT_SUCCESS) { - DPRINT(10, "ibd_undo_start: free_cq(scq) " - "failed, ret=%d", ret); - } + kmem_free(state->id_txwcs, + sizeof (ibt_wc_t) * state->id_txwcs_size); + if ((ret = ibt_free_cq(state->id_scq_hdl)) != + IBT_SUCCESS) { + DPRINT(10, "ibd_undo_start: free_cq(scq) " + "failed, ret=%d", ret); } kmem_free(state->id_rxwcs, @@ -4482,7 +4546,9 @@ ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state) } if (progress & IBD_DRV_ACACHE_INITIALIZED) { + mutex_enter(&state->id_ac_mutex); mod_hash_destroy_hash(state->id_ah_active_hash); + mutex_exit(&state->id_ac_mutex); ibd_acache_fini(state); state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED); @@ -4626,19 +4692,17 @@ ibd_start(ibd_state_t *state) state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD; /* - * If we have separate cqs, create the send cq handler here + * Create the send cq handler here */ - if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) { - ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); - if ((ret = ibt_enable_cq_notify(state->id_scq_hdl, - IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { - DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) " - "failed, ret=%d", ret); - err = EINVAL; - goto start_fail; - } - state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED; + ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); + if ((ret = ibt_enable_cq_notify(state->id_scq_hdl, + IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { + DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) " + "failed, ret=%d", ret); + err = EINVAL; + goto start_fail; } + state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED; /* * Allocate and initialize the rx buffer list @@ -4665,7 +4729,9 @@ ibd_start(ibd_state_t *state) */ kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, TS_RUN, minclsyspri); + _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_async_thrid)) state->id_async_thrid = kht->t_did; + _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_async_thrid)) state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED; /* @@ -4680,7 +4746,7 @@ ibd_start(ibd_state_t *state) ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, state->id_mgid.gid_prefix, state->id_mgid.gid_guid); - mac_maxsdu_update(state->id_mh, state->id_mtu - IPOIB_HDRSIZE); + (void) mac_maxsdu_update(state->id_mh, state->id_mtu - IPOIB_HDRSIZE); mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); /* @@ -4789,7 +4855,7 @@ ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op) if (op == IBD_ASYNC_JOIN) { if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) { - ibd_print_warn(state, "Joint multicast group failed :" + ibd_print_warn(state, "Join multicast group failed :" "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); } } else { @@ -4835,7 +4901,7 @@ ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac) /* * Check validity of MCG address. 
We could additionally check * that a enable/disable is not being issued on the "broadcast" - * mcg, but since this operation is only invokable by priviledged + * mcg, but since this operation is only invokable by privileged * programs anyway, we allow the flexibility to those dlpi apps. * Note that we do not validate the "scope" of the IBA mcg. */ @@ -5046,124 +5112,100 @@ ibd_m_stat(void *arg, uint_t stat, uint64_t *val) static void ibd_async_txsched(ibd_state_t *state) { - ibd_req_t *req; - int ret; - - if (ibd_txcomp_poll) - ibd_poll_compq(state, state->id_scq_hdl); - - ret = ibd_resume_transmission(state); - if (ret && ibd_txcomp_poll) { - if (req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP)) - ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED); - else { - ibd_print_warn(state, "ibd_async_txsched: " - "no memory, can't schedule work slot"); - } - } + ibd_resume_transmission(state); } -static int +static void ibd_resume_transmission(ibd_state_t *state) { int flag; int met_thresh = 0; + int thresh = 0; int ret = -1; mutex_enter(&state->id_sched_lock); if (state->id_sched_needed & IBD_RSRC_SWQE) { - met_thresh = (state->id_tx_list.dl_cnt > - IBD_FREE_SWQES_THRESH); + mutex_enter(&state->id_tx_list.dl_mutex); + mutex_enter(&state->id_tx_rel_list.dl_mutex); + met_thresh = state->id_tx_list.dl_cnt + + state->id_tx_rel_list.dl_cnt; + mutex_exit(&state->id_tx_rel_list.dl_mutex); + mutex_exit(&state->id_tx_list.dl_mutex); + thresh = IBD_FREE_SWQES_THRESH; flag = IBD_RSRC_SWQE; } else if (state->id_sched_needed & IBD_RSRC_LSOBUF) { ASSERT(state->id_lso != NULL); - met_thresh = (state->id_lso->bkt_nfree > - IBD_FREE_LSOS_THRESH); + mutex_enter(&state->id_lso_lock); + met_thresh = state->id_lso->bkt_nfree; + thresh = IBD_FREE_LSOS_THRESH; + mutex_exit(&state->id_lso_lock); flag = IBD_RSRC_LSOBUF; + if (met_thresh > thresh) + state->id_sched_lso_cnt++; } - if (met_thresh) { + if (met_thresh > thresh) { state->id_sched_needed &= ~flag; + state->id_sched_cnt++; ret = 0; } mutex_exit(&state->id_sched_lock); if (ret == 0) mac_tx_update(state->id_mh); - - return (ret); } /* * Release the send wqe back into free list. */ static void -ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *swqe) +ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n) { /* * Add back on Tx list for reuse. */ - swqe->swqe_next = NULL; - mutex_enter(&state->id_tx_list.dl_mutex); - if (state->id_tx_list.dl_pending_sends) { - state->id_tx_list.dl_pending_sends = B_FALSE; - } - if (state->id_tx_list.dl_head == NULL) { - state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); - } else { - state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); - } - state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); - state->id_tx_list.dl_cnt++; - mutex_exit(&state->id_tx_list.dl_mutex); + ASSERT(tail->swqe_next == NULL); + mutex_enter(&state->id_tx_rel_list.dl_mutex); + state->id_tx_rel_list.dl_pending_sends = B_FALSE; + tail->swqe_next = state->id_tx_rel_list.dl_head; + state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head); + state->id_tx_rel_list.dl_cnt += n; + mutex_exit(&state->id_tx_rel_list.dl_mutex); } /* * Acquire a send wqe from free list. * Returns error number and send wqe pointer. */ -static int -ibd_acquire_swqe(ibd_state_t *state, ibd_swqe_t **swqe) +static ibd_swqe_t * +ibd_acquire_swqe(ibd_state_t *state) { - int rc = 0; ibd_swqe_t *wqe; - /* - * Check and reclaim some of the completed Tx requests. 
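The new id_tx_rel_list splits the free-swqe traffic across two locks: completion processing prepends reclaimed swqes to the release list in a single lock hold, and the send path refills id_tx_list from it only when the free list runs dry. A simplified user-space sketch follows, using pthreads instead of kmutex_t and assuming the caller of acquire_swqe() already holds the free list's lock, the way ibd_send() holds id_tx_list.dl_mutex.

#include <stddef.h>
#include <pthread.h>

typedef struct swqe {
	struct swqe	*next;
} swqe_t;

typedef struct wqe_list {
	pthread_mutex_t	lock;
	swqe_t		*head;
	int		cnt;
} wqe_list_t;

/* completion side: prepend a chain of n reclaimed entries in one shot */
static void
release_swqes(wqe_list_t *rel, swqe_t *head, swqe_t *tail, int n)
{
	pthread_mutex_lock(&rel->lock);
	tail->next = rel->head;
	rel->head = head;
	rel->cnt += n;
	pthread_mutex_unlock(&rel->lock);
}

/*
 * Send side: refill the free list from the release list when empty.
 * The free list's own lock is assumed to be held by the caller.
 */
static swqe_t *
acquire_swqe(wqe_list_t *freelist, wqe_list_t *rel)
{
	swqe_t *wqe;

	if (freelist->head == NULL) {
		pthread_mutex_lock(&rel->lock);
		freelist->head = rel->head;
		freelist->cnt = rel->cnt;
		rel->head = NULL;
		rel->cnt = 0;
		pthread_mutex_unlock(&rel->lock);
	}
	if ((wqe = freelist->head) != NULL) {
		freelist->head = wqe->next;
		freelist->cnt--;
	}
	return (wqe);	/* NULL means no free swqe; caller must back off */
}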
- * If someone else is already in this code and pulling Tx - * completions, no need to poll, since the current lock holder - * will do the work anyway. Normally, we poll for completions - * every few Tx attempts, but if we are short on Tx descriptors, - * we always try to poll. - */ - if ((ibd_txcomp_poll == 1) && - (state->id_tx_list.dl_cnt < IBD_TX_POLL_THRESH)) { - ibd_poll_compq(state, state->id_scq_hdl); - } + mutex_enter(&state->id_tx_rel_list.dl_mutex); + if (state->id_tx_rel_list.dl_head != NULL) { + /* transfer id_tx_rel_list to id_tx_list */ + state->id_tx_list.dl_head = + state->id_tx_rel_list.dl_head; + state->id_tx_list.dl_cnt = + state->id_tx_rel_list.dl_cnt; + state->id_tx_list.dl_pending_sends = B_FALSE; - /* - * Grab required transmit wqes. - */ - mutex_enter(&state->id_tx_list.dl_mutex); - wqe = WQE_TO_SWQE(state->id_tx_list.dl_head); - if (wqe != NULL) { + /* clear id_tx_rel_list */ + state->id_tx_rel_list.dl_head = NULL; + state->id_tx_rel_list.dl_cnt = 0; + mutex_exit(&state->id_tx_rel_list.dl_mutex); + + wqe = WQE_TO_SWQE(state->id_tx_list.dl_head); state->id_tx_list.dl_cnt -= 1; state->id_tx_list.dl_head = wqe->swqe_next; - if (state->id_tx_list.dl_tail == SWQE_TO_WQE(wqe)) - state->id_tx_list.dl_tail = NULL; - } else { - /* - * If we did not find the number we were looking for, flag - * no resource. Adjust list appropriately in either case. - */ - rc = ENOENT; + } else { /* no free swqe */ + mutex_exit(&state->id_tx_rel_list.dl_mutex); state->id_tx_list.dl_pending_sends = B_TRUE; DPRINT(5, "ibd_acquire_swqe: out of Tx wqe"); - atomic_add_64(&state->id_tx_short, 1); + state->id_tx_short++; + wqe = NULL; } - mutex_exit(&state->id_tx_list.dl_mutex); - *swqe = wqe; - - return (rc); + return (wqe); } static int @@ -5283,60 +5325,44 @@ ibd_post_send(ibd_state_t *state, ibd_swqe_t *node) uint_t num_posted; uint_t n_wrs; ibt_status_t ibt_status; - ibt_send_wr_t wrs[IBD_MAX_POST_MULTIPLE]; - ibd_swqe_t *elem; - ibd_swqe_t *nodes[IBD_MAX_POST_MULTIPLE]; - - node->swqe_next = NULL; - - mutex_enter(&state->id_txpost_lock); - - /* - * Enqueue the new node in chain of wqes to send - */ - if (state->id_tx_head) { - *(state->id_tx_tailp) = (ibd_wqe_t *)node; - } else { - state->id_tx_head = node; - } - state->id_tx_tailp = &(node->swqe_next); - - /* - * If someone else is helping out with the sends, - * just go back - */ - if (state->id_tx_busy) { - mutex_exit(&state->id_txpost_lock); - return; + ibt_send_wr_t wrs[IBD_MAX_TX_POST_MULTIPLE]; + ibd_swqe_t *tx_head, *elem; + ibd_swqe_t *nodes[IBD_MAX_TX_POST_MULTIPLE]; + + /* post the one request, then check for more */ + ibt_status = ibt_post_send(state->id_chnl_hdl, + &node->w_swr, 1, NULL); + if (ibt_status != IBT_SUCCESS) { + ibd_print_warn(state, "ibd_post_send: " + "posting one wr failed: ret=%d", ibt_status); + ibd_tx_cleanup(state, node); } - /* - * Otherwise, mark the flag to indicate that we'll be - * doing the dispatch of what's there in the wqe chain - */ - state->id_tx_busy = 1; + tx_head = NULL; + for (;;) { + if (tx_head == NULL) { + mutex_enter(&state->id_txpost_lock); + tx_head = state->id_tx_head; + if (tx_head == NULL) { + state->id_tx_busy = 0; + mutex_exit(&state->id_txpost_lock); + return; + } + state->id_tx_head = NULL; + mutex_exit(&state->id_txpost_lock); + } - while (state->id_tx_head) { /* - * Collect pending requests, IBD_MAX_POST_MULTIPLE wrs + * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs * at a time if possible, and keep posting them. 
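The rewritten ibd_post_send() walks the deferred chain in batches of IBD_MAX_TX_POST_MULTIPLE work requests per ibt_post_send() call. Here is a small sketch of that batching loop; post_hw() is a stand-in that always succeeds, and MAX_POST is an illustrative value, not the driver's constant.

#include <stddef.h>

#define	MAX_POST	4

typedef struct swqe {
	struct swqe	*next;
	int		wr;	/* stand-in for the hardware work request */
} swqe_t;

static void
post_hw(int *wrs, unsigned int n)
{
	(void) wrs;
	(void) n;		/* the hardware doorbell would go here */
}

static void
post_chain(swqe_t *head)
{
	int wrs[MAX_POST];
	swqe_t *elem;
	unsigned int n;

	while (head != NULL) {
		/* collect up to MAX_POST requests from the chain */
		for (n = 0, elem = head;
		    elem != NULL && n < MAX_POST;
		    elem = elem->next, n++)
			wrs[n] = elem->wr;
		post_hw(wrs, n);
		head = elem;	/* continue with what is left after this batch */
	}
}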
*/ - for (n_wrs = 0, elem = state->id_tx_head; - (elem) && (n_wrs < IBD_MAX_POST_MULTIPLE); + for (n_wrs = 0, elem = tx_head; + (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE); elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) { - nodes[n_wrs] = elem; wrs[n_wrs] = elem->w_swr; } - state->id_tx_head = elem; - - /* - * Release the txpost lock before posting the - * send request to the hca; if the posting fails - * for some reason, we'll never receive completion - * intimation, so we'll need to cleanup. - */ - mutex_exit(&state->id_txpost_lock); + tx_head = elem; ASSERT(n_wrs != 0); @@ -5353,7 +5379,6 @@ ibd_post_send(ibd_state_t *state, ibd_swqe_t *node) ibt_status = ibt_post_send(state->id_chnl_hdl, wrs, n_wrs, &num_posted); if (ibt_status != IBT_SUCCESS) { - ibd_print_warn(state, "ibd_post_send: " "posting multiple wrs failed: " "requested=%d, done=%d, ret=%d", @@ -5362,15 +5387,7 @@ ibd_post_send(ibd_state_t *state, ibd_swqe_t *node) for (i = num_posted; i < n_wrs; i++) ibd_tx_cleanup(state, nodes[i]); } - - /* - * Grab the mutex before we go and check the tx Q again - */ - mutex_enter(&state->id_txpost_lock); } - - state->id_tx_busy = 0; - mutex_exit(&state->id_txpost_lock); } static int @@ -5388,7 +5405,6 @@ ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node, uint_t pktsize; uint_t frag_len; uint_t pending_hdr; - uint_t hiwm; int nmblks; int i; @@ -5420,21 +5436,13 @@ ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node, pktsize -= pending_hdr; /* - * Translating the virtual address regions into physical regions - * for using the Reserved LKey feature results in a wr sgl that - * is a little longer. Since failing ibt_map_mem_iov() is costly, - * we'll fix a high-water mark (65%) for when we should stop. - */ - hiwm = (state->id_max_sqseg * 65) / 100; - - /* * We only do ibt_map_mem_iov() if the pktsize is above the * "copy-threshold", and if the number of mp fragments is less than * the maximum acceptable. */ if ((state->id_hca_res_lkey_capab) && (pktsize > IBD_TX_COPY_THRESH) && - (nmblks < hiwm)) { + (nmblks < state->id_max_sqseg_hiwm)) { ibt_iov_t iov_arr[IBD_MAX_SQSEG]; ibt_iov_attr_t iov_attr; @@ -5591,14 +5599,22 @@ ibd_send(ibd_state_t *state, mblk_t *mp) if ((state->id_mac_state & IBD_DRV_STARTED) == 0) return (B_FALSE); - node = NULL; - if (ibd_acquire_swqe(state, &node) != 0) { + mutex_enter(&state->id_tx_list.dl_mutex); + node = WQE_TO_SWQE(state->id_tx_list.dl_head); + if (node != NULL) { + state->id_tx_list.dl_cnt -= 1; + state->id_tx_list.dl_head = node->swqe_next; + } else { + node = ibd_acquire_swqe(state); + } + mutex_exit(&state->id_tx_list.dl_mutex); + if (node == NULL) { /* * If we don't have an swqe available, schedule a transmit * completion queue cleanup and hold off on sending more * more packets until we have some free swqes */ - if (ibd_sched_poll(state, IBD_RSRC_SWQE, ibd_txcomp_poll) == 0) + if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0) return (B_FALSE); /* @@ -5650,14 +5666,6 @@ ibd_send(ibd_state_t *state, mblk_t *mp) node->w_ahandle = NULL; /* - * for the poll mode, it is probably some cqe pending in the - * cq. So ibd has to poll cq here, otherwise acache probably - * may not be recycled. - */ - if (ibd_txcomp_poll == 1) - ibd_poll_compq(state, state->id_scq_hdl); - - /* * Here if ibd_acache_lookup() returns EFAULT, it means ibd * can not find a path for the specific dest address. We * should get rid of this kind of packet. 
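id_max_sqseg_hiwm replaces the per-packet high-water-mark computation that used to live in ibd_prepare_sgl(); the 65% figure below is taken from the comment being removed and is assumed to be what the driver now precomputes at attach time. A small runnable sketch of the copy-versus-map decision, with illustrative threshold values:

#include <stdio.h>

#define	TX_COPY_THRESH	4096	/* illustrative, not IBD_TX_COPY_THRESH */
#define	MAX_SQSEG	59	/* illustrative sgl limit */

enum tx_strategy { TX_COPY, TX_DMA_MAP };

static enum tx_strategy
choose_tx_strategy(unsigned int pktsize, int nmblks, int lkey_capab)
{
	/* precomputed once in the driver (id_max_sqseg_hiwm) */
	static const int max_sqseg_hiwm = (MAX_SQSEG * 65) / 100;

	if (lkey_capab && pktsize > TX_COPY_THRESH &&
	    nmblks < max_sqseg_hiwm)
		return (TX_DMA_MAP);	/* worth an ibt_map_mem_iov() */
	return (TX_COPY);		/* copy into the pre-mapped buffer */
}

int
main(void)
{
	printf("%d\n", choose_tx_strategy(9000, 3, 1));	/* TX_DMA_MAP */
	printf("%d\n", choose_tx_strategy(512, 1, 1));	/* TX_COPY */
	return (0);
}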
We also should get @@ -5781,7 +5789,23 @@ ibd_send(ibd_state_t *state, mblk_t *mp) * post instead of doing it serially, we cannot assume anything * about the 'node' after ibd_post_send() returns. */ - ibd_post_send(state, node); + node->swqe_next = NULL; + + mutex_enter(&state->id_txpost_lock); + if (state->id_tx_busy) { + if (state->id_tx_head) { + state->id_tx_tail->swqe_next = + SWQE_TO_WQE(node); + } else { + state->id_tx_head = node; + } + state->id_tx_tail = node; + mutex_exit(&state->id_txpost_lock); + } else { + state->id_tx_busy = 1; + mutex_exit(&state->id_txpost_lock); + ibd_post_send(state, node); + } return (B_TRUE); @@ -5831,65 +5855,118 @@ ibd_m_tx(void *arg, mblk_t *mp) * only Rx completions. */ static uint_t -ibd_intr(char *arg) +ibd_intr(caddr_t arg) { ibd_state_t *state = (ibd_state_t *)arg; - ibd_poll_compq(state, state->id_rcq_hdl); + ibd_poll_rcq(state, state->id_rcq_hdl); return (DDI_INTR_CLAIMED); } /* - * Poll and drain the cq + * Poll and fully drain the send cq */ -static uint_t -ibd_drain_cq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl, ibt_wc_t *wcs, - uint_t numwcs) +static void +ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) { + ibt_wc_t *wcs = state->id_txwcs; + uint_t numwcs = state->id_txwcs_size; ibd_wqe_t *wqe; + ibd_swqe_t *head, *tail; ibt_wc_t *wc; - uint_t total_polled = 0; uint_t num_polled; int i; while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { - total_polled += num_polled; + head = tail = NULL; for (i = 0, wc = wcs; i < num_polled; i++, wc++) { wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; - ASSERT((wqe->w_type == IBD_WQE_SEND) || - (wqe->w_type == IBD_WQE_RECV)); + ASSERT(wqe->w_type == IBD_WQE_SEND); if (wc->wc_status != IBT_WC_SUCCESS) { /* * Channel being torn down. */ if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { - DPRINT(5, "ibd_drain_cq: flush error"); + DPRINT(5, "ibd_drain_scq: flush error"); /* * Only invoke the Tx handler to * release possibly held resources - * like AH refcount etc. Can not - * invoke Rx handler because it might - * try adding buffers to the Rx pool - * when we are trying to deinitialize. + * like AH refcount etc. */ - if (wqe->w_type == IBD_WQE_RECV) { - continue; - } else { - DPRINT(10, "ibd_drain_cq: Bad " - "status %d", wc->wc_status); - } + DPRINT(10, "ibd_drain_scq: Bad " + "status %d", wc->wc_status); } + return; /* give up. no need to clean up */ } - if (wqe->w_type == IBD_WQE_SEND) { - ibd_tx_cleanup(state, WQE_TO_SWQE(wqe)); - } else { - ibd_process_rx(state, WQE_TO_RWQE(wqe), wc); - } + /* + * Add this swqe to the list to be cleaned up. + */ + if (head) + tail->swqe_next = wqe; + else + head = WQE_TO_SWQE(wqe); + tail = WQE_TO_SWQE(wqe); } + tail->swqe_next = NULL; + ibd_tx_cleanup_list(state, head, tail); + + /* + * Resume any blocked transmissions if possible + */ + ibd_resume_transmission(state); } +} + +/* + * Poll and fully drain the receive cq + */ +static void +ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) +{ + ibt_wc_t *wcs = state->id_rxwcs; + uint_t numwcs = state->id_rxwcs_size; + ibd_wqe_t *wqe; + ibt_wc_t *wc; + uint_t num_polled; + int i; + mblk_t *head, *tail, *mp; + + while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { + head = tail = NULL; + for (i = 0, wc = wcs; i < num_polled; i++, wc++) { + wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; + ASSERT(wqe->w_type == IBD_WQE_RECV); + if (wc->wc_status != IBT_WC_SUCCESS) { + /* + * Channel being torn down. 
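At the end of ibd_send(), the new id_tx_busy hand-off lets exactly one thread act as the poster while others simply chain their requests and return. A simplified pthread-based model follows; post_one_and_drain() is a stub standing in for the batching loop in ibd_post_send().

#include <stddef.h>
#include <pthread.h>

typedef struct swqe {
	struct swqe	*next;
} swqe_t;

typedef struct txq {
	pthread_mutex_t	lock;
	swqe_t		*head;
	swqe_t		*tail;
	int		busy;
} txq_t;

static void post_one_and_drain(txq_t *, swqe_t *);

static void
tx_submit(txq_t *q, swqe_t *node)
{
	node->next = NULL;
	pthread_mutex_lock(&q->lock);
	if (q->busy) {
		/* someone else is posting; leave the work for them */
		if (q->head != NULL)
			q->tail->next = node;
		else
			q->head = node;
		q->tail = node;
		pthread_mutex_unlock(&q->lock);
		return;
	}
	q->busy = 1;
	pthread_mutex_unlock(&q->lock);
	post_one_and_drain(q, node);	/* clears busy when the queue empties */
}

static void
post_one_and_drain(txq_t *q, swqe_t *node)
{
	/* post 'node', then drain q->head in batches; simplified to a stub */
	(void) node;
	pthread_mutex_lock(&q->lock);
	q->head = q->tail = NULL;	/* pretend everything was posted */
	q->busy = 0;
	pthread_mutex_unlock(&q->lock);
}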
+ */ + if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { + DPRINT(5, "ibd_drain_rcq: flush error"); + /* + * Do not invoke Rx handler because + * it might add buffers to the Rx pool + * when we are trying to deinitialize. + */ + continue; + } + } + mp = ibd_process_rx(state, WQE_TO_RWQE(wqe), wc); + if (mp == NULL) + continue; - return (total_polled); + /* + * Add this mp to the list to send to the nw layer. + */ + if (head) + tail->b_next = mp; + else + head = mp; + tail = mp; + } + if (head) + mac_rx(state->id_mh, state->id_rh, head); + } } /* @@ -5897,68 +5974,41 @@ ibd_drain_cq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl, ibt_wc_t *wcs, * for all completed wqe's while detaching. */ static void -ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) +ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) { - ibt_wc_t *wcs; - uint_t numwcs; int flag, redo_flag; int redo = 1; - uint_t num_polled = 0; - if (ibd_separate_cqs == 1) { - if (cq_hdl == state->id_rcq_hdl) { - flag = IBD_RX_CQ_POLLING; - redo_flag = IBD_REDO_RX_CQ_POLLING; - } else { - flag = IBD_TX_CQ_POLLING; - redo_flag = IBD_REDO_TX_CQ_POLLING; - } - } else { - flag = IBD_RX_CQ_POLLING | IBD_TX_CQ_POLLING; - redo_flag = IBD_REDO_RX_CQ_POLLING | IBD_REDO_TX_CQ_POLLING; - } + flag = IBD_CQ_POLLING; + redo_flag = IBD_REDO_CQ_POLLING; - mutex_enter(&state->id_cq_poll_lock); - if (state->id_cq_poll_busy & flag) { - state->id_cq_poll_busy |= redo_flag; - mutex_exit(&state->id_cq_poll_lock); + mutex_enter(&state->id_scq_poll_lock); + if (state->id_scq_poll_busy & flag) { + ibd_print_warn(state, "ibd_poll_scq: multiple polling threads"); + state->id_scq_poll_busy |= redo_flag; + mutex_exit(&state->id_scq_poll_lock); return; } - state->id_cq_poll_busy |= flag; - mutex_exit(&state->id_cq_poll_lock); + state->id_scq_poll_busy |= flag; + mutex_exit(&state->id_scq_poll_lock); /* * In some cases (eg detaching), this code can be invoked on * any cpu after disabling cq notification (thus no concurrency * exists). Apart from that, the following applies normally: - * The receive completion handling is always on the Rx interrupt - * cpu. Transmit completion handling could be from any cpu if + * Transmit completion handling could be from any cpu if * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ - * is interrupt driven. Combined completion handling is always - * on the interrupt cpu. Thus, lock accordingly and use the - * proper completion array. - */ - if (ibd_separate_cqs == 1) { - if (cq_hdl == state->id_rcq_hdl) { - wcs = state->id_rxwcs; - numwcs = state->id_rxwcs_size; - } else { - wcs = state->id_txwcs; - numwcs = state->id_txwcs_size; - } - } else { - wcs = state->id_rxwcs; - numwcs = state->id_rxwcs_size; - } + * is interrupt driven. + */ /* * Poll and drain the CQ */ - num_polled = ibd_drain_cq(state, cq_hdl, wcs, numwcs); + ibd_drain_scq(state, cq_hdl); /* * Enable CQ notifications and redrain the cq to catch any - * completions we might have missed after the ibd_drain_cq() + * completions we might have missed after the ibd_drain_scq() * above and before the ibt_enable_cq_notify() that follows. * Finally, service any new requests to poll the cq that * could've come in after the ibt_enable_cq_notify(). 
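Both ibd_poll_scq() and ibd_poll_rcq() now follow the same busy/redo protocol: a second caller only sets the redo bit and leaves, while the active poller keeps re-arming notifications and re-draining until no redo request remains. A user-space sketch of that protocol, with drain_cq() and enable_cq_notify() as empty stand-ins:

#include <pthread.h>

#define	CQ_POLLING	0x1
#define	REDO_CQ_POLLING	0x2

typedef struct cq_poll {
	pthread_mutex_t	lock;
	int		busy;
} cq_poll_t;

static void drain_cq(void);
static void enable_cq_notify(void);

static void
poll_cq(cq_poll_t *p)
{
	int redo = 1;

	pthread_mutex_lock(&p->lock);
	if (p->busy & CQ_POLLING) {
		p->busy |= REDO_CQ_POLLING;	/* active poller will rerun */
		pthread_mutex_unlock(&p->lock);
		return;
	}
	p->busy |= CQ_POLLING;
	pthread_mutex_unlock(&p->lock);

	drain_cq();
	do {
		/* re-arm, then re-drain to close the notification race */
		enable_cq_notify();
		drain_cq();

		pthread_mutex_lock(&p->lock);
		if (p->busy & REDO_CQ_POLLING) {
			p->busy &= ~REDO_CQ_POLLING;
		} else {
			p->busy &= ~CQ_POLLING;
			redo = 0;
		}
		pthread_mutex_unlock(&p->lock);
	} while (redo);
}

static void drain_cq(void) { }
static void enable_cq_notify(void) { }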
@@ -5969,26 +6019,73 @@ ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); } - num_polled += ibd_drain_cq(state, cq_hdl, wcs, numwcs); + ibd_drain_scq(state, cq_hdl); - mutex_enter(&state->id_cq_poll_lock); - if (state->id_cq_poll_busy & redo_flag) - state->id_cq_poll_busy &= ~redo_flag; + mutex_enter(&state->id_scq_poll_lock); + if (state->id_scq_poll_busy & redo_flag) + state->id_scq_poll_busy &= ~redo_flag; else { - state->id_cq_poll_busy &= ~flag; + state->id_scq_poll_busy &= ~flag; redo = 0; } - mutex_exit(&state->id_cq_poll_lock); + mutex_exit(&state->id_scq_poll_lock); } while (redo); +} + +/* + * Common code for interrupt handling as well as for polling + * for all completed wqe's while detaching. + */ +static void +ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq) +{ + int flag, redo_flag; + int redo = 1; + + flag = IBD_CQ_POLLING; + redo_flag = IBD_REDO_CQ_POLLING; + + mutex_enter(&state->id_rcq_poll_lock); + if (state->id_rcq_poll_busy & flag) { + ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads"); + state->id_rcq_poll_busy |= redo_flag; + mutex_exit(&state->id_rcq_poll_lock); + return; + } + state->id_rcq_poll_busy |= flag; + mutex_exit(&state->id_rcq_poll_lock); /* - * If we polled the receive cq and found anything, we need to flush - * it out to the nw layer here. + * Poll and drain the CQ */ - if ((flag & IBD_RX_CQ_POLLING) && (num_polled > 0)) { - ibd_flush_rx(state, NULL); - } + ibd_drain_rcq(state, rcq); + + /* + * Enable CQ notifications and redrain the cq to catch any + * completions we might have missed after the ibd_drain_cq() + * above and before the ibt_enable_cq_notify() that follows. + * Finally, service any new requests to poll the cq that + * could've come in after the ibt_enable_cq_notify(). + */ + do { + if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) != + IBT_SUCCESS) { + DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); + } + + ibd_drain_rcq(state, rcq); + + mutex_enter(&state->id_rcq_poll_lock); + if (state->id_rcq_poll_busy & redo_flag) + state->id_rcq_poll_busy &= ~redo_flag; + else { + state->id_rcq_poll_busy &= ~flag; + redo = 0; + } + mutex_exit(&state->id_rcq_poll_lock); + + } while (redo); } /* @@ -6012,6 +6109,65 @@ ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe) swqe->w_swr.wr_nds = 0; } +static void +ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace) +{ + /* + * The recycling logic can be eliminated from here + * and put into the async thread if we create another + * list to hold ACE's for unjoined mcg's. + */ + if (DEC_REF_DO_CYCLE(ace)) { + ibd_mce_t *mce; + + /* + * Check with the lock taken: we decremented + * reference count without the lock, and some + * transmitter might already have bumped the + * reference count (possible in case of multicast + * disable when we leave the AH on the active + * list). If not still 0, get out, leaving the + * recycle bit intact. + * + * Atomically transition the AH from active + * to free list, and queue a work request to + * leave the group and destroy the mce. No + * transmitter can be looking at the AH or + * the MCE in between, since we have the + * ac_mutex lock. In the SendOnly reap case, + * it is not necessary to hold the ac_mutex + * and recheck the ref count (since the AH was + * taken off the active list), we just do it + * to have uniform processing with the Full + * reap case. 
+ */ + mutex_enter(&state->id_ac_mutex); + mce = ace->ac_mce; + if (GET_REF_CYCLE(ace) == 0) { + CLEAR_REFCYCLE(ace); + /* + * Identify the case of fullmember reap as + * opposed to mcg trap reap. Also, port up + * might set ac_mce to NULL to indicate Tx + * cleanup should do no more than put the + * AH in the free list (see ibd_async_link). + */ + if (mce != NULL) { + ace->ac_mce = NULL; + IBD_ACACHE_PULLOUT_ACTIVE(state, ace); + /* + * mc_req was initialized at mce + * creation time. + */ + ibd_queue_work_slot(state, + &mce->mc_req, IBD_ASYNC_REAP); + } + IBD_ACACHE_INSERT_FREE(state, ace); + } + mutex_exit(&state->id_ac_mutex); + } +} + /* * Common code that deals with clean ups after a successful or * erroneous transmission attempt. @@ -6051,89 +6207,66 @@ ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe) * ibd_send() error path. */ if (ace != NULL) { - /* - * The recycling logic can be eliminated from here - * and put into the async thread if we create another - * list to hold ACE's for unjoined mcg's. - */ - if (DEC_REF_DO_CYCLE(ace)) { - ibd_mce_t *mce; - - /* - * Check with the lock taken: we decremented - * reference count without the lock, and some - * transmitter might alreay have bumped the - * reference count (possible in case of multicast - * disable when we leave the AH on the active - * list). If not still 0, get out, leaving the - * recycle bit intact. - * - * Atomically transition the AH from active - * to free list, and queue a work request to - * leave the group and destroy the mce. No - * transmitter can be looking at the AH or - * the MCE in between, since we have the - * ac_mutex lock. In the SendOnly reap case, - * it is not neccesary to hold the ac_mutex - * and recheck the ref count (since the AH was - * taken off the active list), we just do it - * to have uniform processing with the Full - * reap case. - */ - mutex_enter(&state->id_ac_mutex); - mce = ace->ac_mce; - if (GET_REF_CYCLE(ace) == 0) { - CLEAR_REFCYCLE(ace); - /* - * Identify the case of fullmember reap as - * opposed to mcg trap reap. Also, port up - * might set ac_mce to NULL to indicate Tx - * cleanup should do no more than put the - * AH in the free list (see ibd_async_link). - */ - if (mce != NULL) { - ace->ac_mce = NULL; - IBD_ACACHE_PULLOUT_ACTIVE(state, ace); - /* - * mc_req was initialized at mce - * creation time. - */ - ibd_queue_work_slot(state, - &mce->mc_req, IBD_ASYNC_REAP); - } - IBD_ACACHE_INSERT_FREE(state, ace); - } - mutex_exit(&state->id_ac_mutex); - } + ibd_dec_ref_ace(state, ace); } /* * Release the send wqe for reuse. */ - ibd_release_swqe(state, swqe); + swqe->swqe_next = NULL; + ibd_release_swqe(state, swqe, swqe, 1); } -/* - * Hand off the processed rx mp chain to mac_rx() - */ static void -ibd_flush_rx(ibd_state_t *state, mblk_t *mpc) +ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail) { - if (mpc == NULL) { - mutex_enter(&state->id_rx_lock); + ibd_ace_t *ace; + ibd_swqe_t *swqe; + int n = 0; - mpc = state->id_rx_mp; + DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail); - state->id_rx_mp = NULL; - state->id_rx_mp_tail = NULL; - state->id_rx_mp_len = 0; + for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) { - mutex_exit(&state->id_rx_lock); - } + /* + * If this was a dynamic mapping in ibd_send(), we need to + * unmap here. If this was an lso buffer we'd used for sending, + * we need to release the lso buf to the pool, since the + * resource is scarce. 
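ibd_dec_ref_ace() hinges on a reference count whose high bit marks an AH that should be recycled once the last reference drops (the DEC_REF_DO_CYCLE and GET_REF_CYCLE macros in ibd.h). The sketch below only illustrates the shape of that check; the real macros use atomics, and the zero-test is redone under id_ac_mutex exactly as the comment above describes.

#include <stdint.h>
#include <stdio.h>

#define	CYCLE_BIT	0x80000000u	/* "recycle when idle" marker */

/* returns nonzero when the last reference drops on a marked entry */
static int
dec_ref_do_cycle(uint32_t *ref)
{
	*ref -= 1;			/* atomic in the driver */
	return (*ref == CYCLE_BIT);
}

int
main(void)
{
	uint32_t ref = CYCLE_BIT | 2;	/* marked, two holders */

	printf("%d\n", dec_ref_do_cycle(&ref));	/* 0: one holder left */
	printf("%d\n", dec_ref_do_cycle(&ref));	/* 1: time to recycle */
	return (0);
}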
However, if this was simply a normal + * send using the copybuf (present in each swqe), we don't need + * to release it. + */ + if (swqe->swqe_im_mblk != NULL) { + if (swqe->w_buftype == IBD_WQE_MAPPED) { + ibd_unmap_mem(state, swqe); + } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { + ibd_release_lsobufs(state, + swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); + } + ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); + freemsg(swqe->swqe_im_mblk); + swqe->swqe_im_mblk = NULL; + } - if (mpc) { - mac_rx(state->id_mh, state->id_rh, mpc); + /* + * Drop the reference count on the AH; it can be reused + * now for a different destination if there are no more + * posted sends that will use it. This can be eliminated + * if we can always associate each Tx buffer with an AH. + * The ace can be null if we are cleaning up from the + * ibd_send() error path. + */ + ace = swqe->w_ahandle; + if (ace != NULL) { + ibd_dec_ref_ace(state, ace); + } + n++; } + + /* + * Release the send wqes for reuse. + */ + ibd_release_swqe(state, head, tail, n); } /* @@ -6141,30 +6274,48 @@ ibd_flush_rx(ibd_state_t *state, mblk_t *mpc) * in the format expected by GLD. The received packet has this * format: 2b sap :: 00 :: data. */ -static void +static mblk_t * ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) { ib_header_info_t *phdr; mblk_t *mp; - mblk_t *mpc = NULL; ipoib_hdr_t *ipibp; ipha_t *iphap; ip6_t *ip6h; - int rxcnt, len; + int len; + ib_msglen_t pkt_len = wc->wc_bytes_xfer; + uint32_t bufs; + + atomic_add_32(&state->id_rx_list.dl_cnt, -1); /* * Track number handed to upper layer, and number still * available to receive packets. */ - rxcnt = atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1); - ASSERT(rxcnt >= 0); - atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, 1); + bufs = atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 1); + + /* Never run out of rwqes, use allocb when running low */ + if (bufs >= state->id_rx_bufs_outstanding_limit) { + atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1); + atomic_inc_32(&state->id_rx_allocb); + mp = allocb(pkt_len, BPRI_HI); + if (mp) { + bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len); + ibd_post_recv(state, rwqe); + } else { /* no memory */ + atomic_inc_32(&state->id_rx_allocb_failed); + ibd_post_recv(state, rwqe); + return (NULL); + } + } else { + mp = rwqe->rwqe_im_mblk; + } + /* * Adjust write pointer depending on how much data came in. */ - mp = rwqe->rwqe_im_mblk; - mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer; + mp->b_wptr = mp->b_rptr + pkt_len; /* * Make sure this is NULL or we're in trouble. 
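The rewritten ibd_process_rx() caps how many receive buffers may be loaned upstream: past id_rx_bufs_outstanding_limit it copies the packet into a fresh allocb() block and reposts the rwqe immediately, so the receive ring never runs dry. A simplified sketch of that policy (no atomics; the names and the limit value are illustrative):

#include <stdlib.h>
#include <string.h>

typedef struct rxbuf {
	unsigned char	*data;
	unsigned int	len;
} rxbuf_t;

static unsigned int rx_outstanding;
static const unsigned int rx_outstanding_limit = 1024;

static void repost_rxbuf(rxbuf_t *b) { (void) b; }	/* ibd_post_recv() */

/* returns a buffer to hand upstream, or NULL if memory is exhausted */
static unsigned char *
process_rx(rxbuf_t *rwqe, unsigned int pkt_len)
{
	unsigned char *mp;

	if (++rx_outstanding >= rx_outstanding_limit) {
		rx_outstanding--;
		mp = malloc(pkt_len);		/* allocb() in the driver */
		if (mp != NULL)
			memcpy(mp, rwqe->data, pkt_len);
		repost_rxbuf(rwqe);		/* ring stays full either way */
		return (mp);
	}
	rwqe->len = pkt_len;
	return (rwqe->data);			/* loan the DMA buffer upstream */
}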
@@ -6192,7 +6343,7 @@ ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr, IPOIB_ADDRL) == 0) { freemsg(mp); - return; + return (NULL); } ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src, @@ -6220,32 +6371,9 @@ ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) */ ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t)); if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) { - if (MBLKL(mp) < sizeof (ipoib_hdr_t) + IPV6_HDR_LEN) { - if (!pullupmsg(mp, IPV6_HDR_LEN + - sizeof (ipoib_hdr_t))) { - DPRINT(10, "ibd_process_rx: pullupmsg failed"); - freemsg(mp); - return; - } - ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + - sizeof (ipoib_pgrh_t)); - } ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); len = ntohs(ip6h->ip6_plen); if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { - if (MBLKL(mp) < sizeof (ipoib_hdr_t) + - IPV6_HDR_LEN + len) { - if (!pullupmsg(mp, sizeof (ipoib_hdr_t) + - IPV6_HDR_LEN + len)) { - DPRINT(10, "ibd_process_rx: pullupmsg" - " failed"); - freemsg(mp); - return; - } - ip6h = (ip6_t *)((uchar_t *)mp->b_rptr + - sizeof (ipoib_pgrh_t) + - sizeof (ipoib_hdr_t)); - } /* LINTED: E_CONSTANT_CONDITION */ IBD_PAD_NSNA(ip6h, len, IBD_RECV); } @@ -6254,7 +6382,7 @@ ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) /* * Update statistics */ - atomic_add_64(&state->id_rcv_bytes, wc->wc_bytes_xfer); + atomic_add_64(&state->id_rcv_bytes, pkt_len); atomic_inc_64(&state->id_rcv_pkt); if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) atomic_inc_64(&state->id_brd_rcv); @@ -6278,35 +6406,7 @@ ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0); } - /* - * Add this mp to the list of processed mp's to send to - * the nw layer - */ - mutex_enter(&state->id_rx_lock); - if (state->id_rx_mp) { - ASSERT(state->id_rx_mp_tail != NULL); - state->id_rx_mp_tail->b_next = mp; - } else { - ASSERT(state->id_rx_mp_tail == NULL); - state->id_rx_mp = mp; - } - - state->id_rx_mp_tail = mp; - state->id_rx_mp_len++; - - if (state->id_rx_mp_len >= IBD_MAX_RX_MP_LEN) { - mpc = state->id_rx_mp; - - state->id_rx_mp = NULL; - state->id_rx_mp_tail = NULL; - state->id_rx_mp_len = 0; - } - - mutex_exit(&state->id_rx_lock); - - if (mpc) { - ibd_flush_rx(state, mpc); - } + return (mp); } /* @@ -6325,47 +6425,30 @@ ibd_freemsg_cb(char *arg) if (rwqe->w_freeing_wqe == B_TRUE) { DPRINT(6, "ibd_freemsg: wqe being freed"); return; - } else { - /* - * Upper layer has released held mblk, so we have - * no more use for keeping the old pointer in - * our rwqe. 
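ibd_freemsg_cb() is the desballoc() free routine for a loaned receive buffer: re-wrap the same DMA buffer, repost the rwqe, and only then drop the outstanding count. A plain-C sketch of that flow, with rewrap_buffer(), post_recv() and free_rwqe() as hypothetical stand-ins for the DDI calls:

#include <stddef.h>

typedef struct rwqe {
	unsigned char	*bufaddr;
	void		*im_mblk;	/* non-NULL once re-wrapped */
	int		freeing_wqe;
} rwqe_t;

static void *rewrap_buffer(unsigned char *buf) { return (buf); }
static void post_recv(rwqe_t *r) { (void) r; }
static void free_rwqe(rwqe_t *r) { (void) r; }
static unsigned int rx_outstanding = 1;

static void
freemsg_cb(rwqe_t *rwqe)
{
	if (rwqe->freeing_wqe)
		return;			/* teardown path owns this wqe */

	rwqe->im_mblk = rewrap_buffer(rwqe->bufaddr);
	if (rwqe->im_mblk == NULL) {
		free_rwqe(rwqe);	/* give up; shrink the ring */
		return;
	}
	post_recv(rwqe);
	rx_outstanding--;		/* atomic_add_32(..., -1) in the driver */
}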
- */ - rwqe->rwqe_im_mblk = NULL; } rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb); if (rwqe->rwqe_im_mblk == NULL) { - ibd_delete_rwqe(state, rwqe); ibd_free_rwqe(state, rwqe); DPRINT(6, "ibd_freemsg: desballoc failed"); return; } - if (ibd_post_recv(state, rwqe, B_TRUE) == DDI_FAILURE) { - ibd_delete_rwqe(state, rwqe); - ibd_free_rwqe(state, rwqe); - return; - } + ibd_post_recv(state, rwqe); atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1); } static uint_t -ibd_tx_recycle(char *arg) +ibd_tx_recycle(caddr_t arg) { ibd_state_t *state = (ibd_state_t *)arg; /* * Poll for completed entries */ - ibd_poll_compq(state, state->id_scq_hdl); - - /* - * Resume any blocked transmissions if possible - */ - (void) ibd_resume_transmission(state); + ibd_poll_scq(state, state->id_scq_hdl); return (DDI_INTR_CLAIMED); } diff --git a/usr/src/uts/common/sys/ib/adapters/hermon/hermon_hw.h b/usr/src/uts/common/sys/ib/adapters/hermon/hermon_hw.h index 0062418e04..cb60902f7a 100644 --- a/usr/src/uts/common/sys/ib/adapters/hermon/hermon_hw.h +++ b/usr/src/uts/common/sys/ib/adapters/hermon/hermon_hw.h @@ -1151,13 +1151,19 @@ struct hermon_hw_set_port_s { uint32_t cap_mask; uint32_t rqk :1; /* reset qkey violation cntr */ - uint32_t :15; + uint32_t rcm :1; /* reset capability mask */ + uint32_t :2; + uint32_t vl_cap :4; + uint32_t :4; + uint32_t mtu_cap :4; uint32_t g0 :1; /* set port GUID0 */ uint32_t ng :1; /* set node GUID (all ports) */ uint32_t sig :1; /* set sys image */ uint32_t mg :1; /* change GID table */ uint32_t mp :1; /* change pkey table size */ - uint32_t :11; + uint32_t mvc :1; /* change vl_cap */ + uint32_t mmc :1; /* change mtu_cap */ + uint32_t :9; uint64_t sys_img_guid; @@ -1185,13 +1191,19 @@ struct hermon_hw_set_port_s { }; #else /* BIG ENDIAN */ struct hermon_hw_set_port_s { - uint32_t :11; + uint32_t :9; + uint32_t mmc :1; /* change mtu_cap */ + uint32_t mvc :1; /* change vl_cap */ uint32_t mp :1; /* change pkey table size */ uint32_t mg :1; /* change GID table size */ uint32_t sig :1; /* set sys image GUID */ uint32_t ng :1; /* set node GUID (all ports) */ uint32_t g0 :1; /* set port GUID0 */ - uint32_t :15; + uint32_t mtu_cap :4; + uint32_t :4; + uint32_t vl_cap :4; + uint32_t :2; + uint32_t rcm :1; /* reset capability mask */ uint32_t rqk :1; /* reset qkey violation cntr */ uint32_t cap_mask; diff --git a/usr/src/uts/common/sys/ib/clients/ibd/ibd.h b/usr/src/uts/common/sys/ib/clients/ibd/ibd.h index f994545684..031f748a6e 100644 --- a/usr/src/uts/common/sys/ib/clients/ibd/ibd.h +++ b/usr/src/uts/common/sys/ib/clients/ibd/ibd.h @@ -144,15 +144,12 @@ typedef enum { * Pre-registered copybuf used for send and receive */ typedef struct ibd_copybuf_s { - ibt_mr_hdl_t ic_mr_hdl; ibt_wr_ds_t ic_sgl; - ibt_mr_desc_t ic_mr_desc; uint8_t *ic_bufaddr; } ibd_copybuf_t; typedef struct ibd_wqe_s { struct ibd_wqe_s *w_next; - struct ibd_wqe_s *w_prev; ibd_wqe_type_t w_type; ibd_copybuf_t w_copybuf; mblk_t *im_mblk; @@ -171,7 +168,6 @@ typedef struct ibd_swqe_s { } ibd_swqe_t; #define swqe_next w_ibd_swqe.w_next -#define swqe_prev w_ibd_swqe.w_prev #define swqe_type w_ibd_swqe.w_type #define swqe_copybuf w_ibd_swqe.w_copybuf #define swqe_im_mblk w_ibd_swqe.im_mblk @@ -187,11 +183,9 @@ typedef struct ibd_rwqe_s { ibt_recv_wr_t w_rwr; boolean_t w_freeing_wqe; frtn_t w_freemsg_cb; - ibd_wqe_t *w_post_link; } ibd_rwqe_t; #define rwqe_next w_ibd_rwqe.w_next -#define rwqe_prev w_ibd_rwqe.w_prev #define rwqe_type w_ibd_rwqe.w_type 
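The RX_QUEUE_CACHE_LINE filler pads ibd_rx_queue_t out to one 64-byte cache line, and the header warns that the filler must track any field change. Below is a sketch of the same idea with a compile-time size check; mock_kmutex_t is a stand-in sized like a 64-bit kmutex_t, so the resulting numbers are illustrative only.

#include <stdint.h>

#define	CACHE_LINE	64

typedef uint64_t mock_kmutex_t;	/* stand-in, 8 bytes like kmutex_t on amd64 */

#define	RX_QUEUE_FILLER	\
	(CACHE_LINE - (sizeof (mock_kmutex_t) + 2 * sizeof (void *) + \
	2 * sizeof (uint32_t)))

typedef struct rx_queue {
	mock_kmutex_t	rx_post_lock;
	void		*rx_head;
	void		*rx_tail;
	uint32_t	rx_stat;
	uint32_t	rx_cnt;
	uint8_t		rx_filler[RX_QUEUE_FILLER];
} rx_queue_t;

/* fails to compile if the struct no longer fits a single cache line */
typedef char rx_queue_size_check[(sizeof (rx_queue_t) == CACHE_LINE) ? 1 : -1];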
#define rwqe_copybuf w_ibd_rwqe.w_copybuf #define rwqe_im_mblk w_ibd_rwqe.im_mblk @@ -199,14 +193,13 @@ typedef struct ibd_rwqe_s { #define WQE_TO_RWQE(wqe) (ibd_rwqe_t *)wqe typedef struct ibd_list_s { + kmutex_t dl_mutex; ibd_wqe_t *dl_head; - ibd_wqe_t *dl_tail; union { boolean_t pending_sends; uint32_t bufs_outstanding; } ustat; uint32_t dl_cnt; - kmutex_t dl_mutex; } ibd_list_t; #define dl_pending_sends ustat.pending_sends @@ -240,6 +233,25 @@ typedef struct ibd_lsobkt_s { } ibd_lsobkt_t; /* + * Posting to a single software rx post queue is contentious, + * so break it out to (multiple) an array of queues. + * + * Try to ensure rx_queue structs fall in different cache lines using a filler. + * Note: the RX_QUEUE_CACHE_LINE needs to change if the struct changes. + */ +#define RX_QUEUE_CACHE_LINE \ + (64 - ((sizeof (kmutex_t) + 2 * sizeof (ibd_wqe_t *) + \ + 2 * sizeof (uint32_t)))) +typedef struct ibd_rx_queue_s { + kmutex_t rx_post_lock; + ibd_wqe_t *rx_head; + ibd_wqe_t *rx_tail; + uint32_t rx_stat; + uint32_t rx_cnt; + uint8_t rx_cache_filler[RX_QUEUE_CACHE_LINE]; +} ibd_rx_queue_t; + +/* * This structure maintains information per port per HCA * (per network interface). */ @@ -250,47 +262,59 @@ typedef struct ibd_state_s { ibt_pd_hdl_t id_pd_hdl; kmem_cache_t *id_req_kmc; + ibd_list_t id_tx_rel_list; + uint32_t id_max_sqseg; + uint32_t id_max_sqseg_hiwm; ibd_list_t id_tx_list; ddi_softintr_t id_tx; uint32_t id_tx_sends; + kmutex_t id_txpost_lock; + ibd_swqe_t *id_tx_head; + ibd_swqe_t *id_tx_tail; + int id_tx_busy; + + uint_t id_tx_buf_sz; uint8_t *id_tx_bufs; + ibd_swqe_t *id_tx_wqes; ibt_mr_hdl_t id_tx_mr_hdl; ibt_mr_desc_t id_tx_mr_desc; - uint_t id_tx_buf_sz; kmutex_t id_lso_lock; ibd_lsobkt_t *id_lso; - kmutex_t id_cq_poll_lock; - int id_cq_poll_busy; + kmutex_t id_scq_poll_lock; + int id_scq_poll_busy; ibt_cq_hdl_t id_scq_hdl; ibt_wc_t *id_txwcs; uint32_t id_txwcs_size; - kmutex_t id_txpost_lock; - ibd_swqe_t *id_tx_head; - ibd_wqe_t **id_tx_tailp; - int id_tx_busy; - - kmutex_t id_rxpost_lock; - ibd_rwqe_t *id_rx_head; - ibd_wqe_t **id_rx_tailp; - int id_rx_busy; - - kmutex_t id_rx_lock; - mblk_t *id_rx_mp; - mblk_t *id_rx_mp_tail; - uint32_t id_rx_mp_len; - + kmutex_t id_rx_post_lock; + int id_rx_post_busy; + int id_rx_nqueues; + ibd_rx_queue_t *id_rx_queues; + ibd_wqe_t *id_rx_post_head; + + ibd_rwqe_t *id_rx_wqes; + uint8_t *id_rx_bufs; + ibt_mr_hdl_t id_rx_mr_hdl; + ibt_mr_desc_t id_rx_mr_desc; + uint_t id_rx_buf_sz; uint32_t id_num_rwqe; ibd_list_t id_rx_list; ddi_softintr_t id_rx; - ibt_cq_hdl_t id_rcq_hdl; - ibt_wc_t *id_rxwcs; + uint32_t id_rx_bufs_outstanding_limit; + uint32_t id_rx_allocb; + uint32_t id_rx_allocb_failed; + ibd_list_t id_rx_free_list; + + kmutex_t id_rcq_poll_lock; + int id_rcq_poll_busy; uint32_t id_rxwcs_size; + ibt_wc_t *id_rxwcs; + ibt_cq_hdl_t id_rcq_hdl; ibt_channel_hdl_t id_chnl_hdl; ib_pkey_t id_pkey; @@ -315,6 +339,7 @@ typedef struct ibd_state_s { kt_did_t id_async_thrid; kmutex_t id_ac_mutex; + ibd_ace_t *id_ac_hot_ace; struct list id_ah_active; struct list id_ah_free; ipoib_mac_t id_ah_addr; @@ -337,6 +362,8 @@ typedef struct ibd_state_s { kmutex_t id_sched_lock; int id_sched_needed; + int id_sched_cnt; + int id_sched_lso_cnt; kmutex_t id_link_mutex; link_state_t id_link_state; |
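The new id_ac_hot_ace field presumably caches the most recently used address-cache entry so the common single-destination workload can skip the active-hash lookup. A minimal sketch of such a one-entry hot cache; slow_lookup() is a hypothetical stand-in for the mod_hash search done under id_ac_mutex.

#include <stddef.h>
#include <string.h>

typedef struct ace {
	unsigned char	mac[20];	/* IPoIB address is 20 bytes */
	int		ref;
} ace_t;

static ace_t *hot_ace;

static ace_t *slow_lookup(const unsigned char *mac) { (void) mac; return (NULL); }

static ace_t *
acache_lookup(const unsigned char *mac)
{
	ace_t *ace = hot_ace;

	/* fast path: same destination as the previous send */
	if (ace != NULL && memcmp(ace->mac, mac, sizeof (ace->mac)) == 0) {
		ace->ref++;
		return (ace);
	}
	if ((ace = slow_lookup(mac)) != NULL) {
		ace->ref++;
		hot_ace = ace;		/* remember for next time */
	}
	return (ace);
}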