diff options
author | gg161487 <none@none> | 2008-01-03 17:38:32 -0800 |
---|---|---|
committer | gg161487 <none@none> | 2008-01-03 17:38:32 -0800 |
commit | 87ba907dbdcf5c1692d70a9b22fe1d0e718d9896 (patch) | |
tree | 72931a20716c9a30cab6a442e8e9e30e7901eee3 | |
parent | 8f38d41910063e19709864b025684a228961299f (diff) | |
download | illumos-gate-87ba907dbdcf5c1692d70a9b22fe1d0e718d9896.tar.gz |
PSARC 2007/636 IPoIB Conversion to GLDv3
6445733 IPoIB driver needs to be ported to GLDv3
-rw-r--r-- | usr/src/pkgdefs/SUNWckr/prototype_i386 | 4 | ||||
-rw-r--r-- | usr/src/pkgdefs/SUNWckr/prototype_sparc | 3 | ||||
-rw-r--r-- | usr/src/uts/common/Makefile.files | 4 | ||||
-rw-r--r-- | usr/src/uts/common/io/ib/clients/ibd/ibd.c | 1704 | ||||
-rw-r--r-- | usr/src/uts/common/io/mac/plugins/mac_ib.c | 307 | ||||
-rw-r--r-- | usr/src/uts/common/sys/ib/clients/ibd/ibd.h | 42 | ||||
-rw-r--r-- | usr/src/uts/common/sys/mac_ib.h | 78 | ||||
-rw-r--r-- | usr/src/uts/intel/Makefile.intel.shared | 3 | ||||
-rw-r--r-- | usr/src/uts/intel/ibd/Makefile | 4 | ||||
-rw-r--r-- | usr/src/uts/intel/mac_ib/Makefile | 95 | ||||
-rw-r--r-- | usr/src/uts/sparc/Makefile.sparc.shared | 3 | ||||
-rw-r--r-- | usr/src/uts/sparc/ibd/Makefile | 4 | ||||
-rw-r--r-- | usr/src/uts/sparc/mac_ib/Makefile | 95 |
13 files changed, 1334 insertions, 1012 deletions
diff --git a/usr/src/pkgdefs/SUNWckr/prototype_i386 b/usr/src/pkgdefs/SUNWckr/prototype_i386 index b016c0a3f8..670bc09371 100644 --- a/usr/src/pkgdefs/SUNWckr/prototype_i386 +++ b/usr/src/pkgdefs/SUNWckr/prototype_i386 @@ -20,7 +20,7 @@ # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # ident "%Z%%M% %I% %E% SMI" @@ -166,6 +166,7 @@ d none boot/acpi 755 root sys d none boot/acpi/tables 755 root sys f none kernel/mac/mac_ether 755 root sys f none kernel/mac/mac_wifi 755 root sys +f none kernel/mac/mac_ib 755 root sys d none kernel/misc/scsi_vhci 755 root sys f none kernel/misc/scsi_vhci/scsi_vhci_f_asym_emc 755 root sys f none kernel/misc/scsi_vhci/scsi_vhci_f_asym_lsi 755 root sys @@ -363,6 +364,7 @@ f none kernel/ipp/amd64/ipgpc 755 root sys d none kernel/mac/amd64 755 root sys f none kernel/mac/amd64/mac_ether 755 root sys f none kernel/mac/amd64/mac_wifi 755 root sys +f none kernel/mac/amd64/mac_ib 755 root sys d none kernel/misc/scsi_vhci/amd64 755 root sys f none kernel/misc/scsi_vhci/amd64/scsi_vhci_f_asym_emc 755 root sys f none kernel/misc/scsi_vhci/amd64/scsi_vhci_f_asym_lsi 755 root sys diff --git a/usr/src/pkgdefs/SUNWckr/prototype_sparc b/usr/src/pkgdefs/SUNWckr/prototype_sparc index 73976c4180..43e9f4889e 100644 --- a/usr/src/pkgdefs/SUNWckr/prototype_sparc +++ b/usr/src/pkgdefs/SUNWckr/prototype_sparc @@ -20,7 +20,7 @@ # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # ident "%Z%%M% %I% %E% SMI" @@ -166,6 +166,7 @@ f none kernel/ipp/sparcv9/ipgpc 755 root sys d none kernel/mac/sparcv9 755 root sys f none kernel/mac/sparcv9/mac_ether 755 root sys f none kernel/mac/sparcv9/mac_wifi 755 root sys +f none kernel/mac/sparcv9/mac_ib 755 root sys d none kernel/misc/scsi_vhci 755 root sys d none kernel/misc/scsi_vhci/sparcv9 755 root sys f none kernel/misc/scsi_vhci/sparcv9/scsi_vhci_f_asym_emc 755 root sys diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index c2a2d6d105..916735e982 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -20,7 +20,7 @@ # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # ident "%Z%%M% %I% %E% SMI" @@ -570,6 +570,8 @@ MAC_ETHER_OBJS += mac_ether.o MAC_WIFI_OBJS += mac_wifi.o +MAC_IB_OBJS += mac_ib.o + AGGR_OBJS += aggr_dev.o aggr_ctl.o aggr_grp.o aggr_port.o \ aggr_send.o aggr_recv.o aggr_lacp.o diff --git a/usr/src/uts/common/io/ib/clients/ibd/ibd.c b/usr/src/uts/common/io/ib/clients/ibd/ibd.c index 6cbf736013..323ab3d48a 100644 --- a/usr/src/uts/common/io/ib/clients/ibd/ibd.c +++ b/usr/src/uts/common/io/ib/clients/ibd/ibd.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -86,11 +86,12 @@ typedef enum {IBD_LINK_DOWN, IBD_LINK_UP, IBD_LINK_UP_ABSENT} ibd_link_op_t; */ static uint_t ibd_rx_threshold = 16; static uint_t ibd_tx_current_copy_threshold = 0x10000000; -static uint_t ibd_num_rwqe = 4095; /* 1 less than max Tavor CQsize */ -static uint_t ibd_num_swqe = 4095; /* 1 less than max Tavor CQsize */ +/* should less than max Tavor CQsize and be 2^n - 1 */ +static uint_t ibd_num_rwqe = 511; +static uint_t ibd_num_swqe = 511; static uint_t ibd_num_ah = 16; static uint_t ibd_hash_size = 16; -static uint_t ibd_srv_fifos = 0xffff; +static uint_t ibd_srv_fifos = 0x0; static uint_t ibd_fifo_depth = 0; static ibd_csum_type_t ibd_csum_send = IBD_CSUM_NONE; static ibd_csum_type_t ibd_csum_recv = IBD_CSUM_NONE; @@ -117,6 +118,15 @@ static uint_t ibd_separate_cqs = 1; static uint_t ibd_txcomp_poll = 0; /* + * the softintr is introduced to avoid Event Queue overflow. It + * should not have heavy load in CQ event handle function. + * If service fifos is enabled, this is not required, because + * mac_rx() will be called by service threads. + */ +static uint_t ibd_rx_softintr = 1; +static uint_t ibd_tx_softintr = 1; + +/* * Initial number of IBA resources allocated. */ #define IBD_NUM_RWQE ibd_num_rwqe @@ -138,28 +148,37 @@ static uint_t ibd_txcomp_poll = 0; */ #define IBD_HASH_SIZE ibd_hash_size +#define IBD_TXPOLL_THRESHOLD 64 /* - * Size of completion array to be filled by a single poll call. + * PAD routine called during send/recv context */ -#define IBD_WC_SIZE 16 +#define IBD_SEND 0 +#define IBD_RECV 1 /* - * We poll every (IBD_TXPOLL_MASK + 1) sends for completions. This - * is based on our above completion array size. + * fill / clear in <scope> and <p_key> in multicast/broadcast address. */ -#define IBD_TXPOLL_MASK 0xf +#define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey) \ + { \ + *(uint32_t *)((char *)(maddr) + 4) |= \ + htonl((uint32_t)(scope) << 16); \ + *(uint32_t *)((char *)(maddr) + 8) |= \ + htonl((uint32_t)(pkey) << 16); \ + } -/* - * Number of payload areas the MDT code can support. Choose the same value - * that we know is supported by TCP/MDT. - */ -#define IBD_MDTMAX_SEGS 16 +#define IBD_CLEAR_SCOPE_PKEY(maddr) \ + { \ + *(uint32_t *)((char *)(maddr) + 4) &= \ + htonl(~((uint32_t)0xF << 16)); \ + *(uint32_t *)((char *)(maddr) + 8) &= \ + htonl(~((uint32_t)0xFFFF << 16)); \ + } /* - * PAD routine called during send/recv context + * when free tx wqes >= threshold and reschedule flag is set, + * ibd will call mac_tx_update to re-enable Tx. */ -#define IBD_SEND 0 -#define IBD_RECV 1 +#define IBD_TX_UPDATE_THRESHOLD 1 /* Driver State Pointer */ void *ibd_list; @@ -168,21 +187,20 @@ void *ibd_list; static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); -/* Required driver entry points for GLD */ -static int ibd_reset(gld_mac_info_t *); -static int ibd_start(gld_mac_info_t *); -static int ibd_stop(gld_mac_info_t *); -static int ibd_set_mac_addr(gld_mac_info_t *, unsigned char *); -static int ibd_set_multicast(gld_mac_info_t *, unsigned char *, int); -static int ibd_set_promiscuous(gld_mac_info_t *, int); -static int ibd_get_stats(gld_mac_info_t *, struct gld_stats *); -static int ibd_send(gld_mac_info_t *, mblk_t *); -static int ibd_mdt_pre(gld_mac_info_t *, mblk_t *, void **); -static void ibd_mdt_txone(gld_mac_info_t *, void *, pdescinfo_t *); -static void ibd_mdt_post(gld_mac_info_t *, mblk_t *, void *); -static uint_t ibd_intr(gld_mac_info_t *); - -/* Private driver entry points for GLD */ +/* Required driver entry points for GLDv3 */ +static int ibd_m_start(void *); +static void ibd_m_stop(void *); +static int ibd_m_unicst(void *, const uint8_t *); +static int ibd_m_multicst(void *, boolean_t, const uint8_t *); +static int ibd_m_promisc(void *, boolean_t); +static int ibd_m_stat(void *, uint_t, uint64_t *); +static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *); +static mblk_t *ibd_m_tx(void *, mblk_t *); + +/* Private driver entry points for GLDv3 */ +static boolean_t ibd_send(ibd_state_t *, mblk_t *); +static uint_t ibd_intr(char *); +static uint_t ibd_tx_recycle(char *); static int ibd_state_init(ibd_state_t *, dev_info_t *); static void ibd_state_fini(ibd_state_t *); static int ibd_drv_init(ibd_state_t *); @@ -196,7 +214,7 @@ static void ibd_fini_txlist(ibd_state_t *); static int ibd_init_rxlist(ibd_state_t *); static void ibd_fini_rxlist(ibd_state_t *); static void ibd_freemsg_cb(char *); -static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *, boolean_t); +static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *); static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *); static int ibd_alloc_swqe(ibd_state_t *, ibd_swqe_t **); static void ibd_free_swqe(ibd_state_t *, ibd_swqe_t *); @@ -208,8 +226,8 @@ static int ibd_acache_init(ibd_state_t *); static void ibd_acache_fini(ibd_state_t *); static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t); static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t); -static void ibd_async_unsetprom(ibd_state_t *, boolean_t); -static void ibd_async_setprom(ibd_state_t *, boolean_t); +static void ibd_async_unsetprom(ibd_state_t *); +static void ibd_async_setprom(ibd_state_t *); static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int); static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *); static void ibd_async_txsched(ibd_state_t *); @@ -230,93 +248,19 @@ static uint64_t ibd_get_portspeed(ibd_state_t *); static void ibd_perf(ibd_state_t *); #endif -/* Streams Module Info */ -static struct module_info ibd_minfo = { - IBD_IDNUM, /* module ID Number */ - "ibd", /* module name */ - 0, /* min packet size */ - INFPSZ, /* maximum packet size */ - IBD_HIWAT, /* high water mark */ - IBD_LOWAT /* low water mark */ -}; - -/* Streams Read Queue */ -static struct qinit ibd_rdinit = { - NULL, /* put */ - gld_rsrv, /* service */ - gld_open, /* open */ - gld_close, /* close */ - NULL, /* unused */ - &ibd_minfo, /* parameters */ - NULL /* statistics */ -}; - -/* Streams Write Queue */ -static struct qinit ibd_wrinit = { - gld_wput, /* put */ - gld_wsrv, /* service */ - NULL, /* open */ - NULL, /* close */ - NULL, /* unused */ - &ibd_minfo, /* parameters */ - NULL /* statistics */ -}; - -/* Stream Operations */ -static struct streamtab ibd_streamtab = { - &ibd_rdinit, /* read queue */ - &ibd_wrinit, /* write queue */ - NULL, /* lower read queue (MUX) */ - NULL /* lower write queue (MUX) */ -}; - -/* Character/Block Operations */ -static struct cb_ops ibd_cb_ops = { - nulldev, /* open */ - nulldev, /* close */ - nodev, /* strategy (block) */ - nodev, /* print (block) */ - nodev, /* dump (block) */ - nodev, /* read */ - nodev, /* write */ - nodev, /* ioctl */ - nodev, /* devmap */ - nodev, /* mmap */ - nodev, /* segmap */ - nochpoll, /* chpoll */ - ddi_prop_op, /* prop_op */ - &ibd_streamtab, /* streams */ - D_MP | D_64BIT, /* flags */ - CB_REV /* rev */ -}; - -/* Driver Operations */ -static struct dev_ops ibd_dev_ops = { - DEVO_REV, /* struct rev */ - 0, /* refcnt */ - gld_getinfo, /* getinfo */ - nulldev, /* identify */ - nulldev, /* probe */ - ibd_attach, /* attach */ - ibd_detach, /* detach */ - nodev, /* reset */ - &ibd_cb_ops, /* cb_ops */ - NULL, /* bus_ops */ - nodev /* power */ -}; +DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach, + nodev, NULL, D_MP, NULL); /* Module Driver Info */ static struct modldrv ibd_modldrv = { - &mod_driverops, - "InfiniBand DLPI Driver %I%", - &ibd_dev_ops + &mod_driverops, /* This one is a driver */ + "InfiniBand GLDv3 Driver 1.3", /* short description */ + &ibd_dev_ops /* driver specific ops */ }; /* Module Linkage */ static struct modlinkage ibd_modlinkage = { - MODREV_1, - &ibd_modldrv, - NULL + MODREV_1, (void *)&ibd_modldrv, NULL }; /* @@ -341,7 +285,6 @@ static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = { #define ASYNC_PROMON 4 #define ASYNC_PROMOFF 5 #define ASYNC_REAP 6 -#define ASYNC_POKE 7 #define ASYNC_TRAP 8 #define ASYNC_SCHED 9 #define ASYNC_LINK 10 @@ -358,16 +301,31 @@ static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = { #define IB_MCGID_IPV4_LOW_GROUP_MASK 0xFFFFFFFF +#define IBD_M_CALLBACK_FLAGS (MC_GETCAPAB) +static mac_callbacks_t ib_m_callbacks = { + IBD_M_CALLBACK_FLAGS, + ibd_m_stat, + ibd_m_start, + ibd_m_stop, + ibd_m_promisc, + ibd_m_multicst, + ibd_m_unicst, + ibd_m_tx, + NULL, + NULL, + ibd_m_getcapab +}; + #ifdef DEBUG static int rxpack = 1, txpack = 1; -int debuglevel = 100; +int ibd_debuglevel = 100; static void debug_print(int l, char *fmt, ...) { va_list ap; - if (l < debuglevel) + if (l < ibd_debuglevel) return; va_start(ap, fmt); vcmn_err(CE_CONT, fmt, ap); @@ -400,9 +358,9 @@ ibd_print_warn(ibd_state_t *state, char *fmt, ...) hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 0, "hca-guid", 0); len = snprintf(ibd_print_buf, sizeof (ibd_print_buf), - "%s%d: HCA GUID %016llx port %d PKEY %02x ", ibd_minfo.mi_idname, - state->id_macinfo->gldm_ppa, (u_longlong_t)hca_guid, - state->id_port, state->id_pkey); + "%s%d: HCA GUID %016llx port %d PKEY %02x ", + ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip), + (u_longlong_t)hca_guid, state->id_port, state->id_pkey); va_start(ap, fmt); (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len, fmt, ap); @@ -419,14 +377,6 @@ _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, ibd_state_t::id_acache_req_cv)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, - ibd_state_t::id_multi_req)) -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, - ibd_state_t::id_multi_addr)) -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, - ibd_state_t::id_multi_op)) -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, - ibd_state_t::id_multi_queued)) -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, ibd_state_t::id_mc_full)) _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, ibd_state_t::id_mc_non)) @@ -437,7 +387,6 @@ _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex, ibd_state_s::id_rx_list)) -_NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_multi_op)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_ah_error)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_ah_op)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_state_s::id_num_intrs)) @@ -464,7 +413,6 @@ _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mce_t::mc_jstate)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", msgb::b_rptr)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", msgb::b_wptr)) -_NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", gld_stats)) _NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", callb_cpr::cc_id)) #ifdef DEBUG @@ -482,7 +430,7 @@ _init() * only makes sense with separate CQs for Tx and Rx. */ if ((ibd_txcomp_poll == 1) && (ibd_separate_cqs == 0)) { - cmn_err(CE_NOTE, "!%s: %s", ibd_minfo.mi_idname, + cmn_err(CE_NOTE, "!ibd: %s", "Setting ibd_txcomp_poll = 0 for combined CQ"); ibd_txcomp_poll = 0; } @@ -493,10 +441,12 @@ _init() return (status); } + mac_init_ops(&ibd_dev_ops, "ibd"); status = mod_install(&ibd_modlinkage); if (status != 0) { DPRINT(10, "_init:failed in mod_install()"); ddi_soft_state_fini(&ibd_list); + mac_fini_ops(&ibd_dev_ops); return (status); } @@ -518,6 +468,7 @@ _fini() if (status != 0) return (status); + mac_fini_ops(&ibd_dev_ops); ddi_soft_state_fini(&ibd_list); return (0); } @@ -682,42 +633,6 @@ punt_recv: \ ; \ } -#define IBD_CKSUM_MDT(mp, dlmdp, np, stp, stfp, ep, vp, fp) { \ - /* \ - * Query IP whether Tx cksum needs to be done. \ - */ \ - if (ibd_csum_send != IBD_CSUM_NONE) \ - hcksum_retrieve(mp, dlmdp, np, stp, stfp, ep, vp, fp); \ -} - -#define IBD_CKSUM_MDT_PACKET(pinfo, st, stf, fl) { \ - if ((ibd_csum_send != IBD_CSUM_NONE) && \ - (fl == HCK_PARTIALCKSUM)) { \ - extern uint_t bcksum(uchar_t *, int, uint32_t); \ - uint16_t *up; \ - uint32_t sum; \ - uchar_t *hp = (pinfo)->hdr_rptr + IPOIB_HDRSIZE; \ - int k; \ - \ - up = (uint16_t *)(hp + stf); \ - if (ibd_csum_send == IBD_CSUM_PARTIAL) { \ - sum = *up; \ - *up = 0; \ - sum = IP_BCSUM_PARTIAL(hp + st, \ - PDESC_HDRL(pinfo) - st - IPOIB_HDRSIZE, \ - sum); \ - for (k = 0; k < pinfo->pld_cnt; k++) \ - sum = IP_BCSUM_PARTIAL(pinfo->pld_ary[k].\ - pld_rptr, PDESC_PLDL(pinfo, k), \ - sum); \ - } else { \ - sum = *up; \ - } \ - sum = ~(sum); \ - *(up) = (uint16_t)((sum) ? (sum) : ~(sum)); \ - } \ -} - /* * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at * front of optional src/tgt link layer address. Right now Solaris inserts @@ -952,10 +867,10 @@ drain_fifo(p_srv_fifo_t handle) state = (ibd_state_t *)_ddi_srv_fifo_begin(handle); while (_ddi_get_fifo(handle, (p_fifo_obj_t)&mp) == DDI_SUCCESS) { /* - * Hand off to GLD. + * Hand off to GLDv3. */ IBD_CKSUM_RECV(mp); - gld_recv(state->id_macinfo, mp); + mac_rx(state->id_mh, NULL, mp); } _ddi_srv_fifo_end(handle); } @@ -1064,7 +979,7 @@ ibd_send_up(ibd_state_t *state, mblk_t *mp) if (nfifos == 0) { hand_off: IBD_CKSUM_RECV(mp); - gld_recv(state->id_macinfo, mp); + mac_rx(state->id_mh, NULL, mp); return; } @@ -1409,18 +1324,16 @@ ibd_async_work(ibd_state_t *state) case ASYNC_GETAH: ibd_async_acache(state, &ptr->rq_mac); break; - case ASYNC_POKE: - /* - * We need the gld_sched; that - * happens below. No locks are - * needed for the multi_op update. - */ - state->id_multi_op = NOTSTARTED; - break; case ASYNC_REAP: ibd_async_reap_group(state, ptr->rq_ptr, ptr->rq_gid, IB_MC_JSTATE_FULL); + /* + * the req buf contains in mce + * structure, so we do not need + * to free it here. + */ + ptr = NULL; break; case ASYNC_LEAVE: case ASYNC_JOIN: @@ -1428,10 +1341,10 @@ ibd_async_work(ibd_state_t *state) ptr->rq_gid, ptr->rq_op); break; case ASYNC_PROMON: - ibd_async_setprom(state, B_TRUE); + ibd_async_setprom(state); break; case ASYNC_PROMOFF: - ibd_async_unsetprom(state, B_TRUE); + ibd_async_unsetprom(state); break; case ASYNC_TRAP: ibd_async_trap(state, ptr); @@ -1447,21 +1360,10 @@ ibd_async_work(ibd_state_t *state) #ifndef __lock_lint CALLB_CPR_EXIT(&cprinfo); #endif /* !__lock_lint */ - _NOTE(NOT_REACHED) return; } - - /* - * Indicate blocked operation can now be retried. - * Note gld_sched() gets the gld_maclock, - * and the multicast/promiscuous paths - * (ibd_set_multicast(), ibd_set_promiscuous()) - * grab id_acache_req_lock in ibd_queue_work_slot() - * with gld_maclock held, so we must not hold the - * id_acache_req_lock while calling gld_sched to - * prevent deadlock. - */ - gld_sched(state->id_macinfo); + if (ptr != NULL) + kmem_cache_free(state->id_req_kmc, ptr); mutex_enter(&state->id_acache_req_lock); } else { @@ -1478,6 +1380,7 @@ ibd_async_work(ibd_state_t *state) } } /*NOTREACHED*/ + _NOTE(NOT_REACHED) } /* @@ -1656,6 +1559,7 @@ static ibd_ace_t * ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) { ibd_ace_t *ptr; + ibd_req_t *req; /* * Only attempt to print when we can; in the mdt pattr case, the @@ -1681,21 +1585,25 @@ ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) * to ongoing state. Remember in id_ah_addr for which address * we are queueing the request, in case we need to flag an error; * Any further requests, for the same or different address, until - * the operation completes, is sent back to GLD to be retried. + * the operation completes, is sent back to GLDv3 to be retried. * The async thread will update id_ah_op with an error indication * or will set it to indicate the next look up can start; either - * way, it will gld_sched() so that all blocked requests come + * way, it will mac_tx_update() so that all blocked requests come * back here. */ - *err = GLD_NORESOURCES; + *err = EAGAIN; if (state->id_ah_op == NOTSTARTED) { - /* - * We did not even find the entry; queue a request for it. - */ - bcopy(mac, &(state->id_ah_req.rq_mac), IPOIB_ADDRL); - ibd_queue_work_slot(state, &state->id_ah_req, ASYNC_GETAH); - state->id_ah_op = ONGOING; - bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL); + req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); + if (req != NULL) { + /* + * We did not even find the entry; queue a request + * for it. + */ + bcopy(mac, &(req->rq_mac), IPOIB_ADDRL); + ibd_queue_work_slot(state, req, ASYNC_GETAH); + state->id_ah_op = ONGOING; + bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL); + } } else if ((state->id_ah_op != ONGOING) && (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) { /* @@ -1703,7 +1611,7 @@ ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) * we had queued before. */ if (state->id_ah_op == ERRORED) { - *err = GLD_FAILURE; + *err = EFAULT; state->id_ah_error++; } else { /* @@ -1731,14 +1639,6 @@ ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) } mutex_exit(&state->id_ac_mutex); - /* - * The PathRecord lookup failed; retry any other blocked - * Tx requests that might have come in between when we - * initiated the path lookup and now that were sent back - * to GLD to implement single outstanding lookup scheme. - */ - if (*err == GLD_FAILURE) - gld_sched(state->id_macinfo); return (ptr); } @@ -2068,8 +1968,8 @@ static void ibd_async_link(ibd_state_t *state, ibd_req_t *req) { ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr; - int32_t lstate = (opcode == IBD_LINK_DOWN) ? GLD_LINKSTATE_DOWN : - GLD_LINKSTATE_UP; + link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN : + LINK_STATE_UP; ibd_mce_t *mce, *pmce; ibd_ace_t *ace, *pace; @@ -2079,7 +1979,7 @@ ibd_async_link(ibd_state_t *state, ibd_req_t *req) * On a link up, revalidate the link speed/width. No point doing * this on a link down, since we will be unable to do SA operations, * defaulting to the lowest speed. Also notice that we update our - * notion of speed before calling gld_linkstate(), which will do + * notion of speed before calling mac_link_update(), which will do * neccesary higher level notifications for speed changes. */ if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) { @@ -2100,12 +2000,12 @@ ibd_async_link(ibd_state_t *state, ibd_req_t *req) /* * Drop all nonmembership. */ - ibd_async_unsetprom(state, B_FALSE); + ibd_async_unsetprom(state); /* * Then, try to regain nonmembership to all mcg's. */ - ibd_async_setprom(state, B_FALSE); + ibd_async_setprom(state); } @@ -2167,21 +2067,16 @@ ibd_async_link(ibd_state_t *state, ibd_req_t *req) } /* - * Macinfo is guaranteed to exist since driver does ibt_close_hca() + * mac handle is guaranteed to exist since driver does ibt_close_hca() * (which stops further events from being delivered) before - * gld_mac_free(). At this point, it is guaranteed that gld_register + * mac_unreigster(). At this point, it is guaranteed that mac_register * has already been done. */ mutex_enter(&state->id_link_mutex); state->id_link_state = lstate; - gld_linkstate(state->id_macinfo, lstate); + mac_link_update(state->id_mh, lstate); mutex_exit(&state->id_link_mutex); - /* - * Free the request slot allocated by the event thread. - */ - kmem_free(req, sizeof (ibd_req_t)); - ibd_async_done(state); } @@ -2226,7 +2121,7 @@ ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) * If the init code in ibd_drv_init hasn't yet set up the * pkey/gid, nothing to do; that code will set the link state. */ - if (state->id_link_state == GLD_LINKSTATE_UNKNOWN) { + if (state->id_link_state == LINK_STATE_UNKNOWN) { mutex_exit(&state->id_link_mutex); return; } @@ -2294,7 +2189,7 @@ ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) if (!ibd_async_safe(state)) { state->id_link_state = ((code == IBT_EVENT_PORT_UP) ? - GLD_LINKSTATE_UP : GLD_LINKSTATE_DOWN); + LINK_STATE_UP : LINK_STATE_DOWN); mutex_exit(&state->id_link_mutex); return; } @@ -2303,7 +2198,7 @@ ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) if (code == IBT_ERROR_PORT_DOWN) opcode = IBD_LINK_DOWN; - req = kmem_alloc(sizeof (ibd_req_t), KM_SLEEP); + req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); req->rq_ptr = (void *)opcode; ibd_queue_work_slot(state, req, ASYNC_LINK); } @@ -2370,8 +2265,10 @@ ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { + mac_register_t *macp; ibd_state_t *state; int instance; + int err; switch (cmd) { case DDI_ATTACH: @@ -2396,6 +2293,22 @@ ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) goto attach_fail_state_init; } + /* alloc rx soft intr */ + if ((ibd_rx_softintr == 1) && + ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx, + NULL, NULL, ibd_intr, (caddr_t)state) != DDI_SUCCESS) { + DPRINT(10, "ibd_attach : failed in ddi_add_softintr()"); + goto attach_fail_ddi_add_rx_softintr; + } + + /* alloc tx soft intr */ + if ((ibd_tx_softintr == 1) && + ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx, + NULL, NULL, ibd_tx_recycle, (caddr_t)state) != DDI_SUCCESS) { + DPRINT(10, "ibd_attach : failed in ddi_add_softintr()"); + goto attach_fail_ddi_add_tx_softintr; + } + /* "attach" to IBTL */ if (ibt_attach(&ibd_clnt_modinfo, dip, state, &state->id_ibt_hdl) != IBT_SUCCESS) { @@ -2410,42 +2323,50 @@ ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) } /* - * Register ourselves with the GLD interface - * - * gld_register will: - * link us with the GLD module; - * set our ddi_set_driver_private(9F) data to the macinfo ptr; - * save the devinfo pointer in macinfo->gldm_devinfo; - * create the minor device node. + * Initialize pointers to device specific functions which will be + * used by the generic layer. + */ + if ((macp = mac_alloc(MAC_VERSION)) == NULL) { + DPRINT(10, "ibd_attach : failed in mac_alloc()"); + goto attach_fail_drv_init; + } + + macp->m_type_ident = MAC_PLUGIN_IDENT_IB; + macp->m_driver = state; + macp->m_dip = state->id_dip; + macp->m_src_addr = (uint8_t *)&state->id_macaddr; + macp->m_callbacks = &ib_m_callbacks; + macp->m_min_sdu = 0; + macp->m_max_sdu = state->id_mtu - IPOIB_HDRSIZE; + + /* + * Register ourselves with the GLDv3 interface */ - if (gld_register(dip, "ibd", state->id_macinfo) != DDI_SUCCESS) { - DPRINT(10, "ibd_attach : failed in gld_register()"); - goto attach_fail_gld_register; + err = mac_register(macp, &state->id_mh); + mac_free(macp); + if (err != 0) { + DPRINT(10, "ibd_attach : failed in mac_register()"); + goto attach_fail_mac_register; } /* * Setup the handler we will use for regular DLPI stuff. Its important - * to setup the recv handler after registering with gld. Setting it - * before causes at times an incoming packet to be forwarded to gld - * before the gld_register. This will result in gld dropping the packet - * which is ignored by ibd_rcq_handler, thus failing to re-arm the - * tavor events. This will cause tavor_isr on recv path to be not - * invoked any further. + * to setup the recv handler after registering with gldv3. */ ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); if (ibt_enable_cq_notify(state->id_rcq_hdl, IBT_NEXT_COMPLETION) != IBT_SUCCESS) { DPRINT(10, "ibd_attach : failed in ibt_enable_cq_notify()\n"); - goto attach_fail_gld_register; + goto attach_fail_setup_handler; } /* * Setup the subnet notices handler after we initialize the a/mcaches * and start the async thread, both of which are required for the * trap handler to function properly. Enable the trap handler to - * queue requests to the async thread after the gld_register, because - * the async daemon invokes gld_sched(), which must be done after - * gld_register(). + * queue requests to the async thread after the mac_register, because + * the async daemon invokes mac_tx_update(), which must be done after + * mac_register(). */ ibt_register_subnet_notices(state->id_ibt_hdl, ibd_snet_notices_handler, state); @@ -2454,23 +2375,24 @@ ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) mutex_exit(&state->id_trap_lock); /* - * Indicate link status to GLD and higher layers. By default, + * Indicate link status to GLDv3 and higher layers. By default, * we assume we are in up state (which must have been true at * least at the time the broadcast mcg's were probed); if there * were any up/down transitions till the time we come here, the * async handler will have updated last known state, which we - * use to tell GLD. The async handler will not send any - * notifications to GLD till we reach here in the initialization + * use to tell GLDv3. The async handler will not send any + * notifications to GLDv3 till we reach here in the initialization * sequence. */ - mutex_enter(&state->id_link_mutex); - gld_linkstate(state->id_macinfo, state->id_link_state); - mutex_exit(&state->id_link_mutex); + mac_link_update(state->id_mh, state->id_link_state); return (DDI_SUCCESS); /* Attach failure points, cleanup */ -attach_fail_gld_register: +attach_fail_setup_handler: + (void) mac_unregister(state->id_mh); + +attach_fail_mac_register: ibd_drv_fini(state); attach_fail_drv_init: @@ -2478,6 +2400,14 @@ attach_fail_drv_init: ibd_print_warn(state, "failed to free IB resources"); attach_fail_ibt_attach: + if (ibd_tx_softintr == 1) + ddi_remove_softintr(state->id_tx); + +attach_fail_ddi_add_tx_softintr: + if (ibd_rx_softintr == 1) + ddi_remove_softintr(state->id_rx); + +attach_fail_ddi_add_rx_softintr: ibd_state_fini(state); attach_fail_state_init: @@ -2523,11 +2453,17 @@ ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) } } - if (gld_unregister(state->id_macinfo) != DDI_SUCCESS) { - DPRINT(10, "ibd_detach : failed in gld_unregister()"); + if (mac_unregister(state->id_mh) != DDI_SUCCESS) { + DPRINT(10, "ibd_detach : failed in mac_unregister()"); goto failed; } + if (ibd_rx_softintr == 1) + ddi_remove_softintr(state->id_rx); + + if (ibd_tx_softintr == 1) + ddi_remove_softintr(state->id_tx); + ibd_drv_fini(state); if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS) @@ -2558,15 +2494,10 @@ failed: static int ibd_state_init(ibd_state_t *state, dev_info_t *dip) { - gld_mac_info_t *macinfo; - - if ((macinfo = gld_mac_alloc(dip)) == NULL) { - DPRINT(10, "ibd_state_init : failed in gld_mac_alloc()"); - return (DDI_FAILURE); - } + char buf[64]; mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); - state->id_link_state = GLD_LINKSTATE_UNKNOWN; + state->id_link_state = LINK_STATE_UNKNOWN; mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); @@ -2575,10 +2506,7 @@ ibd_state_init(ibd_state_t *state, dev_info_t *dip) mutex_init(&state->id_txcomp_lock, NULL, MUTEX_DRIVER, NULL); state->id_dip = dip; - state->id_wcs = kmem_alloc(sizeof (ibt_wc_t) * IBD_WC_SIZE, KM_SLEEP); - state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * IBD_WC_SIZE, KM_SLEEP); - state->id_sched_queued = B_FALSE; mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); state->id_tx_list.dl_head = NULL; @@ -2592,41 +2520,11 @@ ibd_state_init(ibd_state_t *state, dev_info_t *dip) state->id_rx_list.dl_bufs_outstanding = 0; state->id_rx_list.dl_cnt = 0; mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); + mutex_init(&state->id_rx_mutex, NULL, MUTEX_DRIVER, NULL); - /* Link up various structs for later access */ - macinfo->gldm_private = (caddr_t)state; - state->id_macinfo = macinfo; - - /* - * Initialize pointers to device specific functions which will be - * used by the generic layer. - */ - macinfo->gldm_reset = ibd_reset; - macinfo->gldm_start = ibd_start; - macinfo->gldm_stop = ibd_stop; - macinfo->gldm_set_mac_addr = ibd_set_mac_addr; - macinfo->gldm_set_multicast = ibd_set_multicast; - macinfo->gldm_set_promiscuous = ibd_set_promiscuous; - macinfo->gldm_get_stats = ibd_get_stats; - macinfo->gldm_send = ibd_send; - macinfo->gldm_intr = ibd_intr; - macinfo->gldm_mdt_pre = ibd_mdt_pre; - macinfo->gldm_mdt_send = ibd_mdt_txone; - macinfo->gldm_mdt_post = ibd_mdt_post; - macinfo->gldm_mdt_sgl = state->id_max_sqseg; - macinfo->gldm_mdt_segs = IBD_MDTMAX_SEGS; - - /* Initialize board characteristics needed by the generic layer. */ - macinfo->gldm_ident = "InfiniBand DLPI Driver"; - macinfo->gldm_type = DL_IB; - macinfo->gldm_minpkt = 0; /* assumes we pad ourselves */ - macinfo->gldm_addrlen = IPOIB_ADDRL; - macinfo->gldm_saplen = -2; - macinfo->gldm_capabilities = GLD_CAP_LINKSTATE; - - /* Other required initialization */ - macinfo->gldm_ppa = ddi_get_instance(dip); - macinfo->gldm_devinfo = dip; + (void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip)); + state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); return (DDI_SUCCESS); } @@ -2639,14 +2537,14 @@ ibd_state_fini(ibd_state_t *state) { mutex_destroy(&state->id_tx_list.dl_mutex); mutex_destroy(&state->id_rx_list.dl_mutex); + mutex_destroy(&state->id_rx_mutex); mutex_destroy(&state->id_sched_lock); mutex_destroy(&state->id_txcomp_lock); - kmem_free(state->id_txwcs, sizeof (ibt_wc_t) * IBD_WC_SIZE); - kmem_free(state->id_wcs, sizeof (ibt_wc_t) * IBD_WC_SIZE); + cv_destroy(&state->id_trap_cv); mutex_destroy(&state->id_trap_lock); mutex_destroy(&state->id_link_mutex); - gld_mac_free(state->id_macinfo); + kmem_cache_destroy(state->id_req_kmc); } /* @@ -2698,12 +2596,10 @@ static uint64_t ibd_get_portspeed(ibd_state_t *state) { int ret; + ibt_path_info_t path; + ibt_path_attr_t path_attr; + uint8_t num_paths; uint64_t ifspeed; - size_t length; - ib_lid_t lid; - sa_portinfo_record_t req, *resp = NULL; - ibmf_saa_access_args_t args; - ibmf_saa_handle_t saa_handle; /* * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire @@ -2712,53 +2608,58 @@ ibd_get_portspeed(ibd_state_t *state) */ ifspeed = 2000000000; - /* Get port lid */ - if (ibt_get_port_state(state->id_hca_hdl, state->id_port, NULL, - &lid) != IBT_SUCCESS) - goto earlydone; - - if (ibmf_sa_session_open(state->id_sgid.gid_guid, 0, NULL, - IBMF_VERSION, 0, &saa_handle) != IBMF_SUCCESS) - goto earlydone; + bzero(&path_attr, sizeof (path_attr)); - /* Contact SA Access */ - bzero(&req, sizeof (sa_portinfo_record_t)); - req.EndportLID = lid; + /* + * Get the port speed from Loopback path information. + */ + path_attr.pa_dgids = &state->id_sgid; + path_attr.pa_num_dgids = 1; + path_attr.pa_sgid = state->id_sgid; - args.sq_attr_id = SA_PORTINFORECORD_ATTRID; - args.sq_access_type = IBMF_SAA_RETRIEVE; - args.sq_component_mask = SA_PORTINFO_COMPMASK_PORTLID; - args.sq_template = &req; - args.sq_callback = NULL; - args.sq_callback_arg = NULL; + if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, + &path_attr, 1, &path, &num_paths) != IBT_SUCCESS) + goto earlydone; - ret = ibmf_sa_access(saa_handle, &args, 0, &length, (void **) &resp); - if ((ret != IBMF_SUCCESS) || (length == 0) || (resp == NULL)) - goto done; + if (num_paths < 1) + goto earlydone; /* - * 4X/12X needs appropriate multipliers. With IBA 1.2 additions, - * double and quad multipliers are also needed per LinkSpeedEnabled. * In case SA does not return an expected value, report the default * speed as 1X. */ ret = 1; - switch (resp->PortInfo.LinkWidthActive) { - case SM_LINK_WIDTH_ACTIVE_1X: + switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) { + case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */ ret = 1; break; - case SM_LINK_WIDTH_ACTIVE_4X: + case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */ ret = 4; break; - case SM_LINK_WIDTH_ACTIVE_12X: + case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */ ret = 12; break; + case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */ + ret = 2; + break; + case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */ + ret = 8; + break; + case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */ + ret = 16; + break; + case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */ + ret = 24; + break; + case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */ + ret = 32; + break; + case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */ + ret = 48; + break; } - ifspeed *= ret; - kmem_free(resp, length); -done: - (void) ibmf_sa_session_close(&saa_handle, 0); + ifspeed *= ret; earlydone: return (ifspeed); @@ -3079,8 +2980,15 @@ ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) ASSERT(mce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); } else { ASSERT(jstate == IB_MC_JSTATE_FULL); - ASSERT((mce != NULL) && (mce->mc_jstate == - IB_MC_JSTATE_FULL)); + ASSERT(mce->mc_jstate == IB_MC_JSTATE_FULL); + + /* + * If join group failed, mce will be NULL here. + * This is because in GLDv3 driver, set multicast + * will always return success. + */ + if (mce == NULL) + return; mce->mc_fullreap = B_TRUE; } @@ -3241,7 +3149,7 @@ ibd_drv_init(ibd_state_t *state) state->id_mtu = (128 << port_infop->p_mtu); state->id_sgid = *port_infop->p_sgid_tbl; - state->id_link_state = GLD_LINKSTATE_UP; + state->id_link_state = LINK_STATE_UP; mutex_exit(&state->id_link_mutex); ibt_free_portinfo(port_infop, port_infosz); @@ -3251,18 +3159,10 @@ ibd_drv_init(ibd_state_t *state) ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); ASSERT(ibt_status == IBT_SUCCESS); - /* - * We need to determine whether the HCA can support checksum - * and indicate that to higher layers. - */ - if (ibd_csum_send > IBD_CSUM_NONE) - state->id_macinfo->gldm_capabilities |= GLD_CAP_CKSUM_PARTIAL; - if (ibd_find_bgroup(state) != IBT_SUCCESS) { DPRINT(10, "ibd_drv_init : failed in ibd_find_bgroup\n"); goto drv_init_fail_find_bgroup; } - state->id_macinfo->gldm_maxpkt = state->id_mtu - IPOIB_HDRSIZE; if (ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, &state->id_pd_hdl) != IBT_SUCCESS) { @@ -3335,6 +3235,10 @@ ibd_drv_init(ibd_state_t *state) DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); goto drv_init_fail_alloc_rcq; } + state->id_rxwcs_size = state->id_num_rwqe + 1; + state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * + state->id_rxwcs_size, KM_SLEEP); + /* * Allocate Send CQ. @@ -3351,6 +3255,9 @@ ibd_drv_init(ibd_state_t *state) DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); goto drv_init_fail_alloc_scq; } + state->id_txwcs_size = state->id_num_swqe + 1; + state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * + state->id_txwcs_size, KM_SLEEP); } else { /* * Allocate combined Send/Receive CQ. @@ -3376,12 +3283,18 @@ ibd_drv_init(ibd_state_t *state) goto drv_init_fail_min_rwqes; } + state->id_rxwcs_size = cq_attr.cq_size; + state->id_txwcs_size = state->id_rxwcs_size; + if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr, &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) { DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); goto drv_init_fail_alloc_rcq; } state->id_scq_hdl = state->id_rcq_hdl; + state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * + state->id_rxwcs_size, KM_SLEEP); + state->id_txwcs = state->id_rxwcs; } /* @@ -3473,14 +3386,11 @@ ibd_drv_init(ibd_state_t *state) */ ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, state->id_sgid.gid_prefix, state->id_sgid.gid_guid); - state->id_macinfo->gldm_vendor_addr = (uchar_t *)&state->id_macaddr; - /* * Similarly, program in the broadcast mac address. */ ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, state->id_mgid.gid_prefix, state->id_mgid.gid_guid); - state->id_macinfo->gldm_broadcast_addr = (uchar_t *)&state->id_bcaddr; ptr = (uint32_t *)&state->id_macaddr; DPRINT(10, "ibd_drv_init : INFO: MAC %08X:%08X:%08X:%08X:%08X\n", @@ -3523,9 +3433,14 @@ drv_init_fail_alloc_chan: IBT_SUCCESS)) DPRINT(10, "ibd_drv_init : Tx ibt_free_cq()"); + if (ibd_separate_cqs == 1) + kmem_free(state->id_txwcs, sizeof (ibt_wc_t) * + state->id_txwcs_size); + drv_init_fail_alloc_scq: if (ibt_free_cq(state->id_rcq_hdl) != IBT_SUCCESS) DPRINT(10, "ibd_drv_init : Rx ibt_free_cq()"); + kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * state->id_rxwcs_size); drv_init_fail_min_rwqes: drv_init_fail_alloc_rcq: @@ -3613,7 +3528,6 @@ ibd_alloc_swqe(ibd_state_t *state, ibd_swqe_t **wqe) swqe->swqe_next = NULL; swqe->swqe_prev = NULL; swqe->swqe_im_mblk = NULL; - swqe->w_mdtinfo = NULL; /* alloc copy buffer, must be max size to handle multiple mblk case */ swqe->swqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu, KM_SLEEP); @@ -3674,12 +3588,18 @@ ibd_free_swqe(ibd_state_t *state, ibd_swqe_t *swqe) static int ibd_post_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle) { + /* + * Here we should add dl_cnt before post recv, because we would + * have to make sure dl_cnt has already updated before + * corresponding ibd_process_rx() is called. + */ + atomic_add_32(&state->id_rx_list.dl_cnt, 1); if (ibt_post_recv(state->id_chnl_hdl, &rwqe->w_rwr, 1, NULL) != IBT_SUCCESS) { + (void) atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1); DPRINT(10, "ibd_post_rwqe : failed in ibt_post_recv()"); return (DDI_FAILURE); } - atomic_add_32(&state->id_rx_list.dl_cnt, 1); /* * Buffers being recycled are already in the list. @@ -3967,10 +3887,13 @@ ibd_drv_fini(ibd_state_t *state) */ status = ibt_free_cq(state->id_rcq_hdl); ASSERT(status == IBT_SUCCESS); + kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * state->id_rxwcs_size); if (ibd_separate_cqs == 1) { status = ibt_free_cq(state->id_scq_hdl); ASSERT(status == IBT_SUCCESS); + kmem_free(state->id_txwcs, sizeof (ibt_wc_t) * + state->id_txwcs_size); } /* @@ -4024,7 +3947,11 @@ ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) ibd_state_t *state = (ibd_state_t *)arg; atomic_add_64(&state->id_num_intrs, 1); - (void) gld_intr(state->id_macinfo); + + if (ibd_rx_softintr == 1) + ddi_trigger_softintr(state->id_rx); + else + (void) ibd_intr((char *)state); } /* @@ -4039,30 +3966,10 @@ ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) atomic_add_64(&state->id_num_intrs, 1); - /* - * Poll for completed entries; the CQ will not interrupt any - * more for completed packets. - */ - ibd_poll_compq(state, state->id_scq_hdl); - - /* - * Now enable CQ notifications; all completions originating now - * will cause new interrupts. - */ - if (ibt_enable_cq_notify(state->id_scq_hdl, IBT_NEXT_COMPLETION) != - IBT_SUCCESS) { - /* - * We do not expect a failure here. - */ - DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); - } - - /* - * Repoll to catch all packets that might have completed after - * we finished the first poll loop and before interrupts got - * armed. - */ - ibd_poll_compq(state, state->id_scq_hdl); + if (ibd_tx_softintr == 1) + ddi_trigger_softintr(state->id_tx); + else + (void) ibd_tx_recycle((char *)state); } /* @@ -4130,7 +4037,7 @@ ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code, if (!ibd_async_safe(state)) return; - req = kmem_alloc(sizeof (ibd_req_t), KM_SLEEP); + req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); req->rq_gid = event->sm_notice_gid; req->rq_ptr = (void *)code; ibd_queue_work_slot(state, req, ASYNC_TRAP); @@ -4176,63 +4083,76 @@ ibd_async_trap(ibd_state_t *state, ibd_req_t *req) /* * Free the request slot allocated by the subnet event thread. */ - kmem_free(req, sizeof (ibd_req_t)); - ibd_async_done(state); } /* - * GLD entry point to reset hardware. + * GLDv3 entry point to get capabilities. */ -/* ARGSUSED */ -static int -ibd_reset(gld_mac_info_t *macinfo) +static boolean_t +ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) { - /* - * This will be invoked from Style 1 open() and Style 2 - * attach() routines, ie just before the interface starts - * getting used. - */ - return (GLD_SUCCESS); + _NOTE(ARGUNUSED(arg)); + + switch (cap) { + case MAC_CAPAB_HCKSUM: { + uint32_t *txflags = cap_data; + + if (ibd_csum_send > IBD_CSUM_NONE) + *txflags = HCKSUM_INET_PARTIAL; + else + return (B_FALSE); + break; + } + case MAC_CAPAB_POLL: + /* + * Fallthrough to default, as we don't support GLDv3 + * polling. When blanking is implemented, we will need to + * change this to return B_TRUE in addition to registering + * an mc_resources callback. + */ + default: + return (B_FALSE); + } + return (B_TRUE); } /* - * GLD entry point to start hardware. + * GLDv3 entry point to start hardware. */ /* ARGSUSED */ static int -ibd_start(gld_mac_info_t *macinfo) +ibd_m_start(void *arg) { - return (GLD_SUCCESS); + return (0); } /* - * GLD entry point to stop hardware from receiving packets. + * GLDv3 entry point to stop hardware from receiving packets. */ /* ARGSUSED */ -static int -ibd_stop(gld_mac_info_t *macinfo) +static void +ibd_m_stop(void *arg) { #ifdef RUN_PERFORMANCE - ibd_perf((ibd_state_t *)macinfo->gldm_private); + ibd_perf((ibd_state_t *)arg); #endif - return (GLD_SUCCESS); } /* - * GLD entry point to modify device's mac address. We do not + * GLDv3 entry point to modify device's mac address. We do not * allow address modifications. */ static int -ibd_set_mac_addr(gld_mac_info_t *macinfo, unsigned char *macaddr) +ibd_m_unicst(void *arg, const uint8_t *macaddr) { ibd_state_t *state; - state = (ibd_state_t *)macinfo->gldm_private; + state = (ibd_state_t *)arg; if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0) - return (GLD_SUCCESS); + return (0); else - return (GLD_FAILURE); + return (EINVAL); } /* @@ -4246,12 +4166,11 @@ ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op) "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid); if (op == ASYNC_JOIN) { - int ret = ERRORED; - if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) != NULL) - ret = COMPLETED; - - state->id_multi_op = ret; + if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) { + ibd_print_warn(state, "Joint multicast group failed :" + "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); + } } else { /* * Here, we must search for the proper mcg_info and @@ -4262,20 +4181,17 @@ ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op) } /* - * GLD entry point for multicast enable/disable requests. - * Invoked by GLD only on the first multicast enable for a specific - * address (GLD is free to retry ocassionally if we return RETRY), - * and on last disable of the same address. Just queue the operation - * to the async thread. + * GLDv3 entry point for multicast enable/disable requests. + * This function queues the operation to the async thread and + * return success for a valid multicast address. */ static int -ibd_set_multicast(gld_mac_info_t *macinfo, unsigned char *mcmac, int op) +ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac) { - ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private; - ipoib_mac_t *mcast; + ibd_state_t *state = (ibd_state_t *)arg; + ipoib_mac_t maddr, *mcast; ib_gid_t mgid; - ib_qpn_t mcqpn; - int ret; + ibd_req_t *req; /* * The incoming multicast address might not be aligned properly @@ -4284,7 +4200,8 @@ ibd_set_multicast(gld_mac_info_t *macinfo, unsigned char *mcmac, int op) * since we know we are not going to dereference any values with * the ipoib_mac_t pointer. */ - mcast = (ipoib_mac_t *)mcmac; + bcopy(mcmac, &maddr, sizeof (ipoib_mac_t)); + mcast = &maddr; /* * Check validity of MCG address. We could additionally check @@ -4293,9 +4210,13 @@ ibd_set_multicast(gld_mac_info_t *macinfo, unsigned char *mcmac, int op) * programs anyway, we allow the flexibility to those dlpi apps. * Note that we do not validate the "scope" of the IBA mcg. */ - bcopy(&mcast->ipoib_qpn, &mcqpn, sizeof (ib_qpn_t)); - if (mcqpn != htonl(IB_MC_QPN)) - return (GLD_FAILURE); + if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN) + return (EINVAL); + + /* + * fill in multicast pkey and scope + */ + IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey); /* * If someone is trying to JOIN/LEAVE the broadcast group, we do @@ -4305,71 +4226,26 @@ ibd_set_multicast(gld_mac_info_t *macinfo, unsigned char *mcmac, int op) * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also * depends on this. */ - if (bcmp(mcast, state->id_macinfo->gldm_broadcast_addr, - IPOIB_ADDRL) == 0) - return (GLD_SUCCESS); + if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0) + return (0); ibd_n2h_gid(mcast, &mgid); + req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); + if (req == NULL) + return (ENOMEM); - if (op == GLD_MULTI_ENABLE) { - DPRINT(1, "ibd_set_multicast : %016llx:%016llx\n", + req->rq_gid = mgid; + + if (add) { + DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); - ret = GLD_RETRY; - mutex_enter(&state->id_mc_mutex); - if (state->id_multi_op == NOTSTARTED) { - state->id_multi_req.rq_gid = mgid; - ibd_queue_work_slot(state, &state->id_multi_req, - ASYNC_JOIN); - state->id_multi_op = ONGOING; - bcopy(mcast, &state->id_multi_addr, IPOIB_ADDRL); - } else if (bcmp(&state->id_multi_addr, mcast, - IPOIB_ADDRL) == 0) { - if (state->id_multi_op != ONGOING) { - if (state->id_multi_op == COMPLETED) - ret = GLD_SUCCESS; - else if (state->id_multi_op == ERRORED) - ret = GLD_FAILURE; - if (state->id_multi_queued) { - state->id_multi_queued = B_FALSE; - ibd_queue_work_slot(state, - &state->id_multi_req, ASYNC_POKE); - } else { - state->id_multi_op = NOTSTARTED; - } - } - } else { - /* - * Hmmm, a set was tried on another mcg. We - * need to make sure to gld_sched for this - * stream to retry once the ongoing one terminates. - * The gld_sched out of the async thread on completion - * of the mcg join is not enough; because the queued - * stream might come in and get a RETRY again because - * the mcg join result has still not been reaped by - * the originator. If gld_sched ensured that streams - * get tried in the order they received RETRYs, things - * would be simpler. - */ - state->id_multi_queued = B_TRUE; - } - mutex_exit(&state->id_mc_mutex); + ibd_queue_work_slot(state, req, ASYNC_JOIN); } else { - ibd_mce_t *mce; - DPRINT(1, "ibd_set_multicast : unset_multicast : " + DPRINT(1, "ibd_m_multicst : unset_multicast : " "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); - ret = GLD_SUCCESS; - mutex_enter(&state->id_mc_mutex); - mce = IBD_MCACHE_FIND_FULL(state, mgid); - mutex_exit(&state->id_mc_mutex); - /* - * GLD should not have invoked us unless the mcg was - * added in the past. - */ - ASSERT(mce != NULL); - ASSERT(bcmp(&mce->mc_req.rq_gid, &mgid, sizeof (mgid)) == 0); - ibd_queue_work_slot(state, &mce->mc_req, ASYNC_LEAVE); + ibd_queue_work_slot(state, req, ASYNC_LEAVE); } - return (ret); + return (0); } /* @@ -4379,25 +4255,19 @@ ibd_set_multicast(gld_mac_info_t *macinfo, unsigned char *mcmac, int op) * a port up/down event. */ static void -ibd_async_unsetprom(ibd_state_t *state, boolean_t dlpireq) +ibd_async_unsetprom(ibd_state_t *state) { ibd_mce_t *mce = list_head(&state->id_mc_non); ib_gid_t mgid; DPRINT(2, "ibd_async_unsetprom : async_unset_promisc"); - /* - * Mark the request slot as empty and reusable for the - * next promiscuous set request. - */ - if (dlpireq) - state->id_prom_op = NOTSTARTED; - while (mce != NULL) { mgid = mce->mc_info.mc_adds_vect.av_dgid; mce = list_next(&state->id_mc_non, mce); ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); } + state->id_prom_op = NOTSTARTED; } /* @@ -4407,13 +4277,13 @@ ibd_async_unsetprom(ibd_state_t *state, boolean_t dlpireq) * a port up/down event. */ static void -ibd_async_setprom(ibd_state_t *state, boolean_t dlpireq) +ibd_async_setprom(ibd_state_t *state) { ibt_mcg_attr_t mcg_attr; ibt_mcg_info_t *mcg_info; ib_gid_t mgid; uint_t numg; - int i; + int i, ret = COMPLETED; DPRINT(2, "ibd_async_setprom : async_set_promisc"); @@ -4431,9 +4301,8 @@ ibd_async_setprom(ibd_state_t *state, boolean_t dlpireq) IBT_SUCCESS) { ibd_print_warn(state, "Could not get list of IBA multicast " "groups"); - if (dlpireq) - state->id_prom_op = ERRORED; - return; + ret = ERRORED; + goto done; } /* @@ -4453,185 +4322,157 @@ ibd_async_setprom(ibd_state_t *state, boolean_t dlpireq) } ibt_free_mcg_info(mcg_info, numg); - if (dlpireq) - state->id_prom_op = COMPLETED; DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); +done: + state->id_prom_op = ret; } /* - * GLD entry point for multicast promiscuous enable/disable requests. - * GLD assumes phys state receives more packets than multi state, + * GLDv3 entry point for multicast promiscuous enable/disable requests. + * GLDv3 assumes phys state receives more packets than multi state, * which is not true for IPoIB. Thus, treat the multi and phys - * promiscuous states the same way to work with GLD's assumption. + * promiscuous states the same way to work with GLDv3's assumption. */ static int -ibd_set_promiscuous(gld_mac_info_t *macinfo, int mode) +ibd_m_promisc(void *arg, boolean_t on) { - ibd_state_t *state; - int ret; - - state = (ibd_state_t *)macinfo->gldm_private; - switch (mode) { - case GLD_MAC_PROMISC_PHYS: - case GLD_MAC_PROMISC_MULTI: - DPRINT(1, "ibd_set_promiscuous : set_promisc : %d", - mode); - /* - * Look at gld: this might be getting - * called because someone is turning off - * prom_phys. Nothing needs to be done in - * that case. - */ - ret = GLD_RETRY; - mutex_enter(&state->id_mc_mutex); - switch (state->id_prom_op) { - case NOTSTARTED: - ibd_queue_work_slot(state, - &state->id_prom_req, ASYNC_PROMON); - state->id_prom_op = ONGOING; - break; - case COMPLETED: - ret = GLD_SUCCESS; - break; - case ERRORED: - state->id_prom_op = NOTSTARTED; - ret = GLD_FAILURE; - } - /* - * Else in the ONGOING case, nothing special - * needs to be done; the async thread will poke - * all streams. A prior set, or the last unset - * request is still in the async queue. - */ - mutex_exit(&state->id_mc_mutex); - return (ret); - case GLD_MAC_PROMISC_NONE: - DPRINT(1, "ibd_set_promiscuous : unset_promisc"); - /* - * Look at gld: this might be getting - * called because someone is turning off - * prom_phys or prom_multi. Mark operation - * as ongoing, to prevent a subsequent set - * operation from using the request slot - * unless the async thread is ready to give - * it up. The async thread will mark the - * request slot as usable as soon as it - * starts doing the unset operation. - */ - ASSERT(state->id_prom_op == COMPLETED); - state->id_prom_op = ONGOING; - ibd_queue_work_slot(state, &state->id_prom_req, - ASYNC_PROMOFF); - return (GLD_SUCCESS); - default: - return (GLD_NOTSUPPORTED); + ibd_state_t *state = (ibd_state_t *)arg; + ibd_req_t *req; + + req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); + if (req == NULL) + return (ENOMEM); + if (on) { + DPRINT(1, "ibd_m_promisc : set_promisc : %d", on); + ibd_queue_work_slot(state, req, ASYNC_PROMON); + } else { + DPRINT(1, "ibd_m_promisc : unset_promisc"); + ibd_queue_work_slot(state, req, ASYNC_PROMOFF); } + + return (0); } /* - * GLD entry point for gathering statistics. + * GLDv3 entry point for gathering statistics. */ static int -ibd_get_stats(gld_mac_info_t *macinfo, struct gld_stats *sp) +ibd_m_stat(void *arg, uint_t stat, uint64_t *val) { - ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private; - - sp->glds_errrcv = 0; - sp->glds_underflow = 0; - sp->glds_missed = 0; + ibd_state_t *state = (ibd_state_t *)arg; - sp->glds_overflow = state->id_tx_short; /* Tx overflow */ - sp->glds_speed = state->id_link_speed; - sp->glds_media = GLDM_IB; - sp->glds_errxmt = state->id_ah_error; /* failed AH translation */ - sp->glds_norcvbuf = state->id_rx_short; /* # times below water mark */ - sp->glds_intr = state->id_num_intrs; /* number of intrs */ + switch (stat) { + case MAC_STAT_IFSPEED: + *val = state->id_link_speed; + break; + case MAC_STAT_MULTIRCV: + *val = state->id_multi_rcv; + break; + case MAC_STAT_BRDCSTRCV: + *val = state->id_brd_rcv; + break; + case MAC_STAT_MULTIXMT: + *val = state->id_multi_xmt; + break; + case MAC_STAT_BRDCSTXMT: + *val = state->id_brd_xmt; + break; + case MAC_STAT_RBYTES: + *val = state->id_recv_bytes; + break; + case MAC_STAT_IPACKETS: + *val = state->id_rcv_pkt; + break; + case MAC_STAT_OBYTES: + *val = state->id_xmt_bytes; + break; + case MAC_STAT_OPACKETS: + *val = state->id_xmt_pkt; + break; + case MAC_STAT_NORCVBUF: + *val = state->id_rx_short; /* # times below water mark */ + break; + case MAC_STAT_OERRORS: + *val = state->id_ah_error; /* failed AH translation */ + break; + case MAC_STAT_IERRORS: + *val = 0; + break; + case MAC_STAT_NOXMTBUF: + *val = state->id_tx_short; + break; + default: + return (ENOTSUP); + } - return (GLD_SUCCESS); + return (0); } /* - * Arrange for a Tx request that is failing, or has already failed due to - * Tx descriptor shortage to be retried soon. Used mostly with poll based - * Tx completion, since gld_sched() can not be invoked in ibd_send() context - * due to potential single processor deadlock (when the ibd_send() is - * caused by gld_recv()). + * Tx reschedule */ static void -ibd_tx_sched(ibd_state_t *state) +ibd_async_txsched(ibd_state_t *state) { - mutex_enter(&state->id_sched_lock); + ibd_req_t *req; + /* - * If a sched request is already enqueued, do not try to do - * that again, since the async work request list would get - * corrupted. + * For poll mode, if ibd is out of Tx wqe, reschedule to collect + * the CQEs. Otherwise, just return for out of Tx wqe. */ - if (!state->id_sched_queued) { - state->id_sched_queued = B_TRUE; - ibd_queue_work_slot(state, &state->id_sched_req, ASYNC_SCHED); + + if (ibd_txcomp_poll == 1) { + mutex_enter(&state->id_txcomp_lock); + ibd_poll_compq(state, state->id_scq_hdl); + mutex_exit(&state->id_txcomp_lock); + if (state->id_tx_list.dl_cnt < IBD_TX_UPDATE_THRESHOLD) { + req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); + ibd_queue_work_slot(state, req, ASYNC_SCHED); + return; + } + } else if (state->id_tx_list.dl_cnt < IBD_TX_UPDATE_THRESHOLD) { + return; } - mutex_exit(&state->id_sched_lock); -} -/* - * The gld_sched() in ibd_async_work() does the work for us. - */ -static void -ibd_async_txsched(ibd_state_t *state) -{ - mutex_enter(&state->id_sched_lock); - state->id_sched_queued = B_FALSE; - mutex_exit(&state->id_sched_lock); + if (state->id_sched_needed) { + mac_tx_update(state->id_mh); + state->id_sched_needed = B_FALSE; + } } /* * Release one or more chained send wqes back into free list. */ static void -ibd_release_swqes(ibd_state_t *state, ibd_swqe_t *fswqe, ibd_swqe_t *lswqe, - boolean_t send_context) +ibd_release_swqes(ibd_state_t *state, ibd_swqe_t *swqe) { - boolean_t call_gld_sched = B_FALSE; - /* * Add back on Tx list for reuse. */ - lswqe->swqe_next = NULL; + swqe->swqe_next = NULL; mutex_enter(&state->id_tx_list.dl_mutex); if (state->id_tx_list.dl_pending_sends) { state->id_tx_list.dl_pending_sends = B_FALSE; - call_gld_sched = B_TRUE; } if (state->id_tx_list.dl_head == NULL) { - state->id_tx_list.dl_head = SWQE_TO_WQE(fswqe); + state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); } else { - state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(fswqe); + state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); } - state->id_tx_list.dl_tail = SWQE_TO_WQE(lswqe); + state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); + state->id_tx_list.dl_cnt++; mutex_exit(&state->id_tx_list.dl_mutex); - - /* - * See comments in ibd_tx_sched(); make sure not to call - * gld_sched() if we are in ibd_send() context. - */ - if (call_gld_sched) - if ((ibd_txcomp_poll == 0) && (!send_context)) - gld_sched(state->id_macinfo); - else - ibd_tx_sched(state); } /* - * Acquire a number of chained send wqe's from the free list. Returns the - * number of wqe's actually allocated, and pointers to the first and last - * in the chain. + * Acquire send wqe from free list. + * Returns error number and send wqe pointer. */ static int -ibd_acquire_swqes(ibd_state_t *state, ibd_swqe_t **fswqe, ibd_swqe_t **lswqe, - int number) +ibd_acquire_swqes(ibd_state_t *state, ibd_swqe_t **swqe) { - int numwqe = number; - ibd_swqe_t *node, *wqes; + int rc = 0; + ibd_swqe_t *wqe; /* * Check and reclaim some of the completed Tx requests. @@ -4642,8 +4483,7 @@ ibd_acquire_swqes(ibd_state_t *state, ibd_swqe_t **fswqe, ibd_swqe_t **lswqe, * we always try to poll. */ if ((ibd_txcomp_poll == 1) && - (((atomic_add_32_nv(&state->id_tx_sends, 1) & IBD_TXPOLL_MASK) == - 0) || state->id_tx_list.dl_pending_sends) && + (state->id_tx_list.dl_cnt < IBD_TXPOLL_THRESHOLD) && (mutex_tryenter(&state->id_txcomp_lock) != 0)) { DPRINT(10, "ibd_send : polling"); ibd_poll_compq(state, state->id_scq_hdl); @@ -4654,298 +4494,42 @@ ibd_acquire_swqes(ibd_state_t *state, ibd_swqe_t **fswqe, ibd_swqe_t **lswqe, * Grab required transmit wqes. */ mutex_enter(&state->id_tx_list.dl_mutex); - node = wqes = WQE_TO_SWQE(state->id_tx_list.dl_head); - while ((node != NULL) && (numwqe-- > 1)) - node = WQE_TO_SWQE(node->swqe_next); - - /* - * If we did not find the number we were looking for, flag no resource. - * Adjust list appropriately in either case. - */ - if (numwqe != 0) { - state->id_tx_list.dl_head = state->id_tx_list.dl_tail = NULL; - state->id_tx_list.dl_pending_sends = B_TRUE; - mutex_exit(&state->id_tx_list.dl_mutex); - DPRINT(5, "ibd_acquire_swqes: out of Tx wqe"); - atomic_add_64(&state->id_tx_short, 1); - if (ibd_txcomp_poll == 1) { - /* - * Arrange for a future gld_sched(). Note that when - * the Tx is retried after a little bit, it will - * surely poll the completion queue above. - */ - ibd_tx_sched(state); - } - } else { - state->id_tx_list.dl_head = node->swqe_next; - if (state->id_tx_list.dl_tail == SWQE_TO_WQE(node)) + wqe = WQE_TO_SWQE(state->id_tx_list.dl_head); + if (wqe != NULL) { + state->id_tx_list.dl_cnt -= 1; + state->id_tx_list.dl_head = wqe->swqe_next; + if (state->id_tx_list.dl_tail == SWQE_TO_WQE(wqe)) state->id_tx_list.dl_tail = NULL; - mutex_exit(&state->id_tx_list.dl_mutex); - } - - /* - * Set return parameters. - */ - *fswqe = wqes; - *lswqe = node; - return (number - numwqe); -} - -typedef struct ibd_mpack_s { - ibd_swqe_t *ip_swqe; - uint32_t ip_start, ip_stuff, ip_flags; - ibd_ace_t *ip_ace; - boolean_t ip_copy; - boolean_t ip_noresources; - int ip_segs; - ibt_mr_hdl_t ip_mhdl[IBD_MDTMAX_SEGS + 1]; - ibt_mr_desc_t ip_mdsc[IBD_MDTMAX_SEGS + 1]; -} ibd_mpack_t; -_NOTE(SCHEME_PROTECTS_DATA("Protected by Scheme", ibd_mpack_s)) - -static void -ibd_mdt_txone(gld_mac_info_t *macinfo, void *cookie, pdescinfo_t *dl_pkt_info) -{ - ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private; - ibd_mpack_t *ptx = (ibd_mpack_t *)cookie; - ibd_ace_t *ace = ptx->ip_ace; - ibd_swqe_t *wqes, *node = ptx->ip_swqe; - boolean_t docopy = ptx->ip_copy; - uchar_t *pptr; - int i, pktsize, seglen, seg = 0; - - /* - * Snag the next wqe before we post this one, since it could complete - * very fast and the wqe could get put at the end of the list, - * corrupting our chain. Set up for the next packet. - */ - wqes = WQE_TO_SWQE(node->swqe_next); - ptx->ip_swqe = wqes; - - IBD_CKSUM_MDT_PACKET(dl_pkt_info, ptx->ip_start, ptx->ip_stuff, - ptx->ip_flags); - node->w_ahandle = ace; - node->w_swr.wr.ud.udwr_dest = ace->ac_dest; - - if (docopy) { - node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; - pptr = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; - pktsize = seglen = PDESC_HDRL(dl_pkt_info); - if (seglen > 0) { - bcopy(dl_pkt_info->hdr_rptr, pptr, seglen); - pptr += seglen; - } - for (; seg < dl_pkt_info->pld_cnt; seg++) - if ((seglen = PDESC_PLDL(dl_pkt_info, seg)) > 0) { - bcopy(dl_pkt_info->pld_ary[seg].pld_rptr, - pptr, seglen); - pptr += seglen; - pktsize += seglen; - } - node->w_swr.wr_nds = 1; - node->swqe_copybuf.ic_sgl.ds_len = pktsize; } else { - seglen = PDESC_HDRL(dl_pkt_info); - if (seglen > 0) { - node->w_smblk_sgl[seg].ds_va = - (ib_vaddr_t)(uintptr_t)dl_pkt_info->hdr_rptr; - node->w_smblk_sgl[seg].ds_key = ptx->ip_mdsc[0].md_lkey; - node->w_smblk_sgl[seg].ds_len = seglen; - seg++; - } - for (i = 0; i < dl_pkt_info->pld_cnt; i++) { - if ((seglen = PDESC_PLDL(dl_pkt_info, i)) > 0) { - node->w_smblk_sgl[seg].ds_va = (ib_vaddr_t) - (uintptr_t)dl_pkt_info->pld_ary[i].pld_rptr; - node->w_smblk_sgl[seg].ds_key = - ptx->ip_mdsc[dl_pkt_info-> - pld_ary[i].pld_pbuf_idx + 1].md_lkey; - node->w_smblk_sgl[seg].ds_len = seglen; - seg++; - } - } - node->w_swr.wr_sgl = node->w_smblk_sgl; - node->w_swr.wr_nds = seg; - } - - if (ibt_post_send(state->id_chnl_hdl, &node->w_swr, 1, NULL) != - IBT_SUCCESS) { /* - * We never expect a failure here. But handle it, just in case. - * If this is not the last packet, there are no problems; if - * it is the last packet and the previous ones have not been - * transmitted yet by the hardware, in the registration case, - * the hardware might transmit garbage since we will be - * freemsg'ing. The AH is still safe. + * If we did not find the number we were looking for, flag + * no resource. Adjust list appropriately in either case. */ - DPRINT(5, "ibd_mdt_txone: posting failed"); - ibd_tx_cleanup(state, node, B_TRUE); - } -} - -static int -ibd_mdt_pre(gld_mac_info_t *macinfo, mblk_t *mp, void **cookie) -{ - ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private; - multidata_t *dlmdp = mmd_getmultidata(mp); - ibd_mpack_t *mdinfo; - mbufinfo_t bufinfo, *binfo = &bufinfo; - pattrinfo_t attr_info; - uchar_t *dlap; - ibt_mr_attr_t mem_attr; - ibd_swqe_t *wqes, *node; - ipoib_mac_t *dest; - size_t hsize, psize = 0; - int numwqes, numpackets = (int)mmd_getcnt(dlmdp, NULL, NULL); - int i, ret; - uint32_t end, value; - boolean_t noresources = B_FALSE; - - ASSERT(DB_TYPE(mp) == M_MULTIDATA); - ASSERT(mp->b_cont == NULL); - - if ((numwqes = ibd_acquire_swqes(state, &wqes, &node, numpackets)) == 0) - return (0); - else if (numwqes != numpackets) - noresources = B_TRUE; - - DPRINT(20, "ibd_mdt_pre: %d packets %p/%p\n", numwqes, wqes, node); - - /* - * Allocate the cookie that will be passed to subsequent packet - * transmit and post_mdt calls by GLD. We can not sleep, so if - * there is no memory, just tell GLD to drop the entire MDT message. - */ - if ((mdinfo = kmem_zalloc(sizeof (ibd_mpack_t), KM_NOSLEEP)) == NULL) { - ibd_release_swqes(state, wqes, node, B_TRUE); - return (-1); - } - *cookie = (void *)mdinfo; - mdinfo->ip_noresources = noresources; - - /* - * Walk Global Attributes. If TCP failed to provide destination - * information, or some interposing module removed the information, - * fail the entire message. - */ - attr_info.type = PATTR_DSTADDRSAP; - if (mmd_getpattr(dlmdp, NULL, &attr_info) == NULL) { - ibd_release_swqes(state, wqes, node, B_TRUE); - kmem_free(mdinfo, sizeof (ibd_mpack_t)); - return (-1); - } - dlap = ((pattr_addr_t *)attr_info.buf)->addr; - dest = (ipoib_mac_t *)dlap; - - /* - * Get the AH for this destination, incrementing the posted - * reference count properly. - */ - if ((mdinfo->ip_ace = ibd_acache_lookup(state, dest, &ret, - numwqes)) == NULL) { - ibd_release_swqes(state, wqes, node, B_TRUE); - kmem_free(mdinfo, sizeof (ibd_mpack_t)); - return ((ret == GLD_FAILURE) ? -1 : 0); - } - - /* - * Depending on how costly it is to copy vs register, we try to - * register, falling back on copying if we fail. - */ - mmd_getregions(dlmdp, &bufinfo); - hsize = binfo->hbuf_wptr - binfo->hbuf_rptr; - for (i = 0; i < binfo->pbuf_cnt; i++) - psize += (binfo->pbuf_ary[i].pbuf_wptr - - binfo->pbuf_ary[i].pbuf_rptr); - if ((hsize + psize) > IBD_TX_COPY_THRESHOLD) { - mdinfo->ip_segs = i + 1; - if (hsize != 0) { - mem_attr.mr_as = NULL; - mem_attr.mr_flags = IBT_MR_NOSLEEP; - mem_attr.mr_vaddr = - (uint64_t)(uintptr_t)binfo->hbuf_rptr; - mem_attr.mr_len = hsize; - if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, - &mem_attr, &mdinfo->ip_mhdl[0], - &mdinfo->ip_mdsc[0]) != IBT_SUCCESS) - goto ibd_mdt_copy; - DPRINT(10, "ibd_mdt_pre: hsize = %d\n", hsize); - } - for (i = 0; i < binfo->pbuf_cnt; i++) { - if ((psize = (binfo->pbuf_ary[i].pbuf_wptr - - binfo->pbuf_ary[i].pbuf_rptr)) != 0) { - mem_attr.mr_as = NULL; - mem_attr.mr_flags = IBT_MR_NOSLEEP; - mem_attr.mr_vaddr = (uint64_t)(uintptr_t) - binfo->pbuf_ary[i].pbuf_rptr; - mem_attr.mr_len = psize; - if (ibt_register_mr(state->id_hca_hdl, - state->id_pd_hdl, &mem_attr, - &mdinfo->ip_mhdl[i + 1], - &mdinfo->ip_mdsc[i + 1]) != IBT_SUCCESS) { - for (; i >= 0; i--) { - (void) ibt_deregister_mr( - state->id_hca_hdl, - mdinfo->ip_mhdl[i]); - } - goto ibd_mdt_copy; - } - DPRINT(10, "ibd_mdt_pre: psize = %lu\n", psize); - } - } - - mdinfo->ip_copy = B_FALSE; - - /* - * All the deregistration must happen once the last swqe - * completes. - */ - node->swqe_im_mblk = mp; - node->w_mdtinfo = mdinfo; - DPRINT(10, "ibd_mdt_pre: last wqe = %p\n", node); - } else { -ibd_mdt_copy: - mdinfo->ip_copy = B_TRUE; + rc = ENOENT; + state->id_tx_list.dl_pending_sends = B_TRUE; + DPRINT(5, "ibd_acquire_swqes: out of Tx wqe"); + atomic_add_64(&state->id_tx_short, 1); } + mutex_exit(&state->id_tx_list.dl_mutex); + *swqe = wqe; - /* - * Do checksum related work. - */ - IBD_CKSUM_MDT(mp, dlmdp, NULL, &mdinfo->ip_start, &mdinfo->ip_stuff, - &end, &value, &mdinfo->ip_flags); - - mdinfo->ip_swqe = wqes; - return (numwqes); -} - -/* ARGSUSED */ -static void -ibd_mdt_post(gld_mac_info_t *macinfo, mblk_t *mp, void *cookie) -{ - ibd_mpack_t *mdinfo = (ibd_mpack_t *)cookie; - - if (mdinfo->ip_copy) { - if (!mdinfo->ip_noresources) - freemsg(mp); - kmem_free(mdinfo, sizeof (ibd_mpack_t)); - } + return (rc); } /* - * GLD entry point for transmitting a datagram. * The passed in packet has this format: * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data */ -static int -ibd_send(gld_mac_info_t *macinfo, mblk_t *mp) +static boolean_t +ibd_send(ibd_state_t *state, mblk_t *mp) { ibt_status_t ibt_status; ibt_mr_attr_t mem_attr; - ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private; ibd_ace_t *ace; - ibd_swqe_t *node; + ibd_swqe_t *node = NULL; ipoib_mac_t *dest; - ipoib_ptxhdr_t *ipibp; + ibd_req_t *req; + ib_header_info_t *ipibp; ip6_t *ip6h; mblk_t *nmp = mp; uint_t pktsize; @@ -4954,25 +4538,66 @@ ibd_send(gld_mac_info_t *macinfo, mblk_t *mp) int i, ret, len, nmblks = 1; boolean_t dofree = B_TRUE; - if (ibd_acquire_swqes(state, &node, &node, 1) == 0) - return (GLD_NORESOURCES); + if ((ret = ibd_acquire_swqes(state, &node)) != 0) { + state->id_sched_needed = B_TRUE; + if (ibd_txcomp_poll == 1) { + goto ibd_send_fail; + } + return (B_FALSE); + } /* * Obtain an address handle for the destination. */ - dest = (ipoib_mac_t *)mp->b_rptr; + ipibp = (ib_header_info_t *)mp->b_rptr; + dest = (ipoib_mac_t *)&ipibp->ib_dst; + if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) + IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey); + + pktsize = msgsize(mp); + atomic_add_64(&state->id_xmt_bytes, pktsize); + atomic_inc_64(&state->id_xmt_pkt); + if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) + atomic_inc_64(&state->id_brd_xmt); + else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) + atomic_inc_64(&state->id_multi_xmt); + if ((ace = ibd_acache_lookup(state, dest, &ret, 1)) != NULL) { node->w_ahandle = ace; node->w_swr.wr.ud.udwr_dest = ace->ac_dest; } else { DPRINT(5, "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X", - ((ret == GLD_FAILURE) ? "failed" : "queued"), + ((ret == EFAULT) ? "failed" : "queued"), htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]), htonl(dest->ipoib_gidpref[1]), htonl(dest->ipoib_gidsuff[0]), htonl(dest->ipoib_gidsuff[1])); node->w_ahandle = NULL; + /* + * for the poll mode, it is probably some cqe pending in the + * cq. So ibd has to poll cq here, otherwise acache probably + * may not be recycled. + */ + if (ibd_txcomp_poll == 1) { + mutex_enter(&state->id_txcomp_lock); + ibd_poll_compq(state, state->id_scq_hdl); + mutex_exit(&state->id_txcomp_lock); + } + /* + * Here if ibd_acache_lookup() returns EFAULT, it means ibd + * can not find a path for the specific dest address. We + * should get rid of this kind of packet. With the normal + * case, ibd will return the packet to upper layer and wait + * for AH creating. + */ + if (ret == EFAULT) + ret = B_TRUE; + else { + ret = B_FALSE; + dofree = B_FALSE; + state->id_sched_needed = B_TRUE; + } goto ibd_send_fail; } @@ -4980,46 +4605,51 @@ ibd_send(gld_mac_info_t *macinfo, mblk_t *mp) * For ND6 packets, padding is at the front of the source lladdr. * Insert the padding at front. */ - ipibp = (ipoib_ptxhdr_t *)mp->b_rptr; - if (ntohs(ipibp->ipoib_rhdr.ipoib_type) == IP6_DL_SAP) { - if (MBLKL(mp) < sizeof (ipoib_ptxhdr_t) + IPV6_HDR_LEN) { + if (ntohs(ipibp->ipib_rhdr.ipoib_type) == IP6_DL_SAP) { + if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) { if (!pullupmsg(mp, IPV6_HDR_LEN + - sizeof (ipoib_ptxhdr_t))) { + sizeof (ib_header_info_t))) { DPRINT(10, "ibd_send: pullupmsg failure "); - ret = GLD_FAILURE; + ret = B_TRUE; goto ibd_send_fail; } + ipibp = (ib_header_info_t *)mp->b_rptr; } - ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_ptxhdr_t)); + ip6h = (ip6_t *)((uchar_t *)ipibp + + sizeof (ib_header_info_t)); len = ntohs(ip6h->ip6_plen); if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { - if (MBLKL(mp) < sizeof (ipoib_ptxhdr_t) + - IPV6_HDR_LEN + len) { - if (!pullupmsg(mp, sizeof (ipoib_ptxhdr_t) + - IPV6_HDR_LEN + len)) { + mblk_t *pad; + + pad = allocb(4, 0); + pad->b_wptr = (uchar_t *)pad->b_rptr + 4; + linkb(mp, pad); + if (MBLKL(mp) < sizeof (ib_header_info_t) + + IPV6_HDR_LEN + len + 4) { + if (!pullupmsg(mp, sizeof (ib_header_info_t) + + IPV6_HDR_LEN + len + 4)) { DPRINT(10, "ibd_send: pullupmsg " "failure "); - ret = GLD_FAILURE; + ret = B_TRUE; goto ibd_send_fail; } + ip6h = (ip6_t *)((uchar_t *)mp->b_rptr + + sizeof (ib_header_info_t)); } + /* LINTED: E_CONSTANT_CONDITION */ IBD_PAD_NSNA(ip6h, len, IBD_SEND); } } - mp->b_rptr += IPOIB_ADDRL; + mp->b_rptr += sizeof (ib_addrs_t); while (((nmp = nmp->b_cont) != NULL) && (++nmblks < (state->id_max_sqseg + 1))) - ; - pktsize = msgsize(mp); - if (pktsize > state->id_mtu) { - ret = GLD_BADARG; - goto ibd_send_fail; - } + ; + pktsize = msgsize(mp); /* - * Do checksum related work. + * GLDv3 will check mtu. We do checksum related work here. */ IBD_CKSUM_SEND(mp); @@ -5043,7 +4673,7 @@ ibd_send(gld_mac_info_t *macinfo, mblk_t *mp) * IBT_INSUFF_RESOURCE. */ if (ibt_status != IBT_INSUFF_RESOURCE) - DPRINT(10, "ibd_send:%d\n", + DPRINT(10, "ibd_send: %d\n", "failed in ibt_register_mem()", ibt_status); DPRINT(5, "ibd_send: registration failed"); @@ -5087,11 +4717,10 @@ ibd_copy_path: if (ibt_status != IBT_SUCCESS) { /* * We should not fail here; but just in case we do, we - * tell GLD about this error. + * print out a warning to log. */ - ret = GLD_FAILURE; - DPRINT(5, "ibd_send: posting failed"); - goto ibd_send_fail; + ibd_print_warn(state, "ibd_send: posting failed: %d", + ibt_status); } DPRINT(10, "ibd_send : posted packet %d to %08X:%08X:%08X:%08X:%08X", @@ -5104,23 +4733,59 @@ ibd_copy_path: if (dofree) freemsg(mp); - return (GLD_SUCCESS); + return (B_TRUE); ibd_send_fail: - ibd_tx_cleanup(state, node, B_TRUE); + if (state->id_sched_needed == B_TRUE) { + req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); + if (req != NULL) + ibd_queue_work_slot(state, req, ASYNC_SCHED); + else { + dofree = B_TRUE; + ret = B_TRUE; + } + } + + if (dofree) + freemsg(mp); + + if (node != NULL) + ibd_tx_cleanup(state, node); + return (ret); } /* - * GLD entry point for handling interrupts. When using combined CQ, + * GLDv3 entry point for transmitting datagram. + */ +static mblk_t * +ibd_m_tx(void *arg, mblk_t *mp) +{ + ibd_state_t *state = (ibd_state_t *)arg; + mblk_t *next; + + while (mp != NULL) { + next = mp->b_next; + mp->b_next = NULL; + if (!ibd_send(state, mp)) { + /* Send fail */ + mp->b_next = next; + break; + } + mp = next; + } + + return (mp); +} + +/* * this handles Tx and Rx completions. With separate CQs, this handles * only Rx completions. */ static uint_t -ibd_intr(gld_mac_info_t *macinfo) +ibd_intr(char *arg) { - ibd_state_t *state = (ibd_state_t *)macinfo->gldm_private; - + ibd_state_t *state = (ibd_state_t *)arg; /* * Poll for completed entries; the CQ will not interrupt any * more for incoming (or transmitted) packets. @@ -5158,7 +4823,7 @@ ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) { ibd_wqe_t *wqe; ibt_wc_t *wc, *wcs; - uint_t numwcs; + uint_t numwcs, real_numwcs; int i; /* @@ -5172,14 +4837,21 @@ ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) * on the interrupt cpu. Thus, lock accordingly and use the * proper completion array. */ - if (cq_hdl == state->id_rcq_hdl) - wcs = state->id_wcs; - else - wcs = state->id_txwcs; - - while (ibt_poll_cq(cq_hdl, wcs, IBD_WC_SIZE, &numwcs) == IBT_SUCCESS) { + if (ibd_separate_cqs == 1) { + if (cq_hdl == state->id_rcq_hdl) { + wcs = state->id_rxwcs; + numwcs = state->id_rxwcs_size; + } else { + wcs = state->id_txwcs; + numwcs = state->id_txwcs_size; + } + } else { + wcs = state->id_rxwcs; + numwcs = state->id_rxwcs_size; + } - for (i = 0, wc = wcs; i < numwcs; i++, wc++) { + if (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) { + for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) { wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; ASSERT((wqe->w_type == IBD_WQE_SEND) || (wqe->w_type == IBD_WQE_RECV)); @@ -5197,19 +4869,20 @@ ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) * try adding buffers to the Rx pool * when we are trying to deinitialize. */ - if (wqe->w_type == IBD_WQE_RECV) + if (wqe->w_type == IBD_WQE_RECV) { continue; - } else { - DPRINT(10, "%s %d", - "ibd_intr: Bad CQ status", - wc->wc_status); + } else { + DPRINT(10, "%s %d", + "ibd_intr: Bad CQ status", + wc->wc_status); + } } } - if (wqe->w_type == IBD_WQE_SEND) - ibd_tx_cleanup(state, WQE_TO_SWQE(wqe), - B_FALSE); - else + if (wqe->w_type == IBD_WQE_SEND) { + ibd_tx_cleanup(state, WQE_TO_SWQE(wqe)); + } else { ibd_process_rx(state, WQE_TO_RWQE(wqe), wc); + } } } } @@ -5224,22 +4897,6 @@ ibd_deregister_mr(ibd_state_t *state, ibd_swqe_t *swqe) DPRINT(20, "ibd_deregister_mr: wqe = %p, seg = %d\n", swqe, swqe->w_swr.wr_nds); - /* - * If this is an MDT case, process accordingly. - */ - if (swqe->w_mdtinfo != NULL) { - ibd_mpack_t *mdinfo = (ibd_mpack_t *)swqe->w_mdtinfo; - - for (i = 0; i < mdinfo->ip_segs; i++) - if ((mdinfo->ip_mhdl[i] != 0) && - (ibt_deregister_mr(state->id_hca_hdl, - mdinfo->ip_mhdl[i]) != IBT_SUCCESS)) - DPRINT(10, "MDT deregistration failed\n"); - ASSERT(!mdinfo->ip_copy); - kmem_free(mdinfo, sizeof (ibd_mpack_t)); - swqe->w_mdtinfo = NULL; - return; - } for (i = 0; i < swqe->w_swr.wr_nds; i++) { if (ibt_deregister_mr(state->id_hca_hdl, @@ -5257,14 +4914,14 @@ ibd_deregister_mr(ibd_state_t *state, ibd_swqe_t *swqe) * erroneous transmission attempt. */ static void -ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe, boolean_t send_context) +ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe) { ibd_ace_t *ace = swqe->w_ahandle; DPRINT(20, "ibd_tx_cleanup %p\n", swqe); /* - * If this was a dynamic registration in ibd_send() or in MDT, + * If this was a dynamic registration in ibd_send(), * deregister now. */ if (swqe->swqe_im_mblk != NULL) { @@ -5341,7 +4998,7 @@ ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe, boolean_t send_context) /* * Release the send wqe for reuse. */ - ibd_release_swqes(state, swqe, swqe, send_context); + ibd_release_swqes(state, swqe); } /* @@ -5352,7 +5009,7 @@ ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe, boolean_t send_context) static void ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) { - ipoib_pgrh_t *pgrh; + ib_header_info_t *phdr; mblk_t *mp; ipoib_hdr_t *ipibp; ip6_t *ip6h; @@ -5373,15 +5030,42 @@ ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer; /* - * If the GRH is not valid, indicate to GLD by setting - * the VerTcFlow field to 0. Else, update the pseudoGRH - * so that GLD can determine the source mac of the packet. + * the IB link will deliver one of the IB link layer + * headers called, the Global Routing Header (GRH). + * ibd driver uses the information in GRH to build the + * Header_info structure and pass it with the datagram up + * to GLDv3. + * If the GRH is not valid, indicate to GLDv3 by setting + * the VerTcFlow field to 0. */ - pgrh = (ipoib_pgrh_t *)mp->b_rptr; - if (wc->wc_flags & IBT_WC_GRH_PRESENT) - pgrh->ipoib_sqpn = htonl(wc->wc_qpn); - else - pgrh->ipoib_vertcflow = 0; + phdr = (ib_header_info_t *)mp->b_rptr; + if (wc->wc_flags & IBT_WC_GRH_PRESENT) { + phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn); + + /* if it is loop back packet, just drop it. */ + if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr, + IPOIB_ADDRL) == 0) { + freemsg(mp); + return; + } + + ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src, + sizeof (ipoib_mac_t)); + if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) { + phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN); + IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst); + } else { + phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn; + } + } else { + /* + * It can not be a IBA multicast packet. Must have been + * unicast for us. Just copy the interface address to dst. + */ + phdr->ib_grh.ipoib_vertcflow = 0; + ovbcopy(&state->id_macaddr, &phdr->ib_dst, + sizeof (ipoib_mac_t)); + } DPRINT(10, "ibd_process_rx : got packet %d", INCRXPACK); @@ -5399,6 +5083,8 @@ ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) freemsg(mp); return; } + ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + + sizeof (ipoib_pgrh_t)); } ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); len = ntohs(ip6h->ip6_plen); @@ -5412,12 +5098,21 @@ ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) freemsg(mp); return; } + ip6h = (ip6_t *)((uchar_t *)mp->b_rptr + + sizeof (ipoib_pgrh_t) + + sizeof (ipoib_hdr_t)); } /* LINTED: E_CONSTANT_CONDITION */ IBD_PAD_NSNA(ip6h, len, IBD_RECV); } } + atomic_add_64(&state->id_recv_bytes, wc->wc_bytes_xfer); + atomic_inc_64(&state->id_rcv_pkt); + if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) + atomic_inc_64(&state->id_brd_rcv); + else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) + atomic_inc_64(&state->id_multi_rcv); /* * Hand off to service thread/GLD. When we have hardware that * does hardware checksum, we will pull the checksum from the @@ -5455,7 +5150,7 @@ ibd_freemsg_cb(char *arg) * If the wqe is being destructed, do not attempt recycling. */ if (rwqe->w_freeing_wqe == B_TRUE) { - DPRINT(6, "ibd_freemsg_cb: wqe being freed"); + DPRINT(6, "ibd_freemsg: wqe being freed"); return; } @@ -5472,14 +5167,14 @@ ibd_freemsg_cb(char *arg) rwqe->rwqe_im_mblk = NULL; ibd_delete_rwqe(state, rwqe); ibd_free_rwqe(state, rwqe); - DPRINT(6, "ibd_freemsg_cb: free up wqe"); + DPRINT(6, "ibd_freemsg: free up wqe"); } else { rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb); if (rwqe->rwqe_im_mblk == NULL) { ibd_delete_rwqe(state, rwqe); ibd_free_rwqe(state, rwqe); - DPRINT(6, "ibd_freemsg_cb: desballoc failed"); + DPRINT(6, "ibd_freemsg: desballoc failed"); return; } @@ -5498,6 +5193,43 @@ ibd_freemsg_cb(char *arg) } } +static uint_t +ibd_tx_recycle(char *arg) +{ + ibd_state_t *state = (ibd_state_t *)arg; + + /* + * Poll for completed entries; the CQ will not interrupt any + * more for completed packets. + */ + ibd_poll_compq(state, state->id_scq_hdl); + + /* + * Now enable CQ notifications; all completions originating now + * will cause new interrupts. + */ + if (ibt_enable_cq_notify(state->id_scq_hdl, IBT_NEXT_COMPLETION) != + IBT_SUCCESS) { + /* + * We do not expect a failure here. + */ + DPRINT(10, "ibd_tx_recycle: ibt_enable_cq_notify() failed"); + } + + /* + * Repoll to catch all packets that might have completed after + * we finished the first poll loop and before interrupts got + * armed. + */ + ibd_poll_compq(state, state->id_scq_hdl); + + /* + * Call txsched to notify GLDv3 if it required. + */ + ibd_async_txsched(state); + + return (DDI_INTR_CLAIMED); +} #ifdef RUN_PERFORMANCE /* @@ -5772,7 +5504,7 @@ retry: */ sspin = gethrtime(); while (!cq_handler_ran) - ; + ; espin = gethrtime(); tspin += (espin - sspin); cq_handler_ran = B_FALSE; @@ -5789,7 +5521,7 @@ done: * completion. */ while (num_completions != (packets / IBD_NUM_UNSIGNAL)) - ; + ; etime = gethrtime(); cmn_err(CE_CONT, "ibd_perf_tx: # signaled completions = %d", diff --git a/usr/src/uts/common/io/mac/plugins/mac_ib.c b/usr/src/uts/common/io/mac/plugins/mac_ib.c new file mode 100644 index 0000000000..97fd438dbd --- /dev/null +++ b/usr/src/uts/common/io/mac/plugins/mac_ib.c @@ -0,0 +1,307 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * DL_IB MAC Type plugin for the Nemo mac module + */ + +#include <sys/types.h> +#include <sys/modctl.h> +#include <sys/dlpi.h> +#include <sys/ib/clients/ibd/ibd.h> +#include <sys/mac.h> +#include <sys/mac_ib.h> +#include <sys/dls.h> +#include <sys/byteorder.h> +#include <sys/strsun.h> +#include <inet/common.h> +#include <sys/note.h> + +static uint8_t ib_brdcst[] = { 0x00, 0xff, 0xff, 0xff, + 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff }; + +static struct modlmisc mac_ib_modlmisc = { + &mod_miscops, + "Infiniband MAC Type plugin 1.0" +}; + +static struct modlinkage mac_ib_modlinkage = { + MODREV_1, + &mac_ib_modlmisc, + NULL +}; + +static mactype_ops_t mac_ib_type_ops; + +int +_init(void) +{ + mactype_register_t *mtrp; + int err; + + if ((mtrp = mactype_alloc(MACTYPE_VERSION)) == NULL) + return (ENOTSUP); + mtrp->mtr_ident = MAC_PLUGIN_IDENT_IB; + mtrp->mtr_ops = &mac_ib_type_ops; + mtrp->mtr_mactype = DL_IB; + mtrp->mtr_addrlen = IPOIB_ADDRL; + mtrp->mtr_brdcst_addr = ib_brdcst; + + /* + * So far, generic stats maintained by GLDv3 are sufficient for IB. + */ + mtrp->mtr_stats = NULL; + mtrp->mtr_statcount = 0; + if ((err = mactype_register(mtrp)) == 0) { + if ((err = mod_install(&mac_ib_modlinkage)) != 0) + (void) mactype_unregister(MAC_PLUGIN_IDENT_IB); + } + mactype_free(mtrp); + return (err); +} + +int +_fini(void) +{ + int err; + + if ((err = mactype_unregister(MAC_PLUGIN_IDENT_IB)) != 0) + return (err); + return (mod_remove(&mac_ib_modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&mac_ib_modlinkage, modinfop)); +} + +/* + * MAC Type plugin operations + */ + +/* ARGSUSED */ +int +mac_ib_unicst_verify(const void *addr, void *mac_pdata) +{ + ipoib_mac_t *ibaddr = (ipoib_mac_t *)addr; + + /* + * The address must not be a multicast address. + */ + return (ntohl(ibaddr->ipoib_qpn) == IB_MC_QPN ? EINVAL : 0); +} + +int +mac_ib_multicst_verify(const void *addr, void *mac_pdata) +{ + ipoib_mac_t *ibaddr = (ipoib_mac_t *)addr; + uint8_t *p_gid = (uint8_t *)addr + sizeof (ipoib_mac_t) + - MAC_IB_GID_SIZE; + uint32_t bcst_gid[3] = { 0x0, 0x0, MAC_IB_BROADCAST_GID }; + + _NOTE(ARGUNUSED(mac_pdata)); + + /* + * The address must be a multicast address. + */ + if ((ntohl(ibaddr->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN) + return (EINVAL); + + /* + * The address must not be the broadcast address. + */ + if (bcmp(p_gid, (uint8_t *)bcst_gid + sizeof (bcst_gid) - + MAC_IB_GID_SIZE, MAC_IB_GID_SIZE) == 0) + return (EINVAL); + + return (0); +} + +/* + * Check the legality of a SAP value. The following values are + * allowed, as specified by PSARC 2003/150: + * + * min-ethertype-sap (256).. EtherType max(65535) ethertype semantics + * (0) .. max-802-sap(255) IEEE 802 semantics + */ +boolean_t +mac_ib_sap_verify(uint32_t sap, uint32_t *bind_sap, void *mac_pdata) +{ + _NOTE(ARGUNUSED(mac_pdata)); + + if (sap > MAC_IB_MAX_802_SAP && sap <= MAC_IB_ETHERTYPE_MAX) { + if (bind_sap != NULL) + *bind_sap = sap; + return (B_TRUE); + } + + if (sap <= MAC_IB_MAX_802_SAP) { + if (bind_sap != NULL) + *bind_sap = DLS_SAP_LLC; + return (B_TRUE); + } + + return (B_FALSE); +} + +/* ARGSUSED */ +mblk_t * +mac_ib_header(const void *saddr, const void *daddr, uint32_t sap, + void *mac_pdata, mblk_t *payload, size_t extra_len) +{ + ib_header_info_t *ibhp; + mblk_t *mp; + + if (!mac_ib_sap_verify(sap, NULL, NULL)) + return (NULL); + + mp = allocb(sizeof (ib_header_info_t) + extra_len, BPRI_HI); + if (mp == NULL) + return (NULL); + + ibhp = (ib_header_info_t *)mp->b_rptr; + ibhp->ipib_rhdr.ipoib_type = htons(sap); + ibhp->ipib_rhdr.ipoib_mbz = 0; + bcopy(daddr, &ibhp->ib_dst, IPOIB_ADDRL); + mp->b_wptr += sizeof (ib_header_info_t); + return (mp); +} + +int +mac_ib_header_info(mblk_t *mp, void *mac_pdata, mac_header_info_t *hdr_info) +{ + ib_header_info_t *ibhp; + uint16_t sap; + + if (MBLKL(mp) < sizeof (ib_header_info_t)) + return (EINVAL); + + ibhp = (ib_header_info_t *)mp->b_rptr; + + hdr_info->mhi_hdrsize = sizeof (ib_header_info_t); + hdr_info->mhi_daddr = (const uint8_t *)&(ibhp->ib_dst); + if (ibhp->ib_grh.ipoib_vertcflow != 0) + hdr_info->mhi_saddr = (const uint8_t *)&(ibhp->ib_src); + else + hdr_info->mhi_saddr = NULL; + + if (mac_ib_unicst_verify(hdr_info->mhi_daddr, mac_pdata) == 0) { + hdr_info->mhi_dsttype = MAC_ADDRTYPE_UNICAST; + } else if (mac_ib_multicst_verify(hdr_info->mhi_daddr, + mac_pdata) == 0) { + hdr_info->mhi_dsttype = MAC_ADDRTYPE_MULTICAST; + } else { + hdr_info->mhi_dsttype = MAC_ADDRTYPE_BROADCAST; + } + + sap = ntohs(ibhp->ipib_rhdr.ipoib_type); + hdr_info->mhi_origsap = hdr_info->mhi_bindsap = sap; + hdr_info->mhi_pktsize = 0; + + return (0); +} + +/* + * Take the provided `mp' (which is expected to have a header "dst + type"), + * and return a pointer to an mblk_t with a header "GRH + type". + * If the conversion cannot be performed, return NULL. + */ +static mblk_t * +mac_ib_header_cook(mblk_t *mp, void *pdata) +{ + ipoib_ptxhdr_t *orig_hp; + mblk_t *llmp; + + if (MBLKL(mp) < sizeof (ipoib_ptxhdr_t)) + return (NULL); + + orig_hp = (ipoib_ptxhdr_t *)mp->b_rptr; + llmp = mac_ib_header(NULL, &orig_hp->ipoib_dest, + ntohs(orig_hp->ipoib_rhdr.ipoib_type), pdata, NULL, 0); + if (llmp == NULL) + return (NULL); + + /* + * The plugin framework guarantees that we have the only reference + * to the mblk_t, so we can safely modify it. + */ + ASSERT(DB_REF(mp) == 1); + mp->b_rptr += sizeof (ipoib_ptxhdr_t); + llmp->b_cont = mp; + return (llmp); +} + +/* + * Take the provided `mp' (which is expected to have a header "GRH + type"), + * and return a pointer to an mblk_t with a header "type". If the conversion + * cannot be performed, return NULL. + */ +static mblk_t * +mac_ib_header_uncook(mblk_t *mp, void *pdata) +{ + _NOTE(ARGUNUSED(pdata)); + + /* + * The plugin framework guarantees that we have the only reference to + * the mblk_t and the underlying dblk_t, so we can safely modify it. + */ + ASSERT(DB_REF(mp) == 1); + + mp->b_rptr += sizeof (ib_addrs_t); + return (mp); +} + +void +mac_ib_link_details(char *buf, size_t sz, mac_handle_t mh, void *mac_pdata) +{ + uint64_t speed; + + _NOTE(ARGUNUSED(mac_pdata)); + + speed = mac_stat_get(mh, MAC_STAT_IFSPEED); + + /* convert to Mbps */ + speed /= 1000000; + + buf[0] = 0; + (void) snprintf(buf, sz, "%u Mbps", (uint32_t)speed); +} + +static mactype_ops_t mac_ib_type_ops = { + MTOPS_HEADER_COOK | MTOPS_HEADER_UNCOOK | MTOPS_LINK_DETAILS, + mac_ib_unicst_verify, + mac_ib_multicst_verify, + mac_ib_sap_verify, + mac_ib_header, + mac_ib_header_info, + NULL, + mac_ib_header_cook, + mac_ib_header_uncook, + mac_ib_link_details +}; diff --git a/usr/src/uts/common/sys/ib/clients/ibd/ibd.h b/usr/src/uts/common/sys/ib/clients/ibd/ibd.h index 548d20058e..8cdf2cf96a 100644 --- a/usr/src/uts/common/sys/ib/clients/ibd/ibd.h +++ b/usr/src/uts/common/sys/ib/clients/ibd/ibd.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -97,7 +96,8 @@ typedef struct ipoib_pgrh { #include <sys/ib/ibtl/ibti.h> #include <sys/ib/ib_pkt_hdrs.h> #include <sys/list.h> -#include <sys/gld.h> +#include <sys/mac.h> +#include <sys/mac_ib.h> #include <sys/modhash.h> #define IBD_HIWAT (64*1024) /* drv flow control high water */ @@ -163,7 +163,6 @@ typedef struct ibd_swqe_s { ibt_wr_ds_t w_smblk_sgl[IBD_MAX_SQSEG]; ibd_mblkbuf_t w_smblkbuf[IBD_MAX_SQSEG]; ibd_ace_t *w_ahandle; - void *w_mdtinfo; } ibd_swqe_t; #define swqe_next w_ibd_swqe.w_next @@ -214,20 +213,26 @@ typedef struct ibd_state_s { ibt_clnt_hdl_t id_ibt_hdl; ibt_hca_hdl_t id_hca_hdl; ibt_pd_hdl_t id_pd_hdl; + kmem_cache_t *id_req_kmc; uint32_t id_max_sqseg; ibd_list_t id_tx_list; + ddi_softintr_t id_tx; uint32_t id_tx_sends; kmutex_t id_txcomp_lock; ibt_cq_hdl_t id_scq_hdl; ibt_wc_t *id_txwcs; + uint32_t id_txwcs_size; uint32_t id_num_rwqe; ibd_list_t id_rx_list; + ddi_softintr_t id_rx; ibt_cq_hdl_t id_rcq_hdl; void *id_fifos; int id_nfifos; - ibt_wc_t *id_wcs; + ibt_wc_t *id_rxwcs; + uint32_t id_rxwcs_size; + kmutex_t id_rx_mutex; ibt_channel_hdl_t id_chnl_hdl; ib_pkey_t id_pkey; @@ -235,7 +240,7 @@ typedef struct ibd_state_s { uint8_t id_port; ibt_mcg_info_t *id_mcinfo; - gld_mac_info_t *id_macinfo; + mac_handle_t id_mh; ib_gid_t id_sgid; ib_qpn_t id_qpnum; ipoib_mac_t id_macaddr; @@ -263,25 +268,19 @@ typedef struct ibd_state_s { kmutex_t id_mc_mutex; struct list id_mc_full; struct list id_mc_non; - ibd_req_t id_multi_req; - ipoib_mac_t id_multi_addr; - char id_multi_op; - boolean_t id_multi_queued; kmutex_t id_trap_lock; kcondvar_t id_trap_cv; boolean_t id_trap_stop; uint32_t id_trap_inprog; - int id_prom_op; - ibd_req_t id_prom_req; + char id_prom_op; kmutex_t id_sched_lock; - boolean_t id_sched_queued; - ibd_req_t id_sched_req; + boolean_t id_sched_needed; kmutex_t id_link_mutex; - int32_t id_link_state; + link_state_t id_link_state; uint64_t id_link_speed; uint64_t id_ah_error; @@ -289,6 +288,15 @@ typedef struct ibd_state_s { uint64_t id_num_intrs; uint64_t id_tx_short; uint32_t id_num_swqe; + + uint64_t id_xmt_bytes; + uint64_t id_recv_bytes; + uint64_t id_multi_xmt; + uint64_t id_brd_xmt; + uint64_t id_multi_rcv; + uint64_t id_brd_rcv; + uint64_t id_xmt_pkt; + uint64_t id_rcv_pkt; } ibd_state_t; #endif /* _KERNEL && !_BOOT */ diff --git a/usr/src/uts/common/sys/mac_ib.h b/usr/src/uts/common/sys/mac_ib.h new file mode 100644 index 0000000000..79c802d7f5 --- /dev/null +++ b/usr/src/uts/common/sys/mac_ib.h @@ -0,0 +1,78 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_MAC_ETHER_H +#define _SYS_MAC_ETHER_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Ethernet MAC Plugin + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +#define MAC_PLUGIN_IDENT_IB "mac_ib" + +#define MAC_IB_MAX_802_SAP 255 +#define MAC_IB_ETHERTYPE_MAX 65535 +#define MAC_IB_GID_SIZE 10 +#define MAC_IB_BROADCAST_GID 0xFFFFFFFF + +/* + * In order to transmit the datagram to correct destination, an extra + * header including destination address is required. IB does not provide an + * interface for sending a link layer header directly to the IB link and the + * link layer header received from the IB link is missing information that + * GLDv3 requires. So mac_ib plugin defines a "soft" header as below. + */ +typedef struct ib_addrs { + ipoib_mac_t ipib_src; + ipoib_mac_t ipib_dst; +} ib_addrs_t; + +typedef struct ib_header_info { + union { + ipoib_pgrh_t ipib_grh; + ib_addrs_t ipib_addrs; + } ipib_prefix; + ipoib_hdr_t ipib_rhdr; +} ib_header_info_t; + +#define ib_dst ipib_prefix.ipib_addrs.ipib_dst +#define ib_src ipib_prefix.ipib_addrs.ipib_src +#define ib_grh ipib_prefix.ipib_grh + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MAC_ETHER_H */ diff --git a/usr/src/uts/intel/Makefile.intel.shared b/usr/src/uts/intel/Makefile.intel.shared index 882368cfc0..39d95c1fca 100644 --- a/usr/src/uts/intel/Makefile.intel.shared +++ b/usr/src/uts/intel/Makefile.intel.shared @@ -20,7 +20,7 @@ # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # ident "%Z%%M% %I% %E% SMI" @@ -640,6 +640,7 @@ PCBE_KMODS += p123_pcbe p4_pcbe opteron_pcbe # MAC_KMODS += mac_ether MAC_KMODS += mac_wifi +MAC_KMODS += mac_ib # # 'Devname' Modules (kernel/devname) diff --git a/usr/src/uts/intel/ibd/Makefile b/usr/src/uts/intel/ibd/Makefile index a87b1bd332..fa67c6584c 100644 --- a/usr/src/uts/intel/ibd/Makefile +++ b/usr/src/uts/intel/ibd/Makefile @@ -19,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # #ident "%Z%%M% %I% %E% SMI" @@ -38,7 +38,7 @@ MODULE = ibd OBJECTS = $(IBD_OBJS:%=$(OBJS_DIR)/%) LINTS = $(IBD_OBJS:%.o=$(LINTS_DIR)/%.ln) ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) -LDFLAGS += -dy -Nmisc/gld -Nmisc/ibtl -Nmisc/ibcm -Nmisc/ibmf +LDFLAGS += -dy -Nmisc/mac -Nmisc/ibtl -Nmisc/ibcm -Nmisc/ibmf WARLOCK_OUT = $(IBD_OBJS:%.o=%.ll) WARLOCK_OK = $(MODULE).ok WLCMD_DIR = $(UTSBASE)/common/io/warlock diff --git a/usr/src/uts/intel/mac_ib/Makefile b/usr/src/uts/intel/mac_ib/Makefile new file mode 100644 index 0000000000..fd88e98a7a --- /dev/null +++ b/usr/src/uts/intel/mac_ib/Makefile @@ -0,0 +1,95 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +#ident "%Z%%M% %I% %E% SMI" +# +# +# This makefile drives the production of the mac_ib MAC-Type plugin +# kernel module. +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = mac_ib +OBJECTS = $(MAC_IB_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(MAC_IB_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_MAC_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/intel/Makefile.intel + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# Overrides. +# +CFLAGS += $(CCVERBOSE) +LDFLAGS += -dy -N misc/mac + +# +# For now, disable these lint checks; maintainers should endeavor +# to investigate and remove these for maximum lint coverage. +# Please do not carry these forward to new Makefiles. +# +LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN +LINTTAGS += -erroff=E_PTRDIFF_OVERFLOW + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/intel/Makefile.targ diff --git a/usr/src/uts/sparc/Makefile.sparc.shared b/usr/src/uts/sparc/Makefile.sparc.shared index fa79ebca79..93edd00412 100644 --- a/usr/src/uts/sparc/Makefile.sparc.shared +++ b/usr/src/uts/sparc/Makefile.sparc.shared @@ -20,7 +20,7 @@ # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # ident "%Z%%M% %I% %E% SMI" @@ -456,6 +456,7 @@ DACF_KMODS += usb_ac_dacf # MAC_KMODS += mac_ether MAC_KMODS += mac_wifi +MAC_KMODS += mac_ib # # 'Devname' Modules (kernel/devname) diff --git a/usr/src/uts/sparc/ibd/Makefile b/usr/src/uts/sparc/ibd/Makefile index 4ed6eeb598..e64421db63 100644 --- a/usr/src/uts/sparc/ibd/Makefile +++ b/usr/src/uts/sparc/ibd/Makefile @@ -19,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # #ident "%Z%%M% %I% %E% SMI" @@ -38,7 +38,7 @@ MODULE = ibd OBJECTS = $(IBD_OBJS:%=$(OBJS_DIR)/%) LINTS = $(IBD_OBJS:%.o=$(LINTS_DIR)/%.ln) ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) -LDFLAGS += -dy -Nmisc/gld -Nmisc/ibtl -Nmisc/ibcm -Nmisc/ibmf +LDFLAGS += -dy -Nmisc/mac -Nmisc/ibtl -Nmisc/ibcm -Nmisc/ibmf WARLOCK_OUT = $(IBD_OBJS:%.o=%.ll) WARLOCK_OK = $(MODULE).ok WLCMD_DIR = $(UTSBASE)/common/io/warlock diff --git a/usr/src/uts/sparc/mac_ib/Makefile b/usr/src/uts/sparc/mac_ib/Makefile new file mode 100644 index 0000000000..745518b4da --- /dev/null +++ b/usr/src/uts/sparc/mac_ib/Makefile @@ -0,0 +1,95 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +#ident "%Z%%M% %I% %E% SMI" +# +# +# This makefile drives the production of the mac_ib MAC-Type plugin +# kernel module. +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = mac_ib +OBJECTS = $(MAC_IB_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(MAC_IB_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_MAC_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/sparc/Makefile.sparc + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# Overrides. +# +CFLAGS += $(CCVERBOSE) +LDFLAGS += -dy -N misc/mac + +# +# For now, disable these lint checks; maintainers should endeavor +# to investigate and remove these for maximum lint coverage. +# Please do not carry these forward to new Makefiles. +# +LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN +LINTTAGS += -erroff=E_PTRDIFF_OVERFLOW + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/sparc/Makefile.targ |