author     narayan <none@none>   2006-07-10 13:54:40 -0700
committer  narayan <none@none>   2006-07-10 13:54:40 -0700
commit     d10e4ef2fabf16c3237c6d6592496df3eac6a1ef (patch)
tree       eba8ce41c34abcfb747da37e51cbe610162cf334
parent     0ccf9e790d232720597416743840df88825a9317 (diff)
download   illumos-joyent-d10e4ef2fabf16c3237c6d6592496df3eac6a1ef.tar.gz
6412648 VIO service drivers auto-unload after some time
6413569 Possible memory leaks need to be investigated.
6423722 vds should use finer-grained locking for better performance
6429738 LDom panics using a destroyed ldc
6431111 LDOM manager should use P_FAULTED state for cpu instead of P_OFFLINE/
6431300 cannot enter '~' character on an ldom's console
6431458 vDisk drivers need to handle read/write requests asynchronously
6437436 ldc read/write operations are serialized due to a common lock
6437766 vDisk VTOC should handle the timestamp field
6440543 vSwitch/vNet should use aligned IP frame headers
6440553 vNet/vSwitch should reuse previously allocated mblks
6442270 vDisk server should set FREAD and FWRITE mode when calling ldi_ioctl
6442851 Remove VLDC max write_pa limit
6442973 vntsd dumps core with assertion failure message
6443193 vDisk client incorrectly implements DKIOCSGEOM ioctl
6443198 vDisk client incorrectly caches new VTOC on DKIOCSVTOC ioctl
6444392 vswitch/vnet should set end_idx to -1 for increased performance
24 files changed, 2499 insertions, 1255 deletions
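The first hunks below (vntsd cmd.c) fix 6431300 by remembering the last character read from each client: a '~' starts a daemon command only at the start of a line (after a carriage return), and a second consecutive '~' falls through as ordinary data, which is how a literal tilde reaches the console. A minimal user-level sketch of that gating logic follows, with simplified types; is_daemon_cmd() is a hypothetical stand-in for vntsd_process_daemon_cmd():

#include <stdio.h>

#define	CR	'\r'

typedef struct client {
	char prev_char;		/* previously read character */
} client_t;

/*
 * Illustrative only, not the vntsd source.  Return 1 if 'c' may begin
 * a daemon command, 0 if it is ordinary data.  A '~' that does not
 * follow start-of-stream or a carriage return is data; so is a second
 * consecutive '~'.
 */
static int
is_daemon_cmd(client_t *cl, char c)
{
	char prev = cl->prev_char;

	cl->prev_char = c;

	if (c != '~')
		return (0);
	if (prev != 0 && prev != CR)
		return (0);
	return (1);
}

int
main(void)
{
	client_t cl = { 0 };
	const char *p;

	for (p = "\r~~x"; *p != '\0'; p++)
		(void) printf("%d", is_daemon_cmd(&cl, *p));
	(void) printf("\n");	/* prints 0100 */
	return (0);
}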
diff --git a/usr/src/cmd/vntsd/cmd.c b/usr/src/cmd/vntsd/cmd.c index c39ef03399..8bee8417fe 100644 --- a/usr/src/cmd/vntsd/cmd.c +++ b/usr/src/cmd/vntsd/cmd.c @@ -275,14 +275,22 @@ exit_daemon_cmd(vntsd_client_t *clientp, int rv) return (rv); } -/* vntsd_process_daemon_cmd() - special commands */ +/* + * vntsd_process_daemon_cmd() - special commands + * "<RET>~" vntsd daemon commands + * "<RET>~~" enter '~' character + */ int vntsd_process_daemon_cmd(vntsd_client_t *clientp, char c) { esctable_t *p; int rv; + char prev_char; + + prev_char = clientp->prev_char; + clientp->prev_char = c; - if (c != VNTSD_DAEMON_CMD) { + if (c != VNTSD_DAEMON_CMD || (prev_char != 0 && prev_char != CR)) { /* not a daemon command */ return (VNTSD_SUCCESS); } @@ -304,6 +312,18 @@ vntsd_process_daemon_cmd(vntsd_client_t *clientp, char c) return (exit_daemon_cmd(clientp, rv)); } + clientp->prev_char = c; + if (c == VNTSD_DAEMON_CMD) { + /* + * received another '~' + * a user types '~~' to get '~' + */ + (void) mutex_lock(&clientp->lock); + clientp->status &= ~VNTSD_CLIENT_DISABLE_DAEMON_CMD; + (void) mutex_unlock(&clientp->lock); + return (VNTSD_SUCCESS); + } + for (p = etable; p->e_char; p++) { if (p->e_char == c) { /* found match */ diff --git a/usr/src/cmd/vntsd/vntsd.h b/usr/src/cmd/vntsd/vntsd.h index 16b1bbe90f..efaf724c15 100644 --- a/usr/src/cmd/vntsd/vntsd.h +++ b/usr/src/cmd/vntsd/vntsd.h @@ -248,6 +248,8 @@ typedef struct vntsd_client { struct vntsd_cons *cons; /* back link to console configuration */ + char prev_char; /* previous char read by this client */ + } vntsd_client_t; /* console structure */ diff --git a/usr/src/cmd/vntsd/write.c b/usr/src/cmd/vntsd/write.c index 16f07029c5..9110056c11 100644 --- a/usr/src/cmd/vntsd/write.c +++ b/usr/src/cmd/vntsd/write.c @@ -50,6 +50,12 @@ #include "vntsd.h" #include "chars.h" +/* handle for writing all clients */ +typedef struct write_buf { + uint_t sz; /* data size */ + char *buf; +} write_buf_t; + /* * check the state of write thread. exit if no more client connects to the * console. @@ -81,20 +87,16 @@ write_chk_status(vntsd_cons_t *consp, int status) * skip_terminal_null() * scan terminal null character sequence (0x5e 0x40) * return number of characters in the buf after skipping terminal null - * sequence. + * sequence. buf size must be at least sz+1. */ static int -skip_terminal_null(char *buf, int buf_sz, int sz) +skip_terminal_null(char *buf, int sz) { int i, j; static int term_null_seq = 0; assert(sz >= 0); - if (buf_sz < sz+1) { - return (-1); - } - if (term_null_seq) { /* skip 0x5e previously */ term_null_seq = 0; @@ -180,14 +182,18 @@ read_vcc(vntsd_cons_t *consp, char *buf, ssize_t *sz) return (VNTSD_STATUS_VCC_IO_ERR); } -static int s_sz; -/* write to a client */ +/* + * write to a client + * this function is passed as a parameter to vntsd_que_find. + * for each client that connected to the console, vntsd_que_find + * applies this function. 
+ */ static boolean_t -write_all_clients(vntsd_client_t *clientp, char *buf) +write_one_client(vntsd_client_t *clientp, write_buf_t *write_buf) { int rv; - rv = vntsd_write_client(clientp, buf, s_sz); + rv = vntsd_write_client(clientp, write_buf->buf, write_buf->sz); if (rv != VNTSD_SUCCESS) { (void) mutex_lock(&clientp->lock); clientp->status |= VNTSD_CLIENT_IO_ERR; @@ -206,6 +212,7 @@ vntsd_write_thread(vntsd_cons_t *consp) char buf[VNTSD_MAX_BUF_SIZE+1]; int sz; int rv; + write_buf_t write_buf; D1(stderr, "t@%d vntsd_write@%d\n", thr_self(), consp->vcc_fd); @@ -225,12 +232,13 @@ vntsd_write_thread(vntsd_cons_t *consp) } /* has data */ - if ((s_sz = skip_terminal_null(buf, sz+1, sz)) == 0) { + if ((sz = skip_terminal_null(buf, sz)) == 0) { /* terminal null sequence */ continue; } - assert(s_sz > 0); + write_buf.sz = sz; + write_buf.buf = buf; /* * output data to all clients connected @@ -239,7 +247,7 @@ vntsd_write_thread(vntsd_cons_t *consp) (void) mutex_lock(&consp->lock); (void) vntsd_que_find(consp->clientpq, - (compare_func_t)write_all_clients, buf); + (compare_func_t)write_one_client, &write_buf); (void) mutex_unlock(&consp->lock); write_chk_status(consp, VNTSD_SUCCESS); diff --git a/usr/src/uts/sun4v/Makefile.files b/usr/src/uts/sun4v/Makefile.files index 5150da4b60..7ed94b375c 100644 --- a/usr/src/uts/sun4v/Makefile.files +++ b/usr/src/uts/sun4v/Makefile.files @@ -134,7 +134,7 @@ VNEX_OBJS = vnex.o CNEX_OBJS = cnex.o GLVC_OBJS = glvc.o glvc_hcall.o MDESC_OBJS = mdesc.o -LDC_OBJS = ldc.o +LDC_OBJS = ldc.o vio_util.o VLDC_OBJS = vldc.o VCC_OBJS = vcc.o VNET_OBJS = vnet.o vnet_gen.o diff --git a/usr/src/uts/sun4v/io/cnex.c b/usr/src/uts/sun4v/io/cnex.c index 08a70cc810..293c20e131 100644 --- a/usr/src/uts/sun4v/io/cnex.c +++ b/usr/src/uts/sun4v/io/cnex.c @@ -765,9 +765,11 @@ cnex_rem_intr(dev_info_t *dip, uint64_t id, cnex_intrtype_t itype) if (rv) { DWARN("cnex_rem_intr: ino=0x%llx, cannot get state\n", iinfo->ino); + mutex_exit(&cldcp->lock); + return (ENXIO); } - if (rv || ((gethrtime() - start) > cnex_pending_tmout)) + if ((gethrtime() - start) > cnex_pending_tmout) break; } while (!panicstr && istate == HV_INTR_DELIVERED_STATE); @@ -776,9 +778,8 @@ cnex_rem_intr(dev_info_t *dip, uint64_t id, cnex_intrtype_t itype) if (istate != HV_INTR_IDLE_STATE) { DWARN("cnex_rem_intr: cannot remove intr busy ino=%x\n", iinfo->ino); - /* clear interrupt state */ - (void) hvldc_intr_setstate(cnex_ssp->cfghdl, iinfo->ino, - HV_INTR_IDLE_STATE); + mutex_exit(&cldcp->lock); + return (EAGAIN); } /* remove interrupt */ @@ -850,6 +851,8 @@ cnex_clr_intr(dev_info_t *dip, uint64_t id, cnex_intrtype_t itype) HV_INTR_IDLE_STATE); if (rv) { DWARN("cnex_intr_wrapper: cannot clear interrupt state\n"); + mutex_exit(&cldcp->lock); + return (ENXIO); } mutex_exit(&cldcp->lock); diff --git a/usr/src/uts/sun4v/io/fault_iso.c b/usr/src/uts/sun4v/io/fault_iso.c index 0123c19291..d7b884e37f 100644 --- a/usr/src/uts/sun4v/io/fault_iso.c +++ b/usr/src/uts/sun4v/io/fault_iso.c @@ -212,7 +212,7 @@ cpu_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen) } break; case FMA_CPU_REQ_OFFLINE: - rv = p_online_internal(msg->cpu_id, P_OFFLINE, + rv = p_online_internal(msg->cpu_id, P_FAULTED, &cpu_status); if (rv == EINVAL) { FI_DBG(CE_CONT, "Failed p_online call failed." 
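The ldc.c changes that follow (6437436, 6429738) split the single per-channel mutex into the channel 'lock' and a new 'tx_lock', so reads and writes no longer serialize on one lock; the block comment added in ldc_init() states the ordering rule: when both are needed, 'lock' is always acquired before 'tx_lock'. Because i_ldc_reset() now asserts that both locks are held, the transmit path (which holds only tx_lock) resets the channel through a try-lock fallback. The pattern, as it appears in the i_ldc_write_raw() and i_ldc_write_packet() hunks below, is wrapped here in a hypothetical helper for readability:

/*
 * Entered with only ldcp->tx_lock held; ldc_reset_from_tx() is an
 * illustrative name, not a function in this diff.
 */
static void
ldc_reset_from_tx(ldc_chan_t *ldcp)
{
	if (mutex_tryenter(&ldcp->lock)) {
		/* got the channel lock without blocking */
		i_ldc_reset(ldcp);
		mutex_exit(&ldcp->lock);
	} else {
		/*
		 * Another thread holds the channel lock: release
		 * tx_lock, then reacquire both in the documented
		 * order (lock before tx_lock) to avoid a deadlock.
		 */
		mutex_exit(&ldcp->tx_lock);
		mutex_enter(&ldcp->lock);
		mutex_enter(&ldcp->tx_lock);
		i_ldc_reset(ldcp);
		mutex_exit(&ldcp->lock);
	}
}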
diff --git a/usr/src/uts/sun4v/io/ldc.c b/usr/src/uts/sun4v/io/ldc.c index 3e526a623c..4b2bd1a092 100644 --- a/usr/src/uts/sun4v/io/ldc.c +++ b/usr/src/uts/sun4v/io/ldc.c @@ -421,6 +421,8 @@ i_ldc_txq_reconf(ldc_chan_t *ldcp) int rv; ASSERT(MUTEX_HELD(&ldcp->lock)); + ASSERT(MUTEX_HELD(&ldcp->tx_lock)); + rv = hv_ldc_tx_qconf(ldcp->id, ldcp->tx_q_ra, ldcp->tx_q_entries); if (rv) { cmn_err(CE_WARN, @@ -513,6 +515,9 @@ i_ldc_reset(ldc_chan_t *ldcp) { D2(ldcp->id, "i_ldc_reset: (0x%llx) channel reset\n", ldcp->id); + ASSERT(MUTEX_HELD(&ldcp->lock)); + ASSERT(MUTEX_HELD(&ldcp->tx_lock)); + (void) i_ldc_txq_reconf(ldcp); (void) i_ldc_rxq_reconf(ldcp); i_ldc_reset_state(ldcp); @@ -558,7 +563,9 @@ i_ldc_set_rx_head(ldc_chan_t *ldcp, uint64_t head) cmn_err(CE_WARN, "ldc_rx_set_qhead: (0x%lx) cannot set qhead 0x%lx", ldcp->id, head); + mutex_enter(&ldcp->tx_lock); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); return (ECONNRESET); } @@ -575,7 +582,7 @@ i_ldc_get_tx_tail(ldc_chan_t *ldcp, uint64_t *tail) int rv; uint64_t current_head, new_tail; - ASSERT(MUTEX_HELD(&ldcp->lock)); + ASSERT(MUTEX_HELD(&ldcp->tx_lock)); /* Read the head and tail ptrs from HV */ rv = hv_ldc_tx_get_state(ldcp->id, &ldcp->tx_head, &ldcp->tx_tail, &ldcp->link_state); @@ -626,7 +633,7 @@ i_ldc_set_tx_tail(ldc_chan_t *ldcp, uint64_t tail) int rv, retval = EWOULDBLOCK; int retries; - ASSERT(MUTEX_HELD(&ldcp->lock)); + ASSERT(MUTEX_HELD(&ldcp->tx_lock)); for (retries = 0; retries < ldc_max_retries; retries++) { if ((rv = hv_ldc_tx_set_qtail(ldcp->id, tail)) == 0) { @@ -658,7 +665,9 @@ i_ldc_send_pkt(ldc_chan_t *ldcp, uint8_t pkttype, uint8_t subtype, uint64_t tx_tail; uint32_t curr_seqid = ldcp->last_msg_snt; - ASSERT(MUTEX_HELD(&ldcp->lock)); + /* Obtain Tx lock */ + mutex_enter(&ldcp->tx_lock); + /* get the current tail for the message */ rv = i_ldc_get_tx_tail(ldcp, &tx_tail); if (rv) { @@ -666,6 +675,7 @@ i_ldc_send_pkt(ldc_chan_t *ldcp, uint8_t pkttype, uint8_t subtype, "i_ldc_send_pkt: (0x%llx) error sending pkt, " "type=0x%x,subtype=0x%x,ctrl=0x%x\n", ldcp->id, pkttype, subtype, ctrlmsg); + mutex_exit(&ldcp->tx_lock); return (rv); } @@ -698,12 +708,14 @@ i_ldc_send_pkt(ldc_chan_t *ldcp, uint8_t pkttype, uint8_t subtype, "i_ldc_send_pkt:(0x%llx) error sending pkt, " "type=0x%x,stype=0x%x,ctrl=0x%x\n", ldcp->id, pkttype, subtype, ctrlmsg); + mutex_exit(&ldcp->tx_lock); return (EIO); } ldcp->last_msg_snt = curr_seqid; ldcp->tx_tail = tx_tail; + mutex_exit(&ldcp->tx_lock); return (0); } @@ -755,6 +767,9 @@ i_ldc_process_VER(ldc_chan_t *ldcp, ldc_msg_t *msg) D2(ldcp->id, "i_ldc_process_VER: (0x%llx) received VER v%u.%u\n", ldcp->id, rcvd_ver->major, rcvd_ver->minor); + /* Obtain Tx lock */ + mutex_enter(&ldcp->tx_lock); + switch (msg->stype) { case LDC_INFO: @@ -765,6 +780,7 @@ i_ldc_process_VER(ldc_chan_t *ldcp, ldc_msg_t *msg) "i_ldc_process_VER: (0x%llx) err sending " "version ACK/NACK\n", ldcp->id); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); return (ECONNRESET); } @@ -850,6 +866,7 @@ i_ldc_process_VER(ldc_chan_t *ldcp, ldc_msg_t *msg) "i_ldc_process_VER: (0x%llx) error sending " "ACK/NACK\n", ldcp->id); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); return (ECONNRESET); } @@ -871,6 +888,7 @@ i_ldc_process_VER(ldc_chan_t *ldcp, ldc_msg_t *msg) "i_ldc_process_VER: (0x%llx) cannot send RTS\n", ldcp->id); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); return (ECONNRESET); } @@ -898,6 +916,7 @@ i_ldc_process_VER(ldc_chan_t *ldcp, ldc_msg_t *msg) "i_ldc_process_VER: (0x%llx) no listener\n", ldcp->id); 
i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); return (ECONNRESET); } @@ -914,6 +933,7 @@ i_ldc_process_VER(ldc_chan_t *ldcp, ldc_msg_t *msg) "i_ldc_process_VER: (0x%llx) no version match\n", ldcp->id); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); return (ECONNRESET); } @@ -924,6 +944,7 @@ i_ldc_process_VER(ldc_chan_t *ldcp, ldc_msg_t *msg) "i_ldc_process_VER: (0x%lx) err sending " "version ACK/NACK\n", ldcp->id); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); return (ECONNRESET); } @@ -973,6 +994,7 @@ i_ldc_process_VER(ldc_chan_t *ldcp, ldc_msg_t *msg) if (idx == LDC_NUM_VERS) { /* no version match - terminate */ ldcp->next_vidx = 0; + mutex_exit(&ldcp->tx_lock); return (ECONNRESET); } } @@ -992,12 +1014,14 @@ i_ldc_process_VER(ldc_chan_t *ldcp, ldc_msg_t *msg) "i_ldc_process_VER: (0x%lx) error sending version" "INFO\n", ldcp->id); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); return (ECONNRESET); } break; } + mutex_exit(&ldcp->tx_lock); return (rv); } @@ -1022,7 +1046,9 @@ i_ldc_process_RTS(ldc_chan_t *ldcp, ldc_msg_t *msg) ldcp->id); /* Reset the channel -- as we cannot continue */ + mutex_enter(&ldcp->tx_lock); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); rv = ECONNRESET; break; @@ -1040,7 +1066,9 @@ i_ldc_process_RTS(ldc_chan_t *ldcp, ldc_msg_t *msg) rv = i_ldc_send_pkt(ldcp, LDC_CTRL, LDC_NACK, LDC_RTS); if (rv) { /* if cannot send NACK - reset channel */ + mutex_enter(&ldcp->tx_lock); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); rv = ECONNRESET; break; } @@ -1050,7 +1078,9 @@ i_ldc_process_RTS(ldc_chan_t *ldcp, ldc_msg_t *msg) default: DWARN(ldcp->id, "i_ldc_process_RTS: (0x%llx) unexp ACK\n", ldcp->id); + mutex_enter(&ldcp->tx_lock); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); rv = ECONNRESET; break; } @@ -1070,6 +1100,9 @@ i_ldc_process_RTS(ldc_chan_t *ldcp, ldc_msg_t *msg) /* store initial SEQID info */ ldcp->last_msg_snt = msg->seqid; + /* Obtain Tx lock */ + mutex_enter(&ldcp->tx_lock); + /* get the current tail for the response */ rv = i_ldc_get_tx_tail(ldcp, &tx_tail); if (rv != 0) { @@ -1077,6 +1110,7 @@ i_ldc_process_RTS(ldc_chan_t *ldcp, ldc_msg_t *msg) "i_ldc_process_RTS: (0x%lx) err sending RTR\n", ldcp->id); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); return (ECONNRESET); } @@ -1111,9 +1145,11 @@ i_ldc_process_RTS(ldc_chan_t *ldcp, ldc_msg_t *msg) "i_ldc_process_RTS: (0x%lx) error sending RTR\n", ldcp->id); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); return (ECONNRESET); } + mutex_exit(&ldcp->tx_lock); return (0); } @@ -1136,7 +1172,9 @@ i_ldc_process_RTR(ldc_chan_t *ldcp, ldc_msg_t *msg) ldcp->id); /* Reset the channel -- as we cannot continue */ + mutex_enter(&ldcp->tx_lock); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); rv = ECONNRESET; break; @@ -1155,7 +1193,9 @@ i_ldc_process_RTR(ldc_chan_t *ldcp, ldc_msg_t *msg) rv = i_ldc_send_pkt(ldcp, LDC_CTRL, LDC_NACK, LDC_RTR); if (rv) { /* if cannot send NACK - reset channel */ + mutex_enter(&ldcp->tx_lock); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); rv = ECONNRESET; break; } @@ -1168,7 +1208,9 @@ i_ldc_process_RTR(ldc_chan_t *ldcp, ldc_msg_t *msg) ldcp->id); /* Reset the channel -- as we cannot continue */ + mutex_enter(&ldcp->tx_lock); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); rv = ECONNRESET; break; } @@ -1190,7 +1232,9 @@ i_ldc_process_RTR(ldc_chan_t *ldcp, ldc_msg_t *msg) cmn_err(CE_NOTE, "i_ldc_process_RTR: (0x%lx) cannot send RDX\n", ldcp->id); + mutex_enter(&ldcp->tx_lock); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); return (ECONNRESET); } D2(ldcp->id, @@ 
-1224,7 +1268,9 @@ i_ldc_process_RDX(ldc_chan_t *ldcp, ldc_msg_t *msg) ldcp->id); /* Reset the channel -- as we cannot continue */ + mutex_enter(&ldcp->tx_lock); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); rv = ECONNRESET; break; @@ -1239,7 +1285,9 @@ i_ldc_process_RDX(ldc_chan_t *ldcp, ldc_msg_t *msg) DWARN(DBG_ALL_LDCS, "i_ldc_process_RDX: (0x%llx) unexpected RDX" " - LDC reset\n", ldcp->id); + mutex_enter(&ldcp->tx_lock); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); return (ECONNRESET); } @@ -1255,7 +1303,9 @@ i_ldc_process_RDX(ldc_chan_t *ldcp, ldc_msg_t *msg) ldcp->id); /* Reset the channel -- as we cannot continue */ + mutex_enter(&ldcp->tx_lock); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); rv = ECONNRESET; break; } @@ -1273,8 +1323,11 @@ i_ldc_process_data_ACK(ldc_chan_t *ldcp, ldc_msg_t *msg) uint64_t tx_head; ldc_msg_t *pkt; + /* Obtain Tx lock */ + mutex_enter(&ldcp->tx_lock); + /* - * Read the curret Tx head and tail + * Read the current Tx head and tail */ rv = hv_ldc_tx_get_state(ldcp->id, &ldcp->tx_head, &ldcp->tx_tail, &ldcp->link_state); @@ -1282,7 +1335,11 @@ i_ldc_process_data_ACK(ldc_chan_t *ldcp, ldc_msg_t *msg) cmn_err(CE_WARN, "i_ldc_process_data_ACK: (0x%lx) cannot read qptrs\n", ldcp->id); - return (0); + + /* Reset the channel -- as we cannot continue */ + i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); + return (ECONNRESET); } /* @@ -1310,10 +1367,15 @@ i_ldc_process_data_ACK(ldc_chan_t *ldcp, ldc_msg_t *msg) DWARN(ldcp->id, "i_ldc_process_data_ACK: (0x%llx) invalid ACKid\n", ldcp->id); - break; + + /* Reset the channel -- as we cannot continue */ + i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); + return (ECONNRESET); } } + mutex_exit(&ldcp->tx_lock); return (0); } @@ -1353,8 +1415,10 @@ i_ldc_ctrlmsg(ldc_chan_t *ldcp, ldc_msg_t *msg) switch (msg->ctrl & LDC_CTRL_MASK) { case LDC_VER: /* peer is redoing version negotiation */ + mutex_enter(&ldcp->tx_lock); (void) i_ldc_txq_reconf(ldcp); i_ldc_reset_state(ldcp); + mutex_exit(&ldcp->tx_lock); rv = EAGAIN; break; case LDC_RTS: @@ -1387,8 +1451,10 @@ i_ldc_ctrlmsg(ldc_chan_t *ldcp, ldc_msg_t *msg) "i_ldc_ctrlmsg: (0x%llx) unexpected VER " "- LDC reset\n", ldcp->id); /* peer is redoing version negotiation */ + mutex_enter(&ldcp->tx_lock); (void) i_ldc_txq_reconf(ldcp); i_ldc_reset_state(ldcp); + mutex_exit(&ldcp->tx_lock); rv = EAGAIN; break; @@ -1472,20 +1538,28 @@ i_ldc_unregister_channel(ldc_chan_t *ldcp) if (ldcp->tstate & TS_CNEX_RDY) { + /* Remove the Rx interrupt */ rv = cinfo->rem_intr(cinfo->dip, ldcp->id, CNEX_RX_INTR); if (rv) { DWARN(ldcp->id, "i_ldc_unregister_channel: err removing Rx intr\n"); + return (rv); } + + /* Remove the Tx interrupt */ rv = cinfo->rem_intr(cinfo->dip, ldcp->id, CNEX_TX_INTR); if (rv) { DWARN(ldcp->id, "i_ldc_unregister_channel: err removing Tx intr\n"); + return (rv); } + + /* Unregister the channel */ rv = cinfo->unreg_chan(ldcssp->cinfo.dip, ldcp->id); if (rv) { DWARN(ldcp->id, "i_ldc_unregister_channel: cannot unreg channel\n"); + return (rv); } ldcp->tstate &= ~TS_CNEX_RDY; @@ -1520,12 +1594,16 @@ i_ldc_tx_hdlr(caddr_t arg1, caddr_t arg2) /* Lock channel */ mutex_enter(&ldcp->lock); + /* Obtain Tx lock */ + mutex_enter(&ldcp->tx_lock); + rv = hv_ldc_tx_get_state(ldcp->id, &ldcp->tx_head, &ldcp->tx_tail, &ldcp->link_state); if (rv) { cmn_err(CE_WARN, "i_ldc_tx_hdlr: (0x%lx) cannot read queue ptrs rv=0x%d\n", ldcp->id, rv); + mutex_exit(&ldcp->tx_lock); mutex_exit(&ldcp->lock); return (DDI_INTR_CLAIMED); } @@ -1565,6 +1643,7 @@ i_ldc_tx_hdlr(caddr_t arg1, 
caddr_t arg2) ldcp->cb_inprogress = B_TRUE; /* Unlock channel */ + mutex_exit(&ldcp->tx_lock); mutex_exit(&ldcp->lock); if (notify_client) { @@ -1603,6 +1682,7 @@ i_ldc_rx_hdlr(caddr_t arg1, caddr_t arg2) ldc_chan_t *ldcp; boolean_t notify_client = B_FALSE; uint64_t notify_event = 0; + uint64_t first_fragment = 0; /* Get the channel for which interrupt was received */ if (arg1 == NULL) { @@ -1645,7 +1725,9 @@ i_ldc_rx_hdlr(caddr_t arg1, caddr_t arg2) if (ldcp->link_state == LDC_CHANNEL_DOWN) { D1(ldcp->id, "i_ldc_rx_hdlr: channel link down\n", ldcp->id); + mutex_enter(&ldcp->tx_lock); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); notify_client = B_TRUE; notify_event = LDC_EVT_DOWN; break; @@ -1653,7 +1735,9 @@ i_ldc_rx_hdlr(caddr_t arg1, caddr_t arg2) if (ldcp->link_state == LDC_CHANNEL_RESET) { D1(ldcp->id, "i_ldc_rx_hdlr: channel link reset\n", ldcp->id); + mutex_enter(&ldcp->tx_lock); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); notify_client = B_TRUE; notify_event = LDC_EVT_RESET; } @@ -1715,11 +1799,11 @@ i_ldc_rx_hdlr(caddr_t arg1, caddr_t arg2) "q_ptrs=0x%lx,0x%lx", ldcp->id, rx_head, rx_tail); /* Reset last_msg_rcd to start of message */ - if (ldcp->first_fragment != 0) { - ldcp->last_msg_rcd = - ldcp->first_fragment - 1; - ldcp->first_fragment = 0; + if (first_fragment != 0) { + ldcp->last_msg_rcd = first_fragment - 1; + first_fragment = 0; } + /* * Send a NACK due to seqid mismatch */ @@ -1730,6 +1814,13 @@ i_ldc_rx_hdlr(caddr_t arg1, caddr_t arg2) cmn_err(CE_NOTE, "i_ldc_rx_hdlr: (0x%lx) err sending " "CTRL/NACK msg\n", ldcp->id); + + /* if cannot send NACK - reset channel */ + mutex_enter(&ldcp->tx_lock); + i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); + rv = ECONNRESET; + break; } /* purge receive queue */ @@ -1769,7 +1860,11 @@ i_ldc_rx_hdlr(caddr_t arg1, caddr_t arg2) /* process data ACKs */ if ((msg->type & LDC_DATA) && (msg->stype & LDC_ACK)) { - (void) i_ldc_process_data_ACK(ldcp, msg); + if (rv = i_ldc_process_data_ACK(ldcp, msg)) { + notify_client = B_TRUE; + notify_event = LDC_EVT_RESET; + break; + } } /* move the head one position */ @@ -1878,11 +1973,24 @@ ldc_init(uint64_t id, ldc_attr_t *attr, ldc_handle_t *handle) /* Allocate an ldcp structure */ ldcp = kmem_zalloc(sizeof (ldc_chan_t), KM_SLEEP); - /* Initialize the channel lock */ + /* + * Initialize the channel and Tx lock + * + * The channel 'lock' protects the entire channel and + * should be acquired before initializing, resetting, + * destroying or reading from a channel. + * + * The 'tx_lock' should be acquired prior to transmitting + * data over the channel. The lock should also be acquired + * prior to channel reconfiguration (in order to prevent + * concurrent writes). + * + * ORDERING: When both locks are being acquired, to prevent + * deadlocks, the channel lock should be always acquired prior + * to the tx_lock. 
+ */ mutex_init(&ldcp->lock, NULL, MUTEX_DRIVER, NULL); - - /* Channel specific processing */ - mutex_enter(&ldcp->lock); + mutex_init(&ldcp->tx_lock, NULL, MUTEX_DRIVER, NULL); /* Initialize the channel */ ldcp->id = id; @@ -1996,8 +2104,6 @@ ldc_init(uint64_t id, ldc_attr_t *attr, ldc_handle_t *handle) /* mark status as INITialized */ ldcp->status = LDC_INIT; - mutex_exit(&ldcp->lock); - /* Add to channel list */ mutex_enter(&ldcssp->lock); ldcp->next = ldcssp->chan_list; @@ -2025,7 +2131,7 @@ cleanup_on_exit: contig_mem_free((caddr_t)ldcp->rx_q_va, (ldcp->rx_q_entries << LDC_PACKET_SHIFT)); - mutex_exit(&ldcp->lock); + mutex_destroy(&ldcp->tx_lock); mutex_destroy(&ldcp->lock); if (ldcp) @@ -2121,6 +2227,7 @@ ldc_fini(ldc_handle_t handle) mutex_exit(&ldcp->lock); /* Destroy mutex */ + mutex_destroy(&ldcp->tx_lock); mutex_destroy(&ldcp->lock); /* free channel structure */ @@ -2289,7 +2396,7 @@ int ldc_close(ldc_handle_t handle) { ldc_chan_t *ldcp; - int rv = 0; + int rv = 0, retries = 0; boolean_t chk_done = B_FALSE; if (handle == NULL) { @@ -2331,6 +2438,9 @@ ldc_close(ldc_handle_t handle) return (EBUSY); } + /* Obtain Tx lock */ + mutex_enter(&ldcp->tx_lock); + /* * Wait for pending transmits to complete i.e Tx queue to drain * if there are pending pkts - wait 1 ms and retry again @@ -2342,6 +2452,7 @@ ldc_close(ldc_handle_t handle) if (rv) { cmn_err(CE_WARN, "ldc_close: (0x%lx) cannot read qptrs\n", ldcp->id); + mutex_exit(&ldcp->tx_lock); mutex_exit(&ldcp->lock); return (EIO); } @@ -2366,13 +2477,27 @@ ldc_close(ldc_handle_t handle) /* * Unregister the channel with the nexus */ - rv = i_ldc_unregister_channel(ldcp); - if (rv && rv != EAGAIN) { - cmn_err(CE_WARN, - "ldc_close: (0x%lx) channel unregister failed\n", - ldcp->id); + while ((rv = i_ldc_unregister_channel(ldcp)) != 0) { + + mutex_exit(&ldcp->tx_lock); mutex_exit(&ldcp->lock); - return (rv); + + /* if any error other than EAGAIN return back */ + if (rv != EAGAIN || retries >= LDC_MAX_RETRIES) { + cmn_err(CE_WARN, + "ldc_close: (0x%lx) unregister failed, %d\n", + ldcp->id, rv); + return (rv); + } + + /* + * As there could be pending interrupts we need + * to wait and try again + */ + drv_usecwait(LDC_DELAY); + mutex_enter(&ldcp->lock); + mutex_enter(&ldcp->tx_lock); + retries++; } /* @@ -2383,6 +2508,7 @@ ldc_close(ldc_handle_t handle) cmn_err(CE_WARN, "ldc_close: (0x%lx) channel TX queue unconf failed\n", ldcp->id); + mutex_exit(&ldcp->tx_lock); mutex_exit(&ldcp->lock); return (EIO); } @@ -2391,6 +2517,7 @@ ldc_close(ldc_handle_t handle) cmn_err(CE_WARN, "ldc_close: (0x%lx) channel RX queue unconf failed\n", ldcp->id); + mutex_exit(&ldcp->tx_lock); mutex_exit(&ldcp->lock); return (EIO); } @@ -2406,6 +2533,7 @@ ldc_close(ldc_handle_t handle) ldcp->tstate = TS_INIT; ldcp->status = LDC_INIT; + mutex_exit(&ldcp->tx_lock); mutex_exit(&ldcp->lock); /* Decrement number of open channels */ @@ -2557,11 +2685,14 @@ ldc_up(ldc_handle_t handle) return (0); } + mutex_enter(&ldcp->tx_lock); + /* get the current tail for the LDC msg */ rv = i_ldc_get_tx_tail(ldcp, &tx_tail); if (rv) { DWARN(ldcp->id, "ldc_up: (0x%llx) cannot initiate handshake\n", ldcp->id); + mutex_exit(&ldcp->tx_lock); mutex_exit(&ldcp->lock); return (ECONNREFUSED); } @@ -2586,6 +2717,7 @@ ldc_up(ldc_handle_t handle) DWARN(ldcp->id, "ldc_up: (0x%llx) cannot initiate handshake rv=%d\n", ldcp->id, rv); + mutex_exit(&ldcp->tx_lock); mutex_exit(&ldcp->lock); return (rv); } @@ -2594,6 +2726,7 @@ ldc_up(ldc_handle_t handle) ldcp->tx_tail = tx_tail; D1(ldcp->id, "ldc_up: 
(0x%llx) channel up initiated\n", ldcp->id); + mutex_exit(&ldcp->tx_lock); mutex_exit(&ldcp->lock); return (rv); @@ -2615,7 +2748,9 @@ ldc_reset(ldc_handle_t handle) ldcp = (ldc_chan_t *)handle; mutex_enter(&ldcp->lock); + mutex_enter(&ldcp->tx_lock); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); mutex_exit(&ldcp->lock); return (0); @@ -2736,7 +2871,9 @@ ldc_chkq(ldc_handle_t handle, boolean_t *isempty) /* reset the channel state if the channel went down */ if (ldcp->link_state == LDC_CHANNEL_DOWN || ldcp->link_state == LDC_CHANNEL_RESET) { + mutex_enter(&ldcp->tx_lock); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); mutex_exit(&ldcp->lock); return (ECONNRESET); } @@ -2839,7 +2976,9 @@ i_ldc_read_raw(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep) /* reset the channel state if the channel went down */ if (ldcp->link_state == LDC_CHANNEL_DOWN) { + mutex_enter(&ldcp->tx_lock); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); return (ECONNRESET); } @@ -2886,14 +3025,12 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep) size_t len = 0, bytes_read = 0; int retries = 0; uint64_t q_size_mask; + uint64_t first_fragment = 0; target = target_bufp; ASSERT(mutex_owned(&ldcp->lock)); - /* reset first frag to 0 */ - ldcp->first_fragment = 0; - /* compute mask for increment */ q_size_mask = (ldcp->rx_q_entries-1)<<LDC_PACKET_SHIFT; @@ -2913,7 +3050,9 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep) /* reset the channel state if the channel went down */ if (ldcp->link_state == LDC_CHANNEL_DOWN) { + mutex_enter(&ldcp->tx_lock); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); return (ECONNRESET); } @@ -2930,7 +3069,9 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep) } /* reset the channel state if the channel went down */ if (ldcp->link_state == LDC_CHANNEL_DOWN) { + mutex_enter(&ldcp->tx_lock); i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); return (ECONNRESET); } } @@ -2938,7 +3079,7 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep) if (curr_head == rx_tail) { /* If in the middle of a fragmented xfer */ - if (ldcp->first_fragment != 0) { + if (first_fragment != 0) { /* wait for ldc_delay usecs */ drv_usecwait(ldc_delay); @@ -2947,7 +3088,7 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep) continue; *sizep = 0; - ldcp->last_msg_rcd = ldcp->first_fragment - 1; + ldcp->last_msg_rcd = first_fragment - 1; DWARN(DBG_ALL_LDCS, "ldc_read: (0x%llx) read timeout", ldcp->id); @@ -2978,10 +3119,9 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep) bytes_read = 0; /* Reset last_msg_rcd to start of message */ - if (ldcp->first_fragment != 0) { - ldcp->last_msg_rcd = - ldcp->first_fragment - 1; - ldcp->first_fragment = 0; + if (first_fragment != 0) { + ldcp->last_msg_rcd = first_fragment - 1; + first_fragment = 0; } /* * Send a NACK -- invalid seqid @@ -2993,6 +3133,13 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep) cmn_err(CE_NOTE, "ldc_read: (0x%lx) err sending " "NACK msg\n", ldcp->id); + + /* if cannot send NACK - reset channel */ + mutex_enter(&ldcp->tx_lock); + i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); + rv = ECONNRESET; + break; } /* purge receive queue */ @@ -3021,7 +3168,11 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep) /* process data ACKs */ if ((msg->type & LDC_DATA) && (msg->stype & LDC_ACK)) { - (void) i_ldc_process_data_ACK(ldcp, msg); + if (rv = i_ldc_process_data_ACK(ldcp, msg)) { + 
*sizep = 0; + bytes_read = 0; + break; + } } /* process data messages */ @@ -3047,7 +3198,7 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep) * currently expensive. */ - if (ldcp->first_fragment == 0) { + if (first_fragment == 0) { /* * first packets should always have the start @@ -3074,7 +3225,7 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep) continue; } - ldcp->first_fragment = msg->seqid; + first_fragment = msg->seqid; } else { /* check to see if this is a pkt w/ START bit */ if (msg->env & LDC_FRAG_START) { @@ -3089,7 +3240,7 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep) /* throw data we have read so far */ bytes_read = 0; target = target_bufp; - ldcp->first_fragment = msg->seqid; + first_fragment = msg->seqid; if (rv = i_ldc_set_rx_head(ldcp, curr_head)) @@ -3113,7 +3264,7 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep) "head=0x%lx, expect=%d, got=%d\n", ldcp->id, curr_head, *sizep, bytes_read+len); - ldcp->first_fragment = 0; + first_fragment = 0; target = target_bufp; bytes_read = 0; @@ -3173,10 +3324,15 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep) ldcp->mode == LDC_MODE_STREAM)) { rv = i_ldc_send_pkt(ldcp, LDC_DATA, LDC_ACK, 0); - if (rv != 0) { + if (rv) { cmn_err(CE_NOTE, "ldc_read: (0x%lx) cannot send ACK\n", ldcp->id); - return (0); + + /* if cannot send ACK - reset channel */ + mutex_enter(&ldcp->tx_lock); + i_ldc_reset(ldcp); + mutex_exit(&ldcp->tx_lock); + rv = ECONNRESET; } } @@ -3250,20 +3406,28 @@ ldc_write(ldc_handle_t handle, caddr_t buf, size_t *sizep) } ldcp = (ldc_chan_t *)handle; - mutex_enter(&ldcp->lock); + /* check if writes can occur */ + if (!mutex_tryenter(&ldcp->tx_lock)) { + /* + * Could not get the lock - channel could + * be in the process of being unconfigured + * or reader has encountered an error + */ + return (EAGAIN); + } /* check if non-zero data to write */ if (buf == NULL || sizep == NULL) { DWARN(ldcp->id, "ldc_write: (0x%llx) invalid data write\n", ldcp->id); - mutex_exit(&ldcp->lock); + mutex_exit(&ldcp->tx_lock); return (EINVAL); } if (*sizep == 0) { DWARN(ldcp->id, "ldc_write: (0x%llx) write size of zero\n", ldcp->id); - mutex_exit(&ldcp->lock); + mutex_exit(&ldcp->tx_lock); return (0); } @@ -3278,7 +3442,7 @@ ldc_write(ldc_handle_t handle, caddr_t buf, size_t *sizep) rv = ldcp->write_p(ldcp, buf, sizep); } - mutex_exit(&ldcp->lock); + mutex_exit(&ldcp->tx_lock); return (rv); } @@ -3295,7 +3459,7 @@ i_ldc_write_raw(ldc_chan_t *ldcp, caddr_t buf, size_t *sizep) int rv = 0; size_t size; - ASSERT(mutex_owned(&ldcp->lock)); + ASSERT(MUTEX_HELD(&ldcp->tx_lock)); ASSERT(ldcp->mode == LDC_MODE_RAW); size = *sizep; @@ -3326,8 +3490,22 @@ i_ldc_write_raw(ldc_chan_t *ldcp, caddr_t buf, size_t *sizep) ldcp->link_state == LDC_CHANNEL_RESET) { DWARN(ldcp->id, "ldc_write: (0x%llx) channel down/reset\n", ldcp->id); - i_ldc_reset(ldcp); + *sizep = 0; + if (mutex_tryenter(&ldcp->lock)) { + i_ldc_reset(ldcp); + mutex_exit(&ldcp->lock); + } else { + /* + * Release Tx lock, and then reacquire channel + * and Tx lock in correct order + */ + mutex_exit(&ldcp->tx_lock); + mutex_enter(&ldcp->lock); + mutex_enter(&ldcp->tx_lock); + i_ldc_reset(ldcp); + mutex_exit(&ldcp->lock); + } return (ECONNRESET); } @@ -3349,10 +3527,10 @@ i_ldc_write_raw(ldc_chan_t *ldcp, caddr_t buf, size_t *sizep) /* Send the data now */ ldcmsg = (ldc_msg_t *)(ldcp->tx_q_va + tx_tail); - /* copy the data into pkt */ + /* copy the data into pkt */ 
bcopy((uint8_t *)buf, ldcmsg, size); - /* increment tail */ + /* increment tail */ tx_tail = new_tail; /* @@ -3368,9 +3546,21 @@ i_ldc_write_raw(ldc_chan_t *ldcp, caddr_t buf, size_t *sizep) return (EWOULDBLOCK); } - /* cannot write data - reset channel */ - i_ldc_reset(ldcp); *sizep = 0; + if (mutex_tryenter(&ldcp->lock)) { + i_ldc_reset(ldcp); + mutex_exit(&ldcp->lock); + } else { + /* + * Release Tx lock, and then reacquire channel + * and Tx lock in correct order + */ + mutex_exit(&ldcp->tx_lock); + mutex_enter(&ldcp->lock); + mutex_enter(&ldcp->tx_lock); + i_ldc_reset(ldcp); + mutex_exit(&ldcp->lock); + } return (ECONNRESET); } @@ -3403,7 +3593,7 @@ i_ldc_write_packet(ldc_chan_t *ldcp, caddr_t buf, size_t *size) int rv; uint32_t curr_seqid; - ASSERT(mutex_owned(&ldcp->lock)); + ASSERT(MUTEX_HELD(&ldcp->tx_lock)); ASSERT(ldcp->mode == LDC_MODE_RELIABLE || ldcp->mode == LDC_MODE_UNRELIABLE || @@ -3427,7 +3617,20 @@ i_ldc_write_packet(ldc_chan_t *ldcp, caddr_t buf, size_t *size) DWARN(ldcp->id, "ldc_write: (0x%llx) channel down/reset\n", ldcp->id); *size = 0; - i_ldc_reset(ldcp); + if (mutex_tryenter(&ldcp->lock)) { + i_ldc_reset(ldcp); + mutex_exit(&ldcp->lock); + } else { + /* + * Release Tx lock, and then reacquire channel + * and Tx lock in correct order + */ + mutex_exit(&ldcp->tx_lock); + mutex_enter(&ldcp->lock); + mutex_enter(&ldcp->tx_lock); + i_ldc_reset(ldcp); + mutex_exit(&ldcp->lock); + } return (ECONNRESET); } @@ -3522,9 +3725,21 @@ i_ldc_write_packet(ldc_chan_t *ldcp, caddr_t buf, size_t *size) int rv2; if (rv != EWOULDBLOCK) { - /* cannot write data - reset channel */ - i_ldc_reset(ldcp); *size = 0; + if (mutex_tryenter(&ldcp->lock)) { + i_ldc_reset(ldcp); + mutex_exit(&ldcp->lock); + } else { + /* + * Release Tx lock, and then reacquire channel + * and Tx lock in correct order + */ + mutex_exit(&ldcp->tx_lock); + mutex_enter(&ldcp->lock); + mutex_enter(&ldcp->tx_lock); + i_ldc_reset(ldcp); + mutex_exit(&ldcp->lock); + } return (ECONNRESET); } @@ -3560,7 +3775,7 @@ i_ldc_write_packet(ldc_chan_t *ldcp, caddr_t buf, size_t *size) static int i_ldc_write_stream(ldc_chan_t *ldcp, caddr_t buf, size_t *sizep) { - ASSERT(mutex_owned(&ldcp->lock)); + ASSERT(MUTEX_HELD(&ldcp->tx_lock)); ASSERT(ldcp->mode == LDC_MODE_STREAM); /* Truncate packet to max of MTU size */ @@ -4692,7 +4907,7 @@ ldc_mem_map(ldc_mem_handle_t mhandle, ldc_mem_cookie_t *cookie, uint32_t ccount, } D1(ldcp->id, "ldc_mem_map: (0x%llx) cookie = 0x%llx,0x%llx\n", - mhandle, cookie->addr, cookie->size); + ldcp->id, cookie->addr, cookie->size); /* FUTURE: get the page size, pgsz code, and shift */ pg_size = MMU_PAGESIZE; diff --git a/usr/src/uts/sun4v/io/vdc.c b/usr/src/uts/sun4v/io/vdc.c index 6502c8394a..a04c57e32d 100644 --- a/usr/src/uts/sun4v/io/vdc.c +++ b/usr/src/uts/sun4v/io/vdc.c @@ -69,6 +69,7 @@ #include <sys/mdeg.h> #include <sys/note.h> #include <sys/open.h> +#include <sys/sdt.h> #include <sys/stat.h> #include <sys/sunddi.h> #include <sys/types.h> @@ -152,7 +153,6 @@ static int vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, static int vdc_wait_for_descriptor_update(vdc_t *vdc, uint_t idx, vio_dring_msg_t dmsg); static int vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx); -static int vdc_get_response(vdc_t *vdc, int start, int end); static int vdc_populate_mem_hdl(vdc_t *vdc, uint_t idx, caddr_t addr, size_t nbytes, int operation); static boolean_t vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg, int @@ -162,19 +162,26 @@ static boolean_t vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t 
*dring_msg, int static int vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode); static int vdc_create_fake_geometry(vdc_t *vdc); static int vdc_setup_disk_layout(vdc_t *vdc); -static int vdc_null_copy_func(void *from, void *to, int mode, int dir); -static int vdc_get_vtoc_convert(void *from, void *to, int mode, int dir); -static int vdc_set_vtoc_convert(void *from, void *to, int mode, int dir); -static int vdc_get_geom_convert(void *from, void *to, int mode, int dir); -static int vdc_set_geom_convert(void *from, void *to, int mode, int dir); -static int vdc_uscsicmd_convert(void *from, void *to, int mode, int dir); +static int vdc_null_copy_func(vdc_t *vdc, void *from, void *to, + int mode, int dir); +static int vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to, + int mode, int dir); +static int vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, + int mode, int dir); +static int vdc_get_geom_convert(vdc_t *vdc, void *from, void *to, + int mode, int dir); +static int vdc_set_geom_convert(vdc_t *vdc, void *from, void *to, + int mode, int dir); +static int vdc_uscsicmd_convert(vdc_t *vdc, void *from, void *to, + int mode, int dir); /* * Module variables */ uint64_t vdc_hz_timeout; uint64_t vdc_usec_timeout = VDC_USEC_TIMEOUT_MIN; -uint64_t vdc_dump_usec_timeout = VDC_USEC_TIMEOUT_MIN / 300; +uint64_t vdc_usec_timeout_dump = VDC_USEC_TIMEOUT_MIN / 300; +uint64_t vdc_usec_timeout_dring = 10 * MILLISEC; static int vdc_retries = VDC_RETRIES; static int vdc_dump_retries = VDC_RETRIES * 10; @@ -932,18 +939,38 @@ vdc_print(dev_t dev, char *str) static int vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) { - int rv = 0; - size_t nbytes = (nblk * DEV_BSIZE); - int instance = SDUNIT(getminor(dev)); - vdc_t *vdc; + buf_t *buf; /* BWRITE requests need to be in a buf_t structure */ + int rv; + size_t nbytes = nblk * DEV_BSIZE; + int instance = SDUNIT(getminor(dev)); + vdc_t *vdc = NULL; if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { vdc_msg("%s (%d): Could not get state.", __func__, instance); return (ENXIO); } - rv = vdc_populate_descriptor(vdc, addr, nbytes, VD_OP_BWRITE, - blkno, SDPART(getminor(dev))); + buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); + bioinit(buf); + buf->b_un.b_addr = addr; + buf->b_bcount = nbytes; + buf->b_flags = B_BUSY | B_WRITE; + buf->b_dev = dev; + rv = vdc_populate_descriptor(vdc, (caddr_t)buf, nbytes, + VD_OP_BWRITE, blkno, SDPART(getminor(dev))); + + /* + * If the OS instance is panicking, the call above will ensure that + * the descriptor is done before returning. This should always be + * case when coming through this function but we check just in case + * and wait if necessary for the vDisk server to ACK and trigger + * the biodone. 
+ */ + if (!ddi_in_panic()) + rv = biowait(buf); + + biofini(buf); + kmem_free(buf, sizeof (buf_t)); PR1("%s: status=%d\n", __func__, rv); @@ -983,22 +1010,32 @@ vdc_strategy(struct buf *buf) return (0); } + DTRACE_IO2(vstart, buf_t *, buf, vdc_t *, vdc); + ASSERT(buf->b_bcount <= (vdc->max_xfer_sz * vdc->block_size)); if (!vdc_is_able_to_tx_data(vdc, O_NONBLOCK)) { - vdc_msg("%s: Not ready to transmit data", __func__); + PR0("%s: Not ready to transmit data\n", __func__); bioerror(buf, ENXIO); biodone(buf); return (0); } bp_mapin(buf); - rv = vdc_populate_descriptor(vdc, buf->b_un.b_addr, buf->b_bcount, op, + rv = vdc_populate_descriptor(vdc, (caddr_t)buf, buf->b_bcount, op, buf->b_lblkno, SDPART(getminor(buf->b_edev))); - PR1("%s: status=%d", __func__, rv); - bioerror(buf, rv); - biodone(buf); + /* + * If the request was successfully sent, the strategy call returns and + * the ACK handler calls the bioxxx functions when the vDisk server is + * done. + */ + if (rv) { + PR0("[%d] Failed to read/write (err=%d)\n", instance, rv); + bioerror(buf, rv); + biodone(buf); + } + return (0); } @@ -1900,6 +1937,8 @@ vdc_destroy_descriptor_ring(vdc_t *vdc) * * Description: * This function gets the index of the next Descriptor Ring entry available + * If the ring is full, it will back off and wait for the next entry to be + * freed (the ACK handler will signal). * * Return Value: * 0 <= rv < VD_DRING_LEN Next available slot @@ -1910,9 +1949,9 @@ vdc_get_next_dring_entry_idx(vdc_t *vdc, uint_t num_slots_needed) { _NOTE(ARGUNUSED(num_slots_needed)) - vd_dring_entry_t *dep = NULL; /* Dring Entry Pointer */ + vd_dring_entry_t *dep = NULL; /* DRing Entry Pointer */ + vdc_local_desc_t *ldep = NULL; /* Local DRing Entry Pointer */ int idx = -1; - int start_idx = 0; ASSERT(vdc != NULL); ASSERT(vdc->dring_len == VD_DRING_LEN); @@ -1920,67 +1959,31 @@ vdc_get_next_dring_entry_idx(vdc_t *vdc, uint_t num_slots_needed) ASSERT(vdc->dring_curr_idx < VD_DRING_LEN); ASSERT(mutex_owned(&vdc->dring_lock)); - /* Start at the last entry used */ - idx = start_idx = vdc->dring_curr_idx; - - /* - * Loop through Descriptor Ring checking for a free entry until we reach - * the entry we started at. We should never come close to filling the - * Ring at any stage, instead this is just to prevent an entry which - * gets into an inconsistent state (e.g. due to a request timing out) - * from blocking progress. 
- */ - do { - /* Get the next entry after the last known index tried */ - idx = (idx + 1) % VD_DRING_LEN; - - dep = VDC_GET_DRING_ENTRY_PTR(vdc, idx); - ASSERT(dep != NULL); + /* pick the next descriptor after the last one used */ + idx = (vdc->dring_curr_idx + 1) % VD_DRING_LEN; + ldep = &vdc->local_dring[idx]; + ASSERT(ldep != NULL); + dep = ldep->dep; + ASSERT(dep != NULL); + mutex_enter(&ldep->lock); + if (dep->hdr.dstate == VIO_DESC_FREE) { + vdc->dring_curr_idx = idx; + } else { + DTRACE_PROBE(full); + (void) cv_timedwait(&ldep->cv, &ldep->lock, + VD_GET_TIMEOUT_HZ(1)); if (dep->hdr.dstate == VIO_DESC_FREE) { - ASSERT(idx >= 0); - ASSERT(idx < VD_DRING_LEN); vdc->dring_curr_idx = idx; - return (idx); - - } else if (dep->hdr.dstate == VIO_DESC_READY) { - PR0("%s: Entry %d waiting to be accepted\n", - __func__, idx); - continue; - - } else if (dep->hdr.dstate == VIO_DESC_ACCEPTED) { - PR0("%s: Entry %d waiting to be processed\n", - __func__, idx); - continue; - - } else if (dep->hdr.dstate == VIO_DESC_DONE) { - PR0("%s: Entry %d done but not marked free\n", - __func__, idx); - - /* - * If we are currently panicking, interrupts are - * disabled and we will not be getting ACKs from the - * vDisk server so we mark the descriptor ring entries - * as FREE here instead of in the ACK handler. - */ - if (panicstr) { - (void) vdc_depopulate_descriptor(vdc, idx); - dep->hdr.dstate = VIO_DESC_FREE; - vdc->local_dring[idx].flags = VIO_DESC_FREE; - } - continue; - } else { - vdc_msg("Public Descriptor Ring entry corrupted"); - mutex_enter(&vdc->lock); - vdc_reset_connection(vdc, B_FALSE); - mutex_exit(&vdc->lock); - return (-1); + PR0("[%d] Entry %d unavailable still in state %d\n", + vdc->instance, idx, dep->hdr.dstate); + idx = -1; /* indicate that the ring is full */ } + } + mutex_exit(&ldep->lock); - } while (idx != start_idx); - - return (-1); + return (idx); } /* @@ -1994,7 +1997,11 @@ vdc_get_next_dring_entry_idx(vdc_t *vdc, uint_t num_slots_needed) * * Arguments: * vdc - the soft state pointer - * addr - start address of memory region. + * addr - address of structure to be written. In the case of block + * reads and writes this structure will be a buf_t and the + * address of the data to be written will be in the b_un.b_addr + * field. Otherwise the value of addr will be the address + * to be written. * nbytes - number of bytes to read/write * operation - operation we want vds to perform (VD_OP_XXX) * arg - parameter to be sent to server (depends on VD_OP_XXX type) @@ -2031,8 +2038,8 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation, idx = vdc_get_next_dring_entry_idx(vdc, 1); if (idx == -1) { mutex_exit(&vdc->dring_lock); - vdc_msg("%s[%d]: no descriptor ring entry avail, seq=%d\n", - __func__, vdc->instance, vdc->seq_num); + PR0("[%d] no descriptor ring entry avail, last seq=%d\n", + vdc->instance, vdc->seq_num - 1); /* * Since strategy should not block we don't wait for the DRing @@ -2047,17 +2054,23 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation, ASSERT(dep != NULL); /* - * Wait for anybody still using the DRing entry to finish. - * (e.g. still waiting for vds to respond to a request) + * We now get the lock for this descriptor before dropping the overall + * DRing lock. This prevents a race condition where another vdc thread + * could grab the descriptor we selected. 
*/ + ASSERT(!MUTEX_HELD(&local_dep->lock)); mutex_enter(&local_dep->lock); + mutex_exit(&vdc->dring_lock); switch (operation) { case VD_OP_BREAD: case VD_OP_BWRITE: + local_dep->buf = (struct buf *)addr; + local_dep->addr = local_dep->buf->b_un.b_addr; PR1("buf=%p, block=%lx, nbytes=%lx\n", addr, arg, nbytes); dep->payload.addr = (diskaddr_t)arg; - rv = vdc_populate_mem_hdl(vdc, idx, addr, nbytes, operation); + rv = vdc_populate_mem_hdl(vdc, idx, local_dep->addr, + nbytes, operation); break; case VD_OP_GET_VTOC: @@ -2065,6 +2078,7 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation, case VD_OP_GET_DISKGEOM: case VD_OP_SET_DISKGEOM: case VD_OP_SCSICMD: + local_dep->addr = addr; if (nbytes > 0) { rv = vdc_populate_mem_hdl(vdc, idx, addr, nbytes, operation); @@ -2085,7 +2099,6 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation, if (rv != 0) { mutex_exit(&local_dep->lock); - mutex_exit(&vdc->dring_lock); return (rv); } @@ -2101,30 +2114,34 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation, dep->hdr.ack = 1; /* request an ACK for every message */ local_dep->flags = VIO_DESC_READY; - local_dep->addr = addr; /* * Send a msg with the DRing details to vds */ + mutex_enter(&vdc->lock); VIO_INIT_DRING_DATA_TAG(dmsg); VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdc); dmsg.dring_ident = vdc->dring_ident; dmsg.start_idx = idx; dmsg.end_idx = idx; + DTRACE_IO2(send, vio_dring_msg_t *, &dmsg, vdc_t *, vdc); + PR1("ident=0x%llx, st=%d, end=%d, seq=%d req=%d dep=%p\n", vdc->dring_ident, dmsg.start_idx, dmsg.end_idx, dmsg.seq_num, dep->payload.req_id, dep); - mutex_enter(&vdc->lock); rv = vdc_send(vdc, (caddr_t)&dmsg, &msglen); - mutex_exit(&vdc->lock); PR1("%s[%d]: ldc_write() rv=%d\n", __func__, vdc->instance, rv); if (rv != 0) { + mutex_exit(&vdc->lock); mutex_exit(&local_dep->lock); - mutex_exit(&vdc->dring_lock); vdc_msg("%s: ldc_write(%d)\n", __func__, rv); - return (EAGAIN); + + /* Clear the DRing entry */ + rv = vdc_depopulate_descriptor(vdc, idx); + + return (rv ? rv : EAGAIN); } /* @@ -2132,14 +2149,7 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation, * number to be used by the next message */ vdc->seq_num++; - - /* - * XXX - potential performance enhancement (Investigate at a later date) - * - * for calls from strategy(9E), instead of waiting for a response from - * vds, we could return at this stage and let the ACK handling code - * trigger the biodone(9F) - */ + mutex_exit(&vdc->lock); /* * When a guest is panicking, the completion of requests needs to be @@ -2170,7 +2180,7 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation, } PR1("Waiting for next packet @ %d\n", idx); - delay(drv_usectohz(vdc_dump_usec_timeout)); + drv_usecwait(vdc_usec_timeout_dump); continue; } @@ -2238,14 +2248,24 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation, } mutex_exit(&local_dep->lock); - mutex_exit(&vdc->dring_lock); return (rv); } /* - * Now watch the DRing entries we modified to get the response - * from vds. + * In the case of calls from strategy and dump (in the non-panic case), + * instead of waiting for a response from the vDisk server return now. 
+ * They will be processed asynchronously and the vdc ACK handling code + * will trigger the biodone(9F) + */ + if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { + mutex_exit(&local_dep->lock); + return (rv); + } + + /* + * In the case of synchronous calls we watch the DRing entries we + * modified and await the response from vds. */ rv = vdc_wait_for_descriptor_update(vdc, idx, dmsg); if (rv == ETIMEDOUT) { @@ -2257,7 +2277,6 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation, PR1("%s[%d] Status=%d\n", __func__, vdc->instance, rv); mutex_exit(&local_dep->lock); - mutex_exit(&vdc->dring_lock); return (rv); } @@ -2287,7 +2306,6 @@ vdc_wait_for_descriptor_update(vdc_t *vdc, uint_t idx, vio_dring_msg_t dmsg) int rv = 0; ASSERT(vdc != NULL); - ASSERT(mutex_owned(&vdc->dring_lock)); ASSERT(idx < VD_DRING_LEN); local_dep = &vdc->local_dring[idx]; ASSERT(local_dep != NULL); @@ -2329,12 +2347,12 @@ vdc_wait_for_descriptor_update(vdc_t *vdc, uint_t idx, vio_dring_msg_t dmsg) * and have never made it to the other side (vds). * (We reuse the original message but update seq ID) */ + mutex_enter(&vdc->lock); VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdc); retries = 0; - mutex_enter(&vdc->lock); status = vdc_send(vdc, (caddr_t)&dmsg, &msglen); - mutex_exit(&vdc->lock); if (status != 0) { + mutex_exit(&vdc->lock); vdc_msg("%s: Error (%d) while resending after " "timeout\n", __func__, status); status = ETIMEDOUT; @@ -2345,60 +2363,13 @@ vdc_wait_for_descriptor_update(vdc_t *vdc, uint_t idx, vio_dring_msg_t dmsg) * the sequence number to be used by the next message. */ vdc->seq_num++; + mutex_exit(&vdc->lock); } } return (status); } -static int -vdc_get_response(vdc_t *vdc, int start, int end) -{ - vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ - vd_dring_entry_t *dep = NULL; /* Dring Entry Pointer */ - int status = ENXIO; - int idx = -1; - - ASSERT(vdc != NULL); - ASSERT(start >= 0); - ASSERT(start <= VD_DRING_LEN); - ASSERT(start >= -1); - ASSERT(start <= VD_DRING_LEN); - - idx = start; - ldep = &vdc->local_dring[idx]; - ASSERT(ldep != NULL); - dep = ldep->dep; - ASSERT(dep != NULL); - - PR0("%s[%d] DRING entry=%d status=%d\n", __func__, vdc->instance, - idx, VIO_GET_DESC_STATE(dep->hdr.dstate)); - while (VIO_GET_DESC_STATE(dep->hdr.dstate) == VIO_DESC_DONE) { - if ((end != -1) && (idx > end)) - return (0); - - switch (ldep->operation) { - case VD_OP_BREAD: - case VD_OP_BWRITE: - /* call bioxxx */ - break; - default: - /* signal waiter */ - break; - } - - /* Clear the DRing entry */ - status = vdc_depopulate_descriptor(vdc, idx); - PR0("%s[%d] Status=%d\n", __func__, vdc->instance, status); - - /* loop accounting to get next DRing entry */ - idx++; - ldep = &vdc->local_dring[idx]; - dep = ldep->dep; - } - - return (status); -} /* * Function: @@ -2452,7 +2423,7 @@ vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx) bcopy(ldep->align_addr, ldep->addr, dep->payload.nbytes); kmem_free(ldep->align_addr, - sizeof (caddr_t) * dep->payload.nbytes); + sizeof (caddr_t) * P2ROUNDUP(dep->payload.nbytes, 8)); ldep->align_addr = NULL; } @@ -2536,17 +2507,20 @@ vdc_populate_mem_hdl(vdc_t *vdc, uint_t idx, caddr_t addr, size_t nbytes, */ vaddr = addr; if (((uint64_t)addr & 0x7) != 0) { + ASSERT(ldep->align_addr == NULL); ldep->align_addr = - kmem_zalloc(sizeof (caddr_t) * nbytes, KM_SLEEP); + kmem_zalloc(sizeof (caddr_t) * P2ROUNDUP(nbytes, 8), + KM_SLEEP); PR0("%s[%d] Misaligned address %lx reallocating " - "(buf=%lx entry=%d)\n", - __func__, vdc->instance, addr, 
ldep->align_addr, idx); + "(buf=%lx nb=%d op=%d entry=%d)\n", + __func__, vdc->instance, addr, ldep->align_addr, nbytes, + operation, idx); bcopy(addr, ldep->align_addr, nbytes); vaddr = ldep->align_addr; } rv = ldc_mem_bind_handle(mhdl, vaddr, P2ROUNDUP(nbytes, 8), - vdc->dring_mem_info.mtype, perm, &dep->payload.cookie[0], + LDC_SHADOW_MAP, perm, &dep->payload.cookie[0], &dep->payload.ncookies); PR1("%s[%d] bound mem handle; ncookies=%d\n", __func__, vdc->instance, dep->payload.ncookies); @@ -2556,7 +2530,7 @@ vdc_populate_mem_hdl(vdc_t *vdc, uint_t idx, caddr_t addr, size_t nbytes, __func__, vdc->instance, mhdl, addr, idx, rv); if (ldep->align_addr) { kmem_free(ldep->align_addr, - sizeof (caddr_t) * dep->payload.nbytes); + sizeof (caddr_t) * P2ROUNDUP(nbytes, 8)); ldep->align_addr = NULL; } return (EAGAIN); @@ -2986,7 +2960,7 @@ static int vdc_process_data_msg(vdc_t *vdc, vio_msg_t msg) { int status = 0; - vdc_local_desc_t *local_dep = NULL; + vdc_local_desc_t *ldep = NULL; vio_dring_msg_t *dring_msg = NULL; uint_t num_msgs; uint_t start; @@ -3010,6 +2984,8 @@ vdc_process_data_msg(vdc_t *vdc, vio_msg_t msg) return (EPROTO); } + DTRACE_IO2(recv, vio_dring_msg_t, dring_msg, vdc_t *, vdc); + /* * calculate the number of messages that vds ACK'ed * @@ -3031,12 +3007,32 @@ vdc_process_data_msg(vdc_t *vdc, vio_msg_t msg) * Wake the thread waiting for each DRing entry ACK'ed */ for (i = 0; i < num_msgs; i++) { + int operation; int idx = (start + i) % VD_DRING_LEN; - local_dep = &vdc->local_dring[idx]; - mutex_enter(&local_dep->lock); - cv_signal(&local_dep->cv); - mutex_exit(&local_dep->lock); + ldep = &vdc->local_dring[idx]; + mutex_enter(&ldep->lock); + operation = ldep->dep->payload.operation; + if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { + /* + * The vDisk server responds when it accepts a + * descriptor so we continue looping and process + * it when it sends the message that it is done. + */ + if (ldep->dep->hdr.dstate != VIO_DESC_DONE) { + mutex_exit(&ldep->lock); + continue; + } + bioerror(ldep->buf, ldep->dep->payload.status); + biodone(ldep->buf); + + DTRACE_IO2(vdone, buf_t *, ldep->buf, vdc_t *, vdc); + + /* Clear the DRing entry */ + status = vdc_depopulate_descriptor(vdc, idx); + } + cv_signal(&ldep->cv); + mutex_exit(&ldep->lock); } if (msg.tag.vio_subtype == VIO_SUBTYPE_NACK) { @@ -3348,6 +3344,7 @@ vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg, int num_msgs) { ASSERT(vdc != NULL); ASSERT(dring_msg != NULL); + ASSERT(mutex_owned(&vdc->lock)); /* * Check to see if the messages were responded to in the correct @@ -3357,7 +3354,7 @@ vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg, int num_msgs) * if so something is seriously wrong so we reset the connection * - a seq_num greater than what we expected is returned. 
*/
-	if (dring_msg->seq_num != (vdc->seq_num_reply + num_msgs)) {
+	if (dring_msg->seq_num < vdc->seq_num_reply) {
 		vdc_msg("%s[%d]: Bogus seq_num %d, expected %d\n",
 			__func__, vdc->instance, dring_msg->seq_num,
 			vdc->seq_num_reply + num_msgs);
@@ -3529,7 +3526,8 @@ typedef struct vdc_dk_ioctl {
 	size_t		nbytes;		/* size of structure to be copied */
 	/* function to convert between vDisk and Solaris structure formats */
-	int	(*convert)(void *vd_buf, void *ioctl_arg, int mode, int dir);
+	int	(*convert)(vdc_t *vdc, void *vd_buf, void *ioctl_arg,
+			int mode, int dir);
 } vdc_dk_ioctl_t;
 
 /*
@@ -3546,15 +3544,13 @@ static vdc_dk_ioctl_t	dk_ioctl[] = {
 		vdc_get_vtoc_convert},
 	{VD_OP_SET_VTOC, DKIOCSVTOC, sizeof (vd_vtoc_t),
 		vdc_set_vtoc_convert},
-	{VD_OP_SET_DISKGEOM, DKIOCSGEOM, sizeof (vd_geom_t),
-		vdc_get_geom_convert},
 	{VD_OP_GET_DISKGEOM, DKIOCGGEOM, sizeof (vd_geom_t),
 		vdc_get_geom_convert},
 	{VD_OP_GET_DISKGEOM, DKIOCG_PHYGEOM, sizeof (vd_geom_t),
 		vdc_get_geom_convert},
-	{VD_OP_GET_DISKGEOM, DKIOCG_VIRTGEOM, sizeof (vd_geom_t),
+	{VD_OP_GET_DISKGEOM, DKIOCG_VIRTGEOM, sizeof (vd_geom_t),
 		vdc_get_geom_convert},
-	{VD_OP_SET_DISKGEOM, DKIOCSGEOM, sizeof (vd_geom_t),
+	{VD_OP_SET_DISKGEOM, DKIOCSGEOM, sizeof (vd_geom_t),
 		vdc_set_geom_convert},
 
 	/*
@@ -3600,6 +3596,7 @@ vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode)
 	size_t		alloc_len = 0;	/* #bytes to allocate mem for */
 	caddr_t		mem_p = NULL;
 	size_t		nioctls = (sizeof (dk_ioctl)) / (sizeof (dk_ioctl[0]));
+	struct vtoc	vtoc_saved;
 
 	PR0("%s: Processing ioctl(%x) for dev %x : model %x\n",
 	    __func__, cmd, dev, ddi_model_convert_from(mode & FMODELS));
@@ -3740,13 +3737,21 @@ vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode)
 	ASSERT(alloc_len != 0);	/* sanity check */
 	mem_p = kmem_zalloc(alloc_len, KM_SLEEP);
 
+	if (cmd == DKIOCSVTOC) {
+		/*
+		 * Save a copy of the current VTOC so that we can roll back
+		 * if the setting of the new VTOC fails.
+		 */
+		bcopy(vdc->vtoc, &vtoc_saved, sizeof (struct vtoc));
+	}
+
 	/*
 	 * Call the conversion function for this ioctl which, if necessary,
 	 * converts from the Solaris format to the format ARC'ed
 	 * as part of the vDisk protocol (FWARC 2006/195)
 	 */
 	ASSERT(dk_ioctl[idx].convert != NULL);
-	rv = (dk_ioctl[idx].convert)(arg, mem_p, mode, VD_COPYIN);
+	rv = (dk_ioctl[idx].convert)(vdc, arg, mem_p, mode, VD_COPYIN);
 	if (rv != 0) {
 		PR0("%s[%d]: convert returned %d for ioctl 0x%x\n",
 		    __func__, instance, rv, cmd);
@@ -3770,20 +3775,24 @@ vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode)
 		    __func__, instance, rv, cmd);
 		if (mem_p != NULL)
 			kmem_free(mem_p, alloc_len);
+
+		if (cmd == DKIOCSVTOC) {
+			/* update of the VTOC has failed, roll back */
+			bcopy(&vtoc_saved, vdc->vtoc, sizeof (struct vtoc));
+		}
+
 		return (rv);
 	}
 
-	/*
-	 * If the VTOC has been changed, then vdc needs to update the copy
-	 * it saved in the soft state structure and try and update the device
-	 * node properties. Failing to set the properties should not cause
-	 * an error to be return the caller though.
-	 */
 	if (cmd == DKIOCSVTOC) {
-		bcopy(mem_p, vdc->vtoc, sizeof (struct vtoc));
+		/*
+		 * The VTOC has been changed, so try to update the device
+		 * node properties. Failing to set the properties should
+		 * not cause an error to be returned to the caller though.
+		 */
 		if (vdc_create_device_nodes_props(vdc)) {
 			cmn_err(CE_NOTE, "![%d] Failed to update device nodes"
-			    " properties", instance);
+			    " properties", vdc->instance);
 		}
 	}
 
@@ -3793,7 +3802,7 @@ vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode)
 	 * protocol (FWARC 2006/195) back to a format understood by
 	 * the rest of Solaris.
 	 */
-	rv = (dk_ioctl[idx].convert)(mem_p, arg, mode, VD_COPYOUT);
+	rv = (dk_ioctl[idx].convert)(vdc, mem_p, arg, mode, VD_COPYOUT);
 	if (rv != 0) {
 		PR0("%s[%d]: convert returned %d for ioctl 0x%x\n",
 		    __func__, instance, rv, cmd);
@@ -3816,8 +3825,9 @@ vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode)
 * do not need to convert the data being passed in/out to userland
 */
static int
-vdc_null_copy_func(void *from, void *to, int mode, int dir)
+vdc_null_copy_func(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
+	_NOTE(ARGUNUSED(vdc))
	_NOTE(ARGUNUSED(from))
	_NOTE(ARGUNUSED(to))
	_NOTE(ARGUNUSED(mode))
@@ -3831,9 +3841,16 @@ vdc_null_copy_func(void *from, void *to, int mode, int dir)
 * vdc_get_vtoc_convert()
 *
 * Description:
- *	This routine fakes up the disk info needed for some DKIO ioctls.
+ *	This routine performs the necessary conversions from the DKIOCGVTOC
+ *	Solaris structure to the format defined in FWARC 2006/195.
+ *
+ *	In the struct vtoc definition, the timestamp field is marked as not
+ *	supported so it is not part of the vDisk protocol (FWARC 2006/195).
+ *	However SVM uses that field to check that it can write into the VTOC,
+ *	so we fake up the info of that field.
 *
 * Arguments:
+ *	vdc	- the vDisk client
 *	from	- the buffer containing the data to be copied from
 *	to	- the buffer to be copied to
 *	mode	- flags passed to ioctl() call
@@ -3842,11 +3859,12 @@
 * Return Code:
 *	0	- Success
 *	ENXIO	- incorrect buffer passed in.
- *	EFAULT	- ddi_copyxxx routine encountered an error.
+ *	EFAULT	- ddi_copyout routine encountered an error.
 */
static int
-vdc_get_vtoc_convert(void *from, void *to, int mode, int dir)
+vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
+	int		i;
	void		*tmp_mem = NULL;
	void		*tmp_memp;
	struct vtoc	vt;
@@ -3868,6 +3886,12 @@ vdc_get_vtoc_convert(void *from, void *to, int mode, int dir)
 	tmp_mem = kmem_alloc(copy_len, KM_SLEEP);
 
 	VD_VTOC2VTOC((vd_vtoc_t *)from, &vt);
+
+	/* fake the VTOC timestamp field */
+	for (i = 0; i < V_NUMPAR; i++) {
+		vt.timestamp[i] = vdc->vtoc->timestamp[i];
+	}
+
 	if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
 		vtoctovtoc32(vt, vt32);
 		tmp_memp = &vt32;
@@ -3887,8 +3911,11 @@ vdc_get_vtoc_convert(void *from, void *to, int mode, int dir)
 * vdc_set_vtoc_convert()
 *
 * Description:
+ *	This routine performs the necessary conversions from the DKIOCSVTOC
+ *	Solaris structure to the format defined in FWARC 2006/195.
 *
 * Arguments:
+ *	vdc	- the vDisk client
 *	from	- Buffer with data
 *	to	- Buffer where data is to be copied to
 *	mode	- flags passed to ioctl
@@ -3900,7 +3927,7 @@ vdc_get_vtoc_convert(void *from, void *to, int mode, int dir)
 *	EFAULT	- ddi_copyin of data failed
 */
static int
-vdc_set_vtoc_convert(void *from, void *to, int mode, int dir)
+vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
	void		*tmp_mem = NULL;
	struct vtoc	vt;
@@ -3934,6 +3961,12 @@ vdc_set_vtoc_convert(void *from, void *to, int mode, int dir)
 		vtp = tmp_mem;
 	}
 
+	/*
+	 * The VTOC is being changed, so vdc needs to update the copy
+	 * it saved in the soft state structure.
+	 */
+	bcopy(vtp, vdc->vtoc, sizeof (struct vtoc));
+
 	VTOC2VD_VTOC(vtp, &vtvd);
 	bcopy(&vtvd, to, sizeof (vd_vtoc_t));
 	kmem_free(tmp_mem, copy_len);
@@ -3946,8 +3979,12 @@ vdc_set_vtoc_convert(void *from, void *to, int mode, int dir)
 * vdc_get_geom_convert()
 *
 * Description:
+ *	This routine performs the necessary conversions from the DKIOCGGEOM,
+ *	DKIOCG_PHYGEOM and DKIOCG_VIRTGEOM Solaris structures to the format
+ *	defined in FWARC 2006/195
 *
 * Arguments:
+ *	vdc	- the vDisk client
 *	from	- Buffer with data
 *	to	- Buffer where data is to be copied to
 *	mode	- flags passed to ioctl
@@ -3956,11 +3993,13 @@ vdc_set_vtoc_convert(void *from, void *to, int mode, int dir)
 * Return Code:
 *	0	- Success
 *	ENXIO	- Invalid buffer passed in
- *	EFAULT	- ddi_copyin of data failed
+ *	EFAULT	- ddi_copyout of data failed
 */
static int
-vdc_get_geom_convert(void *from, void *to, int mode, int dir)
+vdc_get_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
+	_NOTE(ARGUNUSED(vdc))
+
	struct dk_geom	geom;
	int	copy_len = sizeof (struct dk_geom);
	int	rv = 0;
@@ -3984,10 +4023,11 @@ vdc_get_geom_convert(void *from, void *to, int mode, int dir)
 * vdc_set_geom_convert()
 *
 * Description:
- *	This routine performs the necessary convertions from the DKIOCSVTOC
- *	Solaris structure to the format defined in FWARC 2006/195
+ *	This routine performs the necessary conversions from the DKIOCSGEOM
+ *	Solaris structure to the format defined in FWARC 2006/195.
 *
 * Arguments:
+ *	vdc	- the vDisk client
 *	from	- Buffer with data
 *	to	- Buffer where data is to be copied to
 *	mode	- flags passed to ioctl
@@ -3999,8 +4039,10 @@ vdc_get_geom_convert(void *from, void *to, int mode, int dir)
 *	EFAULT	- ddi_copyin of data failed
 */
static int
-vdc_set_geom_convert(void *from, void *to, int mode, int dir)
+vdc_set_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
+	_NOTE(ARGUNUSED(vdc))
+
	vd_geom_t	vdgeom;
	void		*tmp_mem = NULL;
	int		copy_len = sizeof (struct dk_geom);
@@ -4104,6 +4146,7 @@ vdc_create_fake_geometry(vdc_t *vdc)
static int
vdc_setup_disk_layout(vdc_t *vdc)
{
+	buf_t	*buf;	/* BREAD requests need to be in a buf_t structure */
	dev_t	dev;
	int	slice = 0;
	int	rv;
@@ -4129,14 +4172,9 @@ vdc_setup_disk_layout(vdc_t *vdc)
 	}
 
 	/*
-	 * Read disk label from start of disk
-	 */
-	vdc->label = kmem_zalloc(DK_LABEL_SIZE, KM_SLEEP);
-
-	/*
 	 * find the slice that represents the entire "disk" and use that to
 	 * read the disk label.
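The setup path below now issues the label read through the asynchronous VD_OP_BREAD machinery, so it builds a buf(9S) structure and waits on it. Here is a condensed sketch of that synchronous-read-over-an-async-transport idiom; it assumes the Solaris DDI/DKI environment (bioinit/biowait/biofini, kmem_alloc), and issue_read() is a hypothetical stand-in for the transport's queueing routine, not a function from this changeset:

    /*
     * Illustrative sketch only: describe the transfer in a buf_t, queue
     * it, then block in biowait() until biodone() is called on it.
     */
    static int
    read_block_sync(dev_t dev, caddr_t dst, size_t len,
        int (*issue_read)(buf_t *))
    {
    	buf_t	*bp = kmem_alloc(sizeof (buf_t), KM_SLEEP);
    	int	rv;

    	bioinit(bp);
    	bp->b_un.b_addr = dst;		/* where the data should land */
    	bp->b_bcount = len;		/* transfer size in bytes */
    	bp->b_flags = B_BUSY | B_READ;
    	bp->b_dev = dev;

    	rv = issue_read(bp);		/* hypothetical queueing hook */
    	if (rv == 0)
    		rv = biowait(bp);	/* sleep until the I/O completes */

    	biofini(bp);
    	kmem_free(bp, sizeof (buf_t));
    	return (rv);
    }

Note the sketch only waits when queueing succeeded; the hunk below assigns the populate and biowait() statuses to rv in turn, so it is the biowait() result that reaches the caller.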
The convention in Solaris is that slice 2 - * represents the whole disk so we check that it is otherwise we + * represents the whole disk so we check that it is, otherwise we * default to slice 0 */ if ((vdc->vdisk_type == VD_DISK_TYPE_DISK) && @@ -4145,8 +4183,22 @@ vdc_setup_disk_layout(vdc_t *vdc) } else { slice = 0; } - rv = vdc_populate_descriptor(vdc, (caddr_t)vdc->label, DK_LABEL_SIZE, + + /* + * Read disk label from start of disk + */ + vdc->label = kmem_zalloc(DK_LABEL_SIZE, KM_SLEEP); + buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); + bioinit(buf); + buf->b_un.b_addr = (caddr_t)vdc->label; + buf->b_bcount = DK_LABEL_SIZE; + buf->b_flags = B_BUSY | B_READ; + buf->b_dev = dev; + rv = vdc_populate_descriptor(vdc, (caddr_t)buf, DK_LABEL_SIZE, VD_OP_BREAD, 0, slice); + rv = biowait(buf); + biofini(buf); + kmem_free(buf, sizeof (buf_t)); return (rv); } diff --git a/usr/src/uts/sun4v/io/vds.c b/usr/src/uts/sun4v/io/vds.c index adcea0c944..7a06a331d1 100644 --- a/usr/src/uts/sun4v/io/vds.c +++ b/usr/src/uts/sun4v/io/vds.c @@ -50,9 +50,8 @@ /* Virtual disk server initialization flags */ -#define VDS_LOCKING 0x01 -#define VDS_LDI 0x02 -#define VDS_MDEG 0x04 +#define VDS_LDI 0x01 +#define VDS_MDEG 0x02 /* Virtual disk server tunable parameters */ #define VDS_LDC_RETRIES 3 @@ -71,11 +70,10 @@ /* Virtual disk initialization flags */ #define VD_LOCKING 0x01 -#define VD_TASKQ 0x02 -#define VD_LDC 0x04 -#define VD_DRING 0x08 -#define VD_SID 0x10 -#define VD_SEQ_NUM 0x20 +#define VD_LDC 0x02 +#define VD_DRING 0x04 +#define VD_SID 0x08 +#define VD_SEQ_NUM 0x10 /* Flags for opening/closing backing devices via LDI */ #define VD_OPEN_FLAGS (FEXCL | FREAD | FWRITE) @@ -135,20 +133,48 @@ #endif /* DEBUG */ +/* + * Soft state structure for a vds instance + */ typedef struct vds { uint_t initialized; /* driver inst initialization flags */ dev_info_t *dip; /* driver inst devinfo pointer */ - kmutex_t lock; /* lock for this structure */ ldi_ident_t ldi_ident; /* driver's identifier for LDI */ mod_hash_t *vd_table; /* table of virtual disks served */ mdeg_handle_t mdeg; /* handle for MDEG operations */ } vds_t; +/* + * Types of descriptor-processing tasks + */ +typedef enum vd_task_type { + VD_NONFINAL_RANGE_TASK, /* task for intermediate descriptor in range */ + VD_FINAL_RANGE_TASK, /* task for last in a range of descriptors */ +} vd_task_type_t; + +/* + * Structure describing the task for processing a descriptor + */ +typedef struct vd_task { + struct vd *vd; /* vd instance task is for */ + vd_task_type_t type; /* type of descriptor task */ + int index; /* dring elem index for task */ + vio_msg_t *msg; /* VIO message task is for */ + size_t msglen; /* length of message content */ + size_t msgsize; /* size of message buffer */ + vd_dring_payload_t *request; /* request task will perform */ + struct buf buf; /* buf(9s) for I/O request */ + +} vd_task_t; + +/* + * Soft state structure for a virtual disk instance + */ typedef struct vd { uint_t initialized; /* vdisk initialization flags */ - kmutex_t lock; /* lock for this structure */ vds_t *vds; /* server for this vdisk */ - ddi_taskq_t *taskq; /* taskq for this vdisk */ + ddi_taskq_t *startq; /* queue for I/O start tasks */ + ddi_taskq_t *completionq; /* queue for completion tasks */ ldi_handle_t ldi_handle[V_NUMPAR]; /* LDI slice handles */ dev_t dev[V_NUMPAR]; /* dev numbers for slices */ uint_t nslices; /* number for slices */ @@ -160,7 +186,6 @@ typedef struct vd { ldc_status_t ldc_state; /* LDC connection state */ ldc_handle_t ldc_handle; /* handle for 
LDC comm */ size_t max_msglen; /* largest LDC message len */ - boolean_t enabled; /* whether vdisk is enabled */ vd_state_t state; /* client handshake state */ uint8_t xfer_mode; /* transfer mode with client */ uint32_t sid; /* client's session ID */ @@ -170,11 +195,19 @@ typedef struct vd { uint32_t descriptor_size; /* num bytes in desc */ uint32_t dring_len; /* number of dring elements */ caddr_t dring; /* address of dring */ + vd_task_t inband_task; /* task for inband descriptor */ + vd_task_t *dring_task; /* tasks dring elements */ + + kmutex_t lock; /* protects variables below */ + boolean_t enabled; /* is vdisk enabled? */ + boolean_t reset_state; /* reset connection state? */ + boolean_t reset_ldc; /* reset LDC channel? */ } vd_t; typedef struct vds_operation { uint8_t operation; - int (*function)(vd_t *vd, vd_dring_payload_t *request); + int (*start)(vd_task_t *task); + void (*complete)(void *arg); } vds_operation_t; typedef struct vd_ioctl { @@ -217,86 +250,245 @@ static int vd_msglevel; static int -vd_bread(vd_t *vd, vd_dring_payload_t *request) +vd_start_bio(vd_task_t *task) { - int status; - struct buf buf; + int status = 0; + vd_t *vd = task->vd; + vd_dring_payload_t *request = task->request; + struct buf *buf = &task->buf; + + + ASSERT(vd != NULL); + ASSERT(request != NULL); + ASSERT(request->slice < vd->nslices); + ASSERT((request->operation == VD_OP_BREAD) || + (request->operation == VD_OP_BWRITE)); - PR1("Read %lu bytes at block %lu", request->nbytes, request->addr); if (request->nbytes == 0) return (EINVAL); /* no service for trivial requests */ - ASSERT(mutex_owned(&vd->lock)); - ASSERT(request->slice < vd->nslices); - bioinit(&buf); - buf.b_flags = B_BUSY | B_READ; - buf.b_bcount = request->nbytes; - buf.b_un.b_addr = kmem_alloc(buf.b_bcount, KM_SLEEP); - buf.b_lblkno = request->addr; - buf.b_edev = vd->dev[request->slice]; + PR1("%s %lu bytes at block %lu", + (request->operation == VD_OP_BREAD) ? 
"Read" : "Write", + request->nbytes, request->addr); + + bioinit(buf); + buf->b_flags = B_BUSY; + buf->b_bcount = request->nbytes; + buf->b_un.b_addr = kmem_alloc(buf->b_bcount, KM_SLEEP); + buf->b_lblkno = request->addr; + buf->b_edev = vd->dev[request->slice]; + + if (request->operation == VD_OP_BREAD) { + buf->b_flags |= B_READ; + } else { + buf->b_flags |= B_WRITE; + /* Get data to write from client */ + if ((status = ldc_mem_copy(vd->ldc_handle, buf->b_un.b_addr, 0, + &request->nbytes, request->cookie, + request->ncookies, LDC_COPY_IN)) != 0) { + PRN("ldc_mem_copy() returned errno %d " + "copying from client", status); + } + } - if ((status = ldi_strategy(vd->ldi_handle[request->slice], &buf)) == 0) - status = biowait(&buf); - biofini(&buf); + /* Start the block I/O */ if ((status == 0) && - ((status = ldc_mem_copy(vd->ldc_handle, buf.b_un.b_addr, 0, - &request->nbytes, request->cookie, request->ncookies, - LDC_COPY_OUT)) != 0)) { - PRN("ldc_mem_copy() returned errno %d copying to client", - status); - } - kmem_free(buf.b_un.b_addr, buf.b_bcount); /* nbytes can change */ + ((status = ldi_strategy(vd->ldi_handle[request->slice], buf)) == 0)) + return (EINPROGRESS); /* will complete on completionq */ + + /* Clean up after error */ + kmem_free(buf->b_un.b_addr, buf->b_bcount); + biofini(buf); return (status); } static int -vd_do_bwrite(vd_t *vd, uint_t slice, diskaddr_t block, size_t nbytes, - ldc_mem_cookie_t *cookie, uint64_t ncookies, caddr_t data) +send_msg(ldc_handle_t ldc_handle, void *msg, size_t msglen) { - int status; - struct buf buf; + int retry, status; + size_t nbytes; - ASSERT(mutex_owned(&vd->lock)); - ASSERT(slice < vd->nslices); - ASSERT(nbytes != 0); - ASSERT(data != NULL); - /* Get data from client */ - if ((status = ldc_mem_copy(vd->ldc_handle, data, 0, &nbytes, - cookie, ncookies, LDC_COPY_IN)) != 0) { - PRN("ldc_mem_copy() returned errno %d copying from client", - status); + for (retry = 0, status = EWOULDBLOCK; + retry < vds_ldc_retries && status == EWOULDBLOCK; + retry++) { + PR1("ldc_write() attempt %d", (retry + 1)); + nbytes = msglen; + status = ldc_write(ldc_handle, msg, &nbytes); + } + + if (status != 0) { + PRN("ldc_write() returned errno %d", status); return (status); + } else if (nbytes != msglen) { + PRN("ldc_write() performed only partial write"); + return (EIO); } - bioinit(&buf); - buf.b_flags = B_BUSY | B_WRITE; - buf.b_bcount = nbytes; - buf.b_un.b_addr = data; - buf.b_lblkno = block; - buf.b_edev = vd->dev[slice]; + PR1("SENT %lu bytes", msglen); + return (0); +} - if ((status = ldi_strategy(vd->ldi_handle[slice], &buf)) == 0) - status = biowait(&buf); - biofini(&buf); - return (status); +static void +vd_need_reset(vd_t *vd, boolean_t reset_ldc) +{ + mutex_enter(&vd->lock); + vd->reset_state = B_TRUE; + vd->reset_ldc = reset_ldc; + mutex_exit(&vd->lock); +} + +/* + * Reset the state of the connection with a client, if needed; reset the LDC + * transport as well, if needed. This function should only be called from the + * "startq", as it waits for tasks on the "completionq" and will deadlock if + * called from that queue. 
+ */ +static void +vd_reset_if_needed(vd_t *vd) +{ + int status = 0; + + + mutex_enter(&vd->lock); + if (!vd->reset_state) { + ASSERT(!vd->reset_ldc); + mutex_exit(&vd->lock); + return; + } + mutex_exit(&vd->lock); + + + PR0("Resetting connection state with %s", VD_CLIENT(vd)); + + /* + * Let any asynchronous I/O complete before possibly pulling the rug + * out from under it; defer checking vd->reset_ldc, as one of the + * asynchronous tasks might set it + */ + ddi_taskq_wait(vd->completionq); + + + if ((vd->initialized & VD_DRING) && + ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0)) + PRN("ldc_mem_dring_unmap() returned errno %d", status); + + if (vd->dring_task != NULL) { + ASSERT(vd->dring_len != 0); + kmem_free(vd->dring_task, + (sizeof (*vd->dring_task)) * vd->dring_len); + vd->dring_task = NULL; + } + + + mutex_enter(&vd->lock); + if (vd->reset_ldc && ((status = ldc_reset(vd->ldc_handle)) != 0)) + PRN("ldc_reset() returned errno %d", status); + + vd->initialized &= ~(VD_SID | VD_SEQ_NUM | VD_DRING); + vd->state = VD_STATE_INIT; + vd->max_msglen = sizeof (vio_msg_t); /* baseline vio message size */ + + vd->reset_state = B_FALSE; + vd->reset_ldc = B_FALSE; + mutex_exit(&vd->lock); } static int -vd_bwrite(vd_t *vd, vd_dring_payload_t *request) +vd_mark_elem_done(vd_t *vd, int idx, int elem_status) { - int status; - caddr_t data; + boolean_t accepted; + int status; + vd_dring_entry_t *elem = VD_DRING_ELEM(idx); - PR1("Write %ld bytes at block %lu", request->nbytes, request->addr); - if (request->nbytes == 0) - return (EINVAL); /* no service for trivial requests */ - data = kmem_alloc(request->nbytes, KM_SLEEP); - status = vd_do_bwrite(vd, request->slice, request->addr, - request->nbytes, request->cookie, request->ncookies, data); - kmem_free(data, request->nbytes); - return (status); + /* Acquire the element */ + if ((status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) { + PRN("ldc_mem_dring_acquire() returned errno %d", status); + return (status); + } + + /* Set the element's status and mark it done */ + accepted = (elem->hdr.dstate == VIO_DESC_ACCEPTED); + if (accepted) { + elem->payload.status = elem_status; + elem->hdr.dstate = VIO_DESC_DONE; + } else { + /* Perhaps client timed out waiting for I/O... */ + PRN("element %u no longer \"accepted\"", idx); + VD_DUMP_DRING_ELEM(elem); + } + /* Release the element */ + if ((status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) { + PRN("ldc_mem_dring_release() returned errno %d", status); + return (status); + } + + return (accepted ? 
0 : EINVAL); +} + +static void +vd_complete_bio(void *arg) +{ + int status = 0; + vd_task_t *task = (vd_task_t *)arg; + vd_t *vd = task->vd; + vd_dring_payload_t *request = task->request; + struct buf *buf = &task->buf; + + + ASSERT(vd != NULL); + ASSERT(request != NULL); + ASSERT(task->msg != NULL); + ASSERT(task->msglen >= sizeof (*task->msg)); + ASSERT(task->msgsize >= task->msglen); + + /* Wait for the I/O to complete */ + request->status = biowait(buf); + + /* If data was read, copy it to the client */ + if ((request->status == 0) && (request->operation == VD_OP_BREAD) && + ((status = ldc_mem_copy(vd->ldc_handle, buf->b_un.b_addr, 0, + &request->nbytes, request->cookie, request->ncookies, + LDC_COPY_OUT)) != 0)) { + PRN("ldc_mem_copy() returned errno %d copying to client", + status); + } + + /* Release I/O buffer */ + kmem_free(buf->b_un.b_addr, buf->b_bcount); + biofini(buf); + + /* Update the dring element for a dring client */ + if ((status == 0) && (vd->xfer_mode == VIO_DRING_MODE)) + status = vd_mark_elem_done(vd, task->index, request->status); + + /* + * If a transport error occurred, arrange to "nack" the message when + * the final task in the descriptor element range completes + */ + if (status != 0) + task->msg->tag.vio_subtype = VIO_SUBTYPE_NACK; + + /* + * Only the final task for a range of elements will respond to and + * free the message + */ + if (task->type == VD_NONFINAL_RANGE_TASK) + return; + + /* + * Send the "ack" or "nack" back to the client; if sending the message + * via LDC fails, arrange to reset both the connection state and LDC + * itself + */ + PR1("Sending %s", + (task->msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK"); + if (send_msg(vd->ldc_handle, task->msg, task->msglen) != 0) + vd_need_reset(vd, B_TRUE); + + /* Free the message now that it has been used for the reply */ + kmem_free(task->msg, task->msgsize); } static void @@ -347,7 +539,6 @@ vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, vd_ioctl_t *ioctl) size_t nbytes = request->nbytes; /* modifiable copy */ - ASSERT(mutex_owned(&vd->lock)); ASSERT(request->slice < vd->nslices); PR0("Performing %s", ioctl->operation_name); @@ -379,8 +570,8 @@ vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, vd_ioctl_t *ioctl) (void *)ioctl->arg)) != 0) return (status); } else if ((status = ldi_ioctl(vd->ldi_handle[request->slice], - ioctl->cmd, (intptr_t)ioctl->arg, FKIOCTL, kcred, - &rval)) != 0) { + ioctl->cmd, (intptr_t)ioctl->arg, (vd_open_flags | FKIOCTL), + kcred, &rval)) != 0) { PR0("ldi_ioctl(%s) = errno %d", ioctl->cmd_name, status); return (status); } @@ -453,7 +644,7 @@ vd_open_new_slices(vd_t *vd) /* Get the (new) VTOC for updated slice sizes */ if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGVTOC, (intptr_t)&vtoc, - FKIOCTL, kcred, &rval)) != 0) { + (vd_open_flags | FKIOCTL), kcred, &rval)) != 0) { PRN("ldi_ioctl(DKIOCGVTOC) returned errno %d", status); return; } @@ -483,13 +674,15 @@ vd_open_new_slices(vd_t *vd) #define RNDSIZE(expr) P2ROUNDUP(sizeof (expr), sizeof (uint64_t)) static int -vd_ioctl(vd_t *vd, vd_dring_payload_t *request) +vd_ioctl(vd_task_t *task) { - int i, status; - void *buf = NULL; - struct dk_geom dk_geom = {0}; - struct vtoc vtoc = {0}; - vd_ioctl_t ioctl[] = { + int i, status; + void *buf = NULL; + struct dk_geom dk_geom = {0}; + struct vtoc vtoc = {0}; + vd_t *vd = task->vd; + vd_dring_payload_t *request = task->request; + vd_ioctl_t ioctl[] = { /* Command (no-copy) operations */ {VD_OP_FLUSH, STRINGIZE(VD_OP_FLUSH), 0, DKIOCFLUSHWRITECACHE, 
STRINGIZE(DKIOCFLUSHWRITECACHE), @@ -522,7 +715,8 @@ vd_ioctl(vd_t *vd, vd_dring_payload_t *request) size_t nioctls = (sizeof (ioctl))/(sizeof (ioctl[0])); - ASSERT(mutex_owned(&vd->lock)); + ASSERT(vd != NULL); + ASSERT(request != NULL); ASSERT(request->slice < vd->nslices); /* @@ -554,6 +748,7 @@ vd_ioctl(vd_t *vd, vd_dring_payload_t *request) if ((request->operation == VD_OP_SET_VTOC) && (vd->vdisk_type == VD_DISK_TYPE_DISK)) vd_open_new_slices(vd); + PR0("Returning %d", status); return (status); } @@ -562,31 +757,33 @@ vd_ioctl(vd_t *vd, vd_dring_payload_t *request) * been defined */ static const vds_operation_t vds_operation[] = { - {VD_OP_BREAD, vd_bread}, - {VD_OP_BWRITE, vd_bwrite}, - {VD_OP_FLUSH, vd_ioctl}, - {VD_OP_GET_WCE, vd_ioctl}, - {VD_OP_SET_WCE, vd_ioctl}, - {VD_OP_GET_VTOC, vd_ioctl}, - {VD_OP_SET_VTOC, vd_ioctl}, - {VD_OP_GET_DISKGEOM, vd_ioctl}, - {VD_OP_SET_DISKGEOM, vd_ioctl} + {VD_OP_BREAD, vd_start_bio, vd_complete_bio}, + {VD_OP_BWRITE, vd_start_bio, vd_complete_bio}, + {VD_OP_FLUSH, vd_ioctl, NULL}, + {VD_OP_GET_WCE, vd_ioctl, NULL}, + {VD_OP_SET_WCE, vd_ioctl, NULL}, + {VD_OP_GET_VTOC, vd_ioctl, NULL}, + {VD_OP_SET_VTOC, vd_ioctl, NULL}, + {VD_OP_GET_DISKGEOM, vd_ioctl, NULL}, + {VD_OP_SET_DISKGEOM, vd_ioctl, NULL} }; static const size_t vds_noperations = (sizeof (vds_operation))/(sizeof (vds_operation[0])); /* - * Process a request using a defined operation + * Process a task specifying a client I/O request */ static int -vd_process_request(vd_t *vd, vd_dring_payload_t *request) +vd_process_task(vd_task_t *task) { - int i; + int i, status; + vd_t *vd = task->vd; + vd_dring_payload_t *request = task->request; - PR1("Entered"); - ASSERT(mutex_owned(&vd->lock)); + ASSERT(vd != NULL); + ASSERT(request != NULL); /* Range-check slice */ if (request->slice >= vd->nslices) { @@ -595,41 +792,37 @@ vd_process_request(vd_t *vd, vd_dring_payload_t *request) return (EINVAL); } - /* Perform the requested operation */ + /* Find the requested operation */ for (i = 0; i < vds_noperations; i++) if (request->operation == vds_operation[i].operation) - return (vds_operation[i].function(vd, request)); - - /* No matching operation found */ - PRN("Unsupported operation %u", request->operation); - return (ENOTSUP); -} - -static int -send_msg(ldc_handle_t ldc_handle, void *msg, size_t msglen) -{ - int retry, status; - size_t nbytes; - + break; + if (i == vds_noperations) { + PRN("Unsupported operation %u", request->operation); + return (ENOTSUP); + } - for (retry = 0, status = EWOULDBLOCK; - retry < vds_ldc_retries && status == EWOULDBLOCK; - retry++) { - PR1("ldc_write() attempt %d", (retry + 1)); - nbytes = msglen; - status = ldc_write(ldc_handle, msg, &nbytes); + /* Start the operation */ + if ((status = vds_operation[i].start(task)) != EINPROGRESS) { + request->status = status; /* op succeeded or failed */ + return (0); /* but request completed */ } - if (status != 0) { - PRN("ldc_write() returned errno %d", status); - return (status); - } else if (nbytes != msglen) { - PRN("ldc_write() performed only partial write"); - return (EIO); + ASSERT(vds_operation[i].complete != NULL); /* debug case */ + if (vds_operation[i].complete == NULL) { /* non-debug case */ + PRN("Unexpected return of EINPROGRESS " + "with no I/O completion handler"); + request->status = EIO; /* operation failed */ + return (0); /* but request completed */ } - PR1("SENT %lu bytes", msglen); - return (0); + /* Queue a task to complete the operation */ + status = ddi_taskq_dispatch(vd->completionq, 
vds_operation[i].complete, + task, DDI_SLEEP); + /* ddi_taskq_dispatch(9f) guarantees success with DDI_SLEEP */ + ASSERT(status == DDI_SUCCESS); + + PR1("Operation in progress"); + return (EINPROGRESS); /* completion handler will finish request */ } /* @@ -782,13 +975,12 @@ vd_process_attr_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) vd_attr_msg_t *attr_msg = (vd_attr_msg_t *)msg; - PR0("Entered"); - ASSERT(mutex_owned(&vd->lock)); ASSERT(msglen >= sizeof (msg->tag)); if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_ATTR_INFO)) { - return (ENOMSG); /* not an attribute message */ + PR0("Message is not an attribute message"); + return (ENOMSG); } if (msglen != sizeof (*attr_msg)) { @@ -835,6 +1027,14 @@ vd_process_attr_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) * their cookies */ vd->max_msglen = MAX(vd->max_msglen, max_inband_msglen); + + /* + * Initialize the data structure for processing in-band I/O + * request descriptors + */ + vd->inband_task.vd = vd; + vd->inband_task.index = 0; + vd->inband_task.type = VD_FINAL_RANGE_TASK; /* range == 1 */ } attr_msg->vdisk_size = vd->vdisk_size; @@ -853,13 +1053,12 @@ vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) vio_dring_reg_msg_t *reg_msg = (vio_dring_reg_msg_t *)msg; - PR0("Entered"); - ASSERT(mutex_owned(&vd->lock)); ASSERT(msglen >= sizeof (msg->tag)); if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_DRING_REG)) { - return (ENOMSG); /* not a register-dring message */ + PR0("Message is not a register-dring message"); + return (ENOMSG); } if (msglen < sizeof (*reg_msg)) { @@ -881,6 +1080,12 @@ vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) return (EBADMSG); } + if (reg_msg->num_descriptors > INT32_MAX) { + PRN("reg_msg->num_descriptors = %u; must be <= %u (%s)", + reg_msg->ncookies, INT32_MAX, STRINGIZE(INT32_MAX)); + return (EBADMSG); + } + if (reg_msg->ncookies != 1) { /* * In addition to fixing the assertion in the success case @@ -928,7 +1133,7 @@ vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) } - /* Valid message and dring mapped */ + /* Initialize for valid message and mapped dring */ PR1("descriptor size = %u, dring length = %u", vd->descriptor_size, vd->dring_len); vd->initialized |= VD_DRING; @@ -937,6 +1142,19 @@ vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) vd->descriptor_size = reg_msg->descriptor_size; vd->dring_len = reg_msg->num_descriptors; reg_msg->dring_ident = vd->dring_ident; + + /* + * Allocate and initialize a "shadow" array of data structures for + * tasks to process I/O requests in dring elements + */ + vd->dring_task = + kmem_zalloc((sizeof (*vd->dring_task)) * vd->dring_len, KM_SLEEP); + for (int i = 0; i < vd->dring_len; i++) { + vd->dring_task[i].vd = vd; + vd->dring_task[i].index = i; + vd->dring_task[i].request = &VD_DRING_ELEM(i)->payload; + } + return (0); } @@ -946,13 +1164,12 @@ vd_process_dring_unreg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) vio_dring_unreg_msg_t *unreg_msg = (vio_dring_unreg_msg_t *)msg; - PR0("Entered"); - ASSERT(mutex_owned(&vd->lock)); ASSERT(msglen >= sizeof (msg->tag)); if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_DRING_UNREG)) { - return (ENOMSG); /* not an unregister-dring message */ + PR0("Message is not an unregister-dring message"); + return (ENOMSG); } if (msglen != sizeof (*unreg_msg)) { @@ -973,11 +1190,12 @@ vd_process_dring_unreg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) static int process_rdx_msg(vio_msg_t *msg, size_t msglen) { - 
PR0("Entered"); ASSERT(msglen >= sizeof (msg->tag)); - if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_RDX)) - return (ENOMSG); /* not an RDX message */ + if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_RDX)) { + PR0("Message is not an RDX message"); + return (ENOMSG); + } if (msglen != sizeof (vio_rdx_msg_t)) { PRN("Expected %lu-byte RDX message; received %lu bytes", @@ -985,36 +1203,17 @@ process_rdx_msg(vio_msg_t *msg, size_t msglen) return (EBADMSG); } + PR0("Valid RDX message"); return (0); } -static void -vd_reset_connection(vd_t *vd, boolean_t reset_ldc) -{ - int status = 0; - - - ASSERT(mutex_owned(&vd->lock)); - PR0("Resetting connection with %s", VD_CLIENT(vd)); - if ((vd->initialized & VD_DRING) && - ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0)) - PRN("ldc_mem_dring_unmap() returned errno %d", status); - if ((reset_ldc == B_TRUE) && - ((status = ldc_reset(vd->ldc_handle)) != 0)) - PRN("ldc_reset() returned errno %d", status); - vd->initialized &= ~(VD_SID | VD_SEQ_NUM | VD_DRING); - vd->state = VD_STATE_INIT; - vd->max_msglen = sizeof (vio_msg_t); /* baseline vio message size */ -} - static int vd_check_seq_num(vd_t *vd, uint64_t seq_num) { - ASSERT(mutex_owned(&vd->lock)); if ((vd->initialized & VD_SEQ_NUM) && (seq_num != vd->seq_num + 1)) { PRN("Received seq_num %lu; expected %lu", seq_num, (vd->seq_num + 1)); - vd_reset_connection(vd, B_FALSE); + vd_need_reset(vd, B_FALSE); return (1); } @@ -1040,19 +1239,19 @@ expected_inband_size(vd_dring_inband_msg_t *msg) * operating on them within a descriptor ring */ static int -vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) +vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen, size_t msgsize) { size_t expected; vd_dring_inband_msg_t *desc_msg = (vd_dring_inband_msg_t *)msg; - PR1("Entered"); - ASSERT(mutex_owned(&vd->lock)); ASSERT(msglen >= sizeof (msg->tag)); if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO, - VIO_DESC_DATA)) - return (ENOMSG); /* not an in-band-descriptor message */ + VIO_DESC_DATA)) { + PR1("Message is not an in-band-descriptor message"); + return (ENOMSG); + } if (msglen < sizeof (*desc_msg)) { PRN("Expected at least %lu-byte descriptor message; " @@ -1066,129 +1265,124 @@ vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) return (EBADMSG); } - if (vd_check_seq_num(vd, desc_msg->hdr.seq_num) != 0) { + if (vd_check_seq_num(vd, desc_msg->hdr.seq_num) != 0) return (EBADMSG); - } - /* Valid message; process the request */ - desc_msg->payload.status = vd_process_request(vd, &desc_msg->payload); - return (0); + /* + * Valid message: Set up the in-band descriptor task and process the + * request. 
Arrange to acknowledge the client's message, unless an + * error processing the descriptor task results in setting + * VIO_SUBTYPE_NACK + */ + PR1("Valid in-band-descriptor message"); + msg->tag.vio_subtype = VIO_SUBTYPE_ACK; + vd->inband_task.msg = msg; + vd->inband_task.msglen = msglen; + vd->inband_task.msgsize = msgsize; + vd->inband_task.request = &desc_msg->payload; + return (vd_process_task(&vd->inband_task)); } -static boolean_t -vd_accept_dring_elems(vd_t *vd, uint32_t start, uint32_t ndesc) +static int +vd_process_element(vd_t *vd, vd_task_type_t type, uint32_t idx, + vio_msg_t *msg, size_t msglen, size_t msgsize) { - uint32_t i, n; + int status; + boolean_t ready; + vd_dring_entry_t *elem = VD_DRING_ELEM(idx); - /* Check descriptor states */ - for (n = ndesc, i = start; n > 0; n--, i = (i + 1) % vd->dring_len) { - if (VD_DRING_ELEM(i)->hdr.dstate != VIO_DESC_READY) { - PRN("descriptor %u not ready", i); - VD_DUMP_DRING_ELEM(VD_DRING_ELEM(i)); - return (B_FALSE); - } + /* Accept the updated dring element */ + if ((status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) { + PRN("ldc_mem_dring_acquire() returned errno %d", status); + return (status); } + ready = (elem->hdr.dstate == VIO_DESC_READY); + if (ready) { + elem->hdr.dstate = VIO_DESC_ACCEPTED; + } else { + PRN("descriptor %u not ready", idx); + VD_DUMP_DRING_ELEM(elem); + } + if ((status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) { + PRN("ldc_mem_dring_release() returned errno %d", status); + return (status); + } + if (!ready) + return (EBUSY); - /* Descriptors are valid; accept them */ - for (n = ndesc, i = start; n > 0; n--, i = (i + 1) % vd->dring_len) - VD_DRING_ELEM(i)->hdr.dstate = VIO_DESC_ACCEPTED; - return (B_TRUE); + /* Initialize a task and process the accepted element */ + PR1("Processing dring element %u", idx); + vd->dring_task[idx].type = type; + vd->dring_task[idx].msg = msg; + vd->dring_task[idx].msglen = msglen; + vd->dring_task[idx].msgsize = msgsize; + if ((status = vd_process_task(&vd->dring_task[idx])) != EINPROGRESS) + status = vd_mark_elem_done(vd, idx, elem->payload.status); + + return (status); } static int -vd_process_dring(vd_t *vd, uint32_t start, uint32_t end) +vd_process_element_range(vd_t *vd, int start, int end, + vio_msg_t *msg, size_t msglen, size_t msgsize) { - int status; - boolean_t accepted; - uint32_t i, io_status, n, ndesc; + int i, n, nelem, status = 0; + boolean_t inprogress = B_FALSE; + vd_task_type_t type; - ASSERT(mutex_owned(&vd->lock)); - PR1("start = %u, end = %u", start, end); + ASSERT(start >= 0); + ASSERT(end >= 0); - /* Validate descriptor range */ - if ((start >= vd->dring_len) || (end >= vd->dring_len)) { - PRN("\"start\" = %u, \"end\" = %u; both must be less than %u", - start, end, vd->dring_len); - return (EINVAL); - } + /* + * Arrange to acknowledge the client's message, unless an error + * processing one of the dring elements results in setting + * VIO_SUBTYPE_NACK + */ + msg->tag.vio_subtype = VIO_SUBTYPE_ACK; - /* Acquire updated dring elements */ - if ((status = ldc_mem_dring_acquire(vd->dring_handle, - start, end)) != 0) { - PRN("ldc_mem_dring_acquire() returned errno %d", status); - return (status); - } - /* Accept updated dring elements */ - ndesc = ((end < start) ? 
end + vd->dring_len : end) - start + 1;
-	PR1("ndesc = %u", ndesc);
-	accepted = vd_accept_dring_elems(vd, start, ndesc);
-	/* Release dring elements */
-	if ((status = ldc_mem_dring_release(vd->dring_handle,
-	    start, end)) != 0) {
-		PRN("ldc_mem_dring_release() returned errno %d", status);
-		return (status);
+	/*
+	 * Process the dring elements in the range
+	 */
+	nelem = ((end < start) ? end + vd->dring_len : end) - start + 1;
+	for (i = start, n = nelem; n > 0; i = (i + 1) % vd->dring_len, n--) {
+		((vio_dring_msg_t *)msg)->end_idx = i;
+		type = (n == 1) ? VD_FINAL_RANGE_TASK : VD_NONFINAL_RANGE_TASK;
+		status = vd_process_element(vd, type, i, msg, msglen, msgsize);
+		if (status == EINPROGRESS)
+			inprogress = B_TRUE;
+		else if (status != 0)
+			break;
 	}
 
-	/* If a descriptor was in the wrong state, return an error */
-	if (!accepted)
-		return (EINVAL);
+	/*
+	 * If some, but not all, operations of a multi-element range are in
+	 * progress, wait for other operations to complete before returning
+	 * (which will result in "ack" or "nack" of the message).  Note that
+	 * all outstanding operations will need to complete, not just the ones
+	 * corresponding to the current range of dring elements; however, as
+	 * this situation is an error case, performance is less critical.
+	 */
+	if ((nelem > 1) && (status != EINPROGRESS) && inprogress)
+		ddi_taskq_wait(vd->completionq);
 
-	/* Process accepted dring elements */
-	for (n = ndesc, i = start; n > 0; n--, i = (i + 1) % vd->dring_len) {
-		vd_dring_entry_t	*elem = VD_DRING_ELEM(i);
-
-		/* Process descriptor outside acquire/release bracket */
-		PR1("Processing dring element %u", i);
-		io_status = vd_process_request(vd, &elem->payload);
-
-		/* Re-acquire client's dring element */
-		if ((status = ldc_mem_dring_acquire(vd->dring_handle,
-		    i, i)) != 0) {
-			PRN("ldc_mem_dring_acquire() returned errno %d",
-			    status);
-			return (status);
-		}
-		/* Update processed element */
-		if (elem->hdr.dstate == VIO_DESC_ACCEPTED) {
-			elem->payload.status = io_status;
-			elem->hdr.dstate = VIO_DESC_DONE;
-		} else {
-			/* Perhaps client timed out waiting for I/O...
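The wrap-around arithmetic in the loop above ("end + vd->dring_len" when end < start) is easy to get wrong, so here is a stand-alone sketch of the same modular range walk with illustrative names and a fixed ring size:

    /* Compilable userland sketch of the dring range walk used above. */
    #include <stdio.h>

    #define	RING_LEN	8

    static void
    walk_range(int start, int end)
    {
    	/* unwrap: a range may cross the end of the ring */
    	int nelem = ((end < start) ? end + RING_LEN : end) - start + 1;
    	int i, n;

    	for (i = start, n = nelem; n > 0; i = (i + 1) % RING_LEN, n--)
    		printf("element %d%s\n", i, (n == 1) ? " (final)" : "");
    }

    int
    main(void)
    {
    	walk_range(6, 1);	/* visits 6, 7, 0, 1 */
    	return (0);
    }

Tagging the last element of the range (n == 1) is what lets vds defer the "ack"/"nack" reply to the task that completes the final descriptor.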
*/ - accepted = B_FALSE; - PRN("element %u no longer \"accepted\"", i); - VD_DUMP_DRING_ELEM(elem); - } - /* Release updated processed element */ - if ((status = ldc_mem_dring_release(vd->dring_handle, - i, i)) != 0) { - PRN("ldc_mem_dring_release() returned errno %d", - status); - return (status); - } - /* If the descriptor was in the wrong state, return an error */ - if (!accepted) - return (EINVAL); - } - - return (0); + return (status); } static int -vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) +vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen, size_t msgsize) { vio_dring_msg_t *dring_msg = (vio_dring_msg_t *)msg; - PR1("Entered"); - ASSERT(mutex_owned(&vd->lock)); ASSERT(msglen >= sizeof (msg->tag)); if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO, VIO_DRING_DATA)) { - return (ENOMSG); /* not a dring-data message */ + PR1("Message is not a dring-data message"); + return (ENOMSG); } if (msglen != sizeof (*dring_msg)) { @@ -1197,9 +1391,8 @@ vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) return (EBADMSG); } - if (vd_check_seq_num(vd, dring_msg->seq_num) != 0) { + if (vd_check_seq_num(vd, dring_msg->seq_num) != 0) return (EBADMSG); - } if (dring_msg->dring_ident != vd->dring_ident) { PRN("Expected dring ident %lu; received ident %lu", @@ -1207,10 +1400,24 @@ vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) return (EBADMSG); } + if (dring_msg->start_idx >= vd->dring_len) { + PRN("\"start_idx\" = %u; must be less than %u", + dring_msg->start_idx, vd->dring_len); + return (EBADMSG); + } - /* Valid message; process dring */ - dring_msg->tag.vio_subtype = VIO_SUBTYPE_ACK; - return (vd_process_dring(vd, dring_msg->start_idx, dring_msg->end_idx)); + if ((dring_msg->end_idx < 0) || + (dring_msg->end_idx >= vd->dring_len)) { + PRN("\"end_idx\" = %u; must be >= 0 and less than %u", + dring_msg->end_idx, vd->dring_len); + return (EBADMSG); + } + + /* Valid message; process range of updated dring elements */ + PR1("Processing descriptor range, start = %u, end = %u", + dring_msg->start_idx, dring_msg->end_idx); + return (vd_process_element_range(vd, dring_msg->start_idx, + dring_msg->end_idx, msg, msglen, msgsize)); } static int @@ -1241,14 +1448,13 @@ recv_msg(ldc_handle_t ldc_handle, void *msg, size_t *nbytes) } static int -vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) +vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen, size_t msgsize) { int status; PR1("Processing (%x/%x/%x) message", msg->tag.vio_msgtype, msg->tag.vio_subtype, msg->tag.vio_subtype_env); - ASSERT(mutex_owned(&vd->lock)); /* * Validate session ID up front, since it applies to all messages @@ -1338,7 +1544,7 @@ vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) case VD_STATE_DATA: switch (vd->xfer_mode) { case VIO_DESC_MODE: /* expect in-band-descriptor message */ - return (vd_process_desc_msg(vd, msg, msglen)); + return (vd_process_desc_msg(vd, msg, msglen, msgsize)); case VIO_DRING_MODE: /* expect dring-data or unreg-dring */ /* @@ -1346,7 +1552,7 @@ vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) * them first */ if ((status = vd_process_dring_msg(vd, msg, - msglen)) != ENOMSG) + msglen, msgsize)) != ENOMSG) return (status); /* @@ -1371,15 +1577,13 @@ vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) } } -static void -vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) +static int +vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen, size_t msgsize) { int status; boolean_t reset_ldc = B_FALSE; - 
ASSERT(mutex_owned(&vd->lock)); - /* * Check that the message is at least big enough for a "tag", so that * message processing can proceed based on tag-specified message type @@ -1387,19 +1591,22 @@ vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) if (msglen < sizeof (vio_msg_tag_t)) { PRN("Received short (%lu-byte) message", msglen); /* Can't "nack" short message, so drop the big hammer */ - vd_reset_connection(vd, B_TRUE); - return; + vd_need_reset(vd, B_TRUE); + return (EBADMSG); } /* * Process the message */ - switch (status = vd_do_process_msg(vd, msg, msglen)) { + switch (status = vd_do_process_msg(vd, msg, msglen, msgsize)) { case 0: /* "ack" valid, successfully-processed messages */ msg->tag.vio_subtype = VIO_SUBTYPE_ACK; break; + case EINPROGRESS: + /* The completion handler will "ack" or "nack" the message */ + return (EINPROGRESS); case ENOMSG: PRN("Received unexpected message"); _NOTE(FALLTHROUGH); @@ -1417,15 +1624,29 @@ vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen) break; } - /* "ack" or "nack" the message */ + /* Send the "ack" or "nack" to the client */ PR1("Sending %s", (msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK"); if (send_msg(vd->ldc_handle, msg, msglen) != 0) reset_ldc = B_TRUE; - /* Reset the connection for nack'ed or failed messages */ + /* Arrange to reset the connection for nack'ed or failed messages */ if ((status != 0) || reset_ldc) - vd_reset_connection(vd, reset_ldc); + vd_need_reset(vd, reset_ldc); + + return (status); +} + +static boolean_t +vd_enabled(vd_t *vd) +{ + boolean_t enabled; + + + mutex_enter(&vd->lock); + enabled = vd->enabled; + mutex_exit(&vd->lock); + return (enabled); } static void @@ -1435,74 +1656,70 @@ vd_recv_msg(void *arg) int status = 0; - PR2("Entered"); ASSERT(vd != NULL); - mutex_enter(&vd->lock); - /* - * Receive and process any messages in the LDC queue; max_msglen is - * reset each time through the loop, as vd->max_msglen can increase - * during connection handshake - */ - for (size_t max_msglen = vd->max_msglen; - vd->enabled && status == 0; - max_msglen = vd->max_msglen) { - size_t msglen = max_msglen; - vio_msg_t *vio_msg = kmem_alloc(max_msglen, KM_SLEEP); - - if ((status = recv_msg(vd->ldc_handle, vio_msg, &msglen)) == 0) - vd_process_msg(vd, vio_msg, msglen); - else if (status != ENOMSG) - vd_reset_connection(vd, B_TRUE); - kmem_free(vio_msg, max_msglen); + PR2("New task to receive incoming message(s)"); + while (vd_enabled(vd) && status == 0) { + size_t msglen, msgsize; + vio_msg_t *vio_msg; + + + /* + * Receive and process a message + */ + vd_reset_if_needed(vd); /* can change vd->max_msglen */ + msgsize = vd->max_msglen; /* stable copy for alloc/free */ + msglen = msgsize; /* actual length after recv_msg() */ + vio_msg = kmem_alloc(msgsize, KM_SLEEP); + if ((status = recv_msg(vd->ldc_handle, vio_msg, &msglen)) == + 0) { + if (vd_process_msg(vd, vio_msg, msglen, msgsize) == + EINPROGRESS) + continue; /* handler will free msg */ + } else if (status != ENOMSG) { + /* Probably an LDC failure; arrange to reset it */ + vd_need_reset(vd, B_TRUE); + } + kmem_free(vio_msg, msgsize); } - mutex_exit(&vd->lock); - PR2("Returning"); + PR2("Task finished"); } static uint_t -vd_do_handle_ldc_events(vd_t *vd, uint64_t event) +vd_handle_ldc_events(uint64_t event, caddr_t arg) { - ASSERT(mutex_owned(&vd->lock)); + vd_t *vd = (vd_t *)(void *)arg; + - if (!vd->enabled) + ASSERT(vd != NULL); + + if (!vd_enabled(vd)) return (LDC_SUCCESS); if (event & LDC_EVT_RESET) { - PR0("Channel was reset"); + PR0("LDC channel 
was reset"); return (LDC_SUCCESS); } if (event & LDC_EVT_UP) { - /* Reset the connection state when channel comes (back) up */ - vd_reset_connection(vd, B_FALSE); + PR0("LDC channel came up: Resetting client connection state"); + vd_need_reset(vd, B_FALSE); } if (event & LDC_EVT_READ) { + int status; + PR1("New data available"); /* Queue a task to receive the new data */ - if (ddi_taskq_dispatch(vd->taskq, vd_recv_msg, vd, DDI_SLEEP) != - DDI_SUCCESS) - PRN("Unable to dispatch vd_recv_msg()"); + status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, + DDI_SLEEP); + /* ddi_taskq_dispatch(9f) guarantees success with DDI_SLEEP */ + ASSERT(status == DDI_SUCCESS); } return (LDC_SUCCESS); } static uint_t -vd_handle_ldc_events(uint64_t event, caddr_t arg) -{ - uint_t status; - vd_t *vd = (vd_t *)(void *)arg; - - - ASSERT(vd != NULL); - mutex_enter(&vd->lock); - status = vd_do_handle_ldc_events(vd, event); - mutex_exit(&vd->lock); - return (status); -} - -static uint_t vds_check_for_vd(mod_hash_key_t key, mod_hash_val_t *val, void *arg) { _NOTE(ARGUNUSED(key, val)) @@ -1519,15 +1736,15 @@ vds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) vds_t *vds; - PR0("Entered"); switch (cmd) { case DDI_DETACH: /* the real work happens below */ break; case DDI_SUSPEND: - /* nothing to do for this non-device */ + PR0("No action required for DDI_SUSPEND"); return (DDI_SUCCESS); default: + PRN("Unrecognized \"cmd\""); return (DDI_FAILURE); } @@ -1552,8 +1769,6 @@ vds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) if (vds->initialized & VDS_LDI) (void) ldi_ident_release(vds->ldi_ident); mod_hash_destroy_hash(vds->vd_table); - if (vds->initialized & VDS_LOCKING) - mutex_destroy(&vds->lock); ddi_soft_state_free(vds_state, instance); return (DDI_SUCCESS); } @@ -1584,7 +1799,7 @@ vd_setup_full_disk(vd_t *vd) /* Get the VTOC for slice sizes */ if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGVTOC, (intptr_t)&vtoc, - FKIOCTL, kcred, &rval)) != 0) { + (vd_open_flags | FKIOCTL), kcred, &rval)) != 0) { PRN("ldi_ioctl(DKIOCGVTOC) returned errno %d", status); return (status); } @@ -1701,7 +1916,8 @@ vd_setup_vd(char *block_device, vd_t *vd) /* Get dk_cinfo to determine slice of backing block device */ if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCINFO, - (intptr_t)&dk_cinfo, FKIOCTL, kcred, &rval)) != 0) { + (intptr_t)&dk_cinfo, (vd_open_flags | FKIOCTL), kcred, + &rval)) != 0) { PRN("ldi_ioctl(DKIOCINFO) returned errno %d for %s", status, block_device); return (status); @@ -1726,7 +1942,8 @@ vd_setup_vd(char *block_device, vd_t *vd) /* Initialize dk_geom structure for single-slice block device */ if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGGEOM, - (intptr_t)&vd->dk_geom, FKIOCTL, kcred, &rval)) != 0) { + (intptr_t)&vd->dk_geom, (vd_open_flags | FKIOCTL), kcred, + &rval)) != 0) { PRN("ldi_ioctl(DKIOCGEOM) returned errno %d for %s", status, block_device); return (status); @@ -1747,7 +1964,8 @@ vd_setup_vd(char *block_device, vd_t *vd) /* Initialize vtoc structure for single-slice block device */ if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGVTOC, - (intptr_t)&vd->vtoc, FKIOCTL, kcred, &rval)) != 0) { + (intptr_t)&vd->vtoc, (vd_open_flags | FKIOCTL), kcred, + &rval)) != 0) { PRN("ldi_ioctl(DKIOCGVTOC) returned errno %d for %s", status, block_device); return (status); @@ -1811,16 +2029,22 @@ vds_do_init_vd(vds_t *vds, uint64_t id, char *block_device, uint64_t ldc_id, vd->initialized |= VD_LOCKING; - /* Create the task queue for the vdisk */ - (void) snprintf(tq_name, sizeof (tq_name), "vd%lu", id); + /* Create start 
and completion task queues for the vdisk */ + (void) snprintf(tq_name, sizeof (tq_name), "vd_startq%lu", id); + PR1("tq_name = %s", tq_name); + if ((vd->startq = ddi_taskq_create(vds->dip, tq_name, 1, + TASKQ_DEFAULTPRI, 0)) == NULL) { + PRN("Could not create task queue"); + return (EIO); + } + (void) snprintf(tq_name, sizeof (tq_name), "vd_completionq%lu", id); PR1("tq_name = %s", tq_name); - if ((vd->taskq = ddi_taskq_create(vds->dip, tq_name, 1, + if ((vd->completionq = ddi_taskq_create(vds->dip, tq_name, 1, TASKQ_DEFAULTPRI, 0)) == NULL) { PRN("Could not create task queue"); return (EIO); } - vd->initialized |= VD_TASKQ; - vd->enabled = 1; /* before callback can dispatch to taskq */ + vd->enabled = 1; /* before callback can dispatch to startq */ /* Bring up LDC */ @@ -1864,10 +2088,11 @@ vds_destroy_vd(void *arg) vd_t *vd = (vd_t *)arg; - PR0("Entered"); if (vd == NULL) return; + PR0("Destroying vdisk state"); + /* Disable queuing requests for the vdisk */ if (vd->initialized & VD_LOCKING) { mutex_enter(&vd->lock); @@ -1875,9 +2100,19 @@ vds_destroy_vd(void *arg) mutex_exit(&vd->lock); } - /* Drain and destroy the task queue (*before* shutting down LDC) */ - if (vd->initialized & VD_TASKQ) - ddi_taskq_destroy(vd->taskq); /* waits for queued tasks */ + /* Drain and destroy start queue (*before* destroying completionq) */ + if (vd->startq != NULL) + ddi_taskq_destroy(vd->startq); /* waits for queued tasks */ + + /* Drain and destroy completion queue (*before* shutting down LDC) */ + if (vd->completionq != NULL) + ddi_taskq_destroy(vd->completionq); /* waits for tasks */ + + if (vd->dring_task != NULL) { + ASSERT(vd->dring_len != 0); + kmem_free(vd->dring_task, + (sizeof (*vd->dring_task)) * vd->dring_len); + } /* Shut down LDC */ if (vd->initialized & VD_LDC) { @@ -2171,9 +2406,6 @@ vds_do_attach(dev_info_t *dip) sizeof (void *)); ASSERT(vds->vd_table != NULL); - mutex_init(&vds->lock, NULL, MUTEX_DRIVER, NULL); - vds->initialized |= VDS_LOCKING; - if ((status = ldi_ident_from_dip(dip, &vds->ldi_ident)) != 0) { PRN("ldi_ident_from_dip() returned errno %d", status); return (DDI_FAILURE); @@ -2205,14 +2437,14 @@ vds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { int status; - PR0("Entered"); switch (cmd) { case DDI_ATTACH: + PR0("Attaching"); if ((status = vds_do_attach(dip)) != DDI_SUCCESS) (void) vds_detach(dip, DDI_DETACH); return (status); case DDI_RESUME: - /* nothing to do for this non-device */ + PR0("No action required for DDI_RESUME"); return (DDI_SUCCESS); default: return (DDI_FAILURE); @@ -2251,6 +2483,7 @@ _init(void) { int i, status; + if ((status = ddi_soft_state_init(&vds_state, sizeof (vds_t), 1)) != 0) return (status); if ((status = mod_install(&modlinkage)) != 0) { @@ -2276,6 +2509,7 @@ _fini(void) { int status; + if ((status = mod_remove(&modlinkage)) != 0) return (status); ddi_soft_state_fini(&vds_state); diff --git a/usr/src/uts/sun4v/io/vio_util.c b/usr/src/uts/sun4v/io/vio_util.c new file mode 100644 index 0000000000..42cbf34fa2 --- /dev/null +++ b/usr/src/uts/sun4v/io/vio_util.c @@ -0,0 +1,184 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/kmem.h>
+#include <sys/ksynch.h>
+#include <sys/stream.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/vio_util.h>
+
+/*
+ * Create a pool of mblks from which future vio_allocb() requests
+ * will be serviced.
+ *
+ * NOTE: num_mblks has to be non-zero and a power of 2
+ *
+ * Returns 0 on success or EINVAL if num_mblks is zero or not
+ * a power of 2.
+ */
+int
+vio_create_mblks(uint64_t num_mblks, size_t mblk_size, vio_mblk_pool_t **poolp)
+{
+	vio_mblk_pool_t		*vmplp;
+	vio_mblk_t		*vmp;
+	uint8_t			*datap;
+	int			i;
+
+	if (!(num_mblks) || (!ISP2(num_mblks))) {
+		*poolp = 0;
+		return (EINVAL);
+	}
+
+	vmplp = kmem_zalloc(sizeof (*vmplp), KM_SLEEP);
+	vmplp->quelen = num_mblks;
+	vmplp->quemask = num_mblks - 1; /* expects quelen is power-of-2 */
+	vmplp->mblk_size = mblk_size;
+
+	mutex_init(&vmplp->hlock, NULL, MUTEX_DRIVER,
+	    DDI_INTR_PRI(DDI_INTR_SOFTPRI_DEFAULT));
+	mutex_init(&vmplp->tlock, NULL, MUTEX_DRIVER,
+	    DDI_INTR_PRI(DDI_INTR_SOFTPRI_DEFAULT));
+
+	vmplp->basep = kmem_zalloc(num_mblks * sizeof (vio_mblk_t), KM_SLEEP);
+	vmplp->datap = kmem_zalloc(num_mblks * mblk_size, KM_SLEEP);
+	vmplp->nextp = NULL;
+
+	/* create a queue of pointers to free vio_mblk_t's */
+	vmplp->quep = kmem_zalloc(vmplp->quelen * sizeof (vio_mblk_t *),
+	    KM_SLEEP);
+	vmplp->head = 0;
+	vmplp->tail = 0;
+
+	for (i = 0, datap = vmplp->datap; i < num_mblks; i++) {
+
+		vmp = &(vmplp->basep[i]);
+		vmp->vmplp = vmplp;
+		vmp->datap = datap;
+		vmp->reclaim.free_func = vio_freeb;
+		vmp->reclaim.free_arg = (caddr_t)vmp;
+		vmp->mp = desballoc(vmp->datap, mblk_size, BPRI_MED,
+		    &vmp->reclaim);
+
+		if (vmp->mp == NULL)
+			continue;
+
+		/* put this vmp on the free stack */
+		vmplp->quep[vmplp->tail] = vmp;
+		vmplp->tail = (vmplp->tail + 1) & vmplp->quemask;
+
+		datap += mblk_size;
+	}
+
+	*poolp = vmplp;
+	return (0);
+}
+
+/*
+ * Destroy the pool of mblks. This can only succeed when
+ * all allocated mblks have been returned to the pool.
+ *
+ * It is up to the caller to ensure that no further mblks are
+ * requested from the pool after destroy has been invoked.
+ *
+ * Returns 0 on success, EINVAL if handle is invalid, or
+ * EBUSY if not all mblks reclaimed yet.
+ */
+int
+vio_destroy_mblks(vio_mblk_pool_t *vmplp)
+{
+	if (vmplp == NULL)
+		return (EINVAL);
+
+	/*
+	 * We can only destroy the pool once all the mblks have
+	 * been reclaimed.
+	 */
+	if (vmplp->head != vmplp->tail) {
+		/* some mblks still in use */
+		return (EBUSY);
+	}
+
+	kmem_free(vmplp->basep, vmplp->quelen * sizeof (vio_mblk_t));
+	kmem_free(vmplp->datap, vmplp->quelen * vmplp->mblk_size);
+	kmem_free(vmplp->quep, vmplp->quelen * sizeof (vio_mblk_t *));
+
+	mutex_destroy(&vmplp->hlock);
+	mutex_destroy(&vmplp->tlock);
+
+	kmem_free(vmplp, sizeof (*vmplp));
+
+	return (0);
+}
+
+/*
+ * Allocate an mblk from the free pool if one is available.
+ * Otherwise returns NULL.
+ */
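vio_allocb() and vio_freeb() below index the queue with "& quemask", a power-of-2 ring in which head chases tail. A userland sketch of just that index logic follows (single-threaded; the driver guards head and tail with the separate hlock/tlock mutexes, omitted here, and the names are illustrative):

    #include <stddef.h>

    #define	RING_SLOTS	8		/* must be a power of 2 */
    #define	RING_MASK	(RING_SLOTS - 1)

    typedef struct ring {
    	void		*slot[RING_SLOTS];
    	unsigned	head;		/* next entry to allocate */
    	unsigned	tail;		/* next entry to free into */
    } ring_t;

    static void *
    ring_get(ring_t *r)
    {
    	unsigned next = (r->head + 1) & RING_MASK;
    	void *p;

    	if (next == r->tail)		/* advancing would hit tail */
    		return (NULL);
    	p = r->slot[r->head];
    	r->head = next;
    	return (p);
    }

    static void
    ring_put(ring_t *r, void *p)
    {
    	r->slot[r->tail] = p;
    	r->tail = (r->tail + 1) & RING_MASK;
    }

Note that the emptiness test fires one slot early, so one buffer always stays resident; that appears to be what keeps head == tail unambiguous as the "everything returned" state that vio_destroy_mblks() checks for.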
+mblk_t *
+vio_allocb(vio_mblk_pool_t *vmplp)
+{
+	vio_mblk_t	*vmp = NULL;
+	mblk_t		*mp = NULL;
+	uint32_t	head;
+
+	mutex_enter(&vmplp->hlock);
+	head = (vmplp->head + 1) & vmplp->quemask;
+	if (head != vmplp->tail) {
+		/* we have free mblks */
+		vmp = vmplp->quep[vmplp->head];
+		mp = vmp->mp;
+		vmplp->head = head;
+	}
+	mutex_exit(&vmplp->hlock);
+
+	return (mp);
+}
+
+/*
+ * Return an mblk to the free pool. Invoked when the upper IP
+ * layers do freemsg() etc on the mblk they were passed.
+ */
+void
+vio_freeb(void *arg)
+{
+	vio_mblk_t	*vmp = (vio_mblk_t *)arg;
+	vio_mblk_pool_t	*vmplp = vmp->vmplp;
+
+	vmp->mp = desballoc(vmp->datap, vmplp->mblk_size,
+	    BPRI_MED, &vmp->reclaim);
+
+	mutex_enter(&vmplp->tlock);
+	vmplp->quep[vmplp->tail] = vmp;
+	vmplp->tail = (vmplp->tail + 1) & vmplp->quemask;
+	mutex_exit(&vmplp->tlock);
+}
diff --git a/usr/src/uts/sun4v/io/vldc.c b/usr/src/uts/sun4v/io/vldc.c index 6c366c5c59..6b9d48a76c 100644 --- a/usr/src/uts/sun4v/io/vldc.c +++ b/usr/src/uts/sun4v/io/vldc.c
@@ -408,6 +408,7 @@ i_vldc_mdeg_register(vldc_t *vldcp)
 	bcopy(nameprop, name, namesz);
 
 	VLDC_SET_MDEG_PROP_NAME(pspecp, name);
+	ddi_prop_free(nameprop);
 
 	/* copy in the instance property */
 	VLDC_SET_MDEG_PROP_INST(pspecp, inst);
@@ -728,6 +729,9 @@ i_vldc_close_port(vldc_t *vldcp, uint_t portno)
 	kmem_free(vport->send_buf, vport->mtu);
 	kmem_free(vport->recv_buf, vport->mtu);
 
+	if (strcmp(vport->minorp->sname, VLDC_HVCTL_SVCNAME) == 0)
+		kmem_free(vport->cookie_buf, vldc_max_cookie);
+
 	vport->status = VLDC_PORT_CLOSED;
 
 	return (rv);
@@ -910,6 +914,9 @@ vldc_open(dev_t *devp, int flag, int otyp, cred_t *cred)
 	vport->recv_buf = kmem_alloc(vport->mtu, KM_SLEEP);
 	vport->send_buf = kmem_alloc(vport->mtu, KM_SLEEP);
 
+	if (strcmp(vport->minorp->sname, VLDC_HVCTL_SVCNAME) == 0)
+		vport->cookie_buf = kmem_alloc(vldc_max_cookie, KM_SLEEP);
+
 	vport->is_stream = B_FALSE;	/* assume not a stream */
 	vport->hanged_up = B_FALSE;
 
@@ -1057,50 +1064,57 @@ i_vldc_ioctl_read_cookie(vldc_port_t *vport, int vldc_instance, void *arg,
    int mode)
{
	vldc_data_t	copy_info;
-	caddr_t		buf;
-	uint64_t	len;
+	uint64_t	len, balance, copy_size;
+	caddr_t		src_addr, dst_addr;
	int		rv;

	if (ddi_copyin(arg, &copy_info, sizeof (copy_info), mode) == -1) {
		return (EFAULT);
	}

-	len = copy_info.length;
-	if (len > vldc_max_cookie) {
-		return (EINVAL);
-	}
+	len = balance = copy_info.length;
+	src_addr = (caddr_t)copy_info.src_addr;
+	dst_addr = (caddr_t)copy_info.dst_addr;
+	while (balance > 0) {

-	/* allocate a temporary buffer */
-	buf = kmem_alloc(len, KM_SLEEP);
+		/* get the max amount to be copied */
+		copy_size = MIN(balance, vldc_max_cookie);

-	mutex_enter(&vport->minorp->lock);
+		mutex_enter(&vport->minorp->lock);

-	D2("i_vldc_ioctl_read_cookie: vldc@%d:%d reading from 0x%lx "
-	    "size 0x%lx to 0x%lx\n", vldc_instance, vport->number,
-	    copy_info.dst_addr, copy_info.length, copy_info.src_addr);
+		D2("i_vldc_ioctl_read_cookie: vldc@%d:%d reading from 0x%p "
+		    "size 0x%lx to 0x%p\n", vldc_instance, vport->number,
+		    dst_addr, copy_size, src_addr);

-	/* read from the HV into the temporary buffer */
-	rv = ldc_mem_rdwr_pa(vport->ldc_handle, buf, &len,
-	    (caddr_t)copy_info.dst_addr, LDC_COPY_IN);
-	if (rv != 0) {
-		DWARN("i_vldc_ioctl_read_cookie: vldc@%d:%d cannot read "
-		    "address 0x%lx, rv=%d\n", vldc_instance, vport->number,
-		    copy_info.dst_addr, rv);
-		mutex_exit(&vport->minorp->lock);
-		kmem_free(buf, copy_info.length);
-		return (EFAULT);
-	}
+		/* read from the HV into the temporary buffer */
+		rv = ldc_mem_rdwr_pa(vport->ldc_handle, vport->cookie_buf,
+		    &copy_size, dst_addr, LDC_COPY_IN);
+		if (rv != 0) {
+			DWARN("i_vldc_ioctl_read_cookie: vldc@%d:%d cannot "
+			    "read address 0x%p, rv=%d\n",
+			    vldc_instance, vport->number, dst_addr, rv);
+			mutex_exit(&vport->minorp->lock);
+			return (EFAULT);
+		}

-	D2("i_vldc_ioctl_read_cookie: vldc@%d:%d read succeeded\n",
-	    vldc_instance, vport->number);
+		D2("i_vldc_ioctl_read_cookie: vldc@%d:%d read succeeded\n",
+		    vldc_instance, vport->number);

-	mutex_exit(&vport->minorp->lock);
+		mutex_exit(&vport->minorp->lock);

-	/* copy data from temporary buffer out to the caller and free buffer */
-	rv = ddi_copyout(buf, (caddr_t)copy_info.src_addr, len, mode);
-	kmem_free(buf, copy_info.length);
-	if (rv != 0) {
-		return (EFAULT);
+		/*
+		 * copy data from temporary buffer out to the
+		 * caller and free buffer
+		 */
+		rv = ddi_copyout(vport->cookie_buf, src_addr, copy_size, mode);
+		if (rv != 0) {
+			return (EFAULT);
+		}
+
+		/* adjust len, source and dest */
+		balance -= copy_size;
+		src_addr += copy_size;
+		dst_addr += copy_size;
	}

	/* set the structure to reflect outcome */
@@ -1118,54 +1132,58 @@ i_vldc_ioctl_write_cookie(vldc_port_t *vport, int vldc_instance, void *arg,
    int mode)
{
	vldc_data_t	copy_info;
-	caddr_t		buf;
-	uint64_t	len;
+	uint64_t	len, balance, copy_size;
+	caddr_t		src_addr, dst_addr;
	int		rv;

-	if (ddi_copyin((caddr_t)arg, &copy_info,
-	    sizeof (copy_info), mode) != 0) {
+	if (ddi_copyin(arg, &copy_info, sizeof (copy_info), mode) != 0) {
		return (EFAULT);
	}

-	len = copy_info.length;
-	if (len > vldc_max_cookie) {
-		return (EINVAL);
-	}
-
	D2("i_vldc_ioctl_write_cookie: vldc@%d:%d writing 0x%lx size 0x%lx "
	    "to 0x%lx\n", vldc_instance, vport->number, copy_info.src_addr,
	    copy_info.length, copy_info.dst_addr);

-	/* allocate a temporary buffer */
-	buf = kmem_alloc(len, KM_SLEEP);
+	len = balance = copy_info.length;
+	src_addr = (caddr_t)copy_info.src_addr;
+	dst_addr = (caddr_t)copy_info.dst_addr;
+	while (balance > 0) {

-	/* copy into the temporary buffer the data to be written to the HV */
-	if (ddi_copyin((caddr_t)copy_info.src_addr, buf,
-	    copy_info.length, mode) != 0) {
-		kmem_free(buf, copy_info.length);
-		return (EFAULT);
-	}
+		/* get the max amount to be copied */
+		copy_size = MIN(balance, vldc_max_cookie);

-	mutex_enter(&vport->minorp->lock);
+		/*
+		 * copy into the temporary buffer the data
+		 * to be written to the HV
+		 */
+		if (ddi_copyin((caddr_t)src_addr, vport->cookie_buf,
+		    copy_size, mode) != 0) {
+			return (EFAULT);
+		}

-	/* write the data from the temporary buffer to the HV */
-	rv = ldc_mem_rdwr_pa(vport->ldc_handle, buf, &len,
-	    (caddr_t)copy_info.dst_addr, LDC_COPY_OUT);
-	if (rv != 0) {
-		DWARN("i_vldc_ioctl_write_cookie: vldc@%d:%d failed to write at"
-		    " address 0x%lx\n, rv=%d", vldc_instance, vport->number,
-		    copy_info.dst_addr, rv);
-		mutex_exit(&vport->minorp->lock);
-		kmem_free(buf, copy_info.length);
-		return (EFAULT);
-	}
+		mutex_enter(&vport->minorp->lock);
+
+		/* write the data from the temporary buffer to the HV */
+		rv = ldc_mem_rdwr_pa(vport->ldc_handle, vport->cookie_buf,
+		    &copy_size, dst_addr, LDC_COPY_OUT);
+		if (rv != 0) {
+			DWARN("i_vldc_ioctl_write_cookie: vldc@%d:%d "
+			    "failed to write at address 0x%p, rv=%d\n",
+			    vldc_instance, vport->number, dst_addr, rv);
+			mutex_exit(&vport->minorp->lock);
+			return (EFAULT);
+		}

-	D2("i_vldc_ioctl_write_cookie: vldc@%d:%d write succeeded\n",
-	    vldc_instance, vport->number);
+		D2("i_vldc_ioctl_write_cookie: vldc@%d:%d write succeeded\n",
+		    vldc_instance, vport->number);

-	mutex_exit(&vport->minorp->lock);
+
mutex_exit(&vport->minorp->lock); - kmem_free(buf, copy_info.length); + /* adjust len, source and dest */ + balance -= copy_size; + src_addr += copy_size; + dst_addr += copy_size; + } /* set the structure to reflect outcome */ copy_info.length = len; @@ -1315,13 +1333,19 @@ vldc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, break; case VLDC_IOCTL_READ_COOKIE: - + if (strcmp(vport->minorp->sname, VLDC_HVCTL_SVCNAME)) { + rv = EINVAL; + break; + } rv = i_vldc_ioctl_read_cookie(vport, instance, (void *)arg, mode); break; case VLDC_IOCTL_WRITE_COOKIE: - + if (strcmp(vport->minorp->sname, VLDC_HVCTL_SVCNAME)) { + rv = EINVAL; + break; + } rv = i_vldc_ioctl_write_cookie(vport, instance, (void *)arg, mode); break; diff --git a/usr/src/uts/sun4v/io/vnet.c b/usr/src/uts/sun4v/io/vnet.c index 29ebf6bc59..c0cc116beb 100644 --- a/usr/src/uts/sun4v/io/vnet.c +++ b/usr/src/uts/sun4v/io/vnet.c @@ -86,7 +86,7 @@ void vnet_tx_update(void *arg); /* externs */ extern int vgen_init(void *vnetp, dev_info_t *vnetdip, const uint8_t *macaddr, mac_register_t **vgenmacp); -extern void vgen_uninit(void *arg); +extern int vgen_uninit(void *arg); static mac_callbacks_t vnet_m_callbacks = { 0, @@ -116,6 +116,7 @@ uint32_t vnet_ldcwd_interval = VNET_LDCWD_INTERVAL; /* watchdog freq in msec */ uint32_t vnet_ldcwd_txtimeout = VNET_LDCWD_TXTIMEOUT; /* tx timeout in msec */ uint32_t vnet_ldc_qlen = VNET_LDC_QLEN; /* ldc qlen */ uint32_t vnet_nfdb_hash = VNET_NFDB_HASH; /* size of fdb hash table */ +uint32_t vnet_nrbufs = VNET_NRBUFS; /* number of receive buffers */ /* * Property names @@ -296,8 +297,9 @@ vnetattach(dev_info_t *dip, ddi_attach_cmd_t cmd) int instance; int status; enum { AST_init = 0x0, AST_vnet_alloc = 0x1, - AST_read_macaddr = 0x2, AST_vgen_init = 0x4, - AST_vptl_alloc = 0x8, AST_fdbh_alloc = 0x10 } + AST_mac_alloc = 0x2, AST_read_macaddr = 0x4, + AST_vgen_init = 0x8, AST_vptl_alloc = 0x10, + AST_fdbh_alloc = 0x20 } attach_state; mac_register_t *vgenmacp = NULL; uint32_t nfdbh = 0; @@ -400,7 +402,7 @@ vnet_attach_fail: RW_EXIT(&vnetp->trwlock); } if (attach_state & AST_vgen_init) { - vgen_uninit(vgenmacp->m_driver); + (void) vgen_uninit(vgenmacp->m_driver); } if (attach_state & AST_vnet_alloc) { KMEM_FREE(vnetp); @@ -418,6 +420,7 @@ vnetdetach(dev_info_t *dip, ddi_detach_cmd_t cmd) vnet_t **vnetpp; vp_tl_t *vp_tlp; int instance; + int rv; instance = ddi_get_instance(dip); DBG1((NULL, "vnetdetach: instance(%d) enter\n", instance)); @@ -436,6 +439,21 @@ vnetdetach(dev_info_t *dip, ddi_detach_cmd_t cmd) goto vnet_detach_fail; } + /* uninit and free vnet proxy transports */ + WRITE_ENTER(&vnetp->trwlock); + while ((vp_tlp = vnetp->tlp) != NULL) { + if (strcmp(vnetp->vgen_name, vp_tlp->name) == 0) { + /* uninitialize generic transport */ + rv = vgen_uninit(vp_tlp->macp->m_driver); + if (rv != DDI_SUCCESS) { + RW_EXIT(&vnetp->trwlock); + goto vnet_detach_fail; + } + } + vnet_del_vptl(vnetp, vp_tlp); + } + RW_EXIT(&vnetp->trwlock); + /* * Unregister from the MAC subsystem. 
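(We only get this far once vgen_uninit() has succeeded for every proxy transport, so a receive pool with buffers still out in the stack aborts the detach before the MAC layer is touched.)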
This can fail, in * particular if there are DLPI style-2 streams still open - @@ -454,17 +472,6 @@ vnetdetach(dev_info_t *dip, ddi_detach_cmd_t cmd) } RW_EXIT(&vnet_rw); - /* uninit and free vnet proxy transports */ - WRITE_ENTER(&vnetp->trwlock); - while ((vp_tlp = vnetp->tlp) != NULL) { - if (strcmp(vnetp->vgen_name, vp_tlp->name) == 0) { - /* uninitialize generic transport */ - vgen_uninit(vp_tlp->macp->m_driver); - } - vnet_del_vptl(vnetp, vp_tlp); - } - RW_EXIT(&vnetp->trwlock); - KMEM_FREE(vnetp); return (DDI_SUCCESS); diff --git a/usr/src/uts/sun4v/io/vnet_gen.c b/usr/src/uts/sun4v/io/vnet_gen.c index 9d01b82837..1fdbf79873 100644 --- a/usr/src/uts/sun4v/io/vnet_gen.c +++ b/usr/src/uts/sun4v/io/vnet_gen.c @@ -50,8 +50,9 @@ #include <sys/vio_mailbox.h> #include <sys/vio_common.h> #include <sys/vnet_common.h> -#include <sys/vnet_gen.h> #include <sys/vnet_mailbox.h> +#include <sys/vio_util.h> +#include <sys/vnet_gen.h> /* * Implementation of the mac functionality for vnet using the @@ -64,7 +65,7 @@ /* vgen proxy entry points */ int vgen_init(void *vnetp, dev_info_t *vnetdip, const uint8_t *macaddr, mac_register_t **vgenmacp); -void vgen_uninit(void *arg); +int vgen_uninit(void *arg); static int vgen_start(void *arg); static void vgen_stop(void *arg); static mblk_t *vgen_tx(void *arg, mblk_t *mp); @@ -129,7 +130,6 @@ static int vgen_num_txpending(vgen_ldc_t *ldcp); static int vgen_tx_dring_full(vgen_ldc_t *ldcp); static int vgen_ldc_txtimeout(vgen_ldc_t *ldcp); static void vgen_ldc_watchdog(void *arg); -static void vgen_copymsg(mblk_t *mp, void *bufp); static int vgen_setup_kstats(vgen_ldc_t *ldcp); static void vgen_destroy_kstats(vgen_ldc_t *ldcp); static int vgen_kstat_update(kstat_t *ksp, int rw); @@ -145,8 +145,7 @@ static int vgen_send_version_negotiate(vgen_ldc_t *ldcp); static int vgen_send_attr_info(vgen_ldc_t *ldcp); static int vgen_send_dring_reg(vgen_ldc_t *ldcp); static int vgen_send_rdx_info(vgen_ldc_t *ldcp); -static int vgen_send_dring_data(vgen_ldc_t *ldcp, uint32_t start, - uint32_t end, uint64_t next_txseq); +static int vgen_send_dring_data(vgen_ldc_t *ldcp, uint32_t start, int32_t end); static int vgen_send_mcast_info(vgen_ldc_t *ldcp); static int vgen_handshake_phase2(vgen_ldc_t *ldcp); static void vgen_handshake_reset(vgen_ldc_t *ldcp); @@ -163,6 +162,8 @@ static void vgen_handle_mcast_info(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp); static void vgen_handle_ctrlmsg(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp); static void vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp, mblk_t **headp, mblk_t **tailp); +static void vgen_send_dring_ack(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp, + uint32_t start, int32_t end, uint8_t pstate); static void vgen_handle_datamsg(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp, mblk_t **headp, mblk_t **tailp); static void vgen_handle_errmsg(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp); @@ -255,6 +256,8 @@ uint32_t vgen_hwd_interval = 1000; /* handshake watchdog freq in msec */ uint32_t vgen_max_hretries = 1; /* max # of handshake retries */ uint32_t vgen_ldcwr_retries = 10; /* max # of ldc_write() retries */ uint32_t vgen_ldcup_retries = 5; /* max # of ldc_up() retries */ +uint32_t vgen_recv_delay = 1; /* delay when rx descr not ready */ +uint32_t vgen_recv_retries = 10; /* retry when rx descr not ready */ #ifdef DEBUG /* flags to simulate error conditions for debugging */ @@ -303,6 +306,7 @@ extern uint32_t vnet_reclaim_hiwat; extern uint32_t vnet_ldcwd_interval; extern uint32_t vnet_ldcwd_txtimeout; extern uint32_t vnet_ldc_qlen; +extern uint32_t 
vnet_nrbufs; extern int _vnet_dbglevel; extern void _vnetdebug_printf(void *vnetp, const char *fmt, ...); @@ -365,13 +369,9 @@ uint32_t vgen_hdbg; #define HDBG_BAD_SID 0x4 #define HDBG_OUT_STATE 0x8 -#if 0 -/* debug version negotiation, need to redefine VGEN_NUM_VER */ -vgen_ver_t dbg_vgen_versions[VGEN_NUM_VER] = - { {5, 0}, {3, 0}, {2, 1}, {1, 2}, {1, 1} }; #endif -#endif + /* * vgen_init() is called by an instance of vnet driver to initialize the @@ -443,15 +443,17 @@ vgen_init(void *vnetp, dev_info_t *vnetdip, const uint8_t *macaddr, * Called by vnet to undo the initializations done by vgen_init(). * The handle provided by generic transport during vgen_init() is the argument. */ -void +int vgen_uninit(void *arg) { vgen_t *vgenp = (vgen_t *)arg; void *vnetp; int instance; + vio_mblk_pool_t *rp, *nrp; - if (vgenp == NULL) - return; + if (vgenp == NULL) { + return (DDI_FAILURE); + } instance = ddi_get_instance(vgenp->vnetdip); vnetp = vgenp->vnetp; @@ -466,6 +468,21 @@ vgen_uninit(void *arg) /* detach all ports from the device */ vgen_detach_ports(vgenp); + /* + * free any pending rx mblk pools, + * that couldn't be freed previously during channel detach. + */ + rp = vgenp->rmp; + while (rp != NULL) { + nrp = vgenp->rmp = rp->nextp; + if (vio_destroy_mblks(rp)) { + vgenp->rmp = rp; + mutex_exit(&vgenp->lock); + return (DDI_FAILURE); + } + rp = nrp; + } + /* free multicast table */ kmem_free(vgenp->mctab, vgenp->mcsize * sizeof (struct ether_addr)); @@ -478,6 +495,8 @@ vgen_uninit(void *arg) KMEM_FREE(vgenp); DBG1((vnetp, "vgen_uninit: exit vnet_instance(%d)\n", instance)); + + return (DDI_SUCCESS); } /* enable transmit/receive for the device */ @@ -536,17 +555,14 @@ vgen_portsend(vgen_port_t *portp, mblk_t *mp) { vgen_ldclist_t *ldclp; vgen_ldc_t *ldcp; - vgen_t *vgenp; int status; - vgenp = portp->vgenp; ldclp = &portp->ldclist; READ_ENTER(&ldclp->rwlock); /* - * XXX - for now, we have a single channel. + * NOTE: for now, we will assume we have a single channel. */ if (ldclp->headp == NULL) { - DWARN((vgenp->vnetp, "vgen_portsend: dropping packet\n")); RW_EXIT(&ldclp->rwlock); return (VGEN_FAILURE); } @@ -554,15 +570,12 @@ vgen_portsend(vgen_port_t *portp, mblk_t *mp) if (ldcp->need_resched) { /* out of tx resources, see vgen_ldcsend() for details. 
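 * Returning VGEN_FAILURE here, instead of dropping the frame
 * as this code used to, leaves the mblk with the caller; once
 * descriptors are reclaimed, vnet_tx_update() lets the stalled
 * traffic restart.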
*/ - DWARN((vgenp->vnetp, "vgen_portsend: dropping packet...\n")); - mutex_enter(&ldcp->txlock); ldcp->statsp->tx_no_desc++; mutex_exit(&ldcp->txlock); RW_EXIT(&ldclp->rwlock); - freemsg(mp); - return (VGEN_SUCCESS); + return (VGEN_FAILURE); } status = vgen_ldcsend(ldcp, mp); @@ -581,10 +594,7 @@ vgen_ldcsend(vgen_ldc_t *ldcp, mblk_t *mp) void *vnetp; size_t size; int rv; - uint32_t i; - uint32_t start; - uint32_t end; - int txpending = 0; + uint64_t tbuf_ix; vgen_private_desc_t *tbufp; vgen_private_desc_t *ntbufp; vnet_public_desc_t *txdp; @@ -593,8 +603,10 @@ vgen_ldcsend(vgen_ldc_t *ldcp, mblk_t *mp) struct ether_header *ehp; boolean_t is_bcast = B_FALSE; boolean_t is_mcast = B_FALSE; - boolean_t reclaim = B_FALSE; boolean_t need_intr = B_FALSE; + size_t mblksz; + caddr_t dst; + mblk_t *bp; vnetp = LDC_TO_VNET(ldcp); statsp = ldcp->statsp; @@ -633,60 +645,33 @@ vgen_ldcsend(vgen_ldc_t *ldcp, mblk_t *mp) */ tbufp = ldcp->next_tbufp; ntbufp = NEXTTBUF(ldcp, tbufp); - if (tbufp->flags != VGEN_PRIV_DESC_FREE || - ntbufp == ldcp->cur_tbufp) { /* out of tbufs/txds */ + if (ntbufp == ldcp->cur_tbufp) { /* out of tbufs/txds */ mutex_enter(&ldcp->tclock); - if (ntbufp == ldcp->cur_tbufp) + if (ntbufp == ldcp->cur_tbufp) { ldcp->need_resched = B_TRUE; - mutex_exit(&ldcp->tclock); + mutex_exit(&ldcp->tclock); - statsp->tx_no_desc++; - mutex_exit(&ldcp->txlock); -#ifdef VGEN_USE_MAC_TX_UPDATE - /* - * This cflag is disabled by default. This can be enabled if we - * want to return failure to the mac layer when we run out of - * descriptors and use mac_tx_update() to restart tx when - * descriptors become available. However, stopping tx would - * affect traffic going over other ports, as upper mac layer - * has no concept of multiple ports within a device. - * So currently, to avoid this, drop packets when we run out - * of descrs and just return success. See the corresponding - * code in vgen_portsend() and vgen_reclaim_dring(). - */ - return (VGEN_TX_NORESOURCES); -#else - freemsg(mp); /* drop the packet */ - return (VGEN_TX_SUCCESS); -#endif + statsp->tx_no_desc++; + mutex_exit(&ldcp->txlock); + + return (VGEN_TX_NORESOURCES); + } + mutex_exit(&ldcp->tclock); } if (size < ETHERMIN) size = ETHERMIN; /* copy data into pre-allocated transmit buffer */ - vgen_copymsg(mp, tbufp->datap); - - txpending = vgen_num_txpending(ldcp); - if (txpending >= ldcp->reclaim_hiwat) { - /* - * if num of pending transmits is more than hiwat, - * reclaim now and also enable ack bit. - */ - reclaim = B_TRUE; - need_intr = B_TRUE; - } else { - if (txpending >= ldcp->reclaim_lowat) { - /* - * if the num of pending transmits is more than lowat - * enable ack bit in the descr and reclaim in intr(). 
- */ - need_intr = B_TRUE; - } + dst = tbufp->datap + VNET_IPALIGN; + for (bp = mp; bp != NULL; bp = bp->b_cont) { + mblksz = MBLKL(bp); + bcopy(bp->b_rptr, dst, mblksz); + dst += mblksz; } - i = tbufp - ldcp->tbufp; + tbuf_ix = tbufp - ldcp->tbufp; ehp = (struct ether_header *)tbufp->datap; is_bcast = IS_BROADCAST(ehp); @@ -694,38 +679,40 @@ vgen_ldcsend(vgen_ldc_t *ldcp, mblk_t *mp) tbufp->flags = VGEN_PRIV_DESC_BUSY; tbufp->datalen = size; - tbufp->seqnum = ldcp->next_txseq; /* initialize the corresponding public descriptor (txd) */ txdp = tbufp->descp; hdrp = &txdp->hdr; - hdrp->dstate = VIO_DESC_READY; if (need_intr) hdrp->ack = B_TRUE; txdp->nbytes = size; txdp->ncookies = tbufp->ncookies; bcopy((tbufp->memcookie), (txdp->memcookie), - tbufp->ncookies * sizeof (ldc_mem_cookie_t)); + tbufp->ncookies * sizeof (ldc_mem_cookie_t)); + hdrp->dstate = VIO_DESC_READY; /* send dring datamsg to the peer */ - start = end = i; - rv = vgen_send_dring_data(ldcp, start, end, ldcp->next_txseq); - if (rv != 0) { - /* vgen_send_dring_data() error: drop the packet */ - DWARN((vnetp, - "vgen_ldcsend: vgen_send_dring_data(): failed: " - "id(%lx) rv(%d) len (%d)\n", ldcp->ldc_id, rv, size)); - tbufp->flags = VGEN_PRIV_DESC_FREE; /* free tbuf */ - hdrp->dstate = VIO_DESC_FREE; /* free txd */ - hdrp->ack = B_FALSE; - statsp->oerrors++; - goto vgen_tx_exit; + if (ldcp->resched_peer) { + rv = vgen_send_dring_data(ldcp, (uint32_t)tbuf_ix, -1); + if (rv != 0) { + /* vgen_send_dring_data() error: drop the packet */ + DWARN((vnetp, + "vgen_ldcsend: vgen_send_dring_data(): failed: " + "id(%lx) rv(%d) len (%d)\n", + ldcp->ldc_id, rv, size)); + tbufp->flags = VGEN_PRIV_DESC_FREE; /* free tbuf */ + hdrp->dstate = VIO_DESC_FREE; /* free txd */ + hdrp->ack = B_FALSE; + statsp->oerrors++; + goto vgen_tx_exit; + } + ldcp->resched_peer = B_FALSE; } /* update next available tbuf in the ring */ ldcp->next_tbufp = ntbufp; - /* update tx seqnum and index */ - ldcp->next_txseq++; + + /* update tx index */ INCR_TXI(ldcp->next_txi, ldcp); /* update stats */ @@ -739,9 +726,6 @@ vgen_ldcsend(vgen_ldc_t *ldcp, mblk_t *mp) vgen_tx_exit: mutex_exit(&ldcp->txlock); - if (reclaim) { - vgen_reclaim(ldcp); - } DBG1((vnetp, "vgen_ldcsend: exit: ldcid (%lx)\n", ldcp->ldc_id)); freemsg(mp); @@ -1528,7 +1512,8 @@ vgen_ldc_attach(vgen_port_t *portp, uint64_t ldc_id) ldc_status_t istatus; enum {AST_init = 0x0, AST_ldc_alloc = 0x1, AST_mutex_init = 0x2, AST_ldc_init = 0x4, - AST_ldc_reg_cb = 0x8, AST_alloc_tx_ring = 0x10} + AST_ldc_reg_cb = 0x8, AST_alloc_tx_ring = 0x10, + AST_create_rxmblks = 0x20} attach_state; attach_state = AST_init; @@ -1584,6 +1569,16 @@ vgen_ldc_attach(vgen_port_t *portp, uint64_t ldc_id) } attach_state |= AST_alloc_tx_ring; + /* allocate receive resources */ + ldcp->num_rbufs = vnet_nrbufs; + ldcp->rmp = NULL; + status = vio_create_mblks(ldcp->num_rbufs, VGEN_DBLK_SZ, + &(ldcp->rmp)); + if (status != 0) { + goto ldc_attach_failed; + } + attach_state |= AST_create_rxmblks; + /* Setup kstats for the channel */ status = vgen_setup_kstats(ldcp); if (status != VGEN_SUCCESS) { @@ -1605,6 +1600,9 @@ vgen_ldc_attach(vgen_port_t *portp, uint64_t ldc_id) return (DDI_SUCCESS); ldc_attach_failed: + if (attach_state & AST_create_rxmblks) { + (void) vio_destroy_mblks(ldcp->rmp); + } if (attach_state & AST_alloc_tx_ring) { vgen_free_tx_ring(ldcp); } @@ -1661,8 +1659,21 @@ vgen_ldc_detach(vgen_ldc_t *ldcp) ldcp->flags &= ~(CHANNEL_ATTACHED); vgen_destroy_kstats(ldcp); + + /* free receive resources */ + if (vio_destroy_mblks(ldcp->rmp)) 
{ + /* + * if we cannot reclaim all mblks, put this + * on the list of pools to be reclaimed when the + * device gets detached (see vgen_uninit()). + */ + ldcp->rmp->nextp = vgenp->rmp; + vgenp->rmp = ldcp->rmp; + } + /* free transmit resources */ vgen_free_tx_ring(ldcp); + (void) ldc_unreg_callback(ldcp->ldc_handle); (void) ldc_fini(ldcp->ldc_handle); mutex_destroy(&ldcp->tclock); @@ -1825,7 +1836,7 @@ vgen_ldc_init(vgen_ldc_t *ldcp) LDC_SHADOW_MAP, LDC_MEM_RW, &ldcp->tx_dcookie, &ncookies); if (rv != 0) { DWARN((vnetp, "vgen_ldcinit: id (%lx) " - "ldc_mem_dring_bind failed\n", ldcp->ldc_id)); + "ldc_mem_dring_bind failed rv(%x)\n", ldcp->ldc_id, rv)); goto ldcinit_failed; } @@ -1952,7 +1963,7 @@ vgen_init_tbufs(vgen_ldc_t *ldcp) bzero(ldcp->tbufp, sizeof (*tbufp) * (ldcp->num_txds)); bzero(ldcp->txdp, sizeof (*txdp) * (ldcp->num_txds)); - datap = kmem_zalloc(ldcp->num_txds * VGEN_TX_DBLK_SZ, KM_SLEEP); + datap = kmem_zalloc(ldcp->num_txds * VGEN_DBLK_SZ, KM_SLEEP); ldcp->tx_datap = datap; /* @@ -1976,7 +1987,7 @@ vgen_init_tbufs(vgen_ldc_t *ldcp) */ ci = ncookies = 0; rv = ldc_mem_bind_handle(tbufp->memhandle, - (caddr_t)datap, VGEN_TX_DBLK_SZ, LDC_SHADOW_MAP, + (caddr_t)datap, VGEN_DBLK_SZ, LDC_SHADOW_MAP, LDC_MEM_R, &(tbufp->memcookie[ci]), &ncookies); if (rv != 0) { goto init_tbufs_failed; @@ -1989,20 +2000,20 @@ vgen_init_tbufs(vgen_ldc_t *ldcp) tbufp->datap = datap; if ((ncookies == 0) || - (ncookies > (uint64_t)MAX_COOKIES)) { + (ncookies > MAX_COOKIES)) { goto init_tbufs_failed; } for (ci = 1; ci < ncookies; ci++) { rv = ldc_mem_nextcookie(tbufp->memhandle, - &(tbufp->memcookie[ci])); + &(tbufp->memcookie[ci])); if (rv != 0) { goto init_tbufs_failed; } } tbufp->ncookies = ncookies; - datap += VGEN_TX_DBLK_SZ; + datap += VGEN_DBLK_SZ; tbufp->flags = VGEN_PRIV_DESC_FREE; txdp = &(ldcp->txdp[i]); @@ -2021,6 +2032,8 @@ vgen_init_tbufs(vgen_ldc_t *ldcp) ldcp->next_txseq = VNET_ISS; ldcp->next_txi = 0; + ldcp->resched_peer = B_TRUE; + return (DDI_SUCCESS); init_tbufs_failed:; @@ -2060,7 +2073,7 @@ vgen_uninit_tbufs(vgen_ldc_t *ldcp) if (ldcp->tx_datap) { /* prealloc'd tx data buffer */ - kmem_free(ldcp->tx_datap, ldcp->num_txds * VGEN_TX_DBLK_SZ); + kmem_free(ldcp->tx_datap, ldcp->num_txds * VGEN_DBLK_SZ); ldcp->tx_datap = NULL; } @@ -2104,6 +2117,9 @@ vgen_clobber_tbufs(vgen_ldc_t *ldcp) /* reset tx seqnum and index */ ldcp->next_txseq = VNET_ISS; ldcp->next_txi = 0; + + ldcp->resched_peer = B_TRUE; + #ifdef DEBUG DBG2((vnetp, "vgen_clobber_tbufs: id(0x%lx) num descrs done (%d)\n", @@ -2738,8 +2754,7 @@ vgen_send_rdx_info(vgen_ldc_t *ldcp) /* send descriptor ring data message to the peer over ldc */ static int -vgen_send_dring_data(vgen_ldc_t *ldcp, uint32_t start, uint32_t end, - uint64_t next_txseq) +vgen_send_dring_data(vgen_ldc_t *ldcp, uint32_t start, int32_t end) { vio_dring_msg_t dringmsg, *msgp = &dringmsg; vio_msg_tag_t *tagp = &msgp->tag; @@ -2753,7 +2768,7 @@ vgen_send_dring_data(vgen_ldc_t *ldcp, uint32_t start, uint32_t end, tagp->vio_subtype_env = VIO_DRING_DATA; tagp->vio_sid = ldcp->local_sid; - msgp->seq_num = next_txseq; + msgp->seq_num = ldcp->next_txseq; msgp->dring_ident = ldcp->local_hparams.dring_ident; msgp->start_idx = start; msgp->end_idx = end; @@ -2765,6 +2780,9 @@ vgen_send_dring_data(vgen_ldc_t *ldcp, uint32_t start, uint32_t end, return (VGEN_FAILURE); } + ldcp->next_txseq++; + ldcp->statsp->dring_data_msgs++; + DBG2((vnetp, "vgen_send_dring_data: DRING_DATA_SENT id (%lx)\n", ldcp->ldc_id)); @@ -2898,14 +2916,6 @@ vgen_reset_hphase(vgen_ldc_t *ldcp) */ 
bzero(&(ldcp->local_hparams), sizeof (ldcp->local_hparams)); -#ifdef DEBUG -#if 0 - if (vgen_hdbg & HDBG_VERSION) { - bcopy(dbg_vgen_versions, ldcp->vgen_versions, - sizeof (ldcp->vgen_versions)); - } -#endif -#endif /* set version to the highest version supported */ ldcp->local_hparams.ver_major = ldcp->vgen_versions[0].ver_major; @@ -2921,12 +2931,6 @@ vgen_reset_hphase(vgen_ldc_t *ldcp) ldcp->local_hparams.xfer_mode = VIO_DRING_MODE; ldcp->local_hparams.ack_freq = 0; /* don't need acks */ -#ifdef DEBUG -#if 0 - vgen_print_attr_info(ldcp, VGEN_LOCAL); -#endif -#endif - /* * set dring_info params. * Note: dring is already created and bound. @@ -3469,11 +3473,6 @@ vgen_handle_attr_info(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp) ldcp->hstate |= ATTR_ACK_SENT; DBG2((vnetp, "vgen_handle_attr_info:" " ATTR_ACK_SENT id(%lx)\n", ldcp->ldc_id)); -#ifdef DEBUG -#if 0 - vgen_print_attr_info(ldcp, VGEN_PEER); -#endif -#endif } else { /* failed */ DWARN((vnetp, "vgen_handle_attr_info:" @@ -3838,6 +3837,24 @@ vgen_handle_datamsg(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp, } static void +vgen_send_dring_ack(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp, uint32_t start, + int32_t end, uint8_t pstate) +{ + vio_dring_msg_t *msgp = (vio_dring_msg_t *)tagp; + void *vnetp = LDC_TO_VNET(ldcp); + + tagp->vio_subtype = VIO_SUBTYPE_ACK; + tagp->vio_sid = ldcp->local_sid; + msgp->start_idx = start; + msgp->end_idx = end; + msgp->dring_process_state = pstate; + if (vgen_sendmsg(ldcp, (caddr_t)tagp, sizeof (*msgp), B_FALSE)) { + DWARN((vnetp, "vgen_send_dring_ack: id(%lx) vgen_sendmsg " + "failed\n", (ldcp)->ldc_id)); + } +} + +static void vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp, mblk_t **headp, mblk_t **tailp) { @@ -3854,22 +3871,25 @@ vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp, size_t nread; uint64_t off = 0; uint32_t start; - uint32_t end; + int32_t end; uint32_t datalen; uint32_t ncookies; - uint32_t sync_start; - uint32_t sync_end; + uint32_t ack_start; + uint32_t ack_end; uint32_t rxi; uint32_t txi; int rv; boolean_t rxd_err = B_FALSE; - boolean_t sync_done = B_FALSE; + boolean_t set_ack_start = B_FALSE; + vgen_private_desc_t *tbufp; + uint32_t next_rxi; + boolean_t ready_txd = B_FALSE; + uint32_t retries = 0; #ifdef VGEN_HANDLE_LOST_PKTS int n; #endif #ifdef VGEN_REXMIT uint64_t seqnum; - vgen_private_desc_t *tbufp; #endif void *vnetp = LDC_TO_VNET(ldcp); @@ -3895,7 +3915,8 @@ vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp, start, end)); /* validate rx start and end indeces */ - if (!(CHECK_RXI(start, ldcp)) || !(CHECK_RXI(end, ldcp))) { + if (!(CHECK_RXI(start, ldcp)) || ((end != -1) && + !(CHECK_RXI(end, ldcp)))) { /* drop the message if invalid index */ break; } @@ -3930,7 +3951,7 @@ vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp, } /* - * Starting sequence number of the received packets + * sequence number of dring data message * is less than the next sequence number that * is expected: * @@ -3950,7 +3971,7 @@ vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp, } /* - * Starting sequence number of the received packets + * sequence number of dring data message * is greater than the next expected sequence number * * send a NACK back to the peer to indicate lost @@ -3976,8 +3997,10 @@ vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp, #ifdef VGEN_REXMIT /* * stop further processing until peer - * retransmits with the right index and seqnum. + * retransmits with the right index. + * update next_rxseq expected. 
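+			 * The NACK sent above names the first missed
+			 * slot; a VGEN_REXMIT peer answers it by
+			 * resending those descriptors as a bounded
+			 * start/end range (see the VIO_SUBTYPE_NACK
+			 * handling below).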
*/ + ldcp->next_rxseq += 1; break; #else /* VGEN_REXMIT */ /* @@ -3987,12 +4010,12 @@ vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp, * from the new start index. */ ldcp->next_rxi = start; - ldcp->next_rxseq += n; + ldcp->next_rxseq += 1; #endif /* VGEN_REXMIT */ } else if (dringmsg->seq_num == ldcp->next_rxseq) { /* - * expected and starting seqnums match, but + * expected and received seqnums match, but * the descriptor indeces don't? * * restart handshake with peer. @@ -4003,11 +4026,6 @@ vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp, ldcp->ldc_id, ldcp->next_rxseq, dringmsg->seq_num)); -#if 0 - vgen_handshake_retry(ldcp); - break; -#endif - } } else { @@ -4022,50 +4040,89 @@ vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp, "next_rxseq(0x%lx) != seq_num(0x%lx)\n", ldcp->ldc_id, ldcp->next_rxseq, dringmsg->seq_num)); - -#if 0 - vgen_handshake_retry(ldcp); - break; -#endif } } #endif /* VGEN_HANDLE_LOST_PKTS */ /* - * Start processing the descriptor range, specified - * in the dring data msg. + * start processing the descriptors from the specified + * start index, up to the index a descriptor is not ready + * to be processed or we process the entire descriptor ring + * and wrap around upto the start index. */ - if (ldc_mem_dring_acquire(ldcp->rx_dhandle, start, end)) { - DWARN((vnetp, "vgen_handle_dring_data: " - "id(%lx), ldc_mem_dring_acquire() failed\n", - ldcp->ldc_id)); - statsp->ierrors++; - } - rxi = start; - sync_start = start; + + /* need to set the start index of descriptors to be ack'd */ + set_ack_start = B_TRUE; + + /* index upto which we have ack'd */ + ack_end = start; + DECR_RXI(ack_end, ldcp); + + next_rxi = rxi = start; do { - /* recv packets from 'start' to 'end' */ + +vgen_recv_retry: if (ldc_mem_dring_acquire(ldcp->rx_dhandle, + rxi, rxi)) { + DWARN((vnetp, "vgen_handle_dring_data: " + "id(%lx), ldc_mem_dring_acquire() failed\n", + ldcp->ldc_id)); + statsp->ierrors++; + break; + } rxdp = &(ldcp->rxdp[rxi]); hdrp = &rxdp->hdr; + if (hdrp->dstate != VIO_DESC_READY) { + /* + * descriptor is not ready. + * retry descriptor acquire, stop processing + * after max # retries. + */ + if (retries == vgen_recv_retries) + break; + retries++; + drv_usecwait(vgen_recv_delay); + goto vgen_recv_retry; + } + retries = 0; + + if (set_ack_start) { + /* + * initialize the start index of the range + * of descriptors to be ack'd. + */ + ack_start = rxi; + set_ack_start = B_FALSE; + } + datalen = rxdp->nbytes; ncookies = rxdp->ncookies; if ((datalen < ETHERMIN) || (ncookies == 0) || - (ncookies > (uint64_t)MAX_COOKIES) || - (hdrp->dstate != VIO_DESC_READY)) { + (ncookies > MAX_COOKIES)) { rxd_err = B_TRUE; } else { /* - * The data buffer returned by allocb(9F) is - * 8byte aligned. We allocate extra 8 bytes to - * ensure size is multiple of 8 bytes for - * ldc_mem_copy(). + * Try to allocate an mblk from the free pool + * of recv mblks for the channel. + * If this fails, use allocb(). */ - mp = allocb(datalen + 8, BPRI_MED); - nbytes = (datalen + 7) & ~7; + mp = vio_allocb(ldcp->rmp); + if (!mp) { + /* + * The data buffer returned by + * allocb(9F) is 8byte aligned. We + * allocate extra 8 bytes to ensure + * size is multiple of 8 bytes for + * ldc_mem_copy(). 
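+				 * The (x + 7) & ~7 rounding below bumps
+				 * the copy length up to the next multiple
+				 * of 8: a length of 66 becomes 72, while
+				 * 64 stays 64.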
+ */ + statsp->rx_vio_allocb_fail++; + mp = allocb(VNET_IPALIGN + datalen + 8, + BPRI_MED); + } + nbytes = (VNET_IPALIGN + datalen + 7) & ~7; } if ((rxd_err) || (mp == NULL)) { /* @@ -4082,35 +4139,22 @@ vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp, /* set descriptor done bit */ hdrp->dstate = VIO_DESC_DONE; + (void) ldc_mem_dring_release(ldcp->rx_dhandle, + rxi, rxi); + if (hdrp->ack) { /* - * sender needs ack for this packet. - * sync pkts upto this index and - * send the ack to the peer. + * sender needs ack for this packet, + * ack pkts upto this index. */ - sync_end = rxi; - (void) ldc_mem_dring_release( - ldcp->rx_dhandle, sync_start, - sync_end); - tagp->vio_subtype = VIO_SUBTYPE_ACK; - tagp->vio_sid = ldcp->local_sid; - dringmsg = (vio_dring_msg_t *)tagp; - dringmsg->start_idx = sync_start; - dringmsg->end_idx = sync_end; - if (vgen_sendmsg(ldcp, (caddr_t)tagp, - sizeof (*dringmsg), B_FALSE)) { - DWARN((vnetp, - "vgen_handle_dring_data: " - "id(%lx) vgen_sendmsg " - "failed, stype: ACK\n", - ldcp->ldc_id)); - } - /* save new sync index start */ - if (sync_end != end) { - INCR_RXI(sync_end, ldcp); - sync_start = sync_end; - } else - sync_done = B_TRUE; + ack_end = rxi; + + vgen_send_dring_ack(ldcp, tagp, + ack_start, ack_end, + VIO_DP_ACTIVE); + + /* need to set new ack start index */ + set_ack_start = B_TRUE; } goto vgen_next_rxi; } @@ -4123,34 +4167,25 @@ vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp, /* set done bit irrespective of rv of ldc_mem_copy() */ hdrp->dstate = VIO_DESC_DONE; + (void) ldc_mem_dring_release(ldcp->rx_dhandle, + rxi, rxi); + + mp->b_rptr += VNET_IPALIGN; + if (hdrp->ack) { /* - * sender needs ack for this packet. - * sync pkts upto this index and - * send the ack to the peer. + * sender needs ack for this packet, + * ack pkts upto this index. */ - sync_end = rxi; - (void) ldc_mem_dring_release(ldcp->rx_dhandle, - sync_start, sync_end); - tagp->vio_subtype = VIO_SUBTYPE_ACK; - tagp->vio_sid = ldcp->local_sid; - dringmsg = (vio_dring_msg_t *)tagp; - dringmsg->start_idx = sync_start; - dringmsg->end_idx = sync_end; - if (vgen_sendmsg(ldcp, (caddr_t)tagp, - sizeof (*dringmsg), B_FALSE)) { - DWARN((vnetp, - "vgen_handle_dring_data: id(%lx) " - "vgen_sendmsg failed stype: ACK\n", - ldcp->ldc_id)); - } - /* save new sync index start */ - if (sync_end != end) { - INCR_RXI(sync_end, ldcp); - sync_start = sync_end; - } else - sync_done = B_TRUE; + ack_end = rxi; + + vgen_send_dring_ack(ldcp, tagp, + ack_start, ack_end, VIO_DP_ACTIVE); + + /* need to set new ack start index */ + set_ack_start = B_TRUE; } + /* if ldc_mem_copy() failed */ if (rv) { DWARN((vnetp, @@ -4194,32 +4229,49 @@ vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp, bpt = mp; } -vgen_next_rxi: if (rxi == end) { + +vgen_next_rxi: + /* update end index of range of descrs to be ack'd */ + ack_end = rxi; + + /* update the next index to be processed */ + INCR_RXI(next_rxi, ldcp); + if (next_rxi == start) { + /* + * processed the entire descriptor ring upto + * the index at which we started. + */ break; } - /* increment recv index */ - INCR_RXI(rxi, ldcp); + + rxi = next_rxi; _NOTE(CONSTCOND) } while (1); - if (!sync_done) { - /* sync remote descriptor range */ - sync_end = rxi; - (void) ldc_mem_dring_release(ldcp->rx_dhandle, - sync_start, sync_end); - DBG2((vnetp, - "vgen_handle_dring_data: not sending ACK\n")); + /* + * send an ack message to peer indicating that we have stopped + * processing descriptors. 
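+	 * This VIO_DP_STOPPED ack is the receive half of the
+	 * start/stop flow control: it tells the transmitter that
+	 * nobody is polling the ring any more, so new descriptors
+	 * will not be seen until it sends a fresh dring data
+	 * message (see the resched_peer handling in the ACK case
+	 * below).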
+ */ + if (set_ack_start) { + /* + * We have ack'd upto some index and we have not + * processed any descriptors beyond that index. + * Use the last ack'd index as both the start and + * end of range of descrs being ack'd. + * Note: This results in acking the last index twice + * and should be harmless. + */ + ack_start = ack_end; } - /* save new recv index */ - INCR_RXI(rxi, ldcp); - ldcp->next_rxi = rxi; - ldcp->next_rxseq += ((end >= start) ? - ((end - start) + 1) : (start - end)); + vgen_send_dring_ack(ldcp, tagp, ack_start, ack_end, + VIO_DP_STOPPED); + + /* save new recv index and expected seqnum of next dring msg */ + ldcp->next_rxi = next_rxi; + ldcp->next_rxseq += 1; - /* try to reclaim transmit descrs also */ - vgen_reclaim(ldcp); break; case VIO_SUBTYPE_ACK: @@ -4228,6 +4280,7 @@ vgen_next_rxi: if (rxi == end) { * which we had set the ACK bit in the descriptor (during * transmit). This enables us to reclaim descriptors. */ + DBG2((vnetp, "vgen_handle_dring_data: ACK: start(%d), end(%d)\n", start, end)); @@ -4243,7 +4296,94 @@ vgen_next_rxi: if (rxi == end) { break; } statsp->dring_data_acks++; + + /* reclaim descriptors that are done */ vgen_reclaim(ldcp); + + if (dringmsg->dring_process_state != VIO_DP_STOPPED) { + /* + * receiver continued processing descriptors after + * sending us the ack. + */ + break; + } + + statsp->dring_stopped_acks++; + + /* receiver stopped processing descriptors */ + mutex_enter(&ldcp->txlock); + mutex_enter(&ldcp->tclock); + + /* + * determine if there are any pending tx descriptors + * ready to be processed by the receiver(peer) and if so, + * send a message to the peer to restart receiving. + */ + ready_txd = B_FALSE; + + /* + * using the end index of the descriptor range for which + * we received the ack, check if the next descriptor is + * ready. + */ + txi = end; + INCR_TXI(txi, ldcp); + tbufp = &ldcp->tbufp[txi]; + txdp = tbufp->descp; + hdrp = &txdp->hdr; + if (hdrp->dstate == VIO_DESC_READY) { + ready_txd = B_TRUE; + } else { + /* + * descr next to the end of ack'd descr range is not + * ready. + * starting from the current reclaim index, check + * if any descriptor is ready. + */ + + txi = ldcp->cur_tbufp - ldcp->tbufp; + tbufp = &ldcp->tbufp[txi]; + + while (tbufp != ldcp->next_tbufp) { + + txdp = tbufp->descp; + hdrp = &txdp->hdr; + if (hdrp->dstate == VIO_DESC_READY) { + break; + } + + INCR_TXI(txi, ldcp); + tbufp = &ldcp->tbufp[txi]; + + } + + if (tbufp != ldcp->next_tbufp) + ready_txd = B_TRUE; + } + + if (ready_txd) { + /* + * we have tx descriptor(s) ready to be + * processed by the receiver. + * send a message to the peer with the start index + * of ready descriptors. + */ + rv = vgen_send_dring_data(ldcp, txi, -1); + if (rv != 0) { + ldcp->resched_peer = B_TRUE; + } + } else { + /* + * no ready tx descriptors. set the flag to send a + * message to peer when tx descriptors are ready in + * transmit routine. 
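+		 * The invariant is that resched_peer is B_TRUE
+		 * exactly when the peer has stopped and has not yet
+		 * been told about new work, so at any point in time
+		 * only one side owes the next dring data message.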
+ */ + ldcp->resched_peer = B_TRUE; + } + + mutex_exit(&ldcp->tclock); + mutex_exit(&ldcp->txlock); + break; case VIO_SUBTYPE_NACK: @@ -4281,9 +4421,7 @@ vgen_next_rxi: if (rxi == end) { /* send a new dring data msg including the lost descrs */ end = ldcp->next_tbufp - ldcp->tbufp; DECR_TXI(end, ldcp); - seqnum = ldcp->tbufp[start].seqnum; - /* no need to increment ldcp->next_txseq as this is rexmit */ - rv = vgen_send_dring_data(ldcp, start, end, seqnum); + rv = vgen_send_dring_data(ldcp, start, end); if (rv != 0) { /* * vgen_send_dring_data() error: drop all packets @@ -4305,7 +4443,6 @@ vgen_next_rxi: if (rxi == end) { /* update next pointer */ ldcp->next_tbufp = &(ldcp->tbufp[start]); - ldcp->next_txseq = seqnum; ldcp->next_txi = start; } DBG2((vnetp, @@ -4324,23 +4461,23 @@ vgen_next_rxi: if (rxi == end) { mutex_exit(&ldcp->tclock); mutex_exit(&ldcp->txlock); - vgen_reclaim(ldcp); - break; } DBG1((vnetp, "vgen_handle_dring_data: exit\n")); *headp = bp; *tailp = bpt; + } static void vgen_reclaim(vgen_ldc_t *ldcp) { - if (mutex_tryenter(&ldcp->tclock) == 0) - return; /* already in progress */ + mutex_enter(&ldcp->tclock); + vgen_reclaim_dring(ldcp); ldcp->reclaim_lbolt = ddi_get_lbolt(); + mutex_exit(&ldcp->tclock); } @@ -4355,9 +4492,7 @@ vgen_reclaim_dring(vgen_ldc_t *ldcp) vnet_public_desc_t *txdp; vgen_private_desc_t *tbufp; vio_dring_entry_hdr_t *hdrp; -#ifdef VGEN_USE_MAC_TX_UPDATE - vgen_t *vgenp = (vgen_t *)ldcp->vgenp; -#endif + vgen_t *vgenp = LDC_TO_VGEN(ldcp); #ifdef DEBUG if (vgen_trigger_txtimeout) @@ -4386,9 +4521,7 @@ vgen_reclaim_dring(vgen_ldc_t *ldcp) */ if (ldcp->need_resched) { ldcp->need_resched = B_FALSE; -#ifdef VGEN_USE_MAC_TX_UPDATE vnet_tx_update(vgenp->vnetp); -#endif } } @@ -4418,11 +4551,6 @@ vgen_tx_dring_full(vgen_ldc_t *ldcp) tbufp = ldcp->next_tbufp; ntbufp = NEXTTBUF(ldcp, tbufp); if (ntbufp == ldcp->cur_tbufp) { /* out of tbufs/txds */ -#if 0 - void *vnetp = LDC_TO_VNET(ldcp); - DWARN((vnetp, "vgen_tx_dring_full: id(%lx)\n", - ldcp->ldc_id)); -#endif return (VGEN_SUCCESS); } return (VGEN_FAILURE); @@ -4436,11 +4564,6 @@ vgen_ldc_txtimeout(vgen_ldc_t *ldcp) drv_usectohz(vnet_ldcwd_txtimeout * 1000)) && (vnet_ldcwd_txtimeout) && (vgen_tx_dring_full(ldcp) == VGEN_SUCCESS)) { -#if 0 - void *vnetp = LDC_TO_VNET(ldcp); - DWARN((vnetp, "vgen_ldc_txtimeout: id(%lx)\n", - ldcp->ldc_id)); -#endif return (VGEN_SUCCESS); } else { return (VGEN_FAILURE); @@ -4452,10 +4575,12 @@ static void vgen_ldc_watchdog(void *arg) { vgen_ldc_t *ldcp; + vgen_t *vgenp; void *vnetp; int rv; ldcp = (vgen_ldc_t *)arg; + vgenp = LDC_TO_VGEN(ldcp); vnetp = LDC_TO_VNET(ldcp); rv = vgen_ldc_txtimeout(ldcp); @@ -4474,9 +4599,7 @@ vgen_ldc_watchdog(void *arg) mutex_exit(&ldcp->cblock); if (ldcp->need_resched) { ldcp->need_resched = B_FALSE; -#ifdef VGEN_USE_MAC_TX_UPDATE - vnet_tx_update(ldcp->vgenp->vnetp); -#endif + vnet_tx_update(vgenp->vnetp); } } @@ -4484,21 +4607,6 @@ vgen_ldc_watchdog(void *arg) drv_usectohz(vnet_ldcwd_interval * 1000)); } -/* based on mcopymsg() */ -static void -vgen_copymsg(mblk_t *mp, void *bufp) -{ - caddr_t dest = bufp; - mblk_t *bp; - size_t n; - - for (bp = mp; bp != NULL; bp = bp->b_cont) { - n = MBLKL(bp); - bcopy(bp->b_rptr, dest, n); - dest += n; - } -} - static int vgen_setup_kstats(vgen_ldc_t *ldcp) { @@ -4565,14 +4673,12 @@ vgen_setup_kstats(vgen_ldc_t *ldcp) /* Tx stats */ kstat_named_init(&ldckp->tx_no_desc, "tx_no_desc", KSTAT_DATA_ULONG); - kstat_named_init(&ldckp->tx_allocb_fail, "tx_allocb_fail", - KSTAT_DATA_ULONG); /* Rx stats */ - 
kstat_named_init(&ldckp->rx_no_desc, "rx_no_desc", - KSTAT_DATA_ULONG); kstat_named_init(&ldckp->rx_allocb_fail, "rx_allocb_fail", KSTAT_DATA_ULONG); + kstat_named_init(&ldckp->rx_vio_allocb_fail, "rx_vio_allocb_fail", + KSTAT_DATA_ULONG); kstat_named_init(&ldckp->rx_lost_pkts, "rx_lost_pkts", KSTAT_DATA_ULONG); @@ -4581,6 +4687,10 @@ vgen_setup_kstats(vgen_ldc_t *ldcp) KSTAT_DATA_ULONG); kstat_named_init(&ldckp->dring_data_acks, "dring_data_acks", KSTAT_DATA_ULONG); + kstat_named_init(&ldckp->dring_stopped_acks, "dring_stopped_acks", + KSTAT_DATA_ULONG); + kstat_named_init(&ldckp->dring_data_msgs, "dring_data_msgs", + KSTAT_DATA_ULONG); ksp->ks_update = vgen_kstat_update; ksp->ks_private = (void *)ldcp; @@ -4633,14 +4743,15 @@ vgen_kstat_update(kstat_t *ksp, int rw) ldckp->noxmtbuf.value.ul = statsp->noxmtbuf; ldckp->tx_no_desc.value.ul = statsp->tx_no_desc; - ldckp->tx_allocb_fail.value.ul = statsp->tx_allocb_fail; - ldckp->rx_no_desc.value.ul = statsp->rx_no_desc; ldckp->rx_allocb_fail.value.ul = statsp->rx_allocb_fail; + ldckp->rx_vio_allocb_fail.value.ul = statsp->rx_vio_allocb_fail; ldckp->rx_lost_pkts.value.ul = statsp->rx_lost_pkts; ldckp->callbacks.value.ul = statsp->callbacks; ldckp->dring_data_acks.value.ul = statsp->dring_data_acks; + ldckp->dring_stopped_acks.value.ul = statsp->dring_stopped_acks; + ldckp->dring_data_msgs.value.ul = statsp->dring_data_msgs; } else { statsp->ipackets = ldckp->ipackets64.value.ull; statsp->ierrors = ldckp->ierrors.value.ul; @@ -4660,14 +4771,15 @@ vgen_kstat_update(kstat_t *ksp, int rw) statsp->noxmtbuf = ldckp->noxmtbuf.value.ul; statsp->tx_no_desc = ldckp->tx_no_desc.value.ul; - statsp->tx_allocb_fail = ldckp->tx_allocb_fail.value.ul; - statsp->rx_no_desc = ldckp->rx_no_desc.value.ul; statsp->rx_allocb_fail = ldckp->rx_allocb_fail.value.ul; + statsp->rx_vio_allocb_fail = ldckp->rx_vio_allocb_fail.value.ul; statsp->rx_lost_pkts = ldckp->rx_lost_pkts.value.ul; statsp->callbacks = ldckp->callbacks.value.ul; statsp->dring_data_acks = ldckp->dring_data_acks.value.ul; + statsp->dring_stopped_acks = ldckp->dring_stopped_acks.value.ul; + statsp->dring_data_msgs = ldckp->dring_data_msgs.value.ul; } return (VGEN_SUCCESS); @@ -4702,20 +4814,11 @@ vgen_macaddr_strtoul(const uint8_t *macaddr) uint64_t val = 0; int i; -#if 0 - for (i = ETHERADDRL - 1; i >= 0; i--) { -#endif for (i = 0; i < ETHERADDRL; i++) { val <<= 8; val |= macaddr[i]; } -#if 0 - cmn_err(CE_CONT, "vgen_macaddr_strtoul: str(%x:%x:%x:%x:%x:%x)\n", - macaddr[0], macaddr[1], macaddr[2], - macaddr[3], macaddr[4], macaddr[5]); - cmn_err(CE_CONT, "vgen_macaddr_strtoul: val(0x%lx)\n", val); -#endif return (val); } @@ -4727,19 +4830,10 @@ vgen_macaddr_ultostr(uint64_t val, uint8_t *macaddr) uint64_t value; value = val; -#if 0 - for (i = 0; i < ETHERADDRL; i++) { -#endif for (i = ETHERADDRL - 1; i >= 0; i--) { macaddr[i] = value & 0xFF; value >>= 8; } -#if 0 - cmn_err(CE_CONT, "vgen_macaddr_ultostr: val(0x%lx)\n", val); - cmn_err(CE_CONT, "vgen_macaddr_ultostr: str(%x:%x:%x:%x:%x:%x)\n", - macaddr[0], macaddr[1], macaddr[2], - macaddr[3], macaddr[4], macaddr[5]); -#endif return (VGEN_SUCCESS); } @@ -4769,29 +4863,6 @@ vgen_hwatchdog(void *arg) } static void -vgen_print_attr_info(vgen_ldc_t *ldcp, int endpoint) -{ - vgen_hparams_t *hp; - char ep[8]; - uint8_t addr[6]; - char ea[6]; - - if (endpoint == VGEN_LOCAL) { - hp = &ldcp->local_hparams; - (void) sprintf(ep, "Local"); - } else { - hp = &ldcp->peer_hparams; - (void) sprintf(ep, "Peer"); - } - (void) vgen_macaddr_ultostr(hp->addr, addr); - 
cmn_err(CE_CONT, "attr_info: %s: \n", ep); - cmn_err(CE_CONT, "\tMTU: %lx, addr: %s\n", hp->mtu, - vgen_print_ethaddr(addr, ea)); - cmn_err(CE_CONT, "\taddr_type: %x, xfer_mode: %x, ack_freq: %x\n", - hp->addr_type, hp->xfer_mode, hp->ack_freq); -} - -static void vgen_print_hparams(vgen_hparams_t *hp) { uint8_t addr[6]; diff --git a/usr/src/uts/sun4v/io/vsw.c b/usr/src/uts/sun4v/io/vsw.c index d82d31c79f..7f32782bf2 100644 --- a/usr/src/uts/sun4v/io/vsw.c +++ b/usr/src/uts/sun4v/io/vsw.c @@ -68,6 +68,8 @@ #include <sys/vio_mailbox.h> #include <sys/vnet_mailbox.h> #include <sys/vnet_common.h> +#include <sys/vio_util.h> +#include <sys/sdt.h> /* * Function prototypes. @@ -183,7 +185,6 @@ static void vsw_create_privring(vsw_ldc_t *); static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp); static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **, int *); -static void vsw_dring_priv2pub(vsw_private_desc_t *); static dring_info_t *vsw_ident2dring(lane_t *, uint64_t); static void vsw_set_lane_attr(vsw_t *, lane_t *); @@ -194,10 +195,10 @@ static int vsw_check_dring_info(vio_dring_reg_msg_t *); /* Misc support routines */ static caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf); - static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t); static int vsw_free_ring(dring_info_t *); + /* Debugging routines */ static void dump_flags(uint64_t); static void display_state(void); @@ -206,6 +207,13 @@ static void display_ring(dring_info_t *); int vsw_num_handshakes = 3; /* # of handshake attempts */ int vsw_wretries = 100; /* # of write attempts */ +int vsw_chain_len = 150; /* max # of mblks in msg chain */ +int vsw_desc_delay = 0; /* delay in us */ +int vsw_read_attempts = 5; /* # of reads of descriptor */ + +uint32_t vsw_mblk_size = VSW_MBLK_SIZE; +uint32_t vsw_num_mblks = VSW_NUM_MBLKS; + /* * mode specific frame switching function @@ -638,6 +646,13 @@ vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) } } + /* prevent auto-detaching */ + if (ddi_prop_update_int(DDI_DEV_T_NONE, vswp->dip, + DDI_NO_AUTODETACH, 1) != DDI_SUCCESS) { + cmn_err(CE_NOTE, "Unable to set \"%s\" property for " + "instance %u", DDI_NO_AUTODETACH, instance); + } + /* * Now we have everything setup, register for MD change * events. @@ -681,8 +696,9 @@ vsw_attach_fail: static int vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { - vsw_t **vswpp, *vswp; - int instance; + vio_mblk_pool_t *poolp, *npoolp; + vsw_t **vswpp, *vswp; + int instance; instance = ddi_get_instance(dip); vswp = ddi_get_soft_state(vsw_state, instance); @@ -707,8 +723,8 @@ vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) cmn_err(CE_WARN, "Unable to detach from MAC layer"); return (DDI_FAILURE); } + rw_destroy(&vswp->if_lockrw); } - rw_destroy(&vswp->if_lockrw); vsw_mdeg_unregister(vswp); @@ -723,6 +739,19 @@ vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) } /* + * Destroy any free pools that may still exist. + */ + poolp = vswp->rxh; + while (poolp != NULL) { + npoolp = vswp->rxh = poolp->nextp; + if (vio_destroy_mblks(poolp) != 0) { + vswp->rxh = poolp; + return (DDI_FAILURE); + } + poolp = npoolp; + } + + /* * Remove this instance from any entries it may be on in * the hash table by using the list of addresses maintained * in the vsw_t structure. 
@@ -927,7 +956,6 @@ vsw_get_md_properties(vsw_t *vswp) __func__, vswp->physname); } - #ifdef DEBUG /* * As a temporary measure to aid testing we check to see if there @@ -1336,6 +1364,8 @@ vsw_mac_unregister(vsw_t *vswp) } RW_EXIT(&vswp->if_lockrw); + vswp->mdprops &= ~VSW_MD_MACADDR; + D1(vswp, "%s: exit", __func__); return (rv); @@ -2021,6 +2051,7 @@ vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id) ldc_attr_t attr; ldc_status_t istatus; int status = DDI_FAILURE; + int rv; D1(vswp, "%s: enter", __func__); @@ -2031,6 +2062,15 @@ vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id) } ldcp->ldc_id = ldc_id; + /* allocate pool of receive mblks */ + rv = vio_create_mblks(vsw_num_mblks, vsw_mblk_size, &(ldcp->rxh)); + if (rv) { + DWARN(vswp, "%s: unable to create free mblk pool for" + " channel %ld (rv %d)", __func__, ldc_id, rv); + kmem_free(ldcp, sizeof (vsw_ldc_t)); + return (1); + } + mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL); mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL); mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL); @@ -2045,6 +2085,8 @@ vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id) ldcp->hss_id = 1; /* Initial handshake session id */ /* only set for outbound lane, inbound set by peer */ + mutex_init(&ldcp->lane_in.seq_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ldcp->lane_out.seq_lock, NULL, MUTEX_DRIVER, NULL); vsw_set_lane_attr(vswp, &ldcp->lane_out); attr.devclass = LDC_DEV_NT_SVC; @@ -2055,27 +2097,15 @@ vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id) if (status != 0) { DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)", __func__, ldc_id, status); - mutex_destroy(&ldcp->ldc_txlock); - mutex_destroy(&ldcp->ldc_cblock); - cv_destroy(&ldcp->drain_cv); - mutex_destroy(&ldcp->drain_cv_lock); - mutex_destroy(&ldcp->hss_lock); - kmem_free(ldcp, sizeof (vsw_ldc_t)); - return (1); + goto ldc_attach_fail; } status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp); if (status != 0) { DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)", __func__, ldc_id, status); - mutex_destroy(&ldcp->ldc_txlock); - mutex_destroy(&ldcp->ldc_cblock); - cv_destroy(&ldcp->drain_cv); - mutex_destroy(&ldcp->drain_cv_lock); - mutex_destroy(&ldcp->hss_lock); (void) ldc_fini(ldcp->ldc_handle); - kmem_free(ldcp, sizeof (vsw_ldc_t)); - return (1); + goto ldc_attach_fail; } @@ -2097,6 +2127,40 @@ vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id) D1(vswp, "%s: exit", __func__); return (0); + +ldc_attach_fail: + mutex_destroy(&ldcp->ldc_txlock); + mutex_destroy(&ldcp->ldc_cblock); + + cv_destroy(&ldcp->drain_cv); + + if (ldcp->rxh != NULL) { + if (vio_destroy_mblks(ldcp->rxh) != 0) { + /* + * Something odd has happened, as the destroy + * will only fail if some mblks have been allocated + * from the pool already (which shouldn't happen) + * and have not been returned. + * + * Add the pool pointer to a list maintained in + * the device instance. Another attempt will be made + * to free the pool when the device itself detaches. 
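+			 * The pool cannot simply be freed here:
+			 * desballoc()'d mblks still held by the stack
+			 * point back into it, and vio_freeb() must
+			 * remain callable until the last one returns.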
+ */ + cmn_err(CE_WARN, "Creation of ldc channel %ld failed" + " and cannot destroy associated mblk pool", + ldc_id); + ldcp->rxh->nextp = vswp->rxh; + vswp->rxh = ldcp->rxh; + } + } + mutex_destroy(&ldcp->drain_cv_lock); + mutex_destroy(&ldcp->hss_lock); + + mutex_destroy(&ldcp->lane_in.seq_lock); + mutex_destroy(&ldcp->lane_out.seq_lock); + kmem_free(ldcp, sizeof (vsw_ldc_t)); + + return (1); } /* @@ -2150,11 +2214,28 @@ vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id) ldcp->ldc_status = LDC_INIT; ldcp->ldc_handle = NULL; ldcp->ldc_vswp = NULL; + + if (ldcp->rxh != NULL) { + if (vio_destroy_mblks(ldcp->rxh)) { + /* + * Most likely some mblks are still in use and + * have not been returned to the pool. Add the pool + * to the list maintained in the device instance. + * Another attempt will be made to destroy the pool + * when the device detaches. + */ + ldcp->rxh->nextp = vswp->rxh; + vswp->rxh = ldcp->rxh; + } + } + + mutex_destroy(&ldcp->ldc_txlock); mutex_destroy(&ldcp->ldc_cblock); cv_destroy(&ldcp->drain_cv); mutex_destroy(&ldcp->drain_cv_lock); mutex_destroy(&ldcp->hss_lock); + mutex_destroy(&ldcp->lane_in.seq_lock); + mutex_destroy(&ldcp->lane_out.seq_lock); /* unlink it from the list */ prev_ldcp = ldcp->ldc_next; @@ -4072,11 +4153,14 @@ vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt) size_t off = 0; uint64_t ncookies = 0; uint64_t chain = 0; - uint64_t j, len, num; - uint32_t start, end, datalen; - int i, last_sync, rv; + uint64_t j, len; + uint32_t pos, start, datalen; + uint32_t range_start, range_end; + int32_t end, num, cnt = 0; + int i, rv; boolean_t ack_needed = B_FALSE; - boolean_t sync_needed = B_TRUE; + boolean_t prev_desc_ack = B_FALSE; + int read_attempts = 0; D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id); @@ -4107,43 +4191,94 @@ vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt) return; } - start = end = 0; - start = dring_pkt->start_idx; + start = pos = dring_pkt->start_idx; end = dring_pkt->end_idx; + len = dp->num_descriptors; - D3(vswp, "%s(%lld): start index %ld : end %ld\n", + range_start = range_end = pos; + + D2(vswp, "%s(%lld): start index %ld : end %ld\n", __func__, ldcp->ldc_id, start, end); - /* basic sanity check */ - len = dp->num_descriptors; - if (end > len) { - DERR(vswp, "%s(%lld): endpoint %lld outside ring" - " length %lld", __func__, ldcp->ldc_id, - end, len); + if (end == -1) { + num = -1; + } else if (end >= 0) { + num = end >= pos ? + end - pos + 1: (len - pos + 1) + end; + /* basic sanity check */ + if (end > len) { + DERR(vswp, "%s(%lld): endpoint %lld outside " + "ring length %lld", __func__, + ldcp->ldc_id, end, len); + + SND_DRING_NACK(ldcp, dring_pkt); + return; + } + } else { + DERR(vswp, "%s(%lld): invalid endpoint %lld", + __func__, ldcp->ldc_id, end); SND_DRING_NACK(ldcp, dring_pkt); return; } - /* sync data */ - if ((rv = ldc_mem_dring_acquire(dp->handle, - start, end)) != 0) { - DERR(vswp, "%s(%lld): unable to acquire dring : err %d", - __func__, ldcp->ldc_id, rv); - return; - } + while (cnt != num) { +vsw_recheck_desc: + if ((rv = ldc_mem_dring_acquire(dp->handle, + pos, pos)) != 0) { + DERR(vswp, "%s(%lld): unable to acquire " + "descriptor at pos %d: err %d", + __func__, ldcp->ldc_id, pos, rv); + SND_DRING_NACK(ldcp, dring_pkt); + return; + } - pub_addr = (vnet_public_desc_t *)dp->pub_addr; + pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos; - j = num = 0; + /* + * When given a bounded range of descriptors + * to process, it's an error to hit a descriptor + * which is not ready. 
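A bounded range is only sent on a retransmission, after we have NACKed it, so every descriptor in it should already have been published READY by our peer.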
In the non-bounded case + * (end_idx == -1) this simply indicates we have + * reached the end of the current active range. + */ + if (pub_addr->hdr.dstate != VIO_DESC_READY) { + /* unbound - no error */ + if (end == -1) { + if (read_attempts == vsw_read_attempts) + break; + + delay(drv_usectohz(vsw_desc_delay)); + read_attempts++; + goto vsw_recheck_desc; + } - /* calculate # descriptors taking into a/c wrap around */ - num = end >= start ? end - start + 1: (len - start + 1) + end; + /* bounded - error - so NACK back */ + DERR(vswp, "%s(%lld): descriptor not READY " + "(%d)", __func__, ldcp->ldc_id, + pub_addr->hdr.dstate); + SND_DRING_NACK(ldcp, dring_pkt); + return; + } - last_sync = start; + DTRACE_PROBE1(read_attempts, int, read_attempts); - for (i = start; j < num; i = (i + 1) % len, j++) { - pub_addr = (vnet_public_desc_t *)dp->pub_addr + i; + range_end = pos; + + /* + * If we ACK'd the previous descriptor then now + * record the new range start position for later + * ACK's. + */ + if (prev_desc_ack) { + range_start = pos; + + D2(vswp, "%s(%lld): updating range start " + "to be %d", __func__, ldcp->ldc_id, + range_start); + + prev_desc_ack = B_FALSE; + } /* * Data is padded to align on 8 byte boundary, @@ -4161,49 +4296,36 @@ vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt) D2(vswp, "%s(%lld): processing desc %lld at pos" " 0x%llx : dstate 0x%lx : datalen 0x%lx", - __func__, ldcp->ldc_id, i, pub_addr, + __func__, ldcp->ldc_id, pos, pub_addr, pub_addr->hdr.dstate, datalen); /* - * XXXX : Is it a fatal error to be told to - * process a packet when the READY bit is not - * set ? - */ - if (pub_addr->hdr.dstate != VIO_DESC_READY) { - DERR(vswp, "%s(%d): descriptor %lld at pos " - " 0x%llx not READY (0x%lx)", __func__, - ldcp->ldc_id, i, pub_addr, - pub_addr->hdr.dstate); - - SND_DRING_NACK(ldcp, dring_pkt); - (void) ldc_mem_dring_release(dp->handle, - start, end); - return; - } - - /* * Mark that we are starting to process descriptor. */ pub_addr->hdr.dstate = VIO_DESC_ACCEPTED; + mp = vio_allocb(ldcp->rxh); + if (mp == NULL) { + /* + * No free receive buffers available, so + * fallback onto allocb(9F). Make sure that + * we get a data buffer which is a multiple + * of 8 as this is required by ldc_mem_copy. + */ + DTRACE_PROBE(allocb); + mp = allocb(datalen + VNET_IPALIGN + 8, + BPRI_MED); + } + /* - * allocb(9F) returns an aligned data block. We - * need to ensure that we ask ldc for an aligned - * number of bytes also. + * Ensure that we ask ldc for an aligned + * number of bytes. 
*/ - nbytes = datalen; + nbytes = datalen + VNET_IPALIGN; if (nbytes & 0x7) { off = 8 - (nbytes & 0x7); nbytes += off; } - mp = allocb(datalen, BPRI_MED); - if (mp == NULL) { - DERR(vswp, "%s(%lld): allocb failed", - __func__, ldcp->ldc_id); - (void) ldc_mem_dring_release(dp->handle, - start, end); - return; - } ncookies = pub_addr->ncookies; rv = ldc_mem_copy(ldcp->ldc_handle, @@ -4213,18 +4335,24 @@ vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt) if (rv != 0) { DERR(vswp, "%s(%d): unable to copy in " - "data from %d cookies", __func__, - ldcp->ldc_id, ncookies); + "data from %d cookies in desc %d" + " (rv %d)", __func__, ldcp->ldc_id, + ncookies, pos, rv); freemsg(mp); + + pub_addr->hdr.dstate = VIO_DESC_DONE; (void) ldc_mem_dring_release(dp->handle, - start, end); - return; + pos, pos); + break; } else { D2(vswp, "%s(%d): copied in %ld bytes" " using %d cookies", __func__, ldcp->ldc_id, nbytes, ncookies); } + /* adjust the read pointer to skip over the padding */ + mp->b_rptr += VNET_IPALIGN; + /* point to the actual end of data */ mp->b_wptr = mp->b_rptr + datalen; @@ -4246,50 +4374,89 @@ vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt) /* mark we are finished with this descriptor */ pub_addr->hdr.dstate = VIO_DESC_DONE; + (void) ldc_mem_dring_release(dp->handle, pos, pos); + /* - * Send an ACK back to peer if requested, and sync - * the rings up to this point so the remote side sees - * the descriptor flag in a consistent state. + * Send an ACK back to peer if requested. */ if (ack_needed) { - if ((rv = ldc_mem_dring_release( - dp->handle, last_sync, i)) != 0) { - DERR(vswp, "%s(%lld): unable to sync" - " from %d to %d", __func__, - ldcp->ldc_id, last_sync, i); - } - ack_needed = B_FALSE; - if (i == end) - sync_needed = B_FALSE; - else - sync_needed = B_TRUE; + dring_pkt->start_idx = range_start; + dring_pkt->end_idx = range_end; - last_sync = (i + 1) % len; + DERR(vswp, "%s(%lld): processed %d %d, ACK" + " requested", __func__, ldcp->ldc_id, + dring_pkt->start_idx, + dring_pkt->end_idx); + dring_pkt->dring_process_state = VIO_DP_ACTIVE; dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; dring_pkt->tag.vio_sid = ldcp->local_session; vsw_send_msg(ldcp, (void *)dring_pkt, sizeof (vio_dring_msg_t)); + + prev_desc_ack = B_TRUE; + range_start = pos; } - } - if (sync_needed) { - if ((rv = ldc_mem_dring_release(dp->handle, - last_sync, end)) != 0) { - DERR(vswp, "%s(%lld): unable to sync" - " from %d to %d", __func__, - ldcp->ldc_id, last_sync, end); + /* next descriptor */ + pos = (pos + 1) % len; + cnt++; + + /* + * Break out of loop here and stop processing to + * allow some other network device (or disk) to + * get access to the cpu. 
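+		 * vsw_chain_len (150 by default) bounds the batch:
+		 * when it is reached the gathered chain is switched
+		 * immediately and the loop exits through the normal
+		 * VIO_DP_STOPPED path below.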
+ */ + /* send the chain of packets to be switched */ + if (chain > vsw_chain_len) { + D3(vswp, "%s(%lld): switching chain of %d " + "msgs", __func__, ldcp->ldc_id, chain); + vsw_switch_frame(vswp, bp, VSW_VNETPORT, + ldcp->ldc_port, NULL); + bp = NULL; + break; } } /* send the chain of packets to be switched */ - D3(vswp, "%s(%lld): switching chain of %d msgs", __func__, - ldcp->ldc_id, chain); - vsw_switch_frame(vswp, bp, VSW_VNETPORT, - ldcp->ldc_port, NULL); + if (bp != NULL) { + D3(vswp, "%s(%lld): switching chain of %d msgs", + __func__, ldcp->ldc_id, chain); + vsw_switch_frame(vswp, bp, VSW_VNETPORT, + ldcp->ldc_port, NULL); + } + + DTRACE_PROBE1(msg_cnt, int, cnt); + + /* + * We are now finished so ACK back with the state + * set to STOPPING so our peer knows we are finished + */ + dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK; + dring_pkt->tag.vio_sid = ldcp->local_session; + + dring_pkt->dring_process_state = VIO_DP_STOPPED; + + DTRACE_PROBE(stop_process_sent); + + /* + * We have not processed any more descriptors beyond + * the last one we ACK'd. + */ + if (prev_desc_ack) + range_start = range_end; + dring_pkt->start_idx = range_start; + dring_pkt->end_idx = range_end; + + D2(vswp, "%s(%lld) processed : %d : %d, now stopping", + __func__, ldcp->ldc_id, dring_pkt->start_idx, + dring_pkt->end_idx); + + vsw_send_msg(ldcp, (void *)dring_pkt, + sizeof (vio_dring_msg_t)); break; case VIO_SUBTYPE_ACK: @@ -4312,7 +4479,6 @@ vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt) end = dring_pkt->end_idx; len = dp->num_descriptors; - j = num = 0; /* calculate # descriptors taking into a/c wrap around */ num = end >= start ? end - start + 1: (len - start + 1) + end; @@ -4320,31 +4486,112 @@ vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt) D2(vswp, "%s(%lld): start index %ld : end %ld : num %ld\n", __func__, ldcp->ldc_id, start, end, num); + mutex_enter(&dp->dlock); + dp->last_ack_recv = end; + mutex_exit(&dp->dlock); + for (i = start; j < num; i = (i + 1) % len, j++) { pub_addr = (vnet_public_desc_t *)dp->pub_addr + i; priv_addr = (vsw_private_desc_t *)dp->priv_addr + i; - if (pub_addr->hdr.dstate != VIO_DESC_DONE) { - DERR(vswp, "%s: descriptor %lld at pos " - " 0x%llx not DONE (0x%lx)\n", __func__, - i, pub_addr, pub_addr->hdr.dstate); - return; - } else { + /* + * If the last descriptor in a range has the ACK + * bit set then we will get two messages from our + * peer relating to it. The normal ACK msg and then + * a subsequent STOP msg. The first message will have + * resulted in the descriptor being reclaimed and + * its state set to FREE so when we encounter a non + * DONE descriptor we need to check to see if its + * because we have just reclaimed it. 
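+			 * For example, if descriptor 9 ends a range and
+			 * carries the ACK bit, the plain ACK reclaims it
+			 * (DONE -> FREE), and the STOP message that
+			 * follows covers the same slot, so finding it
+			 * no longer DONE here is expected rather than
+			 * a protocol error.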
+ */ + mutex_enter(&priv_addr->dstate_lock); + if (pub_addr->hdr.dstate == VIO_DESC_DONE) { /* clear all the fields */ bzero(priv_addr->datap, priv_addr->datalen); priv_addr->datalen = 0; pub_addr->hdr.dstate = VIO_DESC_FREE; pub_addr->hdr.ack = 0; + priv_addr->dstate = VIO_DESC_FREE; + mutex_exit(&priv_addr->dstate_lock); D3(vswp, "clearing descp %d : pub state " "0x%llx : priv state 0x%llx", i, pub_addr->hdr.dstate, priv_addr->dstate); + + } else { + mutex_exit(&priv_addr->dstate_lock); + + if (dring_pkt->dring_process_state != + VIO_DP_STOPPED) { + DERR(vswp, "%s: descriptor %lld at pos " + " 0x%llx not DONE (0x%lx)\n", + __func__, i, pub_addr, + pub_addr->hdr.dstate); + return; + } } } + /* + * If our peer is stopping processing descriptors then + * we check to make sure it has processed all the descriptors + * we have updated. If not then we send it a new message + * to prompt it to restart. + */ + if (dring_pkt->dring_process_state == VIO_DP_STOPPED) { + DTRACE_PROBE(stop_process_recv); + D2(vswp, "%s(%lld): got stopping msg : %d : %d", + __func__, ldcp->ldc_id, dring_pkt->start_idx, + dring_pkt->end_idx); + + /* + * Check next descriptor in public section of ring. + * If its marked as READY then we need to prompt our + * peer to start processing the ring again. + */ + i = (end + 1) % len; + pub_addr = (vnet_public_desc_t *)dp->pub_addr + i; + priv_addr = (vsw_private_desc_t *)dp->priv_addr + i; + + /* + * Hold the restart lock across all of this to + * make sure that its not possible for us to + * decide that a msg needs to be sent in the future + * but the sending code having already checked is + * about to exit. + */ + mutex_enter(&dp->restart_lock); + mutex_enter(&priv_addr->dstate_lock); + if (pub_addr->hdr.dstate == VIO_DESC_READY) { + + mutex_exit(&priv_addr->dstate_lock); + + dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO; + dring_pkt->tag.vio_sid = ldcp->local_session; + + mutex_enter(&ldcp->lane_out.seq_lock); + dring_pkt->seq_num = ldcp->lane_out.seq_num++; + mutex_exit(&ldcp->lane_out.seq_lock); + + dring_pkt->start_idx = (end + 1) % len; + dring_pkt->end_idx = -1; + + D2(vswp, "%s(%lld) : sending restart msg:" + " %d : %d", __func__, ldcp->ldc_id, + dring_pkt->start_idx, + dring_pkt->end_idx); + + vsw_send_msg(ldcp, (void *)dring_pkt, + sizeof (vio_dring_msg_t)); + } else { + mutex_exit(&priv_addr->dstate_lock); + dp->restart_reqd = B_TRUE; + } + mutex_exit(&dp->restart_lock); + } break; case VIO_SUBTYPE_NACK: @@ -4510,7 +4757,9 @@ vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt) * check that the descriptor we are being ACK'ed for is in * fact READY, i.e. it is one we have shared with our peer. 
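 * Any other state means the peer has acked a slot we never
 * published, or acked the same slot twice; that is a protocol
 * error rather than a recoverable condition.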
*/ + mutex_enter(&priv_addr->dstate_lock); if (priv_addr->dstate != VIO_DESC_READY) { + mutex_exit(&priv_addr->dstate_lock); cmn_err(CE_WARN, "%s: (%ld) desc at index %ld not " "READY (0x%lx)", __func__, ldcp->ldc_id, idx, priv_addr->dstate); @@ -4527,6 +4776,7 @@ vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt) bzero(priv_addr->datap, priv_addr->datalen); priv_addr->datalen = 0; priv_addr->dstate = VIO_DESC_FREE; + mutex_exit(&priv_addr->dstate_lock); } break; @@ -4561,9 +4811,11 @@ vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt) priv_addr += idx; /* release resources associated with sent msg */ + mutex_enter(&priv_addr->dstate_lock); bzero(priv_addr->datap, priv_addr->datalen); priv_addr->datalen = 0; priv_addr->dstate = VIO_DESC_FREE; + mutex_exit(&priv_addr->dstate_lock); break; @@ -5153,6 +5405,7 @@ vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp) vio_dring_msg_t dring_pkt; dring_info_t *dp = NULL; vsw_private_desc_t *priv_desc = NULL; + vnet_public_desc_t *pub = NULL; vsw_t *vswp = ldcp->ldc_vswp; mblk_t *bp; size_t n, size; @@ -5183,14 +5436,12 @@ vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp) return (LDC_TX_FAILURE); } - mutex_enter(&dp->dlock); - size = msgsize(mp); if (size > (size_t)ETHERMAX) { DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__, ldcp->ldc_id, size); - status = LDC_TX_FAILURE; - goto vsw_dringsend_free_exit; + freemsg(mp); + return (LDC_TX_FAILURE); } /* @@ -5201,7 +5452,7 @@ vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp) * peers. This may change in the future. */ if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) { - DERR(vswp, "%s(%lld): no descriptor available for ring " + D2(vswp, "%s(%lld): no descriptor available for ring " "at 0x%llx", __func__, ldcp->ldc_id, dp); /* nothing more we can do */ @@ -5215,6 +5466,7 @@ vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp) /* copy data into the descriptor */ bufp = priv_desc->datap; + bufp += VNET_IPALIGN; for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) { n = MBLKL(bp); bcopy(bp->b_rptr, bufp, n); @@ -5222,48 +5474,69 @@ vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp) } priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size; - priv_desc->dstate = VIO_DESC_READY; - /* - * Copy relevant sections of private descriptor - * to public section - */ - vsw_dring_priv2pub(priv_desc); + pub = priv_desc->descp; + pub->nbytes = priv_desc->datalen; + + mutex_enter(&priv_desc->dstate_lock); + pub->hdr.dstate = VIO_DESC_READY; + mutex_exit(&priv_desc->dstate_lock); /* - * Send a vio_dring_msg to peer to prompt them to read - * the updated descriptor ring. + * Determine whether or not we need to send a message to our + * peer prompting them to read our newly updated descriptor(s). */ - dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA; - dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO; - dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA; - dring_pkt.tag.vio_sid = ldcp->local_session; + mutex_enter(&dp->restart_lock); + if (dp->restart_reqd) { + dp->restart_reqd = B_FALSE; + mutex_exit(&dp->restart_lock); - /* Note - for now using first ring */ - dring_pkt.dring_ident = dp->ident; + /* + * Send a vio_dring_msg to peer to prompt them to read + * the updated descriptor ring. + */ + dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA; + dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO; + dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA; + dring_pkt.tag.vio_sid = ldcp->local_session; - /* - * Access to the seq_num is implicitly protected by the - * fact that we have only one dring associated with the - * lane currently and we hold the associated dring lock. 
- */
- dring_pkt.seq_num = ldcp->lane_out.seq_num++;
+ /* Note - for now using first ring */
+ dring_pkt.dring_ident = dp->ident;
- /* Note - only updating single descrip at time at the moment */
- dring_pkt.start_idx = idx;
- dring_pkt.end_idx = idx;
+ mutex_enter(&ldcp->lane_out.seq_lock);
+ dring_pkt.seq_num = ldcp->lane_out.seq_num++;
+ mutex_exit(&ldcp->lane_out.seq_lock);
- D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
- ldcp->ldc_id, dp, dring_pkt.dring_ident);
- D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n", __func__,
- ldcp->ldc_id, dring_pkt.start_idx, dring_pkt.end_idx,
- dring_pkt.seq_num);
+ /*
+ * If last_ack_recv is -1 then we know we've not
+ * received any ACKs yet; this must be the first
+ * msg sent, so set the start to the beginning of the ring.
+ */
+ mutex_enter(&dp->dlock);
+ if (dp->last_ack_recv == -1) {
+ dring_pkt.start_idx = 0;
+ } else {
+ dring_pkt.start_idx = (dp->last_ack_recv + 1) %
+ dp->num_descriptors;
+ }
+ dring_pkt.end_idx = -1;
+ mutex_exit(&dp->dlock);
- vsw_send_msg(ldcp, (void *)&dring_pkt, sizeof (vio_dring_msg_t));
+ D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
+ ldcp->ldc_id, dp, dring_pkt.dring_ident);
+ D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n",
+ __func__, ldcp->ldc_id, dring_pkt.start_idx,
+ dring_pkt.end_idx, dring_pkt.seq_num);
-vsw_dringsend_free_exit:
+ vsw_send_msg(ldcp, (void *)&dring_pkt,
+ sizeof (vio_dring_msg_t));
+ } else {
+ mutex_exit(&dp->restart_lock);
+ D2(vswp, "%s(%lld): updating descp %d", __func__,
+ ldcp->ldc_id, idx);
+ }
- mutex_exit(&dp->dlock);
+vsw_dringsend_free_exit:
 /* free the message block */
 freemsg(mp);
@@ -5316,14 +5589,12 @@ vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
 return (LDC_TX_FAILURE);
 }
- mutex_enter(&dp->dlock);
-
 size = msgsize(mp);
 if (size > (size_t)ETHERMAX) {
 DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
 ldcp->ldc_id, size);
- status = LDC_TX_FAILURE;
- goto vsw_descrsend_free_exit;
+ freemsg(mp);
+ return (LDC_TX_FAILURE);
 }
 /*
@@ -5355,7 +5626,6 @@ vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
 }
 priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
- priv_desc->dstate = VIO_DESC_READY;
 /* create and send the in-band descp msg */
 ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
@@ -5363,12 +5633,9 @@ vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
 ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
 ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
- /*
- * Access to the seq_num is implicitly protected by the
- * fact that we have only one dring associated with the
- * lane currently and we hold the associated dring lock.
- */
+ mutex_enter(&ldcp->lane_out.seq_lock);
 ibnd_msg.hdr.seq_num = ldcp->lane_out.seq_num++;
+ mutex_exit(&ldcp->lane_out.seq_lock);
 /*
 * Copy the mem cookies describing the data from the
@@ -5388,8 +5655,6 @@ vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
 vsw_descrsend_free_exit:
- mutex_exit(&dp->dlock);
-
 /* free the allocated message blocks */
 freemsg(mp);
@@ -6140,6 +6405,7 @@ vsw_create_dring(vsw_ldc_t *ldcp)
 /* haven't used any descriptors yet */
 dp->end_idx = 0;
+ dp->last_ack_recv = -1;
 /* bind dring to the channel */
 if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
@@ -6150,6 +6416,9 @@
 goto dring_fail_exit;
 }
+ mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
+ dp->restart_reqd = B_TRUE;
+
 /*
 * Only ever create rings for outgoing lane. Link it onto
 * end of list. 
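The send side of this stop/start handshake is easiest to see in isolation: the producer only emits a dring data message when its peer has previously ACK'd with VIO_DP_STOPPED (recorded in restart_reqd), and the message's start_idx resumes just past the last ACK'd slot while an end_idx of -1 lets the peer drain every READY descriptor it finds. Below is a minimal user-space C sketch of that logic under those assumptions; ring_t, send_dring_msg(), and notify_peer_if_needed() are hypothetical stand-ins for illustration, not vsw code.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define	NUM_DESC	128

/* hypothetical model of the dring bookkeeping used above */
typedef struct ring {
	pthread_mutex_t	restart_lock;	/* protects restart_reqd */
	pthread_mutex_t	dlock;		/* protects last_ack_recv */
	bool		restart_reqd;	/* peer stopped; must be prompted */
	int		last_ack_recv;	/* -1 until the first ACK arrives */
} ring_t;

/* hypothetical transport call standing in for vsw_send_msg() */
static void
send_dring_msg(int start_idx, int end_idx)
{
	(void) printf("dring data msg: start %d end %d\n",
	    start_idx, end_idx);
}

/* producer path: called after a new descriptor is marked READY */
static void
notify_peer_if_needed(ring_t *rp)
{
	int start;

	(void) pthread_mutex_lock(&rp->restart_lock);
	if (!rp->restart_reqd) {
		/* peer is still processing; it will find the new desc */
		(void) pthread_mutex_unlock(&rp->restart_lock);
		return;
	}
	rp->restart_reqd = false;
	(void) pthread_mutex_unlock(&rp->restart_lock);

	(void) pthread_mutex_lock(&rp->dlock);
	/* first message ever starts at slot 0, else just past last ACK */
	start = (rp->last_ack_recv == -1) ?
	    0 : (rp->last_ack_recv + 1) % NUM_DESC;
	(void) pthread_mutex_unlock(&rp->dlock);

	/* end_idx of -1 tells the peer to drain every READY descriptor */
	send_dring_msg(start, -1);
}

In steady state restart_reqd stays false and no data messages are exchanged at all, which is where the win over the old one-message-per-descriptor scheme ("only updating single descrip at time") comes from.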
@@ -6225,6 +6494,9 @@ vsw_create_privring(vsw_ldc_t *ldcp)
 /* haven't used any descriptors yet */
 dp->end_idx = 0;
+ mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
+ dp->restart_reqd = B_TRUE;
+
 /*
 * Only ever create rings for outgoing lane. Link it onto
 * end of list.
@@ -6257,12 +6529,14 @@ vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
 uint64_t offset = 0;
 uint32_t ncookies = 0;
 static char *name = "vsw_setup_ring";
- int i, j, rv;
+ int i, j, nc, rv;
- /* note - public section may be null */
 priv_addr = dp->priv_addr;
 pub_addr = dp->pub_addr;
+ /* public section may be null but private should never be */
+ ASSERT(priv_addr != NULL);
+
 /*
 * Allocate the region of memory which will be used to hold
 * the data the descriptors will refer to.
@@ -6281,6 +6555,8 @@ vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
 * descriptor fields.
 */
 for (i = 0; i < VSW_RING_NUM_EL; i++) {
+ mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
+
 if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
 &priv_addr->memhandle)) != 0) {
 DERR(vswp, "%s: alloc mem handle failed", name);
@@ -6335,6 +6611,14 @@ vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
 /* link pub and private sides */
 priv_addr->descp = pub_addr;
+ pub_addr->ncookies = priv_addr->ncookies;
+
+ for (nc = 0; nc < pub_addr->ncookies; nc++) {
+ bcopy(&priv_addr->memcookie[nc],
+ &pub_addr->memcookie[nc],
+ sizeof (ldc_mem_cookie_t));
+ }
+
 pub_addr->hdr.dstate = VIO_DESC_FREE;
 pub_addr++;
 }
@@ -6352,10 +6636,12 @@ vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
 setup_ring_cleanup:
 priv_addr = dp->priv_addr;
- for (i = 0; i < VSW_RING_NUM_EL; i++) {
+ for (j = 0; j < i; j++) {
 (void) ldc_mem_unbind_handle(priv_addr->memhandle);
 (void) ldc_mem_free_handle(priv_addr->memhandle);
+ mutex_destroy(&priv_addr->dstate_lock);
+
 priv_addr++;
 }
 kmem_free(dp->data_addr, dp->data_sz);
@@ -6368,7 +6654,8 @@ setup_ring_cleanup:
 * starting at the location of the last free descriptor found
 * previously.
 *
- * Returns 0 if free descriptor is available, 1 otherwise.
+ * Returns 0 if a free descriptor is available, updating the
+ * state of the private descriptor to VIO_DESC_READY; otherwise
+ * returns 1.
 *
 * FUTURE: might need to return contiguous range of descriptors
 * as dring info msg assumes all will be contiguous. 
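The reworked setup_ring_cleanup label above also fixes the unwind path: on a failure at loop index i, only descriptors 0..i-1 were fully initialized, so the cleanup loop now iterates j < i instead of walking all VSW_RING_NUM_EL entries, which would have unbound handles and destroyed locks that were never set up. A generic C sketch of this partial-unwind pattern follows; elem_t, init_elem(), and fini_elem() are hypothetical names for illustration.

typedef struct elem {
	int	inited;		/* stands in for mem handles, locks, etc. */
} elem_t;

/* hypothetical per-element setup/teardown */
static int
init_elem(elem_t *ep)
{
	ep->inited = 1;
	return (0);
}

static void
fini_elem(elem_t *ep)
{
	ep->inited = 0;
}

/*
 * Initialize n elements; on failure at index i, undo only the i
 * elements that were actually set up (cf. the j < i loop above).
 */
static int
setup_all(elem_t *tab, int n)
{
	int i, j;

	for (i = 0; i < n; i++) {
		if (init_elem(&tab[i]) != 0)
			goto cleanup;
	}
	return (0);

cleanup:
	for (j = 0; j < i; j++)
		fini_elem(&tab[j]);
	return (-1);
}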
@@ -6377,38 +6664,34 @@ static int vsw_dring_find_free_desc(dring_info_t *dringp, vsw_private_desc_t **priv_p, int *idx) { - vsw_private_desc_t *addr; - uint64_t i; - uint64_t j = 0; - uint64_t start = dringp->end_idx; + vsw_private_desc_t *addr = NULL; int num = VSW_RING_NUM_EL; int ret = 1; D1(NULL, "%s enter\n", __func__); - addr = dringp->priv_addr; + ASSERT(dringp->priv_addr != NULL); D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld", - __func__, dringp, start); - - for (i = start; j < num; i = (i + 1) % num, j++) { - addr = (vsw_private_desc_t *)dringp->priv_addr + i; - D2(NULL, "%s: descriptor %lld : dstate 0x%llx\n", - __func__, i, addr->dstate); - if (addr->dstate == VIO_DESC_FREE) { - D2(NULL, "%s: descriptor %lld is available", - __func__, i); - *priv_p = addr; - *idx = i; - dringp->end_idx = (i + 1) % num; - ret = 0; - break; - } + __func__, dringp, dringp->end_idx); + + addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx; + + mutex_enter(&addr->dstate_lock); + if (addr->dstate == VIO_DESC_FREE) { + addr->dstate = VIO_DESC_READY; + *priv_p = addr; + *idx = dringp->end_idx; + dringp->end_idx = (dringp->end_idx + 1) % num; + ret = 0; + } + mutex_exit(&addr->dstate_lock); /* ring full */ if (ret == 1) { - D2(NULL, "%s: no desp free: started at %d", __func__, start); + D2(NULL, "%s: no desp free: started at %d", __func__, + dringp->end_idx); } D1(NULL, "%s: exit\n", __func__); @@ -6417,34 +6700,6 @@ vsw_dring_find_free_desc(dring_info_t *dringp, } /* - * Copy relevant fields from the private descriptor into the - * associated public side. - */ -static void -vsw_dring_priv2pub(vsw_private_desc_t *priv) -{ - vnet_public_desc_t *pub; - int i; - - D1(NULL, "vsw_dring_priv2pub enter\n"); - - pub = priv->descp; - - pub->ncookies = priv->ncookies; - pub->nbytes = priv->datalen; - - for (i = 0; i < pub->ncookies; i++) { - bcopy(&priv->memcookie[i], &pub->memcookie[i], - sizeof (ldc_mem_cookie_t)); - } - - pub->hdr.ack = 1; - pub->hdr.dstate = VIO_DESC_READY; - - D1(NULL, "vsw_dring_priv2pub exit"); -} - -/* * Map from a dring identifier to the ring itself. Returns * pointer to ring or NULL if no match found. 
*/ @@ -6487,7 +6742,10 @@ vsw_set_lane_attr(vsw_t *vswp, lane_t *lp) lp->addr_type = ADDR_TYPE_MAC; lp->xfer_mode = VIO_DRING_MODE; lp->ack_freq = 0; /* for shared mode */ + + mutex_enter(&lp->seq_lock); lp->seq_num = VNET_ISS; + mutex_exit(&lp->seq_lock); } /* @@ -6650,7 +6908,9 @@ vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir) } lp->lstate = VSW_LANE_INACTIV; + mutex_enter(&lp->seq_lock); lp->seq_num = VNET_ISS; + mutex_exit(&lp->seq_lock); if (lp->dringp) { if (dir == INBOUND) { dp = lp->dringp; @@ -6725,6 +6985,7 @@ vsw_free_ring(dring_info_t *dp) } paddr->memhandle = NULL; } + mutex_destroy(&paddr->dstate_lock); } kmem_free(dp->priv_addr, (sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL)); @@ -6744,6 +7005,7 @@ vsw_free_ring(dring_info_t *dp) mutex_exit(&dp->dlock); mutex_destroy(&dp->dlock); + mutex_destroy(&dp->restart_lock); kmem_free(dp, sizeof (dring_info_t)); dp = dpp; diff --git a/usr/src/uts/sun4v/sys/ldc_impl.h b/usr/src/uts/sun4v/sys/ldc_impl.h index 84fcc52b1f..4064ef99c3 100644 --- a/usr/src/uts/sun4v/sys/ldc_impl.h +++ b/usr/src/uts/sun4v/sys/ldc_impl.h @@ -427,6 +427,7 @@ struct ldc_chan { boolean_t intr_pending; /* TRUE if interrupts are pending */ + kmutex_t tx_lock; /* Transmit lock */ uint64_t tx_q_entries; /* Num entries in transmit queue */ uint64_t tx_q_va; /* Virtual addr of transmit queue */ uint64_t tx_q_ra; /* Real addr of transmit queue */ @@ -451,7 +452,6 @@ struct ldc_chan { uint8_t pkt_payload; /* Size of packet payload */ - uint32_t first_fragment; /* Seqid of first msg fragment */ uint32_t last_msg_snt; /* Seqid of last packet sent */ uint32_t last_ack_rcd; /* Seqid of last ACK recd */ uint32_t last_msg_rcd; /* Seqid of last packet received */ diff --git a/usr/src/uts/sun4v/sys/vdsk_common.h b/usr/src/uts/sun4v/sys/vdsk_common.h index b8251afea2..b4e6d4351f 100644 --- a/usr/src/uts/sun4v/sys/vdsk_common.h +++ b/usr/src/uts/sun4v/sys/vdsk_common.h @@ -79,7 +79,7 @@ extern "C" { #define VD_MAX_COOKIES ((VD_MAX_BLOCK_SIZE / PAGESIZE) + 1) #define VD_USEC_TIMEOUT 20000 #define VD_LDC_IDS_PROP "ldc-ids" -#define VD_LDC_QLEN 32 +#define VD_LDC_QLEN VD_DRING_LEN /* * Flags used by ioctl routines to indicate if a copyin/copyout is needed diff --git a/usr/src/uts/sun4v/sys/vio_mailbox.h b/usr/src/uts/sun4v/sys/vio_mailbox.h index 66de0722e6..c3b74ac9be 100644 --- a/usr/src/uts/sun4v/sys/vio_mailbox.h +++ b/usr/src/uts/sun4v/sys/vio_mailbox.h @@ -120,6 +120,13 @@ extern "C" { #define VIO_PAYLOAD_ELEMS (VIO_PAYLOAD_SZ / LDC_ELEM_SIZE) /* num words */ /* + * Peer dring processing state. Either actively processing dring + * or stopped. + */ +#define VIO_DP_ACTIVE 1 +#define VIO_DP_STOPPED 2 + +/* * VIO device message tag. * * These 64 bits are used as a common header for all VIO message types. @@ -169,7 +176,6 @@ typedef struct vio_ver_msg { uint64_t resv3[VIO_PAYLOAD_ELEMS - 1]; } vio_ver_msg_t; - /* * VIO Descriptor Ring Register message. * @@ -260,10 +266,15 @@ typedef struct vio_dring_msg { uint32_t start_idx; /* Indx of first updated elem */ int32_t end_idx; /* Indx of last updated elem */ + uint8_t dring_process_state; /* Processing state */ + /* * Padding. 
*/
- uint64_t resv[VIO_PAYLOAD_ELEMS - 3];
+ uint8_t resv1;
+ uint16_t resv2;
+ uint32_t resv3;
+ uint64_t resv4[VIO_PAYLOAD_ELEMS - 4];
 } vio_dring_msg_t;
 /*
diff --git a/usr/src/uts/sun4v/sys/vio_util.h b/usr/src/uts/sun4v/sys/vio_util.h
new file mode 100644
index 0000000000..ab7a255f1e
--- /dev/null
+++ b/usr/src/uts/sun4v/sys/vio_util.h
@@ -0,0 +1,127 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _VIO_UTIL_H
+#define _VIO_UTIL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/stream.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * A message is composed of three structures: a message block (mblk_t), a
+ * data block to which it points, and a data buffer. desballoc(9F) allows
+ * the caller to specify the data buffer and a free function which will
+ * be invoked when freeb(9F) is called to free the message. This allows
+ * the user to reclaim and reuse the data buffer, as opposed to using
+ * allocb(9F) where the message block, data block and data buffer are
+ * all destroyed by freeb().
+ *
+ * Note that even with desballoc the message and data blocks are destroyed
+ * by freeb() and must be recreated. It is only the data buffer which is
+ * preserved.
+ *
+ * The caller first creates a pool of vio_mblk_t's by invoking
+ * vio_create_mblks() and specifying the number of mblks and the size of the
+ * associated data buffers. Each vio_mblk_t contains a pointer to the
+ * mblk_t, a pointer to the data buffer and a function pointer to the
+ * reclaim function. The caller is returned a pointer to the pool, which is
+ * used in subsequent allocation/destroy requests.
+ *
+ * The pool is managed as a circular queue with a head and tail index.
+ * Allocation requests result in the head index being incremented; mblks
+ * returned to the pool result in the tail index being incremented.
+ *
+ * The pool can only be destroyed when all the mblks have been returned. It
+ * is the responsibility of the caller to ensure that all vio_allocb()
+ * requests have been completed before the pool is destroyed.
+ *
+ *
+ * vio_mblk_pool_t
+ * +-------------+
+ * | tail |--------------------------------+
+ * +-------------+ |
+ * | head |--------+ |
+ * +-------------+ | |
+ * ............... V V
+ * +-------------+ +-------+-------+-------+-------+
+ * | quep |---->| vmp_t | vmp_t | vmp_t | vmp_t |
+ * +-------------+ +-------+-------+-------+-------+
+ * | | | | | |
+ * ... 
| | | | +------------+ + * | | | +-->| data block | + * | | | +------------+ + * | | | +------------+ + * | | +-->| data block | + * | | +------------+ + * | | +------------+ + * | +-->| data block | + * | +------------+ + * | +------------+ + * +-->| data block | + * +------------+ + * + */ + +struct vio_mblk_pool; + +typedef struct vio_mblk { + uint8_t *datap; /* data buffer */ + mblk_t *mp; /* mblk using datap */ + frtn_t reclaim; /* mblk reclaim routine */ + struct vio_mblk_pool *vmplp; /* pointer to parent pool */ +} vio_mblk_t; + +typedef struct vio_mblk_pool { + struct vio_mblk_pool *nextp; /* next in a list */ + kmutex_t hlock; /* sync access to head */ + kmutex_t tlock; /* sync access to tail */ + vio_mblk_t *basep; /* base pointer to pool of vio_mblks */ + vio_mblk_t **quep; /* queue of free vio_mblks */ + uint8_t *datap; /* rx data buffer area */ + uint32_t head; /* queue head */ + uint32_t tail; /* queue tail */ + uint64_t quelen; /* queue len (# mblks) */ + uint64_t quemask; /* quelen - 1 */ + size_t mblk_size; /* data buf size of each mblk */ +} vio_mblk_pool_t; + +int vio_create_mblks(uint64_t num_mblks, + size_t mblk_size, vio_mblk_pool_t **); +int vio_destroy_mblks(vio_mblk_pool_t *); +mblk_t *vio_allocb(vio_mblk_pool_t *); +void vio_freeb(void *arg); + + +#ifdef __cplusplus +} +#endif + +#endif /* _VIO_UTIL_H */ diff --git a/usr/src/uts/sun4v/sys/vldc_impl.h b/usr/src/uts/sun4v/sys/vldc_impl.h index 8610344b42..ffdd97636b 100644 --- a/usr/src/uts/sun4v/sys/vldc_impl.h +++ b/usr/src/uts/sun4v/sys/vldc_impl.h @@ -52,6 +52,8 @@ extern "C" { #define VLDC_MINOR_MASK (VLDC_MAX_PORTS - 1) #define VLDC_INST_SHIFT 11 +#define VLDC_HVCTL_SVCNAME "hvctl" + /* get port number from minor number */ #define VLDCPORT(vldcp, minor) \ ((vldcp)->minor_tbl[(minor) & VLDC_MINOR_MASK].portno) @@ -95,6 +97,7 @@ typedef struct vldc_port { uint32_t mtu; /* port mtu */ caddr_t send_buf; /* send buffer */ caddr_t recv_buf; /* receive buffer */ + caddr_t cookie_buf; /* rd/wr cookie buffer */ uint64_t ldc_id; /* Channel number */ ldc_handle_t ldc_handle; /* Channel handle */ diff --git a/usr/src/uts/sun4v/sys/vnet.h b/usr/src/uts/sun4v/sys/vnet.h index c43af5bfab..53202f7601 100644 --- a/usr/src/uts/sun4v/sys/vnet.h +++ b/usr/src/uts/sun4v/sys/vnet.h @@ -44,6 +44,7 @@ extern "C" { #define VNET_LDCWD_INTERVAL 1000 /* watchdog freq in msec */ #define VNET_LDCWD_TXTIMEOUT 1000 /* tx timeout in msec */ #define VNET_LDC_QLEN 1024 /* ldc qlen */ +#define VNET_NRBUFS 512 /* number of receive bufs */ /* * vnet proxy transport layer information. 
There is one instance of this for diff --git a/usr/src/uts/sun4v/sys/vnet_common.h b/usr/src/uts/sun4v/sys/vnet_common.h index feed7025a2..575db18efb 100644 --- a/usr/src/uts/sun4v/sys/vnet_common.h +++ b/usr/src/uts/sun4v/sys/vnet_common.h @@ -43,11 +43,13 @@ extern "C" { */ /* max # of cookies per frame size */ -#define MAX_COOKIES ((ETHERMAX >> MMU_PAGESHIFT) + 2) +#define MAX_COOKIES ((ETHERMAX >> MMU_PAGESHIFT) + 2ULL) /* initial send sequence number */ #define VNET_ISS 0x1 +#define VNET_IPALIGN 6 /* padding for IP header alignment */ + /* vnet descriptor */ typedef struct vnet_public_desc { vio_dring_entry_hdr_t hdr; /* descriptor header */ diff --git a/usr/src/uts/sun4v/sys/vnet_gen.h b/usr/src/uts/sun4v/sys/vnet_gen.h index c6ad5fe8c0..3166a3412d 100644 --- a/usr/src/uts/sun4v/sys/vnet_gen.h +++ b/usr/src/uts/sun4v/sys/vnet_gen.h @@ -69,7 +69,7 @@ extern "C" { #define LDC_TO_VNET(ldcp) ((ldcp)->portp->vgenp->vnetp) #define LDC_TO_VGEN(ldcp) ((ldcp)->portp->vgenp) -#define VGEN_TX_DBLK_SZ 2048 /* tx data buffer size */ +#define VGEN_DBLK_SZ 2048 /* data buffer size */ #define VGEN_LDC_UP_DELAY 100 /* usec delay between ldc_up retries */ /* get the address of next tbuf */ @@ -107,7 +107,6 @@ typedef struct vgen_priv_desc { ldc_mem_handle_t memhandle; /* mem handle for data */ caddr_t datap; /* prealloc'd tx data buffer */ uint64_t datalen; /* total actual datalen */ - uint64_t seqnum; /* sequence number of pkt */ uint64_t ncookies; /* num ldc_mem_cookies */ ldc_mem_cookie_t memcookie[MAX_COOKIES]; /* data cookies */ } vgen_private_desc_t; @@ -147,13 +146,10 @@ typedef struct vgen_ver { typedef struct vgen_stats { /* Link Input/Output stats */ - uint64_t ipackets; - uint64_t ierrors; - uint64_t opackets; - uint64_t oerrors; -#if 0 - uint64_t collisions; -#endif + uint64_t ipackets; /* # rx packets */ + uint64_t ierrors; /* # rx error */ + uint64_t opackets; /* # tx packets */ + uint64_t oerrors; /* # tx error */ /* MIB II variables */ uint64_t rbytes; /* # bytes received */ @@ -166,17 +162,18 @@ typedef struct vgen_stats { uint32_t noxmtbuf; /* # xmit packets discarded */ /* Tx Statistics */ - uint32_t tx_no_desc; - uint32_t tx_allocb_fail; + uint32_t tx_no_desc; /* # out of transmit descriptors */ /* Rx Statistics */ - uint32_t rx_no_desc; - uint32_t rx_allocb_fail; - uint32_t rx_lost_pkts; + uint32_t rx_allocb_fail; /* # rx buf allocb() failures */ + uint32_t rx_vio_allocb_fail; /* # vio_allocb() failures */ + uint32_t rx_lost_pkts; /* # rx lost packets */ /* Callback statistics */ - uint32_t callbacks; - uint32_t dring_data_acks; + uint32_t callbacks; /* # callbacks */ + uint32_t dring_data_acks; /* # dring data acks recvd */ + uint32_t dring_stopped_acks; /* # dring stopped acks recvd */ + uint32_t dring_data_msgs; /* # dring data msgs sent */ } vgen_stats_t; @@ -190,9 +187,7 @@ typedef struct vgen_kstats { kstat_named_t opackets; kstat_named_t opackets64; kstat_named_t oerrors; -#if 0 - kstat_named_t collisions; -#endif + /* * required by kstat for MIB II objects(RFC 1213) */ @@ -208,17 +203,18 @@ typedef struct vgen_kstats { kstat_named_t noxmtbuf; /* MIB - ifOutDiscards */ /* Tx Statistics */ - kstat_named_t tx_no_desc; - kstat_named_t tx_allocb_fail; + kstat_named_t tx_no_desc; /* # out of transmit descriptors */ /* Rx Statistics */ - kstat_named_t rx_no_desc; - kstat_named_t rx_allocb_fail; - kstat_named_t rx_lost_pkts; + kstat_named_t rx_allocb_fail; /* # rx buf allocb failures */ + kstat_named_t rx_vio_allocb_fail; /* # vio_allocb() failures */ + kstat_named_t rx_lost_pkts; 
/* # rx lost packets */
 /* Callback statistics */
- kstat_named_t callbacks;
- kstat_named_t dring_data_acks;
+ kstat_named_t callbacks; /* # callbacks */
+ kstat_named_t dring_data_acks; /* # dring data acks recvd */
+ kstat_named_t dring_stopped_acks; /* # dring stopped acks recvd */
+ kstat_named_t dring_data_msgs; /* # dring data msgs sent */
 } vgen_kstats_t;
@@ -277,6 +273,8 @@ typedef struct vgen_ldc {
 uint32_t next_rxi; /* next expected recv index */
 uint32_t num_rxds; /* number of rx descriptors */
 caddr_t tx_datap; /* prealloc'd tx data area */
+ vio_mblk_pool_t *rmp; /* rx mblk pool */
+ uint32_t num_rbufs; /* number of rx bufs */
 /* misc */
 uint32_t flags; /* flags */
@@ -284,6 +282,7 @@ typedef struct vgen_ldc {
 boolean_t need_ldc_reset; /* ldc_reset needed */
 boolean_t need_mcast_sync; /* sync mcast table with vsw */
 uint32_t hretries; /* handshake retry count */
+ boolean_t resched_peer; /* send tx msg to peer */
 /* channel statistics */
 vgen_stats_t *statsp; /* channel statistics */
@@ -329,6 +328,7 @@ typedef struct vgen {
 struct ether_addr *mctab; /* multicast addr table */
 uint32_t mcsize; /* allocated size of mctab */
 uint32_t mccount; /* # of valid addrs in mctab */
+ vio_mblk_pool_t *rmp; /* rx mblk pools to be freed */
 } vgen_t;
 #ifdef __cplusplus
diff --git a/usr/src/uts/sun4v/sys/vsw.h b/usr/src/uts/sun4v/sys/vsw.h
index fccb3c6fb8..b1df247547 100644
--- a/usr/src/uts/sun4v/sys/vsw.h
+++ b/usr/src/uts/sun4v/sys/vsw.h
@@ -82,6 +82,7 @@ extern "C" {
 #include <sys/vio_mailbox.h>
 #include <sys/vnet_common.h>
 #include <sys/ethernet.h>
+#include <sys/vio_util.h>
 /*
 * Default message type.
@@ -209,9 +210,21 @@ typedef struct ver_sup {
 #define VSW_MAX_COOKIES ((ETHERMTU >> MMU_PAGESHIFT) + 2)
 /*
+ * Size and number of mblks to be created in the free pool.
+ */
+#define VSW_MBLK_SIZE 2048
+#define VSW_NUM_MBLKS 1024
+
+/*
 * Private descriptor
 */
 typedef struct vsw_private_desc {
+ /*
+ * Below lock must be held when accessing the state of
+ * a descriptor in either the private or public section
+ * of the ring. 
+ */
+ kmutex_t dstate_lock;
 uint64_t dstate;
 vnet_public_desc_t *descp;
 ldc_mem_handle_t memhandle;
@@ -237,6 +250,10 @@ typedef struct dring_info {
 ldc_dring_handle_t handle;
 uint64_t ident; /* identifier sent to peer */
 uint64_t end_idx; /* last idx processed */
+ int64_t last_ack_recv;
+
+ kmutex_t restart_lock;
+ boolean_t restart_reqd; /* send restart msg */
 /*
 * base address of private and public portions of the
@@ -258,6 +275,7 @@ typedef struct lane {
 uint64_t lstate; /* Lane state */
 uint32_t ver_major:16, /* Version major number */
 ver_minor:16; /* Version minor number */
+ kmutex_t seq_lock;
 uint64_t seq_num; /* Sequence number */
 uint64_t mtu; /* ETHERMTU */
 uint64_t addr; /* Unique physical address */
@@ -295,6 +313,7 @@ typedef struct vsw_ldc {
 lane_t lane_in; /* Inbound lane */
 lane_t lane_out; /* Outbound lane */
 uint8_t dev_class; /* Peer device class */
+ vio_mblk_pool_t *rxh; /* Receive pool handle */
 } vsw_ldc_t;
 /* list of ldcs per port */
@@ -407,6 +426,8 @@ typedef struct vsw {
 mod_hash_t *mfdb; /* multicast FDB */
 krwlock_t mfdbrw; /* rwlock for mFDB */
+ vio_mblk_pool_t *rxh; /* Receive pool handle */
+
 /* mac layer */
 mac_handle_t mh;
 mac_rx_handle_t mrh;
diff --git a/usr/src/uts/sun4v/vnet/Makefile b/usr/src/uts/sun4v/vnet/Makefile
index 2eed19f4bc..a07417544b 100644
--- a/usr/src/uts/sun4v/vnet/Makefile
+++ b/usr/src/uts/sun4v/vnet/Makefile
@@ -69,9 +69,6 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
 #
 CFLAGS += $(CCVERBOSE)
 CFLAGS += -DVGEN_HANDLE_LOST_PKTS
-#CFLAGS += -DVGEN_USE_MAC_TX_UPDATE
-#CFLAGS += -DVGEN_REXMIT
-
 #
 # Driver depends on MAC & IP
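Finally, the new vio_util.h interface introduced above is easiest to understand from a caller's point of view. The C sketch below is based only on the prototypes declared in that header; the example_* names are hypothetical, the buffer count and size simply mirror the VSW_NUM_MBLKS/VSW_MBLK_SIZE defaults rather than any required values, and the idea that vio_destroy_mblks() returns non-zero while mblks are still outstanding is an assumption, not documented behavior.

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/stream.h>
#include <sys/vio_util.h>

static vio_mblk_pool_t *rx_poolp;	/* receive buffer pool */

/* attach path: carve out 1024 buffers of 2048 bytes each */
static int
example_pool_init(void)
{
	return (vio_create_mblks(1024, 2048, &rx_poolp));
}

/* receive path: wrap one incoming frame in an mblk from the pool */
static mblk_t *
example_rx_frame(uchar_t *frame, size_t nbytes)
{
	mblk_t *mp;

	if (nbytes > 2048 || (mp = vio_allocb(rx_poolp)) == NULL)
		return (NULL);	/* pool drained; caller counts the miss */

	bcopy(frame, mp->b_rptr, nbytes);
	mp->b_wptr = mp->b_rptr + nbytes;

	/*
	 * When the stack eventually calls freeb(9F) on mp, the
	 * desballoc free routine (vio_freeb) returns the data
	 * buffer to rx_poolp instead of destroying it.
	 */
	return (mp);
}

/* detach path: only legal once every mblk is back in the pool */
static int
example_pool_fini(void)
{
	/* assumed: non-zero return means mblks are still outstanding */
	return (vio_destroy_mblks(rx_poolp));
}

The NULL return from vio_allocb() in the middle function is exactly the case the new vgen rx_vio_allocb_fail statistic above counts.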