author    narayan <none@none>    2006-07-10 13:54:40 -0700
committer narayan <none@none>    2006-07-10 13:54:40 -0700
commit    d10e4ef2fabf16c3237c6d6592496df3eac6a1ef (patch)
tree      eba8ce41c34abcfb747da37e51cbe610162cf334
parent    0ccf9e790d232720597416743840df88825a9317 (diff)
6412648 VIO service drivers auto-unload after some time
6413569 Possible memory leaks needs to be investigated.
6423722 vds should use finer-grained locking for better performance
6429738 LDom panics using a destroyed ldc
6431111 LDOM manager should use P_FAULTED state for cpu instead of P_OFFLINE/
6431300 can not enter '~' character on a ldom's console
6431458 vDisk drivers need to handle read/write requests asynchronously
6437436 ldc read/write operations are serialized due to a common lock
6437766 vDisk VTOC should handle the timestamp field
6440543 vSwitch/vNet should use aligned IP frame headers
6440553 vNet/vSwitch should reuse previously allocated mblks
6442270 vDisk server should set FREAD and FWRITE mode when calling ldi_ioctl
6442851 Remove VLDC max write_pa limit
6442973 vntsd dumps core with assertion failure message
6443193 vDisk client incorrectly implements DKIOCSGEOM ioctl
6443198 vDisk client incorrectly caches new VTOC on DKIOCSVTOC ioctl
6444392 vswitch/vnet should set end_idx to -1 for increased performance
-rw-r--r-- usr/src/cmd/vntsd/cmd.c 24
-rw-r--r-- usr/src/cmd/vntsd/vntsd.h 2
-rw-r--r-- usr/src/cmd/vntsd/write.c 34
-rw-r--r-- usr/src/uts/sun4v/Makefile.files 2
-rw-r--r-- usr/src/uts/sun4v/io/cnex.c 11
-rw-r--r-- usr/src/uts/sun4v/io/fault_iso.c 2
-rw-r--r-- usr/src/uts/sun4v/io/ldc.c 329
-rw-r--r-- usr/src/uts/sun4v/io/vdc.c 444
-rw-r--r-- usr/src/uts/sun4v/io/vds.c 906
-rw-r--r-- usr/src/uts/sun4v/io/vio_util.c 184
-rw-r--r-- usr/src/uts/sun4v/io/vldc.c 156
-rw-r--r-- usr/src/uts/sun4v/io/vnet.c 37
-rw-r--r-- usr/src/uts/sun4v/io/vnet_gen.c 701
-rw-r--r-- usr/src/uts/sun4v/io/vsw.c 692
-rw-r--r-- usr/src/uts/sun4v/sys/ldc_impl.h 2
-rw-r--r-- usr/src/uts/sun4v/sys/vdsk_common.h 2
-rw-r--r-- usr/src/uts/sun4v/sys/vio_mailbox.h 15
-rw-r--r-- usr/src/uts/sun4v/sys/vio_util.h 127
-rw-r--r-- usr/src/uts/sun4v/sys/vldc_impl.h 3
-rw-r--r-- usr/src/uts/sun4v/sys/vnet.h 1
-rw-r--r-- usr/src/uts/sun4v/sys/vnet_common.h 4
-rw-r--r-- usr/src/uts/sun4v/sys/vnet_gen.h 52
-rw-r--r-- usr/src/uts/sun4v/sys/vsw.h 21
-rw-r--r-- usr/src/uts/sun4v/vnet/Makefile 3
24 files changed, 2499 insertions, 1255 deletions
diff --git a/usr/src/cmd/vntsd/cmd.c b/usr/src/cmd/vntsd/cmd.c
index c39ef03399..8bee8417fe 100644
--- a/usr/src/cmd/vntsd/cmd.c
+++ b/usr/src/cmd/vntsd/cmd.c
@@ -275,14 +275,22 @@ exit_daemon_cmd(vntsd_client_t *clientp, int rv)
return (rv);
}
-/* vntsd_process_daemon_cmd() - special commands */
+/*
+ * vntsd_process_daemon_cmd() - special commands
+ * "<RET>~" vntsd daemon commands
+ * "<RET>~~" enter '~' character
+ */
int
vntsd_process_daemon_cmd(vntsd_client_t *clientp, char c)
{
esctable_t *p;
int rv;
+ char prev_char;
+
+ prev_char = clientp->prev_char;
+ clientp->prev_char = c;
- if (c != VNTSD_DAEMON_CMD) {
+ if (c != VNTSD_DAEMON_CMD || (prev_char != 0 && prev_char != CR)) {
/* not a daemon command */
return (VNTSD_SUCCESS);
}
@@ -304,6 +312,18 @@ vntsd_process_daemon_cmd(vntsd_client_t *clientp, char c)
return (exit_daemon_cmd(clientp, rv));
}
+ clientp->prev_char = c;
+ if (c == VNTSD_DAEMON_CMD) {
+ /*
+ * received another '~'
+ * a user types '~~' to get '~'
+ */
+ (void) mutex_lock(&clientp->lock);
+ clientp->status &= ~VNTSD_CLIENT_DISABLE_DAEMON_CMD;
+ (void) mutex_unlock(&clientp->lock);
+ return (VNTSD_SUCCESS);
+ }
+
for (p = etable; p->e_char; p++) {
if (p->e_char == c) {
/* found match */
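The cmd.c change above (6431300) keys vntsd's escape processing off the character that preceded the current one: '~' opens a daemon command only at the start of a line (after a carriage return or as the very first character), and after "<RET>~" a second '~' is forwarded so the user gets a literal '~'. Below is a minimal stand-alone sketch of that classification in plain C; struct client, the TILDE macro, and the main() driver are illustrative stand-ins, not vntsd code.

    #include <stdio.h>

    #define CR    0x0d
    #define TILDE '~'

    /* per-client state, mirroring the prev_char field the patch adds */
    struct client {
        char prev_char;        /* previous character read from this client */
    };

    /*
     * Classify one input character: returns 1 if c opens a daemon command,
     * 0 if it should be forwarded to the console as ordinary input.
     * '~' is special only at the start of a line; after "<RET>~", a second
     * '~' is classified as literal, which is how "~~" yields one '~'.
     */
    static int
    is_daemon_cmd(struct client *cl, char c)
    {
        char prev = cl->prev_char;

        cl->prev_char = c;
        if (c != TILDE)
            return (0);            /* ordinary character */
        if (prev != 0 && prev != CR)
            return (0);            /* mid-line '~': literal */
        return (1);                /* "<RET>~": daemon command */
    }

    int
    main(void)
    {
        struct client cl = { 0 };
        const char *input = "a~\r~~";

        for (const char *p = input; *p != '\0'; p++)
            printf("0x%02x -> %s\n", (unsigned char)*p,
                is_daemon_cmd(&cl, *p) ? "command" : "literal");
        return (0);
    }

Running the driver shows the mid-line '~' after 'a' classified as literal, the '~' after the carriage return as a command, and the second '~' of "~~" as literal again.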
diff --git a/usr/src/cmd/vntsd/vntsd.h b/usr/src/cmd/vntsd/vntsd.h
index 16b1bbe90f..efaf724c15 100644
--- a/usr/src/cmd/vntsd/vntsd.h
+++ b/usr/src/cmd/vntsd/vntsd.h
@@ -248,6 +248,8 @@ typedef struct vntsd_client {
struct vntsd_cons *cons; /* back link to console configuration */
+ char prev_char; /* previous char read by this client */
+
} vntsd_client_t;
/* console structure */
diff --git a/usr/src/cmd/vntsd/write.c b/usr/src/cmd/vntsd/write.c
index 16f07029c5..9110056c11 100644
--- a/usr/src/cmd/vntsd/write.c
+++ b/usr/src/cmd/vntsd/write.c
@@ -50,6 +50,12 @@
#include "vntsd.h"
#include "chars.h"
+/* handle for writing all clients */
+typedef struct write_buf {
+ uint_t sz; /* data size */
+ char *buf;
+} write_buf_t;
+
/*
* check the state of write thread. exit if no more client connects to the
* console.
@@ -81,20 +87,16 @@ write_chk_status(vntsd_cons_t *consp, int status)
* skip_terminal_null()
* scan terminal null character sequence (0x5e 0x40)
* return number of characters in the buf after skipping terminal null
- * sequence.
+ * sequence. buf size must be at least sz+1.
*/
static int
-skip_terminal_null(char *buf, int buf_sz, int sz)
+skip_terminal_null(char *buf, int sz)
{
int i, j;
static int term_null_seq = 0;
assert(sz >= 0);
- if (buf_sz < sz+1) {
- return (-1);
- }
-
if (term_null_seq) {
/* skip 0x5e previously */
term_null_seq = 0;
@@ -180,14 +182,18 @@ read_vcc(vntsd_cons_t *consp, char *buf, ssize_t *sz)
return (VNTSD_STATUS_VCC_IO_ERR);
}
-static int s_sz;
-/* write to a client */
+/*
+ * write to a client
+ * this function is passed as a parameter to vntsd_que_find.
+ * for each client that is connected to the console, vntsd_que_find
+ * applies this function.
+ */
static boolean_t
-write_all_clients(vntsd_client_t *clientp, char *buf)
+write_one_client(vntsd_client_t *clientp, write_buf_t *write_buf)
{
int rv;
- rv = vntsd_write_client(clientp, buf, s_sz);
+ rv = vntsd_write_client(clientp, write_buf->buf, write_buf->sz);
if (rv != VNTSD_SUCCESS) {
(void) mutex_lock(&clientp->lock);
clientp->status |= VNTSD_CLIENT_IO_ERR;
@@ -206,6 +212,7 @@ vntsd_write_thread(vntsd_cons_t *consp)
char buf[VNTSD_MAX_BUF_SIZE+1];
int sz;
int rv;
+ write_buf_t write_buf;
D1(stderr, "t@%d vntsd_write@%d\n", thr_self(), consp->vcc_fd);
@@ -225,12 +232,13 @@ vntsd_write_thread(vntsd_cons_t *consp)
}
/* has data */
- if ((s_sz = skip_terminal_null(buf, sz+1, sz)) == 0) {
+ if ((sz = skip_terminal_null(buf, sz)) == 0) {
/* terminal null sequence */
continue;
}
- assert(s_sz > 0);
+ write_buf.sz = sz;
+ write_buf.buf = buf;
/*
* output data to all clients connected
@@ -239,7 +247,7 @@ vntsd_write_thread(vntsd_cons_t *consp)
(void) mutex_lock(&consp->lock);
(void) vntsd_que_find(consp->clientpq,
- (compare_func_t)write_all_clients, buf);
+ (compare_func_t)write_one_client, &write_buf);
(void) mutex_unlock(&consp->lock);
write_chk_status(consp, VNTSD_SUCCESS);
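The write.c rework replaces the file-scope static s_sz, which smuggled the buffer size into the per-client callback, with a write_buf_t that travels through the callback argument. A sketch of the pattern, assuming a que_apply() stand-in for vntsd_que_find() and simplified client and callback types:

    #include <stdio.h>
    #include <string.h>

    /* context passed to the per-client callback, as write_buf_t in the patch */
    typedef struct write_buf {
        unsigned int sz;    /* data size */
        char *buf;          /* data to write */
    } write_buf_t;

    struct client {
        int fd;
    };

    /* applied to each connected client; the context replaces the static s_sz */
    static int
    write_one_client(struct client *cl, void *arg)
    {
        write_buf_t *wb = arg;

        /* stand-in for vntsd_write_client(clientp, wb->buf, wb->sz) */
        printf("fd %d: writing %u bytes: %.*s\n",
            cl->fd, wb->sz, (int)wb->sz, wb->buf);
        return (0);
    }

    /* minimal stand-in for vntsd_que_find(): apply func to every element */
    static void
    que_apply(struct client *clients, int n,
        int (*func)(struct client *, void *), void *arg)
    {
        for (int i = 0; i < n; i++)
            (void) func(&clients[i], arg);
    }

    int
    main(void)
    {
        struct client clients[2] = { { 3 }, { 4 } };
        char data[] = "console output";
        write_buf_t wb = { (unsigned int)strlen(data), data };

        que_apply(clients, 2, write_one_client, &wb);
        return (0);
    }

Passing the size alongside the buffer removes the shared mutable state that made the old callback racy when multiple consoles wrote concurrently.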
diff --git a/usr/src/uts/sun4v/Makefile.files b/usr/src/uts/sun4v/Makefile.files
index 5150da4b60..7ed94b375c 100644
--- a/usr/src/uts/sun4v/Makefile.files
+++ b/usr/src/uts/sun4v/Makefile.files
@@ -134,7 +134,7 @@ VNEX_OBJS = vnex.o
CNEX_OBJS = cnex.o
GLVC_OBJS = glvc.o glvc_hcall.o
MDESC_OBJS = mdesc.o
-LDC_OBJS = ldc.o
+LDC_OBJS = ldc.o vio_util.o
VLDC_OBJS = vldc.o
VCC_OBJS = vcc.o
VNET_OBJS = vnet.o vnet_gen.o
diff --git a/usr/src/uts/sun4v/io/cnex.c b/usr/src/uts/sun4v/io/cnex.c
index 08a70cc810..293c20e131 100644
--- a/usr/src/uts/sun4v/io/cnex.c
+++ b/usr/src/uts/sun4v/io/cnex.c
@@ -765,9 +765,11 @@ cnex_rem_intr(dev_info_t *dip, uint64_t id, cnex_intrtype_t itype)
if (rv) {
DWARN("cnex_rem_intr: ino=0x%llx, cannot get state\n",
iinfo->ino);
+ mutex_exit(&cldcp->lock);
+ return (ENXIO);
}
- if (rv || ((gethrtime() - start) > cnex_pending_tmout))
+ if ((gethrtime() - start) > cnex_pending_tmout)
break;
} while (!panicstr && istate == HV_INTR_DELIVERED_STATE);
@@ -776,9 +778,8 @@ cnex_rem_intr(dev_info_t *dip, uint64_t id, cnex_intrtype_t itype)
if (istate != HV_INTR_IDLE_STATE) {
DWARN("cnex_rem_intr: cannot remove intr busy ino=%x\n",
iinfo->ino);
- /* clear interrupt state */
- (void) hvldc_intr_setstate(cnex_ssp->cfghdl, iinfo->ino,
- HV_INTR_IDLE_STATE);
+ mutex_exit(&cldcp->lock);
+ return (EAGAIN);
}
/* remove interrupt */
@@ -850,6 +851,8 @@ cnex_clr_intr(dev_info_t *dip, uint64_t id, cnex_intrtype_t itype)
HV_INTR_IDLE_STATE);
if (rv) {
DWARN("cnex_intr_wrapper: cannot clear interrupt state\n");
+ mutex_exit(&cldcp->lock);
+ return (ENXIO);
}
mutex_exit(&cldcp->lock);
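The cnex changes make the interrupt teardown paths fail loudly: instead of pressing on (or force-clearing a busy interrupt), cnex_rem_intr() and cnex_clr_intr() now drop the channel lock and return ENXIO or EAGAIN so the caller can retry. A user-land pthreads sketch of the shape of that error path, with the HV interrupt-state details elided:

    #include <pthread.h>
    #include <errno.h>

    struct channel {
        pthread_mutex_t lock;
        int intr_state;    /* 0 = idle, 1 = pending, -1 = unreadable */
    };

    /* release the channel lock on every early error return, as the patch does */
    static int
    rem_intr(struct channel *ch)
    {
        (void) pthread_mutex_lock(&ch->lock);

        if (ch->intr_state < 0) {    /* cannot read the state: hard failure */
            (void) pthread_mutex_unlock(&ch->lock);
            return (ENXIO);
        }
        if (ch->intr_state != 0) {   /* interrupt still pending: retry later */
            (void) pthread_mutex_unlock(&ch->lock);
            return (EAGAIN);
        }
        /* ... actually remove the interrupt here ... */
        (void) pthread_mutex_unlock(&ch->lock);
        return (0);
    }

    int
    main(void)
    {
        struct channel ch = { PTHREAD_MUTEX_INITIALIZER, 0 };

        return (rem_intr(&ch));
    }

The EAGAIN return is what lets the new retry loop in ldc_close() (below in ldc.c) wait out a pending interrupt rather than tearing the channel down underneath it.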
diff --git a/usr/src/uts/sun4v/io/fault_iso.c b/usr/src/uts/sun4v/io/fault_iso.c
index 0123c19291..d7b884e37f 100644
--- a/usr/src/uts/sun4v/io/fault_iso.c
+++ b/usr/src/uts/sun4v/io/fault_iso.c
@@ -212,7 +212,7 @@ cpu_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen)
}
break;
case FMA_CPU_REQ_OFFLINE:
- rv = p_online_internal(msg->cpu_id, P_OFFLINE,
+ rv = p_online_internal(msg->cpu_id, P_FAULTED,
&cpu_status);
if (rv == EINVAL) {
FI_DBG(CE_CONT, "Failed p_online call failed."
diff --git a/usr/src/uts/sun4v/io/ldc.c b/usr/src/uts/sun4v/io/ldc.c
index 3e526a623c..4b2bd1a092 100644
--- a/usr/src/uts/sun4v/io/ldc.c
+++ b/usr/src/uts/sun4v/io/ldc.c
@@ -421,6 +421,8 @@ i_ldc_txq_reconf(ldc_chan_t *ldcp)
int rv;
ASSERT(MUTEX_HELD(&ldcp->lock));
+ ASSERT(MUTEX_HELD(&ldcp->tx_lock));
+
rv = hv_ldc_tx_qconf(ldcp->id, ldcp->tx_q_ra, ldcp->tx_q_entries);
if (rv) {
cmn_err(CE_WARN,
@@ -513,6 +515,9 @@ i_ldc_reset(ldc_chan_t *ldcp)
{
D2(ldcp->id, "i_ldc_reset: (0x%llx) channel reset\n", ldcp->id);
+ ASSERT(MUTEX_HELD(&ldcp->lock));
+ ASSERT(MUTEX_HELD(&ldcp->tx_lock));
+
(void) i_ldc_txq_reconf(ldcp);
(void) i_ldc_rxq_reconf(ldcp);
i_ldc_reset_state(ldcp);
@@ -558,7 +563,9 @@ i_ldc_set_rx_head(ldc_chan_t *ldcp, uint64_t head)
cmn_err(CE_WARN, "ldc_rx_set_qhead: (0x%lx) cannot set qhead 0x%lx",
ldcp->id, head);
+ mutex_enter(&ldcp->tx_lock);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
return (ECONNRESET);
}
@@ -575,7 +582,7 @@ i_ldc_get_tx_tail(ldc_chan_t *ldcp, uint64_t *tail)
int rv;
uint64_t current_head, new_tail;
- ASSERT(MUTEX_HELD(&ldcp->lock));
+ ASSERT(MUTEX_HELD(&ldcp->tx_lock));
/* Read the head and tail ptrs from HV */
rv = hv_ldc_tx_get_state(ldcp->id,
&ldcp->tx_head, &ldcp->tx_tail, &ldcp->link_state);
@@ -626,7 +633,7 @@ i_ldc_set_tx_tail(ldc_chan_t *ldcp, uint64_t tail)
int rv, retval = EWOULDBLOCK;
int retries;
- ASSERT(MUTEX_HELD(&ldcp->lock));
+ ASSERT(MUTEX_HELD(&ldcp->tx_lock));
for (retries = 0; retries < ldc_max_retries; retries++) {
if ((rv = hv_ldc_tx_set_qtail(ldcp->id, tail)) == 0) {
@@ -658,7 +665,9 @@ i_ldc_send_pkt(ldc_chan_t *ldcp, uint8_t pkttype, uint8_t subtype,
uint64_t tx_tail;
uint32_t curr_seqid = ldcp->last_msg_snt;
- ASSERT(MUTEX_HELD(&ldcp->lock));
+ /* Obtain Tx lock */
+ mutex_enter(&ldcp->tx_lock);
+
/* get the current tail for the message */
rv = i_ldc_get_tx_tail(ldcp, &tx_tail);
if (rv) {
@@ -666,6 +675,7 @@ i_ldc_send_pkt(ldc_chan_t *ldcp, uint8_t pkttype, uint8_t subtype,
"i_ldc_send_pkt: (0x%llx) error sending pkt, "
"type=0x%x,subtype=0x%x,ctrl=0x%x\n",
ldcp->id, pkttype, subtype, ctrlmsg);
+ mutex_exit(&ldcp->tx_lock);
return (rv);
}
@@ -698,12 +708,14 @@ i_ldc_send_pkt(ldc_chan_t *ldcp, uint8_t pkttype, uint8_t subtype,
"i_ldc_send_pkt:(0x%llx) error sending pkt, "
"type=0x%x,stype=0x%x,ctrl=0x%x\n",
ldcp->id, pkttype, subtype, ctrlmsg);
+ mutex_exit(&ldcp->tx_lock);
return (EIO);
}
ldcp->last_msg_snt = curr_seqid;
ldcp->tx_tail = tx_tail;
+ mutex_exit(&ldcp->tx_lock);
return (0);
}
@@ -755,6 +767,9 @@ i_ldc_process_VER(ldc_chan_t *ldcp, ldc_msg_t *msg)
D2(ldcp->id, "i_ldc_process_VER: (0x%llx) received VER v%u.%u\n",
ldcp->id, rcvd_ver->major, rcvd_ver->minor);
+ /* Obtain Tx lock */
+ mutex_enter(&ldcp->tx_lock);
+
switch (msg->stype) {
case LDC_INFO:
@@ -765,6 +780,7 @@ i_ldc_process_VER(ldc_chan_t *ldcp, ldc_msg_t *msg)
"i_ldc_process_VER: (0x%llx) err sending "
"version ACK/NACK\n", ldcp->id);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
return (ECONNRESET);
}
@@ -850,6 +866,7 @@ i_ldc_process_VER(ldc_chan_t *ldcp, ldc_msg_t *msg)
"i_ldc_process_VER: (0x%llx) error sending "
"ACK/NACK\n", ldcp->id);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
return (ECONNRESET);
}
@@ -871,6 +888,7 @@ i_ldc_process_VER(ldc_chan_t *ldcp, ldc_msg_t *msg)
"i_ldc_process_VER: (0x%llx) cannot send RTS\n",
ldcp->id);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
return (ECONNRESET);
}
@@ -898,6 +916,7 @@ i_ldc_process_VER(ldc_chan_t *ldcp, ldc_msg_t *msg)
"i_ldc_process_VER: (0x%llx) no listener\n",
ldcp->id);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
return (ECONNRESET);
}
@@ -914,6 +933,7 @@ i_ldc_process_VER(ldc_chan_t *ldcp, ldc_msg_t *msg)
"i_ldc_process_VER: (0x%llx) no version match\n",
ldcp->id);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
return (ECONNRESET);
}
@@ -924,6 +944,7 @@ i_ldc_process_VER(ldc_chan_t *ldcp, ldc_msg_t *msg)
"i_ldc_process_VER: (0x%lx) err sending "
"version ACK/NACK\n", ldcp->id);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
return (ECONNRESET);
}
@@ -973,6 +994,7 @@ i_ldc_process_VER(ldc_chan_t *ldcp, ldc_msg_t *msg)
if (idx == LDC_NUM_VERS) {
/* no version match - terminate */
ldcp->next_vidx = 0;
+ mutex_exit(&ldcp->tx_lock);
return (ECONNRESET);
}
}
@@ -992,12 +1014,14 @@ i_ldc_process_VER(ldc_chan_t *ldcp, ldc_msg_t *msg)
"i_ldc_process_VER: (0x%lx) error sending version"
"INFO\n", ldcp->id);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
return (ECONNRESET);
}
break;
}
+ mutex_exit(&ldcp->tx_lock);
return (rv);
}
@@ -1022,7 +1046,9 @@ i_ldc_process_RTS(ldc_chan_t *ldcp, ldc_msg_t *msg)
ldcp->id);
/* Reset the channel -- as we cannot continue */
+ mutex_enter(&ldcp->tx_lock);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
rv = ECONNRESET;
break;
@@ -1040,7 +1066,9 @@ i_ldc_process_RTS(ldc_chan_t *ldcp, ldc_msg_t *msg)
rv = i_ldc_send_pkt(ldcp, LDC_CTRL, LDC_NACK, LDC_RTS);
if (rv) {
/* if cannot send NACK - reset channel */
+ mutex_enter(&ldcp->tx_lock);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
rv = ECONNRESET;
break;
}
@@ -1050,7 +1078,9 @@ i_ldc_process_RTS(ldc_chan_t *ldcp, ldc_msg_t *msg)
default:
DWARN(ldcp->id, "i_ldc_process_RTS: (0x%llx) unexp ACK\n",
ldcp->id);
+ mutex_enter(&ldcp->tx_lock);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
rv = ECONNRESET;
break;
}
@@ -1070,6 +1100,9 @@ i_ldc_process_RTS(ldc_chan_t *ldcp, ldc_msg_t *msg)
/* store initial SEQID info */
ldcp->last_msg_snt = msg->seqid;
+ /* Obtain Tx lock */
+ mutex_enter(&ldcp->tx_lock);
+
/* get the current tail for the response */
rv = i_ldc_get_tx_tail(ldcp, &tx_tail);
if (rv != 0) {
@@ -1077,6 +1110,7 @@ i_ldc_process_RTS(ldc_chan_t *ldcp, ldc_msg_t *msg)
"i_ldc_process_RTS: (0x%lx) err sending RTR\n",
ldcp->id);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
return (ECONNRESET);
}
@@ -1111,9 +1145,11 @@ i_ldc_process_RTS(ldc_chan_t *ldcp, ldc_msg_t *msg)
"i_ldc_process_RTS: (0x%lx) error sending RTR\n",
ldcp->id);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
return (ECONNRESET);
}
+ mutex_exit(&ldcp->tx_lock);
return (0);
}
@@ -1136,7 +1172,9 @@ i_ldc_process_RTR(ldc_chan_t *ldcp, ldc_msg_t *msg)
ldcp->id);
/* Reset the channel -- as we cannot continue */
+ mutex_enter(&ldcp->tx_lock);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
rv = ECONNRESET;
break;
@@ -1155,7 +1193,9 @@ i_ldc_process_RTR(ldc_chan_t *ldcp, ldc_msg_t *msg)
rv = i_ldc_send_pkt(ldcp, LDC_CTRL, LDC_NACK, LDC_RTR);
if (rv) {
/* if cannot send NACK - reset channel */
+ mutex_enter(&ldcp->tx_lock);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
rv = ECONNRESET;
break;
}
@@ -1168,7 +1208,9 @@ i_ldc_process_RTR(ldc_chan_t *ldcp, ldc_msg_t *msg)
ldcp->id);
/* Reset the channel -- as we cannot continue */
+ mutex_enter(&ldcp->tx_lock);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
rv = ECONNRESET;
break;
}
@@ -1190,7 +1232,9 @@ i_ldc_process_RTR(ldc_chan_t *ldcp, ldc_msg_t *msg)
cmn_err(CE_NOTE,
"i_ldc_process_RTR: (0x%lx) cannot send RDX\n",
ldcp->id);
+ mutex_enter(&ldcp->tx_lock);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
return (ECONNRESET);
}
D2(ldcp->id,
@@ -1224,7 +1268,9 @@ i_ldc_process_RDX(ldc_chan_t *ldcp, ldc_msg_t *msg)
ldcp->id);
/* Reset the channel -- as we cannot continue */
+ mutex_enter(&ldcp->tx_lock);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
rv = ECONNRESET;
break;
@@ -1239,7 +1285,9 @@ i_ldc_process_RDX(ldc_chan_t *ldcp, ldc_msg_t *msg)
DWARN(DBG_ALL_LDCS,
"i_ldc_process_RDX: (0x%llx) unexpected RDX"
" - LDC reset\n", ldcp->id);
+ mutex_enter(&ldcp->tx_lock);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
return (ECONNRESET);
}
@@ -1255,7 +1303,9 @@ i_ldc_process_RDX(ldc_chan_t *ldcp, ldc_msg_t *msg)
ldcp->id);
/* Reset the channel -- as we cannot continue */
+ mutex_enter(&ldcp->tx_lock);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
rv = ECONNRESET;
break;
}
@@ -1273,8 +1323,11 @@ i_ldc_process_data_ACK(ldc_chan_t *ldcp, ldc_msg_t *msg)
uint64_t tx_head;
ldc_msg_t *pkt;
+ /* Obtain Tx lock */
+ mutex_enter(&ldcp->tx_lock);
+
/*
- * Read the curret Tx head and tail
+ * Read the current Tx head and tail
*/
rv = hv_ldc_tx_get_state(ldcp->id,
&ldcp->tx_head, &ldcp->tx_tail, &ldcp->link_state);
@@ -1282,7 +1335,11 @@ i_ldc_process_data_ACK(ldc_chan_t *ldcp, ldc_msg_t *msg)
cmn_err(CE_WARN,
"i_ldc_process_data_ACK: (0x%lx) cannot read qptrs\n",
ldcp->id);
- return (0);
+
+ /* Reset the channel -- as we cannot continue */
+ i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
+ return (ECONNRESET);
}
/*
@@ -1310,10 +1367,15 @@ i_ldc_process_data_ACK(ldc_chan_t *ldcp, ldc_msg_t *msg)
DWARN(ldcp->id,
"i_ldc_process_data_ACK: (0x%llx) invalid ACKid\n",
ldcp->id);
- break;
+
+ /* Reset the channel -- as we cannot continue */
+ i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
+ return (ECONNRESET);
}
}
+ mutex_exit(&ldcp->tx_lock);
return (0);
}
@@ -1353,8 +1415,10 @@ i_ldc_ctrlmsg(ldc_chan_t *ldcp, ldc_msg_t *msg)
switch (msg->ctrl & LDC_CTRL_MASK) {
case LDC_VER:
/* peer is redoing version negotiation */
+ mutex_enter(&ldcp->tx_lock);
(void) i_ldc_txq_reconf(ldcp);
i_ldc_reset_state(ldcp);
+ mutex_exit(&ldcp->tx_lock);
rv = EAGAIN;
break;
case LDC_RTS:
@@ -1387,8 +1451,10 @@ i_ldc_ctrlmsg(ldc_chan_t *ldcp, ldc_msg_t *msg)
"i_ldc_ctrlmsg: (0x%llx) unexpected VER "
"- LDC reset\n", ldcp->id);
/* peer is redoing version negotiation */
+ mutex_enter(&ldcp->tx_lock);
(void) i_ldc_txq_reconf(ldcp);
i_ldc_reset_state(ldcp);
+ mutex_exit(&ldcp->tx_lock);
rv = EAGAIN;
break;
@@ -1472,20 +1538,28 @@ i_ldc_unregister_channel(ldc_chan_t *ldcp)
if (ldcp->tstate & TS_CNEX_RDY) {
+ /* Remove the Rx interrupt */
rv = cinfo->rem_intr(cinfo->dip, ldcp->id, CNEX_RX_INTR);
if (rv) {
DWARN(ldcp->id,
"i_ldc_unregister_channel: err removing Rx intr\n");
+ return (rv);
}
+
+ /* Remove the Tx interrupt */
rv = cinfo->rem_intr(cinfo->dip, ldcp->id, CNEX_TX_INTR);
if (rv) {
DWARN(ldcp->id,
"i_ldc_unregister_channel: err removing Tx intr\n");
+ return (rv);
}
+
+ /* Unregister the channel */
rv = cinfo->unreg_chan(ldcssp->cinfo.dip, ldcp->id);
if (rv) {
DWARN(ldcp->id,
"i_ldc_unregister_channel: cannot unreg channel\n");
+ return (rv);
}
ldcp->tstate &= ~TS_CNEX_RDY;
@@ -1520,12 +1594,16 @@ i_ldc_tx_hdlr(caddr_t arg1, caddr_t arg2)
/* Lock channel */
mutex_enter(&ldcp->lock);
+ /* Obtain Tx lock */
+ mutex_enter(&ldcp->tx_lock);
+
rv = hv_ldc_tx_get_state(ldcp->id, &ldcp->tx_head, &ldcp->tx_tail,
&ldcp->link_state);
if (rv) {
cmn_err(CE_WARN,
"i_ldc_tx_hdlr: (0x%lx) cannot read queue ptrs rv=0x%d\n",
ldcp->id, rv);
+ mutex_exit(&ldcp->tx_lock);
mutex_exit(&ldcp->lock);
return (DDI_INTR_CLAIMED);
}
@@ -1565,6 +1643,7 @@ i_ldc_tx_hdlr(caddr_t arg1, caddr_t arg2)
ldcp->cb_inprogress = B_TRUE;
/* Unlock channel */
+ mutex_exit(&ldcp->tx_lock);
mutex_exit(&ldcp->lock);
if (notify_client) {
@@ -1603,6 +1682,7 @@ i_ldc_rx_hdlr(caddr_t arg1, caddr_t arg2)
ldc_chan_t *ldcp;
boolean_t notify_client = B_FALSE;
uint64_t notify_event = 0;
+ uint64_t first_fragment = 0;
/* Get the channel for which interrupt was received */
if (arg1 == NULL) {
@@ -1645,7 +1725,9 @@ i_ldc_rx_hdlr(caddr_t arg1, caddr_t arg2)
if (ldcp->link_state == LDC_CHANNEL_DOWN) {
D1(ldcp->id, "i_ldc_rx_hdlr: channel link down\n",
ldcp->id);
+ mutex_enter(&ldcp->tx_lock);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
notify_client = B_TRUE;
notify_event = LDC_EVT_DOWN;
break;
@@ -1653,7 +1735,9 @@ i_ldc_rx_hdlr(caddr_t arg1, caddr_t arg2)
if (ldcp->link_state == LDC_CHANNEL_RESET) {
D1(ldcp->id, "i_ldc_rx_hdlr: channel link reset\n",
ldcp->id);
+ mutex_enter(&ldcp->tx_lock);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
notify_client = B_TRUE;
notify_event = LDC_EVT_RESET;
}
@@ -1715,11 +1799,11 @@ i_ldc_rx_hdlr(caddr_t arg1, caddr_t arg2)
"q_ptrs=0x%lx,0x%lx", ldcp->id, rx_head, rx_tail);
/* Reset last_msg_rcd to start of message */
- if (ldcp->first_fragment != 0) {
- ldcp->last_msg_rcd =
- ldcp->first_fragment - 1;
- ldcp->first_fragment = 0;
+ if (first_fragment != 0) {
+ ldcp->last_msg_rcd = first_fragment - 1;
+ first_fragment = 0;
}
+
/*
* Send a NACK due to seqid mismatch
*/
@@ -1730,6 +1814,13 @@ i_ldc_rx_hdlr(caddr_t arg1, caddr_t arg2)
cmn_err(CE_NOTE,
"i_ldc_rx_hdlr: (0x%lx) err sending "
"CTRL/NACK msg\n", ldcp->id);
+
+ /* if cannot send NACK - reset channel */
+ mutex_enter(&ldcp->tx_lock);
+ i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
+ rv = ECONNRESET;
+ break;
}
/* purge receive queue */
@@ -1769,7 +1860,11 @@ i_ldc_rx_hdlr(caddr_t arg1, caddr_t arg2)
/* process data ACKs */
if ((msg->type & LDC_DATA) && (msg->stype & LDC_ACK)) {
- (void) i_ldc_process_data_ACK(ldcp, msg);
+ if (rv = i_ldc_process_data_ACK(ldcp, msg)) {
+ notify_client = B_TRUE;
+ notify_event = LDC_EVT_RESET;
+ break;
+ }
}
/* move the head one position */
@@ -1878,11 +1973,24 @@ ldc_init(uint64_t id, ldc_attr_t *attr, ldc_handle_t *handle)
/* Allocate an ldcp structure */
ldcp = kmem_zalloc(sizeof (ldc_chan_t), KM_SLEEP);
- /* Initialize the channel lock */
+ /*
+ * Initialize the channel and Tx lock
+ *
+ * The channel 'lock' protects the entire channel and
+ * should be acquired before initializing, resetting,
+ * destroying or reading from a channel.
+ *
+ * The 'tx_lock' should be acquired prior to transmitting
+ * data over the channel. The lock should also be acquired
+ * prior to channel reconfiguration (in order to prevent
+ * concurrent writes).
+ *
+ * ORDERING: When both locks are being acquired, to prevent
+ * deadlocks, the channel lock should always be acquired prior
+ * to the tx_lock.
+ */
mutex_init(&ldcp->lock, NULL, MUTEX_DRIVER, NULL);
-
- /* Channel specific processing */
- mutex_enter(&ldcp->lock);
+ mutex_init(&ldcp->tx_lock, NULL, MUTEX_DRIVER, NULL);
/* Initialize the channel */
ldcp->id = id;
@@ -1996,8 +2104,6 @@ ldc_init(uint64_t id, ldc_attr_t *attr, ldc_handle_t *handle)
/* mark status as INITialized */
ldcp->status = LDC_INIT;
- mutex_exit(&ldcp->lock);
-
/* Add to channel list */
mutex_enter(&ldcssp->lock);
ldcp->next = ldcssp->chan_list;
@@ -2025,7 +2131,7 @@ cleanup_on_exit:
contig_mem_free((caddr_t)ldcp->rx_q_va,
(ldcp->rx_q_entries << LDC_PACKET_SHIFT));
- mutex_exit(&ldcp->lock);
+ mutex_destroy(&ldcp->tx_lock);
mutex_destroy(&ldcp->lock);
if (ldcp)
@@ -2121,6 +2227,7 @@ ldc_fini(ldc_handle_t handle)
mutex_exit(&ldcp->lock);
/* Destroy mutex */
+ mutex_destroy(&ldcp->tx_lock);
mutex_destroy(&ldcp->lock);
/* free channel structure */
@@ -2289,7 +2396,7 @@ int
ldc_close(ldc_handle_t handle)
{
ldc_chan_t *ldcp;
- int rv = 0;
+ int rv = 0, retries = 0;
boolean_t chk_done = B_FALSE;
if (handle == NULL) {
@@ -2331,6 +2438,9 @@ ldc_close(ldc_handle_t handle)
return (EBUSY);
}
+ /* Obtain Tx lock */
+ mutex_enter(&ldcp->tx_lock);
+
/*
* Wait for pending transmits to complete i.e Tx queue to drain
* if there are pending pkts - wait 1 ms and retry again
@@ -2342,6 +2452,7 @@ ldc_close(ldc_handle_t handle)
if (rv) {
cmn_err(CE_WARN,
"ldc_close: (0x%lx) cannot read qptrs\n", ldcp->id);
+ mutex_exit(&ldcp->tx_lock);
mutex_exit(&ldcp->lock);
return (EIO);
}
@@ -2366,13 +2477,27 @@ ldc_close(ldc_handle_t handle)
/*
* Unregister the channel with the nexus
*/
- rv = i_ldc_unregister_channel(ldcp);
- if (rv && rv != EAGAIN) {
- cmn_err(CE_WARN,
- "ldc_close: (0x%lx) channel unregister failed\n",
- ldcp->id);
+ while ((rv = i_ldc_unregister_channel(ldcp)) != 0) {
+
+ mutex_exit(&ldcp->tx_lock);
mutex_exit(&ldcp->lock);
- return (rv);
+
+ /* if any error other than EAGAIN return back */
+ if (rv != EAGAIN || retries >= LDC_MAX_RETRIES) {
+ cmn_err(CE_WARN,
+ "ldc_close: (0x%lx) unregister failed, %d\n",
+ ldcp->id, rv);
+ return (rv);
+ }
+
+ /*
+ * As there could be pending interrupts we need
+ * to wait and try again
+ */
+ drv_usecwait(LDC_DELAY);
+ mutex_enter(&ldcp->lock);
+ mutex_enter(&ldcp->tx_lock);
+ retries++;
}
/*
@@ -2383,6 +2508,7 @@ ldc_close(ldc_handle_t handle)
cmn_err(CE_WARN,
"ldc_close: (0x%lx) channel TX queue unconf failed\n",
ldcp->id);
+ mutex_exit(&ldcp->tx_lock);
mutex_exit(&ldcp->lock);
return (EIO);
}
@@ -2391,6 +2517,7 @@ ldc_close(ldc_handle_t handle)
cmn_err(CE_WARN,
"ldc_close: (0x%lx) channel RX queue unconf failed\n",
ldcp->id);
+ mutex_exit(&ldcp->tx_lock);
mutex_exit(&ldcp->lock);
return (EIO);
}
@@ -2406,6 +2533,7 @@ ldc_close(ldc_handle_t handle)
ldcp->tstate = TS_INIT;
ldcp->status = LDC_INIT;
+ mutex_exit(&ldcp->tx_lock);
mutex_exit(&ldcp->lock);
/* Decrement number of open channels */
@@ -2557,11 +2685,14 @@ ldc_up(ldc_handle_t handle)
return (0);
}
+ mutex_enter(&ldcp->tx_lock);
+
/* get the current tail for the LDC msg */
rv = i_ldc_get_tx_tail(ldcp, &tx_tail);
if (rv) {
DWARN(ldcp->id, "ldc_up: (0x%llx) cannot initiate handshake\n",
ldcp->id);
+ mutex_exit(&ldcp->tx_lock);
mutex_exit(&ldcp->lock);
return (ECONNREFUSED);
}
@@ -2586,6 +2717,7 @@ ldc_up(ldc_handle_t handle)
DWARN(ldcp->id,
"ldc_up: (0x%llx) cannot initiate handshake rv=%d\n",
ldcp->id, rv);
+ mutex_exit(&ldcp->tx_lock);
mutex_exit(&ldcp->lock);
return (rv);
}
@@ -2594,6 +2726,7 @@ ldc_up(ldc_handle_t handle)
ldcp->tx_tail = tx_tail;
D1(ldcp->id, "ldc_up: (0x%llx) channel up initiated\n", ldcp->id);
+ mutex_exit(&ldcp->tx_lock);
mutex_exit(&ldcp->lock);
return (rv);
@@ -2615,7 +2748,9 @@ ldc_reset(ldc_handle_t handle)
ldcp = (ldc_chan_t *)handle;
mutex_enter(&ldcp->lock);
+ mutex_enter(&ldcp->tx_lock);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
mutex_exit(&ldcp->lock);
return (0);
@@ -2736,7 +2871,9 @@ ldc_chkq(ldc_handle_t handle, boolean_t *isempty)
/* reset the channel state if the channel went down */
if (ldcp->link_state == LDC_CHANNEL_DOWN ||
ldcp->link_state == LDC_CHANNEL_RESET) {
+ mutex_enter(&ldcp->tx_lock);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
mutex_exit(&ldcp->lock);
return (ECONNRESET);
}
@@ -2839,7 +2976,9 @@ i_ldc_read_raw(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep)
/* reset the channel state if the channel went down */
if (ldcp->link_state == LDC_CHANNEL_DOWN) {
+ mutex_enter(&ldcp->tx_lock);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
return (ECONNRESET);
}
@@ -2886,14 +3025,12 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep)
size_t len = 0, bytes_read = 0;
int retries = 0;
uint64_t q_size_mask;
+ uint64_t first_fragment = 0;
target = target_bufp;
ASSERT(mutex_owned(&ldcp->lock));
- /* reset first frag to 0 */
- ldcp->first_fragment = 0;
-
/* compute mask for increment */
q_size_mask = (ldcp->rx_q_entries-1)<<LDC_PACKET_SHIFT;
@@ -2913,7 +3050,9 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep)
/* reset the channel state if the channel went down */
if (ldcp->link_state == LDC_CHANNEL_DOWN) {
+ mutex_enter(&ldcp->tx_lock);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
return (ECONNRESET);
}
@@ -2930,7 +3069,9 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep)
}
/* reset the channel state if the channel went down */
if (ldcp->link_state == LDC_CHANNEL_DOWN) {
+ mutex_enter(&ldcp->tx_lock);
i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
return (ECONNRESET);
}
}
@@ -2938,7 +3079,7 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep)
if (curr_head == rx_tail) {
/* If in the middle of a fragmented xfer */
- if (ldcp->first_fragment != 0) {
+ if (first_fragment != 0) {
/* wait for ldc_delay usecs */
drv_usecwait(ldc_delay);
@@ -2947,7 +3088,7 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep)
continue;
*sizep = 0;
- ldcp->last_msg_rcd = ldcp->first_fragment - 1;
+ ldcp->last_msg_rcd = first_fragment - 1;
DWARN(DBG_ALL_LDCS,
"ldc_read: (0x%llx) read timeout",
ldcp->id);
@@ -2978,10 +3119,9 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep)
bytes_read = 0;
/* Reset last_msg_rcd to start of message */
- if (ldcp->first_fragment != 0) {
- ldcp->last_msg_rcd =
- ldcp->first_fragment - 1;
- ldcp->first_fragment = 0;
+ if (first_fragment != 0) {
+ ldcp->last_msg_rcd = first_fragment - 1;
+ first_fragment = 0;
}
/*
* Send a NACK -- invalid seqid
@@ -2993,6 +3133,13 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep)
cmn_err(CE_NOTE,
"ldc_read: (0x%lx) err sending "
"NACK msg\n", ldcp->id);
+
+ /* if cannot send NACK - reset channel */
+ mutex_enter(&ldcp->tx_lock);
+ i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
+ rv = ECONNRESET;
+ break;
}
/* purge receive queue */
@@ -3021,7 +3168,11 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep)
/* process data ACKs */
if ((msg->type & LDC_DATA) && (msg->stype & LDC_ACK)) {
- (void) i_ldc_process_data_ACK(ldcp, msg);
+ if (rv = i_ldc_process_data_ACK(ldcp, msg)) {
+ *sizep = 0;
+ bytes_read = 0;
+ break;
+ }
}
/* process data messages */
@@ -3047,7 +3198,7 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep)
* currently expensive.
*/
- if (ldcp->first_fragment == 0) {
+ if (first_fragment == 0) {
/*
* first packets should always have the start
@@ -3074,7 +3225,7 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep)
continue;
}
- ldcp->first_fragment = msg->seqid;
+ first_fragment = msg->seqid;
} else {
/* check to see if this is a pkt w/ START bit */
if (msg->env & LDC_FRAG_START) {
@@ -3089,7 +3240,7 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep)
/* throw data we have read so far */
bytes_read = 0;
target = target_bufp;
- ldcp->first_fragment = msg->seqid;
+ first_fragment = msg->seqid;
if (rv = i_ldc_set_rx_head(ldcp,
curr_head))
@@ -3113,7 +3264,7 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep)
"head=0x%lx, expect=%d, got=%d\n", ldcp->id,
curr_head, *sizep, bytes_read+len);
- ldcp->first_fragment = 0;
+ first_fragment = 0;
target = target_bufp;
bytes_read = 0;
@@ -3173,10 +3324,15 @@ i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep)
ldcp->mode == LDC_MODE_STREAM)) {
rv = i_ldc_send_pkt(ldcp, LDC_DATA, LDC_ACK, 0);
- if (rv != 0) {
+ if (rv) {
cmn_err(CE_NOTE,
"ldc_read: (0x%lx) cannot send ACK\n", ldcp->id);
- return (0);
+
+ /* if cannot send ACK - reset channel */
+ mutex_enter(&ldcp->tx_lock);
+ i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->tx_lock);
+ rv = ECONNRESET;
}
}
@@ -3250,20 +3406,28 @@ ldc_write(ldc_handle_t handle, caddr_t buf, size_t *sizep)
}
ldcp = (ldc_chan_t *)handle;
- mutex_enter(&ldcp->lock);
+ /* check if writes can occur */
+ if (!mutex_tryenter(&ldcp->tx_lock)) {
+ /*
+ * Could not get the lock - channel could
+ * be in the process of being unconfigured
+ * or reader has encountered an error
+ */
+ return (EAGAIN);
+ }
/* check if non-zero data to write */
if (buf == NULL || sizep == NULL) {
DWARN(ldcp->id, "ldc_write: (0x%llx) invalid data write\n",
ldcp->id);
- mutex_exit(&ldcp->lock);
+ mutex_exit(&ldcp->tx_lock);
return (EINVAL);
}
if (*sizep == 0) {
DWARN(ldcp->id, "ldc_write: (0x%llx) write size of zero\n",
ldcp->id);
- mutex_exit(&ldcp->lock);
+ mutex_exit(&ldcp->tx_lock);
return (0);
}
@@ -3278,7 +3442,7 @@ ldc_write(ldc_handle_t handle, caddr_t buf, size_t *sizep)
rv = ldcp->write_p(ldcp, buf, sizep);
}
- mutex_exit(&ldcp->lock);
+ mutex_exit(&ldcp->tx_lock);
return (rv);
}
@@ -3295,7 +3459,7 @@ i_ldc_write_raw(ldc_chan_t *ldcp, caddr_t buf, size_t *sizep)
int rv = 0;
size_t size;
- ASSERT(mutex_owned(&ldcp->lock));
+ ASSERT(MUTEX_HELD(&ldcp->tx_lock));
ASSERT(ldcp->mode == LDC_MODE_RAW);
size = *sizep;
@@ -3326,8 +3490,22 @@ i_ldc_write_raw(ldc_chan_t *ldcp, caddr_t buf, size_t *sizep)
ldcp->link_state == LDC_CHANNEL_RESET) {
DWARN(ldcp->id,
"ldc_write: (0x%llx) channel down/reset\n", ldcp->id);
- i_ldc_reset(ldcp);
+
*sizep = 0;
+ if (mutex_tryenter(&ldcp->lock)) {
+ i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->lock);
+ } else {
+ /*
+ * Release Tx lock, and then reacquire channel
+ * and Tx lock in correct order
+ */
+ mutex_exit(&ldcp->tx_lock);
+ mutex_enter(&ldcp->lock);
+ mutex_enter(&ldcp->tx_lock);
+ i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->lock);
+ }
return (ECONNRESET);
}
@@ -3349,10 +3527,10 @@ i_ldc_write_raw(ldc_chan_t *ldcp, caddr_t buf, size_t *sizep)
/* Send the data now */
ldcmsg = (ldc_msg_t *)(ldcp->tx_q_va + tx_tail);
- /* copy the data into pkt */
+ /* copy the data into pkt */
bcopy((uint8_t *)buf, ldcmsg, size);
- /* increment tail */
+ /* increment tail */
tx_tail = new_tail;
/*
@@ -3368,9 +3546,21 @@ i_ldc_write_raw(ldc_chan_t *ldcp, caddr_t buf, size_t *sizep)
return (EWOULDBLOCK);
}
- /* cannot write data - reset channel */
- i_ldc_reset(ldcp);
*sizep = 0;
+ if (mutex_tryenter(&ldcp->lock)) {
+ i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->lock);
+ } else {
+ /*
+ * Release Tx lock, and then reacquire channel
+ * and Tx lock in correct order
+ */
+ mutex_exit(&ldcp->tx_lock);
+ mutex_enter(&ldcp->lock);
+ mutex_enter(&ldcp->tx_lock);
+ i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->lock);
+ }
return (ECONNRESET);
}
@@ -3403,7 +3593,7 @@ i_ldc_write_packet(ldc_chan_t *ldcp, caddr_t buf, size_t *size)
int rv;
uint32_t curr_seqid;
- ASSERT(mutex_owned(&ldcp->lock));
+ ASSERT(MUTEX_HELD(&ldcp->tx_lock));
ASSERT(ldcp->mode == LDC_MODE_RELIABLE ||
ldcp->mode == LDC_MODE_UNRELIABLE ||
@@ -3427,7 +3617,20 @@ i_ldc_write_packet(ldc_chan_t *ldcp, caddr_t buf, size_t *size)
DWARN(ldcp->id,
"ldc_write: (0x%llx) channel down/reset\n", ldcp->id);
*size = 0;
- i_ldc_reset(ldcp);
+ if (mutex_tryenter(&ldcp->lock)) {
+ i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->lock);
+ } else {
+ /*
+ * Release Tx lock, and then reacquire channel
+ * and Tx lock in correct order
+ */
+ mutex_exit(&ldcp->tx_lock);
+ mutex_enter(&ldcp->lock);
+ mutex_enter(&ldcp->tx_lock);
+ i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->lock);
+ }
return (ECONNRESET);
}
@@ -3522,9 +3725,21 @@ i_ldc_write_packet(ldc_chan_t *ldcp, caddr_t buf, size_t *size)
int rv2;
if (rv != EWOULDBLOCK) {
- /* cannot write data - reset channel */
- i_ldc_reset(ldcp);
*size = 0;
+ if (mutex_tryenter(&ldcp->lock)) {
+ i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->lock);
+ } else {
+ /*
+ * Release Tx lock, and then reacquire channel
+ * and Tx lock in correct order
+ */
+ mutex_exit(&ldcp->tx_lock);
+ mutex_enter(&ldcp->lock);
+ mutex_enter(&ldcp->tx_lock);
+ i_ldc_reset(ldcp);
+ mutex_exit(&ldcp->lock);
+ }
return (ECONNRESET);
}
@@ -3560,7 +3775,7 @@ i_ldc_write_packet(ldc_chan_t *ldcp, caddr_t buf, size_t *size)
static int
i_ldc_write_stream(ldc_chan_t *ldcp, caddr_t buf, size_t *sizep)
{
- ASSERT(mutex_owned(&ldcp->lock));
+ ASSERT(MUTEX_HELD(&ldcp->tx_lock));
ASSERT(ldcp->mode == LDC_MODE_STREAM);
/* Truncate packet to max of MTU size */
@@ -4692,7 +4907,7 @@ ldc_mem_map(ldc_mem_handle_t mhandle, ldc_mem_cookie_t *cookie, uint32_t ccount,
}
D1(ldcp->id, "ldc_mem_map: (0x%llx) cookie = 0x%llx,0x%llx\n",
- mhandle, cookie->addr, cookie->size);
+ ldcp->id, cookie->addr, cookie->size);
/* FUTURE: get the page size, pgsz code, and shift */
pg_size = MMU_PAGESIZE;
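The bulk of the ldc.c change (6437436) splits the single channel mutex into 'lock' and 'tx_lock', with the ordering rule spelled out in ldc_init(): when both are needed, take the channel lock first. ldc_write() now takes only tx_lock (via mutex_tryenter, returning EAGAIN if the channel is being reconfigured), so when a write path discovers the link down and must reset, it either trylocks the channel lock or drops tx_lock and reacquires both in order, exactly as the i_ldc_write_raw and i_ldc_write_packet hunks above do. A user-land pthreads sketch of that re-lock dance; reset_channel() stands in for i_ldc_reset():

    #include <pthread.h>

    struct channel {
        pthread_mutex_t lock;       /* whole channel: init/reset/read */
        pthread_mutex_t tx_lock;    /* transmit path and TX queue reconf */
    };

    /* caller must hold both locks, channel lock first */
    static void
    reset_channel(struct channel *ch)
    {
        /* ... reconfigure TX/RX queues and reset channel state ... */
        (void) ch;
    }

    /* called from the write path, which holds only tx_lock */
    static void
    reset_from_writer(struct channel *ch)
    {
        if (pthread_mutex_trylock(&ch->lock) == 0) {
            /* got the channel lock without blocking: safe to reset */
            reset_channel(ch);
            (void) pthread_mutex_unlock(&ch->lock);
        } else {
            /*
             * Someone else holds the channel lock.  Blocking on it
             * while holding tx_lock would invert the documented
             * order, so drop tx_lock and reacquire both locks in
             * channel-then-tx order before resetting.
             */
            (void) pthread_mutex_unlock(&ch->tx_lock);
            (void) pthread_mutex_lock(&ch->lock);
            (void) pthread_mutex_lock(&ch->tx_lock);
            reset_channel(ch);
            (void) pthread_mutex_unlock(&ch->lock);
            /* tx_lock is still held, as the write path expects */
        }
    }

    int
    main(void)
    {
        struct channel ch = {
            PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
        };

        (void) pthread_mutex_lock(&ch.tx_lock);    /* as ldc_write would */
        reset_from_writer(&ch);
        (void) pthread_mutex_unlock(&ch.tx_lock);
        return (0);
    }

The same reasoning explains why i_ldc_send_pkt and the control-message handlers now bracket their work with mutex_enter/mutex_exit on tx_lock: reads and writes no longer serialize on one lock.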
diff --git a/usr/src/uts/sun4v/io/vdc.c b/usr/src/uts/sun4v/io/vdc.c
index 6502c8394a..a04c57e32d 100644
--- a/usr/src/uts/sun4v/io/vdc.c
+++ b/usr/src/uts/sun4v/io/vdc.c
@@ -69,6 +69,7 @@
#include <sys/mdeg.h>
#include <sys/note.h>
#include <sys/open.h>
+#include <sys/sdt.h>
#include <sys/stat.h>
#include <sys/sunddi.h>
#include <sys/types.h>
@@ -152,7 +153,6 @@ static int vdc_populate_descriptor(vdc_t *vdc, caddr_t addr,
static int vdc_wait_for_descriptor_update(vdc_t *vdc, uint_t idx,
vio_dring_msg_t dmsg);
static int vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx);
-static int vdc_get_response(vdc_t *vdc, int start, int end);
static int vdc_populate_mem_hdl(vdc_t *vdc, uint_t idx,
caddr_t addr, size_t nbytes, int operation);
static boolean_t vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg, int
@@ -162,19 +162,26 @@ static boolean_t vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg, int
static int vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode);
static int vdc_create_fake_geometry(vdc_t *vdc);
static int vdc_setup_disk_layout(vdc_t *vdc);
-static int vdc_null_copy_func(void *from, void *to, int mode, int dir);
-static int vdc_get_vtoc_convert(void *from, void *to, int mode, int dir);
-static int vdc_set_vtoc_convert(void *from, void *to, int mode, int dir);
-static int vdc_get_geom_convert(void *from, void *to, int mode, int dir);
-static int vdc_set_geom_convert(void *from, void *to, int mode, int dir);
-static int vdc_uscsicmd_convert(void *from, void *to, int mode, int dir);
+static int vdc_null_copy_func(vdc_t *vdc, void *from, void *to,
+ int mode, int dir);
+static int vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to,
+ int mode, int dir);
+static int vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to,
+ int mode, int dir);
+static int vdc_get_geom_convert(vdc_t *vdc, void *from, void *to,
+ int mode, int dir);
+static int vdc_set_geom_convert(vdc_t *vdc, void *from, void *to,
+ int mode, int dir);
+static int vdc_uscsicmd_convert(vdc_t *vdc, void *from, void *to,
+ int mode, int dir);
/*
* Module variables
*/
uint64_t vdc_hz_timeout;
uint64_t vdc_usec_timeout = VDC_USEC_TIMEOUT_MIN;
-uint64_t vdc_dump_usec_timeout = VDC_USEC_TIMEOUT_MIN / 300;
+uint64_t vdc_usec_timeout_dump = VDC_USEC_TIMEOUT_MIN / 300;
+uint64_t vdc_usec_timeout_dring = 10 * MILLISEC;
static int vdc_retries = VDC_RETRIES;
static int vdc_dump_retries = VDC_RETRIES * 10;
@@ -932,18 +939,38 @@ vdc_print(dev_t dev, char *str)
static int
vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
- int rv = 0;
- size_t nbytes = (nblk * DEV_BSIZE);
- int instance = SDUNIT(getminor(dev));
- vdc_t *vdc;
+ buf_t *buf; /* BWRITE requests need to be in a buf_t structure */
+ int rv;
+ size_t nbytes = nblk * DEV_BSIZE;
+ int instance = SDUNIT(getminor(dev));
+ vdc_t *vdc = NULL;
if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
vdc_msg("%s (%d): Could not get state.", __func__, instance);
return (ENXIO);
}
- rv = vdc_populate_descriptor(vdc, addr, nbytes, VD_OP_BWRITE,
- blkno, SDPART(getminor(dev)));
+ buf = kmem_alloc(sizeof (buf_t), KM_SLEEP);
+ bioinit(buf);
+ buf->b_un.b_addr = addr;
+ buf->b_bcount = nbytes;
+ buf->b_flags = B_BUSY | B_WRITE;
+ buf->b_dev = dev;
+ rv = vdc_populate_descriptor(vdc, (caddr_t)buf, nbytes,
+ VD_OP_BWRITE, blkno, SDPART(getminor(dev)));
+
+ /*
+ * If the OS instance is panicking, the call above will ensure that
+ * the descriptor is done before returning. This should always be
+ * case when coming through this function but we check just in case
+ * and wait if necessary for the vDisk server to ACK and trigger
+ * the biodone.
+ */
+ if (!ddi_in_panic())
+ rv = biowait(buf);
+
+ biofini(buf);
+ kmem_free(buf, sizeof (buf_t));
PR1("%s: status=%d\n", __func__, rv);
@@ -983,22 +1010,32 @@ vdc_strategy(struct buf *buf)
return (0);
}
+ DTRACE_IO2(vstart, buf_t *, buf, vdc_t *, vdc);
+
ASSERT(buf->b_bcount <= (vdc->max_xfer_sz * vdc->block_size));
if (!vdc_is_able_to_tx_data(vdc, O_NONBLOCK)) {
- vdc_msg("%s: Not ready to transmit data", __func__);
+ PR0("%s: Not ready to transmit data\n", __func__);
bioerror(buf, ENXIO);
biodone(buf);
return (0);
}
bp_mapin(buf);
- rv = vdc_populate_descriptor(vdc, buf->b_un.b_addr, buf->b_bcount, op,
+ rv = vdc_populate_descriptor(vdc, (caddr_t)buf, buf->b_bcount, op,
buf->b_lblkno, SDPART(getminor(buf->b_edev)));
- PR1("%s: status=%d", __func__, rv);
- bioerror(buf, rv);
- biodone(buf);
+ /*
+ * If the request was successfully sent, the strategy call returns and
+ * the ACK handler calls the bioxxx functions when the vDisk server is
+ * done.
+ */
+ if (rv) {
+ PR0("[%d] Failed to read/write (err=%d)\n", instance, rv);
+ bioerror(buf, rv);
+ biodone(buf);
+ }
+
return (0);
}
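With 6431458, vdc_strategy() no longer blocks on the vDisk server: it queues the descriptor and returns, and the ACK handler completes the buf. The server ACKs twice, once when it accepts a descriptor and once when it finishes, so the handler only calls biodone() and frees the slot when the descriptor reaches the DONE state. A stand-alone sketch of that completion logic; the buf and desc types and biodone_sketch() are illustrative stand-ins for buf_t, the DRing entry, and biodone(9F):

    #include <stdio.h>

    /* illustrative stand-ins for buf_t and a DRing entry (not the real types) */
    struct buf {
        int b_error;
        int b_done;
    };

    enum dstate { DESC_FREE, DESC_READY, DESC_ACCEPTED, DESC_DONE };

    struct desc {
        enum dstate state;
        int         status;    /* payload.status from the server */
        struct buf *buf;
    };

    static void
    biodone_sketch(struct buf *bp, int error)
    {
        bp->b_error = error;
        bp->b_done = 1;
    }

    /*
     * ACK-handler sketch: the server responds when it accepts a descriptor
     * and again when it finishes it; only the DONE state completes the buf
     * and frees the slot, mirroring vdc_process_data_msg().
     */
    static void
    process_ack(struct desc *d)
    {
        if (d->state != DESC_DONE)
            return;                /* accepted but not finished yet */
        biodone_sketch(d->buf, d->status);
        d->state = DESC_FREE;      /* slot may now be reused */
    }

    int
    main(void)
    {
        struct buf b = { 0, 0 };
        struct desc d = { DESC_ACCEPTED, 0, &b };

        process_ack(&d);           /* first ACK: request accepted */
        d.state = DESC_DONE;
        process_ack(&d);           /* second ACK: complete the I/O */
        printf("done=%d error=%d\n", b.b_done, b.b_error);
        return (0);
    }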
@@ -1900,6 +1937,8 @@ vdc_destroy_descriptor_ring(vdc_t *vdc)
*
* Description:
* This function gets the index of the next Descriptor Ring entry available
+ * If the ring is full, it will back off and wait for the next entry to be
+ * freed (the ACK handler will signal).
*
* Return Value:
* 0 <= rv < VD_DRING_LEN Next available slot
@@ -1910,9 +1949,9 @@ vdc_get_next_dring_entry_idx(vdc_t *vdc, uint_t num_slots_needed)
{
_NOTE(ARGUNUSED(num_slots_needed))
- vd_dring_entry_t *dep = NULL; /* Dring Entry Pointer */
+ vd_dring_entry_t *dep = NULL; /* DRing Entry Pointer */
+ vdc_local_desc_t *ldep = NULL; /* Local DRing Entry Pointer */
int idx = -1;
- int start_idx = 0;
ASSERT(vdc != NULL);
ASSERT(vdc->dring_len == VD_DRING_LEN);
@@ -1920,67 +1959,31 @@ vdc_get_next_dring_entry_idx(vdc_t *vdc, uint_t num_slots_needed)
ASSERT(vdc->dring_curr_idx < VD_DRING_LEN);
ASSERT(mutex_owned(&vdc->dring_lock));
- /* Start at the last entry used */
- idx = start_idx = vdc->dring_curr_idx;
-
- /*
- * Loop through Descriptor Ring checking for a free entry until we reach
- * the entry we started at. We should never come close to filling the
- * Ring at any stage, instead this is just to prevent an entry which
- * gets into an inconsistent state (e.g. due to a request timing out)
- * from blocking progress.
- */
- do {
- /* Get the next entry after the last known index tried */
- idx = (idx + 1) % VD_DRING_LEN;
-
- dep = VDC_GET_DRING_ENTRY_PTR(vdc, idx);
- ASSERT(dep != NULL);
+ /* pick the next descriptor after the last one used */
+ idx = (vdc->dring_curr_idx + 1) % VD_DRING_LEN;
+ ldep = &vdc->local_dring[idx];
+ ASSERT(ldep != NULL);
+ dep = ldep->dep;
+ ASSERT(dep != NULL);
+ mutex_enter(&ldep->lock);
+ if (dep->hdr.dstate == VIO_DESC_FREE) {
+ vdc->dring_curr_idx = idx;
+ } else {
+ DTRACE_PROBE(full);
+ (void) cv_timedwait(&ldep->cv, &ldep->lock,
+ VD_GET_TIMEOUT_HZ(1));
if (dep->hdr.dstate == VIO_DESC_FREE) {
- ASSERT(idx >= 0);
- ASSERT(idx < VD_DRING_LEN);
vdc->dring_curr_idx = idx;
- return (idx);
-
- } else if (dep->hdr.dstate == VIO_DESC_READY) {
- PR0("%s: Entry %d waiting to be accepted\n",
- __func__, idx);
- continue;
-
- } else if (dep->hdr.dstate == VIO_DESC_ACCEPTED) {
- PR0("%s: Entry %d waiting to be processed\n",
- __func__, idx);
- continue;
-
- } else if (dep->hdr.dstate == VIO_DESC_DONE) {
- PR0("%s: Entry %d done but not marked free\n",
- __func__, idx);
-
- /*
- * If we are currently panicking, interrupts are
- * disabled and we will not be getting ACKs from the
- * vDisk server so we mark the descriptor ring entries
- * as FREE here instead of in the ACK handler.
- */
- if (panicstr) {
- (void) vdc_depopulate_descriptor(vdc, idx);
- dep->hdr.dstate = VIO_DESC_FREE;
- vdc->local_dring[idx].flags = VIO_DESC_FREE;
- }
- continue;
-
} else {
- vdc_msg("Public Descriptor Ring entry corrupted");
- mutex_enter(&vdc->lock);
- vdc_reset_connection(vdc, B_FALSE);
- mutex_exit(&vdc->lock);
- return (-1);
+ PR0("[%d] Entry %d unavailable still in state %d\n",
+ vdc->instance, idx, dep->hdr.dstate);
+ idx = -1; /* indicate that the ring is full */
}
+ }
+ mutex_exit(&ldep->lock);
- } while (idx != start_idx);
-
- return (-1);
+ return (idx);
}
/*
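vdc_get_next_dring_entry_idx() no longer scans the whole ring: it picks the slot after the last one used and, if that slot is not yet FREE, does a single timed wait on the per-descriptor CV for the ACK handler to release it, reporting the ring full (-1) on timeout. A pthreads sketch of that backoff, with a one-second timeout standing in for VD_GET_TIMEOUT_HZ(1):

    #include <pthread.h>
    #include <time.h>

    #define RING_LEN    32
    #define DESC_FREE   0

    struct desc {
        pthread_mutex_t lock;
        pthread_cond_t  cv;     /* signalled by the ACK handler on free */
        int             state;
    };

    /* return the next free slot after curr, or -1 if the ring stays full */
    static int
    next_free_desc(struct desc *ring, int curr)
    {
        int idx = (curr + 1) % RING_LEN;
        struct desc *d = &ring[idx];
        struct timespec ts;

        (void) pthread_mutex_lock(&d->lock);
        if (d->state != DESC_FREE) {
            /* ring full: wait up to 1s for completion to free the slot */
            (void) clock_gettime(CLOCK_REALTIME, &ts);
            ts.tv_sec += 1;
            (void) pthread_cond_timedwait(&d->cv, &d->lock, &ts);
            if (d->state != DESC_FREE)
                idx = -1;          /* still busy: report ring full */
        }
        (void) pthread_mutex_unlock(&d->lock);
        return (idx);
    }

    int
    main(void)
    {
        static struct desc ring[RING_LEN];

        for (int i = 0; i < RING_LEN; i++) {
            (void) pthread_mutex_init(&ring[i].lock, NULL);
            (void) pthread_cond_init(&ring[i].cv, NULL);
            ring[i].state = DESC_FREE;
        }
        return (next_free_desc(ring, 0) == 1 ? 0 : 1);
    }

Because descriptors are freed in submission order by the completion path, waiting on exactly one slot is enough; the old full-ring scan existed only to skip entries stuck in inconsistent states.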
@@ -1994,7 +1997,11 @@ vdc_get_next_dring_entry_idx(vdc_t *vdc, uint_t num_slots_needed)
*
* Arguments:
* vdc - the soft state pointer
- * addr - start address of memory region.
+ * addr - address of structure to be written. In the case of block
+ * reads and writes this structure will be a buf_t and the
+ * address of the data to be written will be in the b_un.b_addr
+ * field. Otherwise the value of addr will be the address
+ * to be written.
* nbytes - number of bytes to read/write
* operation - operation we want vds to perform (VD_OP_XXX)
* arg - parameter to be sent to server (depends on VD_OP_XXX type)
@@ -2031,8 +2038,8 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation,
idx = vdc_get_next_dring_entry_idx(vdc, 1);
if (idx == -1) {
mutex_exit(&vdc->dring_lock);
- vdc_msg("%s[%d]: no descriptor ring entry avail, seq=%d\n",
- __func__, vdc->instance, vdc->seq_num);
+ PR0("[%d] no descriptor ring entry avail, last seq=%d\n",
+ vdc->instance, vdc->seq_num - 1);
/*
* Since strategy should not block we don't wait for the DRing
@@ -2047,17 +2054,23 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation,
ASSERT(dep != NULL);
/*
- * Wait for anybody still using the DRing entry to finish.
- * (e.g. still waiting for vds to respond to a request)
+ * We now get the lock for this descriptor before dropping the overall
+ * DRing lock. This prevents a race condition where another vdc thread
+ * could grab the descriptor we selected.
*/
+ ASSERT(!MUTEX_HELD(&local_dep->lock));
mutex_enter(&local_dep->lock);
+ mutex_exit(&vdc->dring_lock);
switch (operation) {
case VD_OP_BREAD:
case VD_OP_BWRITE:
+ local_dep->buf = (struct buf *)addr;
+ local_dep->addr = local_dep->buf->b_un.b_addr;
PR1("buf=%p, block=%lx, nbytes=%lx\n", addr, arg, nbytes);
dep->payload.addr = (diskaddr_t)arg;
- rv = vdc_populate_mem_hdl(vdc, idx, addr, nbytes, operation);
+ rv = vdc_populate_mem_hdl(vdc, idx, local_dep->addr,
+ nbytes, operation);
break;
case VD_OP_GET_VTOC:
@@ -2065,6 +2078,7 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation,
case VD_OP_GET_DISKGEOM:
case VD_OP_SET_DISKGEOM:
case VD_OP_SCSICMD:
+ local_dep->addr = addr;
if (nbytes > 0) {
rv = vdc_populate_mem_hdl(vdc, idx, addr, nbytes,
operation);
@@ -2085,7 +2099,6 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation,
if (rv != 0) {
mutex_exit(&local_dep->lock);
- mutex_exit(&vdc->dring_lock);
return (rv);
}
@@ -2101,30 +2114,34 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation,
dep->hdr.ack = 1; /* request an ACK for every message */
local_dep->flags = VIO_DESC_READY;
- local_dep->addr = addr;
/*
* Send a msg with the DRing details to vds
*/
+ mutex_enter(&vdc->lock);
VIO_INIT_DRING_DATA_TAG(dmsg);
VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdc);
dmsg.dring_ident = vdc->dring_ident;
dmsg.start_idx = idx;
dmsg.end_idx = idx;
+ DTRACE_IO2(send, vio_dring_msg_t *, &dmsg, vdc_t *, vdc);
+
PR1("ident=0x%llx, st=%d, end=%d, seq=%d req=%d dep=%p\n",
vdc->dring_ident, dmsg.start_idx, dmsg.end_idx,
dmsg.seq_num, dep->payload.req_id, dep);
- mutex_enter(&vdc->lock);
rv = vdc_send(vdc, (caddr_t)&dmsg, &msglen);
- mutex_exit(&vdc->lock);
PR1("%s[%d]: ldc_write() rv=%d\n", __func__, vdc->instance, rv);
if (rv != 0) {
+ mutex_exit(&vdc->lock);
mutex_exit(&local_dep->lock);
- mutex_exit(&vdc->dring_lock);
vdc_msg("%s: ldc_write(%d)\n", __func__, rv);
- return (EAGAIN);
+
+ /* Clear the DRing entry */
+ rv = vdc_depopulate_descriptor(vdc, idx);
+
+ return (rv ? rv : EAGAIN);
}
/*
@@ -2132,14 +2149,7 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation,
* number to be used by the next message
*/
vdc->seq_num++;
-
- /*
- * XXX - potential performance enhancement (Investigate at a later date)
- *
- * for calls from strategy(9E), instead of waiting for a response from
- * vds, we could return at this stage and let the ACK handling code
- * trigger the biodone(9F)
- */
+ mutex_exit(&vdc->lock);
/*
* When a guest is panicking, the completion of requests needs to be
@@ -2170,7 +2180,7 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation,
}
PR1("Waiting for next packet @ %d\n", idx);
- delay(drv_usectohz(vdc_dump_usec_timeout));
+ drv_usecwait(vdc_usec_timeout_dump);
continue;
}
@@ -2238,14 +2248,24 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation,
}
mutex_exit(&local_dep->lock);
- mutex_exit(&vdc->dring_lock);
return (rv);
}
/*
- * Now watch the DRing entries we modified to get the response
- * from vds.
+ * In the case of calls from strategy and dump (in the non-panic case),
+ * instead of waiting for a response from the vDisk server return now.
+ * They will be processed asynchronously and the vdc ACK handling code
+ * will trigger the biodone(9F)
+ */
+ if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) {
+ mutex_exit(&local_dep->lock);
+ return (rv);
+ }
+
+ /*
+ * In the case of synchronous calls we watch the DRing entries we
+ * modified and await the response from vds.
*/
rv = vdc_wait_for_descriptor_update(vdc, idx, dmsg);
if (rv == ETIMEDOUT) {
@@ -2257,7 +2277,6 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation,
PR1("%s[%d] Status=%d\n", __func__, vdc->instance, rv);
mutex_exit(&local_dep->lock);
- mutex_exit(&vdc->dring_lock);
return (rv);
}
@@ -2287,7 +2306,6 @@ vdc_wait_for_descriptor_update(vdc_t *vdc, uint_t idx, vio_dring_msg_t dmsg)
int rv = 0;
ASSERT(vdc != NULL);
- ASSERT(mutex_owned(&vdc->dring_lock));
ASSERT(idx < VD_DRING_LEN);
local_dep = &vdc->local_dring[idx];
ASSERT(local_dep != NULL);
@@ -2329,12 +2347,12 @@ vdc_wait_for_descriptor_update(vdc_t *vdc, uint_t idx, vio_dring_msg_t dmsg)
* and have never made it to the other side (vds).
* (We reuse the original message but update seq ID)
*/
+ mutex_enter(&vdc->lock);
VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdc);
retries = 0;
- mutex_enter(&vdc->lock);
status = vdc_send(vdc, (caddr_t)&dmsg, &msglen);
- mutex_exit(&vdc->lock);
if (status != 0) {
+ mutex_exit(&vdc->lock);
vdc_msg("%s: Error (%d) while resending after "
"timeout\n", __func__, status);
status = ETIMEDOUT;
@@ -2345,60 +2363,13 @@ vdc_wait_for_descriptor_update(vdc_t *vdc, uint_t idx, vio_dring_msg_t dmsg)
* the sequence number to be used by the next message.
*/
vdc->seq_num++;
+ mutex_exit(&vdc->lock);
}
}
return (status);
}
-static int
-vdc_get_response(vdc_t *vdc, int start, int end)
-{
- vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */
- vd_dring_entry_t *dep = NULL; /* Dring Entry Pointer */
- int status = ENXIO;
- int idx = -1;
-
- ASSERT(vdc != NULL);
- ASSERT(start >= 0);
- ASSERT(start <= VD_DRING_LEN);
- ASSERT(start >= -1);
- ASSERT(start <= VD_DRING_LEN);
-
- idx = start;
- ldep = &vdc->local_dring[idx];
- ASSERT(ldep != NULL);
- dep = ldep->dep;
- ASSERT(dep != NULL);
-
- PR0("%s[%d] DRING entry=%d status=%d\n", __func__, vdc->instance,
- idx, VIO_GET_DESC_STATE(dep->hdr.dstate));
- while (VIO_GET_DESC_STATE(dep->hdr.dstate) == VIO_DESC_DONE) {
- if ((end != -1) && (idx > end))
- return (0);
-
- switch (ldep->operation) {
- case VD_OP_BREAD:
- case VD_OP_BWRITE:
- /* call bioxxx */
- break;
- default:
- /* signal waiter */
- break;
- }
-
- /* Clear the DRing entry */
- status = vdc_depopulate_descriptor(vdc, idx);
- PR0("%s[%d] Status=%d\n", __func__, vdc->instance, status);
-
- /* loop accounting to get next DRing entry */
- idx++;
- ldep = &vdc->local_dring[idx];
- dep = ldep->dep;
- }
-
- return (status);
-}
/*
* Function:
@@ -2452,7 +2423,7 @@ vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx)
bcopy(ldep->align_addr, ldep->addr, dep->payload.nbytes);
kmem_free(ldep->align_addr,
- sizeof (caddr_t) * dep->payload.nbytes);
+ sizeof (caddr_t) * P2ROUNDUP(dep->payload.nbytes, 8));
ldep->align_addr = NULL;
}
@@ -2536,17 +2507,20 @@ vdc_populate_mem_hdl(vdc_t *vdc, uint_t idx, caddr_t addr, size_t nbytes,
*/
vaddr = addr;
if (((uint64_t)addr & 0x7) != 0) {
+ ASSERT(ldep->align_addr == NULL);
ldep->align_addr =
- kmem_zalloc(sizeof (caddr_t) * nbytes, KM_SLEEP);
+ kmem_zalloc(sizeof (caddr_t) * P2ROUNDUP(nbytes, 8),
+ KM_SLEEP);
PR0("%s[%d] Misaligned address %lx reallocating "
- "(buf=%lx entry=%d)\n",
- __func__, vdc->instance, addr, ldep->align_addr, idx);
+ "(buf=%lx nb=%d op=%d entry=%d)\n",
+ __func__, vdc->instance, addr, ldep->align_addr, nbytes,
+ operation, idx);
bcopy(addr, ldep->align_addr, nbytes);
vaddr = ldep->align_addr;
}
rv = ldc_mem_bind_handle(mhdl, vaddr, P2ROUNDUP(nbytes, 8),
- vdc->dring_mem_info.mtype, perm, &dep->payload.cookie[0],
+ LDC_SHADOW_MAP, perm, &dep->payload.cookie[0],
&dep->payload.ncookies);
PR1("%s[%d] bound mem handle; ncookies=%d\n",
__func__, vdc->instance, dep->payload.ncookies);
@@ -2556,7 +2530,7 @@ vdc_populate_mem_hdl(vdc_t *vdc, uint_t idx, caddr_t addr, size_t nbytes,
__func__, vdc->instance, mhdl, addr, idx, rv);
if (ldep->align_addr) {
kmem_free(ldep->align_addr,
- sizeof (caddr_t) * dep->payload.nbytes);
+ sizeof (caddr_t) * P2ROUNDUP(nbytes, 8));
ldep->align_addr = NULL;
}
return (EAGAIN);
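The two alignment hunks above fix a mismatched free: the bounce buffer used for misaligned requests is allocated with the byte count rounded up to a multiple of 8, so it must also be freed with that rounded size (the old code freed payload.nbytes). A user-land sketch of the bounce-buffer pairing; with kmem_zalloc/kmem_free the rounded size must be passed to both calls, a requirement that libc's free() hides:

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    #define ROUNDUP8(x)    (((x) + 7) & ~(size_t)7)

    /*
     * When the caller's buffer is not 8-byte aligned, copy through an
     * aligned allocation rounded up to a multiple of 8.
     */
    static void *
    bind_aligned(void *addr, size_t nbytes, void **alignp)
    {
        if (((uintptr_t)addr & 0x7) == 0) {
            *alignp = NULL;
            return (addr);                 /* already aligned */
        }
        if ((*alignp = calloc(1, ROUNDUP8(nbytes))) == NULL)
            return (NULL);
        memcpy(*alignp, addr, nbytes);
        return (*alignp);
    }

    static void
    unbind_aligned(void *align_addr, size_t nbytes)
    {
        /* kmem version: kmem_free(align_addr, ROUNDUP8(nbytes)) */
        (void) nbytes;
        free(align_addr);
    }

    int
    main(void)
    {
        char raw[16];
        void *align;

        (void) bind_aligned(raw + 1, 10, &align);    /* force misalignment */
        unbind_aligned(align, 10);
        return (0);
    }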
@@ -2986,7 +2960,7 @@ static int
vdc_process_data_msg(vdc_t *vdc, vio_msg_t msg)
{
int status = 0;
- vdc_local_desc_t *local_dep = NULL;
+ vdc_local_desc_t *ldep = NULL;
vio_dring_msg_t *dring_msg = NULL;
uint_t num_msgs;
uint_t start;
@@ -3010,6 +2984,8 @@ vdc_process_data_msg(vdc_t *vdc, vio_msg_t msg)
return (EPROTO);
}
+ DTRACE_IO2(recv, vio_dring_msg_t, dring_msg, vdc_t *, vdc);
+
/*
* calculate the number of messages that vds ACK'ed
*
@@ -3031,12 +3007,32 @@ vdc_process_data_msg(vdc_t *vdc, vio_msg_t msg)
* Wake the thread waiting for each DRing entry ACK'ed
*/
for (i = 0; i < num_msgs; i++) {
+ int operation;
int idx = (start + i) % VD_DRING_LEN;
- local_dep = &vdc->local_dring[idx];
- mutex_enter(&local_dep->lock);
- cv_signal(&local_dep->cv);
- mutex_exit(&local_dep->lock);
+ ldep = &vdc->local_dring[idx];
+ mutex_enter(&ldep->lock);
+ operation = ldep->dep->payload.operation;
+ if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) {
+ /*
+ * The vDisk server responds when it accepts a
+ * descriptor so we continue looping and process
+ * it when it sends the message that it is done.
+ */
+ if (ldep->dep->hdr.dstate != VIO_DESC_DONE) {
+ mutex_exit(&ldep->lock);
+ continue;
+ }
+ bioerror(ldep->buf, ldep->dep->payload.status);
+ biodone(ldep->buf);
+
+ DTRACE_IO2(vdone, buf_t *, ldep->buf, vdc_t *, vdc);
+
+ /* Clear the DRing entry */
+ status = vdc_depopulate_descriptor(vdc, idx);
+ }
+ cv_signal(&ldep->cv);
+ mutex_exit(&ldep->lock);
}
if (msg.tag.vio_subtype == VIO_SUBTYPE_NACK) {
@@ -3348,6 +3344,7 @@ vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg, int num_msgs)
{
ASSERT(vdc != NULL);
ASSERT(dring_msg != NULL);
+ ASSERT(mutex_owned(&vdc->lock));
/*
* Check to see if the messages were responded to in the correct
@@ -3357,7 +3354,7 @@ vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg, int num_msgs)
* if so something is seriously wrong so we reset the connection
* - a seq_num greater than what we expected is returned.
*/
- if (dring_msg->seq_num != (vdc->seq_num_reply + num_msgs)) {
+ if (dring_msg->seq_num < vdc->seq_num_reply) {
vdc_msg("%s[%d]: Bogus seq_num %d, expected %d\n",
__func__, vdc->instance, dring_msg->seq_num,
vdc->seq_num_reply + num_msgs);
@@ -3529,7 +3526,8 @@ typedef struct vdc_dk_ioctl {
size_t nbytes; /* size of structure to be copied */
/* function to convert between vDisk and Solaris structure formats */
- int (*convert)(void *vd_buf, void *ioctl_arg, int mode, int dir);
+ int (*convert)(vdc_t *vdc, void *vd_buf, void *ioctl_arg,
+ int mode, int dir);
} vdc_dk_ioctl_t;
/*
@@ -3546,15 +3544,13 @@ static vdc_dk_ioctl_t dk_ioctl[] = {
vdc_get_vtoc_convert},
{VD_OP_SET_VTOC, DKIOCSVTOC, sizeof (vd_vtoc_t),
vdc_set_vtoc_convert},
- {VD_OP_SET_DISKGEOM, DKIOCSGEOM, sizeof (vd_geom_t),
- vdc_get_geom_convert},
{VD_OP_GET_DISKGEOM, DKIOCGGEOM, sizeof (vd_geom_t),
vdc_get_geom_convert},
{VD_OP_GET_DISKGEOM, DKIOCG_PHYGEOM, sizeof (vd_geom_t),
vdc_get_geom_convert},
- {VD_OP_GET_DISKGEOM, DKIOCG_VIRTGEOM, sizeof (vd_geom_t),
+ {VD_OP_GET_DISKGEOM, DKIOCG_VIRTGEOM, sizeof (vd_geom_t),
vdc_get_geom_convert},
- {VD_OP_SET_DISKGEOM, DKIOCSGEOM, sizeof (vd_geom_t),
+ {VD_OP_SET_DISKGEOM, DKIOCSGEOM, sizeof (vd_geom_t),
vdc_set_geom_convert},
/*
@@ -3600,6 +3596,7 @@ vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode)
size_t alloc_len = 0; /* #bytes to allocate mem for */
caddr_t mem_p = NULL;
size_t nioctls = (sizeof (dk_ioctl)) / (sizeof (dk_ioctl[0]));
+ struct vtoc vtoc_saved;
PR0("%s: Processing ioctl(%x) for dev %x : model %x\n",
__func__, cmd, dev, ddi_model_convert_from(mode & FMODELS));
@@ -3740,13 +3737,21 @@ vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode)
ASSERT(alloc_len != 0); /* sanity check */
mem_p = kmem_zalloc(alloc_len, KM_SLEEP);
+ if (cmd == DKIOCSVTOC) {
+ /*
+ * Save a copy of the current VTOC so that we can roll back
+ * if the setting of the new VTOC fails.
+ */
+ bcopy(vdc->vtoc, &vtoc_saved, sizeof (struct vtoc));
+ }
+
/*
* Call the conversion function for this ioctl which, if necessary,
* converts from the Solaris format to the format ARC'ed
* as part of the vDisk protocol (FWARC 2006/195)
*/
ASSERT(dk_ioctl[idx].convert != NULL);
- rv = (dk_ioctl[idx].convert)(arg, mem_p, mode, VD_COPYIN);
+ rv = (dk_ioctl[idx].convert)(vdc, arg, mem_p, mode, VD_COPYIN);
if (rv != 0) {
PR0("%s[%d]: convert returned %d for ioctl 0x%x\n",
__func__, instance, rv, cmd);
@@ -3770,20 +3775,24 @@ vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode)
__func__, instance, rv, cmd);
if (mem_p != NULL)
kmem_free(mem_p, alloc_len);
+
+ if (cmd == DKIOCSVTOC) {
+ /* update of the VTOC has failed, roll back */
+ bcopy(&vtoc_saved, vdc->vtoc, sizeof (struct vtoc));
+ }
+
return (rv);
}
- /*
- * If the VTOC has been changed, then vdc needs to update the copy
- * it saved in the soft state structure and try and update the device
- * node properties. Failing to set the properties should not cause
- * an error to be return the caller though.
- */
if (cmd == DKIOCSVTOC) {
- bcopy(mem_p, vdc->vtoc, sizeof (struct vtoc));
+ /*
+ * The VTOC has been changed, try and update the device
+ * node properties. Failing to set the properties should
+	 * not cause an error to be returned to the caller though.
+ */
if (vdc_create_device_nodes_props(vdc)) {
cmn_err(CE_NOTE, "![%d] Failed to update device nodes"
- " properties", instance);
+ " properties", vdc->instance);
}
}
@@ -3793,7 +3802,7 @@ vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode)
* protocol (FWARC 2006/195) back to a format understood by
* the rest of Solaris.
*/
- rv = (dk_ioctl[idx].convert)(mem_p, arg, mode, VD_COPYOUT);
+ rv = (dk_ioctl[idx].convert)(vdc, mem_p, arg, mode, VD_COPYOUT);
if (rv != 0) {
PR0("%s[%d]: convert returned %d for ioctl 0x%x\n",
__func__, instance, rv, cmd);
@@ -3816,8 +3825,9 @@ vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode)
* do not need to convert the data being passed in/out to userland
*/
static int
-vdc_null_copy_func(void *from, void *to, int mode, int dir)
+vdc_null_copy_func(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
+ _NOTE(ARGUNUSED(vdc))
_NOTE(ARGUNUSED(from))
_NOTE(ARGUNUSED(to))
_NOTE(ARGUNUSED(mode))
@@ -3831,9 +3841,16 @@ vdc_null_copy_func(void *from, void *to, int mode, int dir)
* vdc_get_vtoc_convert()
*
* Description:
- * This routine fakes up the disk info needed for some DKIO ioctls.
+ *	This routine performs the necessary conversions from the DKIOCGVTOC
+ * Solaris structure to the format defined in FWARC 2006/195.
+ *
+ * In the struct vtoc definition, the timestamp field is marked as not
+ *	supported, so it is not part of the vDisk protocol (FWARC 2006/195).
+ *	However, SVM uses that field to check that it can write to the VTOC,
+ *	so we fake up the contents of that field.
*
* Arguments:
+ * vdc - the vDisk client
* from - the buffer containing the data to be copied from
* to - the buffer to be copied to
* mode - flags passed to ioctl() call
@@ -3842,11 +3859,12 @@ vdc_null_copy_func(void *from, void *to, int mode, int dir)
* Return Code:
* 0 - Success
* ENXIO - incorrect buffer passed in.
- * EFAULT - ddi_copyxxx routine encountered an error.
+ * EFAULT - ddi_copyout routine encountered an error.
*/
static int
-vdc_get_vtoc_convert(void *from, void *to, int mode, int dir)
+vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
+ int i;
void *tmp_mem = NULL;
void *tmp_memp;
struct vtoc vt;
@@ -3868,6 +3886,12 @@ vdc_get_vtoc_convert(void *from, void *to, int mode, int dir)
tmp_mem = kmem_alloc(copy_len, KM_SLEEP);
VD_VTOC2VTOC((vd_vtoc_t *)from, &vt);
+
+ /* fake the VTOC timestamp field */
+ for (i = 0; i < V_NUMPAR; i++) {
+ vt.timestamp[i] = vdc->vtoc->timestamp[i];
+ }
+
if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) {
vtoctovtoc32(vt, vt32);
tmp_memp = &vt32;
@@ -3887,8 +3911,11 @@ vdc_get_vtoc_convert(void *from, void *to, int mode, int dir)
* vdc_set_vtoc_convert()
*
* Description:
+ *	This routine performs the necessary conversions from the DKIOCSVTOC
+ * Solaris structure to the format defined in FWARC 2006/195.
*
* Arguments:
+ * vdc - the vDisk client
* from - Buffer with data
* to - Buffer where data is to be copied to
* mode - flags passed to ioctl
@@ -3900,7 +3927,7 @@ vdc_get_vtoc_convert(void *from, void *to, int mode, int dir)
* EFAULT - ddi_copyin of data failed
*/
static int
-vdc_set_vtoc_convert(void *from, void *to, int mode, int dir)
+vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
void *tmp_mem = NULL;
struct vtoc vt;
@@ -3934,6 +3961,12 @@ vdc_set_vtoc_convert(void *from, void *to, int mode, int dir)
vtp = tmp_mem;
}
+ /*
+	 * The VTOC is being changed, so vdc needs to update the copy
+ * it saved in the soft state structure.
+ */
+ bcopy(vtp, vdc->vtoc, sizeof (struct vtoc));
+
VTOC2VD_VTOC(vtp, &vtvd);
bcopy(&vtvd, to, sizeof (vd_vtoc_t));
kmem_free(tmp_mem, copy_len);
@@ -3946,8 +3979,12 @@ vdc_set_vtoc_convert(void *from, void *to, int mode, int dir)
* vdc_get_geom_convert()
*
* Description:
+ *	This routine performs the necessary conversions from the DKIOCGGEOM,
+ *	DKIOCG_PHYGEOM and DKIOCG_VIRTGEOM Solaris structures to the format
+ *	defined in FWARC 2006/195.
*
* Arguments:
+ * vdc - the vDisk client
* from - Buffer with data
* to - Buffer where data is to be copied to
* mode - flags passed to ioctl
@@ -3956,11 +3993,13 @@ vdc_set_vtoc_convert(void *from, void *to, int mode, int dir)
* Return Code:
* 0 - Success
* ENXIO - Invalid buffer passed in
- * EFAULT - ddi_copyin of data failed
+ * EFAULT - ddi_copyout of data failed
*/
static int
-vdc_get_geom_convert(void *from, void *to, int mode, int dir)
+vdc_get_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
+ _NOTE(ARGUNUSED(vdc))
+
struct dk_geom geom;
int copy_len = sizeof (struct dk_geom);
int rv = 0;
@@ -3984,10 +4023,11 @@ vdc_get_geom_convert(void *from, void *to, int mode, int dir)
* vdc_set_geom_convert()
*
* Description:
- * This routine performs the necessary convertions from the DKIOCSVTOC
- * Solaris structure to the format defined in FWARC 2006/195
+ *	This routine performs the necessary conversions from the DKIOCSGEOM
+ * Solaris structure to the format defined in FWARC 2006/195.
*
* Arguments:
+ * vdc - the vDisk client
* from - Buffer with data
* to - Buffer where data is to be copied to
* mode - flags passed to ioctl
@@ -3999,8 +4039,10 @@ vdc_get_geom_convert(void *from, void *to, int mode, int dir)
* EFAULT - ddi_copyin of data failed
*/
static int
-vdc_set_geom_convert(void *from, void *to, int mode, int dir)
+vdc_set_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir)
{
+ _NOTE(ARGUNUSED(vdc))
+
vd_geom_t vdgeom;
void *tmp_mem = NULL;
int copy_len = sizeof (struct dk_geom);
@@ -4104,6 +4146,7 @@ vdc_create_fake_geometry(vdc_t *vdc)
static int
vdc_setup_disk_layout(vdc_t *vdc)
{
+ buf_t *buf; /* BREAD requests need to be in a buf_t structure */
dev_t dev;
int slice = 0;
int rv;
@@ -4129,14 +4172,9 @@ vdc_setup_disk_layout(vdc_t *vdc)
}
/*
- * Read disk label from start of disk
- */
- vdc->label = kmem_zalloc(DK_LABEL_SIZE, KM_SLEEP);
-
- /*
* find the slice that represents the entire "disk" and use that to
* read the disk label. The convention in Solaris is that slice 2
- * represents the whole disk so we check that it is otherwise we
+ * represents the whole disk so we check that it is, otherwise we
* default to slice 0
*/
if ((vdc->vdisk_type == VD_DISK_TYPE_DISK) &&
@@ -4145,8 +4183,22 @@ vdc_setup_disk_layout(vdc_t *vdc)
} else {
slice = 0;
}
- rv = vdc_populate_descriptor(vdc, (caddr_t)vdc->label, DK_LABEL_SIZE,
+
+ /*
+ * Read disk label from start of disk
+ */
+ vdc->label = kmem_zalloc(DK_LABEL_SIZE, KM_SLEEP);
+ buf = kmem_alloc(sizeof (buf_t), KM_SLEEP);
+ bioinit(buf);
+ buf->b_un.b_addr = (caddr_t)vdc->label;
+ buf->b_bcount = DK_LABEL_SIZE;
+ buf->b_flags = B_BUSY | B_READ;
+ buf->b_dev = dev;
+ rv = vdc_populate_descriptor(vdc, (caddr_t)buf, DK_LABEL_SIZE,
VD_OP_BREAD, 0, slice);
+	if (rv == 0)
+		rv = biowait(buf);
+ biofini(buf);
+ kmem_free(buf, sizeof (buf_t));
return (rv);
}
diff --git a/usr/src/uts/sun4v/io/vds.c b/usr/src/uts/sun4v/io/vds.c
index adcea0c944..7a06a331d1 100644
--- a/usr/src/uts/sun4v/io/vds.c
+++ b/usr/src/uts/sun4v/io/vds.c
@@ -50,9 +50,8 @@
/* Virtual disk server initialization flags */
-#define VDS_LOCKING 0x01
-#define VDS_LDI 0x02
-#define VDS_MDEG 0x04
+#define VDS_LDI 0x01
+#define VDS_MDEG 0x02
/* Virtual disk server tunable parameters */
#define VDS_LDC_RETRIES 3
@@ -71,11 +70,10 @@
/* Virtual disk initialization flags */
#define VD_LOCKING 0x01
-#define VD_TASKQ 0x02
-#define VD_LDC 0x04
-#define VD_DRING 0x08
-#define VD_SID 0x10
-#define VD_SEQ_NUM 0x20
+#define VD_LDC 0x02
+#define VD_DRING 0x04
+#define VD_SID 0x08
+#define VD_SEQ_NUM 0x10
/* Flags for opening/closing backing devices via LDI */
#define VD_OPEN_FLAGS (FEXCL | FREAD | FWRITE)
@@ -135,20 +133,48 @@
#endif /* DEBUG */
+/*
+ * Soft state structure for a vds instance
+ */
typedef struct vds {
uint_t initialized; /* driver inst initialization flags */
dev_info_t *dip; /* driver inst devinfo pointer */
- kmutex_t lock; /* lock for this structure */
ldi_ident_t ldi_ident; /* driver's identifier for LDI */
mod_hash_t *vd_table; /* table of virtual disks served */
mdeg_handle_t mdeg; /* handle for MDEG operations */
} vds_t;
+/*
+ * Types of descriptor-processing tasks
+ */
+typedef enum vd_task_type {
+ VD_NONFINAL_RANGE_TASK, /* task for intermediate descriptor in range */
+ VD_FINAL_RANGE_TASK, /* task for last in a range of descriptors */
+} vd_task_type_t;
+
+/*
+ * Structure describing the task for processing a descriptor
+ */
+typedef struct vd_task {
+ struct vd *vd; /* vd instance task is for */
+ vd_task_type_t type; /* type of descriptor task */
+ int index; /* dring elem index for task */
+ vio_msg_t *msg; /* VIO message task is for */
+ size_t msglen; /* length of message content */
+ size_t msgsize; /* size of message buffer */
+ vd_dring_payload_t *request; /* request task will perform */
+ struct buf buf; /* buf(9s) for I/O request */
+
+} vd_task_t;
+
+/*
+ * Soft state structure for a virtual disk instance
+ */
typedef struct vd {
uint_t initialized; /* vdisk initialization flags */
- kmutex_t lock; /* lock for this structure */
vds_t *vds; /* server for this vdisk */
- ddi_taskq_t *taskq; /* taskq for this vdisk */
+ ddi_taskq_t *startq; /* queue for I/O start tasks */
+ ddi_taskq_t *completionq; /* queue for completion tasks */
ldi_handle_t ldi_handle[V_NUMPAR]; /* LDI slice handles */
dev_t dev[V_NUMPAR]; /* dev numbers for slices */
 	uint_t		nslices;	/* number of slices */
@@ -160,7 +186,6 @@ typedef struct vd {
ldc_status_t ldc_state; /* LDC connection state */
ldc_handle_t ldc_handle; /* handle for LDC comm */
size_t max_msglen; /* largest LDC message len */
- boolean_t enabled; /* whether vdisk is enabled */
vd_state_t state; /* client handshake state */
uint8_t xfer_mode; /* transfer mode with client */
uint32_t sid; /* client's session ID */
@@ -170,11 +195,19 @@ typedef struct vd {
uint32_t descriptor_size; /* num bytes in desc */
uint32_t dring_len; /* number of dring elements */
caddr_t dring; /* address of dring */
+ vd_task_t inband_task; /* task for inband descriptor */
+	vd_task_t	*dring_task;	/* tasks for dring elements */
+
+ kmutex_t lock; /* protects variables below */
+ boolean_t enabled; /* is vdisk enabled? */
+ boolean_t reset_state; /* reset connection state? */
+ boolean_t reset_ldc; /* reset LDC channel? */
} vd_t;
typedef struct vds_operation {
uint8_t operation;
- int (*function)(vd_t *vd, vd_dring_payload_t *request);
+ int (*start)(vd_task_t *task);
+ void (*complete)(void *arg);
} vds_operation_t;
typedef struct vd_ioctl {
@@ -217,86 +250,245 @@ static int vd_msglevel;
static int
-vd_bread(vd_t *vd, vd_dring_payload_t *request)
+vd_start_bio(vd_task_t *task)
{
- int status;
- struct buf buf;
+ int status = 0;
+ vd_t *vd = task->vd;
+ vd_dring_payload_t *request = task->request;
+ struct buf *buf = &task->buf;
+
+
+ ASSERT(vd != NULL);
+ ASSERT(request != NULL);
+ ASSERT(request->slice < vd->nslices);
+ ASSERT((request->operation == VD_OP_BREAD) ||
+ (request->operation == VD_OP_BWRITE));
- PR1("Read %lu bytes at block %lu", request->nbytes, request->addr);
if (request->nbytes == 0)
return (EINVAL); /* no service for trivial requests */
- ASSERT(mutex_owned(&vd->lock));
- ASSERT(request->slice < vd->nslices);
- bioinit(&buf);
- buf.b_flags = B_BUSY | B_READ;
- buf.b_bcount = request->nbytes;
- buf.b_un.b_addr = kmem_alloc(buf.b_bcount, KM_SLEEP);
- buf.b_lblkno = request->addr;
- buf.b_edev = vd->dev[request->slice];
+ PR1("%s %lu bytes at block %lu",
+ (request->operation == VD_OP_BREAD) ? "Read" : "Write",
+ request->nbytes, request->addr);
+
+ bioinit(buf);
+ buf->b_flags = B_BUSY;
+ buf->b_bcount = request->nbytes;
+ buf->b_un.b_addr = kmem_alloc(buf->b_bcount, KM_SLEEP);
+ buf->b_lblkno = request->addr;
+ buf->b_edev = vd->dev[request->slice];
+
+ if (request->operation == VD_OP_BREAD) {
+ buf->b_flags |= B_READ;
+ } else {
+ buf->b_flags |= B_WRITE;
+ /* Get data to write from client */
+ if ((status = ldc_mem_copy(vd->ldc_handle, buf->b_un.b_addr, 0,
+ &request->nbytes, request->cookie,
+ request->ncookies, LDC_COPY_IN)) != 0) {
+ PRN("ldc_mem_copy() returned errno %d "
+ "copying from client", status);
+ }
+ }
- if ((status = ldi_strategy(vd->ldi_handle[request->slice], &buf)) == 0)
- status = biowait(&buf);
- biofini(&buf);
+ /* Start the block I/O */
if ((status == 0) &&
- ((status = ldc_mem_copy(vd->ldc_handle, buf.b_un.b_addr, 0,
- &request->nbytes, request->cookie, request->ncookies,
- LDC_COPY_OUT)) != 0)) {
- PRN("ldc_mem_copy() returned errno %d copying to client",
- status);
- }
- kmem_free(buf.b_un.b_addr, buf.b_bcount); /* nbytes can change */
+ ((status = ldi_strategy(vd->ldi_handle[request->slice], buf)) == 0))
+ return (EINPROGRESS); /* will complete on completionq */
+
+ /* Clean up after error */
+ kmem_free(buf->b_un.b_addr, buf->b_bcount);
+ biofini(buf);
return (status);
}
static int
-vd_do_bwrite(vd_t *vd, uint_t slice, diskaddr_t block, size_t nbytes,
- ldc_mem_cookie_t *cookie, uint64_t ncookies, caddr_t data)
+send_msg(ldc_handle_t ldc_handle, void *msg, size_t msglen)
{
- int status;
- struct buf buf;
+ int retry, status;
+ size_t nbytes;
- ASSERT(mutex_owned(&vd->lock));
- ASSERT(slice < vd->nslices);
- ASSERT(nbytes != 0);
- ASSERT(data != NULL);
- /* Get data from client */
- if ((status = ldc_mem_copy(vd->ldc_handle, data, 0, &nbytes,
- cookie, ncookies, LDC_COPY_IN)) != 0) {
- PRN("ldc_mem_copy() returned errno %d copying from client",
- status);
+ for (retry = 0, status = EWOULDBLOCK;
+ retry < vds_ldc_retries && status == EWOULDBLOCK;
+ retry++) {
+ PR1("ldc_write() attempt %d", (retry + 1));
+ nbytes = msglen;
+ status = ldc_write(ldc_handle, msg, &nbytes);
+ }
+
+ if (status != 0) {
+ PRN("ldc_write() returned errno %d", status);
return (status);
+ } else if (nbytes != msglen) {
+ PRN("ldc_write() performed only partial write");
+ return (EIO);
}
- bioinit(&buf);
- buf.b_flags = B_BUSY | B_WRITE;
- buf.b_bcount = nbytes;
- buf.b_un.b_addr = data;
- buf.b_lblkno = block;
- buf.b_edev = vd->dev[slice];
+ PR1("SENT %lu bytes", msglen);
+ return (0);
+}
- if ((status = ldi_strategy(vd->ldi_handle[slice], &buf)) == 0)
- status = biowait(&buf);
- biofini(&buf);
- return (status);
+static void
+vd_need_reset(vd_t *vd, boolean_t reset_ldc)
+{
+ mutex_enter(&vd->lock);
+ vd->reset_state = B_TRUE;
+ vd->reset_ldc = reset_ldc;
+ mutex_exit(&vd->lock);
+}
+
+/*
+ * Reset the state of the connection with a client, if needed; reset the LDC
+ * transport as well, if needed. This function should only be called from the
+ * "startq", as it waits for tasks on the "completionq" and will deadlock if
+ * called from that queue.
+ */
+static void
+vd_reset_if_needed(vd_t *vd)
+{
+ int status = 0;
+
+
+ mutex_enter(&vd->lock);
+ if (!vd->reset_state) {
+ ASSERT(!vd->reset_ldc);
+ mutex_exit(&vd->lock);
+ return;
+ }
+ mutex_exit(&vd->lock);
+
+
+ PR0("Resetting connection state with %s", VD_CLIENT(vd));
+
+ /*
+ * Let any asynchronous I/O complete before possibly pulling the rug
+ * out from under it; defer checking vd->reset_ldc, as one of the
+ * asynchronous tasks might set it
+ */
+ ddi_taskq_wait(vd->completionq);
+
+
+ if ((vd->initialized & VD_DRING) &&
+ ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0))
+ PRN("ldc_mem_dring_unmap() returned errno %d", status);
+
+ if (vd->dring_task != NULL) {
+ ASSERT(vd->dring_len != 0);
+ kmem_free(vd->dring_task,
+ (sizeof (*vd->dring_task)) * vd->dring_len);
+ vd->dring_task = NULL;
+ }
+
+
+ mutex_enter(&vd->lock);
+ if (vd->reset_ldc && ((status = ldc_reset(vd->ldc_handle)) != 0))
+ PRN("ldc_reset() returned errno %d", status);
+
+ vd->initialized &= ~(VD_SID | VD_SEQ_NUM | VD_DRING);
+ vd->state = VD_STATE_INIT;
+ vd->max_msglen = sizeof (vio_msg_t); /* baseline vio message size */
+
+ vd->reset_state = B_FALSE;
+ vd->reset_ldc = B_FALSE;
+ mutex_exit(&vd->lock);
}
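The warning above is worth spelling out: ddi_taskq_wait(9F) blocks until every task dispatched to the queue has finished, so a completion-queue task calling into this path would wait on itself. A purely illustrative sketch of the self-deadlock (this function does not exist in the patch):

/* Hypothetical task running on vd->completionq -- do not do this */
static void
bad_completion_task(void *arg)
{
	vd_t *vd = arg;

	/*
	 * Assuming vd->reset_state is set, vd_reset_if_needed() calls
	 * ddi_taskq_wait(vd->completionq), which cannot return until
	 * this very task completes: deadlock.
	 */
	vd_reset_if_needed(vd);
}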
static int
-vd_bwrite(vd_t *vd, vd_dring_payload_t *request)
+vd_mark_elem_done(vd_t *vd, int idx, int elem_status)
{
- int status;
- caddr_t data;
+ boolean_t accepted;
+ int status;
+ vd_dring_entry_t *elem = VD_DRING_ELEM(idx);
- PR1("Write %ld bytes at block %lu", request->nbytes, request->addr);
- if (request->nbytes == 0)
- return (EINVAL); /* no service for trivial requests */
- data = kmem_alloc(request->nbytes, KM_SLEEP);
- status = vd_do_bwrite(vd, request->slice, request->addr,
- request->nbytes, request->cookie, request->ncookies, data);
- kmem_free(data, request->nbytes);
- return (status);
+ /* Acquire the element */
+ if ((status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) {
+ PRN("ldc_mem_dring_acquire() returned errno %d", status);
+ return (status);
+ }
+
+ /* Set the element's status and mark it done */
+ accepted = (elem->hdr.dstate == VIO_DESC_ACCEPTED);
+ if (accepted) {
+ elem->payload.status = elem_status;
+ elem->hdr.dstate = VIO_DESC_DONE;
+ } else {
+ /* Perhaps client timed out waiting for I/O... */
+ PRN("element %u no longer \"accepted\"", idx);
+ VD_DUMP_DRING_ELEM(elem);
+ }
+ /* Release the element */
+ if ((status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) {
+ PRN("ldc_mem_dring_release() returned errno %d", status);
+ return (status);
+ }
+
+ return (accepted ? 0 : EINVAL);
+}
+
+static void
+vd_complete_bio(void *arg)
+{
+ int status = 0;
+ vd_task_t *task = (vd_task_t *)arg;
+ vd_t *vd = task->vd;
+ vd_dring_payload_t *request = task->request;
+ struct buf *buf = &task->buf;
+
+
+ ASSERT(vd != NULL);
+ ASSERT(request != NULL);
+ ASSERT(task->msg != NULL);
+ ASSERT(task->msglen >= sizeof (*task->msg));
+ ASSERT(task->msgsize >= task->msglen);
+
+ /* Wait for the I/O to complete */
+ request->status = biowait(buf);
+
+ /* If data was read, copy it to the client */
+ if ((request->status == 0) && (request->operation == VD_OP_BREAD) &&
+ ((status = ldc_mem_copy(vd->ldc_handle, buf->b_un.b_addr, 0,
+ &request->nbytes, request->cookie, request->ncookies,
+ LDC_COPY_OUT)) != 0)) {
+ PRN("ldc_mem_copy() returned errno %d copying to client",
+ status);
+ }
+
+ /* Release I/O buffer */
+ kmem_free(buf->b_un.b_addr, buf->b_bcount);
+ biofini(buf);
+
+ /* Update the dring element for a dring client */
+ if ((status == 0) && (vd->xfer_mode == VIO_DRING_MODE))
+ status = vd_mark_elem_done(vd, task->index, request->status);
+
+ /*
+ * If a transport error occurred, arrange to "nack" the message when
+ * the final task in the descriptor element range completes
+ */
+ if (status != 0)
+ task->msg->tag.vio_subtype = VIO_SUBTYPE_NACK;
+
+ /*
+ * Only the final task for a range of elements will respond to and
+ * free the message
+ */
+ if (task->type == VD_NONFINAL_RANGE_TASK)
+ return;
+
+ /*
+ * Send the "ack" or "nack" back to the client; if sending the message
+ * via LDC fails, arrange to reset both the connection state and LDC
+ * itself
+ */
+ PR1("Sending %s",
+ (task->msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK");
+ if (send_msg(vd->ldc_handle, task->msg, task->msglen) != 0)
+ vd_need_reset(vd, B_TRUE);
+
+ /* Free the message now that it has been used for the reply */
+ kmem_free(task->msg, task->msgsize);
}
static void
@@ -347,7 +539,6 @@ vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, vd_ioctl_t *ioctl)
size_t nbytes = request->nbytes; /* modifiable copy */
- ASSERT(mutex_owned(&vd->lock));
ASSERT(request->slice < vd->nslices);
PR0("Performing %s", ioctl->operation_name);
@@ -379,8 +570,8 @@ vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, vd_ioctl_t *ioctl)
(void *)ioctl->arg)) != 0)
return (status);
} else if ((status = ldi_ioctl(vd->ldi_handle[request->slice],
- ioctl->cmd, (intptr_t)ioctl->arg, FKIOCTL, kcred,
- &rval)) != 0) {
+ ioctl->cmd, (intptr_t)ioctl->arg, (vd_open_flags | FKIOCTL),
+ kcred, &rval)) != 0) {
PR0("ldi_ioctl(%s) = errno %d", ioctl->cmd_name, status);
return (status);
}
@@ -453,7 +644,7 @@ vd_open_new_slices(vd_t *vd)
/* Get the (new) VTOC for updated slice sizes */
if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGVTOC, (intptr_t)&vtoc,
- FKIOCTL, kcred, &rval)) != 0) {
+ (vd_open_flags | FKIOCTL), kcred, &rval)) != 0) {
PRN("ldi_ioctl(DKIOCGVTOC) returned errno %d", status);
return;
}
@@ -483,13 +674,15 @@ vd_open_new_slices(vd_t *vd)
#define RNDSIZE(expr) P2ROUNDUP(sizeof (expr), sizeof (uint64_t))
static int
-vd_ioctl(vd_t *vd, vd_dring_payload_t *request)
+vd_ioctl(vd_task_t *task)
{
- int i, status;
- void *buf = NULL;
- struct dk_geom dk_geom = {0};
- struct vtoc vtoc = {0};
- vd_ioctl_t ioctl[] = {
+ int i, status;
+ void *buf = NULL;
+ struct dk_geom dk_geom = {0};
+ struct vtoc vtoc = {0};
+ vd_t *vd = task->vd;
+ vd_dring_payload_t *request = task->request;
+ vd_ioctl_t ioctl[] = {
/* Command (no-copy) operations */
{VD_OP_FLUSH, STRINGIZE(VD_OP_FLUSH), 0,
DKIOCFLUSHWRITECACHE, STRINGIZE(DKIOCFLUSHWRITECACHE),
@@ -522,7 +715,8 @@ vd_ioctl(vd_t *vd, vd_dring_payload_t *request)
size_t nioctls = (sizeof (ioctl))/(sizeof (ioctl[0]));
- ASSERT(mutex_owned(&vd->lock));
+ ASSERT(vd != NULL);
+ ASSERT(request != NULL);
ASSERT(request->slice < vd->nslices);
/*
@@ -554,6 +748,7 @@ vd_ioctl(vd_t *vd, vd_dring_payload_t *request)
if ((request->operation == VD_OP_SET_VTOC) &&
(vd->vdisk_type == VD_DISK_TYPE_DISK))
vd_open_new_slices(vd);
+ PR0("Returning %d", status);
return (status);
}
@@ -562,31 +757,33 @@ vd_ioctl(vd_t *vd, vd_dring_payload_t *request)
* been defined
*/
static const vds_operation_t vds_operation[] = {
- {VD_OP_BREAD, vd_bread},
- {VD_OP_BWRITE, vd_bwrite},
- {VD_OP_FLUSH, vd_ioctl},
- {VD_OP_GET_WCE, vd_ioctl},
- {VD_OP_SET_WCE, vd_ioctl},
- {VD_OP_GET_VTOC, vd_ioctl},
- {VD_OP_SET_VTOC, vd_ioctl},
- {VD_OP_GET_DISKGEOM, vd_ioctl},
- {VD_OP_SET_DISKGEOM, vd_ioctl}
+ {VD_OP_BREAD, vd_start_bio, vd_complete_bio},
+ {VD_OP_BWRITE, vd_start_bio, vd_complete_bio},
+ {VD_OP_FLUSH, vd_ioctl, NULL},
+ {VD_OP_GET_WCE, vd_ioctl, NULL},
+ {VD_OP_SET_WCE, vd_ioctl, NULL},
+ {VD_OP_GET_VTOC, vd_ioctl, NULL},
+ {VD_OP_SET_VTOC, vd_ioctl, NULL},
+ {VD_OP_GET_DISKGEOM, vd_ioctl, NULL},
+ {VD_OP_SET_DISKGEOM, vd_ioctl, NULL}
};
static const size_t vds_noperations =
(sizeof (vds_operation))/(sizeof (vds_operation[0]));
/*
- * Process a request using a defined operation
+ * Process a task specifying a client I/O request
*/
static int
-vd_process_request(vd_t *vd, vd_dring_payload_t *request)
+vd_process_task(vd_task_t *task)
{
- int i;
+ int i, status;
+ vd_t *vd = task->vd;
+ vd_dring_payload_t *request = task->request;
- PR1("Entered");
- ASSERT(mutex_owned(&vd->lock));
+ ASSERT(vd != NULL);
+ ASSERT(request != NULL);
/* Range-check slice */
if (request->slice >= vd->nslices) {
@@ -595,41 +792,37 @@ vd_process_request(vd_t *vd, vd_dring_payload_t *request)
return (EINVAL);
}
- /* Perform the requested operation */
+ /* Find the requested operation */
for (i = 0; i < vds_noperations; i++)
if (request->operation == vds_operation[i].operation)
- return (vds_operation[i].function(vd, request));
-
- /* No matching operation found */
- PRN("Unsupported operation %u", request->operation);
- return (ENOTSUP);
-}
-
-static int
-send_msg(ldc_handle_t ldc_handle, void *msg, size_t msglen)
-{
- int retry, status;
- size_t nbytes;
-
+ break;
+ if (i == vds_noperations) {
+ PRN("Unsupported operation %u", request->operation);
+ return (ENOTSUP);
+ }
- for (retry = 0, status = EWOULDBLOCK;
- retry < vds_ldc_retries && status == EWOULDBLOCK;
- retry++) {
- PR1("ldc_write() attempt %d", (retry + 1));
- nbytes = msglen;
- status = ldc_write(ldc_handle, msg, &nbytes);
+ /* Start the operation */
+ if ((status = vds_operation[i].start(task)) != EINPROGRESS) {
+ request->status = status; /* op succeeded or failed */
+ return (0); /* but request completed */
}
- if (status != 0) {
- PRN("ldc_write() returned errno %d", status);
- return (status);
- } else if (nbytes != msglen) {
- PRN("ldc_write() performed only partial write");
- return (EIO);
+ ASSERT(vds_operation[i].complete != NULL); /* debug case */
+ if (vds_operation[i].complete == NULL) { /* non-debug case */
+ PRN("Unexpected return of EINPROGRESS "
+ "with no I/O completion handler");
+ request->status = EIO; /* operation failed */
+ return (0); /* but request completed */
}
- PR1("SENT %lu bytes", msglen);
- return (0);
+ /* Queue a task to complete the operation */
+ status = ddi_taskq_dispatch(vd->completionq, vds_operation[i].complete,
+ task, DDI_SLEEP);
+ /* ddi_taskq_dispatch(9f) guarantees success with DDI_SLEEP */
+ ASSERT(status == DDI_SUCCESS);
+
+ PR1("Operation in progress");
+ return (EINPROGRESS); /* completion handler will finish request */
}
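Under this split, a start function either completes the request synchronously (returning its status) or returns EINPROGRESS after arranging for the blocking work to run later. A hedged sketch of what a new asynchronous entry in vds_operation[] might look like; the operation and both functions below are hypothetical:

/* Hypothetical start/complete pair following the same contract */
static int
vd_start_example(vd_task_t *task)
{
	/* initiate the work without blocking, e.g. via ldi_strategy() */
	return (EINPROGRESS);	/* vd_complete_example() must finish it */
}

static void
vd_complete_example(void *arg)
{
	vd_task_t *task = (vd_task_t *)arg;

	/*
	 * Block here for the result, then set task->request->status;
	 * the final task in a range must also ack/nack the client
	 * message -- see vd_complete_bio() for the full sequence.
	 */
	task->request->status = 0;
}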
/*
@@ -782,13 +975,12 @@ vd_process_attr_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
vd_attr_msg_t *attr_msg = (vd_attr_msg_t *)msg;
- PR0("Entered");
- ASSERT(mutex_owned(&vd->lock));
ASSERT(msglen >= sizeof (msg->tag));
if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
VIO_ATTR_INFO)) {
- return (ENOMSG); /* not an attribute message */
+ PR0("Message is not an attribute message");
+ return (ENOMSG);
}
if (msglen != sizeof (*attr_msg)) {
@@ -835,6 +1027,14 @@ vd_process_attr_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
* their cookies
*/
vd->max_msglen = MAX(vd->max_msglen, max_inband_msglen);
+
+ /*
+ * Initialize the data structure for processing in-band I/O
+ * request descriptors
+ */
+ vd->inband_task.vd = vd;
+ vd->inband_task.index = 0;
+ vd->inband_task.type = VD_FINAL_RANGE_TASK; /* range == 1 */
}
attr_msg->vdisk_size = vd->vdisk_size;
@@ -853,13 +1053,12 @@ vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
vio_dring_reg_msg_t *reg_msg = (vio_dring_reg_msg_t *)msg;
- PR0("Entered");
- ASSERT(mutex_owned(&vd->lock));
ASSERT(msglen >= sizeof (msg->tag));
if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
VIO_DRING_REG)) {
- return (ENOMSG); /* not a register-dring message */
+ PR0("Message is not a register-dring message");
+ return (ENOMSG);
}
if (msglen < sizeof (*reg_msg)) {
@@ -881,6 +1080,12 @@ vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
return (EBADMSG);
}
+ if (reg_msg->num_descriptors > INT32_MAX) {
+ PRN("reg_msg->num_descriptors = %u; must be <= %u (%s)",
+		    reg_msg->num_descriptors, INT32_MAX, STRINGIZE(INT32_MAX));
+ return (EBADMSG);
+ }
+
if (reg_msg->ncookies != 1) {
/*
* In addition to fixing the assertion in the success case
@@ -928,7 +1133,7 @@ vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
}
- /* Valid message and dring mapped */
+ /* Initialize for valid message and mapped dring */
PR1("descriptor size = %u, dring length = %u",
vd->descriptor_size, vd->dring_len);
vd->initialized |= VD_DRING;
@@ -937,6 +1142,19 @@ vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
vd->descriptor_size = reg_msg->descriptor_size;
vd->dring_len = reg_msg->num_descriptors;
reg_msg->dring_ident = vd->dring_ident;
+
+ /*
+ * Allocate and initialize a "shadow" array of data structures for
+ * tasks to process I/O requests in dring elements
+ */
+ vd->dring_task =
+ kmem_zalloc((sizeof (*vd->dring_task)) * vd->dring_len, KM_SLEEP);
+ for (int i = 0; i < vd->dring_len; i++) {
+ vd->dring_task[i].vd = vd;
+ vd->dring_task[i].index = i;
+ vd->dring_task[i].request = &VD_DRING_ELEM(i)->payload;
+ }
+
return (0);
}
@@ -946,13 +1164,12 @@ vd_process_dring_unreg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
vio_dring_unreg_msg_t *unreg_msg = (vio_dring_unreg_msg_t *)msg;
- PR0("Entered");
- ASSERT(mutex_owned(&vd->lock));
ASSERT(msglen >= sizeof (msg->tag));
if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
VIO_DRING_UNREG)) {
- return (ENOMSG); /* not an unregister-dring message */
+ PR0("Message is not an unregister-dring message");
+ return (ENOMSG);
}
if (msglen != sizeof (*unreg_msg)) {
@@ -973,11 +1190,12 @@ vd_process_dring_unreg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
static int
process_rdx_msg(vio_msg_t *msg, size_t msglen)
{
- PR0("Entered");
ASSERT(msglen >= sizeof (msg->tag));
- if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_RDX))
- return (ENOMSG); /* not an RDX message */
+ if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_RDX)) {
+ PR0("Message is not an RDX message");
+ return (ENOMSG);
+ }
if (msglen != sizeof (vio_rdx_msg_t)) {
PRN("Expected %lu-byte RDX message; received %lu bytes",
@@ -985,36 +1203,17 @@ process_rdx_msg(vio_msg_t *msg, size_t msglen)
return (EBADMSG);
}
+ PR0("Valid RDX message");
return (0);
}
-static void
-vd_reset_connection(vd_t *vd, boolean_t reset_ldc)
-{
- int status = 0;
-
-
- ASSERT(mutex_owned(&vd->lock));
- PR0("Resetting connection with %s", VD_CLIENT(vd));
- if ((vd->initialized & VD_DRING) &&
- ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0))
- PRN("ldc_mem_dring_unmap() returned errno %d", status);
- if ((reset_ldc == B_TRUE) &&
- ((status = ldc_reset(vd->ldc_handle)) != 0))
- PRN("ldc_reset() returned errno %d", status);
- vd->initialized &= ~(VD_SID | VD_SEQ_NUM | VD_DRING);
- vd->state = VD_STATE_INIT;
- vd->max_msglen = sizeof (vio_msg_t); /* baseline vio message size */
-}
-
static int
vd_check_seq_num(vd_t *vd, uint64_t seq_num)
{
- ASSERT(mutex_owned(&vd->lock));
if ((vd->initialized & VD_SEQ_NUM) && (seq_num != vd->seq_num + 1)) {
PRN("Received seq_num %lu; expected %lu",
seq_num, (vd->seq_num + 1));
- vd_reset_connection(vd, B_FALSE);
+ vd_need_reset(vd, B_FALSE);
return (1);
}
@@ -1040,19 +1239,19 @@ expected_inband_size(vd_dring_inband_msg_t *msg)
* operating on them within a descriptor ring
*/
static int
-vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
+vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen, size_t msgsize)
{
size_t expected;
vd_dring_inband_msg_t *desc_msg = (vd_dring_inband_msg_t *)msg;
- PR1("Entered");
- ASSERT(mutex_owned(&vd->lock));
ASSERT(msglen >= sizeof (msg->tag));
if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO,
- VIO_DESC_DATA))
- return (ENOMSG); /* not an in-band-descriptor message */
+ VIO_DESC_DATA)) {
+ PR1("Message is not an in-band-descriptor message");
+ return (ENOMSG);
+ }
if (msglen < sizeof (*desc_msg)) {
PRN("Expected at least %lu-byte descriptor message; "
@@ -1066,129 +1265,124 @@ vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
return (EBADMSG);
}
- if (vd_check_seq_num(vd, desc_msg->hdr.seq_num) != 0) {
+ if (vd_check_seq_num(vd, desc_msg->hdr.seq_num) != 0)
return (EBADMSG);
- }
- /* Valid message; process the request */
- desc_msg->payload.status = vd_process_request(vd, &desc_msg->payload);
- return (0);
+ /*
+ * Valid message: Set up the in-band descriptor task and process the
+ * request. Arrange to acknowledge the client's message, unless an
+ * error processing the descriptor task results in setting
+ * VIO_SUBTYPE_NACK
+ */
+ PR1("Valid in-band-descriptor message");
+ msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
+ vd->inband_task.msg = msg;
+ vd->inband_task.msglen = msglen;
+ vd->inband_task.msgsize = msgsize;
+ vd->inband_task.request = &desc_msg->payload;
+ return (vd_process_task(&vd->inband_task));
}
-static boolean_t
-vd_accept_dring_elems(vd_t *vd, uint32_t start, uint32_t ndesc)
+static int
+vd_process_element(vd_t *vd, vd_task_type_t type, uint32_t idx,
+ vio_msg_t *msg, size_t msglen, size_t msgsize)
{
- uint32_t i, n;
+ int status;
+ boolean_t ready;
+ vd_dring_entry_t *elem = VD_DRING_ELEM(idx);
- /* Check descriptor states */
- for (n = ndesc, i = start; n > 0; n--, i = (i + 1) % vd->dring_len) {
- if (VD_DRING_ELEM(i)->hdr.dstate != VIO_DESC_READY) {
- PRN("descriptor %u not ready", i);
- VD_DUMP_DRING_ELEM(VD_DRING_ELEM(i));
- return (B_FALSE);
- }
+ /* Accept the updated dring element */
+ if ((status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) {
+ PRN("ldc_mem_dring_acquire() returned errno %d", status);
+ return (status);
}
+ ready = (elem->hdr.dstate == VIO_DESC_READY);
+ if (ready) {
+ elem->hdr.dstate = VIO_DESC_ACCEPTED;
+ } else {
+ PRN("descriptor %u not ready", idx);
+ VD_DUMP_DRING_ELEM(elem);
+ }
+ if ((status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) {
+ PRN("ldc_mem_dring_release() returned errno %d", status);
+ return (status);
+ }
+ if (!ready)
+ return (EBUSY);
- /* Descriptors are valid; accept them */
- for (n = ndesc, i = start; n > 0; n--, i = (i + 1) % vd->dring_len)
- VD_DRING_ELEM(i)->hdr.dstate = VIO_DESC_ACCEPTED;
- return (B_TRUE);
+ /* Initialize a task and process the accepted element */
+ PR1("Processing dring element %u", idx);
+ vd->dring_task[idx].type = type;
+ vd->dring_task[idx].msg = msg;
+ vd->dring_task[idx].msglen = msglen;
+ vd->dring_task[idx].msgsize = msgsize;
+ if ((status = vd_process_task(&vd->dring_task[idx])) != EINPROGRESS)
+ status = vd_mark_elem_done(vd, idx, elem->payload.status);
+
+ return (status);
}
static int
-vd_process_dring(vd_t *vd, uint32_t start, uint32_t end)
+vd_process_element_range(vd_t *vd, int start, int end,
+ vio_msg_t *msg, size_t msglen, size_t msgsize)
{
- int status;
- boolean_t accepted;
- uint32_t i, io_status, n, ndesc;
+ int i, n, nelem, status = 0;
+ boolean_t inprogress = B_FALSE;
+ vd_task_type_t type;
- ASSERT(mutex_owned(&vd->lock));
- PR1("start = %u, end = %u", start, end);
+ ASSERT(start >= 0);
+ ASSERT(end >= 0);
- /* Validate descriptor range */
- if ((start >= vd->dring_len) || (end >= vd->dring_len)) {
- PRN("\"start\" = %u, \"end\" = %u; both must be less than %u",
- start, end, vd->dring_len);
- return (EINVAL);
- }
+ /*
+ * Arrange to acknowledge the client's message, unless an error
+ * processing one of the dring elements results in setting
+ * VIO_SUBTYPE_NACK
+ */
+ msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
- /* Acquire updated dring elements */
- if ((status = ldc_mem_dring_acquire(vd->dring_handle,
- start, end)) != 0) {
- PRN("ldc_mem_dring_acquire() returned errno %d", status);
- return (status);
- }
- /* Accept updated dring elements */
- ndesc = ((end < start) ? end + vd->dring_len : end) - start + 1;
- PR1("ndesc = %u", ndesc);
- accepted = vd_accept_dring_elems(vd, start, ndesc);
- /* Release dring elements */
- if ((status = ldc_mem_dring_release(vd->dring_handle,
- start, end)) != 0) {
- PRN("ldc_mem_dring_release() returned errno %d", status);
- return (status);
+ /*
+ * Process the dring elements in the range
+ */
+ nelem = ((end < start) ? end + vd->dring_len : end) - start + 1;
+ for (i = start, n = nelem; n > 0; i = (i + 1) % vd->dring_len, n--) {
+ ((vio_dring_msg_t *)msg)->end_idx = i;
+ type = (n == 1) ? VD_FINAL_RANGE_TASK : VD_NONFINAL_RANGE_TASK;
+ status = vd_process_element(vd, type, i, msg, msglen, msgsize);
+ if (status == EINPROGRESS)
+ inprogress = B_TRUE;
+ else if (status != 0)
+ break;
}
- /* If a descriptor was in the wrong state, return an error */
- if (!accepted)
- return (EINVAL);
+ /*
+ * If some, but not all, operations of a multi-element range are in
+ * progress, wait for other operations to complete before returning
+ * (which will result in "ack" or "nack" of the message). Note that
+ * all outstanding operations will need to complete, not just the ones
+	 * corresponding to the current range of dring elements; however, as
+ * this situation is an error case, performance is less critical.
+ */
+ if ((nelem > 1) && (status != EINPROGRESS) && inprogress)
+ ddi_taskq_wait(vd->completionq);
- /* Process accepted dring elements */
- for (n = ndesc, i = start; n > 0; n--, i = (i + 1) % vd->dring_len) {
- vd_dring_entry_t *elem = VD_DRING_ELEM(i);
-
- /* Process descriptor outside acquire/release bracket */
- PR1("Processing dring element %u", i);
- io_status = vd_process_request(vd, &elem->payload);
-
- /* Re-acquire client's dring element */
- if ((status = ldc_mem_dring_acquire(vd->dring_handle,
- i, i)) != 0) {
- PRN("ldc_mem_dring_acquire() returned errno %d",
- status);
- return (status);
- }
- /* Update processed element */
- if (elem->hdr.dstate == VIO_DESC_ACCEPTED) {
- elem->payload.status = io_status;
- elem->hdr.dstate = VIO_DESC_DONE;
- } else {
- /* Perhaps client timed out waiting for I/O... */
- accepted = B_FALSE;
- PRN("element %u no longer \"accepted\"", i);
- VD_DUMP_DRING_ELEM(elem);
- }
- /* Release updated processed element */
- if ((status = ldc_mem_dring_release(vd->dring_handle,
- i, i)) != 0) {
- PRN("ldc_mem_dring_release() returned errno %d",
- status);
- return (status);
- }
- /* If the descriptor was in the wrong state, return an error */
- if (!accepted)
- return (EINVAL);
- }
-
- return (0);
+ return (status);
}
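The nelem computation handles ranges that wrap past the end of the ring. A worked example with illustrative values: for dring_len = 8, start = 6 and end = 1, nelem = (1 + 8) - 6 + 1 = 4, and the loop visits elements 6, 7, 0 and 1, with only the last marked VD_FINAL_RANGE_TASK:

/* Illustrative arithmetic only; not part of the driver */
static int
nelem_example(void)
{
	int dring_len = 8, start = 6, end = 1;

	/* wraps around the ring: visits 6, 7, 0, 1 */
	return (((end < start) ? end + dring_len : end) - start + 1);	/* 4 */
}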
static int
-vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
+vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen, size_t msgsize)
{
vio_dring_msg_t *dring_msg = (vio_dring_msg_t *)msg;
- PR1("Entered");
- ASSERT(mutex_owned(&vd->lock));
ASSERT(msglen >= sizeof (msg->tag));
if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO,
VIO_DRING_DATA)) {
- return (ENOMSG); /* not a dring-data message */
+ PR1("Message is not a dring-data message");
+ return (ENOMSG);
}
if (msglen != sizeof (*dring_msg)) {
@@ -1197,9 +1391,8 @@ vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
return (EBADMSG);
}
- if (vd_check_seq_num(vd, dring_msg->seq_num) != 0) {
+ if (vd_check_seq_num(vd, dring_msg->seq_num) != 0)
return (EBADMSG);
- }
if (dring_msg->dring_ident != vd->dring_ident) {
PRN("Expected dring ident %lu; received ident %lu",
@@ -1207,10 +1400,24 @@ vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
return (EBADMSG);
}
+ if (dring_msg->start_idx >= vd->dring_len) {
+ PRN("\"start_idx\" = %u; must be less than %u",
+ dring_msg->start_idx, vd->dring_len);
+ return (EBADMSG);
+ }
- /* Valid message; process dring */
- dring_msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
- return (vd_process_dring(vd, dring_msg->start_idx, dring_msg->end_idx));
+ if ((dring_msg->end_idx < 0) ||
+ (dring_msg->end_idx >= vd->dring_len)) {
+ PRN("\"end_idx\" = %u; must be >= 0 and less than %u",
+ dring_msg->end_idx, vd->dring_len);
+ return (EBADMSG);
+ }
+
+ /* Valid message; process range of updated dring elements */
+ PR1("Processing descriptor range, start = %u, end = %u",
+ dring_msg->start_idx, dring_msg->end_idx);
+ return (vd_process_element_range(vd, dring_msg->start_idx,
+ dring_msg->end_idx, msg, msglen, msgsize));
}
static int
@@ -1241,14 +1448,13 @@ recv_msg(ldc_handle_t ldc_handle, void *msg, size_t *nbytes)
}
static int
-vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
+vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen, size_t msgsize)
{
int status;
PR1("Processing (%x/%x/%x) message", msg->tag.vio_msgtype,
msg->tag.vio_subtype, msg->tag.vio_subtype_env);
- ASSERT(mutex_owned(&vd->lock));
/*
* Validate session ID up front, since it applies to all messages
@@ -1338,7 +1544,7 @@ vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
case VD_STATE_DATA:
switch (vd->xfer_mode) {
case VIO_DESC_MODE: /* expect in-band-descriptor message */
- return (vd_process_desc_msg(vd, msg, msglen));
+ return (vd_process_desc_msg(vd, msg, msglen, msgsize));
case VIO_DRING_MODE: /* expect dring-data or unreg-dring */
/*
@@ -1346,7 +1552,7 @@ vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
* them first
*/
if ((status = vd_process_dring_msg(vd, msg,
- msglen)) != ENOMSG)
+ msglen, msgsize)) != ENOMSG)
return (status);
/*
@@ -1371,15 +1577,13 @@ vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
}
}
-static void
-vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
+static int
+vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen, size_t msgsize)
{
int status;
boolean_t reset_ldc = B_FALSE;
- ASSERT(mutex_owned(&vd->lock));
-
/*
* Check that the message is at least big enough for a "tag", so that
* message processing can proceed based on tag-specified message type
@@ -1387,19 +1591,22 @@ vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
if (msglen < sizeof (vio_msg_tag_t)) {
PRN("Received short (%lu-byte) message", msglen);
/* Can't "nack" short message, so drop the big hammer */
- vd_reset_connection(vd, B_TRUE);
- return;
+ vd_need_reset(vd, B_TRUE);
+ return (EBADMSG);
}
/*
* Process the message
*/
- switch (status = vd_do_process_msg(vd, msg, msglen)) {
+ switch (status = vd_do_process_msg(vd, msg, msglen, msgsize)) {
case 0:
/* "ack" valid, successfully-processed messages */
msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
break;
+ case EINPROGRESS:
+ /* The completion handler will "ack" or "nack" the message */
+ return (EINPROGRESS);
case ENOMSG:
PRN("Received unexpected message");
_NOTE(FALLTHROUGH);
@@ -1417,15 +1624,29 @@ vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
break;
}
- /* "ack" or "nack" the message */
+ /* Send the "ack" or "nack" to the client */
PR1("Sending %s",
(msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK");
if (send_msg(vd->ldc_handle, msg, msglen) != 0)
reset_ldc = B_TRUE;
- /* Reset the connection for nack'ed or failed messages */
+ /* Arrange to reset the connection for nack'ed or failed messages */
if ((status != 0) || reset_ldc)
- vd_reset_connection(vd, reset_ldc);
+ vd_need_reset(vd, reset_ldc);
+
+ return (status);
+}
+
+static boolean_t
+vd_enabled(vd_t *vd)
+{
+ boolean_t enabled;
+
+
+ mutex_enter(&vd->lock);
+ enabled = vd->enabled;
+ mutex_exit(&vd->lock);
+ return (enabled);
}
static void
@@ -1435,74 +1656,70 @@ vd_recv_msg(void *arg)
int status = 0;
- PR2("Entered");
ASSERT(vd != NULL);
- mutex_enter(&vd->lock);
- /*
- * Receive and process any messages in the LDC queue; max_msglen is
- * reset each time through the loop, as vd->max_msglen can increase
- * during connection handshake
- */
- for (size_t max_msglen = vd->max_msglen;
- vd->enabled && status == 0;
- max_msglen = vd->max_msglen) {
- size_t msglen = max_msglen;
- vio_msg_t *vio_msg = kmem_alloc(max_msglen, KM_SLEEP);
-
- if ((status = recv_msg(vd->ldc_handle, vio_msg, &msglen)) == 0)
- vd_process_msg(vd, vio_msg, msglen);
- else if (status != ENOMSG)
- vd_reset_connection(vd, B_TRUE);
- kmem_free(vio_msg, max_msglen);
+ PR2("New task to receive incoming message(s)");
+ while (vd_enabled(vd) && status == 0) {
+ size_t msglen, msgsize;
+ vio_msg_t *vio_msg;
+
+
+ /*
+ * Receive and process a message
+ */
+ vd_reset_if_needed(vd); /* can change vd->max_msglen */
+ msgsize = vd->max_msglen; /* stable copy for alloc/free */
+ msglen = msgsize; /* actual length after recv_msg() */
+ vio_msg = kmem_alloc(msgsize, KM_SLEEP);
+ if ((status = recv_msg(vd->ldc_handle, vio_msg, &msglen)) ==
+ 0) {
+ if (vd_process_msg(vd, vio_msg, msglen, msgsize) ==
+ EINPROGRESS)
+ continue; /* handler will free msg */
+ } else if (status != ENOMSG) {
+ /* Probably an LDC failure; arrange to reset it */
+ vd_need_reset(vd, B_TRUE);
+ }
+ kmem_free(vio_msg, msgsize);
}
- mutex_exit(&vd->lock);
- PR2("Returning");
+ PR2("Task finished");
}
static uint_t
-vd_do_handle_ldc_events(vd_t *vd, uint64_t event)
+vd_handle_ldc_events(uint64_t event, caddr_t arg)
{
- ASSERT(mutex_owned(&vd->lock));
+ vd_t *vd = (vd_t *)(void *)arg;
+
- if (!vd->enabled)
+ ASSERT(vd != NULL);
+
+ if (!vd_enabled(vd))
return (LDC_SUCCESS);
if (event & LDC_EVT_RESET) {
- PR0("Channel was reset");
+ PR0("LDC channel was reset");
return (LDC_SUCCESS);
}
if (event & LDC_EVT_UP) {
- /* Reset the connection state when channel comes (back) up */
- vd_reset_connection(vd, B_FALSE);
+ PR0("LDC channel came up: Resetting client connection state");
+ vd_need_reset(vd, B_FALSE);
}
if (event & LDC_EVT_READ) {
+ int status;
+
PR1("New data available");
/* Queue a task to receive the new data */
- if (ddi_taskq_dispatch(vd->taskq, vd_recv_msg, vd, DDI_SLEEP) !=
- DDI_SUCCESS)
- PRN("Unable to dispatch vd_recv_msg()");
+ status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd,
+ DDI_SLEEP);
+ /* ddi_taskq_dispatch(9f) guarantees success with DDI_SLEEP */
+ ASSERT(status == DDI_SUCCESS);
}
return (LDC_SUCCESS);
}
static uint_t
-vd_handle_ldc_events(uint64_t event, caddr_t arg)
-{
- uint_t status;
- vd_t *vd = (vd_t *)(void *)arg;
-
-
- ASSERT(vd != NULL);
- mutex_enter(&vd->lock);
- status = vd_do_handle_ldc_events(vd, event);
- mutex_exit(&vd->lock);
- return (status);
-}
-
-static uint_t
vds_check_for_vd(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
{
_NOTE(ARGUNUSED(key, val))
@@ -1519,15 +1736,15 @@ vds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
vds_t *vds;
- PR0("Entered");
switch (cmd) {
case DDI_DETACH:
/* the real work happens below */
break;
case DDI_SUSPEND:
- /* nothing to do for this non-device */
+ PR0("No action required for DDI_SUSPEND");
return (DDI_SUCCESS);
default:
+ PRN("Unrecognized \"cmd\"");
return (DDI_FAILURE);
}
@@ -1552,8 +1769,6 @@ vds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
if (vds->initialized & VDS_LDI)
(void) ldi_ident_release(vds->ldi_ident);
mod_hash_destroy_hash(vds->vd_table);
- if (vds->initialized & VDS_LOCKING)
- mutex_destroy(&vds->lock);
ddi_soft_state_free(vds_state, instance);
return (DDI_SUCCESS);
}
@@ -1584,7 +1799,7 @@ vd_setup_full_disk(vd_t *vd)
/* Get the VTOC for slice sizes */
if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGVTOC, (intptr_t)&vtoc,
- FKIOCTL, kcred, &rval)) != 0) {
+ (vd_open_flags | FKIOCTL), kcred, &rval)) != 0) {
PRN("ldi_ioctl(DKIOCGVTOC) returned errno %d", status);
return (status);
}
@@ -1701,7 +1916,8 @@ vd_setup_vd(char *block_device, vd_t *vd)
/* Get dk_cinfo to determine slice of backing block device */
if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCINFO,
- (intptr_t)&dk_cinfo, FKIOCTL, kcred, &rval)) != 0) {
+ (intptr_t)&dk_cinfo, (vd_open_flags | FKIOCTL), kcred,
+ &rval)) != 0) {
PRN("ldi_ioctl(DKIOCINFO) returned errno %d for %s",
status, block_device);
return (status);
@@ -1726,7 +1942,8 @@ vd_setup_vd(char *block_device, vd_t *vd)
/* Initialize dk_geom structure for single-slice block device */
if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGGEOM,
- (intptr_t)&vd->dk_geom, FKIOCTL, kcred, &rval)) != 0) {
+ (intptr_t)&vd->dk_geom, (vd_open_flags | FKIOCTL), kcred,
+ &rval)) != 0) {
PRN("ldi_ioctl(DKIOCGEOM) returned errno %d for %s",
status, block_device);
return (status);
@@ -1747,7 +1964,8 @@ vd_setup_vd(char *block_device, vd_t *vd)
/* Initialize vtoc structure for single-slice block device */
if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGVTOC,
- (intptr_t)&vd->vtoc, FKIOCTL, kcred, &rval)) != 0) {
+ (intptr_t)&vd->vtoc, (vd_open_flags | FKIOCTL), kcred,
+ &rval)) != 0) {
PRN("ldi_ioctl(DKIOCGVTOC) returned errno %d for %s",
status, block_device);
return (status);
@@ -1811,16 +2029,22 @@ vds_do_init_vd(vds_t *vds, uint64_t id, char *block_device, uint64_t ldc_id,
vd->initialized |= VD_LOCKING;
- /* Create the task queue for the vdisk */
- (void) snprintf(tq_name, sizeof (tq_name), "vd%lu", id);
+ /* Create start and completion task queues for the vdisk */
+ (void) snprintf(tq_name, sizeof (tq_name), "vd_startq%lu", id);
+ PR1("tq_name = %s", tq_name);
+ if ((vd->startq = ddi_taskq_create(vds->dip, tq_name, 1,
+ TASKQ_DEFAULTPRI, 0)) == NULL) {
+ PRN("Could not create task queue");
+ return (EIO);
+ }
+ (void) snprintf(tq_name, sizeof (tq_name), "vd_completionq%lu", id);
PR1("tq_name = %s", tq_name);
- if ((vd->taskq = ddi_taskq_create(vds->dip, tq_name, 1,
+ if ((vd->completionq = ddi_taskq_create(vds->dip, tq_name, 1,
TASKQ_DEFAULTPRI, 0)) == NULL) {
PRN("Could not create task queue");
return (EIO);
}
- vd->initialized |= VD_TASKQ;
- vd->enabled = 1; /* before callback can dispatch to taskq */
+ vd->enabled = 1; /* before callback can dispatch to startq */
/* Bring up LDC */
@@ -1864,10 +2088,11 @@ vds_destroy_vd(void *arg)
vd_t *vd = (vd_t *)arg;
- PR0("Entered");
if (vd == NULL)
return;
+ PR0("Destroying vdisk state");
+
/* Disable queuing requests for the vdisk */
if (vd->initialized & VD_LOCKING) {
mutex_enter(&vd->lock);
@@ -1875,9 +2100,19 @@ vds_destroy_vd(void *arg)
mutex_exit(&vd->lock);
}
- /* Drain and destroy the task queue (*before* shutting down LDC) */
- if (vd->initialized & VD_TASKQ)
- ddi_taskq_destroy(vd->taskq); /* waits for queued tasks */
+ /* Drain and destroy start queue (*before* destroying completionq) */
+ if (vd->startq != NULL)
+ ddi_taskq_destroy(vd->startq); /* waits for queued tasks */
+
+ /* Drain and destroy completion queue (*before* shutting down LDC) */
+ if (vd->completionq != NULL)
+ ddi_taskq_destroy(vd->completionq); /* waits for tasks */
+
+ if (vd->dring_task != NULL) {
+ ASSERT(vd->dring_len != 0);
+ kmem_free(vd->dring_task,
+ (sizeof (*vd->dring_task)) * vd->dring_len);
+ }
/* Shut down LDC */
if (vd->initialized & VD_LDC) {
@@ -2171,9 +2406,6 @@ vds_do_attach(dev_info_t *dip)
sizeof (void *));
ASSERT(vds->vd_table != NULL);
- mutex_init(&vds->lock, NULL, MUTEX_DRIVER, NULL);
- vds->initialized |= VDS_LOCKING;
-
if ((status = ldi_ident_from_dip(dip, &vds->ldi_ident)) != 0) {
PRN("ldi_ident_from_dip() returned errno %d", status);
return (DDI_FAILURE);
@@ -2205,14 +2437,14 @@ vds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
int status;
- PR0("Entered");
switch (cmd) {
case DDI_ATTACH:
+ PR0("Attaching");
if ((status = vds_do_attach(dip)) != DDI_SUCCESS)
(void) vds_detach(dip, DDI_DETACH);
return (status);
case DDI_RESUME:
- /* nothing to do for this non-device */
+ PR0("No action required for DDI_RESUME");
return (DDI_SUCCESS);
default:
return (DDI_FAILURE);
@@ -2251,6 +2483,7 @@ _init(void)
{
int i, status;
+
if ((status = ddi_soft_state_init(&vds_state, sizeof (vds_t), 1)) != 0)
return (status);
if ((status = mod_install(&modlinkage)) != 0) {
@@ -2276,6 +2509,7 @@ _fini(void)
{
int status;
+
if ((status = mod_remove(&modlinkage)) != 0)
return (status);
ddi_soft_state_fini(&vds_state);
diff --git a/usr/src/uts/sun4v/io/vio_util.c b/usr/src/uts/sun4v/io/vio_util.c
new file mode 100644
index 0000000000..42cbf34fa2
--- /dev/null
+++ b/usr/src/uts/sun4v/io/vio_util.c
@@ -0,0 +1,184 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/kmem.h>
+#include <sys/ksynch.h>
+#include <sys/stream.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/vio_util.h>
+
+/*
+ * Create a pool of mblks from which future vio_allocb() requests
+ * will be serviced.
+ *
+ * NOTE: num_mblks has to be non-zero and a power of 2
+ *
+ * Returns 0 on success or EINVAL if num_mblks is zero or not
+ * a power of 2.
+ */
+int
+vio_create_mblks(uint64_t num_mblks, size_t mblk_size, vio_mblk_pool_t **poolp)
+{
+ vio_mblk_pool_t *vmplp;
+ vio_mblk_t *vmp;
+ uint8_t *datap;
+ int i;
+
+ if (!(num_mblks) || (!ISP2(num_mblks))) {
+ *poolp = 0;
+ return (EINVAL);
+ }
+
+ vmplp = kmem_zalloc(sizeof (*vmplp), KM_SLEEP);
+ vmplp->quelen = num_mblks;
+	vmplp->quemask = num_mblks - 1;	/* requires quelen to be a power of 2 */
+ vmplp->mblk_size = mblk_size;
+
+ mutex_init(&vmplp->hlock, NULL, MUTEX_DRIVER,
+ DDI_INTR_PRI(DDI_INTR_SOFTPRI_DEFAULT));
+ mutex_init(&vmplp->tlock, NULL, MUTEX_DRIVER,
+ DDI_INTR_PRI(DDI_INTR_SOFTPRI_DEFAULT));
+
+ vmplp->basep = kmem_zalloc(num_mblks * sizeof (vio_mblk_t), KM_SLEEP);
+ vmplp->datap = kmem_zalloc(num_mblks * mblk_size, KM_SLEEP);
+ vmplp->nextp = NULL;
+
+ /* create a queue of pointers to free vio_mblk_t's */
+ vmplp->quep = kmem_zalloc(vmplp->quelen * sizeof (vio_mblk_t *),
+ KM_SLEEP);
+ vmplp->head = 0;
+ vmplp->tail = 0;
+
+ for (i = 0, datap = vmplp->datap; i < num_mblks; i++) {
+
+ vmp = &(vmplp->basep[i]);
+ vmp->vmplp = vmplp;
+ vmp->datap = datap;
+ vmp->reclaim.free_func = vio_freeb;
+ vmp->reclaim.free_arg = (caddr_t)vmp;
+ vmp->mp = desballoc(vmp->datap, mblk_size, BPRI_MED,
+ &vmp->reclaim);
+
+ if (vmp->mp == NULL)
+ continue;
+
+		/* put this vmp on the free queue */
+ vmplp->quep[vmplp->tail] = vmp;
+ vmplp->tail = (vmplp->tail + 1) & vmplp->quemask;
+
+ datap += mblk_size;
+ }
+
+ *poolp = vmplp;
+ return (0);
+}
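A minimal usage sketch for the pool; the caller, sizes, and retry interval below are hypothetical (a real consumer such as vnet would keep the pool for the life of its channel):

/* Hypothetical consumer of the vio mblk pool */
static int
example_pool_user(void)
{
	vio_mblk_pool_t	*pool;
	mblk_t		*mp;
	int		rv;

	/* 64 is a power of 2, as vio_create_mblks() requires */
	if ((rv = vio_create_mblks(64, 2048, &pool)) != 0)
		return (rv);

	if ((mp = vio_allocb(pool)) != NULL) {
		/* ... fill in mp and pass it upstream ... */
		freemsg(mp);	/* ultimately calls vio_freeb() */
	}

	/* destroy succeeds only once every mblk is back in the pool */
	while (vio_destroy_mblks(pool) == EBUSY)
		delay(drv_usectohz(10000));

	return (0);
}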
+
+/*
+ * Destroy the pool of mblks. This can only succeed when
+ * all allocated mblks have been returned to the pool.
+ *
+ * It is up to the caller to ensure that no further mblks are
+ * requested from the pool after destroy has been invoked.
+ *
+ * Returns 0 on success, EINVAL if handle is invalid, or
+ * EBUSY if not all mblks reclaimed yet.
+ */
+int
+vio_destroy_mblks(vio_mblk_pool_t *vmplp)
+{
+ if (vmplp == NULL)
+ return (EINVAL);
+
+ /*
+ * We can only destroy the pool once all the mblks have
+ * been reclaimed.
+ */
+ if (vmplp->head != vmplp->tail) {
+ /* some mblks still in use */
+ return (EBUSY);
+ }
+
+ kmem_free(vmplp->basep, vmplp->quelen * sizeof (vio_mblk_t));
+ kmem_free(vmplp->datap, vmplp->quelen * vmplp->mblk_size);
+ kmem_free(vmplp->quep, vmplp->quelen * sizeof (vio_mblk_t *));
+
+ mutex_destroy(&vmplp->hlock);
+ mutex_destroy(&vmplp->tlock);
+
+ kmem_free(vmplp, sizeof (*vmplp));
+
+ return (0);
+}
+
+/*
+ * Allocate an mblk from the free pool if one is available.
+ * Otherwise returns NULL.
+ */
+mblk_t *
+vio_allocb(vio_mblk_pool_t *vmplp)
+{
+ vio_mblk_t *vmp = NULL;
+ mblk_t *mp = NULL;
+ uint32_t head;
+
+ mutex_enter(&vmplp->hlock);
+ head = (vmplp->head + 1) & vmplp->quemask;
+ if (head != vmplp->tail) {
+ /* we have free mblks */
+ vmp = vmplp->quep[vmplp->head];
+ mp = vmp->mp;
+ vmplp->head = head;
+ }
+ mutex_exit(&vmplp->hlock);
+
+ return (mp);
+}
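The head and tail indices advance by masking rather than a modulo, which is why quelen must be a power of 2: quemask = quelen - 1 then has all low bits set. A one-liner illustrating the equivalence (hypothetical helper):

/* For any power-of-2 n: (i + 1) & (n - 1) == (i + 1) % n */
static uint32_t
ring_next(uint32_t i, uint32_t n)
{
	ASSERT(ISP2(n));
	return ((i + 1) & (n - 1));
}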
+
+/*
+ * Return an mblk to the free pool. Invoked when the upper IP
+ * layers do freemsg() etc. on the mblk they were passed.
+ */
+void
+vio_freeb(void *arg)
+{
+ vio_mblk_t *vmp = (vio_mblk_t *)arg;
+ vio_mblk_pool_t *vmplp = vmp->vmplp;
+
+ vmp->mp = desballoc(vmp->datap, vmplp->mblk_size,
+ BPRI_MED, &vmp->reclaim);
+
+ mutex_enter(&vmplp->tlock);
+ vmplp->quep[vmplp->tail] = vmp;
+ vmplp->tail = (vmplp->tail + 1) & vmplp->quemask;
+ mutex_exit(&vmplp->tlock);
+}
diff --git a/usr/src/uts/sun4v/io/vldc.c b/usr/src/uts/sun4v/io/vldc.c
index 6c366c5c59..6b9d48a76c 100644
--- a/usr/src/uts/sun4v/io/vldc.c
+++ b/usr/src/uts/sun4v/io/vldc.c
@@ -408,6 +408,7 @@ i_vldc_mdeg_register(vldc_t *vldcp)
bcopy(nameprop, name, namesz);
VLDC_SET_MDEG_PROP_NAME(pspecp, name);
+ ddi_prop_free(nameprop);
/* copy in the instance property */
VLDC_SET_MDEG_PROP_INST(pspecp, inst);
@@ -728,6 +729,9 @@ i_vldc_close_port(vldc_t *vldcp, uint_t portno)
kmem_free(vport->send_buf, vport->mtu);
kmem_free(vport->recv_buf, vport->mtu);
+ if (strcmp(vport->minorp->sname, VLDC_HVCTL_SVCNAME) == 0)
+ kmem_free(vport->cookie_buf, vldc_max_cookie);
+
vport->status = VLDC_PORT_CLOSED;
return (rv);
@@ -910,6 +914,9 @@ vldc_open(dev_t *devp, int flag, int otyp, cred_t *cred)
vport->recv_buf = kmem_alloc(vport->mtu, KM_SLEEP);
vport->send_buf = kmem_alloc(vport->mtu, KM_SLEEP);
+ if (strcmp(vport->minorp->sname, VLDC_HVCTL_SVCNAME) == 0)
+ vport->cookie_buf = kmem_alloc(vldc_max_cookie, KM_SLEEP);
+
vport->is_stream = B_FALSE; /* assume not a stream */
vport->hanged_up = B_FALSE;
@@ -1057,50 +1064,57 @@ i_vldc_ioctl_read_cookie(vldc_port_t *vport, int vldc_instance, void *arg,
int mode)
{
vldc_data_t copy_info;
- caddr_t buf;
- uint64_t len;
+ uint64_t len, balance, copy_size;
+ caddr_t src_addr, dst_addr;
int rv;
if (ddi_copyin(arg, &copy_info, sizeof (copy_info), mode) == -1) {
return (EFAULT);
}
- len = copy_info.length;
- if (len > vldc_max_cookie) {
- return (EINVAL);
- }
+ len = balance = copy_info.length;
+ src_addr = (caddr_t)copy_info.src_addr;
+ dst_addr = (caddr_t)copy_info.dst_addr;
+ while (balance > 0) {
- /* allocate a temporary buffer */
- buf = kmem_alloc(len, KM_SLEEP);
+ /* get the max amount to be copied */
+ copy_size = MIN(balance, vldc_max_cookie);
- mutex_enter(&vport->minorp->lock);
+ mutex_enter(&vport->minorp->lock);
- D2("i_vldc_ioctl_read_cookie: vldc@%d:%d reading from 0x%lx "
- "size 0x%lx to 0x%lx\n", vldc_instance, vport->number,
- copy_info.dst_addr, copy_info.length, copy_info.src_addr);
+ D2("i_vldc_ioctl_read_cookie: vldc@%d:%d reading from 0x%p "
+ "size 0x%lx to 0x%p\n", vldc_instance, vport->number,
+ dst_addr, copy_size, src_addr);
- /* read from the HV into the temporary buffer */
- rv = ldc_mem_rdwr_pa(vport->ldc_handle, buf, &len,
- (caddr_t)copy_info.dst_addr, LDC_COPY_IN);
- if (rv != 0) {
- DWARN("i_vldc_ioctl_read_cookie: vldc@%d:%d cannot read "
- "address 0x%lx, rv=%d\n", vldc_instance, vport->number,
- copy_info.dst_addr, rv);
- mutex_exit(&vport->minorp->lock);
- kmem_free(buf, copy_info.length);
- return (EFAULT);
- }
+ /* read from the HV into the temporary buffer */
+ rv = ldc_mem_rdwr_pa(vport->ldc_handle, vport->cookie_buf,
+ &copy_size, dst_addr, LDC_COPY_IN);
+ if (rv != 0) {
+ DWARN("i_vldc_ioctl_read_cookie: vldc@%d:%d cannot "
+ "read address 0x%p, rv=%d\n",
+ vldc_instance, vport->number, dst_addr, rv);
+ mutex_exit(&vport->minorp->lock);
+ return (EFAULT);
+ }
- D2("i_vldc_ioctl_read_cookie: vldc@%d:%d read succeeded\n",
- vldc_instance, vport->number);
+ D2("i_vldc_ioctl_read_cookie: vldc@%d:%d read succeeded\n",
+ vldc_instance, vport->number);
- mutex_exit(&vport->minorp->lock);
+ mutex_exit(&vport->minorp->lock);
- /* copy data from temporary buffer out to the caller and free buffer */
- rv = ddi_copyout(buf, (caddr_t)copy_info.src_addr, len, mode);
- kmem_free(buf, copy_info.length);
- if (rv != 0) {
- return (EFAULT);
+ /*
+ * copy data from the temporary buffer out
+ * to the caller
+ */
+ rv = ddi_copyout(vport->cookie_buf, src_addr, copy_size, mode);
+ if (rv != 0) {
+ return (EFAULT);
+ }
+
+ /* adjust balance, source and dest */
+ balance -= copy_size;
+ src_addr += copy_size;
+ dst_addr += copy_size;
}
/* set the structure to reflect outcome */
@@ -1118,54 +1132,58 @@ i_vldc_ioctl_write_cookie(vldc_port_t *vport, int vldc_instance, void *arg,
int mode)
{
vldc_data_t copy_info;
- caddr_t buf;
- uint64_t len;
+ uint64_t len, balance, copy_size;
+ caddr_t src_addr, dst_addr;
int rv;
- if (ddi_copyin((caddr_t)arg, &copy_info,
- sizeof (copy_info), mode) != 0) {
+ if (ddi_copyin(arg, &copy_info, sizeof (copy_info), mode) != 0) {
return (EFAULT);
}
- len = copy_info.length;
- if (len > vldc_max_cookie) {
- return (EINVAL);
- }
-
D2("i_vldc_ioctl_write_cookie: vldc@%d:%d writing 0x%lx size 0x%lx "
"to 0x%lx\n", vldc_instance, vport->number, copy_info.src_addr,
copy_info.length, copy_info.dst_addr);
- /* allocate a temporary buffer */
- buf = kmem_alloc(len, KM_SLEEP);
+ len = balance = copy_info.length;
+ src_addr = (caddr_t)copy_info.src_addr;
+ dst_addr = (caddr_t)copy_info.dst_addr;
+ while (balance > 0) {
- /* copy into the temporary buffer the data to be written to the HV */
- if (ddi_copyin((caddr_t)copy_info.src_addr, buf,
- copy_info.length, mode) != 0) {
- kmem_free(buf, copy_info.length);
- return (EFAULT);
- }
+ /* get the max amount to be copied */
+ copy_size = MIN(balance, vldc_max_cookie);
- mutex_enter(&vport->minorp->lock);
+ /*
+ * copy into the temporary buffer the data
+ * to be written to the HV
+ */
+ if (ddi_copyin((caddr_t)src_addr, vport->cookie_buf,
+ copy_size, mode) != 0) {
+ return (EFAULT);
+ }
- /* write the data from the temporary buffer to the HV */
- rv = ldc_mem_rdwr_pa(vport->ldc_handle, buf, &len,
- (caddr_t)copy_info.dst_addr, LDC_COPY_OUT);
- if (rv != 0) {
- DWARN("i_vldc_ioctl_write_cookie: vldc@%d:%d failed to write at"
- " address 0x%lx\n, rv=%d", vldc_instance, vport->number,
- copy_info.dst_addr, rv);
- mutex_exit(&vport->minorp->lock);
- kmem_free(buf, copy_info.length);
- return (EFAULT);
- }
+ mutex_enter(&vport->minorp->lock);
+
+ /* write the data from the temporary buffer to the HV */
+ rv = ldc_mem_rdwr_pa(vport->ldc_handle, vport->cookie_buf,
+ &copy_size, dst_addr, LDC_COPY_OUT);
+ if (rv != 0) {
+ DWARN("i_vldc_ioctl_write_cookie: vldc@%d:%d "
+ "failed to write at address 0x%p, rv=%d\n",
+ vldc_instance, vport->number, dst_addr, rv);
+ mutex_exit(&vport->minorp->lock);
+ return (EFAULT);
+ }
- D2("i_vldc_ioctl_write_cookie: vldc@%d:%d write succeeded\n",
- vldc_instance, vport->number);
+ D2("i_vldc_ioctl_write_cookie: vldc@%d:%d write succeeded\n",
+ vldc_instance, vport->number);
- mutex_exit(&vport->minorp->lock);
+ mutex_exit(&vport->minorp->lock);
- kmem_free(buf, copy_info.length);
+ /* adjust balance, source and dest */
+ balance -= copy_size;
+ src_addr += copy_size;
+ dst_addr += copy_size;
+ }
/* set the structure to reflect outcome */
copy_info.length = len;
@@ -1315,13 +1333,19 @@ vldc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
break;
case VLDC_IOCTL_READ_COOKIE:
-
+ if (strcmp(vport->minorp->sname, VLDC_HVCTL_SVCNAME)) {
+ rv = EINVAL;
+ break;
+ }
rv = i_vldc_ioctl_read_cookie(vport, instance,
(void *)arg, mode);
break;
case VLDC_IOCTL_WRITE_COOKIE:
-
+ if (strcmp(vport->minorp->sname, VLDC_HVCTL_SVCNAME)) {
+ rv = EINVAL;
+ break;
+ }
rv = i_vldc_ioctl_write_cookie(vport, instance,
(void *)arg, mode);
break;
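
With the per-port cookie_buf allocated at open time, the two cookie ioctls above no
longer reject requests larger than vldc_max_cookie with EINVAL; they stream the
request through the preallocated buffer in vldc_max_cookie-sized slices. A
self-contained sketch of the same chunking pattern, with memcpy() standing in for
the ddi_copyin()/ddi_copyout() and ldc_mem_rdwr_pa() calls that exist only in the
kernel:

	#include <string.h>

	#define	MIN(a, b)	((a) < (b) ? (a) : (b))

	/*
	 * Copy 'len' bytes from 'src' to 'dst' through a fixed-size
	 * bounce buffer, the way the ioctls stream through cookie_buf.
	 */
	static void
	bounce_copy(char *dst, const char *src, size_t len,
	    char *buf, size_t bufsz)
	{
		while (len > 0) {
			size_t n = MIN(len, bufsz);	/* one slice */

			(void) memcpy(buf, src, n);	/* "copyin" */
			(void) memcpy(dst, buf, n);	/* "write to HV" */
			src += n;
			dst += n;
			len -= n;
		}
	}
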
diff --git a/usr/src/uts/sun4v/io/vnet.c b/usr/src/uts/sun4v/io/vnet.c
index 29ebf6bc59..c0cc116beb 100644
--- a/usr/src/uts/sun4v/io/vnet.c
+++ b/usr/src/uts/sun4v/io/vnet.c
@@ -86,7 +86,7 @@ void vnet_tx_update(void *arg);
/* externs */
extern int vgen_init(void *vnetp, dev_info_t *vnetdip, const uint8_t *macaddr,
mac_register_t **vgenmacp);
-extern void vgen_uninit(void *arg);
+extern int vgen_uninit(void *arg);
static mac_callbacks_t vnet_m_callbacks = {
0,
@@ -116,6 +116,7 @@ uint32_t vnet_ldcwd_interval = VNET_LDCWD_INTERVAL; /* watchdog freq in msec */
uint32_t vnet_ldcwd_txtimeout = VNET_LDCWD_TXTIMEOUT; /* tx timeout in msec */
uint32_t vnet_ldc_qlen = VNET_LDC_QLEN; /* ldc qlen */
uint32_t vnet_nfdb_hash = VNET_NFDB_HASH; /* size of fdb hash table */
+uint32_t vnet_nrbufs = VNET_NRBUFS; /* number of receive buffers */
/*
* Property names
@@ -296,8 +297,9 @@ vnetattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
int instance;
int status;
enum { AST_init = 0x0, AST_vnet_alloc = 0x1,
- AST_read_macaddr = 0x2, AST_vgen_init = 0x4,
- AST_vptl_alloc = 0x8, AST_fdbh_alloc = 0x10 }
+ AST_mac_alloc = 0x2, AST_read_macaddr = 0x4,
+ AST_vgen_init = 0x8, AST_vptl_alloc = 0x10,
+ AST_fdbh_alloc = 0x20 }
attach_state;
mac_register_t *vgenmacp = NULL;
uint32_t nfdbh = 0;
@@ -400,7 +402,7 @@ vnet_attach_fail:
RW_EXIT(&vnetp->trwlock);
}
if (attach_state & AST_vgen_init) {
- vgen_uninit(vgenmacp->m_driver);
+ (void) vgen_uninit(vgenmacp->m_driver);
}
if (attach_state & AST_vnet_alloc) {
KMEM_FREE(vnetp);
@@ -418,6 +420,7 @@ vnetdetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
vnet_t **vnetpp;
vp_tl_t *vp_tlp;
int instance;
+ int rv;
instance = ddi_get_instance(dip);
DBG1((NULL, "vnetdetach: instance(%d) enter\n", instance));
@@ -436,6 +439,21 @@ vnetdetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
goto vnet_detach_fail;
}
+ /* uninit and free vnet proxy transports */
+ WRITE_ENTER(&vnetp->trwlock);
+ while ((vp_tlp = vnetp->tlp) != NULL) {
+ if (strcmp(vnetp->vgen_name, vp_tlp->name) == 0) {
+ /* uninitialize generic transport */
+ rv = vgen_uninit(vp_tlp->macp->m_driver);
+ if (rv != DDI_SUCCESS) {
+ RW_EXIT(&vnetp->trwlock);
+ goto vnet_detach_fail;
+ }
+ }
+ vnet_del_vptl(vnetp, vp_tlp);
+ }
+ RW_EXIT(&vnetp->trwlock);
+
/*
* Unregister from the MAC subsystem. This can fail, in
* particular if there are DLPI style-2 streams still open -
@@ -454,17 +472,6 @@ vnetdetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
}
RW_EXIT(&vnet_rw);
- /* uninit and free vnet proxy transports */
- WRITE_ENTER(&vnetp->trwlock);
- while ((vp_tlp = vnetp->tlp) != NULL) {
- if (strcmp(vnetp->vgen_name, vp_tlp->name) == 0) {
- /* uninitialize generic transport */
- vgen_uninit(vp_tlp->macp->m_driver);
- }
- vnet_del_vptl(vnetp, vp_tlp);
- }
- RW_EXIT(&vnetp->trwlock);
-
KMEM_FREE(vnetp);
return (DDI_SUCCESS);
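
The reordering above matters because vgen_uninit() can now fail: receive mblks are
loaned to the IP stack via desballoc(), so a pool may still be referenced when a
channel goes away. Both vnet_gen.c and vsw.c (below) therefore use the same
deferral pattern: if vio_destroy_mblks() returns EBUSY at channel teardown, the
pool is chained onto a per-device list through its nextp field and retried at
device detach, and detach itself fails rather than freeing memory still in use.
A sketch of that pattern (the helper name is hypothetical):

	static void
	defer_pool_destroy(vio_mblk_pool_t *vmplp, vio_mblk_pool_t **listp)
	{
		if (vio_destroy_mblks(vmplp) == EBUSY) {
			/* mblks still loaned upstream; retry at detach */
			vmplp->nextp = *listp;
			*listp = vmplp;
		}
	}
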
diff --git a/usr/src/uts/sun4v/io/vnet_gen.c b/usr/src/uts/sun4v/io/vnet_gen.c
index 9d01b82837..1fdbf79873 100644
--- a/usr/src/uts/sun4v/io/vnet_gen.c
+++ b/usr/src/uts/sun4v/io/vnet_gen.c
@@ -50,8 +50,9 @@
#include <sys/vio_mailbox.h>
#include <sys/vio_common.h>
#include <sys/vnet_common.h>
-#include <sys/vnet_gen.h>
#include <sys/vnet_mailbox.h>
+#include <sys/vio_util.h>
+#include <sys/vnet_gen.h>
/*
* Implementation of the mac functionality for vnet using the
@@ -64,7 +65,7 @@
/* vgen proxy entry points */
int vgen_init(void *vnetp, dev_info_t *vnetdip, const uint8_t *macaddr,
mac_register_t **vgenmacp);
-void vgen_uninit(void *arg);
+int vgen_uninit(void *arg);
static int vgen_start(void *arg);
static void vgen_stop(void *arg);
static mblk_t *vgen_tx(void *arg, mblk_t *mp);
@@ -129,7 +130,6 @@ static int vgen_num_txpending(vgen_ldc_t *ldcp);
static int vgen_tx_dring_full(vgen_ldc_t *ldcp);
static int vgen_ldc_txtimeout(vgen_ldc_t *ldcp);
static void vgen_ldc_watchdog(void *arg);
-static void vgen_copymsg(mblk_t *mp, void *bufp);
static int vgen_setup_kstats(vgen_ldc_t *ldcp);
static void vgen_destroy_kstats(vgen_ldc_t *ldcp);
static int vgen_kstat_update(kstat_t *ksp, int rw);
@@ -145,8 +145,7 @@ static int vgen_send_version_negotiate(vgen_ldc_t *ldcp);
static int vgen_send_attr_info(vgen_ldc_t *ldcp);
static int vgen_send_dring_reg(vgen_ldc_t *ldcp);
static int vgen_send_rdx_info(vgen_ldc_t *ldcp);
-static int vgen_send_dring_data(vgen_ldc_t *ldcp, uint32_t start,
- uint32_t end, uint64_t next_txseq);
+static int vgen_send_dring_data(vgen_ldc_t *ldcp, uint32_t start, int32_t end);
static int vgen_send_mcast_info(vgen_ldc_t *ldcp);
static int vgen_handshake_phase2(vgen_ldc_t *ldcp);
static void vgen_handshake_reset(vgen_ldc_t *ldcp);
@@ -163,6 +162,8 @@ static void vgen_handle_mcast_info(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp);
static void vgen_handle_ctrlmsg(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp);
static void vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp,
mblk_t **headp, mblk_t **tailp);
+static void vgen_send_dring_ack(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp,
+ uint32_t start, int32_t end, uint8_t pstate);
static void vgen_handle_datamsg(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp,
mblk_t **headp, mblk_t **tailp);
static void vgen_handle_errmsg(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp);
@@ -255,6 +256,8 @@ uint32_t vgen_hwd_interval = 1000; /* handshake watchdog freq in msec */
uint32_t vgen_max_hretries = 1; /* max # of handshake retries */
uint32_t vgen_ldcwr_retries = 10; /* max # of ldc_write() retries */
uint32_t vgen_ldcup_retries = 5; /* max # of ldc_up() retries */
+uint32_t vgen_recv_delay = 1; /* delay when rx descr not ready */
+uint32_t vgen_recv_retries = 10; /* retry when rx descr not ready */
#ifdef DEBUG
/* flags to simulate error conditions for debugging */
@@ -303,6 +306,7 @@ extern uint32_t vnet_reclaim_hiwat;
extern uint32_t vnet_ldcwd_interval;
extern uint32_t vnet_ldcwd_txtimeout;
extern uint32_t vnet_ldc_qlen;
+extern uint32_t vnet_nrbufs;
extern int _vnet_dbglevel;
extern void _vnetdebug_printf(void *vnetp, const char *fmt, ...);
@@ -365,13 +369,9 @@ uint32_t vgen_hdbg;
#define HDBG_BAD_SID 0x4
#define HDBG_OUT_STATE 0x8
-#if 0
-/* debug version negotiation, need to redefine VGEN_NUM_VER */
-vgen_ver_t dbg_vgen_versions[VGEN_NUM_VER] =
- { {5, 0}, {3, 0}, {2, 1}, {1, 2}, {1, 1} };
#endif
-#endif
+
/*
* vgen_init() is called by an instance of vnet driver to initialize the
@@ -443,15 +443,17 @@ vgen_init(void *vnetp, dev_info_t *vnetdip, const uint8_t *macaddr,
* Called by vnet to undo the initializations done by vgen_init().
* The handle provided by generic transport during vgen_init() is the argument.
*/
-void
+int
vgen_uninit(void *arg)
{
vgen_t *vgenp = (vgen_t *)arg;
void *vnetp;
int instance;
+ vio_mblk_pool_t *rp, *nrp;
- if (vgenp == NULL)
- return;
+ if (vgenp == NULL) {
+ return (DDI_FAILURE);
+ }
instance = ddi_get_instance(vgenp->vnetdip);
vnetp = vgenp->vnetp;
@@ -466,6 +468,21 @@ vgen_uninit(void *arg)
/* detach all ports from the device */
vgen_detach_ports(vgenp);
+ /*
+ * free any pending rx mblk pools that could not
+ * be freed previously during channel detach.
+ */
+ rp = vgenp->rmp;
+ while (rp != NULL) {
+ nrp = vgenp->rmp = rp->nextp;
+ if (vio_destroy_mblks(rp)) {
+ vgenp->rmp = rp;
+ mutex_exit(&vgenp->lock);
+ return (DDI_FAILURE);
+ }
+ rp = nrp;
+ }
+
/* free multicast table */
kmem_free(vgenp->mctab, vgenp->mcsize * sizeof (struct ether_addr));
@@ -478,6 +495,8 @@ vgen_uninit(void *arg)
KMEM_FREE(vgenp);
DBG1((vnetp, "vgen_uninit: exit vnet_instance(%d)\n", instance));
+
+ return (DDI_SUCCESS);
}
/* enable transmit/receive for the device */
@@ -536,17 +555,14 @@ vgen_portsend(vgen_port_t *portp, mblk_t *mp)
{
vgen_ldclist_t *ldclp;
vgen_ldc_t *ldcp;
- vgen_t *vgenp;
int status;
- vgenp = portp->vgenp;
ldclp = &portp->ldclist;
READ_ENTER(&ldclp->rwlock);
/*
- * XXX - for now, we have a single channel.
+ * NOTE: for now, we will assume we have a single channel.
*/
if (ldclp->headp == NULL) {
- DWARN((vgenp->vnetp, "vgen_portsend: dropping packet\n"));
RW_EXIT(&ldclp->rwlock);
return (VGEN_FAILURE);
}
@@ -554,15 +570,12 @@ vgen_portsend(vgen_port_t *portp, mblk_t *mp)
if (ldcp->need_resched) {
/* out of tx resources, see vgen_ldcsend() for details. */
- DWARN((vgenp->vnetp, "vgen_portsend: dropping packet...\n"));
-
mutex_enter(&ldcp->txlock);
ldcp->statsp->tx_no_desc++;
mutex_exit(&ldcp->txlock);
RW_EXIT(&ldclp->rwlock);
- freemsg(mp);
- return (VGEN_SUCCESS);
+ return (VGEN_FAILURE);
}
status = vgen_ldcsend(ldcp, mp);
@@ -581,10 +594,7 @@ vgen_ldcsend(vgen_ldc_t *ldcp, mblk_t *mp)
void *vnetp;
size_t size;
int rv;
- uint32_t i;
- uint32_t start;
- uint32_t end;
- int txpending = 0;
+ uint64_t tbuf_ix;
vgen_private_desc_t *tbufp;
vgen_private_desc_t *ntbufp;
vnet_public_desc_t *txdp;
@@ -593,8 +603,10 @@ vgen_ldcsend(vgen_ldc_t *ldcp, mblk_t *mp)
struct ether_header *ehp;
boolean_t is_bcast = B_FALSE;
boolean_t is_mcast = B_FALSE;
- boolean_t reclaim = B_FALSE;
boolean_t need_intr = B_FALSE;
+ size_t mblksz;
+ caddr_t dst;
+ mblk_t *bp;
vnetp = LDC_TO_VNET(ldcp);
statsp = ldcp->statsp;
@@ -633,60 +645,33 @@ vgen_ldcsend(vgen_ldc_t *ldcp, mblk_t *mp)
*/
tbufp = ldcp->next_tbufp;
ntbufp = NEXTTBUF(ldcp, tbufp);
- if (tbufp->flags != VGEN_PRIV_DESC_FREE ||
- ntbufp == ldcp->cur_tbufp) { /* out of tbufs/txds */
+ if (ntbufp == ldcp->cur_tbufp) { /* out of tbufs/txds */
mutex_enter(&ldcp->tclock);
- if (ntbufp == ldcp->cur_tbufp)
+ if (ntbufp == ldcp->cur_tbufp) {
ldcp->need_resched = B_TRUE;
- mutex_exit(&ldcp->tclock);
+ mutex_exit(&ldcp->tclock);
- statsp->tx_no_desc++;
- mutex_exit(&ldcp->txlock);
-#ifdef VGEN_USE_MAC_TX_UPDATE
- /*
- * This cflag is disabled by default. This can be enabled if we
- * want to return failure to the mac layer when we run out of
- * descriptors and use mac_tx_update() to restart tx when
- * descriptors become available. However, stopping tx would
- * affect traffic going over other ports, as upper mac layer
- * has no concept of multiple ports within a device.
- * So currently, to avoid this, drop packets when we run out
- * of descrs and just return success. See the corresponding
- * code in vgen_portsend() and vgen_reclaim_dring().
- */
- return (VGEN_TX_NORESOURCES);
-#else
- freemsg(mp); /* drop the packet */
- return (VGEN_TX_SUCCESS);
-#endif
+ statsp->tx_no_desc++;
+ mutex_exit(&ldcp->txlock);
+
+ return (VGEN_TX_NORESOURCES);
+ }
+ mutex_exit(&ldcp->tclock);
}
if (size < ETHERMIN)
size = ETHERMIN;
/* copy data into pre-allocated transmit buffer */
- vgen_copymsg(mp, tbufp->datap);
-
- txpending = vgen_num_txpending(ldcp);
- if (txpending >= ldcp->reclaim_hiwat) {
- /*
- * if num of pending transmits is more than hiwat,
- * reclaim now and also enable ack bit.
- */
- reclaim = B_TRUE;
- need_intr = B_TRUE;
- } else {
- if (txpending >= ldcp->reclaim_lowat) {
- /*
- * if the num of pending transmits is more than lowat
- * enable ack bit in the descr and reclaim in intr().
- */
- need_intr = B_TRUE;
- }
+ dst = tbufp->datap + VNET_IPALIGN;
+ for (bp = mp; bp != NULL; bp = bp->b_cont) {
+ mblksz = MBLKL(bp);
+ bcopy(bp->b_rptr, dst, mblksz);
+ dst += mblksz;
}
- i = tbufp - ldcp->tbufp;
+ tbuf_ix = tbufp - ldcp->tbufp;
ehp = (struct ether_header *)tbufp->datap;
is_bcast = IS_BROADCAST(ehp);
@@ -694,38 +679,40 @@ vgen_ldcsend(vgen_ldc_t *ldcp, mblk_t *mp)
tbufp->flags = VGEN_PRIV_DESC_BUSY;
tbufp->datalen = size;
- tbufp->seqnum = ldcp->next_txseq;
/* initialize the corresponding public descriptor (txd) */
txdp = tbufp->descp;
hdrp = &txdp->hdr;
- hdrp->dstate = VIO_DESC_READY;
if (need_intr)
hdrp->ack = B_TRUE;
txdp->nbytes = size;
txdp->ncookies = tbufp->ncookies;
bcopy((tbufp->memcookie), (txdp->memcookie),
- tbufp->ncookies * sizeof (ldc_mem_cookie_t));
+ tbufp->ncookies * sizeof (ldc_mem_cookie_t));
+ hdrp->dstate = VIO_DESC_READY;
/* send dring datamsg to the peer */
- start = end = i;
- rv = vgen_send_dring_data(ldcp, start, end, ldcp->next_txseq);
- if (rv != 0) {
- /* vgen_send_dring_data() error: drop the packet */
- DWARN((vnetp,
- "vgen_ldcsend: vgen_send_dring_data(): failed: "
- "id(%lx) rv(%d) len (%d)\n", ldcp->ldc_id, rv, size));
- tbufp->flags = VGEN_PRIV_DESC_FREE; /* free tbuf */
- hdrp->dstate = VIO_DESC_FREE; /* free txd */
- hdrp->ack = B_FALSE;
- statsp->oerrors++;
- goto vgen_tx_exit;
+ if (ldcp->resched_peer) {
+ rv = vgen_send_dring_data(ldcp, (uint32_t)tbuf_ix, -1);
+ if (rv != 0) {
+ /* vgen_send_dring_data() error: drop the packet */
+ DWARN((vnetp,
+ "vgen_ldcsend: vgen_send_dring_data(): failed: "
+ "id(%lx) rv(%d) len (%d)\n",
+ ldcp->ldc_id, rv, size));
+ tbufp->flags = VGEN_PRIV_DESC_FREE; /* free tbuf */
+ hdrp->dstate = VIO_DESC_FREE; /* free txd */
+ hdrp->ack = B_FALSE;
+ statsp->oerrors++;
+ goto vgen_tx_exit;
+ }
+ ldcp->resched_peer = B_FALSE;
}
/* update next available tbuf in the ring */
ldcp->next_tbufp = ntbufp;
- /* update tx seqnum and index */
- ldcp->next_txseq++;
+
+ /* update tx index */
INCR_TXI(ldcp->next_txi, ldcp);
/* update stats */
@@ -739,9 +726,6 @@ vgen_ldcsend(vgen_ldc_t *ldcp, mblk_t *mp)
vgen_tx_exit:
mutex_exit(&ldcp->txlock);
- if (reclaim) {
- vgen_reclaim(ldcp);
- }
DBG1((vnetp, "vgen_ldcsend: exit: ldcid (%lx)\n", ldcp->ldc_id));
freemsg(mp);
@@ -1528,7 +1512,8 @@ vgen_ldc_attach(vgen_port_t *portp, uint64_t ldc_id)
ldc_status_t istatus;
enum {AST_init = 0x0, AST_ldc_alloc = 0x1,
AST_mutex_init = 0x2, AST_ldc_init = 0x4,
- AST_ldc_reg_cb = 0x8, AST_alloc_tx_ring = 0x10}
+ AST_ldc_reg_cb = 0x8, AST_alloc_tx_ring = 0x10,
+ AST_create_rxmblks = 0x20}
attach_state;
attach_state = AST_init;
@@ -1584,6 +1569,16 @@ vgen_ldc_attach(vgen_port_t *portp, uint64_t ldc_id)
}
attach_state |= AST_alloc_tx_ring;
+ /* allocate receive resources */
+ ldcp->num_rbufs = vnet_nrbufs;
+ ldcp->rmp = NULL;
+ status = vio_create_mblks(ldcp->num_rbufs, VGEN_DBLK_SZ,
+ &(ldcp->rmp));
+ if (status != 0) {
+ goto ldc_attach_failed;
+ }
+ attach_state |= AST_create_rxmblks;
+
/* Setup kstats for the channel */
status = vgen_setup_kstats(ldcp);
if (status != VGEN_SUCCESS) {
@@ -1605,6 +1600,9 @@ vgen_ldc_attach(vgen_port_t *portp, uint64_t ldc_id)
return (DDI_SUCCESS);
ldc_attach_failed:
+ if (attach_state & AST_create_rxmblks) {
+ (void) vio_destroy_mblks(ldcp->rmp);
+ }
if (attach_state & AST_alloc_tx_ring) {
vgen_free_tx_ring(ldcp);
}
@@ -1661,8 +1659,21 @@ vgen_ldc_detach(vgen_ldc_t *ldcp)
ldcp->flags &= ~(CHANNEL_ATTACHED);
vgen_destroy_kstats(ldcp);
+
+ /* free receive resources */
+ if (vio_destroy_mblks(ldcp->rmp)) {
+ /*
+ * if we cannot reclaim all mblks, put this
+ * on the list of pools to be reclaimed when the
+ * device gets detached (see vgen_uninit()).
+ */
+ ldcp->rmp->nextp = vgenp->rmp;
+ vgenp->rmp = ldcp->rmp;
+ }
+
/* free transmit resources */
vgen_free_tx_ring(ldcp);
+
(void) ldc_unreg_callback(ldcp->ldc_handle);
(void) ldc_fini(ldcp->ldc_handle);
mutex_destroy(&ldcp->tclock);
@@ -1825,7 +1836,7 @@ vgen_ldc_init(vgen_ldc_t *ldcp)
LDC_SHADOW_MAP, LDC_MEM_RW, &ldcp->tx_dcookie, &ncookies);
if (rv != 0) {
DWARN((vnetp, "vgen_ldcinit: id (%lx) "
- "ldc_mem_dring_bind failed\n", ldcp->ldc_id));
+ "ldc_mem_dring_bind failed rv(%x)\n", ldcp->ldc_id, rv));
goto ldcinit_failed;
}
@@ -1952,7 +1963,7 @@ vgen_init_tbufs(vgen_ldc_t *ldcp)
bzero(ldcp->tbufp, sizeof (*tbufp) * (ldcp->num_txds));
bzero(ldcp->txdp, sizeof (*txdp) * (ldcp->num_txds));
- datap = kmem_zalloc(ldcp->num_txds * VGEN_TX_DBLK_SZ, KM_SLEEP);
+ datap = kmem_zalloc(ldcp->num_txds * VGEN_DBLK_SZ, KM_SLEEP);
ldcp->tx_datap = datap;
/*
@@ -1976,7 +1987,7 @@ vgen_init_tbufs(vgen_ldc_t *ldcp)
*/
ci = ncookies = 0;
rv = ldc_mem_bind_handle(tbufp->memhandle,
- (caddr_t)datap, VGEN_TX_DBLK_SZ, LDC_SHADOW_MAP,
+ (caddr_t)datap, VGEN_DBLK_SZ, LDC_SHADOW_MAP,
LDC_MEM_R, &(tbufp->memcookie[ci]), &ncookies);
if (rv != 0) {
goto init_tbufs_failed;
@@ -1989,20 +2000,20 @@ vgen_init_tbufs(vgen_ldc_t *ldcp)
tbufp->datap = datap;
if ((ncookies == 0) ||
- (ncookies > (uint64_t)MAX_COOKIES)) {
+ (ncookies > MAX_COOKIES)) {
goto init_tbufs_failed;
}
for (ci = 1; ci < ncookies; ci++) {
rv = ldc_mem_nextcookie(tbufp->memhandle,
- &(tbufp->memcookie[ci]));
+ &(tbufp->memcookie[ci]));
if (rv != 0) {
goto init_tbufs_failed;
}
}
tbufp->ncookies = ncookies;
- datap += VGEN_TX_DBLK_SZ;
+ datap += VGEN_DBLK_SZ;
tbufp->flags = VGEN_PRIV_DESC_FREE;
txdp = &(ldcp->txdp[i]);
@@ -2021,6 +2032,8 @@ vgen_init_tbufs(vgen_ldc_t *ldcp)
ldcp->next_txseq = VNET_ISS;
ldcp->next_txi = 0;
+ ldcp->resched_peer = B_TRUE;
+
return (DDI_SUCCESS);
init_tbufs_failed:;
@@ -2060,7 +2073,7 @@ vgen_uninit_tbufs(vgen_ldc_t *ldcp)
if (ldcp->tx_datap) {
/* prealloc'd tx data buffer */
- kmem_free(ldcp->tx_datap, ldcp->num_txds * VGEN_TX_DBLK_SZ);
+ kmem_free(ldcp->tx_datap, ldcp->num_txds * VGEN_DBLK_SZ);
ldcp->tx_datap = NULL;
}
@@ -2104,6 +2117,9 @@ vgen_clobber_tbufs(vgen_ldc_t *ldcp)
/* reset tx seqnum and index */
ldcp->next_txseq = VNET_ISS;
ldcp->next_txi = 0;
+
+ ldcp->resched_peer = B_TRUE;
+
#ifdef DEBUG
DBG2((vnetp,
"vgen_clobber_tbufs: id(0x%lx) num descrs done (%d)\n",
@@ -2738,8 +2754,7 @@ vgen_send_rdx_info(vgen_ldc_t *ldcp)
/* send descriptor ring data message to the peer over ldc */
static int
-vgen_send_dring_data(vgen_ldc_t *ldcp, uint32_t start, uint32_t end,
- uint64_t next_txseq)
+vgen_send_dring_data(vgen_ldc_t *ldcp, uint32_t start, int32_t end)
{
vio_dring_msg_t dringmsg, *msgp = &dringmsg;
vio_msg_tag_t *tagp = &msgp->tag;
@@ -2753,7 +2768,7 @@ vgen_send_dring_data(vgen_ldc_t *ldcp, uint32_t start, uint32_t end,
tagp->vio_subtype_env = VIO_DRING_DATA;
tagp->vio_sid = ldcp->local_sid;
- msgp->seq_num = next_txseq;
+ msgp->seq_num = ldcp->next_txseq;
msgp->dring_ident = ldcp->local_hparams.dring_ident;
msgp->start_idx = start;
msgp->end_idx = end;
@@ -2765,6 +2780,9 @@ vgen_send_dring_data(vgen_ldc_t *ldcp, uint32_t start, uint32_t end,
return (VGEN_FAILURE);
}
+ ldcp->next_txseq++;
+ ldcp->statsp->dring_data_msgs++;
+
DBG2((vnetp, "vgen_send_dring_data: DRING_DATA_SENT id (%lx)\n",
ldcp->ldc_id));
@@ -2898,14 +2916,6 @@ vgen_reset_hphase(vgen_ldc_t *ldcp)
*/
bzero(&(ldcp->local_hparams), sizeof (ldcp->local_hparams));
-#ifdef DEBUG
-#if 0
- if (vgen_hdbg & HDBG_VERSION) {
- bcopy(dbg_vgen_versions, ldcp->vgen_versions,
- sizeof (ldcp->vgen_versions));
- }
-#endif
-#endif
/* set version to the highest version supported */
ldcp->local_hparams.ver_major =
ldcp->vgen_versions[0].ver_major;
@@ -2921,12 +2931,6 @@ vgen_reset_hphase(vgen_ldc_t *ldcp)
ldcp->local_hparams.xfer_mode = VIO_DRING_MODE;
ldcp->local_hparams.ack_freq = 0; /* don't need acks */
-#ifdef DEBUG
-#if 0
- vgen_print_attr_info(ldcp, VGEN_LOCAL);
-#endif
-#endif
-
/*
* set dring_info params.
* Note: dring is already created and bound.
@@ -3469,11 +3473,6 @@ vgen_handle_attr_info(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp)
ldcp->hstate |= ATTR_ACK_SENT;
DBG2((vnetp, "vgen_handle_attr_info:"
" ATTR_ACK_SENT id(%lx)\n", ldcp->ldc_id));
-#ifdef DEBUG
-#if 0
- vgen_print_attr_info(ldcp, VGEN_PEER);
-#endif
-#endif
} else {
/* failed */
DWARN((vnetp, "vgen_handle_attr_info:"
@@ -3838,6 +3837,24 @@ vgen_handle_datamsg(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp,
}
static void
+vgen_send_dring_ack(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp, uint32_t start,
+ int32_t end, uint8_t pstate)
+{
+ vio_dring_msg_t *msgp = (vio_dring_msg_t *)tagp;
+ void *vnetp = LDC_TO_VNET(ldcp);
+
+ tagp->vio_subtype = VIO_SUBTYPE_ACK;
+ tagp->vio_sid = ldcp->local_sid;
+ msgp->start_idx = start;
+ msgp->end_idx = end;
+ msgp->dring_process_state = pstate;
+ if (vgen_sendmsg(ldcp, (caddr_t)tagp, sizeof (*msgp), B_FALSE)) {
+ DWARN((vnetp, "vgen_send_dring_ack: id(%lx) vgen_sendmsg "
+ "failed\n", (ldcp)->ldc_id));
+ }
+}
+
+static void
vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp,
mblk_t **headp, mblk_t **tailp)
{
@@ -3854,22 +3871,25 @@ vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp,
size_t nread;
uint64_t off = 0;
uint32_t start;
- uint32_t end;
+ int32_t end;
uint32_t datalen;
uint32_t ncookies;
- uint32_t sync_start;
- uint32_t sync_end;
+ uint32_t ack_start;
+ uint32_t ack_end;
uint32_t rxi;
uint32_t txi;
int rv;
boolean_t rxd_err = B_FALSE;
- boolean_t sync_done = B_FALSE;
+ boolean_t set_ack_start = B_FALSE;
+ vgen_private_desc_t *tbufp;
+ uint32_t next_rxi;
+ boolean_t ready_txd = B_FALSE;
+ uint32_t retries = 0;
#ifdef VGEN_HANDLE_LOST_PKTS
int n;
#endif
#ifdef VGEN_REXMIT
uint64_t seqnum;
- vgen_private_desc_t *tbufp;
#endif
void *vnetp = LDC_TO_VNET(ldcp);
@@ -3895,7 +3915,8 @@ vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp,
start, end));
/* validate rx start and end indices */
- if (!(CHECK_RXI(start, ldcp)) || !(CHECK_RXI(end, ldcp))) {
+ if (!(CHECK_RXI(start, ldcp)) || ((end != -1) &&
+ !(CHECK_RXI(end, ldcp)))) {
/* drop the message if invalid index */
break;
}
@@ -3930,7 +3951,7 @@ vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp,
}
/*
- * Starting sequence number of the received packets
+ * The sequence number of the dring data message
* is less than the next sequence number that
* is expected:
*
@@ -3950,7 +3971,7 @@ vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp,
}
/*
- * Starting sequence number of the received packets
+ * The sequence number of the dring data message
* is greater than the next expected sequence number
*
* send a NACK back to the peer to indicate lost
@@ -3976,8 +3997,10 @@ vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp,
#ifdef VGEN_REXMIT
/*
* stop further processing until peer
- * retransmits with the right index and seqnum.
+ * retransmits with the right index.
+ * update next_rxseq expected.
*/
+ ldcp->next_rxseq += 1;
break;
#else /* VGEN_REXMIT */
/*
@@ -3987,12 +4010,12 @@ vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp,
* from the new start index.
*/
ldcp->next_rxi = start;
- ldcp->next_rxseq += n;
+ ldcp->next_rxseq += 1;
#endif /* VGEN_REXMIT */
} else if (dringmsg->seq_num == ldcp->next_rxseq) {
/*
- * expected and starting seqnums match, but
+ * expected and received seqnums match, but
* the descriptor indices don't?
*
* restart handshake with peer.
@@ -4003,11 +4026,6 @@ vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp,
ldcp->ldc_id, ldcp->next_rxseq,
dringmsg->seq_num));
-#if 0
- vgen_handshake_retry(ldcp);
- break;
-#endif
-
}
} else {
@@ -4022,50 +4040,89 @@ vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp,
"next_rxseq(0x%lx) != seq_num(0x%lx)\n",
ldcp->ldc_id, ldcp->next_rxseq,
dringmsg->seq_num));
-
-#if 0
- vgen_handshake_retry(ldcp);
- break;
-#endif
}
}
#endif /* VGEN_HANDLE_LOST_PKTS */
/*
- * Start processing the descriptor range, specified
- * in the dring data msg.
+ * Start processing the descriptors from the specified start
+ * index, until we find a descriptor that is not ready to be
+ * processed, or until we process the entire descriptor ring
+ * and wrap around to the start index.
*/
- if (ldc_mem_dring_acquire(ldcp->rx_dhandle, start, end)) {
- DWARN((vnetp, "vgen_handle_dring_data: "
- "id(%lx), ldc_mem_dring_acquire() failed\n",
- ldcp->ldc_id));
- statsp->ierrors++;
- }
- rxi = start;
- sync_start = start;
+
+ /* need to set the start index of descriptors to be ack'd */
+ set_ack_start = B_TRUE;
+
+ /* index up to which we have ack'd */
+ ack_end = start;
+ DECR_RXI(ack_end, ldcp);
+
+ next_rxi = rxi = start;
do {
- /* recv packets from 'start' to 'end' */
+
+vgen_recv_retry: if (ldc_mem_dring_acquire(ldcp->rx_dhandle,
+ rxi, rxi)) {
+ DWARN((vnetp, "vgen_handle_dring_data: "
+ "id(%lx), ldc_mem_dring_acquire() failed\n",
+ ldcp->ldc_id));
+ statsp->ierrors++;
+ break;
+ }
rxdp = &(ldcp->rxdp[rxi]);
hdrp = &rxdp->hdr;
+ if (hdrp->dstate != VIO_DESC_READY) {
+ /*
+ * The descriptor is not ready; retry the
+ * descriptor acquire, and stop processing after
+ * the max # of retries.
+ */
+ if (retries == vgen_recv_retries)
+ break;
+ retries++;
+ drv_usecwait(vgen_recv_delay);
+ goto vgen_recv_retry;
+ }
+ retries = 0;
+
+ if (set_ack_start) {
+ /*
+ * initialize the start index of the range
+ * of descriptors to be ack'd.
+ */
+ ack_start = rxi;
+ set_ack_start = B_FALSE;
+ }
+
datalen = rxdp->nbytes;
ncookies = rxdp->ncookies;
if ((datalen < ETHERMIN) ||
(ncookies == 0) ||
- (ncookies > (uint64_t)MAX_COOKIES) ||
- (hdrp->dstate != VIO_DESC_READY)) {
+ (ncookies > MAX_COOKIES)) {
rxd_err = B_TRUE;
} else {
/*
- * The data buffer returned by allocb(9F) is
- * 8byte aligned. We allocate extra 8 bytes to
- * ensure size is multiple of 8 bytes for
- * ldc_mem_copy().
+ * Try to allocate an mblk from the free pool
+ * of recv mblks for the channel.
+ * If this fails, use allocb().
*/
- mp = allocb(datalen + 8, BPRI_MED);
- nbytes = (datalen + 7) & ~7;
+ mp = vio_allocb(ldcp->rmp);
+ if (!mp) {
+ /*
+ * The data buffer returned by
+ * allocb(9F) is 8byte aligned. We
+ * allocate extra 8 bytes to ensure
+ * size is multiple of 8 bytes for
+ * ldc_mem_copy().
+ */
+ statsp->rx_vio_allocb_fail++;
+ mp = allocb(VNET_IPALIGN + datalen + 8,
+ BPRI_MED);
+ }
+ nbytes = (VNET_IPALIGN + datalen + 7) & ~7;
}
if ((rxd_err) || (mp == NULL)) {
/*
@@ -4082,35 +4139,22 @@ vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp,
/* set descriptor done bit */
hdrp->dstate = VIO_DESC_DONE;
+ (void) ldc_mem_dring_release(ldcp->rx_dhandle,
+ rxi, rxi);
+
if (hdrp->ack) {
/*
- * sender needs ack for this packet.
- * sync pkts upto this index and
- * send the ack to the peer.
+ * sender needs ack for this packet,
+ * ack pkts up to this index.
*/
- sync_end = rxi;
- (void) ldc_mem_dring_release(
- ldcp->rx_dhandle, sync_start,
- sync_end);
- tagp->vio_subtype = VIO_SUBTYPE_ACK;
- tagp->vio_sid = ldcp->local_sid;
- dringmsg = (vio_dring_msg_t *)tagp;
- dringmsg->start_idx = sync_start;
- dringmsg->end_idx = sync_end;
- if (vgen_sendmsg(ldcp, (caddr_t)tagp,
- sizeof (*dringmsg), B_FALSE)) {
- DWARN((vnetp,
- "vgen_handle_dring_data: "
- "id(%lx) vgen_sendmsg "
- "failed, stype: ACK\n",
- ldcp->ldc_id));
- }
- /* save new sync index start */
- if (sync_end != end) {
- INCR_RXI(sync_end, ldcp);
- sync_start = sync_end;
- } else
- sync_done = B_TRUE;
+ ack_end = rxi;
+
+ vgen_send_dring_ack(ldcp, tagp,
+ ack_start, ack_end,
+ VIO_DP_ACTIVE);
+
+ /* need to set new ack start index */
+ set_ack_start = B_TRUE;
}
goto vgen_next_rxi;
}
@@ -4123,34 +4167,25 @@ vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp,
/* set done bit irrespective of rv of ldc_mem_copy() */
hdrp->dstate = VIO_DESC_DONE;
+ (void) ldc_mem_dring_release(ldcp->rx_dhandle,
+ rxi, rxi);
+
+ mp->b_rptr += VNET_IPALIGN;
+
if (hdrp->ack) {
/*
- * sender needs ack for this packet.
- * sync pkts upto this index and
- * send the ack to the peer.
+ * sender needs ack for this packet,
+ * ack pkts up to this index.
*/
- sync_end = rxi;
- (void) ldc_mem_dring_release(ldcp->rx_dhandle,
- sync_start, sync_end);
- tagp->vio_subtype = VIO_SUBTYPE_ACK;
- tagp->vio_sid = ldcp->local_sid;
- dringmsg = (vio_dring_msg_t *)tagp;
- dringmsg->start_idx = sync_start;
- dringmsg->end_idx = sync_end;
- if (vgen_sendmsg(ldcp, (caddr_t)tagp,
- sizeof (*dringmsg), B_FALSE)) {
- DWARN((vnetp,
- "vgen_handle_dring_data: id(%lx) "
- "vgen_sendmsg failed stype: ACK\n",
- ldcp->ldc_id));
- }
- /* save new sync index start */
- if (sync_end != end) {
- INCR_RXI(sync_end, ldcp);
- sync_start = sync_end;
- } else
- sync_done = B_TRUE;
+ ack_end = rxi;
+
+ vgen_send_dring_ack(ldcp, tagp,
+ ack_start, ack_end, VIO_DP_ACTIVE);
+
+ /* need to set new ack start index */
+ set_ack_start = B_TRUE;
}
+
/* if ldc_mem_copy() failed */
if (rv) {
DWARN((vnetp,
@@ -4194,32 +4229,49 @@ vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp,
bpt = mp;
}
-vgen_next_rxi: if (rxi == end) {
+
+vgen_next_rxi:
+ /* update end index of range of descrs to be ack'd */
+ ack_end = rxi;
+
+ /* update the next index to be processed */
+ INCR_RXI(next_rxi, ldcp);
+ if (next_rxi == start) {
+ /*
+ * processed the entire descriptor ring up to
+ * the index at which we started.
+ */
break;
}
- /* increment recv index */
- INCR_RXI(rxi, ldcp);
+
+ rxi = next_rxi;
_NOTE(CONSTCOND)
} while (1);
- if (!sync_done) {
- /* sync remote descriptor range */
- sync_end = rxi;
- (void) ldc_mem_dring_release(ldcp->rx_dhandle,
- sync_start, sync_end);
- DBG2((vnetp,
- "vgen_handle_dring_data: not sending ACK\n"));
+ /*
+ * send an ack message to peer indicating that we have stopped
+ * processing descriptors.
+ */
+ if (set_ack_start) {
+ /*
+ * We have ack'd up to some index and we have not
+ * processed any descriptors beyond that index.
+ * Use the last ack'd index as both the start and
+ * end of range of descrs being ack'd.
+ * Note: This results in acking the last index twice
+ * and should be harmless.
+ */
+ ack_start = ack_end;
}
- /* save new recv index */
- INCR_RXI(rxi, ldcp);
- ldcp->next_rxi = rxi;
- ldcp->next_rxseq += ((end >= start) ?
- ((end - start) + 1) : (start - end));
+ vgen_send_dring_ack(ldcp, tagp, ack_start, ack_end,
+ VIO_DP_STOPPED);
+
+ /* save new recv index and expected seqnum of next dring msg */
+ ldcp->next_rxi = next_rxi;
+ ldcp->next_rxseq += 1;
- /* try to reclaim transmit descrs also */
- vgen_reclaim(ldcp);
break;
case VIO_SUBTYPE_ACK:
@@ -4228,6 +4280,7 @@ vgen_next_rxi: if (rxi == end) {
* which we had set the ACK bit in the descriptor (during
* transmit). This enables us to reclaim descriptors.
*/
+
DBG2((vnetp,
"vgen_handle_dring_data: ACK: start(%d), end(%d)\n",
start, end));
@@ -4243,7 +4296,94 @@ vgen_next_rxi: if (rxi == end) {
break;
}
statsp->dring_data_acks++;
+
+ /* reclaim descriptors that are done */
vgen_reclaim(ldcp);
+
+ if (dringmsg->dring_process_state != VIO_DP_STOPPED) {
+ /*
+ * receiver continued processing descriptors after
+ * sending us the ack.
+ */
+ break;
+ }
+
+ statsp->dring_stopped_acks++;
+
+ /* receiver stopped processing descriptors */
+ mutex_enter(&ldcp->txlock);
+ mutex_enter(&ldcp->tclock);
+
+ /*
+ * determine if there are any pending tx descriptors
+ * ready to be processed by the receiver (peer) and, if so,
+ * send a message to the peer to restart receiving.
+ */
+ ready_txd = B_FALSE;
+
+ /*
+ * using the end index of the descriptor range for which
+ * we received the ack, check if the next descriptor is
+ * ready.
+ */
+ txi = end;
+ INCR_TXI(txi, ldcp);
+ tbufp = &ldcp->tbufp[txi];
+ txdp = tbufp->descp;
+ hdrp = &txdp->hdr;
+ if (hdrp->dstate == VIO_DESC_READY) {
+ ready_txd = B_TRUE;
+ } else {
+ /*
+ * The descr next to the end of the ack'd descr
+ * range is not ready.
+ * Starting from the current reclaim index, check
+ * if any descriptor is ready.
+ */
+
+ txi = ldcp->cur_tbufp - ldcp->tbufp;
+ tbufp = &ldcp->tbufp[txi];
+
+ while (tbufp != ldcp->next_tbufp) {
+
+ txdp = tbufp->descp;
+ hdrp = &txdp->hdr;
+ if (hdrp->dstate == VIO_DESC_READY) {
+ break;
+ }
+
+ INCR_TXI(txi, ldcp);
+ tbufp = &ldcp->tbufp[txi];
+
+ }
+
+ if (tbufp != ldcp->next_tbufp)
+ ready_txd = B_TRUE;
+ }
+
+ if (ready_txd) {
+ /*
+ * we have tx descriptor(s) ready to be
+ * processed by the receiver.
+ * send a message to the peer with the start index
+ * of ready descriptors.
+ */
+ rv = vgen_send_dring_data(ldcp, txi, -1);
+ if (rv != 0) {
+ ldcp->resched_peer = B_TRUE;
+ }
+ } else {
+ /*
+ * no ready tx descriptors; set the flag so that the
+ * transmit routine sends a message to the peer when
+ * tx descriptors become ready.
+ */
+ ldcp->resched_peer = B_TRUE;
+ }
+
+ mutex_exit(&ldcp->tclock);
+ mutex_exit(&ldcp->txlock);
+
break;
case VIO_SUBTYPE_NACK:
@@ -4281,9 +4421,7 @@ vgen_next_rxi: if (rxi == end) {
/* send a new dring data msg including the lost descrs */
end = ldcp->next_tbufp - ldcp->tbufp;
DECR_TXI(end, ldcp);
- seqnum = ldcp->tbufp[start].seqnum;
- /* no need to increment ldcp->next_txseq as this is rexmit */
- rv = vgen_send_dring_data(ldcp, start, end, seqnum);
+ rv = vgen_send_dring_data(ldcp, start, end);
if (rv != 0) {
/*
* vgen_send_dring_data() error: drop all packets
@@ -4305,7 +4443,6 @@ vgen_next_rxi: if (rxi == end) {
/* update next pointer */
ldcp->next_tbufp = &(ldcp->tbufp[start]);
- ldcp->next_txseq = seqnum;
ldcp->next_txi = start;
}
DBG2((vnetp,
@@ -4324,23 +4461,23 @@ vgen_next_rxi: if (rxi == end) {
mutex_exit(&ldcp->tclock);
mutex_exit(&ldcp->txlock);
- vgen_reclaim(ldcp);
-
break;
}
DBG1((vnetp, "vgen_handle_dring_data: exit\n"));
*headp = bp;
*tailp = bpt;
+
}
static void
vgen_reclaim(vgen_ldc_t *ldcp)
{
- if (mutex_tryenter(&ldcp->tclock) == 0)
- return; /* already in progress */
+ mutex_enter(&ldcp->tclock);
+
vgen_reclaim_dring(ldcp);
ldcp->reclaim_lbolt = ddi_get_lbolt();
+
mutex_exit(&ldcp->tclock);
}
@@ -4355,9 +4492,7 @@ vgen_reclaim_dring(vgen_ldc_t *ldcp)
vnet_public_desc_t *txdp;
vgen_private_desc_t *tbufp;
vio_dring_entry_hdr_t *hdrp;
-#ifdef VGEN_USE_MAC_TX_UPDATE
- vgen_t *vgenp = (vgen_t *)ldcp->vgenp;
-#endif
+ vgen_t *vgenp = LDC_TO_VGEN(ldcp);
#ifdef DEBUG
if (vgen_trigger_txtimeout)
@@ -4386,9 +4521,7 @@ vgen_reclaim_dring(vgen_ldc_t *ldcp)
*/
if (ldcp->need_resched) {
ldcp->need_resched = B_FALSE;
-#ifdef VGEN_USE_MAC_TX_UPDATE
vnet_tx_update(vgenp->vnetp);
-#endif
}
}
@@ -4418,11 +4551,6 @@ vgen_tx_dring_full(vgen_ldc_t *ldcp)
tbufp = ldcp->next_tbufp;
ntbufp = NEXTTBUF(ldcp, tbufp);
if (ntbufp == ldcp->cur_tbufp) { /* out of tbufs/txds */
-#if 0
- void *vnetp = LDC_TO_VNET(ldcp);
- DWARN((vnetp, "vgen_tx_dring_full: id(%lx)\n",
- ldcp->ldc_id));
-#endif
return (VGEN_SUCCESS);
}
return (VGEN_FAILURE);
@@ -4436,11 +4564,6 @@ vgen_ldc_txtimeout(vgen_ldc_t *ldcp)
drv_usectohz(vnet_ldcwd_txtimeout * 1000)) &&
(vnet_ldcwd_txtimeout) &&
(vgen_tx_dring_full(ldcp) == VGEN_SUCCESS)) {
-#if 0
- void *vnetp = LDC_TO_VNET(ldcp);
- DWARN((vnetp, "vgen_ldc_txtimeout: id(%lx)\n",
- ldcp->ldc_id));
-#endif
return (VGEN_SUCCESS);
} else {
return (VGEN_FAILURE);
@@ -4452,10 +4575,12 @@ static void
vgen_ldc_watchdog(void *arg)
{
vgen_ldc_t *ldcp;
+ vgen_t *vgenp;
void *vnetp;
int rv;
ldcp = (vgen_ldc_t *)arg;
+ vgenp = LDC_TO_VGEN(ldcp);
vnetp = LDC_TO_VNET(ldcp);
rv = vgen_ldc_txtimeout(ldcp);
@@ -4474,9 +4599,7 @@ vgen_ldc_watchdog(void *arg)
mutex_exit(&ldcp->cblock);
if (ldcp->need_resched) {
ldcp->need_resched = B_FALSE;
-#ifdef VGEN_USE_MAC_TX_UPDATE
- vnet_tx_update(ldcp->vgenp->vnetp);
-#endif
+ vnet_tx_update(vgenp->vnetp);
}
}
@@ -4484,21 +4607,6 @@ vgen_ldc_watchdog(void *arg)
drv_usectohz(vnet_ldcwd_interval * 1000));
}
-/* based on mcopymsg() */
-static void
-vgen_copymsg(mblk_t *mp, void *bufp)
-{
- caddr_t dest = bufp;
- mblk_t *bp;
- size_t n;
-
- for (bp = mp; bp != NULL; bp = bp->b_cont) {
- n = MBLKL(bp);
- bcopy(bp->b_rptr, dest, n);
- dest += n;
- }
-}
-
static int
vgen_setup_kstats(vgen_ldc_t *ldcp)
{
@@ -4565,14 +4673,12 @@ vgen_setup_kstats(vgen_ldc_t *ldcp)
/* Tx stats */
kstat_named_init(&ldckp->tx_no_desc, "tx_no_desc",
KSTAT_DATA_ULONG);
- kstat_named_init(&ldckp->tx_allocb_fail, "tx_allocb_fail",
- KSTAT_DATA_ULONG);
/* Rx stats */
- kstat_named_init(&ldckp->rx_no_desc, "rx_no_desc",
- KSTAT_DATA_ULONG);
kstat_named_init(&ldckp->rx_allocb_fail, "rx_allocb_fail",
KSTAT_DATA_ULONG);
+ kstat_named_init(&ldckp->rx_vio_allocb_fail, "rx_vio_allocb_fail",
+ KSTAT_DATA_ULONG);
kstat_named_init(&ldckp->rx_lost_pkts, "rx_lost_pkts",
KSTAT_DATA_ULONG);
@@ -4581,6 +4687,10 @@ vgen_setup_kstats(vgen_ldc_t *ldcp)
KSTAT_DATA_ULONG);
kstat_named_init(&ldckp->dring_data_acks, "dring_data_acks",
KSTAT_DATA_ULONG);
+ kstat_named_init(&ldckp->dring_stopped_acks, "dring_stopped_acks",
+ KSTAT_DATA_ULONG);
+ kstat_named_init(&ldckp->dring_data_msgs, "dring_data_msgs",
+ KSTAT_DATA_ULONG);
ksp->ks_update = vgen_kstat_update;
ksp->ks_private = (void *)ldcp;
@@ -4633,14 +4743,15 @@ vgen_kstat_update(kstat_t *ksp, int rw)
ldckp->noxmtbuf.value.ul = statsp->noxmtbuf;
ldckp->tx_no_desc.value.ul = statsp->tx_no_desc;
- ldckp->tx_allocb_fail.value.ul = statsp->tx_allocb_fail;
- ldckp->rx_no_desc.value.ul = statsp->rx_no_desc;
ldckp->rx_allocb_fail.value.ul = statsp->rx_allocb_fail;
+ ldckp->rx_vio_allocb_fail.value.ul = statsp->rx_vio_allocb_fail;
ldckp->rx_lost_pkts.value.ul = statsp->rx_lost_pkts;
ldckp->callbacks.value.ul = statsp->callbacks;
ldckp->dring_data_acks.value.ul = statsp->dring_data_acks;
+ ldckp->dring_stopped_acks.value.ul = statsp->dring_stopped_acks;
+ ldckp->dring_data_msgs.value.ul = statsp->dring_data_msgs;
} else {
statsp->ipackets = ldckp->ipackets64.value.ull;
statsp->ierrors = ldckp->ierrors.value.ul;
@@ -4660,14 +4771,15 @@ vgen_kstat_update(kstat_t *ksp, int rw)
statsp->noxmtbuf = ldckp->noxmtbuf.value.ul;
statsp->tx_no_desc = ldckp->tx_no_desc.value.ul;
- statsp->tx_allocb_fail = ldckp->tx_allocb_fail.value.ul;
- statsp->rx_no_desc = ldckp->rx_no_desc.value.ul;
statsp->rx_allocb_fail = ldckp->rx_allocb_fail.value.ul;
+ statsp->rx_vio_allocb_fail = ldckp->rx_vio_allocb_fail.value.ul;
statsp->rx_lost_pkts = ldckp->rx_lost_pkts.value.ul;
statsp->callbacks = ldckp->callbacks.value.ul;
statsp->dring_data_acks = ldckp->dring_data_acks.value.ul;
+ statsp->dring_stopped_acks = ldckp->dring_stopped_acks.value.ul;
+ statsp->dring_data_msgs = ldckp->dring_data_msgs.value.ul;
}
return (VGEN_SUCCESS);
@@ -4702,20 +4814,11 @@ vgen_macaddr_strtoul(const uint8_t *macaddr)
uint64_t val = 0;
int i;
-#if 0
- for (i = ETHERADDRL - 1; i >= 0; i--) {
-#endif
for (i = 0; i < ETHERADDRL; i++) {
val <<= 8;
val |= macaddr[i];
}
-#if 0
- cmn_err(CE_CONT, "vgen_macaddr_strtoul: str(%x:%x:%x:%x:%x:%x)\n",
- macaddr[0], macaddr[1], macaddr[2],
- macaddr[3], macaddr[4], macaddr[5]);
- cmn_err(CE_CONT, "vgen_macaddr_strtoul: val(0x%lx)\n", val);
-#endif
return (val);
}
@@ -4727,19 +4830,10 @@ vgen_macaddr_ultostr(uint64_t val, uint8_t *macaddr)
uint64_t value;
value = val;
-#if 0
- for (i = 0; i < ETHERADDRL; i++) {
-#endif
for (i = ETHERADDRL - 1; i >= 0; i--) {
macaddr[i] = value & 0xFF;
value >>= 8;
}
-#if 0
- cmn_err(CE_CONT, "vgen_macaddr_ultostr: val(0x%lx)\n", val);
- cmn_err(CE_CONT, "vgen_macaddr_ultostr: str(%x:%x:%x:%x:%x:%x)\n",
- macaddr[0], macaddr[1], macaddr[2],
- macaddr[3], macaddr[4], macaddr[5]);
-#endif
return (VGEN_SUCCESS);
}
@@ -4769,29 +4863,6 @@ vgen_hwatchdog(void *arg)
}
static void
-vgen_print_attr_info(vgen_ldc_t *ldcp, int endpoint)
-{
- vgen_hparams_t *hp;
- char ep[8];
- uint8_t addr[6];
- char ea[6];
-
- if (endpoint == VGEN_LOCAL) {
- hp = &ldcp->local_hparams;
- (void) sprintf(ep, "Local");
- } else {
- hp = &ldcp->peer_hparams;
- (void) sprintf(ep, "Peer");
- }
- (void) vgen_macaddr_ultostr(hp->addr, addr);
- cmn_err(CE_CONT, "attr_info: %s: \n", ep);
- cmn_err(CE_CONT, "\tMTU: %lx, addr: %s\n", hp->mtu,
- vgen_print_ethaddr(addr, ea));
- cmn_err(CE_CONT, "\taddr_type: %x, xfer_mode: %x, ack_freq: %x\n",
- hp->addr_type, hp->xfer_mode, hp->ack_freq);
-}
-
-static void
vgen_print_hparams(vgen_hparams_t *hp)
{
uint8_t addr[6];
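
The restructured vgen_handle_dring_data() above replaces bulk acquire/release of
the [start, end] range with per-descriptor acquire, bounded polling
(vgen_recv_delay x vgen_recv_retries) when a descriptor is not yet ready, and range
ACKs, closing with an ACK carrying VIO_DP_STOPPED so the sender knows a restart
message may be needed (see resched_peer). Condensed control flow of the receive
case, with acquire_desc(), desc_ready(), copy_desc() and peer_wants_ack() as
hypothetical stand-ins for the ldc_mem_dring_acquire()/ldc_mem_copy() sequences:

	rxi = start;
	set_ack_start = B_TRUE;
	ack_end = start;
	DECR_RXI(ack_end, ldcp);
	retries = 0;
	for (;;) {
		acquire_desc(ldcp, rxi);
		if (!desc_ready(ldcp, rxi)) {
			if (retries++ == vgen_recv_retries)
				break;		/* stop; STOPPED ack below */
			drv_usecwait(vgen_recv_delay);
			continue;		/* re-acquire the same slot */
		}
		retries = 0;
		if (set_ack_start) {
			ack_start = rxi;	/* first un-ack'd slot */
			set_ack_start = B_FALSE;
		}
		copy_desc(ldcp, rxi);	/* copy, mark VIO_DESC_DONE, release */
		ack_end = rxi;
		if (peer_wants_ack(ldcp, rxi)) {
			vgen_send_dring_ack(ldcp, tagp, ack_start, ack_end,
			    VIO_DP_ACTIVE);
			set_ack_start = B_TRUE;
		}
		INCR_RXI(rxi, ldcp);
		if (rxi == start)
			break;		/* wrapped the whole ring */
	}
	if (set_ack_start)
		ack_start = ack_end;	/* nothing new since the last ack */
	vgen_send_dring_ack(ldcp, tagp, ack_start, ack_end, VIO_DP_STOPPED);
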
diff --git a/usr/src/uts/sun4v/io/vsw.c b/usr/src/uts/sun4v/io/vsw.c
index d82d31c79f..7f32782bf2 100644
--- a/usr/src/uts/sun4v/io/vsw.c
+++ b/usr/src/uts/sun4v/io/vsw.c
@@ -68,6 +68,8 @@
#include <sys/vio_mailbox.h>
#include <sys/vnet_mailbox.h>
#include <sys/vnet_common.h>
+#include <sys/vio_util.h>
+#include <sys/sdt.h>
/*
* Function prototypes.
@@ -183,7 +185,6 @@ static void vsw_create_privring(vsw_ldc_t *);
static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
int *);
-static void vsw_dring_priv2pub(vsw_private_desc_t *);
static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
static void vsw_set_lane_attr(vsw_t *, lane_t *);
@@ -194,10 +195,10 @@ static int vsw_check_dring_info(vio_dring_reg_msg_t *);
/* Misc support routines */
static caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
-
static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
static int vsw_free_ring(dring_info_t *);
+
/* Debugging routines */
static void dump_flags(uint64_t);
static void display_state(void);
@@ -206,6 +207,13 @@ static void display_ring(dring_info_t *);
int vsw_num_handshakes = 3; /* # of handshake attempts */
int vsw_wretries = 100; /* # of write attempts */
+int vsw_chain_len = 150; /* max # of mblks in msg chain */
+int vsw_desc_delay = 0; /* delay in us */
+int vsw_read_attempts = 5; /* # of reads of descriptor */
+
+uint32_t vsw_mblk_size = VSW_MBLK_SIZE;
+uint32_t vsw_num_mblks = VSW_NUM_MBLKS;
+
/*
* mode specific frame switching function
@@ -638,6 +646,13 @@ vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
}
}
+ /* prevent auto-detaching */
+ if (ddi_prop_update_int(DDI_DEV_T_NONE, vswp->dip,
+ DDI_NO_AUTODETACH, 1) != DDI_SUCCESS) {
+ cmn_err(CE_NOTE, "Unable to set \"%s\" property for "
+ "instance %u", DDI_NO_AUTODETACH, instance);
+ }
+
/*
* Now we have everything setup, register for MD change
* events.
@@ -681,8 +696,9 @@ vsw_attach_fail:
static int
vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
- vsw_t **vswpp, *vswp;
- int instance;
+ vio_mblk_pool_t *poolp, *npoolp;
+ vsw_t **vswpp, *vswp;
+ int instance;
instance = ddi_get_instance(dip);
vswp = ddi_get_soft_state(vsw_state, instance);
@@ -707,8 +723,8 @@ vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
cmn_err(CE_WARN, "Unable to detach from MAC layer");
return (DDI_FAILURE);
}
+ rw_destroy(&vswp->if_lockrw);
}
- rw_destroy(&vswp->if_lockrw);
vsw_mdeg_unregister(vswp);
@@ -723,6 +739,19 @@ vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
}
/*
+ * Destroy any free pools that may still exist.
+ */
+ poolp = vswp->rxh;
+ while (poolp != NULL) {
+ npoolp = vswp->rxh = poolp->nextp;
+ if (vio_destroy_mblks(poolp) != 0) {
+ vswp->rxh = poolp;
+ return (DDI_FAILURE);
+ }
+ poolp = npoolp;
+ }
+
+ /*
* Remove this instance from any entries it may be on in
* the hash table by using the list of addresses maintained
* in the vsw_t structure.
@@ -927,7 +956,6 @@ vsw_get_md_properties(vsw_t *vswp)
__func__, vswp->physname);
}
-
#ifdef DEBUG
/*
* As a temporary measure to aid testing we check to see if there
@@ -1336,6 +1364,8 @@ vsw_mac_unregister(vsw_t *vswp)
}
RW_EXIT(&vswp->if_lockrw);
+ vswp->mdprops &= ~VSW_MD_MACADDR;
+
D1(vswp, "%s: exit", __func__);
return (rv);
@@ -2021,6 +2051,7 @@ vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
ldc_attr_t attr;
ldc_status_t istatus;
int status = DDI_FAILURE;
+ int rv;
D1(vswp, "%s: enter", __func__);
@@ -2031,6 +2062,15 @@ vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
}
ldcp->ldc_id = ldc_id;
+ /* allocate pool of receive mblks */
+ rv = vio_create_mblks(vsw_num_mblks, vsw_mblk_size, &(ldcp->rxh));
+ if (rv) {
+ DWARN(vswp, "%s: unable to create free mblk pool for"
+ " channel %ld (rv %d)", __func__, ldc_id, rv);
+ kmem_free(ldcp, sizeof (vsw_ldc_t));
+ return (1);
+ }
+
mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
@@ -2045,6 +2085,8 @@ vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
ldcp->hss_id = 1; /* Initial handshake session id */
/* only set for outbound lane, inbound set by peer */
+ mutex_init(&ldcp->lane_in.seq_lock, NULL, MUTEX_DRIVER, NULL);
+ mutex_init(&ldcp->lane_out.seq_lock, NULL, MUTEX_DRIVER, NULL);
vsw_set_lane_attr(vswp, &ldcp->lane_out);
attr.devclass = LDC_DEV_NT_SVC;
@@ -2055,27 +2097,15 @@ vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
if (status != 0) {
DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
__func__, ldc_id, status);
- mutex_destroy(&ldcp->ldc_txlock);
- mutex_destroy(&ldcp->ldc_cblock);
- cv_destroy(&ldcp->drain_cv);
- mutex_destroy(&ldcp->drain_cv_lock);
- mutex_destroy(&ldcp->hss_lock);
- kmem_free(ldcp, sizeof (vsw_ldc_t));
- return (1);
+ goto ldc_attach_fail;
}
status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
if (status != 0) {
DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
__func__, ldc_id, status);
- mutex_destroy(&ldcp->ldc_txlock);
- mutex_destroy(&ldcp->ldc_cblock);
- cv_destroy(&ldcp->drain_cv);
- mutex_destroy(&ldcp->drain_cv_lock);
- mutex_destroy(&ldcp->hss_lock);
(void) ldc_fini(ldcp->ldc_handle);
- kmem_free(ldcp, sizeof (vsw_ldc_t));
- return (1);
+ goto ldc_attach_fail;
}
@@ -2097,6 +2127,40 @@ vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
D1(vswp, "%s: exit", __func__);
return (0);
+
+ldc_attach_fail:
+ mutex_destroy(&ldcp->ldc_txlock);
+ mutex_destroy(&ldcp->ldc_cblock);
+
+ cv_destroy(&ldcp->drain_cv);
+
+ if (ldcp->rxh != NULL) {
+ if (vio_destroy_mblks(ldcp->rxh) != 0) {
+ /*
+ * Something odd has happened, as the destroy
+ * will only fail if some mblks have been allocated
+ * from the pool already (which shouldn't happen)
+ * and have not been returned.
+ *
+ * Add the pool pointer to a list maintained in
+ * the device instance. Another attempt will be made
+ * to free the pool when the device itself detaches.
+ */
+ cmn_err(CE_WARN, "Creation of ldc channel %ld failed"
+ " and cannot destroy associated mblk pool",
+ ldc_id);
+ ldcp->rxh->nextp = vswp->rxh;
+ vswp->rxh = ldcp->rxh;
+ }
+ }
+ mutex_destroy(&ldcp->drain_cv_lock);
+ mutex_destroy(&ldcp->hss_lock);
+
+ mutex_destroy(&ldcp->lane_in.seq_lock);
+ mutex_destroy(&ldcp->lane_out.seq_lock);
+ kmem_free(ldcp, sizeof (vsw_ldc_t));
+
+ return (1);
}
/*
@@ -2150,11 +2214,28 @@ vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
ldcp->ldc_status = LDC_INIT;
ldcp->ldc_handle = NULL;
ldcp->ldc_vswp = NULL;
+
+ if (ldcp->rxh != NULL) {
+ if (vio_destroy_mblks(ldcp->rxh)) {
+ /*
+ * Most likely some mblks are still in use and
+ * have not been returned to the pool. Add the pool
+ * to the list maintained in the device instance.
+ * Another attempt will be made to destroy the pool
+ * when the device detaches.
+ */
+ ldcp->rxh->nextp = vswp->rxh;
+ vswp->rxh = ldcp->rxh;
+ }
+ }
+
mutex_destroy(&ldcp->ldc_txlock);
mutex_destroy(&ldcp->ldc_cblock);
cv_destroy(&ldcp->drain_cv);
mutex_destroy(&ldcp->drain_cv_lock);
mutex_destroy(&ldcp->hss_lock);
+ mutex_destroy(&ldcp->lane_in.seq_lock);
+ mutex_destroy(&ldcp->lane_out.seq_lock);
/* unlink it from the list */
prev_ldcp = ldcp->ldc_next;
@@ -4072,11 +4153,14 @@ vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
size_t off = 0;
uint64_t ncookies = 0;
uint64_t chain = 0;
- uint64_t j, len, num;
- uint32_t start, end, datalen;
- int i, last_sync, rv;
+ uint64_t j, len;
+ uint32_t pos, start, datalen;
+ uint32_t range_start, range_end;
+ int32_t end, num, cnt = 0;
+ int i, rv;
boolean_t ack_needed = B_FALSE;
- boolean_t sync_needed = B_TRUE;
+ boolean_t prev_desc_ack = B_FALSE;
+ int read_attempts = 0;
D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
@@ -4107,43 +4191,94 @@ vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
return;
}
- start = end = 0;
- start = dring_pkt->start_idx;
+ start = pos = dring_pkt->start_idx;
end = dring_pkt->end_idx;
+ len = dp->num_descriptors;
- D3(vswp, "%s(%lld): start index %ld : end %ld\n",
+ range_start = range_end = pos;
+
+ D2(vswp, "%s(%lld): start index %ld : end %ld\n",
__func__, ldcp->ldc_id, start, end);
- /* basic sanity check */
- len = dp->num_descriptors;
- if (end > len) {
- DERR(vswp, "%s(%lld): endpoint %lld outside ring"
- " length %lld", __func__, ldcp->ldc_id,
- end, len);
+ if (end == -1) {
+ num = -1;
+ } else if (end >= 0) {
+ num = end >= pos ?
+ end - pos + 1: (len - pos + 1) + end;
+ /* basic sanity check */
+ if (end > len) {
+ DERR(vswp, "%s(%lld): endpoint %lld outside "
+ "ring length %lld", __func__,
+ ldcp->ldc_id, end, len);
+
+ SND_DRING_NACK(ldcp, dring_pkt);
+ return;
+ }
+ } else {
+ DERR(vswp, "%s(%lld): invalid endpoint %lld",
+ __func__, ldcp->ldc_id, end);
SND_DRING_NACK(ldcp, dring_pkt);
return;
}
- /* sync data */
- if ((rv = ldc_mem_dring_acquire(dp->handle,
- start, end)) != 0) {
- DERR(vswp, "%s(%lld): unable to acquire dring : err %d",
- __func__, ldcp->ldc_id, rv);
- return;
- }
+ while (cnt != num) {
+vsw_recheck_desc:
+ if ((rv = ldc_mem_dring_acquire(dp->handle,
+ pos, pos)) != 0) {
+ DERR(vswp, "%s(%lld): unable to acquire "
+ "descriptor at pos %d: err %d",
+ __func__, ldcp->ldc_id, pos, rv);
+ SND_DRING_NACK(ldcp, dring_pkt);
+ return;
+ }
- pub_addr = (vnet_public_desc_t *)dp->pub_addr;
+ pub_addr = (vnet_public_desc_t *)dp->pub_addr + pos;
- j = num = 0;
+ /*
+ * When given a bounded range of descriptors
+ * to process, it's an error to hit a descriptor
+ * which is not ready. In the non-bounded case
+ * (end_idx == -1) this simply indicates we have
+ * reached the end of the current active range.
+ */
+ if (pub_addr->hdr.dstate != VIO_DESC_READY) {
+ /* unbound - no error */
+ if (end == -1) {
+ if (read_attempts == vsw_read_attempts)
+ break;
+
+ delay(drv_usectohz(vsw_desc_delay));
+ read_attempts++;
+ goto vsw_recheck_desc;
+ }
- /* calculate # descriptors taking into a/c wrap around */
- num = end >= start ? end - start + 1: (len - start + 1) + end;
+ /* bounded - error - so NACK back */
+ DERR(vswp, "%s(%lld): descriptor not READY "
+ "(%d)", __func__, ldcp->ldc_id,
+ pub_addr->hdr.dstate);
+ SND_DRING_NACK(ldcp, dring_pkt);
+ return;
+ }
- last_sync = start;
+ DTRACE_PROBE1(read_attempts, int, read_attempts);
- for (i = start; j < num; i = (i + 1) % len, j++) {
- pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
+ range_end = pos;
+
+ /*
+ * If we ACK'd the previous descriptor then now
+ * record the new range start position for later
+ * ACK's.
+ */
+ if (prev_desc_ack) {
+ range_start = pos;
+
+ D2(vswp, "%s(%lld): updating range start "
+ "to be %d", __func__, ldcp->ldc_id,
+ range_start);
+
+ prev_desc_ack = B_FALSE;
+ }
/*
* Data is padded to align on 8 byte boundary,
@@ -4161,49 +4296,36 @@ vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
D2(vswp, "%s(%lld): processing desc %lld at pos"
" 0x%llx : dstate 0x%lx : datalen 0x%lx",
- __func__, ldcp->ldc_id, i, pub_addr,
+ __func__, ldcp->ldc_id, pos, pub_addr,
pub_addr->hdr.dstate, datalen);
/*
- * XXXX : Is it a fatal error to be told to
- * process a packet when the READY bit is not
- * set ?
- */
- if (pub_addr->hdr.dstate != VIO_DESC_READY) {
- DERR(vswp, "%s(%d): descriptor %lld at pos "
- " 0x%llx not READY (0x%lx)", __func__,
- ldcp->ldc_id, i, pub_addr,
- pub_addr->hdr.dstate);
-
- SND_DRING_NACK(ldcp, dring_pkt);
- (void) ldc_mem_dring_release(dp->handle,
- start, end);
- return;
- }
-
- /*
* Mark that we are starting to process descriptor.
*/
pub_addr->hdr.dstate = VIO_DESC_ACCEPTED;
+ mp = vio_allocb(ldcp->rxh);
+ if (mp == NULL) {
+ /*
+ * No free receive buffers available, so
+ * fall back on allocb(9F). Make sure that
+ * we get a data buffer which is a multiple
+ * of 8 as this is required by ldc_mem_copy.
+ */
+ DTRACE_PROBE(allocb);
+ mp = allocb(datalen + VNET_IPALIGN + 8,
+ BPRI_MED);
+ }
+
/*
- * allocb(9F) returns an aligned data block. We
- * need to ensure that we ask ldc for an aligned
- * number of bytes also.
+ * Ensure that we ask ldc for an aligned
+ * number of bytes.
*/
- nbytes = datalen;
+ nbytes = datalen + VNET_IPALIGN;
if (nbytes & 0x7) {
off = 8 - (nbytes & 0x7);
nbytes += off;
}
- mp = allocb(datalen, BPRI_MED);
- if (mp == NULL) {
- DERR(vswp, "%s(%lld): allocb failed",
- __func__, ldcp->ldc_id);
- (void) ldc_mem_dring_release(dp->handle,
- start, end);
- return;
- }
ncookies = pub_addr->ncookies;
rv = ldc_mem_copy(ldcp->ldc_handle,
@@ -4213,18 +4335,24 @@ vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
if (rv != 0) {
DERR(vswp, "%s(%d): unable to copy in "
- "data from %d cookies", __func__,
- ldcp->ldc_id, ncookies);
+ "data from %d cookies in desc %d"
+ " (rv %d)", __func__, ldcp->ldc_id,
+ ncookies, pos, rv);
freemsg(mp);
+
+ pub_addr->hdr.dstate = VIO_DESC_DONE;
(void) ldc_mem_dring_release(dp->handle,
- start, end);
- return;
+ pos, pos);
+ break;
} else {
D2(vswp, "%s(%d): copied in %ld bytes"
" using %d cookies", __func__,
ldcp->ldc_id, nbytes, ncookies);
}
+ /* adjust the read pointer to skip over the padding */
+ mp->b_rptr += VNET_IPALIGN;
+
/* point to the actual end of data */
mp->b_wptr = mp->b_rptr + datalen;
@@ -4246,50 +4374,89 @@ vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
/* mark we are finished with this descriptor */
pub_addr->hdr.dstate = VIO_DESC_DONE;
+ (void) ldc_mem_dring_release(dp->handle, pos, pos);
+
/*
- * Send an ACK back to peer if requested, and sync
- * the rings up to this point so the remote side sees
- * the descriptor flag in a consistent state.
+ * Send an ACK back to peer if requested.
*/
if (ack_needed) {
- if ((rv = ldc_mem_dring_release(
- dp->handle, last_sync, i)) != 0) {
- DERR(vswp, "%s(%lld): unable to sync"
- " from %d to %d", __func__,
- ldcp->ldc_id, last_sync, i);
- }
-
ack_needed = B_FALSE;
- if (i == end)
- sync_needed = B_FALSE;
- else
- sync_needed = B_TRUE;
+ dring_pkt->start_idx = range_start;
+ dring_pkt->end_idx = range_end;
- last_sync = (i + 1) % len;
+ DERR(vswp, "%s(%lld): processed %d %d, ACK"
+ " requested", __func__, ldcp->ldc_id,
+ dring_pkt->start_idx,
+ dring_pkt->end_idx);
+ dring_pkt->dring_process_state = VIO_DP_ACTIVE;
dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
dring_pkt->tag.vio_sid = ldcp->local_session;
vsw_send_msg(ldcp, (void *)dring_pkt,
sizeof (vio_dring_msg_t));
+
+ prev_desc_ack = B_TRUE;
+ range_start = pos;
}
- }
- if (sync_needed) {
- if ((rv = ldc_mem_dring_release(dp->handle,
- last_sync, end)) != 0) {
- DERR(vswp, "%s(%lld): unable to sync"
- " from %d to %d", __func__,
- ldcp->ldc_id, last_sync, end);
+ /* next descriptor */
+ pos = (pos + 1) % len;
+ cnt++;
+
+			/*
+			 * Break out of the loop here and stop processing,
+			 * to allow some other network device (or disk) to
+			 * get access to the cpu. Send the chain of packets
+			 * accumulated so far to be switched before breaking
+			 * out.
+			 */
+ if (chain > vsw_chain_len) {
+ D3(vswp, "%s(%lld): switching chain of %d "
+ "msgs", __func__, ldcp->ldc_id, chain);
+ vsw_switch_frame(vswp, bp, VSW_VNETPORT,
+ ldcp->ldc_port, NULL);
+ bp = NULL;
+ break;
}
}
/* send the chain of packets to be switched */
- D3(vswp, "%s(%lld): switching chain of %d msgs", __func__,
- ldcp->ldc_id, chain);
- vsw_switch_frame(vswp, bp, VSW_VNETPORT,
- ldcp->ldc_port, NULL);
+ if (bp != NULL) {
+ D3(vswp, "%s(%lld): switching chain of %d msgs",
+ __func__, ldcp->ldc_id, chain);
+ vsw_switch_frame(vswp, bp, VSW_VNETPORT,
+ ldcp->ldc_port, NULL);
+ }
+
+ DTRACE_PROBE1(msg_cnt, int, cnt);
+
+ /*
+	 * We are now finished, so ACK back with the state
+	 * set to STOPPED so our peer knows we are finished.
+ */
+ dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
+ dring_pkt->tag.vio_sid = ldcp->local_session;
+
+ dring_pkt->dring_process_state = VIO_DP_STOPPED;
+
+ DTRACE_PROBE(stop_process_sent);
+
+ /*
+ * We have not processed any more descriptors beyond
+ * the last one we ACK'd.
+ */
+ if (prev_desc_ack)
+ range_start = range_end;
+ dring_pkt->start_idx = range_start;
+ dring_pkt->end_idx = range_end;
+
+ D2(vswp, "%s(%lld) processed : %d : %d, now stopping",
+ __func__, ldcp->ldc_id, dring_pkt->start_idx,
+ dring_pkt->end_idx);
+
+ vsw_send_msg(ldcp, (void *)dring_pkt,
+ sizeof (vio_dring_msg_t));
break;
case VIO_SUBTYPE_ACK:
@@ -4312,7 +4479,6 @@ vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
end = dring_pkt->end_idx;
len = dp->num_descriptors;
-
j = num = 0;
/* calculate # descriptors taking into a/c wrap around */
num = end >= start ? end - start + 1: (len - start + 1) + end;
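The wrap-around case of this computation is easiest to see with numbers; a minimal sketch (the ring length of 8 is invented for illustration):

    int len = 8, start = 6, end = 1;	/* range wraps past the ring end */
    int num;

    /* end < start, so take the wrapped branch: (8 - 6 + 1) + 1 == 4 */
    num = end >= start ? end - start + 1 : (len - start + 1) + end;
    /* descriptors covered: 6, 7, 0, 1 */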
@@ -4320,31 +4486,112 @@ vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
D2(vswp, "%s(%lld): start index %ld : end %ld : num %ld\n",
__func__, ldcp->ldc_id, start, end, num);
+ mutex_enter(&dp->dlock);
+ dp->last_ack_recv = end;
+ mutex_exit(&dp->dlock);
+
for (i = start; j < num; i = (i + 1) % len, j++) {
pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
- if (pub_addr->hdr.dstate != VIO_DESC_DONE) {
- DERR(vswp, "%s: descriptor %lld at pos "
- " 0x%llx not DONE (0x%lx)\n", __func__,
- i, pub_addr, pub_addr->hdr.dstate);
- return;
- } else {
+			/*
+			 * If the last descriptor in a range has the ACK
+			 * bit set then we will get two messages from our
+			 * peer relating to it: the normal ACK msg, followed
+			 * by a subsequent STOPPED msg. The first message
+			 * will have resulted in the descriptor being
+			 * reclaimed and its state set to FREE, so when we
+			 * encounter a non-DONE descriptor we need to check
+			 * whether it is one we have just reclaimed.
+			 */
+ mutex_enter(&priv_addr->dstate_lock);
+ if (pub_addr->hdr.dstate == VIO_DESC_DONE) {
/* clear all the fields */
bzero(priv_addr->datap, priv_addr->datalen);
priv_addr->datalen = 0;
pub_addr->hdr.dstate = VIO_DESC_FREE;
pub_addr->hdr.ack = 0;
+
priv_addr->dstate = VIO_DESC_FREE;
+ mutex_exit(&priv_addr->dstate_lock);
D3(vswp, "clearing descp %d : pub state "
"0x%llx : priv state 0x%llx", i,
pub_addr->hdr.dstate,
priv_addr->dstate);
+
+ } else {
+ mutex_exit(&priv_addr->dstate_lock);
+
+ if (dring_pkt->dring_process_state !=
+ VIO_DP_STOPPED) {
+ DERR(vswp, "%s: descriptor %lld at pos "
+ " 0x%llx not DONE (0x%lx)\n",
+ __func__, i, pub_addr,
+ pub_addr->hdr.dstate);
+ return;
+ }
}
}
+		/*
+		 * If our peer has stopped processing descriptors then
+		 * we check to make sure it has processed all the
+		 * descriptors we have updated. If not, we send it a new
+		 * message to prompt it to restart.
+		 */
+ if (dring_pkt->dring_process_state == VIO_DP_STOPPED) {
+ DTRACE_PROBE(stop_process_recv);
+ D2(vswp, "%s(%lld): got stopping msg : %d : %d",
+ __func__, ldcp->ldc_id, dring_pkt->start_idx,
+ dring_pkt->end_idx);
+
+ /*
+ * Check next descriptor in public section of ring.
+		 * If it is marked as READY then we need to prompt our
+ * peer to start processing the ring again.
+ */
+ i = (end + 1) % len;
+ pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
+ priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
+
+ /*
+		 * Hold the restart lock across all of this to make
+		 * sure that it is not possible for us to decide here
+		 * that a restart msg will be needed, while the sending
+		 * code, having already made its check, is about to
+		 * exit without sending one.
+ */
+ mutex_enter(&dp->restart_lock);
+ mutex_enter(&priv_addr->dstate_lock);
+ if (pub_addr->hdr.dstate == VIO_DESC_READY) {
+
+ mutex_exit(&priv_addr->dstate_lock);
+
+ dring_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
+ dring_pkt->tag.vio_sid = ldcp->local_session;
+
+ mutex_enter(&ldcp->lane_out.seq_lock);
+ dring_pkt->seq_num = ldcp->lane_out.seq_num++;
+ mutex_exit(&ldcp->lane_out.seq_lock);
+
+ dring_pkt->start_idx = (end + 1) % len;
+ dring_pkt->end_idx = -1;
+
+ D2(vswp, "%s(%lld) : sending restart msg:"
+ " %d : %d", __func__, ldcp->ldc_id,
+ dring_pkt->start_idx,
+ dring_pkt->end_idx);
+
+ vsw_send_msg(ldcp, (void *)dring_pkt,
+ sizeof (vio_dring_msg_t));
+ } else {
+ mutex_exit(&priv_addr->dstate_lock);
+ dp->restart_reqd = B_TRUE;
+ }
+ mutex_exit(&dp->restart_lock);
+ }
break;
case VIO_SUBTYPE_NACK:
@@ -4510,7 +4757,9 @@ vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
* check that the descriptor we are being ACK'ed for is in
* fact READY, i.e. it is one we have shared with our peer.
*/
+ mutex_enter(&priv_addr->dstate_lock);
if (priv_addr->dstate != VIO_DESC_READY) {
+ mutex_exit(&priv_addr->dstate_lock);
cmn_err(CE_WARN, "%s: (%ld) desc at index %ld not "
"READY (0x%lx)", __func__, ldcp->ldc_id, idx,
priv_addr->dstate);
@@ -4527,6 +4776,7 @@ vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
bzero(priv_addr->datap, priv_addr->datalen);
priv_addr->datalen = 0;
priv_addr->dstate = VIO_DESC_FREE;
+ mutex_exit(&priv_addr->dstate_lock);
}
break;
@@ -4561,9 +4811,11 @@ vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
priv_addr += idx;
/* release resources associated with sent msg */
+ mutex_enter(&priv_addr->dstate_lock);
bzero(priv_addr->datap, priv_addr->datalen);
priv_addr->datalen = 0;
priv_addr->dstate = VIO_DESC_FREE;
+ mutex_exit(&priv_addr->dstate_lock);
break;
@@ -5153,6 +5405,7 @@ vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
vio_dring_msg_t dring_pkt;
dring_info_t *dp = NULL;
vsw_private_desc_t *priv_desc = NULL;
+ vnet_public_desc_t *pub = NULL;
vsw_t *vswp = ldcp->ldc_vswp;
mblk_t *bp;
size_t n, size;
@@ -5183,14 +5436,12 @@ vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
return (LDC_TX_FAILURE);
}
- mutex_enter(&dp->dlock);
-
size = msgsize(mp);
if (size > (size_t)ETHERMAX) {
DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
ldcp->ldc_id, size);
- status = LDC_TX_FAILURE;
- goto vsw_dringsend_free_exit;
+ freemsg(mp);
+ return (LDC_TX_FAILURE);
}
/*
@@ -5201,7 +5452,7 @@ vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
* peers. This may change in the future.
*/
if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
- DERR(vswp, "%s(%lld): no descriptor available for ring "
+ D2(vswp, "%s(%lld): no descriptor available for ring "
"at 0x%llx", __func__, ldcp->ldc_id, dp);
/* nothing more we can do */
@@ -5215,6 +5466,7 @@ vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
/* copy data into the descriptor */
bufp = priv_desc->datap;
+ bufp += VNET_IPALIGN;
for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
n = MBLKL(bp);
bcopy(bp->b_rptr, bufp, n);
@@ -5222,48 +5474,69 @@ vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
}
priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
- priv_desc->dstate = VIO_DESC_READY;
- /*
- * Copy relevant sections of private descriptor
- * to public section
- */
- vsw_dring_priv2pub(priv_desc);
+ pub = priv_desc->descp;
+ pub->nbytes = priv_desc->datalen;
+
+ mutex_enter(&priv_desc->dstate_lock);
+ pub->hdr.dstate = VIO_DESC_READY;
+ mutex_exit(&priv_desc->dstate_lock);
/*
- * Send a vio_dring_msg to peer to prompt them to read
- * the updated descriptor ring.
+ * Determine whether or not we need to send a message to our
+ * peer prompting them to read our newly updated descriptor(s).
*/
- dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
- dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
- dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
- dring_pkt.tag.vio_sid = ldcp->local_session;
+ mutex_enter(&dp->restart_lock);
+ if (dp->restart_reqd) {
+ dp->restart_reqd = B_FALSE;
+ mutex_exit(&dp->restart_lock);
- /* Note - for now using first ring */
- dring_pkt.dring_ident = dp->ident;
+ /*
+ * Send a vio_dring_msg to peer to prompt them to read
+ * the updated descriptor ring.
+ */
+ dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
+ dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
+ dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
+ dring_pkt.tag.vio_sid = ldcp->local_session;
- /*
- * Access to the seq_num is implicitly protected by the
- * fact that we have only one dring associated with the
- * lane currently and we hold the associated dring lock.
- */
- dring_pkt.seq_num = ldcp->lane_out.seq_num++;
+ /* Note - for now using first ring */
+ dring_pkt.dring_ident = dp->ident;
- /* Note - only updating single descrip at time at the moment */
- dring_pkt.start_idx = idx;
- dring_pkt.end_idx = idx;
+ mutex_enter(&ldcp->lane_out.seq_lock);
+ dring_pkt.seq_num = ldcp->lane_out.seq_num++;
+ mutex_exit(&ldcp->lane_out.seq_lock);
- D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
- ldcp->ldc_id, dp, dring_pkt.dring_ident);
- D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n", __func__,
- ldcp->ldc_id, dring_pkt.start_idx, dring_pkt.end_idx,
- dring_pkt.seq_num);
+ /*
+	 * If last_ack_recv is -1 then we know we've not
+	 * received any ACKs yet, so this must be the first
+	 * msg sent; set the start to the beginning of the ring.
+ */
+ mutex_enter(&dp->dlock);
+ if (dp->last_ack_recv == -1) {
+ dring_pkt.start_idx = 0;
+ } else {
+ dring_pkt.start_idx = (dp->last_ack_recv + 1) %
+ dp->num_descriptors;
+ }
+ dring_pkt.end_idx = -1;
+ mutex_exit(&dp->dlock);
- vsw_send_msg(ldcp, (void *)&dring_pkt, sizeof (vio_dring_msg_t));
+ D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
+ ldcp->ldc_id, dp, dring_pkt.dring_ident);
+ D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n",
+ __func__, ldcp->ldc_id, dring_pkt.start_idx,
+ dring_pkt.end_idx, dring_pkt.seq_num);
-vsw_dringsend_free_exit:
+ vsw_send_msg(ldcp, (void *)&dring_pkt,
+ sizeof (vio_dring_msg_t));
+ } else {
+ mutex_exit(&dp->restart_lock);
+ D2(vswp, "%s(%lld): updating descp %d", __func__,
+ ldcp->ldc_id, idx);
+ }
- mutex_exit(&dp->dlock);
+vsw_dringsend_free_exit:
/* free the message block */
freemsg(mp);
@@ -5316,14 +5589,12 @@ vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
return (LDC_TX_FAILURE);
}
- mutex_enter(&dp->dlock);
-
size = msgsize(mp);
if (size > (size_t)ETHERMAX) {
DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
ldcp->ldc_id, size);
- status = LDC_TX_FAILURE;
- goto vsw_descrsend_free_exit;
+ freemsg(mp);
+ return (LDC_TX_FAILURE);
}
/*
@@ -5355,7 +5626,6 @@ vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
}
priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
- priv_desc->dstate = VIO_DESC_READY;
/* create and send the in-band descp msg */
ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
@@ -5363,12 +5633,9 @@ vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
- /*
- * Access to the seq_num is implicitly protected by the
- * fact that we have only one dring associated with the
- * lane currently and we hold the associated dring lock.
- */
+ mutex_enter(&ldcp->lane_out.seq_lock);
ibnd_msg.hdr.seq_num = ldcp->lane_out.seq_num++;
+ mutex_exit(&ldcp->lane_out.seq_lock);
/*
* Copy the mem cookies describing the data from the
@@ -5388,8 +5655,6 @@ vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
vsw_descrsend_free_exit:
- mutex_exit(&dp->dlock);
-
/* free the allocated message blocks */
freemsg(mp);
@@ -6140,6 +6405,7 @@ vsw_create_dring(vsw_ldc_t *ldcp)
/* haven't used any descriptors yet */
dp->end_idx = 0;
+ dp->last_ack_recv = -1;
/* bind dring to the channel */
if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
@@ -6150,6 +6416,9 @@ vsw_create_dring(vsw_ldc_t *ldcp)
goto dring_fail_exit;
}
+ mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
+ dp->restart_reqd = B_TRUE;
+
/*
* Only ever create rings for outgoing lane. Link it onto
* end of list.
@@ -6225,6 +6494,9 @@ vsw_create_privring(vsw_ldc_t *ldcp)
/* haven't used any descriptors yet */
dp->end_idx = 0;
+ mutex_init(&dp->restart_lock, NULL, MUTEX_DRIVER, NULL);
+ dp->restart_reqd = B_TRUE;
+
/*
* Only ever create rings for outgoing lane. Link it onto
* end of list.
@@ -6257,12 +6529,14 @@ vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
uint64_t offset = 0;
uint32_t ncookies = 0;
static char *name = "vsw_setup_ring";
- int i, j, rv;
+ int i, j, nc, rv;
- /* note - public section may be null */
priv_addr = dp->priv_addr;
pub_addr = dp->pub_addr;
+ /* public section may be null but private should never be */
+ ASSERT(priv_addr != NULL);
+
/*
* Allocate the region of memory which will be used to hold
* the data the descriptors will refer to.
@@ -6281,6 +6555,8 @@ vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
* descriptor fields.
*/
for (i = 0; i < VSW_RING_NUM_EL; i++) {
+ mutex_init(&priv_addr->dstate_lock, NULL, MUTEX_DRIVER, NULL);
+
if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
&priv_addr->memhandle)) != 0) {
DERR(vswp, "%s: alloc mem handle failed", name);
@@ -6335,6 +6611,14 @@ vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
/* link pub and private sides */
priv_addr->descp = pub_addr;
+ pub_addr->ncookies = priv_addr->ncookies;
+
+ for (nc = 0; nc < pub_addr->ncookies; nc++) {
+ bcopy(&priv_addr->memcookie[nc],
+ &pub_addr->memcookie[nc],
+ sizeof (ldc_mem_cookie_t));
+ }
+
pub_addr->hdr.dstate = VIO_DESC_FREE;
pub_addr++;
}
@@ -6352,10 +6636,12 @@ vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
setup_ring_cleanup:
priv_addr = dp->priv_addr;
- for (i = 0; i < VSW_RING_NUM_EL; i++) {
+ for (j = 0; j < i; j++) {
(void) ldc_mem_unbind_handle(priv_addr->memhandle);
(void) ldc_mem_free_handle(priv_addr->memhandle);
+ mutex_destroy(&priv_addr->dstate_lock);
+
priv_addr++;
}
kmem_free(dp->data_addr, dp->data_sz);
@@ -6368,7 +6654,8 @@ setup_ring_cleanup:
* starting at the location of the last free descriptor found
* previously.
*
- * Returns 0 if free descriptor is available, 1 otherwise.
+ * Returns 0 if a free descriptor is available, updating the state
+ * of the private descriptor to VIO_DESC_READY; otherwise returns 1.
*
* FUTURE: might need to return contiguous range of descriptors
* as dring info msg assumes all will be contiguous.
@@ -6377,38 +6664,34 @@ static int
vsw_dring_find_free_desc(dring_info_t *dringp,
vsw_private_desc_t **priv_p, int *idx)
{
- vsw_private_desc_t *addr;
- uint64_t i;
- uint64_t j = 0;
- uint64_t start = dringp->end_idx;
+ vsw_private_desc_t *addr = NULL;
int num = VSW_RING_NUM_EL;
int ret = 1;
D1(NULL, "%s enter\n", __func__);
- addr = dringp->priv_addr;
+ ASSERT(dringp->priv_addr != NULL);
D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
- __func__, dringp, start);
-
- for (i = start; j < num; i = (i + 1) % num, j++) {
- addr = (vsw_private_desc_t *)dringp->priv_addr + i;
- D2(NULL, "%s: descriptor %lld : dstate 0x%llx\n",
- __func__, i, addr->dstate);
- if (addr->dstate == VIO_DESC_FREE) {
- D2(NULL, "%s: descriptor %lld is available",
- __func__, i);
- *priv_p = addr;
- *idx = i;
- dringp->end_idx = (i + 1) % num;
- ret = 0;
- break;
- }
+ __func__, dringp, dringp->end_idx);
+
+ addr = (vsw_private_desc_t *)dringp->priv_addr + dringp->end_idx;
+
+ mutex_enter(&addr->dstate_lock);
+ if (addr->dstate == VIO_DESC_FREE) {
+ addr->dstate = VIO_DESC_READY;
+ *priv_p = addr;
+ *idx = dringp->end_idx;
+ dringp->end_idx = (dringp->end_idx + 1) % num;
+ ret = 0;
+
}
+ mutex_exit(&addr->dstate_lock);
/* ring full */
if (ret == 1) {
- D2(NULL, "%s: no desp free: started at %d", __func__, start);
+ D2(NULL, "%s: no desp free: started at %d", __func__,
+ dringp->end_idx);
}
D1(NULL, "%s: exit\n", __func__);
@@ -6417,34 +6700,6 @@ vsw_dring_find_free_desc(dring_info_t *dringp,
}
/*
- * Copy relevant fields from the private descriptor into the
- * associated public side.
- */
-static void
-vsw_dring_priv2pub(vsw_private_desc_t *priv)
-{
- vnet_public_desc_t *pub;
- int i;
-
- D1(NULL, "vsw_dring_priv2pub enter\n");
-
- pub = priv->descp;
-
- pub->ncookies = priv->ncookies;
- pub->nbytes = priv->datalen;
-
- for (i = 0; i < pub->ncookies; i++) {
- bcopy(&priv->memcookie[i], &pub->memcookie[i],
- sizeof (ldc_mem_cookie_t));
- }
-
- pub->hdr.ack = 1;
- pub->hdr.dstate = VIO_DESC_READY;
-
- D1(NULL, "vsw_dring_priv2pub exit");
-}
-
-/*
* Map from a dring identifier to the ring itself. Returns
* pointer to ring or NULL if no match found.
*/
@@ -6487,7 +6742,10 @@ vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
lp->addr_type = ADDR_TYPE_MAC;
lp->xfer_mode = VIO_DRING_MODE;
lp->ack_freq = 0; /* for shared mode */
+
+ mutex_enter(&lp->seq_lock);
lp->seq_num = VNET_ISS;
+ mutex_exit(&lp->seq_lock);
}
/*
@@ -6650,7 +6908,9 @@ vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
}
lp->lstate = VSW_LANE_INACTIV;
+ mutex_enter(&lp->seq_lock);
lp->seq_num = VNET_ISS;
+ mutex_exit(&lp->seq_lock);
if (lp->dringp) {
if (dir == INBOUND) {
dp = lp->dringp;
@@ -6725,6 +6985,7 @@ vsw_free_ring(dring_info_t *dp)
}
paddr->memhandle = NULL;
}
+ mutex_destroy(&paddr->dstate_lock);
}
kmem_free(dp->priv_addr, (sizeof (vsw_private_desc_t)
* VSW_RING_NUM_EL));
@@ -6744,6 +7005,7 @@ vsw_free_ring(dring_info_t *dp)
mutex_exit(&dp->dlock);
mutex_destroy(&dp->dlock);
+ mutex_destroy(&dp->restart_lock);
kmem_free(dp, sizeof (dring_info_t));
dp = dpp;
diff --git a/usr/src/uts/sun4v/sys/ldc_impl.h b/usr/src/uts/sun4v/sys/ldc_impl.h
index 84fcc52b1f..4064ef99c3 100644
--- a/usr/src/uts/sun4v/sys/ldc_impl.h
+++ b/usr/src/uts/sun4v/sys/ldc_impl.h
@@ -427,6 +427,7 @@ struct ldc_chan {
boolean_t intr_pending; /* TRUE if interrupts are pending */
+ kmutex_t tx_lock; /* Transmit lock */
uint64_t tx_q_entries; /* Num entries in transmit queue */
uint64_t tx_q_va; /* Virtual addr of transmit queue */
uint64_t tx_q_ra; /* Real addr of transmit queue */
@@ -451,7 +452,6 @@ struct ldc_chan {
uint8_t pkt_payload; /* Size of packet payload */
- uint32_t first_fragment; /* Seqid of first msg fragment */
uint32_t last_msg_snt; /* Seqid of last packet sent */
uint32_t last_ack_rcd; /* Seqid of last ACK recd */
uint32_t last_msg_rcd; /* Seqid of last packet received */
diff --git a/usr/src/uts/sun4v/sys/vdsk_common.h b/usr/src/uts/sun4v/sys/vdsk_common.h
index b8251afea2..b4e6d4351f 100644
--- a/usr/src/uts/sun4v/sys/vdsk_common.h
+++ b/usr/src/uts/sun4v/sys/vdsk_common.h
@@ -79,7 +79,7 @@ extern "C" {
#define VD_MAX_COOKIES ((VD_MAX_BLOCK_SIZE / PAGESIZE) + 1)
#define VD_USEC_TIMEOUT 20000
#define VD_LDC_IDS_PROP "ldc-ids"
-#define VD_LDC_QLEN 32
+#define VD_LDC_QLEN VD_DRING_LEN
/*
* Flags used by ioctl routines to indicate if a copyin/copyout is needed
diff --git a/usr/src/uts/sun4v/sys/vio_mailbox.h b/usr/src/uts/sun4v/sys/vio_mailbox.h
index 66de0722e6..c3b74ac9be 100644
--- a/usr/src/uts/sun4v/sys/vio_mailbox.h
+++ b/usr/src/uts/sun4v/sys/vio_mailbox.h
@@ -120,6 +120,13 @@ extern "C" {
#define VIO_PAYLOAD_ELEMS (VIO_PAYLOAD_SZ / LDC_ELEM_SIZE) /* num words */
/*
+ * Peer dring processing state. Either actively processing dring
+ * or stopped.
+ */
+#define VIO_DP_ACTIVE 1
+#define VIO_DP_STOPPED 2
+
+/*
* VIO device message tag.
*
* These 64 bits are used as a common header for all VIO message types.
@@ -169,7 +176,6 @@ typedef struct vio_ver_msg {
uint64_t resv3[VIO_PAYLOAD_ELEMS - 1];
} vio_ver_msg_t;
-
/*
* VIO Descriptor Ring Register message.
*
@@ -260,10 +266,15 @@ typedef struct vio_dring_msg {
uint32_t start_idx; /* Indx of first updated elem */
int32_t end_idx; /* Indx of last updated elem */
+ uint8_t dring_process_state; /* Processing state */
+
/*
* Padding.
*/
- uint64_t resv[VIO_PAYLOAD_ELEMS - 3];
+ uint8_t resv1;
+ uint16_t resv2;
+ uint32_t resv3;
+ uint64_t resv4[VIO_PAYLOAD_ELEMS - 4];
} vio_dring_msg_t;
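The vsw.c changes above imply roughly the following use of these fields (an illustrative trace with invented indices; INFO/ACK are the vio_subtype values, ACTIVE/STOPPED the dring_process_state values):

    TX peer: INFO start_idx=0,  end_idx=-1           /* unbounded range */
    RX peer: ACK  start_idx=0,  end_idx=3,  ACTIVE   /* desc 3 had ack bit set */
    RX peer: ACK  start_idx=4,  end_idx=9,  STOPPED  /* finished processing */
    TX peer: INFO start_idx=10, end_idx=-1           /* restart, if more READY */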
/*
diff --git a/usr/src/uts/sun4v/sys/vio_util.h b/usr/src/uts/sun4v/sys/vio_util.h
new file mode 100644
index 0000000000..ab7a255f1e
--- /dev/null
+++ b/usr/src/uts/sun4v/sys/vio_util.h
@@ -0,0 +1,127 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _VIO_UTIL_H
+#define _VIO_UTIL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/stream.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * A message is composed of three structures: a message block (mblk_t), a
+ * data block to which it points, and a data buffer. desballoc(9F) allows
+ * the caller to specify the data buffer and a free function which will
+ * be invoked when freeb(9F) is called to free the message. This allows
+ * the user to reclaim and reuse the data buffer, as opposed to using
+ * allocb(9F) where the message block, data block and data buffer are
+ * all destroyed by freeb().
+ *
+ * Note that even with desballoc the message and data blocks are destroyed
+ * by freeb() and must be recreated. It is only the data buffer which is
+ * preserved.
+ *
+ * The caller first creates a pool of vio_mblk_t's by invoking
+ * vio_create_mblks() and specifying the number of mblks and the size of the
+ * associated data buffers. Each vio_mblk_t contains a pointer to the
+ * mblk_t, a pointer to the data buffer and a function pointer to the
+ * reclaim function. The caller is returned a pointer to the pool which is
+ * used in subsequent allocation/destroy requests.
+ *
+ * The pool is managed as a circular queue with a head and tail pointer.
+ * Allocation requests result in the head index being incremented; mblks
+ * returned to the pool result in the tail index being incremented.
+ *
+ * The pool can only be destroyed when all the mblks have been returned. It
+ * is the responsibility of the caller to ensure that all mblks allocated
+ * with vio_allocb() have been freed back to the pool before it is destroyed.
+ *
+ *
+ * vio_mblk_pool_t
+ * +-------------+
+ * | tail |--------------------------------+
+ * +-------------+ |
+ * | head |--------+ |
+ * +-------------+ | |
+ * ............... V V
+ * +-------------+ +-------+-------+-------+-------+
+ * | quep |---->| vmp_t | vmp_t | vmp_t | vmp_t |
+ * +-------------+ +-------+-------+-------+-------+
+ * | | | | | |
+ * ... | | | | +------------+
+ * | | | +-->| data block |
+ * | | | +------------+
+ * | | | +------------+
+ * | | +-->| data block |
+ * | | +------------+
+ * | | +------------+
+ * | +-->| data block |
+ * | +------------+
+ * | +------------+
+ * +-->| data block |
+ * +------------+
+ *
+ */
+
+struct vio_mblk_pool;
+
+typedef struct vio_mblk {
+ uint8_t *datap; /* data buffer */
+ mblk_t *mp; /* mblk using datap */
+ frtn_t reclaim; /* mblk reclaim routine */
+ struct vio_mblk_pool *vmplp; /* pointer to parent pool */
+} vio_mblk_t;
+
+typedef struct vio_mblk_pool {
+ struct vio_mblk_pool *nextp; /* next in a list */
+ kmutex_t hlock; /* sync access to head */
+ kmutex_t tlock; /* sync access to tail */
+ vio_mblk_t *basep; /* base pointer to pool of vio_mblks */
+ vio_mblk_t **quep; /* queue of free vio_mblks */
+ uint8_t *datap; /* rx data buffer area */
+ uint32_t head; /* queue head */
+ uint32_t tail; /* queue tail */
+ uint64_t quelen; /* queue len (# mblks) */
+ uint64_t quemask; /* quelen - 1 */
+ size_t mblk_size; /* data buf size of each mblk */
+} vio_mblk_pool_t;
+
+int vio_create_mblks(uint64_t num_mblks,
+ size_t mblk_size, vio_mblk_pool_t **);
+int vio_destroy_mblks(vio_mblk_pool_t *);
+mblk_t *vio_allocb(vio_mblk_pool_t *);
+void vio_freeb(void *arg);
+
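For context, a minimal usage sketch of this interface (illustrative only, not part of the patch; the counts mirror the VNET_NRBUFS and VSW_MBLK_SIZE defaults added elsewhere in this changeset, and the destroy handling assumes vio_destroy_mblks() returns nonzero while buffers are still outstanding, per the comment above):

    vio_mblk_pool_t	*pool;
    mblk_t		*mp;

    /* create a pool of 512 mblks, each with a 2048-byte data buffer */
    if (vio_create_mblks(512, 2048, &pool) != 0)
    	return;

    mp = vio_allocb(pool);		/* NULL when the pool is exhausted */
    if (mp != NULL) {
    	/* ... fill in and pass the mblk upstream ... */
    	freeb(mp);			/* reclaim routine returns buffer to pool */
    }

    if (vio_destroy_mblks(pool) != 0) {
    	/* some mblks still outstanding; retry the destroy later */
    }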
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VIO_UTIL_H */
diff --git a/usr/src/uts/sun4v/sys/vldc_impl.h b/usr/src/uts/sun4v/sys/vldc_impl.h
index 8610344b42..ffdd97636b 100644
--- a/usr/src/uts/sun4v/sys/vldc_impl.h
+++ b/usr/src/uts/sun4v/sys/vldc_impl.h
@@ -52,6 +52,8 @@ extern "C" {
#define VLDC_MINOR_MASK (VLDC_MAX_PORTS - 1)
#define VLDC_INST_SHIFT 11
+#define VLDC_HVCTL_SVCNAME "hvctl"
+
/* get port number from minor number */
#define VLDCPORT(vldcp, minor) \
((vldcp)->minor_tbl[(minor) & VLDC_MINOR_MASK].portno)
@@ -95,6 +97,7 @@ typedef struct vldc_port {
uint32_t mtu; /* port mtu */
caddr_t send_buf; /* send buffer */
caddr_t recv_buf; /* receive buffer */
+ caddr_t cookie_buf; /* rd/wr cookie buffer */
uint64_t ldc_id; /* Channel number */
ldc_handle_t ldc_handle; /* Channel handle */
diff --git a/usr/src/uts/sun4v/sys/vnet.h b/usr/src/uts/sun4v/sys/vnet.h
index c43af5bfab..53202f7601 100644
--- a/usr/src/uts/sun4v/sys/vnet.h
+++ b/usr/src/uts/sun4v/sys/vnet.h
@@ -44,6 +44,7 @@ extern "C" {
#define VNET_LDCWD_INTERVAL 1000 /* watchdog freq in msec */
#define VNET_LDCWD_TXTIMEOUT 1000 /* tx timeout in msec */
#define VNET_LDC_QLEN 1024 /* ldc qlen */
+#define VNET_NRBUFS 512 /* number of receive bufs */
/*
* vnet proxy transport layer information. There is one instance of this for
diff --git a/usr/src/uts/sun4v/sys/vnet_common.h b/usr/src/uts/sun4v/sys/vnet_common.h
index feed7025a2..575db18efb 100644
--- a/usr/src/uts/sun4v/sys/vnet_common.h
+++ b/usr/src/uts/sun4v/sys/vnet_common.h
@@ -43,11 +43,13 @@ extern "C" {
*/
/* max # of cookies per frame size */
-#define MAX_COOKIES ((ETHERMAX >> MMU_PAGESHIFT) + 2)
+#define MAX_COOKIES ((ETHERMAX >> MMU_PAGESHIFT) + 2ULL)
/* initial send sequence number */
#define VNET_ISS 0x1
+#define VNET_IPALIGN 6 /* padding for IP header alignment */
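A plausible reading of the value 6 (an inference, not stated in the patch): receive buffers must start 8-byte aligned for ldc_mem_copy(), and a standard Ethernet header is 14 bytes, so skipping 6 bytes of pad places the IP header at offset 6 + 14 = 20, a 4-byte aligned offset, while the buffer start itself remains 8-byte aligned.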
+
/* vnet descriptor */
typedef struct vnet_public_desc {
vio_dring_entry_hdr_t hdr; /* descriptor header */
diff --git a/usr/src/uts/sun4v/sys/vnet_gen.h b/usr/src/uts/sun4v/sys/vnet_gen.h
index c6ad5fe8c0..3166a3412d 100644
--- a/usr/src/uts/sun4v/sys/vnet_gen.h
+++ b/usr/src/uts/sun4v/sys/vnet_gen.h
@@ -69,7 +69,7 @@ extern "C" {
#define LDC_TO_VNET(ldcp) ((ldcp)->portp->vgenp->vnetp)
#define LDC_TO_VGEN(ldcp) ((ldcp)->portp->vgenp)
-#define VGEN_TX_DBLK_SZ 2048 /* tx data buffer size */
+#define VGEN_DBLK_SZ 2048 /* data buffer size */
#define VGEN_LDC_UP_DELAY 100 /* usec delay between ldc_up retries */
/* get the address of next tbuf */
@@ -107,7 +107,6 @@ typedef struct vgen_priv_desc {
ldc_mem_handle_t memhandle; /* mem handle for data */
caddr_t datap; /* prealloc'd tx data buffer */
uint64_t datalen; /* total actual datalen */
- uint64_t seqnum; /* sequence number of pkt */
uint64_t ncookies; /* num ldc_mem_cookies */
ldc_mem_cookie_t memcookie[MAX_COOKIES]; /* data cookies */
} vgen_private_desc_t;
@@ -147,13 +146,10 @@ typedef struct vgen_ver {
typedef struct vgen_stats {
/* Link Input/Output stats */
- uint64_t ipackets;
- uint64_t ierrors;
- uint64_t opackets;
- uint64_t oerrors;
-#if 0
- uint64_t collisions;
-#endif
+ uint64_t ipackets; /* # rx packets */
+	uint64_t	ierrors;	/* # rx errors */
+	uint64_t	opackets;	/* # tx packets */
+	uint64_t	oerrors;	/* # tx errors */
/* MIB II variables */
uint64_t rbytes; /* # bytes received */
@@ -166,17 +162,18 @@ typedef struct vgen_stats {
uint32_t noxmtbuf; /* # xmit packets discarded */
/* Tx Statistics */
- uint32_t tx_no_desc;
- uint32_t tx_allocb_fail;
+ uint32_t tx_no_desc; /* # out of transmit descriptors */
/* Rx Statistics */
- uint32_t rx_no_desc;
- uint32_t rx_allocb_fail;
- uint32_t rx_lost_pkts;
+ uint32_t rx_allocb_fail; /* # rx buf allocb() failures */
+ uint32_t rx_vio_allocb_fail; /* # vio_allocb() failures */
+ uint32_t rx_lost_pkts; /* # rx lost packets */
/* Callback statistics */
- uint32_t callbacks;
- uint32_t dring_data_acks;
+ uint32_t callbacks; /* # callbacks */
+ uint32_t dring_data_acks; /* # dring data acks recvd */
+ uint32_t dring_stopped_acks; /* # dring stopped acks recvd */
+ uint32_t dring_data_msgs; /* # dring data msgs sent */
} vgen_stats_t;
@@ -190,9 +187,7 @@ typedef struct vgen_kstats {
kstat_named_t opackets;
kstat_named_t opackets64;
kstat_named_t oerrors;
-#if 0
- kstat_named_t collisions;
-#endif
+
/*
* required by kstat for MIB II objects(RFC 1213)
*/
@@ -208,17 +203,18 @@ typedef struct vgen_kstats {
kstat_named_t noxmtbuf; /* MIB - ifOutDiscards */
/* Tx Statistics */
- kstat_named_t tx_no_desc;
- kstat_named_t tx_allocb_fail;
+ kstat_named_t tx_no_desc; /* # out of transmit descriptors */
/* Rx Statistics */
- kstat_named_t rx_no_desc;
- kstat_named_t rx_allocb_fail;
- kstat_named_t rx_lost_pkts;
+ kstat_named_t rx_allocb_fail; /* # rx buf allocb failures */
+ kstat_named_t rx_vio_allocb_fail; /* # vio_allocb() failures */
+ kstat_named_t rx_lost_pkts; /* # rx lost packets */
/* Callback statistics */
- kstat_named_t callbacks;
- kstat_named_t dring_data_acks;
+ kstat_named_t callbacks; /* # callbacks */
+ kstat_named_t dring_data_acks; /* # dring data acks recvd */
+ kstat_named_t dring_stopped_acks; /* # dring stopped acks recvd */
+ kstat_named_t dring_data_msgs; /* # dring data msgs sent */
} vgen_kstats_t;
@@ -277,6 +273,8 @@ typedef struct vgen_ldc {
uint32_t next_rxi; /* next expected recv index */
uint32_t num_rxds; /* number of rx descriptors */
caddr_t tx_datap; /* prealloc'd tx data area */
+ vio_mblk_pool_t *rmp; /* rx mblk pool */
+ uint32_t num_rbufs; /* number of rx bufs */
/* misc */
uint32_t flags; /* flags */
@@ -284,6 +282,7 @@ typedef struct vgen_ldc {
boolean_t need_ldc_reset; /* ldc_reset needed */
boolean_t need_mcast_sync; /* sync mcast table with vsw */
uint32_t hretries; /* handshake retry count */
+ boolean_t resched_peer; /* send tx msg to peer */
/* channel statistics */
vgen_stats_t *statsp; /* channel statistics */
@@ -329,6 +328,7 @@ typedef struct vgen {
struct ether_addr *mctab; /* multicast addr table */
uint32_t mcsize; /* allocated size of mctab */
uint32_t mccount; /* # of valid addrs in mctab */
+ vio_mblk_pool_t *rmp; /* rx mblk pools to be freed */
} vgen_t;
#ifdef __cplusplus
diff --git a/usr/src/uts/sun4v/sys/vsw.h b/usr/src/uts/sun4v/sys/vsw.h
index fccb3c6fb8..b1df247547 100644
--- a/usr/src/uts/sun4v/sys/vsw.h
+++ b/usr/src/uts/sun4v/sys/vsw.h
@@ -82,6 +82,7 @@ extern "C" {
#include <sys/vio_mailbox.h>
#include <sys/vnet_common.h>
#include <sys/ethernet.h>
+#include <sys/vio_util.h>
/*
* Default message type.
@@ -209,9 +210,21 @@ typedef struct ver_sup {
#define VSW_MAX_COOKIES ((ETHERMTU >> MMU_PAGESHIFT) + 2)
/*
+ * Size and number of mblks to be created in free pool.
+ */
+#define VSW_MBLK_SIZE 2048
+#define VSW_NUM_MBLKS 1024
+
+/*
* Private descriptor
*/
typedef struct vsw_private_desc {
+ /*
+	 * The lock below must be held when accessing the state of
+	 * a descriptor on either the private or public section
+	 * of the ring.
+ */
+ kmutex_t dstate_lock;
uint64_t dstate;
vnet_public_desc_t *descp;
ldc_mem_handle_t memhandle;
@@ -237,6 +250,10 @@ typedef struct dring_info {
ldc_dring_handle_t handle;
uint64_t ident; /* identifier sent to peer */
uint64_t end_idx; /* last idx processed */
+ int64_t last_ack_recv;
+
+ kmutex_t restart_lock;
+ boolean_t restart_reqd; /* send restart msg */
/*
* base address of private and public portions of the
@@ -258,6 +275,7 @@ typedef struct lane {
uint64_t lstate; /* Lane state */
uint32_t ver_major:16, /* Version major number */
ver_minor:16; /* Version minor number */
+ kmutex_t seq_lock;
uint64_t seq_num; /* Sequence number */
uint64_t mtu; /* ETHERMTU */
uint64_t addr; /* Unique physical address */
@@ -295,6 +313,7 @@ typedef struct vsw_ldc {
lane_t lane_in; /* Inbound lane */
lane_t lane_out; /* Outbound lane */
uint8_t dev_class; /* Peer device class */
+ vio_mblk_pool_t *rxh; /* Receive pool handle */
} vsw_ldc_t;
/* list of ldcs per port */
@@ -407,6 +426,8 @@ typedef struct vsw {
mod_hash_t *mfdb; /* multicast FDB */
krwlock_t mfdbrw; /* rwlock for mFDB */
+ vio_mblk_pool_t *rxh; /* Receive pool handle */
+
/* mac layer */
mac_handle_t mh;
mac_rx_handle_t mrh;
diff --git a/usr/src/uts/sun4v/vnet/Makefile b/usr/src/uts/sun4v/vnet/Makefile
index 2eed19f4bc..a07417544b 100644
--- a/usr/src/uts/sun4v/vnet/Makefile
+++ b/usr/src/uts/sun4v/vnet/Makefile
@@ -69,9 +69,6 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
#
CFLAGS += $(CCVERBOSE)
CFLAGS += -DVGEN_HANDLE_LOST_PKTS
-#CFLAGS += -DVGEN_USE_MAC_TX_UPDATE
-#CFLAGS += -DVGEN_REXMIT
-
#
# Driver depends on MAC & IP