-rw-r--r--  usr/src/uts/sun4v/io/vdc.c   | 1408
-rw-r--r--  usr/src/uts/sun4v/sys/vdc.h  |   55
2 files changed, 589 insertions(+), 874 deletions(-)
diff --git a/usr/src/uts/sun4v/io/vdc.c b/usr/src/uts/sun4v/io/vdc.c
index ea30b337b2..b7729adeed 100644
--- a/usr/src/uts/sun4v/io/vdc.c
+++ b/usr/src/uts/sun4v/io/vdc.c
@@ -69,7 +69,6 @@
#include <sys/mdeg.h>
#include <sys/note.h>
#include <sys/open.h>
-#include <sys/random.h>
#include <sys/sdt.h>
#include <sys/stat.h>
#include <sys/sunddi.h>
@@ -83,7 +82,6 @@
#include <sys/cdio.h>
#include <sys/dktp/fdisk.h>
#include <sys/dktp/dadkio.h>
-#include <sys/fs/dv_node.h>
#include <sys/mhd.h>
#include <sys/scsi/generic/sense.h>
#include <sys/scsi/impl/uscsi.h>
@@ -176,20 +174,18 @@ static int vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg);
static int vdc_handle_dring_reg_msg(vdc_t *vdc, vio_dring_reg_msg_t *msg);
static int vdc_send_request(vdc_t *vdcp, int operation,
caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
- buf_t *bufp, vio_desc_direction_t dir, int flags);
+ int cb_type, void *cb_arg, vio_desc_direction_t dir);
static int vdc_map_to_shared_dring(vdc_t *vdcp, int idx);
static int vdc_populate_descriptor(vdc_t *vdcp, int operation,
caddr_t addr, size_t nbytes, int slice, diskaddr_t offset,
- buf_t *bufp, vio_desc_direction_t dir, int flags);
+ int cb_type, void *cb_arg, vio_desc_direction_t dir);
static int vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr,
- size_t nbytes, int slice, diskaddr_t offset,
- vio_desc_direction_t dir, boolean_t);
-static int vdc_do_op(vdc_t *vdc, int op, caddr_t addr, size_t nbytes,
- int slice, diskaddr_t offset, struct buf *bufp,
- vio_desc_direction_t dir, int flags);
+ size_t nbytes, int slice, diskaddr_t offset, int cb_type,
+ void *cb_arg, vio_desc_direction_t dir, boolean_t);
static int vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp);
-static int vdc_drain_response(vdc_t *vdcp, struct buf *buf);
+static int vdc_drain_response(vdc_t *vdcp, vio_cb_type_t cb_type,
+ struct buf *buf);
static int vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx);
static int vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep);
static int vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg);
@@ -226,12 +222,9 @@ static int vdc_set_efi_convert(vdc_t *vdc, void *from, void *to,
int mode, int dir);
static void vdc_ownership_update(vdc_t *vdc, int ownership_flags);
-static int vdc_access_set(vdc_t *vdc, uint64_t flags);
-static vdc_io_t *vdc_eio_queue(vdc_t *vdc, int index);
-static void vdc_eio_unqueue(vdc_t *vdc, clock_t deadline,
- boolean_t complete_io);
-static int vdc_eio_check(vdc_t *vdc, int flags);
-static void vdc_eio_thread(void *arg);
+static int vdc_access_set(vdc_t *vdc, uint64_t flags, int mode);
+static vdc_io_t *vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf);
+static int vdc_failfast_check_resv(vdc_t *vdc);
/*
* Module variables
@@ -399,7 +392,7 @@ vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
static int
vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
- kt_did_t eio_tid, ownership_tid;
+ kt_did_t failfast_tid, ownership_tid;
int instance;
int rv;
vdc_server_t *srvr;
@@ -425,7 +418,14 @@ vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
return (DDI_FAILURE);
}
- if (vdc_is_opened(vdc)) {
+ /*
+ * This function is called when vdc is detached or if it has failed to
+ * attach. In that case, the attach may have failed before the vdisk type
+ * has been set so we can't call vdc_is_opened(). However, as the attach
+ * has failed, we know that the vdisk is not opened and we can safely
+ * detach.
+ */
+ if (vdc->vdisk_type != VD_DISK_TYPE_UNK && vdc_is_opened(vdc)) {
DMSG(vdc, 0, "[%d] Cannot detach: device is open", instance);
return (DDI_FAILURE);
}
@@ -449,7 +449,7 @@ vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
/* If we took ownership, release ownership */
mutex_enter(&vdc->ownership_lock);
if (vdc->ownership & VDC_OWNERSHIP_GRANTED) {
- rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR);
+ rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, FKIOCTL);
if (rv == 0) {
vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
}
@@ -487,9 +487,6 @@ vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
instance);
vdc->state = VDC_STATE_RESETTING;
cv_signal(&vdc->initwait_cv);
- } else if (vdc->state == VDC_STATE_FAILED) {
- vdc->io_pending = B_TRUE;
- cv_signal(&vdc->io_pending_cv);
}
mutex_exit(&vdc->lock);
@@ -507,13 +504,12 @@ vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
vdc_fini_ports(vdc);
- if (vdc->eio_thread) {
- eio_tid = vdc->eio_thread->t_did;
+ if (vdc->failfast_thread) {
+ failfast_tid = vdc->failfast_thread->t_did;
vdc->failfast_interval = 0;
- ASSERT(vdc->num_servers == 0);
- cv_signal(&vdc->eio_cv);
+ cv_signal(&vdc->failfast_cv);
} else {
- eio_tid = 0;
+ failfast_tid = 0;
}
if (vdc->ownership & VDC_OWNERSHIP_WANTED) {
@@ -526,8 +522,8 @@ vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
mutex_exit(&vdc->lock);
- if (eio_tid != 0)
- thread_join(eio_tid);
+ if (failfast_tid != 0)
+ thread_join(failfast_tid);
if (ownership_tid != 0)
thread_join(ownership_tid);
@@ -552,12 +548,13 @@ vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
cv_destroy(&vdc->initwait_cv);
cv_destroy(&vdc->dring_free_cv);
cv_destroy(&vdc->membind_cv);
+ cv_destroy(&vdc->sync_pending_cv);
cv_destroy(&vdc->sync_blocked_cv);
cv_destroy(&vdc->read_cv);
cv_destroy(&vdc->running_cv);
- cv_destroy(&vdc->io_pending_cv);
cv_destroy(&vdc->ownership_cv);
- cv_destroy(&vdc->eio_cv);
+ cv_destroy(&vdc->failfast_cv);
+ cv_destroy(&vdc->failfast_io_cv);
}
if (vdc->minfo)
@@ -650,16 +647,17 @@ vdc_do_attach(dev_info_t *dip)
cv_init(&vdc->dring_free_cv, NULL, CV_DRIVER, NULL);
cv_init(&vdc->membind_cv, NULL, CV_DRIVER, NULL);
cv_init(&vdc->running_cv, NULL, CV_DRIVER, NULL);
- cv_init(&vdc->io_pending_cv, NULL, CV_DRIVER, NULL);
- vdc->io_pending = B_FALSE;
vdc->threads_pending = 0;
+ vdc->sync_op_pending = B_FALSE;
vdc->sync_op_blocked = B_FALSE;
+ cv_init(&vdc->sync_pending_cv, NULL, CV_DRIVER, NULL);
cv_init(&vdc->sync_blocked_cv, NULL, CV_DRIVER, NULL);
mutex_init(&vdc->ownership_lock, NULL, MUTEX_DRIVER, NULL);
cv_init(&vdc->ownership_cv, NULL, CV_DRIVER, NULL);
- cv_init(&vdc->eio_cv, NULL, CV_DRIVER, NULL);
+ cv_init(&vdc->failfast_cv, NULL, CV_DRIVER, NULL);
+ cv_init(&vdc->failfast_io_cv, NULL, CV_DRIVER, NULL);
/* init blocking msg read functionality */
mutex_init(&vdc->read_lock, NULL, MUTEX_DRIVER, NULL);
@@ -701,19 +699,6 @@ vdc_do_attach(dev_info_t *dip)
return (DDI_FAILURE);
}
- /*
- * If there are multiple servers then start the eio thread.
- */
- if (vdc->num_servers > 1) {
- vdc->eio_thread = thread_create(NULL, 0, vdc_eio_thread, vdc, 0,
- &p0, TS_RUN, v.v_maxsyspri - 2);
- if (vdc->eio_thread == NULL) {
- cmn_err(CE_NOTE, "[%d] Failed to create error "
- "I/O thread", instance);
- return (DDI_FAILURE);
- }
- }
-
vdc->initialized |= VDC_THREAD;
atomic_inc_32(&vdc_instance_count);
@@ -740,6 +725,13 @@ vdc_do_attach(dev_info_t *dip)
}
/*
+ * Setup devid
+ */
+ if (vdc_setup_devid(vdc)) {
+ DMSG(vdc, 0, "[%d] No device id available\n", instance);
+ }
+
+ /*
* Fill in the fields of the error statistics kstat that were not
* available when creating the kstat
*/
@@ -1037,6 +1029,7 @@ vdc_create_device_nodes_vtoc(vdc_t *vdc)
* Return Values
* 0 - Success
* EIO - Failed to create node
+ * EINVAL - Unknown type of disk exported
*/
static int
vdc_create_device_nodes(vdc_t *vdc)
@@ -1054,14 +1047,14 @@ vdc_create_device_nodes(vdc_t *vdc)
switch (vdc->vdisk_type) {
case VD_DISK_TYPE_DISK:
- case VD_DISK_TYPE_UNK:
num_slices = V_NUMPAR;
break;
case VD_DISK_TYPE_SLICE:
num_slices = 1;
break;
+ case VD_DISK_TYPE_UNK:
default:
- ASSERT(0);
+ return (EINVAL);
}
/*
@@ -1159,10 +1152,22 @@ vdc_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
static boolean_t
vdc_is_opened(vdc_t *vdc)
{
- int i;
+ int i, nslices;
+
+ switch (vdc->vdisk_type) {
+ case VD_DISK_TYPE_DISK:
+ nslices = V_NUMPAR;
+ break;
+ case VD_DISK_TYPE_SLICE:
+ nslices = 1;
+ break;
+ case VD_DISK_TYPE_UNK:
+ default:
+ ASSERT(0);
+ }
/* check if there's any layered open */
- for (i = 0; i < V_NUMPAR; i++) {
+ for (i = 0; i < nslices; i++) {
if (vdc->open_lyr[i] > 0)
return (B_TRUE);
}
@@ -1188,15 +1193,6 @@ vdc_mark_opened(vdc_t *vdc, int slice, int flag, int otyp)
slicemask = 1 << slice;
- /*
- * If we have a single-slice disk which was unavailable during the
- * attach then a device was created for each 8 slices. Now that
- * the type is known, we prevent opening any slice other than 0
- * even if a device still exists.
- */
- if (vdc->vdisk_type == VD_DISK_TYPE_SLICE && slice != 0)
- return (EIO);
-
/* check if slice is already exclusively opened */
if (vdc->open_excl & slicemask)
return (EBUSY);
@@ -1285,12 +1281,7 @@ vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred)
return (status);
}
- /*
- * If the disk type is unknown then we have to wait for the
- * handshake to complete because we don't know if the slice
- * device we are opening effectively exists.
- */
- if (vdc->vdisk_type != VD_DISK_TYPE_UNK && nodelay) {
+ if (nodelay) {
/* don't resubmit a validate request if there's already one */
if (vdc->validate_pending > 0) {
@@ -1317,10 +1308,8 @@ vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred)
mutex_enter(&vdc->lock);
- if (vdc->vdisk_type == VD_DISK_TYPE_UNK ||
- (vdc->vdisk_type == VD_DISK_TYPE_SLICE && slice != 0) ||
- (!nodelay && (vdc->vdisk_label == VD_DISK_LABEL_UNK ||
- vdc->slice[slice].nblocks == 0))) {
+ if (vdc->vdisk_label == VD_DISK_LABEL_UNK ||
+ vdc->slice[slice].nblocks == 0) {
vdc_mark_closed(vdc, slice, flag, otyp);
status = EIO;
}
@@ -1392,7 +1381,7 @@ vdc_print(dev_t dev, char *str)
static int
vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
- int rv, flags;
+ int rv;
size_t nbytes = nblk * DEV_BSIZE;
int instance = VDCUNIT(dev);
vdc_t *vdc = NULL;
@@ -1413,20 +1402,16 @@ vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
}
vio_blkno = blkno >> vdc->vio_bshift;
- /*
- * If we are panicking, we need the state to be "running" so that we
- * can submit I/Os, but we don't want to check for any backend error.
- */
- flags = (ddi_in_panic())? VDC_OP_STATE_RUNNING : VDC_OP_NORMAL;
-
- rv = vdc_do_op(vdc, VD_OP_BWRITE, addr, nbytes, VDCPART(dev),
- vio_blkno, NULL, VIO_write_dir, flags);
-
+ rv = vdc_send_request(vdc, VD_OP_BWRITE, addr, nbytes,
+ VDCPART(dev), vio_blkno, CB_STRATEGY, 0, VIO_write_dir);
if (rv) {
DMSG(vdc, 0, "Failed to do a disk dump (err=%d)\n", rv);
return (rv);
}
+ if (ddi_in_panic())
+ (void) vdc_drain_response(vdc, CB_STRATEGY, NULL);
+
DMSG(vdc, 0, "[%d] End\n", instance);
return (0);
@@ -1450,6 +1435,7 @@ static int
vdc_strategy(struct buf *buf)
{
diskaddr_t vio_blkno;
+ int rv = -1;
vdc_t *vdc = NULL;
int instance = VDCUNIT(buf->b_edev);
int op = (buf->b_flags & B_READ) ? VD_OP_BREAD : VD_OP_BWRITE;
@@ -1488,11 +1474,27 @@ vdc_strategy(struct buf *buf)
}
vio_blkno = buf->b_lblkno >> vdc->vio_bshift;
- /* submit the I/O, any error will be reported in the buf structure */
- (void) vdc_do_op(vdc, op, (caddr_t)buf->b_un.b_addr,
+ rv = vdc_send_request(vdc, op, (caddr_t)buf->b_un.b_addr,
buf->b_bcount, slice, vio_blkno,
- buf, (op == VD_OP_BREAD) ? VIO_read_dir : VIO_write_dir,
- VDC_OP_NORMAL);
+ CB_STRATEGY, buf, (op == VD_OP_BREAD) ? VIO_read_dir :
+ VIO_write_dir);
+
+ /*
+ * If the request was successfully sent, the strategy call returns and
+ * the ACK handler calls the bioxxx functions when the vDisk server is
+ * done otherwise we handle the error here.
+ */
+ if (rv) {
+ DMSG(vdc, 0, "Failed to read/write (err=%d)\n", rv);
+ bioerror(buf, rv);
+ biodone(buf);
+ } else if (ddi_in_panic()) {
+ rv = vdc_drain_response(vdc, CB_STRATEGY, buf);
+ if (rv != 0) {
+ bioerror(buf, EIO);
+ biodone(buf);
+ }
+ }
return (0);
}
@@ -2366,8 +2368,6 @@ vdc_init_ports(vdc_t *vdc, md_t *mdp, mde_cookie_t vd_nodep)
vd_port = portp[idx];
srvr = kmem_zalloc(sizeof (vdc_server_t), KM_SLEEP);
srvr->vdcp = vdc;
- srvr->svc_state = VDC_SERVICE_OFFLINE;
- srvr->log_state = VDC_SERVICE_NONE;
/* get port id */
if (md_get_prop_val(mdp, vd_port, VDC_MD_ID, &srvr->id) != 0) {
@@ -2587,7 +2587,6 @@ vdc_fini_ports(vdc_t *vdc)
}
vdc->server_list = NULL;
- vdc->num_servers = 0;
}
/* -------------------------------------------------------------------------- */
@@ -2884,7 +2883,10 @@ vdc_map_to_shared_dring(vdc_t *vdcp, int idx)
* nbytes - number of bytes to read/write
* slice - the disk slice this request is for
* offset - relative disk offset
- * bufp - buf of operation
+ * cb_type - type of call - STRATEGY or SYNC
+ * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type)
+ * . mode for ioctl(9e)
+ * . LP64 diskaddr_t (block I/O)
* dir - direction of operation (READ/WRITE/BOTH)
*
* Return Codes:
@@ -2893,8 +2895,8 @@ vdc_map_to_shared_dring(vdc_t *vdcp, int idx)
*/
static int
vdc_send_request(vdc_t *vdcp, int operation, caddr_t addr,
- size_t nbytes, int slice, diskaddr_t offset, buf_t *bufp,
- vio_desc_direction_t dir, int flags)
+ size_t nbytes, int slice, diskaddr_t offset, int cb_type,
+ void *cb_arg, vio_desc_direction_t dir)
{
int rv = 0;
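The two callback conventions documented above can be seen at the call sites
elsewhere in this diff; a condensed sketch (locals such as buf, slice,
vio_blkno and mode are assumed from the surrounding callers, and sync callers
normally reach this function through vdc_do_sync_op(), which forwards these
arguments):

	/* block I/O: CB_STRATEGY, the buf itself is the callback argument */
	rv = vdc_send_request(vdc, VD_OP_BREAD, (caddr_t)buf->b_un.b_addr,
	    buf->b_bcount, slice, vio_blkno, CB_STRATEGY, buf, VIO_read_dir);

	/* synchronous op: CB_SYNC, the ioctl(9e) mode is encoded in cb_arg */
	rv = vdc_send_request(vdc, VD_OP_RESET, NULL, 0, 0, 0,
	    CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir);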
@@ -2915,20 +2917,10 @@ vdc_send_request(vdc_t *vdcp, int operation, caddr_t addr,
* higher up the stack in vdc_strategy() et al.
*/
if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) {
- DTRACE_IO1(start, buf_t *, bufp);
+ DTRACE_IO1(start, buf_t *, cb_arg);
VD_KSTAT_WAITQ_ENTER(vdcp);
}
- /*
- * If the request does not expect the state to be VDC_STATE_RUNNING
- * then we just try to populate the descriptor ring once.
- */
- if (!(flags & VDC_OP_STATE_RUNNING)) {
- rv = vdc_populate_descriptor(vdcp, operation, addr,
- nbytes, slice, offset, bufp, dir, flags);
- goto done;
- }
-
do {
while (vdcp->state != VDC_STATE_RUNNING) {
@@ -2938,6 +2930,12 @@ vdc_send_request(vdc_t *vdcp, int operation, caddr_t addr,
goto done;
}
+ /* fail request if connection timeout is reached */
+ if (vdcp->ctimeout_reached) {
+ rv = EIO;
+ goto done;
+ }
+
/*
* If we are panicking and the disk is not ready then
* we can't send any request because we can't complete
@@ -2948,27 +2946,11 @@ vdc_send_request(vdc_t *vdcp, int operation, caddr_t addr,
goto done;
}
- /*
- * If the state is faulted, notify that a new I/O is
- * being submitted to force the system to check if any
- * server has recovered.
- */
- if (vdcp->state == VDC_STATE_FAILED) {
- vdcp->io_pending = B_TRUE;
- cv_signal(&vdcp->io_pending_cv);
- }
-
cv_wait(&vdcp->running_cv, &vdcp->lock);
-
- /* if service is still faulted then fail the request */
- if (vdcp->state == VDC_STATE_FAILED) {
- rv = EIO;
- goto done;
- }
}
} while (vdc_populate_descriptor(vdcp, operation, addr,
- nbytes, slice, offset, bufp, dir, flags));
+ nbytes, slice, offset, cb_type, cb_arg, dir));
done:
/*
@@ -2981,11 +2963,11 @@ done:
if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) {
if (rv == 0) {
VD_KSTAT_WAITQ_TO_RUNQ(vdcp);
- DTRACE_PROBE1(send, buf_t *, bufp);
+ DTRACE_PROBE1(send, buf_t *, cb_arg);
} else {
VD_UPDATE_ERR_STATS(vdcp, vd_transerrs);
VD_KSTAT_WAITQ_EXIT(vdcp);
- DTRACE_IO1(done, buf_t *, bufp);
+ DTRACE_IO1(done, buf_t *, cb_arg);
}
}
@@ -3011,7 +2993,10 @@ done:
* nbytes - number of bytes to read/write
* slice - the disk slice this request is for
* offset - relative disk offset
- * bufp - buf of operation
+ * cb_type - type of call - STRATEGY or SYNC
+ * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type)
+ * . mode for ioctl(9e)
+ * . LP64 diskaddr_t (block I/O)
* dir - direction of operation (READ/WRITE/BOTH)
*
* Return Codes:
@@ -3022,8 +3007,8 @@ done:
*/
static int
vdc_populate_descriptor(vdc_t *vdcp, int operation, caddr_t addr,
- size_t nbytes, int slice, diskaddr_t offset,
- buf_t *bufp, vio_desc_direction_t dir, int flags)
+ size_t nbytes, int slice, diskaddr_t offset, int cb_type,
+ void *cb_arg, vio_desc_direction_t dir)
{
vdc_local_desc_t *local_dep = NULL; /* Local Dring Pointer */
int idx; /* Index of DRing entry used */
@@ -3065,9 +3050,9 @@ loop:
local_dep->nbytes = nbytes;
local_dep->slice = slice;
local_dep->offset = offset;
- local_dep->buf = bufp;
+ local_dep->cb_type = cb_type;
+ local_dep->cb_arg = cb_arg;
local_dep->dir = dir;
- local_dep->flags = flags;
local_dep->is_free = B_FALSE;
@@ -3139,127 +3124,11 @@ cleanup_and_exit:
/*
* Function:
- * vdc_do_op
- *
- * Description:
- * Wrapper around vdc_submit_request(). Each request is associated with a
- * buf structure. If a buf structure is provided (bufp != NULL) then the
- * request will be submitted with that buf, and the caller can wait for
- * completion of the request with biowait(). If a buf structure is not
- * provided (bufp == NULL) then a buf structure is created and the function
- * waits for the completion of the request.
- *
- * If the flag VD_OP_STATE_RUNNING is set then vdc_submit_request() will
- * submit the request only when the vdisk is in state VD_STATE_RUNNING.
- * If the vdisk is not in that state then the vdc_submit_request() will
- * wait for that state to be reached. After the request is submitted, the
- * reply will be processed asynchronously by the vdc_process_msg_thread()
- * thread.
- *
- * If the flag VD_OP_STATE_RUNNING is not set then vdc_submit_request()
- * submits the request whatever the state of the vdisk is. Then vdc_do_op()
- * will wait for a reply message, process the reply and complete the
- * request.
- *
- * Arguments:
- * vdc - the soft state pointer
- * op - operation we want vds to perform (VD_OP_XXX)
- * addr - address of data buf to be read/written.
- * nbytes - number of bytes to read/write
- * slice - the disk slice this request is for
- * offset - relative disk offset
- * bufp - buf structure associated with the request (can be NULL).
- * dir - direction of operation (READ/WRITE/BOTH)
- * flags - flags for the request.
- *
- * Return Codes:
- * 0 - the request has been successfully submitted and completed.
- * != 0 - the request has failed. In that case, if a buf structure
- * was provided (bufp != NULL) then the B_ERROR flag is set
- * and the b_error field of the buf structure is set to EIO.
- */
-static int
-vdc_do_op(vdc_t *vdc, int op, caddr_t addr, size_t nbytes, int slice,
- diskaddr_t offset, struct buf *bufp, vio_desc_direction_t dir, int flags)
-{
- vio_msg_t vio_msg;
- struct buf buf;
- int rv;
-
- if (bufp == NULL) {
- /*
- * We use buf just as a convenient way to get a notification
- * that the request is completed, so we initialize buf to the
- * minimum we need.
- */
- bioinit(&buf);
- buf.b_bcount = nbytes;
- buf.b_flags = B_BUSY;
- bufp = &buf;
- }
-
- rv = vdc_send_request(vdc, op, addr, nbytes, slice, offset, bufp,
- dir, flags);
-
- if (rv != 0)
- goto done;
-
- /*
- * If the request should be done in VDC_STATE_RUNNING state then the
- * reply will be received and processed by vdc_process_msg_thread()
- * and we just have to handle the panic case. Otherwise we have to
- * wait for the reply message and process it.
- */
- if (flags & VDC_OP_STATE_RUNNING) {
-
- if (ddi_in_panic()) {
- rv = vdc_drain_response(vdc, bufp);
- goto done;
- }
-
- } else {
- /* wait for the response message */
- rv = vdc_wait_for_response(vdc, &vio_msg);
- if (rv) {
- /*
- * If this is a block read/write we update the I/O
- * statistics kstat to take it off the run queue.
- */
- mutex_enter(&vdc->lock);
- if (op == VD_OP_BREAD || op == VD_OP_BWRITE) {
- VD_UPDATE_ERR_STATS(vdc, vd_transerrs);
- VD_KSTAT_RUNQ_EXIT(vdc);
- DTRACE_IO1(done, buf_t *, bufp);
- }
- mutex_exit(&vdc->lock);
- goto done;
- }
-
- rv = vdc_process_data_msg(vdc, &vio_msg);
- if (rv)
- goto done;
- }
-
- if (bufp == &buf)
- rv = biowait(bufp);
-
-done:
- if (bufp == &buf) {
- biofini(bufp);
- } else if (rv != 0) {
- bioerror(bufp, EIO);
- biodone(bufp);
- }
-
- return (rv);
-}
-
-/*
- * Function:
* vdc_do_sync_op
*
* Description:
- * Wrapper around vdc_do_op that serializes requests.
+ * Wrapper around vdc_populate_descriptor that blocks until the
+ * response to the message is available.
*
* Arguments:
* vdcp - the soft state pointer
@@ -3268,12 +3137,16 @@ done:
* nbytes - number of bytes to read/write
* slice - the disk slice this request is for
* offset - relative disk offset
+ * cb_type - type of call - STRATEGY or SYNC
+ * cb_arg - parameter to be sent to server (depends on VD_OP_XXX type)
+ * . mode for ioctl(9e)
+ * . LP64 diskaddr_t (block I/O)
* dir - direction of operation (READ/WRITE/BOTH)
* rconflict - check for reservation conflict in case of failure
*
* rconflict should be set to B_TRUE by most callers. Callers invoking the
* VD_OP_SCSICMD operation can set rconflict to B_FALSE if they check the
- * result of a successful operation with vdc_scsi_status().
+ * result of a successful operation with vd_scsi_status().
*
* Return Codes:
* 0
@@ -3284,10 +3157,14 @@ done:
*/
static int
vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes,
- int slice, diskaddr_t offset, vio_desc_direction_t dir, boolean_t rconflict)
+ int slice, diskaddr_t offset, int cb_type, void *cb_arg,
+ vio_desc_direction_t dir, boolean_t rconflict)
{
int status;
- int flags = VDC_OP_NORMAL;
+ vdc_io_t *vio;
+ boolean_t check_resv_conflict = B_FALSE;
+
+ ASSERT(cb_type == CB_SYNC);
/*
* Grab the lock, if blocked wait until the server
@@ -3315,29 +3192,69 @@ vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes,
/* now block anyone other thread entering after us */
vdcp->sync_op_blocked = B_TRUE;
-
+ vdcp->sync_op_pending = B_TRUE;
mutex_exit(&vdcp->lock);
- if (!rconflict)
- flags &= ~VDC_OP_ERRCHK_CONFLICT;
-
- status = vdc_do_op(vdcp, operation, addr, nbytes, slice, offset,
- NULL, dir, flags);
+ status = vdc_send_request(vdcp, operation, addr,
+ nbytes, slice, offset, cb_type, cb_arg, dir);
mutex_enter(&vdcp->lock);
- DMSG(vdcp, 2, ": operation returned %d\n", status);
+ if (status != 0) {
+ vdcp->sync_op_pending = B_FALSE;
+ } else if (ddi_in_panic()) {
+ if (vdc_drain_response(vdcp, CB_SYNC, NULL) == 0) {
+ status = vdcp->sync_op_status;
+ } else {
+ vdcp->sync_op_pending = B_FALSE;
+ status = EIO;
+ }
+ } else {
+ /*
+ * block until our transaction completes.
+ * Also anyone else waiting also gets to go next.
+ */
+ while (vdcp->sync_op_pending && vdcp->state != VDC_STATE_DETACH)
+ cv_wait(&vdcp->sync_pending_cv, &vdcp->lock);
+
+ DMSG(vdcp, 2, ": operation returned %d\n",
+ vdcp->sync_op_status);
+ if (vdcp->state == VDC_STATE_DETACH) {
+ vdcp->sync_op_pending = B_FALSE;
+ status = ENXIO;
+ } else {
+ status = vdcp->sync_op_status;
+ if (status != 0 && vdcp->failfast_interval != 0) {
+ /*
+ * Operation has failed and failfast is enabled.
+ * We need to check if the failure is due to a
+ * reservation conflict if this was requested.
+ */
+ check_resv_conflict = rconflict;
+ }
- if (vdcp->state == VDC_STATE_DETACH) {
- status = ENXIO;
+ }
}
+ vdcp->sync_op_status = 0;
vdcp->sync_op_blocked = B_FALSE;
vdcp->sync_op_cnt--;
/* signal the next waiting thread */
cv_signal(&vdcp->sync_blocked_cv);
+ /*
+ * We have to check for reservation conflict after unblocking sync
+ * operations because some sync operations will be used to do this
+ * check.
+ */
+ if (check_resv_conflict) {
+ vio = vdc_failfast_io_queue(vdcp, NULL);
+ while (vio->vio_qtime != 0)
+ cv_wait(&vdcp->failfast_io_cv, &vdcp->lock);
+ kmem_free(vio, sizeof (vdc_io_t));
+ }
+
mutex_exit(&vdcp->lock);
return (status);
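In practice, sync callers after this change look like the DKIOCFLUSHWRITECACHE
path further down in this diff; a minimal sketch (dev is assumed):

	/* flush the write cache: no data buffer, CB_SYNC, both directions */
	rv = vdc_do_sync_op(vdc, VD_OP_FLUSH, NULL, 0, VDCPART(dev), 0,
	    CB_SYNC, 0, VIO_both_dir, B_TRUE);
	if (rv != 0)
		return (rv);	/* error as documented in the Return Codes */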
@@ -3358,16 +3275,23 @@ vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes,
*
* Arguments:
* vdc - soft state pointer for this instance of the device driver.
- * buf - if buf is NULL then we drain all responses, otherwise we
+ * cb_type - the type of request we want to drain. If type is CB_SYNC
+ * then we drain all responses until we find a CB_SYNC request.
+ * If the type is CB_STRATEGY then the behavior depends on the
+ * value of the buf argument.
+ * buf - if the cb_type argument is CB_SYNC then the buf argument
+ * must be NULL. If the cb_type argument is CB_STRATEGY and
+ * if buf is NULL then we drain all responses, otherwise we
* poll until we receive an ACK/NACK for the specific I/O
* described by buf.
*
* Return Code:
* 0 - Success. If we were expecting a response to a particular
- * request then this means that a response has been received.
+ * CB_SYNC or CB_STRATEGY request then this means that a
+ * response has been received.
*/
static int
-vdc_drain_response(vdc_t *vdc, struct buf *buf)
+vdc_drain_response(vdc_t *vdc, vio_cb_type_t cb_type, struct buf *buf)
{
int rv, idx, retries;
size_t msglen;
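The three drain patterns used by this change, sketched from the callers
earlier in this diff:

	/* panic dump: drain all responses, nothing specific to wait for */
	(void) vdc_drain_response(vdc, CB_STRATEGY, NULL);

	/* panicking strategy call: poll until this buf is ACK'ed or NACK'ed */
	rv = vdc_drain_response(vdc, CB_STRATEGY, buf);

	/* panicking sync op: drain until a pending CB_SYNC request completes */
	rv = vdc_drain_response(vdc, CB_SYNC, NULL);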
@@ -3376,6 +3300,8 @@ vdc_drain_response(vdc_t *vdc, struct buf *buf)
struct buf *mbuf;
boolean_t ack;
+ ASSERT(cb_type == CB_STRATEGY || cb_type == CB_SYNC);
+
mutex_enter(&vdc->lock);
retries = 0;
@@ -3443,16 +3369,34 @@ vdc_drain_response(vdc_t *vdc, struct buf *buf)
continue;
}
- mbuf = ldep->buf;
- ASSERT(mbuf != NULL);
- mbuf->b_resid = mbuf->b_bcount - ldep->dep->payload.nbytes;
- bioerror(mbuf, ack ? ldep->dep->payload.status : EIO);
- biodone(mbuf);
+ switch (ldep->cb_type) {
- rv = vdc_depopulate_descriptor(vdc, idx);
- if (buf != NULL && buf == mbuf) {
- rv = 0;
- goto done;
+ case CB_STRATEGY:
+ mbuf = ldep->cb_arg;
+ if (mbuf != NULL) {
+ mbuf->b_resid = mbuf->b_bcount -
+ ldep->dep->payload.nbytes;
+ bioerror(mbuf,
+ ack ? ldep->dep->payload.status : EIO);
+ biodone(mbuf);
+ }
+ rv = vdc_depopulate_descriptor(vdc, idx);
+ if (buf != NULL && buf == mbuf) {
+ rv = 0;
+ goto done;
+ }
+ break;
+
+ case CB_SYNC:
+ rv = vdc_depopulate_descriptor(vdc, idx);
+ vdc->sync_op_status = ack ? rv : EIO;
+ vdc->sync_op_pending = B_FALSE;
+ cv_signal(&vdc->sync_pending_cv);
+ if (cb_type == CB_SYNC) {
+ rv = 0;
+ goto done;
+ }
+ break;
}
/* if this is the last descriptor - break out of loop */
@@ -3462,7 +3406,7 @@ vdc_drain_response(vdc_t *vdc, struct buf *buf)
* request then we return with an error otherwise we
* have successfully completed the drain.
*/
- rv = (buf != NULL)? ESRCH: 0;
+ rv = (buf != NULL || cb_type == CB_SYNC)? ESRCH: 0;
break;
}
}
@@ -3739,10 +3683,8 @@ vdc_handle_cb(uint64_t event, caddr_t arg)
*/
vdc->seq_num = 1;
vdc->seq_num_reply = 0;
- vdc->io_pending = B_TRUE;
srvr->ldc_state = ldc_state;
cv_signal(&vdc->initwait_cv);
- cv_signal(&vdc->io_pending_cv);
}
}
@@ -3777,9 +3719,6 @@ vdc_handle_cb(uint64_t event, caddr_t arg)
if (vdc->state == VDC_STATE_INIT_WAITING) {
vdc->state = VDC_STATE_RESETTING;
cv_signal(&vdc->initwait_cv);
- } else if (vdc->state == VDC_STATE_FAILED) {
- vdc->io_pending = B_TRUE;
- cv_signal(&vdc->io_pending_cv);
}
}
@@ -3881,6 +3820,8 @@ vdc_resubmit_backup_dring(vdc_t *vdcp)
int b_idx;
int rv = 0;
int dring_size;
+ int op;
+ vio_msg_t vio_msg;
vdc_local_desc_t *curr_ldep;
ASSERT(MUTEX_NOT_HELD(&vdcp->lock));
@@ -3905,21 +3846,84 @@ vdc_resubmit_backup_dring(vdc_t *vdcp)
/* only resubmit outstanding transactions */
if (!curr_ldep->is_free) {
+ /*
+ * If we are retrying a block read/write operation we
+ * need to update the I/O statistics to indicate that
+ * the request is being put back on the waitq to be
+ * serviced (it will have been taken off after the
+ * error was reported).
+ */
+ mutex_enter(&vdcp->lock);
+ op = curr_ldep->operation;
+ if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) {
+ DTRACE_IO1(start, buf_t *, curr_ldep->cb_arg);
+ VD_KSTAT_WAITQ_ENTER(vdcp);
+ }
DMSG(vdcp, 1, "resubmitting entry idx=%x\n", b_idx);
-
- rv = vdc_do_op(vdcp, curr_ldep->operation,
+ rv = vdc_populate_descriptor(vdcp, op,
curr_ldep->addr, curr_ldep->nbytes,
curr_ldep->slice, curr_ldep->offset,
- curr_ldep->buf, curr_ldep->dir,
- curr_ldep->flags & ~VDC_OP_STATE_RUNNING);
+ curr_ldep->cb_type, curr_ldep->cb_arg,
+ curr_ldep->dir);
if (rv) {
- DMSG(vdcp, 1, "[%d] resubmit entry %d failed\n",
+ if (op == VD_OP_BREAD || op == VD_OP_BWRITE) {
+ VD_UPDATE_ERR_STATS(vdcp, vd_transerrs);
+ VD_KSTAT_WAITQ_EXIT(vdcp);
+ DTRACE_IO1(done, buf_t *,
+ curr_ldep->cb_arg);
+ }
+ DMSG(vdcp, 1, "[%d] cannot resubmit entry %d\n",
vdcp->instance, b_idx);
+ mutex_exit(&vdcp->lock);
+ goto done;
+ }
+
+ /*
+ * If this is a block read/write we update the I/O
+ * statistics kstat to indicate that the request
+ * has been sent back to the vDisk server and should
+ * now be put on the run queue.
+ */
+ if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) {
+ DTRACE_PROBE1(send, buf_t *, curr_ldep->cb_arg);
+ VD_KSTAT_WAITQ_TO_RUNQ(vdcp);
+ }
+ mutex_exit(&vdcp->lock);
+
+ /* Wait for the response message. */
+ DMSG(vdcp, 1, "waiting for response to idx=%x\n",
+ b_idx);
+ rv = vdc_wait_for_response(vdcp, &vio_msg);
+ if (rv) {
+ /*
+ * If this is a block read/write we update
+ * the I/O statistics kstat to take it
+ * off the run queue.
+ */
+ mutex_enter(&vdcp->lock);
+ if (op == VD_OP_BREAD || op == VD_OP_BWRITE) {
+ VD_UPDATE_ERR_STATS(vdcp, vd_transerrs);
+ VD_KSTAT_RUNQ_EXIT(vdcp);
+ DTRACE_IO1(done, buf_t *,
+ curr_ldep->cb_arg);
+ }
+ DMSG(vdcp, 1, "[%d] wait_for_response "
+ "returned err=%d\n", vdcp->instance,
+ rv);
+ mutex_exit(&vdcp->lock);
goto done;
}
+ DMSG(vdcp, 1, "processing msg for idx=%x\n", b_idx);
+ rv = vdc_process_data_msg(vdcp, &vio_msg);
+ if (rv) {
+ DMSG(vdcp, 1, "[%d] process_data_msg "
+ "returned err=%d\n", vdcp->instance,
+ rv);
+ goto done;
+ }
/*
* Mark this entry as free so that we will not resubmit
* this "done" request again, if we were to use the same
@@ -3974,7 +3978,10 @@ vdc_cancel_backup_dring(vdc_t *vdcp)
int cancelled = 0;
ASSERT(MUTEX_HELD(&vdcp->lock));
- ASSERT(vdcp->state == VDC_STATE_FAILED);
+ ASSERT(vdcp->state == VDC_STATE_INIT ||
+ vdcp->state == VDC_STATE_INIT_WAITING ||
+ vdcp->state == VDC_STATE_NEGOTIATE ||
+ vdcp->state == VDC_STATE_RESETTING);
if (vdcp->local_dring_backup == NULL) {
/* the pending requests have already been processed */
@@ -4006,17 +4013,29 @@ vdc_cancel_backup_dring(vdc_t *vdcp)
* requests. Now we just have to notify threads waiting
* for replies that the request has failed.
*/
- bufp = ldep->buf;
- ASSERT(bufp != NULL);
- bufp->b_resid = bufp->b_bcount;
- if (ldep->operation == VD_OP_BREAD ||
- ldep->operation == VD_OP_BWRITE) {
+ switch (ldep->cb_type) {
+ case CB_SYNC:
+ ASSERT(vdcp->sync_op_pending);
+ vdcp->sync_op_status = EIO;
+ vdcp->sync_op_pending = B_FALSE;
+ cv_signal(&vdcp->sync_pending_cv);
+ break;
+
+ case CB_STRATEGY:
+ bufp = ldep->cb_arg;
+ ASSERT(bufp != NULL);
+ bufp->b_resid = bufp->b_bcount;
VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
VD_KSTAT_RUNQ_EXIT(vdcp);
DTRACE_IO1(done, buf_t *, bufp);
+ bioerror(bufp, EIO);
+ biodone(bufp);
+ break;
+
+ default:
+ ASSERT(0);
}
- bioerror(bufp, EIO);
- biodone(bufp);
+
}
/* get the next element to cancel */
@@ -4042,12 +4061,14 @@ vdc_cancel_backup_dring(vdc_t *vdcp)
* Description:
* This function is invoked if the timeout set to establish the connection
* with vds expires. This will happen if we spend too much time in the
- * VDC_STATE_INIT_WAITING or VDC_STATE_NEGOTIATE states.
+ * VDC_STATE_INIT_WAITING or VDC_STATE_NEGOTIATE states. Then we will
+ * cancel any pending requests and mark them as failed.
*
* If the timeout does not expire, it will be cancelled when we reach the
- * VDC_STATE_HANDLE_PENDING, VDC_STATE_FAILED or VDC_STATE_DETACH state.
- * This function can also be invoked while we are in those states, in
- * which case we do nothing because the timeout is being cancelled.
+ * VDC_STATE_HANDLE_PENDING or VDC_STATE_RESETTING state. This function can
+ * also be invoked while we are in the VDC_STATE_HANDLE_PENDING or
+ * VDC_STATE_RESETTING state, in which case we do nothing because the
+ * timeout is being cancelled.
*
* Arguments:
* arg - argument of the timeout function actually a soft state
@@ -4064,18 +4085,28 @@ vdc_connection_timeout(void *arg)
mutex_enter(&vdcp->lock);
if (vdcp->state == VDC_STATE_HANDLE_PENDING ||
- vdcp->state == VDC_STATE_DETACH ||
- vdcp->state == VDC_STATE_FAILED) {
+ vdcp->state == VDC_STATE_DETACH) {
/*
- * The connection has just been re-established, has failed or
+ * The connection has just been re-established or
* we are detaching.
*/
vdcp->ctimeout_reached = B_FALSE;
- } else {
- vdcp->ctimeout_reached = B_TRUE;
+ mutex_exit(&vdcp->lock);
+ return;
}
+ vdcp->ctimeout_reached = B_TRUE;
+
+ /* notify requests waiting for sending */
+ cv_broadcast(&vdcp->running_cv);
+
+ /* cancel requests waiting for a result */
+ vdc_cancel_backup_dring(vdcp);
+
mutex_exit(&vdcp->lock);
+
+ cmn_err(CE_NOTE, "[%d] connection to service domain timeout",
+ vdcp->instance);
}
/*
@@ -4171,58 +4202,6 @@ vdc_switch_server(vdc_t *vdcp)
vdcp->instance, vdcp->curr_server->id, vdcp->curr_server->ldc_id);
}
-static void
-vdc_print_svc_status(vdc_t *vdcp)
-{
- int instance;
- uint64_t ldc_id, port_id;
- vdc_service_state_t svc_state;
-
- ASSERT(mutex_owned(&vdcp->lock));
-
- svc_state = vdcp->curr_server->svc_state;
-
- if (vdcp->curr_server->log_state == svc_state)
- return;
-
- instance = vdcp->instance;
- ldc_id = vdcp->curr_server->ldc_id;
- port_id = vdcp->curr_server->id;
-
- switch (svc_state) {
-
- case VDC_SERVICE_OFFLINE:
- cmn_err(CE_CONT, "?vdisk@%d is offline\n", instance);
- break;
-
- case VDC_SERVICE_CONNECTED:
- cmn_err(CE_CONT, "?vdisk@%d is connected using ldc@%ld,%ld\n",
- instance, ldc_id, port_id);
- break;
-
- case VDC_SERVICE_ONLINE:
- cmn_err(CE_CONT, "?vdisk@%d is online using ldc@%ld,%ld\n",
- instance, ldc_id, port_id);
- break;
-
- case VDC_SERVICE_FAILED:
- cmn_err(CE_CONT, "?vdisk@%d access to service failed "
- "using ldc@%ld,%ld\n", instance, ldc_id, port_id);
- break;
-
- case VDC_SERVICE_FAULTED:
- cmn_err(CE_CONT, "?vdisk@%d access to backend failed "
- "using ldc@%ld,%ld\n", instance, ldc_id, port_id);
- break;
-
- default:
- ASSERT(0);
- break;
- }
-
- vdcp->curr_server->log_state = svc_state;
-}
-
/* -------------------------------------------------------------------------- */
/*
@@ -4253,8 +4232,6 @@ vdc_process_msg_thread(vdc_t *vdcp)
int ctimeout;
timeout_id_t tmid = 0;
clock_t ldcup_timeout = 0;
- vdc_server_t *srvr;
- vdc_service_state_t svc_state;
mutex_enter(&vdcp->lock);
@@ -4266,8 +4243,6 @@ vdc_process_msg_thread(vdc_t *vdcp)
Q(VDC_STATE_INIT_WAITING)
Q(VDC_STATE_NEGOTIATE)
Q(VDC_STATE_HANDLE_PENDING)
- Q(VDC_STATE_FAULTED)
- Q(VDC_STATE_FAILED)
Q(VDC_STATE_RUNNING)
Q(VDC_STATE_RESETTING)
Q(VDC_STATE_DETACH)
@@ -4302,27 +4277,21 @@ vdc_process_msg_thread(vdc_t *vdcp)
ctimeout * drv_usectohz(MICROSEC));
}
- /* Switch to STATE_DETACH if drv is detaching */
- if (vdcp->lifecycle == VDC_LC_DETACHING) {
- vdcp->state = VDC_STATE_DETACH;
- break;
- }
-
- /* Check if the timeout has been reached */
- if (vdcp->ctimeout_reached) {
- ASSERT(tmid != 0);
- tmid = 0;
- vdcp->state = VDC_STATE_FAILED;
- break;
- }
-
/* Check if we are re-initializing repeatedly */
if (vdcp->hshake_cnt > vdc_hshake_retries &&
vdcp->lifecycle != VDC_LC_ONLINE) {
DMSG(vdcp, 0, "[%d] too many handshakes,cnt=%d",
vdcp->instance, vdcp->hshake_cnt);
- vdcp->state = VDC_STATE_FAILED;
+ cmn_err(CE_NOTE, "[%d] disk access failed.\n",
+ vdcp->instance);
+ vdcp->state = VDC_STATE_DETACH;
+ break;
+ }
+
+ /* Switch to STATE_DETACH if drv is detaching */
+ if (vdcp->lifecycle == VDC_LC_DETACHING) {
+ vdcp->state = VDC_STATE_DETACH;
break;
}
@@ -4335,10 +4304,6 @@ vdc_process_msg_thread(vdc_t *vdcp)
status = vdc_start_ldc_connection(vdcp);
if (status != EINVAL) {
vdcp->state = VDC_STATE_INIT_WAITING;
- } else {
- vdcp->curr_server->svc_state =
- VDC_SERVICE_FAILED;
- vdc_print_svc_status(vdcp);
}
break;
@@ -4350,23 +4315,26 @@ vdc_process_msg_thread(vdc_t *vdcp)
break;
}
- /*
- * Wait for LDC_UP. If it times out and we have multiple
- * servers then we will retry using a different server.
- */
- ldcup_timeout = ddi_get_lbolt() + (vdc_ldcup_timeout *
- drv_usectohz(MICROSEC));
- status = cv_timedwait(&vdcp->initwait_cv, &vdcp->lock,
- ldcup_timeout);
- if (status == -1 &&
- vdcp->state == VDC_STATE_INIT_WAITING &&
- vdcp->curr_server->ldc_state != LDC_UP) {
- /* timed out & still waiting */
- vdcp->curr_server->svc_state =
- VDC_SERVICE_FAILED;
- vdc_print_svc_status(vdcp);
- vdcp->state = VDC_STATE_INIT;
- break;
+ /* check if only one server exists */
+ if (vdcp->num_servers == 1) {
+ cv_wait(&vdcp->initwait_cv, &vdcp->lock);
+ } else {
+ /*
+ * wait for LDC_UP, if it times out, switch
+ * to another server.
+ */
+ ldcup_timeout = ddi_get_lbolt() +
+ (vdc_ldcup_timeout *
+ drv_usectohz(MICROSEC));
+ status = cv_timedwait(&vdcp->initwait_cv,
+ &vdcp->lock, ldcup_timeout);
+ if (status == -1 &&
+ vdcp->state == VDC_STATE_INIT_WAITING &&
+ vdcp->curr_server->ldc_state != LDC_UP) {
+ /* timed out & still waiting */
+ vdcp->state = VDC_STATE_INIT;
+ break;
+ }
}
if (vdcp->state != VDC_STATE_INIT_WAITING) {
@@ -4418,8 +4386,6 @@ reset:
status);
vdcp->state = VDC_STATE_RESETTING;
vdcp->self_reset = B_TRUE;
- vdcp->curr_server->svc_state = VDC_SERVICE_FAILED;
- vdc_print_svc_status(vdcp);
done:
DMSG(vdcp, 0, "negotiation complete (state=0x%x)...\n",
vdcp->state);
@@ -4427,121 +4393,36 @@ done:
case VDC_STATE_HANDLE_PENDING:
- DMSG(vdcp, 0, "[%d] connection to service domain is up",
- vdcp->instance);
- vdcp->curr_server->svc_state = VDC_SERVICE_CONNECTED;
-
- mutex_exit(&vdcp->lock);
-
- /*
- * If we have multiple servers, check that the backend
- * is effectively available before resubmitting any IO.
- */
- if (vdcp->num_servers > 1 &&
- vdc_eio_check(vdcp, 0) != 0) {
- mutex_enter(&vdcp->lock);
- vdcp->curr_server->svc_state =
- VDC_SERVICE_FAULTED;
- vdcp->state = VDC_STATE_FAULTED;
+ if (vdcp->ctimeout_reached) {
+ /*
+ * The connection timeout had been reached so
+ * pending requests have been cancelled. Now
+ * that the connection is back we can reset
+ * the timeout.
+ */
+ ASSERT(vdcp->local_dring_backup == NULL);
+ ASSERT(tmid != 0);
+ tmid = 0;
+ vdcp->ctimeout_reached = B_FALSE;
+ vdcp->state = VDC_STATE_RUNNING;
+ DMSG(vdcp, 0, "[%d] connection to service "
+ "domain is up", vdcp->instance);
break;
}
+ mutex_exit(&vdcp->lock);
if (tmid != 0) {
(void) untimeout(tmid);
tmid = 0;
- vdcp->ctimeout_reached = B_FALSE;
}
-
- /*
- * Setup devid
- */
- (void) vdc_setup_devid(vdcp);
-
status = vdc_resubmit_backup_dring(vdcp);
-
mutex_enter(&vdcp->lock);
- if (status) {
+ if (status)
vdcp->state = VDC_STATE_RESETTING;
- vdcp->self_reset = B_TRUE;
- vdcp->curr_server->svc_state =
- VDC_SERVICE_FAILED;
- vdc_print_svc_status(vdcp);
- } else {
+ else
vdcp->state = VDC_STATE_RUNNING;
- }
- break;
-
- case VDC_STATE_FAULTED:
- /*
- * Server is faulted because the backend is unavailable.
- * If all servers are faulted then we mark the service
- * as failed, otherwise we reset to switch to another
- * server.
- */
- vdc_print_svc_status(vdcp);
-
- /* check if all servers are faulted */
- for (srvr = vdcp->server_list; srvr != NULL;
- srvr = srvr->next) {
- svc_state = srvr->svc_state;
- if (svc_state != VDC_SERVICE_FAULTED)
- break;
- }
- if (srvr != NULL) {
- vdcp->state = VDC_STATE_RESETTING;
- vdcp->self_reset = B_TRUE;
- } else {
- vdcp->state = VDC_STATE_FAILED;
- }
- break;
-
- case VDC_STATE_FAILED:
- /*
- * We reach this state when we are unable to access the
- * backend from any server, either because of a maximum
- * connection retries or timeout, or because the backend
- * is unavailable.
- *
- * Then we cancel the backup DRing so that errors get
- * reported and we wait for a new I/O before attempting
- * another connection.
- */
- cmn_err(CE_NOTE, "vdisk@%d disk access failed",
- vdcp->instance);
-
- /* cancel any timeout */
- if (tmid != 0) {
- (void) untimeout(tmid);
- tmid = 0;
- }
-
- /* cancel pending I/Os */
- cv_broadcast(&vdcp->running_cv);
- vdc_cancel_backup_dring(vdcp);
-
- /* wait for new I/O */
- while (!vdcp->io_pending)
- cv_wait(&vdcp->io_pending_cv, &vdcp->lock);
-
- /*
- * There's a new IO pending. Try to re-establish a
- * connection. Mark all services as offline, so that
- * we don't stop again before having retried all
- * servers.
- */
- for (srvr = vdcp->server_list; srvr != NULL;
- srvr = srvr->next) {
- srvr->svc_state = VDC_SERVICE_OFFLINE;
- }
-
- /* reset variables */
- vdcp->hshake_cnt = 0;
- vdcp->ctimeout_reached = B_FALSE;
-
- vdcp->state = VDC_STATE_RESETTING;
- vdcp->self_reset = B_TRUE;
break;
/* enter running state */
@@ -4553,18 +4434,17 @@ done:
vdcp->hshake_cnt = 0;
cv_broadcast(&vdcp->running_cv);
- /* backend has to be checked after reset */
- if (vdcp->failfast_interval != 0 ||
- vdcp->num_servers > 1)
- cv_signal(&vdcp->eio_cv);
+ /* failfast has to be checked after reset */
+ cv_signal(&vdcp->failfast_cv);
/* ownership is lost during reset */
if (vdcp->ownership & VDC_OWNERSHIP_WANTED)
vdcp->ownership |= VDC_OWNERSHIP_RESET;
cv_signal(&vdcp->ownership_cv);
- vdcp->curr_server->svc_state = VDC_SERVICE_ONLINE;
- vdc_print_svc_status(vdcp);
+ cmn_err(CE_CONT, "?vdisk@%d is online using "
+ "ldc@%ld,%ld\n", vdcp->instance,
+ vdcp->curr_server->ldc_id, vdcp->curr_server->id);
mutex_exit(&vdcp->lock);
@@ -4587,14 +4467,8 @@ done:
mutex_enter(&vdcp->lock);
- /* all servers are now offline */
- for (srvr = vdcp->server_list; srvr != NULL;
- srvr = srvr->next) {
- srvr->svc_state = VDC_SERVICE_OFFLINE;
- srvr->log_state = VDC_SERVICE_NONE;
- }
-
- vdc_print_svc_status(vdcp);
+ cmn_err(CE_CONT, "?vdisk@%d is offline\n",
+ vdcp->instance);
vdcp->state = VDC_STATE_RESETTING;
vdcp->self_reset = B_TRUE;
@@ -4642,13 +4516,6 @@ done:
ASSERT(vdcp->read_state != VDC_READ_WAITING);
vdcp->read_state = VDC_READ_IDLE;
- vdcp->io_pending = B_FALSE;
-
- /*
- * Cleanup any pending eio. These I/Os are going to
- * be resubmitted.
- */
- vdc_eio_unqueue(vdcp, 0, B_FALSE);
vdc_backup_local_dring(vdcp);
@@ -4678,8 +4545,9 @@ done:
*/
cv_broadcast(&vdcp->running_cv);
- while (vdcp->sync_op_cnt > 0) {
- cv_broadcast(&vdcp->sync_blocked_cv);
+ while (vdcp->sync_op_pending) {
+ cv_signal(&vdcp->sync_pending_cv);
+ cv_signal(&vdcp->sync_blocked_cv);
mutex_exit(&vdcp->lock);
/* give the waiters enough time to wake up */
delay(vdc_hz_min_ldc_delay);
@@ -4791,7 +4659,7 @@ vdc_process_data_msg(vdc_t *vdcp, vio_msg_t *msg)
ldep = &vdcp->local_dring[idx];
op = ldep->operation;
if ((op == VD_OP_BREAD) || (op == VD_OP_BWRITE)) {
- DTRACE_IO1(done, buf_t *, ldep->buf);
+ DTRACE_IO1(done, buf_t *, ldep->cb_arg);
VD_KSTAT_RUNQ_EXIT(vdcp);
}
VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
@@ -4816,57 +4684,62 @@ vdc_process_data_msg(vdc_t *vdcp, vio_msg_t *msg)
ldep = &vdcp->local_dring[idx];
- DMSG(vdcp, 1, ": state 0x%x\n", ldep->dep->hdr.dstate);
+ DMSG(vdcp, 1, ": state 0x%x - cb_type 0x%x\n",
+ ldep->dep->hdr.dstate, ldep->cb_type);
if (ldep->dep->hdr.dstate == VIO_DESC_DONE) {
struct buf *bufp;
- status = ldep->dep->payload.status;
+ switch (ldep->cb_type) {
+ case CB_SYNC:
+ ASSERT(vdcp->sync_op_pending);
- bufp = ldep->buf;
- ASSERT(bufp != NULL);
+ status = vdc_depopulate_descriptor(vdcp, idx);
+ vdcp->sync_op_status = status;
+ vdcp->sync_op_pending = B_FALSE;
+ cv_signal(&vdcp->sync_pending_cv);
+ break;
- bufp->b_resid = bufp->b_bcount - ldep->dep->payload.nbytes;
- bioerror(bufp, status);
+ case CB_STRATEGY:
+ bufp = ldep->cb_arg;
+ ASSERT(bufp != NULL);
+ bufp->b_resid =
+ bufp->b_bcount - ldep->dep->payload.nbytes;
+ status = ldep->dep->payload.status; /* Future:ntoh */
+ if (status != 0) {
+ DMSG(vdcp, 1, "strategy status=%d\n", status);
+ VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
+ bioerror(bufp, status);
+ }
- if (status != 0) {
- DMSG(vdcp, 1, "I/O status=%d\n", status);
- }
+ (void) vdc_depopulate_descriptor(vdcp, idx);
- DMSG(vdcp, 1,
- "I/O complete req=%ld bytes resp=%ld bytes\n",
- bufp->b_bcount, ldep->dep->payload.nbytes);
+ DMSG(vdcp, 1,
+ "strategy complete req=%ld bytes resp=%ld bytes\n",
+ bufp->b_bcount, ldep->dep->payload.nbytes);
- /*
- * If the request has failed and we have multiple servers or
- * failfast is enabled then we will have to defer the completion
- * of the request until we have checked that the vdisk backend
- * is effectively available (if multiple server) or that there
- * is no reservation conflict (if failfast).
- */
- if ((status != 0 &&
- (vdcp->num_servers > 1 &&
- (ldep->flags & VDC_OP_ERRCHK_BACKEND)) ||
- (vdcp->failfast_interval != 0 &&
- (ldep->flags & VDC_OP_ERRCHK_CONFLICT)))) {
- /*
- * The I/O has failed and we need to check the error.
- */
- (void) vdc_eio_queue(vdcp, idx);
- } else {
- op = ldep->operation;
- if (op == VD_OP_BREAD || op == VD_OP_BWRITE) {
+ if (status != 0 && vdcp->failfast_interval != 0) {
+ /*
+ * The I/O has failed and failfast is enabled.
+ * We need the failfast thread to check if the
+ * failure is due to a reservation conflict.
+ */
+ (void) vdc_failfast_io_queue(vdcp, bufp);
+ } else {
if (status == 0) {
+ op = (bufp->b_flags & B_READ) ?
+ VD_OP_BREAD : VD_OP_BWRITE;
VD_UPDATE_IO_STATS(vdcp, op,
ldep->dep->payload.nbytes);
- } else {
- VD_UPDATE_ERR_STATS(vdcp, vd_softerrs);
}
VD_KSTAT_RUNQ_EXIT(vdcp);
DTRACE_IO1(done, buf_t *, bufp);
+ biodone(bufp);
}
- (void) vdc_depopulate_descriptor(vdcp, idx);
- biodone(bufp);
+ break;
+
+ default:
+ ASSERT(0);
}
}
@@ -4985,7 +4858,6 @@ static int
vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg)
{
int status = 0;
- vd_disk_type_t old_type;
ASSERT(vdc != NULL);
ASSERT(mutex_owned(&vdc->lock));
@@ -5030,7 +4902,6 @@ vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg)
}
/* update disk, block and transfer sizes */
- old_type = vdc->vdisk_type;
vdc_update_size(vdc, attr_msg->vdisk_size,
attr_msg->vdisk_block_size, attr_msg->max_xfer_sz);
vdc->vdisk_type = attr_msg->vdisk_type;
@@ -5061,25 +4932,6 @@ vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg)
* fake geometry for the disk.
*/
vdc_create_fake_geometry(vdc);
-
- /*
- * If the disk type was previously unknown and device nodes
- * were created then the driver would have created 8 device
- * nodes. If we now find out that this is a single-slice disk
- * then we need to re-create the appropriate device nodes.
- */
- if (old_type == VD_DISK_TYPE_UNK &&
- (vdc->initialized & VDC_MINOR) &&
- vdc->vdisk_type == VD_DISK_TYPE_SLICE) {
- ddi_remove_minor_node(vdc->dip, NULL);
- (void) devfs_clean(ddi_get_parent(vdc->dip),
- NULL, DV_CLEAN_FORCE);
- if (vdc_create_device_nodes(vdc) != 0) {
- DMSG(vdc, 0, "![%d] Failed to update "
- "device nodes", vdc->instance);
- }
- }
-
break;
case VIO_SUBTYPE_NACK:
@@ -5331,7 +5183,7 @@ vdc_dkio_flush_cb(void *arg)
ASSERT(vdc != NULL);
rv = vdc_do_sync_op(vdc, VD_OP_FLUSH, NULL, 0,
- VDCPART(dk_arg->dev), 0, VIO_both_dir, B_TRUE);
+ VDCPART(dk_arg->dev), 0, CB_SYNC, 0, VIO_both_dir, B_TRUE);
if (rv != 0) {
DMSG(vdc, 0, "[%d] DKIOCFLUSHWRITECACHE failed %d : model %x\n",
vdc->instance, rv,
@@ -5747,8 +5599,8 @@ vdc_uscsi_cmd(vdc_t *vdc, caddr_t arg, int mode)
/* a uscsi reset is converted to a VD_OP_RESET operation */
if (uscsi.uscsi_flags & (USCSI_RESET | USCSI_RESET_LUN |
USCSI_RESET_ALL)) {
- rv = vdc_do_sync_op(vdc, VD_OP_RESET, NULL, 0, 0, 0,
- VIO_both_dir, B_TRUE);
+ rv = vdc_do_sync_op(vdc, VD_OP_RESET, NULL, 0, 0, 0, CB_SYNC,
+ (void *)(uint64_t)mode, VIO_both_dir, B_TRUE);
return (rv);
}
@@ -5825,7 +5677,7 @@ vdc_uscsi_cmd(vdc_t *vdc, caddr_t arg, int mode)
/* submit the request */
rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
- 0, 0, VIO_both_dir, B_FALSE);
+ 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);
if (rv != 0)
goto done;
@@ -6019,7 +5871,7 @@ vdc_mhd_inkeys(vdc_t *vdc, caddr_t arg, int mode)
/* submit the request */
rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
- 0, 0, VIO_both_dir, B_FALSE);
+ 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);
if (rv != 0)
goto done;
@@ -6133,7 +5985,7 @@ vdc_mhd_inresv(vdc_t *vdc, caddr_t arg, int mode)
/* submit the request */
rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
- 0, 0, VIO_both_dir, B_FALSE);
+ 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);
if (rv != 0)
goto done;
@@ -6238,7 +6090,7 @@ vdc_mhd_register(vdc_t *vdc, caddr_t arg, int mode)
/* submit the request */
rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
- 0, 0, VIO_both_dir, B_FALSE);
+ 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);
if (rv == 0)
rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);
@@ -6279,7 +6131,7 @@ vdc_mhd_reserve(vdc_t *vdc, caddr_t arg, int mode)
/* submit the request */
rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
- 0, 0, VIO_both_dir, B_FALSE);
+ 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);
if (rv == 0)
rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);
@@ -6324,7 +6176,7 @@ vdc_mhd_preemptabort(vdc_t *vdc, caddr_t arg, int mode)
/* submit the request */
rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
- 0, 0, VIO_both_dir, B_FALSE);
+ 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);
if (rv == 0)
rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);
@@ -6363,7 +6215,7 @@ vdc_mhd_registerignore(vdc_t *vdc, caddr_t arg, int mode)
/* submit the request */
rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
- 0, 0, VIO_both_dir, B_FALSE);
+ 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE);
if (rv == 0)
rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);
@@ -6373,10 +6225,11 @@ vdc_mhd_registerignore(vdc_t *vdc, caddr_t arg, int mode)
}
/*
- * This function is used to send a (simple) SCSI command and check errors.
+ * This function is used by the failfast mechanism to send a SCSI command
+ * to check for reservation conflict.
*/
static int
-vdc_eio_scsi_cmd(vdc_t *vdc, uchar_t scmd, int flags)
+vdc_failfast_scsi_cmd(vdc_t *vdc, uchar_t scmd)
{
int cdb_len, sense_len, vd_scsi_len;
vd_scsi_t *vd_scsi;
@@ -6401,177 +6254,103 @@ vdc_eio_scsi_cmd(vdc_t *vdc, uchar_t scmd, int flags)
vd_scsi->timeout = vdc_scsi_timeout;
/*
- * Submit the request. Note the operation should not request that any
- * error is checked because this function is precisely called when
- * checking errors.
+ * Submit the request. The last argument has to be B_FALSE so that
+ * vdc_do_sync_op does not loop checking for reservation conflict if
+ * the operation returns an error.
*/
- ASSERT((flags & VDC_OP_ERRCHK) == 0);
-
- rv = vdc_do_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
- 0, 0, NULL, VIO_both_dir, flags);
+ rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len,
+ 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_FALSE);
if (rv == 0)
- rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE);
+ (void) vdc_scsi_status(vdc, vd_scsi, B_FALSE);
kmem_free(vd_scsi, vd_scsi_len);
return (rv);
}
/*
- * This function is used to check if a SCSI backend is accessible. It will
- * also detect reservation conflict if failfast is enabled, and panic the
- * system in that case.
+ * This function is used by the failfast mechanism to check for reservation
+ * conflict. It sends some SCSI commands which will fail with a reservation
+ * conflict error if the system does not have access to the disk and this
+ * will panic the system.
*
* Returned Code:
- * 0 - disk is accessible
- * != 0 - disk is inaccessible or unable to check if disk is accessible
+ * 0 - disk is accessible without reservation conflict error
+ * != 0 - unable to check if disk is accessible
*/
-static int
-vdc_eio_scsi_check(vdc_t *vdc, int flags)
+static int
+vdc_failfast_check_resv(vdc_t *vdc)
{
int failure = 0;
- int rv;
/*
* Send a TEST UNIT READY command. The command will panic
- * the system if it fails with a reservation conflict and
- * failfast is enabled. If there is a reservation conflict
- * and failfast is not enabled then the function will return
- * EACCES. In that case, there's no problem with accessing
- * the backend, it is just reserved.
+ * the system if it fails with a reservation conflict.
*/
- rv = vdc_eio_scsi_cmd(vdc, SCMD_TEST_UNIT_READY, flags);
- if (rv != 0 && rv != EACCES)
+ if (vdc_failfast_scsi_cmd(vdc, SCMD_TEST_UNIT_READY) != 0)
failure++;
- /* we don't need to do more checking if failfast is not enabled */
- if (vdc->failfast_interval == 0)
- return (failure);
-
/*
* With SPC-3 compliant devices TEST UNIT READY will succeed on
* a reserved device, so we also do a WRITE(10) of zero byte in
* order to provoke a Reservation Conflict status on those newer
* devices.
*/
- if (vdc_eio_scsi_cmd(vdc, SCMD_WRITE_G1, flags) != 0)
+ if (vdc_failfast_scsi_cmd(vdc, SCMD_WRITE_G1) != 0)
failure++;
return (failure);
}
/*
- * This function is used to check if a backend is effectively accessible.
+ * Add a pending I/O to the failfast I/O queue. An I/O is added to this
+ * queue when it has failed and failfast is enabled. Then we have to check
+ * if it has failed because of a reservation conflict, in which case we have
+ * to panic the system.
*
- * Returned Code:
- * 0 - disk is accessible
- * != 0 - disk is inaccessible or unable to check if disk is accessible
- */
-static int
-vdc_eio_check(vdc_t *vdc, int flags)
-{
- char *buffer;
- diskaddr_t blkno;
- int rv;
-
- ASSERT((flags & VDC_OP_ERRCHK) == 0);
-
- if (VD_OP_SUPPORTED(vdc->operations, VD_OP_SCSICMD))
- return (vdc_eio_scsi_check(vdc, flags));
-
- ASSERT(vdc->failfast_interval == 0);
-
- /*
- * If the backend does not support SCSI operations then we simply
- * check if the backend is accessible by reading some data blocks.
- * We first try to read a random block, to try to avoid getting
- * a block that might have been cached on the service domain. Then
- * we try the last block, and finally the first block.
- *
- * We return success as soon as we are able to read any block.
- */
- buffer = kmem_alloc(vdc->vdisk_bsize, KM_SLEEP);
-
- if (vdc->vdisk_size > 0) {
-
- /* try a random block */
- (void) random_get_pseudo_bytes((uint8_t *)&blkno,
- sizeof (diskaddr_t));
- blkno = blkno % vdc->vdisk_size;
- rv = vdc_do_op(vdc, VD_OP_BREAD, (caddr_t)buffer,
- vdc->vdisk_bsize, VD_SLICE_NONE, blkno, NULL,
- VIO_read_dir, flags);
-
- if (rv == 0)
- goto done;
-
- /* try the last block */
- blkno = vdc->vdisk_size - 1;
- rv = vdc_do_op(vdc, VD_OP_BREAD, (caddr_t)buffer,
- vdc->vdisk_bsize, VD_SLICE_NONE, blkno, NULL,
- VIO_read_dir, flags);
-
- if (rv == 0)
- goto done;
- }
-
- /* try block 0 */
- blkno = 0;
- rv = vdc_do_op(vdc, VD_OP_BREAD, (caddr_t)buffer, vdc->vdisk_bsize,
- VD_SLICE_NONE, blkno, NULL, VIO_read_dir, flags);
-
-done:
- kmem_free(buffer, vdc->vdisk_bsize);
- return (rv);
-}
-
-/*
- * Add a pending I/O to the eio queue. An I/O is added to this queue
- * when it has failed and failfast is enabled or the vdisk has multiple
- * servers. It will then be handled by the eio thread (vdc_eio_thread).
- * The eio queue is ordered starting with the most recent I/O added.
+ * Async I/O should be queued with their block I/O data transfer structure
+ * (buf). Sync I/O should be queued with buf = NULL.
*/
static vdc_io_t *
-vdc_eio_queue(vdc_t *vdc, int index)
+vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf)
{
vdc_io_t *vio;
ASSERT(MUTEX_HELD(&vdc->lock));
vio = kmem_alloc(sizeof (vdc_io_t), KM_SLEEP);
- vio->vio_next = vdc->eio_queue;
- vio->vio_index = index;
+ vio->vio_next = vdc->failfast_io_queue;
+ vio->vio_buf = buf;
vio->vio_qtime = ddi_get_lbolt();
- vdc->eio_queue = vio;
+ vdc->failfast_io_queue = vio;
- /* notify the eio thread that a new I/O is queued */
- cv_signal(&vdc->eio_cv);
+ /* notify the failfast thread that a new I/O is queued */
+ cv_signal(&vdc->failfast_cv);
return (vio);
}
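A sync caller that queues a reservation check is expected to hold vdc->lock
and wait for the failfast thread to complete the entry, as vdc_do_sync_op()
does earlier in this diff; in sketch form:

	/* queue a sync check (no buf) and wait until it has been completed */
	vio = vdc_failfast_io_queue(vdc, NULL);
	while (vio->vio_qtime != 0)
		cv_wait(&vdc->failfast_io_cv, &vdc->lock);
	kmem_free(vio, sizeof (vdc_io_t));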
/*
- * Remove I/Os added before the indicated deadline from the eio queue. A
- * deadline of 0 means that all I/Os have to be unqueued. The complete_io
- * boolean specifies if unqueued I/Os should be marked as completed or not.
+ * Remove and complete I/O in the failfast I/O queue which have been
+ * added before the indicated deadline. A deadline of 0 means that all
+ * I/O have to be unqueued and marked as completed.
*/
static void
-vdc_eio_unqueue(vdc_t *vdc, clock_t deadline, boolean_t complete_io)
+vdc_failfast_io_unqueue(vdc_t *vdc, clock_t deadline)
{
- struct buf *buf;
vdc_io_t *vio, *vio_tmp;
- int index, op;
ASSERT(MUTEX_HELD(&vdc->lock));
vio_tmp = NULL;
- vio = vdc->eio_queue;
+ vio = vdc->failfast_io_queue;
if (deadline != 0) {
/*
- * Skip any io queued after the deadline. The eio queue is
- * ordered starting with the last I/O added to the queue.
+ * Skip any io queued after the deadline. The failfast
+ * I/O queue is ordered starting with the last I/O added
+ * to the queue.
*/
while (vio != NULL && vio->vio_qtime > deadline) {
vio_tmp = vio;
@@ -6585,54 +6364,53 @@ vdc_eio_unqueue(vdc_t *vdc, clock_t deadline, boolean_t complete_io)
/* update the queue */
if (vio_tmp == NULL)
- vdc->eio_queue = NULL;
+ vdc->failfast_io_queue = NULL;
else
vio_tmp->vio_next = NULL;
/*
- * Free and complete unqueued I/Os if this was requested. All I/Os
- * have a block I/O data transfer structure (buf) and they are
- * completed by calling biodone().
+ * Complete unqueued I/O. Async I/O have a block I/O data transfer
+ * structure (buf) and they are completed by calling biodone(). Sync
+ * I/O do not have a buf and they are completed by setting the
+ * vio_qtime to zero and signaling failfast_io_cv. In that case, the
+ * thread waiting for the I/O to complete is responsible for freeing
+ * the vio structure.
*/
while (vio != NULL) {
vio_tmp = vio->vio_next;
-
- if (complete_io) {
- index = vio->vio_index;
- op = vdc->local_dring[index].operation;
- buf = vdc->local_dring[index].buf;
- (void) vdc_depopulate_descriptor(vdc, index);
- ASSERT(buf->b_flags & B_ERROR);
- if (op == VD_OP_BREAD || op == VD_OP_BWRITE) {
- VD_UPDATE_ERR_STATS(vdc, vd_softerrs);
- VD_KSTAT_RUNQ_EXIT(vdc);
- DTRACE_IO1(done, buf_t *, buf);
- }
- biodone(buf);
+ if (vio->vio_buf != NULL) {
+ VD_KSTAT_RUNQ_EXIT(vdc);
+ DTRACE_IO1(done, buf_t *, vio->vio_buf);
+ biodone(vio->vio_buf);
+ kmem_free(vio, sizeof (vdc_io_t));
+ } else {
+ vio->vio_qtime = 0;
}
-
- kmem_free(vio, sizeof (vdc_io_t));
vio = vio_tmp;
}
+
+ cv_broadcast(&vdc->failfast_io_cv);
}
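As an aside for readers following the queue logic: vdc_failfast_io_queue() pushes at the head, so the list is ordered newest first, and vdc_failfast_io_unqueue() walks past entries newer than the deadline before completing the rest. A minimal user-space model of that discipline (q_ent_t, enqueue() and unqueue() are illustrative stand-ins, not kernel code):

/*
 * Minimal user-space model of the failfast queue discipline.
 * Names are illustrative stand-ins for the kernel structures.
 */
#include <stdlib.h>

typedef struct q_ent {
	struct q_ent	*next;
	long		qtime;		/* stands in for vio_qtime */
} q_ent_t;

static q_ent_t *queue;			/* newest entry first */

/* Enqueue at the head, mirroring vdc_failfast_io_queue(). */
static q_ent_t *
enqueue(long now)
{
	q_ent_t *e = malloc(sizeof (q_ent_t));

	e->next = queue;
	e->qtime = now;
	queue = e;
	return (e);
}

/*
 * Complete every entry queued at or before the deadline, mirroring
 * vdc_failfast_io_unqueue(): newer entries stay queued because the
 * access check that triggered the unqueue did not cover them.
 */
static void
unqueue(long deadline)
{
	q_ent_t *e = queue, *last_kept = NULL;

	if (deadline != 0) {
		while (e != NULL && e->qtime > deadline) {
			last_kept = e;
			e = e->next;
		}
	}
	if (last_kept == NULL)
		queue = NULL;		/* everything completes */
	else
		last_kept->next = NULL;	/* detach the completed tail */

	while (e != NULL) {
		q_ent_t *next = e->next;

		/* kernel: biodone() for async I/O, cv wakeup for sync */
		free(e);
		e = next;
	}
}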
/*
- * Error I/O Thread. There is one eio thread for each virtual disk that
- * has multiple servers or for which failfast is enabled. Failfast can only
- * be enabled for vdisk supporting SCSI commands.
+ * Failfast Thread.
*
- * While failfast is enabled, the eio thread sends a TEST UNIT READY
+ * While failfast is enabled, the failfast thread sends a TEST UNIT READY
* and a zero size WRITE(10) SCSI command on a regular basis to check that
* we still have access to the disk. If a command fails with a RESERVATION
* CONFLICT error then the system will immediately panic.
*
- * The eio thread is also woken up when an I/O has failed. It then checks
+ * The failfast thread is also woken up when an I/O has failed. It then checks
* the access to the disk to ensure that the I/O failure was not due to a
- * reservation conflict or to the backend been inaccessible.
+ * reservation conflict.
*
+ * There is one failfast thread for each virtual disk for which failfast is
+ * enabled. We could have only one thread sending requests for all disks but
+ * this would need vdc to send asynchronous requests and to have callbacks to
+ * process replies.
*/
static void
-vdc_eio_thread(void *arg)
+vdc_failfast_thread(void *arg)
{
int status;
vdc_t *vdc = (vdc_t *)arg;
@@ -6640,74 +6418,45 @@ vdc_eio_thread(void *arg)
mutex_enter(&vdc->lock);
- while (vdc->failfast_interval != 0 || vdc->num_servers > 1) {
- /*
- * Wait if there is nothing in the eio queue or if the state
- * is not VDC_STATE_RUNNING.
- */
- if (vdc->eio_queue == NULL || vdc->state != VDC_STATE_RUNNING) {
- if (vdc->failfast_interval != 0) {
- timeout = ddi_get_lbolt() +
- drv_usectohz(vdc->failfast_interval);
- (void) cv_timedwait(&vdc->eio_cv, &vdc->lock,
- timeout);
- } else {
- ASSERT(vdc->num_servers > 1);
- (void) cv_wait(&vdc->eio_cv, &vdc->lock);
- }
+ while (vdc->failfast_interval != 0) {
- if (vdc->state != VDC_STATE_RUNNING)
- continue;
- }
+ starttime = ddi_get_lbolt();
mutex_exit(&vdc->lock);
- starttime = ddi_get_lbolt();
-
- /* check error */
- status = vdc_eio_check(vdc, VDC_OP_STATE_RUNNING);
+ /* check for reservation conflict */
+ status = vdc_failfast_check_resv(vdc);
mutex_enter(&vdc->lock);
/*
- * We have dropped the lock to check the backend so we have
- * to check that the eio thread is still enabled.
+ * We have dropped the lock to send the SCSI command so we have
+ * to check that failfast is still enabled.
*/
- if (vdc->failfast_interval == 0 && vdc->num_servers <= 1)
+ if (vdc->failfast_interval == 0)
break;
/*
- * If the eio queue is empty or we are not in running state
- * anymore then there is nothing to do.
+ * If we have successfully checked the disk access and there was
+ * no reservation conflict then we can complete any I/O queued
+ * before the last check.
*/
- if (vdc->state != VDC_STATE_RUNNING || vdc->eio_queue == NULL)
- continue;
+ if (status == 0)
+ vdc_failfast_io_unqueue(vdc, starttime);
- if (status == 0) {
- /*
- * The backend access has been successfully checked,
- * we can complete any I/O queued before the last check.
- */
- vdc_eio_unqueue(vdc, starttime, B_TRUE);
+ /* proceed again if some I/O are still in the queue */
+ if (vdc->failfast_io_queue != NULL)
+ continue;
- } else if (vdc->num_servers > 1) {
- /*
- * The backend is inaccessible for a disk with multiple
- * servers. So we force a reset to switch to another
- * server. The reset will also clear the eio queue and
- * resubmit all pending I/Os.
- */
- mutex_enter(&vdc->read_lock);
- vdc->read_state = VDC_READ_RESET;
- cv_signal(&vdc->read_cv);
- mutex_exit(&vdc->read_lock);
- }
+ timeout = ddi_get_lbolt() +
+ drv_usectohz(vdc->failfast_interval);
+ (void) cv_timedwait(&vdc->failfast_cv, &vdc->lock, timeout);
}
/*
- * The thread is being stopped so we can complete any queued I/O.
+ * Failfast is being stopped so we can complete any queued I/O.
*/
- vdc_eio_unqueue(vdc, 0, B_TRUE);
- vdc->eio_thread = NULL;
+ vdc_failfast_io_unqueue(vdc, 0);
+ vdc->failfast_thread = NULL;
mutex_exit(&vdc->lock);
thread_exit();
}
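Because the two hunks above interleave removed and added lines, here is the new vdc_failfast_thread() body reassembled from the `+` and context lines alone; the starttime and timeout declarations fall outside the hunks and are assumed to be clock_t locals:

static void
vdc_failfast_thread(void *arg)
{
	int status;
	vdc_t *vdc = (vdc_t *)arg;
	clock_t timeout, starttime;	/* assumed, outside the hunks */

	mutex_enter(&vdc->lock);

	while (vdc->failfast_interval != 0) {

		starttime = ddi_get_lbolt();

		mutex_exit(&vdc->lock);

		/* check for reservation conflict */
		status = vdc_failfast_check_resv(vdc);

		mutex_enter(&vdc->lock);
		/*
		 * We have dropped the lock to send the SCSI command so we
		 * have to check that failfast is still enabled.
		 */
		if (vdc->failfast_interval == 0)
			break;

		/*
		 * If we have successfully checked the disk access and there
		 * was no reservation conflict then we can complete any I/O
		 * queued before the last check.
		 */
		if (status == 0)
			vdc_failfast_io_unqueue(vdc, starttime);

		/* proceed again if some I/O are still in the queue */
		if (vdc->failfast_io_queue != NULL)
			continue;

		timeout = ddi_get_lbolt() +
		    drv_usectohz(vdc->failfast_interval);
		(void) cv_timedwait(&vdc->failfast_cv, &vdc->lock, timeout);
	}

	/*
	 * Failfast is being stopped so we can complete any queued I/O.
	 */
	vdc_failfast_io_unqueue(vdc, 0);
	vdc->failfast_thread = NULL;
	mutex_exit(&vdc->lock);
	thread_exit();
}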
@@ -6724,14 +6473,14 @@ vdc_failfast(vdc_t *vdc, caddr_t arg, int mode)
return (EFAULT);
mutex_enter(&vdc->lock);
- if (mh_time != 0 && vdc->eio_thread == NULL) {
- vdc->eio_thread = thread_create(NULL, 0,
- vdc_eio_thread, vdc, 0, &p0, TS_RUN,
+ if (mh_time != 0 && vdc->failfast_thread == NULL) {
+ vdc->failfast_thread = thread_create(NULL, 0,
+ vdc_failfast_thread, vdc, 0, &p0, TS_RUN,
v.v_maxsyspri - 2);
}
- vdc->failfast_interval = ((long)mh_time) * MILLISEC;
- cv_signal(&vdc->eio_cv);
+ vdc->failfast_interval = mh_time * 1000;
+ cv_signal(&vdc->failfast_cv);
mutex_exit(&vdc->lock);
return (0);
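From user space this path is reached through the multihost disk interface. A minimal sketch, assuming a vdisk device path and the documented mhd(7I) convention that MHIOCENFAILFAST takes the probe interval in milliseconds (vdc_failfast() above converts it to microseconds, and 0 disables the thread):

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mhd.h>
#include <fcntl.h>
#include <unistd.h>

/* Enable (ms > 0) or disable (ms == 0) failfast on an open vdisk. */
int
set_failfast(const char *path, uint_t ms)
{
	int fd, rv;

	if ((fd = open(path, O_RDWR)) < 0)
		return (-1);
	rv = ioctl(fd, MHIOCENFAILFAST, &ms);
	(void) close(fd);
	return (rv);
}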
@@ -6742,13 +6491,14 @@ vdc_failfast(vdc_t *vdc, caddr_t arg, int mode)
* converted to VD_OP_SET_ACCESS operations.
*/
static int
-vdc_access_set(vdc_t *vdc, uint64_t flags)
+vdc_access_set(vdc_t *vdc, uint64_t flags, int mode)
{
int rv;
/* submit ownership command request */
rv = vdc_do_sync_op(vdc, VD_OP_SET_ACCESS, (caddr_t)&flags,
- sizeof (uint64_t), 0, 0, VIO_both_dir, B_TRUE);
+ sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode,
+ VIO_both_dir, B_TRUE);
return (rv);
}
@@ -6758,13 +6508,14 @@ vdc_access_set(vdc_t *vdc, uint64_t flags)
* VD_OP_GET_ACCESS operation.
*/
static int
-vdc_access_get(vdc_t *vdc, uint64_t *status)
+vdc_access_get(vdc_t *vdc, uint64_t *status, int mode)
{
int rv;
/* submit ownership command request */
rv = vdc_do_sync_op(vdc, VD_OP_GET_ACCESS, (caddr_t)status,
- sizeof (uint64_t), 0, 0, VIO_both_dir, B_TRUE);
+ sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode,
+ VIO_both_dir, B_TRUE);
return (rv);
}
@@ -6809,7 +6560,7 @@ vdc_ownership_thread(void *arg)
mutex_exit(&vdc->lock);
status = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE |
- VD_ACCESS_SET_PRESERVE);
+ VD_ACCESS_SET_PRESERVE, FKIOCTL);
mutex_enter(&vdc->lock);
@@ -6894,7 +6645,7 @@ vdc_get_capacity(vdc_t *vdc, size_t *dsk_size, size_t *blk_size)
vd_cap = kmem_zalloc(alloc_len, KM_SLEEP);
rv = vdc_do_sync_op(vdc, VD_OP_GET_CAPACITY, (caddr_t)vd_cap, alloc_len,
- 0, 0, VIO_both_dir, B_TRUE);
+ 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_TRUE);
*dsk_size = vd_cap->vdisk_size;
*blk_size = vd_cap->vdisk_block_size;
@@ -7189,7 +6940,7 @@ vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, int *rvalp)
vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED);
rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE |
- VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE);
+ VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE, mode);
if (rv == 0) {
vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED |
VDC_OWNERSHIP_GRANTED);
@@ -7203,7 +6954,7 @@ vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, int *rvalp)
case MHIOCRELEASE:
{
mutex_enter(&vdc->ownership_lock);
- rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR);
+ rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, mode);
if (rv == 0) {
vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE);
}
@@ -7215,7 +6966,7 @@ vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, int *rvalp)
{
uint64_t status;
- rv = vdc_access_get(vdc, &status);
+ rv = vdc_access_get(vdc, &status, mode);
if (rv == 0 && rvalp != NULL)
*rvalp = (status & VD_ACCESS_ALLOWED)? 0 : 1;
return (rv);
@@ -7223,7 +6974,7 @@ vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, int *rvalp)
case MHIOCQRESERVE:
{
- rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE);
+ rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE, mode);
return (rv);
}
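The four hunks above, taken together, route the mhd(7I) reservation ioctls to the two access functions with the caller's mode. A hypothetical helper condensing that mapping (vdc_mhd_ioctl_to_access does not exist in the source; the vdc_access_set()/vdc_access_get() calls are copied from the cases above):

static int
vdc_mhd_ioctl_to_access(vdc_t *vdc, int cmd, int mode, uint64_t *status)
{
	switch (cmd) {
	case MHIOCTKOWN:
		return (vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE |
		    VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE, mode));
	case MHIOCRELEASE:
		return (vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, mode));
	case MHIOCSTATUS:
		/* caller maps VD_ACCESS_ALLOWED in *status to 0 or 1 */
		return (vdc_access_get(vdc, status, mode));
	case MHIOCQRESERVE:
		return (vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE, mode));
	default:
		return (ENOTTY);
	}
}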
@@ -7401,7 +7152,8 @@ vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, int *rvalp)
* send request to vds to service the ioctl.
*/
rv = vdc_do_sync_op(vdc, iop->op, mem_p, alloc_len,
- VDCPART(dev), 0, VIO_both_dir, B_TRUE);
+ VDCPART(dev), 0, CB_SYNC, (void *)(uint64_t)mode,
+ VIO_both_dir, B_TRUE);
if (rv != 0) {
/*
@@ -8081,6 +7833,7 @@ vdc_update_vio_bsize(vdc_t *vdc, uint32_t blk_size)
static int
vdc_validate_geometry(vdc_t *vdc)
{
+ buf_t *buf; /* BREAD requests need to be in a buf_t structure */
dev_t dev;
int rv, rval;
struct dk_label *label;
@@ -8207,9 +7960,27 @@ vdc_validate_geometry(vdc_t *vdc)
* Read disk label from start of disk
*/
label = kmem_alloc(vdc->vdisk_bsize, KM_SLEEP);
-
- rv = vdc_do_op(vdc, VD_OP_BREAD, (caddr_t)label, vdc->vdisk_bsize,
- VD_SLICE_NONE, 0, NULL, VIO_read_dir, VDC_OP_NORMAL);
+ buf = kmem_alloc(sizeof (buf_t), KM_SLEEP);
+ bioinit(buf);
+ buf->b_un.b_addr = (caddr_t)label;
+ buf->b_bcount = vdc->vdisk_bsize;
+ buf->b_flags = B_BUSY | B_READ;
+ buf->b_dev = cmpdev(dev);
+ rv = vdc_send_request(vdc, VD_OP_BREAD, (caddr_t)label,
+ vdc->vdisk_bsize, VD_SLICE_NONE, 0, CB_STRATEGY, buf, VIO_read_dir);
+ if (rv) {
+ DMSG(vdc, 1, "[%d] Failed to read disk block 0\n",
+ vdc->instance);
+ } else if (ddi_in_panic()) {
+ rv = vdc_drain_response(vdc, CB_STRATEGY, buf);
+ if (rv == 0) {
+ rv = geterror(buf);
+ }
+ } else {
+ rv = biowait(buf);
+ }
+ biofini(buf);
+ kmem_free(buf, sizeof (buf_t));
if (rv != 0 || label->dkl_magic != DKL_MAGIC ||
label->dkl_cksum != vdc_lbl2cksum(label)) {
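The label read above shows the full CB_STRATEGY pattern for a caller that needs synchronous behavior: build a buf_t, send the request, then either biowait() or, at panic time when interrupts are unavailable, poll with vdc_drain_response(). A sketch of that pattern as a hypothetical helper (vdc_bread_block is not in the source; the calls match the prototypes used above):

static int
vdc_bread_block(vdc_t *vdc, dev_t dev, caddr_t addr, diskaddr_t blkno)
{
	buf_t	*buf;
	int	rv;

	buf = kmem_alloc(sizeof (buf_t), KM_SLEEP);
	bioinit(buf);
	buf->b_un.b_addr = addr;
	buf->b_bcount = vdc->vdisk_bsize;
	buf->b_flags = B_BUSY | B_READ;
	buf->b_dev = cmpdev(dev);

	rv = vdc_send_request(vdc, VD_OP_BREAD, addr, vdc->vdisk_bsize,
	    VD_SLICE_NONE, blkno, CB_STRATEGY, buf, VIO_read_dir);

	if (rv == 0) {
		if (ddi_in_panic()) {
			/* no interrupts at panic time: poll for the reply */
			rv = vdc_drain_response(vdc, CB_STRATEGY, buf);
			if (rv == 0)
				rv = geterror(buf);
		} else {
			rv = biowait(buf);
		}
	}

	biofini(buf);
	kmem_free(buf, sizeof (buf_t));
	return (rv);
}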
@@ -8260,8 +8031,7 @@ vdc_validate(vdc_t *vdc)
(void) vdc_validate_geometry(vdc);
/* if the disk label has changed, update device nodes */
- if (vdc->vdisk_type == VD_DISK_TYPE_DISK &&
- vdc->vdisk_label != old_label) {
+ if (vdc->vdisk_label != old_label) {
if (vdc->vdisk_label == VD_DISK_LABEL_EFI)
rv = vdc_create_device_nodes_efi(vdc);
@@ -8312,8 +8082,6 @@ vdc_setup_devid(vdc_t *vdc)
int rv;
vd_devid_t *vd_devid;
size_t bufsize, bufid_len;
- ddi_devid_t vdisk_devid;
- char *devid_str;
/*
* At first sight, we don't know the size of the devid that the
@@ -8328,10 +8096,10 @@ vdc_setup_devid(vdc_t *vdc)
vd_devid = kmem_zalloc(bufsize, KM_SLEEP);
bufid_len = bufsize - sizeof (vd_efi_t) - 1;
- rv = vdc_do_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid,
- bufsize, 0, 0, NULL, VIO_both_dir, 0);
+ rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid,
+ bufsize, 0, 0, CB_SYNC, 0, VIO_both_dir, B_TRUE);
- DMSG(vdc, 2, "do_op returned %d\n", rv);
+ DMSG(vdc, 2, "sync_op returned %d\n", rv);
if (rv) {
kmem_free(vd_devid, bufsize);
@@ -8349,8 +8117,9 @@ vdc_setup_devid(vdc_t *vdc)
vd_devid = kmem_zalloc(bufsize, KM_SLEEP);
bufid_len = bufsize - sizeof (vd_efi_t) - 1;
- rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid,
- bufsize, 0, 0, VIO_both_dir, B_TRUE);
+ rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID,
+ (caddr_t)vd_devid, bufsize, 0, 0, CB_SYNC, 0,
+ VIO_both_dir, B_TRUE);
if (rv) {
kmem_free(vd_devid, bufsize);
@@ -8373,58 +8142,23 @@ vdc_setup_devid(vdc_t *vdc)
/* build an encapsulated devid based on the returned devid */
if (ddi_devid_init(vdc->dip, DEVID_ENCAP, vd_devid->length,
- vd_devid->id, &vdisk_devid) != DDI_SUCCESS) {
+ vd_devid->id, &vdc->devid) != DDI_SUCCESS) {
DMSG(vdc, 1, "[%d] Failed to create devid\n", vdc->instance);
kmem_free(vd_devid, bufsize);
return (1);
}
- DEVID_FORMTYPE((impl_devid_t *)vdisk_devid, vd_devid->type);
+ DEVID_FORMTYPE((impl_devid_t *)vdc->devid, vd_devid->type);
- ASSERT(ddi_devid_valid(vdisk_devid) == DDI_SUCCESS);
+ ASSERT(ddi_devid_valid(vdc->devid) == DDI_SUCCESS);
kmem_free(vd_devid, bufsize);
- if (vdc->devid != NULL) {
- /* check that the devid hasn't changed */
- if (ddi_devid_compare(vdisk_devid, vdc->devid) == 0) {
- ddi_devid_free(vdisk_devid);
- return (0);
- }
-
- cmn_err(CE_WARN, "vdisk@%d backend devid has changed",
- vdc->instance);
-
- devid_str = ddi_devid_str_encode(vdc->devid, NULL);
-
- cmn_err(CE_CONT, "vdisk@%d backend initial devid: %s",
- vdc->instance,
- (devid_str)? devid_str : "<encoding error>");
-
- if (devid_str)
- ddi_devid_str_free(devid_str);
-
- devid_str = ddi_devid_str_encode(vdisk_devid, NULL);
-
- cmn_err(CE_CONT, "vdisk@%d backend current devid: %s",
- vdc->instance,
- (devid_str)? devid_str : "<encoding error>");
-
- if (devid_str)
- ddi_devid_str_free(devid_str);
-
- ddi_devid_free(vdisk_devid);
- return (1);
- }
-
- if (ddi_devid_register(vdc->dip, vdisk_devid) != DDI_SUCCESS) {
+ if (ddi_devid_register(vdc->dip, vdc->devid) != DDI_SUCCESS) {
DMSG(vdc, 1, "[%d] Failed to register devid\n", vdc->instance);
- ddi_devid_free(vdisk_devid);
return (1);
}
- vdc->devid = vdisk_devid;
-
return (0);
}
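vdc_setup_devid() above uses a two-pass size negotiation: ask with a default-sized buffer first, and if the reply indicates the devid is larger, retry once with the exact size. A user-space model of that pattern (reply_t, server_get_devid() and get_devid() are illustrative, not the driver's types):

#include <stdlib.h>

typedef struct reply {
	size_t	length;		/* devid size reported by the server */
	char	id[1];		/* devid bytes follow */
} reply_t;

/* stand-in for the VD_OP_GET_DEVID round trip */
extern int server_get_devid(reply_t *r, size_t id_space);

static reply_t *
get_devid(size_t dflt_space)
{
	size_t space = dflt_space;
	reply_t *r = calloc(1, sizeof (reply_t) + space);

	if (server_get_devid(r, space) != 0) {
		free(r);
		return (NULL);
	}
	if (r->length > space) {
		/* first buffer was too small: retry with the exact size */
		space = r->length;
		free(r);
		r = calloc(1, sizeof (reply_t) + space);
		if (server_get_devid(r, space) != 0) {
			free(r);
			return (NULL);
		}
	}
	return (r);
}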
diff --git a/usr/src/uts/sun4v/sys/vdc.h b/usr/src/uts/sun4v/sys/vdc.h
index e39a57bd9f..eecaf9a30b 100644
--- a/usr/src/uts/sun4v/sys/vdc.h
+++ b/usr/src/uts/sun4v/sys/vdc.h
@@ -84,16 +84,6 @@ extern "C" {
#define VDC_SEQ_NUM_TODO 1 /* Request needs processing */
/*
- * Flags for virtual disk operations.
- */
-#define VDC_OP_STATE_RUNNING 0x01 /* do operation in running state */
-#define VDC_OP_ERRCHK_BACKEND 0x02 /* check backend on error */
-#define VDC_OP_ERRCHK_CONFLICT 0x04 /* check resv conflict on error */
-
-#define VDC_OP_ERRCHK (VDC_OP_ERRCHK_BACKEND | VDC_OP_ERRCHK_CONFLICT)
-#define VDC_OP_NORMAL (VDC_OP_STATE_RUNNING | VDC_OP_ERRCHK)
-
-/*
* Macros to get UNIT and PART number
*/
#define VDCUNIT_SHIFT 3
@@ -181,26 +171,12 @@ typedef enum vdc_state {
VDC_STATE_INIT_WAITING, /* waiting for ldc connection */
VDC_STATE_NEGOTIATE, /* doing handshake negotiation */
VDC_STATE_HANDLE_PENDING, /* handle requests in backup dring */
- VDC_STATE_FAULTED, /* multipath backend is inaccessible */
- VDC_STATE_FAILED, /* device is not usable */
VDC_STATE_RUNNING, /* running and accepting requests */
VDC_STATE_DETACH, /* detaching */
VDC_STATE_RESETTING /* resetting connection with vds */
} vdc_state_t;
/*
- * States of the service provided by a vds server
- */
-typedef enum vdc_service_state {
- VDC_SERVICE_NONE = -1, /* no state define */
- VDC_SERVICE_OFFLINE, /* no connection with the service */
- VDC_SERVICE_CONNECTED, /* connection established */
- VDC_SERVICE_ONLINE, /* connection and backend available */
- VDC_SERVICE_FAILED, /* connection failed */
- VDC_SERVICE_FAULTED /* connection but backend unavailable */
-} vdc_service_state_t;
-
-/*
* The states that the vdc instance can be in.
*/
typedef enum vdc_lc_state {
@@ -222,6 +198,11 @@ typedef enum {
VIO_both_dir /* transfer both in and out in same buffer */
} vio_desc_direction_t;
+typedef enum {
+ CB_STRATEGY, /* non-blocking strategy call */
+ CB_SYNC /* synchronous operation */
+} vio_cb_type_t;
+
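The enum distinguishes the two completion models referenced throughout this change: CB_STRATEGY descriptors carry a buf in cb_arg and finish through biodone(), while CB_SYNC descriptors post a status and wake the waiter on sync_pending_cv. The actual completion path is outside this diff; a hypothetical sketch of the dispatch, using the cb_type/cb_arg fields added to vdc_local_desc_t below and the sync_op fields added to struct vdc:

static void
vdc_complete_one(vdc_t *vdc, vdc_local_desc_t *ldep, int status)
{
	ASSERT(MUTEX_HELD(&vdc->lock));

	if (ldep->cb_type == CB_STRATEGY) {
		struct buf *buf = (struct buf *)ldep->cb_arg;

		if (status != 0)
			bioerror(buf, status);
		biodone(buf);	/* async: complete the block I/O */
	} else {
		ASSERT(ldep->cb_type == CB_SYNC);
		vdc->sync_op_status = status;
		vdc->sync_op_pending = B_FALSE;
		cv_signal(&vdc->sync_pending_cv);	/* wake the waiter */
	}
}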
typedef struct vdc_local_desc {
boolean_t is_free; /* local state - inuse or not */
@@ -230,9 +211,9 @@ typedef struct vdc_local_desc {
int slice;
diskaddr_t offset; /* disk offset */
size_t nbytes;
- struct buf *buf; /* buf of operation */
+ vio_cb_type_t cb_type; /* operation type blk/nonblk */
+ void *cb_arg; /* buf passed to strategy() */
vio_desc_direction_t dir; /* direction of transfer */
- int flags; /* flags of operation */
caddr_t align_addr; /* used if addr non-aligned */
ldc_mem_handle_t desc_mhdl; /* Mem handle of buf */
@@ -241,11 +222,11 @@ typedef struct vdc_local_desc {
} vdc_local_desc_t;
/*
- * I/O queue used for checking backend or failfast
+ * I/O queue used by failfast
*/
typedef struct vdc_io {
struct vdc_io *vio_next; /* next pending I/O in the queue */
- int vio_index; /* descriptor index */
+ struct buf *vio_buf; /* buf for CB_STRATEGY I/O */
clock_t vio_qtime; /* time the I/O was queued */
} vdc_io_t;
@@ -265,8 +246,6 @@ typedef struct vdc_server {
struct vdc *vdcp; /* Ptr to vdc struct */
uint64_t id; /* Server port id */
uint64_t state; /* Server state */
- vdc_service_state_t svc_state; /* Service state */
- vdc_service_state_t log_state; /* Last state logged */
uint64_t ldc_id; /* Server LDC id */
ldc_handle_t ldc_handle; /* Server LDC handle */
ldc_status_t ldc_state; /* Server LDC state */
@@ -283,9 +262,7 @@ typedef struct vdc {
kcondvar_t initwait_cv; /* signal when ldc conn is up */
kcondvar_t dring_free_cv; /* signal when desc is avail */
kcondvar_t membind_cv; /* signal when mem can be bound */
- boolean_t self_reset; /* self initiated reset */
- kcondvar_t io_pending_cv; /* signal on pending I/O */
- boolean_t io_pending; /* pending I/O */
+ boolean_t self_reset;
int initialized; /* keeps track of what's init'ed */
vdc_lc_state_t lifecycle; /* Current state of the vdc instance */
@@ -308,7 +285,10 @@ typedef struct vdc {
vdc_rd_state_t read_state; /* current read state */
uint32_t sync_op_cnt; /* num of active sync operations */
+ boolean_t sync_op_pending; /* sync operation is pending */
boolean_t sync_op_blocked; /* blocked waiting to do sync op */
+ uint32_t sync_op_status; /* status of sync operation */
+ kcondvar_t sync_pending_cv; /* cv wait for sync op to finish */
kcondvar_t sync_blocked_cv; /* cv wait for other syncs to finish */
uint64_t session_id; /* common ID sent with all messages */
@@ -346,12 +326,13 @@ typedef struct vdc {
kcondvar_t ownership_cv; /* cv for ownership update */
/*
- * The eio and failfast fields are protected by the lock mutex.
+ * The failfast fields are protected by the lock mutex.
*/
- kthread_t *eio_thread; /* error io thread */
- kcondvar_t eio_cv; /* cv for eio thread update */
- vdc_io_t *eio_queue; /* error io queue */
+ kthread_t *failfast_thread; /* failfast thread */
clock_t failfast_interval; /* interval in microsecs */
+ kcondvar_t failfast_cv; /* cv for failfast update */
+ kcondvar_t failfast_io_cv; /* cv wait for I/O to finish */
+ vdc_io_t *failfast_io_queue; /* failfast io queue */
/*
* kstats used to store I/O statistics consumed by iostat(1M).