diff options
| author | achartre <none@none> | 2007-12-09 18:06:41 -0800 |
|---|---|---|
| committer | achartre <none@none> | 2007-12-09 18:06:41 -0800 |
| commit | 2f5224ae5d04383463098ad866ccee0464ee6429 (patch) | |
| tree | c1d13a95153bb9ac61cc717d15eba9a7f615b730 /usr/src | |
| parent | e0f3b92f9ac4073ea7820750694954b119f3fd8c (diff) | |
| download | illumos-gate-2f5224ae5d04383463098ad866ccee0464ee6429.tar.gz | |
FWARC 2007/672 VIO vDisk Protocol Updates
6437722 vDisk should support USCSICMD ioctl
6437772 vDisk should support mhd (multihost disk control operations)
6621222 need a tunable to export volumes as single slice disks
Diffstat (limited to 'usr/src')
| -rw-r--r-- | usr/src/uts/sun4v/io/vdc.c | 1737 | ||||
| -rw-r--r-- | usr/src/uts/sun4v/io/vds.c | 645 | ||||
| -rw-r--r-- | usr/src/uts/sun4v/sys/vdc.h | 42 | ||||
| -rw-r--r-- | usr/src/uts/sun4v/sys/vdsk_common.h | 96 | ||||
| -rw-r--r-- | usr/src/uts/sun4v/vdc/Makefile | 2 |
5 files changed, 2400 insertions(+), 122 deletions(-)
diff --git a/usr/src/uts/sun4v/io/vdc.c b/usr/src/uts/sun4v/io/vdc.c index 4fc8950927..2b09f6ee4a 100644 --- a/usr/src/uts/sun4v/io/vdc.c +++ b/usr/src/uts/sun4v/io/vdc.c @@ -75,6 +75,7 @@ #include <sys/sunddi.h> #include <sys/types.h> #include <sys/promif.h> +#include <sys/var.h> #include <sys/vtoc.h> #include <sys/archsystm.h> #include <sys/sysmacros.h> @@ -82,8 +83,11 @@ #include <sys/cdio.h> #include <sys/dktp/fdisk.h> #include <sys/dktp/dadkio.h> +#include <sys/mhd.h> #include <sys/scsi/generic/sense.h> -#include <sys/scsi/impl/uscsi.h> /* Needed for defn of USCSICMD ioctl */ +#include <sys/scsi/impl/uscsi.h> +#include <sys/scsi/impl/services.h> +#include <sys/scsi/targets/sddef.h> #include <sys/ldoms.h> #include <sys/ldc.h> @@ -165,9 +169,9 @@ static int vdc_map_to_shared_dring(vdc_t *vdcp, int idx); static int vdc_populate_descriptor(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes, int slice, diskaddr_t offset, int cb_type, void *cb_arg, vio_desc_direction_t dir); -static int vdc_do_sync_op(vdc_t *vdcp, int operation, - caddr_t addr, size_t nbytes, int slice, diskaddr_t offset, - int cb_type, void *cb_arg, vio_desc_direction_t dir); +static int vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, + size_t nbytes, int slice, diskaddr_t offset, int cb_type, + void *cb_arg, vio_desc_direction_t dir, boolean_t); static int vdc_wait_for_response(vdc_t *vdcp, vio_msg_t *msgp); static int vdc_drain_response(vdc_t *vdcp); @@ -176,7 +180,9 @@ static int vdc_populate_mem_hdl(vdc_t *vdcp, vdc_local_desc_t *ldep); static int vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg); /* dkio */ -static int vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode); +static int vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, + int *rvalp); +static int vd_process_efi_ioctl(dev_t dev, int cmd, caddr_t arg, int mode); static void vdc_create_fake_geometry(vdc_t *vdc); static int vdc_validate_geometry(vdc_t *vdc); static void vdc_validate(vdc_t *vdc); 
@@ -200,6 +206,11 @@ static int vdc_get_efi_convert(vdc_t *vdc, void *from, void *to, static int vdc_set_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir); +static void vdc_ownership_update(vdc_t *vdc, int ownership_flags); +static int vdc_access_set(vdc_t *vdc, uint64_t flags, int mode); +static vdc_io_t *vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf); +static int vdc_failfast_check_resv(vdc_t *vdc); + /* * Module variables */ @@ -224,9 +235,16 @@ static uint64_t vdc_ldc_read_max_delay = 100 * MILLISEC; static uint64_t vdc_usec_timeout_dump = 100 * MILLISEC; /* 0.1s units: ns */ static int vdc_dump_retries = 100; +static uint16_t vdc_scsi_timeout = 60; /* 60s units: seconds */ + +static uint64_t vdc_ownership_delay = 6 * MICROSEC; /* 6s units: usec */ + /* Count of the number of vdc instances attached */ static volatile uint32_t vdc_instance_count = 0; +/* Tunable to log all SCSI errors */ +static boolean_t vdc_scsi_log_error = B_FALSE; + /* Soft state pointer */ static void *vdc_state; @@ -309,7 +327,7 @@ _init(void) return (status); if ((status = mod_install(&modlinkage)) != 0) ddi_soft_state_fini(&vdc_state); - vdc_efi_init(vd_process_ioctl); + vdc_efi_init(vd_process_efi_ioctl); return (status); } @@ -359,6 +377,7 @@ vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp) static int vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { + kt_did_t failfast_tid, ownership_tid; int instance; int rv; vdc_t *vdc = NULL; @@ -383,7 +402,14 @@ vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) return (DDI_FAILURE); } - if (vdc_is_opened(vdc)) { + /* + * This function is called when vdc is detached or if it has failed to + * attach. In that case, the attach may have failed before the vdisk type + * has been set so we can't call vdc_is_opened(). However as the attach + * has failed, we know that the vdisk is not opened and we can safely + * detach. 
+ */ + if (vdc->vdisk_type != VD_DISK_TYPE_UNK && vdc_is_opened(vdc)) { DMSG(vdc, 0, "[%d] Cannot detach: device is open", instance); return (DDI_FAILURE); } @@ -404,6 +430,16 @@ vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) DMSG(vdc, 0, "[%d] proceeding...\n", instance); + /* If we took ownership, release ownership */ + mutex_enter(&vdc->ownership_lock); + if (vdc->ownership & VDC_OWNERSHIP_GRANTED) { + rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, FKIOCTL); + if (rv == 0) { + vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); + } + } + mutex_exit(&vdc->ownership_lock); + /* mark instance as detaching */ vdc->lifecycle = VDC_LC_DETACHING; @@ -449,8 +485,30 @@ vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) if (vdc->initialized & VDC_LDC) vdc_terminate_ldc(vdc); + if (vdc->failfast_thread) { + failfast_tid = vdc->failfast_thread->t_did; + vdc->failfast_interval = 0; + cv_signal(&vdc->failfast_cv); + } else { + failfast_tid = 0; + } + + if (vdc->ownership & VDC_OWNERSHIP_WANTED) { + ownership_tid = vdc->ownership_thread->t_did; + vdc->ownership = VDC_OWNERSHIP_NONE; + cv_signal(&vdc->ownership_cv); + } else { + ownership_tid = 0; + } + mutex_exit(&vdc->lock); + if (failfast_tid != 0) + thread_join(failfast_tid); + + if (ownership_tid != 0) + thread_join(ownership_tid); + if (vdc->initialized & VDC_MINOR) { ddi_prop_remove_all(dip); ddi_remove_minor_node(dip, NULL); @@ -459,6 +517,7 @@ vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) if (vdc->initialized & VDC_LOCKS) { mutex_destroy(&vdc->lock); mutex_destroy(&vdc->read_lock); + mutex_destroy(&vdc->ownership_lock); cv_destroy(&vdc->initwait_cv); cv_destroy(&vdc->dring_free_cv); cv_destroy(&vdc->membind_cv); @@ -466,6 +525,9 @@ vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) cv_destroy(&vdc->sync_blocked_cv); cv_destroy(&vdc->read_cv); cv_destroy(&vdc->running_cv); + cv_destroy(&vdc->ownership_cv); + cv_destroy(&vdc->failfast_cv); + cv_destroy(&vdc->failfast_io_cv); } if (vdc->minfo) @@ -564,6 +626,11 @@ 
vdc_do_attach(dev_info_t *dip) cv_init(&vdc->sync_pending_cv, NULL, CV_DRIVER, NULL); cv_init(&vdc->sync_blocked_cv, NULL, CV_DRIVER, NULL); + mutex_init(&vdc->ownership_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&vdc->ownership_cv, NULL, CV_DRIVER, NULL); + cv_init(&vdc->failfast_cv, NULL, CV_DRIVER, NULL); + cv_init(&vdc->failfast_io_cv, NULL, CV_DRIVER, NULL); + /* init blocking msg read functionality */ mutex_init(&vdc->read_lock, NULL, MUTEX_DRIVER, NULL); cv_init(&vdc->read_cv, NULL, CV_DRIVER, NULL); @@ -1197,7 +1264,7 @@ vdc_close(dev_t dev, int flag, int otyp, cred_t *cred) int instance; int slice; - int rv; + int rv, rval; vdc_t *vdc; instance = VDCUNIT(dev); @@ -1219,7 +1286,7 @@ vdc_close(dev_t dev, int flag, int otyp, cred_t *cred) * not a supported IOCTL command or the backing device is read-only * do not fail the close operation. */ - rv = vd_process_ioctl(dev, DKIOCFLUSHWRITECACHE, NULL, FKIOCTL); + rv = vd_process_ioctl(dev, DKIOCFLUSHWRITECACHE, NULL, FKIOCTL, &rval); if (rv != 0 && rv != ENOTSUP && rv != ENOTTY && rv != EROFS) { DMSG(vdc, 0, "[%d] flush failed with error %d on close\n", @@ -1238,9 +1305,8 @@ static int vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) { _NOTE(ARGUNUSED(credp)) - _NOTE(ARGUNUSED(rvalp)) - return (vd_process_ioctl(dev, cmd, (caddr_t)arg, mode)); + return (vd_process_ioctl(dev, cmd, (caddr_t)arg, mode, rvalp)); } static int @@ -2639,6 +2705,16 @@ vdc_send_request(vdc_t *vdcp, int operation, caddr_t addr, return (EIO); } + /* + * If we are panicking and the disk is not ready then + * we can't send any request because we can't complete + * the handshake now. + */ + if (ddi_in_panic()) { + mutex_exit(&vdcp->lock); + return (EIO); + } + cv_wait(&vdcp->running_cv, &vdcp->lock); } @@ -2815,20 +2891,27 @@ cleanup_and_exit: * . mode for ioctl(9e) * . 
LP64 diskaddr_t (block I/O) * dir - direction of operation (READ/WRITE/BOTH) + * rconflict - check for reservation conflict in case of failure + * + * rconflict should be set to B_TRUE by most callers. Callers invoking the + * VD_OP_SCSICMD operation can set rconflict to B_FALSE if they check the + * result of a successful operation with vd_scsi_status(). * * Return Codes: * 0 * EAGAIN - * EFAULT - * ENXIO - * EIO + * EFAULT + * ENXIO + * EIO */ static int vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes, int slice, diskaddr_t offset, int cb_type, void *cb_arg, - vio_desc_direction_t dir) + vio_desc_direction_t dir, boolean_t rconflict) { int status; + vdc_io_t *vio; + boolean_t check_resv_conflict = B_FALSE; ASSERT(cb_type == CB_SYNC); @@ -2875,6 +2958,15 @@ vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes, status = ENXIO; } else { status = vdcp->sync_op_status; + if (status != 0 && vdcp->failfast_interval != 0) { + /* + * Operation has failed and failfast is enabled. + * We need to check if the failure is due to a + * reservation conflict if this was requested. + */ + check_resv_conflict = rconflict; + } + } } @@ -2884,6 +2976,19 @@ vdc_do_sync_op(vdc_t *vdcp, int operation, caddr_t addr, size_t nbytes, /* signal the next waiting thread */ cv_signal(&vdcp->sync_blocked_cv); + + /* + * We have to check for reservation conflict after unblocking sync + * operations because some sync operations will be used to do this + * check. 
+ */ + if (check_resv_conflict) { + vio = vdc_failfast_io_queue(vdcp, NULL); + while (vio->vio_qtime != 0) + cv_wait(&vdcp->failfast_io_cv, &vdcp->lock); + kmem_free(vio, sizeof (vdc_io_t)); + } + mutex_exit(&vdcp->lock); return (status); @@ -3872,6 +3977,15 @@ done: */ vdcp->hshake_cnt = 0; cv_broadcast(&vdcp->running_cv); + + /* failfast has to been checked after reset */ + cv_signal(&vdcp->failfast_cv); + + /* ownership is lost during reset */ + if (vdcp->ownership & VDC_OWNERSHIP_WANTED) + vdcp->ownership |= VDC_OWNERSHIP_RESET; + cv_signal(&vdcp->ownership_cv); + mutex_exit(&vdcp->lock); for (;;) { @@ -4098,12 +4212,23 @@ vdc_process_data_msg(vdc_t *vdcp, vio_msg_t *msg) DMSG(vdcp, 1, "strategy status=%d\n", status); bioerror(bufp, status); } - status = vdc_depopulate_descriptor(vdcp, idx); - biodone(bufp); + + (void) vdc_depopulate_descriptor(vdcp, idx); DMSG(vdcp, 1, "strategy complete req=%ld bytes resp=%ld bytes\n", bufp->b_bcount, ldep->dep->payload.nbytes); + + if (status != 0 && vdcp->failfast_interval != 0) { + /* + * The I/O has failed and failfast is enabled. + * We need the failfast thread to check if the + * failure is due to a reservation conflict. + */ + (void) vdc_failfast_io_queue(vdcp, bufp); + } else { + biodone(bufp); + } break; default: @@ -4253,10 +4378,16 @@ vdc_handle_attr_msg(vdc_t *vdc, vd_attr_msg_t *attr_msg) break; } + if (attr_msg->vdisk_size == VD_SIZE_UNKNOWN) { + DMSG(vdc, 0, "[%d] Unknown disk size from vds", + vdc->instance); + attr_msg->vdisk_size = 0; + } + /* * If the disk size is already set check that it hasn't changed. 
*/ - if ((vdc->vdisk_size != 0) && + if ((vdc->vdisk_size != 0) && (attr_msg->vdisk_size != 0) && (vdc->vdisk_size != attr_msg->vdisk_size)) { DMSG(vdc, 0, "[%d] Different disk size from vds " "(old=0x%lx - new=0x%lx", vdc->instance, @@ -4562,7 +4693,7 @@ vdc_dkio_flush_cb(void *arg) ASSERT(vdc != NULL); rv = vdc_do_sync_op(vdc, VD_OP_FLUSH, NULL, 0, - VDCPART(dk_arg->dev), 0, CB_SYNC, 0, VIO_both_dir); + VDCPART(dk_arg->dev), 0, CB_SYNC, 0, VIO_both_dir, B_TRUE); if (rv != 0) { DMSG(vdc, 0, "[%d] DKIOCFLUSHWRITECACHE failed %d : model %x\n", vdc->instance, rv, @@ -4730,6 +4861,1258 @@ vdc_dioctl_rwcmd(dev_t dev, caddr_t arg, int flag) } /* + * Allocate a buffer for a VD_OP_SCSICMD operation. The size of the allocated + * buffer is returned in alloc_len. + */ +static vd_scsi_t * +vdc_scsi_alloc(int cdb_len, int sense_len, int datain_len, int dataout_len, + int *alloc_len) +{ + vd_scsi_t *vd_scsi; + int vd_scsi_len = VD_SCSI_SIZE; + + vd_scsi_len += P2ROUNDUP(cdb_len, sizeof (uint64_t)); + vd_scsi_len += P2ROUNDUP(sense_len, sizeof (uint64_t)); + vd_scsi_len += P2ROUNDUP(datain_len, sizeof (uint64_t)); + vd_scsi_len += P2ROUNDUP(dataout_len, sizeof (uint64_t)); + + ASSERT(vd_scsi_len % sizeof (uint64_t) == 0); + + vd_scsi = kmem_zalloc(vd_scsi_len, KM_SLEEP); + + vd_scsi->cdb_len = cdb_len; + vd_scsi->sense_len = sense_len; + vd_scsi->datain_len = datain_len; + vd_scsi->dataout_len = dataout_len; + + *alloc_len = vd_scsi_len; + + return (vd_scsi); +} + +/* + * Convert the status of a SCSI command to a Solaris return code. + * + * Arguments: + * vd_scsi - The SCSI operation buffer. + * log_error - indicate if an error message should be logged. + * + * Note that our SCSI error messages are rather primitive for the moment + * and could be improved by decoding some data like the SCSI command and + * the sense key. + * + * Return value: + * 0 - Status is good. + * EACCES - Status reports a reservation conflict. 
+ * ENOTSUP - Status reports a check condition and sense key + * reports an illegal request. + * EIO - Any other status. + */ +static int +vdc_scsi_status(vdc_t *vdc, vd_scsi_t *vd_scsi, boolean_t log_error) +{ + int rv; + char path_str[MAXPATHLEN]; + char panic_str[VDC_RESV_CONFLICT_FMT_LEN + MAXPATHLEN]; + union scsi_cdb *cdb; + struct scsi_extended_sense *sense; + + if (vd_scsi->cmd_status == STATUS_GOOD) + /* no error */ + return (0); + + /* when the tunable vdc_scsi_log_error is true we log all errors */ + if (vdc_scsi_log_error) + log_error = B_TRUE; + + if (log_error) { + cmn_err(CE_WARN, "%s (vdc%d):\tError for Command: 0x%x)\n", + ddi_pathname(vdc->dip, path_str), vdc->instance, + GETCMD(VD_SCSI_DATA_CDB(vd_scsi))); + } + + /* default returned value */ + rv = EIO; + + switch (vd_scsi->cmd_status) { + + case STATUS_CHECK: + case STATUS_TERMINATED: + if (log_error) + cmn_err(CE_CONT, "\tCheck Condition Error\n"); + + /* check sense buffer */ + if (vd_scsi->sense_len == 0 || + vd_scsi->sense_status != STATUS_GOOD) { + if (log_error) + cmn_err(CE_CONT, "\tNo Sense Data Available\n"); + break; + } + + sense = VD_SCSI_DATA_SENSE(vd_scsi); + + if (log_error) { + cmn_err(CE_CONT, "\tSense Key: 0x%x\n" + "\tASC: 0x%x, ASCQ: 0x%x\n", + scsi_sense_key((uint8_t *)sense), + scsi_sense_asc((uint8_t *)sense), + scsi_sense_ascq((uint8_t *)sense)); + } + + if (scsi_sense_key((uint8_t *)sense) == KEY_ILLEGAL_REQUEST) + rv = ENOTSUP; + break; + + case STATUS_BUSY: + if (log_error) + cmn_err(CE_NOTE, "\tDevice Busy\n"); + break; + + case STATUS_RESERVATION_CONFLICT: + /* + * If the command was PERSISTENT_RESERVATION_[IN|OUT] then + * reservation conflict could be due to various reasons like + * incorrect keys, not registered or not reserved etc. So, + * we should not panic in that case. 
+ */ + cdb = VD_SCSI_DATA_CDB(vd_scsi); + if (vdc->failfast_interval != 0 && + cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_IN && + cdb->scc_cmd != SCMD_PERSISTENT_RESERVE_OUT) { + /* failfast is enabled so we have to panic */ + (void) snprintf(panic_str, sizeof (panic_str), + VDC_RESV_CONFLICT_FMT_STR "%s", + ddi_pathname(vdc->dip, path_str)); + panic(panic_str); + } + if (log_error) + cmn_err(CE_NOTE, "\tReservation Conflict\n"); + rv = EACCES; + break; + + case STATUS_QFULL: + if (log_error) + cmn_err(CE_NOTE, "\tQueue Full\n"); + break; + + case STATUS_MET: + case STATUS_INTERMEDIATE: + case STATUS_SCSI2: + case STATUS_INTERMEDIATE_MET: + case STATUS_ACA_ACTIVE: + if (log_error) + cmn_err(CE_CONT, + "\tUnexpected SCSI status received: 0x%x\n", + vd_scsi->cmd_status); + break; + + default: + if (log_error) + cmn_err(CE_CONT, + "\tInvalid SCSI status received: 0x%x\n", + vd_scsi->cmd_status); + break; + } + + return (rv); +} + +/* + * Implemented the USCSICMD uscsi(7I) ioctl. This ioctl is converted to + * a VD_OP_SCSICMD operation which is sent to the vdisk server. If a SCSI + * reset is requested (i.e. a flag USCSI_RESET* is set) then the ioctl is + * converted to a VD_OP_RESET operation. 
+ */ +static int +vdc_uscsi_cmd(vdc_t *vdc, caddr_t arg, int mode) +{ + struct uscsi_cmd uscsi; + struct uscsi_cmd32 uscsi32; + vd_scsi_t *vd_scsi; + int vd_scsi_len; + union scsi_cdb *cdb; + struct scsi_extended_sense *sense; + char *datain, *dataout; + size_t cdb_len, datain_len, dataout_len, sense_len; + int rv; + + if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { + if (ddi_copyin(arg, &uscsi32, sizeof (struct uscsi_cmd32), + mode) != 0) + return (EFAULT); + uscsi_cmd32touscsi_cmd((&uscsi32), (&uscsi)); + } else { + if (ddi_copyin(arg, &uscsi, sizeof (struct uscsi_cmd), + mode) != 0) + return (EFAULT); + } + + /* a uscsi reset is converted to a VD_OP_RESET operation */ + if (uscsi.uscsi_flags & (USCSI_RESET | USCSI_RESET_LUN | + USCSI_RESET_ALL)) { + rv = vdc_do_sync_op(vdc, VD_OP_RESET, NULL, 0, 0, 0, CB_SYNC, + (void *)(uint64_t)mode, VIO_both_dir, B_TRUE); + return (rv); + } + + /* cdb buffer length */ + cdb_len = uscsi.uscsi_cdblen; + + /* data in and out buffers length */ + if (uscsi.uscsi_flags & USCSI_READ) { + datain_len = uscsi.uscsi_buflen; + dataout_len = 0; + } else { + datain_len = 0; + dataout_len = uscsi.uscsi_buflen; + } + + /* sense buffer length */ + if (uscsi.uscsi_flags & USCSI_RQENABLE) + sense_len = uscsi.uscsi_rqlen; + else + sense_len = 0; + + /* allocate buffer for the VD_SCSICMD_OP operation */ + vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, + &vd_scsi_len); + + /* + * The documentation of USCSI_ISOLATE and USCSI_DIAGNOSE is very vague, + * but basically they prevent a SCSI command from being retried in case + * of an error. 
+ */ + if ((uscsi.uscsi_flags & USCSI_ISOLATE) || + (uscsi.uscsi_flags & USCSI_DIAGNOSE)) + vd_scsi->options |= VD_SCSI_OPT_NORETRY; + + /* set task attribute */ + if (uscsi.uscsi_flags & USCSI_NOTAG) { + vd_scsi->task_attribute = 0; + } else { + if (uscsi.uscsi_flags & USCSI_HEAD) + vd_scsi->task_attribute = VD_SCSI_TASK_ACA; + else if (uscsi.uscsi_flags & USCSI_HTAG) + vd_scsi->task_attribute = VD_SCSI_TASK_HQUEUE; + else if (uscsi.uscsi_flags & USCSI_OTAG) + vd_scsi->task_attribute = VD_SCSI_TASK_ORDERED; + else + vd_scsi->task_attribute = 0; + } + + /* set timeout */ + vd_scsi->timeout = uscsi.uscsi_timeout; + + /* copy-in cdb data */ + cdb = VD_SCSI_DATA_CDB(vd_scsi); + if (ddi_copyin(uscsi.uscsi_cdb, cdb, cdb_len, mode) != 0) { + rv = EFAULT; + goto done; + } + + /* keep a pointer to the sense buffer */ + sense = VD_SCSI_DATA_SENSE(vd_scsi); + + /* keep a pointer to the data-in buffer */ + datain = (char *)VD_SCSI_DATA_IN(vd_scsi); + + /* copy-in request data to the data-out buffer */ + dataout = (char *)VD_SCSI_DATA_OUT(vd_scsi); + if (!(uscsi.uscsi_flags & USCSI_READ)) { + if (ddi_copyin(uscsi.uscsi_bufaddr, dataout, dataout_len, + mode)) { + rv = EFAULT; + goto done; + } + } + + /* submit the request */ + rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, + 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); + + if (rv != 0) + goto done; + + /* update scsi status */ + uscsi.uscsi_status = vd_scsi->cmd_status; + + /* update sense data */ + if ((uscsi.uscsi_flags & USCSI_RQENABLE) && + (uscsi.uscsi_status == STATUS_CHECK || + uscsi.uscsi_status == STATUS_TERMINATED)) { + + uscsi.uscsi_rqstatus = vd_scsi->sense_status; + + if (uscsi.uscsi_rqstatus == STATUS_GOOD) { + uscsi.uscsi_rqresid = uscsi.uscsi_rqlen - + vd_scsi->sense_len; + if (ddi_copyout(sense, uscsi.uscsi_rqbuf, + vd_scsi->sense_len, mode) != 0) { + rv = EFAULT; + goto done; + } + } + } + + /* update request data */ + if (uscsi.uscsi_status == STATUS_GOOD) { + if 
(uscsi.uscsi_flags & USCSI_READ) { + uscsi.uscsi_resid = uscsi.uscsi_buflen - + vd_scsi->datain_len; + if (ddi_copyout(datain, uscsi.uscsi_bufaddr, + vd_scsi->datain_len, mode) != 0) { + rv = EFAULT; + goto done; + } + } else { + uscsi.uscsi_resid = uscsi.uscsi_buflen - + vd_scsi->dataout_len; + } + } + + /* copy-out result */ + if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { + uscsi_cmdtouscsi_cmd32((&uscsi), (&uscsi32)); + if (ddi_copyout(&uscsi32, arg, sizeof (struct uscsi_cmd32), + mode) != 0) { + rv = EFAULT; + goto done; + } + } else { + if (ddi_copyout(&uscsi, arg, sizeof (struct uscsi_cmd), + mode) != 0) { + rv = EFAULT; + goto done; + } + } + + /* get the return code from the SCSI command status */ + rv = vdc_scsi_status(vdc, vd_scsi, + !(uscsi.uscsi_flags & USCSI_SILENT)); + +done: + kmem_free(vd_scsi, vd_scsi_len); + return (rv); +} + +/* + * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT IN command. + * + * Arguments: + * cmd - SCSI PERSISTENT IN command + * len - length of the SCSI input buffer + * vd_scsi_len - return the length of the allocated buffer + * + * Returned Value: + * a pointer to the allocated VD_OP_SCSICMD buffer. + */ +static vd_scsi_t * +vdc_scsi_alloc_persistent_in(uchar_t cmd, int len, int *vd_scsi_len) +{ + int cdb_len, sense_len, datain_len, dataout_len; + vd_scsi_t *vd_scsi; + union scsi_cdb *cdb; + + cdb_len = CDB_GROUP1; + sense_len = sizeof (struct scsi_extended_sense); + datain_len = len; + dataout_len = 0; + + vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, + vd_scsi_len); + + cdb = VD_SCSI_DATA_CDB(vd_scsi); + + /* set cdb */ + cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_IN; + cdb->cdb_opaque[1] = cmd; + FORMG1COUNT(cdb, datain_len); + + vd_scsi->timeout = vdc_scsi_timeout; + + return (vd_scsi); +} + +/* + * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT OUT command. 
+ * + * Arguments: + * cmd - SCSI PERSISTENT OUT command + * len - length of the SCSI output buffer + * vd_scsi_len - return the length of the allocated buffer + * + * Returned Code: + * a pointer to the allocated VD_OP_SCSICMD buffer. + */ +static vd_scsi_t * +vdc_scsi_alloc_persistent_out(uchar_t cmd, int len, int *vd_scsi_len) +{ + int cdb_len, sense_len, datain_len, dataout_len; + vd_scsi_t *vd_scsi; + union scsi_cdb *cdb; + + cdb_len = CDB_GROUP1; + sense_len = sizeof (struct scsi_extended_sense); + datain_len = 0; + dataout_len = len; + + vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, datain_len, dataout_len, + vd_scsi_len); + + cdb = VD_SCSI_DATA_CDB(vd_scsi); + + /* set cdb */ + cdb->scc_cmd = SCMD_PERSISTENT_RESERVE_OUT; + cdb->cdb_opaque[1] = cmd; + FORMG1COUNT(cdb, dataout_len); + + vd_scsi->timeout = vdc_scsi_timeout; + + return (vd_scsi); +} + +/* + * Implement the MHIOCGRP_INKEYS mhd(7i) ioctl. The ioctl is converted + * to a SCSI PERSISTENT IN READ KEYS command which is sent to the vdisk + * server with a VD_OP_SCSICMD operation. 
+ */ +static int +vdc_mhd_inkeys(vdc_t *vdc, caddr_t arg, int mode) +{ + vd_scsi_t *vd_scsi; + mhioc_inkeys_t inkeys; + mhioc_key_list_t klist; + struct mhioc_inkeys32 inkeys32; + struct mhioc_key_list32 klist32; + sd_prin_readkeys_t *scsi_keys; + void *user_keys; + int vd_scsi_len; + int listsize, listlen, rv; + + /* copyin arguments */ + if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { + rv = ddi_copyin(arg, &inkeys32, sizeof (inkeys32), mode); + if (rv != 0) + return (EFAULT); + + rv = ddi_copyin((caddr_t)(uintptr_t)inkeys32.li, &klist32, + sizeof (klist32), mode); + if (rv != 0) + return (EFAULT); + + listsize = klist32.listsize; + } else { + rv = ddi_copyin(arg, &inkeys, sizeof (inkeys), mode); + if (rv != 0) + return (EFAULT); + + rv = ddi_copyin(inkeys.li, &klist, sizeof (klist), mode); + if (rv != 0) + return (EFAULT); + + listsize = klist.listsize; + } + + /* build SCSI VD_OP request */ + vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_KEYS, + sizeof (sd_prin_readkeys_t) - sizeof (caddr_t) + + (sizeof (mhioc_resv_key_t) * listsize), &vd_scsi_len); + + scsi_keys = (sd_prin_readkeys_t *)VD_SCSI_DATA_IN(vd_scsi); + + /* submit the request */ + rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, + 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); + + if (rv != 0) + goto done; + + listlen = scsi_keys->len / MHIOC_RESV_KEY_SIZE; + + if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { + inkeys32.generation = scsi_keys->generation; + rv = ddi_copyout(&inkeys32, arg, sizeof (inkeys32), mode); + if (rv != 0) { + rv = EFAULT; + goto done; + } + + klist32.listlen = listlen; + rv = ddi_copyout(&klist32, (caddr_t)(uintptr_t)inkeys32.li, + sizeof (klist32), mode); + if (rv != 0) { + rv = EFAULT; + goto done; + } + + user_keys = (caddr_t)(uintptr_t)klist32.list; + } else { + inkeys.generation = scsi_keys->generation; + rv = ddi_copyout(&inkeys, arg, sizeof (inkeys), mode); + if (rv != 0) { + rv = EFAULT; + goto 
done; + } + + klist.listlen = listlen; + rv = ddi_copyout(&klist, inkeys.li, sizeof (klist), mode); + if (rv != 0) { + rv = EFAULT; + goto done; + } + + user_keys = klist.list; + } + + /* copy out keys */ + if (listlen > 0 && listsize > 0) { + if (listsize < listlen) + listlen = listsize; + rv = ddi_copyout(&scsi_keys->keylist, user_keys, + listlen * MHIOC_RESV_KEY_SIZE, mode); + if (rv != 0) + rv = EFAULT; + } + + if (rv == 0) + rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); + +done: + kmem_free(vd_scsi, vd_scsi_len); + + return (rv); +} + +/* + * Implement the MHIOCGRP_INRESV mhd(7i) ioctl. The ioctl is converted + * to a SCSI PERSISTENT IN READ RESERVATION command which is sent to + * the vdisk server with a VD_OP_SCSICMD operation. + */ +static int +vdc_mhd_inresv(vdc_t *vdc, caddr_t arg, int mode) +{ + vd_scsi_t *vd_scsi; + mhioc_inresvs_t inresv; + mhioc_resv_desc_list_t rlist; + struct mhioc_inresvs32 inresv32; + struct mhioc_resv_desc_list32 rlist32; + mhioc_resv_desc_t mhd_resv; + sd_prin_readresv_t *scsi_resv; + sd_readresv_desc_t *resv; + mhioc_resv_desc_t *user_resv; + int vd_scsi_len; + int listsize, listlen, i, rv; + + /* copyin arguments */ + if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { + rv = ddi_copyin(arg, &inresv32, sizeof (inresv32), mode); + if (rv != 0) + return (EFAULT); + + rv = ddi_copyin((caddr_t)(uintptr_t)inresv32.li, &rlist32, + sizeof (rlist32), mode); + if (rv != 0) + return (EFAULT); + + listsize = rlist32.listsize; + } else { + rv = ddi_copyin(arg, &inresv, sizeof (inresv), mode); + if (rv != 0) + return (EFAULT); + + rv = ddi_copyin(inresv.li, &rlist, sizeof (rlist), mode); + if (rv != 0) + return (EFAULT); + + listsize = rlist.listsize; + } + + /* build SCSI VD_OP request */ + vd_scsi = vdc_scsi_alloc_persistent_in(SD_READ_RESV, + sizeof (sd_prin_readresv_t) - sizeof (caddr_t) + + (SCSI3_RESV_DESC_LEN * listsize), &vd_scsi_len); + + scsi_resv = (sd_prin_readresv_t *)VD_SCSI_DATA_IN(vd_scsi); + + /* submit the 
request */ + rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, + 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); + + if (rv != 0) + goto done; + + listlen = scsi_resv->len / SCSI3_RESV_DESC_LEN; + + if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { + inresv32.generation = scsi_resv->generation; + rv = ddi_copyout(&inresv32, arg, sizeof (inresv32), mode); + if (rv != 0) { + rv = EFAULT; + goto done; + } + + rlist32.listlen = listlen; + rv = ddi_copyout(&rlist32, (caddr_t)(uintptr_t)inresv32.li, + sizeof (rlist32), mode); + if (rv != 0) { + rv = EFAULT; + goto done; + } + + user_resv = (mhioc_resv_desc_t *)(uintptr_t)rlist32.list; + } else { + inresv.generation = scsi_resv->generation; + rv = ddi_copyout(&inresv, arg, sizeof (inresv), mode); + if (rv != 0) { + rv = EFAULT; + goto done; + } + + rlist.listlen = listlen; + rv = ddi_copyout(&rlist, inresv.li, sizeof (rlist), mode); + if (rv != 0) { + rv = EFAULT; + goto done; + } + + user_resv = rlist.list; + } + + /* copy out reservations */ + if (listsize > 0 && listlen > 0) { + if (listsize < listlen) + listlen = listsize; + resv = (sd_readresv_desc_t *)&scsi_resv->readresv_desc; + + for (i = 0; i < listlen; i++) { + mhd_resv.type = resv->type; + mhd_resv.scope = resv->scope; + mhd_resv.scope_specific_addr = + BE_32(resv->scope_specific_addr); + bcopy(&resv->resvkey, &mhd_resv.key, + MHIOC_RESV_KEY_SIZE); + + rv = ddi_copyout(&mhd_resv, user_resv, + sizeof (mhd_resv), mode); + if (rv != 0) { + rv = EFAULT; + goto done; + } + resv++; + user_resv++; + } + } + + if (rv == 0) + rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); + +done: + kmem_free(vd_scsi, vd_scsi_len); + return (rv); +} + +/* + * Implement the MHIOCGRP_REGISTER mhd(7i) ioctl. The ioctl is converted + * to a SCSI PERSISTENT OUT REGISTER command which is sent to the vdisk + * server with a VD_OP_SCSICMD operation. 
+ */ +static int +vdc_mhd_register(vdc_t *vdc, caddr_t arg, int mode) +{ + vd_scsi_t *vd_scsi; + sd_prout_t *scsi_prout; + mhioc_register_t mhd_reg; + int vd_scsi_len, rv; + + /* copyin arguments */ + rv = ddi_copyin(arg, &mhd_reg, sizeof (mhd_reg), mode); + if (rv != 0) + return (EFAULT); + + /* build SCSI VD_OP request */ + vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTER, + sizeof (sd_prout_t), &vd_scsi_len); + + /* set parameters */ + scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); + bcopy(mhd_reg.oldkey.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); + bcopy(mhd_reg.newkey.key, scsi_prout->service_key, MHIOC_RESV_KEY_SIZE); + scsi_prout->aptpl = (uchar_t)mhd_reg.aptpl; + + /* submit the request */ + rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, + 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); + + if (rv == 0) + rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); + + kmem_free(vd_scsi, vd_scsi_len); + return (rv); +} + +/* + * Implement the MHIOCGRP_RESERVE mhd(7i) ioctl. The ioctl is converted + * to a SCSI PERSISTENT OUT RESERVE command which is sent to the vdisk + * server with a VD_OP_SCSICMD operation. 
+ */ +static int +vdc_mhd_reserve(vdc_t *vdc, caddr_t arg, int mode) +{ + union scsi_cdb *cdb; + vd_scsi_t *vd_scsi; + sd_prout_t *scsi_prout; + mhioc_resv_desc_t mhd_resv; + int vd_scsi_len, rv; + + /* copyin arguments */ + rv = ddi_copyin(arg, &mhd_resv, sizeof (mhd_resv), mode); + if (rv != 0) + return (EFAULT); + + /* build SCSI VD_OP request */ + vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_RESERVE, + sizeof (sd_prout_t), &vd_scsi_len); + + /* set parameters */ + cdb = VD_SCSI_DATA_CDB(vd_scsi); + scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); + bcopy(mhd_resv.key.key, scsi_prout->res_key, MHIOC_RESV_KEY_SIZE); + scsi_prout->scope_address = mhd_resv.scope_specific_addr; + cdb->cdb_opaque[2] = mhd_resv.type; + + /* submit the request */ + rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, + 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); + + if (rv == 0) + rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); + + kmem_free(vd_scsi, vd_scsi_len); + return (rv); +} + +/* + * Implement the MHIOCGRP_PREEMPTANDABORT mhd(7i) ioctl. The ioctl is + * converted to a SCSI PERSISTENT OUT PREEMPT AND ABORT command which + * is sent to the vdisk server with a VD_OP_SCSICMD operation. 
+ */ +static int +vdc_mhd_preemptabort(vdc_t *vdc, caddr_t arg, int mode) +{ + union scsi_cdb *cdb; + vd_scsi_t *vd_scsi; + sd_prout_t *scsi_prout; + mhioc_preemptandabort_t mhd_preempt; + int vd_scsi_len, rv; + + /* copyin arguments */ + rv = ddi_copyin(arg, &mhd_preempt, sizeof (mhd_preempt), mode); + if (rv != 0) + return (EFAULT); + + /* build SCSI VD_OP request */ + vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_PREEMPTANDABORT, + sizeof (sd_prout_t), &vd_scsi_len); + + /* set parameters */ + vd_scsi->task_attribute = VD_SCSI_TASK_ACA; + cdb = VD_SCSI_DATA_CDB(vd_scsi); + scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); + bcopy(mhd_preempt.resvdesc.key.key, scsi_prout->res_key, + MHIOC_RESV_KEY_SIZE); + bcopy(mhd_preempt.victim_key.key, scsi_prout->service_key, + MHIOC_RESV_KEY_SIZE); + scsi_prout->scope_address = mhd_preempt.resvdesc.scope_specific_addr; + cdb->cdb_opaque[2] = mhd_preempt.resvdesc.type; + + /* submit the request */ + rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, + 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); + + if (rv == 0) + rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); + + kmem_free(vd_scsi, vd_scsi_len); + return (rv); +} + +/* + * Implement the MHIOCGRP_REGISTERANDIGNOREKEY mhd(7i) ioctl. The ioctl + * is converted to a SCSI PERSISTENT OUT REGISTER AND IGNORE EXISTING KEY + * command which is sent to the vdisk server with a VD_OP_SCSICMD operation. 
+ */ +static int +vdc_mhd_registerignore(vdc_t *vdc, caddr_t arg, int mode) +{ + vd_scsi_t *vd_scsi; + sd_prout_t *scsi_prout; + mhioc_registerandignorekey_t mhd_regi; + int vd_scsi_len, rv; + + /* copyin arguments */ + rv = ddi_copyin(arg, &mhd_regi, sizeof (mhd_regi), mode); + if (rv != 0) + return (EFAULT); + + /* build SCSI VD_OP request */ + vd_scsi = vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTERANDIGNOREKEY, + sizeof (sd_prout_t), &vd_scsi_len); + + /* set parameters */ + scsi_prout = (sd_prout_t *)VD_SCSI_DATA_OUT(vd_scsi); + bcopy(mhd_regi.newkey.key, scsi_prout->service_key, + MHIOC_RESV_KEY_SIZE); + scsi_prout->aptpl = (uchar_t)mhd_regi.aptpl; + + /* submit the request */ + rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, + 0, 0, CB_SYNC, (void *)(uint64_t)mode, VIO_both_dir, B_FALSE); + + if (rv == 0) + rv = vdc_scsi_status(vdc, vd_scsi, B_FALSE); + + kmem_free(vd_scsi, vd_scsi_len); + return (rv); +} + +/* + * This function is used by the failfast mechanism to send a SCSI command + * to check for reservation conflict. + */ +static int +vdc_failfast_scsi_cmd(vdc_t *vdc, uchar_t scmd) +{ + int cdb_len, sense_len, vd_scsi_len; + vd_scsi_t *vd_scsi; + union scsi_cdb *cdb; + int rv; + + ASSERT(scmd == SCMD_TEST_UNIT_READY || scmd == SCMD_WRITE_G1); + + if (scmd == SCMD_WRITE_G1) + cdb_len = CDB_GROUP1; + else + cdb_len = CDB_GROUP0; + + sense_len = sizeof (struct scsi_extended_sense); + + vd_scsi = vdc_scsi_alloc(cdb_len, sense_len, 0, 0, &vd_scsi_len); + + /* set cdb */ + cdb = VD_SCSI_DATA_CDB(vd_scsi); + cdb->scc_cmd = scmd; + + vd_scsi->timeout = vdc_scsi_timeout; + + /* + * Submit the request. The last argument has to be B_FALSE so that + * vdc_do_sync_op does not loop checking for reservation conflict if + * the operation returns an error. 
+ */ + rv = vdc_do_sync_op(vdc, VD_OP_SCSICMD, (caddr_t)vd_scsi, vd_scsi_len, + 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_FALSE); + + if (rv == 0) + (void) vdc_scsi_status(vdc, vd_scsi, B_FALSE); + + kmem_free(vd_scsi, vd_scsi_len); + return (rv); +} + +/* + * This function is used by the failfast mechanism to check for reservation + * conflict. It sends some SCSI commands which will fail with a reservation + * conflict error if the system does not have access to the disk and this + * will panic the system. + * + * Returned Code: + * 0 - disk is accessible without reservation conflict error + * != 0 - unable to check if disk is accessible + */ +int +vdc_failfast_check_resv(vdc_t *vdc) +{ + int failure = 0; + + /* + * Send a TEST UNIT READY command. The command will panic + * the system if it fails with a reservation conflict. + */ + if (vdc_failfast_scsi_cmd(vdc, SCMD_TEST_UNIT_READY) != 0) + failure++; + + /* + * With SPC-3 compliant devices TEST UNIT READY will succeed on + * a reserved device, so we also do a WRITE(10) of zero byte in + * order to provoke a Reservation Conflict status on those newer + * devices. + */ + if (vdc_failfast_scsi_cmd(vdc, SCMD_WRITE_G1) != 0) + failure++; + + return (failure); +} + +/* + * Add a pending I/O to the failfast I/O queue. An I/O is added to this + * queue when it has failed and failfast is enabled. Then we have to check + * if it has failed because of a reservation conflict in which case we have + * to panic the system. + * + * Async I/O should be queued with their block I/O data transfer structure + * (buf). Sync I/O should be queued with buf = NULL. 
+ */ +static vdc_io_t * +vdc_failfast_io_queue(vdc_t *vdc, struct buf *buf) +{ + vdc_io_t *vio; + + ASSERT(MUTEX_HELD(&vdc->lock)); + + vio = kmem_alloc(sizeof (vdc_io_t), KM_SLEEP); + vio->vio_next = vdc->failfast_io_queue; + vio->vio_buf = buf; + vio->vio_qtime = ddi_get_lbolt(); + + vdc->failfast_io_queue = vio; + + /* notify the failfast thread that a new I/O is queued */ + cv_signal(&vdc->failfast_cv); + + return (vio); +} + +/* + * Remove and complete I/O in the failfast I/O queue which have been + * added after the indicated deadline. A deadline of 0 means that all + * I/O have to be unqueued and marked as completed. + */ +static void +vdc_failfast_io_unqueue(vdc_t *vdc, clock_t deadline) +{ + vdc_io_t *vio, *vio_tmp; + + ASSERT(MUTEX_HELD(&vdc->lock)); + + vio_tmp = NULL; + vio = vdc->failfast_io_queue; + + if (deadline != 0) { + /* + * Skip any io queued after the deadline. The failfast + * I/O queue is ordered starting with the last I/O added + * to the queue. + */ + while (vio != NULL && vio->vio_qtime > deadline) { + vio_tmp = vio; + vio = vio->vio_next; + } + } + + if (vio == NULL) + /* nothing to unqueue */ + return; + + /* update the queue */ + if (vio_tmp == NULL) + vdc->failfast_io_queue = NULL; + else + vio_tmp->vio_next = NULL; + + /* + * Complete unqueued I/O. Async I/O have a block I/O data transfer + * structure (buf) and they are completed by calling biodone(). Sync + * I/O do not have a buf and they are completed by setting the + * vio_qtime to zero and signaling failfast_io_cv. In that case, the + * thread waiting for the I/O to complete is responsible for freeing + * the vio structure. + */ + while (vio != NULL) { + vio_tmp = vio->vio_next; + if (vio->vio_buf != NULL) { + biodone(vio->vio_buf); + kmem_free(vio, sizeof (vdc_io_t)); + } else { + vio->vio_qtime = 0; + } + vio = vio_tmp; + } + + cv_broadcast(&vdc->failfast_io_cv); +} + +/* + * Failfast Thread. 
+ * + * While failfast is enabled, the failfast thread sends a TEST UNIT READY + * and a zero size WRITE(10) SCSI commands on a regular basis to check that + * we still have access to the disk. If a command fails with a RESERVATION + * CONFLICT error then the system will immediatly panic. + * + * The failfast thread is also woken up when an I/O has failed. It then check + * the access to the disk to ensure that the I/O failure was not due to a + * reservation conflict. + * + * There is one failfast thread for each virtual disk for which failfast is + * enabled. We could have only one thread sending requests for all disks but + * this would need vdc to send asynchronous requests and to have callbacks to + * process replies. + */ +static void +vdc_failfast_thread(void *arg) +{ + int status; + vdc_t *vdc = (vdc_t *)arg; + clock_t timeout, starttime; + + mutex_enter(&vdc->lock); + + while (vdc->failfast_interval != 0) { + + starttime = ddi_get_lbolt(); + + mutex_exit(&vdc->lock); + + /* check for reservation conflict */ + status = vdc_failfast_check_resv(vdc); + + mutex_enter(&vdc->lock); + /* + * We have dropped the lock to send the SCSI command so we have + * to check that failfast is still enabled. + */ + if (vdc->failfast_interval == 0) + break; + + /* + * If we have successfully check the disk access and there was + * no reservation conflict then we can complete any I/O queued + * before the last check. + */ + if (status == 0) + vdc_failfast_io_unqueue(vdc, starttime); + + /* proceed again if some I/O are still in the queue */ + if (vdc->failfast_io_queue != NULL) + continue; + + timeout = ddi_get_lbolt() + + drv_usectohz(vdc->failfast_interval); + (void) cv_timedwait(&vdc->failfast_cv, &vdc->lock, timeout); + } + + /* + * Failfast is being stop so we can complete any queued I/O. + */ + vdc_failfast_io_unqueue(vdc, 0); + vdc->failfast_thread = NULL; + mutex_exit(&vdc->lock); + thread_exit(); +} + +/* + * Implement the MHIOCENFAILFAST mhd(7i) ioctl. 
+ */ +static int +vdc_failfast(vdc_t *vdc, caddr_t arg, int mode) +{ + unsigned int mh_time; + + if (ddi_copyin((void *)arg, &mh_time, sizeof (int), mode)) + return (EFAULT); + + mutex_enter(&vdc->lock); + if (mh_time != 0 && vdc->failfast_thread == NULL) { + vdc->failfast_thread = thread_create(NULL, 0, + vdc_failfast_thread, vdc, 0, &p0, TS_RUN, + v.v_maxsyspri - 2); + } + + vdc->failfast_interval = mh_time * 1000; + cv_signal(&vdc->failfast_cv); + mutex_exit(&vdc->lock); + + return (0); +} + +/* + * Implement the MHIOCTKOWN and MHIOCRELEASE mhd(7i) ioctls. These ioctls are + * converted to VD_OP_SET_ACCESS operations. + */ +static int +vdc_access_set(vdc_t *vdc, uint64_t flags, int mode) +{ + int rv; + + /* submit owership command request */ + rv = vdc_do_sync_op(vdc, VD_OP_SET_ACCESS, (caddr_t)&flags, + sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode, + VIO_both_dir, B_TRUE); + + return (rv); +} + +/* + * Implement the MHIOCSTATUS mhd(7i) ioctl. This ioctl is converted to a + * VD_OP_GET_ACCESS operation. + */ +static int +vdc_access_get(vdc_t *vdc, uint64_t *status, int mode) +{ + int rv; + + /* submit owership command request */ + rv = vdc_do_sync_op(vdc, VD_OP_GET_ACCESS, (caddr_t)status, + sizeof (uint64_t), 0, 0, CB_SYNC, (void *)(uint64_t)mode, + VIO_both_dir, B_TRUE); + + return (rv); +} + +/* + * Disk Ownership Thread. + * + * When we have taken the ownership of a disk, this thread waits to be + * notified when the LDC channel is reset so that it can recover the + * ownership. + * + * Note that the thread handling the LDC reset (vdc_process_msg_thread()) + * can not be used to do the ownership recovery because it has to be + * running to handle the reply message to the ownership operation. 
+ */ +static void +vdc_ownership_thread(void *arg) +{ + vdc_t *vdc = (vdc_t *)arg; + clock_t timeout; + uint64_t status; + + mutex_enter(&vdc->ownership_lock); + mutex_enter(&vdc->lock); + + while (vdc->ownership & VDC_OWNERSHIP_WANTED) { + + if ((vdc->ownership & VDC_OWNERSHIP_RESET) || + !(vdc->ownership & VDC_OWNERSHIP_GRANTED)) { + /* + * There was a reset so the ownership has been lost, + * try to recover. We do this without using the preempt + * option so that we don't steal the ownership from + * someone who has preempted us. + */ + DMSG(vdc, 0, "[%d] Ownership lost, recovering", + vdc->instance); + + vdc->ownership &= ~(VDC_OWNERSHIP_RESET | + VDC_OWNERSHIP_GRANTED); + + mutex_exit(&vdc->lock); + + status = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE | + VD_ACCESS_SET_PRESERVE, FKIOCTL); + + mutex_enter(&vdc->lock); + + if (status == 0) { + DMSG(vdc, 0, "[%d] Ownership recovered", + vdc->instance); + vdc->ownership |= VDC_OWNERSHIP_GRANTED; + } else { + DMSG(vdc, 0, "[%d] Fail to recover ownership", + vdc->instance); + } + + } + + /* + * If we have the ownership then we just wait for an event + * to happen (LDC reset), otherwise we will retry to recover + * after a delay. 
+ */ + if (vdc->ownership & VDC_OWNERSHIP_GRANTED) + timeout = 0; + else + timeout = ddi_get_lbolt() + + drv_usectohz(vdc_ownership_delay); + + /* Release the ownership_lock and wait on the vdc lock */ + mutex_exit(&vdc->ownership_lock); + + if (timeout == 0) + (void) cv_wait(&vdc->ownership_cv, &vdc->lock); + else + (void) cv_timedwait(&vdc->ownership_cv, + &vdc->lock, timeout); + + mutex_exit(&vdc->lock); + + mutex_enter(&vdc->ownership_lock); + mutex_enter(&vdc->lock); + } + + vdc->ownership_thread = NULL; + mutex_exit(&vdc->lock); + mutex_exit(&vdc->ownership_lock); + + thread_exit(); +} + +static void +vdc_ownership_update(vdc_t *vdc, int ownership_flags) +{ + ASSERT(MUTEX_HELD(&vdc->ownership_lock)); + + mutex_enter(&vdc->lock); + vdc->ownership = ownership_flags; + if ((vdc->ownership & VDC_OWNERSHIP_WANTED) && + vdc->ownership_thread == NULL) { + /* start ownership thread */ + vdc->ownership_thread = thread_create(NULL, 0, + vdc_ownership_thread, vdc, 0, &p0, TS_RUN, + v.v_maxsyspri - 2); + } else { + /* notify the ownership thread */ + cv_signal(&vdc->ownership_cv); + } + mutex_exit(&vdc->lock); +} + +/* + * Get the size and the block size of a virtual disk from the vdisk server. + * We need to use this operation when the vdisk_size attribute was not + * available during the handshake with the vdisk server. 
+ */ +static int +vdc_check_capacity(vdc_t *vdc) +{ + int rv = 0; + size_t alloc_len; + vd_capacity_t *vd_cap; + + if (vdc->vdisk_size != 0) + return (0); + + alloc_len = P2ROUNDUP(sizeof (vd_capacity_t), sizeof (uint64_t)); + + vd_cap = kmem_zalloc(alloc_len, KM_SLEEP); + + rv = vdc_do_sync_op(vdc, VD_OP_GET_CAPACITY, (caddr_t)vd_cap, alloc_len, + 0, 0, CB_SYNC, (void *)(uint64_t)FKIOCTL, VIO_both_dir, B_TRUE); + + if (rv == 0) { + if (vd_cap->vdisk_block_size != vdc->block_size || + vd_cap->vdisk_size == VD_SIZE_UNKNOWN || + vd_cap->vdisk_size == 0) + rv = EINVAL; + else + vdc->vdisk_size = vd_cap->vdisk_size; + } + + kmem_free(vd_cap, alloc_len); + return (rv); +} + +/* * This structure is used in the DKIO(7I) array below. */ typedef struct vdc_dk_ioctl { @@ -4772,6 +6155,23 @@ static vdc_dk_ioctl_t dk_ioctl[] = { /* DIOCTL_RWCMD is converted to a read or a write */ {0, DIOCTL_RWCMD, sizeof (struct dadkio_rwcmd), NULL}, + /* mhd(7I) non-shared multihost disks ioctls */ + {0, MHIOCTKOWN, 0, vdc_null_copy_func}, + {0, MHIOCRELEASE, 0, vdc_null_copy_func}, + {0, MHIOCSTATUS, 0, vdc_null_copy_func}, + {0, MHIOCQRESERVE, 0, vdc_null_copy_func}, + + /* mhd(7I) shared multihost disks ioctls */ + {0, MHIOCGRP_INKEYS, 0, vdc_null_copy_func}, + {0, MHIOCGRP_INRESV, 0, vdc_null_copy_func}, + {0, MHIOCGRP_REGISTER, 0, vdc_null_copy_func}, + {0, MHIOCGRP_RESERVE, 0, vdc_null_copy_func}, + {0, MHIOCGRP_PREEMPTANDABORT, 0, vdc_null_copy_func}, + {0, MHIOCGRP_REGISTERANDIGNOREKEY, 0, vdc_null_copy_func}, + + /* mhd(7I) failfast ioctl */ + {0, MHIOCENFAILFAST, 0, vdc_null_copy_func}, + /* * These particular ioctls are not sent to the server - vdc fakes up * the necessary info. @@ -4785,6 +6185,21 @@ static vdc_dk_ioctl_t dk_ioctl[] = { }; /* + * The signature of vd_process_ioctl() has changed to include the return value + * pointer. However we don't want to change vd_efi_* functions now so we add + * this wrapper function so that we can use it with vdc_efi_init(). 
+ * + * vd_efi_* functions need some changes to fix 6528974 and so we will eventually + * remove this function when fixing that bug. + */ +static int +vd_process_efi_ioctl(dev_t dev, int cmd, caddr_t arg, int mode) +{ + int rval; + return (vd_process_ioctl(dev, cmd, arg, mode, &rval)); +} + +/* * Function: * vd_process_ioctl() * @@ -4797,6 +6212,7 @@ static vdc_dk_ioctl_t dk_ioctl[] = { * arg - pointer to user provided structure * (contains data to be set or reference parameter for get) * mode - bit flag, indicating open settings, 32/64 bit type, etc + * rvalp - pointer to return value for calling process. * * Return Code: * 0 @@ -4806,7 +6222,7 @@ static vdc_dk_ioctl_t dk_ioctl[] = { * ENOTSUP */ static int -vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode) +vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode, int *rvalp) { int instance = VDCUNIT(dev); vdc_t *vdc = NULL; @@ -4828,6 +6244,11 @@ vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode) DMSG(vdc, 0, "[%d] Processing ioctl(%x) for dev %lx : model %x\n", instance, cmd, dev, ddi_model_convert_from(mode & FMODELS)); + if (rvalp != NULL) { + /* the return value of the ioctl is 0 by default */ + *rvalp = 0; + } + /* * Validate the ioctl operation to be performed. * @@ -4860,61 +6281,185 @@ vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode) len = iop->nbytes; } - /* - * Deal with the ioctls which the server does not provide. 
vdc can - * fake these up and return immediately - */ + /* check if the ioctl is applicable */ switch (cmd) { case CDROMREADOFFSET: case DKIOCREMOVABLE: - case USCSICMD: return (ENOTTY); + case USCSICMD: + case MHIOCTKOWN: + case MHIOCSTATUS: + case MHIOCQRESERVE: + case MHIOCRELEASE: + case MHIOCGRP_INKEYS: + case MHIOCGRP_INRESV: + case MHIOCGRP_REGISTER: + case MHIOCGRP_RESERVE: + case MHIOCGRP_PREEMPTANDABORT: + case MHIOCGRP_REGISTERANDIGNOREKEY: + case MHIOCENFAILFAST: + if (vdc->cinfo == NULL) + return (ENXIO); + if (vdc->cinfo->dki_ctype != DKC_SCSI_CCS) + return (ENOTTY); + break; + case DIOCTL_RWCMD: - { - if (vdc->cinfo == NULL) - return (ENXIO); + if (vdc->cinfo == NULL) + return (ENXIO); + if (vdc->cinfo->dki_ctype != DKC_DIRECT) + return (ENOTTY); + break; - if (vdc->cinfo->dki_ctype != DKC_DIRECT) - return (ENOTTY); + case DKIOCINFO: + if (vdc->cinfo == NULL) + return (ENXIO); + break; - return (vdc_dioctl_rwcmd(dev, arg, mode)); + case DKIOCGMEDIAINFO: + if (vdc->minfo == NULL) + return (ENXIO); + if (vdc_check_capacity(vdc) != 0) + /* disk capacity is not available */ + return (EIO); + break; + } + + /* + * Deal with ioctls which require a processing different than + * converting ioctl arguments and sending a corresponding + * VD operation. + */ + switch (cmd) { + + case USCSICMD: + { + return (vdc_uscsi_cmd(vdc, arg, mode)); + } + + case MHIOCTKOWN: + { + mutex_enter(&vdc->ownership_lock); + /* + * We have to set VDC_OWNERSHIP_WANTED now so that the ownership + * can be flagged with VDC_OWNERSHIP_RESET if the LDC is reset + * while we are processing the ioctl. 
+ */ + vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED); + + rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE | + VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE, mode); + if (rv == 0) { + vdc_ownership_update(vdc, VDC_OWNERSHIP_WANTED | + VDC_OWNERSHIP_GRANTED); + } else { + vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); } + mutex_exit(&vdc->ownership_lock); + return (rv); + } - case DKIOCGAPART: - { - return (vdc_dkio_get_partition(vdc, arg, mode)); + case MHIOCRELEASE: + { + mutex_enter(&vdc->ownership_lock); + rv = vdc_access_set(vdc, VD_ACCESS_SET_CLEAR, mode); + if (rv == 0) { + vdc_ownership_update(vdc, VDC_OWNERSHIP_NONE); } + mutex_exit(&vdc->ownership_lock); + return (rv); + } + + case MHIOCSTATUS: + { + uint64_t status; + + rv = vdc_access_get(vdc, &status, mode); + if (rv == 0 && rvalp != NULL) + *rvalp = (status & VD_ACCESS_ALLOWED)? 0 : 1; + return (rv); + } + + case MHIOCQRESERVE: + { + rv = vdc_access_set(vdc, VD_ACCESS_SET_EXCLUSIVE, mode); + return (rv); + } + + case MHIOCGRP_INKEYS: + { + return (vdc_mhd_inkeys(vdc, arg, mode)); + } + + case MHIOCGRP_INRESV: + { + return (vdc_mhd_inresv(vdc, arg, mode)); + } + + case MHIOCGRP_REGISTER: + { + return (vdc_mhd_register(vdc, arg, mode)); + } + + case MHIOCGRP_RESERVE: + { + return (vdc_mhd_reserve(vdc, arg, mode)); + } + + case MHIOCGRP_PREEMPTANDABORT: + { + return (vdc_mhd_preemptabort(vdc, arg, mode)); + } + + case MHIOCGRP_REGISTERANDIGNOREKEY: + { + return (vdc_mhd_registerignore(vdc, arg, mode)); + } + + case MHIOCENFAILFAST: + { + rv = vdc_failfast(vdc, arg, mode); + return (rv); + } + + case DIOCTL_RWCMD: + { + return (vdc_dioctl_rwcmd(dev, arg, mode)); + } + + case DKIOCGAPART: + { + return (vdc_dkio_get_partition(vdc, arg, mode)); + } case DKIOCINFO: - { - struct dk_cinfo cinfo; - if (vdc->cinfo == NULL) - return (ENXIO); + { + struct dk_cinfo cinfo; - bcopy(vdc->cinfo, &cinfo, sizeof (struct dk_cinfo)); - cinfo.dki_partition = VDCPART(dev); + bcopy(vdc->cinfo, &cinfo, sizeof (struct 
dk_cinfo)); + cinfo.dki_partition = VDCPART(dev); - rv = ddi_copyout(&cinfo, (void *)arg, - sizeof (struct dk_cinfo), mode); - if (rv != 0) - return (EFAULT); + rv = ddi_copyout(&cinfo, (void *)arg, + sizeof (struct dk_cinfo), mode); + if (rv != 0) + return (EFAULT); - return (0); - } + return (0); + } case DKIOCGMEDIAINFO: - { - if (vdc->minfo == NULL) - return (ENXIO); - - rv = ddi_copyout(vdc->minfo, (void *)arg, - sizeof (struct dk_minfo), mode); - if (rv != 0) - return (EFAULT); + { + ASSERT(vdc->vdisk_size != 0); + if (vdc->minfo->dki_capacity == 0) + vdc->minfo->dki_capacity = vdc->vdisk_size; + rv = ddi_copyout(vdc->minfo, (void *)arg, + sizeof (struct dk_minfo), mode); + if (rv != 0) + return (EFAULT); - return (0); - } + return (0); + } case DKIOCFLUSHWRITECACHE: { @@ -5013,16 +6558,7 @@ vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode) */ rv = vdc_do_sync_op(vdc, iop->op, mem_p, alloc_len, VDCPART(dev), 0, CB_SYNC, (void *)(uint64_t)mode, - VIO_both_dir); - - if (cmd == DKIOCSVTOC || cmd == DKIOCSETEFI) { - /* - * The disk label may have changed. Revalidate the disk - * geometry. This will also update the device nodes and - * properties. 
- */ - vdc_validate(vdc); - } + VIO_both_dir, B_TRUE); if (rv != 0) { /* @@ -5204,19 +6740,21 @@ vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) { _NOTE(ARGUNUSED(vdc)) - void *tmp_mem = NULL; + void *tmp_mem = NULL, *uvtoc; struct vtoc vt; struct vtoc *vtp = &vt; vd_vtoc_t vtvd; int copy_len = 0; - int rv = 0; - - if (dir != VD_COPYIN) - return (0); /* nothing to do */ + int i, rv = 0; if ((from == NULL) || (to == NULL)) return (ENXIO); + if (dir == VD_COPYIN) + uvtoc = from; + else + uvtoc = to; + if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) copy_len = sizeof (struct vtoc32); else @@ -5224,7 +6762,7 @@ vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) tmp_mem = kmem_alloc(copy_len, KM_SLEEP); - rv = ddi_copyin(from, tmp_mem, copy_len, mode); + rv = ddi_copyin(uvtoc, tmp_mem, copy_len, mode); if (rv != 0) { kmem_free(tmp_mem, copy_len); return (EFAULT); @@ -5236,6 +6774,24 @@ vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) vtp = tmp_mem; } + if (dir == VD_COPYOUT) { + /* + * The disk label may have changed. Revalidate the disk + * geometry. This will also update the device nodes and + * properties. + */ + vdc_validate(vdc); + + /* + * We also need to keep track of the timestamp fields. + */ + for (i = 0; i < V_NUMPAR; i++) { + vdc->vtoc->timestamp[i] = vtp->timestamp[i]; + } + + return (0); + } + VTOC2VD_VTOC(vtp, &vtvd); bcopy(&vtvd, to, sizeof (vd_vtoc_t)); kmem_free(tmp_mem, copy_len); @@ -5393,8 +6949,15 @@ vdc_set_efi_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) dk_efi_t dk_efi; void *uaddr; - if (dir == VD_COPYOUT) - return (0); /* nothing to do */ + if (dir == VD_COPYOUT) { + /* + * The disk label may have changed. Revalidate the disk + * geometry. This will also update the device nodes and + * properties. 
+ */ + vdc_validate(vdc); + return (0); + } if ((from == NULL) || (to == NULL)) return (ENXIO); @@ -5440,7 +7003,6 @@ static void vdc_create_fake_geometry(vdc_t *vdc) { ASSERT(vdc != NULL); - ASSERT(vdc->vdisk_size != 0); ASSERT(vdc->max_xfer_sz != 0); /* @@ -5453,10 +7015,13 @@ vdc_create_fake_geometry(vdc_t *vdc) (void) strcpy(vdc->cinfo->dki_dname, VDC_DRIVER_NAME); /* max_xfer_sz is #blocks so we don't need to divide by DEV_BSIZE */ vdc->cinfo->dki_maxtransfer = vdc->max_xfer_sz; + /* - * We currently set the controller type to DKC_DIRECT for any disk. - * When SCSI support is implemented, we will eventually change this - * type to DKC_SCSI_CCS for disks supporting the SCSI protocol. + * We set the controller type to DKC_SCSI_CCS only if the VD_OP_SCSICMD + * operation is supported, otherwise the controller type is DKC_DIRECT. + * Version 1.0 does not support the VD_OP_SCSICMD operation, so the + * controller type is always DKC_DIRECT in that case. + * * If the virtual disk is backed by a physical CD/DVD device or * an ISO image, modify the controller type to indicate this */ @@ -5466,7 +7031,10 @@ vdc_create_fake_geometry(vdc_t *vdc) vdc->cinfo->dki_ctype = DKC_CDROM; break; case VD_MEDIA_FIXED: - vdc->cinfo->dki_ctype = DKC_DIRECT; + if (VD_OP_SUPPORTED(vdc->operations, VD_OP_SCSICMD)) + vdc->cinfo->dki_ctype = DKC_SCSI_CCS; + else + vdc->cinfo->dki_ctype = DKC_DIRECT; break; default: /* in the case of v1.0 we default to a fixed disk */ @@ -5544,7 +7112,7 @@ vdc_validate_geometry(vdc_t *vdc) { buf_t *buf; /* BREAD requests need to be in a buf_t structure */ dev_t dev; - int rv; + int rv, rval; struct dk_label label; struct dk_geom geom; struct vtoc vtoc; @@ -5558,9 +7126,10 @@ vdc_validate_geometry(vdc_t *vdc) dev = makedevice(ddi_driver_major(vdc->dip), VD_MAKE_DEV(vdc->instance, 0)); - rv = vd_process_ioctl(dev, DKIOCGGEOM, (caddr_t)&geom, FKIOCTL); + rv = vd_process_ioctl(dev, DKIOCGGEOM, (caddr_t)&geom, FKIOCTL, &rval); if (rv == 0) - rv = 
vd_process_ioctl(dev, DKIOCGVTOC, (caddr_t)&vtoc, FKIOCTL); + rv = vd_process_ioctl(dev, DKIOCGVTOC, (caddr_t)&vtoc, + FKIOCTL, &rval); if (rv == ENOTSUP) { /* @@ -5779,7 +7348,7 @@ vdc_setup_devid(vdc_t *vdc) bufid_len = bufsize - sizeof (vd_efi_t) - 1; rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid, - bufsize, 0, 0, CB_SYNC, 0, VIO_both_dir); + bufsize, 0, 0, CB_SYNC, 0, VIO_both_dir, B_TRUE); DMSG(vdc, 2, "sync_op returned %d\n", rv); @@ -5801,7 +7370,7 @@ vdc_setup_devid(vdc_t *vdc) rv = vdc_do_sync_op(vdc, VD_OP_GET_DEVID, (caddr_t)vd_devid, bufsize, 0, 0, CB_SYNC, 0, - VIO_both_dir); + VIO_both_dir, B_TRUE); if (rv) { kmem_free(vd_devid, bufsize); diff --git a/usr/src/uts/sun4v/io/vds.c b/usr/src/uts/sun4v/io/vds.c index 9d424703e8..b864adbc5e 100644 --- a/usr/src/uts/sun4v/io/vds.c +++ b/usr/src/uts/sun4v/io/vds.c @@ -39,6 +39,7 @@ #include <sys/file.h> #include <sys/fs/hsfs_isospec.h> #include <sys/mdeg.h> +#include <sys/mhd.h> #include <sys/modhash.h> #include <sys/note.h> #include <sys/pathname.h> @@ -137,6 +138,10 @@ vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE, (caddr_t)labelp, \ 0, sizeof (struct dk_label)) +/* Message for disk access rights reset failure */ +#define VD_RESET_ACCESS_FAILURE_MSG \ + "Fail to reset disk access rights for disk %s" + /* * Specification of an MD node passed to the MDEG to filter any * 'vport' nodes that do not belong to the specified node. This @@ -347,6 +352,7 @@ typedef struct vd { size_t block_size; /* blk size of actual device */ boolean_t pseudo; /* underlying pseudo dev */ boolean_t file; /* is vDisk backed by a file? */ + boolean_t scsi; /* is vDisk backed by scsi? 
*/ vnode_t *file_vnode; /* file vnode */ size_t file_size; /* file size */ ddi_devid_t file_devid; /* devid for disk image */ @@ -354,6 +360,7 @@ typedef struct vd { struct dk_geom dk_geom; /* synthetic for slice type */ struct dk_minfo dk_minfo; /* synthetic for slice type */ struct vtoc vtoc; /* synthetic for slice type */ + boolean_t ownership; /* disk ownership status */ ldc_status_t ldc_state; /* LDC connection state */ ldc_handle_t ldc_handle; /* handle for LDC comm */ size_t max_msglen; /* largest LDC message len */ @@ -391,7 +398,7 @@ typedef struct vd_ioctl { const char *cmd_name; /* ioctl cmd name */ void *arg; /* ioctl cmd argument */ /* convert input vd_buf to output ioctl_arg */ - void (*copyin)(void *vd_buf, void *ioctl_arg); + int (*copyin)(void *vd_buf, size_t, void *ioctl_arg); /* convert input ioctl_arg to output vd_buf */ void (*copyout)(void *ioctl_arg, void *vd_buf); /* write is true if the operation writes any data to the backend */ @@ -399,7 +406,8 @@ typedef struct vd_ioctl { } vd_ioctl_t; /* Define trivial copyin/copyout conversion function flag */ -#define VD_IDENTITY ((void (*)(void *, void *))-1) +#define VD_IDENTITY_IN ((int (*)(void *, size_t, void *))-1) +#define VD_IDENTITY_OUT ((void (*)(void *, void *))-1) static int vds_ldc_retries = VDS_RETRIES; @@ -411,6 +419,31 @@ static void *vds_state; static uint_t vd_file_write_flags = VD_FILE_WRITE_FLAGS; static short vd_scsi_rdwr_timeout = VD_SCSI_RDWR_TIMEOUT; +static int vd_scsi_debug = USCSI_SILENT; + +/* + * Tunable to define the behavior of the service domain if the vdisk server + * fails to reset disk exclusive access when a LDC channel is reset. When a + * LDC channel is reset the vdisk server will try to reset disk exclusive + * access by releasing any SCSI-2 reservation or resetting the disk. If these + * actions fail then the default behavior (vd_reset_access_failure = 0) is to + * print a warning message. 
This default behavior can be changed by setting + * the vd_reset_access_failure variable to A_REBOOT (= 0x1) and that will + * cause the service domain to reboot, or A_DUMP (= 0x5) and that will cause + * the service domain to panic. In both cases, the reset of the service domain + * should trigger a reset SCSI buses and hopefully clear any SCSI-2 reservation. + */ +static int vd_reset_access_failure = 0; + +/* + * Tunable for backward compatibility. When this variable is set to B_TRUE, + * all disk volumes (ZFS, SVM, VxvM volumes) will be exported as single + * slice disks whether or not they have the "slice" option set. This is + * to provide a simple backward compatibility mechanism when upgrading + * the vds driver and using a domain configuration created before the + * "slice" option was available. + */ +static boolean_t vd_volume_force_slice = B_FALSE; /* * Supported protocol version pairs, from highest (newest) to lowest (oldest) @@ -426,11 +459,13 @@ static const size_t vds_num_versions = static void vd_free_dring_task(vd_t *vdp); static int vd_setup_vd(vd_t *vd); static int vd_setup_single_slice_disk(vd_t *vd); +static int vd_setup_mediainfo(vd_t *vd); static boolean_t vd_enabled(vd_t *vd); static ushort_t vd_lbl2cksum(struct dk_label *label); static int vd_file_validate_geometry(vd_t *vd); static boolean_t vd_file_is_iso_image(vd_t *vd); static void vd_set_exported_operations(vd_t *vd); +static void vd_reset_access(vd_t *vd); /* * Function: @@ -1109,6 +1144,15 @@ vd_scsi_rdwr(vd_t *vd, int operation, caddr_t data, size_t vblk, size_t vlen) size_t plen; /* length of data to be read from physical device */ char *buf; /* buffer area to fit physical device's block size */ + if (vd->block_size == 0) { + /* + * The block size was not available during the attach, + * try to update it now. 
+ */ + if (vd_setup_mediainfo(vd) != 0) + return (EIO); + } + /* * If the vdisk block size and the block size of the underlying device * match we can skip straight to vd_do_scsi_rdwr(), otherwise we need @@ -1419,6 +1463,9 @@ vd_reset_if_needed(vd_t *vd) if (vd->reset_ldc && ((status = ldc_down(vd->ldc_handle)) != 0)) PR0("ldc_down() returned errno %d", status); + /* Reset exclusive access rights */ + vd_reset_access(vd); + vd->initialized &= ~(VD_SID | VD_SEQ_NUM | VD_DRING); vd->state = VD_STATE_INIT; vd->max_msglen = sizeof (vio_msg_t); /* baseline vio message size */ @@ -1675,16 +1722,20 @@ vd_serial_notify(void *arg) vd_notify(task); } -static void -vd_geom2dk_geom(void *vd_buf, void *ioctl_arg) +/* ARGSUSED */ +static int +vd_geom2dk_geom(void *vd_buf, size_t vd_buf_len, void *ioctl_arg) { VD_GEOM2DK_GEOM((vd_geom_t *)vd_buf, (struct dk_geom *)ioctl_arg); + return (0); } -static void -vd_vtoc2vtoc(void *vd_buf, void *ioctl_arg) +/* ARGSUSED */ +static int +vd_vtoc2vtoc(void *vd_buf, size_t vd_buf_len, void *ioctl_arg) { VD_VTOC2VTOC((vd_vtoc_t *)vd_buf, (struct vtoc *)ioctl_arg); + return (0); } static void @@ -1699,15 +1750,21 @@ vtoc2vd_vtoc(void *ioctl_arg, void *vd_buf) VTOC2VD_VTOC((struct vtoc *)ioctl_arg, (vd_vtoc_t *)vd_buf); } -static void -vd_get_efi_in(void *vd_buf, void *ioctl_arg) +static int +vd_get_efi_in(void *vd_buf, size_t vd_buf_len, void *ioctl_arg) { vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; + size_t data_len; + + data_len = vd_buf_len - (sizeof (vd_efi_t) - sizeof (uint64_t)); + if (vd_efi->length > data_len) + return (EINVAL); dk_efi->dki_lba = vd_efi->lba; dk_efi->dki_length = vd_efi->length; dk_efi->dki_data = kmem_zalloc(vd_efi->length, KM_SLEEP); + return (0); } static void @@ -1722,14 +1779,20 @@ vd_get_efi_out(void *ioctl_arg, void *vd_buf) kmem_free(dk_efi->dki_data, len); } -static void -vd_set_efi_in(void *vd_buf, void *ioctl_arg) +static int +vd_set_efi_in(void *vd_buf, size_t vd_buf_len, 
void *ioctl_arg) { vd_efi_t *vd_efi = (vd_efi_t *)vd_buf; dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg; + size_t data_len; + + data_len = vd_buf_len - (sizeof (vd_efi_t) - sizeof (uint64_t)); + if (vd_efi->length > data_len) + return (EINVAL); dk_efi->dki_data = kmem_alloc(vd_efi->length, KM_SLEEP); VD_EFI2DK_EFI(vd_efi, dk_efi); + return (0); } static void @@ -1741,6 +1804,123 @@ vd_set_efi_out(void *ioctl_arg, void *vd_buf) kmem_free(dk_efi->dki_data, vd_efi->length); } +static int +vd_scsicmd_in(void *vd_buf, size_t vd_buf_len, void *ioctl_arg) +{ + size_t vd_scsi_len; + vd_scsi_t *vd_scsi = (vd_scsi_t *)vd_buf; + struct uscsi_cmd *uscsi = (struct uscsi_cmd *)ioctl_arg; + + /* check buffer size */ + vd_scsi_len = VD_SCSI_SIZE; + vd_scsi_len += P2ROUNDUP(vd_scsi->cdb_len, sizeof (uint64_t)); + vd_scsi_len += P2ROUNDUP(vd_scsi->sense_len, sizeof (uint64_t)); + vd_scsi_len += P2ROUNDUP(vd_scsi->datain_len, sizeof (uint64_t)); + vd_scsi_len += P2ROUNDUP(vd_scsi->dataout_len, sizeof (uint64_t)); + + ASSERT(vd_scsi_len % sizeof (uint64_t) == 0); + + if (vd_buf_len < vd_scsi_len) + return (EINVAL); + + /* set flags */ + uscsi->uscsi_flags = vd_scsi_debug; + + if (vd_scsi->options & VD_SCSI_OPT_NORETRY) { + uscsi->uscsi_flags |= USCSI_ISOLATE; + uscsi->uscsi_flags |= USCSI_DIAGNOSE; + } + + /* task attribute */ + switch (vd_scsi->task_attribute) { + case VD_SCSI_TASK_ACA: + uscsi->uscsi_flags |= USCSI_HEAD; + break; + case VD_SCSI_TASK_HQUEUE: + uscsi->uscsi_flags |= USCSI_HTAG; + break; + case VD_SCSI_TASK_ORDERED: + uscsi->uscsi_flags |= USCSI_OTAG; + break; + default: + uscsi->uscsi_flags |= USCSI_NOTAG; + break; + } + + /* timeout */ + uscsi->uscsi_timeout = vd_scsi->timeout; + + /* cdb data */ + uscsi->uscsi_cdb = (caddr_t)VD_SCSI_DATA_CDB(vd_scsi); + uscsi->uscsi_cdblen = vd_scsi->cdb_len; + + /* sense buffer */ + if (vd_scsi->sense_len != 0) { + uscsi->uscsi_flags |= USCSI_RQENABLE; + uscsi->uscsi_rqbuf = (caddr_t)VD_SCSI_DATA_SENSE(vd_scsi); + uscsi->uscsi_rqlen = 
vd_scsi->sense_len; + } + + if (vd_scsi->datain_len != 0 && vd_scsi->dataout_len != 0) { + /* uscsi does not support read/write request */ + return (EINVAL); + } + + /* request data-in */ + if (vd_scsi->datain_len != 0) { + uscsi->uscsi_flags |= USCSI_READ; + uscsi->uscsi_buflen = vd_scsi->datain_len; + uscsi->uscsi_bufaddr = (char *)VD_SCSI_DATA_IN(vd_scsi); + } + + /* request data-out */ + if (vd_scsi->dataout_len != 0) { + uscsi->uscsi_buflen = vd_scsi->dataout_len; + uscsi->uscsi_bufaddr = (char *)VD_SCSI_DATA_OUT(vd_scsi); + } + + return (0); +} + +static void +vd_scsicmd_out(void *ioctl_arg, void *vd_buf) +{ + vd_scsi_t *vd_scsi = (vd_scsi_t *)vd_buf; + struct uscsi_cmd *uscsi = (struct uscsi_cmd *)ioctl_arg; + + /* output fields */ + vd_scsi->cmd_status = uscsi->uscsi_status; + + /* sense data */ + if ((uscsi->uscsi_flags & USCSI_RQENABLE) && + (uscsi->uscsi_status == STATUS_CHECK || + uscsi->uscsi_status == STATUS_TERMINATED)) { + vd_scsi->sense_status = uscsi->uscsi_rqstatus; + if (uscsi->uscsi_rqstatus == STATUS_GOOD) + vd_scsi->sense_len -= uscsi->uscsi_resid; + else + vd_scsi->sense_len = 0; + } else { + vd_scsi->sense_len = 0; + } + + if (uscsi->uscsi_status != STATUS_GOOD) { + vd_scsi->dataout_len = 0; + vd_scsi->datain_len = 0; + return; + } + + if (uscsi->uscsi_flags & USCSI_READ) { + /* request data (read) */ + vd_scsi->datain_len -= uscsi->uscsi_resid; + vd_scsi->dataout_len = 0; + } else { + /* request data (write) */ + vd_scsi->datain_len = 0; + vd_scsi->dataout_len -= uscsi->uscsi_resid; + } +} + static vd_disk_label_t vd_read_vtoc(vd_t *vd, struct vtoc *vtoc) { @@ -2143,10 +2323,30 @@ vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, vd_ioctl_t *ioctl) } /* Convert client's data, if necessary */ - if (ioctl->copyin == VD_IDENTITY) /* use client buffer */ + if (ioctl->copyin == VD_IDENTITY_IN) { + /* use client buffer */ ioctl->arg = buf; - else /* convert client vdisk operation data to ioctl data */ - (ioctl->copyin)(buf, (void 
*)ioctl->arg); + } else { + /* convert client vdisk operation data to ioctl data */ + status = (ioctl->copyin)(buf, nbytes, + (void *)ioctl->arg); + if (status != 0) { + request->status = status; + return (0); + } + } + } + + if (ioctl->operation == VD_OP_SCSICMD) { + struct uscsi_cmd *uscsi = (struct uscsi_cmd *)ioctl->arg; + + /* check write permission */ + if (!(vd->open_flags & FWRITE) && + !(uscsi->uscsi_flags & USCSI_READ)) { + PR0("uscsi fails because backend is opened read-only"); + request->status = EROFS; + return (0); + } } /* @@ -2176,7 +2376,19 @@ vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, vd_ioctl_t *ioctl) if (request->status != 0) { PR0("ioctl(%s) = errno %d", ioctl->cmd_name, request->status); - return (0); + if (ioctl->operation == VD_OP_SCSICMD && + ((struct uscsi_cmd *)ioctl->arg)->uscsi_status != 0) + /* + * USCSICMD has reported an error and the uscsi_status + * field is not zero. This means that the SCSI command + * has completed but it has an error. So we should + * mark the VD operation has succesfully completed + * and clients can check the SCSI status field for + * SCSI errors. 
+ */ + request->status = 0; + else + return (0); } /* Convert data and send to client, if necessary */ @@ -2185,7 +2397,7 @@ vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, vd_ioctl_t *ioctl) PR1("Sending \"arg\" data to client"); /* Convert ioctl data to vdisk operation data, if necessary */ - if (ioctl->copyout != VD_IDENTITY) + if (ioctl->copyout != VD_IDENTITY_OUT) (ioctl->copyout)((void *)ioctl->arg, buf); if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes, @@ -2239,6 +2451,7 @@ vd_ioctl(vd_task_t *task) struct dk_geom dk_geom = {0}; struct vtoc vtoc = {0}; struct dk_efi dk_efi = {0}; + struct uscsi_cmd uscsi = {0}; vd_t *vd = task->vd; vd_dring_payload_t *request = task->request; vd_ioctl_t ioctl[] = { @@ -2250,7 +2463,7 @@ vd_ioctl(vd_task_t *task) /* "Get" (copy-out) operations */ {VD_OP_GET_WCE, STRINGIZE(VD_OP_GET_WCE), RNDSIZE(int), DKIOCGETWCE, STRINGIZE(DKIOCGETWCE), - NULL, VD_IDENTITY, VD_IDENTITY, B_FALSE}, + NULL, VD_IDENTITY_IN, VD_IDENTITY_OUT, B_FALSE}, {VD_OP_GET_DISKGEOM, STRINGIZE(VD_OP_GET_DISKGEOM), RNDSIZE(vd_geom_t), DKIOCGGEOM, STRINGIZE(DKIOCGGEOM), @@ -2265,7 +2478,7 @@ vd_ioctl(vd_task_t *task) /* "Set" (copy-in) operations */ {VD_OP_SET_WCE, STRINGIZE(VD_OP_SET_WCE), RNDSIZE(int), DKIOCSETWCE, STRINGIZE(DKIOCSETWCE), - NULL, VD_IDENTITY, VD_IDENTITY, B_TRUE}, + NULL, VD_IDENTITY_IN, VD_IDENTITY_OUT, B_TRUE}, {VD_OP_SET_DISKGEOM, STRINGIZE(VD_OP_SET_DISKGEOM), RNDSIZE(vd_geom_t), DKIOCSGEOM, STRINGIZE(DKIOCSGEOM), @@ -2276,6 +2489,10 @@ vd_ioctl(vd_task_t *task) {VD_OP_SET_EFI, STRINGIZE(VD_OP_SET_EFI), RNDSIZE(vd_efi_t), DKIOCSETEFI, STRINGIZE(DKIOCSETEFI), &dk_efi, vd_set_efi_in, vd_set_efi_out, B_TRUE}, + + {VD_OP_SCSICMD, STRINGIZE(VD_OP_SCSICMD), RNDSIZE(vd_scsi_t), + USCSICMD, STRINGIZE(USCSICMD), + &uscsi, vd_scsicmd_in, vd_scsicmd_out, B_FALSE}, }; size_t nioctls = (sizeof (ioctl))/(sizeof (ioctl[0])); @@ -2294,7 +2511,8 @@ vd_ioctl(vd_task_t *task) ASSERT(ioctl[i].nbytes % sizeof (uint64_t) == 0); if 
(request->operation == VD_OP_GET_EFI || - request->operation == VD_OP_SET_EFI) { + request->operation == VD_OP_SET_EFI || + request->operation == VD_OP_SCSICMD) { if (request->nbytes >= ioctl[i].nbytes) break; PR0("%s: Expected at least nbytes = %lu, " @@ -2399,6 +2617,308 @@ vd_get_devid(vd_task_t *task) return (status); } +static int +vd_scsi_reset(vd_t *vd) +{ + int rval, status; + struct uscsi_cmd uscsi = { 0 }; + + uscsi.uscsi_flags = vd_scsi_debug | USCSI_RESET; + uscsi.uscsi_timeout = vd_scsi_rdwr_timeout; + + status = ldi_ioctl(vd->ldi_handle[0], USCSICMD, (intptr_t)&uscsi, + (vd->open_flags | FKIOCTL), kcred, &rval); + + return (status); +} + +static int +vd_reset(vd_task_t *task) +{ + vd_t *vd = task->vd; + vd_dring_payload_t *request = task->request; + + ASSERT(request->operation == VD_OP_RESET); + ASSERT(vd->scsi); + + PR0("Performing VD_OP_RESET"); + + if (request->nbytes != 0) { + PR0("VD_OP_RESET: Expected nbytes = 0, got %lu", + request->nbytes); + return (EINVAL); + } + + request->status = vd_scsi_reset(vd); + + return (0); +} + +static int +vd_get_capacity(vd_task_t *task) +{ + int rv; + size_t nbytes; + vd_t *vd = task->vd; + vd_dring_payload_t *request = task->request; + vd_capacity_t vd_cap = { 0 }; + + ASSERT(request->operation == VD_OP_GET_CAPACITY); + ASSERT(vd->scsi); + + PR0("Performing VD_OP_GET_CAPACITY"); + + nbytes = request->nbytes; + + if (nbytes != RNDSIZE(vd_capacity_t)) { + PR0("VD_OP_GET_CAPACITY: Expected nbytes = %lu, got %lu", + RNDSIZE(vd_capacity_t), nbytes); + return (EINVAL); + } + + if (vd->vdisk_size == VD_SIZE_UNKNOWN) { + if (vd_setup_mediainfo(vd) != 0) + ASSERT(vd->vdisk_size == VD_SIZE_UNKNOWN); + } + + ASSERT(vd->vdisk_size != 0); + + request->status = 0; + + vd_cap.vdisk_block_size = vd->vdisk_block_size; + vd_cap.vdisk_size = vd->vdisk_size; + + if ((rv = ldc_mem_copy(vd->ldc_handle, (char *)&vd_cap, 0, &nbytes, + request->cookie, request->ncookies, LDC_COPY_OUT)) != 0) { + PR0("ldc_mem_copy() returned errno %d 
copying to client", rv); + return (rv); + } + + return (0); +} + +static int +vd_get_access(vd_task_t *task) +{ + uint64_t access; + int rv, rval = 0; + size_t nbytes; + vd_t *vd = task->vd; + vd_dring_payload_t *request = task->request; + + ASSERT(request->operation == VD_OP_GET_ACCESS); + ASSERT(vd->scsi); + + PR0("Performing VD_OP_GET_ACCESS"); + + nbytes = request->nbytes; + + if (nbytes != sizeof (uint64_t)) { + PR0("VD_OP_GET_ACCESS: Expected nbytes = %lu, got %lu", + sizeof (uint64_t), nbytes); + return (EINVAL); + } + + request->status = ldi_ioctl(vd->ldi_handle[request->slice], MHIOCSTATUS, + NULL, (vd->open_flags | FKIOCTL), kcred, &rval); + + if (request->status != 0) + return (0); + + access = (rval == 0)? VD_ACCESS_ALLOWED : VD_ACCESS_DENIED; + + if ((rv = ldc_mem_copy(vd->ldc_handle, (char *)&access, 0, &nbytes, + request->cookie, request->ncookies, LDC_COPY_OUT)) != 0) { + PR0("ldc_mem_copy() returned errno %d copying to client", rv); + return (rv); + } + + return (0); +} + +static int +vd_set_access(vd_task_t *task) +{ + uint64_t flags; + int rv, rval; + size_t nbytes; + vd_t *vd = task->vd; + vd_dring_payload_t *request = task->request; + + ASSERT(request->operation == VD_OP_SET_ACCESS); + ASSERT(vd->scsi); + + nbytes = request->nbytes; + + if (nbytes != sizeof (uint64_t)) { + PR0("VD_OP_SET_ACCESS: Expected nbytes = %lu, got %lu", + sizeof (uint64_t), nbytes); + return (EINVAL); + } + + if ((rv = ldc_mem_copy(vd->ldc_handle, (char *)&flags, 0, &nbytes, + request->cookie, request->ncookies, LDC_COPY_IN)) != 0) { + PR0("ldc_mem_copy() returned errno %d copying from client", rv); + return (rv); + } + + if (flags == VD_ACCESS_SET_CLEAR) { + PR0("Performing VD_OP_SET_ACCESS (CLEAR)"); + request->status = ldi_ioctl(vd->ldi_handle[request->slice], + MHIOCRELEASE, NULL, (vd->open_flags | FKIOCTL), kcred, + &rval); + if (request->status == 0) + vd->ownership = B_FALSE; + return (0); + } + + /* + * As per the VIO spec, the PREEMPT and PRESERVE flags are 
only valid + * when the EXCLUSIVE flag is set. + */ + if (!(flags & VD_ACCESS_SET_EXCLUSIVE)) { + PR0("Invalid VD_OP_SET_ACCESS flags: 0x%lx", flags); + request->status = EINVAL; + return (0); + } + + switch (flags & (VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE)) { + + case VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE: + /* + * Flags EXCLUSIVE and PREEMPT and PRESERVE. We have to + * acquire exclusive access rights, preserve them and we + * can use preemption. So we can use the MHIOCTKNOWN ioctl. + */ + PR0("Performing VD_OP_SET_ACCESS (EXCLUSIVE|PREEMPT|PRESERVE)"); + request->status = ldi_ioctl(vd->ldi_handle[request->slice], + MHIOCTKOWN, NULL, (vd->open_flags | FKIOCTL), kcred, &rval); + break; + + case VD_ACCESS_SET_PRESERVE: + /* + * Flags EXCLUSIVE and PRESERVE. We have to acquire exclusive + * access rights and preserve them, but not preempt any other + * host. So we need to use the MHIOCTKOWN ioctl to enable the + * "preserve" feature but we can not called it directly + * because it uses preemption. So before that, we use the + * MHIOCQRESERVE ioctl to ensure we can get exclusive rights + * without preempting anyone. + */ + PR0("Performing VD_OP_SET_ACCESS (EXCLUSIVE|PRESERVE)"); + request->status = ldi_ioctl(vd->ldi_handle[request->slice], + MHIOCQRESERVE, NULL, (vd->open_flags | FKIOCTL), kcred, + &rval); + if (request->status != 0) + break; + request->status = ldi_ioctl(vd->ldi_handle[request->slice], + MHIOCTKOWN, NULL, (vd->open_flags | FKIOCTL), kcred, &rval); + break; + + case VD_ACCESS_SET_PREEMPT: + /* + * Flags EXCLUSIVE and PREEMPT. We have to acquire exclusive + * access rights and we can use preemption. So we try to do + * a SCSI reservation, if it fails we reset the disk to clear + * any reservation and we try to reserve again. 
+ */ + PR0("Performing VD_OP_SET_ACCESS (EXCLUSIVE|PREEMPT)"); + request->status = ldi_ioctl(vd->ldi_handle[request->slice], + MHIOCQRESERVE, NULL, (vd->open_flags | FKIOCTL), kcred, + &rval); + if (request->status == 0) + break; + + /* reset the disk */ + (void) vd_scsi_reset(vd); + + /* try again even if the reset has failed */ + request->status = ldi_ioctl(vd->ldi_handle[request->slice], + MHIOCQRESERVE, NULL, (vd->open_flags | FKIOCTL), kcred, + &rval); + break; + + case 0: + /* Flag EXCLUSIVE only. Just issue a SCSI reservation */ + PR0("Performing VD_OP_SET_ACCESS (EXCLUSIVE)"); + request->status = ldi_ioctl(vd->ldi_handle[request->slice], + MHIOCQRESERVE, NULL, (vd->open_flags | FKIOCTL), kcred, + &rval); + break; + } + + if (request->status == 0) + vd->ownership = B_TRUE; + else + PR0("VD_OP_SET_ACCESS: error %d", request->status); + + return (0); +} + +static void +vd_reset_access(vd_t *vd) +{ + int status, rval; + + if (vd->file || !vd->ownership) + return; + + PR0("Releasing disk ownership"); + status = ldi_ioctl(vd->ldi_handle[0], MHIOCRELEASE, NULL, + (vd->open_flags | FKIOCTL), kcred, &rval); + + /* + * An EACCES failure means that there is a reservation conflict, + * so we are not the owner of the disk anymore. + */ + if (status == 0 || status == EACCES) { + vd->ownership = B_FALSE; + return; + } + + PR0("Fail to release ownership, error %d", status); + + /* + * We have failed to release the ownership, try to reset the disk + * to release reservations. 
+ */ + PR0("Resetting disk"); + status = vd_scsi_reset(vd); + + if (status != 0) + PR0("Fail to reset disk, error %d", status); + + /* whatever the result of the reset is, we try the release again */ + status = ldi_ioctl(vd->ldi_handle[0], MHIOCRELEASE, NULL, + (vd->open_flags | FKIOCTL), kcred, &rval); + + if (status == 0 || status == EACCES) { + vd->ownership = B_FALSE; + return; + } + + PR0("Fail to release ownership, error %d", status); + + /* + * At this point we have done our best to try to reset the + * access rights to the disk and we don't know if we still + * own a reservation and if any mechanism to preserve the + * ownership is still in place. The ultimate solution would + * be to reset the system but this is usually not what we + * want to happen. + */ + + if (vd_reset_access_failure == A_REBOOT) { + cmn_err(CE_WARN, VD_RESET_ACCESS_FAILURE_MSG + ", rebooting the system", vd->device_path); + (void) uadmin(A_SHUTDOWN, AD_BOOT, NULL); + } else if (vd_reset_access_failure == A_DUMP) { + panic(VD_RESET_ACCESS_FAILURE_MSG, vd->device_path); + } + + cmn_err(CE_WARN, VD_RESET_ACCESS_FAILURE_MSG, vd->device_path); +} + /* * Define the supported operations once the functions for performing them have * been defined @@ -2417,6 +2937,11 @@ static const vds_operation_t vds_operation[] = { {X(VD_OP_GET_EFI), vd_ioctl, NULL}, {X(VD_OP_SET_EFI), vd_ioctl, NULL}, {X(VD_OP_GET_DEVID), vd_get_devid, NULL}, + {X(VD_OP_SCSICMD), vd_ioctl, NULL}, + {X(VD_OP_RESET), vd_reset, NULL}, + {X(VD_OP_GET_CAPACITY), vd_get_capacity, NULL}, + {X(VD_OP_SET_ACCESS), vd_set_access, NULL}, + {X(VD_OP_GET_ACCESS), vd_get_access, NULL}, #undef X }; @@ -2702,6 +3227,9 @@ vd_set_exported_operations(vd_t *vd) if (vd->open_flags & FWRITE) vd->operations |= VD_OP_MASK_WRITE; + if (vd->scsi) + vd->operations |= VD_OP_MASK_SCSI; + if (vd->file && vd_file_is_iso_image(vd)) { /* * can't write to ISO images, make sure that write @@ -2823,8 +3351,7 @@ vd_process_attr_msg(vd_t *vd, vio_msg_t *msg, 
size_t msglen) } /* Return the device's block size and max transfer size to the client */ - attr_msg->vdisk_block_size = DEV_BSIZE; - attr_msg->vdisk_block_size = vd->block_size; + attr_msg->vdisk_block_size = vd->vdisk_block_size; attr_msg->max_xfer_sz = vd->max_xfer_sz; attr_msg->vdisk_size = vd->vdisk_size; @@ -3806,32 +4333,65 @@ vd_is_atapi_device(vd_t *vd) } static int +vd_setup_mediainfo(vd_t *vd) +{ + int status, rval; + struct dk_minfo dk_minfo; + + ASSERT(vd->ldi_handle[0] != NULL); + ASSERT(vd->vdisk_block_size != 0); + + if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGMEDIAINFO, + (intptr_t)&dk_minfo, (vd->open_flags | FKIOCTL), + kcred, &rval)) != 0) + return (status); + + ASSERT(dk_minfo.dki_lbsize % vd->vdisk_block_size == 0); + + vd->block_size = dk_minfo.dki_lbsize; + vd->vdisk_size = (dk_minfo.dki_capacity * dk_minfo.dki_lbsize) / + vd->vdisk_block_size; + vd->vdisk_media = DK_MEDIATYPE2VD_MEDIATYPE(dk_minfo.dki_media_type); + return (0); +} + +static int vd_setup_full_disk(vd_t *vd) { - int rval, status; + int status; major_t major = getmajor(vd->dev[0]); minor_t minor = getminor(vd->dev[0]) - VD_ENTIRE_DISK_SLICE; - struct dk_minfo dk_minfo; ASSERT(vd->vdisk_type == VD_DISK_TYPE_DISK); + vd->vdisk_block_size = DEV_BSIZE; + /* * At this point, vdisk_size is set to the size of partition 2 but * this does not represent the size of the disk because partition 2 * may not cover the entire disk and its size does not include reserved - * blocks. So we update vdisk_size to be the size of the entire disk. + * blocks. So we call vd_get_mediainfo to udpate this information and + * set the block size and the media type of the disk. 
*/ - if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGMEDIAINFO, - (intptr_t)&dk_minfo, (vd->open_flags | FKIOCTL), - kcred, &rval)) != 0) { - PRN("ldi_ioctl(DKIOCGMEDIAINFO) returned errno %d", - status); - return (status); + status = vd_setup_mediainfo(vd); + + if (status != 0) { + if (!vd->scsi) { + /* unexpected failure */ + PRN("ldi_ioctl(DKIOCGMEDIAINFO) returned errno %d", + status); + return (status); + } + + /* + * The function can fail for SCSI disks which are present but + * reserved by another system. In that case, we don't know the + * size of the disk and the block size. + */ + vd->vdisk_size = VD_SIZE_UNKNOWN; + vd->block_size = 0; + vd->vdisk_media = VD_MEDIA_FIXED; } - vd->vdisk_size = dk_minfo.dki_capacity; - vd->block_size = dk_minfo.dki_lbsize; - vd->vdisk_media = DK_MEDIATYPE2VD_MEDIATYPE(dk_minfo.dki_media_type); - vd->vdisk_block_size = DEV_BSIZE; /* Move dev number and LDI handle to entire-disk-slice array elements */ vd->dev[VD_ENTIRE_DISK_SLICE] = vd->dev[0]; @@ -4337,6 +4897,8 @@ vd_setup_backend_ldi(vd_t *vd) vd->vdisk_type == VD_DISK_TYPE_DISK) || dk_cinfo.dki_ctype == DKC_CDROM) { ASSERT(!vd->pseudo); + if (dk_cinfo.dki_ctype == DKC_SCSI_CCS) + vd->scsi = B_TRUE; return (vd_setup_full_disk(vd)); } @@ -4349,8 +4911,6 @@ vd_setup_backend_ldi(vd_t *vd) * If it is disk slice 2 or a pseudo device then it is exported as a * single slice disk only if the "slice" option is specified. */ - ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE || - dk_cinfo.dki_partition == VD_ENTIRE_DISK_SLICE); return (vd_setup_single_slice_disk(vd)); } @@ -4472,13 +5032,26 @@ vd_setup_vd(vd_t *vd) ddi_release_devi(dip); VN_RELE(vnp); + if (!vd->pseudo) { + status = vd_setup_backend_ldi(vd); + break; + } + /* * If this is a pseudo device then its usage depends if the * "slice" option is set or not. If the "slice" option is set * then the pseudo device will be exported as a single slice, * otherwise it will be exported as a full disk. 
+ * + * For backward compatibility, if vd_volume_force_slice is set + * then we always export pseudo devices as slices. */ - if (vd->pseudo && vd->vdisk_type == VD_DISK_TYPE_DISK) + if (vd_volume_force_slice) { + vd->vdisk_type = VD_DISK_TYPE_SLICE; + vd->nslices = 1; + } + + if (vd->vdisk_type == VD_DISK_TYPE_DISK) status = vd_setup_backend_vnode(vd); else status = vd_setup_backend_ldi(vd); diff --git a/usr/src/uts/sun4v/sys/vdc.h b/usr/src/uts/sun4v/sys/vdc.h index 03dd59a208..715a25fd04 100644 --- a/usr/src/uts/sun4v/sys/vdc.h +++ b/usr/src/uts/sun4v/sys/vdc.h @@ -231,6 +231,15 @@ typedef struct vdc_local_desc { } vdc_local_desc_t; /* + * I/O queue used by failfast + */ +typedef struct vdc_io { + struct vdc_io *vio_next; /* next pending I/O in the queue */ + struct buf *vio_buf; /* buf for CB_STRATEGY I/O */ + clock_t vio_qtime; /* time the I/O was queued */ +} vdc_io_t; + +/* * vdc soft state structure */ typedef struct vdc { @@ -291,6 +300,25 @@ typedef struct vdc { uint64_t ctimeout; /* connection timeout in seconds */ boolean_t ctimeout_reached; /* connection timeout has expired */ + /* + * The ownership fields are protected by the lock mutex. The + * ownership_lock mutex is used to serialize ownership operations; + * it should be acquired before the lock mutex. + */ + kmutex_t ownership_lock; /* serialize ownership ops */ + int ownership; /* ownership status flags */ + kthread_t *ownership_thread; /* ownership thread */ + kcondvar_t ownership_cv; /* cv for ownership update */ + + /* + * The failfast fields are protected by the lock mutex. 
+ */ + kthread_t *failfast_thread; /* failfast thread */ + clock_t failfast_interval; /* interval in microsecs */ + kcondvar_t failfast_cv; /* cv for failfast update */ + kcondvar_t failfast_io_cv; /* cv wait for I/O to finish */ + vdc_io_t *failfast_io_queue; /* failfast io queue */ + ldc_mem_info_t dring_mem_info; /* dring information */ uint_t dring_curr_idx; /* current index */ uint32_t dring_len; /* dring length */ @@ -314,6 +342,20 @@ typedef struct vdc { } vdc_t; /* + * Ownership status flags + */ +#define VDC_OWNERSHIP_NONE 0x00 /* no ownership wanted */ +#define VDC_OWNERSHIP_WANTED 0x01 /* ownership is wanted */ +#define VDC_OWNERSHIP_GRANTED 0x02 /* ownership has been granted */ +#define VDC_OWNERSHIP_RESET 0x04 /* ownership has been reset */ + +/* + * Reservation conflict panic message + */ +#define VDC_RESV_CONFLICT_FMT_STR "Reservation Conflict\nDisk: " +#define VDC_RESV_CONFLICT_FMT_LEN (sizeof (VDC_RESV_CONFLICT_FMT_STR)) + +/* * Debugging macros */ #ifdef DEBUG diff --git a/usr/src/uts/sun4v/sys/vdsk_common.h b/usr/src/uts/sun4v/sys/vdsk_common.h index 4e5bff94c3..e5909a4787 100644 --- a/usr/src/uts/sun4v/sys/vdsk_common.h +++ b/usr/src/uts/sun4v/sys/vdsk_common.h @@ -104,8 +104,26 @@ extern "C" { #define VD_OP_GET_DEVID 0x0b /* Get device id */ #define VD_OP_GET_EFI 0x0c /* Get EFI */ #define VD_OP_SET_EFI 0x0d /* Set EFI */ +#define VD_OP_RESET 0x0e /* Reset disk */ +#define VD_OP_GET_ACCESS 0x0f /* Get disk access */ +#define VD_OP_SET_ACCESS 0x10 /* Set disk access */ +#define VD_OP_GET_CAPACITY 0x11 /* Get disk capacity */ #define VD_OP_MASK 0xFF /* mask of all possible operations */ -#define VD_OP_COUNT 13 /* Number of operations */ +#define VD_OP_COUNT 0x11 /* Number of operations */ + +/* + * Status for the VD_OP_GET_ACCESS operation + */ +#define VD_ACCESS_DENIED 0x00 /* access is not allowed */ +#define VD_ACCESS_ALLOWED 0x01 /* access is allowed */ + +/* + * Flags for the VD_OP_SET_ACCESS operation + */ +#define VD_ACCESS_SET_CLEAR 0x00 
/* clear exclusive access rights */ +#define VD_ACCESS_SET_EXCLUSIVE 0x01 /* set exclusive access rights */ +#define VD_ACCESS_SET_PREEMPT 0x02 /* forcefully set access rights */ +#define VD_ACCESS_SET_PRESERVE 0x04 /* preserve access rights */ /* * This is a mask of all the basic operations supported by all @@ -127,6 +145,15 @@ extern "C" { (1 << VD_OP_SET_DISKGEOM) | \ (1 << VD_OP_SET_EFI)) +/* + * Mask for additional operations provided for SCSI disks (v1.1) + */ +#define VD_OP_MASK_SCSI \ + ((1 << VD_OP_SCSICMD) | \ + (1 << VD_OP_RESET) | \ + (1 << VD_OP_GET_ACCESS) | \ + (1 << VD_OP_SET_ACCESS) | \ + (1 << VD_OP_GET_CAPACITY)) /* * macro to check if the operation 'op' is supported by checking the list @@ -261,6 +288,73 @@ typedef struct vd_devid { } vd_devid_t; /* + * vDisk CAPACITY definition (VD_OP_GET_CAPACITY) + */ +typedef struct vd_capacity { + uint32_t vdisk_block_size; /* block size in bytes */ + uint32_t reserved; /* reserved */ + uint64_t vdisk_size; /* disk size in blocks */ +} vd_capacity_t; + +/* Identifier for unknown disk size */ +#define VD_SIZE_UNKNOWN -1 + +/* + * vDisk SCSI definition (VD_OP_SCSICMD) + */ +typedef struct vd_scsi { + uint8_t cmd_status; /* command completion status */ + uint8_t sense_status; /* sense command completion status */ + uint8_t task_attribute; /* task attribute */ + uint8_t task_priority; /* task priority */ + uint8_t crn; /* command reference number */ + uint8_t reserved; /* reserved */ + uint16_t timeout; /* command timeout */ + uint64_t options; /* options */ + uint64_t cdb_len; /* CDB data length */ + uint64_t sense_len; /* sense request length */ + uint64_t datain_len; /* data in buffer length */ + uint64_t dataout_len; /* data out buffer length */ + char data[1]; /* data (CDB, sense, data in/out */ +} vd_scsi_t; + +/* Minimum size of the vd_scsi structure */ +#define VD_SCSI_SIZE (sizeof (vd_scsi_t) - sizeof (uint64_t)) + +/* + * Macros to access data buffers in a vd_scsi structure. 
When using these + * macros, the vd_scsi structure needs to be populated with the sizes of + * data buffers allocated in the structure. + */ +#define VD_SCSI_DATA_CDB(vscsi) \ + ((union scsi_cdb *)(uintptr_t)((vscsi)->data)) + +#define VD_SCSI_DATA_SENSE(vscsi) \ + ((struct scsi_extended_sense *)(uintptr_t)((vscsi)->data + \ + P2ROUNDUP((vscsi)->cdb_len, sizeof (uint64_t)))) + +#define VD_SCSI_DATA_IN(vscsi) \ + ((uintptr_t)((vscsi)->data + \ + P2ROUNDUP((vscsi)->cdb_len, sizeof (uint64_t)) + \ + P2ROUNDUP((vscsi)->sense_len, sizeof (uint64_t)))) + +#define VD_SCSI_DATA_OUT(vscsi) \ + ((uintptr_t)((vscsi)->data + \ + P2ROUNDUP((vscsi)->cdb_len, sizeof (uint64_t)) + \ + P2ROUNDUP((vscsi)->sense_len, sizeof (uint64_t)) + \ + P2ROUNDUP((vscsi)->datain_len, sizeof (uint64_t)))) + +/* vDisk SCSI task attribute */ +#define VD_SCSI_TASK_SIMPLE 0x01 /* simple task */ +#define VD_SCSI_TASK_ORDERED 0x02 /* ordered task */ +#define VD_SCSI_TASK_HQUEUE 0x03 /* head of queue task */ +#define VD_SCSI_TASK_ACA 0x04 /* ACA task */ + +/* vDisk SCSI options */ +#define VD_SCSI_OPT_CRN 0x01 /* request has a CRN */ +#define VD_SCSI_OPT_NORETRY 0x02 /* do not attempt any retry */ + +/* * Copy the contents of a vd_geom_t to the contents of a dk_geom struct */ #define VD_GEOM2DK_GEOM(vd_geom, dk_geom) \ diff --git a/usr/src/uts/sun4v/vdc/Makefile b/usr/src/uts/sun4v/vdc/Makefile index c2db8529e8..21dd386c1e 100644 --- a/usr/src/uts/sun4v/vdc/Makefile +++ b/usr/src/uts/sun4v/vdc/Makefile @@ -70,7 +70,7 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) CFLAGS += $(CCVERBOSE) CFLAGS += -errwarn=%all -LDFLAGS += -dy -Nmisc/ldc -Nmisc/platsvc +LDFLAGS += -dy -Nmisc/ldc -Nmisc/platsvc -Nmisc/scsi # # Default build targets. |
