diff options
Diffstat (limited to 'usr/src/uts/sun4v/io/vdc.c')
-rw-r--r-- | usr/src/uts/sun4v/io/vdc.c | 444 |
1 files changed, 248 insertions, 196 deletions
diff --git a/usr/src/uts/sun4v/io/vdc.c b/usr/src/uts/sun4v/io/vdc.c index 6502c8394a..a04c57e32d 100644 --- a/usr/src/uts/sun4v/io/vdc.c +++ b/usr/src/uts/sun4v/io/vdc.c @@ -69,6 +69,7 @@ #include <sys/mdeg.h> #include <sys/note.h> #include <sys/open.h> +#include <sys/sdt.h> #include <sys/stat.h> #include <sys/sunddi.h> #include <sys/types.h> @@ -152,7 +153,6 @@ static int vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, static int vdc_wait_for_descriptor_update(vdc_t *vdc, uint_t idx, vio_dring_msg_t dmsg); static int vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx); -static int vdc_get_response(vdc_t *vdc, int start, int end); static int vdc_populate_mem_hdl(vdc_t *vdc, uint_t idx, caddr_t addr, size_t nbytes, int operation); static boolean_t vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg, int @@ -162,19 +162,26 @@ static boolean_t vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg, int static int vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode); static int vdc_create_fake_geometry(vdc_t *vdc); static int vdc_setup_disk_layout(vdc_t *vdc); -static int vdc_null_copy_func(void *from, void *to, int mode, int dir); -static int vdc_get_vtoc_convert(void *from, void *to, int mode, int dir); -static int vdc_set_vtoc_convert(void *from, void *to, int mode, int dir); -static int vdc_get_geom_convert(void *from, void *to, int mode, int dir); -static int vdc_set_geom_convert(void *from, void *to, int mode, int dir); -static int vdc_uscsicmd_convert(void *from, void *to, int mode, int dir); +static int vdc_null_copy_func(vdc_t *vdc, void *from, void *to, + int mode, int dir); +static int vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to, + int mode, int dir); +static int vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, + int mode, int dir); +static int vdc_get_geom_convert(vdc_t *vdc, void *from, void *to, + int mode, int dir); +static int vdc_set_geom_convert(vdc_t *vdc, void *from, void *to, + int mode, int dir); +static int vdc_uscsicmd_convert(vdc_t *vdc, void *from, void *to, + int mode, int dir); /* * Module variables */ uint64_t vdc_hz_timeout; uint64_t vdc_usec_timeout = VDC_USEC_TIMEOUT_MIN; -uint64_t vdc_dump_usec_timeout = VDC_USEC_TIMEOUT_MIN / 300; +uint64_t vdc_usec_timeout_dump = VDC_USEC_TIMEOUT_MIN / 300; +uint64_t vdc_usec_timeout_dring = 10 * MILLISEC; static int vdc_retries = VDC_RETRIES; static int vdc_dump_retries = VDC_RETRIES * 10; @@ -932,18 +939,38 @@ vdc_print(dev_t dev, char *str) static int vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) { - int rv = 0; - size_t nbytes = (nblk * DEV_BSIZE); - int instance = SDUNIT(getminor(dev)); - vdc_t *vdc; + buf_t *buf; /* BWRITE requests need to be in a buf_t structure */ + int rv; + size_t nbytes = nblk * DEV_BSIZE; + int instance = SDUNIT(getminor(dev)); + vdc_t *vdc = NULL; if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) { vdc_msg("%s (%d): Could not get state.", __func__, instance); return (ENXIO); } - rv = vdc_populate_descriptor(vdc, addr, nbytes, VD_OP_BWRITE, - blkno, SDPART(getminor(dev))); + buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); + bioinit(buf); + buf->b_un.b_addr = addr; + buf->b_bcount = nbytes; + buf->b_flags = B_BUSY | B_WRITE; + buf->b_dev = dev; + rv = vdc_populate_descriptor(vdc, (caddr_t)buf, nbytes, + VD_OP_BWRITE, blkno, SDPART(getminor(dev))); + + /* + * If the OS instance is panicking, the call above will ensure that + * the descriptor is done before returning. This should always be + * case when coming through this function but we check just in case + * and wait if necessary for the vDisk server to ACK and trigger + * the biodone. + */ + if (!ddi_in_panic()) + rv = biowait(buf); + + biofini(buf); + kmem_free(buf, sizeof (buf_t)); PR1("%s: status=%d\n", __func__, rv); @@ -983,22 +1010,32 @@ vdc_strategy(struct buf *buf) return (0); } + DTRACE_IO2(vstart, buf_t *, buf, vdc_t *, vdc); + ASSERT(buf->b_bcount <= (vdc->max_xfer_sz * vdc->block_size)); if (!vdc_is_able_to_tx_data(vdc, O_NONBLOCK)) { - vdc_msg("%s: Not ready to transmit data", __func__); + PR0("%s: Not ready to transmit data\n", __func__); bioerror(buf, ENXIO); biodone(buf); return (0); } bp_mapin(buf); - rv = vdc_populate_descriptor(vdc, buf->b_un.b_addr, buf->b_bcount, op, + rv = vdc_populate_descriptor(vdc, (caddr_t)buf, buf->b_bcount, op, buf->b_lblkno, SDPART(getminor(buf->b_edev))); - PR1("%s: status=%d", __func__, rv); - bioerror(buf, rv); - biodone(buf); + /* + * If the request was successfully sent, the strategy call returns and + * the ACK handler calls the bioxxx functions when the vDisk server is + * done. + */ + if (rv) { + PR0("[%d] Failed to read/write (err=%d)\n", instance, rv); + bioerror(buf, rv); + biodone(buf); + } + return (0); } @@ -1900,6 +1937,8 @@ vdc_destroy_descriptor_ring(vdc_t *vdc) * * Description: * This function gets the index of the next Descriptor Ring entry available + * If the ring is full, it will back off and wait for the next entry to be + * freed (the ACK handler will signal). * * Return Value: * 0 <= rv < VD_DRING_LEN Next available slot @@ -1910,9 +1949,9 @@ vdc_get_next_dring_entry_idx(vdc_t *vdc, uint_t num_slots_needed) { _NOTE(ARGUNUSED(num_slots_needed)) - vd_dring_entry_t *dep = NULL; /* Dring Entry Pointer */ + vd_dring_entry_t *dep = NULL; /* DRing Entry Pointer */ + vdc_local_desc_t *ldep = NULL; /* Local DRing Entry Pointer */ int idx = -1; - int start_idx = 0; ASSERT(vdc != NULL); ASSERT(vdc->dring_len == VD_DRING_LEN); @@ -1920,67 +1959,31 @@ vdc_get_next_dring_entry_idx(vdc_t *vdc, uint_t num_slots_needed) ASSERT(vdc->dring_curr_idx < VD_DRING_LEN); ASSERT(mutex_owned(&vdc->dring_lock)); - /* Start at the last entry used */ - idx = start_idx = vdc->dring_curr_idx; - - /* - * Loop through Descriptor Ring checking for a free entry until we reach - * the entry we started at. We should never come close to filling the - * Ring at any stage, instead this is just to prevent an entry which - * gets into an inconsistent state (e.g. due to a request timing out) - * from blocking progress. - */ - do { - /* Get the next entry after the last known index tried */ - idx = (idx + 1) % VD_DRING_LEN; - - dep = VDC_GET_DRING_ENTRY_PTR(vdc, idx); - ASSERT(dep != NULL); + /* pick the next descriptor after the last one used */ + idx = (vdc->dring_curr_idx + 1) % VD_DRING_LEN; + ldep = &vdc->local_dring[idx]; + ASSERT(ldep != NULL); + dep = ldep->dep; + ASSERT(dep != NULL); + mutex_enter(&ldep->lock); + if (dep->hdr.dstate == VIO_DESC_FREE) { + vdc->dring_curr_idx = idx; + } else { + DTRACE_PROBE(full); + (void) cv_timedwait(&ldep->cv, &ldep->lock, + VD_GET_TIMEOUT_HZ(1)); if (dep->hdr.dstate == VIO_DESC_FREE) { - ASSERT(idx >= 0); - ASSERT(idx < VD_DRING_LEN); vdc->dring_curr_idx = idx; - return (idx); - - } else if (dep->hdr.dstate == VIO_DESC_READY) { - PR0("%s: Entry %d waiting to be accepted\n", - __func__, idx); - continue; - - } else if (dep->hdr.dstate == VIO_DESC_ACCEPTED) { - PR0("%s: Entry %d waiting to be processed\n", - __func__, idx); - continue; - - } else if (dep->hdr.dstate == VIO_DESC_DONE) { - PR0("%s: Entry %d done but not marked free\n", - __func__, idx); - - /* - * If we are currently panicking, interrupts are - * disabled and we will not be getting ACKs from the - * vDisk server so we mark the descriptor ring entries - * as FREE here instead of in the ACK handler. - */ - if (panicstr) { - (void) vdc_depopulate_descriptor(vdc, idx); - dep->hdr.dstate = VIO_DESC_FREE; - vdc->local_dring[idx].flags = VIO_DESC_FREE; - } - continue; - } else { - vdc_msg("Public Descriptor Ring entry corrupted"); - mutex_enter(&vdc->lock); - vdc_reset_connection(vdc, B_FALSE); - mutex_exit(&vdc->lock); - return (-1); + PR0("[%d] Entry %d unavailable still in state %d\n", + vdc->instance, idx, dep->hdr.dstate); + idx = -1; /* indicate that the ring is full */ } + } + mutex_exit(&ldep->lock); - } while (idx != start_idx); - - return (-1); + return (idx); } /* @@ -1994,7 +1997,11 @@ vdc_get_next_dring_entry_idx(vdc_t *vdc, uint_t num_slots_needed) * * Arguments: * vdc - the soft state pointer - * addr - start address of memory region. + * addr - address of structure to be written. In the case of block + * reads and writes this structure will be a buf_t and the + * address of the data to be written will be in the b_un.b_addr + * field. Otherwise the value of addr will be the address + * to be written. * nbytes - number of bytes to read/write * operation - operation we want vds to perform (VD_OP_XXX) * arg - parameter to be sent to server (depends on VD_OP_XXX type) @@ -2031,8 +2038,8 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation, idx = vdc_get_next_dring_entry_idx(vdc, 1); if (idx == -1) { mutex_exit(&vdc->dring_lock); - vdc_msg("%s[%d]: no descriptor ring entry avail, seq=%d\n", - __func__, vdc->instance, vdc->seq_num); + PR0("[%d] no descriptor ring entry avail, last seq=%d\n", + vdc->instance, vdc->seq_num - 1); /* * Since strategy should not block we don't wait for the DRing @@ -2047,17 +2054,23 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation, ASSERT(dep != NULL); /* - * Wait for anybody still using the DRing entry to finish. - * (e.g. still waiting for vds to respond to a request) + * We now get the lock for this descriptor before dropping the overall + * DRing lock. This prevents a race condition where another vdc thread + * could grab the descriptor we selected. */ + ASSERT(!MUTEX_HELD(&local_dep->lock)); mutex_enter(&local_dep->lock); + mutex_exit(&vdc->dring_lock); switch (operation) { case VD_OP_BREAD: case VD_OP_BWRITE: + local_dep->buf = (struct buf *)addr; + local_dep->addr = local_dep->buf->b_un.b_addr; PR1("buf=%p, block=%lx, nbytes=%lx\n", addr, arg, nbytes); dep->payload.addr = (diskaddr_t)arg; - rv = vdc_populate_mem_hdl(vdc, idx, addr, nbytes, operation); + rv = vdc_populate_mem_hdl(vdc, idx, local_dep->addr, + nbytes, operation); break; case VD_OP_GET_VTOC: @@ -2065,6 +2078,7 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation, case VD_OP_GET_DISKGEOM: case VD_OP_SET_DISKGEOM: case VD_OP_SCSICMD: + local_dep->addr = addr; if (nbytes > 0) { rv = vdc_populate_mem_hdl(vdc, idx, addr, nbytes, operation); @@ -2085,7 +2099,6 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation, if (rv != 0) { mutex_exit(&local_dep->lock); - mutex_exit(&vdc->dring_lock); return (rv); } @@ -2101,30 +2114,34 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation, dep->hdr.ack = 1; /* request an ACK for every message */ local_dep->flags = VIO_DESC_READY; - local_dep->addr = addr; /* * Send a msg with the DRing details to vds */ + mutex_enter(&vdc->lock); VIO_INIT_DRING_DATA_TAG(dmsg); VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdc); dmsg.dring_ident = vdc->dring_ident; dmsg.start_idx = idx; dmsg.end_idx = idx; + DTRACE_IO2(send, vio_dring_msg_t *, &dmsg, vdc_t *, vdc); + PR1("ident=0x%llx, st=%d, end=%d, seq=%d req=%d dep=%p\n", vdc->dring_ident, dmsg.start_idx, dmsg.end_idx, dmsg.seq_num, dep->payload.req_id, dep); - mutex_enter(&vdc->lock); rv = vdc_send(vdc, (caddr_t)&dmsg, &msglen); - mutex_exit(&vdc->lock); PR1("%s[%d]: ldc_write() rv=%d\n", __func__, vdc->instance, rv); if (rv != 0) { + mutex_exit(&vdc->lock); mutex_exit(&local_dep->lock); - mutex_exit(&vdc->dring_lock); vdc_msg("%s: ldc_write(%d)\n", __func__, rv); - return (EAGAIN); + + /* Clear the DRing entry */ + rv = vdc_depopulate_descriptor(vdc, idx); + + return (rv ? rv : EAGAIN); } /* @@ -2132,14 +2149,7 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation, * number to be used by the next message */ vdc->seq_num++; - - /* - * XXX - potential performance enhancement (Investigate at a later date) - * - * for calls from strategy(9E), instead of waiting for a response from - * vds, we could return at this stage and let the ACK handling code - * trigger the biodone(9F) - */ + mutex_exit(&vdc->lock); /* * When a guest is panicking, the completion of requests needs to be @@ -2170,7 +2180,7 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation, } PR1("Waiting for next packet @ %d\n", idx); - delay(drv_usectohz(vdc_dump_usec_timeout)); + drv_usecwait(vdc_usec_timeout_dump); continue; } @@ -2238,14 +2248,24 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation, } mutex_exit(&local_dep->lock); - mutex_exit(&vdc->dring_lock); return (rv); } /* - * Now watch the DRing entries we modified to get the response - * from vds. + * In the case of calls from strategy and dump (in the non-panic case), + * instead of waiting for a response from the vDisk server return now. + * They will be processed asynchronously and the vdc ACK handling code + * will trigger the biodone(9F) + */ + if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { + mutex_exit(&local_dep->lock); + return (rv); + } + + /* + * In the case of synchronous calls we watch the DRing entries we + * modified and await the response from vds. */ rv = vdc_wait_for_descriptor_update(vdc, idx, dmsg); if (rv == ETIMEDOUT) { @@ -2257,7 +2277,6 @@ vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation, PR1("%s[%d] Status=%d\n", __func__, vdc->instance, rv); mutex_exit(&local_dep->lock); - mutex_exit(&vdc->dring_lock); return (rv); } @@ -2287,7 +2306,6 @@ vdc_wait_for_descriptor_update(vdc_t *vdc, uint_t idx, vio_dring_msg_t dmsg) int rv = 0; ASSERT(vdc != NULL); - ASSERT(mutex_owned(&vdc->dring_lock)); ASSERT(idx < VD_DRING_LEN); local_dep = &vdc->local_dring[idx]; ASSERT(local_dep != NULL); @@ -2329,12 +2347,12 @@ vdc_wait_for_descriptor_update(vdc_t *vdc, uint_t idx, vio_dring_msg_t dmsg) * and have never made it to the other side (vds). * (We reuse the original message but update seq ID) */ + mutex_enter(&vdc->lock); VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdc); retries = 0; - mutex_enter(&vdc->lock); status = vdc_send(vdc, (caddr_t)&dmsg, &msglen); - mutex_exit(&vdc->lock); if (status != 0) { + mutex_exit(&vdc->lock); vdc_msg("%s: Error (%d) while resending after " "timeout\n", __func__, status); status = ETIMEDOUT; @@ -2345,60 +2363,13 @@ vdc_wait_for_descriptor_update(vdc_t *vdc, uint_t idx, vio_dring_msg_t dmsg) * the sequence number to be used by the next message. */ vdc->seq_num++; + mutex_exit(&vdc->lock); } } return (status); } -static int -vdc_get_response(vdc_t *vdc, int start, int end) -{ - vdc_local_desc_t *ldep = NULL; /* Local Dring Entry Pointer */ - vd_dring_entry_t *dep = NULL; /* Dring Entry Pointer */ - int status = ENXIO; - int idx = -1; - - ASSERT(vdc != NULL); - ASSERT(start >= 0); - ASSERT(start <= VD_DRING_LEN); - ASSERT(start >= -1); - ASSERT(start <= VD_DRING_LEN); - - idx = start; - ldep = &vdc->local_dring[idx]; - ASSERT(ldep != NULL); - dep = ldep->dep; - ASSERT(dep != NULL); - - PR0("%s[%d] DRING entry=%d status=%d\n", __func__, vdc->instance, - idx, VIO_GET_DESC_STATE(dep->hdr.dstate)); - while (VIO_GET_DESC_STATE(dep->hdr.dstate) == VIO_DESC_DONE) { - if ((end != -1) && (idx > end)) - return (0); - - switch (ldep->operation) { - case VD_OP_BREAD: - case VD_OP_BWRITE: - /* call bioxxx */ - break; - default: - /* signal waiter */ - break; - } - - /* Clear the DRing entry */ - status = vdc_depopulate_descriptor(vdc, idx); - PR0("%s[%d] Status=%d\n", __func__, vdc->instance, status); - - /* loop accounting to get next DRing entry */ - idx++; - ldep = &vdc->local_dring[idx]; - dep = ldep->dep; - } - - return (status); -} /* * Function: @@ -2452,7 +2423,7 @@ vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx) bcopy(ldep->align_addr, ldep->addr, dep->payload.nbytes); kmem_free(ldep->align_addr, - sizeof (caddr_t) * dep->payload.nbytes); + sizeof (caddr_t) * P2ROUNDUP(dep->payload.nbytes, 8)); ldep->align_addr = NULL; } @@ -2536,17 +2507,20 @@ vdc_populate_mem_hdl(vdc_t *vdc, uint_t idx, caddr_t addr, size_t nbytes, */ vaddr = addr; if (((uint64_t)addr & 0x7) != 0) { + ASSERT(ldep->align_addr == NULL); ldep->align_addr = - kmem_zalloc(sizeof (caddr_t) * nbytes, KM_SLEEP); + kmem_zalloc(sizeof (caddr_t) * P2ROUNDUP(nbytes, 8), + KM_SLEEP); PR0("%s[%d] Misaligned address %lx reallocating " - "(buf=%lx entry=%d)\n", - __func__, vdc->instance, addr, ldep->align_addr, idx); + "(buf=%lx nb=%d op=%d entry=%d)\n", + __func__, vdc->instance, addr, ldep->align_addr, nbytes, + operation, idx); bcopy(addr, ldep->align_addr, nbytes); vaddr = ldep->align_addr; } rv = ldc_mem_bind_handle(mhdl, vaddr, P2ROUNDUP(nbytes, 8), - vdc->dring_mem_info.mtype, perm, &dep->payload.cookie[0], + LDC_SHADOW_MAP, perm, &dep->payload.cookie[0], &dep->payload.ncookies); PR1("%s[%d] bound mem handle; ncookies=%d\n", __func__, vdc->instance, dep->payload.ncookies); @@ -2556,7 +2530,7 @@ vdc_populate_mem_hdl(vdc_t *vdc, uint_t idx, caddr_t addr, size_t nbytes, __func__, vdc->instance, mhdl, addr, idx, rv); if (ldep->align_addr) { kmem_free(ldep->align_addr, - sizeof (caddr_t) * dep->payload.nbytes); + sizeof (caddr_t) * P2ROUNDUP(nbytes, 8)); ldep->align_addr = NULL; } return (EAGAIN); @@ -2986,7 +2960,7 @@ static int vdc_process_data_msg(vdc_t *vdc, vio_msg_t msg) { int status = 0; - vdc_local_desc_t *local_dep = NULL; + vdc_local_desc_t *ldep = NULL; vio_dring_msg_t *dring_msg = NULL; uint_t num_msgs; uint_t start; @@ -3010,6 +2984,8 @@ vdc_process_data_msg(vdc_t *vdc, vio_msg_t msg) return (EPROTO); } + DTRACE_IO2(recv, vio_dring_msg_t, dring_msg, vdc_t *, vdc); + /* * calculate the number of messages that vds ACK'ed * @@ -3031,12 +3007,32 @@ vdc_process_data_msg(vdc_t *vdc, vio_msg_t msg) * Wake the thread waiting for each DRing entry ACK'ed */ for (i = 0; i < num_msgs; i++) { + int operation; int idx = (start + i) % VD_DRING_LEN; - local_dep = &vdc->local_dring[idx]; - mutex_enter(&local_dep->lock); - cv_signal(&local_dep->cv); - mutex_exit(&local_dep->lock); + ldep = &vdc->local_dring[idx]; + mutex_enter(&ldep->lock); + operation = ldep->dep->payload.operation; + if ((operation == VD_OP_BREAD) || (operation == VD_OP_BWRITE)) { + /* + * The vDisk server responds when it accepts a + * descriptor so we continue looping and process + * it when it sends the message that it is done. + */ + if (ldep->dep->hdr.dstate != VIO_DESC_DONE) { + mutex_exit(&ldep->lock); + continue; + } + bioerror(ldep->buf, ldep->dep->payload.status); + biodone(ldep->buf); + + DTRACE_IO2(vdone, buf_t *, ldep->buf, vdc_t *, vdc); + + /* Clear the DRing entry */ + status = vdc_depopulate_descriptor(vdc, idx); + } + cv_signal(&ldep->cv); + mutex_exit(&ldep->lock); } if (msg.tag.vio_subtype == VIO_SUBTYPE_NACK) { @@ -3348,6 +3344,7 @@ vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg, int num_msgs) { ASSERT(vdc != NULL); ASSERT(dring_msg != NULL); + ASSERT(mutex_owned(&vdc->lock)); /* * Check to see if the messages were responded to in the correct @@ -3357,7 +3354,7 @@ vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg, int num_msgs) * if so something is seriously wrong so we reset the connection * - a seq_num greater than what we expected is returned. */ - if (dring_msg->seq_num != (vdc->seq_num_reply + num_msgs)) { + if (dring_msg->seq_num < vdc->seq_num_reply) { vdc_msg("%s[%d]: Bogus seq_num %d, expected %d\n", __func__, vdc->instance, dring_msg->seq_num, vdc->seq_num_reply + num_msgs); @@ -3529,7 +3526,8 @@ typedef struct vdc_dk_ioctl { size_t nbytes; /* size of structure to be copied */ /* function to convert between vDisk and Solaris structure formats */ - int (*convert)(void *vd_buf, void *ioctl_arg, int mode, int dir); + int (*convert)(vdc_t *vdc, void *vd_buf, void *ioctl_arg, + int mode, int dir); } vdc_dk_ioctl_t; /* @@ -3546,15 +3544,13 @@ static vdc_dk_ioctl_t dk_ioctl[] = { vdc_get_vtoc_convert}, {VD_OP_SET_VTOC, DKIOCSVTOC, sizeof (vd_vtoc_t), vdc_set_vtoc_convert}, - {VD_OP_SET_DISKGEOM, DKIOCSGEOM, sizeof (vd_geom_t), - vdc_get_geom_convert}, {VD_OP_GET_DISKGEOM, DKIOCGGEOM, sizeof (vd_geom_t), vdc_get_geom_convert}, {VD_OP_GET_DISKGEOM, DKIOCG_PHYGEOM, sizeof (vd_geom_t), vdc_get_geom_convert}, - {VD_OP_GET_DISKGEOM, DKIOCG_VIRTGEOM, sizeof (vd_geom_t), + {VD_OP_GET_DISKGEOM, DKIOCG_VIRTGEOM, sizeof (vd_geom_t), vdc_get_geom_convert}, - {VD_OP_SET_DISKGEOM, DKIOCSGEOM, sizeof (vd_geom_t), + {VD_OP_SET_DISKGEOM, DKIOCSGEOM, sizeof (vd_geom_t), vdc_set_geom_convert}, /* @@ -3600,6 +3596,7 @@ vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode) size_t alloc_len = 0; /* #bytes to allocate mem for */ caddr_t mem_p = NULL; size_t nioctls = (sizeof (dk_ioctl)) / (sizeof (dk_ioctl[0])); + struct vtoc vtoc_saved; PR0("%s: Processing ioctl(%x) for dev %x : model %x\n", __func__, cmd, dev, ddi_model_convert_from(mode & FMODELS)); @@ -3740,13 +3737,21 @@ vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode) ASSERT(alloc_len != 0); /* sanity check */ mem_p = kmem_zalloc(alloc_len, KM_SLEEP); + if (cmd == DKIOCSVTOC) { + /* + * Save a copy of the current VTOC so that we can roll back + * if the setting of the new VTOC fails. + */ + bcopy(vdc->vtoc, &vtoc_saved, sizeof (struct vtoc)); + } + /* * Call the conversion function for this ioctl whhich if necessary * converts from the Solaris format to the format ARC'ed * as part of the vDisk protocol (FWARC 2006/195) */ ASSERT(dk_ioctl[idx].convert != NULL); - rv = (dk_ioctl[idx].convert)(arg, mem_p, mode, VD_COPYIN); + rv = (dk_ioctl[idx].convert)(vdc, arg, mem_p, mode, VD_COPYIN); if (rv != 0) { PR0("%s[%d]: convert returned %d for ioctl 0x%x\n", __func__, instance, rv, cmd); @@ -3770,20 +3775,24 @@ vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode) __func__, instance, rv, cmd); if (mem_p != NULL) kmem_free(mem_p, alloc_len); + + if (cmd == DKIOCSVTOC) { + /* update of the VTOC has failed, roll back */ + bcopy(&vtoc_saved, vdc->vtoc, sizeof (struct vtoc)); + } + return (rv); } - /* - * If the VTOC has been changed, then vdc needs to update the copy - * it saved in the soft state structure and try and update the device - * node properties. Failing to set the properties should not cause - * an error to be return the caller though. - */ if (cmd == DKIOCSVTOC) { - bcopy(mem_p, vdc->vtoc, sizeof (struct vtoc)); + /* + * The VTOC has been changed, try and update the device + * node properties. Failing to set the properties should + * not cause an error to be return the caller though. + */ if (vdc_create_device_nodes_props(vdc)) { cmn_err(CE_NOTE, "![%d] Failed to update device nodes" - " properties", instance); + " properties", vdc->instance); } } @@ -3793,7 +3802,7 @@ vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode) * protocol (FWARC 2006/195) back to a format understood by * the rest of Solaris. */ - rv = (dk_ioctl[idx].convert)(mem_p, arg, mode, VD_COPYOUT); + rv = (dk_ioctl[idx].convert)(vdc, mem_p, arg, mode, VD_COPYOUT); if (rv != 0) { PR0("%s[%d]: convert returned %d for ioctl 0x%x\n", __func__, instance, rv, cmd); @@ -3816,8 +3825,9 @@ vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode) * do not need to convert the data being passed in/out to userland */ static int -vdc_null_copy_func(void *from, void *to, int mode, int dir) +vdc_null_copy_func(vdc_t *vdc, void *from, void *to, int mode, int dir) { + _NOTE(ARGUNUSED(vdc)) _NOTE(ARGUNUSED(from)) _NOTE(ARGUNUSED(to)) _NOTE(ARGUNUSED(mode)) @@ -3831,9 +3841,16 @@ vdc_null_copy_func(void *from, void *to, int mode, int dir) * vdc_get_vtoc_convert() * * Description: - * This routine fakes up the disk info needed for some DKIO ioctls. + * This routine performs the necessary convertions from the DKIOCGVTOC + * Solaris structure to the format defined in FWARC 2006/195. + * + * In the struct vtoc definition, the timestamp field is marked as not + * supported so it is not part of vDisk protocol (FWARC 2006/195). + * However SVM uses that field to check it can write into the VTOC, + * so we fake up the info of that field. * * Arguments: + * vdc - the vDisk client * from - the buffer containing the data to be copied from * to - the buffer to be copied to * mode - flags passed to ioctl() call @@ -3842,11 +3859,12 @@ vdc_null_copy_func(void *from, void *to, int mode, int dir) * Return Code: * 0 - Success * ENXIO - incorrect buffer passed in. - * EFAULT - ddi_copyxxx routine encountered an error. + * EFAULT - ddi_copyout routine encountered an error. */ static int -vdc_get_vtoc_convert(void *from, void *to, int mode, int dir) +vdc_get_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) { + int i; void *tmp_mem = NULL; void *tmp_memp; struct vtoc vt; @@ -3868,6 +3886,12 @@ vdc_get_vtoc_convert(void *from, void *to, int mode, int dir) tmp_mem = kmem_alloc(copy_len, KM_SLEEP); VD_VTOC2VTOC((vd_vtoc_t *)from, &vt); + + /* fake the VTOC timestamp field */ + for (i = 0; i < V_NUMPAR; i++) { + vt.timestamp[i] = vdc->vtoc->timestamp[i]; + } + if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { vtoctovtoc32(vt, vt32); tmp_memp = &vt32; @@ -3887,8 +3911,11 @@ vdc_get_vtoc_convert(void *from, void *to, int mode, int dir) * vdc_set_vtoc_convert() * * Description: + * This routine performs the necessary convertions from the DKIOCSVTOC + * Solaris structure to the format defined in FWARC 2006/195. * * Arguments: + * vdc - the vDisk client * from - Buffer with data * to - Buffer where data is to be copied to * mode - flags passed to ioctl @@ -3900,7 +3927,7 @@ vdc_get_vtoc_convert(void *from, void *to, int mode, int dir) * EFAULT - ddi_copyin of data failed */ static int -vdc_set_vtoc_convert(void *from, void *to, int mode, int dir) +vdc_set_vtoc_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) { void *tmp_mem = NULL; struct vtoc vt; @@ -3934,6 +3961,12 @@ vdc_set_vtoc_convert(void *from, void *to, int mode, int dir) vtp = tmp_mem; } + /* + * The VTOC is being changed, then vdc needs to update the copy + * it saved in the soft state structure. + */ + bcopy(vtp, vdc->vtoc, sizeof (struct vtoc)); + VTOC2VD_VTOC(vtp, &vtvd); bcopy(&vtvd, to, sizeof (vd_vtoc_t)); kmem_free(tmp_mem, copy_len); @@ -3946,8 +3979,12 @@ vdc_set_vtoc_convert(void *from, void *to, int mode, int dir) * vdc_get_geom_convert() * * Description: + * This routine performs the necessary convertions from the DKIOCGGEOM, + * DKIOCG_PHYSGEOM and DKIOG_VIRTGEOM Solaris structures to the format + * defined in FWARC 2006/195 * * Arguments: + * vdc - the vDisk client * from - Buffer with data * to - Buffer where data is to be copied to * mode - flags passed to ioctl @@ -3956,11 +3993,13 @@ vdc_set_vtoc_convert(void *from, void *to, int mode, int dir) * Return Code: * 0 - Success * ENXIO - Invalid buffer passed in - * EFAULT - ddi_copyin of data failed + * EFAULT - ddi_copyout of data failed */ static int -vdc_get_geom_convert(void *from, void *to, int mode, int dir) +vdc_get_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) { + _NOTE(ARGUNUSED(vdc)) + struct dk_geom geom; int copy_len = sizeof (struct dk_geom); int rv = 0; @@ -3984,10 +4023,11 @@ vdc_get_geom_convert(void *from, void *to, int mode, int dir) * vdc_set_geom_convert() * * Description: - * This routine performs the necessary convertions from the DKIOCSVTOC - * Solaris structure to the format defined in FWARC 2006/195 + * This routine performs the necessary convertions from the DKIOCSGEOM + * Solaris structure to the format defined in FWARC 2006/195. * * Arguments: + * vdc - the vDisk client * from - Buffer with data * to - Buffer where data is to be copied to * mode - flags passed to ioctl @@ -3999,8 +4039,10 @@ vdc_get_geom_convert(void *from, void *to, int mode, int dir) * EFAULT - ddi_copyin of data failed */ static int -vdc_set_geom_convert(void *from, void *to, int mode, int dir) +vdc_set_geom_convert(vdc_t *vdc, void *from, void *to, int mode, int dir) { + _NOTE(ARGUNUSED(vdc)) + vd_geom_t vdgeom; void *tmp_mem = NULL; int copy_len = sizeof (struct dk_geom); @@ -4104,6 +4146,7 @@ vdc_create_fake_geometry(vdc_t *vdc) static int vdc_setup_disk_layout(vdc_t *vdc) { + buf_t *buf; /* BREAD requests need to be in a buf_t structure */ dev_t dev; int slice = 0; int rv; @@ -4129,14 +4172,9 @@ vdc_setup_disk_layout(vdc_t *vdc) } /* - * Read disk label from start of disk - */ - vdc->label = kmem_zalloc(DK_LABEL_SIZE, KM_SLEEP); - - /* * find the slice that represents the entire "disk" and use that to * read the disk label. The convention in Solaris is that slice 2 - * represents the whole disk so we check that it is otherwise we + * represents the whole disk so we check that it is, otherwise we * default to slice 0 */ if ((vdc->vdisk_type == VD_DISK_TYPE_DISK) && @@ -4145,8 +4183,22 @@ vdc_setup_disk_layout(vdc_t *vdc) } else { slice = 0; } - rv = vdc_populate_descriptor(vdc, (caddr_t)vdc->label, DK_LABEL_SIZE, + + /* + * Read disk label from start of disk + */ + vdc->label = kmem_zalloc(DK_LABEL_SIZE, KM_SLEEP); + buf = kmem_alloc(sizeof (buf_t), KM_SLEEP); + bioinit(buf); + buf->b_un.b_addr = (caddr_t)vdc->label; + buf->b_bcount = DK_LABEL_SIZE; + buf->b_flags = B_BUSY | B_READ; + buf->b_dev = dev; + rv = vdc_populate_descriptor(vdc, (caddr_t)buf, DK_LABEL_SIZE, VD_OP_BREAD, 0, slice); + rv = biowait(buf); + biofini(buf); + kmem_free(buf, sizeof (buf_t)); return (rv); } |