path: root/usr/src/uts/common/io/lvm/raid/raid.c
author    stevel@tonic-gate <none@none>  2005-06-14 00:00:00 -0700
committer stevel@tonic-gate <none@none>  2005-06-14 00:00:00 -0700
commit    7c478bd95313f5f23a4c958a745db2134aa03244 (patch)
tree      c871e58545497667cbb4b0a4f2daf204743e1fe7 /usr/src/uts/common/io/lvm/raid/raid.c
OpenSolaris Launch
Diffstat (limited to 'usr/src/uts/common/io/lvm/raid/raid.c')
-rw-r--r--  usr/src/uts/common/io/lvm/raid/raid.c  4395
1 file changed, 4395 insertions, 0 deletions
diff --git a/usr/src/uts/common/io/lvm/raid/raid.c b/usr/src/uts/common/io/lvm/raid/raid.c
new file mode 100644
index 0000000000..0fec16660d
--- /dev/null
+++ b/usr/src/uts/common/io/lvm/raid/raid.c
@@ -0,0 +1,4395 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * NAME: raid.c
+ *
+ * DESCRIPTION: Main RAID driver source file containing open, close and I/O
+ * operations.
+ *
+ * ROUTINES PROVIDED FOR EXTERNAL USE:
+ * raid_open() - open the RAID metadevice for access.
+ * raid_internal_open() - internal open routine of RAID metadevice.
+ * md_raid_strategy() - perform normal I/O operations,
+ * such as read and write.
+ * raid_close() - close the RAID metadevice.
+ * raid_internal_close() - internal close routine of RAID metadevice.
+ * raid_snarf() - initialize and clean up MDD records.
+ * raid_halt() - reset the RAID metadevice
+ * raid_line() - return the line # of this segment
+ * raid_dcolumn() - return the data column # of this segment
+ * raid_pcolumn() - return the parity column # of this segment
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/file.h>
+#include <sys/user.h>
+#include <sys/uio.h>
+#include <sys/t_lock.h>
+#include <sys/buf.h>
+#include <sys/dkio.h>
+#include <sys/vtoc.h>
+#include <sys/kmem.h>
+#include <vm/page.h>
+#include <sys/cmn_err.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/mkdev.h>
+#include <sys/stat.h>
+#include <sys/open.h>
+#include <sys/modctl.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/debug.h>
+#include <sys/lvm/md_raid.h>
+#include <sys/lvm/mdvar.h>
+#include <sys/lvm/md_convert.h>
+
+#include <sys/sysevent/eventdefs.h>
+#include <sys/sysevent/svm.h>
+
+md_ops_t raid_md_ops;
+#ifndef lint
+static char _depends_on[] = "drv/md";
+md_ops_t *md_interface_ops = &raid_md_ops;
+#endif /* lint */
+
+extern unit_t md_nunits;
+extern unit_t md_nsets;
+extern md_set_t md_set[];
+extern int md_status;
+extern major_t md_major;
+extern mdq_anchor_t md_done_daemon;
+extern mdq_anchor_t md_mstr_daemon;
+extern int md_sleep_for_test;
+extern clock_t md_hz;
+
+extern md_event_queue_t *md_event_queue;
+
+
+int pchunks = 16;
+int phigh = 1024;
+int plow = 128;
+int cchunks = 64;
+int chigh = 1024;
+int clow = 512;
+int bchunks = 32;
+int bhigh = 256;
+int blow = 128;
+
+int raid_total_io = 0;
+int raid_reads = 0;
+int raid_writes = 0;
+int raid_no_bpmaps = 0;
+int raid_512 = 0;
+int raid_1024 = 0;
+int raid_1024_8192 = 0;
+int raid_8192 = 0;
+int raid_8192_bigger = 0;
+int raid_line_lock_wait = 0;
+
+int data_buffer_waits = 0;
+int parity_buffer_waits = 0;
+
+/* writer line locks */
+int raid_writer_locks = 0; /* total writer locks */
+int raid_write_waits = 0; /* total writer locks that waited */
+int raid_full_line_writes = 0; /* total full line writes */
+int raid_write_queue_length = 0; /* wait queue length */
+int raid_max_write_q_length = 0; /* maximum queue length */
+int raid_write_locks_active = 0; /* writer locks at any time */
+int raid_max_write_locks = 0; /* maximum writer locks active */
+
+/* read line locks */
+int raid_reader_locks = 0; /* total reader locks held */
+int raid_reader_locks_active = 0; /* reader locks held */
+int raid_max_reader_locks = 0; /* maximum reader locks held in run */
+int raid_read_overlaps = 0; /* number of times 2 reads hit same line */
+int raid_read_waits = 0; /* times a reader waited on writer */
+
+/* prewrite stats */
+int raid_prewrite_waits = 0; /* number of waits for a pw slot */
+int raid_pw = 0; /* number of pw slots in use */
+int raid_prewrite_max = 0; /* maximum number of pw slots in use */
+int raid_pw_invalidates = 0;
+
+static clock_t md_wr_wait = 0;
+
+int nv_available = 0; /* presence of nv-ram support in device */
+int nv_prewrite = 1; /* mark prewrites with nv_available */
+int nv_parity = 1; /* mark parity with nv_available */
+
+kmem_cache_t *raid_parent_cache = NULL;
+kmem_cache_t *raid_child_cache = NULL;
+kmem_cache_t *raid_cbuf_cache = NULL;
+
+int raid_internal_open(minor_t mnum, int flag, int otyp,
+ int md_oflags);
+
+static void freebuffers(md_raidcs_t *cs);
+static int raid_read(mr_unit_t *un, md_raidcs_t *cs);
+static void raid_read_io(mr_unit_t *un, md_raidcs_t *cs);
+static int raid_write(mr_unit_t *un, md_raidcs_t *cs);
+static void raid_write_io(mr_unit_t *un, md_raidcs_t *cs);
+static void raid_stage(md_raidcs_t *cs);
+static void raid_enqueue(md_raidcs_t *cs);
+static diskaddr_t raid_line(diskaddr_t segment, mr_unit_t *un);
+uint_t raid_dcolumn(diskaddr_t segment, mr_unit_t *un);
+static void getpbuffer(md_raidcs_t *cs);
+static void getdbuffer(md_raidcs_t *cs);
+static void raid_done(buf_t *bp);
+static void raid_io_startup(mr_unit_t *un);
+
+static rus_state_t
+raid_col2unit(rcs_state_t state, rus_state_t unitstate)
+{
+ switch (state) {
+ case RCS_INIT:
+ return (RUS_INIT);
+ case RCS_OKAY:
+ return (RUS_OKAY);
+ case RCS_RESYNC:
+ if (unitstate & RUS_LAST_ERRED)
+ return (RUS_LAST_ERRED);
+ else
+ return (RUS_ERRED);
+ case RCS_ERRED:
+ return (RUS_ERRED);
+ case RCS_LAST_ERRED:
+ return (RUS_ERRED);
+ default:
+ break;
+ }
+ panic("raid_col2unit");
+ /*NOTREACHED*/
+}
+
+void
+raid_set_state(mr_unit_t *un, int col, rcs_state_t newstate, int force)
+{
+
+ rus_state_t unitstate, origstate;
+ rcs_state_t colstate;
+ rcs_state_t orig_colstate;
+ int errcnt = 0,
+ okaycnt = 0,
+ resynccnt = 0;
+ int i;
+ char *devname;
+
+ ASSERT(un);
+ ASSERT(col < un->un_totalcolumncnt);
+ ASSERT(newstate &
+ (RCS_INIT | RCS_INIT_ERRED | RCS_OKAY | RCS_RESYNC | RCS_ERRED |
+ RCS_LAST_ERRED | RCS_REGEN));
+ ASSERT((newstate &
+ ~(RCS_INIT | RCS_INIT_ERRED | RCS_OKAY | RCS_RESYNC | RCS_ERRED |
+ RCS_LAST_ERRED | RCS_REGEN))
+ == 0);
+
+ ASSERT(MDI_UNIT(MD_SID(un)) ? UNIT_WRITER_HELD(un) : 1);
+
+ unitstate = un->un_state;
+ origstate = unitstate;
+
+ if (force) {
+ un->un_column[col].un_devstate = newstate;
+ un->un_state = raid_col2unit(newstate, unitstate);
+ uniqtime32(&un->un_column[col].un_devtimestamp);
+ uniqtime32(&un->un_timestamp);
+ return;
+ }
+
+ ASSERT(un->un_state &
+ (RUS_INIT | RUS_OKAY | RUS_ERRED | RUS_DOI | RUS_LAST_ERRED |
+ RUS_REGEN));
+ ASSERT((un->un_state & ~(RUS_INIT |
+ RUS_OKAY | RUS_ERRED | RUS_DOI | RUS_LAST_ERRED | RUS_REGEN)) == 0);
+
+ if (un->un_column[col].un_devstate == newstate)
+ return;
+
+ if (newstate == RCS_REGEN) {
+ if (raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt)
+ return;
+ un->un_state = RUS_REGEN;
+ return;
+ }
+
+ orig_colstate = un->un_column[col].un_devstate;
+
+ /*
+ * if there is another column in the error state then this
+ * column should go to the last errored state
+ */
+ for (i = 0; i < un->un_totalcolumncnt; i++) {
+ if (i == col)
+ colstate = newstate;
+ else
+ colstate = un->un_column[i].un_devstate;
+ if (colstate & (RCS_ERRED | RCS_LAST_ERRED | RCS_INIT_ERRED))
+ errcnt++;
+ if (colstate & RCS_OKAY)
+ okaycnt++;
+ if (colstate & RCS_RESYNC)
+ resynccnt++;
+ }
+ ASSERT(resynccnt < 2);
+
+ if (okaycnt == un->un_totalcolumncnt)
+ unitstate = RUS_OKAY;
+ else if (errcnt > 1) {
+ unitstate = RUS_LAST_ERRED;
+ if (newstate & RCS_ERRED)
+ newstate = RCS_LAST_ERRED;
+ } else if (errcnt == 1)
+ if (!(unitstate & RUS_LAST_ERRED))
+ unitstate = RUS_ERRED;
+
+ if (un->un_state == RUS_DOI)
+ unitstate = RUS_DOI;
+
+ un->un_column[col].un_devstate = newstate;
+ uniqtime32(&un->un_column[col].un_devtimestamp);
+ /*
+	 * if a last errored column is being brought back online
+ * by open or snarf, then be sure to clear the RUS_LAST_ERRED
+ * bit to allow writes. If there is a real error then the
+ * column will go back into last erred.
+ */
+ if ((raid_state_cnt(un, RCS_LAST_ERRED) == 0) &&
+ (raid_state_cnt(un, RCS_ERRED) == 1))
+ unitstate = RUS_ERRED;
+
+ un->un_state = unitstate;
+ uniqtime32(&un->un_timestamp);
+
+ if ((! (origstate & (RUS_ERRED|RUS_LAST_ERRED|RUS_DOI))) &&
+ (unitstate & (RUS_ERRED|RUS_LAST_ERRED|RUS_DOI))) {
+ devname = md_devname(MD_UN2SET(un),
+ un->un_column[col].un_dev, NULL, 0);
+
+ cmn_err(CE_WARN, "md: %s: %s needs maintenance",
+ md_shortname(MD_SID(un)), devname);
+
+ if (unitstate & RUS_LAST_ERRED) {
+ cmn_err(CE_WARN, "md: %s: %s last erred",
+ md_shortname(MD_SID(un)), devname);
+
+ } else if (un->un_column[col].un_devflags &
+ MD_RAID_DEV_ISOPEN) {
+ /*
+ * Close the broken device and clear the open flag on
+ * it. We have to check that the device is open,
+ * otherwise the first open on it has resulted in the
+ * error that is being processed and the actual un_dev
+ * will be NODEV64.
+ */
+ md_layered_close(un->un_column[col].un_dev,
+ MD_OFLG_NULL);
+ un->un_column[col].un_devflags &= ~MD_RAID_DEV_ISOPEN;
+ }
+ } else if (orig_colstate == RCS_LAST_ERRED && newstate == RCS_ERRED &&
+ un->un_column[col].un_devflags & MD_RAID_DEV_ISOPEN) {
+ /*
+ * Similar to logic above except no log messages since we
+ * are just transitioning from Last Erred to Erred.
+ */
+ md_layered_close(un->un_column[col].un_dev, MD_OFLG_NULL);
+ un->un_column[col].un_devflags &= ~MD_RAID_DEV_ISOPEN;
+ }
+
+ /*
+ * If a resync has completed, see if there is a Last Erred
+ * component that we can change to the Erred state.
+ */
+ if ((orig_colstate == RCS_RESYNC) && (newstate == RCS_OKAY)) {
+ for (i = 0; i < un->un_totalcolumncnt; i++) {
+ if (i != col &&
+ (un->un_column[i].un_devstate & RCS_LAST_ERRED)) {
+ raid_set_state(un, i, RCS_ERRED, 0);
+ break;
+ }
+ }
+ }
+}
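+
+/*
+ * Illustrative walk-through of the errcnt logic in raid_set_state()
+ * above: with exactly one column in RCS_ERRED and the rest RCS_OKAY,
+ * the unit state becomes RUS_ERRED; once a second column errors, the
+ * unit transitions to RUS_LAST_ERRED and the newly erring column is
+ * marked RCS_LAST_ERRED rather than RCS_ERRED.
+ */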
+
+/*
+ * NAME: erred_check_line
+ *
+ * DESCRIPTION: Return the type of write to perform on an erred column based
+ * upon any resync activity.
+ *
+ * If a column is being resynced and the write is above the
+ * resync point, the write may also have to go to the target
+ * being resynced.
+ *
+ * Column state may make it impossible to do the write
+ * in which case RCL_EIO or RCL_ENXIO is returned.
+ *
+ * If a column cannot be written directly, RCL_ERRED is
+ * returned and processing should proceed accordingly.
+ *
+ * PARAMETERS: mr_unit_t *un - pointer to the RAID unit structure
+ * md_raidcs_t *cs - child save structure
+ * mr_column_t *column - pointer to the column structure being checked
+ *
+ * RETURNS: RCL_OKAY, RCL_ERRED
+ *
+ * LOCKS: Expects Line Writer Lock and Unit Resource Lock to be held
+ * across call.
+ */
+
+static int
+erred_check_line(mr_unit_t *un, md_raidcs_t *cs, mr_column_t *column)
+{
+
+ ASSERT(un != NULL);
+ ASSERT(cs->cs_flags & MD_RCS_LLOCKD);
+
+ if (column->un_devstate & RCS_OKAY)
+ return (RCL_OKAY);
+
+ if (column->un_devstate & RCS_ERRED)
+ return (RCL_ERRED); /* do not read from errored disk */
+
+ /*
+	 * For the last errored case there are two considerations.
+	 * When the last errored column is the only errored column,
+	 * treat it like a maintenance column and do not do I/O from
+	 * it.  When there are other failures, just attempt to use it.
+ */
+ if (column->un_devstate & RCS_LAST_ERRED)
+ return (RCL_ERRED);
+
+ ASSERT(column->un_devstate & RCS_RESYNC);
+
+ /*
+ * When a resync from a hotspare is being done (copy resync)
+ * then always treat it as an OKAY column, since no regen
+ * is required.
+ */
+ if (column->un_devflags & MD_RAID_COPY_RESYNC) {
+ return (RCL_OKAY);
+ }
+
+ mutex_enter(&un->un_mx);
+ if (cs->cs_line < un->un_resync_line_index) {
+ mutex_exit(&un->un_mx);
+ return (RCL_OKAY);
+ }
+ mutex_exit(&un->un_mx);
+ return (RCL_ERRED);
+
+}
+
+/*
+ * NAMES: raid_state_cnt
+ *
+ * DESCRIPTION: counts the number of columns in a specific state
+ *
+ * PARAMETERS: mr_unit_t *un
+ * rcs_state_t state
+ */
+int
+raid_state_cnt(mr_unit_t *un, rcs_state_t state)
+{
+ int i, retval = 0;
+
+ for (i = 0; i < un->un_totalcolumncnt; i++)
+ if (un->un_column[i].un_devstate & state)
+ retval++;
+ return (retval);
+}
+
+/*
+ * NAMES: raid_io_overlaps
+ *
+ * DESCRIPTION: checks for overlap of two child save structures
+ *
+ * PARAMETERS: md_raidcs_t cs1
+ * md_raidcs_t cs2
+ *
+ * RETURNS: 0 - no overlap
+ * 1 - overlap
+ */
+int
+raid_io_overlaps(md_raidcs_t *cs1, md_raidcs_t *cs2)
+{
+ if (cs1->cs_blkno > cs2->cs_lastblk)
+ return (0);
+ if (cs1->cs_lastblk < cs2->cs_blkno)
+ return (0);
+ return (1);
+}
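+
+/*
+ * For example, child I/Os covering blocks 100..199 and 150..249
+ * overlap (raid_io_overlaps() returns 1), while 100..199 and
+ * 200..299 do not.
+ */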
+
+/*
+ * NAMES: raid_parent_constructor
+ * DESCRIPTION: parent structure constructor routine
+ * PARAMETERS:
+ */
+/*ARGSUSED1*/
+static int
+raid_parent_constructor(void *p, void *d1, int d2)
+{
+ mutex_init(&((md_raidps_t *)p)->ps_mx,
+ NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&((md_raidps_t *)p)->ps_mapin_mx,
+ NULL, MUTEX_DEFAULT, NULL);
+ return (0);
+}
+
+void
+raid_parent_init(md_raidps_t *ps)
+{
+ bzero(ps, offsetof(md_raidps_t, ps_mx));
+ ((md_raidps_t *)ps)->ps_flags = MD_RPS_INUSE;
+ ((md_raidps_t *)ps)->ps_magic = RAID_PSMAGIC;
+}
+
+/*ARGSUSED1*/
+static void
+raid_parent_destructor(void *p, void *d)
+{
+ mutex_destroy(&((md_raidps_t *)p)->ps_mx);
+ mutex_destroy(&((md_raidps_t *)p)->ps_mapin_mx);
+}
+
+/*
+ * NAMES: raid_child_constructor
+ * DESCRIPTION: child structure constructor routine
+ * PARAMETERS:
+ */
+/*ARGSUSED1*/
+static int
+raid_child_constructor(void *p, void *d1, int d2)
+{
+ md_raidcs_t *cs = (md_raidcs_t *)p;
+ mutex_init(&cs->cs_mx, NULL, MUTEX_DEFAULT, NULL);
+ bioinit(&cs->cs_dbuf);
+ bioinit(&cs->cs_pbuf);
+ bioinit(&cs->cs_hbuf);
+ return (0);
+}
+
+void
+raid_child_init(md_raidcs_t *cs)
+{
+ bzero(cs, offsetof(md_raidcs_t, cs_mx));
+
+ md_bioreset(&cs->cs_dbuf);
+ md_bioreset(&cs->cs_pbuf);
+ md_bioreset(&cs->cs_hbuf);
+
+ ((md_raidcs_t *)cs)->cs_dbuf.b_chain =
+ ((md_raidcs_t *)cs)->cs_pbuf.b_chain =
+ ((md_raidcs_t *)cs)->cs_hbuf.b_chain =
+ (struct buf *)(cs);
+
+ cs->cs_magic = RAID_CSMAGIC;
+ cs->cs_line = MD_DISKADDR_ERROR;
+ cs->cs_dpwslot = -1;
+ cs->cs_ppwslot = -1;
+}
+
+/*ARGSUSED1*/
+static void
+raid_child_destructor(void *p, void *d)
+{
+ biofini(&((md_raidcs_t *)p)->cs_dbuf);
+ biofini(&((md_raidcs_t *)p)->cs_hbuf);
+ biofini(&((md_raidcs_t *)p)->cs_pbuf);
+ mutex_destroy(&((md_raidcs_t *)p)->cs_mx);
+}
+
+/*ARGSUSED1*/
+static int
+raid_cbuf_constructor(void *p, void *d1, int d2)
+{
+ bioinit(&((md_raidcbuf_t *)p)->cbuf_bp);
+ return (0);
+}
+
+static void
+raid_cbuf_init(md_raidcbuf_t *cb)
+{
+ bzero(cb, offsetof(md_raidcbuf_t, cbuf_bp));
+ md_bioreset(&cb->cbuf_bp);
+ cb->cbuf_magic = RAID_BUFMAGIC;
+ cb->cbuf_pwslot = -1;
+ cb->cbuf_flags = CBUF_WRITE;
+}
+
+/*ARGSUSED1*/
+static void
+raid_cbuf_destructor(void *p, void *d)
+{
+ biofini(&((md_raidcbuf_t *)p)->cbuf_bp);
+}
+
+/*
+ * NAMES: raid_run_queue
+ * DESCRIPTION: spawn a backend processing daemon for RAID metadevice.
+ * PARAMETERS:
+ */
+/*ARGSUSED*/
+static void
+raid_run_queue(void *d)
+{
+ if (!(md_status & MD_GBL_DAEMONS_LIVE))
+ md_daemon(1, &md_done_daemon);
+}
+
+/*
+ * NAME: raid_build_pw_reservation
+ * DESCRIPTION: builds mr_pw_reserve for the column
+ * PARAMETERS: un is the pointer to the unit structure
+ * colindex is the column to create the structure for
+ */
+int
+raid_build_pw_reservation(mr_unit_t *un, int colindex)
+{
+ mr_pw_reserve_t *pw;
+ mr_scoreboard_t *sb;
+ int i;
+
+ pw = (mr_pw_reserve_t *) kmem_zalloc(sizeof (mr_pw_reserve_t) +
+ (sizeof (mr_scoreboard_t) * un->un_pwcnt), KM_SLEEP);
+ pw->pw_magic = RAID_PWMAGIC;
+ pw->pw_column = colindex;
+ pw->pw_free = un->un_pwcnt;
+ sb = &pw->pw_sb[0];
+ for (i = 0; i < un->un_pwcnt; i++) {
+ sb[i].sb_column = colindex;
+ sb[i].sb_flags = SB_UNUSED;
+ sb[i].sb_start_blk = 0;
+ sb[i].sb_last_blk = 0;
+ sb[i].sb_cs = NULL;
+ }
+ un->un_column_ic[colindex].un_pw_reserve = pw;
+ return (0);
+}
+/*
+ * NAME: raid_free_pw_reservation
+ * DESCRIPTION: RAID metadevice pre-write slot structure destroy routine
+ * PARAMETERS: mr_unit_t *un - pointer to a unit structure
+ * int colindex - index of the column whose pre-write slot struct
+ * is to be destroyed.
+ */
+void
+raid_free_pw_reservation(mr_unit_t *un, int colindex)
+{
+ mr_pw_reserve_t *pw = un->un_column_ic[colindex].un_pw_reserve;
+
+ kmem_free(pw, sizeof (mr_pw_reserve_t) +
+ (sizeof (mr_scoreboard_t) * un->un_pwcnt));
+}
+
+/*
+ * NAME: raid_cancel_pwslot
+ * DESCRIPTION: release the pre-write slots reserved by a child structure
+ * PARAMETERS: md_raidcs_t *cs - pointer to a child structure
+ */
+static void
+raid_cancel_pwslot(md_raidcs_t *cs)
+{
+ mr_unit_t *un = cs->cs_un;
+ mr_pw_reserve_t *pw;
+ mr_scoreboard_t *sb;
+ mr_column_ic_t *col;
+ md_raidcbuf_t *cbuf;
+ int broadcast = 0;
+
+ if (cs->cs_ps->ps_flags & MD_RPS_READ)
+ return;
+ if (cs->cs_dpwslot != -1) {
+ col = &un->un_column_ic[cs->cs_dcolumn];
+ pw = col->un_pw_reserve;
+ sb = &pw->pw_sb[cs->cs_dpwslot];
+ sb->sb_flags = SB_AVAIL;
+ if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))
+ broadcast++;
+ sb->sb_cs = NULL;
+ }
+
+ if (cs->cs_ppwslot != -1) {
+ col = &un->un_column_ic[cs->cs_pcolumn];
+ pw = col->un_pw_reserve;
+ sb = &pw->pw_sb[cs->cs_ppwslot];
+ sb->sb_flags = SB_AVAIL;
+ if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))
+ broadcast++;
+ sb->sb_cs = NULL;
+ }
+
+ for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
+ if (cbuf->cbuf_pwslot == -1)
+ continue;
+ col = &un->un_column_ic[cbuf->cbuf_column];
+ pw = col->un_pw_reserve;
+ sb = &pw->pw_sb[cbuf->cbuf_pwslot];
+ sb->sb_flags = SB_AVAIL;
+ if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))
+ broadcast++;
+ sb->sb_cs = NULL;
+ }
+ if (broadcast) {
+ cv_broadcast(&un->un_cv);
+ return;
+ }
+ mutex_enter(&un->un_mx);
+ if (un->un_rflags & MD_RFLAG_NEEDPW)
+ cv_broadcast(&un->un_cv);
+ mutex_exit(&un->un_mx);
+}
+
+static void
+raid_free_pwinvalidate(md_raidcs_t *cs)
+{
+ md_raidcbuf_t *cbuf;
+ md_raidcbuf_t *cbuf_to_free;
+ mr_unit_t *un = cs->cs_un;
+ mdi_unit_t *ui = MDI_UNIT(MD_SID(un));
+ mr_pw_reserve_t *pw;
+ mr_scoreboard_t *sb;
+ int broadcast = 0;
+
+ cbuf = cs->cs_pw_inval_list;
+ ASSERT(cbuf);
+ mutex_enter(&un->un_linlck_mx);
+ while (cbuf) {
+ pw = un->un_column_ic[cbuf->cbuf_column].un_pw_reserve;
+ sb = &pw->pw_sb[0];
+ ASSERT(sb[cbuf->cbuf_pwslot].sb_flags & SB_INVAL_PEND);
+ sb[cbuf->cbuf_pwslot].sb_flags = SB_UNUSED;
+ sb[cbuf->cbuf_pwslot].sb_cs = NULL;
+ if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW))
+ broadcast++;
+ cbuf_to_free = cbuf;
+ cbuf = cbuf->cbuf_next;
+ kmem_free(cbuf_to_free->cbuf_buffer, dbtob(un->un_iosize));
+ kmem_cache_free(raid_cbuf_cache, cbuf_to_free);
+ }
+ cs->cs_pw_inval_list = (md_raidcbuf_t *)NULL;
+ /*
+ * now that there is a free prewrite slot, check to see if there
+	 * are any I/O operations waiting; first run raid_io_startup(),
+	 * then signal the processes waiting in raid_write.
+ */
+ if (ui->ui_io_lock->io_list_front)
+ raid_io_startup(un);
+ mutex_exit(&un->un_linlck_mx);
+ if (broadcast) {
+ cv_broadcast(&un->un_cv);
+ return;
+ }
+ mutex_enter(&un->un_mx);
+ if (un->un_rflags & MD_RFLAG_NEEDPW)
+ cv_broadcast(&un->un_cv);
+ mutex_exit(&un->un_mx);
+}
+
+
+static int
+raid_get_pwslot(md_raidcs_t *cs, int column)
+{
+ mr_scoreboard_t *sb;
+ mr_pw_reserve_t *pw;
+ mr_unit_t *un = cs->cs_un;
+ diskaddr_t start_blk = cs->cs_blkno;
+ diskaddr_t last_blk = cs->cs_lastblk;
+ int i;
+ int pwcnt = un->un_pwcnt;
+ int avail = -1;
+ int use = -1;
+ int flags;
+
+
+ /* start with the data column */
+ pw = cs->cs_un->un_column_ic[column].un_pw_reserve;
+ sb = &pw->pw_sb[0];
+ ASSERT(pw->pw_free > 0);
+ for (i = 0; i < pwcnt; i++) {
+ flags = sb[i].sb_flags;
+ if (flags & SB_INVAL_PEND)
+ continue;
+
+ if ((avail == -1) && (flags & (SB_AVAIL | SB_UNUSED)))
+ avail = i;
+
+ if ((start_blk > sb[i].sb_last_blk) ||
+ (last_blk < sb[i].sb_start_blk))
+ continue;
+
+ /* OVERLAP */
+ ASSERT(! (sb[i].sb_flags & SB_INUSE));
+
+ /*
+		 * raid_invalidate_pwslot attempts to zero out a prewrite entry
+		 * in parallel with other disk reads/writes related to the
+		 * current transaction.  However, cs_frags accounting for this
+		 * case is broken because raid_write_io resets cs_frags,
+		 * ignoring that it could have been set to a value > 0 by
+		 * raid_invalidate_pwslot.  While this can be fixed, an
+		 * additional problem is that we don't seem to handle
+		 * correctly the case of getting a disk error for a prewrite
+		 * entry invalidation.  It does not look like we really need
+		 * to invalidate prewrite slots, because raid_replay sorts
+		 * prewrite ids in ascending order and during recovery the
+		 * latest prewrite entry for the same block will be replayed
+		 * last.  That's why I ifdef'd out the call to
+		 * raid_invalidate_pwslot.  --aguzovsk@east
+ */
+
+ if (use == -1) {
+ use = i;
+ }
+ }
+
+ ASSERT(avail != -1);
+ pw->pw_free--;
+ if (use == -1)
+ use = avail;
+
+ ASSERT(! (sb[use].sb_flags & SB_INUSE));
+ sb[use].sb_flags = SB_INUSE;
+ sb[use].sb_cs = cs;
+ sb[use].sb_start_blk = start_blk;
+ sb[use].sb_last_blk = last_blk;
+ ASSERT((use >= 0) && (use < un->un_pwcnt));
+ return (use);
+}
+
+static int
+raid_check_pw(md_raidcs_t *cs)
+{
+
+ mr_unit_t *un = cs->cs_un;
+ int i;
+
+ ASSERT(! (cs->cs_flags & MD_RCS_HAVE_PW_SLOTS));
+ /*
+	 * check to be sure there is a prewrite slot available for
+	 * each column involved; if not, return 1.
+ */
+ if (cs->cs_flags & MD_RCS_LINE) {
+ for (i = 0; i < un->un_totalcolumncnt; i++)
+ if (un->un_column_ic[i].un_pw_reserve->pw_free <= 0)
+ return (1);
+ return (0);
+ }
+
+ if (un->un_column_ic[cs->cs_dcolumn].un_pw_reserve->pw_free <= 0)
+ return (1);
+ if (un->un_column_ic[cs->cs_pcolumn].un_pw_reserve->pw_free <= 0)
+ return (1);
+ return (0);
+}
+static int
+raid_alloc_pwslot(md_raidcs_t *cs)
+{
+ mr_unit_t *un = cs->cs_un;
+ md_raidcbuf_t *cbuf;
+
+ ASSERT(! (cs->cs_flags & MD_RCS_HAVE_PW_SLOTS));
+ if (raid_check_pw(cs))
+ return (1);
+
+ mutex_enter(&un->un_mx);
+ un->un_pwid++;
+ cs->cs_pwid = un->un_pwid;
+ mutex_exit(&un->un_mx);
+
+ cs->cs_dpwslot = raid_get_pwslot(cs, cs->cs_dcolumn);
+ for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
+ cbuf->cbuf_pwslot = raid_get_pwslot(cs, cbuf->cbuf_column);
+ }
+ cs->cs_ppwslot = raid_get_pwslot(cs, cs->cs_pcolumn);
+
+ cs->cs_flags |= MD_RCS_HAVE_PW_SLOTS;
+
+ return (0);
+}
+
+/*
+ * NAMES: raid_build_incore
+ * DESCRIPTION: RAID metadevice incore structure building routine
+ * PARAMETERS: void *p - pointer to a unit structure
+ * int snarfing - a flag to indicate snarfing is required
+ */
+int
+raid_build_incore(void *p, int snarfing)
+{
+ mr_unit_t *un = (mr_unit_t *)p;
+ minor_t mnum = MD_SID(un);
+ mddb_recid_t hs_recid = 0;
+ int i;
+ int preserve_flags;
+ mr_column_t *column;
+ int iosize;
+ md_dev64_t hs, dev;
+ int resync_cnt = 0,
+ error_cnt = 0;
+
+ hs = NODEV64;
+ dev = NODEV64;
+
+	/* clear out bogus pointer in case we return(1) prior to alloc */
+ un->mr_ic = NULL;
+
+ if (MD_STATUS(un) & MD_UN_BEING_RESET) {
+ mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN);
+ return (1);
+ }
+
+ if (MD_UNIT(mnum) != NULL)
+ return (0);
+
+ if (snarfing)
+ MD_STATUS(un) = 0;
+
+ un->mr_ic = (mr_unit_ic_t *)kmem_zalloc(sizeof (*un->mr_ic),
+ KM_SLEEP);
+
+ un->un_column_ic = (mr_column_ic_t *)
+ kmem_zalloc(sizeof (mr_column_ic_t) *
+ un->un_totalcolumncnt, KM_SLEEP);
+
+ for (i = 0; i < un->un_totalcolumncnt; i++) {
+
+ column = &un->un_column[i];
+ preserve_flags = column->un_devflags &
+ (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC);
+ column->un_devflags &=
+ ~(MD_RAID_ALT_ISOPEN | MD_RAID_DEV_ISOPEN |
+ MD_RAID_WRITE_ALT);
+ if (raid_build_pw_reservation(un, i) != 0) {
+ /* could not build pwslot */
+ return (1);
+ }
+
+ if (snarfing) {
+ set_t setno = MD_MIN2SET(mnum);
+ dev = md_getdevnum(setno, mddb_getsidenum(setno),
+ column->un_orig_key, MD_NOTRUST_DEVT);
+ /*
+			 * Commented out instead of removed so we have history.
+			 * In the pre-SVM releases the stored devt was used, so
+			 * as long as there was one, snarf was always happy even
+			 * if the component was powered off.  This is not the
+			 * case in the current SVM implementation.  NODEV64 can
+			 * be returned, and in this case, since we resolve the
+			 * devt at 'open' time (first use of the metadevice),
+			 * we will allow snarf to continue.
+ *
+ * if (dev == NODEV64)
+ * return (1);
+ */
+
+ /*
+ * Setup un_orig_dev from device id info if the device
+ * is valid (not NODEV64).
+ */
+ if (dev != NODEV64)
+ column->un_orig_dev = dev;
+
+ if (column->un_devstate & RCS_RESYNC)
+ resync_cnt++;
+ if (column->un_devstate & (RCS_ERRED | RCS_LAST_ERRED))
+ error_cnt++;
+
+ if (HOTSPARED(un, i)) {
+ (void) md_hot_spare_ifc(HS_MKDEV,
+ 0, 0, 0, &column->un_hs_id, NULL,
+ &hs, NULL);
+ /*
+ * Same here
+ *
+ * if (hs == NODEV64)
+ * return (1);
+ */
+ }
+
+ if (HOTSPARED(un, i)) {
+ if (column->un_devstate &
+ (RCS_OKAY | RCS_LAST_ERRED)) {
+ column->un_dev = hs;
+ column->un_pwstart =
+ column->un_hs_pwstart;
+ column->un_devstart =
+ column->un_hs_devstart;
+ preserve_flags &=
+ ~(MD_RAID_COPY_RESYNC |
+ MD_RAID_REGEN_RESYNC);
+ } else if (column->un_devstate & RCS_RESYNC) {
+ /*
+ * if previous system was 4.0 set
+ * the direction flags
+ */
+ if ((preserve_flags &
+ (MD_RAID_COPY_RESYNC |
+ MD_RAID_REGEN_RESYNC)) == 0) {
+ if (column->un_alt_dev != NODEV64)
+ preserve_flags |=
+ MD_RAID_COPY_RESYNC;
+ else
+ preserve_flags |=
+ MD_RAID_REGEN_RESYNC;
+ }
+ }
+ } else { /* no hot spares */
+ column->un_dev = dev;
+ column->un_pwstart = column->un_orig_pwstart;
+ column->un_devstart = column->un_orig_devstart;
+ if (column->un_devstate & RCS_RESYNC) {
+ preserve_flags |= MD_RAID_REGEN_RESYNC;
+ preserve_flags &= ~MD_RAID_COPY_RESYNC;
+ }
+ }
+ if (! (column->un_devstate & RCS_RESYNC)) {
+ preserve_flags &=
+ ~(MD_RAID_REGEN_RESYNC |
+ MD_RAID_COPY_RESYNC);
+ }
+
+ column->un_devflags = preserve_flags;
+ column->un_alt_dev = NODEV64;
+ column->un_alt_pwstart = 0;
+ column->un_alt_devstart = 0;
+ un->un_resync_line_index = 0;
+ un->un_resync_index = 0;
+ un->un_percent_done = 0;
+ }
+ }
+
+ if (resync_cnt && error_cnt) {
+ for (i = 0; i < un->un_totalcolumncnt; i++) {
+ column = &un->un_column[i];
+ if (HOTSPARED(un, i) &&
+ (column->un_devstate & RCS_RESYNC) &&
+ (column->un_devflags & MD_RAID_COPY_RESYNC))
+ /* hotspare has data */
+ continue;
+
+ if (HOTSPARED(un, i) &&
+ (column->un_devstate & RCS_RESYNC)) {
+ /* hotspare does not have data */
+ raid_hs_release(HS_FREE, un, &hs_recid, i);
+ column->un_dev = column->un_orig_dev;
+ column->un_pwstart = column->un_orig_pwstart;
+ column->un_devstart = column->un_orig_devstart;
+ mddb_setrecprivate(hs_recid, MD_PRV_PENDCOM);
+ }
+
+ if (column->un_devstate & RCS_ERRED)
+ column->un_devstate = RCS_LAST_ERRED;
+
+ if (column->un_devstate & RCS_RESYNC)
+ column->un_devstate = RCS_ERRED;
+ }
+ }
+ mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCOM);
+
+ un->un_pwid = 1; /* or some other possible value */
+ un->un_magic = RAID_UNMAGIC;
+ iosize = un->un_iosize;
+ un->un_pbuffer = kmem_alloc(dbtob(iosize), KM_SLEEP);
+ un->un_dbuffer = kmem_alloc(dbtob(iosize), KM_SLEEP);
+ mutex_init(&un->un_linlck_mx, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&un->un_linlck_cv, NULL, CV_DEFAULT, NULL);
+ un->un_linlck_chn = NULL;
+ MD_UNIT(mnum) = un;
+
+
+ return (0);
+}
+
+/*
+ * NAMES: reset_raid
+ * DESCRIPTION: RAID metadevice reset routine
+ * PARAMETERS: mr_unit_t *un - pointer to a unit structure
+ * minor_t mnum - RAID metadevice minor number
+ * int removing - a flag indicating that the device names are to be
+ * removed from the MDDB database.
+ */
+void
+reset_raid(mr_unit_t *un, minor_t mnum, int removing)
+{
+ int i, n = 0;
+ sv_dev_t *sv;
+ mr_column_t *column;
+ int column_cnt = un->un_totalcolumncnt;
+ mddb_recid_t *recids, vtoc_id;
+ int hserr;
+
+ ASSERT((MDI_UNIT(mnum)->ui_io_lock->io_list_front == NULL) &&
+ (MDI_UNIT(mnum)->ui_io_lock->io_list_back == NULL));
+
+ md_destroy_unit_incore(mnum, &raid_md_ops);
+
+ MD_UNIT(mnum) = NULL;
+
+ if (un->un_pbuffer) {
+ kmem_free(un->un_pbuffer, dbtob(un->un_iosize));
+ un->un_pbuffer = NULL;
+ }
+ if (un->un_dbuffer) {
+ kmem_free(un->un_dbuffer, dbtob(un->un_iosize));
+ un->un_dbuffer = NULL;
+ }
+
+ /* free all pre-write slots created during build incore */
+ for (i = 0; i < un->un_totalcolumncnt; i++)
+ raid_free_pw_reservation(un, i);
+
+ kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) *
+ un->un_totalcolumncnt);
+
+ kmem_free(un->mr_ic, sizeof (*un->mr_ic));
+
+ if (!removing)
+ return;
+
+ sv = (sv_dev_t *)kmem_zalloc((column_cnt + 1) * sizeof (sv_dev_t),
+ KM_SLEEP);
+
+ recids = (mddb_recid_t *)
+ kmem_zalloc((column_cnt + 2) * sizeof (mddb_recid_t), KM_SLEEP);
+
+ for (i = 0; i < column_cnt; i++) {
+ md_unit_t *comp_un;
+ md_dev64_t comp_dev;
+
+ column = &un->un_column[i];
+ sv[i].setno = MD_MIN2SET(mnum);
+ sv[i].key = column->un_orig_key;
+ if (HOTSPARED(un, i)) {
+ if (column->un_devstate & (RCS_ERRED | RCS_LAST_ERRED))
+ hserr = HS_BAD;
+ else
+ hserr = HS_FREE;
+ raid_hs_release(hserr, un, &recids[n++], i);
+ }
+ /*
+ * deparent any metadevices.
+ * NOTE: currently soft partitions are the only metadevices
+ * allowed in RAID metadevices.
+ */
+ comp_dev = column->un_dev;
+ if (md_getmajor(comp_dev) == md_major) {
+ comp_un = MD_UNIT(md_getminor(comp_dev));
+ recids[n++] = MD_RECID(comp_un);
+ md_reset_parent(comp_dev);
+ }
+ }
+ /* decrement the reference count of the old hsp */
+ if (un->un_hsp_id != -1)
+ (void) md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0,
+ &recids[n++], NULL, NULL, NULL);
+ recids[n] = 0;
+ MD_STATUS(un) |= MD_UN_BEING_RESET;
+ vtoc_id = un->c.un_vtoc_id;
+
+ raid_commit(un, recids);
+
+
+ /* Remove the unit structure */
+ mddb_deleterec_wrapper(un->c.un_record_id);
+
+ /* Remove the vtoc, if present */
+ if (vtoc_id)
+ mddb_deleterec_wrapper(vtoc_id);
+ md_rem_names(sv, column_cnt);
+ kmem_free(sv, (column_cnt + 1) * sizeof (sv_dev_t));
+ kmem_free(recids, (column_cnt + 2) * sizeof (mddb_recid_t));
+
+ SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
+ MD_MIN2SET(mnum), mnum);
+}
+
+/*
+ * NAMES: raid_error_parent
+ * DESCRIPTION: mark a parent structure in error
+ * PARAMETERS: md_raidps_t *ps - pointer to parent structure
+ * int error - error value to set
+ * NOTE: (TBR) - this routine currently is not in use.
+ */
+static void
+raid_error_parent(md_raidps_t *ps, int error)
+{
+ mutex_enter(&ps->ps_mx);
+ ps->ps_flags |= MD_RPS_ERROR;
+ ps->ps_error = error;
+ mutex_exit(&ps->ps_mx);
+}
+
+/*
+ * The following defines tell raid_free_parent
+ * RFP_RLS_LOCK release the unit reader lock when done.
+ * RFP_DECR_PWFRAGS decrement ps_pwfrags
+ * RFP_DECR_FRAGS decrement ps_frags
+ * RFP_DECR_READFRAGS read keeps FRAGS and PWFRAGS in lockstep
+ */
+#define RFP_RLS_LOCK 0x00001
+#define RFP_DECR_PWFRAGS 0x00002
+#define RFP_DECR_FRAGS 0x00004
+#define RFP_DECR_READFRAGS (RFP_DECR_PWFRAGS | RFP_DECR_FRAGS)
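+
+/*
+ * A completed read, for example, calls
+ * raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK)
+ * (see raid_read_no_retry() below) so that ps_frags and ps_pwfrags
+ * stay in lockstep and the unit reader lock is released.
+ */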
+
+/*
+ * NAMES: raid_free_parent
+ * DESCRIPTION: free a parent structure
+ * PARAMETERS: md_raidps_t *ps - pointer to parent structure
+ * int todo - indicates what needs to be done
+ */
+static void
+raid_free_parent(md_raidps_t *ps, int todo)
+{
+ mdi_unit_t *ui = ps->ps_ui;
+
+ ASSERT(ps->ps_magic == RAID_PSMAGIC);
+ ASSERT(ps->ps_flags & MD_RPS_INUSE);
+ mutex_enter(&ps->ps_mx);
+ if (todo & RFP_DECR_PWFRAGS) {
+ ASSERT(ps->ps_pwfrags);
+ ps->ps_pwfrags--;
+ if (ps->ps_pwfrags == 0 && (! (ps->ps_flags & MD_RPS_IODONE))) {
+ if (ps->ps_flags & MD_RPS_ERROR) {
+ ps->ps_bp->b_flags |= B_ERROR;
+ ps->ps_bp->b_error = ps->ps_error;
+ }
+ md_kstat_done(ui, ps->ps_bp, 0);
+ biodone(ps->ps_bp);
+ ps->ps_flags |= MD_RPS_IODONE;
+ }
+ }
+
+ if (todo & RFP_DECR_FRAGS) {
+ ASSERT(ps->ps_frags);
+ ps->ps_frags--;
+ }
+
+ if (ps->ps_frags != 0) {
+ mutex_exit(&ps->ps_mx);
+ return;
+ }
+
+ ASSERT((ps->ps_frags == 0) && (ps->ps_pwfrags == 0));
+ mutex_exit(&ps->ps_mx);
+
+ if (todo & RFP_RLS_LOCK)
+ md_io_readerexit(ui);
+
+ if (panicstr) {
+ ps->ps_flags |= MD_RPS_DONE;
+ return;
+ }
+
+ if (ps->ps_flags & MD_RPS_HSREQ)
+ (void) raid_hotspares();
+
+ ASSERT(todo & RFP_RLS_LOCK);
+ ps->ps_flags &= ~MD_RPS_INUSE;
+
+ md_dec_iocount(MD_MIN2SET(ps->ps_un->c.un_self_id));
+
+ kmem_cache_free(raid_parent_cache, ps);
+}
+
+/*
+ * NAMES: raid_free_child
+ * DESCRIPTION: free a child structure
+ * PARAMETERS: md_raidcs_t *cs - pointer to child structure
+ * int drop_locks - 0 for no locks held
+ * NOTE: (TBR) - this routine currently is not in use.
+ */
+static void
+raid_free_child(md_raidcs_t *cs, int drop_locks)
+{
+ mr_unit_t *un = cs->cs_un;
+ md_raidcbuf_t *cbuf, *cbuf1;
+
+ if (cs->cs_pw_inval_list)
+ raid_free_pwinvalidate(cs);
+
+ if (drop_locks) {
+ ASSERT(cs->cs_flags & MD_RCS_LLOCKD &&
+ (cs->cs_flags & (MD_RCS_READER | MD_RCS_WRITER)));
+ md_unit_readerexit(MDI_UNIT(MD_SID(un)));
+ raid_line_exit(cs);
+ } else {
+ ASSERT(!(cs->cs_flags & MD_RCS_LLOCKD));
+ }
+
+ freebuffers(cs);
+ cbuf = cs->cs_buflist;
+ while (cbuf) {
+ cbuf1 = cbuf->cbuf_next;
+ kmem_cache_free(raid_cbuf_cache, cbuf);
+ cbuf = cbuf1;
+ }
+ if (cs->cs_dbuf.b_flags & B_REMAPPED)
+ bp_mapout(&cs->cs_dbuf);
+ kmem_cache_free(raid_child_cache, cs);
+}
+
+/*
+ * NAME: raid_regen_parity
+ *
+ * DESCRIPTION: This routine is used to regenerate the parity blocks
+ * for the entire raid device. It is called from
+ * both the regen thread and the IO path.
+ *
+ * On error the entire device is marked as in error by
+ * placing the erroring device in error and all other
+ * devices in last_errored.
+ *
+ * PARAMETERS: md_raidcs_t *cs
+ */
+void
+raid_regen_parity(md_raidcs_t *cs)
+{
+ mr_unit_t *un = cs->cs_un;
+ mdi_unit_t *ui = MDI_UNIT(un->c.un_self_id);
+ caddr_t buffer;
+ caddr_t parity_buffer;
+ buf_t *bp;
+ uint_t *dbuf, *pbuf;
+ uint_t colcnt = un->un_totalcolumncnt;
+ int column;
+ int parity_column = cs->cs_pcolumn;
+ size_t bcount;
+ int j;
+
+ /*
+ * This routine uses the data and parity buffers allocated to a
+ * write. In the case of a read the buffers are allocated and
+ * freed at the end.
+ */
+
+ ASSERT(IO_READER_HELD(un));
+ ASSERT(cs->cs_flags & MD_RCS_LLOCKD);
+ ASSERT(UNIT_READER_HELD(un));
+
+ if (raid_state_cnt(un, RCS_OKAY) != colcnt)
+ return;
+
+ if (cs->cs_flags & MD_RCS_READER) {
+ getpbuffer(cs);
+ getdbuffer(cs);
+ }
+ ASSERT(cs->cs_dbuffer && cs->cs_pbuffer);
+ bcount = cs->cs_bcount;
+ buffer = cs->cs_dbuffer;
+ parity_buffer = cs->cs_pbuffer;
+ bzero(parity_buffer, bcount);
+ bp = &cs->cs_dbuf;
+ for (column = 0; column < colcnt; column++) {
+ if (column == parity_column)
+ continue;
+ reset_buf(bp, B_READ | B_BUSY, bcount);
+ bp->b_un.b_addr = buffer;
+ bp->b_edev = md_dev64_to_dev(un->un_column[column].un_dev);
+ bp->b_lblkno = cs->cs_blkno + un->un_column[column].un_devstart;
+ bp->b_bcount = bcount;
+ bp->b_bufsize = bcount;
+ (void) md_call_strategy(bp, MD_STR_NOTTOP, NULL);
+ if (biowait(bp))
+ goto bail;
+ pbuf = (uint_t *)(void *)parity_buffer;
+ dbuf = (uint_t *)(void *)buffer;
+ for (j = 0; j < (bcount / (sizeof (uint_t))); j++) {
+ *pbuf = *pbuf ^ *dbuf;
+ pbuf++;
+ dbuf++;
+ }
+ }
+
+ reset_buf(bp, B_WRITE | B_BUSY, cs->cs_bcount);
+ bp->b_un.b_addr = parity_buffer;
+ bp->b_edev = md_dev64_to_dev(un->un_column[parity_column].un_dev);
+ bp->b_lblkno = cs->cs_blkno + un->un_column[parity_column].un_devstart;
+ bp->b_bcount = bcount;
+ bp->b_bufsize = bcount;
+ (void) md_call_strategy(bp, MD_STR_NOTTOP, NULL);
+ if (biowait(bp))
+ goto bail;
+
+ if (cs->cs_flags & MD_RCS_READER) {
+ freebuffers(cs);
+ cs->cs_pbuffer = NULL;
+ cs->cs_dbuffer = NULL;
+ }
+ bp->b_chain = (struct buf *)cs;
+ return;
+bail:
+ if (cs->cs_flags & MD_RCS_READER) {
+ freebuffers(cs);
+ cs->cs_pbuffer = NULL;
+ cs->cs_dbuffer = NULL;
+ }
+ md_unit_readerexit(ui);
+ un = md_unit_writerlock(ui);
+ raid_set_state(un, column, RCS_ERRED, 0);
+ for (column = 0; column < colcnt; column++)
+ raid_set_state(un, column, RCS_ERRED, 0);
+ raid_commit(un, NULL);
+ md_unit_writerexit(ui);
+ un = md_unit_readerlock(ui);
+ bp->b_chain = (struct buf *)cs;
+}
+
+/*
+ * NAMES: raid_error_state
+ * DESCRIPTION: check unit and column states' impact on I/O error
+ * NOTE: the state now may not be the state when the
+ * I/O completed due to race conditions.
+ * PARAMETERS: mr_unit_t *un - pointer to raid unit structure
+ * buf_t *bp - pointer to buffer structure
+ */
+static int
+raid_error_state(mr_unit_t *un, buf_t *bp)
+{
+ int column;
+ int i;
+
+ ASSERT(IO_READER_HELD(un));
+ ASSERT(UNIT_WRITER_HELD(un));
+
+ column = -1;
+ for (i = 0; i < un->un_totalcolumncnt; i++) {
+ if (un->un_column[i].un_dev == md_expldev(bp->b_edev)) {
+ column = i;
+ break;
+ }
+ if (un->un_column[i].un_alt_dev == md_expldev(bp->b_edev)) {
+ column = i;
+ break;
+ }
+ }
+
+ /* in case a replace snuck in while waiting on unit writer lock */
+
+ if (column == -1) {
+ return (0);
+ }
+
+ (void) raid_set_state(un, column, RCS_ERRED, 0);
+ ASSERT(un->un_state & (RUS_ERRED | RUS_LAST_ERRED));
+
+ raid_commit(un, NULL);
+ if (un->un_state & RUS_ERRED) {
+ SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_METADEVICE,
+ MD_UN2SET(un), MD_SID(un));
+ } else if (un->un_state & RUS_LAST_ERRED) {
+ SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, SVM_TAG_METADEVICE,
+ MD_UN2SET(un), MD_SID(un));
+ }
+
+ return (EIO);
+}
+
+/*
+ * NAME: raid_mapin_buf
+ * DESCRIPTION: wait for the input buffer header to be mapped in
+ * PARAMETERS: md_raidcs_t *cs - pointer to child structure
+ */
+static void
+raid_mapin_buf(md_raidcs_t *cs)
+{
+ md_raidps_t *ps = cs->cs_ps;
+
+ /*
+	 * check to see if the buffer is mapped in.  If all is OK, compute
+	 * the address of the data and return.  Since it is expensive to
+	 * grab a mutex this is only done if the mapin is not complete.
+	 * Once the mutex is acquired it is possible that the mapin was
+	 * not done, so recheck and if necessary do the mapin.
+ */
+ if (ps->ps_mapin > 0) {
+ cs->cs_addr = ps->ps_addr + cs->cs_offset;
+ return;
+ }
+ mutex_enter(&ps->ps_mapin_mx);
+ if (ps->ps_mapin > 0) {
+ cs->cs_addr = ps->ps_addr + cs->cs_offset;
+ mutex_exit(&ps->ps_mapin_mx);
+ return;
+ }
+ bp_mapin(ps->ps_bp);
+ /*
+ * get the new b_addr out of the parent since bp_mapin just changed it
+ */
+ ps->ps_addr = ps->ps_bp->b_un.b_addr;
+ cs->cs_addr = ps->ps_addr + cs->cs_offset;
+ ps->ps_mapin++;
+ mutex_exit(&ps->ps_mapin_mx);
+}
+
+/*
+ * NAMES: raid_read_no_retry
+ * DESCRIPTION: I/O retry routine for a RAID metadevice read
+ * the read failed while attempting to regenerate the data;
+ * no retry is possible since the error occurred in raid_raidregenloop().
+ * PARAMETERS: mr_unit_t *un - pointer to raid unit structure
+ * md_raidcs_t *cs - pointer to child structure
+ */
+/*ARGSUSED*/
+static void
+raid_read_no_retry(mr_unit_t *un, md_raidcs_t *cs)
+{
+ md_raidps_t *ps = cs->cs_ps;
+
+ raid_error_parent(ps, EIO);
+ raid_free_child(cs, 1);
+
+ /* decrement readfrags */
+ raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
+}
+
+/*
+ * NAMES: raid_read_retry
+ * DESCRIPTION: I/O retry routine for a RAID metadevice read
+ * PARAMETERS: md_raidcs_t *cs - pointer to child structure
+ */
+static void
+raid_read_retry(mr_unit_t *un, md_raidcs_t *cs)
+{
+ /* re-initialize the buf_t structure for raid_read() */
+ cs->cs_dbuf.b_chain = (struct buf *)cs;
+ cs->cs_dbuf.b_back = &cs->cs_dbuf;
+ cs->cs_dbuf.b_forw = &cs->cs_dbuf;
+ cs->cs_dbuf.b_flags = B_BUSY; /* initialize flags */
+ cs->cs_dbuf.b_error = 0; /* initialize error */
+ cs->cs_dbuf.b_offset = -1;
+ /* Initialize semaphores */
+ sema_init(&cs->cs_dbuf.b_io, 0, NULL,
+ SEMA_DEFAULT, NULL);
+ sema_init(&cs->cs_dbuf.b_sem, 0, NULL,
+ SEMA_DEFAULT, NULL);
+
+ cs->cs_pbuf.b_chain = (struct buf *)cs;
+ cs->cs_pbuf.b_back = &cs->cs_pbuf;
+ cs->cs_pbuf.b_forw = &cs->cs_pbuf;
+ cs->cs_pbuf.b_flags = B_BUSY; /* initialize flags */
+ cs->cs_pbuf.b_error = 0; /* initialize error */
+ cs->cs_pbuf.b_offset = -1;
+ sema_init(&cs->cs_pbuf.b_io, 0, NULL,
+ SEMA_DEFAULT, NULL);
+ sema_init(&cs->cs_pbuf.b_sem, 0, NULL,
+ SEMA_DEFAULT, NULL);
+
+ cs->cs_flags &= ~MD_RCS_ERROR; /* reset child error flag */
+ cs->cs_flags |= MD_RCS_RECOVERY; /* set RECOVERY flag */
+
+ /*
+	 * re-scheduling I/O with raid_read_io() is simpler.  Basically,
+	 * raid_read_io() is invoked again with the same child structure.
+	 * (NOTE: we aren't supposed to do any error recovery when an I/O
+	 * error occurred in raid_raidregenloop().)
+ */
+ raid_mapin_buf(cs);
+ raid_read_io(un, cs);
+}
+
+/*
+ * NAMES: raid_rderr
+ * DESCRIPTION: I/O error handling routine for a RAID metadevice read
+ * PARAMETERS: md_raidcs_t *cs - pointer to child structure
+ * LOCKS: must obtain unit writer lock while calling raid_error_state
+ * since a unit or column state transition may take place.
+ * must obtain unit reader lock to retry I/O.
+ */
+/*ARGSUSED*/
+static void
+raid_rderr(md_raidcs_t *cs)
+{
+ md_raidps_t *ps;
+ mdi_unit_t *ui;
+ mr_unit_t *un;
+ int error = 0;
+
+ ps = cs->cs_ps;
+ ui = ps->ps_ui;
+ un = (mr_unit_t *)md_unit_writerlock(ui);
+ ASSERT(un != 0);
+
+ if (cs->cs_dbuf.b_flags & B_ERROR)
+ error = raid_error_state(un, &cs->cs_dbuf);
+ if (cs->cs_pbuf.b_flags & B_ERROR)
+ error |= raid_error_state(un, &cs->cs_pbuf);
+
+ md_unit_writerexit(ui);
+
+ ps->ps_flags |= MD_RPS_HSREQ;
+
+ un = (mr_unit_t *)md_unit_readerlock(ui);
+ ASSERT(un != 0);
+ /* now attempt the appropriate retry routine */
+ (*(cs->cs_retry_call))(un, cs);
+}
+
+
+/*
+ * NAMES: raid_read_error
+ * DESCRIPTION: I/O error handling routine for a RAID metadevice read
+ * PARAMETERS: md_raidcs_t *cs - pointer to child structure
+ */
+/*ARGSUSED*/
+static void
+raid_read_error(md_raidcs_t *cs)
+{
+ md_raidps_t *ps;
+ mdi_unit_t *ui;
+ mr_unit_t *un;
+ set_t setno;
+
+ ps = cs->cs_ps;
+ ui = ps->ps_ui;
+ un = cs->cs_un;
+
+ setno = MD_UN2SET(un);
+
+ if ((cs->cs_dbuf.b_flags & B_ERROR) &&
+ (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_ERRED) &&
+ (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_LAST_ERRED))
+ cmn_err(CE_WARN, "md %s: read error on %s",
+ md_shortname(MD_SID(un)),
+ md_devname(setno, md_expldev(cs->cs_dbuf.b_edev), NULL, 0));
+
+ if ((cs->cs_pbuf.b_flags & B_ERROR) &&
+ (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_ERRED) &&
+ (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_LAST_ERRED))
+ cmn_err(CE_WARN, "md %s: read error on %s",
+ md_shortname(MD_SID(un)),
+ md_devname(setno, md_expldev(cs->cs_pbuf.b_edev), NULL, 0));
+
+ md_unit_readerexit(ui);
+
+ ASSERT(cs->cs_frags == 0);
+
+ /* now schedule processing for possible state change */
+ daemon_request(&md_mstr_daemon, raid_rderr,
+ (daemon_queue_t *)cs, REQ_OLD);
+
+}
+
+/*
+ * NAMES: getdbuffer
+ * DESCRIPTION: data buffer allocation for a child structure
+ * PARAMETERS: md_raidcs_t *cs - pointer to child structure
+ *
+ * NOTE: always get dbuffer before pbuffer
+ * and get both buffers before pwslot
+ * otherwise a deadlock could be introduced.
+ */
+static void
+getdbuffer(md_raidcs_t *cs)
+{
+ mr_unit_t *un;
+
+ cs->cs_dbuffer = kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_NOSLEEP);
+ if (cs->cs_dbuffer != NULL)
+ return;
+ un = cs->cs_ps->ps_un;
+ mutex_enter(&un->un_mx);
+ while (un->un_dbuffer == NULL) {
+ STAT_INC(data_buffer_waits);
+ un->un_rflags |= MD_RFLAG_NEEDBUF;
+ cv_wait(&un->un_cv, &un->un_mx);
+ }
+ cs->cs_dbuffer = un->un_dbuffer;
+ cs->cs_flags |= MD_RCS_UNDBUF;
+ un->un_dbuffer = NULL;
+ mutex_exit(&un->un_mx);
+}
+
+/*
+ * NAMES: getpbuffer
+ * DESCRIPTION: parity buffer allocation for a child structure
+ * PARAMETERS: md_raidcs_t *cs - pointer to child structure
+ *
+ * NOTE: always get dbuffer before pbuffer
+ * and get both buffers before pwslot
+ * otherwise a deadlock could be introduced.
+ */
+static void
+getpbuffer(md_raidcs_t *cs)
+{
+ mr_unit_t *un;
+
+ cs->cs_pbuffer = kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_NOSLEEP);
+ if (cs->cs_pbuffer != NULL)
+ return;
+ un = cs->cs_ps->ps_un;
+ mutex_enter(&un->un_mx);
+ while (un->un_pbuffer == NULL) {
+ STAT_INC(parity_buffer_waits);
+ un->un_rflags |= MD_RFLAG_NEEDBUF;
+ cv_wait(&un->un_cv, &un->un_mx);
+ }
+ cs->cs_pbuffer = un->un_pbuffer;
+ cs->cs_flags |= MD_RCS_UNPBUF;
+ un->un_pbuffer = NULL;
+ mutex_exit(&un->un_mx);
+}
+static void
+getresources(md_raidcs_t *cs)
+{
+ md_raidcbuf_t *cbuf;
+ /*
+ * NOTE: always get dbuffer before pbuffer
+ * and get both buffers before pwslot
+ * otherwise a deadlock could be introduced.
+ */
+ getdbuffer(cs);
+ getpbuffer(cs);
+ for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next)
+ cbuf->cbuf_buffer =
+ kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_SLEEP);
+}
+/*
+ * NAMES: freebuffers
+ * DESCRIPTION: child structure buffer freeing routine
+ * PARAMETERS: md_raidcs_t *cs - pointer to child structure
+ */
+static void
+freebuffers(md_raidcs_t *cs)
+{
+ mr_unit_t *un;
+ md_raidcbuf_t *cbuf;
+
+ /* free buffers used for full line write */
+ for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
+ if (cbuf->cbuf_buffer == NULL)
+ continue;
+ kmem_free(cbuf->cbuf_buffer, cbuf->cbuf_bcount + DEV_BSIZE);
+ cbuf->cbuf_buffer = NULL;
+ cbuf->cbuf_bcount = 0;
+ }
+
+ if (cs->cs_flags & (MD_RCS_UNDBUF | MD_RCS_UNPBUF)) {
+ un = cs->cs_un;
+ mutex_enter(&un->un_mx);
+ }
+ if (cs->cs_dbuffer) {
+ if (cs->cs_flags & MD_RCS_UNDBUF)
+ un->un_dbuffer = cs->cs_dbuffer;
+ else
+ kmem_free(cs->cs_dbuffer, cs->cs_bcount + DEV_BSIZE);
+ }
+ if (cs->cs_pbuffer) {
+ if (cs->cs_flags & MD_RCS_UNPBUF)
+ un->un_pbuffer = cs->cs_pbuffer;
+ else
+ kmem_free(cs->cs_pbuffer, cs->cs_bcount + DEV_BSIZE);
+ }
+ if (cs->cs_flags & (MD_RCS_UNDBUF | MD_RCS_UNPBUF)) {
+ un->un_rflags &= ~MD_RFLAG_NEEDBUF;
+ cv_broadcast(&un->un_cv);
+ mutex_exit(&un->un_mx);
+ }
+}
+
+/*
+ * NAMES: raid_line_reader_lock, raid_line_writer_lock
+ * DESCRIPTION: RAID metadevice line reader and writer lock routines.
+ * PARAMETERS: md_raidcs_t *cs - pointer to child structure
+ */
+
+void
+raid_line_reader_lock(md_raidcs_t *cs, int resync_thread)
+{
+ mr_unit_t *un;
+ md_raidcs_t *cs1;
+
+ ASSERT(cs->cs_line != MD_DISKADDR_ERROR);
+ un = cs->cs_un;
+ cs->cs_flags |= MD_RCS_READER;
+ STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx));
+ if (!panicstr)
+ mutex_enter(&un->un_linlck_mx);
+ cs1 = un->un_linlck_chn;
+ while (cs1 != NULL) {
+ for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next)
+ if (raid_io_overlaps(cs, cs1) == 1)
+ if (cs1->cs_flags & MD_RCS_WRITER)
+ break;
+
+ if (cs1 != NULL) {
+ if (panicstr)
+ panic("md; raid line write lock held");
+ un->un_linlck_flg = 1;
+ cv_wait(&un->un_linlck_cv, &un->un_linlck_mx);
+ STAT_INC(raid_read_waits);
+ }
+ }
+ STAT_MAX(raid_max_reader_locks, raid_reader_locks_active);
+ STAT_INC(raid_reader_locks);
+ cs1 = un->un_linlck_chn;
+ if (cs1 != NULL)
+ cs1->cs_linlck_prev = cs;
+ cs->cs_linlck_next = cs1;
+ cs->cs_linlck_prev = NULL;
+ un->un_linlck_chn = cs;
+ cs->cs_flags |= MD_RCS_LLOCKD;
+ if (resync_thread) {
+ diskaddr_t lastblk = cs->cs_blkno + cs->cs_blkcnt - 1;
+ diskaddr_t line = (lastblk + 1) / un->un_segsize;
+ ASSERT(raid_state_cnt(un, RCS_RESYNC));
+ mutex_enter(&un->un_mx);
+ un->un_resync_line_index = line;
+ mutex_exit(&un->un_mx);
+ }
+ if (!panicstr)
+ mutex_exit(&un->un_linlck_mx);
+}
+
+int
+raid_line_writer_lock(md_raidcs_t *cs, int lock)
+{
+ mr_unit_t *un;
+ md_raidcs_t *cs1;
+
+ ASSERT(cs->cs_line != MD_DISKADDR_ERROR);
+ cs->cs_flags |= MD_RCS_WRITER;
+ un = cs->cs_ps->ps_un;
+
+ STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx));
+ if (lock && !panicstr)
+ mutex_enter(&un->un_linlck_mx);
+ ASSERT(MUTEX_HELD(&un->un_linlck_mx));
+
+ cs1 = un->un_linlck_chn;
+ for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next)
+ if (raid_io_overlaps(cs, cs1))
+ break;
+
+ if (cs1 != NULL) {
+ if (panicstr)
+ panic("md: line writer lock inaccessible");
+ goto no_lock_exit;
+ }
+
+ if (raid_alloc_pwslot(cs)) {
+ if (panicstr)
+ panic("md: no prewrite slots");
+ STAT_INC(raid_prewrite_waits);
+ goto no_lock_exit;
+ }
+
+ cs1 = un->un_linlck_chn;
+ if (cs1 != NULL)
+ cs1->cs_linlck_prev = cs;
+ cs->cs_linlck_next = cs1;
+ cs->cs_linlck_prev = NULL;
+ un->un_linlck_chn = cs;
+ cs->cs_flags |= MD_RCS_LLOCKD;
+ cs->cs_flags &= ~MD_RCS_WAITING;
+ STAT_INC(raid_writer_locks);
+ STAT_MAX(raid_max_write_locks, raid_write_locks_active);
+ if (lock && !panicstr)
+ mutex_exit(&un->un_linlck_mx);
+ return (0);
+
+no_lock_exit:
+ /* if this is already queued then do not requeue it */
+ ASSERT(! (cs->cs_flags & MD_RCS_LLOCKD));
+ if (!lock || (cs->cs_flags & MD_RCS_WAITING))
+ return (1);
+ cs->cs_flags |= MD_RCS_WAITING;
+ cs->cs_un = un;
+ raid_enqueue(cs);
+ if (lock && !panicstr)
+ mutex_exit(&un->un_linlck_mx);
+ return (1);
+}
+
+static void
+raid_startio(md_raidcs_t *cs)
+{
+ mdi_unit_t *ui = cs->cs_ps->ps_ui;
+ mr_unit_t *un = cs->cs_un;
+
+ un = md_unit_readerlock(ui);
+ raid_write_io(un, cs);
+}
+
+void
+raid_io_startup(mr_unit_t *un)
+{
+ md_raidcs_t *waiting_list, *cs1;
+ md_raidcs_t *previous = NULL, *next = NULL;
+ mdi_unit_t *ui = MDI_UNIT(un->c.un_self_id);
+ kmutex_t *io_list_mutex = &ui->ui_io_lock->io_list_mutex;
+
+ ASSERT(MUTEX_HELD(&un->un_linlck_mx));
+ mutex_enter(io_list_mutex);
+
+ /*
+ * check to be sure there are no reader locks outstanding. If
+ * there are not then pass on the writer lock.
+ */
+ waiting_list = ui->ui_io_lock->io_list_front;
+ while (waiting_list) {
+ ASSERT(waiting_list->cs_flags & MD_RCS_WAITING);
+ ASSERT(! (waiting_list->cs_flags & MD_RCS_LLOCKD));
+ for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next)
+ if (raid_io_overlaps(waiting_list, cs1) == 1)
+ break;
+ /*
+		 * there is an I/O that overlaps this one, so go on to
+		 * the next I/O in the waiting list
+ */
+ if (cs1) {
+ previous = waiting_list;
+ waiting_list = waiting_list->cs_linlck_next;
+ continue;
+ }
+
+ /*
+ * There are no IOs that overlap this, so remove it from
+ * the waiting queue, and start it
+ */
+
+ if (raid_check_pw(waiting_list)) {
+ ASSERT(waiting_list->cs_flags & MD_RCS_WAITING);
+ previous = waiting_list;
+ waiting_list = waiting_list->cs_linlck_next;
+ continue;
+ }
+ ASSERT(waiting_list->cs_flags & MD_RCS_WAITING);
+
+ next = waiting_list->cs_linlck_next;
+ if (previous)
+ previous->cs_linlck_next = next;
+ else
+ ui->ui_io_lock->io_list_front = next;
+
+ if (ui->ui_io_lock->io_list_front == NULL)
+ ui->ui_io_lock->io_list_back = NULL;
+
+ if (ui->ui_io_lock->io_list_back == waiting_list)
+ ui->ui_io_lock->io_list_back = previous;
+
+ waiting_list->cs_linlck_next = NULL;
+ waiting_list->cs_flags &= ~MD_RCS_WAITING;
+ STAT_DEC(raid_write_queue_length);
+ if (raid_line_writer_lock(waiting_list, 0))
+ panic("region locking corrupted");
+
+ ASSERT(waiting_list->cs_flags & MD_RCS_LLOCKD);
+ daemon_request(&md_mstr_daemon, raid_startio,
+ (daemon_queue_t *)waiting_list, REQ_OLD);
+ waiting_list = next;
+
+ }
+ mutex_exit(io_list_mutex);
+}
+
+void
+raid_line_exit(md_raidcs_t *cs)
+{
+ mr_unit_t *un;
+
+ un = cs->cs_ps->ps_un;
+ STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx));
+ mutex_enter(&un->un_linlck_mx);
+ if (cs->cs_flags & MD_RCS_READER)
+ STAT_DEC(raid_reader_locks_active);
+ else
+ STAT_DEC(raid_write_locks_active);
+
+ if (cs->cs_linlck_prev)
+ cs->cs_linlck_prev->cs_linlck_next = cs->cs_linlck_next;
+ else
+ un->un_linlck_chn = cs->cs_linlck_next;
+ if (cs->cs_linlck_next)
+ cs->cs_linlck_next->cs_linlck_prev = cs->cs_linlck_prev;
+
+ cs->cs_flags &= ~MD_RCS_LLOCKD;
+
+ if (un->un_linlck_flg)
+ cv_broadcast(&un->un_linlck_cv);
+
+ un->un_linlck_flg = 0;
+ cs->cs_line = MD_DISKADDR_ERROR;
+
+ raid_cancel_pwslot(cs);
+ /*
+	 * now that the lock is dropped go ahead and see if there are any
+ * other writes that can be started up
+ */
+ raid_io_startup(un);
+
+ mutex_exit(&un->un_linlck_mx);
+}
+
+/*
+ * NAMES: raid_line, raid_pcolumn, raid_dcolumn
+ * DESCRIPTION: RAID metadevice APIs for mapping segment # to line #,
+ * data column # and parity column #.
+ * PARAMETERS: int segment - segment number
+ * mr_unit_t *un - pointer to a unit structure
+ * RETURNS: raid_line returns line #
+ * raid_dcolumn returns data column #
+ * raid_pcolumn returns parity column #
+ */
+static diskaddr_t
+raid_line(diskaddr_t segment, mr_unit_t *un)
+{
+ diskaddr_t adj_seg;
+ diskaddr_t line;
+ diskaddr_t max_orig_segment;
+
+ max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn;
+ if (segment >= max_orig_segment) {
+ adj_seg = segment - max_orig_segment;
+ line = adj_seg % un->un_segsincolumn;
+ } else {
+ line = segment / (un->un_origcolumncnt - 1);
+ }
+ return (line);
+}
+
+uint_t
+raid_dcolumn(diskaddr_t segment, mr_unit_t *un)
+{
+ diskaddr_t adj_seg;
+ diskaddr_t line;
+ diskaddr_t max_orig_segment;
+ uint_t column;
+
+ max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn;
+ if (segment >= max_orig_segment) {
+ adj_seg = segment - max_orig_segment;
+ column = un->un_origcolumncnt +
+ (uint_t)(adj_seg / un->un_segsincolumn);
+ } else {
+ line = segment / (un->un_origcolumncnt - 1);
+ column = (uint_t)((segment % (un->un_origcolumncnt - 1) + line)
+ % un->un_origcolumncnt);
+ }
+ return (column);
+}
+
+uint_t
+raid_pcolumn(diskaddr_t segment, mr_unit_t *un)
+{
+ diskaddr_t adj_seg;
+ diskaddr_t line;
+ diskaddr_t max_orig_segment;
+ uint_t column;
+
+ max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn;
+ if (segment >= max_orig_segment) {
+ adj_seg = segment - max_orig_segment;
+ line = adj_seg % un->un_segsincolumn;
+ } else {
+ line = segment / (un->un_origcolumncnt - 1);
+ }
+ column = (uint_t)((line + (un->un_origcolumncnt - 1))
+ % un->un_origcolumncnt);
+ return (column);
+}
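+
+/*
+ * Worked example using the geometry from the raid_iosetup() comment
+ * below (4 original columns, 10 segments per column): for segment 7,
+ * raid_line() yields 7 / (4 - 1) = 2, raid_dcolumn() yields
+ * ((7 % 3) + 2) % 4 = 3 and raid_pcolumn() yields (2 + 3) % 4 = 1,
+ * i.e. the data lands in column 3 of line 2 with parity in column 1.
+ */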
+
+
+/*
+ * Called from raid_iosetup() to probe each column to ensure that
+ * all the columns are in the 'okay' state and meet the 'full line'
+ * requirement.  If any column is in error, we don't want to enable
+ * the 'full line' flag.  Previously, we would do so and disable it
+ * only when an error was detected after the first 'full line' I/O,
+ * which is too late and can lead to data corruption.
+ */
+static int
+raid_check_cols(mr_unit_t *un)
+{
+ buf_t bp;
+ char *buf;
+ mr_column_t *colptr;
+ minor_t mnum = MD_SID(un);
+ int i;
+ int err = 0;
+
+ buf = kmem_zalloc((uint_t)DEV_BSIZE, KM_SLEEP);
+
+ for (i = 0; i < un->un_totalcolumncnt; i++) {
+ md_dev64_t tmpdev;
+
+ colptr = &un->un_column[i];
+
+ tmpdev = colptr->un_dev;
+ /*
+ * Open by device id
+ * If this device is hotspared
+ * use the hotspare key
+ */
+ tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ?
+ colptr->un_hs_key : colptr->un_orig_key);
+
+ if (tmpdev == NODEV64) {
+ err = 1;
+ break;
+ }
+
+ colptr->un_dev = tmpdev;
+
+ bzero((caddr_t)&bp, sizeof (buf_t));
+ bp.b_back = &bp;
+ bp.b_forw = &bp;
+ bp.b_flags = (B_READ | B_BUSY);
+ sema_init(&bp.b_io, 0, NULL,
+ SEMA_DEFAULT, NULL);
+ sema_init(&bp.b_sem, 0, NULL,
+ SEMA_DEFAULT, NULL);
+ bp.b_edev = md_dev64_to_dev(colptr->un_dev);
+ bp.b_lblkno = colptr->un_pwstart;
+ bp.b_bcount = DEV_BSIZE;
+ bp.b_bufsize = DEV_BSIZE;
+ bp.b_un.b_addr = (caddr_t)buf;
+ (void) md_call_strategy(&bp, 0, NULL);
+ if (biowait(&bp)) {
+ err = 1;
+ break;
+ }
+ }
+
+ kmem_free(buf, DEV_BSIZE);
+ return (err);
+}
+
+/*
+ * NAME: raid_iosetup
+ * DESCRIPTION: RAID metadevice specific I/O set up routine which does
+ * all the necessary calculations to determine the location
+ * of the segment for the I/O.
+ * PARAMETERS: mr_unit_t *un - pointer to the RAID unit structure
+ * diskaddr_t blkno - block number of the I/O attempt
+ * size_t blkcnt - block count for this I/O
+ * md_raidcs_t *cs - child structure for each segmented I/O
+ *
+ * NOTE: The following is an example of a raid disk layout:
+ *
+ * Total Column = 5
+ * Original Column = 4
+ * Segment Per Column = 10
+ *
+ * Col#0 Col#1 Col#2 Col#3 Col#4 Col#5 Col#6
+ * -------------------------------------------------------------
+ * line#0 Seg#0 Seg#1 Seg#2 Parity Seg#30 Seg#40
+ * line#1 Parity Seg#3 Seg#4 Seg#5 Seg#31
+ * line#2 Seg#8 Parity Seg#6 Seg#7 Seg#32
+ * line#3 Seg#10 Seg#11 Parity Seg#9 Seg#33
+ * line#4 Seg#12 Seg#13 Seg#14 Parity Seg#34
+ * line#5 Parity Seg#15 Seg#16 Seg#17 Seg#35
+ * line#6 Seg#20 Parity Seg#18 Seg#19 Seg#36
+ * line#7 Seg#22 Seg#23 Parity Seg#21 Seg#37
+ * line#8 Seg#24 Seg#25 Seg#26 Parity Seg#38
+ * line#9 Parity Seg#27 Seg#28 Seg#29 Seg#39
+ */
+static size_t
+raid_iosetup(
+ mr_unit_t *un,
+ diskaddr_t blkno,
+ size_t blkcnt,
+ md_raidcs_t *cs
+)
+{
+ diskaddr_t segment;
+ diskaddr_t segstart;
+ diskaddr_t segoff;
+ size_t leftover;
+ diskaddr_t line;
+ uint_t iosize;
+ uint_t colcnt;
+
+ /* calculate the segment# and offset for the block */
+ segment = blkno / un->un_segsize;
+ segstart = segment * un->un_segsize;
+ segoff = blkno - segstart;
+ iosize = un->un_iosize - 1;
+ colcnt = un->un_totalcolumncnt - 1;
+ line = raid_line(segment, un);
+ cs->cs_dcolumn = raid_dcolumn(segment, un);
+ cs->cs_pcolumn = raid_pcolumn(segment, un);
+ cs->cs_dflags = un->un_column[cs->cs_dcolumn].un_devflags;
+ cs->cs_pflags = un->un_column[cs->cs_pcolumn].un_devflags;
+ cs->cs_line = line;
+
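+ /*
+ * A write is handled as a 'full line' write only when it is line
+ * aligned, covers at least one complete line, the unit has not been
+ * expanded (totalcolumncnt == origcolumncnt), every column is in the
+ * okay state, the prewrite i/o size limits are met and
+ * raid_check_cols() can read from each column.  Anything else falls
+ * through to the per-segment setup below.
+ */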
+ if ((cs->cs_ps->ps_flags & MD_RPS_WRITE) &&
+ (UNIT_STATE(un) & RCS_OKAY) &&
+ (segoff == 0) &&
+ (un->un_totalcolumncnt == un->un_origcolumncnt) &&
+ (un->un_segsize < un->un_iosize) &&
+ (un->un_iosize <= un->un_maxio) &&
+ (blkno == line * un->un_segsize * colcnt) &&
+ (blkcnt >= ((un->un_totalcolumncnt -1) * un->un_segsize)) &&
+ (raid_state_cnt(un, RCS_OKAY) == un->un_origcolumncnt) &&
+ (raid_check_cols(un) == 0)) {
+
+ md_raidcbuf_t **cbufp;
+ md_raidcbuf_t *cbuf;
+ int i, j;
+
+ STAT_INC(raid_full_line_writes);
+ leftover = blkcnt - (un->un_segsize * colcnt);
+ ASSERT(blkcnt >= (un->un_segsize * colcnt));
+ cs->cs_blkno = line * un->un_segsize;
+ cs->cs_blkcnt = un->un_segsize;
+ cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1;
+ cs->cs_bcount = dbtob(cs->cs_blkcnt);
+ cs->cs_flags |= MD_RCS_LINE;
+
+ cbufp = &cs->cs_buflist;
+ for (i = 0; i < un->un_totalcolumncnt; i++) {
+ j = cs->cs_dcolumn + i;
+ j = j % un->un_totalcolumncnt;
+
+ if ((j == cs->cs_dcolumn) || (j == cs->cs_pcolumn))
+ continue;
+ cbuf = kmem_cache_alloc(raid_cbuf_cache,
+ MD_ALLOCFLAGS);
+ raid_cbuf_init(cbuf);
+ cbuf->cbuf_un = cs->cs_un;
+ cbuf->cbuf_ps = cs->cs_ps;
+ cbuf->cbuf_column = j;
+ cbuf->cbuf_bcount = dbtob(un->un_segsize);
+ *cbufp = cbuf;
+ cbufp = &cbuf->cbuf_next;
+ }
+ return (leftover);
+ }
+
+ leftover = blkcnt - (un->un_segsize - segoff);
+ if (blkcnt > (un->un_segsize - segoff))
+ blkcnt -= leftover;
+ else
+ leftover = 0;
+
+ if (blkcnt > (size_t)iosize) {
+ leftover += (blkcnt - iosize);
+ blkcnt = iosize;
+ }
+
+ /* calculate the line# and column# for the segment */
+ cs->cs_flags &= ~MD_RCS_LINE;
+ cs->cs_blkno = line * un->un_segsize + segoff;
+ cs->cs_blkcnt = (uint_t)blkcnt;
+ cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1;
+ cs->cs_bcount = dbtob((uint_t)blkcnt);
+ return (leftover);
+}
+
+/*
+ * NAME: raid_done
+ * DESCRIPTION: RAID metadevice I/O done interrupt routine
+ * PARAMETERS: struct buf *bp - pointer to a buffer structure
+ */
+static void
+raid_done(struct buf *bp)
+{
+ md_raidcs_t *cs;
+ int flags, frags;
+
+ sema_v(&bp->b_io);
+ cs = (md_raidcs_t *)bp->b_chain;
+
+ ASSERT(cs != NULL);
+
+ mutex_enter(&cs->cs_mx);
+ if (bp->b_flags & B_ERROR) {
+ cs->cs_flags |= MD_RCS_ERROR;
+ cs->cs_flags &= ~(MD_RCS_ISCALL);
+ }
+
+ flags = cs->cs_flags;
+ frags = --cs->cs_frags;
+ mutex_exit(&cs->cs_mx);
+ if (frags != 0) {
+ return;
+ }
+
+ if (flags & MD_RCS_ERROR) {
+ if (cs->cs_error_call) {
+ daemon_request(&md_done_daemon, cs->cs_error_call,
+ (daemon_queue_t *)cs, REQ_OLD);
+ }
+ return;
+ }
+
+ if (flags & MD_RCS_ISCALL) {
+ cs->cs_flags &= ~(MD_RCS_ISCALL);
+ (*(cs->cs_call))(cs);
+ return;
+ }
+ daemon_request(&md_done_daemon, cs->cs_call,
+ (daemon_queue_t *)cs, REQ_OLD);
+}
+/*
+ * the flag RIO_EXTRA is used when dealing with a column in the process
+ * of being resynced. During the resync, writes may have to take place
+ * on both the original component and a hotspare component.
+ */
+#define RIO_DATA 0x00100 /* use data buffer & data column */
+#define RIO_PARITY 0x00200 /* use parity buffer & parity column */
+#define RIO_WRITE 0x00400 /* issue a write */
+#define RIO_READ 0x00800 /* issue a read */
+#define RIO_PWIO 0x01000 /* do the I/O to the prewrite entry */
+#define RIO_ALT 0x02000 /* do write to alternate device */
+#define RIO_EXTRA 0x04000 /* use extra buffer */
+
+#define RIO_COLMASK 0x000ff
+
+#define RIO_PREWRITE RIO_WRITE | RIO_PWIO
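+
+/*
+ * The low byte (RIO_COLMASK) can be used to direct the i/o at an
+ * explicit column instead of the data or parity column: callers encode
+ * the column number plus one there (for example the RIO_READ callers
+ * that pass cs_loop + 1) and raidio() subtracts one again before use.
+ */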
+
+/*
+ * NAME: raidio
+ * DESCRIPTION: RAID metadevice single-column I/O routine; issues one
+ * read or write to the data, parity or prewrite area of
+ * a column, as selected by flags
+ * PARAMETERS: md_raidcs_t *cs - pointer to a child structure
+ * int flags - RIO_* flags selecting the buffer, column,
+ * direction and target area
+ */
+static void
+raidio(md_raidcs_t *cs, int flags)
+{
+ buf_t *bp;
+ int column;
+ int flag;
+ void *private;
+ mr_unit_t *un;
+ int iosize;
+ diskaddr_t pwstart;
+ diskaddr_t devstart;
+ md_dev64_t dev;
+
+ un = cs->cs_un;
+
+ ASSERT(IO_READER_HELD(un));
+ ASSERT(UNIT_READER_HELD(un));
+
+ if (flags & RIO_DATA) {
+ if (flags & RIO_EXTRA)
+ bp = &cs->cs_hbuf;
+ else
+ bp = &cs->cs_dbuf;
+ bp->b_un.b_addr = cs->cs_dbuffer;
+ column = cs->cs_dcolumn;
+ } else {
+ if (flags & RIO_EXTRA)
+ bp = &cs->cs_hbuf;
+ else
+ bp = &cs->cs_pbuf;
+ bp->b_un.b_addr = cs->cs_pbuffer;
+ column = cs->cs_pcolumn;
+ }
+ if (flags & RIO_COLMASK)
+ column = (flags & RIO_COLMASK) - 1;
+
+ bp->b_bcount = cs->cs_bcount;
+ bp->b_bufsize = cs->cs_bcount;
+ iosize = un->un_iosize;
+
+ /* check if the hotspared device will be used */
+ if (flags & RIO_ALT && (flags & RIO_WRITE)) {
+ pwstart = un->un_column[column].un_alt_pwstart;
+ devstart = un->un_column[column].un_alt_devstart;
+ dev = un->un_column[column].un_alt_dev;
+ } else {
+ pwstart = un->un_column[column].un_pwstart;
+ devstart = un->un_column[column].un_devstart;
+ dev = un->un_column[column].un_dev;
+ }
+
+ /*
+ * if not doing prewrite i/o, target the real device location
+ * and skip the prewrite (log) header in the buffer
+ */
+ if ((flags & RIO_PWIO) == 0) {
+ bp->b_lblkno = devstart + cs->cs_blkno;
+ bp->b_un.b_addr += DEV_BSIZE;
+ } else {
+ bp->b_bcount += DEV_BSIZE;
+ bp->b_bufsize = bp->b_bcount;
+ if (flags & RIO_DATA) {
+ bp->b_lblkno = cs->cs_dpwslot * iosize + pwstart;
+ } else { /* not DATA -> PARITY */
+ bp->b_lblkno = cs->cs_ppwslot * iosize + pwstart;
+ }
+ }
+
+ bp->b_flags &= ~(B_READ | B_WRITE | B_ERROR | nv_available);
+ bp->b_flags |= B_BUSY;
+ if (flags & RIO_READ) {
+ bp->b_flags |= B_READ;
+ } else {
+ bp->b_flags |= B_WRITE;
+ if ((nv_available && nv_parity && (flags & RIO_PARITY)) ||
+ (nv_available && nv_prewrite && (flags & RIO_PWIO)))
+ bp->b_flags |= nv_available;
+ }
+ bp->b_iodone = (int (*)())raid_done;
+ bp->b_edev = md_dev64_to_dev(dev);
+
+ ASSERT((bp->b_edev != 0) && (bp->b_edev != NODEV));
+
+ private = cs->cs_strategy_private;
+ flag = cs->cs_strategy_flag;
+
+ md_call_strategy(bp, flag, private);
+}
+
+/*
+ * NAME: genstandardparity
+ * DESCRIPTION: generate the new parity for a single-segment write by
+ * XORing the old data, old parity and new data, and fill
+ * in the prewrite headers for the data and parity buffers
+ * PARAMETERS: md_raidcs_t *cs - pointer to a child structure
+ */
+static void
+genstandardparity(md_raidcs_t *cs)
+{
+ uint_t *dbuf, *pbuf;
+ size_t wordcnt;
+ uint_t dsum = 0;
+ uint_t psum = 0;
+
+ ASSERT((cs->cs_bcount & 0x3) == 0);
+
+ wordcnt = cs->cs_bcount / sizeof (uint_t);
+
+ dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
+ pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
+
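+ /*
+ * *pbuf holds the old parity and *dbuf the old data read in by
+ * raid_write_io(); the new parity is old_parity ^ old_data ^ new_data.
+ * psum and dsum accumulate checksums of the new parity and new data
+ * for the prewrite headers filled in below.
+ */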
+ /* Word aligned */
+ if (((uintptr_t)cs->cs_addr & 0x3) == 0) {
+ uint_t *uwbuf = (uint_t *)(void *)(cs->cs_addr);
+ uint_t uval;
+
+ while (wordcnt--) {
+ uval = *uwbuf++;
+ psum ^= (*pbuf = ((*pbuf ^ *dbuf) ^ uval));
+ ++pbuf;
+ *dbuf = uval;
+ dsum ^= uval;
+ ++dbuf;
+ }
+ } else {
+ uchar_t *ubbuf = (uchar_t *)(cs->cs_addr);
+ union {
+ uint_t wb;
+ uchar_t bb[4];
+ } cb;
+
+ while (wordcnt--) {
+ cb.bb[0] = *ubbuf++;
+ cb.bb[1] = *ubbuf++;
+ cb.bb[2] = *ubbuf++;
+ cb.bb[3] = *ubbuf++;
+ psum ^= (*pbuf = ((*pbuf ^ *dbuf) ^ cb.wb));
+ ++pbuf;
+ *dbuf = cb.wb;
+ dsum ^= cb.wb;
+ ++dbuf;
+ }
+ }
+
+ RAID_FILLIN_RPW(cs->cs_dbuffer, cs->cs_un, dsum, cs->cs_pcolumn,
+ cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
+ 2, cs->cs_dcolumn, RAID_PWMAGIC);
+
+ RAID_FILLIN_RPW(cs->cs_pbuffer, cs->cs_un, psum, cs->cs_dcolumn,
+ cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
+ 2, cs->cs_pcolumn, RAID_PWMAGIC);
+}
+
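+/*
+ * NAME: genlineparity
+ * DESCRIPTION: generate parity for a full line write; the new data for
+ * each data column is copied into that column's buffer, the parity
+ * buffer is built as the XOR of all of the data columns, and every
+ * buffer is written to its column's prewrite area.
+ * PARAMETERS: md_raidcs_t *cs - pointer to a child structure
+ */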
+static void
+genlineparity(md_raidcs_t *cs)
+{
+
+ mr_unit_t *un = cs->cs_un;
+ md_raidcbuf_t *cbuf;
+ uint_t *pbuf, *dbuf;
+ uint_t *uwbuf;
+ uchar_t *ubbuf;
+ size_t wordcnt;
+ uint_t psum = 0, dsum = 0;
+ size_t count = un->un_segsize * DEV_BSIZE;
+ uint_t col;
+ buf_t *bp;
+
+ ASSERT((cs->cs_bcount & 0x3) == 0);
+
+ pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
+ dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
+ uwbuf = (uint_t *)(void *)(cs->cs_addr);
+ ubbuf = (uchar_t *)(void *)(cs->cs_addr);
+
+ wordcnt = count / sizeof (uint_t);
+
+ /* Word aligned */
+ if (((uintptr_t)cs->cs_addr & 0x3) == 0) {
+ uint_t uval;
+
+ while (wordcnt--) {
+ uval = *uwbuf++;
+ *dbuf = uval;
+ *pbuf = uval;
+ dsum ^= uval;
+ ++pbuf;
+ ++dbuf;
+ }
+ } else {
+ union {
+ uint_t wb;
+ uchar_t bb[4];
+ } cb;
+
+ while (wordcnt--) {
+ cb.bb[0] = *ubbuf++;
+ cb.bb[1] = *ubbuf++;
+ cb.bb[2] = *ubbuf++;
+ cb.bb[3] = *ubbuf++;
+ *dbuf = cb.wb;
+ *pbuf = cb.wb;
+ dsum ^= cb.wb;
+ ++pbuf;
+ ++dbuf;
+ }
+ }
+
+ RAID_FILLIN_RPW(cs->cs_dbuffer, un, dsum, cs->cs_pcolumn,
+ cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
+ un->un_totalcolumncnt, cs->cs_dcolumn, RAID_PWMAGIC);
+
+ raidio(cs, RIO_PREWRITE | RIO_DATA);
+
+ for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
+
+ dsum = 0;
+ pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
+ dbuf = (uint_t *)(void *)(cbuf->cbuf_buffer + DEV_BSIZE);
+
+ wordcnt = count / sizeof (uint_t);
+
+ col = cbuf->cbuf_column;
+
+ /* Word aligned */
+ if (((uintptr_t)cs->cs_addr & 0x3) == 0) {
+ uint_t uval;
+
+ /*
+ * Only calculate psum when working on the last
+ * data buffer.
+ */
+ if (cbuf->cbuf_next == NULL) {
+ psum = 0;
+ while (wordcnt--) {
+ uval = *uwbuf++;
+ *dbuf = uval;
+ psum ^= (*pbuf ^= uval);
+ dsum ^= uval;
+ ++dbuf;
+ ++pbuf;
+ }
+ } else {
+ while (wordcnt--) {
+ uval = *uwbuf++;
+ *dbuf = uval;
+ *pbuf ^= uval;
+ dsum ^= uval;
+ ++dbuf;
+ ++pbuf;
+ }
+ }
+ } else {
+ union {
+ uint_t wb;
+ uchar_t bb[4];
+ } cb;
+
+ /*
+ * Only calculate psum when working on the last
+ * data buffer.
+ */
+ if (cbuf->cbuf_next == NULL) {
+ psum = 0;
+ while (wordcnt--) {
+ cb.bb[0] = *ubbuf++;
+ cb.bb[1] = *ubbuf++;
+ cb.bb[2] = *ubbuf++;
+ cb.bb[3] = *ubbuf++;
+ *dbuf = cb.wb;
+ psum ^= (*pbuf ^= cb.wb);
+ dsum ^= cb.wb;
+ ++dbuf;
+ ++pbuf;
+ }
+ } else {
+ while (wordcnt--) {
+ cb.bb[0] = *ubbuf++;
+ cb.bb[1] = *ubbuf++;
+ cb.bb[2] = *ubbuf++;
+ cb.bb[3] = *ubbuf++;
+ *dbuf = cb.wb;
+ *pbuf ^= cb.wb;
+ dsum ^= cb.wb;
+ ++dbuf;
+ ++pbuf;
+ }
+ }
+ }
+ RAID_FILLIN_RPW(cbuf->cbuf_buffer, un, dsum, cs->cs_pcolumn,
+ cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
+ un->un_totalcolumncnt, col, RAID_PWMAGIC);
+
+ /*
+ * fill in buffer for write to prewrite area
+ */
+ bp = &cbuf->cbuf_bp;
+ bp->b_un.b_addr = cbuf->cbuf_buffer;
+ bp->b_bcount = cbuf->cbuf_bcount + DEV_BSIZE;
+ bp->b_bufsize = bp->b_bcount;
+ bp->b_lblkno = (cbuf->cbuf_pwslot * un->un_iosize) +
+ un->un_column[col].un_pwstart;
+ bp->b_flags = B_WRITE | B_BUSY;
+ if (nv_available && nv_prewrite)
+ bp->b_flags |= nv_available;
+ bp->b_iodone = (int (*)())raid_done;
+ bp->b_edev = md_dev64_to_dev(un->un_column[col].un_dev);
+ bp->b_chain = (struct buf *)cs;
+ md_call_strategy(bp,
+ cs->cs_strategy_flag, cs->cs_strategy_private);
+ }
+
+ RAID_FILLIN_RPW(cs->cs_pbuffer, un, psum, cs->cs_dcolumn,
+ cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
+ un->un_totalcolumncnt, cs->cs_pcolumn, RAID_PWMAGIC);
+
+ raidio(cs, RIO_PREWRITE | RIO_PARITY);
+}
+
+/*
+ * NAME: raid_readregenloop
+ * DESCRIPTION: RAID metadevice read regeneration routine; XORs each
+ * remaining column into the data buffer, one column per pass,
+ * to rebuild the data of the errored column
+ * PARAMETERS: md_raidcs_t *cs - pointer to a child structure
+ */
+static void
+raid_readregenloop(md_raidcs_t *cs)
+{
+ mr_unit_t *un;
+ md_raidps_t *ps;
+ uint_t *dbuf;
+ uint_t *pbuf;
+ size_t wordcnt;
+
+ un = cs->cs_un;
+
+ /*
+ * XOR the parity with data bytes, must skip the
+ * pre-write entry header in all data/parity buffers
+ */
+ wordcnt = cs->cs_bcount / sizeof (uint_t);
+ dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
+ pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
+ while (wordcnt--)
+ *dbuf++ ^= *pbuf++;
+
+ /* bump up the loop count */
+ cs->cs_loop++;
+
+ /* skip the errored component */
+ if (cs->cs_loop == cs->cs_dcolumn)
+ cs->cs_loop++;
+
+ if (cs->cs_loop != un->un_totalcolumncnt) {
+ cs->cs_frags = 1;
+ raidio(cs, RIO_PARITY | RIO_READ | (cs->cs_loop + 1));
+ return;
+ }
+ /* reached the end of the loop */
+ ps = cs->cs_ps;
+ bcopy(cs->cs_dbuffer + DEV_BSIZE, cs->cs_addr, cs->cs_bcount);
+ raid_free_child(cs, 1);
+
+ /* decrement readfrags */
+ raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
+}
+
+/*
+ * NAME: raid_read_io
+ * DESCRIPTION: RAID metadevice read I/O routine
+ * PARAMETERS: mr_unit_t *un - pointer to a unit structure
+ * md_raidcs_t *cs - pointer to a child structure
+ */
+static void
+raid_read_io(mr_unit_t *un, md_raidcs_t *cs)
+{
+ int flag;
+ void *private;
+ buf_t *bp;
+ buf_t *pb = cs->cs_ps->ps_bp;
+ mr_column_t *column;
+
+ flag = cs->cs_strategy_flag;
+ private = cs->cs_strategy_private;
+ column = &un->un_column[cs->cs_dcolumn];
+
+ /*
+ * The component to be read is good, simply set up bp structure
+ * and call low level md routine doing the read.
+ */
+
+ if (COLUMN_ISOKAY(un, cs->cs_dcolumn) ||
+ (COLUMN_ISLASTERR(un, cs->cs_dcolumn) &&
+ (cs->cs_flags & MD_RCS_RECOVERY) == 0)) {
+ dev_t ddi_dev; /* needed for bioclone, so not md_dev64_t */
+ ddi_dev = md_dev64_to_dev(column->un_dev);
+
+ bp = &cs->cs_dbuf;
+ bp = md_bioclone(pb, cs->cs_offset, cs->cs_bcount, ddi_dev,
+ column->un_devstart + cs->cs_blkno,
+ (int (*)())raid_done, bp, KM_NOSLEEP);
+
+ bp->b_chain = (buf_t *)cs;
+
+ cs->cs_frags = 1;
+ cs->cs_error_call = raid_read_error;
+ cs->cs_retry_call = raid_read_retry;
+ cs->cs_flags |= MD_RCS_ISCALL;
+ cs->cs_stage = RAID_READ_DONE;
+ cs->cs_call = raid_stage;
+
+ ASSERT(bp->b_edev != 0);
+
+ md_call_strategy(bp, flag, private);
+ return;
+ }
+
+ /*
+ * The component to be read is bad, have to go through
+ * raid specific method to read data from other members.
+ */
+ cs->cs_loop = 0;
+ /*
+ * NOTE: always get dbuffer before pbuffer
+ * and get both buffers before pwslot
+ * otherwise a deadlock could be introduced.
+ */
+ raid_mapin_buf(cs);
+ getdbuffer(cs);
+ getpbuffer(cs);
+ if (cs->cs_loop == cs->cs_dcolumn)
+ cs->cs_loop++;
+
+ /* zero out data buffer for use as a data sink */
+ bzero(cs->cs_dbuffer + DEV_BSIZE, cs->cs_bcount);
+ cs->cs_stage = RAID_NONE;
+ cs->cs_call = raid_readregenloop;
+ cs->cs_error_call = raid_read_error;
+ cs->cs_retry_call = raid_read_no_retry;
+ cs->cs_frags = 1;
+
+ /* use parity buffer to read other columns */
+ raidio(cs, RIO_PARITY | RIO_READ | (cs->cs_loop + 1));
+}
+
+/*
+ * NAME: raid_read
+ * DESCRIPTION: RAID metadevice read routine
+ * PARAMETERS: mr_unit_t *un - pointer to a unit structure
+ * md_raidcs_t *cs - pointer to a child structure
+ */
+static int
+raid_read(mr_unit_t *un, md_raidcs_t *cs)
+{
+ int error = 0;
+ md_raidps_t *ps;
+ mdi_unit_t *ui;
+ minor_t mnum;
+
+ ASSERT(IO_READER_HELD(un));
+ ps = cs->cs_ps;
+ ui = ps->ps_ui;
+ raid_line_reader_lock(cs, 0);
+ un = (mr_unit_t *)md_unit_readerlock(ui);
+ ASSERT(UNIT_STATE(un) != RUS_INIT);
+ mnum = MD_SID(un);
+ cs->cs_un = un;
+
+ /* make sure the read doesn't go beyond the end of the column */
+ if (cs->cs_blkno + cs->cs_blkcnt >
+ un->un_segsize * un->un_segsincolumn) {
+ error = ENXIO;
+ }
+ if (error)
+ goto rerror;
+
+ if (un->un_state & RUS_REGEN) {
+ raid_regen_parity(cs);
+ un = MD_UNIT(mnum);
+ cs->cs_un = un;
+ }
+
+ raid_read_io(un, cs);
+ return (0);
+
+rerror:
+ raid_error_parent(ps, error);
+ raid_free_child(cs, 1);
+ /* decrement readfrags */
+ raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
+ return (0);
+}
+
+/*
+ * NAME: raid_write_err_retry
+ * DESCRIPTION: RAID metadevice write retry routine
+ * write was for parity or data only;
+ * complete write with error, no recovery possible
+ * PARAMETERS: mr_unit_t *un - pointer to a unit structure
+ * md_raidcs_t *cs - pointer to a child structure
+ */
+/*ARGSUSED*/
+static void
+raid_write_err_retry(mr_unit_t *un, md_raidcs_t *cs)
+{
+ md_raidps_t *ps = cs->cs_ps;
+ int flags = RFP_DECR_FRAGS | RFP_RLS_LOCK;
+
+ /* decrement pwfrags if needed, and frags */
+ if (!(cs->cs_flags & MD_RCS_PWDONE))
+ flags |= RFP_DECR_PWFRAGS;
+ raid_error_parent(ps, EIO);
+ raid_free_child(cs, 1);
+ raid_free_parent(ps, flags);
+}
+
+/*
+ * NAME: raid_write_no_retry
+ * DESCRIPTION: RAID metadevice write retry routine
+ * write is too far along to retry and parent
+ * has already been signaled with iodone.
+ * PARAMETERS: mr_unit_t *un - pointer to a unit structure
+ * md_raidcs_t *cs - pointer to a child structure
+ */
+/*ARGSUSED*/
+static void
+raid_write_no_retry(mr_unit_t *un, md_raidcs_t *cs)
+{
+ md_raidps_t *ps = cs->cs_ps;
+ int flags = RFP_DECR_FRAGS | RFP_RLS_LOCK;
+
+ /* decrement pwfrags if needed, and frags */
+ if (!(cs->cs_flags & MD_RCS_PWDONE))
+ flags |= RFP_DECR_PWFRAGS;
+ raid_free_child(cs, 1);
+ raid_free_parent(ps, flags);
+}
+
+/*
+ * NAME: raid_write_retry
+ * DESCRIPTION: RAID metadevice write retry routine
+ * PARAMETERS: mr_unit_t *un - pointer to a unit structure
+ * md_raidcs_t *cs - pointer to a child structure
+ */
+static void
+raid_write_retry(mr_unit_t *un, md_raidcs_t *cs)
+{
+ md_raidps_t *ps;
+
+ ps = cs->cs_ps;
+
+ /* re-initialize the buf_t structure for raid_write() */
+ cs->cs_dbuf.b_chain = (struct buf *)cs;
+ cs->cs_dbuf.b_back = &cs->cs_dbuf;
+ cs->cs_dbuf.b_forw = &cs->cs_dbuf;
+ cs->cs_dbuf.b_flags = B_BUSY; /* initialize flags */
+ cs->cs_dbuf.b_error = 0; /* initialize error */
+ cs->cs_dbuf.b_offset = -1;
+ /* Initialize semaphores */
+ sema_init(&cs->cs_dbuf.b_io, 0, NULL,
+ SEMA_DEFAULT, NULL);
+ sema_init(&cs->cs_dbuf.b_sem, 0, NULL,
+ SEMA_DEFAULT, NULL);
+
+ cs->cs_pbuf.b_chain = (struct buf *)cs;
+ cs->cs_pbuf.b_back = &cs->cs_pbuf;
+ cs->cs_pbuf.b_forw = &cs->cs_pbuf;
+ cs->cs_pbuf.b_flags = B_BUSY; /* initialize flags */
+ cs->cs_pbuf.b_error = 0; /* initialize error */
+ cs->cs_pbuf.b_offset = -1;
+ sema_init(&cs->cs_pbuf.b_io, 0, NULL,
+ SEMA_DEFAULT, NULL);
+ sema_init(&cs->cs_pbuf.b_sem, 0, NULL,
+ SEMA_DEFAULT, NULL);
+
+ cs->cs_hbuf.b_chain = (struct buf *)cs;
+ cs->cs_hbuf.b_back = &cs->cs_hbuf;
+ cs->cs_hbuf.b_forw = &cs->cs_hbuf;
+ cs->cs_hbuf.b_flags = B_BUSY; /* initialize flags */
+ cs->cs_hbuf.b_error = 0; /* initialize error */
+ cs->cs_hbuf.b_offset = -1;
+ sema_init(&cs->cs_hbuf.b_io, 0, NULL,
+ SEMA_DEFAULT, NULL);
+ sema_init(&cs->cs_hbuf.b_sem, 0, NULL,
+ SEMA_DEFAULT, NULL);
+
+ cs->cs_flags &= ~(MD_RCS_ERROR);
+ /*
+ * If the prewrite has already been done on this child
+ * (MD_RCS_PWDONE is set), reset the flag and bump pwfrags
+ * before restarting the i/o.
+ * If pwfrags is already zero we have already 'iodone'd the
+ * i/o, so leave things alone; we don't want to 'done' it again.
+ */
+ mutex_enter(&ps->ps_mx);
+ if (cs->cs_flags & MD_RCS_PWDONE) {
+ cs->cs_flags &= ~MD_RCS_PWDONE;
+ ps->ps_pwfrags++;
+ }
+ mutex_exit(&ps->ps_mx);
+ raid_write_io(un, cs);
+}
+
+/*
+ * NAME: raid_wrerr
+ * DESCRIPTION: RAID metadevice write error handling routine
+ * PARAMETERS: md_raidcs_t *cs - pointer to a child structure
+ * LOCKS: must obtain unit writer lock while calling raid_error_state
+ * since a unit or column state transition may take place.
+ * must obtain unit reader lock to retry I/O.
+ */
+static void
+raid_wrerr(md_raidcs_t *cs)
+{
+ md_raidps_t *ps;
+ mdi_unit_t *ui;
+ mr_unit_t *un;
+ md_raidcbuf_t *cbuf;
+
+ ps = cs->cs_ps;
+ ui = ps->ps_ui;
+
+ un = (mr_unit_t *)md_unit_writerlock(ui);
+ ASSERT(un != 0);
+
+ if (cs->cs_dbuf.b_flags & B_ERROR)
+ (void) raid_error_state(un, &cs->cs_dbuf);
+ if (cs->cs_pbuf.b_flags & B_ERROR)
+ (void) raid_error_state(un, &cs->cs_pbuf);
+ if (cs->cs_hbuf.b_flags & B_ERROR)
+ (void) raid_error_state(un, &cs->cs_hbuf);
+ for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next)
+ if (cbuf->cbuf_bp.b_flags & B_ERROR)
+ (void) raid_error_state(un, &cbuf->cbuf_bp);
+
+ md_unit_writerexit(ui);
+
+ ps->ps_flags |= MD_RPS_HSREQ;
+
+ un = (mr_unit_t *)md_unit_readerlock(ui);
+
+ /* now attempt the appropriate retry routine */
+ (*(cs->cs_retry_call))(un, cs);
+}
+/*
+ * NAMES: raid_write_error
+ * DESCRIPTION: I/O error handling routine for a RAID metadevice write
+ * PARAMETERS: md_raidcs_t *cs - pointer to child structure
+ */
+/*ARGSUSED*/
+static void
+raid_write_error(md_raidcs_t *cs)
+{
+ md_raidps_t *ps;
+ mdi_unit_t *ui;
+ mr_unit_t *un;
+ md_raidcbuf_t *cbuf;
+ set_t setno;
+
+ ps = cs->cs_ps;
+ ui = ps->ps_ui;
+ un = cs->cs_un;
+
+ setno = MD_UN2SET(un);
+
+ /*
+ * locate each buf that is in error on this io and then
+ * output an error message
+ */
+ if ((cs->cs_dbuf.b_flags & B_ERROR) &&
+ (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_ERRED) &&
+ (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_LAST_ERRED))
+ cmn_err(CE_WARN, "md %s: write error on %s",
+ md_shortname(MD_SID(un)),
+ md_devname(setno, md_expldev(cs->cs_dbuf.b_edev), NULL, 0));
+
+ if ((cs->cs_pbuf.b_flags & B_ERROR) &&
+ (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_ERRED) &&
+ (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_LAST_ERRED))
+ cmn_err(CE_WARN, "md %s: write error on %s",
+ md_shortname(MD_SID(un)),
+ md_devname(setno, md_expldev(cs->cs_pbuf.b_edev), NULL, 0));
+
+ for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next)
+ if ((cbuf->cbuf_bp.b_flags & B_ERROR) &&
+ (COLUMN_STATE(un, cbuf->cbuf_column) != RCS_ERRED) &&
+ (COLUMN_STATE(un, cbuf->cbuf_column) != RCS_LAST_ERRED))
+ cmn_err(CE_WARN, "md %s: write error on %s",
+ md_shortname(MD_SID(un)),
+ md_devname(setno, md_expldev(cbuf->cbuf_bp.b_edev),
+ NULL, 0));
+
+ md_unit_readerexit(ui);
+
+ ASSERT(cs->cs_frags == 0);
+
+ /* now schedule processing for possible state change */
+ daemon_request(&md_mstr_daemon, raid_wrerr,
+ (daemon_queue_t *)cs, REQ_OLD);
+
+}
+
+/*
+ * NAME: raid_write_ponly
+ * DESCRIPTION: RAID metadevice write routine
+ * in the case where only the parity column can be written
+ * PARAMETERS: md_raidcs_t *cs - pointer to a child structure
+ */
+static void
+raid_write_ponly(md_raidcs_t *cs)
+{
+ md_raidps_t *ps;
+ mr_unit_t *un = cs->cs_un;
+
+ ps = cs->cs_ps;
+ /* decrement pwfrags if needed, but not frags */
+ ASSERT(!(cs->cs_flags & MD_RCS_PWDONE));
+ raid_free_parent(ps, RFP_DECR_PWFRAGS);
+ cs->cs_flags |= MD_RCS_PWDONE;
+ cs->cs_frags = 1;
+ cs->cs_stage = RAID_WRITE_PONLY_DONE;
+ cs->cs_call = raid_stage;
+ cs->cs_error_call = raid_write_error;
+ cs->cs_retry_call = raid_write_no_retry;
+ if (WRITE_ALT(un, cs->cs_pcolumn)) {
+ cs->cs_frags++;
+ raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_WRITE);
+ }
+ raidio(cs, RIO_PARITY | RIO_WRITE);
+}
+
+/*
+ * NAME: raid_write_ploop
+ * DESCRIPTION: RAID metadevice write routine, constructs parity from
+ * data in other columns.
+ * PARAMETERS: md_raidcs_t *cs - pointer to a child structure
+ */
+static void
+raid_write_ploop(md_raidcs_t *cs)
+{
+ mr_unit_t *un = cs->cs_un;
+ uint_t *dbuf;
+ uint_t *pbuf;
+ size_t wordcnt;
+ uint_t psum = 0;
+
+ wordcnt = cs->cs_bcount / sizeof (uint_t);
+ dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
+ pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
+ while (wordcnt--)
+ *pbuf++ ^= *dbuf++;
+ cs->cs_loop++;
+
+ /*
+ * build parity from scratch using new data,
+ * skip reading the data and parity columns.
+ */
+ while (cs->cs_loop == cs->cs_dcolumn || cs->cs_loop == cs->cs_pcolumn)
+ cs->cs_loop++;
+
+ if (cs->cs_loop != un->un_totalcolumncnt) {
+ cs->cs_frags = 1;
+ raidio(cs, RIO_DATA | RIO_READ | (cs->cs_loop + 1));
+ return;
+ }
+
+ /* construct checksum for parity buffer */
+ wordcnt = cs->cs_bcount / sizeof (uint_t);
+ pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE);
+ while (wordcnt--) {
+ psum ^= *pbuf;
+ pbuf++;
+ }
+ RAID_FILLIN_RPW(cs->cs_pbuffer, un, psum, -1,
+ cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
+ 1, cs->cs_pcolumn, RAID_PWMAGIC);
+
+ cs->cs_stage = RAID_NONE;
+ cs->cs_call = raid_write_ponly;
+ cs->cs_error_call = raid_write_error;
+ cs->cs_retry_call = raid_write_err_retry;
+ cs->cs_frags = 1;
+ if (WRITE_ALT(un, cs->cs_pcolumn)) {
+ cs->cs_frags++;
+ raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_PREWRITE);
+ }
+ raidio(cs, RIO_PARITY | RIO_PREWRITE);
+}
+
+/*
+ * NAME: raid_write_donly
+ * DESCRIPTION: RAID metadevice write routine
+ * Completed writing data to prewrite entry
+ * in the case where only the data column can be written
+ * PARAMETERS: md_raidcs_t *cs - pointer to a child structure
+ */
+static void
+raid_write_donly(md_raidcs_t *cs)
+{
+ md_raidps_t *ps;
+ mr_unit_t *un = cs->cs_un;
+
+ ps = cs->cs_ps;
+ /* WARNING: don't release unit reader lock here... */
+ /* decrement pwfrags if needed, but not frags */
+ ASSERT(!(cs->cs_flags & MD_RCS_PWDONE));
+ raid_free_parent(ps, RFP_DECR_PWFRAGS);
+ cs->cs_flags |= MD_RCS_PWDONE;
+ cs->cs_frags = 1;
+ cs->cs_stage = RAID_WRITE_DONLY_DONE;
+ cs->cs_call = raid_stage;
+ cs->cs_error_call = raid_write_error;
+ cs->cs_retry_call = raid_write_err_retry;
+ if (WRITE_ALT(un, cs->cs_dcolumn)) {
+ cs->cs_frags++;
+ raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_WRITE);
+ }
+ raidio(cs, RIO_DATA | RIO_WRITE);
+}
+
+/*
+ * NAME: raid_write_got_old
+ * DESCRIPTION: RAID metadevice write routine
+ * completed read of old data and old parity
+ * PARAMETERS: md_raidcs_t *cs - pointer to a child structure
+ */
+static void
+raid_write_got_old(md_raidcs_t *cs)
+{
+ mr_unit_t *un = cs->cs_un;
+
+ ASSERT(IO_READER_HELD(cs->cs_un));
+ ASSERT(UNIT_READER_HELD(cs->cs_un));
+
+ raid_mapin_buf(cs);
+ genstandardparity(cs);
+ cs->cs_frags = 2;
+ cs->cs_call = raid_stage;
+ cs->cs_stage = RAID_PREWRITE_DONE;
+ cs->cs_error_call = raid_write_error;
+ cs->cs_retry_call = raid_write_retry;
+
+ if (WRITE_ALT(un, cs->cs_dcolumn)) {
+ cs->cs_frags++;
+ raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_PREWRITE);
+ }
+
+ if (WRITE_ALT(un, cs->cs_pcolumn)) {
+ cs->cs_frags++;
+ raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_PREWRITE);
+ }
+ ASSERT(cs->cs_frags < 4);
+ raidio(cs, RIO_DATA | RIO_PREWRITE);
+ raidio(cs, RIO_PARITY | RIO_PREWRITE);
+}
+
+/*
+ * NAME: raid_write_io
+ * DESCRIPTION: RAID metadevice write I/O routine
+ * PARAMETERS: mr_unit_t *un - pointer to a unit structure
+ * md_raidcs_t *cs - pointer to a child structure
+ */
+
+/*ARGSUSED*/
+static void
+raid_write_io(mr_unit_t *un, md_raidcs_t *cs)
+{
+ md_raidps_t *ps = cs->cs_ps;
+ uint_t *dbuf;
+ uint_t *ubuf;
+ size_t wordcnt;
+ uint_t dsum = 0;
+ int pcheck;
+ int dcheck;
+
+ ASSERT((un->un_column[cs->cs_pcolumn].un_devstate &
+ RCS_INIT) == 0);
+ ASSERT((un->un_column[cs->cs_dcolumn].un_devstate &
+ RCS_INIT) == 0);
+ ASSERT(IO_READER_HELD(un));
+ ASSERT(UNIT_READER_HELD(un));
+ ASSERT(cs->cs_flags & MD_RCS_HAVE_PW_SLOTS);
+ if (cs->cs_flags & MD_RCS_LINE) {
+
+ mr_unit_t *un = cs->cs_un;
+
+ ASSERT(un->un_origcolumncnt == un->un_totalcolumncnt);
+ raid_mapin_buf(cs);
+ cs->cs_frags = un->un_origcolumncnt;
+ cs->cs_call = raid_stage;
+ cs->cs_error_call = raid_write_error;
+ cs->cs_retry_call = raid_write_no_retry;
+ cs->cs_stage = RAID_LINE_PWDONE;
+ genlineparity(cs);
+ return;
+ }
+
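+ /*
+ * Classify the parity and data columns.  An errored column selects
+ * one of the degraded write paths below; otherwise the normal
+ * read-modify-write of old data and old parity is used.
+ */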
+ pcheck = erred_check_line(un, cs, &un->un_column[cs->cs_pcolumn]);
+ dcheck = erred_check_line(un, cs, &un->un_column[cs->cs_dcolumn]);
+ cs->cs_resync_check = pcheck << RCL_PARITY_OFFSET || dcheck;
+
+ if (pcheck == RCL_ERRED && dcheck == RCL_ERRED) {
+ int err = EIO;
+
+ if ((un->un_column[cs->cs_pcolumn].un_devstate ==
+ RCS_LAST_ERRED) ||
+ (un->un_column[cs->cs_dcolumn].un_devstate ==
+ RCS_LAST_ERRED))
+ err = ENXIO;
+ raid_error_parent(ps, err);
+ ASSERT(!(cs->cs_flags & MD_RCS_PWDONE));
+ raid_free_child(cs, 1);
+ raid_free_parent(ps, RFP_DECR_FRAGS
+ | RFP_RLS_LOCK | RFP_DECR_PWFRAGS);
+ return;
+ }
+
+ if (pcheck & RCL_ERRED) {
+ /*
+ * handle case of only having data drive
+ */
+ raid_mapin_buf(cs);
+ wordcnt = cs->cs_bcount / sizeof (uint_t);
+
+ dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE);
+ ubuf = (uint_t *)(void *)(cs->cs_addr);
+
+ while (wordcnt--) {
+ *dbuf = *ubuf;
+ dsum ^= *ubuf;
+ dbuf++;
+ ubuf++;
+ }
+ RAID_FILLIN_RPW(cs->cs_dbuffer, un, dsum, -1,
+ cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid,
+ 1, cs->cs_dcolumn, RAID_PWMAGIC);
+ cs->cs_frags = 1;
+ cs->cs_stage = RAID_NONE;
+ cs->cs_call = raid_write_donly;
+ cs->cs_error_call = raid_write_error;
+ cs->cs_retry_call = raid_write_err_retry;
+ if (WRITE_ALT(un, cs->cs_dcolumn)) {
+ cs->cs_frags++;
+ raidio(cs, RIO_DATA | RIO_ALT | RIO_EXTRA |
+ RIO_PREWRITE);
+ }
+ raidio(cs, RIO_DATA | RIO_PREWRITE);
+ return;
+ }
+
+ if (dcheck & RCL_ERRED) {
+ /*
+ * handle case of only having parity drive
+ * build parity from scratch using new data,
+ * skip reading the data and parity columns.
+ */
+ raid_mapin_buf(cs);
+ cs->cs_loop = 0;
+ while (cs->cs_loop == cs->cs_dcolumn ||
+ cs->cs_loop == cs->cs_pcolumn)
+ cs->cs_loop++;
+
+ /* copy new data in to begin building parity */
+ bcopy(cs->cs_addr, cs->cs_pbuffer + DEV_BSIZE, cs->cs_bcount);
+ cs->cs_stage = RAID_NONE;
+ cs->cs_call = raid_write_ploop;
+ cs->cs_error_call = raid_write_error;
+ cs->cs_retry_call = raid_write_err_retry;
+ cs->cs_frags = 1;
+ raidio(cs, RIO_DATA | RIO_READ | (cs->cs_loop + 1));
+ return;
+ }
+ /*
+ * handle normal cases
+ * read old data and old parity
+ */
+ cs->cs_frags = 2;
+ cs->cs_stage = RAID_NONE;
+ cs->cs_call = raid_write_got_old;
+ cs->cs_error_call = raid_write_error;
+ cs->cs_retry_call = raid_write_retry;
+ ASSERT(ps->ps_magic == RAID_PSMAGIC);
+ raidio(cs, RIO_DATA | RIO_READ);
+ raidio(cs, RIO_PARITY | RIO_READ);
+}
+
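+/*
+ * NAME: raid_enqueue
+ * DESCRIPTION: append a child structure to the tail of the unit's
+ * i/o lock waiting list, typically a write that must wait for a
+ * line lock to become available.
+ * PARAMETERS: md_raidcs_t *cs - pointer to a child structure
+ */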
+static void
+raid_enqueue(md_raidcs_t *cs)
+{
+ mdi_unit_t *ui = cs->cs_ps->ps_ui;
+ kmutex_t *io_list_mutex = &ui->ui_io_lock->io_list_mutex;
+ md_raidcs_t *cs1;
+
+ mutex_enter(io_list_mutex);
+ ASSERT(! (cs->cs_flags & MD_RCS_LLOCKD));
+ if (ui->ui_io_lock->io_list_front == NULL) {
+ ui->ui_io_lock->io_list_front = cs;
+ ui->ui_io_lock->io_list_back = cs;
+ } else {
+ cs1 = ui->ui_io_lock->io_list_back;
+ cs1->cs_linlck_next = cs;
+ ui->ui_io_lock->io_list_back = cs;
+ }
+ STAT_INC(raid_write_waits);
+ STAT_MAX(raid_max_write_q_length, raid_write_queue_length);
+ cs->cs_linlck_next = NULL;
+ mutex_exit(io_list_mutex);
+}
+
+/*
+ * NAME: raid_write
+ * DESCRIPTION: RAID metadevice write routine
+ * PARAMETERS: mr_unit_t *un - pointer to a unit structure
+ * md_raidcs_t *cs - pointer to a child structure
+ */
+
+/*ARGSUSED*/
+static int
+raid_write(mr_unit_t *un, md_raidcs_t *cs)
+{
+ int error = 0;
+ md_raidps_t *ps;
+ mdi_unit_t *ui;
+ minor_t mnum;
+ clock_t timeout;
+
+ ASSERT(IO_READER_HELD(un));
+ ps = cs->cs_ps;
+ ui = ps->ps_ui;
+
+ ASSERT(UNIT_STATE(un) != RUS_INIT);
+ if (UNIT_STATE(un) == RUS_LAST_ERRED)
+ error = EIO;
+
+ /* make sure the write doesn't go beyond the column */
+ if (cs->cs_blkno + cs->cs_blkcnt > un->un_segsize * un->un_segsincolumn)
+ error = ENXIO;
+ if (error)
+ goto werror;
+
+ getresources(cs);
+
+ /*
+ * this is an advisory loop that keeps the waiting lists short
+ * to reduce cpu time.  Since there is a race introduced by not
+ * acquiring all the correct mutexes, use cv_timedwait() to be
+ * sure the write will always wake up and start.
+ */
+ while (raid_check_pw(cs)) {
+ mutex_enter(&un->un_mx);
+ (void) drv_getparm(LBOLT, &timeout);
+ timeout += md_wr_wait;
+ un->un_rflags |= MD_RFLAG_NEEDPW;
+ STAT_INC(raid_prewrite_waits);
+ (void) cv_timedwait(&un->un_cv, &un->un_mx, timeout);
+ un->un_rflags &= ~MD_RFLAG_NEEDPW;
+ mutex_exit(&un->un_mx);
+ }
+
+ if (raid_line_writer_lock(cs, 1))
+ return (0);
+
+ un = (mr_unit_t *)md_unit_readerlock(ui);
+ cs->cs_un = un;
+ mnum = MD_SID(un);
+
+ if (un->un_state & RUS_REGEN) {
+ raid_regen_parity(cs);
+ un = MD_UNIT(mnum);
+ cs->cs_un = un;
+ }
+
+ raid_write_io(un, cs);
+ return (0);
+werror:
+ /* acquire unit reader lock since raid_free_child always drops it */
+ raid_error_parent(ps, error);
+ raid_free_child(cs, 0);
+ /* decrement both pwfrags and frags */
+ raid_free_parent(ps, RFP_DECR_PWFRAGS | RFP_DECR_FRAGS | RFP_RLS_LOCK);
+ return (0);
+}
+
+
+/*
+ * NAMES: raid_stage
+ * DESCRIPTION: post-processing routine for a RAID metadevice
+ * PARAMETERS: md_raidcs_t *cs - pointer to child structure
+ */
+static void
+raid_stage(md_raidcs_t *cs)
+{
+ md_raidps_t *ps = cs->cs_ps;
+ mr_unit_t *un = cs->cs_un;
+ md_raidcbuf_t *cbuf;
+ buf_t *bp;
+ void *private;
+ int flag;
+
+ switch (cs->cs_stage) {
+ case RAID_READ_DONE:
+ raid_free_child(cs, 1);
+ /* decrement readfrags */
+ raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
+ return;
+
+ case RAID_WRITE_DONE:
+ case RAID_WRITE_PONLY_DONE:
+ case RAID_WRITE_DONLY_DONE:
+ /*
+ * Completed writing real parity and/or data.
+ */
+ ASSERT(cs->cs_flags & MD_RCS_PWDONE);
+ raid_free_child(cs, 1);
+ /* decrement frags but not pwfrags */
+ raid_free_parent(ps, RFP_DECR_FRAGS | RFP_RLS_LOCK);
+ return;
+
+ case RAID_PREWRITE_DONE:
+ /*
+ * completed writing data and parity to prewrite entries
+ */
+ /*
+ * WARNING: don't release unit reader lock here..
+ * decrement pwfrags but not frags
+ */
+ raid_free_parent(ps, RFP_DECR_PWFRAGS);
+ cs->cs_flags |= MD_RCS_PWDONE;
+ cs->cs_frags = 2;
+ cs->cs_stage = RAID_WRITE_DONE;
+ cs->cs_call = raid_stage;
+ cs->cs_error_call = raid_write_error;
+ cs->cs_retry_call = raid_write_no_retry;
+ if (WRITE_ALT(un, cs->cs_pcolumn)) {
+ cs->cs_frags++;
+ raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY |
+ RIO_WRITE);
+ }
+ if (WRITE_ALT(un, cs->cs_dcolumn)) {
+ cs->cs_frags++;
+ raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_WRITE);
+ }
+ ASSERT(cs->cs_frags < 4);
+ raidio(cs, RIO_DATA | RIO_WRITE);
+ raidio(cs, RIO_PARITY | RIO_WRITE);
+ if (cs->cs_pw_inval_list) {
+ raid_free_pwinvalidate(cs);
+ }
+ return;
+
+ case RAID_LINE_PWDONE:
+ ASSERT(cs->cs_frags == 0);
+ raid_free_parent(ps, RFP_DECR_PWFRAGS);
+ cs->cs_flags |= MD_RCS_PWDONE;
+ cs->cs_frags = un->un_origcolumncnt;
+ cs->cs_call = raid_stage;
+ cs->cs_error_call = raid_write_error;
+ cs->cs_retry_call = raid_write_no_retry;
+ cs->cs_stage = RAID_WRITE_DONE;
+ for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) {
+ /*
+ * fill in buffer for the write of the column data
+ * to its real location (the prewrite header in the
+ * buffer is skipped)
+ */
+ bp = &cbuf->cbuf_bp;
+ bp->b_back = bp;
+ bp->b_forw = bp;
+ bp->b_un.b_addr = cbuf->cbuf_buffer + DEV_BSIZE;
+ bp->b_bcount = cbuf->cbuf_bcount;
+ bp->b_bufsize = cbuf->cbuf_bcount;
+ bp->b_lblkno =
+ un->un_column[cbuf->cbuf_column].un_devstart +
+ cs->cs_blkno;
+ bp->b_flags &= ~(B_READ | B_WRITE | B_ERROR);
+ bp->b_flags &= ~nv_available;
+ bp->b_flags |= B_WRITE | B_BUSY;
+ bp->b_iodone = (int (*)())raid_done;
+ bp->b_edev = md_dev64_to_dev(
+ un->un_column[cbuf->cbuf_column].un_dev);
+ bp->b_chain = (struct buf *)cs;
+ private = cs->cs_strategy_private;
+ flag = cs->cs_strategy_flag;
+ md_call_strategy(bp, flag, private);
+ }
+ raidio(cs, RIO_DATA | RIO_WRITE);
+ raidio(cs, RIO_PARITY | RIO_WRITE);
+ if (cs->cs_pw_inval_list) {
+ raid_free_pwinvalidate(cs);
+ }
+ return;
+
+ default:
+ ASSERT(0);
+ break;
+ }
+}
+/*
+ * NAME: md_raid_strategy
+ * DESCRIPTION: RAID metadevice I/O operations entry point.
+ * PARAMETERS: buf_t *pb - pointer to a user I/O buffer
+ * int flag - metadevice specific flag
+ * void *private - private data passed through to the
+ * underlying md_call_strategy() calls
+ *
+ */
+
+void
+md_raid_strategy(buf_t *pb, int flag, void *private)
+{
+ md_raidps_t *ps;
+ md_raidcs_t *cs;
+ int doing_writes;
+ int err;
+ mr_unit_t *un;
+ mdi_unit_t *ui;
+ size_t count;
+ diskaddr_t blkno;
+ caddr_t addr;
+ off_t offset;
+ int colcnt;
+ minor_t mnum;
+ set_t setno;
+
+ ui = MDI_UNIT(getminor(pb->b_edev));
+ md_kstat_waitq_enter(ui);
+ un = (mr_unit_t *)md_io_readerlock(ui);
+ setno = MD_MIN2SET(getminor(pb->b_edev));
+
+ if ((flag & MD_NOBLOCK) == 0) {
+ if (md_inc_iocount(setno) != 0) {
+ pb->b_flags |= B_ERROR;
+ pb->b_error = ENXIO;
+ pb->b_resid = pb->b_bcount;
+ md_io_readerexit(ui);
+ biodone(pb);
+ return;
+ }
+ } else {
+ md_inc_iocount_noblock(setno);
+ }
+
+ mnum = MD_SID(un);
+ colcnt = un->un_totalcolumncnt - 1;
+ count = pb->b_bcount;
+
+ STAT_CHECK(raid_512, count == 512);
+ STAT_CHECK(raid_1024, count == 1024);
+ STAT_CHECK(raid_1024_8192, count > 1024 && count < 8192);
+ STAT_CHECK(raid_8192, count == 8192);
+ STAT_CHECK(raid_8192_bigger, count > 8192);
+
+ (void *) md_unit_readerlock(ui);
+ if (!(flag & MD_STR_NOTTOP)) {
+ err = md_checkbuf(ui, (md_unit_t *)un, pb); /* check and map */
+ if (err != 0) {
+ md_kstat_waitq_exit(ui);
+ md_io_readerexit(ui);
+ return;
+ }
+ }
+ md_unit_readerexit(ui);
+
+ STAT_INC(raid_total_io);
+
+ /* allocate a parent structure for the user I/O */
+ ps = kmem_cache_alloc(raid_parent_cache, MD_ALLOCFLAGS);
+ raid_parent_init(ps);
+
+ /*
+ * Save essential information from the original buf header
+ * in the parent (md_raidps_t) structure.
+ */
+ ps->ps_un = un;
+ ps->ps_ui = ui;
+ ps->ps_bp = pb;
+ ps->ps_addr = pb->b_un.b_addr;
+
+ if ((pb->b_flags & B_READ) == 0) {
+ ps->ps_flags |= MD_RPS_WRITE;
+ doing_writes = 1;
+ STAT_INC(raid_writes);
+ } else {
+ ps->ps_flags |= MD_RPS_READ;
+ doing_writes = 0;
+ STAT_INC(raid_reads);
+ }
+
+ count = lbtodb(pb->b_bcount); /* transfer count (in blocks) */
+ blkno = pb->b_lblkno; /* block number on device */
+ addr = 0;
+ offset = 0;
+ ps->ps_pwfrags = 1;
+ ps->ps_frags = 1;
+ md_kstat_waitq_to_runq(ui);
+
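+ /*
+ * Split the request into child structures, one per segment or per
+ * full line, and issue each one.  raid_iosetup() returns the block
+ * count still left to do; the final child is issued after the loop
+ * so that the parent's initial frag counts account for it.
+ */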
+ do {
+ cs = kmem_cache_alloc(raid_child_cache, MD_ALLOCFLAGS);
+ raid_child_init(cs);
+ cs->cs_ps = ps;
+ cs->cs_un = un;
+ cs->cs_mdunit = mnum;
+ cs->cs_strategy_flag = flag;
+ cs->cs_strategy_private = private;
+ cs->cs_addr = addr;
+ cs->cs_offset = offset;
+ count = raid_iosetup(un, blkno, count, cs);
+ if (cs->cs_flags & MD_RCS_LINE) {
+ blkno += (cs->cs_blkcnt * colcnt);
+ offset += (cs->cs_bcount * colcnt);
+ } else {
+ blkno += cs->cs_blkcnt;
+ offset += cs->cs_bcount;
+ }
+ /* for each cs bump up the ps_pwfrags and ps_frags fields */
+ if (count) {
+ mutex_enter(&ps->ps_mx);
+ ps->ps_pwfrags++;
+ ps->ps_frags++;
+ mutex_exit(&ps->ps_mx);
+ if (doing_writes)
+ (void) raid_write(un, cs);
+ else
+ (void) raid_read(un, cs);
+ }
+ } while (count);
+ if (doing_writes) {
+ (void) raid_write(un, cs);
+ } else
+ (void) raid_read(un, cs);
+
+ if (! (flag & MD_STR_NOTTOP) && panicstr) {
+ while (! (ps->ps_flags & MD_RPS_DONE)) {
+ md_daemon(1, &md_done_daemon);
+ drv_usecwait(10);
+ }
+ kmem_cache_free(raid_parent_cache, ps);
+ }
+}
+
+/*
+ * NAMES: raid_snarf
+ * DESCRIPTION: RAID metadevice SNARF entry point
+ * PARAMETERS: md_snarfcmd_t cmd,
+ * set_t setno
+ * RETURNS:
+ */
+static int
+raid_snarf(md_snarfcmd_t cmd, set_t setno)
+{
+ mr_unit_t *un;
+ mddb_recid_t recid;
+ int gotsomething;
+ int all_raid_gotten;
+ mddb_type_t typ1;
+ uint_t ncol;
+ mddb_de_ic_t *dep;
+ mddb_rb32_t *rbp;
+ size_t newreqsize;
+ mr_unit_t *big_un;
+ mr_unit32_od_t *small_un;
+
+
+ if (cmd == MD_SNARF_CLEANUP)
+ return (0);
+
+ all_raid_gotten = 1;
+ gotsomething = 0;
+ typ1 = (mddb_type_t)md_getshared_key(setno,
+ raid_md_ops.md_driver.md_drivername);
+ recid = mddb_makerecid(setno, 0);
+
+ while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
+ if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) {
+ continue;
+ }
+
+ dep = mddb_getrecdep(recid);
+ dep->de_flags = MDDB_F_RAID;
+ rbp = dep->de_rb;
+ if ((rbp->rb_revision == MDDB_REV_RB) &&
+ ((rbp->rb_private & MD_PRV_CONVD) == 0)) {
+ /*
+ * This means, we have an old and small record
+ * and this record hasn't already been converted.
+ * Before we create an incore metadevice from this
+ * we have to convert it to a big record.
+ */
+ small_un = (mr_unit32_od_t *)mddb_getrecaddr(recid);
+ ncol = small_un->un_totalcolumncnt;
+ newreqsize = sizeof (mr_unit_t) +
+ ((ncol - 1) * sizeof (mr_column_t));
+ big_un = (mr_unit_t *)kmem_zalloc(newreqsize, KM_SLEEP);
+ raid_convert((caddr_t)small_un, (caddr_t)big_un,
+ SMALL_2_BIG);
+ kmem_free(small_un, dep->de_reqsize);
+ dep->de_rb_userdata = big_un;
+ dep->de_reqsize = newreqsize;
+ un = big_un;
+ rbp->rb_private |= MD_PRV_CONVD;
+ } else {
+ /* Big device */
+ un = (mr_unit_t *)mddb_getrecaddr(recid);
+ }
+
+ /* Set revision and flag accordingly */
+ if (rbp->rb_revision == MDDB_REV_RB) {
+ un->c.un_revision = MD_32BIT_META_DEV;
+ } else {
+ un->c.un_revision = MD_64BIT_META_DEV;
+ un->c.un_flag |= MD_EFILABEL;
+ }
+
+ /*
+ * Create minor device node for snarfed entry.
+ */
+ (void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un));
+
+ if (MD_UNIT(MD_SID(un)) != NULL) {
+ mddb_setrecprivate(recid, MD_PRV_PENDDEL);
+ continue;
+ }
+ all_raid_gotten = 0;
+ if (raid_build_incore((void *)un, 1) == 0) {
+ mddb_setrecprivate(recid, MD_PRV_GOTIT);
+ md_create_unit_incore(MD_SID(un), &raid_md_ops,
+ 1);
+ gotsomething = 1;
+ } else if (un->mr_ic) {
+ kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) *
+ un->un_totalcolumncnt);
+ kmem_free(un->mr_ic, sizeof (*un->mr_ic));
+ }
+ }
+
+ if (!all_raid_gotten) {
+ return (gotsomething);
+ }
+
+ recid = mddb_makerecid(setno, 0);
+ while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0)
+ if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
+ mddb_setrecprivate(recid, MD_PRV_PENDDEL);
+
+ return (0);
+}
+
+/*
+ * NAMES: raid_halt
+ * DESCRIPTION: RAID metadevice HALT entry point
+ * PARAMETERS: md_haltcmd_t cmd -
+ * set_t setno -
+ * RETURNS:
+ */
+static int
+raid_halt(md_haltcmd_t cmd, set_t setno)
+{
+ set_t i;
+ mdi_unit_t *ui;
+ minor_t mnum;
+
+ if (cmd == MD_HALT_CLOSE)
+ return (0);
+
+ if (cmd == MD_HALT_OPEN)
+ return (0);
+
+ if (cmd == MD_HALT_UNLOAD)
+ return (0);
+
+ if (cmd == MD_HALT_CHECK) {
+ for (i = 0; i < md_nunits; i++) {
+ mnum = MD_MKMIN(setno, i);
+ if ((ui = MDI_UNIT(mnum)) == NULL)
+ continue;
+ if (ui->ui_opsindex != raid_md_ops.md_selfindex)
+ continue;
+ if (md_unit_isopen(ui))
+ return (1);
+ }
+ return (0);
+ }
+
+ if (cmd != MD_HALT_DOIT)
+ return (1);
+
+ for (i = 0; i < md_nunits; i++) {
+ mnum = MD_MKMIN(setno, i);
+ if ((ui = MDI_UNIT(mnum)) == NULL)
+ continue;
+ if (ui->ui_opsindex != raid_md_ops.md_selfindex)
+ continue;
+ reset_raid((mr_unit_t *)MD_UNIT(mnum), mnum, 0);
+ }
+ return (0);
+}
+
+/*
+ * NAMES: raid_close_all_devs
+ * DESCRIPTION: Close all the devices of the unit.
+ * PARAMETERS: mr_unit_t *un - pointer to unit structure
+ * RETURNS:
+ */
+void
+raid_close_all_devs(mr_unit_t *un, int init_pw, int md_cflags)
+{
+ int i;
+ mr_column_t *device;
+
+ for (i = 0; i < un->un_totalcolumncnt; i++) {
+ device = &un->un_column[i];
+ if (device->un_devflags & MD_RAID_DEV_ISOPEN) {
+ ASSERT((device->un_dev != (md_dev64_t)0) &&
+ (device->un_dev != NODEV64));
+ if ((device->un_devstate & RCS_OKAY) && init_pw)
+ (void) init_pw_area(un, device->un_dev,
+ device->un_pwstart, i);
+ md_layered_close(device->un_dev, md_cflags);
+ device->un_devflags &= ~MD_RAID_DEV_ISOPEN;
+ }
+ }
+}
+
+/*
+ * NAMES: raid_open_all_devs
+ * DESCRIPTION: Open all the components (columns) of the device unit.
+ * PARAMETERS: mr_unit_t *un - pointer to unit structure
+ * RETURNS:
+ */
+static int
+raid_open_all_devs(mr_unit_t *un, int md_oflags)
+{
+ minor_t mnum = MD_SID(un);
+ int i;
+ int not_opened = 0;
+ int commit = 0;
+ int col = -1;
+ mr_column_t *device;
+ set_t setno = MD_MIN2SET(MD_SID(un));
+ side_t side = mddb_getsidenum(setno);
+ mdkey_t key;
+ mdi_unit_t *ui = MDI_UNIT(mnum);
+
+ ui->ui_tstate &= ~MD_INACCESSIBLE;
+
+ for (i = 0; i < un->un_totalcolumncnt; i++) {
+ md_dev64_t tmpdev;
+
+ device = &un->un_column[i];
+
+ if (COLUMN_STATE(un, i) & RCS_ERRED) {
+ not_opened++;
+ continue;
+ }
+
+ if (device->un_devflags & MD_RAID_DEV_ISOPEN)
+ continue;
+
+ tmpdev = device->un_dev;
+ /*
+ * Open by device id
+ */
+ key = HOTSPARED(un, i) ?
+ device->un_hs_key : device->un_orig_key;
+ if ((md_getmajor(tmpdev) != md_major) &&
+ md_devid_found(setno, side, key) == 1) {
+ tmpdev = md_resolve_bydevid(mnum, tmpdev, key);
+ }
+ if (md_layered_open(mnum, &tmpdev, md_oflags)) {
+ device->un_dev = tmpdev;
+ not_opened++;
+ continue;
+ }
+ device->un_dev = tmpdev;
+ device->un_devflags |= MD_RAID_DEV_ISOPEN;
+ }
+
+ /* the device can run only if at most one column is errored or failed to open */
+ if (not_opened > 1) {
+ cmn_err(CE_WARN,
+ "md: %s failed to open. open error on %s\n",
+ md_shortname(MD_SID(un)),
+ md_devname(MD_UN2SET(un), device->un_orig_dev,
+ NULL, 0));
+
+ ui->ui_tstate |= MD_INACCESSIBLE;
+
+ SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
+ MD_UN2SET(un), MD_SID(un));
+
+ return (not_opened > 1);
+ }
+
+ for (i = 0; i < un->un_totalcolumncnt; i++) {
+ device = &un->un_column[i];
+ if (device->un_devflags & MD_RAID_DEV_ISOPEN) {
+ if (device->un_devstate & RCS_LAST_ERRED) {
+ /*
+ * At this point in time there is a possibility
+ * that the errors were the result of a controller
+ * failure with more than a single column on it,
+ * so clear out the last-errored columns and let
+ * errors re-occur if necessary.
+ */
+ raid_set_state(un, i, RCS_OKAY, 0);
+ commit++;
+ }
+ continue;
+ }
+ ASSERT(col == -1);
+ col = i;
+ }
+
+ if (col != -1) {
+ raid_set_state(un, col, RCS_ERRED, 0);
+ commit++;
+ }
+
+ if (commit)
+ raid_commit(un, NULL);
+
+ if (col != -1) {
+ if (COLUMN_STATE(un, col) & RCS_ERRED) {
+ SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
+ SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
+ } else if (COLUMN_STATE(un, col) & RCS_LAST_ERRED) {
+ SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
+ SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * NAMES: raid_internal_open
+ * DESCRIPTION: Do the actual RAID open
+ * PARAMETERS: minor_t mnum - minor number of the RAID device
+ * int flag -
+ * int otyp -
+ * int md_oflags - RAID open flags
+ * RETURNS: 0 if successful, nonzero otherwise
+ */
+int
+raid_internal_open(minor_t mnum, int flag, int otyp, int md_oflags)
+{
+ mr_unit_t *un;
+ mdi_unit_t *ui;
+ int err = 0;
+ int replay_error = 0;
+
+ ui = MDI_UNIT(mnum);
+ ASSERT(ui != NULL);
+
+ un = (mr_unit_t *)md_unit_openclose_enter(ui);
+ /*
+ * this MUST be checked before md_unit_isopen is checked.
+ * raid_init_columns sets md_unit_isopen to block reset, halt.
+ */
+ if ((UNIT_STATE(un) & (RUS_INIT | RUS_DOI)) &&
+ !(md_oflags & MD_OFLG_ISINIT)) {
+ md_unit_openclose_exit(ui);
+ return (EAGAIN);
+ }
+
+ if ((md_oflags & MD_OFLG_ISINIT) || md_unit_isopen(ui)) {
+ err = md_unit_incopen(mnum, flag, otyp);
+ goto out;
+ }
+
+ md_unit_readerexit(ui);
+
+ un = (mr_unit_t *)md_unit_writerlock(ui);
+ if (raid_open_all_devs(un, md_oflags) == 0) {
+ if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) {
+ md_unit_writerexit(ui);
+ un = (mr_unit_t *)md_unit_readerlock(ui);
+ raid_close_all_devs(un, 0, md_oflags);
+ goto out;
+ }
+ } else {
+ /*
+ * if this unit has more than one errored component,
+ * return an error and close all of the opened devices
+ */
+
+ md_unit_writerexit(ui);
+ un = (mr_unit_t *)md_unit_readerlock(ui);
+ raid_close_all_devs(un, 0, md_oflags);
+ md_unit_openclose_exit(ui);
+ SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
+ MD_UN2SET(un), MD_SID(un));
+ return (ENXIO);
+ }
+
+ if (!(MD_STATUS(un) & MD_UN_REPLAYED)) {
+ replay_error = raid_replay(un);
+ MD_STATUS(un) |= MD_UN_REPLAYED;
+ }
+
+ md_unit_writerexit(ui);
+ un = (mr_unit_t *)md_unit_readerlock(ui);
+
+ if ((replay_error == RAID_RPLY_READONLY) &&
+ ((flag & (FREAD | FWRITE)) == FREAD)) {
+ md_unit_openclose_exit(ui);
+ return (0);
+ }
+
+ /* allocate hotspare if possible */
+ (void) raid_hotspares();
+
+
+out:
+ md_unit_openclose_exit(ui);
+ return (err);
+}
+/*
+ * NAMES: raid_open
+ * DESCRIPTION: RAID metadevice OPEN entry point
+ * PARAMETERS: dev_t dev -
+ * int flag -
+ * int otyp -
+ * cred_t * cred_p -
+ * int md_oflags -
+ * RETURNS:
+ */
+/*ARGSUSED1*/
+static int
+raid_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
+{
+ int error = 0;
+
+ if (error = raid_internal_open(getminor(*dev), flag, otyp, md_oflags)) {
+ return (error);
+ }
+ return (0);
+}
+
+/*
+ * NAMES: raid_internal_close
+ * DESCRIPTION: RAID metadevice CLOSE actual implementation
+ * PARAMETERS: minor_t - minor number of the RAID device
+ * int otyp -
+ * int init_pw -
+ * int md_cflags - RAID close flags
+ * RETURNS: 0 if successful, nonzero otherwise
+ */
+/*ARGSUSED*/
+int
+raid_internal_close(minor_t mnum, int otyp, int init_pw, int md_cflags)
+{
+ mdi_unit_t *ui = MDI_UNIT(mnum);
+ mr_unit_t *un;
+ int err = 0;
+
+ /* single thread */
+ un = (mr_unit_t *)md_unit_openclose_enter(ui);
+
+ /* count closed */
+ if ((err = md_unit_decopen(mnum, otyp)) != 0)
+ goto out;
+ /* close devices, if necessary */
+ if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
+ raid_close_all_devs(un, init_pw, md_cflags);
+ }
+
+ /* unlock, return success */
+out:
+ md_unit_openclose_exit(ui);
+ return (err);
+}
+
+/*
+ * NAMES: raid_close
+ * DESCRIPTION: RAID metadevice close entry point
+ * PARAMETERS: dev_t dev -
+ * int flag -
+ * int otyp -
+ * cred_t * cred_p -
+ * int md_oflags -
+ * RETURNS:
+ */
+/*ARGSUSED1*/
+static int
+raid_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
+{
+ int retval;
+
+ (void) md_io_writerlock(MDI_UNIT(getminor(dev)));
+ retval = raid_internal_close(getminor(dev), otyp, 1, md_cflags);
+ (void) md_io_writerexit(MDI_UNIT(getminor(dev)));
+ return (retval);
+}
+
+/*
+ * raid_probe_close_all_devs
+ */
+void
+raid_probe_close_all_devs(mr_unit_t *un)
+{
+ int i;
+ mr_column_t *device;
+
+ for (i = 0; i < un->un_totalcolumncnt; i++) {
+ device = &un->un_column[i];
+
+ if (device->un_devflags & MD_RAID_DEV_PROBEOPEN) {
+ md_layered_close(device->un_dev,
+ MD_OFLG_PROBEDEV);
+ device->un_devflags &= ~MD_RAID_DEV_PROBEOPEN;
+ }
+ }
+}
+/*
+ * raid_probe_dev:
+ *
+ * On entry the unit writerlock is held
+ */
+static int
+raid_probe_dev(mdi_unit_t *ui, minor_t mnum)
+{
+ mr_unit_t *un;
+ int i;
+ int not_opened = 0;
+ int commit = 0;
+ int col = -1;
+ mr_column_t *device;
+ int md_devopen = 0;
+
+ if (md_unit_isopen(ui))
+ md_devopen++;
+
+ un = MD_UNIT(mnum);
+ /*
+ * If the state has been set to LAST_ERRED because
+ * of an error when the raid device was open at some
+ * point in the past, don't probe. We really don't want
+ * to reset the state in this case.
+ */
+ if (UNIT_STATE(un) == RUS_LAST_ERRED)
+ return (0);
+
+ ui->ui_tstate &= ~MD_INACCESSIBLE;
+
+ for (i = 0; i < un->un_totalcolumncnt; i++) {
+ md_dev64_t tmpdev;
+
+ device = &un->un_column[i];
+ if (COLUMN_STATE(un, i) & RCS_ERRED) {
+ not_opened++;
+ continue;
+ }
+
+ tmpdev = device->un_dev;
+ /*
+ * Currently the flags passed are not needed since
+ * there cannot be an underlying metadevice. However
+ * they are kept here for consistency.
+ *
+ * Open by device id
+ */
+ tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i)?
+ device->un_hs_key : device->un_orig_key);
+ if (md_layered_open(mnum, &tmpdev,
+ MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV)) {
+ device->un_dev = tmpdev;
+ not_opened++;
+ continue;
+ }
+ device->un_dev = tmpdev;
+
+ device->un_devflags |= MD_RAID_DEV_PROBEOPEN;
+ }
+
+ /*
+ * The code below is careful about setting the LAST_ERRED state.
+ *
+ * If there are open errors and exactly one device has failed, we can
+ * still run. If more than one device fails we have to figure out when
+ * to set the LAST_ERRED state. The rationale is to avoid unnecessary
+ * resyncs since they are painful and time consuming.
+ *
+ * When more than one component/column fails there are 2 scenarios.
+ *
+ * 1. Metadevice has NOT been opened: In this case, the behavior
+ *    mimics the open semantics, i.e. only the first failed device
+ *    is ERRED and LAST_ERRED is not set.
+ *
+ * 2. Metadevice has been opened: Here the read/write semantics are
+ *    followed. The first failed device is ERRED and on the next
+ *    failed device LAST_ERRED is set.
+ */
+
+ if (not_opened > 1 && !md_devopen) {
+ cmn_err(CE_WARN,
+ "md: %s failed to open. open error on %s\n",
+ md_shortname(MD_SID(un)),
+ md_devname(MD_UN2SET(un), device->un_orig_dev,
+ NULL, 0));
+ SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
+ MD_UN2SET(un), MD_SID(un));
+ raid_probe_close_all_devs(un);
+ ui->ui_tstate |= MD_INACCESSIBLE;
+ return (not_opened > 1);
+ }
+
+ if (!md_devopen) {
+ for (i = 0; i < un->un_totalcolumncnt; i++) {
+ device = &un->un_column[i];
+ if (device->un_devflags & MD_RAID_DEV_PROBEOPEN) {
+ if (device->un_devstate & RCS_LAST_ERRED) {
+ /*
+ * At this point in time there is a
+ * possibility that the errors were the
+ * result of a controller failure with
+ * more than a single column on it, so
+ * clear out the last-errored columns and
+ * let errors re-occur if necessary.
+ */
+ raid_set_state(un, i, RCS_OKAY, 0);
+ commit++;
+ }
+ continue;
+ }
+ ASSERT(col == -1);
+ /*
+ * note that if multiple devices are failing, only
+ * the last one is marked as errored
+ */
+ col = i;
+ }
+
+ if (col != -1) {
+ raid_set_state(un, col, RCS_ERRED, 0);
+ commit++;
+ }
+
+ } else {
+ for (i = 0; i < un->un_totalcolumncnt; i++) {
+ device = &un->un_column[i];
+
+ /* if we have LAST_ERRED go ahead and commit. */
+ if (un->un_state & RUS_LAST_ERRED)
+ break;
+ /*
+ * could not open the component
+ */
+
+ if (!(device->un_devflags & MD_RAID_DEV_PROBEOPEN)) {
+ col = i;
+ raid_set_state(un, col, RCS_ERRED, 0);
+ commit++;
+ }
+ }
+ }
+
+ if (commit)
+ raid_commit(un, NULL);
+
+ if (col != -1) {
+ if (COLUMN_STATE(un, col) & RCS_ERRED) {
+ SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
+ SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
+ } else if (COLUMN_STATE(un, col) & RCS_LAST_ERRED) {
+ SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
+ SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
+ }
+ }
+
+ raid_probe_close_all_devs(un);
+ return (0);
+}
+
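+/*
+ * NAME: raid_imp_set
+ * DESCRIPTION: RAID metadevice set import routine; for every raid
+ * record in the imported set, update the component minors and
+ * rewrite the self id, parent id, hotspare pool id and record id
+ * so that they refer to the new set number.
+ * PARAMETERS: set_t setno - set number being imported
+ * RETURNS: 1 if any record was updated, 0 otherwise
+ */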
+static int
+raid_imp_set(
+ set_t setno
+)
+{
+ mddb_recid_t recid;
+ int i, gotsomething;
+ mddb_type_t typ1;
+ mddb_de_ic_t *dep;
+ mddb_rb32_t *rbp;
+ mr_unit_t *un64;
+ mr_unit32_od_t *un32;
+ minor_t *self_id; /* minor needs to be updated */
+ md_parent_t *parent_id; /* parent needs to be updated */
+ mddb_recid_t *record_id; /* record id needs to be updated */
+ hsp_t *hsp_id;
+
+ gotsomething = 0;
+
+ typ1 = (mddb_type_t)md_getshared_key(setno,
+ raid_md_ops.md_driver.md_drivername);
+ recid = mddb_makerecid(setno, 0);
+
+ while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
+ if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
+ continue;
+
+ dep = mddb_getrecdep(recid);
+ rbp = dep->de_rb;
+
+ if (rbp->rb_revision == MDDB_REV_RB) {
+ /*
+ * Small device
+ */
+ un32 = (mr_unit32_od_t *)mddb_getrecaddr(recid);
+ self_id = &(un32->c.un_self_id);
+ parent_id = &(un32->c.un_parent);
+ record_id = &(un32->c.un_record_id);
+ hsp_id = &(un32->un_hsp_id);
+
+ for (i = 0; i < un32->un_totalcolumncnt; i++) {
+ mr_column32_od_t *device;
+
+ device = &un32->un_column[i];
+ if (!md_update_minor(setno, mddb_getsidenum
+ (setno), device->un_orig_key))
+ goto out;
+
+ if (device->un_hs_id != 0)
+ device->un_hs_id = MAKERECID(
+ setno, device->un_hs_id);
+ }
+ } else {
+ un64 = (mr_unit_t *)mddb_getrecaddr(recid);
+ self_id = &(un64->c.un_self_id);
+ parent_id = &(un64->c.un_parent);
+ record_id = &(un64->c.un_record_id);
+ hsp_id = &(un64->un_hsp_id);
+
+ for (i = 0; i < un64->un_totalcolumncnt; i++) {
+ mr_column_t *device;
+
+ device = &un64->un_column[i];
+ if (!md_update_minor(setno, mddb_getsidenum
+ (setno), device->un_orig_key))
+ goto out;
+
+ if (device->un_hs_id != 0)
+ device->un_hs_id = MAKERECID(
+ setno, device->un_hs_id);
+ }
+ }
+
+ /*
+ * Update unit with the imported setno
+ */
+ mddb_setrecprivate(recid, MD_PRV_GOTIT);
+
+ *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
+
+ if (*hsp_id != -1)
+ *hsp_id = MAKERECID(setno, DBID(*hsp_id));
+
+ if (*parent_id != MD_NO_PARENT)
+ *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
+ *record_id = MAKERECID(setno, DBID(*record_id));
+ gotsomething = 1;
+ }
+
+out:
+ return (gotsomething);
+}
+
+static md_named_services_t raid_named_services[] = {
+ {raid_hotspares, "poke hotspares" },
+ {raid_rename_check, MDRNM_CHECK },
+ {raid_rename_lock, MDRNM_LOCK },
+ {(intptr_t (*)()) raid_rename_unlock, MDRNM_UNLOCK },
+ {(intptr_t (*)()) raid_probe_dev, "probe open test" },
+ {NULL, 0 }
+};
+
+md_ops_t raid_md_ops = {
+ raid_open, /* open */
+ raid_close, /* close */
+ md_raid_strategy, /* strategy */
+ NULL, /* print */
+ NULL, /* dump */
+ NULL, /* read */
+ NULL, /* write */
+ md_raid_ioctl, /* ioctl, */
+ raid_snarf, /* raid_snarf */
+ raid_halt, /* raid_halt */
+ NULL, /* aread */
+ NULL, /* awrite */
+ raid_imp_set, /* import set */
+ raid_named_services
+};
+
+static void
+init_init()
+{
+ /* default to a second */
+ if (md_wr_wait == 0)
+ md_wr_wait = md_hz >> 1;
+
+ raid_parent_cache = kmem_cache_create("md_raid_parent",
+ sizeof (md_raidps_t), 0, raid_parent_constructor,
+ raid_parent_destructor, raid_run_queue, NULL, NULL, 0);
+ raid_child_cache = kmem_cache_create("md_raid_child",
+ sizeof (md_raidcs_t) - sizeof (buf_t) + biosize(), 0,
+ raid_child_constructor, raid_child_destructor,
+ raid_run_queue, NULL, NULL, 0);
+ raid_cbuf_cache = kmem_cache_create("md_raid_cbufs",
+ sizeof (md_raidcbuf_t), 0, raid_cbuf_constructor,
+ raid_cbuf_destructor, raid_run_queue, NULL, NULL, 0);
+}
+
+static void
+fini_uninit()
+{
+ kmem_cache_destroy(raid_parent_cache);
+ kmem_cache_destroy(raid_child_cache);
+ kmem_cache_destroy(raid_cbuf_cache);
+ raid_parent_cache = raid_child_cache = raid_cbuf_cache = NULL;
+}
+
+/* define the module linkage */
+MD_PLUGIN_MISC_MODULE("raid module %I%", init_init(), fini_uninit())