| author | stevel@tonic-gate <none@none> | 2005-06-14 00:00:00 -0700 |
|---|---|---|
| committer | stevel@tonic-gate <none@none> | 2005-06-14 00:00:00 -0700 |
| commit | 7c478bd95313f5f23a4c958a745db2134aa03244 (patch) | |
| tree | c871e58545497667cbb4b0a4f2daf204743e1fe7 /usr/src/uts/common/io/lvm/raid/raid.c | |
| download | illumos-joyent-7c478bd95313f5f23a4c958a745db2134aa03244.tar.gz | |
OpenSolaris Launch
Diffstat (limited to 'usr/src/uts/common/io/lvm/raid/raid.c')
| -rw-r--r-- | usr/src/uts/common/io/lvm/raid/raid.c | 4395 |
|---|---|---|

1 file changed, 4395 insertions(+), 0 deletions(-)
diff --git a/usr/src/uts/common/io/lvm/raid/raid.c b/usr/src/uts/common/io/lvm/raid/raid.c
new file mode 100644
index 0000000000..0fec16660d
--- /dev/null
+++ b/usr/src/uts/common/io/lvm/raid/raid.c
@@ -0,0 +1,4395 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * NAME:	raid.c
+ *
+ * DESCRIPTION: Main RAID driver source file containing open, close and I/O
+ *		operations.
+ *
+ * ROUTINES PROVIDED FOR EXTERNAL USE:
+ *  raid_open()		- open the RAID metadevice for access.
+ *  raid_internal_open()	- internal open routine of RAID metadevice.
+ *  md_raid_strategy()	- perform normal I/O operations,
+ *			  such as read and write.
+ *  raid_close()	- close the RAID metadevice.
+ *  raid_internal_close()	- internal close routine of RAID metadevice.
+ *  raid_snarf()	- initialize and clean up MDD records.
+ *  raid_halt()		- reset the RAID metadevice.
+ *  raid_line()		- return the line # of this segment.
+ *  raid_dcolumn()	- return the data column # of this segment.
+ *  raid_pcolumn()	- return the parity column # of this segment.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/file.h>
+#include <sys/user.h>
+#include <sys/uio.h>
+#include <sys/t_lock.h>
+#include <sys/buf.h>
+#include <sys/dkio.h>
+#include <sys/vtoc.h>
+#include <sys/kmem.h>
+#include <vm/page.h>
+#include <sys/cmn_err.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/mkdev.h>
+#include <sys/stat.h>
+#include <sys/open.h>
+#include <sys/modctl.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/debug.h>
+#include <sys/lvm/md_raid.h>
+#include <sys/lvm/mdvar.h>
+#include <sys/lvm/md_convert.h>
+
+#include <sys/sysevent/eventdefs.h>
+#include <sys/sysevent/svm.h>
+
+md_ops_t		raid_md_ops;
+#ifndef	lint
+static char		_depends_on[] = "drv/md";
+md_ops_t		*md_interface_ops = &raid_md_ops;
+#endif	/* lint */
+
+extern unit_t		md_nunits;
+extern unit_t		md_nsets;
+extern md_set_t		md_set[];
+extern int		md_status;
+extern major_t		md_major;
+extern mdq_anchor_t	md_done_daemon;
+extern mdq_anchor_t	md_mstr_daemon;
+extern int		md_sleep_for_test;
+extern clock_t		md_hz;
+
+extern md_event_queue_t	*md_event_queue;
+
+
+int pchunks = 16;
+int phigh = 1024;
+int plow = 128;
+int cchunks = 64;
+int chigh = 1024;
+int clow = 512;
+int bchunks = 32;
+int bhigh = 256;
+int blow = 128;
+
+int raid_total_io = 0;
+int raid_reads = 0;
+int raid_writes = 0;
+int raid_no_bpmaps = 0;
+int raid_512 = 0;
+int raid_1024 = 0;
+int raid_1024_8192 = 0;
+int raid_8192 = 0;
+int raid_8192_bigger = 0;
+int raid_line_lock_wait = 0;
+
+int data_buffer_waits =
0; +int parity_buffer_waits = 0; + +/* writer line locks */ +int raid_writer_locks = 0; /* total writer locks */ +int raid_write_waits = 0; /* total writer locks that waited */ +int raid_full_line_writes = 0; /* total full line writes */ +int raid_write_queue_length = 0; /* wait queue length */ +int raid_max_write_q_length = 0; /* maximum queue length */ +int raid_write_locks_active = 0; /* writer locks at any time */ +int raid_max_write_locks = 0; /* maximum writer locks active */ + +/* read line locks */ +int raid_reader_locks = 0; /* total reader locks held */ +int raid_reader_locks_active = 0; /* reader locks held */ +int raid_max_reader_locks = 0; /* maximum reader locks held in run */ +int raid_read_overlaps = 0; /* number of times 2 reads hit same line */ +int raid_read_waits = 0; /* times a reader waited on writer */ + +/* prewrite stats */ +int raid_prewrite_waits = 0; /* number of waits for a pw slot */ +int raid_pw = 0; /* number of pw slots in use */ +int raid_prewrite_max = 0; /* maximum number of pw slots in use */ +int raid_pw_invalidates = 0; + +static clock_t md_wr_wait = 0; + +int nv_available = 0; /* presence of nv-ram support in device */ +int nv_prewrite = 1; /* mark prewrites with nv_available */ +int nv_parity = 1; /* mark parity with nv_available */ + +kmem_cache_t *raid_parent_cache = NULL; +kmem_cache_t *raid_child_cache = NULL; +kmem_cache_t *raid_cbuf_cache = NULL; + +int raid_internal_open(minor_t mnum, int flag, int otyp, + int md_oflags); + +static void freebuffers(md_raidcs_t *cs); +static int raid_read(mr_unit_t *un, md_raidcs_t *cs); +static void raid_read_io(mr_unit_t *un, md_raidcs_t *cs); +static int raid_write(mr_unit_t *un, md_raidcs_t *cs); +static void raid_write_io(mr_unit_t *un, md_raidcs_t *cs); +static void raid_stage(md_raidcs_t *cs); +static void raid_enqueue(md_raidcs_t *cs); +static diskaddr_t raid_line(diskaddr_t segment, mr_unit_t *un); +uint_t raid_dcolumn(diskaddr_t segment, mr_unit_t *un); +static void getpbuffer(md_raidcs_t *cs); +static void getdbuffer(md_raidcs_t *cs); +static void raid_done(buf_t *bp); +static void raid_io_startup(mr_unit_t *un); + +static rus_state_t +raid_col2unit(rcs_state_t state, rus_state_t unitstate) +{ + switch (state) { + case RCS_INIT: + return (RUS_INIT); + case RCS_OKAY: + return (RUS_OKAY); + case RCS_RESYNC: + if (unitstate & RUS_LAST_ERRED) + return (RUS_LAST_ERRED); + else + return (RUS_ERRED); + case RCS_ERRED: + return (RUS_ERRED); + case RCS_LAST_ERRED: + return (RUS_ERRED); + default: + break; + } + panic("raid_col2unit"); + /*NOTREACHED*/ +} + +void +raid_set_state(mr_unit_t *un, int col, rcs_state_t newstate, int force) +{ + + rus_state_t unitstate, origstate; + rcs_state_t colstate; + rcs_state_t orig_colstate; + int errcnt = 0, + okaycnt = 0, + resynccnt = 0; + int i; + char *devname; + + ASSERT(un); + ASSERT(col < un->un_totalcolumncnt); + ASSERT(newstate & + (RCS_INIT | RCS_INIT_ERRED | RCS_OKAY | RCS_RESYNC | RCS_ERRED | + RCS_LAST_ERRED | RCS_REGEN)); + ASSERT((newstate & + ~(RCS_INIT | RCS_INIT_ERRED | RCS_OKAY | RCS_RESYNC | RCS_ERRED | + RCS_LAST_ERRED | RCS_REGEN)) + == 0); + + ASSERT(MDI_UNIT(MD_SID(un)) ? 
UNIT_WRITER_HELD(un) : 1); + + unitstate = un->un_state; + origstate = unitstate; + + if (force) { + un->un_column[col].un_devstate = newstate; + un->un_state = raid_col2unit(newstate, unitstate); + uniqtime32(&un->un_column[col].un_devtimestamp); + uniqtime32(&un->un_timestamp); + return; + } + + ASSERT(un->un_state & + (RUS_INIT | RUS_OKAY | RUS_ERRED | RUS_DOI | RUS_LAST_ERRED | + RUS_REGEN)); + ASSERT((un->un_state & ~(RUS_INIT | + RUS_OKAY | RUS_ERRED | RUS_DOI | RUS_LAST_ERRED | RUS_REGEN)) == 0); + + if (un->un_column[col].un_devstate == newstate) + return; + + if (newstate == RCS_REGEN) { + if (raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) + return; + un->un_state = RUS_REGEN; + return; + } + + orig_colstate = un->un_column[col].un_devstate; + + /* + * if there is another column in the error state then this + * column should go to the last errored state + */ + for (i = 0; i < un->un_totalcolumncnt; i++) { + if (i == col) + colstate = newstate; + else + colstate = un->un_column[i].un_devstate; + if (colstate & (RCS_ERRED | RCS_LAST_ERRED | RCS_INIT_ERRED)) + errcnt++; + if (colstate & RCS_OKAY) + okaycnt++; + if (colstate & RCS_RESYNC) + resynccnt++; + } + ASSERT(resynccnt < 2); + + if (okaycnt == un->un_totalcolumncnt) + unitstate = RUS_OKAY; + else if (errcnt > 1) { + unitstate = RUS_LAST_ERRED; + if (newstate & RCS_ERRED) + newstate = RCS_LAST_ERRED; + } else if (errcnt == 1) + if (!(unitstate & RUS_LAST_ERRED)) + unitstate = RUS_ERRED; + + if (un->un_state == RUS_DOI) + unitstate = RUS_DOI; + + un->un_column[col].un_devstate = newstate; + uniqtime32(&un->un_column[col].un_devtimestamp); + /* + * if there are last errored column being brought back online + * by open or snarf, then be sure to clear the RUS_LAST_ERRED + * bit to allow writes. If there is a real error then the + * column will go back into last erred. + */ + if ((raid_state_cnt(un, RCS_LAST_ERRED) == 0) && + (raid_state_cnt(un, RCS_ERRED) == 1)) + unitstate = RUS_ERRED; + + un->un_state = unitstate; + uniqtime32(&un->un_timestamp); + + if ((! (origstate & (RUS_ERRED|RUS_LAST_ERRED|RUS_DOI))) && + (unitstate & (RUS_ERRED|RUS_LAST_ERRED|RUS_DOI))) { + devname = md_devname(MD_UN2SET(un), + un->un_column[col].un_dev, NULL, 0); + + cmn_err(CE_WARN, "md: %s: %s needs maintenance", + md_shortname(MD_SID(un)), devname); + + if (unitstate & RUS_LAST_ERRED) { + cmn_err(CE_WARN, "md: %s: %s last erred", + md_shortname(MD_SID(un)), devname); + + } else if (un->un_column[col].un_devflags & + MD_RAID_DEV_ISOPEN) { + /* + * Close the broken device and clear the open flag on + * it. We have to check that the device is open, + * otherwise the first open on it has resulted in the + * error that is being processed and the actual un_dev + * will be NODEV64. + */ + md_layered_close(un->un_column[col].un_dev, + MD_OFLG_NULL); + un->un_column[col].un_devflags &= ~MD_RAID_DEV_ISOPEN; + } + } else if (orig_colstate == RCS_LAST_ERRED && newstate == RCS_ERRED && + un->un_column[col].un_devflags & MD_RAID_DEV_ISOPEN) { + /* + * Similar to logic above except no log messages since we + * are just transitioning from Last Erred to Erred. + */ + md_layered_close(un->un_column[col].un_dev, MD_OFLG_NULL); + un->un_column[col].un_devflags &= ~MD_RAID_DEV_ISOPEN; + } + + /* + * If a resync has completed, see if there is a Last Erred + * component that we can change to the Erred state. 
+	 */
+	if ((orig_colstate == RCS_RESYNC) && (newstate == RCS_OKAY)) {
+		for (i = 0; i < un->un_totalcolumncnt; i++) {
+			if (i != col &&
+			    (un->un_column[i].un_devstate & RCS_LAST_ERRED)) {
+				raid_set_state(un, i, RCS_ERRED, 0);
+				break;
+			}
+		}
+	}
+}
+
+/*
+ * NAME:	erred_check_line
+ *
+ * DESCRIPTION: Return the type of write to perform on an erred column based
+ *		upon any resync activity.
+ *
+ *		If a column is being resynced and the write is above the
+ *		resync point, the write may also have to go to the target
+ *		being resynced.
+ *
+ *		Column state may make it impossible to do the write
+ *		in which case RCL_EIO or RCL_ENXIO is returned.
+ *
+ *		If a column cannot be written directly, RCL_ERRED is
+ *		returned and processing should proceed accordingly.
+ *
+ * PARAMETERS:	minor_t		mnum - minor number identity of metadevice
+ *		md_raidcs_t	*cs - child save structure
+ *		mr_column_t	*dcolumn - pointer to data column structure
+ *		mr_column_t	*pcolumn - pointer to parity column structure
+ *
+ * RETURNS:	RCL_OKAY, RCL_ERRED
+ *
+ * LOCKS:	Expects Line Writer Lock and Unit Resource Lock to be held
+ *		across call.
+ */
+
+static int
+erred_check_line(mr_unit_t *un, md_raidcs_t *cs, mr_column_t *column)
+{
+
+	ASSERT(un != NULL);
+	ASSERT(cs->cs_flags & MD_RCS_LLOCKD);
+
+	if (column->un_devstate & RCS_OKAY)
+		return (RCL_OKAY);
+
+	if (column->un_devstate & RCS_ERRED)
+		return (RCL_ERRED);	/* do not read from errored disk */
+
+	/*
+	 * for the last errored case there are two considerations.
+	 * When the last errored column is the only errored column then
+	 * treat it like a maintenance column and do no I/O from
+	 * it.  When there are other failures then just attempt
+	 * to use it.
+	 */
+	if (column->un_devstate & RCS_LAST_ERRED)
+		return (RCL_ERRED);
+
+	ASSERT(column->un_devstate & RCS_RESYNC);
+
+	/*
+	 * When a resync from a hotspare is being done (copy resync)
+	 * then always treat it as an OKAY column, since no regen
+	 * is required.
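+	 * (A copy resync duplicates the data wholesale onto the target,
+	 * so the column's contents stay readable throughout; no parity
+	 * regeneration is involved.)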
+	 */
+	if (column->un_devflags & MD_RAID_COPY_RESYNC) {
+		return (RCL_OKAY);
+	}
+
+	mutex_enter(&un->un_mx);
+	if (cs->cs_line < un->un_resync_line_index) {
+		mutex_exit(&un->un_mx);
+		return (RCL_OKAY);
+	}
+	mutex_exit(&un->un_mx);
+	return (RCL_ERRED);
+
+}
+
+/*
+ * NAMES:	raid_state_cnt
+ *
+ * DESCRIPTION: counts the number of columns in a specific state
+ *
+ * PARAMETERS:	md_raid_t *un
+ *		rcs_state state
+ */
+int
+raid_state_cnt(mr_unit_t *un, rcs_state_t state)
+{
+	int	i, retval = 0;
+
+	for (i = 0; i < un->un_totalcolumncnt; i++)
+		if (un->un_column[i].un_devstate & state)
+			retval++;
+	return (retval);
+}
+
+/*
+ * NAMES:	raid_io_overlaps
+ *
+ * DESCRIPTION: checks for overlap of 2 child save structures
+ *
+ * PARAMETERS:	md_raidcs_t cs1
+ *		md_raidcs_t cs2
+ *
+ * RETURNS:	0 - no overlap
+ *		1 - overlap
+ */
+int
+raid_io_overlaps(md_raidcs_t *cs1, md_raidcs_t *cs2)
+{
+	if (cs1->cs_blkno > cs2->cs_lastblk)
+		return (0);
+	if (cs1->cs_lastblk < cs2->cs_blkno)
+		return (0);
+	return (1);
+}
+
+/*
+ * NAMES:	raid_parent_constructor
+ * DESCRIPTION: parent structure constructor routine
+ * PARAMETERS:
+ */
+/*ARGSUSED1*/
+static int
+raid_parent_constructor(void *p, void *d1, int d2)
+{
+	mutex_init(&((md_raidps_t *)p)->ps_mx,
+	    NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&((md_raidps_t *)p)->ps_mapin_mx,
+	    NULL, MUTEX_DEFAULT, NULL);
+	return (0);
+}
+
+void
+raid_parent_init(md_raidps_t *ps)
+{
+	bzero(ps, offsetof(md_raidps_t, ps_mx));
+	((md_raidps_t *)ps)->ps_flags = MD_RPS_INUSE;
+	((md_raidps_t *)ps)->ps_magic = RAID_PSMAGIC;
+}
+
+/*ARGSUSED1*/
+static void
+raid_parent_destructor(void *p, void *d)
+{
+	mutex_destroy(&((md_raidps_t *)p)->ps_mx);
+	mutex_destroy(&((md_raidps_t *)p)->ps_mapin_mx);
+}
+
+/*
+ * NAMES:	raid_child_constructor
+ * DESCRIPTION: child structure constructor routine
+ * PARAMETERS:
+ */
+/*ARGSUSED1*/
+static int
+raid_child_constructor(void *p, void *d1, int d2)
+{
+	md_raidcs_t	*cs = (md_raidcs_t *)p;
+	mutex_init(&cs->cs_mx, NULL, MUTEX_DEFAULT, NULL);
+	bioinit(&cs->cs_dbuf);
+	bioinit(&cs->cs_pbuf);
+	bioinit(&cs->cs_hbuf);
+	return (0);
+}
+
+void
+raid_child_init(md_raidcs_t *cs)
+{
+	bzero(cs, offsetof(md_raidcs_t, cs_mx));
+
+	md_bioreset(&cs->cs_dbuf);
+	md_bioreset(&cs->cs_pbuf);
+	md_bioreset(&cs->cs_hbuf);
+
+	((md_raidcs_t *)cs)->cs_dbuf.b_chain =
+	    ((md_raidcs_t *)cs)->cs_pbuf.b_chain =
+	    ((md_raidcs_t *)cs)->cs_hbuf.b_chain =
+	    (struct buf *)(cs);
+
+	cs->cs_magic = RAID_CSMAGIC;
+	cs->cs_line = MD_DISKADDR_ERROR;
+	cs->cs_dpwslot = -1;
+	cs->cs_ppwslot = -1;
+}
+
+/*ARGSUSED1*/
+static void
+raid_child_destructor(void *p, void *d)
+{
+	biofini(&((md_raidcs_t *)p)->cs_dbuf);
+	biofini(&((md_raidcs_t *)p)->cs_hbuf);
+	biofini(&((md_raidcs_t *)p)->cs_pbuf);
+	mutex_destroy(&((md_raidcs_t *)p)->cs_mx);
+}
+
+/*ARGSUSED1*/
+static int
+raid_cbuf_constructor(void *p, void *d1, int d2)
+{
+	bioinit(&((md_raidcbuf_t *)p)->cbuf_bp);
+	return (0);
+}
+
+static void
+raid_cbuf_init(md_raidcbuf_t *cb)
+{
+	bzero(cb, offsetof(md_raidcbuf_t, cbuf_bp));
+	md_bioreset(&cb->cbuf_bp);
+	cb->cbuf_magic = RAID_BUFMAGIC;
+	cb->cbuf_pwslot = -1;
+	cb->cbuf_flags = CBUF_WRITE;
+}
+
+/*ARGSUSED1*/
+static void
+raid_cbuf_destructor(void *p, void *d)
+{
+	biofini(&((md_raidcbuf_t *)p)->cbuf_bp);
+}
+
+/*
+ * NAMES:	raid_run_queue
+ * DESCRIPTION: spawn a backend processing daemon for RAID metadevice.
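+ *		If the global md daemons are not yet live this calls
+ *		md_daemon() directly to service the md_done_daemon queue.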
+ * PARAMETERS: + */ +/*ARGSUSED*/ +static void +raid_run_queue(void *d) +{ + if (!(md_status & MD_GBL_DAEMONS_LIVE)) + md_daemon(1, &md_done_daemon); +} + +/* + * NAME: raid_build_pwslot + * DESCRIPTION: builds mr_pw_reserve for the column + * PARAMETERS: un is the pointer to the unit structure + * colindex is the column to create the structure for + */ +int +raid_build_pw_reservation(mr_unit_t *un, int colindex) +{ + mr_pw_reserve_t *pw; + mr_scoreboard_t *sb; + int i; + + pw = (mr_pw_reserve_t *) kmem_zalloc(sizeof (mr_pw_reserve_t) + + (sizeof (mr_scoreboard_t) * un->un_pwcnt), KM_SLEEP); + pw->pw_magic = RAID_PWMAGIC; + pw->pw_column = colindex; + pw->pw_free = un->un_pwcnt; + sb = &pw->pw_sb[0]; + for (i = 0; i < un->un_pwcnt; i++) { + sb[i].sb_column = colindex; + sb[i].sb_flags = SB_UNUSED; + sb[i].sb_start_blk = 0; + sb[i].sb_last_blk = 0; + sb[i].sb_cs = NULL; + } + un->un_column_ic[colindex].un_pw_reserve = pw; + return (0); +} +/* + * NAME: raid_free_pw_reservation + * DESCRIPTION: RAID metadevice pre-write slot structure destroy routine + * PARAMETERS: mr_unit_t *un - pointer to a unit structure + * int colindex - index of the column whose pre-write slot struct + * is to be destroyed. + */ +void +raid_free_pw_reservation(mr_unit_t *un, int colindex) +{ + mr_pw_reserve_t *pw = un->un_column_ic[colindex].un_pw_reserve; + + kmem_free(pw, sizeof (mr_pw_reserve_t) + + (sizeof (mr_scoreboard_t) * un->un_pwcnt)); +} + +/* + * NAME: raid_cancel_pwslot + * DESCRIPTION: RAID metadevice write routine + * PARAMETERS: md_raidcs_t *cs - pointer to a child structure + */ +static void +raid_cancel_pwslot(md_raidcs_t *cs) +{ + mr_unit_t *un = cs->cs_un; + mr_pw_reserve_t *pw; + mr_scoreboard_t *sb; + mr_column_ic_t *col; + md_raidcbuf_t *cbuf; + int broadcast = 0; + + if (cs->cs_ps->ps_flags & MD_RPS_READ) + return; + if (cs->cs_dpwslot != -1) { + col = &un->un_column_ic[cs->cs_dcolumn]; + pw = col->un_pw_reserve; + sb = &pw->pw_sb[cs->cs_dpwslot]; + sb->sb_flags = SB_AVAIL; + if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW)) + broadcast++; + sb->sb_cs = NULL; + } + + if (cs->cs_ppwslot != -1) { + col = &un->un_column_ic[cs->cs_pcolumn]; + pw = col->un_pw_reserve; + sb = &pw->pw_sb[cs->cs_ppwslot]; + sb->sb_flags = SB_AVAIL; + if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW)) + broadcast++; + sb->sb_cs = NULL; + } + + for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { + if (cbuf->cbuf_pwslot == -1) + continue; + col = &un->un_column_ic[cbuf->cbuf_column]; + pw = col->un_pw_reserve; + sb = &pw->pw_sb[cbuf->cbuf_pwslot]; + sb->sb_flags = SB_AVAIL; + if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW)) + broadcast++; + sb->sb_cs = NULL; + } + if (broadcast) { + cv_broadcast(&un->un_cv); + return; + } + mutex_enter(&un->un_mx); + if (un->un_rflags & MD_RFLAG_NEEDPW) + cv_broadcast(&un->un_cv); + mutex_exit(&un->un_mx); +} + +static void +raid_free_pwinvalidate(md_raidcs_t *cs) +{ + md_raidcbuf_t *cbuf; + md_raidcbuf_t *cbuf_to_free; + mr_unit_t *un = cs->cs_un; + mdi_unit_t *ui = MDI_UNIT(MD_SID(un)); + mr_pw_reserve_t *pw; + mr_scoreboard_t *sb; + int broadcast = 0; + + cbuf = cs->cs_pw_inval_list; + ASSERT(cbuf); + mutex_enter(&un->un_linlck_mx); + while (cbuf) { + pw = un->un_column_ic[cbuf->cbuf_column].un_pw_reserve; + sb = &pw->pw_sb[0]; + ASSERT(sb[cbuf->cbuf_pwslot].sb_flags & SB_INVAL_PEND); + sb[cbuf->cbuf_pwslot].sb_flags = SB_UNUSED; + sb[cbuf->cbuf_pwslot].sb_cs = NULL; + if ((pw->pw_free++ == 0) || (un->un_rflags & MD_RFLAG_NEEDPW)) + 
broadcast++; + cbuf_to_free = cbuf; + cbuf = cbuf->cbuf_next; + kmem_free(cbuf_to_free->cbuf_buffer, dbtob(un->un_iosize)); + kmem_cache_free(raid_cbuf_cache, cbuf_to_free); + } + cs->cs_pw_inval_list = (md_raidcbuf_t *)NULL; + /* + * now that there is a free prewrite slot, check to see if there + * are any io operations waiting first wake up the raid_io_startup + * then signal the the processes waiting in raid_write. + */ + if (ui->ui_io_lock->io_list_front) + raid_io_startup(un); + mutex_exit(&un->un_linlck_mx); + if (broadcast) { + cv_broadcast(&un->un_cv); + return; + } + mutex_enter(&un->un_mx); + if (un->un_rflags & MD_RFLAG_NEEDPW) + cv_broadcast(&un->un_cv); + mutex_exit(&un->un_mx); +} + + +static int +raid_get_pwslot(md_raidcs_t *cs, int column) +{ + mr_scoreboard_t *sb; + mr_pw_reserve_t *pw; + mr_unit_t *un = cs->cs_un; + diskaddr_t start_blk = cs->cs_blkno; + diskaddr_t last_blk = cs->cs_lastblk; + int i; + int pwcnt = un->un_pwcnt; + int avail = -1; + int use = -1; + int flags; + + + /* start with the data column */ + pw = cs->cs_un->un_column_ic[column].un_pw_reserve; + sb = &pw->pw_sb[0]; + ASSERT(pw->pw_free > 0); + for (i = 0; i < pwcnt; i++) { + flags = sb[i].sb_flags; + if (flags & SB_INVAL_PEND) + continue; + + if ((avail == -1) && (flags & (SB_AVAIL | SB_UNUSED))) + avail = i; + + if ((start_blk > sb[i].sb_last_blk) || + (last_blk < sb[i].sb_start_blk)) + continue; + + /* OVERLAP */ + ASSERT(! (sb[i].sb_flags & SB_INUSE)); + + /* + * raid_invalidate_pwslot attempts to zero out prewrite entry + * in parallel with other disk reads/writes related to current + * transaction. however cs_frags accounting for this case is + * broken because raid_write_io resets cs_frags i.e. ignoring + * that it could have been been set to > 0 value by + * raid_invalidate_pwslot. While this can be fixed an + * additional problem is that we don't seem to handle + * correctly the case of getting a disk error for prewrite + * entry invalidation. + * It does not look like we really need + * to invalidate prewrite slots because raid_replay sorts + * prewrite id's in ascending order and during recovery the + * latest prewrite entry for the same block will be replay + * last. That's why i ifdef'd out the call to + * raid_invalidate_pwslot. --aguzovsk@east + */ + + if (use == -1) { + use = i; + } + } + + ASSERT(avail != -1); + pw->pw_free--; + if (use == -1) + use = avail; + + ASSERT(! (sb[use].sb_flags & SB_INUSE)); + sb[use].sb_flags = SB_INUSE; + sb[use].sb_cs = cs; + sb[use].sb_start_blk = start_blk; + sb[use].sb_last_blk = last_blk; + ASSERT((use >= 0) && (use < un->un_pwcnt)); + return (use); +} + +static int +raid_check_pw(md_raidcs_t *cs) +{ + + mr_unit_t *un = cs->cs_un; + int i; + + ASSERT(! (cs->cs_flags & MD_RCS_HAVE_PW_SLOTS)); + /* + * check to be sure there is a prewrite slot available + * if not just return. + */ + if (cs->cs_flags & MD_RCS_LINE) { + for (i = 0; i < un->un_totalcolumncnt; i++) + if (un->un_column_ic[i].un_pw_reserve->pw_free <= 0) + return (1); + return (0); + } + + if (un->un_column_ic[cs->cs_dcolumn].un_pw_reserve->pw_free <= 0) + return (1); + if (un->un_column_ic[cs->cs_pcolumn].un_pw_reserve->pw_free <= 0) + return (1); + return (0); +} +static int +raid_alloc_pwslot(md_raidcs_t *cs) +{ + mr_unit_t *un = cs->cs_un; + md_raidcbuf_t *cbuf; + + ASSERT(! 
(cs->cs_flags & MD_RCS_HAVE_PW_SLOTS)); + if (raid_check_pw(cs)) + return (1); + + mutex_enter(&un->un_mx); + un->un_pwid++; + cs->cs_pwid = un->un_pwid; + mutex_exit(&un->un_mx); + + cs->cs_dpwslot = raid_get_pwslot(cs, cs->cs_dcolumn); + for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { + cbuf->cbuf_pwslot = raid_get_pwslot(cs, cbuf->cbuf_column); + } + cs->cs_ppwslot = raid_get_pwslot(cs, cs->cs_pcolumn); + + cs->cs_flags |= MD_RCS_HAVE_PW_SLOTS; + + return (0); +} + +/* + * NAMES: raid_build_incore + * DESCRIPTION: RAID metadevice incore structure building routine + * PARAMETERS: void *p - pointer to a unit structure + * int snarfing - a flag to indicate snarfing is required + */ +int +raid_build_incore(void *p, int snarfing) +{ + mr_unit_t *un = (mr_unit_t *)p; + minor_t mnum = MD_SID(un); + mddb_recid_t hs_recid = 0; + int i; + int preserve_flags; + mr_column_t *column; + int iosize; + md_dev64_t hs, dev; + int resync_cnt = 0, + error_cnt = 0; + + hs = NODEV64; + dev = NODEV64; + + /* clear out bogus pointer incase we return(1) prior to alloc */ + un->mr_ic = NULL; + + if (MD_STATUS(un) & MD_UN_BEING_RESET) { + mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN); + return (1); + } + + if (MD_UNIT(mnum) != NULL) + return (0); + + if (snarfing) + MD_STATUS(un) = 0; + + un->mr_ic = (mr_unit_ic_t *)kmem_zalloc(sizeof (*un->mr_ic), + KM_SLEEP); + + un->un_column_ic = (mr_column_ic_t *) + kmem_zalloc(sizeof (mr_column_ic_t) * + un->un_totalcolumncnt, KM_SLEEP); + + for (i = 0; i < un->un_totalcolumncnt; i++) { + + column = &un->un_column[i]; + preserve_flags = column->un_devflags & + (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC); + column->un_devflags &= + ~(MD_RAID_ALT_ISOPEN | MD_RAID_DEV_ISOPEN | + MD_RAID_WRITE_ALT); + if (raid_build_pw_reservation(un, i) != 0) { + /* could not build pwslot */ + return (1); + } + + if (snarfing) { + set_t setno = MD_MIN2SET(mnum); + dev = md_getdevnum(setno, mddb_getsidenum(setno), + column->un_orig_key, MD_NOTRUST_DEVT); + /* + * Comment out instead of remove so we have history + * In the pre-SVM releases stored devt is used so + * as long as there is one snarf is always happy + * even the component is powered off. This is not + * the case in current SVM implementation. NODEV64 + * can be returned and in this case since we resolve + * the devt at 'open' time (first use of metadevice) + * we will allow snarf continue. + * + * if (dev == NODEV64) + * return (1); + */ + + /* + * Setup un_orig_dev from device id info if the device + * is valid (not NODEV64). 
+ */ + if (dev != NODEV64) + column->un_orig_dev = dev; + + if (column->un_devstate & RCS_RESYNC) + resync_cnt++; + if (column->un_devstate & (RCS_ERRED | RCS_LAST_ERRED)) + error_cnt++; + + if (HOTSPARED(un, i)) { + (void) md_hot_spare_ifc(HS_MKDEV, + 0, 0, 0, &column->un_hs_id, NULL, + &hs, NULL); + /* + * Same here + * + * if (hs == NODEV64) + * return (1); + */ + } + + if (HOTSPARED(un, i)) { + if (column->un_devstate & + (RCS_OKAY | RCS_LAST_ERRED)) { + column->un_dev = hs; + column->un_pwstart = + column->un_hs_pwstart; + column->un_devstart = + column->un_hs_devstart; + preserve_flags &= + ~(MD_RAID_COPY_RESYNC | + MD_RAID_REGEN_RESYNC); + } else if (column->un_devstate & RCS_RESYNC) { + /* + * if previous system was 4.0 set + * the direction flags + */ + if ((preserve_flags & + (MD_RAID_COPY_RESYNC | + MD_RAID_REGEN_RESYNC)) == 0) { + if (column->un_alt_dev != NODEV64) + preserve_flags |= + MD_RAID_COPY_RESYNC; + else + preserve_flags |= + MD_RAID_REGEN_RESYNC; + } + } + } else { /* no hot spares */ + column->un_dev = dev; + column->un_pwstart = column->un_orig_pwstart; + column->un_devstart = column->un_orig_devstart; + if (column->un_devstate & RCS_RESYNC) { + preserve_flags |= MD_RAID_REGEN_RESYNC; + preserve_flags &= ~MD_RAID_COPY_RESYNC; + } + } + if (! (column->un_devstate & RCS_RESYNC)) { + preserve_flags &= + ~(MD_RAID_REGEN_RESYNC | + MD_RAID_COPY_RESYNC); + } + + column->un_devflags = preserve_flags; + column->un_alt_dev = NODEV64; + column->un_alt_pwstart = 0; + column->un_alt_devstart = 0; + un->un_resync_line_index = 0; + un->un_resync_index = 0; + un->un_percent_done = 0; + } + } + + if (resync_cnt && error_cnt) { + for (i = 0; i < un->un_totalcolumncnt; i++) { + column = &un->un_column[i]; + if (HOTSPARED(un, i) && + (column->un_devstate & RCS_RESYNC) && + (column->un_devflags & MD_RAID_COPY_RESYNC)) + /* hotspare has data */ + continue; + + if (HOTSPARED(un, i) && + (column->un_devstate & RCS_RESYNC)) { + /* hotspare does not have data */ + raid_hs_release(HS_FREE, un, &hs_recid, i); + column->un_dev = column->un_orig_dev; + column->un_pwstart = column->un_orig_pwstart; + column->un_devstart = column->un_orig_devstart; + mddb_setrecprivate(hs_recid, MD_PRV_PENDCOM); + } + + if (column->un_devstate & RCS_ERRED) + column->un_devstate = RCS_LAST_ERRED; + + if (column->un_devstate & RCS_RESYNC) + column->un_devstate = RCS_ERRED; + } + } + mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCOM); + + un->un_pwid = 1; /* or some other possible value */ + un->un_magic = RAID_UNMAGIC; + iosize = un->un_iosize; + un->un_pbuffer = kmem_alloc(dbtob(iosize), KM_SLEEP); + un->un_dbuffer = kmem_alloc(dbtob(iosize), KM_SLEEP); + mutex_init(&un->un_linlck_mx, NULL, MUTEX_DEFAULT, NULL); + cv_init(&un->un_linlck_cv, NULL, CV_DEFAULT, NULL); + un->un_linlck_chn = NULL; + MD_UNIT(mnum) = un; + + + return (0); +} + +/* + * NAMES: reset_raid + * DESCRIPTION: RAID metadevice reset routine + * PARAMETERS: mr_unit_t *un - pointer to a unit structure + * minor_t mnum - RAID metadevice minor number + * int removing - a flag to imply removing device name from + * MDDB database. 
+ */ +void +reset_raid(mr_unit_t *un, minor_t mnum, int removing) +{ + int i, n = 0; + sv_dev_t *sv; + mr_column_t *column; + int column_cnt = un->un_totalcolumncnt; + mddb_recid_t *recids, vtoc_id; + int hserr; + + ASSERT((MDI_UNIT(mnum)->ui_io_lock->io_list_front == NULL) && + (MDI_UNIT(mnum)->ui_io_lock->io_list_back == NULL)); + + md_destroy_unit_incore(mnum, &raid_md_ops); + + MD_UNIT(mnum) = NULL; + + if (un->un_pbuffer) { + kmem_free(un->un_pbuffer, dbtob(un->un_iosize)); + un->un_pbuffer = NULL; + } + if (un->un_dbuffer) { + kmem_free(un->un_dbuffer, dbtob(un->un_iosize)); + un->un_dbuffer = NULL; + } + + /* free all pre-write slots created during build incore */ + for (i = 0; i < un->un_totalcolumncnt; i++) + raid_free_pw_reservation(un, i); + + kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) * + un->un_totalcolumncnt); + + kmem_free(un->mr_ic, sizeof (*un->mr_ic)); + + if (!removing) + return; + + sv = (sv_dev_t *)kmem_zalloc((column_cnt + 1) * sizeof (sv_dev_t), + KM_SLEEP); + + recids = (mddb_recid_t *) + kmem_zalloc((column_cnt + 2) * sizeof (mddb_recid_t), KM_SLEEP); + + for (i = 0; i < column_cnt; i++) { + md_unit_t *comp_un; + md_dev64_t comp_dev; + + column = &un->un_column[i]; + sv[i].setno = MD_MIN2SET(mnum); + sv[i].key = column->un_orig_key; + if (HOTSPARED(un, i)) { + if (column->un_devstate & (RCS_ERRED | RCS_LAST_ERRED)) + hserr = HS_BAD; + else + hserr = HS_FREE; + raid_hs_release(hserr, un, &recids[n++], i); + } + /* + * deparent any metadevices. + * NOTE: currently soft partitions are the only metadevices + * allowed in RAID metadevices. + */ + comp_dev = column->un_dev; + if (md_getmajor(comp_dev) == md_major) { + comp_un = MD_UNIT(md_getminor(comp_dev)); + recids[n++] = MD_RECID(comp_un); + md_reset_parent(comp_dev); + } + } + /* decrement the reference count of the old hsp */ + if (un->un_hsp_id != -1) + (void) md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0, + &recids[n++], NULL, NULL, NULL); + recids[n] = 0; + MD_STATUS(un) |= MD_UN_BEING_RESET; + vtoc_id = un->c.un_vtoc_id; + + raid_commit(un, recids); + + + /* Remove the unit structure */ + mddb_deleterec_wrapper(un->c.un_record_id); + + /* Remove the vtoc, if present */ + if (vtoc_id) + mddb_deleterec_wrapper(vtoc_id); + md_rem_names(sv, column_cnt); + kmem_free(sv, (column_cnt + 1) * sizeof (sv_dev_t)); + kmem_free(recids, (column_cnt + 2) * sizeof (mddb_recid_t)); + + SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE, + MD_MIN2SET(mnum), mnum); +} + +/* + * NAMES: raid_error_parent + * DESCRIPTION: mark a parent structure in error + * PARAMETERS: md_raidcs_t *cs - pointer to child structure + * int error - error value to set + * NOTE: (TBR) - this routine currently is not in use. + */ +static void +raid_error_parent(md_raidps_t *ps, int error) +{ + mutex_enter(&ps->ps_mx); + ps->ps_flags |= MD_RPS_ERROR; + ps->ps_error = error; + mutex_exit(&ps->ps_mx); +} + +/* + * The following defines tell raid_free_parent + * RFP_RLS_LOCK release the unit reader lock when done. 
+ *	RFP_DECR_PWFRAGS	decrement ps_pwfrags
+ *	RFP_DECR_FRAGS		decrement ps_frags
+ *	RFP_DECR_READFRAGS	read keeps FRAGS and PWFRAGS in lockstep
+ */
+#define	RFP_RLS_LOCK		0x00001
+#define	RFP_DECR_PWFRAGS	0x00002
+#define	RFP_DECR_FRAGS		0x00004
+#define	RFP_DECR_READFRAGS	(RFP_DECR_PWFRAGS | RFP_DECR_FRAGS)
+
+/*
+ * NAMES:	raid_free_parent
+ * DESCRIPTION: free a parent structure
+ * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
+ *		int todo - indicates what needs to be done
+ */
+static void
+raid_free_parent(md_raidps_t *ps, int todo)
+{
+	mdi_unit_t	*ui = ps->ps_ui;
+
+	ASSERT(ps->ps_magic == RAID_PSMAGIC);
+	ASSERT(ps->ps_flags & MD_RPS_INUSE);
+	mutex_enter(&ps->ps_mx);
+	if (todo & RFP_DECR_PWFRAGS) {
+		ASSERT(ps->ps_pwfrags);
+		ps->ps_pwfrags--;
+		if (ps->ps_pwfrags == 0 && (! (ps->ps_flags & MD_RPS_IODONE))) {
+			if (ps->ps_flags & MD_RPS_ERROR) {
+				ps->ps_bp->b_flags |= B_ERROR;
+				ps->ps_bp->b_error = ps->ps_error;
+			}
+			md_kstat_done(ui, ps->ps_bp, 0);
+			biodone(ps->ps_bp);
+			ps->ps_flags |= MD_RPS_IODONE;
+		}
+	}
+
+	if (todo & RFP_DECR_FRAGS) {
+		ASSERT(ps->ps_frags);
+		ps->ps_frags--;
+	}
+
+	if (ps->ps_frags != 0) {
+		mutex_exit(&ps->ps_mx);
+		return;
+	}
+
+	ASSERT((ps->ps_frags == 0) && (ps->ps_pwfrags == 0));
+	mutex_exit(&ps->ps_mx);
+
+	if (todo & RFP_RLS_LOCK)
+		md_io_readerexit(ui);
+
+	if (panicstr) {
+		ps->ps_flags |= MD_RPS_DONE;
+		return;
+	}
+
+	if (ps->ps_flags & MD_RPS_HSREQ)
+		(void) raid_hotspares();
+
+	ASSERT(todo & RFP_RLS_LOCK);
+	ps->ps_flags &= ~MD_RPS_INUSE;
+
+	md_dec_iocount(MD_MIN2SET(ps->ps_un->c.un_self_id));
+
+	kmem_cache_free(raid_parent_cache, ps);
+}
+
+/*
+ * NAMES:	raid_free_child
+ * DESCRIPTION: free a child structure
+ * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
+ *		int drop_locks - 0 for no locks held
+ * NOTE: (TBR) - this routine currently is not in use.
+ */
+static void
+raid_free_child(md_raidcs_t *cs, int drop_locks)
+{
+	mr_unit_t	*un = cs->cs_un;
+	md_raidcbuf_t	*cbuf, *cbuf1;
+
+	if (cs->cs_pw_inval_list)
+		raid_free_pwinvalidate(cs);
+
+	if (drop_locks) {
+		ASSERT(cs->cs_flags & MD_RCS_LLOCKD &&
+		    (cs->cs_flags & (MD_RCS_READER | MD_RCS_WRITER)));
+		md_unit_readerexit(MDI_UNIT(MD_SID(un)));
+		raid_line_exit(cs);
+	} else {
+		ASSERT(!(cs->cs_flags & MD_RCS_LLOCKD));
+	}
+
+	freebuffers(cs);
+	cbuf = cs->cs_buflist;
+	while (cbuf) {
+		cbuf1 = cbuf->cbuf_next;
+		kmem_cache_free(raid_cbuf_cache, cbuf);
+		cbuf = cbuf1;
+	}
+	if (cs->cs_dbuf.b_flags & B_REMAPPED)
+		bp_mapout(&cs->cs_dbuf);
+	kmem_cache_free(raid_child_cache, cs);
+}
+
+/*
+ * NAME:	raid_regen_parity
+ *
+ * DESCRIPTION: This routine is used to regenerate the parity blocks
+ *		for the entire raid device.  It is called from
+ *		both the regen thread and the IO path.
+ *
+ *		On error the entire device is marked as in error by
+ *		placing the erroring device in error and all other
+ *		devices in last_errored.
+ *
+ * PARAMETERS:	md_raidcs_t	*cs
+ */
+void
+raid_regen_parity(md_raidcs_t *cs)
+{
+	mr_unit_t	*un = cs->cs_un;
+	mdi_unit_t	*ui = MDI_UNIT(un->c.un_self_id);
+	caddr_t		buffer;
+	caddr_t		parity_buffer;
+	buf_t		*bp;
+	uint_t		*dbuf, *pbuf;
+	uint_t		colcnt = un->un_totalcolumncnt;
+	int		column;
+	int		parity_column = cs->cs_pcolumn;
+	size_t		bcount;
+	int		j;
+
+	/*
+	 * This routine uses the data and parity buffers allocated to a
+	 * write.  In the case of a read the buffers are allocated and
+	 * freed at the end.
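+	 *
+	 * Parity here is the running XOR of every data column: for a
+	 * line with data D0, D1, D2 the parity is P = D0 ^ D1 ^ D2, and
+	 * since XOR is its own inverse any single lost column equals
+	 * the XOR of the survivors (e.g. D1 = P ^ D0 ^ D2).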
+	 */
+
+	ASSERT(IO_READER_HELD(un));
+	ASSERT(cs->cs_flags & MD_RCS_LLOCKD);
+	ASSERT(UNIT_READER_HELD(un));
+
+	if (raid_state_cnt(un, RCS_OKAY) != colcnt)
+		return;
+
+	if (cs->cs_flags & MD_RCS_READER) {
+		getpbuffer(cs);
+		getdbuffer(cs);
+	}
+	ASSERT(cs->cs_dbuffer && cs->cs_pbuffer);
+	bcount = cs->cs_bcount;
+	buffer = cs->cs_dbuffer;
+	parity_buffer = cs->cs_pbuffer;
+	bzero(parity_buffer, bcount);
+	bp = &cs->cs_dbuf;
+	for (column = 0; column < colcnt; column++) {
+		if (column == parity_column)
+			continue;
+		reset_buf(bp, B_READ | B_BUSY, bcount);
+		bp->b_un.b_addr = buffer;
+		bp->b_edev = md_dev64_to_dev(un->un_column[column].un_dev);
+		bp->b_lblkno = cs->cs_blkno + un->un_column[column].un_devstart;
+		bp->b_bcount = bcount;
+		bp->b_bufsize = bcount;
+		(void) md_call_strategy(bp, MD_STR_NOTTOP, NULL);
+		if (biowait(bp))
+			goto bail;
+		pbuf = (uint_t *)(void *)parity_buffer;
+		dbuf = (uint_t *)(void *)buffer;
+		for (j = 0; j < (bcount / (sizeof (uint_t))); j++) {
+			*pbuf = *pbuf ^ *dbuf;
+			pbuf++;
+			dbuf++;
+		}
+	}
+
+	reset_buf(bp, B_WRITE | B_BUSY, cs->cs_bcount);
+	bp->b_un.b_addr = parity_buffer;
+	bp->b_edev = md_dev64_to_dev(un->un_column[parity_column].un_dev);
+	bp->b_lblkno = cs->cs_blkno + un->un_column[parity_column].un_devstart;
+	bp->b_bcount = bcount;
+	bp->b_bufsize = bcount;
+	(void) md_call_strategy(bp, MD_STR_NOTTOP, NULL);
+	if (biowait(bp))
+		goto bail;
+
+	if (cs->cs_flags & MD_RCS_READER) {
+		freebuffers(cs);
+		cs->cs_pbuffer = NULL;
+		cs->cs_dbuffer = NULL;
+	}
+	bp->b_chain = (struct buf *)cs;
+	return;
+bail:
+	if (cs->cs_flags & MD_RCS_READER) {
+		freebuffers(cs);
+		cs->cs_pbuffer = NULL;
+		cs->cs_dbuffer = NULL;
+	}
+	md_unit_readerexit(ui);
+	un = md_unit_writerlock(ui);
+	raid_set_state(un, column, RCS_ERRED, 0);
+	for (column = 0; column < colcnt; column++)
+		raid_set_state(un, column, RCS_ERRED, 0);
+	raid_commit(un, NULL);
+	md_unit_writerexit(ui);
+	un = md_unit_readerlock(ui);
+	bp->b_chain = (struct buf *)cs;
+}
+
+/*
+ * NAMES:	raid_error_state
+ * DESCRIPTION: check unit and column states' impact on I/O error
+ *		NOTE: the state now may not be the state when the
+ *		I/O completed due to race conditions.
+ * PARAMETERS:	mr_unit_t *un - pointer to raid unit structure
+ *		md_raidcs_t *cs - pointer to child structure
+ *		buf_t	  *bp - pointer to buffer structure
+ */
+static int
+raid_error_state(mr_unit_t *un, buf_t *bp)
+{
+	int		column;
+	int		i;
+
+	ASSERT(IO_READER_HELD(un));
+	ASSERT(UNIT_WRITER_HELD(un));
+
+	column = -1;
+	for (i = 0; i < un->un_totalcolumncnt; i++) {
+		if (un->un_column[i].un_dev == md_expldev(bp->b_edev)) {
+			column = i;
+			break;
+		}
+		if (un->un_column[i].un_alt_dev == md_expldev(bp->b_edev)) {
+			column = i;
+			break;
+		}
+	}
+
+	/* in case a replace snuck in while waiting on unit writer lock */
+
+	if (column == -1) {
+		return (0);
+	}
+
+	(void) raid_set_state(un, column, RCS_ERRED, 0);
+	ASSERT(un->un_state & (RUS_ERRED | RUS_LAST_ERRED));
+
+	raid_commit(un, NULL);
+	if (un->un_state & RUS_ERRED) {
+		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_METADEVICE,
+		    MD_UN2SET(un), MD_SID(un));
+	} else if (un->un_state & RUS_LAST_ERRED) {
+		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, SVM_TAG_METADEVICE,
+		    MD_UN2SET(un), MD_SID(un));
+	}
+
+	return (EIO);
+}
+
+/*
+ * NAME:	raid_mapin_buf
+ * DESCRIPTION: wait for the input buffer header to be mapped in
+ * PARAMETERS:	md_raidps_t *ps
+ */
+static void
+raid_mapin_buf(md_raidcs_t *cs)
+{
+	md_raidps_t	*ps = cs->cs_ps;
+
+	/*
+	 * check to see if the buffer is mapped in.
+	 * If it is, set up the data address from the saved offset and
+	 * return.  Since it is expensive to grab a mutex this is only
+	 * done if the mapin is not complete.  Once the mutex is acquired
+	 * it is possible that the mapin was not done, so recheck and if
+	 * necessary do the mapin.
+	 */
+	if (ps->ps_mapin > 0) {
+		cs->cs_addr = ps->ps_addr + cs->cs_offset;
+		return;
+	}
+	mutex_enter(&ps->ps_mapin_mx);
+	if (ps->ps_mapin > 0) {
+		cs->cs_addr = ps->ps_addr + cs->cs_offset;
+		mutex_exit(&ps->ps_mapin_mx);
+		return;
+	}
+	bp_mapin(ps->ps_bp);
+	/*
+	 * get the new b_addr out of the parent since bp_mapin just changed it
+	 */
+	ps->ps_addr = ps->ps_bp->b_un.b_addr;
+	cs->cs_addr = ps->ps_addr + cs->cs_offset;
+	ps->ps_mapin++;
+	mutex_exit(&ps->ps_mapin_mx);
+}
+
+/*
+ * NAMES:	raid_read_no_retry
+ * DESCRIPTION: I/O retry routine for a RAID metadevice read
+ *		read failed attempting to regenerate the data,
+ *		no retry possible, error occurred in raid_raidregenloop().
+ * PARAMETERS:	mr_unit_t   *un - pointer to raid unit structure
+ *		md_raidcs_t *cs - pointer to child structure
+ */
+/*ARGSUSED*/
+static void
+raid_read_no_retry(mr_unit_t *un, md_raidcs_t *cs)
+{
+	md_raidps_t	*ps = cs->cs_ps;
+
+	raid_error_parent(ps, EIO);
+	raid_free_child(cs, 1);
+
+	/* decrement readfrags */
+	raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK);
+}
+
+/*
+ * NAMES:	raid_read_retry
+ * DESCRIPTION: I/O retry routine for a RAID metadevice read
+ * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
+ */
+static void
+raid_read_retry(mr_unit_t *un, md_raidcs_t *cs)
+{
+	/* re-initialize the buf_t structure for raid_read() */
+	cs->cs_dbuf.b_chain = (struct buf *)cs;
+	cs->cs_dbuf.b_back = &cs->cs_dbuf;
+	cs->cs_dbuf.b_forw = &cs->cs_dbuf;
+	cs->cs_dbuf.b_flags = B_BUSY;	/* initialize flags */
+	cs->cs_dbuf.b_error = 0;	/* initialize error */
+	cs->cs_dbuf.b_offset = -1;
+	/* Initialize semaphores */
+	sema_init(&cs->cs_dbuf.b_io, 0, NULL,
+	    SEMA_DEFAULT, NULL);
+	sema_init(&cs->cs_dbuf.b_sem, 0, NULL,
+	    SEMA_DEFAULT, NULL);
+
+	cs->cs_pbuf.b_chain = (struct buf *)cs;
+	cs->cs_pbuf.b_back = &cs->cs_pbuf;
+	cs->cs_pbuf.b_forw = &cs->cs_pbuf;
+	cs->cs_pbuf.b_flags = B_BUSY;	/* initialize flags */
+	cs->cs_pbuf.b_error = 0;	/* initialize error */
+	cs->cs_pbuf.b_offset = -1;
+	sema_init(&cs->cs_pbuf.b_io, 0, NULL,
+	    SEMA_DEFAULT, NULL);
+	sema_init(&cs->cs_pbuf.b_sem, 0, NULL,
+	    SEMA_DEFAULT, NULL);
+
+	cs->cs_flags &= ~MD_RCS_ERROR;	/* reset child error flag */
+	cs->cs_flags |= MD_RCS_RECOVERY;  /* set RECOVERY flag */
+
+	/*
+	 * re-scheduling I/O with raid_read_io() is simpler.  basically,
+	 * raid_read_io() is invoked again with the same child structure.
+	 * (NOTE: we aren't supposed to do any error recovery when an I/O
+	 * error occurred in raid_raidregenloop().)
+	 */
+	raid_mapin_buf(cs);
+	raid_read_io(un, cs);
+}
+
+/*
+ * NAMES:	raid_rderr
+ * DESCRIPTION: I/O error handling routine for a RAID metadevice read
+ * PARAMETERS:	md_raidcs_t *cs - pointer to child structure
+ * LOCKS:	must obtain unit writer lock while calling raid_error_state
+ *		since a unit or column state transition may take place.
+ *		must obtain unit reader lock to retry I/O.
+ */ +/*ARGSUSED*/ +static void +raid_rderr(md_raidcs_t *cs) +{ + md_raidps_t *ps; + mdi_unit_t *ui; + mr_unit_t *un; + int error = 0; + + ps = cs->cs_ps; + ui = ps->ps_ui; + un = (mr_unit_t *)md_unit_writerlock(ui); + ASSERT(un != 0); + + if (cs->cs_dbuf.b_flags & B_ERROR) + error = raid_error_state(un, &cs->cs_dbuf); + if (cs->cs_pbuf.b_flags & B_ERROR) + error |= raid_error_state(un, &cs->cs_pbuf); + + md_unit_writerexit(ui); + + ps->ps_flags |= MD_RPS_HSREQ; + + un = (mr_unit_t *)md_unit_readerlock(ui); + ASSERT(un != 0); + /* now attempt the appropriate retry routine */ + (*(cs->cs_retry_call))(un, cs); +} + + +/* + * NAMES: raid_read_error + * DESCRIPTION: I/O error handling routine for a RAID metadevice read + * PARAMETERS: md_raidcs_t *cs - pointer to child structure + */ +/*ARGSUSED*/ +static void +raid_read_error(md_raidcs_t *cs) +{ + md_raidps_t *ps; + mdi_unit_t *ui; + mr_unit_t *un; + set_t setno; + + ps = cs->cs_ps; + ui = ps->ps_ui; + un = cs->cs_un; + + setno = MD_UN2SET(un); + + if ((cs->cs_dbuf.b_flags & B_ERROR) && + (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_ERRED) && + (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_LAST_ERRED)) + cmn_err(CE_WARN, "md %s: read error on %s", + md_shortname(MD_SID(un)), + md_devname(setno, md_expldev(cs->cs_dbuf.b_edev), NULL, 0)); + + if ((cs->cs_pbuf.b_flags & B_ERROR) && + (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_ERRED) && + (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_LAST_ERRED)) + cmn_err(CE_WARN, "md %s: read error on %s", + md_shortname(MD_SID(un)), + md_devname(setno, md_expldev(cs->cs_pbuf.b_edev), NULL, 0)); + + md_unit_readerexit(ui); + + ASSERT(cs->cs_frags == 0); + + /* now schedule processing for possible state change */ + daemon_request(&md_mstr_daemon, raid_rderr, + (daemon_queue_t *)cs, REQ_OLD); + +} + +/* + * NAMES: getdbuffer + * DESCRIPTION: data buffer allocation for a child structure + * PARAMETERS: md_raidcs_t *cs - pointer to child structure + * + * NOTE: always get dbuffer before pbuffer + * and get both buffers before pwslot + * otherwise a deadlock could be introduced. + */ +static void +getdbuffer(md_raidcs_t *cs) +{ + mr_unit_t *un; + + cs->cs_dbuffer = kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_NOSLEEP); + if (cs->cs_dbuffer != NULL) + return; + un = cs->cs_ps->ps_un; + mutex_enter(&un->un_mx); + while (un->un_dbuffer == NULL) { + STAT_INC(data_buffer_waits); + un->un_rflags |= MD_RFLAG_NEEDBUF; + cv_wait(&un->un_cv, &un->un_mx); + } + cs->cs_dbuffer = un->un_dbuffer; + cs->cs_flags |= MD_RCS_UNDBUF; + un->un_dbuffer = NULL; + mutex_exit(&un->un_mx); +} + +/* + * NAMES: getpbuffer + * DESCRIPTION: parity buffer allocation for a child structure + * PARAMETERS: md_raidcs_t *cs - pointer to child structure + * + * NOTE: always get dbuffer before pbuffer + * and get both buffers before pwslot + * otherwise a deadlock could be introduced. + */ +static void +getpbuffer(md_raidcs_t *cs) +{ + mr_unit_t *un; + + cs->cs_pbuffer = kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_NOSLEEP); + if (cs->cs_pbuffer != NULL) + return; + un = cs->cs_ps->ps_un; + mutex_enter(&un->un_mx); + while (un->un_pbuffer == NULL) { + STAT_INC(parity_buffer_waits); + un->un_rflags |= MD_RFLAG_NEEDBUF; + cv_wait(&un->un_cv, &un->un_mx); + } + cs->cs_pbuffer = un->un_pbuffer; + cs->cs_flags |= MD_RCS_UNPBUF; + un->un_pbuffer = NULL; + mutex_exit(&un->un_mx); +} +static void +getresources(md_raidcs_t *cs) +{ + md_raidcbuf_t *cbuf; + /* + * NOTE: always get dbuffer before pbuffer + * and get both buffers before pwslot + * otherwise a deadlock could be introduced. 
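+	 * (If two allocations took the buffers in opposite orders, each
+	 * could hold one buffer while waiting on the other; a fixed
+	 * acquisition order prevents that cycle.)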
+ */ + getdbuffer(cs); + getpbuffer(cs); + for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) + cbuf->cbuf_buffer = + kmem_alloc(cs->cs_bcount + DEV_BSIZE, KM_SLEEP); +} +/* + * NAMES: freebuffers + * DESCRIPTION: child structure buffer freeing routine + * PARAMETERS: md_raidcs_t *cs - pointer to child structure + */ +static void +freebuffers(md_raidcs_t *cs) +{ + mr_unit_t *un; + md_raidcbuf_t *cbuf; + + /* free buffers used for full line write */ + for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { + if (cbuf->cbuf_buffer == NULL) + continue; + kmem_free(cbuf->cbuf_buffer, cbuf->cbuf_bcount + DEV_BSIZE); + cbuf->cbuf_buffer = NULL; + cbuf->cbuf_bcount = 0; + } + + if (cs->cs_flags & (MD_RCS_UNDBUF | MD_RCS_UNPBUF)) { + un = cs->cs_un; + mutex_enter(&un->un_mx); + } + if (cs->cs_dbuffer) { + if (cs->cs_flags & MD_RCS_UNDBUF) + un->un_dbuffer = cs->cs_dbuffer; + else + kmem_free(cs->cs_dbuffer, cs->cs_bcount + DEV_BSIZE); + } + if (cs->cs_pbuffer) { + if (cs->cs_flags & MD_RCS_UNPBUF) + un->un_pbuffer = cs->cs_pbuffer; + else + kmem_free(cs->cs_pbuffer, cs->cs_bcount + DEV_BSIZE); + } + if (cs->cs_flags & (MD_RCS_UNDBUF | MD_RCS_UNPBUF)) { + un->un_rflags &= ~MD_RFLAG_NEEDBUF; + cv_broadcast(&un->un_cv); + mutex_exit(&un->un_mx); + } +} + +/* + * NAMES: raid_line_reader_lock, raid_line_writer_lock + * DESCRIPTION: RAID metadevice line reader and writer lock routines + * data column # and parity column #. + * PARAMETERS: md_raidcs_t *cs - pointer to child structure + */ + +void +raid_line_reader_lock(md_raidcs_t *cs, int resync_thread) +{ + mr_unit_t *un; + md_raidcs_t *cs1; + + ASSERT(cs->cs_line != MD_DISKADDR_ERROR); + un = cs->cs_un; + cs->cs_flags |= MD_RCS_READER; + STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx)); + if (!panicstr) + mutex_enter(&un->un_linlck_mx); + cs1 = un->un_linlck_chn; + while (cs1 != NULL) { + for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next) + if (raid_io_overlaps(cs, cs1) == 1) + if (cs1->cs_flags & MD_RCS_WRITER) + break; + + if (cs1 != NULL) { + if (panicstr) + panic("md; raid line write lock held"); + un->un_linlck_flg = 1; + cv_wait(&un->un_linlck_cv, &un->un_linlck_mx); + STAT_INC(raid_read_waits); + } + } + STAT_MAX(raid_max_reader_locks, raid_reader_locks_active); + STAT_INC(raid_reader_locks); + cs1 = un->un_linlck_chn; + if (cs1 != NULL) + cs1->cs_linlck_prev = cs; + cs->cs_linlck_next = cs1; + cs->cs_linlck_prev = NULL; + un->un_linlck_chn = cs; + cs->cs_flags |= MD_RCS_LLOCKD; + if (resync_thread) { + diskaddr_t lastblk = cs->cs_blkno + cs->cs_blkcnt - 1; + diskaddr_t line = (lastblk + 1) / un->un_segsize; + ASSERT(raid_state_cnt(un, RCS_RESYNC)); + mutex_enter(&un->un_mx); + un->un_resync_line_index = line; + mutex_exit(&un->un_mx); + } + if (!panicstr) + mutex_exit(&un->un_linlck_mx); +} + +int +raid_line_writer_lock(md_raidcs_t *cs, int lock) +{ + mr_unit_t *un; + md_raidcs_t *cs1; + + ASSERT(cs->cs_line != MD_DISKADDR_ERROR); + cs->cs_flags |= MD_RCS_WRITER; + un = cs->cs_ps->ps_un; + + STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx)); + if (lock && !panicstr) + mutex_enter(&un->un_linlck_mx); + ASSERT(MUTEX_HELD(&un->un_linlck_mx)); + + cs1 = un->un_linlck_chn; + for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next) + if (raid_io_overlaps(cs, cs1)) + break; + + if (cs1 != NULL) { + if (panicstr) + panic("md: line writer lock inaccessible"); + goto no_lock_exit; + } + + if (raid_alloc_pwslot(cs)) { + if (panicstr) + panic("md: no prewrite slots"); + STAT_INC(raid_prewrite_waits); + 
goto no_lock_exit; + } + + cs1 = un->un_linlck_chn; + if (cs1 != NULL) + cs1->cs_linlck_prev = cs; + cs->cs_linlck_next = cs1; + cs->cs_linlck_prev = NULL; + un->un_linlck_chn = cs; + cs->cs_flags |= MD_RCS_LLOCKD; + cs->cs_flags &= ~MD_RCS_WAITING; + STAT_INC(raid_writer_locks); + STAT_MAX(raid_max_write_locks, raid_write_locks_active); + if (lock && !panicstr) + mutex_exit(&un->un_linlck_mx); + return (0); + +no_lock_exit: + /* if this is already queued then do not requeue it */ + ASSERT(! (cs->cs_flags & MD_RCS_LLOCKD)); + if (!lock || (cs->cs_flags & MD_RCS_WAITING)) + return (1); + cs->cs_flags |= MD_RCS_WAITING; + cs->cs_un = un; + raid_enqueue(cs); + if (lock && !panicstr) + mutex_exit(&un->un_linlck_mx); + return (1); +} + +static void +raid_startio(md_raidcs_t *cs) +{ + mdi_unit_t *ui = cs->cs_ps->ps_ui; + mr_unit_t *un = cs->cs_un; + + un = md_unit_readerlock(ui); + raid_write_io(un, cs); +} + +void +raid_io_startup(mr_unit_t *un) +{ + md_raidcs_t *waiting_list, *cs1; + md_raidcs_t *previous = NULL, *next = NULL; + mdi_unit_t *ui = MDI_UNIT(un->c.un_self_id); + kmutex_t *io_list_mutex = &ui->ui_io_lock->io_list_mutex; + + ASSERT(MUTEX_HELD(&un->un_linlck_mx)); + mutex_enter(io_list_mutex); + + /* + * check to be sure there are no reader locks outstanding. If + * there are not then pass on the writer lock. + */ + waiting_list = ui->ui_io_lock->io_list_front; + while (waiting_list) { + ASSERT(waiting_list->cs_flags & MD_RCS_WAITING); + ASSERT(! (waiting_list->cs_flags & MD_RCS_LLOCKD)); + for (cs1 = un->un_linlck_chn; cs1; cs1 = cs1->cs_linlck_next) + if (raid_io_overlaps(waiting_list, cs1) == 1) + break; + /* + * there was an IOs that overlaps this io so go onto + * the next io in the waiting list + */ + if (cs1) { + previous = waiting_list; + waiting_list = waiting_list->cs_linlck_next; + continue; + } + + /* + * There are no IOs that overlap this, so remove it from + * the waiting queue, and start it + */ + + if (raid_check_pw(waiting_list)) { + ASSERT(waiting_list->cs_flags & MD_RCS_WAITING); + previous = waiting_list; + waiting_list = waiting_list->cs_linlck_next; + continue; + } + ASSERT(waiting_list->cs_flags & MD_RCS_WAITING); + + next = waiting_list->cs_linlck_next; + if (previous) + previous->cs_linlck_next = next; + else + ui->ui_io_lock->io_list_front = next; + + if (ui->ui_io_lock->io_list_front == NULL) + ui->ui_io_lock->io_list_back = NULL; + + if (ui->ui_io_lock->io_list_back == waiting_list) + ui->ui_io_lock->io_list_back = previous; + + waiting_list->cs_linlck_next = NULL; + waiting_list->cs_flags &= ~MD_RCS_WAITING; + STAT_DEC(raid_write_queue_length); + if (raid_line_writer_lock(waiting_list, 0)) + panic("region locking corrupted"); + + ASSERT(waiting_list->cs_flags & MD_RCS_LLOCKD); + daemon_request(&md_mstr_daemon, raid_startio, + (daemon_queue_t *)waiting_list, REQ_OLD); + waiting_list = next; + + } + mutex_exit(io_list_mutex); +} + +void +raid_line_exit(md_raidcs_t *cs) +{ + mr_unit_t *un; + + un = cs->cs_ps->ps_un; + STAT_CHECK(raid_line_lock_wait, MUTEX_HELD(&un->un_linlck_mx)); + mutex_enter(&un->un_linlck_mx); + if (cs->cs_flags & MD_RCS_READER) + STAT_DEC(raid_reader_locks_active); + else + STAT_DEC(raid_write_locks_active); + + if (cs->cs_linlck_prev) + cs->cs_linlck_prev->cs_linlck_next = cs->cs_linlck_next; + else + un->un_linlck_chn = cs->cs_linlck_next; + if (cs->cs_linlck_next) + cs->cs_linlck_next->cs_linlck_prev = cs->cs_linlck_prev; + + cs->cs_flags &= ~MD_RCS_LLOCKD; + + if (un->un_linlck_flg) + cv_broadcast(&un->un_linlck_cv); + + 
un->un_linlck_flg = 0; + cs->cs_line = MD_DISKADDR_ERROR; + + raid_cancel_pwslot(cs); + /* + * now that the lock is droped go ahead and see if there are any + * other writes that can be started up + */ + raid_io_startup(un); + + mutex_exit(&un->un_linlck_mx); +} + +/* + * NAMES: raid_line, raid_pcolumn, raid_dcolumn + * DESCRIPTION: RAID metadevice APIs for mapping segment # to line #, + * data column # and parity column #. + * PARAMETERS: int segment - segment number + * mr_unit_t *un - pointer to an unit structure + * RETURNS: raid_line returns line # + * raid_dcolumn returns data column # + * raid_pcolumn returns parity column # + */ +static diskaddr_t +raid_line(diskaddr_t segment, mr_unit_t *un) +{ + diskaddr_t adj_seg; + diskaddr_t line; + diskaddr_t max_orig_segment; + + max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn; + if (segment >= max_orig_segment) { + adj_seg = segment - max_orig_segment; + line = adj_seg % un->un_segsincolumn; + } else { + line = segment / (un->un_origcolumncnt - 1); + } + return (line); +} + +uint_t +raid_dcolumn(diskaddr_t segment, mr_unit_t *un) +{ + diskaddr_t adj_seg; + diskaddr_t line; + diskaddr_t max_orig_segment; + uint_t column; + + max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn; + if (segment >= max_orig_segment) { + adj_seg = segment - max_orig_segment; + column = un->un_origcolumncnt + + (uint_t)(adj_seg / un->un_segsincolumn); + } else { + line = segment / (un->un_origcolumncnt - 1); + column = (uint_t)((segment % (un->un_origcolumncnt - 1) + line) + % un->un_origcolumncnt); + } + return (column); +} + +uint_t +raid_pcolumn(diskaddr_t segment, mr_unit_t *un) +{ + diskaddr_t adj_seg; + diskaddr_t line; + diskaddr_t max_orig_segment; + uint_t column; + + max_orig_segment = (un->un_origcolumncnt - 1) * un->un_segsincolumn; + if (segment >= max_orig_segment) { + adj_seg = segment - max_orig_segment; + line = adj_seg % un->un_segsincolumn; + } else { + line = segment / (un->un_origcolumncnt - 1); + } + column = (uint_t)((line + (un->un_origcolumncnt - 1)) + % un->un_origcolumncnt); + return (column); +} + + +/* + * Is called in raid_iosetup to probe each column to insure + * that all the columns are in 'okay' state and meet the + * 'full line' requirement. If any column is in error, + * we don't want to enable the 'full line' flag. Previously, + * we would do so and disable it only when a error is + * detected after the first 'full line' io which is too late + * and leads to the potential data corruption. + */ +static int +raid_check_cols(mr_unit_t *un) +{ + buf_t bp; + char *buf; + mr_column_t *colptr; + minor_t mnum = MD_SID(un); + int i; + int err = 0; + + buf = kmem_zalloc((uint_t)DEV_BSIZE, KM_SLEEP); + + for (i = 0; i < un->un_totalcolumncnt; i++) { + md_dev64_t tmpdev; + + colptr = &un->un_column[i]; + + tmpdev = colptr->un_dev; + /* + * Open by device id + * If this device is hotspared + * use the hotspare key + */ + tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ? 
+	    colptr->un_hs_key : colptr->un_orig_key);
+
+		if (tmpdev == NODEV64) {
+			err = 1;
+			break;
+		}
+
+		colptr->un_dev = tmpdev;
+
+		bzero((caddr_t)&bp, sizeof (buf_t));
+		bp.b_back = &bp;
+		bp.b_forw = &bp;
+		bp.b_flags = (B_READ | B_BUSY);
+		sema_init(&bp.b_io, 0, NULL,
+		    SEMA_DEFAULT, NULL);
+		sema_init(&bp.b_sem, 0, NULL,
+		    SEMA_DEFAULT, NULL);
+		bp.b_edev = md_dev64_to_dev(colptr->un_dev);
+		bp.b_lblkno = colptr->un_pwstart;
+		bp.b_bcount = DEV_BSIZE;
+		bp.b_bufsize = DEV_BSIZE;
+		bp.b_un.b_addr = (caddr_t)buf;
+		(void) md_call_strategy(&bp, 0, NULL);
+		if (biowait(&bp)) {
+			err = 1;
+			break;
+		}
+	}
+
+	kmem_free(buf, DEV_BSIZE);
+	return (err);
+}
+
+/*
+ * NAME:	raid_iosetup
+ * DESCRIPTION: RAID metadevice specific I/O set up routine which does
+ *		all the necessary calculations to determine the location
+ *		of the segment for the I/O.
+ * PARAMETERS:	mr_unit_t *un - unit number of RAID metadevice
+ *		diskaddr_t blkno - block number of the I/O attempt
+ *		size_t blkcnt - block count for this I/O
+ *		md_raidcs_t *cs - child structure for each segmented I/O
+ *
+ * NOTE:	The following is an example of a raid disk layout:
+ *
+ *	Total Column = 5
+ *	Original Column = 4
+ *	Segment Per Column = 10
+ *
+ *		Col#0	Col#1	Col#2	Col#3	Col#4	Col#5	Col#6
+ *	-------------------------------------------------------------
+ *	line#0	Seg#0	Seg#1	Seg#2	Parity	Seg#30	Seg#40
+ *	line#1	Parity	Seg#3	Seg#4	Seg#5	Seg#31
+ *	line#2	Seg#8	Parity	Seg#6	Seg#7	Seg#32
+ *	line#3	Seg#10	Seg#11	Parity	Seg#9	Seg#33
+ *	line#4	Seg#12	Seg#13	Seg#14	Parity	Seg#34
+ *	line#5	Parity	Seg#15	Seg#16	Seg#17	Seg#35
+ *	line#6	Seg#20	Parity	Seg#18	Seg#19	Seg#36
+ *	line#7	Seg#22	Seg#23	Parity	Seg#21	Seg#37
+ *	line#8	Seg#24	Seg#25	Seg#26	Parity	Seg#38
+ *	line#9	Parity	Seg#27	Seg#28	Seg#29	Seg#39
+ */
+static size_t
+raid_iosetup(
+	mr_unit_t	*un,
+	diskaddr_t	blkno,
+	size_t		blkcnt,
+	md_raidcs_t	*cs
+)
+{
+	diskaddr_t	segment;
+	diskaddr_t	segstart;
+	diskaddr_t	segoff;
+	size_t		leftover;
+	diskaddr_t	line;
+	uint_t		iosize;
+	uint_t		colcnt;
+
+	/* calculate the segment# and offset for the block */
+	segment = blkno / un->un_segsize;
+	segstart = segment * un->un_segsize;
+	segoff = blkno - segstart;
+	iosize = un->un_iosize - 1;
+	colcnt = un->un_totalcolumncnt - 1;
+	line = raid_line(segment, un);
+	cs->cs_dcolumn = raid_dcolumn(segment, un);
+	cs->cs_pcolumn = raid_pcolumn(segment, un);
+	cs->cs_dflags = un->un_column[cs->cs_dcolumn].un_devflags;
+	cs->cs_pflags = un->un_column[cs->cs_pcolumn].un_devflags;
+	cs->cs_line = line;
+
+	if ((cs->cs_ps->ps_flags & MD_RPS_WRITE) &&
+	    (UNIT_STATE(un) & RCS_OKAY) &&
+	    (segoff == 0) &&
+	    (un->un_totalcolumncnt == un->un_origcolumncnt) &&
+	    (un->un_segsize < un->un_iosize) &&
+	    (un->un_iosize <= un->un_maxio) &&
+	    (blkno == line * un->un_segsize * colcnt) &&
+	    (blkcnt >= ((un->un_totalcolumncnt - 1) * un->un_segsize)) &&
+	    (raid_state_cnt(un, RCS_OKAY) == un->un_origcolumncnt) &&
+	    (raid_check_cols(un) == 0)) {
+
+		md_raidcbuf_t	**cbufp;
+		md_raidcbuf_t	*cbuf;
+		int		i, j;
+
+		STAT_INC(raid_full_line_writes);
+		leftover = blkcnt - (un->un_segsize * colcnt);
+		ASSERT(blkcnt >= (un->un_segsize * colcnt));
+		cs->cs_blkno = line * un->un_segsize;
+		cs->cs_blkcnt = un->un_segsize;
+		cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1;
+		cs->cs_bcount = dbtob(cs->cs_blkcnt);
+		cs->cs_flags |= MD_RCS_LINE;
+
+		cbufp = &cs->cs_buflist;
+		for (i = 0; i < un->un_totalcolumncnt; i++) {
+			j = cs->cs_dcolumn + i;
+			j = j % un->un_totalcolumncnt;
+
+			if ((j ==
cs->cs_pcolumn)) + continue; + cbuf = kmem_cache_alloc(raid_cbuf_cache, + MD_ALLOCFLAGS); + raid_cbuf_init(cbuf); + cbuf->cbuf_un = cs->cs_un; + cbuf->cbuf_ps = cs->cs_ps; + cbuf->cbuf_column = j; + cbuf->cbuf_bcount = dbtob(un->un_segsize); + *cbufp = cbuf; + cbufp = &cbuf->cbuf_next; + } + return (leftover); + } + + leftover = blkcnt - (un->un_segsize - segoff); + if (blkcnt > (un->un_segsize - segoff)) + blkcnt -= leftover; + else + leftover = 0; + + if (blkcnt > (size_t)iosize) { + leftover += (blkcnt - iosize); + blkcnt = iosize; + } + + /* calculate the line# and column# for the segment */ + cs->cs_flags &= ~MD_RCS_LINE; + cs->cs_blkno = line * un->un_segsize + segoff; + cs->cs_blkcnt = (uint_t)blkcnt; + cs->cs_lastblk = cs->cs_blkno + cs->cs_blkcnt - 1; + cs->cs_bcount = dbtob((uint_t)blkcnt); + return (leftover); +} + +/* + * NAME: raid_done + * DESCRIPTION: RAID metadevice I/O done interrupt routine + * PARAMETERS: struct buf *bp - pointer to a buffer structure + */ +static void +raid_done(struct buf *bp) +{ + md_raidcs_t *cs; + int flags, frags; + + sema_v(&bp->b_io); + cs = (md_raidcs_t *)bp->b_chain; + + ASSERT(cs != NULL); + + mutex_enter(&cs->cs_mx); + if (bp->b_flags & B_ERROR) { + cs->cs_flags |= MD_RCS_ERROR; + cs->cs_flags &= ~(MD_RCS_ISCALL); + } + + flags = cs->cs_flags; + frags = --cs->cs_frags; + mutex_exit(&cs->cs_mx); + if (frags != 0) { + return; + } + + if (flags & MD_RCS_ERROR) { + if (cs->cs_error_call) { + daemon_request(&md_done_daemon, cs->cs_error_call, + (daemon_queue_t *)cs, REQ_OLD); + } + return; + } + + if (flags & MD_RCS_ISCALL) { + cs->cs_flags &= ~(MD_RCS_ISCALL); + (*(cs->cs_call))(cs); + return; + } + daemon_request(&md_done_daemon, cs->cs_call, + (daemon_queue_t *)cs, REQ_OLD); +} +/* + * the flag RIO_EXTRA is used when dealing with a column in the process + * of being resynced. During the resync, writes may have to take place + * on both the original component and a hotspare component. 
+ */ +#define RIO_DATA 0x00100 /* use data buffer & data column */ +#define RIO_PARITY 0x00200 /* use parity buffer & parity column */ +#define RIO_WRITE 0x00400 /* issue a write */ +#define RIO_READ 0x00800 /* issue a read */ +#define RIO_PWIO 0x01000 /* do the I/O to the prewrite entry */ +#define RIO_ALT 0x02000 /* do write to alternate device */ +#define RIO_EXTRA 0x04000 /* use extra buffer */ + +#define RIO_COLMASK 0x000ff + +#define RIO_PREWRITE (RIO_WRITE | RIO_PWIO) + +/* + * NAME: raidio + * DESCRIPTION: RAID metadevice I/O routine; issues a read or write + * against a single data or parity column as directed by + * the RIO_* flags. + * PARAMETERS: md_raidcs_t *cs - pointer to a child structure + */ +static void +raidio(md_raidcs_t *cs, int flags) +{ + buf_t *bp; + int column; + int flag; + void *private; + mr_unit_t *un; + int iosize; + diskaddr_t pwstart; + diskaddr_t devstart; + md_dev64_t dev; + + un = cs->cs_un; + + ASSERT(IO_READER_HELD(un)); + ASSERT(UNIT_READER_HELD(un)); + + if (flags & RIO_DATA) { + if (flags & RIO_EXTRA) + bp = &cs->cs_hbuf; + else + bp = &cs->cs_dbuf; + bp->b_un.b_addr = cs->cs_dbuffer; + column = cs->cs_dcolumn; + } else { + if (flags & RIO_EXTRA) + bp = &cs->cs_hbuf; + else + bp = &cs->cs_pbuf; + bp->b_un.b_addr = cs->cs_pbuffer; + column = cs->cs_pcolumn; + } + if (flags & RIO_COLMASK) + column = (flags & RIO_COLMASK) - 1; + + bp->b_bcount = cs->cs_bcount; + bp->b_bufsize = cs->cs_bcount; + iosize = un->un_iosize; + + /* check if the hotspared device will be used */ + if (flags & RIO_ALT && (flags & RIO_WRITE)) { + pwstart = un->un_column[column].un_alt_pwstart; + devstart = un->un_column[column].un_alt_devstart; + dev = un->un_column[column].un_alt_dev; + } else { + pwstart = un->un_column[column].un_pwstart; + devstart = un->un_column[column].un_devstart; + dev = un->un_column[column].un_dev; + } + + /* if not writing to the prewrite log skip the log header */ + if ((flags & RIO_PWIO) == 0) { + bp->b_lblkno = devstart + cs->cs_blkno; + bp->b_un.b_addr += DEV_BSIZE; + } else { + bp->b_bcount += DEV_BSIZE; + bp->b_bufsize = bp->b_bcount; + if (flags & RIO_DATA) { + bp->b_lblkno = cs->cs_dpwslot * iosize + pwstart; + } else { /* not DATA -> PARITY */ + bp->b_lblkno = cs->cs_ppwslot * iosize + pwstart; + } + } + + bp->b_flags &= ~(B_READ | B_WRITE | B_ERROR | nv_available); + bp->b_flags |= B_BUSY; + if (flags & RIO_READ) { + bp->b_flags |= B_READ; + } else { + bp->b_flags |= B_WRITE; + if ((nv_available && nv_parity && (flags & RIO_PARITY)) || + (nv_available && nv_prewrite && (flags & RIO_PWIO))) + bp->b_flags |= nv_available; + } + bp->b_iodone = (int (*)())raid_done; + bp->b_edev = md_dev64_to_dev(dev); + + ASSERT((bp->b_edev != 0) && (bp->b_edev != NODEV)); + + private = cs->cs_strategy_private; + flag = cs->cs_strategy_flag; + + md_call_strategy(bp, flag, private); +} + +/* + * NAME: genstandardparity + * DESCRIPTION: This routine performs the read-modify-write parity + * calculation for a single-segment write: the new parity + * is the old parity XOR the old data XOR the new data. + * It also fills in the prewrite headers for the data and + * parity buffers. + * PARAMETERS: md_raidcs_t *cs - pointer to a child structure + */ +static void +genstandardparity(md_raidcs_t *cs) +{ + uint_t *dbuf, *pbuf; + size_t wordcnt; + uint_t dsum = 0; + uint_t psum = 0; + + ASSERT((cs->cs_bcount & 0x3) == 0); + + wordcnt = cs->cs_bcount / sizeof (uint_t); + + dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); + pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); + + /* Word aligned */ + if (((uintptr_t)cs->cs_addr & 0x3) == 0) { + uint_t *uwbuf = (uint_t *)(void *)(cs->cs_addr); + uint_t uval; + + while (wordcnt--) { + uval = *uwbuf++; + psum ^= (*pbuf = ((*pbuf ^ *dbuf) ^ uval)); + ++pbuf; + *dbuf = uval; + dsum ^= uval; + ++dbuf; + } + } else { + uchar_t *ubbuf = (uchar_t *)(cs->cs_addr); + union
{ + uint_t wb; + uchar_t bb[4]; + } cb; + + while (wordcnt--) { + cb.bb[0] = *ubbuf++; + cb.bb[1] = *ubbuf++; + cb.bb[2] = *ubbuf++; + cb.bb[3] = *ubbuf++; + psum ^= (*pbuf = ((*pbuf ^ *dbuf) ^ cb.wb)); + ++pbuf; + *dbuf = cb.wb; + dsum ^= cb.wb; + ++dbuf; + } + } + + RAID_FILLIN_RPW(cs->cs_dbuffer, cs->cs_un, dsum, cs->cs_pcolumn, + cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, + 2, cs->cs_dcolumn, RAID_PWMAGIC); + + RAID_FILLIN_RPW(cs->cs_pbuffer, cs->cs_un, psum, cs->cs_dcolumn, + cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, + 2, cs->cs_pcolumn, RAID_PWMAGIC); +} + +static void +genlineparity(md_raidcs_t *cs) +{ + + mr_unit_t *un = cs->cs_un; + md_raidcbuf_t *cbuf; + uint_t *pbuf, *dbuf; + uint_t *uwbuf; + uchar_t *ubbuf; + size_t wordcnt; + uint_t psum = 0, dsum = 0; + size_t count = un->un_segsize * DEV_BSIZE; + uint_t col; + buf_t *bp; + + ASSERT((cs->cs_bcount & 0x3) == 0); + + pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); + dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); + uwbuf = (uint_t *)(void *)(cs->cs_addr); + ubbuf = (uchar_t *)(void *)(cs->cs_addr); + + wordcnt = count / sizeof (uint_t); + + /* Word aligned */ + if (((uintptr_t)cs->cs_addr & 0x3) == 0) { + uint_t uval; + + while (wordcnt--) { + uval = *uwbuf++; + *dbuf = uval; + *pbuf = uval; + dsum ^= uval; + ++pbuf; + ++dbuf; + } + } else { + union { + uint_t wb; + uchar_t bb[4]; + } cb; + + while (wordcnt--) { + cb.bb[0] = *ubbuf++; + cb.bb[1] = *ubbuf++; + cb.bb[2] = *ubbuf++; + cb.bb[3] = *ubbuf++; + *dbuf = cb.wb; + *pbuf = cb.wb; + dsum ^= cb.wb; + ++pbuf; + ++dbuf; + } + } + + RAID_FILLIN_RPW(cs->cs_dbuffer, un, dsum, cs->cs_pcolumn, + cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, + un->un_totalcolumncnt, cs->cs_dcolumn, RAID_PWMAGIC); + + raidio(cs, RIO_PREWRITE | RIO_DATA); + + for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { + + dsum = 0; + pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); + dbuf = (uint_t *)(void *)(cbuf->cbuf_buffer + DEV_BSIZE); + + wordcnt = count / sizeof (uint_t); + + col = cbuf->cbuf_column; + + /* Word aligned */ + if (((uintptr_t)cs->cs_addr & 0x3) == 0) { + uint_t uval; + + /* + * Only calculate psum when working on the last + * data buffer. + */ + if (cbuf->cbuf_next == NULL) { + psum = 0; + while (wordcnt--) { + uval = *uwbuf++; + *dbuf = uval; + psum ^= (*pbuf ^= uval); + dsum ^= uval; + ++dbuf; + ++pbuf; + } + } else { + while (wordcnt--) { + uval = *uwbuf++; + *dbuf = uval; + *pbuf ^= uval; + dsum ^= uval; + ++dbuf; + ++pbuf; + } + } + } else { + union { + uint_t wb; + uchar_t bb[4]; + } cb; + + /* + * Only calculate psum when working on the last + * data buffer. 
+ */ + if (cbuf->cbuf_next == NULL) { + psum = 0; + while (wordcnt--) { + cb.bb[0] = *ubbuf++; + cb.bb[1] = *ubbuf++; + cb.bb[2] = *ubbuf++; + cb.bb[3] = *ubbuf++; + *dbuf = cb.wb; + psum ^= (*pbuf ^= cb.wb); + dsum ^= cb.wb; + ++dbuf; + ++pbuf; + } + } else { + while (wordcnt--) { + cb.bb[0] = *ubbuf++; + cb.bb[1] = *ubbuf++; + cb.bb[2] = *ubbuf++; + cb.bb[3] = *ubbuf++; + *dbuf = cb.wb; + *pbuf ^= cb.wb; + dsum ^= cb.wb; + ++dbuf; + ++pbuf; + } + } + } + RAID_FILLIN_RPW(cbuf->cbuf_buffer, un, dsum, cs->cs_pcolumn, + cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, + un->un_totalcolumncnt, col, RAID_PWMAGIC); + + /* + * fill in buffer for write to prewrite area + */ + bp = &cbuf->cbuf_bp; + bp->b_un.b_addr = cbuf->cbuf_buffer; + bp->b_bcount = cbuf->cbuf_bcount + DEV_BSIZE; + bp->b_bufsize = bp->b_bcount; + bp->b_lblkno = (cbuf->cbuf_pwslot * un->un_iosize) + + un->un_column[col].un_pwstart; + bp->b_flags = B_WRITE | B_BUSY; + if (nv_available && nv_prewrite) + bp->b_flags |= nv_available; + bp->b_iodone = (int (*)())raid_done; + bp->b_edev = md_dev64_to_dev(un->un_column[col].un_dev); + bp->b_chain = (struct buf *)cs; + md_call_strategy(bp, + cs->cs_strategy_flag, cs->cs_strategy_private); + } + + RAID_FILLIN_RPW(cs->cs_pbuffer, un, psum, cs->cs_dcolumn, + cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, + un->un_totalcolumncnt, cs->cs_pcolumn, RAID_PWMAGIC); + + raidio(cs, RIO_PREWRITE | RIO_PARITY); +} + +/* + * NAME: raid_readregenloop + * DESCRIPTION: RAID metadevice read regeneration routine; rebuilds the + * data of an errored column by XORing together the remaining + * columns, one column per iteration. + * PARAMETERS: md_raidcs_t *cs - pointer to a child structure + */ +static void +raid_readregenloop(md_raidcs_t *cs) +{ + mr_unit_t *un; + md_raidps_t *ps; + uint_t *dbuf; + uint_t *pbuf; + size_t wordcnt; + + un = cs->cs_un; + + /* + * XOR the parity with data bytes, must skip the + * pre-write entry header in all data/parity buffers + */ + wordcnt = cs->cs_bcount / sizeof (uint_t); + dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); + pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); + while (wordcnt--) + *dbuf++ ^= *pbuf++; + + /* bump up the loop count */ + cs->cs_loop++; + + /* skip the errored component */ + if (cs->cs_loop == cs->cs_dcolumn) + cs->cs_loop++; + + if (cs->cs_loop != un->un_totalcolumncnt) { + cs->cs_frags = 1; + raidio(cs, RIO_PARITY | RIO_READ | (cs->cs_loop + 1)); + return; + } + /* reached the end of the loop */ + ps = cs->cs_ps; + bcopy(cs->cs_dbuffer + DEV_BSIZE, cs->cs_addr, cs->cs_bcount); + raid_free_child(cs, 1); + + /* decrement readfrags */ + raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK); +} + +/* + * NAME: raid_read_io + * DESCRIPTION: RAID metadevice read I/O routine + * PARAMETERS: mr_unit_t *un - pointer to a unit structure + * md_raidcs_t *cs - pointer to a child structure + */ +static void +raid_read_io(mr_unit_t *un, md_raidcs_t *cs) +{ + int flag; + void *private; + buf_t *bp; + buf_t *pb = cs->cs_ps->ps_bp; + mr_column_t *column; + + flag = cs->cs_strategy_flag; + private = cs->cs_strategy_private; + column = &un->un_column[cs->cs_dcolumn]; + + /* + * The component to be read is good; simply set up the bp structure + * and call the low level md routine to do the read.
+ */ + + if (COLUMN_ISOKAY(un, cs->cs_dcolumn) || + (COLUMN_ISLASTERR(un, cs->cs_dcolumn) && + (cs->cs_flags & MD_RCS_RECOVERY) == 0)) { + dev_t ddi_dev; /* needed for bioclone, so not md_dev64_t */ + ddi_dev = md_dev64_to_dev(column->un_dev); + + bp = &cs->cs_dbuf; + bp = md_bioclone(pb, cs->cs_offset, cs->cs_bcount, ddi_dev, + column->un_devstart + cs->cs_blkno, + (int (*)())raid_done, bp, KM_NOSLEEP); + + bp->b_chain = (buf_t *)cs; + + cs->cs_frags = 1; + cs->cs_error_call = raid_read_error; + cs->cs_retry_call = raid_read_retry; + cs->cs_flags |= MD_RCS_ISCALL; + cs->cs_stage = RAID_READ_DONE; + cs->cs_call = raid_stage; + + ASSERT(bp->b_edev != 0); + + md_call_strategy(bp, flag, private); + return; + } + + /* + * The component to be read is bad, have to go through + * raid specific method to read data from other members. + */ + cs->cs_loop = 0; + /* + * NOTE: always get dbuffer before pbuffer + * and get both buffers before pwslot + * otherwise a deadlock could be introduced. + */ + raid_mapin_buf(cs); + getdbuffer(cs); + getpbuffer(cs); + if (cs->cs_loop == cs->cs_dcolumn) + cs->cs_loop++; + + /* zero out data buffer for use as a data sink */ + bzero(cs->cs_dbuffer + DEV_BSIZE, cs->cs_bcount); + cs->cs_stage = RAID_NONE; + cs->cs_call = raid_readregenloop; + cs->cs_error_call = raid_read_error; + cs->cs_retry_call = raid_read_no_retry; + cs->cs_frags = 1; + + /* use parity buffer to read other columns */ + raidio(cs, RIO_PARITY | RIO_READ | (cs->cs_loop + 1)); +} + +/* + * NAME: raid_read + * DESCRIPTION: RAID metadevice read routine + * PARAMETERS: mr_unit_t *un - pointer to a unit structure + * md_raidcs_t *cs - pointer to a child structure + */ +static int +raid_read(mr_unit_t *un, md_raidcs_t *cs) +{ + int error = 0; + md_raidps_t *ps; + mdi_unit_t *ui; + minor_t mnum; + + ASSERT(IO_READER_HELD(un)); + ps = cs->cs_ps; + ui = ps->ps_ui; + raid_line_reader_lock(cs, 0); + un = (mr_unit_t *)md_unit_readerlock(ui); + ASSERT(UNIT_STATE(un) != RUS_INIT); + mnum = MD_SID(un); + cs->cs_un = un; + + /* make sure the read doesn't go beyond the end of the column */ + if (cs->cs_blkno + cs->cs_blkcnt > + un->un_segsize * un->un_segsincolumn) { + error = ENXIO; + } + if (error) + goto rerror; + + if (un->un_state & RUS_REGEN) { + raid_regen_parity(cs); + un = MD_UNIT(mnum); + cs->cs_un = un; + } + + raid_read_io(un, cs); + return (0); + +rerror: + raid_error_parent(ps, error); + raid_free_child(cs, 1); + /* decrement readfrags */ + raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK); + return (0); +} + +/* + * NAME: raid_write_err_retry + * DESCRIPTION: RAID metadevice write retry routine + * write was for parity or data only; + * complete write with error, no recovery possible + * PARAMETERS: mr_unit_t *un - pointer to a unit structure + * md_raidcs_t *cs - pointer to a child structure + */ +/*ARGSUSED*/ +static void +raid_write_err_retry(mr_unit_t *un, md_raidcs_t *cs) +{ + md_raidps_t *ps = cs->cs_ps; + int flags = RFP_DECR_FRAGS | RFP_RLS_LOCK; + + /* decrement pwfrags if needed, and frags */ + if (!(cs->cs_flags & MD_RCS_PWDONE)) + flags |= RFP_DECR_PWFRAGS; + raid_error_parent(ps, EIO); + raid_free_child(cs, 1); + raid_free_parent(ps, flags); +} + +/* + * NAME: raid_write_no_retry + * DESCRIPTION: RAID metadevice write retry routine + * write is too far along to retry and parent + * has already been signaled with iodone.
+ * PARAMETERS: mr_unit_t *un - pointer to a unit structure + * md_raidcs_t *cs - pointer to a child structure + */ +/*ARGSUSED*/ +static void +raid_write_no_retry(mr_unit_t *un, md_raidcs_t *cs) +{ + md_raidps_t *ps = cs->cs_ps; + int flags = RFP_DECR_FRAGS | RFP_RLS_LOCK; + + /* decrement pwfrags if needed, and frags */ + if (!(cs->cs_flags & MD_RCS_PWDONE)) + flags |= RFP_DECR_PWFRAGS; + raid_free_child(cs, 1); + raid_free_parent(ps, flags); +} + +/* + * NAME: raid_write_retry + * DESCRIPTION: RAID metadevice write retry routine + * PARAMETERS: mr_unit_t *un - pointer to a unit structure + * md_raidcs_t *cs - pointer to a child structure + */ +static void +raid_write_retry(mr_unit_t *un, md_raidcs_t *cs) +{ + md_raidps_t *ps; + + ps = cs->cs_ps; + + /* re-initialize the buf_t structure for raid_write() */ + cs->cs_dbuf.b_chain = (struct buf *)cs; + cs->cs_dbuf.b_back = &cs->cs_dbuf; + cs->cs_dbuf.b_forw = &cs->cs_dbuf; + cs->cs_dbuf.b_flags = B_BUSY; /* initialize flags */ + cs->cs_dbuf.b_error = 0; /* initialize error */ + cs->cs_dbuf.b_offset = -1; + /* Initialize semaphores */ + sema_init(&cs->cs_dbuf.b_io, 0, NULL, + SEMA_DEFAULT, NULL); + sema_init(&cs->cs_dbuf.b_sem, 0, NULL, + SEMA_DEFAULT, NULL); + + cs->cs_pbuf.b_chain = (struct buf *)cs; + cs->cs_pbuf.b_back = &cs->cs_pbuf; + cs->cs_pbuf.b_forw = &cs->cs_pbuf; + cs->cs_pbuf.b_flags = B_BUSY; /* initialize flags */ + cs->cs_pbuf.b_error = 0; /* initialize error */ + cs->cs_pbuf.b_offset = -1; + sema_init(&cs->cs_pbuf.b_io, 0, NULL, + SEMA_DEFAULT, NULL); + sema_init(&cs->cs_pbuf.b_sem, 0, NULL, + SEMA_DEFAULT, NULL); + + cs->cs_hbuf.b_chain = (struct buf *)cs; + cs->cs_hbuf.b_back = &cs->cs_hbuf; + cs->cs_hbuf.b_forw = &cs->cs_hbuf; + cs->cs_hbuf.b_flags = B_BUSY; /* initialize flags */ + cs->cs_hbuf.b_error = 0; /* initialize error */ + cs->cs_hbuf.b_offset = -1; + sema_init(&cs->cs_hbuf.b_io, 0, NULL, + SEMA_DEFAULT, NULL); + sema_init(&cs->cs_hbuf.b_sem, 0, NULL, + SEMA_DEFAULT, NULL); + + cs->cs_flags &= ~(MD_RCS_ERROR); + /* + * If we have not yet 'iodone'd the i/o but have done the prewrite + * on this child, then reset the PWDONE flag and bump pwfrags before + * restarting the i/o. + * If pwfrags is zero, we have already 'iodone'd the i/o so + * leave things alone. We don't want to re-'done' it. + */ + mutex_enter(&ps->ps_mx); + if (cs->cs_flags & MD_RCS_PWDONE) { + cs->cs_flags &= ~MD_RCS_PWDONE; + ps->ps_pwfrags++; + } + mutex_exit(&ps->ps_mx); + raid_write_io(un, cs); +} + +/* + * NAME: raid_wrerr + * DESCRIPTION: RAID metadevice write error recovery routine; records + * the column state changes and dispatches the appropriate + * retry routine. + * PARAMETERS: md_raidcs_t *cs - pointer to a child structure + * LOCKS: must obtain unit writer lock while calling raid_error_state + * since a unit or column state transition may take place. + * must obtain unit reader lock to retry I/O.
+ */ +static void +raid_wrerr(md_raidcs_t *cs) +{ + md_raidps_t *ps; + mdi_unit_t *ui; + mr_unit_t *un; + md_raidcbuf_t *cbuf; + + ps = cs->cs_ps; + ui = ps->ps_ui; + + un = (mr_unit_t *)md_unit_writerlock(ui); + ASSERT(un != 0); + + if (cs->cs_dbuf.b_flags & B_ERROR) + (void) raid_error_state(un, &cs->cs_dbuf); + if (cs->cs_pbuf.b_flags & B_ERROR) + (void) raid_error_state(un, &cs->cs_pbuf); + if (cs->cs_hbuf.b_flags & B_ERROR) + (void) raid_error_state(un, &cs->cs_hbuf); + for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) + if (cbuf->cbuf_bp.b_flags & B_ERROR) + (void) raid_error_state(un, &cbuf->cbuf_bp); + + md_unit_writerexit(ui); + + ps->ps_flags |= MD_RPS_HSREQ; + + un = (mr_unit_t *)md_unit_readerlock(ui); + + /* now attempt the appropriate retry routine */ + (*(cs->cs_retry_call))(un, cs); +} +/* + * NAMES: raid_write_error + * DESCRIPTION: I/O error handling routine for a RAID metadevice write + * PARAMETERS: md_raidcs_t *cs - pointer to child structure + */ +/*ARGSUSED*/ +static void +raid_write_error(md_raidcs_t *cs) +{ + md_raidps_t *ps; + mdi_unit_t *ui; + mr_unit_t *un; + md_raidcbuf_t *cbuf; + set_t setno; + + ps = cs->cs_ps; + ui = ps->ps_ui; + un = cs->cs_un; + + setno = MD_UN2SET(un); + + /* + * locate each buf that is in error on this io and then + * output an error message + */ + if ((cs->cs_dbuf.b_flags & B_ERROR) && + (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_ERRED) && + (COLUMN_STATE(un, cs->cs_dcolumn) != RCS_LAST_ERRED)) + cmn_err(CE_WARN, "md %s: write error on %s", + md_shortname(MD_SID(un)), + md_devname(setno, md_expldev(cs->cs_dbuf.b_edev), NULL, 0)); + + if ((cs->cs_pbuf.b_flags & B_ERROR) && + (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_ERRED) && + (COLUMN_STATE(un, cs->cs_pcolumn) != RCS_LAST_ERRED)) + cmn_err(CE_WARN, "md %s: write error on %s", + md_shortname(MD_SID(un)), + md_devname(setno, md_expldev(cs->cs_pbuf.b_edev), NULL, 0)); + + for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) + if ((cbuf->cbuf_bp.b_flags & B_ERROR) && + (COLUMN_STATE(un, cbuf->cbuf_column) != RCS_ERRED) && + (COLUMN_STATE(un, cbuf->cbuf_column) != RCS_LAST_ERRED)) + cmn_err(CE_WARN, "md %s: write error on %s", + md_shortname(MD_SID(un)), + md_devname(setno, md_expldev(cbuf->cbuf_bp.b_edev), + NULL, 0)); + + md_unit_readerexit(ui); + + ASSERT(cs->cs_frags == 0); + + /* now schedule processing for possible state change */ + daemon_request(&md_mstr_daemon, raid_wrerr, + (daemon_queue_t *)cs, REQ_OLD); + +} + +/* + * NAME: raid_write_ponly + * DESCRIPTION: RAID metadevice write routine + * in the case where only the parity column can be written + * PARAMETERS: md_raidcs_t *cs - pointer to a child structure + */ +static void +raid_write_ponly(md_raidcs_t *cs) +{ + md_raidps_t *ps; + mr_unit_t *un = cs->cs_un; + + ps = cs->cs_ps; + /* decrement pwfrags if needed, but not frags */ + ASSERT(!(cs->cs_flags & MD_RCS_PWDONE)); + raid_free_parent(ps, RFP_DECR_PWFRAGS); + cs->cs_flags |= MD_RCS_PWDONE; + cs->cs_frags = 1; + cs->cs_stage = RAID_WRITE_PONLY_DONE; + cs->cs_call = raid_stage; + cs->cs_error_call = raid_write_error; + cs->cs_retry_call = raid_write_no_retry; + if (WRITE_ALT(un, cs->cs_pcolumn)) { + cs->cs_frags++; + raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_WRITE); + } + raidio(cs, RIO_PARITY | RIO_WRITE); +} + +/* + * NAME: raid_write_ploop + * DESCRIPTION: RAID metadevice write routine, constructs parity from + * data in other columns. 
+ * PARAMETERS: md_raidcs_t *cs - pointer to a child structure + */ +static void +raid_write_ploop(md_raidcs_t *cs) +{ + mr_unit_t *un = cs->cs_un; + uint_t *dbuf; + uint_t *pbuf; + size_t wordcnt; + uint_t psum = 0; + + wordcnt = cs->cs_bcount / sizeof (uint_t); + dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); + pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); + while (wordcnt--) + *pbuf++ ^= *dbuf++; + cs->cs_loop++; + + /* + * build parity from scratch using new data, + * skip reading the data and parity columns. + */ + while (cs->cs_loop == cs->cs_dcolumn || cs->cs_loop == cs->cs_pcolumn) + cs->cs_loop++; + + if (cs->cs_loop != un->un_totalcolumncnt) { + cs->cs_frags = 1; + raidio(cs, RIO_DATA | RIO_READ | (cs->cs_loop + 1)); + return; + } + + /* construct checksum for parity buffer */ + wordcnt = cs->cs_bcount / sizeof (uint_t); + pbuf = (uint_t *)(void *)(cs->cs_pbuffer + DEV_BSIZE); + while (wordcnt--) { + psum ^= *pbuf; + pbuf++; + } + RAID_FILLIN_RPW(cs->cs_pbuffer, un, psum, -1, + cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, + 1, cs->cs_pcolumn, RAID_PWMAGIC); + + cs->cs_stage = RAID_NONE; + cs->cs_call = raid_write_ponly; + cs->cs_error_call = raid_write_error; + cs->cs_retry_call = raid_write_err_retry; + cs->cs_frags = 1; + if (WRITE_ALT(un, cs->cs_pcolumn)) { + cs->cs_frags++; + raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_PREWRITE); + } + raidio(cs, RIO_PARITY | RIO_PREWRITE); +} + +/* + * NAME: raid_write_donly + * DESCRIPTION: RAID metadevice write routine + * Completed writing data to prewrite entry + * in the case where only the data column can be written + * PARAMETERS: md_raidcs_t *cs - pointer to a child structure + */ +static void +raid_write_donly(md_raidcs_t *cs) +{ + md_raidps_t *ps; + mr_unit_t *un = cs->cs_un; + + ps = cs->cs_ps; + /* WARNING: don't release unit reader lock here... 
*/ + /* decrement pwfrags if needed, but not frags */ + ASSERT(!(cs->cs_flags & MD_RCS_PWDONE)); + raid_free_parent(ps, RFP_DECR_PWFRAGS); + cs->cs_flags |= MD_RCS_PWDONE; + cs->cs_frags = 1; + cs->cs_stage = RAID_WRITE_DONLY_DONE; + cs->cs_call = raid_stage; + cs->cs_error_call = raid_write_error; + cs->cs_retry_call = raid_write_err_retry; + if (WRITE_ALT(un, cs->cs_dcolumn)) { + cs->cs_frags++; + raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_WRITE); + } + raidio(cs, RIO_DATA | RIO_WRITE); +} + +/* + * NAME: raid_write_got_old + * DESCRIPTION: RAID metadevice write routine + * completed read of old data and old parity + * PARAMETERS: md_raidcs_t *cs - pointer to a child structure + */ +static void +raid_write_got_old(md_raidcs_t *cs) +{ + mr_unit_t *un = cs->cs_un; + + ASSERT(IO_READER_HELD(cs->cs_un)); + ASSERT(UNIT_READER_HELD(cs->cs_un)); + + raid_mapin_buf(cs); + genstandardparity(cs); + cs->cs_frags = 2; + cs->cs_call = raid_stage; + cs->cs_stage = RAID_PREWRITE_DONE; + cs->cs_error_call = raid_write_error; + cs->cs_retry_call = raid_write_retry; + + if (WRITE_ALT(un, cs->cs_dcolumn)) { + cs->cs_frags++; + raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_PREWRITE); + } + + if (WRITE_ALT(un, cs->cs_pcolumn)) { + cs->cs_frags++; + raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | RIO_PREWRITE); + } + ASSERT(cs->cs_frags < 4); + raidio(cs, RIO_DATA | RIO_PREWRITE); + raidio(cs, RIO_PARITY | RIO_PREWRITE); +} + +/* + * NAME: raid_write_io + * DESCRIPTION: RAID metadevice write I/O routine + * PARAMETERS: mr_unit_t *un - pointer to a unit structure + * md_raidcs_t *cs - pointer to a child structure + */ + +/*ARGSUSED*/ +static void +raid_write_io(mr_unit_t *un, md_raidcs_t *cs) +{ + md_raidps_t *ps = cs->cs_ps; + uint_t *dbuf; + uint_t *ubuf; + size_t wordcnt; + uint_t dsum = 0; + int pcheck; + int dcheck; + + ASSERT((un->un_column[cs->cs_pcolumn].un_devstate & + RCS_INIT) == 0); + ASSERT((un->un_column[cs->cs_dcolumn].un_devstate & + RCS_INIT) == 0); + ASSERT(IO_READER_HELD(un)); + ASSERT(UNIT_READER_HELD(un)); + ASSERT(cs->cs_flags & MD_RCS_HAVE_PW_SLOTS); + if (cs->cs_flags & MD_RCS_LINE) { + + mr_unit_t *un = cs->cs_un; + + ASSERT(un->un_origcolumncnt == un->un_totalcolumncnt); + raid_mapin_buf(cs); + cs->cs_frags = un->un_origcolumncnt; + cs->cs_call = raid_stage; + cs->cs_error_call = raid_write_error; + cs->cs_retry_call = raid_write_no_retry; + cs->cs_stage = RAID_LINE_PWDONE; + genlineparity(cs); + return; + } + + pcheck = erred_check_line(un, cs, &un->un_column[cs->cs_pcolumn]); + dcheck = erred_check_line(un, cs, &un->un_column[cs->cs_dcolumn]); + cs->cs_resync_check = (pcheck << RCL_PARITY_OFFSET) | dcheck; + + if (pcheck == RCL_ERRED && dcheck == RCL_ERRED) { + int err = EIO; + + if ((un->un_column[cs->cs_pcolumn].un_devstate == + RCS_LAST_ERRED) || + (un->un_column[cs->cs_dcolumn].un_devstate == + RCS_LAST_ERRED)) + err = ENXIO; + raid_error_parent(ps, err); + ASSERT(!(cs->cs_flags & MD_RCS_PWDONE)); + raid_free_child(cs, 1); + raid_free_parent(ps, RFP_DECR_FRAGS + | RFP_RLS_LOCK | RFP_DECR_PWFRAGS); + return; + } + + if (pcheck & RCL_ERRED) { + /* + * handle case of only having data drive + */ + raid_mapin_buf(cs); + wordcnt = cs->cs_bcount / sizeof (uint_t); + + dbuf = (uint_t *)(void *)(cs->cs_dbuffer + DEV_BSIZE); + ubuf = (uint_t *)(void *)(cs->cs_addr); + + while (wordcnt--) { + *dbuf = *ubuf; + dsum ^= *ubuf; + dbuf++; + ubuf++; + } + RAID_FILLIN_RPW(cs->cs_dbuffer, un, dsum, -1, + cs->cs_blkno, cs->cs_blkcnt, cs->cs_pwid, + 1, cs->cs_dcolumn, RAID_PWMAGIC); +
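/* + * The parity column is erred, so only the data column is + * written: prewrite the new data (plus an extra prewrite to + * the alternate device while a hotspare resync is active) and + * complete the write in raid_write_donly(). + */ +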
cs->cs_frags = 1; + cs->cs_stage = RAID_NONE; + cs->cs_call = raid_write_donly; + cs->cs_error_call = raid_write_error; + cs->cs_retry_call = raid_write_err_retry; + if (WRITE_ALT(un, cs->cs_dcolumn)) { + cs->cs_frags++; + raidio(cs, RIO_DATA | RIO_ALT | RIO_EXTRA | + RIO_PREWRITE); + } + raidio(cs, RIO_DATA | RIO_PREWRITE); + return; + } + + if (dcheck & RCL_ERRED) { + /* + * handle case of only having parity drive + * build parity from scratch using new data, + * skip reading the data and parity columns. + */ + raid_mapin_buf(cs); + cs->cs_loop = 0; + while (cs->cs_loop == cs->cs_dcolumn || + cs->cs_loop == cs->cs_pcolumn) + cs->cs_loop++; + + /* copy new data in to begin building parity */ + bcopy(cs->cs_addr, cs->cs_pbuffer + DEV_BSIZE, cs->cs_bcount); + cs->cs_stage = RAID_NONE; + cs->cs_call = raid_write_ploop; + cs->cs_error_call = raid_write_error; + cs->cs_retry_call = raid_write_err_retry; + cs->cs_frags = 1; + raidio(cs, RIO_DATA | RIO_READ | (cs->cs_loop + 1)); + return; + } + /* + * handle normal cases + * read old data and old parity + */ + cs->cs_frags = 2; + cs->cs_stage = RAID_NONE; + cs->cs_call = raid_write_got_old; + cs->cs_error_call = raid_write_error; + cs->cs_retry_call = raid_write_retry; + ASSERT(ps->ps_magic == RAID_PSMAGIC); + raidio(cs, RIO_DATA | RIO_READ); + raidio(cs, RIO_PARITY | RIO_READ); +} + +static void +raid_enqueue(md_raidcs_t *cs) +{ + mdi_unit_t *ui = cs->cs_ps->ps_ui; + kmutex_t *io_list_mutex = &ui->ui_io_lock->io_list_mutex; + md_raidcs_t *cs1; + + mutex_enter(io_list_mutex); + ASSERT(! (cs->cs_flags & MD_RCS_LLOCKD)); + if (ui->ui_io_lock->io_list_front == NULL) { + ui->ui_io_lock->io_list_front = cs; + ui->ui_io_lock->io_list_back = cs; + } else { + cs1 = ui->ui_io_lock->io_list_back; + cs1->cs_linlck_next = cs; + ui->ui_io_lock->io_list_back = cs; + } + STAT_INC(raid_write_waits); + STAT_MAX(raid_max_write_q_length, raid_write_queue_length); + cs->cs_linlck_next = NULL; + mutex_exit(io_list_mutex); +} + +/* + * NAME: raid_write + * DESCRIPTION: RAID metadevice write routine + * PARAMETERS: mr_unit_t *un - pointer to a unit structure + * md_raidcs_t *cs - pointer to a child structure + */ + +/*ARGSUSED*/ +static int +raid_write(mr_unit_t *un, md_raidcs_t *cs) +{ + int error = 0; + md_raidps_t *ps; + mdi_unit_t *ui; + minor_t mnum; + clock_t timeout; + + ASSERT(IO_READER_HELD(un)); + ps = cs->cs_ps; + ui = ps->ps_ui; + + ASSERT(UNIT_STATE(un) != RUS_INIT); + if (UNIT_STATE(un) == RUS_LAST_ERRED) + error = EIO; + + /* make sure the write doesn't go beyond the column */ + if (cs->cs_blkno + cs->cs_blkcnt > un->un_segsize * un->un_segsincolumn) + error = ENXIO; + if (error) + goto werror; + + getresources(cs); + + /* + * this is an advisory loop that keeps the waiting lists short + * to reduce cpu time. Since there is a race introduced by not + * acquiring all the correct mutexes, use a cv_timedwait to be + * sure the write will always wake up and start.
+ */ + while (raid_check_pw(cs)) { + mutex_enter(&un->un_mx); + (void) drv_getparm(LBOLT, &timeout); + timeout += md_wr_wait; + un->un_rflags |= MD_RFLAG_NEEDPW; + STAT_INC(raid_prewrite_waits); + (void) cv_timedwait(&un->un_cv, &un->un_mx, timeout); + un->un_rflags &= ~MD_RFLAG_NEEDPW; + mutex_exit(&un->un_mx); + } + + if (raid_line_writer_lock(cs, 1)) + return (0); + + un = (mr_unit_t *)md_unit_readerlock(ui); + cs->cs_un = un; + mnum = MD_SID(un); + + if (un->un_state & RUS_REGEN) { + raid_regen_parity(cs); + un = MD_UNIT(mnum); + cs->cs_un = un; + } + + raid_write_io(un, cs); + return (0); +werror: + /* acquire unit reader lock since raid_free_child always drops it */ + raid_error_parent(ps, error); + raid_free_child(cs, 0); + /* decrement both pwfrags and frags */ + raid_free_parent(ps, RFP_DECR_PWFRAGS | RFP_DECR_FRAGS | RFP_RLS_LOCK); + return (0); +} + + +/* + * NAMES: raid_stage + * DESCRIPTION: post-processing routine for a RAID metadevice + * PARAMETERS: md_raidcs_t *cs - pointer to child structure + */ +static void +raid_stage(md_raidcs_t *cs) +{ + md_raidps_t *ps = cs->cs_ps; + mr_unit_t *un = cs->cs_un; + md_raidcbuf_t *cbuf; + buf_t *bp; + void *private; + int flag; + + switch (cs->cs_stage) { + case RAID_READ_DONE: + raid_free_child(cs, 1); + /* decrement readfrags */ + raid_free_parent(ps, RFP_DECR_READFRAGS | RFP_RLS_LOCK); + return; + + case RAID_WRITE_DONE: + case RAID_WRITE_PONLY_DONE: + case RAID_WRITE_DONLY_DONE: + /* + * Completed writing real parity and/or data. + */ + ASSERT(cs->cs_flags & MD_RCS_PWDONE); + raid_free_child(cs, 1); + /* decrement frags but not pwfrags */ + raid_free_parent(ps, RFP_DECR_FRAGS | RFP_RLS_LOCK); + return; + + case RAID_PREWRITE_DONE: + /* + * completed writing data and parity to prewrite entries + */ + /* + * WARNING: don't release unit reader lock here..
+ * decrement pwfrags but not frags + */ + raid_free_parent(ps, RFP_DECR_PWFRAGS); + cs->cs_flags |= MD_RCS_PWDONE; + cs->cs_frags = 2; + cs->cs_stage = RAID_WRITE_DONE; + cs->cs_call = raid_stage; + cs->cs_error_call = raid_write_error; + cs->cs_retry_call = raid_write_no_retry; + if (WRITE_ALT(un, cs->cs_pcolumn)) { + cs->cs_frags++; + raidio(cs, RIO_ALT | RIO_EXTRA | RIO_PARITY | + RIO_WRITE); + } + if (WRITE_ALT(un, cs->cs_dcolumn)) { + cs->cs_frags++; + raidio(cs, RIO_ALT | RIO_EXTRA | RIO_DATA | RIO_WRITE); + } + ASSERT(cs->cs_frags < 4); + raidio(cs, RIO_DATA | RIO_WRITE); + raidio(cs, RIO_PARITY | RIO_WRITE); + if (cs->cs_pw_inval_list) { + raid_free_pwinvalidate(cs); + } + return; + + case RAID_LINE_PWDONE: + ASSERT(cs->cs_frags == 0); + raid_free_parent(ps, RFP_DECR_PWFRAGS); + cs->cs_flags |= MD_RCS_PWDONE; + cs->cs_frags = un->un_origcolumncnt; + cs->cs_call = raid_stage; + cs->cs_error_call = raid_write_error; + cs->cs_retry_call = raid_write_no_retry; + cs->cs_stage = RAID_WRITE_DONE; + for (cbuf = cs->cs_buflist; cbuf; cbuf = cbuf->cbuf_next) { + /* + * fill in buffer for the write of the real data, + * skipping the prewrite header in the cbuf buffer + */ + bp = &cbuf->cbuf_bp; + bp->b_back = bp; + bp->b_forw = bp; + bp->b_un.b_addr = cbuf->cbuf_buffer + DEV_BSIZE; + bp->b_bcount = cbuf->cbuf_bcount; + bp->b_bufsize = cbuf->cbuf_bcount; + bp->b_lblkno = + un->un_column[cbuf->cbuf_column].un_devstart + + cs->cs_blkno; + bp->b_flags &= ~(B_READ | B_WRITE | B_ERROR); + bp->b_flags &= ~nv_available; + bp->b_flags |= B_WRITE | B_BUSY; + bp->b_iodone = (int (*)())raid_done; + bp->b_edev = md_dev64_to_dev( + un->un_column[cbuf->cbuf_column].un_dev); + bp->b_chain = (struct buf *)cs; + private = cs->cs_strategy_private; + flag = cs->cs_strategy_flag; + md_call_strategy(bp, flag, private); + } + raidio(cs, RIO_DATA | RIO_WRITE); + raidio(cs, RIO_PARITY | RIO_WRITE); + if (cs->cs_pw_inval_list) { + raid_free_pwinvalidate(cs); + } + return; + + default: + ASSERT(0); + break; + } +} +/* + * NAME: md_raid_strategy + * DESCRIPTION: RAID metadevice I/O operations entry point. + * PARAMETERS: buf_t *pb - pointer to a user I/O buffer + * int flag - metadevice specific flag + * void *private - carry over flag ??
+ * + */ + +void +md_raid_strategy(buf_t *pb, int flag, void *private) +{ + md_raidps_t *ps; + md_raidcs_t *cs; + int doing_writes; + int err; + mr_unit_t *un; + mdi_unit_t *ui; + size_t count; + diskaddr_t blkno; + caddr_t addr; + off_t offset; + int colcnt; + minor_t mnum; + set_t setno; + + ui = MDI_UNIT(getminor(pb->b_edev)); + md_kstat_waitq_enter(ui); + un = (mr_unit_t *)md_io_readerlock(ui); + setno = MD_MIN2SET(getminor(pb->b_edev)); + + if ((flag & MD_NOBLOCK) == 0) { + if (md_inc_iocount(setno) != 0) { + pb->b_flags |= B_ERROR; + pb->b_error = ENXIO; + pb->b_resid = pb->b_bcount; + md_io_readerexit(ui); + biodone(pb); + return; + } + } else { + md_inc_iocount_noblock(setno); + } + + mnum = MD_SID(un); + colcnt = un->un_totalcolumncnt - 1; + count = pb->b_bcount; + + STAT_CHECK(raid_512, count == 512); + STAT_CHECK(raid_1024, count == 1024); + STAT_CHECK(raid_1024_8192, count > 1024 && count < 8192); + STAT_CHECK(raid_8192, count == 8192); + STAT_CHECK(raid_8192_bigger, count > 8192); + + (void) md_unit_readerlock(ui); + if (!(flag & MD_STR_NOTTOP)) { + err = md_checkbuf(ui, (md_unit_t *)un, pb); /* check and map */ + if (err != 0) { + md_kstat_waitq_exit(ui); + md_io_readerexit(ui); + return; + } + } + md_unit_readerexit(ui); + + STAT_INC(raid_total_io); + + /* allocate a parent structure for the user I/O */ + ps = kmem_cache_alloc(raid_parent_cache, MD_ALLOCFLAGS); + raid_parent_init(ps); + + /* + * Save essential information from the original buf header + * in the parent structure. + */ + ps->ps_un = un; + ps->ps_ui = ui; + ps->ps_bp = pb; + ps->ps_addr = pb->b_un.b_addr; + + if ((pb->b_flags & B_READ) == 0) { + ps->ps_flags |= MD_RPS_WRITE; + doing_writes = 1; + STAT_INC(raid_writes); + } else { + ps->ps_flags |= MD_RPS_READ; + doing_writes = 0; + STAT_INC(raid_reads); + } + + count = lbtodb(pb->b_bcount); /* transfer count (in blocks) */ + blkno = pb->b_lblkno; /* block number on device */ + addr = 0; + offset = 0; + ps->ps_pwfrags = 1; + ps->ps_frags = 1; + md_kstat_waitq_to_runq(ui); + + do { + cs = kmem_cache_alloc(raid_child_cache, MD_ALLOCFLAGS); + raid_child_init(cs); + cs->cs_ps = ps; + cs->cs_un = un; + cs->cs_mdunit = mnum; + cs->cs_strategy_flag = flag; + cs->cs_strategy_private = private; + cs->cs_addr = addr; + cs->cs_offset = offset; + count = raid_iosetup(un, blkno, count, cs); + if (cs->cs_flags & MD_RCS_LINE) { + blkno += (cs->cs_blkcnt * colcnt); + offset += (cs->cs_bcount * colcnt); + } else { + blkno += cs->cs_blkcnt; + offset += cs->cs_bcount; + } + /* for each cs bump up the ps_pwfrags and ps_frags fields */ + if (count) { + mutex_enter(&ps->ps_mx); + ps->ps_pwfrags++; + ps->ps_frags++; + mutex_exit(&ps->ps_mx); + if (doing_writes) + (void) raid_write(un, cs); + else + (void) raid_read(un, cs); + } + } while (count); + if (doing_writes) { + (void) raid_write(un, cs); + } else + (void) raid_read(un, cs); + + if (! (flag & MD_STR_NOTTOP) && panicstr) { + while (!
(ps->ps_flags & MD_RPS_DONE)) { + md_daemon(1, &md_done_daemon); + drv_usecwait(10); + } + kmem_cache_free(raid_parent_cache, ps); + } +} + +/* + * NAMES: raid_snarf + * DESCRIPTION: RAID metadevice SNARF entry point + * PARAMETERS: md_snarfcmd_t cmd, + * set_t setno + * RETURNS: + */ +static int +raid_snarf(md_snarfcmd_t cmd, set_t setno) +{ + mr_unit_t *un; + mddb_recid_t recid; + int gotsomething; + int all_raid_gotten; + mddb_type_t typ1; + uint_t ncol; + mddb_de_ic_t *dep; + mddb_rb32_t *rbp; + size_t newreqsize; + mr_unit_t *big_un; + mr_unit32_od_t *small_un; + + + if (cmd == MD_SNARF_CLEANUP) + return (0); + + all_raid_gotten = 1; + gotsomething = 0; + typ1 = (mddb_type_t)md_getshared_key(setno, + raid_md_ops.md_driver.md_drivername); + recid = mddb_makerecid(setno, 0); + + while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) { + if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) { + continue; + } + + dep = mddb_getrecdep(recid); + dep->de_flags = MDDB_F_RAID; + rbp = dep->de_rb; + if ((rbp->rb_revision == MDDB_REV_RB) && + ((rbp->rb_private & MD_PRV_CONVD) == 0)) { + /* + * This means, we have an old and small record + * and this record hasn't already been converted. + * Before we create an incore metadevice from this + * we have to convert it to a big record. + */ + small_un = (mr_unit32_od_t *)mddb_getrecaddr(recid); + ncol = small_un->un_totalcolumncnt; + newreqsize = sizeof (mr_unit_t) + + ((ncol - 1) * sizeof (mr_column_t)); + big_un = (mr_unit_t *)kmem_zalloc(newreqsize, KM_SLEEP); + raid_convert((caddr_t)small_un, (caddr_t)big_un, + SMALL_2_BIG); + kmem_free(small_un, dep->de_reqsize); + dep->de_rb_userdata = big_un; + dep->de_reqsize = newreqsize; + un = big_un; + rbp->rb_private |= MD_PRV_CONVD; + } else { + /* Big device */ + un = (mr_unit_t *)mddb_getrecaddr(recid); + } + + /* Set revision and flag accordingly */ + if (rbp->rb_revision == MDDB_REV_RB) { + un->c.un_revision = MD_32BIT_META_DEV; + } else { + un->c.un_revision = MD_64BIT_META_DEV; + un->c.un_flag |= MD_EFILABEL; + } + + /* + * Create minor device node for snarfed entry. 
+ */ + (void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un)); + + if (MD_UNIT(MD_SID(un)) != NULL) { + mddb_setrecprivate(recid, MD_PRV_PENDDEL); + continue; + } + all_raid_gotten = 0; + if (raid_build_incore((void *)un, 1) == 0) { + mddb_setrecprivate(recid, MD_PRV_GOTIT); + md_create_unit_incore(MD_SID(un), &raid_md_ops, + 1); + gotsomething = 1; + } else if (un->mr_ic) { + kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) * + un->un_totalcolumncnt); + kmem_free(un->mr_ic, sizeof (*un->mr_ic)); + } + } + + if (!all_raid_gotten) { + return (gotsomething); + } + + recid = mddb_makerecid(setno, 0); + while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) + if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT)) + mddb_setrecprivate(recid, MD_PRV_PENDDEL); + + return (0); +} + +/* + * NAMES: raid_halt + * DESCRIPTION: RAID metadevice HALT entry point + * PARAMETERS: md_haltcmd_t cmd - + * set_t setno - + * RETURNS: + */ +static int +raid_halt(md_haltcmd_t cmd, set_t setno) +{ + set_t i; + mdi_unit_t *ui; + minor_t mnum; + + if (cmd == MD_HALT_CLOSE) + return (0); + + if (cmd == MD_HALT_OPEN) + return (0); + + if (cmd == MD_HALT_UNLOAD) + return (0); + + if (cmd == MD_HALT_CHECK) { + for (i = 0; i < md_nunits; i++) { + mnum = MD_MKMIN(setno, i); + if ((ui = MDI_UNIT(mnum)) == NULL) + continue; + if (ui->ui_opsindex != raid_md_ops.md_selfindex) + continue; + if (md_unit_isopen(ui)) + return (1); + } + return (0); + } + + if (cmd != MD_HALT_DOIT) + return (1); + + for (i = 0; i < md_nunits; i++) { + mnum = MD_MKMIN(setno, i); + if ((ui = MDI_UNIT(mnum)) == NULL) + continue; + if (ui->ui_opsindex != raid_md_ops.md_selfindex) + continue; + reset_raid((mr_unit_t *)MD_UNIT(mnum), mnum, 0); + } + return (0); +} + +/* + * NAMES: raid_close_all_devs + * DESCRIPTION: Close all the devices of the unit. + * PARAMETERS: mr_unit_t *un - pointer to unit structure + * RETURNS: + */ +void +raid_close_all_devs(mr_unit_t *un, int init_pw, int md_cflags) +{ + int i; + mr_column_t *device; + + for (i = 0; i < un->un_totalcolumncnt; i++) { + device = &un->un_column[i]; + if (device->un_devflags & MD_RAID_DEV_ISOPEN) { + ASSERT((device->un_dev != (md_dev64_t)0) && + (device->un_dev != NODEV64)); + if ((device->un_devstate & RCS_OKAY) && init_pw) + (void) init_pw_area(un, device->un_dev, + device->un_pwstart, i); + md_layered_close(device->un_dev, md_cflags); + device->un_devflags &= ~MD_RAID_DEV_ISOPEN; + } + } +} + +/* + * NAMES: raid_open_all_devs + * DESCRIPTION: Open all the components (columns) of the device unit. + * PARAMETERS: mr_unit_t *un - pointer to unit structure + * RETURNS: + */ +static int +raid_open_all_devs(mr_unit_t *un, int md_oflags) +{ + minor_t mnum = MD_SID(un); + int i; + int not_opened = 0; + int commit = 0; + int col = -1; + mr_column_t *device; + set_t setno = MD_MIN2SET(MD_SID(un)); + side_t side = mddb_getsidenum(setno); + mdkey_t key; + mdi_unit_t *ui = MDI_UNIT(mnum); + + ui->ui_tstate &= ~MD_INACCESSIBLE; + + for (i = 0; i < un->un_totalcolumncnt; i++) { + md_dev64_t tmpdev; + + device = &un->un_column[i]; + + if (COLUMN_STATE(un, i) & RCS_ERRED) { + not_opened++; + continue; + } + + if (device->un_devflags & MD_RAID_DEV_ISOPEN) + continue; + + tmpdev = device->un_dev; + /* + * Open by device id + */ + key = HOTSPARED(un, i) ? 
+ device->un_hs_key : device->un_orig_key; + if ((md_getmajor(tmpdev) != md_major) && + md_devid_found(setno, side, key) == 1) { + tmpdev = md_resolve_bydevid(mnum, tmpdev, key); + } + if (md_layered_open(mnum, &tmpdev, md_oflags)) { + device->un_dev = tmpdev; + not_opened++; + continue; + } + device->un_dev = tmpdev; + device->un_devflags |= MD_RAID_DEV_ISOPEN; + } + + /* the unit can still run if at most one column is errored or failed to open */ + if (not_opened > 1) { + cmn_err(CE_WARN, + "md: %s failed to open. open error on %s\n", + md_shortname(MD_SID(un)), + md_devname(MD_UN2SET(un), device->un_orig_dev, + NULL, 0)); + + ui->ui_tstate |= MD_INACCESSIBLE; + + SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE, + MD_UN2SET(un), MD_SID(un)); + + return (not_opened > 1); + } + + for (i = 0; i < un->un_totalcolumncnt; i++) { + device = &un->un_column[i]; + if (device->un_devflags & MD_RAID_DEV_ISOPEN) { + if (device->un_devstate & RCS_LAST_ERRED) { + /* + * At this point in time there is a possibility + * that errors were the result of a controller + * failure with more than a single column on it, + * so clear out the last errored columns and let + * errors re-occur if necessary. + */ + raid_set_state(un, i, RCS_OKAY, 0); + commit++; + } + continue; + } + ASSERT(col == -1); + col = i; + } + + if (col != -1) { + raid_set_state(un, col, RCS_ERRED, 0); + commit++; + } + + if (commit) + raid_commit(un, NULL); + + if (col != -1) { + if (COLUMN_STATE(un, col) & RCS_ERRED) { + SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, + SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); + } else if (COLUMN_STATE(un, col) & RCS_LAST_ERRED) { + SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, + SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); + } + } + + return (0); +} + +/* + * NAMES: raid_internal_open + * DESCRIPTION: Do the actual RAID open + * PARAMETERS: minor_t mnum - minor number of the RAID device + * int flag - + * int otyp - + * int md_oflags - RAID open flags + * RETURNS: 0 if successful, nonzero otherwise + */ +int +raid_internal_open(minor_t mnum, int flag, int otyp, int md_oflags) +{ + mr_unit_t *un; + mdi_unit_t *ui; + int err = 0; + int replay_error = 0; + + ui = MDI_UNIT(mnum); + ASSERT(ui != NULL); + + un = (mr_unit_t *)md_unit_openclose_enter(ui); + /* + * this MUST be checked before md_unit_isopen is checked. + * raid_init_columns sets md_unit_isopen to block reset, halt.
+ */ + if ((UNIT_STATE(un) & (RUS_INIT | RUS_DOI)) && + !(md_oflags & MD_OFLG_ISINIT)) { + md_unit_openclose_exit(ui); + return (EAGAIN); + } + + if ((md_oflags & MD_OFLG_ISINIT) || md_unit_isopen(ui)) { + err = md_unit_incopen(mnum, flag, otyp); + goto out; + } + + md_unit_readerexit(ui); + + un = (mr_unit_t *)md_unit_writerlock(ui); + if (raid_open_all_devs(un, md_oflags) == 0) { + if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) { + md_unit_writerexit(ui); + un = (mr_unit_t *)md_unit_readerlock(ui); + raid_close_all_devs(un, 0, md_oflags); + goto out; + } + } else { + /* + * if this unit contains two or more errored components, + * return an error and close all opened devices + */ + + md_unit_writerexit(ui); + un = (mr_unit_t *)md_unit_readerlock(ui); + raid_close_all_devs(un, 0, md_oflags); + md_unit_openclose_exit(ui); + SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE, + MD_UN2SET(un), MD_SID(un)); + return (ENXIO); + } + + if (!(MD_STATUS(un) & MD_UN_REPLAYED)) { + replay_error = raid_replay(un); + MD_STATUS(un) |= MD_UN_REPLAYED; + } + + md_unit_writerexit(ui); + un = (mr_unit_t *)md_unit_readerlock(ui); + + if ((replay_error == RAID_RPLY_READONLY) && + ((flag & (FREAD | FWRITE)) == FREAD)) { + md_unit_openclose_exit(ui); + return (0); + } + + /* allocate hotspare if possible */ + (void) raid_hotspares(); + + +out: + md_unit_openclose_exit(ui); + return (err); +} +/* + * NAMES: raid_open + * DESCRIPTION: RAID metadevice OPEN entry point + * PARAMETERS: dev_t *dev - + * int flag - + * int otyp - + * cred_t * cred_p - + * int md_oflags - + * RETURNS: + */ +/*ARGSUSED1*/ +static int +raid_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags) +{ + int error = 0; + + if (error = raid_internal_open(getminor(*dev), flag, otyp, md_oflags)) { + return (error); + } + return (0); +} + +/* + * NAMES: raid_internal_close + * DESCRIPTION: RAID metadevice CLOSE actual implementation + * PARAMETERS: minor_t - minor number of the RAID device + * int otyp - + * int init_pw - + * int md_cflags - RAID close flags + * RETURNS: 0 if successful, nonzero otherwise + */ +/*ARGSUSED*/ +int +raid_internal_close(minor_t mnum, int otyp, int init_pw, int md_cflags) +{ + mdi_unit_t *ui = MDI_UNIT(mnum); + mr_unit_t *un; + int err = 0; + + /* single thread */ + un = (mr_unit_t *)md_unit_openclose_enter(ui); + + /* count closed */ + if ((err = md_unit_decopen(mnum, otyp)) != 0) + goto out; + /* close devices, if necessary */ + if (!
md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) { + raid_close_all_devs(un, init_pw, md_cflags); + } + + /* unlock, return success */ +out: + md_unit_openclose_exit(ui); + return (err); +} + +/* + * NAMES: raid_close + * DESCRIPTION: RAID metadevice close entry point + * PARAMETERS: dev_t dev - + * int flag - + * int otyp - + * cred_t * cred_p - + * int md_cflags - + * RETURNS: + */ +/*ARGSUSED1*/ +static int +raid_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags) +{ + int retval; + + (void) md_io_writerlock(MDI_UNIT(getminor(dev))); + retval = raid_internal_close(getminor(dev), otyp, 1, md_cflags); + (void) md_io_writerexit(MDI_UNIT(getminor(dev))); + return (retval); +} + +/* + * raid_probe_close_all_devs + */ +void +raid_probe_close_all_devs(mr_unit_t *un) +{ + int i; + mr_column_t *device; + + for (i = 0; i < un->un_totalcolumncnt; i++) { + device = &un->un_column[i]; + + if (device->un_devflags & MD_RAID_DEV_PROBEOPEN) { + md_layered_close(device->un_dev, + MD_OFLG_PROBEDEV); + device->un_devflags &= ~MD_RAID_DEV_PROBEOPEN; + } + } +} +/* + * raid_probe_dev: + * + * On entry the unit writerlock is held + */ +static int +raid_probe_dev(mdi_unit_t *ui, minor_t mnum) +{ + mr_unit_t *un; + int i; + int not_opened = 0; + int commit = 0; + int col = -1; + mr_column_t *device; + int md_devopen = 0; + + if (md_unit_isopen(ui)) + md_devopen++; + + un = MD_UNIT(mnum); + /* + * If the state has been set to LAST_ERRED because + * of an error when the raid device was open at some + * point in the past, don't probe. We really don't want + * to reset the state in this case. + */ + if (UNIT_STATE(un) == RUS_LAST_ERRED) + return (0); + + ui->ui_tstate &= ~MD_INACCESSIBLE; + + for (i = 0; i < un->un_totalcolumncnt; i++) { + md_dev64_t tmpdev; + + device = &un->un_column[i]; + if (COLUMN_STATE(un, i) & RCS_ERRED) { + not_opened++; + continue; + } + + tmpdev = device->un_dev; + /* + * Currently the flags passed are not needed since + * there cannot be an underlying metadevice. However + * they are kept here for consistency. + * + * Open by device id + */ + tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ? + device->un_hs_key : device->un_orig_key); + if (md_layered_open(mnum, &tmpdev, + MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV)) { + device->un_dev = tmpdev; + not_opened++; + continue; + } + device->un_dev = tmpdev; + + device->un_devflags |= MD_RAID_DEV_PROBEOPEN; + } + + /* + * The code below is careful on setting the LAST_ERRED state. + * + * If open errors and exactly one device has failed, we can run. + * If more than one device fails, we have to figure out when to set + * LAST_ERRED state. The rationale is to avoid unnecessary resyncs + * since they are painful and time consuming. + * + * When more than one component/column fails there are 2 scenarios. + * + * 1. Metadevice has NOT been opened: In this case, the behavior + * mimics the open semantics, i.e., only the first failed device + * is ERRED and LAST_ERRED is not set. + * + * 2. Metadevice has been opened: Here the read/write semantics are + * followed. The first failed device is ERRED and on the next + * failed device LAST_ERRED is set. + */ + + if (not_opened > 1 && !md_devopen) { + cmn_err(CE_WARN, + "md: %s failed to open.
open error on %s\n", + md_shortname(MD_SID(un)), + md_devname(MD_UN2SET(un), device->un_orig_dev, + NULL, 0)); + SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE, + MD_UN2SET(un), MD_SID(un)); + raid_probe_close_all_devs(un); + ui->ui_tstate |= MD_INACCESSIBLE; + return (not_opened > 1); + } + + if (!md_devopen) { + for (i = 0; i < un->un_totalcolumncnt; i++) { + device = &un->un_column[i]; + if (device->un_devflags & MD_RAID_DEV_PROBEOPEN) { + if (device->un_devstate & RCS_LAST_ERRED) { + /* + * At this point in time there is a + * possibility that errors were the + * result of a controller failure with + * more than a single column on it, so + * clear out last errored columns and + * let errors re-occur if necessary. + */ + raid_set_state(un, i, RCS_OKAY, 0); + commit++; + } + continue; + } + ASSERT(col == -1); + /* + * note that if multiple devices are failing, only + * the last one is marked as errored + */ + col = i; + } + + if (col != -1) { + raid_set_state(un, col, RCS_ERRED, 0); + commit++; + } + + } else { + for (i = 0; i < un->un_totalcolumncnt; i++) { + device = &un->un_column[i]; + + /* if we have LAST_ERRED go ahead and commit. */ + if (un->un_state & RUS_LAST_ERRED) + break; + /* + * could not open the component + */ + + if (!(device->un_devflags & MD_RAID_DEV_PROBEOPEN)) { + col = i; + raid_set_state(un, col, RCS_ERRED, 0); + commit++; + } + } + } + + if (commit) + raid_commit(un, NULL); + + if (col != -1) { + if (COLUMN_STATE(un, col) & RCS_ERRED) { + SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, + SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); + } else if (COLUMN_STATE(un, col) & RCS_LAST_ERRED) { + SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED, + SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un)); + } + } + + raid_probe_close_all_devs(un); + return (0); +} + +static int +raid_imp_set( + set_t setno +) +{ + mddb_recid_t recid; + int i, gotsomething; + mddb_type_t typ1; + mddb_de_ic_t *dep; + mddb_rb32_t *rbp; + mr_unit_t *un64; + mr_unit32_od_t *un32; + minor_t *self_id; /* minor needs to be updated */ + md_parent_t *parent_id; /* parent needs to be updated */ + mddb_recid_t *record_id; /* record id needs to be updated */ + hsp_t *hsp_id; + + gotsomething = 0; + + typ1 = (mddb_type_t)md_getshared_key(setno, + raid_md_ops.md_driver.md_drivername); + recid = mddb_makerecid(setno, 0); + + while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) { + if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) + continue; + + dep = mddb_getrecdep(recid); + rbp = dep->de_rb; + + if (rbp->rb_revision == MDDB_REV_RB) { + /* + * Small device + */ + un32 = (mr_unit32_od_t *)mddb_getrecaddr(recid); + self_id = &(un32->c.un_self_id); + parent_id = &(un32->c.un_parent); + record_id = &(un32->c.un_record_id); + hsp_id = &(un32->un_hsp_id); + + for (i = 0; i < un32->un_totalcolumncnt; i++) { + mr_column32_od_t *device; + + device = &un32->un_column[i]; + if (!md_update_minor(setno, mddb_getsidenum + (setno), device->un_orig_key)) + goto out; + + if (device->un_hs_id != 0) + device->un_hs_id = MAKERECID( + setno, device->un_hs_id); + } + } else { + un64 = (mr_unit_t *)mddb_getrecaddr(recid); + self_id = &(un64->c.un_self_id); + parent_id = &(un64->c.un_parent); + record_id = &(un64->c.un_record_id); + hsp_id = &(un64->un_hsp_id); + + for (i = 0; i < un64->un_totalcolumncnt; i++) { + mr_column_t *device; + + device = &un64->un_column[i]; + if (!md_update_minor(setno, mddb_getsidenum + (setno), device->un_orig_key)) + goto out; + + if (device->un_hs_id != 0) + device->un_hs_id = MAKERECID( + setno,
device->un_hs_id); + } + } + + /* + * Update unit with the imported setno + */ + mddb_setrecprivate(recid, MD_PRV_GOTIT); + + *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id)); + + if (*hsp_id != -1) + *hsp_id = MAKERECID(setno, DBID(*hsp_id)); + + if (*parent_id != MD_NO_PARENT) + *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id)); + *record_id = MAKERECID(setno, DBID(*record_id)); + gotsomething = 1; + } + +out: + return (gotsomething); +} + +static md_named_services_t raid_named_services[] = { + {raid_hotspares, "poke hotspares" }, + {raid_rename_check, MDRNM_CHECK }, + {raid_rename_lock, MDRNM_LOCK }, + {(intptr_t (*)()) raid_rename_unlock, MDRNM_UNLOCK }, + {(intptr_t (*)()) raid_probe_dev, "probe open test" }, + {NULL, 0 } +}; + +md_ops_t raid_md_ops = { + raid_open, /* open */ + raid_close, /* close */ + md_raid_strategy, /* strategy */ + NULL, /* print */ + NULL, /* dump */ + NULL, /* read */ + NULL, /* write */ + md_raid_ioctl, /* ioctl */ + raid_snarf, /* raid_snarf */ + raid_halt, /* raid_halt */ + NULL, /* aread */ + NULL, /* awrite */ + raid_imp_set, /* import set */ + raid_named_services +}; + +static void +init_init() +{ + /* default to half a second */ + if (md_wr_wait == 0) + md_wr_wait = md_hz >> 1; + + raid_parent_cache = kmem_cache_create("md_raid_parent", + sizeof (md_raidps_t), 0, raid_parent_constructor, + raid_parent_destructor, raid_run_queue, NULL, NULL, 0); + raid_child_cache = kmem_cache_create("md_raid_child", + sizeof (md_raidcs_t) - sizeof (buf_t) + biosize(), 0, + raid_child_constructor, raid_child_destructor, + raid_run_queue, NULL, NULL, 0); + raid_cbuf_cache = kmem_cache_create("md_raid_cbufs", + sizeof (md_raidcbuf_t), 0, raid_cbuf_constructor, + raid_cbuf_destructor, raid_run_queue, NULL, NULL, 0); +} + +static void +fini_uninit() +{ + kmem_cache_destroy(raid_parent_cache); + kmem_cache_destroy(raid_child_cache); + kmem_cache_destroy(raid_cbuf_cache); + raid_parent_cache = raid_child_cache = raid_cbuf_cache = NULL; +} + +/* define the module linkage */ +MD_PLUGIN_MISC_MODULE("raid module %I%", init_init(), fini_uninit())