diff options
author | stevel@tonic-gate <none@none> | 2005-06-14 00:00:00 -0700 |
---|---|---|
committer | stevel@tonic-gate <none@none> | 2005-06-14 00:00:00 -0700 |
commit | 7c478bd95313f5f23a4c958a745db2134aa03244 (patch) | |
tree | c871e58545497667cbb4b0a4f2daf204743e1fe7 /usr/src/lib/lvm | |
download | illumos-joyent-7c478bd95313f5f23a4c958a745db2134aa03244.tar.gz |
OpenSolaris Launch
Diffstat (limited to 'usr/src/lib/lvm')
120 files changed, 82196 insertions, 0 deletions
diff --git a/usr/src/lib/lvm/Makefile b/usr/src/lib/lvm/Makefile new file mode 100644 index 0000000000..15d40f66b7 --- /dev/null +++ b/usr/src/lib/lvm/Makefile @@ -0,0 +1,66 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 1998-2002 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# +# ident "%Z%%M% %I% %E% SMI" +# + +include ../Makefile.lib + +# libmeta is listed first; .WAIT holds back libpreen and libsvm until it is done. +SUBDIRS = libmeta .WAIT libpreen libsvm +HDRSUBDIRS = libsvm +DCSUBDIRS = libmeta +MSGSUBDIRS = libmeta + +# Conditional macros: the top-level target being built selects the TARGET +# that the $(SUBDIRS) rule below hands to each sub-make. +all := TARGET = all +clean := TARGET = clean +clobber := TARGET = clobber +check := TARGET = check +debug := TARGET = debug +install := TARGET = install +install_h := TARGET = install_h +lint := TARGET = lint +_dc := TARGET = _dc +_msg := TARGET = _msg + +.KEEP_STATE: + +.PARALLEL: $(SUBDIRS) + +all clean clobber debug lint: $(SUBDIRS) + +# NOTE(review): ROOTDIRS is not assigned in this file; it is defined in +# Makefile.lvm, which is not included here -- confirm where it comes from. +install: $(ROOTDIRS) $(SUBDIRS) + +_dc: $(DCSUBDIRS) + +check install_h: $(HDRSUBDIRS) + +_msg: $(MSGSUBDIRS) + +# FRC has no dependencies and no rule body, so the $(SUBDIRS) rule always runs. +$(SUBDIRS): FRC + @cd $@; pwd; $(MAKE) $(TARGET) + +FRC: + +include $(SRC)/lib/Makefile.targ diff --git a/usr/src/lib/lvm/Makefile.lvm b/usr/src/lib/lvm/Makefile.lvm new file mode 100644 index 0000000000..99a4d94727 --- /dev/null +++ b/usr/src/lib/lvm/Makefile.lvm @@ -0,0 +1,39 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2004 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# +# ident "%Z%%M% %I% %E% SMI" +# + +include $(SRC)/lib/Makefile.lib + +sparc_C_PICFLAGS = -K PIC + +SRCDIR = ../common + +# base target directories (created by the $(ROOTDIRS) rule in lvm/Makefile.targ) +ROOTDIRS = $(ROOT)/usr $(ROOTHDRDIR) $(ROOTLIBDIR) + +# rpcgen flags used by the *.x rules in libmeta/Makefile.com +RPCGENFLAGS = -C -M +CFLAGS += $(CCVERBOSE) diff --git a/usr/src/lib/lvm/Makefile.targ b/usr/src/lib/lvm/Makefile.targ new file mode 100644 index 0000000000..41665529c2 --- /dev/null +++ b/usr/src/lib/lvm/Makefile.targ @@ -0,0 +1,42 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 1998-2001 by Sun Microsystems, Inc. +# All rights reserved. +# +# ident "%Z%%M% %I% %E% SMI" +# + +include $(SRC)/lib/Makefile.targ + +# basic target directories +$(ROOTDIRS): + $(INS.dir) + +# the debug target overrides the optimization flags with -g and adds -g to DYNFLAGS +debug := COPTFLAG = -g +debug := COPTFLAG64 = -g +debug := DYNFLAGS += -g + +# run cstyle (with -pP) over $(SRCS) +cstyle: + cstyle -pP $(SRCS) + +lint: lintcheck diff --git a/usr/src/lib/lvm/libmeta/Makefile b/usr/src/lib/lvm/libmeta/Makefile new file mode 100644 index 0000000000..ab4fe9337c --- /dev/null +++ b/usr/src/lib/lvm/libmeta/Makefile @@ -0,0 +1,78 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). 
You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2003 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# + +include $(SRC)/lib/Makefile.lib + +SUBDIRS = $(MACH) + +# NOTE(review): MSGSRCS also matches */*.h, but only %.c names are mapped to +# %.i below, so any matched header would stay in MSGFILES unchanged and be +# removed by "clean" -- confirm no headers live directly under */. +MSGSRCS :sh= echo */*.[ch] +MSGFILES = $(MSGSRCS:%.c=%.i) +POFILE = libmeta.po + +DCFILES = common/meta_print.po +DCFILE = libmeta.dc + +# TARGET is handed to the sub-makes by the "spec $(MACH)" rule below; every +# target that recurses needs a value, otherwise the sub-make is run with no +# target argument at all. +all := TARGET= all +install := TARGET= install +clean := TARGET= clean +clobber := TARGET= clobber +lint := TARGET= lint +test := TARGET= test +debug := TARGET= debug + +CPPFLAGS += -I$(SRC)/lib/lvm/libmeta/common/hdrs + +.KEEP_STATE: + +# spec is built before $(MACH) (.WAIT) +all debug install: spec .WAIT $(SUBDIRS) + +clean: spec $(SUBDIRS) + $(RM) $(MSGFILES) $(DCFILES) + +clobber: spec $(SUBDIRS) + $(RM) $(POFILE) $(DCFILE) + +lint: $(SUBDIRS) + +$(DCFILE):= XGETFLAGS = -c TRANSLATION_NOTE_LC_TIME -t + +$(DCFILE): $(DCFILES) + $(CAT) $(DCFILES) > $(DCFILE) + +$(POFILE): $(MSGFILES) + $(BUILDPO.msgfiles) + +_msg: $(MSGDOMAINPOFILE) + +_dc: $(DCMSGDOMAINPOFILE) + +# FRC has no dependencies and no rule body, so this rule always runs. +spec $(MACH): FRC + @cd $@; pwd; $(MAKE) $(TARGET) + +FRC: + +include $(SRC)/Makefile.msg.targ diff --git a/usr/src/lib/lvm/libmeta/Makefile.com b/usr/src/lib/lvm/libmeta/Makefile.com new file mode 100644 index 0000000000..ad747e2331 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/Makefile.com @@ -0,0 +1,189 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the 
terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2004 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# + +LIBRARY= libmeta.a +VERS= .1 +COMMON = $(SRC)/common/lvm + +CMN_OBJS = md_crc.o + +# Objects whose .c sources are generated by the rpcgen rules further down. +DERIVED_OBJS = \ + mdiox_xdr.o \ + meta_basic_xdr.o \ + metad_clnt.o \ + metad_xdr.o \ + metamed_clnt.o \ + metamed_xdr.o \ + metamhd_clnt.o \ + metamhd_xdr.o \ + mdmn_commd_xdr.o \ + mhdx_xdr.o + +LOCAL_OBJS= \ + metad_svc_stubs.o \ + meta_admin.o \ + meta_attach.o \ + meta_db.o \ + meta_db_balance.o \ + meta_devadm.o \ + meta_devstamp.o \ + meta_error.o \ + meta_getdevs.o \ + meta_hotspares.o \ + meta_import.o \ + meta_init.o \ + meta_lib_prv.o \ + meta_mdcf.o \ + meta_med_err.o \ + meta_mem.o \ + meta_metad.o \ + meta_metad_subr.o \ + meta_med.o \ + meta_mh.o \ + meta_mirror.o \ + meta_mirror_resync.o \ + meta_mn_comm.o \ + meta_mn_changelog.o \ + meta_mn_handlers.o \ + meta_mn_msg_table.o \ + meta_mn_subr.o \ + meta_mount.o \ + meta_name.o \ + meta_nameinfo.o \ + meta_namespace.o \ + meta_notify.o \ + meta_se_notify.o \ + meta_patch.o \ + meta_patch_root.o \ + meta_print.o \ + meta_raid.o \ + meta_raid_resync.o \ + meta_rename.o \ + meta_repartition.o \ + meta_replace.o \ + meta_reset.o \ + 
meta_resync.o \ + meta_runtime.o \ + meta_set.o \ + meta_set_drv.o \ + meta_set_hst.o \ + meta_set_med.o \ + meta_set_prv.o \ + meta_set_tkr.o \ + meta_setup.o \ + meta_smf.o \ + meta_stat.o \ + meta_sp.o \ + meta_stripe.o \ + meta_systemfile.o \ + meta_tab.o \ + meta_time.o \ + meta_trans.o \ + meta_userflags.o \ + metarpcopen.o \ + metasplitname.o \ + metagetroot.o \ + sdssc_bind.o + +SPC_OBJS= meta_check.o + +CMN_SRCS = $(CMN_OBJS:%.o=$(COMMON)/%.c) +LOCAL_SRCS = $(LOCAL_OBJS:%.o=../common/%.c) +DERIVED_SRCS = $(DERIVED_OBJS:%.o=%.c) +SPC_SRCS = $(SPC_OBJS:%.o=../common/%.c) + +include ../../../Makefile.lib + +MAPDIR= $(SRC)/lib/lvm/libmeta/spec/$(TRANSMACH) +SPECMAPFILE = $(MAPDIR)/mapfile +OBJECTS64 = $(LOCAL_OBJS) $(DERIVED_OBJS) $(CMN_OBJS) +OBJECTS = $(OBJECTS64) $(SPC_OBJS) + +include $(SRC)/lib/lvm/Makefile.lvm + +# install this library in the root filesystem +include ../../../Makefile.rootfs + +LIBS = $(DYNLIB) $(LINTLIB) +SRCS = $(CMN_SRCS) $(LOCAL_SRCS) $(DERIVED_SRCS) +$(LINTLIB) := SRCS = $(SRCDIR)/$(LINTSRC) +lint := SRCS = $(CMN_SRCS) $(LOCAL_SRCS) $(SPC_SRCS) +CPPFLAGS += -I$(SRC)/lib/lvm/libmeta/common/hdrs +LDLIBS += -lnsl -lc -ladm -ldevid -lgen -lefi -ldevinfo -lscf +CLEANFILES += $(DERIVED_SRCS) + +.KEEP_STATE: + +# The pics/ objects built from OBJECTS64 -- every object except $(SPC_OBJS), +# i.e. meta_check.o -- are compiled with the large-file macros below. +BIG_TARGETS = $(OBJECTS64:%=pics/%) + +$(BIG_TARGETS) := CPPFLAGS += -D_LARGEFILE_SOURCE=1 -D_FILE_OFFSET_BITS=64 + +$(LINTLIB) := CPPFLAGS += -D_LARGEFILE_SOURCE=1 -D_FILE_OFFSET_BITS=64 + +all: $(LIBS) + +# objects whose sources live in $(COMMON) (md_crc.c) +objs/%.o profs/%.o pics/%.o: $(COMMON)/%.c + $(COMPILE.c) -o $@ $< + $(POST_PROCESS_O) + +# Generated RPC/XDR sources. Most rules that read a .x file from +# uts/common/sys/lvm pipe rpcgen's output through nawk, which replaces the +# first "uts/common/sys/lvm" on each output line with "head". +# NOTE(review): the mdmn_commd_xdr.c rule is the only one that does not pass +# $(RPCGENFLAGS) to $(RPCGEN) and does not use the nawk filter -- confirm +# that this is intentional. +mdiox_xdr.c: $(SRC)/uts/common/sys/lvm/mdiox.x + $(RPCGEN) $(RPCGENFLAGS) -c -i 100 $(SRC)/uts/common/sys/lvm/mdiox.x | \ + nawk '{sub(/uts\/common\/sys\/lvm/, "head"); print $$0}' >$@ + +meta_basic_xdr.c: $(SRC)/uts/common/sys/lvm/meta_basic.x + $(RPCGEN) $(RPCGENFLAGS) -c $(SRC)/uts/common/sys/lvm/meta_basic.x | \ + nawk '{sub(/uts\/common\/sys\/lvm/, "head"); print $$0}' >$@ + +metad_clnt.c: $(SRC)/head/metad.x + $(RPCGEN) $(RPCGENFLAGS) 
-l $(SRC)/head/metad.x -o $@ + +metad_xdr.c: $(SRC)/head/metad.x + $(RPCGEN) $(RPCGENFLAGS) -c $(SRC)/head/metad.x -o $@ + +metamed_clnt.c: $(SRC)/uts/common/sys/lvm/metamed.x + $(RPCGEN) $(RPCGENFLAGS) -l $(SRC)/uts/common/sys/lvm/metamed.x | \ + nawk '{sub(/uts\/common\/sys\/lvm/, "head"); print $$0}' >$@ + +metamed_xdr.c: $(SRC)/uts/common/sys/lvm/metamed.x + $(RPCGEN) $(RPCGENFLAGS) -c $(SRC)/uts/common/sys/lvm/metamed.x | \ + nawk '{sub(/uts\/common\/sys\/lvm/, "head"); print $$0}' >$@ + +metamhd_clnt.c: $(SRC)/head/metamhd.x + $(RPCGEN) $(RPCGENFLAGS) -l $(SRC)/head/metamhd.x -o $@ + +metamhd_xdr.c: $(SRC)/head/metamhd.x + $(RPCGEN) $(RPCGENFLAGS) -c $(SRC)/head/metamhd.x -o $@ + +mhdx_xdr.c: $(SRC)/uts/common/sys/lvm/mhdx.x + $(RPCGEN) $(RPCGENFLAGS) -c $(SRC)/uts/common/sys/lvm/mhdx.x | \ + nawk '{sub(/uts\/common\/sys\/lvm/, "head"); print $$0}' >$@ + +mdmn_commd_xdr.c: $(SRC)/uts/common/sys/lvm/mdmn_commd.x + $(RPCGEN) -c $(SRC)/uts/common/sys/lvm/mdmn_commd.x -o $@ + +include $(SRC)/lib/lvm/Makefile.targ diff --git a/usr/src/lib/lvm/libmeta/common/hdrs/meta_lib_prv.h b/usr/src/lib/lvm/libmeta/common/hdrs/meta_lib_prv.h new file mode 100644 index 0000000000..e35cd8c07e --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/hdrs/meta_lib_prv.h @@ -0,0 +1,48 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1992, 1993, 1994, 2000 by Sun Microsystems, Inc. + * All rights reserved. + */ + +/* + * Private libmeta declarations for the mnttab helpers in meta_lib_prv.c. + * + * The include guard is named after this file. It must not be shared with + * any other header (meta_set_prv.h is guarded by _META_SET_COM_H): with a + * shared guard, whichever header is included second is silently skipped. + */ +#ifndef _META_LIB_PRV_H +#define _META_LIB_PRV_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <meta.h> +#include <ctype.h> +#include <sys/mnttab.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* meta_lib_prv.c */ +extern FILE *open_mnttab(void); +extern int close_mnttab(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _META_LIB_PRV_H */ diff --git a/usr/src/lib/lvm/libmeta/common/hdrs/meta_repartition.h b/usr/src/lib/lvm/libmeta/common/hdrs/meta_repartition.h new file mode 100644 index 0000000000..f5053acccd --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/hdrs/meta_repartition.h @@ -0,0 +1,51 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2001, 2002 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _META_REPARTITION_H +#define _META_REPARTITION_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <meta.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * meta_repartition_drive() option flags. Each flag is a distinct bit, + * presumably so that several can be OR-ed together in the "options" + * argument -- TODO confirm against meta_repartition.c. + */ +#define MD_REPART_FORCE 0x01 +#define MD_REPART_LEAVE_REP 0x02 +#define MD_REPART_DONT_LABEL 0x04 + +/* meta_repartition.c */ +extern int meta_repartition_drive(mdsetname_t *sp, + mddrivename_t *dnp, int options, mdvtoc_t *vtocp, md_error_t *ep); + +#ifdef __cplusplus +} +#endif + +#endif /* _META_REPARTITION_H */ diff --git a/usr/src/lib/lvm/libmeta/common/hdrs/meta_set_prv.h b/usr/src/lib/lvm/libmeta/common/hdrs/meta_set_prv.h new file mode 100644 index 0000000000..6f63b161e1 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/hdrs/meta_set_prv.h @@ -0,0 +1,96 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ +/* + * NOTE(review): the guard name does not match the file name + * (meta_set_prv.h); make sure no other header is guarded by + * _META_SET_COM_H, or whichever is included second is silently skipped. + */ +#ifndef _META_SET_COM_H +#define _META_SET_COM_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <meta.h> +#include <ctype.h> +#include <sys/lvm/md_convert.h> + +#ifdef __cplusplus +extern "C" { +#endif +/* + * Rollback helpers. Both macros expand to a bare "if (...) goto rollback", + * so the function using them must define a "rollback" label. RB_TEST + * expands to nothing unless DEBUG is defined. + * NOTE(review): the DEBUG form of RB_TEST carries its own trailing ';' + * while RB_PREEMPT does not, and neither is wrapped in do { } while (0); + * take care when using them directly before an "else". + */ +#define RB_PREEMPT if (md_got_sig()) goto rollback +#ifdef DEBUG +#define RB_TEST(tstpt, tag, ep) if (rb_test(tstpt, tag, (ep)) < 0) \ + goto rollback; +#else /* !DEBUG */ +#define RB_TEST(tstpt, tag, ep) +#endif /* DEBUG */ + +/* meta_setup.c */ +extern int procsigs(int block, sigset_t *oldsigs, md_error_t *ep); + +#ifdef DEBUG +extern int rb_test(int rbt_sel_tpt, char *rbt_sel_tag, md_error_t *ep); +#endif /* DEBUG */ + +/* + * Flag values used by the nodehasset() function. + * Each value is a distinct bit; the per-flag comments list the fields + * that are compared. + */ +#define NHS_N_EQ 0x00000001 /* name == */ +#define NHS_NS_EQ 0x00000002 /* name, setno == */ +#define NHS_NST_EQ 0x00000004 /* name, setno, TS == */ +#define NHS_NSTG_EQ 0x00000008 /* name, setno, TS, genid == */ +#define NHS_NST_EQ_G_GT 0x00000010 /* name, setno, TS ==, genid > */ + +/* + * Node, set, and mediator names can be any printable characters + * (isprint()) except for the characters in the #define that follows. 
+ * The excluded characters are space, '*', '?' and '/'. + */ +#define INVALID_IN_NAMES " *?/" + +/* meta_set_prv.c */ +extern int checkdrive_onnode(mdsetname_t *sp, mddrivename_t *dnp, + char *node, md_error_t *ep); +extern side_t getnodeside(char *node, md_set_desc *sd); +extern int halt_set(mdsetname_t *sp, md_error_t *ep); +extern md_drive_desc *metadrivedesc_append(md_drive_desc **dd, + mddrivename_t *dnp, int dbcnt, int dbsize, + md_timeval32_t timestamp, ulong_t genid, + uint_t flags); +extern int nodehasset(mdsetname_t *sp, char *node, + uint_t match_flag, md_error_t *ep); +extern int nodesuniq(mdsetname_t *sp, int cnt, char **strings, + md_error_t *ep); +extern int own_set(mdsetname_t *sp, char **owner_of_set, + int forceflg, md_error_t *ep); +extern void resync_genid(mdsetname_t *sp, md_set_desc *sd, + ulong_t max_genid, int node_c, char **node_v); +extern int setup_db_bydd(mdsetname_t *sp, md_drive_desc *dd, + int force, md_error_t *ep); +extern int snarf_set(mdsetname_t *sp, bool_t stale_bool, + md_error_t *ep); + +#ifdef __cplusplus +} +#endif + +#endif /* _META_SET_COM_H */ diff --git a/usr/src/lib/lvm/libmeta/common/inc.flg b/usr/src/lib/lvm/libmeta/common/inc.flg new file mode 100644 index 0000000000..58651e7f09 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/inc.flg @@ -0,0 +1,29 @@ +#!/bin/sh +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +#ident "%Z%%M% %I% %E% SMI" +# +# Copyright (c) 1995, 2000 by Sun Microsystems, Inc. +# All rights reserved. + +echo_file usr/src/common/lvm/md_crc.c diff --git a/usr/src/lib/lvm/libmeta/common/llib-lmeta b/usr/src/lib/lvm/libmeta/common/llib-lmeta new file mode 100644 index 0000000000..747bc0f9ae --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/llib-lmeta @@ -0,0 +1,33 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2001 by Sun Microsystems, Inc. + * All rights reserved. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* LINTLIBRARY */ +/* PROTOLIB1 */ + +#include <meta_lib_prv.h> +#include <meta_set_prv.h> diff --git a/usr/src/lib/lvm/libmeta/common/meta_admin.c b/usr/src/lib/lvm/libmeta/common/meta_admin.c new file mode 100644 index 0000000000..9b3e13c10f --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_admin.c @@ -0,0 +1,133 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1992-1994, 2000-2002 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + + +/* + * miscellaneous utilities + */ + +#include <meta.h> + +static int meta_fd = -1; +static major_t meta_major; + +/* + * open administrative device + * + * Opens ADMSPECIAL (read/write, falling back to read-only on EACCES) the + * first time it is called and caches the descriptor in meta_fd; later + * calls return the cached descriptor. On the first open the device's + * major number is stored in meta_major and the driver version obtained + * with MD_IOCGVERSION is compared with MD_DVERSION. + * + * Returns the descriptor on success. On failure returns the result of + * mdsyserror()/mderror(), or -1 when the version ioctl fails. + * + * NOTE(review): when fstat(), the version ioctl, or the version + * comparison fails, meta_fd is left open, so the next call sees + * meta_fd >= 0 and returns it without repeating those checks -- confirm + * that this is intended. + */ +int +open_admin( + md_error_t *ep +) +{ + struct stat buf; + + /* if not already open */ + if (meta_fd < 0) { + ulong_t dversion = 0; + + /* try read/write fall back to readonly */ + if ((meta_fd = open(ADMSPECIAL, O_RDWR, 0)) < 0) { + if (errno != EACCES) + return (mdsyserror(ep, errno, ADMSPECIAL)); + if ((meta_fd = open(ADMSPECIAL, O_RDONLY, 0)) < 0) + return (mdsyserror(ep, errno, ADMSPECIAL)); + } + + /* get major */ + if (fstat(meta_fd, &buf) != 0) + return (mdsyserror(ep, errno, ADMSPECIAL)); + meta_major = major(buf.st_rdev); + + /* check driver version */ + if (metaioctl(MD_IOCGVERSION, &dversion, ep, NULL) != 0) + return (-1); + if (dversion != MD_DVERSION) + return (mderror(ep, MDE_DVERSION, NULL)); + } + + /* return fd */ + return (meta_fd); +} +/* + * Close the cached administrative descriptor, if one is open, and reset + * meta_fd to -1. Returns 0 on success or when nothing was open; if + * close() fails, returns the result of mdsyserror() and leaves meta_fd + * unchanged. + */ +int +close_admin( + md_error_t *ep +) +{ + if (meta_fd >= 0) { + if (close(meta_fd) == -1) + return (mdsyserror(ep, errno, ADMSPECIAL)); + meta_fd = -1; + } + + return (0); +} + +/* + * Returns True if the md_dev64_t passed in is a metadevice. + * Else it returns False. + * + * The test compares the major number of dev with meta_major, which is + * set by open_admin(). A failure to open the admin device is caught + * only by assert(); the error detail in the local md_error_t is not + * reported to the caller. 
+ */ +int +meta_dev_ismeta( + md_dev64_t dev +) +{ + int fd; + md_error_t status = mdnullerror; + + fd = open_admin(&status); + assert(fd >= 0); + return (meta_getmajor(dev) == meta_major); +} + +/* + * Return the value reported by the MD_IOCGETNUNITS ioctl. The value is + * kept in a function-local static after the first successful call; a + * stored value of 0 is treated as "not fetched yet", so the ioctl is + * repeated in that case. Returns -1 if the ioctl fails. + */ +int +meta_get_nunits(md_error_t *ep) +{ + + static set_t max_nunits = 0; + + if (max_nunits == 0) + if (metaioctl(MD_IOCGETNUNITS, &max_nunits, ep, NULL) != 0) + return (-1); + + return (max_nunits); +} +/* + * Build a 64-bit device number from meta_major (established by + * open_admin()) shifted left by NBITSMINOR64, OR-ed with the minor + * number mnum. A failure to open the admin device is caught only by + * assert(). + */ +md_dev64_t +metamakedev(minor_t mnum) +{ + int fd; + md_error_t status = mdnullerror; + + fd = open_admin(&status); + + assert(fd >= 0); + + return (((md_dev64_t)meta_major << NBITSMINOR64) | mnum); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_attach.c b/usr/src/lib/lvm/libmeta/common/meta_attach.c new file mode 100644 index 0000000000..275640d927 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_attach.c @@ -0,0 +1,120 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 1992-2002 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + + +/* + * attach operations + */ + +#include <meta.h> + +/* + * grow generic device + * + * Issues MD_IOCGROW for the metadevice namep in set sp. big_or_little + * selects MD_CRO_64BIT when it equals MD_64BIT_META_DEV and MD_CRO_32BIT + * otherwise. On success the cached information for namep is invalidated + * and 0 is returned. Returns -1 if the misc name cannot be obtained, or + * the result of mdstealerror() if the ioctl fails. + */ +int +meta_concat_generic( + mdsetname_t *sp, + mdname_t *namep, + u_longlong_t big_or_little, + md_error_t *ep +) +{ + md_grow_params_t mgp; + char *miscname; + + /* should have a set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(namep->dev))); + + /* get type */ + if ((miscname = metagetmiscname(namep, ep)) == NULL) + return (-1); + + /* grow device */ + (void) memset(&mgp, 0, sizeof (mgp)); + if (big_or_little == MD_64BIT_META_DEV) + mgp.options = MD_CRO_64BIT; + else + mgp.options = MD_CRO_32BIT; + + mgp.mnum = meta_getminor(namep->dev); + MD_SETDRIVERNAME(&mgp, miscname, sp->setno); + if (metaioctl(MD_IOCGROW, &mgp, &mgp.mde, namep->cname) != 0) + return (mdstealerror(ep, &mgp.mde)); + + /* clear cache */ + meta_invalidate_name(namep); + + /* return success */ + return (0); +} + +/* + * grow the parent of a device + * + * Returns 0 without doing anything when childnp has no parent, has + * MD_MULTI_PARENT as its parent, or when the parent is not a metadevice + * or meta_sp_issp() returns 0 for it. Otherwise the parent is grown with + * meta_concat_generic(), passing the child's un_revision as + * big_or_little, and the function recurses on the parent. Returns -1 + * on error. + * + * NOTE(review): un_revision is compared for equality with + * MD_64BIT_META_DEV in meta_concat_generic() -- confirm it never carries + * additional bits. + */ +int +meta_concat_parent( + mdsetname_t *sp, + mdname_t *childnp, + md_error_t *ep +) +{ + md_common_t *mdp; + mdname_t *parentnp; + md_unit_t *mup; + + /* should have a set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(childnp->dev))); + + /* get parent */ + if ((mdp = meta_get_unit(sp, childnp, ep)) == NULL) + return (-1); + if (! MD_HAS_PARENT(mdp->parent)) + return (0); + if (mdp->parent == MD_MULTI_PARENT) + return (0); + + /* single parent */ + if ((parentnp = metamnumname(&sp, mdp->parent, 0, ep)) == NULL) + return (-1); + /* don't grow non-metadevices or soft partitions */ + if (! 
metaismeta(parentnp) || meta_sp_issp(sp, parentnp, ep) == 0) + return (0); + + if ((mup = meta_get_mdunit(sp, childnp, ep)) == NULL) + return (-1); + + /* grow parent */ + if (meta_concat_generic(sp, parentnp, mup->c.un_revision, ep) != 0) + return (-1); + + /* recursively check for parents of parents */ + return (meta_concat_parent(sp, parentnp, ep)); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_check.c b/usr/src/lib/lvm/libmeta/common/meta_check.c new file mode 100644 index 0000000000..94c103e0aa --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_check.c @@ -0,0 +1,874 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. 
+ */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +/* + * check components + */ + +#include <meta.h> +#include "meta_lib_prv.h" + +#include <sys/mnttab.h> +#include <sys/swap.h> + +#include "meta_lib_prv.h" +#include <devid.h> +#include <sys/dumpadm.h> + +/* + * static list(s) + */ +typedef struct dev_list { + char *dev_name; + ddi_devid_t devid; + struct dev_list *dev_nxt; +} dev_list_t; + +static dev_list_t *devnamelist = NULL; + +/* + * free swap info + * + * Frees every non-NULL ste_path of the swt_n entries and then the table + * itself. A NULL table is accepted and ignored. + */ +static void +free_swapinfo( + struct swaptable *swtp +) +{ + int i; + + if (swtp == NULL) + return; + + for (i = 0; (i < swtp->swt_n); ++i) { + if (swtp->swt_ent[i].ste_path != NULL) + Free(swtp->swt_ent[i].ste_path); + } + + Free(swtp); +} + +/* + * get swap info + * + * Asks swapctl(SC_GETNSWP) for the number of entries, allocates a table + * with a MAXPATHLEN path buffer per entry, and fills it with + * swapctl(SC_LIST). On success returns 0 with the table in *swtpp and + * the count returned by SC_LIST in *nswap; the caller releases the table + * with free_swapinfo(). On failure returns non-zero with the error in ep. + * + * NOTE(review): the table size is computed as sizeof (swt_n) plus the + * entries, which assumes swt_ent directly follows swt_n with no padding + * -- confirm for every data model this is built for. + * NOTE(review): when SC_LIST fails the table is freed but *swtpp is not + * reset to NULL; callers must not use it after a non-zero return. + */ +static int +get_swapinfo( + struct swaptable **swtpp, + int *nswap, + md_error_t *ep +) +{ + int i; + size_t swtsize; + + *swtpp = NULL; + + /* get number of entries */ + if ((*nswap = swapctl(SC_GETNSWP, NULL)) < 0) { + return (mdsyserror(ep, errno, "swapctl(SC_GETNSWP)")); + } + + /* allocate structure */ + swtsize = sizeof ((*swtpp)->swt_n) + + ((*nswap) * sizeof ((*swtpp)->swt_ent[0])); + *swtpp = (struct swaptable *)Zalloc(swtsize); + (*swtpp)->swt_n = *nswap; + for (i = 0; (i < (*nswap)); ++i) + (*swtpp)->swt_ent[i].ste_path = Zalloc(MAXPATHLEN); + + /* get info */ + if (((*nswap) = swapctl(SC_LIST, (*swtpp))) < 0) { + (void) mdsyserror(ep, errno, "swapctl(SC_LIST)"); + free_swapinfo(*swtpp); + return (-1); + } + + /* return success */ + return (0); +} + +/* + * check whether device is swapped on + * + * Each swap entry is resolved with metaname(); entries that cannot be + * resolved are skipped with the error cleared. Returns 0 if np is neither + * a swap device nor overlapping one, otherwise the error result; the + * scan stops at the first non-zero result. + */ +static int +meta_check_swapped( + mdsetname_t *sp, + mdname_t *np, + md_error_t *ep +) +{ + struct swaptable *swtp; + int nswap; + int i; + int rval = 0; + + /* should have a set */ + assert(sp != NULL); + + /* get swap info */ + if (get_swapinfo(&swtp, &nswap, ep) != 0) + return (-1); + + /* look for match */ + for (i = 0; ((i < nswap) && (rval == 0)); ++i) { + mdname_t *snp; + + if ((snp = metaname(&sp, 
swtp->swt_ent[i].ste_path, + ep)) == NULL) { + mdclrerror(ep); + continue; + } + if (np->dev == snp->dev) { + rval = mddeverror(ep, MDE_IS_SWAPPED, + np->dev, np->cname); + } else { /* not swap - does it overlap */ + rval = meta_check_overlap(snp->cname, np, 0, -1, + snp, 0, -1, ep); + if (rval != 0) { + (void) mdoverlaperror(ep, MDE_OVERLAP_SWAP, + np->cname, NULL, snp->cname); + } + } + } + free_swapinfo(swtp); + + /* 0 if no conflict was found, otherwise the error result */ + return (rval); +} + +/* + * Is a drive currently swapped on? + * + * Compares dnp->cname with the drive name of every swap entry that + * metaname() can resolve. Unlike meta_check_swapped() the loop does not + * stop at the first match. Returns 0 if no swap entry is on the drive, + * otherwise the result of mddeverror(). + */ +int +meta_check_driveswapped( + mdsetname_t *sp, + mddrivename_t *dnp, + md_error_t *ep +) +{ + struct swaptable *swtp; + int nswap; + int i; + int rval = 0; + + /* should have a set */ + assert(sp != NULL); + + /* get swap info */ + if (get_swapinfo(&swtp, &nswap, ep) != 0) + return (-1); + + /* look for match */ + for (i = 0; (i < nswap); ++i) { + mdname_t *snp; + + if ((snp = metaname(&sp, swtp->swt_ent[i].ste_path, + ep)) == NULL) { + mdclrerror(ep); + continue; + } + + if (strcmp(dnp->cname, snp->drivenamep->cname) == 0) { + rval = mddeverror(ep, MDE_IS_SWAPPED, NODEV64, + dnp->cname); + } + } + free_swapinfo(swtp); + + /* 0 if no conflict was found, otherwise the error result */ + return (rval); +} + +/* + * check whether device is a dump device + * + * Reads the current dump device name from /dev/dump with DIOCGETDEV. + * Returns 0 if the ioctl fails, if the name cannot be resolved by + * metaname(), or if np neither is nor overlaps the dump device; + * otherwise returns the error result. The descriptor is closed on + * every path after a successful open. + */ +static int +meta_check_dump( + mdsetname_t *sp, + mdname_t *np, + md_error_t *ep +) +{ + int rval = 0; + int dump_fd; + char device[MAXPATHLEN]; + + + if ((dump_fd = open("/dev/dump", O_RDONLY)) < 0) + return (mdsyserror(ep, errno, "/dev/dump")); + + if (ioctl(dump_fd, DIOCGETDEV, device) != -1) { + mdname_t *dump_np; + + if ((dump_np = metaname(&sp, device, ep)) == NULL) { + mdclrerror(ep); + (void) close(dump_fd); + return (0); + } + + if (np->dev == dump_np->dev) { + rval = mddeverror(ep, MDE_IS_DUMP, + np->dev, np->cname); + } else { /* not a dump device - but does it overlap? 
*/ + rval = meta_check_overlap(dump_np->cname, np, 0, -1, + dump_np, 0, -1, ep); + if (rval != 0) { + (void) mdoverlaperror(ep, MDE_OVERLAP_DUMP, + np->cname, NULL, dump_np->cname); + } + } + } + (void) close(dump_fd); + return (rval); +} + +/* + * check whether device is mounted + * + * Scans the mount table obtained from open_mnttab(). Entries without a + * special device or mount point, with a mount point that does not start + * with '/', or with one of the listed file system types are skipped, as + * are entries whose special device metaname() cannot resolve. Returns 0 + * if np is neither mounted nor overlapping a mounted device, otherwise + * the error result; the scan stops at the first non-zero result. + * + * NOTE(review): mfp is not closed here; presumably the stream is owned + * by open_mnttab()/close_mnttab() -- verify against meta_lib_prv.c. + */ +static int +meta_check_mounted( + mdsetname_t *sp, + mdname_t *np, + md_error_t *ep +) +{ + FILE *mfp; + struct mnttab m; + int rval = 0; + char mountp[MNT_LINE_MAX]; + char mnt_special[MNT_LINE_MAX]; + + /* should have a set */ + assert(sp != NULL); + + /* look in mnttab */ + if ((mfp = open_mnttab()) == NULL) + return (mdsyserror(ep, errno, MNTTAB)); + while ((getmntent(mfp, &m) == 0) && (rval == 0)) { + mdname_t *mnp; + + if ((m.mnt_special == NULL) || (m.mnt_mountp == NULL)) + continue; + + if (m.mnt_mountp[0] != '/') + continue; + + if ((strcmp(m.mnt_fstype, "nfs") == 0) || + (strcmp(m.mnt_fstype, "autofs") == 0) || + (strcmp(m.mnt_fstype, "proc") == 0) || + (strcmp(m.mnt_fstype, "tmpfs") == 0) || + (strcmp(m.mnt_fstype, "cachefs") == 0) || + (strcmp(m.mnt_fstype, "lofs") == 0) || + (strcmp(m.mnt_fstype, "rfs") == 0) || + (strcmp(m.mnt_fstype, "fd") == 0) || + (strcmp(m.mnt_fstype, "mntfs") == 0) || + (strcmp(m.mnt_fstype, "devfs") == 0)) + continue; + + (void) strcpy(mountp, m.mnt_mountp); + (void) strcpy(mnt_special, m.mnt_special); + + if ((mnp = metaname(&sp, mnt_special, ep)) == NULL) { + mdclrerror(ep); + continue; + } + + if (np->dev == mnp->dev) { + rval = mduseerror(ep, MDE_IS_MOUNTED, + np->dev, mountp, np->cname); + } else { /* device isn't in mnttab - does it overlap? */ + rval = meta_check_overlap(mnp->cname, np, 0, -1, + mnp, 0, -1, ep); + if (rval != 0) { + (void) mdoverlaperror(ep, MDE_OVERLAP_MOUNTED, + np->cname, mountp, mnp->cname); + } + } + } + + /* 0 if no conflict was found, otherwise the error result */ + return (rval); +} + + +/* + * Is a file system currently mounted on this disk drive? 
+ */ +int +meta_check_drivemounted( + mdsetname_t *sp, + mddrivename_t *dnp, + md_error_t *ep +) +{ + FILE *mfp; + struct mnttab m; + int rval = 0; + char mountp[MNT_LINE_MAX]; + char mnt_special[MNT_LINE_MAX]; + + /* should have a set */ + assert(sp != NULL); + + /* look in mnttab */ + if ((mfp = open_mnttab()) == NULL) + return (mdsyserror(ep, errno, MNTTAB)); + while ((getmntent(mfp, &m) == 0) && (rval == 0)) { + mdname_t *mnp; + + if ((m.mnt_special == NULL) || (m.mnt_mountp == NULL)) + continue; + + if (m.mnt_mountp[0] != '/') + continue; + + if ((strcmp(m.mnt_fstype, "nfs") == 0) || + (strcmp(m.mnt_fstype, "autofs") == 0) || + (strcmp(m.mnt_fstype, "proc") == 0) || + (strcmp(m.mnt_fstype, "tmpfs") == 0) || + (strcmp(m.mnt_fstype, "cachefs") == 0) || + (strcmp(m.mnt_fstype, "lofs") == 0) || + (strcmp(m.mnt_fstype, "rfs") == 0) || + (strcmp(m.mnt_fstype, "fd") == 0)) + continue; + + (void) strcpy(mountp, m.mnt_mountp); + (void) strcpy(mnt_special, m.mnt_special); + if ((mnp = metaname(&sp, mnt_special, ep)) == NULL) { + mdclrerror(ep); + continue; + } + if (strcmp(dnp->cname, mnp->drivenamep->cname) == 0) { + rval = mduseerror(ep, MDE_IS_MOUNTED, NODEV64, + mountp, dnp->cname); + } + } + + /* return success */ + return (rval); +} + +/* + * Check to see if the specified name is already in use or overlaps + * with a device already in use. Checks are made to determine whether + * the device is mounted, is a swap device, or a dump device. In each + * case if the device is not in use then an overlap check is done to ensure + * that the specified slice does not overlap. 
+ */ +int +meta_check_inuse( + mdsetname_t *sp, + mdname_t *np, + mdinuseopts_t inuse_flags, + md_error_t *ep +) +{ + int rval = 0; + + if ((inuse_flags & MDCHK_MOUNTED) && + (rval = meta_check_mounted(sp, np, ep)) != 0) + return (rval); + + if ((inuse_flags & MDCHK_SWAP) && + (rval = meta_check_swapped(sp, np, ep)) != 0) + return (rval); + + if ((inuse_flags & MDCHK_DUMP) && + (rval = meta_check_dump(sp, np, ep)) != 0) + return (rval); + + return (rval); +} + +int +meta_check_driveinset(mdsetname_t *sp, mddrivename_t *dn, md_error_t *ep) +{ + set_t setno; + set_t max_sets; + + if ((max_sets = get_max_sets(ep)) == 0) + return (-1); + + for (setno = 1; setno < max_sets; setno++) { + mdsetname_t *sp1; + int is_it; + + if (setno == sp->setno) + continue; + + if ((sp1 = metasetnosetname(setno, ep)) == NULL) { + if (mdismddberror(ep, MDE_DB_NODB)) { + mdclrerror(ep); + return (0); + } + if (mdiserror(ep, MDE_NO_SET)) { + mdclrerror(ep); + continue; + } + return (-1); + } + + metaflushsetname(sp1); + + if ((is_it = meta_is_drive_in_thisset(sp1, dn, FALSE, ep)) + == -1) + return (-1); + + if (is_it) + return (mddserror(ep, MDE_DS_DRIVEINSET, sp->setno, + sp1->setname, dn->cname, sp->setname)); + } + + return (0); +} + +/* + * Add a device/device id tuple to the devname cache + */ +static void +add_to_devname_list( + char *device_name, /* fully qualified dev name */ + ddi_devid_t devid /* device id */ +) +{ + dev_list_t *dnlp; + + dnlp = Zalloc(sizeof (*dnlp)); + dnlp->dev_name = Strdup(device_name); + dnlp->devid = devid; + + /* link the node into the devname list */ + dnlp->dev_nxt = devnamelist; + devnamelist = dnlp; +} + +/* + * check for same drive + */ +int +meta_check_samedrive( + mdname_t *np1, /* first comp */ + mdname_t *np2, /* second comp */ + md_error_t *ep +) +{ + + mdcinfo_t *cinfop1, *cinfop2; + mdnmtype_t type1 = np1->drivenamep->type; + mdnmtype_t type2 = np2->drivenamep->type; + int l = 0; + + char *name1 = NULL; + char *name2 = NULL; + + int retval = 
-1; + int fd1 = -1; + int fd2 = -1; + int rc1 = -2, rc2 = -2; + uint_t strl1 = 0, strl2 = 0; + int devid1_found = 0; + int devid2_found = 0; + + ddi_devid_t devid1 = NULL; + ddi_devid_t devid2 = NULL; + dev_list_t *dnlp = NULL; + + assert(type1 != MDT_FAST_META && type1 != MDT_FAST_COMP); + assert(type2 != MDT_FAST_META && type2 != MDT_FAST_COMP); + + /* + * The process of determining if 2 names are the same drive is + * as follows: + * + * Case 1 - The filenames are identical + * + * Case 2 - Either name is a metadevice name. If so then they + * are not the same drive. + * + * Case 3 - Both devices have a devid + * get and compare the devids for the devices. If both + * devices have a devid then the compare will is all + * that is needed we are done. + * + * Case 4 - One or more devices does not have a devid + * start by doing a simple compare of the name, if they + * are the same just return. + * + * If the names differ then keep going and see if the + * may be the same underlying devic. First check to + * see if the sd name is the same (old code). + * + * Then check the major and minor numbers to see if + * they are the same. If they are then return (old code). + * + * Next compare the raw name and the component name and + * if they are the same then return. + * + * All else has failed so use the component name (cname) + * component number and unit number. If they all are + * equal then call them the same drive. 
+ * + */ + + if ((np1 == NULL) || (np2 == NULL)) + return (0); + + /* if the name structs are the same then the drives must be */ + if (np1 == np2) + return (1); + + name1 = np1->bname; + name2 = np2->bname; + + if ((name1 == NULL) || ((strl1 = strlen(name1)) == 0) || + (name2 == NULL) || ((strl2 = strlen(name2)) == 0)) + return (0); + + if ((strl1 == strl2) && (strcmp(name1, name2) == 0)) { + /* names are identical */ + return (1); + } + + if (is_metaname(name1) || is_metaname(name2)) + return (0); + + /* + * Check to see if the devicename is in the static list. If so, + * use its devid. Otherwise do the expensive operations + * of opening the device, getting the devid, and closing the + * device. Add the result into the static list. + * + * The case where this list will be useful is when there are soft + * partitions on multiple drives and a new soft partition is being + * created. In that situation the underlying physical device name + * for the new soft partition would be compared against each of the + * existing soft partititions. Without this static list that would + * involve 2 opens, closes, and devid gets for each existing soft + * partition + */ + for (dnlp = devnamelist; + (dnlp != NULL) && !(devid1_found && devid2_found); + dnlp = dnlp->dev_nxt) { + if (!devid1_found && (strcmp(dnlp->dev_name, name1) == 0)) { + devid1_found = 1; + devid1 = dnlp->devid; + if (devid1 == NULL) + rc1 = 1; + else + rc1 = 0; + continue; + } + if (!devid2_found && (strcmp(dnlp->dev_name, name2) == 0)) { + devid2_found = 1; + devid2 = dnlp->devid; + if (devid2 == NULL) + rc2 = 1; + else + rc2 = 0; + continue; + } + } + + /* + * Start by checking if the device has a device id, and if they + * are equal. If they are there is no question there is a match. + * + * The process here is open each disk, get the devid for each + * disk. If they both have a devid compare them and return + * the results. 
+ */ + if (!devid1_found) { + if ((fd1 = open(name1, O_RDONLY | O_NDELAY)) < 0) { + return (0); + } + rc1 = devid_get(fd1, &devid1); + (void) close(fd1); + + /* add the name and devid to the cache */ + add_to_devname_list(name1, devid1); + } + + if (!devid2_found) { + if ((fd2 = open(name2, O_RDONLY | O_NDELAY)) < 0) { + return (0); + } + rc2 = devid_get(fd2, &devid2); + (void) close(fd2); + + /* add the name and devid to the cache */ + add_to_devname_list(name2, devid2); + } + + + if ((rc1 == 0) && (rc2 == 0)) { + if (devid_compare(devid1, devid2) == 0) + retval = 1; /* same drive */ + else + retval = 0; /* different drives */ + + } + + if (retval >= 0) { + return (retval); + } + + /* + * At this point in time one of the two drives did not have a + * device ID. Do not make the assumption that is one drive + * did have a device id and the other did not that they are not + * the same. One drive could be covered by a device and still + * be the same drive. This is a general flaw in the system at + * this time. + */ + + /* + * The optimization can not happen if we are given an old style name + * in the form /dev/XXNN[a-h], since the name caches differently and + * allows overlaps to happen. + */ + if (! 
((sscanf(np1->bname, "/dev/%*[^0-9/]%*u%*[a-h]%n", &l) == 0 && + l == strlen(np1->bname)) || + (sscanf(np2->bname, "/dev/%*[^0-9/]%*u%*[a-h]%n", &l) == 0 && + l == strlen(np2->bname))) && + ((type1 == MDT_COMP) || (type1 == MDT_META)) && + ((type2 == MDT_COMP) || (type2 == MDT_META))) + return (np1->drivenamep == np2->drivenamep); + + /* check for same drive */ + if (meta_getmajor(np1->dev) != meta_getmajor(np2->dev)) + return (0); /* not same drive */ + + if (((cinfop1 = metagetcinfo(np1, ep)) == NULL) || + ((cinfop2 = metagetcinfo(np2, ep)) == NULL)) { + if ((strcmp(np1->drivenamep->cname, + np2->drivenamep->cname) != 0) && + (strcmp(np1->drivenamep->rname, + np2->drivenamep->rname) != 0)) { + mdclrerror(ep); + return (0); /* not same drive */ + } else { + return (-1); /* can't tell */ + } + } else if ((strncmp(cinfop1->cname, cinfop2->cname, + sizeof (cinfop1->cname)) != 0) || + (cinfop1->cnum != cinfop2->cnum) || + (cinfop1->unit != cinfop2->unit)) { + return (0); /* not same drive */ + } + + /* same drive */ + return (1); +} + +/* + * check for overlap + */ +int +meta_check_overlap( + char *uname, /* user supplied name for errors */ + mdname_t *np1, /* first comp */ + diskaddr_t slblk1, /* first comp - start logical block */ + diskaddr_t nblks1, /* first comp - # of blocks */ + mdname_t *np2, /* second comp */ + diskaddr_t slblk2, /* second comp - start logical block */ + diskaddr_t nblks2, /* second comp - # of blocks */ + md_error_t *ep +) +{ + diskaddr_t sblk1, sblk2; + mdvtoc_t *vtocp1, *vtocp2; + uint_t partno1, partno2; + mdpart_t *partp1, *partp2; + int err; + + /* verify args */ + if (slblk1 == MD_DISKADDR_ERROR) { + assert(0); + return (mdsyserror(ep, EINVAL, np1->cname)); + } + if (slblk2 == MD_DISKADDR_ERROR) { + assert(0); + return (mdsyserror(ep, EINVAL, np2->cname)); + } + + /* check for same drive */ + if ((err = meta_check_samedrive(np1, np2, ep)) == 0) { + return (0); /* not same drive */ + } else if (err < 0) { + return (-1); /* can't tell */ 
+ } + + /* check for overlap */ + if (((vtocp1 = metagetvtoc(np1, FALSE, &partno1, ep)) == NULL) || + ((vtocp2 = metagetvtoc(np2, FALSE, &partno2, ep)) == NULL)) { + return (-1); /* can't tell */ + } + partp1 = &vtocp1->parts[partno1]; + partp2 = &vtocp2->parts[partno2]; + sblk1 = partp1->start + slblk1; + if (nblks1 == -1) + nblks1 = partp1->size - slblk1; + sblk2 = partp2->start + slblk2; + if (nblks2 == -1) + nblks2 = partp2->size - slblk2; + if (((sblk1 >= sblk2) && (sblk1 < (sblk2 + nblks2))) || + ((sblk2 >= sblk1) && (sblk2 < (sblk1 + nblks1)))) { + if (np1->dev == np2->dev) { /* slice in use */ + return (mduseerror(ep, MDE_ALREADY, np1->dev, + uname, np1->cname)); + } + return (mduseerror(ep, /* slice overlaps */ + MDE_OVERLAP, np1->dev, uname, np1->cname)); + } + + /* return success */ + return (0); /* no overlap */ +} + +/* + * check to see if a device is in a metadevice + */ +int +meta_check_inmeta( + mdsetname_t *sp, + mdname_t *np, + mdchkopts_t options, + diskaddr_t slblk, + diskaddr_t nblks, + md_error_t *ep +) +{ + uint_t partno; + + /* see if replica slice is ok, only applies to disks in sets */ + if (! (options & MDCHK_ALLOW_REPSLICE) && + ! 
metaislocalset(sp)) { + uint_t rep_slice; + + if (metagetvtoc(np, FALSE, &partno, ep) == NULL) + return (-1); + if (meta_replicaslice(np->drivenamep, &rep_slice, ep) + != 0) + return (-1); + if (partno == rep_slice) + return (mddeverror(ep, MDE_REPCOMP_INVAL, np->dev, + np->cname)); + } + + /* check for databases */ + if (meta_check_inreplica(sp, np, slblk, nblks, ep) != 0) { + if (mdisuseerror(ep, MDE_ALREADY)) { + if (options & MDCHK_ALLOW_MDDB) { + mdclrerror(ep); + } else { + return (mddeverror(ep, MDE_HAS_MDDB, + np->dev, np->cname)); + } + } else { + return (-1); + } + } + + /* check metadevices */ + if (meta_check_instripe(sp, np, slblk, nblks, ep) != 0) + return (-1); + if (meta_check_inmirror(sp, np, slblk, nblks, ep) != 0) + return (-1); + if (meta_check_intrans(sp, np, options, slblk, nblks, ep) != 0) + return (-1); + if (meta_check_insp(sp, np, slblk, nblks, ep) != 0) + return (-1); + if (! (options & MDCHK_ALLOW_HS)) { + if (meta_check_inhsp(sp, np, slblk, nblks, ep) != 0) + return (-1); + } + if (meta_check_inraid(sp, np, slblk, nblks, ep) != 0) + return (-1); + + /* return success */ + return (0); +} + +/* + * check to see if a device is in its set + */ +int +meta_check_inset( + mdsetname_t *sp, + mdname_t *np, + md_error_t *ep +) +{ + mdsetname_t *npsp; + int bypass_daemon = FALSE; + + + /* check devices set */ + if (metaislocalset(sp)) + bypass_daemon = TRUE; + if ((npsp = metagetset(np, bypass_daemon, ep)) == NULL) { + if ((! 
metaismeta(np)) && + (metaislocalset(sp)) && + (mdismddberror(ep, MDE_DB_NODB))) { + mdclrerror(ep); + npsp = sp; + } else { + return (-1); + } + } + + /* check set */ + if (metaissameset(sp, npsp)) + return (0); + + /* return appropriate error */ + if (metaislocalset(sp)) + return (mddeverror(ep, MDE_IN_SHARED_SET, np->dev, np->cname)); + else + return (mddeverror(ep, MDE_NOT_IN_SET, np->dev, np->cname)); +} + +/* + * check to see if current user is root + */ +int +meta_check_root(md_error_t *ep) +{ + if (geteuid() != 0) { + (void) mderror(ep, MDE_NOPERM, ""); + return (-1); + } + return (0); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_db.c b/usr/src/lib/lvm/libmeta/common/meta_db.c new file mode 100644 index 0000000000..e30eb58c06 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_db.c @@ -0,0 +1,2517 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. 
+ */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +/* + * Metadevice database interfaces. + */ + +#define MDDB + +#include <meta.h> +#include <sys/lvm/md_mddb.h> +#include <sys/lvm/md_crc.h> +#include <sys/lvm/mdio.h> +#include <string.h> +#include <strings.h> +#include <ctype.h> + +struct svm_daemon { + char *svmd_name; + char *svmd_kill_val; +}; + +struct svm_daemon svmd_kill_list[] = { + {"mdmonitord", "HUP"}, + {"mddoors", "KILL"}, + }; + +#define DAEMON_COUNT (sizeof (svmd_kill_list)/ sizeof (struct svm_daemon)) +#define MDMONITORD "/usr/sbin/mdmonitord" + +extern int procsigs(int block, sigset_t *oldsigs, md_error_t *ep); + +/* + * meta_get_lb_inittime sends a request for the lb_inittime to the kernel + */ +md_timeval32_t +meta_get_lb_inittime( + mdsetname_t *sp, + md_error_t *ep +) +{ + mddb_config_t c; + + (void) memset(&c, 0, sizeof (c)); + + /* Fill in setno, setname, and sideno */ + c.c_setno = sp->setno; + + if (metaioctl(MD_DB_LBINITTIME, &c, &c.c_mde, NULL) != 0) { + (void) mdstealerror(ep, &c.c_mde); + } + + return (c.c_timestamp); +} + +/* + * mkmasterblks writes out the master blocks of the mddb to the replica. + * + * In a MN diskset, this is called by the node that is adding this replica + * to the diskset. + */ + +#define MDDB_VERIFY_SIZE 8192 + +static int +mkmasterblks( + mdsetname_t *sp, + mdname_t *np, + int fd, + daddr_t firstblk, + int dbsize, + md_timeval32_t inittime, + md_error_t *ep +) +{ + int consecutive; + md_timeval32_t tp; + struct mddb_mb *mb; + char *buffer; + int iosize; + md_set_desc *sd; + int mn_set = 0; + daddr_t startblk; + int cnt; + ddi_devid_t devid; + + if (! metaislocalset(sp)) { + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + if (MD_MNSET_DESC(sd)) { + mn_set = 1; /* Used later */ + } + } + + /* + * Loop to verify the entire mddb region on disk is read/writable. + * buffer is used to write/read in at most MDDB_VERIFY_SIZE block + * chunks. 
+ * + * A side-effect of this loop is to zero out the entire mddb region + */ + if ((buffer = Zalloc(MDDB_VERIFY_SIZE * DEV_BSIZE)) == NULL) + return (mdsyserror(ep, ENOMEM, np->rname)); + + startblk = firstblk; + for (cnt = dbsize; cnt > 0; cnt -= consecutive) { + + if (cnt > MDDB_VERIFY_SIZE) + consecutive = MDDB_VERIFY_SIZE; + else + consecutive = cnt; + + if (lseek(fd, (off_t)(startblk * DEV_BSIZE), SEEK_SET) < 0) { + Free(buffer); + return (mdsyserror(ep, errno, np->rname)); + } + + iosize = DEV_BSIZE * consecutive; + if (write(fd, buffer, iosize) != iosize) { + Free(buffer); + return (mdsyserror(ep, errno, np->rname)); + } + + if (lseek(fd, (off_t)(startblk * DEV_BSIZE), SEEK_SET) < 0) { + Free(buffer); + return (mdsyserror(ep, errno, np->rname)); + } + + if (read(fd, buffer, iosize) != iosize) { + Free(buffer); + return (mdsyserror(ep, errno, np->rname)); + } + + startblk += consecutive; + } + + Free(buffer); + if ((mb = Zalloc(DEV_BSIZE)) == NULL) + return (mdsyserror(ep, ENOMEM, np->rname)); + + if (meta_gettimeofday(&tp) == -1) { + Free(mb); + return (mdsyserror(ep, errno, np->rname)); + } + + mb->mb_magic = MDDB_MAGIC_MB; + /* + * If a MN diskset, set master block revision for a MN set. + * Even though the master block structure is no different + * for a MN set, setting the revision field to a different + * number keeps any pre-MN_diskset code from accessing + * this diskset. It also allows for an early determination + * of a MN diskset when reading in from disk so that the + * proper size locator block and locator names structure + * can be read in thus saving time on diskset startup. + */ + if (mn_set) + mb->mb_revision = MDDB_REV_MNMB; + else + mb->mb_revision = MDDB_REV_MB; + mb->mb_timestamp = tp; + mb->mb_setno = sp->setno; + mb->mb_blkcnt = dbsize - 1; + mb->mb_blkno = firstblk; + mb->mb_nextblk = 0; + + mb->mb_blkmap.m_firstblk = firstblk + 1; + mb->mb_blkmap.m_consecutive = dbsize - 1; + if (! 
metaislocalset(sp)) { + mb->mb_setcreatetime = inittime; + } + + /* + * We try to save the disks device ID into the remaining bytes in + * the master block. The saved devid is used to provide a mapping + * between this disk's devid and the devid stored into the master + * block. This allows the disk image to be self-identifying + * if it gets copied (e.g. SNDR, True Copy, etc.). This is used + * when we try to import these disks on the remote copied image. + * If we cannot save the disks device ID onto the master block that is + * ok. The disk is just not self-identifying and won't be importable + * in the remote copy scenario. + */ + if (devid_get(fd, &devid) == 0) { + size_t len; + + len = devid_sizeof(devid); + if (len <= DEV_BSIZE - sizeof (*mb)) { + /* there is enough space to store the devid */ + mb->mb_devid_magic = MDDB_MAGIC_DE; + mb->mb_devid_len = len; + (void) memcpy(mb->mb_devid, devid, len); + } + devid_free(devid); + } + + crcgen((uchar_t *)mb, (uint_t *)&mb->mb_checksum, (uint_t)DEV_BSIZE, + (crc_skip_t *)NULL); + + if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) { + Free(mb); + return (mdsyserror(ep, errno, np->rname)); + } + + if (write(fd, mb, DEV_BSIZE) != DEV_BSIZE) { + Free(mb); + return (mdsyserror(ep, errno, np->rname)); + } + + if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) { + Free(mb); + return (mdsyserror(ep, errno, np->rname)); + } + + if (read(fd, mb, DEV_BSIZE) != DEV_BSIZE) { + Free(mb); + return (mdsyserror(ep, errno, np->rname)); + } + + if (crcchk((uchar_t *)mb, (uint_t *)&mb->mb_checksum, + (uint_t)DEV_BSIZE, (crc_skip_t *)NULL)) { + Free(mb); + return (mdmddberror(ep, MDE_NOTVERIFIED, + meta_getminor(np->dev), sp->setno, 0, np->rname)); + } + + Free(mb); + return (0); +} + +void +meta_mkdummymaster( + mdsetname_t *sp, + int fd, + daddr_t firstblk +) +{ + md_timeval32_t tp; + struct mddb_mb *mb; + ddi_devid_t devid; + md_set_desc *sd; + md_error_t ep = mdnullerror; + md_timeval32_t inittime; + + /* + * No 
dummy master blocks are written for a MN diskset since devids + * are not supported in MN disksets. + */ + if (! metaislocalset(sp)) { + if ((sd = metaget_setdesc(sp, &ep)) == NULL) + return; + + if (MD_MNSET_DESC(sd)) + return; + } + + if ((mb = Zalloc(DEV_BSIZE)) == NULL) + return; + + mb->mb_magic = MDDB_MAGIC_DU; + mb->mb_revision = MDDB_REV_MB; + mb->mb_setno = sp->setno; + inittime = meta_get_lb_inittime(sp, &ep); + mb->mb_setcreatetime = inittime; + + if (meta_gettimeofday(&tp) != -1) + mb->mb_timestamp = tp; + + /* + * We try to save the disks device ID into the remaining bytes in + * the master block. This allows the disk image to be self-identifying + * if it gets copied (e.g. SNDR, True Copy, etc.). This is used + * when we try to import these disks on the remote copied image. + * If we cannot save the disks device ID onto the master block that is + * ok. The disk is just not self-identifying and won't be importable + * in the remote copy scenario. + */ + if (devid_get(fd, &devid) == 0) { + int len; + + len = devid_sizeof(devid); + if (len <= DEV_BSIZE - sizeof (*mb)) { + /* there is enough space to store the devid */ + mb->mb_devid_magic = MDDB_MAGIC_DE; + mb->mb_devid_len = len; + (void) memcpy(mb->mb_devid, (char *)devid, len); + } + devid_free(devid); + } + + crcgen((uchar_t *)mb, (uint_t *)&mb->mb_checksum, (uint_t)DEV_BSIZE, + (crc_skip_t *)NULL); + + /* + * If any of these operations fail, we need to inform the + * user that the disk won't be self identifying. When support + * for importing remotely replicated disksets is added, we + * want to add the error messages here. 
+ */ + if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) + goto out; + + if (write(fd, mb, DEV_BSIZE) != DEV_BSIZE) + goto out; + + if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) + goto out; + + if (read(fd, mb, DEV_BSIZE) != DEV_BSIZE) + goto out; + + if (crcchk((uchar_t *)mb, (uint_t *)&mb->mb_checksum, + (uint_t)DEV_BSIZE, (crc_skip_t *)NULL)) + goto out; + +out: + Free(mb); +} + +static int +buildconf(mdsetname_t *sp, md_error_t *ep) +{ + md_replicalist_t *rlp = NULL; + md_replicalist_t *rl; + FILE *cfp = NULL; + FILE *mfp = NULL; + struct stat sbuf; + int rval = 0; + int in_miniroot = 0; + char line[MDDB_BOOTLIST_MAX_LEN]; + char *tname = NULL; + + /* get list of local replicas */ + if (! metaislocalset(sp)) + return (0); + + if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) + return (-1); + + /* open tempfile, copy permissions of original file */ + if ((cfp = fopen(META_DBCONFTMP, "w+")) == NULL) { + /* + * On the miniroot tmp files must be created in /var/tmp. + * If we get a EROFS error, we assume that we are in the + * miniroot. + */ + if (errno != EROFS) + goto error; + in_miniroot = 1; + errno = 0; + tname = tempnam("/var/tmp", "slvm_"); + if (tname == NULL && errno == EROFS) { + /* + * If we are booted on a read-only root because + * of mddb quorum problems we don't want to emit + * any scary error messages. 
+ */ + errno = 0; + goto out; + } + + /* open tempfile, copy permissions of original file */ + if ((cfp = fopen(tname, "w+")) == NULL) + goto error; + } + if (stat(META_DBCONF, &sbuf) == 0) { + if (fchmod(fileno(cfp), (sbuf.st_mode & 0666)) != 0) + goto error; + if (fchown(fileno(cfp), sbuf.st_uid, sbuf.st_gid) != 0) + goto error; + } + + /* print header */ + if (fprintf(cfp, "#metadevice database location file ") == EOF) + goto error; + if (fprintf(cfp, "do not hand edit\n") < 0) + goto error; + if (fprintf(cfp, + "#driver\tminor_t\tdaddr_t\tdevice id\tchecksum\n") < 0) + goto error; + + /* dump replicas */ + for (rl = rlp; (rl != NULL); rl = rl->rl_next) { + md_replica_t *r = rl->rl_repp; + int checksum = 42; + int i; + char *devidp; + minor_t min; + + devidp = devid_str_encode(r->r_devid, r->r_minor_name); + /* If devid code can't encode devidp - skip entry */ + if (devidp == NULL) { + continue; + } + + /* compute checksum */ + for (i = 0; ((r->r_driver_name[i] != '\0') && + (i < sizeof (r->r_driver_name))); i++) { + checksum -= r->r_driver_name[i]; + } + min = meta_getminor(r->r_namep->dev); + checksum -= min; + checksum -= r->r_blkno; + + for (i = 0; i < strlen(devidp); i++) { + checksum -= devidp[i]; + } + /* print info */ + if (fprintf(cfp, "%s\t%lu\t%ld\t%s\t%d\n", + r->r_driver_name, min, r->r_blkno, devidp, checksum) < 0) { + goto error; + } + + devid_str_free(devidp); + } + + /* close and rename to real file */ + if (fflush(cfp) != 0) + goto error; + if (fsync(fileno(cfp)) != 0) + goto error; + if (fclose(cfp) != 0) { + cfp = NULL; + goto error; + } + cfp = NULL; + + /* + * Renames don't work in the miniroot since tmpfiles are + * created in /var/tmp. Hence we copy the data out. + */ + + if (! 
in_miniroot) { + if (rename(META_DBCONFTMP, META_DBCONF) != 0) + goto error; + } else { + if ((cfp = fopen(tname, "r")) == NULL) + goto error; + if ((mfp = fopen(META_DBCONF, "w+")) == NULL) + goto error; + while (fgets(line, MDDB_BOOTLIST_MAX_LEN, cfp) != NULL) { + if (fputs(line, mfp) == NULL) + goto error; + } + (void) fclose(cfp); + cfp = NULL; + if (fflush(mfp) != 0) + goto error; + if (fsync(fileno(mfp)) != 0) + goto error; + if (fclose(mfp) != 0) { + mfp = NULL; + goto error; + } + /* delete the tempfile */ + (void) unlink(tname); + } + /* success */ + rval = 0; + goto out; + + /* tempfile error */ +error: + rval = (in_miniroot) ? mdsyserror(ep, errno, tname): + mdsyserror(ep, errno, META_DBCONFTMP); + + + /* cleanup, return success */ +out: + if (rlp != NULL) + metafreereplicalist(rlp); + if ((cfp != NULL) && (fclose(cfp) != 0) && (rval == 0)) { + rval = (in_miniroot) ? mdsyserror(ep, errno, tname): + mdsyserror(ep, errno, META_DBCONFTMP); + } + free(tname); + return (rval); +} + +/* + * check replica for dev + */ +static int +in_replica( + mdsetname_t *sp, + md_replica_t *rp, + mdname_t *np, + diskaddr_t slblk, + diskaddr_t nblks, + md_error_t *ep +) +{ + mdname_t *repnp = rp->r_namep; + diskaddr_t rep_sblk = rp->r_blkno; + diskaddr_t rep_nblks = rp->r_nblk; + + /* should be in the same set */ + assert(sp != NULL); + + /* if error in master block, assume whole partition */ + if ((rep_sblk == MD_DISKADDR_ERROR) || + (rep_nblks == MD_DISKADDR_ERROR)) { + rep_sblk = 0; + rep_nblks = MD_DISKADDR_ERROR; + } + + /* check overlap */ + if (meta_check_overlap( + MDB_STR, np, slblk, nblks, repnp, rep_sblk, rep_nblks, ep) != 0) { + return (-1); + } + + /* return success */ + return (0); +} + +/* + * check to see if we're in a replica + */ +int +meta_check_inreplica( + mdsetname_t *sp, + mdname_t *np, + diskaddr_t slblk, + diskaddr_t nblks, + md_error_t *ep +) +{ + md_replicalist_t *rlp = NULL; + md_replicalist_t *rl; + int rval = 0; + + /* should have a set */ + 
assert(sp != NULL); + + /* for each replica */ + if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) + return (-1); + for (rl = rlp; (rl != NULL); rl = rl->rl_next) { + md_replica_t *rp = rl->rl_repp; + + /* check replica */ + if (in_replica(sp, rp, np, slblk, nblks, ep) != 0) { + rval = -1; + break; + } + } + + /* cleanup, return success */ + metafreereplicalist(rlp); + return (rval); +} + +/* + * check replica + */ +int +meta_check_replica( + mdsetname_t *sp, /* set to check against */ + mdname_t *np, /* component to check against */ + mdchkopts_t options, /* option flags */ + diskaddr_t slblk, /* start logical block */ + diskaddr_t nblks, /* number of blocks (-1,rest of them) */ + md_error_t *ep /* error packet */ +) +{ + mdchkopts_t chkoptions = MDCHK_ALLOW_REPSLICE; + + /* make sure we have a disk */ + if (metachkcomp(np, ep) != 0) + return (-1); + + /* check to ensure that it is not already in use */ + if (meta_check_inuse(sp, np, MDCHK_INUSE, ep) != 0) { + return (-1); + } + + if (options & MDCHK_ALLOW_NODBS) + return (0); + + if (options & MDCHK_DRVINSET) + return (0); + + /* make sure it is in the set */ + if (meta_check_inset(sp, np, ep) != 0) + return (-1); + + /* make sure its not in a metadevice */ + if (meta_check_inmeta(sp, np, chkoptions, slblk, nblks, ep) != 0) + return (-1); + + /* return success */ + return (0); +} + +static int +update_dbinfo_on_drives( + mdsetname_t *sp, + md_drive_desc *dd, + int set_locked, + int force, + md_error_t *ep +) +{ + md_set_desc *sd; + int i; + md_setkey_t *cl_sk; + int rval = 0; + md_mnnode_desc *nd; + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + if (! 
set_locked) { + if (MD_MNSET_DESC(sd)) { + md_error_t xep = mdnullerror; + sigset_t sigs; + /* Make sure we are blocking all signals */ + if (procsigs(TRUE, &sigs, &xep) < 0) + mdclrerror(&xep); + + nd = sd->sd_nodelist; + while (nd) { + if (force && strcmp(nd->nd_nodename, + mynode()) != 0) { + nd = nd->nd_next; + continue; + } + + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + + if (clnt_lock_set(nd->nd_nodename, sp, ep)) + return (-1); + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (force && strcmp(sd->sd_nodes[i], + mynode()) != 0) + continue; + + if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) + return (-1); + } + } + } + + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + if (force && strcmp(nd->nd_nodename, mynode()) != 0) { + nd = nd->nd_next; + continue; + } + + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + + if (clnt_upd_dr_dbinfo(nd->nd_nodename, sp, dd, ep) + == -1) { + rval = -1; + break; + } + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (force && strcmp(sd->sd_nodes[i], mynode()) != 0) + continue; + + if (clnt_upd_dr_dbinfo(sd->sd_nodes[i], sp, dd, ep) + == -1) { + rval = -1; + break; + } + } + } + + if (! 
set_locked) { + cl_sk = cl_get_setkey(sp->setno, sp->setname); + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + if (force && + strcmp(nd->nd_nodename, mynode()) != 0) { + nd = nd->nd_next; + continue; + } + + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + + if (clnt_unlock_set(nd->nd_nodename, cl_sk, + ep)) { + rval = -1; + break; + } + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (force && + strcmp(sd->sd_nodes[i], mynode()) != 0) + continue; + + if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, + ep)) { + rval = -1; + break; + } + } + + } + cl_set_setkey(NULL); + } + + return (rval); +} + +int +meta_db_addsidenms( + mdsetname_t *sp, + mdname_t *np, + daddr_t blkno, + int bcast, + md_error_t *ep +) +{ + side_t sideno; + char *bname = NULL; + char *dname = NULL; + minor_t mnum; + mddb_config_t c; + int done; + int rval = 0; + md_set_desc *sd; + + sideno = MD_SIDEWILD; + /*CONSTCOND*/ + while (1) { + if (bname != NULL) { + Free(bname); + bname = NULL; + } + if (dname != NULL) { + Free(dname); + dname = NULL; + } + if ((done = meta_getnextside_devinfo(sp, np->bname, + &sideno, &bname, &dname, &mnum, ep)) == -1) { + rval = -1; + break; + } + + if (done == 0) + break; + + if (! metaislocalset(sp)) { + if ((sd = metaget_setdesc(sp, ep)) == NULL) { + rval = -1; + break; + } + } + + /* + * Send addsidenms to all nodes using rpc.mdcommd if + * sidename is being added to MN diskset. + * + * It's ok to broadcast this call to other nodes. + * + * Note: The broadcast to other nodes isn't needed during + * the addition of the first mddbs to the set since the + * other nodes haven't been joined to the set yet. All + * nodes in a MN diskset are (implicitly) joined to the set + * on the addition of the first mddb. + */ + if ((! 
metaislocalset(sp)) && MD_MNSET_DESC(sd) && + (bcast == DB_ADDSIDENMS_BCAST)) { + md_mn_result_t *resultp = NULL; + md_mn_msg_meta_db_newside_t db_ns; + int send_rval; + + db_ns.msg_l_dev = np->dev; + db_ns.msg_sideno = sideno; + db_ns.msg_blkno = blkno; + (void) strncpy(db_ns.msg_dname, dname, + sizeof (db_ns.msg_dname)); + (void) splitname(np->bname, &db_ns.msg_splitname); + db_ns.msg_mnum = mnum; + + /* Set devid to NULL until devids are supported */ + db_ns.msg_devid[0] = NULL; + + /* + * If reconfig cycle has been started, this node is + * stuck in in the return step until this command has + * completed. If mdcommd is suspended, ask + * send_message to fail (instead of retrying) + * so that metaset can finish allowing the reconfig + * cycle to proceed. + */ + send_rval = mdmn_send_message(sp->setno, + MD_MN_MSG_META_DB_NEWSIDE, MD_MSGF_FAIL_ON_SUSPEND | + MD_MSGF_PANIC_WHEN_INCONSISTENT, (char *)&db_ns, + sizeof (md_mn_msg_meta_db_newside_t), + &resultp, ep); + if (send_rval != 0) { + rval = -1; + if (resultp == NULL) + (void) mddserror(ep, + MDE_DS_COMMD_SEND_FAIL, + sp->setno, NULL, NULL, + sp->setname); + else { + (void) mdstealerror(ep, + &(resultp->mmr_ep)); + if (mdisok(ep)) { + (void) mddserror(ep, + MDE_DS_COMMD_SEND_FAIL, + sp->setno, NULL, NULL, + sp->setname); + } + free_result(resultp); + } + break; + } + if (resultp) + free_result(resultp); + } else { + /* + * Let this side's device name, minor # and driver name + * be known to the database replica. 
+ */ + (void) memset(&c, 0, sizeof (c)); + + /* Fill in device/replica info */ + c.c_locator.l_dev = meta_cmpldev(np->dev); + c.c_locator.l_blkno = blkno; + (void) strncpy(c.c_locator.l_driver, dname, + sizeof (c.c_locator.l_driver)); + (void) splitname(bname, &c.c_devname); + c.c_locator.l_mnum = mnum; + + /* Fill in setno, setname, and sideno */ + c.c_setno = sp->setno; + (void) strncpy(c.c_setname, sp->setname, + sizeof (c.c_setname)); + c.c_sideno = sideno; + + /* + * Don't need device id information from this ioctl + * Kernel determines device id from dev_t, which + * is just what this code would do. + */ + c.c_locator.l_devid = (uint64_t)0; + c.c_locator.l_devid_flags = 0; + + if (metaioctl(MD_DB_NEWSIDE, &c, &c.c_mde, NULL) != 0) { + rval = mdstealerror(ep, &c.c_mde); + break; + } + } + } + + /* cleanup, return success */ + if (bname != NULL) { + Free(bname); + bname = NULL; + } + if (dname != NULL) { + Free(dname); + dname = NULL; + } + return (rval); +} + + +int +meta_db_delsidenm( + mdsetname_t *sp, + side_t sideno, + mdname_t *np, + daddr_t blkno, + md_error_t *ep +) +{ + mddb_config_t c; + md_set_desc *sd; + + if (! metaislocalset(sp)) { + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + } + /* Use rpc.mdcommd to delete mddb side from all nodes */ + if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && + (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { + md_mn_result_t *resultp = NULL; + md_mn_msg_meta_db_delside_t db_ds; + int send_rval; + + db_ds.msg_l_dev = np->dev; + db_ds.msg_blkno = blkno; + db_ds.msg_sideno = sideno; + + /* Set devid to NULL until devids are supported */ + db_ds.msg_devid[0] = NULL; + + /* + * If reconfig cycle has been started, this node is + * stuck in in the return step until this command has + * completed. If mdcommd is suspended, ask + * send_message to fail (instead of retrying) + * so that metaset can finish allowing the reconfig + * cycle to proceed. 
+ */ + send_rval = mdmn_send_message(sp->setno, + MD_MN_MSG_META_DB_DELSIDE, MD_MSGF_FAIL_ON_SUSPEND | + MD_MSGF_PANIC_WHEN_INCONSISTENT, (char *)&db_ds, + sizeof (md_mn_msg_meta_db_delside_t), &resultp, ep); + if (send_rval != 0) { + if (resultp == NULL) + (void) mddserror(ep, + MDE_DS_COMMD_SEND_FAIL, + sp->setno, NULL, NULL, + sp->setname); + else { + (void) mdstealerror(ep, &(resultp->mmr_ep)); + if (mdisok(ep)) { + (void) mddserror(ep, + MDE_DS_COMMD_SEND_FAIL, + sp->setno, NULL, NULL, + sp->setname); + } + free_result(resultp); + } + return (-1); + } + if (resultp) + free_result(resultp); + + } else { + /* + * Let this side's device name, minor # and driver name + * be known to the database replica. + */ + (void) memset(&c, 0, sizeof (c)); + + /* Fill in device/replica info */ + c.c_locator.l_dev = meta_cmpldev(np->dev); + c.c_locator.l_blkno = blkno; + + /* Fill in setno, setname, and sideno */ + c.c_setno = sp->setno; + (void) strcpy(c.c_setname, sp->setname); + c.c_sideno = sideno; + + /* + * Don't need device id information from this ioctl + * Kernel determines device id from dev_t, which + * is just what this code would do. 
+ */ + c.c_locator.l_devid = (uint64_t)0; + c.c_locator.l_devid_flags = 0; + + if (metaioctl(MD_DB_DELSIDE, &c, &c.c_mde, NULL) != 0) + return (mdstealerror(ep, &c.c_mde)); + } + return (0); +} + + +static int +mdnamesareunique(mdnamelist_t *nlp, md_error_t *ep) +{ + mdnamelist_t *dnp1, *dnp2; + + for (dnp1 = nlp; dnp1 != NULL; dnp1 = dnp1->next) { + for (dnp2 = dnp1->next; dnp2 != NULL; dnp2 = dnp2->next) { + if (strcmp(dnp1->namep->cname, dnp2->namep->cname) == 0) + return (mderror(ep, MDE_DUPDRIVE, + dnp1->namep->cname)); + } + } + return (0); +} + + +/* + * Return 1 if files are different, else return 0 + */ +static int +filediff(char *tsname, char *sname) +{ + int ret = 1, fd; + size_t tsz, sz; + struct stat sbuf; + char *tbuf, *buf; + + if (stat(tsname, &sbuf) != 0) + return (1); + tsz = sbuf.st_size; + if (stat(sname, &sbuf) != 0) + return (1); + sz = sbuf.st_size; + if (tsz != sz) + return (1); + + /* allocate memory and read both files into buffer */ + tbuf = malloc(tsz); + buf = malloc(sz); + if (tbuf == NULL || buf == NULL) + goto out; + + fd = open(tsname, O_RDONLY); + if (fd == -1) + goto out; + sz = read(fd, tbuf, tsz); + (void) close(fd); + if (sz != tsz) + goto out; + + fd = open(sname, O_RDONLY); + if (fd == -1) + goto out; + sz = read(fd, buf, tsz); + (void) close(fd); + if (sz != tsz) + goto out; + + /* compare content */ + ret = bcmp(tbuf, buf, tsz); +out: + if (tbuf) + free(tbuf); + if (buf) + free(buf); + return (ret); +} + +/* + * patch md.conf file with mddb locations + */ +int +meta_db_patch( + char *sname, /* system file name */ + char *cname, /* mddb.cf file name */ + int patch, /* patching locally */ + md_error_t *ep +) +{ + char *tsname = NULL; + char line[MDDB_BOOTLIST_MAX_LEN]; + FILE *tsfp = NULL; + FILE *mfp = NULL; + int rval = -1; + + /* check names */ + if (sname == NULL) { + if (patch) + sname = "md.conf"; + else + sname = "/kernel/drv/md.conf"; + } + if (cname == NULL) + cname = META_DBCONF; + + /* + * edit file + */ + if 
(meta_systemfile_copy(sname, 0, 1, 1, 0, &tsname, &tsfp, ep) != 0) { + if (mdissyserror(ep, EROFS)) { + /* + * If we are booted on a read-only root because + * of mddb quorum problems we don't want to emit + * any scary error messages. + */ + mdclrerror(ep); + rval = 0; + } + goto out; + } + + if (meta_systemfile_append_mddb(cname, sname, tsname, tsfp, 1, 0, + ep) != 0) + goto out; + + /* if file content is identical, skip rename */ + if (filediff(tsname, sname) == 0) { + rval = 0; + goto out; + } + + if ((fflush(tsfp) != 0) || (fsync(fileno(tsfp)) != 0) || + (fclose(tsfp) != 0)) { + (void) mdsyserror(ep, errno, tsname); + goto out; + } + + tsfp = NULL; + + /* + * rename file. If we get a Cross Device error then it + * is because we are in the miniroot. + */ + if (rename(tsname, sname) != 0 && errno != EXDEV) { + (void) mdsyserror(ep, errno, sname); + goto out; + } + + if (errno == EXDEV) { + if ((tsfp = fopen(tsname, "r")) == NULL) + goto out; + if ((mfp = fopen(sname, "w+")) == NULL) + goto out; + while (fgets(line, sizeof (line), tsfp) != NULL) { + if (fputs(line, mfp) == NULL) + goto out; + } + (void) fclose(tsfp); + tsfp = NULL; + if (fflush(mfp) != 0) + goto out; + if (fsync(fileno(mfp)) != 0) + goto out; + if (fclose(mfp) != 0) { + mfp = NULL; + goto out; + } + } + + Free(tsname); + tsname = NULL; + rval = 0; + + /* cleanup, return error */ +out: + if (tsfp != NULL) + (void) fclose(tsfp); + if (tsname != NULL) { + (void) unlink(tsname); + Free(tsname); + } + return (rval); +} + +/* + * Add replicas to set. This happens as a result of: + * - metadb [-s set_name] -a + * - metaset -s set_name -a disk + * - metaset -s set_name -d disk (causes a rebalance of mddbs) + * - metaset -s set_name -b + * + * For a local set, this routine is run on the local set host. + * + * For a traditional diskset, this routine is run on the node that + * is running the metaset command. 
+ * + * For a multinode diskset, this routine is run by the node that is + * running the metaset command. If this is the first mddb added to + * the MN diskset, then no communication is made to other nodes via commd + * since the other nodes will be in-sync with respect to the mddbs when + * those other nodes join the set and snarf in the newly created mddb. + * If this is not the first mddb added to the MN diskset, then this + * attach command is sent to all of the nodes using commd. This keeps + * the nodes in-sync. + */ +int +meta_db_attach( + mdsetname_t *sp, + mdnamelist_t *db_nlp, + mdchkopts_t options, + md_timeval32_t *timeval, + int dbcnt, + int dbsize, + char *sysfilename, + md_error_t *ep +) +{ + struct mddb_config c; + mdnamelist_t *nlp; + mdname_t *np; + md_drive_desc *dd = NULL; + md_drive_desc *p; + int i; + int fd; + side_t sideno; + daddr_t blkno; + int replicacount = 0; + int start_mdmonitord = 0; + int rval = 0; + md_error_t status = mdnullerror; + md_set_desc *sd; + int stale_bool = FALSE; + int flags; + int firstmddb = 1; + md_timeval32_t inittime = {0, 0}; + + /* + * Error if we don't get some work to do. + */ + if (db_nlp == NULL) + return (mdsyserror(ep, EINVAL, NULL)); + + if (mdnamesareunique(db_nlp, ep) != 0) + return (-1); + (void) memset(&c, 0, sizeof (c)); + c.c_id = 0; + c.c_setno = sp->setno; + + /* Don't need device id information from this ioctl */ + c.c_locator.l_devid = (uint64_t)0; + c.c_locator.l_devid_flags = 0; + if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { + if (metaislocalset(sp)) { + if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) + mdclrerror(&c.c_mde); + else if (! mdismddberror(&c.c_mde, MDE_DB_NODB) || + (! (options & MDCHK_ALLOW_NODBS))) + return (mdstealerror(ep, &c.c_mde)); + } else { + if (! mdismddberror(&c.c_mde, MDE_DB_NOTOWNER)) + return (mdstealerror(ep, &c.c_mde)); + } + mdclrerror(&c.c_mde); + } + /* + * Is current set STALE? 
+ */ + if (c.c_flags & MDDB_C_STALE) { + stale_bool = TRUE; + } + + assert(db_nlp != NULL); + + /* if creating the metadbs for the first time start mdmonitord */ + if (c.c_dbcnt == 0) + start_mdmonitord = 1; + + /* + * check to see if we will go over the total possible number + * of data bases + */ + nlp = db_nlp; + while (nlp) { + replicacount += dbcnt; + nlp = nlp->next; + } + + if ((replicacount + c.c_dbcnt) > c.c_dbmax) + return (mdmddberror(ep, MDE_TOOMANY_REPLICAS, NODEV32, + sp->setno, c.c_dbcnt + replicacount, NULL)); + + /* + * go through and check to make sure all locations specified + * are legal also pick out driver name; + */ + for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { + diskaddr_t devsize; + + np = nlp->namep; + + if (! metaislocalset(sp)) { + uint_t partno; + uint_t rep_partno; + mddrivename_t *dnp = np->drivenamep; + + /* + * make sure that non-local database replicas + * are always on the replica slice. + */ + if (meta_replicaslice(dnp, + &rep_partno, ep) != 0) + return (-1); + if (metagetvtoc(np, FALSE, &partno, ep) == NULL) + return (-1); + if (partno != rep_partno) + return (mddeverror(ep, MDE_REPCOMP_ONLY, + np->dev, sp->setname)); + } + + if (meta_check_replica(sp, np, options, 0, (dbcnt * dbsize), + ep)) { + return (-1); + } + + if ((devsize = metagetsize(np, ep)) == -1) + return (-1); + + if (devsize < (diskaddr_t)((dbcnt * dbsize) + 16)) + return (mdmddberror(ep, MDE_REPLICA_TOOSMALL, + meta_getminor(np->dev), sp->setno, devsize, + np->cname)); + } + + /* + * If first disk in set we don't have lb_inittime yet for use as + * mb_setcreatetime so don't go looking for it. WE'll come back + * later and update after the locator block has been created. + * If this isn't the first disk in the set, we have a locator + * block and thus we have lb_inittime. Set mb_setcreatetime to + * lb_inittime. + */ + if (! 
metaislocalset(sp)) { + if (c.c_dbcnt != 0) { + firstmddb = 0; + inittime = meta_get_lb_inittime(sp, ep); + } + } + + /* + * go through and write all master blocks + */ + + for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { + np = nlp->namep; + + if ((fd = open(np->rname, O_RDWR)) < 0) + return (mdsyserror(ep, errno, np->rname)); + + for (i = 0; i < dbcnt; i++) { + if (mkmasterblks(sp, np, fd, (i * dbsize + 16), dbsize, + inittime, ep)) { + (void) close(fd); + return (-1); + } + } + (void) close(fd); + } + + if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD) + return (-1); + + if (! metaislocalset(sp)) { + dd = metaget_drivedesc_fromnamelist(sp, db_nlp, ep); + if (! mdisok(ep)) + return (-1); + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + } + + /* + * go through and tell kernel to add them + */ + for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { + mdcinfo_t *cinfo; + + np = nlp->namep; + + if ((cinfo = metagetcinfo(np, ep)) == NULL) { + rval = -1; + goto out; + } + + /* + * If mddb is being added to MN diskset and there already + * exists a valid mddb in the set (which equates to this + * node being an owner of the set) then use rpc.mdcommd + * mechanism to add mddb(s) so that all nodes stay in sync. + * If set is stale, don't log the message since rpc.mdcommd + * can't write the message to the mddb. + * + * Otherwise, just add mddb to this node. + */ + if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && + (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { + md_mn_result_t *resultp = NULL; + md_mn_msg_meta_db_attach_t attach; + int send_rval; + + /* + * In a scenario where new replicas had been added on + * the master, and then all of the old replicas failed + * before the slaves had knowledge of the new replicas, + * the slaves are unable to re-parse in the mddb + * from the new replicas since the slaves have no + * knowledge of the new replicas. 
The following + * algorithm solves this problem: + * - META_DB_ATTACH message generates submsgs + * - BLOCK parse (master) + * - MDDB_ATTACH new replicas + * - UNBLOCK parse (master) causing parse + * information to be sent from master + * to slaves at a higher class than the + * unblock so the parse message will + * reach slaves before unblock message. + */ + attach.msg_l_dev = np->dev; + attach.msg_cnt = dbcnt; + attach.msg_dbsize = dbsize; + (void) strncpy(attach.msg_dname, cinfo->dname, + sizeof (attach.msg_dname)); + (void) splitname(np->bname, &attach.msg_splitname); + attach.msg_options = options; + + /* Set devid to NULL until devids are supported */ + attach.msg_devid[0] = NULL; + + /* + * If reconfig cycle has been started, this node is + * stuck in in the return step until this command has + * completed. If mdcommd is suspended, ask + * send_message to fail (instead of retrying) + * so that metaset can finish allowing the reconfig + * cycle to proceed. + */ + flags = MD_MSGF_FAIL_ON_SUSPEND; + if (stale_bool == TRUE) + flags |= MD_MSGF_NO_LOG; + send_rval = mdmn_send_message(sp->setno, + MD_MN_MSG_META_DB_ATTACH, + flags, (char *)&attach, + sizeof (md_mn_msg_meta_db_attach_t), + &resultp, ep); + if (send_rval != 0) { + rval = -1; + if (resultp == NULL) + (void) mddserror(ep, + MDE_DS_COMMD_SEND_FAIL, + sp->setno, NULL, NULL, + sp->setname); + else { + (void) mdstealerror(ep, + &(resultp->mmr_ep)); + if (mdisok(ep)) { + (void) mddserror(ep, + MDE_DS_COMMD_SEND_FAIL, + sp->setno, NULL, NULL, + sp->setname); + } + free_result(resultp); + } + goto out; + } + if (resultp) + free_result(resultp); + } else { + /* Adding mddb(s) to just this node */ + for (i = 0; i < dbcnt; i++) { + (void) memset(&c, 0, sizeof (c)); + /* Fill in device/replica info */ + c.c_locator.l_dev = meta_cmpldev(np->dev); + c.c_locator.l_blkno = i * dbsize + 16; + blkno = c.c_locator.l_blkno; + (void) strncpy(c.c_locator.l_driver, cinfo->dname, + sizeof (c.c_locator.l_driver)); + (void) 
splitname(np->bname, &c.c_devname); + c.c_locator.l_mnum = meta_getminor(np->dev); + + /* Fill in setno, setname, and sideno */ + c.c_setno = sp->setno; + if (! metaislocalset(sp)) { + if (MD_MNSET_DESC(sd)) { + c.c_multi_node = 1; + } + } + (void) strcpy(c.c_setname, sp->setname); + c.c_sideno = sideno; + + /* + * Don't need device id information from this ioctl + * Kernel determines device id from dev_t, which + * is just what this code would do. + */ + c.c_locator.l_devid = (uint64_t)0; + c.c_locator.l_devid_flags = 0; + + if (timeval != NULL) + c.c_timestamp = *timeval; + + if (setup_med_cfg(sp, &c, (options & MDCHK_SET_FORCE), + ep)) { + rval = -1; + goto out; + } + + if (metaioctl(MD_DB_NEWDEV, &c, &c.c_mde, NULL) != 0) { + rval = mdstealerror(ep, &c.c_mde); + goto out; + } + /* + * This is either a traditional diskset OR this + * is the first replica added to a MN diskset. + * In either case, set broadcast to NO_BCAST so + * that message won't go through rpc.mdcommd. + * If this is a traditional diskset, the bcast + * flag is ignored since traditional disksets + * don't use the rpc.mdcommd. + */ + if (meta_db_addsidenms(sp, np, blkno, + DB_ADDSIDENMS_NO_BCAST, ep)) + goto out; + } + } + if (! metaislocalset(sp)) { + /* update the dbcnt and size in dd */ + for (p = dd; p != NULL; p = p->dd_next) + if (p->dd_dnp == np->drivenamep) { + p->dd_dbcnt = dbcnt; + p->dd_dbsize = dbsize; + break; + } + } + + /* + * If this was the first addition of disks to the + * diskset you now need to update the mb_setcreatetime + * which needed lb_inittime which wasn't there until now. + */ + if (firstmddb) { + if (meta_update_mb(sp, dd, ep) != 0) { + return (-1); + } + } + (void) close(fd); + } + +out: + if (metaislocalset(sp)) { + + /* everything looks fine. 
Start mdmonitord */ + /* Note: popen/pclose is the MT-safe replacement for system */ + if (rval == 0 && start_mdmonitord == 1) { + if (pclose(popen(MDMONITORD, "w")) == -1) + md_perror(MDMONITORD); + + if (meta_smf_enable(META_SMF_CORE, &status) == -1) { + mde_perror(&status, ""); + mdclrerror(&status); + } + } + + if (buildconf(sp, &status)) { + /* Don't mask any previous errors */ + if (rval == 0) + rval = mdstealerror(ep, &status); + return (rval); + } + + if (meta_db_patch(sysfilename, NULL, 0, &status)) { + /* Don't mask any previous errors */ + if (rval == 0) + rval = mdstealerror(ep, &status); + } + } else { + if (update_dbinfo_on_drives(sp, dd, + (options & MDCHK_SET_LOCKED), + (options & MDCHK_SET_FORCE), + &status)) { + /* Don't mask any previous errors */ + if (rval == 0) + rval = mdstealerror(ep, &status); + else + mdclrerror(&status); + } + metafreedrivedesc(&dd); + } + /* + * For MN disksets that already had already had nodes joined + * before the attach of this mddb(s), the name invalidation is + * done by the commd handler routine. Otherwise, if this + * is the first attach of a MN diskset mddb, the invalidation + * must be done here since the first attach cannot be sent + * via the commd since there are no nodes joined to the set yet. + */ + if ((metaislocalset(sp)) || (!MD_MNSET_DESC(sd)) || + (MD_MNSET_DESC(sd) && + (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)))) { + for (nlp = db_nlp; (nlp != NULL); nlp = nlp->next) { + meta_invalidate_name(nlp->namep); + } + } + return (rval); +} + +/* + * deletelist_length + * + * return the number of slices that have been specified for deletion + * on the metadb command line. This does not calculate the number + * of replicas because there may be multiple replicas per slice. 
+ */ +static int +deletelist_length(mdnamelist_t *db_nlp) +{ + + mdnamelist_t *nlp; + int list_length = 0; + + for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { + list_length++; + } + + return (list_length); +} + +static int +in_deletelist(char *devname, mdnamelist_t *db_nlp) +{ + + mdnamelist_t *nlp; + mdname_t *np; + int index = 0; + + for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { + np = nlp->namep; + + if (strcmp(devname, np->bname) == 0) + return (index); + index++; + } + + return (-1); +} + +/* + * Delete replicas from set. This happens as a result of: + * - metadb [-s set_name] -d + * - metaset -s set_name -a disk (causes a rebalance of mddbs) + * - metaset -s set_name -d disk + * - metaset -s set_name -b + * + * For a local set, this routine is run on the local set host. + * + * For a traditional diskset, this routine is run on the node that + * is running the metaset command. + * + * For a multinode diskset, this routine is run by the node that is + * running the metaset command. This detach routine is sent to all + * of the joined nodes in the diskset using commd. This keeps + * the nodes in-sync. + */ +int +meta_db_detach( + mdsetname_t *sp, + mdnamelist_t *db_nlp, + mdforceopts_t force_option, + char *sysfilename, + md_error_t *ep +) +{ + struct mddb_config c; + mdnamelist_t *nlp; + mdname_t *np; + md_drive_desc *dd = NULL; + md_drive_desc *p; + int replicacount; + int replica_delete_count; + int nr_replica_slices; + int i; + int stop_svmdaemons = 0; + int rval = 0; + int index; + int valid_replicas_nottodelete = 0; + int invalid_replicas_nottodelete = 0; + int invalid_replicas_todelete = 0; + int errored = 0; + int *tag_array; + int fd = -1; + md_error_t status = mdnullerror; + md_set_desc *sd; + int stale_bool = FALSE; + int flags; + + /* + * Error if we don't get some work to do. 
+ */ + if (db_nlp == NULL) + return (mdsyserror(ep, EINVAL, NULL)); + + if (mdnamesareunique(db_nlp, ep) != 0) + return (-1); + + (void) memset(&c, 0, sizeof (c)); + c.c_id = 0; + c.c_setno = sp->setno; + + /* Don't need device id information from this ioctl */ + c.c_locator.l_devid = (uint64_t)0; + c.c_locator.l_devid_flags = 0; + + if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) + return (mdstealerror(ep, &c.c_mde)); + + /* + * Is current set STALE? + */ + if (c.c_flags & MDDB_C_STALE) { + stale_bool = TRUE; + } + + replicacount = c.c_dbcnt; + + assert(db_nlp != NULL); + + /* + * go through and gather how many data bases are on each + * device specified. + */ + + nr_replica_slices = deletelist_length(db_nlp); + tag_array = (int *)calloc(nr_replica_slices, sizeof (int)); + + replica_delete_count = 0; + for (i = 0; i < replicacount; i++) { + char *devname; + int found = 0; + + c.c_id = i; + + /* Don't need device id information from this ioctl */ + c.c_locator.l_devid = (uint64_t)0; + c.c_locator.l_devid_flags = 0; + + if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) + return (mdstealerror(ep, &c.c_mde)); + + devname = splicename(&c.c_devname); + + if ((index = in_deletelist(devname, db_nlp)) != -1) { + found = 1; + tag_array[index] = 1; + replica_delete_count++; + } + + errored = c.c_locator.l_flags & (MDDB_F_EREAD | + MDDB_F_EWRITE | MDDB_F_TOOSMALL | + MDDB_F_EFMT | MDDB_F_EDATA | + MDDB_F_EMASTER); + + /* + * There are four combinations of "errored" and "found" + * and they are used to find the number of + * (a) valid/invalid replicas that are not in the delete + * list and are available in the system. + * (b) valid/invalid replicas that are to be deleted. + */ + + if (errored && !found) /* errored and !found */ + invalid_replicas_nottodelete++; + else if (!found) /* !errored and !found */ + valid_replicas_nottodelete++; + else if (errored) /* errored and found */ + invalid_replicas_todelete++; + /* + * else it is !errored and found. 
This means + * valid_replicas_todelete++; But this variable will not + * be used anywhere + */ + + Free(devname); + } + + index = 0; + for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { + np = nlp->namep; + if (tag_array[index++] != 1) { + Free(tag_array); + return (mddeverror(ep, MDE_NO_DB, np->dev, np->cname)); + } + } + + Free(tag_array); + + + /* if all replicas are deleted stop mdmonitord */ + if ((replicacount - replica_delete_count) == 0) + stop_svmdaemons = 1; + + if (((replicacount - replica_delete_count) < MD_MINREPLICAS)) { + if (force_option & MDFORCE_NONE) + return (mderror(ep, MDE_NOTENOUGH_DB, sp->setname)); + if (! metaislocalset(sp) && ! (force_option & MDFORCE_DS)) + return (mderror(ep, MDE_DELDB_NOTALLOWED, sp->setname)); + } + + /* + * The following algorithms are followed to check for deletion: + * (a) If the delete list(db_nlp) has all invalid replicas and no valid + * replicas, then deletion should be allowed. + * (b) Deletion should be allowed only if valid replicas that are "not" + * to be deleted is always greater than the invalid replicas that + * are "not" to be deleted. + * (c) If the user uses -f option, then deletion should be allowed. + */ + + if ((invalid_replicas_todelete != replica_delete_count) && + (invalid_replicas_nottodelete > valid_replicas_nottodelete) && + (force_option != MDFORCE_LOCAL)) + return (mderror(ep, MDE_DEL_VALIDDB_NOTALLOWED, sp->setname)); + + /* + * go through and tell kernel to delete them + */ + + /* Don't need device id information from this ioctl */ + c.c_locator.l_devid = (uint64_t)0; + c.c_locator.l_devid_flags = 0; + + if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) + return (mdstealerror(ep, &c.c_mde)); + + if (! metaislocalset(sp)) { + dd = metaget_drivedesc_fromnamelist(sp, db_nlp, ep); + if (! 
mdisok(ep)) + return (-1); + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + } + + for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { + np = nlp->namep; + + /* + * If mddb is being deleted from MN diskset and node is + * an owner of the diskset then use rpc.mdcommd + * mechanism to add mddb(s) so that all nodes stay in sync. + * If set is stale, don't log the message since rpc.mdcommd + * can't write the message to the mddb. + * + * When mddbs are first being added to set, a detach can + * be called before any node has joined the diskset, so + * must check to see if node is an owner of the diskset. + * + * Otherwise, just delete mddb from this node. + */ + + if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && + (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { + md_mn_result_t *resultp; + md_mn_msg_meta_db_detach_t detach; + int send_rval; + + /* + * The following algorithm is used to detach replicas. + * - META_DB_DETACH message generates submsgs + * - BLOCK parse (master) + * - MDDB_DETACH replicas + * - UNBLOCK parse (master) causing parse + * information to be sent from master + * to slaves at a higher class than the + * unblock so the parse message will + * reach slaves before unblock message. + */ + (void) splitname(np->bname, &detach.msg_splitname); + + /* Set devid to NULL until devids are supported */ + detach.msg_devid[0] = NULL; + + /* + * If reconfig cycle has been started, this node is + * stuck in in the return step until this command has + * completed. If mdcommd is suspended, ask + * send_message to fail (instead of retrying) + * so that metaset can finish allowing the reconfig + * cycle to proceed. 
+ */ + flags = MD_MSGF_FAIL_ON_SUSPEND; + if (stale_bool == TRUE) + flags |= MD_MSGF_NO_LOG; + send_rval = mdmn_send_message(sp->setno, + MD_MN_MSG_META_DB_DETACH, + flags, (char *)&detach, + sizeof (md_mn_msg_meta_db_detach_t), + &resultp, ep); + if (send_rval != 0) { + rval = -1; + if (resultp == NULL) + (void) mddserror(ep, + MDE_DS_COMMD_SEND_FAIL, + sp->setno, NULL, NULL, + sp->setname); + else { + (void) mdstealerror(ep, + &(resultp->mmr_ep)); + if (mdisok(ep)) { + (void) mddserror(ep, + MDE_DS_COMMD_SEND_FAIL, + sp->setno, NULL, NULL, + sp->setname); + } + free_result(resultp); + } + goto out; + } + if (resultp) + free_result(resultp); + } else { + i = 0; + while (i < c.c_dbcnt) { + char *devname; + + c.c_id = i; + + /* Don't need devid info from this ioctl */ + c.c_locator.l_devid = (uint64_t)0; + c.c_locator.l_devid_flags = 0; + + if (metaioctl(MD_DB_GETDEV, &c, + &c.c_mde, NULL)) { + rval = mdstealerror(ep, &c.c_mde); + goto out; + } + + devname = splicename(&c.c_devname); + if (strcmp(devname, np->bname) != 0) { + Free(devname); + i++; + continue; + } + Free(devname); + + /* Don't need devid info from this ioctl */ + c.c_locator.l_devid = (uint64_t)0; + c.c_locator.l_devid_flags = 0; + + if (metaioctl(MD_DB_DELDEV, &c, + &c.c_mde, NULL) != 0) { + rval = mdstealerror(ep, &c.c_mde); + goto out; + } + + /* Not incrementing "i" intentionally */ + } + } + if (! metaislocalset(sp)) { + /* update the dbcnt and size in dd */ + for (p = dd; p != NULL; p = p->dd_next) { + if (p->dd_dnp == np->drivenamep) { + p->dd_dbcnt = 0; + p->dd_dbsize = 0; + break; + } + } + + /* + * Slam a dummy master block and make it self + * identifying + */ + if ((fd = open(np->rname, O_RDWR)) >= 0) { + meta_mkdummymaster(sp, fd, 16); + (void) close(fd); + } + } + } +out: + if (metaislocalset(sp)) { + /* + * Stop all the daemons if there are + * no more replicas so that the module can be + * unloaded. 
+ */ + if (rval == 0 && stop_svmdaemons == 1) { + char buf[MAXPATHLEN]; + int i; + + for (i = 0; i < DAEMON_COUNT; i++) { + (void) snprintf(buf, MAXPATHLEN, + "/usr/bin/pkill -%s -x %s", + svmd_kill_list[i].svmd_kill_val, + svmd_kill_list[i].svmd_name); + if (pclose(popen(buf, "w")) == -1) + md_perror(buf); + } + + if (meta_smf_disable(META_SMF_ALL, &status) == -1) { + mde_perror(&status, ""); + mdclrerror(&status); + } + } + if (buildconf(sp, &status)) { + /* Don't mask any previous errors */ + if (rval == 0) + rval = mdstealerror(ep, &status); + else + mdclrerror(&status); + return (rval); + } + + if (meta_db_patch(sysfilename, NULL, 0, &status)) { + /* Don't mask any previous errors */ + if (rval == 0) + rval = mdstealerror(ep, &status); + else + mdclrerror(&status); + } + } else { + if (update_dbinfo_on_drives(sp, dd, + (force_option & MDFORCE_SET_LOCKED), + ((force_option & MDFORCE_LOCAL) | + (force_option & MDFORCE_DS)), &status)) { + /* Don't mask any previous errors */ + if (rval == 0) + rval = mdstealerror(ep, &status); + else + mdclrerror(&status); + } + metafreedrivedesc(&dd); + } + if ((metaislocalset(sp)) || (!(MD_MNSET_DESC(sd)))) { + for (nlp = db_nlp; (nlp != NULL); nlp = nlp->next) { + meta_invalidate_name(nlp->namep); + } + } + return (rval); +} + +static md_replica_t * +metareplicaname( + mdsetname_t *sp, + int flags, + struct mddb_config *c, + md_error_t *ep +) +{ + md_replica_t *rp; + char *devname; + size_t sz; + + /* allocate replicaname */ + rp = Zalloc(sizeof (*rp)); + + /* get device name */ + devname = splicename(&c->c_devname); + if (flags & PRINT_FAST) { + if ((rp->r_namep = metaname_fast(&sp, devname, ep)) == NULL) { + Free(devname); + Free(rp); + return (NULL); + } + } else { + if ((rp->r_namep = metaname(&sp, devname, ep)) == NULL) { + Free(devname); + Free(rp); + return (NULL); + } + } + Free(devname); + + /* make sure it's OK */ + if ((! 
(flags & MD_BASICNAME_OK)) && + (metachkcomp(rp->r_namep, ep) != 0)) { + Free(rp); + return (NULL); + } + + rp->r_blkno = MD_DISKADDR_ERROR; + rp->r_nblk = MD_DISKADDR_ERROR; + rp->r_flags = c->c_locator.l_flags | MDDB_F_NODEVID; + if (c->c_locator.l_devid_flags & MDDB_DEVID_VALID) { + sz = devid_sizeof((ddi_devid_t)(c->c_locator.l_devid)); + if ((rp->r_devid = (ddi_devid_t)malloc(sz)) == + (ddi_devid_t)NULL) { + Free(rp); + return (NULL); + } + (void) memcpy((void *)rp->r_devid, + (void *)c->c_locator.l_devid, sz); + (void) strcpy(rp->r_minor_name, c->c_locator.l_minor_name); + rp->r_flags &= ~MDDB_F_NODEVID; + /* Overwrite dev derived from name with dev from devid */ + rp->r_namep->dev = meta_expldev(c->c_locator.l_dev); + } + (void) strcpy(rp->r_driver_name, c->c_locator.l_driver); + + rp->r_blkno = c->c_locator.l_blkno; + if (c->c_dbend != 0) + rp->r_nblk = c->c_dbend - c->c_locator.l_blkno + 1; + + /* return replica */ + return (rp); +} + +/* + * free replica list + */ +void +metafreereplicalist( + md_replicalist_t *rlp +) +{ + md_replicalist_t *rl = NULL; + + for (/* void */; (rlp != NULL); rlp = rl) { + rl = rlp->rl_next; + if (rlp->rl_repp->r_devid != (ddi_devid_t)0) { + free(rlp->rl_repp->r_devid); + } + Free(rlp->rl_repp); + Free(rlp); + } +} + +/* + * return list of all replicas in set + */ +int +metareplicalist( + mdsetname_t *sp, + int flags, + md_replicalist_t **rlpp, + md_error_t *ep +) +{ + md_replicalist_t **tail = rlpp; + int count = 0; + struct mddb_config c; + int i; + char *devid; + + /* for each replica */ + i = 0; + do { + md_replica_t *rp; + + /* get next replica */ + (void) memset(&c, 0, sizeof (c)); + c.c_id = i; + c.c_setno = sp->setno; + + c.c_locator.l_devid_flags = MDDB_DEVID_GETSZ; + if (metaioctl(MD_DB_ENDDEV, &c, &c.c_mde, NULL) != 0) { + if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) { + mdclrerror(&c.c_mde); + break; /* handle none at all */ + } + (void) mdstealerror(ep, &c.c_mde); + goto out; + } + + if (c.c_locator.l_devid_flags & 
MDDB_DEVID_SZ) { + if ((devid = malloc(c.c_locator.l_devid_sz)) == NULL) { + (void) mdsyserror(ep, ENOMEM, META_DBCONF); + goto out; + } + c.c_locator.l_devid = (uintptr_t)devid; + /* + * Turn on space and sz flags since 'sz' amount of + * space has been alloc'd. + */ + c.c_locator.l_devid_flags = + MDDB_DEVID_SPACE | MDDB_DEVID_SZ; + } + + if (metaioctl(MD_DB_ENDDEV, &c, &c.c_mde, NULL) != 0) { + if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) { + mdclrerror(&c.c_mde); + break; /* handle none at all */ + } + (void) mdstealerror(ep, &c.c_mde); + goto out; + } + + /* + * Paranoid check - shouldn't happen, but is left as + * a place holder for changes that will be needed after + * dynamic reconfiguration changes are added to SVM (to + * support movement of disks at any point in time). + */ + if (c.c_locator.l_devid_flags & MDDB_DEVID_NOSPACE) { + (void) fprintf(stderr, + dgettext(TEXT_DOMAIN, + "Error: Relocation Information " + "(drvnm=%s, mnum=0x%lx) \n" + "relocation information size changed - \n" + "rerun command\n"), + c.c_locator.l_driver, c.c_locator.l_mnum); + (void) mderror(ep, MDE_DEVID_TOOBIG, NULL); + goto out; + } + + if (c.c_dbcnt == 0) + break; /* handle none at all */ + + /* get info */ + if ((rp = metareplicaname(sp, flags, &c, ep)) == NULL) + goto out; + + /* append to list */ + *tail = Zalloc(sizeof (**tail)); + (*tail)->rl_repp = rp; + tail = &(*tail)->rl_next; + ++count; + + if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) { + free(devid); + c.c_locator.l_devid_flags = 0; + } + + } while (++i < c.c_dbcnt); + + if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) { + free(devid); + } + + /* return count */ + return (count); + + /* cleanup, return error */ +out: + if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) { + free(devid); + } + metafreereplicalist(*rlpp); + *rlpp = NULL; + return (-1); +} + +/* + * meta_sync_db_locations - get list of replicas from kernel and write + * out to mddb.cf and md.conf. 
'Syncs up' the replica list in + * the kernel with the replica list in the conf files. + * + */ +void +meta_sync_db_locations( + mdsetname_t *sp, + md_error_t *ep +) +{ + char *sname = 0; /* system file name */ + char *cname = 0; /* config file name */ + + if (!metaislocalset(sp)) + return; + + /* Updates backup of configuration file (aka mddb.cf) */ + if (buildconf(sp, ep) != 0) + return; + + /* Updates system configuration file (aka md.conf) */ + (void) meta_db_patch(sname, cname, 0, ep); +} + +/* + * setup_db_locations - parse the mddb.cf file and + * tells the driver which db locations to use. + */ +int +meta_setup_db_locations( + md_error_t *ep +) +{ + mddb_config_t c; + FILE *fp; + char inbuff[1024]; + char *buff; + uint_t i; + size_t sz; + int rval = 0; + char *devidp; + uint_t devid_size; + char *minor_name = NULL; + ddi_devid_t devid_decode; + int checksum; + + /* do mddb.cf file */ + (void) memset(&c, '\0', sizeof (c)); + if ((fp = fopen(META_DBCONF, "r")) == NULL) { + if (errno != ENOENT) + return (mdsyserror(ep, errno, META_DBCONF)); + } + while ((fp != NULL) && ((buff = fgets(inbuff, (sizeof (inbuff) - 1), + fp)) != NULL)) { + + /* ignore comments */ + if (*buff == '#') + continue; + + /* parse locator */ + (void) memset(&c, 0, sizeof (c)); + c.c_setno = MD_LOCAL_SET; + i = strcspn(buff, " \t"); + if (i > sizeof (c.c_locator.l_driver)) + i = sizeof (c.c_locator.l_driver); + (void) strncpy(c.c_locator.l_driver, buff, i); + buff += i; + c.c_locator.l_dev = + makedev((major_t)0, (minor_t)strtol(buff, &buff, 10)); + c.c_locator.l_blkno = (daddr_t)strtol(buff, &buff, 10); + c.c_locator.l_mnum = minor(c.c_locator.l_dev); + + /* parse out devid */ + while (isspace((int)(*buff))) + buff += 1; + i = strcspn(buff, " \t"); + if ((devidp = (char *)malloc(i+1)) == NULL) + return (mdsyserror(ep, ENOMEM, META_DBCONF)); + + (void) strncpy(devidp, buff, i); + devidp[i] = '\0'; + if (devid_str_decode(devidp, &devid_decode, + &minor_name) == -1) { + free(devidp); + 
continue; + } + + /* Conf file must have minor name associated with devid */ + if (minor_name == NULL) { + free(devidp); + devid_free(devid_decode); + continue; + } + + sz = devid_sizeof(devid_decode); + /* Copy to devid size buffer that ioctl expects */ + if ((c.c_locator.l_devid = (uintptr_t)malloc(sz)) == NULL) { + devid_free(devid_decode); + free(minor_name); + free(devidp); + return (mdsyserror(ep, ENOMEM, META_DBCONF)); + } + + (void) memcpy((void *)c.c_locator.l_devid, + (void *)devid_decode, sz); + + devid_free(devid_decode); + + if (strlen(minor_name) > MDDB_MINOR_NAME_MAX) { + free(minor_name); + free(devidp); + free((void *)c.c_locator.l_devid); + return (mdsyserror(ep, ENOMEM, META_DBCONF)); + } + (void) strcpy(c.c_locator.l_minor_name, minor_name); + free(minor_name); + c.c_locator.l_devid_flags = MDDB_DEVID_VALID | + MDDB_DEVID_SPACE | MDDB_DEVID_SZ; + c.c_locator.l_devid_sz = sz; + + devid_size = strlen(devidp); + buff += devid_size; + + checksum = strtol(buff, &buff, 10); + for (i = 0; c.c_locator.l_driver[i] != 0; i++) + checksum += c.c_locator.l_driver[i]; + for (i = 0; i < devid_size; i++) { + checksum += devidp[i]; + } + free(devidp); + + checksum += minor(c.c_locator.l_dev); + checksum += c.c_locator.l_blkno; + if (checksum != 42) { + /* overwritten later for more serious problems */ + rval = mderror(ep, MDE_MDDB_CKSUM, META_DBCONF); + free((void *)c.c_locator.l_devid); + continue; + } + c.c_locator.l_flags = 0; + + /* use db location */ + if (metaioctl(MD_DB_USEDEV, &c, &c.c_mde, NULL) != 0) { + free((void *)c.c_locator.l_devid); + return (mdstealerror(ep, &c.c_mde)); + } + + /* free up devid if in use */ + free((void *)c.c_locator.l_devid); + c.c_locator.l_devid = (uint64_t)0; + c.c_locator.l_devid_flags = 0; + } + if ((fp) && (fclose(fp) != 0)) + return (mdsyserror(ep, errno, META_DBCONF)); + + /* check for stale database */ + (void) memset((char *)&c, 0, sizeof (struct mddb_config)); + c.c_id = 0; + c.c_setno = MD_LOCAL_SET; + + /* Don't 
need device id information from this ioctl */ + c.c_locator.l_devid = (uint64_t)0; + c.c_locator.l_devid_flags = 0; + + if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { + if (! mdismddberror(&c.c_mde, MDE_DB_INVALID)) + return (mdstealerror(ep, &c.c_mde)); + mdclrerror(&c.c_mde); + } + + if (c.c_flags & MDDB_C_STALE) + return (mdmddberror(ep, MDE_DB_STALE, NODEV32, MD_LOCAL_SET, + 0, NULL)); + + /* success */ + return (rval); +} + +/* + * meta_db_minreplica - returns the minimum size replica currently in use. + */ +daddr_t +meta_db_minreplica( + mdsetname_t *sp, + md_error_t *ep +) +{ + md_replica_t *r; + md_replicalist_t *rl, *rlp = NULL; + daddr_t nblks = 0; + + if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, ep) < 0) + return (-1); + + if (rlp == NULL) + return (-1); + + /* find the smallest existing replica */ + for (rl = rlp; rl != NULL; rl = rl->rl_next) { + r = rl->rl_repp; + nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks)); + } + + metafreereplicalist(rlp); + return (nblks); +} + +/* + * meta_get_replica_names + * returns an mdnamelist_t of replica slices + */ +/*ARGSUSED*/ +int +meta_get_replica_names( + mdsetname_t *sp, + mdnamelist_t **nlpp, + int options, + md_error_t *ep +) +{ + md_replicalist_t *rlp = NULL; + md_replicalist_t *rl; + mdnamelist_t **tailpp = nlpp; + int cnt = 0; + + assert(nlpp != NULL); + + if (!metaislocalset(sp)) + goto out; + + /* get replicas */ + if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) { + cnt = -1; + goto out; + } + + /* build name list */ + for (rl = rlp; (rl != NULL); rl = rl->rl_next) { + /* + * Add the name struct to the end of the + * namelist but keep a pointer to the last + * element so that we don't incur the overhead + * of traversing the list each time + */ + tailpp = meta_namelist_append_wrapper( + tailpp, rl->rl_repp->r_namep); + ++cnt; + } + + /* cleanup, return count or error */ +out: + metafreereplicalist(rlp); + return (cnt); +} diff --git 
a/usr/src/lib/lvm/libmeta/common/meta_db_balance.c b/usr/src/lib/lvm/libmeta/common/meta_db_balance.c new file mode 100644 index 0000000000..2becd5a5a4 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_db_balance.c @@ -0,0 +1,1215 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Database location balancing code. + */ + +#include <meta.h> +#include <sys/lvm/md_mddb.h> +#include <sdssc.h> + +#define MD_MINBALREP 2 + +/* + * Stuff for DB balancing. 
+ */ +enum md_ctlr_ops_t { + DRV_NOP = 0, + DRV_ADD = 1, + DRV_DEL = 2 +}; +typedef enum md_ctlr_ops_t md_ctlr_ops_t; + +/* drive flag fields */ +#define DRV_F_ERROR 0x1 +#define DRV_F_INDISKSET 0x2 + +struct md_ctlr_drv_t { + md_ctlr_ops_t drv_op; + int drv_flags; + int drv_dbcnt; + int drv_new_dbcnt; + daddr_t drv_dbsize; + mddrivename_t *drv_dnp; + struct md_ctlr_drv_t *drv_next; +}; +typedef struct md_ctlr_drv_t md_ctlr_drv_t; + +struct md_ctlr_ctl_t { + mdcinfo_t *ctl_cinfop; + int ctl_dbcnt; + int ctl_drcnt; + md_ctlr_drv_t *ctl_drvs; + struct md_ctlr_ctl_t *ctl_next; +}; +typedef struct md_ctlr_ctl_t md_ctlr_ctl_t; + +static int +add_replica( + mdsetname_t *sp, + mddrivename_t *dnp, + int dbcnt, + daddr_t dbsize, + md_error_t *ep +) +{ + mdnamelist_t *nlp = NULL; + mdname_t *np; + md_set_desc *sd; + uint_t rep_slice; + + if (meta_replicaslice(dnp, &rep_slice, ep) != 0) + return (-1); + + if ((np = metaslicename(dnp, rep_slice, ep)) == NULL) + return (-1); + + (void) metanamelist_append(&nlp, np); + + if ((sd = metaget_setdesc(sp, ep)) == NULL) { + metafreenamelist(nlp); + return (-1); + } + + if (meta_db_attach(sp, nlp, (MDCHK_DRVINSET | MDCHK_SET_LOCKED), + (&sd->sd_ctime), dbcnt, dbsize, NULL, ep) == -1) { + metafreenamelist(nlp); + return (-1); + } + + metafreenamelist(nlp); + return (0); +} + +static int +del_replica( + mdsetname_t *sp, + mddrivename_t *dnp, + md_error_t *ep +) +{ + mdnamelist_t *nlp = NULL; + mdname_t *np; + uint_t rep_slice; + + if (meta_replicaslice(dnp, &rep_slice, ep) != 0) + return (-1); + + if ((np = metaslicename(dnp, rep_slice, ep)) == NULL) + return (-1); + + (void) metanamelist_append(&nlp, np); + + if (meta_db_detach(sp, nlp, (MDFORCE_DS | MDFORCE_SET_LOCKED), + NULL, ep) == -1) { + metafreenamelist(nlp); + return (-1); + } + + metafreenamelist(nlp); + return (0); +} + +static int +rep_has_err(md_replicalist_t *rlp, mdname_t *np) +{ + md_replicalist_t *rl; + + for (rl = rlp; rl != NULL; rl = rl->rl_next) { + md_replica_t *r = 
rl->rl_repp; + + if (strcmp(r->r_namep->cname, np->cname) != 0) + continue; + + if (r->r_flags & (MDDB_F_EREAD | MDDB_F_EFMT | MDDB_F_EDATA | + MDDB_F_EMASTER | MDDB_F_EWRITE)) + return (1); + + } + return (0); +} + +static int +add_drv_to_ctl_lst( + md_ctlr_ctl_t **clpp, + md_replicalist_t *rlp, + mddrivename_t *dnp, + int dbcnt, + daddr_t dbsize, + mdcinfo_t *cinfop, + int indiskset, + int with_bus, + int errored, + md_error_t *ep +) +{ + md_ctlr_drv_t **dpp; + mdname_t *np; + mdcinfo_t *tcinfop; + char *cmp_name_1, + *cmp_name_2; + int not_found; + + /* + * The user must pass in a list head. + */ + assert(clpp != NULL); + + if (cinfop == NULL) { + uint_t rep_slice; + + if (meta_replicaslice(dnp, &rep_slice, ep) != 0) { + /* + * A failure to get the slice information can occur + * because the drive has failed, if this is the + * case then there is nothing that can be done + * with this drive, so do not include it in the + * list of drives. Clear the error and return. + */ + mdclrerror(ep); + return (0); + } + + if ((np = metaslicename(dnp, rep_slice, ep)) == NULL) + return (-1); + + if ((tcinfop = metagetcinfo(np, ep)) == NULL) + return (-1); + + if (metagetvtoc(np, FALSE, NULL, ep) == NULL) + errored = 1; + + if (rep_has_err(rlp, np)) + errored = 1; + } else + tcinfop = cinfop; + + for (/* void */; *clpp != NULL; clpp = &(*clpp)->ctl_next) { + /* + * Try to locate ctlr. + */ + (void) sdssc_convert_cluster_path(tcinfop->cname, &cmp_name_1); + (void) sdssc_convert_cluster_path((*clpp)->ctl_cinfop->cname, + &cmp_name_2); + + if (tcinfop->ctype != (*clpp)->ctl_cinfop->ctype || + tcinfop->cnum != (*clpp)->ctl_cinfop->cnum || + strncmp(cmp_name_1, cmp_name_2, 16) != 0 || + (with_bus && tcinfop->bus != (*clpp)->ctl_cinfop->bus)) { + not_found = 1; + } else + not_found = 0; + + + sdssc_convert_path_free(cmp_name_1); + sdssc_convert_path_free(cmp_name_2); + + if (not_found) + continue; + + /* + * Found ctlr, try to locate the drive. 
+ */ + for (dpp = &(*clpp)->ctl_drvs; *dpp != NULL; + dpp = &(*dpp)->drv_next) { + (void) sdssc_convert_cluster_path( + (*dpp)->drv_dnp->cname, &cmp_name_1); + (void) sdssc_convert_cluster_path(dnp->cname, + &cmp_name_2); + + not_found = strcmp(cmp_name_1, cmp_name_2); + + sdssc_convert_path_free(cmp_name_1); + sdssc_convert_path_free(cmp_name_2); + + if (not_found) + continue; + + /* + * Found drive, must be deleting. + */ + (*dpp)->drv_op = DRV_DEL; + if (indiskset) + (*dpp)->drv_flags |= DRV_F_INDISKSET; + if (errored) { + mdclrerror(ep); + (*dpp)->drv_flags |= DRV_F_ERROR; + } + (*clpp)->ctl_dbcnt -= (*dpp)->drv_dbcnt; + (*clpp)->ctl_drcnt--; + return (0); + } + /* + * The ctlr was found, but not the drive, so add + * the drive + */ + (*dpp) = Zalloc(sizeof (**dpp)); + + + if (indiskset) { + (*dpp)->drv_op = DRV_NOP; + (*dpp)->drv_flags |= DRV_F_INDISKSET; + if (errored) { + mdclrerror(ep); + (*dpp)->drv_flags |= DRV_F_ERROR; + } + } else { + (*dpp)->drv_op = DRV_ADD; + if (errored) { + (*dpp)->drv_flags |= DRV_F_ERROR; + return (-1); + } + assert(dbsize != 0); + } + (*dpp)->drv_dbcnt = dbcnt; + (*dpp)->drv_dbsize = dbsize; + (*dpp)->drv_dnp = dnp; + (*clpp)->ctl_dbcnt += dbcnt; + (*clpp)->ctl_drcnt++; + return (0); + } + /* + * No ctlr was located, so add the ctlr, then recurse to add the + * drive to the ctlr. + */ + (*clpp) = Zalloc(sizeof (**clpp)); + + (*clpp)->ctl_cinfop = tcinfop; + + return (add_drv_to_ctl_lst(clpp, rlp, dnp, dbcnt, dbsize, tcinfop, + indiskset, with_bus, errored, ep)); +} + +static int +add_replica_to_ctl( + mdsetname_t *sp, + md_ctlr_ctl_t *c, + int minimum_replicas, + md_error_t *ep +) +{ + md_ctlr_drv_t *d; + int maxdb = 0; + + /* + * If this ctrl has no "usable" drives, assert() or just return if + * assert()'s are turned off. + */ + if (c->ctl_drcnt == 0) { + assert(0); + return (0); + } + + /* + * Determine the largest DB count on a drive. 
+ */ + for (d = c->ctl_drvs; d != NULL; d = d->drv_next) + if (d->drv_dbcnt > maxdb && d->drv_op != DRV_DEL) + maxdb = d->drv_dbcnt; + + /* + * Make sure we start at a reasonable number + */ + if (maxdb == 0) + maxdb = 1; + + /* + * Add a replica to a drive on this ctrl. + */ + /*CONSTCOND*/ + while (1) { + for (d = c->ctl_drvs; d != NULL; d = d->drv_next) { + /* + * If this drive is being deleted, skip it. + */ + if (d->drv_op == DRV_DEL) + continue; + + if (d->drv_flags & DRV_F_ERROR) + continue; + /* + * Make sure that the replicas are distributed across + * the drives. + */ + if (d->drv_dbcnt >= maxdb) + continue; + /* + * See if the drive already has replicas, + * if it does, then delete the exisiting + * replica(s) and re-add n+1 replicas to the drive. + */ + /* ==== Vulnerability - no DB's start ==== */ + if (d->drv_dbcnt > 0) { + if (del_replica(sp, d->drv_dnp, ep) == -1) { + d->drv_flags |= DRV_F_ERROR; + if (! (d->drv_flags & DRV_F_INDISKSET)) + return (-1); + mdclrerror(ep); + continue; + } + } + if (add_replica(sp, d->drv_dnp, (d->drv_dbcnt + 1), + d->drv_dbsize, ep) == -1) { + if (d->drv_dbcnt) { + c->ctl_dbcnt -= d->drv_dbcnt; + d->drv_dbcnt = 0; + } + + if (mdismddberror(ep, MDE_TOOMANY_REPLICAS)) + return (-1); + + if (mdismddberror(ep, MDE_REPLICA_TOOSMALL)) + return (-1); + + d->drv_flags |= DRV_F_ERROR; + if (! (d->drv_flags & DRV_F_INDISKSET)) + return (-1); + mdclrerror(ep); + continue; + } + + d->drv_dbcnt++; + c->ctl_dbcnt++; + /* ==== Vulnerability - no DB's end ==== */ + return (1); + } + maxdb++; + if (maxdb > minimum_replicas) + return (0); + } + /*NOTREACHED*/ +} + +static int +del_replica_from_ctl( + mdsetname_t *sp, + md_ctlr_ctl_t *c, + md_error_t *ep +) +{ + md_ctlr_drv_t *d; + int maxdb = 0; + + /* + * If this ctrl has no "usable" drives, assert() or just return if + * assert()'s are turned off. + */ + if (c->ctl_drcnt == 0) { + assert(0); + return (0); + } + + /* + * Determine the largest DB count on a drive. 
+ */ + for (d = c->ctl_drvs; d != NULL; d = d->drv_next) + if (d->drv_dbcnt > maxdb && d->drv_op != DRV_DEL) + maxdb = d->drv_dbcnt; + + if (maxdb == 0) + return (0); + + /* + * Delete a replica from a drive on this ctrl. + */ + /*CONSTCOND*/ + while (1) { + for (d = c->ctl_drvs; d != NULL; d = d->drv_next) { + /* + * If this drive is being deleted, skip it. + */ + if (d->drv_op == DRV_DEL) + continue; + + /* + * Make sure that there are replicas on this drive to + * delete. + */ + if (d->drv_dbcnt == 0) + continue; + + if (d->drv_flags & DRV_F_ERROR) + continue; + + /* + * We need to keep the DB's distributed across the + * drives. + */ + if (d->drv_dbcnt < maxdb) + continue; + + /* + * Delete all the replicas on the drive. + */ + /* ==== Vulnerability - no DB's start ==== */ + if (del_replica(sp, d->drv_dnp, ep) == -1) { + d->drv_flags |= DRV_F_ERROR; + if (! (d->drv_flags & DRV_F_INDISKSET)) + return (-1); + mdclrerror(ep); + continue; + } + d->drv_dbcnt--; + c->ctl_dbcnt--; + /* + * If there is still a dbcnt for this drive, then add + * back the needed DB's. + */ + if (d->drv_dbcnt > 0) { + if (add_replica(sp, d->drv_dnp, d->drv_dbcnt, + d->drv_dbsize, ep) == -1) { + c->ctl_dbcnt -= d->drv_dbcnt; + d->drv_dbcnt = 0; + + if (mdismddberror(ep, + MDE_TOOMANY_REPLICAS)) + return (-1); + + d->drv_flags |= DRV_F_ERROR; + if (! (d->drv_flags & DRV_F_INDISKSET)) + return (-1); + mdclrerror(ep); + continue; + } + } + /* ==== Vulnerability - no DB's end ==== */ + return (1); + } + maxdb--; + if (maxdb <= 0) + return (0); + } + /*NOTREACHED*/ +} + +static int +del_replicas(mdsetname_t *sp, md_ctlr_ctl_t *clp, md_error_t *ep) +{ + md_ctlr_ctl_t *c; + md_ctlr_drv_t *d; + mdnamelist_t *nlp; + mdname_t *np; + + for (c = clp; c != NULL; c = c->ctl_next) { + for (d = c->ctl_drvs; d != NULL; d = d->drv_next) { + uint_t rep_slice; + + if (! 
(d->drv_flags & DRV_F_ERROR) && + (d->drv_op != DRV_DEL)) + continue; + + if (d->drv_dbcnt == 0) + continue; + + if (meta_replicaslice(d->drv_dnp, + &rep_slice, ep) != 0) + return (-1); + + np = metaslicename(d->drv_dnp, rep_slice, ep); + if (np == NULL) + return (-1); + + nlp = NULL; + (void) metanamelist_append(&nlp, np); + + /* + * Delete the replicas listed. + */ + if (meta_db_detach(sp, nlp, + (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, + ep) == -1) { + metafreenamelist(nlp); + if (d->drv_flags & DRV_F_INDISKSET) { + mdclrerror(ep); + continue; + } + return (-1); + } + metafreenamelist(nlp); + } + } + + return (0); +} + +static void +free_ctlr_lst(md_ctlr_ctl_t **clpp) +{ + md_ctlr_ctl_t *c, *tc = NULL; + md_ctlr_drv_t *d, *td = NULL; + + for (c = *clpp; c != NULL; c = tc) { + tc = c->ctl_next; + for (d = c->ctl_drvs; d != NULL; d = td) { + td = d->drv_next; + Free(d); + } + Free(c); + } + *clpp = NULL; +} + +static int +build_ctlr_lst( + mdsetname_t *sp, + md_ctlr_ctl_t **clpp, + md_drive_desc *opdd, + md_drive_desc *curdd, + int with_bus, + daddr_t dbsize, + md_error_t *ep +) +{ + md_drive_desc *d; + md_set_desc *sd; + daddr_t nblks; + md_replicalist_t *rlp = NULL; + static daddr_t min_dbsize = 0; + + if (min_dbsize == 0) { + if ((nblks = meta_db_minreplica(sp, ep)) < 0) { + min_dbsize = MD_DBSIZE; + + if (! metaislocalset(sp)) { + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + if (MD_MNSET_DESC(sd)) + min_dbsize = MD_MN_DBSIZE; + } + mdclrerror(ep); + } else + min_dbsize = nblks; + } + + if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) { + if (! mdismddberror(ep, MDE_DB_NODB) && + ! mdismddberror(ep, MDE_DB_NOTOWNER)) + return (-1); + mdclrerror(ep); + } + + /* + * Add drives currently in the set to the ctlr list. 
+ */ + for (d = curdd; d != NULL; d = d->dd_next) { + daddr_t this_dbsize = d->dd_dbsize; + + if (this_dbsize == 0) + this_dbsize = min_dbsize; + + if (add_drv_to_ctl_lst(clpp, rlp, d->dd_dnp, d->dd_dbcnt, + this_dbsize, NULL, TRUE, with_bus, 0, ep) == -1) + return (-1); + } + + /* + * Add the drives that are being operated on to the ctlr list. + */ + for (d = opdd; d != NULL; d = d->dd_next) + if (add_drv_to_ctl_lst(clpp, rlp, d->dd_dnp, 0, dbsize, NULL, + FALSE, with_bus, 0, ep) == -1) + return (-1); + + metafreereplicalist(rlp); + return (0); +} + +static int +count_replica_on_ctl( + md_ctlr_ctl_t *c, + int adding, + int *db_cnt, + int minimum_replicas +) +{ + md_ctlr_drv_t *d; + int maxdb = 0; + + /* + * If this ctrl has no "usable" drives, nothing to do. + */ + if (c->ctl_drcnt == 0) + return (0); + + /* + * Determine the largest DB count on a drive. + */ + for (d = c->ctl_drvs; d != NULL; d = d->drv_next) + if (d->drv_new_dbcnt > maxdb && d->drv_op != DRV_DEL) + maxdb = d->drv_new_dbcnt; + + /* + * Make sure we start at a reasonable number + */ + if (maxdb == 0) { + if (!adding) + return (0); + maxdb = 1; + } + + /* + * Count or Un-Count replicas that would be + * added or deleted respectively. + */ + /*CONSTCOND*/ + while (1) { + for (d = c->ctl_drvs; d != NULL; d = d->drv_next) { + /* + * If this drive is being deleted, skip it. + */ + if (d->drv_op == DRV_DEL) + continue; + + /* + * If the drive is errored and adding, skip it. + */ + if (adding && (d->drv_flags & DRV_F_ERROR)) + continue; + + /* + * Make sure that the replicas are distributed across + * the drives. + */ + if (adding) { + if (d->drv_new_dbcnt >= maxdb) + continue; + } else { + if (d->drv_new_dbcnt == 0) + continue; + if (d->drv_new_dbcnt < maxdb) + continue; + } + + /* + * Count or Un-Count replicas here. 
+ */ + if (adding) { + mdpart_t *partp; + uint_t rep_slice; + md_error_t mde; + + if (meta_replicaslice(d->drv_dnp, + &rep_slice, &mde) != 0) + continue; + + partp = &d->drv_dnp->vtoc.parts[rep_slice]; + if (! partp) + continue; + + if (((d->drv_new_dbcnt + 1) * d->drv_dbsize) > + (partp->size - 16)) + continue; + (*db_cnt)++; + d->drv_new_dbcnt++; + } else { + (*db_cnt)--; + d->drv_new_dbcnt--; + } + return (0); + } + + /* + * This should make sure they get spread + * around. This is to emulate the {add,del}_replica + * routines. + */ + if (adding) { + maxdb++; + if (maxdb > minimum_replicas) + return (-1); + } else { + maxdb--; + if (maxdb <= 0) + return (-1); + } + } + /*NOTREACHED*/ +} + +static int +count_replicas( + md_ctlr_ctl_t *clp, + int min_reps +) +{ + md_ctlr_ctl_t *c; + md_ctlr_drv_t *d; + int db_cnt; + int uctlrs = 0; + int total_cnt = 0; + + /* + * Count the number of controllers, + * counting the replicas is slightly different based + * on the controller count. + */ + for (c = clp; c != NULL; c = c->ctl_next) + if (c->ctl_drcnt > 0) { + uctlrs++; + for (d = c->ctl_drvs; d != NULL; d = d->drv_next) + d->drv_new_dbcnt = d->drv_dbcnt; + } + + if (uctlrs > 2) { + for (c = clp; c != NULL; c = c->ctl_next) { + if (c->ctl_drcnt == 0) + continue; + + db_cnt = c->ctl_dbcnt; + /* + * Count the replicas that would be added. + */ + while (db_cnt < min_reps) + if (count_replica_on_ctl(c, TRUE, + &db_cnt, min_reps)) + return (-1); + + /* + * Un-Count the replicas that would be deleted. + */ + while (db_cnt > min_reps) + if (count_replica_on_ctl(c, FALSE, + &db_cnt, min_reps)) + return (-1); + total_cnt += db_cnt; + } + } else { + for (c = clp; c != NULL; c = c->ctl_next) { + if (c->ctl_drcnt == 0) + continue; + + db_cnt = c->ctl_dbcnt; + /* + * Count the replicas that woud be added. 
+ */ + while (db_cnt < (min_reps * c->ctl_drcnt)) + if (count_replica_on_ctl(c, TRUE, + &db_cnt, min_reps)) + return (-1); + + total_cnt += db_cnt; + } + } + + return (total_cnt); +} + +static int +balance_replicas( + mdsetname_t *sp, + md_ctlr_ctl_t **clpp, + md_drive_desc *opdd, + md_drive_desc *curdd, + daddr_t dbsize, + int *minimum_replicas, + md_error_t *ep +) +{ + int n; + int rctlrs = 0; + int uctlrs; + int ructlrs; + int octlrs; + int save_done; + int prevcnt = 0, issame = 1; + uint_t drvcnt = ~0U; + uint_t save_cnum; + mhd_ctlrtype_t save_ctype; + char save_cname[16], + *cmp_name_1, + *cmp_name_2; + int reps; + md_ctlr_ctl_t *c; + + /* + * Build a ctlr list with SSA-100 busses NOT as separate controllers. + */ + if (build_ctlr_lst(sp, clpp, opdd, curdd, FALSE, dbsize, ep) == -1) + return (-1); + + /* + * Determine what controllers are usable in the sense of being able to + * add a replica to a drive on the controller. + * Also find the minimum number of drives on a controller. + */ + for (c = *clpp; c != NULL; c = c->ctl_next) { + if (c->ctl_drcnt > 0) { + rctlrs++; + drvcnt = min(drvcnt, c->ctl_drcnt); + if (prevcnt == 0) + prevcnt = c->ctl_drcnt; + else if (prevcnt != c->ctl_drcnt) + issame = 0; + } + } + + if ((rctlrs <= 2) || (issame && (drvcnt >= 30))) + goto cont; + + /* + * If here: Handling 3 or more controllers most + * likely with non-symmetrical number of + * disks. The number of replicas will be + * the minimum number of disks on a controller. + * + * The main point is to insure that a + * controller does not have more than half + * of the replicas. + */ + drvcnt = min(drvcnt, 12); + drvcnt = max(drvcnt, MD_MINBALREP); + + /* + * Can we find fewer than the maximum replicas by reducing the + * number of replicas per drive. 
+ */ + for (n = drvcnt; n > 0; n--) { + reps = count_replicas(*clpp, n); + if (reps > 0 && reps <= MDDB_NLB) { + *minimum_replicas = n; + return (0); + } + } + +cont: + free_ctlr_lst(clpp); + + /* + * Build a ctlr list with SSA-100 busses as separate controllers. + * + * If Here: Try to put 2 replicas per controller/bus + * If that doesn't work put 1 replica per controller/bus + */ + if (build_ctlr_lst(sp, clpp, opdd, curdd, TRUE, dbsize, ep) == -1) + return (-1); + + /* + * If the number of "real" controllers is 2, special handling may be + * needed. + */ + if (rctlrs != 2) { + drvcnt = MD_MINBALREP; + goto other; + } + + /* + * Determine what controllers are usable in the sense of being able to + * add a replica to a drive on the controller. + * Also find the minimum number of drives on a controller. + */ + drvcnt = ~0U; + uctlrs = 0; + for (c = *clpp; c != NULL; c = c->ctl_next) { + if (c->ctl_drcnt > 0) { + uctlrs++; + drvcnt = min(drvcnt, c->ctl_drcnt); + } + } + + /* + * If the number of controllers is not changed, continue with original + * strategy. + */ + if (uctlrs == rctlrs) { + drvcnt = MD_MINBALREP; + goto other; + } + + /* + * Check the distribution of bus ctlrs across real controllers. + */ + ructlrs = 0; + octlrs = 0; + save_done = 0; + for (c = *clpp; c != NULL; c = c->ctl_next) { + if (c->ctl_drcnt == 0) + continue; + + if (! 
save_done) { + save_cnum = c->ctl_cinfop->cnum; + save_ctype = c->ctl_cinfop->ctype; + (void) strncpy(save_cname, c->ctl_cinfop->cname, 16); + save_done = 1; + } + + (void) sdssc_convert_cluster_path(c->ctl_cinfop->cname, + &cmp_name_1); + (void) sdssc_convert_cluster_path(save_cname, &cmp_name_2); + + if (save_ctype != c->ctl_cinfop->ctype || + save_cnum != c->ctl_cinfop->cnum || + strncmp(cmp_name_1, cmp_name_2, 16) != 0) + octlrs++; + else + ructlrs++; + + sdssc_convert_path_free(cmp_name_1); + sdssc_convert_path_free(cmp_name_2); + } + + /* + * Take the largest of the counts + */ + ructlrs = max(ructlrs, octlrs); + + /* + * If the distribution of bus controlers is half of the total, then + * this layout strategy will work, doit. + */ + if ((uctlrs / 2) == ructlrs) { + drvcnt = MD_MINBALREP; + goto other; + } + + /* + * If here, there is a distribution of bus controllers that will cause + * the real controller distribution to be unbalanced, so a different + * strategy is used. + */ + free_ctlr_lst(clpp); + + /* + * Build the ctlr list with SSA-100 busses NOT as separate controllers. + */ + if (build_ctlr_lst(sp, clpp, opdd, curdd, FALSE, dbsize, ep) == -1) + return (-1); + + /* + * Make ctl_drcnt limit the number of replicas + */ + for (c = *clpp; c != NULL; c = c->ctl_next) + c->ctl_drcnt = min(drvcnt, c->ctl_drcnt); + + /* + * Try at least MD_MINBALREP's per controller after changing ctl_drcnt + */ + drvcnt = MD_MINBALREP; + +other: + /* + * Can we find fewer than the maximum replicas by reducing the number + * of replicas per drive. + */ + for (n = drvcnt; n > 0; n--) { + reps = count_replicas(*clpp, n); + if (reps > 0 && reps <= MDDB_NLB) { + *minimum_replicas = n; + return (0); + } + } + + free_ctlr_lst(clpp); + + /* + * Build a ctlr list with SSA-100 busses NOT as separate controllers. 
+ * + * If Here: Try to put 2 replicas per controller (not on busses) + * If that doesn't work put 1 replica per controller + */ + if (build_ctlr_lst(sp, clpp, opdd, curdd, FALSE, dbsize, ep) == -1) + return (-1); + + /* + * Can we find fewer than the maximum replicas by reducing the + * number of replicas per drive. + */ + for (n = MD_MINBALREP; n > 0; n--) { + reps = count_replicas(*clpp, n); + if (reps > 0 && reps <= MDDB_NLB) { + *minimum_replicas = n; + return (0); + } + } + + /* + * Return a ctrl list that does not include the SSA-100 buses as + * separate controllers. This will create fewer separate controllers. + */ + *minimum_replicas = 1; + return (0); +} + +static int +morethan2_ctl_balance( + mdsetname_t *sp, + md_ctlr_ctl_t *clp, + int min_reps, + md_error_t *ep +) +{ + md_ctlr_ctl_t *c; + int err; + + for (c = clp; c != NULL; c = c->ctl_next) { + if (c->ctl_drcnt == 0) + continue; + + while (c->ctl_dbcnt < min_reps) { + if ((err = add_replica_to_ctl(sp, c, min_reps, ep)) < 0) + return (-1); + if (err == 0) + break; + } + + while (c->ctl_dbcnt > min_reps) { + if ((err = del_replica_from_ctl(sp, c, ep)) < 0) + return (-1); + if (err == 0) + break; + } + } + + return (0); +} + +static int +lessthan3_ctl_balance( + mdsetname_t *sp, + md_ctlr_ctl_t *clp, + int min_reps, + md_error_t *ep +) +{ + md_ctlr_ctl_t *c; + int err; + + for (c = clp; c != NULL; c = c->ctl_next) { + if (c->ctl_drcnt == 0) + continue; + + while (c->ctl_dbcnt < (min_reps * c->ctl_drcnt)) { + if ((err = add_replica_to_ctl(sp, c, min_reps, ep)) < 0) + return (-1); + if (err == 0) + break; + } + + while (c->ctl_dbcnt > (min_reps * c->ctl_drcnt)) { + if ((err = del_replica_from_ctl(sp, c, ep)) < 0) + return (-1); + if (err == 0) + break; + } + } + + return (0); +} + +static int +try_again( + md_ctlr_ctl_t *clp, + md_error_t *ep +) +{ + md_ctlr_ctl_t *c; + md_ctlr_drv_t *d; + + if (mdismddberror(ep, MDE_TOOMANY_REPLICAS)) + return (TRUE); + + /* + * retry if all the errored drives are 
already in the diskset. + */ + for (c = clp; c != NULL; c = c->ctl_next) { + for (d = c->ctl_drvs; d != NULL; d = d->drv_next) { + if ((d->drv_flags & (DRV_F_INDISKSET|DRV_F_ERROR)) + == DRV_F_ERROR) + return (FALSE); + } + } + return (TRUE); +} + +int +meta_db_balance( + mdsetname_t *sp, + md_drive_desc *opdd, + md_drive_desc *curdd, + daddr_t dbsize, + md_error_t *ep +) +{ + int min_reps; + md_ctlr_ctl_t *c, *cl = NULL; + int uctlrs = 0; + int retry = 0; + int rval = 0; + + if (balance_replicas(sp, &cl, opdd, curdd, dbsize, &min_reps, ep) == -1) + return (-1); + + /* + * Determine what controllers are usable in the sense of being able to + * add a replica to a drive on the controller. + */ + for (c = cl; c != NULL; c = c->ctl_next) + if (c->ctl_drcnt > 0) + uctlrs++; + + /* + * Add replicas to achieve a balance. + */ + if (uctlrs > 2) + rval = morethan2_ctl_balance(sp, cl, min_reps, ep); + else + rval = lessthan3_ctl_balance(sp, cl, min_reps, ep); + + if (rval) { + if ((retry = try_again(cl, ep)) == TRUE) { + mdclrerror(ep); + rval = 0; + } + } + + /* + * Delete all the replicas from drives that are so marked. + */ + if (! rval) + rval = del_replicas(sp, cl, ep); + + if (retry) { + if (uctlrs > 2) + rval = morethan2_ctl_balance(sp, cl, min_reps, ep); + else + rval = lessthan3_ctl_balance(sp, cl, min_reps, ep); + + if (rval && mdismddberror(ep, MDE_TOOMANY_REPLICAS)) { + mdclrerror(ep); + rval = 0; + } + } + + /* + * Free up the ctlr list. + */ + free_ctlr_lst(&cl); + + return (rval); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_devadm.c b/usr/src/lib/lvm/libmeta/common/meta_devadm.c new file mode 100644 index 0000000000..a30789a72e --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_devadm.c @@ -0,0 +1,1607 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). 
You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <stdio.h> +#include <stdarg.h> +#include <ctype.h> +#include <sys/fcntl.h> +#include <sys/types.h> +#include <devid.h> +#include <ftw.h> +#include <string.h> +#include <mdiox.h> +#include <sys/lvm/mdio.h> +#include <meta.h> +#include <syslog.h> +#include <sdssc.h> +#include "meta_set_prv.h" + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. 
+ */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +#define RAW_PATH 0x001 /* rdsk */ +#define BLOCK_PATH 0x002 /* dsk */ +#define DSK_TYPE 0x004 /* normal /dev/[r]dsk */ +#define TEST_TYPE 0x008 /* test driver path */ +#define DID_TYPE 0x010 /* cluster did path */ +#define AP_TYPE 0x020 /* should be obsolete */ + +typedef struct path_list { + char *search_path; + char *search_type; + int path_type; +} path_list_t; + +/* + * A table of the supported path types - this should ideally be generated + * by reading the /etc/lvm/devpath file + */ +static path_list_t plist[] = { + {"/dev/rdsk", DEVID_MINOR_NAME_ALL_CHR, RAW_PATH|DSK_TYPE}, + {"/dev/dsk", DEVID_MINOR_NAME_ALL_BLK, BLOCK_PATH|DSK_TYPE}, + {"/dev/did/rdsk", DEVID_MINOR_NAME_ALL_CHR, RAW_PATH|DID_TYPE}, + {"/dev/did/dsk", DEVID_MINOR_NAME_ALL_BLK, BLOCK_PATH|DID_TYPE}, + {"/dev/td/dsk", DEVID_MINOR_NAME_ALL_BLK, BLOCK_PATH|TEST_TYPE}, + {"/dev/td/rdsk", DEVID_MINOR_NAME_ALL_CHR, RAW_PATH|TEST_TYPE}, +}; +static int num = sizeof (plist)/sizeof (path_list_t); + +static mddevopts_t dev_options = 0; + +/* indicate whether to print an error message or not */ +static int firsttime = 1; + +#define DEV_MATCH 0x1 +#define NAME_MATCH 0x2 + +#define DEBUGON 1 +#define DEBUGOFF 2 + +/* + * Debug function: to turn on devadm function debugging include DEVADM + * in the MD_DEBUG enviroment variable: MD_DEBUG=...,DEVADM... + */ +/*PRINTFLIKE1*/ +static void +mda_debug(char *format, ...) +{ + char *p; + static int debug_set = 0; + va_list ap; + + if (debug_set == 0) { + if (((p = getenv("MD_DEBUG")) != NULL) && + (strstr(p, "DEVADM") != NULL)) + debug_set = DEBUGON; + else + debug_set = DEBUGOFF; + } + if (debug_set == DEBUGON) { + va_start(ap, format); + (void) vfprintf(stderr, format, ap); + va_end(ap); + } +} + +/* print error messages to the terminal or syslog */ +/*PRINTFLIKE1*/ +static void +mda_print(char *message, ...) 
+{ + va_list ap; + + va_start(ap, message); + if (dev_options & DEV_LOG) { + /* + * The program is a daemon in the sense that it + * is a system utility. + */ + (void) vsyslog((LOG_ERR | LOG_DAEMON), message, ap); + } else { + (void) vfprintf(stderr, message, ap); + } + va_end(ap); +} + +/* + * Utility to find the correct options to use for the devid search + * based upon the path of the device. + * + * RETURN: + * -1 Error, the path passed in is not in the table + * >= 0 The element number for the options within the table + */ +static int +mda_findpath(char *path) +{ + int i = 0; + + for (i = 0; i < num; i++) { + if (strncmp(plist[i].search_path, path, + strlen(plist[i].search_path)) == 0) + return (i); + } + return (-1); +} + +/* + * Utility to get the path of a device + */ +static char * +mda_getpath(char *devname) +{ + char *ptr; + char *pathname; + size_t len; + + if ((ptr = strrchr(devname, '/')) == NULL) { + mda_debug("Invalid format: %s\n", devname); + return (NULL); + } + ptr++; + len = strlen(devname) - strlen(ptr); + pathname = Malloc(len + 1); + (void) strncpy(pathname, devname, len); + pathname[len] = '\0'; + return (pathname); +} + +/* + * update_locator_namespace -- Contains the ioctl call that will update + * the ctds and pathname (ie. /dev/dsk etc) within the + * locator block namespace. 
+ * + * RETURN + * METADEVADM_ERR ioctl failed and ep is updated with the error + * METADEVADM_SUCCESS success + */ +static int +update_locator_namespace( + set_t setno, + side_t sideno, + char *devname, + md_dev64_t dev, + char *pname, + md_error_t *ep +) +{ + mdnm_params_t nm; + + (void) memset(&nm, '\0', sizeof (nm)); + nm.mde = mdnullerror; + nm.setno = setno; + nm.side = sideno; + nm.devname = (uintptr_t)devname; + nm.devname_len = strlen(devname); + nm.devt = dev; + nm.pathname = (uintptr_t)pname; + nm.pathname_len = strlen(pname); + if (metaioctl(MD_IOCUPD_LOCNM, &nm, &nm.mde, NULL) != 0) { + (void) mdstealerror(ep, &nm.mde); + return (METADEVADM_ERR); + } + return (METADEVADM_SUCCESS); +} + +/* + * update_namespace -- Contains the ioctl call that will update the + * device name and pathname in the namespace area. + * + * RETURN + * METADEVADM_ERR ioctl failed and ep is updated with the error + * METADEVADM_SUCCESS success + */ +static int +update_namespace( + set_t setno, + side_t sideno, + char *devname, + md_dev64_t dev, + mdkey_t key, + char *pname, + md_error_t *ep +) +{ + mdnm_params_t nm; + + (void) memset(&nm, '\0', sizeof (nm)); + nm.mde = mdnullerror; + nm.setno = setno; + nm.side = sideno; + nm.devname = (uintptr_t)devname; + nm.devname_len = strlen(devname); + nm.mnum = meta_getminor(dev); + nm.key = key; + nm.pathname = (uintptr_t)pname; + nm.pathname_len = strlen(pname); + if (metaioctl(MD_IOCUPD_NM, &nm, &nm.mde, NULL) != 0) { + (void) mdstealerror(ep, &nm.mde); + return (METADEVADM_ERR); + } + return (METADEVADM_SUCCESS); +} + +/* + * stripS - Strip s<digits> off the end of the ctds name if it exists + */ +static void +stripS(char *name) +{ + char *p; + + /* gobble number and 's' */ + p = name + strlen(name) - 1; + for (; (p > name); --p) { + if (!isdigit(*p)) + break; + } + + if (*p == 's') { + *p = '\0'; + } +} + +/* + * getdiskname -- to be used when scanning the input from the -u arg. 
+ * This routine will strip off input that is anything but cxtxdx. + * ie. it will call stripS to get rid of slice info. Will also + * strip off /dev/dsk, /dev/rdsk, /dev/ap/dsk, /dev/ap/rdsk, + * /dev/did/dsk, or /dev/did/rdsk. The caller will need to free + * the return value. + * + * RETURN + * string that has the disk name in it ie. c0t0d0 + */ +static char * +getdiskname( + char *name +) +{ + char *p; + char *diskname; + + /* regular device */ + if ((strncmp(name, "/dev/dsk/", strlen("/dev/dsk/")) == 0) && + (strchr((p = name + strlen("/dev/dsk/")), '/') == NULL)) { + diskname = Strdup(p); + stripS(diskname); + return (diskname); + } + + if ((strncmp(name, "/dev/rdsk/", strlen("/dev/rdsk/")) == 0) && + (strchr((p = name + strlen("/dev/rdsk/")), '/') == NULL)) { + diskname = Strdup(p); + stripS(diskname); + return (diskname); + } + + if ((strncmp(name, "/dev/ap/dsk/", strlen("/dev/ap/dsk/")) == 0) && + (strchr((p = name + strlen("/dev/ap/dsk/")), '/') == NULL)) { + diskname = Strdup(p); + stripS(diskname); + return (diskname); + } + + if ((strncmp(name, "/dev/ap/rdsk/", strlen("/dev/ap/rdsk/")) == 0) && + (strchr((p = name + strlen("/dev/ap/rdsk/")), '/') == NULL)) { + diskname = Strdup(p); + stripS(diskname); + return (diskname); + } + + if ((strncmp(name, "/dev/did/dsk/", strlen("/dev/did/dsk/")) == 0) && + (strchr((p = name + strlen("/dev/did/dsk/")), '/') == NULL)) { + diskname = Strdup(p); + stripS(diskname); + return (diskname); + } + + if ((strncmp(name, "/dev/did/rdsk/", strlen("/dev/did/rdsk/")) == 0) && + (strchr((p = name + strlen("/dev/did/rdsk/")), '/') == NULL)) { + diskname = Strdup(p); + stripS(diskname); + return (diskname); + } + + diskname = Strdup(name); + stripS(diskname); + return (diskname); +} + +/* + * has_devid -- return the device ID for a given key + * + * RETURN + * NULL error + * devid devid found that corresponds to the given key. 
+ */ +static ddi_devid_t +has_devid(set_t setno, side_t sideno, mdkey_t key, md_error_t *ep) +{ + return (meta_getdidbykey(setno, sideno, key, ep)); +} + +/* + * Go through the existing list of replicas and check to see + * if their disk has moved, if so update the replica list + * + * RETURN + * -1 error + * 0 success + */ +static int +fix_replicanames( + mdsetname_t *sp, + md_error_t *ep +) +{ + md_replicalist_t *rlp = NULL; + md_replicalist_t *rl; + int ret = -1; + int match_type = 0; + devid_nmlist_t *disklist = NULL; + dev_t small_dev = (dev_t)NODEV; + side_t sideno; + set_t setno = sp->setno; + char *search_path; + int search_number; + char *ctds_name; + char *path_name; + int i; + + sideno = getmyside(sp, ep); + if (sideno == MD_SIDEWILD) { + mda_debug("Failed to find the side number\n"); + return (-1); + } + + if (metareplicalist(sp, MD_BASICNAME_OK | PRINT_FAST, &rlp, ep) < 0) { + mda_debug("Unable to get a list of replicas\n"); + return (METADEVADM_ERR); + } + + for (rl = rlp; (rl != NULL); rl = rl->rl_next) { + md_replica_t *r = rl->rl_repp; + + small_dev = meta_cmpldev(r->r_namep->dev); + search_number = mda_findpath(r->r_namep->bname); + if (search_number == -1) { + mda_debug("replica update: invalid path: %s", + r->r_namep->bname); + continue; + } else { + search_path = plist[search_number].search_path; + } + + if (r->r_devid == NULL) + continue; + + ret = meta_deviceid_to_nmlist(search_path, r->r_devid, + r->r_minor_name, &disklist); + + mda_debug("replica update: search_path %s\n", search_path); + + if (ret != 0) { + /* + * Failed to find the disk, nothing can be done. + * The replica will be marked as bad later. 
+ */ + mda_debug("replica update: failed to find disk %s\n", + r->r_namep->cname); + continue; + } + mda_debug("replica update: current %s (%p)\n", + r->r_namep->bname, (void *) small_dev); + + /* + * Check to see if the returned disk matches the stored one + */ + for (i = 0; disklist[i].dev != NODEV; i++) { + match_type = 0; + + mda_debug("replica update: devid list: %s (%p)\n", + disklist[i].devname, (void *) disklist[i].dev); + + if (disklist[i].dev == small_dev) { + match_type |= DEV_MATCH; + } + + if (strncmp(r->r_namep->bname, disklist[i].devname, + strlen(r->r_namep->bname)) == 0) { + match_type |= NAME_MATCH; + } + + /* + * break out if some sort of match is found because + * we already match on the devid. + */ + if (match_type != 0) + break; + } + + mda_debug("fix_replicanames: match: %x i: %d\n", match_type, i); + + if (match_type == (DEV_MATCH|NAME_MATCH)) { + /* no change */ + mda_debug("replica update: no change %s\n", + disklist[i].devname); + devid_free_nmlist(disklist); + continue; + } + + /* No match found - use the first entry in disklist */ + if (disklist[i].dev == NODEV) + i = 0; + + mda_debug("replica update: reloading %s %p\n", + disklist[i].devname, + (void *) meta_expldev(disklist[i].dev)); + + if (firsttime) { + mda_print(dgettext(TEXT_DOMAIN, + "Disk movement detected\n")); + mda_print(dgettext(TEXT_DOMAIN, + "Updating device names in Solaris Volume " + "Manager\n")); + firsttime = 0; + } + + if (dev_options & DEV_VERBOSE) { + char *devidstr; + + devidstr = + devid_str_encode(r->r_devid, r->r_minor_name); + if (devidstr == NULL) { + mda_print(dgettext(TEXT_DOMAIN, + "Failed to encode the devid\n")); + continue; + } + mda_print(dgettext(TEXT_DOMAIN, + "%s changed to %s from device relocation " + "information %s\n"), + (char *)r->r_namep->cname, disklist[i].devname, + devidstr); + } + + if (!(dev_options & DEV_NOACTION)) { + mda_debug("Updating locator name\n"); + ctds_name = strrchr(disklist[i].devname, '/'); + ctds_name++; + if ((path_name 
= mda_getpath(disklist[i].devname)) + == NULL) { + continue; + } + if (update_locator_namespace(setno, sideno, + ctds_name, meta_expldev(disklist[i].dev), + path_name, ep) != 0) { + mda_debug("replica update: ioctl failed\n"); + if (dev_options & DEV_VERBOSE) { + mda_print(dgettext(TEXT_DOMAIN, + "Failed to update locator " + "namespace on change from %s " + "to %s\n"), ctds_name, + disklist[i].devname); + } + } + } + Free(path_name); + devid_free_nmlist(disklist); + } + metafreereplicalist(rlp); + return (0); +} + +/* + * pathname_reload - main function for the -r option. Will reload the + * pathname in both the main namespace and the locator namespace. + * Also, checks both areas for invalid device ID's and prints them + * out. + * + * If the set is a multi-node diskset that means there are no devid's + * so just return. + * + * RETURN + * METADEVADM_ERR error + * METADEVADM_SUCCESS success + * METADEVADM_DEVIDINVALID success, but invalid devids detected + */ +int +pathname_reload( + mdsetname_t **spp, + set_t setno, + md_error_t *ep) +{ + char *drvnmp; + minor_t mnum = 0; + md_dev64_t dev = 0; + mdnm_params_t nm; + char *ctds_name; + ddi_devid_t devidp; + md_i_didstat_t ds; + side_t sideno; + char *search_path = NULL; + int search_number; + devid_nmlist_t *disklist = NULL; + char *minor_name = NULL; + char *devidstr = NULL; + char *path = NULL; + int ret; + dev_t small_dev = (dev_t)NODEV; + int match_type; + char *tmp = NULL; + mdsetname_t *sp = *spp; + md_set_desc *sd; + int i; + + /* + * Check for multi-node diskset and return if it is one. + */ + if (!metaislocalset(sp)) { + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (METADEVADM_ERR); + + if (MD_MNSET_DESC(sd)) + return (METADEVADM_SUCCESS); + } + + /* + * Get the entry of the namespace via the key. To do this + * call MD_IOCNXTKEY until no more. 
+ * For each entry in the namespace we want to check + * for devid and update + */ + + (void) memset(&nm, '\0', sizeof (nm)); + nm.key = MD_KEYWILD; + + sideno = getmyside(*spp, ep); + if (sideno == MD_SIDEWILD) { + /* failed to find this node in the set */ + mda_debug("Failed to find the side number\n"); + return (METADEVADM_ERR); + } + + /* LINTED */ + while (1) { + nm.mde = mdnullerror; + nm.setno = setno; + nm.side = sideno; + /* look at each key in the namespace */ + if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0) { + (void) mdstealerror(ep, &nm.mde); + return (METADEVADM_ERR); + } + + if (nm.key == MD_KEYWILD) { + /* no more entries */ + break; + } + + /* + * get the nm entry using the key. Then check to see if + * there's a devid associated with this entry + * If not, go onto next key. + */ + if ((nm.devname = (uintptr_t)meta_getnmentbykey(setno, sideno, + nm.key, &drvnmp, &mnum, &dev, ep)) == NULL) { + mda_debug("pathname_reload: no name for key: %d\n", + nm.key); + continue; + } + + mda_debug("pathname_reload: examining %s\n", + (char *)nm.devname); + + if ((devidp = has_devid(setno, sideno, nm.key, ep)) == NULL) { + /* metadevices do not have devid's in them */ + mda_debug("pathname_reload: no devid for %s\n", + (char *)nm.devname); + continue; + } + + if ((minor_name = meta_getdidminorbykey(setno, sideno, + nm.key, ep)) == NULL) { + /* + * In theory this is impossible because if the + * devidp is non-null then the minor_name has + * already been looked up. + */ + mda_debug("No minor name for %s\n", (char *)nm.devname); + free(devidp); + continue; + } + /* + * If there is a devid then we have a real device that + * could have moved. + */ + devidstr = devid_str_encode(devidp, minor_name); + if (devidstr == NULL) { + mda_debug("Failed to encode the devid\n"); + free(devidp); + continue; + } + mda_debug("devid: %s\n", devidstr); + + /* + * Find the search path that should be used. 
This is an + * optimization to try and prevent a search for the complete + * /dev namespace. + */ + search_number = mda_findpath((char *)nm.devname); + if (search_number == -1) { + search_path = "/dev"; + } else { + search_path = plist[search_number].search_path; + } + + /* now look for the disk name using the devid */ + ret = meta_deviceid_to_nmlist(search_path, devidp, + minor_name, &disklist); + free(devidp); + + if (ret != 0) { + /* + * Failed to find the disk + */ + devid_str_free(devidstr); + continue; + } + + small_dev = meta_cmpldev(dev); + mda_debug("Old device lookup: %s (%p)\n", + (char *)nm.devname, (void *)small_dev); + + /* + * Check to see if the returned disk matches the stored one + */ + for (i = 0; disklist[i].dev != NODEV; i++) { + match_type = 0; + mda_debug("From devid lookup: %s (%p)\n", + (char *)disklist[i].devname, + (void *)disklist[i].dev); + + if (disklist[i].dev == small_dev) { + match_type |= DEV_MATCH; + } + + if (strncmp((char *)nm.devname, disklist[i].devname, + strlen((char *)nm.devname)) == 0) { + mda_debug("Name match: %s and %s (%d)\n", + disklist[i].devname, (char *)nm.devname, + strlen((char *)nm.devname)); + match_type |= NAME_MATCH; + } + + if (match_type == (DEV_MATCH|NAME_MATCH)) + break; + } + + if (match_type == (DEV_MATCH|NAME_MATCH)) { + /* no change */ + devid_str_free(devidstr); + mda_debug("All matched %s\n", disklist[i].devname); + devid_free_nmlist(disklist); + continue; + } + + /* No match found - use the first entry in disklist */ + i = 0; + + if (firsttime) { + mda_print(dgettext(TEXT_DOMAIN, + "Disk movement detected\n")); + mda_print(dgettext(TEXT_DOMAIN, + "Updating device names in " + "Solaris Volume Manager\n")); + firsttime = 0; + } + if (dev_options & DEV_VERBOSE) { + mda_print(dgettext(TEXT_DOMAIN, + "%s changed to %s from device relocation " + "information %s\n"), + (char *)nm.devname, disklist[i].devname, + devidstr); + } + devid_str_free(devidstr); + + /* need to build up the path of the disk */ + if 
((path = Strdup(disklist[i].devname)) == NULL) { + mda_debug("Failed to duplicate path: %s\n", + disklist[i].devname); + devid_free_nmlist(disklist); + continue; + } + if ((tmp = strrchr(path, '/')) == NULL) { + mda_debug("Failed to parse %s\n", path); + devid_free_nmlist(disklist); + Free(path); + continue; + } + tmp += sizeof (char); + *tmp = '\0'; + + if ((ctds_name = strrchr(disklist[i].devname, '/')) == NULL) { + mda_debug("Failed to parse ctds name: %s\n", + disklist[i].devname); + devid_free_nmlist(disklist); + Free(path); + continue; + } + ctds_name += sizeof (char); + + mda_debug("Reloading disk %s %s %p\n", + ctds_name, path, (void *) meta_expldev(disklist[i].dev)); + + if (!(dev_options & DEV_NOACTION)) { + /* Something has changed so update the namespace */ + if (update_namespace(setno, sideno, ctds_name, + meta_expldev(disklist[i].dev), nm.key, path, + ep) != 0) { + mda_debug("Failed to update namespace\n"); + if (dev_options & DEV_VERBOSE) { + mda_print(dgettext(TEXT_DOMAIN, + "Failed to update namespace on " + "change from %s to %s\n"), + ctds_name, disklist[i].devname); + } + } + } + devid_free_nmlist(disklist); + Free(path); + } + + if (fix_replicanames(*spp, ep) == -1) + mda_debug("Failed to update replicas\n"); + + /* + * check for invalid device id's + */ + (void) memset(&ds, '\0', sizeof (ds)); + ds.setno = setno; + ds.side = sideno; + ds.mode = MD_FIND_INVDID; + /* get count of number of invalid device id's */ + if (metaioctl(MD_IOCDID_STAT, &ds, &ds.mde, NULL) != 0) { + (void) mdstealerror(ep, &ds.mde); + return (METADEVADM_ERR); + } + if (ds.cnt != 0) { + char *ctdptr, *ctdp; + /* + * we have some invalid device id's so we need to + * print them out + */ + ds.mode = MD_GET_INVDID; + /* malloc buffer for kernel to place devid list into */ + if ((ctdptr = (char *)Malloc((ds.cnt * ds.maxsz) + 1)) == 0) { + return (METADEVADM_ERR); + } + ds.ctdp = (uintptr_t)ctdptr; + /* get actual list of invalid device id's */ + if (metaioctl(MD_IOCDID_STAT, 
&ds, &ds.mde, NULL) != 0) { + Free(ctdptr); + (void) mdstealerror(ep, &ds.mde); + return (METADEVADM_ERR); + } + + /* print out the invalid devid's */ + mda_print(dgettext(TEXT_DOMAIN, + "Invalid device relocation information " + "detected in Solaris Volume Manager\n")); + mda_print(dgettext(TEXT_DOMAIN, + "Please check the status of the following disk(s):\n")); + ctdp = (char *)ds.ctdp; + while (*ctdp != NULL) { + mda_print("\t%s\n", ctdp); + ctdp += ds.maxsz; + } + Free(ctdptr); + return (METADEVADM_DEVIDINVALID); + } + return (METADEVADM_SUCCESS); +} + +/* + * replica_update_devid - cycle through the replica list, rlp, and + * update the device ids on all of the replicas that are on the + * device specified by lp. A side effect is to update the value of + * cdevidpp to contain the character representation of the device + * id before updating if it is not already set. + * + * RETURN + * METADEVADM_ERR error + * METADEVADM_SUCCESS success + */ +static int +replica_update_devid( + md_replicalist_t *rlp, + mddrivename_t *dnp, + set_t setno, + char **cdevidpp, + md_error_t *ep +) +{ + mddb_config_t db_c; + md_replicalist_t *rl; + ddi_devid_t devidp; + int ret; + + if (cdevidpp == NULL) + return (METADEVADM_ERR); + + ret = devid_str_decode(dnp->devid, &devidp, NULL); + if (ret != 0) { + /* failed to encode the devid */ + mda_debug("Failed to decode %s into a valid devid\n", + dnp->devid); + return (METADEVADM_ERR); + } + + /* search replica list for give ctd name */ + for (rl = rlp; (rl != NULL); rl = rl->rl_next) { + md_replica_t *r = rl->rl_repp; + mdname_t *rnp = r->r_namep; + + if (strncmp(rnp->cname, dnp->cname, strlen(dnp->cname)) == 0) { + + /* found the replica, now grab the devid */ + if (*cdevidpp == NULL) { + *cdevidpp = devid_str_encode(r->r_devid, NULL); + } + + if (*cdevidpp == NULL) { + devid_free(devidp); + return (METADEVADM_ERR); + } + + mda_debug("Updating replica %s, set %d, old devid %s\n", + rnp->cname, setno, *cdevidpp); + + if (dev_options & 
DEV_VERBOSE) { + mda_print(dgettext(TEXT_DOMAIN, + "Updating replica %s of set number %d from " + "device id %s to device id %s\n"), + rnp->cname, setno, *cdevidpp, dnp->devid); + } + + (void) memset(&db_c, '\0', sizeof (db_c)); + + db_c.c_setno = setno; + db_c.c_devt = rnp->dev; + + if (!(dev_options & DEV_NOACTION)) { + + mda_debug("Updating replica\n"); + + /* + * call into kernel to update lb + * namespace device id + * of given devt + */ + if (metaioctl(MD_DB_SETDID, &db_c, + &db_c.c_mde, NULL) != 0) { + devid_free(devidp); + (void) mdstealerror(ep, &db_c.c_mde); + return (METADEVADM_ERR); + } + } + + } + } + devid_free(devidp); + return (METADEVADM_SUCCESS); +} + +/* + * devid_update -- main routine for the -u option. Will update both the + * namespace and the locator block with the correct devid for the + * disk specified. + * + * RETURN + * METADEVADM_ERR error + * METADEVADM_SUCCESS success + */ +static int +devid_update( + mdsetname_t **spp, + set_t setno, + char *ctd, + md_error_t *ep +) +{ + md_drive_desc *dd, *ddp; + mddrivename_t *dnp; + mdnm_params_t nm; + ddi_devid_t devidp; + side_t side; + char *old_cdevidp = NULL; + md_replicalist_t *rlp = NULL; + int rval = METADEVADM_ERR; + mdname_t *np = NULL; + uint_t rep_slice; + char *pathname = NULL; + char *diskname = NULL; + int fd = -1; + int len; + char *fp; + + side = getmyside(*spp, ep); + if (side == MD_SIDEWILD) { + /* failed to find this node in the set */ + mda_debug("Failed to find the side number\n"); + return (METADEVADM_ERR); + } + + if ((dnp = metadrivename(spp, ctd, ep)) == NULL) { + mda_debug("Failed to create a dnp for %s\n", ctd); + return (METADEVADM_ERR); + } + if (dnp->devid == NULL) { + /* + * Disk does not have a devid! So cannot update the + * devid within the replica. + */ + mda_debug("%s does not have a devid\n", dnp->cname); + if (dev_options & DEV_VERBOSE) { + mda_print(dgettext(TEXT_DOMAIN, + "%s does not have a device id. 
Cannot update " + "device id if none exists\n"), ctd); + } + return (METADEVADM_ERR); + } + + mda_debug("Devid update to: %s\n", dnp->devid); + + /* + * Check if we own the set, if we do then do some processing + * on the replicas. + */ + if (meta_check_ownership(*spp, ep) == 0) { + + /* get the replicas */ + if (metareplicalist(*spp, MD_BASICNAME_OK | PRINT_FAST, &rlp, + ep) < 0) + return (METADEVADM_ERR); + + /* update the devids in the replicas if necessary */ + if (replica_update_devid(rlp, dnp, setno, &old_cdevidp, + ep) != METADEVADM_SUCCESS) { + metafreereplicalist(rlp); + return (METADEVADM_ERR); + } + + metafreereplicalist(rlp); + } + + /* + * If this is not the LOCAL set then need to update the LOCAL + * replica with the new disk record. + */ + + if (setno != MD_LOCAL_SET) { + mda_debug("Non-local set: %d side %d\n", setno, side); + + /* + * Need to find the disk record within the set and then + * update it. + */ + if ((dd = + metaget_drivedesc(*spp, MD_FULLNAME_ONLY, ep)) == NULL) { + if (! mdisok(ep)) + goto out; + /* no disks in the set - no point continuing */ + mda_debug("No disks in diskset\n"); + rval = METADEVADM_SUCCESS; + goto out; + } + + for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) { + if (strncmp(ddp->dd_dnp->cname, dnp->cname, + strlen(dnp->cname)) == 0) + break; + } + + if (ddp == NULL) { + /* failed to finddisk in the set */ + mda_print(dgettext(TEXT_DOMAIN, + "%s not found in set %s. Check your syntax\n"), + ctd, (*spp)->setname); + (void) mddserror(ep, MDE_DS_DRIVENOTINSET, setno, NULL, + ctd, (*spp)->setname); + goto out; + } + + /* + * Now figure out the correct slice, for a diskset the slice + * we care about is always the 'replica' slice. 
+ */ + if (meta_replicaslice(dnp, &rep_slice, ep) != 0) { + mda_debug("Unable to find replica slice for %s\n", + dnp->cname); + goto out; + } + + mda_debug("slice no: %d disk %s\n", rep_slice, dnp->cname); + + if ((np = metaslicename(dnp, rep_slice, ep)) == NULL) { + mda_debug("Unable to build namespace\n"); + goto out; + } + + mda_debug("check: ctdname: %s\n", np->cname); + mda_debug("check: ctdname: %s\n", np->rname); + mda_debug("check: ctdname: %s\n", np->bname); + + if (!(dev_options & DEV_NOACTION)) { + + mda_debug("Updating record: key %d name %s\n", + ddp->dd_dnp->side_names_key, np->cname); + + pathname = mda_getpath(np->bname); + + if (update_namespace(MD_LOCAL_SET, side + SKEW, + np->cname, np->dev, ddp->dd_dnp->side_names_key, + pathname, ep) != 0) { + goto out; + } + + /* + * Now update the devid entry as well, this works + * correctly because the prior call to + * update_namespace() above puts the correct dev_t + * in the namespace which will then be resolved + * to the new devid by the ioctl now called. + */ + nm.mde = mdnullerror; + nm.setno = MD_LOCAL_SET; + nm.side = side + SKEW; + nm.key = ddp->dd_dnp->side_names_key; + if (metaioctl(MD_SETNMDID, &nm, &nm.mde, NULL) != 0) { + (void) mdstealerror(ep, &nm.mde); + goto out; + } + } + } + + if ((dev_options & DEV_LOCAL_SET) && (setno != MD_LOCAL_SET)) { + /* + * Only want to update the local set so do not continue. + */ + rval = METADEVADM_SUCCESS; + goto out; + } + + /* + * Iterate through all of the metadevices looking for the + * passed in ctd. 
If found then update the devid + */ + (void) memset(&nm, '\0', sizeof (nm)); + nm.key = MD_KEYWILD; + /* LINTED */ + while (1) { + nm.mde = mdnullerror; + nm.setno = setno; + nm.side = side; + + /* search each namespace entry */ + if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0) { + (void) mdstealerror(ep, &nm.mde); + rval = METADEVADM_ERR; + goto out; + } + if (nm.key == MD_KEYWILD) { + if (setno != MD_LOCAL_SET) { + mda_print(dgettext(TEXT_DOMAIN, + "%s not found in set %s. Check your " + "syntax\n"), ctd, (*spp)->setname); + goto out; + } else { + mda_print(dgettext(TEXT_DOMAIN, + "%s not found in local set. " + "Check your syntax\n"), ctd); + goto out; + } + } + + nm.devname = (uintptr_t)meta_getnmentbykey(setno, side, nm.key, + NULL, NULL, NULL, ep); + if (nm.devname == NULL) { + rval = METADEVADM_ERR; + goto out; + } + + diskname = getdiskname((char *)nm.devname); + + mda_debug("Checking %s with %s\n", diskname, dnp->cname); + if (strcmp(diskname, dnp->cname) != 0) + continue; + + mda_debug("Updating device %s in namespace\n", + (char *)nm.devname); + + /* + * found disk, does it have a devid within the namespace ? + * It might not because it does not support devid's or was + * put into the namespace when there was no devid support + */ + if ((devidp = has_devid(setno, side, nm.key, ep)) == NULL) { + mda_debug("%s has no devid in the namespace", + (char *)nm.devname); + if (dev_options & DEV_VERBOSE) { + mda_print(dgettext(TEXT_DOMAIN, + "SVM has no device id for " + "%s, cannot update.\n"), (char *)nm.devname); + } + continue; /* no devid. go on to next */ + } + if (old_cdevidp == NULL) { + old_cdevidp = devid_str_encode(devidp, NULL); + } + free(devidp); + + /* + * has devid so update namespace, note the key has been set + * by the prior MD_IOCNXTKEY_NM ioctl. 
+ */ + nm.mde = mdnullerror; + nm.setno = setno; + nm.side = side; + if (!(dev_options & DEV_NOACTION)) { + /* + * The call below may fail if the -u option is being + * used to update a disk that has been replaced. + * The -u option to metadevadm should not be used + * for this purpose because we trust the dev_t of + * the device in the replica and if we have replaced + * the device and it is a fibre one then the dev_t + * will have changed. This means we end up looking for + * the devid of a non-existant disk and we subsequently + * fail with NODEVID. + */ + if (metaioctl(MD_SETNMDID, &nm, + &nm.mde, NULL) != 0) { + if (dev_options & DEV_VERBOSE) { + mda_print(dgettext(TEXT_DOMAIN, + "SVM failed to update the device " + "id for %s probably due to both " + "devt and device id changing.\n"), + (char *)nm.devname); + } + (void) mdstealerror(ep, &nm.mde); + mde_perror(ep, ""); + rval = METADEVADM_ERR; + goto out; + } + } + if (old_cdevidp == NULL) { + rval = METADEVADM_ERR; + goto out; + } + break; + } /* end while */ + + mda_print(dgettext(TEXT_DOMAIN, + "Updating Solaris Volume Manager device relocation " + "information for %s\n"), ctd); + + mda_print(dgettext(TEXT_DOMAIN, + "Old device reloc information:\n\t%s\n"), old_cdevidp); + + len = strlen(dnp->rname) + strlen("s0"); + if ((fp = (char *)Malloc(len + 1)) == NULL) { + mda_print(dgettext(TEXT_DOMAIN, + "insufficient memory, device Reloc info not " + "available\n")); + } else { + (void) snprintf(fp, len + 1, "%ss0", dnp->rname); + if ((fd = open(fp, O_RDONLY|O_NDELAY)) < 0) { + mda_print(dgettext(TEXT_DOMAIN, + "Open of %s failed\n"), fp); + } else { + int rc = -1; + ddi_devid_t devid1 = NULL; + char *cdevidp; + + rc = devid_get(fd, &devid1); + if (close(fd) < 0) { + mda_print(dgettext(TEXT_DOMAIN, + "Close of %s failed\n"), fp); + } + if (rc != 0) { + mda_print(dgettext(TEXT_DOMAIN, + "Unable to obtain device " + "Reloc info for %s\n"), fp); + } else { + cdevidp = devid_str_encode(devid1, NULL); + if (cdevidp == 
NULL) { + mda_print(dgettext(TEXT_DOMAIN, + "Unable to print " + "device Reloc info for %s\n"), fp); + } else { + mda_print(dgettext(TEXT_DOMAIN, + "New device reloc " + "information:\n\t%s\n"), cdevidp); + devid_str_free(cdevidp); + } + devid_free(devid1); + } + } + Free(fp); + } + + rval = METADEVADM_SUCCESS; + +out: + if (diskname) + Free(diskname); + if (pathname) + Free(pathname); + if (old_cdevidp) { + devid_str_free(old_cdevidp); + } + return (rval); + +} + +/* + * Check the ctd name of the disk to see if the disk has moved. If it + * has moved then the newname is returned in 'newname', it is up to + * the caller to free the memory associated with it. + * + * RETURN + * METADEVADM_ERR error + * METADEVADM_SUCCESS success + * METADEVADM_DISKMOVE success, and the disk has moved + * METADEVADM_DSKNAME_ERR error creating the disk name structures. + */ +int +meta_upd_ctdnames( + mdsetname_t **spp, + set_t setno, + side_t sideno, + mddrivename_t *dnp, + char **newname, + md_error_t *ep +) +{ + char *drvnmp; + int i; + minor_t mnum = 0; + md_dev64_t dev = 0; + dev_t small_dev = (dev_t)NODEV; + mdnm_params_t nm; + char *pathname; + char *minor_name = NULL; + ddi_devid_t devidp; + devid_nmlist_t *disklist = NULL; + int ret = 0; + mdsidenames_t *snp; + int match_type; + int search_number = -1; + char *search_type = NULL; + char *search_path = NULL; + uint_t rep_slice; + mddrivename_t *newdnp; + mdname_t *np; + mdsetname_t *sp = *spp; + md_set_desc *sd; + + /* + * setno should always be 0 but we're going to + * check for multi-node diskset and return if it is one. 
+ */ + if (!metaislocalset(sp)) { + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (METADEVADM_ERR); + + if (MD_MNSET_DESC(sd)) + return (METADEVADM_SUCCESS); + } + + if (dnp->devid == NULL) { + /* no devid, nothing can be done */ + mda_debug("meta_upd_ctdnames: %s has no devid\n", dnp->cname); + if (dev_options & DEV_VERBOSE) { + mda_print(dgettext(TEXT_DOMAIN, + "%s has no devid, cannot detect " + "disk movement for this disk.\n"), dnp->cname); + } + return (ret); + } + + /* + * Find the correct side name for the disk. There is a sidename + * for each host associated with the diskset. + */ + for (snp = dnp->side_names; snp != NULL; snp = snp->next) { + mda_debug("meta_upd_ctdnames: %s %d args: setno %d sideno %d\n", + snp->cname, snp->sideno, setno, sideno); + /* only use SKEW for the local replica */ + if (setno == 0) { + if (snp->sideno + SKEW == sideno) + break; + } else { + if (snp->sideno == sideno) + break; + } + } + + if (snp == NULL) { + /* + * Failed to find the side name, this should not + * be possible. However if it does happen this is an + * indication of an inconsistant replica - something + * might have gone wrong during an add or a delete of + * a host. + */ + mda_debug("Unable to find the side information for disk %s", + dnp->cname); + (void) mddserror(ep, MDE_DS_HOSTNOSIDE, (*spp)->setno, mynode(), + NULL, dnp->cname); + return (METADEVADM_ERR); + } + /* + * Find the type of device we are to be searching on + */ + search_number = mda_findpath(snp->cname); + if (search_number == -1) { + search_path = "/dev"; + search_type = DEVID_MINOR_NAME_ALL; + } else { + search_path = plist[search_number].search_path; + search_type = plist[search_number].search_type; + } + + mda_debug("Search path :%s searth_type: %x\n", + search_path, (int)search_type); + (void) memset(&nm, '\0', sizeof (nm)); + + nm.mde = mdnullerror; + nm.setno = setno; + nm.side = sideno; + + /* + * Get the devname from the name space. 
+ */ + if ((nm.devname = (uintptr_t)meta_getnmentbykey(setno, sideno, + dnp->side_names_key, &drvnmp, &mnum, &dev, ep)) == NULL) { + return (METADEVADM_ERR); + } + + ret = devid_str_decode(dnp->devid, &devidp, &minor_name); + devid_str_free(minor_name); + + if (ret != 0) { + /* + * Failed to encode the devid. + */ + devid_free(devidp); + return (METADEVADM_ERR); + } + + /* + * Use the stored devid to find the existing device node and check + * to see if the disk has moved. Use the raw devices as the name + * of the disk is stored as the raw device, if this is not done + * then the disk will not be found. + */ + ret = meta_deviceid_to_nmlist(search_path, devidp, + search_type, &disklist); + + if (ret != 0) { + if (dev_options & DEV_VERBOSE) { + mda_print(dgettext(TEXT_DOMAIN, + "Device ID %s last associated with " + "disk %s no longer found in system\n"), + dnp->devid, dnp->cname); + } + devid_free(devidp); + devid_free_nmlist(disklist); + return (METADEVADM_SUCCESS); + } + + small_dev = meta_cmpldev(dev); + mda_debug("Old device lookup: %s (%p)\n", + (char *)nm.devname, (void *)small_dev); + /* + * Check to see if the returned disk matches the stored one + */ + for (i = 0; disklist[i].dev != NODEV; i++) { + match_type = 0; + mda_debug("From devid lookup: %s (%p)\n", + disklist[i].devname, (void *)disklist[i].dev); + + if (disklist[i].dev == small_dev) { + match_type |= DEV_MATCH; + } + + if (strncmp((char *)nm.devname, disklist[i].devname, + strlen((char *)nm.devname)) == 0) { + match_type |= NAME_MATCH; + } + + if (match_type != 0) + break; + } + devid_free(devidp); + + mda_debug("meta_upd_ctdnames: match: %x i: %d\n", match_type, i); + + if (match_type == (DEV_MATCH|NAME_MATCH)) { + /* no change */ + devid_free_nmlist(disklist); + return (METADEVADM_SUCCESS); + } + + /* No match found - use the first entry in disklist */ + if (disklist[i].dev == NODEV) + i = 0; + + if (!(match_type & DEV_MATCH)) { + /* did not match on the dev, so dev_t has changed */ + 
mda_debug("Did not match on dev: %p %p\n", + (void *) small_dev, (void *) disklist[i].dev); + dev = meta_expldev(disklist[i].dev); + } + + if (!(match_type & NAME_MATCH)) { + mda_debug("Did not match on name: %s (%p)\n", + (char *)nm.devname, (void *) disklist[i].dev); + } + + /* + * If here, then the name in the disklist is the one we + * want in any case so use it. + */ + mda_debug("devname: %s\n", disklist[i].devname); + /* + * Need to remove the slice as metadrivename() expects a diskname + */ + stripS(disklist[i].devname); + /* + * Build an mddrivename_t to use + */ + if ((newdnp = metadrivename(spp, disklist[i].devname, ep)) == NULL) { + mda_debug("Unable to make a dnp out of %s\n", + disklist[i].devname); + return (METADEVADM_DSKNAME_ERR); + } + /* + * Need to find the correct slice used for the replica + */ + if (meta_replicaslice(newdnp, &rep_slice, ep) != 0) { + return (METADEVADM_DSKNAME_ERR); + } + + if ((np = metaslicename(newdnp, rep_slice, ep)) == NULL) { + mda_debug("Failed to build an np for %s\n", dnp->rname); + return (METADEVADM_DSKNAME_ERR); + } + mda_debug("check: cname: %s\n", np->cname); + mda_debug("check: rname: %s\n", np->rname); + mda_debug("check: bname: %s\n", np->bname); + + if (newname != NULL) + *newname = Strdup(np->bname); + + if (!(dev_options & DEV_NOACTION)) { + + mda_debug("update namespace\n"); + + /* get the block path */ + pathname = mda_getpath(np->bname); + + if (update_namespace(setno, sideno, np->cname, + dev, dnp->side_names_key, pathname, ep) != 0) { + /* finished with the list so return the memory */ + Free(pathname); + devid_free_nmlist(disklist); + return (METADEVADM_ERR); + } + } + /* finished with the list so return the memory */ + Free(pathname); + devid_free_nmlist(disklist); + ret = METADEVADM_DISKMOVE; + return (ret); +} + +int +meta_fixdevid( + mdsetname_t *sp, + mddevopts_t options, + char *diskname, + md_error_t *ep +) +{ + set_t setno = sp->setno; + int ret = 0; + char *pathname = NULL; + mdsetname_t 
*local_sp = NULL; + md_drive_desc *d = NULL; + char *newname = NULL; + md_drive_desc *dd; + side_t sideno; + md_set_desc *sd; + + /* if MN diskset just return */ + if (!metaislocalset(sp)) { + if ((sd = metaget_setdesc(sp, ep)) == NULL) { + return (METADEVADM_ERR); + } + if (MD_MNSET_DESC(sd)) + return (METADEVADM_SUCCESS); + } + + dev_options |= options; + mda_debug("dev_options: %x\n", dev_options); + if (dev_options & DEV_RELOAD) { + /* + * If it's not the local set we need to check the local + * namespace to see if disks have moved as it contains + * entries for the disks in the set. + */ + if (setno != MD_LOCAL_SET) { + if ((dd = metaget_drivedesc(sp, MD_BASICNAME_OK | + PRINT_FAST, ep)) == NULL) { + mde_perror(ep, ""); + mdclrerror(ep); + return (METADEVADM_ERR); + } + local_sp = metasetname(MD_LOCAL_NAME, ep); + sideno = getmyside(sp, ep) + SKEW; + for (d = dd; d != NULL; d = d->dd_next) { + /* + * Actually do the check of the disks. + */ + ret = meta_upd_ctdnames(&local_sp, 0, sideno, + d->dd_dnp, &newname, ep); + + if ((ret == METADEVADM_ERR) || + (ret == METADEVADM_DSKNAME_ERR)) { + /* check failed in unknown manner */ + mda_debug("meta_upd_ctdnames failed\n"); + return (METADEVADM_ERR); + } + } + } + + /* do a reload of the devid namespace */ + ret = pathname_reload(&sp, setno, ep); + } else if (dev_options & DEV_UPDATE) { + pathname = getdiskname(diskname); + ret = devid_update(&sp, setno, pathname, ep); + free(pathname); + } + return (ret); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_devstamp.c b/usr/src/lib/lvm/libmeta/common/meta_devstamp.c new file mode 100644 index 0000000000..1a3cf3e1ce --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_devstamp.c @@ -0,0 +1,127 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1993-2002 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * get timestamp from device + */ + +#include <meta.h> + +/* + * get timestamp + */ +int +getdevstamp( + mddrivename_t *dnp, + time_t *stamp, /* return timestamp here */ + md_error_t *ep +) +{ + int fd; + int partno; + struct vtoc vtocbuf; + mdname_t *np; + + if ((np = metaslicename(dnp, MD_SLICE0, ep)) == NULL) + return (-1); + + /* open given device */ + if ((fd = open(np->rname, O_RDONLY | O_NDELAY, 0)) < 0) + return (mdsyserror(ep, errno, np->cname)); + + /* re-read vtoc */ + if (meta_getvtoc(fd, np->cname, &vtocbuf, &partno, ep) == -1) { + (void) close(fd); + return (-1); + } + + /* close device */ + (void) close(fd); /* sd/ssd bug */ + + /* return timestamp, success */ + *stamp = vtocbuf.timestamp[partno]; + return (0); +} + +/* + * returns + * 0 on success, + * ENOTSUP if it's not a device with a vtoc + * -1 on failure + */ +int +setdevstamp( + mddrivename_t *dnp, + time_t *stamp, /* returned timestamp */ + md_error_t *ep +) +{ + int fd; + int partno; + struct vtoc vtocbuf; + time_t now = time(NULL); + mdname_t *np; + + if ((np = metaslicename(dnp, MD_SLICE0, ep)) == NULL) + return (-1); + + /* open for vtoc */ + if ((fd = open(np->rname, O_RDWR | O_NDELAY, 0)) < 0) + return (mdsyserror(ep, errno, 
np->cname)); + + if (meta_getvtoc(fd, np->cname, &vtocbuf, &partno, ep) == -1) { + (void) close(fd); + if (partno == VT_ENOTSUP) + return (ENOTSUP); + else + return (-1); + } + + *stamp = vtocbuf.timestamp[partno] = now; + + if (meta_setvtoc(fd, np->cname, &vtocbuf, ep) == -1) { + (void) close(fd); + return (-1); + } + + /* Clear the timestamp */ + vtocbuf.timestamp[partno] = 0; + + if (meta_getvtoc(fd, np->cname, &vtocbuf, &partno, ep) == -1) { + (void) close(fd); + return (-1); + } + + (void) close(fd); /* sd/ssd bug */ + + if (*stamp != vtocbuf.timestamp[partno]) + return (mddeverror(ep, MDE_CANTVERIFY_VTOC, NODEV64, + np->cname)); + + return (0); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_error.c b/usr/src/lib/lvm/libmeta/common/meta_error.c new file mode 100644 index 0000000000..0c359f344b --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_error.c @@ -0,0 +1,2309 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. + */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +/* + * print metedevice errors + */ + +#include <meta.h> +#include <sys/lvm/md_mddb.h> + +#include <syslog.h> + +/* + * clear error + */ +void +mdclrerror( + md_error_t *ep +) +{ + if (ep->name != NULL) + Free(ep->name); + if (ep->host != NULL) + Free(ep->host); + if (ep->extra != NULL) + Free(ep->extra); + (void) memset(ep, '\0', sizeof (*ep)); +} + +/* + * cook names + */ +static char * +md_name( + minor_t mnum +) +{ + char *name; + + /* get name, or fake it */ + if ((name = get_mdname(mnum)) == NULL) { + char buf[40]; + + (void) sprintf(buf, "%lu/d%lu", MD_MIN2SET(mnum), + MD_MIN2UNIT(mnum)); + return (Strdup(buf)); + } + return (Strdup(name)); +} + +static char * +dev_name( + set_t setno, + md_dev64_t dev +) +{ + char *name; + + /* get name or fake it */ + if (dev == NODEV64) + return (Strdup(dgettext(TEXT_DOMAIN, "unknown device"))); + if ((name = get_devname(setno, dev)) == NULL) { + char buf[40]; + + (void) sprintf(buf, "%lu.%lu", meta_getmajor(dev), + meta_getminor(dev)); + return (Strdup(buf)); + } + return (Strdup(name)); +} + +static char * +hsp_name( + hsp_t hsp +) +{ + char *name; + + if ((name = get_hspname(hsp)) == NULL) { + char buf[40]; + + (void) sprintf(buf, "%u/hsp%03u", HSP_SET(hsp), HSP_ID(hsp)); + return (Strdup(buf)); + } + return (Strdup(name)); +} + +static char * +set_name( + set_t setno +) +{ + mdsetname_t *sp; + md_error_t xep = mdnullerror; + + if (setno == MD_SET_BAD) + return (NULL); + + if ((sp = metasetnosetname(setno, &xep)) == NULL) { + char buf[40]; + + mdclrerror(&xep); + (void) sprintf(buf, "setno %u", setno); + return (Strdup(buf)); + } + return (Strdup(sp->setname)); +} + +/* + * fill in all the appropriate md_error_t fields + */ +static void +metacookerror( + md_error_t *ep, /* generic error */ + 
char *name /* optional name or host */ +) +{ + /* get host name */ + if (ep->host != NULL) { + Free(ep->host); + ep->host = NULL; + } + if ((ep->info.errclass == MDEC_RPC) && + (name != NULL) && (*name != '\0')) { + ep->host = Strdup(name); + name = NULL; + } else + ep->host = Strdup(mynode()); + + /* get appropriate name */ + if (ep->name != NULL) { + Free(ep->name); + ep->name = NULL; + } + if ((name != NULL) && (*name != '\0')) { + ep->name = Strdup(name); + } else { + switch (ep->info.errclass) { + + /* can't do anything about these */ + case MDEC_VOID: + case MDEC_SYS: + case MDEC_RPC: + default: + break; + + /* device name */ + case MDEC_DEV: + { + md_dev_error_t *ip = + &ep->info.md_error_info_t_u.dev_error; + + ep->name = dev_name(MD_SET_BAD, ip->dev); + break; + } + + /* device name */ + case MDEC_USE: + { + md_use_error_t *ip = + &ep->info.md_error_info_t_u.use_error; + + ep->name = dev_name(MD_SET_BAD, ip->dev); + if (ip->where == NULL) { + ip->where = Strdup(dgettext(TEXT_DOMAIN, + "unknown")); + } + break; + } + + /* metadevice name */ + case MDEC_MD: + { + md_md_error_t *ip = + &ep->info.md_error_info_t_u.md_error; + + ep->name = md_name(ip->mnum); + break; + } + + /* component name */ + case MDEC_COMP: + { + md_comp_error_t *ip = + &ep->info.md_error_info_t_u.comp_error; + char *mdname, *devname; + size_t len; + + mdname = md_name(ip->comp.mnum); + devname = dev_name(MD_MIN2SET(ip->comp.mnum), + ip->comp.dev); + len = strlen(mdname) + strlen(": ") + + strlen(devname) + 1; + ep->name = Malloc(len); + (void) snprintf(ep->name, len, "%s: %s", + mdname, devname); + Free(mdname); + Free(devname); + break; + } + + /* hotspare pool name */ + case MDEC_HSP: + { + md_hsp_error_t *ip = + &ep->info.md_error_info_t_u.hsp_error; + + ep->name = hsp_name(ip->hsp); + break; + } + + /* hotspare name */ + case MDEC_HS: + { + md_hs_error_t *ip = + &ep->info.md_error_info_t_u.hs_error; + char *hspname, *devname; + size_t len; + + hspname = hsp_name(ip->hs.hsp); + 
devname = dev_name(HSP_SET(ip->hs.hsp), ip->hs.dev); + len = strlen(hspname) + strlen(": ") + + strlen(devname) + 1; + ep->name = Malloc(len); + (void) snprintf(ep->name, len, "%s: %s", + hspname, devname); + Free(hspname); + Free(devname); + break; + } + + /* mddb name */ + case MDEC_MDDB: + { + md_mddb_error_t *ip = + &ep->info.md_error_info_t_u.mddb_error; + if (ip->mnum != NODEV32) + ep->name = md_name(ip->mnum); + ep->name = set_name(ip->setno); + break; + } + + /* set name */ + case MDEC_DS: + { + md_ds_error_t *ip = + &ep->info.md_error_info_t_u.ds_error; + + ep->name = set_name(ip->setno); + break; + } + } + } +} + +/* + * simple error + */ +int +mderror( + md_error_t *ep, + md_void_errno_t errnum, + char *name +) +{ + md_void_error_t *ip = &ep->info.md_error_info_t_u.void_error; + + mdclrerror(ep); + ep->info.errclass = MDEC_VOID; + ip->errnum = errnum; + + metacookerror(ep, name); + return (-1); +} + +/* + * system error + */ +int +mdsyserror( + md_error_t *ep, + int errnum, + char *name +) +{ + md_sys_error_t *ip = &ep->info.md_error_info_t_u.sys_error; + + mdclrerror(ep); + if (errnum != 0) { + ep->info.errclass = MDEC_SYS; + ip->errnum = errnum; + } + + metacookerror(ep, name); + return (-1); +} + +/* + * RPC error + */ +int +mdrpcerror( + md_error_t *ep, + CLIENT *clntp, + char *host, + char *extra +) +{ + md_rpc_error_t *ip = &ep->info.md_error_info_t_u.rpc_error; + struct rpc_err rpcerr; + + mdclrerror(ep); + clnt_geterr(clntp, &rpcerr); + ep->info.errclass = MDEC_RPC; + ip->errnum = rpcerr.re_status; + + metacookerror(ep, host); + mderrorextra(ep, extra); + return (-1); +} + +/* + * RPC create error + */ +int +mdrpccreateerror( + md_error_t *ep, + char *host, + char *extra +) +{ + md_rpc_error_t *ip = &ep->info.md_error_info_t_u.rpc_error; + + mdclrerror(ep); + ep->info.errclass = MDEC_RPC; + ip->errnum = rpc_createerr.cf_stat; + + metacookerror(ep, host); + mderrorextra(ep, extra); + return (-1); +} + +/* + * device error + */ +int +mddeverror( + 
md_error_t *ep, + md_dev_errno_t errnum, + md_dev64_t dev, + char *name +) +{ + md_dev_error_t *ip = &ep->info.md_error_info_t_u.dev_error; + + mdclrerror(ep); + ep->info.errclass = MDEC_DEV; + ip->errnum = errnum; + ip->dev = dev; + + metacookerror(ep, name); + return (-1); +} + +/* + * use error + */ +int +mduseerror( + md_error_t *ep, + md_use_errno_t errnum, + md_dev64_t dev, + char *where, + char *name +) +{ + md_use_error_t *ip = &ep->info.md_error_info_t_u.use_error; + + assert(where != NULL); + mdclrerror(ep); + ep->info.errclass = MDEC_USE; + ip->errnum = errnum; + ip->dev = dev; + ip->where = Strdup(where); + + metacookerror(ep, name); + return (-1); +} + +/* + * overlap error + */ +int +mdoverlaperror( + md_error_t *ep, + md_overlap_errno_t errnum, + char *name, + char *where, + char *overlap +) +{ + md_overlap_error_t *ip = + &ep->info.md_error_info_t_u.overlap_error; + + assert(overlap != NULL); + mdclrerror(ep); + ep->info.errclass = MDEC_OVERLAP; + ip->errnum = errnum; + ip->overlap = Strdup(overlap); + ip->where = NULL; + if (where != NULL) + ip->where = Strdup(where); + + metacookerror(ep, name); + return (-1); +} + +/* + * metadevice error + */ +int +mdmderror( + md_error_t *ep, + md_md_errno_t errnum, + minor_t mnum, + char *name +) +{ + md_md_error_t *ip = &ep->info.md_error_info_t_u.md_error; + + mdclrerror(ep); + ep->info.errclass = MDEC_MD; + ip->errnum = errnum; + ip->mnum = mnum; + + metacookerror(ep, name); + return (-1); +} + +/* + * component error + */ +int +mdcomperror( + md_error_t *ep, + md_comp_errno_t errnum, + minor_t mnum, + md_dev64_t dev, + char *name +) +{ + md_comp_error_t *ip = &ep->info.md_error_info_t_u.comp_error; + + mdclrerror(ep); + ep->info.errclass = MDEC_COMP; + ip->errnum = errnum; + ip->comp.mnum = mnum; + ip->comp.dev = dev; + + metacookerror(ep, name); + return (-1); +} + +/* + * hotspare pool error + */ +int +mdhsperror( + md_error_t *ep, + md_hsp_errno_t errnum, + hsp_t hsp, + char *name +) +{ + md_hsp_error_t 
*ip = &ep->info.md_error_info_t_u.hsp_error; + + mdclrerror(ep); + ep->info.errclass = MDEC_HSP; + ip->errnum = errnum; + ip->hsp = hsp; + + metacookerror(ep, name); + return (-1); +} + +/* + * hotspare error + */ +int +mdhserror( + md_error_t *ep, + md_hs_errno_t errnum, + hsp_t hsp, + md_dev64_t dev, + char *name +) +{ + md_hs_error_t *ip = &ep->info.md_error_info_t_u.hs_error; + + mdclrerror(ep); + ep->info.errclass = MDEC_HS; + ip->errnum = errnum; + ip->hs.hsp = hsp; + ip->hs.dev = dev; + + metacookerror(ep, name); + return (-1); +} + +/* + * MDDB error + */ +int +mdmddberror( + md_error_t *ep, + md_mddb_errno_t errnum, + minor_t mnum, + set_t setno, + size_t size, + char *name +) +{ + md_mddb_error_t *ip = &ep->info.md_error_info_t_u.mddb_error; + + mdclrerror(ep); + ep->info.errclass = MDEC_MDDB; + ip->errnum = errnum; + ip->mnum = mnum; + ip->setno = setno; + ip->size = size; + + metacookerror(ep, name); + return (-1); +} + +/* + * metadevice diskset (ds) error + */ +int +mddserror( + md_error_t *ep, + md_ds_errno_t errnum, + set_t setno, + char *node, + char *drive, + char *name +) +{ + md_ds_error_t *ip = &ep->info.md_error_info_t_u.ds_error; + + mdclrerror(ep); + ep->info.errclass = MDEC_DS; + ip->errnum = errnum; + ip->setno = setno; + ip->node = ((node != NULL) ? Strdup(node) : NULL); + ip->drive = ((drive != NULL) ? 
Strdup(drive) : NULL); + + metacookerror(ep, name); + return (-1); +} + +/* + * clear/attach extra context information + */ +void +mderrorextra( + md_error_t *ep, + char *extra +) +{ + if (ep->extra != NULL) + Free(ep->extra); + if (extra != NULL) + ep->extra = Strdup(extra); + else + ep->extra = NULL; +} + +/* + * steal (copy) an error code safely + */ +int +mdstealerror( + md_error_t *to, + md_error_t *from +) +{ + mdclrerror(to); + *to = *from; + (void) memset(from, '\0', sizeof (*from)); + return (-1); +} + +/* + * do an ioctl, cook the error, and return status + */ +int +metaioctl( + int cmd, + void *data, + md_error_t *ep, + char *name +) +{ + int fd; + + /* open admin device */ + if ((fd = open_admin(ep)) < 0) + return (-1); + + /* do ioctl */ + mdclrerror(ep); + if (ioctl(fd, cmd, data) != 0) { + return (mdsyserror(ep, errno, name)); + } else if (! mdisok(ep)) { + metacookerror(ep, name); + return (-1); + } + + /* return success */ + return (0); +} + +/* + * print void class errors + */ +static char * +void_to_str( + md_error_t *ep, + char *buf, + size_t size +) +{ + md_void_error_t *ip = &ep->info.md_error_info_t_u.void_error; + char *p = buf + strlen(buf); + size_t psize = size - strlen(buf); + + switch (ip->errnum) { + case MDE_NONE: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "no error")); + break; + case MDE_UNIT_NOT_FOUND: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "unit not found")); + break; + case MDE_DUPDRIVE: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "drive specified more than once")); + break; + case MDE_INVAL_HSOP: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "illegal hot spare operation")); + break; + case MDE_NO_SET: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "no such set")); + break; + case MDE_SET_DIFF: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "set name is inconsistent")); + break; + case MDE_BAD_RD_OPT: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "invalid read option")); + 
break; + case MDE_BAD_WR_OPT: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "invalid write option")); + break; + case MDE_BAD_PASS_NUM: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "invalid pass number")); + break; + case MDE_BAD_RESYNC_OPT: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "invalid resync option")); + break; + case MDE_BAD_INTERLACE: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "invalid interlace")); + break; + case MDE_NO_HSPS: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "no hotspare pools found")); + break; + case MDE_NOTENOUGH_DB: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "must have at least 1 database (-f overrides)")); + break; + case MDE_DELDB_NOTALLOWED: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "cannot delete the last database replica in the diskset")); + break; + case MDE_DEL_VALIDDB_NOTALLOWED: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "Deleting specified valid replicas results in stale " + "state database. 
Configuration changes with stale " + "database result in panic(-f overrides)")); + break; + case MDE_SYSTEM_FILE: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "error in system file")); + break; + case MDE_MDDB_FILE: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "error in mddb.cf file")); + break; + case MDE_MDDB_CKSUM: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "checksum error in mddb.cf file")); + break; + case MDE_VFSTAB_FILE: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "error in vfstab file")); + break; + case MDE_NOSLICE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "invalid slice number for drive name")); + break; + case MDE_SYNTAX: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "syntax error")); + break; + case MDE_OPTION: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "illegal option")); + break; + case MDE_TAKE_OWN: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "failed to reserve any drives")); + break; + case MDE_NOT_DRIVENAME: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "not a valid drive name")); + break; + case MDE_RESERVED: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "reserved by another host")); + break; + case MDE_DVERSION: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "driver version mismatch")); + break; + case MDE_MVERSION: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "metadevice state database version mismatch")); + break; + case MDE_TESTERROR: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "TEST ERROR MESSAGE")); + break; + case MDE_BAD_ORIG_NCOL: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "invalid column count")); + break; + case MDE_RAID_INVALID: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "devices were not RAIDed previously or " + "are specified in the wrong order")); + break; + case MDE_MED_ERROR: + break; + case MDE_TOOMANYMED: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "too many mediator hosts requested")); + break; + case 
MDE_NOMED: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "no mediator hosts found")); + break; + case MDE_ONLYNODENAME: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "only the nodename of a host is required for deletes")); + break; + case MDE_RAID_BAD_PW_CNT: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "simultaneous writes out of range")); + break; + case MDE_DEVID_TOOBIG: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "relocation information size is greater than reported")); + break; + case MDE_NOPERM: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "Permission denied. You must have root privilege " + "to execute this command.")); + break; + case MDE_NODEVID: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "Device relocation information not available " + "for this device")); + break; + case MDE_NOROOT: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "no root filesystem in /etc/mnttab")); + break; + case MDE_EOF_TRANS: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + MD_EOF_TRANS_MSG)); + break; + case MDE_NOT_MN: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "option only valid within a multi-owner set")); + break; + case MDE_ABR_SET: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "Invalid command for mirror with ABR set")); + break; + case MDE_INVAL_MNOP: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "Invalid operation on multi-owner set")); + break; + case MDE_MNSET_NOTRANS: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "Trans metadevice not supported on multi-owner set")); + break; + case MDE_MNSET_NORAID: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "RAID-5 metadevice not supported on multi-owner set")); + break; + case MDE_FORCE_DEL_ALL_DRV: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "Must specify -f option to delete all drives from set")); + break; + case MDE_STRIPE_TRUNC_SINGLE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "The necessary rounding would result in data loss. 
" + "You can avoid this by concatenating additional devices " + "totaling at least %s blocks, or by increasing the size " + "of the specified component by exactly %s blocks."), + ep->extra, ep->extra); + break; + case MDE_STRIPE_TRUNC_MULTIPLE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "The necessary rounding would result in data loss. " + "You can avoid this by concatenating additional devices " + "totaling at least %s blocks."), ep->extra); + break; + case MDE_SMF_FAIL: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "failed to enable/disable SVM service")); + break; + case MDE_SMF_NO_SERVICE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "service(s) not online in SMF")); + break; + default: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "unknown void error code %d"), ip->errnum); + break; + } + + return (buf); +} + +/* + * print sys class errors + */ +static char * +sys_to_str( + md_error_t *ep, + char *buf, + size_t size +) +{ + md_sys_error_t *ip = &ep->info.md_error_info_t_u.sys_error; + char *emsg; + char *p = buf + strlen(buf); + size_t psize = size - strlen(buf); + + if ((emsg = strerror(ip->errnum)) == NULL) { + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "unknown errno %d out of range"), + ip->errnum); + } else { + (void) snprintf(p, psize, "%s", emsg); + } + + return (buf); +} + +/* + * print RPC class errors + */ +static char * +rpc_to_str( + md_error_t *ep, + char *buf, + size_t size +) +{ + md_rpc_error_t *ip = &ep->info.md_error_info_t_u.rpc_error; + char *p = buf + strlen(buf); + size_t psize = size - strlen(buf); + + (void) snprintf(p, psize, "%s", clnt_sperrno(ip->errnum)); + return (buf); +} + +/* + * print dev class errors + */ +static char * +dev_to_str( + md_error_t *ep, + char *buf, + size_t size +) +{ + md_dev_error_t *ip = &ep->info.md_error_info_t_u.dev_error; + char *p = buf + strlen(buf); + size_t psize = size - strlen(buf); + + switch (ip->errnum) { + case MDE_INVAL_HS: + (void) snprintf(p, psize, + 
dgettext(TEXT_DOMAIN, "hotspare doesn't exist")); + break; + case MDE_FIX_INVAL_STATE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "cannot enable hotspared device")); + break; + case MDE_FIX_INVAL_HS_STATE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "hotspare isn't broken, can't enable")); + break; + case MDE_NOT_META: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "not a metadevice")); + break; + case MDE_IS_DUMP: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "is a dump device")); + break; + case MDE_IS_META: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "is a metadevice")); + break; + case MDE_IS_SWAPPED: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "is swapped on")); + break; + case MDE_NAME_SPACE: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "namespace error")); + break; + case MDE_IN_SHARED_SET: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "device in shared set")); + break; + case MDE_NOT_IN_SET: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "device not in set")); + break; + case MDE_NOT_DISK: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "not a disk device")); + break; + case MDE_CANT_CONFIRM: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "can't confirm device")); + break; + case MDE_INVALID_PART: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "invalid partition")); + break; + case MDE_HAS_MDDB: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "has a metadevice database replica")); + break; + case MDE_NO_DB: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "no metadevice database replica on device")); + break; + case MDE_CANTVERIFY_VTOC: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "unable to verify the vtoc")); + break; + case MDE_NOT_LOCAL: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "not in local set")); + break; + case MDE_DEVICES_NAME: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "can't parse /devices name")); + break; + case MDE_REPCOMP_INVAL: + 
(void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "replica slice is not usable as a metadevice component")); + break; + case MDE_REPCOMP_ONLY: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "only replica slice is usable for a diskset " + "database replica")); + break; + case MDE_INV_ROOT: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "invalid root device for this operation")); + break; + case MDE_MULTNM: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "multiple entries for device in Solaris Volume Manager " + "configuration")); + break; + case MDE_TOO_MANY_PARTS: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "Disks with more than %d partitions are not supported " + "in Solaris Volume Manager"), MD_MAX_PARTS); + break; + case MDE_REPART_REPLICA: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "cannot repartition a slice with an existing replica")); + break; + default: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "unknown dev error code %d"), + ip->errnum); + break; + } + + return (buf); +} + +/* + * print overlap class errors + */ +static char * +overlap_to_str( + md_error_t *ep, + char *buf, + size_t size +) +{ + md_overlap_error_t *ip = + &ep->info.md_error_info_t_u.overlap_error; + char *p = buf + strlen(buf); + size_t psize = size - strlen(buf); + + switch (ip->errnum) { + case MDE_OVERLAP_MOUNTED: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "overlaps with %s which is mounted as \'%s\'"), + ip->overlap, ip->where); + break; + case MDE_OVERLAP_SWAP: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "overlaps with %s which is a swap device"), ip->overlap); + break; + case MDE_OVERLAP_DUMP: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "overlaps with %s which is the dump device"), ip->overlap); + break; + default: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "unknown overlap error code %d"), ip->errnum); + break; + } + + return (buf); +} + +/* + * print use class errors + */ +static char * +use_to_str( + 
md_error_t *ep, + char *buf, + size_t size +) +{ + md_use_error_t *ip = &ep->info.md_error_info_t_u.use_error; + char *p = buf + strlen(buf); + size_t psize = size - strlen(buf); + + switch (ip->errnum) { + case MDE_IS_MOUNTED: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "is mounted on %s"), + ip->where); + break; + case MDE_ALREADY: + /* + * when the object of the error (existing device that + * would being used by SVM) is the metadb then it is necessary + * to explicitly specify the string in the error message so + * that it can be successfully localized for the Asian locales. + */ + if (strcmp(ip->where, MDB_STR) != 0) { + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "has appeared more than once in the " + "specification of %s"), ip->where); + } else { + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "has appeared more than once in the " + "specification of " MDB_STR)); + } + break; + case MDE_OVERLAP: + /* + * when the object of the error (existing device that + * would overlap) is the metadb then it is necessary + * to explicitly specify the string in the error message so + * that it can be successfully localized for the Asian locales. 
+ */ + if (strcmp(ip->where, MDB_STR) != 0) { + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "overlaps with device in %s"), + ip->where); + } else { + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "overlaps with device in " + MDB_STR)); + } + break; + default: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "unknown dev error code %d"), ip->errnum); + break; + } + + return (buf); +} + +/* + * print md class errors + */ +static char * +md_to_str( + md_error_t *ep, + char *buf, + size_t size +) +{ + md_md_error_t *ip = &ep->info.md_error_info_t_u.md_error; + char *p = buf + strlen(buf); + size_t psize = size - strlen(buf); + + switch (ip->errnum) { + case MDE_INVAL_UNIT: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "invalid unit")); + break; + case MDE_UNIT_NOT_SETUP: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "unit not set up")); + break; + case MDE_UNIT_ALREADY_SETUP: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "unit already set up")); + break; + case MDE_NOT_MM: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "unit is not a mirror")); + break; + case MDE_IS_SM: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "illegal to clear submirror")); + break; + case MDE_IS_OPEN: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "metadevice is open")); + break; + case MDE_C_WITH_INVAL_SM: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "attempted to clear mirror with submirror(s) " + "in invalid state")); + break; + case MDE_RESYNC_ACTIVE: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "resync in progress")); + break; + case MDE_LAST_SM_RE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "attempt to replace a component on the last " + "running submirror")); + break; + case MDE_MIRROR_FULL: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "mirror has maximum number of submirrors")); + break; + case MDE_IN_UNAVAIL_STATE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "component is in unavailable state; run 
'metastat -i'")); + break; + case MDE_IN_USE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "metadevice in use")); + break; + case MDE_SM_TOO_SMALL: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "submirror too small to attach")); + break; + case MDE_NO_LABELED_SM: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "can't attach labeled submirror to an unlabeled mirror")); + break; + case MDE_SM_OPEN_ERR: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "submirror open error")); + break; + case MDE_CANT_FIND_SM: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "can't find submirror in mirror")); + break; + case MDE_LAST_SM: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "attempt to detach last running submirror")); + break; + case MDE_NO_READABLE_SM: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "operation would result in no readable submirrors")); + break; + case MDE_SM_FAILED_COMPS: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "attempt an operation on a submirror " + "that has erred components")); + break; + case MDE_ILLEGAL_SM_STATE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "attempt operation on a submirror in illegal state")); + break; + case MDE_RR_ALLOC_ERROR: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "attach failed, unable to allocate new resync info")); + break; + case MDE_MIRROR_OPEN_FAILURE: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "insufficient devices to open")); + break; + case MDE_MIRROR_THREAD_FAILURE: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "mirror thread failure")); + break; + case MDE_GROW_DELAYED: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "growing of metadevice delayed")); + break; + case MDE_NOT_MT: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "unit is not a trans")); + break; + case MDE_HS_IN_USE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "can't modify hot spare pool, hot spare in use")); + break; + case MDE_HAS_LOG: + (void) snprintf(p, 
psize, + dgettext(TEXT_DOMAIN, "already has log")); + break; + case MDE_UNKNOWN_TYPE: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "unknown metadevice type")); + break; + case MDE_NOT_STRIPE: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "unit is not a concat/stripe")); + break; + case MDE_NOT_RAID: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "unit is not a RAID")); + break; + case MDE_NROWS: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "not enough stripes specified")); + break; + case MDE_NCOMPS: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "not enough components specified")); + break; + case MDE_NSUBMIRS: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "not enough submirrors specified")); + break; + case MDE_BAD_STRIPE: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "invalid stripe configuration")); + break; + case MDE_BAD_MIRROR: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "invalid mirror configuration")); + break; + case MDE_BAD_TRANS: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "invalid trans configuration")); + break; + case MDE_BAD_RAID: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "invalid RAID configuration")); + break; + case MDE_RAID_OPEN_FAILURE: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "resync unable to open RAID unit")); + break; + case MDE_RAID_THREAD_FAILURE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "attempt to start resync thread failed")); + break; + case MDE_RAID_NEED_FORCE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "operation requires -f (force) flag")); + break; + case MDE_NO_LOG: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "log has already been detached")); + break; + case MDE_RAID_DOI: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "only valid action is metaclear")); + break; + case MDE_RAID_LAST_ERRED: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "in Last Erred state, " + "errored components must be replaced")); + break; + 
case MDE_RAID_NOT_OKAY: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "all components must be Okay to perform this operation")); + break; + case MDE_RENAME_BUSY: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "metadevice is temporarily too busy for renames")); + break; + case MDE_RENAME_SOURCE_BAD: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "source metadevice is not able to be renamed")); + break; + case MDE_RENAME_TARGET_BAD: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "target metadevice is not able to be renamed")); + break; + case MDE_RENAME_TARGET_UNRELATED: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "target metadevice is not related to source metadevice")); + break; + case MDE_RENAME_CONFIG_ERROR: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "metadevice driver configuration error; " + "rename can't occur")); + break; + case MDE_RENAME_ORDER: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "units may not be renamed in that order")); + break; + case MDE_RECOVER_FAILED: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "recovery failed")); + break; + case MDE_SP_NOSPACE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "not enough space available for request")); + break; + case MDE_SP_BADWMREAD: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "error reading extent header")); + break; + case MDE_SP_BADWMWRITE: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "error writing extent header")); + break; + case MDE_SP_BADWMMAGIC: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "bad magic number in extent header")); + break; + case MDE_SP_BADWMCRC: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "bad checksum in extent header")); + break; + case MDE_NOT_SP: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "unit is not a soft partition")); + break; + case MDE_SP_OVERLAP: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "overlapping extents specified")); + break; + case MDE_SP_BAD_LENGTH: + (void) 
snprintf(p, psize, + dgettext(TEXT_DOMAIN, "bad length specified")); + break; + case MDE_SP_NOSP: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "no soft partitions on this device")); + break; + case MDE_UNIT_TOO_LARGE: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "Volume size cannot exceed 1 TByte")); + break; + case MDE_LOG_TOO_LARGE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "Trans log size must be less than 1 TByte")); + break; + default: + (void) snprintf(p, psize, + dgettext(TEXT_DOMAIN, "unknown md error code %d"), + ip->errnum); + break; + } + + return (buf); +} + +/* + * print comp class errors + */ +static char * +comp_to_str( + md_error_t *ep, + char *buf, + size_t size +) +{ + md_comp_error_t *ip = &ep->info.md_error_info_t_u.comp_error; + char *p = buf + strlen(buf); + size_t psize = size - strlen(buf); + + switch (ip->errnum) { + case MDE_CANT_FIND_COMP: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "can't find component in unit")); + break; + case MDE_REPL_INVAL_STATE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "component in invalid state to replace - " + "Replace \"Maintenance\" components first")); + break; + case MDE_COMP_TOO_SMALL: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "replace failure, new component is too small")); + break; + case MDE_COMP_OPEN_ERR: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "unable to open concat/stripe component")); + break; + case MDE_RAID_COMP_ERRED: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "must replace errored component first")); + break; + case MDE_MAXIO: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "maxtransfer is too small")); + break; + case MDE_SP_COMP_OPEN_ERR: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "error opening device under soft partition. 
Check" + " device status, then use metadevadm(1M).")); + break; + default: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "unknown comp error code %d"), ip->errnum); + break; + } + + return (buf); +} + +/* + * print hsp class errors + */ +static char * +hsp_to_str( + md_error_t *ep, + char *buf, + size_t size +) +{ + md_hsp_error_t *ip = &ep->info.md_error_info_t_u.hsp_error; + char *p = buf + strlen(buf); + size_t psize = size - strlen(buf); + + switch (ip->errnum) { + case MDE_HSP_CREATE_FAILURE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "hotspare pool database create failure")); + break; + case MDE_HSP_IN_USE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "hotspare pool in use")); + break; + case MDE_INVAL_HSP: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "invalid hotspare pool")); + break; + case MDE_HSP_BUSY: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "hotspare pool is busy")); + break; + case MDE_HSP_REF: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "hotspare pool is referenced")); + break; + case MDE_HSP_ALREADY_SETUP: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "hotspare pool is already setup")); + break; + case MDE_BAD_HSP: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "invalid hotspare pool configuration")); + break; + case MDE_HSP_UNIT_TOO_LARGE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "units in the hotspare pool cannot exceed 1 TByte")); + break; + default: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "unknown hsp error code %d"), ip->errnum); + break; + } + + return (buf); +} + +/* + * print hs class errors + */ +static char * +hs_to_str( + md_error_t *ep, + char *buf, + size_t size +) +{ + md_hs_error_t *ip = &ep->info.md_error_info_t_u.hs_error; + char *p = buf + strlen(buf); + size_t psize = size - strlen(buf); + + switch (ip->errnum) { + case MDE_HS_RESVD: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "hotspare is in use")); + break; + case MDE_HS_CREATE_FAILURE: + 
(void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "hotspare database create failure")); + break; + case MDE_HS_INUSE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "add or replace failed, hot spare is already in use")); + break; + case MDE_HS_UNIT_TOO_LARGE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "hotspare size cannot exceed 1 TByte")); + break; + default: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "unknown hs error code %d"), ip->errnum); + break; + } + + return (buf); +} + +/* + * print mddb class errors + */ +static char * +mddb_to_str( + md_error_t *ep, + char *buf, + size_t size +) +{ + md_mddb_error_t *ip = &ep->info.md_error_info_t_u.mddb_error; + char *p = buf + strlen(buf); + size_t psize = size - strlen(buf); + + switch (ip->errnum) { + case MDE_TOOMANY_REPLICAS: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "%d metadevice database replicas is too many; the maximum is %d"), + ip->size, MDDB_NLB); + break; + case MDE_REPLICA_TOOSMALL: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "device size %d is too small for metadevice database replica"), + ip->size); + break; + case MDE_NOTVERIFIED: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "data not returned correctly from disk")); + break; + case MDE_DB_INVALID: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "invalid argument")); + break; + case MDE_DB_EXISTS: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "metadevice database replica exists on device")); + break; + case MDE_DB_MASTER: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "has bad master block on device")); + break; + case MDE_DB_TOOSMALL: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "device is too small")); + break; + case MDE_DB_NORECORD: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "no such metadevice database record")); + break; + case MDE_DB_NOSPACE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "metadevice database is full, can't create new records")); + break; + 
case MDE_DB_NOTNOW: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "metadevice database has too few replicas, for " + "metadevice database operation")); + break; + case MDE_DB_NODB: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "there are no existing databases")); + break; + case MDE_DB_NOTOWNER: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "not owner of metadevice database")); + break; + case MDE_DB_STALE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "stale databases")); + break; + case MDE_DB_TOOFEW: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "not enough databases")); + break; + case MDE_DB_TAGDATA: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "tagged data detected, user intervention required")); + break; + case MDE_DB_ACCOK: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "50% replicas & 50% mediator hosts available, " + "user intervention required")); + break; + case MDE_DB_NTAGDATA: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "no tagged data available or only one tag found")); + break; + case MDE_DB_ACCNOTOK: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "50% replicas & 50% mediator hosts not available")); + break; + case MDE_DB_NOLOCBLK: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "no valid locator blocks were found")); + break; + case MDE_DB_NOLOCNMS: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "no valid locator name information was found")); + break; + case MDE_DB_NODIRBLK: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "no valid directory blocks were found")); + break; + case MDE_DB_NOTAGREC: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "no tag record was allocated, so data " + "tagging is disabled")); + break; + case MDE_DB_NOTAG: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "no tag records exist or no matching tag was found")); + break; + case MDE_DB_BLKRANGE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "logical block number %d out of range"), ip->size); 
+ break; + default: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "unknown mddb error code %d"), ip->errnum); + break; + } + + return (buf); +} + +/* + * print diskset (ds) class errors + */ +static char * +ds_to_str( + md_error_t *ep, + char *buf, + size_t size +) +{ + md_ds_error_t *ip = &ep->info.md_error_info_t_u.ds_error; + char *p = buf + strlen(buf); + size_t psize = size - strlen(buf); + + switch (ip->errnum) { + case MDE_DS_DUPHOST: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "host %s is specified more than once"), ip->node); + break; + case MDE_DS_NOTNODENAME: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "\"%s\" is not a nodename, but a network name"), ip->node); + break; + case MDE_DS_SELFNOTIN: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "nodename of host %s creating the set must be included"), + ip->node); + break; + case MDE_DS_NODEHASSET: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "host %s already has set"), ip->node); + break; + case MDE_DS_NODENOSET: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "host %s does not have set"), ip->node); + break; + case MDE_DS_NOOWNER: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "must be owner of the set for this command")); + break; + case MDE_DS_NOTOWNER: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "only the current owner %s may operate on this set"), + ip->node); + break; + case MDE_DS_NODEISNOTOWNER: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "host %s is not the owner"), ip->node); + break; + case MDE_DS_NODEINSET: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "host %s is already in the set"), ip->node); + break; + case MDE_DS_NODENOTINSET: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "host %s is not in the set"), ip->node); + break; + case MDE_DS_SETNUMBUSY: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "host %s already has a set numbered %ld"), + ip->node, ip->setno); + break; + case MDE_DS_SETNUMNOTAVAIL: + (void) 
snprintf(p, psize, dgettext(TEXT_DOMAIN, + "no available set numbers")); + break; + case MDE_DS_SETNAMEBUSY: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "set name is in-use or invalid on host %s"), ip->node); + break; + case MDE_DS_DRIVENOTCOMMON: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "drive %s is not common with host %s"), + ip->drive, ip->node); + break; + case MDE_DS_DRIVEINSET: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "drive %s is in set %s"), ip->drive, ip->node); + break; + case MDE_DS_DRIVENOTINSET: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "drive %s is not in set"), ip->drive); + break; + case MDE_DS_DRIVEINUSE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "drive %s is in use"), ip->drive); + break; + case MDE_DS_DUPDRIVE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "drive %s is specified more than once"), ip->drive); + break; + case MDE_DS_INVALIDSETNAME: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "set name contains invalid characters")); + break; + case MDE_DS_HASDRIVES: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "unable to delete set, it still has drives")); + break; + case MDE_DS_SIDENUMNOTAVAIL: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "maximum number of nodenames exceeded")); + break; + case MDE_DS_SETNAMETOOLONG: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "set name is too long")); + break; + case MDE_DS_NODENAMETOOLONG: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "host name %s is too long"), ip->node); + break; + case MDE_DS_OHACANTDELSELF: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, +"administrator host %s deletion disallowed in one host admin mode"), + ip->node); + break; + case MDE_DS_HOSTNOSIDE: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "side information missing for host %s"), ip->node); + break; + case MDE_DS_SETLOCKED: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "host %s is modifying set - try later or restart 
rpc.metad"), + ip->drive); + break; + case MDE_DS_ULKSBADKEY: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "set unlock failed - bad key")); + break; + case MDE_DS_LKSBADKEY: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "set lock failed - bad key")); + break; + case MDE_DS_WRITEWITHSULK: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "write operation attempted on set with set unlocked")); + break; + case MDE_DS_SETCLEANUP: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "set \"%s\" is out of date - cleaning up - take failed"), + ip->node); + break; + case MDE_DS_CANTDELSELF: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, +"administrator host %s can't be deleted, other hosts still in set\n" +"Use -f to override"), ip->node); + break; + case MDE_DS_HASMED: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "unable to delete set, it still has mediator hosts")); + break; + case MDE_DS_TOOMANYALIAS: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "%s causes there to be more aliases than allowed"), + ip->node); + break; + case MDE_DS_ISMED: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "%s is already a mediator host"), ip->node); + break; + case MDE_DS_ISNOTMED: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "%s is not a mediator host"), ip->node); + break; + case MDE_DS_INVALIDMEDNAME: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "mediator name \"%s\" contains invalid characters"), + ip->node); + break; + case MDE_DS_ALIASNOMATCH: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "mediator alias \"%s\" is not an alias for host " + "\"%s\""), ip->node, ip->drive); + break; + case MDE_DS_NOMEDONHOST: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "unable to contact %s on host \"%s\""), + MED_SERVNAME, ip->node); + break; + case MDE_DS_DRIVENOTONHOST: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "drive %s is not present on host %s"), + ip->drive, ip->node); + break; + case MDE_DS_CANTDELMASTER: + (void) 
snprintf(p, psize, dgettext(TEXT_DOMAIN, + "master %s can't be deleted, other hosts still in set"), + ip->node); + break; + case MDE_DS_NOTINMEMBERLIST: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "node %s is not in membership list"), + ip->node); + break; + case MDE_DS_MNCANTDELSELF: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "host %s can't delete self from multi-owner set\n" + "while other hosts still in set"), + ip->node); + break; + case MDE_DS_RPCVERSMISMATCH: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "host %s does not support multi-owner diskset"), + ip->node); + break; + case MDE_DS_WITHDRAWMASTER: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "master host %s cannot withdraw from multi-owner diskset " + "when other owner nodes are still present in diskset"), + ip->node); + break; + case MDE_DS_CANTRESNARF: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "imported set could not be loaded")); + break; + case MDE_DS_INSUFQUORUM: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "insufficient replica quorum detected. 
Use " + "-f to force import of the set")); + break; + case MDE_DS_EXTENDEDNM: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "multiple namespace records detected")); + break; + case MDE_DS_PARTIALSET: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "partial diskset detected\n" + "Please refer to the Solaris Volume Manager documentation," + "\nTroubleshooting section, at http://docs.sun.com or from" + "\nyour local copy")); + break; + case MDE_DS_COMMDCTL_SUSPEND_NYD: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "rpc.mdcommd on host %s is not yet drained during " + "suspend operation"), + ip->node); + break; + case MDE_DS_COMMDCTL_SUSPEND_FAIL: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "rpc.mdcommd on host %s failed suspend operation"), + ip->node); + break; + case MDE_DS_COMMDCTL_REINIT_FAIL: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "rpc.mdcommd on host %s failed reinitialization operation"), + ip->node); + break; + case MDE_DS_COMMDCTL_RESUME_FAIL: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "rpc.mdcommd on host %s failed resume operation"), + ip->node); + break; + case MDE_DS_NOTNOW_RECONFIG: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "command terminated, host %s starting reconfig cycle"), + ip->node); + break; + case MDE_DS_NOTNOW_CMD: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "metaset or metadb command already running on diskset " + "on host %s"), ip->node); + break; + case MDE_DS_COMMD_SEND_FAIL: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "rpc.mdcommd on host %s failed operation"), + ip->node); + break; + case MDE_DS_MASTER_ONLY: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "this command must be run on the master node of the set," + " which is currently %s"), ip->node); + break; + case MDE_DS_SINGLEHOST: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "diskset is auto-take; cannot accept additional hosts")); + break; + case MDE_DS_AUTONOTSET: + (void) snprintf(p, psize, 
dgettext(TEXT_DOMAIN, + "auto-take is not enabled on diskset")); + break; + case MDE_DS_INVALIDDEVID: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "Invalid device id on drive %s on host %s"), ip->drive, + ip->node); + break; + case MDE_DS_SETNOTIMP: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "Unable to import set on node %s"), ip->node); + break; + case MDE_DS_NOTSELFIDENTIFY: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "Drive %s won't be self identifying"), ip->drive); + break; + default: + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "unknown diskset error code %d"), ip->errnum); + break; + } + + return (buf); +} + +/* + * convert error to printable string + */ +static char * +mde_to_str( + md_error_t *ep +) +{ + static char buf[BUFSIZ]; + size_t bufsz; + + /* intialize buf */ + buf[0] = '\0'; + bufsz = sizeof (buf); + + /* class specific */ + switch (ep->info.errclass) { + case MDEC_VOID: + return (void_to_str(ep, buf, bufsz)); + case MDEC_SYS: + return (sys_to_str(ep, buf, bufsz)); + case MDEC_RPC: + return (rpc_to_str(ep, buf, bufsz)); + case MDEC_DEV: + return (dev_to_str(ep, buf, bufsz)); + case MDEC_USE: + return (use_to_str(ep, buf, bufsz)); + case MDEC_MD: + return (md_to_str(ep, buf, bufsz)); + case MDEC_COMP: + return (comp_to_str(ep, buf, bufsz)); + case MDEC_HSP: + return (hsp_to_str(ep, buf, bufsz)); + case MDEC_HS: + return (hs_to_str(ep, buf, bufsz)); + case MDEC_MDDB: + return (mddb_to_str(ep, buf, bufsz)); + case MDEC_DS: + return (ds_to_str(ep, buf, bufsz)); + case MDEC_OVERLAP: + return (overlap_to_str(ep, buf, bufsz)); + default: + (void) snprintf(buf, bufsz, + dgettext(TEXT_DOMAIN, "unknown error class %d"), + ep->info.errclass); + return (buf); + } +} + +/* + * print log prefix + */ +void +md_logpfx( + FILE *fp +) +{ + time_t t; + struct tm *tm; + char buf[100]; + + if ((time(&t) != (time_t)-1) && + ((tm = localtime(&t)) != NULL) && + (strftime(buf, sizeof (buf), (char *)0, tm) < sizeof (buf))) { + (void) 
fprintf(fp, "%s: ", buf); + } + (void) fprintf(fp, "%s: ", myname); +} + +/* + * varargs sperror() + */ +/*PRINTFLIKE2*/ +static char * +mde_vsperror( + md_error_t *ep, + const char *fmt, + va_list ap +) +{ + static char buf[BUFSIZ]; + size_t bufsz = sizeof (buf); + char *p = buf; + char *host1 = ""; + char *host2 = ""; + char *extra1 = ""; + char *extra2 = ""; + char *name1 = ""; + char *name2 = ""; + + /* get stuff */ + if ((ep->host != NULL) && (*(ep->host) != '\0')) { + host1 = ep->host; + host2 = ": "; + } + if ((ep->extra != NULL) && (*(ep->extra) != '\0')) { + extra1 = ep->extra; + extra2 = ": "; + } + if ((ep->name != NULL) && (*(ep->name) != '\0')) { + name1 = ep->name; + name2 = ": "; + } + + /* context */ + (void) snprintf(p, bufsz, "%s%s%s%s%s%s", + host1, host2, extra1, extra2, name1, name2); + p = &buf[strlen(buf)]; + bufsz -= strlen(buf); + + /* user defined part */ + if ((fmt != NULL) && (*fmt != '\0')) { + (void) vsnprintf(p, bufsz, fmt, ap); + p = &buf[strlen(buf)]; + bufsz = sizeof (buf) - strlen(buf); + (void) snprintf(p, bufsz, ": "); + p = &buf[strlen(buf)]; + bufsz = sizeof (buf) - strlen(buf); + } + + /* error code */ + (void) snprintf(p, bufsz, "%s\n", mde_to_str(ep)); + + /* return error message */ + return (buf); +} + +/* + * printf-like sperror() + */ +/*PRINTFLIKE2*/ +char * +mde_sperror( + md_error_t *ep, + const char *fmt, + ... +) +{ + va_list ap; + char *emsg; + + va_start(ap, fmt); + emsg = mde_vsperror(ep, fmt, ap); + va_end(ap); + return (emsg); +} + +/* + * printf-like perror() + */ +/*PRINTFLIKE2*/ +void +mde_perror( + md_error_t *ep, + const char *fmt, + ... 
+) +{ + va_list ap; + char *emsg; + + /* get error message */ + va_start(ap, fmt); + emsg = mde_vsperror(ep, fmt, ap); + va_end(ap); + assert((emsg != NULL) && (*emsg != '\0')); + + /* stderr */ + (void) fprintf(stderr, "%s: %s\n", myname, emsg); + (void) fflush(stderr); + + /* metalog */ + if (metalogfp != NULL) { + md_logpfx(metalogfp); + (void) fprintf(metalogfp, "%s\n", emsg); + (void) fflush(metalogfp); + (void) fsync(fileno(metalogfp)); + } + + /* syslog */ + if (metasyslog) { + syslog(LOG_ERR, emsg); + } +} + +/* + * printf-like perror() + */ +/*PRINTFLIKE1*/ +void +md_perror( + const char *fmt, + ... +) +{ + md_error_t status = mdnullerror; + va_list ap; + char *emsg; + + /* get error message */ + (void) mdsyserror(&status, errno, NULL); + va_start(ap, fmt); + emsg = mde_vsperror(&status, fmt, ap); + va_end(ap); + assert((emsg != NULL) && (*emsg != '\0')); + mdclrerror(&status); + + /* stderr */ + (void) fprintf(stderr, "%s: %s\n", myname, emsg); + (void) fflush(stderr); + + /* metalog */ + if (metalogfp != NULL) { + md_logpfx(metalogfp); + (void) fprintf(metalogfp, "%s\n", emsg); + (void) fflush(metalogfp); + (void) fsync(fileno(metalogfp)); + } + + /* syslog */ + if (metasyslog) { + syslog(LOG_ERR, emsg); + } +} + +/* + * printf-like log + */ +/*PRINTFLIKE1*/ +void +md_eprintf( + const char *fmt, + ... +) +{ + va_list ap; + + /* begin */ + va_start(ap, fmt); + + /* stderr */ + (void) fprintf(stderr, "%s: ", myname); + (void) vfprintf(stderr, fmt, ap); + (void) fflush(stderr); + + /* metalog */ + if (metalogfp != NULL) { + md_logpfx(metalogfp); + (void) vfprintf(metalogfp, fmt, ap); + (void) fflush(metalogfp); + (void) fsync(fileno(metalogfp)); + } + + /* syslog */ + if (metasyslog) { + vsyslog(LOG_ERR, fmt, ap); + } + + /* end */ + va_end(ap); +} + +/* + * metaclust timing messages logging routine + * + * level - The class of the message to be logged. Message will be logged + * if this is less than or equal to the verbosity level. 
+ */ +void +meta_mc_log(int level, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + /* + * Log all messages upto MC_LOG2 to syslog regardless of the + * verbosity level + */ + if (metasyslog && (level <= MC_LOG2)) { + if (level <= MC_LOG1) + (void) vsyslog(LOG_ERR, fmt, args); + else + (void) vsyslog(LOG_INFO, fmt, args); + } + /* + * Print all messages to stderr provided the message level is + * within the verbosity level + */ + if (level <= verbosity) { + (void) fprintf(stderr, "%s: ", myname); + (void) vfprintf(stderr, fmt, args); + (void) fprintf(stderr, "\n"); + (void) fflush(stderr); + } + va_end(args); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_getdevs.c b/usr/src/lib/lvm/libmeta/common/meta_getdevs.c new file mode 100644 index 0000000000..af828bd083 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_getdevs.c @@ -0,0 +1,592 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * get dev_t list + */ + +#include <meta.h> + +#include <sys/mhd.h> +#include <strings.h> + +/* + * private version of minor(), able to handle 64 bit and 32 bit devices. + * print a warning out in case a 32 bit dev is specified. + */ +minor_t +meta_getminor(md_dev64_t dev64) +{ + /* check if it's a real 64 bit dev */ + if ((dev64 >> NBITSMAJOR64) > 0) { + return ((minor_t)(dev64 & MAXMIN64)); + } else { + if (getenv("META_DEBUG")) + (void) printf( + "meta_getminor called with 32 bit dev: 0x%llx\n", + dev64); + return ((minor_t)(dev64 & MAXMIN32)); + } +} + +/* + * private version of major(), able to handle 64 bit and 32 bit devices. + * print a warning out in case a 32 bit dev is specified. + */ +major_t +meta_getmajor(md_dev64_t dev64) +{ + /* check if it's a real 64 bit dev */ + if ((dev64 >> NBITSMAJOR64) > 0) { + return ((major_t)((dev64 >> NBITSMINOR64) & MAXMAJ64)); + } else { + if (getenv("META_DEBUG")) + (void) printf( + "meta_getmajor called with 32 bit dev: 0x%llx\n", + dev64); + return ((major_t)((dev64 >> NBITSMINOR32) & MAXMAJ32)); + } +} + +/* + * private version of cmpldev(), able to handle 64 bit and 32 bit devices. + */ +dev32_t +meta_cmpldev(md_dev64_t dev64) +{ + minor_t minor; + major_t major; + + major = (major_t)(dev64 >> NBITSMAJOR64); + if (major == 0) { + return ((dev32_t)dev64); + } + minor = (dev32_t)dev64 & MAXMIN32; + return ((major << NBITSMINOR32) | minor); +} + +/* + * private version of expldev(), able to handle 64 bit and 32 bit devices. 
+ */ +md_dev64_t +meta_expldev(md_dev64_t dev64) +{ + minor_t minor; + major_t major; + + major = (major_t)(dev64 >> NBITSMAJOR64); + if (major > 0) { /* a 64 bit device was given, return unchanged */ + return (dev64); + } + minor = (minor_t)(dev64) & MAXMIN32; + major = ((major_t)dev64 >> NBITSMINOR32) & MAXMAJ32; + return (((md_dev64_t)major << NBITSMINOR64) | minor); +} + +/* + * get underlying devices (recursively) + */ +int +meta_getdevs( + mdsetname_t *sp, + mdname_t *namep, + mdnamelist_t **nlpp, + md_error_t *ep +) +{ + char *miscname; + md_dev64_t *mydevs = NULL; + md_getdevs_params_t mgd; + size_t i; + int rval = -1; + md_sys_error_t *ip; + + /* must have local set */ + assert(sp != NULL); + + /* just add regular devices */ + if (! metaismeta(namep)) { + mdnamelist_t *p; + + /* + * If the dev_t is in the array already + * then let's continue. + */ + for (p = *nlpp; (p != NULL); p = p->next) { + if (strcmp(namep->bname, p->namep->bname) == 0) { + rval = 0; + goto out; + } + } + + /* add to list */ + (void) metanamelist_append(nlpp, namep); + rval = 0; + goto out; + } + + /* get MD misc module */ + if ((miscname = metagetmiscname(namep, ep)) == NULL) + goto out; + + /* get count of underlying devices */ + (void) memset(&mgd, '\0', sizeof (mgd)); + MD_SETDRIVERNAME(&mgd, miscname, sp->setno); + mgd.mnum = meta_getminor(namep->dev); + mgd.cnt = 0; + mgd.devs = NULL; + if (metaioctl(MD_IOCGET_DEVS, &mgd, &mgd.mde, namep->cname) != 0) { + if (mgd.mde.info.errclass == MDEC_SYS) { + ip = &mgd.mde.info.md_error_info_t_u.sys_error; + if (ip->errnum == ENODEV) { + rval = 0; + goto out; + } + } + (void) mdstealerror(ep, &mgd.mde); + goto out; + } else if (mgd.cnt <= 0) { + assert(mgd.cnt >= 0); + rval = 0; + goto out; + } + + /* get underlying devices */ + mydevs = Zalloc(sizeof (*mydevs) * mgd.cnt); + mgd.devs = (uintptr_t)mydevs; + if (metaioctl(MD_IOCGET_DEVS, &mgd, &mgd.mde, namep->cname) != 0) { + if (mgd.mde.info.errclass == MDEC_SYS) { + ip = 
&mgd.mde.info.md_error_info_t_u.sys_error; + if (ip->errnum == ENODEV) { + rval = 0; + goto out; + } + } + (void) mdstealerror(ep, &mgd.mde); + goto out; + } else if (mgd.cnt <= 0) { + assert(mgd.cnt >= 0); + rval = 0; + goto out; + } + /* recurse */ + for (i = 0; (i < mgd.cnt); ++i) { + mdname_t *devnp; + + if (mydevs[i] == NODEV64) { + continue; + } + if ((devnp = metadevname(&sp, mydevs[i], ep)) == NULL) { + goto out; + } + if (meta_getdevs(sp, devnp, nlpp, ep) != 0) + goto out; + } + + /* success */ + rval = 0; + + /* cleanup, return error */ +out: + if (mydevs != NULL) + Free(mydevs); + return (rval); +} + +/* + * get all dev_t for a set + */ +int +meta_getalldevs( + mdsetname_t *sp, /* set to look in */ + mdnamelist_t **nlpp, /* returned devices */ + int check_db, + md_error_t *ep +) +{ + md_replicalist_t *rlp, *rp; + mdnamelist_t *nlp, *np; + mdhspnamelist_t *hspnlp, *hspp; + int rval = 0; + + assert(sp != NULL); + + /* + * Get a replica namelist, + * and then get all the devs within the replicas. + */ + if (check_db == TRUE) { + rlp = NULL; + if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) + rval = -1; + for (rp = rlp; (rp != NULL); rp = rp->rl_next) { + if (meta_getdevs(sp, rp->rl_repp->r_namep, + nlpp, ep) != 0) + rval = -1; + } + metafreereplicalist(rlp); + } + + /* + * Get a stripe namelist, + * and then get all the devs within the stripes. + */ + nlp = NULL; + if (meta_get_stripe_names(sp, &nlp, 0, ep) < 0) + rval = -1; + for (np = nlp; (np != NULL); np = np->next) { + if (meta_getdevs(sp, np->namep, nlpp, ep) != 0) + rval = -1; + } + metafreenamelist(nlp); + + /* + * Get a mirror namelist, + * and then get all the devs within the mirrors. + */ + nlp = NULL; + if (meta_get_mirror_names(sp, &nlp, 0, ep) < 0) + rval = -1; + for (np = nlp; (np != NULL); np = np->next) { + if (meta_getdevs(sp, np->namep, nlpp, ep) != 0) + rval = -1; + } + metafreenamelist(nlp); + + /* + * Get a trans namelist, + * and then get all the devs within the trans. 
+ */ + nlp = NULL; + + if (meta_get_trans_names(sp, &nlp, 0, ep) < 0) + rval = -1; + for (np = nlp; (np != NULL); np = np->next) { + if (meta_getdevs(sp, np->namep, nlpp, ep) != 0) + rval = -1; + } + metafreenamelist(nlp); + + /* + * Get a hot spare pool namelist, + * and then get all the devs within the hot spare pools. + */ + hspnlp = NULL; + if (meta_get_hsp_names(sp, &hspnlp, 0, ep) < 0) + rval = -1; + for (hspp = hspnlp; (hspp != NULL); hspp = hspp->next) { + md_hsp_t *hsp; + uint_t i; + + if ((hsp = meta_get_hsp(sp, hspp->hspnamep, ep)) == NULL) + rval = -1; + else for (i = 0; (i < hsp->hotspares.hotspares_len); ++i) { + md_hs_t *hs = &hsp->hotspares.hotspares_val[i]; + + if (meta_getdevs(sp, hs->hsnamep, nlpp, ep) != 0) + rval = -1; + } + } + metafreehspnamelist(hspnlp); + + /* + * Get a raid namelist, + * and then get all the devs within the raids. + */ + nlp = NULL; + if (meta_get_raid_names(sp, &nlp, 0, ep) < 0) + rval = -1; + for (np = nlp; (np != NULL); np = np->next) { + if (meta_getdevs(sp, np->namep, nlpp, ep) != 0) + rval = -1; + } + metafreenamelist(nlp); + + /* + * Get a soft partition namelist, + * and then get all the devs within the softpartitions + */ + nlp = NULL; + if (meta_get_sp_names(sp, &nlp, 0, ep) < 0) + rval = -1; + for (np = nlp; (np != NULL); np = np->next) { + if (meta_getdevs(sp, np->namep, nlpp, ep) != 0) + rval = -1; + } + metafreenamelist(nlp); + + return (rval); +} + +/* + * get vtoc from a device already opened. + * returns + * 0 on success, + * -1 on error. If the error was ENOTSUP, partno will be set to + * VT_ENOTSUP if possible. 
+ */ +int +meta_getvtoc( + int fd, /* fd for named device */ + char *devname, /* name of device */ + struct vtoc *vtocbufp, /* vtoc buffer to fill */ + int *partno, /* return partno here */ + md_error_t *ep +) +{ + int part; + + (void) memset(vtocbufp, 0, sizeof (*vtocbufp)); + if ((part = read_vtoc(fd, vtocbufp)) < 0) { + int err = errno; + + if (ioctl(fd, MHIOCSTATUS, NULL) == 1) + err = EACCES; + else if (part == VT_EINVAL) + err = EINVAL; + else if (part == VT_EIO) + err = EIO; + else if (part == VT_ENOTSUP) { + if (partno) { + *partno = VT_ENOTSUP; + return (-1); + } + } + return (mdsyserror(ep, err, devname)); + } + + /* Slice number for *p0 partition (whole disk on x86) is 16 */ + if (part >= V_NUMPAR) + return (mdsyserror(ep, EINVAL, devname)); + + if (partno) + *partno = part; + return (0); +} +/* + * set mdvtoc for a meta devices + */ +int +meta_setmdvtoc( + int fd, /* fd for named device */ + char *devname, /* name of device */ + mdvtoc_t *mdvtocp, /* mdvtoc buffer to fill */ + md_error_t *ep +) +{ + uint_t i; + + /* + * Sanity-check the mdvtoc + */ + + if (mdvtocp->nparts > V_NUMPAR) { + return (-1); + } + + /* + * since many drivers won't allow opening a device make sure + * all partitions aren't being set to zero. 
If all are zero then + * we have no way to set them to something else + */ + + for (i = 0; i < mdvtocp->nparts; i++) + if (mdvtocp->parts[i].size > 0) + break; + if (i == mdvtocp->nparts) + return (-1); + + /* + * Write the mdvtoc + */ + if (ioctl(fd, DKIOCSVTOC, (caddr_t)mdvtocp) == -1) { + return (mdsyserror(ep, errno, devname)); + } + + return (0); +} + +/* + * set vtoc + */ +int +meta_setvtoc( + int fd, /* fd for named device */ + char *devname, /* name of device */ + struct vtoc *vtocbufp, /* vtoc buffer to fill */ + md_error_t *ep +) +{ + int part; + int err; + + if ((part = write_vtoc(fd, vtocbufp)) < 0) { + if (part == VT_EINVAL) + err = EINVAL; + else if (part == VT_EIO) + err = EIO; + else + err = errno; + return (mdsyserror(ep, err, devname)); + } + + return (0); +} + +/* + * FUNCTION: meta_get_names() + * INPUT: drivername - char string containing the driver name + * sp - the set name to get soft partitions from + * options - options from the command line + * OUTPUT: nlpp - list of all soft partition names + * ep - return error pointer + * RETURNS: int - -1 if error, 0 success + * PURPOSE: returns a list of all specified devices in the metadb + * for all devices in the specified set + */ +int +meta_get_names( + char *drivername, + mdsetname_t *sp, + mdnamelist_t **nlpp, + mdprtopts_t options, + md_error_t *ep +) +{ + md_i_getnum_t gn; /* MD_IOCGET_NUM params */ + mdnamelist_t **tailpp = nlpp; + minor_t *minors = NULL; + minor_t *m_ptr; + int i; + + (void) memset(&gn, '\0', sizeof (gn)); + MD_SETDRIVERNAME(&gn, drivername, sp->setno); + + /* get number of devices */ + if (metaioctl(MD_IOCGET_NUM, &gn, &gn.mde, NULL) != 0) { + if (mdiserror(&gn.mde, MDE_UNIT_NOT_FOUND)) { + mdclrerror(&gn.mde); + } else { + (void) mdstealerror(ep, &gn.mde); + return (-1); + } + } + + if (gn.size > 0) { + /* malloc minor number buffer to be filled by ioctl */ + if ((minors = (minor_t *)malloc( + gn.size * sizeof (minor_t))) == 0) { + return (ENOMEM); + } + gn.minors = 
(uintptr_t)minors; + if (metaioctl(MD_IOCGET_NUM, &gn, &gn.mde, NULL) != 0) { + (void) mdstealerror(ep, &gn.mde); + free(minors); + return (-1); + } + m_ptr = minors; + for (i = 0; i < gn.size; i++) { + mdname_t *np; + + /* get name */ + np = metamnumname(&sp, *m_ptr, + ((options & PRINT_FAST) ? 1 : 0), ep); + if (np == NULL) + goto out; + + tailpp = meta_namelist_append_wrapper( + tailpp, np); + + /* next device */ + m_ptr++; + } + free(minors); + } + return (gn.size); + +out: + if (minors != NULL) + free(minors); + metafreenamelist(*nlpp); + *nlpp = NULL; + return (-1); +} + +/* + * Wrap lib/libdevid/devid_deviceid_to_nmlist. We want to take the + * results from that function and filter out the c[t]dp style names that + * we typically see on x86 so that we never see them. + */ +int +meta_deviceid_to_nmlist(char *search_path, ddi_devid_t devid, char *minor_name, + devid_nmlist_t **retlist) +{ + int res; + devid_nmlist_t *dp; + devid_nmlist_t *tmp_retlist; + int i = 1; + devid_nmlist_t *rp; + + res = devid_deviceid_to_nmlist(search_path, devid, minor_name, retlist); + if (res != 0) { + return (res); + } + + + /* first count the number of non c[t]dp items in retlist */ + for (dp = *retlist; dp->dev != NODEV; dp++) { + uint_t s; + + /* Check if this is a c[t]dp style name. */ + if (parse_ctd(basename(dp->devname), &s) != 1) { + i++; + } + } + + /* create an array to hold the non c[t]dp items */ + tmp_retlist = Malloc(sizeof (devid_nmlist_t) * i); + /* copy the non c[t]dp items to the array */ + for (dp = *retlist, rp = tmp_retlist; dp->dev != NODEV; dp++) { + uint_t s; + + /* Check if this is a c[t]dp style name. 
*/ + if (parse_ctd(basename(dp->devname), &s) != 1) { + /* nope, so copy and go to the next */ + rp->dev = dp->dev; + rp->devname = Strdup(dp->devname); + rp++; + } + /* if it is c[t]dp, just skip the element */ + } + /* copy the list terminator */ + rp->dev = NODEV; + rp->devname = NULL; + devid_free_nmlist (*retlist); + *retlist = tmp_retlist; + return (res); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_hotspares.c b/usr/src/lib/lvm/libmeta/common/meta_hotspares.c new file mode 100644 index 0000000000..a76f9f3765 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_hotspares.c @@ -0,0 +1,1630 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. 
+ */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +/* + * hotspares utilities + */ + +#include <meta.h> +#include <sys/lvm/md_hotspares.h> +#include <sys/lvm/md_convert.h> + + +/* + * FUNCTION: meta_get_hsp_names() + * INPUT: sp - the set name to get hotspares from + * options - options from the command line + * OUTPUT: hspnlpp - list of all hotspare names + * ep - return error pointer + * RETURNS: int - -1 if error, 0 success + * PURPOSE: returns a list of all hotspares in the metadb + * for all devices in the specified set + */ +/*ARGSUSED*/ +int +meta_get_hsp_names( + mdsetname_t *sp, + mdhspnamelist_t **hspnlpp, + int options, + md_error_t *ep +) +{ + md_i_getnum_t gn; /* MD_IOCGET_NUM params */ + minor_t *minors = NULL; + minor_t *m_ptr; + int i; + + /* we must have a set */ + assert(sp != NULL); + + (void) memset(&gn, 0, sizeof (gn)); + MD_SETDRIVERNAME(&gn, MD_HOTSPARES, sp->setno); + + /* get number of devices */ + if (metaioctl(MD_IOCGET_NUM, &gn, &gn.mde, NULL) != 0) { + if (mdiserror(&gn.mde, MDE_UNIT_NOT_FOUND)) { + mdclrerror(&gn.mde); + } else { + (void) mdstealerror(ep, &gn.mde); + return (-1); + } + } + + if (gn.size > 0) { + /* malloc minor number buffer to be filled by ioctl */ + if ((minors = (minor_t *)malloc( + gn.size * sizeof (minor_t))) == 0) { + return (ENOMEM); + } + gn.minors = (uintptr_t)minors; + if (metaioctl(MD_IOCGET_NUM, &gn, &gn.mde, NULL) != 0) { + (void) mdstealerror(ep, &gn.mde); + free(minors); + return (-1); + } + m_ptr = minors; + for (i = 0; i < gn.size; i++) { + mdhspname_t *hspnp; + + + /* get name */ + if ((hspnp = metahsphspname(&sp, *m_ptr, ep)) + == NULL) + goto out; + + /* append to list */ + (void) metahspnamelist_append(hspnlpp, hspnp); + + /* next device */ + m_ptr++; + } + free(minors); + } + return (gn.size); + +out: + if (minors != NULL) + free(minors); + metafreehspnamelist(*hspnlpp); + *hspnlpp = NULL; + return (-1); +} + +/* + * get information of a specific hotspare pool from driver + 
*/ +static get_hsp_t * +get_hspinfo( + mdsetname_t *sp, + mdhspname_t *hspnp, + md_error_t *ep +) +{ + md_i_get_t mig; + + /* should have a set */ + assert(sp != NULL); + assert(sp->setno == HSP_SET(hspnp->hsp)); + + /* get size of unit structure */ + (void) memset(&mig, 0, sizeof (mig)); + MD_SETDRIVERNAME(&mig, MD_HOTSPARES, sp->setno); + mig.id = hspnp->hsp; + if (metaioctl(MD_IOCGET, &mig, &mig.mde, hspnp->hspname) != 0) { + (void) mdstealerror(ep, &mig.mde); + return (NULL); + } + + /* get actual unit structure */ + assert(mig.size > 0); + mig.mdp = (uintptr_t)Zalloc(mig.size); + if (metaioctl(MD_IOCGET, &mig, &mig.mde, hspnp->hspname) != 0) { + (void) mdstealerror(ep, &mig.mde); + Free((void *)mig.mdp); + return (NULL); + } + return ((get_hsp_t *)mig.mdp); +} + +/* + * free hotspare pool unit + */ +void +meta_free_hsp( + md_hsp_t *hspp +) +{ + if (hspp->hotspares.hotspares_val != NULL) { + assert(hspp->hotspares.hotspares_len > 0); + Free(hspp->hotspares.hotspares_val); + } + Free(hspp); +} + +/* + * get hotspare pool unit (common) + */ +md_hsp_t * +meta_get_hsp_common( + mdsetname_t *sp, + mdhspname_t *hspnp, + int fast, + md_error_t *ep +) +{ + get_hsp_t *ghsp; + md_hsp_t *hspp; + uint_t hsi; + + /* must have set */ + assert(sp != NULL); + assert(sp->setno == HSP_SET(hspnp->hsp)); + + /* short circuit */ + if (hspnp->unitp != NULL) + return (hspnp->unitp); + + /* get unit */ + if ((ghsp = get_hspinfo(sp, hspnp, ep)) == NULL) + return (NULL); + + /* allocate hsp */ + hspp = Zalloc(sizeof (*hspp)); + + /* allocate hotspares */ + hspp->hotspares.hotspares_len = ghsp->ghsp_nhotspares; + + /* if empty hotspare pool, we are done */ + if (hspp->hotspares.hotspares_len != 0) + hspp->hotspares.hotspares_val = + Zalloc(hspp->hotspares.hotspares_len * + sizeof (*hspp->hotspares.hotspares_val)); + + /* get name, refcount */ + hspp->hspnamep = hspnp; + hspp->refcount = ghsp->ghsp_refcount; + + /* get hotspares */ + for (hsi = 0; (hsi < hspp->hotspares.hotspares_len); 
++hsi) { + mdkey_t hs_key = ghsp->ghsp_hs_keys[hsi]; + md_hs_t *hsp = &hspp->hotspares.hotspares_val[hsi]; + get_hs_params_t ghs; + + /* get hotspare name */ + hsp->hsnamep = metakeyname(&sp, hs_key, fast, ep); + if (hsp->hsnamep == NULL) + goto out; + + /* get hotspare state */ + (void) memset(&ghs, 0, sizeof (ghs)); + MD_SETDRIVERNAME(&ghs, MD_HOTSPARES, sp->setno); + ghs.ghs_key = hs_key; + if (metaioctl(MD_IOCGET_HS, &ghs, &ghs.mde, NULL) != 0) { + (void) mdstealerror(ep, &ghs.mde); + goto out; + } + hsp->state = ghs.ghs_state; + hsp->size = ghs.ghs_number_blks; + hsp->timestamp = ghs.ghs_timestamp; + hsp->revision = ghs.ghs_revision; + } + + /* cleanup, return success */ + Free(ghsp); + hspnp->unitp = hspp; + return (hspp); + + /* cleanup, return error */ +out: + Free(ghsp); + meta_free_hsp(hspp); + return (NULL); +} + +/* + * get hotspare pool unit + */ +md_hsp_t * +meta_get_hsp( + mdsetname_t *sp, + mdhspname_t *hspnp, + md_error_t *ep +) +{ + return (meta_get_hsp_common(sp, hspnp, 0, ep)); +} + +/* + * check hotspare pool for dev + */ +static int +in_hsp( + mdsetname_t *sp, + mdhspname_t *hspnp, + mdname_t *np, + diskaddr_t slblk, + diskaddr_t nblks, + md_error_t *ep +) +{ + md_hsp_t *hspp; + uint_t i; + + /* should be in the same set */ + assert(sp != NULL); + assert(sp->setno == HSP_SET(hspnp->hsp)); + + /* get unit */ + if ((hspp = meta_get_hsp(sp, hspnp, ep)) == NULL) + return (-1); + + /* look in hotspares */ + for (i = 0; (i < hspp->hotspares.hotspares_len); ++i) { + md_hs_t *hs = &hspp->hotspares.hotspares_val[i]; + mdname_t *hsnp = hs->hsnamep; + + /* check overlap */ + if (metaismeta(hsnp)) + continue; + if (meta_check_overlap(hspnp->hspname, np, slblk, nblks, + hsnp, 0, -1, ep) != 0) + return (-1); + } + + /* return success */ + return (0); +} + +/* + * check to see if we're in a hotspare pool + */ +int +meta_check_inhsp( + mdsetname_t *sp, + mdname_t *np, + diskaddr_t slblk, + diskaddr_t nblks, + md_error_t *ep +) +{ + mdhspnamelist_t *hspnlp = 
NULL; + mdhspnamelist_t *p; + int rval = 0; + + /* should have a set */ + assert(sp != NULL); + + /* for each hotspare pool */ + if (meta_get_hsp_names(sp, &hspnlp, 0, ep) < 0) + return (-1); + for (p = hspnlp; (p != NULL); p = p->next) { + mdhspname_t *hspnp = p->hspnamep; + + /* check hotspare pool */ + if (in_hsp(sp, hspnp, np, slblk, nblks, ep) != 0) { + rval = -1; + break; + } + } + + /* cleanup, return success */ + metafreehspnamelist(hspnlp); + return (rval); +} + +/* + * check hotspare + */ +int +meta_check_hotspare( + mdsetname_t *sp, + mdname_t *np, + md_error_t *ep +) +{ + mdchkopts_t options = (MDCHK_ALLOW_HS); + + /* make sure we have a disk */ + if (metachkcomp(np, ep) != 0) + return (-1); + + /* check to ensure that it is not already in use */ + if (meta_check_inuse(sp, np, MDCHK_INUSE, ep) != 0) { + return (-1); + } + + /* make sure it is in the set */ + if (meta_check_inset(sp, np, ep) != 0) + return (-1); + + /* make sure its not in a metadevice */ + if (meta_check_inmeta(sp, np, options, 0, -1, ep) != 0) + return (-1); + + /* return success */ + return (0); +} + +/* + * print hsp + */ +static int +hsp_print( + md_hsp_t *hspp, + char *fname, + FILE *fp, + md_error_t *ep +) +{ + uint_t hsi; + int rval = -1; + + /* print name */ + if (fprintf(fp, "%s", hspp->hspnamep->hspname) == EOF) + goto out; + + /* print hotspares */ + for (hsi = 0; (hsi < hspp->hotspares.hotspares_len); ++hsi) { + md_hs_t *hsp = &hspp->hotspares.hotspares_val[hsi]; + + /* print hotspare */ + /* + * If the path is our standard /dev/rdsk or /dev/md/rdsk + * then just print out the cxtxdxsx or the dx, metainit + * will assume the default, otherwise we need the full + * pathname to make sure this works as we intend. 
+ */ + if ((strstr(hsp->hsnamep->rname, "/dev/rdsk") == NULL) && + (strstr(hsp->hsnamep->rname, "/dev/md/rdsk") == NULL) && + (strstr(hsp->hsnamep->rname, "/dev/td/") == NULL)) { + /* not standard path, print full pathname */ + if (fprintf(fp, " %s", hsp->hsnamep->rname) == EOF) + goto out; + } else { + /* standard path, just print ctd or d value */ + if (fprintf(fp, " %s", hsp->hsnamep->cname) == EOF) + goto out; + } + } + + /* terminate last line */ + if (fprintf(fp, "\n") == EOF) + goto out; + + /* success */ + rval = 0; + + /* cleanup, return error */ +out: + if (rval != 0) + (void) mdsyserror(ep, errno, fname); + return (rval); +} + +/* + * hotspare state name + */ +char * +hs_state_to_name( + md_hs_t *hsp, + md_timeval32_t *tvp +) +{ + hotspare_states_t state = hsp->state; + + /* grab time */ + if (tvp != NULL) + *tvp = hsp->timestamp; + + switch (state) { + case HSS_AVAILABLE: + return (dgettext(TEXT_DOMAIN, "Available")); + case HSS_RESERVED: + return (dgettext(TEXT_DOMAIN, "In use")); + case HSS_BROKEN: + return (dgettext(TEXT_DOMAIN, "Broken")); + case HSS_UNUSED: + default: + return (dgettext(TEXT_DOMAIN, "invalid")); + } +} + +/* + * report hsp + */ +static int +hsp_report( + md_hsp_t *hspp, + mdnamelist_t **nlpp, + char *fname, + FILE *fp, + mdprtopts_t options, + md_error_t *ep, + mdsetname_t *sp +) +{ + uint_t hsi; + int rval = -1; + char *devid = ""; + mdname_t *didnp = NULL; + uint_t len; + int large_hs_dev_cnt = 0; + + if (options & PRINT_LARGEDEVICES) { + for (hsi = 0; (hsi < hspp->hotspares.hotspares_len); ++hsi) { + md_hs_t *hsp = &hspp->hotspares.hotspares_val[hsi]; + if (hsp->revision == MD_64BIT_META_DEV) { + large_hs_dev_cnt += 1; + if (meta_getdevs(sp, hsp->hsnamep, nlpp, ep) + != 0) + goto out; + } + } + + if (large_hs_dev_cnt == 0) { + rval = 0; + goto out; + } + } + /* print header */ + if (hspp->hotspares.hotspares_len == 0) { + if (fprintf(fp, dgettext(TEXT_DOMAIN, "%s: is empty\n"), + hspp->hspnamep->hspname) == EOF) { + goto out; + 
} + } else if (hspp->hotspares.hotspares_len == 1) { + + /* + * This allows the length + * of the ctd to vary from small to large without + * looking horrible. + */ + + len = strlen(hspp->hotspares.hotspares_val[0].hsnamep->cname); + /* + * if the length is to short to print out all of the header + * force the matter + */ + len = max(len, strlen(dgettext(TEXT_DOMAIN, "Device"))); + len += 2; + if (options & PRINT_LARGEDEVICES) { + if (fprintf(fp, + "%s: 1 hot spare (1 big device)\n\t%-*.*s " + "%-12.12s%-8.6s\t\t%s\n", + hspp->hspnamep->hspname, len, len, + dgettext(TEXT_DOMAIN, "Device"), + dgettext(TEXT_DOMAIN, "Status"), + dgettext(TEXT_DOMAIN, "Length"), + dgettext(TEXT_DOMAIN, "Reloc")) == EOF) { + goto out; + } + } else { + if (fprintf(fp, + "%s: 1 hot spare\n\t%-*.*s %-12.12s%-8.6s\t\t%s\n", + hspp->hspnamep->hspname, len, len, + dgettext(TEXT_DOMAIN, "Device"), + dgettext(TEXT_DOMAIN, "Status"), + dgettext(TEXT_DOMAIN, "Length"), + dgettext(TEXT_DOMAIN, "Reloc")) == EOF) { + goto out; + } + } + } else { + /* + * This allows the length + * of the ctd to vary from small to large without + * looking horrible. 
+ */ + len = 0; + for (hsi = 0; (hsi < hspp->hotspares.hotspares_len); ++hsi) { + len = max(len, strlen(hspp-> + hotspares.hotspares_val[hsi].hsnamep->cname)); + } + len = max(len, strlen(dgettext(TEXT_DOMAIN, "Device"))); + len += 2; + if (options & PRINT_LARGEDEVICES) { + if (fprintf(fp, + "%s: %u hot spares (%d big device(s))\n\t%-*.*s " + "%-12.12s%-8.6s\t\t%s\n", + hspp->hspnamep->hspname, + hspp->hotspares.hotspares_len, + large_hs_dev_cnt, len, len, + dgettext(TEXT_DOMAIN, "Device"), + dgettext(TEXT_DOMAIN, "Status"), + dgettext(TEXT_DOMAIN, "Length"), + dgettext(TEXT_DOMAIN, "Reloc")) == EOF) { + goto out; + } + } else { + if (fprintf(fp, "%s: %u hot spares\n\t%-*.*s " + "%-12.12s%-8.6s\t\t%s\n", + hspp->hspnamep->hspname, + hspp->hotspares.hotspares_len, len, len, + dgettext(TEXT_DOMAIN, "Device"), + dgettext(TEXT_DOMAIN, "Status"), + dgettext(TEXT_DOMAIN, "Length"), + dgettext(TEXT_DOMAIN, "Reloc")) == EOF) { + goto out; + } + } + } + + /* print hotspares */ + for (hsi = 0; (hsi < hspp->hotspares.hotspares_len); ++hsi) { + md_hs_t *hsp = &hspp->hotspares.hotspares_val[hsi]; + char *cname = hsp->hsnamep->cname; + char *hs_state; + md_timeval32_t tv; + char *timep; + ddi_devid_t dtp; + + /* populate the key in the name_p structure */ + if ((didnp = metadevname(&sp, hsp->hsnamep->dev, ep)) == NULL) { + return (-1); + } + + if (options & PRINT_LARGEDEVICES) { + if (hsp->revision != MD_64BIT_META_DEV) + continue; + } + /* determine if devid does NOT exist */ + if (options & PRINT_DEVID) { + if ((dtp = meta_getdidbykey(sp->setno, getmyside(sp, ep), + didnp->key, ep)) == NULL) + devid = dgettext(TEXT_DOMAIN, "No "); + else { + devid = dgettext(TEXT_DOMAIN, "Yes"); + free(dtp); + } + } + /* print hotspare */ + hs_state = hs_state_to_name(hsp, &tv); + /* + * This allows the length + * of the ctd to vary from small to large without + * looking horrible. + */ + if (! 
(options & PRINT_TIMES)) { + if (fprintf(fp, + " %-*s %-12s %lld blocks\t%s\n", + len, cname, hs_state, + hsp->size, devid) == EOF) { + goto out; + } + } else { + timep = meta_print_time(&tv); + + if (fprintf(fp, + " %-*s\t %-11s %8lld blocks%s\t%s\n", + len, cname, hs_state, + hsp->size, devid, timep) == EOF) { + goto out; + } + } + } + + /* add extra line */ + if (fprintf(fp, "\n") == EOF) + goto out; + + /* success */ + rval = 0; + + /* cleanup, return error */ +out: + if (rval != 0) + (void) mdsyserror(ep, errno, fname); + return (rval); +} + +/* + * print/report hsp + */ +int +meta_hsp_print( + mdsetname_t *sp, + mdhspname_t *hspnp, + mdnamelist_t **nlpp, + char *fname, + FILE *fp, + mdprtopts_t options, + md_error_t *ep +) +{ + md_hsp_t *hspp; + + /* should have same set */ + assert(sp != NULL); + assert((hspnp == NULL) || (sp->setno == HSP_SET(hspnp->hsp))); + + /* print all hsps */ + if (hspnp == NULL) { + mdhspnamelist_t *hspnlp = NULL; + mdhspnamelist_t *p; + int cnt; + int rval = 0; + + if ((cnt = meta_get_hsp_names(sp, &hspnlp, options, ep)) < 0) + return (-1); + else if (cnt == 0) + return (0); + + /* recurse */ + for (p = hspnlp; (p != NULL); p = p->next) { + mdhspname_t *hspnp = p->hspnamep; + + if (meta_hsp_print(sp, hspnp, nlpp, fname, fp, + options, ep) != 0) + rval = -1; + } + + /* cleanup, return success */ + metafreehspnamelist(hspnlp); + return (rval); + } + + /* get unit structure */ + if ((hspp = meta_get_hsp_common(sp, hspnp, + ((options & PRINT_FAST) ? 
1 : 0), ep)) == NULL) + return (-1); + + /* print appropriate detail */ + if (options & PRINT_SHORT) + return (hsp_print(hspp, fname, fp, ep)); + else + return (hsp_report(hspp, nlpp, fname, fp, options, ep, sp)); +} + +/* + * check for valid hotspare pool + */ +int +metachkhsp( + mdsetname_t *sp, + mdhspname_t *hspnp, + md_error_t *ep +) +{ + if (meta_get_hsp(sp, hspnp, ep) == NULL) + return (-1); + return (0); +} + +/* + * invalidate hotspare pool info + */ +void +meta_invalidate_hsp( + mdhspname_t *hspnp +) +{ + md_hsp_t *hspp = hspnp->unitp; + + /* free it up */ + if (hspp == NULL) + return; + meta_free_hsp(hspp); + + /* clear cache */ + hspnp->unitp = NULL; +} + +/* + * add hotspares and/or hotspare pool + */ +int +meta_hs_add( + mdsetname_t *sp, + mdhspname_t *hspnp, + mdnamelist_t *hsnlp, + mdcmdopts_t options, + md_error_t *ep +) +{ + mdnamelist_t *p; + set_hs_params_t shs; + + /* should have a set */ + assert(sp != NULL); + assert(sp->setno == HSP_SET(hspnp->hsp)); + + /* clear cache */ + meta_invalidate_hsp(hspnp); + + /* setup hotspare pool info */ + (void) memset(&shs, 0, sizeof (shs)); + shs.shs_cmd = ADD_HOT_SPARE; + shs.shs_hot_spare_pool = hspnp->hsp; + MD_SETDRIVERNAME(&shs, MD_HOTSPARES, sp->setno); + + /* add empty hotspare pool */ + if (hsnlp == NULL) { + shs.shs_options = HS_OPT_POOL; + /* If DOIT is not set, it's a dryrun */ + if ((options & MDCMD_DOIT) == 0) { + shs.shs_options |= HS_OPT_DRYRUN; + } + if (metaioctl(MD_IOCSET_HS, &shs, &shs.mde, + hspnp->hspname) != 0) + return (mdstealerror(ep, &shs.mde)); + goto success; + } + + /* add hotspares */ + shs.shs_options = HS_OPT_NONE; + /* If DOIT is not set, it's a dryrun */ + if ((options & MDCMD_DOIT) == 0) { + shs.shs_options |= HS_OPT_DRYRUN; + } + for (p = hsnlp; (p != NULL); p = p->next) { + mdname_t *hsnp = p->namep; + diskaddr_t size, label, start_blk; + + /* should be in same set */ + assert(sp->setno == HSP_SET(hspnp->hsp)); + + /* check it out */ + if (meta_check_hotspare(sp, hsnp, 
ep) != 0) + return (-1); + if ((size = metagetsize(hsnp, ep)) == MD_DISKADDR_ERROR) + return (-1); + else if (size == 0) + return (mdsyserror(ep, ENOSPC, hsnp->cname)); + if ((label = metagetlabel(hsnp, ep)) == MD_DISKADDR_ERROR) + return (-1); + if ((start_blk = metagetstart(sp, hsnp, ep)) + == MD_DISKADDR_ERROR) + return (-1); + + shs.shs_size_option = meta_check_devicesize(size); + + /* In dryrun mode (DOIT not set) we must not alter the mddb */ + if (options & MDCMD_DOIT) { + /* store name in namespace */ + if (add_key_name(sp, hsnp, NULL, ep) != 0) + return (-1); + } + + /* add hotspare and/or hotspare pool */ + shs.shs_component_old = hsnp->dev; + shs.shs_start_blk = start_blk; + shs.shs_has_label = ((label > 0) ? 1 : 0); + shs.shs_number_blks = size; + shs.shs_key_old = hsnp->key; + if (metaioctl(MD_IOCSET_HS, &shs, &shs.mde, NULL) != 0) { + if ((options & MDCMD_DOIT) && + (shs.shs_options != HS_OPT_POOL)) { + (void) del_key_name(sp, hsnp, ep); + } + return (mdstealerror(ep, &shs.mde)); + } + } + + /* print success message */ +success: + if (options & MDCMD_PRINT) { + if ((options & MDCMD_INIT) || (hsnlp == NULL)) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: Hotspare pool is setup\n"), + hspnp->hspname); + } else if (hsnlp->next == NULL) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: Hotspare is added\n"), + hspnp->hspname); + } else { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: Hotspares are added\n"), + hspnp->hspname); + } + (void) fflush(stdout); + } + + /* return success */ + return (0); +} + +/* + * delete hotspares from pool + */ +int +meta_hs_delete( + mdsetname_t *sp, + mdhspname_t *hspnp, + mdnamelist_t *hsnlp, + mdcmdopts_t options, + md_error_t *ep +) +{ + mdnamelist_t *p; + set_hs_params_t shs; + + /* should have a set */ + assert(sp != NULL); + assert(sp->setno == HSP_SET(hspnp->hsp)); + + /* clear cache */ + meta_invalidate_hsp(hspnp); + + /* setup hotspare pool info */ + (void) memset(&shs, 0, sizeof (shs)); + shs.shs_hot_spare_pool = 
hspnp->hsp; + MD_SETDRIVERNAME(&shs, MD_HOTSPARES, sp->setno); + shs.shs_cmd = DELETE_HOT_SPARE; + + /* delete empty hotspare pool */ + if (hsnlp == NULL) { + shs.shs_options = HS_OPT_POOL; + /* If DOIT is not set, it's a dryrun */ + if ((options & MDCMD_DOIT) == 0) { + shs.shs_options |= HS_OPT_DRYRUN; + } + if (metaioctl(MD_IOCSET_HS, &shs, &shs.mde, + hspnp->hspname) != 0) + return (mdstealerror(ep, &shs.mde)); + goto success; + } + + /* delete hotspares */ + shs.shs_options = HS_OPT_NONE; + /* If DOIT is not set, it's a dryrun */ + if ((options & MDCMD_DOIT) == 0) { + shs.shs_options |= HS_OPT_DRYRUN; + } + for (p = hsnlp; (p != NULL); p = p->next) { + mdname_t *hsnp = p->namep; + + /* should be in same set */ + assert(sp->setno == HSP_SET(hspnp->hsp)); + + /* delete hotspare */ + shs.shs_component_old = hsnp->dev; + meta_invalidate_name(hsnp); + if (metaioctl(MD_IOCSET_HS, &shs, &shs.mde, hsnp->cname) != 0) + return (mdstealerror(ep, &shs.mde)); + } + + /* print success message */ +success: + if (options & MDCMD_PRINT) { + if (hsnlp == NULL) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: Hotspare pool is cleared\n"), + hspnp->hspname); + } else if (hsnlp->next == NULL) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: Hotspare is deleted\n"), + hspnp->hspname); + } else { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: Hotspares are deleted\n"), + hspnp->hspname); + } + (void) fflush(stdout); + } + + /* return success */ + return (0); +} + +/* + * replace hotspare in pool + */ +int +meta_hs_replace( + mdsetname_t *sp, + mdhspname_t *hspnp, + mdname_t *oldnp, + mdname_t *newnp, + mdcmdopts_t options, + md_error_t *ep +) +{ + set_hs_params_t shs; + diskaddr_t size, label, start_blk; + md_dev64_t old_dev, new_dev; + diskaddr_t new_start_blk, new_end_blk; + int rebind; + char *new_devidp = NULL; + int ret; + md_set_desc *sd; + + /* should be in same set */ + assert(sp != NULL); + assert(sp->setno == HSP_SET(hspnp->hsp)); + + /* save new binding incase this is a rebind 
where oldnp==newnp */ + new_dev = newnp->dev; + new_start_blk = newnp->start_blk; + new_end_blk = newnp->end_blk; + + /* invalidate, then get the hotspare (fill in oldnp from metadb) */ + meta_invalidate_hsp(hspnp); + if (meta_get_hsp(sp, hspnp, ep) == NULL) + return (-1); + + /* the old device binding is now established */ + if ((old_dev = oldnp->dev) == NODEV64) + return (mdsyserror(ep, ENODEV, oldnp->cname)); + + /* + * check for the case where oldnp and newnp indicate the same + * device, but the dev_t of the device has changed between old + * and new. This is called a rebind. On entry the dev_t + * represents the new device binding determined from the + * filesystem (meta_getdev). After calling meta_get_hsp + * oldnp (and maybe newnp if this is a rebind) is updated based + * to the old binding from the metadb (done by metakeyname). + */ + if ((strcmp(oldnp->rname, newnp->rname) == 0) && + (old_dev != new_dev)) { + rebind = 1; + } else { + rebind = 0; + } + if (rebind) { + newnp->dev = new_dev; + newnp->start_blk = new_start_blk; + newnp->end_blk = new_end_blk; + } + + /* + * Save a copy of the devid associated with the new disk, the reason + * is that the meta_check_hotspare() call could cause the devid to + * be changed to that of the devid that is currently stored in the + * replica namespace for the disk in question. This devid could be + * stale if we are replacing the disk. The function that overwrites + * the devid is dr2drivedesc(). + */ + if (newnp->drivenamep->devid != NULL) + new_devidp = Strdup(newnp->drivenamep->devid); + + /* if it's a multi-node diskset clear new_devidp */ + if (!metaislocalset(sp)) { + if ((sd = metaget_setdesc(sp, ep)) == NULL) { + Free(new_devidp); + return (-1); + } + if (MD_MNSET_DESC(sd)) { + Free(new_devidp); + new_devidp = NULL; + } + } + + /* check it out */ + if (meta_check_hotspare(sp, newnp, ep) != 0) { + if ((! rebind) || (! 
mdisuseerror(ep, MDE_ALREADY))) { + Free(new_devidp); + return (-1); + } + mdclrerror(ep); + } + if ((size = metagetsize(newnp, ep)) == MD_DISKADDR_ERROR) { + Free(new_devidp); + return (-1); + } + if ((label = metagetlabel(newnp, ep)) == MD_DISKADDR_ERROR) { + Free(new_devidp); + return (-1); + } + if ((start_blk = metagetstart(sp, newnp, ep)) == MD_DISKADDR_ERROR) { + Free(new_devidp); + return (-1); + } + if (start_blk >= size) { + (void) mdsyserror(ep, ENOSPC, newnp->cname); + Free(new_devidp); + return (-1); + } + + /* In dryrun mode (DOIT not set) we must not alter the mddb */ + if (options & MDCMD_DOIT) { + /* store name in namespace */ + if (add_key_name(sp, newnp, NULL, ep) != 0) + return (-1); + } + + /* + * Copy back the saved devid. + */ + Free(newnp->drivenamep->devid); + if (new_devidp != NULL) { + newnp->drivenamep->devid = new_devidp; + new_devidp = NULL; + } + + /* In dryrun mode (DOIT not set) we must not alter the mddb */ + if (options & MDCMD_DOIT) { + /* store name in namespace */ + if (add_key_name(sp, newnp, NULL, ep) != 0) + return (-1); + } + + if (rebind && !metaislocalset(sp)) { + /* + * We are 'rebind'ing a disk that is in a diskset so as well + * as updating the diskset's namespace the local set needs + * to be updated because it also contains a reference to the + * disk in question. 
+ */ + ret = meta_fixdevid(sp, DEV_UPDATE|DEV_LOCAL_SET, newnp->cname, + ep); + + if (ret != METADEVADM_SUCCESS) { + md_error_t xep = mdnullerror; + + /* + * In dryrun mode (DOIT not set) we must not alter + * the mddb + */ + if (options & MDCMD_DOIT) { + (void) del_key_name(sp, newnp, &xep); + mdclrerror(&xep); + return (-1); + } + } + } + + /* replace hotspare */ + (void) memset(&shs, 0, sizeof (shs)); + + shs.shs_size_option = meta_check_devicesize(size); + + shs.shs_cmd = REPLACE_HOT_SPARE; + shs.shs_hot_spare_pool = hspnp->hsp; + MD_SETDRIVERNAME(&shs, MD_HOTSPARES, sp->setno); + shs.shs_component_old = old_dev; + shs.shs_options = HS_OPT_NONE; + /* If DOIT is not set, it's a dryrun */ + if ((options & MDCMD_DOIT) == 0) { + shs.shs_options |= HS_OPT_DRYRUN; + } + shs.shs_component_new = new_dev; + shs.shs_start_blk = start_blk; + shs.shs_has_label = ((label > 0) ? 1 : 0); + shs.shs_number_blks = size; + shs.shs_key_new = newnp->key; + if (metaioctl(MD_IOCSET_HS, &shs, &shs.mde, NULL) != 0) { + if (options & MDCMD_DOIT) { + (void) del_key_name(sp, newnp, ep); + } + return (mdstealerror(ep, &shs.mde)); + } + + /* clear cache */ + meta_invalidate_name(oldnp); + meta_invalidate_name(newnp); + meta_invalidate_hsp(hspnp); + + /* let em know */ + if (options & MDCMD_PRINT) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: Hotspare %s is replaced with %s\n"), + hspnp->hspname, oldnp->cname, newnp->cname); + (void) fflush(stdout); + } + + /* return success */ + return (0); +} + +/* + * enable hotspares + */ +int +meta_hs_enable( + mdsetname_t *sp, + mdnamelist_t *hsnlp, + mdcmdopts_t options, + md_error_t *ep +) +{ + mdhspnamelist_t *hspnlp = NULL; + mdhspnamelist_t *hspnp; + set_hs_params_t shs; + int rval = -1; + + /* should have a set */ + assert(sp != NULL); + + /* setup device info */ + (void) memset(&shs, 0, sizeof (shs)); + MD_SETDRIVERNAME(&shs, MD_HOTSPARES, sp->setno); + shs.shs_cmd = FIX_HOT_SPARE; + shs.shs_options = HS_OPT_NONE; + /* If DOIT is not set, it's 
a dryrun */ + if ((options & MDCMD_DOIT) == 0) { + shs.shs_options |= HS_OPT_DRYRUN; + } + + /* get the list of hotspare names */ + if (meta_get_hsp_names(sp, &hspnlp, 0, ep) < 0) + goto out; + + /* enable hotspares for each components */ + for (; (hsnlp != NULL); hsnlp = hsnlp->next) { + mdname_t *hsnp = hsnlp->namep; + md_dev64_t fs_dev; + int rebind = 0; + diskaddr_t size, label, start_blk; + + /* get the file_system dev binding */ + if (meta_getdev(sp, hsnp, ep) != 0) + return (-1); + fs_dev = hsnp->dev; + + /* + * search for the component in each hotspare pool + * and replace it (instead of enable) if the binding + * has changed. + */ + for (hspnp = hspnlp; (hspnp != NULL); hspnp = hspnp->next) { + /* + * in_hsp will call meta_get_hsp which will fill + * in hspnp with metadb version of component + */ + meta_invalidate_hsp(hspnp->hspnamep); + if (in_hsp(sp, hspnp->hspnamep, hsnp, 0, -1, ep) != 0) { + /* + * check for the case where the dev_t has + * changed between the filesystem and the + * metadb. This is called a rebind, and + * is handled by meta_hs_replace. 
+ */ + if (fs_dev != hsnp->dev) { + /* + * establish file system binding + * with invalid start/end + */ + rebind++; + hsnp->dev = fs_dev; + hsnp->start_blk = -1; + hsnp->end_blk = -1; + rval = meta_hs_replace(sp, + hspnp->hspnamep, + hsnp, hsnp, options, ep); + if (rval != 0) + goto out; + } + } + } + if (rebind) + continue; + + /* enable the component in all hotspares that use it */ + if (meta_check_hotspare(sp, hsnp, ep) != 0) + goto out; + + if ((size = metagetsize(hsnp, ep)) == MD_DISKADDR_ERROR) + goto out; + if ((label = metagetlabel(hsnp, ep)) == MD_DISKADDR_ERROR) + goto out; + if ((start_blk = metagetstart(sp, hsnp, ep)) + == MD_DISKADDR_ERROR) + goto out; + if (start_blk >= size) { + (void) mdsyserror(ep, ENOSPC, hsnp->cname); + goto out; + } + + /* enable hotspare */ + shs.shs_component_old = hsnp->dev; + shs.shs_component_new = hsnp->dev; + shs.shs_start_blk = start_blk; + shs.shs_has_label = ((label > 0) ? 1 : 0); + shs.shs_number_blks = size; + if (metaioctl(MD_IOCSET_HS, &shs, &shs.mde, hsnp->cname) != 0) { + rval = mdstealerror(ep, &shs.mde); + goto out; + } + + /* + * Are we dealing with a non-local set? If so need to update + * the local namespace so that the disk record has the correct + * devid. + */ + if (!metaislocalset(sp)) { + rval = meta_fixdevid(sp, DEV_UPDATE|DEV_LOCAL_SET, + hsnp->cname, ep); + + if (rval != METADEVADM_SUCCESS) { + /* + * Failed to update the local set. Nothing to + * do here apart from report the error. The + * namespace is most likely broken and some + * form of remedial recovery is going to + * be required. 
+ */ + mde_perror(ep, ""); + mdclrerror(ep); + } + } + + /* clear cache */ + meta_invalidate_name(hsnp); + + /* let em know */ + if (options & MDCMD_PRINT) { + (void) printf(dgettext(TEXT_DOMAIN, + "hotspare %s is enabled\n"), + hsnp->cname); + (void) fflush(stdout); + } + } + + /* clear whole cache */ + for (hspnp = hspnlp; (hspnp != NULL); hspnp = hspnp->next) { + meta_invalidate_hsp(hspnp->hspnamep); + } + + + /* return success */ + rval = 0; + +out: + if (hspnlp) + metafreehspnamelist(hspnlp); + return (rval); +} + +/* + * check for dups in the hsp itself + */ +static int +check_twice( + md_hsp_t *hspp, + uint_t hsi, + md_error_t *ep +) +{ + mdhspname_t *hspnp = hspp->hspnamep; + mdname_t *thisnp; + uint_t h; + + thisnp = hspp->hotspares.hotspares_val[hsi].hsnamep; + for (h = 0; (h < hsi); ++h) { + md_hs_t *hsp = &hspp->hotspares.hotspares_val[h]; + mdname_t *hsnp = hsp->hsnamep; + + if (meta_check_overlap(hspnp->hspname, thisnp, 0, -1, + hsnp, 0, -1, ep) != 0) + return (-1); + } + return (0); +} + +/* + * check hsp + */ +/*ARGSUSED2*/ +int +meta_check_hsp( + mdsetname_t *sp, + md_hsp_t *hspp, + mdcmdopts_t options, + md_error_t *ep +) +{ + mdhspname_t *hspnp = hspp->hspnamep; + uint_t hsi; + + /* check hotspares */ + for (hsi = 0; (hsi < hspp->hotspares.hotspares_len); ++hsi) { + md_hs_t *hsp = &hspp->hotspares.hotspares_val[hsi]; + mdname_t *hsnp = hsp->hsnamep; + diskaddr_t size; + + /* check hotspare */ + if (meta_check_hotspare(sp, hsnp, ep) != 0) + return (-1); + if ((size = metagetsize(hsnp, ep)) == MD_DISKADDR_ERROR) { + return (-1); + } else if (size == 0) { + return (mdsyserror(ep, ENOSPC, hspnp->hspname)); + } + + /* check this hsp too */ + if (check_twice(hspp, hsi, ep) != 0) + return (-1); + } + + /* return success */ + return (0); +} + +/* + * create hsp + */ +int +meta_create_hsp( + mdsetname_t *sp, + md_hsp_t *hspp, + mdcmdopts_t options, + md_error_t *ep +) +{ + mdhspname_t *hspnp = hspp->hspnamep; + mdnamelist_t *hsnlp = NULL; + uint_t hsi; + 
int rval = -1; + + /* validate hsp */ + if (meta_check_hsp(sp, hspp, options, ep) != 0) + return (-1); + + /* if we're not doing anything, return success */ + if (! (options & MDCMD_DOIT)) + return (0); + + /* create hsp */ + for (hsi = 0; (hsi < hspp->hotspares.hotspares_len); ++hsi) { + md_hs_t *hsp = &hspp->hotspares.hotspares_val[hsi]; + mdname_t *hsnp = hsp->hsnamep; + + (void) metanamelist_append(&hsnlp, hsnp); + } + options |= MDCMD_INIT; + rval = meta_hs_add(sp, hspnp, hsnlp, options, ep); + + /* cleanup, return success */ + metafreenamelist(hsnlp); + return (rval); +} + +/* + * initialize hsp + * NOTE: this functions is metainit(1m)'s command line parser! + */ +int +meta_init_hsp( + mdsetname_t **spp, + int argc, + char *argv[], + mdcmdopts_t options, + md_error_t *ep +) +{ + char *uname = argv[0]; + mdhspname_t *hspnp = NULL; + md_hsp_t *hspp = NULL; + uint_t hsi; + int rval = -1; + + + /* get hsp name */ + assert(argc > 0); + if (argc < 1) + goto syntax; + if ((hspnp = metahspname(spp, uname, ep)) == NULL) + goto out; + assert(*spp != NULL); + uname = hspnp->hspname; + + if (!(options & MDCMD_NOLOCK)) { + /* grab set lock */ + if (meta_lock(*spp, TRUE, ep)) + goto out; + + if (meta_check_ownership(*spp, ep) != 0) + goto out; + } + + /* see if it exists already */ + if (meta_get_hsp(*spp, hspnp, ep) != NULL) { + (void) mdhsperror(ep, MDE_HSP_ALREADY_SETUP, hspnp->hsp, uname); + goto out; + } else if (! 
mdishsperror(ep, MDE_INVAL_HSP)) { + goto out; + } else { + mdclrerror(ep); + } + --argc, ++argv; + + /* parse general options */ + optind = 0; + opterr = 0; + if (getopt(argc, argv, "") != -1) + goto options; + + /* allocate hsp */ + hspp = Zalloc(sizeof (*hspp)); + hspp->hotspares.hotspares_len = argc; + if (argc > 0) { + hspp->hotspares.hotspares_val = + Zalloc(argc * sizeof (*hspp->hotspares.hotspares_val)); + } + + /* setup pool */ + hspp->hspnamep = hspnp; + + /* parse hotspares */ + for (hsi = 0; ((argc > 0) && (hsi < hspp->hotspares.hotspares_len)); + ++hsi) { + md_hs_t *hsp = &hspp->hotspares.hotspares_val[hsi]; + mdname_t *hsnamep; + + /* parse hotspare name */ + if ((hsnamep = metaname(spp, argv[0], ep)) == NULL) + goto out; + hsp->hsnamep = hsnamep; + --argc, ++argv; + } + + /* we should be at the end */ + if (argc != 0) + goto syntax; + + /* create hotspare pool */ + if (meta_create_hsp(*spp, hspp, options, ep) != 0) + goto out; + rval = 0; /* success */ + goto out; + + /* syntax error */ +syntax: + rval = meta_cook_syntax(ep, MDE_SYNTAX, uname, argc, argv); + goto out; + + /* options error */ +options: + rval = meta_cook_syntax(ep, MDE_OPTION, uname, argc, argv); + goto out; + + /* cleanup, return error */ +out: + if (hspp != NULL) + meta_free_hsp(hspp); + return (rval); +} + +/* + * reset hotspare pool + */ +int +meta_hsp_reset( + mdsetname_t *sp, + mdhspname_t *hspnp, + mdcmdopts_t options, + md_error_t *ep +) +{ + md_hsp_t *hspp; + set_hs_params_t shs; + uint_t i; + int rval = -1; + + /* should have the same set */ + assert(sp != NULL); + assert((hspnp == NULL) || (sp->setno == HSP_SET(hspnp->hsp))); + + /* reset all hotspares */ + if (hspnp == NULL) { + mdhspnamelist_t *hspnlp = NULL; + mdhspnamelist_t *p; + + /* for each hotspare pool */ + rval = 0; + if (meta_get_hsp_names(sp, &hspnlp, 0, ep) < 0) + return (-1); + for (p = hspnlp; (p != NULL); p = p->next) { + /* reset hotspare pool */ + hspnp = p->hspnamep; + + /* + * If this is a multi-node 
set, we send a series + * of individual metaclear commands. + */ + if (meta_is_mn_set(sp, ep)) { + if (meta_mn_send_metaclear_command(sp, + hspnp->hspname, options, 0, ep) != 0) { + rval = -1; + break; + } + } else { + if (meta_hsp_reset(sp, hspnp, options, + ep) != 0) { + rval = -1; + break; + } + } + } + + /* cleanup, return success */ + metafreehspnamelist(hspnlp); + return (rval); + } + + /* get unit structure */ + if ((hspp = meta_get_hsp(sp, hspnp, ep)) == NULL) + return (-1); + + /* make sure nobody owns us */ + if (hspp->refcount > 0) { + return (mdhsperror(ep, MDE_HSP_IN_USE, hspnp->hsp, + hspnp->hspname)); + } + + /* clear hotspare pool members */ + (void) memset(&shs, 0, sizeof (shs)); + MD_SETDRIVERNAME(&shs, MD_HOTSPARES, sp->setno); + shs.shs_cmd = DELETE_HOT_SPARE; + shs.shs_hot_spare_pool = hspnp->hsp; + for (i = 0; (i < hspp->hotspares.hotspares_len); ++i) { + md_hs_t *hs = &hspp->hotspares.hotspares_val[i]; + mdname_t *hsnamep = hs->hsnamep; + + /* clear cache */ + meta_invalidate_name(hsnamep); + + /* clear hotspare */ + shs.shs_component_old = hsnamep->dev; + shs.shs_options = HS_OPT_FORCE; + /* If DOIT is not set, it's a dryrun */ + if ((options & MDCMD_DOIT) == 0) { + shs.shs_options |= HS_OPT_DRYRUN; + } + if (metaioctl(MD_IOCSET_HS, &shs, &shs.mde, NULL) != 0) { + (void) mdstealerror(ep, &shs.mde); + goto out; + } + } + + /* clear hotspare pool */ + shs.shs_options = HS_OPT_POOL; + /* If DOIT is not set, it's a dryrun */ + if ((options & MDCMD_DOIT) == 0) { + shs.shs_options |= HS_OPT_DRYRUN; + } + if (metaioctl(MD_IOCSET_HS, &shs, &shs.mde, hspnp->hspname) != 0) { + (void) mdstealerror(ep, &shs.mde); + goto out; + } + rval = 0; /* success */ + + /* let em know */ + if (options & MDCMD_PRINT) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: Hotspare pool is cleared\n"), + hspnp->hspname); + (void) fflush(stdout); + } + + /* clear subdevices (nothing to do) */ + + /* cleanup, return success */ +out: + meta_invalidate_hsp(hspnp); + return 
(rval); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_import.c b/usr/src/lib/lvm/libmeta/common/meta_import.c new file mode 100644 index 0000000000..ec8819794c --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_import.c @@ -0,0 +1,2179 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <assert.h> +#include <ctype.h> +#include <libdevinfo.h> +#include <mdiox.h> +#include <meta.h> +#include "meta_repartition.h" +#include "meta_set_prv.h" +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/lvm/md_mddb.h> +#include <sys/lvm/md_names.h> +#include <sys/lvm/md_crc.h> + +typedef struct did_list { + void *rdid; /* real did if replicated set */ + void *did; /* did stored in lb */ + char *devname; + dev_t dev; + uint_t did_index; + char *minor_name; + struct did_list *next; +} did_list_t; + +typedef struct replicated_disk { + void *old_devid; + void *new_devid; + struct replicated_disk *next; +} replicated_disk_t; + +/* + * The current implementation limits the max device id length to 256 bytes. + * Should the max device id length be increased, this define would have to + * be bumped up accordingly + */ +#define MAX_DEVID_LEN 256 + +/* + * We store a global list of all the replicated disks in the system. In + * order to prevent us from performing a linear search on this list, we + * store the disks in a two dimensional sparse array. The disks are bucketed + * based on the length of their device ids. + */ +static replicated_disk_t *replicated_disk_list[MAX_DEVID_LEN + 1] = {NULL}; + +/* + * The list of replicated disks is built just once and this flag is set + * once it's done + */ +static int replicated_disk_list_built = 0; + +/* + * Map logical blk to physical + * + * This is based on the routine of the same name in the md kernel module (see + * file md_mddb.c), with the following caveats: + * + * - The kernel routine works on in core master blocks, or mddb_mb_ic_t; this + * routine works instead on the mddb_mb_t read directly from the disk + */ +static daddr_t +getphysblk( + mddb_block_t blk, + mddb_mb_t *mbp +) +{ + /* + * Sanity check: is the block within range? 
If so, we then assume + * that the block range map in the master block is valid and + * consistent with the block count. Unfortunately, there is no + * reliable way to validate this assumption. + */ + if (blk >= mbp->mb_blkcnt || blk >= mbp->mb_blkmap.m_consecutive) + return ((daddr_t)-1); + + return (mbp->mb_blkmap.m_firstblk + blk); +} + + + +/* + * drive_append() + * + * Append to tail of linked list of md_im_drive_info_t. + * + * Will allocate space for new node and copy args into new space. + * + * Returns pointer to new node. + */ +static md_im_drive_info_t * +drive_append( + md_im_drive_info_t **midpp, + mddrivename_t *dnp, + void *devid, + void *rdevid, + int devid_sz, + char *minor_name, + md_timeval32_t timestamp, + md_im_replica_info_t *mirp +) +{ + md_im_drive_info_t *midp; + int o_devid_sz; + + for (; (*midpp != NULL); midpp = &((*midpp)->mid_next)) + ; + + midp = *midpp = Zalloc(sizeof (md_im_drive_info_t)); + + midp->mid_dnp = dnp; + + /* + * If rdevid is not NULL then we know we are dealing with + * replicated diskset case. 
'devid_sz' will always be the + * size of a valid devid which can be 'devid' or 'rdevid' + */ + midp->mid_devid = (void *)Malloc(devid_sz); + + if (rdevid) { + (void) memcpy(midp->mid_devid, rdevid, devid_sz); + /* + * Also need to store the 'other' devid + */ + o_devid_sz = devid_sizeof((ddi_devid_t)devid); + midp->mid_o_devid = (void *)Malloc(o_devid_sz); + (void) memcpy(midp->mid_o_devid, devid, o_devid_sz); + midp->mid_o_devid_sz = o_devid_sz; + } else { + /* + * In the case of regular diskset, midp->mid_o_devid + * will be a NULL pointer + */ + (void) memcpy(midp->mid_devid, devid, devid_sz); + } + + midp->mid_devid_sz = devid_sz; + midp->mid_setcreatetimestamp = timestamp; + (void) strlcpy(midp->mid_minor_name, minor_name, MDDB_MINOR_NAME_MAX); + midp->mid_replicas = mirp; + + return (midp); +} + + + +/* + * drive_append_wrapper() + * + * Constant time append wrapper; the append function will always walk the list, + * this will take a tail argument and use the append function on just the tail + * node, doing the appropriate old-tail-next-pointer bookkeeping. + */ +static md_im_drive_info_t ** +drive_append_wrapper( + md_im_drive_info_t **tailpp, + mddrivename_t *dnp, + void *devid, + void *rdevid, + int devid_sz, + char *minor_name, + md_timeval32_t timestamp, + md_im_replica_info_t *mirp +) +{ + (void) drive_append(tailpp, dnp, devid, rdevid, devid_sz, minor_name, + timestamp, mirp); + + if ((*tailpp)->mid_next == NULL) + return (tailpp); + + return (&((*tailpp)->mid_next)); +} + + + +/* + * replica_append() + * + * Append to tail of linked list of md_im_replica_info_t. + * + * Will allocate space for new node and copy args into new space. + * + * Returns pointer to new node. 
+ */ +static md_im_replica_info_t * +replica_append( + md_im_replica_info_t **mirpp, + int flags, + daddr32_t offset, + daddr32_t length, + md_timeval32_t timestamp +) +{ + md_im_replica_info_t *mirp; + + for (; (*mirpp != NULL); mirpp = &((*mirpp)->mir_next)) + ; + + mirp = *mirpp = Zalloc(sizeof (md_im_replica_info_t)); + + mirp->mir_flags = flags; + mirp->mir_offset = offset; + mirp->mir_length = length; + mirp->mir_timestamp = timestamp; + + return (mirp); + +} + + + +/* + * replica_append_wrapper() + * + * Constant time append wrapper; the append function will always walk the list, + * this will take a tail argument and use the append function on just the tail + * node, doing the appropriate old-tail-next-pointer bookkeeping. + */ +static md_im_replica_info_t ** +replica_append_wrapper( + md_im_replica_info_t **tailpp, + int flags, + daddr32_t offset, + daddr32_t length, + md_timeval32_t timestamp +) +{ + (void) replica_append(tailpp, flags, offset, length, timestamp); + + if ((*tailpp)->mir_next == NULL) + return (tailpp); + + return (&(*tailpp)->mir_next); +} + +/* + * map_replica_disk() + * + * Searches the device id list for a specific + * disk based on the locator block device id array index. + * + * Returns a pointer to the did_list node if a match was + * found or NULL otherwise. + */ +static did_list_t * +map_replica_disk( + did_list_t *did_listp, + int did_index +) +{ + did_list_t *tailp = did_listp; + + while (tailp != NULL) { + if (tailp->did_index == did_index) + return (tailp); + tailp = tailp->next; + } + + /* not found, return failure */ + return (NULL); +} + +/* + * replicated_list_lookup() + * + * looks up a replicated disk entry in the global replicated disk list + * based upon the length of that disk's device id. returns the new device id + * for the disk. + * If you store the returned devid you must create a local copy. 
+ */
+static void *
+replicated_list_lookup(
+	uint_t	devid_len,
+	void	*old_devid
+)
+{
+	replicated_disk_t	*entryp;
+
+	assert(devid_len <= MAX_DEVID_LEN);
+
+	/* only the bucket for this devid length needs to be searched */
+	for (entryp = replicated_disk_list[devid_len]; entryp != NULL;
+	    entryp = entryp->next) {
+		if (devid_compare((ddi_devid_t)old_devid,
+		    (ddi_devid_t)entryp->old_devid) == 0)
+			return (entryp->new_devid);
+	}
+
+	return (NULL);
+}
+
+/*
+ * replicated_list_insert()
+ *
+ * inserts a replicated disk entry into the global replicated disk list
+ */
+static void
+replicated_list_insert(
+	size_t	old_devid_len,
+	void	*old_devid,
+	void	*new_devid
+)
+{
+	replicated_disk_t	**bucketpp;
+	replicated_disk_t	*entryp;
+	void			*old_copy;
+
+	assert(old_devid_len <= MAX_DEVID_LEN);
+
+	/* the entry owns a private copy of the old devid */
+	entryp = Zalloc(sizeof (replicated_disk_t));
+	old_copy = Zalloc(old_devid_len);
+	(void) memcpy(old_copy, (void *)old_devid, old_devid_len);
+
+	entryp->old_devid = old_copy;
+	entryp->new_devid = new_devid;
+
+	/*
+	 * Push onto the front of the bucket for this devid length.  For an
+	 * empty bucket the next pointer simply remains NULL.
+	 */
+	bucketpp = &replicated_disk_list[old_devid_len];
+	if (*bucketpp != NULL)
+		entryp->next = *bucketpp;
+	*bucketpp = entryp;
+}
+
+/*
+ * get_replica_disks()
+ *
+ * Will step through the locator records in the supplied locator block, and add
+ * each one with an active replica to a supplied list of md_im_drive_info_t, and
+ * add the appropriate replicas to the md_im_replica_info_t contained therein.
+ */
+static void
+get_replica_disks(
+	md_im_set_desc_t	*misp,
+	did_list_t		*did_listp,
+	mddb_mb_t		*mb,
+	mddb_lb_t		*lbp,
+	md_error_t		*ep,
+	int			replicated
+)
+{
+	mddrivename_t		*dnp;
+	int			indx, on_list;
+	mdsetname_t		*sp = metasetname(MD_LOCAL_NAME, ep);
+	int			flags;
+	int			devid_sz;
+	char			*minor_name;
+	did_list_t		*replica_disk;
+	daddr32_t		offset;
+	daddr32_t		length;
+	md_timeval32_t		timestamp;
+	md_im_replica_info_t	**mirpp = NULL;
+	md_im_drive_info_t	**midpp = &misp->mis_drives;
+	md_im_drive_info_t	*midp;
+	void			*did;
+
+	for (indx = 0; indx < lbp->lb_loccnt; indx++) {
+
+		on_list = 0;
+		if (lbp->lb_locators[indx].l_flags & MDDB_F_ACTIVE) {
+
+			/*
+			 * search the device id list for a
+			 * specific ctds based on the locator
+			 * block device id array index.
+			 */
+			replica_disk = map_replica_disk(did_listp, indx);
+
+			assert(replica_disk != NULL);
+
+
+			/*
+			 * metadrivename() can fail for a slice name
+			 * if there is not an existing mddrivename_t.
+			 * So we use metadiskname() to strip the slice
+			 * number.
+			 */
+			dnp = metadrivename(&sp,
+			    metadiskname(replica_disk->devname), ep);
+
+			/* is this drive already on the set's drive list? */
+			for (midp = misp->mis_drives; midp != NULL;
+			    midp = midp->mid_next) {
+				if (dnp == midp->mid_dnp) {
+					on_list = 1;
+					mirpp = &midp->mid_replicas;
+					break;
+				}
+			}
+
+			/*
+			 * Get the correct devid_sz
+			 */
+			if (replicated)
+				did = replica_disk->rdid;
+			else
+				did = replica_disk->did;
+
+			devid_sz = devid_sizeof((ddi_devid_t)did);
+			minor_name = replica_disk->minor_name;
+
+			/*
+			 * New on the list so add it
+			 */
+			if (!on_list) {
+				mddb_mb_t	*mbp;
+				uint_t		sliceno;
+				mdname_t	*rsp;
+				int		fd = -1;
+
+				mbp = Malloc(DEV_BSIZE);
+
+				/* determine the replica slice */
+				if (meta_replicaslice(dnp, &sliceno,
+				    ep) != 0) {
+					Free(mbp);
+					continue;
+				}
+
+				/*
+				 * if the replica slice size is zero,
+				 * don't bother opening
+				 */
+				if (dnp->vtoc.parts[sliceno].size == 0) {
+					Free(mbp);
+					continue;
+				}
+
+				if ((rsp = metaslicename(dnp, sliceno,
+				    ep)) == NULL) {
+					Free(mbp);
+					continue;
+				}
+
+				if ((fd = open(rsp->rname,
+				    O_RDONLY| O_NDELAY)) < 0) {
+					Free(mbp);
+					continue;
+				}
+
+				/*
+				 * a drive may not have a master block
+				 */
+				if (read_master_block(ep, fd, mbp,
+				    DEV_BSIZE) <= 0) {
+					mdclrerror(ep);
+					Free(mbp);
+					(void) close(fd);
+					continue;
+				}
+
+				(void) close(fd);
+				midpp = drive_append_wrapper(midpp, dnp,
+				    replica_disk->did, replica_disk->rdid,
+				    devid_sz, minor_name, mbp->mb_setcreatetime,
+				    NULL);
+				/* replicas go on the drive just added */
+				mirpp = &((*midpp)->mid_replicas);
+				Free(mbp);
+			}
+
+			/*
+			 * For either of these assertions to fail, it implies
+			 * a NULL return from metadrivename() above.  Since
+			 * the args came from a presumed valid locator block,
+			 * that's Bad.
+			 */
+			assert(midpp != NULL);
+			assert(mirpp != NULL);
+
+			/*
+			 * Extract the parameters describing this replica.
+			 *
+			 * The magic "1" in the length calculation accounts
+			 * for the length of the master block, in addition to
+			 * the block count it describes.  (The master block
+			 * will always take up one block on the disk, and
+			 * there will always only be one master block per
+			 * replica, even though much of the code is structured
+			 * to handle noncontiguous replicas.)
+			 */
+			flags = lbp->lb_locators[indx].l_flags;
+			offset = lbp->lb_locators[indx].l_blkno;
+			length = mb->mb_blkcnt + 1;
+			timestamp = mb->mb_setcreatetime;
+
+			mirpp = replica_append_wrapper(mirpp, flags,
+			    offset, length, timestamp);
+
+			/*
+			 * If we're here it means -
+			 *
+			 * a) we had an active copy of the replica, and
+			 * b) we've added the disk to the list of
+			 *    disks as well.
+			 *
+			 * We need to bump up the number of active
+			 * replica count for each such replica so that it
+			 * can be used later for replica quorum check.
+			 */
+			misp->mis_active_replicas++;
+		}
+	}
+}
+
+
+
+/*
+ * read_drive_setcreatetime()
+ *
+ * Reads the master block from the replica slice of the given drive and
+ * stores its set creation time in *timestampp.
+ *
+ * Returns 0 on success.  Returns -1, leaving *timestampp untouched, when
+ * the replica slice cannot be determined, has zero size, cannot be named
+ * or opened, or has no readable master block (in which last case the
+ * error in ep is cleared).
+ */
+static int
+read_drive_setcreatetime(
+	mddrivename_t	*dnp,
+	md_timeval32_t	*timestampp,
+	md_error_t	*ep
+)
+{
+	mddb_mb_t	*mbp;
+	uint_t		sliceno;
+	mdname_t	*rsp;
+	int		fd;
+
+	/* determine the replica slice */
+	if (meta_replicaslice(dnp, &sliceno, ep) != 0)
+		return (-1);
+
+	/* if the replica slice size is zero, don't bother opening */
+	if (dnp->vtoc.parts[sliceno].size == 0)
+		return (-1);
+
+	if ((rsp = metaslicename(dnp, sliceno, ep)) == NULL)
+		return (-1);
+
+	if ((fd = open(rsp->rname, O_RDONLY| O_NDELAY)) < 0)
+		return (-1);
+
+	mbp = Malloc(DEV_BSIZE);
+
+	/* a drive may not have a master block */
+	if (read_master_block(ep, fd, mbp, DEV_BSIZE) <= 0) {
+		mdclrerror(ep);
+		Free(mbp);
+		(void) close(fd);
+		return (-1);
+	}
+
+	(void) close(fd);
+	*timestampp = mbp->mb_setcreatetime;
+	Free(mbp);
+	return (0);
+}
+
+
+
+/*
+ * get_nonreplica_disks()
+ *
+ * Extracts the disks without replicas from the locator name space and adds them
+ * to the supplied list of md_im_drive_info_t.
+ */
+static void
+get_nonreplica_disks(
+	md_im_set_desc_t	*misp,
+	mddb_rb_t		*did_nm,
+	mddb_rb_t		*did_shrnm,
+	md_error_t		*ep,
+	int			replicated
+)
+{
+	char			*search_path = "/dev";
+	devid_nmlist_t		*nmlist;
+	md_im_drive_info_t	*midp, **midpp = &misp->mis_drives;
+	mddrivename_t		*dnp;
+	mdsetname_t		*sp = metasetname(MD_LOCAL_NAME, ep);
+	mddb_rb_t		*rbp_did = did_nm;
+	mddb_rb_t		*rbp_did_shr = did_shrnm;
+	int			on_list = 0;
+	int			devid_sz;
+	struct devid_min_rec	*did_rec;
+	struct devid_shr_rec	*did_shr_rec;
+	struct did_shr_name	*did;
+	struct did_min_name	*min;
+	void			*r_did;	/* NULL if not a replicated diskset */
+	void			*valid_did;
+	md_timeval32_t		timestamp;
+
+	/*
+	 * We got a pointer to an mddb record, which we expect to contain a
+	 * name record; extract the pointer thereto.
+	 */
+	/* LINTED */
+	did_rec = (struct devid_min_rec *)((caddr_t)(&rbp_did->rb_data));
+	/* LINTED */
+	did_shr_rec = (struct devid_shr_rec *)
+	    ((caddr_t)(&rbp_did_shr->rb_data));
+
+	/*
+	 * Skip the nm_rec_hdr and iterate on the array of struct minor_name
+	 * at the end of the devid_min_rec
+	 */
+	for (min = &did_rec->minor_name[0]; min->min_devid_key != 0;
+	    /* LINTED */
+	    min = (struct did_min_name *)((char *)min + DID_NAMSIZ(min))) {
+
+		on_list = 0;
+		r_did = NULL;
+
+		/*
+		 * For a give DID_NM key, locate the corresponding device
+		 * id from DID_NM_SHR
+		 */
+		for (did = &did_shr_rec->device_id[0]; did->did_key != 0;
+		    /* LINTED */
+		    did = (struct did_shr_name *)
+		    ((char *)did + DID_SHR_NAMSIZ(did))) {
+			/*
+			 * We got a match, this is the device id we're
+			 * looking for
+			 */
+			if (min->min_devid_key == did->did_key)
+				break;
+		}
+
+		if (did->did_key == 0) {
+			/* we didn't find a match */
+			assert(did->did_key != 0);
+			md_exit(NULL, 1);
+		}
+
+		/*
+		 * If replicated diskset
+		 */
+		if (replicated) {
+			size_t		new_devid_len;
+			char		*temp;
+			/*
+			 * In this case, did->did_devid will
+			 * be invalid so lookup the real one
+			 */
+			temp = replicated_list_lookup(did->did_size,
+			    did->did_devid);
+			/*
+			 * Without a replacement devid the disk cannot be
+			 * resolved; skip it rather than hand a NULL devid
+			 * to devid_sizeof().
+			 */
+			if (temp == NULL)
+				continue;
+			new_devid_len = devid_sizeof((ddi_devid_t)temp);
+			r_did = Zalloc(new_devid_len);
+			(void) memcpy(r_did, temp, new_devid_len);
+			valid_did = r_did;
+		} else {
+			valid_did = did->did_devid;
+		}
+
+		/* Get the ctds mapping for that device id */
+		if (meta_deviceid_to_nmlist(search_path,
+		    (ddi_devid_t)valid_did,
+		    &min->min_name[0], &nmlist) == 0) {
+
+			assert(nmlist->devname != NULL);
+			/* Don't bother with metadevices, but track disks */
+			if (!is_metaname(nmlist->devname)) {
+				dnp = metadrivename(&sp,
+				    metadiskname(nmlist->devname), ep);
+
+				assert(dnp != NULL);
+				/* Is it already on the list? */
+				for (midp = misp->mis_drives; midp != NULL;
+				    midp = midp->mid_next) {
+					if (midp->mid_dnp == dnp) {
+						on_list = 1;
+						break;
+					}
+				}
+
+				devid_sz = devid_sizeof(
+				    (ddi_devid_t)valid_did);
+
+				/*
+				 * A new drive is only added when its master
+				 * block can be read.  If it is replicated
+				 * diskset, r_did will be non-NULL and
+				 * devid_sz will be its size.
+				 */
+				if (!on_list &&
+				    read_drive_setcreatetime(dnp, &timestamp,
+				    ep) == 0) {
+					midpp = drive_append_wrapper(midpp,
+					    dnp, &did->did_devid, r_did,
+					    devid_sz, &min->min_name[0],
+					    timestamp, NULL);
+				}
+			}
+			/* released on every path, not only on success */
+			devid_free_nmlist(nmlist);
+		}
+
+		/*
+		 * drive_append() copies the devids it is given, so the
+		 * private copy of the replicated devid is finished with.
+		 */
+		if (r_did != NULL)
+			Free(r_did);
+	}
+}
+
+/*
+ * set_append()
+ *
+ * Append to tail of linked list of md_im_set_desc_t.
+ *
+ * Will allocate space for new node AND populate it by extracting disks with
+ * and without replicas from the locator blocks and locator namespace.
+ *
+ * Returns pointer to new node.
+ */ +static md_im_set_desc_t * +set_append( + md_im_set_desc_t **mispp, + did_list_t *did_listp, + mddb_mb_t *mb, + mddb_lb_t *lbp, + mddb_rb_t *nm, + mddb_rb_t *did_nm, + mddb_rb_t *did_shrnm, + md_error_t *ep, + int replicated +) +{ + md_im_set_desc_t *misp; + set_t setno = mb->mb_setno; + + /* run to end of list */ + for (; (*mispp != NULL); mispp = &((*mispp)->mis_next)) + ; + + /* allocate new list element */ + misp = *mispp = Zalloc(sizeof (md_im_set_desc_t)); + + if (replicated) + misp->mis_flags = MD_IM_SET_REPLICATED; + + misp->mis_oldsetno = setno; + + /* Get the disks with and without replicas */ + get_replica_disks(misp, did_listp, mb, lbp, ep, replicated); + + if (nm != NULL && did_nm != NULL && did_shrnm != NULL) { + get_nonreplica_disks(misp, did_nm, did_shrnm, ep, replicated); + } + + /* + * An error in this struct could come from either of the above routines; + * in both cases, we want to pass it back on up. + */ + return (misp); +} + + + +/* + * set_append_wrapper() + * + * Constant time append wrapper; the append function will always walk the list, + * this will take a tail argument and use the append function on just the tail + * node, doing the appropriate old-tail-next-pointer bookkeeping. + */ +static md_im_set_desc_t ** +set_append_wrapper( + md_im_set_desc_t **tailpp, + did_list_t *did_listp, + mddb_mb_t *mb, + mddb_lb_t *lbp, + mddb_rb_t *nm, + mddb_rb_t *did_nm, + mddb_rb_t *did_shrnm, + md_error_t *ep, + int replicated +) +{ + (void) set_append(tailpp, did_listp, mb, lbp, nm, did_nm, + did_shrnm, ep, replicated); + + /* it's the first item in the list, return it instead of the next */ + return (((*tailpp)->mis_next == NULL) ? tailpp : &(*tailpp)->mis_next); +} + + + +/* + * add_disk_names() + * + * Iterator to walk the minor node tree of the device snapshot, adding only the + * first non-block instance of each non-cdrom minor node to a list of disks. 
+ */ +static int +add_disk_names(di_node_t node, di_minor_t minor, void *args) +{ + char *search_path = "/dev"; + ddi_devid_t devid = di_devid(node); + devid_nmlist_t *nm; + char *min = di_minor_name(minor); + md_im_names_t *cnames = (md_im_names_t *)args; + static di_node_t save_node = NULL; + + /* + * skip CD devices + * If a device does not have a device id, we can't + * do anything with it so just exclude it from our + * list. + * + * This would also encompass CD devices and floppy + * devices that don't have a device id. + */ + if (devid == NULL) { + return (DI_WALK_CONTINUE); + } + + /* char disk devices (as opposed to block) */ + if (di_minor_spectype(minor) == S_IFCHR) { + + /* only first occurrence (slice 0) of each instance */ + if (save_node == NULL || node != save_node) { + save_node = node; + if (meta_deviceid_to_nmlist(search_path, devid, + min, &nm) == 0) { + int index = cnames->min_count++; + + assert(nm->devname != NULL); + cnames->min_names = + Realloc(cnames->min_names, + cnames->min_count * + sizeof (char *)); + + assert(cnames->min_names != NULL); + cnames->min_names[index] = + metadiskname(nm->devname); + devid_free_nmlist(nm); + } + } + } + return (DI_WALK_CONTINUE); +} + + + +/* + * meta_list_disks() + * + * Snapshots the device tree and extracts disk devices from the snapshot. + */ +int +meta_list_disks(md_error_t *ep, md_im_names_t *cnames) +{ + di_node_t root_node; + + assert(cnames != NULL); + cnames->min_count = 0; + cnames->min_names = NULL; + + if ((root_node = di_init("/", DINFOCPYALL|DINFOFORCE)) + == DI_NODE_NIL) { + return (mdsyserror(ep, errno, NULL)); + } + + (void) di_walk_minor(root_node, DDI_NT_BLOCK, 0, cnames, + add_disk_names); + + di_fini(root_node); + return (0); +} + +/* + * meta_imp_drvused + * + * Checks if given drive is mounted, swapped, part of disk configuration + * or in use by SVM. ep also has error code set up if drive is in use. + * + * Returns 1 if drive is in use. + * Returns 0 if drive is not in use. 
+ */ +int +meta_imp_drvused( + mdsetname_t *sp, + mddrivename_t *dnp, + md_error_t *ep +) +{ + md_error_t status = mdnullerror; + md_error_t *db_ep = &status; + + /* + * We pass in db_ep to meta_setup_db_locations + * and never ever use the error contained therein + * because all we're interested in is a check to + * see whether any local metadbs are present. + */ + if ((meta_check_drivemounted(sp, dnp, ep) != 0) || + (meta_check_driveswapped(sp, dnp, ep) != 0) || + (((meta_setup_db_locations(db_ep) == 0) && + ((meta_check_drive_inuse(sp, dnp, 1, ep) != 0) || + (meta_check_driveinset(sp, dnp, ep) != 0))))) { + return (1); + } else { + return (0); + } +} + +/* + * meta_prune_cnames() + * + * Removes in-use disks from the list prior to further processing. + * + * Return value depends on err_on_prune flag: if set, and one or more disks + * are pruned, the return list will be the pruned disks. If not set, or if no + * disks are pruned, the return list will be the unpruned disks. + */ +mddrivenamelist_t * +meta_prune_cnames( + md_error_t *ep, + md_im_names_t *cnames, + int err_on_prune +) +{ + int d; + int fcount = 0; + mddrivenamelist_t *dnlp = NULL; + mddrivenamelist_t **dnlpp = &dnlp; + mddrivenamelist_t *fdnlp = NULL; + mddrivenamelist_t **fdnlpp = &fdnlp; + mdsetname_t *sp = metasetname(MD_LOCAL_NAME, ep); + + for (d = 0; d < cnames->min_count; ++d) { + mddrivename_t *dnp; + + dnp = metadrivename(&sp, cnames->min_names[d], ep); + if (dnp == NULL) { + /* + * Assuming we're interested in knowing about + * whatever error occurred, but not in stopping. + */ + mde_perror(ep, cnames->min_names[d]); + mdclrerror(ep); + + continue; + } + + /* + * Check if the drive is inuse. 
+ */ + if (meta_imp_drvused(sp, dnp, ep)) { + fdnlpp = meta_drivenamelist_append_wrapper(fdnlpp, dnp); + fcount++; + mdclrerror(ep); + } else { + dnlpp = meta_drivenamelist_append_wrapper(dnlpp, dnp); + } + } + + if (fcount) { + if (err_on_prune) { + (void) mddserror(ep, MDE_DS_DRIVEINUSE, 0, + NULL, fdnlp->drivenamep->cname, NULL); + metafreedrivenamelist(dnlp); + return (fdnlp); + } + metafreedrivenamelist(fdnlp); + } + + return (dnlp); +} + +/* + * read_master_block() + * + * Returns: + * < 0 for failure + * 0 for no valid master block + * 1 for valid master block + * + * The supplied buffer will be filled in for EITHER 0 or 1. + */ +int +read_master_block( + md_error_t *ep, + int fd, + void *bp, + int bsize +) +{ + mddb_mb_t *mbp = bp; + int rval = 1; + + assert(bp != NULL); + + if (lseek(fd, (off_t)dbtob(16), SEEK_SET) < 0) + return (mdsyserror(ep, errno, NULL)); + + if (read(fd, bp, bsize) != bsize) + return (mdsyserror(ep, errno, NULL)); + + /* + * The master block magic number can either be MDDB_MAGIC_MB in + * the case of a real master block, or, it can be MDDB_MAGIC_DU + * in the case of a dummy master block + */ + if ((mbp->mb_magic != MDDB_MAGIC_MB) && + (mbp->mb_magic != MDDB_MAGIC_DU)) { + rval = 0; + (void) mdmddberror(ep, MDE_DB_MASTER, 0, 0, 0, NULL); + } + + if (mbp->mb_revision != MDDB_REV_MB) { + rval = 0; + } + + return (rval); +} + +/* + * read_locator_block() + * + * Returns: + * < 0 for failure + * 0 for no valid locator block + * 1 for valid locator block + */ +int +read_locator_block( + md_error_t *ep, + int fd, + mddb_mb_t *mbp, + void *bp, + int bsize +) +{ + mddb_lb_t *lbp = bp; + + assert(bp != NULL); + + if (lseek(fd, (off_t)dbtob(mbp->mb_blkmap.m_firstblk), SEEK_SET) < 0) + return (mdsyserror(ep, errno, NULL)); + + if (read(fd, bp, bsize) != bsize) + return (mdsyserror(ep, errno, NULL)); + + return ((lbp->lb_magic == MDDB_MAGIC_LB) ? 
1 : 0); +} + +int +phys_read( + md_error_t *ep, + int fd, + mddb_mb_t *mbp, + daddr_t blk, + void *bp, + int bcount +) +{ + daddr_t pblk; + + if ((pblk = getphysblk(blk, mbp)) < 0) + return (mdmddberror(ep, MDE_DB_BLKRANGE, NODEV32, + MD_LOCAL_SET, blk, NULL)); + + if (lseek(fd, (off_t)dbtob(pblk), SEEK_SET) < 0) + return (mdsyserror(ep, errno, NULL)); + + if (read(fd, bp, bcount) != bcount) + return (mdsyserror(ep, errno, NULL)); + + return (bcount); +} + +/* + * read_locator_block_did() + * + * Returns: + * < 0 for failure + * 0 for no valid locator name struct + * 1 for valid locator name struct + */ +int +read_locator_block_did( + md_error_t *ep, + int fd, + mddb_mb_t *mbp, + mddb_lb_t *lbp, + void *bp, + int bsize +) +{ + int lb_didfirstblk = lbp->lb_didfirstblk; + mddb_did_blk_t *lbdidp = bp; + int rval; + + assert(bp != NULL); + + if ((rval = phys_read(ep, fd, mbp, lb_didfirstblk, bp, bsize)) < 0) + return (rval); + + return ((lbdidp->blk_magic == MDDB_MAGIC_DI) ? 1 : 0); +} + +/* + * read_locator_names() + * + * Returns: + * < 0 for failure + * 0 for no valid locator name struct + * 1 for valid locator name struct + */ +int +read_locator_names( + md_error_t *ep, + int fd, + mddb_mb_t *mbp, + mddb_lb_t *lbp, + void *bp, + int bsize +) +{ + int lnfirstblk = lbp->lb_lnfirstblk; + mddb_ln_t *lnp = bp; + int rval; + + assert(bp != NULL); + + if ((rval = phys_read(ep, fd, mbp, lnfirstblk, bp, bsize)) < 0) + return (rval); + + return ((lnp->ln_magic == MDDB_MAGIC_LN) ? 1 : 0); +} + + +int +read_database_block( + md_error_t *ep, + int fd, + mddb_mb_t *mbp, + int dbblk, + void *bp, + int bsize +) +{ + mddb_db_t *dbp = bp; + int rval; + + assert(bp != NULL); + + if ((rval = phys_read(ep, fd, mbp, dbblk, bp, bsize)) < 0) + return (rval); + + return ((dbp->db_magic == MDDB_MAGIC_DB) ? 
1 : 0); +} + +int +read_loc_didblks( + md_error_t *ep, + int fd, + mddb_mb_t *mbp, + int didblk, + void *bp, + int bsize +) +{ + mddb_did_blk_t *didbp = bp; + int rval; + + assert(bp != NULL); + + if ((rval = phys_read(ep, fd, mbp, didblk, bp, bsize)) < 0) + return (rval); + + return ((didbp->blk_magic == MDDB_MAGIC_DI) ? 1 : 0); +} + + +int +read_loc_didinfo( + md_error_t *ep, + int fd, + mddb_mb_t *mbp, + int infoblk, + void *bp, + int bsize +) +{ + int rval = 1; + mddb_did_info_t *infop = bp; + + assert(bp != NULL); + + if ((rval = phys_read(ep, fd, mbp, infoblk, bp, bsize)) < 0) + return (rval); + + return ((infop->info_flags & MDDB_DID_EXISTS) ? 1 : 0); +} + +/* + * meta_nm_rec() + * + * Return the DE corresponding to the requested namespace record type. + * Modifies dbp to have a firstentry if one isn't there. + */ +static mddb_de_t * +meta_nm_rec(mddb_db_t *dbp, mddb_type_t rectype) +{ + mddb_de_t *dep; + int desize; + + if (dbp->db_firstentry != NULL) { + /* LINTED */ + dep = (mddb_de_t *)((caddr_t)(&dbp->db_firstentry) + + sizeof (dbp->db_firstentry)); + dbp->db_firstentry = dep; + while (dep && dep->de_next) { + desize = sizeof (*dep) - sizeof (dep->de_blks) + + sizeof (daddr_t) * dep->de_blkcount; + /* LINTED */ + dep->de_next = (mddb_de_t *) + ((caddr_t)dep + desize); + dep = dep->de_next; + } + } + + for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next) { + if (dep->de_type1 == rectype) + break; + } + return (dep); +} + +/* + * read_nm_rec() + * + * Reads the NM, NM_DID or NM_DID_SHR record in the mddb and stores the + * configuration data in the buffer 'nm' + * + * Returns: + * < 0 for failure + * 0 for no valid NM/DID_NM/DID_NM_SHR record + * 1 for valid NM/DID_NM/DID_NM_SHR record + * + */ +static int +read_nm_rec( + md_error_t *ep, + int fd, + mddb_mb_t *mbp, + mddb_lb_t *lbp, + char **nm, + mddb_type_t rectype, + char *diskname +) +{ + int cnt, dbblk, rval = 0; + char db[DEV_BSIZE]; + mddb_de_t *dep; + /*LINTED*/ + mddb_db_t *dbp = 
(mddb_db_t *)&db; + char *tmpnm = NULL; + daddr_t pblk; + + for (dbblk = lbp->lb_dbfirstblk; + dbblk != 0; + dbblk = dbp->db_nextblk) { + + if ((rval = read_database_block(ep, fd, mbp, dbblk, dbp, + sizeof (db))) <= 0) + return (rval); + + /* + * Locate NM/DID_NM/DID_NM_SHR record. Normally there is + * only one record per mddb. There is a rare case when we + * can't expand the record. If this is the case then we + * will have multiple NM/DID_NM/DID_NM_SHR records linked + * with r_next_recid. + * + * For now assume the normal case and handle the extended + * namespace in Phase 2. + */ + if ((dep = meta_nm_rec(dbp, rectype)) != NULL) + break; + } + + /* If meta_nm_rec() never succeeded, bail out */ + if (dep == NULL) + return (0); + + /* Read in the appropriate record and return configurations */ + tmpnm = (char *)Zalloc(dbtob(dep->de_blkcount)); + *nm = tmpnm; + + for (cnt = 0; cnt < dep->de_blkcount; cnt++) { + if ((pblk = getphysblk(dep->de_blks[cnt], mbp)) < 0) { + rval = mdmddberror(ep, MDE_DB_BLKRANGE, + NODEV32, MD_LOCAL_SET, + dep->de_blks[cnt], diskname); + return (rval); + } + + if (lseek(fd, (off_t)dbtob(pblk), SEEK_SET) < 0) { + rval = mdsyserror(ep, errno, diskname); + return (rval); + } + + if (read(fd, tmpnm, DEV_BSIZE) != DEV_BSIZE) { + rval = mdsyserror(ep, errno, diskname); + return (rval); + } + + tmpnm += DEV_BSIZE; + } + return (1); +} + +/* + * is_replicated + * + * Determines whether a disk has been replicated or not. It checks to see + * if the device id stored in the master block is the same as the device id + * registered for that disk on the current system. If the two device ids are + * different, then we know that the disk has been replicated. + * + * If need_devid is set and the disk is replicated, fill in the new_devid. + * Also, if need_devid is set, this routine allocates memory for the device + * ids; the caller of this routine is responsible for free'ing up the memory. 
+ * + * Returns: + * 1 if it's a replicated disk + * 0 if it's not a replicated disk + */ +static int +is_replicated( + int fd, + mddb_mb_t *mbp, + int need_devid, + void **new_devid +) +{ + ddi_devid_t current_devid; + int retval = 0; + size_t new_devid_len; + + if (mbp->mb_devid_magic != MDDB_MAGIC_DE) + return (retval); + + if (devid_get(fd, ¤t_devid) != 0) + return (retval); + + if (devid_compare((ddi_devid_t)mbp->mb_devid, current_devid) != 0) + retval = 1; + + if (retval && need_devid) { + new_devid_len = devid_sizeof(current_devid); + *new_devid = Zalloc(new_devid_len); + (void) memcpy(*new_devid, (void *)current_devid, new_devid_len); + } + + devid_free(current_devid); + return (retval); +} + +/* + * free_replicated_disks_list() + * + * this frees up all the memory allocated by build_replicated_disks_list + */ +static void +free_replicated_disks_list() +{ + replicated_disk_t **repl_disk, *temp; + int index; + + for (index = 0; index <= MAX_DEVID_LEN; index++) { + repl_disk = &replicated_disk_list[index]; + + while (*repl_disk != NULL) { + temp = *repl_disk; + *repl_disk = (*repl_disk)->next; + + Free(temp->old_devid); + Free(temp->new_devid); + Free(temp); + } + } +} + +/* + * build_replicated_disks_list() + * + * Builds a list of disks that have been replicated using either a + * remote replication or a point-in-time replication software. The + * list is stored as a two dimensional sparse array. 
+ * + * Returns + * 1 on success + * 0 on failure + */ +static int +build_replicated_disks_list( + md_error_t *ep, + mddrivenamelist_t *dnlp +) +{ + uint_t sliceno; + int fd = -1; + mddrivenamelist_t *dp; + mdname_t *rsp; + mddb_mb_t *mbp; + + mbp = Malloc(DEV_BSIZE); + + for (dp = dnlp; dp != NULL; dp = dp->next) { + mddrivename_t *dnp; + void *new_devid; + + dnp = dp->drivenamep; + /* determine the replica slice */ + if (meta_replicaslice(dnp, &sliceno, ep) != 0) + continue; + + /* + * if the replica slice size is zero, don't bother opening + */ + if (dnp->vtoc.parts[sliceno].size == 0) + continue; + + if ((rsp = metaslicename(dnp, sliceno, ep)) == NULL) + continue; + + if ((fd = open(rsp->rname, O_RDONLY| O_NDELAY)) < 0) + return (mdsyserror(ep, errno, rsp->rname)); + + /* a drive may not have a master block so we just continue */ + if (read_master_block(ep, fd, mbp, DEV_BSIZE) <= 0) { + (void) close(fd); + mdclrerror(ep); + continue; + } + + if (is_replicated(fd, mbp, 1, &new_devid)) { + replicated_list_insert(mbp->mb_devid_len, + mbp->mb_devid, new_devid); + } + (void) close(fd); + } + replicated_disk_list_built = 1; + + Free(mbp); + return (1); +} + +/* + * free_did_list() + * + * Frees the did_list allocated as part of build_did_list + */ +static void +free_did_list( + did_list_t *did_listp +) +{ + did_list_t *temp, *head; + + head = did_listp; + + while (head != NULL) { + temp = head; + head = head->next; + if (temp->rdid) + Free(temp->rdid); + if (temp->did) + Free(temp->did); + if (temp->devname) + Free(temp->devname); + if (temp->minor_name) + Free(temp->minor_name); + Free(temp); + } +} + +/* + * build_did_list() + * + * Build a list of device ids corresponding to disks in the locator block. + * Memory is allocated here for the nodes in the did_list. The callers of + * this routine must also call free_did_list to free up the memory after + * they're done. 
+ *
+ * Returns:
+ *	< 0		for failure
+ *	  0		for no valid locator block device id array
+ *	  1		for valid locator block device id array
+ *	  ENOTSUP	partial diskset, not all disks in a diskset on the
+ *			system where import is being executed
+ *
+ * On any return other than 1 nothing is left allocated and *did_listp
+ * is set to NULL.
+ */
+static int
+build_did_list(
+	md_error_t	*ep,
+	int		fd,
+	mddb_mb_t	*mb,
+	mddb_did_blk_t	*lbdidp,
+	did_list_t	**did_listp,
+	int		replicated
+)
+{
+	char		*search_path = "/dev";
+	char		*minor_name;
+	int		rval, cnt;
+	devid_nmlist_t	*nm;
+	uint_t		did_info_length = 0;
+	uint_t		did_info_firstblk = 0;
+	did_list_t	*new = NULL, *head = NULL;
+	char		*bp = NULL, *temp;
+	mddb_did_info_t	*did_info = NULL;
+	void		*did = NULL;
+	size_t		new_devid_len;
+
+	for (cnt = 0; cnt < MDDB_NLB; cnt++) {
+		did_info = &lbdidp->blk_info[cnt];
+
+		if (!(did_info->info_flags & MDDB_DID_EXISTS))
+			continue;
+
+		new = Zalloc(sizeof (did_list_t));
+		new->did = Zalloc(did_info->info_length);
+
+		/*
+		 * If the buffer that has already been read in covers
+		 * this device id, just use it.  Otherwise free the
+		 * previous one and alloc a new one.
+		 *
+		 * The buffer can only be re-used when BOTH its first
+		 * block and its length match, so it is re-read when
+		 * EITHER of them differs.
+		 */
+		if (dbtob(did_info->info_blkcnt) != did_info_length ||
+		    did_info->info_firstblk != did_info_firstblk) {
+
+			did_info_length = dbtob(did_info->info_blkcnt);
+			did_info_firstblk = did_info->info_firstblk;
+
+			if (bp)
+				Free(bp);
+			bp = temp = Zalloc(did_info_length);
+
+			if ((rval = phys_read(ep, fd, mb, did_info_firstblk,
+			    (void *)bp, did_info_length)) < 0)
+				goto fail;
+		} else {
+			temp = bp;
+		}
+
+		temp += did_info->info_offset;
+		(void) memcpy(new->did, temp, did_info->info_length);
+		new->did_index = cnt;
+		minor_name = did_info->info_minor_name;
+
+		/*
+		 * If we are not able to find the ctd mapping corresponding
+		 * to a given device id, it probably means the device id in
+		 * question is not registered with the system.
+		 *
+		 * Highly likely that the only time this happens, we've hit
+		 * a case where not all the disks that are a part of the
+		 * diskset were moved before importing the diskset.
+		 *
+		 * If set is a replicated diskset, then the device id we get
+		 * from 'lb' will be the 'other' did and we need to lookup
+		 * the real one before we call this routine.
+		 */
+		if (replicated) {
+			/*
+			 * NOTE(review): the lookup result is used unchecked;
+			 * confirm replicated_list_lookup() cannot return NULL
+			 * for a device id that is missing from the list.
+			 */
+			temp = replicated_list_lookup(did_info->info_length,
+			    new->did);
+			new_devid_len = devid_sizeof((ddi_devid_t)temp);
+			new->rdid = Zalloc(new_devid_len);
+			(void) memcpy(new->rdid, temp, new_devid_len);
+			did = new->rdid;
+		} else {
+			did = new->did;
+		}
+
+		if (devid_valid((ddi_devid_t)(did)) == 0) {
+			rval = -1;
+			goto fail;
+		}
+
+		if (meta_deviceid_to_nmlist(search_path,
+		    (ddi_devid_t)did, minor_name, &nm) != 0) {
+			(void) mddserror(ep, MDE_DS_PARTIALSET, MD_SET_BAD,
+			    mynode(), NULL, NULL);
+			rval = ENOTSUP;
+			goto fail;
+		}
+
+		assert(nm->devname != NULL);
+		new->devname = Strdup(nm->devname);
+		new->dev = nm->dev;
+		new->minor_name = Strdup(minor_name);
+
+		devid_free_nmlist(nm);
+
+		/* the list is built by pushing each new node on the front */
+		new->next = head;
+		head = new;
+	}
+
+	/* Free the last bp */
+	if (bp)
+		Free(bp);
+	*did_listp = head;
+	return (1);
+
+fail:
+	/*
+	 * Give back the node under construction together with everything
+	 * already on the list, and the read buffer; none of these were
+	 * released on the error paths before.
+	 */
+	new->next = head;
+	free_did_list(new);
+	if (bp)
+		Free(bp);
+	*did_listp = NULL;
+	return (rval);
+}
+
+/*
+ * meta_get_set_info
+ *
+ * Scans a given drive for set specific information. If the given drive
+ * has a shared metadb, scans the shared metadb for information pertaining
+ * to the set.
+ * + * Returns: + * <0 for failure + * 0 success but no replicas were found + * 1 success and a replica was found + * ENOTSUP for partial disksets detected + */ +int +meta_get_set_info( + mddrivenamelist_t *dp, + md_im_set_desc_t **mispp, + int local_mb_ok, + md_error_t *ep +) +{ + uint_t s; + mdname_t *rsp; + int fd; + char mb[DEV_BSIZE]; + /*LINTED*/ + mddb_mb_t *mbp = (mddb_mb_t *)mb; + char lb[dbtob(MDDB_LBCNT)]; + /*LINTED*/ + mddb_lb_t *lbp = (mddb_lb_t *)lb; + mddb_did_blk_t *lbdidp = NULL; + mddb_ln_t *lnp = NULL; + int lnsize, lbdid_size; + int rval = 0; + char db[DEV_BSIZE]; + /*LINTED*/ + mddb_db_t *dbp = (mddb_db_t *)db; + did_list_t *did_listp = NULL; + mddrivenamelist_t *dnlp; + mddrivename_t *dnp; + md_im_names_t cnames = { 0, NULL}; + char *nm = NULL; + char *did_nm = NULL, *did_shrnm = NULL; + struct nm_rec *nmp; + struct devid_shr_rec *did_shrnmp; + struct devid_min_rec *did_nmp; + int extended_namespace = 0; + int replicated = 0; + + dnp = dp->drivenamep; + + /* + * Determine and open the replica slice + */ + if (meta_replicaslice(dnp, &s, ep) != 0) { + return (-1); + } + + /* + * Test for the size of replica slice in question. If + * the size is zero, we know that this is not a disk that was + * part of a set and it should be silently ignored for import. + */ + if (dnp->vtoc.parts[s].size == 0) + return (0); + + if ((rsp = metaslicename(dnp, s, ep)) == NULL) { + return (-1); + } + + if ((fd = open(rsp->rname, O_RDONLY|O_NDELAY)) < 0) + return (mdsyserror(ep, errno, rsp->cname)); + + /* + * After the open() succeeds, we should return via the "out" + * label to clean up after ourselves. (Up 'til now, we can + * just return directly, because there are no resources to + * give back.) 
+ */ + + if ((rval = read_master_block(ep, fd, mbp, sizeof (mb))) <= 0) + goto out; + + replicated = is_replicated(fd, mbp, 0, NULL); + + if (!local_mb_ok && mbp->mb_setno == 0) { + rval = 0; + goto out; + } + + if ((rval = read_locator_block(ep, fd, mbp, lbp, sizeof (lb))) <= 0) + goto out; + + /* + * Once the locator block has been read, we need to + * check if the locator block commit count is zero. + * If it is zero, we know that the replica we're dealing + * with is on a disk that was deleted from the disk set; + * and, it potentially has stale data. We need to quit + * in that case + */ + if (lbp->lb_commitcnt == 0) { + rval = 0; + goto out; + } + + /* + * Make sure that the disk being imported has device id + * namespace present for disksets. If a disk doesn't have + * device id namespace, we skip reading the replica on that disk + */ + if (!(lbp->lb_flags & MDDB_DEVID_STYLE)) { + rval = 0; + goto out; + } + + /* + * Grab the locator block device id array. Allocate memory for the + * array first. + */ + lbdid_size = dbtob(lbp->lb_didblkcnt); + lbdidp = Zalloc(lbdid_size); + + if ((rval = read_locator_block_did(ep, fd, mbp, lbp, lbdidp, + lbdid_size)) <= 0) + goto out; + + /* + * For a disk that has not been replicated, extract the device ids + * stored in the locator block device id array and store them in + * a list. + * + * If the disk has been replicated using replication software such + * as HDS Truecopy/ShadowImage or EMC SRDF/BCV, the device ids in + * the locator block are invalid and we need to build a list of + * replicated disks. 
+ */ + if (replicated && !replicated_disk_list_built) { + /* + * if there's a replicated diskset involved, we need to + * scan the system one more time and build a list of all + * candidate disks that might be part of that replicated set + */ + if (meta_list_disks(ep, &cnames) != 0) { + rval = 0; + goto out; + } + dnlp = meta_prune_cnames(ep, &cnames, 0); + rval = build_replicated_disks_list(ep, dnlp); + if (rval == 0) + goto out; + } + + rval = build_did_list(ep, fd, mbp, lbdidp, &did_listp, replicated); + + if ((rval <= 0) || (rval == ENOTSUP)) + goto out; + + /* + * Until here, we've gotten away with fixed sizes for the + * master block and locator block. The locator names, + * however, are sized (and therefore allocated) dynamically + * according to information in the locator block. + */ + lnsize = dbtob(lbp->lb_lnblkcnt); + lnp = Zalloc(lnsize); + + if ((rval = read_locator_names(ep, fd, mbp, lbp, lnp, lnsize)) <= 0) + goto out; + + /* + * Read in the NM record + * If no NM record was found, it still is a valid configuration + * but it also means that we won't find any corresponding DID_NM + * or DID_SHR_NM. + */ + if ((rval = read_nm_rec(ep, fd, mbp, lbp, &nm, MDDB_NM, rsp->cname)) + < 0) + goto out; + else if (rval == 0) + goto append; + + /* + * At this point, we have read in all of the blocks that form + * the nm_rec. We should at least detect the corner case + * mentioned above, in which r_next_recid links to another + * nm_rec. Extended namespace handling is left for Phase 2. + * + * What this should really be is a loop, each iteration of + * which reads in a nm_rec and calls the set_append_wrapper(). 
+ */ + /*LINTED*/ + nmp = (struct nm_rec *)(nm + sizeof (mddb_rb_t)); + if (nmp->r_rec_hdr.r_next_recid != (mddb_recid_t)0) { + extended_namespace = 1; + rval = 0; + goto out; + } + + if ((rval = read_nm_rec(ep, fd, mbp, lbp, &did_nm, + MDDB_DID_NM, rsp->cname)) < 0) + goto out; + else if (rval == 0) + goto append; + + /*LINTED*/ + did_nmp = (struct devid_min_rec *)(did_nm + sizeof (mddb_rb_t)); + if (did_nmp->min_rec_hdr.r_next_recid != (mddb_recid_t)0) { + extended_namespace = 1; + rval = 0; + goto out; + } + + if ((rval = read_nm_rec(ep, fd, mbp, lbp, &did_shrnm, + MDDB_DID_SHR_NM, rsp->cname)) < 0) + goto out; + else if (rval == 0) + goto append; + + /*LINTED*/ + did_shrnmp = (struct devid_shr_rec *)(did_shrnm + sizeof (mddb_rb_t)); + if (did_shrnmp->did_rec_hdr.r_next_recid != (mddb_recid_t)0) { + extended_namespace = 1; + rval = 0; + goto out; + } + +append: + /* Finally, we've got what we need to process this replica. */ + mispp = set_append_wrapper(mispp, did_listp, mbp, lbp, + /*LINTED*/ + (mddb_rb_t *)nm, (mddb_rb_t *)did_nm, (mddb_rb_t *)did_shrnm, + ep, replicated); + + /* Return the fact that we found at least one set */ + rval = 1; + +out: + if (fd >= 0) + (void) close(fd); + if (did_listp != NULL) + free_did_list(did_listp); + if (lnp != NULL) + Free(lnp); + if (nm != NULL) + Free(nm); + if (did_nm != NULL) + Free(did_nm); + if (did_shrnm != NULL) + Free(did_shrnm); + + /* + * If we are at the end of the list, we must free up + * the replicated list too + */ + if (dp->next == NULL) + free_replicated_disks_list(); + + if (extended_namespace) + return (mddserror(ep, MDE_DS_EXTENDEDNM, MD_SET_BAD, + mynode(), NULL, NULL)); + + return (rval); +} + +/* + * Return the minor name associated with a given disk slice + */ +static char * +meta_getminor_name( + char *devname, + md_error_t *ep +) +{ + int fd = -1; + char *minor_name = NULL; + char *ret_minor_name = NULL; + + if (devname == NULL) + return (NULL); + + if ((fd = open(devname, O_RDONLY|O_NDELAY, 0)) 
< 0) { + (void) mdsyserror(ep, errno, devname); + return (NULL); + } + + if (devid_get_minor_name(fd, &minor_name) == 0) { + ret_minor_name = Strdup(minor_name); + devid_str_free(minor_name); + } + + (void) close(fd); + return (ret_minor_name); +} + +static int +meta_replica_quorum( + md_im_set_desc_t *misp, + md_error_t *ep +) +{ + md_im_drive_info_t *midp; + mddrivename_t *dnp; + md_im_replica_info_t *midr; + mdname_t *np; + struct stat st_buf; + uint_t rep_slice; + int replica_count = 0; + + for (midp = misp->mis_drives; midp != NULL; + midp = midp->mid_next) { + + dnp = midp->mid_dnp; + + if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) || + ((np = metaslicename(dnp, rep_slice, ep)) + == NULL)) { + mdclrerror(ep); + continue; + } + + if (stat(np->bname, &st_buf) != 0) + continue; + + /* + * The drive is okay now count its replicas + */ + for (midr = midp->mid_replicas; midr != NULL; + midr = midr->mir_next) { + replica_count++; + } + } + + if (replica_count < (misp->mis_active_replicas + 1)/2) + return (-1); + + return (0); +} + +static set_t +meta_imp_setno( + md_error_t *ep +) +{ + set_t max_sets, setno; + int bool; + + if ((max_sets = get_max_sets(ep)) == 0) { + return (MD_SET_BAD); + } + + /* + * This code needs to be expanded when we run in SunCluster + * environment SunCluster obtains setno internally + */ + for (setno = 1; setno < max_sets; setno++) { + if (clnt_setnumbusy(mynode(), setno, + &bool, ep) == -1) { + setno = MD_SET_BAD; + break; + } + /* + * found one available + */ + if (bool == FALSE) + break; + } + + if (setno == max_sets) { + setno = MD_SET_BAD; + } + + return (setno); +} + +int +meta_imp_set( + md_im_set_desc_t *misp, + char *setname, + int force, + bool_t dry_run, + md_error_t *ep +) +{ + md_timeval32_t tp; + md_im_drive_info_t *midp; + uint_t rep_slice; + mddrivename_t *dnp; + struct mddb_config c; + mdname_t *np; + md_im_replica_info_t *mirp; + char setnum_link[MAXPATHLEN]; + char setname_link[MAXPATHLEN]; + char *minor_name = NULL; 
+ + (void) memset(&c, 0, sizeof (c)); + (void) strlcpy(c.c_setname, setname, sizeof (c.c_setname)); + c.c_sideno = 0; + c.c_flags = MDDB_C_IMPORT; + + /* + * Check to see if the setname that the set is being imported into, + * already exists. + */ + if (getsetbyname(c.c_setname, ep) != NULL) { + return (mddserror(ep, MDE_DS_SETNAMEBUSY, MD_SET_BAD, + mynode(), NULL, c.c_setname)); + } + + /* + * Find the next available set number + */ + if ((c.c_setno = meta_imp_setno(ep)) == MD_SET_BAD) { + return (mddserror(ep, MDE_DS_SETNOTIMP, MD_SET_BAD, + mynode(), NULL, c.c_setname)); + } + + if (meta_gettimeofday(&tp) == -1) { + return (mdsyserror(ep, errno, NULL)); + } + c.c_timestamp = tp; + + /* Check to see if replica quorum requirement is fulfilled */ + if (!force && meta_replica_quorum(misp, ep) == -1) + return (mddserror(ep, MDE_DS_INSUFQUORUM, MD_SET_BAD, + mynode(), NULL, c.c_setname)); + + for (midp = misp->mis_drives; midp != NULL; + midp = midp->mid_next) { + mdcinfo_t *cinfo; + + /* + * We pass down the list of the drives in the + * set down to the kernel irrespective of + * whether the drives have a replica or not. + * + * The kernel detects which of the drives don't + * have a replica and accordingly does the + * right thing. 
+ */ + dnp = midp->mid_dnp; + if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) || + ((np = metaslicename(dnp, rep_slice, ep)) + == NULL)) { + mdclrerror(ep); + continue; + } + + (void) strcpy(c.c_locator.l_devname, np->bname); + c.c_locator.l_dev = meta_cmpldev(np->dev); + c.c_locator.l_mnum = meta_getminor(np->dev); + c.c_locator.l_devid = (uintptr_t)Malloc(midp->mid_devid_sz); + (void) memcpy((void *)c.c_locator.l_devid, midp->mid_devid, + midp->mid_devid_sz); + c.c_locator.l_devid_sz = midp->mid_devid_sz; + c.c_locator.l_devid_flags = + MDDB_DEVID_VALID | MDDB_DEVID_SPACE | MDDB_DEVID_SZ; + if (midp->mid_o_devid) { + c.c_locator.l_old_devid = + (uint64_t)Malloc(midp->mid_o_devid_sz); + (void) memcpy((void *)c.c_locator.l_old_devid, + midp->mid_o_devid, midp->mid_o_devid_sz); + c.c_locator.l_old_devid_sz = midp->mid_o_devid_sz; + } + minor_name = meta_getminor_name(np->bname, ep); + (void) strncpy(c.c_locator.l_minor_name, minor_name, + sizeof (c.c_locator.l_minor_name)); + + if ((cinfo = metagetcinfo(np, ep)) == NULL) { + mdclrerror(ep); + continue; + } + (void) strncpy(c.c_locator.l_driver, cinfo->dname, + sizeof (c.c_locator.l_driver)); + + mirp = midp->mid_replicas; + + do { + if (mirp) { + c.c_locator.l_flags = 0; + c.c_locator.l_blkno = mirp->mir_offset; + mirp = mirp->mir_next; + } else { + /* + * Default offset for dummy is 16 + */ + c.c_locator.l_blkno = 16; + } + + if (metaioctl(MD_DB_USEDEV, &c, &c.c_mde, NULL) != 0) { + Free((void *)c.c_locator.l_devid); + if (c.c_locator.l_old_devid) + Free((void *)c.c_locator.l_old_devid); + return (mdstealerror(ep, &c.c_mde)); + } + } while (mirp != NULL); + } + + /* + * If the dry run option was specified, flag success + * and exit out + */ + if (dry_run == 1) { + md_eprintf("%s\n", dgettext(TEXT_DOMAIN, + "import should be successful")); + Free((void *)c.c_locator.l_devid); + if (c.c_locator.l_old_devid) + Free((void *)c.c_locator.l_old_devid); + return (0); + } + + /* + * Now kernel should have all the 
information + * regarding the import diskset replica. + * Tell kernel to load them up and import the set + */ + if (metaioctl(MD_IOCIMP_LOAD, &c.c_setno, &c.c_mde, NULL) != 0) { + Free((void *)c.c_locator.l_devid); + if (c.c_locator.l_old_devid) + Free((void *)c.c_locator.l_old_devid); + return (mdstealerror(ep, &c.c_mde)); + } + + (void) meta_smf_enable(META_SMF_DISKSET, NULL); + + /* The set has now been imported, create the appropriate symlink */ + (void) snprintf(setname_link, MAXPATHLEN, "/dev/md/%s", setname); + (void) snprintf(setnum_link, MAXPATHLEN, "shared/%d", c.c_setno); + + /* + * Since we already verified that the setname was OK, make sure to + * cleanup before proceeding. + */ + if (unlink(setname_link) == -1) { + if (errno != ENOENT) + (void) mdsyserror(ep, errno, setname_link); + } + + if (symlink(setnum_link, setname_link) == -1) + (void) mdsyserror(ep, errno, setname_link); + + /* resnarf the set that has just been imported */ + if (clnt_resnarf_set(mynode(), c.c_setno, ep) != 0) + md_eprintf("%s\n", dgettext(TEXT_DOMAIN, "Please stop and " + "restart rpc.metad")); + + Free((void *)c.c_locator.l_devid); + if (c.c_locator.l_old_devid) + Free((void *)c.c_locator.l_old_devid); + return (0); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_init.c b/usr/src/lib/lvm/libmeta/common/meta_init.c new file mode 100644 index 0000000000..5775af48bc --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_init.c @@ -0,0 +1,453 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * initialize metadevices + */ + +#include <meta.h> +#include <libdevinfo.h> + + +int +parse_interlace( + char *uname, /* Meta Device name (eg d0) */ + char *str, /* String to Parse */ + diskaddr_t *interlacep, + md_error_t *ep +) +{ + diskaddr_t num; + char c; + int cnt; + + /* parse interlace */ + if ((cnt = sscanf(str, "%llu%c", &num, &c)) < 1) { + return (meta_cook_syntax(ep, MDE_BAD_INTERLACE, + uname, 1, &str)); + } else if (cnt == 1) { + if (num & (DEV_BSIZE - 1)) { + return (meta_cook_syntax(ep, MDE_BAD_INTERLACE, + uname, 1, &str)); + } + num = lbtodb(num); + } else switch (c) { + case 'b': + case 'B': + num *= DEV_BSIZE / DEV_BSIZE; + break; + case 'k': + case 'K': + num *= 1024 / DEV_BSIZE; + break; + case 'm': + case 'M': + num *= 1024 * 1024 / DEV_BSIZE; + break; + default: + return (meta_cook_syntax(ep, MDE_BAD_INTERLACE, + NULL, 1, &str)); + } + + /* return success */ + *interlacep = num; + return (0); +} + +/* + * cook up syntax error + */ +int +meta_cook_syntax( + md_error_t *ep, + md_void_errno_t errcode, + char *uname, + int argc, + char *argv[] +) +{ + int rval; + + /* if we have a token, concat it to uname */ + if ((argc > 0) && (argv[0] != NULL) && (argv[0][0] != '\0')) { + char *p; + + if ((uname != NULL) && (uname[0] != '\0')) { + p = Malloc(strlen(uname) + 2 + + 1 + strlen(argv[0]) + 1 + 1); + (void) strcpy(p, uname); + (void) strcat(p, ": "); + } else { + p = Malloc(1 + strlen(argv[0]) + 1 + 1); + p[0] = '\0'; 
+ } + (void) strcat(p, "\""); + (void) strcat(p, argv[0]); + (void) strcat(p, "\""); + rval = mderror(ep, errcode, p); + Free(p); + } else { + rval = mderror(ep, errcode, uname); + } + + return (rval); +} + +int +meta_check_devicesize( + diskaddr_t total_blocks +) +{ + int rval = MD_CRO_32BIT; + + + if (total_blocks > MD_MAX_BLKS_FOR_SMALL_DEVS) { + rval = MD_CRO_64BIT; + } + return (rval); +} + + +/* + * setup metadevice geometry + */ +/*ARGSUSED*/ +int +meta_setup_geom( + md_unit_t *md, + mdname_t *np, + mdgeom_t *geomp, + uint_t write_reinstruct, + uint_t read_reinstruct, + uint_t round_cyl, + md_error_t *ep +) +{ + diskaddr_t cylsize = geomp->nhead * geomp->nsect; + diskaddr_t total_blocks; + + if (round_cyl) { + total_blocks = rounddown(md->c.un_actual_tb, cylsize); + } else { + total_blocks = md->c.un_actual_tb; + } + + md->c.un_total_blocks = total_blocks; + md->c.un_nhead = geomp->nhead; + md->c.un_nsect = geomp->nsect; + md->c.un_rpm = geomp->rpm; + md->c.un_wr_reinstruct = write_reinstruct; + md->c.un_rd_reinstruct = read_reinstruct; + return (0); +} + +/* + * adjust metadevice geometry + */ +/*ARGSUSED*/ +int +meta_adjust_geom( + md_unit_t *md, + mdname_t *np, + uint_t write_reinstruct, + uint_t read_reinstruct, + uint_t round_cyl, + md_error_t *ep +) +{ + diskaddr_t cylsize = md->c.un_nhead * md->c.un_nsect; + diskaddr_t total_blocks; + + if (round_cyl) { + total_blocks = rounddown(md->c.un_actual_tb, cylsize); + } else { + total_blocks = md->c.un_actual_tb; + } + + md->c.un_total_blocks = total_blocks; + if (write_reinstruct > md->c.un_wr_reinstruct) + md->c.un_wr_reinstruct = write_reinstruct; + if (read_reinstruct > md->c.un_rd_reinstruct) + md->c.un_rd_reinstruct = read_reinstruct; + return (0); +} + +/* + * Function: meta_init_make_device + * Purpose: + * Create the device node <uname> by constructing the necessary + * md_mkdev_params_t structure. We have to handle relative names + * (e.g. "d80") and fully-qualified names (e.g. 
"/dev/md/red/dsk/d80"). + * The field that we need is the unit number of the metadevice (80 in + * the above examples). + * Input: spp set structure + * uname unit-name (fully qualified or relative) + * Output: ep error return structure + * Returns: 0 success + * -1 Error. <ep> contains error reason + */ +int +meta_init_make_device( + mdsetname_t **spp, + char *uname, + md_error_t *ep +) +{ + di_devlink_handle_t hdl; + md_mkdev_params_t params; + int rval = 0; + char *p, *e = uname; + size_t len = strlen(uname); + + e += len; + (void) memset(¶ms, 0, sizeof (params)); + MD_SETDRIVERNAME(¶ms, "md", (*spp)->setno); + + /* + * Find the start of the unit within <uname>. + */ + p = strrchr(uname, '/'); + if (p == NULL) { + /* Relative name (e.g. d80) */ + p = &uname[1]; + } else { + /* qualified name (e.g. /dev/md/dsk/d80) */ + p += 2; + if (p >= e) { + /* Invalid drive name */ + p = Malloc(len + 3); + (void) snprintf(p, len + 3, "\"%s\"", uname); + rval = mderror(ep, MDE_NOT_DRIVENAME, p); + Free(p); + return (rval); + } + } + e = NULL; + params.mnum = strtoul(p, &e, 10); + if (e == p) { + /* Invalid drive name */ + p = Malloc(len + 3); + (void) snprintf(p, len + 3, "\"%s\"", uname); + rval = mderror(ep, MDE_NOT_DRIVENAME, p); + Free(p); + return (rval); + } + + if (metaioctl(MD_IOCMAKE_DEV, ¶ms, ¶ms.mde, NULL) != 0) { + return (mdstealerror(ep, ¶ms.mde)); + } + /* + * Wait until device appears in namespace. di_devlink_init() returns + * once the /dev links have been created. If NULL is returned the + * link operation failed and we haven't got a device to use. + * NOTE: This will take a _long_ time for large numbers of metadevices. + * Change to use the enhanced di_devlink_init() interface when + * available. 
+ */ + hdl = di_devlink_init("md", DI_MAKE_LINK); + if (hdl != NULL) { + (void) di_devlink_fini(&hdl); + } else { + p = Malloc(len + 3); + (void) snprintf(p, len + 3, "\"%s\"", uname); + rval = mderror(ep, MDE_UNIT_NOT_FOUND, p); + Free(p); + } + return (rval); +} + +/* + * FUNCTION: is_metadb_cmd() + * INPUT: argc - number of command line arguments + * argv - pointer to array of command line arguments + * OUTPUT: none + * RETURNS: TRUE if a metadb is to be created, FALSE otherwise + * PURPOSE: parses enough of the command line to determine if a metadb + * create is being attempted + */ +static boolean_t +is_metadb_cmd( + int argc, + char *argv[] +) +{ + ulong_t num; + int len; + + /* look for match */ + if (argc > 0 && (sscanf(argv[0], "mddb%lu%n", &num, &len) == 1) && + (strlen(argv[0]) == len) && ((long)num >= 0)) { + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * FUNCTION: is_stripe_cmd() + * INPUT: argc - number of command line arguments + * argv - pointer to array of command line arguments + * OUTPUT: none + * RETURNS: TRUE if a stripe is to be created, FALSE otherwise + * PURPOSE: parses enough of the command line to determine if a stripe + * create is being attempted + */ +static boolean_t +is_stripe_cmd( + int argc, + char *argv[] +) +{ + uint_t nrow; + + if (argc > 1 && (sscanf(argv[1], "%u", &nrow) != 1) || ((int)nrow < 0)) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * FUNCTION: meta_get_init_type() + * INPUT: argc - number of command line arguments + * argv - pointer to array of command line arguments + * OUTPUT: none + * RETURNS: type of metadevice or hot spare pools being initialized + * PURPOSE: parses enough of the command line to determine what type + * of metainit is being attempted + */ +mdinittypes_t +meta_get_init_type( + int argc, + char *argv[] +) +{ + char *arg = argv[1]; + mdinittypes_t init_type; + + if (argc == 1) /* must be a hot spare pool w/o devices */ + return (TAB_HSP); + + init_type = TAB_UNKNOWN; + if (arg != 
NULL) { + if (strcmp(arg, "-m") == 0) { + init_type = TAB_MIRROR; + } else if (strcmp(arg, "-r") == 0) { + init_type = TAB_RAID; + } else if (strcmp(arg, "-p") == 0) { + init_type = TAB_SP; + } else if (strcmp(arg, "-t") == 0) { + init_type = TAB_TRANS; + } else if (is_metadb_cmd(argc, argv)) { + init_type = TAB_MDDB; + } else if (is_stripe_cmd(argc, argv)) { + init_type = TAB_STRIPE; + } else { /* assume that it is a hsp */ + init_type = TAB_HSP; + } + } + return (init_type); +} + +/* + * initialize named device or hotspare pool + */ +int +meta_init_name( + mdsetname_t **spp, + int argc, + char *argv[], + mdcmdopts_t options, + md_error_t *ep +) +{ + mdinittypes_t init_type; + char *p; + int rval; + char *uname = argv[0]; + + assert(argc > 0); + + /* determine type of metadevice or hot spare pool being created */ + init_type = meta_get_init_type(argc, argv); + + /* hotspare pool */ + if (init_type == TAB_HSP) + return (meta_init_hsp(spp, argc, argv, options, ep)); + + /* metadevice */ + if (argc >= 2 && init_type != TAB_UNKNOWN) { + md_error_t t_e = mdnullerror; + char *cname; + + /* + * We need to create the device node if the specified metadevice + * does not already exist in the database. The actual creation + * is undertaken by the md driver and the links propagated by + * devfsadm. + */ + + /* initialize the spp properly */ + if ((cname = meta_name_getname(spp, uname, &t_e)) != NULL) + Free(cname); + if (! 
mdisok(&t_e)) + return (mdstealerror(ep, &t_e)); + + /* Create device node */ + if (meta_init_make_device(spp, uname, &t_e) != 0) { + return (mdstealerror(ep, &t_e)); + } + + switch (init_type) { + case TAB_MIRROR: + return (meta_init_mirror(spp, argc, argv, options, ep)); + break; + case TAB_RAID: + return (meta_init_raid(spp, argc, argv, options, ep)); + break; + case TAB_SP: + return (meta_init_sp(spp, argc, argv, options, ep)); + break; + case TAB_TRANS: + return (mderror(ep, MDE_EOF_TRANS, NULL)); + break; + case TAB_STRIPE: + return (meta_init_stripe(spp, argc, argv, options, ep)); + break; + } + } + + /* unknown type */ + p = Malloc(1 + strlen(uname) + 1 + 1); + (void) strcpy(p, "\""); + (void) strcat(p, uname); + (void) strcat(p, "\""); + rval = mderror(ep, MDE_SYNTAX, p); + Free(p); + return (rval); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_lib_prv.c b/usr/src/lib/lvm/libmeta/common/meta_lib_prv.c new file mode 100644 index 0000000000..1b63a2a03e --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_lib_prv.c @@ -0,0 +1,69 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1992-2002 Sun Microsystems, Inc. All rights reserved. 
+ * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Metadevice diskset interfaces + */ + +#include "meta_lib_prv.h" +#include <sys/vfstab.h> + +static FILE *mfp = NULL; + +FILE * +open_mnttab(void) +{ + if (mfp != NULL) { + if (fseeko(mfp, (off_t)0L, SEEK_SET) == -1) { + (void) fclose(mfp); + mfp = NULL; + return (NULL); + } + return (mfp); + } + + if ((mfp = fopen(MNTTAB, "r")) == NULL) + return (NULL); + + return (mfp); +} + +int +close_mnttab(void) +{ + int ret = -1; + + if (mfp == NULL) + return (0); + + ret = fclose(mfp); + + mfp = NULL; + + return (ret); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_mdcf.c b/usr/src/lib/lvm/libmeta/common/meta_mdcf.c new file mode 100644 index 0000000000..3af1c3be19 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_mdcf.c @@ -0,0 +1,148 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. 
+ */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +/* + * patch md.cf file + */ + +#include <meta.h> + +/* + * save metadevice configuration in md.cf + */ +int +meta_update_md_cf( + mdsetname_t *sp, + md_error_t *ep +) +{ + char *name = METACONF; + char *tname = METACONFTMP; + FILE *tfp = NULL; + FILE *mfp = NULL; + mdprtopts_t options = PRINT_SHORT | PRINT_FAST; + struct stat sbuf; + char line[1000]; + + /* If this is not the local set, no need to do anything */ + if (!metaislocalset(sp)) + return (0); + + /* open temp file */ + if ((tfp = fopen(tname, "w")) == NULL) + return (mdsyserror(ep, errno, tname)); + if (stat(name, &sbuf) == 0) { + (void) fchmod(fileno(tfp), (sbuf.st_mode & 0777)); + (void) fchown(fileno(tfp), sbuf.st_uid, sbuf.st_gid); + } + + /* dump header */ + if (fputs(dgettext(TEXT_DOMAIN, + "# metadevice configuration file\n" + "# do not hand edit\n"), tfp) == EOF) { + (void) mdsyserror(ep, errno, tname); + goto errout; + } + + /* dump device configuration */ + if (meta_print_all(sp, tname, NULL, tfp, options, NULL, ep) != 0) + goto errout; + + /* close and rename file */ + if (fclose(tfp) != 0) { + (void) mdsyserror(ep, errno, tname); + goto errout; + } + tfp = NULL; + + /* + * Renames don't work in the miniroot since tmpfiles are + * created in /var/tmp. Hence we copy the data out. 
+ */ + + if (rename(tname, name) != 0) { + if (errno == EROFS) { + if ((tfp = fopen(tname, "r")) == NULL) { + goto errout; + } + if ((mfp = fopen(METACONF, "w+")) == NULL) { + goto errout; + } + while (fgets(line, 1000, tfp) != NULL) { + if (fputs(line, mfp) == NULL) { + (void) mdsyserror(ep, errno, METACONF); + goto errout; + } + } + if (fclose(tfp) != 0) { + tfp = NULL; + goto errout; + } + tfp = NULL; + /* delete the tempfile */ + (void) unlink(tname); + if (fflush(mfp) != 0) { + goto errout; + } + if (fsync(fileno(mfp)) != 0) { + goto errout; + } + if (fclose(mfp) != 0) { + mfp = NULL; + goto errout; + } + mfp = NULL; + } else { + (void) mdsyserror(ep, errno, name); + goto errout; + } + } + + /* success */ + return (0); + + /* cleanup, return error */ +errout: + if (tfp != NULL) { + (void) fclose(tfp); + (void) unlink(tname); + } + if (mfp != NULL) { + (void) fclose(mfp); + } + return (-1); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_med.c b/usr/src/lib/lvm/libmeta/common/meta_med.c new file mode 100644 index 0000000000..b11f86a0c1 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_med.c @@ -0,0 +1,851 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. + */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +/* + * Mediator functions + */ + +#include <meta.h> +#include <metamed.h> +#include <dlfcn.h> +#include <sdssc.h> + +/* + * There are too many external factors that affect the timing of the + * operations, so we set the timeout to a very large value, in this + * case 1 day, which should handle HW timeouts, large configurations, + * and other potential delays. + */ +#define CL_LONG_TMO 86400L /* 1 day */ +#define CL_MEDIUM_TMO 3600L /* 1 hour */ +#define CL_SHORT_TMO 600L /* 10 minutes */ +#define CL_DEF_TMO 10L /* 10 seconds */ + +static md_timeval32_t def_rpcb_timeout = { MD_CLNT_CREATE_TOUT, 0 }; + +/* + * RPC handle + */ +typedef struct { + char *hostname; + CLIENT *clntp; +} med_handle_t; + +/* + * Data to be sent from med_clnt_create_timed to med_create_helper via + * meta_client_create_retry. + */ +typedef struct { + rpcprog_t mcd_program; /* RPC program designation */ + rpcvers_t mcd_version; /* RPC version */ + char *mcd_nettype; /* Type of network to use for RPC */ +} med_create_data_t; + +/* + * Perform the work of actually doing the clnt_create for + * meta_client_create_retry. 
+ */ +static CLIENT * +med_create_helper(char *hostname, void *private, struct timeval *time_out) +{ + med_create_data_t *cd = (med_create_data_t *)private; + + return (clnt_create_timed(hostname, cd->mcd_program, cd->mcd_version, + cd->mcd_nettype, time_out)); +} + +static +CLIENT *med_clnt_create_timed( + char *hostname, + const ulong_t prog, + const ulong_t vers, + char *nettype, + const md_timeval32_t *tp +) +{ + med_create_data_t cd; /* Create data. */ + + cd.mcd_program = prog; + cd.mcd_version = vers; + cd.mcd_nettype = nettype; + return (meta_client_create_retry(hostname, med_create_helper, + (void *)&cd, (time_t)tp->tv_sec, NULL)); +} + +/* + * Set the timeout value for this client handle. + */ +static int +cl_sto_medd( + CLIENT *clntp, + char *hostname, + long time_out, + md_error_t *ep +) +{ + md_timeval32_t nto; + + (void) memset(&nto, '\0', sizeof (nto)); + + nto.tv_sec = time_out; + + if (clnt_control(clntp, CLSET_TIMEOUT, (char *)&nto) != TRUE) + return (mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad client set timeout"))); + + return (0); +} + +/* + * close RPC connection + */ +static void +close_medd( + med_handle_t *hp +) +{ + assert(hp != NULL); + if (hp->hostname != NULL) { + Free(hp->hostname); + } + if (hp->clntp != NULL) { + auth_destroy(hp->clntp->cl_auth); + clnt_destroy(hp->clntp); + } + Free(hp); +} + +/* + * open RPC connection to rpc.medd + */ +static med_handle_t * +open_medd( + char *hostname, + long time_out, + md_error_t *ep +) +{ + CLIENT *clntp; + med_handle_t *hp; + + /* default to local host */ + if ((hostname == NULL) || (*hostname == '\0')) + hostname = mynode(); + + /* open RPC connection */ + assert(hostname != NULL); + if ((clntp = med_clnt_create_timed(hostname, MED_PROG, MED_VERS, + "tcp", &def_rpcb_timeout)) == NULL) { + if (rpc_createerr.cf_stat != RPC_PROGNOTREGISTERED) + clnt_pcreateerror(hostname); + (void) mdrpccreateerror(ep, hostname, + "medd med_clnt_create_timed"); + return (NULL); + } else { + 
auth_destroy(clntp->cl_auth); + clntp->cl_auth = authsys_create_default(); + assert(clntp->cl_auth != NULL); + } + + if (cl_sto_medd(clntp, hostname, time_out, ep) != 0) + return (NULL); + + /* return connection */ + hp = Zalloc(sizeof (*hp)); + hp->hostname = Strdup(hostname); + hp->clntp = clntp; + + return (hp); +} + +/* + * steal and convert med_err_t + */ +int +meddstealerror( + md_error_t *ep, + med_err_t *medep +) +{ + char buf[BUFSIZ]; + char *p = buf; + size_t psize = BUFSIZ; + char *emsg; + int rval = -1; + + /* no error */ + if (medep->med_errno == 0) { + /* assert(medep->name == NULL); */ + rval = 0; + goto out; + } + + /* steal error */ + if ((medep->med_node != NULL) && (medep->med_node[0] != '\0')) { + (void) snprintf(p, psize, "%s: ", medep->med_node); + p = &buf[strlen(buf)]; + psize = buf + BUFSIZ - p; + } + + if ((medep->med_misc != NULL) && (medep->med_misc[0] != '\0')) { + (void) snprintf(p, psize, "%s: ", medep->med_misc); + p = &buf[strlen(buf)]; + psize = buf + BUFSIZ - p; + } + + if (medep->med_errno < 0) { + if ((emsg = med_errnum_to_str(medep->med_errno)) != NULL) + (void) snprintf(p, psize, "%s", emsg); + else + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "unknown mediator errno %d\n"), medep->med_errno); + } else { + if ((emsg = strerror(medep->med_errno)) != NULL) + (void) snprintf(p, psize, "%s", emsg); + else + (void) snprintf(p, psize, dgettext(TEXT_DOMAIN, + "errno %d out of range"), medep->med_errno); + } + (void) mderror(ep, MDE_MED_ERROR, buf); + + /* cleanup, return success */ +out: + if (medep->med_node != NULL) + Free(medep->med_node); + if (medep->med_misc != NULL) + Free(medep->med_misc); + (void) memset(medep, 0, sizeof (*medep)); + return (rval); +} + +static med_handle_t * +open_medd_wrap( + md_h_t *mdhp, + long time_out, + md_error_t *ep +) +{ + med_handle_t *hp = NULL; + int i; + char *hnm; + + assert(mdhp && mdhp->a_cnt > 0); + + /* Loop through the hosts listed */ + i = min(mdhp->a_cnt, MAX_HOST_ADDRS) - 1; + 
for (; i >= 0; i--) { + hnm = mdhp->a_nm[i]; + + if ((hp = open_medd(hnm, time_out, ep)) == NULL) { + if (mdanyrpcerror(ep) && i != 0) { + mdclrerror(ep); + continue; + } + } + return (hp); + } + + rpc_createerr.cf_stat = RPC_CANTSEND; + rpc_createerr.cf_error.re_status = 0; + (void) mdrpccreateerror(ep, mdhp->a_nm[0], + dgettext(TEXT_DOMAIN, "medd open wrap")); + + return (NULL); +} + +static int +setup_med_transtab(md_error_t *ep) +{ + mddb_med_t_parm_t *tp = NULL; + struct stat statb; + int i; + size_t alloc_size = 0; + int err = 0; + + + if ((tp = Zalloc(sizeof (mddb_med_t_parm_t))) == NULL) + return (mdsyserror(ep, ENOMEM, "setup_med_transtab")); + + if (metaioctl(MD_MED_GET_TLEN, tp, &tp->med_tp_mde, NULL) != 0) { + err = mdstealerror(ep, &tp->med_tp_mde); + goto out; + } + + if (tp->med_tp_setup == 1) + goto out; + + alloc_size = (sizeof (mddb_med_t_parm_t) - sizeof (mddb_med_t_ent_t)) + + (sizeof (mddb_med_t_ent_t) * tp->med_tp_nents); + + if ((tp = Realloc(tp, alloc_size)) == NULL) { + err = mdsyserror(ep, ENOMEM, "setup_med_transtab"); + goto out; + } + + if (metaioctl(MD_MED_GET_T, tp, &tp->med_tp_mde, NULL) != 0) { + err = mdstealerror(ep, &tp->med_tp_mde); + goto out; + } + + for (i = 0; i < tp->med_tp_nents; i++) { + if (meta_stat(tp->med_tp_ents[i].med_te_nm, &statb) == -1) { + md_perror("setup_med_transtab(): stat():"); + tp->med_tp_ents[i].med_te_dev = NODEV64; + } else { + tp->med_tp_ents[i].med_te_dev = + meta_expldev(statb.st_rdev); + } + } + + if (metaioctl(MD_MED_SET_T, tp, &tp->med_tp_mde, NULL) != 0) + err = mdstealerror(ep, &tp->med_tp_mde); + +out: + Free(tp); + return (err); +} + +/* + * Externals + */ + +/* + * NULLPROC - just returns a response + */ +int +clnt_med_null( + char *hostname, + md_error_t *ep +) +{ + med_handle_t *hp; + med_err_t res; + + /* initialize */ + mdclrerror(ep); + + /* do it */ + if ((hp = open_medd(hostname, CL_DEF_TMO, ep)) == NULL) + return (-1); + + if (med_null_1(NULL, &res, hp->clntp) != RPC_SUCCESS) + 
(void) mdrpcerror(ep, hp->clntp, hostname, + dgettext(TEXT_DOMAIN, "medd nullproc")); + + close_medd(hp); + + xdr_free(xdr_med_err_t, (char *)&res); + + if (! mdisok(ep)) + return (-1); + + return (0); +} + +/* + * Update the mediator information on the mediator. + * *** This is not normally called from user code, the kernel does this! *** + */ +int +clnt_med_upd_data( + md_h_t *mdhp, + mdsetname_t *sp, + med_data_t *meddp, + md_error_t *ep +) +{ + med_handle_t *hp; + med_upd_data_args_t args; + med_err_t res; + md_set_desc *sd; + + /* initialize */ + mdclrerror(ep); + (void) memset(&args, 0, sizeof (args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + if (MD_MNSET_DESC(sd)) + /* + * In the MN diskset, use a generic nodename, multiowner, as + * the node initiating the RPC request. This allows + * any node to access mediator information. + * + * MN diskset reconfig cycle forces consistent + * view of set/node/drive/mediator information across all nodes + * in the MN diskset. This allows the relaxation of + * node name checking in rpc.metamedd for MN disksets. + * + * In the traditional diskset, only a calling node that is + * in the mediator record's diskset nodelist can access + * mediator data. + */ + args.med.med_caller = Strdup(MED_MN_CALLER); + else + args.med.med_caller = Strdup(mynode()); + args.med.med_setname = Strdup(sp->setname); + args.med.med_setno = sp->setno; + args.med_data = *meddp; + + /* do it */ + if ((hp = open_medd_wrap(mdhp, CL_DEF_TMO, ep)) == NULL) + return (-1); + + if (med_upd_data_1(&args, &res, hp->clntp) != RPC_SUCCESS) + (void) mdrpcerror(ep, hp->clntp, hp->hostname, + dgettext(TEXT_DOMAIN, "medd update data")); + else + (void) meddstealerror(ep, &res); + + close_medd(hp); + + xdr_free(xdr_med_upd_data_args_t, (char *)&args); + xdr_free(xdr_med_err_t, (char *)&res); + + if (! 
mdisok(ep)) + return (-1); + + return (0); +} + +/* + * Get the mediator data for this client from the mediator + */ +int +clnt_med_get_data( + md_h_t *mdhp, + mdsetname_t *sp, + med_data_t *meddp, + md_error_t *ep +) +{ + med_handle_t *hp; + med_args_t args; + med_get_data_res_t res; + int rval = -1; + md_set_desc *sd; + + /* initialize */ + mdclrerror(ep); + (void) memset(&args, 0, sizeof (args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + if (MD_MNSET_DESC(sd)) + /* + * In the MN diskset, use a generic nodename, multiowner, as + * the node initiating the RPC request. This allows + * any node to access mediator information. + * + * MN diskset reconfig cycle forces consistent + * view of set/node/drive/mediator information across all nodes + * in the MN diskset. This allows the relaxation of + * node name checking in rpc.metamedd for MN disksets. + * + * In the traditional diskset, only a calling node that is + * in the mediator record's diskset nodelist can access + * mediator data. + */ + args.med.med_caller = Strdup(MED_MN_CALLER); + else + args.med.med_caller = Strdup(mynode()); + args.med.med_setname = Strdup(sp->setname); + args.med.med_setno = sp->setno; + + /* do it */ + if ((hp = open_medd_wrap(mdhp, CL_DEF_TMO, ep)) == NULL) + return (-1); + + if (med_get_data_1(&args, &res, hp->clntp) != RPC_SUCCESS) + (void) mdrpcerror(ep, hp->clntp, hp->hostname, + dgettext(TEXT_DOMAIN, "medd get data")); + else + (void) meddstealerror(ep, &res.med_status); + + close_medd(hp); + + if (mdisok(ep)) { + /* do something with the results */ + (void) memmove(meddp, &res.med_data, sizeof (med_data_t)); + rval = 0; + } + + xdr_free(xdr_med_args_t, (char *)&args); + xdr_free(xdr_med_get_data_res_t, (char *)&res); + + return (rval); +} + +/* + * Update the mediator record on the mediator. 
+ */
+int
+clnt_med_upd_rec(
+	md_h_t			*mdhp,
+	mdsetname_t		*sp,
+	med_rec_t		*medrp,
+	md_error_t		*ep
+)
+{
+	med_handle_t		*hp;
+	med_upd_rec_args_t	args;
+	med_err_t		res;
+	md_set_desc		*sd;
+
+	/* initialize */
+	mdclrerror(ep);
+	(void) memset(&args, 0, sizeof (args));
+	(void) memset(&res, 0, sizeof (res));
+
+	/* build args */
+	if ((sd = metaget_setdesc(sp, ep)) == NULL)
+		return (-1);
+
+	if (MD_MNSET_DESC(sd))
+		/*
+		 * In the MN diskset, use a generic nodename, multiowner, as
+		 * the node initiating the RPC request. This allows
+		 * any node to access mediator information.
+		 *
+		 * MN diskset reconfig cycle forces consistent
+		 * view of set/node/drive/mediator information across all nodes
+		 * in the MN diskset. This allows the relaxation of
+		 * node name checking in rpc.metamedd for MN disksets.
+		 *
+		 * In the traditional diskset, only a calling node that is
+		 * in the mediator record's diskset nodelist can access
+		 * mediator data.
+		 */
+		args.med.med_caller = Strdup(MED_MN_CALLER);
+	else
+		args.med.med_caller = Strdup(mynode());
+	args.med.med_setname = Strdup(sp->setname);
+	args.med.med_setno = sp->setno;
+	args.med_flags = 0;
+	args.med_rec = *medrp;		/* structure assignment */
+
+	/* do it */
+	if ((hp = open_medd_wrap(mdhp, CL_DEF_TMO, ep)) == NULL) {
+		/* release the strings duplicated above */
+		Free(args.med.med_caller);
+		Free(args.med.med_setname);
+		return (-1);
+	}
+
+	if (med_upd_rec_1(&args, &res, hp->clntp) != RPC_SUCCESS)
+		(void) mdrpcerror(ep, hp->clntp, hp->hostname,
+		    dgettext(TEXT_DOMAIN, "medd update record"));
+	else
+		(void) meddstealerror(ep, &res);
+
+	close_medd(hp);
+
+	/*
+	 * NOTE(review): args.med_rec is a structure copy of *medrp, so
+	 * this also walks the caller's record - confirm that med_rec_t
+	 * holds no pointers.
+	 */
+	xdr_free(xdr_med_upd_rec_args_t, (char *)&args);
+	xdr_free(xdr_med_err_t, (char *)&res);
+
+	if (! mdisok(ep))
+		return (-1);
+
+	return (0);
+}
+
+/*
+ * Get the mediator record for this client from the mediator
+ *
+ * On success the record is copied into *medrp and 0 is returned;
+ * otherwise -1 is returned with <ep> set and *medrp untouched.
+ */
+int
+clnt_med_get_rec(
+	md_h_t			*mdhp,
+	mdsetname_t		*sp,
+	med_rec_t		*medrp,
+	md_error_t		*ep
+)
+{
+	med_handle_t		*hp;
+	med_args_t		args;
+	med_get_rec_res_t	res;
+	int			rval = -1;
+	md_set_desc		*sd;
+
+	/* initialize */
+	mdclrerror(ep);
+	(void) memset(&args, 0, sizeof (args));
+	(void) memset(&res, 0, sizeof (res));
+
+	/* build args */
+	if ((sd = metaget_setdesc(sp, ep)) == NULL)
+		return (-1);
+
+	/*
+	 * A MN diskset uses the generic multiowner caller name, so that
+	 * any node may access the mediator; a traditional diskset has to
+	 * present the name of the calling node.
+	 */
+	if (MD_MNSET_DESC(sd))
+		args.med.med_caller = Strdup(MED_MN_CALLER);
+	else
+		args.med.med_caller = Strdup(mynode());
+	args.med.med_setname = Strdup(sp->setname);
+	args.med.med_setno = sp->setno;
+
+	/* do it */
+	if ((hp = open_medd_wrap(mdhp, CL_DEF_TMO, ep)) == NULL) {
+		/* release the strings duplicated above */
+		Free(args.med.med_caller);
+		Free(args.med.med_setname);
+		return (-1);
+	}
+
+	if (med_get_rec_1(&args, &res, hp->clntp) != RPC_SUCCESS)
+		(void) mdrpcerror(ep, hp->clntp, hp->hostname,
+		    dgettext(TEXT_DOMAIN, "medd get record"));
+	else
+		(void) meddstealerror(ep, &res.med_status);
+
+	close_medd(hp);
+
+	if (mdisok(ep)) {
+		/* do something with the results */
+		(void) memmove(medrp, &res.med_rec, sizeof (med_rec_t));
+		rval = 0;
+	}
+
+	xdr_free(xdr_med_args_t, (char *)&args);
+	xdr_free(xdr_med_get_rec_res_t, (char *)&res);
+
+	return (rval);
+}
+
+/*
+ * Get the name of the host from the mediator daemon.
+ */ +int +clnt_med_hostname( + char *hostname, + char **ret_hostname, + md_error_t *ep +) +{ + med_handle_t *hp; + med_hnm_res_t res; + int rval = -1; + + /* initialize */ + mdclrerror(ep); + (void) memset(&res, 0, sizeof (res)); + + /* No args */ + + /* do it */ + if ((hp = open_medd(hostname, CL_DEF_TMO, ep)) == NULL) + return (-1); + + if (med_hostname_1(NULL, &res, hp->clntp) != RPC_SUCCESS) + (void) mdrpcerror(ep, hp->clntp, hostname, + dgettext(TEXT_DOMAIN, "medd hostname")); + else + (void) meddstealerror(ep, &res.med_status); + + close_medd(hp); + + if (mdisok(ep)) { + /* do something with the results */ + rval = 0; + + if (ret_hostname != NULL) + *ret_hostname = Strdup(res.med_hnm); + } + + xdr_free(xdr_med_hnm_res_t, (char *)&res); + + return (rval); +} + +int +meta_med_hnm2ip(md_hi_arr_t *mp, md_error_t *ep) +{ + int i, j; + int max_meds; + + if ((max_meds = get_max_meds(ep)) == 0) + return (-1); + + for (i = 0; i < max_meds; i++) { + mp->n_lst[i].a_flg = 0; + /* See if this is the local host */ + if (mp->n_lst[i].a_cnt > 0 && + strcmp(mp->n_lst[i].a_nm[0], mynode()) == NULL) + mp->n_lst[i].a_flg |= NMIP_F_LOCAL; + + for (j = 0; j < mp->n_lst[i].a_cnt; j++) { + struct hostent *hp; + char *hnm = mp->n_lst[i].a_nm[j]; + + /* + * Cluster nodename support + * + * See if the clustering code can give us an IP addr + * for the stored name. If not, find it the old way + * which will use the public interface. 
+ */ + if (sdssc_get_priv_ipaddr(mp->n_lst[i].a_nm[j], + (struct in_addr *)&mp->n_lst[i].a_ip[j]) != + SDSSC_OKAY) { + if ((hp = gethostbyname(hnm)) == NULL) + return (mdsyserror(ep, EADDRNOTAVAIL, + hnm)); + + /* We only do INET addresses */ + if (hp->h_addrtype != AF_INET) + return (mdsyserror(ep, EPFNOSUPPORT, + hnm)); + + /* We take the first address only */ + if (*hp->h_addr_list) { + (void) memmove(&mp->n_lst[i].a_ip[j], + *hp->h_addr_list, + sizeof (struct in_addr)); + } else + return (mdsyserror(ep, EADDRNOTAVAIL, + hnm)); + } + + } + } + return (0); +} + +int +meta_h2hi(md_h_arr_t *mdhp, md_hi_arr_t *mdhip, md_error_t *ep) +{ + int i, j; + int max_meds; + + if ((max_meds = get_max_meds(ep)) == 0) + return (-1); + + mdhip->n_cnt = mdhp->n_cnt; + + for (i = 0; i < max_meds; i++) { + mdhip->n_lst[i].a_flg = 0; + mdhip->n_lst[i].a_cnt = mdhp->n_lst[i].a_cnt; + if (mdhp->n_lst[i].a_cnt == 0) + continue; + for (j = 0; j < mdhp->n_lst[i].a_cnt; j++) + (void) strcpy(mdhip->n_lst[i].a_nm[j], + mdhp->n_lst[i].a_nm[j]); + } + return (0); +} + +int +meta_hi2h(md_hi_arr_t *mdhip, md_h_arr_t *mdhp, md_error_t *ep) +{ + int i, j; + int max_meds; + + if ((max_meds = get_max_meds(ep)) == 0) + return (-1); + + mdhp->n_cnt = mdhip->n_cnt; + for (i = 0; i < max_meds; i++) { + mdhp->n_lst[i].a_cnt = mdhip->n_lst[i].a_cnt; + if (mdhip->n_lst[i].a_cnt == 0) + continue; + for (j = 0; j < mdhip->n_lst[i].a_cnt; j++) + (void) strcpy(mdhp->n_lst[i].a_nm[j], + mdhip->n_lst[i].a_nm[j]); + } + return (0); +} + +int +setup_med_cfg( + mdsetname_t *sp, + mddb_config_t *cp, + int force, + md_error_t *ep +) +{ + md_set_desc *sd; + int i; + int max_meds; + + if (metaislocalset(sp)) + return (0); + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + if (setup_med_transtab(ep)) + return (-1); + + if (meta_h2hi(&sd->sd_med, &cp->c_med, ep)) + return (-1); + + /* Make sure the ip addresses are current */ + if (meta_med_hnm2ip(&cp->c_med, ep)) + return (-1); + + if (force) + return 
(0); + + if ((max_meds = get_max_meds(ep)) == 0) + return (-1); + + /* Make sure metamedd still running on host - only chk nodename */ + for (i = 0; i < max_meds; i++) { + char *hostname; + char *hnm; + + if (sd->sd_med.n_lst[i].a_cnt == 0) + continue; + + hnm = sd->sd_med.n_lst[i].a_nm[0]; + + if (clnt_med_hostname(hnm, &hostname, ep)) + return (mddserror(ep, MDE_DS_NOMEDONHOST, sp->setno, + hnm, NULL, sp->setname)); + Free(hostname); + } + return (0); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_med_err.c b/usr/src/lib/lvm/libmeta/common/meta_med_err.c new file mode 100644 index 0000000000..6b83280f89 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_med_err.c @@ -0,0 +1,97 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1992-2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. 
+ */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +#include <meta.h> +#include <metamed.h> + +char * +med_errnum_to_str(int errnum) +{ + switch (errnum) { + case MDE_MED_NOERROR: + return (dgettext(TEXT_DOMAIN, "No Error")); + case MDE_MED_HOSTNOMED: + return (dgettext(TEXT_DOMAIN, + "mediator host has no mediator data for host")); + case MDE_MED_DBNOTINIT: + return (dgettext(TEXT_DOMAIN, + "mediator database is not initialized")); + case MDE_MED_DBSZBAD: + return (dgettext(TEXT_DOMAIN, + "mediator database size is not valid")); + case MDE_MED_DBKEYADDFAIL: + return (dgettext(TEXT_DOMAIN, + "unable to add key to mediator database")); + case MDE_MED_DBKEYDELFAIL: + return (dgettext(TEXT_DOMAIN, + "unable to delete key from mediator database")); + case MDE_MED_DBHDRSZBAD: + return (dgettext(TEXT_DOMAIN, + "mediator database header record size is not valid")); + case MDE_MED_DBHDRMAGBAD: + return (dgettext(TEXT_DOMAIN, + "mediator database header magic is not valid")); + case MDE_MED_DBHDRREVBAD: + return (dgettext(TEXT_DOMAIN, + "mediator database header revision is not valid")); + case MDE_MED_DBHDRCKSBAD: + return (dgettext(TEXT_DOMAIN, + "mediator database header checksum is not valid")); + case MDE_MED_DBRECSZBAD: + return (dgettext(TEXT_DOMAIN, + "mediator database record record size is not valid")); + case MDE_MED_DBRECMAGBAD: + return (dgettext(TEXT_DOMAIN, + "mediator database record magic is not valid")); + case MDE_MED_DBRECREVBAD: + return (dgettext(TEXT_DOMAIN, + "mediator database record revision is not valid")); + case MDE_MED_DBRECCKSBAD: + return (dgettext(TEXT_DOMAIN, + "mediator database record checksum is not valid")); + case MDE_MED_DBRECOFFBAD: + return (dgettext(TEXT_DOMAIN, + "mediator database record offset in not valid")); + case MDE_MED_DBRECNOENT: + return (dgettext(TEXT_DOMAIN, + "no matching mediator record found")); + case MDE_MED_DBARGSMISMATCH: + return (dgettext(TEXT_DOMAIN, "set number in arguments " + 
"different from set number in data")); + default: + return (NULL); + } +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_mem.c b/usr/src/lib/lvm/libmeta/common/meta_mem.c new file mode 100644 index 0000000000..d685f57c09 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_mem.c @@ -0,0 +1,250 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1992, 1993, 2000 by Sun Microsystems, Inc. + * All rights reserved. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <meta.h> + +/* + * free + */ +#ifdef _DEBUG_MALLOC_INC + +void +_Free( + char *file, + int line, + void *p +) +{ + debug_free(file, line, p); +} + +#else /* ! _DEBUG_MALLOC_INC */ + +void +Free( + void *p +) +{ + free(p); +} + +#endif /* ! _DEBUG_MALLOC_INC */ + +/* + * malloc + */ +#ifdef _DEBUG_MALLOC_INC + +void * +_Malloc( + char *file, + int line, + size_t s +) +{ + void *mem; + + mem = debug_malloc(file, line, s); + if (mem == NULL) { + md_perror(""); + md_exit(NULL, 1); + } + return (mem); +} + +#else /* ! 
_DEBUG_MALLOC_INC */ + +void * +Malloc( + size_t s +) +{ + void *mem; + + if ((mem = malloc(s)) == NULL) { + md_perror(""); + md_exit(NULL, 1); + } + return (mem); +} + +#endif /* ! _DEBUG_MALLOC_INC */ + +/* + * zalloc + */ +#ifdef _DEBUG_MALLOC_INC + +void * +_Zalloc( + char *file, + int line, + size_t s +) +{ + return (memset(_Malloc(file, line, s), 0, s)); +} + +#else /* ! _DEBUG_MALLOC_INC */ + +void * +Zalloc( + size_t s +) +{ + return (memset(Malloc(s), 0, s)); +} + +#endif /* ! _DEBUG_MALLOC_INC */ + +/* + * realloc + */ +#ifdef _DEBUG_MALLOC_INC + +void * +_Realloc( + char *file, + int line, + void *p, + size_t s +) +{ + if (p == NULL) + p = debug_malloc(file, line, s); + else + p = debug_realloc(file, line, p, s); + if (p == NULL) { + md_perror(""); + md_exit(NULL, 1); + } + return (p); +} + +#else /* ! _DEBUG_MALLOC_INC */ + +void * +Realloc( + void *p, + size_t s +) +{ + if ((p = realloc(p, s)) == NULL) { + md_perror(""); + md_exit(NULL, 1); + } + return (p); +} + +#endif /* ! _DEBUG_MALLOC_INC */ + +/* + * calloc + */ +#ifdef _DEBUG_MALLOC_INC + +void * +_Calloc( + char *file, + int line, + size_t n, + size_t s +) +{ + unsigned long total; + + if (n == 0 || s == 0) { + total = 0; + } else { + total = (unsigned long)n * s; + /* check for overflow */ + if (total / n != s) + return (NULL); + } + return (_Zalloc(file, line, total)); +} + +#else /* ! _DEBUG_MALLOC_INC */ + +void * +Calloc( + size_t n, + size_t s +) +{ + unsigned long total; + + if (n == 0 || s == 0) { + total = 0; + } else { + total = (unsigned long)n * s; + /* check for overflow */ + if (total / n != s) + return (NULL); + } + return (Zalloc(total)); +} + +#endif /* ! _DEBUG_MALLOC_INC */ + +/* + * strdup + */ +#ifdef _DEBUG_MALLOC_INC + +char * +_Strdup( + char *file, + int line, + char *p +) +{ + p = DBstrdup(file, line, p); + if (p == NULL) { + md_perror(""); + md_exit(NULL, 1); + } + return (p); +} + +#else /* ! 
_DEBUG_MALLOC_INC */ + +char * +Strdup( + char *p +) +{ + if ((p = strdup(p)) == NULL) { + md_perror(""); + md_exit(NULL, 1); + } + return (p); +} + +#endif /* ! _DEBUG_MALLOC_INC */ diff --git a/usr/src/lib/lvm/libmeta/common/meta_metad.c b/usr/src/lib/lvm/libmeta/common/meta_metad.c new file mode 100644 index 0000000000..7588843f5c --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_metad.c @@ -0,0 +1,4082 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. + */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +#include <meta.h> +#include <metad.h> +#include <devid.h> + +static md_setkey_t *my_cl_sk = NULL; + +#define CL_DEF_TMO 30L + +/* + * Convert an old style mddrivename_t into a new style + * mddrivename_t. 
Meant to be used *ONLY* by rpc.metad + */ +void +meta_conv_drvname_old2new( + o_mddrivename_t *v1_dp, + mddrivename_t *v2_dp +) +{ + int sliceno; + o_mdname_t *v1_np; + mdname_t *v2_np; + + /* fields that haven't changed */ + v2_dp->cname = v1_dp->cname; + v2_dp->rname = v1_dp->rname; + v2_dp->type = v1_dp->type; + v2_dp->errnum = v1_dp->errnum; + + /* geometry information */ + v2_dp->geom.ncyl = v1_dp->geom.ncyl; + v2_dp->geom.nhead = v1_dp->geom.nhead; + v2_dp->geom.nsect = v1_dp->geom.nsect; + v2_dp->geom.rpm = v1_dp->geom.rpm; + v2_dp->geom.write_reinstruct = v1_dp->geom.write_reinstruct; + v2_dp->geom.read_reinstruct = v1_dp->geom.read_reinstruct; + v2_dp->geom.blk_sz = 0; + + /* controller information */ + v2_dp->cinfo = v1_dp->cinfo; + + /* vtoc information */ + v2_dp->vtoc.nparts = v1_dp->vtoc.nparts; + v2_dp->vtoc.first_lba = 0; + v2_dp->vtoc.last_lba = 0; + v2_dp->vtoc.lbasize = 0; + + for (sliceno = 0; sliceno < (MD_MAX_PARTS - 1); sliceno++) { + v2_dp->vtoc.parts[sliceno].start = + (diskaddr_t)v1_dp->vtoc.parts[sliceno].start; + v2_dp->vtoc.parts[sliceno].size = + (diskaddr_t)v1_dp->vtoc.parts[sliceno].size; + v2_dp->vtoc.parts[sliceno].tag = + v1_dp->vtoc.parts[sliceno].tag; + v2_dp->vtoc.parts[sliceno].flag = + v1_dp->vtoc.parts[sliceno].flag; + v2_dp->vtoc.parts[sliceno].label = + (diskaddr_t)v1_dp->vtoc.parts[sliceno].label; + } + + /* The new style vtoc has 17 partitions */ + v2_dp->vtoc.parts[MD_MAX_PARTS - 1].start = 0; + v2_dp->vtoc.parts[MD_MAX_PARTS - 1].size = 0; + v2_dp->vtoc.parts[MD_MAX_PARTS - 1].tag = 0; + v2_dp->vtoc.parts[MD_MAX_PARTS - 1].flag = 0; + v2_dp->vtoc.parts[MD_MAX_PARTS - 1].label = 0; + + v2_dp->vtoc.typename = v1_dp->vtoc.typename; + + /* partition information */ + v2_dp->parts.parts_len = v1_dp->parts.parts_len; + for (sliceno = 0; sliceno < v1_dp->parts.parts_len; sliceno++) { + v1_np = &v1_dp->parts.parts_val[sliceno]; + v2_np = &v2_dp->parts.parts_val[sliceno]; + + /* + * We speculate that if cname for a particular + 
* partition does not exist, the other fields + * don't exist either. In such a case, we don't + * need to do anything for that partition. + */ + if (v1_np->cname != NULL) { + v2_np->cname = v1_np->cname; + v2_np->bname = v1_np->bname; + v2_np->rname = v1_np->rname; + v2_np->devicesname = v1_np->devicesname; + v2_np->dev = meta_expldev(v1_np->dev); + v2_np->key = v1_np->key; + v2_np->end_blk = (diskaddr_t)v1_np->end_blk; + v2_np->start_blk = (diskaddr_t)v1_np->start_blk; + } + v2_np->drivenamep = v2_dp; + } + + /* We don't care about the rest of the fields */ + v2_dp->side_names = v1_dp->side_names; + v2_dp->side_names_key = v1_dp->side_names_key; + v2_dp->miscname = v1_dp->miscname; +} + +/* + * Convert a new style mddrivename_t into an old style + * mddrivename_t. Meant to be used *ONLY* by rpc.metad + */ +void +meta_conv_drvname_new2old( + o_mddrivename_t *v1_dp, + mddrivename_t *v2_dp +) +{ + int sliceno; + o_mdname_t *v1_np; + mdname_t *v2_np; + + /* fields that haven't changed */ + v1_dp->cname = v2_dp->cname; + v1_dp->rname = v2_dp->rname; + v1_dp->type = v2_dp->type; + v1_dp->errnum = v2_dp->errnum; + + /* geometry information */ + v1_dp->geom.ncyl = v2_dp->geom.ncyl; + v1_dp->geom.nhead = v2_dp->geom.nhead; + v1_dp->geom.nsect = v2_dp->geom.nsect; + v1_dp->geom.rpm = v2_dp->geom.rpm; + v1_dp->geom.write_reinstruct = v2_dp->geom.write_reinstruct; + v1_dp->geom.read_reinstruct = v2_dp->geom.read_reinstruct; + + /* controller information */ + v1_dp->cinfo = v2_dp->cinfo; + + /* vtoc information */ + v1_dp->vtoc.typename = v2_dp->vtoc.typename; + v1_dp->vtoc.nparts = v2_dp->vtoc.nparts; + + for (sliceno = 0; sliceno < (MD_MAX_PARTS - 1); sliceno++) { + v1_dp->vtoc.parts[sliceno].start = + (daddr_t)v2_dp->vtoc.parts[sliceno].start; + v1_dp->vtoc.parts[sliceno].size = + (daddr_t)v2_dp->vtoc.parts[sliceno].size; + v1_dp->vtoc.parts[sliceno].tag = + v2_dp->vtoc.parts[sliceno].tag; + v1_dp->vtoc.parts[sliceno].flag = + v2_dp->vtoc.parts[sliceno].flag; + 
v1_dp->vtoc.parts[sliceno].label = + (daddr_t)v2_dp->vtoc.parts[sliceno].label; + } + + /* partition information */ + v1_dp->parts.parts_len = v2_dp->parts.parts_len; + + for (sliceno = 0; sliceno < v2_dp->parts.parts_len; sliceno++) { + v1_np = &v1_dp->parts.parts_val[sliceno]; + v2_np = &v2_dp->parts.parts_val[sliceno]; + + /* + * We speculate that if cname for a particular + * partition does not exist then the rest of + * the fields a partition don't exist either. + * In such a case, we don't need to do anything + * for that partition. + */ + if (v2_np->cname != NULL) { + v1_np->cname = v2_np->cname; + v1_np->bname = v2_np->bname; + v1_np->rname = v2_np->rname; + v1_np->devicesname = v2_np->devicesname; + v1_np->dev = meta_cmpldev(v2_np->dev); + v1_np->key = v2_np->key; + v1_np->end_blk = (daddr_t)v2_np->end_blk; + v1_np->start_blk = (daddr_t)v2_np->start_blk; + } + v1_np->drivenamep = v1_dp; + } + + /* We don't care about the rest of the fields */ + v1_dp->side_names = v2_dp->side_names; + v1_dp->side_names_key = v2_dp->side_names_key; + v1_dp->miscname = v2_dp->miscname; +} + +/* + * Convert an old style md_drive_desc_t into a new style + * md_drive_desc_t. Meant to be used *ONLY* by rpc.metad + */ +void +meta_conv_drvdesc_old2new( + o_md_drive_desc *v1_dd, + md_drive_desc *v2_dd +) +{ + md_drive_desc *dd; + o_md_drive_desc *o_dd; + + dd = v2_dd; + + for (o_dd = v1_dd; o_dd != NULL; o_dd = o_dd->dd_next) { + dd->dd_ctime = o_dd->dd_ctime; + dd->dd_genid = o_dd->dd_genid; + dd->dd_flags = o_dd->dd_flags; + meta_conv_drvname_old2new(o_dd->dd_dnp, dd->dd_dnp); + dd->dd_dbcnt = o_dd->dd_dbcnt; + dd->dd_dbsize = o_dd->dd_dbsize; + dd = dd->dd_next; + } +} + +/* + * Convert an new style md_drive_desc_t into a old style + * md_drive_desc_t. 
Meant to be used *ONLY* by rpc.metad + */ +void +meta_conv_drvdesc_new2old( + o_md_drive_desc *v1_dd, + md_drive_desc *v2_dd +) +{ + md_drive_desc *dd; + o_md_drive_desc *o_dd; + + o_dd = v1_dd; + + for (dd = v2_dd; dd != NULL; dd = dd->dd_next) { + o_dd->dd_ctime = dd->dd_ctime; + o_dd->dd_genid = dd->dd_genid; + o_dd->dd_flags = dd->dd_flags; + meta_conv_drvname_new2old(o_dd->dd_dnp, dd->dd_dnp); + o_dd->dd_dbcnt = dd->dd_dbcnt; + o_dd->dd_dbsize = dd->dd_dbsize; + o_dd = o_dd->dd_next; + } +} + +/* + * Allocate memory for v1 drive descriptor + * depending upon the number of drives in the + * v2 drive descriptor + */ +void +alloc_olddrvdesc( + o_md_drive_desc **v1_dd, + md_drive_desc *v2_dd +) +{ + md_drive_desc *dd; + o_md_drive_desc *new, *head; + + head = NULL; + + for (dd = v2_dd; dd != NULL; dd = dd->dd_next) { + new = Zalloc(sizeof (o_md_drive_desc)); + new->dd_dnp = Zalloc(sizeof (o_mddrivename_t)); + new->dd_dnp->parts.parts_val = Zalloc(sizeof (o_mdname_t) * + dd->dd_dnp->parts.parts_len); + new->dd_next = head; + head = new; + } + *v1_dd = head; +} + +/* + * Allocate memory for v2 drive descriptor + * depending upon the number of drives in the + * v1 drive descriptor + */ +void +alloc_newdrvdesc( + o_md_drive_desc *v1_dd, + md_drive_desc **v2_dd +) +{ + md_drive_desc *new, *head; + o_md_drive_desc *o_dd; + + head = NULL; + + for (o_dd = v1_dd; o_dd != NULL; o_dd = o_dd->dd_next) { + new = Zalloc(sizeof (md_drive_desc)); + new->dd_dnp = Zalloc(sizeof (mddrivename_t)); + new->dd_dnp->parts.parts_val = Zalloc(sizeof (mdname_t) * + o_dd->dd_dnp->parts.parts_len); + new->dd_next = head; + head = new; + } + *v2_dd = head; +} + +void +free_olddrvdesc( + o_md_drive_desc *v1_dd +) +{ + o_md_drive_desc *o_dd, *head; + + head = v1_dd; + + while (head != NULL) { + o_dd = head; + head = head->dd_next; + free(o_dd->dd_dnp->parts.parts_val); + free(o_dd->dd_dnp); + free(o_dd); + } +} + +void +free_newdrvdesc( + md_drive_desc *v2_dd +) +{ + md_drive_desc *dd, *head; + 
+ head = v2_dd; + + while (head != NULL) { + dd = head; + head = head->dd_next; + free(dd->dd_dnp->parts.parts_val); + free(dd->dd_dnp); + free(dd); + } +} + +/* + * Return the device id for a given device + */ +char * +meta_get_devid( + char *rname +) +{ + ddi_devid_t devid; + int fd; + char *enc_devid, *dup_enc_devid = NULL; + + if ((fd = open(rname, O_RDWR | O_NDELAY, 0)) < 0) + return (NULL); + + if (devid_get(fd, &devid) == -1) { + (void) close(fd); + return (NULL); + } + (void) close(fd); + + enc_devid = devid_str_encode(devid, NULL); + devid_free(devid); + + if (enc_devid != NULL) { + dup_enc_devid = strdup(enc_devid); + devid_str_free(enc_devid); + } + + return (dup_enc_devid); +} + +/* + * Add side names for the diskset drive records + * NOTE: these go into the local set's namespace. + */ +int +clnt_add_drv_sidenms( + char *hostname, + char *this_host, + mdsetname_t *sp, + md_set_desc *sd, + int node_c, + char **node_v, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_drv_sidenm_args v1_args; + mdrpc_drv_sidenm_2_args v2_args; + mdrpc_drv_sidenm_2_args_r1 *v21_args; + mdrpc_generic_res res; + int rval; + int version; + int i, j; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v1_args, 0, sizeof (v1_args)); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + v21_args = &v2_args.mdrpc_drv_sidenm_2_args_u.rev1; + v21_args->hostname = this_host; + v21_args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + v21_args->sp = sp; + v21_args->sd = sd; + v21_args->node_v.node_v_len = node_c; + v21_args->node_v.node_v_val = node_v; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + + /* + * If the server is local, we call the v2 procedure + */ + bool = mdrpc_add_drv_sidenms_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == 
NULL) + return (-1); + + /* + * Check the client handle for the version + * and invoke the appropriate version of the + * remote procedure + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + if (version == METAD_VERSION) { /* version 1 */ + + v1_args.sd = Zalloc(sizeof (o_md_set_desc)); + alloc_olddrvdesc(&v1_args.sd->sd_drvs, sd->sd_drvs); + + /* build args */ + v1_args.hostname = this_host; + v1_args.cl_sk = cl_get_setkey(sp->setno, sp->setname); + v1_args.sp = sp; + /* set descriptor */ + v1_args.sd->sd_ctime = sd->sd_ctime; + v1_args.sd->sd_genid = sd->sd_genid; + v1_args.sd->sd_setno = sd->sd_setno; + v1_args.sd->sd_flags = sd->sd_flags; + for (i = 0; i < MD_MAXSIDES; i++) { + v1_args.sd->sd_isown[i] = sd->sd_isown[i]; + + for (j = 0; j < MD_MAX_NODENAME_PLUS_1; j ++) + v1_args.sd->sd_nodes[i][j] = + sd->sd_nodes[i][j]; + } + v1_args.sd->sd_med = sd->sd_med; + meta_conv_drvdesc_new2old(v1_args.sd->sd_drvs, + sd->sd_drvs); + v1_args.node_v.node_v_len = node_c; + v1_args.node_v.node_v_val = node_v; + + rval = mdrpc_add_drv_sidenms_1(&v1_args, &res, clntp); + + free_olddrvdesc(v1_args.sd->sd_drvs); + free(v1_args.sd); + + if (rval != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, + "metad add drive sidenames")); + else + (void) mdstealerror(ep, &res.status); + } else { /* version 2 */ + rval = mdrpc_add_drv_sidenms_2(&v2_args, &res, clntp); + + if (rval != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, + "metad add drive sidenames")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! mdisok(ep)) + return (-1); + + return (0); +} + +/* + * Add drives to disksets. 
+ */ +int +clnt_adddrvs( + char *hostname, + mdsetname_t *sp, + md_drive_desc *dd, + md_timeval32_t timestamp, + ulong_t genid, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_drives_args v1_args; + mdrpc_drives_2_args v2_args; + mdrpc_drives_2_args_r1 *v21_args; + mdrpc_generic_res res; + int rval; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v1_args, 0, sizeof (v1_args)); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + v21_args = &v2_args.mdrpc_drives_2_args_u.rev1; + v21_args->sp = sp; + v21_args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + v21_args->drivedescs = dd; + v21_args->timestamp = timestamp; + v21_args->genid = genid; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + + /* + * If the server is local, we call the v2 procedure + */ + bool = mdrpc_adddrvs_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + * and invoke the appropriate version of the + * remote procedure + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + if (version == METAD_VERSION) { /* version 1 */ + + alloc_olddrvdesc(&v1_args.drivedescs, dd); + + /* build args */ + v1_args.sp = sp; + v1_args.cl_sk = cl_get_setkey(sp->setno, sp->setname); + meta_conv_drvdesc_new2old(v1_args.drivedescs, dd); + v1_args.timestamp = timestamp; + v1_args.genid = genid; + + rval = mdrpc_adddrvs_1(&v1_args, &res, clntp); + + free_olddrvdesc(v1_args.drivedescs); + + if (rval != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad add drives")); + else + (void) mdstealerror(ep, &res.status); + } else { /* version 2 */ + rval = mdrpc_adddrvs_2(&v2_args, &res, clntp); + + if (rval != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, 
hostname, + dgettext(TEXT_DOMAIN, "metad add drives")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! mdisok(ep)) + return (-1); + + return (0); +} + +/* + * Add hosts to disksets. + */ +int +clnt_addhosts( + char *hostname, + mdsetname_t *sp, + int node_c, + char **node_v, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_host_args *args; + mdrpc_host_2_args v2_args; + mdrpc_generic_res res; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_host_2_args_u.rev1; + args->sp = sp; + args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + args->hosts.hosts_len = node_c; + args->hosts.hosts_val = node_v; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + bool = mdrpc_addhosts_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version and invoke + * the appropriate version of the remote procedure + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + if (version == METAD_VERSION) { /* version 1 */ + if (mdrpc_addhosts_1(args, &res, clntp) != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad add hosts")); + else + (void) mdstealerror(ep, &res.status); + } else { + if (mdrpc_addhosts_2(&v2_args, &res, clntp) != + RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad add hosts")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! mdisok(ep)) + return (-1); + + return (0); +} + +/* + * Create disksets. 
+ */ +int +clnt_createset( + char *hostname, + mdsetname_t *sp, + md_node_nm_arr_t nodes, + md_timeval32_t timestamp, + ulong_t genid, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_createset_args *args; + mdrpc_createset_2_args v2_args; + mdrpc_generic_res res; + int i; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_createset_2_args_u.rev1; + args->sp = sp; + args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + args->timestamp = timestamp; + args->genid = genid; + for (i = 0; i < MD_MAXSIDES; i++) + (void) strcpy(args->nodes[i], nodes[i]); + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + bool = mdrpc_createset_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version and invoke + * the appropriate version of the remote procedure + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + if (version == METAD_VERSION) { /* version 1 */ + if (mdrpc_createset_1(args, &res, clntp) != + RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad create set")); + else + (void) mdstealerror(ep, &res.status); + } else { + if (mdrpc_createset_2(&v2_args, &res, clntp) != + RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad create set")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! mdisok(ep)) + return (-1); + + return (0); +} + +/* + * Create MN disksets. 
+ */ +int +clnt_mncreateset( + char *hostname, + mdsetname_t *sp, + md_mnnode_desc *nodelist, + md_timeval32_t timestamp, + ulong_t genid, + md_node_nm_t master_nodenm, + int master_nodeid, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_mncreateset_args *args; + mdrpc_mncreateset_2_args v2_args; + mdrpc_generic_res res; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_mncreateset_2_args_u.rev1; + args->sp = sp; + args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + args->timestamp = timestamp; + args->genid = genid; + (void) strlcpy(args->master_nodenm, master_nodenm, MD_MAX_NODENAME); + args->master_nodeid = master_nodeid; + args->nodelist = nodelist; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + bool = mdrpc_mncreateset_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + /* + * If the client is version 1, return error + * otherwise, make the remote procedure call. + */ + if (version == METAD_VERSION) { /* version 1 */ + (void) mddserror(ep, MDE_DS_RPCVERSMISMATCH, + sp->setno, hostname, NULL, sp->setname); + metarpcclose(clntp); + return (-1); + } else { + if (mdrpc_mncreateset_2(&v2_args, &res, clntp) + != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad mncreate set")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! 
mdisok(ep)) + return (-1); + + return (0); +} + +/* + * Join MN set + */ +int +clnt_joinset( + char *hostname, + mdsetname_t *sp, + int flags, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_sp_flags_args *args; + mdrpc_sp_flags_2_args v2_args; + mdrpc_generic_res res; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_sp_flags_2_args_u.rev1; + args->sp = sp; + args->flags = flags; + args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + bool = mdrpc_joinset_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + /* + * If the client is version 1, return error + * otherwise, make the remote procedure call. + */ + if (version == METAD_VERSION) { /* version 1 */ + (void) mddserror(ep, MDE_DS_RPCVERSMISMATCH, + sp->setno, hostname, NULL, sp->setname); + metarpcclose(clntp); + return (-1); + } else { + if (mdrpc_joinset_2(&v2_args, &res, clntp) + != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad join set")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! 
mdisok(ep)) + return (-1); + + return (0); +} + +/* + * Withdraw from MN set + */ +int +clnt_withdrawset( + char *hostname, + mdsetname_t *sp, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_sp_args *args; + mdrpc_sp_2_args v2_args; + mdrpc_generic_res res; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_sp_2_args_u.rev1; + args->sp = sp; + args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + bool = mdrpc_withdrawset_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + /* + * If the client is version 1, return error + * otherwise, make the remote procedure call. + */ + if (version == METAD_VERSION) { /* version 1 */ + (void) mddserror(ep, MDE_DS_RPCVERSMISMATCH, + sp->setno, hostname, NULL, sp->setname); + metarpcclose(clntp); + return (-1); + } else { + if (mdrpc_withdrawset_2(&v2_args, &res, clntp) + != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, + "metad withdraw set")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! mdisok(ep)) + return (-1); + + return (0); +} + +/* + * Delete side names for the diskset drive records + * NOTE: these are removed from the local set's namespace. 
+ */ +int +clnt_del_drv_sidenms( + char *hostname, + mdsetname_t *sp, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_sp_args *args; + mdrpc_sp_2_args v2_args; + mdrpc_generic_res res; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_sp_2_args_u.rev1; + args->sp = sp; + args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + bool = mdrpc_del_drv_sidenms_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + if (metaget_setdesc(sp, ep) == NULL) { + if (! mdisok(ep)) + return (-1); + mdclrerror(ep); + } + + /* + * Check the client handle for the version and invoke + * the appropriate version of the remote procedure + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + if (version == METAD_VERSION) { /* version 1 */ + if (mdrpc_del_drv_sidenms_1(args, &res, clntp) != + RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, + "metad delete drive sidenames")); + else + (void) mdstealerror(ep, &res.status); + } else { + if (mdrpc_del_drv_sidenms_2(&v2_args, &res, clntp) != + RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, + "metad delete drive sidenames")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! 
mdisok(ep)) + return (-1); + + return (0); +} + +/* + * delete drives from the set + */ +int +clnt_deldrvs( + char *hostname, + mdsetname_t *sp, + md_drive_desc *dd, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_drives_args v1_args; + mdrpc_drives_2_args v2_args; + mdrpc_drives_2_args_r1 *v21_args; + mdrpc_generic_res res; + int rval; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v1_args, 0, sizeof (v1_args)); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + v21_args = &v2_args.mdrpc_drives_2_args_u.rev1; + v21_args->sp = sp; + v21_args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + v21_args->drivedescs = dd; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + + /* + * If the server is local, we call the v2 procedure + */ + bool = mdrpc_deldrvs_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + * and invoke the appropriate version of the + * remote procedure + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + if (version == METAD_VERSION) { /* version 1 */ + + alloc_olddrvdesc(&v1_args.drivedescs, dd); + + /* build args */ + v1_args.sp = sp; + v1_args.cl_sk = cl_get_setkey(sp->setno, sp->setname); + meta_conv_drvdesc_new2old(v1_args.drivedescs, dd); + + rval = mdrpc_deldrvs_1(&v1_args, &res, clntp); + + free_olddrvdesc(v1_args.drivedescs); + + if (rval != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, + "metad delete drives")); + else + (void) mdstealerror(ep, &res.status); + } else { /* version 2 */ + rval = mdrpc_deldrvs_2(&v2_args, &res, clntp); + + if (rval != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, + "metad delete drives")); + else + (void) 
mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! mdisok(ep)) + return (-1); + + return (0); +} + +/* + * delete host(s) from a set. + */ +int +clnt_delhosts( + char *hostname, + mdsetname_t *sp, + int node_c, + char **node_v, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_host_args *args; + mdrpc_host_2_args v2_args; + mdrpc_generic_res res; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_host_2_args_u.rev1; + args->sp = sp; + args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + args->hosts.hosts_len = node_c; + args->hosts.hosts_val = node_v; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + bool = mdrpc_delhosts_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + * and invoke the appropriate version of the + * remote procedure + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + if (version == METAD_VERSION) { /* version 1 */ + if (mdrpc_delhosts_1(args, &res, clntp) != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad delete hosts")); + else + (void) mdstealerror(ep, &res.status); + } else { + if (mdrpc_delhosts_2(&v2_args, &res, clntp) != + RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad delete hosts")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! mdisok(ep)) + return (-1); + + return (0); +} + +/* + * Delete diskset. 
+ */ +int +clnt_delset( + char *hostname, + mdsetname_t *sp, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_sp_args *args; + mdrpc_sp_2_args v2_args; + mdrpc_generic_res res; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_sp_2_args_u.rev1; + args->sp = sp; + args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + bool = mdrpc_delset_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + * and invoke the appropriate version of the + * remote procedure + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + if (version == METAD_VERSION) { /* version 1 */ + if (mdrpc_delset_1(args, &res, clntp) != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad delete set")); + else + (void) mdstealerror(ep, &res.status); + } else { + if (mdrpc_delset_2(&v2_args, &res, clntp) != + RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad delete set")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! 
mdisok(ep)) + return (-1); + + return (0); +} + +/* + * return remote device info + */ +int +clnt_devinfo( + char *hostname, + mdsetname_t *sp, + mddrivename_t *dp, + md_dev64_t *ret_dev, + time_t *ret_timestamp, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_devinfo_args v1_args; + mdrpc_devinfo_2_args v2_args; + mdrpc_devinfo_2_args_r1 *v21_args; + mdrpc_devinfo_res v1_res; + mdrpc_devinfo_2_res v2_res; + int rval, version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v1_args, 0, sizeof (v1_args)); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&v1_res, 0, sizeof (v1_res)); + (void) memset(&v2_res, 0, sizeof (v2_res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + v21_args = &v2_args.mdrpc_devinfo_2_args_u.rev1; + v21_args->sp = sp; + v21_args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + v21_args->drivenamep = dp; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + + /* + * If the server is local, we call the v2 procedure. + */ + bool = mdrpc_devinfo_2_svc(&v2_args, &v2_res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &v1_res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + * and invoke the appropriate version of + * the remote procedure. + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + if (version == METAD_VERSION) { /* version 1 */ + v1_args.drivenamep = + Zalloc(sizeof (o_mddrivename_t)); + v1_args.drivenamep->parts.parts_val = + Zalloc((sizeof (o_mdname_t)) * + dp->parts.parts_len); + + /* build args */ + v1_args.sp = sp; + v1_args.cl_sk = cl_get_setkey(sp->setno, + sp->setname); + + /* + * Convert v2 arguments to v1 arguments + * before sending over the wire. 
+ */ + meta_conv_drvname_new2old(v1_args.drivenamep, + v21_args->drivenamep); + + rval = mdrpc_devinfo_1(&v1_args, &v1_res, clntp); + + free(v1_args.drivenamep->parts.parts_val); + free(v1_args.drivenamep); + + if (rval != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad device info")); + else + (void) mdstealerror(ep, &v1_res.status); + } else { /* version 2 */ + rval = mdrpc_devinfo_2(&v2_args, &v2_res, clntp); + if (rval != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad device info")); + else + (void) mdstealerror(ep, &v2_res.status); + } + + metarpcclose(clntp); + } + + if (mdisok(ep)) { + /* do something with the results */ + rval = 0; + + if (ret_dev != NULL) { + if (version == METAD_VERSION) + *ret_dev = meta_expldev(v1_res.dev); + else + *ret_dev = v2_res.dev; + } + + if (ret_timestamp != NULL) { + if (version == METAD_VERSION) + *ret_timestamp = v1_res.vtime; + else + *ret_timestamp = v2_res.vtime; + } + } + + if (version == METAD_VERSION) + xdr_free(xdr_mdrpc_devinfo_res, (char *)&v1_res); + else + xdr_free(xdr_mdrpc_devinfo_2_res, (char *)&v2_res); + + return (rval); +} + +/* + * return remote device info + */ +int +clnt_devid( + char *hostname, + mdsetname_t *sp, + mddrivename_t *dp, + char **ret_encdevid, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_devid_args *args; + mdrpc_devid_2_args v2_args; + mdrpc_devid_res res; + int rval; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_devid_2_args_u.rev1; + args->sp = sp; + args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + args->drivenamep = dp; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + + /* + * If the server is local, we call the v2 procedure. 
+ */ + bool = mdrpc_devid_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + /* + * If the client is version 1, return error + * otherwise, make the remote procedure call. + */ + if (version == METAD_VERSION) { /* version 1 */ + (void) mddserror(ep, MDE_DS_DRIVENOTONHOST, sp->setno, + hostname, dp->cname, sp->setname); + } else { /* version 2 */ + rval = mdrpc_devid_2(&v2_args, &res, clntp); + + if (rval != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad devid info")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + if (mdisok(ep)) { + /* do something with the results */ + rval = 0; + + if (ret_encdevid != NULL) + *ret_encdevid = strdup(res.enc_devid); + + } + + xdr_free(xdr_mdrpc_devid_res, (char *)&res); + + return (rval); +} + +/* + * Get the device information of a disk on a remote host. The information + * retrieved is the device's name, the associated driver and the dev_t. + * The lookup is performed by using the devid of the disk as this is + * unique to the disk. The device name on the originating node is passed + * in. If that devname is found when doing the devid to namelist translation + * then that value is used to make the device names as consistent as possible + * across the nodes. + * + * An attempt is made to retrieve this information by calling + * mdrpc_devinfo_by_devid_name_2_svc. Locally this call should always + * succeed. In the case where a call is made through a CLIENT handle, + * it is possible that the function hasn't been implemented on the called + * node. If this is the case fall back to mdrpc_devinfo_by_devidstr_2_svc. + * + * Returns: + * -1 Error + * ENOTSUP Operation not supported i.e. 
procedure not supported on + * the remote node + * 0 Success + */ +int +clnt_devinfo_by_devid( + char *hostname, + mdsetname_t *sp, + char *devidstr, + md_dev64_t *ret_dev, + char *orig_devname, + char **ret_devname, + char **ret_driver, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_devidstr_args devid_args; + mdrpc_devid_name_args *args; + mdrpc_devid_name_2_args v2_args; + mdrpc_devinfo_2_res res; + int rval; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_devid_name_2_args_u.rev1; + args->enc_devid = devidstr; + args->orig_devname = orig_devname; + args->sp = sp; + + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + + /* + * We are calling this locally so call the function + * directly. + */ + bool = mdrpc_devinfo_by_devid_name_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + + /* open connection */ + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) { + return (-1); + } + + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + if (version == METAD_VERSION) { /* Version 1 */ + metarpcclose(clntp); + return (ENOTSUP); + } + + rval = mdrpc_devinfo_by_devid_name_2(&v2_args, &res, clntp); + + if (rval != RPC_SUCCESS) { + /* try falling back to devidstr_2_svc */ + (void) memset(&devid_args, 0, sizeof (devid_args)); + (void) memset(&res, 0, sizeof (res)); + + devid_args.enc_devid = devidstr; + devid_args.sp = sp; + + rval = mdrpc_devinfo_by_devid_2( + &devid_args, &res, clntp); + + if (rval != RPC_SUCCESS) { + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, + "metad devinfo by devid")); + } else { + (void) mdstealerror(ep, &res.status); + } + } else { + (void) mdstealerror(ep, &res.status); + } + metarpcclose(clntp); + } + + if (mdisok(ep)) { + rval = 0; + if (ret_dev != NULL) + *ret_dev = 
res.dev; + + if (ret_devname != NULL && res.devname != NULL) + *ret_devname = Strdup(res.devname); + + if (ret_driver != NULL && res.drivername != NULL) + *ret_driver = Strdup(res.drivername); + } + + xdr_free(xdr_mdrpc_devinfo_2_res, (char *)&res); + + if (! mdisok(ep)) + return (-1); + + return (0); + +} + + +/* + * return status of whether driver is used, mount + */ +int +clnt_drvused( + char *hostname, + mdsetname_t *sp, + mddrivename_t *dp, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_drvused_args v1_args; + mdrpc_drvused_2_args v2_args; + mdrpc_drvused_2_args_r1 *v21_args; + mdrpc_generic_res res; + int rval; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v1_args, 0, sizeof (v1_args)); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + v21_args = &v2_args.mdrpc_drvused_2_args_u.rev1; + v21_args->sp = sp; + v21_args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + v21_args->drivenamep = dp; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + + /* + * If the server is local, we call the v2 procedure + */ + bool = mdrpc_drvused_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + /* open connection */ + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + * and invoke the appropriate version of the + * remote procedure + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + if (version == METAD_VERSION) { /* version 1 */ + v1_args.drivenamep = + Zalloc(sizeof (o_mddrivename_t)); + v1_args.drivenamep->parts.parts_val = + Zalloc((sizeof (o_mdname_t)) * + dp->parts.parts_len); + + /* build args */ + v1_args.sp = sp; + v1_args.cl_sk = cl_get_setkey(sp->setno, sp->setname); + + /* Convert v2 args to v1 args */ + meta_conv_drvname_new2old(v1_args.drivenamep, + 
v21_args->drivenamep); + + rval = mdrpc_drvused_1(&v1_args, &res, clntp); + + free(v1_args.drivenamep->parts.parts_val); + free(v1_args.drivenamep); + + if (rval != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad drive used")); + else + (void) mdstealerror(ep, &res.status); + } else { /* version 2 */ + rval = mdrpc_drvused_2(&v2_args, &res, clntp); + if (rval != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad drive used")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! mdisok(ep)) + return (-1); + + return (0); +} + +void +free_sr(md_set_record *sr) +{ + mdrpc_getset_res res; + mdrpc_mngetset_res mnres; + + if (md_in_daemon) + return; + + /* + * dummy up a result struct, to do a deep free of the (mn)sr. + * (A deep free means that the xdr_free code will free the + * linked list of drive records for the sr and will also free + * the linked list of node records for the mnsr.) 
+ */ + if (MD_MNSET_REC(sr)) { + (void) memset(&mnres, 0, sizeof (mnres)); + mnres.mnsr = (struct md_mnset_record *)sr; + xdr_free(xdr_mdrpc_mngetset_res, (char *)&mnres); + } else { + (void) memset(&res, 0, sizeof (res)); + res.sr = sr; + xdr_free(xdr_mdrpc_getset_res, (char *)&res); + } +} + +void +short_circuit_getset( + mdrpc_getset_args *args, + mdrpc_getset_res *res +) +{ + if (args->setname != NULL) + res->sr = metad_getsetbyname(args->setname, &res->status); + else + res->sr = metad_getsetbynum(args->setno, &res->status); +} + +void +short_circuit_mngetset( + mdrpc_getset_args *args, + mdrpc_mngetset_res *res +) +{ + md_set_record *sr; + if (args->setname != NULL) + sr = metad_getsetbyname(args->setname, &res->status); + else + sr = metad_getsetbynum(args->setno, &res->status); + + if (MD_MNSET_REC(sr)) { + res->mnsr = (struct md_mnset_record *)sr; + } else { + res->mnsr = NULL; + } +} + +static int +is_auto_take_set(char *setname, set_t setno) +{ + if (setname != NULL) + return (metad_isautotakebyname(setname)); + else + return (metad_isautotakebynum(setno)); +} + +/* + * return the diskset record, and drive records. + * If record is a MNdiskset record, then only the first md_set_record + * bytes were copied from the daemon. 
+ */ +int +clnt_getset( + char *hostname, + char *setname, + set_t setno, + md_set_record **ret_sr, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_getset_args *args; + mdrpc_getset_2_args v2_args; + mdrpc_getset_res res; + int rval = -1; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_getset_2_args_u.rev1; + args->setname = setname; + args->setno = setno; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + short_circuit_getset(args, &res); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) { + /* + * This has to work during the boot up before the rpc.metad can + * run. Check to see if we can handle this as a strictly local + * diskset. + */ + if (is_auto_take_set(setname, setno)) { + mdclrerror(ep); + short_circuit_getset(args, &res); + res.sr = setdup(res.sr); + (void) mdstealerror(ep, &res.status); + } else { + return (-1); + } + } else { + + /* + * Check the client handle for the version + * and invoke the appropriate version of the + * remote procedure + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + if (version == METAD_VERSION) { /* version 1 */ + if (mdrpc_getset_1(args, &res, clntp) != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad get set")); + else + (void) mdstealerror(ep, &res.status); + } else { + if (mdrpc_getset_2(&v2_args, &res, clntp) != + RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad get set")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + } + + if (mdisok(ep)) { + rval = 0; + if (ret_sr != NULL) + *ret_sr = res.sr; + else + if (! 
md_in_daemon) + xdr_free(xdr_mdrpc_getset_res, (char *)&res); + } + + return (rval); +} + +/* + * return the multi-node diskset record, drive records and node records. + */ +clnt_mngetset( + char *hostname, + char *setname, + set_t setno, + md_mnset_record **ret_mnsr, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_getset_args *args; + mdrpc_getset_2_args v2_args; + mdrpc_mngetset_res res; + int rval = -1; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_getset_2_args_u.rev1; + args->setname = setname; + args->setno = setno; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + short_circuit_mngetset(args, &res); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + /* + * If the client is version 1, return error + * otherwise, make the remote procedure call. + */ + if (version == METAD_VERSION) { /* version 1 */ + (void) mddserror(ep, MDE_DS_RPCVERSMISMATCH, + setno, hostname, NULL, setname); + metarpcclose(clntp); + return (-1); + } else { + if (mdrpc_mngetset_2(&v2_args, &res, clntp) + != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad mn get set")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + /* If no ep error and no version mismatch - rpc call worked ok */ + if (mdisok(ep)) { + rval = 0; + if (ret_mnsr != NULL) + *ret_mnsr = res.mnsr; + else + if (! md_in_daemon) + xdr_free(xdr_mdrpc_mngetset_res, (char *)&res); + } + + return (rval); +} + +/* + * Set master nodeid and nodename in multi-node set record. 
+ */ +clnt_mnsetmaster( + char *hostname, + mdsetname_t *sp, + md_node_nm_t master_nodenm, + int master_nodeid, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_mnsetmaster_args *args; + mdrpc_mnsetmaster_2_args v2_args; + mdrpc_generic_res res; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_mnsetmaster_2_args_u.rev1; + args->sp = sp; + args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + (void) strlcpy(args->master_nodenm, master_nodenm, MD_MAX_NODENAME); + args->master_nodeid = master_nodeid; + + /* do it */ + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + /* + * If the client is version 1, return error + * otherwise, make the remote procedure call. + */ + if (version == METAD_VERSION) { /* version 1 */ + (void) mddserror(ep, MDE_DS_RPCVERSMISMATCH, + sp->setno, hostname, NULL, sp->setname); + metarpcclose(clntp); + return (-1); + } else { + if (mdrpc_mnsetmaster_2(&v2_args, &res, clntp) != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad multi-owner set master")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! mdisok(ep)) + return (-1); + + return (0); +} + +/* + * Get the MH timeout values. 
+ */ +int +clnt_gtimeout( + char *hostname, + mdsetname_t *sp, + mhd_mhiargs_t *ret_mhiargs, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_sp_args *args; + mdrpc_sp_2_args v2_args; + mdrpc_gtimeout_res res; + int rval = -1; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_sp_2_args_u.rev1; + args->sp = sp; + args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + bool = mdrpc_gtimeout_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + * and invoke the appropriate version of the + * remote procedure + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + if (version == METAD_VERSION) { /* version 1 */ + if (mdrpc_gtimeout_1(args, &res, clntp) != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad get timeout")); + else + (void) mdstealerror(ep, &res.status); + } else { + if (mdrpc_gtimeout_2(&v2_args, &res, clntp) != + RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad get timeout")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + if (mdisok(ep)) { + + /* do something with the results */ + rval = 0; + + /* copy md_mhiargs_t */ + if (ret_mhiargs != NULL) + *ret_mhiargs = *res.mhiargsp; + } + + xdr_free(xdr_mdrpc_gtimeout_res, (char *)&res); + + return (rval); +} + +/* + * get real hostname from remote host + */ +int +clnt_hostname( + char *hostname, + char **ret_hostname, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_null_args args; + mdrpc_hostname_res res; + int rval = -1; + + /* initialize */ + mdclrerror(ep); + 
(void) memset(&args, 0, sizeof (args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + args.cl_sk = NULL; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + bool = mdrpc_hostname_1_svc(&args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + if (mdrpc_hostname_1(&args, &res, clntp) != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad hostname")); + else + (void) mdstealerror(ep, &res.status); + + metarpcclose(clntp); + } + + if (mdisok(ep)) { + /* do something with the results */ + rval = 0; + + if (ret_hostname != NULL) + *ret_hostname = Strdup(res.hostname); + } + + xdr_free(xdr_mdrpc_hostname_res, (char *)&res); + + return (rval); +} + +/* + * NULLPROC - just returns a response + */ +int +clnt_nullproc( + char *hostname, + md_error_t *ep +) +{ + CLIENT *clntp; + + /* initialize */ + mdclrerror(ep); + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + bool = mdrpc_nullproc_1_svc(NULL, ep, NULL); + assert(bool == TRUE); + } else { + if ((clntp = metarpcopen(hostname, CL_DEF_TMO, ep)) == NULL) + return (-1); + + if (mdrpc_nullproc_1(NULL, ep, clntp) != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad nullproc")); + + metarpcclose(clntp); + } + + if (! mdisok(ep)) + return (-1); + + return (0); +} + +/* + * does host own the set? 
+ */ +int +clnt_ownset( + char *hostname, + mdsetname_t *sp, + int *ret_bool, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_sp_args *args; + mdrpc_sp_2_args v2_args; + mdrpc_bool_res res; + int rval = -1; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_sp_2_args_u.rev1; + args->sp = sp; + args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + bool = mdrpc_ownset_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) { + /* + * This has to work in the code path from libpreen which is + * running within fsck before the rpc.metad can run. Check + * to see if we should handle this as an auto-take diskset. + */ + if (is_auto_take_set(sp->setname, sp->setno)) { + /* Can't call mdrpc_ownset_2_svc since not in daemon */ + mdclrerror(ep); + if (s_ownset(sp->setno, ep)) + res.value = TRUE; + else + res.value = FALSE; + } else { + return (-1); + } + + } else { + + /* + * Check the client handle for the version + * and invoke the appropriate version of the + * remote procedure + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + if (version == METAD_VERSION) { /* version 1 */ + if (mdrpc_ownset_1(args, &res, clntp) != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad own set")); + else + (void) mdstealerror(ep, &res.status); + } else { + if (mdrpc_ownset_2(&v2_args, &res, clntp) != + RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad own set")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + } + + if (mdisok(ep)) { + /* do something with the results */ + rval = 0; + + if (ret_bool != NULL) + 
*ret_bool = res.value; + } + + xdr_free(xdr_mdrpc_bool_res, (char *)&res); + + return (rval); +} + +/* + * Valid set name. + */ +int +clnt_setnameok( + char *hostname, + mdsetname_t *sp, + int *ret_bool, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_sp_args *args; + mdrpc_sp_2_args v2_args; + mdrpc_bool_res res; + int rval = -1; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_sp_2_args_u.rev1; + args->sp = sp; + args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + bool = mdrpc_setnameok_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + * and invoke the appropriate version of the + * remote procedure + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + if (version == METAD_VERSION) { /* version 1 */ + if (mdrpc_setnameok_1(args, &res, clntp) != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad setname ok")); + else + (void) mdstealerror(ep, &res.status); + } else { + if (mdrpc_setnameok_2(&v2_args, &res, clntp) != + RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad setname ok")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + if (mdisok(ep)) { + /* do something with the results */ + rval = 0; + + if (ret_bool != NULL) + *ret_bool = res.value; + } + + xdr_free(xdr_mdrpc_bool_res, (char *)&res); + + return (rval); +} + +/* + * Is set number in-use? 
+ */ +int +clnt_setnumbusy( + char *hostname, + set_t setno, + int *ret_bool, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_setno_args *args; + mdrpc_setno_2_args v2_args; + mdrpc_bool_res res; + int rval = -1; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_setno_2_args_u.rev1; + args->setno = setno; + args->cl_sk = NULL; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + bool = mdrpc_setnumbusy_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + * and invoke the appropriate version of the + * remote procedure + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + if (version == METAD_VERSION) { /* version 1 */ + if (mdrpc_setnumbusy_1(args, &res, clntp) != + RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad setnumber busy")); + else + (void) mdstealerror(ep, &res.status); + } else { + if (mdrpc_setnumbusy_2(&v2_args, &res, clntp) != + RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad setnumber busy")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + if (mdisok(ep)) { + /* do something with the results */ + rval = 0; + + if (ret_bool != NULL) + *ret_bool = res.value; + } + + xdr_free(xdr_mdrpc_bool_res, (char *)&res); + + return (rval); +} + +/* + * Set the timeout values used into the drive records. 
+ */ +int +clnt_stimeout( + char *hostname, + mdsetname_t *sp, + mhd_mhiargs_t *mhiargsp, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_stimeout_args *args; + mdrpc_stimeout_2_args v2_args; + mdrpc_generic_res res; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_stimeout_2_args_u.rev1; + args->sp = sp; + args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + args->mhiargsp = mhiargsp; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + bool = mdrpc_stimeout_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + * and invoke the appropriate version of the + * remote procedure + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + if (version == METAD_VERSION) { /* version 1 */ + if (mdrpc_stimeout_1(args, &res, clntp) != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad set timeout")); + else + (void) mdstealerror(ep, &res.status); + } else { + if (mdrpc_stimeout_2(&v2_args, &res, clntp) != + RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad set timeout")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! 
mdisok(ep)) + return (-1); + + return (0); +} + +/* + * update drive records + */ +int +clnt_upd_dr_dbinfo( + char *hostname, + mdsetname_t *sp, + md_drive_desc *dd, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_drives_args v1_args; + mdrpc_drives_2_args v2_args; + mdrpc_drives_2_args_r1 *v21_args; + mdrpc_generic_res res; + int rval; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v1_args, 0, sizeof (v1_args)); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + v21_args = &v2_args.mdrpc_drives_2_args_u.rev1; + v21_args->sp = sp; + v21_args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + v21_args->drivedescs = dd; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + + /* + * If the server is local, we call the v2 procedure + */ + bool = mdrpc_upd_dr_dbinfo_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + * and invoke the appropriate version of the + * remote procedure + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + if (version == METAD_VERSION) { /* version 1 */ + + alloc_olddrvdesc(&v1_args.drivedescs, dd); + + /* build args */ + v1_args.sp = sp; + v1_args.cl_sk = cl_get_setkey(sp->setno, sp->setname); + meta_conv_drvdesc_new2old(v1_args.drivedescs, dd); + + rval = mdrpc_upd_dr_dbinfo_1(&v1_args, &res, clntp); + + free_olddrvdesc(v1_args.drivedescs); + + if (rval != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, + "metad update drive dbinfo")); + else + (void) mdstealerror(ep, &res.status); + } else { /* version 2 */ + rval = mdrpc_upd_dr_dbinfo_2(&v2_args, &res, clntp); + + if (rval != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, + "metad update 
drive dbinfo")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! mdisok(ep)) + return (-1); + + return (0); +} + +/* + * update dr_flags field of drive record. + */ +int +clnt_upd_dr_flags( + char *hostname, + mdsetname_t *sp, + md_drive_desc *dd, + uint_t new_flags, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_upd_dr_flags_args v1_args; + mdrpc_upd_dr_flags_2_args v2_args; + mdrpc_upd_dr_flags_2_args_r1 *v21_args; + mdrpc_generic_res res; + int rval; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v1_args, 0, sizeof (v1_args)); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + v21_args = &v2_args.mdrpc_upd_dr_flags_2_args_u.rev1; + v21_args->sp = sp; + v21_args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + v21_args->drivedescs = dd; + v21_args->new_flags = new_flags; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + + /* + * If the server is local, we call the v2 procedure + */ + bool = mdrpc_upd_dr_flags_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + * and invoke the appropriate version of the + * remote procedure + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + if (version == METAD_VERSION) { /* version 1 */ + + alloc_olddrvdesc(&v1_args.drivedescs, dd); + + /* build args */ + v1_args.sp = sp; + v1_args.cl_sk = cl_get_setkey(sp->setno, sp->setname); + meta_conv_drvdesc_new2old(v1_args.drivedescs, dd); + v1_args.new_flags = new_flags; + + rval = mdrpc_upd_dr_flags_1(&v1_args, &res, clntp); + + free_olddrvdesc(v1_args.drivedescs); + + if (rval != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + 
dgettext(TEXT_DOMAIN, + "metad update drive flags")); + else + (void) mdstealerror(ep, &res.status); + } else { /* version 2 */ + rval = mdrpc_upd_dr_flags_2(&v2_args, &res, clntp); + + if (rval != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, + "metad update drive flags")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! mdisok(ep)) { + if (! mdanyrpcerror(ep)) + return (-1); + if (strcmp(mynode(), hostname) == 0) + return (-1); + mdclrerror(ep); + } + + return (0); +} + +/* + * update set record flags + * This replaces all of the sr_flags with the new_flags. It relies on the + * caller to "do the right thing" to preserve the existing flags that should + * not be reset. + */ +static int +upd_sr_flags_common( + char *hostname, + mdsetname_t *sp, + uint_t new_flags, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_upd_sr_flags_args *args; + mdrpc_upd_sr_flags_2_args v2_args; + mdrpc_generic_res res; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_upd_sr_flags_2_args_u.rev1; + args->sp = sp; + args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + + args->new_flags = new_flags; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + bool = mdrpc_upd_sr_flags_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + * and invoke the appropriate version of the + * remote procedure + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + if (version == METAD_VERSION) { /* version 1 */ + if (mdrpc_upd_sr_flags_1(args, &res, clntp) != + RPC_SUCCESS) + (void) 
mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad update set flags")); + else + (void) mdstealerror(ep, &res.status); + } else { + if (mdrpc_upd_sr_flags_2(&v2_args, &res, clntp) != + RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad update set flags")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! mdisok(ep)) { + if (! mdanyrpcerror(ep)) + return (-1); + if (strcmp(mynode(), hostname) == 0) + return (-1); + mdclrerror(ep); + } + + return (0); +} + +/* + * Enable bits in the set record flags field. This just turns on the specified + * bits and leaves the other bits alone. + */ +int +clnt_enable_sr_flags( + char *hostname, + mdsetname_t *sp, + uint_t flags, + md_error_t *ep +) +{ + uint_t new_flags; + md_set_desc *sd; + + mdclrerror(ep); + + /* Get the flags from the current set */ + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + /* Turn on the specified bits */ + new_flags = (sd->sd_flags | flags); + + /* do it */ + return (upd_sr_flags_common(hostname, sp, new_flags, ep)); +} + +/* + * Disable bits in the set record flags field. This just turns off the + * specified bits and leaves the other bits alone. + */ +int +clnt_disable_sr_flags( + char *hostname, + mdsetname_t *sp, + uint_t flags, + md_error_t *ep +) +{ + uint_t new_flags; + md_set_desc *sd; + + mdclrerror(ep); + + /* Get the flags from the current set */ + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + /* Turn off the specified bits */ + new_flags = (sd->sd_flags & ~flags); + + /* do it */ + return (upd_sr_flags_common(hostname, sp, new_flags, ep)); +} + +/* + * Assign the flags as the new value(s) for the MD_SR_STATE_FLAGS within the + * set record flags field. This actually can set any bits but only clears + * the bits within the MD_SR_STATE_FLAGS subfield and leaves any other + * bits turned on. 
It can be used to clear (state) and set bits all in one + * rpc call. + */ +int +clnt_upd_sr_flags( + char *hostname, + mdsetname_t *sp, + uint_t flags, + md_error_t *ep +) +{ + uint_t new_flags; + md_set_desc *sd; + + mdclrerror(ep); + + /* Get the flags from the current set */ + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + /* clear the existing state flags */ + sd->sd_flags &= ~MD_SR_STATE_FLAGS; + + /* Or in the new value */ + new_flags = (sd->sd_flags | flags); + + /* do it */ + return (upd_sr_flags_common(hostname, sp, new_flags, ep)); +} + +md_setkey_t * +cl_get_setkey(set_t setno, char *setname) +{ + + if (my_cl_sk == NULL) { + my_cl_sk = Zalloc(sizeof (md_setkey_t)); + my_cl_sk->sk_setno = setno; + my_cl_sk->sk_setname = Strdup(setname); + my_cl_sk->sk_host = Strdup(mynode()); + } else { + my_cl_sk->sk_setno = setno; + if (my_cl_sk->sk_setname != NULL) + Free(my_cl_sk->sk_setname); + my_cl_sk->sk_setname = Strdup(setname); + } + + return (my_cl_sk); +} + +void +cl_set_setkey(md_setkey_t *cl_sk) +{ + if ((cl_sk != NULL) && (my_cl_sk != NULL)) { + assert(my_cl_sk->sk_setno == cl_sk->sk_setno); + assert(strcmp(my_cl_sk->sk_setname, cl_sk->sk_setname) == 0); + assert(strcmp(my_cl_sk->sk_host, cl_sk->sk_host) == 0); + my_cl_sk->sk_key = cl_sk->sk_key; + return; + } + + if (my_cl_sk != NULL) { + if (my_cl_sk->sk_setname != NULL) + Free(my_cl_sk->sk_setname); + if (my_cl_sk->sk_host != NULL) + Free(my_cl_sk->sk_host); + Free(my_cl_sk); + } + + my_cl_sk = NULL; + + /* get here, if set called before get */ + if (cl_sk != NULL) { + my_cl_sk = Zalloc(sizeof (md_setkey_t)); + my_cl_sk->sk_host = Strdup(cl_sk->sk_host); + my_cl_sk->sk_setno = cl_sk->sk_setno; + my_cl_sk->sk_setname = Strdup(cl_sk->sk_setname); + my_cl_sk->sk_key = cl_sk->sk_key; + } +} + +/* + * Unlock the set after operation is complete. 
+ */ +int +clnt_unlock_set( + char *hostname, + md_setkey_t *cl_sk, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_null_args args; + mdrpc_setlock_res res; + + /* initialize */ + mdclrerror(ep); + (void) memset(&args, 0, sizeof (args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + args.cl_sk = cl_sk; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + bool = mdrpc_unlock_set_1_svc(&args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + if (mdrpc_unlock_set_1(&args, &res, clntp) != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad unlock set")); + else + (void) mdstealerror(ep, &res.status); + + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_setlock_res, (char *)&res); + + if (! mdisok(ep)) { + if (! mdanyrpcerror(ep)) + return (-1); + if (strcmp(mynode(), hostname) == 0) + return (-1); + mdclrerror(ep); + } + + return (0); +} + +/* + * Lock set so that only operators with valid keys are allowed in the daemon. 
+ */ +int +clnt_lock_set( + char *hostname, + mdsetname_t *sp, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_null_args args; + mdrpc_setlock_res res; + + /* initialize */ + mdclrerror(ep); + (void) memset(&args, 0, sizeof (args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + args.cl_sk = cl_get_setkey(sp->setno, sp->setname); + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + bool = mdrpc_lock_set_1_svc(&args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + if (mdrpc_lock_set_1(&args, &res, clntp) != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad lock set")); + else + (void) mdstealerror(ep, &res.status); + + metarpcclose(clntp); + } + + if (mdisok(ep)) + cl_set_setkey(res.cl_sk); + + xdr_free(xdr_mdrpc_setlock_res, (char *)&res); + + if (! mdisok(ep)) { + if (! mdanyrpcerror(ep)) + return (-1); + if (strcmp(mynode(), hostname) == 0) + return (-1); + mdclrerror(ep); + } + + return (0); +} + +/* + * Add mediator hosts to disksets. 
+ */ +int +clnt_updmeds( + char *hostname, + mdsetname_t *sp, + md_h_arr_t *medp, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_updmeds_args *args; + mdrpc_updmeds_2_args v2_args; + mdrpc_generic_res res; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_updmeds_2_args_u.rev1; + args->sp = sp; + args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + args->meds = *medp; /* structure assignment */ + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + bool = mdrpc_updmeds_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + * and invoke the appropriate version of the + * remote procedure + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + if (version == METAD_VERSION) { /* version 1 */ + if (mdrpc_updmeds_1(args, &res, clntp) != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad add hosts")); + else + (void) mdstealerror(ep, &res.status); + } else { + if (mdrpc_updmeds_2(&v2_args, &res, clntp) != + RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad add hosts")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! mdisok(ep)) + return (-1); + + return (0); +} + +/* + * update nr_flags field of node records based + * on given action. 
+ */ +int +clnt_upd_nr_flags( + char *hostname, + mdsetname_t *sp, + md_mnnode_desc *nd, + uint_t flag_action, + uint_t flags, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_upd_nr_flags_args *args; + mdrpc_upd_nr_flags_2_args v2_args; + mdrpc_generic_res res; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_upd_nr_flags_2_args_u.rev1; + args->sp = sp; + args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + args->nodedescs = nd; + args->flag_action = flag_action; + args->flags = flags; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + bool = mdrpc_upd_nr_flags_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + /* + * If the client is version 1, return error + * otherwise, make the remote procedure call. + */ + if (version == METAD_VERSION) { /* version 1 */ + (void) mddserror(ep, MDE_DS_RPCVERSMISMATCH, + sp->setno, hostname, NULL, sp->setname); + metarpcclose(clntp); + return (-1); + } else { + if (mdrpc_upd_nr_flags_2(&v2_args, &res, clntp) + != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, + "metad set node flags")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! mdisok(ep)) { + if (! mdanyrpcerror(ep)) + return (-1); + if (strcmp(mynode(), hostname) == 0) + return (-1); + mdclrerror(ep); + } + + return (0); +} + +/* + * Clear set locks for all MN disksets. + * Used during reconfig cycle to recover from failed nodes. 
+ */ +int +clnt_clr_mnsetlock( + char *hostname, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_null_args args; + mdrpc_generic_res res; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&args, 0, sizeof (args)); + (void) memset(&res, 0, sizeof (res)); + + /* do it */ + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + /* + * If the client is version 1, return error + * otherwise, make the remote procedure call. + */ + if (version == METAD_VERSION) { /* version 1 */ + (void) mddserror(ep, MDE_DS_RPCVERSMISMATCH, + NULL, hostname, NULL, NULL); + metarpcclose(clntp); + return (-1); + } else { + if (mdrpc_clr_mnsetlock_2(&args, &res, clntp) != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad clr mnsetlock")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! mdisok(ep)) + return (-1); + + return (0); +} + +/* + * Calls to suspend, resume or reinit the rpc.mdcommd daemon. + * This allows a node to remotely suspend, reinit and resume the + * rpc.mdcommd daemon on the given hostname node. Used by libmeta + * to lock out class 1 messages (metainit, etc) on all nodes when running + * metaset and metadb commands on this node. + * + * When suspending the commd, the suspend request will fail until all + * messages have been drained from the rpc.mdcommd. This routine will + * spin sending the suspend request until the rpc.mdcommd is drained + * or until rpc.mdcommd returns a failure other than MDMNE_SET_NOT_DRAINED. + * + * Also used to send the rpc.mdcommd daemon a new nodelist by draining all + * messages from the mdcommd and sending a reinit command to have mdcommd + * get the new nodelist from rpc.metad. 
Used when nodelist is changed + * during: + * - addition or deletion of host from diskset + * - join or withdrawal of host from diskset + * - addition of first disk to diskset (joins all nodes) + * - removal of last disk from diskset (withdraws all nodes) + */ +int +clnt_mdcommdctl( + char *hostname, + int flag_action, + mdsetname_t *sp, + md_mn_msgclass_t class, + uint_t flags, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_mdcommdctl_args *args; + mdrpc_mdcommdctl_2_args v2_args; + mdrpc_generic_res res; + int version; + int suspend_spin = 0; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_mdcommdctl_2_args_u.rev1; + args->flag_action = flag_action; + args->setno = sp->setno; + args->class = class; + args->flags = flags; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + /* + * Call v2 procedure directly if rpc.metad on this node is + * sending message to itself. + */ + if (flag_action == COMMDCTL_SUSPEND) { + suspend_spin = 1; + while (suspend_spin) { + suspend_spin = 0; + bool = mdrpc_mdcommdctl_2_svc(&v2_args, &res, + NULL); + assert(bool == TRUE); + /* + * If set not yet drained, wait a second + * and try again. + */ + if (mdisdserror(&(res.status), + MDE_DS_COMMDCTL_SUSPEND_NYD)) { + /* Wait a second and try again */ + mdclrerror(&(res.status)); + (void) sleep(1); + suspend_spin = 1; + } + } + } else { + bool = mdrpc_mdcommdctl_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + } + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + /* + * If the client is version 1, return error + * otherwise, make the remote procedure call. 
+ */ + if (version == METAD_VERSION) { /* version 1 */ + (void) mddserror(ep, MDE_DS_RPCVERSMISMATCH, + sp->setno, hostname, NULL, sp->setname); + metarpcclose(clntp); + return (-1); + } + + if (flag_action == COMMDCTL_SUSPEND) { + suspend_spin = 1; + while (suspend_spin) { + suspend_spin = 0; + if (mdrpc_mdcommdctl_2(&v2_args, &res, + clntp) != RPC_SUCCESS) { + (void) mdrpcerror(ep, clntp, + hostname, + dgettext(TEXT_DOMAIN, + "metad commd control")); + } else { + /* + * If set not yet drained, + * wait a second and + * and try again. + */ + if (mdisdserror(&(res.status), + MDE_DS_COMMDCTL_SUSPEND_NYD)) { + mdclrerror(&(res.status)); + (void) sleep(1); + suspend_spin = 1; + } else { + (void) mdstealerror(ep, + &res.status); + } + } + } + } else { + if (mdrpc_mdcommdctl_2(&v2_args, &res, clntp) + != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, + "metad commd control")); + else + (void) mdstealerror(ep, &res.status); + } + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! mdisok(ep)) { + if (! mdanyrpcerror(ep)) + return (-1); + if (strcmp(mynode(), hostname) == 0) + return (-1); + mdclrerror(ep); + } + + return (0); +} + +/* + * Is owner node stale? + */ +int +clnt_mn_is_stale( + char *hostname, + mdsetname_t *sp, + int *ret_bool, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_setno_args *args; + mdrpc_setno_2_args v2_args; + mdrpc_bool_res res; + int rval = -1; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_setno_2_args_u.rev1; + args->setno = sp->setno; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + /* + * Call v2 procedure directly if rpc.metad on this node is + * sending message to itself. 
+ */ + bool = mdrpc_mn_is_stale_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + * and invoke the appropriate version of the + * remote procedure + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + /* + * If the client is version 1, return error + * otherwise, make the remote procedure call. + */ + if (version == METAD_VERSION) { /* version 1 */ + (void) mddserror(ep, MDE_DS_RPCVERSMISMATCH, + sp->setno, hostname, NULL, sp->setname); + metarpcclose(clntp); + return (-1); + } else { + if (mdrpc_mn_is_stale_2(&v2_args, &res, clntp) != + RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad mn is stale")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + if (mdisok(ep)) { + /* do something with the results */ + rval = 0; + + if (ret_bool != NULL) + *ret_bool = res.value; + } + + xdr_free(xdr_mdrpc_bool_res, (char *)&res); + + return (rval); +} + +/* + * Free md_drive_desc linked list of drive descriptors that was alloc'd + * from a call to the RPC routine clnt_getdrivedesc. Drive descriptors + * are from another node. + */ +void +free_rem_dd(md_drive_desc *dd) +{ + mdrpc_getdrivedesc_res res; + + /* + * dummy up a result struct, to do a deep free of the dd. + * (A deep free means that the xdr_free code will free the + * linked list of drive descs.) + */ + (void) memset(&res, 0, sizeof (res)); + res.dd = (struct md_drive_desc *)dd; + xdr_free(xdr_mdrpc_getdrivedesc_res, (char *)&res); +} + +/* + * Get a partially filled in drive desc from remote node. Used in MN + * disksets during the reconfig cycle to get the diskset drive + * information from another host in order to sync up all nodes. 
+ * Used when the drive record information isn't good enough + * since the drive record doesn't give the name of + * the drive, but just a key into that other node's nodespace. + * Returned drive desc has the drive name filled in but no other strings + * in the drivename structure. + * + * Returns a 0 if RPC was successful, 1 otherwise. + */ +int +clnt_getdrivedesc( + char *hostname, + mdsetname_t *sp, + md_drive_desc **ret_dd, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_sp_args *args; + mdrpc_sp_2_args v2_args; + mdrpc_getdrivedesc_res res; + int version; + int rval = -1; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_sp_2_args_u.rev1; + args->sp = sp; + args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + bool = mdrpc_getdrivedesc_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + /* + * If the client is version 1, return error + * otherwise, make the remote procedure call. 
+ */ + if (version == METAD_VERSION) { /* version 1 */ + (void) mddserror(ep, MDE_DS_RPCVERSMISMATCH, + sp->setno, hostname, NULL, sp->setname); + metarpcclose(clntp); + return (-1); + } else { + if (mdrpc_getdrivedesc_2(&v2_args, &res, clntp) + != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, + "metad get drive desc set")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + /* If no ep error and no version mismatch - rpc call worked ok */ + if (mdisok(ep)) { + rval = 0; + if (ret_dd != NULL) + *ret_dd = res.dd; + else + xdr_free(xdr_mdrpc_getdrivedesc_res, (char *)&res); + } + + return (rval); +} + +/* + * update dr_flags field of drive record. + * Also sync up genid of drive descriptors and make set + * record and node records match the genid. + * + * Returns a 0 if RPC was successful, 1 otherwise. + */ +int +clnt_upd_dr_reconfig( + char *hostname, + mdsetname_t *sp, + md_drive_desc *dd, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_upd_dr_flags_2_args v2_args; + mdrpc_upd_dr_flags_2_args_r1 *v21_args; + mdrpc_generic_res res; + int rval; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + v21_args = &v2_args.mdrpc_upd_dr_flags_2_args_u.rev1; + v21_args->sp = sp; + v21_args->drivedescs = dd; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + + /* + * If the server is local, we call the v2 procedure + */ + bool = mdrpc_upd_dr_reconfig_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + /* + * If the client is version 1, return error + * otherwise, make the remote procedure call. 
+ */ + if (version == METAD_VERSION) { /* version 1 */ + (void) mddserror(ep, MDE_DS_RPCVERSMISMATCH, + sp->setno, hostname, NULL, sp->setname); + metarpcclose(clntp); + return (-1); + } else { + rval = mdrpc_upd_dr_reconfig_2(&v2_args, &res, clntp); + + if (rval != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, + "metad update drive reconfig")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! mdisok(ep)) { + if (! mdanyrpcerror(ep)) + return (-1); + if (strcmp(mynode(), hostname) == 0) + return (-1); + mdclrerror(ep); + } + + return (0); +} + +/* + * Reset mirror owner(s) if mirror owner(s) is in the list of + * node's specified in the array of nodeids. + * This is called when a node has been deleted or withdrawn + * from the diskset. + */ +int +clnt_reset_mirror_owner( + char *hostname, + mdsetname_t *sp, + int node_c, + int node_id[], + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_nodeid_args *args; + mdrpc_nodeid_2_args v2_args; + mdrpc_generic_res res; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_nodeid_2_args_u.rev1; + args->sp = sp; + args->cl_sk = cl_get_setkey(sp->setno, sp->setname); + args->nodeid.nodeid_len = node_c; + args->nodeid.nodeid_val = &node_id[0]; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + bool = mdrpc_reset_mirror_owner_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + * and invoke the appropriate version of the + * remote procedure + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + /* + * If the client is 
version 1, return error + * otherwise, make the remote procedure call. + */ + if (version == METAD_VERSION) { /* version 1 */ + (void) mddserror(ep, MDE_DS_RPCVERSMISMATCH, + sp->setno, hostname, NULL, sp->setname); + metarpcclose(clntp); + return (-1); + } else { + if (mdrpc_reset_mirror_owner_2(&v2_args, &res, clntp) + != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, + "metad reset mirror owner")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! mdisok(ep)) + return (-1); + + return (0); +} + +/* + * Call to suspend and resume I/O for given diskset(s). + * This allows a node to remotely suspend and resume I/O on + * a MN diskset. A diskset number of 0 represents all MN disksets. + */ +int +clnt_mn_susp_res_io( + char *hostname, + set_t setno, + int cmd, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_mn_susp_res_io_args *args; + mdrpc_mn_susp_res_io_2_args v2_args; + mdrpc_generic_res res; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&v2_args, 0, sizeof (v2_args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + v2_args.rev = MD_METAD_ARGS_REV_1; + args = &v2_args.mdrpc_mn_susp_res_io_2_args_u.rev1; + args->susp_res_cmd = cmd; + args->susp_res_setno = setno; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + /* + * Call v2 procedure directly if rpc.metad on this node is + * sending message to itself. + */ + bool = mdrpc_mn_susp_res_io_2_svc(&v2_args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + /* + * If the client is version 1, return error + * otherwise, make the remote procedure call. 
+ */ + if (version == METAD_VERSION) { /* version 1 */ + (void) mddserror(ep, MDE_DS_RPCVERSMISMATCH, + setno, hostname, NULL, NULL); + metarpcclose(clntp); + return (-1); + } else { + if (mdrpc_mn_susp_res_io_2(&v2_args, &res, clntp) + != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, + "metad mn_susp_res_io control")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! mdisok(ep)) { + if (! mdanyrpcerror(ep)) + return (-1); + if (strcmp(mynode(), hostname) == 0) + return (-1); + mdclrerror(ep); + } + + return (0); +} + +/* + * Resnarf the set after the set has been imported + * + * We should never be making this procedure call + * over the wire, it's sole purpose is to snarf + * the imported set on the localhost. + */ +int +clnt_resnarf_set( + char *hostname, + set_t setno, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_setno_2_args args; + mdrpc_generic_res res; + int rval = -1; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&args, 0, sizeof (args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + args.rev = MD_METAD_ARGS_REV_1; + args.mdrpc_setno_2_args_u.rev1.setno = setno; + args.mdrpc_setno_2_args_u.rev1.cl_sk = NULL; + + /* do it */ + if (strcmp(mynode(), hostname) == 0) { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* Check the client handle for the version */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + /* If the client is version 1, return error */ + if (version == METAD_VERSION) { /* version 1 */ + (void) mddserror(ep, MDE_DS_CANTRESNARF, MD_SET_BAD, + mynode(), NULL, NULL); + } else { + rval = mdrpc_resnarf_set_2(&args, &res, clntp); + + if (rval != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad resnarf set")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + + } else { + (void) 
mddserror(ep, MDE_DS_CANTRESNARF, MD_SET_BAD, + mynode(), NULL, NULL); + } + + if (mdisok(ep)) + rval = 0; + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + return (rval); +} + +/* + * Call to start a resync for a given diskset. + * Used when a node has been added to a diskset. + * Should be called after rpc.mdcommd is resumed. + */ +int +clnt_mn_mirror_resync_all( + char *hostname, + set_t setno, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_setno_2_args args; + mdrpc_generic_res res; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&args, 0, sizeof (args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + args.rev = MD_METAD_ARGS_REV_1; + args.mdrpc_setno_2_args_u.rev1.setno = setno; + args.mdrpc_setno_2_args_u.rev1.cl_sk = NULL; + + /* do it */ + if (md_in_daemon && strcmp(mynode(), hostname) == 0) { + int bool; + /* + * Call v2 procedure directly if rpc.metad on this node is + * sending message to itself. + */ + bool = mdrpc_mn_mirror_resync_all_2_svc(&args, &res, NULL); + assert(bool == TRUE); + (void) mdstealerror(ep, &res.status); + } else { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + /* + * If the client is version 1, return error + * otherwise, make the remote procedure call. + */ + if (version == METAD_VERSION) { /* version 1 */ + (void) mddserror(ep, MDE_DS_RPCVERSMISMATCH, + setno, hostname, NULL, NULL); + metarpcclose(clntp); + return (-1); + } else { + if (mdrpc_mn_mirror_resync_all_2(&args, &res, clntp) + != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, + "metad mn_mirror_resync_all")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! mdisok(ep)) { + if (! 
mdanyrpcerror(ep)) + return (-1); + if (strcmp(mynode(), hostname) == 0) + return (-1); + mdclrerror(ep); + } + + return (0); +} + +/* + * Call to update the ABR state for all soft partitions. + * Used when a node has been added to a diskset. + * Should be called after rpc.mdcommd is resumed. + */ +int +clnt_mn_sp_update_abr( + char *hostname, + set_t setno, + md_error_t *ep +) +{ + CLIENT *clntp; + mdrpc_setno_2_args args; + mdrpc_generic_res res; + int version; + + /* initialize */ + mdclrerror(ep); + (void) memset(&args, 0, sizeof (args)); + (void) memset(&res, 0, sizeof (res)); + + /* build args */ + args.rev = MD_METAD_ARGS_REV_1; + args.mdrpc_setno_2_args_u.rev1.setno = setno; + args.mdrpc_setno_2_args_u.rev1.cl_sk = NULL; + + /* + * No need to call function if adding local node as ABR cannot + * be set. + */ + if (strcmp(mynode(), hostname) != 0) { + if ((clntp = metarpcopen(hostname, CL_LONG_TMO, ep)) == NULL) + return (-1); + + /* + * Check the client handle for the version + */ + CLNT_CONTROL(clntp, CLGET_VERS, (char *)&version); + + /* + * If the client is version 1, return error + * otherwise, make the remote procedure call. + */ + if (version == METAD_VERSION) { /* version 1 */ + (void) mddserror(ep, MDE_DS_RPCVERSMISMATCH, + setno, hostname, NULL, NULL); + metarpcclose(clntp); + return (-1); + } else { + if (mdrpc_mn_sp_update_abr_2(&args, &res, clntp) + != RPC_SUCCESS) + (void) mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, + "metad mn_sp_update_abr")); + else + (void) mdstealerror(ep, &res.status); + } + + metarpcclose(clntp); + } + + xdr_free(xdr_mdrpc_generic_res, (char *)&res); + + if (! mdisok(ep)) { + if (! 
mdanyrpcerror(ep)) + return (-1); + mdclrerror(ep); + } + + return (0); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_metad_subr.c b/usr/src/lib/lvm/libmeta/common/meta_metad_subr.c new file mode 100644 index 0000000000..df50a7650e --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_metad_subr.c @@ -0,0 +1,2055 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. 
+ */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +/* + * interface between user land and the set records + */ + +#include <meta.h> +#include <metad.h> +#include <sdssc.h> +#include <syslog.h> +#include <sys/cladm.h> +#include "meta_set_prv.h" + +#include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/svm.h> + +static md_set_record *setrecords = NULL; /* head of cache linked list */ +static int setsnarfdone = 0; + +typedef struct key_lst_t { + side_t kl_side; + mdkey_t kl_key; + struct key_lst_t *kl_next; +} key_lst_t; + +typedef struct ur_recid_lst { + mddb_recid_t url_recid; + struct ur_recid_lst *url_nx; +} ur_recid_lst_t; + +static ur_recid_lst_t *url_used = NULL; +static ur_recid_lst_t *url_tode = NULL; + +static void +url_addl(ur_recid_lst_t **urlpp, mddb_recid_t recid) +{ + /* Run to the end of the list */ + for (/* void */; (*urlpp != NULL); urlpp = &(*urlpp)->url_nx) + if ((*urlpp)->url_recid == recid) + return; + + /* Add the new member */ + *urlpp = Zalloc(sizeof (**urlpp)); + if (*urlpp == NULL) + return; + + (*urlpp)->url_recid = recid; +} + +static int +url_findl(ur_recid_lst_t *urlp, mddb_recid_t recid) +{ + while (urlp != NULL) { + if (urlp->url_recid == recid) + return (1); + urlp = urlp->url_nx; + } + return (0); +} + +static void +url_freel(ur_recid_lst_t **urlpp) +{ + ur_recid_lst_t *urlp; + ur_recid_lst_t *turlp; + + for (turlp = *urlpp; turlp != NULL; turlp = urlp) { + urlp = turlp->url_nx; + Free(turlp); + } + *urlpp = (ur_recid_lst_t *)NULL; +} + +static int +ckncvt_set_record(mddb_userreq_t *reqp, md_error_t *ep) +{ + mddb_userreq_t req; + md_set_record *sr; + int recs[3]; + + if (reqp->ur_size == sizeof (*sr)) + return (0); + + if (! 
md_in_daemon) { + if (reqp->ur_size >= sizeof (*sr)) + return (0); + + reqp->ur_data = (uintptr_t)Realloc((void *)reqp->ur_data, + sizeof (*sr)); + (void) memset(((char *)reqp->ur_data) + reqp->ur_size, '\0', + sizeof (*sr) - reqp->ur_size); + reqp->ur_size = sizeof (*sr); + return (0); + } + + /* + * If here, then the daemon is calling, and so the automatic + * conversion will be performed. + */ + + /* shorthand */ + req = *reqp; /* structure assignment */ + sr = (md_set_record *)req.ur_data; + + if (sr->sr_flags & MD_SR_CVT) + return (0); + + /* Leave multi-node set records alone */ + if (MD_MNSET_REC(sr)) { + return (0); + } + + /* Mark the old record as converted */ + sr->sr_flags |= MD_SR_CVT; + + METAD_SETUP_SR(MD_DB_SETDATA, sr->sr_selfid) + + if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, NULL) != 0) + return (mdstealerror(ep, &req.ur_mde)); + + /* Create space for the new record */ + METAD_SETUP_SR(MD_DB_CREATE, 0); + req.ur_size = sizeof (*sr); + + if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, NULL) != 0) + return (mdstealerror(ep, &req.ur_mde)); + + /* Allocate the new record */ + sr = Zalloc(sizeof (*sr)); + + /* copy all the data from the record being converted */ + (void) memmove(sr, (void *)reqp->ur_data, reqp->ur_size); + sr->sr_flags &= ~MD_SR_CVT; + + /* adjust the selfid to point to the new record */ + sr->sr_selfid = req.ur_recid; + + METAD_SETUP_SR(MD_DB_SETDATA, sr->sr_selfid) + req.ur_size = sizeof (*sr); + req.ur_data = (uintptr_t)sr; + + if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, NULL) != 0) { + Free(sr); + return (mdstealerror(ep, &req.ur_mde)); + } + + /* Commit the old and the new */ + recs[0] = ((md_set_record *)reqp->ur_data)->sr_selfid; + recs[1] = sr->sr_selfid; + recs[2] = 0; + + METAD_SETUP_UR(MD_DB_COMMIT_MANY, 0, 0); + req.ur_size = sizeof (recs); + req.ur_data = (uintptr_t)recs; + + if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, NULL) != 0) { + Free(sr); + return (mdstealerror(ep, &req.ur_mde)); + } + + /* Add the the 
old record to the list of records to delete */ + url_addl(&url_tode, ((md_set_record *)reqp->ur_data)->sr_selfid); + + /* Free the old records space */ + Free((void *)reqp->ur_data); + + /* Adjust the reqp structure to point to the new record and size */ + reqp->ur_recid = sr->sr_selfid; + reqp->ur_size = sizeof (*sr); + reqp->ur_data = (uintptr_t)sr; + + return (0); +} + +mddb_userreq_t * +get_db_rec( + md_ur_get_cmd_t cmd, + set_t setno, + mddb_type_t type, + uint_t type2, + mddb_recid_t *idp, + md_error_t *ep +) +{ + mddb_userreq_t *reqp = Zalloc(sizeof (*reqp)); + + reqp->ur_setno = setno; + reqp->ur_type = type; + reqp->ur_type2 = type2; + + switch (cmd) { + case MD_UR_GET_NEXT: + reqp->ur_cmd = MD_DB_GETNEXTREC; + reqp->ur_recid = *idp; + if (metaioctl(MD_DB_USERREQ, reqp, &reqp->ur_mde, NULL) + != 0) { + (void) mdstealerror(ep, &reqp->ur_mde); + Free(reqp); + return (NULL); + } + *idp = reqp->ur_recid; + break; + case MD_UR_GET_WKEY: + reqp->ur_recid = *idp; + break; + } + + if (*idp <= 0) { + Free(reqp); + return (NULL); + } + + reqp->ur_cmd = MD_DB_GETSIZE; + if (metaioctl(MD_DB_USERREQ, reqp, &reqp->ur_mde, NULL) != 0) { + (void) mdstealerror(ep, &reqp->ur_mde); + Free(reqp); + + *idp = 0; + return (NULL); + } + + reqp->ur_cmd = MD_DB_GETDATA; + reqp->ur_data = (uintptr_t)Zalloc(reqp->ur_size); + if (metaioctl(MD_DB_USERREQ, reqp, &reqp->ur_mde, NULL) != 0) { + (void) mdstealerror(ep, &reqp->ur_mde); + Free((void *)reqp->ur_data); + Free(reqp); + *idp = 0; + return (NULL); + } + + switch (reqp->ur_type) { + case MDDB_USER: + switch (reqp->ur_type2) { + case MDDB_UR_SR: + if (ckncvt_set_record(reqp, ep)) { + Free((void *)reqp->ur_data); + Free(reqp); + return (NULL); + } + break; + } + break; + } + + return (reqp); +} + +void * +get_ur_rec( + set_t setno, + md_ur_get_cmd_t cmd, + uint_t type2, + mddb_recid_t *idp, + md_error_t *ep +) +{ + mddb_userreq_t *reqp = NULL; + void *ret_val; + + assert(idp != NULL); + + reqp = get_db_rec(cmd, setno, MDDB_USER, 
type2, idp, ep); + if (reqp == NULL) + return (NULL); + + ret_val = (void *)reqp->ur_data; + Free(reqp); + return (ret_val); +} + +/* + * Called by rpc.metad on startup of disksets to cleanup + * the host entries associated with a diskset. This is needed if + * a node failed or the metaset command was killed during the addition + * of a node to a diskset. + * + * This is called for all traditional disksets. + * This is only called for MNdisksets when in there is only one node + * in all of the MN disksets and this node is not running SunCluster. + * (Otherwise, the cleanup of the host entries is handled by a + * reconfig cycle that the SunCluster software calls). + */ +static int +sr_hosts(md_set_record *sr) +{ + int i, + nid, + self_in_set = FALSE; + md_error_t xep = mdnullerror; + md_mnnode_record *nr; + md_mnset_record *mnsr; + + if (MD_MNSET_REC(sr)) { + mnsr = (struct md_mnset_record *)sr; + nr = mnsr->sr_nodechain; + /* + * Already guaranteed to be only 1 node in set which + * is mynode (done in sr_validate). + * Now, check if node is in the OK state. If not in + * the OK state, leave self_in_set FALSE so that + * set will be removed. + */ + if (nr->nr_flags & MD_MN_NODE_OK) + self_in_set = TRUE; + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sr->sr_nodes[i][0] == '\0') + continue; + + /* Make sure we are in the set and skip this node */ + if (strcmp(sr->sr_nodes[i], mynode()) == 0) { + self_in_set = TRUE; + break; + } + } + } + + if ((self_in_set == FALSE) && (!(MD_MNSET_REC(sr))) && + (_cladm(CL_CONFIG, CL_NODEID, &nid) == 0)) { + + /* + * See if we've got a node which has been booted in + * non-cluster mode. If true the nodeid will match + * one of the sr_nodes values because the conversion + * from nodeid to hostname failed to occur. 
+ */ + for (i = 0; i < MD_MAXSIDES; i++) { + if (sr->sr_nodes[i][0] == 0) + continue; + if (atoi(sr->sr_nodes[i]) == nid) + self_in_set = TRUE; + } + } + + /* If we aren't in the set, delete the set */ + if (self_in_set == FALSE) { + syslog(LOG_ERR, dgettext(TEXT_DOMAIN, + "Removing set %s from database\n"), + sr->sr_setname); + s_delset(sr->sr_setname, &xep); + if (! mdisok(&xep)) + mdclrerror(&xep); + return (1); + } + return (0); +} + +void +sr_del_drv(md_set_record *sr, mddb_recid_t recid) +{ + mddb_userreq_t req; + md_error_t xep = mdnullerror; + + if (!s_ownset(sr->sr_setno, &xep)) { + if (! mdisok(&xep)) + mdclrerror(&xep); + goto skip; + } + + /* delete the replicas? */ + /* release ownership of the drive? */ + /* NOTE: We may not have a name, so both of the above are ugly! */ + +skip: + (void) memset(&req, 0, sizeof (req)); + METAD_SETUP_DR(MD_DB_DELETE, recid) + if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, NULL) != 0) + mdclrerror(&req.ur_mde); + + dr_cache_del(sr, recid); +} + +static void +sr_drvs(md_set_record *sr) +{ + md_drive_record *dr; + int i; + int modified = 0; + int sidesok; + mdnm_params_t nm; + static char device_name[MAXPATHLEN]; + md_error_t xep = mdnullerror; + md_mnnode_record *nr; + md_mnset_record *mnsr; + + for (dr = sr->sr_drivechain; dr != NULL; dr = dr->dr_next) { + /* If we were mid-add, cleanup */ + if ((dr->dr_flags & MD_DR_ADD)) { + sr_del_drv(sr, dr->dr_selfid); + modified++; + continue; + } + + sidesok = TRUE; + if (MD_MNSET_REC(sr)) { + mnsr = (md_mnset_record *)sr; + nr = mnsr->sr_nodechain; + /* + * MultiNode disksets only have entries for + * their side in the local set. Verify + * that drive has a name associated with + * this node's side. 
+ */ + while (nr) { + /* Find my node */ + if (strcmp(mynode(), nr->nr_nodename) != 0) { + nr = nr->nr_next; + continue; + } + + (void) memset(&nm, '\0', sizeof (nm)); + nm.setno = MD_LOCAL_SET; + nm.side = nr->nr_nodeid; + nm.key = dr->dr_key; + nm.devname = (uint64_t)device_name; + + if (metaioctl(MD_IOCGET_NM, &nm, &nm.mde, + NULL) != 0) { + if (! mdissyserror(&nm.mde, ENOENT)) { + mdclrerror(&nm.mde); + return; + } + } + + /* + * If entry is found for this node, then + * break out of loop walking through + * node list. For a multi-node diskset, + * there should only be an entry for + * this node. + */ + if (nm.key != MD_KEYWILD && + ! mdissyserror(&nm.mde, ENOENT)) { + break; + } + + /* + * If entry is not found for this node, + * then delete the drive. No need to + * continue through the node loop since + * our node has already been found. + */ + sidesok = FALSE; + mdclrerror(&nm.mde); + + /* If we are missing a sidename, cleanup */ + sr_del_drv(sr, dr->dr_selfid); + modified++; + + break; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sr->sr_nodes[i][0] == '\0') + continue; + + (void) memset(&nm, '\0', sizeof (nm)); + nm.setno = MD_LOCAL_SET; + nm.side = i + SKEW; + nm.key = dr->dr_key; + nm.devname = (uint64_t)device_name; + + if (metaioctl(MD_IOCGET_NM, &nm, &nm.mde, + NULL) != 0) { + if (! mdissyserror(&nm.mde, ENOENT)) { + mdclrerror(&nm.mde); + return; + } + } + + if (nm.key != MD_KEYWILD && + ! mdissyserror(&nm.mde, ENOENT)) + continue; + + sidesok = FALSE; + mdclrerror(&nm.mde); + + /* If we are missing a sidename, cleanup */ + sr_del_drv(sr, dr->dr_selfid); + modified++; + + break; + } + } + + if (sidesok == FALSE) + continue; + + /* + * If we got this far, the drive record is either in the OK + * or DEL state, if it is in the DEL state and the sidenames + * all checked out, then we will make it OK. 
+ */ + if ((dr->dr_flags & MD_DR_OK)) + continue; + + dr->dr_flags = MD_DR_OK; + + modified++; + } + + if (modified) { + commitset(sr, FALSE, &xep); + if (! mdisok(&xep)) + mdclrerror(&xep); + } +} + +static void +add_key_to_lst(key_lst_t **klpp, side_t side, mdkey_t key) +{ + key_lst_t *klp; + + assert(klpp != NULL); + + for (/* void */; *klpp != NULL; klpp = &(*klpp)->kl_next) + /* void */; + + /* allocate new list element */ + klp = *klpp = Zalloc(sizeof (*klp)); + + klp->kl_side = side; + klp->kl_key = key; +} + +#ifdef DUMPKEYLST +static void +pr_key_lst(char *tag, key_lst_t *klp) +{ + key_lst_t *tklp; + + md_eprintf("Tag=%s\n", tag); + for (tklp = klp; tklp != NULL; tklp = tklp->kl_next) + md_eprintf("side=%d, key=%lu\n", tklp->kl_side, tklp->kl_key); +} +#endif /* DUMPKEYLST */ + +static int +key_in_key_lst(key_lst_t *klp, side_t side, mdkey_t key) +{ + key_lst_t *tklp; + + for (tklp = klp; tklp != NULL; tklp = tklp->kl_next) + if (tklp->kl_side == side && tklp->kl_key == key) + return (1); + + return (0); +} + +static void +destroy_key_lst(key_lst_t **klpp) +{ + key_lst_t *tklp, *klp; + + assert(klpp != NULL); + + tklp = klp = *klpp; + while (klp != NULL) { + tklp = klp; + klp = klp->kl_next; + Free(tklp); + } + *klpp = NULL; +} + +static void +sr_sidenms(void) +{ + md_drive_record *dr; + md_set_record *sr; + key_lst_t *use = NULL; + mdnm_params_t nm; + int i; + md_mnset_record *mnsr; + md_mnnode_record *nr; + side_t myside = 0; + + /* + * We now go through the list of set and drive records collecting + * the key/side pairs that are being used. + */ + for (sr = setrecords; sr != NULL; sr = sr->sr_next) { + /* + * To handle the multi-node diskset case, get the sideno + * associated with this node. This sideno will be the + * same across all multi-node disksets. 
+ */ + if ((myside == 0) && (MD_MNSET_REC(sr))) { + mnsr = (struct md_mnset_record *)sr; + nr = mnsr->sr_nodechain; + while (nr) { + if (strcmp(mynode(), nr->nr_nodename) == 0) { + myside = nr->nr_nodeid; + break; + } + nr = nr->nr_next; + } + /* + * If this node is not in this MNset - + * then skip this set. + */ + if (!nr) { + continue; + } + } + + for (dr = sr->sr_drivechain; dr != NULL; dr = dr->dr_next) { + if (MD_MNSET_REC(sr)) { + /* + * There are no non-local sidenames in the + * local set for a multi-node diskset. + */ + add_key_to_lst(&use, myside, dr->dr_key); + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sr->sr_nodes[i][0] == '\0') + continue; + + add_key_to_lst(&use, i + SKEW, + dr->dr_key); + } + } + } + } + +#ifdef DUMPKEYLST + pr_key_lst("use", use); +#endif /* DUMPKEYLST */ + + /* + * We take the list above and get all non-local sidenames, checking + * each to see if they are in use, if they are not used, we delete them. + * Do the check for myside to cover multinode disksets. + * Then do the check for MD_MAXSIDES to cover non-multinode disksets. + * If any multi-node disksets were present, myside would be non-zero. + * myside is the same for all multi-node disksets for this node. + */ + if (myside) { + (void) memset(&nm, '\0', sizeof (nm)); + nm.setno = MD_LOCAL_SET; + nm.side = myside; + nm.key = MD_KEYWILD; + + /*CONSTCOND*/ + while (1) { + if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, + NULL) != 0) { + mdclrerror(&nm.mde); + break; + } + + if (nm.key == MD_KEYWILD) + break; + + if (! 
key_in_key_lst(use, nm.side, nm.key)) { + if (metaioctl(MD_IOCREM_NM, &nm, &nm.mde, + NULL) != 0) { + mdclrerror(&nm.mde); + continue; + } + } + } + } + /* Now handle the non-multinode disksets */ + for (i = 0; i < MD_MAXSIDES; i++) { + (void) memset(&nm, '\0', sizeof (nm)); + nm.setno = MD_LOCAL_SET; + nm.side = i + SKEW; + nm.key = MD_KEYWILD; + + /*CONSTCOND*/ + while (1) { + if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, + NULL) != 0) { + mdclrerror(&nm.mde); + break; + } + + if (nm.key == MD_KEYWILD) + break; + + if (! key_in_key_lst(use, nm.side, nm.key)) { + if (metaioctl(MD_IOCREM_NM, &nm, &nm.mde, + NULL) != 0) { + mdclrerror(&nm.mde); + continue; + } + } + } + } + + /* Cleanup */ + destroy_key_lst(&use); +} + +void +sr_validate(void) +{ + md_set_record *sr; + md_error_t xep = mdnullerror; + int mnset_single_node; + md_mnnode_record *nr; + md_mnset_record *mnsr; + + assert(setsnarfdone != 0); + + /* We have validated the records already */ + if (setsnarfdone == 3) + return; + + /* + * Check if we are in a single node non-SC3.x environmemnt + */ + mnset_single_node = meta_mn_singlenode(); + /* + * If a possible single_node situation, verify that all + * MN disksets have only one node (which is mynode()). + */ + if (mnset_single_node) { + for (sr = setrecords; sr != NULL; sr = sr->sr_next) { + if (MD_MNSET_REC(sr)) { + mnsr = (struct md_mnset_record *)sr; + nr = mnsr->sr_nodechain; + /* + * If next pointer is non-null (more than + * one node in list) or if the single node + * isn't my node - reset single node flag. + */ + if ((nr->nr_next) || + (strcmp(nr->nr_nodename, mynode()) != 0)) { + mnset_single_node = 0; + break; + } + } + } + } + + for (sr = setrecords; sr != NULL; sr = sr->sr_next) { + /* + * If a MN diskset and not in the single node + * situation, then don't validate the MN set. + * This is done during a reconfig cycle since all + * nodes must take the same action. 
+ */ + if (MD_MNSET_REC(sr) && (mnset_single_node == 0)) + continue; + + /* Since we do "partial" snarf's, we only check new entries */ + if (! (sr->sr_flags & MD_SR_CHECK)) + continue; + + /* If we were mid-add, cleanup */ + if ((sr->sr_flags & MD_SR_ADD)) { + s_delset(sr->sr_setname, &xep); + if (! mdisok(&xep)) + mdclrerror(&xep); + continue; + } + + /* Make sure we are in the set. */ + if (sr_hosts(sr)) + continue; + + /* Check has been done, clear the flag */ + if ((sr->sr_flags & MD_SR_CHECK)) + sr->sr_flags &= ~MD_SR_CHECK; + + /* + * If we got here, we are in the set, make sure the flags make + * sense. + */ + if (! (sr->sr_flags & MD_SR_OK)) { + sr->sr_flags &= ~MD_SR_STATE_FLAGS; + sr->sr_flags |= MD_SR_OK; + commitset(sr, FALSE, &xep); + if (! mdisok(&xep)) + mdclrerror(&xep); + } + + /* Make sure all the drives are in a stable state. */ + sr_drvs(sr); + } + + /* Cleanup any stray sidenames */ + sr_sidenms(); + + setsnarfdone = 3; +} + +static md_set_record * +sr_in_cache(mddb_recid_t recid) +{ + md_set_record *tsr; + + for (tsr = setrecords; tsr != NULL; tsr = tsr->sr_next) + if (tsr->sr_selfid == recid) + return (tsr); + return ((md_set_record *)NULL); +} + +int +set_snarf(md_error_t *ep) +{ + md_set_record *sr; + md_mnset_record *mnsr; + md_set_record *tsr; + md_drive_record *dr; + mddb_userreq_t *reqp; + ur_recid_lst_t *urlp; + mddb_recid_t id; + mddb_recid_t *p; + md_error_t xep = mdnullerror; + md_mnnode_record *nr; + mddb_set_node_params_t snp; + int nodecnt; + mndiskset_membershiplist_t *nl, *nl2; + + /* We have done the snarf call */ + if (setsnarfdone != 0) + return (0); + + if (meta_setup_db_locations(ep) != 0) { + if (! mdismddberror(ep, MDE_DB_STALE)) + return (-1); + mdclrerror(ep); + } + + /* + * Get membershiplist from API routine. + * If there's an error, just use a NULL + * nodelist. 
+ */ + if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) { + nodecnt = 0; /* no nodes are alive */ + nl = NULL; + mdclrerror(ep); + } + + /* Let sr_cache_add and dr_cache_add know we are doing the snarf */ + setsnarfdone = 1; + + /* Go get the set records */ + id = 0; + while ((sr = get_ur_rec(MD_LOCAL_SET, MD_UR_GET_NEXT, MDDB_UR_SR, + &id, ep)) != NULL) { + sr->sr_next = NULL; + sr->sr_drivechain = NULL; + + /* + * Cluster nodename support + * Convert nodeid -> nodename + * Don't do this for MN disksets since we've already stored + * both the nodeid and name. + */ + if (!(MD_MNSET_REC(sr))) + sdssc_cm_sr_nid2nm(sr); + + /* If we were mid-cvt, cleanup */ + if (sr->sr_flags & MD_SR_CVT) { + /* If the daemon is calling, cleanup */ + if (md_in_daemon) + url_addl(&url_tode, sr->sr_selfid); + continue; + } + + if (md_in_daemon) + url_addl(&url_used, sr->sr_selfid); + + /* Skip cached records */ + tsr = sr_in_cache(sr->sr_selfid); + if (tsr != (md_set_record *)NULL) { + if (MD_MNSET_REC(sr)) { + mnsr = (struct md_mnset_record *)sr; + Free(mnsr); + } else { + Free(sr); + } + if (md_in_daemon) + for (dr = tsr->sr_drivechain; + dr != (md_drive_record *)NULL; + dr = dr->dr_next) + url_addl(&url_used, dr->dr_selfid); + continue; + } + + /* Mark the record as one to be checked */ + sr->sr_flags |= MD_SR_CHECK; + + sr_cache_add(sr); + + /* If MNdiskset, go get the node records */ + if (MD_MNSET_REC(sr)) { + mnsr = (struct md_mnset_record *)sr; + mnsr->sr_nodechain = NULL; + p = &mnsr->sr_noderec; + while ((nr = get_ur_rec(MD_LOCAL_SET, MD_UR_GET_WKEY, + MDDB_UR_NR, p, ep)) != NULL) { + nr->nr_next = NULL; + + if (md_in_daemon) + url_addl(&url_used, nr->nr_selfid); + + /* + * Turn off ALIVE node flag based on member + * list. + * If ALIVE flag is not set, reset OWN flag. + * If this node is mynode, set the OWN flag + * to match the ownership of the diskset. 
+ */ + if (md_in_daemon) { + nr->nr_flags &= ~MD_MN_NODE_ALIVE; + nl2 = nl; + while (nl2) { + /* + * If in member list, + * set alive. + */ + if (nl2->msl_node_id == + nr->nr_nodeid) { + nr->nr_flags |= + MD_MN_NODE_ALIVE; + break; + } + nl2 = nl2->next; + } + /* + * If mynode is in member list, then + * check to see if set is snarfed. + * If set snarfed, set own flag; + * otherwise reset it. + * Don't change master even if + * node isn't an owner node, since + * node may be master, but hasn't + * joined the set yet. + */ + if (nr->nr_flags & MD_MN_NODE_ALIVE) { + if (strcmp(nr->nr_nodename, + mynode()) == 0) { + if (s_ownset( + mnsr->sr_setno, ep)) { + nr->nr_flags |= + MD_MN_NODE_OWN; + } else { + nr->nr_flags &= + ~MD_MN_NODE_OWN; + } + } + } else { + if (strcmp(nr->nr_nodename, + mynode()) == 0) { + /* + * If my node isn't in member + * list then reset master. + */ + mnsr = (struct + md_mnset_record *)sr; + mnsr->sr_master_nodeid = + MD_MN_INVALID_NID; + mnsr->sr_master_nodenm[0] = + '\0'; + } + nr->nr_flags &= ~MD_MN_NODE_OWN; + } + } + + /* + * Must grab nr_nextrec now since + * mnnr_cache_add may change it + * (mnnr_cache_add is storing the nodes in + * an ascending nodeid order list in order + * to support reconfig). + */ + if (nr->nr_nextrec != 0) + p = &nr->nr_nextrec; + else + p = NULL; + + mnnr_cache_add((struct md_mnset_record *)sr, + nr); + + if ((md_in_daemon) && + (strcmp(nr->nr_nodename, mynode()) == 0)) { + (void) memset(&snp, 0, sizeof (snp)); + snp.sn_nodeid = nr->nr_nodeid; + snp.sn_setno = mnsr->sr_setno; + if (metaioctl(MD_MN_SET_NODEID, &snp, + &snp.sn_mde, NULL) != 0) { + (void) mdstealerror(ep, + &snp.sn_mde); + } + } + + if (p == NULL) + break; + } + if (! mdisok(ep)) { + if (! 
mdissyserror(ep, ENOENT)) + goto out; + mdclrerror(ep); + } + } + + if (sr->sr_driverec == 0) + continue; + + /* Go get the drive records */ + p = &sr->sr_driverec; + while ((dr = get_ur_rec(MD_LOCAL_SET, MD_UR_GET_WKEY, + MDDB_UR_DR, p, ep)) != NULL) { + dr->dr_next = NULL; + + if (md_in_daemon) + url_addl(&url_used, dr->dr_selfid); + + dr_cache_add(sr, dr); + + if (dr->dr_nextrec == 0) + break; + + p = &dr->dr_nextrec; + } + if (! mdisok(ep)) { + if (! mdissyserror(ep, ENOENT)) + goto out; + mdclrerror(ep); + /* + * If dr_nextrec was not valid, or we had some + * problem getting the record, we end up here. + * get_ur_rec() zeroes the recid we passed in, + * if we had a failure getting a record using a key, + * so we simply commit the set record and valid + * drive records, if this fails, we hand an error + * back to the caller. + */ + commitset(sr, FALSE, ep); + if (! mdisok(ep)) + goto out; + } + } + if (! mdisok(ep)) { + if (! mdissyserror(ep, ENOENT)) + goto out; + mdclrerror(ep); + } + + /* + * If the daemon called, go through the USER records and cleanup + * any that are not used by valid sets. + */ + if (md_in_daemon) { + id = 0; + /* Make a list of records to delete */ + while ((reqp = get_db_rec(MD_UR_GET_NEXT, MD_LOCAL_SET, + MDDB_USER, 0, &id, ep)) != NULL) { + if (reqp->ur_type2 != MDDB_UR_SR && + reqp->ur_type2 != MDDB_UR_DR) { + Free((void *)reqp->ur_data); + Free(reqp); + continue; + } + if (! url_findl(url_used, reqp->ur_recid)) + url_addl(&url_tode, reqp->ur_recid); + Free((void *)reqp->ur_data); + Free(reqp); + } + if (! mdisok(ep)) { + if (! mdissyserror(ep, ENOENT)) + goto out; + mdclrerror(ep); + } + + /* Delete all the delete listed records */ + for (urlp = url_tode; urlp != NULL; urlp = urlp->url_nx) { + s_delrec(urlp->url_recid, &xep); + if (! 
mdisok(&xep)) + mdclrerror(&xep); + } + } + + url_freel(&url_used); + url_freel(&url_tode); + + if (nodecnt) + meta_free_nodelist(nl); + + /* Mark the snarf complete */ + setsnarfdone = 2; + return (0); + +out: + url_freel(&url_used); + url_freel(&url_tode); + + sr_cache_flush(1); + + if (nodecnt) + meta_free_nodelist(nl); + + /* Snarf failed, reset state */ + setsnarfdone = 0; + + return (-1); +} + +void +sr_cache_add(md_set_record *sr) +{ + md_set_record *tsr; + + assert(setsnarfdone != 0); + + if (setrecords == NULL) { + setrecords = sr; + return; + } + + for (tsr = setrecords; tsr->sr_next != NULL; tsr = tsr->sr_next) + /* void */; + tsr->sr_next = sr; +} + +void +sr_cache_del(mddb_recid_t recid) +{ + md_set_record *sr, *tsr; + md_mnset_record *mnsr; + + assert(setsnarfdone != 0); + + for (sr = tsr = setrecords; sr != NULL; tsr = sr, sr = sr->sr_next) { + if (sr->sr_selfid != recid) + continue; + if (sr == setrecords) + setrecords = sr->sr_next; + else + tsr->sr_next = sr->sr_next; + if (MD_MNSET_REC(sr)) { + mnsr = (struct md_mnset_record *)sr; + Free(mnsr); + } else { + Free(sr); + } + break; + } + if (setrecords == NULL) + setsnarfdone = 0; +} + +void +dr_cache_add(md_set_record *sr, md_drive_record *dr) +{ + md_drive_record *tdr; + + assert(setsnarfdone != 0); + + assert(sr != NULL); + + if (sr->sr_drivechain == NULL) { + sr->sr_drivechain = dr; + sr->sr_driverec = dr->dr_selfid; + return; + } + + for (tdr = sr->sr_drivechain; tdr->dr_next != NULL; tdr = tdr->dr_next) + /* void */; + + tdr->dr_next = dr; + tdr->dr_nextrec = dr->dr_selfid; +} + +void +dr_cache_del(md_set_record *sr, mddb_recid_t recid) +{ + md_drive_record *dr; + md_drive_record *tdr; + + assert(setsnarfdone != 0); + + assert(sr != NULL); + + for (dr = tdr = sr->sr_drivechain; dr != NULL; + tdr = dr, dr = dr->dr_next) { + if (dr->dr_selfid != recid) + continue; + + if (dr == sr->sr_drivechain) { + sr->sr_drivechain = dr->dr_next; + sr->sr_driverec = dr->dr_nextrec; + } else { + tdr->dr_next 
= dr->dr_next; + tdr->dr_nextrec = dr->dr_nextrec; + } + Free(dr); + break; + } +} + +/* + * Nodes must be kept in ascending node id order in order to + * support reconfig. + * + * This routine may change nr->nr_next and nr->nr_nextrec. + */ +void +mnnr_cache_add(md_mnset_record *mnsr, md_mnnode_record *nr) +{ + md_mnnode_record *tnr, *tnr_prev; + + assert(mnsr != NULL); + + if (mnsr->sr_nodechain == NULL) { + mnsr->sr_nodechain = nr; + mnsr->sr_noderec = nr->nr_selfid; + return; + } + + /* + * If new_record->nodeid < first_record->nodeid, + * put new_record at beginning of list. + */ + if (nr->nr_nodeid < mnsr->sr_nodechain->nr_nodeid) { + nr->nr_next = mnsr->sr_nodechain; + nr->nr_nextrec = mnsr->sr_noderec; + mnsr->sr_nodechain = nr; + mnsr->sr_noderec = nr->nr_selfid; + return; + } + + /* + * Walk list looking for place to insert record. + */ + + tnr_prev = mnsr->sr_nodechain; + tnr = tnr_prev->nr_next; + while (tnr) { + /* Insert new record between tnr_prev and tnr */ + if (nr->nr_nodeid < tnr->nr_nodeid) { + nr->nr_next = tnr; + nr->nr_nextrec = tnr->nr_selfid; /* tnr's recid */ + tnr_prev->nr_next = nr; + tnr_prev->nr_nextrec = nr->nr_selfid; + return; + } + tnr_prev = tnr; + tnr = tnr->nr_next; + } + + /* + * Add record to end of list. 
+ */ + tnr_prev->nr_next = nr; + tnr_prev->nr_nextrec = nr->nr_selfid; +} + +void +mnnr_cache_del(md_mnset_record *mnsr, mddb_recid_t recid) +{ + md_mnnode_record *nr; + md_mnnode_record *tnr; + + assert(mnsr != NULL); + + tnr = 0; + nr = mnsr->sr_nodechain; + while (nr) { + if (nr->nr_selfid != recid) { + tnr = nr; + nr = nr->nr_next; + continue; + } + + if (nr == mnsr->sr_nodechain) { + mnsr->sr_nodechain = nr->nr_next; + mnsr->sr_noderec = nr->nr_nextrec; + } else { + tnr->nr_next = nr->nr_next; + tnr->nr_nextrec = nr->nr_nextrec; + } + Free(nr); + break; + } +} + +int +metad_isautotakebyname(char *setname) +{ + md_error_t error = mdnullerror; + md_set_record *sr; + + if (md_in_daemon) + assert(setsnarfdone != 0); + else if (set_snarf(&error)) { + mdclrerror(&error); + return (0); + } + + for (sr = setrecords; sr != NULL; sr = sr->sr_next) { + if (strcmp(setname, sr->sr_setname) == 0) { + if (sr->sr_flags & MD_SR_AUTO_TAKE) + return (1); + return (0); + } + } + + return (0); +} + +int +metad_isautotakebynum(set_t setno) +{ + md_error_t error = mdnullerror; + md_set_record *sr; + + if (md_in_daemon) + assert(setsnarfdone != 0); + else if (set_snarf(&error)) { + mdclrerror(&error); + return (0); + } + + for (sr = setrecords; sr != NULL; sr = sr->sr_next) { + if (setno == sr->sr_setno) { + if (sr->sr_flags & MD_SR_AUTO_TAKE) + return (1); + return (0); + } + } + + return (0); +} + +md_set_record * +metad_getsetbyname(char *setname, md_error_t *ep) +{ + md_set_record *sr; + char buf[100]; + + assert(setsnarfdone != 0); + + for (sr = setrecords; sr != NULL; sr = sr->sr_next) + if (strcmp(setname, sr->sr_setname) == 0) + return (sr); + + (void) snprintf(buf, sizeof (buf), "setname \"%s\"", setname); + (void) mderror(ep, MDE_NO_SET, buf); + return (NULL); +} + +md_set_record * +metad_getsetbynum(set_t setno, md_error_t *ep) +{ + md_set_record *sr; + char buf[100]; + + if (md_in_daemon) + assert(setsnarfdone != 0); + else if (set_snarf(ep)) /* BYPASS DAEMON mode */ + 
return (NULL); + + for (sr = setrecords; sr != NULL; sr = sr->sr_next) + if (setno == sr->sr_setno) + return (sr); + + (void) sprintf(buf, "setno %u", setno); + (void) mderror(ep, MDE_NO_SET, buf); + return (NULL); +} + + +/* + * Commit the set record and all of its associated records + * (drive records, node records for a MNset) to the local mddb. + */ +void +commitset(md_set_record *sr, int inc_genid, md_error_t *ep) +{ + int drc, nrc, rc; + int *recs; + uint_t size; + md_drive_record *dr; + mddb_userreq_t req; + md_mnset_record *mnsr; + md_mnnode_record *nr; + + assert(setsnarfdone != 0); + + /* + * Cluster nodename support + * Convert nodename -> nodeid + * Don't do this for MN disksets since we've already stored + * both the nodeid and name. + */ + if (!(MD_MNSET_REC(sr))) + sdssc_cm_sr_nm2nid(sr); + + /* Send down to kernel the data in mddb USER set record */ + if (inc_genid) + sr->sr_genid++; + (void) memset(&req, 0, sizeof (req)); + METAD_SETUP_SR(MD_DB_SETDATA, sr->sr_selfid) + if (MD_MNSET_REC(sr)) { + req.ur_size = sizeof (*mnsr); + } else { + req.ur_size = sizeof (*sr); + } + req.ur_data = (uintptr_t)sr; + if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, NULL) != 0) { + (void) mdstealerror(ep, &req.ur_mde); + return; + } + + /* + * Walk through the drive records associated with this set record + * and send down to kernel the data in mddb USER drive record. + */ + drc = 0; + dr = sr->sr_drivechain; + while (dr) { + if (inc_genid) + dr->dr_genid++; + METAD_SETUP_DR(MD_DB_SETDATA, dr->dr_selfid) + req.ur_size = sizeof (*dr); + req.ur_data = (uintptr_t)dr; + if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, NULL) != 0) { + (void) mdstealerror(ep, &req.ur_mde); + return; + } + drc++; + dr = dr->dr_next; + } + + + /* + * If this set is a multi-node set - + * walk through the node records associated with this set record + * and send down to kernel the data in mddb USER node record. 
+ */ + nrc = 0; + if (MD_MNSET_REC(sr)) { + mnsr = (struct md_mnset_record *)sr; + nr = mnsr->sr_nodechain; + while (nr) { + if (inc_genid) + nr->nr_genid++; + METAD_SETUP_NR(MD_DB_SETDATA, nr->nr_selfid) + req.ur_size = sizeof (*nr); + req.ur_data = (uint64_t)nr; + if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, NULL) + != 0) { + (void) mdstealerror(ep, &req.ur_mde); + return; + } + nrc++; + nr = nr->nr_next; + } + } + + /* + * Set up list of mddb USER recids containing set and drive records + * and node records if a MNset. + */ + rc = 0; + size = (nrc + drc + 2) * sizeof (int); + recs = Zalloc(size); + /* First recid in list is the set record's id */ + recs[rc] = sr->sr_selfid; + rc++; + dr = sr->sr_drivechain; + while (dr) { + /* Now, fill in the drive record ids */ + recs[rc] = dr->dr_selfid; + dr = dr->dr_next; + rc++; + } + if (MD_MNSET_REC(sr)) { + nr = mnsr->sr_nodechain; + while (nr) { + /* If a MNset, fill in the node record ids */ + recs[rc] = nr->nr_selfid; + nr = nr->nr_next; + rc++; + } + } + /* Set last record to null recid */ + recs[rc] = 0; + + /* Write out the set and drive and node records to the local mddb */ + METAD_SETUP_UR(MD_DB_COMMIT_MANY, 0, 0); + req.ur_size = size; + req.ur_data = (uintptr_t)recs; + if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, NULL) != 0) { + (void) mdstealerror(ep, &req.ur_mde); + return; + } + + /* + * Cluster nodename support + * Convert nodeid -> nodename + * Don't do this for MN disksets since we've already stored + * both the nodeid and name. + */ + if (!(MD_MNSET_REC(sr))) + sdssc_cm_sr_nid2nm(sr); + + Free(recs); +} + +/* + * This routine only handles returns a md_set_record structure even + * if the set record describes a MN set. This will allow pre-MN + * SVM RPC code to access a MN set record and to display it. + * + * The MN SVM RPC code detects if the set record returned describes + * a MN set and then will copy it using mnsetdup. 
+ */ +md_set_record * +setdup(md_set_record *sr) +{ + md_set_record *tsr = NULL; + md_drive_record **tdrpp = NULL; + + if (sr && (tsr = Malloc(sizeof (*sr))) != NULL) { + (void) memmove(tsr, sr, sizeof (*sr)); + tsr->sr_next = NULL; + tdrpp = &tsr->sr_drivechain; + while (*tdrpp) { + *tdrpp = drdup(*tdrpp); + tdrpp = &(*tdrpp)->dr_next; + } + } + return (tsr); +} + +/* + * This routine only copies MN set records. If a non-MN set + * record was passed in NULL pointer will be returned. + */ +md_mnset_record * +mnsetdup(md_mnset_record *mnsr) +{ + md_mnset_record *tmnsr = NULL; + md_drive_record **tdrpp = NULL; + md_mnnode_record **tnrpp = NULL; + + if (!MD_MNSET_REC(mnsr)) { + return (NULL); + } + + if (mnsr && (tmnsr = Malloc(sizeof (*mnsr))) != NULL) { + (void) memmove(tmnsr, mnsr, sizeof (*mnsr)); + tmnsr->sr_next = NULL; + tdrpp = &tmnsr->sr_drivechain; + while (*tdrpp) { + *tdrpp = drdup(*tdrpp); + tdrpp = &(*tdrpp)->dr_next; + } + tnrpp = &tmnsr->sr_nodechain; + while (*tnrpp) { + *tnrpp = nrdup(*tnrpp); + tnrpp = &(*tnrpp)->nr_next; + } + } + return (tmnsr); +} + +md_drive_record * +drdup(md_drive_record *dr) +{ + md_drive_record *tdr = NULL; + + if (dr && (tdr = Malloc(sizeof (*dr))) != NULL) + (void) memmove(tdr, dr, sizeof (*dr)); + return (tdr); +} + +md_mnnode_record * +nrdup(md_mnnode_record *nr) +{ + md_mnnode_record *tnr = NULL; + + if (nr && (tnr = Malloc(sizeof (*nr))) != NULL) + (void) memmove(tnr, nr, sizeof (*nr)); + return (tnr); +} + +/* + * Duplicate parts of the drive decriptor list for this node. + * Only duplicate the drive name string in the mddrivename structure, don't + * need to copy any other pointers since only interested in the flags and + * the drive name (i.e. other pointers will be set to NULL). + * Returns NULL if failure due to Malloc failure. + * Returns pointer (non-NULL) to dup'd list if successful. 
+ */ +md_drive_desc * +dd_list_dup(md_drive_desc *dd) +{ + md_drive_desc *orig_dd; + md_drive_desc *copy_dd = NULL, *copy_dd_prev = NULL; + md_drive_desc *copy_dd_head = NULL; + mddrivename_t *copy_dnp; + char *copy_cname; + char *copy_devid; + + if (dd == NULL) + return (NULL); + + orig_dd = dd; + + while (orig_dd) { + copy_dd = Zalloc(sizeof (*copy_dd)); + copy_dnp = Zalloc(sizeof (mddrivename_t)); + copy_cname = Zalloc(sizeof (orig_dd->dd_dnp->cname)); + if (orig_dd->dd_dnp->devid) { + copy_devid = Zalloc(sizeof (orig_dd->dd_dnp->devid)); + } else { + copy_devid = NULL; + } + copy_dd->dd_next = NULL; + if ((copy_dd == NULL) || (copy_dnp == NULL) || + (copy_cname == NULL)) { + while (copy_dd_head) { + copy_dd = copy_dd_head->dd_next; + Free(copy_dd_head); + copy_dd_head = copy_dd; + } + if (copy_dnp) + Free(copy_dnp); + if (copy_dd) + Free(copy_dd); + if (copy_cname) + Free(copy_cname); + if (copy_devid) + Free(copy_devid); + return (NULL); + } + (void) memmove(copy_dd, orig_dd, sizeof (*orig_dd)); + (void) strlcpy(copy_cname, orig_dd->dd_dnp->cname, + sizeof (orig_dd->dd_dnp->cname)); + copy_dd->dd_next = NULL; + copy_dd->dd_dnp = copy_dnp; + copy_dd->dd_dnp->cname = copy_cname; + if (copy_devid) { + (void) strlcpy(copy_devid, orig_dd->dd_dnp->devid, + sizeof (orig_dd->dd_dnp->devid)); + } + + if (copy_dd_prev == NULL) { + copy_dd_head = copy_dd; + copy_dd_prev = copy_dd; + } else { + copy_dd_prev->dd_next = copy_dd; + copy_dd_prev = copy_dd; + } + orig_dd = orig_dd->dd_next; + } + copy_dd->dd_next = NULL; + return (copy_dd_head); +} + +void +sr_cache_flush(int flushnames) +{ + md_set_record *sr, *tsr; + md_mnset_record *mnsr; + md_drive_record *dr, *tdr; + md_mnnode_record *nr, *tnr; + + sr = tsr = setrecords; + while (sr != NULL) { + dr = tdr = sr->sr_drivechain; + while (dr != NULL) { + tdr = dr; + dr = dr->dr_next; + Free(tdr); + } + tsr = sr; + sr = sr->sr_next; + if (MD_MNSET_REC(tsr)) { + mnsr = (struct md_mnset_record *)tsr; + nr = tnr = 
mnsr->sr_nodechain; + while (nr != NULL) { + tnr = nr; + nr = nr->nr_next; + Free(tnr); + } + Free(mnsr); + } else { + Free(tsr); + } + } + + setrecords = NULL; + + setsnarfdone = 0; + + /* This will cause the other caches to be cleared */ + if (flushnames) + metaflushnames(0); +} + +void +sr_cache_flush_setno(set_t setno) +{ + md_set_record *sr, *tsr; + md_mnset_record *mnsr; + md_drive_record *dr, *tdr; + + assert(setsnarfdone != 0); + + for (sr = tsr = setrecords; sr; tsr = sr, sr = sr->sr_next) { + if (sr->sr_setno != setno) + continue; + + dr = tdr = sr->sr_drivechain; + while (dr != NULL) { + tdr = dr; + dr = dr->dr_next; + Free(tdr); + } + if (sr == setrecords) + setrecords = sr->sr_next; + else + tsr->sr_next = sr->sr_next; + if (MD_MNSET_REC(sr)) { + mnsr = (struct md_mnset_record *)sr; + Free(mnsr); + } else { + Free(sr); + } + break; + } + + setsnarfdone = 0; + + /* This will cause the other caches to be cleared */ + metaflushnames(0); +} + +int +s_ownset(set_t setno, md_error_t *ep) +{ + mddb_ownset_t ownset_arg; + + ownset_arg.setno = setno; + ownset_arg.owns_set = MD_SETOWNER_NONE; + + if (metaioctl(MD_DB_OWNSET, &ownset_arg, ep, NULL) != 0) + return (0); + + return (ownset_arg.owns_set); +} + +void +s_delset(char *setname, md_error_t *ep) +{ + md_set_record *sr; + md_set_record *tsr; + md_drive_record *dr; + md_drive_record *tdr; + md_mnnode_record *nr, *tnr; + mddb_userreq_t req; + char stringbuf[100]; + int i; + mdsetname_t *sp = NULL; + mddrivename_t *dn = NULL; + mdname_t *np = NULL; + md_dev64_t dev; + side_t myside = MD_SIDEWILD; + md_error_t xep = mdnullerror; + md_mnset_record *mnsr; + int num_sets = 0; + int num_mn_sets = 0; + + (void) memset(&req, 0, sizeof (mddb_userreq_t)); + + if ((sr = getsetbyname(setname, ep)) == NULL) + return; + + sp = metasetnosetname(sr->sr_setno, &xep); + mdclrerror(&xep); + + if (MD_MNSET_REC(sr)) { + /* + * If this node is a set owner, halt the set before + * deleting the set records. 
Ignore any errors since + * s_ownset and halt_set could fail if panic had occurred + * during the add/delete of a node. + */ + if (s_ownset(sr->sr_setno, &xep)) { + mdclrerror(&xep); + if (halt_set(sp, &xep)) + mdclrerror(&xep); + } + } + + (void) snprintf(stringbuf, sizeof (stringbuf), "/dev/md/%s", setname); + (void) unlink(stringbuf); + (void) unlink(meta_lock_name(sr->sr_setno)); + + if (MD_MNSET_REC(sr)) { + mnsr = (struct md_mnset_record *)sr; + nr = mnsr->sr_nodechain; + while (nr) { + /* Setting myside for later use */ + if (strcmp(mynode(), nr->nr_nodename) == 0) + myside = nr->nr_nodeid; + + (void) memset(&req, 0, sizeof (req)); + METAD_SETUP_NR(MD_DB_DELETE, nr->nr_selfid) + if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, + NULL) != 0) { + (void) mdstealerror(ep, &req.ur_mde); + free_sr(sr); + return; + } + tnr = nr; + nr = nr->nr_next; + + SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE, SVM_TAG_HOST, + sr->sr_setno, tnr->nr_nodeid); + + mnnr_cache_del((struct md_mnset_record *)sr, + tnr->nr_selfid); + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sr->sr_nodes[i][0] == '\0') + continue; + + if (strcmp(mynode(), sr->sr_nodes[i]) == 0) + myside = i; + + SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE, SVM_TAG_HOST, + sr->sr_setno, i); + } + } + + dr = sr->sr_drivechain; + while (dr) { + (void) memset(&req, 0, sizeof (req)); + METAD_SETUP_DR(MD_DB_DELETE, dr->dr_selfid) + if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, NULL) != 0) { + (void) mdstealerror(ep, &req.ur_mde); + free_sr(sr); + return; + } + tdr = dr; + dr = dr->dr_next; + + dev = NODEV64; + if (myside != MD_SIDEWILD && sp != NULL) { + dn = metadrivename_withdrkey(sp, myside, + tdr->dr_key, MD_BASICNAME_OK, &xep); + if (dn != NULL) { + uint_t rep_slice; + + np = NULL; + if (meta_replicaslice(dn, &rep_slice, + &xep) == 0) { + np = metaslicename(dn, rep_slice, &xep); + } + + if (np != NULL) + dev = np->dev; + else + mdclrerror(&xep); + } else + mdclrerror(&xep); + } else + 
mdclrerror(&xep); + + SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE, SVM_TAG_DRIVE, + sr->sr_setno, dev); + SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_ADD, SVM_TAG_DRIVE, + MD_LOCAL_SET, dev); + + dr_cache_del(sr, tdr->dr_selfid); + + } + + (void) memset(&req, 0, sizeof (req)); + METAD_SETUP_SR(MD_DB_DELETE, sr->sr_selfid) + if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, NULL) != 0) { + (void) mdstealerror(ep, &req.ur_mde); + free_sr(sr); + return; + } + + SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_SET, sr->sr_setno, + NODEV64); + + for (tsr = setrecords; tsr; tsr = tsr->sr_next) { + if (tsr == sr) + continue; + + num_sets++; + if (MD_MNSET_REC(tsr)) + num_mn_sets++; + } + + if (num_mn_sets == 0) + (void) meta_smf_disable(META_SMF_MN_DISKSET, &xep); + + /* The set we just deleted is the only one left */ + if (num_sets == 0) + (void) meta_smf_disable(META_SMF_DISKSET, &xep); + + sr_cache_del(sr->sr_selfid); + free_sr(sr); + +} + +void +s_delrec(mddb_recid_t recid, md_error_t *ep) +{ + mddb_userreq_t req; + + (void) memset(&req, 0, sizeof (req)); + + METAD_SETUP_SR(MD_DB_DELETE, recid) + + if (metaioctl(MD_DB_USERREQ, &req, &req.ur_mde, NULL) != 0) + (void) mdstealerror(ep, &req.ur_mde); +} + +/* + * resnarf the imported set + */ +int +resnarf_set( + set_t setno, + md_error_t *ep +) +{ + md_set_record *sr; + md_drive_record *dr; + mddb_recid_t id, *p; + + if (meta_setup_db_locations(ep) != 0) { + if (! 
mdismddberror(ep, MDE_DB_STALE)) + return (-1); + mdclrerror(ep); + } + + setsnarfdone = 1; + + id = 0; + while ((sr = get_ur_rec(MD_LOCAL_SET, MD_UR_GET_NEXT, MDDB_UR_SR, &id, + ep)) != NULL) { + + if (sr->sr_setno != setno) + continue; + + /* Don't allow resnarf of a multi-node diskset */ + if (MD_MNSET_REC(sr)) + goto out; + + sr->sr_next = NULL; + sr->sr_drivechain = NULL; + + if (md_in_daemon) + url_addl(&url_used, sr->sr_selfid); + + sr->sr_flags |= MD_SR_CHECK; + + sr_cache_add(sr); + + if (sr->sr_driverec == 0) + break; + + p = &sr->sr_driverec; + while ((dr = get_ur_rec(MD_LOCAL_SET, MD_UR_GET_WKEY, + MDDB_UR_DR, p, ep)) != NULL) { + dr->dr_next = NULL; + + if (md_in_daemon) + url_addl(&url_used, dr->dr_selfid); + + dr_cache_add(sr, dr); + + if (dr->dr_nextrec == 0) + break; + + p = &dr->dr_nextrec; + } + if (! mdisok(ep)) { + if (! mdissyserror(ep, ENOENT)) + goto out; + mdclrerror(ep); + commitset(sr, FALSE, ep); + if (! mdisok(ep)) + goto out; + } + } + if (! mdisok(ep)) { + if (! mdissyserror(ep, ENOENT)) + goto out; + mdclrerror(ep); + } + + setsnarfdone = 2; + + url_freel(&url_used); + url_freel(&url_tode); + return (0); + +out: + url_freel(&url_used); + url_freel(&url_tode); + + sr_cache_flush(1); + + setsnarfdone = 0; + + return (-1); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_mh.c b/usr/src/lib/lvm/libmeta/common/meta_mh.c new file mode 100644 index 0000000000..ba0ce10656 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_mh.c @@ -0,0 +1,842 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. + */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +/* + * MH ioctl functions + */ + +#include <meta.h> +#include <metamhd.h> +#include <string.h> + +#include "meta_runtime.h" + +#define DEFAULTDEV "/dev/rdsk" +/* + * default timeout values + */ +mhd_mhiargs_t defmhiargs = { + 1000, /* failfast */ + { 6000, 6000, 30000 } /* take ownership */ +}; + +/* RPC timeouts */ +static md_timeval32_t tk_own_timeout = { 24 * 60 * 60, 0 }; /* 1 day */ +static md_timeval32_t rel_own_timeout = { 24 * 60 * 60, 0 }; /* 1 day */ + +/* + * RPC handle + */ +typedef struct { + char *hostname; + CLIENT *clientp; +} mhd_handle_t; + +/* + * close RPC connection + */ +static void +close_metamhd( + mhd_handle_t *hp +) +{ + assert(hp != NULL); + if (hp->hostname != NULL) { + Free(hp->hostname); + } + if (hp->clientp != NULL) { + auth_destroy(hp->clientp->cl_auth); + clnt_destroy(hp->clientp); + } + Free(hp); +} + +/* + * open RPC connection to rpc.metamhd + */ +static mhd_handle_t * +open_metamhd( + char *hostname, + md_error_t *ep +) +{ + CLIENT *clientp; + mhd_handle_t *hp; + + /* default to local host */ + if ((hostname == NULL) || (*hostname == '\0')) + hostname = mynode(); + + /* open RPC connection */ + assert(hostname != NULL); + if ((clientp = meta_client_create(hostname, METAMHD, METAMHD_VERSION, + "tcp")) == NULL) { + 
clnt_pcreateerror(hostname); + (void) mdrpccreateerror(ep, hostname, "metamhd clnt_create"); + return (NULL); + } else { + auth_destroy(clientp->cl_auth); + clientp->cl_auth = authsys_create_default(); + assert(clientp->cl_auth != NULL); + } + + /* return connection */ + hp = Zalloc(sizeof (*hp)); + hp->hostname = Strdup(hostname); + hp->clientp = clientp; + return (hp); +} + +/* + * steal and convert mherror_t + */ +int +mhstealerror( + mhd_error_t *mhep, + md_error_t *ep +) +{ + int rval = -1; + + /* no error */ + if (mhep->errnum == 0) { + /* assert(mhep->name == NULL); */ + rval = 0; + goto out; + } + + /* steal error */ + switch (mhep->errnum) { + case MHD_E_MAJORITY: + (void) mderror(ep, MDE_TAKE_OWN, mhep->name); + break; + case MHD_E_RESERVED: + (void) mderror(ep, MDE_RESERVED, mhep->name); + break; + default: + (void) mdsyserror(ep, mhep->errnum, mhep->name); + break; + } + + /* cleanup, return success */ +out: + if (mhep->name != NULL) + Free(mhep->name); + (void) memset(mhep, 0, sizeof (*mhep)); + return (rval); +} + +/* + * should we do MHIOCTLs ? + */ +static int +do_mhioctl() +{ + if (getenv("MD_NOMHIOCTL") != NULL) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "NOT doing MH ioctls\n")); + (void) fflush(stderr); + return (0); + } + return (1); +} + +/* + * take ownership of drives + */ +int +meta_take_own( + char *sname, + mddrivenamelist_t *dnlp, + mhd_mhiargs_t *mhiargsp, + int partial_set, + md_error_t *ep +) +{ + mddrivenamelist_t *p; + uint_t ndev = 0; + mhd_tkown_args_t args; + mhd_error_t mherror; + mhd_set_t *mhsp = &args.set; + uint_t i; + char *e; + mhd_handle_t *hp = NULL; + int rval = -1; + + /* + * RFE 4126509. Check the runtime parameters to see if + * they're set to disable MHIOCTKOWN ioctl() operations + * on the disks. If so, return immediately without + * performing the operations. 
+ */ + + if (do_owner_ioctls() == B_FALSE) { + return (0); + } + + /* count drives, get set */ + for (p = dnlp; (p != NULL); p = p->next) + ++ndev; + if (ndev == 0) + return (0); + + /* initialize */ + (void) memset(&args, 0, sizeof (args)); + (void) memset(&mherror, 0, sizeof (mherror)); + + /* build arguments */ + mhsp->setname = Strdup(sname); + mhsp->drives.drives_len = ndev; + mhsp->drives.drives_val + = Calloc(ndev, sizeof (*mhsp->drives.drives_val)); + for (p = dnlp, i = 0; (i < ndev); p = p->next, ++i) { + mhsp->drives.drives_val[i] = Strdup(p->drivenamep->rname); + } + args.timeouts = *mhiargsp; + args.ff_mode = MHD_FF_DRIVER; + if (((e = getenv("MD_DEBUG")) != NULL) && + ((e = strstr(e, "FAILFAST=")) != NULL) && + ((e = strchr(e, '=')) != NULL)) { + ++e; + if (strcmp(e, "NONE") == 0) + args.ff_mode = MHD_FF_NONE; + else if (strcmp(e, "DRIVER") == 0) + args.ff_mode = MHD_FF_DRIVER; + else if (strcmp(e, "DEBUG") == 0) + args.ff_mode = MHD_FF_DEBUG; + else if (strcmp(e, "HALT") == 0) + args.ff_mode = MHD_FF_HALT; + else if (strcmp(e, "PANIC") == 0) + args.ff_mode = MHD_FF_PANIC; + } + if (partial_set) + args.options |= MHD_PARTIAL_SET; + if (((e = getenv("MD_DEBUG")) != NULL) && + (strstr(e, "NOTHREAD") != NULL)) { + args.options |= MHD_SERIAL; + } + + /* open connection */ + if ((hp = open_metamhd(NULL, ep)) == NULL) + return (-1); + clnt_control(hp->clientp, CLSET_TIMEOUT, (char *)&tk_own_timeout); + + /* take ownership */ + if (mhd_tkown_1(&args, &mherror, hp->clientp) != RPC_SUCCESS) { + (void) mdrpcerror(ep, hp->clientp, hp->hostname, + "metamhd tkown"); + } else if (mhstealerror(&mherror, ep) == 0) { + rval = 0; /* success */ + } + + /* cleanup, return success */ +out: + xdr_free(xdr_mhd_tkown_args_t, (char *)&args); + xdr_free(xdr_mhd_error_t, (char *)&mherror); + if (hp != NULL) + close_metamhd(hp); + return (rval); +} + +/* + * take ownership of drives + */ +int +tk_own_bydd( + mdsetname_t *sp, + md_drive_desc *ddlp, + mhd_mhiargs_t *mhiargsp, + int 
partial_set, + md_error_t *ep +) +{ + mddrivenamelist_t *dnlp = NULL; + mddrivenamelist_t **tailpp = &dnlp; + md_drive_desc *p; + int rval; + + /* + * Add the drivename struct to the end of the + * drivenamelist but keep a pointer to the last + * element so that we don't incur the overhead + * of traversing the list each time + */ + for (p = ddlp; (p != NULL); p = p->dd_next) + tailpp = meta_drivenamelist_append_wrapper(tailpp, p->dd_dnp); + + /* take ownership */ + rval = meta_take_own(sp->setname, dnlp, mhiargsp, partial_set, ep); + + /* cleanup, return success */ + metafreedrivenamelist(dnlp); + return (rval); +} + +/* + * release ownership of drives + */ +int +meta_rel_own( + char *sname, + mddrivenamelist_t *dnlp, + int partial_set, + md_error_t *ep +) +{ + mddrivenamelist_t *p; + uint_t ndev = 0; + mhd_relown_args_t args; + mhd_error_t mherror; + mhd_set_t *mhsp = &args.set; + uint_t i; + char *e; + mhd_handle_t *hp = NULL; + int rval = -1; + + /* + * RFE 4126509. Check the runtime parameters to see if + * they're set to disable MHIOCRELEASE and MHIOCENFAILFAST + * ioctl() operations on the disks. If so, return + * immediately without performing the operations. + */ + + if (do_owner_ioctls() == B_FALSE) { + return (0); + } + + /* + * if not doing ioctls (HK 98/10/28: the following code tests + * an environment variable, and was apparently inserted to + * make testing easier.) + */ + + if (! 
do_mhioctl()) + return (0); + + /* count drives, get set */ + for (p = dnlp; (p != NULL); p = p->next) + ++ndev; + if (ndev == 0) + return (0); + + /* initialize */ + (void) memset(&args, 0, sizeof (args)); + (void) memset(&mherror, 0, sizeof (mherror)); + + /* build arguments */ + mhsp->setname = Strdup(sname); + mhsp->drives.drives_len = ndev; + mhsp->drives.drives_val + = Calloc(ndev, sizeof (*mhsp->drives.drives_val)); + for (p = dnlp, i = 0; (i < ndev); p = p->next, ++i) { + mhsp->drives.drives_val[i] = Strdup(p->drivenamep->rname); + } + if (partial_set) + args.options |= MHD_PARTIAL_SET; + if (((e = getenv("MD_DEBUG")) != NULL) && + (strstr(e, "NOTHREAD") != NULL)) { + args.options |= MHD_SERIAL; + } + + /* open connection */ + if ((hp = open_metamhd(NULL, ep)) == NULL) + return (-1); + clnt_control(hp->clientp, CLSET_TIMEOUT, (char *)&rel_own_timeout); + + /* take ownership */ + if (mhd_relown_1(&args, &mherror, hp->clientp) != RPC_SUCCESS) { + (void) mdrpcerror(ep, hp->clientp, hp->hostname, + "metamhd relown"); + } else if (mhstealerror(&mherror, ep) == 0) { + rval = 0; /* success */ + } + + /* cleanup, return success */ +out: + xdr_free(xdr_mhd_relown_args_t, (char *)&args); + xdr_free(xdr_mhd_error_t, (char *)&mherror); + if (hp != NULL) + close_metamhd(hp); + return (rval); +} + +/* + * release ownership of drives + */ +int +rel_own_bydd( + mdsetname_t *sp, + md_drive_desc *ddlp, + int partial_set, + md_error_t *ep +) +{ + mddrivenamelist_t *dnlp = NULL; + mddrivenamelist_t **tailpp = &dnlp; + md_drive_desc *p; + int rval; + + /* + * Add the drivename struct to the end of the + * drivenamelist but keep a pointer to the last + * element so that we don't incur the overhead + * of traversing the list each time + */ + for (p = ddlp; (p != NULL); p = p->dd_next) + tailpp = meta_drivenamelist_append_wrapper(tailpp, p->dd_dnp); + + /* release ownership */ + rval = meta_rel_own(sp->setname, dnlp, partial_set, ep); + + /* cleanup, return success */ + 
metafreedrivenamelist(dnlp); + return (rval); +} + +/* + * get status of drives + */ +int +meta_status_own( + char *sname, + md_disk_status_list_t *dslp, + int partial_set, + md_error_t *ep +) +{ + md_disk_status_list_t *p; + uint_t ndev = 0; + mhd_status_args_t args; + mhd_status_res_t results; + mhd_error_t *mhep = &results.status; + mhd_set_t *mhsp = &args.set; + uint_t i; + char *e; + mhd_handle_t *hp = NULL; + int rval = -1; + + /* if not doing ioctls */ + if (! do_mhioctl()) + return (0); + + /* count drives, get set */ + for (p = dslp; (p != NULL); p = p->next) + ++ndev; + if (ndev == 0) + return (0); + + /* initialize */ + (void) memset(&args, 0, sizeof (args)); + (void) memset(&results, 0, sizeof (results)); + + /* build arguments */ + mhsp->setname = Strdup(sname); + mhsp->drives.drives_len = ndev; + mhsp->drives.drives_val + = Calloc(ndev, sizeof (*mhsp->drives.drives_val)); + for (p = dslp, i = 0; (i < ndev); p = p->next, ++i) { + mhsp->drives.drives_val[i] = Strdup(p->drivenamep->rname); + } + if (partial_set) + args.options |= MHD_PARTIAL_SET; + if (((e = getenv("MD_DEBUG")) != NULL) && + (strstr(e, "NOTHREAD") != NULL)) { + args.options |= MHD_SERIAL; + } + + /* open connection */ + if ((hp = open_metamhd(NULL, ep)) == NULL) + return (-1); + clnt_control(hp->clientp, CLSET_TIMEOUT, (char *)&tk_own_timeout); + + /* get status */ + if (mhd_status_1(&args, &results, hp->clientp) != RPC_SUCCESS) { + (void) mdrpcerror(ep, hp->clientp, hp->hostname, + dgettext(TEXT_DOMAIN, "metamhd status")); + goto out; + } else if (mhstealerror(mhep, ep) != 0) { + goto out; + } + + /* do something with it */ + assert(results.results.results_len == ndev); + for (p = dslp, i = 0; (i < ndev); p = p->next, ++i) { + mhd_drive_status_t *resp = &results.results.results_val[i]; + mddrivename_t *dp = p->drivenamep; + mhd_error_t mherror; + + /* make sure we have the right drive */ + assert(strcmp(dp->rname, resp->drive) == 0); + + /* copy status */ + if (resp->errnum != 0) { + 
(void) memset(&mherror, 0, sizeof (mherror)); + mherror.errnum = resp->errnum; + mherror.name = Strdup(resp->drive); + (void) mhstealerror(&mherror, &p->status); + } + } + rval = 0; /* success */ + + /* cleanup, return success */ +out: + xdr_free(xdr_mhd_status_args_t, (char *)&args); + xdr_free(xdr_mhd_status_res_t, (char *)&results); + if (hp != NULL) + close_metamhd(hp); + return (rval); +} + +/* + * build disk status list from drivename list + */ +md_disk_status_list_t * +meta_drive_to_disk_status_list( + mddrivenamelist_t *dnlp +) +{ + md_disk_status_list_t *head = NULL; + md_disk_status_list_t **tailp = &head; + mddrivenamelist_t *p; + + /* copy list */ + for (p = dnlp; (p != NULL); p = p->next) { + md_disk_status_list_t *dsp; + + dsp = *tailp = Zalloc(sizeof (*dsp)); + tailp = &dsp->next; + dsp->drivenamep = p->drivenamep; + } + + /* return list */ + return (head); +} + +/* + * free disk status list + */ +void +meta_free_disk_status_list( + md_disk_status_list_t *dslp +) +{ + md_disk_status_list_t *next = NULL; + + for (/* void */; (dslp != NULL); dslp = next) { + next = dslp->next; + mdclrerror(&dslp->status); + Free(dslp); + } +} + +/* + * free drive info list + */ +void +meta_free_drive_info_list( + mhd_drive_info_list_t *listp +) +{ + xdr_free(xdr_mhd_drive_info_list_t, (char *)listp); + (void) memset(listp, 0, sizeof (*listp)); +} + +/* + * sort drive info list + */ +static int +compare_drives( + const void *p1, + const void *p2 +) +{ + const mhd_drive_info_t *di1 = p1; + const mhd_drive_info_t *di2 = p2; + const char *n1 = di1->dif_name; + const char *n2 = di2->dif_name; + uint_t c1 = 0, t1 = 0, d1 = 0, s1 = 0; + uint_t c2 = 0, t2 = 0, d2 = 0, s2 = 0; + uint_t l, cl; + + if (n1 == NULL) + n1 = ""; + if (n2 == NULL) + n2 = ""; + + /* attempt to sort correctly for c0t1d0s0 .vs. c0t18d0s0 */ + if ((n1 = strrchr(n1, '/')) == NULL) + goto u; + n1 += (n1[1] != 'c') ? 
2 : 1; + cl = strlen(n1); + if ((sscanf(n1, "c%ut%ud%us%u%n", &c1, &t1, &d1, &s1, &l) != 4 && + sscanf(n1, "c%ud%us%u%n", &c1, &d1, &s1, &l) != 3 && + sscanf(n1, "c%ut%ud%u%n", &c1, &t1, &d1, &l) != 3 && + sscanf(n1, "c%ud%u%n", &c1, &d1, &l) != 2) || (l != cl)) + goto u; + + if ((n2 = strrchr(n2, '/')) == NULL) + goto u; + n2 += (n2[1] != 'c') ? 2 : 1; + cl = strlen(n2); + if ((sscanf(n2, "c%ut%ud%us%u%n", &c2, &t2, &d2, &s2, &l) != 4 && + sscanf(n2, "c%ud%us%u%n", &c2, &d2, &s2, &l) != 3 && + sscanf(n2, "c%ut%ud%u%n", &c2, &t2, &d2, &l) != 3 && + sscanf(n2, "c%ud%u%n", &c2, &d2, &l) != 2) || (l != cl)) + goto u; + if (c1 != c2) + return ((c1 > c2) ? 1 : -1); + if (t1 != t2) + return ((t1 > t2) ? 1 : -1); + if (d1 != d2) + return ((d1 > d2) ? 1 : -1); + if (s1 != s2) + return ((s1 > s2) ? 1 : -1); + return (0); + +u: return (strcmp(di1->dif_name, di2->dif_name)); +} + +static void +sort_drives( + mhd_drive_info_list_t *listp +) +{ + qsort(listp->mhd_drive_info_list_t_val, + listp->mhd_drive_info_list_t_len, + sizeof (*listp->mhd_drive_info_list_t_val), + compare_drives); +} + +/* + * return list of all drives + */ +int +meta_list_drives( + char *hostname, + char *path, + mhd_did_flags_t flags, + mhd_drive_info_list_t *listp, + md_error_t *ep +) +{ + mhd_list_args_t args; + mhd_list_res_t results; + mhd_error_t *mhep = &results.status; + mhd_handle_t *hp = NULL; + int rval = -1; + + /* if not doing ioctls */ + if (! 
do_mhioctl()) + return (0); + + /* initialize */ + (void) memset(&args, 0, sizeof (args)); + (void) memset(&results, 0, sizeof (results)); + + /* build arguments */ + if (path == NULL) + path = getenv("MD_DRIVE_ROOT"); + if ((path != NULL) && (*path != '\0')) + args.path = Strdup(path); + args.flags = flags; + + /* open connection */ + if ((hp = open_metamhd(hostname, ep)) == NULL) + return (-1); + clnt_control(hp->clientp, CLSET_TIMEOUT, (char *)&tk_own_timeout); + + /* get list */ + if (mhd_list_1(&args, &results, hp->clientp) != RPC_SUCCESS) { + (void) mdrpcerror(ep, hp->clientp, hp->hostname, + dgettext(TEXT_DOMAIN, "metamhd list")); + goto out; + } else if (mhstealerror(mhep, ep) != 0) { + goto out; + } + + /* sort list */ + sort_drives(&results.results); + + /* steal list */ + *listp = results.results; + results.results.mhd_drive_info_list_t_len = 0; + results.results.mhd_drive_info_list_t_val = NULL; + rval = listp->mhd_drive_info_list_t_len; /* success */ + + /* cleanup, return success */ +out: + xdr_free(xdr_mhd_list_args_t, (char *)&args); + xdr_free(xdr_mhd_list_res_t, (char *)&results); + if (hp != NULL) + close_metamhd(hp); + return (rval); +} + +static void +load_paths_to_metamhd() +{ + FILE *cfp; /* config file pointer */ + char buf[BUFSIZ], + *p, + *x; + mhd_drive_info_list_t list; + md_error_t ep; + mhd_did_flags_t flags = MHD_DID_SERIAL; + + if ((cfp = fopen(METADEVPATH, "r")) != NULL) { + /* + * Read each line from the file. Lines will be either + * comments or path names to pass to rpc.metamhd. If + * path names check to see if their a colon seperate + * list of names which must be processed one at a time. 
+ */ + + while (fgets(buf, BUFSIZ, cfp) != NULL) { + if (buf[0] == '#') { + /* + * Ignore comment lines + */ + continue; + + } else if (strchr(buf, ':') != NULL) { + p = buf; + while ((x = strchr(p, ':')) != NULL) { + *x = '\0'; + (void) memset(&ep, '\0', sizeof (ep)); + (void) meta_list_drives(NULL, p, 0, + &list, &ep); + meta_free_drive_info_list(&list); + p = x + 1; + } + /* + * We won't pick up the last path name + * because the line ends with a newline + * not a ':'. So p will still point to + * a valid path in this case. Copy the + * data that p points to to the beginning + * of the buf and let the default case + * handle this buffer. + * NOTE: + * If the file does end with a ":\n", p at + * will point to the newline. The default + * cause would then set the newline to a + * NULL which is okay because meta_list_drives + * interprets a null string as /dev/rdsk. + */ + (void) memcpy(buf, p, strlen(p)); + } + /* + * Remove any newlines in the buffer. + */ + if ((p = strchr(buf, '\n')) != NULL) + *p = '\0'; + (void) memset(&ep, '\0', sizeof (ep)); + (void) memset(&list, '\0', sizeof (list)); + (void) meta_list_drives(NULL, buf, flags, &list, &ep); + meta_free_drive_info_list(&list); + } + (void) fclose(cfp); + } +} + +/* + * build list of all drives in set + */ +/*ARGSUSED*/ +int +meta_get_drive_names( + mdsetname_t *sp, + mddrivenamelist_t **dnlpp, + int options, + md_error_t *ep +) +{ + mhd_did_flags_t flags = MHD_DID_SERIAL; + mhd_drive_info_list_t list; + mhd_drive_info_t *mp; + uint_t i; + unsigned cnt = 0; + int rval = -1; + mddrivenamelist_t **tailpp = dnlpp; + + /* must have a set */ + assert(sp != NULL); + + load_paths_to_metamhd(); + (void) memset(&list, 0, sizeof (list)); + if ((meta_list_drives(NULL, NULL, flags, &list, ep)) < 0) + return (-1); + + /* find drives in set */ + for (i = 0; (i < list.mhd_drive_info_list_t_len); ++i) { + mddrivename_t *dnp; + mdname_t *np; + + mp = &list.mhd_drive_info_list_t_val[i]; + + if (mp->dif_id.did_flags & 
MHD_DID_DUPLICATE) + continue; + + /* quietly skip drives which don't conform */ + if ((dnp = metadrivename(&sp, mp->dif_name, ep)) == NULL) { + mdclrerror(ep); + continue; + } + + /* check in set */ + if ((np = metaslicename(dnp, MD_SLICE0, ep)) == NULL) + goto out; + if (meta_check_inset(sp, np, ep) != 0) { + mdclrerror(ep); + continue; + } + + /* + * Add the drivename struct to the end of the + * drivenamelist but keep a pointer to the last + * element so that we don't incur the overhead + * of traversing the list each time + */ + tailpp = meta_drivenamelist_append_wrapper(tailpp, dnp); + ++cnt; + } + rval = cnt; + + /* cleanup, return error */ +out: + meta_free_drive_info_list(&list); + return (rval); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_mirror.c b/usr/src/lib/lvm/libmeta/common/meta_mirror.c new file mode 100644 index 0000000000..8be4ada7ae --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_mirror.c @@ -0,0 +1,2762 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
/*
 * get mirror unit
 *
 * Build (or fetch from cache) the md_mirror_t description of the mirror
 * named by mirnp.
 *
 *	sp	set the mirror belongs to; must not be NULL
 *	mirnp	name of the mirror
 *	fast	passed through unchanged to metakeyname() when resolving
 *		submirror names
 *	ep	receives the error on failure
 *
 * Returns the mirror description, or NULL with ep set on error.  On
 * success the result is cached in mirnp->drivenamep->unitp, and later
 * calls return that cached pointer without consulting the driver again.
 */
static md_mirror_t *
meta_get_mirror_common(
	mdsetname_t	*sp,
	mdname_t	*mirnp,
	int		fast,
	md_error_t	*ep
)
{
	mddrivename_t	*dnp = mirnp->drivenamep;
	char		*miscname;
	mm_unit_t	*mm;
	md_mirror_t	*mirrorp;
	uint_t		smi, nsm;
	md_resync_ioctl_t	ri;

	/* must have set */
	assert(sp != NULL);

	/* short circuit: a previous call already cached the unit */
	if (dnp->unitp != NULL) {
		assert(dnp->unitp->type == MD_METAMIRROR);
		return ((md_mirror_t *)dnp->unitp);
	}

	/* get miscname and unit; reject anything that is not a mirror */
	if ((miscname = metagetmiscname(mirnp, ep)) == NULL)
		return (NULL);
	if (strcmp(miscname, MD_MIRROR) != 0) {
		(void) mdmderror(ep, MDE_NOT_MM, meta_getminor(mirnp->dev),
		    mirnp->cname);
		return (NULL);
	}
	if ((mm = (mm_unit_t *)meta_get_mdunit(sp, mirnp, ep)) == NULL)
		return (NULL);
	assert(mm->c.un_type == MD_METAMIRROR);

	/* allocate mirror (zero-filled, so unused submirror slots are NULL) */
	mirrorp = Zalloc(sizeof (*mirrorp));

	/* get common info */
	mirrorp->common.namep = mirnp;
	mirrorp->common.type = mm->c.un_type;
	mirrorp->common.state = mm->c.un_status;
	mirrorp->common.capabilities = mm->c.un_capabilities;
	mirrorp->common.parent = mm->c.un_parent;
	mirrorp->common.size = mm->c.un_total_blocks;
	mirrorp->common.user_flags = mm->c.un_user_flags;
	mirrorp->common.revision = mm->c.un_revision;

	/* get options */
	mirrorp->read_option = mm->un_read_option;
	mirrorp->write_option = mm->un_write_option;
	mirrorp->pass_num = mm->un_pass_num;

	/* get submirrors */
	for (smi = 0, nsm = 0; (smi < NMIRROR); ++smi) {
		mm_submirror_t	*mmsp = &mm->un_sm[smi];
		md_submirror_t	*mdsp = &mirrorp->submirrors[smi];

		/* get submirror state; unused slots keep a NULL name */
		mdsp->state = mmsp->sm_state;
		if (mdsp->state == SMS_UNUSED)
			continue;
		++nsm;

		/* get submirror time of last state change */
		mdsp->timestamp = mmsp->sm_timestamp;

		/* get submirror flags */
		mdsp->flags = mmsp->sm_flags;

		/* get submirror name */
		mdsp->submirnamep = metakeyname(&sp, mmsp->sm_key, fast, ep);
		if (mdsp->submirnamep == NULL)
			goto out;
	}
	/* the number of slots in use must agree with the unit's own count */
	assert(nsm == mm->un_nsm);

	/* get resync info */
	(void) memset(&ri, 0, sizeof (ri));
	ri.ri_mnum = meta_getminor(mirnp->dev);
	MD_SETDRIVERNAME(&ri, MD_MIRROR, sp->setno);
	if (metaioctl(MD_IOCGETSYNC, &ri, &ri.mde, mirnp->cname) != 0) {
		(void) mdstealerror(ep, &ri.mde);
		goto out;
	}
	mirrorp->percent_done = ri.ri_percent_done;
	mirrorp->percent_dirty = ri.ri_percent_dirty;

	/* cleanup, return success; cache the result for later calls */
	Free(mm);
	dnp->unitp = (md_common_t *)mirrorp;
	return (mirrorp);

	/* cleanup, return error */
out:
	Free(mm);
	meta_free_mirror(mirrorp);
	return (NULL);
}
md_error_t *ep +) +{ + md_mirror_t *mirrorp; + uint_t smi; + + /* should be in the same set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(mirnp->dev))); + + /* get unit */ + if ((mirrorp = meta_get_mirror(sp, mirnp, ep)) == NULL) + return (-1); + + /* look in submirrors */ + for (smi = 0; (smi < NMIRROR); ++smi) { + md_submirror_t *mdsp = &mirrorp->submirrors[smi]; + mdname_t *submirnp = mdsp->submirnamep; + + /* skip unused submirrors */ + if (submirnp == NULL) { + assert(mdsp->state == SMS_UNUSED); + continue; + } + + /* check overlap */ + if (metaismeta(submirnp)) + continue; + if (meta_check_overlap(mirnp->cname, np, slblk, nblks, + submirnp, 0, -1, ep) != 0) + return (-1); + } + + /* return success */ + return (0); +} + +/* + * check to see if we're in a mirror + */ +int +meta_check_inmirror( + mdsetname_t *sp, + mdname_t *np, + diskaddr_t slblk, + diskaddr_t nblks, + md_error_t *ep +) +{ + mdnamelist_t *mirrornlp = NULL; + mdnamelist_t *p; + int rval = 0; + + /* should have a set */ + assert(sp != NULL); + + /* for each mirror */ + if (meta_get_mirror_names(sp, &mirrornlp, 0, ep) < 0) + return (-1); + for (p = mirrornlp; (p != NULL); p = p->next) { + mdname_t *mirnp = p->namep; + + /* check mirror */ + if (in_mirror(sp, mirnp, np, slblk, nblks, ep) != 0) { + rval = -1; + break; + } + } + + /* cleanup, return success */ + metafreenamelist(mirrornlp); + return (rval); +} + +/* + * Check to see if the primary mirror is built on top of a + * root slice which is mounted. This check is primarily to + * account for this case - + * + * # metainit -f d1 1 1 <root slice> + * # metainit d0 -m d1 + * # metainit d2 1 1 ctds + * # metattach d0 d2 + * + * The metattach here needs to fail if the root slice is + * being mirrored; otherwise there is a potential for + * data corruption. 
+ */ +static int +meta_check_primary_mirror( + mdsetname_t *sp, + mdname_t *mirnp, + md_error_t *ep +) +{ + int smi; + char *curroot; + mdname_t *rootnp; + md_mirror_t *mirrorp; + md_stripe_t *stripep; + md_row_t *rp; + md_comp_t *cp; + + if ((curroot = meta_get_current_root(ep)) == NULL) + return (-1); + /* + * Get device name of current root metadevice. If root + * is net mounted as happens if we're part of the + * install process, rootnp will be set to NULL and we + * return success. + */ + if ((rootnp = metaname(&sp, curroot, ep)) == NULL) + return (0); + /* + * If the currently mounted root slice is not a + * ctds, we don't bother checking + */ + if ((!metaismeta(rootnp)) && metaismeta(mirnp)) { + if ((mirrorp = meta_get_mirror(sp, mirnp, ep)) == NULL) + return (-1); + + for (smi = 0; (smi < NMIRROR); ++smi) { + /* Check all submirrors */ + md_submirror_t *mdsp = &mirrorp->submirrors[smi]; + mdname_t *submirnamep = mdsp->submirnamep; + + /* skip unused submirrors */ + if (submirnamep == NULL) { + assert(mdsp->state == SMS_UNUSED); + continue; + } + /* check if submirror is a stripe or not */ + if (strcmp(metagetmiscname(submirnamep, ep), MD_STRIPE) + != 0) + return (-1); + if ((stripep = meta_get_stripe(sp, submirnamep, ep)) + == NULL) + return (-1); + + /* + * Examine the first component of the first row and + * check to see if it has a mounted root slice + */ + rp = &stripep->rows.rows_val[0]; + cp = &rp->comps.comps_val[0]; + /* + * we just care about the component built on + * top of a raw device + */ + if (!metaismeta(cp->compnamep)) { + /* + * If root device is the 1st component of + * the stripe, then fail. 
+ */ + if (strcmp(rootnp->cname, cp->compnamep->cname) + == 0) { + (void) mduseerror(ep, MDE_IS_MOUNTED, + rootnp->dev, "/", rootnp->cname); + return (-1); + } + } + } + } + /* return success */ + return (0); +} + +/* + * check submirror + */ +int +meta_check_submirror( + mdsetname_t *sp, + mdname_t *np, + mdname_t *mirnp, + int force, + md_error_t *ep +) +{ + mdchkopts_t options = 0; + md_common_t *mdp; + + /* make sure we have a metadevice disk */ + if (metachkmeta(np, ep) != 0) + return (-1); + + /* + * Check to see if the primary mirror consists of a root + * mounted device + */ + if (mirnp && (!force) && ((meta_check_primary_mirror(sp, mirnp, ep) + != 0))) + return (-1); + + /* check to ensure that it is not already in use */ + if ((! force) && + (meta_check_inuse(sp, np, MDCHK_INUSE, ep) != 0)) { + return (-1); + } + + /* make sure it is in the set */ + if (meta_check_inset(sp, np, ep) != 0) + return (-1); + + /* make sure its not in a metadevice */ + if (! metaismeta(np)) { /* Non-metadevices */ + if (meta_check_inmeta(sp, np, options, 0, -1, ep) != 0) + return (-1); + } else { /* Metadevices only! */ + /* make sure it can be parented */ + if ((mdp = meta_get_unit(sp, np, ep)) == NULL) + return (-1); + + if ((! (mdp->capabilities & MD_CAN_PARENT)) || + (! 
(mdp->capabilities & MD_CAN_SUB_MIRROR)) || + (mdp->parent != MD_NO_PARENT)) { + return (mdmderror(ep, MDE_INVAL_UNIT, + meta_getminor(np->dev), np->cname)); + } + } + + /* return success */ + return (0); +} + +/* + * convert read options + */ +char * +rd_opt_to_name( + mm_rd_opt_t opt +) +{ + switch (opt) { + case RD_LOAD_BAL: + return ("roundrobin"); + case RD_GEOMETRY: + return ("geometric"); + case RD_FIRST: + return ("first"); + default: + assert(0); + return (dgettext(TEXT_DOMAIN, "invalid")); + } +} + +static char * +rd_opt_to_opt( + mm_rd_opt_t opt +) +{ + switch (opt) { + case RD_LOAD_BAL: + return (NULL); /* default */ + case RD_GEOMETRY: + return ("-g"); + case RD_FIRST: + return ("-r"); + default: + assert(0); + return (dgettext(TEXT_DOMAIN, "invalid")); + } +} + +int +name_to_rd_opt( + char *uname, + char *name, + mm_rd_opt_t *optp, + md_error_t *ep +) +{ + if (strcasecmp(name, "roundrobin") == 0) { + *optp = RD_LOAD_BAL; + return (0); + } + if (strcasecmp(name, "geometric") == 0) { + *optp = RD_GEOMETRY; + return (0); + } + if (strcasecmp(name, "first") == 0) { + *optp = RD_FIRST; + return (0); + } + return (meta_cook_syntax(ep, MDE_BAD_RD_OPT, uname, 1, &name)); +} + +/* + * convert write options + */ +char * +wr_opt_to_name( + mm_wr_opt_t opt +) +{ + switch (opt) { + case WR_PARALLEL: + return ("parallel"); + case WR_SERIAL: + return ("serial"); + default: + assert(0); + return (dgettext(TEXT_DOMAIN, "invalid")); + } +} + +static char * +wr_opt_to_opt( + mm_wr_opt_t opt +) +{ + switch (opt) { + case WR_PARALLEL: + return (NULL); /* default */ + case WR_SERIAL: + return ("-S"); + default: + assert(0); + return (dgettext(TEXT_DOMAIN, "invalid")); + } +} + +int +name_to_wr_opt( + char *uname, + char *name, + mm_wr_opt_t *optp, + md_error_t *ep +) +{ + if (strcasecmp(name, "parallel") == 0) { + *optp = WR_PARALLEL; + return (0); + } + if (strcasecmp(name, "serial") == 0) { + *optp = WR_SERIAL; + return (0); + } + return (meta_cook_syntax(ep, 
MDE_BAD_WR_OPT, uname, 1, &name)); +} + +/* + * convert pass numbers + */ +int +name_to_pass_num( + char *uname, + char *name, + mm_pass_num_t *passp, + md_error_t *ep +) +{ + if ((sscanf(name, "%hd", passp) != 1) || + (*passp < 0) || (*passp > MD_PASS_MAX)) { + return (meta_cook_syntax(ep, MDE_BAD_PASS_NUM, + uname, 1, &name)); + } + return (0); +} + +/* + * convert resync option + */ + +static char * +resync_opt_to_name( + uint_t tstate +) +{ + if (tstate & MD_ABR_CAP) + return (dgettext(TEXT_DOMAIN, "application based")); + else + return (dgettext(TEXT_DOMAIN, "optimized resync")); +} + +/* + * print mirror + */ +static int +mirror_print( + md_mirror_t *mirrorp, + char *fname, + FILE *fp, + mdprtopts_t options, + md_error_t *ep +) +{ + uint_t smi; + char *p; + int rval = -1; + + + if (options & PRINT_LARGEDEVICES) { + if (mirrorp->common.revision != MD_64BIT_META_DEV) { + rval = 0; + goto out; + } + } + + /* print name and -m */ + if (fprintf(fp, "%s -m", mirrorp->common.namep->cname) == EOF) + goto out; + + /* print submirrors */ + for (smi = 0; (smi < NMIRROR); ++smi) { + md_submirror_t *mdsp = &mirrorp->submirrors[smi]; + mdname_t *submirnamep = mdsp->submirnamep; + + /* skip unused submirrors */ + if (submirnamep == NULL) { + assert(mdsp->state == SMS_UNUSED); + continue; + } + + /* print submirror */ + if (fprintf(fp, " %s", submirnamep->cname) == EOF) + goto out; + } + + /* print options */ + if ((p = rd_opt_to_opt(mirrorp->read_option)) != NULL) { + if (fprintf(fp, " %s", p) == EOF) + goto out; + } + if ((p = wr_opt_to_opt(mirrorp->write_option)) != NULL) { + if (fprintf(fp, " %s", p) == EOF) + goto out; + } + if (fprintf(fp, " %u\n", mirrorp->pass_num) == EOF) + goto out; + + /* success */ + rval = 0; + + /* cleanup, return error */ +out: + if (rval != 0) + (void) mdsyserror(ep, errno, fname); + return (rval); +} + +/* + * convert submirror state to name + */ +char * +sm_state_to_name( + md_submirror_t *mdsp, + md_status_t mirror_status, + md_timeval32_t 
/*
 * convert submirror state to repair action
 *
 * Work out which command, if any, should be suggested for the submirror
 * described by mdsp, given the state of its mirror.
 *
 *	sp		set, passed on to meta_find_erred_comp()
 *	mdsp		submirror being examined
 *	mirror_status	status bits of the owning mirror
 *	mirrorp		owning mirror; its name appears in the suggestion
 *	actionp		receives the suggestion, or NULL if none is needed
 *	ep		receives the error on failure
 *
 * Returns 0 on success, -1 if a lookup failed.
 *
 * The suggestion is formatted into a single static buffer, so the string
 * returned through actionp is overwritten by the next call.  The checks
 * below are evaluated in order and the first one that applies wins.
 */
int
sm_state_to_action(
	mdsetname_t	*sp,
	md_submirror_t	*mdsp,
	md_status_t	mirror_status,
	md_mirror_t	*mirrorp,
	char		**actionp,
	md_error_t	*ep
)
{
	static char	buf[1024];
	mdname_t	*submirnamep = mdsp->submirnamep;
	sm_state_t	state = mdsp->state;
	char		*miscname;

	/* all is well: resync already running, or running with nothing pending */
	*actionp = NULL;
	if (mirror_status & MD_UN_RESYNC_ACTIVE)
		return (0);
	if ((state == SMS_RUNNING) && !(mirror_status & MD_UN_OPT_NOT_DONE))
		return (0);

	/* complete cancelled resync */
	if (mirror_status & MD_UN_RESYNC_CANCEL) {
		(void) snprintf(buf, sizeof (buf),
		    dgettext(TEXT_DOMAIN, "metasync %s"),
		    mirrorp->common.namep->cname);
		*actionp = buf;
		return (0);
	}

	/* replace stripe component */
	if ((metaismeta(submirnamep)) && (state & SMS_COMP_ERRED)) {
		if ((miscname = metagetmiscname(submirnamep, ep)) == NULL)
			return (-1);
		if (strcmp(miscname, MD_STRIPE) == 0) {
			mdname_t	*compnamep;
			comp_state_t	compstate;

			if (meta_find_erred_comp(sp, submirnamep,
			    &compnamep, &compstate, ep) != 0) {
				return (-1);
			}
			/*
			 * a component in CS_LAST_ERRED state gets the
			 * longer, two-step instruction
			 */
			if (compstate != CS_LAST_ERRED)
				(void) snprintf(buf, sizeof (buf),
				    "metareplace %s %s <%s>",
				    mirrorp->common.namep->cname,
				    compnamep->cname,
				    dgettext(TEXT_DOMAIN, "new device"));
			else
				(void) snprintf(buf, sizeof (buf),
				    dgettext(TEXT_DOMAIN,
				    "after replacing \"Maintenance\" "
				    "components:\n"
				    "\t\tmetareplace %s %s <new device>"),
				    mirrorp->common.namep->cname,
				    compnamep->cname);
			*actionp = buf;
			return (0);
		}
		/* errored submirror that is not a stripe: fall through */
	}

	/* resync mirror */
	if ((state & (SMS_ATTACHED_RESYNC | SMS_OFFLINE_RESYNC |
	    SMS_COMP_RESYNC | SMS_ATTACHED)) ||
	    (mirror_status & MD_UN_OPT_NOT_DONE)) {
		(void) snprintf(buf, sizeof (buf), "metasync %s",
		    mirrorp->common.namep->cname);
		*actionp = buf;
		return (0);
	}

	/* online submirror */
	if (state & SMS_OFFLINE) {
		(void) snprintf(buf, sizeof (buf), "metaonline %s %s",
		    mirrorp->common.namep->cname, submirnamep->cname);
		*actionp = buf;
		return (0);
	}

	/* unknown action: a placeholder is returned, not an error */
	*actionp = dgettext(TEXT_DOMAIN, "???");
	return (0);
}
+ /* print options */ + if (fprintf(fp, dgettext(TEXT_DOMAIN, " Pass: %u\n"), + pass_num) == EOF) { + goto out; + } + if ((p = rd_opt_to_opt(read_option)) == NULL) + p = dgettext(TEXT_DOMAIN, "default"); + if (fprintf(fp, dgettext(TEXT_DOMAIN, " Read option: %s (%s)\n"), + rd_opt_to_name(read_option), p) == EOF) { + goto out; + } + if ((p = wr_opt_to_opt(write_option)) == NULL) + p = dgettext(TEXT_DOMAIN, "default"); + if (fprintf(fp, dgettext(TEXT_DOMAIN, " Write option: %s (%s)\n"), + wr_opt_to_name(write_option), p) == EOF) { + goto out; + } + /* Display resync option for mirror, if MultiNode set */ + if (meta_is_mn_set(sp, ep)) { + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " Resync option: %s\n"), + resync_opt_to_name(tstate)) == EOF) { + goto out; + } + } + + /* success */ + rval = 0; + + /* cleanup, return error */ +out: + if (rval != 0) + (void) mdsyserror(ep, errno, fname); + return (rval); +} + +static char * +get_node_name(uint_t nid, md_error_t *ep) +{ + mndiskset_membershiplist_t *nl, *p; + int n; + char *node_nm; + + /* get the known membership list */ + if (meta_read_nodelist(&n, &nl, ep)) { + return (NULL); + } + + /* find the matching node and return the name */ + for (p = nl; (p != NULL); p = p->next) { + if (nid == p->msl_node_id) { + /* match found */ + node_nm = Strdup(p->msl_node_name); + goto out; + } + } + + /* match not found */ + node_nm = Strdup(dgettext(TEXT_DOMAIN, "None")); + +out: + meta_free_nodelist(nl); + return (node_nm); +} + +/* + * report mirror + */ +static int +mirror_report( + mdsetname_t *sp, + md_mirror_t *mirrorp, + mdnamelist_t **nlpp, + char *fname, + FILE *fp, + mdprtopts_t options, + md_error_t *ep +) +{ + md_status_t status = mirrorp->common.state; + uint_t smi; + char *p; + int rval = -1; + uint_t tstate = 0; + + /* + * check for the -B option. If -B and the metadevice is + * a 64 bit device, get the dev for relocation information + * printout. 
If not a 64 bit device, just don't print this + * information out but you need to go down to the subdevice + * level and print there if appropriate. + */ + if (options & PRINT_LARGEDEVICES) { + if (mirrorp->common.revision != MD_64BIT_META_DEV) { + for (smi = 0; (smi < NMIRROR); ++smi) { + md_submirror_t *mdsp = + &mirrorp->submirrors[smi]; + mdname_t *submirnamep = + mdsp->submirnamep; + if (submirnamep == NULL) { + continue; + } + if ((metaismeta(submirnamep)) && + (meta_print_name(sp, submirnamep, nlpp, + fname, fp, options | PRINT_SUBDEVS, NULL, + ep) != 0)) { + return (-1); + } + } + rval = 0; + goto out; + } else { + if (meta_getdevs(sp, mirrorp->common.namep, + nlpp, ep) != 0) + goto out; + } + } + + /* print header */ + if (options & PRINT_HEADER) { + if (fprintf(fp, dgettext(TEXT_DOMAIN, "%s: Mirror\n"), + mirrorp->common.namep->cname) == EOF) { + goto out; + } + } + + /* print submirrors, adjust status */ + for (smi = 0; (smi < NMIRROR); ++smi) { + md_submirror_t *mdsp = &mirrorp->submirrors[smi]; + mdname_t *submirnamep = mdsp->submirnamep; + char *sm_state; + md_timeval32_t tv; + char *timep; + + /* skip unused submirrors */ + if (submirnamep == NULL) { + assert(mdsp->state == SMS_UNUSED); + continue; + } + + if (mdsp->state & SMS_OFFLINE) + status &= ~MD_UN_OPT_NOT_DONE; + + /* print submirror */ + if (fprintf(fp, dgettext(TEXT_DOMAIN, " Submirror %u: %s\n"), + smi, submirnamep->cname) == EOF) { + goto out; + } + + /* print state */ + if (metaismeta(mdsp->submirnamep)) { + if (meta_get_tstate(mdsp->submirnamep->dev, &tstate, + ep) != 0) + return (-1); + } + sm_state = sm_state_to_name(mdsp, status, &tv, + tstate & MD_DEV_ERRORED); + if (options & PRINT_TIMES) { + timep = meta_print_time(&tv); + } else { + timep = ""; + } + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " State: %-12s %s\n"), + sm_state, timep) == EOF) { + goto out; + } + } + + /* print resync status */ + if (status & MD_UN_RESYNC_CANCEL) { + /* Resync was cancelled but is restartable */ + if 
(mirrorp->common.revision == MD_64BIT_META_DEV) { + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " Resync cancelled: %2d.%1d %% done\n"), + mirrorp->percent_done/10, + mirrorp->percent_done%10) == EOF) { + goto out; + } + } else { + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " Resync cancelled: %d %% done\n"), + mirrorp->percent_done) == EOF) { + goto out; + } + } + } else if (status & MD_UN_RESYNC_ACTIVE) { + if (mirrorp->common.revision == MD_64BIT_META_DEV) { + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " Resync in progress: %2d.%1d %% done\n"), + mirrorp->percent_done/10, + mirrorp->percent_done%10) == EOF) { + goto out; + } + } else { + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " Resync in progress: %d %% done\n"), + mirrorp->percent_done) == EOF) { + goto out; + } + } + } + + /* print options */ + if (meta_get_tstate(mirrorp->common.namep->dev, &tstate, ep) != 0) + return (-1); + + if (meta_print_mirror_options(mirrorp->read_option, + mirrorp->write_option, mirrorp->pass_num, + tstate, fname, sp, fp, ep) != 0) + return (-1); + + /* print mirror owner for multi-node metadevice */ + if (meta_is_mn_set(sp, ep)) { + md_set_mmown_params_t ownpar; + mdname_t *mirnp = mirrorp->common.namep; + char *node_name; + + (void) memset(&ownpar, 0, sizeof (ownpar)); + ownpar.d.mnum = meta_getminor(mirnp->dev); + MD_SETDRIVERNAME(&ownpar, MD_MIRROR, sp->setno); + + if (metaioctl(MD_MN_GET_MM_OWNER, &ownpar, ep, + "MD_MN_GET_MM_OWNER") != 0) { + return (-1); + } + + node_name = get_node_name(ownpar.d.owner, ep); + if (node_name == NULL) + return (-1); + else if (fprintf(fp, dgettext(TEXT_DOMAIN, " Owner: %s\n"), + node_name) == EOF) { + Free(node_name); + goto out; + } + Free(node_name); + + } + + /* print size */ + if (fprintf(fp, dgettext(TEXT_DOMAIN, " Size: %lld blocks (%s)\n"), + mirrorp->common.size, + meta_number_to_string(mirrorp->common.size, DEV_BSIZE)) + == EOF) { + goto out; + } + + /* MD_DEBUG stuff */ + if (options & PRINT_DEBUG) { + mdname_t *mirnp = mirrorp->common.namep; + 
mm_unit_t *mm; + mddb_optloc_t optloc; + uint_t i; + + /* get real mirror unit */ + if ((mm = (mm_unit_t *)meta_get_mdunit(sp, mirnp, ep)) + == NULL) { + return (-1); + } + assert(mm->c.un_type == MD_METAMIRROR); + + /* print dirty regions */ + if (fprintf(fp, dgettext(TEXT_DOMAIN, +" Regions which are dirty: %d%% (blksize %d num %d)\n"), + mirrorp->percent_dirty, mm->un_rrd_blksize, + mm->un_rrd_num) == EOF) { + Free(mm); + goto out; + } + + /* print optimized resync record locations */ + (void) memset(&optloc, 0, sizeof (optloc)); + optloc.recid = mm->un_rr_dirty_recid; + if (metaioctl(MD_DB_GETOPTLOC, &optloc, ep, + "MD_DB_GETOPTLOC") != 0) { + Free(mm); + return (-1); + } + for (i = 0; (i < ((sizeof optloc.li) / sizeof (optloc.li[0]))); + ++i) { + mddb_config_t dbconf; + char *devname; + + (void) memset(&dbconf, 0, sizeof (dbconf)); + dbconf.c_id = optloc.li[i]; + dbconf.c_setno = sp->setno; + dbconf.c_subcmd = MDDB_CONFIG_ABS; + /* Don't need device id information from this ioctl */ + dbconf.c_locator.l_devid = (uint64_t)0; + dbconf.c_locator.l_devid_flags = 0; + if (metaioctl(MD_DB_ENDDEV, &dbconf, &dbconf.c_mde, + "MD_DB_ENDDEV") != 0) { + Free(mm); + return (mdstealerror(ep, &dbconf.c_mde)); + } + if ((devname = splicename(&dbconf.c_devname)) + == NULL) { + devname = Strdup(dgettext(TEXT_DOMAIN, + "unknown")); + } + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " Resync record[%u]: %d (%s %d %d)\n"), i, + optloc.li[i], devname, dbconf.c_locator.l_blkno, + (dbconf.c_dbend - dbconf.c_locator.l_blkno + 1)) + == EOF) { + Free(mm); + Free(devname); + goto out; + } + Free(devname); + } + Free(mm); + } + + /* print submirror details */ + for (smi = 0; (smi < NMIRROR); ++smi) { + md_submirror_t *mdsp = &mirrorp->submirrors[smi]; + mdname_t *submirnamep = mdsp->submirnamep; + char *sm_state; + md_timeval32_t tv; + char *timep; + + /* skip unused submirrors */ + if (submirnamep == NULL) { + assert(mdsp->state == SMS_UNUSED); + continue; + } + + /* add extra line */ + if 
(fprintf(fp, "\n") == EOF) + goto out; + + /* print submirror */ + if (fprintf(fp, dgettext(TEXT_DOMAIN, + "%s: Submirror of %s\n"), + submirnamep->cname, + mirrorp->common.namep->cname) == EOF) { + goto out; + } + + /* print state */ + if (metaismeta(mdsp->submirnamep)) { + if (meta_get_tstate(mdsp->submirnamep->dev, &tstate, ep) + != 0) + return (-1); + } + sm_state = sm_state_to_name(mdsp, status, &tv, NULL); + if (options & PRINT_TIMES) { + timep = meta_print_time(&tv); + } else { + timep = ""; + } + + if ((tstate & MD_DEV_ERRORED) == 0) { + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " State: %-12s %s\n"), + sm_state, timep) == EOF) { + goto out; + } + + /* print what to do */ + if (sm_state_to_action(sp, mdsp, status, + mirrorp, &p, ep) != 0) + return (-1); + if ((p != NULL) && + (fprintf(fp, dgettext(TEXT_DOMAIN, + " Invoke: %s\n"), p) == EOF)) { + goto out; + } + } + + /* print underlying metadevice */ + if ((metaismeta(submirnamep)) && + (meta_print_name(sp, submirnamep, nlpp, fname, fp, + ((options & ~PRINT_HEADER) | PRINT_SUBDEVS), + NULL, ep) != 0)) { + return (-1); + } + } + + /* add extra line */ + if (fprintf(fp, "\n") == EOF) + goto out; + + /* success */ + rval = 0; + + /* cleanup, return error */ +out: + if (rval != 0) + (void) mdsyserror(ep, errno, fname); + return (rval); +} + +/* + * print/report mirror + */ +int +meta_mirror_print( + mdsetname_t *sp, + mdname_t *mirnp, + mdnamelist_t **nlpp, + char *fname, + FILE *fp, + mdprtopts_t options, + md_error_t *ep +) +{ + md_mirror_t *mirrorp; + uint_t smi; + + /* should have same set */ + assert(sp != NULL); + assert((mirnp == NULL) || + (sp->setno == MD_MIN2SET(meta_getminor(mirnp->dev)))); + + /* print all mirrors */ + if (mirnp == NULL) { + mdnamelist_t *nlp = NULL; + mdnamelist_t *p; + int cnt; + int rval = 0; + + /* get list */ + if ((cnt = meta_get_mirror_names(sp, &nlp, options, ep)) < 0) + return (-1); + else if (cnt == 0) + return (0); + + /* recurse */ + for (p = nlp; (p != NULL); p = p->next) 
{ + mdname_t *np = p->namep; + + if (meta_mirror_print(sp, np, nlpp, fname, fp, + options, ep) != 0) + rval = -1; + } + + /* cleanup, return success */ + metafreenamelist(nlp); + return (rval); + } + + /* get unit structure */ + if ((mirrorp = meta_get_mirror_common(sp, mirnp, + ((options & PRINT_FAST) ? 1 : 0), ep)) == NULL) + return (-1); + + /* check for parented */ + if ((! (options & PRINT_SUBDEVS)) && + (MD_HAS_PARENT(mirrorp->common.parent))) { + return (0); + } + + /* print appropriate detail */ + if (options & PRINT_SHORT) { + /* print mirror */ + if (mirror_print(mirrorp, fname, fp, options, ep) != 0) + return (-1); + + /* print underlying metadevices */ + for (smi = 0; (smi < NMIRROR); ++smi) { + md_submirror_t *mdsp = &mirrorp->submirrors[smi]; + mdname_t *submirnamep = mdsp->submirnamep; + + /* skip unused submirrors */ + if (submirnamep == NULL) { + assert(mdsp->state == SMS_UNUSED); + continue; + } + + /* print submirror */ + if (metaismeta(submirnamep)) { + if (meta_print_name(sp, submirnamep, nlpp, + fname, fp, (options | PRINT_SUBDEVS), NULL, + ep) != 0) { + return (-1); + } + } + } + + /* return success */ + return (0); + } else { + return (mirror_report(sp, mirrorp, nlpp, fname, fp, + options, ep)); + } +} + +/* + * online submirror + */ +int +meta_mirror_online( + mdsetname_t *sp, + mdname_t *mirnp, + mdname_t *submirnp, + mdcmdopts_t options, + md_error_t *ep +) +{ + md_i_off_on_t mio; + md_mirror_t *mirrorp; + md_set_desc *sd; + uint_t tstate; + + /* should have same set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(mirnp->dev))); + + /* check name */ + if (metachkmeta(mirnp, ep) != 0) + return (-1); + + if ((mirrorp = meta_get_mirror(sp, mirnp, ep)) == NULL) + return (-1); + + /* Only valid for mirror without ABR set */ + if (meta_get_tstate(mirrorp->common.namep->dev, &tstate, ep) != 0) + return (-1); + if (tstate & MD_ABR_CAP) { + (void) mderror(ep, MDE_ABR_SET, NULL); + return (-1); + } + + /* + * In a MN set, 
the master always executes the online command first. + * Before the master executes the IOC_ONLINE ioctl, + * the master sends a message to all nodes to suspend writes to + * this mirror. Then the master executes the IOC_ONLINE ioctl + * which resumes writes to this mirror from the master node. + * As each slave executes the online command, each slave will + * call the IOC_ONLINE ioctl which will resume writes to this mirror + * from that slave node. + */ + if (! metaislocalset(sp)) { + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + if ((MD_MNSET_DESC(sd)) && sd->sd_mn_am_i_master) + if (meta_mn_send_suspend_writes( + meta_getminor(mirnp->dev), ep) != 0) + return (-1); + } + + /* online submirror */ + (void) memset(&mio, 0, sizeof (mio)); + mio.mnum = meta_getminor(mirnp->dev); + MD_SETDRIVERNAME(&mio, MD_MIRROR, sp->setno); + mio.submirror = submirnp->dev; + if (metaioctl(MD_IOCONLINE, &mio, &mio.mde, NULL) != 0) + return (mdstealerror(ep, &mio.mde)); + + /* clear cache */ + meta_invalidate_name(mirnp); + meta_invalidate_name(submirnp); + + /* let em know */ + if (options & MDCMD_PRINT) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: submirror %s is onlined\n"), + mirnp->cname, submirnp->cname); + (void) fflush(stdout); + } + + /* return success */ + return (0); +} + +/* + * offline submirror + */ +int +meta_mirror_offline( + mdsetname_t *sp, + mdname_t *mirnp, + mdname_t *submirnp, + mdcmdopts_t options, + md_error_t *ep +) +{ + int force = ((options & MDCMD_FORCE) ? 
1 : 0); + md_i_off_on_t mio; + md_mirror_t *mirrorp; + md_set_desc *sd; + uint_t tstate; + + /* should have same set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(mirnp->dev))); + + /* check name */ + if (metachkmeta(mirnp, ep) != 0) + return (-1); + + if ((mirrorp = meta_get_mirror(sp, mirnp, ep)) == NULL) + return (-1); + + /* Only valid for mirror without ABR set */ + if (meta_get_tstate(mirrorp->common.namep->dev, &tstate, ep) != 0) + return (-1); + if (tstate & MD_ABR_CAP) { + (void) mderror(ep, MDE_ABR_SET, NULL); + return (-1); + } + + /* + * In a MN set, the master always executes the offline command first. + * Before the master executes the IOC_OFFLINE ioctl, + * the master sends a message to all nodes to suspend writes to + * this mirror. Then the master executes the IOC_OFFLINE ioctl + * which resumes writes to this mirror from the master node. + * As each slave executes the offline command, each slave will + * call the IOC_OFFLINE ioctl which will resume writes to this mirror + * from that slave node. + */ + if (! 
metaislocalset(sp)) { + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + if ((MD_MNSET_DESC(sd)) && sd->sd_mn_am_i_master) + if (meta_mn_send_suspend_writes( + meta_getminor(mirnp->dev), ep) != 0) + return (-1); + } + + /* offline submirror */ + (void) memset(&mio, 0, sizeof (mio)); + mio.mnum = meta_getminor(mirnp->dev); + MD_SETDRIVERNAME(&mio, MD_MIRROR, sp->setno); + mio.submirror = submirnp->dev; + mio.force_offline = force; + if (metaioctl(MD_IOCOFFLINE, &mio, &mio.mde, NULL) != 0) + return (mdstealerror(ep, &mio.mde)); + + /* clear cache */ + meta_invalidate_name(mirnp); + meta_invalidate_name(submirnp); + + /* let em know */ + if (options & MDCMD_PRINT) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: submirror %s is offlined\n"), + mirnp->cname, submirnp->cname); + (void) fflush(stdout); + } + + /* return success */ + return (0); +} + +/* + * attach submirror to mirror + * we actually never have to worry about crossing a thresh hold here. + * 2 cases 1) attach and the only way the mirror can be 64 bit is if + * one of the submirrors already is. 2) grow and the only way the mirror + * is 64 bit is if one of the submirror's already is. 
+ */ +int +meta_mirror_attach( + mdsetname_t *sp, + mdname_t *mirnp, + mdname_t *submirnp, + mdcmdopts_t options, + md_error_t *ep +) +{ + md_att_struct_t att; + md_set_desc *sd; + + /* should have same set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(mirnp->dev))); + + /* check name */ + if (metachkmeta(mirnp, ep) != 0) + return (-1); + + /* just grow */ + if (submirnp == NULL) { + return (meta_concat_generic(sp, mirnp, NULL, ep)); + } + + /* check submirror */ + if (meta_check_submirror(sp, submirnp, mirnp, 0, ep) != 0) + return (-1); + + /* In dryrun mode (DOIT not set) we must not alter the mddb */ + if (options & MDCMD_DOIT) { + /* store name in namespace */ + if (add_key_name(sp, submirnp, NULL, ep) != 0) + return (-1); + } + + /* + * In a MN set, the master always executes the attach command first. + * Before the master executes the IOC_ATTACH ioctl, in non-DRYRUN mode + * the master sends a message to all nodes to suspend writes to + * this mirror. Then the master executes the IOC_ATTACH ioctl + * which resumes writes to this mirror from the master node. + * As each slave executes the attach command, each slave will + * call the IOC_ATTACH ioctl which will resume writes to this mirror + * from that slave node. + */ + if (! 
metaislocalset(sp)) { + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + if ((MD_MNSET_DESC(sd)) && (options & MDCMD_DOIT) && + sd->sd_mn_am_i_master) + if (meta_mn_send_suspend_writes( + meta_getminor(mirnp->dev), ep) != 0) + return (-1); + } + + /* attach submirror */ + (void) memset(&att, 0, sizeof (att)); + att.mnum = meta_getminor(mirnp->dev); + MD_SETDRIVERNAME(&att, MD_MIRROR, sp->setno); + att.submirror = submirnp->dev; + att.key = submirnp->key; + /* if the comamnd was issued with -n option, use dryrun mode */ + if ((options & MDCMD_DOIT) == 0) { + att.options = MDIOCTL_DRYRUN; + } + if (metaioctl(MD_IOCATTACH, &att, &att.mde, NULL) != 0) { + /* In dryrun mode (DOIT not set) we must not alter the mddb */ + if (options & MDCMD_DOIT) { + (void) del_key_name(sp, submirnp, ep); + } + return (mdstealerror(ep, &att.mde)); + } + + /* In dryrun mode (DOIT not set) we must not alter the mddb */ + if (options & MDCMD_DOIT) { + /* clear cache */ + meta_invalidate_name(mirnp); + meta_invalidate_name(submirnp); + } + + /* let em know */ + if (options & MDCMD_PRINT) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: submirror %s %s\n"), mirnp->cname, submirnp->cname, + (options & MDCMD_DOIT) ? "is attached" : "would attach"); + (void) fflush(stdout); + } + + /* return success */ + return (0); +} + +/* + * detach submirror + */ +int +meta_mirror_detach( + mdsetname_t *sp, + mdname_t *mirnp, + mdname_t *submirnp, + mdcmdopts_t options, + md_error_t *ep +) +{ + int force = ((options & MDCMD_FORCE) ? 1 : 0); + md_detach_params_t detach; + md_set_desc *sd; + + /* should have same set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(mirnp->dev))); + + /* check name */ + if (metachkmeta(mirnp, ep) != 0) + return (-1); + + /* + * In a MN set, the master always executes the detach command first. + * Before the master executes the IOC_DETACH ioctl, + * the master sends a message to all nodes to suspend writes to + * this mirror. 
Then the master executes the IOC_DETACH ioctl + * which resumes writes to this mirror from the master node. + * As each slave executes the detach command, each slave will + * call the IOC_DETACH ioctl which will resume writes to this mirror + * from that slave node. + */ + if (! metaislocalset(sp)) { + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + if ((MD_MNSET_DESC(sd)) && sd->sd_mn_am_i_master) + if (meta_mn_send_suspend_writes( + meta_getminor(mirnp->dev), ep) != 0) + return (-1); + } + + /* detach submirror */ + (void) memset(&detach, 0, sizeof (detach)); + detach.mnum = meta_getminor(mirnp->dev); + MD_SETDRIVERNAME(&detach, MD_MIRROR, sp->setno); + detach.submirror = submirnp->dev; + detach.force_detach = force; + if (metaioctl(MD_IOCDETACH, &detach, &detach.mde, NULL) != 0) + return (mdstealerror(ep, &detach.mde)); + + /* clear cache */ + meta_invalidate_name(mirnp); + meta_invalidate_name(submirnp); + + /* let em know */ + if (options & MDCMD_PRINT) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: submirror %s is detached\n"), + mirnp->cname, submirnp->cname); + (void) fflush(stdout); + } + + /* return success */ + return (0); +} + +/* + * get mirror parameters + */ +int +meta_mirror_get_params( + mdsetname_t *sp, + mdname_t *mirnp, + mm_params_t *paramsp, + md_error_t *ep +) +{ + md_mirror_t *mirrorp; + + /* should have a set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(mirnp->dev))); + + /* check name */ + if (metachkmeta(mirnp, ep) != 0) + return (-1); + + /* get unit */ + if ((mirrorp = meta_get_mirror(sp, mirnp, ep)) == NULL) + return (-1); + + /* return parameters */ + (void) memset(paramsp, 0, sizeof (*paramsp)); + paramsp->read_option = mirrorp->read_option; + paramsp->write_option = mirrorp->write_option; + paramsp->pass_num = mirrorp->pass_num; + return (0); +} + +/* + * set mirror parameters + */ +int +meta_mirror_set_params( + mdsetname_t *sp, + mdname_t *mirnp, + mm_params_t *paramsp, + md_error_t *ep +) 
+{ + md_mirror_params_t mmp; + + /* should have a set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(mirnp->dev))); + + /* check name */ + if (metachkmeta(mirnp, ep) != 0) + return (-1); + + /* set parameters */ + (void) memset(&mmp, 0, sizeof (mmp)); + MD_SETDRIVERNAME(&mmp, MD_MIRROR, sp->setno); + mmp.mnum = meta_getminor(mirnp->dev); + mmp.params = *paramsp; + if (metaioctl(MD_IOCCHANGE, &mmp, &mmp.mde, mirnp->cname) != 0) + return (mdstealerror(ep, &mmp.mde)); + + /* clear cache */ + meta_invalidate_name(mirnp); + + /* return success */ + return (0); +} + +/* + * invalidate submirror names + */ +static int +invalidate_submirrors( + mdsetname_t *sp, + mdname_t *mirnp, + md_error_t *ep +) +{ + md_mirror_t *mirrorp; + uint_t smi; + + if ((mirrorp = meta_get_mirror(sp, mirnp, ep)) == NULL) + return (-1); + for (smi = 0; (smi < NMIRROR); ++smi) { + md_submirror_t *mdsp = &mirrorp->submirrors[smi]; + mdname_t *submirnp = mdsp->submirnamep; + + if (submirnp == NULL) { + assert(mdsp->state == SMS_UNUSED); + continue; + } + meta_invalidate_name(submirnp); + } + return (0); +} + +/* + * replace mirror component + */ +int +meta_mirror_replace( + mdsetname_t *sp, + mdname_t *mirnp, + mdname_t *oldnp, + mdname_t *newnp, + mdcmdopts_t options, + md_error_t *ep +) +{ + md_mirror_t *mirrorp; + uint_t smi; + replace_params_t params; + diskaddr_t size, label, start_blk; + md_dev64_t old_dev, new_dev; + diskaddr_t new_start_blk, new_end_blk; + int rebind; + md_set_desc *sd; + char *new_devidp = NULL; + int ret; + md_error_t xep = mdnullerror; + + /* should have same set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(mirnp->dev))); + + /* check name */ + if (metachkmeta(mirnp, ep) != 0) + return (-1); + + /* save new binding incase this is a rebind where oldnp==newnp */ + new_dev = newnp->dev; + new_start_blk = newnp->start_blk; + new_end_blk = newnp->end_blk; + + /* invalidate, then get the mirror (fill in oldnp from metadb) */ + 
meta_invalidate_name(mirnp); + if ((mirrorp = meta_get_mirror(sp, mirnp, ep)) == NULL) + return (-1); + for (smi = 0; (smi < NMIRROR); ++smi) { + md_submirror_t *mdsp = &mirrorp->submirrors[smi]; + mdname_t *submirnp = mdsp->submirnamep; + + if (submirnp == NULL) { + assert(mdsp->state == SMS_UNUSED); + continue; + } + + if (! metaismeta(submirnp)) + continue; + + meta_invalidate_name(submirnp); + if (meta_get_unit(sp, submirnp, ep) == NULL) + return (-1); + } + + /* the old device binding is now established */ + if ((old_dev = oldnp->dev) == NODEV64) + return (mdsyserror(ep, ENODEV, oldnp->cname)); + + /* + * check for the case where oldnp and newnp indicate the same + * device, but the dev_t of the device has changed between old + * and new. This is called a rebind. On entry the dev_t + * represents the new device binding determined from the + * filesystem (meta_getdev). After calling meta_get_unit + * oldnp (and maybe newnp if this is a rebind) is updated based + * to the old binding from the metadb (done by metakeyname). + */ + if ((strcmp(oldnp->rname, newnp->rname) == 0) && + (old_dev != new_dev)) { + rebind = 1; + } else { + rebind = 0; + } + if (rebind) { + newnp->dev = new_dev; + newnp->start_blk = new_start_blk; + newnp->end_blk = new_end_blk; + } + + /* + * Save a copy of the devid associated with the new disk, the reason + * is that if we are rebinding then the call to meta_check_component() + * will cause the devid of the disk to be overwritten with what is in + * the replica namespace. The function that actually overwrites the + * devid is dr2drivedesc(). + */ + if (newnp->drivenamep->devid != NULL) + new_devidp = Strdup(newnp->drivenamep->devid); + + /* if it's a multi-node diskset clear new_devidp */ + if (!metaislocalset(sp)) { + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + if (MD_MNSET_DESC(sd)) + new_devidp = NULL; + } + + /* check it out (dup on rebind is ok) */ + if (meta_check_component(sp, newnp, 0, ep) != 0) { + if ((! 
rebind) || (! mdisuseerror(ep, MDE_ALREADY))) { + Free(new_devidp); + return (-1); + } + mdclrerror(ep); + } + if ((size = metagetsize(newnp, ep)) == MD_DISKADDR_ERROR) { + Free(new_devidp); + return (-1); + } + if ((label = metagetlabel(newnp, ep)) == MD_DISKADDR_ERROR) { + Free(new_devidp); + return (-1); + } + if ((start_blk = metagetstart(sp, newnp, ep)) == MD_DISKADDR_ERROR) { + Free(new_devidp); + return (-1); + } + if (start_blk >= size) { + (void) mdsyserror(ep, ENOSPC, newnp->cname); + Free(new_devidp); + return (-1); + } + + /* + * Copy back the saved devid. + */ + Free(newnp->drivenamep->devid); + if (new_devidp != NULL) { + newnp->drivenamep->devid = Strdup(new_devidp); + Free(new_devidp); + } + + /* store name in namespace, allocate new key */ + if (add_key_name(sp, newnp, NULL, ep) != 0) + return (-1); + + /* + * In a MN set, the master always executes the replace command first. + * Before the master executes the IOC_REPLACE ioctl, in non-DRYRUN mode + * the master sends a message to all nodes to suspend writes to + * this mirror. Then the master executes the IOC_REPLACE ioctl + * which resumes writes to this mirror from the master node. + * As each slave executes the replace command, each slave will + * call the IOC_REPLACE ioctl which will resume writes to this mirror + * from that slave node. + */ + if (! metaislocalset(sp)) { + if ((MD_MNSET_DESC(sd)) && (options & MDCMD_DOIT) && + sd->sd_mn_am_i_master) + if (meta_mn_send_suspend_writes( + meta_getminor(mirnp->dev), ep) != 0) + return (-1); + } + + if (rebind && !metaislocalset(sp)) { + /* + * We are 'rebind'ing a disk that is in a diskset so as well + * as updating the diskset's namespace the local set needs + * to be updated because it also contains a reference to + * the disk in question. 
+ */ + ret = meta_fixdevid(sp, DEV_UPDATE|DEV_LOCAL_SET, + newnp->cname, ep); + + if (ret != METADEVADM_SUCCESS) { + (void) del_key_name(sp, newnp, &xep); + return (-1); + } + } + + /* replace component */ + (void) memset(¶ms, 0, sizeof (params)); + params.mnum = meta_getminor(mirnp->dev); + MD_SETDRIVERNAME(¶ms, MD_MIRROR, sp->setno); + params.cmd = REPLACE_COMP; + params.old_dev = old_dev; + params.new_dev = new_dev; + params.start_blk = start_blk; + params.has_label = ((label > 0) ? 1 : 0); + params.number_blks = size; + params.new_key = newnp->key; + /* Is this just a dryrun ? */ + if ((options & MDCMD_DOIT) == 0) { + params.options |= MDIOCTL_DRYRUN; + } + if (metaioctl(MD_IOCREPLACE, ¶ms, ¶ms.mde, NULL) != 0) { + (void) del_key_name(sp, newnp, ep); + return (mdstealerror(ep, ¶ms.mde)); + } + + /* clear cache */ + meta_invalidate_name(oldnp); + meta_invalidate_name(newnp); + if (invalidate_submirrors(sp, mirnp, ep) != 0) { + meta_invalidate_name(mirnp); + return (-1); + } + meta_invalidate_name(mirnp); + + /* let em know */ + if (options & MDCMD_PRINT) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: device %s is replaced with %s\n"), + mirnp->cname, oldnp->cname, newnp->cname); + (void) fflush(stdout); + } + + /* return success */ + return (0); +} + +/* + * enable mirror component + */ +int +meta_mirror_enable( + mdsetname_t *sp, + mdname_t *mirnp, + mdname_t *compnp, + mdcmdopts_t options, + md_error_t *ep +) +{ + md_mirror_t *mirrorp; + uint_t smi; + replace_params_t params; + diskaddr_t size, label, start_blk; + md_dev64_t fs_dev; + md_set_desc *sd; + int ret; + + /* should have same set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(mirnp->dev))); + + /* check name */ + if (metachkmeta(mirnp, ep) != 0) + return (-1); + + /* get the file_system dev binding */ + if (meta_getdev(sp, compnp, ep) != 0) + return (-1); + fs_dev = compnp->dev; + + /* get the mirror unit (fill in compnp->dev with metadb version) */ + 
meta_invalidate_name(mirnp); + if ((mirrorp = meta_get_mirror(sp, mirnp, ep)) == NULL) + return (-1); + + for (smi = 0; (smi < NMIRROR); ++smi) { + md_submirror_t *mdsp = &mirrorp->submirrors[smi]; + mdname_t *submirnp = mdsp->submirnamep; + + if (submirnp == NULL) { + assert(mdsp->state == SMS_UNUSED); + continue; + } + + if (! metaismeta(submirnp)) + continue; + + meta_invalidate_name(submirnp); + if (meta_get_unit(sp, submirnp, ep) == NULL) + return (-1); + } + + /* the metadb device binding is now established */ + if (compnp->dev == NODEV64) + return (mdsyserror(ep, ENODEV, compnp->cname)); + + /* + * check for the case where the dev_t has changed between the + * filesystem and the metadb. This is called a rebind, and + * is handled by meta_mirror_replace. + */ + if (fs_dev != compnp->dev) { + /* establish file system binding with invalid start/end */ + compnp->dev = fs_dev; + compnp->start_blk = -1; + compnp->end_blk = -1; + return (meta_mirror_replace(sp, mirnp, + compnp, compnp, options, ep)); + } + + /* setup mirror info */ + (void) memset(¶ms, 0, sizeof (params)); + params.mnum = meta_getminor(mirnp->dev); + MD_SETDRIVERNAME(¶ms, MD_MIRROR, sp->setno); + params.cmd = ENABLE_COMP; + + /* check it out */ + if (meta_check_component(sp, compnp, 0, ep) != 0) { + if (! mdisuseerror(ep, MDE_ALREADY)) + return (-1); + mdclrerror(ep); + } + + if ((size = metagetsize(compnp, ep)) == MD_DISKADDR_ERROR) + return (-1); + if ((label = metagetlabel(compnp, ep)) == MD_DISKADDR_ERROR) + return (-1); + if ((start_blk = metagetstart(sp, compnp, ep)) == MD_DISKADDR_ERROR) + return (-1); + if (start_blk >= size) { + (void) mdsyserror(ep, ENOSPC, compnp->cname); + return (-1); + } + + /* + * In a MN set, the master always executes the replace command first. + * Before the master executes the IOC_REPLACE ioctl, in non-DRYRUN mode + * the master sends a message to all nodes to suspend writes to + * this mirror. 
Then the master executes the IOC_REPLACE ioctl + * which resumes writes to this mirror from the master node. + * As each slave executes the replace command, each slave will + * call the IOC_REPLACE ioctl which will resume writes to this mirror + * from that slave node. + */ + if (! metaislocalset(sp)) { + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + if ((MD_MNSET_DESC(sd)) && (options & MDCMD_DOIT) && + sd->sd_mn_am_i_master) + if (meta_mn_send_suspend_writes( + meta_getminor(mirnp->dev), ep) != 0) + return (-1); + } + + /* enable component */ + params.old_dev = compnp->dev; + params.new_dev = compnp->dev; + params.start_blk = start_blk; + params.has_label = ((label > 0) ? 1 : 0); + params.number_blks = size; + + /* Is this just a dryrun ? */ + if ((options & MDCMD_DOIT) == 0) { + params.options |= MDIOCTL_DRYRUN; + } + if (metaioctl(MD_IOCREPLACE, ¶ms, ¶ms.mde, NULL) != 0) + return (mdstealerror(ep, ¶ms.mde)); + + /* + * Are we dealing with a non-local set? If so need to update the + * local namespace so that the disk record has the correct devid. + */ + if (!metaislocalset(sp)) { + ret = meta_fixdevid(sp, DEV_UPDATE|DEV_LOCAL_SET, compnp->cname, + ep); + + if (ret != METADEVADM_SUCCESS) { + /* + * Failed to update the local set. Nothing to do here + * apart from report the error. The namespace is + * most likely broken and some form of remedial + * recovery is going to be required. 
+ */ + mde_perror(ep, ""); + mdclrerror(ep); + } + } + + /* clear cache */ + meta_invalidate_name(compnp); + if (invalidate_submirrors(sp, mirnp, ep) != 0) { + meta_invalidate_name(mirnp); + return (-1); + } + meta_invalidate_name(mirnp); + + /* let em know */ + if (options & MDCMD_PRINT) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: device %s is enabled\n"), + mirnp->cname, compnp->cname); + (void) fflush(stdout); + } + + /* return success */ + return (0); +} + +/* + * check for dups in the mirror itself + */ +static int +check_twice( + md_mirror_t *mirrorp, + uint_t smi, + md_error_t *ep +) +{ + mdname_t *mirnp = mirrorp->common.namep; + mdname_t *thisnp; + uint_t s; + + thisnp = mirrorp->submirrors[smi].submirnamep; + for (s = 0; (s < smi); ++s) { + md_submirror_t *mdsp = &mirrorp->submirrors[s]; + mdname_t *submirnp = mdsp->submirnamep; + + if (submirnp == NULL) + continue; + + if (meta_check_overlap(mirnp->cname, thisnp, 0, -1, + submirnp, 0, -1, ep) != 0) { + return (-1); + } + } + return (0); +} + +/* + * check mirror + */ +int +meta_check_mirror( + mdsetname_t *sp, + md_mirror_t *mirrorp, + mdcmdopts_t options, + md_error_t *ep +) +{ + mdname_t *mirnp = mirrorp->common.namep; + int force = ((options & MDCMD_FORCE) ? 1 : 0); + int doit = ((options & MDCMD_DOIT) ? 
1 : 0); + uint_t nsm = 0; + uint_t smi; + + /* check submirrors */ + for (smi = 0; (smi < NMIRROR); ++smi) { + md_submirror_t *mdsp = &mirrorp->submirrors[smi]; + mdname_t *submirnp = mdsp->submirnamep; + + if (submirnp == NULL) + continue; + ++nsm; + } + if (nsm < 1) { + return (mdmderror(ep, MDE_BAD_MIRROR, + meta_getminor(mirnp->dev), mirnp->cname)); + } + for (smi = 0; (smi < NMIRROR); ++smi) { + md_submirror_t *mdsp = &mirrorp->submirrors[smi]; + mdname_t *submirnp = mdsp->submirnamep; + diskaddr_t size; + + /* skip unused submirrors */ + if (submirnp == NULL) { + if (mdsp->state != SMS_UNUSED) { + return (mdmderror(ep, MDE_BAD_MIRROR, + meta_getminor(mirnp->dev), mirnp->cname)); + } + continue; + } + + /* check submirror */ + if (doit) { + if (meta_check_submirror(sp, submirnp, NULL, force, + ep) != 0) + return (-1); + if ((size = metagetsize(submirnp, ep)) == + MD_DISKADDR_ERROR) { + return (-1); + } else if (size == 0) { + return (mdsyserror(ep, ENOSPC, + submirnp->cname)); + } + } + + /* check this mirror too */ + if (check_twice(mirrorp, smi, ep) != 0) + return (-1); + } + + /* check read option */ + switch (mirrorp->read_option) { + case RD_LOAD_BAL: + case RD_GEOMETRY: + case RD_FIRST: + break; + default: + return (mderror(ep, MDE_BAD_RD_OPT, mirnp->cname)); + } + + /* check write option */ + switch (mirrorp->write_option) { + case WR_PARALLEL: + case WR_SERIAL: + break; + default: + return (mderror(ep, MDE_BAD_WR_OPT, mirnp->cname)); + } + + /* check pass number */ + if ((mirrorp->pass_num < 0) || (mirrorp->pass_num > MD_PASS_MAX)) + return (mderror(ep, MDE_BAD_PASS_NUM, mirnp->cname)); + + /* return success */ + return (0); +} + +/* + * setup mirror geometry + */ +static int +mirror_geom( + md_mirror_t *mirrorp, + mm_unit_t *mm, + md_error_t *ep +) +{ + uint_t write_reinstruct = 0; + uint_t read_reinstruct = 0; + uint_t round_cyl = 1; + mdname_t *smnp = NULL; + uint_t smi; + mdgeom_t *geomp; + + /* get worst reinstructs */ + for (smi = 0; (smi < 
NMIRROR); ++smi) { + md_submirror_t *mdsp = &mirrorp->submirrors[smi]; + mdname_t *submirnp = mdsp->submirnamep; + + if (submirnp == NULL) + continue; + + if ((geomp = metagetgeom(submirnp, ep)) == NULL) + return (-1); + if (geomp->write_reinstruct > write_reinstruct) + write_reinstruct = geomp->write_reinstruct; + if (geomp->read_reinstruct > read_reinstruct) + read_reinstruct = geomp->read_reinstruct; + + if (smnp == NULL) + smnp = submirnp; + } + + /* setup geometry from first submirror */ + assert(smnp != NULL); + if ((geomp = metagetgeom(smnp, ep)) == NULL) + return (-1); + if (meta_setup_geom((md_unit_t *)mm, mirrorp->common.namep, geomp, + write_reinstruct, read_reinstruct, round_cyl, ep) != 0) + return (-1); + + /* return success */ + return (0); +} + +/* + * create mirror + */ +int +meta_create_mirror( + mdsetname_t *sp, + md_mirror_t *mirrorp, + mdcmdopts_t options, + md_error_t *ep +) +{ + mdname_t *mirnp = mirrorp->common.namep; + mm_unit_t *mm; + diskaddr_t submir_size = MD_DISKADDR_ERROR; + ushort_t nsm = 0; + uint_t smi; + mdnamelist_t *keynlp = NULL; + md_set_params_t set_params; + int rval = -1; + md_timeval32_t creation_time; + int create_flag = MD_CRO_32BIT; + + /* validate mirror */ + if (meta_check_mirror(sp, mirrorp, options, ep) != 0) + return (-1); + + + /* allocate mirror unit */ + mm = Zalloc(sizeof (*mm)); + + if (meta_gettimeofday(&creation_time) == -1) + return (mdsyserror(ep, errno, NULL)); + + /* do submirrors */ + for (smi = 0; (smi < NMIRROR); ++smi) { + md_submirror_t *mdsp = &mirrorp->submirrors[smi]; + mdname_t *submirnp = mdsp->submirnamep; + mm_submirror_t *mmsp = &mm->un_sm[smi]; + diskaddr_t size; + + /* skip unused submirrors */ + if (submirnp == NULL) { + assert(mdsp->state == SMS_UNUSED); + continue; + } + ++nsm; + + /* get size */ + if ((size = metagetsize(submirnp, ep)) == MD_DISKADDR_ERROR) + goto out; + assert(size > 0); + + /* adjust for smallest submirror */ + if (submir_size == MD_DISKADDR_ERROR) { + submir_size = 
size; + } else if (size < submir_size) { + submir_size = size; + } + + if (options & MDCMD_DOIT) { + /* store name in namespace */ + if (add_key_name(sp, submirnp, &keynlp, ep) != 0) + goto out; + } + + /* setup submirror */ + mmsp->sm_key = submirnp->key; + mmsp->sm_dev = submirnp->dev; + mmsp->sm_state = SMS_RUNNING; + mmsp->sm_timestamp = creation_time; + } + + /* setup unit */ + mm->c.un_type = MD_METAMIRROR; + MD_SID(mm) = meta_getminor(mirnp->dev); + mm->c.un_actual_tb = submir_size; + mm->c.un_size = offsetof(mm_unit_t, un_smic); + mm->un_nsm = nsm; + mm->un_read_option = mirrorp->read_option; + mm->un_write_option = mirrorp->write_option; + mm->un_pass_num = mirrorp->pass_num; + if (mirror_geom(mirrorp, mm, ep) != 0) + goto out; + + /* fill in the size of the mirror */ + if (options & MDCMD_UPDATE) { + mirrorp->common.size = mm->c.un_total_blocks; + } + + /* if we're not doing anything, return success */ + if (! (options & MDCMD_DOIT)) { + rval = 0; /* success */ + goto out; + } + + /* create mirror */ + (void) memset(&set_params, 0, sizeof (set_params)); + /* did the user tell us to generate a large device? 
*/ + create_flag = meta_check_devicesize(mm->c.un_total_blocks); + if (create_flag == MD_CRO_64BIT) { + mm->c.un_revision = MD_64BIT_META_DEV; + set_params.options = MD_CRO_64BIT; + } else { + mm->c.un_revision = MD_32BIT_META_DEV; + set_params.options = MD_CRO_32BIT; + } + set_params.mnum = MD_SID(mm); + set_params.size = mm->c.un_size; + set_params.mdp = (uintptr_t)mm; + MD_SETDRIVERNAME(&set_params, MD_MIRROR, MD_MIN2SET(set_params.mnum)); + if (metaioctl(MD_IOCSET, &set_params, &set_params.mde, + mirnp->cname) != 0) { + (void) mdstealerror(ep, &set_params.mde); + goto out; + } + rval = 0; /* success */ + + /* cleanup, return success */ +out: + Free(mm); + if (rval != 0) { + (void) del_key_names(sp, keynlp, NULL); + } + metafreenamelist(keynlp); + if ((rval == 0) && (options & MDCMD_DOIT)) { + if (invalidate_submirrors(sp, mirnp, ep) != 0) + rval = -1; + meta_invalidate_name(mirnp); + } + return (rval); +} + +/* + * initialize mirror + * NOTE: this functions is metainit(1m)'s command line parser! + */ +int +meta_init_mirror( + mdsetname_t **spp, + int argc, + char *argv[], + mdcmdopts_t options, + md_error_t *ep +) +{ + char *uname = argv[0]; + mdname_t *mirnp = NULL; + int old_optind; + int c; + md_mirror_t *mirrorp = NULL; + uint_t smi; + int rval = -1; + + /* get mirror name */ + assert(argc > 0); + if (argc < 1) + goto syntax; + if ((mirnp = metaname(spp, uname, ep)) == NULL) + goto out; + assert(*spp != NULL); + uname = mirnp->cname; + if (metachkmeta(mirnp, ep) != 0) + goto out; + + if (!(options & MDCMD_NOLOCK)) { + /* grab set lock */ + if (meta_lock(*spp, TRUE, ep) != 0) + goto out; + + if (meta_check_ownership(*spp, ep) != 0) + goto out; + } + + /* see if it exists already */ + if (metagetmiscname(mirnp, ep) != NULL) { + (void) mdmderror(ep, MDE_UNIT_ALREADY_SETUP, + meta_getminor(mirnp->dev), uname); + goto out; + } else if (! 
mdismderror(ep, MDE_UNIT_NOT_SETUP)) { + goto out; + } else { + mdclrerror(ep); + } + --argc, ++argv; + + /* grab -m */ + if ((argc < 1) || (strcmp(argv[0], "-m") != 0)) + goto syntax; + --argc, ++argv; + + if (argc == 0) + goto syntax; + + /* parse general options */ + optind = 0; + opterr = 0; + if (getopt(argc, argv, "") != -1) + goto options; + + /* allocate mirror */ + mirrorp = Zalloc(sizeof (*mirrorp)); + + /* setup common */ + mirrorp->common.namep = mirnp; + mirrorp->common.type = MD_METAMIRROR; + + /* parse submirrors */ + for (smi = 0; ((argc > 0) && (argv[0][0] != '-') && + (! isdigit(argv[0][0]))); ++smi) { + md_submirror_t *mdsm = &mirrorp->submirrors[smi]; + mdname_t *submirnamep; + + /* check for room */ + if (smi >= NMIRROR) { + (void) mdmderror(ep, MDE_MIRROR_FULL, + meta_getminor(mirnp->dev), uname); + goto out; + } + + /* parse submirror name */ + if ((submirnamep = metaname(spp, argv[0], ep)) == NULL) + goto out; + mdsm->submirnamep = submirnamep; + --argc, ++argv; + } + if (smi == 0) { + (void) mdmderror(ep, MDE_NSUBMIRS, meta_getminor(mirnp->dev), + uname); + goto out; + } + + /* dangerous n-way mirror creation */ + if ((smi > 1) && (options & MDCMD_PRINT)) { + md_eprintf(dgettext(TEXT_DOMAIN, +"%s: WARNING: This form of metainit is not recommended.\n" +"The submirrors may not have the same data.\n" +"Please see ERRORS in metainit(1M) for additional information.\n"), + uname); + } + + /* parse mirror options */ + mirrorp->read_option = RD_LOAD_BAL; + mirrorp->write_option = WR_PARALLEL; + mirrorp->pass_num = MD_PASS_DEFAULT; + old_optind = optind = 0; + opterr = 0; + while ((c = getopt(argc, argv, "grS")) != -1) { + switch (c) { + case 'g': + if (mirrorp->read_option != RD_LOAD_BAL) { + (void) mderror(ep, MDE_BAD_RD_OPT, uname); + goto out; + } + mirrorp->read_option = RD_GEOMETRY; + break; + + case 'r': + if (mirrorp->read_option != RD_LOAD_BAL) { + (void) mderror(ep, MDE_BAD_RD_OPT, uname); + goto out; + } + mirrorp->read_option = RD_FIRST; 
+ break; + + case 'S': + if (mirrorp->write_option != WR_PARALLEL) { + (void) mderror(ep, MDE_BAD_WR_OPT, uname); + goto out; + } + mirrorp->write_option = WR_SERIAL; + break; + + default: + argc -= old_optind; + argv += old_optind; + goto options; + } + old_optind = optind; + } + argc -= optind; + argv += optind; + + /* parse pass number */ + if ((argc > 0) && (isdigit(argv[0][0]))) { + if (name_to_pass_num(uname, argv[0], + &mirrorp->pass_num, ep) != 0) { + goto out; + } + --argc, ++argv; + } + + /* we should be at the end */ + if (argc != 0) + goto syntax; + + /* create mirror */ + if (meta_create_mirror(*spp, mirrorp, options, ep) != 0) + goto out; + rval = 0; /* success */ + + /* let em know */ + if (options & MDCMD_PRINT) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: Mirror is setup\n"), + uname); + (void) fflush(stdout); + } + goto out; + + /* syntax error */ +syntax: + rval = meta_cook_syntax(ep, MDE_SYNTAX, uname, argc, argv); + goto out; + + /* options error */ +options: + rval = meta_cook_syntax(ep, MDE_OPTION, uname, argc, argv); + goto out; + + /* cleanup, return error */ +out: + if (mirrorp != NULL) + meta_free_mirror(mirrorp); + return (rval); +} + +/* + * reset mirrors + */ +int +meta_mirror_reset( + mdsetname_t *sp, + mdname_t *mirnp, + mdcmdopts_t options, + md_error_t *ep +) +{ + md_mirror_t *mirrorp; + uint_t smi; + int rval = -1; + + /* should have same set */ + assert(sp != NULL); + assert((mirnp == NULL) || + (sp->setno == MD_MIN2SET(meta_getminor(mirnp->dev)))); + + /* reset all mirrors */ + if (mirnp == NULL) { + mdnamelist_t *mirrornlp = NULL; + mdnamelist_t *p; + + /* for each mirror */ + rval = 0; + if (meta_get_mirror_names(sp, &mirrornlp, 0, ep) < 0) + return (-1); + for (p = mirrornlp; (p != NULL); p = p->next) { + /* reset mirror */ + mirnp = p->namep; + /* + * If this is a multi-node set, we send a series + * of individual metaclear commands. 
+ */ + if (meta_is_mn_set(sp, ep)) { + if (meta_mn_send_metaclear_command(sp, + mirnp->cname, options, 0, ep) != 0) { + rval = -1; + break; + } + } else { + if (meta_mirror_reset(sp, mirnp, options, + ep) != 0) { + rval = -1; + break; + } + } + } + + /* cleanup return success */ + metafreenamelist(mirrornlp); + return (rval); + } + + /* check name */ + if (metachkmeta(mirnp, ep) != 0) + return (-1); + + /* get unit structure */ + if ((mirrorp = meta_get_mirror(sp, mirnp, ep)) == NULL) + return (-1); + + /* make sure nobody owns us */ + if (MD_HAS_PARENT(mirrorp->common.parent)) { + return (mdmderror(ep, MDE_IN_USE, meta_getminor(mirnp->dev), + mirnp->cname)); + } + + /* clear subdevices cache */ + if (invalidate_submirrors(sp, mirnp, ep) != 0) + return (-1); + + /* clear metadevice */ + if (meta_reset(sp, mirnp, options, ep) != 0) + goto out; + rval = 0; /* success */ + + /* let em know */ + if (options & MDCMD_PRINT) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: Mirror is cleared\n"), mirnp->cname); + (void) fflush(stdout); + } + + /* clear subdevices */ + if (! (options & MDCMD_RECURSE)) + goto out; + for (smi = 0; (smi < NMIRROR); ++smi) { + md_submirror_t *mdsp = &mirrorp->submirrors[smi]; + mdname_t *submirnp = mdsp->submirnamep; + + /* skip unused submirrors */ + if (submirnp == NULL) { + assert(mdsp->state == SMS_UNUSED); + continue; + } + + /* make sure we have a metadevice */ + if (! 
metaismeta(submirnp)) + continue; + + /* clear submirror */ + if (meta_reset_by_name(sp, submirnp, options, ep) != 0) + rval = -1; + } + + /* cleanup, return success */ +out: + meta_invalidate_name(mirnp); + return (rval); +} + +/* + * reports TRUE if any mirror component is in error + */ +int +meta_mirror_anycomp_is_err(mdsetname_t *sp, mdnamelist_t *mirror_names) +{ + mdnamelist_t *nlp; + md_error_t status = mdnullerror; + md_error_t *ep = &status; + int any_errs = FALSE; + + for (nlp = mirror_names; nlp; nlp = nlp->next) { + md_mirror_t *mirrorp; + int smi; + + if ((mirrorp = meta_get_mirror(sp, nlp->namep, ep)) == NULL) { + any_errs |= TRUE; + goto out; + } + + for (smi = 0; smi < NMIRROR; ++smi) { + md_submirror_t *mdsp = &mirrorp->submirrors[smi]; + + if (mdsp->state & + (SMS_COMP_ERRED|SMS_ATTACHED|SMS_OFFLINE)) { + any_errs |= TRUE; + goto out; + } + } + } +out: + if (!mdisok(ep)) + mdclrerror(ep); + + return (any_errs); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_mirror_resync.c b/usr/src/lib/lvm/libmeta/common/meta_mirror_resync.c new file mode 100644 index 0000000000..f833ce5c3e --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_mirror_resync.c @@ -0,0 +1,658 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * mirror operations + */ + +#include <meta.h> +#include <sys/lvm/md_mirror.h> +#include <thread.h> + +extern int md_in_daemon; +extern md_mn_client_list_t *mdmn_clients; + +/* + * chain of mirrors + */ +typedef struct mm_unit_list { + struct mm_unit_list *next; /* next in chain */ + mdname_t *namep; /* mirror name */ + mm_pass_num_t pass; /* pass number */ + uint_t done; /* resync done */ +} mm_unit_list_t; + +/* + * resync mirror + * meta_lock for this set should be held on entry. + */ +int +meta_mirror_resync( + mdsetname_t *sp, + mdname_t *mirnp, + daddr_t size, + md_error_t *ep, + md_resync_cmd_t cmd /* Start/Block/Unblock/Kill */ +) +{ + char *miscname; + md_resync_ioctl_t ri; + + /* should have a set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(mirnp->dev))); + + /* make sure we have a mirror */ + if ((miscname = metagetmiscname(mirnp, ep)) == NULL) + return (-1); + if (strcmp(miscname, MD_MIRROR) != 0) { + return (mdmderror(ep, MDE_NOT_MM, meta_getminor(mirnp->dev), + mirnp->cname)); + } + + /* start resync */ + (void) memset(&ri, 0, sizeof (ri)); + MD_SETDRIVERNAME(&ri, MD_MIRROR, sp->setno); + ri.ri_mnum = meta_getminor(mirnp->dev); + ri.ri_copysize = size; + switch (cmd) { + case MD_RESYNC_FORCE_MNSTART: + ri.ri_flags |= MD_RI_RESYNC_FORCE_MNSTART; + break; + case MD_RESYNC_START: + ri.ri_flags = 0; + break; + case MD_RESYNC_BLOCK: + ri.ri_flags = MD_RI_BLOCK; + break; + case MD_RESYNC_UNBLOCK: + ri.ri_flags = MD_RI_UNBLOCK; + break; + case MD_RESYNC_KILL: + ri.ri_flags = MD_RI_KILL; + break; + case MD_RESYNC_KILL_NO_WAIT: + ri.ri_flags = 
MD_RI_KILL | MD_RI_NO_WAIT; + break; + default: + /* TODO: Add new error MDE_BAD_RESYNC_FLAGS */ + return (mderror(ep, MDE_BAD_RESYNC_OPT, mirnp->cname)); + } + + if (metaioctl(MD_IOCSETSYNC, &ri, &ri.mde, mirnp->cname) != 0) + return (mdstealerror(ep, &ri.mde)); + + /* return success */ + return (0); +} + +/* + * free units + */ +static void +free_units( + mm_unit_list_t *mirrors[MD_PASS_MAX + 1] +) +{ + uint_t i; + + for (i = 0; (i < (MD_PASS_MAX + 1)); ++i) { + mm_unit_list_t *p, *n; + + for (p = mirrors[i], n = NULL; (p != NULL); p = n) { + n = p->next; + Free(p); + } + mirrors[i] = NULL; + } +} + +/* + * setup_units: build lists of units for each pass + */ +static int +setup_units( + mdsetname_t *sp, + mm_unit_list_t *mirrors[MD_PASS_MAX + 1], + md_error_t *ep +) +{ + mdnamelist_t *mirrornlp = NULL; + mdnamelist_t *p; + int rval = 0; + + /* should have a set */ + assert(sp != NULL); + + /* for each mirror */ + if (meta_get_mirror_names(sp, &mirrornlp, 0, ep) < 0) + return (-1); + for (p = mirrornlp; (p != NULL); p = p->next) { + md_mirror_t *mirrorp; + mm_unit_list_t *lp; + + /* get unit structure */ + if ((mirrorp = meta_get_mirror(sp, p->namep, ep)) == NULL) { + rval = -1; /* record, but ignore errors */ + continue; + } + + /* save info */ + lp = Zalloc(sizeof (*lp)); + lp->namep = p->namep; + lp->pass = mirrorp->pass_num; + if ((lp->pass < 0) || (lp->pass > MD_PASS_MAX)) + lp->pass = MD_PASS_MAX; + + /* put on list */ + lp->next = mirrors[lp->pass]; + mirrors[lp->pass] = lp; + } + + /* cleanup, return error */ + metafreenamelist(mirrornlp); + return (rval); +} + +/* + * resync all mirrors (in background) + */ +int +meta_mirror_resync_all( + mdsetname_t *sp, + daddr_t size, + md_error_t *ep +) +{ + mm_unit_list_t *mirrors[MD_PASS_MAX + 1]; + mm_pass_num_t pass, max_pass; + int rval = 0, fval; + + /* should have a set */ + assert(sp != NULL); + + /* get mirrors */ + (void) memset(mirrors, 0, sizeof (mirrors)); + if (setup_units(sp, mirrors, ep) != 0) + rval = 
-1; + + /* fork a process */ + if ((fval = md_daemonize(sp, ep)) != 0) { + /* + * md_daemonize will fork off a process. The is the + * parent or error. + */ + if (fval > 0) { + free_units(mirrors); + return (0); + } + mdclrerror(ep); + } + /* + * Closing stdin/out/err here. + * In case this was called thru rsh, the calling process on the other + * side will know, it doesn't have to wait until all the resyncs have + * finished. + * Also initialise the rpc client pool so that this process will use + * a unique pool of clients. If we don't do this, all of the forked + * clients will end up using the same pool of clients which can result + * in hung clients. + */ + if (meta_is_mn_set(sp, ep)) { + (void) close(0); + (void) close(1); + (void) close(2); + mdmn_clients = NULL; + } + assert((fval == 0) || (fval == -1)); + + /* + * Determine which pass level is the highest that contains mirrors to + * resync. We only need to wait for completion of earlier levels below + * this high watermark. If all mirrors are at the same pass level + * there is no requirement to wait for completion. 
+ */ + + max_pass = 1; + for (pass = MD_PASS_MAX; pass > 1; --pass) { + if (mirrors[pass] != NULL) { + max_pass = pass; + break; + } + } + + /* + * max_pass now contains the highest pass-level with resyncable mirrors + */ + + /* do passes */ + for (pass = 1; (pass <= MD_PASS_MAX); ++pass) { + int dispatched = 0; + unsigned howlong = 1; + mm_unit_list_t *lp; + + /* skip empty passes */ + if (mirrors[pass] == NULL) + continue; + + /* dispatch all resyncs in pass */ + for (lp = mirrors[pass]; (lp != NULL); lp = lp->next) { + if (meta_is_mn_set(sp, ep)) { + if (meta_mn_send_setsync(sp, lp->namep, + size, ep) != 0) { + rval = -1; + lp->done = 1; + } else { + ++dispatched; + } + } else { + if (meta_mirror_resync(sp, lp->namep, size, ep, + MD_RESYNC_START) != 0) { + rval = -1; + lp->done = 1; + } else { + ++dispatched; + } + } + } + + /* + * Wait for them to finish iff we are at a level lower than + * max_pass. This orders the resyncs into distinct levels. + * I.e. level 2 resyncs won't start until all level 1 ones + * have completed. + */ + if (pass == max_pass) + continue; + + howlong = 1; + while (dispatched > 0) { + + /* wait a while */ + (void) sleep(howlong); + + /* see if any finished */ + for (lp = mirrors[pass]; lp != NULL; lp = lp->next) { + md_resync_ioctl_t ri; + + if (lp->done) + continue; + + (void) memset(&ri, '\0', sizeof (ri)); + ri.ri_mnum = meta_getminor(lp->namep->dev); + MD_SETDRIVERNAME(&ri, MD_MIRROR, sp->setno); + if (metaioctl(MD_IOCGETSYNC, &ri, &ri.mde, + lp->namep->cname) != 0) { + (void) mdstealerror(ep, &ri.mde); + rval = -1; + lp->done = 1; + --dispatched; + } else if (! 
(ri.ri_flags & MD_RI_INPROGRESS)) { + lp->done = 1; + --dispatched; + } + } + + /* wait a little longer next time */ + if (howlong < 10) + ++howlong; + } + } + + /* cleanup, return success */ + free_units(mirrors); + if (fval == 0) /* we are the child process so exit */ + exit(0); + return (rval); +} + +/* + * meta_mn_mirror_resync_all: + * ------------------------- + * Resync all mirrors associated with given set (arg). Called when master + * node is adding a node to a diskset. Only want to initiate the resync on + * the current node. + */ +void * +meta_mn_mirror_resync_all(void *arg) +{ + set_t setno = *((set_t *)arg); + mdsetname_t *sp; + mm_unit_list_t *mirrors[MD_PASS_MAX + 1]; + mm_pass_num_t pass, max_pass; + md_error_t mde = mdnullerror; + int fval; + + + /* should have a set */ + assert(setno != NULL); + + if ((sp = metasetnosetname(setno, &mde)) == NULL) { + mde_perror(&mde, ""); + return (NULL); + } + + if (!(meta_is_mn_set(sp, &mde))) { + mde_perror(&mde, ""); + return (NULL); + } + + /* fork a process */ + if ((fval = md_daemonize(sp, &mde)) != 0) { + /* + * md_daemonize will fork off a process. The is the + * parent or error. + */ + if (fval > 0) { + return (NULL); + } + mde_perror(&mde, ""); + return (NULL); + } + /* + * Child process should never return back to rpc.metad, but + * should exit. + * Flush all internally cached data inherited from parent process + * since cached data will be cleared when parent process RPC request + * has completed (which is possibly before this child process + * can complete). + * Child process can retrieve and cache its own copy of data from + * rpc.metad that won't be changed by the parent process. + * + * Reset md_in_daemon since this child will be a client of rpc.metad + * not part of the rpc.metad daemon itself. + * md_in_daemon is used by rpc.metad so that libmeta can tell if + * this thread is rpc.metad or any other thread. 
(If this thread + * was rpc.metad it could use some short circuit code to get data + * directly from rpc.metad instead of doing an RPC call to rpc.metad). + */ + md_in_daemon = 0; + metaflushsetname(sp); + sr_cache_flush_setno(setno); + if ((sp = metasetnosetname(setno, &mde)) == NULL) { + mde_perror(&mde, ""); + md_exit(sp, 1); + } + + if (meta_lock(sp, TRUE, &mde) != 0) { + mde_perror(&mde, ""); + md_exit(sp, 1); + } + + /* + * Closing stdin/out/err here. + */ + (void) close(0); + (void) close(1); + (void) close(2); + assert(fval == 0); + + /* get mirrors */ + (void) memset(mirrors, 0, sizeof (mirrors)); + if (setup_units(sp, mirrors, &mde) != 0) { + (void) meta_unlock(sp, &mde); + md_exit(sp, 1); + } + + /* + * Determine which pass level is the highest that contains mirrors to + * resync. We only need to wait for completion of earlier levels below + * this high watermark. If all mirrors are at the same pass level + * there is no requirement to wait for completion. + */ + max_pass = 1; + for (pass = MD_PASS_MAX; pass > 1; --pass) { + if (mirrors[pass] != NULL) { + max_pass = pass; + break; + } + } + + /* + * max_pass now contains the highest pass-level with resyncable mirrors + */ + /* do passes */ + for (pass = 1; (pass <= MD_PASS_MAX); ++pass) { + int dispatched = 0; + unsigned howlong = 1; + mm_unit_list_t *lp; + + /* skip empty passes */ + if (mirrors[pass] == NULL) + continue; + + /* dispatch all resyncs in pass */ + for (lp = mirrors[pass]; (lp != NULL); lp = lp->next) { + if (meta_mirror_resync(sp, lp->namep, 0, &mde, + MD_RESYNC_FORCE_MNSTART) != 0) { + mdclrerror(&mde); + lp->done = 1; + } else { + ++dispatched; + } + } + + /* + * Wait for them to finish iff we are at a level lower than + * max_pass. This orders the resyncs into distinct levels. + * I.e. level 2 resyncs won't start until all level 1 ones + * have completed. 
+ */ + if (pass == max_pass) + continue; + + howlong = 1; + while (dispatched > 0) { + + /* wait a while */ + (void) sleep(howlong); + + /* see if any finished */ + for (lp = mirrors[pass]; lp != NULL; lp = lp->next) { + md_resync_ioctl_t ri; + + if (lp->done) + continue; + + (void) memset(&ri, '\0', sizeof (ri)); + ri.ri_mnum = meta_getminor(lp->namep->dev); + MD_SETDRIVERNAME(&ri, MD_MIRROR, sp->setno); + if (metaioctl(MD_IOCGETSYNC, &ri, &ri.mde, + lp->namep->cname) != 0) { + mdclrerror(&mde); + lp->done = 1; + --dispatched; + } else if (! (ri.ri_flags & MD_RI_INPROGRESS)) { + lp->done = 1; + --dispatched; + } + } + + /* wait a little longer next time */ + if (howlong < 10) + ++howlong; + } + } + + /* cleanup, return success */ + free_units(mirrors); + (void) meta_unlock(sp, &mde); + md_exit(sp, 0); + /*NOTREACHED*/ +} + +/* + * meta_mirror_resync_process: + * -------------------------- + * Modify any resync that is in progress on this node for the given set. + * + * Input Parameters: + * sp setname to scan for mirrors + * cmd action to take: + * MD_RESYNC_KILL - kill all resync threads + * MD_RESYNC_BLOCK - block all resync threads + * MD_RESYNC_UNBLOCK - resume all resync threads + * Output Parameters + * ep error return structure + * + * meta_lock for this set should be held on entry. 
+ */ +static void +meta_mirror_resync_process(mdsetname_t *sp, md_error_t *ep, md_resync_cmd_t cmd) +{ + mm_unit_list_t *mirrors[MD_PASS_MAX + 1]; + mm_pass_num_t pass; + + /* Grab all the mirrors from the set (if any) */ + (void) memset(mirrors, 0, sizeof (mirrors)); + if (setup_units(sp, mirrors, ep) != 0) + return; + + /* do passes */ + for (pass = 1; (pass <= MD_PASS_MAX); ++pass) { + mm_unit_list_t *lp; + + /* skip empty passes */ + if (mirrors[pass] == NULL) + continue; + + /* Process all resyncs in pass */ + for (lp = mirrors[pass]; (lp != NULL); lp = lp->next) { + (void) meta_mirror_resync(sp, lp->namep, 0, ep, + cmd); + } + } + + /* Clear up mirror units */ + free_units(mirrors); +} + +/* + * meta_mirror_resync_process_all: + * ------------------------------ + * Issue the given resync command to all mirrors contained in all multi-node + * sets. + * + * Input Parameters: + * cmd - MD_RESYNC_KILL, MD_RESYNC_BLOCK, MD_RESYNC_UNBLOCK + */ +static void +meta_mirror_resync_process_all(md_resync_cmd_t cmd) +{ + set_t setno, max_sets; + md_error_t mde = mdnullerror; + mdsetname_t *this_sp; + md_set_desc *sd; + + /* + * Traverse all sets looking for multi-node capable ones. + */ + max_sets = get_max_sets(&mde); + for (setno = 1; setno < max_sets; setno++) { + mde = mdnullerror; + if (this_sp = metasetnosetname(setno, &mde)) { + if ((sd = metaget_setdesc(this_sp, &mde)) == NULL) + continue; + if (!MD_MNSET_DESC(sd)) + continue; + + if (meta_lock(this_sp, TRUE, &mde)) { + continue; + } + meta_mirror_resync_process(this_sp, &mde, cmd); + (void) meta_unlock(this_sp, &mde); + } + } +} + +/* + * meta_mirror_resync_kill_all: + * --------------------------- + * Abort any resync that is in progress on this node. Scan all sets for all + * mirrors. + * Note: this routine is provided for future use. For example to kill all + * resyncs on a node this could be used as long as the + * mddoors / rpc.mdcommd tuple is running on all members of the cluster. 
+ */ +void +meta_mirror_resync_kill_all(void) +{ + meta_mirror_resync_process_all(MD_RESYNC_KILL); +} + +/* + * meta_mirror_resync_block_all: + * ---------------------------- + * Block all resyncs that are in progress. This causes the resync state to + * freeze on this machine, and can be resumed by calling + * meta_mirror_resync_unblock_all. + */ +void +meta_mirror_resync_block_all(void) +{ + meta_mirror_resync_process_all(MD_RESYNC_BLOCK); +} + +/* + * meta_mirror_resync_unblock_all: + * ------------------------------ + * Unblock all previously blocked resync threads on this node. + */ +void +meta_mirror_resync_unblock_all(void) +{ + meta_mirror_resync_process_all(MD_RESYNC_UNBLOCK); +} + +/* + * meta_mirror_resync_unblock: + * -------------------------- + * Unblock any previously blocked resync threads for the given set. + * meta_lock for this set should be held on entry. + */ +void +meta_mirror_resync_unblock(mdsetname_t *sp) +{ + md_error_t mde = mdnullerror; + + meta_mirror_resync_process(sp, &mde, MD_RESYNC_UNBLOCK); +} + +/* + * meta_mirror_resync_kill: + * ----------------------- + * Kill any resync threads running on mirrors in the given set. + * Called when releasing a set (meta_set_prv.c`halt_set) + */ +void +meta_mirror_resync_kill(mdsetname_t *sp) +{ + md_error_t mde = mdnullerror; + + meta_mirror_resync_process(sp, &mde, MD_RESYNC_KILL); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_mn_changelog.c b/usr/src/lib/lvm/libmeta/common/meta_mn_changelog.c new file mode 100644 index 0000000000..bd9b5cc508 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_mn_changelog.c @@ -0,0 +1,636 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <stdlib.h> +#include <unistd.h> + +#include <wait.h> +#include <sys/time.h> +#include <meta.h> +#include <metad.h> +#include <mdmn_changelog.h> +#include <syslog.h> +#include <umem.h> + +/* + * Number of log entries per set. + * + * We want at least 4 spares available at all times + * in case new classes are added during a live upgrade. 
+ * + * Allocate the entries in chunks of 16 + */ +#define MDMN_LOGRECS_QUANTA 16 +#define MDMN_LOGRECS_MINSPARES 4 +#define MDMN_LOGHDR_SIZE sizeof (mdmn_changelog_record_t) +#define MDMN_LOGRECSIZE (MDMN_LOGHDR_SIZE + MD_MN_MSG_MAXDATALEN) +#define MDMN_LOGRECSIZE_OD sizeof (mdmn_changelog_record_od_t) +#define MDMN_LOGRECS_TRIMUP ((MD_MN_NCLASSES % MDMN_LOGRECS_QUANTA) > \ + (MDMN_LOGRECS_QUANTA - MDMN_LOGRECS_MINSPARES)) + +static int mdmn_commitlog(md_set_desc *, md_error_t *); +static int mdmn_log_it(set_t, md_error_t *, mdmn_changelog_record_t *lr); + + +/* Global variables */ + +mdmn_changelog_record_t *mdmn_changelog[MD_MAXSETS]; +int mdmn_changelog_snarfed[MD_MAXSETS]; + +/* Total number of log records */ +int mdmn_logrecs = (MDMN_LOGRECS_QUANTA + + ((MD_MN_NCLASSES/MDMN_LOGRECS_QUANTA) * MDMN_LOGRECS_QUANTA)); + +#ifdef DEBUG +void +dump_rec(char *fn_name, mdmn_changelog_record_t *lr) +{ + syslog(LOG_DEBUG, "%s incore: selfid 0x%x class %d flags %d " + "msglen %d\n", fn_name, lr->lr_selfid, lr->lr_class, + lr->lr_flags, lr->lr_msglen); +} +void +dump_rec_od(char *fn_name, mdmn_changelog_record_od_t *lr) +{ + syslog(LOG_DEBUG, "%s ondisk: selfid 0x%x class %d flags %d " + "msglen %d\n", fn_name, lr->lr_selfid, lr->lr_class, + lr->lr_flags, lr->lr_msglen); +} + +void +dump_array(char *fn_name, set_t setno) +{ + int i; + char tchar[80]; + + mdmn_changelog_record_t *tlr; + + for (i = 0; i < mdmn_logrecs; i++) { + tlr = &mdmn_changelog[setno][i]; + (void) snprintf(tchar, sizeof (tchar), "%s class %d ", + fn_name, i); + dump_rec(tchar, tlr); + } +} +#endif + +/* + * copy_changelog: copies changelog ondisk<->incore records. + * The argument "direction" controls the direction to copy the + * the records. Incore and ondisk changlog structures must be + * allocated when calling this routine. + * + * The purpose of changelog is to store a message that is in progress. + * Therefore the changlog structure embeds the message structure. 
+ * Incore and ondisk changelog structures are created to handle the + * incore and ondisk message formats. The incore message has a pointer + * to the payload. The ondisk message format has payload embedded as + * part of the message. + * + * Caveat Emptor: Incore and ondisk structures have the payload buffers + * correctly allocated. + */ + +static void +copy_changelog(mdmn_changelog_record_t *incp, + mdmn_changelog_record_od_t *odp, int direction) +{ + assert(incp != NULL && odp != NULL); + assert((direction == MD_MN_COPY_TO_ONDISK) || + (direction == MD_MN_COPY_TO_INCORE)); + + if (direction == MD_MN_COPY_TO_ONDISK) { + odp->lr_revision = incp->lr_revision; + odp->lr_flags = incp->lr_flags; + odp->lr_selfid = incp->lr_selfid; + odp->lr_class = incp->lr_class; + odp->lr_msglen = incp->lr_msglen; + if (incp->lr_msglen) + copy_msg_1(&incp->lr_msg, &odp->lr_od_msg, direction); + } else { + incp->lr_revision = odp->lr_revision; + incp->lr_flags = odp->lr_flags; + incp->lr_selfid = odp->lr_selfid; + incp->lr_class = odp->lr_class; + incp->lr_msglen = odp->lr_msglen; + if (odp->lr_msglen) + copy_msg_1(&incp->lr_msg, &odp->lr_od_msg, direction); + } +} + +/* + * mdmn_allocate_changelog + * + * Changelog records are allocated on a per multi-node basis. + * This routine is called during MN set creation. + * It pre-allocates the changelog, as user records + * one per message class plus some spares. + * Once the records are allocated they are never freed until + * the mddb is deleted. The preallocation ensures that all nodes + * will have a consistent view of the mddb. 
+ * + * Each record is large enough to hold a maximum sized message + * Return Values: + * 0 - success + * -1 - fail + */ +int +mdmn_allocate_changelog(mdsetname_t *sp, md_error_t *ep) +{ + mddb_userreq_t req; + md_set_desc *sd; + mdmn_changelog_record_t *tlr; + int i; + set_t setno; + + /* Get a pointer to the incore md_set_desc for this MN set */ + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + setno = sd->sd_setno; + /* + * Round up the number of changelog records + * to the next value of MDMN_LOGRECS_QUANTA + * + * In all cases, make sure we have at least + * four more entries than the number of classes + * in order to provide space for live upgrades that + * might add classes. + */ + + mdmn_logrecs += (MDMN_LOGRECS_TRIMUP) ? MDMN_LOGRECS_QUANTA : 0; + + mdmn_changelog[setno] = Zalloc(MDMN_LOGHDR_SIZE * mdmn_logrecs); + + for (i = 0; i < mdmn_logrecs; i++) { + (void) memset(&req, 0, sizeof (req)); + METAD_SETUP_LR(MD_DB_CREATE, setno, 0); + /* grab a record big enough for max message size */ + req.ur_size = MDMN_LOGRECSIZE_OD; + + if (metaioctl(MD_MN_DB_USERREQ, &req, &req.ur_mde, NULL) != 0) { + (void) mdstealerror(ep, &req.ur_mde); +#ifdef DEBUG + syslog(LOG_DEBUG, "allocate_log: %s\n", + mde_sperror(ep, "")); +#endif + Free(mdmn_changelog[setno]); + return (-1); + } + + tlr = &mdmn_changelog[setno][i]; + tlr->lr_selfid = req.ur_recid; + tlr->lr_revision = MD_MN_CHANGELOG_RECORD_REVISION; + tlr->lr_class = i; + } + + /* commit class, and selfid */ + (void) mdmn_commitlog(sd, ep); + Free(mdmn_changelog[setno]); + return (0); +} + +/* + * mdmn_reset_changelog + * + * Called during reconfig step 2. + * The only time the changelog is reset is when all nodes in a cluster + * are starting up. In this case changelog must be ignored, therefore + * it is reset. + * + * The function frees the incore data structures and zeros out the + * records. The ondisk records are never freed. 
+ * + * Return Values: + * 0 - success + * -1 - fail + */ +int +mdmn_reset_changelog(mdsetname_t *sp, md_error_t *ep, int flag) +{ + md_set_desc *sd; + mdmn_changelog_record_t *lr; + set_t setno; + int lrc; + + /* Get a pointer to the incore md_set_desc this MN set */ + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + setno = sd->sd_setno; + + if (mdmn_snarf_changelog(setno, ep) == 0) { + return (0); + } + + if (flag & MDMN_CLF_RESETLOG) { + for (lrc = 0; lrc < mdmn_logrecs; lrc++) { + lr = &mdmn_changelog[setno][lrc]; + Free(lr->lr_msg.msg_event_data); + (void) memset(&lr->lr_msg, 0, sizeof (md_mn_msg_t)); + lr->lr_msglen = 0; + lr->lr_flags = 0; + } + (void) mdmn_commitlog(sd, ep); +#ifdef DEBUG + syslog(LOG_DEBUG, "reset_changelog: Log reset\n"); +#endif + } + /* now zap the array */ + if (flag & MDMN_CLF_RESETCACHE) { +#ifdef DEBUG + syslog(LOG_DEBUG, "reset_changelog: cache reset\n"); +#endif + Free(&mdmn_changelog[setno]); + mdmn_changelog[setno] = NULL; + mdmn_changelog_snarfed[setno] = 0; + } + return (0); +} + +/* + * Log a given message in the changelog. + * This function is only executed by the master node + * Return Values: + * MDMNE_NULL: + * success, the log slot is free + * + * MDMNE_ACK: + * success, + * the log slot is occupied with the same msg from a previous try. + * + * MDMNE_CLASS_BUSY: + * This means the appropriate slot is occupied with a different + * message. In that case the stored message needs being replayed, + * while the current message will be rejected with MDMNE_CLASS_BUSY + * to the initiator. + * + * MDMNE_LOG_FAIL: + * Bad things happend, cannot continue. 
+ */ +int +mdmn_log_msg(md_mn_msg_t *msg) +{ + set_t setno; + md_mn_msgclass_t class; + mdmn_changelog_record_t *lr; + md_error_t err = mdnullerror; + md_error_t *ep = &err; + int retval = 0; + + setno = msg->msg_setno; + class = mdmn_get_message_class(msg->msg_type); + + /* if not snarfed, snarf it */ + if (mdmn_snarf_changelog(setno, ep) <= 0) { + syslog(LOG_DAEMON | LOG_ERR, dgettext(TEXT_DOMAIN, + "log_msg: No records snarfed\n")); + return (-1); + } + + + /* log entry for the class */ + lr = &mdmn_changelog[setno][class]; + + /* Check if the class is occupied */ + if (lr->lr_flags & MD_MN_LR_INUSE) { + if (!MSGID_CMP(&(msg->msg_msgid), &(lr->lr_msg.msg_msgid))) { + syslog(LOG_DAEMON | LOG_DEBUG, dgettext(TEXT_DOMAIN, + "log_msg: id mismatch:\n" + " stored : ID = (%d, 0x%llx-%d)" + " setno %d class %d type %d\n" + " msg to log: ID = (%d, 0x%llx-%d)" + " setno %d class %d type %d.\n"), + MSGID_ELEMS(lr->lr_msg.msg_msgid), lr->lr_setno, + lr->lr_class, lr->lr_msgtype, + MSGID_ELEMS(msg->msg_msgid), msg->msg_setno, class, + msg->msg_type); + return (MDMNE_CLASS_BUSY); + } else { + syslog(LOG_DAEMON | LOG_DEBUG, dgettext(TEXT_DOMAIN, + "log_msg: msgid already logged:\n ID = " + " (%d, 0x%llx-%d) setno %d class %d type %d\n"), + MSGID_ELEMS(lr->lr_msg.msg_msgid), lr->lr_setno, + lr->lr_class, lr->lr_msgtype); + return (MDMNE_ACK); + } + } + + lr->lr_flags |= MD_MN_LR_INUSE; + lr->lr_msglen = MD_MN_MSG_LEN(msg); + assert(lr->lr_msg.msg_event_data == NULL); + if (msg->msg_event_size) + lr->lr_msg.msg_event_data = Zalloc(msg->msg_event_size); + (void) copy_msg(msg, &(lr->lr_msg)); + retval = mdmn_log_it(setno, ep, lr); + if (retval != 0) { + syslog(LOG_DAEMON | LOG_ERR, dgettext(TEXT_DOMAIN, + "mdmn_log_msg - failure committing logged msg to disk\n")); + return (MDMNE_LOG_FAIL); + } + + return (MDMNE_NULL); /* this is good */ +} + +/* + * mdmn_unlog_msg(md_mn_msg_t *) + * + * Clear the log entry holding the indicated message. + * Only the set master can do this. 
+ * + * Return Values: + * 0 - success + * -1 - fail + */ +int +mdmn_unlog_msg(md_mn_msg_t *msg) +{ + set_t setno; + md_mn_msgclass_t class; + md_error_t err = mdnullerror; + md_error_t *ep = &err; + int retval = 0; + mdmn_changelog_record_t *lr = NULL; + + setno = msg->msg_setno; + class = mdmn_get_message_class(msg->msg_type); + + /* Find the log entry holding the indicated message */ + if (mdmn_snarf_changelog(setno, ep) == 0) + return (-1); + + lr = &mdmn_changelog[setno][class]; + + /* assert the message is still logged */ + assert(lr != NULL); + if (!MSGID_CMP(&(msg->msg_msgid), &(lr->lr_msg.msg_msgid))) { + syslog(LOG_ERR, dgettext(TEXT_DOMAIN, + "unlog_msg: msgid mismatch\n" + "\t\tstored: ID = (%d, 0x%llx-%d) setno %d class %d type %d\n" + "\t\tattempting to unlog:\n" + "\t\tID = (%d, 0x%llx-%d) setno %d class %d type %d.\n"), + MSGID_ELEMS(lr->lr_msg.msg_msgid), lr->lr_setno, + lr->lr_class, lr->lr_msgtype, MSGID_ELEMS(msg->msg_msgid), + msg->msg_setno, class, msg->msg_type); + return (-1); + } + lr->lr_msglen = 0; + lr->lr_flags &= ~(MD_MN_LR_INUSE); + if (lr->lr_msg.msg_event_data) { + Free(lr->lr_msg.msg_event_data); + lr->lr_msg.msg_event_data = NULL; + } + /* commit the updated log record to disk */ + retval = mdmn_log_it(setno, ep, lr); +#ifdef DEBUG + dump_rec("mdmn_unlog_msg: ", lr); +#endif + return (retval); +} + + +/* + * mdmn_get_changelogrec(set_t , md_mn_msgclass_t) + * Returns a pointer to incore changelog record. + * + * Return Values: + * non-NULL - success + * NULL - fail + */ +mdmn_changelog_record_t * +mdmn_get_changelogrec(set_t setno, md_mn_msgclass_t class) +{ + md_error_t err = mdnullerror; + + if (mdmn_snarf_changelog(setno, &err) == 0) + return (NULL); + assert(mdmn_changelog[setno] != NULL); + + return (&mdmn_changelog[setno][class]); +} + +/* + * mdmn_commitlog(md_set_desc *, md_error_t *) + * + * Commit the set record and all of the changelog entry records to disk. 
+ * Don't bother with other stuff hanging off the set record + * (e.g. drive records) since none of that is changing. + * Called only at changelog pre-allocation time or when flushing a log. + * + * Return Values: + * 0 - success + * errno - fail + */ + +static int +mdmn_commitlog(md_set_desc *sd, md_error_t *ep) +{ + int lrc; + int *recs; + uint_t size; + mdmn_changelog_record_t *lr; + mdmn_changelog_record_od_t clodrec; /* changelog ondisk record */ + mddb_userreq_t req; + int retval = 0; + set_t setno; + + /* Check for master and bounce non-master requests */ + if (!(MD_MNSET_DESC(sd)) || !sd->sd_mn_am_i_master) { + if (!(MD_MNSET_DESC(sd))) { + syslog(LOG_DAEMON | LOG_ERR, dgettext(TEXT_DOMAIN, + "mdmn_commitlog - Not MN Set\n")); + } else { + syslog(LOG_DAEMON | LOG_ERR, dgettext(TEXT_DOMAIN, + "mdmn_commit_log - Not Master\n")); + } + return (-1); + } + (void) memset(&req, 0, sizeof (req)); + /* create the records to commit the info to the mddb */ + + size = (mdmn_logrecs + 1) * sizeof (int); + recs = Zalloc(size); + /* Initialize the log entry records for update */ + setno = sd->sd_setno; + + for (lrc = 0; lrc < mdmn_logrecs; lrc++) { + lr = &mdmn_changelog[setno][lrc]; + recs[lrc] = lr->lr_selfid; + copy_changelog(lr, &clodrec, MD_MN_COPY_TO_ONDISK); + METAD_SETUP_LR(MD_DB_SETDATA, setno, lr->lr_selfid); + req.ur_size = MDMN_LOGRECSIZE_OD; + req.ur_data = (uint64_t)&clodrec; + if ((retval = metaioctl(MD_MN_DB_USERREQ, &req, &req.ur_mde, + NULL)) != 0) { + (void) mdstealerror(ep, &req.ur_mde); +#ifdef DEBUG + syslog(LOG_DAEMON|LOG_DEBUG, + "mdmn_commitlog - metaioctl SETDATA failure\n%s", + mde_sperror(ep, "")); +#endif + break; + } + } + + if (retval == 0) { + /* set last rec to be 0 to indicate completion */ + recs[lrc] = 0; + /* Commit to mddb on disk */ + METAD_SETUP_LR(MD_DB_COMMIT_MANY, setno, + mdmn_changelog[setno][0].lr_selfid); + req.ur_size = size; + req.ur_data = (uint64_t)recs; + if ((retval = metaioctl(MD_MN_DB_USERREQ, &req, + &req.ur_mde, 
NULL)) != 0) { + (void) mdstealerror(ep, &req.ur_mde); +#ifdef DEBUG + syslog(LOG_DAEMON|LOG_DEBUG, + "mdmn_commitlog - metaioctl COMMIT_MANY" + "Failure\n%s", mde_sperror(ep, "")); +#endif + } + } + + Free(recs); + return (retval); +} + +/* + * mdmn_log_it(set_t, md_error_t *, mdmn_changelog_record_t *) + * + * Commit the changed log record to disk. + * + * Return Values: + * 0 - success + * -1 - fail + */ +static int +mdmn_log_it(set_t set, md_error_t *ep, mdmn_changelog_record_t *lr) +{ + int *recs; + uint_t size; + mddb_userreq_t req; + mdmn_changelog_record_od_t clodrec; + + (void) memset(&req, 0, sizeof (req)); + + /* Initialize the log entry record for update */ + + copy_changelog(lr, &clodrec, MD_MN_COPY_TO_ONDISK); + METAD_SETUP_LR(MD_DB_SETDATA, set, lr->lr_selfid); + req.ur_size = MDMN_LOGRECSIZE_OD; + req.ur_data = (uint64_t)&clodrec; + if (metaioctl(MD_MN_DB_USERREQ, &req, &req.ur_mde, NULL) != 0) { + (void) mdstealerror(ep, &req.ur_mde); +#ifdef DEBUG + syslog(LOG_DEBUG, "mdmn_log_it: DB_SETDATA failed\n" + "set %d selfid %d, size %d\n%s", set, lr->lr_selfid, + req.ur_size, mde_sperror(ep, "")); +#endif + return (-1); + } + /* Set up the recid to be updated */ + size = 2 * sizeof (int); /* the changed record, plus null terminator */ + recs = Zalloc(size); + recs[0] = lr->lr_selfid; + recs[1] = 0; + /* Commit to mddb on disk */ + METAD_SETUP_LR(MD_DB_COMMIT_ONE, set, lr->lr_selfid); + req.ur_size = size; + req.ur_data = (uint64_t)recs; + if (metaioctl(MD_MN_DB_USERREQ, &req, &req.ur_mde, NULL) != 0) { + (void) mdstealerror(ep, &req.ur_mde); +#ifdef DEBUG + syslog(LOG_DEBUG, "mdmn_log_it: DB_COMMIT_ONE failed\n" + "set %d selfid %d, size %d\n%s", set, lr->lr_selfid, + req.ur_size, mde_sperror(ep, "")); +#endif + Free(recs); + return (-1); + } + Free(recs); + return (0); +} + +/* + * mdmn_snarf_changelog(set_t, md_error_t *) + * + * snarf in the changelog entries and allocate incore structures + * if required. 
+ * mdmn_changelog_snarfed array if set to MDMN_CLF_SNARFED, then + * then the records are already snarfed. + * + * Called from set_snarf(), mdmn_log_msg(), and mdmn_unlog_msg() + * Return Values: + * non-zero - success + * 0 - fail + */ +int +mdmn_snarf_changelog(set_t set, md_error_t *ep) +{ + mdmn_changelog_record_t *tlr; + mdmn_changelog_record_od_t *lr; + mddb_recid_t id; + md_mn_msgclass_t class; + + + if (set == MD_LOCAL_SET) + return (0); + + id = 0; + + if (mdmn_changelog_snarfed[set] & MDMN_CLF_SNARFED) { + assert(mdmn_changelog[set] != NULL); + return (mdmn_logrecs); + } + + lr = (mdmn_changelog_record_od_t *)get_ur_rec(set, MD_UR_GET_NEXT, + MDDB_UR_LR, &id, ep); + if (lr == NULL) + return (0); + + /* only allocate if Log records exist */ + + if (mdmn_changelog[set] == NULL) { + /* Allocate incore state for the log */ + mdmn_changelog[set] = Zalloc(MDMN_LOGHDR_SIZE * + mdmn_logrecs); + } + + do { + class = lr->lr_class; + tlr = &mdmn_changelog[set][class]; + copy_changelog(tlr, lr, MD_MN_COPY_TO_INCORE); + Free(lr); + lr = (mdmn_changelog_record_od_t *)get_ur_rec(set, + MD_UR_GET_NEXT, MDDB_UR_LR, &id, ep); + } while (lr != NULL); + + /* Since log records counts are fixed return that value */ + mdmn_changelog_snarfed[set] |= MDMN_CLF_SNARFED; + return (mdmn_logrecs); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_mn_comm.c b/usr/src/lib/lvm/libmeta/common/meta_mn_comm.c new file mode 100644 index 0000000000..02ad7bf1e6 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_mn_comm.c @@ -0,0 +1,984 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <stdlib.h> +#include <unistd.h> +#include <wait.h> +#include <sys/time.h> +#include <strings.h> +#include <meta.h> +#include <syslog.h> + +extern md_mn_msg_tbl_entry_t msg_table[]; + +/* + * When contacting the local rpc.mdcommd we always want to do that using + * the IPv4 version of localhost. + */ +#define LOCALHOST_IPv4 "127.0.0.1" + +md_mn_msgclass_t +mdmn_get_message_class(md_mn_msgtype_t msgtype) +{ + return (msg_table[msgtype].mte_class); +} + +void (* +mdmn_get_handler(md_mn_msgtype_t msgtype)) + (md_mn_msg_t *msg, uint_t flags, md_mn_result_t *res) +{ + return (msg_table[msgtype].mte_handler); +} + +int (* +mdmn_get_submessage_generator(md_mn_msgtype_t msgtype)) + (md_mn_msg_t *msg, md_mn_msg_t **msglist) +{ + return (msg_table[msgtype].mte_smgen); +} + +time_t +mdmn_get_timeout(md_mn_msgtype_t msgtype) +{ + return (msg_table[msgtype].mte_timeout); +} + + +void +ldump_msg(char *prefix, md_mn_msg_t *msg) +{ + (void) fprintf(stderr, "%s &msg = 0x%x\n", prefix, (uint_t)msg); + (void) fprintf(stderr, "%s ID = (%d, 0x%llx-%d)\n", prefix, + MSGID_ELEMS(msg->msg_msgid)); + (void) fprintf(stderr, "%s sender = %d\n", prefix, msg->msg_sender); + (void) fprintf(stderr, "%s flags = 0x%x\n", prefix, msg->msg_flags); + (void) fprintf(stderr, "%s setno = %d\n", prefix, msg->msg_setno); + (void) fprintf(stderr, "%s type = %d\n", 
prefix, msg->msg_type); + (void) fprintf(stderr, "%s size = %d\n", prefix, msg->msg_event_size); +} + + +/* Default timeout can be changed using clnt_control() */ +static struct timeval TIMEOUT = { 25, 0 }; + +md_mn_result_t * +mdmn_send_1(argp, clnt) + md_mn_msg_t *argp; + CLIENT *clnt; +{ + md_mn_result_t *clnt_res = Zalloc(sizeof (md_mn_result_t)); + + if (clnt_call(clnt, mdmn_send, + (xdrproc_t)xdr_md_mn_msg_t, (caddr_t)argp, + (xdrproc_t)xdr_md_mn_result_t, (caddr_t)clnt_res, + TIMEOUT) != RPC_SUCCESS) { + return (NULL); + } + return (clnt_res); +} + +int * +mdmn_work_1(argp, clnt) + md_mn_msg_t *argp; + CLIENT *clnt; +{ + int *clnt_res = Zalloc(sizeof (int)); + + if (clnt_call(clnt, mdmn_work, + (xdrproc_t)xdr_md_mn_msg_t, (caddr_t)argp, + (xdrproc_t)xdr_int, (caddr_t)clnt_res, + TIMEOUT) != RPC_SUCCESS) { + Free(clnt_res); + return (NULL); + } + return (clnt_res); +} + +int * +mdmn_wakeup_initiator_1(argp, clnt) + md_mn_result_t *argp; + CLIENT *clnt; +{ + int *clnt_res = Zalloc(sizeof (int)); + + if (clnt_call(clnt, mdmn_wakeup_initiator, + (xdrproc_t)xdr_md_mn_result_t, (caddr_t)argp, + (xdrproc_t)xdr_int, (caddr_t)clnt_res, + TIMEOUT) != RPC_SUCCESS) { + Free(clnt_res); + return (NULL); + } + return (clnt_res); +} + +int * +mdmn_wakeup_master_1(argp, clnt) + md_mn_result_t *argp; + CLIENT *clnt; +{ + int *clnt_res = Zalloc(sizeof (int)); + + if (clnt_call(clnt, mdmn_wakeup_master, + (xdrproc_t)xdr_md_mn_result_t, (caddr_t)argp, + (xdrproc_t)xdr_int, (caddr_t)clnt_res, + TIMEOUT) != RPC_SUCCESS) { + Free(clnt_res); + return (NULL); + } + return (clnt_res); +} + +int * +mdmn_comm_lock_1(argp, clnt) + md_mn_set_and_class_t *argp; + CLIENT *clnt; +{ + int *clnt_res = Zalloc(sizeof (int)); + + if (clnt_call(clnt, mdmn_comm_lock, + (xdrproc_t)xdr_md_mn_set_and_class_t, (caddr_t)argp, + (xdrproc_t)xdr_int, (caddr_t)clnt_res, + TIMEOUT) != RPC_SUCCESS) { + return (NULL); + } + return (clnt_res); +} + +int * +mdmn_comm_unlock_1(argp, clnt) + md_mn_set_and_class_t 
*argp; + CLIENT *clnt; +{ + int *clnt_res = Zalloc(sizeof (int)); + + if (clnt_call(clnt, mdmn_comm_unlock, + (xdrproc_t)xdr_md_mn_set_and_class_t, (caddr_t)argp, + (xdrproc_t)xdr_int, (caddr_t)clnt_res, + TIMEOUT) != RPC_SUCCESS) { + return (NULL); + } + return (clnt_res); +} + +int * +mdmn_comm_suspend_1(argp, clnt) + md_mn_set_and_class_t *argp; + CLIENT *clnt; +{ + int *clnt_res = Zalloc(sizeof (int)); + + if (clnt_call(clnt, mdmn_comm_suspend, + (xdrproc_t)xdr_md_mn_set_and_class_t, (caddr_t)argp, + (xdrproc_t)xdr_int, (caddr_t)clnt_res, + TIMEOUT) != RPC_SUCCESS) { + return (NULL); + } + return (clnt_res); +} + +int * +mdmn_comm_resume_1(argp, clnt) + md_mn_set_and_class_t *argp; + CLIENT *clnt; +{ + int *clnt_res = Zalloc(sizeof (int)); + + if (clnt_call(clnt, mdmn_comm_resume, + (xdrproc_t)xdr_md_mn_set_and_class_t, (caddr_t)argp, + (xdrproc_t)xdr_int, (caddr_t)clnt_res, + TIMEOUT) != RPC_SUCCESS) { + return (NULL); + } + return (clnt_res); +} + +int * +mdmn_comm_reinit_set_1(argp, clnt) + set_t *argp; + CLIENT *clnt; +{ + int *clnt_res = Zalloc(sizeof (int)); + + if (clnt_call(clnt, mdmn_comm_reinit_set, + (xdrproc_t)xdr_set_t, (caddr_t)argp, + (xdrproc_t)xdr_int, (caddr_t)clnt_res, + TIMEOUT) != RPC_SUCCESS) { + return (NULL); + } + return (clnt_res); +} + +int * +mdmn_comm_msglock_1(argp, clnt) + md_mn_type_and_lock_t *argp; + CLIENT *clnt; +{ + int *clnt_res = Zalloc(sizeof (int)); + + if (clnt_call(clnt, mdmn_comm_msglock, + (xdrproc_t)xdr_md_mn_type_and_lock_t, (caddr_t)argp, + (xdrproc_t)xdr_int, (caddr_t)clnt_res, + TIMEOUT) != RPC_SUCCESS) { + return (NULL); + } + return (clnt_res); +} + + +#define USECS_PER_TICK 10000 + + +/* + * Let the kernel create a clusterwide unique message ID + * + * returns 0 on success + * 1 on failure + */ + +int +mdmn_create_msgid(md_mn_msgid_t *msgid) +{ + md_error_t mde = mdnullerror; + + if (msgid == NULL) { + return (1); /* failure */ + } + + if (metaioctl(MD_IOCGUNIQMSGID, msgid, &mde, NULL) != 0) { + 
msgid->mid_nid = ~0u; + msgid->mid_time = 0LL; + return (1); /* failure */ + } + + /* + * mid_smid and mid_oclass are only used for submessages. + * mdmn_create_msgid is never called for submessages, as they inherit + * the message ID from their parent. + * Thus we can safely null out the following fields. + */ + msgid->mid_smid = 0; + msgid->mid_oclass = 0; + + /* if the node_id is not set yet, somethings seems to be wrong */ + if (msgid->mid_nid == ~0u) { + return (1); /* failure */ + } + + return (0); /* success */ +} + +md_mn_result_t * +copy_result(md_mn_result_t *res) +{ + md_mn_result_t *nres; + nres = Zalloc(sizeof (md_mn_result_t)); + /* It's MSGID_COPY(from, to); */ + MSGID_COPY(&(res->mmr_msgid), &(nres->mmr_msgid)); + nres->mmr_msgtype = res->mmr_msgtype; + nres->mmr_setno = res->mmr_setno; + nres->mmr_flags = res->mmr_flags; + nres->mmr_sender = res->mmr_sender; + nres->mmr_failing_node = res->mmr_failing_node; + nres->mmr_comm_state = res->mmr_comm_state; + nres->mmr_exitval = res->mmr_exitval; + nres->mmr_out_size = res->mmr_out_size; + nres->mmr_err_size = res->mmr_err_size; + if (res->mmr_out_size > 0) { + nres->mmr_out = Zalloc(res->mmr_out_size); + bcopy(res->mmr_out, nres->mmr_out, res->mmr_out_size); + } + if (res->mmr_err_size > 0) { + nres->mmr_err = Zalloc(res->mmr_err_size); + bcopy(res->mmr_err, nres->mmr_err, res->mmr_err_size); + } + if (res->mmr_ep.host != '\0') { + nres->mmr_ep.host = strdup(res->mmr_ep.host); + } + if (res->mmr_ep.extra != '\0') { + nres->mmr_ep.extra = strdup(res->mmr_ep.extra); + } + if (res->mmr_ep.name != '\0') { + nres->mmr_ep.name = strdup(res->mmr_ep.name); + } + return (nres); +} + +void +free_result(md_mn_result_t *res) +{ + if (res->mmr_out_size > 0) { + Free(res->mmr_out); + } + if (res->mmr_err_size > 0) { + Free(res->mmr_err); + } + if (res->mmr_ep.host != '\0') { + Free(res->mmr_ep.host); + } + if (res->mmr_ep.extra != '\0') { + Free(res->mmr_ep.extra); + } + if (res->mmr_ep.name != '\0') { + 
Free(res->mmr_ep.name); + } + Free(res); +} + + +/* allocate a new message and copy a given message into it */ +md_mn_msg_t * +copy_msg(md_mn_msg_t *msg, md_mn_msg_t *dest) +{ + md_mn_msg_t *nmsg; + + nmsg = dest; + + if (nmsg == NULL) { + nmsg = Zalloc(sizeof (md_mn_msg_t)); + } + if (nmsg->msg_event_data == NULL) { + nmsg->msg_event_data = Zalloc(msg->msg_event_size); + } + /* It's MSGID_COPY(from, to); */ + MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid)); + nmsg->msg_sender = msg->msg_sender; + nmsg->msg_flags = msg->msg_flags; + nmsg->msg_setno = msg->msg_setno; + nmsg->msg_type = msg->msg_type; + nmsg->msg_event_size = msg->msg_event_size; + if (msg->msg_event_size > 0) { + bcopy(msg->msg_event_data, nmsg->msg_event_data, + msg->msg_event_size); + } + return (nmsg); +} + +void +copy_msg_1(md_mn_msg_t *msg, md_mn_msg_od_t *msgod, int direction) +{ + assert((direction == MD_MN_COPY_TO_ONDISK) || + (direction == MD_MN_COPY_TO_INCORE)); + + if (direction == MD_MN_COPY_TO_ONDISK) { + MSGID_COPY(&(msg->msg_msgid), &(msgod->msg_msgid)); + msgod->msg_sender = msg->msg_sender; + msgod->msg_flags = msg->msg_flags; + msgod->msg_setno = msg->msg_setno; + msgod->msg_type = msg->msg_type; + msgod->msg_od_event_size = msg->msg_event_size; + /* paranoid checks */ + if (msg->msg_event_size != 0 && msg->msg_event_data != NULL) + bcopy(msg->msg_event_data, + &msgod->msg_od_event_data[0], msg->msg_event_size); + } else { + MSGID_COPY(&(msgod->msg_msgid), &(msg->msg_msgid)); + msg->msg_sender = msgod->msg_sender; + msg->msg_flags = msgod->msg_flags; + msg->msg_setno = msgod->msg_setno; + msg->msg_type = msgod->msg_type; + msg->msg_event_size = msgod->msg_od_event_size; + if (msg->msg_event_data == NULL) + msg->msg_event_data = Zalloc(msg->msg_event_size); + + bcopy(&msgod->msg_od_event_data[0], + msg->msg_event_data, msgod->msg_od_event_size); + } +} + +/* Free a message */ +void +free_msg(md_mn_msg_t *msg) +{ + if (msg->msg_event_size > 0) { + Free(msg->msg_event_data); + } + 
Free(msg); +} + + +/* The following declarations are only for the next two routines */ + +md_mn_client_list_t *mdmn_clients; + +mutex_t mcl_mutex; +#define MNGLC_INIT_ONLY 0x0001 +#define MNGLC_FOR_REAL 0x0002 +/* + * mdmn_get_local_clnt(flag) + * If there is a client in the free pool, get one, + * If no client is available, create one. + * Every multithreaded application that uses mdmn_send_message must call it + * single threaded first with special flags so we do the initialization + * stuff in a safe environment. + * + * Input: MNGLC_INIT_ONLY: just initializes the mutex + * MNGLC_FOR_REAL : do real work + * Output: + * An rpc client for sending rpc requests to the local commd + * NULL in case of an error + * + */ +static CLIENT * +mdmn_get_local_clnt(uint_t flag) +{ + CLIENT *local_daemon; + static int inited = 0; + md_mn_client_list_t *tmp; + + if (inited == 0) { + (void) mutex_init(&mcl_mutex, USYNC_THREAD, NULL); + inited = 1; + } + + if (flag == MNGLC_INIT_ONLY) + return ((CLIENT *)NULL); + + (void) mutex_lock(&mcl_mutex); + if (mdmn_clients == (md_mn_client_list_t *)NULL) { + /* if there is no entry, create a client and return a it */ + local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, + ONE, "tcp"); + } else { + /* + * If there is an entry from a previous put operation, + * remove it from the head of the list and free the list stuff + * around it. 
Then return the client + */ + local_daemon = mdmn_clients->mcl_clnt; + tmp = mdmn_clients; + mdmn_clients = mdmn_clients->mcl_next; + Free(tmp); + } + (void) mutex_unlock(&mcl_mutex); + + + if (local_daemon == (CLIENT *)NULL) { + clnt_pcreateerror("local_daemon"); + } + + return (local_daemon); +} + +/* + * mdmn_put_local_clnt() + * returns a no longer used client to the pool + * + * Input: an RPC client + * Output: void + */ +static void +mdmn_put_local_clnt(CLIENT *local_daemon) +{ + md_mn_client_list_t *tmp; + + (void) mutex_lock(&mcl_mutex); + + tmp = mdmn_clients; + mdmn_clients = (md_mn_client_list_t *) + malloc(sizeof (md_mn_client_list_t)); + mdmn_clients->mcl_clnt = local_daemon; + mdmn_clients->mcl_next = tmp; + + (void) mutex_unlock(&mcl_mutex); +} + +/* + * This is the regular interface for sending a message. + * This function only passes through all arguments to + * mdmn_send_message_with_msgid() and adds a NULL for the message ID. + * + * Normally, you don't have already a message ID for the message you want + * to send. Only in case of replaying a previously logged message, + * a msgid is already attached to it. + * In that case mdmn_send_message_with_msgid() has to be called directly. + * + * Return values / CAVEAT EMPTOR: see mdmn_send_message_with_msgid() + */ + +int +mdmn_send_message( + set_t setno, + md_mn_msgtype_t type, + uint_t flags, + char *data, + int size, + md_mn_result_t **result, + md_error_t *ep) +{ + return (mdmn_send_message_with_msgid( + setno, type, flags, data, size, result, MD_NULL_MSGID, ep)); +} +/* + * mdmn_send_message_with_msgid() + * Create a message from the given pieces of data and hand it over + * to the local commd. + * This may fail for various reasons (rpc error / class busy / class locked ...) + * Some error types are immediately deadly, others will cause retries + * until the request is fulfilled or until the retries are ecxceeded. + * + * In case an error is returned it is up to the user to decide what to do. 
+ * + * Returns: + * 0 on success + * 1 if retries1 exceeded + * 2 if retries2 exceeded + * -1 if connecting to the local daemon failed + * -2 if the RPC call to the local daemon failed + * -3 if this node hasn't yet joined the set + * -4 if any other problem occured + * + * CAVEAT EMPTOR: + * The caller is responsible for calling free_result() when finished with + * the results! + */ +int +mdmn_send_message_with_msgid( + set_t setno, + md_mn_msgtype_t type, + uint_t flags, + char *data, + int size, + md_mn_result_t **result, + md_mn_msgid_t *msgid, + md_error_t *ep) +{ + uint_t retry1, ticks1, retry2, ticks2; + int retval; + + CLIENT *local_daemon; + struct timeval timeout; + + md_mn_msg_t msg; + md_mn_result_t *resp; + + /* + * Special case for multithreaded applications: + * When starting up, the application should call mdmn_send_message + * single threaded with all parameters set to NULL. + * When we detect this we know, we safely can do initialization + * stuff here. + * We only check for set and type being zero + */ + if ((setno == 0) && (type == 0)) { + /* do all needed initializations here */ + (void) mdmn_get_local_clnt(MNGLC_INIT_ONLY); + return (0); /* success */ + } + + + /* did the caller specify space to store the result pointer? */ + if (result == (md_mn_result_t **)NULL) { + syslog(LOG_INFO, dgettext(TEXT_DOMAIN, + "FATAL, can not allocate result structure\n")); + return (-4); + } + *result = NULL; + + /* Replay messages already have their msgID */ + if ((flags & MD_MSGF_REPLAY_MSG) == 0) { + if (mdmn_create_msgid(&msg.msg_msgid) != 0) { + syslog(LOG_INFO, dgettext(TEXT_DOMAIN, + "FATAL, can not create message ID\n")); + return (-4); + } + } else { + /* in this case a message ID must be specified */ + assert(msgid != MD_NULL_MSGID); + MSGID_COPY(msgid, &msg.msg_msgid); + } + + + /* + * When setting the flags, additionally apply the + * default flags for this message type. 
+ */ + msg.msg_flags = flags; + msg.msg_setno = setno; + msg.msg_type = type; + msg.msg_event_size = size; + msg.msg_event_data = data; + + /* + * For the timeout pick the specific timeout for the message times the + * the maximum number of nodes. + * This is a better estimate than 1 hour or 3 days or never. + */ + timeout.tv_sec = mdmn_get_timeout(type) * NNODES; + timeout.tv_usec = 0; + + if (flags & MD_MSGF_VERBOSE) { + syslog(LOG_INFO, "send_message: ID=(%d, 0x%llx-%d)\n", + MSGID_ELEMS(msg.msg_msgid)); + } + + /* get an RPC client to the local commd */ + local_daemon = mdmn_get_local_clnt(MNGLC_FOR_REAL); + if (local_daemon == (CLIENT *)NULL) { + return (-1); + } + clnt_control(local_daemon, CLSET_TIMEOUT, (char *)&timeout); + + retry1 = msg_table[type].mte_retry1; + ticks1 = msg_table[type].mte_ticks1; + retry2 = msg_table[type].mte_retry2; + ticks2 = msg_table[type].mte_ticks2; + + /* + * run that loop until: + * - commstate is Ok + * - deadly commstate occured + * - retries1 or retries2 exceeded + */ + for (; ; ) { + *result = mdmn_send_1(&msg, local_daemon); + resp = *result; + if (resp != (md_mn_result_t *)NULL) { + /* Bingo! */ + if (resp->mmr_comm_state == MDMNE_ACK) { + retval = 0; + goto out; + } + /* Hmm... what if there's no handler? */ + if (resp->mmr_comm_state == MDMNE_NO_HANDLER) { + retval = 0; + goto out; + + } + /* + * This node didn't yet join the disk set. It is not + * supposed to send any messages then. + * This is deadly (no retries) + */ + if (resp->mmr_comm_state == MDMNE_NOT_JOINED) { + retval = -3; + goto out; + + } + /* these two are deadly too (no retries) */ + if ((resp->mmr_comm_state == MDMNE_NO_WAKEUP_ENTRY) || + (resp->mmr_comm_state == MDMNE_LOG_FAIL)) { + retval = -4; + goto out; + + } + /* Class busy? 
Use retry1 */ + if (resp->mmr_comm_state == MDMNE_CLASS_BUSY) { + if (retry1-- == 0) { + retval = 1; /* retry1 exceeded */ + goto out; + } + (void) usleep(ticks1 * USECS_PER_TICK); + free_result(resp); + + if (flags & MD_MSGF_VERBOSE) + (void) printf("#Resend1 ID=(%d, " + "0x%llx-%d)\n", + MSGID_ELEMS(msg.msg_msgid)); + continue; + } + if ((resp->mmr_comm_state == MDMNE_CLASS_LOCKED) || + (resp->mmr_comm_state == MDMNE_ABORT)) { + /* + * Be patient, wait for 1 secs and try again. + * It's not likely that the ABORT condition ever + * goes away, but it won't hurt to retry + */ + free_result(resp); + (void) sleep(1); + continue; + } + if (resp->mmr_comm_state == MDMNE_SUSPENDED) { + if (flags & MD_MSGF_FAIL_ON_SUSPEND) { + /* caller wants us to fail here */ + (void) mddserror(ep, + MDE_DS_NOTNOW_RECONFIG, setno, + mynode(), mynode(), NULL); + retval = -4; + goto out; + } else { + /* wait for 1 secs and try again. */ + free_result(resp); + (void) sleep(1); + continue; + } + } + } else { + /* + * If we get a NULL back from the rpc call, try to + * reinitialize the client. + * Depending on retries2 we try again, or not. + */ + syslog(LOG_INFO, + "send_message: ID=(%d, 0x%llx-%d) resp = NULL\n", + MSGID_ELEMS(msg.msg_msgid)); + + clnt_destroy(local_daemon); + local_daemon = mdmn_get_local_clnt(MNGLC_FOR_REAL); + + if (local_daemon == (CLIENT *)NULL) { + return (-1); + } + clnt_control(local_daemon, CLSET_TIMEOUT, + (char *)&timeout); + } + + /* + * If we are here, either resp is zero or resp is non-zero + * but some commstate not mentioned above occured. 
+ * In either case we use retry2 + */ + if (retry2-- == 0) { + syslog(LOG_INFO, dgettext(TEXT_DOMAIN, + "send_message: (%d, 0x%llx-%d) retry2 exceeded\n"), + MSGID_ELEMS(msg.msg_msgid)); + + retval = 2; /* retry2 exceeded */ + goto out; + } + if (flags & MD_MSGF_VERBOSE) { + syslog(LOG_DEBUG, dgettext(TEXT_DOMAIN, + "send_message: (%d, 0x%llx-%d) resend on retry2\n"), + MSGID_ELEMS(msg.msg_msgid)); + } + + (void) usleep(ticks2 * USECS_PER_TICK); + + if (resp != (md_mn_result_t *)NULL) { + free_result(resp); + } + } +out: + mdmn_put_local_clnt(local_daemon); + return (retval); +} + +/* + * suspend the commd for a given set/class combination. + * + * Parameter: + * set number or 0 (meaning all sets) + * class number or 0 (meaning all classes) + * + * Returns: + * 0 on success (set is suspended and all messages drained) + * MDE_DS_COMMDCTL_SUSPEND_NYD if set is not yet drained + * MDE_DS_COMMDCTL_SUSPEND_FAIL if any failure occurred + */ +int +mdmn_suspend(set_t setno, md_mn_msgclass_t class) +{ + int *resp; + CLIENT *local_daemon; + md_mn_set_and_class_t msc; + + if ((setno >= MD_MAXSETS) || (class >= MD_MN_NCLASSES)) { + return (MDE_DS_COMMDCTL_SUSPEND_FAIL); + } + local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, ONE, + "tcp"); + if (local_daemon == (CLIENT *)NULL) { + clnt_pcreateerror("local_daemon"); + return (MDE_DS_COMMDCTL_SUSPEND_FAIL); + } + msc.msc_set = setno; + msc.msc_class = class; + msc.msc_flags = 0; + + resp = mdmn_comm_suspend_1(&msc, local_daemon); + clnt_destroy(local_daemon); + + if (resp == NULL) { + return (MDE_DS_COMMDCTL_SUSPEND_FAIL); + } + + if (*resp == MDMNE_ACK) { + /* set successfully drained, no outstanding messages */ + return (0); + } + if (*resp != MDMNE_SET_NOT_DRAINED) { + /* some error occurred */ + return (MDE_DS_COMMDCTL_SUSPEND_FAIL); + } + + /* still outstanding messages, return not yet drained failure */ + return (MDE_DS_COMMDCTL_SUSPEND_NYD); +} + +/* + * resume the commd for a given set/class combination. 
+ * + * Parameter: + * set number or 0 (meaning all sets) + * class number or 0 (meaning all classes) + * + * Returns: + * 0 on success + * MDE_DS_COMMDCTL_RESUME_FAIL on failure + */ +int +mdmn_resume(set_t setno, md_mn_msgclass_t class, uint_t flags) +{ + md_mn_set_and_class_t msc; + int ret = MDE_DS_COMMDCTL_RESUME_FAIL; + int *resp; + CLIENT *local_daemon; + + if ((setno >= MD_MAXSETS) || (class >= MD_MN_NCLASSES)) { + return (MDE_DS_COMMDCTL_RESUME_FAIL); + } + local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, ONE, + "tcp"); + if (local_daemon == (CLIENT *)NULL) { + clnt_pcreateerror("local_daemon"); + return (MDE_DS_COMMDCTL_RESUME_FAIL); + } + + msc.msc_set = setno; + msc.msc_class = class; + msc.msc_flags = flags; + + resp = mdmn_comm_resume_1(&msc, local_daemon); + + if (resp != NULL) { + if (*resp == MDMNE_ACK) { + ret = 0; + } + Free(resp); + } + + clnt_destroy(local_daemon); + return (ret); +} + +/* + * abort all communication + * + * returns void, because: if *this* get's an error what do you want to do? + */ +void +mdmn_abort(void) +{ + char *dummy = "abort"; + md_mn_result_t *resultp = NULL; + md_error_t mdne = mdnullerror; + + (void) mdmn_send_message(0, /* No set is needed for this message */ + MD_MN_MSG_ABORT, + MD_MSGF_LOCAL_ONLY, + dummy, sizeof (dummy), + &resultp, &mdne); + + if (resultp != NULL) { + Free(resultp); + } +} + +/* + * trigger the reinitialization for a given set. 
+ * + * Parameter: set number + * + * Returns: + * 0 on success + * 1 on failure + */ +int +mdmn_reinit_set(set_t setno) +{ + int ret = 1; + int *resp; + CLIENT *local_daemon; + + + if ((setno == 0) || (setno >= MD_MAXSETS)) { + return (1); + } + local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, ONE, + "tcp"); + if (local_daemon == (CLIENT *)NULL) { + clnt_pcreateerror("local_daemon"); + return (1); + } + + resp = mdmn_comm_reinit_set_1(&setno, local_daemon); + + if (resp != NULL) { + if (*resp == MDMNE_ACK) { + ret = 0; + } + Free(resp); + } + + clnt_destroy(local_daemon); + return (ret); +} + + +/* + * Lock a single message type from being processed on this node + * + * Parameter: md_mn_msgtype_t msgtype, uint_t locktype + * + * Returns: + * 0 on success + * 1 on failure + */ +int +mdmn_msgtype_lock(md_mn_msgtype_t msgtype, uint_t locktype) +{ + int ret = 1; + int *resp; + CLIENT *local_daemon; + md_mn_type_and_lock_t mmtl; + + + if ((msgtype == 0) || (msgtype >= MD_MN_NMESSAGES)) { + return (1); + } + local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, ONE, + "tcp"); + if (local_daemon == (CLIENT *)NULL) { + clnt_pcreateerror("local_daemon"); + return (1); + } + mmtl.mmtl_type = msgtype; + mmtl.mmtl_lock = locktype; + + resp = mdmn_comm_msglock_1(&mmtl, local_daemon); + + if (resp != NULL) { + if (*resp == MDMNE_ACK) { + ret = 0; + } + Free(resp); + } + + clnt_destroy(local_daemon); + return (ret); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_mn_handlers.c b/usr/src/lib/lvm/libmeta/common/meta_mn_handlers.c new file mode 100644 index 0000000000..8603aca5ac --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_mn_handlers.c @@ -0,0 +1,1957 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <stdlib.h> +#include <unistd.h> +#include <wait.h> +#include <sys/time.h> +#include <syslog.h> + +#include <meta.h> +#include <sys/lvm/mdio.h> +#include <sys/lvm/md_mddb.h> +#include <sys/lvm/md_mirror.h> + +#define MAX_N_ARGS 64 +#define MAX_ARG_LEN 1024 + +/* we reserve 1024 bytes for stdout and the same for stderr */ +#define MAX_OUT 1024 +#define MAX_ERR 1024 +#define JUNK 128 /* used to flush stdout and stderr */ + + +/*ARGSUSED*/ +void +mdmn_do_cmd(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + + /* + * We are given one string containing all the arguments + * For execvp() we have to regenerate the arguments again + */ + int arg; /* argument that is currently been built */ + int index; /* runs through arg above */ + int i; /* helper for for loop */ + char *argv[MAX_N_ARGS]; /* argument array for execvp */ + char *cp; /* runs through the given command line string */ + char *command = NULL; /* the command we call locally */ + int pout[2]; /* pipe for stdout */ + int perr[2]; /* pipe for stderr */ + pid_t pid; /* process id */ + + cp = msg->msg_event_data; + arg = 0; + index = 0; + + /* init the args array alloc the first one and null out the rest */ + argv[0] = Malloc(MAX_ARG_LEN); + for (i 
= 1; i < MAX_N_ARGS; i++) { + argv[i] = NULL; + } + + resp->mmr_comm_state = MDMNE_ACK; /* Ok state */; + + while (*cp != '\0') { + if (arg == MAX_N_ARGS) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "PANIC: too many arguments specified\n")); + resp->mmr_comm_state = MDMNE_HANDLER_FAILED; + goto out; + } + if (index == MAX_ARG_LEN) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "PANIC: argument too long\n")); + resp->mmr_comm_state = MDMNE_HANDLER_FAILED; + goto out; + } + + if ((*cp != ' ') && (*cp != '\t')) { + /* + * No space or tab: copy char into current + * argv and advance both pointers + */ + + argv[arg][index] = *cp; + cp++; /* next char in command line */ + index++; /* next char in argument */ + } else { + /* + * space or tab: terminate current argv, + * advance arg, reset pointer into arg, + * advance pointer in command line + */ + argv[arg][index] = '\0'; + arg++; /* next argument */ + argv[arg] = Malloc(MAX_ARG_LEN); + cp++; /* next char in command line */ + index = 0; /* starts at char 0 */ + } + } + /* terminate the last real argument */ + argv[arg][index] = '\0'; + /* the last argument is an NULL pointer */ + argv[++arg] = NULL; + if (pipe(pout) < 0) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "PANIC: pipe failed\n")); + resp->mmr_comm_state = MDMNE_HANDLER_FAILED; + goto out; + } + if (pipe(perr) < 0) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "PANIC: pipe failed\n")); + (void) close(pout[0]); + (void) close(pout[1]); + resp->mmr_comm_state = MDMNE_HANDLER_FAILED; + goto out; + } + command = Strdup(argv[0]); + (void) strcat(argv[0], ".rpc_call"); + pid = fork1(); + if (pid == (pid_t)-1) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "PANIC: fork failed\n")); + resp->mmr_comm_state = MDMNE_HANDLER_FAILED; + (void) close(pout[0]); + (void) close(pout[1]); + (void) close(perr[0]); + (void) close(perr[1]); + goto out; + } else if (pid == (pid_t)0) { + /* child */ + (void) close(0); + /* close the reading channels of pout 
and perr */ + (void) close(pout[0]); + (void) close(perr[0]); + /* redirect stdout */ + if (dup2(pout[1], 1) < 0) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "PANIC: dup2 failed\n")); + resp->mmr_comm_state = MDMNE_HANDLER_FAILED; + return; + } + + /* redirect stderr */ + if (dup2(perr[1], 2) < 0) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "PANIC: dup2 failed\n")); + resp->mmr_comm_state = MDMNE_HANDLER_FAILED; + return; + } + + (void) execvp(command, (char *const *)argv); + perror("execvp"); + _exit(1); + } else { + /* parent process */ + int stat_loc; + char *out, *err; /* for stdout and stderr of child */ + int i; /* index into the aboves */ + char junk[JUNK]; + int out_done = 0; + int err_done = 0; + int out_read = 0; + int err_read = 0; + int maxfd; + fd_set rset; + + + /* close the writing channels of pout and perr */ + (void) close(pout[1]); + (void) close(perr[1]); + resp->mmr_out = Malloc(MAX_OUT); + resp->mmr_err = Malloc(MAX_ERR); + resp->mmr_out_size = MAX_OUT; + resp->mmr_err_size = MAX_ERR; + out = resp->mmr_out; + err = resp->mmr_err; + FD_ZERO(&rset); + while ((out_done == 0) || (err_done == 0)) { + FD_SET(pout[0], &rset); + FD_SET(perr[0], &rset); + maxfd = max(pout[0], perr[0]) + 1; + (void) select(maxfd, &rset, NULL, NULL, NULL); + + /* + * Did the child produce some output to stdout? + * If so, read it until we either reach the end of the + * output or until we read MAX_OUT bytes. + * Whatever comes first. + * In case we already read MAX_OUT bytes we simply + * read away the output into a junk buffer. 
+ * Just to make the child happy + */ + if (FD_ISSET(pout[0], &rset)) { + if (MAX_OUT - out_read - 1 > 0) { + i = read(pout[0], out, + MAX_OUT - out_read); + out_read += i; + out += i; + } else { + /* buffer full, empty stdout */ + i = read(pout[0], junk, JUNK); + } + if (i == 0) { + /* stdout is closed by child */ + out_done++; + } + } + /* same comment as above | sed -e 's/stdout/stderr/' */ + if (FD_ISSET(perr[0], &rset)) { + if (MAX_ERR - err_read - 1 > 0) { + i = read(perr[0], err, + MAX_ERR - err_read); + err_read += i; + err += i; + } else { + /* buffer full, empty stderr */ + i = read(perr[0], junk, JUNK); + } + if (i == 0) { + /* stderr is closed by child */ + err_done++; + } + } + } + resp->mmr_out[out_read] = '\0'; + resp->mmr_err[err_read] = '\0'; + + while (waitpid(pid, &stat_loc, 0) < 0) { + if (errno != EINTR) { + resp->mmr_comm_state = MDMNE_HANDLER_FAILED; + break; + } + } + if (errno == 0) + resp->mmr_exitval = WEXITSTATUS(stat_loc); + + (void) close(pout[0]); + (void) close(perr[0]); + } +out: + for (i = 0; i < MAX_N_ARGS; i++) { + if (argv[i] != NULL) { + free(argv[i]); + } + } + if (command != NULL) { + Free(command); + } +} + +/* + * This is for checking if a metadevice is opened, and for + * locking in case it is not and for + * unlocking a locked device + */ +/*ARGSUSED*/ +void +mdmn_do_clu(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + if (msg->msg_type == MD_MN_MSG_CLU_CHECK) { + md_isopen_t *d; + int ret; + + resp->mmr_comm_state = MDMNE_ACK; /* Ok state */; + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + d = (md_isopen_t *)(void *)msg->msg_event_data; + ret = metaioctl(MD_IOCISOPEN, d, &(d->mde), NULL); + /* + * In case the ioctl succeeded, return the open state of + * the metadevice. Otherwise we return the error the ioctl + * produced. 
As this is not zero, no attempt is made to + * remove/rename the metadevice later + */ + + if (ret == 0) { + resp->mmr_exitval = d->isopen; + } else { + /* + * When doing a metaclear, one node after the other + * does the two steps: + * - check on all nodes if this md is opened. + * - remove the md locally. + * When the 2nd node asks all nodes if the md is + * open it starts with the first node. + * As this already removed the md, the check + * returns MDE_UNIT_NOT_SETUP. + * In order to not keep the 2nd node from proceeding, + * we map this to an Ok. + */ + if (mdismderror(&(d->mde), MDE_UNIT_NOT_SETUP)) { + mdclrerror(&(d->mde)); + ret = 0; + } + + resp->mmr_exitval = ret; + } + } +} + +/* handler for MD_MN_MSG_REQUIRE_OWNER */ +/*ARGSUSED*/ +void +mdmn_do_req_owner(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + md_set_mmown_params_t setown; + md_mn_req_owner_t *d; + int ret, n = 0; + + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + resp->mmr_comm_state = MDMNE_ACK; + d = (md_mn_req_owner_t *)(void *)msg->msg_event_data; + + (void) memset(&setown, 0, sizeof (setown)); + MD_SETDRIVERNAME(&setown, MD_MIRROR, MD_MIN2SET(d->mnum)) + setown.d.mnum = d->mnum; + setown.d.owner = d->owner; + + /* Retry ownership change if we get EAGAIN returned */ + while ((ret = metaioctl(MD_MN_SET_MM_OWNER, &setown, &setown.mde, NULL)) + != 0) { + md_sys_error_t *ip = + &setown.mde.info.md_error_info_t_u.sys_error; + if (ip->errnum != EAGAIN) { + break; + } + if (n++ >= 10) { + break; + } + (void) sleep(1); + } + + resp->mmr_exitval = ret; +} + +/* + * handler for MD_MN_MSG_CHOOSE_OWNER + * This is called when a mirror resync has no owner. The master node generates + * this message which is not broadcast to the other nodes. The message is + * required as the kernel does not have access to the nodelist for the set. 
+ */ +/*ARGSUSED*/ +void +mdmn_do_choose_owner(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + md_mn_msg_chowner_t chownermsg; + md_mn_msg_chooseid_t *d; + int ret = 0; + int nodecnt; + int nodeno; + uint_t nodeid; + uint_t myflags; + set_t setno; + mdsetname_t *sp; + md_set_desc *sd; + md_mnnode_desc *nd; + md_error_t mde = mdnullerror; + md_mn_result_t *resp1 = NULL; + + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + resp->mmr_comm_state = MDMNE_ACK; + d = (md_mn_msg_chooseid_t *)(void *)msg->msg_event_data; + + /* + * The node to be chosen will be the resync count for the set + * modulo the number of live nodes in the set + */ + setno = MD_MIN2SET(d->msg_chooseid_mnum); + if ((sp = metasetnosetname(setno, &mde)) == NULL) { + syslog(LOG_ERR, dgettext(TEXT_DOMAIN, + "MD_MN_MSG_CHOOSE_OWNER: Invalid setno %d\n"), setno); + resp->mmr_exitval = 1; + return; + } + if ((sd = metaget_setdesc(sp, &mde)) == NULL) { + syslog(LOG_ERR, dgettext(TEXT_DOMAIN, + "MD_MN_MSG_CHOOSE_OWNER: Invalid set pointer\n")); + resp->mmr_exitval = 1; + return; + } + + /* Count the number of live nodes */ + nodecnt = 0; + nd = sd->sd_nodelist; + while (nd) { + if (nd->nd_flags & MD_MN_NODE_ALIVE) + nodecnt++; + nd = nd->nd_next; + } + nodeno = (d->msg_chooseid_rcnt%nodecnt); + + /* + * If we've been called with msg_chooseid_set_node set TRUE then we + * are simply re-setting the owner id to ensure consistency across + * the cluster. + * If the flag is reset (B_FALSE) we are requesting a new owner to be + * determined. 
+ */ + if (d->msg_chooseid_set_node) { + nodeid = d->msg_chooseid_rcnt; + } else { + /* scan the nodelist looking for the required node */ + nodecnt = 0; + nd = sd->sd_nodelist; + while (nd) { + if (nd->nd_flags & MD_MN_NODE_ALIVE) { + if (nodecnt == nodeno) + break; + nodecnt++; + } + nd = nd->nd_next; + } + nodeid = nd->nd_nodeid; + } + + /* Send message to all nodes to make ownership change */ + chownermsg.msg_chowner_mnum = d->msg_chooseid_mnum; + chownermsg.msg_chowner_nodeid = nodeid; + myflags = MD_MSGF_NO_LOG; + + /* inherit some flags from the parent message */ + myflags |= msg->msg_flags & MD_MSGF_INHERIT_BITS; + + ret = mdmn_send_message(MD_MIN2SET(d->msg_chooseid_mnum), + MD_MN_MSG_CHANGE_OWNER, myflags, (char *)&chownermsg, + sizeof (chownermsg), &resp1, &mde); + if (resp1 != NULL) + free_result(resp1); + resp->mmr_exitval = ret; +} + +/* + * Handler for MD_MN_MSG_CHANGE_OWNER + * This is called when we are perfoming a resync and wish to change from + * no mirror owner to an owner chosen by the master. 
+ * This mesage is only relevant for the new owner, the message will be + * ignored by all other nodes + */ +/*ARGSUSED*/ +void +mdmn_do_change_owner(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + md_set_mmown_params_t setown; + md_mn_msg_chowner_t *d; + int ret = 0; + set_t setno; + mdsetname_t *sp; + md_set_desc *sd; + md_error_t mde = mdnullerror; + + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + resp->mmr_comm_state = MDMNE_ACK; + d = (md_mn_msg_chowner_t *)(void *)msg->msg_event_data; + + setno = MD_MIN2SET(d->msg_chowner_mnum); + if ((sp = metasetnosetname(setno, &mde)) == NULL) { + syslog(LOG_ERR, dgettext(TEXT_DOMAIN, + "MD_MN_MSG_CHANGE_OWNER: Invalid setno %d\n"), setno); + resp->mmr_exitval = 1; + return; + } + if ((sd = metaget_setdesc(sp, &mde)) == NULL) { + syslog(LOG_ERR, dgettext(TEXT_DOMAIN, + "MD_MN_MSG_CHANGE_OWNER: Invalid set pointer\n")); + resp->mmr_exitval = 1; + return; + } + + if (d->msg_chowner_nodeid == sd->sd_mn_mynode->nd_nodeid) { + /* + * If we are the chosen owner, issue ioctl to make the + * ownership change + */ + (void) memset(&setown, 0, sizeof (md_set_mmown_params_t)); + setown.d.mnum = d->msg_chowner_mnum; + setown.d.owner = d->msg_chowner_nodeid; + setown.d.flags = MD_MN_MM_SPAWN_THREAD; + MD_SETDRIVERNAME(&setown, MD_MIRROR, + MD_MIN2SET(d->msg_chowner_mnum)); + + /* + * Single shot at changing the the owner, if it fails EAGAIN, + * another node must have become the owner while we are in the + * process of making this choice. 
+ */ + + ret = metaioctl(MD_MN_SET_MM_OWNER, &setown, + &(setown.mde), NULL); + if (ret == EAGAIN) + ret = 0; + } + resp->mmr_exitval = ret; +} + +/* handler for MD_MN_MSG_SUSPEND_WRITES */ +/*ARGSUSED*/ +void +mdmn_do_susp_write(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + /* Suspend writes to a region of a mirror */ + md_suspend_wr_params_t suspwr_ioc; + md_mn_msg_suspwr_t *d; + int ret; + + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + resp->mmr_comm_state = MDMNE_ACK; + d = (md_mn_msg_suspwr_t *)(void *)msg->msg_event_data; + + (void) memset(&suspwr_ioc, 0, sizeof (md_suspend_wr_params_t)); + MD_SETDRIVERNAME(&suspwr_ioc, MD_MIRROR, + MD_MIN2SET(d->msg_suspwr_mnum)); + suspwr_ioc.mnum = d->msg_suspwr_mnum; + ret = metaioctl(MD_MN_SUSPEND_WRITES, &suspwr_ioc, + &(suspwr_ioc.mde), NULL); + resp->mmr_exitval = ret; +} + +/* + * handler for MD_MN_MSG_STATE_UPDATE_RESWR + * This functions update a submirror component state and then resumes writes + * to the mirror + */ +/*ARGSUSED*/ +void +mdmn_do_state_upd_reswr(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + /* Update the state of the component of a mirror */ + md_set_state_params_t setstate_ioc; + md_mn_msg_stch_t *d; + int ret; + + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + resp->mmr_comm_state = MDMNE_ACK; + d = (md_mn_msg_stch_t *)(void *)msg->msg_event_data; + + (void) memset(&setstate_ioc, 0, sizeof (md_set_state_params_t)); + MD_SETDRIVERNAME(&setstate_ioc, MD_MIRROR, + MD_MIN2SET(d->msg_stch_mnum)); + setstate_ioc.mnum = d->msg_stch_mnum; + setstate_ioc.sm = d->msg_stch_sm; + setstate_ioc.comp = d->msg_stch_comp; + setstate_ioc.state = d->msg_stch_new_state; + setstate_ioc.hs_id = d->msg_stch_hs_id; + ret = metaioctl(MD_MN_SET_STATE, &setstate_ioc, + &(setstate_ioc.mde), NULL); + resp->mmr_exitval = ret; +} + +/* + * submessage generator for MD_MN_MSG_STATE_UPDATE and 
MD_MN_MSG_STATE_UPDATE2 + * This generates 2 messages, the first is SUSPEND_WRITES and + * depending on the type of the original message the second one is + * either STATE_UPDATE_RESWR or STATE_UPDATE_RESWR2 which actually does + * the same, but runs on a higher class. + */ +int +mdmn_smgen_state_upd(md_mn_msg_t *msg, md_mn_msg_t *msglist[]) +{ + md_mn_msg_t *nmsg; + md_mn_msg_stch_t *d; + md_mn_msg_stch_t *stch_data; + md_mn_msg_suspwr_t *suspwr_data; + + d = (md_mn_msg_stch_t *)(void *)msg->msg_event_data; + + nmsg = Zalloc(sizeof (md_mn_msg_t)); + MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid)); + + nmsg->msg_flags = MD_MSGF_NO_LOG; /* Don't log submessages */ + nmsg->msg_setno = msg->msg_setno; + nmsg->msg_type = MD_MN_MSG_SUSPEND_WRITES; + nmsg->msg_event_size = sizeof (md_mn_msg_suspwr_t); + nmsg->msg_event_data = Zalloc(sizeof (md_mn_msg_suspwr_t)); + suspwr_data = (md_mn_msg_suspwr_t *)(void *)nmsg->msg_event_data; + suspwr_data->msg_suspwr_mnum = d->msg_stch_mnum; + msglist[0] = nmsg; + + nmsg = Zalloc(sizeof (md_mn_msg_t)); + MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid)); + + nmsg->msg_flags = MD_MSGF_NO_LOG; /* Don't log submessages */ + nmsg->msg_setno = msg->msg_setno; + if (msg->msg_type == MD_MN_MSG_STATE_UPDATE2) { + nmsg->msg_type = MD_MN_MSG_STATE_UPDATE_RESWR2; + } else { + nmsg->msg_type = MD_MN_MSG_STATE_UPDATE_RESWR; + } + nmsg->msg_event_size = sizeof (md_mn_msg_stch_t); + nmsg->msg_event_data = Zalloc(sizeof (md_mn_msg_stch_t)); + stch_data = (md_mn_msg_stch_t *)(void *)nmsg->msg_event_data; + stch_data->msg_stch_mnum = d->msg_stch_mnum; + stch_data->msg_stch_sm = d->msg_stch_sm; + stch_data->msg_stch_comp = d->msg_stch_comp; + stch_data->msg_stch_new_state = d->msg_stch_new_state; + stch_data->msg_stch_hs_id = d->msg_stch_hs_id; + msglist[1] = nmsg; + return (2); /* Return the number of submessages generated */ +} + +/* + * handler for MD_MN_MSG_ALLOCATE_HOTSPARE and MD_MN_MSG_ALLOCATE_HOTSPARE2 + * This sends a message to all nodes 
requesting them to allocate a hotspare + * for the specified component. The component is specified by the mnum of + * the mirror, the submirror index and the component index. + */ +/*ARGSUSED*/ +void +mdmn_do_allocate_hotspare(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + /* Allocate a hotspare for a mirror component */ + md_alloc_hotsp_params_t allochsp_ioc; + md_mn_msg_allochsp_t *d; + int ret; + + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + resp->mmr_comm_state = MDMNE_ACK; + d = (md_mn_msg_allochsp_t *)((void *)(msg->msg_event_data)); + + (void) memset(&allochsp_ioc, 0, + sizeof (md_alloc_hotsp_params_t)); + MD_SETDRIVERNAME(&allochsp_ioc, MD_MIRROR, + MD_MIN2SET(d->msg_allochsp_mnum)); + allochsp_ioc.mnum = d->msg_allochsp_mnum; + allochsp_ioc.sm = d->msg_allochsp_sm; + allochsp_ioc.comp = d->msg_allochsp_comp; + allochsp_ioc.hs_id = d->msg_allochsp_hs_id; + ret = metaioctl(MD_MN_ALLOCATE_HOTSPARE, &allochsp_ioc, + &(allochsp_ioc.mde), NULL); + resp->mmr_exitval = ret; +} + +/* + * handler for MD_MN_MSG_RESYNC_STARTING,MD_MN_MSG_RESYNC_FIRST, + * MD_MN_MSG_RESYNC_NEXT, MD_MN_MSG_RESYNC_FINISH, MD_MN_MSG_RESYNC_PHASE_DONE + */ +/*ARGSUSED*/ +void +mdmn_do_resync(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + md_mn_msg_resync_t *d; + md_mn_rs_params_t respar; + int ret; + int smi; + + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + resp->mmr_comm_state = MDMNE_ACK; + d = (md_mn_msg_resync_t *)((void *)(msg->msg_event_data)); + + (void) memset(&respar, 0, sizeof (respar)); + MD_SETDRIVERNAME(&respar, MD_MIRROR, + MD_MIN2SET(d->msg_resync_mnum)) + respar.msg_type = (int)msg->msg_type; + respar.mnum = d->msg_resync_mnum; + respar.rs_type = d->msg_resync_type; + respar.rs_start = d->msg_resync_start; + respar.rs_size = d->msg_resync_rsize; + respar.rs_done = d->msg_resync_done; + respar.rs_2_do = d->msg_resync_2_do; + 
respar.rs_originator = d->msg_originator; + respar.rs_flags = d->msg_resync_flags; + + for (smi = 0; smi < NMIRROR; smi++) { + respar.rs_sm_state[smi] = d->msg_sm_state[smi]; + respar.rs_sm_flags[smi] = d->msg_sm_flags[smi]; + } + + ret = metaioctl(MD_MN_RESYNC, &respar, &respar.mde, NULL); + + resp->mmr_exitval = ret; +} + +/* + * handler for MD_MN_MSG_SETSYNC + */ +/*ARGSUSED*/ +void +mdmn_do_setsync(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + md_mn_msg_setsync_t *d; + md_resync_ioctl_t ri; + int ret; + + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + resp->mmr_comm_state = MDMNE_ACK; + d = (md_mn_msg_setsync_t *)((void *)(msg->msg_event_data)); + + (void) memset(&ri, 0, sizeof (ri)); + MD_SETDRIVERNAME(&ri, MD_MIRROR, MD_MIN2SET(d->setsync_mnum)) + ri.ri_mnum = d->setsync_mnum; + ri.ri_copysize = d->setsync_copysize; + ri.ri_flags = d->setsync_flags; + + ret = metaioctl(MD_MN_SETSYNC, &ri, &ri.mde, NULL); + + resp->mmr_exitval = ret; +} + +/* + * handler for MD_MN_MSG_SET_CAP. As this handler can deal with both mirrors + * and soft partitions, the driver name that is required for the ioctl call + * is included in the message. 
+ */ +/*ARGSUSED*/ +void +mdmn_do_set_cap(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + md_mn_msg_setcap_t *d; + md_mn_setcap_params_t setcap_ioc; + minor_t mnum; + int ret; + + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + resp->mmr_comm_state = MDMNE_ACK; + d = (md_mn_msg_setcap_t *)((void *)(msg->msg_event_data)); + mnum = d->msg_setcap_mnum; + + (void) memset(&setcap_ioc, 0, sizeof (setcap_ioc)); + + MD_SETDRIVERNAME(&setcap_ioc, d->msg_setcap_driver, MD_MIN2SET(mnum)); + setcap_ioc.mnum = mnum; + setcap_ioc.sc_set = d->msg_setcap_set; + + ret = metaioctl(MD_MN_SET_CAP, &setcap_ioc, &setcap_ioc.mde, NULL); + + resp->mmr_exitval = ret; +} + +/* + * Dummy handler for various CLASS0 messages like + * MD_MN_MSG_VERBOSITY / MD_MN_MSG_RESUME / MD_MN_MSG_SUSPEND ... + */ +/*ARGSUSED*/ +void +mdmn_do_dummy(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + resp->mmr_exitval = 0; + resp->mmr_comm_state = MDMNE_ACK; +} + +/* + * Overall description of mdcommd support that keeps all nodes in-sync + * with the ondisk diskset mddbs. + * + * All configuration changes to the mddb - addition/deletion of metadevices + * or replicas must use a CLASS1 message to block out these changes. + * Changes to the state of existing replicas do not need to block CLASS1 + * since there is no conflict when just updating the state of a replica. + * + * Error encountered when master writes to mddbs: + * As the master updates parts of the mddbs, flags are updated describing + * what has been written. When all locks are dropped (either in + * mddb_setexit or mdioctl), a PARSE message will be generated to all + * nodes with an index list of known good mddbs and the parse flags. + * The master node ignore the parse message since it sent it. 
+ * The slave nodes re-read in the changed part of the mddb using the list + * of known good replicas that was passed. + * PARSE message does not block CLASS1. + * The PARSE message must be the highest class message. Since this + * message could be sent on any ioctl, this PARSE message class must + * be higher than any other class message that could issue an ioctl. + * + * Master Slave1 Slave2 + * Handles_error + * PARSE PARSE PARSE + * + * + * Add/Delete mddbs can occur from the following commands: + * metadb -s set_name -a/-d + * metaset -s set_name -a/-d disk + * metaset -s set_name -b + * + * The metadb/metaset command is run on the node executing the command + * and sends an ATTACH/DETACH message to the master node blocking CLASS1 + * messages on all nodes until this message is finished. The master + * node generates 3 submessages of BLOCK, SM_ATTACH/SM_DETACH, UNBLOCK. + * The BLOCK message is only run on the master node and will BLOCK + * the PARSE messages from being sent to the nodes. + * The SM_ATTACH/SM_DETACH message is run on all nodes and actually adds or + * removes the replica(s) from the given disk slice. + * The UNBLOCK message is only run on the master node and allows the + * sending of PARSE messages. + * + * Master Slave1 Slave2 + * Add mddb cmd + * ATTACH msg to master + * BLOCK + * ATTACH ATTACH ATTACH + * UNBLOCK + * PARSE PARSE PARSE + * ATTACH msg finished + * + * Add/Delete host side information from the following commands: + * metaset -s set_name -a/-d -h + * + * The metaset command is run on the node executing the command and + * sends a DB_NEWSIDE/DB_DELSIDE message and a MD_NEWSIDE/MD_DELSIDE + * message whenever a host is added to or deleted from the diskset. + * + * The side information contains the major name and minor number + * associated with a disk slice from a certain node's perspective + * in an (failed) effort to support clustered systems that don't have the + * same device name for a physical device. 
(The original designers of + * SVM eventually took the shortcut of assuming that all device names + * are the same on all systems, but left the side information in the + * mddb and namespace.) The side information is used for disk slices + * that contain mddbs and/or are components for metadevices. + * + * The DB_NEWSIDE/DELSIDE command adds or deletes the side information + * for each mddb for the host being added or deleted. + * The MD_ADDSIDE/MD_DELSIDE command adds or deletes the side information + * for all disk slice components that are in the namespace records for + * the host being added or deleted. + * + * The DB_NEWSIDE/DB_DELSIDE message does not change any mddb records + * and only needs to be executed on the master node since the slave + * nodes will be brought up to date by the PARSE message that is + * generated as a result of a change to the mddb. + * The MD_ADDSIDE/MD_DELSIDE message does modify the records in the mddb + * and needs to be run on all nodes. The message must block class1 + * messages so that record changing commands don't interfere. + * + * Master Slave1 Slave2 + * Add host + * DB_NEWSIDE msg to master + * DB_NEWSIDE + * PARSE PARSE PARSE + * DB_NEWSIDE msg finished + * MD_NEWSIDE msg to master + * MD_NEWSIDE MD_NEWSIDE MD_NEWSIDE + * MD_NEWSIDE msg finished + * + * + * Optimized resync record failure: + * When any node sees a failure to write an optimized resync record + * that node notifies the master node of the replica that failed. + * The master node handles the error and updates the rest of the + * nodes using a PARSE message. The PARSE message also calls + * fixoptrecord on each slave node causing each node to fix up + * the optimized resync records that are owned by that node (the mirror + * owner code also sets the optimized resync record owner). The master + * node will fix up all optimized resync records that have no owner or + * are owned by the master node. 
+ * + * Master Slave1 Slave2 + * Optimized Record Failure + * OPTRECERR msg to master + * Master handles opt rec failure + * PARSE PARSE PARSE + * OPTRECERR msg finished + * Slave rewrites optimized record + * + */ + +/* + * Handler for MD_MN_MSG_MDDB_PARSE which send parse messages to the + * slave nodes in order to keep the incore view of the mddbs the + * same on all nodes. + * + * Since master node generated the mddb parse message, do nothing + * if this is the master node. + * + * If this is a slave node, send the parse message down to the kernel + * where this node will re-read in parts of the mddbs. + * + */ +void +mdmn_do_mddb_parse(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + md_mn_msg_mddb_parse_t *d; + mddb_parse_parm_t mpp; + int ret = 0; + int i; + + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + resp->mmr_comm_state = MDMNE_ACK; + d = (md_mn_msg_mddb_parse_t *)((void *)(msg->msg_event_data)); + + if (flags & MD_MSGF_ON_MASTER) + return; + + (void) memset(&mpp, 0, sizeof (mpp)); + mpp.c_setno = msg->msg_setno; + mpp.c_parse_flags = d->msg_parse_flags; + for (i = 0; i < MDDB_NLB; i++) { + mpp.c_lb_flags[i] = d->msg_lb_flags[i]; + } + ret = metaioctl(MD_MN_MDDB_PARSE, &mpp, &mpp.c_mde, NULL); + if (ret) + (void) mdstealerror(&(resp->mmr_ep), &mpp.c_mde); + + resp->mmr_exitval = ret; +} + +/* + * Handler for MD_MN_MSG_MDDB_BLOCK which blocks the generation + * of parse messages from this node. + * + * This is needed when attaching/detaching mddbs on the master and the + * slave node is unable to handle a parse message until the slave node + * has done the attach/detach of the mddbs. So, master node will block + * the parse messages, execute the attach/detach on all nodes and + * then unblock the parse messages which causes the parse message to + * be sent to all nodes. 
+ */ +/*ARGSUSED*/ +void +mdmn_do_mddb_block(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + md_mn_msg_mddb_block_t *d; + mddb_block_parm_t mbp; + int ret; + + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + resp->mmr_comm_state = MDMNE_ACK; + d = (md_mn_msg_mddb_block_t *)((void *)(msg->msg_event_data)); + + (void) memset(&mbp, 0, sizeof (mbp)); + mbp.c_setno = msg->msg_setno; + mbp.c_blk_flags = d->msg_block_flags; + ret = metaioctl(MD_MN_MDDB_BLOCK, &mbp, &mbp.c_mde, NULL); + if (ret) + (void) mdstealerror(&(resp->mmr_ep), &mbp.c_mde); + + resp->mmr_exitval = ret; +} + +/* + * Submessage generator for MD_MN_MSG_META_DB_ATTACH which generates + * a BLOCK message on the master node only, a MD_MN_MSG_SM_MDDB_ATTACH + * message on all nodes and then an UNBLOCK message on the master only. + */ +int +mdmn_smgen_mddb_attach(md_mn_msg_t *msg, md_mn_msg_t *msglist[]) +{ + md_mn_msg_t *nmsg; + md_mn_msg_meta_db_attach_t *d; + md_mn_msg_meta_db_attach_t *attach_d; + md_mn_msg_mddb_block_t *block_d; + + d = (md_mn_msg_meta_db_attach_t *)(void *)msg->msg_event_data; + + nmsg = Zalloc(sizeof (md_mn_msg_t)); + MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid)); + + nmsg->msg_flags = (MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST); + nmsg->msg_setno = msg->msg_setno; + nmsg->msg_type = MD_MN_MSG_MDDB_BLOCK; + nmsg->msg_event_size = sizeof (md_mn_msg_mddb_block_t); + nmsg->msg_event_data = Zalloc(sizeof (md_mn_msg_mddb_block_t)); + block_d = (md_mn_msg_mddb_block_t *)(void *)nmsg->msg_event_data; + block_d->msg_block_flags = MDDB_BLOCK_PARSE; + msglist[0] = nmsg; + + nmsg = Zalloc(sizeof (md_mn_msg_t)); + MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid)); + + /* Don't log submessages and panic on inconsistent results */ + nmsg->msg_flags = MD_MSGF_NO_LOG | + MD_MSGF_PANIC_WHEN_INCONSISTENT; + nmsg->msg_setno = msg->msg_setno; + nmsg->msg_type = MD_MN_MSG_SM_MDDB_ATTACH; + nmsg->msg_event_size = sizeof 
(md_mn_msg_meta_db_attach_t); + nmsg->msg_event_data = Zalloc(sizeof (md_mn_msg_meta_db_attach_t)); + attach_d = (md_mn_msg_meta_db_attach_t *) + (void *)nmsg->msg_event_data; + attach_d->msg_l_dev = d->msg_l_dev; + attach_d->msg_cnt = d->msg_cnt; + attach_d->msg_dbsize = d->msg_dbsize; + (void) strncpy(attach_d->msg_dname, d->msg_dname, 16); + attach_d->msg_splitname = d->msg_splitname; + attach_d->msg_options = d->msg_options; + msglist[1] = nmsg; + + nmsg = Zalloc(sizeof (md_mn_msg_t)); + MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid)); + + nmsg->msg_flags = (MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST); + nmsg->msg_setno = msg->msg_setno; + nmsg->msg_type = MD_MN_MSG_MDDB_BLOCK; + nmsg->msg_event_size = sizeof (md_mn_msg_mddb_block_t); + nmsg->msg_event_data = Zalloc(sizeof (md_mn_msg_mddb_block_t)); + block_d = (md_mn_msg_mddb_block_t *)(void *)nmsg->msg_event_data; + block_d->msg_block_flags = MDDB_UNBLOCK_PARSE; + msglist[2] = nmsg; + + return (3); /* Return the number of submessages generated */ +} + +/* + * Submessage generator for MD_MN_MSG_META_DB_DETACH which generates + * a BLOCK message on the master node only, a MD_MN_MSG_SM_MDDB_DETACH + * message on all nodes and then an UNBLOCK message on the master only. 
+ */ +int +mdmn_smgen_mddb_detach(md_mn_msg_t *msg, md_mn_msg_t *msglist[]) +{ + md_mn_msg_t *nmsg; + md_mn_msg_meta_db_detach_t *d; + md_mn_msg_meta_db_detach_t *detach_d; + md_mn_msg_mddb_block_t *block_d; + + d = (md_mn_msg_meta_db_detach_t *)(void *)msg->msg_event_data; + + nmsg = Zalloc(sizeof (md_mn_msg_t)); + MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid)); + + nmsg->msg_flags = (MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST); + nmsg->msg_setno = msg->msg_setno; + nmsg->msg_type = MD_MN_MSG_MDDB_BLOCK; + nmsg->msg_event_size = sizeof (md_mn_msg_mddb_block_t); + nmsg->msg_event_data = Zalloc(sizeof (md_mn_msg_mddb_block_t)); + block_d = (md_mn_msg_mddb_block_t *)(void *)nmsg->msg_event_data; + block_d->msg_block_flags = MDDB_BLOCK_PARSE; + msglist[0] = nmsg; + + nmsg = Zalloc(sizeof (md_mn_msg_t)); + MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid)); + + /* Don't log submessages and panic on inconsistent results */ + nmsg->msg_flags = MD_MSGF_NO_LOG | + MD_MSGF_PANIC_WHEN_INCONSISTENT; + nmsg->msg_setno = msg->msg_setno; + nmsg->msg_type = MD_MN_MSG_SM_MDDB_DETACH; + nmsg->msg_event_size = sizeof (md_mn_msg_meta_db_detach_t); + nmsg->msg_event_data = Zalloc(sizeof (md_mn_msg_meta_db_detach_t)); + detach_d = (md_mn_msg_meta_db_detach_t *) + (void *)nmsg->msg_event_data; + detach_d->msg_splitname = d->msg_splitname; + msglist[1] = nmsg; + + nmsg = Zalloc(sizeof (md_mn_msg_t)); + MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid)); + + nmsg->msg_flags = (MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST); + nmsg->msg_setno = msg->msg_setno; + nmsg->msg_type = MD_MN_MSG_MDDB_BLOCK; + nmsg->msg_event_size = sizeof (md_mn_msg_mddb_block_t); + nmsg->msg_event_data = Zalloc(sizeof (md_mn_msg_mddb_block_t)); + block_d = (md_mn_msg_mddb_block_t *)(void *)nmsg->msg_event_data; + block_d->msg_block_flags = MDDB_UNBLOCK_PARSE; + msglist[2] = nmsg; + + return (3); /* Return the number of submessages generated */ +} + +/* + * Handler for MD_MN_MSG_SM_MDDB_ATTACH which is used to attach mddbs. 
+ * + * Used when running: + * metadb -s set_name -a + * metaset -s set_name -a/-d disk + * metaset -s set_name -b + */ +/*ARGSUSED*/ +void +mdmn_do_sm_mddb_attach(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + md_mn_msg_meta_db_attach_t *d; + struct mddb_config c; + int i; + int ret = 0; + md_error_t ep = mdnullerror; + char *name, *add_name; + mdname_t *np; + mdsetname_t *sp; + + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + resp->mmr_comm_state = MDMNE_ACK; + d = (md_mn_msg_meta_db_attach_t *)((void *)(msg->msg_event_data)); + + (void) memset(&c, 0, sizeof (c)); + c.c_setno = msg->msg_setno; + c.c_locator.l_dev = meta_cmpldev(d->msg_l_dev); + (void) strncpy(c.c_locator.l_driver, d->msg_dname, + sizeof (c.c_locator.l_driver)); + c.c_devname = d->msg_splitname; + c.c_locator.l_mnum = meta_getminor(d->msg_l_dev); + c.c_multi_node = 1; + if ((sp = metasetnosetname(c.c_setno, &ep)) == NULL) { + (void) mdstealerror(&(resp->mmr_ep), &ep); + resp->mmr_exitval = -1; + return; + } + (void) strcpy(c.c_setname, sp->setname); + c.c_sideno = getmyside(sp, &ep); + if (c.c_sideno == MD_SIDEWILD) { + (void) mdstealerror(&(resp->mmr_ep), &ep); + resp->mmr_exitval = -1; + return; + } + + name = splicename(&d->msg_splitname); + if ((np = metaname(&sp, name, &ep)) == NULL) { + Free(name); + (void) mdstealerror(&(resp->mmr_ep), &ep); + resp->mmr_exitval = -1; + return; + } + /* + * All nodes in MN diskset must do meta_check_replica + * since this causes the shared namespace to be + * populated by the md driver names while checking + * to see if this device is already in use as a + * metadevice. 
+ */ + if (meta_check_replica(sp, np, d->msg_options, 0, + (d->msg_cnt * d->msg_dbsize), &ep)) { + (void) mdstealerror(&(resp->mmr_ep), &ep); + resp->mmr_exitval = -1; + return; + } + + for (i = 0; i < d->msg_cnt; i++) { + c.c_locator.l_blkno = i * d->msg_dbsize + 16; + if (setup_med_cfg(sp, &c, + (d->msg_options & MDCHK_SET_FORCE), &ep)) { + ret = -1; + (void) mdstealerror(&(resp->mmr_ep), &ep); + break; + } + ret = metaioctl(MD_DB_NEWDEV, &c, &c.c_mde, NULL); + /* If newdev was successful, continue with attach */ + if (ret == 0) { + if (meta_db_addsidenms(sp, np, c.c_locator.l_blkno, + DB_ADDSIDENMS_NO_BCAST, &ep)) { + ret = -1; + (void) mdstealerror(&(resp->mmr_ep), &ep); + break; + } + } else { + (void) mdstealerror(&(resp->mmr_ep), &c.c_mde); + break; + } + } + add_name = splicename(&d->msg_splitname); + if ((np = metaname(&sp, add_name, &ep)) != NULL) { + meta_invalidate_name(np); + } else { + ret = -1; + (void) mdstealerror(&(resp->mmr_ep), &ep); + } + Free(add_name); + + resp->mmr_exitval = ret; +} + +/* + * Handler for MD_MN_MSG_SM_MDDB_DETACH which is used to detach mddbs. 
+ * + * Used when running: + * metadb -s set_name -d + * metaset -s set_name -a/-d disk + * metaset -s set_name -b + */ +/*ARGSUSED*/ +void +mdmn_do_sm_mddb_detach(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + md_mn_msg_meta_db_detach_t *d; + struct mddb_config c; + int i; + int ret = 0; + md_error_t ep = mdnullerror; + char *name, *del_name; + mdname_t *np; + mdsetname_t *sp; + + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + resp->mmr_comm_state = MDMNE_ACK; + d = (md_mn_msg_meta_db_detach_t *)((void *)(msg->msg_event_data)); + + if ((sp = metasetnosetname(msg->msg_setno, &ep)) == NULL) { + (void) mdstealerror(&(resp->mmr_ep), &ep); + resp->mmr_exitval = -1; + return; + } + + (void) memset(&c, 0, sizeof (c)); + c.c_setno = msg->msg_setno; + if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { + resp->mmr_exitval = -1; + (void) mdstealerror(&(resp->mmr_ep), &c.c_mde); + return; + } + i = 0; + del_name = splicename(&d->msg_splitname); + while (i < c.c_dbcnt) { + c.c_id = i; + if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { + ret = -1; + (void) mdstealerror(&(resp->mmr_ep), &c.c_mde); + break; + } + name = splicename(&c.c_devname); + if (strcmp(name, del_name) != 0) { + Free(name); + i++; + continue; + } + Free(name); + /* Found a match - delete mddb */ + if (metaioctl(MD_DB_DELDEV, &c, &c.c_mde, NULL) != 0) { + ret = -1; + (void) mdstealerror(&(resp->mmr_ep), &c.c_mde); + break; + } + /* Not incrementing "i" intentionally (dbcnt is changed) */ + } + if ((np = metaname(&sp, del_name, &ep)) != NULL) { + meta_invalidate_name(np); + } else { + ret = -1; + (void) mdstealerror(&(resp->mmr_ep), &ep); + } + Free(del_name); + + resp->mmr_exitval = ret; +} + +/* + * Handler for MD_MN_MSG_META_DB_NEWSIDE which is used to update the + * side information for each diskset mddb when a new host has been + * added to the diskset. 
The side information is the /dev/dsk/ctds name + * that the new node would use to access each mddb. + * + * Since this routine makes no changes to the records in the diskset mddb, + * this routine only needs to be run on the master node. The master node's + * kernel code will detect that portions of the mddb have changed and + * will send a parse message to all nodes to re-parse parts of the mddb. + * + * Used when running: + * metaset -s set_name -a -h new_hostname + */ +/*ARGSUSED*/ +void +mdmn_do_meta_db_newside(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + md_mn_msg_meta_db_newside_t *d; + struct mddb_config c; + int ret = 0; + mdsetname_t *sp; + md_error_t ep = mdnullerror; + + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + resp->mmr_comm_state = MDMNE_ACK; + d = (md_mn_msg_meta_db_newside_t *)((void *)(msg->msg_event_data)); + + (void) memset(&c, 0, sizeof (c)); + c.c_setno = msg->msg_setno; + c.c_locator.l_dev = meta_cmpldev(d->msg_l_dev); + c.c_locator.l_blkno = d->msg_blkno; + (void) strncpy(c.c_locator.l_driver, d->msg_dname, + sizeof (c.c_locator.l_driver)); + c.c_devname = d->msg_splitname; + c.c_locator.l_mnum = d->msg_mnum; + c.c_multi_node = 1; + if ((sp = metasetnosetname(c.c_setno, &ep)) == NULL) { + (void) mdstealerror(&(resp->mmr_ep), &ep); + resp->mmr_exitval = -1; + return; + } + (void) strcpy(c.c_setname, sp->setname); + c.c_sideno = d->msg_sideno; + + if ((ret = metaioctl(MD_DB_NEWSIDE, &c, &c.c_mde, NULL)) != 0) { + (void) mdstealerror(&(resp->mmr_ep), &c.c_mde); + } + resp->mmr_exitval = ret; +} + +/* + * Handler for MD_MN_MSG_META_DB_DELSIDE which is used to remove the + * side information for each diskset mddb when a host has been + * deleted from the diskset. The side information is the /dev/dsk/ctds name + * that the node would use to access each mddb. 
+ * + * Since this routine makes no changes to the records in the diskset mddb, + * this routine only needs to be run on the master node. The master node's + * kernel code will detect that portions of the mddb have changed and + * will send a parse message to all nodes to re-parse parts of the mddb. + * + * Used when running: + * metaset -s set_name -d -h hostname + */ +/*ARGSUSED*/ +void +mdmn_do_meta_db_delside(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + md_mn_msg_meta_db_delside_t *d; + mddb_config_t c; + int ret = 0; + mdsetname_t *sp; + md_error_t ep = mdnullerror; + + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + resp->mmr_comm_state = MDMNE_ACK; + d = (md_mn_msg_meta_db_delside_t *)((void *)(msg->msg_event_data)); + + (void) memset(&c, 0, sizeof (c)); + c.c_setno = msg->msg_setno; + c.c_locator.l_dev = meta_cmpldev(d->msg_l_dev); + c.c_locator.l_blkno = d->msg_blkno; + c.c_multi_node = 1; + if ((sp = metasetnosetname(c.c_setno, &ep)) == NULL) { + (void) mdstealerror(&(resp->mmr_ep), &ep); + resp->mmr_exitval = -1; + return; + } + (void) strcpy(c.c_setname, sp->setname); + c.c_sideno = d->msg_sideno; + + if ((ret = metaioctl(MD_DB_DELSIDE, &c, &c.c_mde, NULL)) != 0) { + (void) mdstealerror(&(resp->mmr_ep), &c.c_mde); + } + resp->mmr_exitval = ret; +} + +/* + * Handler for MD_MN_MSG_META_MD_ADDSIDE which is used to add the + * side information for each diskset metadevice component (if that + * component is a disk) when a host has been added to the diskset. + * The side information is the /dev/dsk/ctds name that the node would + * use to access the metadevice component. + * + * This routine makes changes to the mddb records and must be run + * on all nodes. 
+ * + * Used when running: + * metaset -s set_name -a -h new_hostname + */ +/*ARGSUSED*/ +void +mdmn_do_meta_md_addside(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + md_mn_msg_meta_md_addside_t *d; + mdnm_params_t nm; + mdsetname_t *sp; + char *cname, *dname; + minor_t mnum; + int done, i; + md_error_t ep = mdnullerror; + + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + resp->mmr_comm_state = MDMNE_ACK; + d = (md_mn_msg_meta_md_addside_t *)((void *)(msg->msg_event_data)); + + (void) memset(&nm, 0, sizeof (nm)); + if ((sp = metasetnosetname(msg->msg_setno, &ep)) == NULL) { + (void) mdstealerror(&(resp->mmr_ep), &ep); + resp->mmr_exitval = -1; + return; + } + /* While loop continues until IOCNXTKEY_NM gives nm.key of KEYWILD */ + /*CONSTCOND*/ + while (1) { + nm.mde = mdnullerror; + nm.setno = msg->msg_setno; + nm.side = d->msg_otherside; + if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0) { + (void) mdstealerror(&(resp->mmr_ep), &nm.mde); + resp->mmr_exitval = -1; + return; + } + + /* Normal exit path is to eventually get a KEYWILD */ + if (nm.key == MD_KEYWILD) { + resp->mmr_exitval = 0; + return; + } + + nm.devname = (uint64_t)meta_getnmbykey(msg->msg_setno, + d->msg_otherside, nm.key, &ep); + if (nm.devname == NULL) { + (void) mdstealerror(&(resp->mmr_ep), &ep); + resp->mmr_exitval = -1; + return; + } + nm.side = d->msg_sideno; + if ((done = meta_getside_devinfo(sp, (char *)nm.devname, + d->msg_sideno, &cname, &dname, &mnum, &ep)) == -1) { + (void) mdstealerror(&(resp->mmr_ep), &ep); + Free((void *)nm.devname); + resp->mmr_exitval = -1; + return; + } + Free((void *)nm.devname); + if (done != 1) { + Free(cname); + Free(dname); + resp->mmr_exitval = -1; + return; + } + + /* + * The device reference count can be greater than 1 if + * more than one softpart is configured on top of the + * same device. If this is the case then we want to + * increment the count to sync up with the other sides. 
+ */ + for (i = 0; i < nm.ref_count; i++) { + if (add_name(sp, d->msg_sideno, nm.key, dname, mnum, + cname, &ep) == -1) { + (void) mdstealerror(&(resp->mmr_ep), &ep); + Free(cname); + Free(dname); + resp->mmr_exitval = -1; + return; + } + } + Free(cname); + Free(dname); + } + + /*NOTREACHED*/ +} +/* + * Handler for MD_MN_MSG_META_MD_DELSIDE which is used to delete the + * side information for each diskset metadevice component (if that + * component is a disk) when a host has been removed from the diskset. + * The side information is the /dev/dsk/ctds name that the node would + * use to access the metadevice component. + * + * This routine makes changes to the mddb records and must be run + * on all nodes. + * + * Used when running: + * metaset -s set_name -d -h hostname + */ +/*ARGSUSED*/ +void +mdmn_do_meta_md_delside(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + md_mn_msg_meta_md_delside_t *d; + mdnm_params_t nm; + mdsetname_t *sp; + md_error_t ep = mdnullerror; + int i; + + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + resp->mmr_comm_state = MDMNE_ACK; + d = (md_mn_msg_meta_md_delside_t *)((void *)(msg->msg_event_data)); + + if ((sp = metasetnosetname(msg->msg_setno, &ep)) == NULL) { + (void) mdstealerror(&(resp->mmr_ep), &ep); + resp->mmr_exitval = -1; + return; + } + + (void) memset(&nm, 0, sizeof (nm)); + nm.key = MD_KEYWILD; + /*CONSTCOND*/ + while (1) { + nm.mde = mdnullerror; + nm.setno = msg->msg_setno; + nm.side = MD_SIDEWILD; + if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0) { + (void) mdstealerror(&(resp->mmr_ep), &nm.mde); + resp->mmr_exitval = -1; + return; + } + + /* Normal exit path is to eventually get a KEYWILD */ + if (nm.key == MD_KEYWILD) { + resp->mmr_exitval = 0; + return; + } + + /* + * The device reference count can be greater than 1 if + * more than one softpart is configured on top of the + * same device. 
If this is the case then we want to + * decrement the count to zero so the entry can be + * actually removed. + */ + for (i = 0; i < nm.ref_count; i++) { + if (del_name(sp, d->msg_sideno, nm.key, &ep) == -1) { + (void) mdstealerror(&(resp->mmr_ep), &ep); + resp->mmr_exitval = -1; + return; + } + } + } + + /*NOTREACHED*/ +} + +/* + * Handler for MD_MN_MSG_MDDB_OPTRECERR which is used to notify + * the master node that a node has seen an error when attempting to + * write to the optimized resync records that reside on 2 of the diskset + * mddbs. Master node will mark the failed replica in error and this + * will send a parse message to all nodes to re-read parts of the mddb + * and to fix their optimized resync records based on this information. + */ +/*ARGSUSED*/ +void +mdmn_do_mddb_optrecerr(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + md_mn_msg_mddb_optrecerr_t *d; + mddb_optrec_parm_t mop; + int ret; + int i; + + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + resp->mmr_comm_state = MDMNE_ACK; + d = (md_mn_msg_mddb_optrecerr_t *)((void *)(msg->msg_event_data)); + + (void) memset(&mop, 0, sizeof (mop)); + mop.c_setno = msg->msg_setno; + for (i = 0; i < 2; i++) { + mop.c_recerr[i] = d->msg_recerr[i]; + } + ret = metaioctl(MD_MN_MDDB_OPTRECFIX, &mop, &mop.c_mde, NULL); + if (ret) + (void) mdstealerror(&(resp->mmr_ep), &mop.c_mde); + + resp->mmr_exitval = ret; +} + +int +mdmn_smgen_test6(md_mn_msg_t *msg, md_mn_msg_t **msglist) +{ + md_mn_msg_t *nmsg; + + nmsg = Zalloc(sizeof (md_mn_msg_t)); + MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid)); + + nmsg->msg_flags = MD_MSGF_NO_LOG; /* Don't log submessages */ + nmsg->msg_setno = msg->msg_setno; + nmsg->msg_type = MD_MN_MSG_TEST2; + nmsg->msg_event_size = sizeof ("test2"); + nmsg->msg_event_data = Strdup("test2"); + msglist[0] = nmsg; + + nmsg = Zalloc(sizeof (md_mn_msg_t)); + MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid)); + + nmsg->msg_flags = 
MD_MSGF_NO_LOG; /* Don't log submessages */ + nmsg->msg_setno = msg->msg_setno; + nmsg->msg_type = MD_MN_MSG_TEST2; + nmsg->msg_event_size = sizeof ("test2"); + nmsg->msg_event_data = Strdup("test2"); + msglist[1] = nmsg; + + nmsg = Zalloc(sizeof (md_mn_msg_t)); + MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid)); + + nmsg->msg_flags = MD_MSGF_NO_LOG; /* Don't log submessages */ + nmsg->msg_setno = msg->msg_setno; + nmsg->msg_type = MD_MN_MSG_TEST3; + nmsg->msg_event_size = sizeof ("test3"); + nmsg->msg_event_data = Strdup("test3"); + msglist[2] = nmsg; + + nmsg = Zalloc(sizeof (md_mn_msg_t)); + MSGID_COPY(&(msg->msg_msgid), &(nmsg->msg_msgid)); + + nmsg->msg_flags = MD_MSGF_NO_LOG; /* Don't log submessages */ + nmsg->msg_setno = msg->msg_setno; + nmsg->msg_type = MD_MN_MSG_TEST4; + nmsg->msg_event_size = sizeof ("test4"); + nmsg->msg_event_data = Strdup("test4"); + msglist[3] = nmsg; + + return (4); /* Return the number of submessages generated */ +} + +/* + * This is to send an MD_IOCSET ioctl to all nodes to create a soft + * partition. 
+ */ +/*ARGSUSED*/ +void +mdmn_do_iocset(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + md_mn_msg_iocset_t *d; + int ret; + set_t setno; + mdsetname_t *sp; + mdname_t *np; + md_error_t mde = mdnullerror; + + resp->mmr_comm_state = MDMNE_ACK; /* Ok state */; + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + d = (md_mn_msg_iocset_t *)(void *)msg->msg_event_data; + + setno = MD_MIN2SET(d->iocset_params.mnum); + if ((sp = metasetnosetname(setno, &mde)) == NULL) { + syslog(LOG_ERR, dgettext(TEXT_DOMAIN, + "MD_MN_MSG_IOCSET: Invalid setno %d\n"), setno); + resp->mmr_exitval = 1; + return; + } + + if ((np = metamnumname(&sp, d->iocset_params.mnum, 1, &mde)) == NULL) { + syslog(LOG_ERR, dgettext(TEXT_DOMAIN, + "MD_MN_MSG_IOCSET: Invalid mnum %d\n"), + d->iocset_params.mnum); + resp->mmr_exitval = 1; + return; + } + + if (meta_init_make_device(&sp, np->cname, &mde) == -1) { + syslog(LOG_ERR, dgettext(TEXT_DOMAIN, + "MD_MN_MSG_IOCSET: Invalid metadevice name %s\n"), + np->cname); + resp->mmr_exitval = 1; + return; + } + + d->iocset_params.mdp = (uint64_t)&d->unit; /* set pointer to unit */ + ret = metaioctl(MD_IOCSET, &(d->iocset_params), &mde, np->cname); + resp->mmr_exitval = ret; +} + +/* + * This is to update the status of a softpart + */ +/*ARGSUSED*/ +void +mdmn_do_sp_setstat(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + md_mn_msg_sp_setstat_t *d; + int ret; + set_t setno; + mdsetname_t *sp; + minor_t mnum; + md_error_t mde = mdnullerror; + + resp->mmr_comm_state = MDMNE_ACK; /* Ok state */; + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + d = (md_mn_msg_sp_setstat_t *)(void *)msg->msg_event_data; + + mnum = d->sp_setstat_mnum; + setno = MD_MIN2SET(mnum); + if ((sp = metasetnosetname(setno, &mde)) == NULL) { + syslog(LOG_ERR, dgettext(TEXT_DOMAIN, + "MD_MN_MSG_IOCSET: Invalid setno %d\n"), setno); + resp->mmr_exitval = 1; + return; + } + + 
ret = meta_sp_setstatus(sp, &mnum, 1, d->sp_setstat_status, &mde); + resp->mmr_exitval = ret; +} + +/* + * This is to add a key to the namespace + */ +/*ARGSUSED*/ +void +mdmn_do_addkeyname(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + md_mn_msg_addkeyname_t *d; + int ret; + set_t setno; + mdsetname_t *sp; + md_error_t mde = mdnullerror; + mdname_t *compnp; + + resp->mmr_comm_state = MDMNE_ACK; /* Ok state */; + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + d = (md_mn_msg_addkeyname_t *)(void *)msg->msg_event_data; + + setno = d->addkeyname_setno; + if ((sp = metasetnosetname(setno, &mde)) == NULL) { + syslog(LOG_ERR, dgettext(TEXT_DOMAIN, + "MD_MN_ADDKEYNAME: Invalid setno %d\n"), setno); + resp->mmr_exitval = -1; + return; + } + + compnp = metaname(&sp, d->addkeyname_name, &mde); + if (compnp != NULL) { + ret = add_key_name(sp, compnp, NULL, &mde); + if (ret < 0) + resp->mmr_exitval = -1; + else + resp->mmr_exitval = compnp->key; + } else { + resp->mmr_exitval = -1; + } +} + +/* + * This is to delete a key from the namespace + */ +/*ARGSUSED*/ +void +mdmn_do_delkeyname(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + md_mn_msg_delkeyname_t *d; + int ret; + set_t setno; + mdsetname_t *sp; + md_error_t mde = mdnullerror; + mdname_t *compnp; + + resp->mmr_comm_state = MDMNE_ACK; /* Ok state */; + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + d = (md_mn_msg_delkeyname_t *)(void *)msg->msg_event_data; + + setno = d->delkeyname_setno; + if ((sp = metasetnosetname(setno, &mde)) == NULL) { + syslog(LOG_ERR, dgettext(TEXT_DOMAIN, + "MD_MN_DELKEYNAME: Invalid setno %d\n"), setno); + resp->mmr_exitval = -1; + return; + } + + compnp = metadevname(&sp, d->delkeyname_dev, &mde); + if (compnp != NULL) { + /* + * Reset the key value for the name. 
This is required because + * any previous call of del_key_name for the same component + * will have resulted in the key value being reset to MD_KEYBAD + * even though there may still be references to this component. + */ + compnp->key = d->delkeyname_key; + ret = del_key_name(sp, compnp, &mde); + resp->mmr_exitval = ret; + } else { + resp->mmr_exitval = -1; + } +} + +/* + * This is to get the value of tstate from the master node. We use this + * to get the ABR state of a metadevice from the master. + */ +/*ARGSUSED*/ +void +mdmn_do_get_tstate(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + md_mn_msg_gettstate_t *d; + int ret; + uint_t tstate; + md_error_t mde = mdnullerror; + + resp->mmr_comm_state = MDMNE_ACK; /* Ok state */; + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + d = (md_mn_msg_gettstate_t *)(void *)msg->msg_event_data; + + ret = meta_get_tstate(d->gettstate_dev, &tstate, &mde); + if (ret != 0) { + syslog(LOG_ERR, dgettext(TEXT_DOMAIN, + "MD_MN_GET_TSTATE: Invalid dev %llx\n"), d->gettstate_dev); + tstate = 0; + } + resp->mmr_exitval = tstate; +} + +/* + * This is to get the mirror ABR state and the state of its submirrors from + * the master node. We need this to ensure consistent output from metastat + * when a new node joins the cluster during a resync. Without this the + * submirror status will be incorrect until the whole resync is complete which + * may take days for very large metadevices. 
+ */ +/*ARGSUSED*/ +void +mdmn_do_get_mirstate(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + md_mn_msg_mir_state_t *d; + md_mn_msg_mir_state_res_t *res; /* Results */ + set_t setno; + mdsetname_t *sp; /* Set name */ + mdname_t *mirnp; /* Mirror name */ + md_error_t mde = mdnullerror; + mm_unit_t *mm; /* Mirror */ + int smi; + uint_t tstate; + + resp->mmr_comm_state = MDMNE_ACK; + resp->mmr_out_size = sizeof (md_mn_msg_mir_state_res_t); + resp->mmr_err_size = 0; + resp->mmr_out = Malloc(resp->mmr_out_size); + resp->mmr_err = NULL; + d = (md_mn_msg_mir_state_t *)(void *)msg->msg_event_data; + res = (md_mn_msg_mir_state_res_t *)(void *)resp->mmr_out; + + /* Validate set information from minor number */ + setno = MD_MIN2SET(d->mir_state_mnum); + sp = metasetnosetname(setno, &mde); + if (sp == NULL) { + syslog(LOG_ERR, dgettext(TEXT_DOMAIN, + "MD_MN_GET_MIRROR_STATE: Invalid set %d\n"), setno); + resp->mmr_exitval = 1; /* Failure */ + Free(resp->mmr_out); + resp->mmr_out_size = 0; + return; + } + + /* Construct mirror name from minor number */ + mirnp = metamnumname(&sp, d->mir_state_mnum, 0, &mde); + if (mirnp == NULL) { + syslog(LOG_ERR, dgettext(TEXT_DOMAIN, + "MD_MN_GET_MIRROR_STATE: Invalid minor %lx\n"), + d->mir_state_mnum); + resp->mmr_exitval = 2; /* Failure */ + Free(resp->mmr_out); + resp->mmr_out_size = 0; + return; + } + + /* Get common mirror structure */ + mm = (mm_unit_t *)meta_get_mdunit(sp, mirnp, &mde); + if (mm == NULL) { + syslog(LOG_ERR, dgettext(TEXT_DOMAIN, + "MD_MN_GET_MIRROR_STATE: Invalid mirror minor %x\n"), + d->mir_state_mnum); + resp->mmr_exitval = 3; /* Failure */ + Free(resp->mmr_out); + resp->mmr_out_size = 0; + return; + } + + if (meta_get_tstate(d->mir_state_mnum, &tstate, &mde) != 0) { + syslog(LOG_ERR, dgettext(TEXT_DOMAIN, + "MD_MN_GET_MIRROR_STATE: Invalid minor %lx\n"), + d->mir_state_mnum); + resp->mmr_exitval = 4; /* Failure */ + Free(resp->mmr_out); + resp->mmr_out_size = 0; + return; + } + /* + * Fill in the 
sm_state/sm_flags value in the results structure which + * gets passed back to the message originator + */ + resp->mmr_exitval = 0; + for (smi = 0; (smi < NMIRROR); smi++) { + mm_submirror_t *mmsp = &mm->un_sm[smi]; + res->sm_state[smi] = mmsp->sm_state; + res->sm_flags[smi] = mmsp->sm_flags; + } + /* Returm value of tstate for mirror */ + res->mir_tstate = tstate; +} + +/* + * This is to issue an ioctl to call poke_hotspares + */ +/*ARGSUSED*/ +void +mdmn_do_poke_hotspares(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp) +{ + + md_mn_poke_hotspares_t pokehsp; + md_mn_msg_pokehsp_t *d; + + resp->mmr_out_size = 0; + resp->mmr_err_size = 0; + resp->mmr_out = NULL; + resp->mmr_err = NULL; + resp->mmr_comm_state = MDMNE_ACK; + d = (md_mn_msg_pokehsp_t *)(void *)msg->msg_event_data; + + (void) memset(&pokehsp, 0, sizeof (pokehsp)); + MD_SETDRIVERNAME(&pokehsp, MD_MIRROR, d->pokehsp_setno); + + resp->mmr_exitval = metaioctl(MD_MN_POKE_HOTSPARES, &pokehsp, + &pokehsp.mde, NULL); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_mn_msg_table.c b/usr/src/lib/lvm/libmeta/common/meta_mn_msg_table.c new file mode 100644 index 0000000000..a6ba008376 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_mn_msg_table.c @@ -0,0 +1,690 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <meta.h> + +extern void mdmn_do_cmd(HANDLER_PARMS); +extern void mdmn_do_clu(HANDLER_PARMS); +extern void mdmn_do_req_owner(HANDLER_PARMS); +extern void mdmn_do_susp_write(HANDLER_PARMS); +extern void mdmn_do_state_upd_reswr(HANDLER_PARMS); +extern void mdmn_do_allocate_hotspare(HANDLER_PARMS); +extern void mdmn_do_poke_hotspares(HANDLER_PARMS); +extern void mdmn_do_resync(HANDLER_PARMS); +extern void mdmn_do_setsync(HANDLER_PARMS); +extern void mdmn_do_choose_owner(HANDLER_PARMS); +extern void mdmn_do_change_owner(HANDLER_PARMS); +extern void mdmn_do_set_cap(HANDLER_PARMS); +extern void mdmn_do_dummy(HANDLER_PARMS); +extern void mdmn_do_mddb_parse(HANDLER_PARMS); +extern void mdmn_do_mddb_block(HANDLER_PARMS); +extern void mdmn_do_sm_mddb_attach(HANDLER_PARMS); +extern void mdmn_do_sm_mddb_detach(HANDLER_PARMS); +extern void mdmn_do_meta_db_newside(HANDLER_PARMS); +extern void mdmn_do_meta_db_delside(HANDLER_PARMS); +extern void mdmn_do_meta_md_addside(HANDLER_PARMS); +extern void mdmn_do_meta_md_delside(HANDLER_PARMS); +extern void mdmn_do_mddb_optrecerr(HANDLER_PARMS); +extern void mdmn_do_iocset(HANDLER_PARMS); +extern void mdmn_do_sp_setstat(HANDLER_PARMS); +extern void mdmn_do_addkeyname(HANDLER_PARMS); +extern void mdmn_do_delkeyname(HANDLER_PARMS); +extern void mdmn_do_get_tstate(HANDLER_PARMS); +extern void mdmn_do_get_mirstate(HANDLER_PARMS); + +extern int mdmn_smgen_test6(SMGEN_PARMS); +extern int mdmn_smgen_state_upd(SMGEN_PARMS); +extern int mdmn_smgen_mddb_attach(SMGEN_PARMS); +extern int mdmn_smgen_mddb_detach(SMGEN_PARMS); + +md_mn_msg_tbl_entry_t 
msg_table[MD_MN_NMESSAGES] = { + +/* + * In order to have fast direct access to the table, we use the message type as + * an index into it. + * Thus the order of the elements in this table MUST match the order of the + * message types specified in mdmn_commd.x! + * See the definition of md_mn_msg_t. + * + * Be careful and do not disturb the order of the messages! + */ + { + /* MD_MN_MSG_NULL */ + MD_MSG_CLASS0, /* message class */ + NULL, /* message handler */ + NULL, /* submessage generator */ + 1, /* timeout in seconds */ + 0, 0, /* class busy retry / time delta */ + 0, 0 /* comm fail retry / time delta */ + }, + + { + /* MD_MN_MSG_TEST1 */ + MD_MSG_CLASS1, /* message class */ + mdmn_do_dummy, /* message handler */ + NULL, /* submessage generator */ + 1, /* timeout in seconds */ + 200, 4, /* class busy retry / time delta */ + 10, 100 /* comm fail retry / time delta */ + }, + + { + /* MD_MN_MSG_TEST2 */ + MD_MSG_CLASS2, /* message class */ + mdmn_do_dummy, /* message handler */ + NULL, /* submessage generator */ + 1, /* timeout in seconds */ + 200, 4, /* class busy retry / time delta */ + 10, 100 /* comm fail retry / time delta */ + }, + + { + /* MD_MN_MSG_TEST3 */ + MD_MSG_CLASS3, /* message class */ + mdmn_do_dummy, /* message handler */ + NULL, /* submessage generator */ + 1, /* timeout in seconds */ + 200, 4, /* class busy retry / time delta */ + 10, 100 /* comm fail retry / time delta */ + }, + + { + /* MD_MN_MSG_TEST4 */ + MD_MSG_CLASS4, /* message class */ + mdmn_do_dummy, /* message handler */ + NULL, /* submessage generator */ + 1, /* timeout in seconds */ + 200, 4, /* class busy retry / time delta */ + 10, 100 /* comm fail retry / time delta */ + }, + + { + /* MD_MN_MSG_TEST5 */ + MD_MSG_CLASS5, /* message class */ + mdmn_do_dummy, /* message handler */ + NULL, /* submessage generator */ + 4, /* timeout in seconds */ + 200, 4, /* class busy retry / time delta */ + 10, 100 /* comm fail retry / time delta */ + }, + + { + /* MD_MN_MSG_TEST6 */ + 
MD_MSG_CLASS1, /* message class */ + NULL, /* message handler */ + mdmn_smgen_test6, /* submessage generator */ + 1, /* timeout in seconds */ + 200, 4, /* class busy retry / time delta */ + 10, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_CMD + * Send a command string to all nodes + */ + MD_MSG_CLASS1, /* message class */ + mdmn_do_cmd, /* message handler */ + NULL, /* submessage generator */ + 90, /* times out in 90 secs */ + 40, 20, /* class busy retry / time delta */ + 10, 1000 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_CMD_RETRY + * Send a command string to all nodes and retry on busy + */ + MD_MSG_CLASS1, /* message class */ + mdmn_do_cmd, /* message handler */ + NULL, /* submessage generator */ + 90, /* times out in 90 secs */ + 100000, 20, /* class busy retry / time delta */ + 10, 1000 /* comm fail retry / time delta */ + }, + + { + /* MD_MN_MSG_CLU_CHECK */ + MD_MSG_CLASS2, /* message class */ + mdmn_do_clu, /* message handler */ + NULL, /* submessage generator */ + 5, /* timeout in seconds */ + 10000, 2, /* class busy retry / time delta */ + 0, 0 /* comm fail retry / time delta */ + }, + + { + /* MD_MN_MSG_CLU_LOCK */ + MD_MSG_CLASS2, /* message class */ + mdmn_do_clu, /* message handler */ + NULL, /* submessage generator */ + 1, /* timeout in seconds */ + 10000, 2, /* class busy retry / time delta */ + 0, 0 /* comm fail retry / time delta */ + }, + + { + /* MD_MN_MSG_CLU_UNLOCK */ + MD_MSG_CLASS2, /* message class */ + mdmn_do_clu, /* message handler */ + NULL, /* submessage generator */ + 1, /* timeout in seconds */ + 10000, 2, /* class busy retry / time delta */ + 0, 0 /* comm fail retry / time delta */ + }, + + { + /* MD_MN_MSG_REQUIRE_OWNER */ + MD_MSG_CLASS5, /* message class */ + mdmn_do_req_owner, /* message handler */ + NULL, /* submessage generator */ + 12, /* timeout in seconds */ + UINT_MAX, 10, /* class busy retry / time delta */ + UINT_MAX, 100 /* comm fail retry / time delta */ + }, + + { + /* + 
* MD_MN_MSG_CHOOSE_OWNER + * Using the current resync count for the set, choose a resync + * owner and send a CHANGE_OWNER message to request that node + * to make itself the owner + */ + MD_MSG_CLASS3, /* message class */ + mdmn_do_choose_owner, /* message handler */ + NULL, /* submessage generator */ + 12, /* timeout in seconds */ + UINT_MAX, 10, /* class busy retry / time delta */ + UINT_MAX, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_CHANGE_OWNER + * Request a change of ownership to the specified node + */ + MD_MSG_CLASS4, /* message class */ + mdmn_do_change_owner, /* message handler */ + NULL, /* submessage generator */ + 12, /* timeout in seconds */ + UINT_MAX, 10, /* class busy retry / time delta */ + UINT_MAX, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_SUSPEND_WRITES + * Suspend all writes to the specified mirror + */ + MD_MSG_CLASS6, /* message class */ + mdmn_do_susp_write, /* message handler */ + NULL, /* submessage generator */ + 8, /* timeout in seconds */ + UINT_MAX, 10, /* class busy retry / time delta */ + 200, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_STATE_UPDATE_RESWR + * Update the state of a mirror component + */ + MD_MSG_CLASS1, /* message class */ + mdmn_do_state_upd_reswr, /* message handler */ + NULL, /* submessage generator */ + 8, /* timeout in seconds */ + UINT_MAX, 10, /* class busy retry / time delta */ + UINT_MAX, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_STATE_UPDATE + * Suspend writes to a mirror and then update the state of a + * mirror component + */ + MD_MSG_CLASS1, /* message class */ + NULL, /* message handler */ + mdmn_smgen_state_upd, /* submessage generator */ + 16, /* SUSPEND_WRITES + STATE_UPDATE_RESWR */ + UINT_MAX, 10, /* class busy retry / time delta */ + UINT_MAX, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_ALLOCATE_HOTSPARE + * Allocate a hotspare for a mirror component + */ + 
MD_MSG_CLASS1, /* message class */ + mdmn_do_allocate_hotspare, /* message handler */ + NULL, /* submessage generator */ + 8, /* timeout in seconds */ + UINT_MAX, 10, /* class busy retry / time delta */ + UINT_MAX, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_RESYNC_STARTING + * Start a resync thread for the specified mirror + */ + MD_MSG_CLASS2, /* message class */ + mdmn_do_resync, /* message handler */ + NULL, /* submessage generator */ + 8, /* timeout in seconds */ + UINT_MAX, 10, /* class busy retry / time delta */ + UINT_MAX, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_RESYNC_NEXT + * Send the next region to be resyned to all nodes. For ABR + * mirrors, the nodes must suspend all writes to this region until + * the next message of this type or a RESYNC_FINISH + */ + MD_MSG_CLASS2, /* message class */ + mdmn_do_resync, /* message handler */ + NULL, /* submessage generator */ + 8, /* timeout in seconds */ + UINT_MAX, 10, /* class busy retry / time delta */ + UINT_MAX, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_RESYNC_FINISH + * All resyncs for a mirror are complete, terminate resync thread + */ + MD_MSG_CLASS1, /* message class */ + mdmn_do_resync, /* message handler */ + NULL, /* submessage generator */ + 8, /* timeout in seconds */ + UINT_MAX, 10, /* class busy retry / time delta */ + UINT_MAX, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_RESYNC_PHASE_DONE + * A resync phase, optimized, submirror or component is complete + */ + MD_MSG_CLASS2, /* message class */ + mdmn_do_resync, /* message handler */ + NULL, /* submessage generator */ + 8, /* timeout in seconds */ + UINT_MAX, 10, /* class busy retry / time delta */ + UINT_MAX, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_SET_CAP + * Set the specified metadevice capability on all nodes + * This is used to propagate the ABR capability + */ + MD_MSG_CLASS1, /* message class */ + 
mdmn_do_set_cap, /* message handler */ + NULL, /* submessage generator */ + 8, /* timeout in seconds */ + 100000, 10, /* class busy retry/ time delta */ + 200, 100 /* comm fail retry / time delta */ + }, + + { + /* MD_MN_MSG_VERBOSITY */ + MD_MSG_CLASS0, /* special message class */ + mdmn_do_dummy, /* dummy handler */ + NULL, /* submessage generator */ + 1, /* timeout in seconds */ + 0, 0, /* No retries for class busy */ + 0, 0 /* No retries for comm fail */ + }, + + { + /* + * MD_MN_MSG_MDDB_PARSE + * Message cannot fail unless node failure causes node panic + */ + MD_MSG_CLASS7, /* message class */ + mdmn_do_mddb_parse, /* reparse mddb */ + NULL, /* submessage generator */ + 10, /* timeout in seconds */ + UINT_MAX, 2, /* class busy retry / time delta */ + UINT_MAX, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_MDDB_BLOCK + * Message cannot fail unless node failure causes node panic + */ + MD_MSG_CLASS3, /* message class */ + mdmn_do_mddb_block, /* block/unblock reparse */ + NULL, /* submessage generator */ + 5, /* timeout in seconds */ + UINT_MAX, 2, /* class busy retry / time delta */ + UINT_MAX, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_META_DB_ATTACH + */ + MD_MSG_CLASS3, /* message class */ + NULL, /* message handler */ + mdmn_smgen_mddb_attach, /* submessage generator */ + 30, /* timeout in seconds */ + UINT_MAX, 2, /* class busy retry / time delta */ + 10, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_SM_MDDB_ATTACH + */ + MD_MSG_CLASS3, /* message class */ + mdmn_do_sm_mddb_attach, /* message handler */ + NULL, /* submessage generator */ + 20, /* timeout in seconds */ + /* creates mddbs */ + UINT_MAX, 2, /* class busy retry / time delta */ + 10, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_META_DB_DETACH + */ + MD_MSG_CLASS3, /* message class */ + NULL, /* detach mddb */ + mdmn_smgen_mddb_detach, /* submessage generator */ + 10, /* timeout in seconds */ + 
UINT_MAX, 2, /* class busy retry / time delta */ + 10, 100 /* comm fail retry / time delta */ + }, + { + + /* + * MD_MN_MSG_SM_MDDB_DETACH + */ + MD_MSG_CLASS3, /* message class */ + mdmn_do_sm_mddb_detach, /* detach mddb */ + NULL, /* submessage generator */ + 5, /* timeout in seconds */ + UINT_MAX, 2, /* class busy retry / time delta */ + 10, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_META_DB_NEWSIDE + */ + MD_MSG_CLASS3, /* message class */ + mdmn_do_meta_db_newside, /* add new mddb side info */ + NULL, /* submessage generator */ + 10, /* timeout in seconds */ + UINT_MAX, 2, /* class busy retry / time delta */ + 10, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_META_DB_DELSIDE + */ + MD_MSG_CLASS3, /* message class */ + mdmn_do_meta_db_delside, /* delete mddb side info */ + NULL, /* submessage generator */ + 10, /* timeout in seconds */ + UINT_MAX, 2, /* class busy retry / time delta */ + 10, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_META_MD_ADDSIDE + */ + MD_MSG_CLASS3, /* message class */ + mdmn_do_meta_md_addside, /* add new md side info */ + NULL, /* submessage generator */ + 10, /* timeout in seconds */ + UINT_MAX, 2, /* class busy retry / time delta */ + 10, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_META_MD_DELSIDE + */ + MD_MSG_CLASS3, /* message class */ + mdmn_do_meta_md_delside, /* delete md side info */ + NULL, /* submessage generator */ + 10, /* timeout in seconds */ + UINT_MAX, 2, /* class busy retry / time delta */ + 10, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_MDDB_OPTRECERR + * Message cannot fail unless node failure causes node panic + */ + MD_MSG_CLASS3, /* message class */ + mdmn_do_mddb_optrecerr, /* fix opt rec mddb */ + NULL, /* submessage generator */ + 3, /* timeout in seconds */ + UINT_MAX, 2, /* class busy retry / time delta */ + 10, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_ABORT 
+ */ + MD_MSG_CLASS0, /* special message class */ + mdmn_do_dummy, /* dummy handler */ + NULL, /* submessage generator */ + 1, /* timeout in seconds */ + 0, 0, /* No retries for class busy */ + 0, 0 /* No retries for comm fail */ + }, + + { + /* + * MD_MN_MSG_STATE_UPDATE_RESWR2 + * Update the state of a mirror component, called if during the updates + * of the watermarks for a softpartition, an IO error on a submirror + * occurs. Need to have a class different from CLASS1, otherwise we + * deadlock with the command that is currently being processed + * (metainit/metaclear/metattach/metarecover) + * + * And we may actually use a class different than CLASS1 because this + * can only happen when a metainit or similar is called, and in that + * case all potential metadb or metaset commands are blocked anyway. + * Besides the different class it does exactly what + * MD_MN_MSG_STATE_UPDATE_RESWR would do + */ + MD_MSG_CLASS3, /* message class */ + mdmn_do_state_upd_reswr, /* message handler */ + NULL, /* submessage generator */ + 8, /* timeout in seconds */ + UINT_MAX, 10, /* class busy retry / time delta */ + UINT_MAX, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_STATE_UPDATE2 + * Like MD_MN_MSG_STATE_UPDATE only using a different class. + * See comment for MD_MN_MSG_STATE_UPDATE_RESWR2 + */ + MD_MSG_CLASS3, /* message class */ + NULL, /* message handler */ + mdmn_smgen_state_upd, /* submessage generator */ + 16, /* SUSPEND_WRITES + STATE_UPDATE_RESWR */ + UINT_MAX, 10, /* class busy retry / time delta */ + UINT_MAX, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_ALLOCATE_HOTSPARE2 + * Like MD_MN_MSG_ALLOCATE_HOTSPARE only using a different class. 
+ * See comment for MD_MN_MSG_STATE_UPDATE_RESWR2 + */ + MD_MSG_CLASS3, /* message class */ + mdmn_do_allocate_hotspare, /* message handler */ + NULL, /* submessage generator */ + 8, /* timeout in seconds */ + UINT_MAX, 10, /* class busy retry / time delta */ + UINT_MAX, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_IOCSET + * Send IOCSET ioctl to create a soft part + */ + MD_MSG_CLASS1, /* message class */ + mdmn_do_iocset, /* create softpart */ + NULL, /* submessage generator */ + 90, /* times out in 90 secs */ + 10000, 2, /* class busy retry / time delta */ + 10, 1000 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_SP_SETSTAT + * Update the status of a softpart + */ + MD_MSG_CLASS1, /* message class */ + mdmn_do_sp_setstat, /* create softpart */ + NULL, /* submessage generator */ + 90, /* times out in 90 secs */ + 10000, 2, /* class busy retry / time delta */ + 10, 1000 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_ADDKEYNAME + * Add a key to the namespace + */ + MD_MSG_CLASS1, /* message class */ + mdmn_do_addkeyname, /* add key */ + NULL, /* submessage generator */ + 90, /* times out in 90 secs */ + 10000, 2, /* class busy retry / time delta */ + 10, 1000 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_SP_DELKEYNAME + * Remove a key from the namespace + */ + MD_MSG_CLASS1, /* message class */ + mdmn_do_delkeyname, /* delete key */ + NULL, /* submessage generator */ + 90, /* times out in 90 secs */ + 10000, 2, /* class busy retry / time delta */ + 10, 1000 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_GET_TSTATE + * Get ui_tstate for a metadevice from the master. Used to get ABR + * state from the master node. 
+ */ + MD_MSG_CLASS2, /* message class */ + mdmn_do_get_tstate, /* get tstate */ + NULL, /* submessage generator */ + 5, /* times out in 5 secs */ + UINT_MAX, 10, /* class busy retry / time delta */ + UINT_MAX, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_GET_MIRROR_STATE + * Get submirror state for specified submirror from master node. + * Used to synchronise initial resync state across a cluster. + */ + MD_MSG_CLASS1, /* message class */ + mdmn_do_get_mirstate, /* get smstate */ + NULL, /* submessage generator */ + 5, /* times out in 5 secs */ + UINT_MAX, 10, /* class busy retry / time delta */ + UINT_MAX, 100 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_SP_SETSTAT2 + * Update the status of a softpart. Used for propagating an error from + * the soft-part sp_error() routine + */ + MD_MSG_CLASS4, /* message class */ + mdmn_do_sp_setstat, /* update softpart state */ + NULL, /* submessage generator */ + 90, /* times out in 90 secs */ + 10000, 2, /* class busy retry / time delta */ + 10, 1000 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_SETSYNC + * Start a resync thread for the specified mirror + */ + MD_MSG_CLASS1, /* message class */ + mdmn_do_setsync, /* message handler */ + NULL, /* submessage generator */ + 90, /* timeout in seconds */ + 10000, 2, /* class busy retry / time delta */ + 10, 1000 /* comm fail retry / time delta */ + }, + + { + /* + * MD_MN_MSG_POKE_HOTSPARES + * Call poke_hotspares() + */ + MD_MSG_CLASS1, /* message class */ + mdmn_do_poke_hotspares, /* message handler */ + NULL, /* submessage generator */ + 8, /* timeout in seconds */ + UINT_MAX, 10, /* class busy retry / time delta */ + UINT_MAX, 100 /* comm fail retry / time delta */ + }, + +}; diff --git a/usr/src/lib/lvm/libmeta/common/meta_mn_subr.c b/usr/src/lib/lvm/libmeta/common/meta_mn_subr.c new file mode 100644 index 0000000000..582b7d293e --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_mn_subr.c @@ -0,0 +1,922 @@ 
+/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. + */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +#include <meta.h> +#include <sdssc.h> +#include <arpa/inet.h> +#include <sys/lvm/md_mddb.h> + +#define MAX_LINE_SIZE 1024 + +/* + * Maximum amount of time to spend waiting for an ownership change to complete. 
+ */ +static const int OWNER_TIMEOUT = 3; + +/* + * FUNCTION: meta_is_mn_set() + * INPUT: sp - the set name + * OUTPUT: ep - return error pointer + * RETURNS: int - 1 if MultiNode set else 0 + * PURPOSE: checks if the set is a MultiNode set + */ +int +meta_is_mn_set( + mdsetname_t *sp, + md_error_t *ep +) +{ + md_set_desc *sd; + + /* Local set cannot be MultiNode */ + if ((sp == NULL) || (sp->setname == NULL) || + (strcmp(sp->setname, MD_LOCAL_NAME) == 0)) + return (0); + sd = metaget_setdesc(sp, ep); + ASSERT(sd != NULL); + if (sd->sd_flags & MD_SR_MN) + return (1); + return (0); +} + +/* + * FUNCTION: meta_is_mn_name() + * INPUT: spp - ptr to the set name, if NULL the setname is derived + * from the metadevice name (eg set/d10 ) + * name - the metadevice name + * OUTPUT: ep - return error pointer + * RETURNS: int - 1 if MultiNode set else 0 + * PURPOSE: checks if the metadevice is in a MultiNode set + */ +int +meta_is_mn_name( + mdsetname_t **spp, + char *name, + md_error_t *ep +) +{ + md_error_t t_e = mdnullerror; + char *cname; + + if (*spp == NULL) { + if (is_hspname(name)) { + if (metahspname(spp, name, ep) == NULL) + return (0); + } else if (is_metaname(name)) { + /* Will fill in *spp based on name */ + if ((cname = meta_name_getname(spp, name, &t_e)) + != NULL) + Free(cname); + if (! mdisok(&t_e)) { + (void) mdstealerror(ep, &t_e); + return (0); + } + } else return (0); + } + + if ((strcmp((*spp)->setname, MD_LOCAL_NAME) != 0) && + (metaget_setdesc(*spp, ep) != NULL) && + ((*spp)->setdesc->sd_flags & MD_SR_MN)) { + return (1); + } + return (0); +} + +/* + * meta_ping_mnset(set_t setno) + * Send a test message for this set in order to make commd do some init stuff + * Don't bother changelog. + * If set is suspended, fail immediately. 
+ */ +void +meta_ping_mnset(set_t setno) +{ + char *data = "test"; + md_error_t mde = mdnullerror; + md_mn_result_t *resp = NULL; + + (void) mdmn_send_message(setno, MD_MN_MSG_TEST2, + MD_MSGF_NO_LOG | MD_MSGF_FAIL_ON_SUSPEND, data, + sizeof (data), &resp, &mde); + + if (resp != (md_mn_result_t *)NULL) { + free_result(resp); + } +} + +/* + * + * FUNCTION: print_stderr + * INPUT: errstr - the error message returned by the command + * context - the context string from metainit -a + * PURPOSE: called from meta_mn_send_command to print the error message + * to stderr. When context is NO_CONTEXT_STRING, the errstr string + * is output unchanged. When context is a string, it is the context + * string for the metainit -a command and in this case the errstr + * string has to be parsed to extract the command and node name + * and to send a message to stderr in the format + * command: node: context: error message + */ +static void +print_stderr( + char *errstr, + char *context +) +{ + char *command; + char *node; + char *message; + int length = strlen(errstr + 1); + + if (context == NO_CONTEXT_STRING) { + (void) fprintf(stderr, "%s", errstr); + } else { + command = Malloc(length); + node = Malloc(length); + message = Malloc(length); + if (sscanf(errstr, "%[^:]: %[^:]: %[^\n]", command, node, + message) == 3) { + (void) fprintf(stderr, "%s: %s: %s: %s\n", command, + node, context, message); + } else { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%s: Invalid format error message"), errstr); + } + Free(command); + Free(node); + Free(message); + } +} + +/* + * FUNCTION: meta_mn_send_command() + * INPUT: sp - the set name + * argc - number of arguments + * argv - arg list + * flags - some controlling flags + * initall_context - context string for metainit -a + * OUTPUT: ep - return error pointer + * RETURNS: return exitval from mdmn_send_message + * PURPOSE: sends the command to the master node for execution + */ +int +meta_mn_send_command( + mdsetname_t *sp, + int argc, + 
char *argv[], + int flags, + char *initall_context, + md_error_t *ep +) +{ + int a; + int err; + int retval; + int send_message_flags = MD_MSGF_DEFAULT_FLAGS; + int send_message_type; + char *cmd; + md_mn_result_t *resp = NULL; + + cmd = Malloc(1024); + (void) strlcpy(cmd, argv[0], 1024); + for (a = 1; a < argc; a++) { + /* don't copy empty arguments */ + if (*argv[a] == '\0') { + continue; + } + (void) strcat(cmd, " "); + (void) strcat(cmd, argv[a]); + } + /* + * in dryrun mode stop on the first error + * use the CMD_RETRY message type if RETRY_BUSY flag set + */ + if (flags & MD_DRYRUN) + send_message_flags |= MD_MSGF_STOP_ON_ERROR; + if (flags & MD_NOLOG) + send_message_flags |= MD_MSGF_NO_LOG; + if (flags & MD_PANIC_WHEN_INCONSISTENT) + send_message_flags |= MD_MSGF_PANIC_WHEN_INCONSISTENT; + if (flags & MD_RETRY_BUSY) { + send_message_type = MD_MN_MSG_BC_CMD_RETRY; + } else { + send_message_type = MD_MN_MSG_BC_CMD; + } + err = mdmn_send_message( + sp->setno, send_message_type, send_message_flags, + cmd, 1024, &resp, ep); + + free(cmd); + + if (err == 0) { + /* + * stderr may be turned off by IGNORE_STDERR + * In dryrun we only print stderr if the exit_val is non-zero + */ + if ((resp->mmr_err_size != 0) && + ((flags & MD_IGNORE_STDERR) == 0)) { + if (((flags & MD_DRYRUN) == 0) || + (resp->mmr_exitval != 0)) { + print_stderr(resp->mmr_err, initall_context); + } + } + + /* + * If dryrun is set, we don't display stdout, + * because the real run has yet to follow. + */ + if (((flags & MD_DRYRUN) == 0) && (resp->mmr_out_size != 0)) { + (void) printf("%s", resp->mmr_out); + } + retval = resp->mmr_exitval; + free_result(resp); + return (retval); + } + if (resp != NULL) { + if (resp->mmr_comm_state == MDMNE_CLASS_BUSY) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "rpc.mdcommd currently busy. 
" + "Retry operation later.\n")); + } else if (resp->mmr_comm_state == MDMNE_NOT_JOINED) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "Node %s must join the %s multi-owner diskset to " + "issue commands.\n" + "To join, use: metaset -s %s -j\n"), + mynode(), sp->setname, sp->setname); + } else if (resp->mmr_comm_state == MDMNE_LOG_FAIL) { + mddb_config_t c; + + (void) memset(&c, 0, sizeof (c)); + c.c_setno = sp->setno; + (void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL); + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "Command not attempted: Unable to log message " + "in set %s\n"), sp->setname); + if (c.c_flags & MDDB_C_STALE) { + (void) mdmddberror(ep, MDE_DB_STALE, NODEV64, + sp->setno, 0, NULL); + mde_perror(ep, ""); + } + } else { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "Command failed: Commd State %d " + "encountered.\n"), resp->mmr_comm_state); + } + free_result(resp); + } else { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "Command failed: mdmn_send_message returned %d.\n"), + err); + } + + + return (1); +} + +/* + * FUNCTION: meta_mn_send_suspend_writes() + * INPUT: mnum - minor num of mirror + * OUTPUT: ep - return error pointer + * RETURNS: return value from mdmn_send_message() + * PURPOSE: sends message to all nodes to suspend writes to the mirror. + */ +int +meta_mn_send_suspend_writes( + minor_t mnum, + md_error_t *ep +) +{ + int result; + md_mn_msg_suspwr_t suspwrmsg; + md_mn_result_t *resp = NULL; + + suspwrmsg.msg_suspwr_mnum = mnum; + /* + * This message is never directly issued. + * So we launch it with a suspend override flag. + * If the commd is suspended, and this message comes + * along it must be sent due to replaying a command or similar. + * In that case we don't want this message to be blocked. + * If the commd is not suspended, the flag does no harm. 
+ */ + result = mdmn_send_message(MD_MIN2SET(mnum), + MD_MN_MSG_SUSPEND_WRITES, + MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, + (char *)&suspwrmsg, sizeof (suspwrmsg), &resp, ep); + if (resp != NULL) { + free_result(resp); + } + return (result); +} + +/* + * Parse the multi-node list file + * + * Return Values: Zero - Success + * Non Zero - Failure + * + * File content: The content of the nodelist file should consist of + * triplets of nodeid, nodename and private interconnect + * address seperated by one or more white space. + * e.g. + * 1 node_a 192.168.111.3 + * 2 node_b 192.168.111.5 + * + * Any missing fields will result in an error. + */ +int +meta_read_nodelist( + int *nodecnt, + mndiskset_membershiplist_t **nl, + md_error_t *ep +) +{ + FILE *fp = NULL; + char line[MAX_LINE_SIZE]; + char *buf; + uint_t i; + int sz; + mndiskset_membershiplist_t **tailp = nl; + + /* open file */ + if ((fp = fopen(META_MNSET_NODELIST, "r")) == NULL) { + mndiskset_membershiplist_t *nlp; + struct hostent *hp; + + /* return this node with id of 1 */ + nlp = *tailp = Zalloc(sizeof (*nlp)); + tailp = &nlp->next; + + *nodecnt = 1; + nlp->msl_node_id = 1; + buf = mynode(); + sz = min(strlen(buf), sizeof (nlp->msl_node_name) - 1); + (void) strncpy(nlp->msl_node_name, buf, sz); + nlp->msl_node_name[sz] = '\0'; + + /* retrieve info about our host */ + if ((hp = gethostbyname(buf)) == NULL) { + return (mdsyserror(ep, EADDRNOTAVAIL, buf)); + } + /* We only do IPv4 addresses, for now */ + if (hp->h_addrtype != AF_INET) { + return (mdsyserror(ep, EPFNOSUPPORT, buf)); + } + /* We take the first address only */ + if (*hp->h_addr_list) { + struct in_addr in; + + (void) memcpy(&in.s_addr, *hp->h_addr_list, + sizeof (struct in_addr)); + (void) strncpy(nlp->msl_node_addr, inet_ntoa(in), + MD_MAX_NODENAME); + } else { + return (mdsyserror(ep, EADDRNOTAVAIL, buf)); + } + + return (0); + } + + *nl = NULL; + *nodecnt = 0; + + while ((fp != NULL) && ((buf = fgets(line, sizeof (line) - 1, fp)) != + 
NULL)) { + mndiskset_membershiplist_t *nlp; + + /* skip leading spaces */ + while ((*buf != '\0') && (i = strcspn(buf, " \t\n")) == 0) + buf++; + + /* skip comments and blank lines */ + if (*buf == '\0' || *buf == '#') + continue; + + /* allocate memory and set tail pointer */ + nlp = *tailp = Zalloc(sizeof (*nlp)); + tailp = &nlp->next; + + /* parse node id */ + nlp->msl_node_id = strtoul(buf, NULL, 0); + buf += i; + + /* skip leading spaces */ + while ((*buf != '\0') && (i = strcspn(buf, " \t\n")) == 0) + buf++; + + /* fields missing, return error */ + if (*buf == '\0' || *buf == '#') { + meta_free_nodelist(*nl); + *nl = NULL; + *nodecnt = 0; + + /* close file and return */ + if ((fp) && (fclose(fp) != 0)) + return (mdsyserror(ep, errno, + META_MNSET_NODELIST)); + + return (mdsyserror(ep, EINVAL, META_MNSET_NODELIST)); + } + + /* parse node name */ + sz = min(i, sizeof (nlp->msl_node_name) - 1); + (void) strncpy(nlp->msl_node_name, buf, sz); + nlp->msl_node_name[sz] = '\0'; + buf += i; + + /* skip leading spaces */ + while ((*buf != '\0') && (i = strcspn(buf, " \t\n")) == 0) + buf++; + + /* fields missing, return error */ + if (*buf == '\0' || *buf == '#') { + meta_free_nodelist(*nl); + *nl = NULL; + *nodecnt = 0; + + /* close file and return */ + if ((fp) && (fclose(fp) != 0)) + return (mdsyserror(ep, errno, + META_MNSET_NODELIST)); + + return (mdsyserror(ep, EINVAL, META_MNSET_NODELIST)); + } + + /* parse node address */ + sz = min(i, sizeof (nlp->msl_node_addr) - 1); + (void) strncpy(nlp->msl_node_addr, buf, sz); + nlp->msl_node_addr[sz] = '\0'; + + ++*nodecnt; + } + + /* close file */ + if ((fp) && (fclose(fp) != 0)) + return (mdsyserror(ep, errno, META_MNSET_NODELIST)); + + return (0); +} + +/* + * Populate the multi-node list file from a given list of node id's + * The nids must have only one node id in each cell. Range of node + * id's in the form 1-n are not allowed. 
+ * + * Return Values: Zero - Success + * Non Zero - Failure + */ +int +meta_write_nodelist( + int nodecnt, + char **nids, + md_error_t *ep +) +{ + FILE *fp = NULL; + char name[MAX_LINE_SIZE], addr[MAX_LINE_SIZE]; + uint_t i, nid; + struct in_addr ipaddr; + int err = 0; + + /* check if we are running on clustering */ + if ((err = sdssc_bind_library()) != SDSSC_OKAY) { + return (mdsyserror(ep, err, META_MNSET_NODELIST)); + } + + /* open file for writing */ + if ((fp = fopen(META_MNSET_NODELIST, "w")) == NULL) { + return (mdsyserror(ep, errno, META_MNSET_NODELIST)); + } + + for (i = 0; i < nodecnt; i++) { + /* extract the node id */ + errno = 0; + nid = strtoul(nids[i], NULL, 0); + if (errno != 0) { + if ((fp) && (fclose(fp) != 0)) + return (mdsyserror(ep, errno, + META_MNSET_NODELIST)); + + return (mdsyserror(ep, EINVAL, META_MNSET_NODELIST)); + } + + /* get node name */ + (void) snprintf(name, sizeof (name), "%d", nid); + sdssc_cm_nid2nm(name); + + /* finally get the private ip address */ + (void) snprintf(addr, sizeof (addr), "%s", name); + if (sdssc_get_priv_ipaddr(addr, &ipaddr) != SDSSC_OKAY) { + if ((fp) && (fclose(fp) != 0)) + return (mdsyserror(ep, errno, + META_MNSET_NODELIST)); + + return (mdsyserror(ep, EINVAL, META_MNSET_NODELIST)); + } + + (void) fprintf(fp, "%d\t%s\t%s\n", nid, name, + inet_ntoa(ipaddr)); + } + + /* close file */ + if ((fp) && (fclose(fp) != 0)) + return (mdsyserror(ep, errno, META_MNSET_NODELIST)); + + return (0); +} + +/* + * Free node list + */ +void +meta_free_nodelist( + mndiskset_membershiplist_t *nl +) +{ + mndiskset_membershiplist_t *next = NULL; + + for (/* void */; (nl != NULL); nl = next) { + next = nl->next; + Free(nl); + } +} + +/* + * FUNCTION: meta_mn_send_setsync() + * INPUT: sp - setname + * mirnp - mirror name + * size - buffer size, 0 if none + * OUTPUT: ep - return error pointer + * RETURNS: return value from meta_mn_send_command() + * PURPOSE: Send a setsync command to all nodes to set resync status + */ + +int 
+meta_mn_send_setsync( + mdsetname_t *sp, + mdname_t *mirnp, + daddr_t size, + md_error_t *ep +) +{ + md_mn_msg_setsync_t setsyncmsg; + int ret; + md_mn_result_t *resp = NULL; + + setsyncmsg.setsync_mnum = meta_getminor(mirnp->dev); + setsyncmsg.setsync_copysize = size; + setsyncmsg.setsync_flags = 0; + + /* + * We do not log the metasync command as it will have no effect on the + * underlying metadb state. If we have a master change the + * reconfiguration process will issue a new 'metasync' to all affected + * mirrors, so we would actually end up sending the message twice. + * Removing the logging of the message helps reduce the processing + * time required. + */ + ret = mdmn_send_message(sp->setno, MD_MN_MSG_SETSYNC, + MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, + (char *)&setsyncmsg, sizeof (setsyncmsg), &resp, ep); + if (resp != NULL) { + free_result(resp); + } + + /* + * Unlike non-MN sets, the metasync command does not actually + * start a resync, it simply updates the state on all of the + * nodes. Therefore, to start a resync we send a resync starting + * message for the metadevice + */ + if (ret == 0) + ret = meta_mn_send_resync_starting(mirnp, ep); + return (ret); +} + +/* + * FUNCTION: meta_mn_send_metaclear_command() + * INPUT: sp - setname + * name - metadevice name + * options - command options + * pflag - clear all soft partitions for a given device + * OUTPUT: ep - return error pointer + * RETURNS: return value from meta_mn_send_command() + * PURPOSE: Send a metaclear command to all nodes with force(-f) and + * recurse(-r) options set if required. For hotspare pool and + * metadevices, the metadevice name is of the form setname/dxx or + * setname/hspxxx so a '-s' argument isn't required. If pflag is set + * the name refers to a metadevice or component and in the is case + * a '-s' argument is required to define the set. 
+ */ + +int +meta_mn_send_metaclear_command( + mdsetname_t *sp, + char *name, + mdcmdopts_t options, + int pflag, + md_error_t *ep +) +{ + int newargc; + char **newargv; + int ret; + + /* + * Allocate an array large enough to hold all of the possible + * metaclear arguments + */ + newargv = Calloc(7, sizeof (char *)); + newargv[0] = "metaclear"; + newargc = 1; + if (pflag) { + newargv[newargc] = "-s"; + newargc++; + newargv[newargc] = sp->setname; + newargc++; + } + if (options & MDCMD_FORCE) { + newargv[newargc] = "-f"; + newargc++; + } + if (options & MDCMD_RECURSE) { + newargv[newargc] = "-r"; + newargc++; + } + if (pflag) { + newargv[newargc] = "-p"; + newargc++; + } + newargv[newargc] = name; + newargc++; + + ret = meta_mn_send_command(sp, newargc, newargv, + MD_DISP_STDERR, NO_CONTEXT_STRING, ep); + + free(newargv); + return (ret); +} + +/* + * FUNCTION: meta_mn_send_resync_starting() + * INPUT: sp - setname + * mirnp - mirror name + * OUTPUT: ep - return error pointer + * RETURNS: return value from mdmn_send_message() + * PURPOSE: Send a resync starting message to all nodes. + */ + +int +meta_mn_send_resync_starting( + mdname_t *mirnp, + md_error_t *ep +) +{ + int result; + md_mn_msg_resync_t resyncmsg; + md_mn_result_t *resp = NULL; + minor_t mnum = meta_getminor(mirnp->dev); + + /* + * This message is never directly issued. + * So we launch it with a suspend override flag. + * If the commd is suspended, and this message comes + * along it must be sent due to replaying a command or similar. + * In that case we don't want this message to be blocked. + * If the commd is not suspended, the flag does no harm. 
+ */ + resyncmsg.msg_resync_mnum = mnum; + result = mdmn_send_message(MD_MIN2SET(mnum), + MD_MN_MSG_RESYNC_STARTING, + MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, + (char *)&resyncmsg, sizeof (resyncmsg), &resp, ep); + + if (resp != NULL) { + free_result(resp); + } + return (result); +} + +/* + * FUNCTION: meta_mn_change_owner() + * INPUT: opp - pointer to parameter block + * setno - set number of mirror metadevice + * mnum - minor number of mirror metadevice + * owner - node ID of mirror owner + * flags - flag field for ioctl + * OUTPUT: opp - parameter block used to send ioctl + * RETURNS: int - 0 success, -1 error + * PURPOSE: issue an ioctl to change the ownership of the specified mirror + * to our node ID. We need to be the owner before any watermarks + * are committed to the device otherwise we'll enter a deadly + * embrace when attempting to write the watermark. + * This function can also be used so set the owner on a node to + * NULL. In this case the change is only made on the local node. + * In addition by setting the MD_MN_MM_CHOOSE_OWNER flag, the + * function can also be used to choose a mirror resync owner. This + * function should only be called on the master and it will + * select the owner and request it to become the owner. 
+ */ +int +meta_mn_change_owner( + md_set_mmown_params_t **opp, /* Returned parameter block */ + set_t setno, /* Mirror set number */ + uint_t mnum, /* Minor number */ + uint_t owner, /* Node ID of mirror owner */ + uint_t flags /* Flags */ +) +{ + md_set_mmown_params_t *ownpar = *opp; + md_mn_own_status_t *ownstat = NULL; + struct timeval tvs, tve; + int n = 0; + int rval; + + if (ownpar != NULL) { + (void) memset(ownpar, 0, sizeof (*ownpar)); + } else { + ownpar = Zalloc(sizeof (*ownpar)); + } + ownstat = Zalloc(sizeof (*ownstat)); + + ownpar->d.mnum = mnum; + ownpar->d.owner = owner; + ownpar->d.flags = flags; + MD_SETDRIVERNAME(ownpar, MD_MIRROR, setno); + MD_SETDRIVERNAME(ownstat, MD_MIRROR, setno); + + /* + * Attempt to change the ownership to the specified node. We retry this + * up to 10 times if we receive EAGAIN from the metadevice. This only + * happens if the underlying metadevice is busy with outstanding i/o + * that requires ownership change. + */ + while ((rval = metaioctl(MD_MN_SET_MM_OWNER, ownpar, &ownpar->mde, + NULL)) != 0) { + md_sys_error_t *ip = + &ownpar->mde.info.md_error_info_t_u.sys_error; + if (ip->errnum != EAGAIN) + break; + if (n++ >= 10) + break; + (void) sleep(1); + } + + /* + * There is no need to wait for the ioctl completion if we are setting + * the owner to NULL or requesting the master to choose the owner + */ + if ((owner == 0) || (flags & MD_MN_MM_CHOOSE_OWNER)) { + Free(ownstat); + *opp = ownpar; + return (0); + } + + /* + * Wait for ioctl completion or a timeout to occur. If we + * timeout we fail the i/o request. 
+ */ + ownstat->mnum = ownpar->d.mnum; + (void) gettimeofday(&tvs, NULL); + + while ((rval == 0) && !(ownstat->flags & MD_MN_MM_RESULT)) { + while ((rval = metaioctl(MD_MN_MM_OWNER_STATUS, ownstat, + &ownstat->mde, NULL)) != 0) { + (void) gettimeofday(&tve, NULL); + if ((tve.tv_sec - tvs.tv_sec) > OWNER_TIMEOUT) { + rval = -1; + break; + } + (void) sleep(1); + } + } + + /* we did not not timeout but ioctl failed set rval */ + + if (rval == 0) { + rval = (ownstat->flags & MD_MN_MM_RES_FAIL) ? -1 : 0; + } + + Free(ownstat); + *opp = ownpar; + return (rval); +} +/* + * special handling is required when running on a single node + * non-SC3.x environment. This function determines tests + * for that case. + * + * Return values: + * 0 - no nodes or joined or in a SC3.x env + * 1 - 1 node and not in SC3.x env + */ + +int +meta_mn_singlenode() +{ + md_error_t xep = mdnullerror; + int nodecnt; + int mnset_single_node = 0; + mndiskset_membershiplist_t *nl; + + /* + * If running on SunCluster, then don't validate MN sets, + * this is done during a reconfig cycle since all nodes must + * take the same action. + * + * Only cleanup in case of a single node situation + * when not running on SunCluster. This single node + * situation occurs when the nodelist only contains + * this node and the MN setrecords only contain this + * node. + */ + if (meta_read_nodelist(&nodecnt, &nl, &xep) == -1) { + nodecnt = 0; /* no nodes are alive */ + nl = NULL; + mdclrerror(&xep); + } else { + /* + * If only 1 node in nodelist and not running + * on SunCluster, set single_node flag. 
+ */ + if ((nodecnt == 1) && + (strcmp(nl->msl_node_name, mynode()) == 0) && + ((sdssc_bind_library()) != SDSSC_OKAY)) { + mnset_single_node = 1; + } + meta_free_nodelist(nl); + } + return (mnset_single_node); +} + +/* + * FUNCTION: meta_mn_send_get_tstate() + * INPUT: dev - dev_t of device + * OUTPUT: tstatep - tstate value + * ep - return error pointer + * RETURNS: return value from mdmn_send_message() + * PURPOSE: Send a message to the master to get ui_tstate for a given device. + */ + +int +meta_mn_send_get_tstate( + md_dev64_t dev, + uint_t *tstatep, + md_error_t *ep +) +{ + int result; + md_mn_msg_gettstate_t tstatemsg; + md_mn_result_t *resp = NULL; + minor_t mnum = meta_getminor(dev); + + tstatemsg.gettstate_dev = dev; + result = mdmn_send_message(MD_MIN2SET(mnum), + MD_MN_MSG_GET_TSTATE, + MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, + (char *)&tstatemsg, sizeof (tstatemsg), &resp, ep); + + if (result == 0) + *tstatep = resp->mmr_exitval; + else + /* If some error occurred set tstate to 0 */ + *tstatep = 0; + + if (resp != NULL) { + free_result(resp); + } + return (result); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_mount.c b/usr/src/lib/lvm/libmeta/common/meta_mount.c new file mode 100644 index 0000000000..6d9cf39b4b --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_mount.c @@ -0,0 +1,97 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * return mount association with meta device + */ + +#include <meta.h> + +#include <sys/mnttab.h> + +#include "meta_lib_prv.h" + +/* + * return associated mount point with this mdname_t + */ +char * +meta_get_mountp( + mdsetname_t *sp, + mdname_t *np, + md_error_t *ep +) +{ + FILE *mfp; + struct mnttab m; + char *mountp = NULL; + char mnt_mountp[MNT_LINE_MAX]; + char mnt_special[MNT_LINE_MAX]; + + /* should have a set */ + assert(sp != NULL); + + /* look in mnttab */ + if ((mfp = open_mnttab()) == NULL) { + (void) mdsyserror(ep, errno, MNTTAB); + return (NULL); + } + + while ((!mountp) && (getmntent(mfp, &m) == 0)) { + mdname_t *mnp; + + if ((m.mnt_special == NULL) || (m.mnt_mountp == NULL)) + continue; + + if (m.mnt_mountp[0] != '/') + continue; + + if ((strcmp(m.mnt_fstype, "nfs") == 0) || + (strcmp(m.mnt_fstype, "autofs") == 0) || + (strcmp(m.mnt_fstype, "proc") == 0) || + (strcmp(m.mnt_fstype, "tmpfs") == 0) || + (strcmp(m.mnt_fstype, "cachefs") == 0) || + (strcmp(m.mnt_fstype, "lofs") == 0) || + (strcmp(m.mnt_fstype, "rfs") == 0) || + (strcmp(m.mnt_fstype, "fd") == 0)) + continue; + + (void) strcpy(mnt_mountp, m.mnt_mountp); + (void) strcpy(mnt_special, m.mnt_special); + if ((mnp = metaname(&sp, mnt_special, ep)) == NULL) { + mdclrerror(ep); + continue; + } + + if (np->dev == mnp->dev) { + mountp = mnt_mountp; + } + } + + /* return success, if found */ + return (mountp? 
Strdup(mountp): NULL); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_name.c b/usr/src/lib/lvm/libmeta/common/meta_name.c new file mode 100644 index 0000000000..7becd6af2f --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_name.c @@ -0,0 +1,3289 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <meta.h> +#include <metad.h> + +#include <ctype.h> +#include <string.h> + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. + */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +/* + * Macros to produce a quoted string containing the value of a + * preprocessor macro. For example, if SIZE is defined to be 256, + * VAL2STR(SIZE) is "256". This is used to construct format + * strings for scanf-family functions below. 
+ */ +#define QUOTE(x) #x +#define VAL2STR(x) QUOTE(x) + +extern char *getfullblkname(); +extern char *getfullrawname(); + +/* + * caches + */ +static mdsetnamelist_t *setlistp = NULL; +static mddrivenamelist_t *drivelistp = NULL; +static mdnamelist_t *fastnmlp = NULL; +static mdhspnamelist_t *hsplistp = NULL; + +/* + * leak proof name conversion + */ +static char * +rawname( + char *uname +) +{ + char *p; + struct stat sbuf1, sbuf2; + + if ((p = getfullrawname(uname)) == NULL) { + return (NULL); + } else if (*p == '\0') { + Free(p); + return (NULL); + } else { + if (stat(uname, &sbuf1) != 0) { + (void) printf(dgettext(TEXT_DOMAIN, + "device to mount in /etc/vfstab is " + "invalid for device %s\n"), uname); + exit(1); + } + if (stat(p, &sbuf2) != 0) { + (void) printf(dgettext(TEXT_DOMAIN, + "device to fsck in /etc/vfstab is " + "invalid for raw device %s\n"), p); + exit(1); + } + if (sbuf1.st_rdev != sbuf2.st_rdev) { + (void) printf(dgettext(TEXT_DOMAIN, + "/etc/vfstab entries inconsistent on " + "line containing device %s\n"), uname); + exit(1); + } + if ((sbuf1.st_mode & S_IFBLK) == 0) { + (void) printf(dgettext(TEXT_DOMAIN, + "/etc/vfstab device to mount is not a " + "block device for device %s\n"), uname); + exit(1); + } + if ((sbuf2.st_mode & S_IFCHR) == 0) { + (void) printf(dgettext(TEXT_DOMAIN, + "/etc/vfstab device to fsck is not a " + "raw device for device %s\n"), p); + exit(1); + } + return (p); + } +} + +char * +blkname( + char *uname +) +{ + char *p; + + if ((p = getfullblkname(uname)) == NULL) { + return (NULL); + } else if (*p == '\0') { + Free(p); + return (NULL); + } else { + return (p); + } +} + +/* + * parse up metadevice name + */ +static int +parse_metadevice( + char *uname, + char **snamep, + unit_t *unitp +) +{ + char *sname = Malloc(strlen(uname) + 1); + char *tname = Malloc(strlen(uname) + 1); + + unit_t unit; + int len; + char *up; + char *tp; + int lcws; /* last character was slash */ + + /* handle dont cares */ + if (unitp == NULL) + 
unitp = &unit; + + /* Now copy uname to tname by throwing away any duplicate '/' */ + for (lcws = 0, tp = tname, up = uname; *up; up++) { + if (lcws) { + if (*up == '/') { + continue; + } else { + lcws = 0; + } + } + if (*up == '/') { + lcws = 1; + } + *tp++ = *up; /* ++ is done by for loop */ + } + *tp = '\0'; + + /* without set */ + if ((sscanf(tname, "d%lu%n", unitp, &len) == 1) && + (strlen(tname) == len) && ((long)*unitp >= 0)) { + if (snamep != NULL) + *snamep = NULL; + Free(sname); + Free(tname); + return (0); + } + + /* fully-qualified without set */ + if (((sscanf(tname, "/dev/md/dsk/d%lu%n", unitp, &len) == 1) && + (strlen(tname) == len) && ((long)*unitp >= 0)) || + ((sscanf(tname, "/dev/md/rdsk/d%lu%n", unitp, &len) == 1) && + (strlen(tname) == len) && ((long)*unitp >= 0))) { + if (snamep != NULL) + *snamep = Strdup(MD_LOCAL_NAME); + Free(sname); + Free(tname); + return (0); + } + + /* with set */ + if (((sscanf(tname, "%[^/]/d%lu%n", sname, unitp, &len) == 2) && + (strlen(tname) == len) && ((long)*unitp >= 0)) || + ((sscanf(tname, "/dev/md/%[^/]/dsk/d%lu%n", sname, + unitp, &len) == 2) && + (strlen(tname) == len) && ((long)*unitp >= 0)) || + ((sscanf(tname, "/dev/md/%[^/]/rdsk/d%lu%n", sname, + unitp, &len) == 2) && + (strlen(tname) == len) && ((long)*unitp >= 0))) { + if (snamep != NULL) { + *snamep = sname; + } else { + Free(sname); + } + Free(tname); + return (0); + } + + /* no match */ + if (snamep != NULL) + *snamep = NULL; + Free(sname); + Free(tname); + return (-1); +} + +/* + * FUNCTION: parse_device() + * INPUT: sp - pointer to setname struct + * uname - Name of either a hotspare pool or metadevice + * This can either be a fully qualified path or + * in the form [set name/]device + * OUTPUT: setnamep - name of the set that uname is in + * uname - name of the hotspare pools or metadevice + * only contains the name of the device with all + * other path information stripped off. 
+ * PURPOSE: Parse uname and sp into the set name and device name strings. + * If the set name is specified as part of uname then use that + * otherwise attempt to get the set name from sp. + */ +static void +parse_device( + mdsetname_t *sp, + char *uname, + char **setnamep /* dynamically alloced - caller must free */ +) +{ + char setname[FILENAME_MAX+1]; + char *tname = Malloc(strlen(uname) + 1); + + int len; + char *up; + char *tp; + int lcws; /* last character was slash */ + + /* Now copy uname to tname by throwing away any duplicate '/' */ + for (lcws = 0, tp = tname, up = uname; *up; up++) { + if (lcws) { + if (*up == '/') { + continue; + } else { + lcws = 0; + } + } + if (*up == '/') { + lcws = 1; + } + *tp++ = *up; /* ++ is done by for loop */ + } + *tp = '\0'; + + /* fully-qualified - local set */ + if (((sscanf(tname, "/dev/md/dsk/%" VAL2STR(FILENAME_MAX) "s%n", + uname, &len) == 1) && (strlen(tname) == len)) || + ((sscanf(tname, "/dev/md/rdsk/%" VAL2STR(FILENAME_MAX) "s%n", + uname, &len) == 1) && (strlen(tname) == len))) { + if (setnamep != NULL) + *setnamep = NULL; + Free(tname); + return; + } + + /* with setname specified - either fully qualified and relative spec */ + if (((sscanf(tname, "%" VAL2STR(FILENAME_MAX) "s/%" + VAL2STR(FILENAME_MAX) "s%n", setname, uname, &len) == 2) && + (strlen(tname) == len)) || + ((sscanf(tname, "/dev/md/%[^/]/dsk/%" VAL2STR(FILENAME_MAX) "s%n", + setname, uname, &len) == 2) && (strlen(tname) == len)) || + ((sscanf(tname, "/dev/md/%[^/]/rdsk/%" VAL2STR(FILENAME_MAX) "s%n", + setname, uname, &len) == 2) && (strlen(tname) == len))) { + + if (setnamep != NULL) { + *setnamep = Strdup(setname); + } + Free(tname); + return; + } + + /* without setname specified */ + (void) strcpy(uname, tname); + if (setnamep != NULL) { + if (sp != NULL && !metaislocalset(sp)) + *setnamep = Strdup(sp->setname); + else + *setnamep = NULL; + } + Free(tname); +} + +/* + * parse up hotspare pool name + */ +static int +parse_hsp( + char *uname, + 
char **snamep, + hsp_t *hspp +) +{ + char *sname = Malloc(strlen(uname) + 1); + hsp_t hsp; + int len; + + /* handle dont cares */ + if (hspp == NULL) + hspp = &hsp; + + /* without set */ + if ((sscanf(uname, "hsp%03u%n", hspp, &len) == 1) && + (strlen(uname) == len) && ((long)*hspp >= 0)) { + if (snamep != NULL) + *snamep = NULL; + Free(sname); + return (0); + } + + /* with set */ + if ((sscanf(uname, "%[^/]/hsp%03u%n", sname, + hspp, &len) == 2) && + (strlen(uname) == len) && ((long)*hspp >= 0)) { + if (snamep != NULL) { + *snamep = sname; + } else { + Free(sname); + } + return (0); + } + + /* no match */ + Free(sname); + return (-1); +} + +/* + * canonicalize metadevice name + */ +static char * +canon_metadevice( + char *sname, + unit_t unit +) +{ + char *cname; + size_t len; + + if ((sname == NULL) || (strcmp(sname, MD_LOCAL_NAME) == 0)) { + len = strlen("d") + 20 + 1; + cname = Malloc(len); + (void) snprintf(cname, len, "d%lu", unit); + } else { + len = strlen(sname) + strlen("/d") + 20 + 1; + cname = Malloc(len); + (void) snprintf(cname, len, "%s/d%lu", sname, unit); + } + + return (cname); +} + +/* + * canonicalize hotspare pool name + */ +static char * +canon_hsp( + char *sname, + hsp_t hsp +) +{ + char *cname; + size_t len; + + if ((sname == NULL) || (strcmp(sname, MD_LOCAL_NAME) == 0)) { + cname = Malloc(strlen("hsp000") + 1); + (void) sprintf(cname, "hsp%03u", hsp); + } else { + len = strlen(sname) + strlen("/hsp000") + 1; + cname = Malloc(len); + (void) snprintf(cname, len, "%s/hsp%03lu", sname, hsp); + } + + return (cname); +} + +/* + * canonicalize name, return type + * + * NOTE: this is really only for use by meta_tab* + */ +char * +meta_canonicalize( + mdsetname_t *sp, + char *uname +) +{ + char *sname = NULL; + char *cname; + + /* return the set name and dev name */ + parse_device(sp, uname, &sname); + + if (sname == NULL) + cname = Strdup(uname); + else { + size_t cname_len; + + cname_len = strlen(uname) + strlen(sname) + 2; + cname = 
Malloc(cname_len); + (void) snprintf( + cname, cname_len, "%s/%s", sname, uname); + Free(sname); + } + return (cname); +} + +/* + * check that name is a metadevice + */ +int +is_metaname( + char *uname +) +{ + if (parse_metadevice(uname, NULL, NULL) == 0) + return (1); + else + return (0); +} + +/* + * check that name is a hotspare pool + */ +int +is_hspname( + char *uname +) +{ + if (parse_hsp(uname, NULL, NULL) == 0) + return (1); + else + return (0); +} + +/* + * mdsetname_t stuff + */ + +/* + * initialize setname + */ +static void +metainitsetname( + mdsetname_t *sp +) +{ + (void) memset(sp, '\0', sizeof (*sp)); +} + +static void +metafreesetdesc(md_set_desc *sd) +{ + md_mnnode_desc *nd; + + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + sd->sd_nodelist = nd->nd_next; + Free(nd); + nd = sd->sd_nodelist; + } + } + metafreedrivedesc(&sd->sd_drvs); + Free(sd); +} + +/* + * free allocated setname + */ +static void +metafreesetname( + mdsetname_t *sp +) +{ + if (sp->setname != NULL) + Free(sp->setname); + if (sp->setdesc != NULL) + metafreesetdesc(sp->setdesc); + metainitsetname(sp); +} + +/* + * flush the setname cache + */ +static void +metaflushsetnames() +{ + mdsetnamelist_t *p, *n; + + for (p = setlistp, n = NULL; (p != NULL); p = n) { + n = p->next; + metafreesetname(p->sp); + Free(p->sp); + Free(p); + } + setlistp = NULL; +} + +/* + * get set number + */ +static int +getsetno( + char *sname, + set_t *setnop, + md_error_t *ep +) +{ + md_set_record *sr; + size_t len; + + /* local set */ + if ((sname == NULL) || (strcmp(sname, MD_LOCAL_NAME) == 0)) { + *setnop = 0; + return (0); + } + + /* shared set */ + if ((sr = getsetbyname(sname, ep)) == NULL) { + if (mdisrpcerror(ep, RPC_PROGNOTREGISTERED)) { + char *p; + + len = strlen(sname) + 30; + p = Malloc(len); + + (void) snprintf(p, len, "setname \"%s\"", sname); + (void) mderror(ep, MDE_NO_SET, p); + Free(p); + } + return (-1); + } + *setnop = sr->sr_setno; + free_sr(sr); + return (0); +} + +/* 
+ * find setname from name + */ +mdsetname_t * +metasetname( + char *sname, + md_error_t *ep +) +{ + mdsetnamelist_t **tail; + set_t setno; + mdsetname_t *sp; + + /* look for cached value first */ + assert(sname != NULL); + for (tail = &setlistp; (*tail != NULL); tail = &(*tail)->next) { + sp = (*tail)->sp; + if (strcmp(sp->setname, sname) == 0) { + return (sp); + } + } + + /* setup set */ + if (getsetno(sname, &setno, ep) != 0) + return (NULL); + + /* allocate new list element and setname */ + *tail = Zalloc(sizeof (**tail)); + sp = (*tail)->sp = Zalloc(sizeof (*sp)); + + sp->setname = Strdup(sname); + sp->setno = setno; + sp->lockfd = MD_NO_LOCK; + + return (sp); +} + +/* + * find setname from setno + */ +mdsetname_t * +metasetnosetname( + set_t setno, + md_error_t *ep +) +{ + mdsetnamelist_t *slp; + mdsetname_t *sp; + md_set_record *sr; + + /* look for cached value first */ + for (slp = setlistp; (slp != NULL); slp = slp->next) { + sp = slp->sp; + if (sp->setno == setno) + return (sp); + } + + /* local set */ + if (setno == MD_LOCAL_SET) + return (metasetname(MD_LOCAL_NAME, ep)); + + /* shared set */ + if ((sr = getsetbynum(setno, ep)) == NULL) + return (NULL); + sp = metasetname(sr->sr_setname, ep); + free_sr(sr); + return (sp); +} + +mdsetname_t * +metafakesetname( + set_t setno, + char *sname +) +{ + mdsetnamelist_t **tail; + mdsetname_t *sp; + + /* look for cached value first */ + for (tail = &setlistp; (*tail != NULL); tail = &(*tail)->next) { + sp = (*tail)->sp; + if (sp->setno == setno) { + if ((sp->setname == NULL) && (sname != NULL)) + sp->setname = Strdup(sname); + return (sp); + } + } + + /* allocate new list element and setname */ + *tail = Zalloc(sizeof (**tail)); + sp = (*tail)->sp = Zalloc(sizeof (*sp)); + + if (sname != NULL) + sp->setname = Strdup(sname); + sp->setno = setno; + sp->lockfd = MD_NO_LOCK; + + return (sp); +} + + +/* + * setup set record (sr) and cache it in the mdsetname_t struct + */ +md_set_desc * +sr2setdesc( + md_set_record *sr 
+) +{ + md_set_desc *sd; + int i; + md_mnset_record *mnsr; + md_mnnode_desc *nd, *nd_prev = 0; + md_mnnode_record *nr; + md_error_t status = mdnullerror; + md_error_t *ep = &status; + int nodecnt, nrcnt; + mndiskset_membershiplist_t *nl, *nl2; + + sd = Zalloc(sizeof (*sd)); + sd->sd_ctime = sr->sr_ctime; + sd->sd_genid = sr->sr_genid; + sd->sd_setno = sr->sr_setno; + sd->sd_flags = sr->sr_flags; + + if (MD_MNSET_DESC(sd)) { + mnsr = (md_mnset_record *)sr; + (void) strlcpy(sd->sd_mn_master_nodenm, + mnsr->sr_master_nodenm, sizeof (sd->sd_mn_master_nodenm)); + sd->sd_mn_master_nodeid = mnsr->sr_master_nodeid; + if (strcmp(mnsr->sr_master_nodenm, mynode()) == 0) { + sd->sd_mn_am_i_master = 1; + } + + /* + * Get membershiplist from API routine. If there's + * an error, just use a NULL nodelist. + */ + if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) { + nodecnt = 0; /* no nodes are alive */ + nl = NULL; + } + nr = mnsr->sr_nodechain; + nrcnt = 0; + /* + * Node descriptor node list must be built in + * ascending order of nodeid. The nodechain + * in the mnset record is in ascending order, + * so just make them the same. + */ + while (nr) { + nd = Zalloc(sizeof (*nd)); + if (nd_prev) { + nd_prev->nd_next = nd; + } else { + sd->sd_nodelist = nd; + } + nd->nd_ctime = nr->nr_ctime; + nd->nd_genid = nr->nr_genid; + nd->nd_flags = nr->nr_flags; + + (void) strlcpy(nd->nd_nodename, nr->nr_nodename, + sizeof (nd->nd_nodename)); + nd->nd_nodeid = nr->nr_nodeid; + if (strcmp(nd->nd_nodename, mynode()) == 0) { + sd->sd_mn_mynode = nd; + } + if (nd->nd_nodeid == sd->sd_mn_master_nodeid) { + sd->sd_mn_masternode = nd; + } + + /* + * If node is marked ALIVE, then set priv_ic + * from membership list. During the early part + * of a reconfig cycle, the membership list may + * have been changed, (a node entering or leaving + * the cluster), but rpc.metad hasn't flushed + * its data yet. 
So, if node is marked alive, but + * is no longer in the membership list (node has + * left the cluster) then just leave priv_ic to NULL. + */ + if (nd->nd_flags & MD_MN_NODE_ALIVE) { + nl2 = nl; + while (nl2) { + if (nl2->msl_node_id == nd->nd_nodeid) { + (void) strlcpy(nd->nd_priv_ic, + nl2->msl_node_addr, + sizeof (nd->nd_priv_ic)); + break; + } + nl2 = nl2->next; + } + } + + nr = nr->nr_next; + nrcnt++; + nd_prev = nd; + } + sd->sd_mn_numnodes = nrcnt; + if (nodecnt) + meta_free_nodelist(nl); + + /* Just copying to keep consistent view between sr & sd */ + (void) strlcpy(sd->sd_nodes[0], mnsr->sr_nodes_bw_compat[0], + sizeof (sd->sd_nodes[0])); + } else { + for (i = 0; i < MD_MAXSIDES; i++) + (void) strlcpy(sd->sd_nodes[i], sr->sr_nodes[i], + sizeof (sd->sd_nodes[i])); + } + + sd->sd_med = sr->sr_med; /* structure assignment */ + + return (sd); +} + +md_set_desc * +metaget_setdesc( + mdsetname_t *sp, + md_error_t *ep +) +{ + md_set_record *sr; + + if (sp->setdesc != NULL) + return (sp->setdesc); + + if (sp->setname != NULL) { + if ((sr = getsetbyname(sp->setname, ep)) != NULL) { + sp->setdesc = sr2setdesc(sr); + free_sr(sr); + return (sp->setdesc); + } + } + + if (sp->setno > 0) { + if ((sr = getsetbynum(sp->setno, ep)) != NULL) { + sp->setdesc = sr2setdesc(sr); + free_sr(sr); + return (sp->setdesc); + } + } + + return (NULL); +} + +void +metaflushsetname(mdsetname_t *sp) +{ + if (sp == NULL) + return; + + if (sp->setdesc == NULL) + return; + + metafreesetdesc(sp->setdesc); + sp->setdesc = NULL; +} + +/* + * check for local set + */ +int +metaislocalset( + mdsetname_t *sp +) +{ + assert(sp->setname != NULL); + if (strcmp(sp->setname, MD_LOCAL_NAME) == 0) { + assert(sp->setno == MD_LOCAL_SET); + return (1); + } else { + assert(sp->setno != MD_LOCAL_SET); + return (0); + } +} + +/* + * check for same set + */ +int +metaissameset( + mdsetname_t *sp1, + mdsetname_t *sp2 +) +{ + if (strcmp(sp1->setname, sp2->setname) == 0) { + assert(sp1->setno == sp2->setno); + 
return (1); + } else { + assert(sp1->setno != sp2->setno); + return (0); + } +} + +/* + * check to see if set changed + */ +static int +chkset( + mdsetname_t **spp, + char *sname, + md_error_t *ep +) +{ + /* if we already have a set, make sure it's the same */ + if (*spp != NULL) { + if ((*spp)->setname != sname && + strcmp((*spp)->setname, sname) != 0) { + return (mderror(ep, MDE_SET_DIFF, sname)); + } + return (0); + } + + /* otherwise store new set name and number */ + if ((*spp = metasetname(sname, ep)) == NULL) { + return (-1); + } + + /* return success */ + return (0); +} + +/* + * check to see if set changed from default + */ +static int +chksetname( + mdsetname_t **spp, + char *sname, + md_error_t *ep +) +{ + /* default to *spp's setname, or if that is NULL to MD_LOCAL_NAME */ + if (sname == NULL) { + if (*spp) { + sname = (*spp)->setname; + } else { + sname = MD_LOCAL_NAME; + } + } + + /* see if changed */ + return (chkset(spp, sname, ep)); +} + +/* + * check setname from setno + */ +static int +chksetno( + mdsetname_t **spp, + set_t setno, + md_error_t *ep +) +{ + md_set_record *sr; + int rval; + + /* local set */ + if (setno == 0) + return (chkset(spp, MD_LOCAL_NAME, ep)); + + /* shared set */ + if ((sr = getsetbynum(setno, ep)) == NULL) + return (-1); + rval = chkset(spp, sr->sr_setname, ep); + free_sr(sr); + return (rval); +} + +/* + * mddrivename_t stuff + */ + +/* + * initialize name + */ +static void +metainitname( + mdname_t *np +) +{ + (void) memset(np, 0, sizeof (*np)); + np->dev = NODEV64; + np->key = MD_KEYBAD; + np->end_blk = -1; + np->start_blk = -1; +} + +/* + * free allocated name + */ +static void +metafreename( + mdname_t *np +) +{ + if (np->cname != NULL) + Free(np->cname); + if (np->bname != NULL) + Free(np->bname); + if (np->rname != NULL) + Free(np->rname); + if (np->devicesname != NULL) + Free(np->devicesname); + metainitname(np); +} + +/* + * initialize drive name + */ +static void +metainitdrivename( + mddrivename_t *dnp +) +{ + 
(void) memset(dnp, 0, sizeof (*dnp)); + dnp->side_names_key = MD_KEYBAD; +} + +/* + * flush side names + */ +void +metaflushsidenames( + mddrivename_t *dnp +) +{ + mdsidenames_t *p, *n; + + for (p = dnp->side_names, n = NULL; (p != NULL); p = n) { + n = p->next; + if (p->dname != NULL) + Free(p->dname); + if (p->cname != NULL) + Free(p->cname); + Free(p); + } + dnp->side_names = NULL; +} + +/* + * free drive name + */ +void +metafreedrivename( + mddrivename_t *dnp +) +{ + uint_t slice; + + if (dnp->cname != NULL) + Free(dnp->cname); + if (dnp->rname != NULL) + Free(dnp->rname); + metafreevtoc(&dnp->vtoc); + for (slice = 0; (slice < dnp->parts.parts_len); ++slice) + metafreename(&dnp->parts.parts_val[slice]); + if (dnp->parts.parts_val != NULL) + Free(dnp->parts.parts_val); + metaflushsidenames(dnp); + if (dnp->miscname != NULL) + Free(dnp->miscname); + meta_free_unit(dnp); + metainitdrivename(dnp); +} + +/* + * flush the drive name cache + */ +static void +metaflushdrivenames() +{ + mddrivenamelist_t *p, *n; + + for (p = drivelistp, n = NULL; (p != NULL); p = n) { + n = p->next; + metafreedrivename(p->drivenamep); + Free(p->drivenamep); + Free(p); + } + drivelistp = NULL; +} + +/* + * peel off s%u from name + */ +char * +metadiskname( + char *name +) +{ + char *p, *e; + char onmb[BUFSIZ+1], cnmb[BUFSIZ]; + uint_t d = 0; + int l = 0; + int cl = strlen(name); + + if (is_metaname(name)) + return (Strdup(name)); + + /* + * Handle old style names, which are of the form /dev/rXXNN[a-h]. + */ + if (sscanf(name, "/dev/r%" VAL2STR(BUFSIZ) "[^0-9/]%u%*[a-h]%n", + onmb, &d, &l) == 2 && l == cl) { + (void) snprintf(cnmb, sizeof (cnmb), "/dev/r%s%u", onmb, d); + return (Strdup(cnmb)); + } + + /* + * Handle old style names, which are of the form /dev/XXNN[a-h]. 
+ */ + if (sscanf(name, "/dev/%" VAL2STR(BUFSIZ) "[^0-9/]%u%*[a-h]%n", + onmb, &d, &l) == 2 && l == cl) { + (void) snprintf(cnmb, sizeof (cnmb), "/dev/%s%u", onmb, d); + return (Strdup(cnmb)); + } + + /* gobble number and 's' */ + p = e = name + strlen(name) - 1; + for (; (p > name); --p) { + if (!isdigit(*p)) + break; + } + if ((p == e) || (p <= name)) + return (Strdup(name)); + + if (*p != 's' && strchr("dt", *p) == NULL) + return (Strdup(name)); + else if (strchr("dt", *p) != NULL) + return (Strdup(name)); + p--; + + if ((p <= name) || (!isdigit(*p))) + return (Strdup(name)); + + *(++p) = '\0'; + e = Strdup(name); + *p = 's'; + + return (e); +} + +/* + * free list of drivenames + */ +void +metafreedrivenamelist( + mddrivenamelist_t *dnlp +) +{ + mddrivenamelist_t *next = NULL; + + for (/* void */; (dnlp != NULL); dnlp = next) { + next = dnlp->next; + Free(dnlp); + } +} + +/* + * build list of drivenames + */ +int +metadrivenamelist( + mdsetname_t **spp, + mddrivenamelist_t **dnlpp, + int argc, + char *argv[], + md_error_t *ep +) +{ + mddrivenamelist_t **tailpp = dnlpp; + int count = 0; + + for (*dnlpp = NULL; (argc > 0); ++count, --argc, ++argv) { + mddrivenamelist_t *dnlp = Zalloc(sizeof (*dnlp)); + + if ((dnlp->drivenamep = metadrivename(spp, argv[0], + ep)) == NULL) { + metafreedrivenamelist(*dnlpp); + *dnlpp = NULL; + return (-1); + } + *tailpp = dnlp; + tailpp = &dnlp->next; + } + return (count); +} + +/* + * append to end of drivename list + */ +mddrivename_t * +metadrivenamelist_append( + mddrivenamelist_t **dnlpp, + mddrivename_t *dnp +) +{ + mddrivenamelist_t *dnlp; + + /* run to end of list */ + for (; (*dnlpp != NULL); dnlpp = &(*dnlpp)->next) + ; + + /* allocate new list element */ + dnlp = *dnlpp = Zalloc(sizeof (*dnlp)); + + /* append drivename */ + dnlp->drivenamep = dnp; + return (dnp); +} + +/* + * FUNCTION: meta_drivenamelist_append_wrapper() + * INPUT: tailpp - pointer to the list tail pointer + * dnp - name node to be appended to list + * 
OUTPUT: none + * RETURNS: mddrivenamelist_t * - new tail of the list. + * PURPOSE: wrapper to meta_namelist_append for performance. + * metanamelist_append finds the tail each time which slows + * down long lists. By keeping track of the tail ourselves + * we can change metadrivenamelist_append into a + * constant time operation. + */ +mddrivenamelist_t ** +meta_drivenamelist_append_wrapper( + mddrivenamelist_t **tailpp, + mddrivename_t *dnp +) +{ + (void) metadrivenamelist_append(tailpp, dnp); + + /* If it's the first item in the list, return it instead of the next */ + if ((*tailpp)->next == NULL) + return (tailpp); + + return (&(*tailpp)->next); +} + + +/* + * mdname_t stuff + */ + +/* + * check set and get comparison name + */ +char * +meta_name_getname( + mdsetname_t **spp, + char *uname, + md_error_t *ep +) +{ + char *sname = NULL; + int ismeta = 0; + unit_t unit; + + /* check set name */ + if (parse_metadevice(uname, &sname, &unit) == 0) + ismeta = 1; + if (chksetname(spp, sname, ep) != 0) { + if (sname != NULL) + Free(sname); + return (NULL); + } + if (sname != NULL) + Free(sname); + + /* return comparison name */ + if (ismeta) + return (canon_metadevice((*spp)->setname, unit)); + else + return (Strdup(uname)); +} + +/* + * FUNCTION: getrname() + * INPUT: spp - the setname struct + * uname - the possibly unqualified device name + * OUTPUT: ep - return error pointer + * RETURNS: char* - character string containing the fully + * qualified raw device name + * PURPOSE: Create the fully qualified raw name for the possibly + * unqualified device name. If uname is an absolute + * path the raw name is derived from the input string. + * Otherwise, an attempt is made to get the rawname by + * catting "/dev/md/rdsk" and "/dev/rdsk". 
+ */ +static char * +getrname(mdsetname_t **spp, char *uname, md_error_t *ep) +{ + char *rname, + *fname; + int constructed = 0; + + assert(uname != NULL); + /* if it is an absolute name then just call rawname on the input */ + if (uname[0] == '/') { + if ((rname = rawname(uname)) != NULL) + return (rname); + + /* out of luck */ + (void) mdsyserror(ep, ENOENT, uname); + return (NULL); + } + + /* + * Check for metadevice before physical device. + * With the introduction of softpartitions it is more + * likely to be a metadevice. + */ + + /* metadevice short form */ + if (metaislocalset(*spp)) { + fname = Malloc(strlen(uname) + strlen("/dev/md/rdsk/") + 1); + (void) strcpy(fname, "/dev/md/rdsk/"); + (void) strcat(fname, uname); + if (*uname == 'd') + constructed = 1; + } else { + char *p; + size_t len; + + if ((p = strchr(uname, '/')) != NULL) { + ++p; + } else { + p = uname; + } + len = strlen((*spp)->setname) + strlen(p) + + strlen("/dev/md//rdsk/") + 1; + fname = Malloc(len); + (void) snprintf(fname, len, "/dev/md/%s/rdsk/%s", + (*spp)->setname, p); + if (*p == 'd') + constructed = 1; + } + rname = rawname(fname); + + /* + * Handle the case where we have a new metadevice that does not yet + * exist in the name-space. In this case we return the constructed + * metadevice name as that will exist after the metainit call has + * created it. + */ + if ((rname == NULL) && constructed) { + rname = Strdup(fname); + } + Free(fname); + if (rname != NULL) + return (rname); + + fname = Malloc(strlen(uname) + strlen("/dev/rdsk/") + 1); + (void) strcpy(fname, "/dev/rdsk/"); + (void) strcat(fname, uname); + rname = rawname(fname); + Free(fname); + if (rname != NULL) + return (rname); + + /* + * If all else fails try the straight uname. + * NOTE: This check was at the beginning of getrname instead + * of here. It was moved to avoid a conflict with SC3.0. If + * a diskset was mounted with the same name it would hang + * the cluster in a loop. 
Example: + * + * fubar/d10 -m fubar/d0 fubar/d1 + * mount /dev/md/fubar/dsk/d10 /fubar + * + * When the system was booted DiskSuite would try to take ownership + * of diskset fubar. This would cause rawname("fubar/d10") to be + * called. rawname() stats the string which caused the cluster + * reservation code to try and take ownership which it was already + * doing and a deadlock would occur. By moving this final attempt + * at resolving the rawname to the end we avoid this deadlock. + */ + if (rname = rawname(uname)) + return (rname); + + /* out of luck */ + (void) mdsyserror(ep, ENOENT, uname); + return (NULL); +} + +/* + * get raw slice and drive names + */ +static char * +getrawnames( + mdsetname_t **spp, + char *uname, + char **dnamep, + md_error_t *ep +) +{ + char *rname; + size_t len; + + /* initialize */ + *dnamep = NULL; + + /* get slice name */ + if ((rname = getrname(spp, uname, ep)) != NULL) { + *dnamep = metadiskname(rname); + return (rname); + } + + /* + * If name cannot be found, if may be because is is not accessible. + * If it is an absolute name, try all possible disk name formats and + * if it is device name, assume it is /dev/rdsk/... 
+ */ + if (mdissyserror(ep, ENOENT)) { + if (uname[0] == '/') { + /* Absolute name */ + char *p; + uint_t d = 0; + int l = 0; + char onmb[BUFSIZ+1], snm[BUFSIZ+1]; + + /* + * Handle old style raw names + */ + if (sscanf(uname, + "/dev/r%" VAL2STR(BUFSIZ) "[^0-9/]%u" + "%" VAL2STR(BUFSIZ) "[a-h]%n", + onmb, &d, snm, &l) == 3 && l == strlen(uname)) { + mdclrerror(ep); + rname = Strdup(uname); + *dnamep = metadiskname(rname); + return (rname); + } + + /* + * Handle old style block names + */ + if (sscanf(uname, + "/dev/%" VAL2STR(BUFSIZ) "[^0-9/]%u" + "%" VAL2STR(BUFSIZ) "[a-h]%n", + onmb, &d, snm, &l) == 3 && l == strlen(uname)) { + len = strlen(uname) + 1 + 1; + rname = Malloc(len); + (void) snprintf(rname, len, "/dev/r%s%u%s", + onmb, d, snm); + *dnamep = metadiskname(rname); + return (rname); + } + + /* /.../dsk/... */ + if ((p = strstr(uname, "/dsk/")) != NULL) { + mdclrerror(ep); + ++p; + rname = Malloc(strlen(uname) + 1 + 1); + (void) strncpy(rname, uname, (p - uname)); + rname[(p - uname)] = 'r'; + (void) strcpy(&rname[(p - uname) + 1], p); + *dnamep = metadiskname(rname); + return (rname); + } + + /* /.../rdsk/... */ + else if (strstr(uname, "/rdsk/") != NULL) { + mdclrerror(ep); + rname = Strdup(uname); + *dnamep = metadiskname(rname); + return (rname); + } + } else { + /* + * If it's not an absolute name but is a valid ctd name, + * guess at /dev/rdsk/... 
+ */ + uint_t s; + if (parse_ctd(uname, &s) == 0) { + len = strlen(uname) + strlen("/dev/rdsk/") + 1; + rname = Malloc(len); + (void) snprintf(rname, len, "/dev/rdsk/%s", + uname); + *dnamep = metadiskname(rname); + return (rname); + } + } + } + + /* out of luck */ + return (NULL); +} + +/* + * get number of slices for name + */ +static int +getnslice( + char *rname, + char *dname, + uint_t *slicep +) +{ + char *srname; + uint_t nslice; + size_t dl = strlen(dname); + size_t rl = strlen(rname); + size_t l = 0; + size_t len; + + /* + * get our slice number - works only with names that end in s%u - + * all others return -1. + */ + if (dl >= rl || + sscanf(&rname[dl], "s%u%n", slicep, &l) != 1 || l != rl || + (int)*slicep < 0) { + return (-1); + } + + /* + * go find how many slices there really are + */ + len = strlen(dname) + 20 + 1; + srname = Malloc(len); + for (nslice = 0; /* void */; ++nslice) { + struct stat statbuf; + + /* build slice name */ + (void) snprintf(srname, len, "%ss%u", dname, nslice); + + /* see if it's there */ + if ((meta_stat(srname, &statbuf) != 0) || + (! S_ISCHR(statbuf.st_mode))) { + break; + } + } + Free(srname); + + /* Need to make sure that we at least have V_NUMPAR */ + nslice = max(nslice, V_NUMPAR); + + /* make sure we have at least our slice */ + if (nslice < *slicep) + return (-1); + + /* return number of slices */ + return (nslice); +} + +/* + * Attempt to parse the input string as a c[t]ds specifier + * The target can either be a SCSI target id or if the device + * is in a fabric configuration in a fibre channel setup then + * the target is a standard WWN (world wide name). 
+ * + * if successful return 0 + * if c[t]dp name return 1 + * otherwise return -1 + */ +int +parse_ctd( + char *uname, + uint_t *slice) +{ + uint_t channel; + uint_t target; + uint_t device; + int has_target = 1; + uint_t cl; + uint_t target_str_len; + char *partial_ctd_str; + char *target_str; + char *device_start_pos; + int l = -1; + + /* pull off the channel spec and the 't' for the target */ + if (sscanf(uname, "c%ut%n", &channel, &l) != 1 || l == -1) { + /* check for cds style name */ + if (sscanf(uname, "c%ud%n", &channel, &l) != 1 || l == -1) { + return (-1); + } else { + l--; /* we want to be on the 'd' */ + has_target = 0; + } + } + partial_ctd_str = uname + l; + + /* find the beginning of the device specifier */ + device_start_pos = strrchr(partial_ctd_str, 'd'); + if (device_start_pos == NULL) { + return (-1); + } + + /* check to see if it is a ctd with a WWN or SCSI target */ + if (has_target) { + /* pull off the target and see if it is a WWN */ + target_str_len = device_start_pos - partial_ctd_str + 2; + target_str = (char *)Malloc(target_str_len+1); + (void) strcpy(target_str, "0X"); + (void) strncpy(target_str+2, partial_ctd_str, + target_str_len - 2); + target_str[target_str_len] = '\0'; + if (sscanf(target_str, "%x%n", &target, &l) != 1 || + l != target_str_len) { + Free(target_str); + return (-1); + } + Free(target_str); + } + + /* check the device and slice */ + cl = strlen(device_start_pos); + if (sscanf(device_start_pos, "d%us%u%n", &device, slice, &l) != 2 || + l != cl) { + /* check the device and partition */ + if (sscanf(device_start_pos, "d%up%u%n", &device, slice, &l) + == 2 && l == cl) { + return (1); + } + return (-1); + } + + return (0); +} + + +/* + * get number of slices for name + */ +static int +uname2sliceno( + char *uname, + uint_t *slicep, + md_error_t *ep +) +{ + uint_t c = 0, t = 0, d = 0; + int l = 0, cl = 0; + int fd; + struct dk_cinfo cinfo; + char *p; + char *rname = NULL; + + if (is_metaname(uname)) + return (*slicep = 
0); + + if ((p = strrchr(uname, '/')) != NULL) + p++; + else + p = uname; + + cl = strlen(p); + + if (parse_ctd(p, slicep) == 0) + return (*slicep); + else if (sscanf(p, "mc%ut%ud%us%u%n", &c, &t, &d, slicep, &l) == 4 && + l == cl) + return (*slicep); + else if (sscanf(p, "d%us%u%n", &d, slicep, &l) == 2 && l == cl) + return (*slicep); + + /* + * If we can't get the slice from the name, then we have to do it the + * hard and expensive way. + */ + if ((rname = rawname(uname)) == NULL) + return (-1); + + /* get controller info */ + if ((fd = open(rname, (O_RDONLY|O_NDELAY), 0)) < 0) { + Free(rname); + return (-1); + } + + if (ioctl(fd, DKIOCINFO, &cinfo) != 0) { + int save = errno; + + if (save == ENOTTY) + (void) mddeverror(ep, MDE_NOT_DISK, NODEV64, rname); + else + (void) mdsyserror(ep, save, rname); + + Free(rname); + (void) close(fd); + return (-1); + } + (void) close(fd); /* sd/ssd bug */ + + if (cinfo.dki_partition < V_NUMPAR) { + Free(rname); + return (*slicep = cinfo.dki_partition); + } + + return (mddeverror(ep, MDE_NOT_DISK, NODEV64, rname)); +} + +/* + * get partition info + */ +static int +getparts( + mddrivename_t *dnp, + char *rname, + char *dname, + uint_t *npartsp, + uint_t *partnop, + md_error_t *ep +) +{ + int nparts; + uint_t partno; + mdname_t name; + mdvtoc_t *vtocp; + + /* metadevice */ + if (is_metaname(rname)) { + dnp->type = MDT_META; + nparts = 1; + partno = 0; + goto gotit; + } + + /* see how many partitions in drive, this is really tricky */ + metainitname(&name); + name.rname = rname; + name.drivenamep = dnp; + if ((vtocp = metagetvtoc(&name, TRUE, &partno, ep)) != NULL) { + dnp->type = MDT_COMP; + nparts = vtocp->nparts; + /* partno already setup */ + /* dname already setup */ + goto gotit; + } + + if ((ep->info.errclass == MDEC_DEV) && + (ep->info.md_error_info_t_u.dev_error.errnum == MDE_TOO_MANY_PARTS)) + return (-1); + + /* fallback and try and guess (used to check for just EACCES here) */ + if ((dname != NULL) && + ((nparts = 
getnslice(rname, dname, &partno)) > 0)) { + dnp->type = MDT_ACCES; + if (mdanysyserror(ep)) { + dnp->errnum = + ep->info.md_error_info_t_u.sys_error.errnum; + } else { + dnp->errnum = ENOENT; + } + mdclrerror(ep); + /* nparts already setup */ + /* partno already setup */ + /* dname already setup */ + nparts = roundup(nparts, V_NUMPAR); + goto gotit; + } + + /* nothing worked */ + dnp->type = MDT_UNKNOWN; + if (mdissyserror(ep, EACCES)) + dnp->type = MDT_ACCES; + + if (mdanysyserror(ep)) { + dnp->errnum = ep->info.md_error_info_t_u.sys_error.errnum; + } else { + dnp->errnum = ENOENT; + } + + mdclrerror(ep); + nparts = V_NUMPAR; + if (uname2sliceno(rname, &partno, ep) < 0) { + mdclrerror(ep); + partno = 0; + } + + /* return success */ +gotit: + assert(nparts > 0); + + if (partno >= nparts) + return (mdsyserror(ep, ENOENT, rname)); + + *npartsp = nparts; + *partnop = partno; + return (0); +} + +/* + * get block name + */ +static int +getbname( + mdname_t *np, + md_error_t *ep +) +{ + char *rname = np->rname; + char *bname; + + /* fully qualified */ + assert(rname != NULL); + if ((bname = blkname(rname)) != NULL) { + if (np->bname) + Free(np->bname); + np->bname = bname; + return (0); + } + + /* out of luck */ + return (mdsyserror(ep, ENOENT, rname)); +} + +static void +getcname( + mdsetname_t *sp, + mdname_t *np +) +{ + char *sname = sp->setname; + char *bname = np->bname; + char *p; + size_t len; + + assert(sname != NULL); + assert(bname != NULL); + assert(np->drivenamep->type != MDT_FAST_COMP && + np->drivenamep->type != MDT_FAST_META); + + /* regular device */ + if ((strncmp(bname, "/dev/dsk/", strlen("/dev/dsk/")) == 0) && + (strchr((p = bname + strlen("/dev/dsk/")), '/') == NULL)) { + if (np->cname) + Free(np->cname); + np->cname = Strdup(p); + return; + } + + if ((strncmp(bname, "/dev/ap/dsk/", strlen("/dev/ap/dsk/")) == 0) && + (strchr((p = bname + strlen("/dev/ap/dsk/")), '/') == NULL)) { + if (np->cname) + Free(np->cname); + np->cname = Strdup(p); + return; + 
} + + if ((strncmp(bname, "/dev/did/dsk/", strlen("/dev/did/dsk/")) == 0) && + (strchr((p = bname + strlen("/dev/did/dsk/")), '/') == NULL)) { + if (np->cname) + Free(np->cname); + np->cname = Strdup(p); + return; + } + + /* anything else but metadevice */ + if (np->drivenamep->type != MDT_META) { + if (np->cname) + Free(np->cname); + np->cname = Strdup(bname); + return; + } + + /* metadevice */ + p = strrchr(bname, '/'); + assert(p != NULL); + ++p; + if (metaislocalset(sp)) { + if (np->cname) + Free(np->cname); + np->cname = Strdup(p); + } else { + assert(sname[0] != '\0'); + if (np->cname) + Free(np->cname); + len = strlen(sname) + 1 + strlen(p) + 1; + np->cname = Malloc(len); + (void) snprintf(np->cname, len, "%s/%s", sname, p); + } +} + +/* + * get dev + */ +int +meta_getdev( + mdsetname_t *sp, + mdname_t *np, + md_error_t *ep +) +{ + struct stat statbuf; + + /* get dev */ + if (meta_stat(np->rname, &statbuf) != 0) + return (mdsyserror(ep, errno, np->rname)); + else if (! S_ISCHR(statbuf.st_mode)) + return (mddeverror(ep, MDE_NOT_DISK, NODEV64, np->rname)); + np->dev = meta_expldev(statbuf.st_rdev); + + assert(np->drivenamep->type != MDT_FAST_META && + np->drivenamep->type != MDT_FAST_COMP); + + /* check set */ + assert((np->drivenamep->type == MDT_META) ? 
+ (sp->setno == MD_MIN2SET(meta_getminor(np->dev))) : 1); + + /* return sucess */ + return (0); +} + +/* + * set up names for a slice + */ +static int +getnames( + mdsetname_t *sp, + mdname_t *np, + char *rname, + md_error_t *ep +) +{ + /* get names */ + if (np->rname) + Free(np->rname); + np->rname = Strdup(rname); + if (getbname(np, ep) != 0) + return (-1); + getcname(sp, np); + if (meta_getdev(sp, np, ep) != 0) + return (-1); + + /* return success */ + return (0); +} + +/* + * fake up names for a slice + */ +static void +getfakenames( + mdsetname_t *sp, + mdname_t *np, + char *rname +) +{ + char *p; + char onmb[BUFSIZ+1], snm[BUFSIZ+1]; + uint_t d = 0; + int l = 0; + + /* fake names */ + if (np->rname != NULL) + Free(np->rname); + np->rname = Strdup(rname); + + if (np->bname != NULL) + Free(np->bname); + np->bname = Strdup(rname); + + /* + * Fixup old style names + */ + if (sscanf(rname, "/dev/r%" VAL2STR(BUFSIZ) "[^0-9/]%u" + "%" VAL2STR(BUFSIZ) "[a-h]%n", + onmb, &d, snm, &l) == 3 && l == strlen(rname)) + (void) snprintf(np->bname, l, "/dev/%s%u%s", onmb, d, snm); + + /* + * Fixup new style names + */ + if ((p = strstr(np->bname, "/rdsk/")) != NULL) { + for (++p; (*(p + 1) != '\0'); ++p) + *p = *(p + 1); + *p = '\0'; + } + + if (np->cname != NULL) + Free(np->cname); + getcname(sp, np); +} + +static mdname_t * +setup_slice( + mdsetname_t *sp, + mddrivename_t *dnp, + char *uname, + char *rname, + char *dname, + uint_t partno, + md_error_t *ep +) +{ + char *srname = NULL; + mdname_t *np; + + /* must have a set */ + assert(sp != NULL); + assert(partno < dnp->parts.parts_len); + assert(dname != NULL); + + np = &dnp->parts.parts_val[partno]; + + if (rname) + srname = rname; + else if (is_metaname(dname)) + srname = dname; + else { + char onmb[BUFSIZ+1]; + uint_t d = 0; + int l = 0, cl = strlen(dname); + size_t len; + + len = cl + 20 + 1; + srname = Malloc(len); + + /* + * Handle /dev/rXXNN. 
+ */ + if (sscanf(dname, "/dev/r%" VAL2STR(BUFSIZ) "[^0-9/]%u%n", + onmb, &d, &l) == 2 && l == cl) { + (void) snprintf(srname, len, "/dev/r%s%u%c", onmb, d, + 'a' + partno); + } else if (sscanf(dname, "/dev/%" VAL2STR(BUFSIZ) "[^0-9/]%u%n", + onmb, &d, &l) == 2 && l == cl) { + (void) snprintf(srname, len, "/dev/%s%u%c", onmb, d, + 'a' + partno); + } else { + /* build the slice that is wanted */ + (void) snprintf(srname, len, "%ss%u", dname, partno); + } + } + + if (getnames(sp, np, srname, ep) != 0) { + if (dnp->type == MDT_UNKNOWN) { + mdclrerror(ep); + getfakenames(sp, np, srname); + } else if (dnp->type == MDT_COMP && mdissyserror(ep, ENOENT)) { + dnp->type = MDT_UNKNOWN; + if (mdanysyserror(ep)) { + dnp->errnum = + ep->info.md_error_info_t_u.sys_error.errnum; + } else { + dnp->errnum = ENOENT; + } + mdclrerror(ep); + getfakenames(sp, np, srname); + } else { + mdclrerror(ep); + if (getnames(sp, np, dname, ep) != 0) { + np = NULL; + goto fixup; + } + } + } + +out: + if ((srname != rname) && (srname != dname)) + Free(srname); + + /* return name */ + return (np); + +fixup: + if (mdanysyserror(ep)) { + char *p; + int errnum = ep->info.md_error_info_t_u.sys_error.errnum; + + mdclrerror(ep); + if (uname && *uname) { + if ((p = strrchr(uname, '/')) != NULL) + (void) mdsyserror(ep, errnum, ++p); + else + (void) mdsyserror(ep, errnum, uname); + } else { + if ((p = strrchr(srname, '/')) != NULL) + (void) mdsyserror(ep, errnum, ++p); + else + (void) mdsyserror(ep, errnum, srname); + } + } + goto out; +} + +/* + * flush the fast name cache + */ +static void +metafreefastnm(mdname_t **np) +{ + mddrivename_t *dnp; + + assert(np != NULL && *np != NULL); + + if ((dnp = (*np)->drivenamep) != NULL) { + if (dnp->cname != NULL) + Free(dnp->cname); + if (dnp->rname != NULL) + Free(dnp->rname); + if (dnp->miscname != NULL) + Free(dnp->miscname); + meta_free_unit(dnp); + Free(dnp); + } + if ((*np)->cname != NULL) + Free((*np)->cname); + if ((*np)->bname != NULL) + Free((*np)->bname); 
+ if ((*np)->rname != NULL) + Free((*np)->rname); + if ((*np)->devicesname != NULL) + Free((*np)->devicesname); + Free(*np); + *np = NULL; +} + +/* + * flush the fast name cache + */ +static void +metaflushfastnames() +{ + mdnamelist_t *p, *n; + + for (p = fastnmlp, n = NULL; (p != NULL); p = n) { + n = p->next; + metafreefastnm(&p->namep); + Free(p); + } + fastnmlp = NULL; +} + +static char * +getrname_fast(char *unm, md_error_t *ep) +{ + uint_t d = 0; + int l = 0; + int cl = strlen(unm); + char onmb[BUFSIZ+1], snm[BUFSIZ+1], cnmb[BUFSIZ]; + char *rnm; + char *p; + size_t len; + + if (is_metaname(unm)) { + /* without set */ + if (sscanf(unm, "d%u%n", &d, &l) == 1 && cl == l) { + rnm = Zalloc(14 + cl + 1); + (void) sprintf(rnm, "/dev/md/rdsk/d%u", d); + return (rnm); + } + + /* fully-qualified without set */ + if ((sscanf(unm, "/dev/md/dsk/d%u%n", &d, &l) == 1 || + sscanf(unm, "/dev/md/rdsk/d%u%n", &d, &l) == 1) && + cl == l) { + rnm = Zalloc(14 + cl + 1); + (void) sprintf(rnm, "/dev/md/rdsk/d%u", d); + return (rnm); + } + + /* with set */ + if ((sscanf(unm, + "%" VAL2STR(BUFSIZ) "[^/]/d%u%n", snm, &d, &l) == 2 || + sscanf(unm, "/dev/md/%" VAL2STR(BUFSIZ) "[^/]/dsk/d%u%n", + snm, &d, &l) == 2 || + sscanf(unm, "/dev/md/%" VAL2STR(BUFSIZ) "[^/]/rdsk/d%u%n", + snm, &d, &l) == 2) && cl == l) { + len = 14 + cl + strlen(snm) + 1; + rnm = Zalloc(len); + (void) snprintf(rnm, len, "/dev/md/%s/rdsk/d%u", + snm, d); + return (rnm); + } + } + + /* NOT Fully qualified path, done */ + if (unm[0] != '/') { + (void) mdsyserror(ep, EINVAL, unm); + return (NULL); + } + + /* + * Get slice information from old style names of the form + * /dev/rXXNN[a-h] or /dev/XXNN[a-h], must be done before regular + * devices, but after metadevices. 
+ */ + if ((sscanf(unm, "/dev/r%" VAL2STR(BUFSIZ) "[^0-9/]%u" + "%" VAL2STR(BUFSIZ) "[a-h]%n", + onmb, &d, snm, &l) == 3 || + sscanf(unm, "/dev/%" VAL2STR(BUFSIZ) "[^0-9/]%u" + "%" VAL2STR(BUFSIZ) "[a-h]%n", + onmb, &d, snm, &l) == 3) && l == cl) { + if ((p = strchr("abcdefgh", snm[0])) != NULL) { + (void) snprintf(cnmb, sizeof (cnmb), "/dev/r%s%u%s", + onmb, d, snm); + return (Strdup(cnmb)); + } + } + + if ((p = strstr(unm, "/dsk/")) != NULL) { /* /.../dsk/... */ + ++p; + rnm = Zalloc(strlen(unm) + 1 + 1); + (void) strncpy(rnm, unm, (p - unm)); + rnm[(p - unm)] = 'r'; + (void) strcpy(&rnm[(p - unm) + 1], p); + return (rnm); + } else if (strstr(unm, "/rdsk/") != NULL) { /* /.../rdsk/... */ + return (Strdup(unm)); + } + + /* + * Shouldn't get here but if we do then we have an unrecognized + * fully qualified path - error + */ + (void) mdsyserror(ep, EINVAL, unm); + return (NULL); +} + +static mdname_t * +metainitfastname( + mdsetname_t *sp, + char *uname, + md_error_t *ep +) +{ + uint_t c = 0, t = 0, d = 0, s = 0; + int l = 0; + mddrivename_t *dnp; + mdname_t *np; + mdnamelist_t **fnlpp; + + for (fnlpp = &fastnmlp; (*fnlpp != NULL); fnlpp = &(*fnlpp)->next) { + np = (*fnlpp)->namep; + + if (strcmp(np->bname, uname) == 0) + return (np); + } + + *fnlpp = Zalloc(sizeof (**fnlpp)); + np = (*fnlpp)->namep = Zalloc(sizeof (mdname_t)); + metainitname(np); + dnp = np->drivenamep = Zalloc(sizeof (mddrivename_t)); + metainitdrivename(dnp); + + + /* Metadevices */ + if (is_metaname(uname)) { + char *p; + size_t len; + + if ((p = strrchr(uname, '/')) != NULL) + ++p; + else + p = uname; + + if (metaislocalset(sp)) { + if (np->cname) + Free(np->cname); + np->cname = Strdup(p); + } else { + if (np->cname) + Free(np->cname); + len = strlen(sp->setname) + 1 + strlen(p) + 1; + np->cname = Zalloc(len); + (void) snprintf(np->cname, len, "%s/%s", + sp->setname, p); + } + dnp->type = MDT_FAST_META; + goto done; + } + + /* Others */ + dnp->type = MDT_FAST_COMP; + + if (((sscanf(uname, 
"/dev/rdsk/c%ut%ud%us%u%n", &c, &t, &d, + &s, &l) == 4 || + sscanf(uname, "/dev/dsk/c%ut%ud%us%u%n", &c, &t, &d, + &s, &l) == 4 || + sscanf(uname, "/dev/ap/rdsk/mc%ut%ud%us%u%n", &c, &t, &d, + &s, &l) == 4 || + sscanf(uname, "/dev/ap/dsk/mc%ut%ud%us%u%n", &c, &t, &d, + &s, &l) == 4 || + sscanf(uname, "/dev/did/rdsk/d%us%u%n", &t, &s, &l) == 2 || + sscanf(uname, "/dev/did/dsk/d%us%u%n", &t, &s, &l) == 2|| + sscanf(uname, "/dev/rdsk/c%ud%us%u%n", &c, &d, &s, &l) == 3 || + sscanf(uname, "/dev/dsk/c%ud%us%u%n", &c, &d, &s, &l) == 3 || + sscanf(uname, "/dev/rdsk/c%ut%ud%u%n", &c, &t, &d, &l) == 3 || + sscanf(uname, "/dev/dsk/c%ut%ud%u%n", &c, &t, &d, &l) == 3 || + sscanf(uname, "/dev/ap/rdsk/mc%ut%ud%u%n", &c, &t, &d, &l) == 3 || + sscanf(uname, "/dev/ap/dsk/mc%ut%ud%u%n", &c, &t, &d, &l) == 3 || + sscanf(uname, "/dev/did/rdsk/d%u%n", &t, &l) == 1 || + sscanf(uname, "/dev/did/dsk/d%u%n", &t, &l) == 1 || + sscanf(uname, "/dev/rdsk/c%ud%u%n", &c, &d, &l) == 2 || + sscanf(uname, "/dev/dsk/c%ud%u%n", &c, &d, &l) == 2) && + l == strlen(uname))) { + if ((np->cname = strrchr(uname, '/')) == NULL) + np->cname = Strdup(uname); + else + np->cname = Strdup(++np->cname); + } else { + np->cname = Strdup(uname); + } + +done: + /* Driver always gives us block names */ + np->bname = Strdup(uname); + + /* canonical disk name */ + if ((dnp->cname = metadiskname(np->cname)) == NULL) + dnp->cname = Strdup(np->cname); + + if ((np->rname = getrname_fast(uname, ep)) != NULL) { + if ((dnp->rname = metadiskname(np->rname)) == NULL) + dnp->rname = Strdup(np->rname); + } else { + metafreefastnm(&(*fnlpp)->namep); + Free(*fnlpp); + *fnlpp = NULL; + return (NULL); + } + + /* cleanup, return success */ + return (np); +} + +/* + * set up names for a device + */ +static mdname_t * +metaname_common( + mdsetname_t **spp, + char *uname, + int fast, + md_error_t *ep +) +{ + mddrivenamelist_t **tail; + mddrivename_t *dnp; + uint_t slice; + mdname_t *np; + char *rname = NULL; + char *dname = NULL; + char 
*cname = NULL; + uint_t nparts, partno; + + assert(uname != NULL); + + /* check setname */ + if ((cname = meta_name_getname(spp, uname, ep)) == NULL) + return (NULL); + + assert(*spp != NULL); + Free(cname); + + /* get raw name (rname) of the slice and drive (dname) we have */ + if ((rname = getrawnames(spp, uname, &dname, ep)) == NULL) { + return (NULL); + } + + /* look in cache first */ + for (tail = &drivelistp; (*tail != NULL); tail = &(*tail)->next) { + dnp = (*tail)->drivenamep; + + /* check to see if the drive name is already in the cache */ + if ((dnp->rname != NULL) && strcmp(dnp->rname, dname) == 0) { + + Free(rname); + if (dname != NULL) + Free(dname); + + if (uname2sliceno(uname, &partno, ep) < 0) + return (NULL); + + return (metaslicename(dnp, partno, ep)); + } + } + + /* + * If a fast names is OK, then get one, and be done. + */ + if (fast) { + Free(rname); + if (dname != NULL) + Free(dname); + + return (metainitfastname(*spp, uname, ep)); + } + + /* allocate new list element and drive */ + *tail = Zalloc(sizeof (**tail)); + dnp = (*tail)->drivenamep = Zalloc(sizeof (*dnp)); + + metainitdrivename(dnp); + + /* get parts info */ + if (getparts(dnp, rname, dname, &nparts, &partno, ep) != 0) + goto out; + + /* + * libmeta needs at least V_NUMPAR partitions. 
+ * If we have an EFI partition with less than V_NUMPAR slices, + * we nevertheless reserve space for V_NUMPAR + */ + if (nparts < V_NUMPAR) { + nparts = V_NUMPAR; + } + + /* allocate and link in parts */ + dnp->parts.parts_len = nparts; + dnp->parts.parts_val = Zalloc((sizeof (*dnp->parts.parts_val)) * + dnp->parts.parts_len); + for (slice = 0; (slice < nparts); ++slice) { + np = &dnp->parts.parts_val[slice]; + metainitname(np); + np->drivenamep = dnp; + } + + /* setup name_t (or slice) wanted */ + if ((np = setup_slice(*spp, dnp, uname, rname, dname, partno, ep)) + == NULL) + goto out; + + /* canonical disk name */ + if ((dnp->cname = metadiskname(np->cname)) == NULL) + dnp->cname = Strdup(np->cname); + if ((dnp->rname = metadiskname(np->rname)) == NULL) + dnp->rname = Strdup(np->rname); + + /* cleanup, return success */ + if (dname != NULL) + Free(dname); + Free(rname); + return (np); + + /* cleanup, return error */ +out: + if (dname != NULL) + Free(dname); + if (rname != NULL) + Free(rname); + + metafreedrivename(dnp); + Free(dnp); + Free(*tail); + *tail = NULL; + return (NULL); +} + +mdname_t * +metaname( + mdsetname_t **spp, + char *uname, + md_error_t *ep +) +{ + return (metaname_common(spp, uname, 0, ep)); +} + +mdname_t * +metaname_fast( + mdsetname_t **spp, + char *uname, + md_error_t *ep +) +{ + return (metaname_common(spp, uname, 1, ep)); +} + +/* + * set up names for a drive + */ +mddrivename_t * +metadrivename( + mdsetname_t **spp, + char *uname, + md_error_t *ep +) +{ + char *slicename; + mdname_t *np; + + char *cname; + mddrivenamelist_t **tail; + mddrivename_t *dnp; + char *dname; + int i; + int mplen; + size_t len; + + /* check setname, get comparison name */ + assert(uname != NULL); + if ((cname = meta_name_getname(spp, uname, ep)) == NULL) { + (void) mdsyserror(ep, ENOENT, uname); + return (NULL); + } + + assert(*spp != NULL); + + if ((dname = metadiskname(cname)) == NULL) { + (void) mdsyserror(ep, ENOENT, cname); + Free(cname); + return (NULL); 
+ } + + /* look in cache first */ + for (tail = &drivelistp; (*tail != NULL); tail = &(*tail)->next) { + dnp = (*tail)->drivenamep; + if ((dnp->cname != NULL && + (strcmp(dnp->cname, dname) == 0)) || + (dnp->rname != NULL && + (strcmp(dnp->rname, dname) == 0))) { + Free(cname); + Free(dname); + return (dnp); + } + } + + /* Check each possible slice name based on MD_MAX_PARTS. */ + + /* + * Figure out how much string space to reserve to fit + * (MD_MAX_PARTS - 1) into the name string; the loop will + * increment the mplen counter once for each decimal digit in + * (MD_MAX_PARTS - 1). + */ + for (i = MD_MAX_PARTS - 1, mplen = 0; i; i /= 10, ++mplen); + len = strlen(uname) + mplen + 2; + slicename = Malloc(len); + + /* Check for each slice in turn until we find one */ + for (np = NULL, i = 0; ((np == NULL) && (i < MD_MAX_PARTS)); ++i) { + (void) snprintf(slicename, len, "%ss%d", uname, i); + np = metaname(spp, slicename, ep); + } + Free(slicename); + + if (np == NULL) { + char *dname; + + if ((mdissyserror(ep, ENOENT)) && + ((dname = metadiskname(uname)) != NULL)) { + Free(dname); + (void) mderror(ep, MDE_NOT_DRIVENAME, uname); + } + return (NULL); + } + return (np->drivenamep); +} + +/* + * FUNCTION: metaslicename() + * INPUT: dnp - the drivename structure + * sliceno - the slice on the drive to return + * OUTPUT: ep - return error pointer + * RETURNS: mdname_t- pointer the the slice name structure + * PURPOSE: interface to the parts struct in the drive name struct + * Since there is no guarantee that the slice name + * structures are populated users should call this + * function rather than accessing the structure directly + * since it will populate the structure values if they + * haven't already been populated before returning. 
+ */ +mdname_t * +metaslicename( + mddrivename_t *dnp, + uint_t sliceno, + md_error_t *ep +) +{ + mdsetname_t *sp = NULL; + char *namep = NULL; + mdname_t *np; + + assert(dnp->type != MDT_FAST_COMP && dnp->type != MDT_FAST_META); + + if (sliceno >= dnp->parts.parts_len) { + (void) mderror(ep, MDE_NOSLICE, dnp->cname); + return (NULL); + } + + np = &dnp->parts.parts_val[sliceno]; + + /* check to see if the struct is already populated */ + if (np->cname) { + return (np); + } + + if ((namep = meta_name_getname(&sp, dnp->cname, ep)) == NULL) + return (NULL); + + np = setup_slice(sp, dnp, NULL, NULL, dnp->rname, sliceno, ep); + + Free(namep); + + return (np); +} + +/* + * set up metadevice name from id + */ +mdname_t * +metamnumname( + mdsetname_t **spp, + minor_t mnum, + int fast, + md_error_t *ep +) +{ + set_t setno = MD_MIN2SET(mnum); + mdsetname_t *sp = NULL; + char *uname; + mdname_t *np; + size_t len; + + /* check set first */ + if (spp == NULL) + spp = &sp; + if (chksetno(spp, setno, ep) != 0) + return (NULL); + assert(*spp != NULL); + sp = *spp; + + /* build corresponding device name */ + if (metaislocalset(sp)) { + uname = Malloc(20); + (void) sprintf(uname, "d%lu", MD_MIN2UNIT(mnum)); + } else { + len = strlen(sp->setname) + 1 + 20; + uname = Malloc(len); + (void) snprintf(uname, len, "%s/d%lu", sp->setname, + MD_MIN2UNIT(mnum)); + } + + /* setup name */ + if (fast) { + np = metaname_fast(spp, uname, ep); + np->dev = metamakedev(mnum); + } else + np = metaname(spp, uname, ep); + + Free(uname); + return (np); +} + +/* + * return metadevice name + */ +char * +get_mdname( + minor_t mnum +) +{ + mdname_t *np; + md_error_t status = mdnullerror; + + /* get name */ + if ((np = metamnumname(NULL, mnum, 0, &status)) == NULL) { + mdclrerror(&status); + return (NULL); + } + assert(meta_getminor(np->dev) == mnum); + + /* return name */ + return (np->cname); +} + +/* + * check for device type + */ +int +metaismeta( + mdname_t *np +) +{ + return (np->drivenamep->type == 
MDT_META || + np->drivenamep->type == MDT_FAST_META); +} + +int +metachkmeta( + mdname_t *np, + md_error_t *ep +) +{ + if (! metaismeta(np)) { + return (mddeverror(ep, MDE_NOT_META, np->dev, + np->cname)); + } + return (0); +} + +int +metachkdisk( + mdname_t *np, + md_error_t *ep +) +{ + mddrivename_t *dnp = np->drivenamep; + + assert(dnp->type != MDT_FAST_COMP && dnp->type != MDT_FAST_META); + + if ((! metaismeta(np)) && (dnp->type != MDT_COMP)) { + switch (dnp->type) { + case MDT_ACCES: + case MDT_UNKNOWN: + return (mdsyserror(ep, dnp->errnum, np->bname)); + default: + assert(0); + return (mddeverror(ep, MDE_NOT_DISK, np->dev, + np->cname)); + } + } + return (0); +} + +int +metachkcomp( + mdname_t *np, + md_error_t *ep +) +{ + if (metaismeta(np)) { + return (mddeverror(ep, MDE_IS_META, np->dev, + np->cname)); + } + return (metachkdisk(np, ep)); +} + +/* + * free list of names + */ +void +metafreenamelist( + mdnamelist_t *nlp +) +{ + mdnamelist_t *next = NULL; + + for (/* void */; (nlp != NULL); nlp = next) { + next = nlp->next; + Free(nlp); + } +} + +/* + * build list of names + */ +int +metanamelist( + mdsetname_t **spp, + mdnamelist_t **nlpp, + int argc, + char *argv[], + md_error_t *ep +) +{ + mdnamelist_t **tailpp = nlpp; + int count = 0; + + for (*nlpp = NULL; (argc > 0); ++count, --argc, ++argv) { + mdnamelist_t *nlp = Zalloc(sizeof (*nlp)); + + if ((nlp->namep = metaname(spp, argv[0], ep)) == NULL) { + metafreenamelist(*nlpp); + *nlpp = NULL; + return (-1); + } + *tailpp = nlp; + tailpp = &nlp->next; + } + return (count); +} + +/* + * append to end of name list + */ +mdname_t * +metanamelist_append( + mdnamelist_t **nlpp, + mdname_t *np +) +{ + mdnamelist_t *nlp; + + /* run to end of list */ + for (; (*nlpp != NULL); nlpp = &(*nlpp)->next) + ; + + /* allocate new list element */ + nlp = *nlpp = Zalloc(sizeof (*nlp)); + + /* append name */ + nlp->namep = np; + return (np); +} + +/* + * FUNCTION: meta_namelist_append_wrapper() + * INPUT: tailpp - pointer to 
the list tail pointer + * np - name node to be appended to list + * OUTPUT: none + * RETURNS: mdnamelist_t * - new tail of the list. + * PURPOSE: wrapper to meta_namelist_append for performance. + * metanamelist_append finds the tail each time which slows + * down long lists. By keeping track of the tail ourselves + * we can change metanamelist_append into a constant time + * operation. + */ +mdnamelist_t ** +meta_namelist_append_wrapper( + mdnamelist_t **tailpp, + mdname_t *np +) +{ + (void) metanamelist_append(tailpp, np); + + /* If it's the first item in the list, return it instead of the next */ + if ((*tailpp)->next == NULL) + return (tailpp); + + return (&(*tailpp)->next); +} + + +/* + * mdhspname_t stuff + */ + +/* + * initialize hspname + */ +static void +metainithspname( + mdhspname_t *hspnamep +) +{ + (void) memset(hspnamep, '\0', sizeof (*hspnamep)); + hspnamep->hsp = MD_HSP_NONE; +} + +/* + * free allocated hspname + */ +static void +metafreehspname( + mdhspname_t *hspnamep +) +{ + if (hspnamep->hspname != NULL) + Free(hspnamep->hspname); + if (hspnamep->unitp != NULL) + meta_invalidate_hsp(hspnamep); + metainithspname(hspnamep); +} + +/* + * clear the hspname cache + */ +static void +metaflushhspnames() +{ + mdhspnamelist_t *p, *n; + + for (p = hsplistp, n = NULL; (p != NULL); p = n) { + n = p->next; + metafreehspname(p->hspnamep); + Free(p->hspnamep); + Free(p); + } + hsplistp = NULL; +} + +/* + * check set and get comparison name + */ +static char * +gethspname( + mdsetname_t **spp, + char *uname, + hsp_t *hspp, + md_error_t *ep +) +{ + char *sname = NULL; + + /* check setname */ + assert(uname != NULL); + if (parse_hsp(uname, &sname, hspp) != 0) { + (void) mdsyserror(ep, ENOENT, uname); + return (NULL); + } + if (chksetname(spp, sname, ep) != 0) { + if (sname != NULL) + Free(sname); + return (NULL); + } + if (sname != NULL) + Free(sname); + + /* return comparison name */ + return (canon_hsp((*spp)->setname, *hspp)); +} + +/* + * set up names for a 
hotspare pool + */ +mdhspname_t * +metahspname( + mdsetname_t **spp, + char *uname, + md_error_t *ep +) +{ + char *cname; + hsp_t hsp; + mdhspnamelist_t **tail; + mdhspname_t *hspnp; + + /* check setname */ + assert(uname != NULL); + if ((cname = gethspname(spp, uname, &hsp, ep)) == NULL) + return (NULL); + assert(*spp != NULL); + + /* look in cache first */ + for (tail = &hsplistp; (*tail != NULL); tail = &(*tail)->next) { + hspnp = (*tail)->hspnamep; + if (strcmp(hspnp->hspname, cname) == 0) { + Free(cname); + return (hspnp); + } + } + + /* allocate new list element and hspname */ + *tail = Zalloc(sizeof (**tail)); + hspnp = (*tail)->hspnamep = Zalloc(sizeof (*hspnp)); + metainithspname(hspnp); + + /* save hspname and number */ + hspnp->hspname = cname; + hspnp->hsp = MAKE_HSP_ID((*spp)->setno, hsp); + + /* success */ + return (hspnp); + + /* cleanup, return error */ +out: + metafreehspname(hspnp); + Free(hspnp); + Free(*tail); + *tail = NULL; + return (NULL); + + +} + +/* + * set up hotspare pool name from id + */ +mdhspname_t * +metahsphspname( + mdsetname_t **spp, + hsp_t hsp, + md_error_t *ep +) +{ + set_t setno = HSP_SET(hsp); + mdsetname_t *sp = NULL; + char *uname; + mdhspname_t *hspnp; + size_t len; + + /* check set first */ + if (spp == NULL) + spp = &sp; + if (chksetno(spp, setno, ep) != 0) + return (NULL); + assert(*spp != NULL); + sp = *spp; + + /* build corresponding hotspare pool name */ + if (metaislocalset(sp)) { + uname = Malloc(20); + (void) sprintf(uname, "hsp%03u", HSP_ID(hsp)); + } else { + len = strlen(sp->setname) + 1 + 20; + uname = Malloc(len); + (void) snprintf(uname, len, "%s/hsp%03lu", sp->setname, + HSP_ID(hsp)); + } + + /* setup name */ + hspnp = metahspname(spp, uname, ep); + Free(uname); + return (hspnp); +} + +/* + * return hotspare pool name + */ +char * +get_hspname(hsp_t hsp) +{ + mdhspname_t *hspnp; + md_error_t status = mdnullerror; + + /* get name */ + if ((hspnp = metahsphspname(NULL, hsp, &status)) == NULL) { + 
mdclrerror(&status); + return (NULL); + } + + /* return name */ + return (hspnp->hspname); +} + +/* + * free hotspare pool list + */ +void +metafreehspnamelist(mdhspnamelist_t *hspnlp) +{ + mdhspnamelist_t *next = NULL; + + for (/* void */; (hspnlp != NULL); hspnlp = next) { + next = hspnlp->next; + Free(hspnlp); + } +} + +/* + * build list of hotspare pool names + */ +int +metahspnamelist( + mdsetname_t **spp, + mdhspnamelist_t **hspnlpp, + int argc, + char *argv[], + md_error_t *ep +) +{ + mdhspnamelist_t **tailpp = hspnlpp; + int count = 0; + + for (*hspnlpp = NULL; (argc > 0); ++count, --argc, ++argv) { + mdhspnamelist_t *hspnlp = Zalloc(sizeof (*hspnlp)); + + if ((hspnlp->hspnamep = metahspname(spp, argv[0], + ep)) == NULL) { + metafreehspnamelist(*hspnlpp); + *hspnlpp = NULL; + return (-1); + } + *tailpp = hspnlp; + tailpp = &hspnlp->next; + } + return (count); +} + +/* + * append to end of hotspare pool list + */ +mdhspname_t * +metahspnamelist_append(mdhspnamelist_t **hspnlpp, mdhspname_t *hspnp) +{ + mdhspnamelist_t *hspnlp; + + /* run to end of list */ + for (; (*hspnlpp != NULL); hspnlpp = &(*hspnlpp)->next) + ; + + /* allocate new list element */ + hspnlp = *hspnlpp = Zalloc(sizeof (*hspnlp)); + + /* append hotspare pool name */ + hspnlp->hspnamep = hspnp; + return (hspnp); +} + +/* + * get name from dev + */ +mdname_t * +metadevname( + mdsetname_t **spp, + md_dev64_t dev, + md_error_t *ep) +{ + char *device_name; + mdname_t *namep; + mdkey_t key; + + /* short circuit metadevices */ + assert(dev != NODEV64); + if (meta_dev_ismeta(dev)) + return (metamnumname(spp, meta_getminor(dev), 0, ep)); + + /* create local set, if necessary */ + if (*spp == NULL) { + if ((*spp = metasetname(MD_LOCAL_NAME, ep)) == NULL) + return (NULL); + } + + /* get name from namespace */ + if ((device_name = meta_getnmentbydev((*spp)->setno, MD_SIDEWILD, + dev, NULL, NULL, &key, ep)) == NULL) { + return (NULL); + } + namep = metaname_fast(spp, device_name, ep); + if (namep != 
NULL) + namep->key = key; + + Free(device_name); + return (namep); +} + +/* + * return cached name from md_dev64_t + */ +static char * +metadevtocachename(md_dev64_t dev) +{ + mddrivenamelist_t *dnlp; + + /* look in cache */ + for (dnlp = drivelistp; (dnlp != NULL); dnlp = dnlp->next) { + mddrivename_t *dnp = dnlp->drivenamep; + uint_t i; + + for (i = 0; (i < dnp->parts.parts_len); ++i) { + mdname_t *np = &dnp->parts.parts_val[i]; + + if (np->dev == dev) + return (np->cname); + } + } + + /* not found */ + return (NULL); +} + +/* + * Ask the driver for the name, which has been stored in the + * metadevice state database (on behalf of the utilities). + * (by devno) + */ +char * +get_devname( + set_t setno, + md_dev64_t dev) +{ + mdsetname_t *sp; + mdname_t *np; + md_error_t status = mdnullerror; + + /* get name */ + if ((setno == MD_SET_BAD) || + ((sp = metasetnosetname(setno, &status)) == NULL) || + ((np = metadevname(&sp, dev, &status)) == NULL)) { + mdclrerror(&status); + return (metadevtocachename(dev)); + } + + /* return name */ + return (np->cname); +} + +/* + * get name from key + */ +mdname_t * +metakeyname( + mdsetname_t **spp, + mdkey_t key, + int fast, + md_error_t *ep +) +{ + char *device_name; + md_dev64_t dev = NODEV64; + mdname_t *namep; + + /* create local set, if necessary */ + if (*spp == NULL) { + if ((*spp = metasetname(MD_LOCAL_NAME, ep)) == NULL) + return (NULL); + } + + /* get name from namespace */ + if ((device_name = meta_getnmentbykey((*spp)->setno, MD_SIDEWILD, + key, NULL, NULL, &dev, ep)) == NULL) { + return (NULL); + } + if (fast) + namep = metaname_fast(spp, device_name, ep); + else + namep = metaname(spp, device_name, ep); + + assert(dev != NODEV64); + if (namep) + namep->dev = dev; + Free(device_name); + return (namep); +} + +/* + * completely flush the caches + */ +void +metaflushnames(int flush_sr_cache) +{ + metaflushhspnames(); + metaflushdrivenames(); + metaflushsetnames(); + metaflushctlrcache(); + metaflushfastnames(); + 
metaflushstatcache(); + if (flush_sr_cache) + sr_cache_flush(0); +} + +/* + * meta_get_hotspare_names + * returns an mdnamelist_t of hot spare names + */ + +int +meta_get_hotspare_names( + mdsetname_t *sp, + mdnamelist_t **nlpp, + int options, + md_error_t *ep +) +{ + mdhspnamelist_t *hspnlp = NULL; + mdhspnamelist_t *hspp; + int cnt = 0; + + assert(nlpp != NULL); + + /* get hotspare names */ + if (meta_get_hsp_names(sp, &hspnlp, options, ep) < 0) { + cnt = -1; + goto out; + } + + /* build name list */ + for (hspp = hspnlp; (hspp != NULL); hspp = hspp->next) { + md_hsp_t *hsp; + int i; + + if ((hsp = meta_get_hsp(sp, hspp->hspnamep, ep)) == NULL) { + cnt = -1; + goto out; + } + for (i = 0; (i < hsp->hotspares.hotspares_len); i++) { + md_hs_t *hs = &hsp->hotspares.hotspares_val[i]; + + (void) metanamelist_append(nlpp, hs->hsnamep); + ++cnt; + } + } + + /* cleanup and return count or error */ +out: + metafreehspnamelist(hspnlp); + if ((cnt == -1) && mdisok(ep)) { + /* + * At least try to give some sort of meaningful error + */ + (void) mderror(ep, MDE_NO_HSPS, "Generic Hotspare Error"); + } + + return (cnt); +} +/* + * meta_create_non_dup_list + * INPUT: mdnp mdname_t pointer to add to the list if a new name + * ldevidp list of non-duplicate names. + * OUTPUT: ldevidp list of non-duplicate names. + * meta_create_non_dup_list will take a mdname_t pointer and if the device + * is not in the list (ldevidp) will add it to the list. + * User needs to free allocated memory. 
+ */ +void +meta_create_non_dup_list( + mdname_t *mdnp, + mddevid_t **ldevidpp +) +{ + char *lcname; + mddevid_t *tmp; + mddevid_t *lastdevidp; + mddevid_t *lldevidp; + char *ctd, *slice; + mddevid_t *ldevidp; + + if (mdnp == NULL) + return; + + ldevidp = *ldevidpp; + /* + * Grab the name of the device and strip off slice information + */ + lcname = Strdup(mdnp->cname); + if (lcname == NULL) { + return; + } + ctd = strrchr(lcname, '/'); + if (ctd != NULL) + slice = strrchr(ctd, 's'); + else + slice = strrchr(lcname, 's'); + + if (slice != NULL) + *slice = '\0'; + + if (ldevidp == NULL) { + /* first item in list */ + ldevidp = Zalloc(sizeof (mddevid_t)); + ldevidp->ctdname = lcname; + ldevidp->key = mdnp->key; + *ldevidpp = ldevidp; + } else { + for (tmp = ldevidp; (tmp != NULL); tmp = tmp->next) { + if (strcmp(tmp->ctdname, lcname) == 0) { + /* already there so just return */ + Free(lcname); + return; + } + lastdevidp = tmp; + } + lldevidp = Zalloc(sizeof (mddevid_t)); + lldevidp->ctdname = lcname; + lldevidp->key = mdnp->key; + lastdevidp->next = lldevidp; + } +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_nameinfo.c b/usr/src/lib/lvm/libmeta/common/meta_nameinfo.c new file mode 100644 index 0000000000..337b48f98e --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_nameinfo.c @@ -0,0 +1,1267 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <dlfcn.h> +#include <meta.h> +#include <metadyn.h> +#include <ctype.h> +#include <dirent.h> +#include <devid.h> +#include <sys/param.h> +#include <sys/scsi/impl/uscsi.h> +#include <sys/scsi/generic/commands.h> +#include <sys/scsi/generic/inquiry.h> +#include <sys/efi_partition.h> + +#define MD_EFI_FG_HEADS 128 +#define MD_EFI_FG_SECTORS 256 +#define MD_EFI_FG_RPM 7200 +#define MD_EFI_FG_WRI 1 +#define MD_EFI_FG_RRI 1 + + +typedef struct ctlr_cache { + char *ctlr_nm; + int ctlr_ty; + struct ctlr_cache *ctlr_nx; +} ctlr_cache_t; + +static ctlr_cache_t *ctlr_cache = NULL; + + +/* + * return set for a device + */ +mdsetname_t * +metagetset( + mdname_t *np, + int bypass_daemon, + md_error_t *ep +) +{ + mdsetname_t *sp; + + /* metadevice */ + if (metaismeta(np)) + return (metasetnosetname(MD_MIN2SET(meta_getminor(np->dev)), + ep)); + + /* regular device */ + if (meta_is_drive_in_anyset(np->drivenamep, &sp, bypass_daemon, + ep) != 0) + return (NULL); + + if (sp != NULL) + return (sp); + + return (metasetnosetname(MD_LOCAL_SET, ep)); +} + +/* + * convert system to md types + */ +static void +meta_geom_to_md( + struct dk_geom *gp, + mdgeom_t *mdgp +) +{ + (void) memset(mdgp, '\0', sizeof (*mdgp)); + mdgp->ncyl = gp->dkg_ncyl; + mdgp->nhead = gp->dkg_nhead; + mdgp->nsect = gp->dkg_nsect; + mdgp->rpm = gp->dkg_rpm; + mdgp->write_reinstruct = gp->dkg_write_reinstruct; + mdgp->read_reinstruct = gp->dkg_read_reinstruct; + mdgp->blk_sz = DEV_BSIZE; +} + +/* + * convert efi to md types + */ +static void +meta_efi_to_mdgeom(struct dk_gpt *gpt, mdgeom_t *mdgp) +{ + (void) memset(mdgp, '\0', 
sizeof (*mdgp)); + mdgp->ncyl = (gpt->efi_last_u_lba - gpt->efi_first_u_lba) / + (MD_EFI_FG_HEADS * MD_EFI_FG_SECTORS); + mdgp->nhead = MD_EFI_FG_HEADS; + mdgp->nsect = MD_EFI_FG_SECTORS; + mdgp->rpm = MD_EFI_FG_RPM; + mdgp->write_reinstruct = MD_EFI_FG_WRI; + mdgp->read_reinstruct = MD_EFI_FG_RRI; + mdgp->blk_sz = DEV_BSIZE; +} + +static void +meta_efi_to_mdvtoc(struct dk_gpt *gpt, mdvtoc_t *mdvp) +{ + char typename[EFI_PART_NAME_LEN]; + uint_t i; + + (void) memset(mdvp, '\0', sizeof (*mdvp)); + mdvp->nparts = gpt->efi_nparts; + if (mdvp->nparts > MD_MAX_PARTS) + return; + + mdvp->first_lba = gpt->efi_first_u_lba; + mdvp->last_lba = gpt->efi_last_u_lba; + mdvp->lbasize = gpt->efi_lbasize; + + for (i = 0; (i < gpt->efi_nparts); ++i) { + mdvp->parts[i].start = gpt->efi_parts[i].p_start; + mdvp->parts[i].size = gpt->efi_parts[i].p_size; + mdvp->parts[i].tag = gpt->efi_parts[i].p_tag; + mdvp->parts[i].flag = gpt->efi_parts[i].p_flag; + /* + * Due to the lack of a label for the entire partition table, + * we use p_name of the reserved partition + */ + if ((gpt->efi_parts[i].p_tag == V_RESERVED) && + (gpt->efi_parts[i].p_name != NULL)) { + (void) strlcpy(typename, gpt->efi_parts[i].p_name, + EFI_PART_NAME_LEN); + /* Stop at first (if any) space or tab */ + (void) strtok(typename, " \t"); + mdvp->typename = Strdup(typename); + } + } +} + +static void +meta_mdvtoc_to_efi(mdvtoc_t *mdvp, struct dk_gpt **gpt) +{ + char typename[EFI_PART_NAME_LEN]; + uint_t i; + uint_t lastpart; + size_t size; + + /* first we count how many partitions we have to send */ + for (i = 0; i < MD_MAX_PARTS; i++) { + if ((mdvp->parts[i].start == 0) && + (mdvp->parts[i].size == 0) && + (mdvp->parts[i].tag != V_RESERVED)) { + continue; + } + /* if we are here, we know the partition is really used */ + lastpart = i; + } + size = sizeof (struct dk_gpt) + (sizeof (struct dk_part) * lastpart); + *gpt = calloc(size, sizeof (char)); + + (*gpt)->efi_nparts = lastpart + 1; + (*gpt)->efi_first_u_lba = 
mdvp->first_lba; + (*gpt)->efi_last_u_lba = mdvp->last_lba; + (*gpt)->efi_lbasize = mdvp->lbasize; + for (i = 0; (i < (*gpt)->efi_nparts); ++i) { + (*gpt)->efi_parts[i].p_start = mdvp->parts[i].start; + (*gpt)->efi_parts[i].p_size = mdvp->parts[i].size; + (*gpt)->efi_parts[i].p_tag = mdvp->parts[i].tag; + (*gpt)->efi_parts[i].p_flag = mdvp->parts[i].flag; + /* + * Due to the lack of a label for the entire partition table, + * we use p_name of the reserved partition + */ + if (((*gpt)->efi_parts[i].p_tag == V_RESERVED) && + (mdvp->typename != NULL)) { + (void) strlcpy((*gpt)->efi_parts[i].p_name, typename, + EFI_PART_NAME_LEN); + } + } +} + + +void +ctlr_cache_add(char *nm, int ty) +{ + ctlr_cache_t **ccpp; + + for (ccpp = &ctlr_cache; *ccpp != NULL; ccpp = &(*ccpp)->ctlr_nx) + if (strcmp((*ccpp)->ctlr_nm, nm) == 0) + return; + + *ccpp = Zalloc(sizeof (ctlr_cache_t)); + (*ccpp)->ctlr_nm = Strdup(nm); + (*ccpp)->ctlr_ty = ty; +} + +int +ctlr_cache_look(char *nm) +{ + ctlr_cache_t *tcp; + + for (tcp = ctlr_cache; tcp != NULL; tcp = tcp->ctlr_nx) + if (strcmp(tcp->ctlr_nm, nm) == 0) + return (tcp->ctlr_ty); + + return (-1); +} + + +void +metaflushctlrcache(void) +{ + ctlr_cache_t *cp, *np; + + for (cp = ctlr_cache, np = NULL; cp != NULL; cp = np) { + np = cp->ctlr_nx; + Free(cp->ctlr_nm); + Free(cp); + } + ctlr_cache = NULL; +} + +/* + * getdrvnode -- return the driver name based on mdname_t->bname + * Need to free pointer when finished. + */ +char * +getdrvnode(mdname_t *np, md_error_t *ep) +{ + char *devicespath, + *drvnode, + *cp; + + if ((devicespath = metagetdevicesname(np, ep)) == NULL) + return (NULL); + + /* + * At this point devicespath should be like the following + * "/devices/<unknow_and_dont_care>/xxxx@vvvv" + * + * There's a couple of 'if' statements below which could + * return an error condition, but I've decide to allow + * a more open approach regarding the mapping so as to + * not restrict possible future projects. 
+ */ + if (drvnode = strrchr(devicespath, '/')) + /* + * drvnode now just "xxxx@vvvv" + */ + drvnode++; + + if (cp = strrchr(drvnode, '@')) + /* + * Now drvnode is just the driver name "xxxx" + */ + *cp = '\0'; + + cp = Strdup(drvnode); + Free(devicespath); + np->devicesname = NULL; + + return (cp); +} + +/* + * meta_load_dl -- open dynamic library using LDLIBRARYPATH, a debug + * environment variable METALDPATH, or the default location. + */ +static void * +meta_load_dl(mdname_t *np, md_error_t *ep) +{ + char *drvnode, + newpath[MAXPATHLEN], + *p; + void *cookie; + + if ((drvnode = getdrvnode(np, ep)) != NULL) { + + /* + * Library seach algorithm: + * 1) Use LDLIBRARYPATH which is implied when a non-absolute + * path name is passed to dlopen() + * 2) Use the value of METALDPATH as the directory. Mainly + * used for debugging + * 3) Last search the default location of "/usr/lib" + */ + (void) snprintf(newpath, sizeof (newpath), "lib%s.so.1", + drvnode); + if ((cookie = dlopen(newpath, RTLD_LAZY)) == NULL) { + if ((p = getenv("METALDPATH")) == NULL) + p = METALDPATH_DEFAULT; + (void) snprintf(newpath, sizeof (newpath), + "%s/lib%s.so.1", p, drvnode); + Free(drvnode); + if ((cookie = dlopen(newpath, RTLD_LAZY)) != NULL) { + /* + * Common failure here would be failing to + * find a libXX.so.1 such as libsd.so.1 + * Some controllers will not have a library + * because there's no enclosure or name + * translation required. 
+ */ + return (cookie); + } + } else { + Free(drvnode); + return (cookie); + } + } + return (NULL); +} + +/* + * meta_match_names -- possibly convert the driver names returned by CINFO + */ +static void +meta_match_names(mdname_t *np, struct dk_cinfo *cp, mdcinfo_t *mdcp, + md_error_t *ep) +{ + void *cookie; + meta_convert_e ((*fptr)(mdname_t *, struct dk_cinfo *, mdcinfo_t *, + md_error_t *)); + + if ((cookie = meta_load_dl(np, ep)) != NULL) { + fptr = (meta_convert_e (*)(mdname_t *, struct dk_cinfo *, + mdcinfo_t *, md_error_t *))dlsym(cookie, "convert_path"); + if (fptr != NULL) + (void) (*fptr)(np, cp, mdcp, ep); + (void) dlclose(cookie); + } +} + +/* + * meta_match_enclosure -- return any enclosure info if found + */ +int +meta_match_enclosure(mdname_t *np, mdcinfo_t *mdcp, md_error_t *ep) +{ + meta_enclosure_e e, + ((*fptr)(mdname_t *, mdcinfo_t *, + md_error_t *)); + void *cookie; + + if ((cookie = meta_load_dl(np, ep)) != NULL) { + fptr = (meta_enclosure_e (*)(mdname_t *, mdcinfo_t *, + md_error_t *))dlsym(cookie, "get_enclosure"); + if (fptr != NULL) { + e = (*fptr)(np, mdcp, ep); + switch (e) { + case Enclosure_Error: + /* + * Looks like this library wanted to handle + * our device and had an internal error. + */ + return (1); + + case Enclosure_Okay: + /* + * Found a library to handle the request so + * just return with data provided. 
+ */ + return (0); + + case Enclosure_Noop: + /* + * Need to continue the search + */ + break; + } + } + (void) dlclose(cookie); + } + return (0); +} + +static int +meta_cinfo_to_md(mdname_t *np, struct dk_cinfo *cp, mdcinfo_t *mdcp, + md_error_t *ep) +{ + /* default */ + (void) memset(mdcp, '\0', sizeof (*mdcp)); + (void) strncpy(mdcp->cname, cp->dki_cname, + min((sizeof (mdcp->cname) - 1), sizeof (cp->dki_cname))); + mdcp->ctype = MHD_CTLR_GENERIC; + mdcp->cnum = cp->dki_cnum; + (void) strncpy(mdcp->dname, cp->dki_dname, + min((sizeof (mdcp->dname) - 1), sizeof (cp->dki_dname))); + mdcp->unit = cp->dki_unit; + mdcp->maxtransfer = cp->dki_maxtransfer; + + /* + * See if the driver name returned from DKIOCINFO + * is valid or not. In somecases, such as the ap_dmd + * driver, we need to modify the name that's return + * for everything to work. + */ + meta_match_names(np, cp, mdcp, ep); + + if (meta_match_enclosure(np, mdcp, ep)) + return (-1); + + /* return success */ + return (0); +} + +static void +meta_vtoc_to_md( + struct vtoc *vp, + mdvtoc_t *mdvp +) +{ + char typename[sizeof (vp->v_asciilabel) + 1]; + uint_t i; + + (void) memset(mdvp, '\0', sizeof (*mdvp)); + (void) strncpy(typename, vp->v_asciilabel, + sizeof (vp->v_asciilabel)); + typename[sizeof (typename) - 1] = '\0'; + for (i = 0; ((i < sizeof (typename)) && (typename[i] != '\0')); ++i) { + if ((typename[i] == ' ') || (typename[i] == '\t')) { + typename[i] = '\0'; + break; + } + } + mdvp->typename = Strdup(typename); + mdvp->nparts = vp->v_nparts; + for (i = 0; (i < vp->v_nparts); ++i) { + mdvp->parts[i].start = vp->v_part[i].p_start; + mdvp->parts[i].size = vp->v_part[i].p_size; + mdvp->parts[i].tag = vp->v_part[i].p_tag; + mdvp->parts[i].flag = vp->v_part[i].p_flag; + if (vp->v_part[i].p_start == 0 && vp->v_part[i].p_size > 0) + mdvp->parts[i].label = btodb(DK_LABEL_SIZE); + } +} + +/* + * free allocations in vtoc + */ +void +metafreevtoc( + mdvtoc_t *vtocp +) +{ + if (vtocp->typename != NULL) + 
Free(vtocp->typename); + (void) memset(vtocp, 0, sizeof (*vtocp)); +} + +/* + * return md types + */ +mdvtoc_t * +metagetvtoc( + mdname_t *np, /* only rname, drivenamep, are setup */ + int nocache, + uint_t *partnop, + md_error_t *ep +) +{ + mddrivename_t *dnp = np->drivenamep; + struct dk_geom geom; + char *minor_name = NULL; + char *rname = np->rname; + int fd; + int partno; + int err = 0; /* saves errno from ioctl */ + ddi_devid_t devid; + char *p; + + /* short circuit */ + if ((! nocache) && (dnp->vtoc.nparts != 0)) { + if (partnop != NULL) { + /* + * the following assigment works because the + * mdname_t structs are always created as part + * of the drivenamep struct. When a user + * creates an mdname_t struct it either + * uses an existing drivenamep struct or creates + * a new one and then adds the mdname_t struct + * as part of its parts_val array. So what is + * being computed below is the slice offset in + * the parts_val array. + */ + *partnop = np - np->drivenamep->parts.parts_val; + assert(*partnop < dnp->parts.parts_len); + } + return (&dnp->vtoc); + } + + /* can't get vtoc */ + if (! nocache) { + switch (dnp->type) { + case MDT_ACCES: + case MDT_UNKNOWN: + (void) mdsyserror(ep, dnp->errnum, rname); + return (NULL); + } + } + + /* get all the info */ + if ((fd = open(rname, (O_RDONLY|O_NDELAY), 0)) < 0) { + (void) mdsyserror(ep, errno, rname); + return (NULL); + } + + /* + * The disk is open so this is a good point to get the devid + * otherwise it will need to be done at another time which + * means reopening it. 
+ */ + if (devid_get(fd, &devid) != 0) { + /* there is no devid for the disk */ + if (((p = getenv("MD_DEBUG")) != NULL) && + (strstr(p, "DEVID") != NULL)) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%s has no device id\n"), np->rname); + } + np->minor_name = (char *)NULL; + dnp->devid = NULL; + } else { + (void) devid_get_minor_name(fd, &minor_name); + /* + * The minor name could be NULL if the underlying + * device driver does not support 'minor names'. + * This means we do not use devid's for this device. + * SunCluster did driver does not support minor names. + */ + if (minor_name != NULL) { + np->minor_name = Strdup(minor_name); + devid_str_free(minor_name); + dnp->devid = devid_str_encode(devid, NULL); + } else { + np->minor_name = (char *)NULL; + dnp->devid = NULL; + + if (((p = getenv("MD_DEBUG")) != NULL) && + (strstr(p, "DEVID") != NULL)) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%s no minor name (no devid)\n"), + np->rname); + } + } + devid_free(devid); + } + + /* + * if our drivenamep points to a device not supporting DKIOCGGEOM, + * it's likely to have an EFI label. 
+ */ + (void) memset(&geom, 0, sizeof (geom)); + if (ioctl(fd, DKIOCGGEOM, &geom) != 0) { + err = errno; + if (err == ENOTTY) { + (void) mddeverror(ep, MDE_NOT_DISK, NODEV, rname); + (void) close(fd); + return (NULL); + } else if (err != ENOTSUP) { + (void) mdsyserror(ep, err, rname); + (void) close(fd); + return (NULL); + } + + } + /* + * If we are here, there was either no failure on DKIOCGGEOM or + * the failure was ENOTSUP + */ + if (err == ENOTSUP) { + /* DKIOCGGEOM yielded ENOTSUP => try efi_alloc_and_read */ + struct dk_gpt *gpt; + int save_errno; + + /* this also sets errno */ + partno = efi_alloc_and_read(fd, &gpt); + save_errno = errno; + (void) close(fd); + if (partno < 0) { + efi_free(gpt); + (void) mdsyserror(ep, save_errno, rname); + return (NULL); + } + if (partno >= gpt->efi_nparts) { + efi_free(gpt); + (void) mddeverror(ep, MDE_INVALID_PART, NODEV64, + rname); + return (NULL); + } + + /* convert to our format */ + metafreevtoc(&dnp->vtoc); + meta_efi_to_mdvtoc(gpt, &dnp->vtoc); + if (dnp->vtoc.nparts > MD_MAX_PARTS) { + (void) mddeverror(ep, MDE_TOO_MANY_PARTS, NODEV64, + rname); + return (NULL); + } + /* + * libmeta needs at least V_NUMPAR partitions. 
+ * If we have an EFI partition with less than V_NUMPAR slices, + * we nevertheless reserve space for V_NUMPAR + */ + + if (dnp->vtoc.nparts < V_NUMPAR) { + dnp->vtoc.nparts = V_NUMPAR; + } + meta_efi_to_mdgeom(gpt, &dnp->geom); + efi_free(gpt); + } else { + /* no error on DKIOCGGEOM, try meta_getvtoc */ + struct vtoc vtoc; + + if (meta_getvtoc(fd, np->cname, &vtoc, &partno, ep) < 0) { + (void) close(fd); + return (NULL); + } + (void) close(fd); + + /* convert to our format */ + meta_geom_to_md(&geom, &dnp->geom); + metafreevtoc(&dnp->vtoc); + meta_vtoc_to_md(&vtoc, &dnp->vtoc); + } + + /* fix up any drives which are now accessible */ + if ((nocache) && (dnp->type == MDT_ACCES) && + (dnp->vtoc.nparts == dnp->parts.parts_len)) { + dnp->type = MDT_COMP; + dnp->errnum = 0; + } + + /* save partno */ + assert(partno < dnp->vtoc.nparts); + if (partnop != NULL) + *partnop = partno; + + /* return info */ + return (&dnp->vtoc); +} + +static void +meta_mdvtoc_to_vtoc( + mdvtoc_t *mdvp, + struct vtoc *vp +) +{ + uint_t i; + + (void) memset(&vp->v_part, '\0', sizeof (vp->v_part)); + vp->v_nparts = (ushort_t)mdvp->nparts; + for (i = 0; (i < mdvp->nparts); ++i) { + vp->v_part[i].p_start = (daddr32_t)mdvp->parts[i].start; + vp->v_part[i].p_size = (daddr32_t)mdvp->parts[i].size; + vp->v_part[i].p_tag = mdvp->parts[i].tag; + vp->v_part[i].p_flag = mdvp->parts[i].flag; + } +} + +/* + * Set the vtoc, but use the cached copy to get the info from. + * We write np->drivenamep->vtoc to disk. + * Before we can do this we read the vtoc in. 
+ * if we're dealing with a metadevice and this metadevice is a 64 bit device + * we can use meta_getmdvtoc/meta_setmdvtoc + * else + * we use meta_getvtoc/meta_setvtoc but than we first have to convert + * dnp->vtoc (actually being a mdvtoc_t) into a vtoc_t + */ +int +metasetvtoc( + mdname_t *np, + md_error_t *ep +) +{ + char *rname = np->rname; + mddrivename_t *dnp = np->drivenamep; + int fd; + int err; + int save_errno; + struct dk_geom geom; + + if ((fd = open(rname, (O_RDONLY | O_NDELAY), 0)) < 0) + return (mdsyserror(ep, errno, rname)); + + err = ioctl(fd, DKIOCGGEOM, &geom); + save_errno = errno; + if (err == 0) { + struct vtoc vtoc; + + if (meta_getvtoc(fd, np->cname, &vtoc, NULL, ep) < 0) { + (void) close(fd); + return (-1); + } + + meta_mdvtoc_to_vtoc(&dnp->vtoc, &vtoc); + + if (meta_setvtoc(fd, np->cname, &vtoc, ep) < 0) { + (void) close(fd); + return (-1); + } + } else if (save_errno == ENOTSUP) { + struct dk_gpt *gpt; + int ret; + + /* allocation of gpt is done in meta_mdvtoc_to_efi */ + meta_mdvtoc_to_efi(&dnp->vtoc, &gpt); + + ret = efi_write(fd, gpt); + save_errno = errno; + free(gpt); + if (ret != 0) { + (void) close(fd); + return (mdsyserror(ep, save_errno, rname)); + } else { + (void) close(fd); + return (0); + } + + } else { + (void) close(fd); + return (mdsyserror(ep, save_errno, rname)); + } + + (void) close(fd); + + return (0); +} + +mdgeom_t * +metagetgeom( + mdname_t *np, /* only rname, drivenamep, are setup */ + md_error_t *ep +) +{ + if (metagetvtoc(np, FALSE, NULL, ep) == NULL) + return (NULL); + return (&np->drivenamep->geom); +} + +mdcinfo_t * +metagetcinfo( + mdname_t *np, /* only rname, drivenamep, are setup */ + md_error_t *ep +) +{ + char *rname = np->rname; + mddrivename_t *dnp = np->drivenamep; + int fd; + struct dk_cinfo cinfo; + + /* short circuit */ + if (dnp->cinfo.cname[0] != '\0') + return (&dnp->cinfo); + + /* get controller info */ + if ((fd = open(rname, (O_RDONLY|O_NDELAY), 0)) < 0) { + (void) mdsyserror(ep, errno, 
rname); + return (NULL); + } + if (ioctl(fd, DKIOCINFO, &cinfo) != 0) { + int save = errno; + + (void) close(fd); + if (save == ENOTTY) { + (void) mddeverror(ep, MDE_NOT_DISK, NODEV64, rname); + } else { + (void) mdsyserror(ep, save, rname); + } + return (NULL); + } + (void) close(fd); /* sd/ssd bug */ + + /* convert to our format */ + if (meta_cinfo_to_md(np, &cinfo, &dnp->cinfo, ep) != 0) + return (NULL); + + /* return info */ + return (&dnp->cinfo); +} + +/* + * get partition number + */ +int +metagetpartno( + mdname_t *np, + md_error_t *ep +) +{ + mdvtoc_t *vtocp; + uint_t partno; + + if ((vtocp = metagetvtoc(np, FALSE, &partno, ep)) == NULL) + return (-1); + assert(partno < vtocp->nparts); + return (partno); +} + +/* + * get size of device + */ +diskaddr_t +metagetsize( + mdname_t *np, + md_error_t *ep +) +{ + mdvtoc_t *vtocp; + uint_t partno; + + if ((vtocp = metagetvtoc(np, FALSE, &partno, ep)) == NULL) + return (MD_DISKADDR_ERROR); + assert(partno < vtocp->nparts); + return (vtocp->parts[partno].size); +} + +/* + * get label of device + */ +diskaddr_t +metagetlabel( + mdname_t *np, + md_error_t *ep +) +{ + mdvtoc_t *vtocp; + uint_t partno; + + if ((vtocp = metagetvtoc(np, FALSE, &partno, ep)) == NULL) + return (MD_DISKADDR_ERROR); + assert(partno < vtocp->nparts); + return (vtocp->parts[partno].label); +} + +/* + * find out where database replicas end + */ +static int +mddb_getendblk( + mdsetname_t *sp, + mdname_t *np, + diskaddr_t *endblkp, + md_error_t *ep +) +{ + md_replicalist_t *rlp = NULL; + md_replicalist_t *rl; + + /* make sure we have a component */ + *endblkp = 0; + if (metaismeta(np)) + return (0); + + /* get replicas, quit if none */ + if (metareplicalist(sp, MD_BASICNAME_OK | PRINT_FAST, &rlp, ep) < 0) { + if (! 
mdismddberror(ep, MDE_DB_NODB)) + return (-1); + mdclrerror(ep); + return (0); + } else if (rlp == NULL) + return (0); + + /* go through all the replicas */ + for (rl = rlp; (rl != NULL); rl = rl->rl_next) { + md_replica_t *rp = rl->rl_repp; + mdname_t *repnamep = rp->r_namep; + diskaddr_t dbend; + + if (np->dev != repnamep->dev) + continue; + dbend = rp->r_blkno + rp->r_nblk - 1; + if (dbend > *endblkp) + *endblkp = dbend; + } + + /* cleanup, return success */ + metafreereplicalist(rlp); + return (0); +} + +/* + * return cached start block + */ +static diskaddr_t +metagetend( + mdsetname_t *sp, + mdname_t *np, + md_error_t *ep +) +{ + diskaddr_t end_blk = MD_DISKADDR_ERROR; + + /* short circuit */ + if (np->end_blk != MD_DISKADDR_ERROR) + return (np->end_blk); + + /* look for database locations */ + if (mddb_getendblk(sp, np, &end_blk, ep) != 0) + return (MD_DISKADDR_ERROR); + + /* success */ + np->end_blk = end_blk; + return (end_blk); +} + +/* + * does device have a metadb + */ +int +metahasmddb( + mdsetname_t *sp, + mdname_t *np, + md_error_t *ep +) +{ + if (metagetend(sp, np, ep) == MD_DISKADDR_ERROR) + return (-1); + else if (np->end_blk > 0) + return (1); + else + return (0); +} + +/* + * return cached start block + */ +diskaddr_t +metagetstart( + mdsetname_t *sp, + mdname_t *np, + md_error_t *ep +) +{ + diskaddr_t start_blk = MD_DISKADDR_ERROR; + + /* short circuit */ + if (np->start_blk != MD_DISKADDR_ERROR) + return (np->start_blk); + + /* look for database locations */ + if ((start_blk = metagetend(sp, np, ep)) == MD_DISKADDR_ERROR) + return (MD_DISKADDR_ERROR); + + /* check for label */ + if (start_blk == 0) { + start_blk = metagetlabel(np, ep); + if (start_blk == MD_DISKADDR_ERROR) { + return (MD_DISKADDR_ERROR); + } + } + + /* roundup to next cylinder */ + if (start_blk != 0) { + mdgeom_t *geomp; + + if ((geomp = metagetgeom(np, ep)) == NULL) + return (MD_DISKADDR_ERROR); + start_blk = roundup(start_blk, (geomp->nhead * geomp->nsect)); + } + + /* 
success */ + np->start_blk = start_blk; + return (start_blk); +} + +/* + * return cached devices name + */ +char * +metagetdevicesname( + mdname_t *np, + md_error_t *ep +) +{ + char path[MAXPATHLEN + 1]; + int len; + + /* short circuit */ + if (np->devicesname != NULL) + return (np->devicesname); + + /* follow symlink */ + if ((len = readlink(np->bname, path, (sizeof (path) - 1))) < 0) { + (void) mdsyserror(ep, errno, np->bname); + return (NULL); + } else if (len >= sizeof (path)) { + (void) mdsyserror(ep, ENAMETOOLONG, np->bname); + return (NULL); + } + path[len] = '\0'; + if ((len = strfind(path, "/devices/")) < 0) { + (void) mddeverror(ep, MDE_DEVICES_NAME, np->dev, np->bname); + return (NULL); + } + + /* return name */ + np->devicesname = Strdup(path + len + strlen("/devices")); + return (np->devicesname); +} + +/* + * get metadevice misc name + */ +char * +metagetmiscname( + mdname_t *np, + md_error_t *ep +) +{ + mddrivename_t *dnp = np->drivenamep; + md_i_driverinfo_t mid; + + /* short circuit */ + if (dnp->miscname != NULL) + return (dnp->miscname); + if (metachkmeta(np, ep) != 0) + return (NULL); + + /* get misc module from driver */ + (void) memset(&mid, 0, sizeof (mid)); + mid.mnum = meta_getminor(np->dev); + if (metaioctl(MD_IOCGET_DRVNM, &mid, &mid.mde, np->cname) != 0) { + (void) mdstealerror(ep, &mid.mde); + return (NULL); + } + + /* return miscname */ + dnp->miscname = Strdup(MD_PNTDRIVERNAME(&mid)); + return (dnp->miscname); +} + +/* + * get unit structure from driver + */ +md_unit_t * +meta_get_mdunit( + mdsetname_t *sp, + mdname_t *np, + md_error_t *ep +) +{ + md_i_get_t mig; + char *miscname = NULL; + + /* should have a set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(np->dev))); + + /* get size of unit structure */ + if (metachkmeta(np, ep) != 0) + return (NULL); + if ((miscname = metagetmiscname(np, ep)) == NULL) + return (NULL); + (void) memset(&mig, '\0', sizeof (mig)); + MD_SETDRIVERNAME(&mig, miscname, sp->setno); 
+ mig.id = meta_getminor(np->dev); + if (metaioctl(MD_IOCGET, &mig, &mig.mde, np->cname) != 0) { + (void) mdstealerror(ep, &mig.mde); + return (NULL); + } + + /* get actual unit structure */ + assert(mig.size > 0); + mig.mdp = (uintptr_t)Zalloc(mig.size); + if (metaioctl(MD_IOCGET, &mig, &mig.mde, np->cname) != 0) { + (void) mdstealerror(ep, &mig.mde); + Free((void *)mig.mdp); + return (NULL); + } + + return ((md_unit_t *)mig.mdp); +} + +/* + * free metadevice unit + */ +void +meta_free_unit( + mddrivename_t *dnp +) +{ + if (dnp->unitp != NULL) { + switch (dnp->unitp->type) { + case MD_DEVICE: + meta_free_stripe((md_stripe_t *)dnp->unitp); + break; + case MD_METAMIRROR: + meta_free_mirror((md_mirror_t *)dnp->unitp); + break; + case MD_METATRANS: + meta_free_trans((md_trans_t *)dnp->unitp); + break; + case MD_METARAID: + meta_free_raid((md_raid_t *)dnp->unitp); + break; + case MD_METASP: + meta_free_sp((md_sp_t *)dnp->unitp); + break; + default: + assert(0); + break; + } + dnp->unitp = NULL; + } +} + +/* + * free metadevice name info + */ +void +meta_invalidate_name( + mdname_t *namep +) +{ + mddrivename_t *dnp = namep->drivenamep; + + /* get rid of cached name info */ + if (namep->devicesname != NULL) { + Free(namep->devicesname); + namep->devicesname = NULL; + } + namep->key = MD_KEYBAD; + namep->start_blk = -1; + namep->end_blk = -1; + + /* get rid of cached drivename info */ + (void) memset(&dnp->geom, 0, sizeof (dnp->geom)); + (void) memset(&dnp->cinfo, 0, sizeof (dnp->cinfo)); + metafreevtoc(&dnp->vtoc); + metaflushsidenames(dnp); + dnp->side_names_key = MD_KEYBAD; + if (dnp->miscname != NULL) { + Free(dnp->miscname); + dnp->miscname = NULL; + } + meta_free_unit(dnp); +} + +/* + * get metadevice unit + */ +md_common_t * +meta_get_unit( + mdsetname_t *sp, + mdname_t *np, + md_error_t *ep +) +{ + char *miscname; + + /* short circuit */ + if (np->drivenamep->unitp != NULL) + return (np->drivenamep->unitp); + if (metachkmeta(np, ep) != 0) + return (NULL); + + /* 
dispatch */ + if ((miscname = metagetmiscname(np, ep)) == NULL) + return (NULL); + else if (strcmp(miscname, MD_STRIPE) == 0) + return ((md_common_t *)meta_get_stripe(sp, np, ep)); + else if (strcmp(miscname, MD_MIRROR) == 0) + return ((md_common_t *)meta_get_mirror(sp, np, ep)); + else if (strcmp(miscname, MD_TRANS) == 0) + return ((md_common_t *)meta_get_trans(sp, np, ep)); + else if (strcmp(miscname, MD_RAID) == 0) + return ((md_common_t *)meta_get_raid(sp, np, ep)); + else if (strcmp(miscname, MD_SP) == 0) + return ((md_common_t *)meta_get_sp(sp, np, ep)); + else { + (void) mdmderror(ep, MDE_UNKNOWN_TYPE, meta_getminor(np->dev), + np->cname); + return (NULL); + } +} + + +int +meta_isopen( + mdsetname_t *sp, + mdname_t *np, + md_error_t *ep, + mdcmdopts_t options +) +{ + md_isopen_t d; + + if (metachkmeta(np, ep) != 0) + return (-1); + + (void) memset(&d, '\0', sizeof (d)); + d.dev = np->dev; + if (metaioctl(MD_IOCISOPEN, &d, &d.mde, np->cname) != 0) + return (mdstealerror(ep, &d.mde)); + + /* + * shortcut: if the device is open, no need to check on other nodes, + * even in case of a mn metadevice + * Also return in case we're told not to check on other nodes. + */ + if ((d.isopen != 0) || ((options & MDCMD_MN_OPEN_CHECK) == 0)) { + return (d.isopen); + } + + /* + * If the device is closed locally, but it's a mn device, + * check on all other nodes, too + */ + if (sp->setno != MD_LOCAL_SET) { + (void) metaget_setdesc(sp, ep); /* not supposed to fail */ + if (sp->setdesc->sd_flags & MD_SR_MN) { + int err = 0; + md_mn_result_t *resp; + /* + * This message is never directly issued. + * So we launch it with a suspend override flag. + * If the commd is suspended, and this message comes + * along it must be sent due to replaying a metainit or + * similar. In that case we don't want this message to + * be blocked. + * If the commd is not suspended, the flag does no harm. 
+ * Additionally we don't want the result of the message + * cached in the MCT, because we want uptodate results, + * and the message doesn't need being logged either. + * Hence NO_LOG and NO_MCT + */ + err = mdmn_send_message( + sp->setno, + MD_MN_MSG_CLU_CHECK, + MD_MSGF_NO_MCT | MD_MSGF_STOP_ON_ERROR | + MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, + (char *)&d, sizeof (md_isopen_t), + &resp, ep); + if (err == 0) { + d.isopen = resp->mmr_exitval; + } else { + /* + * in case some error occurred, + * we better say the device is open + */ + d.isopen = 1; + } + if (resp != (md_mn_result_t *)NULL) { + free_result(resp); + } + + } + } + + return (d.isopen); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_namespace.c b/usr/src/lib/lvm/libmeta/common/meta_namespace.c new file mode 100644 index 0000000000..eb21cbbdd3 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_namespace.c @@ -0,0 +1,601 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * namespace utilities + */ + +#include <meta.h> + +typedef struct deviceinfo { + char *bname; /* block name of the device */ + char *dname; /* driver for the device */ + minor_t mnum; /* minor number for the device */ +} deviceinfo_t; + +static deviceinfo_t devlist[MD_MNMAXSIDES]; + +/* + * Ask the driver for the device name, driver name, and minor number; + * which has been stored in the metadevice state database + * (on behalf of the utilities). + * (by key) + */ +char * +meta_getnmentbykey( + set_t setno, + side_t sideno, + mdkey_t key, + char **drvnm, + minor_t *mnum, + md_dev64_t *dev, + md_error_t *ep +) +{ + struct mdnm_params nm; + static char device_name[MAXPATHLEN]; + + (void) memset(&nm, '\0', sizeof (nm)); + nm.setno = setno; + nm.side = sideno; + nm.key = key; + nm.devname = (uint64_t)device_name; + + if (metaioctl(MD_IOCGET_NM, &nm, &nm.mde, NULL) != 0) { + (void) mdstealerror(ep, &nm.mde); + return (NULL); + } + + if (drvnm != NULL) + *drvnm = Strdup(nm.drvnm); + + if (mnum != NULL) + *mnum = nm.mnum; + + if (dev != NULL) + *dev = meta_expldev(makedevice(nm.major, nm.mnum)); + + return (Strdup(device_name)); +} + +/* + * Ask the driver for the minor name which has been stored in the + * metadevice state database. + * (by key) + */ +char * +meta_getdidminorbykey( + set_t setno, + side_t sideno, + mdkey_t key, + md_error_t *ep +) +{ + struct mdnm_params nm; + static char minorname[MAXPATHLEN]; + + (void) memset(&nm, '\0', sizeof (nm)); + nm.setno = setno; + nm.side = sideno; + nm.key = key; + nm.minorname = (uint64_t)minorname; + + if (metaioctl(MD_IOCGET_DIDMIN, &nm, &nm.mde, NULL) != 0) { + (void) mdstealerror(ep, &nm.mde); + return (NULL); + } + + return (Strdup(minorname)); +} + +/* + * Ask the driver for the device id string which has been stored in the + * metadevice state database (on behalf of the utilities). 
+ * (by key) + */ +ddi_devid_t +meta_getdidbykey( + set_t setno, + side_t sideno, + mdkey_t key, + md_error_t *ep +) +{ + struct mdnm_params nm; + + (void) memset(&nm, '\0', sizeof (nm)); + nm.setno = setno; + nm.side = sideno; + nm.key = key; + + /* + * First ask the driver for the size of the device id string. This is + * signaled by passing the driver a devid_size of zero. + */ + nm.devid_size = 0; + if (metaioctl(MD_IOCGET_DID, &nm, &nm.mde, NULL) != 0) { + (void) mdstealerror(ep, &nm.mde); + return (NULL); + } + + /* + * If the devid_size is still zero then something is wrong. + */ + if (nm.devid_size == 0) { + (void) mdstealerror(ep, &nm.mde); + return (NULL); + } + + /* + * Now go get the actual device id string. Caller is responsible for + * free'ing device id memory buffer. + */ + if ((nm.devid = (uintptr_t)malloc(nm.devid_size)) == NULL) { + return (NULL); + } + if (metaioctl(MD_IOCGET_DID, &nm, &nm.mde, NULL) != 0) { + (void) mdstealerror(ep, &nm.mde); + (void) free((void *)nm.devid); + return (NULL); + } + + return ((void *)nm.devid); +} + +/* + * set the devid. + */ +int +meta_setdid( + set_t setno, + side_t sideno, + mdkey_t key, + md_error_t *ep +) +{ + struct mdnm_params nm; + int i; + + (void) memset(&nm, '\0', sizeof (nm)); + nm.setno = setno; + nm.side = sideno; + nm.key = key; + + if (metaioctl(MD_IOCSET_DID, &nm, &nm.mde, NULL) != 0) { + (void) mdstealerror(ep, &nm.mde); + return (-1); + } + + if (setno == MD_LOCAL_SET) { + /* + * If this is the local set then we are adding in the devids + * for the disks in the diskset and so this means adding + * a reference count for each side. Need to do this after + * the initial add so that the correct devid is picked up. + * The key is the key of the drive record and as such this + * means the minor number of the device which is used to + * get the devid. 
If the wrong side is used then it would + * be possible to get the wrong devid in the namespace, hence + * the requirement to process the local side first of all. + */ + for (i = 0 + SKEW; i < MD_MAXSIDES; i++) { + /* + * We can just call the ioctl again because it will + * fail with ENOENT if the side does not exist, and + * more importantly does not increment the usage count + * on the devid. + */ + nm.side = (side_t)i; + if (nm.side == sideno) + continue; + if (metaioctl(MD_IOCSET_DID, &nm, &nm.mde, NULL) != 0) { + if (mdissyserror(&nm.mde, ENODEV)) { + mdclrerror(&nm.mde); + } else { + (void) mdstealerror(ep, &nm.mde); + return (-1); + } + } + } + } + return (0); +} +/* + * Ask the driver for the name, which has been stored in the + * metadevice state database (on behalf of the utilities). + * (by key) + */ +char * +meta_getnmbykey( + set_t setno, + side_t sideno, + mdkey_t key, + md_error_t *ep +) +{ + return (meta_getnmentbykey(setno, sideno, key, NULL, NULL, NULL, ep)); +} + +/* + * Ask the driver for the device name, driver name, minor number, and key; + * which has been stored in the metadevice state database + * (on behalf of the utilities). 
+ * (by md_dev64_t) + */ +char * +meta_getnmentbydev( + set_t setno, + side_t sideno, + md_dev64_t dev, + char **drvnm, + minor_t *mnum, + mdkey_t *key, + md_error_t *ep +) +{ + struct mdnm_params nm; + static char device_name[MAXPATHLEN]; + + /* must have a dev */ + assert(dev != NODEV64); + + (void) memset(&nm, '\0', sizeof (nm)); + nm.setno = setno; + nm.side = sideno; + nm.key = MD_KEYWILD; + nm.major = meta_getmajor(dev); + nm.mnum = meta_getminor(dev); + nm.devname = (uint64_t)device_name; + + if (metaioctl(MD_IOCGET_NM, &nm, &nm.mde, NULL) != 0) { + (void) mdstealerror(ep, &nm.mde); + return (NULL); + } + + if (drvnm != NULL) + *drvnm = Strdup(nm.drvnm); + if (mnum != NULL) + *mnum = nm.mnum; + + if (key != NULL) + *key = nm.retkey; + + return (Strdup(device_name)); +} + +int +add_name( + mdsetname_t *sp, + side_t sideno, + mdkey_t key, + char *dname, + minor_t mnum, + char *bname, + md_error_t *ep +) +{ + struct mdnm_params nm; + + (void) memset(&nm, '\0', sizeof (nm)); + nm.setno = sp->setno; + nm.side = sideno; + nm.key = key; + nm.mnum = mnum; + (void) strncpy(nm.drvnm, dname, sizeof (nm.drvnm)); + nm.devname_len = strlen(bname) + 1; + nm.devname = (uintptr_t)bname; + + if (metaioctl(MD_IOCSET_NM, &nm, &nm.mde, bname) < 0) + return (mdstealerror(ep, &nm.mde)); + + return (nm.key); +} + +/* + * Remove the device name which corresponds to the given device number. 
+ */ +int +del_name( + mdsetname_t *sp, + side_t sideno, + mdkey_t key, + md_error_t *ep +) +{ + struct mdnm_params nm; + + (void) memset(&nm, '\0', sizeof (nm)); + nm.setno = sp->setno; + nm.side = sideno; + nm.key = key; + + if (metaioctl(MD_IOCREM_NM, &nm, &nm.mde, NULL) != 0) + return (mdstealerror(ep, &nm.mde)); + + return (0); +} + +static void +empty_devicelist() +{ + side_t sideno; + + for (sideno = 0; sideno < MD_MNMAXSIDES; sideno++) { + if (devlist[sideno].bname != (char *)NULL) { + Free(devlist[sideno].bname); + Free(devlist[sideno].dname); + devlist[sideno].mnum = NODEV; + } + } +} + +static void +add_to_devicelist( + side_t sideno, + char *bname, + char *dname, + minor_t mnum +) +{ + devlist[sideno].bname = Strdup(bname); + devlist[sideno].dname = Strdup(dname); + + devlist[sideno].mnum = mnum; +} + +/* + * Build a list of the names on the systems, if this fails the caller + * will tidy up the entries in the devlist. + */ +static int +build_sidenamelist( + mdsetname_t *sp, + mdname_t *np, + md_error_t *ep +) +{ + side_t sideno = MD_SIDEWILD; + minor_t mnum = NODEV; + char *bname = NULL; + char *dname = NULL; + int err; + + /*CONSTCOND*/ + while (1) { + + if ((err = meta_getnextside_devinfo(sp, np->bname, &sideno, + &bname, &dname, &mnum, ep)) == -1) + return (-1); + + if (err == 0) + break; + + /* the sideno gives us the index into the array */ + add_to_devicelist(sideno, bname, dname, mnum); + } + return (0); +} + +/* + * add name key + * the meta_create* functions should be the only ones using this. The + * adding of a name to the namespace must be done in a particular order + * to devid support for the disksets. The order is: add the 'local' side + * first of all, so the devid lookup in the kernel will use the correct + * device information and then add in the other sides. 
+ */ +int +add_key_name( + mdsetname_t *sp, + mdname_t *np, + mdnamelist_t **nlpp, + md_error_t *ep +) +{ + int err; + side_t sideno = MD_SIDEWILD; + side_t thisside; + mdkey_t key = MD_KEYWILD; + md_set_desc *sd; + int maxsides; + + /* should have a set */ + assert(sp != NULL); + + if (! metaislocalset(sp)) { + if ((sd = metaget_setdesc(sp, ep)) == NULL) { + return (-1); + } + } + + if (build_sidenamelist(sp, np, ep) == -1) { + empty_devicelist(); + return (-1); + } + + /* + * When a disk is added into the namespace the local information for + * that disk is added in first of all. For the local set this is not + * a concern and for the host that owns the diskset it is not a concern + * but when a disk is added in the remote namespace we *must* use the + * local information for that disk first of all. This is because when + * in the kernel (md_setdevname) the passed in dev_t is used to find + * the devid of the disk. This means we have to cater for the following: + * + * - a disk on the remote host having the dev_t that has been passed + * into the kernel and this disk is not actually the disk that is + * being added into the diskset. + * - the dev_t does not exist on this node + * + * So putting in the local information first of all makes sure that the + * dev_t passed into the kernel is correct with respect to that node + * and then any further additions for that name match on the key + * passed back. + */ + thisside = getmyside(sp, ep); + + if (devlist[thisside].dname == NULL || + strlen(devlist[thisside].dname) == 0) { + /* + * Did not find the disk information for the disk. This can + * be because of an inconsistancy in the namespace: that is the + * devid we have in the namespace does not exist on the + * system and thus when looking up the disk information + * using this devid we fail to find anything. 
+ */ + (void) mdcomperror(ep, MDE_SP_COMP_OPEN_ERR, 0, np->dev, + np->cname); + empty_devicelist(); + return (-1); + } + + if ((err = add_name(sp, thisside, key, devlist[thisside].dname, + devlist[thisside].mnum, devlist[thisside].bname, ep)) == -1) { + empty_devicelist(); + return (-1); + } + + /* We now have a 'key' so add in the other sides */ + key = (mdkey_t)err; + + if (metaislocalset(sp)) + goto done; + + if (MD_MNSET_DESC(sd)) + maxsides = MD_MNMAXSIDES; + else + maxsides = MD_MAXSIDES; + + for (sideno = 0; sideno < maxsides; sideno++) { + /* ignore thisside, as it has been added above */ + if (sideno == thisside) + continue; + + if (devlist[sideno].dname != NULL) { + err = add_name(sp, sideno, key, devlist[sideno].dname, + devlist[sideno].mnum, devlist[sideno].bname, ep); + if (err == -1) { + empty_devicelist(); + return (-1); + } + } + } + +done: + empty_devicelist(); + /* save key, return success */ + np->key = key; + if (nlpp != NULL) + (void) metanamelist_append(nlpp, np); + return (0); +} + +/* + * delete name key + * the meta_create* functions should be the only ones using this. The + * removal of the names must be done in a particular order: remove the + * non-local entries first of all and then finally the local entry. 
+ */ +int +del_key_name( + mdsetname_t *sp, + mdname_t *np, + md_error_t *ep +) +{ + side_t sideno = MD_SIDEWILD; + int err; + int retval = 0; + side_t thisside; + + /* should have a set */ + assert(sp != NULL); + + /* should have a key */ + assert((np->key != MD_KEYWILD) && (np->key != MD_KEYBAD)); + + thisside = getmyside(sp, ep); + + /* remove the remote sides first of all */ + for (;;) { + if ((err = meta_getnextside_devinfo(sp, np->bname, &sideno, + NULL, NULL, NULL, ep)) == -1) + return (-1); + + if (err == 0) + break; + + /* ignore thisside */ + if (thisside == sideno) { + continue; + } + if ((err = del_name(sp, sideno, np->key, ep)) == -1) + retval = -1; + } + + /* now remove this side */ + if (retval == 0) + if ((err = del_name(sp, thisside, np->key, ep)) == -1) + retval = -1; + + np->key = MD_KEYBAD; + return (retval); +} + +/* + * delete namelist keys + * the meta_create* functions should be the only ones using this + */ +int +del_key_names( + mdsetname_t *sp, + mdnamelist_t *nlp, + md_error_t *ep +) +{ + mdnamelist_t *p; + md_error_t status = mdnullerror; + int rval = 0; + + /* if ignoring errors */ + if (ep == NULL) + ep = &status; + + /* delete names */ + for (p = nlp; (p != NULL); p = p->next) { + mdname_t *np = p->namep; + + if (del_key_name(sp, np, ep) != 0) + rval = -1; + } + + /* cleanup, return success */ + if (ep == &status) + mdclrerror(&status); + return (rval); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_notify.c b/usr/src/lib/lvm/libmeta/common/meta_notify.c new file mode 100644 index 0000000000..5f66758f76 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_notify.c @@ -0,0 +1,692 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1995-2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. + */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +/* + * libmeta wrappers for event notification + */ + +#include <meta.h> +#include <sys/lvm/md_notify.h> + +#if defined(DEBUG) +#include <assert.h> +#endif /* DEBUG */ + +struct tag2obj_type { + md_tags_t tag; + ev_obj_t obj; +} tag2obj_typetab[] = +{ + { TAG_EMPTY, EVO_EMPTY }, + { TAG_METADEVICE, EVO_METADEV }, + { TAG_REPLICA, EVO_REPLICA }, + { TAG_HSP, EVO_HSP }, + { TAG_HS, EVO_HS }, + { TAG_SET, EVO_SET }, + { TAG_DRIVE, EVO_DRIVE }, + { TAG_HOST, EVO_HOST }, + { TAG_MEDIATOR, EVO_MEDIATOR }, + { TAG_UNK, EVO_UNSPECIFIED }, + + { TAG_LAST, EVO_LAST } +}; + +struct evdrv2evlib_type { + md_event_type_t drv; + evid_t lib; +} evdrv2evlib_typetab[] = +{ + { EQ_EMPTY, EV_EMPTY }, + { EQ_CREATE, EV_CREATE }, + { EQ_DELETE, EV_DELETE }, + { EQ_ADD, EV_ADD }, + { EQ_REMOVE, EV_REMOVE }, + { EQ_REPLACE, EV_REPLACE }, + { EQ_MEDIATOR_ADD, EV_MEDIATOR_ADD }, + { EQ_MEDIATOR_DELETE, EV_MEDIATOR_DELETE }, + { EQ_HOST_ADD, EV_HOST_ADD }, + { EQ_HOST_DELETE, EV_HOST_DELETE }, + { EQ_DRIVE_ADD, EV_DRIVE_ADD }, + { EQ_DRIVE_DELETE, 
EV_DRIVE_DELETE }, + { EQ_RENAME_SRC, EV_RENAME_SRC }, + { EQ_RENAME_DST, EV_RENAME_DST }, + { EQ_INIT_START, EV_INIT_START }, + { EQ_INIT_FAILED, EV_INIT_FAILED }, + { EQ_INIT_FATAL, EV_INIT_FATAL }, + { EQ_INIT_SUCCESS, EV_INIT_SUCCESS }, + { EQ_IOERR, EV_IOERR }, + { EQ_ERRED, EV_ERRED }, + { EQ_LASTERRED, EV_LASTERRED }, + { EQ_OK, EV_OK }, + { EQ_ENABLE, EV_ENABLE }, + { EQ_RESYNC_START, EV_RESYNC_START }, + { EQ_RESYNC_FAILED, EV_RESYNC_FAILED }, + { EQ_RESYNC_SUCCESS, EV_RESYNC_SUCCESS }, + { EQ_RESYNC_DONE, EV_RESYNC_DONE }, + { EQ_HOTSPARED, EV_HOTSPARED }, + { EQ_HS_FREED, EV_HS_FREED }, + { EQ_TAKEOVER, EV_TAKEOVER }, + { EQ_RELEASE, EV_RELEASE }, + { EQ_OPEN_FAIL, EV_OPEN_FAIL }, + { EQ_OFFLINE, EV_OFFLINE }, + { EQ_ONLINE, EV_ONLINE }, + { EQ_GROW, EV_GROW }, + { EQ_DETACH, EV_DETACH }, + { EQ_DETACHING, EV_DETACHING }, + { EQ_ATTACH, EV_ATTACH }, + { EQ_ATTACHING, EV_ATTACHING }, + { EQ_CHANGE, EV_CHANGE }, + { EQ_EXCHANGE, EV_EXCHANGE }, + { EQ_REGEN_START, EV_REGEN_START }, + { EQ_REGEN_DONE, EV_REGEN_DONE }, + { EQ_REGEN_FAILED, EV_REGEN_FAILED }, + { EQ_USER, EV_USER }, + { EQ_NOTIFY_LOST, EV_NOTIFY_LOST }, + + { EQ_LAST, EV_LAST } +}; + +static ev_obj_t +dev2tag(md_dev64_t dev, set_t setno, md_error_t *ep) +{ + mdname_t *np = NULL; + mdsetname_t *sp = NULL; + ev_obj_t obj = EVO_METADEV; + char *miscname; + + if ((sp = metasetnosetname(setno, ep)) == NULL) { + goto out; + } + if (!(np = metamnumname(&sp, meta_getminor(dev), 0, ep))) { + goto out; + } + + /* need to invalidate name in case rename or delete/create done */ + meta_invalidate_name(np); + + if (!(miscname = metagetmiscname(np, ep))) { + goto out; + } + if (strcmp(miscname, MD_STRIPE) == 0) { + obj = EVO_STRIPE; + } else if (strcmp(miscname, MD_MIRROR) == 0) { + obj = EVO_MIRROR; + } else if (strcmp(miscname, MD_RAID) == 0) { + obj = EVO_RAID5; + } else if (strcmp(miscname, MD_TRANS) == 0) { + obj = EVO_TRANS; + } +out: + return (obj); +} + +static ev_obj_t +tagdrv_2_objlib(md_tags_t 
tag) +{ + int i; + + for (i = 0; tag2obj_typetab[i].tag != TAG_LAST; i++) { + if (tag2obj_typetab[i].tag == tag) + return (tag2obj_typetab[i].obj); + } + return (EVO_UNSPECIFIED); +} + +static md_tags_t +objlib_2_tagdrv(ev_obj_t obj) +{ + int i; + + for (i = 0; tag2obj_typetab[i].tag != TAG_LAST; i++) { + if (tag2obj_typetab[i].obj == obj) + return (tag2obj_typetab[i].tag); + } + return (TAG_UNK); +} + + +static evid_t +evdrv_2_evlib(md_event_type_t drv_ev) +{ + int i; + + for (i = 0; evdrv2evlib_typetab[i].drv != EQ_LAST; i++) { + if (evdrv2evlib_typetab[i].drv == drv_ev) + return (evdrv2evlib_typetab[i].lib); + } + return (EV_UNK); +} + +static md_event_type_t +evlib_2_evdrv(evid_t lib_ev) +{ + int i; + + for (i = 0; evdrv2evlib_typetab[i].drv != EQ_LAST; i++) { + if (evdrv2evlib_typetab[i].lib == lib_ev) + return (evdrv2evlib_typetab[i].drv); + } + return (EQ_EMPTY); +} + + +/* + * meta_event + * returns 0 on succcess or < 0 to indicate error. + * abs(return code) = errno + */ +static int +meta_event(md_event_ioctl_t *evctl, md_error_t *ep) +{ + int l; + + if (!evctl || !ep) + return (-EINVAL); + + l = strlen(evctl->mdn_name); + if ((l == 0 && evctl->mdn_cmd != EQ_PUT) || l >= MD_NOTIFY_NAME_SIZE) { + return (-EINVAL); + } + + MD_SETDRIVERNAME(evctl, MD_NOTIFY, 0); + mdclrerror(ep); + errno = 0; + + if (metaioctl(MD_IOCNOTIFY, evctl, ep, evctl->mdn_name) != 0) { + if (errno == 0) { + errno = EINVAL; + } + if (mdisok(ep)) { + (void) mdsyserror(ep, errno, evctl->mdn_name); + } + return (-errno); + } + + return (0); +} + +static void +init_evctl(char *qname, + md_tags_t tag, + md_event_type_t ev, + uint_t flags, + set_t set, + md_dev64_t dev, + md_event_cmds_t cmd, + u_longlong_t udata, + md_event_ioctl_t *evctlp) +{ + + assert(evctlp); + + (void) memset(evctlp, 0, sizeof (md_event_ioctl_t)); + + evctlp->mdn_magic = MD_EVENT_ID; + evctlp->mdn_rev = MD_NOTIFY_REVISION; + + if (qname) + (void) strncpy(evctlp->mdn_name, qname, MD_NOTIFY_NAME_SIZE-1); + else + (void) 
memset(evctlp->mdn_name, 0, MD_NOTIFY_NAME_SIZE); + + evctlp->mdn_tag = tag; + evctlp->mdn_event = ev; + evctlp->mdn_flags = flags; + evctlp->mdn_set = set; + evctlp->mdn_dev = dev; + evctlp->mdn_cmd = cmd; + evctlp->mdn_user = udata; +} + +/* + * meta_notify_createq + * - creates an eventq + * - returns 0 on success or errno and sets ep + */ +int +meta_notify_createq(char *qname, ulong_t flags, md_error_t *ep) +{ + md_event_ioctl_t evctl; + int err = 0; + + mdclrerror(ep); + if (!qname || strlen(qname) == 0) { + (void) mdsyserror(ep, EINVAL, + dgettext(TEXT_DOMAIN, + "null or zero-length queue name")); + return (EINVAL); + } + + init_evctl(qname, + TAG_EMPTY, + EQ_EMPTY, + (flags & EVFLG_PERMANENT) != 0? EQ_Q_PERM: 0, + /* set */ 0, + /* dev */ 0, + EQ_ON, + /* user-defined event data */ 0, + &evctl); + + err = meta_event(&evctl, ep); + + if (err == -EEXIST && !(flags & EVFLG_EXISTERR)) { + err = 0; + mdclrerror(ep); + } + if (!mdisok(ep) && mdanysyserror(ep)) { + err = (ep)->info.md_error_info_t_u.ds_error.errnum; + } + return (-err); +} + +/* + * meta_notify_deleteq + * - deletes an eventq + * - free's any underlying resources + * - returns 0 on success or errno and sets ep + */ +int +meta_notify_deleteq(char *qname, md_error_t *ep) +{ + md_event_ioctl_t evctl; + int err; + + init_evctl(qname, + TAG_EMPTY, + EQ_EMPTY, + /* flags */ 0, + /* set */ 0, + /* dev */ 0, + EQ_OFF, + /* user-defined event data */ 0, + &evctl); + + err = meta_event(&evctl, ep); + return (-err); +} + +/* + * meta_notify_validq + * - verifies that the queue exists + * - returns true or false, ep may be changed as a side-effect + */ +bool_t +meta_notify_validq(char *qname, md_error_t *ep) +{ + md_event_ioctl_t evctl; + + init_evctl(qname, + TAG_EMPTY, + EQ_EMPTY, + /* flags */ 0, + /* set */ 0, + /* dev */ 0, + EQ_ON, + /* user-defined event data */ 0, + &evctl); + + return (meta_event(&evctl, ep) == -EEXIST); +} + +/* + * meta_notify_listq + * - returns number of (currently) active queus 
or -errno + * - allocates qnames array and sets user's pointer to it, + * fills in array with vector of qnames + */ +int +meta_notify_listq(char ***qnames, md_error_t *ep) +{ + +#ifdef lint + qnames = qnames; +#endif /* lint */ + + mdclrerror(ep); + (void) mdsyserror(ep, EOPNOTSUPP, "EOPNOTSUPP"); + return (-EOPNOTSUPP); +} + +/* + * meta_notify_flushq + * - calls the underlying notify driver to flush all events + * from the named queue + * - returns 0 on success or errno and sets ep as necessary + */ +int +meta_notify_flushq(char *qname, md_error_t *ep) +{ + +#ifdef lint + qname = qname; +#endif /* lint */ + + mdclrerror(ep); + (void) mdsyserror(ep, EOPNOTSUPP, "EOPNOTSUPP"); + return (EOPNOTSUPP); +} + +static void +cook_ev(md_event_ioctl_t *evctlp, md_ev_t *evp, md_error_t *ep) +{ + assert(evctlp); + assert(evp); + + evp->obj_type = tagdrv_2_objlib(evctlp->mdn_tag); + + if (evp->obj_type == EVO_METADEV) { + evp->obj_type = dev2tag(evctlp->mdn_dev, evctlp->mdn_set, ep); + } + + evp->setno = evctlp->mdn_set; + evp->ev = evdrv_2_evlib(evctlp->mdn_event); + evp->obj = evctlp->mdn_dev; + evp->uev = evctlp->mdn_user; +} + +/* + * meta_notify_getev + * - collects up to 1 event and stores it into md_ev_t + * - returns number of events found (0 or 1) on success or -errno + * - flags governs whether an empty queue is waited upon (EVFLG_WAIT) + */ +int +meta_notify_getev(char *qname, ulong_t flags, md_ev_t *evp, md_error_t *ep) +{ + md_event_ioctl_t evctl; + int n_ev; + int err = -EINVAL; + + if (!evp) { + goto out; + } + + init_evctl(qname, + TAG_EMPTY, + EQ_EMPTY, + /* flags (unused in get) */ 0, + (evp->setno == EV_ALLSETS)? MD_ALLSETS: evp->setno, + (evp->obj == EV_ALLOBJS)? MD_ALLDEVS: evp->obj, + (flags & EVFLG_WAIT) != 0? 
EQ_GET_WAIT: EQ_GET_NOWAIT, + /* user-defined event data */ 0, + &evctl); + + err = meta_event(&evctl, ep); + + /* + * trap EAGAIN so that EV_EMPTY events get returned, but + * be sure n_ev = 0 so that users who just watch the count + * will also work + */ + switch (err) { + case -EAGAIN: + err = n_ev = 0; + cook_ev(&evctl, evp, ep); + break; + case 0: + n_ev = 1; + cook_ev(&evctl, evp, ep); + break; + } +out: + return (err == 0? n_ev: err); +} + + +/* + * meta_notify_getevlist + * - collects all pending events in the named queue and allocates + * an md_evlist_t * to return them + * - returns the number of events found (may be 0 if !WAIT) on success + * or -errno and sets ep as necessary + */ +int +meta_notify_getevlist(char *qname, + ulong_t flags, + md_evlist_t **evpp_arg, + md_error_t *ep) +{ + md_ev_t *evp = NULL; + md_evlist_t *evlp = NULL; + md_evlist_t *evlp_head = NULL; + md_evlist_t *new = NULL; + int n_ev = 0; + int err = -EINVAL; + + mdclrerror(ep); + if (!evpp_arg) { + (void) mdsyserror(ep, EINVAL, dgettext(TEXT_DOMAIN, + "No event list pointer")); + goto out; + } + + if (!qname || strlen(qname) == 0) { + (void) mdsyserror(ep, EINVAL, dgettext(TEXT_DOMAIN, + "Null or zero-length queue name")); + goto out; + } + + do { + if (!(evp = (md_ev_t *)Malloc(sizeof (md_ev_t)))) { + (void) mdsyserror(ep, ENOMEM, qname); + continue; + } + evp->obj_type = EVO_EMPTY; + evp->setno = EV_ALLSETS; + evp->ev = EV_EMPTY; + evp->obj = EV_ALLOBJS; + evp->uev = 0ULL; + + err = meta_notify_getev(qname, flags, evp, ep); + + if (evp->ev != EV_EMPTY) { + new = (md_evlist_t *)Zalloc(sizeof (md_evlist_t)); + if (evlp_head == NULL) { + evlp = evlp_head = new; + } else { + evlp->next = new; + evlp = new; + } + evlp->evp = evp; + n_ev++; + } + + } while (err >= 0 && evp && evp->ev != EV_EMPTY); +out: + if (err == -EAGAIN) { + err = 0; + } + + if (err < 0) { + meta_notify_freeevlist(evlp_head); + evlp_head = NULL; + return (err); + } else if ((err == 0) && (evp->ev == EV_EMPTY)) { + 
Free(evp); + evp = NULL; + } + + if (evpp_arg) { + *evpp_arg = evlp_head; + } + + return (n_ev); +} + + +/* + * the guts of meta_notify_putev() and meta_notify_sendev() + * are within this function. + * + * meta_notify_putev() is intended for general use by user-level code, + * such as the GUI, to send user-defined events. + * + * meta_notify_sendev() is for "user-level driver" code, such as + * set manipulation and the multi-host daemon to generate events. + * + * Note- only convention enforces this usage. + */ +int +meta_notify_doputev(md_ev_t *evp, md_error_t *ep) +{ + md_event_ioctl_t evctl; + + if (!evp || !ep) { + return (EINVAL); + } + + /* + * users may only put events of type EQ_USER + */ + init_evctl(/* qname (unused in put) */ NULL, + TAG_EMPTY, + EQ_EMPTY, + /* flags (unused in put) */ 0, + (evp->setno == EV_ALLSETS)? MD_ALLSETS: evp->setno, + (evp->obj == EV_ALLOBJS)? MD_ALLDEVS: evp->obj, + EQ_PUT, + evp->uev, + &evctl); + + evctl.mdn_tag = objlib_2_tagdrv(evp->obj_type); + evctl.mdn_event = evlib_2_evdrv(evp->ev); + + return (-meta_event(&evctl, ep)); +} + +/* + * meta_notify_putev + * - sends an event down to the notify driver (hence, all queues) + * - returns 0 on success or errno + */ +int +meta_notify_putev(md_ev_t *evp, md_error_t *ep) +{ + if (!evp || !ep) { + return (EINVAL); + } + + evp->ev = EV_USER; /* by definition */ + + return (meta_notify_doputev(evp, ep)); +} + +/* + * alternate put event entry point which allows + * more control of event innards (for use by md "user-level drivers") + * + * Since this routine isn't for use by clients, the user event data + * is always forced to be 0. That is only meaningful for events + * of type EQ_USER (and those go through meta_notify_putev()), so + * this is consistent. 
+ */ +int +meta_notify_sendev( + ev_obj_t tag, + set_t set, + md_dev64_t dev, + evid_t ev) +{ + md_error_t status = mdnullerror; + md_error_t *ep = &status; + md_ev_t ev_packet; + int rc; + + ev_packet.obj_type = tag; + ev_packet.setno = set; + ev_packet.obj = dev; + ev_packet.ev = ev; + ev_packet.uev = 0ULL; + + rc = meta_notify_doputev(&ev_packet, ep); + + if (0 == rc && !mdisok(ep)) { + rc = EINVAL; + mdclrerror(ep); + } + return (rc); +} + +/* + * meta_notify_putevlist + * - sends all of the events in the event list + * - returns number of events sent (>= 0) on success or -errno + */ +int +meta_notify_putevlist(md_evlist_t *evlp, md_error_t *ep) +{ + md_evlist_t *evlpi; + int n_ev = 0; + int err; + + if (!evlp) { + err = 0; + goto out; /* that was easy */ + } + + for (n_ev = 0, evlpi = evlp; evlpi; evlpi = evlpi->next) { + if ((err = meta_notify_putev(evlpi->evp, ep)) < 0) { + goto out; + } + n_ev++; + } +out: + return (err != 0? err: n_ev); +} + +/* + * meta_notify_freevlist + * - frees any memory allocated within the event list + * - returns 0 on success or errno and sets ep as necessary + */ +void +meta_notify_freeevlist(md_evlist_t *evlp) +{ + md_evlist_t *i; + md_evlist_t *next; + + for (i = evlp; i; i = i->next) { + if (i && i->evp) { + Free(i->evp); + i->evp = NULL; + } + } + for (i = evlp; i; /* NULL */) { + next = i->next; + Free(i); + i = next; + } +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_patch.c b/usr/src/lib/lvm/libmeta/common/meta_patch.c new file mode 100644 index 0000000000..7c0ff549f1 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_patch.c @@ -0,0 +1,299 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. + */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +/* + * patch /etc/vfstab file + */ +#include <meta.h> +#include <string.h> + +/* + * patch filesystem lines into vfstab file, return tempfilename + */ +int +meta_patch_vfstab( + char *cmpname, /* filesystem mount point or */ + /* "swap" if updating swap partition */ + mdname_t *fsnp, /* filesystem device name */ + char *vname, /* vfstab file name */ + char *old_bdevname, /* old name of block device, needed */ + /* for deciding which of multiple */ + /* swap file entries to change */ + /* if NULL then not changing swap */ + int doit, /* really patch file */ + int verbose, /* show what we're doing */ + char **tname, /* returned temp file name */ + md_error_t *ep /* returned error */ +) +{ + char *chrname = fsnp->rname; + char *blkname = fsnp->bname; + FILE *fp = NULL; + FILE *tfp = NULL; + struct stat sbuf; + char buf[512]; + char cdev[512]; + char bdev[512]; + char mntpt[512]; + char fstype[512]; + char fsckpass[512]; + char mntboot[512]; + char mntopt[512]; + int gotfs = 0; + char *cmpstr = &mntpt[0]; /* compare against mntpnt if fs, */ + /* or fstype if swap */ + char *char_device = chrname; + + /* check names */ + assert(vname != NULL); + assert(tname != 
NULL); + + /* get temp names */ + *tname = NULL; + *tname = Malloc(strlen(vname) + strlen(".tmp") + 1); + (void) strcpy(*tname, vname); + (void) strcat(*tname, ".tmp"); + + /* check if going to update swap entry in file */ + /* if so then compare against file system type */ + if ((old_bdevname != NULL) && (strcmp("swap", cmpname) == 0)) { + cmpstr = &fstype[0]; + char_device = &cdev[0]; + } + + /* copy vfstab file, replace filesystem line */ + if ((fp = fopen(vname, "r")) == NULL) { + (void) mdsyserror(ep, errno, vname); + goto out; + } + if (fstat(fileno(fp), &sbuf) != 0) { + (void) mdsyserror(ep, errno, vname); + goto out; + } + if (doit) { + if ((tfp = fopen(*tname, "w")) == NULL) { + (void) mdsyserror(ep, errno, *tname); + goto out; + } + if (fchmod(fileno(tfp), (sbuf.st_mode & 0777)) != 0) { + (void) mdsyserror(ep, errno, *tname); + goto out; + } + if (fchown(fileno(tfp), sbuf.st_uid, sbuf.st_gid) != 0) { + (void) mdsyserror(ep, errno, *tname); + goto out; + } + } + while (fgets(buf, sizeof (buf), fp) != NULL) { + + /* check that have all required params from vfstab file */ + /* or that the line isnt a comment */ + /* or that the fstype/mntpoint match what was passed in */ + /* or that the block device matches if changing swap */ + /* the last check is needed since there may be multiple */ + /* entries of swap in the file, and so the fstype is not */ + /* a sufficient check */ + if ((sscanf(buf, "%512s %512s %512s %512s %512s %512s %512s", + bdev, cdev, mntpt, fstype, fsckpass, + mntboot, mntopt) != 7) || + (bdev[0] == '#') || (strcmp(cmpstr, cmpname) != 0) || + ((old_bdevname != NULL) && + (strstr(bdev, old_bdevname) == NULL))) { + if (doit) { + if (fputs(buf, tfp) == EOF) { + (void) mdsyserror(ep, errno, *tname); + goto out; + } + } + continue; + } + + if (verbose) { + (void) printf(dgettext(TEXT_DOMAIN, + "Delete the following line from %s:\n\n"), + vname); + (void) printf("%s\n", buf); + (void) printf( + dgettext(TEXT_DOMAIN, + "Add the following line to 
%s:\n\n"), + vname); + (void) printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\n\n", + blkname, char_device, mntpt, fstype, fsckpass, + mntboot, mntopt); + } + if (doit) { + if (fprintf(tfp, "%s\t%s\t%s\t%s\t%s\t%s\t%s\n", + blkname, char_device, mntpt, fstype, fsckpass, + mntboot, mntopt) == EOF) { + (void) mdsyserror(ep, errno, *tname); + goto out; + } + } + + + gotfs = 1; + } + if (! feof(fp)) { + (void) mdsyserror(ep, errno, vname); + goto out; + } + if (! gotfs) { + (void) mderror(ep, MDE_VFSTAB_FILE, vname); + goto out; + } + if (fclose(fp) != 0) { + (void) mdsyserror(ep, errno, vname); + goto out; + } + fp = NULL; + if (doit) { + if ((fflush(tfp) != 0) || + (fsync(fileno(tfp)) != 0) || + (fclose(tfp) != 0)) { + (void) mdsyserror(ep, errno, *tname); + goto out; + } + tfp = NULL; + } + + /* return success */ + return (0); + + /* cleanup, return error */ +out: + if (fp != NULL) + (void) fclose(fp); + if (tfp != NULL) + (void) fclose(tfp); + if (*tname != NULL) { + (void) unlink(*tname); + Free(*tname); + } + return (-1); +} + + +/* + * set filesystem device name in vfstab + */ +int +meta_patch_fsdev( + char *fsname, /* filesystem mount point */ + mdname_t *fsnp, /* filesystem device */ + char *vname, /* vfstab file name */ + md_error_t *ep /* returned error */ +) +{ + int doit = 1; + int verbose = 0; + char *tvname = NULL; + int rval = -1; + + /* check names */ + assert(fsname != NULL); + if (vname == NULL) + vname = "/etc/vfstab"; + + /* replace lines in vfstab */ + if (meta_patch_vfstab(fsname, fsnp, vname, NULL, doit, verbose, &tvname, + ep) != 0) { + goto out; + } + + /* rename temp file on top of real one */ + if (rename(tvname, vname) != 0) { + (void) mdsyserror(ep, errno, vname); + goto out; + } + Free(tvname); + tvname = NULL; + rval = 0; + + /* cleanup, return error */ +out: + if (tvname != NULL) { + if (doit) + (void) unlink(tvname); + Free(tvname); + } + return (rval); +} + + +/* + * set filesystem device name in vfstab + */ +int +meta_patch_swapdev( + mdname_t 
*fsnp, /* filesystem device */ + char *vname, /* vfstab file name */ + char *old_bdevname, /* block device name to change */ + md_error_t *ep /* returned error */ +) +{ + int doit = 1; + int verbose = 0; + char *tvname = NULL; + int rval = -1; + + /* check names */ + if (vname == NULL) + vname = "/etc/vfstab"; + + /* replace lines in vfstab */ + if (meta_patch_vfstab("swap", fsnp, vname, old_bdevname, doit, + verbose, &tvname, ep) != 0) { + goto out; + } + + /* rename temp file on top of real one */ + if (rename(tvname, vname) != 0) { + (void) mdsyserror(ep, errno, vname); + goto out; + } + Free(tvname); + tvname = NULL; + rval = 0; + + /* cleanup, return error */ +out: + if (tvname != NULL) { + if (doit) + (void) unlink(tvname); + Free(tvname); + } + return (rval); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_patch_root.c b/usr/src/lib/lvm/libmeta/common/meta_patch_root.c new file mode 100644 index 0000000000..ac3f4b04d9 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_patch_root.c @@ -0,0 +1,171 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1992-1994, 2000-2002 Sun Microsystems, Inc. All rights reserved. 
+ * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * patch /etc/system file for the root device + */ + +#include <dlfcn.h> +#include <meta.h> + + +/* + * set root device name in md.conf and vfstab, patch in mddb locations + */ +int +meta_patch_rootdev( + mdname_t *rootnp, /* root device */ + char *sname, /* system file name */ + char *vname, /* vfstab file name */ + char *cname, /* mddb.cf file name */ + char *dbname, /* md.conf file name */ + int doit, /* really patch files */ + int verbose, /* show what we're doing */ + md_error_t *ep /* returned error */ +) +{ + mdsetname_t *sp; + int ismeta = metaismeta(rootnp); + char *tsname = NULL; + FILE *tsfp = NULL; + char *dbtname = NULL; + FILE *dbtfp = NULL; + char *tvname = NULL; + int rval = -1; + + /* check names */ + if (sname == NULL) + sname = "/etc/system"; + if (vname == NULL) + vname = "/etc/vfstab"; + if (cname == NULL) + cname = META_DBCONF; + if (dbname == NULL) + dbname = "/kernel/drv/md.conf"; + + /* make sure we have a local name */ + if ((sp = metagetset(rootnp, TRUE, ep)) == NULL) + return (-1); + + if (! 
metaislocalset(sp)) { + return (mddeverror(ep, MDE_NOT_LOCAL, rootnp->dev, + rootnp->cname)); + } + + /* replace forceload and rootdev lines in system */ + if (meta_systemfile_copy(sname, 1, 0, doit, verbose, &tsname, &tsfp, + ep) != 0) { + goto out; + } + if (meta_systemfile_append_mdroot(rootnp, sname, + tsname, tsfp, ismeta, doit, verbose, ep) != 0) { + goto out; + } + + /* replace bootlist lines in /kernel/drv/md.conf */ + if (meta_systemfile_copy(dbname, 0, 1, doit, verbose, &dbtname, + &dbtfp, ep) != 0) { + goto out; + } + if (meta_systemfile_append_mddb(cname, dbname, dbtname, dbtfp, doit, + verbose, ep) != 0) { + goto out; + } + + /* force the file contents out to disk */ + if (doit) { + if ((fflush(tsfp) != 0) || + (fsync(fileno(tsfp)) != 0) || + (fclose(tsfp) != 0)) { + (void) mdsyserror(ep, errno, tsname); + goto out; + } + tsfp = NULL; + if ((fflush(dbtfp) != 0) || + (fsync(fileno(dbtfp)) != 0) || + (fclose(dbtfp) != 0)) { + (void) mdsyserror(ep, errno, dbtname); + goto out; + } + dbtfp = NULL; + } + + /* replace lines in vfstab */ + if (meta_patch_vfstab("/", rootnp, vname, NULL, doit, verbose, &tvname, + ep) != 0) { + goto out; + } + + /* rename files, better hope both work */ + if (doit) { + if (rename(tsname, sname) != 0) { + (void) mdsyserror(ep, errno, sname); + goto out; + } + Free(tsname); + tsname = NULL; + if (rename(dbtname, dbname) != 0) { + (void) mdsyserror(ep, errno, dbname); + goto out; + } + Free(dbtname); + dbtname = NULL; + if (rename(tvname, vname) != 0) { + (void) mdsyserror(ep, errno, vname); + goto out; + } + Free(tvname); + tvname = NULL; + } + rval = 0; + + /* cleanup, return error */ +out: + if (tsfp != NULL) + (void) fclose(tsfp); + if (tsname != NULL) { + if (doit) + (void) unlink(tsname); + Free(tsname); + } + if (tvname != NULL) { + if (doit) + (void) unlink(tvname); + Free(tvname); + } + + /* free the temporary files for md.conf */ + if (dbtfp != NULL) + (void) fclose(dbtfp); + if (dbtname != NULL) { + if (doit) + (void) 
unlink(dbtname); + Free(dbtname); + } + return (rval); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_print.c b/usr/src/lib/lvm/libmeta/common/meta_print.c new file mode 100644 index 0000000000..a539628685 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_print.c @@ -0,0 +1,439 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. 
+ */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +/* + * report metadevice status + */ + +#include <meta.h> + +/* + * print named metadevice + */ +int +meta_print_name( + mdsetname_t *sp, + mdname_t *namep, + mdnamelist_t **nlpp, + char *fname, + FILE *fp, + mdprtopts_t options, + mdnamelist_t **lognlpp, + md_error_t *ep +) +{ + char *miscname; + + /* must have set */ + assert(sp != NULL); + + /* get type */ + if ((miscname = metagetmiscname(namep, ep)) == NULL) + return (-1); + + /* dispatch */ + if (strcmp(miscname, MD_TRANS) == 0) { + return (meta_trans_print(sp, namep, nlpp, fname, fp, + options, NULL, lognlpp, ep)); + } + if (strcmp(miscname, MD_MIRROR) == 0) { + return (meta_mirror_print(sp, namep, nlpp, fname, fp, + options, ep)); + } + if (strcmp(miscname, MD_RAID) == 0) { + return (meta_raid_print(sp, namep, nlpp, fname, fp, + options, ep)); + } + if (strcmp(miscname, MD_STRIPE) == 0) { + return (meta_stripe_print(sp, namep, nlpp, fname, fp, + options, ep)); + } + if (strcmp(miscname, MD_SP) == 0) { + return (meta_sp_print(sp, namep, nlpp, fname, fp, + options, ep)); + } + + /* unknown type */ + return (mdmderror(ep, MDE_UNKNOWN_TYPE, meta_getminor(namep->dev), + namep->cname)); +} + +/* + * print all metadevices + */ +int +meta_print_all( + mdsetname_t *sp, + char *fname, + mdnamelist_t **nlpp, + FILE *fp, + mdprtopts_t options, + int *meta_print_trans_msgp, + md_error_t *ep +) +{ + md_error_t status = mdnullerror; + int rval = 0; + mdnamelist_t *lognlp = NULL; + + + /* print various types (save first error) */ + if (meta_trans_print(sp, NULL, nlpp, fname, fp, options, + meta_print_trans_msgp, &lognlp, ep) != 0) { + rval = -1; + ep = &status; + } + if (meta_logs_print(sp, lognlp, nlpp, fname, fp, options, ep) != 0) { + rval = -1; + ep = &status; + } + metafreenamelist(lognlp); + if (meta_mirror_print(sp, NULL, nlpp, fname, fp, options, ep) != 0) { + rval = -1; + ep = &status; + } + if (meta_raid_print(sp, NULL, nlpp, fname, fp, 
options, ep) != 0) { + rval = -1; + ep = &status; + } + if (meta_stripe_print(sp, NULL, nlpp, fname, fp, options, ep) != 0) { + rval = -1; + ep = &status; + } + if (meta_sp_print(sp, NULL, nlpp, fname, fp, options, ep) != 0) { + rval = -1; + ep = &status; + } + if (meta_hsp_print(sp, NULL, nlpp, fname, fp, options, ep) != 0) { + rval = -1; + ep = &status; + } + + /* discard further errors */ + mdclrerror(&status); + + /* return success */ + return (rval); +} + +/* + * format timestamp + */ +char * +meta_print_time( + md_timeval32_t *tvp +) +{ + static char buf[128]; + struct tm *tmp; + char *dcmsg; + + if (tvp == NULL) + return (""); + + /* + * TRANSLATION_NOTE_LC_TIME + * This message is the format of file + * timestamps written with the -C and + * -c options. + * %a -- locale's abbreviated weekday name + * %b -- locale's abbreviated month name + * %e -- day of month [1,31] + * %T -- Time as %H:%M:%S + * %Y -- Year, including the century + */ + dcmsg = dcgettext(TEXT_DOMAIN, "%a %b %e %T %Y", LC_TIME); + + if (((tvp->tv_sec == 0) && (tvp->tv_usec == 0)) || + ((tmp = localtime((const time_t *)&tvp->tv_sec)) == NULL) || + (strftime(buf, sizeof (buf), dcmsg, tmp) == 0)) { + return (dgettext(TEXT_DOMAIN, "(invalid time)")); + } + return (buf); +} + +/* + * format high resolution time into a tuple of seconds:milliseconds:microseconds + */ +char * +meta_print_hrtime( + hrtime_t secs +) +{ + long long sec, msec, usec; + static char buf[128]; + + usec = secs / 1000; + msec = usec / 1000; + sec = msec / 1000; + msec %= 1000; + usec %= 1000; + + (void) snprintf(buf, sizeof (buf), "%4lld:%03lld:%03lld", sec, msec, + usec); + return (buf); +} + +/* + * Routine to print 32 bit bitmasks + * + * Takes: + * fp - a file descriptor + * fmt - optional text + * ul - unsigned long bit vector + * bitfmt - special string to map bits to words. + * bitfmt is layed out as follows: + * byte 0 is the output base. 
+ * byte 1 a bit position less than 32 + * byte 2-n text for position in byte 1 + * byte n+1 another bit position + * byte n+2-m text for position in byte n+1 + * . + * . + * . + * + * Eg. - "\020\001DOG\002CAT\003PIG" + * Print the bitmask in hex. + * If bit 1 (0x0001) is set print "<DOG>" + * If bit 2 (0x0002) is set print "<CAT>" + * If bit 3 (0x0004) is set print "<PIG>" + * If bit 4 (0x0008) is set nothing is printed. + * If bit 1 and bit 2 (0x0003) are set print <DOG,CAT> + * + * Returns 0 on OK + * EOF on error + * + * Outputs on fp + * + */ + +int +meta_prbits(FILE *fp, const char *fmt, ...) +{ + va_list ap; + unsigned long ul; + int set; + int n; + char *p; + + va_start(ap, fmt); + + if (fmt && *fmt) + if (fprintf(fp, fmt) == EOF) + return (EOF); + + ul = va_arg(ap, int); + p = va_arg(ap, char *); + + switch (*p++) { + case 8: + if (fprintf(fp, "0%lo", ul) == EOF) + return (EOF); + break; + + case 16: + if (fprintf(fp, "0x%lx", ul) == EOF) + return (EOF); + break; + + default: + case 10: + if (fprintf(fp, "%ld", ul) == EOF) + return (EOF); + break; + } + + if (! ul) + return (0); + + for (set = 0; (n = *p++) != '\0'; /* void */) { + if (ul & (1 << (n - 1))) { + if (fputc(set ? ',' : '<', fp) == EOF) + return (EOF); + for (/* void */; (n = *p) > ' '; ++p) + if (fputc(n, fp) == EOF) + return (EOF); + set = 1; + } else + for (/* void */; *p > ' '; ++p); + } + if (set) + if (fputc('>', fp) == EOF) + return (EOF); + + return (0); +} + + +/* + * Convert a number of blocks to a string representation + * Input: 64 bit wide number of blocks + * Outout: string like "199MB" or "27TB" or "3.5GB" + * Returns a pointer to the buffer. 
+ */ +char * +meta_number_to_string(diskaddr_t number, u_longlong_t blk_sz) +{ + diskaddr_t save = 0; + char *M = " KMGTPE"; /* kilo, mega, giga, tera, peta, exa */ + char *uom = M; /* unit of measurement, initially ' ' (=M[0]) */ + static char buf[64]; + u_longlong_t total_bytes; + + /* convert from blocks to bytes */ + total_bytes = number * blk_sz; + + /* + * Stop scaling when we reached exa bytes, then something is + * probably wrong with our number. + */ + while ((total_bytes >= 1024) && (*uom != 'E')) { + uom++; /* next unit of measurement */ + save = total_bytes; + total_bytes = total_bytes / 1024; + } + + /* check if we should output a decimal place after the point */ + if (save && ((save / 1024) < 10)) { + /* sprintf() will round for us */ + float fnum = (float)save / 1024; + (void) sprintf(buf, "%1.1f %cB", fnum, *uom); + } else { + (void) sprintf(buf, "%llu %cB", total_bytes, *uom); + } + return (buf); +} + +/* + * meta_get_tstate: get the transient state bits from the kernel. + * this is for use with printing out the state field in metastat. + * INPUT: dev64 -- devt of the metadevice + * tstatep -- return for tstate + * ep -- error + * RETURN: -1 for error + * 0 for success + */ +int +meta_get_tstate(md_dev64_t dev64, uint_t *tstatep, md_error_t *ep) +{ + md_i_get_tstate_t params; + minor_t mnum = meta_getminor(dev64); + + (void) memset(¶ms, 0, sizeof (params)); + params.id = mnum; + if (metaioctl(MD_IOCGET_TSTATE, ¶ms, ¶ms.mde, NULL) != 0) { + return (mdstealerror(ep, ¶ms.mde)); + } + *tstatep = params.tstate; + return (0); +} + +/* + * meta_print_devid: print out the devid information, given a mddevid_t list. + * INPUT: mdsetname_t set we're looking at + * FILE where to print to + * mddevid_t list to print from. 
+ * md_error_t error + * RETURN: -1 for error + * 0 for success + */ +int +meta_print_devid( + mdsetname_t *sp, + FILE *fp, + mddevid_t *mddevidp, + md_error_t *ep +) +{ + int len = 0; + mddevid_t *tmp_mddevidp = NULL; + ddi_devid_t did = NULL; + char *devid = ""; + int freedevid = 0; + char *reloc = ""; + + + /* print header */ + if (fprintf(fp, gettext("Device Relocation Information:\n")) < 0) + return (-1); + + /* + * Building a format string on the fly that will + * be used in (f)printf. This allows the length + * of the ctd to vary from small to large without + * looking horrible. + */ + + tmp_mddevidp = mddevidp; + while (tmp_mddevidp != NULL) { + len = max(len, strlen(tmp_mddevidp->ctdname)); + tmp_mddevidp = tmp_mddevidp->next; + } + + if (fprintf(fp, "%-*s %-5s\t%s\n", len + 2, + gettext("Device "), + gettext("Reloc"), + gettext("Device ID")) < 0) + return (-1); + + /* print ctd's and devids */ + while (mddevidp != NULL) { + did = (ddi_devid_t) + meta_getdidbykey(sp->setno, getmyside(sp, ep), + mddevidp->key, ep); + + if (did == (ddi_devid_t)NULL) { + devid = "-"; + reloc = gettext("No "); + freedevid = 0; + } else { + devid = devid_str_encode(did, NULL); + reloc = gettext("Yes"); + freedevid = 1; + Free(did); + } + + if (fprintf(fp, "%-*s %-5s\t%s\n", len + 2, mddevidp->ctdname, + reloc, devid) < 0) + return (-1); + + mddevidp = mddevidp->next; + + if (freedevid == 1) + devid_str_free(devid); + } + return (0); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_raid.c b/usr/src/lib/lvm/libmeta/common/meta_raid.c new file mode 100644 index 0000000000..cce31ad3fa --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_raid.c @@ -0,0 +1,2784 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. + */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +/* + * RAID operations + */ + +#include <stdlib.h> +#include <meta.h> +#include <sys/lvm/md_raid.h> +#include <sys/lvm/mdvar.h> +#include <sys/lvm/md_convert.h> +#include <stddef.h> + +/* + * FUNCTION: meta_get_raid_names() + * INPUT: sp - the set name to get raid from + * options - options from the command line + * OUTPUT: nlpp - list of all raid names + * ep - return error pointer + * RETURNS: int - -1 if error, 0 success + * PURPOSE: returns a list of all raid in the metadb + * for all devices in the specified set + */ +int +meta_get_raid_names( + mdsetname_t *sp, + mdnamelist_t **nlpp, + int options, + md_error_t *ep +) +{ + return (meta_get_names(MD_RAID, sp, nlpp, options, ep)); +} + +/* + * free raid unit + */ +void +meta_free_raid( + md_raid_t *raidp +) +{ + if (raidp->cols.cols_val != NULL) { + assert(raidp->cols.cols_len > 0); + Free(raidp->cols.cols_val); + } + Free(raidp); +} + +/* + * get raid (common) + */ +md_raid_t * +meta_get_raid_common( + mdsetname_t *sp, + mdname_t *raidnp, + int fast, + md_error_t *ep +) +{ + 
mddrivename_t *dnp = raidnp->drivenamep; + char *miscname; + mr_unit_t *mr; + md_raid_t *raidp; + uint_t ncol; + uint_t col; + md_resync_ioctl_t ri; + + /* must have set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(raidnp->dev))); + + /* short circuit */ + if (dnp->unitp != NULL) { + assert(dnp->unitp->type == MD_METARAID); + return ((md_raid_t *)dnp->unitp); + } + + /* get miscname and unit */ + if ((miscname = metagetmiscname(raidnp, ep)) == NULL) + return (NULL); + if (strcmp(miscname, MD_RAID) != 0) { + (void) mdmderror(ep, MDE_NOT_RAID, meta_getminor(raidnp->dev), + raidnp->cname); + return (NULL); + } + if ((mr = (mr_unit_t *)meta_get_mdunit(sp, raidnp, ep)) == NULL) + return (NULL); + assert(mr->c.un_type == MD_METARAID); + + /* allocate raid */ + raidp = Zalloc(sizeof (*raidp)); + + /* allocate columns */ + ncol = mr->un_totalcolumncnt; + assert(ncol >= MD_RAID_MIN); + raidp->cols.cols_len = ncol; + raidp->cols.cols_val = Zalloc(raidp->cols.cols_len * + sizeof (*raidp->cols.cols_val)); + + /* get common info */ + raidp->common.namep = raidnp; + raidp->common.type = mr->c.un_type; + raidp->common.state = mr->c.un_status; + raidp->common.capabilities = mr->c.un_capabilities; + raidp->common.parent = mr->c.un_parent; + raidp->common.size = mr->c.un_total_blocks; + raidp->common.user_flags = mr->c.un_user_flags; + raidp->common.revision = mr->c.un_revision; + + /* get options */ + raidp->state = mr->un_state; + raidp->timestamp = mr->un_timestamp; + raidp->interlace = mr->un_segsize; + raidp->orig_ncol = mr->un_origcolumncnt; + raidp->column_size = mr->un_segsize * mr->un_segsincolumn; + raidp->pw_count = mr->un_pwcnt; + assert(raidp->orig_ncol <= ncol); + if ((mr->un_hsp_id != MD_HSP_NONE) && + ((raidp->hspnamep = metahsphspname(&sp, mr->un_hsp_id, + ep)) == NULL)) { + goto out; + } + + /* get columns, update unit state */ + for (col = 0; (col < ncol); ++col) { + mr_column_t *rcp = &mr->un_column[col]; + md_raidcol_t *mdrcp = 
&raidp->cols.cols_val[col]; + + /* get column name */ + mdrcp->colnamep = metakeyname(&sp, rcp->un_orig_key, fast, ep); + if (mdrcp->colnamep == NULL) + goto out; + + /* override any start_blk */ +#ifdef DEBUG + if (metagetstart(sp, mdrcp->colnamep, ep) != + MD_DISKADDR_ERROR) { + assert(mdrcp->colnamep->start_blk <= + rcp->un_orig_devstart); + } else { + mdclrerror(ep); + } +#endif /* DEBUG */ + mdrcp->colnamep->start_blk = rcp->un_orig_devstart; + + /* if hotspared */ + if (HOTSPARED(mr, col)) { + /* get hotspare name */ + mdrcp->hsnamep = metakeyname(&sp, rcp->un_hs_key, + fast, ep); + if (mdrcp->hsnamep == NULL) + goto out; + + if (getenv("META_DEBUG_START_BLK") != NULL) { + if (metagetstart(sp, mdrcp->hsnamep, ep) == + MD_DISKADDR_ERROR) + mdclrerror(ep); + + if ((mdrcp->hsnamep->start_blk == 0) && + (rcp->un_hs_pwstart != 0)) + md_eprintf(dgettext(TEXT_DOMAIN, + "%s: suspected bad start block," + " seems labelled [raid]\n"), + mdrcp->hsnamep->cname); + + if ((mdrcp->hsnamep->start_blk > 0) && + (rcp->un_hs_pwstart == 0)) + md_eprintf(dgettext(TEXT_DOMAIN, + "%s: suspected bad start block, " + " seems unlabelled [raid]\n"), + mdrcp->hsnamep->cname); + } + + /* override any start_blk */ + mdrcp->hsnamep->start_blk = rcp->un_hs_devstart; + } + + /* get state, flags, and timestamp */ + mdrcp->state = rcp->un_devstate; + mdrcp->flags = rcp->un_devflags; + mdrcp->timestamp = rcp->un_devtimestamp; + } + + /* get resync info */ + (void) memset(&ri, 0, sizeof (ri)); + ri.ri_mnum = meta_getminor(raidnp->dev); + MD_SETDRIVERNAME(&ri, MD_RAID, sp->setno); + if (metaioctl(MD_IOCGETSYNC, &ri, &ri.mde, raidnp->cname) != 0) { + (void) mdstealerror(ep, &ri.mde); + goto out; + } + raidp->resync_flags = ri.ri_flags; + raidp->percent_dirty = ri.ri_percent_dirty; + raidp->percent_done = ri.ri_percent_done; + + /* cleanup, return success */ + Free(mr); + dnp->unitp = (md_common_t *)raidp; + return (raidp); + + /* cleanup, return error */ +out: + Free(mr); + meta_free_raid(raidp); 
+ return (NULL); +} + +/* + * get raid + */ +md_raid_t * +meta_get_raid( + mdsetname_t *sp, + mdname_t *raidnp, + md_error_t *ep +) +{ + return (meta_get_raid_common(sp, raidnp, 0, ep)); +} + +/* + * check raid for dev + */ +static int +in_raid( + mdsetname_t *sp, + mdname_t *raidnp, + mdname_t *np, + diskaddr_t slblk, + diskaddr_t nblks, + md_error_t *ep +) +{ + md_raid_t *raidp; + uint_t col; + + /* should be in the same set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(raidnp->dev))); + + /* get unit */ + if ((raidp = meta_get_raid(sp, raidnp, ep)) == NULL) + return (-1); + + /* look in columns */ + for (col = 0; (col < raidp->cols.cols_len); ++col) { + md_raidcol_t *cp = &raidp->cols.cols_val[col]; + mdname_t *colnp = cp->colnamep; + diskaddr_t col_sblk; + int err; + + /* check same drive since metagetstart() can fail */ + if ((err = meta_check_samedrive(np, colnp, ep)) < 0) + return (-1); + else if (err == 0) + continue; + + /* check overlap */ + if ((col_sblk = metagetstart(sp, colnp, ep)) == + MD_DISKADDR_ERROR) + return (-1); + if (meta_check_overlap(raidnp->cname, np, slblk, nblks, + colnp, col_sblk, -1, ep) != 0) { + return (-1); + } + } + + /* return success */ + return (0); +} + +/* + * check to see if we're in a raid + */ +int +meta_check_inraid( + mdsetname_t *sp, + mdname_t *np, + diskaddr_t slblk, + diskaddr_t nblks, + md_error_t *ep +) +{ + mdnamelist_t *raidnlp = NULL; + mdnamelist_t *p; + int rval = 0; + + /* should have a set */ + assert(sp != NULL); + + /* for each raid */ + if (meta_get_raid_names(sp, &raidnlp, 0, ep) < 0) + return (-1); + for (p = raidnlp; (p != NULL); p = p->next) { + mdname_t *raidnp = p->namep; + + /* check raid */ + if (in_raid(sp, raidnp, np, slblk, nblks, ep) != 0) { + rval = -1; + break; + } + } + + /* cleanup, return success */ + metafreenamelist(raidnlp); + return (rval); +} + +/* + * check column + */ +int +meta_check_column( + mdsetname_t *sp, + mdname_t *np, + md_error_t *ep +) +{ + 
mdchkopts_t options = (MDCHK_ALLOW_MDDB); + + /* check for soft partitions */ + if (meta_sp_issp(sp, np, ep) != 0) { + /* make sure we have a disk */ + if (metachkcomp(np, ep) != 0) + return (-1); + } + + /* check to ensure that it is not already in use */ + if (meta_check_inuse(sp, np, MDCHK_INUSE, ep) != 0) { + return (-1); + } + + /* make sure it is in the set */ + if (meta_check_inset(sp, np, ep) != 0) + return (-1); + + /* make sure its not in a metadevice */ + if (meta_check_inmeta(sp, np, options, 0, -1, ep) != 0) + return (-1); + + /* return success */ + return (0); +} + +/* + * print raid + */ +static int +raid_print( + md_raid_t *raidp, + char *fname, + FILE *fp, + mdprtopts_t options, + md_error_t *ep +) +{ + uint_t col; + int rval = -1; + + + if (options & PRINT_LARGEDEVICES) { + if (raidp->common.revision != MD_64BIT_META_DEV) { + rval = 0; + goto out; + } + } + + /* print name and -r */ + if (fprintf(fp, "%s -r", raidp->common.namep->cname) == EOF) + goto out; + + /* print columns */ + for (col = 0; (col < raidp->cols.cols_len); ++col) { + md_raidcol_t *mdrcp = &raidp->cols.cols_val[col]; + + /* print column */ + /* + * If the path is our standard /dev/rdsk or /dev/md/rdsk + * then just print out the cxtxdxsx or the dx, metainit + * will assume the default, otherwise we need the full + * pathname to make sure this works as we intend. 
+ */ + if ((strstr(mdrcp->colnamep->rname, "/dev/rdsk") == NULL) && + (strstr(mdrcp->colnamep->rname, "/dev/md/rdsk") == NULL) && + (strstr(mdrcp->colnamep->rname, "/dev/td/") == NULL)) { + /* not standard path, print full pathname */ + if (fprintf(fp, " %s", mdrcp->colnamep->rname) == EOF) + goto out; + } else { + /* standard path so print ctd or d number */ + if (fprintf(fp, " %s", mdrcp->colnamep->cname) == EOF) + goto out; + } + } + + if (fprintf(fp, " -k") == EOF) + goto out; + + /* print options */ + if (fprintf(fp, " -i %lldb", raidp->interlace) == EOF) + goto out; + + if (raidp->pw_count != PWCNT_MIN) + if (fprintf(fp, " -w %d", raidp->pw_count) == EOF) + goto out; + + if (raidp->hspnamep != NULL) { + if (fprintf(fp, " -h %s", raidp->hspnamep->hspname) == EOF) + goto out; + } + if (raidp->orig_ncol != raidp->cols.cols_len) { + assert(raidp->orig_ncol < raidp->cols.cols_len); + if (fprintf(fp, " -o %u", raidp->orig_ncol) == EOF) + goto out; + } + + /* terminate last line */ + if (fprintf(fp, "\n") == EOF) + goto out; + + /* success */ + rval = 0; + + /* cleanup, return error */ +out: + if (rval != 0) + (void) mdsyserror(ep, errno, fname); + return (rval); +} + +static int +find_resyncing_column( + md_raid_t *raidp +) +{ + int col; + + for (col = 0; (col < raidp->cols.cols_len); ++col) { + md_raidcol_t *cp = &raidp->cols.cols_val[col]; + if (cp->state & RCS_RESYNC) + return (col); + } + + /* No resyncing columns */ + return (-1); +} + +/* + * convert raid state to name + */ +char * +raid_state_to_name( + md_raid_t *raidp, + md_timeval32_t *tvp, + uint_t tstate /* Errored tstate flags */ +) +{ + + /* grab time */ + if (tvp != NULL) + *tvp = raidp->timestamp; + + /* + * If the device has a transient error state (due to it being DR'ed or + * failed) and there has been no I/O to it (the actual device is still + * marked as 'Okay') then we cannot know what the state is or what + * action to take on it. Therefore report the device as 'Unavailable'. 
+ * A subsequent I/O to the device will cause the 'Okay' status to + * disappear if the device is actually gone and then we will print out + * the appropriate status. The MD_INACCESSIBLE state is only set + * on the raid when we open it or probe it. One the raid is open + * then we will just have regular error status on the device. + */ + if (tstate & MD_INACCESSIBLE) { + return (dgettext(TEXT_DOMAIN, "Unavailable")); + } + + /* resyncing */ + if (find_resyncing_column(raidp) >= 0) + return (dgettext(TEXT_DOMAIN, "Resyncing")); + + /* everything else */ + switch (raidp->state) { + case RUS_INIT : + return (dgettext(TEXT_DOMAIN, "Initializing")); + case RUS_OKAY : + return (dgettext(TEXT_DOMAIN, "Okay")); + case RUS_ERRED : + /*FALLTHROUGH*/ + case RUS_LAST_ERRED : + return (dgettext(TEXT_DOMAIN, "Needs Maintenance")); + case RUS_DOI : + return (dgettext(TEXT_DOMAIN, "Initialization Failed")); + case RUS_REGEN : + return (dgettext(TEXT_DOMAIN, "Regen")); + default : + return (dgettext(TEXT_DOMAIN, "invalid")); + } /* switch */ +} + +static int +find_erred_column(md_raid_t *raidp, rcs_state_t state) +{ + int col; + + for (col = 0; (col < raidp->cols.cols_len); ++col) { + md_raidcol_t *cp = &raidp->cols.cols_val[col]; + if (cp->state & state) + return (col); + } + + /* No erred columns */ + return (-1); +} + +/* + * convert raid state to repair action + */ +char * +raid_state_to_action(md_raid_t *raidp) +{ + static char emsg[1024]; + mdname_t *raidnp = raidp->common.namep; + int err_col; + + /* first check for full init failure */ + if (raidp->state & RUS_DOI) { + (void) snprintf(emsg, sizeof (emsg), + "metaclear -f %s", raidnp->cname); + return (emsg); + } + + /* replace errored or init errored raid column */ + if ((err_col = find_erred_column(raidp, + (RCS_ERRED | RCS_INIT_ERRED))) >= 0) { + mdname_t *colnp; + + /* get column with error */ + assert(err_col < raidp->cols.cols_len); + colnp = raidp->cols.cols_val[err_col].colnamep; + (void) snprintf(emsg, sizeof 
(emsg), + "metareplace %s%s %s <%s>", + ((raidp->state == RUS_LAST_ERRED) ? "-f " : ""), + raidnp->cname, colnp->cname, + dgettext(TEXT_DOMAIN, "new device")); + return (emsg); + } + + + /* replace last errored raid column */ + if ((err_col = find_erred_column(raidp, RCS_LAST_ERRED)) >= 0) { + mdname_t *colnp; + + assert(err_col < raidp->cols.cols_len); + colnp = raidp->cols.cols_val[err_col].colnamep; + (void) snprintf(emsg, sizeof (emsg), + "metareplace %s %s %s <%s>", + ((raidp->state == RUS_LAST_ERRED) ? "-f " : ""), + raidnp->cname, colnp->cname, + dgettext(TEXT_DOMAIN, "new device")); + return (emsg); + } + + /* OK */ + return (NULL); +} + +/* + * get printable raid column state + */ +char * +raid_col_state_to_name( + md_raidcol_t *colp, + md_timeval32_t *tvp, + uint_t tstate +) +{ + /* grab time */ + if (tvp != NULL) + *tvp = colp->timestamp; + + if (tstate != 0) { + return (dgettext(TEXT_DOMAIN, "Unavailable")); + } + + /* everything else */ + switch (colp->state) { + case RCS_INIT: + return (dgettext(TEXT_DOMAIN, "Initializing")); + + case RCS_OKAY: + return (dgettext(TEXT_DOMAIN, "Okay")); + + case RCS_INIT_ERRED: + /*FALLTHROUGH*/ + case RCS_ERRED: + return (dgettext(TEXT_DOMAIN, "Maintenance")); + + case RCS_LAST_ERRED: + return (dgettext(TEXT_DOMAIN, "Last Erred")); + + case RCS_RESYNC: + return (dgettext(TEXT_DOMAIN, "Resyncing")); + + default: + return (dgettext(TEXT_DOMAIN, "Unknown")); + } +} + +/* + * print raid column + */ +static int +display_raid_device_info( + mdsetname_t *sp, + md_raidcol_t *colp, + char *fname, + FILE *fp, + mdprtopts_t options, + int print_len, + uint_t top_tstate, /* Errored tstate flags */ + md_error_t *ep +) +{ + mdname_t *namep = ((colp->hsnamep != NULL) ? + colp->hsnamep : colp->colnamep); + char *devid = ""; + char *cname = colp->colnamep->cname; + diskaddr_t start_blk; + int has_mddb; + char *has_mddb_str; + char *col_state; + md_timeval32_t tv; + char *hsname = ((colp->hsnamep != NULL) ? 
+ colp->hsnamep->cname : ""); + int rval = -1; + mdname_t *didnp = NULL; + ddi_devid_t dtp; + uint_t tstate = 0; + + /* get info */ + if ((start_blk = metagetstart(sp, namep, ep)) == MD_DISKADDR_ERROR) + return (-1); + if ((has_mddb = metahasmddb(sp, namep, ep)) < 0) + return (-1); + if (has_mddb) + has_mddb_str = dgettext(TEXT_DOMAIN, "Yes"); + else + has_mddb_str = dgettext(TEXT_DOMAIN, "No"); + + if (metaismeta(namep)) { + if (meta_get_tstate(namep->dev, &tstate, ep) != 0) + return (-1); + col_state = raid_col_state_to_name(colp, &tv, + tstate & MD_DEV_ERRORED); + } else { + /* + * if top_tstate is set, that implies that you have + * a ctd type device with an unavailable metadevice + * on top of it. If so, print a - for it's state + */ + if (top_tstate != 0) + col_state = "-"; + else + col_state = raid_col_state_to_name(colp, &tv, tstate); + } + + /* populate the key in the name_p structure */ + if ((didnp = metadevname(&sp, namep->dev, ep)) == NULL) + return (-1); + + /* determine if devid does NOT exist */ + if (options & PRINT_DEVID) { + if ((dtp = meta_getdidbykey(sp->setno, getmyside(sp, ep), + didnp->key, ep)) == NULL) + devid = dgettext(TEXT_DOMAIN, "No "); + else { + devid = dgettext(TEXT_DOMAIN, "Yes"); + free(dtp); + } + } + /* print column */ + /* + * Building a format string on the fly that will + * be used in (f)printf. This allows the length + * of the ctd to vary from small to large without + * looking horrible. + */ + if (! 
(options & PRINT_TIMES)) { + if (fprintf(fp, + "\t%-*.*s %8lld %5.5s %12.12s %5.5s %s\n", + print_len, print_len, cname, start_blk, has_mddb_str, + col_state, devid, hsname) == EOF) { + goto out; + } + } else { + char *timep = meta_print_time(&tv); + + if (fprintf(fp, + "\t%-*s %5lld %-5s %-11s %-5s %-9s %s\n", + print_len, cname, start_blk, has_mddb_str, + col_state, devid, hsname, timep) == EOF) { + goto out; + } + } + + /* success */ + rval = 0; + + /* cleanup, return error */ +out: + if (rval != 0) + (void) mdsyserror(ep, errno, fname); + + return (rval); +} + +/* + * print raid options + */ +int +meta_print_raid_options( + mdhspname_t *hspnamep, + char *fname, + FILE *fp, + md_error_t *ep +) +{ + char *hspname = ((hspnamep != NULL) ? hspnamep->hspname : + dgettext(TEXT_DOMAIN, "none")); + int rval = -1; + + /* print options */ + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " Hot spare pool: %s\n"), hspname) == EOF) { + goto out; + } + + /* success */ + rval = 0; + + /* cleanup, return error */ +out: + if (rval != 0) + (void) mdsyserror(ep, errno, fname); + return (rval); +} + +/* + * report raid + */ +static int +raid_report( + mdsetname_t *sp, + md_raid_t *raidp, + char *fname, + FILE *fp, + mdprtopts_t options, + md_error_t *ep +) +{ + char *p; + uint_t ncol = raidp->cols.cols_len; + uint_t orig_ncol = raidp->orig_ncol; + diskaddr_t column_size = raidp->column_size; + char *raid_state; + md_timeval32_t tv; + char *timep; + uint_t col; + int rval = -1; + int len = 0; + uint_t tstate = 0; + + if (options & PRINT_LARGEDEVICES) { + if (raidp->common.revision != MD_64BIT_META_DEV) { + rval = 0; + goto out; + } + } + + /* print header */ + if (options & PRINT_HEADER) { + if (fprintf(fp, dgettext(TEXT_DOMAIN, "%s: RAID\n"), + raidp->common.namep->cname) == EOF) { + goto out; + } + + } + + /* print state */ + if (metaismeta(raidp->common.namep)) { + if (meta_get_tstate(raidp->common.namep->dev, &tstate, ep) != 0) + return (-1); + } + tstate &= MD_DEV_ERRORED; /* extract 
the errored tstate bits */ + raid_state = raid_state_to_name(raidp, &tv, tstate); + if (options & PRINT_TIMES) { + timep = meta_print_time(&tv); + } else { + timep = ""; + } + + if (fprintf(fp, dgettext(TEXT_DOMAIN, " State: %-12s %s\n"), + raid_state, timep) == EOF) { + goto out; + } + + /* + * Display recovery action if we're marked in the Unavailable state. + */ + if ((tstate == 0) || (tstate & MD_INACCESSIBLE)) { + /* print what to do */ + if (tstate & MD_INACCESSIBLE) { + char sname[MD_MAX_SETNAME + 3]; /* 3 = sizeof("-s ") */ + + if (metaislocalset(sp)) { + sname[0] = '\0'; + } else { + (void) snprintf(sname, MD_MAX_SETNAME + 3, + "-s %s", sp->setname); + } + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " Invoke: metastat -i %s\n"), sname) == EOF) { + goto out; + } + } else if ((p = raid_state_to_action(raidp)) != NULL) { + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " Invoke: %s\n"), p) == EOF) { + goto out; + } + } + + /* resync status */ + if (raidp->resync_flags & MD_RI_INPROGRESS) { + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " Resync in progress: %2d.%1d%% done\n"), + raidp->percent_done/10, + raidp->percent_done % 10) == EOF) { + goto out; + } + } else if (raidp->resync_flags & MD_GROW_INPROGRESS) { + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " Initialization in progress: %2d.%1d%% " + "done\n"), + raidp->percent_done/10, + raidp->percent_done % 10) == EOF) { + goto out; + } + } else if (raidp->state & RUS_REGEN) { + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " Parity regeneration in progress: %2d.%1d%% " + "done\n"), + raidp->percent_done/10, + raidp->percent_done % 10) == EOF) { + goto out; + } + } + } + + /* print hotspare pool */ + if (raidp->hspnamep != NULL) { + if (meta_print_raid_options(raidp->hspnamep, + fname, fp, ep) != 0) { + return (-1); + } + } + + /* print interlace */ + if (fprintf(fp, dgettext(TEXT_DOMAIN, " Interlace: %lld blocks\n"), + raidp->interlace) == EOF) { + goto out; + } + + /* print size */ + if (fprintf(fp, dgettext(TEXT_DOMAIN, " Size: 
%lld blocks (%s)\n"), + raidp->common.size, + meta_number_to_string(raidp->common.size, DEV_BSIZE)) == EOF) { + goto out; + } + + /* MD_DEBUG stuff */ + if (options & PRINT_DEBUG) { + mdname_t *raidnp = raidp->common.namep; + mr_unit_t *mr; + + /* get additional info */ + if ((mr = (mr_unit_t *)meta_get_mdunit(sp, raidnp, ep)) == NULL) + return (-1); + assert(mr->c.un_type == MD_METARAID); + + /* print prewrite count and size */ + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " Prewrite Count: %u slots\n"), + mr->un_pwcnt) == EOF) { + Free(mr); + goto out; + } + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " Prewrite Slot Size: %u blocks\n"), + (mr->un_pwsize / mr->un_pwcnt)) == EOF) { + Free(mr); + goto out; + } + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " Prewrite Total Size: %u blocks\n"), + mr->un_pwsize) == EOF) { + Free(mr); + goto out; + } + Free(mr); + } + + /* print original devices */ + if (fprintf(fp, dgettext(TEXT_DOMAIN, "Original device:\n")) == EOF) + goto out; + if (fprintf(fp, dgettext(TEXT_DOMAIN, " Size: %lld blocks (%s)\n"), + column_size * (orig_ncol - 1), + meta_number_to_string(column_size * (orig_ncol - 1), DEV_BSIZE)) + == EOF) { + goto out; + } + /* + * Building a format string on the fly that will + * be used in (f)printf. This allows the length + * of the ctd to vary from small to large without + * looking horrible. + */ + for (col = 0; (col < orig_ncol); ++col) { + len = max(len, + strlen(raidp->cols.cols_val[col].colnamep->cname)); + } + + len = max(len, strlen(dgettext(TEXT_DOMAIN, "Device"))); + len += 2; + + if (! 
(options & PRINT_TIMES)) { + if (fprintf(fp, + "\t%-*.*s %-12.12s %-5.5s %12.12s %-5.5s %s\n", + len, len, + dgettext(TEXT_DOMAIN, "Device"), + dgettext(TEXT_DOMAIN, "Start Block"), + dgettext(TEXT_DOMAIN, "Dbase"), + dgettext(TEXT_DOMAIN, "State"), + dgettext(TEXT_DOMAIN, "Reloc"), + dgettext(TEXT_DOMAIN, "Hot Spare")) == EOF) { + goto out; + } + } else { + if (fprintf(fp, + "\t%-*s %5s %-5s %-11s %-5s %-9s %s\n", + len, + dgettext(TEXT_DOMAIN, "Device"), + dgettext(TEXT_DOMAIN, "Start"), + dgettext(TEXT_DOMAIN, "Dbase"), + dgettext(TEXT_DOMAIN, "State"), + dgettext(TEXT_DOMAIN, "Reloc"), + dgettext(TEXT_DOMAIN, "Hot Spare"), + dgettext(TEXT_DOMAIN, "Time")) == EOF) { + goto out; + } + } + for (col = 0; (col < orig_ncol); ++col) { + md_raidcol_t *mdrcp = &raidp->cols.cols_val[col]; + + if (display_raid_device_info(sp, mdrcp, fname, fp, options, + len, tstate, ep) != 0) { + return (-1); + } + } + + /* print concatenated devices */ + if (col < ncol) { + if (fprintf(fp, dgettext(TEXT_DOMAIN, + "Concatenated Devices:\n")) == EOF) { + goto out; + } + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " Size: %lld blocks (%s)\n"), + column_size * (ncol - orig_ncol), + meta_number_to_string(column_size * (ncol - orig_ncol), + DEV_BSIZE)) + == EOF) { + goto out; + } + /* + * This allows the length + * of the ctd to vary from small to large without + * looking horrible. + */ + if (! 
(options & PRINT_TIMES)) { + if (fprintf(fp, + "\t%-*.*s %-12.12s %-5.5s %-12.12s %5.5s %s\n", + len, len, + dgettext(TEXT_DOMAIN, "Device"), + dgettext(TEXT_DOMAIN, "Start Block"), + dgettext(TEXT_DOMAIN, "Dbase"), + dgettext(TEXT_DOMAIN, "State"), + dgettext(TEXT_DOMAIN, "Reloc"), + dgettext(TEXT_DOMAIN, "Hot Spare")) == EOF) { + goto out; + } + } else { + if (fprintf(fp, + "\t%-*s %5s %-5s %-11s %-9s %s\t%s\n", + len, + dgettext(TEXT_DOMAIN, "Device"), + dgettext(TEXT_DOMAIN, "Start"), + dgettext(TEXT_DOMAIN, "Dbase"), + dgettext(TEXT_DOMAIN, "State"), + dgettext(TEXT_DOMAIN, "Reloc"), + dgettext(TEXT_DOMAIN, "Hot Spare"), + dgettext(TEXT_DOMAIN, "Time")) == EOF) { + goto out; + } + } + assert(col == orig_ncol); + for (/* void */; (col < ncol); col++) { + md_raidcol_t *mdrcp = &raidp->cols.cols_val[col]; + + if (display_raid_device_info(sp, mdrcp, fname, fp, + options, len, tstate, ep) != 0) { + return (-1); + } + } + } + + /* add extra line */ + if (fprintf(fp, "\n") == EOF) + goto out; + + /* success */ + rval = 0; + + /* cleanup, return error */ +out: + if (rval != 0) + (void) mdsyserror(ep, errno, fname); + return (rval); +} + +/* + * print/report raid + */ +int +meta_raid_print( + mdsetname_t *sp, + mdname_t *raidnp, + mdnamelist_t **nlpp, + char *fname, + FILE *fp, + mdprtopts_t options, + md_error_t *ep +) +{ + md_raid_t *raidp; + int col; + + /* should have same set */ + assert(sp != NULL); + assert((raidnp == NULL) || + (sp->setno == MD_MIN2SET(meta_getminor(raidnp->dev)))); + + /* print all raids */ + if (raidnp == NULL) { + mdnamelist_t *nlp = NULL; + mdnamelist_t *p; + int cnt; + int rval = 0; + + /* get list */ + if ((cnt = meta_get_raid_names(sp, &nlp, options, ep)) < 0) + return (-1); + else if (cnt == 0) + return (0); + + /* recurse */ + for (p = nlp; (p != NULL); p = p->next) { + mdname_t *np = p->namep; + + if (meta_raid_print(sp, np, nlpp, fname, fp, + options, ep) != 0) + rval = -1; + } + + /* cleanup, return success */ + 
metafreenamelist(nlp); + return (rval); + } + + /* get unit structure */ + if ((raidp = meta_get_raid_common(sp, raidnp, + ((options & PRINT_FAST) ? 1 : 0), ep)) == NULL) + return (-1); + + /* check for parented */ + if ((! (options & PRINT_SUBDEVS)) && + (MD_HAS_PARENT(raidp->common.parent))) { + return (0); + } + + /* print appropriate detail */ + if (options & PRINT_SHORT) { + if (raid_print(raidp, fname, fp, options, ep) != 0) + return (-1); + } else { + if (raid_report(sp, raidp, fname, fp, options, ep) != 0) + return (-1); + } + + /* Recurse on components that are metadevices */ + for (col = 0; col < raidp->cols.cols_len; ++col) { + md_raidcol_t *colp = &raidp->cols.cols_val[col]; + mdname_t *namep = colp->colnamep; + + if ((metaismeta(namep)) && + (meta_print_name(sp, namep, nlpp, fname, fp, + (options | PRINT_HEADER | PRINT_SUBDEVS), + NULL, ep) != 0)) { + return (-1); + } + } + + return (0); +} + +/* + * adjust raid geometry + */ +static int +adjust_geom( + mdname_t *raidnp, + mdname_t *colnp, + mr_unit_t *mr, + md_error_t *ep +) +{ + uint_t round_cyl = 1; + mdgeom_t *geomp; + + /* get reinstructs */ + if ((geomp = metagetgeom(colnp, ep)) == NULL) + return (-1); + + /* adjust geometry */ + if (meta_adjust_geom((md_unit_t *)mr, raidnp, geomp->write_reinstruct, + geomp->read_reinstruct, round_cyl, ep) != 0) + return (-1); + + /* return success */ + return (0); +} + +/* + * add another column to the raid unit structure + */ +static int +attach_raid_col( + mdsetname_t *sp, + mdname_t *raidnp, + mr_unit_t *mr, + mr_column_t *mdc, + mdname_t *colnp, + rcs_state_t state, + mdnamelist_t **keynlpp, + mdcmdopts_t options, + md_error_t *ep +) +{ + diskaddr_t column_size = mr->un_segsize * mr->un_segsincolumn; + diskaddr_t size; + uint_t maxio; + mdcinfo_t *cinfop; + md_timeval32_t tmp_time; + + /* setup state and timestamp */ + mdc->un_devstate = state; + if (meta_gettimeofday(&tmp_time) == -1) + return (mdsyserror(ep, errno, NULL)); + + mdc->un_devtimestamp = 
/*
 * attach columns to raid
 *
 * Grows the raid named by raidnp with the components listed in colnlp:
 *
 *   1. every new component is checked with meta_check_column() and
 *	against the other new components for overlap;
 *   2. a new unit structure, large enough for the old columns plus the
 *	new ones, is allocated and the old unit is copied into it;
 *   3. each new component is added with attach_raid_col() in state
 *	RCS_INIT;
 *   4. the enlarged unit is handed to the driver with MD_IOCGROW;
 *   5. cached names are invalidated and meta_concat_parent() is called
 *	for the raid.
 *
 * MDCMD_DOIT is always forced on in options.  MDCMD_PRINT causes a
 * confirmation message on stdout.
 *
 * Returns 0 on success, -1 on failure.  On any failure after the new
 * components were given namespace keys, those keys are removed again.
 */
int
meta_raid_attach(
	mdsetname_t	*sp,
	mdname_t	*raidnp,
	mdnamelist_t	*colnlp,
	mdcmdopts_t	options,
	md_error_t	*ep
)
{
	uint_t		concat_cnt = 0;		/* number of new columns */
	mdnamelist_t	*p;
	mr_unit_t	*old_mr;
	mr_unit_t	*new_mr;
	size_t		old_rusize;
	size_t		new_rusize;
	mdnamelist_t	*keynlp = NULL;		/* names we added keys for */
	md_grow_params_t mgp;
	int		rval = -1;
	int		create_flag = MD_CRO_32BIT;

	/* should have a set */
	assert(sp != NULL);
	assert(sp->setno == MD_MIN2SET(meta_getminor(raidnp->dev)));

	/* check type */
	if (metachkmeta(raidnp, ep) != 0)
		return (-1);

	/* check and count new columns */
	for (p = colnlp; (p != NULL); p = p->next) {
		mdname_t	*np = p->namep;
		mdnamelist_t	*p2;

		/* check against existing devices */
		if (meta_check_column(sp, np, ep) != 0)
			return (-1);

		/* check against ourselves (only the entries after this one) */
		for (p2 = p->next; (p2 != NULL); p2 = p2->next) {
			if (meta_check_overlap(np->cname, np, 0, -1,
			    p2->namep, 0, -1, ep) != 0) {
				return (-1);
			}
		}

		/* count */
		++concat_cnt;
	}

	/* get old unit */
	if ((old_mr = (mr_unit_t *)meta_get_mdunit(sp, raidnp, ep)) == NULL)
		return (-1);

	/*
	 * calculate the size needed for the new raid unit and allocate
	 * the appropriate structure. allocate new unit.
	 *
	 * The unit structure already contains one un_column[] element, so
	 * that element is subtracted before the real column count is
	 * multiplied in.
	 */
	old_rusize = sizeof (*old_mr) - sizeof (old_mr->un_column[0]);
	old_rusize += old_mr->un_totalcolumncnt * sizeof (old_mr->un_column[0]);
	new_rusize = sizeof (*new_mr) - sizeof (new_mr->un_column[0]);
	new_rusize += (old_mr->un_totalcolumncnt + concat_cnt)
	    * sizeof (new_mr->un_column[0]);
	new_mr = Zalloc(new_rusize);
	(void) memcpy(new_mr, old_mr, old_rusize);

	/* We always want a do-it, this is for attach_raid_col below */
	options |= MDCMD_DOIT;

	/* build new unit structure */
	for (p = colnlp; (p != NULL); p = p->next) {
		mdname_t	*colnp = p->namep;
		mr_column_t	*mdc;

		/*
		 * attach column; attach_raid_col() bumps un_totalcolumncnt
		 * on success, so this always addresses the next free slot
		 */
		mdc = &new_mr->un_column[new_mr->un_totalcolumncnt];
		if (attach_raid_col(sp, raidnp, new_mr, mdc, colnp,
		    RCS_INIT, &keynlp, options, ep) != 0) {
			goto out;
		}
	}
	assert(new_mr->un_totalcolumncnt
	    == (old_mr->un_totalcolumncnt + concat_cnt));


	/* decide between the 32 bit and 64 bit unit format */
	create_flag = meta_check_devicesize(new_mr->c.un_total_blocks);

	/* grow raid */
	(void) memset(&mgp, 0, sizeof (mgp));
	mgp.mnum = MD_SID(new_mr);
	MD_SETDRIVERNAME(&mgp, MD_RAID, sp->setno);
	mgp.size = new_rusize;
	mgp.mdp = (uintptr_t)new_mr;

	if (create_flag == MD_CRO_32BIT) {
		mgp.options = MD_CRO_32BIT;
		new_mr->c.un_revision = MD_32BIT_META_DEV;
	} else {
		mgp.options = MD_CRO_64BIT;
		new_mr->c.un_revision = MD_64BIT_META_DEV;
	}
	if (metaioctl(MD_IOCGROW, &mgp, &mgp.mde, NULL) != 0) {
		(void) mdstealerror(ep, &mgp.mde);
		goto out;
	}

	/* clear cache */
	if (invalidate_columns(sp, raidnp, ep) != 0)
		goto out;
	meta_invalidate_name(raidnp);

	/* let em know */
	if (options & MDCMD_PRINT) {
		if (concat_cnt == 1) {
			(void) printf(dgettext(TEXT_DOMAIN,
			    "%s: component is attached\n"),
			    raidnp->cname);
		} else {
			(void) printf(dgettext(TEXT_DOMAIN,
			    "%s: components are attached\n"),
			    raidnp->cname);
		}
		(void) fflush(stdout);
	}


	/* grow any parents */
	if (meta_concat_parent(sp, raidnp, ep) != 0)
		goto out;
	rval = 0;	/* success */

	/*
	 * cleanup, return error
	 *
	 * NOTE(review): this path is also taken when invalidate_columns()
	 * or meta_concat_parent() fails AFTER MD_IOCGROW has succeeded; the
	 * keys of the freshly attached components are then deleted even
	 * though the grow ioctl was already issued with them.  Confirm
	 * that this is intended.
	 */
out:
	Free(old_mr);
	Free(new_mr);
	if (rval != 0)
		(void) del_key_names(sp, keynlp, NULL);
	metafreenamelist(keynlp);
	return (rval);
}
mr->un_segsincolumn; + + /* check it out */ + if (meta_check_column(sp, colnp, ep) != 0) { + if ((! dup_ok) || (! mdisuseerror(ep, MDE_ALREADY))) + goto out; + mdclrerror(ep); + } + if ((paramsp->number_blks = metagetsize(colnp, ep)) == + MD_DISKADDR_ERROR) + goto out; + if ((label = metagetlabel(colnp, ep)) == MD_DISKADDR_ERROR) + goto out; + paramsp->has_label = ((label > 0) ? 1 : 0); + if ((paramsp->start_blk = metagetstart(sp, colnp, ep)) == + MD_DISKADDR_ERROR) + goto out; + if ((paramsp->number_blks - paramsp->start_blk) < column_size) { + (void) mdsyserror(ep, ENOSPC, colnp->cname); + goto out; + } + if ((cinfop = metagetcinfo(colnp, ep)) == NULL) + goto out; + if (cinfop->maxtransfer < mr->un_maxio) { + (void) mdcomperror(ep, MDE_MAXIO, meta_getminor(raidnp->dev), + colnp->dev, colnp->cname); + goto out; + } + + /* success */ + rval = 0; + + /* cleanup, return error */ +out: + Free(mr); + return (rval); +} + +/* + * replace raid column + */ +int +meta_raid_replace( + mdsetname_t *sp, + mdname_t *raidnp, + mdname_t *oldnp, + mdname_t *newnp, + mdcmdopts_t options, + md_error_t *ep +) +{ + int force = ((options & MDCMD_FORCE) ? 
1 : 0); + replace_params_t params; + md_dev64_t old_dev, new_dev; + diskaddr_t new_start_blk, new_end_blk; + int rebind; + mr_unit_t *mr; + char *new_devidp = NULL; + md_error_t xep = mdnullerror; + int ret; + md_set_desc *sd; + uint_t tstate; + + /* should have same set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(raidnp->dev))); + + /* check name */ + if (metachkmeta(raidnp, ep) != 0) + return (-1); + + /* save new binding incase this is a rebind where oldnp==newnp */ + new_dev = newnp->dev; + new_start_blk = newnp->start_blk; + new_end_blk = newnp->end_blk; + + /* invalidate, then get the raid (fill in oldnp from metadb) */ + meta_invalidate_name(raidnp); + if (meta_get_raid(sp, raidnp, ep) == NULL) + return (-1); + + /* can't replace a component if the raid inaccessible */ + if (meta_get_tstate(raidnp->dev, &tstate, ep) != 0) { + return (-1); + } + if (tstate & MD_INACCESSIBLE) { + return (mdmderror(ep, MDE_IN_UNAVAIL_STATE, + meta_getminor(raidnp->dev), raidnp->cname)); + } + + /* the old device binding is now established */ + if ((old_dev = oldnp->dev) == NODEV64) + return (mdsyserror(ep, ENODEV, oldnp->cname)); + + + /* setup raid info */ + (void) memset(¶ms, 0, sizeof (params)); + params.mnum = meta_getminor(raidnp->dev); + MD_SETDRIVERNAME(¶ms, MD_RAID, sp->setno); + params.old_dev = old_dev; + params.cmd = force ? 
FORCE_REPLACE_COMP : REPLACE_COMP; + + if (options & MDCMD_CLUSTER_REPLACE) { + if ((mr = (mr_unit_t *)meta_get_mdunit(sp, raidnp, ep)) == NULL) + return (NULL); + Free(mr); + params.options = MDIOCTL_NO_RESYNC_RAID; + params.number_blks = metagetsize(newnp, ep); + if ((metagetlabel(newnp, ep) == MD_DISKADDR_ERROR) || + (metagetlabel(newnp, ep) == 0)) + params.has_label = 0; + else + params.has_label = 1; + params.start_blk = metagetstart(sp, newnp, ep); + } else { + if ((strcmp(oldnp->rname, newnp->rname) == 0) && + (old_dev != new_dev)) { + rebind = 1; + } else { + rebind = 0; + } + if (rebind) { + newnp->dev = new_dev; + newnp->start_blk = new_start_blk; + newnp->end_blk = new_end_blk; + } + + /* + * Save a copy of the devid associated with the new disk, the + * reason is that the checks for the column (meta_check_column) + * via validate_new_raid(), could cause the disk's devid to be + * changed to that of the devid that is currently stored in the + * replica namespace for the disk in question. This devid could + * be stale if we are replacing the disk. The actual function + * that overwrites the devid is dr2drivedesc(). + */ + + /* don't setup new_devid if no devid's or MN diskset */ + if (newnp->drivenamep->devid != NULL) + new_devidp = Strdup(newnp->drivenamep->devid); + + if (!metaislocalset(sp)) { + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + if (MD_MNSET_DESC(sd)) + new_devidp = NULL; + } + + /* check out new (sets up start_blk, has_label, number_blks) */ + if (validate_new_raid(sp, raidnp, newnp, ¶ms, rebind, + ep) != 0) { + Free(new_devidp); + return (-1); + } + + /* + * Copy back the saved devid. 
+ */ + Free(newnp->drivenamep->devid); + if (new_devidp) { + newnp->drivenamep->devid = Strdup(new_devidp); + Free(new_devidp); + } + } + + /* store name in namespace, allocate new key */ + if (add_key_name(sp, newnp, NULL, ep) != 0) + return (-1); + + if (rebind && !metaislocalset(sp)) { + /* + * We are 'rebind'ing a disk that is in a diskset so as well + * as updating the diskset's namespace the local set needs + * to be updated because it also contains a reference to the + * disk in question. + */ + ret = meta_fixdevid(sp, DEV_UPDATE|DEV_LOCAL_SET, + newnp->cname, ep); + + if (ret != METADEVADM_SUCCESS) { + (void) del_key_name(sp, newnp, &xep); + return (-1); + } + } + + /* replace column */ + params.new_dev = new_dev; + params.new_key = newnp->key; + if (metaioctl(MD_IOCREPLACE, ¶ms, ¶ms.mde, NULL) != 0) { + (void) del_key_name(sp, newnp, ep); + return (mdstealerror(ep, ¶ms.mde)); + } + + /* clear cache */ + meta_invalidate_name(oldnp); + meta_invalidate_name(newnp); + meta_invalidate_name(raidnp); + + /* let em know */ + if (options & MDCMD_PRINT) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: device %s is replaced with %s\n"), + raidnp->cname, oldnp->cname, newnp->cname); + (void) fflush(stdout); + } + + /* return success */ + return (0); +} + +/* + * enable raid column + */ +int +meta_raid_enable( + mdsetname_t *sp, + mdname_t *raidnp, + mdname_t *colnp, + mdcmdopts_t options, + md_error_t *ep +) +{ + int force = ((options & MDCMD_FORCE) ? 
1 : 0); + replace_params_t params; + md_dev64_t fs_dev, del_dev; + int err = 0; + char *devnm; + int ret; + uint_t tstate; + + /* should have same set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(raidnp->dev))); + + /* check name */ + if (metachkmeta(raidnp, ep) != 0) + return (-1); + + /* get the file_system dev binding */ + if (meta_getdev(sp, colnp, ep) != 0) + return (-1); + fs_dev = colnp->dev; + + /* get the raid unit (fill in colnp->dev with metadb version) */ + meta_invalidate_name(raidnp); + if (meta_get_raid(sp, raidnp, ep) == NULL) + return (-1); + + /* enabling a component can't work if the raid inaccessible */ + if (meta_get_tstate(raidnp->dev, &tstate, ep) != 0) { + return (-1); + } + if (tstate & MD_INACCESSIBLE) { + return (mdmderror(ep, MDE_IN_UNAVAIL_STATE, + meta_getminor(raidnp->dev), raidnp->cname)); + } + + /* the metadb device binding is now established */ + if (colnp->dev == NODEV64) + return (mdsyserror(ep, ENODEV, colnp->cname)); + + /* + * check for the case where the dev_t has changed between the + * filesystem and the metadb. This is called a rebind, and + * is handled by meta_raid_replace. 
+ */ + if (fs_dev != colnp->dev) { + /* + * Save the devt of mddb version + */ + del_dev = colnp->dev; + + /* establish file system binding with invalid start/end */ + colnp->dev = fs_dev; + colnp->start_blk = -1; + colnp->end_blk = -1; + err = meta_raid_replace(sp, raidnp, colnp, colnp, options, ep); + + /* + * Don't do it if meta_raid_replace returns an error + */ + if (!err && (devnm = meta_getnmentbydev(sp->setno, MD_SIDEWILD, + del_dev, NULL, NULL, &colnp->key, ep)) != NULL) { + (void) del_key_name(sp, colnp, ep); + Free(devnm); + } + return (err); + } + + /* setup raid info */ + (void) memset(¶ms, 0, sizeof (params)); + params.mnum = meta_getminor(raidnp->dev); + MD_SETDRIVERNAME(¶ms, MD_RAID, sp->setno); + params.old_dev = params.new_dev = colnp->dev; + if (force) + params.cmd = FORCE_ENABLE_COMP; + else + params.cmd = ENABLE_COMP; + + /* check it out */ + if (validate_new_raid(sp, raidnp, colnp, ¶ms, 1, ep) != 0) + return (-1); + + /* enable column */ + if (metaioctl(MD_IOCREPLACE, ¶ms, ¶ms.mde, NULL) != 0) + return (mdstealerror(ep, ¶ms.mde)); + + /* + * are we dealing with a non-local set? If so need to update the + * local namespace so that the disk record has the correct devid. + */ + if (!metaislocalset(sp)) { + ret = meta_fixdevid(sp, DEV_UPDATE|DEV_LOCAL_SET, colnp->cname, + ep); + + if (ret != METADEVADM_SUCCESS) { + /* + * Failed to update the local set. Nothing to do here + * apart from report the error. The namespace is + * most likely broken and some form of remedial + * recovery is going to be required. 
+ */ + mde_perror(ep, ""); + mdclrerror(ep); + } + } + + /* clear cache */ + meta_invalidate_name(colnp); + meta_invalidate_name(raidnp); + + /* let em know */ + if (options & MDCMD_PRINT) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: device %s is enabled\n"), + raidnp->cname, colnp->cname); + (void) fflush(stdout); + } + + /* return success */ + return (0); +} + +/* + * check for dups in the raid itself + */ +static int +check_twice( + md_raid_t *raidp, + uint_t col, + md_error_t *ep +) +{ + mdname_t *raidnp = raidp->common.namep; + mdname_t *thisnp; + uint_t c; + + thisnp = raidp->cols.cols_val[col].colnamep; + for (c = 0; (c < col); ++c) { + md_raidcol_t *mdcp = &raidp->cols.cols_val[c]; + mdname_t *colnp = mdcp->colnamep; + + if (meta_check_overlap(raidnp->cname, thisnp, 0, -1, + colnp, 0, -1, ep) != 0) { + return (-1); + } + } + return (0); +} + +/* + * default raid interlace + */ +diskaddr_t +meta_default_raid_interlace(void) +{ + diskaddr_t interlace; + + /* default to 16k, round up if necessary */ + interlace = btodb(16 * 1024); + if (interlace < lbtodb(MININTERLACE)) + interlace = roundup(MININTERLACE, interlace); + return (interlace); +} + +/* + * convert interlaces + */ +int +meta_raid_check_interlace( + diskaddr_t interlace, + char *uname, + md_error_t *ep +) +{ + if ((interlace < btodb(RAID_MIN_INTERLACE)) || + (interlace > btodb(MAXINTERLACE))) { + return (mderror(ep, MDE_BAD_INTERLACE, uname)); + } + return (0); +} + +/* + * check raid + */ +int +meta_check_raid( + mdsetname_t *sp, + md_raid_t *raidp, + mdcmdopts_t options, + md_error_t *ep +) +{ + mdname_t *raidnp = raidp->common.namep; + int doit = ((options & MDCMD_DOIT) ? 1 : 0); + int updateit = ((options & MDCMD_UPDATE) ? 
1 : 0); + uint_t ncol; + uint_t col; + minor_t mnum = meta_getminor(raidnp->dev); + + /* check number */ + if (((ncol = raidp->cols.cols_len) < MD_RAID_MIN) || + (raidp->orig_ncol > ncol)) { + return (mdmderror(ep, MDE_BAD_RAID, mnum, raidnp->cname)); + } + + /* compute default interlace */ + if (raidp->interlace == 0) { + raidp->interlace = meta_default_raid_interlace(); + } + + /* check state */ + switch (raidp->state) { + case RUS_INIT: + case RUS_OKAY: + break; + + default: + return (mdmderror(ep, MDE_BAD_RAID, mnum, raidnp->cname)); + } + + /* check interlace */ + if (meta_raid_check_interlace(raidp->interlace, raidnp->cname, ep) != 0) + return (-1); + + /* check hotspare pool name */ + if (doit) { + if ((raidp->hspnamep != NULL) && + (metachkhsp(sp, raidp->hspnamep, ep) != 0)) { + return (-1); + } + } + + /* check columns */ + for (col = 0; (col < ncol); ++col) { + md_raidcol_t *mdcp = &raidp->cols.cols_val[col]; + mdname_t *colnp = mdcp->colnamep; + diskaddr_t start_blk, size; + + /* setup column */ + if (raidp->state == RUS_INIT) + mdcp->state = RCS_INIT; + else + mdcp->state = RCS_OKAY; + + /* check column */ + if (!updateit) { + if (meta_check_column(sp, colnp, ep) != 0) + return (-1); + if (((start_blk = metagetstart(sp, colnp, ep)) == + MD_DISKADDR_ERROR) || ((size = metagetsize(colnp, + ep)) == MD_DISKADDR_ERROR)) { + return (-1); + } + if (start_blk >= size) + return (mdsyserror(ep, ENOSPC, colnp->cname)); + size -= start_blk; + size = rounddown(size, raidp->interlace); + if (size == 0) + return (mdsyserror(ep, ENOSPC, colnp->cname)); + } + + /* check this raid too */ + if (check_twice(raidp, col, ep) != 0) + return (-1); + } + + /* return success */ + return (0); +} + +/* + * setup raid geometry + */ +static int +raid_geom( + md_raid_t *raidp, + mr_unit_t *mr, + md_error_t *ep +) +{ + uint_t write_reinstruct = 0; + uint_t read_reinstruct = 0; + uint_t round_cyl = 1; + uint_t col; + mdgeom_t *geomp; + + /* get worst reinstructs */ + for (col = 0; (col 
< raidp->cols.cols_len); ++col) { + md_raidcol_t *mdcp = &raidp->cols.cols_val[col]; + mdname_t *colnp = mdcp->colnamep; + + if ((geomp = metagetgeom(colnp, ep)) == NULL) + return (-1); + if (geomp->write_reinstruct > write_reinstruct) + write_reinstruct = geomp->write_reinstruct; + if (geomp->read_reinstruct > read_reinstruct) + read_reinstruct = geomp->read_reinstruct; + } + + /* setup geometry from first column */ + assert(raidp->cols.cols_len > 0); + if ((geomp = metagetgeom(raidp->cols.cols_val[0].colnamep, + ep)) == NULL) { + return (-1); + } + if (meta_setup_geom((md_unit_t *)mr, raidp->common.namep, geomp, + write_reinstruct, read_reinstruct, round_cyl, ep) != 0) + return (-1); + + /* return success */ + return (0); +} + +int +meta_raid_state_cnt(mr_unit_t *mr, rcs_state_t state) +{ + int statecnt = 0; + int col; + + for (col = 0; col < mr->un_totalcolumncnt; col++) + if (mr->un_column[col].un_devstate & state) + statecnt++; + return (statecnt); +} +/* + * validate that a raid device being created with the -k flag is a real + * raid device + */ +int +meta_raid_valid(md_raid_t *raidp, mr_unit_t *mr) +{ + long long buf[DEV_BSIZE / sizeof (long long)]; + raid_pwhdr_t pwhdr; + raid_pwhdr_t *rpw = &pwhdr; + minor_t mnum; + int col; + int fd; + + for (col = 0; col < mr->un_totalcolumncnt; col++) { + md_raidcol_t *cp = &raidp->cols.cols_val[col]; + mdname_t *colnp = cp->colnamep; + + if ((fd = open(colnp->rname, O_RDONLY)) < 0) + goto error_exit; + + if (lseek64(fd, + (mr->un_column[col].un_pwstart * DEV_BSIZE), SEEK_SET) < 0) + goto error_exit; + + if (read(fd, buf, DEV_BSIZE) < 0) + goto error_exit; + + /* + * If our raid device is a 64 bit device, we can accept the + * pw header we just read in. + * Otherwise it's of type raid_pwhdr32_od_t and has to + * be converted. 
+ */ + if (mr->c.un_revision == MD_64BIT_META_DEV) { + rpw = (raid_pwhdr_t *)buf; + } else { + RAID_CONVERT_RPW((raid_pwhdr32_od_t *)buf, rpw); + } + + if (rpw->rpw_column != col) + goto error_exit; + + if (col == 0) + mnum = rpw->rpw_unit; + + if (rpw->rpw_unit != mnum) + goto error_exit; + + if (rpw->rpw_magic_ext == RAID_PWMAGIC) { + /* 4.1 prewrite header */ + if ((rpw->rpw_origcolumncnt != mr->un_origcolumncnt) || + (rpw->rpw_totalcolumncnt + != mr->un_totalcolumncnt) || + (rpw->rpw_segsize != mr->un_segsize) || + (rpw->rpw_segsincolumn != mr->un_segsincolumn) || + (rpw->rpw_pwcnt != mr->un_pwcnt) || + (rpw->rpw_pwstart != + mr->un_column[col].un_pwstart) || + (rpw->rpw_devstart != + mr->un_column[col].un_devstart) || + (rpw->rpw_pwsize != mr->un_pwsize)) + goto error_exit; + } + /* + * this is an old prewrite header (4.0) the unit structure + * will have to be trusted. + */ + (void) close(fd); + } + + return (0); + +error_exit: + (void) close(fd); + return (-1); +} + +/* + * create raid + */ +int +meta_create_raid( + mdsetname_t *sp, + md_raid_t *raidp, + mdcmdopts_t options, + md_error_t *ep +) +{ + mdname_t *raidnp = raidp->common.namep; + uint_t ncol = raidp->cols.cols_len; + uint_t orig_ncol = raidp->orig_ncol; + size_t rdsize; + mr_unit_t *mr; + uint_t col; + diskaddr_t disk_size = 0; + uint_t disk_maxio = 0; + uint_t pwes; + diskaddr_t non_pw_blks, column_size; + mdnamelist_t *keynlp = NULL; + md_set_params_t set_params; + int rval = -1; + md_timeval32_t creation_time; + int create_flag = MD_CRO_32BIT; + + /* validate raid */ + if (meta_check_raid(sp, raidp, options, ep) != 0) + return (-1); + + /* allocate raid unit */ + rdsize = sizeof (*mr) - sizeof (mr->un_column[0]); + rdsize += ncol * sizeof (mr->un_column[0]); + mr = Zalloc(rdsize); + + if (meta_gettimeofday(&creation_time) == -1) + return (mdsyserror(ep, errno, NULL)); + /* + * initialize the top level mr_unit_t structure + * setup the unit state to indicate whether to retain + * any data 
currently on the metadevice or to clear it + */ + mr->c.un_type = MD_METARAID; + MD_SID(mr) = meta_getminor(raidnp->dev); + mr->c.un_size = rdsize; + mr->un_magic = RAID_UNMAGIC; + mr->un_state = raidp->state; + mr->un_timestamp = creation_time; + mr->un_origcolumncnt = orig_ncol; + mr->un_segsize = (uint_t)raidp->interlace; + if (raidp->hspnamep != NULL) { + mr->un_hsp_id = raidp->hspnamep->hsp; + } else { + mr->un_hsp_id = MD_HSP_NONE; + } + /* + * setup original columns, saving start_block and + * finding smallest size and maxio + */ + for (col = 0; (col < orig_ncol); ++col) { + md_raidcol_t *cp = &raidp->cols.cols_val[col]; + mdname_t *colnp = cp->colnamep; + mr_column_t *mdc = &mr->un_column[col]; + diskaddr_t size; + uint_t maxio; + mdcinfo_t *cinfop; + + /* setup state */ + mdc->un_devstate = cp->state; + + /* setup creation time */ + mdc->un_devtimestamp = creation_time; + + /* get start, size, and maxio */ + if ((mdc->un_orig_devstart = metagetstart(sp, colnp, ep)) == + MD_DISKADDR_ERROR) + goto out; + if ((size = metagetsize(colnp, ep)) == MD_DISKADDR_ERROR) + goto out; + size -= mdc->un_orig_devstart; + if ((cinfop = metagetcinfo(colnp, ep)) == NULL) + goto out; + maxio = cinfop->maxtransfer; + + if (options & MDCMD_DOIT) { + /* store name in namespace */ + if (add_key_name(sp, colnp, &keynlp, ep) != 0) + goto out; + } + + /* setup column */ + mdc->un_orig_key = colnp->key; + mdc->un_orig_dev = colnp->dev; + mdc->un_dev = mdc->un_orig_dev; + mdc->un_pwstart = mdc->un_orig_pwstart; + mdc->un_devstart = mdc->un_orig_devstart; + mdc->un_alt_dev = NODEV64; + mdc->un_alt_pwstart = 0; + mdc->un_alt_devstart = 0; + mdc->un_hs_id = 0; + if (mr->un_state == RUS_INIT) + mdc->un_devstate = RCS_INIT; + else + mdc->un_devstate = RCS_OKAY; + + /* adjust for smallest disk */ + if (disk_size == 0) { + disk_size = size; + } else if (size < disk_size) { + disk_size = size; + } + if (disk_maxio == 0) { + disk_maxio = maxio; + } else if (maxio < disk_maxio) { + disk_maxio = 
maxio; + } + } + assert(col == mr->un_origcolumncnt); + + /* + * before processing any of the attached column(s) + * set up the composition of the metadevice for column + * sizes and pre-write information + */ + mr->un_maxio = disk_maxio; /* smallest maxio */ + mr->un_iosize = min(mr->un_maxio, (mr->un_segsize + 1)); + pwes = mr->un_iosize; + if (raidp->pw_count) + mr->un_pwcnt = raidp->pw_count; + else + mr->un_pwcnt = PWCNT_MIN; + if ((mr->un_pwcnt < PWCNT_MIN) || (mr->un_pwcnt > PWCNT_MAX)) { + (void) mderror(ep, MDE_RAID_BAD_PW_CNT, raidnp->cname); + goto out; + } + mr->un_pwsize = roundup((mr->un_pwcnt * pwes), 2); + + /* now calculate the number of segments per column */ + non_pw_blks = disk_size - mr->un_pwsize; /* smallest disk */ + if ((mr->un_pwsize > disk_size) || + (non_pw_blks < (diskaddr_t)mr->un_segsize)) { + (void) mdsyserror(ep, ENOSPC, raidnp->cname); + goto out; + } + mr->un_segsincolumn = non_pw_blks / mr->un_segsize; + column_size = mr->un_segsize * mr->un_segsincolumn; + + /* + * adjust the pw_cnt, pw_size, to fit into any fragmentation + * left over after column_size has been computed + */ + mr->un_pwsize = rounddown(((uint_t)(disk_size - column_size)), 2); + mr->un_pwcnt = mr->un_pwsize / pwes; + assert(mr->un_pwcnt >= PWCNT_MIN); + mr->un_pwsize = roundup((mr->un_pwcnt * pwes), 2); + assert((mr->un_pwsize + column_size) <= disk_size); + + /* + * calculate the actual block count available based on the + * segment size and the number of segments per column ... + * ... 
and adjust for the number of parity segments + */ + mr->c.un_actual_tb = column_size * (mr->un_origcolumncnt - 1); + + if (raid_geom(raidp, mr, ep) != 0) + goto out; + + create_flag = meta_check_devicesize(mr->c.un_total_blocks); + + /* + * now calculate the pre-write offset and update the column + * structures to include the address of the individual pre-write + * areas + */ + for (col = 0; (col < orig_ncol); ++col) { + md_raidcol_t *cp = &raidp->cols.cols_val[col]; + mdname_t *colnp = cp->colnamep; + mr_column_t *mdc = &mr->un_column[col]; + diskaddr_t size; + + /* get size */ + if ((size = metagetsize(colnp, ep)) == MD_DISKADDR_ERROR) + goto out; + + /* adjust start and size by prewrite */ + mdc->un_orig_pwstart = mdc->un_orig_devstart; + mdc->un_orig_devstart += mr->un_pwsize; + mdc->un_pwstart = mdc->un_orig_pwstart; + mdc->un_devstart = mdc->un_orig_devstart; + + assert(size >= mdc->un_orig_devstart); + size -= mdc->un_orig_devstart; + + /* make sure we still have something left */ + assert(size >= column_size); + } + + /* do concat cols */ + mr->un_totalcolumncnt = mr->un_origcolumncnt; + assert(col == mr->un_origcolumncnt); + for (col = orig_ncol; (col < ncol); ++col) { + md_raidcol_t *cp = &raidp->cols.cols_val[col]; + mdname_t *colnp = cp->colnamep; + mr_column_t *mdc = &mr->un_column[col]; + + /* attach column */ + if (attach_raid_col(sp, raidnp, mr, mdc, colnp, + cp->state, &keynlp, options, ep) != 0) { + goto out; + } + } + assert(mr->un_totalcolumncnt == ncol); + + /* fill in the size of the raid */ + if (options & MDCMD_UPDATE) { + raidp->common.size = mr->c.un_total_blocks; + raidp->column_size = mr->un_segsize * mr->un_segsincolumn; + } + + /* if we're not doing anything, return success */ + if (! 
(options & MDCMD_DOIT)) { + rval = 0; /* success */ + goto out; + } + + if ((mr->un_state & RUS_OKAY) && + (meta_raid_valid(raidp, mr) != 0)) { + (void) mderror(ep, MDE_RAID_INVALID, raidnp->cname); + goto out; + } + + /* create raid */ + (void) memset(&set_params, 0, sizeof (set_params)); + /* did the user tell us to generate a large device? */ + if (create_flag == MD_CRO_64BIT) { + mr->c.un_revision = MD_64BIT_META_DEV; + set_params.options = MD_CRO_64BIT; + } else { + mr->c.un_revision = MD_32BIT_META_DEV; + set_params.options = MD_CRO_32BIT; + } + set_params.mnum = MD_SID(mr); + set_params.size = mr->c.un_size; + set_params.mdp = (uintptr_t)mr; + MD_SETDRIVERNAME(&set_params, MD_RAID, MD_MIN2SET(set_params.mnum)); + if (metaioctl(MD_IOCSET, &set_params, &set_params.mde, + raidnp->cname) != 0) { + (void) mdstealerror(ep, &set_params.mde); + goto out; + } + rval = 0; /* success */ + + /* cleanup, return success */ +out: + Free(mr); + if (rval != 0) { + (void) del_key_names(sp, keynlp, NULL); + } + metafreenamelist(keynlp); + if ((rval == 0) && (options & MDCMD_DOIT)) { + if (invalidate_columns(sp, raidnp, ep) != 0) + rval = -1; + meta_invalidate_name(raidnp); + } + return (rval); +} + +/* + * initialize raid + * NOTE: this functions is metainit(1m)'s command line parser! + */ +int +meta_init_raid( + mdsetname_t **spp, + int argc, + char *argv[], + mdcmdopts_t options, + md_error_t *ep +) +{ + char *uname = argv[0]; + mdname_t *raidnp = NULL; + int old_optind; + int c; + md_raid_t *raidp = NULL; + uint_t ncol, col; + int rval = -1; + md_set_desc *sd; + + /* get raid name */ + assert(argc > 0); + if (argc < 1) + goto syntax; + if ((raidnp = metaname(spp, uname, ep)) == NULL) + goto out; + assert(*spp != NULL); + + /* + * Raid metadevice not allowed on multi-node diskset. + */ + if (! 
metaislocalset(*spp)) { + if ((sd = metaget_setdesc(*spp, ep)) == NULL) + goto out; + if (MD_MNSET_DESC(sd)) { + rval = meta_cook_syntax(ep, MDE_MNSET_NORAID, uname, + argc, argv); + goto out; + } + } + + uname = raidnp->cname; + if (metachkmeta(raidnp, ep) != 0) + goto out; + + if (!(options & MDCMD_NOLOCK)) { + /* grab set lock */ + if (meta_lock(*spp, TRUE, ep) != 0) + goto out; + + if (meta_check_ownership(*spp, ep) != 0) + goto out; + } + + /* see if it exists already */ + if (metagetmiscname(raidnp, ep) != NULL) { + (void) mdmderror(ep, MDE_UNIT_ALREADY_SETUP, + meta_getminor(raidnp->dev), uname); + goto out; + } else if (! mdismderror(ep, MDE_UNIT_NOT_SETUP)) { + goto out; + } else { + mdclrerror(ep); + } + --argc, ++argv; + + /* grab -r */ + if ((argc < 1) || (strcmp(argv[0], "-r") != 0)) + goto syntax; + --argc, ++argv; + + /* parse general options */ + optind = 0; + opterr = 0; + if (getopt(argc, argv, "") != -1) + goto options; + + /* allocate raid */ + raidp = Zalloc(sizeof (*raidp)); + + /* setup common */ + raidp->common.namep = raidnp; + raidp->common.type = MD_METARAID; + raidp->state = RUS_INIT; + + /* allocate and parse cols */ + for (ncol = 0; ((ncol < argc) && (argv[ncol][0] != '-')); ++ncol) + ; + raidp->cols.cols_len = ncol; + if (ncol != 0) { + raidp->cols.cols_val = + Zalloc(ncol * sizeof (*raidp->cols.cols_val)); + } + for (col = 0; ((argc > 0) && (col < ncol)); ++col) { + md_raidcol_t *mdc = &raidp->cols.cols_val[col]; + mdname_t *colnp; + + /* parse column name */ + if ((colnp = metaname(spp, argv[0], ep)) == NULL) + goto out; + /* check for soft partitions */ + if (meta_sp_issp(*spp, colnp, ep) != 0) { + /* check disks */ + if (metachkcomp(colnp, ep) != 0) + goto out; + } + mdc->colnamep = colnp; + --argc, ++argv; + } + + /* parse raid options */ + old_optind = optind = 0; + opterr = 0; + while ((c = getopt(argc, argv, "h:i:ko:w:")) != -1) { + switch (c) { + case 'h': + if ((raidp->hspnamep = metahspname(spp, optarg, + ep)) == NULL) { + 
goto out; + } + break; + + case 'i': + if (parse_interlace(uname, optarg, &raidp->interlace, + ep) != 0) { + goto out; + } + if (meta_raid_check_interlace(raidp->interlace, + uname, ep)) + goto out; + break; + + case 'k': + raidp->state = RUS_OKAY; + break; + + case 'o': + if ((sscanf(optarg, "%u", &raidp->orig_ncol) != 1) || + ((int)raidp->orig_ncol < 0)) { + goto syntax; + } + if ((raidp->orig_ncol < MD_RAID_MIN) || + (raidp->orig_ncol > ncol)) { + rval = mderror(ep, MDE_BAD_ORIG_NCOL, uname); + goto out; + } + break; + case 'w': + if ((sscanf(optarg, "%d", &raidp->pw_count) != 1) || + ((int)raidp->pw_count < 0)) + goto syntax; + if (((int)raidp->pw_count < PWCNT_MIN) || + ((int)raidp->pw_count > PWCNT_MAX)) { + rval = mderror(ep, MDE_RAID_BAD_PW_CNT, uname); + goto out; + } + break; + default: + argc += old_optind; + argv -= old_optind; + goto options; + } + old_optind = optind; + } + argc -= optind; + argv += optind; + + /* we should be at the end */ + if (argc != 0) + goto syntax; + + /* default to all original columns */ + if (raidp->orig_ncol == 0) + raidp->orig_ncol = ncol; + + /* create raid */ + if (meta_create_raid(*spp, raidp, options, ep) != 0) + goto out; + rval = 0; /* success */ + + /* let em know */ + if (options & MDCMD_PRINT) { + (void) printf(dgettext(TEXT_DOMAIN, "%s: RAID is setup\n"), + uname); + (void) fflush(stdout); + } + goto out; + + /* syntax error */ +syntax: + rval = meta_cook_syntax(ep, MDE_SYNTAX, uname, argc, argv); + goto out; + + /* options error */ +options: + rval = meta_cook_syntax(ep, MDE_OPTION, uname, argc, argv); + goto out; + + /* cleanup, return error */ +out: + if (raidp != NULL) + meta_free_raid(raidp); + return (rval); +} + +/* + * reset RAIDs + */ +int +meta_raid_reset( + mdsetname_t *sp, + mdname_t *raidnp, + mdcmdopts_t options, + md_error_t *ep +) +{ + md_raid_t *raidp; + int rval = -1; + int col; + + /* should have same set */ + assert(sp != NULL); + assert((raidnp == NULL) || + (sp->setno == 
MD_MIN2SET(meta_getminor(raidnp->dev)))); + + /* reset all raids */ + if (raidnp == NULL) { + mdnamelist_t *raidnlp = NULL; + mdnamelist_t *p; + + /* for each raid */ + rval = 0; + if (meta_get_raid_names(sp, &raidnlp, 0, ep) < 0) + return (-1); + for (p = raidnlp; (p != NULL); p = p->next) { + /* reset RAID */ + raidnp = p->namep; + if (meta_raid_reset(sp, raidnp, options, ep) != 0) { + rval = -1; + break; + } + } + + /* cleanup, return success */ + metafreenamelist(raidnlp); + return (rval); + } + + /* check name */ + if (metachkmeta(raidnp, ep) != 0) + return (-1); + + /* get unit structure */ + if ((raidp = meta_get_raid(sp, raidnp, ep)) == NULL) + return (-1); + + /* make sure nobody owns us */ + if (MD_HAS_PARENT(raidp->common.parent)) { + return (mdmderror(ep, MDE_IN_USE, meta_getminor(raidnp->dev), + raidnp->cname)); + } + + /* clear subdevices cache */ + if (invalidate_columns(sp, raidnp, ep) != 0) + return (-1); + + /* clear metadevice */ + if (meta_reset(sp, raidnp, options, ep) != 0) + goto out; + rval = 0; /* success */ + + /* let em know */ + if (options & MDCMD_PRINT) { + (void) printf(dgettext(TEXT_DOMAIN, "%s: RAID is cleared\n"), + raidnp->cname); + (void) fflush(stdout); + } + + /* clear subdevices */ + if (! (options & MDCMD_RECURSE)) + goto out; + + for (col = 0; (col < raidp->cols.cols_len); ++col) { + md_raidcol_t *cp = &raidp->cols.cols_val[col]; + mdname_t *colnp = cp->colnamep; + + /* only recurse on metadevices */ + if (! 
metaismeta(colnp)) + continue; + + if (meta_reset_by_name(sp, colnp, options, ep) != 0) + rval = -1; + } + + /* cleanup, return success */ +out: + meta_invalidate_name(raidnp); + return (rval); +} + +/* + * reports TRUE if any RAID component is in error + */ +int +meta_raid_anycomp_is_err(mdsetname_t *sp, mdnamelist_t *raid_names) +{ + mdnamelist_t *nlp; + md_error_t status = mdnullerror; + md_error_t *ep = &status; + int any_errs = FALSE; + + for (nlp = raid_names; nlp; nlp = nlp->next) { + md_raid_t *raidp; + + if ((raidp = meta_get_raid(sp, nlp->namep, ep)) == NULL) { + any_errs |= TRUE; + goto out; + } + if (raidp->state != RUS_OKAY && raidp->state != RUS_INIT) { + any_errs |= TRUE; + goto out; + } + } +out: + if (!mdisok(ep)) + mdclrerror(ep); + + return (any_errs); +} +/* + * regen parity on a raid + */ +int +meta_raid_regen_byname(mdsetname_t *sp, mdname_t *raidnp, diskaddr_t size, + md_error_t *ep) +{ + char *miscname; + md_resync_ioctl_t ri; + + /* should have a set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(raidnp->dev))); + + /* make sure we have a raid */ + if ((miscname = metagetmiscname(raidnp, ep)) == NULL) + return (-1); + if (strcmp(miscname, MD_RAID) != 0) { + return (mdmderror(ep, MDE_NOT_RAID, meta_getminor(raidnp->dev), + raidnp->cname)); + } + + /* start resync */ + (void) memset(&ri, 0, sizeof (ri)); + MD_SETDRIVERNAME(&ri, MD_RAID, sp->setno); + ri.ri_mnum = meta_getminor(raidnp->dev); + ri.ri_copysize = size; + if (metaioctl(MD_IOCSETREGEN, &ri, &ri.mde, raidnp->cname) != 0) + return (mdstealerror(ep, &ri.mde)); + + /* return success */ + return (0); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_raid_resync.c b/usr/src/lib/lvm/libmeta/common/meta_raid_resync.c new file mode 100644 index 0000000000..061299022f --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_raid_resync.c @@ -0,0 +1,130 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common 
Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1994-2002 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * raid operations + */ + +#include <meta.h> +#include <sys/lvm/md_mirror.h> + +/* + * resync raid + */ +int +meta_raid_resync( + mdsetname_t *sp, + mdname_t *raidnp, + daddr_t size, + md_error_t *ep +) +{ + char *miscname; + md_resync_ioctl_t ri; + + /* should have a set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(raidnp->dev))); + + /* make sure we have a raid */ + if ((miscname = metagetmiscname(raidnp, ep)) == NULL) + return (-1); + if (strcmp(miscname, MD_RAID) != 0) { + return (mdmderror(ep, MDE_NOT_RAID, meta_getminor(raidnp->dev), + raidnp->cname)); + } + + /* start resync */ + (void) memset(&ri, 0, sizeof (ri)); + MD_SETDRIVERNAME(&ri, MD_RAID, sp->setno); + ri.ri_mnum = meta_getminor(raidnp->dev); + ri.ri_copysize = size; + if (metaioctl(MD_IOCSETSYNC, &ri, &ri.mde, raidnp->cname) != 0) + return (mdstealerror(ep, &ri.mde)); + + /* return success */ + return (0); +} + +/* + * NAME: meta_raid_resync_all + * DESCRIPTION: loop through the RAID devices synch'ing all + * PARAMETERS: char *sp - the set to synch + * daddr_t size - 
resync size + * md_error_t *ep - return error info + * + */ +int +meta_raid_resync_all( + mdsetname_t *sp, + daddr_t size, + md_error_t *ep +) +{ + mdnamelist_t *nlp = NULL; + mdnamelist_t *p; + int rval = 0, fval; + + /* should have a set */ + assert(sp != NULL); + + /* get raids */ + if (meta_get_raid_names(sp, &nlp, 0, ep) < 0) + return (-1); + + /* fork a process */ + if ((fval = md_daemonize(sp, ep)) != 0) { + /* + * md_daemonize forks off a process to do the work. This + * is the parent or errror. + */ + if (fval > 0) { + if (nlp != NULL) + metafreenamelist(nlp); + return (0); + } + mdclrerror(ep); + } + + assert((fval == 0) || (fval == -1)); + + /* resync each raid */ + for (p = nlp; (p != NULL); p = p->next) { + mdname_t *raidnp = p->namep; + + if (meta_raid_resync(sp, raidnp, size, ep) != 0) + rval = -1; + } + + /* cleanup, return success */ + if (nlp != NULL) + metafreenamelist(nlp); + if (fval == 0) + exit(0); + return (rval); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_rename.c b/usr/src/lib/lvm/libmeta/common/meta_rename.c new file mode 100644 index 0000000000..617b3f3694 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_rename.c @@ -0,0 +1,539 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. + */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +/* + * change the identity of a metadevice + * These are the "do it" functions for the metarename command. + */ + +#include <string.h> +#include <meta.h> +#include <sys/lvm/md_rename.h> + +/* private */ +#define FORCE (0x00000001) +#define NOISY (0x00000010) +#define NOFLIP (0x00000020) +#define DRYRUN (0x00000040) + +#define OP_STR(op) \ + ((op) == MDRNOP_EXCHANGE? "exchange": \ + (op) == MDRNOP_RENAME? "rename": \ + (op) == MDRNOP_UNK? "<unknown>": "garbage") + + +/* + * Check if from_np is open + * Return 0 if not open, -1 if open + */ +static int +check_open( + mdsetname_t *sp, + mdname_t *from_np, + md_error_t *ep) +{ + int rc; + + if ((rc = meta_isopen(sp, from_np, ep, (mdcmdopts_t)0)) < 0) { + assert(!mdisok(ep)); + return (-1); + + } else if (rc > 0) { + if (mdisok(ep)) { + (void) mdmderror(ep, MDE_RENAME_BUSY, + meta_getminor(from_np->dev), + from_np->cname); + } + return (-1); + } + return (0); +} + +/* + * meta_swap is the common code used by the + * meta_rename() and meta_exchange() entry points + */ + +static int +meta_swap( + mdsetname_t *sp, + mdname_t *from_np, + mdname_t *to_np, + md_renop_t op, + int flags, + md_error_t *ep) +{ + md_rename_t txn; + + /* + * If the device exists a key may already exist so need to find it + * otherwise we'll end up adding the key in again which will lead + * to an inconsistent n_count for the namespace record. 
+ */ + if (from_np->dev != NODEV) { + (void) meta_getnmentbydev(sp->setno, MD_SIDEWILD, from_np->dev, + NULL, NULL, &from_np->key, ep); + } + + if ((from_np->key == MD_KEYWILD) || (from_np->key == MD_KEYBAD)) { + if (add_key_name(sp, from_np, NULL, ep) != 0) { + assert(!mdisok(ep)); + return (-1); + } + } + + (void) memset(&txn, 0, sizeof (txn)); + + txn.op = op; + txn.revision = MD_RENAME_VERSION; + txn.flags = 0; + txn.from.mnum = meta_getminor(from_np->dev); + txn.from.key = from_np->key; + + if ((txn.from.key == MD_KEYBAD) || (txn.from.key == MD_KEYWILD)) { + (void) mdmderror(ep, MDE_RENAME_SOURCE_BAD, txn.from.mnum, + from_np->cname); + return (-1); + } + + if ((to_np->key == MD_KEYWILD) || (to_np->key == MD_KEYBAD)) { + if (add_key_name(sp, to_np, NULL, ep) != 0) { + assert(!mdisok(ep)); + return (-1); + } + } + + txn.to.mnum = meta_getminor(to_np->dev); + txn.to.key = to_np->key; + + if ((txn.to.key == MD_KEYBAD) || (txn.to.key == MD_KEYWILD)) { + (void) mdmderror(ep, MDE_RENAME_TARGET_BAD, txn.to.mnum, + to_np->cname); + return (-1); + } + + if (flags & NOISY) { + (void) fprintf(stderr, "\top: %s\n", OP_STR(txn.op)); + (void) fprintf(stderr, "\trevision: %d, flags: %d\n", + txn.revision, txn.flags); + (void) fprintf(stderr, + "\tfrom(mnum,key): %ld, %d\tto: %ld, %d\n", + txn.from.mnum, txn.from.key, + txn.to.mnum, txn.to.key); + } + + mdclrerror(ep); + if (metaioctl(MD_IOCRENAME, &txn, &txn.mde, from_np->cname) != 0) { + (void) del_key_name(sp, to_np, ep); + return (mdstealerror(ep, &txn.mde)); + } + + /* force the name cache to re-read device state */ + meta_invalidate_name(from_np); + meta_invalidate_name(to_np); + + return (0); +} + +/* + * rename a metadevice + */ +int +meta_rename( + mdsetname_t *sp, + mdname_t *from_np, + mdname_t *to_np, + mdcmdopts_t options, + md_error_t *ep +) +{ + int flags = (options & MDCMD_FORCE)? 
FORCE: 0; + int rc = 0; + mdcinfo_t *cinfop; + char *p; + md_set_desc *sd; + mdkey_t side_key = MD_KEYWILD; + md_error_t dummy_ep = mdnullerror; + int i, j; + md_mnnode_desc *nd, *nd_del; + + /* must have a set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(from_np->dev))); + + mdclrerror(ep); + + if (((p = getenv("MD_DEBUG")) != NULL) && + (strstr(p, "RENAME") != NULL)) { + flags |= NOISY; + } + /* if DOIT is not set, we are in dryrun mode */ + if ((options & MDCMD_DOIT) == 0) { + flags |= DRYRUN; + } + + + if (metachkmeta(from_np, ep) != 0) { + assert(!mdisok(ep)); + return (-1); + } + + mdclrerror(ep); + + if (meta_get_mdunit(sp, from_np, ep) == NULL) { + assert(!mdisok(ep)); + return (-1); + } + + if (meta_get_mdunit(sp, to_np, ep) != NULL) { + if (mdisok(ep)) { + (void) mdmderror(ep, MDE_UNIT_ALREADY_SETUP, + meta_getminor(to_np->dev), + to_np->cname); + } + return (-1); + } + mdclrerror(ep); + + /* If FORCE is not set, check if metadevice is open */ + if (!(flags & FORCE)) { + if (check_open(sp, from_np, ep) != 0) { + return (-1); + } + } + + /* + * All checks are done, now we do the real work. + * If we are in dryrun mode, we're done. + */ + if (flags & DRYRUN) { + return (0); /* success */ + } + + /* + * add key for new name to the namespace + */ + if ((cinfop = metagetcinfo(from_np, ep)) == NULL) { + assert(!mdisok(ep)); + return (-1); + } + + if (metaislocalset(sp)) { + to_np->key = add_name(sp, MD_SIDEWILD, MD_KEYWILD, + cinfop->dname, meta_getminor(to_np->dev), to_np->bname, ep); + } else { + /* + * As this is not the local set we have to create a namespace + * record for each side (host) in the set. 
We cannot use + * add_key_names() because the destination device (to_np) + * should not exist and so the subsequent metagetcinfo() + * call will fail when it tries to open the device, so we + * have to use the information from the source device (from_np) + */ + if ((sd = metaget_setdesc(sp, ep)) == (md_set_desc *)NULL) { + return (-1); + } + to_np->key = MD_KEYWILD; + + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + side_key = add_name(sp, (side_t)nd->nd_nodeid, + to_np->key, cinfop->dname, + meta_getminor(to_np->dev), + to_np->bname, ep); + /* + * Break out if failed to add the key, + * but delete any name space records that + * were added. + */ + if (side_key == MD_KEYBAD || + side_key == MD_KEYWILD) { + /* + * If we have a valid to_np->key then + * a record was added correctly but + * we do not know for which side, so + * we need to try to delete all of them. + */ + + if (to_np->key != MD_KEYBAD && + to_np->key != MD_KEYWILD) { + nd_del = sd->sd_nodelist; + while ((nd_del != nd) && + (nd_del != NULL)) { + (void) del_name(sp, + (side_t)nd_del->nd_nodeid, + to_np->key, &dummy_ep); + nd_del = nd_del->nd_next; + } + /* preserve error key state */ + to_np->key = side_key; + } + break; + } + to_np->key = side_key; + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + if (sd->sd_nodes[i][0] != '\0') { + side_key = add_name(sp, (side_t)i, + to_np->key, cinfop->dname, + meta_getminor(to_np->dev), + to_np->bname, ep); + /* + * Break out if failed to add the key, + * but delete any name space records + * that were added. + */ + if (side_key == MD_KEYBAD || + side_key == MD_KEYWILD) { + /* + * If we have a valid + * to_np->key then a record was + * added correctly but we do + * not know for which side, so + * we need to try to delete + * all of them. 
+ */ + if (to_np->key != MD_KEYBAD && + to_np->key != MD_KEYWILD) { + for (j = 0; j < i; + j++) { + (void) del_name(sp, + (side_t)j, + to_np->key, + &dummy_ep); + } + /* + * preserve err + * key state + */ + to_np->key = side_key; + } + break; + } + to_np->key = side_key; + } + } + } + } + + if (to_np->key == MD_KEYBAD || to_np->key == MD_KEYWILD) { + assert(!mdisok(ep)); + return (-1); + } + + rc = meta_swap(sp, from_np, to_np, MDRNOP_RENAME, flags, ep); + + if (rc == 0) { + if (options & MDCMD_PRINT) { + (void) fprintf(stdout, dgettext(TEXT_DOMAIN, + "%s: has been renamed to %s\n"), + from_np->cname, to_np->cname); + } + } + + return (rc); +} + +/* + * return TRUE if current <from>, <to> ordering would + * prevent <from> from being in the role of <self> + */ +static bool_t +meta_exchange_need_to_flip( + md_common_t *from_mdp, + md_common_t *to_mdp +) +{ + assert(from_mdp); + assert(to_mdp); + + /* + * ? + * \ + * <to> + * \ + * <from> + */ + + if (MD_HAS_PARENT(from_mdp->parent)) { + if (MD_HAS_PARENT(to_mdp->parent)) { + if (from_mdp->parent == + meta_getminor(to_mdp->namep->dev)) { + return (TRUE); + } + } + } + + /* + * <from> + * \ + * <to> + * \ + * ? + */ + + if (MD_HAS_PARENT(to_mdp->parent)) { + if (to_mdp->capabilities & MD_CAN_META_CHILD) { + return (TRUE); + } + } + + /* + * <to> + * \ + * <from> + */ + + if (MD_HAS_PARENT(from_mdp->parent)) { + if (from_mdp->parent == meta_getminor(to_mdp->namep->dev)) { + if (!(from_mdp->capabilities & MD_CAN_META_CHILD)) { + return (TRUE); + } + } + } + + /* + * <from> or <to> + * \ \ + * <to> <from> + * \ + * ? + */ + + return (FALSE); +} + +/* + * exchange the names of two metadevices + */ +int +meta_exchange( + mdsetname_t *sp, + mdname_t *from_np, + mdname_t *to_np, + mdcmdopts_t options, + md_error_t *ep +) +{ + int flags = (options & MDCMD_FORCE)? 
FORCE: 0; + md_common_t *from_mdp, *to_mdp; + int rc; + char *p, *p2; + + /* must have a set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(from_np->dev))); + assert(sp->setno == MD_MIN2SET(meta_getminor(to_np->dev))); + + if (metachkmeta(from_np, ep) != 0) { + assert(!mdisok(ep)); + return (-1); + } + + if (metachkmeta(to_np, ep) != 0) { + assert(!mdisok(ep)); + return (-1); + } + + if ((options & MDCMD_DOIT) == 0) { + flags |= DRYRUN; + } + + if ((p = getenv("MD_DEBUG")) != NULL) { + if ((p2 = strstr(p, "EXCHANGE=")) != NULL) { + flags |= NOISY; + if ((p2 = strchr(p2, '=')) != NULL) { + if (strcmp((p2+1), "NOFLIP") == 0) { + flags |= NOFLIP; + } + } + } else if (strstr(p, "EXCHANGE") != NULL) { + flags |= NOISY; + } + } + + if ((from_mdp = meta_get_unit(sp, from_np, ep)) == NULL) { + assert(!mdisok(ep)); + return (-1); + } + + if ((to_mdp = meta_get_unit(sp, to_np, ep)) == NULL) { + assert(!mdisok(ep)); + return (-1); + } + assert(mdisok(ep)); + + /* If FORCE is not set, check if metadevice is open */ + if (!(flags & FORCE)) { + if (check_open(sp, from_np, ep) != 0) { + return (-1); + } + } + + /* + * All checks are done, now we do the real work. + * If we are in dryrun mode, we're done. 
+ */ + if (flags & DRYRUN) { + return (0); /* success */ + } + + /* + * NOFLIP is used only for debugging; the driver + * will catch this and return MDE_RENAME_ORDER, if necessary + */ + if (((flags & NOFLIP) == 0) && + meta_exchange_need_to_flip(from_mdp, to_mdp)) { + + rc = meta_swap(sp, to_np, from_np, MDRNOP_EXCHANGE, flags, ep); + + } else { + rc = meta_swap(sp, from_np, to_np, MDRNOP_EXCHANGE, flags, ep); + } + + if (rc == 0) { + if (options & MDCMD_PRINT) { + (void) fprintf(stdout, dgettext(TEXT_DOMAIN, + "%s and %s have exchanged identities\n"), + from_np->cname, to_np->cname); + } + } + + return (rc); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_repartition.c b/usr/src/lib/lvm/libmeta/common/meta_repartition.c new file mode 100644 index 0000000000..16bf7ea597 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_repartition.c @@ -0,0 +1,415 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <stdio.h> +#include <meta.h> +#include "meta_repartition.h" + + + +/* + * FUNCTION: meta_replicaslice() + * INPUT: dnp - the name of the drive to check + * OUTPUT: slicep - pointer to slice number + * ep - pointer to an md_error_t structure in which + * to return errors to the caller + * RETURNS: int - 0 - value pointed to by slicep is valid + * -1 - otherwise + * + * PURPOSE: Determine which slice of the specified drive to + * reserve, presumably for metadb replica usage. + * + * NOTE: If slicep is NULL, the return code will indicate + * whether or not the slice number could be determined + */ +int +meta_replicaslice( + mddrivename_t *dnp, + uint_t *slicep, + md_error_t *ep +) +{ + int err = 0; + int ioctl_return; + int fd; + char *rname; + struct dk_geom geom; + + rname = dnp->rname; + if ((fd = open(rname, (O_RDONLY|O_NDELAY), 0)) < 0) { + char *n; + int open_errno; + size_t len; + + if (errno != ENOENT) + return (mdsyserror(ep, errno, rname)); + + len = strlen(rname) + 3; + n = Zalloc(len); + (void) snprintf(n, len, "%ss0", rname); + fd = open(n, (O_RDONLY|O_NDELAY), 0); + open_errno = errno; + Free(n); + if (fd < 0) { + return (mdsyserror(ep, open_errno, rname)); + } + } + + /* + * if our drivenamep points to a device not supporting + * DKIOCGGEOM, we have an EFI label. + */ + errno = 0; + ioctl_return = ioctl(fd, DKIOCGGEOM, &geom); + err = errno; + + (void) close(fd); + + /* + * If the DKIOCGGEOM ioctl succeeded, then the device has a + * VTOC style label. In this case, we use slice 7. + */ + if (ioctl_return == 0) { + if (slicep != NULL) { + *slicep = MD_SLICE7; + } + return (0); + } + + /* + * ENOTSUP indicates an EFI style label, in which case slice 7 + * cannot be used because its minor number is reserved. In + * this case, use slice 6. 
+ */ + if (err == ENOTSUP) { + if (slicep != NULL) { + *slicep = MD_SLICE6; + } + return (0); + } + + /* + * Those are the only two cases we know how to deal with; + * either the drivenamep didn't point to a disk, or the ioctl + * failed for some other reason. + */ + if (err == ENOTTY) { + return (mddeverror(ep, MDE_NOT_DISK, NODEV, rname)); + } + + return (mdsyserror(ep, err, rname)); +} + + + +/* + * FUNCTION: meta_repartition_drive() + * INPUT: sp - the set name for the device to check + * dnp - the name of the drive to partition + * options - options (see NOTES) + * OUTPUT: vtocp - pointer to an mdvtoc_t structure in which + * to return the new VTOC to the caller + * ep - pointer to an md_error_t structure in which + * to return errors to the caller + * RETURNS: int - 0 - drive was or can be repartitioned + * -1 - drive could not or should not be + * repartitioned + * PURPOSE: Repartition a disk for use in a disk set or in order + * to create soft partitions on it. Alternatively, + * return the VTOC that the disk would have if it were + * repartitioned without actually repartitioning it. + * + * NOTES: + * + * This routine will repartition a drive to make it suitable for + * inclusion in a diskset. Specifically, it will create a + * proposed VTOC that specifies a replica slice that begins at the + * first valid lba, is large enough to hold a label and a metadb + * replica, does not overlap any other slices, and is unmountable. + * If the current replica slice already satisfies those criteria, + * the routine will neither create a proposed VTOC nor repartition + * the drive unless the MD_REPART_FORCE flag is passed into the + * routine in the options argument. If the routine does create a + * proposed VTOC, it will return the proposed VTOC in *vtocp if + * vtocp isn't NULL. + * + * The slice to be used as the replica slice is determined by the + * function meta_replicaslice(). 
 + * + * If the replica slice does not satisfy the above criteria or the + * MD_REPART_FORCE flag is set, the proposed VTOC will specify a + * replica slice that satisfies the above criteria, a slice zero + * that contains the remaining space on the disk, and no other + * slices. If that repartitioning would cause the replica slice + * to move or shrink, and the MD_REPART_LEAVE_REP option is set, + * the routine will return -1 without creating or returning a + * proposed VTOC, and without repartitioning the disk. Otherwise + * the routine will repartition the disk unless the + * MD_REPART_DONT_LABEL flag is set in the options argument. + * + * If the MD_REPART_DONT_LABEL flag is set in the options argument, + * but the routine would otherwise repartition the drive, the + * routine won't repartition the drive, but will create a proposed + * VTOC that satisfies the criteria defined above and return it + * in *vtocp if vtocp isn't NULL. The MD_REPART_DONT_LABEL + * option allows calling routines to determine what the contents of + * the drive's VTOC would be if the drive were repartitioned without + * actually repartitioning the drive. 
+ */ +int +meta_repartition_drive( + mdsetname_t *sp, + mddrivename_t *dnp, + int options, + mdvtoc_t *vtocp, + md_error_t *ep +) +{ + uint_t replicaslice; + diskaddr_t first_lba, last_lba; + int round_sizes = 1; + unsigned long long cylsize; + unsigned long long drvsize; + int i; + mdgeom_t *mdgp; + mdvtoc_t *mdvp; + mdvtoc_t proposed_vtoc; + uint_t reservedcyl; + ushort_t resflag; + mdname_t *resnp; + unsigned long long ressize; + md_set_desc *sd; + daddr_t dbsize; + diskaddr_t replica_start; + diskaddr_t replica_size; + diskaddr_t replica_end; + diskaddr_t data_start; + diskaddr_t data_size; + + if (meta_replicaslice(dnp, &replicaslice, ep) != 0) { + return (-1); + } + + /* Don't round for EFI disks */ + if (replicaslice == MD_SLICE6) + round_sizes = 0; + + /* + * We took as argument a drive name pointer, but we need a + * slice name pointer to retrieve vtoc information. So get + * the name pointer for slice zero first, then use it to get + * the vtoc info for the disk. + */ + if ((resnp = metaslicename(dnp, MD_SLICE0, ep)) == NULL) + return (-1); + + if ((mdvp = metagetvtoc(resnp, FALSE, NULL, ep)) == NULL) + return (-1); + + /* + * Determine the metadb size. + */ + dbsize = MD_DBSIZE; + if (!metaislocalset(sp)) { + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + if (MD_MNSET_DESC(sd)) + dbsize = MD_MN_DBSIZE; + } + + /* If we've got an efi disk, we better have lba info */ + first_lba = mdvp->first_lba; + last_lba = mdvp->last_lba; + ASSERT((round_sizes != 0) || (last_lba > 0)); + + /* + * At this point, ressize is used as a minimum value. Later + * it will be rounded up to a cylinder boundary if + * appropriate. ressize is in units of disk sectors. + */ + ressize = dbsize + VTOC_SIZE; + resflag = V_UNMNT; + + /* + * If we're forcing the repartition, we can skip the replica + * slice and overlap tests. 
+ */ + if (options & MD_REPART_FORCE) { + goto do_repartition; + } + + /* + * Replica slice tests: it must begin at first_lba, be long + * enough, have the right flags, and not overlap any other + * slices. If any of these conditions is violated, we need to + * repartition the disk. + */ + if (mdvp->parts[replicaslice].start != first_lba) { + goto do_repartition; + } + + if (mdvp->parts[replicaslice].size < ressize) { + goto do_repartition; + } + + if (mdvp->parts[replicaslice].flag != resflag) { + goto do_repartition; + } + + /* + * Check for overlap: this test should use the actual size of + * the replica slice, as contained in the vtoc, and NOT the + * minimum size calculated above. + */ + replica_end = first_lba + mdvp->parts[replicaslice].size; + for (i = 0; i < mdvp->nparts; i++) { + if (i != replicaslice) { + if ((mdvp->parts[i].size > 0) && + (mdvp->parts[i].start < replica_end)) { + goto do_repartition; + } + } + } + + /* + * If we passed the above tests, then the disk is already + * partitioned appropriately, and we're not being told to + * force a change. + */ + return (0); + +do_repartition: + + /* Retrieve disk geometry info and round to cylinder sizes */ + if (round_sizes != 0) { + + if ((mdgp = metagetgeom(resnp, ep)) == NULL) + return (-1); + + /* + * Both cylsize and drvsize are in units of disk + * sectors. + * + * The intended results are of type unsigned long + * long. Since each operand of the first + * multiplication is of type unsigned int, we risk + * overflow by multiplying and then converting the + * result. Therefore we explicitly cast (at least) + * one of the operands, forcing conversion BEFORE + * multiplication, and avoiding overflow. The second + * assignment is OK, since one of the operands is + * already of the desired type. 
+ */ + cylsize = + ((unsigned long long)mdgp->nhead) * mdgp->nsect; + drvsize = cylsize * mdgp->ncyl; + + /* + * How many cylinders must we reserve for the replica + * slice to ensure that it meets the previously + * calculated minimum size? + */ + reservedcyl = (ressize + cylsize - 1) / cylsize; + ressize = reservedcyl * cylsize; + } else { + drvsize = last_lba - first_lba; + } + + /* Would this require a forbidden change? */ + if (options & MD_REPART_LEAVE_REP) { + if ((mdvp->parts[replicaslice].start != first_lba) || + (mdvp->parts[replicaslice].size < ressize)) { + return (mddeverror(ep, MDE_REPART_REPLICA, + resnp->dev, NULL)); + } + } + + /* + * It seems unlikely that someone would pass us too small a + * disk, but it's still worth checking for... + */ + if (((round_sizes != 0) && (reservedcyl >= (int)mdgp->ncyl)) || + ((round_sizes == 0) && (ressize + first_lba >= last_lba))) { + return (mdmddberror(ep, MDE_DB_TOOSMALL, + meta_getminor(resnp->dev), sp->setno, 0, NULL)); + } + + replica_start = first_lba; + replica_size = ressize; + data_start = first_lba + ressize; + data_size = drvsize - ressize; + + /* + * Create the proposed VTOC. First copy the current VTOC + * into the proposed VTOC to duplicate the values that don't + * need to change. Then change the partition table and set + * the flag value for the replica slice to resflag to reserve it + * for metadata. 
+ */ + proposed_vtoc = *mdvp; + /* We need at least replicaslice partitions in the proposed vtoc */ + if (replicaslice >= proposed_vtoc.nparts) { + proposed_vtoc.nparts = replicaslice + 1; + } + for (i = 0; i < proposed_vtoc.nparts; i++) { + /* don't change the reserved partition of an EFI device */ + if (proposed_vtoc.parts[i].tag == V_RESERVED) + data_size = proposed_vtoc.parts[i].start - data_start; + else + (void) memset(&proposed_vtoc.parts[i], '\0', + sizeof (proposed_vtoc.parts[i])); + } + + proposed_vtoc.parts[MD_SLICE0].start = data_start; + proposed_vtoc.parts[MD_SLICE0].size = data_size; + proposed_vtoc.parts[MD_SLICE0].tag = V_USR; + proposed_vtoc.parts[replicaslice].start = replica_start; + proposed_vtoc.parts[replicaslice].size = replica_size; + proposed_vtoc.parts[replicaslice].flag = resflag; + proposed_vtoc.parts[replicaslice].tag = V_USR; + + if (!(options & MD_REPART_DONT_LABEL)) { + /* + * Label the disk with the proposed VTOC. + */ + *mdvp = proposed_vtoc; + if (metasetvtoc(resnp, ep) != 0) { + return (-1); + } + } + + if (vtocp != NULL) { + /* + * Return the proposed VTOC. + */ + *vtocp = proposed_vtoc; + } + + return (0); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_replace.c b/usr/src/lib/lvm/libmeta/common/meta_replace.c new file mode 100644 index 0000000000..3165bd0d53 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_replace.c @@ -0,0 +1,144 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * replace components in metadevices + */ + +#include <meta.h> +#include <sys/lvm/md_stripe.h> + +int +meta_replace(mdsetname_t *sp, mdname_t *metanp, mdname_t *oldnp, + mdname_t *newnp, char *uname, mdcmdopts_t options, md_error_t *ep) +{ + char *miscname; + + assert(sp != NULL); + + if (is_hspname(uname)) { + mdhspname_t *hspnp; + + if ((hspnp = metahspname(&sp, uname, ep)) == NULL) + return (-1); + assert(sp != NULL); + (void) meta_hs_replace(sp, hspnp, oldnp, newnp, options, ep); + return (0); + } + assert(sp->setno == MD_MIN2SET(meta_getminor(metanp->dev))); + if (metachkmeta(metanp, ep) != 0) + return (-1); + + + if ((miscname = metagetmiscname(metanp, ep)) == NULL) + return (-1); + + if (strcmp(miscname, MD_RAID) == 0) { + return (meta_raid_replace(sp, metanp, oldnp, newnp, + options, ep)); + } else if (strcmp(miscname, MD_TRANS) == 0) { + return (meta_trans_replace(sp, metanp, oldnp, newnp, + options, ep)); + } else if (strcmp(miscname, MD_STRIPE) == 0) { + return (meta_stripe_replace(sp, metanp, oldnp, newnp, + options, ep)); + } + + return (mdmderror(ep, MDE_UNKNOWN_TYPE, meta_getminor(metanp->dev), + metanp->cname)); +} +/* + * replace named device + */ +int +meta_replace_byname( + mdsetname_t *sp, + mdname_t *np, + mdname_t *oldnp, + mdname_t *newnp, + mdcmdopts_t options, + md_error_t *ep +) +{ + char *miscname; + + /* should have a set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(np->dev))); + + /* 
get type */ + if (metachkmeta(np, ep) != 0) + return (-1); + if ((miscname = metagetmiscname(np, ep)) == NULL) + return (-1); + + /* dispatch */ + if (strcmp(miscname, MD_RAID) == 0) { + return (meta_raid_replace(sp, np, oldnp, newnp, options, ep)); + } else if (strcmp(miscname, MD_MIRROR) == 0) { + return (meta_mirror_replace(sp, np, oldnp, newnp, options, ep)); + } else { + return (mdmderror(ep, MDE_UNKNOWN_TYPE, meta_getminor(np->dev), + np->cname)); + } +} + +/* + * enable named device + */ +int +meta_enable_byname( + mdsetname_t *sp, + mdname_t *np, + mdname_t *compnp, + mdcmdopts_t options, + md_error_t *ep +) +{ + char *miscname; + + /* should have a set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(np->dev))); + + /* get type */ + if (metachkmeta(np, ep) != 0) + return (-1); + if ((miscname = metagetmiscname(np, ep)) == NULL) + return (-1); + + /* dispatch */ + if (strcmp(miscname, MD_RAID) == 0) { + return (meta_raid_enable(sp, np, compnp, options, ep)); + } else if (strcmp(miscname, MD_MIRROR) == 0) { + return (meta_mirror_enable(sp, np, compnp, options, ep)); + } else { + return (mdmderror(ep, MDE_UNKNOWN_TYPE, meta_getminor(np->dev), + np->cname)); + } +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_reset.c b/usr/src/lib/lvm/libmeta/common/meta_reset.c new file mode 100644 index 0000000000..ae04edce5d --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_reset.c @@ -0,0 +1,146 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * clear metadevices + */ + +#include <meta.h> + +/* + * clear a metadevice. + */ +int +meta_reset( + mdsetname_t *sp, + mdname_t *np, + mdcmdopts_t options, + md_error_t *ep +) +{ + char *miscname; + md_i_reset_t mir; + + /* should have a set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(np->dev))); + /* clear device */ + if ((miscname = metagetmiscname(np, ep)) == NULL) + return (-1); + if (meta_isopen(sp, np, ep, options) != 0) { + return (mdmderror(ep, MDE_IS_OPEN, meta_getminor(np->dev), + np->cname)); + } + (void) memset(&mir, '\0', sizeof (mir)); + MD_SETDRIVERNAME(&mir, miscname, sp->setno); + mir.mnum = meta_getminor(np->dev); + mir.force = (options & MDCMD_FORCE) ? 1 : 0; + if (metaioctl(MD_IOCRESET, &mir, &mir.mde, np->cname) != 0) + return (mdstealerror(ep, &mir.mde)); + + /* return success */ + return (0); +} + +/* + * reset all the metadevice and hotspares + */ +int +meta_reset_all( + mdsetname_t *sp, + mdcmdopts_t options, + md_error_t *ep +) +{ + options |= MDCMD_RECURSE; + + /* + * since soft partitions can appear at the top and bottom + * of the stack, we call meta_sp_reset twice to handle all + * cases. 
+ */ + if (meta_trans_reset(sp, NULL, options, ep) != 0) + return (-1); + if (meta_sp_reset(sp, NULL, options, ep) != 0) + return (-1); + if (meta_raid_reset(sp, NULL, options, ep) != 0) + return (-1); + if (meta_mirror_reset(sp, NULL, options, ep) != 0) + return (-1); + if (meta_stripe_reset(sp, NULL, options, ep) != 0) + return (-1); + if (meta_hsp_reset(sp, NULL, options, ep) != 0) + return (-1); + if (meta_sp_reset(sp, NULL, options, ep) != 0) + return (-1); + + return (0); +} + +/* + * reset named device + */ +int +meta_reset_by_name( + mdsetname_t *sp, + mdname_t *np, + mdcmdopts_t options, + md_error_t *ep +) +{ + char *miscname; + int rval = 0; + + /* should have a set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(np->dev))); + + /* get type */ + if (metachkmeta(np, ep) != 0) + return (-1); + if ((miscname = metagetmiscname(np, ep)) == NULL) + return (-1); + /* dispatch */ + if (strcmp(miscname, MD_STRIPE) == 0) { + rval = meta_stripe_reset(sp, np, options, ep); + } else if (strcmp(miscname, MD_MIRROR) == 0) { + rval = meta_mirror_reset(sp, np, options, ep); + } else if (strcmp(miscname, MD_TRANS) == 0) { + rval = meta_trans_reset(sp, np, options, ep); + } else if (strcmp(miscname, MD_RAID) == 0) { + rval = meta_raid_reset(sp, np, options, ep); + } else if (strcmp(miscname, MD_SP) == 0) { + rval = meta_sp_reset(sp, np, options, ep); + } else { + rval = mdmderror(ep, MDE_UNKNOWN_TYPE, meta_getminor(np->dev), + np->cname); + } + + /* cleanup */ + return (rval); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_resync.c b/usr/src/lib/lvm/libmeta/common/meta_resync.c new file mode 100644 index 0000000000..b57dfb1197 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_resync.c @@ -0,0 +1,108 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). 
You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * mirror operations + */ + +#include <meta.h> +#include <sdssc.h> + +/* + * resync named device + */ +int +meta_resync_byname( + mdsetname_t *sp, + mdname_t *np, + daddr_t size, + md_error_t *ep, + md_resync_cmd_t cmd /* action to perform */ +) +{ + char *miscname; + + /* should have a set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(np->dev))); + + /* get type */ + if (metachkmeta(np, ep) != 0) + return (-1); + if ((miscname = metagetmiscname(np, ep)) == NULL) + return (-1); + + /* dispatch */ + if (strcmp(miscname, MD_RAID) == 0) { + return (meta_raid_resync(sp, np, size, ep)); + } else if (strcmp(miscname, MD_MIRROR) == 0) { + return (meta_mirror_resync(sp, np, size, ep, cmd)); + } else { + return (mdmderror(ep, MDE_UNKNOWN_TYPE, meta_getminor(np->dev), + np->cname)); + } +} + +/* + * resync all devices + */ +int +meta_resync_all( + mdsetname_t *sp, + daddr_t size, + md_error_t *ep +) +{ + int rval = 0; + md_set_desc *sd; + + /* see if we have any databases */ + if (meta_setup_db_locations(ep) != 0) { + if (mdismddberror(ep, MDE_DB_NODB)) { + mdclrerror(ep); + return (0); + } + rval = -1; + } + + if 
(!(metaislocalset(sp))) { + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + /* MN disksets don't use DCS clustering services. */ + if (!(MD_MNSET_DESC(sd))) + sdssc_notify_service(NULL, Shutdown_Services); + } + + /* resync units */ + if (meta_mirror_resync_all(sp, size, ep) != 0) + rval = -1; + if (meta_raid_resync_all(sp, size, ep) != 0) + rval = -1; + return (rval); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_runtime.c b/usr/src/lib/lvm/libmeta/common/meta_runtime.c new file mode 100644 index 0000000000..f9c5915088 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_runtime.c @@ -0,0 +1,301 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. 
 + */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +/* + * Return the values of runtime parameters stored in + * /etc/lvm/runtime.cf, converting them to data + * types appropriate for use by functions whose behavior + * is affected by those values. + */ + +/* + * system include files + */ + +#include <libintl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <syslog.h> + +/* + * SUNWmd include files + */ + +#include <meta.h> /* for MDD_DOMAIN */ +#include <meta_runtime.h> /* external interface definition */ +#include <sdssc.h> + +/* + * The following lines define the runtime parameter configuration file. + */ + +static const char *param_file_namep = "/etc/lvm/runtime.cf"; + +/* + * The runtime parameter configuration file is an ascii text file. + * Each text line in the file has a maximum length of 80 four-byte + * wide characters. The line buffer size defined below accommodates + * the maximum line length plus the newline character at the end of + * the line and the null character that fgets() adds at the end of + * the line when it writes the line to the buffer. + */ + +static const int line_buffer_size = 325; + +/* + * The format for parameter entries in the file is "name=value". + * Each "name=value" string must begin a line of the file. + * The "name" and "value" tokens may be preceded or followed by + * spaces. Lines beginning with "#" are comment lines. + */ + +static const char *token_separator_listp = " ="; + +/* + * If a runtime parameter that can be set in the file is not set, + * or is set to an invalid value, or if the file can't be opened, + * the parameter takes on the default value given in the comments + * below. + */ + +/* + * The following string constant declarations name the runtime + * configuration parameters that can be set in the runtime parameter + * configuration file. 
The allowed values of parameters that + * range over small sets of discrete values are also declared below + * as string constants. + * + * CAUTION: When adding new runtime parameters to the runtime + * parameter configuration file, declare their names + * as string constants below, and check for conflicts + * with the names of existing parameters. + */ + +static const char *ownerioctls_namep = "ownerioctls"; + +/* + * allowed values: + */ + +static const char *ownerioctls_onp = "on"; /* default value */ +static const char *ownerioctls_offp = "off"; + +/* + * The "ownerioctls" parameter controls whether the metaset -t and + * metaset -r commands issue the MHIOCTKOWN, MHIOCRELEASE, and + * MHIOCENFAILFAST ioctls when taking or releasing ownership of disksets. + * The allowed parameter values are "on" and "off". + * + * If the line "ownerioctls=off" appears in the runtime configuration file, + * the metaset -t command doesn't issue the MHIOCTKOWN ioctl when taking + * ownership of disksets, and the metaset -r command doesn't issue the + * MHIOCRELEASE and MHIOCENFAILFAST ioctls when releasing ownership of + * disksets. + * + * If the line "ownerioctls=on" appears in the file, the metaset -t + * command issues the MHIOCTKOWN ioctl when taking ownership of disksets, + * and the metaset -r command issues the MHIOCRELEASE and MHIOCENFAILFAST + * ioctls when releasing ownership of disksets. + * + * The default value of "ownerioctls" is "on". + */ + +/* + * The following lines make forward declarations of private functions. + */ + +static +char * +meta_get_rt_param(const char *param_namep, boolean_t warn_if_not_found); + +/* + * The following lines define public functions. 
+ */ + +boolean_t +do_owner_ioctls(void) +{ + const char *function_namep = "do_owner_ioctls()"; + char *param_valuep; + boolean_t return_value = B_TRUE; /* default behavior */ + sdssc_version_t version; + + if ((sdssc_version(&version) == SDSSC_OKAY) && (version.major >= 3)) { + /* + * If we're bound to a cluster machine never do ioctls. + * The SC3.0 cluster code will always deal with disk + * reservation. + */ + + return_value = B_FALSE; + } else { + param_valuep = meta_get_rt_param(ownerioctls_namep, B_TRUE); + if (param_valuep != NULL) { + if (strcmp(param_valuep, ownerioctls_offp) == 0) { + return_value = B_FALSE; + } else if (strcmp(param_valuep, + ownerioctls_onp) != 0) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%s: illegal value for %s: %s.\n"), + function_namep, + ownerioctls_namep, + param_valuep); + syslog(LOG_ERR, dgettext(TEXT_DOMAIN, + "%s: illegal value for %s: %s.\n"), + function_namep, + ownerioctls_namep, + param_valuep); + } + free(param_valuep); + } + } + return (return_value); +} + +/* + * Retrieve the verbosity level for rpc.mdcommd from the config file. + * If none is specified, don't print a warning and return 0 + */ +uint_t +commd_get_verbosity(void) +{ + char *param_valuep; + uint_t retval = 0; + param_valuep = meta_get_rt_param("commd_verbosity", B_FALSE); + if (param_valuep != NULL) { + retval = (uint_t)strtol(param_valuep, NULL, 16); + free(param_valuep); + } + return (retval); +} + +/* + * Retrieve the debug output file for rpc.mdcommd from the config file. + * If none is specified, don't print a warning. + * Note that if returning non-NULL, the caller is responsible for freeing + * the result pointer. 
+ */ +char * +commd_get_outfile(void) +{ + return (meta_get_rt_param("commd_out_file", B_FALSE)); +} + +/* + * The following lines define private functions + */ + +static char * +meta_get_rt_param(const char *param_namep, boolean_t warn_if_not_found) +{ + const char *function_namep = "meta_get_rt_param()"; + char *line_bufferp = NULL; + char *newlinep = NULL; + FILE *param_filep = NULL; + char *param_name_tokenp = NULL; + char *param_valuep = NULL; + char *param_value_tokenp = NULL; + + line_bufferp = (char *)malloc(line_buffer_size); + if (line_bufferp == NULL) { + (void) fprintf(stderr, + dgettext(TEXT_DOMAIN, "%s: malloc failed\n"), + function_namep); + syslog(LOG_ERR, + dgettext(TEXT_DOMAIN, "%s: malloc failed\n"), + function_namep); + return (param_valuep); + } + param_filep = fopen(param_file_namep, "r"); + if (param_filep == NULL) { + (void) fprintf(stderr, + dgettext(TEXT_DOMAIN, "%s: can't open %s\n"), + function_namep, param_file_namep); + syslog(LOG_ERR, + dgettext(TEXT_DOMAIN, "%s: can't open %s\n"), + function_namep, param_file_namep); + free(line_bufferp); + return (param_valuep); + } + while ((fgets(line_bufferp, line_buffer_size, param_filep) != NULL) && + (param_valuep == NULL)) { + + newlinep = strchr(line_bufferp, '\n'); + if (newlinep != NULL) { + *newlinep = '\0'; + newlinep = NULL; + } + param_name_tokenp = strtok(line_bufferp, token_separator_listp); + if ((param_name_tokenp != NULL) && + (strcmp(param_namep, param_name_tokenp) == 0)) { + + param_value_tokenp = strtok(NULL, + token_separator_listp); + } + if (param_value_tokenp != NULL) { + param_valuep = strdup(param_value_tokenp); + if (param_valuep == NULL) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%s: strdup failed\n"), + function_namep); + syslog(LOG_ERR, dgettext(TEXT_DOMAIN, + "%s: strdup failed\n"), + function_namep); + free(line_bufferp); + (void) fclose(param_filep); + return (param_valuep); + } + } + } + if ((param_valuep == NULL) && (warn_if_not_found == B_TRUE)) { + 
(void) fprintf(stderr, + dgettext(TEXT_DOMAIN, + "%s: value of %s not set or error in %s\n"), + function_namep, + param_namep, + param_file_namep); + syslog(LOG_ERR, + dgettext(TEXT_DOMAIN, + "%s: value of %s not set or error in %s\n"), + function_namep, + param_namep, + param_file_namep); + } + free(line_bufferp); + (void) fclose(param_filep); + return (param_valuep); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_se_notify.c b/usr/src/lib/lvm/libmeta/common/meta_se_notify.c new file mode 100644 index 0000000000..7ee231aa42 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_se_notify.c @@ -0,0 +1,399 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <stdlib.h> +#include <meta.h> +#include <libsysevent.h> +#include <libnvpair.h> +#include <sys/sysevent/svm.h> +#include <sys/sysevent/eventdefs.h> +#include <dlfcn.h> + +char * +obj2devname(uint32_t tag, set_t setno, md_dev64_t dev) +{ + char *setname; + char name[MD_MAX_CTDLEN]; + mdsetname_t *sp; + md_error_t status = mdnullerror; + md_set_record *md_sr; + minor_t mnum = meta_getminor(dev); + int rtn = 0; + + setname = NULL; + if ((setno != MD_SET_BAD) && + ((sp = metasetnosetname(setno, &status)) != NULL)) { + setname = sp->setname; + } + + name[0] = '\0'; + switch (tag) { + case SVM_TAG_HS: + case SVM_TAG_METADEVICE: + case SVM_TAG_MIRROR: + case SVM_TAG_RAID5: + case SVM_TAG_STRIPE: + case SVM_TAG_TRANS: + if (setno == 0) { + rtn = snprintf(name, sizeof (name), "d%u", + (unsigned)MD_MIN2UNIT(mnum)); + } else if (setname != NULL) { + rtn = snprintf(name, sizeof (name), "%s/d%u", setname, + (unsigned)MD_MIN2UNIT(mnum)); + } + break; + case SVM_TAG_HSP: + if (setno == 0) { + rtn = snprintf(name, sizeof (name), "hsp%u", + (unsigned)MD_MIN2UNIT(mnum)); + } else if (setname != NULL) { + rtn = snprintf(name, sizeof (name), "%s/hsp%u", + setname, (unsigned)MD_MIN2UNIT(mnum)); + } + break; + case SVM_TAG_DRIVE: + (void) sprintf(name, "drive"); + break; + case SVM_TAG_HOST: + md_sr = NULL; + if (setname != NULL) { + md_sr = getsetbyname(setname, &status); + } + if ((md_sr != NULL) && (md_sr->sr_nodes[mnum] != NULL)) { + /* + * Get the host data from the node array. 
+ */ + rtn = snprintf(name, sizeof (name), "%s", + md_sr->sr_nodes[mnum]); + } + if ((name[0] == '\0') || (rtn >= sizeof (name))) { + (void) sprintf(name, "host"); + rtn = 0; + } + break; + case SVM_TAG_SET: + if (setname == NULL) { + (void) sprintf(name, "diskset"); + } else { + rtn = snprintf(name, sizeof (name), "%s", setname); + } + break; + default: + if ((setname = get_devname(setno, dev)) != NULL) { + rtn = snprintf(name, sizeof (name), "%s", setname); + } + break; + } + mdclrerror(&status); + + /* Check if we got any rubbish for any of the snprintf's */ + if ((name[0] == '\0') || (rtn >= sizeof (name))) { + return (NULL); + } + + return (strdup(name)); +} + +/* Sysevent subclass and mdnotify event type pairs */ +struct node { + char *se_ev; + evid_t md_ev; +}; + +/* Table must be sorted in ascending order */ +static struct node ev_table[] = { + { ESC_SVM_ADD, EV_ADD }, + { ESC_SVM_ATTACH, EV_ATTACH }, + { ESC_SVM_ATTACHING, EV_ATTACHING }, + { ESC_SVM_CHANGE, EV_CHANGE }, + { ESC_SVM_CREATE, EV_CREATE }, + { ESC_SVM_DELETE, EV_DELETE }, + { ESC_SVM_DETACH, EV_DETACH }, + { ESC_SVM_DETACHING, EV_DETACHING }, + { ESC_SVM_DRIVE_ADD, EV_DRIVE_ADD }, + { ESC_SVM_DRIVE_DELETE, EV_DRIVE_DELETE }, + { ESC_SVM_ENABLE, EV_ENABLE }, + { ESC_SVM_ERRED, EV_ERRED }, + { ESC_SVM_EXCHANGE, EV_EXCHANGE }, + { ESC_SVM_GROW, EV_GROW }, + { ESC_SVM_HS_CHANGED, EV_HS_CHANGED }, + { ESC_SVM_HS_FREED, EV_HS_FREED }, + { ESC_SVM_HOST_ADD, EV_HOST_ADD }, + { ESC_SVM_HOST_DELETE, EV_HOST_DELETE }, + { ESC_SVM_HOTSPARED, EV_HOTSPARED }, + { ESC_SVM_INIT_FAILED, EV_INIT_FAILED }, + { ESC_SVM_INIT_FATAL, EV_INIT_FATAL }, + { ESC_SVM_INIT_START, EV_INIT_START }, + { ESC_SVM_INIT_SUCCESS, EV_INIT_SUCCESS }, + { ESC_SVM_IOERR, EV_IOERR }, + { ESC_SVM_LASTERRED, EV_LASTERRED }, + { ESC_SVM_MEDIATOR_ADD, EV_MEDIATOR_ADD }, + { ESC_SVM_MEDIATOR_DELETE, EV_MEDIATOR_DELETE }, + { ESC_SVM_OFFLINE, EV_OFFLINE }, + { ESC_SVM_OK, EV_OK }, + { ESC_SVM_ONLINE, EV_ONLINE }, + { ESC_SVM_OPEN_FAIL, 
EV_OPEN_FAIL }, + { ESC_SVM_REGEN_DONE, EV_REGEN_DONE }, + { ESC_SVM_REGEN_FAILED, EV_REGEN_FAILED }, + { ESC_SVM_REGEN_START, EV_REGEN_START }, + { ESC_SVM_RELEASE, EV_RELEASE }, + { ESC_SVM_REMOVE, EV_REMOVE }, + { ESC_SVM_RENAME_DST, EV_RENAME_DST }, + { ESC_SVM_RENAME_SRC, EV_RENAME_SRC }, + { ESC_SVM_REPLACE, EV_REPLACE }, + { ESC_SVM_RESYNC_DONE, EV_RESYNC_DONE }, + { ESC_SVM_RESYNC_FAILED, EV_RESYNC_FAILED }, + { ESC_SVM_RESYNC_START, EV_RESYNC_START }, + { ESC_SVM_RESYNC_SUCCESS, EV_RESYNC_SUCCESS }, + { ESC_SVM_TAKEOVER, EV_TAKEOVER } +}; + +static ev_obj_t md_tags[] = { + EVO_UNSPECIFIED, + EVO_METADEV, + EVO_MIRROR, + EVO_STRIPE, + EVO_RAID5, + EVO_TRANS, + EVO_REPLICA, + EVO_HSP, + EVO_HS, + EVO_SET, + EVO_DRIVE, + EVO_HOST, + EVO_MEDIATOR +}; + +static int +ev_compare(const void *node1, const void *node2) +{ + return (strcmp((const char *)node1, + ((const struct node *)node2)->se_ev)); +} + +/* + * Log mdnotify event + */ +void +do_mdnotify(char *se_subclass, uint32_t tag, set_t setno, md_dev64_t devid) +{ + evid_t ev_type; + ev_obj_t md_tag; + struct node *node_ptr; + + /* Translate sysevent into mdnotify event */ + node_ptr = bsearch(se_subclass, ev_table, (sizeof (ev_table) / + sizeof (ev_table[0])), sizeof (ev_table[0]), ev_compare); + + if (node_ptr == NULL) { + ev_type = EV_EMPTY; + } else { + ev_type = node_ptr->md_ev; + } + + if (tag >= (sizeof (md_tags) / sizeof (md_tags[0]))) { + md_tag = EVO_UNSPECIFIED; + } else { + md_tag = md_tags[tag]; + } + + NOTIFY_MD(md_tag, setno, devid, ev_type); +} + +/* + * External symbols from libsysevent and libnvpair which are not + * available in static forms + */ +static void *se_handle = NULL, *nv_handle = NULL; +static int (*_sysevent_post_event)(char *, char *, char *, char *, + nvlist_t *, sysevent_id_t *) = NULL; +static int (*_nvlist_alloc)(nvlist_t **, uint_t, int) = NULL; +static void (*_nvlist_free)(nvlist_t *) = NULL; +static int (*_nvlist_add_uint32)(nvlist_t *, char *, uint32_t) = NULL; +static 
int (*_nvlist_add_uint64)(nvlist_t *, char *, uint64_t) = NULL;
static int (*_nvlist_add_string)(nvlist_t *, char *, char *) = NULL;

/*
 * Load nvpair and sysevent symbols
 *
 * Opens libsysevent and libnvpair with dlopen() on first use and
 * resolves the entry points this file calls into the file-scope
 * function pointers.  A handle or pointer that is already non-NULL is
 * reused, so once everything has been resolved later calls only run the
 * NULL checks.
 *
 * Returns 0 when every symbol is available and 1 otherwise.  If
 * libsysevent itself cannot be opened the function returns 1 at once;
 * any later failure goes through the cleanup at "out", which closes the
 * libraries and resets all function pointers to NULL.
 */
static int
load_sev_lib()
{
	/* Try to load the sysevent symbol */
	if (se_handle == NULL) {
		se_handle = dlopen("/usr/lib/libsysevent.so.1", RTLD_LAZY);
	}
	if (se_handle != NULL) {
		/* Only look the symbol up if it is not cached yet */
		if ((_sysevent_post_event == NULL) &&
		    (_sysevent_post_event = (int (*)(char *, char *, char *,
		    char *, nvlist_t *, sysevent_id_t *))
		    dlsym(se_handle, "sysevent_post_event")) == NULL) {
			goto out;
		}
	} else {
		/* Nothing was opened, so there is nothing to clean up */
		return (1);
	}

	/* Try to load the nvpair symbols */
	if (nv_handle == NULL) {
		nv_handle = dlopen("/usr/lib/libnvpair.so.1", RTLD_LAZY);
	}
	if (nv_handle != NULL) {
		/* Each lookup is skipped when its pointer is already set */
		if ((_nvlist_alloc == NULL) &&
		    (_nvlist_alloc = (int (*)(nvlist_t **, uint_t, int))
		    dlsym(nv_handle, "nvlist_alloc")) == NULL) {
			goto out;
		}
		if ((_nvlist_free == NULL) &&
		    (_nvlist_free = (void (*)(nvlist_t *))dlsym(nv_handle,
		    "nvlist_free")) == NULL) {
			goto out;
		}
		if ((_nvlist_add_uint32 == NULL) &&
		    (_nvlist_add_uint32 = (int (*)(nvlist_t *, char *,
		    uint32_t))dlsym(nv_handle,
		    "nvlist_add_uint32")) == NULL) {
			goto out;
		}
		if ((_nvlist_add_uint64 == NULL) &&
		    (_nvlist_add_uint64 = (int (*)(nvlist_t *, char *,
		    uint64_t))dlsym(nv_handle,
		    "nvlist_add_uint64")) == NULL) {
			goto out;
		}
		if ((_nvlist_add_string == NULL) &&
		    (_nvlist_add_string = (int (*)(nvlist_t *, char *,
		    char *))dlsym(nv_handle,
		    "nvlist_add_string")) == NULL) {
			goto out;
		}

		return (0);
	}

	/* Reached by falling through when libnvpair could not be opened */
out:
	/*
	 * A handle is only forgotten when dlclose() succeeds; if it fails
	 * the handle stays set and is reused by the next call.
	 */
	if ((se_handle != NULL) && (dlclose(se_handle) == 0)) {
		se_handle = NULL;
	}

	if ((nv_handle != NULL) && (dlclose(nv_handle) == 0)) {
		nv_handle = NULL;
	}

	/* Drop every cached pointer so the next call resolves them again */
	_sysevent_post_event = NULL;
	_nvlist_alloc = NULL;
	_nvlist_free = NULL;
	_nvlist_add_uint32 = NULL;
	_nvlist_add_uint64 = NULL;
	_nvlist_add_string = NULL;

	return (1);
}

/*
 * Log SVM sys events
 */
void
+meta_svm_sysevent( + char *se_class, + char *se_subclass, + uint32_t tag, + set_t setno, + md_dev64_t devid +) +{ + sysevent_id_t eid; + nvlist_t *attr_list; + int err = 0; + char *devname; + + /* Raise the mdnotify event before anything else */ + do_mdnotify(se_subclass, tag, setno, devid); + + /* Just get out if the sysevent symbol can't be loaded */ + if (load_sev_lib()) { + return; + } + + err = (*_nvlist_alloc)(&attr_list, NV_UNIQUE_NAME, 0); + + if (err == 0) { + /* Add the version number */ + err = (*_nvlist_add_uint32)(attr_list, SVM_VERSION_NO, + (uint32_t)SVM_VERSION); + if (err != 0) { + goto fail; + } + + /* Add the tag attribute */ + err = (*_nvlist_add_uint32)(attr_list, SVM_TAG, (uint32_t)tag); + if (err != 0) { + goto fail; + } + + /* Add the set number attribute */ + err = (*_nvlist_add_uint32)(attr_list, SVM_SET_NO, + (uint32_t)setno); + if (err != 0) { + goto fail; + } + + /* Add the device id attribute */ + err = (*_nvlist_add_uint64)(attr_list, SVM_DEV_ID, + (uint64_t)devid); + if (err != 0) { + goto fail; + } + + /* Add the device name attribute */ + devname = obj2devname(tag, setno, devid); + if (devname != NULL) { + err = (*_nvlist_add_string)(attr_list, SVM_DEV_NAME, + devname); + free(devname); + } else { + err = (*_nvlist_add_string)(attr_list, SVM_DEV_NAME, + "unspecified"); + } + if (err != 0) { + goto fail; + } + + /* Attempt to post event */ + (void) (*_sysevent_post_event)(se_class, se_subclass, + SUNW_VENDOR, EP_SVM, attr_list, &eid); + + (*_nvlist_free)(attr_list); + } + + return; + +fail: + (*_nvlist_free)(attr_list); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_set.c b/usr/src/lib/lvm/libmeta/common/meta_set.c new file mode 100644 index 0000000000..7634779ce5 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_set.c @@ -0,0 +1,5918 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). 
You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. + */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +/* + * Metadevice diskset interfaces + */ + +#include "meta_set_prv.h" +#include <meta.h> +#include <metad.h> +#include <mdmn_changelog.h> +#include <sys/lvm/md_crc.h> +#include <sys/utsname.h> +#include <sdssc.h> + +#include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/svm.h> +extern char *blkname(char *); + +static md_drive_desc * +dr2drivedesc( + mdsetname_t *sp, + side_t sideno, + int flags, + md_error_t *ep +) +{ + md_set_record *sr; + md_drive_record *dr; + mddrivename_t *dnp; + md_drive_desc *dd_head = NULL; + md_set_desc *sd; + + if (flags & MD_BYPASS_DAEMON) { + if ((sr = metad_getsetbynum(sp->setno, ep)) == NULL) + return (NULL); + sd = metaget_setdesc(sp, ep); + sideno = getnodeside(mynode(), sd); + sp = metafakesetname(sp->setno, sr->sr_setname); + } else { + if ((sr = getsetbyname(sp->setname, ep)) == NULL) + return (NULL); + } + + assert(sideno != MD_SIDEWILD); + + /* + * WARNING: + * The act of getting the dnp from the namespace means that we + * 
will get the devid of the disk as recorded in the namespace. + * This devid has the potential to be stale if the disk is being + * replaced via a rebind, this means that any code that relies + * on any of the dnp information should take the appropriate action + * to preserve that information. For example in the rebind code the + * devid of the new disk is saved off and then copied back in once + * the code that has called this function has completed. + */ + for (dr = sr->sr_drivechain; dr != NULL; dr = dr->dr_next) { + if ((dnp = metadrivename_withdrkey(sp, sideno, dr->dr_key, + flags, ep)) == NULL) { + if (!(flags & MD_BYPASS_DAEMON)) + free_sr(sr); + metafreedrivedesc(&dd_head); + return (NULL); + } + + (void) metadrivedesc_append(&dd_head, dnp, dr->dr_dbcnt, + dr->dr_dbsize, dr->dr_ctime, dr->dr_genid, dr->dr_flags); + } + + if (!(flags & MD_BYPASS_DAEMON)) { + free_sr(sr); + } + return (dd_head); +} + +static int +get_sidenmlist( + mdsetname_t *sp, + mddrivename_t *dnp, + md_error_t *ep +) +{ + md_set_desc *sd; + mdsidenames_t *sn, **sn_next; + int i; + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + metaflushsidenames(dnp); + sn_next = &dnp->side_names; + if (MD_MNSET_DESC(sd)) { + /* + * Only get sidenames for this node since + * that is the only side information stored in + * the local mddb for a multi-node diskset. 
+ */ + if (sd->sd_mn_mynode) { + sn = Zalloc(sizeof (*sn)); + sn->sideno = sd->sd_mn_mynode->nd_nodeid; + if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET, + sn->sideno, dnp->side_names_key, &sn->dname, + &sn->mnum, NULL, ep)) == NULL) { + if (sn->dname != NULL) + Free(sn->dname); + Free(sn); + return (-1); + } + + /* Add to the end of the linked list */ + assert(*sn_next == NULL); + *sn_next = sn; + sn_next = &sn->next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + sn = Zalloc(sizeof (*sn)); + sn->sideno = i; + if ((sn->cname = meta_getnmentbykey(MD_LOCAL_SET, + i+SKEW, dnp->side_names_key, &sn->dname, + &sn->mnum, NULL, ep)) == NULL) { + /* + * It is possible that during the add of a + * host to have a 'missing' side as the side + * for this disk will be added later. So ignore + * the error. The 'missing' side will be added + * once the addhosts process has completed. + */ + if (mdissyserror(ep, ENOENT)) { + mdclrerror(ep); + Free(sn); + continue; + } + + if (sn->dname != NULL) + Free(sn->dname); + Free(sn); + return (-1); + } + + /* Add to the end of the linked list */ + assert(*sn_next == NULL); + *sn_next = sn; + sn_next = &sn->next; + } + } + + return (0); +} + +static md_drive_desc * +rl_to_dd( + mdsetname_t *sp, + md_replicalist_t *rlp, + md_error_t *ep +) +{ + md_replicalist_t *rl; + md_replica_t *r; + md_drive_desc *dd = NULL; + md_drive_desc *d; + int found; + md_set_desc *sd; + daddr_t nblks = 0; + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (NULL); + + /* find the smallest existing replica */ + for (rl = rlp; rl != NULL; rl = rl->rl_next) { + r = rl->rl_repp; + nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks)); + } + + if (nblks <= 0) + nblks = (MD_MNSET_DESC(sd)) ? 
MD_MN_DBSIZE : MD_DBSIZE; + + for (rl = rlp; rl != NULL; rl = rl->rl_next) { + r = rl->rl_repp; + + found = 0; + for (d = dd; d != NULL; d = d->dd_next) { + if (strcmp(r->r_namep->drivenamep->cname, + d->dd_dnp->cname) == 0) { + found = 1; + dd->dd_dbcnt++; + break; + } + } + + if (! found) + (void) metadrivedesc_append(&dd, r->r_namep->drivenamep, + 1, nblks, sd->sd_ctime, sd->sd_genid, MD_DR_OK); + } + + return (dd); +} + +/* + * Exported Entry Points + */ + +set_t +get_max_sets(md_error_t *ep) +{ + + static set_t max_sets = 0; + + if (max_sets == 0) + if (metaioctl(MD_IOCGETNSET, &max_sets, ep, NULL) != 0) + return (0); + + return (max_sets); +} + +int +get_max_meds(md_error_t *ep) +{ + static int max_meds = 0; + + if (max_meds == 0) + if (metaioctl(MD_MED_GET_NMED, &max_meds, ep, NULL) != 0) + return (0); + + return (max_meds); +} + +side_t +getmyside(mdsetname_t *sp, md_error_t *ep) +{ + md_set_desc *sd; + char *node = NULL; + side_t sideno; + + if (sp->setno == 0) + return (0); + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (MD_SIDEWILD); + + node = mynode(); + + assert(node != NULL); + + sideno = getnodeside(node, sd); + + if (sideno != MD_SIDEWILD) + return (sideno); + + return (mddserror(ep, MDE_DS_HOSTNOSIDE, sp->setno, node, NULL, node)); +} + +/* + * get set info from name + */ +md_set_record * +getsetbyname(char *setname, md_error_t *ep) +{ + md_set_record *sr = NULL; + md_mnset_record *mnsr = NULL; + char *p; + size_t len; + + /* get set info from daemon */ + if (clnt_getset(mynode(), setname, MD_SET_BAD, &sr, ep) == -1) + return (NULL); + if (sr != NULL) { + /* + * Returned record could be for a multi-node set or a + * non-multi-node set. + */ + if (MD_MNSET_REC(sr)) { + /* + * Record is for a multi-node set. Reissue call + * to get mnset information. Need to free + * record as if a non-multi-node set record since + * that is what clnt_getset gave us. If in + * the daemon, don't free since this is a pointer + * into the setrecords array. 
+ */ + if (! md_in_daemon) { + sr->sr_flags &= ~MD_SR_MN; + free_sr(sr); + } + if (clnt_mngetset(mynode(), setname, MD_SET_BAD, &mnsr, + ep) == -1) + return (NULL); + if (mnsr != NULL) + return ((struct md_set_record *)mnsr); + } else { + return (sr); + } + } + + /* no such set */ + len = strlen(setname) + 30; + p = Malloc(len); + (void) snprintf(p, len, "setname \"%s\"", setname); + (void) mderror(ep, MDE_NO_SET, p); + Free(p); + return (NULL); +} + +/* + * get set info from number + */ +md_set_record * +getsetbynum(set_t setno, md_error_t *ep) +{ + md_set_record *sr; + md_mnset_record *mnsr = NULL; + char buf[100]; + + if (clnt_getset(mynode(), NULL, setno, &sr, ep) == -1) + return (NULL); + + if (sr != NULL) { + /* + * Record is for a multi-node set. Reissue call + * to get mnset information. Need to free + * record as if a non-multi-node set record since + * that is what clnt_getset gave us. If in + * the daemon, don't free since this is a pointer + * into the setrecords array. + */ + if (MD_MNSET_REC(sr)) { + /* + * Record is for a multi-node set. Reissue call + * to get mnset information. + */ + if (! 
md_in_daemon) { + sr->sr_flags &= ~MD_SR_MN; + free_sr(sr); + } + if (clnt_mngetset(mynode(), NULL, setno, &mnsr, + ep) == -1) + return (NULL); + if (mnsr != NULL) + return ((struct md_set_record *)mnsr); + } else { + return (sr); + } + } + + (void) sprintf(buf, "setno %u", setno); + (void) mderror(ep, MDE_NO_SET, buf); + return (NULL); +} + +int +meta_check_drive_inuse( + mdsetname_t *sp, + mddrivename_t *dnp, + int check_db, + md_error_t *ep +) +{ + mdnamelist_t *nlp = NULL; + mdnamelist_t *p; + int rval = 0; + + /* get all underlying partitions */ + if (meta_getalldevs(sp, &nlp, check_db, ep) != 0) + return (-1); + + /* search for drive */ + for (p = nlp; (p != NULL); p = p->next) { + mdname_t *np = p->namep; + + if (strcmp(dnp->cname, np->drivenamep->cname) == 0) { + rval = (mddserror(ep, MDE_DS_DRIVEINUSE, sp->setno, + NULL, dnp->cname, sp->setname)); + break; + } + } + + /* cleanup, return success */ + metafreenamelist(nlp); + return (rval); +} + +/* + * simple check for ownership + */ +int +meta_check_ownership(mdsetname_t *sp, md_error_t *ep) +{ + int ownset; + md_set_desc *sd; + md_drive_desc *dd; + md_replicalist_t *rlp = NULL; + md_error_t xep = mdnullerror; + + if (metaislocalset(sp)) + return (0); + + ownset = own_set(sp, NULL, TRUE, ep); + if (! mdisok(ep)) + return (-1); + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep); + if (! mdisok(ep)) + return (-1); + + /* If we have no drive descriptors, check for no ownership */ + if (dd == NULL) { + if (ownset == MD_SETOWNER_NONE) + return (0); + + /* If ownership somehow has come to exist, we must clean up */ + + if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, + &xep) < 0) + mdclrerror(&xep); + + if ((dd = rl_to_dd(sp, rlp, &xep)) == NULL) + if (! 
mdisok(&xep)) + mdclrerror(&xep); + + if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) { + if (rel_own_bydd(sp, dd, TRUE, &xep)) + mdclrerror(&xep); + } + + if (halt_set(sp, &xep)) + mdclrerror(&xep); + + metafreereplicalist(rlp); + + metafreedrivedesc(&dd); + + return (0); + } + + metafreedrivedesc(&sd->sd_drvs); + + if (ownset == MD_SETOWNER_YES) + return (0); + + return (mddserror(ep, MDE_DS_NOOWNER, sp->setno, NULL, NULL, + sp->setname)); +} + +/* + * simple check for ownership + */ +int +meta_check_ownership_on_host(mdsetname_t *sp, char *hostname, md_error_t *ep) +{ + md_set_desc *sd; + md_drive_desc *dd; + int bool; + + if (metaislocalset(sp)) + return (0); + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + if (getnodeside(hostname, sd) == MD_SIDEWILD) + return (mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, + hostname, NULL, sp->setname)); + + dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), ep); + if (! mdisok(ep)) + return (-1); + + if (clnt_ownset(hostname, sp, &bool, ep) == -1) + return (-1); + + if (dd == NULL) + return (0); + + metafreedrivedesc(&sd->sd_drvs); + + if (bool == TRUE) + return (0); + + return (mddserror(ep, MDE_DS_NODEISNOTOWNER, sp->setno, hostname, NULL, + sp->setname)); +} + +/* + * Function that determines if a node is in the multinode diskset + * membership list. Calling node passes in node to be checked and + * the nodelist as returned from meta_read_nodelist. This routine + * anticipates being called many times using the same diskset membership + * list which is why the alloc and free of the diskset membership list + * is left to the calling routine. 
+ * Returns: + * 1 - if a member + * 0 - not a member + */ +int +meta_is_member( + char *node_name, + md_mn_nodeid_t node_id, + mndiskset_membershiplist_t *nl +) +{ + mndiskset_membershiplist_t *nl2; + int flag_check_name; + + if (node_id != 0) + flag_check_name = 0; + else if (node_name != NULL) + flag_check_name = 1; + else + return (0); + + nl2 = nl; + while (nl2) { + if (flag_check_name) { + /* Compare given name against name in member list */ + if (strcmp(nl2->msl_node_name, node_name) == 0) + break; + } else { + /* Compare given nodeid against nodeid in member list */ + if (nl2->msl_node_id == node_id) + break; + } + nl2 = nl2->next; + } + /* No match found in member list */ + if (nl2 == NULL) { + return (0); + } + /* Return 1 if node is in member list */ + return (1); +} + +/* + * meta_getnext_devinfo should go to the host that + * has the device, to return the device name, driver name, minor num. + * We can take the big cheat for now, since it is a requirement + * that the device names and device numbers are the same, and + * just get the info locally. + * + * This routine is very similar to meta_getnextside_devinfo except + * that the specific side to be used is being passed in. 
+ * + * Exit status: + * 0 - No more side info to return + * 1 - More side info's to return + * -1 - An error has been detected + */ +/*ARGSUSED*/ +int +meta_getside_devinfo( + mdsetname_t *sp, /* for this set */ + char *bname, /* local block name (myside) */ + side_t sideno, /* sideno */ + char **ret_bname, /* block device name of returned side */ + char **ret_dname, /* driver name of returned side */ + minor_t *ret_mnum, /* minor number of returned side */ + md_error_t *ep +) +{ + mdname_t *np; + + if (ret_bname != NULL) + *ret_bname = NULL; + if (ret_dname != NULL) + *ret_dname = NULL; + if (ret_mnum != NULL) + *ret_mnum = NODEV32; + + + if ((np = metaname(&sp, bname, ep)) == NULL) + return (-1); + +/* + * NOTE (future) - There will be more work here once devids are integrated + * into disksets. Then the side should be used to find the correct + * host and the b/d names should be gotten from that host. + */ + + /* + * Return the side info. + */ + if (ret_bname != NULL) + *ret_bname = Strdup(np->bname); + + if (ret_dname != NULL) { + mdcinfo_t *cinfo; + + if ((cinfo = metagetcinfo(np, ep)) == NULL) + return (-1); + + *ret_dname = Strdup(cinfo->dname); + } + + if (ret_mnum != NULL) + *ret_mnum = meta_getminor(np->dev); + + return (1); +} + +/* + * Get the information on the device from the remote node using the devid + * of the disk. 
+ * + * Exit status: + * 0 - No more side info to return + * 1 - More side info's to return + * -1 - An error has been detected + */ +int +meta_getnextside_devinfo( + mdsetname_t *sp, /* for this set */ + char *bname, /* local block name (myside) */ + side_t *sideno, /* previous sideno & returned sideno */ + char **ret_bname, /* block device name of returned side */ + char **ret_dname, /* driver name of returned side */ + minor_t *ret_mnum, /* minor number of returned side */ + md_error_t *ep +) +{ + md_set_desc *sd; + int i; + mdname_t *np; + mddrivename_t *dnp; + char *devidstr = NULL; + int devidstrlen; + md_dev64_t retdev = NODEV64; + char *ret_devname = NULL; + char *ret_blkdevname = NULL; + char *ret_driver = NULL; + char *nodename; + int fd; + int ret = -1; + char *minor_name = NULL; + md_mnnode_desc *nd; + + + if (ret_bname != NULL) + *ret_bname = NULL; + if (ret_dname != NULL) + *ret_dname = NULL; + if (ret_mnum != NULL) + *ret_mnum = NODEV32; + + if (metaislocalset(sp)) { + /* no more sides - we are done */ + if (*sideno != MD_SIDEWILD) + return (0); + + /* First time through - set up return sideno */ + *sideno = 0; + } else { + + /* + * Find the next sideno, starting after the one given. + */ + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + if ((*sideno == MD_SIDEWILD) && + (nd != (struct md_mnnode_desc *)NULL)) { + *sideno = nd->nd_nodeid; + } else { + while (nd) { + /* + * Found given sideno, now find + * next sideno, if there is one. 
+ */ + if ((*sideno == nd->nd_nodeid) && + (nd->nd_next != + (struct md_mnnode_desc *)NULL)) { + *sideno = + nd->nd_next->nd_nodeid; + break; + } + nd = nd->nd_next; + } + if (nd == NULL) { + return (0); + } + } + if (*sideno == MD_SIDEWILD) + return (0); + } else { + for (i = (*sideno)+1; i < MD_MAXSIDES; i++) + /* Find next full slot */ + if (sd->sd_nodes[i][0] != '\0') + break; + + /* No more sides - we are done */ + if (i == MD_MAXSIDES) + return (0); + + /* Set up the return sideno */ + *sideno = i; + nodename = (char *)sd->sd_nodes[i]; + } + } + + /* + * Need to pass the node the devid of the disk and get it to + * send back the details of the disk from that side. + */ + if ((np = metaname(&sp, bname, ep)) == NULL) + return (-1); + + dnp = np->drivenamep; + + /* + * By default, set up the parameters so that they are copied out. + */ + if (ret_bname != NULL) + *ret_bname = Strdup(np->bname); + + if (ret_dname != NULL) { + mdcinfo_t *cinfo; + + if ((cinfo = metagetcinfo(np, ep)) == NULL) + return (-1); + + *ret_dname = Strdup(cinfo->dname); + } + + if (ret_mnum != NULL) + *ret_mnum = meta_getminor(np->dev); + + /* + * Try some optimization. If this is the local set or the device + * is a metadevice then just copy the information. If the device + * does not have a devid (due to not having a minor name) then + * fall back to the pre-devid behaviour of copying the information + * on the device: this is okay because the sanity checks before this + * call would have found any issues with the device. If it's a + * multi-node diskset also just return ie. copy. + */ + if (metaislocalset(sp) || metaismeta(np) || (dnp->devid == NULL) || + (MD_MNSET_DESC(sd))) + return (1); + + if (np->minor_name == (char *)NULL) { + /* + * Have to get the minor name then. The slice should exist + * on the disk because it will have already been repartitioned + * up prior to getting to this point. 
+ */ + if ((fd = open(np->bname, (O_RDONLY|O_NDELAY), 0)) < 0) { + (void) mdsyserror(ep, errno, np->bname); + return (-1); + } + (void) devid_get_minor_name(fd, &minor_name); + np->minor_name = Strdup(minor_name); + devid_str_free(minor_name); + (void) close(fd); + } + + /* allocate extra space for "/" and NULL hence +2 */ + devidstrlen = strlen(dnp->devid) + strlen(np->minor_name) + 2; + devidstr = (char *)Malloc(devidstrlen); + + /* + * As a minor name is supplied then the ret_devname will be + * appropriate to that minor_name and in this case it will be + * a block device ie /dev/dsk. + */ + (void) snprintf(devidstr, devidstrlen, + "%s/%s", dnp->devid, np->minor_name); + + ret = clnt_devinfo_by_devid(nodename, sp, devidstr, &retdev, + np->bname, &ret_devname, &ret_driver, ep); + + Free(devidstr); + + /* + * If the other side is not running device id in disksets, + * 'ret' is set to ENOTSUP in which case we fallback to + * the existing behaviour + */ + if (ret == ENOTSUP) + return (1); + else if (ret == -1) + return (-1); + + /* + * ret_devname comes from the rpc call and is a + * raw device name. We need to make this into a + * block device via blkname for further processing. + * Unfortunately, when our device id isn't found in + * the system, the rpc call will return a " " in + * ret_devname in which case we need to fill that in + * as ret_blkname because blkname of " " returns NULL. 
+ */ + if (ret_bname != NULL && ret_devname != NULL) { + ret_blkdevname = blkname(ret_devname); + if (ret_blkdevname == NULL) + *ret_bname = Strdup(ret_devname); + else + *ret_bname = Strdup(ret_blkdevname); + } + + if (ret_dname != NULL && ret_driver != NULL) + *ret_dname = Strdup(ret_driver); + + if (ret_mnum != NULL) + *ret_mnum = meta_getminor(retdev); + + return (1); +} + +int +meta_is_drive_in_anyset( + mddrivename_t *dnp, + mdsetname_t **spp, + int bypass_daemon, + md_error_t *ep +) +{ + set_t setno; + mdsetname_t *this_sp; + int is_it; + set_t max_sets; + + if ((max_sets = get_max_sets(ep)) == 0) + return (-1); + + assert(spp != NULL); + *spp = NULL; + + for (setno = 1; setno < max_sets; setno++) { + if (!bypass_daemon) { + if ((this_sp = metasetnosetname(setno, ep)) == NULL) { + if (mdismddberror(ep, MDE_DB_NODB)) { + mdclrerror(ep); + return (0); + } + if (mdiserror(ep, MDE_NO_SET)) { + mdclrerror(ep); + continue; + } + return (-1); + } + } else + this_sp = metafakesetname(setno, NULL); + + if ((is_it = meta_is_drive_in_thisset(this_sp, dnp, + bypass_daemon, ep)) == -1) { + if (mdiserror(ep, MDE_NO_SET)) { + mdclrerror(ep); + continue; + } + return (-1); + } + if (is_it) { + *spp = this_sp; + return (0); + } + } + return (0); +} + +int +meta_is_drive_in_thisset( + mdsetname_t *sp, + mddrivename_t *dnp, + int bypass_daemon, + md_error_t *ep +) +{ + md_drive_desc *dd, *p; + + if (bypass_daemon) + dd = dr2drivedesc(sp, MD_SIDEWILD, + (MD_BASICNAME_OK | MD_BYPASS_DAEMON), ep); + else + dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep); + + if (dd == NULL) { + if (! 
mdisok(ep)) + return (-1); + return (0); + } + + + for (p = dd; p != NULL; p = p->dd_next) + if (strcmp(p->dd_dnp->cname, dnp->cname) == 0) + return (1); + return (0); +} + +int +meta_set_balance( + mdsetname_t *sp, + md_error_t *ep +) +{ + md_set_desc *sd; + md_drive_desc *dd, *curdd; + daddr_t dbsize; + daddr_t nblks; + int i; + int rval = 0; + sigset_t oldsigs; + md_setkey_t *cl_sk; + md_error_t xep = mdnullerror; + md_mnnode_desc *nd; + int suspend1_flag = 0; + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + dbsize = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE; + + /* Make sure we own the set */ + if (meta_check_ownership(sp, ep) != 0) + return (-1); + + /* END CHECK CODE */ + + /* + * Get drive descriptors for the drives that are currently in the set. + */ + curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep); + + if (! mdisok(ep)) + return (-1); + + /* Find the minimum replica size in use is or use the default */ + if ((nblks = meta_db_minreplica(sp, ep)) < 0) + mdclrerror(ep); + else + dbsize = nblks; /* adjust replica size */ + + /* Make sure we are blocking all signals */ + if (procsigs(TRUE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + /* + * Lock the set on current set members. + * For MN diskset lock_set and SUSPEND are used to protect against + * other meta* commands running on the other nodes. + */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + if (clnt_lock_set(nd->nd_nodename, sp, ep)) { + rval = -1; + goto out; + } + nd = nd->nd_next; + } + /* + * Lock out other meta* commands by suspending + * class 1 messages across the diskset. 
+ */ + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + if (clnt_mdcommdctl(nd->nd_nodename, + COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1, + MD_MSCF_NO_FLAGS, ep)) { + rval = -1; + goto out; + } + suspend1_flag = 1; + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') continue; + + if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { + rval = -1; + goto out; + } + } + } + + /* We are not adding or deleting any drives, just balancing */ + dd = NULL; + + /* + * Balance the DB's according to the list of existing drives and the + * list of added drives. + */ + if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1) + goto out; + +out: + /* + * Unlock diskset by resuming class 1 messages across the diskset. + * Just resume all classes so that resume is the same whether + * just one class was locked or all classes were locked. + */ + if (suspend1_flag) { + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, + sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { + /* + * We are here because we failed to resume + * rpc.mdcommd. However we potentially have + * an error from the previous call + * (meta_db_balance). If the previous call + * did fail, we capture that error and + * generate a perror withthe string, + * "Unable to resume...". + * Setting rval to -1 ensures that in the + * next iteration of the loop, ep is not + * clobbered. 
+ */ + if (rval == 0) + (void) mdstealerror(ep, &xep); + else + mdclrerror(&xep); + rval = -1; + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to resume rpc.mdcommd.")); + } + nd = nd->nd_next; + } + } + + /* Unlock the set */ + cl_sk = cl_get_setkey(sp->setno, sp->setname); + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + else + mdclrerror(&xep); + rval = -1; + } + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + } + } + + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + cl_set_setkey(NULL); + + metaflushsetname(sp); + + return (rval); +} + +int +meta_set_destroy( + mdsetname_t *sp, + int lock_set, + md_error_t *ep +) +{ + int i; + med_rec_t medr; + md_set_desc *sd; + md_drive_desc *dd, *p, *p1; + mddrivename_t *dnp; + mdname_t *np; + mdnamelist_t *nlp = NULL; + int num_users = 0; + int has_set; + side_t mysideno; + sigset_t oldsigs; + md_error_t xep = mdnullerror; + md_setkey_t *cl_sk; + int rval = 0; + int delete_end = 1; + + /* Make sure we are blocking all signals */ + if (procsigs(TRUE, &oldsigs, ep) < 0) + return (-1); + + if ((sd = metaget_setdesc(sp, ep)) == NULL) { + if (! mdisok(ep)) + rval = -1; + goto out; + } + + /* + * meta_set_destroy should not be called for a MN diskset. + * This routine destroys a set without communicating this information + * to the other nodes which would lead to an inconsistency in + * the MN diskset. 
+ */ + if (MD_MNSET_DESC(sd)) { + rval = -1; + goto out; + } + + /* Continue if a traditional diskset */ + + /* + * Check to see who has the set. If we are not the last user of the + * set, we will not touch the replicas. + */ + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NST_EQ, + ep); + + if (has_set < 0) { + mdclrerror(ep); + } else + num_users++; + } + + if ((dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) == NULL) { + if (! mdisok(ep)) { + rval = -1; + goto out; + } + } + + if (setup_db_bydd(sp, dd, TRUE, ep) == -1) { + rval = -1; + goto out; + } + + if (lock_set == TRUE) { + /* Lock the set on our side */ + if (clnt_lock_set(mynode(), sp, ep)) { + rval = -1; + goto out; + } + } + + /* + * A traditional diskset has no diskset stale information to send + * since there can only be one owner node at a time. + */ + if (snarf_set(sp, FALSE, ep)) + mdclrerror(ep); + + if (dd != NULL) { + /* + * Make sure that no drives are in use as parts of metadrives + * or hot spare pools, this is one of the few error conditions + * that will stop this routine, unless the environment has + * META_DESTROY_SET_OK set, in which case, the operation will + * proceed. 
+ */ + if (getenv("META_DESTROY_SET_OK") == NULL) { + for (p = dd; p != NULL; p = p->dd_next) { + dnp = p->dd_dnp; + + i = meta_check_drive_inuse(sp, dnp, FALSE, ep); + if (i == -1) { + /* need xep - wire calls clear error */ + i = metaget_setownership(sp, &xep); + if (i == -1) { + rval = -1; + goto out; + } + + mysideno = getmyside(sp, &xep); + + if (mysideno == MD_SIDEWILD) { + rval = -1; + goto out; + } + + if (sd->sd_isown[mysideno] == FALSE) + if (halt_set(sp, &xep)) { + rval = -1; + goto out; + } + + rval = -1; + goto out; + } + } + } + + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* Skip non local nodes */ + if (strcmp(mynode(), sd->sd_nodes[i]) != 0) + continue; + + if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep)) + mdclrerror(ep); + } + + /* + * Go thru each drive and individually delete the replicas. + * This way we can ignore individual errors. + */ + for (p = dd; p != NULL; p = p->dd_next) { + uint_t rep_slice; + + dnp = p->dd_dnp; + if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) || + (((np = metaslicename(dnp, rep_slice, ep)) + == NULL) && + ((np = metaslicename(dnp, MD_SLICE0, ep)) + == NULL))) { + rval = -1; + goto out; + } + + if ((np = metaslicename(dnp, + rep_slice, ep)) == NULL) { + if ((np = metaslicename(dnp, + MD_SLICE0, ep)) == NULL) { + rval = -1; + goto out; + } + mdclrerror(ep); + } + + /* Yes this is UGLY!!! 
*/ + p1 = p->dd_next; + p->dd_next = NULL; + if (rel_own_bydd(sp, p, FALSE, ep)) + mdclrerror(ep); + p->dd_next = p1; + + if (p->dd_dbcnt == 0) + continue; + + /* + * Skip the replica removal if we are not the last user + */ + if (num_users != 1) + continue; + + nlp = NULL; + (void) metanamelist_append(&nlp, np); + if (meta_db_detach(sp, nlp, + (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, ep)) + mdclrerror(ep); + metafreenamelist(nlp); + } + } + + if (halt_set(sp, ep)) { + rval = -1; + goto out; + } + + /* Setup the mediator record */ + (void) memset(&medr, '\0', sizeof (med_rec_t)); + medr.med_rec_mag = MED_REC_MAGIC; + medr.med_rec_rev = MED_REC_REV; + medr.med_rec_fl = 0; + medr.med_rec_sn = sp->setno; + (void) strcpy(medr.med_rec_snm, sp->setname); + medr.med_rec_meds = sd->sd_med; /* structure assigment */ + (void) memset(&medr.med_rec_data, '\0', sizeof (med_data_t)); + medr.med_rec_foff = 0; + + /* + * If we are the last remaining user, then remove the mediator hosts + */ + if (num_users == 1) { + for (i = 0; i < MED_MAX_HOSTS; i++) { + if (medr.med_rec_meds.n_lst[i].a_cnt != 0) + SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE, + SVM_TAG_MEDIATOR, sp->setno, i); + (void) memset(&medr.med_rec_meds.n_lst[i], '\0', + sizeof (md_h_t)); + } + medr.med_rec_meds.n_cnt = 0; + } else { /* Remove this host from the mediator node list. */ + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* Copy non local node */ + if (strcmp(mynode(), sd->sd_nodes[i]) != 0) { + (void) strcpy(medr.med_rec_nodes[i], + sd->sd_nodes[i]); + continue; + } + + /* Clear local node */ + (void) memset(&medr.med_rec_nodes[i], '\0', + sizeof (md_node_nm_t)); + } + } + + crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL); + + /* + * If the client is part of a cluster put the DCS service + * into a deleteing state. 
+ */ + if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) { + if (metad_isautotakebyname(sp->setname)) { + delete_end = 0; + } else { + mdclrerror(ep); + goto out; + } + } + + /* Inform the mediator hosts of the new information */ + for (i = 0; i < MED_MAX_HOSTS; i++) { + if (sd->sd_med.n_lst[i].a_cnt == 0) + continue; + + if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep)) + mdclrerror(ep); + } + + /* Delete the set locally */ + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* Skip non local nodes */ + if (strcmp(mynode(), sd->sd_nodes[i]) != 0) + continue; + + if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) + mdclrerror(ep); + } + if (delete_end && + sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR) + rval = -1; + +out: + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + + if (lock_set == TRUE) { + cl_sk = cl_get_setkey(sp->setno, sp->setname); + if (clnt_unlock_set(mynode(), cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + cl_set_setkey(NULL); + } + + metaflushsetname(sp); + return (rval); +} + +int +meta_set_purge( + mdsetname_t *sp, + int bypass_cluster, + int forceflg, + md_error_t *ep +) +{ + char *thishost = mynode(); + md_set_desc *sd; + md_setkey_t *cl_sk; + md_error_t xep = mdnullerror; + int rval = 0; + int i, num_hosts = 0; + int has_set = 0; + int max_node = 0; + int delete_end = 1; + md_mnnode_desc *nd; + + if ((sd = metaget_setdesc(sp, ep)) == NULL) { + /* unable to find set description */ + rval = 1; + return (rval); + } + + if (MD_MNSET_DESC(sd)) { + /* + * Get a count of the hosts in the set and also lock the set + * on those hosts that know about it. 
+ */ + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + has_set = nodehasset(sp, nd->nd_nodename, + NHS_NST_EQ, ep); + + /* + * The host is not aware of this set (has_set < 0) or + * the set does not match (has_set == 0). This check + * prevents the code getting confused by an apparent + * inconsistancy in the set's state, this is in the + * purge code so something is broken in any case and + * this is just trying to fix the brokeness. + */ + if (has_set <= 0) { + mdclrerror(ep); + nd->nd_flags |= MD_MN_NODE_NOSET; + } else { + num_hosts++; + if (clnt_lock_set(nd->nd_nodename, sp, ep)) { + /* + * If the force flag is set then + * ignore any RPC failures because we + * are only really interested with + * the set on local node. + */ + if (forceflg && mdanyrpcerror(ep)) { + mdclrerror(ep); + } else { + /* + * set max_node so that in the + * unlock code nodes in the + * set that have not been + * locked are not unlocked. + */ + max_node = nd->nd_nodeid; + rval = 2; + goto out1; + } + } + + } + nd = nd->nd_next; + } + max_node = 0; + } else { + /* + * Get a count of the hosts in the set and also lock the set + * on those hosts that know about it. + */ + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + has_set = nodehasset(sp, sd->sd_nodes[i], + NHS_NST_EQ, ep); + + /* + * The host is not aware of this set (has_set < 0) or + * the set does not match (has_set == 0). This check + * prevents the code getting confused by an apparent + * inconsistancy in the set's state, this is in the + * purge code so something is broken in any case and + * this is just trying to fix the brokeness. + */ + if (has_set <= 0) { + mdclrerror(ep); + /* + * set the node to NULL to prevent further + * requests to this unresponsive node. 
+ */ + sd->sd_nodes[i][0] = '\0'; + } else { + num_hosts++; + if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { + /* + * If the force flag is set then + * ignore any RPC failures because we + * are only really interested with + * the set on local node. + */ + if (forceflg && mdanyrpcerror(ep)) { + mdclrerror(ep); + } else { + rval = 2; + /* + * set max_node so that in the + * unlock code nodes in the + * set that have not been + * locked are not unlocked. + */ + max_node = i; + goto out1; + } + } + } + } + max_node = i; /* now MD_MAXSIDES */ + } + if (!bypass_cluster) { + /* + * If there is only one host associated with the + * set then remove the set from the cluster. + */ + if (num_hosts == 1) { + if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) { + if (metad_isautotakebyname(sp->setname)) { + delete_end = 0; + } else { + mdclrerror(ep); + rval = 3; + goto out1; + } + } + } + } + + if (MD_MNSET_DESC(sd)) { + /* + * Get a count of the hosts in the set and also lock the set + * on those hosts that know about it. + */ + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + if (nd->nd_nodeid != sd->sd_mn_mynode->nd_nodeid) { + /* + * Tell the remote node to remove this node + */ + if (clnt_delhosts(nd->nd_nodename, sp, 1, + &thishost, ep) == -1) { + /* + * If we fail to delete ourselves + * from the remote host it does not + * really matter because the set is + * being "purged" from this node. The + * set can be purged from the other + * node at a later time. 
+ */ + mdclrerror(ep); + } + nd = nd->nd_next; + continue; + } + /* remove the set from this host */ + if (clnt_delset(nd->nd_nodename, sp, ep) == -1) { + md_perror(dgettext(TEXT_DOMAIN, "delset")); + if (!bypass_cluster && num_hosts == 1) + (void) sdssc_delete_end(sp->setname, + SDSSC_CLEANUP); + mdclrerror(ep); + goto out1; + } + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + if (strcmp(thishost, sd->sd_nodes[i]) != 0) { + /* + * Tell the remote node to remove this node + */ + if (clnt_delhosts(sd->sd_nodes[i], sp, 1, + &thishost, ep) == -1) { + /* + * If we fail to delete ourselves + * from the remote host it does not + * really matter because the set is + * being "purged" from this node. The + * set can be purged from the other + * node at a later time. + */ + mdclrerror(ep); + } + continue; + } + + /* remove the set from this host */ + if (clnt_delset(sd->sd_nodes[i], sp, ep) == -1) { + md_perror(dgettext(TEXT_DOMAIN, "delset")); + if (!bypass_cluster && num_hosts == 1) + (void) sdssc_delete_end(sp->setname, + SDSSC_CLEANUP); + mdclrerror(ep); + goto out1; + } + } + } + + if (!bypass_cluster && num_hosts == 1) { + if (delete_end && sdssc_delete_end(sp->setname, SDSSC_COMMIT) == + SDSSC_ERROR) { + rval = 4; + } + } + +out1: + + cl_sk = cl_get_setkey(sp->setno, sp->setname); + + /* + * Remove the set lock on those nodes that had the set locked + * max_node will either be MD_MAXSIDES or array index of the last + * node contacted (or rather failed to contact) for traditional + * diskset. For a MN diskset, max_node is the node_id of the node + * that failed the lock. 
+ */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + if (nd->nd_nodeid == max_node) + break; + if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { + if (forceflg && mdanyrpcerror(&xep)) { + mdclrerror(&xep); + nd = nd->nd_next; + continue; + } + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = 5; + } + nd = nd->nd_next; + } + } else { + for (i = 0; i < max_node; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) { + if (forceflg && mdanyrpcerror(&xep)) { + mdclrerror(&xep); + continue; + } + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = 5; + } + } + } + + cl_set_setkey(NULL); + + return (rval); +} + +int +meta_set_query( + mdsetname_t *sp, + mddb_dtag_lst_t **dtlpp, + md_error_t *ep +) +{ + mddb_dtag_get_parm_t dtgp; + + (void) memset(&dtgp, '\0', sizeof (mddb_dtag_get_parm_t)); + dtgp.dtgp_setno = sp->setno; + + /*CONSTCOND*/ + while (1) { + if (metaioctl(MD_MED_GET_TAG, &dtgp, &dtgp.dtgp_mde, NULL) != 0) + if (! 
mdismddberror(&dtgp.dtgp_mde, MDE_DB_NOTAG) || + *dtlpp == NULL) + return (mdstealerror(ep, &dtgp.dtgp_mde)); + else + break; + + /* + * Run to the end of the list + */ + for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx) + /* void */; + + *dtlpp = Zalloc(sizeof (mddb_dtag_lst_t)); + + (void) memmove(&(*dtlpp)->dtl_dt, &dtgp.dtgp_dt, + sizeof (mddb_dtag_t)); + + dtgp.dtgp_dt.dt_id++; + } + return (0); +} + +/* + * return drivename get by key + */ +mddrivename_t * +metadrivename_withdrkey( + mdsetname_t *sp, + side_t sideno, + mdkey_t key, + int flags, + md_error_t *ep +) +{ + char *nm; + mdname_t *np; + mddrivename_t *dnp; + ddi_devid_t devidp; + md_set_desc *sd; + + if ((sd = metaget_setdesc(sp, ep)) == NULL) { + return (NULL); + } + + /* get namespace info */ + if (MD_MNSET_DESC(sd)) { + if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno, + key, ep)) == NULL) + return (NULL); + } else { + if ((nm = meta_getnmbykey(MD_LOCAL_SET, sideno+SKEW, + key, ep)) == NULL) + return (NULL); + } + + /* get device name */ + if (flags & PRINT_FAST) { + if ((np = metaname_fast(&sp, nm, ep)) == NULL) { + Free(nm); + return (NULL); + } + } else { + if ((np = metaname(&sp, nm, ep)) == NULL) { + Free(nm); + return (NULL); + } + } + Free(nm); + + /* make sure it's OK */ + if ((! (flags & MD_BASICNAME_OK)) && (metachkcomp(np, ep) != 0)) + return (NULL); + + /* get drivename */ + dnp = np->drivenamep; + dnp->side_names_key = key; + + /* + * Skip the following devid check if dnp is did device + * The device id is disabled for did device due to the + * lack of minor name support in the did driver. 
The following + * devid code path can set and propagate the error and + * eventually prevent did disks from being added to the + * diskset under SunCluster systems + */ + if (strncmp(dnp->rname, "/dev/did/", strlen("/dev/did/")) == 0) { + goto out; + } + + /* Also, Skip the check if MN diskset, no devid's */ + if (MD_MNSET_DESC(sd)) { + goto out; + } + + /* + * Get the devid associated with the key. + * + * If a devid was returned, it MUST be valid even in + * the case where a device id has been "updated". The + * "update" of the device id may have occured due to + * a firmware upgrade. + */ + if ((devidp = meta_getdidbykey(MD_LOCAL_SET, sideno+SKEW, key, ep)) + != NULL) { + dnp->devid = devid_str_encode(devidp, NULL); + free(devidp); + } else { + /* + * It is okay if replica is not in devid mode + */ + if (mdissyserror(ep, MDDB_F_NODEVID)) { + mdclrerror(ep); + goto out; + } + + /* + * devid is missing so this means that we have + * just upgraded from a configuration where + * devid's were not used so try to add in + * the devid and requery. + */ + if (meta_setdid(MD_LOCAL_SET, sideno + SKEW, key, + ep) < 0) + return (NULL); + if ((devidp = (ddi_devid_t)meta_getdidbykey(MD_LOCAL_SET, + sideno+SKEW, key, ep)) == NULL) + return (NULL); + dnp->devid = devid_str_encode(devidp, NULL); + devid_free(devidp); + } + +out: + if (flags & MD_BYPASS_DAEMON) + return (dnp); + + if (get_sidenmlist(sp, dnp, ep)) + return (NULL); + + /* return success */ + return (dnp); +} + +void +metafreedrivedesc(md_drive_desc **dd) +{ + md_drive_desc *p, *next = NULL; + + for (p = *dd; p != NULL; p = next) { + next = p->dd_next; + Free(p); + } + *dd = NULL; +} + +md_drive_desc * +metaget_drivedesc( + mdsetname_t *sp, + int flags, + md_error_t *ep +) +{ + side_t sideno = MD_SIDEWILD; + + assert(! 
(flags & MD_BYPASS_DAEMON)); + + if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD) + return (NULL); + + return (metaget_drivedesc_sideno(sp, sideno, flags, ep)); +} + +md_drive_desc * +metaget_drivedesc_fromnamelist( + mdsetname_t *sp, + mdnamelist_t *nlp, + md_error_t *ep +) +{ + md_set_desc *sd; + mdnamelist_t *p; + md_drive_desc *dd = NULL; + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (NULL); + + for (p = nlp; p != NULL; p = p->next) + (void) metadrivedesc_append(&dd, p->namep->drivenamep, 0, 0, + sd->sd_ctime, sd->sd_genid, MD_DR_ADD); + + return (dd); +} + +md_drive_desc * +metaget_drivedesc_sideno( + mdsetname_t *sp, + side_t sideno, + int flags, + md_error_t *ep +) +{ + md_set_desc *sd = NULL; + + assert(! (flags & MD_BYPASS_DAEMON)); + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (NULL); + + if (sd->sd_drvs) + return (sd->sd_drvs); + + if ((sd->sd_drvs = dr2drivedesc(sp, sideno, flags, ep)) == NULL) + return (NULL); + + return (sd->sd_drvs); +} + +int +metaget_setownership( + mdsetname_t *sp, + md_error_t *ep +) +{ + md_set_desc *sd; + int bool; + int i; + md_mnnode_desc *nd; + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + /* If node isn't alive, can't own diskset */ + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd->nd_flags &= ~MD_MN_NODE_OWN; + nd = nd->nd_next; + continue; + } + /* + * If can't communicate with rpc.metad, then mark + * this node as not an owner. That node may + * in fact, be an owner, but without rpc.metad running + * that node can't do much. 
+ */ + if (clnt_ownset(nd->nd_nodename, sp, &bool, ep) == -1) { + nd->nd_flags &= ~MD_MN_NODE_OWN; + } else if (bool == TRUE) { + nd->nd_flags |= MD_MN_NODE_OWN; + } else { + nd->nd_flags &= ~MD_MN_NODE_OWN; + } + nd = nd->nd_next; + } + return (0); + } + + /* Rest of code handles traditional disksets */ + + for (i = 0; i < MD_MAXSIDES; i++) + sd->sd_isown[i] = 0; + + if (clnt_ownset(mynode(), sp, &bool, ep) == -1) + return (-1); + + if (bool == TRUE) + sd->sd_isown[getmyside(sp, ep)] = 1; + + return (0); +} + +char * +mynode(void) +{ + static struct utsname myuname; + static int done = 0; + + if (! done) { + if (uname(&myuname) == -1) { + md_perror(dgettext(TEXT_DOMAIN, "uname")); + assert(0); + } + done = 1; + } + return (myuname.nodename); +} + +int +strinlst(char *str, int cnt, char **lst) +{ + int i; + + for (i = 0; i < cnt; i++) + if (strcmp(lst[i], str) == 0) + return (TRUE); + + return (FALSE); +} + +/* + * meta_get_reserved_names + * returns an mdnamelist_t of reserved slices + * reserved slices are those that are used but don't necessarily + * show up as metadevices (ex. 
reserved slice for db in sets, logs)
 */

/*
 * Input:
 *	sp	set to examine; nothing is collected for the local set
 *	nlpp	list the reserved slice names are appended to
 *	options	unused
 *	ep	error return
 *
 * Returns:
 *	the number of replica slices appended (trans log names are
 *	appended but not counted), or -1 on failure.
 */
/*ARGSUSED*/
int
meta_get_reserved_names(
	mdsetname_t	*sp,
	mdnamelist_t	**nlpp,
	int		options,
	md_error_t	*ep)
{
	int		 count = 0;
	mdname_t	*np = NULL;
	mdnamelist_t	*transnlp = NULL;
	mdnamelist_t	**tailpp = nlpp;
	mdnamelist_t	*nlp;
	md_drive_desc	*dd, *di;

	if (metaislocalset(sp))
		goto out;

	/* an empty drive list is fine; only a recorded error is fatal */
	if (!(dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep)) && !mdisok(ep)) {
		count = -1;
		goto out;
	}

	/* db in for sets on reserved slice */
	for (di = dd; di && count >= 0; di = di->dd_next) {
		uint_t	rep_slice;

		/*
		 * Add the name struct to the end of the
		 * namelist but keep a pointer to the last
		 * element so that we don't incur the overhead
		 * of traversing the list each time
		 */
		if (di->dd_dnp &&
		    (meta_replicaslice(di->dd_dnp, &rep_slice, ep) == 0) &&
		    (np = metaslicename(di->dd_dnp, rep_slice, ep)) &&
		    (tailpp = meta_namelist_append_wrapper(tailpp, np)))
			count++;
		else
			count = -1;
	}

	/*
	 * The replica pass failed.  tailpp may have been set to NULL by the
	 * failing append, so stop here rather than go on appending log
	 * names through it; the result is -1 either way.
	 */
	if (count < 0)
		goto out;

	/* now find logs */
	if (meta_get_trans_names(sp, &transnlp, 0, ep) < 0) {
		count = -1;
		goto out;
	}

	for (nlp = transnlp; (nlp != NULL); nlp = nlp->next) {
		mdname_t	*transnp = nlp->namep;
		md_trans_t	*transp;

		if ((transp = meta_get_trans(sp, transnp, ep)) == NULL) {
			count = -1;
			goto out;
		}
		if (transp->lognamep) {
			/*
			 * Add the name struct to the end of the
			 * namelist but keep a pointer to the last
			 * element so that we don't incur the overhead
			 * of traversing the list each time
			 */
			tailpp = meta_namelist_append_wrapper(
			    tailpp, transp->lognamep);
		}
	}
out:
	metafreenamelist(transnlp);
	return (count);
}

/*
 * Entry point to join a node to MultiNode diskset.
 *
 * Validate host in diskset.
 *	- Should be in membership list from API
 *	- Should not already be joined into diskset.
+ * - Set must have drives + * Assume valid configuration is stored in the set/drive/node records + * in the local mddb since no node or drive can be added to the MNset + * unless all drives and nodes are available. Reconfig steps will + * resync all ALIVE nodes in case of panic in critical areas. + * + * Lock down the set. + * Verify host is a member of this diskset. + * If drives exist in the configuration, load the mddbs. + * Set this node to active by notifying master if one exists. + * If this is the first node active in the diskset, this node + * becomes the master. + * Unlock the set. + * + * Mirror Resync: + * If this node is the last node to join the set and clustering + * isn't running, then start the 'metasync -r' type resync + * on all mirrors in this diskset. + * If clustering is running, this resync operation will + * be handled by the reconfig steps and should NOT + * be handled during a join operation. + * + * There are multiple return values in order to assist + * the join operation of all sets in the metaset command. + * + * Return values: + * 0 - Node successfully joined to set. + * -1 - Join attempted but failed + * - any failure from libmeta calls + * - node not in the member list + * -2 - Join not attempted since + * - this set had no drives in set + * - this node already joined to set + * - set is not a multinode set + * -3 - Node joined to STALE set. 
+ */ +extern int +meta_set_join( + mdsetname_t *sp, + md_error_t *ep +) +{ + md_set_desc *sd; + md_drive_desc *dd; + md_mnnode_desc *nd, *nd2, my_nd; + int rval = 0; + md_setkey_t *cl_sk; + md_error_t xep = mdnullerror; + md_error_t ep_snarf = mdnullerror; + int master_flag = 0; + md_mnset_record *mas_mnsr = NULL; + int clear_nr_flags = 0; + md_mnnode_record *nr; + int stale_set = 0; + int rb_flags = 0; + int stale_bool = FALSE; + int suspendall_flag = 0; + int suspend1_flag = 0; + sigset_t oldsigs; + int send_reinit = 0; + + if ((sd = metaget_setdesc(sp, ep)) == NULL) { + return (-1); + } + + /* Must be a multinode diskset */ + if (!MD_MNSET_DESC(sd)) { + (void) mderror(ep, MDE_NOT_MN, sp->setname); + return (-2); + } + + /* Verify that the node is ALIVE (i.e. is in the API membership list) */ + if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_ALIVE)) { + (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, sp->setno, + sd->sd_mn_mynode->nd_nodename, NULL, + sp->setname); + return (-1); + } + + /* Make sure we are blocking all signals */ + if (procsigs(TRUE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + /* + * Lock the set on current set members. + * For MN diskset lock_set and SUSPEND are used to protect against + * other meta* commands running on the other nodes. + */ + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + if (clnt_lock_set(nd->nd_nodename, sp, ep)) { + rval = -1; + goto out; + } + nd = nd->nd_next; + } + + /* + * Lock out other meta* commands by suspending + * class 1 messages across the diskset. + */ + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, + sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) { + rval = -1; + goto out; + } + suspend1_flag = 1; + nd = nd->nd_next; + } + + /* + * Verify that this host is a member (in the host list) of the set. 
+ */ + nd = sd->sd_nodelist; + while (nd) { + if (strcmp(mynode(), nd->nd_nodename) == 0) { + break; + } + nd = nd->nd_next; + } + if (!nd) { + (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, + sd->sd_mn_mynode->nd_nodename, NULL, + sp->setname); + rval = -1; + goto out; + } + + /* + * Need to return failure if host is already 'joined' + * into the set. This is done so that if later the user + * issues a command to join all sets and a failure is + * encountered - that the resulting cleanup effort + * (withdrawing from all sets that were joined + * during that command) won't withdraw from this set. + */ + if (nd->nd_flags & MD_MN_NODE_OWN) { + rval = -2; + goto out2; + } + + /* + * Call metaget_setownership that calls each node in diskset and + * marks in set descriptor if node is an owner of the set or not. + * metaget_setownership checks to see if a node is an owner by + * checking to see if that node's kernel has the mddb loaded. + * If a node had panic'd during a reconfig or an + * add/delete/join/withdraw operation, the other nodes' node + * records may not reflect the current state of the diskset, + * so calling metaget_setownership is the safest thing to do. + */ + if (metaget_setownership(sp, ep) == -1) { + rval = -1; + goto out; + } + + /* If first active member of diskset, become the master. */ + nd = sd->sd_nodelist; + while (nd) { + if (nd->nd_flags & MD_MN_NODE_OWN) + break; + nd = nd->nd_next; + } + if (nd == NULL) + master_flag = 1; + + /* + * If not first active member of diskset, then get the + * master information from a node that is already joined + * and set the master information for this node. Be sure + * that this node (the already joined node) has its own + * join flag set. If not, then this diskset isn't currently + * consistent and shouldn't allow a node to join. 
This diskset + * inconsistency should only occur when a node has panic'd in + * the set while doing a metaset operation and the sysadmin is + * attempting to join a node into the set. This inconsistency + * will be fixed during a reconfig cycle which should be occurring + * soon since a node panic'd. + * + * If unable to get this information from an owning node, then + * this diskset isn't currently consistent and shouldn't + * allow a node to join. + */ + if (!master_flag) { + /* get master information from an owner (joined) node */ + if (clnt_mngetset(nd->nd_nodename, sp->setname, + sp->setno, &mas_mnsr, ep) == -1) { + rval = -1; + goto out; + } + + /* Verify that owner (joined) node has its own JOIN flag set */ + nr = mas_mnsr->sr_nodechain; + while (nr) { + if ((nd->nd_nodeid == nr->nr_nodeid) && + ((nr->nr_flags & MD_MN_NODE_OWN) == NULL)) { + (void) mddserror(ep, MDE_DS_NODENOSET, + sp->setno, nd->nd_nodename, NULL, + nd->nd_nodename); + free_sr((md_set_record *)mas_mnsr); + rval = -1; + goto out; + } + nr = nr->nr_next; + } + + /* + * Does master have set marked as STALE? + * If so, need to pass this down to kernel when + * this node snarfs the set. 
+ */ + if (clnt_mn_is_stale(nd->nd_nodename, sp, + &stale_bool, ep) == -1) { + rval = -1; + goto out; + } + + /* set master information in my rpc.metad's set record */ + if (clnt_mnsetmaster(mynode(), sp, mas_mnsr->sr_master_nodenm, + mas_mnsr->sr_master_nodeid, ep)) { + free_sr((md_set_record *)mas_mnsr); + rval = -1; + goto out; + } + + /* set master information in my cached set desc */ + (void) strcpy(sd->sd_mn_master_nodenm, + mas_mnsr->sr_master_nodenm); + sd->sd_mn_master_nodeid = mas_mnsr->sr_master_nodeid; + nd2 = sd->sd_nodelist; + while (nd2) { + if (nd2->nd_nodeid == mas_mnsr->sr_master_nodeid) { + sd->sd_mn_masternode = nd2; + break; + } + nd2 = nd2->nd_next; + } + free_sr((md_set_record *)mas_mnsr); + + /* + * Set the node flags in mynode's rpc.metad node records for + * the nodes that are in the diskset. Can use my sd + * since earlier call to metaget_setownership set the + * owner flags based on whether that node had snarfed + * the MN diskset mddb. Reconfig steps guarantee that + * return of metaget_setownership will match the owning + * node's owner list except in the case where a node + * has just panic'd and in this case, a reconfig will + * be starting immediately and the owner lists will + * be sync'd up by the reconfig. + * + * Flag of SET means to take no action except to + * set the node flags as given in the nodelist linked list. + */ + if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, + MD_NR_SET, NULL, ep)) { + rval = -1; + goto out; + } + } + + /* + * Read in the mddb if there are drives in the set. + */ + if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), + ep)) == NULL) { + /* No drives in list */ + if (! mdisok(ep)) { + rval = -1; + goto out; + } + rval = -2; + goto out; + } + + /* + * Notify rpc.mdcommd on all nodes of a nodelist change. + * Start by suspending rpc.mdcommd (which drains it of all messages), + * then change the nodelist followed by a reinit and resume. 
+ */ + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, sp, + MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) { + rval = -1; + goto out; + } + suspendall_flag = 1; + nd = nd->nd_next; + } + + /* Set master in my set record in rpc.metad */ + if (master_flag) { + if (clnt_mnsetmaster(mynode(), sp, + sd->sd_mn_mynode->nd_nodename, + sd->sd_mn_mynode->nd_nodeid, ep)) { + rval = -1; + goto out; + } + } + /* Causes mddbs to be loaded in kernel */ + if (setup_db_bydd(sp, dd, 0, ep) == -1) { + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Host not able to start diskset.")); + rval = -1; + goto out; + } + + if (! mdisok(ep)) { + rval = -1; + goto out; + } + + /* + * Set rollback flags to 1 so that halt_set is called if a failure + * is seen after this point. If snarf_set fails, still need to + * call halt_set to cleanup the diskset. + */ + rb_flags = 1; + + /* Starts the set */ + if (snarf_set(sp, stale_bool, ep) != 0) { + if (mdismddberror(ep, MDE_DB_STALE)) { + /* + * Don't fail join, STALE means that set has + * < 50% mddbs. + */ + (void) mdstealerror(&ep_snarf, ep); + stale_set = 1; + } else if (mdisok(ep)) { + /* If snarf failed, but no error was set - set it */ + (void) mdmddberror(ep, MDE_DB_NOTNOW, NODEV64, + sp->setno, 0, NULL); + rval = -1; + goto out; + } else if (!(mdismddberror(ep, MDE_DB_ACCOK))) { + /* + * Don't fail join if ACCOK; ACCOK means that mediator + * provided extra vote. + */ + rval = -1; + goto out; + } + } + + /* Did set really get snarfed? 
*/ + if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_NO) { + if (mdisok(ep)) { + /* If snarf failed, but no error was set - set it */ + (void) mdmddberror(ep, MDE_DB_NOTNOW, NODEV64, + sp->setno, 0, NULL); + } + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Host not able to start diskset.")); + rval = -1; + goto out; + } + + /* Change to nodelist so need to send reinit to rpc.mdcommd */ + send_reinit = 1; + + /* If first node to enter set, setup master and clear change log */ + if (master_flag) { + /* Set master in my locally cached set descriptor */ + (void) strcpy(sd->sd_mn_master_nodenm, + sd->sd_mn_mynode->nd_nodename); + sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid; + sd->sd_mn_am_i_master = 1; + + /* + * If first node to join set, then clear out change log + * entries. Change log entries are only needed when a + * change of master is occurring in a diskset that has + * multiple owners. Since this node is the first owner + * of the diskset, clear the entries. + * + * Only do this if we are in a single node non-SC3.x + * situation. + */ + if (meta_mn_singlenode() && + mdmn_reset_changelog(sp, ep, MDMN_CLF_RESETLOG) != 0) { + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to reset changelog.")); + rval = -1; + goto out; + } + } + + /* Set my locally cached flag */ + sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN; + + /* + * Set this node's own flag on all joined nodes in the set + * (including my node). + */ + clear_nr_flags = 1; + + my_nd = *(sd->sd_mn_mynode); + my_nd.nd_next = NULL; + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_OWN)) { + nd = nd->nd_next; + continue; + } + if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, + MD_NR_JOIN, NULL, ep)) { + rval = -1; + goto out; + } + nd = nd->nd_next; + } + +out: + if (rval != NULL) { + /* + * If rollback flag is 1, then node was joined to set. + * Since an error occurred, withdraw node from set in + * order to rollback to before command was run. 
+ * Need to preserve ep so that calling function can + * get error information. + */ + if (rb_flags == 1) { + if (halt_set(sp, &xep)) { + mdclrerror(&xep); + } + } + + /* + * If error, reset master to INVALID. + * Ignore error since (next) first node to successfully join + * will set master on all nodes. + */ + (void) clnt_mnsetmaster(mynode(), sp, "", + MD_MN_INVALID_NID, &xep); + mdclrerror(&xep); + /* Reset master in my locally cached set descriptor */ + sd->sd_mn_master_nodeid = MD_MN_INVALID_NID; + sd->sd_mn_am_i_master = 0; + + /* + * If nr flags set on other nodes, reset them. + */ + if (clear_nr_flags) { + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_OWN)) { + nd = nd->nd_next; + continue; + } + (void) clnt_upd_nr_flags(nd->nd_nodename, sp, + &my_nd, MD_NR_WITHDRAW, NULL, &xep); + mdclrerror(&xep); + nd = nd->nd_next; + } + /* Reset my locally cached flag */ + sd->sd_mn_mynode->nd_flags &= ~MD_MN_NODE_OWN; + } + } + + /* + * Notify rpc.mdcommd on all nodes of a nodelist change. + * Send reinit command to mdcommd which forces it to get + * fresh set description. + */ + if (send_reinit) { + /* Send reinit */ + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + + /* Class is ignored for REINIT */ + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, + sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { + /* + * We are here because we failed to resume + * rpc.mdcommd. However we potentially have + * an error from the previous call + * If the previous call did fail, we capture + * that error and generate a perror with + * the string, "Unable to resume...". + * Setting rval to -1 ensures that in the + * next iteration of the loop, ep is not + * clobbered. 
+ */ + if (rval == 0) + (void) mdstealerror(ep, &xep); + else + mdclrerror(&xep); + rval = -1; + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to reinit rpc.mdcommd.")); + } + nd = nd->nd_next; + } + + } + +out2: + /* + * Unlock diskset by resuming messages across the diskset. + * Just resume all classes so that resume is the same whether + * just one class was locked or all classes were locked. + */ + if ((suspend1_flag) || (suspendall_flag)) { + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, + sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { + /* + * We are here because we failed to resume + * rpc.mdcommd. However we potentially have + * an error from the previous call + * If the previous call did fail, we capture + * that error and generate a perror with + * the string, "Unable to resume...". + * Setting rval to -1 ensures that in the + * next iteration of the loop, ep is not + * clobbered. + */ + if (rval == 0) + (void) mdstealerror(ep, &xep); + else + mdclrerror(&xep); + rval = -1; + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to resume rpc.mdcommd.")); + } + nd = nd->nd_next; + } + meta_ping_mnset(sp->setno); + } + + /* + * Unlock set. This flushes the caches on the servers. + */ + cl_sk = cl_get_setkey(sp->setno, sp->setname); + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + else + mdclrerror(&xep); + rval = -1; + } + nd = nd->nd_next; + } + + /* + * If this node is the last to join the diskset and clustering isn't + * running, then resync the mirrors in the diskset. We have to wait + * until all nodes are joined so that the status gets propagated to + * all of the members of the set. 
+ * Ignore any error from the resync as the join function shouldn't fail + * because the mirror resync had a problem. + * + * Don't start resync if set is stale. + */ + if ((rval == 0) && (sdssc_bind_library() != SDSSC_OKAY) && + (stale_set != 1)) { + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_OWN)) + break; + nd = nd->nd_next; + } + /* + * nd set to NULL means that we have no nodes in the set that + * haven't joined. In this case we start the resync. + */ + if (nd == NULL) { + (void) meta_mirror_resync_all(sp, 0, &xep); + mdclrerror(&xep); + } + } + + /* Update ABR state for all soft partitions */ + (void) meta_sp_update_abr(sp, &xep); + mdclrerror(&xep); + + /* + * call metaflushsetnames to reset local cache for master and + * node information. + */ + metaflushsetname(sp); + + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + /* + * If no error and stale_set is set, then set ep back + * to ep from snarf_set call and return -3. If another error + * occurred and rval is not 0, then that error would have + * caused the node to be withdrawn from the set and would + * have set ep to that error information. + */ + if ((rval == 0) && (stale_set)) { + (void) mdstealerror(ep, &ep_snarf); + return (-3); + } + + return (rval); +} + +/* + * Entry point to withdraw a node from MultiNode diskset. + * + * Validate host in diskset. + * - Should be joined into diskset. + * Assume valid configuration is stored in the set/drive/node records + * in the local mddb since no node or drive can be added to the MNset + * unless all drives and nodes are available. Reconfig steps will + * resync all ALIVE nodes in case of panic in critical areas. + * + * Lock down the set. + * Verify that drives exist in configuration. + * Verify host is a member of this diskset. + * Verify host is an owner of the diskset (host is joined to diskset). 
+ * Only allow withdrawal of master node if master node is the only joined + * in the diskset. + * Halt the diskset on this node. + * Reset Master on this node. + * Updated node flags that this node with withdrawn. + * Unlock the set. + * + * Return values: + * 0 - Node successfully withdrew from set. + * -1 - Withdrawal attempted but failed + * - any failure from libmeta calls + * - node not in the member list + * -2 - Withdrawal not attempted since + * - this set had no drives in set + * - this node not joined to set + * - set is not a multinode set + */ +extern int +meta_set_withdraw( + mdsetname_t *sp, + md_error_t *ep +) +{ + md_set_desc *sd; + md_drive_desc *dd = 0; + md_mnnode_desc *nd, my_nd; + int rval = 0; + md_setkey_t *cl_sk; + md_error_t xep = mdnullerror; + int set_halted = 0; + int suspendall_flag = 0; + int suspend1_flag = 0; + bool_t stale_bool = FALSE; + mddb_config_t c; + int node_id_list[1]; + sigset_t oldsigs; + int send_reinit = 0; + + if ((sd = metaget_setdesc(sp, ep)) == NULL) { + return (-1); + } + + /* Must be a multinode diskset */ + if (!MD_MNSET_DESC(sd)) { + (void) mderror(ep, MDE_NOT_MN, sp->setname); + return (-1); + } + + /* Make sure we are blocking all signals */ + if (procsigs(TRUE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + /* + * Lock the set on current set members. + * For MN diskset lock_set and SUSPEND are used to protect against + * other meta* commands running on the other nodes. + */ + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + if (clnt_lock_set(nd->nd_nodename, sp, ep)) { + rval = -1; + goto out; + } + nd = nd->nd_next; + } + /* + * Lock out other meta* commands by suspending + * class 1 messages across the diskset. 
+ */ + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, + sp, MD_MSG_CLASS1, MD_MSCF_NO_FLAGS, ep)) { + rval = -1; + goto out; + } + suspend1_flag = 1; + nd = nd->nd_next; + } + + /* Get list of drives - needed in case of failure */ + if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), + ep)) == NULL) { + /* Error getting drives in list */ + if (! mdisok(ep)) { + rval = -1; + goto out2; + } + /* no drives in list */ + rval = -2; + goto out2; + } + + /* + * Verify that this host is a member (in the host list) of the set. + */ + nd = sd->sd_nodelist; + while (nd) { + if (strcmp(mynode(), nd->nd_nodename) == 0) { + break; + } + nd = nd->nd_next; + } + if (!nd) { + (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, + sd->sd_mn_mynode->nd_nodename, NULL, + sp->setname); + rval = -1; + goto out2; + } + + /* + * Call metaget_setownership that calls each node in diskset and + * marks in set descriptor if node is an owner of the set or not. + * metaget_setownership checks to see if a node is an owner by + * checking to see if that node's kernel has the mddb loaded. + * If a node had panic'd during a reconfig or an + * add/delete/join/withdraw operation, the other nodes' node + * records may not reflect the current state of the diskset, + * so calling metaget_setownership is the safest thing to do. + */ + if (metaget_setownership(sp, ep) == -1) { + rval = -1; + goto out2; + } + + /* + * Verify that this node is joined + * to diskset (i.e. is an owner of the diskset). + */ + if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { + rval = -2; + goto out2; + } + + /* + * For a MN diskset, only withdraw master if it is + * the only joined node. 
+ */ + if (sd->sd_mn_master_nodeid == sd->sd_mn_mynode->nd_nodeid) { + nd = sd->sd_nodelist; + while (nd) { + /* Skip my node since checking for other owners */ + if (nd->nd_nodeid == sd->sd_mn_master_nodeid) { + nd = nd->nd_next; + continue; + } + /* If another owner node if found, error */ + if (nd->nd_flags & MD_MN_NODE_OWN) { + (void) mddserror(ep, MDE_DS_WITHDRAWMASTER, + sp->setno, + sd->sd_mn_mynode->nd_nodename, NULL, + sp->setname); + rval = -1; + goto out2; + } + nd = nd->nd_next; + } + } + + /* + * Is current set STALE? + */ + (void) memset(&c, 0, sizeof (c)); + c.c_id = 0; + c.c_setno = sp->setno; + if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { + (void) mdstealerror(ep, &c.c_mde); + rval = -1; + goto out; + } + if (c.c_flags & MDDB_C_STALE) { + stale_bool = TRUE; + } + + /* + * Notify rpc.mdcommd on all nodes of a nodelist change. + * Start by suspending rpc.mdcommd (which drains it of all messages), + * then change the nodelist followed by a reinit and resume. + */ + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, + sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) { + rval = -1; + goto out; + } + suspendall_flag = 1; + nd = nd->nd_next; + } + + /* + * Withdraw the set - halt set. + * This will fail if any I/O is occuring to any metadevice which + * includes a resync to a mirror metadevice. + */ + set_halted = 1; + if (halt_set(sp, ep)) { + /* Was set actually halted? 
*/ + if (own_set(sp, NULL, TRUE, ep) == MD_SETOWNER_YES) { + set_halted = 0; + } + rval = -1; + goto out; + } + + /* Change to nodelist so need to send reinit to rpc.mdcommd */ + send_reinit = 1; + + /* Reset master on withdrawn node */ + if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, "", + MD_MN_INVALID_NID, ep)) { + rval = -1; + goto out; + } + + /* Mark my node as withdrawn and send to other nodes */ + nd = sd->sd_nodelist; + my_nd = *(sd->sd_mn_mynode); /* structure copy */ + my_nd.nd_next = NULL; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_OWN)) { + nd = nd->nd_next; + continue; + } + if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, + MD_NR_WITHDRAW, NULL, ep)) { + rval = -1; + goto out; + } + nd = nd->nd_next; + } + + /* + * If withdrawn node is a mirror owner, reset mirror owner + * to NULL. If an error occurs, print a warning and continue. + * Don't fail metaset because of mirror owner reset problem since + * next node to grab mirror will resolve this issue. + * Before next node grabs mirrors, metaset will show the withdrawn + * node as owner which is why an attempt to reset the mirror owner + * is made. 
+ */ + node_id_list[0] = sd->sd_mn_mynode->nd_nodeid; /* Setup my nodeid */ + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_OWN)) { + nd = nd->nd_next; + continue; + } + if (clnt_reset_mirror_owner(nd->nd_nodename, sp, + 1, &node_id_list[0], &xep) == 01) { + mde_perror(&xep, dgettext(TEXT_DOMAIN, + "Unable to reset mirror owner on node %s"), + nd->nd_nodename); + mdclrerror(&xep); + } + nd = nd->nd_next; + } + +out: + if (rval == -1) { + /* Rejoin node - Mark node as joined and send to other nodes */ + nd = sd->sd_nodelist; + my_nd = *(sd->sd_mn_mynode); /* structure copy */ + my_nd.nd_next = NULL; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_OWN)) { + nd = nd->nd_next; + continue; + } + if (clnt_upd_nr_flags(nd->nd_nodename, sp, &my_nd, + MD_NR_JOIN, NULL, &xep)) { + mdclrerror(&xep); + } + nd = nd->nd_next; + } + + /* Set master on withdrawn node */ + if (clnt_mnsetmaster(sd->sd_mn_mynode->nd_nodename, sp, + sd->sd_mn_master_nodenm, + sd->sd_mn_master_nodeid, &xep)) { + mdclrerror(&xep); + } + + /* Join set if halt_set had succeeded */ + if (set_halted) { + if (setup_db_bydd(sp, dd, 0, &xep) == -1) { + mdclrerror(&xep); + } + /* If set previously stale - make it so at re-join */ + if (snarf_set(sp, stale_bool, &xep) != 0) { + mdclrerror(&xep); + (void) halt_set(sp, &xep); + mdclrerror(&xep); + } + } + } + + /* + * Notify rpc.mdcommd on all nodes of a nodelist change. + * Send reinit command to mdcommd which forces it to get + * fresh set description. + */ + if (send_reinit) { + /* Send reinit */ + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + + /* Class is ignored for REINIT */ + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, + sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { + /* + * We are here because we failed to resume + * rpc.mdcommd. However we potentially have + * an error from the previous call. 
+ * If the previous call did fail, we + * capture that error and generate a perror + * withthe string, "Unable to resume...". + * Setting rval to -1 ensures that in the + * next iteration of the loop, ep is not + * clobbered. + */ + if (rval == 0) + (void) mdstealerror(ep, &xep); + else + mdclrerror(&xep); + rval = -1; + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to reinit rpc.mdcommd.")); + } + nd = nd->nd_next; + } + } + +out2: + /* + * Unlock diskset by resuming messages across the diskset. + * Just resume all classes so that resume is the same whether + * just one class was locked or all classes were locked. + */ + if ((suspend1_flag) || (suspendall_flag)) { + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, + sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { + /* + * We are here because we failed to resume + * rpc.mdcommd. However we potentially have + * an error from the previous call + * If the previous call did fail, we capture + * that error and generate a perror with + * the string, "Unable to resume...". + * Setting rval to -1 ensures that in the + * next iteration of the loop, ep is not + * clobbered. + */ + if (rval == 0) + (void) mdstealerror(ep, &xep); + else + mdclrerror(&xep); + rval = -1; + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to resume rpc.mdcommd.")); + } + nd = nd->nd_next; + } + meta_ping_mnset(sp->setno); + } + + /* + * Unlock set. This flushes the caches on the servers. + */ + cl_sk = cl_get_setkey(sp->setno, sp->setname); + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + else + mdclrerror(&xep); + rval = -1; + } + nd = nd->nd_next; + } + + /* + * call metaflushsetnames to reset local cache for master and + * node information. 
+ */ + metaflushsetname(sp); + + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + return (rval); + +} + +/* + * Update nodelist with cluster member information. + * A node not in the member list will be marked + * as not ALIVE and not OWN. + * A node in the member list will be marked ALIVE, but + * the OWN bit will not be changed. + * + * If mynode isn't in the membership list, fail causing + * another reconfig cycle to be started since a non-member + * node shouldn't be taking part in the reconfig cycle. + * An empty membership list (nl == NULL) is handled the same + * way: no node is a member, so every node is marked as not + * ALIVE and not OWN. + * + * Return values: + * 0 - No problem. + * 1 - Any failure including RPC failure to my node. + */ +int +meta_reconfig_update_nodelist( + mdsetname_t *sp, + mndiskset_membershiplist_t *nl, + md_set_desc *sd, + md_error_t *ep +) +{ + mndiskset_membershiplist_t *nl2; + md_mnnode_desc *nd; + md_error_t xep = mdnullerror; + int rval = 0; + + /* + * Walk through nodelist, checking to see if each + * node is in the member list. + * If node is not a member, reset ALIVE and OWN node flag. + * If node is a member, set ALIVE. + * If mynode's OWN flag gets reset, then halt the diskset on this node. + */ + nd = sd->sd_nodelist; + while (nd) { + /* + * Search the member list for this node; nl2 is left + * NULL when the node is not found, which also covers + * an empty member list. + */ + nl2 = nl; + while (nl2) { + if (nl2->msl_node_id == nd->nd_nodeid) + break; + nl2 = nl2->next; + } + if (nl2 != NULL) { + /* If node is in member list, set ALIVE */ + nd->nd_flags |= MD_MN_NODE_ALIVE; + } else { + /* node is not in member list, mark !ALIVE and !OWN */ + /* If node is mynode, then halt set if needed */ + if (strcmp(mynode(), nd->nd_nodename) == 0) { + /* + * This shouldn't happen, but just + * in case... Any node not in the + * membership list should be dead and + * not running reconfig step1. 
+ */ + if (nd->nd_flags & MD_MN_NODE_OWN) { + if (halt_set(sp, &xep)) { + mde_perror(&xep, ""); + mdclrerror(&xep); + } + } + /* + * Return failure since this node + * (mynode) is not in the membership + * list, but process the rest of the + * nodelist first so that rpc.metad + * can be updated with the latest + * membership information. + */ + (void) mddserror(ep, + MDE_DS_NOTINMEMBERLIST, + sp->setno, nd->nd_nodename, NULL, + sp->setname); + rval = 1; + } + nd->nd_flags &= ~MD_MN_NODE_ALIVE; + nd->nd_flags &= ~MD_MN_NODE_OWN; + } + nd = nd->nd_next; + } + + /* Send this information to rpc.metad */ + if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, + MD_NR_SET, MNSET_IN_RECONFIG, &xep)) { + /* Return failure if can't send node flags to rpc.metad */ + if (rval == 0) { + (void) mdstealerror(ep, &xep); + rval = 1; + } else { + /* ep already holds the earlier error; release xep */ + mdclrerror(&xep); + } + } + return (rval); +} + +/* + * Choose master determines the master for a diskset. + * Each node determines the master on its own and + * adds this information to its local rpc.metad nodelist + * and also sends it to the kernel. + * + * Nodelist in set descriptor (sd) is sorted in + * monotonically increasing sequence of nodeid. + * + * Return values: + * 0 - No problem. + * 205 - There was an RPC problem to another node. + * -1 - There was an error. This could be an RPC error to my node. + * This is a catastrophic failure causing node to panic. + */ +int +meta_reconfig_choose_master_for_set( + mdsetname_t *sp, + md_set_desc *sd, + md_error_t *ep +) +{ + int is_owner; + md_mnset_record *mnsr = NULL; + int lowest_alive_nodeid = 0; + uint_t master_nodeid; + md_mnnode_desc *nd, *nd2; + md_mnnode_record *nr; + md_drive_desc *dd; + md_setkey_t *cl_sk; + int rval = 0; + md_error_t xep = mdnullerror; + mddb_setflags_config_t sf; + + /* + * Is current node joined to diskset? + * Don't trust flags, really check to see if mddb is snarfed. 
+ */ + if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) { + /* + * If a node is joined to the diskset, this node checks + * to see if the current master of the diskset is valid and + * is still in the membership list (ALIVE) and is + * still joined (OWN). Need to verify if master is + * really joined - don't trust the flags. (Can trust + * ALIVE since set during earlier part of reconfig cycle.) + * If the current master is valid, still in the membership + * list and joined, then master is not changed on this node. + * Just return. + * + * Verify that nodeid is valid before accessing masternode. + */ + if ((sd->sd_mn_master_nodeid != MD_MN_INVALID_NID) && + (sd->sd_mn_masternode->nd_flags & MD_MN_NODE_ALIVE)) { + if (clnt_ownset(sd->sd_mn_master_nodenm, sp, + &is_owner, ep) == -1) { + /* If RPC failure to another node return 205 */ + if ((mdanyrpcerror(ep)) && + (sd->sd_mn_mynode->nd_nodeid != + sd->sd_mn_master_nodeid)) { + return (205); + } else { + /* Any other failure */ + return (-1); + } + } else { + if (is_owner == TRUE) { + + meta_mc_log(MC_LOG5, dgettext( + TEXT_DOMAIN, "Set %s previous " + "master chosen %s (%d): %s"), + sp->setname, + sd->sd_mn_master_nodenm, + sd->sd_mn_master_nodeid, + meta_print_hrtime(gethrtime() - + start_time)); + + /* Previous master is ok - done */ + return (0); + } + } + } + + /* + * If current master is no longer in the membership list or + * is no longer joined, then this node uses the following + * algorithm: + * - node calls RPC routine clnt_ownset to get latest + * information on which nodes are owners of diskset. + * clnt_ownset checks on each node to see if its kernel + * has that diskset snarfed. 
+ */ + nd = sd->sd_nodelist; + while (nd) { + /* Don't consider node that isn't in member list */ + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + + if (clnt_ownset(nd->nd_nodename, sp, + &is_owner, ep) == -1) { + /* If RPC failure to another node return 205 */ + if ((mdanyrpcerror(ep)) && + (sd->sd_mn_mynode->nd_nodeid != + nd->nd_nodeid)) { + return (205); + } else { + /* Any other failure */ + return (-1); + } + } + + /* + * Set owner flag for each node based on whether + * that node really has a diskset mddb snarfed in + * or not. + */ + if (is_owner == TRUE) + nd->nd_flags |= MD_MN_NODE_OWN; + else + nd->nd_flags &= ~MD_MN_NODE_OWN; + + nd = nd->nd_next; + } + + /* + * - node walks through nodelist looking for nodes that are + * owners of the diskset that are in the membership list. + * - for each owner, node calls RPC routine clnt_getset to + * see if that node has its node record set to OK. + * - If so, master is chosen to be this owner node. + */ + nd = sd->sd_nodelist; + while (nd) { + /* Don't consider node that isn't in member list */ + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + + /* Don't consider a node that isn't an owner */ + if (!(nd->nd_flags & MD_MN_NODE_OWN)) { + nd = nd->nd_next; + continue; + } + + /* Does node has its own node record set to OK? 
*/ + if (clnt_mngetset(nd->nd_nodename, sp->setname, + MD_SET_BAD, &mnsr, ep) == -1) { + /* If RPC failure to another node return 205 */ + if ((mdanyrpcerror(ep)) && + (sd->sd_mn_mynode->nd_nodeid != + nd->nd_nodeid)) { + return (205); + } else { + /* Any other failure */ + return (-1); + } + } + nr = mnsr->sr_nodechain; + while (nr) { + if (nd->nd_nodeid == nr->nr_nodeid) { + if (nr->nr_flags & MD_MN_NODE_OK) { + /* Found a master */ + free_sr( + (md_set_record *)mnsr); + goto found_master; + } + } + nr = nr->nr_next; + } + free_sr((md_set_record *)mnsr); + nd = nd->nd_next; + } + + /* + * - If no owner node has its own node record on its own node + * set to OK, then this node checks all of the non-owner + * nodes that are in the membership list. + * - for each non-owner, node calls RPC routine clnt_getset to + * see if that node has its node record set to OK. + * - If set doesn't exist, don't choose node for master. + * - If so, master is chosen to be this non-owner node. + * + */ + nd = sd->sd_nodelist; + while (nd) { + /* Don't consider node that isn't in member list */ + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + + /* Only checking non-owner nodes this time around */ + if (nd->nd_flags & MD_MN_NODE_OWN) { + nd = nd->nd_next; + continue; + } + + /* Does node has its own node record set to OK? */ + if (clnt_mngetset(nd->nd_nodename, sp->setname, + MD_SET_BAD, &mnsr, ep) == -1) { + /* + * If set doesn't exist on non-owner node, + * don't consider this node for master. 
+ */ + if (mdiserror(ep, MDE_NO_SET)) { + nd = nd->nd_next; + continue; + } else if ((mdanyrpcerror(ep)) && + (sd->sd_mn_mynode->nd_nodeid != + nd->nd_nodeid)) { + /* RPC failure to another node */ + return (205); + } else { + /* Any other failure */ + return (-1); + } + } + nr = mnsr->sr_nodechain; + while (nr) { + if (nd->nd_nodeid == nr->nr_nodeid) { + if (nr->nr_flags & MD_MN_NODE_OK) { + /* Found a master */ + free_sr( + (md_set_record *)mnsr); + goto found_master; + } + } + nr = nr->nr_next; + } + free_sr((md_set_record *)mnsr); + nd = nd->nd_next; + } + + /* + * - If no node can be found that has its own node record on + * its node to be set to OK, then all alive nodes + * were in the process of being added to or deleted + * from set. Each alive node will remove all + * information pertaining to this set from its node. + * + * If all nodes in set are ALIVE, then call sdssc end routines + * since set was truly being initially created or destroyed. + */ + goto delete_set; + } else { + + /* + * If node is not joined to diskset, then this + * node uses the following algorithm: + * - If unjoined node doesn't have a node record for itself, + * just delete the diskset since diskset was in the + * process of being created. + * - node needs to find master of diskset before + * reconfig cycle, if a master existed. + * - node calls RPC routine clnt_ownset to get latest + * information on which nodes are owners of diskset. + * clnt_ownset checks on each node to see if its + * kernel has that diskset snarfed. + */ + + /* + * Is my node in the set description? + * If not, delete the set from this node. + * sr2setdesc sets sd_mn_mynode pointer to the node + * descriptor for this node if there was a node + * record for this node. 
+ * + */ + if (sd->sd_mn_mynode == NULL) { + goto delete_set; + } + + nd = sd->sd_nodelist; + while (nd) { + /* Don't consider node that isn't in member list */ + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + + if (clnt_ownset(nd->nd_nodename, sp, + &is_owner, ep) == -1) { + /* If RPC failure to another node return 205 */ + if ((mdanyrpcerror(ep)) && + (sd->sd_mn_mynode->nd_nodeid != + nd->nd_nodeid)) { + return (205); + } else { + /* Any other failure */ + return (-1); + } + } + + /* + * Set owner flag for each node based on whether + * that node really has a diskset mddb snarfed in + * or not. + */ + if (is_owner == TRUE) + nd->nd_flags |= MD_MN_NODE_OWN; + else + nd->nd_flags &= ~MD_MN_NODE_OWN; + + nd = nd->nd_next; + } + + /* + * - node walks through nodelist looking for nodes that + * are owners of the diskset that are in + * the membership list. + * - for each owner, node calls RPC routine clnt_getset to + * see if that node has a master set and to get the + * diskset description. + * - If the owner node has a set description that doesn't + * include the non-joined node in the nodelist, this node + * removes its set description of that diskset + * (i.e. removes the set from its local mddbs). This is + * handling the case of when a node was removed from a + * diskset while it was not in the cluster membership + * list. 
+ * - If that node has a master set and the master is in the + * membership list and is an owner, then either this was + * the master from before the reconfig cycle or this + * node has already chosen a new master - either way, + * the master value is valid as long as it is in the + * membership list and is an owner + * - master is chosen to be owner node's master + */ + nd = sd->sd_nodelist; + while (nd) { + /* Don't consider node that isn't in member list */ + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + + /* Don't consider a node that isn't an owner */ + if (!(nd->nd_flags & MD_MN_NODE_OWN)) { + nd = nd->nd_next; + continue; + } + + /* Get owner node's set record */ + if (clnt_mngetset(nd->nd_nodename, sp->setname, + MD_SET_BAD, &mnsr, ep) == -1) { + /* If RPC failure to another node return 205 */ + if ((mdanyrpcerror(ep)) && + (sd->sd_mn_mynode->nd_nodeid != + nd->nd_nodeid)) { + return (205); + } else { + /* Any other failure */ + return (-1); + } + } + + /* Is this node in the owner node's set record */ + nr = mnsr->sr_nodechain; + while (nr) { + if (sd->sd_mn_mynode->nd_nodeid == + nr->nr_nodeid) { + break; + } + nr = nr->nr_next; + } + if (nr == NULL) { + /* my node not found - delete set */ + free_sr((md_set_record *)mnsr); + goto delete_set; + } + + /* Is owner's node's master valid? */ + master_nodeid = mnsr->sr_master_nodeid; + free_sr((md_set_record *)mnsr); + if (master_nodeid == MD_MN_INVALID_NID) { + nd = nd->nd_next; + continue; + } + + nd2 = sd->sd_nodelist; + while (nd2) { + if ((nd2->nd_nodeid == master_nodeid) && + (nd2->nd_flags & MD_MN_NODE_ALIVE) && + (nd2->nd_flags & MD_MN_NODE_OWN)) { + nd = nd2; + goto found_master; + } + nd2 = nd2->nd_next; + } + nd = nd->nd_next; + } + + /* + * - If no owner node has a valid master, then follow + * algorithm of when a node is joined to the diskset. + * - node walks through nodelist looking for nodes that are + * owners of the diskset that are in the membership list. 
+ * - for each owner, node calls RPC routine clnt_getset to + * see if that node has its node record set to OK. + * - If so, master is chosen to be this owner node. + */ + nd = sd->sd_nodelist; + while (nd) { + /* Don't consider node that isn't in member list */ + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + + /* Don't consider a node that isn't an owner */ + if (!(nd->nd_flags & MD_MN_NODE_OWN)) { + nd = nd->nd_next; + continue; + } + + /* Does node has its own node record set to OK? */ + if (clnt_mngetset(nd->nd_nodename, sp->setname, + MD_SET_BAD, &mnsr, ep) == -1) { + /* If RPC failure to another node return 205 */ + if ((mdanyrpcerror(ep)) && + (sd->sd_mn_mynode->nd_nodeid != + nd->nd_nodeid)) { + return (205); + } else { + /* Any other failure */ + return (-1); + } + } + nr = mnsr->sr_nodechain; + while (nr) { + if (nd->nd_nodeid == nr->nr_nodeid) { + if (nr->nr_flags & MD_MN_NODE_OK) { + /* Found a master */ + free_sr( + (md_set_record *)mnsr); + goto found_master; + } + } + nr = nr->nr_next; + } + free_sr((md_set_record *)mnsr); + nd = nd->nd_next; + } + + /* + * - If no owner node has its own node record on its own node + * set to OK, then this node checks all of the non-owner + * nodes that are in the membership list. + * - for each non-owner, node calls RPC routine clnt_getset to + * see if that node has its node record set to OK. + * - If set doesn't exist, don't choose node for master. + * - If this node doesn't exist in the nodelist on any of the + * non-owner nodes, this node removes its set description + * of that diskset (i.e. removes the set from its local + * mddbs). This is handling the case of when a node was + * removed from a diskset while it was not in the + * cluster membership list. + * - If non-owner node has its node record set to OK and if + * this node hasn't removed this diskset (step directly + * before this one), then the master is chosen to be this + * non-owner node. 
+ */ + nd = sd->sd_nodelist; + while (nd) { + /* Don't consider node that isn't in member list */ + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd->nd_flags |= MD_MN_NODE_DEL; + nd = nd->nd_next; + continue; + } + + /* Don't consider owner nodes since none are OK */ + if (nd->nd_flags & MD_MN_NODE_OWN) { + nd->nd_flags |= MD_MN_NODE_DEL; + nd = nd->nd_next; + continue; + } + + /* + * Don't need to get nodelist from my node since + * this is where sd_nodelist was obtained. + */ + if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) { + nd = nd->nd_next; + continue; + } + + /* + * If node has already been decided against for + * master, then skip it. + */ + if (nd->nd_flags & MD_MN_NODE_DEL) { + nd = nd->nd_next; + continue; + } + + /* + * Does node in my nodelist have its own node + * record marked OK on its node? And does node + * in my nodelist exist on all other nodes? + * Don't want to choose a node for master unless + * that node is marked OK on its own node and that + * node exists on all other alive nodes. + * + * This is guarding against the case when several + * nodes are down and one of the downed nodes is + * deleted from the diskset. When the down nodes + * are rebooted into the cluster, you don't want + * any node to pick the deleted node as the master. + */ + if (clnt_mngetset(nd->nd_nodename, sp->setname, + MD_SET_BAD, &mnsr, ep) == -1) { + /* + * If set doesn't exist on non-owner node, + * don't consider this node for master. + */ + if (mdiserror(ep, MDE_NO_SET)) { + nd->nd_flags |= MD_MN_NODE_DEL; + nd = nd->nd_next; + continue; + } else if (mdanyrpcerror(ep)) { + /* RPC failure to another node */ + return (205); + } else { + /* Any other failure */ + return (-1); + } + } + /* + * Is my node in the nodelist gotten from the other + * node? If not, then remove the set from my node + * since set was deleted from my node while my node + * was out of the cluster. 
+ */ + nr = mnsr->sr_nodechain; + while (nr) { + if (sd->sd_mn_mynode->nd_nodeid == + nr->nr_nodeid) { + break; + } + nr = nr->nr_next; + } + if (nr == NULL) { + /* my node not found - delete set */ + free_sr((md_set_record *)mnsr); + goto delete_set; + } + + /* Is node being checked marked OK on its own node? */ + nr = mnsr->sr_nodechain; + while (nr) { + if (nd->nd_nodeid == nr->nr_nodeid) { + if (!(nr->nr_flags & MD_MN_NODE_OK)) { + nd->nd_flags |= MD_MN_NODE_DEL; + } + break; + } + nr = nr->nr_next; + } + /* + * If node being checked doesn't exist on its + * own node - don't choose it as master. + */ + if (nr == NULL) { + nd->nd_flags |= MD_MN_NODE_DEL; + } + + /* + * Check every node in my node's nodelist against + * the nodelist gotten from the other node. + * If a node in my node's nodelist is not found in the + * other node's nodelist, then set the DEL flag. + */ + nd2 = sd->sd_nodelist; + while (nd2) { + nr = mnsr->sr_nodechain; + while (nr) { + if (nd2->nd_nodeid == nr->nr_nodeid) { + break; + } + nr = nr->nr_next; + } + /* nd2 not found in other node's nodelist */ + if (nr == NULL) { + nd2->nd_flags |= MD_MN_NODE_DEL; + } + nd2 = nd2->nd_next; + } + + free_sr((md_set_record *)mnsr); + nd = nd->nd_next; + } + + /* + * Rescan list look for node that has not been marked DEL. + * First node found is the master. + */ + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_DEL)) { + break; + } + nd = nd->nd_next; + continue; + } + if (nd) { + /* Found a master */ + goto found_master; + } + + /* + * - If no node can be found that has its own node record on + * its node to be set to OK, then all alive nodes + * were in the process of being added to or deleted + * from set. Each alive node will remove all + * information pertaining to this set from its node. + * + * If all nodes in set are ALIVE, then call sdssc end routines + * since set was truly being initially created or destroyed. 
+ */ + goto delete_set; + } + +found_master: + meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, + "Set %s master chosen %s (%d): %s"), + sp->setname, nd->nd_nodename, nd->nd_nodeid, + meta_print_hrtime(gethrtime() - start_time)); + + if (clnt_lock_set(mynode(), sp, ep) == -1) { + return (-1); + } + + cl_sk = cl_get_setkey(sp->setno, sp->setname); + + if (clnt_mnsetmaster(mynode(), sp, + nd->nd_nodename, nd->nd_nodeid, ep)) { + rval = -1; + } else if (sd->sd_mn_mynode->nd_nodeid == nd->nd_nodeid) { + /* If this node is new master, set flag in this node's kernel */ + (void) memset(&sf, 0, sizeof (sf)); + sf.sf_setno = sp->setno; + sf.sf_setflags = MD_SET_MN_NEWMAS_RC; + /* Use magic to help protect ioctl against attack. */ + sf.sf_magic = MDDB_SETFLAGS_MAGIC; + sf.sf_flags = MDDB_NM_SET; + + meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, + "Setting new master flag for set %s: %s"), + sp->setname, meta_print_hrtime(gethrtime() - start_time)); + + /* + * Fail reconfig cycle if ioctl fails since it is critical + * to set new master flag. + */ + if (metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, + NULL) != NULL) { + (void) mdstealerror(ep, &sf.sf_mde); + rval = -1; + } + } + + if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) { + if (rval == 0) { + (void) mdstealerror(ep, &xep); + rval = -1; + } + } + + cl_set_setkey(NULL); + + metaflushsetname(sp); + + return (rval); + +delete_set: + meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, + "Master not chosen, deleting set %s: %s"), + sp->setname, meta_print_hrtime(gethrtime() - start_time)); + + /* + * Remove all set information from this node: + * - node records for this set + * - drive records for this set + * - set record for this set + * (Only do this on this node since each node + * will do it for its own local mddb.) + * + * If all nodes in set are ALIVE, then + * the lowest numbered ALIVE nodeid in set + * (irregardless of whether an owner node or not) will + * call the DCS service to cleanup for create/delete of set. 
+ * sdssc_create_end(cleanup) if set was being created or + * sdssc_delete_end(cleanup) if set was being deleted. + * A node record with flag ADD denotes a set being + * created. A node record with flag DEL denotes a + * set being deleted. + */ + nd = sd->sd_nodelist; + while (nd) { + /* Found a node that isn't alive */ + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) + break; + + /* Is my node the lowest numbered ALIVE node? */ + if (nd->nd_nodeid < sd->sd_mn_mynode->nd_nodeid) { + break; + } + nd = nd->nd_next; + } + if (nd == NULL) { + /* All nodes ALIVE and this is the lowest nodeid */ + lowest_alive_nodeid = 1; + } + + if (clnt_lock_set(mynode(), sp, ep) == -1) { + return (-1); + } + + + /* + * If this node had been joined, withdraw and reset master. + * + * This could happen if a node was being added to or removed + * from a diskset and the node doing the add/delete operation and + * all other nodes in the diskset have left the cluster. + */ + if (sd->sd_mn_mynode) { + nd = sd->sd_mn_mynode; + if (nd->nd_flags & MD_MN_NODE_OWN) { + if (clnt_withdrawset(mynode(), sp, ep)) { + rval = -1; + goto out; + } + if (clnt_mnsetmaster(mynode(), sp, "", + MD_MN_INVALID_NID, ep)) { + rval = -1; + goto out; + } + } + } + + /* + * Remove side records for this node (side) from local mddb + * (clnt_deldrvs does this) if there are drives in the set. + * + * Don't need to mark this node as DEL since already marked as + * ADD or DEL (or this node would have been chosen as master). + * Don't need to mark other node records, drive records or + * set records as DEL. If a panic occurs during clnt_delset, + * these records will be deleted the next time this node + * becomes a member and goes through the reconfig cycle. + */ + /* Get the drive descriptors for this set */ + if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), + ep)) == NULL) { + if (! 
mdisok(ep)) { + /* + * Ignore and clear out any failures from + * metaget_drivedesc since a panic could have + * occurred when a node was partially added to a set. + */ + mdclrerror(ep); + } + } else { + if (clnt_deldrvs(mynode(), sp, dd, ep)) { + rval = -1; + goto out; + } + } + + /* + * Now, delete the set - this removes the node, drive + * and set records from the local mddb. + */ + if (clnt_delset(mynode(), sp, ep)) { + rval = -1; + goto out; + } + +out: + cl_sk = cl_get_setkey(sp->setno, sp->setname); + + /* + * Ignore errors from unlock of set since set is no longer + * known (if clnt_delset worked). + */ + if (clnt_unlock_set(mynode(), cl_sk, &xep) == -1) { + mdclrerror(&xep); + } + + cl_set_setkey(NULL); + + metaflushsetname(sp); + + /* + * If this node is the lowest numbered nodeid then + * call sdssc_create/delete_end depending on whether + * this node is marked as ADD or DEL in the node record. + */ + if (lowest_alive_nodeid) { + if (nd->nd_flags & MD_MN_NODE_ADD) + sdssc_create_end(sp->setname, SDSSC_CLEANUP); + else if (nd->nd_flags & MD_MN_NODE_DEL) + sdssc_delete_end(sp->setname, SDSSC_CLEANUP); + } + + /* Finished with this set -- return */ + return (rval); +} + +/* + * Reconfig step to choose a new master for all MN disksets. + * + * Reads the membership list with meta_read_nodelist() and then walks + * set numbers 1 through max_sets - 1. Set numbers that have no set + * (MDE_NO_SET) and sets that are not MN disksets are skipped. For + * each MN diskset the nodelist is updated from the membership list + * by meta_reconfig_update_nodelist(), a master is chosen by + * meta_reconfig_choose_master_for_set() and mdmn_reinit_set() is + * called for that set. After the loop, MD_MN_RESUME_SET is issued + * with setno 0 and the membership list is freed if nodecnt is non-zero. + * + * Failures are reported through mde_perror(). A failure to look up a + * set or its set descriptor, or to update its nodelist, causes that + * set to be skipped unless mdanyrpcerror() reports an RPC error, in + * which case 1 is returned immediately. + * + * NOTE(review): the return (1) and return (205) paths that are taken + * after meta_read_nodelist() has succeeded do not call + * meta_free_nodelist() - confirm whether that is intentional. + * + * Return values: + * 0 - Everything is great. + * 1 - This node failed to reconfig. + * 205 - Cause another reconfig due to a nodelist problem + * or RPC failure to another node + */ +int +meta_reconfig_choose_master( + md_error_t *ep +) +{ + set_t max_sets, setno; + int nodecnt; + mndiskset_membershiplist_t *nl; + md_set_desc *sd; + mdsetname_t *sp; + int rval = 0; + mddb_setflags_config_t sf; + int start_node_delayed = 0; + + if ((max_sets = get_max_sets(ep)) == 0) { + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to get number of sets")); + return (1); + } + + /* + * Get membershiplist from API routine. If there's + * an error, return a 205 to cause another reconfig. 
+ */ + if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) { + mde_perror(ep, ""); + return (205); + } + + for (setno = 1; setno < max_sets; setno++) { + if ((sp = metasetnosetname(setno, ep)) == NULL) { + if (mdiserror(ep, MDE_NO_SET)) { + /* No set for this setno - continue */ + mdclrerror(ep); + continue; + } else { + /* + * If encountered an RPC error from my node, + * then immediately fail. + */ + if (mdanyrpcerror(ep)) { + mde_perror(ep, ""); + return (1); + } + /* Can't get set information */ + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to get information for " + "set number %d"), setno); + mdclrerror(ep); + continue; + } + } + + /* If setname is there, set desc should exist. */ + if ((sd = metaget_setdesc(sp, ep)) == NULL) { + /* + * If encountered an RPC error from my node, + * then immediately fail. + */ + if (mdanyrpcerror(ep)) { + mde_perror(ep, ""); + return (1); + } + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to get set %s desc information"), + sp->setname); + mdclrerror(ep); + continue; + } + + /* Only reconfig MN disksets */ + if (!MD_MNSET_DESC(sd)) { + continue; + } + + meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, + "Begin choose master for set %s: %s"), + sp->setname, meta_print_hrtime(gethrtime() - start_time)); + + /* Update nodelist with member information. */ + if (meta_reconfig_update_nodelist(sp, nl, sd, ep)) { + /* + * If encountered an RPC error from my node, + * then immediately fail. + */ + if (mdanyrpcerror(ep)) { + mde_perror(ep, ""); + return (1); + } + mde_perror(ep, ""); + mdclrerror(ep); + continue; + } + + /* + * If all nodes in a cluster are starting, then + * all nodes will attempt to contact all other nodes + * to determine a master node. This can lead to a + * problem where node 1 is trying to contact the rpc.metad + * on node 2 and node 2 is trying to contact the rpc.metad + * on node 1 -- and this causes the rpc call to fail + * on both nodes and causes a new reconfig cycle. 
+ * + * In order to break this problem, a newly starting node + * will delay a small amount of time (nodeid mod 4 seconds) + * and will then run the code to choose a master for the + * first set. Delay will only be done once regardless of the + * number of sets. + */ + if (start_node_delayed == 0) { + (void) memset(&sf, 0, sizeof (sf)); + sf.sf_setno = sp->setno; + sf.sf_flags = MDDB_NM_GET; + /* Use magic to help protect ioctl against attack. */ + sf.sf_magic = MDDB_SETFLAGS_MAGIC; + if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, + &sf.sf_mde, NULL) == 0) && + ((sf.sf_setflags & MD_SET_MN_START_RC) == + MD_SET_MN_START_RC)) { + (void) sleep(sd->sd_mn_mynode->nd_nodeid % 4); + } + start_node_delayed = 1; + } + + /* Choose master for this set */ + rval = meta_reconfig_choose_master_for_set(sp, sd, ep); + if (rval == -1) { + mde_perror(ep, ""); + return (1); + } else if (rval == 205) { + mde_perror(ep, ""); + return (205); + } + + /* Send new nodelist to rpc.mdcommd */ + (void) mdmn_reinit_set(sp->setno); + + meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, + "Choose master for set %s completed: %s"), + sp->setname, meta_print_hrtime(gethrtime() - start_time)); + } + + /* + * Each node turns on I/Os for all MN disksets. + * This is to recover from the situation where the master died + * during a previous reconfig cycle when I/Os were suspended + * for a MN diskset. + * If a failure occurs return a 1 which will force this node to + * panic. Cannot leave node in the situation where I/Os are + * not resumed. + */ + setno = 0; /* 0 means all MN sets */ + if (metaioctl(MD_MN_RESUME_SET, &setno, ep, NULL)) { + mde_perror(ep, ""); + return (1); + } + + /* Free the nodelist */ + if (nodecnt) + meta_free_nodelist(nl); + + return (0); +} + +/* + * meta_mnsync_user_records will synchronize the diskset user records across + * all nodes in the diskset. The diskset user records are stored in + * each node's local set mddb. 
+ * + * This needs to be done even if there is no master change during the + * reconfig cycle since this routine should clean up any mess left by + * the untimely termination of a metaset or metadb command (due to a + * node panic or to user intervention). + * + * Caller is the Master node. + * + * Returns 0 - Success + * 205 - Failure during RPC to another node + * -1 - Any other failure and ep is filled in. + */ +int +meta_mnsync_user_records( + mdsetname_t *sp, + md_error_t *ep +) +{ + md_set_desc *sd; + md_mnnode_desc *master_nodelist, *nd, *nd2, *ndtail; + md_mnset_record *mnsr; + md_mnsr_node_t *master_mnsr_node = NULL, *mnsr_node = NULL; + md_mnnode_record *nr; + md_drive_record *dr; + int dr_cnt, dd_cnt; + int found_my_nr; + md_drive_desc *dd, *dd_prev, *master_dd, *other_dd; + int all_drives_ok; + int rval = 0; + int max_genid = 0; + int num_alive_nodes, num_alive_nodes_del = 0; + int set_locked = 0; + md_setkey_t *cl_sk; + md_error_t xep = mdnullerror; + char *anode[1]; + mddb_setflags_config_t sf; + + /* + * Sync up node records first. + * Construct a master nodelist using the nodelist from this + * node's rpc.metad node records and then setting the state of each + * node following these rules: + * - If a node record is marked OK on its node, mark it OK + * in the master nodelist (and later OK on all nodes) + * If a node record is also marked OWN on its node, + * mark it OWN in the master nodelist. 
+ * - If a node record is not marked OK on its node, then mark + * it as DEL in the master list (later deleting it) + * - If node record doesn't exist on that node, then mark it DEL + * (later deleting it) + * - If set record doesn't exist on that node, mark node as DEL + * - If a node record doesn't exist on all nodes, then mark it DEL + * - If a node is not ALIVE, then + * - If that node marked DEL on any node - mark it DEL + * in master list but leave in nodelist + * - If that node is marked as ADD on any node, mark it + * ADD in the master list but leave in nodelist + * - When that node returns to the living, the DEL + * node record will be removed and the ADD node + * record may be removed if marked ADD on that + * node. + * The key rule is to not remove a node from the nodelist until + * that node record is removed from its own node. Do not want to + * remove a node's record from all other nodes and then have + * that node have its own record marked OK so that a node will pick + * a different master than the other nodes. + * + * Next, + * If node is ALIVE and node record is marked DEL in master nodelist, + * remove node from set. + * If node is ALIVE and node record is marked OK in master nodelist, + * mark it OK on all other nodes. + * If node is not ALIVE and node record is marked DEL in master + * nodelist, mark it DEL on all other nodes. + * If node is not ALIVE and node record is marked ADD in master, + * nodelist, mark it ADD on all other nodes. + */ + if ((sd = metaget_setdesc(sp, ep)) == NULL) { + return (-1); + } + master_nodelist = sd->sd_nodelist; + + /* + * Walk through nodelist creating a master nodelist. 
+ */ + num_alive_nodes = 0; + nd = master_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + num_alive_nodes++; + if (clnt_mngetset(nd->nd_nodename, sp->setname, + MD_SET_BAD, &mnsr, ep) == -1) { + if (mdiserror(ep, MDE_NO_SET)) { + /* set doesn't exist, mark node as DEL */ + nd->nd_flags &= ~MD_MN_NODE_OK; + nd->nd_flags &= ~MD_MN_NODE_ADD; + nd->nd_flags |= MD_MN_NODE_DEL; + nd->nd_flags |= MD_MN_NODE_NOSET; + nd = nd->nd_next; + continue; + } else { + /* If RPC failure to another node return 205 */ + if ((mdanyrpcerror(ep)) && + (sd->sd_mn_mynode->nd_nodeid != + nd->nd_nodeid)) { + rval = 205; + } else { + /* Any other failure */ + rval = -1; + } + goto out; + } + } + /* Find biggest genid in records for this diskset */ + if (mnsr->sr_genid > max_genid) + max_genid = mnsr->sr_genid; + + dr = mnsr->sr_drivechain; + while (dr) { + /* Find biggest genid in records for this diskset */ + if (dr->dr_genid > max_genid) { + max_genid = dr->dr_genid; + } + dr = dr->dr_next; + } + + found_my_nr = 0; + nr = mnsr->sr_nodechain; + /* nr is the list of node recs from nd_nodename node */ + while (nr) { + /* Find biggest genid in records for this diskset */ + if (nr->nr_genid > max_genid) + max_genid = nr->nr_genid; + nd2 = master_nodelist; + ndtail = NULL; + /* For each node record, is it in master list? */ + while (nd2) { + if (nd2->nd_nodeid == nr->nr_nodeid) + break; + if (nd2->nd_next == NULL) + ndtail = nd2; + nd2 = nd2->nd_next; + } + /* + * Found node record not in master list -- add it + * to list marking it as DEL since node record + * should exist on all nodes unless a panic occurred + * during addition or deletion of host to diskset. 
+ */ + if (nd2 == NULL) { + nd2 = Zalloc(sizeof (*nd2)); + (void) strcpy(nd2->nd_nodename, + nr->nr_nodename); + nd2->nd_flags = nr->nr_flags; + nd2->nd_flags |= MD_MN_NODE_DEL; + nd2->nd_nodeid = nr->nr_nodeid; + nd2->nd_next = NULL; + ndtail->nd_next = nd2; + nd2 = NULL; + nr = nr->nr_next; + continue; + } + /* + * Is this the node record for the node that + * we requested the set desc from? + * If so, check if node has its own node record + * marked OK. If marked OK, check for the OWN bit. + */ + if (nr->nr_nodeid == nd->nd_nodeid) { + found_my_nr = 1; + if (nr->nr_flags & MD_MN_NODE_OK) { + /* + * If node record is marked OK + * on its own node, then mark it OK + * in the master list. Node record + * would have to exist on all nodes + * in the ADD state before it could + * be put into the OK state. + */ + nd->nd_flags |= MD_MN_NODE_OK; + nd->nd_flags &= + ~(MD_MN_NODE_ADD | MD_MN_NODE_DEL); + /* + * Mark own in master list as marked + * on own node. + */ + if (nr->nr_flags & MD_MN_NODE_OWN) + nd->nd_flags |= MD_MN_NODE_OWN; + else + nd->nd_flags &= ~MD_MN_NODE_OWN; + } else { + /* Otherwise, mark node as DEL */ + nd->nd_flags &= ~MD_MN_NODE_OK; + nd->nd_flags &= ~MD_MN_NODE_ADD; + nd->nd_flags |= MD_MN_NODE_DEL; + } + } + /* + * If node is not ALIVE and marked DEL + * on any node, make it DEL in master list. + * If node is not ALIVE and marked ADD + * on any node, make it ADD in master list + * unless node record has already been marked DEL. 
+ */ + if (!(nr->nr_flags & MD_MN_NODE_ALIVE)) { + if (nr->nr_flags & MD_MN_NODE_ADD) { + if (!(nd->nd_flags & MD_MN_NODE_DEL)) { + /* If not DEL - mark it ADD */ + nd->nd_flags |= MD_MN_NODE_ADD; + nd->nd_flags &= ~MD_MN_NODE_OK; + } + } + if (nr->nr_flags & MD_MN_NODE_DEL) { + nd->nd_flags |= MD_MN_NODE_DEL; + nd->nd_flags &= ~MD_MN_NODE_OK; + /* Could already be ADD - make it DEL */ + nd->nd_flags &= ~MD_MN_NODE_ADD; + } + } + nr = nr->nr_next; + } + /* + * If a node record doesn't exist on its own node, + * then mark node as DEL. + */ + if (found_my_nr == 0) { + nd->nd_flags &= ~MD_MN_NODE_OK; + nd->nd_flags |= MD_MN_NODE_DEL; + } + + /* + * If node is OK - put mnsr onto master_mnsr_node list for + * later use when syncing up the drive records in the set. + */ + if (nd->nd_flags & MD_MN_NODE_OK) { + mnsr_node = Zalloc(sizeof (*mnsr_node)); + mnsr_node->mmn_mnsr = mnsr; + (void) strncpy(mnsr_node->mmn_nodename, + nd->nd_nodename, MD_MAX_MNNODENAME_PLUS_1); + mnsr_node->mmn_next = master_mnsr_node; + master_mnsr_node = mnsr_node; + } else { + free_sr((struct md_set_record *)mnsr); + } + + nd = nd->nd_next; + } + + meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, + "Master nodelist created for set %s: %s"), + sp->setname, meta_print_hrtime(gethrtime() - start_time)); + + /* + * Send master nodelist to the rpc.metad on all nodes (including + * myself) and each node will update itself. This will set the + * ADD and DEL flags on each node as setup in the master nodelist. + * Don't send nodelist to node where set doesn't exist. 
+ */ + nd = master_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || + (nd->nd_flags & MD_MN_NODE_NOSET)) { + nd = nd->nd_next; + continue; + } + if (clnt_upd_nr_flags(nd->nd_nodename, sp, + master_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) { + /* If RPC failure to another node return 205 */ + if ((mdanyrpcerror(ep)) && + (sd->sd_mn_mynode->nd_nodeid != + nd->nd_nodeid)) { + rval = 205; + } else { + /* Any other failure */ + rval = -1; + } + goto out; + } + nd = nd->nd_next; + } + + /* + * Now, delete nodes that need to be deleted. + */ + if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), + ep)) == NULL) { + if (! mdisok(ep)) { + rval = -1; + goto out; + } + } + + /* + * May be doing lots of RPC commands to the nodes, so lock the + * ALIVE members of the set since most of the rpc.metad routines + * require this for security reasons. + */ + nd = master_nodelist; + while (nd) { + /* Skip non-alive nodes and node without set */ + if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || + (nd->nd_flags & MD_MN_NODE_NOSET)) { + nd = nd->nd_next; + continue; + } + if (clnt_lock_set(nd->nd_nodename, sp, ep)) { + /* If RPC failure to another node return 205 */ + if ((mdanyrpcerror(ep)) && + (sd->sd_mn_mynode->nd_nodeid != + nd->nd_nodeid)) { + rval = 205; + } else { + /* Any other failure */ + rval = -1; + } + goto out; + } + set_locked = 1; + nd = nd->nd_next; + } + + nd = master_nodelist; + while (nd) { + /* Skip non-alive nodes */ + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + if (nd->nd_flags & MD_MN_NODE_DEL) { + num_alive_nodes_del++; + /* + * Delete this node rec from all ALIVE nodes in diskset. 
+ */ + nd2 = master_nodelist; + while (nd2) { + /* Skip non-alive nodes and node without set */ + if (!(nd2->nd_flags & MD_MN_NODE_ALIVE) || + (nd2->nd_flags & MD_MN_NODE_NOSET)) { + nd2 = nd2->nd_next; + continue; + } + + /* This is a node being deleted from set */ + if (nd2->nd_nodeid == nd->nd_nodeid) { + /* Mark set record as DEL */ + if (clnt_upd_sr_flags(nd->nd_nodename, + sp, MD_SR_DEL, ep)) { + /* RPC failure to !my node */ + if ((mdanyrpcerror(ep)) && + (sd->sd_mn_mynode-> + nd_nodeid + != nd->nd_nodeid)) { + rval = 205; + } else { + /* Any other failure */ + rval = -1; + } + goto out; + } + if (clnt_deldrvs(nd->nd_nodename, sp, + dd, ep)) { + /* RPC failure to !my node */ + if ((mdanyrpcerror(ep)) && + (sd->sd_mn_mynode-> + nd_nodeid + != nd->nd_nodeid)) { + rval = 205; + } else { + /* Any other failure */ + rval = -1; + } + goto out; + } + if (clnt_delset(nd->nd_nodename, sp, + ep) == -1) { + /* RPC failure to !my node */ + if ((mdanyrpcerror(ep)) && + (sd->sd_mn_mynode-> + nd_nodeid + != nd->nd_nodeid)) { + rval = 205; + } else { + /* Any other failure */ + rval = -1; + } + goto out; + } + } else { + /* + * Delete host from sets on hosts + * not being deleted. 
+ */ + anode[0] = Strdup(nd->nd_nodename); + if (clnt_delhosts(nd2->nd_nodename, sp, + 1, anode, ep) == -1) { + Free(anode[0]); + /* RPC failure to !my node */ + if ((mdanyrpcerror(ep)) && + (sd->sd_mn_mynode-> + nd_nodeid + != nd2->nd_nodeid)) { + rval = 205; + } else { + /* Any other failure */ + rval = -1; + } + goto out; + } + + meta_mc_log(MC_LOG5, + dgettext(TEXT_DOMAIN, + "Deleted node %s (%d) on node %s " + "from set %s: %s"), + nd->nd_nodename, nd->nd_nodeid, + nd2->nd_nodename, + sp->setname, + meta_print_hrtime( + gethrtime() - start_time)); + + Free(anode[0]); + } + nd2 = nd2->nd_next; + } + } + nd = nd->nd_next; + } + + nd = master_nodelist; + cl_sk = cl_get_setkey(sp->setno, sp->setname); + while (nd) { + /* Skip non-alive nodes and node without set */ + if (!(nd->nd_flags & MD_MN_NODE_ALIVE) || + (nd->nd_flags & MD_MN_NODE_NOSET)) { + nd = nd->nd_next; + continue; + } + if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) { + /* If RPC failure to another node return 205 */ + if ((mdanyrpcerror(ep)) && + (sd->sd_mn_mynode->nd_nodeid != + nd->nd_nodeid)) { + rval = 205; + } else { + /* Any other failure */ + rval = -1; + } + goto out; + } + nd = nd->nd_next; + } + cl_set_setkey(NULL); + set_locked = 0; + + meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, + "Nodelist syncronization complete for set %s: %s"), + sp->setname, meta_print_hrtime(gethrtime() - start_time)); + + metaflushsetname(sp); + + /* + * If all alive nodes have been deleted from set, just + * return since nothing else can be done until non-alive + * nodes (if there are any) rejoin the cluster. + */ + if (num_alive_nodes == num_alive_nodes_del) { + rval = 0; + goto out; + } + + /* + * Sync up drive records. + * + * If a node panic'd (or metaset command was killed) during the + * addition or deletion of a drive to the diskset, the nodes + * may have a different view of the drive list. 
During cleanup + * of the drive list during reconfig, a drive will be deleted + * from the list if the master node sees that the drive has been + * marked in the ADD state on any node or is marked in the DEL state + * on all nodes. + * This cleanup must occur even if all nodes in the cluster are + * not part of the cluster so that all nodes have the same view + * of the drivelist. + * Then if the entire cluster goes down and comes back up, the + * new master node could be a node that wasn't in the cluster when + * the node was deleted. This could lead to a situation where the + * master node thinks that a drive is OK, but this drive isn't + * known to the other nodes. + * This situation can also occur during the addition of a drive + * where a node has the drive marked OK, but the node executing the + * metaset command encountered a failure before marking that drive OK + * on the rest of the nodes. If the node with the OK drive then + * panics, then the rest of the nodes will remove that drive marked ADD + * and when the node with the OK drive rejoins the cluster, it will + * have a drive marked OK that is unknown by the other nodes. + * + * There are 2 situations to consider: + * A) Master knows about a drive that other nodes don't know about. + * B) At least one slave node knows about a drive that the master + * node doesn't know about. + * + * To handle these situations the following steps are followed: + * 1) Count number of drives known by this master node and the + * other slave nodes. + * If all nodes have the same number of drives and the master has + * all drives marked OK, then skip to step 4. + * + * 2) If a node has fewer drives listed than the master, the master + * must get the drive descriptor list from that node so that + * master can determine which drive it needs to delete from that + * node. 
Master must get the drive descriptor list since the + * drive record list does not contain the name of the drive, but + * only a key and the key can only be interpreted on that other + * node. + * + * 3) The master will then create the master drive list by doing: + * - Master starts with drive list known by master. + * - Any drive marked ADD will be removed from the list. + * - Any drive not known by another node (from step 2) will be + * removed from the drive list. + * - If a drive is marked DEL on the master, the master must + * verify that the drive record is marked DEL on all nodes. + * If any node has the drive record marked OK, mark it OK + * on the master. (The reason why is described below). + * + * 4) The master sends out the master drive list and the slave + * nodes will force their drive lists to match the master + * drive list by deleting drives, if necessary, and by changing + * the drive record states from ADD->OK if master has drive + * marked OK and slave has drive marked ADD. + * + * Interesting scenarios: + * + * 1) System has 4 nodes with node 1 as the master. Node 3 starts + * to delete a drive record (drive record on node 1 is marked DEL), + * but is stopped when node 3 panics. Node 1 also panics. + * During reconfig cycle, node 2 is picked as master and the drive + * record is left alone since all nodes in the cluster have it + * marked OK. User now sees drive as part of diskset. + * Now, entire cluster is rebooted and node 1 rejoins the cluster. + * Node 1 is picked as the master and node 1 has drive record + * marked DEL. Node 1 contacts all other nodes in the cluster + * and since at least one node has the drive record marked OK, + * the master marks the drive record OK. + * User continues to see the drive as part of the diskset. 
+ */ + + /* Reget set descriptor since flushed above */ + if ((sd = metaget_setdesc(sp, ep)) == NULL) { + rval = -1; + goto out; + } + + /* Has side effect of setting sd->sd_drvs to same as master_dd */ + if ((master_dd = metaget_drivedesc_sideno(sp, + sd->sd_mn_mynode->nd_nodeid, + (MD_BASICNAME_OK | PRINT_FAST), ep)) == NULL) { + /* No drives in list */ + if (!mdisok(ep)) { + /* + * Can't get drive list for this node, so + * return -1 causing this node to be removed + * cluster config and fixed. + */ + rval = -1; + goto out; + } + } + + /* Count the number of drives for all nodes */ + mnsr_node = master_mnsr_node; + while (mnsr_node) { + dr_cnt = 0; + dr = mnsr_node->mmn_mnsr->sr_drivechain; + while (dr) { + dr_cnt++; + dr = dr->dr_next; + } + mnsr_node->mmn_numdrives = dr_cnt; + mnsr_node = mnsr_node->mmn_next; + } + + /* Count the number of drives for the master; also check flags */ + all_drives_ok = 1; + dd_cnt = 0; + dd = master_dd; + while (dd) { + dd_cnt++; + if (!(dd->dd_flags & MD_DR_OK)) + all_drives_ok = 0; + dd = dd->dd_next; + } + + /* If all drives are ok, do quick check against number of drives */ + if (all_drives_ok) { + /* If all nodes have same number of drives, almost done */ + mnsr_node = master_mnsr_node; + while (mnsr_node) { + if (mnsr_node->mmn_numdrives != dd_cnt) + break; + mnsr_node = mnsr_node->mmn_next; + } + /* All nodes have same number of drives, just send flags */ + if (mnsr_node == NULL) { + goto send_drive_list; + } + } + + meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, + "Begin detailed drive synchronization for set %s: %s"), + sp->setname, meta_print_hrtime(gethrtime() - start_time)); + + /* Detailed check required */ + mnsr_node = master_mnsr_node; + while (mnsr_node) { + /* Does slave node have less drives than master? 
*/ + if (mnsr_node->mmn_numdrives < dd_cnt) { + /* Yes - must determine which drive is missing */ + if (clnt_getdrivedesc(mnsr_node->mmn_nodename, sp, + &other_dd, ep)) { + /* RPC failure to !my node */ + if ((mdanyrpcerror(ep)) && + (strcmp(mynode(), mnsr_node->mmn_nodename) + != 0)) { + rval = 205; + } else { + /* Any other failure */ + rval = -1; + } + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Master node %s unable to " + "retrieve drive list from node %s"), + mynode(), mnsr_node->mmn_nodename); + goto out; + } + mnsr_node->mmn_dd = other_dd; + dd = master_dd; + while (dd) { + if (!(dd->dd_flags & MD_DR_OK)) { + dd = dd->dd_next; + continue; + } + other_dd = mnsr_node->mmn_dd; + while (other_dd) { + /* Convert to devids, when available */ + if (strcmp(other_dd->dd_dnp->cname, + dd->dd_dnp->cname) == 0) { + break; + } + other_dd = other_dd->dd_next; + } + /* + * dd not found on slave so mark it + * ADD for later deletion (drives in ADD + * state are deleted later in this routine). + */ + if (other_dd == NULL) { + dd->dd_flags = MD_DR_ADD; + } + dd = dd->dd_next; + } + + } + mnsr_node = mnsr_node->mmn_next; + } + + meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, + "Drive check completed for set %s: %s"), + sp->setname, meta_print_hrtime(gethrtime() - start_time)); + + dd = master_dd; + dd_prev = 0; + while (dd) { + /* Remove any ADD drives from list */ + if (dd->dd_flags & MD_DR_ADD) { + if (dd_prev) { + dd_prev->dd_next = dd->dd_next; + dd->dd_next = NULL; + metafreedrivedesc(&dd); + dd = dd_prev->dd_next; + } else { + /* + * If removing drive descriptor from head + * of linked list, also change sd->sd_drvs. + */ + master_dd = sd->sd_drvs = dd->dd_next; + dd->dd_next = NULL; + metafreedrivedesc(&dd); + dd = master_dd; + } + /* dd setup in if/else above */ + continue; + } + /* + * If drive is marked DEL, check all other nodes. + * If drive on another node is marked OK, mark drive OK + * in master list. 
If drive is marked DEL or doesn't exist + * on all nodes, remove drive from list. + */ + if (dd->dd_flags & MD_DR_DEL) { + mnsr_node = master_mnsr_node; + while (mnsr_node) { + if (mnsr_node->mmn_dd == NULL) { + if (clnt_getdrivedesc( + mnsr_node->mmn_nodename, sp, + &other_dd, ep)) { + /* RPC failure to !my node */ + if ((mdanyrpcerror(ep)) && + (strcmp(mynode(), + mnsr_node->mmn_nodename) + != 0)) { + rval = 205; + } else { + /* Any other failure */ + rval = -1; + } + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Master node %s unable " + "to retrieve drive list from " + "node %s"), mynode(), + mnsr_node->mmn_nodename); + goto out; + } + mnsr_node->mmn_dd = other_dd; + } + other_dd = mnsr_node->mmn_dd; + while (other_dd) { + /* Found drive (OK) from other node */ + if (strcmp(dd->dd_dnp->cname, + other_dd->dd_dnp->cname) + == 0) { + /* Drive marked OK */ + if (other_dd->dd_flags & + MD_DR_OK) { + dd->dd_flags = MD_DR_OK; + } + break; + } + other_dd = other_dd->dd_next; + } + if (dd->dd_flags == MD_DR_OK) + break; + + mnsr_node = mnsr_node->mmn_next; + } + /* + * If no node had this drive marked OK, delete it. + */ + if (dd->dd_flags & MD_DR_DEL) { + if (dd_prev) { + dd_prev->dd_next = dd->dd_next; + dd->dd_next = NULL; + metafreedrivedesc(&dd); + dd = dd_prev->dd_next; + } else { + /* + * If removing drive descriptor from + * head of linked list, also change + * sd->sd_drvs. + */ + master_dd = sd->sd_drvs = dd->dd_next; + dd->dd_next = NULL; + metafreedrivedesc(&dd); + dd = master_dd; + } + /* dd setup in if/else above */ + continue; + } + } + dd_prev = dd; + dd = dd->dd_next; + } + + meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, + "Setting drive states completed for set %s: %s"), + sp->setname, meta_print_hrtime(gethrtime() - start_time)); + +send_drive_list: + /* + * Set genid on all drives to be the highest value seen. + */ + dd = master_dd; + while (dd) { + dd->dd_genid = max_genid; + dd = dd->dd_next; + } + /* + * Send updated drive list to all alive nodes. 
+ * Will also set genid on set and node records to have same + * as the drive records. + */ + nd = sd->sd_nodelist; + while (nd) { + /* Skip non-alive nodes */ + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + if (clnt_upd_dr_reconfig(nd->nd_nodename, sp, master_dd, ep)) { + /* RPC failure to another node */ + if ((mdanyrpcerror(ep)) && + (sd->sd_mn_mynode->nd_nodeid != nd->nd_nodeid)) { + rval = 205; + } else { + /* Any other failure */ + rval = -1; + } + goto out; + } + nd = nd->nd_next; + } + + meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, + "Sent drive list to all nodes for set %s: %s"), + sp->setname, meta_print_hrtime(gethrtime() - start_time)); + + /* + * If no drive records left in set and nodes had been joined, + * withdraw the nodes. Always reset the master and mark + * all nodes as withdrawn on all nodes. + */ + if (master_dd == NULL) { + /* Reset new master flag since no longer master */ + (void) memset(&sf, 0, sizeof (sf)); + sf.sf_setno = sp->setno; + sf.sf_setflags = MD_SET_MN_NEWMAS_RC; + sf.sf_flags = MDDB_NM_RESET; + /* Use magic to help protect ioctl against attack. 
*/ + sf.sf_magic = MDDB_SETFLAGS_MAGIC; + /* Ignore failure, failure to reset flag isn't catastrophic */ + (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, + &sf.sf_mde, NULL); + + meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, + "Reset new master flag for " "set %s: %s"), + sp->setname, meta_print_hrtime(gethrtime() - start_time)); + + nd = sd->sd_nodelist; + while (nd) { + /* Skip non-alive nodes */ + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + + if (clnt_lock_set(nd->nd_nodename, sp, ep)) { + /* RPC failure to another node */ + if ((mdanyrpcerror(ep)) && + (sd->sd_mn_mynode->nd_nodeid != + nd->nd_nodeid)) { + rval = 205; + } else { + /* Any other failure */ + rval = -1; + } + goto out; + } + set_locked = 1; + + /* Withdraw node from set if owner */ + if ((nd->nd_flags & MD_MN_NODE_OWN) && + (clnt_withdrawset(nd->nd_nodename, sp, ep))) { + /* RPC failure to another node */ + if ((mdanyrpcerror(ep)) && + (sd->sd_mn_mynode->nd_nodeid != + nd->nd_nodeid)) { + rval = 205; + } else { + /* Any other failure */ + rval = -1; + } + goto out; + } + + /* Mark all nodes as withdrawn on this node */ + if (clnt_upd_nr_flags(nd->nd_nodename, sp, + sd->sd_nodelist, MD_NR_WITHDRAW, NULL, ep)) { + /* RPC failure to another node */ + if ((mdanyrpcerror(ep)) && + (sd->sd_mn_mynode->nd_nodeid != + nd->nd_nodeid)) { + rval = 205; + } else { + /* Any other failure */ + rval = -1; + } + goto out; + } + + /* Resets master to no-master on this node */ + if (clnt_mnsetmaster(nd->nd_nodename, sp, + "", MD_MN_INVALID_NID, ep)) { + /* RPC failure to another node */ + if ((mdanyrpcerror(ep)) && + (sd->sd_mn_mynode->nd_nodeid != + nd->nd_nodeid)) { + rval = 205; + } else { + /* Any other failure */ + rval = -1; + } + goto out; + } + + cl_sk = cl_get_setkey(sp->setno, sp->setname); + if (clnt_unlock_set(nd->nd_nodename, cl_sk, ep)) { + /* RPC failure to another node */ + if ((mdanyrpcerror(ep)) && + (sd->sd_mn_mynode->nd_nodeid != + nd->nd_nodeid)) { + rval = 205; + } 
else { + /* Any other failure */ + rval = -1; + } + goto out; + } + set_locked = 0; + nd = nd->nd_next; + } + } + +out: + /* + * If got here and set is still locked, then an error has + * occurred and master_nodelist is still valid. + * If error is not an RPC error, then unlock. + * If error is an RPC error, skip unlocks since this could cause + * yet another RPC timeout if a node has failed. + * Ignore failures in unlock since unlock is just trying to + * clean things up. + */ + if ((set_locked) && !(mdanyrpcerror(ep))) { + nd = master_nodelist; + cl_sk = cl_get_setkey(sp->setno, sp->setname); + while (nd) { + /* Skip non-alive nodes */ + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + /* + * If clnt_unlock fails, just break out since next + * reconfig cycle will reset the locks anyway. + */ + if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { + break; + } + nd = nd->nd_next; + } + cl_set_setkey(NULL); + } + /* Free master_mnsr and drive descs */ + mnsr_node = master_mnsr_node; + while (mnsr_node) { + master_mnsr_node = mnsr_node->mmn_next; + free_sr((md_set_record *)mnsr_node->mmn_mnsr); + free_rem_dd(mnsr_node->mmn_dd); + Free(mnsr_node); + mnsr_node = master_mnsr_node; + } + + /* Frees sd->sd_drvs (which is also master_dd) */ + metaflushsetname(sp); + return (rval); +} + +/* + * meta_mnsync_diskset_mddbs + * Calling node is guaranteed to be an owner node. + * Calling node is the master node. + * + * Master node verifies that ondisk mddb format matches its incore format. + * If no nodes are joined to set, remove the change log entries. + * If a node is joined to set, play the change log. + * + * Returns 0 - Success + * 1 - Master unable to join to set. + * 205 - Failure during RPC to another node + * -1 - Any other failure and ep is filled in. + * -1 return will eventually cause node to panic + * in a SunCluster environment. 
+ */ +int +meta_mnsync_diskset_mddbs( + mdsetname_t *sp, + md_error_t *ep +) +{ + md_set_desc *sd; + mddb_config_t c; + md_mn_msgclass_t class; + mddb_setflags_config_t sf; + md_mnnode_desc *nd, *nd2; + md_error_t xep = mdnullerror; + int stale_set = 0; + + /* If setname is there, set desc should exist. */ + if ((sd = metaget_setdesc(sp, ep)) == NULL) { + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to get set %s desc information"), sp->setname); + return (-1); + } + + /* Are there drives in the set? */ + if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), + ep) == NULL) { + if (! mdisok(ep)) { + return (-1); + } + /* No drives in set -- nothing to sync up */ + return (0); + } + + /* + * Is master node (which is this node) joined to set? + * If master node isn't joined (which means that no nodes + * are joined to diskset), remove the change log entries + * since no need to replay them - all nodes will have same + * view of mddbs since all nodes are reading in the mddbs + * from disk. + * There is also no need to sync up the master and ondisk mddbs + * since master has no incore knowledge. + * Need to join master to set in order to flush the change + * log entries. Don't need to block I/O during join of master + * to set since no other nodes are joined to set and so no I/O + * can be occurring. + */ + if (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { + /* Join master to set */ + if (clnt_joinset(mynode(), sp, + MNSET_IN_RECONFIG, ep)) { + if (mdismddberror(ep, MDE_DB_STALE)) { + /* + * If STALE, print message and continue on. + * Don't do any writes or reads to mddbs + * so don't clear change log. + */ + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Join of master node to STALE set %s"), + sp->setname); + stale_set = 1; + mdclrerror(ep); + } else if (mdismddberror(ep, MDE_DB_ACCOK)) { + /* ACCOK means mediator provided extra vote */ + mdclrerror(ep); + } else { + /* + * If master is unable to join set, print an + * error message. 
Don't return failure or node + * will panic during cluster reconfig cycle. + * Also, withdraw node from set in order to + * cleanup from failed join attempt. + */ + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Join of master node in set %s failed"), + sp->setname); + if (clnt_withdrawset(mynode(), sp, &xep)) + mdclrerror(&xep); + return (1); + } + } + /* + * Master node successfully joined. + * Set local copy of flags to OWN and + * send owner flag to rpc.metad. If not stale, + * flush the change log. + */ + sd->sd_mn_mynode->nd_flags |= MD_MN_NODE_OWN; + if (clnt_upd_nr_flags(mynode(), sp, sd->sd_nodelist, MD_NR_SET, + MNSET_IN_RECONFIG, ep)) { + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Flag update of master node join in set %s failed"), + sp->setname); + return (-1); + } + + if (!stale_set) { + if (mdmn_reset_changelog(sp, ep, + MDMN_CLF_RESETLOG) != 0) { + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to reset changelog.")); + return (-1); + } + meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, + "Removed changelog entries for set %s: %s"), + sp->setname, + meta_print_hrtime(gethrtime() - start_time)); + } + /* Reset new master flag before return */ + (void) memset(&sf, 0, sizeof (sf)); + sf.sf_setno = sp->setno; + sf.sf_setflags = MD_SET_MN_NEWMAS_RC; + sf.sf_flags = MDDB_NM_RESET; + /* Use magic to help protect ioctl against attack. */ + sf.sf_magic = MDDB_SETFLAGS_MAGIC; + /* Ignore failure, failure to reset flag isn't catastrophic */ + (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, + &sf.sf_mde, NULL); + + meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, + "Reset new master flag for set %s: %s"), + sp->setname, meta_print_hrtime(gethrtime() - start_time)); + + return (0); + } + + /* + * Is master already joined to STALE set (< 50% mddbs avail)? + * If so, can make no config changes to mddbs so don't check or play + * changelog and don't sync master node to ondisk mddbs. + * To get out of the stale state all nodes must be withdrawn + * from set. 
Then as nodes are re-joined, all nodes will + * have same view of mddbs since all nodes are reading the + * mddbs from disk. + */ + (void) memset(&c, 0, sizeof (c)); + c.c_id = 0; + c.c_setno = sp->setno; + if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { + (void) mdstealerror(ep, &c.c_mde); + return (-1); + } + if (c.c_flags & MDDB_C_STALE) { + return (0); + } + + /* + * If this node is NOT a newly chosen master, then there's + * nothing else to do since the change log should be empty and + * the ondisk and incore mddbs are already consistent. + * + * A newly chosen master is a node that was not the master + * at the beginning of the reconfig cycle. If a node is a new + * master, then the new master state is reset after the ondisk + * and incore mddbs are consistent and the change log has + * been replayed. + */ + (void) memset(&sf, 0, sizeof (sf)); + sf.sf_setno = sp->setno; + sf.sf_flags = MDDB_NM_GET; + /* Use magic to help protect ioctl against attack. */ + sf.sf_magic = MDDB_SETFLAGS_MAGIC; + if ((metaioctl(MD_MN_GET_SETFLAGS, &sf, &sf.sf_mde, NULL) == 0) && + ((sf.sf_setflags & MD_SET_MN_NEWMAS_RC) == 0)) { + return (0); + } + + /* + * Now, sync up incore master view to ondisk mddbs. + * This is needed in the case where a master node + * had made a change to the mddb, but this change + * may not have been relayed to the slaves yet. + * So, the new master needs to verify that the ondisk + * mddbs match what the new master has incore - + * if different, new master rewrites all of the mddbs. + * Then the new master will replay the changelog and the + * new master will then execute what the old master had + * done. + * + * Block all I/Os to disks in this diskset on all nodes in + * the diskset. This will allow the rewriting of the mddbs + * (if needed), to proceed in a timely manner. + * + * If block of I/Os fail, return a -1. 
+ */ + + nd = sd->sd_nodelist; + while (nd) { + /* Skip non-alive and non-owner nodes */ + if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || + (!(nd->nd_flags & MD_MN_NODE_OWN))) { + nd = nd->nd_next; + continue; + } + if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, + MN_SUSP_IO, ep)) { + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to suspend I/O on node %s in set %s"), + nd->nd_nodename, sp->setname); + + /* + * Resume all other nodes that had been suspended. + * (Reconfig return step also resumes I/Os + * for all sets.) + */ + nd2 = sd->sd_nodelist; + while (nd2) { + /* Stop when reaching failed node */ + if (nd2->nd_nodeid == nd->nd_nodeid) + break; + /* Skip non-alive and non-owner nodes */ + if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) || + (!(nd2->nd_flags & MD_MN_NODE_OWN))) { + nd2 = nd2->nd_next; + continue; + } + (void) (clnt_mn_susp_res_io(nd2->nd_nodename, + sp->setno, MN_RES_IO, &xep)); + nd2 = nd2->nd_next; + } + + /* + * If an RPC failure on another node, return a 205. + * Otherwise, exit with failure. + */ + if ((mdanyrpcerror(ep)) && + (sd->sd_mn_mynode->nd_nodeid != + nd->nd_nodeid)) { + return (205); + } else { + return (-1); + } + + } + nd = nd->nd_next; + } + + (void) memset(&c, 0, sizeof (c)); + c.c_id = 0; + c.c_setno = sp->setno; + /* Master can't sync up to ondisk mddbs? Kick it out of cluster */ + if (metaioctl(MD_MN_CHK_WRT_MDDB, &c, &c.c_mde, NULL) != 0) + return (-1); + + /* + * Resume I/Os that were suspended above. + */ + nd = sd->sd_nodelist; + while (nd) { + /* Skip non-alive and non-owner nodes */ + if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || + (!(nd->nd_flags & MD_MN_NODE_OWN))) { + nd = nd->nd_next; + continue; + } + if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, + MN_RES_IO, ep)) { + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to resume I/O on node %s in set %s"), + nd->nd_nodename, sp->setname); + + /* + * If an RPC failure then don't do any + * more RPC calls, since one timeout is enough + * to endure. 
If RPC failure to another node, return + * 205. If RPC failure to my node, return -1. + * If not an RPC failure, continue resuming the + * rest of the nodes and then return -1. + */ + if (mdanyrpcerror(ep)) { + if (sd->sd_mn_mynode->nd_nodeid == + nd->nd_nodeid) { + return (-1); + } else { + return (205); + } + } + + /* + * If not an RPC error, continue resuming rest of + * nodes, ignoring any failures except for an + * RPC failure which constitutes an immediate exit. + * Start in middle of list with failing node. + */ + nd2 = nd->nd_next; + while (nd2) { + /* Skip non-alive and non-owner nodes */ + if ((!(nd2->nd_flags & MD_MN_NODE_ALIVE)) || + (!(nd2->nd_flags & MD_MN_NODE_OWN))) { + nd2 = nd2->nd_next; + continue; + } + (void) (clnt_mn_susp_res_io(nd2->nd_nodename, + sp->setno, MN_RES_IO, &xep)); + if (mdanyrpcerror(&xep)) { + return (-1); + } + nd2 = nd2->nd_next; + } + } + nd = nd->nd_next; + } + + meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, "Master node has completed " + "checking/writing the mddb for set %s: %s"), sp->setname, + meta_print_hrtime(gethrtime() - start_time)); + + /* + * Send (aka replay) all messages we find in the changelog. + * Flag the messages with + * MD_MSGF_REPLAY_MSG, so no new message ID is generated for them + * MD_MSGF_OVERRIDE_SUSPEND so they can pass the suspended commd. 
+ */ + for (class = MD_MN_NCLASSES - 1; class > 0; class--) { + mdmn_changelog_record_t *lr; + md_error_t xep = mdnullerror; + md_mn_result_t *resultp = NULL; + int ret; + + lr = mdmn_get_changelogrec(sp->setno, class); + if ((lr->lr_flags & MD_MN_LR_INUSE) == 0) { + /* no entry for this class */ + continue; + } + + meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN, + "replaying message ID=(%d, 0x%llx-%d)\n"), + MSGID_ELEMS(lr->lr_msg.msg_msgid)); + + ret = mdmn_send_message_with_msgid( + lr->lr_msg.msg_setno, + lr->lr_msg.msg_type, + lr->lr_msg.msg_flags | MD_MSGF_REPLAY_MSG | + MD_MSGF_OVERRIDE_SUSPEND, + lr->lr_msg.msg_event_data, + lr->lr_msg.msg_event_size, + &resultp, + &lr->lr_msg.msg_msgid, + &xep); + + meta_mc_log(MC_LOG1, dgettext(TEXT_DOMAIN, + "mdmn_send_message returned %d\n"), ret); + + if (resultp) + free_result(resultp); + } + + meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, + "Playing changelog completed for set %s: %s"), + sp->setname, meta_print_hrtime(gethrtime() - start_time)); + + /* + * Now that new master has ondisk and incore mddbs in sync, reset + * this node's new master kernel flag (for this set). If this node + * re-enters another reconfig cycle before the completion of this + * reconfig cycle, this master node won't need to check if the ondisk + * and incore mddbs are in sync since this node won't be considered + * a new master (since this flag is being reset here in the middle of + * step2). This will save time during any subsequent reconfig + * cycles as long as this node continues to be master. + */ + (void) memset(&sf, 0, sizeof (sf)); + sf.sf_setno = sp->setno; + sf.sf_setflags = MD_SET_MN_NEWMAS_RC; + sf.sf_flags = MDDB_NM_RESET; + /* Use magic to help protect ioctl against attack. 
*/ + sf.sf_magic = MDDB_SETFLAGS_MAGIC; + /* Ignore failure, since failure to reset flag isn't catastrophic */ + (void) metaioctl(MD_MN_SET_SETFLAGS, &sf, &sf.sf_mde, NULL); + + meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, + "Reset new master flag for set %s: %s"), + sp->setname, meta_print_hrtime(gethrtime() - start_time)); + + return (0); +} + +/* + * meta_mnjoin_all will join all starting nodes in the diskset. + * A starting node is considered to be any node that is not + * an owner of the set but is a member of the cluster. + * Master node is already joined to set (done in meta_mnsync_diskset_mddbs). + * + * Caller is the Master node. + * + * Returns 0 - Success + * 205 - Failure during RPC to another node + * -1 - Any other failure and ep is filled in. + */ +int +meta_mnjoin_all( + mdsetname_t *sp, + md_error_t *ep +) +{ + md_set_desc *sd; + md_mnnode_desc *nd, *nd2; + int rval = 0; + int stale_flag = 0; + mddb_config_t c; + int susp_res_flag = 0; + md_error_t xep = mdnullerror; + + /* If setname is there, set desc should exist. */ + if ((sd = metaget_setdesc(sp, ep)) == NULL) { + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to get set %s desc information"), sp->setname); + return (-1); + } + + /* Are there drives in the set? */ + if (metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), + ep) == NULL) { + if (! mdisok(ep)) { + return (-1); + } + /* No drives in set -- nothing to join */ + return (0); + } + + /* + * Is set currently stale? + */ + (void) memset(&c, 0, sizeof (c)); + c.c_id = 0; + c.c_setno = sp->setno; + /* Ignore failure since master node may not be joined yet */ + (void) metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL); + if (c.c_flags & MDDB_C_STALE) { + stale_flag = MNSET_IS_STALE; + } + + /* + * If any nodes are going to be joined to diskset, then + * suspend I/O to all disks in diskset so that nodes can join + * (read in mddbs) in a reasonable amount of time even under + * high I/O load. 
Don't need to do this if set is STALE since + * no I/O can be occurring to a STALE set. + */ + if (stale_flag != MNSET_IS_STALE) { + nd = sd->sd_nodelist; + while (nd) { + /* Found a node that will be joined to diskset */ + if ((nd->nd_flags & MD_MN_NODE_ALIVE) && + (!(nd->nd_flags & MD_MN_NODE_OWN))) { + /* Set flag that diskset should be suspended */ + susp_res_flag = 1; + break; + } + nd = nd->nd_next; + } + } + + if (susp_res_flag) { + /* + * Block all I/Os to disks in this diskset on all joined + * nodes in the diskset. + * If block of I/Os fails due to an RPC failure on another + * node, return 205; otherwise, return -1. + */ + nd = sd->sd_nodelist; + while (nd) { + /* Skip non-alive and non-owner nodes */ + if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || + (!(nd->nd_flags & MD_MN_NODE_OWN))) { + nd = nd->nd_next; + continue; + } + if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, + MN_SUSP_IO, ep)) { + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to suspend I/O on node %s" + " in set %s"), nd->nd_nodename, + sp->setname); + /* + * Resume other nodes that had been suspended. + * (Reconfig return step also resumes I/Os + * for all sets.) + */ + nd2 = sd->sd_nodelist; + while (nd2) { + /* Stop when reaching failed node */ + if (nd2->nd_nodeid == nd->nd_nodeid) + break; + /* Skip non-alive/non-owner nodes */ + if ((!(nd2->nd_flags & + MD_MN_NODE_ALIVE)) || + (!(nd2->nd_flags & + MD_MN_NODE_OWN))) { + nd2 = nd2->nd_next; + continue; + } + (void) (clnt_mn_susp_res_io( + nd2->nd_nodename, sp->setno, + MN_RES_IO, &xep)); + nd2 = nd2->nd_next; + } + + /* + * If the suspend failed due to an + * RPC failure on another node, return + * a 205. + * Otherwise, exit with failure. + * The return reconfig step will resume + * I/Os for all disksets. 
+ */ + if ((mdanyrpcerror(ep)) && + (sd->sd_mn_mynode->nd_nodeid != + nd->nd_nodeid)) { + return (205); + } else { + return (-1); + } + } + nd = nd->nd_next; + } + } + + nd = sd->sd_nodelist; + while (nd) { + /* + * If a node is in the membership list but isn't joined + * to the set, try to join the node. + */ + if ((nd->nd_flags & MD_MN_NODE_ALIVE) && + (!(nd->nd_flags & MD_MN_NODE_OWN))) { + if (clnt_joinset(nd->nd_nodename, sp, + (MNSET_IN_RECONFIG | stale_flag), ep)) { + /* + * If RPC failure to another node + * then exit without attempting anything else. + * (Reconfig return step will resume I/Os + * for all sets.) + */ + if (mdanyrpcerror(ep)) { + mde_perror(ep, ""); + return (205); + } + /* + * STALE and ACCOK failures aren't true + * failures. STALE means that <50% mddbs + * are available. ACCOK means that the + * mediator provided the extra vote. + * If a true failure, then print message + * and withdraw node from set in order to + * cleanup from failed join attempt. + */ + if ((!mdismddberror(ep, MDE_DB_STALE)) && + (!mdismddberror(ep, MDE_DB_ACCOK))) { + mde_perror(ep, + "WARNING: Unable to join node %s " + "to set %s", nd->nd_nodename, + sp->setname); + mdclrerror(ep); + if (clnt_withdrawset(nd->nd_nodename, + sp, &xep)) + mdclrerror(&xep); + nd = nd->nd_next; + continue; + } + } + /* + * Set owner flag even if STALE or ACCOK. + * NOTE(review): on the STALE/ACCOK path ep is not + * cleared here, whereas meta_mnsync_diskset_mddbs + * calls mdclrerror(ep) for both cases - confirm + * that leaving the error in ep is intended. + */ + nd->nd_flags |= MD_MN_NODE_OWN; + } + nd = nd->nd_next; + } + /* + * Resume I/Os if suspended above. + */ + if (susp_res_flag) { + nd = sd->sd_nodelist; + while (nd) { + /* + * Skip non-alive and non-owner nodes + * (this list doesn't include any of + * the nodes that were joined).
+ */ + if ((!(nd->nd_flags & MD_MN_NODE_ALIVE)) || + (!(nd->nd_flags & MD_MN_NODE_OWN))) { + nd = nd->nd_next; + continue; + } + if (clnt_mn_susp_res_io(nd->nd_nodename, sp->setno, + MN_RES_IO, ep)) { + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to resume I/O on node %s" + " in set %s"), nd->nd_nodename, + sp->setname); + + /* + * If an RPC failure then don't do any + * more RPC calls, since one timeout is enough + * to endure. If RPC failure to another node, + * return 205. If RPC failure to my node, + * return -1. + * (Reconfig return step will resume I/Os + * for all sets.) + * If not an RPC failure, continue resuming the + * rest of the nodes and then return -1. + */ + if (mdanyrpcerror(ep)) { + if (sd->sd_mn_mynode->nd_nodeid == + nd->nd_nodeid) { + return (-1); + } else { + return (205); + } + } + + /* + * If not an RPC error, continue resuming rest + * of nodes, ignoring any failures except for + * an RPC failure which constitutes an + * immediate exit. + * Start in middle of list with failing node. + */ + nd2 = nd->nd_next; + while (nd2) { + /* Skip non-owner nodes */ + if ((!(nd2->nd_flags & + MD_MN_NODE_ALIVE)) || + (!(nd2->nd_flags & + MD_MN_NODE_OWN))) { + nd2 = nd2->nd_next; + continue; + } + (void) (clnt_mn_susp_res_io( + nd2->nd_nodename, sp->setno, + MN_RES_IO, &xep)); + if (mdanyrpcerror(&xep)) { + return (-1); + } + nd2 = nd2->nd_next; + } + } + nd = nd->nd_next; + } + } + + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_OWN)) { + nd = nd->nd_next; + continue; + } + /* + * If 1 node fails - go ahead and update the rest except + * in the case of an RPC failure, fail immediately. 
+ */ + if (clnt_upd_nr_flags(nd->nd_nodename, sp, + sd->sd_nodelist, MD_NR_SET, MNSET_IN_RECONFIG, ep)) { + /* RPC failure to another node */ + if (mdanyrpcerror(ep)) { + return (205); + } + nd = nd->nd_next; + rval = -1; + continue; + } + nd = nd->nd_next; + } + + meta_mc_log(MC_LOG5, dgettext(TEXT_DOMAIN, + "Join of all nodes completed for set %s: %s"), + sp->setname, meta_print_hrtime(gethrtime() - start_time)); + + return (rval); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_set_drv.c b/usr/src/lib/lvm/libmeta/common/meta_set_drv.c new file mode 100644 index 0000000000..5fad53ad7b --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_set_drv.c @@ -0,0 +1,1948 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Metadevice diskset interfaces + */ + +#include <meta.h> +#include <mdmn_changelog.h> +#include "meta_set_prv.h" +#include "meta_repartition.h" +/* + * check_setnodes_againstdrivelist + * Run checkdrive_onnode() for every drive in dnlp against every node + * of the set: each ALIVE node in sd_nodelist when MD_MNSET_DESC(sd) + * is true, otherwise each non-empty slot of sd_nodes[]. + * + * Returns 0 - every check passed + * -1 - set descriptor could not be obtained, or a + * checkdrive_onnode() call returned non-zero; + * ep is left as set by the failing call. + */ +static int +check_setnodes_againstdrivelist( + mdsetname_t *sp, + mddrivenamelist_t *dnlp, + md_error_t *ep +) +{ + md_set_desc *sd; + mddrivenamelist_t *p; + int i; + md_mnnode_desc *nd; + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + for (p = dnlp; p != NULL; p = p->next) + if (checkdrive_onnode(sp, p->drivenamep, + nd->nd_nodename, ep)) + return (-1); + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + for (p = dnlp; p != NULL; p = p->next) + if (checkdrive_onnode(sp, p->drivenamep, + sd->sd_nodes[i], ep)) + return (-1); + } + } + return (0); +} +/* + * drvsuniq + * Check that no two entries of dnlp carry the same cname, by + * comparing each entry with every later entry. + * + * Returns 0 if all names are distinct; otherwise the value of + * mddserror() for MDE_DS_DUPDRIVE on the first duplicate found + * (the caller in this file tests that value against -1). + */ +static int +drvsuniq(mdsetname_t *sp, mddrivenamelist_t *dnlp, md_error_t *ep) +{ + mddrivenamelist_t *dl1, *dl2; + mddrivename_t *dn1, *dn2; + + for (dl1 = dnlp; dl1 != NULL; dl1 = dl1->next) { + dn1 = dl1->drivenamep; + + for (dl2 = dl1->next; dl2 != NULL; dl2 = dl2->next) { + dn2 = dl2->drivenamep; + if (strcmp(dn1->cname, dn2->cname) != 0) + continue; + + return (mddserror(ep, MDE_DS_DUPDRIVE, sp->setno, + NULL, dn1->cname, sp->setname)); + } + } + return (0); +} +/* + * metaget_drivedesc_fromdrivelist + * Build a drive descriptor list with one metadrivedesc_append() + * call per drive in dnlp, passing the set's sd_ctime and sd_genid + * and the caller's flags. + * + * Returns the list head; NULL if the set descriptor cannot be + * obtained or if dnlp is empty. + */ +static md_drive_desc * +metaget_drivedesc_fromdrivelist( + mdsetname_t *sp, + mddrivenamelist_t *dnlp, + uint_t flags, + md_error_t *ep +) +{ + mddrivenamelist_t *p; + md_drive_desc *dd = NULL; + md_set_desc *sd; + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (NULL); + + for (p = dnlp; p != NULL; p = p->next) { + (void) metadrivedesc_append(&dd, p->drivenamep, 0, 0, + sd->sd_ctime, sd->sd_genid, flags); + } + + return (dd); +} + +/* + * Exported Entry Points + */ + +int
+meta_make_sidenmlist( + mdsetname_t *sp, + mddrivename_t *dnp, + md_error_t *ep +) +{ + mdsidenames_t *sn, **sn_next; + mdname_t *np; + int done; + side_t sideno = MD_SIDEWILD; + uint_t rep_slice; + + if (meta_replicaslice(dnp, &rep_slice, ep) != 0) + return (-1); + + dnp->side_names_key = MD_KEYWILD; + + if ((np = metaslicename(dnp, rep_slice, ep)) == NULL) + return (-1); + + metaflushsidenames(dnp); + sn_next = &dnp->side_names; + /*CONSTCOND*/ + while (1) { + sn = Zalloc(sizeof (*sn)); + + if ((done = meta_getnextside_devinfo(sp, np->bname, + &sideno, &sn->cname, &sn->dname, &sn->mnum, ep)) == -1) { + Free(sn); + return (-1); + } + + if (done == 0) { + Free(sn); + return (0); + } + + sn->sideno = sideno; + + /* Add to the end of the linked list */ + assert(*sn_next == NULL); + *sn_next = sn; + sn_next = &sn->next; + } + /*NOTREACHED*/ +} + +int +meta_set_adddrives( + mdsetname_t *sp, + mddrivenamelist_t *dnlp, + daddr_t dbsize, + int force_label, + md_error_t *ep +) +{ + md_set_desc *sd; + md_drive_desc *dd = NULL, *curdd = NULL, *ddp; + int i; + mddrivenamelist_t *p; + mhd_mhiargs_t mhiargs; + int rval = 0; + md_timeval32_t now; + sigset_t oldsigs; + ulong_t genid; + ulong_t max_genid = 0; + md_setkey_t *cl_sk; + int rb_level = 0; + md_error_t xep = mdnullerror; + md_mnnode_desc *nd; + int suspendall_flag = 0; + int suspend1_flag = 0; + int lock_flag = 0; + int flush_set_onerr = 0; + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + /* Make sure we own the set */ + if (meta_check_ownership(sp, ep) != 0) + return (-1); + + /* + * The drive and node records are stored in the local mddbs of each + * node in the diskset. Each node's rpc.metad daemon reads in the set, + * drive and node records from that node's local mddb and caches them + * internally. Any process needing diskset information contacts its + * local rpc.metad to get this information. 
Since each node in the + * diskset is independently reading the set information from its local + * mddb, the set, drive and node records in the local mddbs must stay + * in-sync, so that all nodes have a consistent view of the diskset. + * + * For a multinode diskset, explicitly verify that all nodes in the + * diskset are ALIVE (i.e. are in the API membership list). Otherwise, + * fail this operation since all nodes must be ALIVE in order to add + * the new drive record to their local mddb. If a panic of this node + * leaves the local mddbs set, node and drive records out-of-sync, the + * reconfig cycle will fix the local mddbs and force them back into + * synchronization. + */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, + sp->setno, + nd->nd_nodename, NULL, sp->setname); + return (-1); + } + nd = nd->nd_next; + } + } + + if (drvsuniq(sp, dnlp, ep) == -1) + return (-1); + + /* + * Lock the set on current set members. + * Set locking done much earlier for MN diskset than for traditional + * diskset since lock_set and SUSPEND are used to protect against + * other meta* commands running on the other nodes. + */ + if (MD_MNSET_DESC(sd)) { + /* Make sure we are blocking all signals */ + if (procsigs(TRUE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_lock_set(nd->nd_nodename, sp, ep)) { + rval = -1; + goto out; + } + lock_flag = 1; + nd = nd->nd_next; + } + /* + * Lock out other meta* commands by suspending + * class 1 messages across the diskset. 
+ */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_mdcommdctl(nd->nd_nodename, + COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1, + MD_MSCF_NO_FLAGS, ep)) { + rval = -1; + goto out; + } + suspend1_flag = 1; + nd = nd->nd_next; + } + } + + if (check_setnodes_againstdrivelist(sp, dnlp, ep)) { + rval = -1; + goto out; + } + + for (p = dnlp; p != NULL; p = p->next) { + mdsetname_t *tmp; + + if (meta_is_drive_in_anyset(p->drivenamep, &tmp, FALSE, + ep) == -1) { + rval = -1; + goto out; + } + + if (tmp != NULL) { + (void) mddserror(ep, MDE_DS_DRIVEINSET, sp->setno, + tmp->setname, p->drivenamep->cname, sp->setname); + rval = -1; + goto out; + } + } + + /* END CHECK CODE */ + + /* + * This is a separate loop (from above) so that we validate all the + * drives handed to us before we repartition any one drive. + */ + for (p = dnlp; p != NULL; p = p->next) { + if (meta_repartition_drive(sp, + p->drivenamep, + force_label == TRUE ? MD_REPART_FORCE : 0, + NULL, /* Don't return the VTOC. */ + ep) != 0) { + rval = -1; + goto out; + } + + /* + * Create the names for the drives we are adding per side. + */ + if (meta_make_sidenmlist(sp, p->drivenamep, ep) == -1) { + rval = -1; + goto out; + } + } + + /* + * Get the list of drives descriptors that we are adding. + */ + dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_ADD, ep); + + if (! 
mdisok(ep)) { + rval = -1; + goto out; + } + + /* + * Slam a dummy master block on all the disks that we are adding + * Used by diskset import if the disksets are remotely replicated + */ + for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) { + uint_t rep_slice; + int fd = -1; + mdname_t *np = NULL; + + if (meta_replicaslice(ddp->dd_dnp, &rep_slice, &xep) != 0) { + mdclrerror(&xep); + continue; + } + + if ((np = metaslicename(ddp->dd_dnp, rep_slice, &xep)) + == NULL) { + mdclrerror(&xep); + continue; + } + + if ((fd = open(np->rname, O_RDWR)) >= 0) { + meta_mkdummymaster(sp, fd, 16); + (void) close(fd); + } + } + + /* + * Get the set timeout information. + */ + (void) memset(&mhiargs, '\0', sizeof (mhiargs)); + if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) { + rval = -1; + goto out; + } + + /* + * Get timestamp and generation id for new records + */ + now = sd->sd_ctime; + genid = sd->sd_genid; + + + /* At this point, in case of error, set should be flushed. */ + flush_set_onerr = 1; + + /* Lock the set on current set members */ + if (!(MD_MNSET_DESC(sd))) { + md_rb_sig_handling_on(); + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { + rval = -1; + goto out; + } + lock_flag = 1; + } + } + + /* + * Get drive descriptors for the drives that are currently in the set. + */ + curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep); + if (! mdisok(ep)) + goto rollback; + + /* + * If first drive being added to set, set the mastership + * of the multinode diskset to be this node. + * Only set it on this node. If all goes well + * and there are no errors, the mastership of this node will be set + * on all nodes in user space and in the kernel. 
+ */ + if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) { + if (clnt_mnsetmaster(mynode(), sp, + sd->sd_mn_mynode->nd_nodename, + sd->sd_mn_mynode->nd_nodeid, ep)) { + goto rollback; + } + /* + * Set this up in my local cache of the set desc so that + * the set descriptor won't have to be gotten again from + * rpc.metad. If it is flushed and gotten again, these + * values will be set in sr2setdesc. + */ + sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid; + (void) strcpy(sd->sd_mn_master_nodenm, + sd->sd_mn_mynode->nd_nodename); + sd->sd_mn_am_i_master = 1; + } + + RB_TEST(1, "adddrives", ep) + + RB_PREEMPT; + rb_level = 1; /* level 1 */ + + RB_TEST(2, "adddrives", ep) + + /* + * Add the drive records for the drives that we are adding to + * each host in the set. Marks the drive as MD_DR_ADD. + */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_adddrvs(nd->nd_nodename, sp, dd, now, genid, + ep) == -1) + goto rollback; + + RB_TEST(3, "adddrives", ep) + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_adddrvs(sd->sd_nodes[i], sp, dd, now, genid, + ep) == -1) + goto rollback; + + RB_TEST(3, "adddrives", ep) + } + } + + RB_TEST(4, "adddrives", ep) + + RB_PREEMPT; + rb_level = 2; /* level 2 */ + + RB_TEST(5, "adddrives", ep) + + /* + * Take ownership of the added drives. + */ + if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) { + if (tk_own_bydd(sp, dd, &mhiargs, TRUE, ep)) + goto rollback; + } + + RB_TEST(6, "adddrives", ep) + + RB_PREEMPT; + rb_level = 3; /* level 3 */ + + RB_TEST(7, "adddrives", ep) + + /* + * Balance the DB's according to the list of existing drives and the + * list of added drives. 
+ */ + if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1) + goto rollback; + + if ((curdd == NULL) && (MD_MNSET_DESC(sd))) { + /* + * Notify rpc.mdcommd on all nodes of a nodelist change. + * Start by suspending rpc.mdcommd (which drains it of all + * messages), then change the nodelist followed by a reinit + * and resume. + */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, + sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) { + rval = -1; + goto out; + } + suspendall_flag = 1; + nd = nd->nd_next; + } + } + + /* + * If a MN diskset and this is the first disk(s) being added + * to set, then pre-allocate change log records here. + * When the other nodes are joined into the MN diskset, the + * USER records will just be snarfed in. + */ + if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) { + if (mdmn_allocate_changelog(sp, ep) != 0) + goto rollback; + } + + /* + * Mark the drives MD_DR_OK. + * If first drive being added to MN diskset, then set + * master on all nodes to be this node and then join + * all alive nodes (nodes in membership list) to set. + */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + /* don't set master on this node - done earlier */ + if ((curdd == NULL) && (nd->nd_nodeid != + sd->sd_mn_mynode->nd_nodeid)) { + /* + * Set master on all alive nodes since + * all alive nodes will become joined nodes. + */ + if (clnt_mnsetmaster(nd->nd_nodename, sp, + sd->sd_mn_mynode->nd_nodename, + sd->sd_mn_mynode->nd_nodeid, ep)) { + goto rollback; + } + } + + if (curdd == NULL) { + /* + * No special flags for join set. Since + * all nodes are joining if 1st drive is being + * added to set then all nodes will be either + * STALE or non-STALE and each node can + * determine this on its own. 
+ */ + if (clnt_joinset(nd->nd_nodename, sp, + NULL, ep)) { + goto rollback; + } + /* Sets join node flag on all nodes in list */ + if (clnt_upd_nr_flags(nd->nd_nodename, sp, + sd->sd_nodelist, MD_NR_JOIN, NULL, ep)) { + goto rollback; + } + } + + /* + * Set MD_DR_OK as last thing before unlock. + * In case of panic on this node, recovery + * code can check for MD_DR_OK to determine + * status of diskset. + */ + if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd, + MD_DR_OK, ep) == -1) + goto rollback; + + + RB_TEST(8, "adddrives", ep) + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd, MD_DR_OK, + ep) == -1) + goto rollback; + + RB_TEST(8, "adddrives", ep) + } + } + + RB_TEST(9, "adddrives", ep) + +out: + /* + * Notify rpc.mdcommd on all nodes of a nodelist change. + * Send reinit command to mdcommd which forces it to get + * fresh set description. + */ + if (suspendall_flag) { + /* Send reinit */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + /* Class is ignored for REINIT */ + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, + sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to reinit rpc.mdcommd.\n")); + } + nd = nd->nd_next; + } + } + /* + * Unlock diskset by resuming messages across the diskset. + * Just resume all classes so that resume is the same whether + * just one class was locked or all classes were locked. 
+ */ + if ((suspend1_flag) || (suspendall_flag)) { + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, + sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to resume rpc.mdcommd.\n")); + } + nd = nd->nd_next; + } + meta_ping_mnset(sp->setno); + } + + if (lock_flag) { + cl_sk = cl_get_setkey(sp->setno, sp->setname); + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_unlock_set(nd->nd_nodename, + cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_unlock_set(sd->sd_nodes[i], + cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + } + } + cl_set_setkey(NULL); + } + + metafreedrivedesc(&dd); + + if (flush_set_onerr) { + metaflushsetname(sp); + if (!(MD_MNSET_DESC(sd))) { + md_rb_sig_handling_off(md_got_sig(), md_which_sig()); + } + } + + if (MD_MNSET_DESC(sd)) { + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + } + + return (rval); + +rollback: + /* all signals already blocked for MN diskset */ + if (!(MD_MNSET_DESC(sd))) { + /* Make sure we are blocking all signals */ + if (procsigs(TRUE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + } + + rval = -1; + + max_genid = sd->sd_genid; + + /* level 3 */ + if (rb_level > 2) { + /* + * Since the add drive operation is failing, need + * to reset config back to the way it was + * before the add drive operation. + * If a MN diskset and this is the first drive being added, + * then reset master on all ALIVE nodes (which is all nodes) + * since the master would have not been set previously.
+ * Don't reset master on this node, since this + * is done later. + * This is ok to fail since next node to add first + * disk to diskset will also set the master on all nodes. + * + * Also, if this is the first drive being added, + * need to have each node withdraw itself from the set. + */ + if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) { + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + /* + * Be careful with ordering in case of + * panic between the steps and the + * effect on recovery during reconfig. + */ + if (clnt_withdrawset(nd->nd_nodename, sp, &xep)) + mdclrerror(&xep); + + /* Sets withdraw flag on all nodes in list */ + if (clnt_upd_nr_flags(nd->nd_nodename, sp, + sd->sd_nodelist, MD_NR_WITHDRAW, + NULL, &xep)) { + mdclrerror(&xep); + } + + /* Skip this node */ + if (nd->nd_nodeid == + sd->sd_mn_mynode->nd_nodeid) { + nd = nd->nd_next; + continue; + } + /* Reset master on all of the other nodes. */ + if (clnt_mnsetmaster(nd->nd_nodename, sp, + "", MD_MN_INVALID_NID, &xep)) + mdclrerror(&xep); + nd = nd->nd_next; + } + } + } + + /* + * Send resume command to mdcommd. Don't send reinit command + * since nodelist should not have changed. + * If suspendall_flag is set, then user would have been adding + * first drives to set. Since this failed, there is certainly + * no reinit message to send to rpc.commd since no nodes will + * be joined to set at the end of this metaset command. + */ + if (suspendall_flag) { + /* Send resume */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + /* + * Resume all classes but class 1 so that lock is held + * against meta* commands. + * To later resume class1, must issue a class0 resume. 
+ */ + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, + sp, MD_MSG_CLASS0, + MD_MSCF_DONT_RESUME_CLASS1, &xep)) { + mde_perror(&xep, dgettext(TEXT_DOMAIN, + "Unable to resume rpc.mdcommd.\n")); + mdclrerror(&xep); + } + nd = nd->nd_next; + } + meta_ping_mnset(sp->setno); + } + + /* level 3 */ + if (rb_level > 2) { + mdnamelist_t *nlp; + mdname_t *np; + + for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) { + uint_t rep_slice; + + if ((meta_replicaslice(ddp->dd_dnp, + &rep_slice, &xep) != 0) || + ((np = metaslicename(ddp->dd_dnp, rep_slice, + &xep)) == NULL)) { + mdclrerror(&xep); + continue; + } + nlp = NULL; + (void) metanamelist_append(&nlp, np); + + if (meta_db_detach(sp, nlp, + (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, &xep)) + mdclrerror(&xep); + + metafreenamelist(nlp); + } + + /* Re-balance */ + if (meta_db_balance(sp, NULL, curdd, 0, &xep) == -1) + mdclrerror(&xep); + + /* Only if we are adding the first drive */ + /* Handled MN diskset above. */ + if ((curdd == NULL) && !(MD_MNSET_DESC(sd))) { + if (clnt_stimeout(mynode(), sp, &defmhiargs, + &xep) == -1) + mdclrerror(&xep); + + /* This is needed because of a corner case */ + if (halt_set(sp, &xep)) + mdclrerror(&xep); + } + max_genid++; + } + + /* level 2 */ + if (rb_level > 1) { + if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) { + if (rel_own_bydd(sp, dd, TRUE, &xep)) + mdclrerror(&xep); + } + } + + /* level 1 */ + if (rb_level > 0) { + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_deldrvs(nd->nd_nodename, sp, dd, + &xep) == -1) + mdclrerror(&xep); + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, + &xep) == -1) + mdclrerror(&xep); + } + } + max_genid += 2; + resync_genid(sp, sd, max_genid, 0, NULL); + } + + if ((suspend1_flag) || (suspendall_flag)) { + /* Send resume */ + nd = 
sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + /* + * Just resume all classes so that resume is the + * same whether just one class was locked or all + * classes were locked. + */ + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, + sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { + mdclrerror(&xep); + } + nd = nd->nd_next; + } + meta_ping_mnset(sp->setno); + } + + /* level 0 */ + cl_sk = cl_get_setkey(sp->setno, sp->setname); + /* Don't test lock flag since guaranteed to be set if in rollback */ + if (MD_MNSET_DESC(sd)) { + /* + * Since the add drive operation is failing, need + * to reset config back to the way it was + * before the add drive operation. + * If a MN diskset and this is the first drive being + * added, then reset master on this node since + * the master would have not been set previously. + * This is ok to fail since next node to add first + * disk to diskset will also set the master on all nodes. + */ + if (curdd == NULL) { + /* Reset master on mynode */ + if (clnt_mnsetmaster(mynode(), sp, "", + MD_MN_INVALID_NID, &xep)) + mdclrerror(&xep); + } + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) + mdclrerror(&xep); + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) + mdclrerror(&xep); + } + } + cl_set_setkey(NULL); + + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + metafreedrivedesc(&dd); + + if (flush_set_onerr) { + metaflushsetname(sp); + if (!(MD_MNSET_DESC(sd))) { + md_rb_sig_handling_off(md_got_sig(), md_which_sig()); + } + } + + return (rval); +} + +int +meta_set_deletedrives( + mdsetname_t *sp, + mddrivenamelist_t *dnlp, + int forceflg, + md_error_t *ep +) +{ + md_set_desc *sd; + md_drive_desc *ddp, *dd =
NULL, *curdd = NULL; + md_replicalist_t *rlp = NULL, *rl; + mddrivenamelist_t *p; + int deldrvcnt = 0; + int rval = 0; + mhd_mhiargs_t mhiargs; + int i; + sigset_t oldsigs; + md_setkey_t *cl_sk; + ulong_t max_genid = 0; + int rb_level = 0; + md_error_t xep = mdnullerror; + md_mnnode_desc *nd; + int has_set; + int current_drv_cnt = 0; + int suspendall_flag = 0, suspendall_flag_rb = 0; + int suspend1_flag = 0; + int lock_flag = 0; + bool_t stale_bool = FALSE; + int flush_set_onerr = 0; + mdnamelist_t *nlp; + mdname_t *np; + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + /* Make sure we own the set */ + if (meta_check_ownership(sp, ep) != 0) + return (-1); + + if (drvsuniq(sp, dnlp, ep) == -1) + return (-1); + + /* + * Check and see if all the nodes have the set. + * + * The drive and node records are stored in the local mddbs of each + * node in the diskset. Each node's rpc.metad daemon reads in the set, + * drive and node records from that node's local mddb and caches them + * internally. Any process needing diskset information contacts its + * local rpc.metad to get this information. Since each node in the + * diskset is independently reading the set information from its local + * mddb, the set, drive and node records in the local mddbs must stay + * in-sync, so that all nodes have a consistent view of the diskset. + * + * For a multinode diskset, explicitly verify that all nodes in the + * diskset are ALIVE (i.e. are in the API membership list). Otherwise, + * fail this operation since all nodes must be ALIVE in order to delete + * a drive record from their local mddb. If a panic of this node + * leaves the local mddbs set, node and drive records out-of-sync, the + * reconfig cycle will fix the local mddbs and force them back into + * synchronization. 
+ */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, + sp->setno, + nd->nd_nodename, NULL, sp->setname); + return (-1); + } + nd = nd->nd_next; + } + + /* Make sure we are blocking all signals */ + if (procsigs(TRUE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + /* + * Lock the set on current set members. + * Set locking done much earlier for MN diskset than for + * traditional diskset since lock_set and SUSPEND are used + * to protect against other meta* commands running on the + * other nodes. + */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_lock_set(nd->nd_nodename, sp, ep)) { + rval = -1; + goto out; + } + lock_flag = 1; + nd = nd->nd_next; + } + /* + * Lock out other meta* commands by suspending + * class 1 messages across the diskset. + */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_mdcommdctl(nd->nd_nodename, + COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1, + MD_MSCF_NO_FLAGS, ep)) { + rval = -1; + goto out; + } + suspend1_flag = 1; + nd = nd->nd_next; + } + + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (strcmp(nd->nd_nodename, mynode()) == 0) { + nd = nd->nd_next; + continue; + } + + has_set = nodehasset(sp, nd->nd_nodename, + NHS_NSTG_EQ, ep); + if (has_set < 0) { + rval = -1; + goto out; + } + + if (! has_set) { + (void) mddserror(ep, MDE_DS_NODENOSET, + sp->setno, nd->nd_nodename, + NULL, sp->setname); + rval = -1; + goto out; + } + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (strcmp(sd->sd_nodes[i], mynode()) == 0) + continue; + + has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NSTG_EQ, + ep); + if (has_set < 0) { + /* + * Can directly return since !MN diskset; + * nothing to unlock. 
+ */ + return (-1); + } + + if (! has_set) { + /* + * Can directly return since !MN diskset; + * nothing to unlock. + */ + return (mddserror(ep, MDE_DS_NODENOSET, + sp->setno, sd->sd_nodes[i], NULL, + sp->setname)); + } + } + } + + for (p = dnlp; p != NULL; p = p->next) { + int is_it; + mddrivename_t *dnp; + + dnp = p->drivenamep; + + if ((is_it = meta_is_drive_in_thisset(sp, dnp, FALSE, ep)) + == -1) { + rval = -1; + goto out; + } + + if (! is_it) { + (void) mddserror(ep, MDE_DS_DRIVENOTINSET, sp->setno, + NULL, dnp->cname, sp->setname); + rval = -1; + goto out; + } + + if ((meta_check_drive_inuse(sp, dnp, FALSE, ep)) == -1) { + rval = -1; + goto out; + } + + deldrvcnt++; + } + current_drv_cnt = deldrvcnt; + + /* + * Get drive descriptors for the drives that are currently in the set. + */ + curdd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep); + if (! mdisok(ep)) { + rval = -1; + goto out; + } + + /* + * Decrement the delete drive count for each drive currently in the + * set. + */ + for (ddp = curdd; ddp != NULL; ddp = ddp->dd_next) + deldrvcnt--; + + /* + * If the count of drives we are deleting is equal to the drives in the + * set, and we haven't specified forceflg, return an error + */ + if (deldrvcnt == 0 && forceflg == FALSE) { + (void) mderror(ep, MDE_FORCE_DEL_ALL_DRV, NULL); + rval = -1; + goto out; + } + + /* + * Get the list of drive descriptors that we are deleting. + */ + dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_DEL, ep); + if (! mdisok(ep)) { + rval = -1; + goto out; + } + + /* + * Get the set timeout information in case we have to roll back. + */ + (void) memset(&mhiargs, '\0', sizeof (mhiargs)); + if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) { + rval = -1; + goto out; + } + + /* At this point, in case of error, set should be flushed.
*/ + flush_set_onerr = 1; + + /* END CHECK CODE */ + + /* Lock the set on current set members */ + if (!(MD_MNSET_DESC(sd))) { + md_rb_sig_handling_on(); + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { + rval = -1; + goto out; + } + lock_flag = 1; + } + } + + if ((deldrvcnt == 0) && (MD_MNSET_DESC(sd))) { + mddb_config_t c; + /* + * Is current set STALE? + */ + (void) memset(&c, 0, sizeof (c)); + c.c_id = 0; + c.c_setno = sp->setno; + if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { + (void) mdstealerror(ep, &c.c_mde); + rval = -1; + goto out; + } + if (c.c_flags & MDDB_C_STALE) { + stale_bool = TRUE; + } + } + + RB_TEST(1, "deletedrives", ep) + + RB_PREEMPT; + rb_level = 1; /* level 1 */ + + RB_TEST(2, "deletedrives", ep) + + /* + * Mark the drives MD_DR_DEL + */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd, + MD_DR_DEL, ep) == -1) + goto rollback; + + RB_TEST(3, "deletedrives", ep) + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd, + MD_DR_DEL, ep) == -1) + goto rollback; + + RB_TEST(3, "deletedrives", ep) + } + } + + RB_TEST(4, "deletedrives", ep) + + RB_PREEMPT; + rb_level = 2; /* level 2 */ + + RB_TEST(5, "deletedrives", ep) + + /* + * Balance the DB's according to the list of existing drives and the + * list of deleted drives. + */ + if (meta_db_balance(sp, dd, curdd, 0, ep) == -1) + goto rollback; + + /* + * If the drive(s) to be deleted cannot be accessed, + * they haven't really been deleted yet. Check and delete now + * if need be. 
+ */ + if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) >= 0) { + nlp = NULL; + for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) { + char *delete_name; + + delete_name = ddp->dd_dnp->cname; + + for (rl = rlp; rl != NULL; rl = rl->rl_next) { + char *cur_name; + + cur_name = + rl->rl_repp->r_namep->drivenamep->cname; + + if (strcmp(delete_name, cur_name) == 0) { + /* put it on the delete list */ + np = rl->rl_repp->r_namep; + (void) metanamelist_append(&nlp, np); + + } + } + } + + if (nlp != NULL) { + if (meta_db_detach(sp, nlp, + (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, + ep) == -1) { + metafreenamelist(nlp); + goto rollback; + } + metafreenamelist(nlp); + } + } + + RB_TEST(6, "deletedrives", ep) + + RB_PREEMPT; + rb_level = 3; /* level 3 */ + + RB_TEST(7, "deletedrives", ep) + + /* + * Cannot suspend set until after meta_db_balance since + * meta_db_balance uses META_DB_ATTACH/DETACH messages. + */ + if ((deldrvcnt == 0) && (MD_MNSET_DESC(sd))) { + /* + * Notify rpc.mdcommd on all nodes of a nodelist change. + * Start by suspending rpc.mdcommd (which drains it of all + * messages), then change the nodelist followed by a reinit + * and resume. + */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, + sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) { + rval = -1; + goto out; + } + suspendall_flag = 1; + nd = nd->nd_next; + } + } + + /* + * Remove the drive records for the drives that were deleted from + * each host in the set. This removes the record and dr_flags. 
+ */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_deldrvs(nd->nd_nodename, sp, dd, ep) == -1) + goto rollback; + + RB_TEST(8, "deletedrives", ep) + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep) == -1) + goto rollback; + + RB_TEST(8, "deletedrives", ep) + } + } + + RB_TEST(9, "deletedrives", ep) + + RB_PREEMPT; + rb_level = 4; /* level 4 */ + + RB_TEST(10, "deletedrives", ep) + + if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) { + if (rel_own_bydd(sp, dd, TRUE, ep)) + goto rollback; + } + + /* If we deleted all the drives, then we need to halt the set. */ + if (deldrvcnt == 0) { + RB_TEST(11, "deletedrives", ep) + + RB_PREEMPT; + rb_level = 5; /* level 5 */ + + RB_TEST(12, "deletedrives", ep) + + if (clnt_stimeout(mynode(), sp, &defmhiargs, ep) == -1) + goto rollback; + + RB_TEST(13, "deletedrives", ep) + + RB_PREEMPT; + rb_level = 6; /* level 6 */ + + RB_TEST(14, "deletedrives", ep) + + /* Halt MN diskset on all nodes by having node withdraw */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + /* Only withdraw nodes that are joined */ + if (!(nd->nd_flags & MD_MN_NODE_OWN)) { + nd = nd->nd_next; + continue; + } + /* + * Going to set locally cached node flags to + * rollback join so in case of error, the + * rollback code knows which nodes to re-join. + */ + nd->nd_flags |= MD_MN_NODE_RB_JOIN; + + /* + * Be careful in ordering of following steps + * so that recovery from a panic between + * the steps is viable. + * Only reset master info in rpc.metad - + * don't reset local cached information + * which will be used to set master information + * back in case of failure (rollback). 
+ */ + if (clnt_withdrawset(nd->nd_nodename, sp, ep)) + goto rollback; + /* Sets withdraw flag on all nodes in list */ + if (clnt_upd_nr_flags(nd->nd_nodename, sp, + sd->sd_nodelist, MD_NR_WITHDRAW, + NULL, ep)) { + goto rollback; + } + if (clnt_mnsetmaster(nd->nd_nodename, sp, + "", MD_MN_INVALID_NID, ep)) { + goto rollback; + } + nd = nd->nd_next; + } + } else { + if (halt_set(sp, ep)) + goto rollback; + } + + RB_TEST(15, "deletedrives", ep) + } + + RB_TEST(16, "deletedrives", ep) + +out: + /* + * Notify rpc.mdcommd on all nodes of a nodelist change. + * Send reinit command to mdcommd which forces it to get + * fresh set description. + */ + if (suspendall_flag) { + /* Send reinit */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + /* Class is ignored for REINIT */ + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, + sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to reinit rpc.mdcommd.\n")); + } + nd = nd->nd_next; + } + } + + /* + * Just resume all classes so that resume is the same whether + * just one class was locked or all classes were locked. 
+ */ + if ((suspend1_flag) || (suspendall_flag)) { + /* Send resume */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, + sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to resume rpc.mdcommd.\n")); + } + nd = nd->nd_next; + } + meta_ping_mnset(sp->setno); + } + if (lock_flag) { + cl_sk = cl_get_setkey(sp->setno, sp->setname); + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_unlock_set(nd->nd_nodename, + cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_unlock_set(sd->sd_nodes[i], + cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + } + } + cl_set_setkey(NULL); + } + + metafreedrivedesc(&dd); + + if (flush_set_onerr) { + metaflushsetname(sp); + if (!(MD_MNSET_DESC(sd))) { + md_rb_sig_handling_off(md_got_sig(), md_which_sig()); + } + } + + if (MD_MNSET_DESC(sd)) { + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + } + + return (rval); + +rollback: + /* all signals already blocked for MN diskset */ + if (!(MD_MNSET_DESC(sd))) { + /* Make sure we are blocking all signals */ + if (procsigs(TRUE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + } + + rval = -1; + + max_genid = sd->sd_genid; + + /* Set the master on all nodes first thing */ + if (rb_level > 5) { + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) { + continue; + } + /* + * Set master on all re-joining nodes to be + * my cached view of master.
+ */ + if (clnt_mnsetmaster(nd->nd_nodename, sp, + sd->sd_mn_master_nodenm, + sd->sd_mn_master_nodeid, &xep)) { + mdclrerror(&xep); + } + } + } + } + + /* level 3 */ + if (rb_level > 2) { + md_set_record *sr; + md_mnset_record *mnsr; + md_drive_record *dr; + int sr_drive_cnt; + + /* + * See if we have to re-add the drives specified. + */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + /* + * Must get current set record from each + * node to see what else must be done + * to recover. + * Record should be for a multi-node diskset. + */ + if (clnt_mngetset(nd->nd_nodename, sp->setname, + MD_SET_BAD, &mnsr, &xep) == -1) { + mdclrerror(&xep); + nd = nd->nd_next; + continue; + } + + /* + * If all drives are already there, skip + * to next node. + */ + sr_drive_cnt = 0; + dr = mnsr->sr_drivechain; + while (dr) { + sr_drive_cnt++; + dr = dr->dr_next; + } + if (sr_drive_cnt == current_drv_cnt) { + free_sr((md_set_record *)mnsr); + nd = nd->nd_next; + continue; + } + + /* Readd all drives */ + if (clnt_adddrvs(nd->nd_nodename, sp, dd, + mnsr->sr_ctime, mnsr->sr_genid, &xep) == -1) + mdclrerror(&xep); + + free_sr((struct md_set_record *)mnsr); + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* Record should be for a non-multi-node set */ + if (clnt_getset(sd->sd_nodes[i], sp->setname, + MD_SET_BAD, &sr, &xep) == -1) { + mdclrerror(&xep); + continue; + } + + /* + * Set record structure was allocated from RPC + * routine getset so this structure is only of + * size md_set_record even if the MN flag is + * set. So, clear the flag so that the free + * code doesn't attempt to free a structure + * the size of md_mnset_record. 
+ */ + if (MD_MNSET_REC(sr)) { + sr->sr_flags &= ~MD_SR_MN; + free_sr(sr); + continue; + } + + /* Drive already added, skip to next node */ + if (sr->sr_drivechain != NULL) { + free_sr(sr); + continue; + } + + if (clnt_adddrvs(sd->sd_nodes[i], sp, dd, + sr->sr_ctime, sr->sr_genid, &xep) == -1) + mdclrerror(&xep); + + free_sr(sr); + } + } + max_genid += 2; + } + + /* + * Notify rpc.mdcommd on all nodes of a nodelist change. + * At this point in time, don't know which nodes are joined + * to the set. So, send a reinit command to mdcommd + * which forces it to get fresh set description. Then send resume. + * + * Later, this code will use rpc.mdcommd messages to reattach disks + * and then rpc.mdcommd may be suspended again, rest of the nodes + * joined, rpc.mdcommd reinited and then resumed. + */ + if (suspendall_flag) { + /* Send reinit */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + /* Class is ignored for REINIT */ + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, + sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { + mde_perror(&xep, dgettext(TEXT_DOMAIN, + "Unable to reinit rpc.mdcommd.\n")); + mdclrerror(&xep); + } + nd = nd->nd_next; + } + + /* Send resume */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + /* + * Resume all classes but class 1 so that lock is held + * against meta* commands. + * To later resume class1, must issue a class0 resume. 
+ */ + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, + sp, MD_MSG_CLASS0, + MD_MSCF_DONT_RESUME_CLASS1, &xep)) { + mde_perror(&xep, dgettext(TEXT_DOMAIN, + "Unable to resume rpc.mdcommd.\n")); + mdclrerror(&xep); + } + nd = nd->nd_next; + } + meta_ping_mnset(sp->setno); + } + + /* level 2 */ + if (rb_level > 1) { + mdnamelist_t *nlp; + mdname_t *np; + + for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) { + uint_t rep_slice; + + if ((meta_replicaslice(ddp->dd_dnp, + &rep_slice, &xep) != 0) || + ((np = metaslicename(ddp->dd_dnp, rep_slice, + &xep)) == NULL)) { + mdclrerror(&xep); + continue; + } + nlp = NULL; + (void) metanamelist_append(&nlp, np); + + if (meta_db_attach(sp, nlp, + (MDCHK_DRVINSET | MDCHK_SET_LOCKED), + &sd->sd_ctime, ddp->dd_dbcnt, ddp->dd_dbsize, + NULL, &xep) == -1) + mdclrerror(&xep); + + metafreenamelist(nlp); + } + /* Re-balance */ + if (meta_db_balance(sp, NULL, curdd, 0, &xep) == -1) + mdclrerror(&xep); + } + + /* level 4 */ + if (rb_level > 3) { + if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) { + if (tk_own_bydd(sp, dd, &mhiargs, TRUE, &xep)) + mdclrerror(&xep); + } + } + + /* level 5 */ + if (rb_level > 4) { + if (clnt_stimeout(mynode(), sp, &mhiargs, &xep) == -1) + mdclrerror(&xep); + } + + /* + * If at least one node needs to be rejoined to MN diskset, + * then suspend commd again. + */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) { + nd = nd->nd_next; + continue; + } + break; + } + if (nd) { + /* + * Found node that will be rejoined so + * notify rpc.mdcommd on all nodes of a nodelist change. + * Start by suspending rpc.mdcommd (which drains it of + * all messages), then change the nodelist followed by + * a reinit and resume. 
+ */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_mdcommdctl(nd->nd_nodename, + COMMDCTL_SUSPEND, sp, MD_MSG_CLASS0, + MD_MSCF_NO_FLAGS, &xep)) { + mdclrerror(&xep); + } + suspendall_flag_rb = 1; + nd = nd->nd_next; + } + } + } + + + + /* level 6 */ + if (rb_level > 5) { + if (MD_MNSET_DESC(sd)) { + int join_flags = 0; + + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + /* Only rejoin nodes that were joined before */ + if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) { + nd = nd->nd_next; + continue; + } + /* + * Rejoin nodes to same state as before - + * either STALE or non-STALE. + */ + if (stale_bool == TRUE) + join_flags = MNSET_IS_STALE; + if (clnt_joinset(nd->nd_nodename, sp, + join_flags, &xep)) + mdclrerror(&xep); + /* Sets OWN flag on all nodes in list */ + if (clnt_upd_nr_flags(nd->nd_nodename, sp, + sd->sd_nodelist, MD_NR_JOIN, NULL, &xep)) { + mdclrerror(&xep); + } + nd = nd->nd_next; + } + } else { + if (setup_db_bydd(sp, dd, TRUE, &xep) == -1) + mdclrerror(&xep); + + /* No special flag for traditional diskset */ + if (snarf_set(sp, NULL, &xep)) + mdclrerror(&xep); + } + } + + /* level 1 */ + if (rb_level > 0) { + /* + * Mark the drives as OK. + */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + /* + * Must be last action before unlock. + * In case of panic, recovery code checks + * for MD_DR_OK to know that drive + * and possible master are fully added back. 
+ */ + if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd, + MD_DR_OK, &xep) == -1) + mdclrerror(&xep); + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd, + MD_DR_OK, &xep) == -1) + mdclrerror(&xep); + + } + } + max_genid += 2; + resync_genid(sp, sd, max_genid, 0, NULL); + } + /* + * Notify rpc.mdcommd on all nodes of a nodelist change. + * Send a reinit command to mdcommd which forces it to get + * fresh set description. + */ + if (suspendall_flag_rb) { + /* Send reinit */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + /* Class is ignored for REINIT */ + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, + sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { + mde_perror(&xep, dgettext(TEXT_DOMAIN, + "Unable to reinit rpc.mdcommd.\n")); + mdclrerror(&xep); + } + nd = nd->nd_next; + } + } + + /* + * Just resume all classes so that resume is the same whether + * just one class was locked or all classes were locked. 
+ */ + if ((suspend1_flag) || (suspendall_flag_rb) || (suspendall_flag)) { + /* Send resume */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, + sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { + mde_perror(&xep, dgettext(TEXT_DOMAIN, + "Unable to resume rpc.mdcommd.\n")); + mdclrerror(&xep); + } + nd = nd->nd_next; + } + meta_ping_mnset(sp->setno); + } + + + /* level 0 */ + cl_sk = cl_get_setkey(sp->setno, sp->setname); + /* Don't test lock flag since guaranteed to be set if in rollback */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) + mdclrerror(&xep); + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) + mdclrerror(&xep); + } + } + cl_set_setkey(NULL); + + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + metafreedrivedesc(&dd); + + if (flush_set_onerr) { + metaflushsetname(sp); + if (!(MD_MNSET_DESC(sd))) { + md_rb_sig_handling_off(md_got_sig(), md_which_sig()); + } + } + + return (rval); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_set_hst.c b/usr/src/lib/lvm/libmeta/common/meta_set_hst.c new file mode 100644 index 0000000000..d5e5f43ed1 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_set_hst.c @@ -0,0 +1,5688 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. + */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +/* + * Metadevice diskset interfaces + */ + +#include "meta_set_prv.h" +#include <meta.h> +#include <sys/lvm/md_crc.h> +#include <sys/time.h> +#include <sdssc.h> + +static int +add_db_sidenms( + mdsetname_t *sp, + md_error_t *ep +) +{ + md_replicalist_t *rlp = NULL; + md_replicalist_t *rl; + int rval = 0; + + if (metareplicalist(sp, MD_FULLNAME_ONLY, &rlp, ep) < 0) + return (-1); + + for (rl = rlp; rl != NULL; rl = rl->rl_next) { + md_replica_t *r = rl->rl_repp; + + /* + * This is not the first replica being added to the + * diskset so call with ADDSIDENMS_BCAST. If this + * is a traditional diskset, the bcast flag is ignored + * since traditional disksets don't use the rpc.mdcommd. 
+ */ + if (meta_db_addsidenms(sp, r->r_namep, r->r_blkno, + DB_ADDSIDENMS_BCAST, ep)) { + rval = -1; + goto out; + } + } + +out: + metafreereplicalist(rlp); + return (rval); +} + +static int +add_drvs_to_hosts( + mdsetname_t *sp, + int node_c, + char **node_v, + md_error_t *ep +) +{ + int i; + md_set_desc *sd; + md_drive_desc *dd; + md_timeval32_t now; + ulong_t genid; + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL) { + if (! mdisok(ep)) + return (-1); + return (0); + } + + now = sd->sd_ctime; + genid = sd->sd_genid - 1; + + for (i = 0; i < node_c; i++) { + if (clnt_adddrvs(node_v[i], sp, dd, now, genid, ep) == -1) + return (-1); + } + + return (0); +} + +static int +add_md_sidenms(mdsetname_t *sp, side_t sideno, side_t otherside, md_error_t *ep) +{ + mdnm_params_t nm; + char *cname, *dname; + side_t tmp_sideno; + minor_t mnum; + int done, i; + int rval = 0; + md_set_desc *sd; + + (void) memset(&nm, '\0', sizeof (nm)); + nm.key = MD_KEYWILD; + + if (!metaislocalset(sp)) { + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + } + /* Use rpc.mdcommd to add md side info from all nodes */ + if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && + (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { + md_mn_result_t *resultp = NULL; + md_mn_msg_meta_md_addside_t md_as; + int send_rval; + + md_as.msg_sideno = sideno; + md_as.msg_otherside = otherside; + /* + * If reconfig cycle has been started, this node is stuck in + * in the return step until this command has completed. If + * mdcommd is suspended, ask send_message to fail (instead of + * retrying) so that metaset can finish allowing the + * reconfig cycle to proceed. 
+ */ + send_rval = mdmn_send_message(sp->setno, + MD_MN_MSG_META_MD_ADDSIDE, + MD_MSGF_FAIL_ON_SUSPEND | MD_MSGF_PANIC_WHEN_INCONSISTENT, + (char *)&md_as, sizeof (md_mn_msg_meta_md_addside_t), + &resultp, ep); + if (send_rval != 0) { + (void) mdstealerror(ep, &(resultp->mmr_ep)); + if (resultp) + free_result(resultp); + return (-1); + } + if (resultp) + free_result(resultp); + return (0); + } else { + /*CONSTCOND*/ + while (1) { + nm.mde = mdnullerror; + nm.setno = sp->setno; + nm.side = otherside; + if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0) + return (mdstealerror(ep, &nm.mde)); + + if (nm.key == MD_KEYWILD) + return (0); + + nm.devname = (uintptr_t)meta_getnmbykey(sp->setno, + otherside, nm.key, ep); + if (nm.devname == NULL) + return (-1); + + nm.side = sideno; + if (MD_MNSET_DESC(sd)) { + tmp_sideno = sideno; + } else { + tmp_sideno = sideno - 1; + } + + if ((done = meta_getnextside_devinfo(sp, + (char *)nm.devname, &tmp_sideno, + &cname, &dname, &mnum, ep)) == -1) { + Free((void *)nm.devname); + return (-1); + } + + assert(done == 1); + Free((void *)nm.devname); + + /* + * The device reference count can be greater than 1 if + * more than one softpart is configured on top of the + * same device. If this is the case then we want to + * increment the count to sync up with the other sides. + */ + for (i = 0; i < nm.ref_count; i++) { + if (add_name(sp, sideno, nm.key, dname, mnum, cname, + ep) == -1) + rval = -1; + } + + Free(cname); + Free(dname); + + if (rval != 0) + return (rval); + } + } + + /*NOTREACHED*/ +} + +static int +check_setdrvs_againstnode(mdsetname_t *sp, char *node, md_error_t *ep) +{ + mddrivename_t *dp; + md_drive_desc *dd, *ddp; + + if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL) + if (! 
mdisok(ep)) + return (-1); + + for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) { + dp = ddp->dd_dnp; + + if (checkdrive_onnode(sp, dp, node, ep)) + return (-1); + } + + return (0); +} + +static int +create_multinode_set_on_hosts( + mdsetname_t *sp, + int node_c, /* Number of new nodes */ + char **node_v, /* Nodes which are being added */ + int new_set, + md_error_t *ep +) +{ + int i; + md_set_desc *sd; + md_timeval32_t now; + ulong_t genid; + int rval = 0; + md_mnnode_desc *nd, *ndm = NULL; + md_mnnode_desc *nd_prev, *nd_curr; + int nodecnt; + mndiskset_membershiplist_t *nl, *nl2; + + if (!new_set) { + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + now = sd->sd_ctime; + genid = sd->sd_genid - 1; + if (sd->sd_drvs) + genid--; + } else { + sd = Zalloc(sizeof (*sd)); + + if (meta_gettimeofday(&now) == -1) { + (void) mdsyserror(ep, errno, + dgettext(TEXT_DOMAIN, "meta_gettimeofday()")); + rval = -1; + goto out; + } + + /* Put the new entries into the set */ + /* + * Get membershiplist from API routine. If there's + * an error, fail to create set and pass back error. + */ + if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) { + rval = -1; + goto out; + } + + /* + * meta_set_addhosts has already verified that + * this node list is in the membership list + * so set ALIVE flag. + * Since this is a new set, all hosts being + * added are new to the set, so also set ADD flag. + */ + for (i = 0; i < node_c; i++) { + nd = Zalloc(sizeof (*nd)); + (void) strcpy(nd->nd_nodename, node_v[i]); + nd->nd_ctime = now; + nd->nd_flags = (MD_MN_NODE_ALIVE | + MD_MN_NODE_ADD); + nl2 = nl; + while (nl2) { + if (strcmp(nl2->msl_node_name, + node_v[i]) == 0) { + nd->nd_nodeid = nl2->msl_node_id; + (void) strcpy(nd->nd_priv_ic, + nl2->msl_node_addr); + break; + } + nl2 = nl2->next; + } + + /* + * Nodelist must be kept in ascending + * nodeid order. 
+ */ + if (sd->sd_nodelist == NULL) { + /* Nothing in list, just add it */ + sd->sd_nodelist = nd; + } else if (nd->nd_nodeid < sd->sd_nodelist->nd_nodeid) { + /* Add to head of list */ + nd->nd_next = sd->sd_nodelist; + sd->sd_nodelist = nd; + } else { + nd_curr = sd->sd_nodelist->nd_next; + nd_prev = sd->sd_nodelist; + /* Search for place ot add it */ + while (nd_curr) { + if (nd->nd_nodeid < + nd_curr->nd_nodeid) { + /* Add before nd_curr */ + nd->nd_next = nd_curr; + nd_prev->nd_next = nd; + break; + } + nd_prev = nd_curr; + nd_curr = nd_curr->nd_next; + } + /* Add to end of list */ + if (nd_curr == NULL) { + nd_prev->nd_next = nd; + } + + } + /* Set master to be first node added */ + if (ndm == NULL) + ndm = nd; + } + + meta_free_nodelist(nl); + /* + * Creating mnset for first time. + * Set master to be invalid until first drive is + * in set. + */ + (void) strcpy(sd->sd_mn_master_nodenm, ""); + sd->sd_mn_master_nodeid = MD_MN_INVALID_NID; + sd->sd_mn_masternode = ndm; + sd->sd_ctime = now; + genid = sd->sd_genid = 0; + } + + /* Create the set where needed */ + for (i = 0; i < node_c; i++) { + /* + * Create the set on each new node. If the set already + * exists, then the node list being created on each new node + * is the current node list from before the new nodes + * were added. If the set doesn't exist, then the node + * list being created on each new node is the entire + * new node list. + */ + if (clnt_mncreateset(node_v[i], sp, sd->sd_nodelist, + now, genid, sd->sd_mn_master_nodenm, + sd->sd_mn_master_nodeid, ep) == -1) { + rval = -1; + break; + } + } + +out: + if (new_set) { + nd = sd->sd_nodelist; + while (nd) { + sd->sd_nodelist = nd->nd_next; + Free(nd); + nd = sd->sd_nodelist; + } + Free(sd); + } + + if (rval != 0 || new_set) + return (rval); + + /* + * Add the drive records to the new sets + * and names for the new sides. 
+ */ + return (add_drvs_to_hosts(sp, node_c, node_v, ep)); +} + + +static int +create_traditional_set_on_hosts( + mdsetname_t *sp, + int node_c, /* Number of new nodes */ + char **node_v, /* Nodes which are being added */ + int new_set, + md_error_t *ep +) +{ + int i; + md_set_desc *sd; + md_timeval32_t now; + ulong_t genid; + int rval = 0; + + if (!new_set) { + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + now = sd->sd_ctime; + + genid = sd->sd_genid; + + if (sd->sd_drvs) + genid--; + } else { + if (node_c > MD_MAXSIDES) + return (mddserror(ep, MDE_DS_SIDENUMNOTAVAIL, + sp->setno, NULL, NULL, sp->setname)); + + sd = Zalloc(sizeof (*sd)); + + /* Put the new entries into the set */ + for (i = 0; i < node_c; i++) { + (void) strcpy(sd->sd_nodes[i], node_v[i]); + } + + if (meta_gettimeofday(&now) == -1) { + (void) mdsyserror(ep, errno, "meta_gettimeofday()"); + rval = -1; + goto out; + } + + sd->sd_ctime = now; + genid = sd->sd_genid = 0; + } + + /* Create the set where needed */ + for (i = 0; i < node_c; i++) { + /* + * Create the set on each new host + */ + if (clnt_createset(node_v[i], sp, sd->sd_nodes, now, genid, + ep) == -1) { + rval = -1; + break; + } + } + +out: + if (new_set) + Free(sd); + + if (rval != 0 || new_set) + return (rval); + + /* + * Add the drive records to the new sets + * and names for the new sides. + */ + return (add_drvs_to_hosts(sp, node_c, node_v, ep)); +} + +static int +create_set_on_hosts( + mdsetname_t *sp, + int multi_node, /* Multi_node diskset or not? */ + int node_c, /* Number of new nodes */ + char **node_v, /* Nodes which are being added */ + int new_set, + md_error_t *ep +) +{ + if (multi_node) + return (create_multinode_set_on_hosts(sp, node_c, node_v, + new_set, ep)); + else + return (create_traditional_set_on_hosts(sp, node_c, node_v, + new_set, ep)); +} + +static int +create_set( + mdsetname_t *sp, + int multi_node, /* Multi-node diskset or not? 
*/ + int node_c, + char **node_v, + int auto_take, + md_error_t *ep +) +{ + int i; + int rval = 0; + set_t max_sets; + set_t setno; + int bool; + uint_t sr_flags; + sigset_t oldsigs; + md_setkey_t *cl_sk; + int rb_level = 0; + md_error_t xep = mdnullerror; + rval_e sdssc_rval; + int lock_flag = 0; + int sig_flag = 0; + + if ((max_sets = get_max_sets(ep)) == 0) + return (-1); + + /* We must be a member of the set we are creating */ + if (! strinlst(mynode(), node_c, node_v)) + return (mddserror(ep, MDE_DS_SELFNOTIN, + sp->setno, mynode(), NULL, sp->setname)); + + /* + * If auto_take then we must be the only member of the set + * that we are creating. + */ + if (auto_take && node_c > 1) + return (mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL, NULL, + sp->setname)); + + /* + * If we're part of SC3.0 we'll already have allocated the + * set number so we can skip the allocation algorithm used. + * Set number is unique across traditional and MN disksets. + */ + if ((sdssc_rval = sdssc_get_index(sp->setname, &setno)) + == SDSSC_NOT_BOUND) { + + for (i = 0; i < node_c; i++) { + int has_set; + + /* Skip my node */ + if (strcmp(mynode(), node_v[i]) == 0) + continue; + + /* + * Make sure this set name is not used on the + * other hosts + */ + has_set = nodehasset(sp, node_v[i], NHS_N_EQ, ep); + if (has_set < 0) { + if (! 
mdiserror(ep, MDE_NO_SET)) { + rval = -1; + goto out; + } + mdclrerror(ep); + continue; + } + + if (has_set) { + (void) mddserror(ep, MDE_DS_NODEHASSET, + sp->setno, node_v[i], NULL, sp->setname); + rval = -1; + goto out; + } + } + + for (setno = 1; setno < max_sets; setno++) { + for (i = 0; i < node_c; i++) { + if (clnt_setnumbusy(node_v[i], setno, + &bool, ep) == -1) { + rval = -1; + goto out; + } + + if (bool == TRUE) + break; + } + if (i == node_c) + break; + } + } else if (sdssc_rval != SDSSC_OKAY) { + (void) mddserror(ep, MDE_DS_SETNUMNOTAVAIL, MD_SET_BAD, NULL, + NULL, sp->setname); + rval = -1; + goto out; + } + + if (setno == max_sets) { + (void) mddserror(ep, MDE_DS_SETNUMNOTAVAIL, MD_SET_BAD, NULL, + NULL, sp->setname); + rval = -1; + goto out; + } + + sp->setno = setno; + + /* + * Lock the set on current set members. + * Set locking done much earlier for MN diskset than for traditional + * diskset since lock_set is used to protect against + * other meta* commands running on the other nodes. + * Don't issue mdcommd SUSPEND command since there is nothing + * to suspend since there currently is no set. 
+ */ + if (multi_node) { + /* Make sure we are blocking all signals */ + if (procsigs(TRUE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + sig_flag = 1; + + /* Lock the set on new set members */ + for (i = 0; i < node_c; i++) { + if (clnt_lock_set(node_v[i], sp, ep)) { + rval = -1; + goto out; + } + lock_flag = 1; + } + /* Now have the diskset locked, verify set number is still ok */ + for (i = 0; i < node_c; i++) { + if (clnt_setnumbusy(node_v[i], setno, + &bool, ep) == -1) { + rval = -1; + goto out; + } + } + } + + + if (meta_set_checkname(sp->setname, ep)) { + rval = -1; + goto out; + } + + for (i = 0; i < node_c; i++) { + if (clnt_setnameok(node_v[i], sp, &bool, ep) == -1) { + rval = -1; + goto out; + } + if (bool == FALSE) { + (void) mddserror(ep, MDE_DS_SETNAMEBUSY, sp->setno, + node_v[i], NULL, sp->setname); + rval = -1; + goto out; + } + } + + /* END CHECK CODE */ + + /* Lock the set on new set members */ + if (!multi_node) { + md_rb_sig_handling_on(); + sig_flag = 1; + for (i = 0; i < node_c; i++) { + if (clnt_lock_set(node_v[i], sp, ep)) { + rval = -1; + goto out; + } + lock_flag = 1; + } + } + + RB_TEST(1, "create_set", ep) + + RB_PREEMPT; + rb_level = 1; /* level 1 */ + + RB_TEST(2, "create_set", ep) + + if ((rval = create_set_on_hosts(sp, multi_node, node_c, node_v, + 1, ep)) == -1) + goto rollback; + + RB_TEST(3, "create_set", ep) + + if (auto_take) + sr_flags = MD_SR_OK | MD_SR_AUTO_TAKE; + else + sr_flags = MD_SR_OK; + + /* + * Mark the set record MD_SR_OK + */ + for (i = 0; i < node_c; i++) + if (clnt_upd_sr_flags(node_v[i], sp, sr_flags, ep)) + goto rollback; + + rb_level = 2; /* level 2 */ + + /* + * For MN diskset: + * On each added node, set the node record for that node + * to OK. Then set all node records for the newly added + * nodes on all nodes to ok. 
+ * + * By setting a node's own node record to ok first, even if + * the node adding the hosts panics, the rest of the nodes can + * determine the same node list during the choosing of the master + * during reconfig. So, only nodes considered for mastership + * are nodes that have both MD_MN_NODE_OK and MD_SR_OK set + * on that node's rpc.metad. If all nodes have MD_SR_OK set, + * but no node has its own MD_MN_NODE_OK set, then the set will + * be removed during reconfig since a panic occurred during the + * creation of the initial diskset. + */ + + if (multi_node) { + md_mnnode_desc *nd, *saved_nd_next; + md_set_desc *sd; + + if ((sd = metaget_setdesc(sp, ep)) == NULL) { + goto rollback; + } + + for (i = 0; i < node_c; i++) { + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (strcmp(nd->nd_nodename, node_v[i]) == 0) + break; + nd = nd->nd_next; + } + /* Something wrong, will pick this up in next loop */ + if (nd == NULL) + continue; + + /* Only changing my local cache of node list */ + saved_nd_next = nd->nd_next; + nd->nd_next = NULL; + + /* Set node record for added host to ok on that host */ + if (clnt_upd_nr_flags(node_v[i], sp, + nd, MD_NR_OK, NULL, ep)) { + nd->nd_next = saved_nd_next; + goto rollback; + } + nd->nd_next = saved_nd_next; + } + + /* Now set all node records on all nodes to be ok */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_upd_nr_flags(nd->nd_nodename, sp, + sd->sd_nodelist, MD_NR_OK, NULL, ep)) { + goto rollback; + } + nd = nd->nd_next; + } + } + + RB_TEST(4, "create_set", ep) + +out: + if ((rval == 0) && multi_node) { + /* + * Set successfully created. + * Notify rpc.mdcommd on all nodes of a nodelist change. + * Send reinit command to mdcommd which forces it to get + * fresh set description. Then send resume. + * Resume on class 0 will resume all classes. 
+ */ + for (i = 0; i < node_c; i++) { + /* Class is ignored for REINIT */ + if (clnt_mdcommdctl(node_v[i], COMMDCTL_REINIT, + sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to reinit rpc.mdcommd.\n")); + } + } + for (i = 0; i < node_c; i++) { + if (clnt_mdcommdctl(node_v[i], COMMDCTL_RESUME, + sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to resume rpc.mdcommd.\n")); + } + } + meta_ping_mnset(sp->setno); + } + if (lock_flag) { + cl_sk = cl_get_setkey(sp->setno, sp->setname); + for (i = 0; i < node_c; i++) { + if (clnt_unlock_set(node_v[i], cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + } + cl_set_setkey(NULL); + } + + if (sig_flag) { + if (multi_node) { + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + } else { + md_rb_sig_handling_off(md_got_sig(), md_which_sig()); + } + } + + return (rval); + +rollback: + /* all signals already blocked for MN disket */ + if (!multi_node) { + /* Make sure we are blocking all signals */ + if (procsigs(TRUE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + } + + rval = -1; + + /* + * For MN diskset: + * On each added node (which is now each node to be deleted), + * set the node record for that node to DEL. Then set all + * node records for the newly added (soon to be deleted) nodes + * on all nodes to ok. + * + * By setting a node's own node record to DEL first, even if + * the node doing the rollback panics, the rest of the nodes can + * determine the same node list during the choosing of the master + * during reconfig. 
+ */ + + /* level 3 */ + if ((rb_level > 1) && (multi_node)) { + md_mnnode_desc *nd, *saved_nd_next; + md_set_desc *sd; + + /* + * Without a set description there is no node list to walk; + * skip the node record updates in that case but still mark + * the set records deleted below. + */ + if ((sd = metaget_setdesc(sp, &xep)) == NULL) { + mdclrerror(&xep); + } + + for (i = 0; (sd != NULL) && (i < node_c); i++) { + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (strcmp(nd->nd_nodename, node_v[i]) == 0) + break; + nd = nd->nd_next; + } + /* Something wrong, will pick this up in next loop */ + if (nd == NULL) + continue; + + /* Only changing my local cache of node list */ + saved_nd_next = nd->nd_next; + nd->nd_next = NULL; + + /* Set node record for added host to DEL on that host */ + if (clnt_upd_nr_flags(node_v[i], sp, + nd, MD_NR_DEL, NULL, &xep)) { + nd->nd_next = saved_nd_next; + mdclrerror(&xep); + } + nd->nd_next = saved_nd_next; + } + + /* Now set all node records on all nodes to be DEL */ + nd = (sd != NULL) ? sd->sd_nodelist : NULL; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_upd_nr_flags(nd->nd_nodename, sp, + sd->sd_nodelist, MD_NR_DEL, NULL, &xep)) { + mdclrerror(&xep); + } + nd = nd->nd_next; + } + + /* Mark set record on all hosts to be DELETED */ + for (i = 0; i < node_c; i++) { + if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, &xep)) { + mdclrerror(&xep); + } + } + } + /* level 1 */ + if (rb_level > 0) { + for (i = 0; i < node_c; i++) { + if (clnt_delset(node_v[i], sp, &xep) == -1) + mdclrerror(&xep); + } + } + + /* level 0 */ + /* Don't test lock flag since guaranteed to be set if in rollback */ + cl_sk = cl_get_setkey(sp->setno, sp->setname); + for (i = 0; i < node_c; i++) { + if (clnt_unlock_set(node_v[i], cl_sk, &xep)) + mdclrerror(&xep); + } + cl_set_setkey(NULL); + + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + if ((sig_flag) && (!multi_node)) + md_rb_sig_handling_off(md_got_sig(), md_which_sig()); + + return (rval); +} + +static int +del_db_sidenms( + mdsetname_t *sp, + side_t sideno, + md_error_t *ep +) 
+{ + md_replicalist_t *rlp = NULL; + md_replicalist_t *rl; + int rval = 0; + + if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) + return (-1); + + for (rl = rlp; rl != NULL; rl = rl->rl_next) { + md_replica_t *r = rl->rl_repp; + + if (meta_db_delsidenm(sp, sideno, r->r_namep, r->r_blkno, ep)) { + rval = -1; + goto out; + } + } + +out: + metafreereplicalist(rlp); + return (rval); +} + +static int +del_drvs_from_hosts( + mdsetname_t *sp, + md_set_desc *sd, + md_drive_desc *dd, + int node_c, + char **node_v, + int oha, + md_error_t *ep +) +{ + int i; + md_mnnode_desc *nd; + + for (i = 0; i < node_c; i++) { + if (MD_MNSET_DESC(sd) && (oha == TRUE)) { + /* + * During OHA mode, don't issue RPCs to + * non-alive nodes since there is no reason to + * wait for RPC timeouts. + */ + nd = sd->sd_nodelist; + while (nd) { + if (strcmp(nd->nd_nodename, node_v[i]) == 0) + break; + nd = nd->nd_next; + } + if (nd == NULL) { + /* + * No node matched, so nd cannot be dereferenced; + * report the host name that was searched for. + */ + return (mddserror(ep, MDE_DS_NOTINMEMBERLIST, + sp->setno, node_v[i], + NULL, sp->setname)); + } + + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + continue; + } + if (clnt_deldrvs(node_v[i], sp, dd, ep)) { + return (-1); + } + } else if (MD_MNSET_DESC(sd) && (oha == FALSE)) { + /* + * All nodes should be alive in non-oha mode. + */ + if (clnt_deldrvs(node_v[i], sp, dd, ep)) { + return (-1); + } + } else { + /* + * For traditional diskset, issue the RPC and + * ignore RPC failure if in OHA mode. 
+ */ + if (clnt_deldrvs(node_v[i], sp, dd, ep)) { + if (oha == TRUE && mdanyrpcerror(ep)) { + mdclrerror(ep); + continue; + } + return (-1); + } + } + } + + return (0); +} + +static int +del_host_noset( + mdsetname_t *sp, + char **anode, + md_error_t *ep +) +{ + int rval = 0; + md_setkey_t *cl_sk; + md_drive_desc *dd; + md_error_t xep = mdnullerror; + md_set_desc *sd; + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + /* Make sure we own the set */ + if (meta_check_ownership(sp, ep) != 0) + return (-1); + + /* Lock the set on our side */ + if (clnt_lock_set(mynode(), sp, ep)) { + rval = -1; + goto out; + } + + if (clnt_delhosts(mynode(), sp, 1, anode, ep)) { + rval = -1; + goto out; + } + + if (!MD_MNSET_DESC(sd)) { + if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), + ep)) == NULL) { + if (! mdisok(ep)) { + rval = -1; + goto out; + } + } + + /* If we have drives */ + if (dd != NULL) { + if (clnt_del_drv_sidenms(mynode(), sp, ep)) { + rval = -1; + goto out; + } + } + } + +out: + cl_sk = cl_get_setkey(sp->setno, sp->setname); + if (clnt_unlock_set(mynode(), cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + cl_set_setkey(NULL); + + metaflushsetname(sp); + + return (rval); +} + +static int +del_md_sidenms(mdsetname_t *sp, side_t sideno, md_error_t *ep) +{ + mdnm_params_t nm; + md_set_desc *sd; + int i; + + if (!metaislocalset(sp)) { + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + } + /* Use rpc.mdcommd to add md side info from all nodes */ + if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && + (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { + md_mn_result_t *resultp = NULL; + md_mn_msg_meta_md_delside_t md_ds; + int send_rval; + + md_ds.msg_sideno = sideno; + /* + * If reconfig cycle has been started, this node is stuck in + * in the return step until this command has completed. 
If + * mdcommd is suspended, ask send_message to fail (instead of + * retrying) so that metaset can finish allowing the + * reconfig cycle to proceed. + */ + send_rval = mdmn_send_message(sp->setno, + MD_MN_MSG_META_MD_DELSIDE, + MD_MSGF_FAIL_ON_SUSPEND | MD_MSGF_PANIC_WHEN_INCONSISTENT, + (char *)&md_ds, sizeof (md_mn_msg_meta_md_delside_t), + &resultp, ep); + if (send_rval != 0) { + (void) mdstealerror(ep, &(resultp->mmr_ep)); + if (resultp) + free_result(resultp); + return (-1); + } + if (resultp) + free_result(resultp); + } else { + (void) memset(&nm, '\0', sizeof (nm)); + nm.key = MD_KEYWILD; + + /*CONSTCOND*/ + while (1) { + nm.mde = mdnullerror; + nm.setno = sp->setno; + nm.side = MD_SIDEWILD; + if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0) + return (mdstealerror(ep, &nm.mde)); + + if (nm.key == MD_KEYWILD) + return (0); + + /* + * The device reference count can be greater than 1 if + * more than one softpart is configured on top of the + * same device. If this is the case then we want to + * decrement the count to zero so the entry can be + * actually removed. 
+ */ + for (i = 0; i < nm.ref_count; i++) { + if (del_name(sp, sideno, nm.key, ep) == -1) + return (-1); + } + } + } + return (0); +} + +static void +recreate_set( + mdsetname_t *sp, + md_set_desc *sd +) +{ + int i; + int has_set; + md_error_t xep = mdnullerror; + md_mnnode_desc *nd; + + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + has_set = nodehasset(sp, nd->nd_nodename, + NHS_NST_EQ, &xep); + + if (has_set >= 0) { + nd = nd->nd_next; + continue; + } + + mdclrerror(&xep); + + if (clnt_mncreateset(nd->nd_nodename, sp, + sd->sd_nodelist, + sd->sd_ctime, sd->sd_genid, + sd->sd_mn_master_nodenm, + sd->sd_mn_master_nodeid, &xep) == -1) + mdclrerror(&xep); + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + has_set = nodehasset(sp, sd->sd_nodes[i], + NHS_NST_EQ, &xep); + + if (has_set >= 0) + continue; + + mdclrerror(&xep); + + if (clnt_createset(sd->sd_nodes[i], sp, sd->sd_nodes, + sd->sd_ctime, sd->sd_genid, &xep) == -1) + mdclrerror(&xep); + } + } +} + +/* + * If a MN diskset, set is already locked on all nodes via clnt_lock_set. + */ +static int +del_set_nodrives( + mdsetname_t *sp, + int node_c, + char **node_v, + int oha, + md_error_t *ep +) +{ + md_set_desc *sd; + int i; + sigset_t oldsigs; + md_setkey_t *cl_sk; + int rb_level = 0; + ulong_t max_genid = 0; + int rval = 0; + md_error_t xep = mdnullerror; + md_mnnode_desc *nd; + int delete_end = 1; + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + if (MD_MNSET_DESC(sd)) { + /* Make sure we are blocking all signals */ + if (procsigs(TRUE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + } else { + md_rb_sig_handling_on(); + } + + /* + * Lock the set on current set members for traditional disksets. 
+ */ + if (!(MD_MNSET_DESC(sd))) { + for (i = 0; i < node_c; i++) { + /* + * For traditional diskset, issue the RPC and + * ignore RPC failure if in OHA mode. + */ + if (clnt_lock_set(node_v[i], sp, ep)) { + if (oha == TRUE && mdanyrpcerror(ep)) { + mdclrerror(ep); + continue; + } + rval = -1; + goto out; + } + } + } + + + RB_TEST(1, "deletehosts", ep) + + RB_PREEMPT; + rb_level = 1; /* level 1 */ + + RB_TEST(2, "deletehosts", ep) + + /* + * Mark the set record MD_SR_DEL + */ + for (i = 0; i < node_c; i++) { + + RB_TEST(3, "deletehosts", ep) + + if (MD_MNSET_DESC(sd) && (oha == TRUE)) { + /* + * During OHA mode, don't issue RPCs to + * non-alive nodes since there is no reason to + * wait for RPC timeouts. + */ + nd = sd->sd_nodelist; + while (nd) { + if (strcmp(nd->nd_nodename, node_v[i]) == 0) + break; + nd = nd->nd_next; + } + if (nd == NULL) { + (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, + sp->setno, nd->nd_nodename, + NULL, sp->setname); + goto rollback; + } + + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + continue; + } + + if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) { + goto rollback; + } + } else if (MD_MNSET_DESC(sd) && (oha == FALSE)) { + /* + * All nodes should be alive in non-oha mode. + */ + if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) { + goto rollback; + } + } else { + /* + * For traditional diskset, issue the RPC and + * ignore RPC failure if in OHA mode. + */ + if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) { + if (oha == TRUE && mdanyrpcerror(ep)) { + mdclrerror(ep); + continue; + } + goto rollback; + } + } + + RB_TEST(4, "deletehosts", ep) + } + + RB_TEST(5, "deletehosts", ep) + + RB_PREEMPT; + rb_level = 2; /* level 2 */ + + RB_TEST(6, "deletehosts", ep) + + if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR) + if (metad_isautotakebyname(sp->setname)) + delete_end = 0; + else + goto rollback; + + /* The set is OK to delete, make it so. 
*/ + for (i = 0; i < node_c; i++) { + + RB_TEST(7, "deletehosts", ep) + + if (MD_MNSET_DESC(sd) && (oha == TRUE)) { + /* + * During OHA mode, don't issue RPCs to + * non-alive nodes since there is no reason to + * wait for RPC timeouts. + */ + nd = sd->sd_nodelist; + while (nd) { + if (strcmp(nd->nd_nodename, node_v[i]) == 0) + break; + nd = nd->nd_next; + } + if (nd == NULL) { + (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, + sp->setno, nd->nd_nodename, + NULL, sp->setname); + goto rollback; + } + + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + continue; + } + + if (clnt_delset(node_v[i], sp, ep) == -1) { + goto rollback; + } + } else if (MD_MNSET_DESC(sd) && (oha == FALSE)) { + /* + * All nodes should be alive in non-oha mode. + */ + if (clnt_delset(node_v[i], sp, ep) == -1) { + goto rollback; + } + } else { + /* + * For traditional diskset, issue the RPC and + * ignore RPC failure if in OHA mode. + */ + if (clnt_delset(node_v[i], sp, ep) == -1) { + if (oha == TRUE && mdanyrpcerror(ep)) { + mdclrerror(ep); + continue; + } + goto rollback; + } + } + + RB_TEST(8, "deletehosts", ep) + } + + RB_TEST(9, "deletehosts", ep) + +out: + /* + * Unlock the set on current set members + * for traditional disksets. + */ + if (!(MD_MNSET_DESC(sd))) { + cl_sk = cl_get_setkey(sp->setno, sp->setname); + for (i = 0; i < node_c; i++) { + /* + * For traditional diskset, issue the RPC and + * ignore RPC failure if in OHA mode. + */ + if (clnt_unlock_set(node_v[i], cl_sk, &xep)) { + if (oha == TRUE && mdanyrpcerror(&xep)) { + mdclrerror(&xep); + continue; + } + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + } + cl_set_setkey(NULL); + } + + /* + * A MN diskset has the clnt_locks held by meta_set_deletehosts so + * don't flush that data until meta_set_deletehosts has finished + * with it. meta_set_deletehosts will handle the flush of the + * setname. 
+ */ + if (!(MD_MNSET_DESC(sd))) { + metaflushsetname(sp); + } + + if (delete_end && + sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR) + rval = -1; + + if (MD_MNSET_DESC(sd)) { + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + } else { + md_rb_sig_handling_off(md_got_sig(), md_which_sig()); + } + + return (rval); + +rollback: + /* all signals already blocked for MN disket */ + if (!(MD_MNSET_DESC(sd))) { + /* Make sure we are blocking all signals */ + if (procsigs(TRUE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + } + + rval = -1; + + max_genid = sd->sd_genid; + + /* level 2 */ + if (rb_level > 1) { + recreate_set(sp, sd); + max_genid++; + + if (delete_end) + (void) sdssc_delete_end(sp->setname, SDSSC_CLEANUP); + } + + /* level 1 */ + if (rb_level > 0) { + max_genid++; + resync_genid(sp, sd, max_genid, node_c, node_v); + } + + /* level 0 */ + /* + * Unlock the set on current set members + * for traditional disksets. + */ + if (!(MD_MNSET_DESC(sd))) { + cl_sk = cl_get_setkey(sp->setno, sp->setname); + for (i = 0; i < node_c; i++) { + /* + * For traditional diskset, issue the RPC and + * ignore RPC failure if in OHA mode. + */ + if (clnt_unlock_set(node_v[i], cl_sk, &xep)) + mdclrerror(&xep); + } + cl_set_setkey(NULL); + } + + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + /* + * A MN diskset has the clnt_locks held by meta_set_deletehosts so + * don't flush that data until meta_set_deletehosts has finished + * with it. meta_set_deletehosts will handle the flush of the + * setname. + */ + if (!(MD_MNSET_DESC(sd))) { + metaflushsetname(sp); + md_rb_sig_handling_off(md_got_sig(), md_which_sig()); + } + + return (rval); +} + +/* + * On entry: + * procsigs already called for MN diskset. + * md_rb_sig_handling already called for traditional diskset. 
+ */ +static int +del_set_on_hosts( + mdsetname_t *sp, + md_set_desc *sd, + md_drive_desc *dd, + int node_c, /* Number of nodes */ + char **node_v, /* Nodes being deleted */ + int oha, + md_error_t *ep +) +{ + int i; + int j; + side_t sideno; + md_replicalist_t *rlp = NULL; + sigset_t oldsigs; + md_setkey_t *cl_sk; + ulong_t max_genid = 0; + int rb_level = 1; /* This is a special case */ + md_error_t xep = mdnullerror; + md_mnnode_desc *nd; + + RB_PREEMPT; + + RB_TEST(7, "deletehosts", ep) + + if (dd != NULL) { + /* + * May need this to re-add sidenames on roll back. + */ + if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, + ep) < 0) + goto rollback; + + RB_TEST(8, "deletehosts", ep) + + RB_PREEMPT; + rb_level = 2; /* level 2 */ + + RB_TEST(9, "deletehosts", ep) + + if (del_drvs_from_hosts(sp, sd, dd, node_c, node_v, oha, ep)) + goto rollback; + + RB_TEST(10, "deletehosts", ep) + + RB_PREEMPT; + rb_level = 3; /* level 3 */ + + RB_TEST(11, "deletehosts", ep) + + /* + * Delete the db replica sides + * This is done before the next loop, so that + * the db does not get unloaded before we are finished + * deleting the sides. + */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + /* Skip hosts not being deleted */ + if (! strinlst(nd->nd_nodename, node_c, + node_v)) { + nd = nd->nd_next; + continue; + } + + if (del_db_sidenms(sp, nd->nd_nodeid, ep)) + goto rollback; + + RB_TEST(12, "deletehosts", ep) + nd = nd->nd_next; + } + } else { + for (sideno = 0; sideno < MD_MAXSIDES; sideno++) { + /* Skip empty slots */ + if (sd->sd_nodes[sideno][0] == '\0') + continue; + + /* Skip hosts not being deleted */ + if (! 
strinlst(sd->sd_nodes[sideno], node_c, + node_v)) + continue; + + if (del_db_sidenms(sp, sideno, ep)) + goto rollback; + + RB_TEST(12, "deletehosts", ep) + } + } + + RB_TEST(13, "deletehosts", ep) + + RB_PREEMPT; + rb_level = 4; /* level 4 */ + + RB_TEST(14, "deletehosts", ep) + + /* Delete the names from the namespace */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + /* Skip hosts not being deleted */ + if (! strinlst(nd->nd_nodename, node_c, + node_v)) { + nd = nd->nd_next; + continue; + } + + if (del_md_sidenms(sp, nd->nd_nodeid, ep)) + goto rollback; + + RB_TEST(15, "deletehosts", ep) + nd = nd->nd_next; + } + } else { + for (sideno = 0; sideno < MD_MAXSIDES; sideno++) { + /* Skip empty slots */ + if (sd->sd_nodes[sideno][0] == '\0') + continue; + + /* Skip hosts not being deleted */ + if (! strinlst(sd->sd_nodes[sideno], node_c, + node_v)) + continue; + + if (del_md_sidenms(sp, sideno, ep)) + goto rollback; + + RB_TEST(15, "deletehosts", ep) + } + } + } + + RB_TEST(16, "deletehosts", ep) + + RB_PREEMPT; + rb_level = 5; /* level 6 */ + + RB_TEST(17, "deletehosts", ep) + + for (i = 0; i < node_c; i++) { + if (MD_MNSET_DESC(sd) && (oha == TRUE)) { + /* + * During OHA mode, don't issue RPCs to + * non-alive nodes since there is no reason to + * wait for RPC timeouts. + */ + nd = sd->sd_nodelist; + while (nd) { + if (strcmp(nd->nd_nodename, node_v[i]) == 0) + break; + nd = nd->nd_next; + } + if (nd == NULL) { + (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, + sp->setno, nd->nd_nodename, + NULL, sp->setname); + goto rollback; + } + + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + continue; + } + + if (clnt_delset(node_v[i], sp, ep) == -1) { + goto rollback; + } + } else if (MD_MNSET_DESC(sd) && (oha == FALSE)) { + /* + * All nodes should be alive in non-oha mode. + */ + if (clnt_delset(node_v[i], sp, ep) == -1) { + goto rollback; + } + } else { + /* + * For traditional diskset, issue the RPC and + * ignore RPC failure if in OHA mode. 
+ */ + if (clnt_delset(node_v[i], sp, ep) == -1) { + if (oha == TRUE && mdanyrpcerror(ep)) { + mdclrerror(ep); + continue; + } + goto rollback; + } + } + + RB_TEST(18, "deletehosts", ep) + } + + metafreereplicalist(rlp); + + if (MD_MNSET_DESC(sd)) { + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + } else { + md_rb_sig_handling_off(md_got_sig(), md_which_sig()); + } + + return (0); + +rollback: + /* all signals already blocked for MN disket */ + if (!(MD_MNSET_DESC(sd))) { + /* Make sure we are blocking all signals */ + if (procsigs(TRUE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + } + + max_genid = sd->sd_genid; + + /* level 5 */ + if (rb_level > 4) { + recreate_set(sp, sd); + max_genid++; + } + + /* level 2 */ + if (rb_level > 1 && dd != NULL) { + /* + * See if we have to re-add the drives specified. + */ + for (i = 0; i < node_c; i++) { + md_set_record *sr; + + if (MD_MNSET_DESC(sd) && (oha == TRUE)) { + /* + * During OHA mode, don't issue RPCs to + * non-alive nodes since there is no reason to + * wait for RPC timeouts. + */ + nd = sd->sd_nodelist; + while (nd) { + if (strcmp(nd->nd_nodename, node_v[i]) + == 0) + break; + nd = nd->nd_next; + } + if (nd == NULL) + continue; + + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) + continue; + } + + /* Don't care if set record is MN or not */ + if (clnt_getset(node_v[i], sp->setname, + MD_SET_BAD, &sr, &xep) == -1) { + mdclrerror(&xep); + continue; + } + + /* Drive already added, skip to next node */ + if (sr->sr_drivechain != NULL) { + /* + * Set record structure was allocated from RPC + * routine getset so this structure is only of + * size md_set_record even if the MN flag is + * set. So, clear the flag so that the free + * code doesn't attempt to free a structure + * the size of md_mnset_record. 
+ */ + sr->sr_flags &= ~MD_SR_MN; + free_sr(sr); + continue; + } + + if (clnt_adddrvs(node_v[i], sp, dd, + sr->sr_ctime, sr->sr_genid, &xep) == -1) + mdclrerror(&xep); + + if (clnt_upd_dr_flags(node_v[i], sp, dd, + MD_DR_OK, &xep) == -1) + mdclrerror(&xep); + + /* + * Set record structure was allocated from RPC routine + * getset so this structure is only of size + * md_set_record even if the MN flag is set. So, + * clear the flag so that the free code doesn't + * attempt to free a structure the size of + * md_mnset_record. + */ + sr->sr_flags &= ~MD_SR_MN; + free_sr(sr); + } + max_genid += 3; + } + + /* level 3 */ + if (rb_level > 2 && dd != NULL) { + md_replicalist_t *rl; + + for (rl = rlp; rl != NULL; rl = rl->rl_next) { + md_replica_t *r = rl->rl_repp; + + /* + * This is not the first replica being added to the + * diskset so call with ADDSIDENMS_BCAST. If this + * is a traditional diskset, the bcast flag is ignored + * since traditional disksets don't use the rpc.mdcommd. + */ + if (meta_db_addsidenms(sp, r->r_namep, r->r_blkno, + DB_ADDSIDENMS_BCAST, &xep)) + mdclrerror(&xep); + } + } + + /* level 4 */ + if (rb_level > 3 && dd != NULL) { + int nodeid_addsides = 0; + /* + * Add the device names for the new sides into the namespace, + * on all hosts not being deleted. + */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + /* Find a node that is not being deleted */ + if (! strinlst(nd->nd_nodename, node_c, + node_v)) { + nodeid_addsides = nd->nd_nodeid; + break; + } + nd = nd->nd_next; + } + } else { + for (j = 0; j < MD_MAXSIDES; j++) { + /* Skip empty slots */ + if (sd->sd_nodes[j][0] == '\0') + continue; + + /* Find a node that is not being deleted */ + if (! 
strinlst(sd->sd_nodes[j], node_c, + node_v)) + break; + } + nodeid_addsides = j; + } + + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + /* Skip nodes not being deleted */ + if (!strinlst(nd->nd_nodename, node_c, + node_v)) { + nd = nd->nd_next; + continue; + } + + /* this side was just created, add the names */ + if (add_md_sidenms(sp, nd->nd_nodeid, + nodeid_addsides, &xep)) + mdclrerror(&xep); + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* Skip nodes not being deleted */ + if (!strinlst(sd->sd_nodes[i], node_c, node_v)) + continue; + + /* this side was just created, add the names */ + if (add_md_sidenms(sp, i, nodeid_addsides, + &xep)) + mdclrerror(&xep); + } + } + } + + /* level 1 */ + if (rb_level > 0) { + max_genid++; + resync_genid(sp, sd, max_genid, node_c, node_v); + } + + /* level 0 */ + cl_sk = cl_get_setkey(sp->setno, sp->setname); + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) + continue; + /* To balance lock/unlock; can send to dead node */ + if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) + mdclrerror(&xep); + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) + mdclrerror(&xep); + } + } + cl_set_setkey(NULL); + + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + metafreereplicalist(rlp); + + if (!(MD_MNSET_DESC(sd))) { + md_rb_sig_handling_off(md_got_sig(), md_which_sig()); + } + + return (-1); +} + +static int +make_sideno_sidenm( + mdsetname_t *sp, + mddrivename_t *dnp, + side_t sideno, + md_error_t *ep +) +{ + mdsidenames_t *sn, **sn_next; + md_set_desc *sd; + mdname_t *np; + uint_t rep_slice; + int err = 0; + + assert(dnp->side_names_key != 
MD_KEYWILD); + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + /* find the end of the link list */ + for (sn = dnp->side_names; sn->next != NULL; sn = sn->next); + sn_next = &sn->next; + + if (meta_replicaslice(dnp, &rep_slice, ep) != 0) + return (-1); + + if ((np = metaslicename(dnp, rep_slice, ep)) == NULL) + return (-1); + + sn = Zalloc(sizeof (*sn)); + sn->sideno = sideno; + + if (MD_MNSET_DESC(sd)) { + /* + * For MO diskset the sideno is not an index into + * the array of nodes. Hence getside_devinfo is + * used instead of meta_getnextside_devinfo. + */ + if (meta_getside_devinfo(sp, np->bname, sideno, &sn->cname, + &sn->dname, &sn->mnum, ep) == -1) + err = -1; + } else { + /* decrement sideno, to look like the previous sideno */ + sideno--; + if (meta_getnextside_devinfo(sp, np->bname, &sideno, &sn->cname, + &sn->dname, &sn->mnum, ep) == -1) + err = -1; + } + + if (err) { + Free(sn); + return (err); + } + assert(sn->sideno == sideno); + + /* Add to the end of the linked list */ + *sn_next = sn; + return (0); +} + +static int +validate_nodes( + mdsetname_t *sp, + int node_c, + char **node_v, + md_error_t *ep +) +{ + char *hostname; + int i; + + + for (i = 0; i < node_c; i++) { + if (strlen(node_v[i]) > (size_t)MD_MAX_NODENAME) + return (mddserror(ep, MDE_DS_NODENAMETOOLONG, + sp->setno, node_v[i], NULL, sp->setname)); + if (clnt_hostname(node_v[i], &hostname, ep)) + return (-1); + if (strcmp(node_v[i], hostname) != 0) { + Free(hostname); + return (mddserror(ep, MDE_DS_NOTNODENAME, sp->setno, + node_v[i], NULL, sp->setname)); + } + Free(hostname); + } + return (0); +} + +/* + * Exported Entry Points + */ + +/* + * Check the given disk set name for syntactic correctness. 
+ */ +int +meta_set_checkname(char *setname, md_error_t *ep) +{ + char *cp; + + if (strlen(setname) > (size_t)MD_MAX_SETNAME) + return (mddserror(ep, MDE_DS_SETNAMETOOLONG, + MD_SET_BAD, NULL, NULL, setname)); + + for (cp = setname; *cp; cp++) + if (!isprint(*cp) || strchr(INVALID_IN_NAMES, *cp) != NULL) + return (mddserror(ep, MDE_DS_INVALIDSETNAME, + MD_SET_BAD, NULL, NULL, setname)); + return (0); +} + +/* + * Add host(s) to the multi-node diskset provided in sp. + * - create set if non-existent. + */ +static int +meta_multinode_set_addhosts( + mdsetname_t *sp, + int multi_node, + int node_c, + char **node_v, + int auto_take, + md_error_t *ep +) +{ + md_set_desc *sd; + md_drive_desc *dd, *p; + int rval = 0; + int bool; + int nodeindex; + int i; + int has_set; + sigset_t oldsigs; + md_setkey_t *cl_sk; + int rb_level = 0; + md_error_t xep = mdnullerror; + md_mnnode_desc *nd, *nd_curr, *nd_prev; + md_timeval32_t now; + int nodecnt; + mndiskset_membershiplist_t *nl, *nl2; + int suspendall_flag = 0; + int suspend1_flag = 0; + int lock_flag = 0; + int stale_flag = 0; + md_mnnode_desc *saved_nd_next; + int remote_sets_created = 0; + + /* + * Check membershiplist first. If there's + * an error, fail to create set and pass back error. + */ + if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) { + return (-1); + } + /* Verify that all nodes are in member list */ + for (i = 0; i < node_c; i++) { + /* + * If node in list isn't a member of the membership, + * just return error. + */ + if (meta_is_member(node_v[i], NULL, nl) == 0) { + meta_free_nodelist(nl); + return (mddserror(ep, MDE_DS_NOTINMEMBERLIST, + sp->setno, node_v[i], NULL, sp->setname)); + } + } + /* + * Node list is needed later, but there is a lot of error + * checking and possible failures between here and there, so + * just re-get the list later if there are no errors. + */ + meta_free_nodelist(nl); + nl = NULL; + + /* + * Verify that list of nodes being added contains no + * duplicates. 
+ */ + if (nodesuniq(sp, node_c, node_v, ep)) + return (-1); + + /* + * Verify that each node being added thinks that its nodename + * is the same as the nodename given. + */ + if (validate_nodes(sp, node_c, node_v, ep)) + return (-1); + + if ((sd = metaget_setdesc(sp, ep)) == NULL) { + if (! mdiserror(ep, MDE_NO_SET)) + return (-1); + mdclrerror(ep); + return (create_set(sp, multi_node, node_c, node_v, auto_take, + ep)); + } else { + /* + * If this node and another node were both attempting to + * create the same setname at the same time, and the other + * node has just created the set on this node then sd would + * be non-NULL, but sp->setno would be null (setno is filled + * in by the create_set). If this is true, then fail since + * the other node has already won this race. + */ + if (sp->setno == NULL) { + return (mddserror(ep, MDE_DS_NODEINSET, + NULL, mynode(), NULL, sp->setname)); + } + } + + /* The auto_take behavior is inconsistent with multiple hosts. */ + if (auto_take || sd->sd_flags & MD_SR_AUTO_TAKE) { + (void) mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL, NULL, + sp->setname); + return (-1); + } + + /* + * We already have the set. + */ + + /* Make sure we own the set */ + if (meta_check_ownership(sp, ep) != 0) + return (-1); + + /* + * The drive and node records are stored in the local mddbs of each + * node in the diskset. Each node's rpc.metad daemon reads in the set, + * drive and node records from that node's local mddb and caches them + * internally. Any process needing diskset information contacts its + * local rpc.metad to get this information. Since each node in the + * diskset is independently reading the set information from its local + * mddb, the set, drive and node records in the local mddbs must stay + * in-sync, so that all nodes have a consistent view of the diskset. + * + * For a multinode diskset, explicitly verify that all nodes in the + * diskset are ALIVE (i.e. are in the API membership list). 
Otherwise, + * fail this operation since all nodes must be ALIVE in order to add + * the new node record to their local mddb. If a panic of this node + * leaves the local mddbs set, node and drive records out-of-sync, the + * reconfig cycle will fix the local mddbs and force them back into + * synchronization. + */ + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + return (mddserror(ep, MDE_DS_NOTINMEMBERLIST, + sp->setno, nd->nd_nodename, NULL, + sp->setname)); + } + nd = nd->nd_next; + } + + /* + * Check if node is already in set. + */ + for (i = 0; i < node_c; i++) { + /* Is node already in set? */ + nd = sd->sd_nodelist; + while (nd) { + if (strcmp(nd->nd_nodename, node_v[i]) == 0) + break; + nd = nd->nd_next; + } + if (nd) { + return (mddserror(ep, MDE_DS_NODEINSET, + sp->setno, node_v[i], NULL, + sp->setname)); + } + } + + /* + * Lock the set on current set members. + * Set locking done much earlier for MN diskset than for traditional + * diskset since lock_set and SUSPEND are used to protect against + * other meta* commands running on the other nodes. + */ + /* Make sure we are blocking all signals */ + if (procsigs(TRUE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_lock_set(nd->nd_nodename, sp, ep)) { + rval = -1; + goto out; + } + lock_flag = 1; + nd = nd->nd_next; + } + /* + * Lock out other meta* commands by suspending + * class 1 messages across the diskset. 
+ */ + nd = sd->sd_nodelist; + /* Send suspend to nodes in nodelist before addhosts call */ + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_mdcommdctl(nd->nd_nodename, + COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1, + MD_MSCF_NO_FLAGS, ep)) { + rval = -1; + goto out; + } + suspend1_flag = 1; + nd = nd->nd_next; + } + + /* Lock the set on new set members */ + for (i = 0; i < node_c; i++) { + /* Already verified to be alive */ + if (clnt_lock_set(node_v[i], sp, ep)) { + rval = -1; + goto out; + } + lock_flag = 1; + } + + /* + * Perform the required checks for new hosts + */ + for (i = 0; i < node_c; i++) { + /* Make sure this set name is not used on the other hosts */ + has_set = nodehasset(sp, node_v[i], NHS_N_EQ, ep); + if (has_set < 0) { + if (! mdiserror(ep, MDE_NO_SET)) { + rval = -1; + goto out; + } + /* Keep on truck'n */ + mdclrerror(ep); + } else if (has_set) { + (void) mddserror(ep, MDE_DS_NODEHASSET, sp->setno, + node_v[i], NULL, sp->setname); + rval = -1; + goto out; + } + + if (clnt_setnumbusy(node_v[i], sp->setno, &bool, ep) == -1) { + rval = -1; + goto out; + } + + if (bool == TRUE) { + (void) mddserror(ep, MDE_DS_SETNUMBUSY, sp->setno, + node_v[i], NULL, sp->setname); + rval = -1; + goto out; + } + + if (clnt_setnameok(node_v[i], sp, &bool, ep) == -1) { + rval = -1; + goto out; + } + + if (bool == FALSE) { + (void) mddserror(ep, MDE_DS_SETNAMEBUSY, sp->setno, + node_v[i], NULL, sp->setname); + rval = -1; + goto out; + } + + if (check_setdrvs_againstnode(sp, node_v[i], ep)) { + rval = -1; + goto out; + } + } + + /* Get drive descriptors for the set */ + if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL) { + if (! 
mdisok(ep)) { + rval = -1; + goto out; + } + } + + /* END CHECK CODE */ + + RB_TEST(1, "addhosts", ep) + + RB_PREEMPT; + rb_level = 1; /* level 1 */ + + RB_TEST(2, "addhosts", ep) + + /* + * Create the set where needed + */ + if (create_set_on_hosts(sp, multi_node, node_c, node_v, 0, ep)) { + goto rollback; + } + + /* + * Send suspend to rpc.mdcommd on nodes where a set has been + * created since rpc.mdcommd must now be running on the remote nodes. + */ + remote_sets_created = 1; + for (i = 0; i < node_c; i++) { + /* + * Lock out other meta* commands by suspending + * class 1 messages across the diskset. + */ + if (clnt_mdcommdctl(node_v[i], + COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1, + MD_MSCF_NO_FLAGS, ep)) { + rval = -1; + goto rollback; + } + } + + /* + * Merge the new entries into the set with the existing sides. + * Get membershiplist from API routine. If there's + * an error, fail to create set and pass back error. + */ + if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) { + goto rollback; + } + if (meta_gettimeofday(&now) == -1) { + meta_free_nodelist(nl); + (void) mdsyserror(ep, errno, + dgettext(TEXT_DOMAIN, "meta_gettimeofday()")); + goto rollback; + } + for (nodeindex = 0; nodeindex < node_c; nodeindex++) { + nd = Zalloc(sizeof (*nd)); + (void) strcpy(nd->nd_nodename, node_v[nodeindex]); + nd->nd_ctime = now; + nl2 = nl; + while (nl2) { + if (strcmp(nl2->msl_node_name, + node_v[nodeindex]) == 0) { + nd->nd_nodeid = nl2->msl_node_id; + (void) strcpy(nd->nd_priv_ic, + nl2->msl_node_addr); + break; + } + nl2 = nl2->next; + } + + /* + * Nodelist must be kept in ascending nodeid order. 
+ */ + if (sd->sd_nodelist == NULL) { + /* Nothing in list, just add it */ + sd->sd_nodelist = nd; + } else if (nd->nd_nodeid < + sd->sd_nodelist->nd_nodeid) { + /* Add to head of list */ + nd->nd_next = sd->sd_nodelist; + sd->sd_nodelist = nd; + } else { + nd_curr = sd->sd_nodelist->nd_next; + nd_prev = sd->sd_nodelist; + /* Search for place to add it */ + while (nd_curr) { + if (nd->nd_nodeid < nd_curr->nd_nodeid) { + /* Add before nd_curr */ + nd->nd_next = nd_curr; + nd_prev->nd_next = nd; + break; + } + nd_prev = nd_curr; + nd_curr = nd_curr->nd_next; + } + /* Add to end of list */ + if (nd_curr == NULL) { + nd_prev->nd_next = nd; + } + + } + /* Node already verified to be in membership */ + nd->nd_flags |= MD_MN_NODE_ALIVE; + } + meta_free_nodelist(nl); + + /* If we have drives */ + if (dd != NULL) { + /* + * For all the hosts being added, create a sidename structure + */ + nd = sd->sd_nodelist; + while (nd) { + /* Skip nodes not being added */ + if (!strinlst(nd->nd_nodename, node_c, node_v)) { + nd = nd->nd_next; + continue; + } + for (p = dd; p != NULL; p = p->dd_next) { + if (make_sideno_sidenm(sp, p->dd_dnp, + nd->nd_nodeid, ep) != 0) + goto rollback; + } + nd = nd->nd_next; + } + + RB_PREEMPT; + rb_level = 2; /* level 2 */ + + RB_TEST(4, "addhosts", ep) + + /* + * Add the new sidename for each drive to all the hosts + * + * If a multi-node diskset, each host only stores + * the side information for itself. So, only send + * side information to the new hosts where each host + * will add the appropriate side information to its + * local mddb. 
+ */ + nd = sd->sd_nodelist; + while (nd) { + /* Skip nodes not being added */ + if (!strinlst(nd->nd_nodename, node_c, + node_v)) { + nd = nd->nd_next; + continue; + } + + /* Add side info to new hosts */ + if (clnt_add_drv_sidenms(nd->nd_nodename, + mynode(), sp, sd, node_c, node_v, ep)) + goto rollback; + + nd = nd->nd_next; + } + + RB_TEST(5, "addhosts", ep) + + RB_PREEMPT; + rb_level = 3; /* level 3 */ + + RB_TEST(6, "addhosts", ep) + + /* + * Add the device names for the new sides into the namespace + * for all hosts being added. This is adding the side + * names to the diskset's mddb so add sidenames for all + * of the new hosts. + */ + nd = sd->sd_nodelist; + while (nd) { + /* Skip nodes not being added */ + if (!strinlst(nd->nd_nodename, node_c, node_v)) { + nd = nd->nd_next; + continue; + } + + /* this side was just created, add the names */ + if (add_md_sidenms(sp, nd->nd_nodeid, + MD_SIDEWILD, ep)) + goto rollback; + + nd = nd->nd_next; + } + + RB_TEST(7, "addhosts", ep) + + RB_PREEMPT; + rb_level = 4; /* level 4 */ + + RB_TEST(8, "addhosts", ep) + + if (add_db_sidenms(sp, ep)) + goto rollback; + + } else { + RB_PREEMPT; + rb_level = 4; + } + + RB_TEST(9, "addhosts", ep) + + RB_PREEMPT; + rb_level = 5; /* level 5 */ + + RB_TEST(10, "addhosts", ep) + + if (dd != NULL) { + /* + * Notify rpc.mdcommd on all nodes of a nodelist change. + * Start by suspending rpc.mdcommd (which drains it of all + * messages), then change the nodelist followed by a reinit + * and resume. 
+ */ + nd = sd->sd_nodelist; + /* Send suspend_all to nodes in nodelist (existing + new) */ + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND, + sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) { + rval = -1; + goto rollback; + } + suspendall_flag = 1; + nd = nd->nd_next; + } + } + + /* Add the node(s) to the each host that is currently in the set */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_addhosts(nd->nd_nodename, sp, node_c, node_v, ep)) { + goto rollback; + } + nd = nd->nd_next; + } + + RB_TEST(11, "addhosts", ep) + + if (dd != NULL) { + /* + * Mark the drives MD_DR_OK. + */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd, + MD_DR_OK, ep) == -1) + goto rollback; + nd = nd->nd_next; + } + } + + RB_TEST(12, "addhosts", ep) + + RB_PREEMPT; + rb_level = 6; /* level 6 */ + + RB_TEST(13, "addhosts", ep) + + + /* Add the mediator information to all hosts in the set. */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_updmeds(nd->nd_nodename, sp, &sd->sd_med, ep)) + goto rollback; + nd = nd->nd_next; + } + + RB_TEST(14, "addhosts", ep) + + /* + * If a MN diskset and there are drives in the set, + * set the master on the new nodes and + * automatically join the new nodes into the set. + */ + if (dd != NULL) { + mddb_config_t c; + /* + * Is current set STALE? 
+ */ + (void) memset(&c, 0, sizeof (c)); + c.c_id = 0; + c.c_setno = sp->setno; + if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { + (void) mdstealerror(ep, &c.c_mde); + rval = -1; + goto out; + } + if (c.c_flags & MDDB_C_STALE) { + stale_flag = MNSET_IS_STALE; + } + + /* Set master on newly added nodes */ + for (i = 0; i < node_c; i++) { + if (clnt_mnsetmaster(node_v[i], sp, + sd->sd_mn_master_nodenm, + sd->sd_mn_master_nodeid, ep)) { + goto rollback; + } + } + /* Join newly added nodes to diskset and set OWN flag */ + for (i = 0; i < node_c; i++) { + if (clnt_joinset(node_v[i], sp, stale_flag, ep)) + goto rollback; + nd = sd->sd_nodelist; + while (nd) { + if (strcmp(nd->nd_nodename, node_v[i]) == 0) { + nd->nd_flags |= MD_MN_NODE_OWN; + /* + * Also set ADD flag since this flag + * is already set in rpc.metad - it's + * just not in the local copy. + * Could flush local cache and call + * metaget_setdesc, but this just + * adds time. Since this node knows + * the state of the node flags in + * rpc.metad, just set the ADD + * flag and save time. + */ + nd->nd_flags |= MD_MN_NODE_ADD; + break; + } + nd = nd->nd_next; + } + } + + /* Send new node flag list to all Owner nodes */ + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_OWN)) { + nd = nd->nd_next; + continue; + } + /* + * Will effectively set OWN flag in records kept + * cached in rpc.metad. The ADD flag would have + * already been set by the call to clnt_addhosts. + */ + if (clnt_upd_nr_flags(nd->nd_nodename, sp, + sd->sd_nodelist, MD_NR_SET, NULL, ep)) { + goto rollback; + } + nd = nd->nd_next; + } + } + + /* + * Mark the set record MD_SR_OK + */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_upd_sr_flags(nd->nd_nodename, sp, MD_SR_OK, + ep)) { + goto rollback; + } + nd = nd->nd_next; + } + + /* + * For MN diskset: + * On each newly added node, set the node record for that node + * to OK. 
Then set all node records for the newly added + * nodes on all nodes to ok. + * + * By setting a node's own node record to ok first, even if + * the node adding the hosts panics, the rest of the nodes can + * determine the same node list during the choosing of the master + * during reconfig. So, only nodes considered for mastership + * are nodes that have both MD_MN_NODE_OK and MD_SR_OK set + * on that node's rpc.metad. If all nodes have MD_SR_OK set, + * but no node has its own MD_MN_NODE_OK set, then the set will + * be removed during reconfig since a panic occurred during the + * creation of the initial diskset. + */ + + for (i = 0; i < node_c; i++) { + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (strcmp(nd->nd_nodename, node_v[i]) == 0) + break; + nd = nd->nd_next; + } + /* Something wrong, will pick this up in next loop */ + if (nd == NULL) + continue; + + /* Only changing my local cache of node list */ + saved_nd_next = nd->nd_next; + nd->nd_next = NULL; + + /* Set node record for added host to ok on that host */ + if (clnt_upd_nr_flags(node_v[i], sp, + nd, MD_NR_OK, NULL, ep)) { + nd->nd_next = saved_nd_next; + goto rollback; + } + nd->nd_next = saved_nd_next; + } + + /* Now set all node records on all nodes to be ok */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_upd_nr_flags(nd->nd_nodename, sp, + sd->sd_nodelist, MD_NR_OK, NULL, ep)) { + goto rollback; + } + nd = nd->nd_next; + } + + RB_TEST(15, "addhosts", ep) +out: + /* + * Notify rpc.mdcommd on all nodes of a nodelist change. + * Send reinit command to mdcommd which forces it to get + * fresh set description. Then send resume. + * Resume on class 0 will resume all classes, so can skip + * doing an explicit resume of class1 (ignore suspend1_flag). 
+ */ + if (suspendall_flag) { + /* + * Don't know if nodelist contains the nodes being added + * or not, so do reinit to nodes not being added (by skipping + * any nodes in the nodelist being added) and then do + * reinit to nodes being added if remote_sets_created is 1. + */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + /* Skip nodes being added - handled later */ + if (strinlst(nd->nd_nodename, node_c, node_v)) { + nd = nd->nd_next; + continue; + } + /* Class is ignored for REINIT */ + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, + sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to reinit rpc.mdcommd.\n")); + } + nd = nd->nd_next; + } + /* + * Send reinit to added nodes that had a set created since + * rpc.mdcommd is running on the nodes with a set. + */ + if (remote_sets_created == 1) { + for (i = 0; i < node_c; i++) { + if (clnt_mdcommdctl(node_v[i], COMMDCTL_REINIT, + sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to reinit rpc.mdcommd.\n")); + } + } + } + } + if ((suspend1_flag) || (suspendall_flag)) { + /* + * Unlock diskset by resuming messages across the diskset. + * Just resume all classes so that resume is the same whether + * just one class was locked or all classes were locked. + * + * Don't know if nodelist contains the nodes being added + * or not, so do resume_all to nodes not being added (by + * skipping any nodes in the nodelist being added) and then do + * resume_all to nodes being added if remote_sets_created is 1. 
+ */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + /* Skip nodes being added - handled later */ + if (strinlst(nd->nd_nodename, node_c, node_v)) { + nd = nd->nd_next; + continue; + } + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, + sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to resume rpc.mdcommd.\n")); + } + nd = nd->nd_next; + } + /* + * Send resume to added nodes that had a set created since + * rpc.mdcommd is be running on the nodes with a set. + */ + if (remote_sets_created == 1) { + for (i = 0; i < node_c; i++) { + /* Already verified to be alive */ + if (clnt_mdcommdctl(node_v[i], COMMDCTL_RESUME, + sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to resume rpc.mdcommd.\n")); + } + } + } + meta_ping_mnset(sp->setno); + /* + * Start a resync thread on the newly added nodes + * if set is not stale. Also start a thread to update the + * abr state of all soft partitions + */ + if (stale_flag != MNSET_IS_STALE) { + for (i = 0; i < node_c; i++) { + if (clnt_mn_mirror_resync_all(node_v[i], + sp->setno, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to start resync " + "thread.\n")); + } + if (clnt_mn_sp_update_abr(node_v[i], + sp->setno, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to start sp update " + "thread.\n")); + } + } + } + } + cl_sk = cl_get_setkey(sp->setno, sp->setname); + /* + * Don't know if nodelist contains the nodes being added + * or not, so do clnt_unlock_set to nodes not being added (by + * skipping any nodes in the nodelist being added) and then do + * clnt_unlock_set to nodes being added. 
+ */ + if (lock_flag) { + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + /* Skip hosts we get in the next loop */ + if (strinlst(nd->nd_nodename, node_c, node_v)) { + nd = nd->nd_next; + continue; + } + if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + nd = nd->nd_next; + } + for (i = 0; i < node_c; i++) { + /* Already verified to be alive */ + if (clnt_unlock_set(node_v[i], cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + } + } + cl_set_setkey(NULL); + + metaflushsetname(sp); + + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + return (rval); + +rollback: + rval = -1; + + /* level 6 */ + if (rb_level > 5) { + /* + * For each node being deleted, set DEL flag and + * reset OK flag on that node first. + * Until a node has turned off its own + * rpc.metad's NODE_OK flag, that node could be + * considered for master during a reconfig. 
+ */ + for (i = 0; i < node_c; i++) { + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (strcmp(nd->nd_nodename, node_v[i]) == 0) + break; + nd = nd->nd_next; + } + /* Something wrong, handle this in next loop */ + if (nd == NULL) + continue; + + /* Only changing my local cache of node list */ + saved_nd_next = nd->nd_next; + nd->nd_next = NULL; + + /* Set flags for del host to DEL on that host */ + if (clnt_upd_nr_flags(node_v[i], sp, + nd, MD_NR_DEL, NULL, &xep)) { + mdclrerror(&xep); + } + nd->nd_next = saved_nd_next; + } + + for (i = 0; i < node_c; i++) { + if (dd != NULL) { + /* Reset master on newly added node */ + if (clnt_mnsetmaster(node_v[i], sp, "", + MD_MN_INVALID_NID, &xep)) + mdclrerror(&xep); + /* Withdraw set on newly added node */ + if (clnt_withdrawset(node_v[i], sp, &xep)) + mdclrerror(&xep); + } + /* + * Turn off owner flag in nodes to be deleted + * if there are drives in the set. + * Also, turn off NODE_OK and turn on NODE_DEL + * for nodes to be deleted. + * These flags are used to set the node + * record flags in all nodes in the set. + */ + nd = sd->sd_nodelist; + while (nd) { + if (strcmp(nd->nd_nodename, node_v[i]) == 0) { + if (dd != NULL) { + nd->nd_flags &= ~MD_MN_NODE_OWN; + } + nd->nd_flags |= MD_MN_NODE_DEL; + nd->nd_flags &= ~MD_MN_NODE_OK; + break; + } + nd = nd->nd_next; + } + } + + /* + * Now, reset owner and set delete flags for the deleted + * nodes on all nodes. + */ + nd = sd->sd_nodelist; + while (nd) { + if (clnt_upd_nr_flags(nd->nd_nodename, sp, + sd->sd_nodelist, MD_NR_SET, NULL, &xep)) { + mdclrerror(&xep); + } + nd = nd->nd_next; + } + + /* + * On each node being deleted, set the set record + * to be in DEL state. 
+ */ + for (i = 0; i < node_c; i++) { + if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, &xep)) { + mdclrerror(&xep); + } + } + } + + /* level 5 */ + if (rb_level > 4) { + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_delhosts(nd->nd_nodename, sp, node_c, + node_v, &xep) == -1) + mdclrerror(&xep); + nd = nd->nd_next; + } + } + + /* + * Notify rpc.mdcommd on all nodes of a nodelist change. + * Send reinit command to mdcommd which forces it to get + * fresh set description. Then send resume. + * Nodelist contains all nodes (existing + added). + */ + if (suspendall_flag) { + /* Send reinit */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + /* Send reinit to nodes in nodelist before addhosts call */ + while (nd) { + /* + * Skip nodes being added if remote sets were not + * created since rpc.mdcommd may not be running + * on the remote nodes. + */ + if ((remote_sets_created == 0) && + (strinlst(nd->nd_nodename, node_c, node_v))) { + nd = nd->nd_next; + continue; + } + /* Class is ignored for REINIT */ + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, + sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { + mde_perror(&xep, dgettext(TEXT_DOMAIN, + "Unable to reinit rpc.mdcommd.\n")); + mdclrerror(&xep); + } + nd = nd->nd_next; + } + + /* Send resume */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + /* + * Skip nodes being added if remote sets were not + * created since rpc.mdcommd may not be running + * on the remote nodes. + */ + if ((remote_sets_created == 0) && + (strinlst(nd->nd_nodename, node_c, node_v))) { + nd = nd->nd_next; + continue; + } + /* + * Resume all classes but class 1 so that lock is held + * against meta* commands. + * Send resume_all_but_1 to nodes in nodelist + * before addhosts call. 
+ */ + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, + sp, MD_MSG_CLASS0, MD_MSCF_DONT_RESUME_CLASS1, + &xep)) { + mde_perror(&xep, dgettext(TEXT_DOMAIN, + "Unable to resume rpc.mdcommd.\n")); + mdclrerror(&xep); + } + nd = nd->nd_next; + } + meta_ping_mnset(sp->setno); + } + + /* level 4 */ + /* Nodelist may or may not contain nodes being added. */ + if (rb_level > 3 && dd != NULL) { + nd = sd->sd_nodelist; + while (nd) { + /* Skip nodes not being added */ + if (!strinlst(nd->nd_nodename, node_c, node_v)) { + nd = nd->nd_next; + continue; + } + + if (del_db_sidenms(sp, nd->nd_nodeid, &xep)) + mdclrerror(&xep); + nd = nd->nd_next; + } + } + + /* level 3 */ + /* Nodelist may or may not contain nodes being added. */ + if (rb_level > 2 && dd != NULL) { + nd = sd->sd_nodelist; + while (nd) { + /* Skip nodes not being added */ + if (!strinlst(nd->nd_nodename, node_c, node_v)) { + nd = nd->nd_next; + continue; + } + + if (del_md_sidenms(sp, nd->nd_nodeid, &xep)) + mdclrerror(&xep); + nd = nd->nd_next; + } + } + + /* level 1 */ + if (rb_level > 0) { + if (dd != NULL) { + /* delete the drive records */ + for (i = 0; i < node_c; i++) { + if (clnt_deldrvs(node_v[i], sp, dd, &xep) == -1) + mdclrerror(&xep); + } + } + + /* delete the set record */ + for (i = 0; i < node_c; i++) { + if (clnt_delset(node_v[i], sp, &xep) == -1) + mdclrerror(&xep); + } + } + + /* level 0 */ + cl_sk = cl_get_setkey(sp->setno, sp->setname); + /* Don't test lock flag since guaranteed to be set if in rollback */ + /* Nodelist may or may not contain nodes being added. */ + /* + * Unlock diskset by resuming messages across the diskset. + * Just resume all classes so that resume is the same whether + * just one class was locked or all classes were locked. 
+ */ + if ((suspend1_flag) || (suspendall_flag)) { + /* All nodes are guaranteed to be ALIVE */ + nd = sd->sd_nodelist; + while (nd) { + /* + * Skip nodes being added since remote sets + * were either created and then deleted or + * were never created. Either way - rpc.mdcommd + * may not be running on the remote node. + */ + if (strinlst(nd->nd_nodename, node_c, node_v)) { + nd = nd->nd_next; + continue; + } + if (clnt_mdcommdctl(nd->nd_nodename, + COMMDCTL_RESUME, sp, MD_MSG_CLASS0, + MD_MSCF_NO_FLAGS, &xep)) { + mde_perror(&xep, dgettext(TEXT_DOMAIN, + "Unable to resume rpc.mdcommd.\n")); + mdclrerror(&xep); + } + nd = nd->nd_next; + } + meta_ping_mnset(sp->setno); + } + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + /* Skip hosts we get in the next loop */ + if (strinlst(nd->nd_nodename, node_c, node_v)) { + nd = nd->nd_next; + continue; + } + + if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) + mdclrerror(&xep); + nd = nd->nd_next; + } + + for (i = 0; i < node_c; i++) + if (clnt_unlock_set(node_v[i], cl_sk, &xep)) + mdclrerror(&xep); + cl_set_setkey(NULL); + + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + metaflushsetname(sp); + + return (rval); +} + +/* + * Add host(s) to the traditional diskset provided in sp. + * - create set if non-existent. + * + * If metaget_setdesc() fails with MDE_NO_SET the request is handed to + * create_set(). Otherwise the new hosts are checked, the set is locked on + * the existing and on the new hosts, and the hosts are added step by step. + * rb_level records how far the operation got so that the code at the + * rollback label can undo the steps that were completed. + * + * For an existing set the request is refused with MDE_DS_SINGLEHOST when + * auto_take is non-zero or the set has MD_SR_AUTO_TAKE set. + * + * Returns 0 on success. On failure a non-zero value is returned (-1 on + * the paths that set it directly, otherwise the value of mddserror()) and + * the failing call's error is left in ep. + */ +static int +meta_traditional_set_addhosts( + mdsetname_t *sp, + int multi_node, + int node_c, + char **node_v, + int auto_take, + md_error_t *ep +) +{ + md_set_desc *sd; + md_drive_desc *dd, *p; + med_rec_t medr; + med_rec_t rb_medr; + int rval = 0; + int bool; + int nodeindex; + int i; + int has_set; + int numsides; + sigset_t oldsigs; + md_setkey_t *cl_sk; + int rb_level = 0; + md_error_t xep = mdnullerror; + int max_meds; + + if (nodesuniq(sp, node_c, node_v, ep)) + return (-1); + + if (validate_nodes(sp, node_c, node_v, ep)) + return (-1); + + if ((sd = metaget_setdesc(sp, ep)) == NULL) { + if (! 
mdiserror(ep, MDE_NO_SET)) + return (-1); + mdclrerror(ep); + return (create_set(sp, multi_node, node_c, node_v, auto_take, + ep)); + } + + /* The auto_take behavior is inconsistent with multiple hosts. */ + if (auto_take || sd->sd_flags & MD_SR_AUTO_TAKE) { + (void) mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL, NULL, + sp->setname); + return (-1); + } + + /* + * We already have the set. + */ + + /* Make sure we own the set */ + if (meta_check_ownership(sp, ep) != 0) + return (-1); + + /* + * Perform the required checks for new hosts + */ + for (i = 0; i < node_c; i++) { + if (getnodeside(node_v[i], sd) != MD_SIDEWILD) + return (mddserror(ep, MDE_DS_NODEINSET, sp->setno, + node_v[i], NULL, sp->setname)); + + /* Make sure this set name is not used on the other hosts */ + has_set = nodehasset(sp, node_v[i], NHS_N_EQ, ep); + if (has_set < 0) { + if (! mdiserror(ep, MDE_NO_SET)) + return (-1); + /* MDE_NO_SET is not fatal here; clear it and keep going */ + mdclrerror(ep); + } else if (has_set) + return (mddserror(ep, MDE_DS_NODEHASSET, sp->setno, + node_v[i], NULL, sp->setname)); + + if (clnt_setnumbusy(node_v[i], sp->setno, &bool, ep) == -1) + return (-1); + + if (bool == TRUE) + return (mddserror(ep, MDE_DS_SETNUMBUSY, sp->setno, + node_v[i], NULL, sp->setname)); + + if (clnt_setnameok(node_v[i], sp, &bool, ep) == -1) + return (-1); + + if (bool == FALSE) + return (mddserror(ep, MDE_DS_SETNAMEBUSY, sp->setno, + node_v[i], NULL, sp->setname)); + + if (check_setdrvs_againstnode(sp, node_v[i], ep)) + return (-1); + } + + /* Count the number of occupied slots */ + numsides = 0; + for (i = 0; i < MD_MAXSIDES; i++) { + /* Count occupied slots */ + if (sd->sd_nodes[i][0] != '\0') + numsides++; + } + + /* Make sure that we have space to add the new sides */ + if ((numsides + node_c) > MD_MAXSIDES) { + (void) mddserror(ep, MDE_DS_SIDENUMNOTAVAIL, sp->setno, NULL, + NULL, sp->setname); + return (-1); + } + + /* Get drive descriptors for the set; NULL with no error in ep is accepted and the dd != NULL tests below skip the drive steps */ + if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == 
NULL) + if (! mdisok(ep)) + return (-1); + + /* Setup the mediator record roll-back structure (node list as it is before the new hosts are merged into sd) */ + (void) memset(&rb_medr, '\0', sizeof (med_rec_t)); + rb_medr.med_rec_mag = MED_REC_MAGIC; + rb_medr.med_rec_rev = MED_REC_REV; + rb_medr.med_rec_fl = 0; + rb_medr.med_rec_sn = sp->setno; + (void) strcpy(rb_medr.med_rec_snm, sp->setname); + for (i = 0; i < MD_MAXSIDES; i++) + (void) strcpy(rb_medr.med_rec_nodes[i], sd->sd_nodes[i]); + rb_medr.med_rec_meds = sd->sd_med; /* structure assignment */ + (void) memset(&rb_medr.med_rec_data, '\0', sizeof (med_data_t)); + rb_medr.med_rec_foff = 0; + crcgen(&rb_medr, &rb_medr.med_rec_cks, sizeof (med_rec_t), NULL); + + if ((max_meds = get_max_meds(ep)) == 0) + return (-1); + + /* END CHECK CODE */ + + md_rb_sig_handling_on(); + + /* Lock the set on current set members */ + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { + rval = -1; + goto out; + } + } + + /* Lock the set on new set members */ + for (i = 0; i < node_c; i++) { + if (clnt_lock_set(node_v[i], sp, ep)) { + rval = -1; + goto out; + } + } + + RB_TEST(1, "addhosts", ep) + + RB_PREEMPT; + rb_level = 1; /* level 1 */ + + RB_TEST(2, "addhosts", ep) + + /* + * Add the new hosts to the existing set record on the existing hosts + */ + for (i = 0; i < MD_MAXSIDES; i++) { + /* skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_addhosts(sd->sd_nodes[i], sp, node_c, node_v, ep)) + goto rollback; + } + + RB_PREEMPT; + rb_level = 2; /* level 2 */ + + RB_TEST(3, "addhosts", ep); + + /* Merge the new entries into the set with the existing sides (each new host takes the next empty slot of sd->sd_nodes) */ + nodeindex = 0; + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip full slots */ + if (sd->sd_nodes[i][0] != '\0') + continue; + + (void) strcpy(sd->sd_nodes[i], node_v[nodeindex++]); + if (nodeindex == node_c) + break; + } + + /* If we have drives */ + if (dd != NULL) { + /* + * For all the hosts being 
added, create a sidename structure + */ + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* Skip nodes not being added */ + if (! strinlst(sd->sd_nodes[i], node_c, node_v)) + continue; + + for (p = dd; p != NULL; p = p->dd_next) { + if (make_sideno_sidenm(sp, p->dd_dnp, i, + ep) != 0) + goto rollback; + } + } + + /* + * Add the new sidename for each drive to the existing hosts + */ + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* Skip nodes being added */ + if (strinlst(sd->sd_nodes[i], node_c, node_v)) + continue; + + if (clnt_add_drv_sidenms(sd->sd_nodes[i], mynode(), sp, + sd, node_c, node_v, ep)) { + goto rollback; + } + } + + RB_TEST(4, "addhosts", ep) + + RB_PREEMPT; + rb_level = 3; /* level 3 */ + + RB_TEST(5, "addhosts", ep) + + if (add_db_sidenms(sp, ep)) { + goto rollback; + } + + } else { + RB_PREEMPT; + rb_level = 3; /* level 3 (dd is NULL, no sidename work was done) */ + } + + RB_TEST(6, "addhosts", ep) + + RB_PREEMPT; + rb_level = 4; /* level 4 */ + + RB_TEST(7, "addhosts", ep) + + + /* create the set on the new nodes, this adds the drives as well */ + if (create_set_on_hosts(sp, multi_node, node_c, node_v, 0, ep)) { + goto rollback; + } + + RB_TEST(8, "addhosts", ep) + + RB_PREEMPT; + rb_level = 5; /* level 5 */ + + RB_TEST(9, "addhosts", ep) + + if (dd != NULL) { + + /* + * Add the device entries for the new sides into the namespace. + */ + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* Skip nodes not being added */ + if (! strinlst(sd->sd_nodes[i], node_c, node_v)) + continue; + + if (add_md_sidenms(sp, i, MD_SIDEWILD, ep)) + goto rollback; + } + } + + RB_TEST(10, "addhosts", ep) + + RB_PREEMPT; + rb_level = 6; /* level 6 */ + + RB_TEST(11, "addhosts", ep); + + if (dd != NULL) { + /* + * Mark the drives MD_DR_OK. 
+ */ + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd, + MD_DR_OK, ep) == -1) { + goto rollback; + } + } + } + + RB_TEST(12, "addhosts", ep) + + /* Bring the mediator record up to date with the set record */ + medr = rb_medr; /* structure assignment */ + for (i = 0; i < MD_MAXSIDES; i++) + (void) strcpy(medr.med_rec_nodes[i], sd->sd_nodes[i]); + crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL); + + /* Inform the mediator hosts of the new node list */ + for (i = 0; i < max_meds; i++) { + if (sd->sd_med.n_lst[i].a_cnt == 0) + continue; + + if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep)) + goto rollback; + } + + /* Add the mediator information to all hosts in the set */ + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_updmeds(sd->sd_nodes[i], sp, &sd->sd_med, ep)) + goto rollback; + } + + RB_TEST(13, "addhosts", ep) + + /* + * Mark the set record MD_SR_OK + */ + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_upd_sr_flags(sd->sd_nodes[i], sp, MD_SR_OK, ep)) + goto rollback; + } + + RB_TEST(14, "addhosts", ep) + +out: + cl_sk = cl_get_setkey(sp->setno, sp->setname); + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* Skip hosts we get in the next loop */ + if (strinlst(sd->sd_nodes[i], node_c, node_v)) + continue; + + if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + } + + /* NOTE(review): the new hosts are unlocked only while rval is still 0, so when this label is reached after a clnt_lock_set() failure (rval == -1), or after an unlock failure in the loop above, the new hosts are not unlocked here - confirm that this is intended. */ if (rval == 0) { + for (i = 0; i < node_c; i++) + if (clnt_unlock_set(node_v[i], cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + } + cl_set_setkey(NULL); + + metaflushsetname(sp); + + md_rb_sig_handling_off(md_got_sig(), md_which_sig()); + + return (rval); + 
+rollback: + /* Make sure we are blocking all signals */ + if (procsigs(TRUE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + rval = -1; /* errors from the rollback calls below go to xep and are cleared, so ep keeps the error that triggered the rollback */ + + /* level 6: send the saved mediator record (rb_medr) back to the mediator hosts and call del_md_sidenms() for the sides being added */ + if (rb_level > 5) { + for (i = 0; i < max_meds; i++) { + if (sd->sd_med.n_lst[i].a_cnt == 0) + continue; + + if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, + &rb_medr, &xep)) + mdclrerror(&xep); + } + if (dd != NULL) { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* Skip nodes not being added */ + if (! strinlst(sd->sd_nodes[i], node_c, node_v)) + continue; + + if (del_md_sidenms(sp, i, &xep)) + mdclrerror(&xep); + } + } + } + + /* level 5: remove the drive records and the set record from the hosts being added */ + if (rb_level > 4) { + if (dd != NULL) { + /* delete the drive records */ + for (i = 0; i < node_c; i++) { + if (clnt_deldrvs(node_v[i], sp, dd, &xep) == -1) + mdclrerror(&xep); + } + } + /* delete the set record on the 'new' hosts */ + for (i = 0; i < node_c; i++) { + if (clnt_delset(node_v[i], sp, &xep) == -1) + mdclrerror(&xep); + } + } + + /* level 4: call del_db_sidenms() for the sides being added */ + if (rb_level > 3 && dd != NULL) { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* Skip nodes not being added */ + if (! strinlst(sd->sd_nodes[i], node_c, node_v)) + continue; + + if (del_db_sidenms(sp, i, &xep)) + mdclrerror(&xep); + } + } + + /* level 3: NOTE(review): the forward step called clnt_add_drv_sidenms() on the existing hosts (skipping the hosts being added), whereas this loop calls clnt_del_drv_sidenms() on the hosts being added - confirm that this asymmetry is intended. */ + if (rb_level > 2 && dd != NULL) { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* Skip nodes not being added */ + if (! 
strinlst(sd->sd_nodes[i], node_c, node_v)) + continue; + + if (clnt_del_drv_sidenms(sd->sd_nodes[i], sp, + &xep) == -1) + mdclrerror(&xep); + } + } + + /* level 2: call clnt_delhosts() for the new hosts on every occupied side of sd (errors are cleared and ignored) */ + if (rb_level > 1) { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_delhosts(sd->sd_nodes[i], sp, node_c, node_v, + &xep) == -1) + mdclrerror(&xep); + } + } + + /* level 1: unlock the set, first on the hosts not in node_v and then on every host in node_v */ + if (rb_level > 0) { + cl_sk = cl_get_setkey(sp->setno, sp->setname); + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* Skip hosts we get in the next loop */ + if (strinlst(sd->sd_nodes[i], node_c, node_v)) + continue; + + if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) + mdclrerror(&xep); + } + + for (i = 0; i < node_c; i++) + if (clnt_unlock_set(node_v[i], cl_sk, &xep)) + mdclrerror(&xep); + cl_set_setkey(NULL); + } + + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + metaflushsetname(sp); + + md_rb_sig_handling_off(md_got_sig(), md_which_sig()); + + return (rval); +} + +/* + * Add host(s) to the diskset provided in sp. + * - create set if non-existent. + * + * Thin dispatcher: when multi_node is non-zero the request is passed to + * meta_multinode_set_addhosts(), otherwise to + * meta_traditional_set_addhosts(). All arguments are forwarded unchanged + * and the return value of the selected routine is returned as is. + */ +int +meta_set_addhosts( + mdsetname_t *sp, + int multi_node, + int node_c, + char **node_v, + int auto_take, + md_error_t *ep +) +{ + if (multi_node) + return (meta_multinode_set_addhosts(sp, multi_node, node_c, + node_v, auto_take, ep)); + else + return (meta_traditional_set_addhosts(sp, multi_node, node_c, + node_v, auto_take, ep)); +} + +/* + * Delete host(s) from the diskset provided in sp. + * - destroy set if last host in set is removed. 
+ */ +int +meta_set_deletehosts( + mdsetname_t *sp, + int node_c, + char **node_v, + int forceflg, + md_error_t *ep +) +{ + md_set_desc *sd; + md_drive_desc *dd; + med_rec_t medr; + med_rec_t rb_medr; + int i, j; + int has_set; + int numsides = 0; + int oha = FALSE; + sigset_t oldsigs; + mhd_mhiargs_t mhiargs; + md_replicalist_t *rlp = NULL; + md_setkey_t *cl_sk; + ulong_t max_genid = 0; + int rval = 0; + int rb_level = 0; + int max_meds = 0; + md_error_t xep = mdnullerror; + md_mnnode_desc *nd; + md_mnnode_record *nr; + int delete_master = 0; + int suspendall_flag = 0, suspendall_flag_rb = 0; + int suspend1_flag = 0; + int lock_flag = 0; + int stale_flag = 0; + int *node_id_list = NULL; + int remote_sets_deleted = 0; + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + /* + * Verify that list of nodes being deleted contains no + * duplicates. + */ + if (nodesuniq(sp, node_c, node_v, ep)) + return (-1); + + /* Make sure we own the set */ + if (meta_check_ownership(sp, ep) != 0) + return (-1); + + /* + * The drive and node records are stored in the local mddbs of each + * node in the diskset. Each node's rpc.metad daemon reads in the set, + * drive and node records from that node's local mddb and caches them + * internally. Any process needing diskset information contacts its + * local rpc.metad to get this information. Since each node in the + * diskset is independently reading the set information from its local + * mddb, the set, drive and node records in the local mddbs must stay + * in-sync, so that all nodes have a consistent view of the diskset. + * + * For a multinode diskset, explicitly verify that all nodes in the + * diskset are ALIVE (i.e. are in the API membership list) if the + * forceflag is FALSE. (The case of forceflag being TRUE is handled + * in OHA check above.) 
+ * + * If forceflag is FALSE and a node in the diskset is not in + * the membership list, then fail this operation since all nodes must + * be ALIVE in order to delete the node record from their local mddb. + * If a panic of this node leaves the local mddbs set, node and drive + * records out-of-sync, the reconfig cycle will fix the local mddbs + * and force them back into synchronization. + */ + if ((forceflg == FALSE) && (MD_MNSET_DESC(sd))) { + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + return (mddserror(ep, MDE_DS_NOTINMEMBERLIST, + sp->setno, nd->nd_nodename, + NULL, sp->setname)); + } + nd = nd->nd_next; + } + } + + + /* + * Lock the set on current set members. + * Set locking done much earlier for MN diskset than for traditional + * diskset since lock_set and SUSPEND are used to protect against + * other meta* commands running on the other nodes. + */ + if (MD_MNSET_DESC(sd)) { + /* Make sure we are blocking all signals */ + if (procsigs(TRUE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + + if (clnt_lock_set(nd->nd_nodename, sp, ep)) { + rval = -1; + goto out2; + } + lock_flag = 1; + nd = nd->nd_next; + } + /* + * Lock out other meta* commands by suspending + * class 1 messages across the diskset. + */ + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + if (clnt_mdcommdctl(nd->nd_nodename, + COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1, + MD_MSCF_NO_FLAGS, ep)) { + rval = -1; + goto out2; + } + suspend1_flag = 1; + nd = nd->nd_next; + } + } + + for (i = 0; i < node_c; i++) + if (getnodeside(node_v[i], sd) == MD_SIDEWILD) { + (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno, + node_v[i], NULL, sp->setname); + rval = -1; + goto out2; + } + + /* + * Count the number of nodes currently in the set. 
+ */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + numsides++; + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) + /* Count full slots */ + if (sd->sd_nodes[i][0] != '\0') + numsides++; + } + + /* + * OHA mode == -f -h <hostname> + * OHA is One Host Administration that occurs when the forceflag (-f) + * is set and at least one host in the diskset isn't responding + * to RPC requests. + * + * When in OHA mode, a node cannot delete itself from a diskset. + * When in OHA mode, a node can delete a list of nodes from a diskset + * even if some of the nodes in the diskset are unresponsive. + * + * For multinode diskset, only allow OHA mode when the nodes that + * aren't responding in the diskset are not in the membership list + * (i.e. nodes that aren't responding are not marked ALIVE). + * Nodes that aren't in the membership list will be rejoining + * the diskset through a reconfig cycle and the local mddb set + * and node records can be reconciled during the reconfig cycle. + * + * If a node isn't responding, but is still in the membership list, + * fail the request since the node may not be responding because + * rpc.metad died and is restarting. In this case, no reconfig + * cycle will be started, so there's no way to recover if + * the host delete operation was allowed. + * + * NOTE: if nodes that weren't in the membership when the OHA host + * delete occurred are now the only nodes in membership list, + * those nodes will see the old view of the diskset. As soon as + * a node re-enters the cluster that was present in the cluster + * during the host deletion, the diskset will reflect the host + * deletion on all nodes presently in the cluster. + */ + if (forceflg == TRUE) { + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + /* + * If a node isn't ALIVE (in member list), + * then allow a force-able delete in OHA mode. 
+ */ + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + oha = TRUE; + break; + } + /* + * Don't test for clnt_nullproc since already + * tested the RPC connections by clnt_lock_set. + */ + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_nullproc(sd->sd_nodes[i], ep) == -1) { + /* + * If we timeout to at least one + * client, then we can allow OHA mode, + * otherwise, we are in normal mode. + */ + if (mdanyrpcerror(ep)) { + mdclrerror(ep); + if (strinlst(sd->sd_nodes[i], + node_c, node_v)) { + oha = TRUE; + break; + } + } + } + } + } + } + + /* + * Don't allow this for MN diskset since meta_set_destroy of 1 node + * does NOT remove this node's node record from the other node's set + * records in their local mddb. This leaves a MN diskset in a very + * messed up state. + */ + if (!(MD_MNSET_DESC(sd))) { + /* Destroy set */ + if (forceflg == TRUE && node_c == 1 && + strcmp(mynode(), node_v[0]) == 0) { + /* Can return since !MN diskset so nothing to unlock */ + return (meta_set_destroy(sp, TRUE, ep)); + } + } + + + /* + * In multinode diskset, can only delete self if this + * is the last node in the set or if all nodes in + * the set are being deleted. The traditional diskset code + * allows a node to delete itself (when there are other nodes + * in the diskset) when using the force flag, but that code + * path doesn't have the node remove itself from + * the set node list on the other nodes. Since this isn't + * satisfactory for the multinode diskset, just don't + * allow this operation. 
+ */ + if (MD_MNSET_DESC(sd) && (numsides > 1) && (node_c != numsides) && + strinlst(mynode(), node_c, node_v)) { + (void) mddserror(ep, MDE_DS_MNCANTDELSELF, sp->setno, + mynode(), NULL, sp->setname); + rval = -1; + goto out2; + } + + /* + * In multinode diskset, don't allow deletion of master node unless + * this is the only node left or unless all nodes are being + * deleted since there is no way to switch + * master ownership (unless via a cluster reconfig cycle). + */ + delete_master = strinlst(sd->sd_mn_master_nodenm, node_c, node_v); + if (MD_MNSET_DESC(sd) && (numsides > 1) && (node_c != numsides) && + delete_master) { + (void) mddserror(ep, MDE_DS_CANTDELMASTER, sp->setno, + sd->sd_mn_master_nodenm, NULL, sp->setname); + rval = -1; + goto out2; + } + + + /* Deleting self w/o forceflg */ + if (forceflg == FALSE && numsides > 1 && + strinlst(mynode(), node_c, node_v)) { + (void) mddserror(ep, MDE_DS_CANTDELSELF, sp->setno, + mynode(), NULL, sp->setname); + rval = -1; + goto out2; + } + + /* + * Setup the mediator record roll-back structure for a trad diskset. + * + * For a MN diskset, the deletion of a host in the diskset + * does not cause an update of the mediator record. If the + * host deletion will cause the diskset to be removed (this is + * the last host being removed or all hosts are being removed) + * then the mediator record must have already been removed by the + * user or this delete host operation will fail (a check for + * this is done later in this routine). 
+ */ + if (!(MD_MNSET_DESC(sd))) { + (void) memset(&rb_medr, '\0', sizeof (med_rec_t)); + rb_medr.med_rec_mag = MED_REC_MAGIC; + rb_medr.med_rec_rev = MED_REC_REV; + rb_medr.med_rec_fl = 0; + rb_medr.med_rec_sn = sp->setno; + (void) strcpy(rb_medr.med_rec_snm, sp->setname); + for (i = 0; i < MD_MAXSIDES; i++) + (void) strcpy(rb_medr.med_rec_nodes[i], sd->sd_nodes[i]); + rb_medr.med_rec_meds = sd->sd_med; /* structure assigment */ + (void) memset(&rb_medr.med_rec_data, '\0', sizeof (med_data_t)); + rb_medr.med_rec_foff = 0; + crcgen(&rb_medr, &rb_medr.med_rec_cks, + sizeof (med_rec_t), NULL); + + /* Bring the mediator record up to date with the set record */ + medr = rb_medr; /* structure assignment */ + + if ((max_meds = get_max_meds(ep)) == 0) { + rval = -1; + goto out2; + } + } + + /* + * For traditional diskset: + * Check to see if all the hosts we are trying to delete the set from + * have a set "setname" that is the same as ours, i.e. - same name, + * same time stamp, same genid. We only do this if forceflg is not + * specified or we are in OHA mode. + */ + if (!(MD_MNSET_DESC(sd)) && (forceflg == FALSE || oha == TRUE)) { + int fix_node_v = FALSE; + int j; + + for (i = 0; i < node_c; i++) { + /* We skip this side */ + if (strcmp(mynode(), node_v[i]) == 0) + continue; + + has_set = nodehasset(sp, node_v[i], NHS_NSTG_EQ, ep); + + if (has_set < 0) { + char *anode[1]; + + /* + * Can't talk to the host only allowed in OHA + * mode. + */ + if (oha == TRUE && mdanyrpcerror(ep)) { + mdclrerror(ep); + continue; + } + + /* + * We got an error we do not, or are not, + * prepared to handle. + */ + if (! mdiserror(ep, MDE_NO_SET) && + ! mdismddberror(ep, MDE_DB_NODB)) { + rval = -1; + goto out2; + } + mdclrerror(ep); + + /* + * If we got here: both hosts are up; a host in + * our set record does not have the set. So we + * delete the host from our set and invalidate + * the node. 
+ */ + anode[0] = Strdup(node_v[i]); + + rval = del_host_noset(sp, anode, ep); + + /* + * If we delete a host, make sure the mediator + * hosts are made aware of this. + */ + for (j = 0; j < MD_MAXSIDES; j++) { + if (strcmp(medr.med_rec_nodes[j], + node_v[i]) != 0) + continue; + (void) memset(&medr.med_rec_nodes[j], + '\0', sizeof (md_node_nm_t)); + } + crcgen(&medr, &medr.med_rec_cks, + sizeof (med_rec_t), NULL); + + rb_medr = medr; /* struct assignment */ + + Free(anode[0]); + + if (rval == -1) + goto out2; + + node_v[i][0] = '\0'; + fix_node_v = TRUE; + continue; + } + + /* + * If we can talk to the host, and they do not have the + * exact set, then we disallow the operation. + */ + if (has_set == FALSE) { + (void) mddserror(ep, MDE_DS_NODENOSET, + sp->setno, node_v[i], NULL, sp->setname); + rval = -1; + goto out2; + } + } + + /* + * Here we prune the node_v's that were invalidated above. + */ + if (fix_node_v == TRUE) { + i = 0; + while (i < node_c) { + if (node_v[i][0] == '\0') { + for (j = i; (j + 1) < node_c; j++) + node_v[j] = node_v[j + 1]; + node_c--; + } + i++; + } + /* + * If we are left with no nodes, then we have + * compeleted the operation. + */ + if (node_c == 0) { + /* + * Inform the mediator hosts of the new node + * list + */ + for (i = 0; i < max_meds; i++) { + if (sd->sd_med.n_lst[i].a_cnt == 0) + continue; + + if (clnt_med_upd_rec( + &sd->sd_med.n_lst[i], sp, &medr, + ep)) + mdclrerror(ep); + } + rval = 0; + goto out2; + } + } + } + + /* + * For multinode diskset: + * If forceflag is FALSE then check to see if all the hosts we + * are trying to delete the set from have a set "setname" that + * is the same as ours, i.e. - same name, same time stamp, same genid. + * If forceflag is TRUE, then we don't care if the hosts being + * deleted have the same set information or not since user is forcing + * those hosts to be deleted. 
+ */ + if ((MD_MNSET_DESC(sd)) && (forceflg == FALSE)) { + for (i = 0; i < node_c; i++) { + /* We skip this node since comparing against it */ + if (strcmp(mynode(), node_v[i]) == 0) + continue; + + has_set = nodehasset(sp, node_v[i], NHS_NSTG_EQ, ep); + + if (has_set < 0) { + rval = -1; + goto out2; + } + + /* + * If we can talk to the host, and they do not have the + * exact set, then we disallow the operation. + */ + if (has_set == FALSE) { + (void) mddserror(ep, MDE_DS_NODENOSET, + sp->setno, node_v[i], NULL, sp->setname); + rval = -1; + goto out2; + } + } + } + + /* + * For traditional diskset: + * Can't allow user to delete their node (without deleting all nodes) + * out of a set in OHA mode, would leave a real mess. + * This action was already failed above for a MN diskset. + */ + if (!(MD_MNSET_DESC(sd)) && (oha == TRUE) && + strinlst(mynode(), node_c, node_v)) { + /* Can directly return since !MN diskset; nothing to unlock */ + return (mddserror(ep, MDE_DS_OHACANTDELSELF, sp->setno, + mynode(), NULL, sp->setname)); + } + + + /* Get the drive descriptors for this set */ + if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), + ep)) == NULL) { + if (! mdisok(ep)) { + rval = -1; + goto out2; + } + } + + /* + * We have been asked to delete all the hosts in the set, i.e. - delete + * the whole set. + */ + if (node_c == numsides) { + /* + * This is only a valid operation if all drives have been + * removed first. + */ + + if (dd != NULL) { + (void) mddserror(ep, MDE_DS_HASDRIVES, sp->setno, + NULL, NULL, sp->setname); + rval = -1; + goto out2; + } + + /* + * If a mediator is currently associated with this set, + * fail the deletion of the last host(s). + */ + if (sd->sd_med.n_cnt != 0) { + (void) mddserror(ep, MDE_DS_HASMED, sp->setno, + NULL, NULL, sp->setname); + rval = -1; + goto out2; + } + + if (! 
mdisok(ep)) { + rval = -1; + goto out2; + } + + rval = del_set_nodrives(sp, node_c, node_v, oha, ep); + remote_sets_deleted = 1; + goto out2; + } + + /* + * Get timeout values in case we need to roll back + */ + (void) memset(&mhiargs, '\0', sizeof (mhiargs)); + if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) != 0) { + rval = -1; + goto out2; + } + + if (dd != NULL) { + /* + * We need this around for re-adding DB side names later. + */ + if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) { + rval = -1; + goto out2; + } + + /* + * Alloc nodeid list if drives are present in diskset. + * nodeid list is used to reset mirror owners if the + * owner is a deleted node. + */ + if (MD_MNSET_DESC(sd)) { + node_id_list = Zalloc(sizeof (int) * node_c); + } + } + + /* Lock the set on current set members */ + if (!(MD_MNSET_DESC(sd))) { + md_rb_sig_handling_on(); + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { + if (oha == TRUE && mdanyrpcerror(ep)) { + mdclrerror(ep); + continue; + } + rval = -1; + goto out2; + } + lock_flag = 1; + } + } + + RB_TEST(1, "deletehosts", ep) + + RB_PREEMPT; + rb_level = 1; /* level 1 */ + + RB_TEST(2, "deletehosts", ep) + + if (MD_MNSET_DESC(sd)) { + md_mnnode_desc *saved_nd_next; + mddb_config_t c; + + if (dd != NULL) { + /* + * Notify rpc.mdcommd on all nodes of a nodelist change. + * Start by suspending rpc.mdcommd (which drains it of + * all messages), then change the nodelist followed + * by a reinit and resume. + */ + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + if (clnt_mdcommdctl(nd->nd_nodename, + COMMDCTL_SUSPEND, sp, + MD_MSG_CLASS0, + MD_MSCF_NO_FLAGS, ep)) { + rval = -1; + goto out2; + } + suspendall_flag = 1; + nd = nd->nd_next; + } + /* + * Is current set STALE? + * Need to know this if delete host fails and node + * is re-joined to diskset. 
+ */ + (void) memset(&c, 0, sizeof (c)); + c.c_id = 0; + c.c_setno = sp->setno; + if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { + (void) mdstealerror(ep, &c.c_mde); + rval = -1; + goto out2; + } + if (c.c_flags & MDDB_C_STALE) { + stale_flag = MNSET_IS_STALE; + } + } + + /* + * For each node being deleted, set DEL flag and + * reset OK flag on that node first. + * Until a node has turned off its own + * rpc.metad's NODE_OK flag, that node could be + * considered for master during a reconfig. + */ + for (i = 0; i < node_c; i++) { + /* + * During OHA mode, don't issue RPCs to + * non-alive nodes since there is no reason to + * wait for RPC timeouts. + */ + nd = sd->sd_nodelist; + while (nd) { + if (strcmp(nd->nd_nodename, node_v[i]) == 0) + break; + nd = nd->nd_next; + } + /* Something wrong, handle this in next loop */ + if (nd == NULL) + continue; + + /* If node_id_list is alloc'd, fill in for later use */ + if (node_id_list) + node_id_list[i] = nd->nd_nodeid; + + /* All nodes are guaranteed to be ALIVE unless OHA */ + if ((oha == TRUE) && + (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { + continue; + } + + /* Only changing my local cache of node list */ + saved_nd_next = nd->nd_next; + nd->nd_next = NULL; + + /* Set flags for del host to DEL on that host */ + if (clnt_upd_nr_flags(node_v[i], sp, + nd, MD_NR_DEL, NULL, ep)) { + nd->nd_next = saved_nd_next; + goto rollback; + } + nd->nd_next = saved_nd_next; + } + for (i = 0; i < node_c; i++) { + /* + * Turn off owner flag in nodes to be deleted + * if this node has been joined. + * Also, turn off NODE_OK and turn on NODE_DEL + * for nodes to be deleted. + * These flags are used to set the node + * record flags in all nodes in the set. + * Only withdraw nodes that are joined. + */ + nd = sd->sd_nodelist; + while (nd) { + /* + * Don't communicate with non-ALIVE node if + * in OHA - but set flags in master list so + * alive nodes are updated correctly. 
+ */ + if (strcmp(nd->nd_nodename, node_v[i]) == 0) { + if ((oha == TRUE) && + (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { + nd->nd_flags |= MD_MN_NODE_DEL; + nd->nd_flags &= ~MD_MN_NODE_OK; + nd = nd->nd_next; + continue; + } + if (nd->nd_flags & MD_MN_NODE_OWN) { + /* + * Going to set locally cached node + * flags to rollback join so in case + * of error, the rollback code knows + * which nodes to re-join. + * rpc.metad ignores the RB_JOIN flag. + */ + nd->nd_flags |= MD_MN_NODE_RB_JOIN; + nd->nd_flags &= ~MD_MN_NODE_OWN; + + /* + * Be careful in ordering of following + * steps so that recovery from a panic + * between the steps is viable. + * Only reset master info in rpc.metad + * - don't reset local cached info + * which will be used to set master + * info back if failure (rollback). + */ + if (clnt_withdrawset(nd->nd_nodename, + sp, ep)) + goto rollback; + + /* Reset master on deleted node */ + if (clnt_mnsetmaster(node_v[i], sp, "", + MD_MN_INVALID_NID, ep)) + goto rollback; + } + + nd->nd_flags |= MD_MN_NODE_DEL; + nd->nd_flags &= ~MD_MN_NODE_OK; + } + nd = nd->nd_next; + } + } + + /* + * Now, reset owner and set delete flags for the + * deleted nodes on all nodes. + */ + nd = sd->sd_nodelist; + while (nd) { + /* Skip non-ALIVE node if in OHA */ + if ((oha == TRUE) && + (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { + nd = nd->nd_next; + continue; + } + if (clnt_upd_nr_flags(nd->nd_nodename, sp, + sd->sd_nodelist, MD_NR_SET, NULL, ep)) { + goto rollback; + } + nd = nd->nd_next; + } + /* + * Notify rpc.mdcommd on all nodes of a nodelist change. + * Send reinit command to mdcommd which forces it to get + * fresh set description. 
+ */ + if (suspendall_flag) { + /* Send reinit */ + nd = sd->sd_nodelist; + while (nd) { + if ((oha == TRUE) && + (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { + nd = nd->nd_next; + continue; + } + /* Class is ignored for REINIT */ + if (clnt_mdcommdctl(nd->nd_nodename, + COMMDCTL_REINIT, + sp, NULL, MD_MSCF_NO_FLAGS, ep)) { + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to reinit rpc.mdcommd.\n")); + goto rollback; + } + nd = nd->nd_next; + } + /* Send resume */ + nd = sd->sd_nodelist; + while (nd) { + if ((oha == TRUE) && + (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { + nd = nd->nd_next; + continue; + } + if (clnt_mdcommdctl(nd->nd_nodename, + COMMDCTL_RESUME, sp, MD_MSG_CLASS0, + MD_MSCF_DONT_RESUME_CLASS1, ep)) { + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to resume rpc.mdcommd.\n")); + goto rollback; + } + nd = nd->nd_next; + } + meta_ping_mnset(sp->setno); + } + } + + + /* + * Mark the set record MD_SR_DEL on the hosts we are deleting + * If a MN diskset and OHA mode, don't issue RPC to nodes that + * are not ALIVE. + * If a MN diskset and not in OHA mode, then all nodes must respond + * to RPC (be alive) or this routine will return failure. + * If a traditional diskset, all RPC failures if in OHA mode. + */ + for (i = 0; i < node_c; i++) { + + RB_TEST(3, "deletehosts", ep) + + if ((MD_MNSET_DESC(sd)) && (oha == TRUE)) { + /* + * During OHA mode, don't issue RPCs to + * non-alive nodes since there is no reason to + * wait for RPC timeouts. 
+ */ + nd = sd->sd_nodelist; + while (nd) { + if (strcmp(nd->nd_nodename, node_v[i]) == 0) { + break; + } + nd = nd->nd_next; + } + if (nd == NULL) { + (void) mddserror(ep, MDE_DS_NODENOTINSET, + sp->setno, node_v[i], NULL, sp->setname); + goto rollback; + } else if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + /* Skip non-ALIVE node if in OHA mode */ + continue; + } else { + if (clnt_upd_sr_flags(node_v[i], sp, + MD_SR_DEL, ep)) { + goto rollback; + } + } + } else if ((MD_MNSET_DESC(sd)) && (oha == FALSE)) { + /* + * All nodes should be alive in non-oha mode. + */ + if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) { + goto rollback; + } + } else { + /* + * For traditional diskset, issue the RPC and + * ignore RPC failure if in OHA mode. + */ + if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) { + if (oha == TRUE && mdanyrpcerror(ep)) { + mdclrerror(ep); + continue; + } + goto rollback; + } + } + + RB_TEST(4, "deletehosts", ep) + } + + RB_TEST(5, "deletehosts", ep) + + RB_PREEMPT; + rb_level = 2; /* level 2 */ + + RB_TEST(6, "deletehosts", ep) + + /* Delete the set on the hosts we are deleting */ + if (del_set_on_hosts(sp, sd, dd, node_c, node_v, oha, ep)) { + if (node_id_list) + Free(node_id_list); + /* + * Failure during del_set_on_hosts would have recreated + * the diskset on the remote hosts, but for multi-owner + * disksets need to set node flags properly and REINIT and + * RESUME rpc.mdcommd, so just let the rollback code + * do this. + */ + if (MD_MNSET_DESC(sd)) + goto rollback; + return (-1); + } + remote_sets_deleted = 1; + + RB_TEST(19, "deletehosts", ep) + + RB_PREEMPT; + rb_level = 3; /* level 3 */ + + RB_TEST(20, "deletehosts", ep) + + /* Delete the host from sets on hosts not being deleted */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE unless in oha mode */ + while (nd) { + /* + * During OHA mode, don't issue RPCs to + * non-alive nodes since there is no reason to + * wait for RPC timeouts. 
+ */ + if ((oha == TRUE) && + (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { + nd = nd->nd_next; + continue; + } + + /* Skip nodes being deleted */ + if (strinlst(nd->nd_nodename, node_c, node_v)) { + nd = nd->nd_next; + continue; + } + if (clnt_delhosts(nd->nd_nodename, sp, node_c, node_v, + ep) == -1) { + goto rollback; + } + + RB_TEST(21, "deletehosts", ep) + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* Skip nodes being deleted */ + if (strinlst(sd->sd_nodes[i], node_c, node_v)) + continue; + + if (clnt_delhosts(sd->sd_nodes[i], sp, node_c, node_v, + ep) == -1) { + if (oha == TRUE && mdanyrpcerror(ep)) { + mdclrerror(ep); + continue; + } + goto rollback; + } + + RB_TEST(21, "deletehosts", ep) + } + } + + /* We have drives */ + if (dd != NULL) { + RB_TEST(22, "deletehosts", ep) + + RB_PREEMPT; + rb_level = 4; /* level 4 */ + + RB_TEST(23, "deletehosts", ep) + + /* + * Delete the old sidename for each drive on all the hosts. + * If a multi-node diskset, each host only stores + * the side information for itself. So, a multi-node + * diskset doesn't delete the old sidename for + * an old host. + * + * If a MN diskset, reset owners of mirrors that are + * owned by the deleted nodes. + */ + if (!(MD_MNSET_DESC(sd))) { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* Skip nodes being deleted */ + if (strinlst(sd->sd_nodes[i], node_c, node_v)) + continue; + + if (clnt_del_drv_sidenms(sd->sd_nodes[i], sp, + ep)) { + if (oha == TRUE && mdanyrpcerror(ep)) { + mdclrerror(ep); + continue; + } + metaflushsetname(sp); + goto rollback; + } + + RB_TEST(24, "deletehosts", ep) + } + } else { + nd = sd->sd_nodelist; + /* All nodes guaranteed to be ALIVE unless in oha mode */ + while (nd) { + /* + * If mirror owner was set to a deleted node, then + * each existing node resets mirror owner to NULL. 
+ * + * During OHA mode, don't issue RPCs to + * non-alive nodes since there is no reason to + * wait for RPC timeouts. + */ + if ((oha == TRUE) && + (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { + nd = nd->nd_next; + continue; + } + + /* Skip nodes being deleted */ + if (strinlst(nd->nd_nodename, node_c, node_v)) { + nd = nd->nd_next; + continue; + } + + /* + * If mirror owner is a deleted node, reset mirror + * owners to NULL. If an error occurs, print a + * warning and continue. Don't fail metaset + * because of mirror owner reset problem since next + * node to grab mirror will resolve this issue. + * Before next node grabs mirrors, metaset will show + * the deleted node as owner which is why an attempt + * to reset the mirror owner is made. + */ + if (clnt_reset_mirror_owner(nd->nd_nodename, sp, + node_c, &node_id_list[0], &xep) == -1) { + mde_perror(&xep, dgettext(TEXT_DOMAIN, + "Unable to reset mirror owner on" + " node %s\n"), nd->nd_nodename); + mdclrerror(&xep); + } + + RB_TEST(21, "deletehosts", ep) + nd = nd->nd_next; + } + } + } + + RB_TEST(25, "deletehosts", ep) + + RB_PREEMPT; + rb_level = 4; /* level 4 */ + + RB_TEST(26, "deletehosts", ep) + + /* + * Bring the mediator record up to date with the set record for + * traditional diskset. 
+ */ + if (!(MD_MNSET_DESC(sd))) { + medr = rb_medr; /* structure assignment */ + for (i = 0; i < MD_MAXSIDES; i++) { + if (strinlst(sd->sd_nodes[i], node_c, node_v)) + (void) memset(&medr.med_rec_nodes[i], + '\0', sizeof (md_node_nm_t)); + else + (void) strcpy(medr.med_rec_nodes[i], + sd->sd_nodes[i]); + } + crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL); + + /* Inform the mediator hosts of the new node list */ + for (i = 0; i < max_meds; i++) { + if (sd->sd_med.n_lst[i].a_cnt == 0) + continue; + + if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, + &medr, ep)) { + if (oha == TRUE && mdanyrpcerror(ep)) { + mdclrerror(ep); + continue; + } + goto rollback; + } + } + } + + RB_TEST(27, "deletehosts", ep) + + /* + * For traditional diskset: + * We are deleting ourselves out of the set and we have drives to + * consider; so we need to halt the set, release the drives and + * reset the timeout. **** THIS IS A ONE WAY TICKET, NO ROLL BACK + * IS POSSIBLE AS SOON AS THE HALT SET COMPLETES, SO THIS IS DONE + * WITH ALL SIGNALS BLOCKED AND LAST **** + * + * This situation cannot occur in a MN diskset since a node can't + * delete itself unless all nodes are being deleted and a diskset + * cannot contain any drives if all nodes are being deleted. + * So, don't even test for this if a MN diskset. + */ + if (!(MD_MNSET_DESC(sd)) && (dd != NULL) && + strinlst(mynode(), node_c, node_v)) { + /* Make sure we are blocking all signals */ + if (procsigs(TRUE, &oldsigs, ep) < 0) { + rval = -1; + goto out1; + } + + if (halt_set(sp, ep)) { + rval = -1; + goto out1; + } + + if (rel_own_bydd(sp, dd, FALSE, ep)) + rval = -1; + +out1: + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + } + +out2: + /* + * Unlock diskset by resuming messages across the diskset. 
+ * Just resume all classes so that resume is the same whether + * just one class was locked or all classes were locked. + */ + if ((suspend1_flag) || (suspendall_flag)) { + /* Send resume */ + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + /* + * Skip nodes being deleted if remote set + * was deleted since rpc.mdcommd may no longer + * be running on remote node. + */ + if ((remote_sets_deleted == 1) && + (strinlst(nd->nd_nodename, node_c, node_v))) { + nd = nd->nd_next; + continue; + } + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, + sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to resume rpc.mdcommd.\n")); + } + nd = nd->nd_next; + } + meta_ping_mnset(sp->setno); + } + + cl_sk = cl_get_setkey(sp->setno, sp->setname); + if (lock_flag) { + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + /* + * During OHA mode, don't issue RPCs to + * non-alive nodes since there is no reason to + * wait for RPC timeouts. 
+ */ + if ((oha == TRUE) && + (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { + nd = nd->nd_next; + continue; + } + if (clnt_unlock_set(nd->nd_nodename, + cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_unlock_set(sd->sd_nodes[i], + cl_sk, &xep)) { + if (oha == TRUE && + mdanyrpcerror(&xep)) { + mdclrerror(&xep); + continue; + } + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + } + } + } + cl_set_setkey(NULL); + +out3: + metafreereplicalist(rlp); + if (node_id_list) + Free(node_id_list); + + metaflushsetname(sp); + + if (MD_MNSET_DESC(sd)) { + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + } else { + md_rb_sig_handling_off(md_got_sig(), md_which_sig()); + } + + + return (rval); + +rollback: + /* all signals already blocked for MN disket */ + if (!(MD_MNSET_DESC(sd))) { + if (procsigs(TRUE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + } + + rval = -1; + + max_genid = sd->sd_genid; + + + /* + * Send reinit command to rpc.mdcommd which forces it to get + * fresh set description and resume all classes but class 0. + * Don't send any commands to rpc.mdcommd if set on that node + * has been removed. + */ + if (suspendall_flag) { + /* Send reinit */ + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + /* + * If the remote set was deleted, rpc.mdcommd + * may no longer be running so send nothing to it. 
+ */ + if ((remote_sets_deleted == 1) && + (strinlst(nd->nd_nodename, node_c, node_v))) { + nd = nd->nd_next; + continue; + } + /* Class is ignored for REINIT */ + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, + sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { + mde_perror(&xep, dgettext(TEXT_DOMAIN, + "Unable to reinit rpc.mdcommd.\n")); + mdclrerror(&xep); + } + nd = nd->nd_next; + } + /* Send resume */ + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + /* + * If the remote set was deleted, rpc.mdcommd + * may no longer be running so send nothing to it. + */ + if ((remote_sets_deleted == 1) && + (strinlst(nd->nd_nodename, node_c, node_v))) { + nd = nd->nd_next; + continue; + } + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, + sp, MD_MSG_CLASS0, MD_MSCF_DONT_RESUME_CLASS1, + &xep)) { + mde_perror(&xep, dgettext(TEXT_DOMAIN, + "Unable to resume rpc.mdcommd.\n")); + mdclrerror(&xep); + } + nd = nd->nd_next; + } + meta_ping_mnset(sp->setno); + } + + /* level 2 */ + if (rb_level > 1) { + md_set_record *sr; + md_replicalist_t *rl; + + recreate_set(sp, sd); + + /* + * Lock out other meta* commands on nodes with the newly + * re-created sets by suspending class 1 messages + * across the diskset. + */ + nd = sd->sd_nodelist; + while (nd) { + /* Skip nodes not being deleted */ + if (!(strinlst(nd->nd_nodename, node_c, node_v))) { + nd = nd->nd_next; + continue; + } + /* Suspend commd on nodes with re-created sets */ + if (clnt_mdcommdctl(nd->nd_nodename, + COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1, + MD_MSCF_NO_FLAGS, &xep)) { + mde_perror(&xep, dgettext(TEXT_DOMAIN, + "Unable to suspend rpc.mdcommd.\n")); + mdclrerror(&xep); + } + nd = nd->nd_next; + } + + max_genid++; + + /* + * See if we have to re-add the drives specified. 
+ */ + for (i = 0; i < node_c; i++) { + if (MD_MNSET_DESC(sd) && (oha == TRUE)) { + /* + * During OHA mode, don't issue RPCs to + * non-alive nodes since there is no reason to + * wait for RPC timeouts. + */ + nd = sd->sd_nodelist; + while (nd) { + if (strcmp(nd->nd_nodename, node_v[i]) + == 0) { + break; + } + nd = nd->nd_next; + } + if (nd == 0) + continue; + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) + continue; + } + + /* Don't care if set record is MN or not */ + if (clnt_getset(node_v[i], sp->setname, MD_SET_BAD, &sr, + &xep) == -1) { + mdclrerror(&xep); + continue; + } + + /* Drive already added, skip to next node */ + if (sr->sr_drivechain != NULL) { + /* + * Set record structure was allocated from RPC + * routine getset so this structure is only of + * size md_set_record even if the MN flag is + * set. So, clear the flag so that the free + * code doesn't attempt to free a structure + * the size of md_mnset_record. + */ + sr->sr_flags &= ~MD_SR_MN; + free_sr(sr); + continue; + } + + if (clnt_adddrvs(node_v[i], sp, dd, sr->sr_ctime, + sr->sr_genid, &xep) == -1) + mdclrerror(&xep); + + if (clnt_upd_dr_flags(node_v[i], sp, dd, MD_DR_OK, + &xep) == -1) + mdclrerror(&xep); + + /* + * Set record structure was allocated from RPC routine + * getset so this structure is only of size + * md_set_record even if the MN flag is set. So, + * clear the flag so that the free code doesn't + * attempt to free a structure the size of + * md_mnset_record. + */ + sr->sr_flags &= ~MD_SR_MN; + free_sr(sr); + } + max_genid += 3; + + for (rl = rlp; rl != NULL; rl = rl->rl_next) { + md_replica_t *r = rl->rl_repp; + /* + * This is not the first replica being added to the + * diskset so call with ADDSIDENMS_BCAST. If this + * is a traditional diskset, the bcast flag is ignored + * since traditional disksets don't use the rpc.mdcommd. 
+ */ + if (meta_db_addsidenms(sp, r->r_namep, r->r_blkno, + DB_ADDSIDENMS_BCAST, &xep)) + mdclrerror(&xep); + } + + /* + * Add the device names for the new sides into the namespace, + * on all hosts not being deleted. + */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + /* Find a node that is not being deleted */ + if (!strinlst(nd->nd_nodename, node_c, + node_v)) { + j = nd->nd_nodeid; + break; + } + nd = nd->nd_next; + } + } else { + for (j = 0; j < MD_MAXSIDES; j++) { + /* Skip empty slots */ + if (sd->sd_nodes[j][0] == '\0') + continue; + + /* Find a node that is not being deleted */ + if (!strinlst(sd->sd_nodes[j], node_c, node_v)) + break; + } + } + + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + /* Skip nodes not being deleted */ + if (!strinlst(nd->nd_nodename, node_c, + node_v)) { + nd = nd->nd_next; + continue; + } + + /* this side was just created, add the names */ + if (add_md_sidenms(sp, nd->nd_nodeid, j, &xep)) + mdclrerror(&xep); + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* Skip nodes not being deleted */ + if (!strinlst(sd->sd_nodes[i], node_c, node_v)) + continue; + + /* this side was just created, add the names */ + if (add_md_sidenms(sp, i, j, &xep)) + mdclrerror(&xep); + } + } + } + + /* level 4 */ + if (rb_level > 3 && dd != NULL) { + /* + * Add the new sidename for each drive to all the hosts + * Multi-node disksets only store the sidename for + * that host, so there is nothing to re-add. 
+ */ + if (!(MD_MNSET_DESC(sd))) { + for (j = 0; j < MD_MAXSIDES; j++) { + /* Skip empty slots */ + if (sd->sd_nodes[j][0] == '\0') + continue; + + /* Skip nodes not being deleted */ + if (!strinlst(sd->sd_nodes[j], node_c, node_v)) + break; + } + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_add_drv_sidenms(sd->sd_nodes[i], + sd->sd_nodes[j], sp, sd, node_c, node_v, + &xep)) + mdclrerror(&xep); + } + } + + } + + /* level 5 */ + if ((rb_level > 4) && (!(MD_MNSET_DESC(sd)))) { + /* rollback the mediator record */ + for (i = 0; i < max_meds; i++) { + if (sd->sd_med.n_lst[i].a_cnt == 0) + continue; + + if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, + &rb_medr, &xep)) + mdclrerror(&xep); + } + } + + /* level 3 */ + if (rb_level > 2) { + md_set_record *sr; + md_mnset_record *mnsr; + + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + /* + * During OHA mode, don't issue RPCs to + * non-alive nodes since there is no reason to + * wait for RPC timeouts. 
+ */ + while (nd) { + if ((oha == TRUE) && + (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { + nd = nd->nd_next; + continue; + } + /* Record should be for a multi-node diskset */ + if (clnt_mngetset(nd->nd_nodename, sp->setname, + MD_SET_BAD, &mnsr, &xep) == -1) { + mdclrerror(&xep); + nd = nd->nd_next; + continue; + } + + has_set = 1; + + nr = mnsr->sr_nodechain; + while (nr) { + if (nd->nd_nodeid == nr->nr_nodeid) { + break; + } + nr = nr->nr_next; + } + if (nr == NULL) + has_set = 0; + + free_sr((struct md_set_record *)mnsr); + if (has_set) { + nd = nd->nd_next; + continue; + } + + if (clnt_addhosts(nd->nd_nodename, sp, node_c, + node_v, &xep) == -1) + mdclrerror(&xep); + + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* Record should be for a non-multi-node set */ + if (clnt_getset(sd->sd_nodes[i], sp->setname, + MD_SET_BAD, &sr, &xep) == -1) { + mdclrerror(&xep); + continue; + } + + /* + * Set record structure was allocated from RPC + * routine getset so this structure is only of + * size md_set_record even if the MN flag is + * set. So, clear the flag so that the free + * code doesn't attempt to free a structure + * the size of md_mnset_record. + */ + if (MD_MNSET_REC(sr)) { + sr->sr_flags &= ~MD_SR_MN; + free_sr(sr); + continue; + } + + has_set = 1; + for (j = 0; j < MD_MAXSIDES; j++) { + /* Skip empty slots */ + if (sd->sd_nodes[j][0] == '\0') + continue; + + if (sr->sr_nodes[j][0] == '\0') { + has_set = 0; + break; + } + } + + free_sr(sr); + if (has_set) + continue; + + if (clnt_addhosts(sd->sd_nodes[i], sp, node_c, + node_v, &xep) == -1) + mdclrerror(&xep); + } + } + max_genid++; + } + + /* level 1 */ + if (rb_level > 0) { + max_genid++; + /* Sets MD_SR_OK on given nodes. */ + resync_genid(sp, sd, max_genid, node_c, node_v); + + /* + * For MN diskset: + * On each newly re-added node, set the node record for that + * node to OK. 
Then set all node records for the newly added + * nodes on all nodes to ok. + * + * By setting a node's own node record to ok first, even if + * the node re-adding the hosts panics, the rest of the nodes + * can determine the same node list during the choosing of the + * master during reconfig. So, only nodes considered for + * mastership are nodes that have both MD_MN_NODE_OK and + * MD_SR_OK set on that node's rpc.metad. If all nodes have + * MD_SR_OK set, but no node has its own MD_MN_NODE_OK set, + * then the set will be removed during reconfig since a panic + * occurred during the re-creation of the deletion of + * the initial diskset. + */ + if (MD_MNSET_DESC(sd)) { + md_mnnode_desc *saved_nd_next; + if (dd != NULL) { + /* + * Notify rpc.mdcommd on all nodes of a + * nodelist change. Start by suspending + * rpc.mdcommd (which drains it of all + * messages), then change the nodelist + * followed by a reinit and resume. + */ + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & + MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + if (clnt_mdcommdctl(nd->nd_nodename, + COMMDCTL_SUSPEND, sp, + MD_MSG_CLASS0, + MD_MSCF_NO_FLAGS, &xep)) { + mde_perror(&xep, + dgettext(TEXT_DOMAIN, + "Unable to suspend " + "rpc.mdcommd.\n")); + mdclrerror(&xep); + } + suspendall_flag_rb = 1; + nd = nd->nd_next; + } + } + for (i = 0; i < node_c; i++) { + /* + * During OHA mode, don't issue RPCs to + * non-alive nodes since there is no reason to + * wait for RPC timeouts. + */ + nd = sd->sd_nodelist; + while (nd) { + if (strcmp(nd->nd_nodename, node_v[i]) + == 0) + break; + nd = nd->nd_next; + } + /* Something wrong, finish this in next loop */ + if (nd == NULL) + continue; + + if ((oha == TRUE) && + (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { + continue; + } + + if (dd != NULL) { + /* Set master on re-joining node. 
*/ + if (clnt_mnsetmaster(node_v[i], sp, + sd->sd_mn_master_nodenm, + sd->sd_mn_master_nodeid, &xep)) { + mdclrerror(&xep); + } + + /* + * Re-join set to same state as + * before - stale or non-stale. + */ + if (clnt_joinset(node_v[i], sp, + stale_flag, &xep)) { + mdclrerror(&xep); + } + } + + /* Only changing my local cache of node list */ + saved_nd_next = nd->nd_next; + nd->nd_next = NULL; + + /* Set record for host to ok on that host */ + if (clnt_upd_nr_flags(node_v[i], sp, + nd, MD_NR_OK, NULL, &xep)) { + mdclrerror(&xep); + } + nd->nd_next = saved_nd_next; + } + + /* Now set all node records on all nodes to be ok */ + nd = sd->sd_nodelist; + while (nd) { + /* + * During OHA mode, don't issue RPCs to + * non-alive nodes since there is no reason to + * wait for RPC timeouts. + */ + if ((oha == TRUE) && + (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { + nd = nd->nd_next; + continue; + } + if (clnt_upd_nr_flags(nd->nd_nodename, sp, + sd->sd_nodelist, MD_NR_OK, NULL, &xep)) { + mdclrerror(&xep); + } + nd = nd->nd_next; + } + } + } + + /* + * Notify rpc.mdcommd on all nodes of a nodelist change. + * Send reinit command to mdcommd which forces it to get + * fresh set description. + */ + if (suspendall_flag_rb) { + /* Send reinit */ + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + + /* Class is ignored for REINIT */ + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT, + sp, NULL, MD_MSCF_NO_FLAGS, &xep)) { + mde_perror(&xep, dgettext(TEXT_DOMAIN, + "Unable to reinit rpc.mdcommd.\n")); + mdclrerror(&xep); + } + nd = nd->nd_next; + } + } + + /* + * Unlock diskset by resuming messages across the diskset. + * Just resume all classes so that resume is the same whether + * just one class was locked or all classes were locked. 
+ */ + if ((suspend1_flag) || (suspendall_flag) || (suspendall_flag_rb)) { + /* Send resume */ + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, + sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { + mde_perror(&xep, dgettext(TEXT_DOMAIN, + "Unable to resume rpc.mdcommd.\n")); + } + nd = nd->nd_next; + } + meta_ping_mnset(sp->setno); + } + + /* + * Start a resync thread on the re-added nodes + * if set is not stale. Also start a thread to update the + * abr state of all soft partitions + */ + if (stale_flag != MNSET_IS_STALE) { + for (i = 0; i < node_c; i++) { + /* + * During OHA mode, don't issue RPCs to + * non-alive nodes since there is no reason to + * wait for RPC timeouts. + */ + nd = sd->sd_nodelist; + while (nd) { + if (strcmp(nd->nd_nodename, node_v[i]) + == 0) + break; + nd = nd->nd_next; + } + if (nd == NULL) + continue; + + if ((oha == TRUE) && + (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { + continue; + } + + if (dd != 0) { + if (clnt_mn_mirror_resync_all(node_v[i], + sp->setno, &xep)) { + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to start resync " + "thread.\n")); + } + if (clnt_mn_sp_update_abr(node_v[i], + sp->setno, &xep)) { + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to start sp update " + "thread.\n")); + } + } + } + } + + /* level 0 */ + cl_sk = cl_get_setkey(sp->setno, sp->setname); + /* Don't test lock flag since guaranteed to be set if in rollback */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + /* + * During OHA mode, don't issue RPCs to + * non-alive nodes since there is no reason to + * wait for RPC timeouts. 
+ */ + if ((oha == TRUE) && + (!(nd->nd_flags & MD_MN_NODE_ALIVE))) { + nd = nd->nd_next; + continue; + } + if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) + mdclrerror(&xep); + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) + mdclrerror(&xep); + } + } + cl_set_setkey(NULL); + + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + metafreereplicalist(rlp); + if (node_id_list) + Free(node_id_list); + + metaflushsetname(sp); + + if (!(MD_MNSET_DESC(sd))) { + md_rb_sig_handling_off(md_got_sig(), md_which_sig()); + } + + return (rval); +} + +int +meta_set_auto_take( + mdsetname_t *sp, + int take_val, + md_error_t *ep +) +{ + int i; + md_set_desc *sd; + int rval = 0; + md_setkey_t *cl_sk; + md_error_t xep = mdnullerror; + char *hostname; + md_drive_desc *dd; + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + /* Make sure we own the set */ + if (meta_check_ownership(sp, ep) != 0) + return (-1); + + hostname = mynode(); + + /* Lock the set on our side */ + if (clnt_lock_set(hostname, sp, ep)) { + rval = -1; + goto out; + } + + if (take_val) { + /* enable auto_take but only if it is not already set */ + if (! 
(sd->sd_flags & MD_SR_AUTO_TAKE)) { + /* verify that we're the only host in the set */ + for (i = 0; i < MD_MAXSIDES; i++) { + if (sd->sd_nodes[i] == NULL || sd->sd_nodes[i][0] == '\0') + continue; + + if (strcmp(sd->sd_nodes[i], hostname) != 0) { + (void) mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL, + NULL, sp->setname); + rval = -1; + goto out; + } + } + + if (clnt_enable_sr_flags(hostname, sp, MD_SR_AUTO_TAKE, ep)) + rval = -1; + + /* Disable SCSI reservations */ + if (sd->sd_flags & MD_SR_MB_DEVID) + dd = metaget_drivedesc(sp, MD_BASICNAME_OK | PRINT_FAST, + &xep); + else + dd = metaget_drivedesc(sp, MD_BASICNAME_OK, &xep); + if (! mdisok(&xep)) + mdclrerror(&xep); + + if (dd != NULL) { + if (rel_own_bydd(sp, dd, TRUE, &xep)) + mdclrerror(&xep); + } + } + + } else { + /* disable auto_take, if set, or error */ + if (sd->sd_flags & MD_SR_AUTO_TAKE) { + if (clnt_disable_sr_flags(hostname, sp, MD_SR_AUTO_TAKE, ep)) + rval = -1; + + /* Enable SCSI reservations */ + if (sd->sd_flags & MD_SR_MB_DEVID) + dd = metaget_drivedesc(sp, MD_BASICNAME_OK | PRINT_FAST, + &xep); + else + dd = metaget_drivedesc(sp, MD_BASICNAME_OK, &xep); + if (! 
mdisok(&xep)) + mdclrerror(&xep); + + if (dd != NULL) { + mhd_mhiargs_t mhiargs = defmhiargs; + + if (tk_own_bydd(sp, dd, &mhiargs, TRUE, &xep)) + mdclrerror(&xep); + } + + } else { + (void) mddserror(ep, MDE_DS_AUTONOTSET, sp->setno, NULL, NULL, + sp->setname); + rval = -1; + } + } + +out: + cl_sk = cl_get_setkey(sp->setno, sp->setname); + if (clnt_unlock_set(hostname, cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + cl_set_setkey(NULL); + + return (rval); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_set_med.c b/usr/src/lib/lvm/libmeta/common/meta_set_med.c new file mode 100644 index 0000000000..02b39d39ee --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_set_med.c @@ -0,0 +1,1253 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Metadevice diskset interfaces + */ + +#include "meta_set_prv.h" +#include <sys/lvm/md_crc.h> +#include <sys/lvm/mdmed.h> + +#include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/svm.h> + +#define MALSIZ 32 + +static int +add_lst(char ***listp, char *item) +{ + int i, j; + + if (*listp) { + for (i = 0; (*listp)[i]; i++) + /* void */; + } else { + *listp = (char **)Zalloc(MALSIZ * sizeof (char *)); + i = 0; + } + + (*listp)[i] = Strdup(item); + + if ((++i % MALSIZ) == 0) { + *listp = (char **)Realloc((void *)*listp, + (i + MALSIZ) * sizeof (char *)); + for (j = i; j < (i + MALSIZ); j++) + (*listp)[j] = (char *)NULL; + } + return (i); +} + +static int +del_lst(char ***listp) +{ + int i; + + if (*listp) { + for (i = 0; (*listp)[i]; i++) + free((*listp)[i]); + free(*listp); + *listp = NULL; + return (1); + } else + return (0); +} + + +static int +validate_med_nodes( + mdsetname_t *sp, + md_h_arr_t *mhp, + md_error_t *ep +) +{ + char *hostname; + char *nodename; + char *nm; + char *cp; + int i, j; + + + for (i = 0; i < MED_MAX_HOSTS; i++) { + if (mhp->n_lst[i].a_cnt == 0) + continue; + + for (j = 0; j < mhp->n_lst[i].a_cnt; j++) { + nm = mhp->n_lst[i].a_nm[j]; + + for (cp = nm; *cp; cp++) + if (!isprint(*cp) || + strchr(INVALID_IN_NAMES, *cp) != NULL) + return (mddserror(ep, + MDE_DS_INVALIDMEDNAME, + sp->setno, nm, NULL, sp->setname)); + + if (clnt_med_hostname(nm, &hostname, ep)) + return (-1); + + if (j == 0) { + if (strcmp(nm, hostname) != 0) { + Free(hostname); + return (mddserror(ep, + MDE_DS_NOTNODENAME, sp->setno, nm, + NULL, sp->setname)); + } + nodename = nm; + } else { + if (strcmp(nodename, hostname) != 0) { + Free(hostname); + return (mddserror(ep, + MDE_DS_ALIASNOMATCH, sp->setno, nm, + nodename, sp->setname)); + } + } + Free(hostname); + } + } + return (0); +} + +/* + * Exported Entry Points + */ + +int +meta_set_addmeds( + mdsetname_t *sp, + int node_c, + char **node_v, + md_error_t *ep +) +{ + 
md_set_desc *sd = NULL; + md_drive_desc *dd = NULL; + mddb_med_parm_t mp; + mddb_med_upd_parm_t mup; + md_h_arr_t t; + md_h_arr_t rb_t; + med_rec_t medr; + med_rec_t rb_medr; + char *cp; + char **n_l = NULL; + int n_c = 0; + int i, j; + sigset_t oldsigs; + md_setkey_t *cl_sk; + int rb_level = 0; + md_error_t xep = mdnullerror; + int rval = 0; + int max_meds; + md_mnnode_desc *nd; + int suspend1_flag = 0; + int lock_flag = 0; + + /* Initialize */ + (void) memset(&t, '\0', sizeof (t)); + t.n_cnt = node_c; + mdclrerror(ep); + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + /* Make sure we own the set */ + if (meta_check_ownership(sp, ep) != 0) + return (-1); + + if ((max_meds = get_max_meds(ep)) == 0) + return (-1); + + /* + * The mediator information (which is part of the set record) is + * stored in the local mddbs of each node in the diskset. + * Each node's rpc.metad daemon reads in the set + * records from that node's local mddb and caches them + * internally. Any process needing diskset information contacts its + * local rpc.metad to get this information. Since each node in the + * diskset is independently reading the set information from its local + * mddb, the set records in the local mddbs must stay + * in-sync, so that all nodes have a consistent view of the diskset. + * + * For a multinode diskset, explicitly verify that all nodes in the + * diskset are ALIVE (i.e. are in the API membership list). Otherwise, + * fail this operation since all nodes must be ALIVE in order to add + * the mediator information to the set record in their local mddb. + * If a panic of this node leaves the local mddbs set records + * out-of-sync, the reconfig cycle will fix the local mddbs and + * force them back into synchronization. 
+ */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, + sp->setno, + nd->nd_nodename, NULL, sp->setname); + return (-1); + } + nd = nd->nd_next; + } + } + + /* + * Reject a host list longer than MED_MAX_HOSTS before the parse + * loop below fills t.n_lst[]; validate_med_nodes() walks exactly + * MED_MAX_HOSTS entries of that array. Nothing is locked yet, so + * a plain return is safe. + * NOTE(review): confirm the n_lst[] dimension in md_h_arr_t. + */ + if (node_c > MED_MAX_HOSTS) + return (mderror(ep, MDE_TOOMANYMED, NULL)); + + /* Parse the command line into the md_h_arr_t structure */ + for (i = 0; i < t.n_cnt; i++) { + cp = strtok(node_v[i], ","); + j = 0; + while (cp) { + if (strlen(cp) > (size_t)MD_MAX_NODENAME) + return (mddserror(ep, MDE_DS_NODENAMETOOLONG, + sp->setno, cp, NULL, sp->setname)); + if (j >= MAX_HOST_ADDRS) + return (mddserror(ep, MDE_DS_TOOMANYALIAS, + sp->setno, cp, NULL, sp->setname)); + + (void) strcpy(t.n_lst[i].a_nm[j], cp); + + j++; + + cp = strtok(NULL, ","); + } + t.n_lst[i].a_cnt = j; + } + + /* Make a list of nodes to check */ + for (i = 0; i < t.n_cnt; i++) + for (j = 0; j < t.n_lst[i].a_cnt; j++) + n_c = add_lst(&n_l, t.n_lst[i].a_nm[j]); + + /* Make sure that there are no redundant nodes */ + rval = nodesuniq(sp, n_c, n_l, ep); + + (void) del_lst(&n_l); + + if (rval != 0) + return (rval); + + /* + * Lock the set on current set members. + * Set locking done much earlier for MN diskset than for traditional + * diskset since lock_set and SUSPEND are used to protect against + * other metaset commands running on the other nodes. + */ + if (MD_MNSET_DESC(sd)) { + /* Make sure we are blocking all signals */ + if (procsigs(TRUE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_lock_set(nd->nd_nodename, sp, ep)) { + rval = -1; + goto out; + } + lock_flag = 1; + nd = nd->nd_next; + } + /* + * Lock out other meta* commands by suspending + * class 1 messages across the diskset. 
+ */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_mdcommdctl(nd->nd_nodename, + COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1, + MD_MSCF_NO_FLAGS, ep)) { + rval = -1; + goto out; + } + suspend1_flag = 1; + nd = nd->nd_next; + } + } + + if (validate_med_nodes(sp, &t, ep)) { + rval = -1; + goto out; + } + + /* Check existing mediators against new, if any */ + if (sd->sd_med.n_cnt > 0) { + for (i = 0; i < max_meds; i++) + if (sd->sd_med.n_lst[i].a_cnt > 0) + n_c = add_lst(&n_l, + sd->sd_med.n_lst[i].a_nm[0]); + + for (i = 0; i < t.n_cnt; i++) { + if (strinlst(t.n_lst[i].a_nm[0], n_c, n_l)) { + (void) del_lst(&n_l); + (void) mddserror(ep, MDE_DS_ISMED, sp->setno, + t.n_lst[i].a_nm[0], NULL, + sp->setname); + rval = -1; + goto out; + } + } + (void) del_lst(&n_l); + } + + if ((t.n_cnt + sd->sd_med.n_cnt) > max_meds) { + (void) mderror(ep, MDE_TOOMANYMED, NULL); + rval = -1; + goto out; + } + + /* Copy the current mediator list for rollback */ + rb_t = sd->sd_med; /* structure assignment */ + + /* Setup the mediator record roll-back structure */ + (void) memset(&rb_medr, '\0', sizeof (med_rec_t)); + rb_medr.med_rec_mag = MED_REC_MAGIC; + rb_medr.med_rec_rev = MED_REC_REV; + rb_medr.med_rec_fl = 0; + rb_medr.med_rec_sn = sp->setno; + (void) strcpy(rb_medr.med_rec_snm, sp->setname); + if (MD_MNSET_DESC(sd)) { + /* + * For a MN diskset the mediator is not given a list of + * hosts in the set. Instead a generic name (multiowner) is + * given to the mediator which will allow any node to access + * the mediator data as long as it provides the correct + * setname and set number. In a MN diskset, the mediator + * data is only used when a first node joins the diskset + * and becomes the master of the MN diskset. + * + * The traditional diskset code keeps the host list in + * the mediator record up to date with respect to the host + * list in the traditional diskset. 
This keeps an unauthorized + * node in the traditional diskset from accessing the data + * in the mediator record and being able to 'take' the + * diskset. + * + * This additional check is needed in the traditional diskset + * since a panic during the metaset command can leave + * the diskset with some nodes thinking that an + * action has occurred and other nodes thinking the opposite. + * A node may have really been removed from a diskset, but + * that node doesn't realize this so this node must be + * blocked from using the mediator data when attempting + * to 'take' the diskset. + * (Traditional diskset code has each node's rpc.metad + * cleaning up from an inconsistent state without any + * knowledge from the other nodes in the diskset). + * + * In the MN diskset, the reconfig steps force a consistent + * state across all nodes in the diskset, so no node + * needs to be blocked from accessing the mediator data. + * This allows the MN diskset to use a common 'nodename' + * in the mediator record. This allows the mediator + * daemon to remain unchanged even though a large number of + * nodes are supported by the MN diskset. 
+ */ + (void) strlcpy(rb_medr.med_rec_nodes[0], MED_MN_CALLER, + MD_MAX_NODENAME_PLUS_1); + } else { + for (i = 0; i < MD_MAXSIDES; i++) + (void) strcpy(rb_medr.med_rec_nodes[i], + sd->sd_nodes[i]); + } + rb_medr.med_rec_meds = sd->sd_med; /* structure assigment */ + (void) memset(&rb_medr.med_rec_data, '\0', sizeof (med_data_t)); + rb_medr.med_rec_foff = 0; + crcgen(&rb_medr, &rb_medr.med_rec_cks, sizeof (med_rec_t), NULL); + + /* Merge new mediators into the set record */ + for (i = 0; i < t.n_cnt; i++) { + for (j = 0; j < max_meds; j++) { + if (sd->sd_med.n_lst[j].a_cnt > 0) + continue; + sd->sd_med.n_lst[j] = t.n_lst[i]; + SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_ADD, SVM_TAG_MEDIATOR, + sp->setno, j); + sd->sd_med.n_cnt++; + break; + } + } + + /* + * Setup the kernel mediator list, which also validates that the + * hosts have valid IP addresses + */ + (void) memset(&mp, '\0', sizeof (mddb_med_parm_t)); + mp.med_setno = sp->setno; + + /* Copy the hostnames */ + if (meta_h2hi(&sd->sd_med, &mp.med, ep)) { + rval = -1; + goto out; + } + + /* Resolve the IP addresses for the host list */ + if (meta_med_hnm2ip(&mp.med, ep)) { + rval = -1; + goto out; + } + + /* Bring the mediator record up to date with the set record */ + medr = rb_medr; /* structure assignment */ + medr.med_rec_meds = sd->sd_med; /* structure assigment */ + crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL); + + /* END CHECK CODE */ + + /* Lock the set on current set members */ + if (!(MD_MNSET_DESC(sd))) { + /* all signals already blocked for MN disket */ + md_rb_sig_handling_on(); + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { + rval = -1; + goto out; + } + lock_flag = 1; + } + } + + RB_TEST(1, "meta_set_addmeds", ep) + + RB_PREEMPT; + rb_level = 1; /* level 1 */ + + RB_TEST(2, "meta_set_addmeds", ep) + + /* + * Add the new mediator information to all hosts in the set. 
+ * For MN diskset, each node sends mediator list to its kernel. + */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + /* All nodes are guaranteed to be ALIVE */ + if (clnt_updmeds(nd->nd_nodename, sp, &sd->sd_med, ep)) + goto rollback; + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_updmeds(sd->sd_nodes[i], sp, &sd->sd_med, ep)) + goto rollback; + } + } + + RB_TEST(3, "meta_set_addmeds", ep) + + RB_PREEMPT; + rb_level = 2; /* level 2 */ + + RB_TEST(4, "meta_set_addmeds", ep) + + if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), + ep)) == NULL) { + if (! mdisok(ep)) + goto rollback; + } + + RB_TEST(5, "meta_set_addmeds", ep) + + RB_PREEMPT; + rb_level = 3; /* level 3 */ + + RB_TEST(6, "meta_set_addmeds", ep) + + /* Inform the mediator hosts of the new information */ + for (i = 0; i < max_meds; i++) { + if (sd->sd_med.n_lst[i].a_cnt == 0) + continue; + + /* medr contains new mediator node list */ + if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep)) + goto rollback; + } + + RB_TEST(7, "meta_set_addmeds", ep) + + RB_PREEMPT; + rb_level = 4; /* level 4 */ + + RB_TEST(8, "meta_set_addmeds", ep) + + /* In MN diskset, mediator list updated in clnt_updmeds call */ + if (dd != NULL) { + if (!(MD_MNSET_DESC(sd))) { + if (metaioctl(MD_MED_SET_LST, &mp, &mp.med_mde, + NULL) != 0) { + (void) mdstealerror(ep, &mp.med_mde); + goto rollback; + } + } + + /* + * If only 50% mddbs available, mediator will be + * golden by this ioctl on a traditional diskset. + * + * On a MN disket, this only happens if the mediator + * add operation is executed on the master node. + * If a slave node is adding the mediator, the mediator + * won't be marked golden until the next mddb change. 
+ */ + (void) memset(&mup, '\0', sizeof (mddb_med_upd_parm_t)); + mup.med_setno = sp->setno; + if (metaioctl(MD_MED_UPD_MED, &mup, &mup.med_mde, NULL) != 0) + mdclrerror(&mup.med_mde); + } + +out: + if (suspend1_flag) { + /* + * Unlock diskset by resuming messages across the diskset. + * Just resume all classes so that resume is the same whether + * just one class was locked or all classes were locked. + */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, + sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to resume rpc.mdcommd.\n")); + } + nd = nd->nd_next; + } + meta_ping_mnset(sp->setno); + } + if (lock_flag) { + cl_sk = cl_get_setkey(sp->setno, sp->setname); + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + /* All nodes are guaranteed to be ALIVE */ + if (clnt_unlock_set(nd->nd_nodename, + cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_unlock_set(sd->sd_nodes[i], + cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + } + } + cl_set_setkey(NULL); + } + + metafreedrivedesc(&dd); + + if (MD_MNSET_DESC(sd)) { + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + } else { + md_rb_sig_handling_off(md_got_sig(), md_which_sig()); + } + + return (rval); + +rollback: + /* all signals already blocked for MN disket */ + if (!(MD_MNSET_DESC(sd))) { + if (procsigs(TRUE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + } + + rval = -1; + + /* + * level 4 + * In MN diskset, mediator list updated in clnt_updmeds call + */ + if (rb_level > 3 && (dd != NULL) && (!(MD_MNSET_DESC(sd)))) { 
+ (void) memset(&mp, '\0', sizeof (mddb_med_parm_t)); + mp.med_setno = sp->setno; + (void) meta_h2hi(&rb_t, &mp.med, &xep); + mdclrerror(&xep); + (void) meta_med_hnm2ip(&mp.med, &xep); + mdclrerror(&xep); + (void) metaioctl(MD_MED_SET_LST, &mp, &mp.med_mde, NULL); + } + + /* level 3 */ + if (rb_level > 2) { + for (i = 0; i < max_meds; i++) { + if (sd->sd_med.n_lst[i].a_cnt == 0) + continue; + + /* + * rb_medr contains the rollback mediator node list. + * Send the rollback mediator information to the + * new mediator node list. If a node had this RPC + * called, but its node is not in the mediator node + * list, rpc.metamedd will delete the mediator + * record on that node. + */ + if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, + &rb_medr, &xep)) + mdclrerror(&xep); + } + } + + /* level 2 */ + if (rb_level > 1) { + metafreedrivedesc(&dd); + } + + /* level 1 */ + if (rb_level > 0) { + /* Delete mediator information from all hosts in the set */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + /* All nodes are guaranteed to be ALIVE */ + if (clnt_updmeds(nd->nd_nodename, sp, &rb_t, + &xep)) + mdclrerror(&xep); + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_updmeds(sd->sd_nodes[i], sp, &rb_t, + &xep)) + mdclrerror(&xep); + } + } + } + + /* level 0 */ + if (suspend1_flag) { + /* + * Unlock diskset by resuming messages across the diskset. + * Just resume all classes so that resume is the same whether + * just one class was locked or all classes were locked. 
+ */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, + sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { + mdclrerror(&xep); + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to resume rpc.mdcommd.\n")); + } + nd = nd->nd_next; + } + meta_ping_mnset(sp->setno); + } + if (lock_flag) { + cl_sk = cl_get_setkey(sp->setno, sp->setname); + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + /* All nodes are guaranteed to be ALIVE */ + if (clnt_unlock_set(nd->nd_nodename, + cl_sk, &xep)) { + mdclrerror(&xep); + } + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_unlock_set(sd->sd_nodes[i], + cl_sk, &xep)) { + mdclrerror(&xep); + } + } + } + cl_set_setkey(NULL); + } + + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + if (!(MD_MNSET_DESC(sd))) { + md_rb_sig_handling_off(md_got_sig(), md_which_sig()); + } + + return (rval); +} + +int +meta_set_deletemeds( + mdsetname_t *sp, + int node_c, + char **node_v, + int forceflg, + md_error_t *ep +) +{ + md_set_desc *sd = NULL; + md_drive_desc *dd = NULL; + mddb_med_parm_t mp; + md_h_arr_t rb_t; + med_rec_t medr; + med_rec_t rb_medr; + int i, j; + char **n_l = NULL; + int n_c = 0; + sigset_t oldsigs; + md_setkey_t *cl_sk; + int rb_level = 0; + md_error_t xep = mdnullerror; + int rval = 0; + int max_meds; + md_mnnode_desc *nd; + int suspend1_flag = 0; + int lock_flag = 0; + + mdclrerror(ep); + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + /* Make sure we own the set */ + if (meta_check_ownership(sp, ep) != 0) + return (-1); + + for (i = 0; i < node_c; i++) + if (strchr(node_v[i], ',') != NULL) + return (mderror(ep, MDE_ONLYNODENAME, node_v[i])); + + if (nodesuniq(sp, node_c, node_v, ep)) + return (-1); + + if ((max_meds = get_max_meds(ep)) 
== 0) + return (-1); + + /* + * The mediator information (which is part of the set record) is + * stored in the local mddbs of each node in the diskset. + * Each node's rpc.metad daemon reads in the set + * records from that node's local mddb and caches them + * internally. Any process needing diskset information contacts its + * local rpc.metad to get this information. Since each node in the + * diskset is independently reading the set information from its local + * mddb, the set records in the local mddbs must stay + * in-sync, so that all nodes have a consistent view of the diskset. + * + * For a multinode diskset, explicitly verify that all nodes in the + * diskset are ALIVE (i.e. are in the API membership list). Otherwise, + * fail this operation since all nodes must be ALIVE in order to delete + * the mediator information from the set record in their local mddb. + * If a panic of this node leaves the local mddbs set records + * out-of-sync, the reconfig cycle will fix the local mddbs and + * force them back into synchronization. + */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST, + sp->setno, + nd->nd_nodename, NULL, sp->setname); + return (-1); + } + nd = nd->nd_next; + } + } + + if (sd->sd_med.n_cnt == 0) + return (mderror(ep, MDE_NOMED, NULL)); + + /* Make a list of nodes to check */ + for (i = 0; i < max_meds; i++) + if (sd->sd_med.n_lst[i].a_cnt > 0) + n_c = add_lst(&n_l, sd->sd_med.n_lst[i].a_nm[0]); + + for (i = 0; i < node_c; i++) { + if (! 
strinlst(node_v[i], n_c, n_l)) { + (void) del_lst(&n_l); + return (mddserror(ep, MDE_DS_ISNOTMED, sp->setno, + node_v[i], NULL, sp->setname)); + } + } + + (void) del_lst(&n_l); + + /* Save a copy of the current mediator information */ + rb_t = sd->sd_med; /* structure assignment */ + + /* Setup the mediator record for rollback */ + (void) memset(&rb_medr, '\0', sizeof (med_rec_t)); + rb_medr.med_rec_mag = MED_REC_MAGIC; + rb_medr.med_rec_rev = MED_REC_REV; + rb_medr.med_rec_fl = 0; + rb_medr.med_rec_sn = sp->setno; + (void) strcpy(rb_medr.med_rec_snm, sp->setname); + if (MD_MNSET_DESC(sd)) { + /* + * In MN diskset, use a generic nodename, multiowner, in the + * mediator record which allows any node to access mediator + * information. MN diskset reconfig cycle forces consistent + * view of set/node/drive/mediator information across all nodes + * in the MN diskset. This allows the relaxation of + * node name checking in rpc.metamedd for MN disksets. + * + * In the traditional diskset, only a node that is in the + * mediator record's diskset nodelist can access mediator + * data. 
+ */ + (void) strlcpy(rb_medr.med_rec_nodes[0], MED_MN_CALLER, + MD_MAX_NODENAME_PLUS_1); + } else { + for (i = 0; i < MD_MAXSIDES; i++) + (void) strcpy(rb_medr.med_rec_nodes[i], + sd->sd_nodes[i]); + } + rb_medr.med_rec_meds = sd->sd_med; /* structure assignment */ + (void) memset(&rb_medr.med_rec_data, '\0', sizeof (med_data_t)); + rb_medr.med_rec_foff = 0; + crcgen(&rb_medr, &rb_medr.med_rec_cks, sizeof (med_rec_t), NULL); + + /* Delete the mediators requested from the set */ + for (i = 0; i < node_c; i++) { + for (j = 0; j < max_meds; j++) { + if (sd->sd_med.n_lst[j].a_cnt == 0) + continue; + if (strcmp(node_v[i], + sd->sd_med.n_lst[j].a_nm[0]) != 0) + continue; + SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REMOVE, + SVM_TAG_MEDIATOR, sp->setno, j); + (void) memset(&sd->sd_med.n_lst[j], '\0', + sizeof (md_h_t)); + sd->sd_med.n_cnt--; + break; + } + } + + medr = rb_medr; /* structure assignment */ + medr.med_rec_meds = sd->sd_med; /* structure assignment */ + crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL); + + /* END CHECK CODE */ + + /* Lock the set on current set members */ + if (MD_MNSET_DESC(sd)) { + /* Make sure we are blocking all signals */ + if (procsigs(TRUE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + /* + * Lock the set on current set members. + * lock_set and SUSPEND are used to protect against + * other metaset commands running on the other nodes. + */ + nd = sd->sd_nodelist; + while (nd) { + /* All nodes are guaranteed to be ALIVE */ + if (clnt_lock_set(nd->nd_nodename, sp, ep)) { + if (forceflg && strcmp(mynode(), + nd->nd_nodename) != 0) { + mdclrerror(ep); + nd = nd->nd_next; + continue; + } + rval = -1; + goto out; + } + lock_flag = 1; + nd = nd->nd_next; + } + /* + * Lock out other meta* commands by suspending + * class 1 messages across the diskset. 
+ */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_mdcommdctl(nd->nd_nodename, + COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1, + MD_MSCF_NO_FLAGS, ep)) { + rval = -1; + goto out; + } + suspend1_flag = 1; + nd = nd->nd_next; + } + } else { + md_rb_sig_handling_on(); + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { + if (forceflg && + strcmp(mynode(), sd->sd_nodes[i]) != 0) { + mdclrerror(ep); + continue; + } + rval = -1; + goto out; + } + lock_flag = 1; + } + } + + RB_TEST(1, "meta_set_deletemeds", ep) + + RB_PREEMPT; + rb_level = 1; /* level 1 */ + + RB_TEST(2, "meta_set_deletemeds", ep) + + /* Update the mediator information on all hosts in the set */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + /* All nodes are guaranteed to be ALIVE */ + if (clnt_updmeds(nd->nd_nodename, sp, &sd->sd_med, + ep)) { + if (forceflg && strcmp(mynode(), + nd->nd_nodename) != 0) { + mdclrerror(ep); + continue; + } + goto rollback; + } + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_updmeds(sd->sd_nodes[i], sp, &sd->sd_med, + ep)) { + if (forceflg && strcmp(mynode(), + sd->sd_nodes[i]) != 0) { + mdclrerror(ep); + continue; + } + goto rollback; + } + } + } + + RB_TEST(3, "meta_set_deletemeds", ep) + + RB_PREEMPT; + rb_level = 2; /* level 2 */ + + RB_TEST(5, "meta_set_deletemeds", ep) + + if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), + ep)) == NULL) { + if (! 
mdisok(ep)) + goto rollback; + } + + RB_TEST(5, "meta_set_deletemeds", ep) + + RB_PREEMPT; + rb_level = 3; /* level 3 */ + + RB_TEST(6, "meta_set_deletemeds", ep) + + if (dd != NULL) { + /* + * Set up the parameters to the call to update the + * kernel mediator list + */ + (void) memset(&mp, '\0', sizeof (mddb_med_parm_t)); + mp.med_setno = sp->setno; + if (meta_h2hi(&sd->sd_med, &mp.med, ep)) + goto rollback; + + /* Resolve the IP addresses for the host list */ + if (meta_med_hnm2ip(&mp.med, ep)) + goto rollback; + + if (metaioctl(MD_MED_SET_LST, &mp, &mp.med_mde, NULL) != 0) { + (void) mdstealerror(ep, &mp.med_mde); + goto rollback; + } + } + + RB_TEST(7, "meta_set_deletemeds", ep) + + RB_PREEMPT; + rb_level = 4; /* level 4 */ + + RB_TEST(8, "meta_set_deletemeds", ep) + + /* Inform the mediator hosts of the new status */ + for (i = 0; i < max_meds; i++) { + if (rb_t.n_lst[i].a_cnt == 0) + continue; + + /* + * medr contains the new mediator node list. + * Send the new mediator information to the + * new mediator node list. If a node had this RPC + * called, but its node is no longer in the new mediator + * node list, rpc.metamedd will delete the mediator + * record on that node. + */ + if (clnt_med_upd_rec(&rb_t.n_lst[i], sp, &medr, ep)) { + if ((forceflg && mdanyrpcerror(ep)) || + mdisrpcerror(ep, RPC_PROGNOTREGISTERED)) { + mdclrerror(ep); + continue; + } + goto rollback; + } + } + +out: + if (dd) + metafreedrivedesc(&dd); + + if (suspend1_flag) { + /* + * Unlock diskset by resuming messages across the diskset. + * Just resume all classes so that resume is the same whether + * just one class was locked or all classes were locked. 
+ */ + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME, + sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + mde_perror(ep, dgettext(TEXT_DOMAIN, + "Unable to resume rpc.mdcommd.\n")); + } + nd = nd->nd_next; + } + meta_ping_mnset(sp->setno); + } + + cl_sk = cl_get_setkey(sp->setno, sp->setname); + if (lock_flag) { + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + /* All nodes are guaranteed to be ALIVE */ + if (clnt_unlock_set(nd->nd_nodename, + cl_sk, &xep)) { + if (forceflg && + strcmp(mynode(), + nd->nd_nodename) != 0) { + mdclrerror(ep); + continue; + } + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_unlock_set(sd->sd_nodes[i], + cl_sk, &xep)) { + if (forceflg && + strcmp(mynode(), + sd->sd_nodes[i]) != 0) { + mdclrerror(ep); + continue; + } + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + } + } + } + cl_set_setkey(NULL); + + if (MD_MNSET_DESC(sd)) { + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + } else { + md_rb_sig_handling_off(md_got_sig(), md_which_sig()); + } + + return (rval); + +rollback: + /* all signals already blocked for MN disket */ + if (!(MD_MNSET_DESC(sd))) { + if (procsigs(TRUE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + } + + rval = -1; + + (void) del_lst(&n_l); + + /* level 4 */ + if (rb_level > 4) { + for (i = 0; i < max_meds; i++) { + if (rb_t.n_lst[i].a_cnt == 0) + continue; + + /* + * rb_medr contains the rollback mediator node list. + * Send the rollback mediator information to the + * new mediator node list. 
This will recreate the + * mediator record on all nodes where the mediator + * record had been removed. + */ + if (clnt_med_upd_rec(&rb_t.n_lst[i], sp, &rb_medr, + &xep)) + mdclrerror(&xep); + } + } + + /* level 3 */ + if (rb_level > 2 && dd != NULL) { + (void) memset(&mp, '\0', sizeof (mddb_med_parm_t)); + mp.med_setno = sp->setno; + (void) meta_h2hi(&rb_t, &mp.med, &xep); + mdclrerror(&xep); + (void) meta_med_hnm2ip(&mp.med, &xep); + mdclrerror(&xep); + (void) metaioctl(MD_MED_SET_LST, &mp, &mp.med_mde, NULL); + } + + /* level 2 */ + if (rb_level > 1) { + metafreedrivedesc(&dd); + } + + /* level 1 */ + if (rb_level > 0) { + /* Delete mediator information from all hosts in the set */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + /* All nodes are guaranteed to be ALIVE */ + if (clnt_updmeds(nd->nd_nodename, sp, &rb_t, + &xep)) + mdclrerror(&xep); + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_updmeds(sd->sd_nodes[i], sp, &rb_t, + &xep)) + mdclrerror(&xep); + } + } + } + + /* level 0 */ + cl_sk = cl_get_setkey(sp->setno, sp->setname); + /* Unlock the set */ + /* Don't test lock flag since guaranteed to be set if in rollback */ + if (MD_MNSET_DESC(sd)) { + /* + * Unlock diskset by resuming messages across the diskset. + * Just resume all classes so that resume is the same whether + * just one class was locked or all classes were locked. 
+ */ + if (suspend1_flag) { + /* All nodes are guaranteed to be ALIVE */ + nd = sd->sd_nodelist; + while (nd) { + if (clnt_mdcommdctl(nd->nd_nodename, + COMMDCTL_RESUME, sp, MD_MSG_CLASS0, + MD_MSCF_NO_FLAGS, &xep)) { + mde_perror(&xep, dgettext(TEXT_DOMAIN, + "Unable to resume rpc.mdcommd.\n")); + mdclrerror(&xep); + } + nd = nd->nd_next; + } + meta_ping_mnset(sp->setno); + } + nd = sd->sd_nodelist; + /* All nodes are guaranteed to be ALIVE */ + while (nd) { + if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) + mdclrerror(&xep); + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) + mdclrerror(&xep); + } + } + cl_set_setkey(NULL); + + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + if (!(MD_MNSET_DESC(sd))) { + md_rb_sig_handling_off(md_got_sig(), md_which_sig()); + } + + return (rval); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_set_prv.c b/usr/src/lib/lvm/libmeta/common/meta_set_prv.c new file mode 100644 index 0000000000..8b615d9af0 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_set_prv.c @@ -0,0 +1,818 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Metadevice diskset interfaces + */ + +#include "meta_set_prv.h" +#include <meta.h> +#include <sys/lvm/md_mddb.h> +#include <sys/cladm.h> +#include <devid.h> +#include <sys/lvm/md_convert.h> + +/* + * Exported Entry Points + */ + +int +checkdrive_onnode( + mdsetname_t *sp, + mddrivename_t *dnp, + char *node, + md_error_t *ep) +{ + time_t mystamp, otherstamp; + md_dev64_t otherdev; + mdname_t *np, *remote_np; + mddrivename_t *remote_dnp; + int release = 0; + md_drive_desc dd; + int rval = 0; + int ret = -1; + mhd_mhiargs_t mhiargs; + md_set_desc *sd; + int is_efi = 0; + int do_fallback = 0; + + (void) memset(&mhiargs, '\0', sizeof (mhiargs)); + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + if (meta_is_drive_in_thisset(sp, dnp, FALSE, ep)) { + release = 1; + dd.dd_next = NULL; + dd.dd_dbcnt = 0; + dd.dd_dbsize = 0; + dd.dd_dnp = dnp; + if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) != 0) + return (-1); + if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) { + if (rel_own_bydd(sp, &dd, TRUE, ep)) + return (-1); + } + } + if ((np = metaslicename(dnp, MD_SLICE0, ep)) == NULL) { + rval = -1; + goto out; + } + + /* + * First try and operate assuming the other side + * is running a SVM version that supports device id + * in disksets i.e. is running SVM RPC version 2. + * + * If this call fails due to the other side running + * a SVM version that does not support device id + * in disksets i.e. is running SVM RPC version 1, we + * fallback to the old behaviour. 
+ */ + if ((dnp->devid != NULL) && (!(MD_MNSET_DESC(sd)))) { + char *rname = NULL; + md_dev64_t dev = NODEV64; + + /* + * If the disk is connected to the remote node then the + * only thing we can be certain of is that the disk will + * have the same devid on that node, it may not have the + * same minor number nor the same ctd name. But if it + * does have the same ctd name then use it. In most cases + * there will only be a single entry returned but if the + * system has multi-path disks with MPXIO turned off there + * will be multiple entries. Attempting to choose the same + * name will give the user as consistent a view across the + * nodes as possible. + */ + ret = clnt_devinfo_by_devid(node, sp, dnp->devid, &dev, + np->rname, &rname, NULL, ep); + + /* + * If the return value was ENOTSUP, we know the + * other side is not running a SVM version that + * supports device id in disksets. We fallback + * to the previous behaviour in that case. + */ + if (ret == ENOTSUP) { + do_fallback++; + goto fallback; + } else if (ret == -1) { + rval = -1; + goto out; + } + + /* + * If the device does not exist on the remote node then + * the returned dev should indicate this (NODEV64) but + * we also check to make sure the returned name is not + * empty to make sure that the namespace does not get + * created with a NULL/empty entry (should not be possbile + * but being paranoid). + */ + if (dev == NODEV64 || rname == (char *)NULL || + strcmp(rname, "") == 0) { + rval = mddserror(ep, MDE_DS_DRIVENOTCOMMON, sp->setno, + node, dnp->cname, sp->setname); + goto out; + } + + /* + * The rname returned from the remote node maybe different + * to the rname on this node, therefore we need to build up + * a dnp for this new rname. 
+ */ + if (strcmp(np->rname, rname) != 0) { + /* different rname */ + remote_np = metaname_fast(&sp, rname, ep); + if (remote_np != NULL) { + remote_dnp = remote_np->drivenamep; + } + } else { + remote_dnp = dnp; + } + } else { + do_fallback++; + } + +fallback: + if (do_fallback) { + ret = setdevstamp(dnp, &mystamp, ep); + /* + * Check if the disk in question is an EFI disk. + */ + if (ret == ENOTSUP) + is_efi++; + else if (ret == -1) + return (-1); + + if ((np = metaslicename(dnp, MD_SLICE0, ep)) == NULL) { + rval = -1; + goto out; + } + + if (is_efi) { + /* + * For EFI disks, we compare the device + * id for the disks in question. + */ + ddi_devid_t thisdevid, otherdevid; + char *encoded_otherdevid = NULL; + char *encoded_thisdevid = NULL; + + if (clnt_devinfo(node, sp, dnp, &otherdev, NULL, ep) + == -1) { + rval = -1; + goto out; + } + if (np->dev != otherdev) { + rval = mddserror(ep, MDE_DS_DRIVENOTCOMMON, + sp->setno, node, dnp->cname, sp->setname); + goto out; + } + + if (clnt_devid(node, sp, dnp, &encoded_otherdevid, + ep) == -1) { + rval = -1; + goto out; + } + if (encoded_otherdevid == NULL) { + rval = -1; + goto out; + } + if (devid_str_decode(encoded_otherdevid, &otherdevid, + NULL) == 0) { + /* + * If we are here, it means that dnp->devid + * is NULL. This will typically happen if + * we are dealing with SunCluster DID devices. + * + * We want to explicitly get the device id + * for such a disk + */ + encoded_thisdevid = meta_get_devid(dnp->rname); + ret = devid_str_decode(encoded_thisdevid, + &thisdevid, NULL); + if (ret == 0) { + ret = devid_compare(thisdevid, + otherdevid); + devid_free(thisdevid); + } + devid_free(otherdevid); + if (encoded_thisdevid) + Free(encoded_thisdevid); + } + + Free(encoded_otherdevid); + if (ret != 0) { + rval = mddserror(ep, MDE_DS_DRIVENOTCOMMON, + sp->setno, node, dnp->cname, sp->setname); + goto out; + } + } else { + /* + * For VTOC disks, we compare the dev_t and + * timestamp for the disks in question. 
+ */ + if (clnt_devinfo(node, sp, dnp, &otherdev, + &otherstamp, ep) == -1) { + rval = -1; + goto out; + } + if ((mystamp != otherstamp) || (np->dev != otherdev)) { + rval = mddserror(ep, MDE_DS_DRIVENOTCOMMON, + sp->setno, node, dnp->cname, sp->setname); + goto out; + } + } + remote_dnp = dnp; + } + + if (clnt_drvused(node, sp, remote_dnp, ep) == -1) + rval = -1; + +out: + if (release) + if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) { + if (tk_own_bydd(sp, &dd, &mhiargs, TRUE, ep)) + rval = -1; + } + + return (rval); +} + +side_t +getnodeside(char *node, md_set_desc *sd) +{ + side_t sideno; + int nid; + md_mnnode_desc *nd; + + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + if (strcmp(nd->nd_nodename, node) == 0) { + return (nd->nd_nodeid); + } + nd = nd->nd_next; + } + return (MD_SIDEWILD); + } + + + /* If regular diskset */ + for (sideno = 0; sideno < MD_MAXSIDES; sideno++) { + if (sd->sd_nodes[sideno] == NULL || + sd->sd_nodes[sideno][0] == '\0') + continue; + + if (strcmp(sd->sd_nodes[sideno], node) == 0) { + return (sideno); + } + } + + /* + * If the first loop fails we may be in a situation where this host + * is configured as part of a cluster yet not running in the cluster + * mode. If so, the names stored in sd->sd_nodes[] are going to be + * nodeid's instead of hostnames. See if we can find a match that way. 
+	 */
+	if (_cladm(CL_CONFIG, CL_NODEID, &nid) == 0) {
+		for (sideno = 0; sideno < MD_MAXSIDES; sideno++) {
+			if (sd->sd_nodes[sideno] == NULL ||
+			    sd->sd_nodes[sideno][0] == '\0')
+				continue;
+			if (atoi(sd->sd_nodes[sideno]) == nid)
+				return (sideno);
+		}
+	}
+
+	return (MD_SIDEWILD);
+}
+
+/*
+ * halt_set()
+ *
+ * If this host owns the set (s_ownset() == MD_SETOWNER_YES), kill any
+ * mirror resyncs running in it and issue MD_RELEASE_SET.  When the host
+ * is not the owner nothing is done.
+ *
+ * Returns 0 on success, -1 if our side number cannot be determined, or
+ * the result of mdstealerror() if the ioctl fails.
+ */
+int
+halt_set(mdsetname_t *sp, md_error_t *ep)
+{
+	mddb_config_t c;
+
+	(void) memset(&c, 0, sizeof (c));
+	c.c_setno = sp->setno;
+	if ((c.c_sideno = getmyside(sp, ep)) == MD_SIDEWILD)
+		return (-1);
+
+	if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
+		/* Don't need device id information from this ioctl */
+		c.c_locator.l_devid = (uint64_t)0;
+		c.c_locator.l_devid_flags = 0;
+		/* Kill any resyncs that are running on mirrors in this set */
+		meta_mirror_resync_kill(sp);
+		if (metaioctl(MD_RELEASE_SET, &c, &c.c_mde, NULL) != 0)
+			return (mdstealerror(ep, &c.c_mde));
+	}
+
+	return (0);
+}
+
+/*
+ * metadrivedesc_append()
+ *
+ * Allocate a new drive descriptor with Zalloc(), fill it in from the
+ * arguments and link it at the tail of the list whose head pointer is
+ * *dd (which may be NULL for an empty list).
+ *
+ * Returns the newly appended element.
+ */
+md_drive_desc *
+metadrivedesc_append(
+	md_drive_desc **dd,
+	mddrivename_t *dnp,
+	int dbcnt,
+	int dbsize,
+	md_timeval32_t timestamp,
+	ulong_t genid,
+	uint_t flags
+)
+{
+	md_drive_desc *p;
+
+	/* run to end of list */
+	for (/* void */; (*dd != NULL); dd = &(*dd)->dd_next)
+		/* void */;
+
+	/* allocate new list element */
+	p = *dd = Zalloc(sizeof (*p));
+
+	p->dd_dnp = dnp;
+	p->dd_dbcnt = dbcnt;
+	p->dd_dbsize = dbsize;
+	p->dd_ctime = timestamp;
+	p->dd_genid = genid;
+	p->dd_flags = flags;
+	return (p);
+}
+
+/*
+ * nodehasset()
+ *
+ * Fetch the set record for sp->setname from "node" and compare it with
+ * the local set descriptor.  The comparison proceeds from the weakest
+ * criterion to the strongest (name, setno, creation time, genid) and
+ * stops with a match as soon as the criterion requested in match_flag
+ * (one of the NHS_* values tested below) has been satisfied.
+ *
+ * Returns 1 on a match, 0 if the node has no such record or the record
+ * does not match, and -1 on error.
+ */
+int
+nodehasset(
+	mdsetname_t *sp,
+	char *node,
+	uint_t match_flag,
+	md_error_t *ep
+)
+{
+	md_set_desc *sd;
+	md_set_record *sr;
+	int rval = 0;
+
+	if ((sd = metaget_setdesc(sp, ep)) == NULL)
+		return (-1);
+
+	/* Don't care if set record is MN or not */
+	if (clnt_getset(node, sp->setname, MD_SET_BAD, &sr, ep))
+		return (-1);
+
+	/* A successful call may still hand back no record at all */
+	if (sr == NULL) {
+		if (! mdisok(ep))
+			return (-1);
+		return (0);
+	}
+
+	/* Looking for name only match */
+	if ((match_flag & NHS_N_EQ) == NHS_N_EQ) {
+		rval = 1;
+		goto out;
+	}
+
+	if (sd->sd_setno != sr->sr_setno)
+		goto out;
+
+	/* Looking for name and setno match */
+	if ((match_flag & NHS_NS_EQ) == NHS_NS_EQ) {
+		rval = 1;
+		goto out;
+	}
+
+	if (sd->sd_ctime.tv_sec != sr->sr_ctime.tv_sec ||
+	    sd->sd_ctime.tv_usec != sr->sr_ctime.tv_usec)
+		goto out;
+
+	/* Looking for name, setno, and timestamp match */
+	if ((match_flag & NHS_NST_EQ) == NHS_NST_EQ) {
+		rval = 1;
+		goto out;
+	}
+
+	if (sd->sd_genid != sr->sr_genid) {
+		if (sd->sd_genid < sr->sr_genid) {
+			/*
+			 * Looking for name, setno and timestamp match with
+			 * the genid on the other host greater than the
+			 * genid in the local set descriptor.
+			 */
+			if ((match_flag & NHS_NST_EQ_G_GT) == NHS_NST_EQ_G_GT) {
+				rval = 1;
+				goto out;
+			}
+		}
+		goto out;
+	}
+
+	/* Looking for name, setno, timestamp, and genid match */
+	if ((match_flag & NHS_NSTG_EQ) == NHS_NSTG_EQ)
+		rval = 1;
+
+out:
+	/*
+	 * Set record structure was allocated from RPC routine getset
+	 * so this structure is only of size md_set_record even if
+	 * the MN flag is set. So, clear the flag so that the free
+	 * code doesn't attempt to free a structure the size of
+	 * md_mnset_record.
+	 */
+	sr->sr_flags &= ~MD_SR_MN;
+	free_sr(sr);
+
+	return (rval);
+}
+
+/*
+ * nodesuniq()
+ *
+ * Check that the cnt host names in strings[] are pairwise distinct.
+ *
+ * Returns 0 if they are; otherwise the result of mddserror() for
+ * MDE_DS_DUPHOST, naming the first duplicated entry found.
+ */
+int
+nodesuniq(mdsetname_t *sp, int cnt, char **strings, md_error_t *ep)
+{
+	int i, j;
+	for (i = 0; i < cnt; i++)
+		for (j = i + 1; j < cnt; j++)
+			if (strcmp(strings[i], strings[j]) == 0)
+				return (mddserror(ep, MDE_DS_DUPHOST,
+				    sp->setno, strings[i], NULL, sp->setname));
+	return (0);
+}
+
+/*
+ * own_set()
+ *
+ * Determine who owns the set.  Returns MD_SETOWNER_YES if this host
+ * does, MD_SETOWNER_NO if another host does, MD_SETOWNER_NONE if no
+ * owner was found, and -1 on error.
+ *
+ * When owner_of_set is not NULL it receives a Strdup()'d copy of the
+ * owner's node name, or NULL when there is no owner.  It is not written
+ * on the multi-node diskset path or on error.
+ *
+ * With forceflg set only the local host is asked; other hosts are not
+ * contacted.
+ */
+int
+own_set(mdsetname_t *sp, char **owner_of_set, int forceflg, md_error_t *ep)
+{
+	md_set_desc *sd;
+	int am_i_owner;
+	int i;
+
+	/* the local set is always owned by this host */
+	if (metaislocalset(sp)) {
+		if (owner_of_set != NULL)
+			*owner_of_set = Strdup(mynode());
+		return (MD_SETOWNER_YES);
+	}
+
+	if ((sd = metaget_setdesc(sp, ep)) == NULL)
+		return (-1);
+
+	if (clnt_ownset(mynode(), sp, &am_i_owner, ep) == -1)
+		return (-1);
+
+	if (MD_MNSET_DESC(sd)) {
+		if (am_i_owner == TRUE)
+			return (MD_SETOWNER_YES);
+		else
+			return (MD_SETOWNER_NO);
+	}
+
+	if (forceflg == TRUE) {
+		if (am_i_owner == TRUE) {
+			if (owner_of_set != NULL)
+				*owner_of_set = Strdup(mynode());
+			return (MD_SETOWNER_YES);
+		}
+
+		if (owner_of_set != NULL)
+			*owner_of_set = NULL;
+		return (MD_SETOWNER_NONE);
+	}
+
+	if (am_i_owner == TRUE) {
+		if (owner_of_set != NULL)
+			*owner_of_set = Strdup(mynode());
+		return (MD_SETOWNER_YES);
+	}
+
+
+	/* we are not the owner; ask every other host in the set */
+	for (i = 0; i < MD_MAXSIDES; i++) {
+		/*
+		 * Skip empty slots, and my own slot.
+		 */
+		if (sd->sd_nodes[i][0] == '\0' ||
+		    strcmp(sd->sd_nodes[i], mynode()) == 0)
+			continue;
+
+		if (clnt_ownset(sd->sd_nodes[i], sp, &am_i_owner, ep) == -1)
+			return (-1);
+
+		if (am_i_owner == TRUE) {
+			if (owner_of_set != NULL)
+				*owner_of_set = Strdup(sd->sd_nodes[i]);
+			return (MD_SETOWNER_NO);
+		}
+	}
+
+	/* We get here, we currently have no owner.
*/ + if (owner_of_set != NULL) + *owner_of_set = NULL; + return (MD_SETOWNER_NONE); +} + +void +resync_genid( + mdsetname_t *sp, + md_set_desc *sd, + ulong_t max_genid, + int node_c, + char **node_v +) +{ + int i, j; + ulong_t cur_genid[MD_MAXSIDES]; + md_set_record *sr; + md_error_t xep = mdnullerror; + md_mnnode_desc *nd; + md_mnset_record *mnsr; + + if (node_c > 0 && node_v && *node_v) { + /* + * Mark the set record MD_SR_OK. + */ + for (i = 0; i < node_c; i++) + if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_OK, &xep)) + mdclrerror(&xep); + max_genid++; + } + + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + /* Will only return a multi-node diskset record */ + if (clnt_mngetset(nd->nd_nodename, sp->setname, + MD_SET_BAD, &mnsr, &xep) == -1) { + mdclrerror(&xep); + nd = nd->nd_next; + continue; + } + for (j = mnsr->sr_genid; j < max_genid; j++) { + if (clnt_upd_sr_flags(nd->nd_nodename, sp, + MD_SR_OK, &xep)) + mdclrerror(&xep); + } + free_sr((struct md_set_record *)mnsr); + nd = nd->nd_next; + } + return; + } + + /* + * Get current genid for each node. + */ + for (i = 0; i < MD_MAXSIDES; i++) { + cur_genid[i] = 0; + + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* Should be a non-multinode diskset */ + if (clnt_getset(sd->sd_nodes[i], sp->setname, + MD_SET_BAD, &sr, &xep) == -1) { + mdclrerror(&xep); + continue; + } + + if (MD_MNSET_REC(sr)) { + /* + * Set record structure was allocated from RPC routine + * getset so this structure is only of size + * md_set_record even if the MN flag is set. So, + * clear the flag so that the free code doesn't + * attempt to free a structure the size of + * md_mnset_record. 
+ */ + sr->sr_flags &= ~MD_SR_MN; + free_sr(sr); + continue; + } + + cur_genid[i] = sr->sr_genid; + + free_sr(sr); + } + + /* + * Mark the set record MD_SR_OK + */ + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + for (j = cur_genid[i]; j < max_genid; j++) + if (clnt_upd_sr_flags(sd->sd_nodes[i], sp, MD_SR_OK, + &xep)) + mdclrerror(&xep); + + } +} + +int +setup_db_bydd(mdsetname_t *sp, md_drive_desc *dd, int force, md_error_t *ep) +{ + md_drive_desc *p; + struct mddb_config c; + int i; + md_set_desc *sd; + int use_devid = 1; + ddi_devid_t devidp; + char *minor_name = NULL; + size_t sz; + char *devid_str = NULL; + + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + (void) memset(&c, 0, sizeof (c)); + + c.c_setno = sp->setno; + (void) strcpy(c.c_setname, sp->setname); + if ((c.c_sideno = getmyside(sp, ep)) == MD_SIDEWILD) + return (-1); + + c.c_timestamp = sd->sd_ctime; + + if (setup_med_cfg(sp, &c, force, ep)) + return (-1); + + for (p = dd; p != NULL; p = p->dd_next) { + mddrivename_t *dnp; + mdname_t *np; + mdcinfo_t *cinfo; + mdsidenames_t *sn = NULL; + + if (p->dd_dbcnt == 0) + continue; + + dnp = p->dd_dnp; + + assert(dnp != NULL); + + for (sn = dnp->side_names; sn != NULL; sn = sn->next) { + if (sn->sideno == c.c_sideno) + break; + } + + /* + * The disk has no side name information + */ + if (sn == NULL) { + uint_t rep_slice; + + if ((meta_replicaslice(dnp, &rep_slice, ep) != 0) || + ((np = metaslicename(dnp, rep_slice, ep)) + == NULL)) { + mdclrerror(ep); + continue; + } + + if (np->dev == NODEV64) + continue; + + c.c_locator.l_dev = meta_cmpldev(np->dev); + c.c_locator.l_mnum = meta_getminor(np->dev); + + if (!MD_MNSET_DESC(sd)) { + /* + * minor_name will be NULL if dnp->devid == NULL + * - see metagetvtoc() + */ + if (np->minor_name != NULL) { + minor_name = Strdup(np->minor_name); + } + } + + if ((cinfo = metagetcinfo(np, ep)) == NULL) { + mdclrerror(ep); + continue; + } + + (void) 
strncpy(c.c_locator.l_driver, cinfo->dname, + sizeof (c.c_locator.l_driver)); + } else { + c.c_locator.l_dev = NODEV32; + c.c_locator.l_mnum = sn->mnum; + (void) strncpy(c.c_locator.l_driver, sn->dname, + sizeof (c.c_locator.l_driver)); + + if (!MD_MNSET_DESC(sd)) { + if (dnp->devid != NULL) { + minor_name = meta_getdidminorbykey( + MD_LOCAL_SET, sn->sideno + SKEW, + dnp->side_names_key, ep); + } + } + } + + if ((dnp->devid == NULL) || MD_MNSET_DESC(sd)) { + use_devid = 0; + } + + if (use_devid) { + /* + * The devid associated with the dnp does not have + * a minor name and so we must add it in. + */ + size_t len = strlen(dnp->devid) + + strlen(minor_name) + 2; + devid_str = (char *)Malloc(len); + (void) snprintf(devid_str, len, "%s/%s", dnp->devid, + minor_name); + (void) devid_str_decode(devid_str, &devidp, NULL); + + sz = devid_sizeof(devidp); + c.c_locator.l_devid = (uintptr_t)malloc(sz); + c.c_locator.l_devid_sz = sz; + (void) memcpy((void *)c.c_locator.l_devid, devidp, sz); + if (minor_name == NULL) { + /* ERROR fix up */ + Free(devid_str); + return (-1); + } + (void) strcpy(c.c_locator.l_minor_name, minor_name); + c.c_locator.l_devid_flags = MDDB_DEVID_VALID | + MDDB_DEVID_SPACE | MDDB_DEVID_SZ; + } else { + /* + * Don't need device id information from + * this ioctl + */ + c.c_locator.l_devid = (uint64_t)0; + c.c_locator.l_devid_flags = 0; + } + + + for (i = 0; i < p->dd_dbcnt; i++) { + c.c_locator.l_flags = 0; + c.c_locator.l_blkno = 16 + i * p->dd_dbsize; + + if (metaioctl(MD_DB_USEDEV, &c, &c.c_mde, NULL) != 0) { + if (use_devid) { + Free(devid_str); + } + Free(minor_name); + return (mdstealerror(ep, &c.c_mde)); + } + } + if (use_devid) { + Free(devid_str); + } + Free(minor_name); + } + + /* return success */ + return (0); +} + +int +snarf_set(mdsetname_t *sp, bool_t stale_bool, md_error_t *ep) +{ + mddb_config_t c; + + (void) memset(&c, '\0', sizeof (c)); + + c.c_setno = sp->setno; + if ((c.c_sideno = getmyside(sp, ep)) == MD_SIDEWILD) + return (-1); + 
+ /* Don't need device id information from this ioctl */ + c.c_locator.l_devid = (uint64_t)0; + c.c_locator.l_devid_flags = 0; + if (stale_bool == TRUE) { + c.c_flags = MDDB_C_STALE; + } + if (metaioctl(MD_GRAB_SET, &c, &c.c_mde, NULL) != 0) + return (mdstealerror(ep, &c.c_mde)); + + if (c.c_flags & MDDB_C_STALE) + return (mdmddberror(ep, MDE_DB_STALE, NODEV64, sp->setno, + 0, NULL)); + + return (0); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_set_tkr.c b/usr/src/lib/lvm/libmeta/common/meta_set_tkr.c new file mode 100644 index 0000000000..b13c483af0 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_set_tkr.c @@ -0,0 +1,1079 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Metadevice diskset interfaces + */ + +#include "meta_set_prv.h" +#include <sys/lvm/md_crc.h> + + +static int +upd_dr_dbinfo( + mdsetname_t *sp, + md_set_desc *sd, + md_drive_desc *dd, + md_replicalist_t *rlp, + int forceflg, + md_error_t *ep +) +{ + md_drive_desc *p; + md_replica_t *r; + md_replicalist_t *rl; + int i; + int dbcnt; + int rval = 0; + daddr_t nblks = 0; + md_setkey_t *cl_sk; + md_error_t xep = mdnullerror; + md_mnnode_desc *nd; + ddi_devid_t devid; + + /* find the smallest existing replica */ + for (rl = rlp; rl != NULL; rl = rl->rl_next) { + r = rl->rl_repp; + nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks)); + } + + if (nblks <= 0) + nblks = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE; + + for (p = dd; p != NULL; p = p->dd_next) { + dbcnt = 0; + for (rl = rlp; rl != NULL; rl = rl->rl_next) { + r = rl->rl_repp; + + /* + * Before we bump up the dbcnt, if we're + * running with device ids in disksets, let's + * compare the device ids otherwise we compare + * the ctd names. + * + * There is a possibility the device ids might + * have changed. To account for that case, we + * fallback to comparing the ctd names if the + * device id comparison fails. If we aren't running + * in device id mode and a disk has moved, the ctd's + * won't match. + */ + if ((p->dd_dnp->devid != NULL) && + (r->r_devid != NULL) && (!MD_MNSET_DESC(sd))) { + (void) devid_str_decode(p->dd_dnp->devid, + &devid, NULL); + if ((devid_compare(devid, r->r_devid) == 0) || + (strcmp(r->r_namep->drivenamep->cname, + p->dd_dnp->cname) == 0)) + dbcnt++; + devid_free(devid); + } else { + if (strcmp(r->r_namep->drivenamep->cname, + p->dd_dnp->cname) == 0) + dbcnt++; + } + } + p->dd_dbcnt = dbcnt; + p->dd_dbsize = dbcnt > 0 ? 
nblks : 0; + } + + /* Lock the set on current set members */ + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + /* If this is forced, don't lock other sides */ + if (forceflg && strcmp(mynode(), nd->nd_nodename) + != 0) { + nd = nd->nd_next; + continue; + } + + /* We already locked this side in the caller */ + if (strcmp(mynode(), nd->nd_nodename) == 0) { + nd = nd->nd_next; + continue; + } + + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + + if (clnt_lock_set(nd->nd_nodename, sp, ep)) { + rval = -1; + goto out; + } + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* If this is forced, don't lock other sides */ + if (forceflg && strcmp(mynode(), sd->sd_nodes[i]) != 0) + continue; + + /* We already locked this side in the caller */ + if (strcmp(mynode(), sd->sd_nodes[i]) == 0) + continue; + + if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { + rval = -1; + goto out; + } + } + } + + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + /* If this is forced, then only care about this node */ + if (forceflg && strcmp(mynode(), nd->nd_nodename) + != 0) { + nd = nd->nd_next; + continue; + } + + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + + if (clnt_upd_dr_dbinfo(nd->nd_nodename, sp, dd, + ep) == -1) { + if (! mdiserror(ep, MDE_NO_SET) && + ! mdismddberror(ep, MDE_DB_NODB)) { + rval = -1; + break; + } + mdclrerror(ep); + } + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* If this is forced, then only care about this node */ + if (forceflg && strcmp(mynode(), sd->sd_nodes[i]) != 0) + continue; + + if (clnt_upd_dr_dbinfo(sd->sd_nodes[i], sp, dd, + ep) == -1) { + if (! mdiserror(ep, MDE_NO_SET) && + ! 
mdismddberror(ep, MDE_DB_NODB)) { + rval = -1; + break; + } + mdclrerror(ep); + } + } + } + +out: + cl_sk = cl_get_setkey(sp->setno, sp->setname); + if (MD_MNSET_DESC(sd)) { + nd = sd->sd_nodelist; + while (nd) { + /* If this is forced, don't unlock other sides */ + if (forceflg && strcmp(mynode(), nd->nd_nodename) + != 0) { + nd = nd->nd_next; + continue; + } + + /* We will unlocked this side in the caller */ + if (strcmp(mynode(), nd->nd_nodename) == 0) { + nd = nd->nd_next; + continue; + } + + if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { + nd = nd->nd_next; + continue; + } + + if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + nd = nd->nd_next; + } + } else { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* If this is forced, don't unlock other sides */ + if (forceflg && strcmp(mynode(), sd->sd_nodes[i]) != 0) + continue; + + /* We will unlocked this side in the caller */ + if (strcmp(mynode(), sd->sd_nodes[i]) == 0) + continue; + + if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + } + } + /* Do not clear the key, via cl_set_setkey(NULL) this is nested */ + + return (rval); +} + +static int +usetag_take(set_t setno, int usetag, md_error_t *ep) +{ + mddb_dtag_use_parm_t dtup; + + (void) memset(&dtup, '\0', sizeof (mddb_dtag_use_parm_t)); + dtup.dtup_id = usetag; + dtup.dtup_setno = setno; + + if (metaioctl(MD_MED_USE_TAG, &dtup, &dtup.dtup_mde, NULL) != 0) + return (mdstealerror(ep, &dtup.dtup_mde)); + + return (0); +} + +static int +useit_take(set_t setno, md_error_t *ep) +{ + mddb_accept_parm_t accp; + + (void) memset(&accp, '\0', sizeof (mddb_accept_parm_t)); + accp.accp_setno = setno; + + if (metaioctl(MD_MED_ACCEPT, &accp, &accp.accp_mde, NULL) != 0) + return (mdstealerror(ep, &accp.accp_mde)); + + return (0); +} + +/* + * Update the master block with the 
device id information for the disks + * in the diskset. The device id information will be consumed by the + * diskset import code in case of remotely replicated disksets. + * + * For the drives that have a valid diskset mddb on them, we add the + * device id for the drive to the unused portion of the mddb. + * + * For the drives that don't have a diskset mddb on them, we add a dummy + * master block that contains the device id for the drive. A dummy master + * block is signified by changing the master block magic number, mb_magic, + * to MDDB_MAGIC_DU. + * + * This code is responsible primarily for adding the appropriate device id + * information to diskset disks that didn't have the information. This would + * typically occur when the OS has been upgraded from an OS release prior to + * Solaris 10 + * + * The error path in this routine is defined as - if an error occurs while + * updating the mddb for one disk in the diskset, don't bother updating *any* + * of the mddbs because it's game over anyways as far as disaster recovery for + * that diskset is concerned. + * + * This code will need to be revisited if and when support for importing + * partial disksets is added. + * + * NOTE: This code relies heavily on the meta_repartition() working correctly + * and reformatting a drive, so that there's enough room for a dummy master + * block, every time a drive is added to a diskset. Should + * the meta_repartition() code change in future, this code will have to be + * revisited. + * + * Returns 0 on success and -1 on failure + */ +int +meta_update_mb(mdsetname_t *sp, md_drive_desc *drivedesc, md_error_t *ep) +{ + uint_t sliceno, offset; + void *mb; + mddb_mb_t *mbp; + int fd = -1; + ddi_devid_t devid = NULL; + md_drive_desc *dd; + mddrivename_t *dnp; + mdname_t *rsp; + int dbcnt; + int dbsize; + size_t len; + md_set_desc *sd; + + /* + * Don't do anything for MN diskset for now. + */ + if (! 
metaislocalset(sp)) { + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + + if (MD_MNSET_DESC(sd)) + return (0); + } + + mb = Malloc(DEV_BSIZE); + mbp = (mddb_mb_t *)mb; + + /* + * For every drive in the drive descriptor, iterate through all + * the mddbs present on it and check to see if mb_devid_magic is + * set. If it isn't, then update the master block with the correct + * device id information + */ + for (dd = drivedesc; dd != NULL; dd = dd->dd_next) { + int i = 0; + + dnp = dd->dd_dnp; + dbcnt = dd->dd_dbcnt; + dbsize = dd->dd_dbsize; + + /* + * When the import support for remotely replicated + * disksets gets implemented, we probably want to + * inform the user that the disks won't be self + * identifying if any of these calls fails + */ + if (meta_replicaslice(dnp, &sliceno, ep) != 0) + return (-1); + + if ((rsp = metaslicename(dnp, sliceno, ep)) == NULL) + return (-1); + + if ((fd = open(rsp->rname, O_RDWR)) < 0) + goto cleanup; + + /* if devid_str_decode fails, make sure devid is null */ + if (devid_str_decode(dnp->devid, &devid, NULL) != 0) { + devid = NULL; + } + + do { + int push = 0; + + offset = (i * dbsize + 16); + ++i; + + if (lseek(fd, (off_t)dbtob(offset), SEEK_SET) < 0) + goto cleanup; + + if (read(fd, mbp, DEV_BSIZE) != DEV_BSIZE) + goto cleanup; + + if (crcchk((uchar_t *)mbp, (uint_t *)&mbp->mb_checksum, + (uint_t)DEV_BSIZE, (crc_skip_t *)NULL)) + goto cleanup; + + /* + * If the disk is one of the ones that doesn't + * have a shared mddb on it, we put a dummy + * master block on it. + */ + if (mbp->mb_devid_magic != MDDB_MAGIC_DE) { + if (dbcnt == 0) { + meta_mkdummymaster(sp, fd, 16); + break; + } + } + + /* + * if mb_setcreatetime is 0, this field was never + * filled in so do it now. 
+ */ + if ((mbp->mb_setcreatetime.tv_sec == 0) && + (mbp->mb_setcreatetime.tv_usec == 0)) { + mbp->mb_setcreatetime = + meta_get_lb_inittime(sp, ep); + push = 1; + } + + /* + * If MDDB_MAGIC_DE is set in the + * mb_devid_magic field then we know we + * have a valid device id and we don't + * need to add it to the master block. + * + * This would have to be revisited if device + * ids change as a result of device id + * algorithms changing or somesuch. + */ + if (mbp->mb_devid_magic != MDDB_MAGIC_DE) { + if (devid != NULL) { + len = devid_sizeof(devid); + if (len <= (DEV_BSIZE - + sizeof (mddb_mb_t))) { + /* + * there's enough space to + * store the devid + */ + mbp->mb_devid_magic = + MDDB_MAGIC_DE; + mbp->mb_devid_len = len; + (void) memcpy(mbp->mb_devid, + (char *)devid, len); + push = 1; + } + } + } + + /* + * write out (push) any changes we have to the mb + */ + if (push) { + crcgen((uchar_t *)mbp, + (uint_t *)&mbp->mb_checksum, + (uint_t)DEV_BSIZE, (crc_skip_t *)NULL); + + if (lseek(fd, (off_t)dbtob(offset), SEEK_SET) + < 0) + goto cleanup; + + if (write(fd, mbp, DEV_BSIZE) != DEV_BSIZE) + goto cleanup; + } + if (devid) + devid_free(devid); + } while (i < dbcnt); + (void) close(fd); + } + /* success */ + return (0); + +cleanup: + if (fd != -1) + (void) close(fd); + if (devid) + devid_free(devid); + return (-1); +} + +/* + * Exported Entry Points + */ +int +meta_set_take( + mdsetname_t *sp, + mhd_mhiargs_t *mhiargsp, + int flags, + int usetag, + md_error_t *ep +) +{ + md_set_desc *sd; + md_drive_desc *dd; + md_drive_desc *d = NULL; + char *owner = NULL; + int rval = 0; + int i; + int has_set; + int matches = 0; + int numsides = 0; + md_replicalist_t *rlp = NULL; + sigset_t oldsigs; + md_setkey_t *cl_sk; + int rb_level = 0; + md_error_t xep = mdnullerror; + mdsetname_t *local_sp = NULL; + side_t side = MD_KEYWILD; + int ret = 0; + char *newname = NULL; + mdkey_t side_names_key; + + if ((flags & TAKE_USETAG) || (flags & TAKE_USEIT)) { + if (flags & TAKE_USETAG) { 
+ if (usetag_take(sp->setno, usetag, ep)) + return (-1); + } else { + if (useit_take(sp->setno, ep)) + return (-1); + } + + if (meta_resync_all(sp, MD_DEF_RESYNC_BUF_SIZE, ep) != 0) + mdclrerror(ep); + } + + /* Do we own the set? */ + i = own_set(sp, &owner, (flags & TAKE_FORCE), ep); + if (! mdisok(ep)) { + if (owner != NULL) + Free(owner); + return (-1); + } + + if (i == MD_SETOWNER_NO) { + (void) mddserror(ep, MDE_DS_NOTOWNER, sp->setno, owner, NULL, + sp->setname); + if (owner != NULL) + Free(owner); + return (-1); + } + + if (owner != NULL) { + Free(owner); + owner = NULL; + } + + /* We already own it, we are done. */ + if (i == MD_SETOWNER_YES) + return (0); + + if ((sd = metaget_setdesc(sp, &xep)) == NULL) + return (-1); + + /* You can not take ownership of a set that has no drives */ + if (sd->sd_flags & MD_SR_MB_DEVID) + dd = metaget_drivedesc(sp, MD_BASICNAME_OK | PRINT_FAST, ep); + else + dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep); + + if (dd == NULL) { + if (! mdisok(ep)) + return (-1); + return (0); + } + + /* END CHECK CODE */ + + md_rb_sig_handling_on(); + + /* Lock the set on our side */ + if (clnt_lock_set(mynode(), sp, ep)) { + rval = -1; + goto out; + } + /* + * Get the current side number - do not use getmyside() + * as this code is essentially getnodeside() and this saves + * some instructions. + */ + for (i = 0; i < MD_MAXSIDES; i++) { + if (sd->sd_nodes[i][0] == '\0') + continue; + if (strcmp(sd->sd_nodes[i], mynode()) == 0) { + /* + * SKEW is required for the local set + * as side 0 in this set is the node + * associated with it (this node). + */ + side = i + SKEW; + break; + } + } + if (side == MD_KEYWILD) + return (mddserror(ep, MDE_DS_HOSTNOSIDE, sp->setno, mynode(), + NULL, mynode())); + + /* + * Check the local devid namespace to see if the disks + * have been moved. Use the local set first of all as this contains + * entries for the disks in the set. 
+ * + * This is being done before the tk_own_bydd because the disks + * in the dd list could be wrong! But it should be done with the lock + * held for the set. + */ + local_sp = metasetname(MD_LOCAL_NAME, ep); + for (d = dd; d != NULL; d = d->dd_next) { + /* + * Actually do the check of the disks. + */ + ret = meta_upd_ctdnames(&local_sp, 0, side, d->dd_dnp, &newname, + ep); + + if ((ret == METADEVADM_ERR) || + (ret == METADEVADM_DSKNAME_ERR)) { + /* check failed in some unknown manner */ + rval = -1; + goto out; + } else if (ret == METADEVADM_DISKMOVE) { + + /* + * Update the dd namelist so that the rpc.metamhd + * gets the correct disks to reserve - it is the rname + * we are interested in. + */ + if (newname != NULL) { + /* + * Need to save the side names key as this + * points to the namespace entry that will + * need to be updated. In addition the call + * to meta_make_sidenmlist does not actually + * set the namespace key. + */ + side_names_key = d->dd_dnp->side_names_key; + metafreedrivename(d->dd_dnp); + d->dd_dnp = metadrivename(&sp, + metadiskname(newname), ep); + Free(newname); + /* + * null newname so we are reset for next time + * through + */ + newname = NULL; + ret = meta_make_sidenmlist(sp, d->dd_dnp, ep); + d->dd_dnp->side_names_key = side_names_key; + if (ret == -1) { + rval = -1; + goto out; + } + } + } + } + + + RB_TEST(1, "take", ep) + + RB_PREEMPT; + rb_level = 1; /* level 1 */ + + RB_TEST(2, "take", ep) + + if (!MD_ATSET_DESC(sd)) { + if (tk_own_bydd(sp, dd, mhiargsp, FALSE, ep)) + goto rollback; + } + + RB_TEST(3, "take", ep) + + RB_PREEMPT; + rb_level = 2; /* level 2 */ + + RB_TEST(4, "take", ep) + + if (clnt_stimeout(mynode(), sp, mhiargsp, ep) == -1) + goto rollback; + + if (setup_db_bydd(sp, dd, (flags & TAKE_FORCE), ep) == -1) { + if (! mdismddberror(ep, MDE_DB_ACCOK) && + ! 
mdismddberror(ep, MDE_DB_TAGDATA)) + goto rollback; + mdclrerror(ep); + } + + RB_TEST(5, "take", ep) + + RB_PREEMPT; + rb_level = 3; /* level 3 */ + + RB_TEST(6, "take", ep) + + /* Snarf set of traditional diskset doesn't use stale information */ + if (snarf_set(sp, FALSE, ep)) { + if (mdismddberror(ep, MDE_DB_STALE) || + mdismddberror(ep, MDE_DB_ACCOK) || + mdismddberror(ep, MDE_DB_TAGDATA)) { + rval = -1; + goto out; + } + + if (! mdismddberror(ep, MDE_DB_NODB) && + ! mdismddberror(ep, MDE_DB_NOTOWNER)) + goto rollback; + + /* + * Look at the set on all other hosts, if every other host + * has the same set with a larger genid, then we destroy this + * copy. + */ + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* Skip this node */ + if (strcmp(sd->sd_nodes[i], mynode()) == 0) + continue; + + numsides++; + + has_set = nodehasset(sp, sd->sd_nodes[i], + NHS_NST_EQ_G_GT, &xep); + + if (has_set < 0) { + if (! mdiserror(&xep, MDE_NO_SET) && + ! mdismddberror(&xep, MDE_DB_NODB)) + goto rollback; + matches++; + mdclrerror(&xep); + continue; + } + + if (has_set) + matches++; + } + + /* Destroy the set */ + if (numsides > 0 && (numsides - matches) == 0) { + if (meta_set_destroy(sp, FALSE, &xep)) + mdclrerror(&xep); + (void) mddserror(ep, MDE_DS_SETCLEANUP, sp->setno, + sp->setname, NULL, mynode()); + rval = -1; + goto out; + } + goto rollback; + } + + rval = pathname_reload(&sp, sp->setno, ep); + if ((rval == METADEVADM_ERR) || (rval == METADEVADM_DSKNAME_ERR)) { + goto rollback; + } + + + if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, ep) < 0) + goto rollback; + + if (upd_dr_dbinfo(sp, sd, dd, rlp, (flags & TAKE_FORCE), ep) < 0) { + metafreereplicalist(rlp); + goto rollback; + } + + metafreereplicalist(rlp); + + /* + * If the set doesn't have the MD_SR_MB_DEVID bit set, i.e + * the drives in the set don't have the device id information, + * then stick it in if possible. 
+ * + * If updating the master block fails for whatever reason, it's + * okay. It just means the disk(s) in the diskset won't be self + * identifying. + */ + if (!(sd->sd_flags & MD_SR_MB_DEVID)) { + /* Lock the set on current set members */ + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* We already locked this side */ + if (strcmp(mynode(), sd->sd_nodes[i]) == 0) + continue; + + if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) { + rval = -1; + goto out; + } + } + rb_level = 4; /* level 4 */ + + if (meta_update_mb(sp, dd, ep) == 0) + /* update the sr_flags on all hosts */ + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_upd_sr_flags(sd->sd_nodes[i], + sp, (sd->sd_flags | MD_SR_MB_DEVID), ep)) + goto rollback; + } + + cl_sk = cl_get_setkey(sp->setno, sp->setname); + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* Unlocked of this side is done later */ + if (strcmp(mynode(), sd->sd_nodes[i]) == 0) + continue; + + if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + } + } + + /* + * If we get here, we need to unlock the set before the resync + * gets called, otherwise the "daemon" will hold the set lock + * until the resync is done! 
+ */ + + cl_sk = cl_get_setkey(sp->setno, sp->setname); + if (clnt_unlock_set(mynode(), cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + cl_set_setkey(NULL); + + md_rb_sig_handling_off(md_got_sig(), md_which_sig()); + + /* We try to get things resync'ed, but this can fail */ + mdclrerror(&xep); + if (meta_resync_all(sp, MD_DEF_RESYNC_BUF_SIZE, &xep) != 0) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + + RB_TEST(7, "take", ep) + + return (rval); + +out: + cl_sk = cl_get_setkey(sp->setno, sp->setname); + if (clnt_unlock_set(mynode(), cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + if (!(sd->sd_flags & MD_SR_MB_DEVID) && (rb_level > 2)) { + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* We already unlocked this side */ + if (strcmp(mynode(), sd->sd_nodes[i]) == 0) + continue; + + if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + } + } + cl_set_setkey(NULL); + + md_rb_sig_handling_off(md_got_sig(), md_which_sig()); + + return (rval); + +rollback: + /* Make sure we are blocking all signals */ + if (procsigs(TRUE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + rval = -1; + + /* level 4 */ + if (rb_level > 3) { + if (sd->sd_flags & MD_SR_MB_DEVID) { + /* update the sr_flags on all hosts */ + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + if (clnt_upd_sr_flags(sd->sd_nodes[i], sp, + (sd->sd_flags & ~MD_SR_MB_DEVID), &xep)) + mdclrerror(&xep); + } + } + + cl_sk = cl_get_setkey(sp->setno, sp->setname); + for (i = 0; i < MD_MAXSIDES; i++) { + /* Skip empty slots */ + if (sd->sd_nodes[i][0] == '\0') + continue; + + /* We will unlocked this side below */ + if (strcmp(mynode(), sd->sd_nodes[i]) == 0) + continue; + + if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) + mdclrerror(&xep); 
+ } + } + + /* level 3 */ + if (rb_level > 2) { + if (halt_set(sp, &xep)) + mdclrerror(&xep); + } + + /* level 2 */ + if (rb_level > 1) { + if (clnt_stimeout(mynode(), sp, &defmhiargs, &xep) == -1) + mdclrerror(&xep); + } + + /* level 1 */ + if (rb_level > 0) { + if (!MD_ATSET_DESC(sd)) { + if (rel_own_bydd(sp, dd, FALSE, &xep)) + mdclrerror(&xep); + } + } + + /* level 0 */ + cl_sk = cl_get_setkey(sp->setno, sp->setname); + if (clnt_unlock_set(mynode(), cl_sk, &xep)) + mdclrerror(&xep); + cl_set_setkey(NULL); + + /* release signals back to what they were on entry */ + if (procsigs(FALSE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + md_rb_sig_handling_off(md_got_sig(), md_which_sig()); + + return (rval); +} + +int +meta_set_release( + mdsetname_t *sp, + md_error_t *ep +) +{ + int rval = 0; + md_drive_desc *dd; + mhd_mhiargs_t mhiargs; + sigset_t oldsigs; + md_setkey_t *cl_sk; + int rb_level = 0; + md_error_t xep = mdnullerror; + + /* Make sure we own the set */ + if (meta_check_ownership(sp, ep) != 0) + return (-1); + + /* Get the drive descriptors */ + if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST), + ep)) == NULL) + if (! 
mdisok(ep)) + return (-1); + + /* Get timeout values in case we need to roll back this release */ + (void) memset(&mhiargs, '\0', sizeof (mhiargs)); + if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) != 0) + return (-1); + + /* END CHECK CODE */ + + md_rb_sig_handling_on(); + + /* Lock the set on our side */ + if (clnt_lock_set(mynode(), sp, ep)) { + rval = -1; + goto out; + } + + RB_TEST(1, "release", ep) + + RB_PREEMPT; + rb_level = 1; /* level 1 */ + + RB_TEST(2, "release", ep) + + if (halt_set(sp, ep)) + goto rollback; + + RB_TEST(3, "release", ep) + + RB_PREEMPT; + rb_level = 2; /* level 2 */ + + RB_TEST(4, "release", ep) + + if (rel_own_bydd(sp, dd, FALSE, ep)) + goto rollback; + + RB_TEST(5, "release", ep) + + RB_PREEMPT; + rb_level = 3; /* level 3 */ + + RB_TEST(6, "release", ep) + + if (clnt_stimeout(mynode(), sp, &defmhiargs, ep) == -1) + goto rollback; + + RB_TEST(7, "release", ep) + +out: + cl_sk = cl_get_setkey(sp->setno, sp->setname); + if (clnt_unlock_set(mynode(), cl_sk, &xep)) { + if (rval == 0) + (void) mdstealerror(ep, &xep); + rval = -1; + } + cl_set_setkey(NULL); + + md_rb_sig_handling_off(md_got_sig(), md_which_sig()); + + return (rval); + +rollback: + /* Make sure we are blocking all signals */ + if (procsigs(TRUE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + rval = -1; + + /* level 3 */ + if (rb_level > 2) { + if (clnt_stimeout(mynode(), sp, &mhiargs, &xep) == -1) + mdclrerror(&xep); + } + + /* level 2 */ + if (rb_level > 1) { + if (tk_own_bydd(sp, dd, &mhiargs, FALSE, &xep)) + mdclrerror(&xep); + } + + /* level 1 */ + if (rb_level > 0) { + if (setup_db_bydd(sp, dd, TRUE, &xep) == -1) + mdclrerror(&xep); + + /* Snarf set of trad diskset doesn't use stale information */ + if (snarf_set(sp, FALSE, &xep)) + mdclrerror(&xep); + } + + /* level 0 */ + cl_sk = cl_get_setkey(sp->setno, sp->setname); + if (clnt_unlock_set(mynode(), cl_sk, &xep)) + mdclrerror(&xep); + cl_set_setkey(NULL); + + /* release signals back to what they were on entry */ + if 
(procsigs(FALSE, &oldsigs, &xep) < 0) + mdclrerror(&xep); + + md_rb_sig_handling_off(md_got_sig(), md_which_sig()); + + return (rval); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_setup.c b/usr/src/lib/lvm/libmeta/common/meta_setup.c new file mode 100644 index 0000000000..64bdc73c3c --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_setup.c @@ -0,0 +1,897 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. 
+ */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +/* + * setup utility + */ + +#include "meta_set_prv.h" +#include <sys/resource.h> +#include <syslog.h> + + +/* globals */ +char *myname = ""; +FILE *metalogfp = NULL; +int metasyslog = 0; +uint_t verbosity = 0; +hrtime_t start_time = 0; +sigset_t allsigs; + +/* locals */ +static int rb_signal_handling = FALSE; +static int rb_signal_caught = FALSE; +static int rb_signal_which = 0; +static size_t metansig = 0; +static struct sigaction *metahandlers = NULL; +#ifdef _DEBUG_MALLOC_INC +static ulong_t malloc_histid_begin; +static ulong_t malloc_histid_end; +static ulong_t malloc_inuse_begin; +static ulong_t malloc_inuse_end; +#endif /* _DEBUG_MALLOC_INC */ + +/* forwards */ +static void md_catcher(int sig); + +/* + * push/pop signal handlers + */ +static int +md_pushsig( + unsigned sig, + void (*handler)(int sig), + md_error_t *ep +) +{ + struct sigaction newhandler; + + /* expand vector as neccessary */ + if (sig >= metansig) { + if (metahandlers == NULL) { + metahandlers = Zalloc( + (sig + 1) * sizeof (metahandlers[0])); + } else { + metahandlers = Realloc(metahandlers, + ((sig + 1) * sizeof (metahandlers[0]))); + (void) memset(&metahandlers[metansig], 0, + ((sig - metansig) * sizeof (metahandlers[0]))); + } + metansig = sig; + } + + /* We need to have a seperate stack to handle rollback properly */ + newhandler.sa_flags = 0; + if (sigfillset(&newhandler.sa_mask) < 0) + return (mdsyserror(ep, errno, + "sigfillset(&newhandler.sa_mask)")); + newhandler.sa_handler = handler; + + /* push handler */ + if (sigaction(sig, &newhandler, &metahandlers[sig]) < 0) + return (mdsyserror(ep, errno, "sigaction(&newhandler)")); + + /* return success */ + return (0); +} + +static int +md_popsig( + unsigned sig, + md_error_t *ep +) +{ + /* can't pop what isn't pushed */ + assert(sig <= metansig); + assert(metahandlers[sig].sa_handler != md_catcher); + + /* pop handler */ + if (sigaction(sig, &metahandlers[sig], 
NULL) < 0) + return (mdsyserror(ep, errno, "sigaction(&metahandlers)")); + + /* return success */ + return (0); +} + +char * +meta_lock_name( + set_t setno +) +{ + char lockname[30]; + + if (setno == MD_LOCAL_SET) + return (strdup(METALOCK)); + + (void) snprintf(lockname, sizeof (lockname), "%s.%ld", METALOCK, setno); + return (strdup(lockname)); +} + +#define META_LOCK_FD(sp) ((sp)->lockfd) +#define META_LOCK_NAME(sp) (meta_lock_name((sp)->setno)) + +/* + * open lock + */ +static int +meta_lock_open( + mdsetname_t *sp, + md_error_t *ep +) +{ + int lockfd = META_LOCK_FD(sp); + char *lockname = META_LOCK_NAME(sp); + + /* check for already open */ + if (lockfd >= 0) + goto success; + assert(lockfd == MD_NO_LOCK); + + /* open and/or create lock file */ + if ((lockfd = open(lockname, O_WRONLY, 0)) < 0) { + if (errno == EROFS) { + lockfd = MD_NO_LOCK; + goto success; + } + if (errno != ENOENT) { + (void) mdsyserror(ep, errno, lockname); + goto failure; + } + if ((lockfd = open(lockname, (O_WRONLY|O_CREAT), + 0644)) < 0) { + (void) mdsyserror(ep, errno, lockname); + goto failure; + } + if (fchmod(lockfd, 0644) != 0) { + (void) mdsyserror(ep, errno, lockname); + goto failure; + } + } + + /* return success */ +success: + if (lockname != NULL) + free(lockname); + META_LOCK_FD(sp) = lockfd; + return (0); + + /* flag failure */ +failure: + if (lockname != NULL) + free(lockname); + if (lockfd >= 0) + (void) close(lockfd); + return (-1); +} + +static int +meta_lock_close( + mdsetname_t *sp, + md_error_t *ep +) +{ + int retval = 0; + + if (close(META_LOCK_FD(sp)) != 0) { + if (ep != NULL) { + char *lockname = META_LOCK_NAME(sp); + (void) mdsyserror(ep, errno, lockname); + if (lockname != NULL) + free(lockname); + } + + retval = -1; + } + META_LOCK_FD(sp) = MD_NO_LOCK; + return (retval); +} + +/* + * unlock + */ +int +meta_unlock( + mdsetname_t *sp, + md_error_t *ep +) +{ + int lockfd = META_LOCK_FD(sp); + + /* ignore read-only filesystem */ + if (lockfd == MD_NO_LOCK) + return 
(0); + + assert(lockfd >= 0); + + /* unlock and discard */ + if (lockf(lockfd, F_ULOCK, 0) != 0) { + (void) mdsyserror(ep, errno, METALOCK); + (void) meta_lock_close(sp, NULL); + return (-1); + } + return (meta_lock_close(sp, ep)); +} + +/* + * lock + */ +int +meta_lock( + mdsetname_t *sp, + int print_status, + md_error_t *ep +) +{ + int lockfd; + char *lockname = NULL; + + /* open lock file */ + if (meta_lock_open(sp, ep) != 0) { + assert(META_LOCK_FD(sp) == MD_NO_LOCK); + goto failure; + } + + /* ignore read-only filesystem */ + if ((lockfd = META_LOCK_FD(sp)) == MD_NO_LOCK) + goto success; + assert(lockfd >= 0); + + lockname = META_LOCK_NAME(sp); + + /* grab lock */ + if (lockf(lockfd, F_TLOCK, 0) != 0) { + if ((errno != EACCES) && (errno != EAGAIN)) { + (void) mdsyserror(ep, errno, lockname); + goto failure; + } + if (print_status) + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%s: waiting on %s\n"), + myname, lockname); + if (lockf(lockfd, F_LOCK, 0) != 0) { + (void) mdsyserror(ep, errno, lockname); + goto failure; + } + } + + /* return success */ +success: + if (lockname != NULL) + free(lockname); + return (0); + + /* flag failure */ +failure: + if (lockname != NULL) + free(lockname); + if (lockfd >= 0) + (void) meta_lock_close(sp, ep); + return (-1); +} + +int +meta_lock_nowait( + mdsetname_t *sp, + md_error_t *ep +) +{ + int lockfd; + char *lockname = NULL; + + /* open lock file */ + if (meta_lock_open(sp, ep) != 0) { + assert(META_LOCK_FD(sp) == MD_NO_LOCK); + goto failure; + } + + /* ignore read-only filesystem */ + if ((lockfd = META_LOCK_FD(sp)) == MD_NO_LOCK) + goto success; + assert(lockfd >= 0); + + lockname = META_LOCK_NAME(sp); + + /* grab lock */ + if (lockf(lockfd, F_TLOCK, 0) != 0) { + if ((errno != EACCES) && (errno != EAGAIN)) { + (void) mdsyserror(ep, errno, lockname); + goto failure; + } + (void) mdsyserror(ep, EAGAIN, lockname); + goto failure; + } + + /* return success */ +success: + if (lockname != NULL) + free(lockname); + return (0); 
+ + /* flag failure */ +failure: + if (lockname != NULL) + free(lockname); + if (lockfd >= 0) + (void) meta_lock_close(sp, ep); + return (-1); +} + +/* + * lock status + */ +int +meta_lock_status( + mdsetname_t *sp, + md_error_t *ep +) +{ + int lockfd; + + /* open lock file */ + if (meta_lock_open(sp, ep) != 0) { + assert(META_LOCK_FD(sp) == MD_NO_LOCK); + return (-1); + } + + lockfd = META_LOCK_FD(sp); + /* ignore read-only filesystem */ + if (lockfd == MD_NO_LOCK) + return (0); + assert(lockfd >= 0); + + /* test lock */ + if (lockf(lockfd, F_TEST, 0) != 0) { + char *lockname = META_LOCK_NAME(sp); + (void) mdsyserror(ep, errno, lockname); + if (lockname != NULL) + free(lockname); + return (-1); + } + + return (0); +} + +/* + * setup for syslog daemon output + */ +static void +md_syslog( + char *name /* name of program */ +) +{ + if ((name == NULL) || (*name == '\0')) + name = "md"; + openlog(name, LOG_CONS, LOG_DAEMON); + metasyslog = 1; +} + +/* + * daemonize: put in background + */ +int +md_daemonize( + mdsetname_t *sp, + md_error_t *ep +) +{ + char *p; + struct rlimit rlim; + pid_t pid; + int i; + + /* debug */ + if (((p = getenv("MD_DEBUG")) != NULL) && + (strstr(p, "NODAEMON") != NULL)) { + return (0); /* do nothing */ + } + + /* get number of file descriptors */ + if (getrlimit(RLIMIT_NOFILE, &rlim) != 0) { + return (mdsyserror(ep, errno, "getrlimit(RLIMIT_NOFILE)")); + } + + /* fork and kill parent */ + if ((pid = fork()) == -1) + return (mdsyserror(ep, errno, "fork")); + else if (pid != 0) + return (pid); + + /* + * We need to close the admin device and reset the specialfd to force + * the child process to reopen it, since we are going to close all + * descriptors from 3 up to RLIMIT_NOFILE in the child. 
/*
 * flush and sync fp
 *
 * Push any buffered stdio data for `fp' to its file descriptor and then
 * fsync() that descriptor.  Failures of either call are ignored.  A NULL
 * stream is a no-op.
 */
static void
flushfp(
	FILE	*fp
)
{
	/* fileno(NULL) is undefined, so there is nothing sensible to sync */
	if (fp == NULL)
		return;

	(void) fflush(fp);
	(void) fsync(fileno(fp));
}
malloc_histid_begin, malloc_histid_end); + } + } + } +#endif /* _DEBUG_MALLOC_INC */ + + /* exit with value */ + exit(eval); +} + +/* + * signal catcher + */ +static void +md_catcher( + int sig +) +{ + char buf[128]; + char *msg; + md_error_t status = mdnullerror; + md_error_t *ep = &status; + struct sigaction defhandler; + + /* log signal */ + if ((msg = strsignal(sig)) == NULL) { + (void) snprintf(buf, sizeof (buf), + dgettext(TEXT_DOMAIN, "unknown signal %d"), sig); + msg = buf; + } + md_eprintf("%s\n", msg); + + /* + * In roll_back crtical section handling, the first instance of a user + * generated signal is caught, a flag is set to allow preemption at a + * "convenient" point and md_catcher returns. If the user continues + * generate the signal, the second instance will invoke the default + * handler and exit. + */ + if (rb_signal_handling == TRUE) { + if (sig != SIGABRT && sig != SIGBUS && sig != SIGSEGV) { + if (rb_signal_caught == FALSE) { + rb_signal_caught = TRUE; + rb_signal_which = sig; + return; + } + } + } + + /* let default handler do it's thing */ + if (md_popsig(sig, ep) != 0) { + mde_perror(ep, ""); + mdclrerror(ep); + defhandler.sa_flags = 0; + if (sigfillset(&defhandler.sa_mask) < 0) { + (void) mdsyserror(ep, errno, + "sigfillset(&defhandler.sa_mask)"); + mde_perror(ep, ""); + md_exit(NULL, 1); + } + defhandler.sa_handler = SIG_DFL; + if (sigaction(sig, &defhandler, NULL) < 0) { + (void) mdsyserror(ep, errno, "sigaction(&defhandler)"); + mde_perror(ep, ""); + md_exit(NULL, 1); + } + } + + md_post_sig(sig); +} + +void +md_post_sig(int sig) +{ + if (kill(getpid(), sig) != 0) { + md_perror("kill(getpid())"); + md_exit(NULL, -sig); + } +} + +int +md_got_sig(void) +{ + return (rb_signal_caught); +} + +int +md_which_sig(void) +{ + return (rb_signal_which); +} + +void +md_rb_sig_handling_on(void) +{ + rb_signal_handling = TRUE; +} + +void +md_rb_sig_handling_off(int sig_seen, int sig) +{ + rb_signal_handling = FALSE; + rb_signal_caught = FALSE; + 
rb_signal_which = 0; + if (sig_seen) + md_post_sig(sig); +} + +/* + * setup metaclust variables + * + * Records the requested verbosity level and the current high-resolution + * time in the file-level variables verbosity and start_time. + */ +void +setup_mc_log( + uint_t level +) +{ + /* initialise externals */ + verbosity = level; + start_time = gethrtime(); +} + +/* + * initialize utility + * + * Runs md_init_nosig() and then fills allsigs and pushes md_catcher as + * the handler for the common termination signals. + * + * Returns 0 on success. A non-zero md_init_nosig() result is passed + * through unchanged; a sigfillset() failure returns the mdsyserror() + * result; a md_pushsig() failure returns -1. + */ +int +md_init( + int argc, + char *argv[], + int dosyslog, + int doadmin, + md_error_t *ep +) +{ + int ret = 0; + + /* initialize everything but the signals */ + if ((ret = md_init_nosig(argc, argv, dosyslog, + doadmin, ep)) != 0) + return (ret); + + + if (sigfillset(&allsigs) < 0) + return (mdsyserror(ep, errno, "sigfillset(&allsigs)")); + + /* catch common signals */ + if ((md_pushsig(SIGHUP, md_catcher, ep) != 0) || + (md_pushsig(SIGINT, md_catcher, ep) != 0) || + (md_pushsig(SIGQUIT, md_catcher, ep) != 0) || + (md_pushsig(SIGABRT, md_catcher, ep) != 0) || + (md_pushsig(SIGBUS, md_catcher, ep) != 0) || + (md_pushsig(SIGSEGV, md_catcher, ep) != 0) || + (md_pushsig(SIGPIPE, md_catcher, ep) != 0) || + (md_pushsig(SIGTERM, md_catcher, ep) != 0)) { + return (-1); + } + + /* return success */ + return (0); +} + + +/* + * initialize utility without setting up sighandlers + * setting up signal handlers in libmeta can affect other + * programs that link with libmeta but have their own handlers + * + * Sets myname from the basename of argv[0], optionally opens syslog + * (dosyslog), appends the command line to METALOG when METALOGENV is + * set in the environment, optionally opens the admin device (doadmin) + * and flushes the name caches. Returns 0 on success, -1 if the admin + * device cannot be opened. + */ +int +md_init_nosig( + int argc, + char *argv[], + int dosyslog, + int doadmin, + md_error_t *ep +) +{ + /* setup myname */ + if ((myname = strrchr(argv[0], '/')) != NULL) + ++myname; + else + myname = argv[0]; + +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + + /* print malloc usage */ +#ifdef _DEBUG_MALLOC_INC + { + char *p; + + if (((p = getenv("MD_DEBUG")) != NULL) && + (strstr(p, "MALLOC") != NULL)) { + malloc_inuse_begin = + malloc_inuse(&malloc_histid_begin); + (void) fprintf(stderr, "%s: begin malloc_inuse %lu\n", + myname, malloc_inuse_begin); + } + } +#endif /* _DEBUG_MALLOC_INC */ + + /* open syslog */ + if (dosyslog) + md_syslog(myname); + + /* log command */ + if (getenv(METALOGENV) != NULL) 
{ + /* logging is best effort: a failed fopen() is silently skipped */ + if ((metalogfp = fopen(METALOG, "a")) != NULL) { + int i; + + (void) fchmod(fileno(metalogfp), 0664); + md_logpfx(metalogfp); + for (i = 1; (i < argc); ++i) + (void) fprintf(metalogfp, " %s", argv[i]); + (void) fprintf(metalogfp, "\n"); + flushfp(metalogfp); + } + } + + /* make sure we can open the admin device before we do anything else */ + if (doadmin) + if (open_admin(ep) < 0) + return (-1); + + /* flush name caches */ + metaflushnames(1); + + /* return success */ + return (0); +} + +/* + * (re)initialize daemon + * + * Calls md_init() with syslog and the admin device enabled the first + * time it succeeds; later calls return 0 without doing anything. + * Returns -1 if md_init() fails. + */ +int +md_init_daemon( + char *name, + md_error_t *ep +) +{ + static int already = 0; + int dosyslog = 1; + int doadmin = 1; + + /* setup */ + if (! already) { + if (md_init(1, &name, dosyslog, doadmin, ep) != 0) + return (-1); + already = 1; + } + + /* return success */ + return (0); +} + +/* + * Roll back functions for handling sync and async cleanup. + */ + +/* + * Block every signal in allsigs (block == TRUE), saving the previous + * mask in *oldsigs, or restore the mask saved in *oldsigs (otherwise). + * Returns 0 on success, -1 with ep set on failure. + */ +int +procsigs(int block, sigset_t *oldsigs, md_error_t *ep) +{ + if (block == TRUE) { + if (sigprocmask(SIG_BLOCK, &allsigs, oldsigs) < 0) { + (void) mdsyserror(ep, errno, "sigprocmask(SIG_BLOCK)"); + return (-1); + } + } else { + if (sigprocmask(SIG_SETMASK, oldsigs, NULL) < 0) { + (void) mdsyserror(ep, errno, + "sigprocmask(SIG_SETMASK)"); + return (-1); + } + } + return (0); +} + +#ifdef DEBUG +/* + * Rollback test point. The environment variable META_RBT_TPT selects + * a test point number (a negative value additionally asks the test + * point to wait in sigsuspend() first) and META_RBT_TAG optionally + * selects a tag. When the caller's rbt_sel_tpt / rbt_sel_tag match, + * MDE_TESTERROR is set in ep and -1 is returned so that the caller + * exercises its rollback path. Returns 0 when the test point is not + * selected. + */ +int +rb_test( + int rbt_sel_tpt, + char *rbt_sel_tag, + md_error_t *ep +) +{ + char *rbt_env_tpt = getenv("META_RBT_TPT"); + char *rbt_env_tag = getenv("META_RBT_TAG"); + int sig = 0; + int rbt_int_tpt; + int rbt_tag_match = 1; + sigset_t curmask; + md_error_t xep = mdnullerror; + + if (rbt_env_tpt) { + rbt_int_tpt = atoi(rbt_env_tpt); + if (rbt_int_tpt < 0) { + sig = 1; + rbt_int_tpt = -1 * rbt_int_tpt; + } + + assert(rbt_sel_tpt != 0); + + if (rbt_int_tpt == 0) + return (0); + + if (rbt_env_tag && rbt_sel_tag) + if (strcmp(rbt_env_tag, rbt_sel_tag) != 0) + rbt_tag_match = 0; + + if (rbt_int_tpt == rbt_sel_tpt && rbt_tag_match) { + md_eprintf( + "******************** RB_TEST(%s, %d, sig=%s)\n", 
+ rbt_sel_tag, rbt_sel_tpt, + (sig != 0) ? "True" : "False"); + if (sig) { + md_eprintf("********** sigsuspend()\n"); + /* + * Query only: with a NULL set the how + * argument is not significant, but it is + * an int, so pass a valid constant rather + * than NULL. + */ + if (sigprocmask(SIG_BLOCK, NULL, &curmask) < 0) { + (void) mdsyserror(&xep, errno, NULL); + mde_perror(&xep, "sigprocmask(GET)"); + md_exit(NULL, 1); + } + + if (sigsuspend(&curmask) < 0) { + (void) mdsyserror(&xep, errno, NULL); + mde_perror(&xep, + "sigsuspend(&curmask)"); + md_exit(NULL, 1); + } + + if (md_got_sig()) + return (-1); + } + (void) mderror(ep, MDE_TESTERROR, + "********** rb_test()"); + md_eprintf("******************** rollback\n"); + return (-1); + } + } + return (0); +} +#else +/* + * Non-DEBUG stub: test points are not compiled in, so always set + * MDE_TESTERROR in ep and return -1. + */ +/* ARGSUSED */ +int +rb_test( + int rbt_sel_tpt, + char *rbt_sel_tag, + md_error_t *ep +) +{ + (void) mderror(ep, MDE_TESTERROR, "******** rb_test:Not supported\n"); + return (-1); + +} +#endif /* DEBUG */ diff --git a/usr/src/lib/lvm/libmeta/common/meta_smf.c b/usr/src/lib/lvm/libmeta/common/meta_smf.c new file mode 100644 index 0000000000..204691a1a3 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_smf.c @@ -0,0 +1,351 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 
+ * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Service Management Facility (SMF) interfaces. + */ + +#include <stdio.h> +#include <libscf.h> +#include <meta.h> + +static void enable(char *svc_names[], md_error_t *ep); +static void disable(char *svc_names[], md_error_t *ep); +static int enabled(char *svc_name); +static int online(char *svc_names[], char **names); +static void wait_online(char *svc_names[]); +static int is_online(char *svc_name); + +static char +*svm_core_svcs[] = { + "system/metainit:default", + "system/mdmonitor:default", + "network/rpc/meta:default", + NULL +}; + +static char +*svm_diskset_svcs[] = { + "network/rpc/metamed:default", + "network/rpc/metamh:default", + NULL +}; + +static char +*svm_mn_diskset_svcs[] = { + "network/rpc/mdcomm:default", + NULL +}; + +/* + * Enable the specified SVM services through the SMF. + */ +int +meta_smf_enable(uint_t flags, md_error_t *ep) +{ + if (flags & META_SMF_CORE) { + enable(svm_core_svcs, ep); + wait_online(svm_core_svcs); + } + + if (flags & META_SMF_DISKSET) { + enable(svm_diskset_svcs, ep); + wait_online(svm_diskset_svcs); + } + + if (flags & META_SMF_MN_DISKSET) { + enable(svm_mn_diskset_svcs, ep); + wait_online(svm_mn_diskset_svcs); + } + + if (ep != NULL) + return ((mdisok(ep)) ? 0 : -1); + else + return (0); +} + +/* + * Disable the specified SVM services through the SMF. + */ +int +meta_smf_disable(uint_t flags, md_error_t *ep) +{ + if (flags & META_SMF_CORE) { + disable(svm_core_svcs, ep); + } + + if (flags & META_SMF_DISKSET) { + disable(svm_diskset_svcs, ep); + } + + if (flags & META_SMF_MN_DISKSET) { + disable(svm_mn_diskset_svcs, ep); + } + + if (ep != NULL) + return ((mdisok(ep)) ? 0 : -1); + else + return (0); +} + +/* + * Determine if desired services are online. If all services in the + * classes specified by flags are online, 1 is returned. Otherwise + * 0 is returned. 
+ */ + +int +meta_smf_isonline(uint_t flags, md_error_t *ep) +{ + int ret = 1; + char *names = NULL; + + if (flags & META_SMF_CORE) { + if (online(svm_core_svcs, &names) == 0) + ret = 0; + } + if (flags & META_SMF_DISKSET) { + if (online(svm_diskset_svcs, &names) == 0) + ret = 0; + } + if (flags & META_SMF_MN_DISKSET) { + if (online(svm_mn_diskset_svcs, &names) == 0) + ret = 0; + } + + if (ret == 0) { + (void) mderror(ep, MDE_SMF_NO_SERVICE, names); + Free(names); + } + + return (ret); +} + +/* + * Return a bitmask of the META_SMF_* flags indicating which services should be + * online given the current SVM configuration. + */ +int +meta_smf_getmask() +{ + int mask = 0; + mdsetname_t *sp = NULL; + mddb_config_t c; + md_error_t status = mdnullerror; + md_error_t *ep = &status; + int max_sets; + + /* + * If there are any local metadbs configured then the core services + * are needed. + */ + (void) memset(&c, 0, sizeof (c)); + c.c_setno = MD_LOCAL_SET; + if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0 || c.c_dbcnt == 0) + return (mask); + + mask |= META_SMF_CORE; + + /* + * If any disksets configured then the diskset services are needed. + * Also check for multi-node sets. 
+ */ + if ((max_sets = get_max_sets(ep)) > 0) { + int i; + + mdclrerror(ep); + for (i = 1; i < max_sets; i++) { + md_set_desc *sd; + + if ((sp = metasetnosetname(i, ep)) == NULL) { + if (!mdisok(ep) && !mdiserror(ep, MDE_NO_SET) && + !mdismddberror(ep, MDE_NOTENOUGH_DB) && + !mdiserror(ep, MDE_SMF_NO_SERVICE) && + ep->info.errclass != MDEC_RPC) { + /* + * metad rpc program not registered + * can't get diskset info + */ + break; + } + + } else { + mask |= META_SMF_DISKSET; + + if ((sd = metaget_setdesc(sp, ep)) != NULL) { + if (MD_MNSET_DESC(sd)) { + mask |= META_SMF_MN_DISKSET; + + /* + * we don't have to check the + * rest of the disksets at this + * point + */ + break; + } + } + } + + mdclrerror(ep); + } + } + + return (mask); +} + +static void +enable(char *svc_names[], md_error_t *ep) +{ + int i; + + for (i = 0; svc_names[i]; i++) { + if (!enabled(svc_names[i])) + if (smf_enable_instance(svc_names[i], 0) != 0) { + if (ep != NULL) { + (void) mderror(ep, MDE_SMF_FAIL, + svc_names[i]); + } + } + } +} + +static void +disable(char *svc_names[], md_error_t *ep) +{ + int i; + + for (i = 0; svc_names[i]; i++) { + if (enabled(svc_names[i])) + if (smf_disable_instance(svc_names[i], 0) != 0) { + if (ep != NULL) { + (void) mderror(ep, MDE_SMF_FAIL, + svc_names[i]); + } + } + } +} + +static int +enabled(char *svc_name) +{ + scf_simple_prop_t *prop; + int rval = 0; + + prop = scf_simple_prop_get(NULL, svc_name, SCF_PG_GENERAL, + SCF_PROPERTY_ENABLED); + + if (scf_simple_prop_numvalues(prop) == 1) { + if (*scf_simple_prop_next_boolean(prop) != 0) + rval = 1; + } + + scf_simple_prop_free(prop); + + return (rval); +} + +/* + * There can be a delay while the RPC services get going. Try to + * make sure the RPC daemons are ready to run before we return. + * Check 15 times (15 seconds total wait time) and then just + * return. 
+ */ +static void +wait_online(char *svc_names[]) +{ + int i; + char *names = NULL; + + for (i = 0; i < 15; i++) { + if (online(svc_names, &names)) + break; + (void) sleep(1); + } + + /* the list of offline names built by online() is not reported here */ + if (names != NULL) + Free(names); +} + +/* + * Check to see if all services in the svc_names are online. If they are + * all online 1 is returned, otherwise 0 is returned. + * + * svc_names is a NULL-terminated array of service names. For every + * service that is not online, "\n\t<name>" is appended to the string + * that *names points to; *names must be NULL or a Malloc'ed string on + * entry. The string is reallocated as it grows, and the caller is + * responsible for releasing the final *names with Free(). + */ + +static int +online(char *svc_names[], char **names) +{ + int i; + int rv = 1; + + for (i = 0; svc_names[i]; i++) { + if (is_online(svc_names[i]) == 0) { + int sz; + char *p; + + /* + * Need space for the name, the new line, the + * tab and the null terminator. + */ + sz = strlen(svc_names[i]) + 3; + + if (*names == NULL) { + p = Malloc(sz); + (void) snprintf(p, sz, "\n\t%s", svc_names[i]); + + } else { + /* Add space for existing names */ + sz += strlen(*names); + p = Malloc(sz); + (void) snprintf(p, sz, "%s\n\t%s", *names, + svc_names[i]); + /* + * Release the superseded string itself; + * names is the caller's pointer slot and + * must never be handed to Free(). + */ + Free(*names); + } + + *names = p; + rv = 0; + } + } + return (rv); +} + +/* + * Return 1 if the specified service is online. Otherwise, return 0. + * A NULL result from smf_get_state() is treated as "not online". + */ +static int +is_online(char *svc_name) +{ + int rval = 0; + char *s; + + if ((s = smf_get_state(svc_name)) != NULL) { + if (strcmp(s, "online") == 0) + rval = 1; + free(s); + } + return (rval); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_sp.c b/usr/src/lib/lvm/libmeta/common/meta_sp.c new file mode 100644 index 0000000000..ce3965489f --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_sp.c @@ -0,0 +1,6652 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. + */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +/* + * soft partition operations + * + * Soft Partitions provide a virtual disk mechanism which is used to + * divide a large volume into many small pieces, each appearing as a + * separate device. A soft partition consists of a series of extents, + * each having an offset and a length. The extents are logically + * contiguous, so where the first extent leaves off the second extent + * picks up. Which extent a given "virtual offset" belongs to is + * dependent on the size of all the previous extents in the soft + * partition. + * + * Soft partitions are represented in memory by an extent node + * (sp_ext_node_t) which contains all of the information necessary to + * create a unit structure and update the on-disk format, called + * "watermarks". These extent nodes are typically kept in a doubly + * linked list and are manipulated by list manipulation routines. A + * list of extents may represent all of the soft partitions on a volume, + * a single soft partition, or perhaps just a set of extents that need + * to be updated. Extent lists may be sorted by extent or by name/seq#, + * depending on which compare function is used. Most of the routines + * require the list be sorted by offset to work, and that's the typical + * configuration. 
+ * + * In order to do an allocation, knowledge of all soft partitions on the + * volume is required. Then free space is determined from the space + * that is not allocated, and new allocations can be made from the free + * space. Once the new allocations are made, a unit structure is created + * and the watermarks are updated. The status is then changed to "okay" + * on the unit structure to commit the transaction. If updating the + * watermarks fails, the unit structure is in an intermediate state and + * the driver will not allow access to the device. + * + * A typical sequence of events is: + * 1. Fetch the list of names for all soft partitions on a volume + * meta_sp_get_by_component() + * 2. Construct an extent list from the name list + * meta_sp_extlist_from_namelist() + * 3. Fill the gaps in the extent list with free extents + * meta_sp_list_freefill() + * 4. Allocate from the free extents + * meta_sp_alloc_by_len() + * meta_sp_alloc_by_list() + * 5. Create the unit structure from the extent list + * meta_sp_createunit() + * meta_sp_updateunit() + * 6. Write out the watermarks + * meta_sp_update_wm() + * 7. 
Set the status to "Okay" + * meta_sp_setstatus() + * + */ + +#include <stdio.h> +#include <meta.h> +#include "meta_repartition.h" +#include <sys/lvm/md_sp.h> +#include <sys/lvm/md_crc.h> +#include <strings.h> +#include <sys/lvm/md_mirror.h> +#include <sys/bitmap.h> + +extern int md_in_daemon; + +typedef struct sp_ext_node { + struct sp_ext_node *ext_next; /* next element */ + struct sp_ext_node *ext_prev; /* previous element */ + sp_ext_type_t ext_type; /* type of extent */ + sp_ext_offset_t ext_offset; /* starting offset */ + sp_ext_length_t ext_length; /* length of this node */ + uint_t ext_flags; /* extent flags */ + uint32_t ext_seq; /* watermark seq no */ + mdname_t *ext_namep; /* name pointer */ + mdsetname_t *ext_setp; /* set pointer */ +} sp_ext_node_t; + +/* extent flags */ +#define EXTFLG_UPDATE (1) + +/* Extent node compare function for list sorting */ +typedef int (*ext_cmpfunc_t)(sp_ext_node_t *, sp_ext_node_t *); + + +/* Function Prototypes */ + +/* Debugging Functions */ +static void meta_sp_debug(char *format, ...); +static void meta_sp_printunit(mp_unit_t *mp); + +/* Misc Support Functions */ +int meta_sp_parsesize(char *s, sp_ext_length_t *szp); +static int meta_sp_parsesizestring(char *s, sp_ext_length_t *szp); +static int meta_sp_setgeom(mdname_t *np, mdname_t *compnp, mp_unit_t *mp, + md_error_t *ep); +static int meta_sp_get_by_component(mdsetname_t *sp, mdname_t *compnp, + mdnamelist_t **nlpp, int force, md_error_t *ep); +static sp_ext_length_t meta_sp_get_default_alignment(mdsetname_t *sp, + mdname_t *compnp, md_error_t *ep); + +/* Extent List Manipulation Functions */ +static int meta_sp_cmp_by_nameseq(sp_ext_node_t *e1, sp_ext_node_t *e2); +static int meta_sp_cmp_by_offset(sp_ext_node_t *e1, sp_ext_node_t *e2); +static void meta_sp_list_insert(mdsetname_t *sp, mdname_t *np, + sp_ext_node_t **head, sp_ext_offset_t offset, sp_ext_length_t length, + sp_ext_type_t type, uint_t seq, uint_t flags, ext_cmpfunc_t compare); +static void 
meta_sp_list_free(sp_ext_node_t **head); +static void meta_sp_list_remove(sp_ext_node_t **head, sp_ext_node_t *ext); +static sp_ext_length_t meta_sp_list_size(sp_ext_node_t *head, + sp_ext_type_t exttype, int exclude_wm); +static sp_ext_node_t *meta_sp_list_find(sp_ext_node_t *head, + sp_ext_offset_t offset); +static void meta_sp_list_freefill(sp_ext_node_t **extlist, + sp_ext_length_t size); +static void meta_sp_list_dump(sp_ext_node_t *head); +static int meta_sp_list_overlaps(sp_ext_node_t *head); + +/* Extent List Query Functions */ +static boolean_t meta_sp_enough_space(int desired_number_of_sps, + blkcnt_t desired_sp_size, sp_ext_node_t **extent_listpp, + sp_ext_length_t alignment); +static boolean_t meta_sp_get_extent_list(mdsetname_t *mdsetnamep, + mdname_t *device_mdnamep, sp_ext_node_t **extent_listpp, + md_error_t *ep); +static boolean_t meta_sp_get_extent_list_for_drive(mdsetname_t *mdsetnamep, + mddrivename_t *mddrivenamep, sp_ext_node_t **extent_listpp); + + +/* Extent Allocation Functions */ +static void meta_sp_alloc_by_ext(mdsetname_t *sp, mdname_t *np, + sp_ext_node_t **extlist, sp_ext_node_t *free_ext, + sp_ext_offset_t alloc_offset, sp_ext_length_t alloc_length, uint_t seq); +static int meta_sp_alloc_by_len(mdsetname_t *sp, mdname_t *np, + sp_ext_node_t **extlist, sp_ext_length_t *lp, + sp_ext_offset_t last_off, sp_ext_length_t alignment); +static int meta_sp_alloc_by_list(mdsetname_t *sp, mdname_t *np, + sp_ext_node_t **extlist, sp_ext_node_t *oblist); + +/* Extent List Population Functions */ +static int meta_sp_extlist_from_namelist(mdsetname_t *sp, mdnamelist_t *spnlp, + sp_ext_node_t **extlist, md_error_t *ep); +static int meta_sp_extlist_from_wm(mdsetname_t *sp, mdname_t *compnp, + sp_ext_node_t **extlist, ext_cmpfunc_t compare, md_error_t *ep); + +/* Print (metastat) Functions */ +static int meta_sp_short_print(md_sp_t *msp, char *fname, FILE *fp, + mdprtopts_t options, md_error_t *ep); +static char *meta_sp_status_to_name(xsp_status_t 
xsp_status, uint_t tstate); +static int meta_sp_report(mdsetname_t *sp, md_sp_t *msp, mdnamelist_t **nlpp, + char *fname, FILE *fp, mdprtopts_t options, md_error_t *ep); + +/* Watermark Manipulation Functions */ +static int meta_sp_update_wm(mdsetname_t *sp, md_sp_t *msp, + sp_ext_node_t *extlist, md_error_t *ep); +static int meta_sp_clear_wm(mdsetname_t *sp, md_sp_t *msp, md_error_t *ep); +static int meta_sp_read_wm(mdsetname_t *sp, mdname_t *compnp, + mp_watermark_t *wm, sp_ext_offset_t offset, md_error_t *ep); +static diskaddr_t meta_sp_get_start(mdsetname_t *sp, mdname_t *compnp, + md_error_t *ep); + +/* Unit Structure Manipulation Functions */ +static void meta_sp_fillextarray(mp_unit_t *mp, sp_ext_node_t *extlist); +static mp_unit_t *meta_sp_createunit(mdname_t *np, mdname_t *compnp, + sp_ext_node_t *extlist, int numexts, sp_ext_length_t len, + sp_status_t status, md_error_t *ep); +static mp_unit_t *meta_sp_updateunit(mdname_t *np, mp_unit_t *old_un, + sp_ext_node_t *extlist, sp_ext_length_t grow_len, int numexts, + md_error_t *ep); +static int meta_create_sp(mdsetname_t *sp, md_sp_t *msp, sp_ext_node_t *oblist, + mdcmdopts_t options, sp_ext_length_t alignment, md_error_t *ep); +static int meta_check_sp(mdsetname_t *sp, md_sp_t *msp, mdcmdopts_t options, + int *repart_options, md_error_t *ep); + +/* Reset (metaclear) Functions */ +static int meta_sp_reset_common(mdsetname_t *sp, mdname_t *np, md_sp_t *msp, + md_sp_reset_t reset_params, mdcmdopts_t options, md_error_t *ep); + +/* Recovery (metarecover) Functions */ +static void meta_sp_display_exthdr(void); +static void meta_sp_display_ext(sp_ext_node_t *ext); +static int meta_sp_checkseq(sp_ext_node_t *extlist); +static int meta_sp_resolve_name_conflict(mdsetname_t *, mdname_t *, + mdname_t **, md_error_t *); +static int meta_sp_validate_wm(mdsetname_t *sp, mdname_t *np, + mdcmdopts_t options, md_error_t *ep); +static int meta_sp_validate_unit(mdsetname_t *sp, mdname_t *compnp, + mdcmdopts_t options, 
md_error_t *ep); +static int meta_sp_validate_wm_and_unit(mdsetname_t *sp, mdname_t *np, + mdcmdopts_t options, md_error_t *ep); +static int meta_sp_validate_exts(mdname_t *np, sp_ext_node_t *wmext, + sp_ext_node_t *unitext, md_error_t *ep); +static int meta_sp_recover_from_wm(mdsetname_t *sp, mdname_t *compnp, + mdcmdopts_t options, md_error_t *ep); +static int meta_sp_recover_from_unit(mdsetname_t *sp, mdname_t *np, + mdcmdopts_t options, md_error_t *ep); + +/* + * Private Constants + */ + +static const int FORCE_RELOAD_CACHE = 1; +static const uint_t NO_FLAGS = 0; +static const sp_ext_offset_t NO_OFFSET = 0ULL; +static const uint_t NO_SEQUENCE_NUMBER = 0; +static const int ONE_SOFT_PARTITION = 1; + +static unsigned long sp_parent_printed[BT_BITOUL(MD_MAXUNITS)]; + +#define TEST_SOFT_PARTITION_NAMEP NULL +#define TEST_SETNAMEP NULL + +#define EXCLUDE_WM (1) +#define INCLUDE_WM (0) + +#define SP_UNALIGNED (0LL) + +/* + * ************************************************************************** + * Debugging Functions * + * ************************************************************************** + */ + +/*PRINTFLIKE1*/ +static void +meta_sp_debug(char *format, ...) +{ + static int debug; + static int debug_set = 0; + va_list ap; + + if (!debug_set) { + debug = getenv(META_SP_DEBUG) ? 
1 : 0; + debug_set = 1; + } + + if (debug) { + va_start(ap, format); + (void) vfprintf(stderr, format, ap); + va_end(ap); + } +} + +static void +meta_sp_printunit(mp_unit_t *mp) +{ + int i; + + if (mp == NULL) + return; + + /* print the common fields we know about */ + (void) fprintf(stderr, "\tmp->c.un_type: %d\n", mp->c.un_type); + (void) fprintf(stderr, "\tmp->c.un_size: %u\n", mp->c.un_size); + (void) fprintf(stderr, "\tmp->c.un_self_id: %lu\n", MD_SID(mp)); + + /* sp-specific fields */ + (void) fprintf(stderr, "\tmp->un_status: %u\n", mp->un_status); + (void) fprintf(stderr, "\tmp->un_numexts: %u\n", mp->un_numexts); + (void) fprintf(stderr, "\tmp->un_length: %llu\n", mp->un_length); + (void) fprintf(stderr, "\tmp->un_dev(32): 0x%llx\n", mp->un_dev); + (void) fprintf(stderr, "\tmp->un_dev(64): 0x%llx\n", mp->un_dev); + (void) fprintf(stderr, "\tmp->un_key: %d\n", mp->un_key); + + /* print extent information */ + (void) fprintf(stderr, "\tExt#\tvoff\t\tpoff\t\tLen\n"); + for (i = 0; i < mp->un_numexts; i++) { + (void) fprintf(stderr, "\t%d\t%llu\t\t%llu\t\t%llu\n", i, + mp->un_ext[i].un_voff, mp->un_ext[i].un_poff, + mp->un_ext[i].un_len); + } +} + +/* + * FUNCTION: meta_sp_parsesize() + * INPUT: s - the string to parse + * OUTPUT: *szp - disk block count (0 for "all") + * RETURNS: -1 for error, 0 for success + * PURPOSE: parses the command line parameter that specifies the + * requested size of a soft partition. The input string + * is either the literal "all" or a numeric value + * followed by a single character, b for disk blocks, k + * for kilobytes, m for megabytes, g for gigabytes, or t + * for terabytes. p for petabytes and e for exabytes + * have been added as undocumented features for future + * expansion. For example, 100m is 100 megabytes, while + * 50g is 50 gigabytes. All values are rounded up to the + * nearest block size. 
+ */ +int +meta_sp_parsesize(char *s, sp_ext_length_t *szp) +{ + if (s == NULL || szp == NULL) { + return (-1); + } + + /* Check for literal "all" */ + if (strcasecmp(s, "all") == 0) { + *szp = 0; + return (0); + } + + return (meta_sp_parsesizestring(s, szp)); +} + +/* + * FUNCTION: meta_sp_parsesizestring() + * INPUT: s - the string to parse + * OUTPUT: *szp - disk block count + * RETURNS: -1 for error, 0 for success + * PURPOSE: parses a string that specifies size. The input string is a + * numeric value followed by a single character, b for disk blocks, + * k for kilobytes, m for megabytes, g for gigabytes, or t for + * terabytes. p for petabytes and e for exabytes have been added + * as undocumented features for future expansion. For example, + * 100m is 100 megabytes, while 50g is 50 gigabytes. All values + * are rounded up to the nearest block size. + */ +static int +meta_sp_parsesizestring(char *s, sp_ext_length_t *szp) +{ + sp_ext_length_t len = 0; + char len_type[2]; + + if (s == NULL || szp == NULL) { + return (-1); + } + + /* + * make sure block offset does not overflow 2^64 bytes. 
+ */ + if ((sscanf(s, "%llu%1[BbKkMmGgTt]", &len, len_type) != 2) || + (len == 0LL) || + (len > (1LL << (64 - DEV_BSHIFT)))) + return (-1); + + switch (len_type[0]) { + case 'B': + case 'b': + len = lbtodb(roundup(len * DEV_BSIZE, DEV_BSIZE)); + break; + case 'K': + case 'k': + len = lbtodb(roundup(len * 1024ULL, DEV_BSIZE)); + break; + case 'M': + case 'm': + len = lbtodb(roundup(len * 1024ULL*1024ULL, DEV_BSIZE)); + break; + case 'g': + case 'G': + len = lbtodb(roundup(len * 1024ULL*1024ULL*1024ULL, DEV_BSIZE)); + break; + case 't': + case 'T': + len = lbtodb(roundup(len * 1024ULL*1024ULL*1024ULL*1024ULL, + DEV_BSIZE)); + break; + case 'p': + case 'P': + len = lbtodb(roundup( + len * 1024ULL*1024ULL*1024ULL*1024ULL*1024ULL, + DEV_BSIZE)); + break; + case 'e': + case 'E': + len = lbtodb(roundup( + len * 1024ULL*1024ULL*1024ULL*1024ULL*1024ULL*1024ULL, + DEV_BSIZE)); + break; + default: + /* error */ + return (-1); + } + + *szp = len; + return (0); +} + +/* + * FUNCTION: meta_sp_setgeom() + * INPUT: np - the underlying device to setup geometry for + * compnp - the underlying device to setup geometry for + * mp - the unit structure to set the geometry for + * OUTPUT: ep - return error pointer + * RETURNS: int - -1 if error, 0 otherwise + * PURPOSE: establishes geometry information for a device + */ +static int +meta_sp_setgeom( + mdname_t *np, + mdname_t *compnp, + mp_unit_t *mp, + md_error_t *ep +) +{ + mdgeom_t *geomp; + uint_t round_cyl = 0; + + if ((geomp = metagetgeom(compnp, ep)) == NULL) + return (-1); + if (meta_setup_geom((md_unit_t *)mp, np, geomp, geomp->write_reinstruct, + geomp->read_reinstruct, round_cyl, ep) != 0) + return (-1); + + return (0); +} + +/* + * FUNCTION: meta_sp_setstatus() + * INPUT: sp - the set name for the devices to set the status on + * minors - an array of minor numbers of devices to set status on + * num_units - number of entries in the array + * status - status value to set all units to + * OUTPUT: ep - return error pointer + * 
RETURNS: int - -1 if error, 0 success + * PURPOSE: sets the status of one or more soft partitions to the + * requested value + */ +int +meta_sp_setstatus( + mdsetname_t *sp, + minor_t *minors, + int num_units, + sp_status_t status, + md_error_t *ep +) +{ + md_sp_statusset_t status_params; + + assert(minors != NULL); + + /* update status of all soft partitions to the status passed in */ + (void) memset(&status_params, 0, sizeof (status_params)); + status_params.num_units = num_units; + status_params.new_status = status; + /* size is the byte length of the minors array passed to the driver */ + status_params.size = num_units * sizeof (minor_t); + status_params.minors = (uintptr_t)minors; + MD_SETDRIVERNAME(&status_params, MD_SP, sp->setno); + if (metaioctl(MD_IOC_SPSTATUS, &status_params, &status_params.mde, + NULL) != 0) { + /* hand the ioctl's error over to the caller's ep */ + (void) mdstealerror(ep, &status_params.mde); + return (-1); + } + return (0); +} + +/* + * FUNCTION: meta_get_sp_names() + * INPUT: sp - the set name to get soft partitions from + * options - options from the command line + * OUTPUT: nlpp - list of all soft partition names + * ep - return error pointer + * RETURNS: int - the value returned by meta_get_names(), passed + * through unchanged (callers in this file treat a + * negative value as an error) + * PURPOSE: returns a list of all soft partitions in the metadb + * for all devices in the specified set + */ +int +meta_get_sp_names( + mdsetname_t *sp, + mdnamelist_t **nlpp, + int options, + md_error_t *ep +) +{ + return (meta_get_names(MD_SP, sp, nlpp, options, ep)); +} + +/* + * FUNCTION: meta_sp_get_by_component() + * INPUT: sp - the set name to get soft partitions from + * compnp - the name of the device containing the soft + * partitions that will be returned + * force - 0 - reads cached namelist if available, + * 1 - reloads cached namelist, frees old namelist + * OUTPUT: nlpp - list of all soft partition names + * ep - return error pointer + * RETURNS: int - -1 error, otherwise the number of soft partitions + * found on the component (0 = none found). 
+ * PURPOSE: returns a list of all soft partitions on a given device + * from the metadb information + */ +static int +meta_sp_get_by_component( + mdsetname_t *sp, + mdname_t *compnp, + mdnamelist_t **nlpp, + int force, + md_error_t *ep +) +{ + static mdnamelist_t *cached_list = NULL; /* cached namelist */ + static int cached_count = 0; /* cached count */ + mdnamelist_t *spnlp = NULL; /* all sp names */ + mdnamelist_t *namep; /* list iterator */ + mdnamelist_t **tailpp = nlpp; /* namelist tail */ + mdnamelist_t **cachetailpp; /* cache tail */ + md_sp_t *msp; /* unit structure */ + int count = 0; /* count of sp's */ + int err; + mdname_t *curnp; + + if ((cached_list != NULL) && (!force)) { + /* return a copy of the cached list */ + for (namep = cached_list; namep != NULL; namep = namep->next) + tailpp = meta_namelist_append_wrapper(tailpp, + namep->namep); + return (cached_count); + } + + /* free the cache and reset values to zeros to prepare for a new list */ + metafreenamelist(cached_list); + cached_count = 0; + cached_list = NULL; + cachetailpp = &cached_list; + *nlpp = NULL; + + /* get all the softpartitions first of all */ + if (meta_get_sp_names(sp, &spnlp, 0, ep) < 0) + return (-1); + + /* + * Now for each sp, see if it resides on the component we + * are interested in, if so then add it to our list + */ + for (namep = spnlp; namep != NULL; namep = namep->next) { + curnp = namep->namep; + + /* get the unit structure */ + if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL) + continue; + + /* + * If the current soft partition is not on the same + * component, continue the search. If it is on the same + * component, add it to our namelist. + */ + err = meta_check_samedrive(compnp, msp->compnamep, ep); + if (err <= 0) { + /* not on the same device, check the next one */ + continue; + } + + /* it's on the same drive */ + + /* + * Check for overlapping partitions if the component is not + * a metadevice. 
+ */ + if (!metaismeta(msp->compnamep)) { + /* + * if they're on the same drive, neither + * should be a metadevice if one isn't + */ + assert(!metaismeta(compnp)); + + if (meta_check_overlap(msp->compnamep->cname, + compnp, 0, -1, msp->compnamep, 0, -1, ep) == 0) + continue; + + /* in this case it's not an error for them to overlap */ + mdclrerror(ep); + } + + /* Component is on the same device, add to the used list */ + tailpp = meta_namelist_append_wrapper(tailpp, curnp); + cachetailpp = meta_namelist_append_wrapper(cachetailpp, + curnp); + + ++count; + ++cached_count; + } + + assert(count == cached_count); + return (count); + +out: + metafreenamelist(*nlpp); + *nlpp = NULL; + return (-1); +} + +/* + * FUNCTION: meta_sp_get_default_alignment() + * INPUT: sp - the pertinent set name + * compnp - the name of the underlying component + * OUTPUT: ep - return error pointer + * RETURNS: sp_ext_length_t =0: no default alignment + * >0: default alignment + * PURPOSE: returns the default alignment for soft partitions to + * be built on top of the specified component or + * metadevice + */ +static sp_ext_length_t +meta_sp_get_default_alignment( + mdsetname_t *sp, + mdname_t *compnp, + md_error_t *ep +) +{ + sp_ext_length_t a = SP_UNALIGNED; + char *mname; + + assert(compnp != NULL); + + /* + * We treat raw devices as opaque, and assume nothing about + * their alignment requirements. + */ + if (!metaismeta(compnp)) + return (SP_UNALIGNED); + + /* + * We already know it's a metadevice from the previous test; + * metagetmiscname() will tell us which metadevice type we + * have + */ + mname = metagetmiscname(compnp, ep); + if (mname == NULL) + goto out; + + /* + * For a mirror, we want to deal with the stripe that is the + * primary side. If it happens to be asymmetrically + * configured, there is no simple way to fake a universal + * alignment. 
There's a chance that the least common + * denominator of the set of interlaces from all stripes of + * all submirrors would do it, but nobody that really cared + * that much about this issue would create an asymmetric + * config to start with. + * + * If the component underlying the soft partition is a mirror, + * then at the exit of this loop, compnp will have been + * updated to describe the first active submirror. + */ + if (strcmp(mname, MD_MIRROR) == 0) { + md_mirror_t *mp; + int smi; + md_submirror_t *smp; + + mp = meta_get_mirror(sp, compnp, ep); + if (mp == NULL) + goto out; + + for (smi = 0; smi < NMIRROR; smi++) { + + smp = &mp->submirrors[smi]; + /* skip slots that hold no submirror */ + if (smp->state == SMS_UNUSED) + continue; + + compnp = smp->submirnamep; + assert(compnp != NULL); + + mname = metagetmiscname(compnp, ep); + if (mname == NULL) + goto out; + + break; + } + + /* every slot was SMS_UNUSED: leave the alignment unset */ + if (smi == NMIRROR) + goto out; + } + + /* + * Handle stripes and submirrors identically; just return the + * interlace of the first row. + */ + if (strcmp(mname, MD_STRIPE) == 0) { + md_stripe_t *stp; + + stp = meta_get_stripe(sp, compnp, ep); + if (stp == NULL) + goto out; + + a = stp->rows.rows_val[0].interlace; + goto out; + } + + /* + * Raid is even more straightforward; the interlace applies to + * the entire device. + */ + if (strcmp(mname, MD_RAID) == 0) { + md_raid_t *rp; + + rp = meta_get_raid(sp, compnp, ep); + if (rp == NULL) + goto out; + + a = rp->interlace; + goto out; + } + + /* + * If we have arrived here with the alignment still not set, + * then we expect the error to have been set by one of the + * routines we called. If neither is the case, something has + * really gone wrong above. (Probably the submirror walk + * failed to produce a valid submirror, but that would be + * really bad...) + */ +out: + meta_sp_debug("meta_sp_get_default_alignment: miscname %s, " + "alignment %lld\n", (mname == NULL) ? 
"NULL" : mname, a); + + if (getenv(META_SP_DEBUG) && !mdisok(ep)) { + mde_perror(ep, NULL); + } + + /* either an alignment was found or an error must have been recorded */ + assert((a > 0) || (!mdisok(ep))); + + return (a); +} + + + +/* + * FUNCTION: meta_check_insp() + * INPUT: sp - the set name for the device to check + * np - the name of the device to check + * slblk - the starting offset of the device to check + * nblks - the number of blocks in the device to check + * OUTPUT: ep - return error pointer + * RETURNS: int - 0 - no soft partitions were found on the device + * -1 - the soft partition lookup failed + * otherwise - soft partitions exist on the device; + * the value returned by mduseerror() for + * MDE_ALREADY is passed back + * PURPOSE: determines whether a device contains any soft partitions + */ +/* ARGSUSED */ +int +meta_check_insp( + mdsetname_t *sp, + mdname_t *np, + diskaddr_t slblk, + diskaddr_t nblks, + md_error_t *ep +) +{ + mdnamelist_t *spnlp = NULL; /* soft partition name list */ + int count; + int rval; + + /* check set pointer */ + assert(sp != NULL); + + /* find all soft partitions on the component */ + count = meta_sp_get_by_component(sp, np, &spnlp, 0, ep); + + if (count == -1) { + rval = -1; + } else if (count > 0) { + /* report the first soft partition found as the current user */ + rval = mduseerror(ep, MDE_ALREADY, np->dev, + spnlp->namep->cname, np->cname); + } else { + rval = 0; + } + + metafreenamelist(spnlp); + return (rval); +} + +/* + * ************************************************************************** + * Extent List Manipulation Functions * + * ************************************************************************** + */ + +/* + * FUNCTION: meta_sp_cmp_by_nameseq() + * INPUT: e1 - first node to compare + * e2 - second node to compare + * OUTPUT: none + * RETURNS: int - =0 - nodes are equal + * <0 - e1 should go before e2 + * >0 - e1 should go after e2 + * PURPOSE: used for sorted list inserts to build a list sorted by + * name first and sequence number second. 
+ */ +static int +meta_sp_cmp_by_nameseq(sp_ext_node_t *e1, sp_ext_node_t *e2) +{ + int rval; + + if (e1->ext_namep == NULL) + return (1); + if (e2->ext_namep == NULL) + return (-1); + if ((rval = strcmp(e1->ext_namep->cname, e2->ext_namep->cname)) != 0) + return (rval); + + /* the names are equal, compare sequence numbers */ + if (e1->ext_seq > e2->ext_seq) + return (1); + if (e1->ext_seq < e2->ext_seq) + return (-1); + /* sequence numbers are also equal */ + return (0); +} + +/* + * FUNCTION: meta_sp_cmp_by_offset() + * INPUT: e1 - first node to compare + * e2 - second node to compare + * OUTPUT: none + * RETURNS: int - =0 - nodes are equal + * <0 - e1 should go before e2 + * >0 - e1 should go after e2 + * PURPOSE: used for sorted list inserts to build a list sorted by offset + */ +static int +meta_sp_cmp_by_offset(sp_ext_node_t *e1, sp_ext_node_t *e2) +{ + if (e1->ext_offset > e2->ext_offset) + return (1); + if (e1->ext_offset < e2->ext_offset) + return (-1); + /* offsets are equal */ + return (0); +} + +/* + * FUNCTION: meta_sp_list_insert() + * INPUT: sp - the set name for the device the node belongs to + * np - the name of the device the node belongs to + * head - the head of the list, must be NULL for empty list + * offset - the physical offset of this extent in sectors + * length - the length of this extent in sectors + * type - the type of the extent being inserted + * seq - the sequence number of the extent being inserted + * flags - extent flags (eg. whether it needs to be updated) + * compare - the compare function to use + * OUTPUT: head - points to the new head if a node was inserted + * at the beginning + * RETURNS: void + * PURPOSE: inserts an extent node into a sorted doubly linked list. + * The sort order is determined by the compare function. + * Memory is allocated for the node in this function and it + * is up to the caller to free it, possibly using + * meta_sp_list_free(). 
If a node is inserted at the + * beginning of the list, the head pointer is updated to + * point to the new first node. + */ +static void +meta_sp_list_insert( + mdsetname_t *sp, + mdname_t *np, + sp_ext_node_t **head, + sp_ext_offset_t offset, + sp_ext_length_t length, + sp_ext_type_t type, + uint_t seq, + uint_t flags, + ext_cmpfunc_t compare +) +{ + sp_ext_node_t *newext; + sp_ext_node_t *curext; + + assert(head != NULL); + + /* Don't bother adding zero length nodes */ + if (length == 0ULL) + return; + + /* allocate and fill in new ext_node */ + newext = Zalloc(sizeof (sp_ext_node_t)); + + newext->ext_offset = offset; + newext->ext_length = length; + newext->ext_flags = flags; + newext->ext_type = type; + newext->ext_seq = seq; + newext->ext_setp = sp; + newext->ext_namep = np; + + /* first node in the list */ + if (*head == NULL) { + newext->ext_next = newext->ext_prev = NULL; + *head = newext; + } else if ((*compare)(*head, newext) >= 0) { + /* the first node has a bigger offset, so insert before it */ + assert((*head)->ext_prev == NULL); + + newext->ext_prev = NULL; + newext->ext_next = *head; + (*head)->ext_prev = newext; + *head = newext; + } else { + /* + * find the next node whose offset is greater than + * the one we want to insert, or the end of the list. 
+ */ + for (curext = *head; + (curext->ext_next != NULL) && + ((*compare)(curext->ext_next, newext) < 0); + (curext = curext->ext_next)) + ; + + /* link the new node in after the current node */ + newext->ext_next = curext->ext_next; + newext->ext_prev = curext; + + if (curext->ext_next != NULL) + curext->ext_next->ext_prev = newext; + + curext->ext_next = newext; + } +} + +/* + * FUNCTION: meta_sp_list_free() + * INPUT: head - the head of the list, must be NULL for empty list + * OUTPUT: head - points to NULL on return + * RETURNS: void + * PURPOSE: walks a double linked extent list and frees each node + */ +static void +meta_sp_list_free(sp_ext_node_t **head) +{ + sp_ext_node_t *ext; + sp_ext_node_t *next; + + assert(head != NULL); + + ext = *head; + while (ext) { + next = ext->ext_next; + Free(ext); + ext = next; + } + *head = NULL; +} + +/* + * FUNCTION: meta_sp_list_remove() + * INPUT: head - the head of the list, must be NULL for empty list + * ext - the extent to remove, must be a member of the list + * OUTPUT: head - points to the new head of the list + * RETURNS: void + * PURPOSE: unlinks the node specified by ext from the list and + * frees it, possibly moving the head pointer forward if + * the head is the node being removed. + */ +static void +meta_sp_list_remove(sp_ext_node_t **head, sp_ext_node_t *ext) +{ + assert(head != NULL); + assert(*head != NULL); + + if (*head == ext) + *head = ext->ext_next; + + if (ext->ext_prev != NULL) + ext->ext_prev->ext_next = ext->ext_next; + if (ext->ext_next != NULL) + ext->ext_next->ext_prev = ext->ext_prev; + Free(ext); +} + +/* + * FUNCTION: meta_sp_list_size() + * INPUT: head - the head of the list, must be NULL for empty list + * exttype - the type of the extents to sum + * exclude_wm - subtract space for extent headers from total + * OUTPUT: none + * RETURNS: sp_ext_length_t - the sum of all of the lengths + * PURPOSE: sums the lengths of all extents in the list matching the + * specified type. 
This could be used for computing the + * amount of free or used space, for example. + */ +static sp_ext_length_t +meta_sp_list_size(sp_ext_node_t *head, sp_ext_type_t exttype, int exclude_wm) +{ + sp_ext_node_t *ext; + sp_ext_length_t size = 0LL; + + for (ext = head; ext != NULL; ext = ext->ext_next) + if (ext->ext_type == exttype) + size += ext->ext_length - + ((exclude_wm) ? MD_SP_WMSIZE : 0); + + return (size); +} + +/* + * FUNCTION: meta_sp_list_find() + * INPUT: head - the head of the list, must be NULL for empty list + * offset - the offset contained by the node to find + * OUTPUT: none + * RETURNS: sp_ext_node_t * - the node containing the requested offset + * or NULL if no such nodes were found. + * PURPOSE: finds a node in a list containing the requested offset + * (inclusive). If multiple nodes contain this offset then + * only the first will be returned, though typically these + * lists are managed with non-overlapping nodes. + * + * *The list MUST be sorted by offset for this function to work.* + */ +static sp_ext_node_t * +meta_sp_list_find( + sp_ext_node_t *head, + sp_ext_offset_t offset +) +{ + sp_ext_node_t *ext; + + for (ext = head; ext != NULL; ext = ext->ext_next) { + /* check if the offset lies within this extent */ + if ((offset >= ext->ext_offset) && + (offset < ext->ext_offset + ext->ext_length)) { + /* + * the requested extent should always be a + * subset of an extent in the list. + */ + return (ext); + } + } + return (NULL); +} + +/* + * FUNCTION: meta_sp_list_freefill() + * INPUT: head - the head of the list, must be NULL for empty list + * size - the size of the volume this extent list is + * representing + * OUTPUT: head - the new head of the list + * RETURNS: void + * PURPOSE: finds gaps in the extent list and fills them with a free + * node. If there is a gap at the beginning the head + * pointer will be changed to point to the new free node. 
+ * If there is free space at the end, the last free extent + * will extend all the way out to the size specified. + * + * *The list MUST be sorted by offset for this function to work.* + */ +static void +meta_sp_list_freefill( + sp_ext_node_t **head, + sp_ext_length_t size +) +{ + sp_ext_node_t *ext; + sp_ext_offset_t curoff = 0LL; + + for (ext = *head; ext != NULL; ext = ext->ext_next) { + if (curoff < ext->ext_offset) + meta_sp_list_insert(NULL, NULL, head, + curoff, ext->ext_offset - curoff, + EXTTYP_FREE, 0, 0, meta_sp_cmp_by_offset); + curoff = ext->ext_offset + ext->ext_length; + } + + /* pad inverse list out to the end */ + if (curoff < size) + meta_sp_list_insert(NULL, NULL, head, curoff, size - curoff, + EXTTYP_FREE, 0, 0, meta_sp_cmp_by_offset); + + if (getenv(META_SP_DEBUG)) { + meta_sp_debug("meta_sp_list_freefill: Extent list with " + "holes freefilled:\n"); + meta_sp_list_dump(*head); + } +} + +/* + * FUNCTION: meta_sp_list_dump() + * INPUT: head - the head of the list, must be NULL for empty list + * OUTPUT: none + * RETURNS: void + * PURPOSE: dumps the entire extent list to stdout for easy debugging + */ +static void +meta_sp_list_dump(sp_ext_node_t *head) +{ + sp_ext_node_t *ext; + + meta_sp_debug("meta_sp_list_dump: dumping extent list:\n"); + meta_sp_debug("%5s %10s %5s %7s %10s %10s %5s %10s %10s\n", "Name", + "Addr", "Seq#", "Type", "Offset", "Length", "Flags", "Prev", + "Next"); + for (ext = head; ext != NULL; ext = ext->ext_next) { + if (ext->ext_namep != NULL) + meta_sp_debug("%5s", ext->ext_namep->cname); + else + meta_sp_debug("%5s", "NONE"); + + meta_sp_debug("%10p %5u ", (void *) ext, ext->ext_seq); + switch (ext->ext_type) { + case EXTTYP_ALLOC: + meta_sp_debug("%7s ", "ALLOC"); + break; + case EXTTYP_FREE: + meta_sp_debug("%7s ", "FREE"); + break; + case EXTTYP_END: + meta_sp_debug("%7s ", "END"); + break; + case EXTTYP_RESERVED: + meta_sp_debug("%7s ", "RESV"); + break; + default: + meta_sp_debug("%7s ", "INVLD"); + break; + } + + 
meta_sp_debug("%10llu %10llu %5u %10p %10p\n", + ext->ext_offset, ext->ext_length, + ext->ext_flags, (void *) ext->ext_prev, + (void *) ext->ext_next); + } + meta_sp_debug("\n"); +} + +/* + * FUNCTION: meta_sp_list_overlaps() + * INPUT: head - the head of the list, must be NULL for empty list + * OUTPUT: none + * RETURNS: int - 1 if extents overlap, 0 if ok + * PURPOSE: checks a list for overlaps. The list MUST be sorted by + * offset for this function to work properly. + */ +static int +meta_sp_list_overlaps(sp_ext_node_t *head) +{ + sp_ext_node_t *ext; + + for (ext = head; ext->ext_next != NULL; ext = ext->ext_next) { + if (ext->ext_offset + ext->ext_length > + ext->ext_next->ext_offset) + return (1); + } + return (0); +} + +/* + * ************************************************************************** + * Extent Allocation Functions * + * ************************************************************************** + */ + +/* + * FUNCTION: meta_sp_alloc_by_ext() + * INPUT: sp - the set name for the device the node belongs to + * np - the name of the device the node belongs to + * head - the head of the list, must be NULL for empty list + * free_ext - the free extent being allocated from + * alloc_offset - the offset of the allocation + * alloc_len - the length of the allocation + * seq - the sequence number of the allocation + * OUTPUT: head - the new head pointer + * RETURNS: void + * PURPOSE: allocates a portion of the free extent free_ext. The + * allocated portion starts at alloc_offset and is + * alloc_length long. Both (alloc_offset) and (alloc_offset + + * alloc_length) must be contained within the free extent. + * + * The free extent is split into as many as 3 pieces - a + * free extent containing [ free_offset .. alloc_offset ), an + * allocated extent containing the range [ alloc_offset .. + * alloc_end ], and another free extent containing the + * range ( alloc_end .. free_end ]. 
If either of the two
 *		new free extents would be zero length, they are not created.
 *
 *		Finally, the original free extent is removed.  All newly
 *		created extents have the EXTFLG_UPDATE flag set.
 */
static void
meta_sp_alloc_by_ext(
	mdsetname_t	*sp,
	mdname_t	*np,
	sp_ext_node_t	**head,
	sp_ext_node_t	*free_ext,
	sp_ext_offset_t	alloc_offset,
	sp_ext_length_t	alloc_length,
	uint_t		seq
)
{
	/* capture the bounds now; free_ext is released further down */
	sp_ext_offset_t	free_start = free_ext->ext_offset;
	sp_ext_offset_t	free_end = free_start + free_ext->ext_length;
	sp_ext_offset_t	alloc_end = alloc_offset + alloc_length;

	/* allocated extent must be a subset of the free extent */
	assert(free_start <= alloc_offset);
	assert(free_end >= alloc_end);

	meta_sp_list_remove(head, free_ext);

	/* free space left in front of the allocation */
	if (alloc_offset > free_start) {
		meta_sp_list_insert(NULL, NULL, head, free_start,
		    (alloc_offset - free_start), EXTTYP_FREE, 0,
		    EXTFLG_UPDATE, meta_sp_cmp_by_offset);
	}

	/* free space left behind the allocation */
	if (alloc_end < free_end) {
		meta_sp_list_insert(NULL, NULL, head, alloc_end,
		    (free_end - alloc_end), EXTTYP_FREE, 0, EXTFLG_UPDATE,
		    meta_sp_cmp_by_offset);
	}

	/* and the allocation itself */
	meta_sp_list_insert(sp, np, head, alloc_offset, alloc_length,
	    EXTTYP_ALLOC, seq, EXTFLG_UPDATE, meta_sp_cmp_by_offset);

	if (getenv(META_SP_DEBUG)) {
		meta_sp_debug("meta_sp_alloc_by_ext: extent list:\n");
		meta_sp_list_dump(*head);
	}
}

/*
 * FUNCTION:	meta_sp_alloc_by_len()
 * INPUT:	sp	- the set name for the device the node belongs to
 *		np	- the name of the device the node belongs to
 *		head	- the head of the list, must be NULL for empty list
 *		*lp	- the requested length to allocate
 *		last_off	- the last offset already allocated.
+ * alignment - the desired extent alignmeent + * OUTPUT: head - the new head pointer + * *lp - the length allocated + * RETURNS: int - -1 if error, the number of new extents on success + * PURPOSE: allocates extents from free space to satisfy the requested + * length. If requested length is zero, allocates all + * remaining free space. This function provides the meat + * of the extent allocation algorithm. Allocation is a + * three tier process: + * + * 1. If last_off is nonzero and there is free space following + * that node, then it is extended to allocate as much of that + * free space as possible. This is useful for metattach. + * 2. If a free extent can be found to satisfy the remaining + * requested space, then satisfy the rest of the request + * from that extent. + * 3. Start allocating space from any remaining free extents until + * the remainder of the request is satisified. + * + * If alignment is non-zero, then every extent modified + * or newly allocated will be aligned modulo alignment, + * with a length that is an integer multiple of + * alignment. + * + * The EXTFLG_UPDATE flag is set for all nodes (free and + * allocated) that require updated watermarks. + * + * This algorithm may have a negative impact on fragmentation + * in pathological cases and may be improved if it turns out + * to be a problem. This may be exacerbated by particularly + * large alignments. + * + * NOTE: It's confusing, so it demands an explanation: + * - len is used to represent requested data space; it + * does not include room for a watermark. On each full + * or partial allocation, len will be decremented by + * alloc_len (see next paragraph) until it reaches + * zero. + * - alloc_len is used to represent data space allocated + * from a particular extent; it does not include space + * for a watermark. 
In the rare event that a_length + * (see next paragraph) is equal to MD_SP_WMSIZE, + * alloc_len will be zero and the resulting MD_SP_WMSIZE + * fragment of space will be utterly unusable. + * - a_length is used to represent all space to be + * allocated from a particular extent; it DOES include + * space for a watermark. + */ +static int +meta_sp_alloc_by_len( + mdsetname_t *sp, + mdname_t *np, + sp_ext_node_t **head, + sp_ext_length_t *lp, + sp_ext_offset_t last_off, + sp_ext_offset_t alignment +) +{ + sp_ext_node_t *free_ext; + sp_ext_node_t *alloc_ext; + uint_t last_seq = 0; + uint_t numexts = 0; + sp_ext_length_t freespace; + sp_ext_length_t alloc_len; + sp_ext_length_t len; + + /* We're DOA if we can't read *lp */ + assert(lp != NULL); + len = *lp; + + /* + * Process the nominal case first: we've been given an actual + * size argument, rather than the literal "all" + */ + + if (len != 0) { + + /* + * Short circuit the check for free space. This may + * tell us we have enough space when we really don't + * because each extent loses space to a watermark, but + * it will always tell us there isn't enough space + * correctly. Worst case we do some extra work. + */ + freespace = meta_sp_list_size(*head, EXTTYP_FREE, + INCLUDE_WM); + + if (freespace < len) + return (-1); + + /* + * First see if we can extend the last extent for an + * attach. + */ + if (last_off != 0LL) { + int align = 0; + + alloc_ext = + meta_sp_list_find(*head, last_off); + assert(alloc_ext != NULL); + + /* + * The offset test reflects the + * inclusion of the watermark in the extent + */ + align = (alignment > 0) && + (((alloc_ext->ext_offset + MD_SP_WMSIZE) % + alignment) == 0); + + /* + * If we decided not to align here, we should + * also reset "alignment" so we don't bother + * later, either. 
+ */ + if (!align) { + alignment = 0; + } + + last_seq = alloc_ext->ext_seq; + + free_ext = meta_sp_list_find(*head, + alloc_ext->ext_offset + + alloc_ext->ext_length); + + /* + * If a free extent follows our last allocated + * extent, then remove the last allocated + * extent and increase the size of the free + * extent to overlap it, then allocate the + * total space from the new free extent. + */ + if (free_ext != NULL && + free_ext->ext_type == EXTTYP_FREE) { + assert(free_ext->ext_offset == + alloc_ext->ext_offset + + alloc_ext->ext_length); + + alloc_len = + MIN(len, free_ext->ext_length); + + if (align && (alloc_len < len)) { + /* No watermark space needed */ + alloc_len -= alloc_len % alignment; + } + + if (alloc_len > 0) { + free_ext->ext_offset -= + alloc_ext->ext_length; + free_ext->ext_length += + alloc_ext->ext_length; + + meta_sp_alloc_by_ext(sp, np, head, + free_ext, free_ext->ext_offset, + alloc_ext->ext_length + alloc_len, + last_seq); + + /* + * now remove the original allocated + * node. We may have overlapping + * extents for a short time before + * this node is removed. + */ + meta_sp_list_remove(head, alloc_ext); + len -= alloc_len; + } + } + last_seq++; + } + + if (len == 0LL) + goto out; + + /* + * Next, see if we can find a single allocation for + * the remainder. This may make fragmentation worse + * in some cases, but there's no good way to allocate + * that doesn't have a highly fragmented corner case. 
+ */ + for (free_ext = *head; free_ext != NULL; + free_ext = free_ext->ext_next) { + sp_ext_offset_t a_offset; + sp_ext_offset_t a_length; + + if (free_ext->ext_type != EXTTYP_FREE) + continue; + + /* + * The length test should include space for + * the watermark + */ + + a_offset = free_ext->ext_offset; + a_length = free_ext->ext_length; + + if (alignment > 0) { + + /* + * Shortcut for extents that have been + * previously added to pad out the + * data space + */ + if (a_length < alignment) { + continue; + } + + /* + * Round up so the data space begins + * on a properly aligned boundary. + */ + a_offset += alignment - + (a_offset % alignment) - MD_SP_WMSIZE; + + /* + * This is only necessary in case the + * watermark size is ever greater than + * one. It'll never happen, of + * course; we'll get rid of watermarks + * before we make 'em bigger. + */ + if (a_offset < free_ext->ext_offset) { + a_offset += alignment; + } + + /* + * Adjust the length to account for + * the space lost above (if any) + */ + a_length -= + (a_offset - free_ext->ext_offset); + } + + if (a_length >= len + MD_SP_WMSIZE) { + meta_sp_alloc_by_ext(sp, np, head, + free_ext, a_offset, + len + MD_SP_WMSIZE, last_seq); + + len = 0LL; + numexts++; + break; + } + } + + if (len == 0LL) + goto out; + + + /* + * If the request could not be satisfied by extending + * the last extent or by a single extent, then put + * multiple smaller extents together until the request + * is satisfied. 
+ */ + for (free_ext = *head; (free_ext != NULL) && (len > 0); + free_ext = free_ext->ext_next) { + sp_ext_offset_t a_offset; + sp_ext_length_t a_length; + + if (free_ext->ext_type != EXTTYP_FREE) + continue; + + a_offset = free_ext->ext_offset; + a_length = free_ext->ext_length; + + if (alignment > 0) { + + /* + * Shortcut for extents that have been + * previously added to pad out the + * data space + */ + if (a_length < alignment) { + continue; + } + + /* + * Round up so the data space begins + * on a properly aligned boundary. + */ + a_offset += alignment - + (a_offset % alignment) - MD_SP_WMSIZE; + + /* + * This is only necessary in case the + * watermark size is ever greater than + * one. It'll never happen, of + * course; we'll get rid of watermarks + * before we make 'em bigger. + */ + if (a_offset < free_ext->ext_offset) { + a_offset += alignment; + } + + /* + * Adjust the length to account for + * the space lost above (if any) + */ + a_length -= + (a_offset - free_ext->ext_offset); + + /* + * Adjust the length to be properly + * aligned if it is NOT to be the + * last extent in the soft partition. + */ + if ((a_length - MD_SP_WMSIZE) < len) + a_length -= + (a_length - MD_SP_WMSIZE) + % alignment; + } + + alloc_len = MIN(len, a_length - MD_SP_WMSIZE); + if (alloc_len == 0) + continue; + + /* + * meta_sp_alloc_by_ext() expects the + * allocation length to include the watermark + * size, which is why we don't simply pass in + * alloc_len here. + */ + meta_sp_alloc_by_ext(sp, np, head, free_ext, + a_offset, MIN(len + MD_SP_WMSIZE, a_length), + last_seq); + + len -= alloc_len; + numexts++; + last_seq++; + } + + + /* + * If there was not enough space we can throw it all + * away since no real work has been done yet. + */ + if (len != 0) { + meta_sp_list_free(head); + return (-1); + } + } + + /* + * Otherwise, the literal "all" was specified: allocate all + * available free space. Don't bother with alignment. 
+ */ + else { + /* First, extend the last extent if this is a grow */ + if (last_off != 0LL) { + alloc_ext = + meta_sp_list_find(*head, last_off); + assert(alloc_ext != NULL); + + last_seq = alloc_ext->ext_seq; + + free_ext = meta_sp_list_find(*head, + alloc_ext->ext_offset + + alloc_ext->ext_length); + + /* + * If a free extent follows our last allocated + * extent, then remove the last allocated + * extent and increase the size of the free + * extent to overlap it, then allocate the + * total space from the new free extent. + */ + if (free_ext != NULL && + free_ext->ext_type == EXTTYP_FREE) { + assert(free_ext->ext_offset == + alloc_ext->ext_offset + + alloc_ext->ext_length); + + len = alloc_len = + free_ext->ext_length; + + free_ext->ext_offset -= + alloc_ext->ext_length; + free_ext->ext_length += + alloc_ext->ext_length; + + meta_sp_alloc_by_ext(sp, np, head, + free_ext, free_ext->ext_offset, + alloc_ext->ext_length + alloc_len, + last_seq); + + /* + * now remove the original allocated + * node. We may have overlapping + * extents for a short time before + * this node is removed. + */ + meta_sp_list_remove(head, alloc_ext); + } + + last_seq++; + } + + /* Next, grab all remaining free space */ + for (free_ext = *head; free_ext != NULL; + free_ext = free_ext->ext_next) { + + if (free_ext->ext_type == EXTTYP_FREE) { + alloc_len = + free_ext->ext_length - MD_SP_WMSIZE; + if (alloc_len == 0) + continue; + + /* + * meta_sp_alloc_by_ext() expects the + * allocation length to include the + * watermark size, which is why we + * don't simply pass in alloc_len + * here. 
+ */ + meta_sp_alloc_by_ext(sp, np, head, + free_ext, free_ext->ext_offset, + free_ext->ext_length, + last_seq); + + len += alloc_len; + numexts++; + last_seq++; + } + } + } + +out: + if (getenv(META_SP_DEBUG)) { + meta_sp_debug("meta_sp_alloc_by_len: Extent list after " + "allocation:\n"); + meta_sp_list_dump(*head); + } + + if (*lp == 0) { + *lp = len; + + /* + * Make sure the callers hit a no space error if we + * didn't actually find anything. + */ + if (len == 0) { + return (-1); + } + } + + return (numexts); +} + +/* + * FUNCTION: meta_sp_alloc_by_list() + * INPUT: sp - the set name for the device the node belongs to + * np - the name of the device the node belongs to + * head - the head of the list, must be NULL for empty list + * oblist - an extent list containing requested nodes to allocate + * OUTPUT: head - the new head pointer + * RETURNS: int - -1 if error, the number of new extents on success + * PURPOSE: allocates extents from free space to satisfy the requested + * extent list. This is primarily used for the -o/-b options + * where the user may specifically request extents to allocate. + * Each extent in the oblist must be a subset (inclusive) of a + * free extent and may not overlap each other. This + * function sets the EXTFLG_UPDATE flag for each node that + * requires a watermark update after allocating. 
+ */ +static int +meta_sp_alloc_by_list( + mdsetname_t *sp, + mdname_t *np, + sp_ext_node_t **head, + sp_ext_node_t *oblist +) +{ + sp_ext_node_t *ext; + sp_ext_node_t *free_ext; + uint_t numexts = 0; + + for (ext = oblist; ext != NULL; ext = ext->ext_next) { + + free_ext = meta_sp_list_find(*head, + ext->ext_offset - MD_SP_WMSIZE); + + /* Make sure the allocation is within the free extent */ + if ((free_ext == NULL) || + (ext->ext_offset + ext->ext_length > + free_ext->ext_offset + free_ext->ext_length) || + (free_ext->ext_type != EXTTYP_FREE)) + return (-1); + + meta_sp_alloc_by_ext(sp, np, head, free_ext, + ext->ext_offset - MD_SP_WMSIZE, + ext->ext_length + MD_SP_WMSIZE, ext->ext_seq); + + numexts++; + } + + assert(meta_sp_list_overlaps(*head) == 0); + + if (getenv(META_SP_DEBUG)) { + meta_sp_debug("meta_sp_alloc_by_list: Extent list after " + "allocation:\n"); + meta_sp_list_dump(*head); + } + + return (numexts); +} + +/* + * ************************************************************************** + * Extent List Population Functions * + * ************************************************************************** + */ + +/* + * FUNCTION: meta_sp_extlist_from_namelist() + * INPUT: sp - the set name for the device the node belongs to + * spnplp - the namelist of soft partitions to build a list from + * OUTPUT: extlist - the extent list built from the SPs in the namelist + * ep - return error pointer + * RETURNS: int - -1 if error, 0 on success + * PURPOSE: builds an extent list representing the soft partitions + * specified in the namelist. Each extent in each soft + * partition is added to the list with the type EXTTYP_ALLOC. + * The EXTFLG_UPDATE flag is not set on any nodes. Each + * extent in the list includes the space occupied by the + * watermark, which is not included in the unit structures. 
+ */ +static int +meta_sp_extlist_from_namelist( + mdsetname_t *sp, + mdnamelist_t *spnlp, + sp_ext_node_t **extlist, + md_error_t *ep +) +{ + int extn; + md_sp_t *msp; /* unit structure of the sp's */ + mdnamelist_t *namep; + + assert(sp != NULL); + + /* + * Now go through the soft partitions and add a node to the used + * list for each allocated extent. + */ + for (namep = spnlp; namep != NULL; namep = namep->next) { + mdname_t *curnp = namep->namep; + + /* get the unit structure */ + if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL) + return (-1); + + for (extn = 0; (extn < msp->ext.ext_len); extn++) { + md_sp_ext_t *extp = &msp->ext.ext_val[extn]; + + /* + * subtract from offset and add to the length + * to account for the watermark, which is not + * contained in the extents in the unit structure. + */ + meta_sp_list_insert(sp, curnp, extlist, + extp->poff - MD_SP_WMSIZE, extp->len + MD_SP_WMSIZE, + EXTTYP_ALLOC, extn, 0, meta_sp_cmp_by_offset); + } + } + return (0); +} + +/* + * FUNCTION: meta_sp_extlist_from_wm() + * INPUT: sp - the set name for the device the node belongs to + * compnp - the name of the device to scan watermarks on + * OUTPUT: extlist - the extent list built from the SPs in the namelist + * ep - return error pointer + * RETURNS: int - -1 if error, 0 on success + * PURPOSE: builds an extent list representing the soft partitions + * specified in the namelist. Each extent in each soft + * partition is added to the list with the type EXTTYP_ALLOC. + * The EXTFLG_UPDATE flag is not set on any nodes. Each + * extent in the list includes the space occupied by the + * watermark, which is not included in the unit structures. 
+ */ +static int +meta_sp_extlist_from_wm( + mdsetname_t *sp, + mdname_t *compnp, + sp_ext_node_t **extlist, + ext_cmpfunc_t compare, + md_error_t *ep +) +{ + mp_watermark_t wm; + mdname_t *np = NULL; + mdsetname_t *spsetp = NULL; + sp_ext_offset_t cur_off; + + if ((cur_off = meta_sp_get_start(sp, compnp, ep)) == MD_DISKADDR_ERROR) + return (-1); + + for (;;) { + if (meta_sp_read_wm(sp, compnp, &wm, cur_off, ep) != 0) { + return (-1); + } + + /* get the set and name pointers */ + if (strcmp(wm.wm_setname, MD_SP_LOCALSETNAME) != 0) { + if ((spsetp = metasetname(wm.wm_setname, ep)) == NULL) { + return (-1); + } + } + + if (strcmp(wm.wm_mdname, MD_SP_FREEWMNAME) != 0) { + if (meta_init_make_device(&sp, wm.wm_mdname, ep) != 0) + return (-1); + np = metaname(&spsetp, wm.wm_mdname, ep); + if (np == NULL) { + return (-1); + } + } + + /* insert watermark into extent list */ + meta_sp_list_insert(spsetp, np, extlist, cur_off, + wm.wm_length + MD_SP_WMSIZE, wm.wm_type, wm.wm_seq, + EXTFLG_UPDATE, compare); + + /* if we see the end watermark, we're done */ + if (wm.wm_type == EXTTYP_END) + break; + + cur_off += wm.wm_length + 1; + + /* clear out set and name pointers for next iteration */ + np = NULL; + spsetp = NULL; + } + + return (0); +} + +/* + * ************************************************************************** + * Print (metastat) Functions * + * ************************************************************************** + */ + +/* + * FUNCTION: meta_sp_short_print() + * INPUT: msp - the unit structure to display + * fp - the file pointer to send output to + * options - print options from the command line processor + * OUTPUT: ep - return error pointer + * RETURNS: int - -1 if error, 0 on success + * PURPOSE: display a short report of the soft partition in md.tab + * form, primarily used for metastat -p. 
+ */ +static int +meta_sp_short_print( + md_sp_t *msp, + char *fname, + FILE *fp, + mdprtopts_t options, + md_error_t *ep +) +{ + int extn; + + if (options & PRINT_LARGEDEVICES) { + if (msp->common.revision != MD_64BIT_META_DEV) + return (0); + } + + /* print name and -p */ + if (fprintf(fp, "%s -p", msp->common.namep->cname) == EOF) + return (mdsyserror(ep, errno, fname)); + + /* print the component */ + /* + * If the path is our standard /dev/rdsk or /dev/md/rdsk + * then just print out the cxtxdxsx or the dx, metainit + * will assume the default, otherwise we need the full + * pathname to make sure this works as we intend. + */ + if ((strstr(msp->compnamep->rname, "/dev/rdsk") == NULL) && + (strstr(msp->compnamep->rname, "/dev/md/rdsk") == NULL) && + (strstr(msp->compnamep->rname, "/dev/td/") == NULL)) { + /* not standard path so print full pathname */ + if (fprintf(fp, " %s", msp->compnamep->rname) == EOF) + return (mdsyserror(ep, errno, fname)); + } else { + /* standard path so print ctds or d number */ + if (fprintf(fp, " %s", msp->compnamep->cname) == EOF) + return (mdsyserror(ep, errno, fname)); + } + + /* print out each extent */ + for (extn = 0; (extn < msp->ext.ext_len); extn++) { + md_sp_ext_t *extp = &msp->ext.ext_val[extn]; + if (fprintf(fp, " -o %llu -b %llu ", extp->poff, + extp->len) == EOF) + return (mdsyserror(ep, errno, fname)); + } + + if (fprintf(fp, "\n") == EOF) + return (mdsyserror(ep, errno, fname)); + + /* success */ + return (0); +} + +/* + * FUNCTION: meta_sp_status_to_name() + * INPUT: xsp_status - the status value to convert to a string + * tstate - transient errored device state. If set the + * device is Unavailable + * OUTPUT: none + * RETURNS: char * - a pointer to the string representing the status value + * PURPOSE: return an internationalized string representing the + * status value for a soft partition. The strings are + * strdup'd and must be freed by the caller. 
+ */ +static char * +meta_sp_status_to_name( + xsp_status_t xsp_status, + uint_t tstate +) +{ + char *rval = NULL; + + /* + * Check to see if we have MD_INACCESSIBLE set. This is the only valid + * value for an 'Unavailable' return. tstate can be set because of + * other multi-node reasons (e.g. ABR being set) + */ + if (tstate & MD_INACCESSIBLE) { + return (Strdup(dgettext(TEXT_DOMAIN, "Unavailable"))); + } + + switch (xsp_status) { + case MD_SP_CREATEPEND: + rval = Strdup(dgettext(TEXT_DOMAIN, "Creating")); + break; + case MD_SP_GROWPEND: + rval = Strdup(dgettext(TEXT_DOMAIN, "Growing")); + break; + case MD_SP_DELPEND: + rval = Strdup(dgettext(TEXT_DOMAIN, "Deleting")); + break; + case MD_SP_OK: + rval = Strdup(dgettext(TEXT_DOMAIN, "Okay")); + break; + case MD_SP_ERR: + rval = Strdup(dgettext(TEXT_DOMAIN, "Errored")); + break; + case MD_SP_RECOVER: + rval = Strdup(dgettext(TEXT_DOMAIN, "Recovering")); + break; + } + + if (rval == NULL) + rval = Strdup(dgettext(TEXT_DOMAIN, "Invalid")); + + return (rval); +} + +/* + * FUNCTION: meta_sp_report() + * INPUT: sp - the set name for the unit being displayed + * msp - the unit structure to display + * nlpp - pass back the large devs + * fp - the file pointer to send output to + * options - print options from the command line processor + * OUTPUT: ep - return error pointer + * RETURNS: int - -1 if error, 0 on success + * PURPOSE: print a full report of the device specified + */ +static int +meta_sp_report( + mdsetname_t *sp, + md_sp_t *msp, + mdnamelist_t **nlpp, + char *fname, + FILE *fp, + mdprtopts_t options, + md_error_t *ep +) +{ + uint_t extn; + char *status; + char *devid = ""; + mdname_t *didnp = NULL; + ddi_devid_t dtp; + int len; + uint_t tstate = 0; + + if (options & PRINT_LARGEDEVICES) { + if (msp->common.revision != MD_64BIT_META_DEV) { + return (0); + } else { + if (meta_getdevs(sp, msp->common.namep, nlpp, ep) != 0) + return (-1); + } + } + + if (options & PRINT_HEADER) { + if (fprintf(fp, 
dgettext(TEXT_DOMAIN, "%s: Soft Partition\n"), + msp->common.namep->cname) == EOF) + return (mdsyserror(ep, errno, fname)); + } + + if (fprintf(fp, dgettext(TEXT_DOMAIN, " Device: %s\n"), + msp->compnamep->cname) == EOF) + return (mdsyserror(ep, errno, fname)); + + /* Determine if device is available before displaying status */ + if (metaismeta(msp->common.namep)) { + if (meta_get_tstate(msp->common.namep->dev, &tstate, ep) != 0) + return (-1); + } + status = meta_sp_status_to_name(msp->status, tstate & MD_DEV_ERRORED); + + /* print out "State" to be consistent with other metadevices */ + if (tstate & MD_ABR_CAP) { + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " State: %s - Application Based Recovery (ABR)\n"), + status) == EOF) { + Free(status); + return (mdsyserror(ep, errno, fname)); + } + } else { + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " State: %s\n"), status) == EOF) { + Free(status); + return (mdsyserror(ep, errno, fname)); + } + } + free(status); + + if (fprintf(fp, dgettext(TEXT_DOMAIN, " Size: %llu blocks (%s)\n"), + msp->common.size, + meta_number_to_string(msp->common.size, DEV_BSIZE)) == EOF) + return (mdsyserror(ep, errno, fname)); + + /* print component details */ + if (! metaismeta(msp->compnamep)) { + diskaddr_t start_blk; + int has_mddb; + char *has_mddb_str; + + /* print header */ + /* + * Building a format string on the fly that will + * be used in (f)printf. This allows the length + * of the ctd to vary from small to large without + * looking horrible. 
+ */ + len = strlen(msp->compnamep->cname); + len = max(len, strlen(dgettext(TEXT_DOMAIN, "Device"))); + len += 2; + if (fprintf(fp, + "\t%-*.*s %-12.12s %-5.5s %s\n", + len, len, + dgettext(TEXT_DOMAIN, "Device"), + dgettext(TEXT_DOMAIN, "Start Block"), + dgettext(TEXT_DOMAIN, "Dbase"), + dgettext(TEXT_DOMAIN, "Reloc")) == EOF) { + return (mdsyserror(ep, errno, fname)); + } + + + /* get info */ + if ((start_blk = meta_sp_get_start(sp, msp->compnamep, ep)) == + MD_DISKADDR_ERROR) + return (-1); + + if ((has_mddb = metahasmddb(sp, msp->compnamep, ep)) < 0) + return (-1); + + if (has_mddb) + has_mddb_str = dgettext(TEXT_DOMAIN, "Yes"); + else + has_mddb_str = dgettext(TEXT_DOMAIN, "No"); + + /* populate the key in the name_p structure */ + didnp = metadevname(&sp, msp->compnamep->dev, ep); + if (didnp == NULL) { + return (-1); + } + + /* determine if devid does NOT exist */ + if (options & PRINT_DEVID) { + if ((dtp = meta_getdidbykey(sp->setno, getmyside(sp, ep), + didnp->key, ep)) == NULL) + devid = dgettext(TEXT_DOMAIN, "No "); + else { + devid = dgettext(TEXT_DOMAIN, "Yes"); + free(dtp); + } + } + + /* print info */ + /* + * This allows the length + * of the ctd to vary from small to large without + * looking horrible. 
+ */ + if (fprintf(fp, "\t%-*s %8lld %-5.5s %s\n", + len, msp->compnamep->cname, + start_blk, has_mddb_str, devid) == EOF) { + return (mdsyserror(ep, errno, fname)); + } + (void) fprintf(fp, "\n"); + } + + + /* print the headers */ + if (fprintf(fp, "\t%6.6s %24.24s %24.24s\n", + dgettext(TEXT_DOMAIN, "Extent"), + dgettext(TEXT_DOMAIN, "Start Block"), + dgettext(TEXT_DOMAIN, "Block count")) == EOF) + return (mdsyserror(ep, errno, fname)); + + /* print out each extent */ + for (extn = 0; (extn < msp->ext.ext_len); extn++) { + md_sp_ext_t *extp = &msp->ext.ext_val[extn]; + + /* If PRINT_TIMES option is ever supported, add output here */ + if (fprintf(fp, "\t%6u %24llu %24llu\n", + extn, extp->poff, extp->len) == EOF) + return (mdsyserror(ep, errno, fname)); + } + + /* separate records with a newline */ + (void) fprintf(fp, "\n"); + return (0); +} + +/* + * FUNCTION: meta_sp_print() + * INPUT: sp - the set name for the unit being displayed + * np - the name of the device to print + * fname - ??? not used + * fp - the file pointer to send output to + * options - print options from the command line processor + * OUTPUT: ep - return error pointer + * RETURNS: int - -1 if error, 0 on success + * PURPOSE: print a full report of the device specified by metastat. + * This is the main entry point for printing. 
+ */ +int +meta_sp_print( + mdsetname_t *sp, + mdname_t *np, + mdnamelist_t **nlpp, + char *fname, + FILE *fp, + mdprtopts_t options, + md_error_t *ep +) +{ + md_sp_t *msp; + md_unit_t *mdp; + int rval = 0; + + /* should always have the same set */ + assert(sp != NULL); + + /* print all the soft partitions */ + if (np == NULL) { + mdnamelist_t *nlp = NULL; + mdnamelist_t *p; + int cnt; + + if ((cnt = meta_get_sp_names(sp, &nlp, options, ep)) < 0) + return (-1); + else if (cnt == 0) + return (0); + + /* recusively print them out */ + for (p = nlp; (p != NULL); p = p->next) { + mdname_t *curnp = p->namep; + + /* + * one problem with the rval of -1 here is that + * the error gets "lost" when the next device is + * printed, but we want to print them all anyway. + */ + rval = meta_sp_print(sp, curnp, nlpp, fname, fp, + options, ep); + } + + /* clean up, return success */ + metafreenamelist(nlp); + return (rval); + } + + /* get the unit structure */ + if ((msp = meta_get_sp_common(sp, np, + ((options & PRINT_FAST) ? 1 : 0), ep)) == NULL) + return (-1); + + /* check for parented */ + if ((! (options & PRINT_SUBDEVS)) && + (MD_HAS_PARENT(msp->common.parent))) { + return (0); + } + + /* print appropriate detail */ + if (options & PRINT_SHORT) { + if (meta_sp_short_print(msp, fname, fp, options, ep) != 0) + return (-1); + } else { + if (meta_sp_report(sp, msp, nlpp, fname, fp, options, ep) != 0) + return (-1); + } + + /* + * Print underlying metadevices if they are parented to us and + * if the info for the underlying metadevice has not been printed. 
+ */ + if (metaismeta(msp->compnamep)) { + /* get the unit structure for the subdevice */ + if ((mdp = meta_get_mdunit(sp, msp->compnamep, ep)) == NULL) + return (-1); + + /* If info not already printed, recurse */ + if (!BT_TEST(sp_parent_printed, MD_MIN2UNIT(MD_SID(mdp)))) { + if (meta_print_name(sp, msp->compnamep, nlpp, fname, fp, + (options | PRINT_HEADER | PRINT_SUBDEVS), + NULL, ep) != 0) { + return (-1); + } + BT_SET(sp_parent_printed, MD_MIN2UNIT(MD_SID(mdp))); + } + } + return (0); +} + +/* + * ************************************************************************** + * Watermark Manipulation Functions * + * ************************************************************************** + */ + +/* + * FUNCTION: meta_sp_get_start() + * INPUT: sp - the operating set + * np - device upon which the sp is being built + * OUTPUT: ep - return error pointer + * RETURNS: daddr_t - -1 if error, otherwise the start block + * PURPOSE: Encapsulate the determination of the start block of the + * device upon which the sp is built or being built. + * This is done to hide the ugliness of the algorithm. In + * the case where a sp is being built upon a stripe of > 1 + * TB that is made up of a set of disks in which the first + * has a VTOC label the result returned from the call to + * metagetstart is incorrect. The reason being that a > 1 + * TB metadevice will manufacture an EFI label in which the + * start address is zero. This is irrespective of the underlying + * devices. The long term fix for this is to fix + * meta_efi_to_mdvtoc and meta_efi_to mdgeom so that they return + * values that are indicative of the first underlying device in + * metadevice. 
+ */ +static diskaddr_t +meta_sp_get_start( + mdsetname_t *sp, + mdname_t *np, + md_error_t *ep +) +{ + daddr_t start_block; + + if ((start_block = metagetstart(sp, np, ep)) != MD_DISKADDR_ERROR) { + start_block += MD_SP_START; + /* + * In the case that the device upon which the sp is being + * created is a metadevice then ensure that in the case that + * the first underlying device has a vtoc label that it is + * not overwritten with a watermark by setting the start block + * to point just past the vtoc label + */ + if (start_block < VTOC_SIZE && metaismeta(np)) + start_block = VTOC_SIZE; + } + + return (start_block); +} + +/* + * FUNCTION: meta_sp_update_wm() + * INPUT: sp - the operating set + * msp - a pointer to the XDR unit structure + * extlist - the extent list specifying watermarks to update + * OUTPUT: ep - return error pointer + * RETURNS: int - -1 if error, 0 on success + * PURPOSE: steps backwards through the extent list updating + * watermarks for all extents with the EXTFLG_UPDATE flag + * set. Writing the watermarks guarantees consistency when + * extents must be broken into pieces since the original + * watermark will be the last to be updated, and will be + * changed to point to a new watermark that is already + * known to be consistent. If one of the writes fails, the + * original watermark stays intact and none of the changes + * are realized. 
+ */ +static int +meta_sp_update_wm( + mdsetname_t *sp, + md_sp_t *msp, + sp_ext_node_t *extlist, + md_error_t *ep +) +{ + sp_ext_node_t *ext; + sp_ext_node_t *tail; + mp_watermark_t *wmp, *watermarks; + xsp_offset_t *osp, *offsets; + int update_count = 0; + int rval = 0; + md_unit_t *mdp; + md_sp_update_wm_t update_params; + + if (getenv(META_SP_DEBUG)) { + meta_sp_debug("meta_sp_update_wm: Updating watermarks:\n"); + meta_sp_list_dump(extlist); + } + + /* + * find the last node so we can write the watermarks backwards + * and count watermarks to update so we can allocate space + */ + for (ext = extlist; ext != NULL; ext = ext->ext_next) { + if ((ext->ext_flags & EXTFLG_UPDATE) != 0) { + update_count++; + } + + if (ext->ext_next == NULL) { + tail = ext; + } + } + ext = tail; + + wmp = watermarks = + Zalloc(update_count * sizeof (mp_watermark_t)); + osp = offsets = + Zalloc(update_count * sizeof (sp_ext_offset_t)); + + while (ext != NULL) { + if ((ext->ext_flags & EXTFLG_UPDATE) != 0) { + /* update watermark */ + wmp->wm_magic = MD_SP_MAGIC; + wmp->wm_version = MD_SP_VERSION; + wmp->wm_type = ext->ext_type; + wmp->wm_seq = ext->ext_seq; + wmp->wm_length = ext->ext_length - MD_SP_WMSIZE; + + /* fill in the volume name and set name */ + if (ext->ext_namep != NULL) + (void) strcpy(wmp->wm_mdname, + ext->ext_namep->cname); + else + (void) strcpy(wmp->wm_mdname, MD_SP_FREEWMNAME); + if (ext->ext_setp != NULL && + ext->ext_setp->setno != MD_LOCAL_SET) + (void) strcpy(wmp->wm_setname, + ext->ext_setp->setname); + else + (void) strcpy(wmp->wm_setname, + MD_SP_LOCALSETNAME); + + /* Generate the checksum */ + wmp->wm_checksum = 0; + crcgen((uchar_t *)wmp, (uint_t *)&wmp->wm_checksum, + sizeof (*wmp), NULL); + + /* record the extent offset */ + *osp = ext->ext_offset; + + /* Advance the placeholders */ + osp++; wmp++; + } + ext = ext->ext_prev; + } + + mdp = meta_get_mdunit(sp, msp->common.namep, ep); + if (mdp == NULL) { + rval = -1; + goto out; + } + + (void) 
memset(&update_params, 0, sizeof (update_params)); + update_params.mnum = MD_SID(mdp); + update_params.count = update_count; + update_params.wmp = (uintptr_t)watermarks; + update_params.osp = (uintptr_t)offsets; + MD_SETDRIVERNAME(&update_params, MD_SP, + MD_MIN2SET(update_params.mnum)); + + if (metaioctl(MD_IOC_SPUPDATEWM, &update_params, + &update_params.mde, msp->common.namep->cname) != 0) { + (void) mdstealerror(ep, &update_params.mde); + rval = -1; + goto out; + } + +out: + Free(watermarks); + Free(offsets); + + return (rval); +} + +/* + * FUNCTION: meta_sp_clear_wm() + * INPUT: sp - the operating set + * msp - the unit structure for the soft partition to clear + * OUTPUT: ep - return error pointer + * RETURNS: int - -1 if error, 0 on success + * PURPOSE: steps through the extents for a soft partition unit and + * creates an extent list designed to mark all of the + * watermarks for those extents as free. The extent list + * is then passed to meta_sp_update_wm() to actually write + * the watermarks out. 
+ */ +static int +meta_sp_clear_wm( + mdsetname_t *sp, + md_sp_t *msp, + md_error_t *ep +) +{ + sp_ext_node_t *extlist = NULL; + int numexts = msp->ext.ext_len; + uint_t i; + int rval = 0; + + /* for each watermark must set the flag to SP_FREE */ + for (i = 0; i < numexts; i++) { + md_sp_ext_t *extp = &msp->ext.ext_val[i]; + + meta_sp_list_insert(NULL, NULL, &extlist, + extp->poff - MD_SP_WMSIZE, extp->len + MD_SP_WMSIZE, + EXTTYP_FREE, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset); + } + + /* update watermarks */ + rval = meta_sp_update_wm(sp, msp, extlist, ep); + + meta_sp_list_free(&extlist); + return (rval); +} + +/* + * FUNCTION: meta_sp_read_wm() + * INPUT: sp - setname for component + * compnp - mdname_t for component + * offset - the offset of the watermark to read (sectors) + * OUTPUT: wm - the watermark structure to read into + * ep - return error pointer + * RETURNS: int - -1 if error, 0 on success + * PURPOSE: seeks out to the requested offset and reads a watermark. + * It then verifies that the magic number is correct and + * that the checksum is valid, returning an error if either + * is wrong. + */ +static int +meta_sp_read_wm( + mdsetname_t *sp, + mdname_t *compnp, + mp_watermark_t *wm, + sp_ext_offset_t offset, + md_error_t *ep +) +{ + md_sp_read_wm_t read_params; + + /* + * make sure block offset does not overflow 2^64 bytes and it's a + * multiple of the block size. 
+ */ + assert(offset <= (1LL << (64 - DEV_BSHIFT))); + /* LINTED */ + assert((sizeof (*wm) % DEV_BSIZE) == 0); + + (void) memset(wm, 0, sizeof (*wm)); + + (void) memset(&read_params, 0, sizeof (read_params)); + read_params.rdev = compnp->dev; + read_params.wmp = (uintptr_t)wm; + read_params.offset = offset; + MD_SETDRIVERNAME(&read_params, MD_SP, sp->setno); + + if (metaioctl(MD_IOC_SPREADWM, &read_params, + &read_params.mde, compnp->cname) != 0) { + + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "Extent header read failed, block %llu.\n"), offset); + return (mdstealerror(ep, &read_params.mde)); + } + + /* make sure magic number is correct */ + if (wm->wm_magic != MD_SP_MAGIC) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "found incorrect magic number %x, expected %x.\n"), + wm->wm_magic, MD_SP_MAGIC); + /* + * Pass NULL for the device name as we don't have + * valid watermark contents. + */ + return (mdmderror(ep, MDE_SP_BADWMMAGIC, 0, NULL)); + } + + if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum, + sizeof (*wm), NULL)) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "found incorrect checksum %x.\n"), + wm->wm_checksum); + return (mdmderror(ep, MDE_SP_BADWMCRC, 0, wm->wm_mdname)); + } + + return (0); +} + +/* + * ************************************************************************** + * Query Functions + * ************************************************************************** + */ + +/* + * IMPORTANT NOTE: This is a static function that assumes that + * its input parameters have been checked and + * have valid values that lie within acceptable + * ranges. 
+ * + * FUNCTION: meta_sp_enough_space() + * INPUT: desired_number_of_sps - the number of soft partitions desired; + * must be > 0 + * desired_sp_size - the desired soft partition size in blocks; + * must be > 0 + * extent_listpp - a reference to a reference to an extent + * list that lists the extents on a device; + * must be a reference to a reference to a + * valid extent list + * alignment - the desired data space alignment for the sp's + * OUTPUT: boolean_t return value + * RETURNS: boolean_t - B_TRUE if there's enough space in the extent + * list to create the desired soft partitions, + * B_FALSE if there's not enough space + * PURPOSE: determines whether there's enough free space in an extent + * list to allow creation of a set of soft partitions + */ +static boolean_t +meta_sp_enough_space( + int desired_number_of_sps, + blkcnt_t desired_sp_size, + sp_ext_node_t **extent_listpp, + sp_ext_length_t alignment +) +{ + boolean_t enough_space; + int number_of_sps; + int number_of_extents_used; + sp_ext_length_t desired_ext_length = desired_sp_size; + + enough_space = B_TRUE; + number_of_sps = 0; + while ((enough_space == B_TRUE) && + (number_of_sps < desired_number_of_sps)) { + /* + * Use the extent allocation algorithm implemented by + * meta_sp_alloc_by_len() to test whether the free + * extents in the extent list referenced by *extent_listpp + * contain enough space to accomodate a soft partition + * of size desired_ext_length. + * + * Repeat the test <desired_number_of_sps> times + * or until it fails, whichever comes first, + * each time allocating the extents required to + * create the soft partition without actually + * creating the soft partition. 
+ */ + number_of_extents_used = meta_sp_alloc_by_len( + TEST_SETNAMEP, + TEST_SOFT_PARTITION_NAMEP, + extent_listpp, + &desired_ext_length, + NO_OFFSET, + alignment); + if (number_of_extents_used == -1) { + enough_space = B_FALSE; + } else { + number_of_sps++; + } + } + return (enough_space); +} + +/* + * IMPORTANT NOTE: This is a static function that calls other functions + * that check its mdsetnamep and device_mdnamep + * input parameters, but expects extent_listpp to + * be a initialized to a valid address to which + * it can write a reference to the extent list that + * it creates. + * + * FUNCTION: meta_sp_get_extent_list() + * INPUT: mdsetnamep - a reference to the mdsetname_t structure + * for the set containing the device for + * which the extents are to be listed + * device_mdnamep - a reference to the mdname_t structure + * for the device for which the extents + * are to be listed + * OUTPUT: *extent_listpp - a reference to the extent list for + * the device; NULL if the function fails + * *ep - the libmeta error encountered, if any + * RETURNS: boolean_t - B_TRUE if the function call was successful, + * B_FALSE if not + * PURPOSE: gets the extent list for a device + */ +static boolean_t +meta_sp_get_extent_list( + mdsetname_t *mdsetnamep, + mdname_t *device_mdnamep, + sp_ext_node_t **extent_listpp, + md_error_t *ep +) +{ + diskaddr_t device_size_in_blocks; + mdnamelist_t *sp_name_listp; + diskaddr_t start_block_address_in_blocks; + + *extent_listpp = NULL; + sp_name_listp = NULL; + + start_block_address_in_blocks = meta_sp_get_start(mdsetnamep, + device_mdnamep, + ep); + if (start_block_address_in_blocks == MD_DISKADDR_ERROR) { + if (getenv(META_SP_DEBUG)) { + mde_perror(ep, "meta_sp_get_extent_list:meta_sp_get_start"); + } + return (B_FALSE); + } + + device_size_in_blocks = metagetsize(device_mdnamep, ep); + if (device_size_in_blocks == MD_DISKADDR_ERROR) { + if (getenv(META_SP_DEBUG)) { + mde_perror(ep, + "meta_sp_get_extent_list:metagetsize"); + } + 
return (B_FALSE); + } + + /* + * Sanity check: the start block will have skipped an integer + * number of cylinders, C. C will usually be zero. If (C > 0), + * and the disk slice happens to only be C cylinders in total + * size, we'll fail this check. + */ + if (device_size_in_blocks <= + (start_block_address_in_blocks + MD_SP_WMSIZE)) { + (void) mdmderror(ep, MDE_SP_NOSPACE, 0, device_mdnamep->cname); + return (B_FALSE); + } + + /* + * After this point, we will have allocated resources, so any + * failure returns must be through the supplied "fail" label + * to properly deallocate things. + */ + + /* + * Create an empty extent list that starts one watermark past + * the start block of the device and ends one watermark before + * the end of the device. + */ + meta_sp_list_insert(TEST_SETNAMEP, + TEST_SOFT_PARTITION_NAMEP, + extent_listpp, + NO_OFFSET, + (sp_ext_length_t)start_block_address_in_blocks, + EXTTYP_RESERVED, + NO_SEQUENCE_NUMBER, + NO_FLAGS, + meta_sp_cmp_by_offset); + meta_sp_list_insert(TEST_SETNAMEP, + TEST_SOFT_PARTITION_NAMEP, + extent_listpp, + (sp_ext_offset_t)(device_size_in_blocks - + MD_SP_WMSIZE), + MD_SP_WMSIZE, + EXTTYP_END, + NO_SEQUENCE_NUMBER, + NO_FLAGS, + meta_sp_cmp_by_offset); + + /* + * Get the list of soft partitions that are already on the + * device. + */ + if (meta_sp_get_by_component(mdsetnamep, device_mdnamep, + &sp_name_listp, FORCE_RELOAD_CACHE, ep) < 1) { + if (getenv(META_SP_DEBUG)) { + mde_perror(ep, + "meta_sp_get_extent_list:meta_sp_get_by_component"); + } + goto fail; + } + + if (sp_name_listp != NULL) { + /* + * If there are soft partitions on the device, add the + * extents used in them to the extent list. 
+ */ + if (meta_sp_extlist_from_namelist(mdsetnamep, sp_name_listp, + extent_listpp, ep) == -1) { + if (getenv(META_SP_DEBUG)) { + mde_perror(ep, "meta_sp_get_extent_list:" + "meta_sp_extlist_from_namelist"); + } + goto fail; + } + metafreenamelist(sp_name_listp); + } + + /* + * Add free extents to the extent list to represent + * the remaining regions of free space on the + * device. + */ + meta_sp_list_freefill(extent_listpp, device_size_in_blocks); + return (B_TRUE); + +fail: + if (sp_name_listp != NULL) { + metafreenamelist(sp_name_listp); + } + + if (*extent_listpp != NULL) { + /* + * meta_sp_list_free sets *extent_listpp to NULL. + */ + meta_sp_list_free(extent_listpp); + } + return (B_FALSE); +} + +/* + * IMPORTANT NOTE: This is a static function that calls other functions + * that check its mdsetnamep and mddrivenamep + * input parameters, but expects extent_listpp to + * be a initialized to a valid address to which + * it can write a reference to the extent list that + * it creates. 
+ * + * FUNCTION: meta_sp_get_extent_list_for_drive() + * INPUT: mdsetnamep - a reference to the mdsetname_t structure + * for the set containing the drive for + * which the extents are to be listed + * mddrivenamep - a reference to the mddrivename_t structure + * for the drive for which the extents + * are to be listed + * OUTPUT: *extent_listpp - a reference to the extent list for + * the drive; NULL if the function fails + * RETURNS: boolean_t - B_TRUE if the function call was successful, + * B_FALSE if not + * PURPOSE: gets the extent list for a drive when the entire drive + * is to be soft partitioned + */ +static boolean_t +meta_sp_get_extent_list_for_drive( + mdsetname_t *mdsetnamep, + mddrivename_t *mddrivenamep, + sp_ext_node_t **extent_listpp +) +{ + boolean_t can_use; + diskaddr_t free_space; + md_error_t mderror; + mdvtoc_t proposed_vtoc; + int repartition_options; + int return_value; + md_sp_t test_sp_struct; + + can_use = B_TRUE; + *extent_listpp = NULL; + mderror = mdnullerror; + test_sp_struct.compnamep = metaslicename(mddrivenamep, MD_SLICE0, + &mderror); + if (test_sp_struct.compnamep == NULL) { + can_use = B_FALSE; + } + + if (can_use == B_TRUE) { + mderror = mdnullerror; + repartition_options = 0; + return_value = meta_check_sp(mdsetnamep, &test_sp_struct, + MDCMD_USE_WHOLE_DISK, &repartition_options, + &mderror); + if (return_value != 0) { + can_use = B_FALSE; + } + } + + if (can_use == B_TRUE) { + mderror = mdnullerror; + repartition_options = repartition_options | + (MD_REPART_FORCE | MD_REPART_DONT_LABEL); + return_value = meta_repartition_drive(mdsetnamep, mddrivenamep, + repartition_options, &proposed_vtoc, &mderror); + if (return_value != 0) { + can_use = B_FALSE; + } + } + + if (can_use == B_TRUE) { + free_space = proposed_vtoc.parts[MD_SLICE0].size; + if (free_space <= (MD_SP_START + MD_SP_WMSIZE)) { + can_use = B_FALSE; + } + } + + if (can_use == B_TRUE) { + /* + * Create an extent list that starts with + * a reserved extent that ends 
at the start + * of the usable space on slice zero of the + * proposed VTOC, ends with an extent that + * reserves space for a watermark at the end + * of slice zero, and contains a single free + * extent that occupies the rest of the space + * on the slice. + * + * NOTE: + * + * Don't use metagetstart() or metagetsize() to + * find the usable space. They query the mdname_t + * structure that represents an actual device to + * determine the amount of space on the device that + * contains metadata and the total amount of space + * on the device. Since this function creates a + * proposed extent list that doesn't reflect the + * state of an actual device, there's no mdname_t + * structure to be queried. + * + * When a drive is reformatted to prepare for + * soft partitioning, all of slice seven is + * reserved for metadata, all of slice zero is + * available for soft partitioning, and all other + * slices on the drive are empty. The proposed + * extent list for the drive therefore contains + * only three extents: a reserved extent that ends + * at the start of the usable space on slice zero, + * a single free extent that occupies all the usable + * space on slice zero, and an ending extent that + * reserves space for a watermark at the end of + * slice zero. 
+ */ + meta_sp_list_insert(TEST_SETNAMEP, + TEST_SOFT_PARTITION_NAMEP, + extent_listpp, + NO_OFFSET, + (sp_ext_length_t)(MD_SP_START), + EXTTYP_RESERVED, + NO_SEQUENCE_NUMBER, + NO_FLAGS, + meta_sp_cmp_by_offset); + meta_sp_list_insert(TEST_SETNAMEP, + TEST_SOFT_PARTITION_NAMEP, + extent_listpp, + (sp_ext_offset_t)(free_space - MD_SP_WMSIZE), + MD_SP_WMSIZE, + EXTTYP_END, + NO_SEQUENCE_NUMBER, + NO_FLAGS, + meta_sp_cmp_by_offset); + meta_sp_list_freefill(extent_listpp, free_space); + } + return (can_use); +} + +/* + * FUNCTION: meta_sp_can_create_sps() + * INPUT: mdsetnamep - a reference to the mdsetname_t structure + * for the set containing the device for + * which the extents are to be listed + * mdnamep - a reference to the mdname_t of the device + * on which the soft parititions are to be created + * number_of_sps - the desired number of soft partitions + * sp_size - the desired soft partition size + * OUTPUT: boolean_t return value + * RETURNS: boolean_t - B_TRUE if the soft partitionns can be created, + * B_FALSE if not + * PURPOSE: determines whether a set of soft partitions can be created + * on a device + */ +boolean_t +meta_sp_can_create_sps( + mdsetname_t *mdsetnamep, + mdname_t *mdnamep, + int number_of_sps, + blkcnt_t sp_size +) +{ + sp_ext_node_t *extent_listp; + boolean_t succeeded; + md_error_t mde; + + if ((number_of_sps > 0) && (sp_size > 0)) { + succeeded = meta_sp_get_extent_list(mdsetnamep, mdnamep, + &extent_listp, &mde); + } else { + succeeded = B_FALSE; + } + + /* + * We don't really care about an error return from the + * alignment call; that will just result in passing zero, + * which will be interpreted as no alignment. 
+ */ + + if (succeeded == B_TRUE) { + succeeded = meta_sp_enough_space(number_of_sps, + sp_size, &extent_listp, + meta_sp_get_default_alignment(mdsetnamep, mdnamep, &mde)); + meta_sp_list_free(&extent_listp); + } + return (succeeded); +} + +/* + * FUNCTION: meta_sp_can_create_sps_on_drive() + * INPUT: mdsetnamep - a reference to the mdsetname_t structure + * for the set containing the drive for + * which the extents are to be listed + * mddrivenamep - a reference to the mddrivename_t of the drive + * on which the soft parititions are to be created + * number_of_sps - the desired number of soft partitions + * sp_size - the desired soft partition size + * OUTPUT: boolean_t return value + * RETURNS: boolean_t - B_TRUE if the soft partitionns can be created, + * B_FALSE if not + * PURPOSE: determines whether a set of soft partitions can be created + * on a drive if the entire drive is soft partitioned + */ +boolean_t +meta_sp_can_create_sps_on_drive( + mdsetname_t *mdsetnamep, + mddrivename_t *mddrivenamep, + int number_of_sps, + blkcnt_t sp_size +) +{ + sp_ext_node_t *extent_listp; + boolean_t succeeded; + + if ((number_of_sps > 0) && (sp_size > 0)) { + succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep, + mddrivenamep, + &extent_listp); + } else { + succeeded = B_FALSE; + } + + /* + * We don't care about alignment on the space call because + * we're specifically dealing with a drive, which will have no + * inherent alignment. 
	 */

	if (succeeded == B_TRUE) {
		succeeded = meta_sp_enough_space(number_of_sps, sp_size,
		    &extent_listp, SP_UNALIGNED);
		meta_sp_list_free(&extent_listp);
	}
	return (succeeded);
}

/*
 * FUNCTION:	meta_sp_get_free_space()
 * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
 *			     for the set containing the device for
 *			     which the free space is to be returned
 *		mdnamep - a reference to the mdname_t of the device
 *			  for which the free space is to be returned
 * OUTPUT:	blkcnt_t return value
 * RETURNS:	blkcnt_t - the number of blocks of free space on the device;
 *			   0 if the extent list could not be obtained or if
 *			   the free space does not exceed the safety margin
 *			   of 10 * MD_SP_WMSIZE blocks
 * PURPOSE:	returns the number of blocks of free space on a device.
 *		Errors from building the extent list are cleared, not
 *		reported to the caller.
 */
blkcnt_t
meta_sp_get_free_space(
	mdsetname_t	*mdsetnamep,
	mdname_t	*mdnamep
)
{
	sp_ext_node_t	*extent_listp;
	sp_ext_length_t	free_blocks;
	boolean_t	succeeded;
	/*
	 * NOTE(review): mde is handed to meta_sp_get_extent_list() without
	 * being initialised and is later passed to mdclrerror() -- confirm
	 * that the callee always fills it in on failure.
	 */
	md_error_t	mde;

	extent_listp = NULL;
	free_blocks = 0;
	succeeded = meta_sp_get_extent_list(mdsetnamep, mdnamep,
	    &extent_listp, &mde);
	if (succeeded == B_TRUE) {
		/* total up the free extents, then release the list */
		free_blocks = meta_sp_list_size(extent_listp,
		    EXTTYP_FREE, INCLUDE_WM);
		meta_sp_list_free(&extent_listp);
		if (free_blocks > (10 * MD_SP_WMSIZE)) {
			/*
			 * Subtract a safety margin for watermarks when
			 * computing the number of blocks available for
			 * use.  The actual number of watermarks can't
			 * be calculated without knowing the exact numbers
			 * and sizes of both the free extents and the soft
			 * partitions to be created.  The calculation is
			 * highly complex and error-prone even if those
			 * quantities are known.  The approximate value
			 * 10 * MD_SP_WMSIZE is within a few blocks of the
			 * correct value in all practical cases.
			 */
			free_blocks = free_blocks - (10 * MD_SP_WMSIZE);
		} else {
			/* not enough room to cover the safety margin */
			free_blocks = 0;
		}
	} else {
		/* the failure is reported only as "0 blocks free" */
		mdclrerror(&mde);
	}

	return (free_blocks);
}

/*
 * FUNCTION:	meta_sp_get_free_space_on_drive()
 * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
 *			     for the set containing the drive for
 *			     which the free space is to be returned
 *		mddrivenamep - a reference to the mddrivename_t of the drive
 *			       for which the free space is to be returned
 * OUTPUT:	blkcnt_t return value
 * RETURNS:	blkcnt_t - the number of blocks of free space on the drive;
 *			   0 if the extent list could not be obtained or if
 *			   the free space does not exceed the safety margin
 *			   of 10 * MD_SP_WMSIZE blocks
 * PURPOSE:	returns the number of blocks of space usable for soft
 *		partitions on an entire drive, if the entire drive is
 *		soft partitioned
 */
blkcnt_t
meta_sp_get_free_space_on_drive(
	mdsetname_t	*mdsetnamep,
	mddrivename_t	*mddrivenamep
)
{
	sp_ext_node_t	*extent_listp;
	sp_ext_length_t	free_blocks;
	boolean_t	succeeded;

	extent_listp = NULL;
	free_blocks = 0;
	succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
	    mddrivenamep, &extent_listp);
	if (succeeded == B_TRUE) {
		/* total up the free extents, then release the list */
		free_blocks = meta_sp_list_size(extent_listp,
		    EXTTYP_FREE, INCLUDE_WM);
		meta_sp_list_free(&extent_listp);
		if (free_blocks > (10 * MD_SP_WMSIZE)) {
			/*
			 * Subtract a safety margin for watermarks when
			 * computing the number of blocks available for
			 * use.  The actual number of watermarks can't
			 * be calculated without knowing the exact numbers
			 * and sizes of both the free extents and the soft
			 * partitions to be created.  The calculation is
			 * highly complex and error-prone even if those
			 * quantities are known.  The approximate value
			 * 10 * MD_SP_WMSIZE is within a few blocks of the
			 * correct value in all practical cases.
			 */
			free_blocks = free_blocks - (10 * MD_SP_WMSIZE);
		} else {
			/* not enough room to cover the safety margin */
			free_blocks = 0;
		}
	}
	return (free_blocks);
}

/*
 * FUNCTION:	meta_sp_get_number_of_possible_sps()
 * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
 *			     for the set containing the device for
 *			     which the number of possible soft partitions
 *			     is to be returned
 *		mdnamep - a reference to the mdname_t of the device
 *			  for which the number of possible soft partitions
 *			  is to be returned
 *		sp_size - the size in blocks of the proposed soft partitions;
 *			  a value <= 0 yields a result of 0
 * OUTPUT:	int return value
 * RETURNS:	int - the number of soft partitions of the desired size
 *		      that can be created on the device
 * PURPOSE:	returns the number of soft partitions of a given size
 *		that can be created on a device
 */
int
meta_sp_get_number_of_possible_sps(
	mdsetname_t	*mdsetnamep,
	mdname_t	*mdnamep,
	blkcnt_t	sp_size
)
{
	sp_ext_node_t	*extent_listp;
	int		number_of_possible_sps;
	boolean_t	succeeded;
	/*
	 * NOTE(review): mde is not initialised before it is passed to the
	 * calls below -- confirm the callees tolerate that.
	 */
	md_error_t	mde;
	sp_ext_length_t	alignment;

	extent_listp = NULL;
	number_of_possible_sps = 0;
	if (sp_size > 0) {
		if ((succeeded = meta_sp_get_extent_list(mdsetnamep,
		    mdnamep, &extent_listp, &mde)) == B_FALSE)
			mdclrerror(&mde);
	} else {
		succeeded = B_FALSE;
	}

	/* alignment is only needed (and only set) if we have a list */
	if (succeeded == B_TRUE) {
		alignment = meta_sp_get_default_alignment(mdsetnamep,
		    mdnamep, &mde);
	}

	while (succeeded == B_TRUE) {
		/*
		 * Keep allocating space from the extent list
		 * for soft partitions of the desired size until
		 * there's not enough free space left in the list
		 * for another soft partition of that size.
		 * Add one to the number of possible soft partitions
		 * for each soft partition for which there is
		 * enough free space left.
		 */
		succeeded = meta_sp_enough_space(ONE_SOFT_PARTITION,
		    sp_size, &extent_listp, alignment);
		if (succeeded == B_TRUE) {
			number_of_possible_sps++;
		}
	}
	if (extent_listp != NULL) {
		meta_sp_list_free(&extent_listp);
	}
	return (number_of_possible_sps);
}

/*
 * FUNCTION:	meta_sp_get_number_of_possible_sps_on_drive()
 * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
 *			     for the set containing the drive for
 *			     which the number of possible soft partitions
 *			     is to be returned
 *		mddrivenamep - a reference to the mddrivename_t of the drive
 *			       for which the number of possible soft partitions
 *			       is to be returned
 *		sp_size - the size in blocks of the proposed soft partitions;
 *			  a value <= 0 yields a result of 0
 * OUTPUT:	int return value
 * RETURNS:	int - the number of soft partitions of the desired size
 *		      that can be created on the drive
 * PURPOSE:	returns the number of soft partitions of a given size
 *		that can be created on a drive, if the entire drive is
 *		soft partitioned
 */
int
meta_sp_get_number_of_possible_sps_on_drive(
	mdsetname_t	*mdsetnamep,
	mddrivename_t	*mddrivenamep,
	blkcnt_t	sp_size
)
{
	sp_ext_node_t	*extent_listp;
	int		number_of_possible_sps;
	boolean_t	succeeded;

	extent_listp = NULL;
	number_of_possible_sps = 0;
	if (sp_size > 0) {
		succeeded = meta_sp_get_extent_list_for_drive(mdsetnamep,
		    mddrivenamep, &extent_listp);
	} else {
		succeeded = B_FALSE;
	}
	while (succeeded == B_TRUE) {
		/*
		 * Keep allocating space from the extent list
		 * for soft partitions of the desired size until
		 * there's not enough free space left in the list
		 * for another soft partition of that size.
		 * Add one to the number of possible soft partitions
		 * for each soft partition for which there is
		 * enough free space left.
		 *
		 * Since it's a drive, not a metadevice, make no
		 * assumptions about alignment.
		 */
		succeeded = meta_sp_enough_space(ONE_SOFT_PARTITION,
		    sp_size, &extent_listp, SP_UNALIGNED);
		if (succeeded == B_TRUE) {
			number_of_possible_sps++;
		}
	}
	if (extent_listp != NULL) {
		meta_sp_list_free(&extent_listp);
	}
	return (number_of_possible_sps);
}

/*
 * FUNCTION:	meta_sp_get_possible_sp_size()
 * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
 *			     for the set containing the device for
 *			     which the possible soft partition size
 *			     is to be returned
 *		mdnamep - a reference to the mdname_t of the device
 *			  for which the possible soft partition size
 *			  is to be returned
 *		number_of_sps - the desired number of soft partitions;
 *				a value <= 0 yields a result of 0
 * OUTPUT:	blkcnt_t return value
 * RETURNS:	blkcnt_t - the possible soft partition size in blocks
 * PURPOSE:	returns the maximum possible size of each of a given number of
 *		soft partitions of equal size that can be created on a device.
 *		The search starts at (free space / number_of_sps) and steps
 *		downwards by number_of_sps blocks until
 *		meta_sp_can_create_sps() accepts the size or the size
 *		reaches zero.
 */
blkcnt_t
meta_sp_get_possible_sp_size(
	mdsetname_t	*mdsetnamep,
	mdname_t	*mdnamep,
	int		number_of_sps
)
{
	blkcnt_t	free_blocks;
	blkcnt_t	sp_size;
	boolean_t	succeeded;

	sp_size = 0;
	if (number_of_sps > 0) {
		/* first guess: split the free space evenly */
		free_blocks = meta_sp_get_free_space(mdsetnamep, mdnamep);
		sp_size = free_blocks / number_of_sps;
		succeeded = meta_sp_can_create_sps(mdsetnamep, mdnamep,
		    number_of_sps, sp_size);
		while ((succeeded == B_FALSE) && (sp_size > 0)) {
			/*
			 * To compensate for space that may have been
			 * occupied by watermarks, reduce sp_size by a
			 * number of blocks equal to the number of soft
			 * partitions desired, and test again to see
			 * whether the desired number of soft partitions
			 * can be created.
			 */
			sp_size = sp_size - ((blkcnt_t)number_of_sps);
			succeeded = meta_sp_can_create_sps(mdsetnamep, mdnamep,
			    number_of_sps, sp_size);
		}
		/* the last step down may have gone below zero */
		if (sp_size < 0) {
			sp_size = 0;
		}
	}
	return (sp_size);
}

/*
 * FUNCTION:	meta_sp_get_possible_sp_size_on_drive()
 * INPUT:	mdsetnamep - a reference to the mdsetname_t structure
 *			     for the set containing the drive for
 *			     which the possible soft partition size
 *			     is to be returned
 *		mddrivenamep - a reference to the mddrivename_t of the drive
 *			       for which the possible soft partition size
 *			       is to be returned
 *		number_of_sps - the desired number of soft partitions;
 *				a value <= 0 yields a result of 0
 * OUTPUT:	blkcnt_t return value
 * RETURNS:	blkcnt_t - the possible soft partition size in blocks
 * PURPOSE:	returns the maximum possible size of each of a given number of
 *		soft partitions of equal size that can be created on a drive
 *		if the entire drive is soft partitioned.  The search starts
 *		at (free space / number_of_sps) and steps downwards by
 *		number_of_sps blocks until meta_sp_can_create_sps_on_drive()
 *		accepts the size or the size reaches zero.
 */
blkcnt_t
meta_sp_get_possible_sp_size_on_drive(
	mdsetname_t	*mdsetnamep,
	mddrivename_t	*mddrivenamep,
	int		number_of_sps
)
{
	blkcnt_t	free_blocks;
	blkcnt_t	sp_size;
	boolean_t	succeeded;

	sp_size = 0;
	if (number_of_sps > 0) {
		/* first guess: split the free space evenly */
		free_blocks = meta_sp_get_free_space_on_drive(mdsetnamep,
		    mddrivenamep);
		sp_size = free_blocks / number_of_sps;
		succeeded = meta_sp_can_create_sps_on_drive(mdsetnamep,
		    mddrivenamep,
		    number_of_sps, sp_size);
		while ((succeeded == B_FALSE) && (sp_size > 0)) {
			/*
			 * To compensate for space that may have been
			 * occupied by watermarks, reduce sp_size by a
			 * number of blocks equal to the number of soft
			 * partitions desired, and test again to see
			 * whether the desired number of soft partitions
			 * can be created.
			 */
			sp_size = sp_size - ((blkcnt_t)number_of_sps);
			succeeded = meta_sp_can_create_sps_on_drive(mdsetnamep,
			    mddrivenamep,
			    number_of_sps, sp_size);
		}
		/* the last step down may have gone below zero */
		if (sp_size < 0) {
			sp_size = 0;
		}
	}
	return (sp_size);
}

/*
 * **************************************************************************
 *                Unit Structure Manipulation Functions                     *
 * **************************************************************************
 */

/*
 * FUNCTION:	meta_sp_fillextarray()
 * INPUT:	mp - the unit structure to fill
 *		extlist - the list of extents to fill with
 * OUTPUT:	none
 * RETURNS:	void
 * PURPOSE:	fills in the unit structure extent list with the extents
 *		specified by extlist.  Only extents in extlist with the
 *		EXTFLG_UPDATE flag are changed in the unit structure,
 *		and the index into the unit structure is the sequence
 *		number in the extent list.  After all of the nodes have
 *		been updated the virtual offsets in the unit structure
 *		are updated to reflect the new lengths.
+ */ +static void +meta_sp_fillextarray( + mp_unit_t *mp, + sp_ext_node_t *extlist +) +{ + int i; + sp_ext_node_t *ext; + sp_ext_offset_t curvoff = 0LL; + + assert(mp != NULL); + + /* go through the allocation list and fill in our unit structure */ + for (ext = extlist; ext != NULL; ext = ext->ext_next) { + if ((ext->ext_type == EXTTYP_ALLOC) && + (ext->ext_flags & EXTFLG_UPDATE) != 0) { + mp->un_ext[ext->ext_seq].un_poff = + ext->ext_offset + MD_SP_WMSIZE; + mp->un_ext[ext->ext_seq].un_len = + ext->ext_length - MD_SP_WMSIZE; + } + } + + for (i = 0; i < mp->un_numexts; i++) { + assert(mp->un_ext[i].un_poff != 0); + assert(mp->un_ext[i].un_len != 0); + mp->un_ext[i].un_voff = curvoff; + curvoff += mp->un_ext[i].un_len; + } +} + +/* + * FUNCTION: meta_sp_createunit() + * INPUT: np - the name of the device to create a unit structure for + * compnp - the name of the device the soft partition is on + * extlist - the extent list to populate the new unit with + * numexts - the number of extents in the extent list + * len - the total size of the soft partition (sectors) + * status - the initial status of the unit structure + * OUTPUT: ep - return error pointer + * RETURNS: mp_unit_t * - the new unit structure. + * PURPOSE: allocates and fills in a new soft partition unit + * structure to be passed to the soft partitioning driver + * for creation. 
+ */ +static mp_unit_t * +meta_sp_createunit( + mdname_t *np, + mdname_t *compnp, + sp_ext_node_t *extlist, + int numexts, + sp_ext_length_t len, + sp_status_t status, + md_error_t *ep +) +{ + mp_unit_t *mp; + uint_t ms_size; + + ms_size = (sizeof (*mp) - sizeof (mp->un_ext[0])) + + (numexts * sizeof (mp->un_ext[0])); + + mp = Zalloc(ms_size); + + /* fill in fields in common unit structure */ + mp->c.un_type = MD_METASP; + mp->c.un_size = ms_size; + MD_SID(mp) = meta_getminor(np->dev); + mp->c.un_total_blocks = len; + mp->c.un_actual_tb = len; + + /* set up geometry */ + (void) meta_sp_setgeom(np, compnp, mp, ep); + + /* if we're building on metadevice we can't parent */ + if (metaismeta(compnp)) + MD_CAPAB(mp) = MD_CANT_PARENT; + else + MD_CAPAB(mp) = MD_CAN_PARENT; + + /* fill soft partition-specific fields */ + mp->un_dev = compnp->dev; + mp->un_key = compnp->key; + + /* mdname_t start_blk field is not 64-bit! */ + mp->un_start_blk = (sp_ext_offset_t)compnp->start_blk; + mp->un_status = status; + mp->un_numexts = numexts; + mp->un_length = len; + + /* fill in the extent array */ + meta_sp_fillextarray(mp, extlist); + + return (mp); +} + +/* + * FUNCTION: meta_sp_updateunit() + * INPUT: np - name structure for the metadevice being updated + * old_un - the original unit structure that is being updated + * extlist - the extent list to populate the new unit with + * grow_len - the amount by which the partition is being grown + * numexts - the number of extents in the extent list + * ep - return error pointer + * OUTPUT: none + * RETURNS: mp_unit_t * - the updated unit structure + * PURPOSE: allocates and fills in a new soft partition unit structure to + * be passed to the soft partitioning driver for creation. The + * old unit structure is first copied in, and then the updated + * extents are changed in the new unit structure. This is + * typically used when the size of an existing unit is changed. 
+ */ +static mp_unit_t * +meta_sp_updateunit( + mdname_t *np, + mp_unit_t *old_un, + sp_ext_node_t *extlist, + sp_ext_length_t grow_len, + int numexts, + md_error_t *ep +) +{ + mp_unit_t *new_un; + sp_ext_length_t new_len; + uint_t new_size; + + assert(old_un != NULL); + assert(extlist != NULL); + + /* allocate new unit structure and copy in old unit */ + new_size = (sizeof (*old_un) - sizeof (old_un->un_ext[0])) + + ((old_un->un_numexts + numexts) * sizeof (old_un->un_ext[0])); + new_len = old_un->un_length + grow_len; + new_un = Zalloc(new_size); + bcopy(old_un, new_un, old_un->c.un_size); + + /* update size and geometry information */ + new_un->c.un_size = new_size; + new_un->un_length = new_len; + new_un->c.un_total_blocks = new_len; + new_un->c.un_actual_tb = new_len; + if (meta_adjust_geom((md_unit_t *)new_un, np, + old_un->c.un_wr_reinstruct, old_un->c.un_rd_reinstruct, + 0, ep) != 0) { + Free(new_un); + return (NULL); + } + + /* update extent information */ + new_un->un_numexts += numexts; + + meta_sp_fillextarray(new_un, extlist); + + return (new_un); +} + +/* + * FUNCTION: meta_get_sp() + * INPUT: sp - the set name for the device to get + * np - the name of the device to get + * OUTPUT: ep - return error pointer + * RETURNS: md_sp_t * - the XDR unit structure for the soft partition + * PURPOSE: interface to the rest of libmeta for fetching a unit structure + * for the named device. Just a wrapper for meta_get_sp_common(). + */ +md_sp_t * +meta_get_sp( + mdsetname_t *sp, + mdname_t *np, + md_error_t *ep +) +{ + return (meta_get_sp_common(sp, np, 0, ep)); +} + +/* + * FUNCTION: meta_get_sp_common() + * INPUT: sp - the set name for the device to get + * np - the name of the device to get + * fast - whether to use the cache or not (NOT IMPLEMENTED!) 
+ * OUTPUT: ep - return error pointer + * RETURNS: md_sp_t * - the XDR unit structure for the soft partition, + * NULL if np is not a soft partition + * PURPOSE: common routine for fetching a soft partition unit structure + */ +md_sp_t * +meta_get_sp_common( + mdsetname_t *sp, + mdname_t *np, + int fast, + md_error_t *ep +) +{ + mddrivename_t *dnp = np->drivenamep; + char *miscname; + mp_unit_t *mp; + md_sp_t *msp; + int i; + + /* must have set */ + assert(sp != NULL); + + /* short circuit */ + if (dnp->unitp != NULL) { + if (dnp->unitp->type != MD_METASP) + return (NULL); + return ((md_sp_t *)dnp->unitp); + } + /* get miscname and unit */ + if ((miscname = metagetmiscname(np, ep)) == NULL) + return (NULL); + + if (strcmp(miscname, MD_SP) != 0) { + (void) mdmderror(ep, MDE_NOT_SP, 0, np->cname); + return (NULL); + } + + if ((mp = (mp_unit_t *)meta_get_mdunit(sp, np, ep)) == NULL) + return (NULL); + + assert(mp->c.un_type == MD_METASP); + + /* allocate soft partition */ + msp = Zalloc(sizeof (*msp)); + + /* get the common information */ + msp->common.namep = np; + msp->common.type = mp->c.un_type; + msp->common.state = mp->c.un_status; + msp->common.capabilities = mp->c.un_capabilities; + msp->common.parent = mp->c.un_parent; + msp->common.size = mp->c.un_total_blocks; + msp->common.user_flags = mp->c.un_user_flags; + msp->common.revision = mp->c.un_revision; + + /* get soft partition information */ + if ((msp->compnamep = metakeyname(&sp, mp->un_key, fast, ep)) == NULL) + goto out; + + /* + * Fill in the key and the start block. Note that the start + * block in the unit structure is 64 bits but the name pointer + * only supports 32 bits. 
+ */ + msp->compnamep->key = mp->un_key; + msp->compnamep->start_blk = mp->un_start_blk; + + /* fill in status field */ + msp->status = mp->un_status; + + /* allocate the extents */ + msp->ext.ext_val = Zalloc(mp->un_numexts * sizeof (*msp->ext.ext_val)); + msp->ext.ext_len = mp->un_numexts; + + /* do the extents for this soft partition */ + for (i = 0; i < mp->un_numexts; i++) { + struct mp_ext *mde = &mp->un_ext[i]; + md_sp_ext_t *extp = &msp->ext.ext_val[i]; + + extp->voff = mde->un_voff; + extp->poff = mde->un_poff; + extp->len = mde->un_len; + } + + /* cleanup, return success */ + Free(mp); + dnp->unitp = (md_common_t *)msp; + return (msp); + +out: + /* clean up and return error */ + Free(mp); + Free(msp); + return (NULL); +} + + +/* + * FUNCTION: meta_init_sp() + * INPUT: spp - the set name for the new device + * argc - the remaining argument count for the metainit cmdline + * argv - the remainder of the unparsed command line + * options - global options parsed by metainit + * OUTPUT: ep - return error pointer + * RETURNS: int - -1 failure, 0 success + * PURPOSE: provides the command line parsing and name management overhead + * for creating a new soft partition. Ultimately this calls + * meta_create_sp() which does the real work of allocating space + * for the new soft partition. 
+ */ +int +meta_init_sp( + mdsetname_t **spp, + int argc, + char *argv[], + mdcmdopts_t options, + md_error_t *ep +) +{ + char *compname = NULL; + mdname_t *spcompnp = NULL; /* name of component volume */ + char *devname = argv[0]; /* unit name */ + mdname_t *np = NULL; /* name of soft partition */ + md_sp_t *msp = NULL; + int c; + int old_optind; + sp_ext_length_t len = 0LL; + int rval = -1; + uint_t seq; + int oflag; + int failed; + mddrivename_t *dnp = NULL; + sp_ext_length_t alignment = 0LL; + sp_ext_node_t *extlist = NULL; + + assert(argc > 0); + + /* expect sp name, -p, optional -e, compname, and size parameters */ + /* grab soft partition name */ + if ((np = metaname(spp, devname, ep)) == NULL) + goto out; + + /* see if it exists already */ + if (metagetmiscname(np, ep) != NULL) { + (void) mdmderror(ep, MDE_UNIT_ALREADY_SETUP, + meta_getminor(np->dev), devname); + goto out; + } else if (! mdismderror(ep, MDE_UNIT_NOT_SETUP)) { + goto out; + } else { + mdclrerror(ep); + } + --argc, ++argv; + + if (argc == 0) + goto syntax; + + /* grab -p */ + if (strcmp(argv[0], "-p") != 0) + goto syntax; + --argc, ++argv; + + if (argc == 0) + goto syntax; + + /* see if -e is there */ + if (strcmp(argv[0], "-e") == 0) { + /* use the whole disk */ + options |= MDCMD_USE_WHOLE_DISK; + --argc, ++argv; + } + + if (argc == 0) + goto syntax; + + /* get component name */ + compname = Strdup(argv[0]); + + if (options & MDCMD_USE_WHOLE_DISK) { + if ((dnp = metadrivename(spp, compname, ep)) == NULL) { + goto out; + } + if ((spcompnp = metaslicename(dnp, 0, ep)) == NULL) { + goto out; + } + } else if ((spcompnp = metaname(spp, compname, ep)) == NULL) { + goto out; + } + assert(*spp != NULL); + + if (!(options & MDCMD_NOLOCK)) { + /* grab set lock */ + if (meta_lock(*spp, TRUE, ep)) + goto out; + + if (meta_check_ownership(*spp, ep) != 0) + goto out; + } + + /* allocate the soft partition */ + msp = Zalloc(sizeof (*msp)); + + /* setup common */ + msp->common.namep = np; + 
msp->common.type = MD_METASP; + + compname = spcompnp->cname; + + assert(spcompnp->rname != NULL); + --argc, ++argv; + + if (argc == 0) { + goto syntax; + } + + if (*argv[0] == '-') { + /* + * parse any other command line options, this includes + * the recovery options -o and -b. The special thing + * with these options is that the len needs to be + * kept track of otherwise when the geometry of the + * "device" is built it will create an invalid geometry + */ + old_optind = optind = 0; + opterr = 0; + oflag = 0; + seq = 0; + failed = 0; + while ((c = getopt(argc, argv, "A:o:b:")) != -1) { + sp_ext_offset_t offset; + sp_ext_length_t length; + longlong_t tmp_size; + + switch (c) { + case 'A': /* data alignment */ + if (meta_sp_parsesizestring(optarg, + &alignment) == -1) { + failed = 1; + } + break; + case 'o': /* offset in the partition */ + if (oflag == 1) { + failed = 1; + } else { + tmp_size = atoll(optarg); + if (tmp_size <= 0) { + failed = 1; + } else { + oflag = 1; + options |= MDCMD_DIRECT; + + offset = tmp_size; + } + } + + break; + case 'b': /* number of blocks */ + if (oflag == 0) { + failed = 1; + } else { + tmp_size = atoll(optarg); + if (tmp_size <= 0) { + failed = 1; + } else { + oflag = 0; + + length = tmp_size; + + /* we have a pair of values */ + meta_sp_list_insert(*spp, np, + &extlist, offset, + length, EXTTYP_ALLOC, + seq++, EXTFLG_UPDATE, + meta_sp_cmp_by_offset); + len += length; + } + } + + break; + default: + argc -= old_optind; + argv += old_optind; + goto options; + } + + if (failed) { + argc -= old_optind; + argv += old_optind; + goto syntax; + } + + old_optind = optind; + } + argc -= optind; + argv += optind; + + /* + * Must have matching pairs of -o and -b flags + */ + if (oflag != 0) + goto syntax; + + /* + * Can't specify both layout (indicated indirectly by + * len being set by thye -o/-b cases above) AND + * alignment + */ + if ((len > 0LL) && (alignment > 0LL)) + goto syntax; + + /* + * sanity check the allocation list + */ + if 
((extlist != NULL) && meta_sp_list_overlaps(extlist)) + goto syntax; + } + + if (len == 0LL) { + if (argc == 0) + goto syntax; + if (meta_sp_parsesize(argv[0], &len) == -1) + goto syntax; + --argc, ++argv; + } + + msp->ext.ext_val = Zalloc(sizeof (*msp->ext.ext_val)); + msp->ext.ext_val->len = len; + msp->compnamep = spcompnp; + + /* we should be at the end */ + if (argc != 0) + goto syntax; + + /* create soft partition */ + if (meta_create_sp(*spp, msp, extlist, options, alignment, ep) != 0) + goto out; + rval = 0; + + /* let em know */ + if (options & MDCMD_PRINT) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: Soft Partition is setup\n"), + devname); + (void) fflush(stdout); + } + goto out; + +syntax: + /* syntax error */ + rval = meta_cook_syntax(ep, MDE_SYNTAX, compname, argc, argv); + goto out; + +options: + /* options error */ + rval = meta_cook_syntax(ep, MDE_OPTION, compname, argc, argv); + goto out; + +out: + if (msp != NULL) { + if (msp->ext.ext_val != NULL) { + Free(msp->ext.ext_val); + } + Free(msp); + } + + return (rval); +} + +/* + * FUNCTION: meta_free_sp() + * INPUT: msp - the soft partition unit to free + * OUTPUT: none + * RETURNS: void + * PURPOSE: provides an interface from the rest of libmeta for freeing a + * soft partition unit + */ +void +meta_free_sp(md_sp_t *msp) +{ + Free(msp); +} + +/* + * FUNCTION: meta_sp_issp() + * INPUT: sp - the set name to check + * np - the name to check + * OUTPUT: ep - return error pointer + * RETURNS: int - 0 means sp,np is a soft partition + * 1 means sp,np is not a soft partition + * PURPOSE: determines whether the given device is a soft partition + * device. This is called by other metadevice check routines. 
 */
int
meta_sp_issp(
	mdsetname_t	*sp,
	mdname_t	*np,
	md_error_t	*ep
)
{
	/* a NULL unit means "not a soft partition" (or the lookup failed) */
	if (meta_get_sp_common(sp, np, 0, ep) == NULL)
		return (1);

	return (0);
}

/*
 * FUNCTION:	meta_check_sp()
 * INPUT:	sp - the set name to check
 *		msp - the unit structure to check
 *		options - creation options
 * OUTPUT:	repart_options - options to be passed to
 *				meta_repartition_drive()
 *		ep - return error pointer
 * RETURNS:	int -  0 ok to create on this component
 *		      -1 error or not ok to create on this component
 * PURPOSE:	Checks to determine whether the rules for creation of
 *		soft partitions allow creation of a soft partition on
 *		the device described by the mdname_t structure referred
 *		to by msp->compnamep.
 *
 *		With MDCMD_USE_WHOLE_DISK set, every non-empty slice of
 *		the drive is checked; otherwise only the component itself
 *		is checked.  On success any error left in ep is cleared.
 *
 * NOTE:	Does NOT check to determine whether the extents
 *		described in the md_sp_t structure referred to by
 *		msp will fit on the device described by the mdname_t
 *		structure located at msp->compnamep.
 */
static int
meta_check_sp(
	mdsetname_t	*sp,
	md_sp_t		*msp,
	mdcmdopts_t	options,
	int		*repart_options,
	md_error_t	*ep
)
{
	md_common_t	*mdp;
	mdname_t	*compnp = msp->compnamep;
	uint_t		slice;
	mddrivename_t	*dnp;
	mdname_t	*slicenp;
	mdvtoc_t	*vtocp;

	/* make sure it is in the set */
	if (meta_check_inset(sp, compnp, ep) != 0)
		return (-1);

	if ((options & MDCMD_USE_WHOLE_DISK) != 0) {
		uint_t	rep_slice;	/* slice reported as replica slice */

		/*
		 * check to make sure we can partition this drive.
		 * we cannot continue if any of the following are
		 * true:
		 * The drive is a metadevice.
		 * The drive contains a mounted slice.
		 * The drive contains a slice being swapped to.
		 * The drive contains slices which are part of other
		 * metadevices.
		 * The drive contains a metadb.
		 */
		if (metaismeta(compnp))
			return (mddeverror(ep, MDE_IS_META, compnp->dev,
			    compnp->cname));

		assert(compnp->drivenamep != NULL);

		/*
		 * ensure that we have slice 0 since the disk will be
		 * repartitioned in the USE_WHOLE_DISK case.  this check
		 * is redundant unless the user incorrectly specifies
		 * a fully qualified drive AND slice name (i.e.,
		 * /dev/dsk/cXtXdXsX), which will be incorrectly
		 * recognized as a drive name by the metaname code.
		 */

		/* metagetvtoc() also reports the component's slice number */
		if ((vtocp = metagetvtoc(compnp, FALSE, &slice, ep)) == NULL)
			return (-1);
		if (slice != MD_SLICE0)
			return (mderror(ep, MDE_NOT_DRIVENAME, compnp->cname));

		dnp = compnp->drivenamep;
		if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
			return (-1);

		/* "slice" is reused from here on as the loop index */
		for (slice = 0; slice < vtocp->nparts; slice++) {

			/* only check if the slice really exists */
			if (vtocp->parts[slice].size == 0)
				continue;

			slicenp = metaslicename(dnp, slice, ep);
			if (slicenp == NULL)
				return (-1);

			/* check to ensure that it is not already in use */
			if (meta_check_inuse(sp,
			    slicenp, MDCHK_INUSE, ep) != 0) {
				return (-1);
			}

			/*
			 * Up to this point, tests are applied to all
			 * slices uniformly.
			 */

			if (slice == rep_slice) {
				/*
				 * Tests inside the body of this
				 * conditional are applied only to
				 * the replica slice (rep_slice, as
				 * reported by meta_replicaslice()).
				 */
				if (meta_check_inmeta(sp, slicenp,
				    options | MDCHK_ALLOW_MDDB |
				    MDCHK_ALLOW_REPSLICE, 0, -1, ep) != 0)
					return (-1);

				/*
				 * For the replica slice, a metadb is NOT
				 * an automatic failure.  It merely means
				 * that we're not allowed to muck
				 * about with the partitioning of that
				 * slice.  We indicate this by masking
				 * in the MD_REPART_LEAVE_REP flag.
				 */
				if (metahasmddb(sp, slicenp, ep)) {
					assert(repart_options !=
					    NULL);
					*repart_options |=
					    MD_REPART_LEAVE_REP;
				}

				/*
				 * Skip the remaining tests for the
				 * replica slice
				 */
				continue;
			}

			/*
			 * Tests below this point will be applied to
			 * all slices EXCEPT for the replica slice.
			 */


			/* check if component is in a metadevice */
			if (meta_check_inmeta(sp, slicenp, options, 0,
			    -1, ep) != 0)
				return (-1);

			/* check to see if component has a metadb */
			if (metahasmddb(sp, slicenp, ep))
				return (mddeverror(ep, MDE_HAS_MDDB,
				    slicenp->dev, slicenp->cname));
		}
		/*
		 * This should be all of the testing necessary when
		 * the MDCMD_USE_WHOLE_DISK flag is set; the rest of
		 * meta_check_sp() is oriented towards component
		 * arguments instead of disks.
		 */
		goto meta_check_sp_ok;

	}

	/* check to ensure that it is not already in use */
	if (meta_check_inuse(sp, compnp, MDCHK_INUSE, ep) != 0) {
		return (-1);
	}

	if (!metaismeta(compnp)) {	/* handle non-metadevices */

		/*
		 * The component can have one or more soft partitions on it
		 * already, but can't be part of any other type of metadevice,
		 * so if it is used for a metadevice, but the metadevice
		 * isn't a soft partition, return failure.
		 */

		if (meta_check_inmeta(sp, compnp, options, 0, -1, ep) != 0 &&
		    meta_check_insp(sp, compnp, 0, -1, ep) == 0) {
			return (-1);
		}
	} else {			/* handle metadevices */
		/* get underlying unit & check capabilities */
		if ((mdp = meta_get_unit(sp, compnp, ep)) == NULL)
			return (-1);

		/* the unit must carry both MD_CAN_PARENT and MD_CAN_SP */
		if ((! (mdp->capabilities & MD_CAN_PARENT)) ||
		    (! (mdp->capabilities & MD_CAN_SP)))
			return (mdmderror(ep, MDE_INVAL_UNIT,
			    meta_getminor(compnp->dev), compnp->cname));
	}

meta_check_sp_ok:
	/* checks that passed may still have left an error in ep; drop it */
	mdclrerror(ep);
	return (0);
}

/*
 * FUNCTION:	meta_create_sp()
 * INPUT:	sp - the set name to create in
 *		msp - the unit structure to create
 *		oblist - an optional list of requested extents (-o/-b options)
 *		options - creation options
 *		alignment - data alignment
 * OUTPUT:	ep - return error pointer
 * RETURNS:	int -  0 success, -1 error
 * PURPOSE:	does most of the work for creating a soft partition.  If
 *		metainit -p -e was used, first partition the drive.
Then + * create an extent list based on the existing soft partitions + * and assume all space not used by them is free. Storage for + * the new soft partition is allocated from the free extents + * based on the length specified on the command line or the + * oblist passed in. The unit structure is then committed and + * the watermarks are updated. Finally, the status is changed to + * Okay and the process is complete. + */ +static int +meta_create_sp( + mdsetname_t *sp, + md_sp_t *msp, + sp_ext_node_t *oblist, + mdcmdopts_t options, + sp_ext_length_t alignment, + md_error_t *ep +) +{ + mdname_t *np = msp->common.namep; + mdname_t *compnp = msp->compnamep; + mp_unit_t *mp = NULL; + mdnamelist_t *keynlp = NULL, *spnlp = NULL; + md_set_params_t set_params; + int rval = -1; + diskaddr_t comp_size; + diskaddr_t sp_start; + sp_ext_node_t *extlist = NULL; + int numexts = 0; /* number of extents */ + int count = 0; + int committed = 0; + int repart_options = MD_REPART_FORCE; + int create_flag = MD_CRO_32BIT; + + md_set_desc *sd; + mm_unit_t *mm; + md_set_mmown_params_t *ownpar = NULL; + int comp_is_mirror = 0; + + /* validate soft partition */ + if (meta_check_sp(sp, msp, options, &repart_options, ep) != 0) + return (-1); + + if ((options & MDCMD_USE_WHOLE_DISK) != 0) { + if ((options & MDCMD_DOIT) != 0) { + if (meta_repartition_drive(sp, + compnp->drivenamep, + repart_options, + NULL, /* Don't return the VTOC */ + ep) != 0) + + return (-1); + } else { + /* + * If -n and -e are both specified, it doesn't make + * sense to continue without actually partitioning + * the drive. 
+ */ + return (0); + } + } + + /* populate the start_blk field of the component name */ + if ((sp_start = meta_sp_get_start(sp, compnp, ep)) == + MD_DISKADDR_ERROR) { + rval = -1; + goto out; + } + + if (options & MDCMD_DOIT) { + /* store name in namespace */ + if (add_key_name(sp, compnp, &keynlp, ep) != 0) { + rval = -1; + goto out; + } + } + + /* + * Get a list of the soft partitions that currently reside on + * the component. We should ALWAYS force reload the cache, + * because if this is a single creation, there will not BE a + * cached list, and if we're using the md.tab, we must rebuild + * the list because it won't contain the previous (if any) + * soft partition. + */ + count = meta_sp_get_by_component(sp, compnp, &spnlp, 1, ep); + if (count < 0) { + /* error occured */ + rval = -1; + goto out; + } + + /* + * get the size of the underlying device. if the size is smaller + * than or equal to the watermark size, we know there isn't + * enough space. + */ + if ((comp_size = metagetsize(compnp, ep)) == MD_DISKADDR_ERROR) { + rval = -1; + goto out; + } else if (comp_size <= MD_SP_WMSIZE) { + (void) mdmderror(ep, MDE_SP_NOSPACE, 0, compnp->cname); + rval = -1; + goto out; + } + /* + * seed extlist with reserved space at the beginning of the volume and + * enough space for the end watermark. The end watermark always gets + * updated, but if the underlying device changes size it may not be + * pointed to until the extent before it is updated. Since the + * end of the reserved space is where the first watermark starts, + * the reserved extent should never be marked for updating. 
+ */ + + meta_sp_list_insert(NULL, NULL, &extlist, + 0ULL, sp_start, EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset); + meta_sp_list_insert(NULL, NULL, &extlist, + (sp_ext_offset_t)(comp_size - MD_SP_WMSIZE), MD_SP_WMSIZE, + EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset); + + if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) { + rval = -1; + goto out; + } + + metafreenamelist(spnlp); + + if (getenv(META_SP_DEBUG)) { + meta_sp_debug("meta_create_sp: list of used extents:\n"); + meta_sp_list_dump(extlist); + } + + meta_sp_list_freefill(&extlist, metagetsize(compnp, ep)); + + /* get extent list from -o/-b options or from free space */ + if (options & MDCMD_DIRECT) { + if (getenv(META_SP_DEBUG)) { + meta_sp_debug("meta_create_sp: Dumping -o/-b list:\n"); + meta_sp_list_dump(oblist); + } + + numexts = meta_sp_alloc_by_list(sp, np, &extlist, oblist); + if (numexts == -1) { + (void) mdmderror(ep, MDE_SP_OVERLAP, 0, np->cname); + rval = -1; + goto out; + } + } else { + numexts = meta_sp_alloc_by_len(sp, np, &extlist, + &msp->ext.ext_val->len, 0LL, (alignment > 0) ? alignment : + meta_sp_get_default_alignment(sp, compnp, ep)); + if (numexts == -1) { + (void) mdmderror(ep, MDE_SP_NOSPACE, 0, np->cname); + rval = -1; + goto out; + } + } + + assert(extlist != NULL); + + /* create soft partition */ + mp = meta_sp_createunit(msp->common.namep, msp->compnamep, + extlist, numexts, msp->ext.ext_val->len, MD_SP_CREATEPEND, ep); + + create_flag = meta_check_devicesize(mp->c.un_total_blocks); + + /* if we're not doing anything (metainit -n), return success */ + if (! 
(options & MDCMD_DOIT)) { + rval = 0; /* success */ + goto out; + } + + (void) memset(&set_params, 0, sizeof (set_params)); + + if (create_flag == MD_CRO_64BIT) { + mp->c.un_revision = MD_64BIT_META_DEV; + set_params.options = MD_CRO_64BIT; + } else { + mp->c.un_revision = MD_32BIT_META_DEV; + set_params.options = MD_CRO_32BIT; + } + + if (getenv(META_SP_DEBUG)) { + meta_sp_debug("meta_create_sp: printing unit structure\n"); + meta_sp_printunit(mp); + } + + /* + * Check to see if we're trying to create a partition on a mirror. If so + * we may have to enforce an ownership change before writing the + * watermark out. + */ + if (metaismeta(compnp)) { + char *miscname; + + miscname = metagetmiscname(compnp, ep); + if (miscname != NULL) + comp_is_mirror = (strcmp(miscname, MD_MIRROR) == 0); + else + comp_is_mirror = 0; + } else { + comp_is_mirror = 0; + } + + /* + * For a multi-node environment we have to ensure that the master + * node owns an underlying mirror before we issue the MD_IOCSET ioctl. + * If the master does not own the device we will deadlock as the + * implicit write of the watermarks (in sp_ioctl.c) will cause an + * ownership change that will block as the MD_IOCSET is still in + * progress. To close this window we force an owner change to occur + * before issuing the MD_IOCSET. We cannot simply open the device and + * write to it as this will only work for the first soft-partition + * creation. 
+ */ + + if (comp_is_mirror && !metaislocalset(sp)) { + + if ((sd = metaget_setdesc(sp, ep)) == NULL) { + rval = -1; + goto out; + } + if (MD_MNSET_DESC(sd) && sd->sd_mn_am_i_master) { + mm = (mm_unit_t *)meta_get_unit(sp, compnp, ep); + if (mm == NULL) { + rval = -1; + goto out; + } else { + rval = meta_mn_change_owner(&ownpar, sp->setno, + meta_getminor(compnp->dev), + sd->sd_mn_mynode->nd_nodeid, + MD_MN_MM_PREVENT_CHANGE | + MD_MN_MM_SPAWN_THREAD); + if (rval == -1) + goto out; + } + } + } + + set_params.mnum = MD_SID(mp); + set_params.size = mp->c.un_size; + set_params.mdp = (uintptr_t)mp; + MD_SETDRIVERNAME(&set_params, MD_SP, MD_MIN2SET(set_params.mnum)); + + /* first phase of commit. */ + if (metaioctl(MD_IOCSET, &set_params, &set_params.mde, + np->cname) != 0) { + (void) mdstealerror(ep, &set_params.mde); + rval = -1; + goto out; + } + + /* we've successfully committed the record */ + committed = 1; + + /* write watermarks */ + if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) { + rval = -1; + goto out; + } + + /* + * Allow mirror ownership to change. If we don't succeed in this + * ioctl it isn't fatal, but the cluster will probably hang fairly + * soon as the mirror owner won't change. 
However, we have + * successfully written the watermarks out to the device so the + * softpart creation has succeeded + */ + if (ownpar) { + (void) meta_mn_change_owner(&ownpar, sp->setno, ownpar->d.mnum, + ownpar->d.owner, + MD_MN_MM_ALLOW_CHANGE | MD_MN_MM_SPAWN_THREAD); + } + + /* second phase of commit, set status to MD_SP_OK */ + if (meta_sp_setstatus(sp, &(MD_SID(mp)), 1, MD_SP_OK, ep) < 0) { + rval = -1; + goto out; + } + rval = 0; +out: + Free(mp); + if (ownpar) + Free(ownpar); + + if (extlist != NULL) + meta_sp_list_free(&extlist); + + if (rval != 0 && keynlp != NULL && committed != 1) + (void) del_key_names(sp, keynlp, NULL); + + metafreenamelist(keynlp); + + return (rval); +} + +/* + * ************************************************************************** + * Reset (metaclear) Functions * + * ************************************************************************** + */ + +/* + * FUNCTION: meta_sp_reset_common() + * INPUT: sp - the set name of the device to reset + * np - the name of the device to reset + * msp - the unit structure to reset + * options - metaclear options + * OUTPUT: ep - return error pointer + * RETURNS: int - 0 success, -1 error + * PURPOSE: "resets", or more accurately deletes, the soft partition + * specified. First the state is set to "deleting" and then the + * watermarks are all cleared out. Once the watermarks have been + * updated, the unit structure is deleted from the metadb. 
+ */ +static int +meta_sp_reset_common( + mdsetname_t *sp, + mdname_t *np, + md_sp_t *msp, + md_sp_reset_t reset_params, + mdcmdopts_t options, + md_error_t *ep +) +{ + char *miscname; + int rval = -1; + int is_open = 0; + + /* make sure that nobody owns us */ + if (MD_HAS_PARENT(msp->common.parent)) + return (mdmderror(ep, MDE_IN_USE, meta_getminor(np->dev), + np->cname)); + + /* make sure that the soft partition isn't open */ + if ((is_open = meta_isopen(sp, np, ep, options)) < 0) + return (-1); + else if (is_open) + return (mdmderror(ep, MDE_IS_OPEN, meta_getminor(np->dev), + np->cname)); + + /* get miscname */ + if ((miscname = metagetmiscname(np, ep)) == NULL) + return (-1); + + /* fill in reset params */ + MD_SETDRIVERNAME(&reset_params, miscname, sp->setno); + reset_params.mnum = meta_getminor(np->dev); + reset_params.force = (options & MDCMD_FORCE) ? 1 : 0; + + /* + * clear soft partition - phase one. + * place the soft partition into the "delete pending" state. + */ + if (meta_sp_setstatus(sp, &reset_params.mnum, 1, MD_SP_DELPEND, ep) < 0) + return (-1); + + /* + * Now clear the watermarks. If the force flag is specified, + * ignore any errors writing the watermarks and delete the unit + * structure anyway. An error may leave the on-disk format in a + * corrupt state. If force is not specified and we fail here, + * the soft partition will remain in the "delete pending" state. + */ + if ((meta_sp_clear_wm(sp, msp, ep) < 0) && + ((options & MDCMD_FORCE) == 0)) + goto out; + + /* + * clear soft partition - phase two. + * the driver removes the soft partition from the metadb and + * zeros out incore version. 
+ */ + if (metaioctl(MD_IOCRESET, &reset_params, + &reset_params.mde, np->cname) != 0) { + (void) mdstealerror(ep, &reset_params.mde); + goto out; + } + rval = 0; /* success */ + + if (options & MDCMD_PRINT) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: Soft Partition is cleared\n"), + np->cname); + (void) fflush(stdout); + } + + /* + * if told to recurse and on a metadevice, then attempt to + * clear the subdevices. Indicate failure if the clear fails. + */ + if ((options & MDCMD_RECURSE) && + (metaismeta(msp->compnamep)) && + (meta_reset_by_name(sp, msp->compnamep, options, ep) != 0)) + rval = -1; + +out: + meta_invalidate_name(np); + return (rval); +} + +/* + * FUNCTION: meta_sp_reset() + * INPUT: sp - the set name of the device to reset + * np - the name of the device to reset + * options - metaclear options + * OUTPUT: ep - return error pointer + * RETURNS: int - 0 success, -1 error + * PURPOSE: provides the entry point to the rest of libmeta for deleting a + * soft partition. If np is NULL, then soft partitions are + * all deleted at the current level and then recursively deleted. + * Otherwise, if a name is specified either directly or as a + * result of a recursive operation, it deletes only that name. + * Since something sitting under a soft partition may be parented + * to it, we have to reparent that other device to another soft + * partition on the same component if we're deleting the one it's + * parented to. + */ +int +meta_sp_reset( + mdsetname_t *sp, + mdname_t *np, + mdcmdopts_t options, + md_error_t *ep +) +{ + md_sp_t *msp; + int rval = -1; + mdnamelist_t *spnlp = NULL, *nlp = NULL; + md_sp_reset_t reset_params; + int num_sp; + + assert(sp != NULL); + + /* reset/delete all soft paritions */ + if (np == NULL) { + /* + * meta_reset_all sets MDCMD_RECURSE, but this behavior + * is incorrect for soft partitions. We want to clear + * all soft partitions at a particular level in the + * metadevice stack before moving to the next level. 
+ * Thus, we clear MDCMD_RECURSE from the options. + */ + options &= ~MDCMD_RECURSE; + + /* for each soft partition */ + rval = 0; + if (meta_get_sp_names(sp, &spnlp, 0, ep) < 0) + rval = -1; + + for (nlp = spnlp; (nlp != NULL); nlp = nlp->next) { + np = nlp->namep; + if ((msp = meta_get_sp(sp, np, ep)) == NULL) { + rval = -1; + break; + } + /* + * meta_reset_all calls us twice to get soft + * partitions at the top and bottom of the stack. + * thus, if we have a parent, we'll get deleted + * on the next call. + */ + if (MD_HAS_PARENT(msp->common.parent)) + continue; + /* + * If this is a multi-node set, we send a series + * of individual metaclear commands. + */ + if (meta_is_mn_set(sp, ep)) { + if (meta_mn_send_metaclear_command(sp, + np->cname, options, 0, ep) != 0) { + rval = -1; + break; + } + } else { + if (meta_sp_reset(sp, np, options, ep) != 0) { + rval = -1; + break; + } + } + } + /* cleanup return status */ + metafreenamelist(spnlp); + return (rval); + } + + /* check the name */ + if (metachkmeta(np, ep) != 0) + return (-1); + + /* get the unit structure */ + if ((msp = meta_get_sp(sp, np, ep)) == NULL) + return (-1); + + /* clear out reset parameters */ + (void) memset(&reset_params, 0, sizeof (reset_params)); + + /* if our child is a metadevice, we need to deparent/reparent it */ + if (metaismeta(msp->compnamep)) { + /* get sp's on this component */ + if ((num_sp = meta_sp_get_by_component(sp, msp->compnamep, + &spnlp, 1, ep)) <= 0) + /* no sp's on this device. error! */ + return (-1); + else if (num_sp == 1) + /* last sp on this device, so we deparent */ + reset_params.new_parent = MD_NO_PARENT; + else { + /* have to reparent this metadevice */ + for (nlp = spnlp; nlp != NULL; nlp = nlp->next) { + if (meta_getminor(nlp->namep->dev) == + meta_getminor(np->dev)) + continue; + /* + * this isn't the softpart we are deleting, + * so use this device as the new parent. 
+ */ + reset_params.new_parent = + meta_getminor(nlp->namep->dev); + break; + } + } + metafreenamelist(spnlp); + } + + if (meta_sp_reset_common(sp, np, msp, reset_params, options, ep) != 0) + return (-1); + + return (0); +} + +/* + * FUNCTION: meta_sp_reset_component() + * INPUT: sp - the set name of the device to reset + * name - the string name of the device to reset + * options - metaclear options + * OUTPUT: ep - return error pointer + * RETURNS: int - 0 success, -1 error + * PURPOSE: provides the ability to delete all soft partitions on a + * specified device (metaclear -p). It first gets all of the + * soft partitions on the component and then deletes each one + * individually. + */ +int +meta_sp_reset_component( + mdsetname_t *sp, + char *name, + mdcmdopts_t options, + md_error_t *ep +) +{ + mdname_t *compnp, *np; + mdnamelist_t *spnlp = NULL; + mdnamelist_t *nlp = NULL; + md_sp_t *msp; + int count; + md_sp_reset_t reset_params; + + if ((compnp = metaname(&sp, name, ep)) == NULL) + return (-1); + + /* If we're starting out with no soft partitions, it's an error */ + count = meta_sp_get_by_component(sp, compnp, &spnlp, 1, ep); + if (count == 0) + return (mdmderror(ep, MDE_SP_NOSP, 0, compnp->cname)); + else if (count < 0) + return (-1); + + /* + * clear all soft partitions on this component. + * NOTE: we reparent underlying metadevices as we go so that + * things stay sane. Also, if we encounter an error, we stop + * and go no further in case recovery might be needed. 
+ */ + for (nlp = spnlp; nlp != NULL; nlp = nlp->next) { + /* clear out reset parameters */ + (void) memset(&reset_params, 0, sizeof (reset_params)); + + /* check the name */ + np = nlp->namep; + + if (metachkmeta(np, ep) != 0) { + metafreenamelist(spnlp); + return (-1); + } + + /* get the unit structure */ + if ((msp = meta_get_sp(sp, np, ep)) == NULL) { + metafreenamelist(spnlp); + return (-1); + } + + /* have to deparent/reparent metadevices */ + if (metaismeta(compnp)) { + if (nlp->next == NULL) + reset_params.new_parent = MD_NO_PARENT; + else + reset_params.new_parent = + meta_getminor(spnlp->next->namep->dev); + } + + /* clear soft partition */ + if (meta_sp_reset_common(sp, np, msp, reset_params, + options, ep) < 0) { + metafreenamelist(spnlp); + return (-1); + } + } + metafreenamelist(spnlp); + return (0); +} + +/* + * ************************************************************************** + * Grow (metattach) Functions * + * ************************************************************************** + */ + +/* + * FUNCTION: meta_sp_attach() + * INPUT: sp - the set name of the device to attach to + * np - the name of the device to attach to + * addsize - the unparsed string holding the amount of space to add + * options - metattach options + * alignment - data alignment + * OUTPUT: ep - return error pointer + * RETURNS: int - 0 success, -1 error + * PURPOSE: grows a soft partition by reading in the existing unit + * structure and setting its state to Growing, allocating more + * space (similar to meta_create_sp()), updating the watermarks, + * and then writing out the new unit structure in the Okay state. 
+ */ +int +meta_sp_attach( + mdsetname_t *sp, + mdname_t *np, + char *addsize, + mdcmdopts_t options, + sp_ext_length_t alignment, + md_error_t *ep +) +{ + md_grow_params_t grow_params; + sp_ext_length_t grow_len; /* amount to grow */ + mp_unit_t *mp, *new_un; + mdname_t *compnp = NULL; + + sp_ext_node_t *extlist = NULL; + int numexts; + mdnamelist_t *spnlp = NULL; + int count; + md_sp_t *msp; + daddr_t start_block; + + /* should have the same set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(np->dev))); + + /* check name */ + if (metachkmeta(np, ep) != 0) + return (-1); + + if (meta_sp_parsesize(addsize, &grow_len) == -1) { + return (mdmderror(ep, MDE_SP_BAD_LENGTH, 0, np->cname)); + } + + if ((mp = (mp_unit_t *)meta_get_mdunit(sp, np, ep)) == NULL) + return (-1); + + /* make sure we don't have a parent */ + if (MD_HAS_PARENT(mp->c.un_parent)) { + Free(mp); + return (mdmderror(ep, MDE_INVAL_UNIT, 0, np->cname)); + } + + if (getenv(META_SP_DEBUG)) { + meta_sp_debug("meta_sp_attach: Unit structure before new " + "space:\n"); + meta_sp_printunit(mp); + } + + /* + * NOTE: the fast option to metakeyname is 0 as opposed to 1 + * If this was not the case we would suffer the following + * assertion failure: + * Assertion failed: type1 != MDT_FAST_META && type1 != MDT_FAST_COMP + * file meta_check.x, line 315 + * I guess this is because we have not "seen" this drive before + * and hence hit the failure - this is of course the attach routine + */ + if ((compnp = metakeyname(&sp, mp->un_key, 0, ep)) == NULL) { + Free(mp); + return (-1); + } + + /* metakeyname does not fill in the key. */ + compnp->key = mp->un_key; + + /* work out the space on the component that we are dealing with */ + count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep); + + /* + * see if the component has been soft partitioned yet, or if an + * error occurred. 
+ */ + if (count == 0) { + Free(mp); + return (mdmderror(ep, MDE_NOT_SP, 0, np->cname)); + } else if (count < 0) { + Free(mp); + return (-1); + } + + /* + * seed extlist with reserved space at the beginning of the volume and + * enough space for the end watermark. The end watermark always gets + * updated, but if the underlying device changes size it may not be + * pointed to until the extent before it is updated. Since the + * end of the reserved space is where the first watermark starts, + * the reserved extent should never be marked for updating. + */ + if ((start_block = meta_sp_get_start(sp, compnp, ep)) == + MD_DISKADDR_ERROR) { + Free(mp); + return (-1); + } + + meta_sp_list_insert(NULL, NULL, &extlist, 0ULL, start_block, + EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset); + meta_sp_list_insert(NULL, NULL, &extlist, + metagetsize(compnp, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE, + EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset); + + if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) { + Free(mp); + return (-1); + } + + metafreenamelist(spnlp); + + if (getenv(META_SP_DEBUG)) { + meta_sp_debug("meta_sp_attach: list of used extents:\n"); + meta_sp_list_dump(extlist); + } + + meta_sp_list_freefill(&extlist, metagetsize(compnp, ep)); + + assert(mp->un_numexts >= 1); + numexts = meta_sp_alloc_by_len(sp, np, &extlist, &grow_len, + mp->un_ext[mp->un_numexts - 1].un_poff, + (alignment > 0) ? 
alignment : + meta_sp_get_default_alignment(sp, compnp, ep)); + + if (numexts == -1) { + Free(mp); + return (mdmderror(ep, MDE_SP_NOSPACE, 0, np->cname)); + } + + /* allocate new unit structure and copy in old unit */ + if ((new_un = meta_sp_updateunit(np, mp, extlist, + grow_len, numexts, ep)) == NULL) { + Free(mp); + return (-1); + } + Free(mp); + + /* If running in dryrun mode (-n option), we're done here */ + if ((options & MDCMD_DOIT) == 0) { + if (options & MDCMD_PRINT) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: Soft Partition would grow\n"), + np->cname); + (void) fflush(stdout); + } + return (0); + } + + if (getenv(META_SP_DEBUG)) { + meta_sp_debug("meta_sp_attach: updated unit structure:\n"); + meta_sp_printunit(new_un); + } + + assert(new_un != NULL); + + (void) memset(&grow_params, 0, sizeof (grow_params)); + if (new_un->c.un_total_blocks > MD_MAX_BLKS_FOR_SMALL_DEVS) { + grow_params.options = MD_CRO_64BIT; + new_un->c.un_revision = MD_64BIT_META_DEV; + } else { + grow_params.options = MD_CRO_32BIT; + new_un->c.un_revision = MD_32BIT_META_DEV; + } + grow_params.mnum = MD_SID(new_un); + grow_params.size = new_un->c.un_size; + grow_params.mdp = (uintptr_t)new_un; + MD_SETDRIVERNAME(&grow_params, MD_SP, MD_MIN2SET(grow_params.mnum)); + + if (metaioctl(MD_IOCGROW, &grow_params, &grow_params.mde, + np->cname) != 0) { + (void) mdstealerror(ep, &grow_params.mde); + return (-1); + } + + /* update all watermarks */ + + if ((msp = meta_get_sp(sp, np, ep)) == NULL) + return (-1); + if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) + return (-1); + + + /* second phase of commit, set status to MD_SP_OK */ + if (meta_sp_setstatus(sp, &(MD_SID(new_un)), 1, MD_SP_OK, ep) < 0) + return (-1); + + meta_invalidate_name(np); + + if (options & MDCMD_PRINT) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: Soft Partition has been grown\n"), + np->cname); + (void) fflush(stdout); + } + + return (0); +} + +/* + * 
************************************************************************** + * Recovery (metarecover) Functions * + * ************************************************************************** + */ + +/* + * FUNCTION: meta_recover_sp() + * INPUT: sp - the name of the set we are recovering on + * compnp - name pointer for device we are recovering on + * argc - argument count + * argv - left over arguments not parsed by metarecover command + * options - metarecover options + * OUTPUT: ep - return error pointer + * RETURNS: int - 0 - success, -1 - error + * PURPOSE: parse soft partitioning-specific metarecover options and + * dispatch to the appropriate function to handle recovery. + */ +int +meta_recover_sp( + mdsetname_t *sp, + mdname_t *compnp, + int argc, + char *argv[], + mdcmdopts_t options, + md_error_t *ep +) +{ + md_set_desc *sd; + + if (argc > 1) { + (void) meta_cook_syntax(ep, MDE_SYNTAX, compnp->cname, + argc, argv); + return (-1); + } + + /* + * For a MN set, this operation must be performed on the master + * as it is responsible for maintaining the watermarks + */ + if (!metaislocalset(sp)) { + if ((sd = metaget_setdesc(sp, ep)) == NULL) + return (-1); + if (MD_MNSET_DESC(sd) && !sd->sd_mn_am_i_master) { + (void) mddserror(ep, MDE_DS_MASTER_ONLY, sp->setno, + sd->sd_mn_master_nodenm, NULL, NULL); + return (-1); + } + } + if (argc == 0) { + /* + * if no additional arguments are passed, metarecover should + * validate both on-disk and metadb structures as well as + * checking that both are consistent with each other + */ + if (meta_sp_validate_wm(sp, compnp, options, ep) < 0) + return (-1); + if (meta_sp_validate_unit(sp, compnp, options, ep) < 0) + return (-1); + if (meta_sp_validate_wm_and_unit(sp, compnp, options, ep) < 0) + return (-1); + } else if (strcmp(argv[0], "-d") == 0) { + /* + * Ensure that there is no existing valid record for this + * soft-partition. If there is we have nothing to do. 
+ */ + if (meta_sp_validate_unit(sp, compnp, options, ep) == 0) + return (-1); + /* validate and recover from on-disk structures */ + if (meta_sp_validate_wm(sp, compnp, options, ep) < 0) + return (-1); + if (meta_sp_recover_from_wm(sp, compnp, options, ep) < 0) + return (-1); + } else if (strcmp(argv[0], "-m") == 0) { + /* validate and recover from metadb structures */ + if (meta_sp_validate_unit(sp, compnp, options, ep) < 0) + return (-1); + if (meta_sp_recover_from_unit(sp, compnp, options, ep) < 0) + return (-1); + } else { + /* syntax error */ + (void) meta_cook_syntax(ep, MDE_SYNTAX, compnp->cname, + argc, argv); + return (-1); + } + + return (0); +} + +/* + * FUNCTION: meta_sp_display_exthdr() + * INPUT: none + * OUTPUT: none + * RETURNS: void + * PURPOSE: print header line for sp_ext_node_t information. to be used + * in conjunction with meta_sp_display_ext(). + */ +static void +meta_sp_display_exthdr(void) +{ + (void) printf("%20s %5s %7s %20s %20s\n", + dgettext(TEXT_DOMAIN, "Name"), + dgettext(TEXT_DOMAIN, "Seq#"), + dgettext(TEXT_DOMAIN, "Type"), + dgettext(TEXT_DOMAIN, "Offset"), + dgettext(TEXT_DOMAIN, "Length")); +} + + +/* + * FUNCTION: meta_sp_display_ext() + * INPUT: ext - extent to display + * OUTPUT: none + * RETURNS: void + * PURPOSE: print selected fields from sp_ext_node_t. 
+ */ +static void +meta_sp_display_ext(sp_ext_node_t *ext) +{ + /* print extent information */ + if (ext->ext_namep != NULL) + (void) printf("%20s ", ext->ext_namep->cname); + else + (void) printf("%20s ", "NONE"); + + (void) printf("%5u ", ext->ext_seq); + + switch (ext->ext_type) { + case EXTTYP_ALLOC: + (void) printf("%7s ", "ALLOC"); + break; + case EXTTYP_FREE: + (void) printf("%7s ", "FREE"); + break; + case EXTTYP_RESERVED: + (void) printf("%7s ", "RESV"); + break; + case EXTTYP_END: + (void) printf("%7s ", "END"); + break; + default: + (void) printf("%7s ", "INVLD"); + break; + } + + (void) printf("%20llu %20llu\n", ext->ext_offset, ext->ext_length); +} + + +/* + * FUNCTION: meta_sp_checkseq() + * INPUT: extlist - list of extents to be checked + * OUTPUT: none + * RETURNS: int - 0 - success, -1 - error + * PURPOSE: check soft partition sequence numbers. this function assumes + * that a list of extents representing 1 or more soft partitions + * is passed in sorted in sequence number order. within a + * single soft partition, there may not be any missing or + * duplicate sequence numbers. + */ +static int +meta_sp_checkseq(sp_ext_node_t *extlist) +{ + sp_ext_node_t *ext; + + assert(extlist != NULL); + + for (ext = extlist; + ext->ext_next != NULL && ext->ext_next->ext_type == EXTTYP_ALLOC; + ext = ext->ext_next) { + if (ext->ext_next->ext_namep != NULL && + strcmp(ext->ext_next->ext_namep->cname, + ext->ext_namep->cname) != 0) + continue; + + if (ext->ext_next->ext_seq != ext->ext_seq + 1) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%s: sequence numbers are " + "incorrect: %d should be %d\n"), + ext->ext_next->ext_namep->cname, + ext->ext_next->ext_seq, ext->ext_seq + 1); + return (-1); + } + } + return (0); +} + + +/* + * FUNCTION: meta_sp_resolve_name_conflict() + * INPUT: sp - name of set we're are recovering in. + * old_np - name pointer of soft partition we found on disk. + * OUTPUT: new_np - name pointer for new soft partition name. 
+ * ep - error pointer returned. + * RETURNS: int - 0 - name not replace, 1 - name replaced, -1 - error + * PURPOSE: Check to see if the name of one of the soft partitions we found + * on disk already exists in the metadb. If so, prompt for a new + * name. In addition, we keep a static array of names that + * will be recovered from this device since these names don't + * exist in the configuration at this point but cannot be + * recovered more than once. + */ +static int +meta_sp_resolve_name_conflict( + mdsetname_t *sp, + mdname_t *old_np, + mdname_t **new_np, + md_error_t *ep +) +{ + char yesno[255]; + char *yes; + char newname[MD_SP_MAX_DEVNAME_PLUS_1]; + int nunits; + static int *used_names = NULL; + + assert(old_np != NULL); + + if (used_names == NULL) { + if ((nunits = meta_get_nunits(ep)) < 0) + return (-1); + used_names = Zalloc(nunits * sizeof (int)); + } + + /* see if it exists already */ + if (used_names[MD_MIN2UNIT(meta_getminor(old_np->dev))] == 0 && + metagetmiscname(old_np, ep) == NULL) { + if (! mdismderror(ep, MDE_UNIT_NOT_SETUP)) + return (-1); + else { + used_names[MD_MIN2UNIT(meta_getminor(old_np->dev))] = 1; + mdclrerror(ep); + return (0); + } + } + + /* name exists, ask the user for a new one */ + (void) printf(dgettext(TEXT_DOMAIN, + "WARNING: A soft partition named %s was found in the extent\n" + "headers, but this name already exists in the metadb " + "configuration.\n" + "In order to continue recovery you must supply\n" + "a new name for this soft partition.\n"), old_np->cname); + (void) printf(dgettext(TEXT_DOMAIN, + "Would you like to continue and supply a new name? 
(yes/no) ")); + + (void) fflush(stdout); + if ((fgets(yesno, sizeof (yesno), stdin) == NULL) || + (strlen(yesno) == 1)) + (void) snprintf(yesno, sizeof (yesno), "%s\n", + dgettext(TEXT_DOMAIN, "no")); + yes = dgettext(TEXT_DOMAIN, "yes"); + if (strncasecmp(yesno, yes, strlen(yesno) - 1) != 0) { + return (-1); + } + + (void) fflush(stdin); + + /* get the new name */ + for (;;) { + (void) printf(dgettext(TEXT_DOMAIN, "Please enter a new name " + "for this soft partition (dXXXX) ")); + (void) fflush(stdout); + if (fgets(newname, MD_SP_MAX_DEVNAME_PLUS_1, stdin) == NULL) + (void) strcpy(newname, ""); + + /* remove newline character */ + if (newname[strlen(newname) - 1] == '\n') + newname[strlen(newname) - 1] = '\0'; + + if (!(is_metaname(newname)) || + (meta_init_make_device(&sp, newname, ep) != 0)) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "Invalid metadevice name\n")); + (void) fflush(stderr); + continue; + } + + if ((*new_np = metaname(&sp, newname, ep)) == NULL) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "Invalid metadevice name\n")); + (void) fflush(stderr); + continue; + } + + assert(MD_MIN2UNIT(meta_getminor((*new_np)->dev)) < nunits); + /* make sure the name isn't already being used */ + if (used_names[MD_MIN2UNIT(meta_getminor((*new_np)->dev))] || + metagetmiscname(*new_np, ep) != NULL) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "That name already exists\n")); + continue; + } else if (! mdismderror(ep, MDE_UNIT_NOT_SETUP)) + return (-1); + + break; + } + + /* got a new name, place in used array and return */ + used_names[MD_MIN2UNIT(meta_getminor((*new_np)->dev))] = 1; + mdclrerror(ep); + return (1); +} + +/* + * FUNCTION: meta_sp_validate_wm() + * INPUT: sp - set name we are recovering in + * compnp - name pointer for device we are recovering from + * options - metarecover options + * OUTPUT: ep - error pointer returned + * RETURNS: int - 0 - success, -1 - error + * PURPOSE: validate and display watermark configuration. 
walk the + * on-disk watermark structures and validate the information + * found within. since a watermark configuration is + * "self-defining", the act of traversing the watermarks + * is part of the validation process. + */ +static int +meta_sp_validate_wm( + mdsetname_t *sp, + mdname_t *compnp, + mdcmdopts_t options, + md_error_t *ep +) +{ + sp_ext_node_t *extlist = NULL; + sp_ext_node_t *ext; + int num_sps = 0; + int rval; + + if ((options & MDCMD_VERBOSE) != 0) + (void) printf(dgettext(TEXT_DOMAIN, + "Verifying on-disk structures on %s.\n"), + compnp->cname); + + /* + * for each watermark, build an ext_node, place on list. + */ + rval = meta_sp_extlist_from_wm(sp, compnp, &extlist, + meta_sp_cmp_by_nameseq, ep); + + if ((options & MDCMD_VERBOSE) != 0) { + /* print out what we found */ + if (extlist == NULL) + (void) printf(dgettext(TEXT_DOMAIN, + "No extent headers found on %s.\n"), + compnp->cname); + else { + (void) printf(dgettext(TEXT_DOMAIN, + "The following extent headers were found on %s.\n"), + compnp->cname); + meta_sp_display_exthdr(); + } + for (ext = extlist; ext != NULL; ext = ext->ext_next) + meta_sp_display_ext(ext); + } + + if (rval < 0) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: On-disk structures invalid or " + "no soft partitions found.\n"), + compnp->cname); + return (-1); + } + + assert(extlist != NULL); + + /* count number of soft partitions */ + for (ext = extlist; + ext != NULL && ext->ext_type == EXTTYP_ALLOC; + ext = ext->ext_next) { + if (ext->ext_next != NULL && + ext->ext_next->ext_namep != NULL && + strcmp(ext->ext_next->ext_namep->cname, + ext->ext_namep->cname) == 0) + continue; + num_sps++; + } + + if ((options & MDCMD_VERBOSE) != 0) + (void) printf(dgettext(TEXT_DOMAIN, + "Found %d soft partition(s) on %s.\n"), num_sps, + compnp->cname); + + if (num_sps == 0) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: No soft partitions.\n"), compnp->cname); + return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname)); + } + + /* 
check sequence numbers */ + if ((options & MDCMD_VERBOSE) != 0) + (void) printf(dgettext(TEXT_DOMAIN, + "Checking sequence numbers.\n")); + + if (meta_sp_checkseq(extlist) != 0) + return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname)); + + return (0); +} + +/* + * FUNCTION: meta_sp_validate_unit() + * INPUT: sp - name of set we are recovering in + * compnp - name of component we are recovering from + * options - metarecover options + * OUTPUT: ep - error pointer returned + * RETURNS: int - 0 - success, -1 - error + * PURPOSE: validate and display metadb configuration. begin by getting + * all soft partitions built on the specified component. get + * the unit structure for each one and validate the fields within. + */ +static int +meta_sp_validate_unit( + mdsetname_t *sp, + mdname_t *compnp, + mdcmdopts_t options, + md_error_t *ep +) +{ + md_sp_t *msp; + mdnamelist_t *spnlp = NULL; + mdnamelist_t *namep = NULL; + int count; + uint_t extn; + sp_ext_length_t size; + + if ((options & MDCMD_VERBOSE) != 0) + (void) printf(dgettext(TEXT_DOMAIN, + "%s: Validating soft partition metadb entries.\n"), + compnp->cname); + + if ((size = metagetsize(compnp, ep)) == MD_DISKADDR_ERROR) + return (-1); + + /* get all soft partitions on component */ + count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep); + + if (count == 0) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: No soft partitions.\n"), compnp->cname); + return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname)); + } else if (count < 0) { + return (-1); + } + + /* Now go through the soft partitions and check each one */ + for (namep = spnlp; namep != NULL; namep = namep->next) { + mdname_t *curnp = namep->namep; + sp_ext_offset_t curvoff; + + /* get the unit structure */ + if ((msp = meta_get_sp_common(sp, curnp, 0, ep)) == NULL) + return (-1); + + /* verify generic unit structure parameters */ + if ((options & MDCMD_VERBOSE) != 0) + (void) printf(dgettext(TEXT_DOMAIN, + "\nVerifying device %s.\n"), + 
curnp->cname); + + /* + * MD_SP_LAST is an invalid state and is always the + * highest numbered. + */ + if (msp->status >= MD_SP_LAST) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: status value %u is out of range.\n"), + curnp->cname, msp->status); + return (mdmderror(ep, MDE_RECOVER_FAILED, + 0, curnp->cname)); + } else if ((options & MDCMD_VERBOSE) != 0) { + uint_t tstate = 0; + + if (metaismeta(msp->compnamep)) { + if (meta_get_tstate(msp->common.namep->dev, + &tstate, ep) != 0) + return (-1); + } + (void) printf(dgettext(TEXT_DOMAIN, + "%s: Status \"%s\" is valid.\n"), + curnp->cname, meta_sp_status_to_name(msp->status, + tstate & MD_DEV_ERRORED)); + } + + /* Now verify each extent */ + if ((options & MDCMD_VERBOSE) != 0) + (void) printf("%14s %21s %21s %21s\n", + dgettext(TEXT_DOMAIN, "Extent Number"), + dgettext(TEXT_DOMAIN, "Virtual Offset"), + dgettext(TEXT_DOMAIN, "Physical Offset"), + dgettext(TEXT_DOMAIN, "Length")); + + curvoff = 0ULL; + for (extn = 0; extn < msp->ext.ext_len; extn++) { + md_sp_ext_t *extp = &msp->ext.ext_val[extn]; + + if ((options & MDCMD_VERBOSE) != 0) + (void) printf("%14u %21llu %21llu %21llu\n", + extn, extp->voff, extp->poff, extp->len); + + if (extp->voff != curvoff) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%s: virtual offset for extent %u " + "is inconsistent, expected %llu, " + "got %llu.\n"), curnp->cname, extn, + curvoff, extp->voff); + return (mdmderror(ep, MDE_RECOVER_FAILED, + 0, compnp->cname)); + } + + /* make sure extent does not drop off the end */ + if ((extp->poff + extp->len) == size) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%s: extent %u at offset %llu, " + "length %llu exceeds the size of the " + "device, %llu.\n"), curnp->cname, + extn, extp->poff, extp->len, size); + return (mdmderror(ep, MDE_RECOVER_FAILED, + 0, compnp->cname)); + } + + curvoff += extp->len; + } + } + if (options & MDCMD_PRINT) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: Soft Partition metadb configuration is 
valid\n"), + compnp->cname); + } + return (0); +} + +/* + * FUNCTION: meta_sp_validate_wm_and_unit() + * INPUT: sp - name of set we are recovering in + * compnp - name of device we are recovering from + * options - metarecover options + * OUTPUT: ep - error pointer returned + * RETURNS: int - 0 - success, -1 error + * PURPOSE: cross-validate and display watermarks and metadb records. + * get both the unit structures for the soft partitions built + * on the specified component and the watermarks found on that + * component and check to make sure they are consistent with + * each other. + */ +static int +meta_sp_validate_wm_and_unit( + mdsetname_t *sp, + mdname_t *np, + mdcmdopts_t options, + md_error_t *ep +) +{ + sp_ext_node_t *wmlist = NULL; + sp_ext_node_t *unitlist = NULL; + sp_ext_node_t *unitext; + sp_ext_node_t *wmext; + sp_ext_offset_t tmpunitoff; + mdnamelist_t *spnlp = NULL; + int count; + int rval = 0; + int verbose = (options & MDCMD_VERBOSE); + + /* get unit structure list */ + count = meta_sp_get_by_component(sp, np, &spnlp, 0, ep); + if (count <= 0) + return (-1); + + meta_sp_list_insert(NULL, NULL, &unitlist, + metagetsize(np, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE, + EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset); + + if (meta_sp_extlist_from_namelist(sp, spnlp, &unitlist, ep) == -1) { + metafreenamelist(spnlp); + return (-1); + } + + metafreenamelist(spnlp); + + meta_sp_list_freefill(&unitlist, metagetsize(np, ep)); + + if (meta_sp_extlist_from_wm(sp, np, &wmlist, + meta_sp_cmp_by_offset, ep) < 0) { + meta_sp_list_free(&unitlist); + return (-1); + } + + if (getenv(META_SP_DEBUG)) { + meta_sp_debug("meta_sp_validate_wm_and_unit: unit list:\n"); + meta_sp_list_dump(unitlist); + meta_sp_debug("meta_sp_validate_wm_and_unit: wm list:\n"); + meta_sp_list_dump(wmlist); + } + + /* + * step through both lists and compare allocated nodes. 
Free + * nodes and end watermarks may differ between the two but + * that's generally ok, and if they're wrong will typically + * cause misplaced allocated extents. + */ + if (verbose) + (void) printf(dgettext(TEXT_DOMAIN, "\n%s: Verifying metadb " + "allocations match extent headers.\n"), np->cname); + + unitext = unitlist; + wmext = wmlist; + while ((wmext != NULL) && (unitext != NULL)) { + /* find next allocated extents in each list */ + while (wmext != NULL && wmext->ext_type != EXTTYP_ALLOC) + wmext = wmext->ext_next; + + while (unitext != NULL && unitext->ext_type != EXTTYP_ALLOC) + unitext = unitext->ext_next; + + if (wmext == NULL || unitext == NULL) + break; + + if (verbose) { + (void) printf(dgettext(TEXT_DOMAIN, + "Metadb extent:\n")); + meta_sp_display_exthdr(); + meta_sp_display_ext(unitext); + (void) printf(dgettext(TEXT_DOMAIN, + "Extent header extent:\n")); + meta_sp_display_exthdr(); + meta_sp_display_ext(wmext); + (void) printf("\n"); + } + + if (meta_sp_validate_exts(np, wmext, unitext, ep) < 0) + rval = -1; + + /* + * if the offsets aren't equal, only increment the + * lowest one in hopes of getting the lists back in sync. + */ + tmpunitoff = unitext->ext_offset; + if (unitext->ext_offset <= wmext->ext_offset) + unitext = unitext->ext_next; + if (wmext->ext_offset <= tmpunitoff) + wmext = wmext->ext_next; + } + + /* + * if both lists aren't at the end then there are extra + * allocated nodes in one of them. 
+ */ + if (wmext != NULL) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%s: extent headers contain allocations not in " + "the metadb\n\n"), np->cname); + rval = -1; + } + + if (unitext != NULL) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%s: metadb contains allocations not in the extent " + "headers\n\n"), np->cname); + rval = -1; + } + + if (options & MDCMD_PRINT) { + if (rval == 0) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: Soft Partition metadb matches extent " + "header configuration\n"), np->cname); + } else { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: Soft Partition metadb does not match extent " + "header configuration\n"), np->cname); + } + } + + return (rval); +} + +/* + * FUNCTION: meta_sp_validate_exts() + * INPUT: compnp - name pointer for device we are recovering from + * wmext - extent node representing watermark + * unitext - extent node from unit structure + * OUTPUT: ep - return error pointer + * RETURNS: int - 0 - succes, mdmderror return code - error + * PURPOSE: Takes two extent nodes and checks them against each other. + * offset, length, sequence number, set, and name are compared. 
+ */ +static int +meta_sp_validate_exts( + mdname_t *compnp, + sp_ext_node_t *wmext, + sp_ext_node_t *unitext, + md_error_t *ep +) +{ + if (wmext->ext_offset != unitext->ext_offset) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%s: unit structure and extent header offsets differ.\n"), + compnp->cname); + return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname)); + } + + if (wmext->ext_length != unitext->ext_length) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%s: unit structure and extent header lengths differ.\n"), + compnp->cname); + return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname)); + } + + if (wmext->ext_seq != unitext->ext_seq) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%s: unit structure and extent header sequence numbers " + "differ.\n"), compnp->cname); + return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname)); + } + + if (wmext->ext_type != unitext->ext_type) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%s: unit structure and extent header types differ.\n"), + compnp->cname); + return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname)); + } + + /* + * If one has a set pointer and the other doesn't, error. + * If both extents have setnames, then make sure they match + * If both are NULL, it's ok, they match. + */ + if ((unitext->ext_setp == NULL) ^ (wmext->ext_setp == NULL)) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%s: unit structure and extent header set values " + "differ.\n"), compnp->cname); + return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname)); + } + + if (unitext->ext_setp != NULL) { + if (strcmp(unitext->ext_setp->setname, + wmext->ext_setp->setname) != 0) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%s: unit structure and extent header set names " + "differ.\n"), compnp->cname); + return (mdmderror(ep, MDE_RECOVER_FAILED, + 0, compnp->cname)); + } + } + + /* + * If one has a name pointer and the other doesn't, error. 
+ * If both extents have names, then make sure they match + * If both are NULL, it's ok, they match. + */ + if ((unitext->ext_namep == NULL) ^ (wmext->ext_namep == NULL)) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%s: unit structure and extent header name values " + "differ.\n"), compnp->cname); + return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname)); + } + + if (unitext->ext_namep != NULL) { + if (strcmp(wmext->ext_namep->cname, + unitext->ext_namep->cname) != 0) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%s: unit structure and extent header names " + "differ.\n"), compnp->cname); + return (mdmderror(ep, MDE_RECOVER_FAILED, + 0, compnp->cname)); + } + } + + return (0); +} + +/* + * FUNCTION: update_sp_status() + * INPUT: sp - name of set we are recovering in + * minors - pointer to an array of soft partition minor numbers + * num_sps - number of minor numbers in array + * status - new status to be applied to all soft parts in array + * mn_set - set if current set is a multi-node set + * OUTPUT: ep - return error pointer + * RETURNS: int - 0 - success, -1 - error + * PURPOSE: update status of soft partitions to new status. minors is an + * array of minor numbers to apply the new status to. + * If mn_set is set, a message is sent to all nodes in the + * cluster to update the status locally. 
+ */ +static int +update_sp_status( + mdsetname_t *sp, + minor_t *minors, + int num_sps, + sp_status_t status, + bool_t mn_set, + md_error_t *ep +) +{ + int i; + int err = 0; + + if (mn_set) { + md_mn_msg_sp_setstat_t sp_setstat_params; + int result; + md_mn_result_t *resp = NULL; + + for (i = 0; i < num_sps; i++) { + sp_setstat_params.sp_setstat_mnum = minors[i]; + sp_setstat_params.sp_setstat_status = status; + + result = mdmn_send_message(sp->setno, + MD_MN_MSG_SP_SETSTAT, MD_MSGF_DEFAULT_FLAGS, + (char *)&sp_setstat_params, + sizeof (sp_setstat_params), + &resp, ep); + if (resp != NULL) { + if (resp->mmr_exitval != 0) + err = -1; + free_result(resp); + } + if (result != 0) { + err = -1; + } + } + } else { + if (meta_sp_setstatus(sp, minors, num_sps, status, ep) < 0) + err = -1; + } + if (err < 0) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "Error updating status on recovered soft " + "partitions.\n")); + } + return (err); +} + +/* + * FUNCTION: meta_sp_recover_from_wm() + * INPUT: sp - name of set we are recovering in + * compnp - name pointer for component we are recovering from + * options - metarecover options + * OUTPUT: ep - return error pointer + * RETURNS: int - 0 - success, -1 - error + * PURPOSE: update metadb records to match watermarks. begin by getting + * an extlist representing all soft partitions on the component. + * then build a unit structure for each soft partition. + * notify user of changes, then commit each soft partition to + * the metadb one at a time in the "recovering" state. update + * any watermarks that may need it (to reflect possible name + * changes), and, finally, set the status of all recovered + * partitions to the "OK" state at once. 
 */
static int
meta_sp_recover_from_wm(
	mdsetname_t	*sp,
	mdname_t	*compnp,
	mdcmdopts_t	options,
	md_error_t	*ep
)
{
	sp_ext_node_t		*extlist = NULL;
	sp_ext_node_t		*sp_list = NULL;
	sp_ext_node_t		*update_list = NULL;
	sp_ext_node_t		*ext;
	sp_ext_node_t		*sp_ext;
	mp_unit_t		*mp;
	mp_unit_t		**un_array;
	int			numexts = 0, num_sps = 0, i = 0;
	int			err = 0;
	int			not_recovered = 0;
	int			committed = 0;
	sp_ext_length_t		sp_length = 0LL;
	mdnamelist_t		*keynlp = NULL;
	mdname_t		*np;
	mdname_t		*new_np;
	int			new_name;
	md_set_params_t		set_params;
	minor_t			*minors = NULL;
	char			yesno[255];
	char			*yes;
	bool_t			mn_set = 0;
	md_set_desc		*sd;
	mm_unit_t		*mm;
	md_set_mmown_params_t	*ownpar = NULL;
	int			comp_is_mirror = 0;

	/*
	 * if this component appears in another metadevice already, do
	 * NOT recover from it.
	 */
	if (meta_check_inmeta(sp, compnp, options, 0, -1, ep) != 0)
		return (-1);

	/* set flag if dealing with a MN set */
	if (!metaislocalset(sp)) {
		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
			return (-1);
		}
		if (MD_MNSET_DESC(sd))
			mn_set = 1;
	}
	/*
	 * for each watermark, build an ext_node, place on list.
	 */
	if (meta_sp_extlist_from_wm(sp, compnp, &extlist,
	    meta_sp_cmp_by_nameseq, ep) < 0)
		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));

	assert(extlist != NULL);

	/*
	 * count number of soft partitions: consecutive allocated extents
	 * that carry the same name belong to one soft partition, so only
	 * the last extent of each run is counted.
	 */
	for (ext = extlist;
	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
	    ext = ext->ext_next) {
		if (ext->ext_next != NULL &&
		    ext->ext_next->ext_namep != NULL &&
		    strcmp(ext->ext_next->ext_namep->cname,
		    ext->ext_namep->cname) == 0)
				continue;
		num_sps++;
	}

	/* allocate array of unit structure pointers */
	un_array = Zalloc(num_sps * sizeof (mp_unit_t *));

	/*
	 * build unit structures from list of ext_nodes.
	 */
	for (ext = extlist;
	    ext != NULL && ext->ext_type == EXTTYP_ALLOC;
	    ext = ext->ext_next) {
		/* accumulate this extent into the current partition's list */
		meta_sp_list_insert(ext->ext_setp, ext->ext_namep,
		    &sp_list, ext->ext_offset, ext->ext_length,
		    ext->ext_type, ext->ext_seq, ext->ext_flags,
		    meta_sp_cmp_by_nameseq);

		numexts++;
		sp_length += ext->ext_length - MD_SP_WMSIZE;

		if (ext->ext_next != NULL &&
		    ext->ext_next->ext_namep != NULL &&
		    strcmp(ext->ext_next->ext_namep->cname,
		    ext->ext_namep->cname) == 0)
				continue;

		/*
		 * if we made it here, we are at a soft partition
		 * boundary in the list.
		 */
		if (getenv(META_SP_DEBUG)) {
			meta_sp_debug("meta_recover_from_wm: dumping wm "
			    "list:\n");
			meta_sp_list_dump(sp_list);
		}

		assert(sp_list != NULL);
		assert(sp_list->ext_namep != NULL);

		/*
		 * a negative return is an error; a non-zero positive return
		 * means new_np holds a replacement name.
		 */
		if ((new_name = meta_sp_resolve_name_conflict(sp,
		    sp_list->ext_namep, &new_np, ep)) < 0) {
			err = 1;
			goto out;
		} else if (new_name) {
			for (sp_ext = sp_list;
			    sp_ext != NULL;
			    sp_ext = sp_ext->ext_next) {
				/*
				 * insert into the update list for
				 * watermark update.
				 */
				meta_sp_list_insert(sp_ext->ext_setp,
				    new_np, &update_list, sp_ext->ext_offset,
				    sp_ext->ext_length, sp_ext->ext_type,
				    sp_ext->ext_seq, EXTFLG_UPDATE,
				    meta_sp_cmp_by_offset);
			}

		}
		if (options & MDCMD_DOIT) {
			/* store name in namespace */
			if (mn_set) {
				/* send message to all nodes to return key */
				md_mn_msg_addkeyname_t	*send_params;
				int			result;
				md_mn_result_t		*resp = NULL;
				int			message_size;

				message_size = sizeof (*send_params) +
				    strlen(compnp->cname) + 1;
				send_params = Zalloc(message_size);
				send_params->addkeyname_setno = sp->setno;
				(void) strcpy(&send_params->addkeyname_name[0],
				    compnp->cname);
				result = mdmn_send_message(sp->setno,
				    MD_MN_MSG_ADDKEYNAME, MD_MSGF_DEFAULT_FLAGS,
				    (char *)send_params, message_size, &resp,
				    ep);
				Free(send_params);
				if (resp != NULL) {
					/* a non-negative exit value is the key */
					if (resp->mmr_exitval >= 0) {
						compnp->key =
						    (mdkey_t)resp->mmr_exitval;
					} else {
						err = 1;
						free_result(resp);
						goto out;
					}
					free_result(resp);
				}
				if (result != 0) {
					err = 1;
					goto out;
				}
				(void) metanamelist_append(&keynlp, compnp);
			} else {
				if (add_key_name(sp, compnp, &keynlp,
				    ep) != 0) {
					err = 1;
					goto out;
				}
			}
		}

		/* create the unit structure */
		if ((mp = meta_sp_createunit(
		    (new_name) ? new_np : sp_list->ext_namep, compnp,
		    sp_list, numexts, sp_length, MD_SP_RECOVER, ep)) == NULL) {
			err = 1;
			goto out;
		}

		if (getenv(META_SP_DEBUG)) {
			meta_sp_debug("meta_sp_recover_from_wm: "
			    "printing newly created unit structure");
			meta_sp_printunit(mp);
		}

		/* place in unit structure array */
		un_array[i++] = mp;

		/* free sp_list and reset the per-partition accumulators */
		meta_sp_list_free(&sp_list);
		sp_list = NULL;
		numexts = 0;
		sp_length = 0LL;
	}

	/* display configuration updates */
	(void) printf(dgettext(TEXT_DOMAIN,
	    "The following soft partitions were found and will be added to\n"
	    "your metadevice configuration.\n"));
	(void) printf("%5s    %15s    %18s\n",
	    dgettext(TEXT_DOMAIN, "Name"),
	    dgettext(TEXT_DOMAIN, "Size"),
	    dgettext(TEXT_DOMAIN, "No. of Extents"));
	for (i = 0; i < num_sps; i++) {
		(void) printf("%5s%lu    %15llu    %9d\n", "d",
		    MD_MIN2UNIT(MD_SID(un_array[i])),
		    un_array[i]->un_length, un_array[i]->un_numexts);
	}

	/* without MDCMD_DOIT this is a dry run: report and stop here */
	if (!(options & MDCMD_DOIT)) {
		not_recovered = 1;
		goto out;
	}

	/* ask user for confirmation */
	(void) printf(dgettext(TEXT_DOMAIN,
	    "WARNING: You are about to add one or more soft partition\n"
	    "metadevices to your metadevice configuration.  If there\n"
	    "appears to be an error in the soft partition(s) displayed\n"
	    "above, do NOT proceed with this recovery operation.\n"));
	(void) printf(dgettext(TEXT_DOMAIN,
	    "Are you sure you want to do this (yes/no)? "));

	(void) fflush(stdout);
	/* EOF or an empty line (just the newline) is treated as "no" */
	if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
	    (strlen(yesno) == 1))
		(void) snprintf(yesno, sizeof (yesno), "%s\n",
		    dgettext(TEXT_DOMAIN, "no"));
	yes = dgettext(TEXT_DOMAIN, "yes");
	/* compare only as many characters as the user typed (minus newline) */
	if (strncasecmp(yesno, yes, strlen(yesno) - 1) != 0) {
		not_recovered = 1;
		goto out;
	}

	/* commit records one at a time */
	for (i = 0; i < num_sps; i++) {
		(void) memset(&set_params, 0, sizeof (set_params));
		set_params.mnum = MD_SID(un_array[i]);
		set_params.size = (un_array[i])->c.un_size;
		set_params.mdp = (uintptr_t)(un_array[i]);
		set_params.options =
		    meta_check_devicesize(un_array[i]->un_length);
		if (set_params.options == MD_CRO_64BIT) {
			un_array[i]->c.un_revision = MD_64BIT_META_DEV;
		} else {
			un_array[i]->c.un_revision = MD_32BIT_META_DEV;
		}
		MD_SETDRIVERNAME(&set_params, MD_SP,
		    MD_MIN2SET(set_params.mnum));

		/*
		 * NOTE(review): np is dereferenced below without a NULL
		 * check -- confirm metamnumname() cannot fail here.
		 */
		np = metamnumname(&sp, MD_SID(un_array[i]), 0, ep);

		/*
		 * If this is an MN set, send the MD_IOCSET ioctl to all nodes
		 */
		if (mn_set) {
			md_mn_msg_iocset_t	send_params;
			int			result;
			md_mn_result_t		*resp = NULL;
			int			mess_size;

			/*
			 * Calculate message size. md_mn_msg_iocset_t only
			 * contains one extent, so increment the size to
			 * include all extents
			 */
			mess_size = sizeof (send_params) -
			    sizeof (mp_ext_t) +
			    (un_array[i]->un_numexts * sizeof (mp_ext_t));

			/*
			 * NOTE(review): send_params is a stack object of
			 * fixed size while the memcpy below copies every
			 * extent of the unit -- confirm this cannot overrun
			 * when un_numexts > 1.
			 */
			send_params.iocset_params = set_params;
			(void) memcpy(&send_params.unit, un_array[i],
			    sizeof (*un_array[i]) - sizeof (mp_ext_t) +
			    (un_array[i]->un_numexts * sizeof (mp_ext_t)));
			result = mdmn_send_message(sp->setno,
			    MD_MN_MSG_IOCSET, MD_MSGF_DEFAULT_FLAGS,
			    (char *)&send_params, mess_size, &resp,
			    ep);
			if (resp != NULL) {
				if (resp->mmr_exitval != 0)
					err = 1;
				free_result(resp);
			}
			if (result != 0) {
				err = 1;
			}
		} else {
			if (metaioctl(MD_IOCSET, &set_params, &set_params.mde,
			    np->cname) != 0) {
				err = 1;
			}
		}

		if (err == 1) {
			(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
			    "%s: Error committing record to metadb.\n"),
			    np->cname);
			goto out;
		}

		/* note that we've committed a record */
		if (!committed)
			committed = 1;

		/* update any watermarks that need it */
		if (update_list != NULL) {
			md_sp_t *msp;

			/*
			 * Check to see if we're trying to create a partition
			 * on a mirror. If so we may have to enforce an
			 * ownership change before writing the watermark out.
			 */
			if (metaismeta(compnp)) {
				char *miscname;

				miscname = metagetmiscname(compnp, ep);
				if (miscname != NULL)
					comp_is_mirror = (strcmp(miscname,
					    MD_MIRROR) == 0);
				else
					comp_is_mirror = 0;
			}
			/*
			 * If this is a MN set and the component is a mirror,
			 * change ownership to this node in order to write the
			 * watermarks
			 */
			if (mn_set && comp_is_mirror) {
				mm = (mm_unit_t *)meta_get_unit(sp, compnp, ep);
				if (mm == NULL) {
					err = 1;
					goto out;
				} else {
					err = meta_mn_change_owner(&ownpar,
						sp->setno,
						meta_getminor(compnp->dev),
						sd->sd_mn_mynode->nd_nodeid,
						MD_MN_MM_PREVENT_CHANGE |
						    MD_MN_MM_SPAWN_THREAD);
					if (err != 0)
						goto out;
				}
			}

			if ((msp = meta_get_sp(sp, np, ep)) == NULL) {
				err = 1;
				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
				    "%s: Error updating extent headers.\n"),
				    np->cname);
				goto out;
			}
			if (meta_sp_update_wm(sp, msp, update_list, ep) < 0) {
				err = 1;
				(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
				    "%s: Error updating extent headers "
				    "on disk.\n"), np->cname);
				goto out;
			}
		}
		/*
		 * If we have changed ownership earlier and prevented any
		 * ownership changes, we can now allow ownership changes
		 * again.
		 */
		if (ownpar) {
			(void) meta_mn_change_owner(&ownpar, sp->setno,
			    ownpar->d.mnum,
			    ownpar->d.owner,
			    MD_MN_MM_ALLOW_CHANGE | MD_MN_MM_SPAWN_THREAD);
		}
	}

	/* update status of all soft partitions to OK */
	minors = Zalloc(num_sps * sizeof (minor_t));
	for (i = 0; i < num_sps; i++)
		minors[i] = MD_SID(un_array[i]);

	err = update_sp_status(sp, minors, num_sps, MD_SP_OK, mn_set, ep);
	if (err != 0)
		goto out;

	if (options & MDCMD_PRINT)
		(void) printf(dgettext(TEXT_DOMAIN, "%s: "
		    "Soft Partitions recovered from device.\n"),
		    compnp->cname);
out:
	/* free memory */
	if (extlist != NULL)
		meta_sp_list_free(&extlist);
	if (sp_list != NULL)
		meta_sp_list_free(&sp_list);
	if (update_list != NULL)
		meta_sp_list_free(&update_list);
	if (un_array != NULL) {
		/* slots never filled are still zero from Zalloc() */
		for (i = 0; i < num_sps; i++)
			Free(un_array[i]);
		Free(un_array);
	}
	if (minors != NULL)
		Free(minors);
	if (ownpar != NULL)
		Free(ownpar);
	(void) fflush(stdout);

	if ((keynlp != NULL) && (committed != 1)) {
		/*
		 * if we haven't committed any softparts, either because of an
		 * error or because the user decided not to proceed, delete
		 * namelist key for the component
		 */
		if (mn_set) {
			mdnamelist_t	*p;

			for (p = keynlp; (p != NULL); p = p->next) {
				mdname_t		*np = p->namep;
				md_mn_msg_delkeyname_t	send_params;
				md_mn_result_t		*resp = NULL;

				send_params.delkeyname_dev = np->dev;
				send_params.delkeyname_setno = sp->setno;
				send_params.delkeyname_key = np->key;
				/* best effort: the send result is ignored */
				(void) mdmn_send_message(sp->setno,
				    MD_MN_MSG_DELKEYNAME, MD_MSGF_DEFAULT_FLAGS,
				    (char *)&send_params, sizeof (send_params),
				    &resp, ep);
				if (resp != NULL) {
					free_result(resp);
				}
			}
		} else {
			(void) del_key_names(sp, keynlp, NULL);
		}
	}

	metafreenamelist(keynlp);

	if (err)
		return (mdmderror(ep, MDE_RECOVER_FAILED, 0, compnp->cname));

	if (not_recovered)
		if (options & MDCMD_PRINT)
			(void) printf(dgettext(TEXT_DOMAIN, "%s: "
			    "Soft Partitions NOT recovered from device.\n"),
			    compnp->cname);
	return (0);
}

/*
 * FUNCTION:	meta_sp_recover_from_unit()
 * INPUT:	sp	- name of set we are recovering in
 *		compnp	- name of component we are recovering from
 *		options	- metarecover options
 * OUTPUT:	ep	- return error pointer
 * RETURNS:	int	- 0 - success, -1 - error
 * PURPOSE:	update watermarks to match metadb records.  begin by getting
 *		a namelist representing all soft partitions on the specified
 *		component.  then, build an extlist representing the soft
 *		partitions, filling in the freespace extents.  notify user
 *		of changes, place all soft partitions into the "recovering"
 *		state and update the watermarks.  finally, return all soft
 *		partitions to the "OK" state.
 */
static int
meta_sp_recover_from_unit(
	mdsetname_t	*sp,
	mdname_t	*compnp,
	mdcmdopts_t	options,
	md_error_t	*ep
)
{
	mdnamelist_t	*spnlp = NULL;
	mdnamelist_t	*nlp = NULL;
	sp_ext_node_t	*ext = NULL;
	sp_ext_node_t	*extlist = NULL;
	int		count;
	char		yesno[255];
	char		*yes;
	int		rval = 0;
	minor_t		*minors = NULL;
	int		i;
	md_sp_t		*msp;
	md_set_desc	*sd;
	bool_t		mn_set = 0;
	daddr_t		start_block;

	/* zero soft partitions on the component is treated as an error too */
	count = meta_sp_get_by_component(sp, compnp, &spnlp, 0, ep);
	if (count <= 0)
		return (-1);

	/* set flag if dealing with a MN set */
	if (!metaislocalset(sp)) {
		/* NOTE(review): spnlp is not freed on this error return */
		if ((sd = metaget_setdesc(sp, ep)) == NULL) {
			return (-1);
		}
		if (MD_MNSET_DESC(sd))
			mn_set = 1;
	}
	/*
	 * Save the XDR unit structure for one of the soft partitions;
	 * we'll use this later to provide metadevice context to
	 * update the watermarks so the device can be resolved by
	 * devid instead of dev_t.
	 */
	if ((msp = meta_get_sp(sp, spnlp->namep, ep)) == NULL) {
		metafreenamelist(spnlp);
		return (-1);
	}

	/* NOTE(review): spnlp is not freed on this error return */
	if ((start_block = meta_sp_get_start(sp, compnp, ep)) ==
	    MD_DISKADDR_ERROR) {
		return (-1);
	}

	/* reserve the space before start_block and the end watermark */
	meta_sp_list_insert(NULL, NULL, &extlist, 0ULL, start_block,
	    EXTTYP_RESERVED, 0, 0, meta_sp_cmp_by_offset);
	meta_sp_list_insert(NULL, NULL, &extlist,
	    metagetsize(compnp, ep) - MD_SP_WMSIZE, MD_SP_WMSIZE,
	    EXTTYP_END, 0, EXTFLG_UPDATE, meta_sp_cmp_by_offset);

	/* NOTE(review): extlist is not freed on this error return */
	if (meta_sp_extlist_from_namelist(sp, spnlp, &extlist, ep) == -1) {
		metafreenamelist(spnlp);
		return (-1);
	}

	assert(extlist != NULL);
	if ((options & MDCMD_VERBOSE) != 0) {
		(void) printf(dgettext(TEXT_DOMAIN,
		    "Updating extent headers on device %s from metadb.\n\n"),
		    compnp->cname);
		(void) printf(dgettext(TEXT_DOMAIN,
		    "The following extent headers will be written:\n"));
		meta_sp_display_exthdr();
	}

	meta_sp_list_freefill(&extlist, metagetsize(compnp, ep));

	for (ext = extlist; ext != NULL; ext = ext->ext_next) {

		/* mark every node for updating except the reserved space */
		if (ext->ext_type != EXTTYP_RESERVED) {
			ext->ext_flags |= EXTFLG_UPDATE;

			/* print extent information */
			if ((options & MDCMD_VERBOSE) != 0)
				meta_sp_display_ext(ext);
		}
	}

	/* request verification and then update all watermarks */
	if ((options & MDCMD_DOIT) != 0) {

		(void) printf(dgettext(TEXT_DOMAIN,
		    "\nWARNING: You are about to overwrite portions of %s\n"
		    "with soft partition metadata. The extent headers will be\n"
		    "written to match the existing metadb configuration.  If\n"
		    "the device was not previously setup with this\n"
		    "configuration, data loss may result.\n\n"),
		    compnp->cname);
		(void) printf(dgettext(TEXT_DOMAIN,
		    "Are you sure you want to do this (yes/no)? "));

		(void) fflush(stdout);
		/* EOF or an empty line (just the newline) is treated as "no" */
		if ((fgets(yesno, sizeof (yesno), stdin) == NULL) ||
		    (strlen(yesno) == 1))
			(void) snprintf(yesno, sizeof (yesno),
			    "%s\n", dgettext(TEXT_DOMAIN, "no"));
		yes = dgettext(TEXT_DOMAIN, "yes");
		if (strncasecmp(yesno, yes, strlen(yesno) - 1) == 0) {
			/* place soft partitions into recovering state */
			minors = Zalloc(count * sizeof (minor_t));
			for (nlp = spnlp, i = 0;
			    nlp != NULL && i < count;
			    nlp = nlp->next, i++) {
				assert(nlp->namep != NULL);
				minors[i] = meta_getminor(nlp->namep->dev);
			}
			if (update_sp_status(sp, minors, count,
			    MD_SP_RECOVER, mn_set, ep) != 0) {
				rval = -1;
				goto out;
			}

			/* update the watermarks */
			if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) {
				rval = -1;
				goto out;
			}

			if (options & MDCMD_PRINT) {
				(void) printf(dgettext(TEXT_DOMAIN, "%s: "
				    "Soft Partitions recovered from metadb\n"),
				    compnp->cname);
			}

			/* return soft partitions to the OK state */
			if (update_sp_status(sp, minors, count,
			    MD_SP_OK, mn_set, ep) != 0) {
				rval = -1;
				goto out;
			}

			rval = 0;
			goto out;
		}
	}

	/* reached on a dry run or when the user did not answer "yes" */
	if (options & MDCMD_PRINT) {
		(void) printf(dgettext(TEXT_DOMAIN,
		    "%s: Soft Partitions NOT recovered from metadb\n"),
		    compnp->cname);
	}

out:
	if (minors != NULL)
		Free(minors);
	metafreenamelist(spnlp);
	meta_sp_list_free(&extlist);
	(void) fflush(stdout);
	return (rval);
}


/*
 * FUNCTION:	meta_sp_update_abr()
 * INPUT:	sp	- name of set we are recovering in
 * OUTPUT:	ep	- return error pointer
 * RETURNS:	int	- 0 - success, -1 - error
 * PURPOSE:	update the ABR state for all soft partitions in the set. This
 *		is called when joining a set. It sends a message to the master
 *		node for each soft partition to get the value of tstate and
 *		then sets ABR, if required, by opening the sp, setting ABR
 *		and then closing the sp.
This approach is taken rather that + * just issuing the MD_MN_SET_CAP ioctl, in order to deal with + * the case when we have another node simultaneously unsetting ABR. + */ +int +meta_sp_update_abr( + mdsetname_t *sp, + md_error_t *ep +) +{ + mdnamelist_t *devnlp = NULL; + mdnamelist_t *p; + mdname_t *devnp = NULL; + md_unit_t *un; + char fname[MAXPATHLEN]; + int mnum, fd; + volcap_t vc; + uint_t tstate; + + + if (meta_get_sp_names(sp, &devnlp, 0, ep) < 0) { + return (-1); + } + + /* Exit if no soft partitions in this set */ + if (devnlp == NULL) + return (0); + + /* For each soft partition */ + for (p = devnlp; (p != NULL); p = p->next) { + devnp = p->namep; + + /* check if this is a top level metadevice */ + if ((un = meta_get_mdunit(sp, devnp, ep)) == NULL) + goto out; + if (MD_HAS_PARENT(MD_PARENT(un))) { + Free(un); + continue; + } + Free(un); + + /* Get tstate from Master */ + if (meta_mn_send_get_tstate(devnp->dev, &tstate, ep) != 0) { + mdname_t *np; + np = metamnumname(&sp, meta_getminor(devnp->dev), 0, + ep); + if (np) { + md_perror(dgettext(TEXT_DOMAIN, + "Unable to get tstate for %s"), np->cname); + } + continue; + } + /* If not set on the master, nothing to do */ + if (!(tstate & MD_ABR_CAP)) + continue; + + mnum = meta_getminor(devnp->dev); + (void) snprintf(fname, MAXPATHLEN, "/dev/md/%s/rdsk/d%u", + sp->setname, (unsigned)MD_MIN2UNIT(mnum)); + if ((fd = open(fname, O_RDWR, 0)) < 0) { + md_perror(dgettext(TEXT_DOMAIN, + "Could not open device %s"), fname); + continue; + } + + /* Set ABR state */ + vc.vc_info = 0; + vc.vc_set = 0; + if (ioctl(fd, DKIOCGETVOLCAP, &vc) < 0) { + (void) close(fd); + continue; + } + + vc.vc_set = DKV_ABR_CAP; + if (ioctl(fd, DKIOCSETVOLCAP, &vc) < 0) { + (void) close(fd); + goto out; + } + + (void) close(fd); + } + metafreenamelist(devnlp); + return (0); +out: + metafreenamelist(devnlp); + return (-1); +} + +/* + * FUNCTION: meta_mn_sp_update_abr() + * INPUT: arg - Given set. 
+ * PURPOSE: update the ABR state for all soft partitions in the set by + * forking a process to call meta_sp_update_abr() + * This function is only called via rpc.metad when adding a node + * to a set, ie this node is beong joined to the set by another + * node. + */ +void * +meta_mn_sp_update_abr(void *arg) +{ + set_t setno = *((set_t *)arg); + mdsetname_t *sp; + md_error_t mde = mdnullerror; + int fval; + + /* should have a set */ + assert(setno != NULL); + + if ((sp = metasetnosetname(setno, &mde)) == NULL) { + mde_perror(&mde, ""); + return (NULL); + } + + if (!(meta_is_mn_set(sp, &mde))) { + mde_perror(&mde, ""); + return (NULL); + } + + /* fork a process */ + if ((fval = md_daemonize(sp, &mde)) != 0) { + /* + * md_daemonize will fork off a process. The is the + * parent or error. + */ + if (fval > 0) { + return (NULL); + } + mde_perror(&mde, ""); + return (NULL); + } + /* + * Child process should never return back to rpc.metad, but + * should exit. + * Flush all internally cached data inherited from parent process + * since cached data will be cleared when parent process RPC request + * has completed (which is possibly before this child process + * can complete). + * Child process can retrieve and cache its own copy of data from + * rpc.metad that won't be changed by the parent process. + * + * Reset md_in_daemon since this child will be a client of rpc.metad + * not part of the rpc.metad daemon itself. + * md_in_daemon is used by rpc.metad so that libmeta can tell if + * this thread is rpc.metad or any other thread. (If this thread + * was rpc.metad it could use some short circuit code to get data + * directly from rpc.metad instead of doing an RPC call to rpc.metad). + */ + md_in_daemon = 0; + metaflushsetname(sp); + sr_cache_flush_setno(setno); + if ((sp = metasetnosetname(setno, &mde)) == NULL) { + mde_perror(&mde, ""); + md_exit(sp, 1); + } + + + /* + * Closing stdin/out/err here. 
+ */ + (void) close(0); + (void) close(1); + (void) close(2); + assert(fval == 0); + + (void) meta_sp_update_abr(sp, &mde); + + md_exit(sp, 0); + /*NOTREACHED*/ +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_stat.c b/usr/src/lib/lvm/libmeta/common/meta_stat.c new file mode 100644 index 0000000000..90844f9148 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_stat.c @@ -0,0 +1,103 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1992, 1993, 1994, 2000 by Sun Microsystems, Inc. + * All rights reserved. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Caching stat function + */ + +#include <meta.h> + +#define MD_NUM_STAT_HEAD 16 + +struct statcache { + struct statcache *sc_next; + struct stat sc_stat; + char *sc_filename; +}; + +static struct statcache *statcache_head[MD_NUM_STAT_HEAD] = + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + +int +meta_stat(const char *filename, struct stat *sbp) +{ + struct statcache *scp; + int hash; + char *cp; + + hash = 0; + for (cp = (char *)filename; *cp != 0; cp++) + hash += *cp; + + hash &= 0xf; + + for (scp = statcache_head[hash]; scp != NULL; scp = scp->sc_next) + if (strcmp(filename, scp->sc_filename) == 0) + break; + if (scp) { + (void) memcpy((caddr_t)sbp, (caddr_t)&scp->sc_stat, + sizeof (*sbp)); + return (0); + } + if (stat(filename, sbp) != 0) + return (-1); + + if (!S_ISBLK(sbp->st_mode) && !S_ISCHR(sbp->st_mode)) + return (-1); + + scp = (struct statcache *)malloc(sizeof (*scp)); + if (scp != NULL) { + (void) memcpy((caddr_t)&scp->sc_stat, (caddr_t)sbp, + sizeof (*sbp)); + scp->sc_filename = strdup(filename); + if (scp->sc_filename == NULL) { + free((char *)scp); + return (0); + } + scp->sc_next = statcache_head[hash]; + statcache_head[hash] = scp; + } + return (0); +} + +void +metaflushstatcache(void) +{ + struct statcache *p, *n; + int i; + + for (i = 0; i < MD_NUM_STAT_HEAD; i++) { + for (p = statcache_head[i], n = NULL; p != NULL; p = n) { + n = p->sc_next; + Free(p->sc_filename); + Free(p); + } + statcache_head[i] = NULL; + } +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_stripe.c b/usr/src/lib/lvm/libmeta/common/meta_stripe.c new file mode 100644 index 0000000000..237afcd60b --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_stripe.c @@ -0,0 +1,2496 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). 
You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. + */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +/* + * stripe operations + */ + +#include <limits.h> +#include <stdlib.h> +#include <meta.h> +#include <sys/lvm/md_stripe.h> +#include <sys/lvm/md_convert.h> + +#define QUOTE(x) #x +#define VAL2STR(x) QUOTE(x) + +/* + * replace stripe/concat + */ +int +meta_stripe_replace( + mdsetname_t *sp, + mdname_t *stripenp, + mdname_t *oldnp, + mdname_t *newnp, + mdcmdopts_t options, + md_error_t *ep +) +{ + replace_params_t params; + md_dev64_t old_dev, + new_dev; + diskaddr_t new_start_blk, + new_end_blk, + label, + size, + start_blk; + + /* should have same set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(stripenp->dev))); + + new_dev = newnp->dev; + new_start_blk = newnp->start_blk; + new_end_blk = newnp->end_blk; + + meta_invalidate_name(stripenp); + + /* the old device binding is now established */ + if ((old_dev = oldnp->dev) == NODEV64) + return (mdsyserror(ep, ENODEV, oldnp->cname)); + + if (((strcmp(oldnp->rname, newnp->rname) == 0) && + 
(old_dev != new_dev))) { + newnp->dev = new_dev; + newnp->start_blk = new_start_blk; + newnp->end_blk = new_end_blk; + } + + if ((size = metagetsize(newnp, ep)) == MD_DISKADDR_ERROR) + return (-1); + if ((label = metagetlabel(newnp, ep)) == MD_DISKADDR_ERROR) + return (-1); + if ((start_blk = metagetstart(sp, newnp, ep)) == MD_DISKADDR_ERROR) + return (-1); + if (start_blk >= size) { + (void) mdsyserror(ep, ENOSPC, newnp->cname); + return (-1); + } + + /* In dryrun mode (DOIT not set) we must not alter the mddb */ + if (options & MDCMD_DOIT) { + if (add_key_name(sp, newnp, NULL, ep) != 0) + return (-1); + } + + /* + * There is no need to call meta_fixdevid() here as this function is + * only called by the metareplace -c command which actually does + * nothing (in terms of a resync) and thus does nothing with the devid. + */ + + (void) memset(¶ms, 0, sizeof (params)); + params.mnum = meta_getminor(stripenp->dev); + MD_SETDRIVERNAME(¶ms, MD_STRIPE, sp->setno); + + params.cmd = REPLACE_COMP; + params.old_dev = old_dev; + params.new_dev = new_dev; + params.new_key = newnp->key; + params.start_blk = newnp->start_blk; + params.number_blks = size; + /* Is this just a dryrun ? 
*/ + if ((options & MDCMD_DOIT) == 0) { + params.options |= MDIOCTL_DRYRUN; + } + if (label == 0) + params.has_label = 0; + else + params.has_label = 1; + if (metaioctl(MD_IOCREPLACE, ¶ms, ¶ms.mde, NULL) != 0) { + if (options & MDCMD_DOIT) + (void) del_key_name(sp, newnp, ep); + return (mdstealerror(ep, ¶ms.mde)); + } + meta_invalidate_name(oldnp); + meta_invalidate_name(newnp); + meta_invalidate_name(stripenp); + + if (options & MDCMD_PRINT) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: device %s is replaced with %s\n"), + stripenp->cname, oldnp->cname, newnp->cname); + + } + return (0); +} + + +/* + * FUNCTION: meta_get_stripe_names() + * INPUT: sp - the set name to get stripes from + * options - options from the command line + * OUTPUT: nlpp - list of all stripe names + * ep - return error pointer + * RETURNS: int - -1 if error, 0 success + * PURPOSE: returns a list of all stripes in the metadb + * for all devices in the specified set + */ +int +meta_get_stripe_names( + mdsetname_t *sp, + mdnamelist_t **nlpp, + int options, + md_error_t *ep +) +{ + return (meta_get_names(MD_STRIPE, sp, nlpp, options, ep)); +} + +/* + * free stripe + */ +void +meta_free_stripe( + md_stripe_t *stripep +) +{ + uint_t row; + + for (row = 0; (row < stripep->rows.rows_len); ++row) { + md_row_t *rp = &stripep->rows.rows_val[row]; + + if (rp->comps.comps_val != NULL) { + assert(rp->comps.comps_len > 0); + Free(rp->comps.comps_val); + } + } + if (stripep->rows.rows_val != NULL) { + assert(stripep->rows.rows_len > 0); + Free(stripep->rows.rows_val); + } + Free(stripep); +} + + +/* + * get stripe (common) + */ +md_stripe_t * +meta_get_stripe_common( + mdsetname_t *sp, + mdname_t *stripenp, + int fast, + md_error_t *ep +) +{ + mddrivename_t *dnp = stripenp->drivenamep; + char *miscname; + ms_unit_t *ms; + md_stripe_t *stripep; + uint_t row; + + /* must have set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(stripenp->dev))); + + /* short circuit */ + if (dnp->unitp 
!= NULL) { + assert(dnp->unitp->type == MD_DEVICE); + return ((md_stripe_t *)dnp->unitp); + } + + /* get miscname and unit */ + if ((miscname = metagetmiscname(stripenp, ep)) == NULL) + return (NULL); + if (strcmp(miscname, MD_STRIPE) != 0) { + (void) mdmderror(ep, MDE_NOT_STRIPE, + meta_getminor(stripenp->dev), stripenp->cname); + return (NULL); + } + if ((ms = (ms_unit_t *)meta_get_mdunit(sp, stripenp, ep)) == NULL) + return (NULL); + assert(ms->c.un_type == MD_DEVICE); + + /* allocate stripe */ + stripep = Zalloc(sizeof (*stripep)); + + /* allocate rows */ + assert(ms->un_nrows > 0); + stripep->rows.rows_len = ms->un_nrows; + stripep->rows.rows_val = Zalloc(stripep->rows.rows_len * + sizeof (*stripep->rows.rows_val)); + + /* get common info */ + stripep->common.namep = stripenp; + stripep->common.type = ms->c.un_type; + stripep->common.state = ms->c.un_status; + stripep->common.capabilities = ms->c.un_capabilities; + stripep->common.parent = ms->c.un_parent; + stripep->common.size = ms->c.un_total_blocks; + stripep->common.user_flags = ms->c.un_user_flags; + stripep->common.revision = ms->c.un_revision; + + /* get options */ + if ((ms->un_hsp_id != MD_HSP_NONE) && + ((stripep->hspnamep = metahsphspname(&sp, ms->un_hsp_id, + ep)) == NULL)) { + goto out; + } + + /* get rows */ + for (row = 0; (row < ms->un_nrows); ++row) { + struct ms_row *mdr = &ms->un_row[row]; + struct ms_comp *mdcomp = (void *)&((char *)ms)[ms->un_ocomp]; + md_row_t *rp = &stripep->rows.rows_val[row]; + uint_t comp, c; + + /* get interlace */ + rp->interlace = mdr->un_interlace; + + /* allocate comps */ + assert(mdr->un_ncomp > 0); + rp->comps.comps_len = mdr->un_ncomp; + rp->comps.comps_val = Zalloc(rp->comps.comps_len * + sizeof (*rp->comps.comps_val)); + + /* get components */ + for (comp = 0, c = mdr->un_icomp; (comp < mdr->un_ncomp); + ++comp, ++c) { + struct ms_comp *mdc = &mdcomp[c]; + diskaddr_t comp_start_blk = mdc->un_start_block; + md_comp_t *cp = &rp->comps.comps_val[comp]; + + /* 
get the component name */ + cp->compnamep = metakeyname(&sp, mdc->un_key, fast, ep); + if (cp->compnamep == NULL) + goto out; + + /* if hotspared */ + if (mdc->un_mirror.ms_hs_id != 0) { + diskaddr_t hs_start_blk = mdc->un_start_block; + + /* get the hotspare name */ + cp->hsnamep = metakeyname(&sp, + mdc->un_mirror.ms_hs_key, fast, ep); + if (cp->hsnamep == NULL) + goto out; + + if (getenv("META_DEBUG_START_BLK") != NULL) { + if (metagetstart(sp, cp->hsnamep, + ep) == MD_DISKADDR_ERROR) + mdclrerror(ep); + + if ((cp->hsnamep->start_blk == 0) && + (hs_start_blk != 0)) + md_eprintf(dgettext(TEXT_DOMAIN, + "%s: suspected bad start block," + " seems labelled [stripe/hs]\n"), + cp->hsnamep->cname); + + if ((cp->hsnamep->start_blk > 0) && + (hs_start_blk == 0) && + ! ((row == 0) && (comp == 0))) + md_eprintf(dgettext(TEXT_DOMAIN, + "%s: suspected bad start block, " + "seems unlabelled [stripe/hs]\n"), + cp->hsnamep->cname); + } + /* override any start_blk */ + cp->hsnamep->start_blk = hs_start_blk; + + /* get the right component start_blk */ + comp_start_blk = mdc->un_mirror.ms_orig_blk; + } else { + if (getenv("META_DEBUG_START_BLK") != NULL) { + if (metagetstart(sp, cp->compnamep, + ep) == MD_DISKADDR_ERROR) + mdclrerror(ep); + + if ((cp->compnamep->start_blk == 0) && + (comp_start_blk != 0)) + md_eprintf(dgettext(TEXT_DOMAIN, + "%s: suspected bad start block," + " seems labelled [stripe]"), + cp->compnamep->cname); + + if ((cp->compnamep->start_blk > 0) && + (comp_start_blk == 0) && + ! 
((row == 0) && (comp == 0))) + md_eprintf(dgettext(TEXT_DOMAIN, + "%s: suspected bad start block, " + "seems unlabelled [stripe]"), + cp->compnamep->cname); + } + } + + /* override any start_blk */ + cp->compnamep->start_blk = comp_start_blk; + + /* get state */ + cp->state = mdc->un_mirror.ms_state; + + /* get time of last state change */ + cp->timestamp = mdc->un_mirror.ms_timestamp; + + /* get lasterr count */ + cp->lasterrcnt = mdc->un_mirror.ms_lasterrcnt; + } + } + + /* cleanup, return success */ + Free(ms); + dnp->unitp = (md_common_t *)stripep; + return (stripep); + + /* cleanup, return error */ +out: + Free(ms); + meta_free_stripe(stripep); + return (NULL); +} + +/* + * get stripe + */ +md_stripe_t * +meta_get_stripe( + mdsetname_t *sp, + mdname_t *stripenp, + md_error_t *ep +) +{ + return (meta_get_stripe_common(sp, stripenp, 0, ep)); +} + +/* + * check stripe for dev + */ +static int +in_stripe( + mdsetname_t *sp, + mdname_t *stripenp, + mdname_t *np, + diskaddr_t slblk, + diskaddr_t nblks, + md_error_t *ep +) +{ + md_stripe_t *stripep; + uint_t row; + + /* should be in the same set */ + assert(sp != NULL); + + /* get unit */ + if ((stripep = meta_get_stripe(sp, stripenp, ep)) == NULL) + return (-1); + + /* look in rows */ + for (row = 0; (row < stripep->rows.rows_len); ++row) { + md_row_t *rp = &stripep->rows.rows_val[row]; + uint_t comp; + + /* look in columns */ + for (comp = 0; (comp < rp->comps.comps_len); ++comp) { + md_comp_t *cp = &rp->comps.comps_val[comp]; + mdname_t *compnp = cp->compnamep; + diskaddr_t comp_sblk; + int err; + + /* check same drive since metagetstart() can fail */ + if ((err = meta_check_samedrive(np, compnp, ep)) < 0) + return (-1); + else if (err == 0) + continue; + + /* check overlap */ + if ((comp_sblk = metagetstart(sp, compnp, ep)) == + MD_DISKADDR_ERROR) + return (-1); + if (meta_check_overlap(stripenp->cname, np, + slblk, nblks, compnp, comp_sblk, -1, + ep) != 0) { + return (-1); + } + } + } + + /* return success */ + 
return (0); +} + +/* + * check to see if we're in a stripe + */ +int +meta_check_instripe( + mdsetname_t *sp, + mdname_t *np, + diskaddr_t slblk, + diskaddr_t nblks, + md_error_t *ep +) +{ + mdnamelist_t *stripenlp = NULL; + mdnamelist_t *p; + int rval = 0; + + /* should have a set */ + assert(sp != NULL); + + /* for each stripe */ + if (meta_get_stripe_names(sp, &stripenlp, 0, ep) < 0) + return (-1); + for (p = stripenlp; (p != NULL); p = p->next) { + mdname_t *stripenp = p->namep; + + /* check stripe */ + if (in_stripe(sp, stripenp, np, slblk, nblks, ep) != 0) { + rval = -1; + break; + } + } + + /* cleanup, return success */ + metafreenamelist(stripenlp); + return (rval); +} + +/* + * check component + */ +int +meta_check_component( + mdsetname_t *sp, + mdname_t *np, + int force, + md_error_t *ep +) +{ + mdchkopts_t options = (MDCHK_ALLOW_MDDB); + md_common_t *mdp; + + /* + * See if we are a soft partition: meta_sp_issp() returns 0 if + * np points to a soft partition, so the if and else clauses + * here represent "not a soft partition" and "soft partition," + * respectively. + */ + if (meta_sp_issp(sp, np, ep) != 0) { + /* make sure we have a disk */ + if (metachkcomp(np, ep) != 0) + return (-1); + } else { + /* make sure soft partition can parent & doesn't have parent */ + if ((mdp = meta_get_unit(sp, np, ep)) == NULL) + return (mdmderror(ep, MDE_INVAL_UNIT, NULL, + np->cname)); + if (mdp->capabilities == MD_CANT_PARENT) + return (mdmderror(ep, MDE_INVAL_UNIT, NULL, + np->cname)); + if (MD_HAS_PARENT(mdp->parent)) { + mdname_t *pnp; + + pnp = metamnumname(&sp, mdp->parent, 0, ep); + if (pnp == NULL) { + return (-1); + } + + return (mduseerror(ep, MDE_ALREADY, np->dev, + pnp->cname, np->cname)); + } + } + + /* check to ensure that it is not already in use */ + if ((! 
force) && + (meta_check_inuse(sp, np, MDCHK_INUSE, ep) != 0)) { + return (-1); + } + + /* make sure it is in the set */ + if (meta_check_inset(sp, np, ep) != 0) + return (-1); + + /* make sure its not in a metadevice */ + if (meta_check_inmeta(sp, np, options, 0, -1, ep) != 0) + return (-1); + + /* return success */ + return (0); +} + +/* + * print stripe + */ +static int +stripe_print( + md_stripe_t *stripep, + char *fname, + FILE *fp, + mdprtopts_t options, + md_error_t *ep +) +{ + uint_t row; + int rval = -1; + + if (options & PRINT_LARGEDEVICES) { + if (stripep->common.revision != MD_64BIT_META_DEV) { + rval = 0; + goto out; + } + } + + /* print name and num rows */ + if (fprintf(fp, "%s %u", + stripep->common.namep->cname, stripep->rows.rows_len) == EOF) + goto out; + + /* print rows */ + for (row = 0; (row < stripep->rows.rows_len); ++row) { + md_row_t *rp = &stripep->rows.rows_val[row]; + uint_t comp; + + /* print num components */ + if (fprintf(fp, " %u", rp->comps.comps_len) == EOF) + goto out; + + /* print components */ + for (comp = 0; (comp < rp->comps.comps_len); ++comp) { + md_comp_t *cp = &rp->comps.comps_val[comp]; + + /* print component */ + /* + * If the path is our standard /dev/rdsk or /dev/md/rdsk + * then just print out the cxtxdxsx or the dx, metainit + * will assume the default, otherwise we need the full + * pathname to make sure this works as we intend. 
+ */ + if ((strstr(cp->compnamep->rname, "/dev/rdsk") == + NULL) && (strstr(cp->compnamep->rname, + "/dev/md/rdsk") == NULL) && + (strstr(cp->compnamep->rname, "/dev/td/") == + NULL)) { + /* not standard path, print full pathname */ + if (fprintf(fp, " %s", cp->compnamep->rname) + == EOF) + goto out; + } else { + /* standard path */ + if (fprintf(fp, " %s", cp->compnamep->cname) + == EOF) + goto out; + } + } + + /* print interlace */ + if (rp->comps.comps_len > 1) + if (fprintf(fp, " -i %lldb", rp->interlace) == EOF) + goto out; + + /* print continuation */ + if (row != (stripep->rows.rows_len - 1)) + if (fprintf(fp, " \\\n\t") == EOF) + goto out; + } + + /* print hotspare name */ + if (stripep->hspnamep != NULL) + if (fprintf(fp, " -h %s", stripep->hspnamep->hspname) == EOF) + goto out; + + /* terminate last line */ + if (fprintf(fp, "\n") == EOF) + goto out; + + /* success */ + rval = 0; + + /* cleanup, return error */ +out: + if (rval != 0) + (void) mdsyserror(ep, errno, fname); + return (rval); +} + +/* + * convert component state to name + */ +char * +comp_state_to_name( + md_comp_t *mdcp, + md_timeval32_t *tvp, + uint_t tstate /* Errored tstate flags */ +) +{ + comp_state_t state = mdcp->state; + + /* grab time */ + if (tvp != NULL) + *tvp = mdcp->timestamp; + + if (tstate != 0) { + return (dgettext(TEXT_DOMAIN, "Unavailable")); + } + + /* return state */ + switch (state) { + case CS_OKAY: + return (dgettext(TEXT_DOMAIN, "Okay")); + case CS_ERRED: + return (dgettext(TEXT_DOMAIN, "Maintenance")); + case CS_LAST_ERRED: + return (dgettext(TEXT_DOMAIN, "Last Erred")); + case CS_RESYNC: + return (dgettext(TEXT_DOMAIN, "Resyncing")); + default: + return (dgettext(TEXT_DOMAIN, "invalid")); + } +} + +/* + * print subdevice stripe row + */ +static int +subdev_row_report( + mdsetname_t *sp, + md_row_t *rp, + char *fname, + FILE *fp, + mdprtopts_t options, + uint_t top_tstate, /* Errored tstate flags */ + md_error_t *ep +) +{ + uint_t comp; + int rval = -1; + 
ddi_devid_t dtp; + int len = 0; + + + /* + * building a format string on the fly that will be used + * in fprintf. This is to allow really really long ctd names + */ + for (comp = 0; (comp < rp->comps.comps_len); ++comp) { + md_comp_t *cp = &rp->comps.comps_val[comp]; + char *cname = cp->compnamep->cname; + + len = max(len, strlen(cname)); + } + + len = max(len, strlen(dgettext(TEXT_DOMAIN, "Device"))); + len += 2; + /* print header */ + if (! (options & PRINT_TIMES)) { + if (fprintf(fp, + "\t%-*.*s %-12.12s %5.5s %12.12s %5.5s %s\n", + len, len, + dgettext(TEXT_DOMAIN, "Device"), + dgettext(TEXT_DOMAIN, "Start Block"), + dgettext(TEXT_DOMAIN, "Dbase"), + dgettext(TEXT_DOMAIN, "State"), + dgettext(TEXT_DOMAIN, "Reloc"), + dgettext(TEXT_DOMAIN, "Hot Spare")) == EOF) { + goto out; + } + } else { + if (fprintf(fp, + "\t%-*s %5s %5s %-11s %-5s %-9s %s\n", + len, + dgettext(TEXT_DOMAIN, "Device"), + dgettext(TEXT_DOMAIN, "Start"), + dgettext(TEXT_DOMAIN, "Dbase"), + dgettext(TEXT_DOMAIN, "State"), + dgettext(TEXT_DOMAIN, "Reloc"), + dgettext(TEXT_DOMAIN, "Hot Spare"), + dgettext(TEXT_DOMAIN, "Time")) == EOF) { + goto out; + } + } + + + /* print components */ + for (comp = 0; (comp < rp->comps.comps_len); ++comp) { + md_comp_t *cp = &rp->comps.comps_val[comp]; + mdname_t *namep = cp->compnamep; + char *cname = namep->cname; + diskaddr_t start_blk; + int has_mddb; + char *has_mddb_str; + char *comp_state; + md_timeval32_t tv; + char *hsname = ((cp->hsnamep != NULL) ? 
+ cp->hsnamep->cname : ""); + char *devid = " "; + mdname_t *didnp = NULL; + uint_t tstate = 0; + + /* get info */ + if ((start_blk = metagetstart(sp, namep, ep)) == + MD_DISKADDR_ERROR) { + return (-1); + } + if ((has_mddb = metahasmddb(sp, namep, ep)) < 0) { + return (-1); + } + if (has_mddb) + has_mddb_str = dgettext(TEXT_DOMAIN, "Yes"); + else + has_mddb_str = dgettext(TEXT_DOMAIN, "No"); + + /* + * If the component is a metadevice, print out either + * unavailable or the state of the metadevice, if not + * a metadevice, print nothing if the state of the + * stripe is unavailable + */ + if (metaismeta(namep)) { + if (meta_get_tstate(namep->dev, &tstate, ep) != 0) + return (-1); + comp_state = comp_state_to_name(cp, &tv, tstate & + MD_DEV_ERRORED); + } else { + /* + * if top_tstate is set, that implies that you have + * a ctd type device with an unavailable metadevice + * on top of it. If so, print a - for it's state + */ + if (top_tstate != 0) + comp_state = "-"; + else + comp_state = comp_state_to_name(cp, &tv, + tstate & MD_DEV_ERRORED); + } + + /* populate the key in the name_p structure */ + if ((didnp = metadevname(&sp, namep->dev, ep)) + == NULL) { + return (-1); + } + + /* determine if devid does NOT exist */ + if (options & PRINT_DEVID) { + if ((dtp = meta_getdidbykey(sp->setno, getmyside(sp, ep), + didnp->key, ep)) == NULL) + devid = dgettext(TEXT_DOMAIN, "No "); + else { + devid = dgettext(TEXT_DOMAIN, "Yes"); + free(dtp); + } + } + /* print info */ + /* + * building a format string on the fly that will be used + * in fprintf. This is to allow really really long ctd names + */ + if (! 
(options & PRINT_TIMES)) { + if (fprintf(fp, + "\t%-*s %8lld %-5.5s %12.12s %5.5s %s\n", + len, cname, start_blk, + has_mddb_str, comp_state, devid, hsname) == EOF) { + goto out; + } + } else { + char *timep = meta_print_time(&tv); + + if (fprintf(fp, + "\t%-*s %5lld %-5s %-11s %-5s %-9s %s\n", + len, cname, start_blk, + has_mddb_str, comp_state, devid, hsname, + timep) == EOF) { + goto out; + } + } + } + + /* success */ + rval = 0; + + /* cleanup, return error */ +out: + if (rval != 0) + (void) mdsyserror(ep, errno, fname); + return (rval); +} + +/* + * print toplevel stripe row + */ +/*ARGSUSED4*/ +static int +toplev_row_report( + mdsetname_t *sp, + md_row_t *rp, + char *fname, + FILE *fp, + mdprtopts_t options, + md_error_t *ep +) +{ + uint_t comp; + int rval = -1; + char *devid = " "; + mdname_t *didnp = NULL; + int len = 0; + + /* + * building a format string on the fly that will be used + * in fprintf. This is to allow really really long ctd names + */ + for (comp = 0; (comp < rp->comps.comps_len); ++comp) { + len = max(len, + strlen(rp->comps.comps_val[comp].compnamep->cname)); + } + + len = max(len, strlen(dgettext(TEXT_DOMAIN, "Device"))); + len += 2; + /* print header */ + if (fprintf(fp, + "\t%-*.*s %-12.12s %-5.5s\t%s\n", + len, len, + dgettext(TEXT_DOMAIN, "Device"), + dgettext(TEXT_DOMAIN, "Start Block"), + dgettext(TEXT_DOMAIN, "Dbase"), + dgettext(TEXT_DOMAIN, "Reloc")) == EOF) { + goto out; + } + + /* print components */ + for (comp = 0; (comp < rp->comps.comps_len); ++comp) { + md_comp_t *cp = &rp->comps.comps_val[comp]; + mdname_t *namep = cp->compnamep; + char *cname = namep->cname; + diskaddr_t start_blk; + int has_mddb; + char *has_mddb_str; + ddi_devid_t dtp; + + /* get info */ + if ((start_blk = metagetstart(sp, namep, ep)) == + MD_DISKADDR_ERROR) { + return (-1); + } + if ((has_mddb = metahasmddb(sp, namep, ep)) < 0) { + return (-1); + } + if (has_mddb) + has_mddb_str = dgettext(TEXT_DOMAIN, "Yes"); + else + has_mddb_str = 
dgettext(TEXT_DOMAIN, "No"); + + /* populate the key in the name_p structure */ + if ((didnp = metadevname(&sp, namep->dev, ep)) + == NULL) { + return (-1); + } + + /* determine if devid does NOT exist */ + if (options & PRINT_DEVID) { + if ((dtp = meta_getdidbykey(sp->setno, getmyside(sp, ep), + didnp->key, ep)) == NULL) { + devid = dgettext(TEXT_DOMAIN, "No "); + } else { + devid = dgettext(TEXT_DOMAIN, "Yes"); + free(dtp); + } + } + /* print info */ + /* + * building a format string on the fly that will be used + * in fprintf. This is to allow really really long ctd names + */ + if (fprintf(fp, + "\t%-*s %8lld %-5.5s\t%s\n", len, + cname, start_blk, has_mddb_str, devid) == EOF) { + goto out; + } + } + + /* success */ + rval = 0; + + /* cleanup, return error */ +out: + if (rval != 0) + (void) mdsyserror(ep, errno, fname); + return (rval); +} + +/* + * print stripe options + */ +int +meta_print_stripe_options( + mdhspname_t *hspnamep, + char *fname, + FILE *fp, + md_error_t *ep +) +{ + char *hspname = ((hspnamep != NULL) ? hspnamep->hspname : + dgettext(TEXT_DOMAIN, "none")); + int rval = -1; + + /* print options */ + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " Hot spare pool: %s\n"), hspname) == EOF) { + goto out; + } + + /* success */ + rval = 0; + + /* cleanup, return error */ +out: + if (rval != 0) + (void) mdsyserror(ep, errno, fname); + return (rval); +} + +/* + * report stripe + */ +static int +stripe_report( + mdsetname_t *sp, + md_stripe_t *stripep, + mdnamelist_t **nlpp, + char *fname, + FILE *fp, + mdprtopts_t options, + md_error_t *ep +) +{ + uint_t row; + int rval = -1; + uint_t tstate = 0; + + /* + * if the -B option has been specified check to see if the + * metadevice is s "big" one and print if so, also if a + * big device we need to store the ctd involved for use in + * printing out the relocation information. 
+ */ + if (options & PRINT_LARGEDEVICES) { + if (stripep->common.revision != MD_64BIT_META_DEV) { + rval = 0; + goto out; + } else { + if (meta_getdevs(sp, stripep->common.namep, + nlpp, ep) != 0) + goto out; + } + } + + /* print header */ + if (options & PRINT_HEADER) { + if (fprintf(fp, "%s: Concat/Stripe\n", + stripep->common.namep->cname) == EOF) { + goto out; + } + + } + + /* print hotspare pool */ + if (stripep->hspnamep != NULL) { + if (meta_print_stripe_options(stripep->hspnamep, + fname, fp, ep) != 0) { + return (-1); + } + } + + if (metaismeta(stripep->common.namep)) { + if (meta_get_tstate(stripep->common.namep->dev, &tstate, ep) + != 0) + return (-1); + } + if ((tstate & MD_DEV_ERRORED) != 0) { + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " State: Unavailable\n" + " Reconnect disk and invoke: metastat -i\n")) == EOF) { + goto out; + } + } + + /* print size */ + if (fprintf(fp, dgettext(TEXT_DOMAIN, " Size: %lld blocks (%s)\n"), + stripep->common.size, + meta_number_to_string(stripep->common.size, DEV_BSIZE)) + == EOF) { + goto out; + } + + /* print rows */ + for (row = 0; (row < stripep->rows.rows_len); ++row) { + md_row_t *rp = &stripep->rows.rows_val[row]; + + /* print stripe and interlace */ + if (rp->comps.comps_len > 1) { + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " Stripe %u: (interlace: %lld blocks)\n"), + row, rp->interlace) == EOF) { + goto out; + } + } else { + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " Stripe %u:\n"), + row) == EOF) { + goto out; + } + } + + /* print components appropriately */ + if (MD_HAS_PARENT(stripep->common.parent)) { + if (subdev_row_report(sp, rp, fname, fp, options, + tstate & MD_DEV_ERRORED, ep) != 0) { + return (-1); + } + } else { + if (toplev_row_report(sp, rp, fname, fp, options, + ep) != 0) { + return (-1); + } + } + } + + /* add extra line */ + if (fprintf(fp, "\n") == EOF) + goto out; + + /* success */ + rval = 0; + + /* cleanup, return error */ +out: + if (rval != 0) + (void) mdsyserror(ep, errno, fname); + 
return (rval); +} + +/* + * print/report stripe + */ +int +meta_stripe_print( + mdsetname_t *sp, + mdname_t *stripenp, + mdnamelist_t **nlpp, + char *fname, + FILE *fp, + mdprtopts_t options, + md_error_t *ep +) +{ + md_stripe_t *stripep; + int row, comp; + + /* should have same set */ + assert(sp != NULL); + assert((stripenp == NULL) || + (sp->setno == MD_MIN2SET(meta_getminor(stripenp->dev)))); + + /* print all stripes */ + if (stripenp == NULL) { + mdnamelist_t *nlp = NULL; + mdnamelist_t *p; + int cnt; + int rval = 0; + + /* get list */ + if ((cnt = meta_get_stripe_names(sp, &nlp, options, ep)) < 0) + return (-1); + else if (cnt == 0) + return (0); + + /* recurse */ + for (p = nlp; (p != NULL); p = p->next) { + mdname_t *np = p->namep; + + if (meta_stripe_print(sp, np, nlpp, fname, fp, + options, ep) != 0) + rval = -1; + } + + /* cleanup, return success */ + metafreenamelist(nlp); + return (rval); + } + + /* get unit structure */ + if ((stripep = meta_get_stripe_common(sp, stripenp, + ((options & PRINT_FAST) ? 1 : 0), ep)) == NULL) + return (-1); + + /* check for parented */ + if ((! 
(options & PRINT_SUBDEVS)) && + (MD_HAS_PARENT(stripep->common.parent))) { + return (0); + } + + /* print appropriate detail */ + if (options & PRINT_SHORT) { + if (stripe_print(stripep, fname, fp, options, ep) != 0) + return (-1); + } else { + if (stripe_report(sp, stripep, nlpp, fname, fp, options, + ep) != 0) + return (-1); + } + + /* Recurse on components that are metadevices */ + for (row = 0; (row < stripep->rows.rows_len); ++row) { + md_row_t *rp = &stripep->rows.rows_val[row]; + + /* look for components that are metadevices */ + for (comp = 0; (comp < rp->comps.comps_len); ++comp) { + md_comp_t *cp = &rp->comps.comps_val[comp]; + mdname_t *namep = cp->compnamep; + + if ((metaismeta(namep)) && + (meta_print_name(sp, namep, nlpp, fname, fp, + (options | PRINT_HEADER | PRINT_SUBDEVS), + NULL, ep) != 0)) { + return (-1); + } + } + } + return (0); +} + +/* + * find stripe component to replace + */ +int +meta_find_erred_comp( + mdsetname_t *sp, + mdname_t *stripenp, + mdname_t **compnpp, + comp_state_t *compstate, + md_error_t *ep +) +{ + md_stripe_t *stripep; + md_comp_t *compp = NULL; + uint_t lasterrcnt = 0; + uint_t row; + + /* get stripe */ + *compnpp = NULL; + if ((stripep = meta_get_stripe_common(sp, stripenp, 1, ep)) == NULL) + return (-1); + + /* + * Try to find the first erred component. + * If there is not one, then look for the + * first last_erred component. 
+ */ + for (row = 0; (row < stripep->rows.rows_len); ++row) { + md_row_t *rp = &stripep->rows.rows_val[row]; + uint_t comp; + + for (comp = 0; (comp < rp->comps.comps_len); ++comp) { + md_comp_t *cp = &rp->comps.comps_val[comp]; + + if ((cp->state == CS_ERRED) && ((compp == NULL) || + (cp->lasterrcnt < lasterrcnt))) { + compp = cp; + lasterrcnt = cp->lasterrcnt; + } + } + } + for (row = 0; (row < stripep->rows.rows_len); ++row) { + md_row_t *rp = &stripep->rows.rows_val[row]; + uint_t comp; + + for (comp = 0; (comp < rp->comps.comps_len); ++comp) { + md_comp_t *cp = &rp->comps.comps_val[comp]; + + if ((cp->state == CS_LAST_ERRED) && ((compp == NULL) || + (cp->lasterrcnt < lasterrcnt))) { + compp = cp; + lasterrcnt = cp->lasterrcnt; + } + } + } + + /* return component */ + if (compp != NULL) { + *compnpp = compp->compnamep; + *compstate = compp->state; + } + + /* return success */ + return (0); +} + +/* + * invalidate component names + */ +static int +invalidate_components( + mdsetname_t *sp, + mdname_t *stripenp, + md_error_t *ep +) +{ + md_stripe_t *stripep; + uint_t row; + + if ((stripep = meta_get_stripe(sp, stripenp, ep)) == NULL) + return (-1); + for (row = 0; (row < stripep->rows.rows_len); ++row) { + md_row_t *rp = &stripep->rows.rows_val[row]; + uint_t comp; + + for (comp = 0; (comp < rp->comps.comps_len); ++comp) { + md_comp_t *cp = &rp->comps.comps_val[comp]; + mdname_t *compnp = cp->compnamep; + + meta_invalidate_name(compnp); + } + } + return (0); +} + +/* + * attach components to stripe + */ +int +meta_stripe_attach( + mdsetname_t *sp, + mdname_t *stripenp, + mdnamelist_t *nlp, + diskaddr_t interlace, + mdcmdopts_t options, + md_error_t *ep +) +{ + mdnamelist_t *lp; + ms_unit_t *old_un, *new_un; + struct ms_row *mdr, *new_mdr; + uint_t newcomps, ncomps, icomp; + uint_t row; + size_t mdsize, first_comp; + diskaddr_t new_blks; + diskaddr_t limit; + diskaddr_t disk_size = 0; + ms_comp_t *mdcomp, *new_comp; + uint_t write_reinstruct = 0; + uint_t 
read_reinstruct = 0; + mdnamelist_t *keynlp = NULL; + uint_t round_cyl = 1; + minor_t parent; + md_grow_params_t mgp; + int rval = -1; + md_timeval32_t creation_time; + int create_flag = MD_CRO_32BIT; + + /* should have a set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(stripenp->dev))); + + /* check type */ + if (metachkmeta(stripenp, ep) != 0) + return (-1); + + /* check and count components */ + assert(nlp != NULL); + newcomps = 0; + for (lp = nlp; (lp != NULL); lp = lp->next) { + mdname_t *np = lp->namep; + mdnamelist_t *p; + + /* check against existing devices */ + if (meta_check_component(sp, np, 0, ep) != 0) + return (-1); + + /* check against ourselves */ + for (p = lp->next; (p != NULL); p = p->next) { + if (meta_check_overlap(np->cname, np, 0, -1, + p->namep, 0, -1, ep) != 0) { + return (-1); + } + } + + /* count */ + ++newcomps; + } + + /* get old unit */ + if ((old_un = (ms_unit_t *)meta_get_mdunit(sp, stripenp, ep)) == NULL) + return (-1); + + /* if zero, inherit the last rows interlace value */ + if (interlace == 0) { + mdr = &old_un->un_row[old_un->un_nrows - 1]; + interlace = mdr->un_interlace; + } + + /* + * calculate size of new unit structure + */ + + /* unit + rows */ + mdsize = sizeof (ms_unit_t) - sizeof (struct ms_row); + mdsize += sizeof (struct ms_row) * (old_un->un_nrows + 1); + + /* number of new components being added */ + ncomps = newcomps; + + /* count the # of components in the old unit */ + mdr = &old_un->un_row[0]; + for (row = 0; (row < old_un->un_nrows); row++) + ncomps += mdr[row].un_ncomp; + first_comp = roundup(mdsize, sizeof (long long)); + mdsize += sizeof (ms_comp_t) * ncomps + (first_comp - mdsize); + + /* allocate new unit */ + new_un = Zalloc(mdsize); + new_un->un_ocomp = first_comp; + + /* compute new data */ + new_mdr = &new_un->un_row[old_un->un_nrows]; + new_mdr->un_icomp = ncomps - newcomps; + new_mdr->un_ncomp = newcomps; + new_mdr->un_blocks = 0; + new_mdr->un_cum_blocks = + 
old_un->un_row[old_un->un_nrows - 1].un_cum_blocks; + new_mdr->un_interlace = interlace; + + /* for each new device */ + mdcomp = (struct ms_comp *)(void *)&((char *)new_un)[new_un->un_ocomp]; + icomp = new_mdr->un_icomp; + if (meta_gettimeofday(&creation_time) == -1) + return (mdsyserror(ep, errno, NULL)); + for (lp = nlp; (lp != NULL); lp = lp->next) { + mdname_t *np = lp->namep; + diskaddr_t size, start_blk; + mdgeom_t *geomp; + + /* figure out how big */ + if ((size = metagetsize(np, ep)) == MD_DISKADDR_ERROR) + goto out; + if ((start_blk = metagetstart(sp, np, ep)) == + MD_DISKADDR_ERROR) + goto out; + if (start_blk >= size) { + (void) mdsyserror(ep, ENOSPC, np->cname); + goto out; + } + size -= start_blk; + if (newcomps > 1) + size = rounddown(size, interlace); + + /* adjust for smallest disk */ + if (disk_size == 0) { + disk_size = size; + } else if (size < disk_size) { + disk_size = size; + } + + /* get worst reinstructs */ + if ((geomp = metagetgeom(np, ep)) == NULL) + goto out; + if (geomp->write_reinstruct > write_reinstruct) + write_reinstruct = geomp->write_reinstruct; + if (geomp->read_reinstruct > read_reinstruct) + read_reinstruct = geomp->read_reinstruct; + + /* In dryrun mode (DOIT not set) we must not alter the mddb */ + if (options & MDCMD_DOIT) { + /* store name in namespace */ + if (add_key_name(sp, np, &keynlp, ep) != 0) + goto out; + } + + /* build new component */ + new_comp = &mdcomp[icomp++]; + new_comp->un_key = np->key; + new_comp->un_dev = np->dev; + new_comp->un_start_block = start_blk; + new_comp->un_mirror.ms_state = CS_OKAY; + new_comp->un_mirror.ms_timestamp = creation_time; + } + + limit = LLONG_MAX; + + /* compute new size */ + new_mdr->un_blocks = new_mdr->un_ncomp * disk_size; + new_blks = new_mdr->un_cum_blocks + new_mdr->un_blocks; + if (new_blks > limit) { + new_mdr->un_cum_blocks = limit; + new_blks = limit; + md_eprintf(dgettext(TEXT_DOMAIN, + "unit size overflow, limit is %lld blocks\n"), + limit); + } else { + 
new_mdr->un_cum_blocks += new_mdr->un_blocks; + } + new_un->c.un_actual_tb = new_mdr->un_cum_blocks; + new_un->un_nrows = old_un->un_nrows + 1; + + /* adjust geometry */ + new_un->c.un_nhead = old_un->c.un_nhead; + new_un->c.un_nsect = old_un->c.un_nsect; + new_un->c.un_rpm = old_un->c.un_rpm; + new_un->c.un_wr_reinstruct = old_un->c.un_wr_reinstruct; + new_un->c.un_rd_reinstruct = old_un->c.un_rd_reinstruct; + if (meta_adjust_geom((md_unit_t *)new_un, stripenp, + write_reinstruct, read_reinstruct, round_cyl, ep) != 0) + goto out; + + /* if in dryrun mode, we are done here. */ + if ((options & MDCMD_DOIT) == 0) { + if (options & MDCMD_PRINT) { + if (newcomps == 1) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: attaching component would suceed\n"), + stripenp->cname); + } else { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: attaching components would suceed\n"), + stripenp->cname); + } + } + rval = 0; /* success */ + goto out; + } + + create_flag = meta_check_devicesize(new_un->c.un_total_blocks); + + /* grow stripe */ + (void) memset(&mgp, 0, sizeof (mgp)); + mgp.mnum = MD_SID(old_un); + MD_SETDRIVERNAME(&mgp, MD_STRIPE, sp->setno); + mgp.size = mdsize; + mgp.mdp = (uintptr_t)new_un; + mgp.nrows = old_un->un_nrows; + if (create_flag == MD_CRO_32BIT) { + mgp.options = MD_CRO_32BIT; + new_un->c.un_revision = MD_32BIT_META_DEV; + } else { + mgp.options = MD_CRO_64BIT; + new_un->c.un_revision = MD_64BIT_META_DEV; + } + + if ((MD_HAS_PARENT(old_un->c.un_parent)) && + (old_un->c.un_parent != MD_MULTI_PARENT)) { + mgp.npar = 1; + parent = old_un->c.un_parent; + mgp.par = (uintptr_t)(&parent); + } + + if (metaioctl(MD_IOCGROW, &mgp, &mgp.mde, NULL) != 0) { + (void) mdstealerror(ep, &mgp.mde); + goto out; + } + + /* clear cache */ + if (invalidate_components(sp, stripenp, ep) != 0) + goto out; + meta_invalidate_name(stripenp); + + /* let em know */ + if (options & MDCMD_PRINT) { + if (newcomps == 1) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: component is attached\n"), 
stripenp->cname); + } else { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: components are attached\n"), stripenp->cname); + } + (void) fflush(stdout); + } + + /* grow any parents */ + if (meta_concat_parent(sp, stripenp, ep) != 0) + return (-1); + + rval = 0; /* success */ + + /* cleanup, return error */ +out: + Free(old_un); + Free(new_un); + if (options & MDCMD_DOIT) { + if (rval != 0) + (void) del_key_names(sp, keynlp, NULL); + metafreenamelist(keynlp); + } + return (rval); +} + +/* + * get stripe parameters + */ +int +meta_stripe_get_params( + mdsetname_t *sp, + mdname_t *stripenp, + ms_params_t *paramsp, + md_error_t *ep +) +{ + md_stripe_t *stripep; + + /* should have a set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(stripenp->dev))); + + /* check name */ + if (metachkmeta(stripenp, ep) != 0) + return (-1); + + /* get unit */ + if ((stripep = meta_get_stripe(sp, stripenp, ep)) == NULL) + return (-1); + + /* return parameters */ + (void) memset(paramsp, 0, sizeof (*paramsp)); + if (stripep->hspnamep == NULL) + paramsp->hsp_id = MD_HSP_NONE; + else + paramsp->hsp_id = stripep->hspnamep->hsp; + return (0); +} + +/* + * set stripe parameters + */ +int +meta_stripe_set_params( + mdsetname_t *sp, + mdname_t *stripenp, + ms_params_t *paramsp, + md_error_t *ep +) +{ + md_stripe_params_t msp; + + /* should have a set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(stripenp->dev))); + + /* check name */ + if (metachkmeta(stripenp, ep) != 0) + return (-1); + + /* set parameters */ + (void) memset(&msp, 0, sizeof (msp)); + MD_SETDRIVERNAME(&msp, MD_STRIPE, sp->setno); + msp.mnum = meta_getminor(stripenp->dev); + msp.params = *paramsp; + if (metaioctl(MD_IOCCHANGE, &msp, &msp.mde, stripenp->cname) != 0) + return (mdstealerror(ep, &msp.mde)); + + /* clear cache */ + meta_invalidate_name(stripenp); + + /* return success */ + return (0); +} + +/* + * check for dups in the stripe itself + */ +static int +check_twice( + 
md_stripe_t *stripep, + uint_t row, + uint_t comp, + md_error_t *ep +) +{ + mdname_t *stripenp = stripep->common.namep; + mdname_t *thisnp; + uint_t r; + + thisnp = stripep->rows.rows_val[row].comps.comps_val[comp].compnamep; + for (r = 0; (r <= row); ++r) { + md_row_t *rp = &stripep->rows.rows_val[r]; + uint_t e = ((r == row) ? comp : rp->comps.comps_len); + uint_t c; + + for (c = 0; (c < e); ++c) { + md_comp_t *cp = &rp->comps.comps_val[c]; + mdname_t *compnp = cp->compnamep; + + if (meta_check_overlap(stripenp->cname, thisnp, 0, -1, + compnp, 0, -1, ep) != 0) { + return (-1); + } + } + } + return (0); +} + +/* + * default stripe interlace + */ +diskaddr_t +meta_default_stripe_interlace(void) +{ + diskaddr_t interlace; + + /* default to 16k, round up if necessary */ + interlace = btodb(16 * 1024); + if (interlace < btodb(MININTERLACE)) + interlace = roundup(MININTERLACE, interlace); + return (interlace); +} + +/* + * convert interlaces + */ +int +meta_stripe_check_interlace( + diskaddr_t interlace, + char *uname, + md_error_t *ep +) +{ + if ((interlace < btodb(MININTERLACE)) || + (interlace > btodb(MAXINTERLACE))) { + return (mderror(ep, MDE_BAD_INTERLACE, uname)); + } + return (0); +} + + +/* + * check stripe + */ +int +meta_check_stripe( + mdsetname_t *sp, + md_stripe_t *stripep, + mdcmdopts_t options, + md_error_t *ep +) +{ + mdname_t *stripenp = stripep->common.namep; + int force = ((options & MDCMD_FORCE) ? 1 : 0); + int doit = ((options & MDCMD_DOIT) ? 1 : 0); + int updateit = ((options & MDCMD_UPDATE) ? 
1 : 0); + uint_t row; + + /* check rows */ + if (stripep->rows.rows_len < 1) { + return (mdmderror(ep, MDE_BAD_STRIPE, + meta_getminor(stripenp->dev), stripenp->cname)); + } + for (row = 0; (row < stripep->rows.rows_len); ++row) { + md_row_t *rp = &stripep->rows.rows_val[row]; + uint_t comp; + + /* check number */ + if (rp->comps.comps_len < 1) { + return (mdmderror(ep, MDE_BAD_STRIPE, + meta_getminor(stripenp->dev), stripenp->cname)); + } + + /* compute default interlace */ + if (rp->interlace == 0) { + rp->interlace = meta_default_stripe_interlace(); + } + + /* check interlace */ + if (meta_stripe_check_interlace(rp->interlace, stripenp->cname, + ep) != 0) { + return (-1); + } + + /* check components */ + for (comp = 0; (comp < rp->comps.comps_len); ++comp) { + md_comp_t *cp = &rp->comps.comps_val[comp]; + mdname_t *compnp = cp->compnamep; + diskaddr_t start_blk, size; + + /* check component */ + if (!updateit) { + if (meta_check_component(sp, compnp, + force, ep) != 0) + return (-1); + if (((start_blk = metagetstart(sp, compnp, + ep)) == MD_DISKADDR_ERROR) || + ((size = metagetsize(compnp, ep)) == + MD_DISKADDR_ERROR)) { + return (-1); + } + if (start_blk >= size) + return (mdsyserror(ep, ENOSPC, + compnp->cname)); + size -= start_blk; + size = rounddown(size, rp->interlace); + if (size == 0) + return (mdsyserror(ep, ENOSPC, + compnp->cname)); + } + + /* check this stripe too */ + if (check_twice(stripep, row, comp, ep) != 0) + return (-1); + } + } + + /* check hotspare pool name */ + if (doit) { + if ((stripep->hspnamep != NULL) && + (metachkhsp(sp, stripep->hspnamep, ep) != 0)) { + return (-1); + } + } + + /* return success */ + return (0); +} + +/* + * setup stripe geometry + */ +static int +stripe_geom( + md_stripe_t *stripep, + ms_unit_t *ms, + md_error_t *ep +) +{ + uint_t nrow = stripep->rows.rows_len; + uint_t write_reinstruct = 0; + uint_t read_reinstruct = 0; + uint_t round_cyl = 1; + uint_t row; + mdgeom_t *geomp; + diskaddr_t first_row_size = 0; + 
char *miscname; + int is_sp = 0; + + /* get worst reinstructs */ + for (row = 0; (row < nrow); ++row) { + md_row_t *rp = &stripep->rows.rows_val[row]; + uint_t ncomp = rp->comps.comps_len; + uint_t comp; + + for (comp = 0; (comp < ncomp); ++comp) { + md_comp_t *cp = &rp->comps.comps_val[comp]; + mdname_t *compnp = cp->compnamep; + + if ((geomp = metagetgeom(compnp, ep)) == NULL) + return (-1); + if (geomp->write_reinstruct > write_reinstruct) + write_reinstruct = geomp->write_reinstruct; + if (geomp->read_reinstruct > read_reinstruct) + read_reinstruct = geomp->read_reinstruct; + } + } + + if ((geomp = metagetgeom( + stripep->rows.rows_val[0].comps.comps_val[0].compnamep, + ep)) == NULL) { + return (-1); + } + /* + * Figure out if the first component is a softpartition as the + * truncation check only occurs on them. + */ + if ((miscname = metagetmiscname( + stripep->rows.rows_val[0].comps.comps_val[0].compnamep, + ep)) == NULL) { + if (!mdisdeverror(ep, MDE_NOT_META)) + return (-1); + } else if (strcmp(miscname, MD_SP) == 0) { + is_sp = 1; + } + + + /* setup geometry from first device */ + if (meta_setup_geom((md_unit_t *)ms, stripep->common.namep, geomp, + write_reinstruct, read_reinstruct, round_cyl, ep) != 0) + return (-1); + + /* + * Here we want to make sure that any truncation did not + * result in lost data (or, more appropriately, inaccessible + * data). + * + * This is mainly a danger for (1, 1) concats, but it is + * mathematically possible for other somewhat contrived + * arrangements where in the sum of the lengths of each row + * beyond the first is smaller than the cylinder size of the + * only component in the first row. + * + * It is tempting to simply test for truncation here, by + * (md->c.un_total_blocks < md->c.un_actual_tb). That does + * not tell us, however, if rounding resulted in data loss, + * rather only that it occurred. The somewhat less obvious + * test below covers both the obvious (1, 1) case and the + * aforementioned corner case. 
+ */ + first_row_size = ms->un_row[0].un_blocks; + if (is_sp == 1) { + md_unit_t *md = (md_unit_t *)ms; + + if (md->c.un_total_blocks < first_row_size) { + char buf[] = VAL2STR(ULLONG_MAX); + + /* + * The only difference here is the text of the error + * message, since the remediation is slightly + * different in the one-component versus + * multiple-component cases. + */ + if (nrow == 1) { + (void) mderror(ep, MDE_STRIPE_TRUNC_SINGLE, + stripep->common.namep->cname); + } else { + (void) mderror(ep, MDE_STRIPE_TRUNC_MULTIPLE, + stripep->common.namep->cname); + } + + /* + * By the size comparison above and the initialization + * of buf[] in terms of ULLONG_MAX, we guarantee that + * the value arg is non-negative and that we won't + * overflow the container. + */ + mderrorextra(ep, ulltostr((md->c.un_total_blocks + + (geomp->nhead * geomp->nsect)) + - first_row_size, &buf[sizeof (buf) - 1])); + + return (-1); + } + } + + /* return success */ + return (0); +} + +/* + * create stripe + */ +int +meta_create_stripe( + mdsetname_t *sp, + md_stripe_t *stripep, + mdcmdopts_t options, + md_error_t *ep +) +{ + mdname_t *stripenp = stripep->common.namep; + int force = ((options & MDCMD_FORCE) ? 1 : 0); + int doall = ((options & MDCMD_ALLOPTION) ? 
1 : 0); + uint_t nrow = stripep->rows.rows_len; + uint_t ncomp = 0; + uint_t icomp = 0; + diskaddr_t cum_blocks = 0; + diskaddr_t limit; + size_t mdsize, first_comp; + uint_t row; + ms_unit_t *ms; + ms_comp_t *mdcomp; + mdnamelist_t *keynlp = NULL; + md_set_params_t set_params; + int rval = -1; + md_timeval32_t creation_time; + int create_flag = MD_CRO_32BIT; + + /* validate stripe */ + if (meta_check_stripe(sp, stripep, options, ep) != 0) + return (-1); + + /* allocate stripe unit */ + mdsize = sizeof (*ms) - sizeof (ms->un_row[0]); + mdsize += sizeof (ms->un_row) * nrow; + for (row = 0; (row < nrow); ++row) { + md_row_t *rp = &stripep->rows.rows_val[row]; + + ncomp += rp->comps.comps_len; + } + first_comp = roundup(mdsize, sizeof (long long)); + mdsize += (first_comp - mdsize) + (ncomp * sizeof (ms_comp_t)); + ms = Zalloc(mdsize); + ms->un_ocomp = first_comp; + if (meta_gettimeofday(&creation_time) == -1) { + /* + * ms is already allocated: leave through the common exit + * path so it is freed (rval is still -1 at this point). + */ + (void) mdsyserror(ep, errno, NULL); + goto out; + } + + /* do rows */ + mdcomp = (ms_comp_t *)(void *)&((char *)ms)[ms->un_ocomp]; + for (row = 0; (row < nrow); ++row) { + md_row_t *rp = &stripep->rows.rows_val[row]; + uint_t ncomp = rp->comps.comps_len; + struct ms_row *mdr = &ms->un_row[row]; + diskaddr_t disk_size = 0; + uint_t comp; + + /* setup component count and offset */ + mdr->un_icomp = icomp; + mdr->un_ncomp = ncomp; + + /* do components */ + for (comp = 0; (comp < ncomp); ++comp) { + md_comp_t *cp = &rp->comps.comps_val[comp]; + mdname_t *compnp = cp->compnamep; + ms_comp_t *mdc = &mdcomp[icomp++]; + diskaddr_t size, start_blk; + + /* + * get start and size + * if first component is labelled, include label + */ + if ((size = metagetsize(compnp, ep)) == + MD_DISKADDR_ERROR) + goto out; + if ((start_blk = metagetstart(sp, compnp, ep)) == + MD_DISKADDR_ERROR) + goto out; + if ((row == 0) && (comp == 0)) { + diskaddr_t label; + int has_db; + + if ((has_db = metahasmddb(sp, compnp, ep)) < 0) + goto out; + if ((label = metagetlabel(compnp, ep)) == +
MD_DISKADDR_ERROR) + goto out; + if ((has_db == 0) && (label != 0)) { + ms->c.un_flag |= MD_LABELED; + start_blk = compnp->start_blk = 0; + } + } + /* make sure we still have something left */ + if (start_blk >= size) { + (void) mdsyserror(ep, ENOSPC, compnp->cname); + goto out; + } + size -= start_blk; + + /* + * round down by interlace: this only applies + * if this row is a stripe, as indicated by + * (ncomp > 1) + */ + if (ncomp > 1) + size = rounddown(size, rp->interlace); + + if (size == 0) { + (void) mdsyserror(ep, ENOSPC, compnp->cname); + goto out; + } + + /* + * adjust for smallest disk: for a concat (any + * row with only one component), this will + * never hit the second conditional. + */ + if (disk_size == 0) { + disk_size = size; + } else if (size < disk_size) { + disk_size = size; + } + + if (options & MDCMD_DOIT) { + /* store name in namespace */ + if (add_key_name(sp, compnp, &keynlp, ep) != 0) + goto out; + } + + /* setup component */ + mdc->un_key = compnp->key; + mdc->un_dev = compnp->dev; + mdc->un_start_block = start_blk; + mdc->un_mirror.ms_state = CS_OKAY; + mdc->un_mirror.ms_timestamp = creation_time; + } + limit = LLONG_MAX; + + /* setup row */ + mdr->un_blocks = mdr->un_ncomp * disk_size; + cum_blocks += mdr->un_blocks; + if (cum_blocks > limit) { + cum_blocks = limit; + md_eprintf(dgettext(TEXT_DOMAIN, + "unit size overflow, limit is %lld blocks\n"), + limit); + } + mdr->un_cum_blocks = cum_blocks; + mdr->un_interlace = rp->interlace; + } + + /* setup unit */ + ms->c.un_type = MD_DEVICE; + MD_SID(ms) = meta_getminor(stripenp->dev); + ms->c.un_actual_tb = cum_blocks; + ms->c.un_size = mdsize; + if (stripep->hspnamep != NULL) + ms->un_hsp_id = stripep->hspnamep->hsp; + else + ms->un_hsp_id = MD_HSP_NONE; + ms->un_nrows = nrow; + + /* fill in the size of the stripe */ + if (options & MDCMD_UPDATE) { + stripep->common.size = ms->c.un_total_blocks; + for (row = 0; (row < nrow); ++row) { + stripep->rows.rows_val[row].row_size = + 
ms->un_row[row].un_blocks; + } + } + + if (stripe_geom(stripep, ms, ep) != 0) { + /* + * If the device is being truncated then only allow this + * if the user is aware (using the -f option) or they + * are in a recovery/complete build situation (using the -a + * option). + */ + if ((mdiserror(ep, MDE_STRIPE_TRUNC_SINGLE) || + mdiserror(ep, MDE_STRIPE_TRUNC_MULTIPLE)) && + (force || doall)) { + md_eprintf(dgettext(TEXT_DOMAIN, +"%s: WARNING: This form of metainit is not recommended.\n" +"The stripe is truncating the size of the underlying device.\n" +"Please see ERRORS in metainit(1M) for additional information.\n"), + stripenp->cname); + mdclrerror(ep); + } else { + goto out; + } + } + + create_flag = meta_check_devicesize(ms->c.un_total_blocks); + + /* if we're not doing anything, return success */ + if (! (options & MDCMD_DOIT)) { + rval = 0; /* success */ + goto out; + } + + /* create stripe */ + (void) memset(&set_params, 0, sizeof (set_params)); + + /* did the user tell us to generate a large device? */ + if (create_flag == MD_CRO_64BIT) { + ms->c.un_revision = MD_64BIT_META_DEV; + set_params.options = MD_CRO_64BIT; + } else { + ms->c.un_revision = MD_32BIT_META_DEV; + set_params.options = MD_CRO_32BIT; + } + + set_params.mnum = MD_SID(ms); + set_params.size = ms->c.un_size; + set_params.mdp = (uintptr_t)ms; + MD_SETDRIVERNAME(&set_params, MD_STRIPE, MD_MIN2SET(set_params.mnum)); + if (metaioctl(MD_IOCSET, &set_params, &set_params.mde, + stripenp->cname) != 0) { + (void) mdstealerror(ep, &set_params.mde); + goto out; + } + rval = 0; /* success */ + + /* cleanup, return success */ +out: + Free(ms); + if (rval != 0) { + (void) del_key_names(sp, keynlp, NULL); + } + + metafreenamelist(keynlp); + if ((rval == 0) && (options & MDCMD_DOIT)) { + if (invalidate_components(sp, stripenp, ep) != 0) + rval = -1; + meta_invalidate_name(stripenp); + } + return (rval); +} + +/* + * initialize stripe + * NOTE: this functions is metainit(1m)'s command line parser! 
+ */ +int +meta_init_stripe( + mdsetname_t **spp, + int argc, + char *argv[], + mdcmdopts_t options, + md_error_t *ep +) +{ + char *uname = argv[0]; + mdname_t *stripenp = NULL; + int old_optind; + int c; + md_stripe_t *stripep = NULL; + uint_t nrow, row; + int rval = -1; + + /* get stripe name */ + assert(argc > 0); + if (argc < 1) + goto syntax; + + if ((stripenp = metaname(spp, uname, ep)) == NULL) + goto out; + assert(*spp != NULL); + uname = stripenp->cname; + if (metachkmeta(stripenp, ep) != 0) + goto out; + + if (!(options & MDCMD_NOLOCK)) { + /* grab set lock */ + if (meta_lock(*spp, TRUE, ep)) + goto out; + + if (meta_check_ownership(*spp, ep) != 0) + goto out; + } + + /* see if it exists already */ + if (metagetmiscname(stripenp, ep) != NULL) { + (void) mdmderror(ep, MDE_UNIT_ALREADY_SETUP, + meta_getminor(stripenp->dev), uname); + goto out; + } else if (! mdismderror(ep, MDE_UNIT_NOT_SETUP)) { + goto out; + } else { + mdclrerror(ep); + } + --argc, ++argv; + + /* parse general options */ + optind = 0; + opterr = 0; + if (getopt(argc, argv, "") != -1) + goto options; + + /* allocate stripe */ + stripep = Zalloc(sizeof (*stripep)); + + /* setup common */ + stripep->common.namep = stripenp; + stripep->common.type = MD_DEVICE; + + /* allocate and parse rows */ + if (argc < 1) { + (void) mdmderror(ep, MDE_NROWS, meta_getminor(stripenp->dev), + uname); + goto out; + } else if ((sscanf(argv[0], "%u", &nrow) != 1) || ((int)nrow < 0)) { + goto syntax; + } else if (nrow < 1) { + (void) mdmderror(ep, MDE_NROWS, meta_getminor(stripenp->dev), + uname); + goto out; + } + --argc, ++argv; + stripep->rows.rows_len = nrow; + stripep->rows.rows_val = + Zalloc(nrow * sizeof (*stripep->rows.rows_val)); + for (row = 0; (row < nrow); ++row) { + md_row_t *mdr = &stripep->rows.rows_val[row]; + uint_t ncomp, comp; + + /* allocate and parse components */ + if (argc < 1) { + (void) mdmderror(ep, MDE_NROWS, + meta_getminor(stripenp->dev), uname); + goto out; + } else if 
((sscanf(argv[0], "%u", &ncomp) != 1) || + ((int)ncomp < 0)) { + goto syntax; + } else if (ncomp < 1) { + (void) mdmderror(ep, MDE_NCOMPS, + meta_getminor(stripenp->dev), uname); + goto out; + } + --argc, ++argv; + mdr->comps.comps_len = ncomp; + mdr->comps.comps_val = + Zalloc(ncomp * sizeof (*mdr->comps.comps_val)); + for (comp = 0; (comp < ncomp); ++comp) { + md_comp_t *mdc = &mdr->comps.comps_val[comp]; + mdname_t *compnp; + + /* parse component name */ + if (argc < 1) { + (void) mdmderror(ep, MDE_NCOMPS, + meta_getminor(stripenp->dev), uname); + goto out; + } + if ((compnp = metaname(spp, argv[0], ep)) == NULL) { + goto out; + } + /* check for soft partition */ + if (meta_sp_issp(*spp, compnp, ep) != 0) { + /* check disk */ + if (metachkcomp(compnp, ep) != 0) { + goto out; + } + } + mdc->compnamep = compnp; + --argc, ++argv; + } + + /* parse row options */ + old_optind = optind = 0; + opterr = 0; + while ((c = getopt(argc, argv, "i:")) != -1) { + switch (c) { + case 'i': + if (parse_interlace(uname, optarg, + &mdr->interlace, ep) != 0) { + goto out; + } + if (meta_stripe_check_interlace(mdr->interlace, + uname, ep)) + goto out; + break; + + default: + optind = old_optind; /* bomb out later */ + goto done_row_opts; + } + old_optind = optind; + } +done_row_opts: + argc -= optind; + argv += optind; + } + + /* parse stripe options */ + old_optind = optind = 0; + opterr = 0; + while ((c = getopt(argc, argv, "h:")) != -1) { + switch (c) { + case 'h': + if ((stripep->hspnamep = metahspname(spp, optarg, + ep)) == NULL) { + goto out; + } + break; + + default: + argc += old_optind; + argv += old_optind; + goto options; + } + old_optind = optind; + } + argc -= optind; + argv += optind; + + /* we should be at the end */ + if (argc != 0) + goto syntax; + + /* create stripe */ + if (meta_create_stripe(*spp, stripep, options, ep) != 0) + goto out; + rval = 0; /* success */ + + /* let em know */ + if (options & MDCMD_PRINT) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: 
Concat/Stripe is setup\n"), + uname); + (void) fflush(stdout); + } + goto out; + + /* syntax error */ +syntax: + rval = meta_cook_syntax(ep, MDE_SYNTAX, uname, argc, argv); + goto out; + + /* options error */ +options: + rval = meta_cook_syntax(ep, MDE_OPTION, uname, argc, argv); + goto out; + + /* cleanup, return error */ +out: + if (stripep != NULL) + meta_free_stripe(stripep); + return (rval); +} + +/* + * reset stripes + */ +int +meta_stripe_reset( + mdsetname_t *sp, + mdname_t *stripenp, + mdcmdopts_t options, + md_error_t *ep +) +{ + md_stripe_t *stripep; + int rval = -1; + int row, comp; + + /* should have same set */ + assert(sp != NULL); + assert((stripenp == NULL) || + (sp->setno == MD_MIN2SET(meta_getminor(stripenp->dev)))); + + /* reset all stripes */ + if (stripenp == NULL) { + mdnamelist_t *stripenlp = NULL; + mdnamelist_t *p; + + /* for each stripe */ + rval = 0; + if (meta_get_stripe_names(sp, &stripenlp, 0, ep) < 0) + return (-1); + for (p = stripenlp; (p != NULL); p = p->next) { + /* reset stripe */ + stripenp = p->namep; + + /* + * If this is a multi-node set, we send a series + * of individual metaclear commands. 
+ */ + if (meta_is_mn_set(sp, ep)) { + if (meta_mn_send_metaclear_command(sp, + stripenp->cname, options, 0, ep) != 0) { + rval = -1; + break; + } + } else { + if (meta_stripe_reset(sp, stripenp, + options, ep) != 0) { + rval = -1; + break; + } + } + } + + /* cleanup, return success */ + metafreenamelist(stripenlp); + return (rval); + } + + /* check name */ + if (metachkmeta(stripenp, ep) != 0) + return (-1); + + /* get unit structure */ + if ((stripep = meta_get_stripe(sp, stripenp, ep)) == NULL) + return (-1); + + /* make sure nobody owns us */ + if (MD_HAS_PARENT(stripep->common.parent)) { + return (mdmderror(ep, MDE_IN_USE, meta_getminor(stripenp->dev), + stripenp->cname)); + } + + /* clear subdevices cache */ + if (invalidate_components(sp, stripenp, ep) != 0) + return (-1); + + /* clear metadevice */ + if (meta_reset(sp, stripenp, options, ep) != 0) + goto out; + rval = 0; /* success */ + + /* let em know */ + if (options & MDCMD_PRINT) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: Concat/Stripe is cleared\n"), + stripenp->cname); + (void) fflush(stdout); + } + + /* clear subdevices */ + if (! (options & MDCMD_RECURSE)) + goto out; + + for (row = 0; (row < stripep->rows.rows_len); ++row) { + md_row_t *rp = &stripep->rows.rows_val[row]; + for (comp = 0; (comp < rp->comps.comps_len); ++comp) { + md_comp_t *cp = &rp->comps.comps_val[comp]; + mdname_t *compnp = cp->compnamep; + + /* only recurse on metadevices */ + if (! 
metaismeta(compnp)) + continue; + + if (meta_reset_by_name(sp, compnp, options, ep) != 0) + rval = -1; + } + } + + /* cleanup, return success */ +out: + meta_invalidate_name(stripenp); + return (rval); +} + +/* + * reports TRUE if any stripe component is in error + */ +int +meta_stripe_anycomp_is_err(mdsetname_t *sp, mdnamelist_t *stripe_names) +{ + mdnamelist_t *nlp; + md_error_t status = mdnullerror; + md_error_t *ep = &status; + int any_errs = FALSE; + + for (nlp = stripe_names; nlp; nlp = nlp->next) { + md_stripe_t *stripep; + int row; + + if ((stripep = meta_get_stripe(sp, nlp->namep, ep)) == NULL) { + any_errs |= TRUE; + goto out; + } + + for (row = 0; row < stripep->rows.rows_len; ++row) { + md_row_t *rp = &stripep->rows.rows_val[row]; + uint_t comp; + + for (comp = 0; comp < rp->comps.comps_len; ++comp) { + md_comp_t *cp = &rp->comps.comps_val[comp]; + + if (cp->state != CS_OKAY) { + any_errs |= TRUE; + goto out; + } + } + } + } +out: + if (!mdisok(ep)) + mdclrerror(ep); + + return (any_errs); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_systemfile.c b/usr/src/lib/lvm/libmeta/common/meta_systemfile.c new file mode 100644 index 0000000000..9e5e20f057 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_systemfile.c @@ -0,0 +1,475 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. + */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +/* + * patch /kernel/drv/md.conf file + */ +#include <sys/types.h> +#include <sys/stat.h> +#include <meta.h> +#include <sys/lvm/md_mddb.h> + +/* + * magic strings in system + */ +#define BEGROOTSTR "* Begin MDD root info (do not edit)\n" +#define ENDROOTSTR "* End MDD root info (do not edit)\n" +#define BEGMDDBSTR "# Begin MDD database info (do not edit)\n" +#define ENDMDDBSTR "# End MDD database info (do not edit)\n" + +/* + * copy system file, yank root and database lines + */ +int +meta_systemfile_copy( + char *sname, /* system file name */ + int doroot, /* remove mdd root stuff */ + int domddb, /* remove mdd database stuff */ + int doit, /* really copy file */ + int verbose, /* show what we're doing */ + char **tname, /* returned temp file name */ + FILE **tfp, /* returned open FILE */ + md_error_t *ep /* returned error */ +) +{ + FILE *fp; + struct stat sbuf; + char buf[MDDB_BOOTLIST_MAX_LEN]; + int delroot = 0; + int delmddb = 0; + + /* check names */ + assert(sname != NULL); + assert(tname != NULL); + assert(tfp != NULL); + + /* get temp name */ + *tfp = NULL; + *tname = Malloc(strlen(sname) + strlen(".tmp") + 1); + (void) strcpy(*tname, sname); + (void) strcat(*tname, ".tmp"); + + /* copy system file, yank stuff */ + if (((fp = fopen(sname, "r")) == NULL) || + (fstat(fileno(fp), &sbuf) != 0)) { + if (errno != ENOENT) { + (void) mdsyserror(ep, errno, sname); + goto out; + } + } + if (doit) { + if 
((*tfp = fopen(*tname, "w")) == NULL) { + /* + * If we are on the miniroot we need to create + * files in /var/tmp. Opening a writable file + * in the miniroot result is EROFS error. + */ + if (errno != EROFS) { + (void) mdsyserror(ep, errno, *tname); + goto out; + } + Free(*tname); + *tname = tempnam("/var/tmp", "svm_"); + if (*tname == NULL) { + (void) mdsyserror(ep, errno, NULL); + goto out; + } + if ((*tfp = fopen(*tname, "w")) == NULL) { + (void) mdsyserror(ep, errno, *tname); + goto out; + } + } + if (fp != NULL) { + if ((fchmod(fileno(*tfp), (sbuf.st_mode & 0777)) + != 0) || + (fchown(fileno(*tfp), sbuf.st_uid, sbuf.st_gid) + != 0)) { + (void) mdsyserror(ep, errno, *tname); + goto out; + } + } + } + if (verbose) { + (void) printf(dgettext(TEXT_DOMAIN, + "Delete the following lines from %s:\n\n"), sname); + } + while ((fp != NULL) && (fgets(buf, sizeof (buf), fp) != NULL)) { + if ((doroot) && (strcmp(buf, BEGROOTSTR) == 0)) { + delroot = 1; + if (verbose) + (void) printf("%s", buf); + continue; + } + if (delroot) { + if (strcmp(buf, ENDROOTSTR) == 0) + delroot = 0; + if (verbose) + (void) printf("%s", buf); + continue; + } + if ((domddb) && (strcmp(buf, BEGMDDBSTR) == 0)) { + delmddb = 1; + if (verbose) + (void) printf("%s", buf); + continue; + } + if (delmddb) { + if (strcmp(buf, ENDMDDBSTR) == 0) + delmddb = 0; + if (verbose) + (void) printf("%s", buf); + continue; + } + if (doit) { + if (fputs(buf, *tfp) == EOF) { + (void) mdsyserror(ep, errno, *tname); + goto out; + } + } + } + if (fp != NULL) { + if ((! 
feof(fp)) || + (fclose(fp) != 0)) { + (void) mdsyserror(ep, errno, sname); + goto out; + } + fp = NULL; + } + if (verbose) + (void) printf("\n"); + + /* make sure we didn't stop mid-delete */ + if ((delroot) || (delmddb)) { + (void) mderror(ep, MDE_SYSTEM_FILE, sname); + goto out; + } + + /* flush stuff */ + if (doit) { + if ((fflush(*tfp) != 0) || + (fsync(fileno(*tfp)) != 0)) { + (void) mdsyserror(ep, errno, *tname); + goto out; + } + } + + /* return success */ + return (0); + + /* cleanup, return error */ +out: + if (fp != NULL) + (void) fclose(fp); + if (*tname != NULL) { + (void) unlink(*tname); + Free(*tname); + } + if (*tfp != NULL) + (void) fclose(*tfp); + return (-1); +} + +/* + * append root on MD lines to system + */ +int +meta_systemfile_append_mdroot( + mdname_t *rootnp, /* root device name */ + char *sname, /* system file name */ + char *tname, /* temp file name */ + FILE *tfp, /* temp FILE */ + int ismeta, /* is a metadevice */ + int doit, /* really patch file */ + int verbose, /* show what we're doing */ + md_error_t *ep +) +{ + char *longblkname; + + /* check names */ + assert(sname != NULL); + assert(tname != NULL); + assert(!doit || tfp != NULL); + + /* get root /devices name */ + if ((longblkname = metagetdevicesname(rootnp, ep)) == NULL) + return (-1); + + /* add header */ + if (verbose) { + (void) printf(dgettext(TEXT_DOMAIN, + "Add the following lines to %s:\n\n"), sname); + (void) printf("%s", BEGROOTSTR); + } + if (doit) { + if (fprintf(tfp, "%s", BEGROOTSTR) == EOF) { + return (mdsyserror(ep, errno, tname)); + } + } + + /* add rootdev */ + if (ismeta) { + if (verbose) + (void) printf("rootdev:%s\n", longblkname); + if (doit) { + if (fprintf(tfp, "rootdev:%s\n", longblkname) == EOF) { + return (mdsyserror(ep, errno, tname)); + } + } + } + + /* add trailer */ + if (verbose) { + (void) printf("%s\n", ENDROOTSTR); + } + if (doit) { + if (fprintf(tfp, "%s", ENDROOTSTR) == EOF) { + return (mdsyserror(ep, errno, tname)); + } + } + + /* flush 
stuff */ + if (doit) { + if ((fflush(tfp) != 0) || + (fsync(fileno(tfp)) != 0)) { + return (mdsyserror(ep, errno, tname)); + } + } + + /* return success */ + return (0); +} + +/* + * parse mddb.cf line + * + * Caller of this routine needs to free the device id string that + * is passed back during a successful return. + */ +static int +confline( + char *line, /* line in file */ + char **driver, /* returned driver name */ + minor_t *mnump, /* returned minor number */ + daddr_t *block, /* returned block offset */ + char **devid_char_pp /* returned device id string */ +) +{ + char *p = line; + int chksum = 0; + int i; + uint_t devid_size; + + if (*p == '#') { + return (-1); + } + *driver = p; + while ((*p != ' ') && (*p != '\t')) + chksum += *p++; + if (*driver == p) { + return (-1); + } + *p++ = '\0'; + *mnump = strtoul(p, &p, 10); + chksum += *mnump; + *block = strtol(p, &p, 10); + chksum += *block; + + /* parse out devid */ + while ((*p == ' ') || (*p == '\t')) { + p++; + } + i = strcspn(p, " \t"); + *devid_char_pp = Malloc(i+1); + (void) strncpy(*devid_char_pp, p, i); + (*devid_char_pp)[i] = '\0'; + devid_size = i; + p += devid_size; + for (i = 0; i < devid_size; i++) { + chksum += (*devid_char_pp)[i]; + } + + chksum += strtol(p, &p, 10); + if (chksum != 42) { + Free (*devid_char_pp); + devid_char_pp = NULL; + return (-1); + } + return (0); +} + +/* + * append MDDB lines to system + */ +int +meta_systemfile_append_mddb( + char *cname, /* mddb.cf file name */ + char *sname, /* system file name */ + char *tname, /* temp file name */ + FILE *tfp, /* temp FILE */ + int doit, /* really patch file */ + int verbose, /* show what we're doing */ + md_error_t *ep /* returned error */ +) +{ + FILE *cfp = NULL; + char buf[1024]; + char *p; + int i; + char *driver; + minor_t mnum; + daddr_t block; + char line[MDDB_BOOTLIST_MAX_LEN]; + char entry[MDDB_BOOTLIST_MAX_LEN]; + char *devid_char_p = NULL; + struct stat statbuf; + + /* check names */ + assert(cname != NULL); + 
assert(sname != NULL); + assert(tname != NULL); + assert(!doit || tfp != NULL); + + /* open database conf file */ + if ((cfp = fopen(cname, "r")) == NULL) { + (void) mdsyserror(ep, errno, cname); + goto out; + } + /* Check that it is an ordinary file */ + if (stat(cname, &statbuf) != 0) { + (void) mdsyserror(ep, errno, cname); + goto out; + } + if ((statbuf.st_mode & S_IFMT) != S_IFREG) { + (void) mderror(ep, MDE_MDDB_FILE, cname); + goto out; + } + + /* add header */ + if (verbose) { + (void) printf(dgettext(TEXT_DOMAIN, + "Add the following lines to %s:\n\n"), sname); + (void) printf("%s", BEGMDDBSTR); + } + if (doit) { + if (fprintf(tfp, "%s", BEGMDDBSTR) == EOF) { + (void) mdsyserror(ep, errno, tname); + goto out; + } + } + + /* append database lines */ + while (((p = fgets(buf, sizeof (buf), cfp)) != NULL) && + (confline(buf, &driver, &mnum, &block, &devid_char_p) != 0)) + ; + for (i = 1; ((p != NULL) && (i <= MDDB_MAX_PATCH)); ++i) { + (void) snprintf(line, sizeof (line), + "mddb_bootlist%d=\"%s:%lu:%ld:%s", + i, driver, mnum, block, devid_char_p); + if (devid_char_p != NULL) { + free(devid_char_p); + devid_char_p = NULL; + } + + while ((p = fgets(buf, sizeof (buf), cfp)) != NULL) { + if (confline(buf, &driver, &mnum, &block, + &devid_char_p) != 0) { + continue; + } + (void) snprintf(entry, sizeof (entry), " %s:%lu:%ld:%s", + driver, mnum, block, devid_char_p); + + if ((strlen(line) + strlen(entry) + 4) > sizeof (line)) + break; + (void) strcat(line, entry); + if (devid_char_p != NULL) { + free(devid_char_p); + devid_char_p = NULL; + } + } + if (verbose) + /* CSTYLED */ + (void) printf("%s\";\n", line); + if (doit) { + /* CSTYLED */ + if (fprintf(tfp, "%s\";\n", line) <= 0) { + (void) mdsyserror(ep, errno, tname); + goto out; + } + } + } + + if (devid_char_p != NULL) { + free(devid_char_p); + devid_char_p = NULL; + } + + /* add trailer */ + if (verbose) + (void) printf("%s\n", ENDMDDBSTR); + if (doit) { + if (fprintf(tfp, "%s", ENDMDDBSTR) == EOF) { + (void) 
mdsyserror(ep, errno, tname); + goto out; + } + } + + /* close database conf file */ + if (fclose(cfp) != 0) { + cfp = NULL; + (void) mdsyserror(ep, errno, cname); + goto out; + } + cfp = NULL; + + /* flush stuff */ + if (doit) { + if ((fflush(tfp) != 0) || + (fsync(fileno(tfp)) != 0)) { + (void) mdsyserror(ep, errno, tname); + goto out; + } + } + + /* return success */ + return (0); + + /* cleanup, return error */ +out: + if (cfp != NULL) + (void) fclose(cfp); + return (-1); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_tab.c b/usr/src/lib/lvm/libmeta/common/meta_tab.c new file mode 100644 index 0000000000..7e1ed32a6b --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_tab.c @@ -0,0 +1,342 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. 
+ */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +#include <meta.h> + +#include <ctype.h> + +/* + * free md.tab struct + */ +void +meta_tab_free( + md_tab_t *tabp +) +{ + size_t line; + + Free(tabp->filename); + Free(tabp->data); + if (tabp->lines != NULL) { + assert(tabp->alloc > 0); + for (line = 0; (line < tabp->nlines); ++line) { + md_tab_line_t *linep = &tabp->lines[line]; + + if (linep->context != NULL) + Free(linep->context); + if (linep->cname != NULL) + Free(linep->cname); + if (linep->argv != NULL) { + assert(linep->alloc > 0); + Free(linep->argv); + } + } + Free(tabp->lines); + } + Free(tabp); +} + +/* + * (re)allocate argv array + */ +static void +realloc_argv( + md_tab_line_t *linep, + size_t argc +) +{ + /* allocate in chunks */ + argc = roundup(argc, TAB_ARG_ALLOC); + if (argc < linep->alloc) + return; + + /* (re)allocate */ + if (linep->alloc == 0) { + linep->argv = Malloc(argc * sizeof (*linep->argv)); + } else { + assert(linep->argv != NULL); + linep->argv = + Realloc(linep->argv, (argc * sizeof (*linep->argv))); + } + + /* zero out new stuff */ + (void) memset(&linep->argv[linep->alloc], 0, + ((argc - linep->alloc) * sizeof (*linep->argv))); + + /* adjust for new size */ + linep->alloc = argc; +} + +/* + * (re)allocate line array + */ +static void +realloc_lines( + md_tab_t *tabp, + size_t nlines +) +{ + /* allocate in chunks */ + nlines = roundup(nlines, TAB_LINE_ALLOC); + if (nlines < tabp->alloc) + return; + + /* (re)allocate */ + if (tabp->alloc == 0) { + assert(tabp->lines == NULL); + tabp->lines = Malloc(nlines * sizeof (*tabp->lines)); + } else { + assert(tabp->lines != NULL); + tabp->lines = + Realloc(tabp->lines, (nlines * sizeof (*tabp->lines))); + } + + /* zero out new stuff */ + (void) memset(&tabp->lines[tabp->alloc], 0, + ((nlines - tabp->alloc) * sizeof (*tabp->lines))); + + /* adjust for new size */ + tabp->alloc = nlines; +} + +/* + * parse up md.tab struct + */ +static void +parse_tab( + md_tab_t *tabp 
+) +{ + uint_t lineno = 1; + char *p = tabp->data; + char *e = tabp->data + tabp->total - 1; + char *context; + size_t len; + + /* we can count on '\n\0' as the last characters */ + assert(tabp->total >= 2); + assert(tabp->data[tabp->total - 2] == '\n'); + assert(tabp->data[tabp->total - 1] == '\0'); + + /* allocate context buffer "file line XXX" */ + assert(tabp->filename != NULL); + len = strlen(tabp->filename) + + strlen(dgettext(TEXT_DOMAIN, "%s line %u")) + 20 + 1; + context = Malloc(len); + + /* parse lines */ + while (p < e) { + md_tab_line_t *linep; + char *t; + + /* allocate new line */ + realloc_lines(tabp, (tabp->nlines + 1)); + linep = &tabp->lines[tabp->nlines]; + (void) snprintf(context, len, + dgettext(TEXT_DOMAIN, "%s line %u"), tabp->filename, + lineno); + + /* comments */ + if (*p == '#') { + while (*p != '\n') + ++p; + } + + /* coalesce \ continuations */ + t = p; + while (*t != '\n') { + if ((*t == '\\') && (*(t + 1) == '\n')) { + *t++ = ' '; + *t = ' '; + ++lineno; + } + ++t; + } + + /* leading whitespace */ + while ((*p != '\n') && (isspace(*p))) + ++p; + + /* count lines */ + if (*p == '\n') { + ++p; + ++lineno; + continue; + } + + /* tokenize line */ + while ((p < e) && (*p != '\n')) { + char **argvp; + + /* allocate new token */ + realloc_argv(linep, (linep->argc + 1)); + argvp = &linep->argv[linep->argc++]; + + /* find end of token */ + *argvp = p; + while ((*p != '\n') && (! isspace(*p))) + ++p; + + /* terminate */ + if (*p == '\n') { + *p++ = '\0'; + ++lineno; + break; + } + + /* eat white space */ + *p++ = '\0'; + while ((p < e) && (*p != '\n') && (isspace(*p))) + ++p; + } + tabp->nlines++; + + /* fill in the rest */ + assert((linep->argc > 0) && (linep->argv != NULL) && + (linep->argv[0][0] != '\0') && + (! 
isspace(linep->argv[0][0]))); + linep->context = Strdup(context); + linep->type = meta_get_init_type(linep->argc, linep->argv); + linep->cname = Strdup(meta_canonicalize(NULL, linep->argv[0])); + assert(linep->cname != NULL); + } + + /* cleanup */ + Free(context); +} + +/* + * read in md.tab file and return struct + */ +md_tab_t * +meta_tab_parse( + char *filename, + md_error_t *ep +) +{ + md_tab_t *tabp = NULL; + int fd = -1; + struct stat statbuf; + size_t sofar; + char *p; + + /* open tab file */ + if (filename == NULL) + filename = METATAB; + if ((fd = open(filename, O_RDONLY, 0)) < 0) { + (void) mdsyserror(ep, errno, filename); + goto out; + } + if (fstat(fd, &statbuf) != 0) { + (void) mdsyserror(ep, errno, filename); + goto out; + } + + /* allocate table */ + tabp = Zalloc(sizeof (*tabp)); + tabp->filename = Strdup(filename); + tabp->total = statbuf.st_size + 2; /* terminating "\n\0" */ + tabp->data = Malloc(tabp->total); + + /* read in data */ + sofar = 0; + p = tabp->data; + while (sofar < statbuf.st_size) { + int cnt; + + if ((cnt = read(fd, p, 8192)) < 0) { + (void) mdsyserror(ep, errno, filename); + goto out; + } else if (cnt == 0) { + (void) mderror(ep, MDE_SYNTAX, filename); + goto out; + } + sofar += cnt; + p += cnt; + } + tabp->data[tabp->total - 2] = '\n'; + tabp->data[tabp->total - 1] = '\0'; + + /* close file */ + if (close(fd) != 0) { + (void) mdsyserror(ep, errno, filename); + fd = -1; + goto out; + } + fd = -1; + + /* parse it up */ + parse_tab(tabp); + + /* return success */ + return (tabp); + + /* cleanup, return error */ +out: + if (fd >= 0) + (void) close(fd); + if (tabp != NULL) + meta_tab_free(tabp); + return (NULL); +} + +/* + * find line in md.tab + */ +md_tab_line_t * +meta_tab_find( + mdsetname_t *sp, + md_tab_t *tabp, + char *name, + mdinittypes_t type +) +{ + char *cname = meta_canonicalize(sp, name); + size_t line; + + for (line = 0; (line < tabp->nlines); ++line) { + md_tab_line_t *linep = &tabp->lines[line]; + + 
assert((linep->argc > 0) && (linep->argv[0] != NULL)); + if (((linep->type & type) != 0) && + (strcmp(linep->cname, cname) == 0)) { + Free(cname); + return (linep); + } + } + Free(cname); + return (NULL); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_time.c b/usr/src/lib/lvm/libmeta/common/meta_time.c new file mode 100644 index 0000000000..ace6483a08 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_time.c @@ -0,0 +1,53 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2002 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * 32-bit only version of gettimeofday + */ + +#include <sys/time.h> +#include <sys/types32.h> +#include <meta.h> + +int +meta_gettimeofday(md_timeval32_t *tv32) +{ + struct timeval tv; + int retval; + + if (tv32 == NULL) + return (0); + + if ((retval = gettimeofday(&tv, NULL)) == 0) { + tv32->tv_sec = (time32_t)tv.tv_sec; + tv32->tv_usec = (int32_t)tv.tv_usec; + return (0); + } + + return (retval); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_trans.c b/usr/src/lib/lvm/libmeta/common/meta_trans.c new file mode 100644 index 0000000000..e350e2d2d5 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_trans.c @@ -0,0 +1,1761 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. 
+ */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +/* + * trans operations + */ + +#include <meta.h> +#include <meta_basic.h> +#include <sys/lvm/md_trans.h> +#include <sys/wait.h> +#include <sys/mnttab.h> +#include <stddef.h> + +extern char *getfullblkname(); + +/* + * replace trans + */ + +int +meta_trans_replace(mdsetname_t *sp, mdname_t *transnp, mdname_t *oldnp, + mdname_t *newnp, mdcmdopts_t options, md_error_t *ep) +{ + replace_params_t params; + md_dev64_t old_dev, + new_dev; + daddr_t new_start_blk, + new_end_blk; + + /* should have same set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(transnp->dev))); + + new_dev = newnp->dev; + new_start_blk = newnp->start_blk; + new_end_blk = newnp->end_blk; + + meta_invalidate_name(transnp); + /* the old device binding is now established */ + if ((old_dev = oldnp->dev) == NODEV64) + return (mdsyserror(ep, ENODEV, oldnp->cname)); + + if (((strcmp(oldnp->rname, newnp->rname) == 0) && + (old_dev != new_dev))) { + newnp->dev = new_dev; + newnp->start_blk = new_start_blk; + newnp->end_blk = new_end_blk; + } + + if (add_key_name(sp, newnp, NULL, ep) != 0) + return (-1); + + (void) memset(¶ms, 0, sizeof (params)); + params.mnum = meta_getminor(transnp->dev); + MD_SETDRIVERNAME(¶ms, MD_TRANS, sp->setno); + + params.cmd = REPLACE_COMP; + params.old_dev = old_dev; + params.new_dev = new_dev; + params.new_key = newnp->key; + if (metaioctl(MD_IOCREPLACE, ¶ms, ¶ms.mde, NULL) != 0) { + (void) del_key_name(sp, newnp, ep); + return (mdstealerror(ep, ¶ms.mde)); + } + meta_invalidate_name(oldnp); + meta_invalidate_name(newnp); + meta_invalidate_name(transnp); + + if (options & MDCMD_PRINT) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: device %s is replaced with %s\n"), + transnp->cname, oldnp->cname, newnp->cname); + } + return (0); +} + + + +/* + * FUNCTION: meta_get_trans_names() + * INPUT: sp - the set name to get trans from + * options - options from the command line + * OUTPUT: 
nlpp - list of all trans names + * ep - return error pointer + * RETURNS: int - -1 if error, 0 success + * PURPOSE: returns a list of all trans in the metadb + * for all devices in the specified set + */ +int +meta_get_trans_names( + mdsetname_t *sp, + mdnamelist_t **nlpp, + int options, + md_error_t *ep +) +{ + return (meta_get_names(MD_TRANS, sp, nlpp, options, ep)); +} + +/* + * free trans unit + */ +void +meta_free_trans( + md_trans_t *transp +) +{ + Free(transp); +} + +/* + * get trans (common) + */ +md_trans_t * +meta_get_trans_common( + mdsetname_t *sp, + mdname_t *transnp, + int fast, + md_error_t *ep +) +{ + mddrivename_t *dnp = transnp->drivenamep; + char *miscname; + mt_unit_t *mt; + md_trans_t *transp; + int gotlog; + + /* must have set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(transnp->dev))); + + /* short circuit */ + if (dnp->unitp != NULL) { + assert(dnp->unitp->type == MD_METATRANS); + return ((md_trans_t *)dnp->unitp); + } + + /* get miscname and unit */ + if ((miscname = metagetmiscname(transnp, ep)) == NULL) + return (NULL); + if (strcmp(miscname, MD_TRANS) != 0) { + (void) mdmderror(ep, MDE_NOT_MT, + meta_getminor(transnp->dev), transnp->cname); + return (NULL); + } + if ((mt = (mt_unit_t *)meta_get_mdunit(sp, transnp, ep)) == NULL) + return (NULL); + assert(mt->c.un_type == MD_METATRANS); + + /* allocate trans */ + transp = Zalloc(sizeof (*transp)); + + /* get common info */ + transp->common.namep = transnp; + transp->common.type = mt->c.un_type; + transp->common.state = mt->c.un_status; + transp->common.capabilities = mt->c.un_capabilities; + transp->common.parent = mt->c.un_parent; + transp->common.size = mt->c.un_total_blocks; + transp->common.user_flags = mt->c.un_user_flags; + transp->common.revision = mt->c.un_revision; + + /* get master */ + transp->masternamep = metakeyname(&sp, mt->un_m_key, fast, ep); + if (transp->masternamep == NULL) + goto out; + + /* get log */ + gotlog = ((mt->un_flags & 
TRANS_DETACHED) == 0); + if (gotlog) { + daddr_t sblk; + + transp->lognamep = metakeyname(&sp, mt->un_l_key, fast, ep); + if (transp->lognamep == NULL) + goto out; + + /* calculate the kernels start block */ + sblk = mt->un_l_pwsblk + mt->un_l_maxtransfer; + + if (getenv("META_DEBUG_START_BLK") != NULL) { + if (metagetstart(sp, transp->lognamep, ep) == + MD_DISKADDR_ERROR) + mdclrerror(ep); + + if (transp->lognamep->start_blk > sblk) + md_eprintf(dgettext(TEXT_DOMAIN, + "%s: suspected bad start block [trans]\n"), + transp->lognamep->cname); + } + + /* override any start_blk */ + transp->lognamep->start_blk = sblk; + } + + /* get flags, etc. */ + transp->flags = mt->un_flags; + transp->timestamp = mt->un_timestamp; + transp->log_error = mt->un_l_error; + transp->log_timestamp = mt->un_l_timestamp; + transp->log_size = mt->un_l_nblks; + transp->debug = mt->un_debug; + + /* cleanup, return success */ + Free(mt); + dnp->unitp = (md_common_t *)transp; + return (transp); + + /* cleanup, return error */ +out: + Free(mt); + meta_free_trans(transp); + return (NULL); +} + +/* + * get trans + */ +md_trans_t * +meta_get_trans( + mdsetname_t *sp, + mdname_t *transnp, + md_error_t *ep +) +{ + return (meta_get_trans_common(sp, transnp, 0, ep)); +} + +/* + * check trans for dev + */ +static int +in_trans( + mdsetname_t *sp, + mdname_t *transnp, + mdname_t *np, + mdchkopts_t options, + diskaddr_t slblk, + diskaddr_t nblks, + md_error_t *ep +) +{ + md_trans_t *transp; + mdname_t *masternp; + mdname_t *lognp; + + /* should be in the same set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(transnp->dev))); + + /* get unit */ + if ((transp = meta_get_trans(sp, transnp, ep)) == NULL) + return (-1); + + /* check master */ + masternp = transp->masternamep; + if ((! metaismeta(masternp)) && + (meta_check_overlap(transnp->cname, np, slblk, nblks, + masternp, 0, -1, ep) != 0)) { + return (-1); + } + + /* check log */ + if (((lognp = transp->lognamep) != NULL) && + (! 
(options & MDCHK_ALLOW_LOG)) && + (! metaismeta(lognp))) { + daddr_t log_start; + int err; + + /* check same drive since metagetstart() can fail */ + if ((err = meta_check_samedrive(np, lognp, ep)) < 0) + return (-1); + + /* check overlap */ + if (err != 0) { + if ((log_start = metagetstart(sp, lognp, ep)) == + MD_DISKADDR_ERROR) + return (-1); + if (meta_check_overlap(transnp->cname, np, slblk, + nblks, lognp, log_start, -1, ep) != 0) { + return (-1); + } + } + } + + /* return success */ + return (0); +} + +/* + * check to see if we're in a trans + */ +int +meta_check_intrans( + mdsetname_t *sp, + mdname_t *np, + mdchkopts_t options, + diskaddr_t slblk, + diskaddr_t nblks, + md_error_t *ep +) +{ + mdnamelist_t *transnlp = NULL; + mdnamelist_t *p; + int rval = 0; + + /* should have a set */ + assert(sp != NULL); + + /* for each trans */ + if (meta_get_trans_names(sp, &transnlp, 0, ep) < 0) + return (-1); + for (p = transnlp; (p != NULL); p = p->next) { + mdname_t *transnp = p->namep; + + /* check trans */ + if (in_trans(sp, transnp, np, options, slblk, nblks, ep) != 0) { + rval = -1; + break; + } + } + + /* cleanup, return success */ + metafreenamelist(transnlp); + return (rval); +} + +/* + * check master + */ +int +meta_check_master( + mdsetname_t *sp, + mdname_t *np, + int force, + md_error_t *ep +) +{ + mdchkopts_t options = 0; + md_common_t *mdp; + + /* make sure we have a disk */ + if (metachkdisk(np, ep) != 0) + return (-1); + + /* check to ensure that it is not already in use */ + if ((!force) && meta_check_inuse(sp, np, MDCHK_INUSE, ep) != 0) { + return (-1); + } + + /* make sure it is in the set */ + if (meta_check_inset(sp, np, ep) != 0) + return (-1); + + /* make sure its not in a metadevice */ + if (! metaismeta(np)) { /* Non-metadevices */ + if (meta_check_inmeta(sp, np, options, 0, -1, ep) != 0) + return (-1); + } else { /* Metadevices only! 
*/ + if ((mdp = meta_get_unit(sp, np, ep)) == NULL) + return (-1); + + /* + * Since soft partitions may appear at the top or bottom + * of the metadevice stack, we check them separately. + * A trans may be built on top of a soft partition if + * the soft partition has no parent (can't rely on the + * MD_CAN_PARENT flag in this case since a soft partition + * built on a metadevice clears this flag to prevent nested + * configurations). + */ + if ((meta_sp_issp(sp, np, ep) == 0) && + (mdp->parent == MD_NO_PARENT)) + return (0); + + if ((! (mdp->capabilities & MD_CAN_PARENT)) || + (mdp->parent != MD_NO_PARENT)) { + return (mdmderror(ep, MDE_INVAL_UNIT, + meta_getminor(np->dev), np->cname)); + } + } + + /* return success */ + return (0); +} + +/* + * check log + */ +int +meta_check_log( + mdsetname_t *sp, + mdname_t *np, + md_error_t *ep +) +{ + mdchkopts_t options = (MDCHK_ALLOW_MDDB | MDCHK_ALLOW_LOG); + md_common_t *mdp; + + /* make sure we have a disk */ + if (metachkdisk(np, ep) != 0) + return (-1); + + /* check to ensure that it is not already in use */ + if (meta_check_inuse(sp, np, MDCHK_INUSE, ep) != 0) { + return (-1); + } + + /* make sure it is in the set */ + if (meta_check_inset(sp, np, ep) != 0) + return (-1); + + /* make sure its not in a metadevice */ + if (! metaismeta(np)) { /* Non-metadevices */ + if (meta_check_inmeta(sp, np, options, 0, -1, ep) != 0) + return (-1); + } else { /* Metadevices only! */ + if ((mdp = meta_get_unit(sp, np, ep)) == NULL) + return (-1); + + /* + * Since soft partitions may appear at the top or bottom + * of the metadevice stack, we check them separately. + * A trans may be built on top of a soft partition if + * the soft partition has no parent (can't rely on the + * MD_CAN_PARENT flag in this case since a soft partition + * built on a metadevice clears this flag to prevent nested + * configurations). + * + */ + if ((meta_sp_issp(sp, np, ep) == 0) && + (mdp->parent == MD_NO_PARENT)) + return (0); + + if ((! 
(mdp->capabilities & MD_CAN_PARENT)) || + ((mdp->parent != MD_NO_PARENT) && + (mdp->parent != MD_MULTI_PARENT))) { + return (mdmderror(ep, MDE_INVAL_UNIT, + meta_getminor(np->dev), np->cname)); + } + } + + /* return success */ + return (0); +} + +/* + * print trans + */ +static int +trans_print( + md_trans_t *transp, + char *fname, + FILE *fp, + md_error_t *ep +) +{ + int rval = -1; + + /* print name and -t */ + if (fprintf(fp, "%s -t", transp->common.namep->cname) == EOF) + goto out; + + /* print master */ + /* + * If the path is our standard /dev/rdsk or /dev/md/rdsk + * then just print out the cxtxdxsx or the dx, metainit + * will assume the default, otherwise we need the full + * pathname to make sure this works as we intend. + */ + if ((strstr(transp->masternamep->rname, "/dev/rdsk") == NULL) && + (strstr(transp->masternamep->rname, "/dev/md/rdsk") == NULL) && + (strstr(transp->masternamep->rname, "/dev/td/") == NULL)) { + /* not standard path, print full pathname */ + if (fprintf(fp, " %s", transp->masternamep->rname) == EOF) + goto out; + } else { + /* standard path, print ctds or d number */ + if (fprintf(fp, " %s", transp->masternamep->cname) == EOF) + goto out; + } + + + /* print log */ + if (transp->lognamep != NULL) { + /* + * If the path is our standard /dev/rdsk or /dev/md/rdsk + * then just print out the cxtxdxsx or the dx, metainit + * will assume the default, otherwise we need the full + * pathname to make sure this works as we intend. 
+ */ + if ((strstr(transp->lognamep->rname, "/dev/rdsk") == NULL) && + (strstr(transp->lognamep->rname, "/dev/md/rdsk") == NULL) && + (strstr(transp->lognamep->rname, "/dev/td/") == NULL)) { + /* not standard path, print full pathname */ + if (fprintf(fp, " %s", transp->lognamep->rname) == EOF) + goto out; + } else { + /* standard path */ + if (fprintf(fp, " %s", transp->lognamep->cname) == EOF) + goto out; + } + } + + /* print terminating newline */ + if (fprintf(fp, "\n") == EOF) + goto out; + + /* success */ + rval = 0; + + /* cleanup, return error */ +out: + if (rval != 0) + (void) mdsyserror(ep, errno, fname); + return (rval); +} + +/* + * convert flags to repair action + */ + +char * +mt_flags_to_action( + md_trans_t *transp +) +{ + int len; + char *actionp = NULL; + int err = -1; + + if (!transp) { + goto out; + } + + /* + * if in any of these states, the log_error word is not (yet) meaningful + */ + if (transp->flags & (TRANS_DETACHED|TRANS_DETACHING|TRANS_ATTACHING)) { + goto out; + } + + if (transp->log_error & LDL_ANYERROR) { + char *fix_msg = dgettext(TEXT_DOMAIN, + " To Fix: Please refer to the log device's status.\n"); + + if ((len = strlen(fix_msg)) <= 0) { + goto out; + } + if (!(actionp = Zalloc(len+1))) { + goto out; + } + if (strncpy(actionp, fix_msg, len + 1) != actionp) { + goto out; + } + } + err = 0; +out: + if (err != 0) { + if (actionp) { + Free(actionp); + actionp = NULL; + } + } + return (actionp); +} + +/* + * convert log state to repair action + */ +char * +mt_l_error_to_action( + mdsetname_t *sp, + mdnamelist_t *transnlp, + mdname_t *lognamep, + md_error_t *ep +) +{ + char umnt_msg[1024]; + char fsck_msg[1024]; + char mnt_msg[1024]; + mdnamelist_t *p; + md_trans_t *tp; + int rc; + int len = 0; + char *rmsg = NULL; + char *mp = NULL; + bool_t is_mounted = FALSE; + bool_t any_in_error = FALSE; + int only_fsck = TRUE; + + (void) memset(umnt_msg, 0, sizeof (umnt_msg)); + (void) memset(fsck_msg, 0, sizeof (fsck_msg)); + (void) 
memset(mnt_msg, 0, sizeof (mnt_msg)); + + /* + * If a the trans devices listed in transnlp contain + * devices which are in error and are sub-mount points + * of each other, than it would need to be reverse sorted. + * When this actually occurs, and customers find the usage + * message insufficiently clear, then we should take the + * hit to sort it. + */ + + /* + * this preliminary loop is necessary to keep the + * fsck message greppable, if possible + */ + for (p = transnlp; ((p != NULL) && (only_fsck == TRUE)); p = p->next) { + + if ((tp = meta_get_trans(sp, p->namep, ep)) == NULL) { + goto out; + } + + if (!(tp->log_error & LDL_ANYERROR)) { + continue; + } + + if ((tp->lognamep == NULL) || + (strcmp(lognamep->bname, tp->lognamep->bname) != 0)) { + continue; + } + + mdclrerror(ep); + is_mounted = (meta_check_inuse(sp, + p->namep, MDCHK_MOUNTED, ep) != 0); + + if (!mdisok(ep) && mdiserror(ep, MDE_IS_MOUNTED)) { + goto out; + } + + mdclrerror(ep); + mp = meta_get_mountp(sp, p->namep, ep); + + if (!mdisok(ep)) { + goto out; + } + + if (is_mounted) { + if (!mp) { + goto out; + } + only_fsck = FALSE; + + /* + * not greppable; there must be multiple commands, so + * add preliminary newline so the formatting is uniform + */ + if (sprintf(umnt_msg, "\n") == EOF) { + goto out; + } + + } + + if (mp) { + Free(mp); + mp = NULL; + } + } + + /* + * although the log may either be in error or hard-error + * states, the action is the same; unmount, fsck and remount + * all fs associated with this log + */ + for (p = transnlp; (p != NULL); p = p->next) { + + if ((tp = meta_get_trans(sp, p->namep, ep)) == NULL) { + goto out; + } + + if (!(tp->log_error & LDL_ANYERROR)) { + continue; + } + + if ((tp->lognamep == NULL) || + (strcmp(lognamep->bname, tp->lognamep->bname) != 0)) { + continue; + } + + mdclrerror(ep); + is_mounted = (meta_check_inuse(sp, + p->namep, MDCHK_MOUNTED, ep) != 0); + + if (!mdisok(ep) && mdiserror(ep, MDE_IS_MOUNTED)) { + goto out; + } + + mdclrerror(ep); + mp 
= meta_get_mountp(sp, p->namep, ep); + + if (!mdisok(ep)) { + goto out; + } + + if (is_mounted) { + if (!mp) { + goto out; + } + } + + if (is_mounted) { + rc = snprintf(umnt_msg, sizeof (umnt_msg), + "%s umount %s\n", umnt_msg, mp); + + if (rc < 0) { + goto out; + } + } + + rc = snprintf(fsck_msg, sizeof (fsck_msg), "%s %s", + (any_in_error) ? fsck_msg : + ((only_fsck) ? "fsck" : " fsck"), + p->namep->rname); + if (rc < 0) { + goto out; + } + + if (is_mounted) { + rc = snprintf(mnt_msg, sizeof (mnt_msg), + "%s mount %s %s\n", + mnt_msg, p->namep->bname, mp); + + if (rc < 0) { + goto out; + } + } + + if (mp) { + Free(mp); + mp = NULL; + } + + any_in_error |= TRUE; + } + + if (!any_in_error) { + goto out; + } + + len = strlen(umnt_msg) + strlen(fsck_msg) + strlen(mnt_msg) + + (only_fsck? 1: 0) + 1; + if (!(rmsg = Zalloc(len))) { + len = 0; + goto out; + } + rc = snprintf(rmsg, len, "%s%s%s%s", umnt_msg, fsck_msg, + !only_fsck? "\n": "", mnt_msg); + if (rc == EOF) { + goto out; + } + +out: + if (mp) { + Free(mp); + mp = NULL; + } + if (len == 0 && rmsg) { + Free(rmsg); + rmsg = NULL; + } + + return (rmsg); +} + +/* + * printable log state + */ +char * +mt_l_error_to_name( + md_trans_t *transp, + md_timeval32_t *tvp, + uint_t tstate /* Errored tstate flags */ +) +{ + mt_l_error_t log_error = transp->log_error; + + /* grab time */ + if (tvp != NULL) + *tvp = transp->log_timestamp; + + if (tstate != 0) { + return (dgettext(TEXT_DOMAIN, "Unavailable")); + } + + /* return state */ + if (log_error & LDL_ERROR) { + return (dgettext(TEXT_DOMAIN, "Error")); + } else if (log_error & LDL_HERROR) { + return (dgettext(TEXT_DOMAIN, "Hard Error")); + } else { + return (dgettext(TEXT_DOMAIN, "Okay")); + } +} + +/* + * printable trans state + */ +char * +mt_flags_to_name( + md_trans_t *transp, + md_timeval32_t *tvp, + uint_t tstate /* Errored tstate flags */ +) +{ + /* grab time */ + if (tvp != NULL) + *tvp = transp->timestamp; + + if (tstate != 0) { + return (dgettext(TEXT_DOMAIN, 
"Unavailable")); + } + + /* return state */ + if (transp->flags & TRANS_DETACHED) + return (dgettext(TEXT_DOMAIN, "Detached")); + else if (transp->flags & TRANS_DETACHING) + return (dgettext(TEXT_DOMAIN, "Detaching")); + else if (transp->flags & TRANS_ATTACHING) + return (dgettext(TEXT_DOMAIN, "Attaching")); + return (mt_l_error_to_name(transp, tvp, tstate)); +} + +/* + * report trans + */ +static int +trans_report( + mdsetname_t *sp, + md_trans_t *transp, + char *fname, + FILE *fp, + mdprtopts_t options, + md_error_t *ep +) +{ + char *mt_state; + md_timeval32_t tv; + char *timep; + int rval = -1; + char *actionp = NULL; + char *devid = ""; + mdname_t *didnp = NULL; + ddi_devid_t dtp; + uint_t tstate = 0; + + /* print header */ + if (options & PRINT_HEADER) { + if (fprintf(fp, dgettext(TEXT_DOMAIN, "%s: Trans" + " (Feature replaced see message below)\n"), + transp->common.namep->cname) == EOF) { + goto out; + } + } + + /* print state */ + if (metaismeta(transp->common.namep)) { + if (meta_get_tstate(transp->common.namep->dev, &tstate, ep) + != 0) + goto out; + } + mt_state = mt_flags_to_name(transp, &tv, tstate & MD_DEV_ERRORED); + if (options & PRINT_TIMES) { + timep = meta_print_time(&tv); + } else { + timep = ""; + } + if (fprintf(fp, dgettext(TEXT_DOMAIN, " State: %-12s %s\n"), + mt_state, timep) == EOF) { + goto out; + } + + if ((tstate & MD_DEV_ERRORED) == 0) { + actionp = mt_flags_to_action(transp); + if (actionp) { + if (fprintf(fp, "%s", actionp) == EOF) { + goto out; + } + Free(actionp); + actionp = NULL; + } + } + + /* debug stuff */ + if (transp->debug) { + if (fprintf(fp, + " Debug Modes:%s%s%s%s%s%s%s%s%s%s%s\n", + (transp->debug & MT_TRANSACT) ? " TRANSACT" : "", + (transp->debug & MT_MATAMAP) ? " METADATA" : "", + (transp->debug & MT_WRITE_CHECK) ? " WRITES" : "", + (transp->debug & MT_LOG_WRITE_CHECK) ? " LOGWRITES" : "", + (transp->debug & MT_CHECK_MAP) ? " MAP" : "", + (transp->debug & MT_TRACE) ? " TRACE" : "", + (transp->debug & MT_SIZE) ? 
" SIZE" : "", + (transp->debug & MT_NOASYNC) ? " NOASYNC" : "", + (transp->debug & MT_FORCEROLL) ? " FORCEROLL" : "", + (transp->debug & MT_SCAN) ? " SCAN" : "", + (transp->debug & MT_PREWRITE) ? " PREWRITE" : "") + == EOF) { + goto out; + } + } + + /* print size */ + if (fprintf(fp, dgettext(TEXT_DOMAIN, " Size: %lld blocks (%s)\n"), + transp->common.size, + meta_number_to_string(transp->common.size, DEV_BSIZE)) == EOF) { + goto out; + } + + + /* print master */ + if (fprintf(fp, dgettext(TEXT_DOMAIN, " Master Device: %s\n"), + transp->masternamep->cname) == EOF) { + goto out; + } + + /* print log */ + if (transp->lognamep != NULL) { + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " Logging Device: %s\n"), + transp->lognamep->cname) == EOF) { + goto out; + } + } + + /* add extra line */ + if (fprintf(fp, "\n") == EOF) + goto out; + + /* print master details if regular device */ + if (! metaismeta(transp->masternamep)) { + daddr_t start_blk = 0; + char *has_mddb_str = dgettext(TEXT_DOMAIN, "No"); + int len; + + /* + * Building a format string on the fly that will + * be used in (f)printf. This allows the length + * of the ctd to vary from small to large without + * looking horrible. 
+ */ + len = strlen(transp->masternamep->cname) + 2; + len = max(len, strlen(dgettext(TEXT_DOMAIN, "Master Device"))); + + /* print header */ + if (fprintf(fp, + "\t%-*.*s %-12.12s %-5.5s %s\n", + len, len, + dgettext(TEXT_DOMAIN, "Master Device"), + dgettext(TEXT_DOMAIN, "Start Block"), + dgettext(TEXT_DOMAIN, "Dbase"), + dgettext(TEXT_DOMAIN, "Reloc")) == EOF) { + goto out; + } + + /* populate the key in the name_p structure */ + if ((didnp = metadevname(&sp, + transp->masternamep->dev, ep)) == NULL) { + return (-1); + } + + /* determine if devid does NOT exist */ + if (options & PRINT_DEVID) + if ((dtp = meta_getdidbykey(sp->setno, getmyside(sp, ep), + didnp->key, ep)) == NULL) + devid = dgettext(TEXT_DOMAIN, "No "); + else { + devid = dgettext(TEXT_DOMAIN, "Yes"); + free(dtp); + } + + /* print info */ + /* + * This allows the length + * of the ctd to vary from small to large without + * looking horrible. + */ + if (fprintf(fp, "\t%-*s %8ld %-5.5s %s\n", len, + transp->masternamep->cname, + start_blk, has_mddb_str, devid) == EOF) { + goto out; + } + /* add extra line */ + if (fprintf(fp, "\n") == EOF) + goto out; + } + + /* success */ + rval = 0; + + /* cleanup, return error */ +out: + if (rval != 0) + (void) mdsyserror(ep, errno, fname); + return (rval); +} + +/* + * print/report trans + */ +int +meta_trans_print( + mdsetname_t *sp, + mdname_t *transnp, + mdnamelist_t **nlistpp, + char *fname, + FILE *fp, + mdprtopts_t options, + int *meta_print_trans_msgp, /* NULL if transnp != NULL */ + mdnamelist_t **lognlpp, + md_error_t *ep +) +{ + md_trans_t *transp; + mdname_t *lognamep; + + /* should have same set */ + assert(sp != NULL); + + /* print all transs */ + if (transnp == NULL) { + mdnamelist_t *nlp = NULL; + mdnamelist_t *p; + int cnt; + int rval = 0; + + /* get list */ + if ((cnt = meta_get_trans_names(sp, &nlp, options, ep)) < 0) + return (-1); + else if (cnt == 0) + return (0); + + /* recurse */ + for (p = nlp; (p != NULL); p = p->next) { + mdname_t *np = 
p->namep; + + if (meta_trans_print(sp, np, nlistpp, fname, fp, + options, meta_print_trans_msgp, lognlpp, ep) != 0) + rval = -1; + } + + if (meta_print_trans_msgp) + *meta_print_trans_msgp = 1; + + /* cleanup, return success */ + metafreenamelist(nlp); + return (rval); + } + + + /* get unit structure */ + if ((transp = meta_get_trans_common(sp, transnp, + ((options & PRINT_FAST) ? 1 : 0), ep)) == NULL) + return (-1); + + /* save unique log */ + if ((lognlpp != NULL) && + ((lognamep = transp->lognamep) != NULL)) { + mdnamelist_t *p; + + for (p = *lognlpp; (p != NULL); p = p->next) { + if (strcmp(lognamep->bname, p->namep->bname) == 0) + break; + } + if (p == NULL) + (void) metanamelist_append(lognlpp, lognamep); + } + + /* check for parented */ + if ((! (options & PRINT_SUBDEVS)) && + (MD_HAS_PARENT(transp->common.parent))) { + return (0); + } + + /* can't have a large trans */ + if (!(options & PRINT_LARGEDEVICES)) { + /* print appropriate detail */ + if (options & PRINT_SHORT) { + if (trans_print(transp, fname, fp, ep) != 0) + return (-1); + } else { + if (trans_report(sp, transp, fname, fp, options, ep) + != 0) + return (-1); + } + } + + /* print underlying metadevices, log is later */ + if (metaismeta(transp->masternamep)) { + if (meta_print_name(sp, transp->masternamep, nlistpp, fname, + fp, (options | PRINT_HEADER | PRINT_SUBDEVS), NULL, ep) + != 0) { + return (-1); + } + } + + /* return success */ + return (0); +} + +/* + * print log + */ +static int +log_print( + mdsetname_t *sp, + mdname_t *lognamep, + char *fname, + FILE *fp, + mdprtopts_t options, + md_error_t *ep +) +{ + mdnamelist_t *nlp = NULL; + + /* metadevice info */ + if (metaismeta(lognamep)) { + return (meta_print_name(sp, lognamep, &nlp, fname, fp, + options, NULL, ep)); + } + + /* regular device info */ + return (0); +} + +/* + * report log + */ +static int +log_report( + mdsetname_t *sp, + mdname_t *lognamep, + mdnamelist_t **nlistpp, + char *fname, + FILE *fp, + mdprtopts_t options, + 
mdnamelist_t *transnlp, + md_error_t *ep +) +{ + md_trans_t *transp = NULL; + mdnamelist_t *p; + char *ml_state; + md_timeval32_t tv; + char *timep; + char *actionp = NULL; + int rval = -1; + char *devid = " "; + mdname_t *didnp = NULL; + ddi_devid_t dtp; + uint_t tstate = 0; + + for (p = transnlp; (p != NULL); p = p->next) { + md_trans_t *tp; + + if ((tp = meta_get_trans(sp, p->namep, ep)) == NULL) + return (-1); + if ((tp->lognamep != NULL) && + (strcmp(lognamep->bname, tp->lognamep->bname) == 0)) { + transp = tp; /* save any parent trans */ + } + } + + /* we must have at least one trans */ + assert(transp != NULL); + if (transp == NULL) { + rval = 0; + goto out; + } + + if ((options & PRINT_LARGEDEVICES) && + (transp->log_size <= MD_MAX_BLKS_FOR_SMALL_DEVS)) { + rval = 0; + goto out; + } + + /* print header and trans devices, collect log_error and size */ + if (fprintf(fp, dgettext(TEXT_DOMAIN, "%s: Logging device for"), + lognamep->cname) == EOF) { + goto out; + } + + if ((transp->lognamep != NULL) && + (strcmp(lognamep->bname, transp->lognamep->bname) == 0)) { + if (fprintf(fp, " %s", transp->common.namep->cname) + == EOF) { + goto out; + } + } + if (fprintf(fp, "\n") == EOF) + goto out; + + /* print state */ + if (metaismeta(transp->lognamep)) { + if (meta_get_tstate(transp->lognamep->dev, &tstate, ep) != 0) + return (-1); + } + ml_state = mt_l_error_to_name(transp, &tv, tstate & MD_DEV_ERRORED); + if (options & PRINT_TIMES) { + timep = meta_print_time(&tv); + } else { + timep = ""; + } + if (fprintf(fp, dgettext(TEXT_DOMAIN, " State: %-12s %s\n"), + ml_state, timep) == EOF) { + goto out; + } + + if ((tstate & MD_DEV_ERRORED) == 0) { + actionp = mt_l_error_to_action(sp, transnlp, lognamep, ep); + if (actionp) { + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " Invoke: %s\n"), actionp) == EOF) { + goto out; + } + Free(actionp); + actionp = NULL; + } + } + + /* print size */ + if (fprintf(fp, dgettext(TEXT_DOMAIN, " Size: %ld blocks (%s)\n"), + transp->log_size, + 
meta_number_to_string(transp->log_size, DEV_BSIZE)) == EOF) { + goto out; + } + + /* MD_DEBUG stuff */ + if (options & PRINT_DEBUG) { + mdname_t *transnp = transp->common.namep; + mt_unit_t *mt; + daddr_t blksinuse, head, tail, nblks, eblk, sblk; + int percent; + + if ((mt = (mt_unit_t *)meta_get_mdunit(sp, transnp, ep)) + == NULL) { + return (-1); + } + assert(mt->c.un_type == MD_METATRANS); + + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " Transfer Size: %d blocks\n"), + mt->un_l_maxtransfer) == EOF) { + Free(mt); + goto out; + } + + head = mt->un_l_head; + tail = mt->un_l_tail; + sblk = mt->un_l_sblk; + nblks = mt->un_l_nblks; + eblk = sblk + nblks; + if (head <= tail) + blksinuse = tail - head; + else + blksinuse = (eblk - head) + (tail - sblk); + + percent = ((u_longlong_t)blksinuse * 100) / nblks; + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " Full: %d%% (%ld of %ld blocks)\n"), + percent, blksinuse, nblks) == EOF) { + Free(mt); + goto out; + } + + percent = ((u_longlong_t)mt->un_l_resv * 100) / + mt->un_l_maxresv; + if (fprintf(fp, dgettext(TEXT_DOMAIN, + " Reserved: %d%% (%ud of %ud bytes)\n"), + percent, mt->un_l_resv, mt->un_l_maxresv) == EOF) { + Free(mt); + goto out; + } + Free(mt); + } + + /* add extra line */ + if (fprintf(fp, "\n") == EOF) + goto out; + + /* print log details */ + if (metaismeta(lognamep)) { + if (meta_print_name(sp, lognamep, nlistpp, fname, fp, + options, NULL, ep) != 0) { + return (-1); + } + } else { + daddr_t start_blk; + int has_mddb; + char *has_mddb_str; + int len; + + /* + * Building a format string on the fly that will + * be used in (f)printf. This allows the length + * of the ctd to vary from small to large without + * looking horrible. 
+ */ + len = strlen(lognamep->cname) + 2; + len = max(len, strlen(dgettext(TEXT_DOMAIN, "Logging Device"))); + /* print header */ + if (fprintf(fp, + "\t%-*.*s %-12.12s %-5.5s %s\n", + len, len, + dgettext(TEXT_DOMAIN, "Logging Device"), + dgettext(TEXT_DOMAIN, "Start Block"), + dgettext(TEXT_DOMAIN, "Dbase"), + dgettext(TEXT_DOMAIN, "Reloc")) == EOF) { + goto out; + } + /* get info */ + if ((start_blk = metagetstart(sp, lognamep, ep)) == + MD_DISKADDR_ERROR) { + return (-1); + } + if ((has_mddb = metahasmddb(sp, lognamep, ep)) < 0) { + return (-1); + } + if (has_mddb) + has_mddb_str = dgettext(TEXT_DOMAIN, "Yes"); + else + has_mddb_str = dgettext(TEXT_DOMAIN, "No"); + + /* populate the key in the name_p structure */ + if ((didnp = metadevname(&sp, lognamep->dev, ep)) == NULL) { + return (-1); + } + + /* determine if devid does NOT exist */ + if (options & PRINT_DEVID) + if ((dtp = meta_getdidbykey(sp->setno, getmyside(sp, ep), + didnp->key, ep)) == NULL) + devid = dgettext(TEXT_DOMAIN, "No "); + else { + devid = dgettext(TEXT_DOMAIN, "Yes"); + free(dtp); + } + + /* print info */ + /* + * This allows the length + * of the ctd to vary from small to large without + * looking horrible. + */ + if (fprintf(fp, "\t%-*s %8ld %-5.5s %s\n", + len, lognamep->cname, start_blk, + has_mddb_str, devid) == EOF) { + goto out; + } + } + + /* add extra line */ + if (fprintf(fp, "\n") == EOF) + goto out; + + /* success */ + rval = 0; + + /* cleanup, return error */ +out: + if (rval != 0) + (void) mdsyserror(ep, errno, fname); + return (rval); +} + +/* + * print/report logs + */ +int +meta_logs_print( + mdsetname_t *sp, + mdnamelist_t *lognlp, + mdnamelist_t **nlistpp, + char *fname, + FILE *fp, + mdprtopts_t options, + md_error_t *ep +) +{ + mdnamelist_t *transnlp = NULL; + mdnamelist_t *p; + int rval = 0; + + /* must have a set */ + assert(sp != NULL); + + /* get trans devices */ + if (lognlp == NULL) + return (0); + + if (! 
(options & PRINT_SHORT)) + if (meta_get_trans_names(sp, &transnlp, options, ep) < 0) + return (-1); + + /* print all logs */ + options |= PRINT_SUBDEVS; + for (p = lognlp; (p != NULL); p = p->next) { + mdname_t *lognamep = p->namep; + + /* print appropriate detail */ + if (options & PRINT_SHORT) { + if (log_print(sp, lognamep, fname, fp, options, + ep) != 0) { + rval = -1; + } + } else { + if (log_report(sp, lognamep, nlistpp, fname, fp, + options, transnlp, ep) != 0) { + rval = -1; + } + } + } + + /* cleanup, return success */ +out: + metafreenamelist(transnlp); + return (rval); +} + +/* + * meta_lockfs_common -- common lock and unlock code + * + * Normally this routine will return a 0 for success. Even if + * lockfs wasn't able to lock down the filesystem. The reason + * for this is that the master device can be in an errored state + * and the lock can't be obtained. We don't want to prevent + * possible recovery in this case and it's not likely any activity + * will be occurring. If the filesystem is healthy with activity + * lockfs will successfully lock the filesystem and return an + * error code of 0. + * + * The one case where this routine returns a non-zero value would + * be if we can't determine the outcome of the lockfs. This should + * never occur because we don't catch signals that could cause + * waitpid() to prematurely return. 
+ */ +static int +meta_lockfs_common(mdname_t *fs, void **cookie, int lockit) +{ + char *blkname; + FILE *m; + struct mnttab tab_wildcard, tab_match; + pid_t pid; + int lock_exit; + + (void) memset(&tab_wildcard, 0, sizeof (tab_wildcard)); + (void) memset(&tab_match, 0, sizeof (tab_match)); + + if ((blkname = fs->bname) == NULL) + blkname = getfullblkname(fs->cname); + + tab_wildcard.mnt_special = blkname; + + if ((m = fopen(MNTTAB, "r")) == NULL) { + /* + * No mnttab means nothing is mounted + */ + *cookie = 0; + return (0); + } + + if (getmntany(m, &tab_match, &tab_wildcard)) { + /* + * No match in mnttab so we're not mounted ... at least + * nothing better be mounted. + */ + *cookie = 0; + return (0); + } + + (void) fclose(m); + + switch (pid = fork()) { + case -1: + /* + * We've got some major trouble here and shouldn't + * continue. The user needs to clear up the problems + * that the system currently has before proceeding + * to detach the log. + */ + (void) printf(dgettext(TEXT_DOMAIN, "failed to fork lockfs\n")); + *cookie = 0; + return (1); + + case 0: + (void) execl("/usr/sbin/lockfs", "lockfs", lockit ? "-w" : "-u", + "-c", "Solaris Volume Manager detach lock", + tab_match.mnt_mountp, 0); + /* + * Shouldn't reach here, but if this code is run on + * a release that doesn't have lockfs return an error + * code so that the -f (force) option could be used + * by metadetach. + */ + exit(1); + + default: + if (waitpid(pid, &lock_exit, 0) != pid) { + /* + * We couldn't get status regarding the + * outcome of the lockfs command. We should + * attempt to unlock the filesystem though. + * Return an error code so that if the user + * is trying to force the detach make them + * clear up this problem first. 
+ */ + *cookie = (void *)1; + return (1); + } + + *cookie = (void *)1; + return (0); + } +} + +/* + * meta_lockfs - if mounted, lock a given device against writes + * + * See comment section for meta_lockfs_common + */ +static int +meta_lockfs(mdname_t *fs, void **cookie) +{ + return (meta_lockfs_common(fs, cookie, 1)); +} + +/* + * meta_unlockfs - if mounted, unlock the filesystem if previously locked + * + * See comment section for meta_lockfs_common + */ +static void +meta_unlockfs(mdname_t *fs, void **cookie) +{ + /* + * Simple time saver. We could always try to unlock + * the filesystem, that takes time a resources. + */ + if (*cookie == (void *)1) + (void) meta_lockfs_common(fs, cookie, 0); +} + +/* + * meta_trans_detach -- detach log from trans device + */ +int +meta_trans_detach( + mdsetname_t *sp, + mdname_t *transnp, + mdcmdopts_t options, + int *delayed, + md_error_t *ep +) +{ + int force = ((options & MDCMD_FORCE) ? 1 : 0); + md_i_get_t detach; + md_trans_t *transp; + mdname_t *lognp; + void *lock_cookie; + + /* should have a set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(transnp->dev))); + + /* check name */ + if (metachkmeta(transnp, ep) != 0) + return (-1); + + /* save log name */ + if ((transp = meta_get_trans(sp, transnp, ep)) == NULL) + return (-1); + if ((lognp = transp->lognamep) == NULL) + return (mdmderror(ep, MDE_NO_LOG, meta_getminor(transnp->dev), + transnp->cname)); + + /* + * If trans device is mounted lock the filesystem + * against writes and mod time updates. + */ + if (force && meta_lockfs(transnp, &lock_cookie)) { + /* + * This device is mounted and we were unable + * lock the device. Data corruption can occur + * if we don't lock the device before removing + * the log so bail out here. + * NOTE: There's one case were the exist status + * of lockfs could have been lost yet the command + * could have run. We should try to unlock the filesystem + * before returning. 
+ */ + meta_unlockfs(transnp, &lock_cookie); + return (mdmderror(ep, MDE_UNKNOWN_TYPE, + meta_getminor(transnp->dev), transnp->cname)); + } + + /* detach log */ + *delayed = 0; + (void) memset(&detach, 0, sizeof (detach)); + detach.id = meta_getminor(transnp->dev); + MD_SETDRIVERNAME(&detach, MD_TRANS, sp->setno); + detach.size = force; + if (metaioctl(MD_IOC_TRANS_DETACH, &detach, &detach.mde, NULL) != 0) { + /* delayed detach */ + if ((force) && (mdissyserror(&detach.mde, EBUSY))) { + *delayed = 1; + mdclrerror(&detach.mde); + } else { + meta_unlockfs(transnp, &lock_cookie); + return (mdstealerror(ep, &detach.mde)); + } + } + + /* + * Unlock the filesystem + */ + meta_unlockfs(transnp, &lock_cookie); + + /* clear cache */ + meta_invalidate_name(lognp); + meta_invalidate_name(transnp); + + /* let em know */ + if (options & MDCMD_PRINT) { + if (*delayed) { + (void) printf(dgettext(TEXT_DOMAIN, +"%s: logging device %s will be detached at unmount or reboot\n"), + transnp->cname, lognp->cname); + } else { + (void) printf(dgettext(TEXT_DOMAIN, + "%s: logging device %s is detached\n"), + transnp->cname, lognp->cname); + } + (void) fflush(stdout); + } + + /* return success */ + return (0); +} + +/* + * reset trans + */ +int +meta_trans_reset( + mdsetname_t *sp, + mdname_t *transnp, + mdcmdopts_t options, + md_error_t *ep +) +{ + md_trans_t *transp; + int rval = -1; + + /* should have a set */ + assert(sp != NULL); + assert((transnp == NULL) || + (sp->setno == MD_MIN2SET(meta_getminor(transnp->dev)))); + + /* reset all trans */ + if (transnp == NULL) { + mdnamelist_t *transnlp = NULL; + mdnamelist_t *p; + + /* for each trans */ + rval = 0; + if (meta_get_trans_names(sp, &transnlp, 0, ep) < 0) + return (-1); + for (p = transnlp; (p != NULL); p = p->next) { + /* reset trans */ + transnp = p->namep; + if (meta_trans_reset(sp, transnp, options, ep) != 0) { + rval = -1; + break; + } + } + + /* cleanup, return success */ + metafreenamelist(transnlp); + return (rval); + } + + /* 
check name */ + if (metachkmeta(transnp, ep) != 0) + return (-1); + /* get unit structure */ + if ((transp = meta_get_trans(sp, transnp, ep)) == NULL) + return (-1); + + /* make sure nobody owns us */ + if (MD_HAS_PARENT(transp->common.parent)) { + return (mdmderror(ep, MDE_IN_USE, meta_getminor(transnp->dev), + transnp->cname)); + } + + /* clear subdevices cache */ + meta_invalidate_name(transp->masternamep); + if (transp->lognamep) + meta_invalidate_name(transp->lognamep); + + /* clear metadevice */ + if (meta_reset(sp, transnp, options, ep) != 0) + goto out; + rval = 0; /* success */ + + /* let em know */ + if (options & MDCMD_PRINT) { + (void) printf(dgettext(TEXT_DOMAIN, "%s: Trans is cleared\n"), + transnp->cname); + (void) fflush(stdout); + } + + /* clear subdevices */ + if (! (options & MDCMD_RECURSE)) + goto out; + if (metaismeta(transp->masternamep)) { + mdname_t *masternp = transp->masternamep; + + if (meta_reset_by_name(sp, masternp, options, ep) != 0) + rval = -1; + } + /* (multi-parented) log will be cleared later */ + + /* cleanup, return success */ +out: + meta_invalidate_name(transnp); + return (rval); +} diff --git a/usr/src/lib/lvm/libmeta/common/meta_userflags.c b/usr/src/lib/lvm/libmeta/common/meta_userflags.c new file mode 100644 index 0000000000..6ac028625d --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/meta_userflags.c @@ -0,0 +1,98 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1993-2002 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * get/set user flags for the metadevices (FOR GUI USE ONLY) + */ + +#include <meta.h> + +/* + * get user flags stored in the common unit structure. + */ +int +meta_getuserflags( + mdsetname_t *sp, + mdname_t *np, + uint_t *userflags, + md_error_t *ep +) +{ + md_common_t *mdp; + + /* should have a set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(np->dev))); + + if ((mdp = meta_get_unit(sp, np, ep)) == NULL) + return (-1); + + *userflags = mdp->user_flags; + return (0); +} + + +/* + * set user flags, stored in the common unit structure. 
+ */ +int +meta_setuserflags( + mdsetname_t *sp, + mdname_t *np, + uint_t userflags, + md_error_t *ep +) +{ + md_set_userflags_t msu; + char *miscname; + + /* should have a set */ + assert(sp != NULL); + assert(sp->setno == MD_MIN2SET(meta_getminor(np->dev))); + + /* check name */ + if (metachkmeta(np, ep) != 0) + return (-1); + + /* get misc name */ + if ((miscname = metagetmiscname(np, ep)) == NULL) + return (-1); + + /* set parameters */ + (void) memset(&msu, 0, sizeof (msu)); + MD_SETDRIVERNAME(&msu, miscname, sp->setno); + msu.mnum = meta_getminor(np->dev); + msu.userflags = userflags; + if (metaioctl(MD_IOCSET_FLAGS, &msu, &msu.mde, np->cname) != 0) + return (mdstealerror(ep, &msu.mde)); + + /* clear cache */ + meta_invalidate_name(np); + + return (0); +} diff --git a/usr/src/lib/lvm/libmeta/common/metad_svc_stubs.c b/usr/src/lib/lvm/libmeta/common/metad_svc_stubs.c new file mode 100644 index 0000000000..32be258ab3 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/metad_svc_stubs.c @@ -0,0 +1,825 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <meta.h> +#include <metad.h> + +#pragma weak mdrpc_nullproc_1_svc = _mdrpc_nullproc_1_svc +#pragma weak mdrpc_hostname_1_svc = _mdrpc_hostname_1_svc +#pragma weak mdrpc_addhosts_1_svc = _mdrpc_addhosts_1_svc +#pragma weak mdrpc_delhosts_1_svc = _mdrpc_delhosts_1_svc +#pragma weak mdrpc_createset_1_svc = _mdrpc_createset_1_svc +#pragma weak mdrpc_delset_1_svc = _mdrpc_delset_1_svc +#pragma weak mdrpc_getset_1_svc = _mdrpc_getset_1_svc +#pragma weak mdrpc_setnumbusy_1_svc = _mdrpc_setnumbusy_1_svc +#pragma weak mdrpc_setnameok_1_svc = _mdrpc_setnameok_1_svc +#pragma weak mdrpc_ownset_1_svc = _mdrpc_ownset_1_svc +#pragma weak mdrpc_adddrvs_1_svc = _mdrpc_adddrvs_1_svc +#pragma weak mdrpc_deldrvs_1_svc = _mdrpc_deldrvs_1_svc +#pragma weak mdrpc_upd_dr_dbinfo_1_svc = _mdrpc_upd_dr_dbinfo_1_svc +#pragma weak mdrpc_devinfo_1_svc = _mdrpc_devinfo_1_svc +#pragma weak mdrpc_drvused_1_svc = _mdrpc_drvused_1_svc +#pragma weak mdrpc_add_drv_sidenms_1_svc = _mdrpc_add_drv_sidenms_1_svc +#pragma weak mdrpc_del_drv_sidenms_1_svc = _mdrpc_del_drv_sidenms_1_svc +#pragma weak mdrpc_gtimeout_1_svc = _mdrpc_gtimeout_1_svc +#pragma weak mdrpc_stimeout_1_svc = _mdrpc_stimeout_1_svc +#pragma weak mdrpc_upd_dr_flags_1_svc = _mdrpc_upd_dr_flags_1_svc +#pragma weak mdrpc_upd_sr_flags_1_svc = _mdrpc_upd_sr_flags_1_svc +#pragma weak mdrpc_unlock_set_1_svc = _mdrpc_unlock_set_1_svc +#pragma weak mdrpc_lock_set_1_svc = _mdrpc_lock_set_1_svc +#pragma weak mdrpc_updmeds_1_svc = _mdrpc_updmeds_1_svc + +#pragma weak mdrpc_nullproc_2_svc = _mdrpc_nullproc_2_svc +#pragma weak mdrpc_hostname_2_svc = _mdrpc_hostname_2_svc +#pragma weak mdrpc_addhosts_2_svc = _mdrpc_addhosts_2_svc +#pragma weak mdrpc_delhosts_2_svc = _mdrpc_delhosts_2_svc +#pragma weak mdrpc_createset_2_svc = _mdrpc_createset_2_svc +#pragma weak mdrpc_delset_2_svc = _mdrpc_delset_2_svc +#pragma weak mdrpc_getset_2_svc = _mdrpc_getset_2_svc +#pragma weak mdrpc_setnumbusy_2_svc = 
_mdrpc_setnumbusy_2_svc +#pragma weak mdrpc_setnameok_2_svc = _mdrpc_setnameok_2_svc +#pragma weak mdrpc_ownset_2_svc = _mdrpc_ownset_2_svc +#pragma weak mdrpc_adddrvs_2_svc = _mdrpc_adddrvs_2_svc +#pragma weak mdrpc_deldrvs_2_svc = _mdrpc_deldrvs_2_svc +#pragma weak mdrpc_upd_dr_dbinfo_2_svc = _mdrpc_upd_dr_dbinfo_2_svc +#pragma weak mdrpc_devinfo_2_svc = _mdrpc_devinfo_2_svc +#pragma weak mdrpc_devid_2_svc = _mdrpc_devid_2_svc +#pragma weak mdrpc_devinfo_by_devid_2_svc = _mdrpc_devinfo_by_devid_2_svc +#pragma weak mdrpc_devinfo_by_devid_name_2_svc =\ + _mdrpc_devinfo_by_devid_name_2_svc +#pragma weak mdrpc_drvused_2_svc = _mdrpc_drvused_2_svc +#pragma weak mdrpc_add_drv_sidenms_2_svc = _mdrpc_add_drv_sidenms_2_svc +#pragma weak mdrpc_del_drv_sidenms_2_svc = _mdrpc_del_drv_sidenms_2_svc +#pragma weak mdrpc_gtimeout_2_svc = _mdrpc_gtimeout_2_svc +#pragma weak mdrpc_stimeout_2_svc = _mdrpc_stimeout_2_svc +#pragma weak mdrpc_upd_dr_flags_2_svc = _mdrpc_upd_dr_flags_2_svc +#pragma weak mdrpc_upd_sr_flags_2_svc = _mdrpc_upd_sr_flags_2_svc +#pragma weak mdrpc_unlock_set_2_svc = _mdrpc_unlock_set_2_svc +#pragma weak mdrpc_lock_set_2_svc = _mdrpc_lock_set_2_svc +#pragma weak mdrpc_updmeds_2_svc = _mdrpc_updmeds_2_svc +#pragma weak mdrpc_mncreateset_2_svc = _mdrpc_mncreateset_2_svc +#pragma weak mdrpc_mngetset_2_svc = _mdrpc_mngetset_2_svc +#pragma weak mdrpc_mnsetmaster_2_svc = _mdrpc_mnsetmaster_2_svc +#pragma weak mdrpc_joinset_2_svc = _mdrpc_joinset_2_svc +#pragma weak mdrpc_withdrawset_2_svc = _mdrpc_withdrawset_2_svc +#pragma weak mdrpc_upd_nr_flags_2_svc = _mdrpc_upd_nr_flags_2_svc +#pragma weak mdrpc_mn_is_stale_2_svc = _mdrpc_mn_is_stale_2_svc +#pragma weak mdrpc_mdcommdctl_2_svc = _mdrpc_mdcommdctl_2_svc +#pragma weak mdrpc_upd_dr_reconfig_2_svc = _mdrpc_upd_dr_reconfig_2_svc +#pragma weak mdrpc_getdrivedesc_2_svc = _mdrpc_getdrivedesc_2_svc +#pragma weak mdrpc_reset_mirror_owner_2_svc = _mdrpc_reset_mirror_owner_2_svc +#pragma weak mdrpc_mn_susp_res_io_2_svc = 
_mdrpc_mn_susp_res_io_2_svc +#pragma weak mdrpc_resnarf_set_2_svc = _mdrpc_resnarf_set_2_svc +#pragma weak mdrpc_mn_mirror_resync_all_2_svc = \ + _mdrpc_mn_mirror_resync_all_2_svc + +/*ARGSUSED*/ +bool_t +_mdrpc_nullproc_1_svc( + mdrpc_null_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_hostname_1_svc( + mdrpc_null_args *a, + mdrpc_hostname_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_addhosts_1_svc( + mdrpc_host_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_delhosts_1_svc( + mdrpc_host_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_createset_1_svc( + mdrpc_createset_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_delset_1_svc( + mdrpc_sp_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_getset_1_svc( + mdrpc_getset_args *a, + mdrpc_getset_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_setnumbusy_1_svc( + mdrpc_setno_args *a, + mdrpc_bool_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_setnameok_1_svc( + mdrpc_sp_args *a, + mdrpc_bool_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_ownset_1_svc( + mdrpc_sp_args *a, + mdrpc_bool_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_adddrvs_1_svc( + mdrpc_drives_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_deldrvs_1_svc( + mdrpc_drives_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + 
+/*ARGSUSED*/ +bool_t +_mdrpc_upd_dr_dbinfo_1_svc( + mdrpc_drives_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_devinfo_1_svc( + mdrpc_devinfo_args *a, + mdrpc_devinfo_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_drvused_1_svc( + mdrpc_drvused_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_add_drv_sidenms_1_svc( + mdrpc_drv_sidenm_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_del_drv_sidenms_1_svc( + mdrpc_sp_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_gtimeout_1_svc( + mdrpc_sp_args *a, + mdrpc_gtimeout_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_stimeout_1_svc( + mdrpc_stimeout_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_upd_dr_flags_1_svc( + mdrpc_upd_dr_flags_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_upd_sr_flags_1_svc( + mdrpc_upd_sr_flags_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_unlock_set_1_svc( + mdrpc_null_args *a, + mdrpc_setlock_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_lock_set_1_svc( + mdrpc_null_args *a, + mdrpc_setlock_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_updmeds_1_svc( + mdrpc_updmeds_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + + +/*ARGSUSED*/ +bool_t +_mdrpc_nullproc_2_svc( + mdrpc_null_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + 
assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_hostname_2_svc( + mdrpc_null_args *a, + mdrpc_hostname_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_addhosts_2_svc( + mdrpc_host_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_delhosts_2_svc( + mdrpc_host_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_createset_2_svc( + mdrpc_createset_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_delset_2_svc( + mdrpc_sp_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_getset_2_svc( + mdrpc_getset_args *a, + mdrpc_getset_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_setnumbusy_2_svc( + mdrpc_setno_args *a, + mdrpc_bool_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_setnameok_2_svc( + mdrpc_sp_args *a, + mdrpc_bool_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_ownset_2_svc( + mdrpc_sp_args *a, + mdrpc_bool_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_adddrvs_2_svc( + mdrpc_drives_2_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_deldrvs_2_svc( + mdrpc_drives_2_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_upd_dr_dbinfo_2_svc( + mdrpc_drives_2_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_devinfo_2_svc( + mdrpc_devinfo_2_args *a, + mdrpc_devinfo_2_res *b, + struct svc_req *c) +{ + assert(0); + return 
(TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_devid_2_svc( + mdrpc_devid_args *a, + mdrpc_devid_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_devinfo_by_devid_2_svc( + mdrpc_devidstr_args *a, + mdrpc_devinfo_2_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_devinfo_by_devid_name_2_svc( + mdrpc_devid_name_2_args *a, + mdrpc_devinfo_2_res *b, + struct svc_req *c +) +{ + assert(0); + return (TRUE); +} + + +/*ARGSUSED*/ +bool_t +_mdrpc_drvused_2_svc( + mdrpc_drvused_2_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_add_drv_sidenms_2_svc( + mdrpc_drv_sidenm_2_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_del_drv_sidenms_2_svc( + mdrpc_sp_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_gtimeout_2_svc( + mdrpc_sp_args *a, + mdrpc_gtimeout_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_stimeout_2_svc( + mdrpc_stimeout_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_upd_dr_flags_2_svc( + mdrpc_upd_dr_flags_2_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_upd_sr_flags_2_svc( + mdrpc_upd_sr_flags_args *a, + mdrpc_generic_res *b, + struct svc_req *c +) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_unlock_set_2_svc( + mdrpc_null_args *a, + mdrpc_setlock_res *b, + struct svc_req *c +) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_lock_set_2_svc( + mdrpc_null_args *a, + mdrpc_setlock_res *b, + struct svc_req *c +) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_updmeds_2_svc( + mdrpc_updmeds_args *a, + 
mdrpc_generic_res *b, + struct svc_req *c +) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_mncreateset_2_svc( + mdrpc_mncreateset_2_args *a, + mdrpc_generic_res *b, + struct svc_req *c +) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_mngetset_2_svc( + mdrpc_getset_2_args *a, + mdrpc_mngetset_res *b, + struct svc_req *c +) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_mnsetmaster_2_svc( + mdrpc_mnsetmaster_2_args *a, + mdrpc_generic_res *b, + struct svc_req *c +) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_joinset_2_svc( + mdrpc_sp_2_args *a, + mdrpc_generic_res *b, + struct svc_req *c +) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_withdrawset_2_svc( + mdrpc_sp_2_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_upd_nr_flags_2_svc( + mdrpc_upd_nr_flags_2_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_mn_is_stale_2_svc( + mdrpc_setno_2_args *a, + mdrpc_bool_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_mdcommdctl_2_svc( + mdrpc_mdcommdctl_2_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_resnarf_set_2_svc( + mdrpc_setno_2_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_upd_dr_reconfig_2_svc( + mdrpc_upd_dr_flags_2_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_getdrivedesc_2_svc( + mdrpc_sp_2_args *a, + mdrpc_getdrivedesc_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_reset_mirror_owner_2_svc( + mdrpc_nodeid_2_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return 
(TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_mn_susp_res_io_2_svc( + mdrpc_mn_susp_res_io_2_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} + +/*ARGSUSED*/ +bool_t +_mdrpc_mn_mirror_resync_all_2_svc( + mdrpc_setno_2_args *a, + mdrpc_generic_res *b, + struct svc_req *c) +{ + assert(0); + return (TRUE); +} diff --git a/usr/src/lib/lvm/libmeta/common/metagetroot.c b/usr/src/lib/lvm/libmeta/common/metagetroot.c new file mode 100644 index 0000000000..3891c6bd74 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/metagetroot.c @@ -0,0 +1,121 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * get root device + */ + +#include <meta.h> +#include "meta_lib_prv.h" + +#include <sys/mnttab.h> + +/* + * Return the current root filesystem block device name + */ +void * +meta_get_current_root( + md_error_t *ep +) +{ + FILE *fp; + struct mnttab mp; + + if ((fp = open_mnttab()) == NULL) { + (void) mdsyserror(ep, errno, MNTTAB); + return (NULL); + } + + while (getmntent(fp, &mp) == 0) { + if (strcmp(mp.mnt_mountp, "/") == 0) + return (mp.mnt_special); + } + (void) mderror(ep, MDE_NOROOT, NULL); + return (NULL); +} + +/* + * Return the current root filesystem block device name. This is only valid + * when root is either a slice, a stripe or a mirror. + */ +mdname_t * +meta_get_current_root_dev( + mdsetname_t *sp, + md_error_t *ep +) +{ + md_stripe_t *stripep; + md_mirror_t *mirrorp; + md_row_t *rp; + md_comp_t *cp; + mdname_t *rootnp; + void *curroot; + char *miscname; + int smi; + + if ((curroot = meta_get_current_root(ep)) == NULL) + return (NULL); + if ((rootnp = metaname(&sp, curroot, ep)) == NULL) + return (NULL); + if (metaismeta(rootnp)) { + if ((miscname = metagetmiscname(rootnp, ep)) == NULL) + return (NULL); + if ((strcmp(miscname, MD_MIRROR) == 0) && + ((mirrorp = meta_get_mirror(sp, rootnp, ep)) != NULL)) { + for (smi = 0; smi < NMIRROR; smi++) { + md_submirror_t *mdsp = + &mirrorp->submirrors[smi]; + rootnp = mdsp->submirnamep; + /* skip unused submirrors */ + if (rootnp == NULL) { + assert(mdsp->state == SMS_UNUSED); + continue; + } + if ((miscname = metagetmiscname(rootnp, ep)) + == NULL) { + (void) mdmderror(ep, MDE_UNKNOWN_TYPE, + meta_getminor(rootnp->dev), + rootnp->cname); + return (NULL); + } + break; + } + } + if ((strcmp(miscname, MD_STRIPE) == 0) && + ((stripep = meta_get_stripe(sp, rootnp, ep)) != NULL)) { + rp = &stripep->rows.rows_val[0]; + cp = &rp->comps.comps_val[0]; + if (metachkcomp(cp->compnamep, ep) == 0) + return (cp->compnamep); + } + /* Root is not a single stripe metadevice */ 
+ (void) mddeverror(ep, MDE_INV_ROOT, rootnp->dev, rootnp->cname); + return (NULL); + } else return (rootnp); +} diff --git a/usr/src/lib/lvm/libmeta/common/metarpcopen.c b/usr/src/lib/lvm/libmeta/common/metarpcopen.c new file mode 100644 index 0000000000..bd0f4232f5 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/metarpcopen.c @@ -0,0 +1,422 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Just in case we're not in a build environment, make sure that + * TEXT_DOMAIN gets set to something. + */ +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +#include <meta.h> +#include <metad.h> + +#define CC_TTL_MAX 20 + +typedef struct { + char *cc_node; + struct timeval cc_ttl; + CLIENT *cc_clp; +} client_cache_t; + +typedef struct client_header { + client_cache_t **ch_cache; /* array of clients. */ + mutex_t ch_mutex; /* lock access to ch_cache */ +} client_header_t; + +/* + * This structure is used to pass data from meta_client_create to + * client_create_helper via meta_client_create_retry. 
+ */ +typedef struct clnt_data { + rpcprog_t cd_prognum; /* RPC program number */ + rpcvers_t cd_version; /* Desired interface version */ + char *cd_nettype; /* Type of network to use */ +} clnt_data_t; + +#define MALLOC_BLK_SIZE 10 +static client_header_t client_header = {(client_cache_t **)NULL, DEFAULTMUTEX}; + +static void +cc_add( + client_header_t *header, + char *node, + CLIENT *clntp, + md_error_t *ep +) +{ + client_cache_t ***cachep = &header->ch_cache; + struct timeval now; + int i; + int j = 0; + + if (gettimeofday(&now, NULL) == -1) { + (void) mdsyserror(ep, errno, "gettimeofday()"); + return; + } + + (void) mutex_lock(&header->ch_mutex); + if (*cachep) { + for (i = 0; (*cachep)[i] != NULL; i++) + if (strcmp((*cachep)[i]->cc_node, node) == 0 && + (*cachep)[i]->cc_clp == NULL) { + (*cachep)[i]->cc_clp = clntp; + (*cachep)[i]->cc_ttl = now; + (void) mutex_unlock(&header->ch_mutex); + return; + } + } else { + *cachep = Calloc(MALLOC_BLK_SIZE, sizeof (**cachep)); + i = 0; + } + + (*cachep)[i] = Zalloc(sizeof (***cachep)); + (*cachep)[i]->cc_node = Strdup(node); + (*cachep)[i]->cc_clp = clntp; + (*cachep)[i]->cc_ttl = now; + + if ((++i % MALLOC_BLK_SIZE) == 0) { + *cachep = Realloc(*cachep, + (i + MALLOC_BLK_SIZE) * sizeof (**cachep)); + for (j = i; j < (i + MALLOC_BLK_SIZE); j++) + (*cachep)[j] = NULL; + } + (void) mutex_unlock(&header->ch_mutex); +} + +static void +rel_clntp(client_cache_t *cachep) +{ + CLIENT *clntp = cachep->cc_clp; + + if (clntp != NULL) { + auth_destroy(clntp->cl_auth); + clnt_destroy(clntp); + } + cachep->cc_clp = NULL; +} + +static void +cc_destroy(client_header_t *header) +{ + client_cache_t ***cachep = &header->ch_cache; + int i; + + (void) mutex_lock(&header->ch_mutex); + if (*cachep) { + for (i = 0; ((*cachep)[i] != NULL); i++) { + client_cache_t *p = (*cachep)[i]; + + Free(p->cc_node); + rel_clntp(p); + Free(p); + } + Free(*cachep); + *cachep = NULL; + } + (void) mutex_unlock(&header->ch_mutex); +} + +/* + * Set the timeout 
value for this client handle. + */ +static int +cl_sto( + CLIENT *clntp, + char *hostname, + long time_out, + md_error_t *ep +) +{ + struct timeval nto; + + (void) memset(&nto, '\0', sizeof (nto)); + + nto.tv_sec = time_out; + + if (clnt_control(clntp, CLSET_TIMEOUT, (char *)&nto) != TRUE) + return (mdrpcerror(ep, clntp, hostname, + dgettext(TEXT_DOMAIN, "metad client set timeout"))); + + return (0); +} + +/* + * client_create_vers_retry is the helper function to be passed to + * meta_client_create_retry to do the actual work of creating the client + * when version selection is necessary. + */ + +/* ARGSUSED */ +static CLIENT * +client_create_vers_retry(char *hostname, + void *ignore, + struct timeval *tout +) +{ + rpcvers_t vers; /* Version # not needed. */ + + return (clnt_create_vers_timed(hostname, METAD, &vers, + METAD_VERSION, METAD_VERSION_DEVID, "tcp", tout)); +} + +/* + * client_create_helper is the helper function to be passed to + * meta_client_create_retry when plain vanilla client create is desired. + */ +static CLIENT * +client_create_helper(char *hostname, void *private, struct timeval *time_out) +{ + clnt_data_t *cd = (clnt_data_t *)private; + + return (clnt_create_timed(hostname, cd->cd_prognum, cd->cd_version, + cd->cd_nettype, time_out)); +} + +/* + * meta_client_create_retry is a general function to assist in creating RPC + * clients. This function handles retrying if the attempt to create a + * client fails. meta_client_create_retry itself does not actually create + * the client. Instead it calls the helper function, func, to do that job. + * + * With the help of func, meta_client_create_retry will create an RPC + * connection allowing up to tout seconds to complete the task. If the + * connection creation fails for RPC_RPCBFAILURE, RPC_CANTRECV or + * RPC_PROGNOTREGISTERED and tout seconds have not passed, + * meta_client_create_retry will try again. 
 * The reason retries are
 * important is that when the inet daemon is being refreshed, it can take
 * 15-20 seconds for it to start responding again.
 *
 * Arguments:
 *
 *	hostname	- Name of remote host
 *
 *	func		- Pointer to the helper function, that will
 *			  actually try to create the client.
 *
 *	data		- Private data to be passed on to func.
 *			  meta_client_create_retry treats this as an opaque
 *			  pointer.
 *
 *	tout		- Number of seconds to allow for the connection
 *			  attempt.
 *
 *	ep		- Standard SVM error pointer. May be NULL.
 *
 * Returns the client handle produced by func, or NULL if no attempt
 * succeeded (in which case ep, when non-NULL, has been filled in).
 */
CLIENT *
meta_client_create_retry(
	char			*hostname,
	clnt_create_func_t	func,
	void			*data,
	time_t			tout,
	md_error_t		*ep
)
{
	static int		debug;		/* print debugging info */
	static int		debug_set = 0;

	CLIENT			*clnt = (CLIENT *) NULL;
	struct timeval		curtime;
	char			*d;
	struct timeval		start;
	struct timeval		timeout;

	/*
	 * Decide once per process whether to print create errors: enabled
	 * when the MD_DEBUG environment variable contains "RPC".
	 */
	if (debug_set == 0) {
		d = getenv("MD_DEBUG");
		if (d == NULL) {
			debug = 0;
		} else {
			debug = (strstr(d, "RPC") == NULL) ? 0 : 1;
		}
		debug_set = 1;
	}
	timeout.tv_usec = 0;
	if (gettimeofday(&start, NULL) == -1) {
		if (ep != (md_error_t *)NULL) {
			(void) mdsyserror(ep, errno, "gettimeofday()");
		}
		return (clnt);
	}
	curtime = start;
	/* Only whole seconds are compared; tv_usec is ignored. */
	while ((curtime.tv_sec - start.tv_sec) < tout) {
		/* Use remaining time as the timeout value. */
		timeout.tv_sec = tout - (curtime.tv_sec - start.tv_sec);
		clnt = (*func)(hostname, data, &timeout);
		if (clnt != (CLIENT *) NULL)
			break;
		if ((rpc_createerr.cf_stat == RPC_RPCBFAILURE) ||
		    (rpc_createerr.cf_stat == RPC_PROGNOTREGISTERED) ||
		    (rpc_createerr.cf_stat == RPC_CANTRECV)) {
			if (debug) {
				clnt_pcreateerror("meta_client_create_retry");
			}
			/* If error might be fixed in time, sleep & try again */
			(void) sleep(2);
			if (gettimeofday(&curtime, NULL) == -1) {
				if (ep != (md_error_t *)NULL) {
					(void) mdsyserror(ep, errno,
					    "gettimeofday()");
				}
				return (clnt);
			}
		} else {
			/* Not a recoverable error. */
			break;
		}
	}
	/*
	 * NOTE(review): if tout <= 0 the loop body never runs, so the error
	 * recorded here reflects whatever rpc_createerr held beforehand.
	 */
	if ((clnt == (CLIENT *) NULL) && (ep != (md_error_t *)NULL)) {
		(void) mdrpccreateerror(ep, hostname,
		    "meta_client_create_retry");
	}
	return (clnt);
}

/*
 * meta_client_create is intended to be used within SVM as a replacement
 * for calls to clnt_create. meta_client_create invokes the retry
 * mechanism of meta_client_create_retry.
 *
 * The program number, version and net type are packed into a clnt_data_t
 * for client_create_helper.  No error structure is passed, so a failure is
 * reported only by the NULL return.
 */
CLIENT *
meta_client_create(char *host, rpcprog_t prognum, rpcvers_t version,
	char *nettype)
{
	clnt_data_t	cd;

	cd.cd_prognum = prognum;
	cd.cd_version = version;
	cd.cd_nettype = nettype;
	return (meta_client_create_retry(host, client_create_helper,
	    (void *)&cd, MD_CLNT_CREATE_TOUT, (md_error_t *)NULL));
}

/*
 * create and return RPC connection
 *
 * Returns a METAD client handle for hostname with its call timeout set to
 * time_out seconds.  A cached handle is reused when one exists for the host
 * and is no more than CC_TTL_MAX seconds old; otherwise a new handle is
 * created and cached.  Returns NULL with ep set on failure.
 */
CLIENT *
metarpcopen(
	char		*hostname,
	long		time_out,
	md_error_t	*ep
)
{
	CLIENT		*clntp = NULL;
	client_cache_t	***cachep = &client_header.ch_cache;
	int		i;
	long		delta;
	struct timeval	now;

	if (gettimeofday(&now, NULL) == -1) {
		(void) mdsyserror(ep, errno, "gettimeofday()");
		return (NULL);
	}

	/*
	 * Before trying to create the client, make sure that the core SVM
	 * services are enabled by the Service Management Facility. We
	 * don't want to suffer the 60 second timeout if the services are
	 * not even enabled. This call actually only verifies that they
	 * are enabled on this host no matter which host the caller wants
	 * to connect to. Nonetheless, if the services are not enabled on
	 * the local host, our RPC stuff is not going to work as expected.
	 */
	if (meta_smf_isonline(META_SMF_CORE, ep) == 0) {
		return (NULL);
	}

	/* Look for a usable cached handle; the mutex guards the table. */
	(void) mutex_lock(&client_header.ch_mutex);
	if (client_header.ch_cache) {
		for (i = 0; (*cachep)[i] != NULL; i++) {
			if (strcmp((*cachep)[i]->cc_node, hostname) == 0) {
				clntp = (*cachep)[i]->cc_clp;
				/* entry exists but its handle was released */
				if (clntp == NULL)
					continue;
				delta = now.tv_sec -
				    (*cachep)[i]->cc_ttl.tv_sec;
				/* too old: destroy it and keep searching */
				if (delta > CC_TTL_MAX) {
					rel_clntp((*cachep)[i]);
					continue;
				}
				if (cl_sto(clntp, hostname, time_out,
				    ep) != 0) {
					(void) mutex_unlock(
					    &client_header.ch_mutex);
					return (NULL);
				}
				(void) mutex_unlock(&client_header.ch_mutex);
				return (clntp);
			}
		}
	}
	(void) mutex_unlock(&client_header.ch_mutex);

	/*
	 * Try to create a version 2 client handle by default.
	 * If this fails (i.e. client is version 1), try to
	 * create a version 1 client handle.
	 */
	clntp = meta_client_create_retry(hostname, client_create_vers_retry,
	    (void *)NULL, MD_CLNT_CREATE_TOUT, ep);

	/* open connection */
	if (clntp == NULL) {
		/*
		 * meta_client_create_retry has already recorded a create
		 * error in ep; it is recorded again here with a localized
		 * message.  A NULL handle is still cached for the host.
		 */
		(void) mdrpccreateerror(ep, hostname,
		    dgettext(TEXT_DOMAIN, "metad client create"));
		cc_add(&client_header, hostname, NULL, ep);
		return (NULL);
	} else {
		/* replace the handle's default auth with AUTH_SYS */
		auth_destroy(clntp->cl_auth);
		clntp->cl_auth = authsys_create_default();
		assert(clntp->cl_auth != NULL);
	}

	cc_add(&client_header, hostname, clntp, ep);

	/* the handle stays in the cache even if this fails */
	if (cl_sto(clntp, hostname, time_out, ep) != 0)
		return (NULL);

	return (clntp);
}

/*
 * metarpcclose - is a place holder so that when using
 *		  metarpcopen, it does not appear that
 *		  we have dangling opens.  We can at some
 *		  later point decrement open counts here too, if needed.
 */
/*ARGSUSED*/
void
metarpcclose(CLIENT *clntp)
{
	/* intentionally empty: the handle is left untouched */
}

/*
 * metarpccloseall - destroy every entry in the file-level client cache
 * (client_header) by handing it to cc_destroy().
 */
void
metarpccloseall(void)
{
	cc_destroy(&client_header);
}
diff --git a/usr/src/lib/lvm/libmeta/common/metasplitname.c b/usr/src/lib/lvm/libmeta/common/metasplitname.c new file mode 100644 index 0000000000..84634a109e --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/metasplitname.c @@ -0,0 +1,77 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1992, 1993, 2000 by Sun Microsystems, Inc.
 * All rights reserved.
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * split and splice name + */ + +#include <meta.h> + +int +splitname(char *name, md_splitname *spn) +{ + size_t prefixlen; + size_t suffixlen; + char *lastslash; + lastslash = strrchr(name, '/'); + if (lastslash != NULL) { + prefixlen = lastslash - name; + suffixlen = (strlen(name) - prefixlen) - 1; /* slash dropped */ + } else { + prefixlen = 0; + suffixlen = strlen(name); + } + if (prefixlen > MD_MAXPREFIX || + suffixlen > MD_MAXSUFFIX) + return (1); + (void) memcpy(SPN_PREFIX(spn).pre_data, name, prefixlen); + SPN_PREFIX(spn).pre_len = prefixlen; + (void) memcpy(SPN_SUFFIX(spn).suf_data, lastslash + 1, suffixlen); + SPN_SUFFIX(spn).suf_len = suffixlen; + return (0); +} + +char * +splicename(md_splitname *spn) +{ + char *name; + char *suffix; + size_t prefixlen; + size_t suffixlen; + + prefixlen = SPN_PREFIX(spn).pre_len; + suffixlen = SPN_SUFFIX(spn).suf_len; + name = Malloc(prefixlen + suffixlen + 2); + (void) memcpy(name, SPN_PREFIX(spn).pre_data, prefixlen); + name[prefixlen] = '/'; + suffix = name + (prefixlen + 1); + (void) memcpy(suffix, SPN_SUFFIX(spn).suf_data, suffixlen); + name[prefixlen + suffixlen + 1] = 0; + return (name); +} diff --git a/usr/src/lib/lvm/libmeta/common/sdssc_bind.c b/usr/src/lib/lvm/libmeta/common/sdssc_bind.c new file mode 100644 index 0000000000..c8e1f8c3ee --- /dev/null +++ b/usr/src/lib/lvm/libmeta/common/sdssc_bind.c @@ -0,0 +1,205 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Block comment which describes the contents of this file. + */ + +#include <dlfcn.h> +#include <meta.h> +#include <metadyn.h> +#include <sdssc.h> + +#define SDSSC_PATH SDSSC_CL_LIBDIR "/sc/libsds_sc.so" + +static func_table_t dl_table[] = { + { "_sdssc_version", (void **)&sdssc_version }, + { "_sdssc_create_begin", (void **)&sdssc_create_begin }, + { "_sdssc_mo_create_begin", (void **)&sdssc_mo_create_begin }, + { "_sdssc_create_end", (void **)&sdssc_create_end }, + { "_sdssc_delete_begin", (void **)&sdssc_delete_begin }, + { "_sdssc_delete_end", (void **)&sdssc_delete_end }, + { "_sdssc_get_index", (void **)&sdssc_get_index }, + { "_sdssc_add_hosts", (void **)&sdssc_add_hosts }, + { "_sdssc_delete_hosts", (void **)&sdssc_delete_hosts }, + { "_sdssc_get_primary_host", (void **)&sdssc_get_primary_host }, + { "_sdssc_cmd_proxy", (void **)&sdssc_cmd_proxy }, + { "_sdssc_getnodelist", (void **)&sdssc_getnodelist }, + { "_sdssc_freenodelist", (void **)&sdssc_freenodelist }, + { "_sdssc_binddevs", (void **)&sdssc_binddevs }, + { "_sdssc_bindclusterdevs", (void **)&sdssc_bindclusterdevs }, + { "_sdssc_gettransportbynode", (void **)&sdssc_gettransportbynode }, + { "_sdssc_free_mdcerr_list", (void **)&sdssc_free_mdcerr_list }, + { "_sdssc_property_get", (void **)&sdssc_property_get }, + { "_sdssc_property_set", (void **)&sdssc_property_set }, + { "_sdssc_get_services", (void **)&sdssc_get_services }, + { "_sdssc_get_services_free", (void 
**)&sdssc_get_services_free }, + { "_sdssc_suspend", (void **)&sdssc_suspend }, + { "_sdssc_convert_cluster_path", + (void **)&sdssc_convert_cluster_path }, + { "_sdssc_convert_ctd_path", + (void **)&sdssc_convert_ctd_path }, + { "_sdssc_convert_path_free", + (void **)&sdssc_convert_path_free }, + { "_sdssc_notify_service", (void **)&sdssc_notify_service }, + { "_sdssc_cm_nm2nid", (void **)&sdssc_cm_nm2nid }, + { "_sdssc_cm_sr_nm2nid", (void **)&sdssc_cm_sr_nm2nid }, + { "_sdssc_cm_nid2nm", (void **)&sdssc_cm_nid2nm }, + { "_sdssc_cm_sr_nid2nm", (void **)&sdssc_cm_sr_nid2nm }, + { "_sdssc_get_priv_ipaddr", (void **)&sdssc_get_priv_ipaddr }, + { (char *)0, (void **)0 } +}; + +static rval_e +just_dup_string(const char *source, char **dest) +{ + *dest = strdup(source); + return (SDSSC_OKAY); +} + +static void +free_dup_string(char *source) +{ + free(source); +} + +/* + * not_bound -- routine to always return NOT_BOUND + */ +static rval_e +not_bound(void) +{ + return (SDSSC_NOT_BOUND); +} + +/* + * not_bound_error -- routine to always return SDSSC_NOT_BOUND_ERROR since + * routine is not bound. This is used when using an older version + * of libsdssc that doesn't support MN disksets. When an MN specific + * routine is called (such as sdssc_mo_create_set) an SDSSC_NOT_BOUND_ERROR + * will be returned. + */ +static rval_e +not_bound_error(void) +{ + return (SDSSC_NOT_BOUND_ERROR); +} + + +/* + * set_common_routine -- set cluster interface routines to return NOT_BOUND + */ +static void +set_common_routine() +{ + func_table_p f; + + for (f = dl_table; f->fptr != (void *)0; f++) { + if (strcmp(f->fname, "_sdssc_convert_cluster_path") == 0) { + *f->fptr = (void *)&just_dup_string; + } else if (strcmp(f->fname, "_sdssc_free_convert_cluster_path") + == 0) { + *f->fptr = (void *)&free_dup_string; + } else { + *f->fptr = (void *)¬_bound; + } + } +} + +/* + * sdssc_bind_library -- entry point which resolves all cluster interface pts. 
+ */ +rval_e +sdssc_bind_library(void) +{ + void *dp; + int (*lb)(); + func_table_p ftp; + + /* + * If already bound then just return okay so this routine + * becomes idempotent. If this check isn't made then we'll + * fail when calling the "_bind_library" function because + * dcs_initialize() can only be called once. + */ + if (sdssc_version != 0) { + if ((void *)sdssc_version == (void *)not_bound) + return (SDSSC_NOT_BOUND); + else + return (SDSSC_OKAY); + } + + if ((dp = dlopen(SDSSC_PATH, RTLD_LAZY)) == NULL) { + set_common_routine(); + return (SDSSC_NOT_BOUND); + } else { + + /* + * Allow the binding library to initialize state if + * necessary. Currently this calls the DCS initialize() + * routine which checks to see if we're part of a cluster. + */ + if ((lb = (int (*)())dlsym(dp, "_bind_library")) != NULL) { + if (lb() != 0) { + set_common_routine(); + return (SDSSC_NOT_BOUND); + } + } + + /* + * Load 'em up. Pick up the function address and store + * the values in the global pointers for other routines + * to use. + */ + for (ftp = dl_table; ftp->fptr != (void *)0; ftp++) { + if ((*ftp->fptr = dlsym(dp, ftp->fname)) == NULL) { + + /* + * If old libsdssc library is there, then + * sdssc_mo_create_begin is not yet supported. + */ + if (strcmp(ftp->fname, + "sdssc_mo_create_begin")) { + *ftp->fptr = (void *)¬_bound_error; + continue; + } + /* + * If this routine fails to find a single + * entry point that it's expecting + * (except sdssc_mo_create_begin) then + * setup non-sdssc stubs routines + * as function pointers. 
+ */ + set_common_routine(); + return (SDSSC_ERROR); + } + } + + return (SDSSC_OKAY); + } +} diff --git a/usr/src/lib/lvm/libmeta/i386/Makefile b/usr/src/lib/lvm/libmeta/i386/Makefile new file mode 100644 index 0000000000..17c519db5c --- /dev/null +++ b/usr/src/lib/lvm/libmeta/i386/Makefile @@ -0,0 +1,31 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2005 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# + +include ../Makefile.com + +install debug: all $(ROOTLIBS) $(ROOTLINT) $(ROOTLINKS) diff --git a/usr/src/lib/lvm/libmeta/sparc/Makefile b/usr/src/lib/lvm/libmeta/sparc/Makefile new file mode 100644 index 0000000000..75eec28afb --- /dev/null +++ b/usr/src/lib/lvm/libmeta/sparc/Makefile @@ -0,0 +1,31 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 1998-2001 by Sun Microsystems, Inc. +# All rights reserved. +# +# ident "%Z%%M% %I% %E% SMI" +# + +include ../Makefile.com + +install debug: all $(ROOTLIBS) $(ROOTLINT) $(ROOTLINKS) diff --git a/usr/src/lib/lvm/libmeta/spec/Makefile b/usr/src/lib/lvm/libmeta/spec/Makefile new file mode 100644 index 0000000000..5e88f3ac7e --- /dev/null +++ b/usr/src/lib/lvm/libmeta/spec/Makefile @@ -0,0 +1,29 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +#ident "%Z%%M% %I% %E% SMI" +# +# Copyright (c) 2000 by Sun Microsystems, Inc. +# All rights reserved. 
+# +# lib/lvm/libmeta/spec/Makefile + +include $(SRC)/lib/Makefile.spec.arch diff --git a/usr/src/lib/lvm/libmeta/spec/Makefile.targ b/usr/src/lib/lvm/libmeta/spec/Makefile.targ new file mode 100644 index 0000000000..40ffb28073 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/spec/Makefile.targ @@ -0,0 +1,36 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +#ident "%Z%%M% %I% %E% SMI" +# +# Copyright (c) 2000 by Sun Microsystems, Inc. +# All rights reserved. +# +# lib/lvm/libmeta/spec/Makefile.targ + +LIBRARY = libmeta.a +VERS = .1 + +OBJECTS = meta.o + +TRANSCPP = + +SPECCPP = -I.. -I../../inc diff --git a/usr/src/lib/lvm/libmeta/spec/amd64/Makefile b/usr/src/lib/lvm/libmeta/spec/amd64/Makefile new file mode 100644 index 0000000000..c7d89e007c --- /dev/null +++ b/usr/src/lib/lvm/libmeta/spec/amd64/Makefile @@ -0,0 +1,46 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2004 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# + +.KEEP_STATE: + +# To enable apptrace, comment out the following line +DISABLE_APPTRACE= $(POUND_SIGN) + +include ../Makefile.targ + +# Add arch specific objects here +OBJECTS += + +include $(SRC)/lib/Makefile.lib + +# Uncomment the following if the linker complains +#amd64_C_PICFLAGS = $(amd64_C_BIGPICFLAGS) + +include $(SRC)/lib/Makefile.spec + +$(DISABLE_APPTRACE)install: $(ROOTABILIB64) diff --git a/usr/src/lib/lvm/libmeta/spec/i386/Makefile b/usr/src/lib/lvm/libmeta/spec/i386/Makefile new file mode 100644 index 0000000000..3fe06d99af --- /dev/null +++ b/usr/src/lib/lvm/libmeta/spec/i386/Makefile @@ -0,0 +1,47 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +#ident "%Z%%M% %I% %E% SMI" +# +# Copyright (c) 2000-2001 by Sun Microsystems, Inc. +# All rights reserved. +# +# lib/lvm/libmeta/spec/i386/Makefile + +.KEEP_STATE: + +# To enable apptrace, comment out the following line +DISABLE_APPTRACE= $(POUND_SIGN) + +include ../Makefile.targ + +# Add arch specific objects here +OBJECTS += + +include $(SRC)/lib/Makefile.lib + +# Uncomment the following if the linker complains +#i386_C_PICFLAGS = -K PIC + +include $(SRC)/lib/Makefile.spec + +$(DISABLE_APPTRACE)install: $(ROOTABILIB) diff --git a/usr/src/lib/lvm/libmeta/spec/meta.spec b/usr/src/lib/lvm/libmeta/spec/meta.spec new file mode 100644 index 0000000000..48d7d2b30e --- /dev/null +++ b/usr/src/lib/lvm/libmeta/spec/meta.spec @@ -0,0 +1,3699 @@ +# +# Copyright 2005 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +#pragma ident "%Z%%M% %I% %E% SMI" +# +# lib/lvm/libmeta/spec/meta.spec + +function meta_smf_enable +version SUNWprivate_1.1 +end + +function meta_smf_disable +version SUNWprivate_1.1 +end + +function meta_smf_getmask +version SUNWprivate_1.1 +end + +function meta_smf_isonline +version SUNWprivate_1.1 +end + +function meta_svm_sysevent +version SUNWprivate_1.1 +end + +function close_admin +version SUNWprivate_1.1 +end + +function meta_dev_ismeta +version SUNWprivate_1.1 +end + +function meta_get_nunits +version SUNWprivate_1.1 +end + +function metamakedev +version SUNWprivate_1.1 +end + +function meta_get_tstate +version SUNWprivate_1.1 +end + +function meta_expldev +version SUNWprivate_1.1 +end + +function meta_cmpldev +version SUNWprivate_1.1 +end + +function meta_getmajor +version SUNWprivate_1.1 +end + +function meta_getminor +version SUNWprivate_1.1 +end + +function open_admin +version SUNWprivate_1.1 +end + +function meta_concat_generic +version SUNWprivate_1.1 +end + +function meta_concat_parent +version SUNWprivate_1.1 +end + +function meta_check_driveinset +version SUNWprivate_1.1 +end + +function meta_check_drivemounted +version SUNWprivate_1.1 +end + +function meta_check_driveswapped +version SUNWprivate_1.1 +end + +function meta_check_inmeta +version SUNWprivate_1.1 +end + +function meta_check_inset +version SUNWprivate_1.1 +end + +function meta_check_root +version SUNWprivate_1.1 +end + +function meta_check_inuse +version SUNWprivate_1.1 +end + +function meta_imp_drvused +version SUNWprivate_1.1 +end + +function meta_check_overlap +version SUNWprivate_1.1 +end + +function meta_check_samedrive +version SUNWprivate_1.1 +end + +function meta_check_inreplica +version SUNWprivate_1.1 +end + +function meta_check_replica +version 
SUNWprivate_1.1 +end + +function meta_db_addsidenms +version SUNWprivate_1.1 +end + +function meta_db_attach +version SUNWprivate_1.1 +end + +function meta_db_delsidenm +version SUNWprivate_1.1 +end + +function meta_db_detach +version SUNWprivate_1.1 +end + +function meta_db_minreplica +version SUNWprivate_1.1 +end + +function meta_db_patch +version SUNWprivate_1.1 +end + +function meta_get_replica_names +version SUNWprivate_1.1 +end + +function meta_setup_db_locations +version SUNWprivate_1.1 +end + +function meta_sync_db_locations +version SUNWprivate_1.1 +end + +function meta_getdidminorbykey +version SUNWprivate_1.1 +end + +function meta_getdidbykey +version SUNWprivate_1.1 +end + +function meta_setdid +version SUNWprivate_1.1 +end + +function metafreereplicalist +version SUNWprivate_1.1 +end + +function metareplicalist +version SUNWprivate_1.1 +end + +function meta_db_balance +version SUNWprivate_1.1 +end + +function meta_create_non_dup_list +version SUNWprivate_1.1 +end + +function sdssc_add_hosts +version SUNWprivate_1.1 +end + +function sdssc_bind_library +version SUNWprivate_1.1 +end + +function sdssc_bindclusterdevs +version SUNWprivate_1.1 +end + +function sdssc_binddevs +version SUNWprivate_1.1 +end + +function sdssc_clnt_bind_devs +version SUNWprivate_1.1 +end + +function sdssc_clnt_proxy_cmd +version SUNWprivate_1.1 +end + +function sdssc_cm_nid2nm +version SUNWprivate_1.1 +end + +function sdssc_cm_nm2nid +version SUNWprivate_1.1 +end + +function sdssc_cm_sr_nid2nm +version SUNWprivate_1.1 +end + +function sdssc_cm_sr_nm2nid +version SUNWprivate_1.1 +end + +function sdssc_cmd_proxy +version SUNWprivate_1.1 +end + +function sdssc_convert_cluster_path +version SUNWprivate_1.1 +end + +function sdssc_convert_ctd_path +version SUNWprivate_1.1 +end + +function sdssc_convert_path_free +version SUNWprivate_1.1 +end + +function sdssc_create_begin +version SUNWprivate_1.1 +end + +function sdssc_mo_create_begin +version SUNWprivate_1.1 +end + +function 
sdssc_create_end +version SUNWprivate_1.1 +end + +function sdssc_delete_begin +version SUNWprivate_1.1 +end + +function sdssc_delete_end +version SUNWprivate_1.1 +end + +function sdssc_delete_hosts +version SUNWprivate_1.1 +end + +function sdssc_free_mdcerr_list +version SUNWprivate_1.1 +end + +function sdssc_freenodelist +version SUNWprivate_1.1 +end + +function sdssc_get_index +version SUNWprivate_1.1 +end + +function sdssc_get_primary_host +version SUNWprivate_1.1 +end + +function sdssc_get_priv_ipaddr +version SUNWprivate_1.1 +end + +function sdssc_get_services +version SUNWprivate_1.1 +end + +function sdssc_get_services_free +version SUNWprivate_1.1 +end + +function sdssc_getnodelist +version SUNWprivate_1.1 +end + +function sdssc_gettransportbynode +version SUNWprivate_1.1 +end + +function sdssc_notify_service +version SUNWprivate_1.1 +end + +function sdssc_property_get +version SUNWprivate_1.1 +end + +function sdssc_property_set +version SUNWprivate_1.1 +end + +function sdssc_suspend +version SUNWprivate_1.1 +end + +function sdssc_version +version SUNWprivate_1.1 +end + +function getdevstamp +version SUNWprivate_1.1 +end + +function setdevstamp +version SUNWprivate_1.1 +end + +function md_eprintf +version SUNWprivate_1.1 +end + +function meta_mc_log +version SUNWprivate_1.1 +end + +function md_logpfx +version SUNWprivate_1.1 +end + +function md_perror +version SUNWprivate_1.1 +end + +function mdclrerror +version SUNWprivate_1.1 +end + +function mdcomperror +version SUNWprivate_1.1 +end + +function mddeverror +version SUNWprivate_1.1 +end + +function mddserror +version SUNWprivate_1.1 +end + +function mde_perror +version SUNWprivate_1.1 +end + +function mde_sperror +version SUNWprivate_1.1 +end + +function mderror +version SUNWprivate_1.1 +end + +function mderrorextra +version SUNWprivate_1.1 +end + +function mdhserror +version SUNWprivate_1.1 +end + +function mdhsperror +version SUNWprivate_1.1 +end + +function mdmddberror +version SUNWprivate_1.1 +end + 
+function mdmderror +version SUNWprivate_1.1 +end + +function mdrpccreateerror +version SUNWprivate_1.1 +end + +function mdrpcerror +version SUNWprivate_1.1 +end + +function mdstealerror +version SUNWprivate_1.1 +end + +function mdsyserror +version SUNWprivate_1.1 +end + +function mduseerror +version SUNWprivate_1.1 +end + +function metaioctl +version SUNWprivate_1.1 +end + +function meta_getalldevs +version SUNWprivate_1.1 +end + +function meta_getdevs +version SUNWprivate_1.1 +end + +function meta_getvtoc +version SUNWprivate_1.1 +end + +function meta_setvtoc +version SUNWprivate_1.1 +end + +function hs_state_to_name +version SUNWprivate_1.1 +end + +function meta_check_hotspare +version SUNWprivate_1.1 +end + +function meta_check_hsp +version SUNWprivate_1.1 +end + +function meta_check_inhsp +version SUNWprivate_1.1 +end + +function meta_create_hsp +version SUNWprivate_1.1 +end + +function meta_free_hsp +version SUNWprivate_1.1 +end + +function meta_get_hsp +version SUNWprivate_1.1 +end + +function meta_get_hsp_common +version SUNWprivate_1.1 +end + +function meta_get_hsp_names +version SUNWprivate_1.1 +end + +function meta_hs_add +version SUNWprivate_1.1 +end + +function meta_hs_delete +version SUNWprivate_1.1 +end + +function meta_hs_enable +version SUNWprivate_1.1 +end + +function meta_hs_replace +version SUNWprivate_1.1 +end + +function meta_hsp_print +version SUNWprivate_1.1 +end + +function meta_hsp_reset +version SUNWprivate_1.1 +end + +function meta_init_hsp +version SUNWprivate_1.1 +end + +function meta_invalidate_hsp +version SUNWprivate_1.1 +end + +function metachkhsp +version SUNWprivate_1.1 +end + +function meta_adjust_geom +version SUNWprivate_1.1 +end + +function meta_cook_syntax +version SUNWprivate_1.1 +end + +function meta_init_name +version SUNWprivate_1.1 +end + +function meta_init_make_device +version SUNWprivate_1.1 +end + +function meta_setup_geom +version SUNWprivate_1.1 +end + +function parse_interlace +version SUNWprivate_1.1 +end + 
+function close_mnttab +version SUNWprivate_1.1 +end + +function open_mnttab +version SUNWprivate_1.1 +end + +function meta_update_md_cf +version SUNWprivate_1.1 +end + +function med_errnum_to_str +version SUNWprivate_1.1 +end + +function Calloc +version SUNWprivate_1.1 +end + +function Free +version SUNWprivate_1.1 +end + +function Malloc +version SUNWprivate_1.1 +end + +function Realloc +version SUNWprivate_1.1 +end + +function Strdup +version SUNWprivate_1.1 +end + +function Zalloc +version SUNWprivate_1.1 +end + +function cl_get_setkey +version SUNWprivate_1.1 +end + +function cl_set_setkey +version SUNWprivate_1.1 +end + +function clnt_add_drv_sidenms +version SUNWprivate_1.1 +end + +function clnt_adddrvs +version SUNWprivate_1.1 +end + +function clnt_addhosts +version SUNWprivate_1.1 +end + +function clnt_createset +version SUNWprivate_1.1 +end + +function clnt_del_drv_sidenms +version SUNWprivate_1.1 +end + +function clnt_deldrvs +version SUNWprivate_1.1 +end + +function clnt_delhosts +version SUNWprivate_1.1 +end + +function clnt_delset +version SUNWprivate_1.1 +end + +function clnt_devinfo +version SUNWprivate_1.1 +end + +function clnt_drvused +version SUNWprivate_1.1 +end + +function clnt_devinfo_by_devid +version SUNWprivate_1.1 +end + +function clnt_getset +version SUNWprivate_1.1 +end + +function clnt_mngetset +version SUNWprivate_1.1 +end + +function clnt_gtimeout +version SUNWprivate_1.1 +end + +function clnt_hostname +version SUNWprivate_1.1 +end + +function clnt_lock_set +version SUNWprivate_1.1 +end + +function clnt_nullproc +version SUNWprivate_1.1 +end + +function clnt_ownset +version SUNWprivate_1.1 +end + +function clnt_setnameok +version SUNWprivate_1.1 +end + +function clnt_setnumbusy +version SUNWprivate_1.1 +end + +function clnt_stimeout +version SUNWprivate_1.1 +end + +function clnt_unlock_set +version SUNWprivate_1.1 +end + +function clnt_upd_dr_dbinfo +version SUNWprivate_1.1 +end + +function clnt_upd_dr_flags +version SUNWprivate_1.1 
+end + +function clnt_upd_sr_flags +version SUNWprivate_1.1 +end + +function clnt_upd_nr_flags +version SUNWprivate_1.1 +end + +function clnt_updmeds +version SUNWprivate_1.1 +end + +function meta_conv_drvdesc_new2old +version SUNWprivate_1.1 +end + +function meta_conv_drvdesc_old2new +version SUNWprivate_1.1 +end + +function meta_conv_drvname_new2old +version SUNWprivate_1.1 +end + +function meta_conv_drvname_old2new +version SUNWprivate_1.1 +end + +function alloc_olddrvdesc +version SUNWprivate_1.1 +end + +function alloc_newdrvdesc +version SUNWprivate_1.1 +end + +function free_olddrvdesc +version SUNWprivate_1.1 +end + +function free_newdrvdesc +version SUNWprivate_1.1 +end + +function meta_get_devid +version SUNWprivate_1.1 +end + +function meta_print_devid +version SUNWprivate_1.1 +end + +function clnt_mncreateset +version SUNWprivate_1.1 +end + +function clnt_joinset +version SUNWprivate_1.1 +end + +function clnt_mnsetmaster +version SUNWprivate_1.1 +end + +function clnt_mn_mirror_resync_all +version SUNWprivate_1.1 +end + +function clnt_mn_sp_update_abr +version SUNWprivate_1.1 +end + +function free_sr +version SUNWprivate_1.1 +end + +function short_circuit_getset +version SUNWprivate_1.1 +end + +function commitset +version SUNWprivate_1.1 +end + +function dr_cache_add +version SUNWprivate_1.1 +end + +function dr_cache_del +version SUNWprivate_1.1 +end + +function mnnr_cache_add +version SUNWprivate_1.1 +end + +function mnnr_cache_del +version SUNWprivate_1.1 +end + +function drdup +version SUNWprivate_1.1 +end + +function get_db_rec +version SUNWprivate_1.1 +end + +function get_ur_rec +version SUNWprivate_1.1 +end + +function metad_getsetbyname +version SUNWprivate_1.1 +end + +function metad_getsetbynum +version SUNWprivate_1.1 +end + +function resnarf_set +version SUNWprivate_1.1 +end + +function metad_isautotakebyname +version SUNWprivate_1.1 +end + +function metad_isautotakebynum +version SUNWprivate_1.1 +end + +function s_delrec +version SUNWprivate_1.1 
+end + +function s_delset +version SUNWprivate_1.1 +end + +function s_ownset +version SUNWprivate_1.1 +end + +function set_snarf +version SUNWprivate_1.1 +end + +function setdup +version SUNWprivate_1.1 +end + +function mnsetdup +version SUNWprivate_1.1 +end + +function sr_cache_add +version SUNWprivate_1.1 +end + +function sr_cache_del +version SUNWprivate_1.1 +end + +function sr_cache_flush +version SUNWprivate_1.1 +end + +function sr_cache_flush_setno +version SUNWprivate_1.1 +end + +function sr_validate +version SUNWprivate_1.1 +end + +function sr_del_drv +version SUNWprivate_1.1 +end + +function clnt_med_get_data +version SUNWprivate_1.1 +end + +function clnt_med_get_rec +version SUNWprivate_1.1 +end + +function clnt_med_hostname +version SUNWprivate_1.1 +end + +function clnt_med_null +version SUNWprivate_1.1 +end + +function clnt_med_upd_data +version SUNWprivate_1.1 +end + +function clnt_med_upd_rec +version SUNWprivate_1.1 +end + +function meddstealerror +version SUNWprivate_1.1 +end + +function meta_h2hi +version SUNWprivate_1.1 +end + +function meta_hi2h +version SUNWprivate_1.1 +end + +function meta_med_hnm2ip +version SUNWprivate_1.1 +end + +function setup_med_cfg +version SUNWprivate_1.1 +end + +function defmhiargs +version SUNWprivate_1.1 +end + +function meta_drive_to_disk_status_list +version SUNWprivate_1.1 +end + +function meta_free_disk_status_list +version SUNWprivate_1.1 +end + +function meta_free_drive_info_list +version SUNWprivate_1.1 +end + +function meta_get_drive_names +version SUNWprivate_1.1 +end + +function meta_list_disks +version SUNWprivate_1.1 +end + +function meta_imp_set +version SUNWprivate_1.1 +end + +function meta_list_drives +version SUNWprivate_1.1 +end + +function meta_get_set_info +version SUNWprivate_1.1 +end + +function meta_prune_cnames +version SUNWprivate_1.1 +end + +function meta_rel_own +version SUNWprivate_1.1 +end + +function meta_status_own +version SUNWprivate_1.1 +end + +function meta_take_own +version 
SUNWprivate_1.1 +end + +function mhstealerror +version SUNWprivate_1.1 +end + +function rel_own_bydd +version SUNWprivate_1.1 +end + +function tk_own_bydd +version SUNWprivate_1.1 +end + +function meta_check_inmirror +version SUNWprivate_1.1 +end + +function meta_check_mirror +version SUNWprivate_1.1 +end + +function meta_check_submirror +version SUNWprivate_1.1 +end + +function meta_create_mirror +version SUNWprivate_1.1 +end + +function meta_free_mirror +version SUNWprivate_1.1 +end + +function meta_get_mirror +version SUNWprivate_1.1 +end + +function meta_get_mirror_names +version SUNWprivate_1.1 +end + +function meta_init_mirror +version SUNWprivate_1.1 +end + +function meta_mirror_anycomp_is_err +version SUNWprivate_1.1 +end + +function meta_mirror_attach +version SUNWprivate_1.1 +end + +function meta_mirror_detach +version SUNWprivate_1.1 +end + +function meta_mirror_enable +version SUNWprivate_1.1 +end + +function meta_mirror_get_params +version SUNWprivate_1.1 +end + +function meta_mirror_offline +version SUNWprivate_1.1 +end + +function meta_mirror_online +version SUNWprivate_1.1 +end + +function meta_mirror_print +version SUNWprivate_1.1 +end + +function meta_mirror_replace +version SUNWprivate_1.1 +end + +function meta_mirror_reset +version SUNWprivate_1.1 +end + +function meta_mirror_set_params +version SUNWprivate_1.1 +end + +function meta_print_mirror_options +version SUNWprivate_1.1 +end + +function name_to_pass_num +version SUNWprivate_1.1 +end + +function name_to_rd_opt +version SUNWprivate_1.1 +end + +function name_to_wr_opt +version SUNWprivate_1.1 +end + +function rd_opt_to_name +version SUNWprivate_1.1 +end + +function sm_state_to_action +version SUNWprivate_1.1 +end + +function sm_state_to_name +version SUNWprivate_1.1 +end + +function wr_opt_to_name +version SUNWprivate_1.1 +end + +function meta_mirror_resync +version SUNWprivate_1.1 +end + +function meta_mirror_resync_all +version SUNWprivate_1.1 +end + +function meta_mn_mirror_resync_all 
+version SUNWprivate_1.1 +end + +function meta_mirror_resync_kill_all +version SUNWprivate_1.1 +end + +function meta_mirror_resync_block_all +version SUNWprivate_1.1 +end + +function meta_mirror_resync_unblock_all +version SUNWprivate_1.1 +end + +function meta_mirror_resync_unblock +version SUNWprivate_1.1 +end + +function meta_mirror_resync_kill +version SUNWprivate_1.1 +end + +function meta_get_mountp +version SUNWprivate_1.1 +end + +function blkname +version SUNWprivate_1.1 +end + +function get_devname +version SUNWprivate_1.1 +end + +function get_hspname +version SUNWprivate_1.1 +end + +function get_mdname +version SUNWprivate_1.1 +end + +function is_hspname +version SUNWprivate_1.1 +end + +function sr2setdesc +version SUNWprivate_1.1 +end + +function is_metaname +version SUNWprivate_1.1 +end + +function meta_canonicalize +version SUNWprivate_1.1 +end + +function meta_get_hotspare_names +version SUNWprivate_1.1 +end + +function meta_getdev +version SUNWprivate_1.1 +end + +function metachkcomp +version SUNWprivate_1.1 +end + +function metachkdisk +version SUNWprivate_1.1 +end + +function metachkmeta +version SUNWprivate_1.1 +end + +function metadevname +version SUNWprivate_1.1 +end + +function metadiskname +version SUNWprivate_1.1 +end + +function metadrivename +version SUNWprivate_1.1 +end + +function metadrivenamelist +version SUNWprivate_1.1 +end + +function metadrivenamelist_append +version SUNWprivate_1.1 +end + +function meta_drivenamelist_append_wrapper +version SUNWprivate_1.1 +end + +function metafakesetname +version SUNWprivate_1.1 +end + +function metaflushnames +version SUNWprivate_1.1 +end + +function metaflushsetname +version SUNWprivate_1.1 +end + +function metaflushsidenames +version SUNWprivate_1.1 +end + +function metafreedrivename +version SUNWprivate_1.1 +end + +function metafreedrivenamelist +version SUNWprivate_1.1 +end + +function metafreehspnamelist +version SUNWprivate_1.1 +end + +function metafreenamelist +version SUNWprivate_1.1 +end + 
+function metaget_setdesc +version SUNWprivate_1.1 +end + +function metahsphspname +version SUNWprivate_1.1 +end + +function metahspname +version SUNWprivate_1.1 +end + +function metahspnamelist +version SUNWprivate_1.1 +end + +function metahspnamelist_append +version SUNWprivate_1.1 +end + +function metaislocalset +version SUNWprivate_1.1 +end + +function metaismeta +version SUNWprivate_1.1 +end + +function metaissameset +version SUNWprivate_1.1 +end + +function metakeyname +version SUNWprivate_1.1 +end + +function metamnumname +version SUNWprivate_1.1 +end + +function metaname +version SUNWprivate_1.1 +end + +function metaname_fast +version SUNWprivate_1.1 +end + +function metanamelist +version SUNWprivate_1.1 +end + +function metanamelist_append +version SUNWprivate_1.1 +end + +function metasetname +version SUNWprivate_1.1 +end + +function metasetnosetname +version SUNWprivate_1.1 +end + +function metaslicename +version SUNWprivate_1.1 +end + +function ctlr_cache_add +version SUNWprivate_1.1 +end + +function ctlr_cache_look +version SUNWprivate_1.1 +end + +function getdrvnode +version SUNWprivate_1.1 +end + +function meta_free_unit +version SUNWprivate_1.1 +end + +function meta_get_mdunit +version SUNWprivate_1.1 +end + +function meta_get_unit +version SUNWprivate_1.1 +end + +function meta_invalidate_name +version SUNWprivate_1.1 +end + +function meta_isopen +version SUNWprivate_1.1 +end + +function meta_match_enclosure +version SUNWprivate_1.1 +end + +function metaflushctlrcache +version SUNWprivate_1.1 +end + +function metafreevtoc +version SUNWprivate_1.1 +end + +function metagetcinfo +version SUNWprivate_1.1 +end + +function metagetdevicesname +version SUNWprivate_1.1 +end + +function metagetgeom +version SUNWprivate_1.1 +end + +function metagetlabel +version SUNWprivate_1.1 +end + +function metagetmiscname +version SUNWprivate_1.1 +end + +function metagetpartno +version SUNWprivate_1.1 +end + +function metagetset +version SUNWprivate_1.1 +end + +function 
metagetsize +version SUNWprivate_1.1 +end + +function metagetstart +version SUNWprivate_1.1 +end + +function metagetvtoc +version SUNWprivate_1.1 +end + +function metahasmddb +version SUNWprivate_1.1 +end + +function metasetvtoc +version SUNWprivate_1.1 +end + +function add_key_name +version SUNWprivate_1.1 +end + +function add_name +version SUNWprivate_1.1 +end + +function del_key_name +version SUNWprivate_1.1 +end + +function del_key_names +version SUNWprivate_1.1 +end + +function del_name +version SUNWprivate_1.1 +end + +function meta_getnmbykey +version SUNWprivate_1.1 +end + +function meta_getnmentbydev +version SUNWprivate_1.1 +end + +function meta_getnmentbykey +version SUNWprivate_1.1 +end + +function evdrv2evlib_typetab +version SUNWprivate_1.1 +end + +function meta_notify_createq +version SUNWprivate_1.1 +end + +function meta_notify_deleteq +version SUNWprivate_1.1 +end + +function meta_notify_doputev +version SUNWprivate_1.1 +end + +function meta_notify_flushq +version SUNWprivate_1.1 +end + +function meta_notify_freeevlist +version SUNWprivate_1.1 +end + +function meta_notify_getev +version SUNWprivate_1.1 +end + +function meta_notify_getevlist +version SUNWprivate_1.1 +end + +function meta_notify_listq +version SUNWprivate_1.1 +end + +function meta_notify_putev +version SUNWprivate_1.1 +end + +function meta_notify_putevlist +version SUNWprivate_1.1 +end + +function meta_notify_sendev +version SUNWprivate_1.1 +end + +function meta_notify_validq +version SUNWprivate_1.1 +end + +function tag2obj_typetab +version SUNWprivate_1.1 +end + +function meta_patch_fsdev +version SUNWprivate_1.1 +end + +function meta_patch_swapdev +version SUNWprivate_1.1 +end + +function meta_patch_vfstab +version SUNWprivate_1.1 +end + +function meta_patch_rootdev +version SUNWprivate_1.1 +end + +function meta_prbits +version SUNWprivate_1.1 +end + +function meta_print_all +version SUNWprivate_1.1 +end + +function meta_print_name +version SUNWprivate_1.1 +end + +function 
meta_print_time +version SUNWprivate_1.1 +end + +function meta_print_hrtime +version SUNWprivate_1.1 +end + +function meta_check_column +version SUNWprivate_1.1 +end + +function meta_check_inraid +version SUNWprivate_1.1 +end + +function meta_check_raid +version SUNWprivate_1.1 +end + +function meta_create_raid +version SUNWprivate_1.1 +end + +function meta_default_raid_interlace +version SUNWprivate_1.1 +end + +function meta_free_raid +version SUNWprivate_1.1 +end + +function meta_get_raid_common +version SUNWprivate_1.1 +end + +function meta_get_raid +version SUNWprivate_1.1 +end + +function meta_get_raid_names +version SUNWprivate_1.1 +end + +function meta_init_raid +version SUNWprivate_1.1 +end + +function meta_print_raid_options +version SUNWprivate_1.1 +end + +function meta_raid_anycomp_is_err +version SUNWprivate_1.1 +end + +function meta_raid_attach +version SUNWprivate_1.1 +end + +function meta_raid_check_interlace +version SUNWprivate_1.1 +end + +function meta_raid_enable +version SUNWprivate_1.1 +end + +function meta_raid_get_params +version SUNWprivate_1.1 +end + +function meta_raid_print +version SUNWprivate_1.1 +end + +function meta_raid_regen_byname +version SUNWprivate_1.1 +end + +function meta_raid_replace +version SUNWprivate_1.1 +end + +function meta_raid_reset +version SUNWprivate_1.1 +end + +function meta_raid_set_params +version SUNWprivate_1.1 +end + +function meta_raid_state_cnt +version SUNWprivate_1.1 +end + +function meta_raid_valid +version SUNWprivate_1.1 +end + +function raid_col_state_to_name +version SUNWprivate_1.1 +end + +function raid_state_to_action +version SUNWprivate_1.1 +end + +function raid_state_to_name +version SUNWprivate_1.1 +end + +function meta_raid_resync +version SUNWprivate_1.1 +end + +function meta_raid_resync_all +version SUNWprivate_1.1 +end + +function meta_exchange +version SUNWprivate_1.1 +end + +function meta_rename +version SUNWprivate_1.1 +end + +function meta_enable_byname +version SUNWprivate_1.1 +end + 
+function meta_replace +version SUNWprivate_1.1 +end + +function meta_replace_byname +version SUNWprivate_1.1 +end + +function meta_reset +version SUNWprivate_1.1 +end + +function meta_reset_all +version SUNWprivate_1.1 +end + +function meta_reset_by_name +version SUNWprivate_1.1 +end + +function meta_resync_all +version SUNWprivate_1.1 +end + +function meta_resync_byname +version SUNWprivate_1.1 +end + +function do_owner_ioctls +version SUNWprivate_1.1 +end + +function commd_get_verbosity +version SUNWprivate_1.1 +end + +function commd_get_outfile +version SUNWprivate_1.1 +end + +function get_max_meds +version SUNWprivate_1.1 +end + +function get_max_sets +version SUNWprivate_1.1 +end + +function getmyside +version SUNWprivate_1.1 +end + +function getsetbyname +version SUNWprivate_1.1 +end + +function getsetbynum +version SUNWprivate_1.1 +end + +function meta_check_drive_inuse +version SUNWprivate_1.1 +end + +function meta_check_ownership +version SUNWprivate_1.1 +end + +function meta_check_ownership_on_host +version SUNWprivate_1.1 +end + +function meta_get_reserved_names +version SUNWprivate_1.1 +end + +function meta_getnextside_devinfo +version SUNWprivate_1.1 +end + +function meta_is_drive_in_anyset +version SUNWprivate_1.1 +end + +function meta_is_drive_in_thisset +version SUNWprivate_1.1 +end + +function meta_set_balance +version SUNWprivate_1.1 +end + +function meta_set_destroy +version SUNWprivate_1.1 +end + +function meta_set_purge +version SUNWprivate_1.1 +end + +function meta_set_query +version SUNWprivate_1.1 +end + +function metadrivename_withdrkey +version SUNWprivate_1.1 +end + +function metafreedrivedesc +version SUNWprivate_1.1 +end + +function metaget_drivedesc +version SUNWprivate_1.1 +end + +function metaget_drivedesc_fromnamelist +version SUNWprivate_1.1 +end + +function metaget_drivedesc_sideno +version SUNWprivate_1.1 +end + +function metaget_setownership +version SUNWprivate_1.1 +end + +function mynode +version SUNWprivate_1.1 +end + 
+function strinlst +version SUNWprivate_1.1 +end + +function meta_make_sidenmlist +version SUNWprivate_1.1 +end + +function meta_set_adddrives +version SUNWprivate_1.1 +end + +function meta_set_deletedrives +version SUNWprivate_1.1 +end + +function meta_set_checkname +version SUNWprivate_1.1 +end + +function meta_set_addhosts +version SUNWprivate_1.1 +end + +function meta_set_deletehosts +version SUNWprivate_1.1 +end + +function meta_set_addmeds +version SUNWprivate_1.1 +end + +function meta_set_deletemeds +version SUNWprivate_1.1 +end + +function meta_set_auto_take +version SUNWprivate_1.1 +end + +function checkdrive_onnode +version SUNWprivate_1.1 +end + +function getnodeside +version SUNWprivate_1.1 +end + +function halt_set +version SUNWprivate_1.1 +end + +function metadrivedesc_append +version SUNWprivate_1.1 +end + +function nodehasset +version SUNWprivate_1.1 +end + +function nodesuniq +version SUNWprivate_1.1 +end + +function own_set +version SUNWprivate_1.1 +end + +function resync_genid +version SUNWprivate_1.1 +end + +function setup_db_bydd +version SUNWprivate_1.1 +end + +function snarf_set +version SUNWprivate_1.1 +end + +function meta_set_release +version SUNWprivate_1.1 +end + +function meta_set_take +version SUNWprivate_1.1 +end + +function meta_set_join +version SUNWprivate_1.1 +end + +function meta_set_withdraw +version SUNWprivate_1.1 +end + +function meta_update_mb +version SUNWprivate_1.1 +end + +function allsigs +version SUNWprivate_1.1 +end + +function md_daemonize +version SUNWprivate_1.1 +end + +function md_exit +version SUNWprivate_1.1 +end + +function md_got_sig +version SUNWprivate_1.1 +end + +function setup_mc_log +version SUNWprivate_1.1 +end + +function md_init +version SUNWprivate_1.1 +end + +function md_init_nosig +version SUNWprivate_1.1 +end + +function md_init_daemon +version SUNWprivate_1.1 +end + +function md_post_sig +version SUNWprivate_1.1 +end + +function md_rb_sig_handling_off +version SUNWprivate_1.1 +end + +function 
md_rb_sig_handling_on +version SUNWprivate_1.1 +end + +function md_which_sig +version SUNWprivate_1.1 +end + +function meta_lock +version SUNWprivate_1.1 +end + +function meta_lock_name +version SUNWprivate_1.1 +end + +function meta_lock_nowait +version SUNWprivate_1.1 +end + +function meta_lock_status +version SUNWprivate_1.1 +end + +function meta_unlock +version SUNWprivate_1.1 +end + +function metalogfp +version SUNWprivate_1.1 +end + +function metasyslog +version SUNWprivate_1.1 +end + +function verbosity +version SUNWprivate_1.1 +end + +function start_time +version SUNWprivate_1.1 +end + +function myname +version SUNWprivate_1.1 +end + +function procsigs +version SUNWprivate_1.1 +end + +function rb_test +version SUNWprivate_1.1 +end + +function meta_stat +version SUNWprivate_1.1 +end + +function metaflushstatcache +version SUNWprivate_1.1 +end + +function comp_state_to_name +version SUNWprivate_1.1 +end + +function meta_check_component +version SUNWprivate_1.1 +end + +function meta_check_instripe +version SUNWprivate_1.1 +end + +function meta_check_stripe +version SUNWprivate_1.1 +end + +function meta_create_stripe +version SUNWprivate_1.1 +end + +function meta_default_stripe_interlace +version SUNWprivate_1.1 +end + +function meta_find_erred_comp +version SUNWprivate_1.1 +end + +function meta_free_stripe +version SUNWprivate_1.1 +end + +function meta_get_stripe_common +version SUNWprivate_1.1 +end + +function meta_get_stripe +version SUNWprivate_1.1 +end + +function meta_get_stripe_names +version SUNWprivate_1.1 +end + +function meta_init_stripe +version SUNWprivate_1.1 +end + +function meta_print_stripe_options +version SUNWprivate_1.1 +end + +function meta_recover_sp +version SUNWprivate_1.1 +end + +function meta_sp_issp +version SUNWprivate_1.1 +end + +function meta_sp_reset_component +version SUNWprivate_1.1 +end + +function meta_sp_attach +version SUNWprivate_1.1 +end + +function meta_sp_update_abr +version SUNWprivate_1.1 +end + +function 
meta_mn_sp_update_abr +version SUNWprivate_1.1 +end + +function meta_get_sp_common +version SUNWprivate_1.1 +end + +function meta_get_sp +version SUNWprivate_1.1 +end + +function meta_free_sp +version SUNWprivate_1.1 +end + +function meta_get_sp_names +version SUNWprivate_1.1 +end + +function meta_sp_can_create_sps +version SUNWprivate_1.1 +end + +function meta_sp_can_create_sps_on_drive +version SUNWprivate_1.1 +end + +function meta_sp_get_free_space +version SUNWprivate_1.1 +end + +function meta_sp_get_free_space_on_drive +version SUNWprivate_1.1 +end + +function meta_sp_get_number_of_possible_sps +version SUNWprivate_1.1 +end + +function meta_sp_get_number_of_possible_sps_on_drive +version SUNWprivate_1.1 +end + +function meta_sp_get_possible_sp_size +version SUNWprivate_1.1 +end + +function meta_sp_get_possible_sp_size_on_drive +version SUNWprivate_1.1 +end + +function meta_sp_parsesize +version SUNWprivate_1.1 +end + +function meta_stripe_anycomp_is_err +version SUNWprivate_1.1 +end + +function meta_stripe_attach +version SUNWprivate_1.1 +end + +function meta_stripe_check_interlace +version SUNWprivate_1.1 +end + +function meta_stripe_get_params +version SUNWprivate_1.1 +end + +function meta_stripe_print +version SUNWprivate_1.1 +end + +function meta_stripe_replace +version SUNWprivate_1.1 +end + +function meta_stripe_reset +version SUNWprivate_1.1 +end + +function meta_stripe_set_params +version SUNWprivate_1.1 +end + +function meta_systemfile_append_mddb +version SUNWprivate_1.1 +end + +function meta_systemfile_append_mdroot +version SUNWprivate_1.1 +end + +function meta_systemfile_copy +version SUNWprivate_1.1 +end + +function meta_tab_find +version SUNWprivate_1.1 +end + +function meta_tab_free +version SUNWprivate_1.1 +end + +function meta_tab_parse +version SUNWprivate_1.1 +end + +function meta_check_intrans +version SUNWprivate_1.1 +end + +function meta_check_log +version SUNWprivate_1.1 +end + +function meta_check_master +version SUNWprivate_1.1 +end + 
+function meta_free_trans +version SUNWprivate_1.1 +end + +function meta_get_trans +version SUNWprivate_1.1 +end + +function meta_get_trans_common +version SUNWprivate_1.1 +end + +function meta_get_trans_names +version SUNWprivate_1.1 +end + +function meta_logs_print +version SUNWprivate_1.1 +end + +function meta_trans_detach +version SUNWprivate_1.1 +end + +function meta_trans_print +version SUNWprivate_1.1 +end + +function meta_trans_replace +version SUNWprivate_1.1 +end + +function meta_trans_reset +version SUNWprivate_1.1 +end + +function mt_flags_to_action +version SUNWprivate_1.1 +end + +function mt_flags_to_name +version SUNWprivate_1.1 +end + +function mt_l_error_to_action +version SUNWprivate_1.1 +end + +function mt_l_error_to_name +version SUNWprivate_1.1 +end + +function transstats +version SUNWprivate_1.1 +end + +function meta_getuserflags +version SUNWprivate_1.1 +end + +function meta_setuserflags +version SUNWprivate_1.1 +end + +function metarpcclose +version SUNWprivate_1.1 +end + +function metarpccloseall +version SUNWprivate_1.1 +end + +function metarpcopen +version SUNWprivate_1.1 +end + +function splicename +version SUNWprivate_1.1 +end + +function splitname +version SUNWprivate_1.1 +end + +function crcfreetab +version SUNWprivate_1.1 +end + +function crcfunc +version SUNWprivate_1.1 +end + +function mdnullerror +version SUNWprivate_1.1 +end + +function xdr_comp_state_t +version SUNWprivate_1.1 +end + +function xdr_comp_t +version SUNWprivate_1.1 +end + +function xdr_diskaddr_t +version SUNWprivate_1.1 +end + +function xdr_hotspare_states_t +version SUNWprivate_1.1 +end + +function xdr_hs_t +version SUNWprivate_1.1 +end + +function xdr_hsp_t +version SUNWprivate_1.1 +end + +function xdr_md_common_t +version SUNWprivate_1.1 +end + +function xdr_md_comp_errno_t +version SUNWprivate_1.1 +end + +function xdr_md_comp_error_t +version SUNWprivate_1.1 +end + +function xdr_md_comp_t +version SUNWprivate_1.1 +end + +function xdr_md_dev_errno_t +version 
SUNWprivate_1.1 +end + +function xdr_md_dev_error_t +version SUNWprivate_1.1 +end + +function xdr_md_drive_desc +version SUNWprivate_1.1 +end + +function xdr_md_drive_record +version SUNWprivate_1.1 +end + +function xdr_md_ds_errno_t +version SUNWprivate_1.1 +end + +function xdr_md_ds_error_t +version SUNWprivate_1.1 +end + +function xdr_md_errclass_t +version SUNWprivate_1.1 +end + +function xdr_md_error_info_t +version SUNWprivate_1.1 +end + +function xdr_md_error_t +version SUNWprivate_1.1 +end + +function xdr_md_hs_errno_t +version SUNWprivate_1.1 +end + +function xdr_md_hs_error_t +version SUNWprivate_1.1 +end + +function xdr_md_hs_t +version SUNWprivate_1.1 +end + +function xdr_md_hsp_errno_t +version SUNWprivate_1.1 +end + +function xdr_md_hsp_error_t +version SUNWprivate_1.1 +end + +function xdr_md_hsp_t +version SUNWprivate_1.1 +end + +function xdr_md_md_errno_t +version SUNWprivate_1.1 +end + +function xdr_md_md_error_t +version SUNWprivate_1.1 +end + +function xdr_md_mddb_errno_t +version SUNWprivate_1.1 +end + +function xdr_md_mddb_error_t +version SUNWprivate_1.1 +end + +function xdr_md_mirror_t +version SUNWprivate_1.1 +end + +function xdr_md_name_prefix +version SUNWprivate_1.1 +end + +function xdr_md_name_suffix +version SUNWprivate_1.1 +end + +function xdr_md_parent_t +version SUNWprivate_1.1 +end + +function xdr_md_raid_t +version SUNWprivate_1.1 +end + +function xdr_md_raidcol_t +version SUNWprivate_1.1 +end + +function xdr_md_replica_t +version SUNWprivate_1.1 +end + +function xdr_md_replica_recerr_t +version SUNWprivate_1.1 +end + +function xdr_md_replicalist_t +version SUNWprivate_1.1 +end + +function xdr_md_riflags_t +version SUNWprivate_1.1 +end + +function xdr_md_row_t +version SUNWprivate_1.1 +end + +function xdr_md_rpc_error_t +version SUNWprivate_1.1 +end + +function xdr_md_set_desc +version SUNWprivate_1.1 +end + +function xdr_md_set_record +version SUNWprivate_1.1 +end + +function xdr_md_setkey_t +version SUNWprivate_1.1 +end + 
+function xdr_md_shared_t +version SUNWprivate_1.1 +end + +function xdr_md_splitname +version SUNWprivate_1.1 +end + +function xdr_md_stackcap_t +version SUNWprivate_1.1 +end + +function xdr_md_status_t +version SUNWprivate_1.1 +end + +function xdr_md_stripe_t +version SUNWprivate_1.1 +end + +function xdr_md_submirror_t +version SUNWprivate_1.1 +end + +function xdr_md_sys_error_t +version SUNWprivate_1.1 +end + +function xdr_md_trans_t +version SUNWprivate_1.1 +end + +function xdr_md_types_t +version SUNWprivate_1.1 +end + +function xdr_md_ur_get_cmd_t +version SUNWprivate_1.1 +end + +function xdr_md_use_errno_t +version SUNWprivate_1.1 +end + +function xdr_md_use_error_t +version SUNWprivate_1.1 +end + +function xdr_md_void_errno_t +version SUNWprivate_1.1 +end + +function xdr_md_void_error_t +version SUNWprivate_1.1 +end + +function xdr_mdcinfo_t +version SUNWprivate_1.1 +end + +function xdr_mddb_cfgcmd_t +version SUNWprivate_1.1 +end + +function xdr_mddb_recstatus_t +version SUNWprivate_1.1 +end + +function xdr_mddb_type_t +version SUNWprivate_1.1 +end + +function xdr_mddb_usercmd_t +version SUNWprivate_1.1 +end + +function xdr_mddb_userrec_t +version SUNWprivate_1.1 +end + +function xdr_mddrivename_t +version SUNWprivate_1.1 +end + +function xdr_mddrivenamelist_t +version SUNWprivate_1.1 +end + +function xdr_mdgeom_t +version SUNWprivate_1.1 +end + +function xdr_mdhspname_t +version SUNWprivate_1.1 +end + +function xdr_mdhspnamelist_t +version SUNWprivate_1.1 +end + +function xdr_mdname_t +version SUNWprivate_1.1 +end + +function xdr_mdnamelist_t +version SUNWprivate_1.1 +end + +function xdr_mdnmtype_t +version SUNWprivate_1.1 +end + +function xdr_mdpart_t +version SUNWprivate_1.1 +end + +function xdr_mdsetname_t +version SUNWprivate_1.1 +end + +function xdr_mdsetnamelist_t +version SUNWprivate_1.1 +end + +function xdr_mdsidenames_t +version SUNWprivate_1.1 +end + +function xdr_mdvtoc_t +version SUNWprivate_1.1 +end + +function xdr_minor_or_hsp_t +version 
SUNWprivate_1.1 +end + +function xdr_mm_params_t +version SUNWprivate_1.1 +end + +function xdr_mm_pass_num_t +version SUNWprivate_1.1 +end + +function xdr_mm_rd_opt_t +version SUNWprivate_1.1 +end + +function xdr_mm_wr_opt_t +version SUNWprivate_1.1 +end + +function xdr_mr_params_t +version SUNWprivate_1.1 +end + +function xdr_ms_params_t +version SUNWprivate_1.1 +end + +function xdr_mt_debug_t +version SUNWprivate_1.1 +end + +function xdr_mt_flags_t +version SUNWprivate_1.1 +end + +function xdr_mt_l_error_t +version SUNWprivate_1.1 +end + +function xdr_rcs_flags_t +version SUNWprivate_1.1 +end + +function xdr_rcs_state_t +version SUNWprivate_1.1 +end + +function xdr_replica_flags_t +version SUNWprivate_1.1 +end + +function xdr_rus_state_t +version SUNWprivate_1.1 +end + +function xdr_sm_flags_t +version SUNWprivate_1.1 +end + +function xdr_sm_state_t +version SUNWprivate_1.1 +end + +function xdr_unit_t +version SUNWprivate_1.1 +end + +function xdr_clnt_stat +version SUNWprivate_1.1 +end + +function xdr_md_timeval32_t +version SUNWprivate_1.1 +end + +function xdr_daddr_t +version SUNWprivate_1.1 +end + +function xdr_md_dev64_t +version SUNWprivate_1.1 +end + +function xdr_dev_t +version SUNWprivate_1.1 +end + +function xdr_md_alias_ip_t +version SUNWprivate_1.1 +end + +function xdr_md_alias_nm_t +version SUNWprivate_1.1 +end + +function xdr_md_h_arr_t +version SUNWprivate_1.1 +end + +function xdr_md_h_t +version SUNWprivate_1.1 +end + +function xdr_md_hi_arr_t +version SUNWprivate_1.1 +end + +function xdr_md_hi_t +version SUNWprivate_1.1 +end + +function xdr_md_node_nm_arr_t +version SUNWprivate_1.1 +end + +function xdr_md_node_nm_t +version SUNWprivate_1.1 +end + +function xdr_md_set_nm_t +version SUNWprivate_1.1 +end + +function xdr_mddb_recid_t +version SUNWprivate_1.1 +end + +function xdr_mdkey_t +version SUNWprivate_1.1 +end + +function xdr_minor_t +version SUNWprivate_1.1 +end + +function xdr_off_t +version SUNWprivate_1.1 +end + +function xdr_set_t +version 
SUNWprivate_1.1 +end + +function xdr_side_t +version SUNWprivate_1.1 +end + +function xdr_size_t +version SUNWprivate_1.1 +end + +function xdr_timeval +version SUNWprivate_1.1 +end + +function md_in_daemon +version SUNWprivate_1.1 +end + +function mdrpc_add_drv_sidenms_1 +version SUNWprivate_1.1 +end + +function mdrpc_adddrvs_1 +version SUNWprivate_1.1 +end + +function mdrpc_addhosts_1 +version SUNWprivate_1.1 +end + +function mdrpc_createset_1 +version SUNWprivate_1.1 +end + +function mdrpc_del_drv_sidenms_1 +version SUNWprivate_1.1 +end + +function mdrpc_deldrvs_1 +version SUNWprivate_1.1 +end + +function mdrpc_delhosts_1 +version SUNWprivate_1.1 +end + +function mdrpc_delset_1 +version SUNWprivate_1.1 +end + +function mdrpc_drvused_1 +version SUNWprivate_1.1 +end + +function mdrpc_flush_internal_1 +version SUNWprivate_1.1 +end + +function mdrpc_getset_1 +version SUNWprivate_1.1 +end + +function mdrpc_gtimeout_1 +version SUNWprivate_1.1 +end + +function mdrpc_hostname_1 +version SUNWprivate_1.1 +end + +function mdrpc_lock_set_1 +version SUNWprivate_1.1 +end + +function mdrpc_nullproc_1 +version SUNWprivate_1.1 +end + +function mdrpc_ownset_1 +version SUNWprivate_1.1 +end + +function mdrpc_setnameok_1 +version SUNWprivate_1.1 +end + +function mdrpc_setnumbusy_1 +version SUNWprivate_1.1 +end + +function mdrpc_stimeout_1 +version SUNWprivate_1.1 +end + +function mdrpc_unlock_set_1 +version SUNWprivate_1.1 +end + +function mdrpc_upd_dr_dbinfo_1 +version SUNWprivate_1.1 +end + +function mdrpc_upd_dr_flags_1 +version SUNWprivate_1.1 +end + +function mdrpc_upd_sr_flags_1 +version SUNWprivate_1.1 +end + +function mdrpc_updmeds_1 +version SUNWprivate_1.1 +end + +function mdrpc_add_drv_sidenms_2 +version SUNWprivate_1.1 +end + +function mdrpc_adddrvs_2 +version SUNWprivate_1.1 +end + +function mdrpc_addhosts_2 +version SUNWprivate_1.1 +end + +function mdrpc_createset_2 +version SUNWprivate_1.1 +end + +function mdrpc_del_drv_sidenms_2 +version SUNWprivate_1.1 +end + 
+function mdrpc_deldrvs_2 +version SUNWprivate_1.1 +end + +function mdrpc_delhosts_2 +version SUNWprivate_1.1 +end + +function mdrpc_delset_2 +version SUNWprivate_1.1 +end + +function mdrpc_devinfo_2 +version SUNWprivate_1.1 +end + +function mdrpc_drvused_2 +version SUNWprivate_1.1 +end + +function mdrpc_flush_internal_2 +version SUNWprivate_1.1 +end + +function mdrpc_getset_2 +version SUNWprivate_1.1 +end + +function mdrpc_mngetset_2 +version SUNWprivate_1.1 +end + +function mdrpc_gtimeout_2 +version SUNWprivate_1.1 +end + +function mdrpc_hostname_2 +version SUNWprivate_1.1 +end + +function mdrpc_lock_set_2 +version SUNWprivate_1.1 +end + +function mdrpc_nullproc_2 +version SUNWprivate_1.1 +end + +function mdrpc_ownset_2 +version SUNWprivate_1.1 +end + +function mdrpc_setnameok_2 +version SUNWprivate_1.1 +end + +function mdrpc_setnumbusy_2 +version SUNWprivate_1.1 +end + +function mdrpc_stimeout_2 +version SUNWprivate_1.1 +end + +function mdrpc_unlock_set_2 +version SUNWprivate_1.1 +end + +function mdrpc_upd_dr_dbinfo_2 +version SUNWprivate_1.1 +end + +function mdrpc_upd_dr_flags_2 +version SUNWprivate_1.1 +end + +function mdrpc_upd_sr_flags_2 +version SUNWprivate_1.1 +end + +function mdrpc_upd_nr_flags_2 +version SUNWprivate_1.1 +end + +function mdrpc_updmeds_2 +version SUNWprivate_1.1 +end + +function mdrpc_mncreateset_2 +version SUNWprivate_1.1 +end + +function mdrpc_mnsetmaster_2 +version SUNWprivate_1.1 +end + +function mdrpc_mn_mirror_resync_all_2 +version SUNWprivate_1.1 +end + +function mdrpc_mn_sp_update_abr_2 +version SUNWprivate_1.1 +end + +function xdr_mdrpc_bool_res +version SUNWprivate_1.1 +end + +function xdr_mdrpc_createset_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_createset_2_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_mncreateset_2_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_devinfo_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_devidstr_args +version SUNWprivate_1.1 +end + +function 
xdr_mdrpc_devid_name_2_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_devinfo_2_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_devinfo_res +version SUNWprivate_1.1 +end + +function xdr_mdrpc_devinfo_2_res +version SUNWprivate_1.1 +end + +function xdr_mdrpc_devid_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_devid_2_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_devid_res +version SUNWprivate_1.1 +end + +function xdr_mdrpc_drives_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_drives_2_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_drv_sidenm_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_drv_sidenm_2_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_drvused_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_drvused_2_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_generic_res +version SUNWprivate_1.1 +end + +function xdr_mdrpc_getset_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_getset_2_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_getset_res +version SUNWprivate_1.1 +end + +function xdr_mdrpc_mngetset_res +version SUNWprivate_1.1 +end + +function xdr_mdrpc_gtimeout_res +version SUNWprivate_1.1 +end + +function xdr_mdrpc_host_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_host_2_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_hostname_res +version SUNWprivate_1.1 +end + +function xdr_mdrpc_null_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_setlock_res +version SUNWprivate_1.1 +end + +function xdr_mdrpc_setno_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_setno_2_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_sp_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_sp_2_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_stimeout_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_stimeout_2_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_upd_dr_flags_args +version SUNWprivate_1.1 
+end + +function xdr_mdrpc_upd_dr_flags_2_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_upd_sr_flags_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_upd_sr_flags_2_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_upd_nr_flags_2_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_updmeds_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_updmeds_2_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_mnsetmaster_2_args +version SUNWprivate_1.1 +end + +function xdr_stringarray +version SUNWprivate_1.1 +end + +function med_get_data_1 +version SUNWprivate_1.1 +end + +function med_get_rec_1 +version SUNWprivate_1.1 +end + +function med_hostname_1 +version SUNWprivate_1.1 +end + +function med_null_1 +version SUNWprivate_1.1 +end + +function med_upd_data_1 +version SUNWprivate_1.1 +end + +function med_upd_rec_1 +version SUNWprivate_1.1 +end + +function md_med_def_timeout +version SUNWprivate_1.1 +end + +function md_med_pmap_timeout +version SUNWprivate_1.1 +end + +function med_null_err +version SUNWprivate_1.1 +end + +function xdr_md_med_errno_t +version SUNWprivate_1.1 +end + +function xdr_med_args_t +version SUNWprivate_1.1 +end + +function xdr_med_data_t +version SUNWprivate_1.1 +end + +function xdr_med_err_t +version SUNWprivate_1.1 +end + +function xdr_med_get_data_res_t +version SUNWprivate_1.1 +end + +function xdr_med_get_rec_res_t +version SUNWprivate_1.1 +end + +function xdr_med_hnm_res_t +version SUNWprivate_1.1 +end + +function xdr_med_med_t +version SUNWprivate_1.1 +end + +function xdr_med_rec_t +version SUNWprivate_1.1 +end + +function xdr_med_res_t +version SUNWprivate_1.1 +end + +function xdr_med_upd_data_args_t +version SUNWprivate_1.1 +end + +function xdr_med_upd_rec_args_t +version SUNWprivate_1.1 +end + +function mhd_list_1 +version SUNWprivate_1.1 +end + +function mhd_relown_1 +version SUNWprivate_1.1 +end + +function mhd_status_1 +version SUNWprivate_1.1 +end + +function mhd_tkown_1 +version SUNWprivate_1.1 
+end + +function mhd_null_error +version SUNWprivate_1.1 +end + +function xdr_mhd_drive_status_t +version SUNWprivate_1.1 +end + +function xdr_mhd_drivename_t +version SUNWprivate_1.1 +end + +function xdr_mhd_error_t +version SUNWprivate_1.1 +end + +function xdr_mhd_ff_mode_t +version SUNWprivate_1.1 +end + +function xdr_mhd_list_args_t +version SUNWprivate_1.1 +end + +function xdr_mhd_list_res_t +version SUNWprivate_1.1 +end + +function xdr_mhd_opts_t +version SUNWprivate_1.1 +end + +function xdr_mhd_relown_args_t +version SUNWprivate_1.1 +end + +function xdr_mhd_set_t +version SUNWprivate_1.1 +end + +function xdr_mhd_status_args_t +version SUNWprivate_1.1 +end + +function xdr_mhd_status_res_t +version SUNWprivate_1.1 +end + +function xdr_mhd_tkown_args_t +version SUNWprivate_1.1 +end + +function xdr_mhd_cinfo_t +version SUNWprivate_1.1 +end + +function xdr_mhd_ctlrtype_t +version SUNWprivate_1.1 +end + +function xdr_mhd_did_flags_t +version SUNWprivate_1.1 +end + +function xdr_mhd_drive_id_t +version SUNWprivate_1.1 +end + +function xdr_mhd_drive_info_list_t +version SUNWprivate_1.1 +end + +function xdr_mhd_drive_info_t +version SUNWprivate_1.1 +end + +function xdr_mhd_mhiargs_t +version SUNWprivate_1.1 +end + +function xdr_mhd_serial_t +version SUNWprivate_1.1 +end + +function xdr_mhd_mhioctkown_t +version SUNWprivate_1.1 +end + +function xdr_md_mn_msg_t +version SUNWprivate_1.1 +end + +function xdr_md_mn_nodeid_t +version SUNWprivate_1.1 +end + +function meta_get_current_root +version SUNWprivate_1.1 +end + +function meta_get_current_root_dev +version SUNWprivate_1.1 +end + +function meta_gettimeofday +version SUNWprivate_1.1 +end + +function meta_replicaslice +version SUNWprivate_1.1 +end + +function meta_get_tstate +version SUNWprivate_1.1 +end + +function meta_setmdvtoc +version SUNWprivate_1.1 +end + +function meta_check_devicesize +version SUNWprivate_1.1 +end + +function clnt_devid +version SUNWprivate_1.1 +end + +function meta_number_to_string +version 
SUNWprivate_1.1 +end + +function meta_repartition_drive +version SUNWprivate_1.1 +end + +function mdmn_send_message +version SUNWprivate_1.1 +end + +function copy_result +version SUNWprivate_1.1 +end + +function free_result +version SUNWprivate_1.1 +end + +function copy_msg +version SUNWprivate_1.1 +end + +function copy_msg_1 +version SUNWprivate_1.1 +end + +function free_msg +version SUNWprivate_1.1 +end + +function mdmn_get_handler +version SUNWprivate_1.1 +end + +function mdmn_get_submessage_generator +version SUNWprivate_1.1 +end + +function mdmn_get_message_class +version SUNWprivate_1.1 +end + +function mdmn_get_timeout +version SUNWprivate_1.1 +end + +function meta_read_nodelist +version SUNWprivate_1.1 +end + +function meta_write_nodelist +version SUNWprivate_1.1 +end + +function meta_free_nodelist +version SUNWprivate_1.1 +end + +function meta_is_mn_set +version SUNWprivate_1.1 +end + +function meta_ping_mnset +version SUNWprivate_1.1 +end + +function meta_mn_send_command +version SUNWprivate_1.1 +end + +function meta_mn_send_suspend_writes +version SUNWprivate_1.1 +end + +function meta_mn_send_setsync +version SUNWprivate_1.1 +end + +function meta_mn_send_metaclear_command +version SUNWprivate_1.1 +end + +function meta_mn_send_resync_starting +version SUNWprivate_1.1 +end + +function meta_mn_change_owner +version SUNWprivate_1.1 +end + +function meta_is_mn_name +version SUNWprivate_1.1 +end + +function meta_reconfig_choose_master +version SUNWprivate_1.1 +end + +function meta_mnsync_user_records +version SUNWprivate_1.1 +end + +function meta_mnsync_diskset_mddbs +version SUNWprivate_1.1 +end + +function meta_mnjoin_all +version SUNWprivate_1.1 +end + +function mdmn_create_msgid +version SUNWprivate_1.1 +end + +function mdmn_suspend +version SUNWprivate_1.1 +end + +function mdmn_resume +version SUNWprivate_1.1 +end + +function mdmn_reinit_set +version SUNWprivate_1.1 +end + +function mdmn_msgtype_lock +version SUNWprivate_1.1 +end + +function mdmn_abort 
+version SUNWprivate_1.1 +end + +function mdmn_send_1 +version SUNWprivate_1.1 +end + +function mdmn_work_1 +version SUNWprivate_1.1 +end + +function mdmn_wakeup_initiator_1 +version SUNWprivate_1.1 +end + +function mdmn_wakeup_master_1 +version SUNWprivate_1.1 +end + +function mdmn_comm_lock_1 +version SUNWprivate_1.1 +end + +function mdmn_comm_unlock_1 +version SUNWprivate_1.1 +end + +function mdmn_comm_suspend_1 +version SUNWprivate_1.1 +end + +function mdmn_comm_resume_1 +version SUNWprivate_1.1 +end + +function mdmn_comm_reinit_set_1 +version SUNWprivate_1.1 +end + +function mdmn_comm_msglock_1 +version SUNWprivate_1.1 +end + +function clnt_mdcommdctl +version SUNWprivate_1.1 +end + +function mdrpc_mdcommdctl_2 +version SUNWprivate_1.1 +end + +function clnt_mn_is_stale +version SUNWprivate_1.1 +end + +function mdrpc_mn_is_stale_2 +version SUNWprivate_1.1 +end + +function clnt_clr_mnsetlock +version SUNWprivate_1.1 +end + +function mdrpc_clr_mnsetlock_2 +version SUNWprivate_1.1 +end + +function xdr_mdrpc_sp_flags_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_sp_flags_2_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_mdcommdctl_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_mdcommdctl_2_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_getdrivedesc_res +version SUNWprivate_1.1 +end + +function dd_list_dup +version SUNWprivate_1.1 +end + +function mdmn_allocate_changelog +version SUNWprivate_1.1 +end + +function mdmn_reset_changelog +version SUNWprivate_1.1 +end + +function mdmn_log_msg +version SUNWprivate_1.1 +end + +function mdmn_unlog_msg +version SUNWprivate_1.1 +end + +function mdmn_snarf_changelog +version SUNWprivate_1.1 +end + +function mdmn_get_changelogrec +version SUNWprivate_1.1 +end + +function clnt_reset_mirror_owner +version SUNWprivate_1.1 +end + +function mdrpc_reset_mirror_owner_2 +version SUNWprivate_1.1 +end + +function clnt_mn_susp_res_io +version SUNWprivate_1.1 +end + +function mdrpc_mn_susp_res_io_2 
+version SUNWprivate_1.1 +end + +function xdr_mdrpc_mn_susp_res_io_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_mn_susp_res_io_2_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_nodeid_args +version SUNWprivate_1.1 +end + +function xdr_mdrpc_nodeid_2_args +version SUNWprivate_1.1 +end + +function meta_is_member +version SUNWprivate_1.1 +end + +function meta_mn_singlenode +version SUNWprivate_1.1 +end + +function meta_sp_setstatus +version SUNWprivate_1.1 +end + +function xdr_mp_unit_t +version SUNWprivate_1.1 +end + +function xdr_md_set_params_t +version SUNWprivate_1.1 +end + +function meta_fixdevid +version SUNWprivate_1.1 +end + +function meta_upd_ctdnames +version SUNWprivate_1.1 +end + +function pathname_reload +version SUNWprivate_1.1 +end + +function meta_deviceid_to_nmlist +version SUNWprivate_1.1 +end + +function meta_mn_send_get_tstate +version SUNWprivate_1.1 +end + +function meta_client_create_retry +version SUNWprivate_1.1 +end + +function meta_client_create +version SUNWprivate_1.1 +end + +function read_master_block +version SUNWprivate_1.1 +end diff --git a/usr/src/lib/lvm/libmeta/spec/sparc/Makefile b/usr/src/lib/lvm/libmeta/spec/sparc/Makefile new file mode 100644 index 0000000000..8d93c87287 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/spec/sparc/Makefile @@ -0,0 +1,47 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +#ident "%Z%%M% %I% %E% SMI" +# +# Copyright (c) 2000-2001 by Sun Microsystems, Inc. +# All rights reserved. +# +# lib/lvm/libmeta/spec/sparc/Makefile + +.KEEP_STATE: + +# To enable apptrace, comment out the following line +# (DISABLE_APPTRACE is prepended to the install rule at the end of this file; +# presumably $(POUND_SIGN), defined outside this file, expands to '#' and so +# comments that rule out -- TODO confirm.) +DISABLE_APPTRACE= $(POUND_SIGN) + +include ../Makefile.targ + +# Add arch specific objects here +OBJECTS += + +include $(SRC)/lib/Makefile.lib + +# Uncomment the following if the linker complains +# NOTE(review): the assignment below is already active, so the "Uncomment" +# wording above looks stale -- confirm. +sparc_C_PICFLAGS = -K PIC + +include $(SRC)/lib/Makefile.spec + +# Only takes effect when DISABLE_APPTRACE expands to nothing (see above). +$(DISABLE_APPTRACE)install: $(ROOTABILIB) diff --git a/usr/src/lib/lvm/libmeta/spec/sparcv9/Makefile b/usr/src/lib/lvm/libmeta/spec/sparcv9/Makefile new file mode 100644 index 0000000000..5a7be1d65b --- /dev/null +++ b/usr/src/lib/lvm/libmeta/spec/sparcv9/Makefile @@ -0,0 +1,47 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +#ident "%Z%%M% %I% %E% SMI" +# +# Copyright (c) 2000-2001 by Sun Microsystems, Inc. +# All rights reserved. 
+# +# lib/lvm/libmeta/spec/sparcv9/Makefile + +.KEEP_STATE: + +# To enable apptrace, comment out the following line +# (DISABLE_APPTRACE is prepended to the install rule at the end of this file; +# presumably $(POUND_SIGN), defined outside this file, expands to '#' and so +# comments that rule out -- TODO confirm.) +DISABLE_APPTRACE= $(POUND_SIGN) + +include ../Makefile.targ + +# Add arch specific objects here +OBJECTS += + +include $(SRC)/lib/Makefile.lib + +# Uncomment the following if the linker complains +# NOTE(review): the assignment below is already active, so the "Uncomment" +# wording above looks stale. It also sets the sparc_ variable from the +# sparcv9 makefile -- confirm whether a sparcv9-specific name was intended. +sparc_C_PICFLAGS = -K PIC + +include $(SRC)/lib/Makefile.spec + +# Only takes effect when DISABLE_APPTRACE expands to nothing (see above). +$(DISABLE_APPTRACE)install: $(ROOTABILIB64) diff --git a/usr/src/lib/lvm/libmeta/spec/versions b/usr/src/lib/lvm/libmeta/spec/versions new file mode 100644 index 0000000000..523cb927d5 --- /dev/null +++ b/usr/src/lib/lvm/libmeta/spec/versions @@ -0,0 +1,31 @@ +#pragma ident "%Z%%M% %I% %E% SMI" +# Copyright 2005 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +i386 { + SUNWprivate_1.1; +} +sparc { + SUNWprivate_1.1; +} diff --git a/usr/src/lib/lvm/libpreen/Makefile b/usr/src/lib/lvm/libpreen/Makefile new file mode 100644 index 0000000000..ca1bb50937 --- /dev/null +++ b/usr/src/lib/lvm/libpreen/Makefile @@ -0,0 +1,49 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 1998-2001 by Sun Microsystems, Inc. +# All rights reserved. 
+# +# ident "%Z%%M% %I% %E% SMI" +# + +include $(SRC)/lib/Makefile.lib + +SUBDIRS = $(MACH) + +# Each top-level target sets TARGET so that the recursive $(MAKE) in the +# directory rule below builds the same target in the subdirectory. +all := TARGET= all +install := TARGET= install +clean := TARGET= clean +clobber := TARGET= clobber +lint := TARGET= lint +debug := TARGET= debug + +.KEEP_STATE: + +# These targets visit spec before $(SUBDIRS); lint visits $(SUBDIRS) only. +all clean clobber debug install: spec .WAIT $(SUBDIRS) + +lint: $(SUBDIRS) + +# NOTE(review): the commands are joined with ';', so a failed cd would still +# run $(MAKE) in the current directory unless the shell is invoked with -e +# -- confirm for the make implementation in use. +spec $(SUBDIRS): FRC + @cd $@; pwd; $(MAKE) $(TARGET) + +# FRC has no prerequisites or commands; it is listed as a prerequisite of the +# directory targets above (conventionally so that they run every time). +FRC: diff --git a/usr/src/lib/lvm/libpreen/Makefile.com b/usr/src/lib/lvm/libpreen/Makefile.com new file mode 100644 index 0000000000..fdfee4627d --- /dev/null +++ b/usr/src/lib/lvm/libpreen/Makefile.com @@ -0,0 +1,50 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2004 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# +# ident "%Z%%M% %I% %E% SMI" +# + +LIBRARY= preen_md.a +VERS= .1 +OBJECTS= mdpreen.o + +include $(SRC)/lib/lvm/Makefile.lvm + +ROOTLIBDIR= $(ROOT)/usr/lib/drv +LIBS= $(DYNLIB) # don't build a static lib +CPPFLAGS += -D_FILE_OFFSET_BITS=64 +LDLIBS += -lmeta -lc +ZDEFS= + +MAPDIR= $(SRC)/lib/lvm/libpreen/spec/$(TRANSMACH) +SPECMAPFILE= $(MAPDIR)/mapfile + +.KEEP_STATE: + +all: $(LIBS) + +include $(SRC)/lib/lvm/Makefile.targ + +$(ROOTLIBDIR)/$(DYNLIB) := FILEMODE= 555 diff --git a/usr/src/lib/lvm/libpreen/common/mdpreen.c b/usr/src/lib/lvm/libpreen/common/mdpreen.c new file mode 100644 index 0000000000..a28a6c2560 --- /dev/null +++ b/usr/src/lib/lvm/libpreen/common/mdpreen.c @@ -0,0 +1,335 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * preenlib interface for SVM. + * + * On startup fsck attempts to check filesystems in parallel. However + * running multiple fscks on the same disk at the same time + * significantly degrades the performance. fsck code avoids such + * behavior. 
To analyse such patterns it needs the physical disk + * instance. preen_build_devs provides that information for + * filesystems that are on top of metadevices. + */ + +#include <meta.h> +#include <limits.h> +#include <sys/types.h> +#include <sys/stat.h> + +#include <sdssc.h> + +#define MAX_N2M_ALIAS_LINE (2*FILENAME_MAX + 1) +#define NAME_TO_MAJOR "/etc/name_to_major" +#define MD_MODULE "md" + +/* + * Macros to produce a quoted string containing the value of a + * preprocessor macro. For example, if SIZE is defined to be 256, + * VAL2STR(SIZE) is "256". This is used to construct format + * strings for scanf-family functions below. + */ +#define QUOTE(x) #x +#define VAL2STR(x) QUOTE(x) + +extern void preen_addunit(void *cookie, char *dname, int (*cf)(), + void *datap, uint_t unit); +extern int preen_subdev(char *name, struct dk_cinfo *dkiop, void *dp); + +static int +get_major_from_n2m(char *modname, int *major) +{ + FILE *fp; + char drv[FILENAME_MAX + 1]; + int entry; + int found = 0; + char line[MAX_N2M_ALIAS_LINE]; + int status = 0; + + if ((fp = fopen(NAME_TO_MAJOR, "r")) == NULL) { + return (-1); + } + + while ((fgets(line, sizeof (line), fp) != NULL) && + status == 0) { + + if (sscanf(line, "%" VAL2STR(FILENAME_MAX) "s %d", + drv, &entry) != 2) { + status = -1; + } + if (strcmp(drv, modname) == 0) { + *major = entry; + found = 1; + break; + } + } + + /* + * if no match is found return -1 + */ + if (found == 0) + status = -1; + + (void) fclose(fp); + return (status); +} + +/* + * If the name contains a diskset name, it is parsed out and returned. + * The dev_path can be either a md pathname /dev/md/rdsk/d0 or a path + * name that contains a diskset /dev/md/red/rdsk/d0. + */ + +static char * +parse_path(char *dev_path) +{ + char *cpdev; + char *cp, *cpp; + char *setname; + size_t size; + + /* + * paths are /dev/md/rdsk/dx or /dev/md/<setname>/rdsk/dx + * cp points to /rdsk/dx. Scan back to the previous slash. + * If this matches "dev", then path is a local set. 
+ * + * The /rdsk/d pattern in strstr is used so that users with + * a twisted mind can create a diskset called "rdsk" and + * would still want everything to work!! + */ + cp = strstr(dev_path, "/rdsk/d"); + + for (cpdev = cp - 1; *cpdev != '/'; cpdev--); + cpdev = cpdev - 3; /* backspace 3 char */ + if (strncmp(cpdev, "dev", strlen("dev")) == 0) + return (Strdup(MD_LOCAL_NAME)); + + /* + * extract the setname from the path + */ + cpp = cp; + for (cp--; *cp != '/'; cp--); + size = (size_t)(cpp - cp); + setname = (char *)Malloc(size); + (void) strlcpy(setname, (const char *)(cp + 1), size); + + return (setname); +} + +/* + * This routine is called from preenlib the first time. It is then + * recursively called through preen_subdev. + * + * The argument passed in (uname) starts with the special device from + * /etc/vfstab. Recursive calls pass in the underlying physical device + * names. + */ +void +preen_build_devs( + char *uname, /* name of metadevice */ + struct dk_cinfo *dkiop, /* associated controller info */ + void *dp /* magic info */ +) +{ + char *setname = NULL; + mdsetname_t *sp; + mdname_t *namep; /* metadevice name */ + mdnamelist_t *nlp = NULL; /* list of real devices */ + mdnamelist_t *p; + devid_nmlist_t *nm_list = NULL; + md_error_t status = mdnullerror; + md_error_t *ep = &status; + int ep_valid = 0; /* does ep contain a real error */ + struct stat statb; + static int md_major = -1; + side_t sideno; + + if (stat(uname, &statb) != 0) + return; + + if (md_major == -1 && + get_major_from_n2m(MD_MODULE, &md_major) != 0) + return; + + /* + * If the path passed in is not a metadevice, then add that + * device to the list (preen_addunit) since it has to be a + * physical device. 
+ */ + + if (major(statb.st_rdev) != md_major) { + preen_addunit(dp, dkiop->dki_dname, NULL, NULL, + dkiop->dki_unit); + return; + } + /* + * Bind to the cluster library + */ + + if (sdssc_bind_library() == SDSSC_ERROR) + return; + + if (md_init_daemon("fsck", ep) != 0) { + ep_valid = 1; + goto out; + } + + /* + * parse the path name to get the diskset name. + */ + + setname = parse_path(uname); + if ((sp = metasetname(setname, ep)) == NULL) { + ep_valid = 1; + goto out; + } + + /* check for ownership */ + if (meta_check_ownership(sp, ep) != 0) { + /* + * Don't own the set but we are here implies + * that this is a clustered proxy device. Simply add + * the unit. + */ + preen_addunit(dp, dkiop->dki_dname, NULL, NULL, + dkiop->dki_unit); + ep_valid = 1; + goto out; + } + + /* + * get list of underlying physical devices. + */ + if ((namep = metaname(&sp, uname, ep)) == NULL) { + ep_valid = 1; + goto out; + } + + if (namep->dev == NODEV64) { + goto out; + } + + if (meta_getdevs(sp, namep, &nlp, ep) != 0) { + ep_valid = 1; + goto out; + } + + if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD) { + ep_valid = 1; + goto out; + } + + /* gather and add the underlying devs */ + for (p = nlp; (p != NULL); p = p->next) { + mdname_t *devnp = p->namep; + int fd; + struct dk_cinfo cinfo; + ddi_devid_t md_did; + char *devname; + char *minor_name = NULL; + char mname[MAXPATHLEN]; + + /* + * we don't want to use the rname anymore because + * that may have changed. Use the device id information + * to find the correct ctd name and open based on that. + * If there isn't a devid or we have a did device, then + * use the rname. In clustering, it's corrected for us. + * If no devid it's at least worth a try. 
+ */ + if (((md_did = meta_getdidbykey(sp->setno, sideno, + devnp->key, ep)) == NULL) || ((minor_name = + meta_getdidminorbykey(sp->setno, sideno, + devnp->key, ep)) == NULL)) { + devname = devnp->rname; + if (md_did) + Free(md_did); + } else { + if (strstr(minor_name, ",raw") == NULL) { + (void) snprintf(mname, MAXPATHLEN, "%s,raw", + minor_name); + } else { + (void) snprintf(mname, MAXPATHLEN, "%s", + minor_name); + } + + /* + * We need to make sure we call this with a specific + * mname (raw mname) so that we get the exact slice + * with the given device id. Otherwise we could try + * to open a slice that doesn't really exist. + */ + if (meta_deviceid_to_nmlist("/dev", md_did, + mname, &nm_list) != 0) { + (void) mdsyserror(ep, errno, devnp->rname); + ep_valid = 1; + Free(md_did); + Free(minor_name); + goto out; + } + devname = Strdup(nm_list->devname); + Free(md_did); + Free(minor_name); + devid_free_nmlist(nm_list); + } + /* get device name and (real) cinfo */ + if ((fd = open(devname, O_RDONLY, 0)) < 0) { + (void) mdsyserror(ep, errno, devname); + ep_valid = 1; + goto out; + } + + if (ioctl(fd, DKIOCINFO, &cinfo) != 0) { + (void) mdsyserror(ep, errno, devname); + (void) close(fd); + ep_valid = 1; + goto out; + } + (void) close(fd); /* sd/ssd bug */ + + /* + * preen_subdev fails when the device name has been + * resolved to the physical layer. Hence it is added + * to preen_addunit. 
+ */ + if (preen_subdev(devname, &cinfo, dp) != 0) { + preen_addunit(dp, cinfo.dki_dname, NULL, NULL, + cinfo.dki_unit); + } + } + + /* cleanup, if we fail, just add this composite device to the list */ +out: + if (setname != NULL) + Free(setname); + if (ep_valid != 0) { + mde_perror(&status, ""); + mdclrerror(&status); + } + metafreenamelist(nlp); +} diff --git a/usr/src/lib/lvm/libpreen/i386/Makefile b/usr/src/lib/lvm/libpreen/i386/Makefile new file mode 100644 index 0000000000..bb9355b10a --- /dev/null +++ b/usr/src/lib/lvm/libpreen/i386/Makefile @@ -0,0 +1,31 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 2000-2001 by Sun Microsystems, Inc. +# All rights reserved. +# +# ident "%Z%%M% %I% %E% SMI" +# + +include ../Makefile.com + +install debug: all $(ROOTLIBDIR) $(ROOTLIBS) diff --git a/usr/src/lib/lvm/libpreen/sparc/Makefile b/usr/src/lib/lvm/libpreen/sparc/Makefile new file mode 100644 index 0000000000..eff30f413e --- /dev/null +++ b/usr/src/lib/lvm/libpreen/sparc/Makefile @@ -0,0 +1,30 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). 
You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 2000-2001 by Sun Microsystems, Inc. +# All rights reserved. +# +# ident "%Z%%M% %I% %E% SMI" +# +include ../Makefile.com + +install debug: all $(ROOTLIBDIR) $(ROOTLIBS) diff --git a/usr/src/lib/lvm/libpreen/spec/Makefile b/usr/src/lib/lvm/libpreen/spec/Makefile new file mode 100644 index 0000000000..4f28d95836 --- /dev/null +++ b/usr/src/lib/lvm/libpreen/spec/Makefile @@ -0,0 +1,29 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +#ident "%Z%%M% %I% %E% SMI" +# +# Copyright (c) 2000 by Sun Microsystems, Inc. 
+# All rights reserved. +# +# lib/lvm/libpreen/spec/Makefile + +include $(SRC)/lib/Makefile.spec.arch diff --git a/usr/src/lib/lvm/libpreen/spec/Makefile.targ b/usr/src/lib/lvm/libpreen/spec/Makefile.targ new file mode 100644 index 0000000000..582a2c4653 --- /dev/null +++ b/usr/src/lib/lvm/libpreen/spec/Makefile.targ @@ -0,0 +1,36 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +#ident "%Z%%M% %I% %E% SMI" +# +# Copyright (c) 2000 by Sun Microsystems, Inc. +# All rights reserved. +# +# lib/lvm/libpreen/spec/Makefile.targ + +LIBRARY = libpreen.a +VERS = .1 + +OBJECTS = preen.o + +TRANSCPP = + +SPECCPP = -I.. -I../../inc diff --git a/usr/src/lib/lvm/libpreen/spec/amd64/Makefile b/usr/src/lib/lvm/libpreen/spec/amd64/Makefile new file mode 100644 index 0000000000..c7d89e007c --- /dev/null +++ b/usr/src/lib/lvm/libpreen/spec/amd64/Makefile @@ -0,0 +1,46 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2004 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# ident "%Z%%M% %I% %E% SMI" +# + +.KEEP_STATE: + +# To enable apptrace, comment out the following line +DISABLE_APPTRACE= $(POUND_SIGN) + +include ../Makefile.targ + +# Add arch specific objects here +OBJECTS += + +include $(SRC)/lib/Makefile.lib + +# Uncomment the following if the linker complains +#amd64_C_PICFLAGS = $(amd64_C_BIGPICFLAGS) + +include $(SRC)/lib/Makefile.spec + +$(DISABLE_APPTRACE)install: $(ROOTABILIB64) diff --git a/usr/src/lib/lvm/libpreen/spec/i386/Makefile b/usr/src/lib/lvm/libpreen/spec/i386/Makefile new file mode 100644 index 0000000000..a50dd5cd36 --- /dev/null +++ b/usr/src/lib/lvm/libpreen/spec/i386/Makefile @@ -0,0 +1,47 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +#ident "%Z%%M% %I% %E% SMI" +# +# Copyright (c) 2000-2001 by Sun Microsystems, Inc. +# All rights reserved. +# +# lib/lvm/libpreen/spec/i386/Makefile + +.KEEP_STATE: + +# To enable apptrace, comment out the following line +DISABLE_APPTRACE= $(POUND_SIGN) + +include ../Makefile.targ + +# Add arch specific objects here +OBJECTS += + +include $(SRC)/lib/Makefile.lib + +# Uncomment the following if the linker complains +#i386_C_PICFLAGS = -K PIC + +include $(SRC)/lib/Makefile.spec + +$(DISABLE_APPTRACE)install: $(ROOTABILIB) diff --git a/usr/src/lib/lvm/libpreen/spec/preen.spec b/usr/src/lib/lvm/libpreen/spec/preen.spec new file mode 100644 index 0000000000..839c0fcafe --- /dev/null +++ b/usr/src/lib/lvm/libpreen/spec/preen.spec @@ -0,0 +1,31 @@ +# +# Copyright 2005 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +#pragma ident "%Z%%M% %I% %E% SMI" +# +# lib/lvm/libpreen/spec/preen.spec + +function preen_build_devs +version SUNWprivate_1.1 +end diff --git a/usr/src/lib/lvm/libpreen/spec/sparc/Makefile b/usr/src/lib/lvm/libpreen/spec/sparc/Makefile new file mode 100644 index 0000000000..f3d039672f --- /dev/null +++ b/usr/src/lib/lvm/libpreen/spec/sparc/Makefile @@ -0,0 +1,47 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +#ident "%Z%%M% %I% %E% SMI" +# +# Copyright (c) 2000-2001 by Sun Microsystems, Inc. +# All rights reserved. 
+# +# lib/lvm/libpreen/spec/sparc/Makefile + +.KEEP_STATE: + +# To enable apptrace, comment out the following line +DISABLE_APPTRACE= $(POUND_SIGN) + +include ../Makefile.targ + +# Add arch specific objects here +OBJECTS += + +include $(SRC)/lib/Makefile.lib + +# Uncomment the following if the linker complains +sparc_C_PICFLAGS = -K PIC + +include $(SRC)/lib/Makefile.spec + +$(DISABLE_APPTRACE)install: $(ROOTABILIB) diff --git a/usr/src/lib/lvm/libpreen/spec/sparcv9/Makefile b/usr/src/lib/lvm/libpreen/spec/sparcv9/Makefile new file mode 100644 index 0000000000..520e996ffb --- /dev/null +++ b/usr/src/lib/lvm/libpreen/spec/sparcv9/Makefile @@ -0,0 +1,47 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +#ident "%Z%%M% %I% %E% SMI" +# +# Copyright (c) 2000-2001 by Sun Microsystems, Inc. +# All rights reserved. 
+# +# lib/lvm/libpreen/spec/sparcv9/Makefile + +.KEEP_STATE: + +# To enable apptrace, comment out the following line +DISABLE_APPTRACE= $(POUND_SIGN) + +include ../Makefile.targ + +# Add arch specific objects here +OBJECTS += + +include $(SRC)/lib/Makefile.lib + +# Uncomment the following if the linker complains +sparc_C_PICFLAGS = -K PIC + +include $(SRC)/lib/Makefile.spec + +$(DISABLE_APPTRACE)install: $(ROOTABILIB64) diff --git a/usr/src/lib/lvm/libpreen/spec/versions b/usr/src/lib/lvm/libpreen/spec/versions new file mode 100644 index 0000000000..523cb927d5 --- /dev/null +++ b/usr/src/lib/lvm/libpreen/spec/versions @@ -0,0 +1,31 @@ +#pragma ident "%Z%%M% %I% %E% SMI" +# Copyright 2005 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +i386 { + SUNWprivate_1.1; +} +sparc { + SUNWprivate_1.1; +} diff --git a/usr/src/lib/lvm/libsvm/Makefile b/usr/src/lib/lvm/libsvm/Makefile new file mode 100644 index 0000000000..1917939812 --- /dev/null +++ b/usr/src/lib/lvm/libsvm/Makefile @@ -0,0 +1,58 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 2001 by Sun Microsystems, Inc. +# All rights reserved. 
+# +# ident "%Z%%M% %I% %E% SMI" +# + +include $(SRC)/lib/Makefile.lib + +HDRS = libsvm.h +HDRDIR = common/hdrs +SUBDIRS = $(MACH) + +all := TARGET= all +install := TARGET= install +check := TARGET= check +clean := TARGET= clean +clobber := TARGET= clobber +lint := TARGET= lint +debug := TARGET= debug + +.KEEP_STATE: + +all clean clobber debug install: spec .WAIT $(SUBDIRS) + +install_h: $(ROOTHDRS) + +check: $(CHECKHDRS) + +lint: $(SUBDIRS) + +spec $(SUBDIRS): FRC + @cd $@; pwd; $(MAKE) $(TARGET) + +FRC: + +include $(SRC)/lib/Makefile.targ diff --git a/usr/src/lib/lvm/libsvm/Makefile.com b/usr/src/lib/lvm/libsvm/Makefile.com new file mode 100644 index 0000000000..3a87715a12 --- /dev/null +++ b/usr/src/lib/lvm/libsvm/Makefile.com @@ -0,0 +1,64 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2004 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# +# ident "%Z%%M% %I% %E% SMI" +# + +LIBRARY= libsvm.a +VERS= .1 +OBJECTS= check_svm.o \ + getdrvname.o \ + metaconf.o \ + metainterfaces.o \ + modops.o \ + start_svm.o \ + debug.o \ + update_mdconf.o + +include $(SRC)/lib/lvm/Makefile.lvm + +ROOTLIBDIR= $(ROOT)/usr/snadm/lib + +LIBS = $(DYNLIB) # don't build a static lib +LDLIBS += -lmeta -ldevid -lc +# +# XXX There isn't a lint library for libspmicommon. For now, we work +# around this by only using the library when we build (as opposed to lint). +# +all debug install := LDLIBS += -L/usr/snadm/lib -lspmicommon + +DYNFLAGS += -R/usr/snadm/lib +CPPFLAGS += -D_FILE_OFFSET_BITS=64 +CPPFLAGS += -I$(SRC)/lib/lvm/libsvm/common/hdrs +ZDEFS = + +MAPDIR= $(SRC)/lib/lvm/libsvm/spec/$(TRANSMACH) +SPECMAPFILE= $(MAPDIR)/mapfile + +.KEEP_STATE: + +all: $(LIBS) + +include $(SRC)/lib/lvm/Makefile.targ diff --git a/usr/src/lib/lvm/libsvm/common/check_svm.c b/usr/src/lib/lvm/libsvm/common/check_svm.c new file mode 100644 index 0000000000..5c92ac2788 --- /dev/null +++ b/usr/src/lib/lvm/libsvm/common/check_svm.c @@ -0,0 +1,169 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. 
All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <stdio.h> +#include <stdlib.h> +#include <errno.h> +#include <string.h> +#include <meta.h> +#include <sys/types.h> +#include <sys/mkdev.h> +#include <sys/stat.h> +#include <limits.h> +#include <svm.h> + +/* + * FUNCTION: valid_bootlist + * + * INPUT: file pointer, line_length + * + * RETURN VALUES: + * 0 - SUCCESS + * -1 - FAIL + * + */ + +int +valid_bootlist(FILE *fp, int line_len) +{ + char *bp = NULL; + char *line; + + /* + * errno may not be cleared by callee routines and + * we want to catch fgets failures hence errno is reset. + */ + errno = 0; + if ((line = malloc(line_len)) == NULL) + return (RET_ERROR); + + while (fgets(line, line_len, fp) != NULL) { + bp = strstr(line, "mddb_bootlist"); + if (bp != NULL) { + /* if not commented out then breakout */ + if (*line != '*' && *line != '#') { + break; + } + } + } + + free(line); + if (bp == NULL || errno != 0) + return (RET_ERROR); + + return (RET_SUCCESS); +} + +/* + * FUNCTION: svm_check + * Check the existence of DiskSuite or SVM + * + * INPUT: rootpath + * + * RETURN VALUES: + * 0 - SUCCESS + * -1 - FAIL + */ + +int +svm_check(char *path) +{ + FILE *fp; + char tmppath[PATH_MAX]; + int rval; + + (void) strcat(strcpy(tmppath, path), MD_CONF); + + if ((fp = fopen(tmppath, "r")) == NULL) { + rval = errno; + goto free_exit; + } + + rval = valid_bootlist(fp, MDDB_BOOTLIST_MAX_LEN); + + debug_printf("svm_check(): valid bootlist in %s. status %d\n", + tmppath, rval); + + if (rval == RET_SUCCESS) { + goto free_exit; + } + (void) fclose(fp); + + /* not found in md.conf try etc/system */ + (void) strcat(strcpy(tmppath, path), SYSTEM_FILE); + + if ((fp = fopen(tmppath, "r")) == NULL) { + rval = errno; + goto free_exit; + } + + rval = valid_bootlist(fp, MDDB_BOOTLIST_MAX_LEN); + + debug_printf("svm_check(): valid bootlist in %s. 
status %d\n", + tmppath, rval); +free_exit: + (void) fclose(fp); + if (rval > 0) + rval = RET_ERROR; + return (rval); +} + +/* + * FUNCTION: svm_is_md + * Check if the given device name has an md driver. + * INPUT: special device name (/dev/dsk/c0t0d0s0 or /dev/md/dsk/d10) + * + * RETURN: + * 1 - if it is a metadevice. + * 0 - if it is not a metadevice. + */ + +int +svm_is_md(char *device_name) +{ + char buf[30]; + struct stat sbuf; + int rval = 0; + + (void) memset(buf, 0, 30); + + debug_printf("svm_is_md(): device %s\n", device_name); + if (stat(device_name, &sbuf) != 0) + return (RET_ERROR); + + if (get_drv_name(major(sbuf.st_rdev), "/", buf) == RET_ERROR) { + debug_printf("svm_is_md(): device get_drv_name failed: %s\n", + device_name); + return (0); + } + if (strcmp(buf, MD_MODULE) == 0) { + debug_printf("svm_is_md(): device %s succeed\n", device_name); + rval = 1; + } + return (rval); +} diff --git a/usr/src/lib/lvm/libsvm/common/debug.c b/usr/src/lib/lvm/libsvm/common/debug.c new file mode 100644 index 0000000000..38f7ae56cc --- /dev/null +++ b/usr/src/lib/lvm/libsvm/common/debug.c @@ -0,0 +1,76 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <stdio.h> +#include <stdarg.h> +#include <stdlib.h> +#include <limits.h> +#include <string.h> + +/* The following defines are for tracing output (from libspmicommon) */ + +#define LOG 0x1 /* write message to log file */ +#define SCR 0x2 /* write message to the screen */ +#define LOGSCR LOG|SCR /* write message to the log and screen */ +#define LEVEL0 0x0001 /* message level 0 */ +#define LEVEL1 0x0002 /* message level 1 */ +#define LEVEL2 0x0004 /* message level 2 */ +#define LEVEL3 0x0010 /* message level 3 */ + +extern int get_trace_level(void); +extern int write_status(unsigned char, unsigned int, char *, ...); + +const char libsvm_str[] = "LIB_SVM: "; +const int libsvm_len = sizeof (libsvm_str); + +/*PRINTFLIKE1*/ +void +debug_printf(char *fmt, ...) 
+{ + va_list ap; + char *cp; + char *buf; + + if (get_trace_level() > 5) { + if ((buf = calloc(PATH_MAX, sizeof (char))) == NULL) + return; + (void) strcpy(buf, libsvm_str); + /* + * libsvm_len - 1 is because the length includes NULL + */ + + cp = buf + (libsvm_len - 1); + va_start(ap, fmt); + if (vsnprintf(cp, (PATH_MAX - (libsvm_len - 1)), + fmt, ap) >= 0) { + write_status(LOGSCR, LEVEL0, buf); + } + free(buf); + va_end(ap); + } +} diff --git a/usr/src/lib/lvm/libsvm/common/getdrvname.c b/usr/src/lib/lvm/libsvm/common/getdrvname.c new file mode 100644 index 0000000000..9bef7fa115 --- /dev/null +++ b/usr/src/lib/lvm/libsvm/common/getdrvname.c @@ -0,0 +1,90 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> +#include <sys/types.h> +#include <svm.h> + +/* + * Macros to produce a quoted string containing the value of a + * preprocessor macro. For example, if SIZE is defined to be 256, + * VAL2STR(SIZE) is "256". 
This is used to construct format + * strings for scanf-family functions below. + */ +#define QUOTE(x) #x +#define VAL2STR(x) QUOTE(x) + +/* + * FUNCTION: + * Return the driver name for a major number + * + * INPUT: major number, mount point for name_to_major file, pointer + * to a valid buffer. + * + * RETURN VALUES: + * 0 - SUCCESS - buf contain the driver name. + * -1 - FAIL + * + */ + +int +get_drv_name(major_t major, char *mnt, char *buf) +{ + FILE *fp; + char drv[FILENAME_MAX + 1]; + char entry[FILENAME_MAX + 1]; + char line[MAX_N2M_ALIAS_LINE]; + char fname[PATH_MAX]; + + int status = RET_NOERROR; + (void) snprintf(fname, sizeof (fname), "%s%s", mnt, NAME_TO_MAJOR); + + if ((fp = fopen(fname, "r")) == NULL) { + return (RET_ERROR); + } + + while ((fgets(line, sizeof (line), fp) != NULL) && + status == RET_NOERROR) { + if (sscanf(line, + "%" VAL2STR(FILENAME_MAX) "s %" VAL2STR(FILENAME_MAX) "s", + drv, entry) != 2) { + status = RET_ERROR; + } + if (atoi(entry) == major) + break; + + } + + if (status == RET_NOERROR) + (void) strcpy(buf, drv); + (void) fclose(fp); + return (status); +} diff --git a/usr/src/lib/lvm/libsvm/common/hdrs/libsvm.h b/usr/src/lib/lvm/libsvm/common/hdrs/libsvm.h new file mode 100644 index 0000000000..98c13a2684 --- /dev/null +++ b/usr/src/lib/lvm/libsvm/common/hdrs/libsvm.h @@ -0,0 +1,70 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
#ifndef _LIBSVM_H
#define _LIBSVM_H

#pragma ident "%Z%%M% %I% %E% SMI"


#ifdef __cplusplus
extern "C" {
#endif


/*
 * The following declarations are for libsvm which provides
 * Solaris Install with a set of interfaces required to upgrade
 * mirrored roots. These are controlled by a Contract PSARC 2000/049
 * and should not be changed without informing Install.
 */

/*
 * Description of a root metadevice and the components it is built from.
 * md_comps is declared with one element; get_mdcomponents() reallocates
 * the structure so that md_comps holds "count" entries.
 */
typedef struct {
	char *root_md;		/* metaroot device name */
	int count;		/* number of components in the metadevice */
	char *md_comps[1];	/* array of "ctds" component names */
} svm_info_t;

/* Conversion of MDDB flags */
#define SVM_DONT_CONV 0x01	/* Don't convert MDDB to devid mode */
#define SVM_CONV 0x02		/* Convert MDDB to devid mode */


extern int svm_check(char *rootpath);
extern int svm_start(char *rootpath, svm_info_t **svm_infopp,
    int repl_state_flag);
extern int svm_stop();
extern void svm_free(svm_info_t *svm_infop);
extern int svm_is_md(char *device_name);
extern int svm_get_components(char *root_md_device, svm_info_t **svmpp);
extern svm_info_t *svm_alloc();
extern int get_mdcomponents(char *devname, svm_info_t **pp);

#ifdef __cplusplus
}
#endif

#endif /* _LIBSVM_H */
You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + +#ifndef _SVM_H +#define _SVM_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + + +#define RET_SUCCESS 0 +#define RET_ERROR -1 +#define RET_NOERROR RET_SUCCESS + + +#define PROP_KEEP_REPL_STATE "md_keep_repl_state" +#define PROP_DEVID_DESTROY "md_devid_destroy" + +#define MD_CONF "/kernel/drv/md.conf" +#define MD_CONF_ORIG "/tmp/md.conf.orig" +#define SYSTEM_FILE "/etc/system" +#define NAME_TO_MAJOR "/etc/name_to_major" +#define VFSTAB "/etc/vfstab" + +#define MD_MODULE "md" +#define ROOT_MNTPT "/" +#define ROOT_METADEVICE "/dev/md/dsk/" + + +typedef enum { + MD_STR_NOTFOUND, /* bootlist not found */ + MD_STR_START, /* bootlist found, convertion started */ + MD_STR_DONE /* bootlist converversion done */ +} convflag_t; + +/* The following defines have been taken from addrem.h */ +#define MAX_CMD_LINE 256 +#define MAX_N2M_ALIAS_LINE FILENAME_MAX + FILENAME_MAX + 1 +#define MAXLEN_NAM_TO_MAJ_ENT FILENAME_MAX + MAX_STR_MAJOR + 1 +#define OPT_LEN 128 +#define CADDR_HEX_STR 16 +#define UINT_STR 10 +#define MODLINE_ENT_MAX (4 * UINT_STR) + CADDR_HEX_STR + MODMAXNAMELEN +#define MAX_STR_MAJOR UINT_STR +#define STR_LONG 10 +#define PERM_STR 4 +#define 
MAX_PERM_ENTRY (2 * STR_LONG) + PERM_STR + (2 * FILENAME_MAX) + 1 +#define MAX_DBFILE_ENTRY MAX_PERM_ENTRY + +extern void create_diskset_links(); +extern int copyfile(char *from, char *to); +extern int get_drv_name(major_t major, char *file_name, char *buf); +extern int mod_unload(char *modname); +extern int valid_bootlist(FILE *fp, int line_size); +extern int convert_bootlist(char *systemfile, char *mdconf, char **tmpfilename); +extern int write_xlate_to_mdconf(char *rootpath); +extern int write_targ_nm_table(char *rootpath); +extern int get_rootmetadevice(char *rootpath, char **devname); +extern void set_upgrade_prop(char *prop_name, int val); +extern int is_upgrade_prop(char *prop_name); +extern int create_in_file_prop(char *prop_name, char *fname); +extern void debug_printf(char *fmt, ...); + +#ifdef __cplusplus +} +#endif + +#endif /* _SVM_H */ diff --git a/usr/src/lib/lvm/libsvm/common/metaconf.c b/usr/src/lib/lvm/libsvm/common/metaconf.c new file mode 100644 index 0000000000..504f38ba73 --- /dev/null +++ b/usr/src/lib/lvm/libsvm/common/metaconf.c @@ -0,0 +1,195 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. 
All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + + +#include <stdio.h> +#include <ctype.h> +#include <sys/types.h> +#include <sys/mkdev.h> +#include <sys/stat.h> +#include <unistd.h> +#include <dirent.h> +#include <limits.h> +#include <string.h> +#include <libsvm.h> +#include <svm.h> +#include <errno.h> + + +#define VERSION "1.0" +#define DISK_DIR "/dev/rdsk" + +extern int _map_to_effective_dev(); + +int +is_blankline(char *buf) +{ + for (; *buf != 0; buf++) { + if (!isspace(*buf)) + return (0); + } + return (1); +} + +/* + * FUNCTION: write_targ_nm_table + * creates a tuple table of <driver name, major number > in md.conf + * INPUT: rootpath + * + * RETURN VALUES: + * RET_SUCCESS + * RET_ERROR + */ + +int +write_targ_nm_table(char *path) +{ + FILE *targfp = NULL; + FILE *mdfp = NULL; + char buf[PATH_MAX], *cp; + int retval = RET_SUCCESS; + int first_entry = 1; + + if ((mdfp = fopen(MD_CONF, "a")) == NULL) + return (RET_ERROR); + + (void) snprintf(buf, sizeof (buf), "%s%s", path, NAME_TO_MAJOR); + + if ((targfp = fopen(buf, "r")) == NULL) { + (void) fclose(mdfp); + return (RET_ERROR); + } + + while (fgets(buf, PATH_MAX, targfp) != NULL && + (retval == RET_SUCCESS)) { + cp = strrchr(buf, '\n'); + *cp = 0; + if (is_blankline(buf)) + continue; + if (first_entry) { + if (fprintf(mdfp, "md_targ_nm_table=\"%s\"", buf) < 0) + retval = RET_ERROR; + first_entry = 0; + } + if (fprintf(mdfp, ",\"%s\"", buf) < 0) + retval = RET_ERROR; + } + if (!first_entry) + if (fprintf(mdfp, ";\n") < 0) + retval = RET_ERROR; + (void) fclose(mdfp); + (void) fclose(targfp); + return (retval); +} + +/* + * FUNCTION: write_xlate_to_mdconf + * creates a tuple table of <miniroot devt, target devt> in md.conf + * INPUT: rootpath + * + * RETURN VALUES: + * RET_SUCCESS + * RET_ERROR + */ + +int +write_xlate_to_mdconf(char *path) +{ + FILE *fptr = NULL; + struct dirent *dp; + DIR *dirp; + struct stat statb_dev; + struct stat statb_edev; + char 
*devname; + char edevname[PATH_MAX]; + char targname[PATH_MAX]; + char diskdir[PATH_MAX]; + int first_devid = 1; + int ret = RET_SUCCESS; + + if ((fptr = fopen(MD_CONF, "a")) == NULL) { + return (RET_ERROR); + } + + + (void) snprintf(diskdir, sizeof (diskdir), "%s%s", path, DISK_DIR); + if ((dirp = opendir(diskdir)) == NULL) { + (void) fclose(fptr); + return (RET_ERROR); + } + + /* special case to write the first tuple in the table */ + while (((dp = readdir(dirp)) != (struct dirent *)0) && + (ret != RET_ERROR)) { + if ((strcmp(dp->d_name, ".") == 0) || + (strcmp(dp->d_name, "..") == 0)) + continue; + + if ((strlen(diskdir) + strlen(dp->d_name) + 2) > PATH_MAX) { + continue; + } + + (void) snprintf(targname, sizeof (targname), "%s/%s", + diskdir, dp->d_name); + + if (stat(targname, &statb_dev) != 0) { + continue; + } + + if ((devname = strstr(targname, DISK_DIR)) == NULL) { + continue; + } + + if (_map_to_effective_dev((char *)devname, (char *)&edevname) + != 0) { + continue; + } + + if (stat(edevname, &statb_edev) != 0) { + continue; + } + + if (first_devid) { + if (fprintf(fptr, "md_xlate_ver=\"%s\";\n" + "md_xlate=%lu,%lu", VERSION, + statb_edev.st_rdev, statb_dev.st_rdev) < 0) + ret = RET_ERROR; + first_devid = 0; + } + if (fprintf(fptr, ",%lu,%lu", statb_edev.st_rdev, + statb_dev.st_rdev) < 0) + ret = RET_ERROR; + } /* end while */ + + if (!first_devid) + if (fprintf(fptr, ";\n") < 0) + ret = RET_ERROR; + (void) fclose(fptr); + (void) closedir(dirp); + return (ret); +} diff --git a/usr/src/lib/lvm/libsvm/common/metainterfaces.c b/usr/src/lib/lvm/libsvm/common/metainterfaces.c new file mode 100644 index 0000000000..20746d4b58 --- /dev/null +++ b/usr/src/lib/lvm/libsvm/common/metainterfaces.c @@ -0,0 +1,490 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <stdio.h> +#include <string.h> +#include <sys/vfstab.h> +#include <meta.h> +#include <libsvm.h> +#include <svm.h> +#include <sdssc.h> + + +extern int mod_unload(char *modname); +static int inited = 0; + +/* + * FUNCTION: init_metalib + * initialize libmeta only once. 
+ * + * RETURN VALUES: + * 0 - SUCCESS + * -1 - FAIL + */ + +static int +init_metalib() +{ + int largc = 1; + char *largv = "libsvm"; + md_error_t status = mdnullerror; + + if (!inited) { + if (md_init_nosig(largc, &largv, 0, 1, &status) != 0 || + meta_check_root(&status) != 0) { + return (-1); + } + inited = 1; + } + return (RET_SUCCESS); +} + +/* + * FUNCTION: reset_metalib + * + * INPUT: ptr to md_error_t + */ + +static void +reset_metalib(md_error_t *ep) +{ + inited = 0; + (void) close_admin(ep); +} + +/* + * FUNCTION: metahalt + * halt the metadb + * + */ + +static void +metahalt() +{ + mdsetname_t *sp; + md_error_t status = mdnullerror; + + (void) init_metalib(); + if ((sp = metasetname(MD_LOCAL_NAME, &status)) == NULL) { + return; + } + if (meta_lock(sp, TRUE, &status)) { + return; + } + if (metaioctl(MD_HALT, NULL, &status, NULL) != 0) { + debug_printf("metahalt(): errno %d\n", + status.info.md_error_info_t_u.sys_error.errnum); + } + (void) meta_unlock(sp, &status); + reset_metalib(&status); +} + +/* + * FUNCTION: svm_stop + * Halt the SDS/SVM configuration and unload md module. + * + * RETURN VALUES: + * 0 - SUCCESS + * RET_ERROR + */ + +#define MAX_TIMEOUT 1800 +int +svm_stop() +{ + int rval = RET_SUCCESS; + int timeval = 0; + int sleep_int = 5; + + metahalt(); + + if ((rval = mod_unload(MD_MODULE)) != 0) { + timeval += sleep_int; + (void) sleep(sleep_int); + while (timeval < MAX_TIMEOUT) { + if ((rval = mod_unload(MD_MODULE)) == 0) { + debug_printf("svm_stop(): mod_unload succeeded." + " Time %d\n", timeval); + + break; + } + + debug_printf("svm_stop(): mod_unload failed. 
Trying " + "in %d s (%d)\n", sleep_int, timeval); + + timeval += sleep_int; + (void) sleep(sleep_int); + metahalt(); + } + + if (rval != 0) { + rval = RET_ERROR; + debug_printf("svm_stop(): mod_unload FAILED!\n"); + } + } + + return (rval); +} + +/* + * FUNCTION: get_rootmetadevice + * parses the vfstab to return the metadevice + * + * INPUT: + * mount point + * mdname - pointer to string pointer that will contain the + * metadevice name. Caller must free the allocated space. + * RETURN VALUES: + * mdname - md root device name + * 0 - SUCCESS + * !0 - FAIL + * > 0 errno + * RET_ERROR + */ + +int +get_rootmetadevice(char *mntpath, char **mdname) +{ + struct vfstab v; + FILE *fp; + int rval = RET_SUCCESS; + char *cp; + char vfstab_name[PATH_MAX + 1]; + + if (mdname == NULL) + return (EINVAL); + + *mdname = NULL; + + if (snprintf(vfstab_name, PATH_MAX + 1, "%s%s", mntpath, VFSTAB) < 0) + return (ENOMEM); + + debug_printf("get_rootmetadevice(): mntpath %s %s\n", mntpath, + vfstab_name); + + if ((fp = fopen(vfstab_name, "r")) == NULL) { + rval = errno; + return (rval); + } + + if ((rval = getvfsfile(fp, &v, ROOT_MNTPT)) != 0) { + goto out; + } + + + debug_printf("get_rootmetadevice(): vfs_special %s\n", v.vfs_special); + if (strstr(v.vfs_special, ROOT_METADEVICE) == NULL) { + /* md device not found */ + rval = RET_ERROR; + goto out; + } + + /* found a match fill it and return */ + cp = v.vfs_special + strlen(ROOT_METADEVICE); + + *mdname = (char *)malloc(strlen(cp) + 1); + + if (*mdname == NULL) { + rval = ENOMEM; + goto out; + } + (void) strcpy(*mdname, cp); + debug_printf("get_rootmetadevice(): *mdname %s rval %d\n", + *mdname, rval); +out: + (void) fclose(fp); + return (rval); +} + +/* + * FUNCTION: create_diskset_links + * Create the diskset name symlinks in /dev/md from the diskset + * names found in the set records. 
These are normally created + * in rpc.metad when you create the set but those symlinks are + * sitting out on the real system disk and we're running off the + * devfs that got created when we booted off the install image. + */ + +void +create_diskset_links() +{ + int max_sets; + int i; + md_error_t error = mdnullerror; + + /* + * Resolve the function pointers for libsds_sc so that we can + * snarf the set records. + */ + (void) sdssc_bind_library(); + (void) init_metalib(); + + if ((max_sets = get_max_sets(&error)) == 0) { + debug_printf("create_diskset_links(): get_max_sets failed\n"); + mdclrerror(&error); + return; + } + + for (i = 1; i < max_sets; i++) { + md_set_record *sr; + char setname[MAXPATHLEN]; + char setnum[MAXPATHLEN]; + + if ((sr = metad_getsetbynum(i, &error)) == NULL) { + mdclrerror(&error); + continue; + } + + (void) snprintf(setname, MAXPATHLEN, "/dev/md/%s", + sr->sr_setname); + (void) snprintf(setnum, MAXPATHLEN, "shared/%d", i); + /* + * Ignore failures to create the symlink. This could + * happen because suninstall is restartable so the + * symlink might have already been created. + */ + (void) symlink(setnum, setname); + } +} + +/* + * FUNCTION: svm_alloc + * Return a pointer to an opaque piece of zeroed memory. + * + * RETURN VALUES: + * Non null - SUCCESS + * NULL - FAIL + */ + +svm_info_t * +svm_alloc() +{ + return ((svm_info_t *)calloc(1, sizeof (svm_info_t))); +} + +/* + * FUNCTION: svm_free + * + * INPUT: pointer to struct svm_info + */ + +void +svm_free(svm_info_t *svmp) +{ + int i; + + if (svmp == NULL) + return; + + for (i = 0; i < svmp->count; i++) { + free(svmp->md_comps[i]); + } + free(svmp->root_md); + free(svmp); +} + +/* + * FUNCTION: get_mdcomponents + * Given "uname" metadevice, return the physical components + * of that metadevice. 
+ * + * INPUT: + * uname - metadevice name + * + * RETURN VALUES: + * svmp - structure containing md name and components + * RET_SUCCESS + * RET_ERROR + * + */ + +int +get_mdcomponents(char *uname, svm_info_t **svmpp) +{ + + svm_info_t *svmp; + md_error_t status, *ep; + mdname_t *namep; + mdnamelist_t *nlp = NULL; + mdnamelist_t *p; + mdsetname_t *sp = NULL; + char *strp = NULL; + int rval, cnt; + + rval = RET_SUCCESS; + cnt = 0; + status = mdnullerror; + ep = &status; + svmp = *svmpp; + + (void) init_metalib(); + + debug_printf("get_mdcomponents(): Enter unit name %s\n", uname); + + if (((namep = metaname(&sp, uname, ep)) == NULL) || + (metachkmeta(namep, ep) != 0)) { + debug_printf("get_mdcomponents(): " + "metaname or metachkmeta failed\n"); + mdclrerror(ep); + return (RET_ERROR); + } + + debug_printf("get_mdcomponents(): meta_getdevs %s\n", namep->cname); + + if ((meta_getdevs(sp, namep, &nlp, ep)) < 0) { + debug_printf("get_mdcomponents(): " + "comp %s - meta_getdevs failed\n", uname); + metafreenamelist(nlp); + mdclrerror(ep); + return (RET_ERROR); + } + + /* compute the number of devices */ + + for (p = nlp, cnt = 0; p != NULL; p = p->next, cnt++) + ; + + /* + * Need to add n -1 components since slvmp already has space + * for one device. 
+ */ + + svmp = (svm_info_t *)realloc(svmp, sizeof (svm_info_t) + + (sizeof (char *) * (cnt - 1))); + + if (svmp == NULL) { + debug_printf("get_mdcomponents(): realloc of svmp failed\n"); + metafreenamelist(nlp); + return (RET_ERROR); + } + + + for (p = nlp, cnt = 0; p != NULL; p = p->next, cnt++) { + mdname_t *devnp = p->namep; + + if ((strp = strdup(devnp->cname)) == NULL) { + rval = RET_ERROR; + break; + } + svmp->md_comps[cnt] = strp; + } + + /* count is set to the number of devices in the list */ + + svmp->count = cnt; + svmp->root_md = strdup(uname); + if (rval == RET_SUCCESS && svmp->root_md != NULL) { + debug_printf("get_mdcomponents(): root_md %s count %d \n", + svmp->root_md, svmp->count); + for (cnt = 0; cnt < svmp->count; cnt++) + debug_printf("get_mdcomponents(): %s\n", + svmp->md_comps[cnt]); + } else { + rval = RET_ERROR; + svm_free(svmp); + svmp = NULL; + debug_printf("get_mdcomponents(): malloc failed\n"); + + } + + + metafreenamelist(nlp); + *svmpp = svmp; + return (rval); +} + + +/* + * FUNCTION: svm_get_components + * return svm_infop with the components of a metadevice. + * + * INPUT: + * md_device - eg. /dev/md/dsk/d10, /dev/md/foo/dsk/d10, or + * /dev/md/shared/1/dsk/d10 + * + * RETURN: + * 0 - SUCCESS + * !0 - FAIL + */ + +int +svm_get_components(char *md_device, svm_info_t **svmpp) +{ + int len; + + /* + * If this is a named diskset with a shared name + * (e.g. /dev/md/shared/1/dsk/d10) call get_mdcomponents with + * the diskset and metadevice name (e.g. foo/d10). + * Otherwise this is a regular name (e.g. /dev/md/dsk/d10 or + * /dev/md/foo/dsk/d10 or d10 or foo/d10) all of which + * get_mdcomponents can handle directly. 
+ */ + + len = strlen("/dev/md/shared/"); + if (strncmp(md_device, "/dev/md/shared/", len) == 0) { + int numlen; + int setnum; + char *cp; + char *slashp; + char mdname[MAXPATHLEN]; + mdsetname_t *sp; + md_error_t error = mdnullerror; + + cp = md_device + len; + + if ((slashp = strstr(cp, "/")) == NULL) + return (RET_ERROR); + numlen = slashp - cp; + if (numlen >= MAXPATHLEN - 1) + return (RET_ERROR); + + (void) strlcpy(mdname, cp, numlen + 1); + /* setnum now contains the diskset number */ + setnum = atoi(mdname); + if ((sp = metasetnosetname(setnum, &error)) == NULL || + !mdisok(&error)) + return (RET_ERROR); + + cp = slashp + 1; + /* cp now pointing at dsk/... */ + if ((slashp = strstr(cp, "/")) == NULL) + return (RET_ERROR); + + (void) snprintf(mdname, MAXPATHLEN, "%s/%s", sp->setname, + slashp + 1); + /* mdname now contains diskset and metadevice name e.g. foo/d10 */ + + debug_printf("svm_get_components(): mdname %s\n", mdname); + return (get_mdcomponents(mdname, svmpp)); + + } else { + debug_printf("svm_get_components(): md_device %s\n", md_device); + return (get_mdcomponents(md_device, svmpp)); + } +} diff --git a/usr/src/lib/lvm/libsvm/common/modops.c b/usr/src/lib/lvm/libsvm/common/modops.c new file mode 100644 index 0000000000..78914a9069 --- /dev/null +++ b/usr/src/lib/lvm/libsvm/common/modops.c @@ -0,0 +1,120 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/wait.h> +#include <sys/errno.h> +#include <sys/modctl.h> +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <svm.h> + +/* + * FUNCTION: get modid + * Given a module name returns module id. + * + * INPUT: module name + * + * RETURN VALUES: + * > 0 SUCCESS + * -1 FAIL + */ + +static int +get_modid(char *modname) +{ + struct modinfo modinfo; + int id; + int rval = RET_ERROR; + + id = -1; /* look for all modules */ + + modinfo.mi_id = modinfo.mi_nextid = id; + modinfo.mi_info = MI_INFO_ALL | MI_INFO_NOBASE; + + do { + if (modctl(MODINFO, id, &modinfo) < 0) + break; + + modinfo.mi_name[MODMAXNAMELEN - 1] = '\0'; + /* if we find a match break out */ + if (strcmp(modinfo.mi_name, modname) == 0) { + rval = modinfo.mi_id; + break; + } + /* LINTED */ + } while (1); + + return (rval); +} + +/* + * FUNCTION: mod_unload + * unload a module. + * + * INPUT: module name + * + * RETURN VALUES: + * 0 - SUCCESS + * !0 - FAIL + * > 0 errno + * -1 + * NOTE: If we fail to get the module id because the module is not + * currently loaded we still want to try to force a reload of the + * .conf file when it does load. 
+ */ +int +mod_unload(char *modname) +{ + int id; + major_t major; + int rval = RET_SUCCESS; + + id = get_modid(modname); + + if (id != -1) { + if (modctl(MODUNLOAD, id) < 0) { + rval = errno; + } + } + + if ((modctl(MODGETMAJBIND, modname, strlen(modname) + 1, + &major)) != 0) { + return (errno); + } + + if ((modctl(MODUNLOADDRVCONF, major) != 0) || + (modctl(MODLOADDRVCONF, major) != 0)) { + return (errno); + } + + return (rval); +} diff --git a/usr/src/lib/lvm/libsvm/common/start_svm.c b/usr/src/lib/lvm/libsvm/common/start_svm.c new file mode 100644 index 0000000000..f423d4f418 --- /dev/null +++ b/usr/src/lib/lvm/libsvm/common/start_svm.c @@ -0,0 +1,284 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <ctype.h> +#include <malloc.h> +#include <stdio.h> +#include <fcntl.h> +#include <stdlib.h> +#include <errno.h> +#include <string.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/param.h> +#include <limits.h> +#include <meta.h> +#include <svm.h> +#include <libsvm.h> + +#define MODEBITS (S_ISUID|S_ISGID|S_ISVTX|S_IRWXU|S_IRWXG|S_IRWXO) +#define ISREG(A) (((A).st_mode & S_IFMT) == S_IFREG) +#define DEFAULT_ROOTDIR "/a" + + +/* + * FUNCTION: svm_start + * starts SDS/SVM configuration. If root mirroring exists then the + * components of the root mirror are returned in svmpp. + * + * INPUT: mntpnt - root mount point + * svmpp - prealloced structure to return components + * repl_state_flag - SVM_CONV/SVM_DONT_CONV + * + * RETURN: + * 0 - SUCCESS + * !0 - ERROR + * if > 0 errno + */ + +int +svm_start(char *mntpnt, svm_info_t **svmpp, int repl_state_flag) +{ + char *rootdir, *tf; + char *mdevnamep = NULL; + char system_file[PATH_MAX]; + char mdconf[PATH_MAX]; + int rval = 0; + + if (mntpnt == NULL) + rootdir = DEFAULT_ROOTDIR; + else + rootdir = mntpnt; + + if ((rval = snprintf(system_file, PATH_MAX, "%s%s", + rootdir, SYSTEM_FILE)) < 0) { + return (RET_ERROR); + } + + if ((rval = snprintf(mdconf, PATH_MAX, "%s%s", + rootdir, MD_CONF)) < 0) { + return (RET_ERROR); + } + + debug_printf("svm_start(): repl_state_flag %s\n", + (repl_state_flag == SVM_DONT_CONV) ? "SVM_DONT_CONV": + "SVM_CONV"); + + if (copyfile(MD_CONF, MD_CONF_ORIG)) + return (RET_ERROR); + + switch (rval = convert_bootlist(system_file, mdconf, &tf)) { + case 0: + case -1: /* found in etc/system flag */ + break; + default: /* convert bootlist failed */ + debug_printf("svm_start(): convert_bootlist failed." 
+ "rval %d\n", rval); + goto errout; + } + + if (repl_state_flag == SVM_DONT_CONV) { + rval = create_in_file_prop(PROP_KEEP_REPL_STATE, tf); + if (rval != 0) + goto errout; + } + + if (is_upgrade_prop(PROP_DEVID_DESTROY)) { + rval = create_in_file_prop(PROP_DEVID_DESTROY, tf); + /* + * For the idempotent behavior reset internal + * flag incase we have to return due to errors + */ + set_upgrade_prop(PROP_DEVID_DESTROY, 0); + if (rval != 0) + goto errout; + } + + + /* + * Since svm_start is called only after svm_check, + * we can assume that there is a valid metadb. If the mddb_bootlist + * is not found in etc/system, then it must be in md.conf which + * we copied to temporary file pointed to by tf + */ + if (copyfile(tf, MD_CONF)) { + debug_printf("svm_start(): copy of %s to %s failed\n", tf, + MD_CONF); + goto errout; + } + + if ((rval = write_xlate_to_mdconf(rootdir)) != 0) { + debug_printf("svm_start(): write_xlate_to_mdconf(%s) failed\n", + rootdir); + goto errout; + } + + if ((rval = write_targ_nm_table(rootdir)) != 0) { + goto errout; + } + + /* run devfsadm to create the devices specified in md.conf */ + if ((rval = system("/usr/sbin/devfsadm -r /tmp -p " + "/tmp/root/etc/path_to_inst -i md")) != 0) { + debug_printf("svm_start(): devfsadm -i md failed: %d\n", rval); + goto errout; + } + + /* + * We have to unload md after the devfsadm run so that when metainit + * loads things it gets the right information from md.conf. + */ + if (rval = svm_stop()) { + debug_printf("svm_start(): svm_stop failed.\n"); + return (RET_ERROR); + } + + if ((rval = system("/usr/sbin/metainit -r")) != 0) { + debug_printf("svm_start(): metainit -r failed: %d\n", rval); + goto errout; + } + + create_diskset_links(); + + if ((rval = system("/usr/sbin/metasync -r")) != 0) { + debug_printf("svm_start(): metasync -r failed: %d\n", rval); + goto errout; + } + + /* + * We ignore failures from metadevadm, since it can fail if + * miniroot dev_t's don't match target dev_ts. 
But it still + * will update md.conf with device Id information which is + * why we are calling it here. + */ + + (void) system("/usr/sbin/metadevadm -r"); + + /* + * check to see if we have a root metadevice and if so + * get its components. + */ + + if ((rval = get_rootmetadevice(rootdir, &mdevnamep)) == 0) { + if (rval = get_mdcomponents(mdevnamep, svmpp)) { + debug_printf("svm_start(): get_mdcomponents(%s,..)" + "failed %d\n", mdevnamep, rval); + goto errout; + } + + } else { + rval = 0; /* not a mirrored root */ + debug_printf("svm_start(): get_rootmetadevice(%s,..) " + "No root mirrors! ", rootdir); + } +errout: + free(mdevnamep); + if (rval != 0) { + struct stat sbuf; + if (stat(MD_CONF_ORIG, &sbuf) == 0) + (void) copyfile(MD_CONF_ORIG, MD_CONF); + debug_printf("svm_start(): svm_start failed: %d\n", rval); + } else { + int i; + + if ((*svmpp)->count > 0) { + debug_printf("svmpp: "); + debug_printf(" root_md: %s", (*svmpp)->root_md); + debug_printf(" count: %d", (*svmpp)->count); + for (i = 0; i < (*svmpp)->count; i++) { + debug_printf(" md_comps[%d]: %s", i, + (*svmpp)->md_comps[i]); + } + debug_printf(" \n"); + } else { + if ((*svmpp)->count == 0) + debug_printf("svm_start(): no mirrored root\n"); + } + debug_printf("svm_start(): svm_start succeeded.\n"); + } + return (rval); +} + +/* + * FUNCTION: copyfile + * + * INPUT: self descriptive + * + * RETURN: + * RET_SUCCESS + * RET_ERROR + */ +int +copyfile(char *from, char *to) +{ + int fromfd, tofd; + char buf[1024]; + ssize_t rbytes; + struct stat fromstat; + + if ((fromfd = open(from, O_RDONLY | O_NDELAY)) < 0) + return (RET_ERROR); + + if ((fstat(fromfd, &fromstat) < 0) || ! 
ISREG(fromstat)) { + (void) close(fromfd); + return (RET_ERROR); + } + + if ((tofd = open(to, O_CREAT | O_WRONLY | O_TRUNC, + (fromstat.st_mode & MODEBITS))) < 0) { + (void) close(fromfd); + return (RET_ERROR); + } + + /* + * in case the file exists then perm is forced by this chmod + */ + (void) fchmod(tofd, fromstat.st_mode & MODEBITS); + + for (;;) { + rbytes = read(fromfd, buf, sizeof (buf)); + /* + * no need to check for negative values since the file + * has been successfully stat'ed + */ + if (rbytes == 0) + break; + if (write(tofd, buf, rbytes) != rbytes) { + rbytes = -1; + break; + } + } + + (void) close(fromfd); + (void) close(tofd); + if (rbytes < 0) { + (void) unlink(to); + return (RET_ERROR); + } + return (RET_SUCCESS); +} diff --git a/usr/src/lib/lvm/libsvm/common/update_mdconf.c b/usr/src/lib/lvm/libsvm/common/update_mdconf.c new file mode 100644 index 0000000000..f757648911 --- /dev/null +++ b/usr/src/lib/lvm/libsvm/common/update_mdconf.c @@ -0,0 +1,379 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <devid.h> +#include <errno.h> +#include <string.h> +#include <assert.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <meta.h> +#include <libsvm.h> +#include <svm.h> + +/* + * magic strings in system + */ +#define BEGMDDBSTR "* Begin MDD database info (do not edit)\n" +#define ENDMDDBSTR "* End MDD database info (do not edit)\n" +#define NEW_BEGMDDBSTR "# Begin MDD database info (do not edit)\n" +#define NEW_ENDMDDBSTR "# End MDD database info (do not edit)\n" + +#define MDDBBOOTLIST "mddb_bootlist" + +#define SYS_COMMENTCHAR '*' +#define CONF_COMMENTCHAR '#' + +typedef struct { + char *prop_name; + int prop_val; +} md_prop_t; + +typedef enum { + MDDB_SYS_FILE, + MDDB_MDCONF_FILE +} ftype_t; + +static md_prop_t upgrade_props[] = { + { PROP_KEEP_REPL_STATE, 0 }, + { PROP_DEVID_DESTROY, 0}, + { NULL, 0} +}; + +/* + * The following functions manage upgrade properties + */ + +void +set_upgrade_prop(char *prop_name, int val) +{ + md_prop_t *upp; + + upp = &upgrade_props[0]; + + for (; upp->prop_name != NULL; upp++) { + if (strcmp(upp->prop_name, prop_name) == 0) { + upp->prop_val = val; + return; + } + } +} + +int +is_upgrade_prop(char *prop_name) +{ + md_prop_t *upp; + + upp = &upgrade_props[0]; + + for (; upp->prop_name != NULL; upp++) { + if (strcmp(upp->prop_name, prop_name) == 0) { + return (upp->prop_val == 1); + } + } + return (0); +} + +int +create_in_file_prop(char *prop_name, char *fname) +{ + FILE *fp; + md_prop_t *upp; + int rval = RET_ERROR; + + if ((fp = fopen(fname, "a")) == NULL) { + return (errno); + } + + upp = &upgrade_props[0]; + + for (; upp->prop_name != NULL; upp++) { + if (strcmp(upp->prop_name, prop_name) == 0) { + (void) fprintf(fp, "%s = 1;\n", upp->prop_name); + rval = RET_SUCCESS; + break; + } + } + (void) fclose(fp); + return (rval); +} + +static int +is_devid_added(char *str) +{ + int cnt = 0; + char *cp; + + /* 
there are exactly 3 colons in the string for devid */ + for (cnt = 0; cnt < 4; cnt++) { + if ((cp = strchr(str, ':')) == NULL) + break; + str = ++cp; + } + return (cnt == 3); +} + +/* + * FUNCTION: parse_bootlist + * Parse the bootlist and add the extra field to mddb_boolist entry to + * conform to devid changes. + * + * Old format: <drivername>:<minor_number>:<offset> + * New format: <drivername>:<minor_number>:<offset>:<devid> + * Devid of id0 implies no device id. + * + * INPUT: *line - contains the mddb_bootlist + * *tfp - File pointer to the md.conf.tmp file. + * + * RETURN: + * 0 - Success + * > 0 - Failure. Errno returned + */ + +static int +parse_bootlist(char *line, FILE *tfp) +{ + char output[1024]; + char *cp; + int retval = RET_SUCCESS; + + (void) memset(output, 0, sizeof (output)); + + if (line[0] == SYS_COMMENTCHAR) { + output[0] = CONF_COMMENTCHAR; + } + /* move the line start of mddbbootlist */ + cp = strstr(line, MDDBBOOTLIST); + if (cp != NULL) + line = cp; + + /* grab the "mddb_boolist" word */ + cp = strtok(line, "= "); + (void) strcat(output, cp); + (void) strcat(output, "=\042"); /* add back the EQUAL and QUOTE chars */ + + /* + * The line passed in is for example, + * mddb_bootlist1="sd:7:16:id1,sd@SIBM_DDRS34560SUN4.2G2N9688_____/h"; + * At this point mddb_bootlist and "=" have been parsed out. + * The remaining string consists of driver name, colon separator and + * the device id(if it exists) within quotes. + * The deviceid string can contain upper and lower letters, digits + * and +-.=_~. Quotes, spaces and \n and \t are not + * allowed. They are converted to either _ or their ascii value. + * So using space,\n,;and quotes as a separator is safe. + */ + + while ((cp = strtok(NULL, " \n\042;")) != NULL) { + (void) strcat(output, cp); + if (!is_devid_added(cp)) { + /* append :id0 for devid */ + (void) strcat(strcat(output, ":"), + devid_str_encode(NULL, NULL)); + + /* no devid => SDS->SLVM migration. 
Set the flag */ + set_upgrade_prop(PROP_DEVID_DESTROY, 1); + } + (void) strcat(output, " "); /* leave space between entries */ + } + + /* remove the extra space at the end */ + output[strlen(output) - 1] = 0; + (void) strcat(output, "\042;\n"); + if (fprintf(tfp, "%s", output) < 0) { + retval = errno; + } + return (retval); +} + +/* + * FUNCTION: snarf_n_modify_bootlist + * This function stuffs the mddb_bootlist from either etc/system + * or kernel/drv/md.conf of the target system into a temporary file tname. + * The boolist in the temporary file is in device ID format. + * + * INPUT: *fp - file pointer that contains the mddb_bootlist. + * *tname - file into which the modified bootlist will be written to. + * * buf - buffer handed by upper level routine for reading in contents. + * * bufsiz - size of the buffer. + * mddb_file - flag + * + * RETURN: + * 0 - Success + * > 0 - Failure. Errno returned. + */ + +static int +snarf_n_modify_bootlist( + FILE *fp, /* File pointer to snarf from */ + char *tname, /* name of the temporary file */ + char *buf, /* Buffer to read into */ + int bufsz, /* buffer size */ + ftype_t mddb_file /* flag to indicate if its /etc/system or md.conf */ +) +{ + FILE *tfp; + int rval = RET_SUCCESS; + char *fname = SYSTEM_FILE; + char *mddb_start = BEGMDDBSTR; + char *mddb_end = ENDMDDBSTR; + convflag_t cstatus = MD_STR_NOTFOUND; + + if (mddb_file == MDDB_MDCONF_FILE) { + fname = MD_CONF; + mddb_start = NEW_BEGMDDBSTR; + mddb_end = NEW_ENDMDDBSTR; + } + + if ((tfp = fopen(tname, "a")) == NULL) + return (errno); + debug_printf("Convert from %s\n", fname); + + rewind(fp); + while (fgets(buf, bufsz, fp) != NULL) { + if (strcmp(buf, mddb_start) == 0) { + cstatus = MD_STR_START; + if (fprintf(tfp, "%s", NEW_BEGMDDBSTR) < 0) { + rval = errno; + break; + } + continue; + } + if (cstatus == MD_STR_START) { + if (strcmp(buf, mddb_end) == 0) { + cstatus = MD_STR_DONE; + if (fprintf(tfp, "%s", NEW_ENDMDDBSTR) < 0) { + rval = errno; + break; + } + + if 
(mddb_file == MDDB_MDCONF_FILE) + continue; + else + break; + } + + rval = parse_bootlist(buf, tfp); + if (rval == RET_SUCCESS) + continue; + else + break; + } + if (mddb_file == MDDB_MDCONF_FILE) { + if (fprintf(tfp, "%s\n", buf) < 0) { + rval = errno; + break; + } + } + + } /* while (fgets */ + + if (cstatus == MD_STR_NOTFOUND || cstatus == MD_STR_START) + rval = RET_ERROR; + (void) fclose(tfp); + return (rval); +} + + +/* + * FUNCTION: convert_bootlist + * Get the bootlist from $ROOT/etc/system and add modified bootlist to + * md.conf. + * The function converts the mddb_boolist format from that in /etc/system + * to md.conf. Also new fields are added to handle the devid id format. + * A copy of md.conf is created and the new entries are added to it. + * The name of the new file is returned to the calling program. + * + * Input: system file name + * md.conf file name + * pointer to temp file name. + * RETURN: + * *tname - name of the file that has md.conf + new mddb_boolist entries + * 0 - success + * -1 - mddb_bootlist not found + * > 0 - errno + * + */ + +int +convert_bootlist( + char *sname, /* system file name */ + char *mdconf, /* md.conf file name */ + char **tname /* temp file name */ +) +{ + FILE *fp; + char cmd_buf[MDDB_BOOTLIST_MAX_LEN]; + int retval = RET_SUCCESS; + + /* check names */ + assert(sname != NULL); + assert(tname != NULL); + + /* get temp name */ + *tname = tmpnam(NULL); + + if ((fp = fopen(sname, "r")) == NULL) { + retval = errno; + goto out; + } + if (valid_bootlist(fp, MDDB_BOOTLIST_MAX_LEN) == RET_SUCCESS) { + if ((retval = copyfile(mdconf, *tname)) == RET_ERROR) { + debug_printf("convert_bootlist: copy %s %s failed\n", + mdconf, *tname); + goto out; + } + retval = snarf_n_modify_bootlist(fp, *tname, cmd_buf, + MDDB_BOOTLIST_MAX_LEN, MDDB_SYS_FILE); + } else { + (void) fclose(fp); /* close system file */ + if ((fp = fopen(mdconf, "r")) == NULL) { + retval = errno; + goto out; + } + if (valid_bootlist(fp, MDDB_BOOTLIST_MAX_LEN) == 
RET_ERROR) { + retval = RET_ERROR; + goto out; + } + retval = snarf_n_modify_bootlist(fp, *tname, cmd_buf, + MDDB_BOOTLIST_MAX_LEN, MDDB_MDCONF_FILE); + } +out: + debug_printf("convert_bootlist: retval %d\n", retval); + if (fp != NULL) + (void) fclose(fp); + + if ((retval != RET_SUCCESS) && (*tname != NULL)) { + (void) unlink(*tname); + free(*tname); + } + return (retval); +} diff --git a/usr/src/lib/lvm/libsvm/i386/Makefile b/usr/src/lib/lvm/libsvm/i386/Makefile new file mode 100644 index 0000000000..4fc9526b2a --- /dev/null +++ b/usr/src/lib/lvm/libsvm/i386/Makefile @@ -0,0 +1,31 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 2001 by Sun Microsystems, Inc. +# All rights reserved. 
#
# ident "%Z%%M% %I% %E% SMI"
#

# Everything except the install/debug dependencies below comes from the
# common makefile one directory up.
include ../Makefile.com

# "install" and "debug" share one dependency list; the three macros are
# defined outside this file (presumably via ../Makefile.com - TODO confirm).
install debug: $(ROOTLIBDIR) $(ROOTLIBS) $(ROOTLINKS)
diff --git a/usr/src/lib/lvm/libsvm/sparc/Makefile b/usr/src/lib/lvm/libsvm/sparc/Makefile
new file mode 100644
index 0000000000..4fc9526b2a
--- /dev/null
+++ b/usr/src/lib/lvm/libsvm/sparc/Makefile
@@ -0,0 +1,31 @@
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License, Version 1.0 only
# (the "License"). You may not use this file except in compliance
# with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2001 by Sun Microsystems, Inc.
# All rights reserved.
#
# ident "%Z%%M% %I% %E% SMI"
#

# Everything except the install/debug dependencies below comes from the
# common makefile one directory up.
include ../Makefile.com

# "install" and "debug" share one dependency list; the three macros are
# defined outside this file (presumably via ../Makefile.com - TODO confirm).
install debug: $(ROOTLIBDIR) $(ROOTLIBS) $(ROOTLINKS)
diff --git a/usr/src/lib/lvm/libsvm/spec/Makefile b/usr/src/lib/lvm/libsvm/spec/Makefile
new file mode 100644
index 0000000000..7256a09a20
--- /dev/null
+++ b/usr/src/lib/lvm/libsvm/spec/Makefile
@@ -0,0 +1,28 @@
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License, Version 1.0 only
# (the "License"). You may not use this file except in compliance
# with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#pragma ident "%Z%%M% %I% %E% SMI"
#
# Copyright (c) 2001 by Sun Microsystems, Inc.
# All rights reserved.
#

# This makefile defines nothing itself; all targets come from the shared
# spec makefile included here.
include $(SRC)/lib/Makefile.spec.arch
diff --git a/usr/src/lib/lvm/libsvm/spec/Makefile.targ b/usr/src/lib/lvm/libsvm/spec/Makefile.targ
new file mode 100644
index 0000000000..3a5ed0eb26
--- /dev/null
+++ b/usr/src/lib/lvm/libsvm/spec/Makefile.targ
@@ -0,0 +1,41 @@
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License, Version 1.0 only
# (the "License"). You may not use this file except in compliance
# with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#pragma ident "%Z%%M% %I% %E% SMI"
#
# Copyright (c) 2001 by Sun Microsystems, Inc.
# All rights reserved.
#

# Library name and version suffix for the libsvm spec build.
LIBRARY = libsvm.a
VERS = .1

# Single object for this library; the per-architecture makefiles append
# to OBJECTS after including this file.
OBJECTS = svm.o

# Left empty on purpose in this file.
TRANSCPP =

# Include search path handed to the spec tooling (consumer is outside this
# file - presumably $(SRC)/lib/Makefile.spec; TODO confirm).
SPECCPP = -I.. -I../../inc

#
# usr/snadm/lib/abi targets
#
# SVMLIB_ABILIB refers to SNADMINLIB_ABI before it is assigned; this works
# because "=" macros are expanded when used, not when defined.
SVMLIB_ABILIB=	$(SNADMINLIB_ABI)/$(ABILIB)
SNADMINLIB_ABI=$(ROOT)/usr/snadm/lib/abi
diff --git a/usr/src/lib/lvm/libsvm/spec/amd64/Makefile b/usr/src/lib/lvm/libsvm/spec/amd64/Makefile
new file mode 100644
index 0000000000..a3a067f7a0
--- /dev/null
+++ b/usr/src/lib/lvm/libsvm/spec/amd64/Makefile
@@ -0,0 +1,50 @@
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License, Version 1.0 only
# (the "License"). You may not use this file except in compliance
# with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2004 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# ident "%Z%%M% %I% %E% SMI"
#

.KEEP_STATE:

# To enable apptrace, comment out the following line
DISABLE_APPTRACE= $(POUND_SIGN)

# LIBRARY, VERS, OBJECTS and SNADMINLIB_ABI are set in ../Makefile.targ.
include ../Makefile.targ

# Add arch specific objects here
OBJECTS +=

include $(SRC)/lib/Makefile.lib

# Uncomment the following if the linker complains
#amd64_C_PICFLAGS = $(amd64_C_BIGPICFLAGS)

include $(SRC)/lib/Makefile.spec

# Every line of the install rule starts with $(DISABLE_APPTRACE). With the
# assignment above, that prefix is $(POUND_SIGN) - presumably "#", which
# would turn the whole rule into comments; TODO confirm where POUND_SIGN
# is defined. When the prefix is empty the rule creates the amd64
# directory under $(SNADMINLIB_ABI) and installs $(ABILIB) into it.
$(DISABLE_APPTRACE)install: $(SPECMAP) $(ABILIB)
$(DISABLE_APPTRACE)	$(INS) -s -d -m $(DIRMODE) -u $(OWNER) \
$(DISABLE_APPTRACE)		-g $(GROUP) $(SNADMINLIB_ABI)/amd64
$(DISABLE_APPTRACE)	$(INS) -s -m $(FILEMODE) -u $(OWNER) \
$(DISABLE_APPTRACE)		-g $(GROUP) -f $(SNADMINLIB_ABI)/amd64 $(ABILIB)
diff --git a/usr/src/lib/lvm/libsvm/spec/i386/Makefile b/usr/src/lib/lvm/libsvm/spec/i386/Makefile
new file mode 100644
index 0000000000..23807a39c5
--- /dev/null
+++ b/usr/src/lib/lvm/libsvm/spec/i386/Makefile
@@ -0,0 +1,53 @@
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License, Version 1.0 only
# (the "License"). You may not use this file except in compliance
# with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
#ident "%Z%%M% %I% %E% SMI"
#
# Copyright 2001-2003 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#

.KEEP_STATE:

# To enable apptrace, comment out the following line
DISABLE_APPTRACE= $(POUND_SIGN)

# LIBRARY, VERS, OBJECTS and SNADMINLIB_ABI are set in ../Makefile.targ.
include ../Makefile.targ

# Add arch specific objects here
OBJECTS +=

include $(SRC)/lib/Makefile.lib

# Uncomment the following if the linker complains
#i386_C_PICFLAGS = -K PIC

include $(SRC)/lib/Makefile.spec

# Installed location of the ABI library.
ROOTABILIB=	$(SNADMINLIB_ABI)/$(ABILIB)

# Extra dependencies only; the recipe is attached by the rule below.
$(ROOTABILIB): $(SNADMINLIB_ABI) $(SPECMAP)

# Installs $(ABILIB) using the INS.file macro, which is defined outside
# this file.
$(ROOTABILIB): $(ABILIB)
	$(INS.file)

# The install dependency is prefixed with $(DISABLE_APPTRACE). With the
# assignment above, that prefix is $(POUND_SIGN) - presumably "#", which
# would comment the line out; TODO confirm where POUND_SIGN is defined.
$(DISABLE_APPTRACE)install: $(ROOTABILIB)
diff --git a/usr/src/lib/lvm/libsvm/spec/sparc/Makefile b/usr/src/lib/lvm/libsvm/spec/sparc/Makefile
new file mode 100644
index 0000000000..19aecc452c
--- /dev/null
+++ b/usr/src/lib/lvm/libsvm/spec/sparc/Makefile
@@ -0,0 +1,50 @@
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License, Version 1.0 only
# (the "License"). You may not use this file except in compliance
# with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
#pragma ident "%Z%%M% %I% %E% SMI"
#
# Copyright (c) 2001 by Sun Microsystems, Inc.
# All rights reserved.
#

.KEEP_STATE:

# To enable apptrace, comment out the following line
DISABLE_APPTRACE= $(POUND_SIGN)

# LIBRARY, VERS, OBJECTS and SNADMINLIB_ABI are set in ../Makefile.targ.
include ../Makefile.targ

# Add arch specific objects here
OBJECTS +=

include $(SRC)/lib/Makefile.lib

# Uncomment the following if the linker complains
#sparc_C_PICFLAGS = -K PIC

include $(SRC)/lib/Makefile.spec

# Every line of the install rule starts with $(DISABLE_APPTRACE). With the
# assignment above, that prefix is $(POUND_SIGN) - presumably "#", which
# would turn the whole rule into comments; TODO confirm where POUND_SIGN
# is defined. When the prefix is empty the rule creates $(SNADMINLIB_ABI)
# and installs $(ABILIB) into it.
$(DISABLE_APPTRACE)install: $(SPECMAP) $(ABILIB)
$(DISABLE_APPTRACE)	$(INS) -s -d -m $(DIRMODE) -u $(OWNER) \
$(DISABLE_APPTRACE)		-g $(GROUP) $(SNADMINLIB_ABI)
$(DISABLE_APPTRACE)	$(INS) -s -m $(FILEMODE) -u $(OWNER) \
$(DISABLE_APPTRACE)		-g $(GROUP) -f $(SNADMINLIB_ABI) $(ABILIB)
diff --git a/usr/src/lib/lvm/libsvm/spec/sparcv9/Makefile b/usr/src/lib/lvm/libsvm/spec/sparcv9/Makefile
new file mode 100644
index 0000000000..704f35f988
--- /dev/null
+++ b/usr/src/lib/lvm/libsvm/spec/sparcv9/Makefile
@@ -0,0 +1,50 @@
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License, Version 1.0 only
# (the "License"). You may not use this file except in compliance
# with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
#pragma ident "%Z%%M% %I% %E% SMI"
#
# Copyright (c) 2001 by Sun Microsystems, Inc.
# All rights reserved.
#

.KEEP_STATE:

# To enable apptrace, comment out the following line
DISABLE_APPTRACE= $(POUND_SIGN)

# LIBRARY, VERS, OBJECTS and SNADMINLIB_ABI are set in ../Makefile.targ.
include ../Makefile.targ

# Add arch specific objects here
OBJECTS +=

include $(SRC)/lib/Makefile.lib

# Uncomment the following if the linker complains
#sparc_C_PICFLAGS = -K PIC

include $(SRC)/lib/Makefile.spec

# Every line of the install rule starts with $(DISABLE_APPTRACE). With the
# assignment above, that prefix is $(POUND_SIGN) - presumably "#", which
# would turn the whole rule into comments; TODO confirm where POUND_SIGN
# is defined. When the prefix is empty the rule creates the sparcv9
# directory under $(SNADMINLIB_ABI) and installs $(ABILIB) into it.
$(DISABLE_APPTRACE)install: $(SPECMAP) $(ABILIB)
$(DISABLE_APPTRACE)	$(INS) -s -d -m $(DIRMODE) -u $(OWNER) \
$(DISABLE_APPTRACE)		-g $(GROUP) $(SNADMINLIB_ABI)/sparcv9
$(DISABLE_APPTRACE)	$(INS) -s -m $(FILEMODE) -u $(OWNER) \
$(DISABLE_APPTRACE)		-g $(GROUP) -f $(SNADMINLIB_ABI)/sparcv9 $(ABILIB)
diff --git a/usr/src/lib/lvm/libsvm/spec/svm.spec b/usr/src/lib/lvm/libsvm/spec/svm.spec
new file mode 100644
index 0000000000..ae381eaae5
--- /dev/null
+++ b/usr/src/lib/lvm/libsvm/spec/svm.spec
@@ -0,0 +1,55 @@
#
# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License, Version 1.0 only
# (the "License"). You may not use this file except in compliance
# with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#pragma ident "%Z%%M% %I% %E% SMI"
#

# One "function ... end" stanza per libsvm entry point; every entry point
# listed here is assigned to the version SUNWprivate_1.1.

function	svm_check
version		SUNWprivate_1.1
end

function	svm_start
version		SUNWprivate_1.1
end

function	svm_stop
version		SUNWprivate_1.1
end

function	svm_is_md
version		SUNWprivate_1.1
end

function	svm_get_components
version		SUNWprivate_1.1
end

function	svm_alloc
version		SUNWprivate_1.1
end

function	svm_free
version		SUNWprivate_1.1
end

diff --git a/usr/src/lib/lvm/libsvm/spec/versions b/usr/src/lib/lvm/libsvm/spec/versions
new file mode 100644
index 0000000000..523cb927d5
--- /dev/null
+++ b/usr/src/lib/lvm/libsvm/spec/versions
@@ -0,0 +1,31 @@
#pragma ident "%Z%%M% %I% %E% SMI"
# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License, Version 1.0 only
# (the "License"). You may not use this file except in compliance
# with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Version sets per architecture. Only i386 and sparc are listed here.
# NOTE(review): spec makefiles also exist for amd64 and sparcv9 - confirm
# whether those architectures need their own entries.
i386 {
	SUNWprivate_1.1;
}
sparc {
	SUNWprivate_1.1;
}
|