summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--usr/src/cmd/lvm/rpc.mdcommd/mdcomm.xml8
-rw-r--r--usr/src/cmd/lvm/rpc.mdcommd/mddoors.c63
-rw-r--r--usr/src/cmd/lvm/rpc.mdcommd/mdmn_commd_server.c237
-rw-r--r--usr/src/cmd/lvm/rpc.mdcommd/mdmn_commd_service.c100
-rw-r--r--usr/src/cmd/lvm/rpc.mdcommd/mdmn_subr.c20
-rw-r--r--usr/src/cmd/lvm/util/metaclust.c24
-rw-r--r--usr/src/cmd/mdb/common/modules/md/dumpmirror.c230
-rw-r--r--usr/src/cmd/mdb/common/modules/md/md.c12
-rw-r--r--usr/src/cmd/mdb/common/modules/md/metastat.c116
-rw-r--r--usr/src/cmd/mdb/intel/amd64/md/Makefile9
-rw-r--r--usr/src/cmd/mdb/intel/ia32/md/Makefile9
-rw-r--r--usr/src/cmd/mdb/sparc/v9/md/Makefile9
-rw-r--r--usr/src/head/meta.h12
-rw-r--r--usr/src/lib/lvm/libmeta/common/mapfile-vers24
-rw-r--r--usr/src/lib/lvm/libmeta/common/meta_db.c11
-rw-r--r--usr/src/lib/lvm/libmeta/common/meta_mn_changelog.c50
-rw-r--r--usr/src/lib/lvm/libmeta/common/meta_mn_comm.c280
-rw-r--r--usr/src/lib/lvm/libmeta/common/meta_mn_handlers.c69
-rw-r--r--usr/src/lib/lvm/libmeta/common/meta_mn_msg_table.c34
-rw-r--r--usr/src/lib/lvm/libmeta/common/meta_mn_subr.c27
-rw-r--r--usr/src/lib/lvm/libmeta/common/meta_nameinfo.c10
-rw-r--r--usr/src/lib/lvm/libmeta/common/meta_runtime.c85
-rw-r--r--usr/src/lib/lvm/libmeta/common/meta_set.c14
-rw-r--r--usr/src/lib/lvm/libmeta/common/meta_set_hst.c443
-rw-r--r--usr/src/lib/lvm/libmeta/common/meta_sp.c90
-rw-r--r--usr/src/uts/common/io/lvm/md/md.c3
-rw-r--r--usr/src/uts/common/io/lvm/md/md_ioctl.c66
-rw-r--r--usr/src/uts/common/io/lvm/md/md_mddb.c958
-rw-r--r--usr/src/uts/common/io/lvm/md/md_subr.c203
-rw-r--r--usr/src/uts/common/io/lvm/mirror/mirror.c136
-rw-r--r--usr/src/uts/common/io/lvm/mirror/mirror_ioctl.c43
-rw-r--r--usr/src/uts/common/io/lvm/mirror/mirror_resync.c1009
-rw-r--r--usr/src/uts/common/io/lvm/softpart/sp.c11
-rw-r--r--usr/src/uts/common/io/lvm/softpart/sp_ioctl.c1
-rw-r--r--usr/src/uts/common/sys/lvm/md_mirror.h39
-rw-r--r--usr/src/uts/common/sys/lvm/md_sp.h11
-rw-r--r--usr/src/uts/common/sys/lvm/mdio.h33
-rw-r--r--usr/src/uts/common/sys/lvm/mdmn_commd.x52
-rw-r--r--usr/src/uts/common/sys/lvm/mdvar.h6
39 files changed, 3235 insertions, 1322 deletions
diff --git a/usr/src/cmd/lvm/rpc.mdcommd/mdcomm.xml b/usr/src/cmd/lvm/rpc.mdcommd/mdcomm.xml
index c3cff1c1ca..e9910ae6ef 100644
--- a/usr/src/cmd/lvm/rpc.mdcommd/mdcomm.xml
+++ b/usr/src/cmd/lvm/rpc.mdcommd/mdcomm.xml
@@ -1,7 +1,7 @@
<?xml version='1.0'?>
<!DOCTYPE service_bundle SYSTEM '/usr/share/lib/xml/dtd/service_bundle.dtd.1'>
<!--
- Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ Copyright 2008 Sun Microsystems, Inc. All rights reserved.
Use is subject to license terms.
CDDL HEADER START
@@ -23,8 +23,6 @@
CDDL HEADER END
- pragma ident "%Z%%M% %I% %E% SMI"
-
NOTE: This service manifest is not editable; its contents will
be overwritten by package or patch operations, including
operating system upgrade. Make customizations in a different
@@ -82,8 +80,8 @@
<propval name='endpoint_type' type='astring' value='tli' />
<propval name='wait' type='boolean' value='true' />
<propval name='isrpc' type='boolean' value='true' />
- <propval name='rpc_low_version' type='integer' value='1' />
- <propval name='rpc_high_version' type='integer' value='1' />
+ <propval name='rpc_low_version' type='integer' value='2' />
+ <propval name='rpc_high_version' type='integer' value='2' />
<propval name='proto' type='astring' value='tcp' />
</property_group>
diff --git a/usr/src/cmd/lvm/rpc.mdcommd/mddoors.c b/usr/src/cmd/lvm/rpc.mdcommd/mddoors.c
index 401bd07421..7c12bb59eb 100644
--- a/usr/src/cmd/lvm/rpc.mdcommd/mddoors.c
+++ b/usr/src/cmd/lvm/rpc.mdcommd/mddoors.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,13 +18,12 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <door.h>
#include <locale.h>
#include <meta.h>
@@ -106,7 +104,7 @@ exit_daemon_lock(void)
if (close(daemon_lock_fd) == -1) {
syslog(LOG_DAEMON | LOG_DEBUG,
gettext("close(%s) failed - %s\n"),
- daemon_lock_file, strerror(errno));
+ daemon_lock_file, strerror(errno));
return;
}
unlink(daemon_lock_file);
@@ -133,37 +131,32 @@ door2rpc(void *cookie, /* required by the doors infrastructure */
md_mn_kresult_t kresult;
md_mn_kmsg_t *kmsg = (md_mn_kmsg_t *)(void *)argp;
- err = mdmn_send_message(kmsg->kmsg_setno,
- kmsg->kmsg_type,
- kmsg->kmsg_flags,
- (char *)&(kmsg->kmsg_data),
- kmsg->kmsg_size,
- &result,
- &ep);
+ err = mdmn_send_message(kmsg->kmsg_setno, kmsg->kmsg_type,
+ kmsg->kmsg_flags, kmsg->kmsg_recipient, (char *)&(kmsg->kmsg_data),
+ kmsg->kmsg_size, &result, &ep);
+
if (result == NULL) {
kresult.kmmr_comm_state = MDMNE_RPC_FAIL;
} else {
kresult.kmmr_comm_state = result->mmr_comm_state;
- }
- if (err == 0) {
- kresult.kmmr_msgtype = result->mmr_msgtype;
- kresult.kmmr_flags = result->mmr_flags;
- kresult.kmmr_exitval = result->mmr_exitval;
- kresult.kmmr_failing_node = result->mmr_failing_node;
- size = result->mmr_out_size;
- if (size > 0) {
- /* This is the maximum of data we can transfer, here */
- if (size > MDMN_MAX_KRES_DATA) {
- size = MDMN_MAX_KRES_DATA;
+ if (err == 0) {
+ kresult.kmmr_msgtype = result->mmr_msgtype;
+ kresult.kmmr_flags = result->mmr_flags;
+ kresult.kmmr_exitval = result->mmr_exitval;
+ kresult.kmmr_failing_node = result->mmr_failing_node;
+ size = result->mmr_out_size;
+ if (size > 0) {
+ /* This is the max data we can transfer, here */
+ if (size > MDMN_MAX_KRES_DATA) {
+ size = MDMN_MAX_KRES_DATA;
+ }
+ bcopy(result->mmr_out, &(kresult.kmmr_res_data),
+ size);
+ kresult.kmmr_res_size = size;
+ } else {
+ kresult.kmmr_res_size = 0;
}
- bcopy(result->mmr_out, &(kresult.kmmr_res_data), size);
- kresult.kmmr_res_size = size;
- } else {
- kresult.kmmr_res_size = 0;
}
- }
-
- if (result != NULL) {
free_result(result);
}
@@ -252,7 +245,7 @@ main(void)
* At this point we are single threaded.
* We give mdmn_send_message() a chance to initialize safely.
*/
- (void) mdmn_send_message(0, 0, 0, 0, 0, 0, 0);
+ (void) mdmn_send_message(0, 0, 0, 0, 0, 0, 0, 0);
/* setup the door handle */
mdmn_door_handle = door_create(door2rpc, NULL,
@@ -266,12 +259,12 @@ main(void)
if (metaioctl(MD_MN_SET_DOORH, &mdmn_door_handle, &ep,
"mddoors") != 0) {
syslog(LOG_DAEMON | LOG_DEBUG, gettext(
- "Couldn't set door handle"));
+ "Couldn't set door handle"));
exit(1);
}
(void) pause();
syslog(LOG_DAEMON | LOG_ERR, gettext(
- "Unexpected exit from pause()"));
+ "Unexpected exit from pause()"));
return (1);
}
diff --git a/usr/src/cmd/lvm/rpc.mdcommd/mdmn_commd_server.c b/usr/src/cmd/lvm/rpc.mdcommd/mdmn_commd_server.c
index 1413b2791d..9fda15beec 100644
--- a/usr/src/cmd/lvm/rpc.mdcommd/mdmn_commd_server.c
+++ b/usr/src/cmd/lvm/rpc.mdcommd/mdmn_commd_server.c
@@ -18,13 +18,12 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
@@ -42,38 +41,40 @@
/*
* This is the communication daemon for SVM Multi Node Disksets.
* It runs on every node and provides the following rpc services:
- * - mdmn_send_svc_1
- * - mdmn_work_svc_1
- * - mdmn_wakeup_initiator_svc_1
- * - mdmn_wakeup_master_svc_1
- * - mdmn_comm_lock_svc_1
- * - mdmn_comm_unlock_svc_1
- * - mdmn_comm_suspend_svc_1
- * - mdmn_comm_resume_svc_1
- * - mdmn_comm_reinit_set_svc_1
+ * - mdmn_send_svc_2
+ * - mdmn_work_svc_2
+ * - mdmn_wakeup_initiator_svc_2
+ * - mdmn_wakeup_master_svc_2
+ * - mdmn_comm_lock_svc_2
+ * - mdmn_comm_unlock_svc_2
+ * - mdmn_comm_suspend_svc_2
+ * - mdmn_comm_resume_svc_2
+ * - mdmn_comm_reinit_set_svc_2
* where send, lock, unlock and reinit are meant for external use,
* work and the two wakeups are for internal use only.
*
* NOTE:
- * On every node only one of those xxx_1 functions can be active at the
+ * On every node only one of those xxx_2 functions can be active at the
* same time because the daemon is single threaded.
*
+ * (not quite true, as mdmn_send_svc_2 and mdmn_work_svc_2 do thr_create()s
+ * as part of their handlers, so those aspects are multi-threaded)
*
* In case an event occurs that has to be propagated to all the nodes...
*
* One node (the initiator)
* calls the libmeta function mdmn_send_message()
- * This function calls the local daemon thru mdmn_send_svc_1.
+ * This function calls the local daemon thru mdmn_send_svc_2.
*
* On the initiator:
- * mdmn_send_svc_1()
+ * mdmn_send_svc_2()
* - starts a thread -> mdmn_send_to_work() and returns.
* mdmn_send_to_work()
* - sends this message over to the master of the diskset.
- * This is done by calling mdmn_work_svc_1 on the master.
+ * This is done by calling mdmn_work_svc_2 on the master.
* - registers to the initiator_table
* - exits without doing a svc_sendreply() for the call to
- * mdmn_send_svc_1. This means that call is blocked until somebody
+ * mdmn_send_svc_2. This means that call is blocked until somebody
* (see end of this comment) does a svc_sendreply().
* This means mdmn_send_message() does not yet return.
* - A timeout surveillance is started at this point.
@@ -82,42 +83,42 @@
* to the caller.
*
* On the master:
- * mdmn_work_svc_1()
+ * mdmn_work_svc_2()
* - starts a thread -> mdmn_master_process_msg() and returns
* mdmn_master_process_msg()
* - logs the message to the change log
* - executes the message locally
* - flags the message in the change log
- * - sends the message to mdmn_work_svc_1() on all the
+ * - sends the message to mdmn_work_svc_2() on all the
* other nodes (slaves)
- * after each call to mdmn_work_svc_1 the thread goes to sleep and
- * will be woken up by mdmn_wakeup_master_svc_1() as soon as the
+ * after each call to mdmn_work_svc_2 the thread goes to sleep and
+ * will be woken up by mdmn_wakeup_master_svc_2() as soon as the
* slave node is done with this message.
* - In case the slave doesn't respond in an appropriate time, an error
* is assumed to ensure the master doesn't wait forever.
*
* On a slave:
- * mdmn_work_svc_1()
+ * mdmn_work_svc_2()
* - starts a thread -> mdmn_slave_process_msg() and returns
* mdmn_slave_process_msg()
* - processes this message locally by calling the appropriate message
* handler, that creates some result.
- * - sends that result thru a call to mdmn_wakeup_master_svc_1() to
+ * - sends that result thru a call to mdmn_wakeup_master_svc_2() to
* the master.
*
* Back on the master:
- * mdmn_wakeup_master_svc_1()
+ * mdmn_wakeup_master_svc_2()
* - stores the result into the master_table.
* - signals the mdmn_master_process_msg-thread.
* - returns
* mdmn_master_process_msg()
* - after getting the results from all nodes
* - sends them back to the initiating node thru a call to
- * mdmn_wakeup_initiator_svc_1.
+ * mdmn_wakeup_initiator_svc_2.
*
* Back on the initiator:
- * mdmn_wakeup_initiator_svc_1()
- * - calls svc_sendreply() which makes the call to mdmn_send_svc_1()
+ * mdmn_wakeup_initiator_svc_2()
+ * - calls svc_sendreply() which makes the call to mdmn_send_svc_2()
* return.
* which allows the initial mdmn_send_message() call to return.
*/
@@ -195,8 +196,8 @@ mdmn_clnt_create(char *ignore, void *data, struct timeval *time_out)
{
md_mnnode_desc *node = (md_mnnode_desc *)data;
- return (clnt_create_timed(node->nd_priv_ic, MDMN_COMMD, ONE, "tcp",
- time_out));
+ return (clnt_create_timed(node->nd_priv_ic, MDMN_COMMD, TWO, "tcp",
+ time_out));
}
#define FLUSH_DEBUGFILE() \
@@ -219,15 +220,15 @@ panic_system(int nid, md_mn_msgtype_t type, int master_err, int master_exitval,
if (master_err != MDMNE_ACK) {
snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: RPC fail on master "
- "when processing message type %d\n", type);
+ "when processing message type %d\n", type);
} else if (slave_result == NULL) {
snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: RPC fail on node "
- "%d when processing message type %d\n", nid, type);
+ "%d when processing message type %d\n", nid, type);
} else {
snprintf(msg_buf, MAXPATHLEN, "rpc.mdcommd: Inconsistent "
- "return value from node %d when processing message "
- "type %d. Master exitval = %d, Slave exitval = %d\n",
- nid, type, master_exitval, slave_result->mmr_exitval);
+ "return value from node %d when processing message "
+ "type %d. Master exitval = %d, Slave exitval = %d\n",
+ nid, type, master_exitval, slave_result->mmr_exitval);
}
commd_err.size = strlen(msg_buf);
commd_err.md_message = (uint64_t)(uintptr_t)&msg_buf[0];
@@ -335,12 +336,17 @@ timeout_initiator(set_t setno, md_mn_msgclass_t class)
commd_debug(MD_MMV_MISC, "timeout_ini: (%d, 0x%llx-%d)\n",
MSGID_ELEMS(mid));
+ /*
+ * Give the result the corresponding msgid from the failed message.
+ */
+ MSGID_COPY(&mid, &(resultp->mmr_msgid));
/* return to mdmn_send_message() and let it deal with the situation */
mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
free(resultp);
commd_debug(MD_MMV_MISC, "timeout_ini: sendreplied\n");
+ svc_done(transp);
mdmn_unregister_initiator_table(setno, class);
}
@@ -499,13 +505,13 @@ mdmn_is_node_dead(md_mnnode_desc *node)
* Perform some global initializations.
*
* the following routines have to call this before operation can start:
- * - mdmn_send_svc_1
- * - mdmn_work_svc_1
- * - mdmn_comm_lock_svc_1
- * - mdmn_comm_unlock_svc_1
- * - mdmn_comm_suspend_svc_1
- * - mdmn_comm_resume_svc_1
- * - mdmn_comm_reinit_set_svc_1
+ * - mdmn_send_svc_2
+ * - mdmn_work_svc_2
+ * - mdmn_comm_lock_svc_2
+ * - mdmn_comm_unlock_svc_2
+ * - mdmn_comm_suspend_svc_2
+ * - mdmn_comm_resume_svc_2
+ * - mdmn_comm_reinit_set_svc_2
*
* This is a single threaded daemon, so it can only be in one of the above
* routines at the same time.
@@ -547,8 +553,7 @@ global_init(void)
__savetime = gethrtime();
(void) time(&clock_val);
- commd_debug(MD_MMV_MISC, "global init called %s\n",
- ctime(&clock_val));
+ commd_debug(MD_MMV_MISC, "global init called %s\n", ctime(&clock_val));
/* start a thread that flushes out the debug on a regular basis */
thr_create(NULL, 0, (void *(*)(void *))flush_fcout,
@@ -663,9 +668,9 @@ mdmn_init_client(set_t setno, md_mn_nodeid_t nid)
*/
while ((client[setno][nid] == (CLIENT *) NULL) &&
(tout < MD_CLNT_CREATE_TOUT)) {
- client[setno][nid] = meta_client_create_retry
- (node->nd_nodename, mdmn_clnt_create,
- (void *) node, MD_CLNT_CREATE_SUBTIMEOUT, &ep);
+ client[setno][nid] = meta_client_create_retry(
+ node->nd_nodename, mdmn_clnt_create,
+ (void *) node, MD_CLNT_CREATE_SUBTIMEOUT, &ep);
/* Is the node dead? */
if (mdmn_is_node_dead(node) == 1) {
commd_debug(MD_MMV_SYSLOG,
@@ -889,9 +894,9 @@ mdmn_init_set(set_t setno, int todo)
*/
while ((client[setno][nid] == (CLIENT *) NULL) &&
(tout < MD_CLNT_CREATE_TOUT)) {
- client[setno][nid] = meta_client_create_retry
- (node->nd_nodename, mdmn_clnt_create,
- (void *) node, MD_CLNT_CREATE_SUBTIMEOUT, &ep);
+ client[setno][nid] = meta_client_create_retry(
+ node->nd_nodename, mdmn_clnt_create,
+ (void *) node, MD_CLNT_CREATE_SUBTIMEOUT, &ep);
/* Is the node dead? */
if (mdmn_is_node_dead(node) == 1) {
commd_debug(MD_MMV_SYSLOG,
@@ -942,7 +947,7 @@ mdmn_init_set(set_t setno, int todo)
void *
mdmn_send_to_work(void *arg)
{
- int *rpc_err;
+ int *rpc_err = NULL;
int success;
int try_master;
set_t setno;
@@ -956,9 +961,6 @@ mdmn_send_to_work(void *arg)
msg = matp->mat_msg;
transp = matp->mat_transp;
- /* the alloc was done in mdmn_send_svc_1 */
- free(matp);
-
class = mdmn_get_message_class(msg->msg_type);
setno = msg->msg_setno;
@@ -980,8 +982,7 @@ mdmn_send_to_work(void *arg)
if (success == MDMNE_CLASS_BUSY) {
md_mn_msgid_t active_mid;
- mdmn_get_initiator_table_id(setno, class,
- &active_mid);
+ mdmn_get_initiator_table_id(setno, class, &active_mid);
commd_debug(MD_MMV_SEND,
"send_to_work: received but locally busy "
@@ -1011,7 +1012,8 @@ mdmn_send_to_work(void *arg)
* Send the request to the work function on the master
* this call will return immediately
*/
- rpc_err = mdmn_work_1(msg, client[setno][set_master]);
+ rpc_err = mdmn_work_2(msg, client[setno][set_master],
+ set_master);
/* Everything's Ok? */
if (rpc_err == NULL) {
@@ -1043,7 +1045,7 @@ mdmn_send_to_work(void *arg)
/*
* If we are here, we successfully delivered the message.
* We register the initiator_table, so that
- * wakeup_initiator_1 can do the sendreply with the
+ * wakeup_initiator_2 can do the sendreply with the
* results for us.
*/
success = MDMNE_ACK;
@@ -1068,15 +1070,27 @@ mdmn_send_to_work(void *arg)
md_mn_result_t *resultp;
resultp = Zalloc(sizeof (md_mn_result_t));
resultp->mmr_comm_state = success;
+ /*
+ * copy the MSGID so that we know _which_ message
+ * failed (if the transp has got mangled)
+ */
+ MSGID_COPY(&(msg->msg_msgid), &(resultp->mmr_msgid));
mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
commd_debug(MD_MMV_SEND,
"send_to_work: not registered (%d, 0x%llx-%d) cs=%d\n",
MSGID_ELEMS(msg->msg_msgid), success);
free_result(resultp);
+ /*
+ * We don't have a timeout registered to wake us up, so we're
+ * now done with this handle. Release it back to the pool.
+ */
+ svc_done(transp);
}
free_msg(msg);
+ /* the alloc was done in mdmn_send_svc_2 */
+ Free(matp);
mutex_unlock(mx);
return (NULL);
@@ -1186,7 +1200,7 @@ do_send_message(md_mn_msg_t *msg, md_mnnode_desc *node)
int timeout_retries = 0;
int *ret = NULL;
set_t setno;
- cond_t *cv; /* see mdmn_wakeup_master_svc_1 */
+ cond_t *cv; /* see mdmn_wakeup_master_svc_2 */
mutex_t *mx; /* protection for class_busy */
timestruc_t timeout; /* surveillance for remote daemon */
md_mn_nodeid_t nid;
@@ -1251,7 +1265,7 @@ retry_rpc:
}
/* send it over, it will return immediately */
- ret = mdmn_work_1(msg, client[setno][nid]);
+ ret = mdmn_work_2(msg, client[setno][nid], nid);
rw_unlock(&client_rwlock[setno]);
@@ -1462,7 +1476,7 @@ mdmn_master_process_msg(md_mn_msg_t *msg)
result->mmr_comm_state = MDMNE_LOG_FAIL;
/*
* Note that the mark_busy was already done by
- * mdmn_work_svc_1()
+ * mdmn_work_svc_2()
*/
mutex_lock(&mdmn_busy_mutex[setno]);
mdmn_mark_class_unbusy(setno, orig_class);
@@ -1487,8 +1501,8 @@ mdmn_master_process_msg(md_mn_msg_t *msg)
commd_debug(MD_MMV_SYSLOG,
"proc_mas: No client for initiator \n");
} else {
- ret = mdmn_wakeup_initiator_1(result,
- client[setno][sender]);
+ ret = mdmn_wakeup_initiator_2(result,
+ client[setno][sender], sender);
}
rw_unlock(&client_rwlock[setno]);
@@ -1677,6 +1691,12 @@ proceed:
continue;
}
+ /* If a DIRECTED message, skip non-recipient nodes */
+ if ((cmsg->msg_flags & MD_MSGF_DIRECTED) &&
+ nid != cmsg->msg_recipient) {
+ continue;
+ }
+
mutex_lock(mx);
/*
* Register the node that is addressed,
@@ -1865,7 +1885,8 @@ proceed:
commd_debug(MD_MMV_SYSLOG,
"proc_mas: unable to create client for initiator\n");
} else {
- ret = mdmn_wakeup_initiator_1(result, client[setno][sender]);
+ ret = mdmn_wakeup_initiator_2(result, client[setno][sender],
+ sender);
}
rw_unlock(&client_rwlock[setno]);
@@ -2046,14 +2067,14 @@ mdmn_slave_process_msg(md_mn_msg_t *msg)
rw_unlock(&client_rwlock[setno]);
break;
} else {
- ret = mdmn_wakeup_master_1(result,
- client[setno][sender]);
+ ret = mdmn_wakeup_master_2(result,
+ client[setno][sender], sender);
/*
- * if mdmn_wakeup_master_1 returns NULL, it can be that
+ * if mdmn_wakeup_master_2 returns NULL, it can be that
* the master (or the commd on the master) had died.
* In that case, we destroy the client to the master
* and retry.
- * If mdmn_wakeup_master_1 doesn't return MDMNE_ACK,
+ * If mdmn_wakeup_master_2 doesn't return MDMNE_ACK,
* the commd on the master is alive but
* something else is wrong,
* in that case a retry doesn't make sense => break out
@@ -2097,8 +2118,19 @@ mdmn_slave_process_msg(md_mn_msg_t *msg)
}
-md_mn_result_t *
-mdmn_send_svc_1(md_mn_msg_t *omsg, struct svc_req *rqstp)
+/*
+ * mdmn_send_svc_2:
+ * ---------------
+ * Check that the issuing node is a legitimate one (i.e. is licensed to send
+ * messages to us) and that the RPC request can be staged.
+ *
+ * Returns:
+ * 0 => no RPC request is in-flight, no deferred svc_sendreply()
+ * 1 => queued RPC request in-flight. Completion will be made (later)
+ * by a wakeup_initiator_2() [hopefully]
+ */
+int
+mdmn_send_svc_2(md_mn_msg_t *omsg, struct svc_req *rqstp)
{
int err;
set_t setno;
@@ -2121,7 +2153,7 @@ mdmn_send_svc_1(md_mn_msg_t *omsg, struct svc_req *rqstp)
mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
free_result(resultp);
svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
- return (NULL);
+ return (0);
}
/* check if the global initialization is done */
@@ -2152,7 +2184,7 @@ mdmn_send_svc_1(md_mn_msg_t *omsg, struct svc_req *rqstp)
(char *)resultp);
free_result(resultp);
svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
- return (NULL);
+ return (0);
}
}
@@ -2169,7 +2201,7 @@ mdmn_send_svc_1(md_mn_msg_t *omsg, struct svc_req *rqstp)
mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)resultp);
free_result(resultp);
svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
- return (NULL);
+ return (0);
}
@@ -2184,10 +2216,10 @@ mdmn_send_svc_1(md_mn_msg_t *omsg, struct svc_req *rqstp)
free_result(resultp);
svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
commd_debug(MD_MMV_SEND,
- "send: type locked (%d, 0x%llx-%d), set=%d, class=%d, "
- "type=%d\n", MSGID_ELEMS(msg->msg_msgid), setno, class,
- msg->msg_type);
- return (NULL);
+ "send: type locked (%d, 0x%llx-%d), set=%d, class=%d, "
+ "type=%d\n", MSGID_ELEMS(msg->msg_msgid), setno, class,
+ msg->msg_type);
+ return (0);
}
@@ -2213,7 +2245,7 @@ mdmn_send_svc_1(md_mn_msg_t *omsg, struct svc_req *rqstp)
free_result(resultp);
commd_debug(MD_MMV_SEND,
"send: init err = %d\n", err);
- return (NULL);
+ return (0);
}
}
@@ -2227,10 +2259,10 @@ mdmn_send_svc_1(md_mn_msg_t *omsg, struct svc_req *rqstp)
svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
free_result(resultp);
commd_debug(MD_MMV_SEND,
- "send: class suspended (%d, 0x%llx-%d), set=%d, "
- "class=%d, type=%d\n", MSGID_ELEMS(msg->msg_msgid),
- setno, class, msg->msg_type);
- return (NULL);
+ "send: class suspended (%d, 0x%llx-%d), set=%d, "
+ "class=%d, type=%d\n", MSGID_ELEMS(msg->msg_msgid),
+ setno, class, msg->msg_type);
+ return (0);
}
mutex_unlock(&mdmn_busy_mutex[setno]);
@@ -2238,10 +2270,10 @@ mdmn_send_svc_1(md_mn_msg_t *omsg, struct svc_req *rqstp)
if (check_license(rqstp, 0) == FALSE) {
svc_freeargs(transp, xdr_md_mn_msg_t, (caddr_t)msg);
commd_debug(MD_MMV_SEND,
- "send: check licence fail(%d, 0x%llx-%d), set=%d, "
- "class=%d, type=%d\n", MSGID_ELEMS(msg->msg_msgid),
- setno, class, msg->msg_type);
- return (NULL);
+ "send: check licence fail(%d, 0x%llx-%d), set=%d, "
+ "class=%d, type=%d\n", MSGID_ELEMS(msg->msg_msgid),
+ setno, class, msg->msg_type);
+ return (0);
}
@@ -2268,17 +2300,17 @@ mdmn_send_svc_1(md_mn_msg_t *omsg, struct svc_req *rqstp)
MSGID_ELEMS(msg->msg_msgid));
/*
* We return here without sending results. This will be done by
- * mdmn_wakeup_initiator_svc_1() as soon as the results are available.
+ * mdmn_wakeup_initiator_svc_2() as soon as the results are available.
* Until then the calling send_message will be blocked, while we
* are able to take calls.
*/
- return (NULL);
+ return (1);
}
/* ARGSUSED */
int *
-mdmn_work_svc_1(md_mn_msg_t *omsg, struct svc_req *rqstp)
+mdmn_work_svc_2(md_mn_msg_t *omsg, struct svc_req *rqstp)
{
int err;
set_t setno;
@@ -2362,7 +2394,7 @@ mdmn_work_svc_1(md_mn_msg_t *omsg, struct svc_req *rqstp)
mutex_lock(&mdmn_busy_mutex[setno]);
- /* check if class is locked via a call to mdmn_comm_lock_svc_1 */
+ /* check if class is locked via a call to mdmn_comm_lock_svc_2 */
if (mdmn_is_class_locked(setno, class) == TRUE) {
mutex_unlock(&mdmn_busy_mutex[setno]);
*retval = MDMNE_CLASS_LOCKED;
@@ -2430,14 +2462,14 @@ mdmn_work_svc_1(md_mn_msg_t *omsg, struct svc_req *rqstp)
/* ARGSUSED */
int *
-mdmn_wakeup_initiator_svc_1(md_mn_result_t *res, struct svc_req *rqstp)
+mdmn_wakeup_initiator_svc_2(md_mn_result_t *res, struct svc_req *rqstp)
{
int *retval;
int err;
set_t setno;
mutex_t *mx; /* protection of initiator_table */
- SVCXPRT *transp;
+ SVCXPRT *transp = NULL;
md_mn_msgid_t initiator_table_id;
md_mn_msgclass_t class;
@@ -2491,13 +2523,14 @@ mdmn_wakeup_initiator_svc_1(md_mn_result_t *res, struct svc_req *rqstp)
* Search the initiator wakeup table.
* If we find an entry here (which should always be true)
* we are on the initiating node and we wakeup the original
- * local rpc call
+ * local rpc call.
*/
mdmn_get_initiator_table_id(setno, class, &initiator_table_id);
if (MSGID_CMP(&(initiator_table_id), &(res->mmr_msgid))) {
transp = mdmn_get_initiator_table_transp(setno, class);
mdmn_svc_sendreply(transp, xdr_md_mn_result_t, (char *)res);
+ svc_done(transp);
mdmn_unregister_initiator_table(setno, class);
*retval = MDMNE_ACK;
@@ -2532,7 +2565,7 @@ mdmn_wakeup_initiator_svc_1(md_mn_result_t *res, struct svc_req *rqstp)
*/
/* ARGSUSED */
int *
-mdmn_wakeup_master_svc_1(md_mn_result_t *ores, struct svc_req *rqstp)
+mdmn_wakeup_master_svc_2(md_mn_result_t *ores, struct svc_req *rqstp)
{
int *retval;
@@ -2645,7 +2678,7 @@ mdmn_wakeup_master_svc_1(md_mn_result_t *ores, struct svc_req *rqstp)
* This is mainly done for debug purpose.
* This set/class combination immediately is blocked,
* even in the middle of sending messages to multiple slaves.
- * This remains until the user issues a mdmn_comm_unlock_svc_1 for the same
+ * This remains until the user issues a mdmn_comm_unlock_svc_2 for the same
* set/class combination.
*
* Special messages of class MD_MSG_CLASS0 can never be locked.
@@ -2666,7 +2699,7 @@ mdmn_wakeup_master_svc_1(md_mn_result_t *ores, struct svc_req *rqstp)
/* ARGSUSED */
int *
-mdmn_comm_lock_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
+mdmn_comm_lock_svc_2(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
{
int *retval;
set_t setno = msc->msc_set;
@@ -2722,7 +2755,7 @@ mdmn_comm_lock_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
*/
/* ARGSUSED */
int *
-mdmn_comm_unlock_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
+mdmn_comm_unlock_svc_2(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
{
int *retval;
set_t setno = msc->msc_set;
@@ -2766,7 +2799,7 @@ mdmn_comm_unlock_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
}
/*
- * mdmn_comm_suspend_svc_1(setno, class)
+ * mdmn_comm_suspend_svc_2(setno, class)
*
* Drain all outstanding messages for a given set/class combination
* and don't allow new messages to be processed.
@@ -2812,7 +2845,7 @@ mdmn_comm_unlock_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
/* ARGSUSED */
int *
-mdmn_comm_suspend_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
+mdmn_comm_suspend_svc_2(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
{
int *retval;
int failure = 0;
@@ -2902,7 +2935,7 @@ mdmn_comm_suspend_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
}
/*
- * mdmn_comm_resume_svc_1(setno, class)
+ * mdmn_comm_resume_svc_2(setno, class)
*
* Resume processing messages for a given set.
* This incorporates the repeal of a previous suspend operation.
@@ -2927,7 +2960,7 @@ mdmn_comm_suspend_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
*/
/* ARGSUSED */
int *
-mdmn_comm_resume_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
+mdmn_comm_resume_svc_2(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
{
int *retval;
set_t startset, endset;
@@ -3029,7 +3062,7 @@ mdmn_comm_resume_svc_1(md_mn_set_and_class_t *msc, struct svc_req *rqstp)
}
/* ARGSUSED */
int *
-mdmn_comm_reinit_set_svc_1(set_t *setnop, struct svc_req *rqstp)
+mdmn_comm_reinit_set_svc_2(set_t *setnop, struct svc_req *rqstp)
{
int *retval;
md_mnnode_desc *node;
@@ -3093,7 +3126,7 @@ mdmn_comm_reinit_set_svc_1(set_t *setnop, struct svc_req *rqstp)
/* ARGSUSED */
int *
-mdmn_comm_msglock_svc_1(md_mn_type_and_lock_t *mmtl, struct svc_req *rqstp)
+mdmn_comm_msglock_svc_2(md_mn_type_and_lock_t *mmtl, struct svc_req *rqstp)
{
int *retval;
md_mn_msgtype_t type = mmtl->mmtl_type;
diff --git a/usr/src/cmd/lvm/rpc.mdcommd/mdmn_commd_service.c b/usr/src/cmd/lvm/rpc.mdcommd/mdmn_commd_service.c
index 5525f8546a..f4e6478dc7 100644
--- a/usr/src/cmd/lvm/rpc.mdcommd/mdmn_commd_service.c
+++ b/usr/src/cmd/lvm/rpc.mdcommd/mdmn_commd_service.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,13 +18,12 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/lvm/mdmn_commd.h>
#include <stdio.h>
#include <stdlib.h> /* getenv, exit */
@@ -60,16 +58,16 @@ static int _rpcpmstart; /* Started by a port monitor ? */
static int _rpcsvcstate = _IDLE; /* Set when a request is serviced */
static int _rpcsvccount = 0; /* Number of requests being serviced */
-extern md_mn_result_t *mdmn_send_svc_1();
-extern int *mdmn_work_svc_1();
-extern int *mdmn_wakeup_initiator_svc_1();
-extern int *mdmn_wakeup_master_svc_1();
-extern int *mdmn_comm_lock_svc_1();
-extern int *mdmn_comm_unlock_svc_1();
-extern int *mdmn_comm_suspend_svc_1();
-extern int *mdmn_comm_resume_svc_1();
-extern int *mdmn_comm_reinit_set_svc_1();
-extern int *mdmn_comm_msglock_svc_1();
+extern int mdmn_send_svc_2();
+extern int *mdmn_work_svc_2();
+extern int *mdmn_wakeup_initiator_svc_2();
+extern int *mdmn_wakeup_master_svc_2();
+extern int *mdmn_comm_lock_svc_2();
+extern int *mdmn_comm_unlock_svc_2();
+extern int *mdmn_comm_suspend_svc_2();
+extern int *mdmn_comm_resume_svc_2();
+extern int *mdmn_comm_reinit_set_svc_2();
+extern int *mdmn_comm_msglock_svc_2();
static void
@@ -107,7 +105,7 @@ closedown(void)
}
static void
-mdmn_commd_1(rqstp, transp)
+mdmn_commd_2(rqstp, transp)
struct svc_req *rqstp;
register SVCXPRT *transp;
{
@@ -124,7 +122,6 @@ mdmn_commd_1(rqstp, transp)
char *(*local)();
int free_result = 0;
-
_rpcsvccount++;
switch (rqstp->rq_proc) {
case NULLPROC:
@@ -132,6 +129,7 @@ mdmn_commd_1(rqstp, transp)
(char *)NULL);
_rpcsvccount--;
_rpcsvcstate = _SERVED;
+ svc_done(transp);
return;
case mdmn_send:
@@ -140,81 +138,94 @@ mdmn_commd_1(rqstp, transp)
(void) memset((char *)&argument, 0, sizeof (argument));
if (!svc_getargs(transp, _xdr_argument, (caddr_t)&argument)) {
svcerr_decode(transp);
+ svc_done(transp);
_rpcsvccount--;
_rpcsvcstate = _SERVED;
return;
}
/*
- * mdmn_send_1 will not always do a sendreply.
+ * mdmn_send_2 will not always do a sendreply.
* it will register in a table and let the mdmn_wakeup1
* do the sendreply for that call.
* in order to register properly we need the transp handle
+ * If we get a 0 back from mdmn_send_svc_2() we have no pending
+ * RPC in-flight, so we drop the service count.
*/
- (void) mdmn_send_svc_1((md_mn_msg_t *)&argument, rqstp);
+ if (mdmn_send_svc_2((md_mn_msg_t *)&argument, rqstp) == 0) {
+ _rpcsvccount--;
+ _rpcsvcstate = _SERVED;
+ svc_done(rqstp->rq_xprt);
+ }
- return; /* xdr_free is called by mdmn_wakeup_initiator_svc_1 */
+ return; /* xdr_free is called by mdmn_wakeup_initiator_svc_2 */
case mdmn_work:
_xdr_argument = xdr_md_mn_msg_t;
_xdr_result = xdr_int;
- local = (char *(*)()) mdmn_work_svc_1;
+ local = (char *(*)()) mdmn_work_svc_2;
free_result = 1;
break;
case mdmn_wakeup_master:
_xdr_argument = xdr_md_mn_result_t;
_xdr_result = xdr_int;
- local = (char *(*)()) mdmn_wakeup_master_svc_1;
+ local = (char *(*)()) mdmn_wakeup_master_svc_2;
free_result = 1;
break;
case mdmn_wakeup_initiator:
+ /*
+ * We must have had an in-flight RPC request to get here,
+ * so drop the in-flight count.
+ */
_xdr_argument = xdr_md_mn_result_t;
_xdr_result = xdr_int;
- local = (char *(*)()) mdmn_wakeup_initiator_svc_1;
+ local = (char *(*)()) mdmn_wakeup_initiator_svc_2;
free_result = 1;
+ _rpcsvccount--;
break;
case mdmn_comm_lock:
_xdr_argument = xdr_md_mn_set_and_class_t;
_xdr_result = xdr_int;
- local = (char *(*)()) mdmn_comm_lock_svc_1;
+ local = (char *(*)()) mdmn_comm_lock_svc_2;
break;
case mdmn_comm_unlock:
_xdr_argument = xdr_md_mn_set_and_class_t;
_xdr_result = xdr_int;
- local = (char *(*)()) mdmn_comm_unlock_svc_1;
+ local = (char *(*)()) mdmn_comm_unlock_svc_2;
break;
case mdmn_comm_suspend:
_xdr_argument = xdr_md_mn_set_and_class_t;
_xdr_result = xdr_int;
- local = (char *(*)()) mdmn_comm_suspend_svc_1;
+ local = (char *(*)()) mdmn_comm_suspend_svc_2;
break;
case mdmn_comm_resume:
_xdr_argument = xdr_md_mn_set_and_class_t;
_xdr_result = xdr_int;
- local = (char *(*)()) mdmn_comm_resume_svc_1;
+ local = (char *(*)()) mdmn_comm_resume_svc_2;
break;
case mdmn_comm_reinit_set:
_xdr_argument = xdr_u_int;
_xdr_result = xdr_int;
- local = (char *(*)()) mdmn_comm_reinit_set_svc_1;
+ local = (char *(*)()) mdmn_comm_reinit_set_svc_2;
break;
case mdmn_comm_msglock:
_xdr_argument = xdr_md_mn_type_and_lock_t;
_xdr_result = xdr_int;
- local = (char *(*)()) mdmn_comm_msglock_svc_1;
+ local = (char *(*)()) mdmn_comm_msglock_svc_2;
break;
default:
svcerr_noproc(transp);
_rpcsvccount--;
_rpcsvcstate = _SERVED;
+ svc_done(transp);
return;
}
(void) memset((char *)&argument, 0, sizeof (argument));
@@ -222,6 +233,7 @@ mdmn_commd_1(rqstp, transp)
svcerr_decode(transp);
_rpcsvccount--;
_rpcsvcstate = _SERVED;
+ svc_done(transp);
return;
}
result = (*local)(&argument, rqstp);
@@ -231,12 +243,15 @@ mdmn_commd_1(rqstp, transp)
}
if (!svc_freeargs(transp, _xdr_argument, (caddr_t)&argument)) {
_msgout(gettext("unable to free arguments"));
+ svc_done(transp);
exit(1);
}
if (free_result == 1) {
free(result);
}
+
+ svc_done(transp);
_rpcsvccount--;
_rpcsvcstate = _SERVED;
}
@@ -249,6 +264,7 @@ static void
exit_commd()
{
md_error_t ep = mdnullerror;
+ syslog(LOG_DAEMON | LOG_DEBUG, gettext("mdcommd exiting"));
(void) metaioctl(MD_MN_SET_COMMD_RUNNING, 0, &ep, "rpc.mdcommd");
}
@@ -259,10 +275,23 @@ main()
pid_t pid;
int i;
md_error_t ep = mdnullerror;
+ int mode = RPC_SVC_MT_USER;
(void) sigset(SIGPIPE, SIG_IGN);
/*
+ * Attempt to set MT_USER behaviour for mdcommd service routines.
+ * If this isn't done, there is a possibility that the transport
+ * handle might be freed before the thread created by mdmn_send_svc_2
+ * can use it. A consequence of this is that svc_done() must be
+ * called on the handle when it's no longer needed.
+ */
+ if (rpc_control(RPC_SVC_MTMODE_SET, &mode) == FALSE) {
+ _msgout(gettext("cannot set MT_USER mode for RPC service"));
+ exit(1);
+ }
+
+ /*
* If stdin looks like a TLI endpoint, we assume
* that we were started by a port monitor. If
* t_getstate fails with TBADF, this is not a
@@ -294,9 +323,9 @@ main()
}
if (nconf)
freenetconfigent(nconf);
- if (!svc_reg(transp, MDMN_COMMD, ONE, mdmn_commd_1, 0)) {
+ if (!svc_reg(transp, MDMN_COMMD, TWO, mdmn_commd_2, 0)) {
_msgout(gettext(
- "unable to register (MDMN_COMMD, ONE)."));
+ "unable to register (MDMN_COMMD, TWO)."));
exit(1);
}
@@ -307,7 +336,8 @@ main()
(void) alarm(_RPCSVC_CLOSEDOWN/2);
}
- (void) metaioctl(MD_MN_SET_COMMD_RUNNING, (void *)1, &ep,
+ pid = getpid();
+ (void) metaioctl(MD_MN_SET_COMMD_RUNNING, (void *)pid, &ep,
"rpc.mdcommd");
svc_run();
exit(1);
@@ -343,8 +373,8 @@ main()
openlog("mdmn_commd", LOG_PID, LOG_DAEMON);
#endif
}
- if (!svc_create(mdmn_commd_1, MDMN_COMMD, ONE, "tcp")) {
- _msgout(gettext("unable to create (MDMN_COMMD, ONE) for tcp."));
+ if (!svc_create(mdmn_commd_2, MDMN_COMMD, TWO, "tcp")) {
+ _msgout(gettext("unable to create (MDMN_COMMD, TWO) for tcp."));
exit(1);
}
diff --git a/usr/src/cmd/lvm/rpc.mdcommd/mdmn_subr.c b/usr/src/cmd/lvm/rpc.mdcommd/mdmn_subr.c
index 715c4a3307..e05022bf5b 100644
--- a/usr/src/cmd/lvm/rpc.mdcommd/mdmn_subr.c
+++ b/usr/src/cmd/lvm/rpc.mdcommd/mdmn_subr.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,13 +18,12 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
@@ -446,6 +444,7 @@ dump_msg(uint_t dbc, char *prefix, md_mn_msg_t *msg)
commd_debug(dbc, "%s sender = %d\n", prefix, msg->msg_sender);
commd_debug(dbc, "%s flags = 0x%x\n", prefix, msg->msg_flags);
commd_debug(dbc, "%s setno = %d\n", prefix, msg->msg_setno);
+ commd_debug(dbc, "%s recipient = %d\n", prefix, msg->msg_recipient);
commd_debug(dbc, "%s type = %d\n", prefix, msg->msg_type);
commd_debug(dbc, "%s size = %d\n", prefix, msg->msg_event_size);
if (msg->msg_event_size) {
@@ -513,9 +512,8 @@ mdmn_get_mce_by_msg(md_mn_msg_t *msg)
class = msg->msg_msgid.mid_oclass;
}
- mct_index = submsg +
- class * MAX_SUBMESSAGES +
- nodeid * MAX_SUBMESSAGES * MD_MN_NCLASSES;
+ mct_index = submsg + class * MAX_SUBMESSAGES +
+ nodeid * MAX_SUBMESSAGES * MD_MN_NCLASSES;
mct_offset = mct_index * sizeof (md_mn_mce_t);
@@ -694,12 +692,12 @@ mdmn_check_completion(md_mn_msg_t *msg, md_mn_result_t *result)
}
}
commd_debug(MD_MMV_MISC,
- "mdmn_check_completion: msg already processed \n");
+ "mdmn_check_completion: msg already processed \n");
dump_result(MD_MMV_MISC, "mdmn_check_completion", result);
return (MDMN_MCT_DONE);
}
commd_debug(MD_MMV_MISC,
- "mdmn_check_completion: msg not yet processed\n");
+ "mdmn_check_completion: msg not yet processed\n");
return (MDMN_MCT_NOT_DONE);
}
diff --git a/usr/src/cmd/lvm/util/metaclust.c b/usr/src/cmd/lvm/util/metaclust.c
index deeff350e2..cb98329aac 100644
--- a/usr/src/cmd/lvm/util/metaclust.c
+++ b/usr/src/cmd/lvm/util/metaclust.c
@@ -24,8 +24,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <meta.h>
#include <sdssc.h>
#include <signal.h>
@@ -117,6 +115,8 @@ static void
sigalarmhandler(int sig)
{
int i, n, ret, stat_loc = 0;
+ FILE *pgcore;
+ char corecmd[256];
n = sizeof (step_table) / sizeof (step_table[0]);
for (i = 0; i < n; i++) {
@@ -130,6 +130,25 @@ sigalarmhandler(int sig)
step_table[i].step_nam,
meta_print_hrtime(gethrtime() - start_time));
+	/*
+	 * See what the child was actually doing when the timeout expired.
+	 * A core-dump of this would be _really_ good, so let's just
+	 * try a 'gcore -g c_pid' and hope it succeeds.  The attempt is
+	 * best-effort: failure to start gcore is logged and otherwise ignored.
+	 */
+
+	(void) memset(corecmd, 0, sizeof (corecmd));
+	(void) snprintf(corecmd, sizeof (corecmd),
+	    "/bin/gcore -g %d >/dev/null 2>&1", (int)c_pid);
+
+	pgcore = popen(corecmd, "r");
+
+	if (pgcore == NULL) {
+		/*
+		 * c_pid is a process id, not a string: it must be formatted
+		 * with %d (as in the snprintf above), never %s.
+		 */
+		meta_mc_log(MC_LOG1,
+		    gettext("Could not grab core for pid %d"), (int)c_pid);
+	} else {
+		/* Wait for gcore to finish before the child is killed */
+		(void) pclose(pgcore);
+	}
+
+
if ((ret = kill(c_pid, SIGKILL)) == 0) {
/*
* The child will wait forever until the status is retrieved
@@ -1762,7 +1781,6 @@ main(int argc, char **argv)
"rpc.mdcommd for set %s\n"), sp->setname);
md_exit(local_sp, 1);
}
- meta_ping_mnset(setno);
/* Unblock mddb parse messages */
if (s_ownset(sp->setno, ep) == MD_SETOWNER_YES) {
diff --git a/usr/src/cmd/mdb/common/modules/md/dumpmirror.c b/usr/src/cmd/mdb/common/modules/md/dumpmirror.c
new file mode 100644
index 0000000000..53e70438b7
--- /dev/null
+++ b/usr/src/cmd/mdb/common/modules/md/dumpmirror.c
@@ -0,0 +1,230 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include "mdinclude.h"
+
+/*
+ * Emit one run of set bits covering bit positions 'first' through 'last'.
+ * A run of length one is printed as a single position, anything longer as
+ * <first>-<last>.  A leading comma is written when 'need_comma' is non-zero.
+ */
+static void
+print_mm_run(uint_t first, uint_t last, int need_comma)
+{
+	if (first != last) {
+		mdb_printf("%s%u-%u", (need_comma ? "," : ""), first, last);
+	} else {
+		mdb_printf("%s%u", (need_comma ? "," : ""), first);
+	}
+}
+
+/*
+ * Display an arbitrary bitmap by showing the set bits in the array.
+ * Output will be <start>-<end> for ranges or <position> for singleton bits,
+ * separated by commas and terminated by a newline.
+ *
+ * Input:
+ *	<bm>		bitmap to scan
+ *	<size>		length of bitmap (in bits)
+ *	<bm_name>	label printed in front of the list
+ */
+static void
+print_mm_bm(unsigned char *bm, uint_t size, char *bm_name)
+{
+	uint_t	i;
+	uint_t	first_set = 0;
+	int	in_run = 0;
+	int	need_comma = 0;
+
+	mdb_printf("%s set bits: ", bm_name);
+	for (i = 0; i < size; i++) {
+		if (isset(bm, i)) {
+			if (!in_run) {
+				first_set = i;
+				in_run = 1;
+			}
+		} else if (in_run) {
+			/* first clear bit after a run: the run ended at i-1 */
+			print_mm_run(first_set, i - 1, need_comma);
+			need_comma = 1;
+			in_run = 0;
+		}
+	}
+	/*
+	 * A run that extends to the final bit is still open here.  Print it
+	 * with the same rules as any other run so that a lone last bit shows
+	 * up as <position> rather than <position>-<position>.
+	 */
+	if (in_run) {
+		print_mm_run(first_set, size - 1, need_comma);
+	}
+	mdb_printf("\n");
+}
+
+/*
+ * Print the non-zero entries of an array of uchar_t sized counters as a
+ * comma separated list of (index,count) pairs, terminated by a newline.
+ * printmmbm() uses this for the un_pernode_dirty_sum[] array.
+ *
+ * Input:
+ *	<bm>		array of counters
+ *	<size>		number of entries in the array
+ *	<bm_name>	label printed in front of the list
+ */
+static void
+print_mm_cnt_c(unsigned char *bm, uint_t size, char *bm_name)
+{
+	const char	*sep = "";
+	int		idx = 0;
+
+	mdb_printf("%s set counts: ", bm_name);
+	while (idx < size) {
+		if (bm[idx] != 0) {
+			mdb_printf("%s(%d,%3d)", sep, idx, (uint_t)bm[idx]);
+			/* every pair after the first is comma separated */
+			sep = ",";
+		}
+		idx++;
+	}
+	mdb_printf("\n");
+}
+
+/*
+ * Print the non-zero entries of an array of ushort_t sized counters as a
+ * comma separated list of (index,count) pairs, terminated by a newline.
+ * printmmbm() uses this for the un_outstanding_writes[] array.
+ *
+ * Input:
+ *	<bm>		array of counters
+ *	<size>		number of entries in the array
+ *	<bm_name>	label printed in front of the list
+ */
+static void
+print_mm_cnt_w(unsigned short *bm, uint_t size, char *bm_name)
+{
+	const char	*sep = "";
+	int		idx = 0;
+
+	mdb_printf("%s set counts: ", bm_name);
+	while (idx < size) {
+		if (bm[idx] != 0) {
+			mdb_printf("%s(%d,%5d)", sep, idx, (uint_t)bm[idx]);
+			/* every pair after the first is comma separated */
+			sep = ",";
+		}
+		idx++;
+	}
+	mdb_printf("\n");
+}
+
+/*
+ * Print the associated bitmaps for the specified mm_unit_t
+ * These are:
+ *	un_pernode_dirty_bm
+ *	un_goingclean_bm
+ *	un_dirty_bm
+ *	un_goingdirty_bm
+ *	un_resync_bm
+ *
+ * Associated counts for unit:
+ *	un_pernode_dirty_sum[]		(uchar_t)
+ *	un_outstanding_writes[]		(ushort_t)
+ *
+ * The dcmd takes the address of an mm_unit_t and no arguments.
+ *
+ * Returns:
+ *	DCMD_USAGE	arguments were supplied
+ *	DCMD_ERR	no address was given, or a read from the target failed
+ *	DCMD_OK		everything was read and printed
+ */
+
+/* ARGSUSED */
+int
+printmmbm(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+	mm_unit_t	mm, *mmp;
+	unsigned char	*rr_dirty_bm, *rr_goingclean_bm, *rr_goingdirty_bm;
+	unsigned char	*rr_resync_bm;
+	uintptr_t	un_dbm, un_gcbm, un_gdbm, un_rrbm, un_pnds, un_ow;
+	uint_t		num_rr, rr_bitmap_size;
+	int		i;
+	uintptr_t	un_pernode_bm;
+	unsigned char	*rr_pernode_dirty, *rr_pnds;
+	unsigned short	*rr_ow;
+	/* just enough for un_pernode_dirty_bm[] plus three digits */
+	char		pernode_str[25];
+
+	if (argc != 0)
+		return (DCMD_USAGE);
+
+	if (!(flags & DCMD_ADDRSPEC)) {
+		/*
+		 * The trailing newline matters: without it mdb_warn() appends
+		 * the message for the current errno, which is unrelated here.
+		 */
+		mdb_warn("No mm_unit_t address specified\n");
+		return (DCMD_ERR);
+	}
+
+	if (mdb_vread(&mm, sizeof (mm_unit_t), addr) == -1) {
+		mdb_warn("failed to read mm_unit_t at %p\n", addr);
+		return (DCMD_ERR);
+	}
+
+	mmp = &mm;
+
+	num_rr = mm.un_rrd_num;
+
+	/* Target addresses of the arrays hanging off the unit structure */
+	un_dbm = (uintptr_t)mmp->un_dirty_bm;
+	un_gcbm = (uintptr_t)mmp->un_goingclean_bm;
+	un_gdbm = (uintptr_t)mmp->un_goingdirty_bm;
+	un_rrbm = (uintptr_t)mmp->un_resync_bm;
+	un_pnds = (uintptr_t)mmp->un_pernode_dirty_sum;
+	un_ow = (uintptr_t)mmp->un_outstanding_writes;
+
+	/*
+	 * The bitmaps hold one bit per region (num_rr bits in all); the count
+	 * arrays hold one entry per region.  All local copies are allocated
+	 * with UM_GC, so nothing is freed explicitly on the return paths.
+	 */
+	rr_bitmap_size = howmany(num_rr, NBBY);
+	rr_dirty_bm = (unsigned char *)mdb_alloc(rr_bitmap_size,
+	    UM_SLEEP|UM_GC);
+	rr_goingclean_bm = (unsigned char *)mdb_alloc(rr_bitmap_size,
+	    UM_SLEEP|UM_GC);
+	rr_goingdirty_bm = (unsigned char *)mdb_alloc(rr_bitmap_size,
+	    UM_SLEEP|UM_GC);
+	rr_resync_bm = (unsigned char *)mdb_alloc(rr_bitmap_size,
+	    UM_SLEEP|UM_GC);
+	rr_pnds = (unsigned char *)mdb_alloc(num_rr, UM_SLEEP|UM_GC);
+	rr_ow = (unsigned short *)mdb_alloc(num_rr * sizeof (unsigned short),
+	    UM_SLEEP|UM_GC);
+
+	if (mdb_vread(rr_dirty_bm, rr_bitmap_size, un_dbm) == -1) {
+		mdb_warn("failed to read un_dirty_bm at %p\n", un_dbm);
+		return (DCMD_ERR);
+	}
+	if (mdb_vread(rr_goingclean_bm, rr_bitmap_size, un_gcbm) == -1) {
+		mdb_warn("failed to read un_goingclean_bm at %p\n", un_gcbm);
+		return (DCMD_ERR);
+	}
+	if (mdb_vread(rr_goingdirty_bm, rr_bitmap_size, un_gdbm) == -1) {
+		mdb_warn("failed to read un_goingdirty_bm at %p\n", un_gdbm);
+		return (DCMD_ERR);
+	}
+	if (mdb_vread(rr_resync_bm, rr_bitmap_size, un_rrbm) == -1) {
+		mdb_warn("failed to read un_resync_bm at %p\n", un_rrbm);
+		return (DCMD_ERR);
+	}
+	if (mdb_vread(rr_pnds, num_rr, un_pnds) == -1) {
+		mdb_warn("failed to read un_pernode_dirty_sum at %p\n",
+		    un_pnds);
+		return (DCMD_ERR);
+	}
+	if (mdb_vread(rr_ow, num_rr * sizeof (unsigned short), un_ow) == -1) {
+		mdb_warn("failed to read un_outstanding_writes at %p\n", un_ow);
+		return (DCMD_ERR);
+	}
+
+	print_mm_bm(rr_dirty_bm, num_rr, "un_dirty_bm");
+	print_mm_bm(rr_goingclean_bm, num_rr, "un_goingclean_bm");
+	print_mm_bm(rr_goingdirty_bm, num_rr, "un_goingdirty_bm");
+	print_mm_bm(rr_resync_bm, num_rr, "un_resync_bm");
+
+	/*
+	 * Load all the un_pernode_dirty_bm[] entries and iterate through the
+	 * non-NULL entries.  One scratch buffer is reused for every entry.
+	 */
+	rr_pernode_dirty = (unsigned char *)mdb_alloc(rr_bitmap_size,
+	    UM_SLEEP|UM_GC);
+
+	/*
+	 * NOTE(review): 128 presumably matches the declared length of
+	 * un_pernode_dirty_bm[] in mm_unit_t -- confirm against md_mirror.h
+	 * and use the named constant if there is one.
+	 */
+	for (i = 0; i < 128; i++) {
+		un_pernode_bm = (uintptr_t)mmp->un_pernode_dirty_bm[i];
+		if (un_pernode_bm) {
+			mdb_snprintf(pernode_str, sizeof (pernode_str),
+			    "un_pernode_dirty_bm[%d]", i);
+			if (mdb_vread(rr_pernode_dirty, rr_bitmap_size,
+			    un_pernode_bm) == -1) {
+				mdb_warn("failed to read %s at %p\n",
+				    pernode_str, un_pernode_bm);
+				return (DCMD_ERR);
+			}
+			print_mm_bm(rr_pernode_dirty, num_rr, pernode_str);
+		}
+	}
+	print_mm_cnt_c(rr_pnds, num_rr, "un_pernode_dirty_sum");
+
+	print_mm_cnt_w(rr_ow, num_rr, "un_outstanding_writes");
+
+	return (DCMD_OK);
+}
diff --git a/usr/src/cmd/mdb/common/modules/md/md.c b/usr/src/cmd/mdb/common/modules/md/md.c
index b23804c10b..371dfa14f3 100644
--- a/usr/src/cmd/mdb/common/modules/md/md.c
+++ b/usr/src/cmd/mdb/common/modules/md/md.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/mdb_modapi.h>
@@ -37,6 +34,7 @@ extern int set_io(uintptr_t, uint_t, int, const mdb_arg_t *);
extern int dumpnamespace(uintptr_t, uint_t, int, const mdb_arg_t *);
extern int dumpsetaddr(uintptr_t, uint_t, int, const mdb_arg_t *);
extern int dumphotspare(uintptr_t, uint_t, int, const mdb_arg_t *);
+extern int printmmbm(uintptr_t, uint_t, int, const mdb_arg_t *);
extern void set_io_help();
/* from mdbgen */
@@ -79,6 +77,8 @@ const mdb_dcmd_t dcmds[] = {
dumpsetaddr },
{ "simple_de_ic", NULL, "simple mddb_de_ic_t",
simple_de_ic },
+ { "printmmbm", NULL, "print bitmaps for given mm_unit_t",
+ printmmbm },
{ NULL }
};
diff --git a/usr/src/cmd/mdb/common/modules/md/metastat.c b/usr/src/cmd/mdb/common/modules/md/metastat.c
index cf01c779f0..d3f27ec233 100644
--- a/usr/src/cmd/mdb/common/modules/md/metastat.c
+++ b/usr/src/cmd/mdb/common/modules/md/metastat.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,13 +18,12 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include "mdinclude.h"
typedef struct submirror_cb {
@@ -117,16 +115,84 @@ print_submirror(uintptr_t addr, void *arg, submirror_cb_t *data)
return (WALK_NEXT);
}
+/*
+ * Account for one completed run of 'run_len' identical bits: store it in the
+ * next comp_bm[] slot (when one is available), advance the run index, update
+ * the relevant maximum and print the count followed by a '.'.
+ *
+ * comp_bm[] is only known to hold 'size' entries, while a bitmap of 'size'
+ * bits can produce up to size + 1 runs (a leading zero-length cleared run
+ * plus one run per bit), hence the bounds check on the store.
+ */
+static void
+comp_bm_emit(ushort_t *comp_bm, uint_t size, uint_t *idxp, uint_t run_len,
+    uint_t *maxp)
+{
+	if (*idxp < size)
+		comp_bm[*idxp] = (ushort_t)run_len;
+	(*idxp)++;
+	if (run_len > *maxp)
+		*maxp = run_len;
+	mdb_printf("%u.", run_len);
+}
+
+/*
+ * Construct an RLE count for the given 'bm' and print it.
+ * RLE is Run Length Encoding, a method for compactly describing a bitmap
+ * as a series of numbers indicating the count of consecutive set or cleared
+ * bits.
+ *
+ * The scan starts out counting cleared bits, so the output has the form
+ * <cleared>.<set>.<cleared>.<set>...  and begins with 0 when the first bit
+ * of the bitmap is set.  The final run is included.  The run counts are
+ * followed by the total number of set bits, the number of runs and the
+ * longest set and cleared runs.
+ *
+ * Input:
+ *	<bm>		bitmap to scan
+ *	<size>		length of bitmap (in bits)
+ *	<comp_bm>	RLE count array to be updated (at least <size> entries;
+ *			counts are truncated to ushort_t when stored)
+ *	<opstr>		Descriptive text for bitmap RLE count display
+ */
+static void
+print_comp_bm(unsigned char *bm, uint_t size, ushort_t *comp_bm, char *opstr)
+{
+	uint_t	i, run_len, cur_idx, tot_dirty;
+	uint_t	max_set_cnt, max_reset_cnt;
+	int	run_is_set, bit_is_set;
+
+	run_is_set = 0;		/* the first run counts cleared bits */
+	run_len = 0;
+	cur_idx = 0;
+	tot_dirty = 0;
+	max_set_cnt = max_reset_cnt = 0;
+	for (i = 0; i < size; i++) {
+		bit_is_set = (isset(bm, i) != 0);
+		if (bit_is_set != run_is_set) {
+			/* state change: flush the run that just ended */
+			comp_bm_emit(comp_bm, size, &cur_idx, run_len,
+			    (run_is_set ? &max_set_cnt : &max_reset_cnt));
+			run_is_set = bit_is_set;
+			run_len = 0;
+		}
+		run_len++;
+		if (bit_is_set)
+			tot_dirty++;
+	}
+	/*
+	 * Flush the run that reaches the end of the bitmap; without this the
+	 * last run is neither printed nor reflected in the maximum counts.
+	 */
+	if (run_len != 0) {
+		comp_bm_emit(comp_bm, size, &cur_idx, run_len,
+		    (run_is_set ? &max_set_cnt : &max_reset_cnt));
+	}
+
+	/* the counters are uint_t, so they are printed with %u (not %lu) */
+	mdb_printf("\nTotal %s bits = %u\n", opstr, tot_dirty);
+	mdb_printf("Total %s transactions = %u\n", opstr, cur_idx);
+	mdb_printf("Maximum %s set count = %u, reset count = %u\n", opstr,
+	    max_set_cnt, max_reset_cnt);
+}
+
+
void
print_mirror(void *un_addr, void *mdcptr, uint_t verbose)
{
- mm_unit_t mm;
+ mm_unit_t mm, *mmp;
void **ptr;
int setno = 0;
minor_t un_self_id;
diskaddr_t un_total_blocks;
ushort_t mm_un_nsm;
submirror_cb_t data;
+ uint_t num_rr, rr_blksize;
+ ushort_t *comp_rr;
+ unsigned char *rr_dirty_bm, *rr_goingclean_bm;
+ uintptr_t un_dbm, un_gcbm;
/* read in the device */
if (mdb_vread(&mm, sizeof (mm_unit_t),
@@ -134,6 +200,9 @@ print_mirror(void *un_addr, void *mdcptr, uint_t verbose)
mdb_warn("failed to read mm_unit_t at %p\n", un_addr);
return;
}
+
+ mmp = &mm;
+
un_self_id = ((mdc_unit_t *)mdcptr)->un_self_id;
un_total_blocks = ((mdc_unit_t *)mdcptr)->un_total_blocks;
mm_un_nsm = mm.un_nsm;
@@ -148,6 +217,39 @@ print_mirror(void *un_addr, void *mdcptr, uint_t verbose)
}
mdb_inc_indent(2);
mdb_printf("Size: %llu blocks\n", un_total_blocks);
+
+ /*
+ * Dump out the current un_dirty_bm together with its size
+ * Also, attempt to Run Length encode the bitmap to see if this
+ * is a viable option
+ */
+ num_rr = mm.un_rrd_num;
+ rr_blksize = mm.un_rrd_blksize;
+
+ un_dbm = (uintptr_t)mmp->un_dirty_bm;
+ un_gcbm = (uintptr_t)mmp->un_goingclean_bm;
+
+ mdb_printf("RR size: %lu bits\n", num_rr);
+ mdb_printf("RR block size: %lu blocks\n", rr_blksize);
+
+ rr_dirty_bm = (unsigned char *)mdb_alloc(num_rr, UM_SLEEP|UM_GC);
+ rr_goingclean_bm = (unsigned char *)mdb_alloc(num_rr, UM_SLEEP|UM_GC);
+ comp_rr = (ushort_t *)mdb_alloc(num_rr * sizeof (ushort_t),
+ UM_SLEEP|UM_GC);
+
+ if (mdb_vread(rr_dirty_bm, num_rr, un_dbm) == -1) {
+ mdb_warn("failed to read un_dirty_bm at %p\n", un_dbm);
+ return;
+ }
+ if (mdb_vread(rr_goingclean_bm, num_rr, un_gcbm) == -1) {
+ mdb_warn("failed to read un_goingclean_bm at %p\n", un_gcbm);
+ return;
+ }
+
+ print_comp_bm(rr_dirty_bm, num_rr, comp_rr, "dirty");
+
+ print_comp_bm(rr_goingclean_bm, num_rr, comp_rr, "clean");
+
/*
* find the sub mirrors, search through each metadevice looking
* at the un_parent.
diff --git a/usr/src/cmd/mdb/intel/amd64/md/Makefile b/usr/src/cmd/mdb/intel/amd64/md/Makefile
index db8020f91c..879bc72856 100644
--- a/usr/src/cmd/mdb/intel/amd64/md/Makefile
+++ b/usr/src/cmd/mdb/intel/amd64/md/Makefile
@@ -2,9 +2,8 @@
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,15 @@
# CDDL HEADER END
#
#
-# Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#ident "%Z%%M% %I% %E% SMI"
MODULE = md.so
MDBTGT = kvm
MODSRCS = dumphotspare.c \
+ dumpmirror.c \
dumpnamespace.c \
findset.c \
md.c \
diff --git a/usr/src/cmd/mdb/intel/ia32/md/Makefile b/usr/src/cmd/mdb/intel/ia32/md/Makefile
index 1ae436a304..8531855e44 100644
--- a/usr/src/cmd/mdb/intel/ia32/md/Makefile
+++ b/usr/src/cmd/mdb/intel/ia32/md/Makefile
@@ -2,9 +2,8 @@
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,15 @@
# CDDL HEADER END
#
#
-# Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#ident "%Z%%M% %I% %E% SMI"
MODULE = md.so
MDBTGT = kvm
MODSRCS = dumphotspare.c \
+ dumpmirror.c \
dumpnamespace.c \
findset.c \
md.c \
diff --git a/usr/src/cmd/mdb/sparc/v9/md/Makefile b/usr/src/cmd/mdb/sparc/v9/md/Makefile
index d88a16960a..d0ad7e3906 100644
--- a/usr/src/cmd/mdb/sparc/v9/md/Makefile
+++ b/usr/src/cmd/mdb/sparc/v9/md/Makefile
@@ -2,9 +2,8 @@
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,15 @@
# CDDL HEADER END
#
#
-# Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#pragma ident "%Z%%M% %I% %E% SMI"
MODULE = md.so
MDBTGT = kvm
MODSRCS = dumphotspare.c \
+ dumpmirror.c \
dumpnamespace.c \
findset.c \
md.c \
diff --git a/usr/src/head/meta.h b/usr/src/head/meta.h
index 3b6fe350da..3a29c840c5 100644
--- a/usr/src/head/meta.h
+++ b/usr/src/head/meta.h
@@ -1844,10 +1844,12 @@ extern int meta_update_devtree(minor_t mnum);
/* meta_mn_comm.c */
extern int mdmn_send_message(set_t setno, md_mn_msgtype_t type,
- uint_t flags, char *data, int size,
- md_mn_result_t **resp, md_error_t *ep);
+ uint_t flags, md_mn_nodeid_t recipient,
+ char *data, int size, md_mn_result_t **resp,
+ md_error_t *ep);
extern int mdmn_send_message_with_msgid(set_t setno,
- md_mn_msgtype_t type, uint_t flags, char *data,
+ md_mn_msgtype_t type, uint_t flags,
+ md_mn_nodeid_t recipient, char *data,
int size, md_mn_result_t **resp,
md_mn_msgid_t *msgid, md_error_t *ep);
extern int mdmn_create_msgid(md_mn_msgid_t *id);
@@ -1931,11 +1933,11 @@ extern int clnt_imp_adddrvs(char *hostname,
md_timeval32_t timestamp,
ulong_t genid, md_error_t *ep);
-/* Flags for direction in copy_msg_1 */
+/* Flags for direction in copy_msg_2 */
#define MD_MN_COPY_TO_ONDISK 0x0001
#define MD_MN_COPY_TO_INCORE 0x0002
-extern void copy_msg_1(md_mn_msg_t *incorep,
+extern void copy_msg_2(md_mn_msg_t *incorep,
md_mn_msg_od_t *ondiskp, int direction);
extern void free_msg(md_mn_msg_t *msg);
diff --git a/usr/src/lib/lvm/libmeta/common/mapfile-vers b/usr/src/lib/lvm/libmeta/common/mapfile-vers
index 960a5fe5a4..4cbee994c8 100644
--- a/usr/src/lib/lvm/libmeta/common/mapfile-vers
+++ b/usr/src/lib/lvm/libmeta/common/mapfile-vers
@@ -22,8 +22,6 @@
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
-#
SUNWprivate_1.1 {
global:
@@ -92,7 +90,7 @@ SUNWprivate_1.1 {
commitset;
comp_state_to_name;
copy_msg;
- copy_msg_1;
+ copy_msg_2;
copy_result;
crcfreetab;
crcfunc;
@@ -160,12 +158,12 @@ SUNWprivate_1.1 {
md_med_pmap_timeout;
mdmn_abort;
mdmn_allocate_changelog;
- mdmn_comm_lock_1;
- mdmn_comm_msglock_1;
- mdmn_comm_reinit_set_1;
- mdmn_comm_resume_1;
- mdmn_comm_suspend_1;
- mdmn_comm_unlock_1;
+ mdmn_comm_lock_2;
+ mdmn_comm_msglock_2;
+ mdmn_comm_reinit_set_2;
+ mdmn_comm_resume_2;
+ mdmn_comm_suspend_2;
+ mdmn_comm_unlock_2;
mdmn_create_msgid;
mdmn_get_changelogrec;
mdmn_get_handler;
@@ -177,14 +175,14 @@ SUNWprivate_1.1 {
mdmn_reinit_set;
mdmn_reset_changelog;
mdmn_resume;
- mdmn_send_1;
+ mdmn_send_2;
mdmn_send_message;
mdmn_snarf_changelog;
mdmn_suspend;
mdmn_unlog_msg;
- mdmn_wakeup_initiator_1;
- mdmn_wakeup_master_1;
- mdmn_work_1;
+ mdmn_wakeup_initiator_2;
+ mdmn_wakeup_master_2;
+ mdmn_work_2;
mdnullerror;
md_perror;
md_post_sig;
diff --git a/usr/src/lib/lvm/libmeta/common/meta_db.c b/usr/src/lib/lvm/libmeta/common/meta_db.c
index c79cfca3be..e3410773f5 100644
--- a/usr/src/lib/lvm/libmeta/common/meta_db.c
+++ b/usr/src/lib/lvm/libmeta/common/meta_db.c
@@ -18,13 +18,12 @@
*
* CDDL HEADER END
*/
+
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Just in case we're not in a build environment, make sure that
* TEXT_DOMAIN gets set to something.
@@ -928,7 +927,7 @@ meta_db_addsidenms(
*/
send_rval = mdmn_send_message(sp->setno,
MD_MN_MSG_META_DB_NEWSIDE, MD_MSGF_FAIL_ON_SUSPEND |
- MD_MSGF_PANIC_WHEN_INCONSISTENT, (char *)&db_ns,
+ MD_MSGF_PANIC_WHEN_INCONSISTENT, 0, (char *)&db_ns,
sizeof (md_mn_msg_meta_db_newside_t),
&resultp, ep);
if (send_rval != 0) {
@@ -1048,7 +1047,7 @@ meta_db_delsidenm(
*/
send_rval = mdmn_send_message(sp->setno,
MD_MN_MSG_META_DB_DELSIDE, MD_MSGF_FAIL_ON_SUSPEND |
- MD_MSGF_PANIC_WHEN_INCONSISTENT, (char *)&db_ds,
+ MD_MSGF_PANIC_WHEN_INCONSISTENT, 0, (char *)&db_ds,
sizeof (md_mn_msg_meta_db_delside_t), &resultp, ep);
if (send_rval != 0) {
if (resultp == NULL)
@@ -1542,7 +1541,7 @@ meta_db_attach(
flags |= MD_MSGF_NO_LOG;
send_rval = mdmn_send_message(sp->setno,
MD_MN_MSG_META_DB_ATTACH,
- flags, (char *)&attach,
+ flags, 0, (char *)&attach,
sizeof (md_mn_msg_meta_db_attach_t),
&resultp, ep);
if (send_rval != 0) {
@@ -2007,7 +2006,7 @@ meta_db_detach(
flags |= MD_MSGF_NO_LOG;
send_rval = mdmn_send_message(sp->setno,
MD_MN_MSG_META_DB_DETACH,
- flags, (char *)&detach,
+ flags, 0, (char *)&detach,
sizeof (md_mn_msg_meta_db_detach_t),
&resultp, ep);
if (send_rval != 0) {
diff --git a/usr/src/lib/lvm/libmeta/common/meta_mn_changelog.c b/usr/src/lib/lvm/libmeta/common/meta_mn_changelog.c
index 05a5ea9df5..388a5d9de7 100644
--- a/usr/src/lib/lvm/libmeta/common/meta_mn_changelog.c
+++ b/usr/src/lib/lvm/libmeta/common/meta_mn_changelog.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,16 +18,14 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <stdlib.h>
#include <unistd.h>
-
#include <wait.h>
#include <sys/time.h>
#include <meta.h>
@@ -131,7 +128,7 @@ copy_changelog(mdmn_changelog_record_t *incp,
odp->lr_class = incp->lr_class;
odp->lr_msglen = incp->lr_msglen;
if (incp->lr_msglen)
- copy_msg_1(&incp->lr_msg, &odp->lr_od_msg, direction);
+ copy_msg_2(&incp->lr_msg, &odp->lr_od_msg, direction);
} else {
incp->lr_revision = odp->lr_revision;
incp->lr_flags = odp->lr_flags;
@@ -139,7 +136,7 @@ copy_changelog(mdmn_changelog_record_t *incp,
incp->lr_class = odp->lr_class;
incp->lr_msglen = odp->lr_msglen;
if (odp->lr_msglen)
- copy_msg_1(&incp->lr_msg, &odp->lr_od_msg, direction);
+ copy_msg_2(&incp->lr_msg, &odp->lr_od_msg, direction);
}
}
@@ -196,7 +193,7 @@ mdmn_allocate_changelog(mdsetname_t *sp, md_error_t *ep)
(void) mdstealerror(ep, &req.ur_mde);
#ifdef DEBUG
syslog(LOG_DEBUG, "allocate_log: %s\n",
- mde_sperror(ep, ""));
+ mde_sperror(ep, ""));
#endif
Free(mdmn_changelog[setno]);
return (-1);
@@ -389,13 +386,14 @@ mdmn_unlog_msg(md_mn_msg_t *msg)
assert(lr != NULL);
if (!MSGID_CMP(&(msg->msg_msgid), &(lr->lr_msg.msg_msgid))) {
syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
- "unlog_msg: msgid mismatch\n"
- "\t\tstored: ID = (%d, 0x%llx-%d) setno %d class %d type %d\n"
- "\t\tattempting to unlog:\n"
- "\t\tID = (%d, 0x%llx-%d) setno %d class %d type %d.\n"),
- MSGID_ELEMS(lr->lr_msg.msg_msgid), lr->lr_setno,
- lr->lr_class, lr->lr_msgtype, MSGID_ELEMS(msg->msg_msgid),
- msg->msg_setno, class, msg->msg_type);
+ "unlog_msg: msgid mismatch\n"
+ "\t\tstored: ID = (%d, 0x%llx-%d) setno %d "
+ "class %d type %d\n"
+ "\t\tattempting to unlog:\n"
+ "\t\tID = (%d, 0x%llx-%d) setno %d class %d type %d.\n"),
+ MSGID_ELEMS(lr->lr_msg.msg_msgid), lr->lr_setno,
+ lr->lr_class, lr->lr_msgtype, MSGID_ELEMS(msg->msg_msgid),
+ msg->msg_setno, class, msg->msg_type);
return (-1);
}
lr->lr_msglen = 0;
@@ -462,10 +460,10 @@ mdmn_commitlog(md_set_desc *sd, md_error_t *ep)
if (!(MD_MNSET_DESC(sd)) || !sd->sd_mn_am_i_master) {
if (!(MD_MNSET_DESC(sd))) {
syslog(LOG_DAEMON | LOG_ERR, dgettext(TEXT_DOMAIN,
- "mdmn_commitlog - Not MN Set\n"));
+ "mdmn_commitlog - Not MN Set\n"));
} else {
syslog(LOG_DAEMON | LOG_ERR, dgettext(TEXT_DOMAIN,
- "mdmn_commit_log - Not Master\n"));
+ "mdmn_commit_log - Not Master\n"));
}
return (-1);
}
@@ -485,7 +483,7 @@ mdmn_commitlog(md_set_desc *sd, md_error_t *ep)
req.ur_size = MDMN_LOGRECSIZE_OD;
req.ur_data = (uintptr_t)&clodrec;
if ((retval = metaioctl(MD_MN_DB_USERREQ, &req, &req.ur_mde,
- NULL)) != 0) {
+ NULL)) != 0) {
(void) mdstealerror(ep, &req.ur_mde);
#ifdef DEBUG
syslog(LOG_DAEMON|LOG_DEBUG,
@@ -501,16 +499,16 @@ mdmn_commitlog(md_set_desc *sd, md_error_t *ep)
recs[lrc] = 0;
/* Commit to mddb on disk */
METAD_SETUP_LR(MD_DB_COMMIT_MANY, setno,
- mdmn_changelog[setno][0].lr_selfid);
+ mdmn_changelog[setno][0].lr_selfid);
req.ur_size = size;
req.ur_data = (uintptr_t)recs;
if ((retval = metaioctl(MD_MN_DB_USERREQ, &req,
- &req.ur_mde, NULL)) != 0) {
+ &req.ur_mde, NULL)) != 0) {
(void) mdstealerror(ep, &req.ur_mde);
#ifdef DEBUG
syslog(LOG_DAEMON|LOG_DEBUG,
- "mdmn_commitlog - metaioctl COMMIT_MANY"
- "Failure\n%s", mde_sperror(ep, ""));
+ "mdmn_commitlog - metaioctl COMMIT_MANY"
+ "Failure\n%s", mde_sperror(ep, ""));
#endif
}
}
@@ -609,7 +607,7 @@ mdmn_snarf_changelog(set_t set, md_error_t *ep)
}
lr = (mdmn_changelog_record_od_t *)get_ur_rec(set, MD_UR_GET_NEXT,
- MDDB_UR_LR, &id, ep);
+ MDDB_UR_LR, &id, ep);
if (lr == NULL)
return (0);
@@ -618,7 +616,7 @@ mdmn_snarf_changelog(set_t set, md_error_t *ep)
if (mdmn_changelog[set] == NULL) {
/* Allocate incore state for the log */
mdmn_changelog[set] = Zalloc(MDMN_LOGHDR_SIZE *
- mdmn_logrecs);
+ mdmn_logrecs);
}
do {
diff --git a/usr/src/lib/lvm/libmeta/common/meta_mn_comm.c b/usr/src/lib/lvm/libmeta/common/meta_mn_comm.c
index e005e83348..2feae7301a 100644
--- a/usr/src/lib/lvm/libmeta/common/meta_mn_comm.c
+++ b/usr/src/lib/lvm/libmeta/common/meta_mn_comm.c
@@ -20,12 +20,10 @@
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <stdlib.h>
#include <unistd.h>
#include <wait.h>
@@ -72,181 +70,264 @@ mdmn_get_timeout(md_mn_msgtype_t msgtype)
void
ldump_msg(char *prefix, md_mn_msg_t *msg)
{
/*
 * Debug aid: write the header fields of msg to stderr, one line per field,
 * each line starting with prefix.  Only the size of the event data is
 * printed, not the data itself.
 * NOTE(review): the (uint_t) cast of msg drops the upper bits wherever a
 * pointer is wider than uint_t -- confirm this is only ever built ILP32.
 */
- (void) fprintf(stderr, "%s &msg = 0x%x\n", prefix, (uint_t)msg);
- (void) fprintf(stderr, "%s ID = (%d, 0x%llx-%d)\n", prefix,
+ (void) fprintf(stderr, "%s &msg = 0x%x\n", prefix, (uint_t)msg);
+ (void) fprintf(stderr, "%s ID = (%d, 0x%llx-%d)\n", prefix,
MSGID_ELEMS(msg->msg_msgid));
- (void) fprintf(stderr, "%s sender = %d\n", prefix, msg->msg_sender);
- (void) fprintf(stderr, "%s flags = 0x%x\n", prefix, msg->msg_flags);
- (void) fprintf(stderr, "%s setno = %d\n", prefix, msg->msg_setno);
- (void) fprintf(stderr, "%s type = %d\n", prefix, msg->msg_type);
- (void) fprintf(stderr, "%s size = %d\n", prefix, msg->msg_event_size);
+ (void) fprintf(stderr, "%s sender = %d\n", prefix, msg->msg_sender);
+ (void) fprintf(stderr, "%s flags = 0x%x\n",
+ prefix, msg->msg_flags);
+ (void) fprintf(stderr, "%s setno = %d\n", prefix, msg->msg_setno);
+ (void) fprintf(stderr, "%s recipient = %d\n",
+ prefix, msg->msg_recipient);
+ (void) fprintf(stderr, "%s type = %d\n", prefix, msg->msg_type);
+ (void) fprintf(stderr, "%s size = %d\n",
+ prefix, msg->msg_event_size);
}
+#define COMMD_PROGNAME "rpc.mdcommd"
+
+extern uint_t meta_rpc_err_mask(void);
+
+/*
+ * mdmn_handle_RPC_error()
+ *
+ * If a clnt_call gets an RPC error, force the message out here with details.
+ * This would be nice to send to commd_debug(), but we can't call rpc.mdcommd
+ * code from libmeta.
+ *
+ *	clnt	- client handle the failed call was made on
+ *	ident	- name of the calling stub, used as the message prefix
+ *	nid	- node the call was directed at; 0 is reported as "(local)"
+ *
+ * The error goes to syslog only if the bit for its clnt_stat value is set
+ * in meta_rpc_err_mask().  A status value too large to have a bit in the
+ * mask cannot be filtered, so it is always logged.
+ */
+static void
+mdmn_handle_RPC_error(CLIENT *clnt, char *ident, md_mn_nodeid_t nid)
+{
+	/*
+	 * This is sized for a max message which would look like this:
+	 * "mdmn_wakeup_initiator: rpc.mdcommd node 4294967295"
+	 */
+	char		errstr[51];
+	struct rpc_err	e;
+	uint_t		status;
+
+	CLNT_GETERR(clnt, &e);
+	status = (uint_t)e.re_status;
+
+	/*
+	 * Shifting by the width of the type (or more) is undefined, so only
+	 * consult the mask when the status actually has a bit in it.  The
+	 * shift is done on an unsigned 1 so that bit 31 is usable as well.
+	 */
+	if (status < (sizeof (uint_t) * 8) &&
+	    (meta_rpc_err_mask() & (1U << status)) == 0) {
+		return;
+	}
+
+	if (nid == 0) {
+		(void) snprintf(errstr, sizeof (errstr),
+		    "%s: %s node (local)", ident, COMMD_PROGNAME);
+	} else {
+		/* Printed unsigned, as the buffer sizing above assumes. */
+		(void) snprintf(errstr, sizeof (errstr),
+		    "%s: %s node %u", ident, COMMD_PROGNAME, (uint_t)nid);
+	}
+	syslog(LOG_WARNING, "mdmn_handle_RPC_error: %s",
+	    clnt_sperror(clnt, errstr));
+}
/* Default timeout can be changed using clnt_control() */
static struct timeval TIMEOUT = { 25, 0 };
/*
 * Client stub for the mdmn_send RPC: passes *argp to the daemon behind clnt,
 * using TIMEOUT as the call timeout.  Returns the Zalloc()ed result on
 * RPC_SUCCESS.  On any other status the failure is reported through
 * mdmn_handle_RPC_error() (nid is only passed on to that report), the
 * result buffer is freed and NULL is returned.
 */
md_mn_result_t *
-mdmn_send_1(argp, clnt)
+mdmn_send_2(argp, clnt, nid)
md_mn_msg_t *argp;
CLIENT *clnt;
+ md_mn_nodeid_t nid;
{
+ enum clnt_stat res;
md_mn_result_t *clnt_res = Zalloc(sizeof (md_mn_result_t));
- if (clnt_call(clnt, mdmn_send,
+ res = clnt_call(clnt, mdmn_send,
(xdrproc_t)xdr_md_mn_msg_t, (caddr_t)argp,
- (xdrproc_t)xdr_md_mn_result_t, (caddr_t)clnt_res,
- TIMEOUT) != RPC_SUCCESS) {
- return (NULL);
+ (xdrproc_t)xdr_md_mn_result_t, (caddr_t)clnt_res, TIMEOUT);
+
+ if (res == RPC_SUCCESS) {
+ return (clnt_res);
}
- return (clnt_res);
+ mdmn_handle_RPC_error(clnt, "mdmn_send", nid);
+ Free(clnt_res);
+ return (NULL);
}
/*
 * Client stub for the mdmn_work RPC: passes *argp to the daemon behind clnt,
 * using TIMEOUT as the call timeout.  Returns the Zalloc()ed int reply on
 * RPC_SUCCESS.  On any other status the failure is reported through
 * mdmn_handle_RPC_error() (nid is only passed on to that report), the
 * reply buffer is freed and NULL is returned.
 */
int *
-mdmn_work_1(argp, clnt)
+mdmn_work_2(argp, clnt, nid)
md_mn_msg_t *argp;
CLIENT *clnt;
+ md_mn_nodeid_t nid;
{
+ enum clnt_stat res;
int *clnt_res = Zalloc(sizeof (int));
- if (clnt_call(clnt, mdmn_work,
+ res = clnt_call(clnt, mdmn_work,
(xdrproc_t)xdr_md_mn_msg_t, (caddr_t)argp,
- (xdrproc_t)xdr_int, (caddr_t)clnt_res,
- TIMEOUT) != RPC_SUCCESS) {
- Free(clnt_res);
- return (NULL);
+ (xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT);
+
+ if (res == RPC_SUCCESS) {
+ return (clnt_res);
}
- return (clnt_res);
+ mdmn_handle_RPC_error(clnt, "mdmn_work", nid);
+ Free(clnt_res);
+ return (NULL);
}
/*
 * Client stub for the mdmn_wakeup_initiator RPC: passes the result *argp to
 * the daemon behind clnt, using TIMEOUT as the call timeout.  Returns the
 * Zalloc()ed int reply on RPC_SUCCESS.  On any other status the failure is
 * reported through mdmn_handle_RPC_error() (nid is only passed on to that
 * report), the reply buffer is freed and NULL is returned.
 */
int *
-mdmn_wakeup_initiator_1(argp, clnt)
+mdmn_wakeup_initiator_2(argp, clnt, nid)
md_mn_result_t *argp;
CLIENT *clnt;
+ md_mn_nodeid_t nid;
{
+ enum clnt_stat res;
int *clnt_res = Zalloc(sizeof (int));
- if (clnt_call(clnt, mdmn_wakeup_initiator,
+ res = clnt_call(clnt, mdmn_wakeup_initiator,
(xdrproc_t)xdr_md_mn_result_t, (caddr_t)argp,
- (xdrproc_t)xdr_int, (caddr_t)clnt_res,
- TIMEOUT) != RPC_SUCCESS) {
- Free(clnt_res);
- return (NULL);
+ (xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT);
+
+ if (res == RPC_SUCCESS) {
+ return (clnt_res);
}
- return (clnt_res);
+ mdmn_handle_RPC_error(clnt, "mdmn_wakeup_initiator", nid);
+ Free(clnt_res);
+ return (NULL);
}
/*
 * Client stub for the mdmn_wakeup_master RPC: passes the result *argp to
 * the daemon behind clnt, using TIMEOUT as the call timeout.  Returns the
 * Zalloc()ed int reply on RPC_SUCCESS.  On any other status the failure is
 * reported through mdmn_handle_RPC_error() (nid is only passed on to that
 * report), the reply buffer is freed and NULL is returned.
 */
int *
-mdmn_wakeup_master_1(argp, clnt)
+mdmn_wakeup_master_2(argp, clnt, nid)
md_mn_result_t *argp;
CLIENT *clnt;
+ md_mn_nodeid_t nid;
{
+ enum clnt_stat res;
int *clnt_res = Zalloc(sizeof (int));
- if (clnt_call(clnt, mdmn_wakeup_master,
+ res = clnt_call(clnt, mdmn_wakeup_master,
(xdrproc_t)xdr_md_mn_result_t, (caddr_t)argp,
- (xdrproc_t)xdr_int, (caddr_t)clnt_res,
- TIMEOUT) != RPC_SUCCESS) {
- Free(clnt_res);
- return (NULL);
+ (xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT);
+
+ if (res == RPC_SUCCESS) {
+ return (clnt_res);
}
- return (clnt_res);
+ mdmn_handle_RPC_error(clnt, "mdmn_wakeup_master", nid);
+ Free(clnt_res);
+ return (NULL);
}
/*
 * Client stub for the mdmn_comm_lock RPC: passes the set/class pair *argp
 * to the daemon behind clnt, using TIMEOUT as the call timeout.  Returns
 * the Zalloc()ed int reply on RPC_SUCCESS.  On any other status the failure
 * is reported through mdmn_handle_RPC_error() (nid is only passed on to
 * that report), the reply buffer is freed and NULL is returned.
 */
int *
-mdmn_comm_lock_1(argp, clnt)
+mdmn_comm_lock_2(argp, clnt, nid)
md_mn_set_and_class_t *argp;
CLIENT *clnt;
+ md_mn_nodeid_t nid;
{
+ enum clnt_stat res;
int *clnt_res = Zalloc(sizeof (int));
- if (clnt_call(clnt, mdmn_comm_lock,
+ res = clnt_call(clnt, mdmn_comm_lock,
(xdrproc_t)xdr_md_mn_set_and_class_t, (caddr_t)argp,
- (xdrproc_t)xdr_int, (caddr_t)clnt_res,
- TIMEOUT) != RPC_SUCCESS) {
- return (NULL);
+ (xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT);
+
+ if (res == RPC_SUCCESS) {
+ return (clnt_res);
}
- return (clnt_res);
+ mdmn_handle_RPC_error(clnt, "mdmn_comm_lock", nid);
+ Free(clnt_res);
+ return (NULL);
}
/*
 * Client stub for the mdmn_comm_unlock RPC: passes the set/class pair *argp
 * to the daemon behind clnt, using TIMEOUT as the call timeout.  Returns
 * the Zalloc()ed int reply on RPC_SUCCESS.  On any other status the failure
 * is reported through mdmn_handle_RPC_error() (nid is only passed on to
 * that report), the reply buffer is freed and NULL is returned.
 */
int *
-mdmn_comm_unlock_1(argp, clnt)
+mdmn_comm_unlock_2(argp, clnt, nid)
md_mn_set_and_class_t *argp;
CLIENT *clnt;
+ md_mn_nodeid_t nid;
{
+ enum clnt_stat res;
int *clnt_res = Zalloc(sizeof (int));
- if (clnt_call(clnt, mdmn_comm_unlock,
+ res = clnt_call(clnt, mdmn_comm_unlock,
(xdrproc_t)xdr_md_mn_set_and_class_t, (caddr_t)argp,
- (xdrproc_t)xdr_int, (caddr_t)clnt_res,
- TIMEOUT) != RPC_SUCCESS) {
- return (NULL);
+ (xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT);
+
+ if (res == RPC_SUCCESS) {
+ return (clnt_res);
}
- return (clnt_res);
+ mdmn_handle_RPC_error(clnt, "mdmn_comm_unlock", nid);
+ Free(clnt_res);
+ return (NULL);
}
/*
 * Client stub for the mdmn_comm_suspend RPC: passes the set/class pair
 * *argp to the daemon behind clnt, using TIMEOUT as the call timeout.
 * Returns the Zalloc()ed int reply on RPC_SUCCESS.  On any other status the
 * failure is reported through mdmn_handle_RPC_error() (nid is only passed
 * on to that report), the reply buffer is freed and NULL is returned.
 */
int *
-mdmn_comm_suspend_1(argp, clnt)
+mdmn_comm_suspend_2(argp, clnt, nid)
md_mn_set_and_class_t *argp;
CLIENT *clnt;
+ md_mn_nodeid_t nid;
{
+ enum clnt_stat res;
int *clnt_res = Zalloc(sizeof (int));
- if (clnt_call(clnt, mdmn_comm_suspend,
+ res = clnt_call(clnt, mdmn_comm_suspend,
(xdrproc_t)xdr_md_mn_set_and_class_t, (caddr_t)argp,
- (xdrproc_t)xdr_int, (caddr_t)clnt_res,
- TIMEOUT) != RPC_SUCCESS) {
- return (NULL);
+ (xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT);
+
+ if (res == RPC_SUCCESS) {
+ return (clnt_res);
}
- return (clnt_res);
+ mdmn_handle_RPC_error(clnt, "mdmn_comm_suspend", nid);
+ Free(clnt_res);
+ return (NULL);
}
/*
 * Client stub for the mdmn_comm_resume RPC: passes the set/class pair *argp
 * to the daemon behind clnt, using TIMEOUT as the call timeout.  Returns
 * the Zalloc()ed int reply on RPC_SUCCESS.  On any other status the failure
 * is reported through mdmn_handle_RPC_error() (nid is only passed on to
 * that report), the reply buffer is freed and NULL is returned.
 */
int *
-mdmn_comm_resume_1(argp, clnt)
+mdmn_comm_resume_2(argp, clnt, nid)
md_mn_set_and_class_t *argp;
CLIENT *clnt;
+ md_mn_nodeid_t nid;
{
+ enum clnt_stat res;
int *clnt_res = Zalloc(sizeof (int));
- if (clnt_call(clnt, mdmn_comm_resume,
+ res = clnt_call(clnt, mdmn_comm_resume,
(xdrproc_t)xdr_md_mn_set_and_class_t, (caddr_t)argp,
- (xdrproc_t)xdr_int, (caddr_t)clnt_res,
- TIMEOUT) != RPC_SUCCESS) {
- return (NULL);
+ (xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT);
+
+ if (res == RPC_SUCCESS) {
+ return (clnt_res);
}
- return (clnt_res);
+ mdmn_handle_RPC_error(clnt, "mdmn_comm_resume", nid);
+ Free(clnt_res);
+ return (NULL);
}
/*
 * Client stub for the mdmn_comm_reinit_set RPC: passes the set number *argp
 * to the daemon behind clnt, using TIMEOUT as the call timeout.  Returns
 * the Zalloc()ed int reply on RPC_SUCCESS.  On any other status the failure
 * is reported through mdmn_handle_RPC_error() (nid is only passed on to
 * that report), the reply buffer is freed and NULL is returned.
 */
int *
-mdmn_comm_reinit_set_1(argp, clnt)
+mdmn_comm_reinit_set_2(argp, clnt, nid)
set_t *argp;
CLIENT *clnt;
+ md_mn_nodeid_t nid;
{
+ enum clnt_stat res;
int *clnt_res = Zalloc(sizeof (int));
- if (clnt_call(clnt, mdmn_comm_reinit_set,
+ res = clnt_call(clnt, mdmn_comm_reinit_set,
(xdrproc_t)xdr_set_t, (caddr_t)argp,
- (xdrproc_t)xdr_int, (caddr_t)clnt_res,
- TIMEOUT) != RPC_SUCCESS) {
- return (NULL);
+ (xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT);
+
+ if (res == RPC_SUCCESS) {
+ return (clnt_res);
}
- return (clnt_res);
+ mdmn_handle_RPC_error(clnt, "mdmn_comm_reinit_set", nid);
+ Free(clnt_res);
+ return (NULL);
}
/*
 * Client stub for the mdmn_comm_msglock RPC: passes the type/lock pair
 * *argp to the daemon behind clnt, using TIMEOUT as the call timeout.
 * Returns the Zalloc()ed int reply on RPC_SUCCESS.  On any other status the
 * failure is reported through mdmn_handle_RPC_error() (nid is only passed
 * on to that report), the reply buffer is freed and NULL is returned.
 */
int *
-mdmn_comm_msglock_1(argp, clnt)
+mdmn_comm_msglock_2(argp, clnt, nid)
md_mn_type_and_lock_t *argp;
CLIENT *clnt;
+ md_mn_nodeid_t nid;
{
+ enum clnt_stat res;
int *clnt_res = Zalloc(sizeof (int));
- if (clnt_call(clnt, mdmn_comm_msglock,
+ res = clnt_call(clnt, mdmn_comm_msglock,
(xdrproc_t)xdr_md_mn_type_and_lock_t, (caddr_t)argp,
- (xdrproc_t)xdr_int, (caddr_t)clnt_res,
- TIMEOUT) != RPC_SUCCESS) {
- return (NULL);
+ (xdrproc_t)xdr_int, (caddr_t)clnt_res, TIMEOUT);
+
+ if (res == RPC_SUCCESS) {
+ return (clnt_res);
}
- return (clnt_res);
+ mdmn_handle_RPC_error(clnt, "mdmn_comm_msglock", nid);
+ Free(clnt_res);
+ return (NULL);
}
@@ -370,6 +451,7 @@ copy_msg(md_mn_msg_t *msg, md_mn_msg_t *dest)
nmsg->msg_flags = msg->msg_flags;
nmsg->msg_setno = msg->msg_setno;
nmsg->msg_type = msg->msg_type;
+ nmsg->msg_recipient = msg->msg_recipient;
nmsg->msg_event_size = msg->msg_event_size;
if (msg->msg_event_size > 0) {
bcopy(msg->msg_event_data, nmsg->msg_event_data,
@@ -379,7 +461,7 @@ copy_msg(md_mn_msg_t *msg, md_mn_msg_t *dest)
}
void
-copy_msg_1(md_mn_msg_t *msg, md_mn_msg_od_t *msgod, int direction)
+copy_msg_2(md_mn_msg_t *msg, md_mn_msg_od_t *msgod, int direction)
{
assert((direction == MD_MN_COPY_TO_ONDISK) ||
(direction == MD_MN_COPY_TO_INCORE));
@@ -390,6 +472,7 @@ copy_msg_1(md_mn_msg_t *msg, md_mn_msg_od_t *msgod, int direction)
msgod->msg_flags = msg->msg_flags;
msgod->msg_setno = msg->msg_setno;
msgod->msg_type = msg->msg_type;
+ msgod->msg_recipient = msg->msg_recipient;
msgod->msg_od_event_size = msg->msg_event_size;
/* paranoid checks */
if (msg->msg_event_size != 0 && msg->msg_event_data != NULL)
@@ -401,6 +484,7 @@ copy_msg_1(md_mn_msg_t *msg, md_mn_msg_od_t *msgod, int direction)
msg->msg_flags = msgod->msg_flags;
msg->msg_setno = msgod->msg_setno;
msg->msg_type = msgod->msg_type;
+ msg->msg_recipient = msgod->msg_recipient;
msg->msg_event_size = msgod->msg_od_event_size;
if (msg->msg_event_data == NULL)
msg->msg_event_data = Zalloc(msg->msg_event_size);
@@ -462,7 +546,7 @@ mdmn_get_local_clnt(uint_t flag)
if (mdmn_clients == (md_mn_client_list_t *)NULL) {
/* if there is no entry, create a client and return it */
local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD,
- ONE, "tcp");
+ TWO, "tcp");
} else {
/*
* If there is an entry from a previous put operation,
@@ -517,6 +601,13 @@ mdmn_put_local_clnt(CLIENT *local_daemon)
* a msgid is already attached to it.
* In that case mdmn_send_message_with_msgid() has to be called directly.
*
+ * The recipient argument is almost always unused, and is therefore typically
+ * set to zero, as zero is an invalid cluster nodeid. The exceptions are the
+ * marking and clearing of the DRL from a node that is not currently the
+ * owner. In these cases, the recipient argument will be the nodeid of the
+ * mirror owner, and MD_MSGF_DIRECTED will be set in the flags. Non-owner
+ * nodes will not receive these messages.
+ *
* Return values / CAVEAT EMPTOR: see mdmn_send_message_with_msgid()
*/
@@ -525,13 +616,14 @@ mdmn_send_message(
set_t setno,
md_mn_msgtype_t type,
uint_t flags,
+ md_mn_nodeid_t recipient,
char *data,
int size,
md_mn_result_t **result,
md_error_t *ep)
{
- return (mdmn_send_message_with_msgid(
- setno, type, flags, data, size, result, MD_NULL_MSGID, ep));
+ return (mdmn_send_message_with_msgid(setno, type, flags,
+ recipient, data, size, result, MD_NULL_MSGID, ep));
}
/*
* mdmn_send_message_with_msgid()
@@ -561,6 +653,7 @@ mdmn_send_message_with_msgid(
set_t setno,
md_mn_msgtype_t type,
uint_t flags,
+ md_mn_nodeid_t recipient,
char *data,
int size,
md_mn_result_t **result,
@@ -619,6 +712,7 @@ mdmn_send_message_with_msgid(
*/
msg.msg_flags = flags;
msg.msg_setno = setno;
+ msg.msg_recipient = recipient;
msg.msg_type = type;
msg.msg_event_size = size;
msg.msg_event_data = data;
@@ -655,7 +749,7 @@ mdmn_send_message_with_msgid(
* - retries1 or retries2 exceeded
*/
for (; ; ) {
- *result = mdmn_send_1(&msg, local_daemon);
+ *result = mdmn_send_2(&msg, local_daemon, 0);
resp = *result;
if (resp != (md_mn_result_t *)NULL) {
/* Bingo! */
@@ -800,8 +894,8 @@ mdmn_suspend(set_t setno, md_mn_msgclass_t class, long timeout)
if ((setno >= MD_MAXSETS) || (class >= MD_MN_NCLASSES)) {
return (MDE_DS_COMMDCTL_SUSPEND_FAIL);
}
- local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, ONE,
- "tcp");
+ local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, TWO,
+ "tcp");
if (local_daemon == (CLIENT *)NULL) {
clnt_pcreateerror("local_daemon");
return (MDE_DS_COMMDCTL_SUSPEND_FAIL);
@@ -818,7 +912,7 @@ mdmn_suspend(set_t setno, md_mn_msgclass_t class, long timeout)
msc.msc_class = class;
msc.msc_flags = 0;
- resp = mdmn_comm_suspend_1(&msc, local_daemon);
+ resp = mdmn_comm_suspend_2(&msc, local_daemon, 0);
clnt_destroy(local_daemon);
if (resp == NULL) {
@@ -861,8 +955,8 @@ mdmn_resume(set_t setno, md_mn_msgclass_t class, uint_t flags, long timeout)
if ((setno >= MD_MAXSETS) || (class >= MD_MN_NCLASSES)) {
return (MDE_DS_COMMDCTL_RESUME_FAIL);
}
- local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, ONE,
- "tcp");
+ local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, TWO,
+ "tcp");
if (local_daemon == (CLIENT *)NULL) {
clnt_pcreateerror("local_daemon");
return (MDE_DS_COMMDCTL_RESUME_FAIL);
@@ -879,7 +973,7 @@ mdmn_resume(set_t setno, md_mn_msgclass_t class, uint_t flags, long timeout)
msc.msc_class = class;
msc.msc_flags = flags;
- resp = mdmn_comm_resume_1(&msc, local_daemon);
+ resp = mdmn_comm_resume_2(&msc, local_daemon, 0);
if (resp != NULL) {
if (*resp == MDMNE_ACK) {
@@ -905,10 +999,8 @@ mdmn_abort(void)
md_error_t mdne = mdnullerror;
(void) mdmn_send_message(0, /* No set is needed for this message */
- MD_MN_MSG_ABORT,
- MD_MSGF_LOCAL_ONLY,
- dummy, sizeof (dummy),
- &resultp, &mdne);
+ MD_MN_MSG_ABORT, MD_MSGF_LOCAL_ONLY, 0,
+ dummy, sizeof (dummy), &resultp, &mdne);
if (resultp != NULL) {
Free(resultp);
@@ -935,8 +1027,8 @@ mdmn_reinit_set(set_t setno, long timeout)
if ((setno == 0) || (setno >= MD_MAXSETS)) {
return (1);
}
- local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, ONE,
- "tcp");
+ local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, TWO,
+ "tcp");
if (local_daemon == (CLIENT *)NULL) {
clnt_pcreateerror("local_daemon");
return (1);
@@ -949,7 +1041,7 @@ mdmn_reinit_set(set_t setno, long timeout)
}
}
- resp = mdmn_comm_reinit_set_1(&setno, local_daemon);
+ resp = mdmn_comm_reinit_set_2(&setno, local_daemon, 0);
if (resp != NULL) {
if (*resp == MDMNE_ACK) {
@@ -984,8 +1076,8 @@ mdmn_msgtype_lock(md_mn_msgtype_t msgtype, uint_t locktype)
if ((msgtype == 0) || (msgtype >= MD_MN_NMESSAGES)) {
return (1);
}
- local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, ONE,
- "tcp");
+ local_daemon = meta_client_create(LOCALHOST_IPv4, MDMN_COMMD, TWO,
+ "tcp");
if (local_daemon == (CLIENT *)NULL) {
clnt_pcreateerror("local_daemon");
return (1);
@@ -993,7 +1085,7 @@ mdmn_msgtype_lock(md_mn_msgtype_t msgtype, uint_t locktype)
mmtl.mmtl_type = msgtype;
mmtl.mmtl_lock = locktype;
- resp = mdmn_comm_msglock_1(&mmtl, local_daemon);
+ resp = mdmn_comm_msglock_2(&mmtl, local_daemon, 0);
if (resp != NULL) {
if (*resp == MDMNE_ACK) {
diff --git a/usr/src/lib/lvm/libmeta/common/meta_mn_handlers.c b/usr/src/lib/lvm/libmeta/common/meta_mn_handlers.c
index ce70615f4e..041bb9b76d 100644
--- a/usr/src/lib/lvm/libmeta/common/meta_mn_handlers.c
+++ b/usr/src/lib/lvm/libmeta/common/meta_mn_handlers.c
@@ -18,13 +18,12 @@
*
* CDDL HEADER END
*/
+
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <stdlib.h>
#include <unistd.h>
#include <wait.h>
@@ -448,7 +447,7 @@ mdmn_do_choose_owner(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
myflags |= msg->msg_flags & MD_MSGF_INHERIT_BITS;
ret = mdmn_send_message(MD_MIN2SET(d->msg_chooseid_mnum),
- MD_MN_MSG_CHANGE_OWNER, myflags, (char *)&chownermsg,
+ MD_MN_MSG_CHANGE_OWNER, myflags, 0, (char *)&chownermsg,
sizeof (chownermsg), &resp1, &mde);
if (resp1 != NULL)
free_result(resp1);
@@ -2120,3 +2119,67 @@ mdmn_do_addmdname(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
resp->mmr_exitval = 0;
}
+
+/*
+ * Message handler that passes a dirty-range request on to the mirror by
+ * issuing the MD_MN_RR_DIRTY ioctl.  The return value of the ioctl is
+ * handed back in resp->mmr_exitval; no output or error text is returned.
+ */
+/*ARGSUSED*/
+void
+mdmn_do_mark_dirty(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
+{
+	md_mn_msg_rr_dirty_t	*dirty_msg;
+	md_mn_rr_dirty_params_t	params;
+
+	/* The reply carries an exit value only. */
+	resp->mmr_comm_state = MDMNE_ACK;
+	resp->mmr_out = NULL;
+	resp->mmr_out_size = 0;
+	resp->mmr_err = NULL;
+	resp->mmr_err_size = 0;
+
+	dirty_msg = (md_mn_msg_rr_dirty_t *)((void *)(msg->msg_event_data));
+
+	/* rr_range holds the start in its upper 16 bits, the end below. */
+	(void) memset(&params, 0, sizeof (params));
+	MD_SETDRIVERNAME(&params, MD_MIRROR, MD_MIN2SET(dirty_msg->rr_mnum))
+	params.rr_mnum = dirty_msg->rr_mnum;
+	params.rr_nodeid = dirty_msg->rr_nodeid;
+	params.rr_start = (ushort_t)((dirty_msg->rr_range >> 16) & 0xffff);
+	params.rr_end = (ushort_t)(dirty_msg->rr_range & 0xffff);
+
+	resp->mmr_exitval = metaioctl(MD_MN_RR_DIRTY, &params, &params.mde,
+	    NULL);
+}
+
+/*
+ * Message handler that passes a clean request on to the mirror by issuing
+ * the MD_MN_RR_CLEAN ioctl.  The ioctl parameter block is the fixed
+ * header followed by a copy of the data carried in the message.  The return
+ * value of the ioctl is handed back in resp->mmr_exitval.
+ */
+/*ARGSUSED*/
+void
+mdmn_do_mark_clean(md_mn_msg_t *msg, uint_t flags, md_mn_result_t *resp)
+{
+	md_mn_msg_rr_clean_t	*clean_msg;
+	md_mn_rr_clean_params_t	*params;
+
+	/* The reply carries an exit value only. */
+	resp->mmr_comm_state = MDMNE_ACK;
+	resp->mmr_out = NULL;
+	resp->mmr_out_size = 0;
+	resp->mmr_err = NULL;
+	resp->mmr_err_size = 0;
+
+	clean_msg = (md_mn_msg_rr_clean_t *)((void *)(msg->msg_event_data));
+
+	/* Header plus room for the data that trails it. */
+	params = Zalloc(sizeof (struct md_mn_rr_clean_params) +
+	    MDMN_MSG_RR_CLEAN_DATA_BYTES(clean_msg));
+	MD_SETDRIVERNAME(params, MD_MIRROR, MD_MIN2SET(clean_msg->rr_mnum))
+	params->rr_mnum = clean_msg->rr_mnum;
+	params->rr_nodeid = clean_msg->rr_nodeid;
+	params->rr_start_size = clean_msg->rr_start_size;
+	(void) memcpy(MDMN_RR_CLEAN_PARAMS_DATA(params),
+	    MDMN_MSG_RR_CLEAN_DATA(clean_msg),
+	    MDMN_MSG_RR_CLEAN_DATA_BYTES(clean_msg));
+
+	resp->mmr_exitval = metaioctl(MD_MN_RR_CLEAN, params, &params->mde,
+	    NULL);
+
+	Free(params);
+}
diff --git a/usr/src/lib/lvm/libmeta/common/meta_mn_msg_table.c b/usr/src/lib/lvm/libmeta/common/meta_mn_msg_table.c
index b24f278617..2fdd7a8713 100644
--- a/usr/src/lib/lvm/libmeta/common/meta_mn_msg_table.c
+++ b/usr/src/lib/lvm/libmeta/common/meta_mn_msg_table.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <meta.h>
extern void mdmn_do_cmd(HANDLER_PARMS);
@@ -56,6 +54,8 @@ extern void mdmn_do_delkeyname(HANDLER_PARMS);
extern void mdmn_do_get_tstate(HANDLER_PARMS);
extern void mdmn_do_get_mirstate(HANDLER_PARMS);
extern void mdmn_do_addmdname(HANDLER_PARMS);
+extern void mdmn_do_mark_dirty(HANDLER_PARMS);
+extern void mdmn_do_mark_clean(HANDLER_PARMS);
extern int mdmn_smgen_test6(SMGEN_PARMS);
extern int mdmn_smgen_state_upd(SMGEN_PARMS);
@@ -693,10 +693,36 @@ md_mn_msg_tbl_entry_t msg_table[MD_MN_NMESSAGES] = {
* Add metadevice name into replica
*/
MD_MSG_CLASS1, /* message class */
- mdmn_do_addmdname, /* add ,etadevice name */
+ mdmn_do_addmdname, /* add metadevice name */
NULL, /* submessage generator */
90, /* times out in 90 secs */
10000, 2, /* class busy retry / time delta */
10, 1000 /* comm fail retry / time delta */
},
+
+ {
+ /*
+ * MD_MN_MSG_RR_DIRTY
+ * Mark given range of un_dirty_bm as dirty
+ */
+ MD_MSG_CLASS2, /* message class */
+ mdmn_do_mark_dirty, /* message handler */
+ NULL, /* submessage generator */
+ 8, /* timeout in seconds */
+ UINT_MAX, 10, /* class busy retry / time delta */
+ UINT_MAX, 100 /* comm fail retry / time delta */
+ },
+
+ {
+ /*
+ * MD_MN_MSG_RR_CLEAN
+ * Mark given range of un_dirty_bm as clean
+ */
+ MD_MSG_CLASS2, /* message class */
+ mdmn_do_mark_clean, /* message handler */
+ NULL, /* submessage generator */
+ 8, /* timeout in seconds */
+ UINT_MAX, 10, /* class busy retry / time delta */
+ UINT_MAX, 100 /* comm fail retry / time delta */
+ },
};
diff --git a/usr/src/lib/lvm/libmeta/common/meta_mn_subr.c b/usr/src/lib/lvm/libmeta/common/meta_mn_subr.c
index aa7127453f..c1cbd68bf4 100644
--- a/usr/src/lib/lvm/libmeta/common/meta_mn_subr.c
+++ b/usr/src/lib/lvm/libmeta/common/meta_mn_subr.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Just in case we're not in a build environment, make sure that
* TEXT_DOMAIN gets set to something.
@@ -62,7 +60,7 @@ meta_is_mn_set(
/* Local set cannot be MultiNode */
if ((sp == NULL) || (sp->setname == NULL) ||
- (strcmp(sp->setname, MD_LOCAL_NAME) == 0))
+ (strcmp(sp->setname, MD_LOCAL_NAME) == 0))
return (0);
sd = metaget_setdesc(sp, ep);
ASSERT(sd != NULL);
@@ -128,7 +126,7 @@ meta_ping_mnset(set_t setno)
md_mn_result_t *resp = NULL;
(void) mdmn_send_message(setno, MD_MN_MSG_TEST2,
- MD_MSGF_NO_LOG | MD_MSGF_FAIL_ON_SUSPEND, data,
+ MD_MSGF_NO_LOG | MD_MSGF_FAIL_ON_SUSPEND, 0, data,
sizeof (data), &resp, &mde);
if (resp != (md_mn_result_t *)NULL) {
@@ -234,9 +232,8 @@ meta_mn_send_command(
} else {
send_message_type = MD_MN_MSG_BC_CMD;
}
- err = mdmn_send_message(
- sp->setno, send_message_type, send_message_flags,
- cmd, 1024, &resp, ep);
+ err = mdmn_send_message(sp->setno, send_message_type,
+ send_message_flags, 0, cmd, 1024, &resp, ep);
free(cmd);
@@ -285,9 +282,9 @@ meta_mn_send_command(
"Command not attempted: Unable to log message "
"in set %s\n"), sp->setname);
if (c.c_flags & MDDB_C_STALE) {
- (void) mdmddberror(ep, MDE_DB_STALE,
- (minor_t)NODEV64, sp->setno, 0, NULL);
- mde_perror(ep, "");
+ (void) mdmddberror(ep, MDE_DB_STALE,
+ (minor_t)NODEV64, sp->setno, 0, NULL);
+ mde_perror(ep, "");
}
} else {
(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
@@ -333,7 +330,7 @@ meta_mn_send_suspend_writes(
*/
result = mdmn_send_message(MD_MIN2SET(mnum),
MD_MN_MSG_SUSPEND_WRITES,
- MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND,
+ MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, 0,
(char *)&suspwrmsg, sizeof (suspwrmsg), &resp, ep);
if (resp != NULL) {
free_result(resp);
@@ -608,7 +605,7 @@ meta_mn_send_setsync(
* time required.
*/
ret = mdmn_send_message(sp->setno, MD_MN_MSG_SETSYNC,
- MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND,
+ MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, 0,
(char *)&setsyncmsg, sizeof (setsyncmsg), &resp, ep);
if (resp != NULL) {
free_result(resp);
@@ -720,7 +717,7 @@ meta_mn_send_resync_starting(
resyncmsg.msg_resync_mnum = mnum;
result = mdmn_send_message(MD_MIN2SET(mnum),
MD_MN_MSG_RESYNC_STARTING,
- MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND,
+ MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND, 0,
(char *)&resyncmsg, sizeof (resyncmsg), &resp, ep);
if (resp != NULL) {
@@ -905,7 +902,7 @@ meta_mn_send_get_tstate(
tstatemsg.gettstate_dev = dev;
result = mdmn_send_message(MD_MIN2SET(mnum),
MD_MN_MSG_GET_TSTATE,
- MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST,
+ MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0,
(char *)&tstatemsg, sizeof (tstatemsg), &resp, ep);
if (result == 0)
diff --git a/usr/src/lib/lvm/libmeta/common/meta_nameinfo.c b/usr/src/lib/lvm/libmeta/common/meta_nameinfo.c
index 986f4e2705..fb128747b3 100644
--- a/usr/src/lib/lvm/libmeta/common/meta_nameinfo.c
+++ b/usr/src/lib/lvm/libmeta/common/meta_nameinfo.c
@@ -1244,11 +1244,11 @@ meta_isopen(
* and the message doesn't need being logged either.
* Hence NO_LOG and NO_MCT
*/
- err = mdmn_send_message(
- sp->setno, MD_MN_MSG_CLU_CHECK,
- MD_MSGF_NO_MCT | MD_MSGF_STOP_ON_ERROR |
- MD_MSGF_NO_LOG | MD_MSGF_OVERRIDE_SUSPEND,
- (char *)&d, sizeof (md_isopen_t), &resp, ep);
+ err = mdmn_send_message(sp->setno,
+ MD_MN_MSG_CLU_CHECK, MD_MSGF_NO_MCT |
+ MD_MSGF_STOP_ON_ERROR | MD_MSGF_NO_LOG |
+ MD_MSGF_OVERRIDE_SUSPEND, 0, (char *)&d,
+ sizeof (md_isopen_t), &resp, ep);
if (err == 0) {
d.isopen = resp->mmr_exitval;
} else {
diff --git a/usr/src/lib/lvm/libmeta/common/meta_runtime.c b/usr/src/lib/lvm/libmeta/common/meta_runtime.c
index f9c5915088..f401219f44 100644
--- a/usr/src/lib/lvm/libmeta/common/meta_runtime.c
+++ b/usr/src/lib/lvm/libmeta/common/meta_runtime.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,13 +18,12 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Just in case we're not in a build environment, make sure that
* TEXT_DOMAIN gets set to something.
@@ -171,9 +169,8 @@ do_owner_ioctls(void)
ownerioctls_onp) != 0) {
(void) fprintf(stderr, dgettext(TEXT_DOMAIN,
"%s: illegal value for %s: %s.\n"),
- function_namep,
- ownerioctls_namep,
- param_valuep);
+ function_namep, ownerioctls_namep,
+ param_valuep);
syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
"%s: illegal value for %s: %s.\n"),
function_namep,
@@ -216,6 +213,32 @@ commd_get_outfile(void)
}
/*
+ * This controls what type of RPC errors are sent to syslog().
+ * It is used as a bitmask against the clnt_stat list, which defines
+ * 0 as RPC_SUCCESS, so likely shouldn't be set.
+ *
+ * The #define below provides a default of all errors in the list.
+ * The default can then be modified to reduce the amount of traffic
+ * going to syslog in the event of RPC errors.
+ */
+
+#define DEFAULT_ERRMASK (UINT_MAX & ~(1 << RPC_SUCCESS))
+
+/*
+ * Return the bitmask of clnt_stat values whose RPC errors go to syslog.
+ *
+ * The mask is read from the "commd_RPC_errors" runtime parameter and parsed
+ * as a hexadecimal number.  DEFAULT_ERRMASK is returned when the parameter
+ * is not set or when its value does not start with a hex number.
+ */
+uint_t
+meta_rpc_err_mask(void)
+{
+	char		*param_valuep;
+	char		*endp;
+	unsigned long	mask;
+	uint_t		retval = DEFAULT_ERRMASK;
+
+	param_valuep = meta_get_rt_param("commd_RPC_errors", B_FALSE);
+	if (param_valuep != NULL) {
+		/*
+		 * strtoul() rather than strtol(): where long is 32 bits
+		 * strtol() clamps at LONG_MAX, so bit 31 of the mask could
+		 * never be set from the parameter file.
+		 */
+		mask = strtoul(param_valuep, &endp, 16);
+
+		/* Nothing parsed: keep the default instead of a mask of 0. */
+		if (endp != param_valuep)
+			retval = (uint_t)mask;
+		free(param_valuep);
+	}
+	return (retval);
+}
+
+/*
* The following lines define private functions
*/
@@ -232,27 +255,23 @@ meta_get_rt_param(const char *param_namep, boolean_t warn_if_not_found)
line_bufferp = (char *)malloc(line_buffer_size);
if (line_bufferp == NULL) {
- (void) fprintf(stderr,
- dgettext(TEXT_DOMAIN, "%s: malloc failed\n"),
- function_namep);
- syslog(LOG_ERR,
- dgettext(TEXT_DOMAIN, "%s: malloc failed\n"),
- function_namep);
+ (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
+ "%s: malloc failed\n"), function_namep);
+ syslog(LOG_ERR, dgettext(TEXT_DOMAIN, "%s: malloc failed\n"),
+ function_namep);
return (param_valuep);
}
param_filep = fopen(param_file_namep, "r");
if (param_filep == NULL) {
- (void) fprintf(stderr,
- dgettext(TEXT_DOMAIN, "%s: can't open %s\n"),
- function_namep, param_file_namep);
- syslog(LOG_ERR,
- dgettext(TEXT_DOMAIN, "%s: can't open %s\n"),
- function_namep, param_file_namep);
+ (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
+ "%s: can't open %s\n"), function_namep, param_file_namep);
+ syslog(LOG_ERR, dgettext(TEXT_DOMAIN, "%s: can't open %s\n"),
+ function_namep, param_file_namep);
free(line_bufferp);
return (param_valuep);
}
while ((fgets(line_bufferp, line_buffer_size, param_filep) != NULL) &&
- (param_valuep == NULL)) {
+ (param_valuep == NULL)) {
newlinep = strchr(line_bufferp, '\n');
if (newlinep != NULL) {
@@ -261,10 +280,10 @@ meta_get_rt_param(const char *param_namep, boolean_t warn_if_not_found)
}
param_name_tokenp = strtok(line_bufferp, token_separator_listp);
if ((param_name_tokenp != NULL) &&
- (strcmp(param_namep, param_name_tokenp) == 0)) {
+ (strcmp(param_namep, param_name_tokenp) == 0)) {
param_value_tokenp = strtok(NULL,
- token_separator_listp);
+ token_separator_listp);
}
if (param_value_tokenp != NULL) {
param_valuep = strdup(param_value_tokenp);
@@ -282,18 +301,12 @@ meta_get_rt_param(const char *param_namep, boolean_t warn_if_not_found)
}
}
if ((param_valuep == NULL) && (warn_if_not_found == B_TRUE)) {
- (void) fprintf(stderr,
- dgettext(TEXT_DOMAIN,
- "%s: value of %s not set or error in %s\n"),
- function_namep,
- param_namep,
- param_file_namep);
- syslog(LOG_ERR,
- dgettext(TEXT_DOMAIN,
- "%s: value of %s not set or error in %s\n"),
- function_namep,
- param_namep,
- param_file_namep);
+ (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
+ "%s: value of %s not set or error in %s\n"),
+ function_namep, param_namep, param_file_namep);
+ syslog(LOG_ERR, dgettext(TEXT_DOMAIN,
+ "%s: value of %s not set or error in %s\n"),
+ function_namep, param_namep, param_file_namep);
}
free(line_bufferp);
(void) fclose(param_filep);
diff --git a/usr/src/lib/lvm/libmeta/common/meta_set.c b/usr/src/lib/lvm/libmeta/common/meta_set.c
index 58c7f85735..59d592ce40 100644
--- a/usr/src/lib/lvm/libmeta/common/meta_set.c
+++ b/usr/src/lib/lvm/libmeta/common/meta_set.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Just in case we're not in a build environment, make sure that
* TEXT_DOMAIN gets set to something.
@@ -1877,7 +1875,6 @@ metadrivename_withdrkey(
return (NULL);
}
-
/*
* Get the devid associated with the key.
*
@@ -1893,6 +1890,11 @@ metadrivename_withdrkey(
*/
dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep);
free(devidp);
+
+ /* dnp could be NULL if the devid could not be decoded. */
+ if (dnp == NULL) {
+ return (NULL);
+ }
dnp->side_names_key = key;
} else {
/*
@@ -1981,6 +1983,9 @@ metadrivename_withdrkey(
*/
dnp = meta_getdnp_bydevid(sp, sideno, devidp, key, ep);
free(devidp);
+ if (dnp == NULL) {
+ return (NULL);
+ }
dnp->side_names_key = key;
}
}
@@ -5733,6 +5738,7 @@ meta_mnsync_diskset_mddbs(
lr->lr_msg.msg_type,
lr->lr_msg.msg_flags | MD_MSGF_REPLAY_MSG |
MD_MSGF_OVERRIDE_SUSPEND,
+ lr->lr_msg.msg_recipient,
lr->lr_msg.msg_event_data,
lr->lr_msg.msg_event_size,
&resultp,
diff --git a/usr/src/lib/lvm/libmeta/common/meta_set_hst.c b/usr/src/lib/lvm/libmeta/common/meta_set_hst.c
index fcfd8faaa3..364b463c84 100644
--- a/usr/src/lib/lvm/libmeta/common/meta_set_hst.c
+++ b/usr/src/lib/lvm/libmeta/common/meta_set_hst.c
@@ -18,13 +18,12 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Just in case we're not in a build environment, make sure that
* TEXT_DOMAIN gets set to something.
@@ -148,7 +147,7 @@ add_md_sidenms(mdsetname_t *sp, side_t sideno, side_t otherside, md_error_t *ep)
send_rval = mdmn_send_message(sp->setno,
MD_MN_MSG_META_MD_ADDSIDE,
MD_MSGF_FAIL_ON_SUSPEND | MD_MSGF_PANIC_WHEN_INCONSISTENT,
- (char *)&md_as, sizeof (md_mn_msg_meta_md_addside_t),
+ 0, (char *)&md_as, sizeof (md_mn_msg_meta_md_addside_t),
&resultp, ep);
if (send_rval != 0) {
(void) mdstealerror(ep, &(resultp->mmr_ep));
@@ -178,7 +177,7 @@ add_md_sidenms(mdsetname_t *sp, side_t sideno, side_t otherside, md_error_t *ep)
* Let's see if it is hsp or not
*/
nm.devname = (uintptr_t)meta_getnmentbykey(sp->setno,
- otherside, nm.key, &drvnm, NULL, NULL, ep);
+ otherside, nm.key, &drvnm, NULL, NULL, ep);
if (nm.devname == NULL || drvnm == NULL) {
if (nm.devname)
Free((void *)(uintptr_t)nm.devname);
@@ -229,9 +228,9 @@ add_md_sidenms(mdsetname_t *sp, side_t sideno, side_t otherside, md_error_t *ep)
* increment the count to sync up with the other sides.
*/
for (i = 0; i < nm.ref_count; i++) {
- if (add_name(sp, sideno, nm.key, dname, mnum,
- cname, NULL, NULL, ep) == -1)
- rval = -1;
+ if (add_name(sp, sideno, nm.key, dname, mnum,
+ cname, NULL, NULL, ep) == -1)
+ rval = -1;
}
Free(cname);
@@ -323,17 +322,17 @@ create_multinode_set_on_hosts(
(void) strcpy(nd->nd_nodename, node_v[i]);
nd->nd_ctime = now;
nd->nd_flags = (MD_MN_NODE_ALIVE |
- MD_MN_NODE_ADD);
+ MD_MN_NODE_ADD);
nl2 = nl;
while (nl2) {
- if (strcmp(nl2->msl_node_name,
- node_v[i]) == 0) {
- nd->nd_nodeid = nl2->msl_node_id;
- (void) strcpy(nd->nd_priv_ic,
- nl2->msl_node_addr);
- break;
- }
- nl2 = nl2->next;
+ if (strcmp(nl2->msl_node_name,
+ node_v[i]) == 0) {
+ nd->nd_nodeid = nl2->msl_node_id;
+ (void) strcpy(nd->nd_priv_ic,
+ nl2->msl_node_addr);
+ break;
+ }
+ nl2 = nl2->next;
}
/*
@@ -1123,7 +1122,7 @@ del_md_sidenms(mdsetname_t *sp, side_t sideno, md_error_t *ep)
send_rval = mdmn_send_message(sp->setno,
MD_MN_MSG_META_MD_DELSIDE,
MD_MSGF_FAIL_ON_SUSPEND | MD_MSGF_PANIC_WHEN_INCONSISTENT,
- (char *)&md_ds, sizeof (md_mn_msg_meta_md_delside_t),
+ 0, (char *)&md_ds, sizeof (md_mn_msg_meta_md_delside_t),
&resultp, ep);
if (send_rval != 0) {
(void) mdstealerror(ep, &(resultp->mmr_ep));
@@ -1156,8 +1155,8 @@ del_md_sidenms(mdsetname_t *sp, side_t sideno, md_error_t *ep)
* actually removed.
*/
for (i = 0; i < nm.ref_count; i++) {
- if (del_name(sp, sideno, nm.key, ep) == -1)
- return (-1);
+ if (del_name(sp, sideno, nm.key, ep) == -1)
+ return (-1);
}
}
}
@@ -1183,7 +1182,7 @@ recreate_set(
continue;
}
has_set = nodehasset(sp, nd->nd_nodename,
- NHS_NST_EQ, &xep);
+ NHS_NST_EQ, &xep);
if (has_set >= 0) {
nd = nd->nd_next;
@@ -1207,7 +1206,7 @@ recreate_set(
continue;
has_set = nodehasset(sp, sd->sd_nodes[i],
- NHS_NST_EQ, &xep);
+ NHS_NST_EQ, &xep);
if (has_set >= 0)
continue;
@@ -1967,7 +1966,8 @@ make_sideno_sidenm(
return (-1);
/* find the end of the link list */
- for (sn = dnp->side_names; sn->next != NULL; sn = sn->next);
+ for (sn = dnp->side_names; sn->next != NULL; sn = sn->next)
+ ;
sn_next = &sn->next;
if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
@@ -1986,13 +1986,13 @@ make_sideno_sidenm(
* used instead of meta_getnextside_devinfo.
*/
if (meta_getside_devinfo(sp, np->bname, sideno, &sn->cname,
- &sn->dname, &sn->mnum, ep) == -1)
+ &sn->dname, &sn->mnum, ep) == -1)
err = -1;
} else {
/* decrement sideno, to look like the previous sideno */
sideno--;
- if (meta_getnextside_devinfo(sp, np->bname, &sideno, &sn->cname,
- &sn->dname, &sn->mnum, ep) == -1)
+ if (meta_getnextside_devinfo(sp, np->bname, &sideno,
+ &sn->cname, &sn->dname, &sn->mnum, ep) == -1)
err = -1;
}
@@ -2377,14 +2377,14 @@ meta_multinode_set_addhosts(
nd->nd_ctime = now;
nl2 = nl;
while (nl2) {
- if (strcmp(nl2->msl_node_name,
- node_v[nodeindex]) == 0) {
- nd->nd_nodeid = nl2->msl_node_id;
- (void) strcpy(nd->nd_priv_ic,
- nl2->msl_node_addr);
- break;
- }
- nl2 = nl2->next;
+ if (strcmp(nl2->msl_node_name,
+ node_v[nodeindex]) == 0) {
+ nd->nd_nodeid = nl2->msl_node_id;
+ (void) strcpy(nd->nd_priv_ic,
+ nl2->msl_node_addr);
+ break;
+ }
+ nl2 = nl2->next;
}
/*
@@ -2773,16 +2773,16 @@ out:
* rpc.mdcommd is running on the nodes with a set.
*/
if (remote_sets_created == 1) {
- for (i = 0; i < node_c; i++) {
- if (clnt_mdcommdctl(node_v[i], COMMDCTL_REINIT,
- sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
- if (rval == 0)
- (void) mdstealerror(ep, &xep);
- rval = -1;
- mde_perror(ep, dgettext(TEXT_DOMAIN,
- "Unable to reinit rpc.mdcommd.\n"));
+ for (i = 0; i < node_c; i++) {
+ if (clnt_mdcommdctl(node_v[i], COMMDCTL_REINIT,
+ sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
+ if (rval == 0)
+ (void) mdstealerror(ep, &xep);
+ rval = -1;
+ mde_perror(ep, dgettext(TEXT_DOMAIN,
+ "Unable to reinit rpc.mdcommd.\n"));
+ }
}
- }
}
}
if ((suspend1_flag) || (suspendall_flag)) {
@@ -2819,17 +2819,18 @@ out:
* rpc.mdcommd is be running on the nodes with a set.
*/
if (remote_sets_created == 1) {
- for (i = 0; i < node_c; i++) {
- /* Already verified to be alive */
- if (clnt_mdcommdctl(node_v[i], COMMDCTL_RESUME,
- sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
- if (rval == 0)
- (void) mdstealerror(ep, &xep);
- rval = -1;
- mde_perror(ep, dgettext(TEXT_DOMAIN,
- "Unable to resume rpc.mdcommd.\n"));
+ for (i = 0; i < node_c; i++) {
+ /* Already verified to be alive */
+ if (clnt_mdcommdctl(node_v[i], COMMDCTL_RESUME,
+ sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS,
+ &xep)) {
+ if (rval == 0)
+ (void) mdstealerror(ep, &xep);
+ rval = -1;
+ mde_perror(ep, dgettext(TEXT_DOMAIN,
+ "Unable to resume rpc.mdcommd.\n"));
+ }
}
- }
}
meta_ping_mnset(sp->setno);
/*
@@ -4031,7 +4032,8 @@ meta_set_deletehosts(
rb_medr.med_rec_sn = sp->setno;
(void) strcpy(rb_medr.med_rec_snm, sp->setname);
for (i = 0; i < MD_MAXSIDES; i++)
- (void) strcpy(rb_medr.med_rec_nodes[i], sd->sd_nodes[i]);
+ (void) strcpy(rb_medr.med_rec_nodes[i],
+ sd->sd_nodes[i]);
rb_medr.med_rec_meds = sd->sd_med; /* structure assigment */
(void) memset(&rb_medr.med_rec_data, '\0', sizeof (med_data_t));
rb_medr.med_rec_foff = 0;
@@ -4432,45 +4434,52 @@ meta_set_deletehosts(
* alive nodes are updated correctly.
*/
if (strcmp(nd->nd_nodename, node_v[i]) == 0) {
- if ((oha == TRUE) &&
- (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
+ if ((oha == TRUE) && (!(nd->nd_flags &
+ MD_MN_NODE_ALIVE))) {
nd->nd_flags |= MD_MN_NODE_DEL;
nd->nd_flags &= ~MD_MN_NODE_OK;
nd = nd->nd_next;
continue;
- }
- if (nd->nd_flags & MD_MN_NODE_OWN) {
- /*
- * Going to set locally cached node
- * flags to rollback join so in case
- * of error, the rollback code knows
- * which nodes to re-join.
- * rpc.metad ignores the RB_JOIN flag.
- */
- nd->nd_flags |= MD_MN_NODE_RB_JOIN;
- nd->nd_flags &= ~MD_MN_NODE_OWN;
+ }
+ if (nd->nd_flags & MD_MN_NODE_OWN) {
+ /*
+ * Going to set locally cached
+ * node flags to rollback join
+ * so in case of error, the
+ * rollback code knows which
+ * nodes to re-join. rpc.metad
+ * ignores the RB_JOIN flag.
+ */
+ nd->nd_flags |=
+ MD_MN_NODE_RB_JOIN;
+ nd->nd_flags &= ~MD_MN_NODE_OWN;
- /*
- * Be careful in ordering of following
- * steps so that recovery from a panic
- * between the steps is viable.
- * Only reset master info in rpc.metad
- * - don't reset local cached info
- * which will be used to set master
- * info back if failure (rollback).
- */
- if (clnt_withdrawset(nd->nd_nodename,
- sp, ep))
- goto rollback;
-
- /* Reset master on deleted node */
- if (clnt_mnsetmaster(node_v[i], sp, "",
- MD_MN_INVALID_NID, ep))
- goto rollback;
- }
-
- nd->nd_flags |= MD_MN_NODE_DEL;
- nd->nd_flags &= ~MD_MN_NODE_OK;
+ /*
+ * Be careful in ordering of
+ * following steps so that
+ * recovery from a panic
+ * between the steps is viable.
+ * Only reset master info in
+ * rpc.metad - don't reset
+ * local cached info which will
+ * be used to set master info
+ * back if failure (rollback).
+ */
+ if (clnt_withdrawset(
+ nd->nd_nodename, sp, ep))
+ goto rollback;
+
+ /*
+ * Reset master on deleted node
+ */
+ if (clnt_mnsetmaster(node_v[i],
+ sp, "", MD_MN_INVALID_NID,
+ ep))
+ goto rollback;
+ }
+
+ nd->nd_flags |= MD_MN_NODE_DEL;
+ nd->nd_flags &= ~MD_MN_NODE_OK;
}
nd = nd->nd_next;
}
@@ -4503,37 +4512,37 @@ meta_set_deletehosts(
/* Send reinit */
nd = sd->sd_nodelist;
while (nd) {
- if ((oha == TRUE) &&
- (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
- nd = nd->nd_next;
- continue;
- }
- /* Class is ignored for REINIT */
- if (clnt_mdcommdctl(nd->nd_nodename,
- COMMDCTL_REINIT,
- sp, NULL, MD_MSCF_NO_FLAGS, ep)) {
- mde_perror(ep, dgettext(TEXT_DOMAIN,
- "Unable to reinit rpc.mdcommd.\n"));
- goto rollback;
- }
- nd = nd->nd_next;
+ if ((oha == TRUE) &&
+ (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
+ nd = nd->nd_next;
+ continue;
+ }
+ /* Class is ignored for REINIT */
+ if (clnt_mdcommdctl(nd->nd_nodename,
+ COMMDCTL_REINIT, sp, NULL,
+ MD_MSCF_NO_FLAGS, ep)) {
+ mde_perror(ep, dgettext(TEXT_DOMAIN,
+ "Unable to reinit rpc.mdcommd.\n"));
+ goto rollback;
+ }
+ nd = nd->nd_next;
}
/* Send resume */
nd = sd->sd_nodelist;
while (nd) {
- if ((oha == TRUE) &&
- (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
- nd = nd->nd_next;
- continue;
- }
- if (clnt_mdcommdctl(nd->nd_nodename,
- COMMDCTL_RESUME, sp, MD_MSG_CLASS0,
- MD_MSCF_DONT_RESUME_CLASS1, ep)) {
- mde_perror(ep, dgettext(TEXT_DOMAIN,
- "Unable to resume rpc.mdcommd.\n"));
- goto rollback;
- }
- nd = nd->nd_next;
+ if ((oha == TRUE) &&
+ (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
+ nd = nd->nd_next;
+ continue;
+ }
+ if (clnt_mdcommdctl(nd->nd_nodename,
+ COMMDCTL_RESUME, sp, MD_MSG_CLASS0,
+ MD_MSCF_DONT_RESUME_CLASS1, ep)) {
+ mde_perror(ep, dgettext(TEXT_DOMAIN,
+ "Unable to resume rpc.mdcommd.\n"));
+ goto rollback;
+ }
+ nd = nd->nd_next;
}
meta_ping_mnset(sp->setno);
}
@@ -4727,50 +4736,52 @@ meta_set_deletehosts(
RB_TEST(24, "deletehosts", ep)
}
} else {
- nd = sd->sd_nodelist;
- /* All nodes guaranteed to be ALIVE unless in oha mode */
- while (nd) {
- /*
- * If mirror owner was set to a deleted node, then
- * each existing node resets mirror owner to NULL.
- *
- * During OHA mode, don't issue RPCs to
- * non-alive nodes since there is no reason to
- * wait for RPC timeouts.
- */
- if ((oha == TRUE) &&
- (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
- nd = nd->nd_next;
- continue;
- }
+ nd = sd->sd_nodelist;
+ /* All nodes guaranteed ALIVE unless in oha mode */
+ while (nd) {
+ /*
+ * If mirror owner was set to a deleted node,
+ * then each existing node resets mirror owner
+ * to NULL.
+ *
+ * During OHA mode, don't issue RPCs to
+ * non-alive nodes since there is no reason to
+ * wait for RPC timeouts.
+ */
+ if ((oha == TRUE) &&
+ (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
+ nd = nd->nd_next;
+ continue;
+ }
- /* Skip nodes being deleted */
- if (strinlst(nd->nd_nodename, node_c, node_v)) {
- nd = nd->nd_next;
- continue;
- }
+ /* Skip nodes being deleted */
+ if (strinlst(nd->nd_nodename, node_c, node_v)) {
+ nd = nd->nd_next;
+ continue;
+ }
- /*
- * If mirror owner is a deleted node, reset mirror
- * owners to NULL. If an error occurs, print a
- * warning and continue. Don't fail metaset
- * because of mirror owner reset problem since next
- * node to grab mirror will resolve this issue.
- * Before next node grabs mirrors, metaset will show
- * the deleted node as owner which is why an attempt
- * to reset the mirror owner is made.
- */
- if (clnt_reset_mirror_owner(nd->nd_nodename, sp,
- node_c, &node_id_list[0], &xep) == -1) {
- mde_perror(&xep, dgettext(TEXT_DOMAIN,
- "Unable to reset mirror owner on"
- " node %s\n"), nd->nd_nodename);
- mdclrerror(&xep);
- }
+ /*
+ * If mirror owner is a deleted node, reset
+ * mirror owners to NULL. If an error occurs,
+ * print a warning and continue. Don't fail
+ * metaset because of mirror owner reset
+ * problem since next node to grab mirror
+ * will resolve this issue. Before next node
+ * grabs mirrors, metaset will show the deleted
+ * node as owner which is why an attempt to
+ * reset the mirror owner is made.
+ */
+ if (clnt_reset_mirror_owner(nd->nd_nodename, sp,
+ node_c, &node_id_list[0], &xep) == -1) {
+ mde_perror(&xep, dgettext(TEXT_DOMAIN,
+ "Unable to reset mirror owner on"
+ " node %s\n"), nd->nd_nodename);
+ mdclrerror(&xep);
+ }
- RB_TEST(21, "deletehosts", ep)
- nd = nd->nd_next;
- }
+ RB_TEST(21, "deletehosts", ep)
+ nd = nd->nd_next;
+ }
}
}
@@ -4790,10 +4801,10 @@ meta_set_deletehosts(
for (i = 0; i < MD_MAXSIDES; i++) {
if (strinlst(sd->sd_nodes[i], node_c, node_v))
(void) memset(&medr.med_rec_nodes[i],
- '\0', sizeof (md_node_nm_t));
+ '\0', sizeof (md_node_nm_t));
else
(void) strcpy(medr.med_rec_nodes[i],
- sd->sd_nodes[i]);
+ sd->sd_nodes[i]);
}
crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
@@ -5636,79 +5647,85 @@ meta_set_auto_take(
/* Lock the set on our side */
if (clnt_lock_set(hostname, sp, ep)) {
- rval = -1;
- goto out;
+ rval = -1;
+ goto out;
}
if (take_val) {
- /* enable auto_take but only if it is not already set */
- if (! (sd->sd_flags & MD_SR_AUTO_TAKE)) {
- /* verify that we're the only host in the set */
- for (i = 0; i < MD_MAXSIDES; i++) {
- if (sd->sd_nodes[i] == NULL || sd->sd_nodes[i][0] == '\0')
- continue;
+ /* enable auto_take but only if it is not already set */
+ if (! (sd->sd_flags & MD_SR_AUTO_TAKE)) {
+ /* verify that we're the only host in the set */
+ for (i = 0; i < MD_MAXSIDES; i++) {
+ if (sd->sd_nodes[i] == NULL ||
+ sd->sd_nodes[i][0] == '\0')
+ continue;
- if (strcmp(sd->sd_nodes[i], hostname) != 0) {
- (void) mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL,
- NULL, sp->setname);
- rval = -1;
- goto out;
- }
- }
+ if (strcmp(sd->sd_nodes[i], hostname) != 0) {
+ (void) mddserror(ep, MDE_DS_SINGLEHOST,
+ sp->setno, NULL, NULL, sp->setname);
+ rval = -1;
+ goto out;
+ }
+ }
- if (clnt_enable_sr_flags(hostname, sp, MD_SR_AUTO_TAKE, ep))
- rval = -1;
+ if (clnt_enable_sr_flags(hostname, sp,
+ MD_SR_AUTO_TAKE, ep))
+ rval = -1;
- /* Disable SCSI reservations */
- if (sd->sd_flags & MD_SR_MB_DEVID)
- dd = metaget_drivedesc(sp, MD_BASICNAME_OK | PRINT_FAST,
- &xep);
- else
- dd = metaget_drivedesc(sp, MD_BASICNAME_OK, &xep);
- if (! mdisok(&xep))
- mdclrerror(&xep);
+ /* Disable SCSI reservations */
+ if (sd->sd_flags & MD_SR_MB_DEVID)
+ dd = metaget_drivedesc(sp, MD_BASICNAME_OK |
+ PRINT_FAST, &xep);
+ else
+ dd = metaget_drivedesc(sp, MD_BASICNAME_OK,
+ &xep);
- if (dd != NULL) {
- if (rel_own_bydd(sp, dd, TRUE, &xep))
- mdclrerror(&xep);
+ if (! mdisok(&xep))
+ mdclrerror(&xep);
+
+ if (dd != NULL) {
+ if (rel_own_bydd(sp, dd, TRUE, &xep))
+ mdclrerror(&xep);
+ }
}
- }
} else {
- /* disable auto_take, if set, or error */
- if (sd->sd_flags & MD_SR_AUTO_TAKE) {
- if (clnt_disable_sr_flags(hostname, sp, MD_SR_AUTO_TAKE, ep))
- rval = -1;
-
- /* Enable SCSI reservations */
- if (sd->sd_flags & MD_SR_MB_DEVID)
- dd = metaget_drivedesc(sp, MD_BASICNAME_OK | PRINT_FAST,
- &xep);
- else
- dd = metaget_drivedesc(sp, MD_BASICNAME_OK, &xep);
- if (! mdisok(&xep))
- mdclrerror(&xep);
+ /* disable auto_take, if set, or error */
+ if (sd->sd_flags & MD_SR_AUTO_TAKE) {
+ if (clnt_disable_sr_flags(hostname, sp,
+ MD_SR_AUTO_TAKE, ep))
+ rval = -1;
- if (dd != NULL) {
- mhd_mhiargs_t mhiargs = defmhiargs;
+ /* Enable SCSI reservations */
+ if (sd->sd_flags & MD_SR_MB_DEVID)
+ dd = metaget_drivedesc(sp, MD_BASICNAME_OK |
+ PRINT_FAST, &xep);
+ else
+ dd = metaget_drivedesc(sp, MD_BASICNAME_OK,
+ &xep);
- if (tk_own_bydd(sp, dd, &mhiargs, TRUE, &xep))
- mdclrerror(&xep);
- }
+ if (! mdisok(&xep))
+ mdclrerror(&xep);
- } else {
- (void) mddserror(ep, MDE_DS_AUTONOTSET, sp->setno, NULL, NULL,
- sp->setname);
- rval = -1;
- }
+ if (dd != NULL) {
+ mhd_mhiargs_t mhiargs = defmhiargs;
+
+ if (tk_own_bydd(sp, dd, &mhiargs, TRUE, &xep))
+ mdclrerror(&xep);
+ }
+ } else {
+ (void) mddserror(ep, MDE_DS_AUTONOTSET, sp->setno,
+ NULL, NULL, sp->setname);
+ rval = -1;
+ }
}
out:
cl_sk = cl_get_setkey(sp->setno, sp->setname);
if (clnt_unlock_set(hostname, cl_sk, &xep)) {
- if (rval == 0)
- (void) mdstealerror(ep, &xep);
- rval = -1;
+ if (rval == 0)
+ (void) mdstealerror(ep, &xep);
+ rval = -1;
}
cl_set_setkey(NULL);
diff --git a/usr/src/lib/lvm/libmeta/common/meta_sp.c b/usr/src/lib/lvm/libmeta/common/meta_sp.c
index 7fc6396a53..c69c211f00 100644
--- a/usr/src/lib/lvm/libmeta/common/meta_sp.c
+++ b/usr/src/lib/lvm/libmeta/common/meta_sp.c
@@ -18,6 +18,7 @@
*
* CDDL HEADER END
*/
+
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
@@ -1895,7 +1896,7 @@ meta_sp_extlist_from_wm(
wm.wm_mdname);
result = mdmn_send_message(sp->setno,
MD_MN_MSG_ADDMDNAME,
- MD_MSGF_PANIC_WHEN_INCONSISTENT,
+ MD_MSGF_PANIC_WHEN_INCONSISTENT, 0,
(char *)send_params, message_size, &resp,
ep);
Free(send_params);
@@ -2384,10 +2385,11 @@ meta_sp_get_start(
}
/*
- * FUNCTION: meta_sp_update_wm()
+ * FUNCTION: meta_sp_update_wm_common()
* INPUT: sp - the operating set
* msp - a pointer to the XDR unit structure
* extlist - the extent list specifying watermarks to update
+ * iocval - either MD_IOC_SPUPDATEWM or MD_MN_IOC_SPUPDATEWM
* OUTPUT: ep - return error pointer
* RETURNS: int - -1 if error, 0 on success
* PURPOSE: steps backwards through the extent list updating
@@ -2401,10 +2403,11 @@ meta_sp_get_start(
* are realized.
*/
static int
-meta_sp_update_wm(
+meta_sp_update_wm_common(
mdsetname_t *sp,
md_sp_t *msp,
sp_ext_node_t *extlist,
+ int iocval,
md_error_t *ep
)
{
@@ -2493,8 +2496,8 @@ meta_sp_update_wm(
MD_SETDRIVERNAME(&update_params, MD_SP,
MD_MIN2SET(update_params.mnum));
- if (metaioctl(MD_IOC_SPUPDATEWM, &update_params,
- &update_params.mde, msp->common.namep->cname) != 0) {
+ if (metaioctl(iocval, &update_params, &update_params.mde,
+ msp->common.namep->cname) != 0) {
(void) mdstealerror(ep, &update_params.mde);
rval = -1;
goto out;
@@ -2507,6 +2510,30 @@ out:
return (rval);
}
+static int
+meta_sp_update_wm(
+ mdsetname_t *sp,
+ md_sp_t *msp,
+ sp_ext_node_t *extlist,
+ md_error_t *ep
+)
+{
+ return (meta_sp_update_wm_common(sp, msp, extlist, MD_IOC_SPUPDATEWM,
+ ep));
+}
+
+static int
+meta_mn_sp_update_wm(
+ mdsetname_t *sp,
+ md_sp_t *msp,
+ sp_ext_node_t *extlist,
+ md_error_t *ep
+)
+{
+ return (meta_sp_update_wm_common(sp, msp, extlist, MD_MN_IOC_SPUPDATEWM,
+ ep));
+}
+
/*
* FUNCTION: meta_sp_clear_wm()
* INPUT: sp - the operating set
@@ -4227,9 +4254,9 @@ meta_create_sp(
int committed = 0;
int repart_options = MD_REPART_FORCE;
int create_flag = MD_CRO_32BIT;
+ int mn_set_master = 0;
md_set_desc *sd;
- mm_unit_t *mm;
md_set_mmown_params_t *ownpar = NULL;
int comp_is_mirror = 0;
@@ -4417,19 +4444,7 @@ meta_create_sp(
goto out;
}
if (MD_MNSET_DESC(sd) && sd->sd_mn_am_i_master) {
- mm = (mm_unit_t *)meta_get_unit(sp, compnp, ep);
- if (mm == NULL) {
- rval = -1;
- goto out;
- } else {
- rval = meta_mn_change_owner(&ownpar, sp->setno,
- meta_getminor(compnp->dev),
- sd->sd_mn_mynode->nd_nodeid,
- MD_MN_MM_PREVENT_CHANGE |
- MD_MN_MM_SPAWN_THREAD);
- if (rval == -1)
- goto out;
- }
+ mn_set_master = 1;
}
}
@@ -4450,22 +4465,22 @@ meta_create_sp(
committed = 1;
/* write watermarks */
- if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) {
- rval = -1;
- goto out;
- }
-
/*
- * Allow mirror ownership to change. If we don't succeed in this
- * ioctl it isn't fatal, but the cluster will probably hang fairly
- * soon as the mirror owner won't change. However, we have
- * successfully written the watermarks out to the device so the
- * softpart creation has succeeded
+ * Special-case for Multi-node sets. As we now have a distributed DRL
+ * update mechanism, we _will_ hit the ioctl-within-ioctl deadlock case
+ * unless we use a 'special' MN-capable ioctl to stage the watermark
+ * update. This only affects the master-node in an MN set.
*/
- if (ownpar) {
- (void) meta_mn_change_owner(&ownpar, sp->setno, ownpar->d.mnum,
- ownpar->d.owner,
- MD_MN_MM_ALLOW_CHANGE | MD_MN_MM_SPAWN_THREAD);
+ if (mn_set_master) {
+ if (meta_mn_sp_update_wm(sp, msp, extlist, ep) < 0) {
+ rval = -1;
+ goto out;
+ }
+ } else {
+ if (meta_sp_update_wm(sp, msp, extlist, ep) < 0) {
+ rval = -1;
+ goto out;
+ }
}
/* second phase of commit, set status to MD_SP_OK */
@@ -5838,7 +5853,7 @@ update_sp_status(
sp_setstat_params.sp_setstat_status = status;
result = mdmn_send_message(sp->setno,
- MD_MN_MSG_SP_SETSTAT, MD_MSGF_DEFAULT_FLAGS,
+ MD_MN_MSG_SP_SETSTAT, MD_MSGF_DEFAULT_FLAGS, 0,
(char *)&sp_setstat_params,
sizeof (sp_setstat_params),
&resp, ep);
@@ -6022,7 +6037,7 @@ meta_sp_recover_from_wm(
compnp->cname);
result = mdmn_send_message(sp->setno,
MD_MN_MSG_ADDKEYNAME, MD_MSGF_DEFAULT_FLAGS,
- (char *)send_params, message_size, &resp,
+ 0, (char *)send_params, message_size, &resp,
ep);
Free(send_params);
if (resp != NULL) {
@@ -6154,7 +6169,7 @@ meta_sp_recover_from_wm(
sizeof (*un_array[i]) - sizeof (mp_ext_t) +
(un_array[i]->un_numexts * sizeof (mp_ext_t)));
result = mdmn_send_message(sp->setno,
- MD_MN_MSG_IOCSET, MD_MSGF_DEFAULT_FLAGS,
+ MD_MN_MSG_IOCSET, MD_MSGF_DEFAULT_FLAGS, 0,
(char *)&send_params, mess_size, &resp,
ep);
if (resp != NULL) {
@@ -6303,7 +6318,8 @@ out:
send_params.delkeyname_key = np->key;
(void) mdmn_send_message(sp->setno,
MD_MN_MSG_DELKEYNAME, MD_MSGF_DEFAULT_FLAGS,
- (char *)&send_params, sizeof (send_params),
+ 0, (char *)&send_params,
+ sizeof (send_params),
&resp, ep);
if (resp != NULL) {
free_result(resp);
diff --git a/usr/src/uts/common/io/lvm/md/md.c b/usr/src/uts/common/io/lvm/md/md.c
index f0255ef443..223a1a36af 100644
--- a/usr/src/uts/common/io/lvm/md/md.c
+++ b/usr/src/uts/common/io/lvm/md/md.c
@@ -1858,6 +1858,9 @@ is_mt_ioctl(int cmd) {
case MD_MN_RESYNC:
case MD_MN_SETSYNC:
case MD_MN_POKE_HOTSPARES:
+ case MD_MN_RR_DIRTY:
+ case MD_MN_RR_CLEAN:
+ case MD_MN_IOC_SPUPDATEWM:
return (1);
default:
return (0);
diff --git a/usr/src/uts/common/io/lvm/md/md_ioctl.c b/usr/src/uts/common/io/lvm/md/md_ioctl.c
index d6badaf7a5..fc13ea95c3 100644
--- a/usr/src/uts/common/io/lvm/md/md_ioctl.c
+++ b/usr/src/uts/common/io/lvm/md/md_ioctl.c
@@ -80,40 +80,80 @@ extern int med_get_t_ioctl(mddb_med_t_parm_t *tpp, int mode);
extern int med_set_t_ioctl(mddb_med_t_parm_t *tpp, int mode);
extern unit_t md_get_nextunit(set_t setno);
-static int md_mn_commd_present;
-
/* md_mddb.c */
extern mddb_set_t *mddb_setenter(set_t setno, int flag, int *errorcodep);
extern void mddb_setexit(mddb_set_t *s);
extern md_krwlock_t nm_lock;
+#define MD_MN_COMMD_CMD "rpc.mdcommd"
+static pid_t md_mn_commd_pid;
+
/*
* md_mn_is_commd_present:
* ----------------------
* Determine if commd is running on this node.
*
- * Returns:
- * 1 if commd has been started
- * 0 if commd has not been started or has exited
+ * If md_mn_commd_pid is 0, trust it. Otherwise, do some in-depth checking
+ * to make sure it's still the one we originally set up, by checking that
+ * the process with that PID has the right program name in its u_comm.
+ *
+ * This one's intended for the "something went awry" cases, and not for
+ * general use, due to its higher cost for the good/normal case.
*/
int
md_mn_is_commd_present(void)
{
- return (md_mn_commd_present ? 1 : 0);
+ proc_t *commd_procp;
+
+ if (md_mn_commd_pid == (pid_t)0) {
+ return (0);
+ }
+
+ /* some in-depth checking */
+ mutex_enter(&pidlock);
+ if ((commd_procp = prfind(md_mn_commd_pid)) != NULL &&
+ strncmp(commd_procp->p_user.u_comm,
+ MD_MN_COMMD_CMD, strlen(MD_MN_COMMD_CMD)) == 0) {
+ mutex_exit(&pidlock);
+ /*
+ * Return the PID itself rather than 1: more info than asked
+ * for, but always non-zero, as a valid commd is never PID 0.
+ */
+ return ((int)md_mn_commd_pid);
+ }
+ /* if it's not there, make sure we only do these contortions once */
+ md_mn_commd_pid = (pid_t)0;
+ mutex_exit(&pidlock);
+
+ cmn_err(CE_WARN, "!rpc.mdcommd exited abnormally");
+ return (0);
+}
+
+/*
+ * This version merely checks the PID value that was set via an ioctl.
+ * It's intended to be used in the main code flow, where performance is
+ * critical, and accuracy can be sacrificed a little. If something is
+ * already known to be wrong, don't use this, but use
+ * md_mn_is_commd_present() instead.
+ */
+int
+md_mn_is_commd_present_lite(void)
+{
+ return ((int)md_mn_commd_pid);
}
/*
* md_mn_clear_commd_present:
* -------------------------
- * Clear the commd_present flag. Called only from a CPR request to suspend /
- * terminate a resync thread. We clear the md_mn_commd_present flag so that
+ * Clear the md_mn_commd_pid. Called only from a CPR request to suspend /
+ * terminate a resync thread. We clear the md_mn_commd_pid so that
* any RPC request that was in transit can complete with a failure and _not_
* result in an unexpected system panic.
*/
void
md_mn_clear_commd_present()
{
- md_mn_commd_present = 0;
+ md_mn_commd_pid = (pid_t)0;
}
/*
@@ -855,7 +895,6 @@ getnum_ioctl(void *d, int mode)
return (mderror(mdep, MDE_UNIT_NOT_FOUND));
}
- rw_enter(&md_ops[modindex]->md_link_rw.lock, RW_READER);
/* if array length is not 0 then allocate the output buffers */
if (minor_array_length != 0) {
sz = minor_array_length * ((int)sizeof (minor_t));
@@ -863,6 +902,7 @@ getnum_ioctl(void *d, int mode)
m_ptr = minors;
}
+ rw_enter(&md_ops[modindex]->md_link_rw.lock, RW_READER);
next = md_ops[modindex]->md_head;
count = 0;
while (next) {
@@ -2976,6 +3016,7 @@ md_base_ioctl(md_dev64_t dev, int cmd, caddr_t data, int mode, IOLOCK *lockp)
setno,
MD_MN_MSG_TEST1,
flags,
+ 0,
(char *)&msg_test,
sizeof (msg_test),
result);
@@ -3019,6 +3060,7 @@ md_base_ioctl(md_dev64_t dev, int cmd, caddr_t data, int mode, IOLOCK *lockp)
setno,
MD_MN_MSG_TEST2,
flags,
+ 0,
(char *)&msg_test,
sizeof (msg_test),
result);
@@ -3408,7 +3450,7 @@ md_base_ioctl(md_dev64_t dev, int cmd, caddr_t data, int mode, IOLOCK *lockp)
}
/*
- * Update md_mn_commd_present global to reflect presence or absence of
+ * Update md_mn_commd_pid global to reflect presence or absence of
* /usr/sbin/rpc.mdcommd. This allows us to determine if an RPC failure
* is expected during a mdmn_ksend_message() handshake. If the commd is
* not present then an RPC failure is acceptable. If the commd _is_
@@ -3420,7 +3462,7 @@ md_base_ioctl(md_dev64_t dev, int cmd, caddr_t data, int mode, IOLOCK *lockp)
if (! (mode & FWRITE))
return (EACCES);
- md_mn_commd_present = (int)(intptr_t)data;
+ md_mn_commd_pid = (pid_t)(intptr_t)data;
err = 0;
break;
}
diff --git a/usr/src/uts/common/io/lvm/md/md_mddb.c b/usr/src/uts/common/io/lvm/md/md_mddb.c
index d44571033a..302d33f5f2 100644
--- a/usr/src/uts/common/io/lvm/md/md_mddb.c
+++ b/usr/src/uts/common/io/lvm/md/md_mddb.c
@@ -18,13 +18,12 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/time.h>
@@ -643,7 +642,7 @@ computefreeblks(
freeblks = 0;
for (mbip = s->s_mbiarray[i]; mbip != NULL;
- mbip = mbip->mbi_next) {
+ mbip = mbip->mbi_next) {
freeblks += mbip->mbi_mddb_mb.mb_blkcnt;
}
if (freeblks == 0) /* this happen when there is no */
@@ -798,7 +797,7 @@ mddb_devid_free_delete(
if ((did_freep1->free_blk == firstblk) &&
(did_freep1->free_offset <= offset) &&
((did_freep1->free_length + did_freep1->free_offset) >=
- (length + offset))) {
+ (length + offset))) {
/* Have found our entry - remove from list */
block_found = 1;
did_freep_before = did_freep1;
@@ -816,17 +815,17 @@ mddb_devid_free_delete(
* offset, length.
*/
did_freep_before->free_length = offset -
- did_freep_before->free_offset;
+ did_freep_before->free_offset;
/*
* did_freep_after points to area in block after
* offset, length.
*/
did_freep_after = (mddb_did_free_t *)kmem_zalloc
- (sizeof (mddb_did_free_t), KM_SLEEP);
+ (sizeof (mddb_did_free_t), KM_SLEEP);
did_freep_after->free_blk = did_freep_before->free_blk;
did_freep_after->free_offset = offset + length;
did_freep_after->free_length = old_length - length -
- did_freep_before->free_length;
+ did_freep_before->free_length;
/*
* Add before and after areas to free list
* If area before or after offset, length has length
@@ -835,28 +834,30 @@ mddb_devid_free_delete(
if (did_freep_after->free_length) {
did_freep_after->free_next = did_freep1;
if (did_freep2) {
- did_freep2->free_next = did_freep_after;
+ did_freep2->free_next =
+ did_freep_after;
} else {
- s->s_did_icp->did_ic_freep =
- did_freep_after;
+ s->s_did_icp->did_ic_freep =
+ did_freep_after;
}
did_freep1 = did_freep_after;
} else {
kmem_free(did_freep_after,
- sizeof (mddb_did_free_t));
+ sizeof (mddb_did_free_t));
}
if (did_freep_before->free_length) {
did_freep_before->free_next = did_freep1;
if (did_freep2) {
- did_freep2->free_next = did_freep_before;
+ did_freep2->free_next =
+ did_freep_before;
} else {
- s->s_did_icp->did_ic_freep =
- did_freep_before;
+ s->s_did_icp->did_ic_freep =
+ did_freep_before;
}
} else {
kmem_free(did_freep_before,
- sizeof (mddb_did_free_t));
+ sizeof (mddb_did_free_t));
}
break;
} else {
@@ -934,10 +935,10 @@ mddb_devid_free_get(
if (freep->free_length == 0) {
if (freep2) {
freep2->free_next =
- freep->free_next;
+ freep->free_next;
} else {
s->s_did_icp->did_ic_freep =
- freep->free_next;
+ freep->free_next;
}
kmem_free(freep, sizeof (mddb_did_free_t));
}
@@ -971,7 +972,7 @@ mddb_devid_free_get(
/* Add unused part of block to free list */
(void) mddb_devid_free_add(s, blk_num,
- len, (dbtob(blk_cnt) - len));
+ len, (dbtob(blk_cnt) - len));
}
return ((caddr_t)devid_ptr);
@@ -1015,9 +1016,9 @@ mddb_devid_add(
return (0);
devid_len = ddi_devid_sizeof(devid);
- devid_ptr = (ddi_devid_t)
- mddb_devid_free_get(s, devid_len, &blk, &blkcnt,
- &offset);
+ devid_ptr = (ddi_devid_t)mddb_devid_free_get(s,
+ devid_len, &blk, &blkcnt, &offset);
+
if (devid_ptr == NULL) {
return (1);
}
@@ -1090,7 +1091,7 @@ mddb_devid_delete(mddb_set_t *s, uint_t index)
/* Add new free space in disk block to free list */
(void) mddb_devid_free_add(s, did_info->info_firstblk,
- did_info->info_offset, did_info->info_length);
+ did_info->info_offset, did_info->info_length);
return (0);
}
@@ -1439,7 +1440,7 @@ writeblks(
for (i = 0; i < cnt; i++)
blkarray[i] = blk + i;
ret = wrtblklst(s, buffer, blkarray, cnt,
- li, 0, MDDB_WR_ONLY_MASTER);
+ li, 0, MDDB_WR_ONLY_MASTER);
kmem_free(blkarray, size);
return (ret);
}
@@ -1505,7 +1506,7 @@ writelocall(
did_blk = s->s_did_icp->did_ic_blkp;
did_blk->blk_commitcnt = s->s_lbp->lb_commitcnt;
crcgen(did_blk, &did_blk->blk_checksum,
- dbtob(lbp->lb_didblkcnt), NULL);
+ dbtob(lbp->lb_didblkcnt), NULL);
}
crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL);
@@ -1521,20 +1522,20 @@ writelocall(
did_dbp = s->s_did_icp->did_ic_dbp;
while (did_dbp) {
err |= writeblks(s, (caddr_t)did_dbp->db_ptr,
- did_dbp->db_firstblk,
- did_dbp->db_blkcnt, li,
- MDDB_WR_ONLY_MASTER);
+ did_dbp->db_firstblk,
+ did_dbp->db_blkcnt, li,
+ MDDB_WR_ONLY_MASTER);
did_dbp = did_dbp->db_next;
}
/* write out device id area block */
err |= writeblks(s, (caddr_t)did_blk,
- lbp->lb_didfirstblk, lbp->lb_didblkcnt, li,
- MDDB_WR_ONLY_MASTER);
+ lbp->lb_didfirstblk, lbp->lb_didblkcnt, li,
+ MDDB_WR_ONLY_MASTER);
}
/* write out locator block */
err |= writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li,
- MDDB_WR_ONLY_MASTER);
+ MDDB_WR_ONLY_MASTER);
}
/*
@@ -1715,7 +1716,7 @@ sizeofde(
size_t size;
size = sizeof (mddb_de_ic_t) - sizeof (mddb_block_t) +
- sizeof (mddb_block_t) * dep->de_blkcount;
+ sizeof (mddb_block_t) * dep->de_blkcount;
return (size);
}
@@ -1727,7 +1728,7 @@ sizeofde32(
size_t size;
size = sizeof (*dep) - sizeof (dep->de32_blks) +
- sizeof (mddb_block_t) * dep->de32_blkcount;
+ sizeof (mddb_block_t) * dep->de32_blkcount;
return (size);
}
@@ -1760,7 +1761,7 @@ create_db32rec(
if ((dbp->db_firstentry != NULL) && (db32p->db32_firstentry == 0))
db32p->db32_firstentry = 0x4;
de32p = (mddb_de32_t *)((void *) ((caddr_t)(&db32p->db32_firstentry)
- + sizeof (db32p->db32_firstentry)));
+ + sizeof (db32p->db32_firstentry)));
for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
detode32(dep, de32p);
if ((dep->de_next != NULL) && (de32p->de32_next == 0))
@@ -2067,9 +2068,9 @@ readcopy(
dep = (mddb_de_ic_t *)
kmem_zalloc(sizeof (mddb_de_ic_t) -
- sizeof (mddb_block_t) +
- sizeof (mddb_block_t) * de32p->de32_blkcount,
- KM_SLEEP);
+ sizeof (mddb_block_t) +
+ sizeof (mddb_block_t) * de32p->de32_blkcount,
+ KM_SLEEP);
de32tode(de32p, dep);
dbp->db_firstentry = dep;
@@ -2078,10 +2079,10 @@ readcopy(
de32p2 = nextentry(de32p);
dep2 = (mddb_de_ic_t *)kmem_zalloc(
- sizeof (mddb_de_ic_t) -
- sizeof (mddb_block_t) +
- sizeof (mddb_block_t) *
- de32p2->de32_blkcount, KM_SLEEP);
+ sizeof (mddb_de_ic_t) -
+ sizeof (mddb_block_t) +
+ sizeof (mddb_block_t) *
+ de32p2->de32_blkcount, KM_SLEEP);
de32tode(de32p2, dep2);
@@ -2277,10 +2278,9 @@ getoptdev(
if ((cb = devopsp[getmajor(lp->l_dev)]->devo_cb_ops)
!= NULL) {
error = (*cb->cb_prop_op)(DDI_DEV_T_ANY, devi,
- prop_op,
- DDI_PROP_NOTPROM|DDI_PROP_DONTPASS,
- "removable-media",
- (caddr_t)&propvalue, &proplength);
+ prop_op, DDI_PROP_NOTPROM |
+ DDI_PROP_DONTPASS, "removable-media",
+ (caddr_t)&propvalue, &proplength);
if (error == DDI_PROP_SUCCESS)
removable = 1;
@@ -2348,7 +2348,7 @@ getuserdata(
(MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
(type >= MDDB_FIRST_MODID) &&
((rbp->rb_revision == MDDB_REV_RB) ||
- (rbp->rb_revision == MDDB_REV_RBFN))) {
+ (rbp->rb_revision == MDDB_REV_RBFN))) {
switch (dep->de_flags) {
@@ -2512,7 +2512,7 @@ writeoptrecord(
* In a MN diskset, any node can write optimized record(s).
*/
wrt_err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
- dep->de_blkcount, li, &bufhead, MDDB_WR_ANY_NODE);
+ dep->de_blkcount, li, &bufhead, MDDB_WR_ANY_NODE);
/*
* For MN diskset, set error in optinfo structure so
* that mddb_commitrec knows which replica failed.
@@ -2556,10 +2556,10 @@ writeoptrecord(
lp = &lbp->lb_locators[dep->de_optinfo[0].o_li];
if (lp == bfp->bf_locator) {
dep->de_optinfo[0].o_flags |=
- MDDB_F_EWRITE;
+ MDDB_F_EWRITE;
} else {
dep->de_optinfo[1].o_flags |=
- MDDB_F_EWRITE;
+ MDDB_F_EWRITE;
}
}
err |= MDDB_F_EWRITE;
@@ -2689,7 +2689,7 @@ fixoptrecord(
create_db32rec(db32p, dbp);
crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
err = writeall(s, (caddr_t)db32p, db32p->db32_blknum,
- 1, MDDB_WR_ONLY_MASTER);
+ 1, MDDB_WR_ONLY_MASTER);
kmem_free((caddr_t)db32p, MDDB_BSIZE);
return (err);
}
@@ -2932,13 +2932,13 @@ ridev(
if (MD_UPGRADE) {
ldev = md_makedevice(md_targ_name_to_major(clp->l_driver),
- clp->l_mnum);
+ clp->l_mnum);
} else {
if (ddi_name_to_major(clp->l_driver) == (major_t)-1)
return (EINVAL);
ldev = md_makedevice(ddi_name_to_major(clp->l_driver),
- clp->l_mnum);
+ clp->l_mnum);
}
if (clp->l_devid != 0) {
@@ -3099,7 +3099,7 @@ writecopy(
create_db32rec(db32p, dbp);
crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
err = writeblks(s, (caddr_t)db32p, dbp->db_blknum, 1, li,
- MDDB_WR_ONLY_MASTER);
+ MDDB_WR_ONLY_MASTER);
kmem_free((caddr_t)db32p, MDDB_BSIZE);
if (err)
return (err);
@@ -3804,7 +3804,7 @@ writestart(
lnp->ln_revision = MDDB_REV_LN;
crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
- lbp->lb_lnblkcnt, 0);
+ lbp->lb_lnblkcnt, 0);
/*
* If a MN diskset and this is the master, set the PARSE_LOCNM
* flag in the mddb_set structure to show that the locator
@@ -4413,28 +4413,34 @@ locator2cfgloc(
}
if (lbp->lb_flags & MDDB_DEVID_STYLE) {
- did_info = &(did_icp->did_ic_blkp->blk_info[li]);
- if (did_info->info_flags & MDDB_DID_EXISTS) {
- sz = (int)ddi_devid_sizeof(did_icp->did_ic_devid[li]);
- if (clp->l_devid_flags & MDDB_DEVID_SPACE) {
- /* copy device id from mddb to cfg_loc structure */
- szalloc = clp->l_devid_sz;
- if (sz <= szalloc) {
- for (i = 0; i < sz; i++) {
- ((char *)(uintptr_t)clp->l_devid)[i] =
- ((char *)did_icp->did_ic_devid[li])[i];
+ did_info = &(did_icp->did_ic_blkp->blk_info[li]);
+ if (did_info->info_flags & MDDB_DID_EXISTS) {
+ sz = (int)ddi_devid_sizeof(did_icp->did_ic_devid[li]);
+ if (clp->l_devid_flags & MDDB_DEVID_SPACE) {
+ /*
+ * copy device id from mddb to
+ * cfg_loc structure
+ */
+ szalloc = clp->l_devid_sz;
+ if (sz <= szalloc) {
+ for (i = 0; i < sz; i++) {
+ ((char *)(uintptr_t)
+ clp->l_devid)[i] =
+ ((char *)did_icp->
+ did_ic_devid[li])[i];
+ }
+ clp->l_devid_flags |= MDDB_DEVID_VALID;
+ (void) strcpy(clp->l_minor_name,
+ did_info->info_minor_name);
+ } else {
+ clp->l_devid_flags |=
+ MDDB_DEVID_NOSPACE;
}
- clp->l_devid_flags |= MDDB_DEVID_VALID;
- (void) strcpy(clp->l_minor_name,
- did_info->info_minor_name);
- } else {
- clp->l_devid_flags |= MDDB_DEVID_NOSPACE;
+ } else if (clp->l_devid_flags & MDDB_DEVID_GETSZ) {
+ clp->l_devid_flags = MDDB_DEVID_SZ;
+ clp->l_devid_sz = sz;
}
- } else if (clp->l_devid_flags & MDDB_DEVID_GETSZ) {
- clp->l_devid_flags = MDDB_DEVID_SZ;
- clp->l_devid_sz = sz;
}
- }
}
/*
@@ -4770,8 +4776,7 @@ get_mbs_n_lbs(
* lb_blkcnt will be set correctly for MN set later once getmasters
* has determined that the set is a MN set.
*/
- lb_blkcnt = ((setno == MD_LOCAL_SET) ?
- MDDB_LOCAL_LBCNT : MDDB_LBCNT);
+ lb_blkcnt = ((setno == MD_LOCAL_SET) ? MDDB_LOCAL_LBCNT : MDDB_LBCNT);
for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
rip->ri_flags &= (MDDB_F_PTCHED | MDDB_F_IOCTL |
@@ -4919,8 +4924,8 @@ get_mbs_n_lbs(
/* Read in device ID block */
if (did_icp == NULL) {
did_icp = (mddb_did_ic_t *)
- kmem_zalloc(sizeof (mddb_did_ic_t),
- KM_SLEEP);
+ kmem_zalloc(sizeof (mddb_did_ic_t),
+ KM_SLEEP);
} else {
/* Reuse did_icp, but clear out data */
if (did_icp->did_ic_blkp !=
@@ -4932,22 +4937,23 @@ get_mbs_n_lbs(
(mddb_did_blk_t *)NULL;
}
if (did_icp->did_ic_dbp !=
- (mddb_did_db_t *)NULL) {
+ (mddb_did_db_t *)NULL) {
did_dbp1 = did_icp->did_ic_dbp;
while (did_dbp1) {
- did_dbp2 = did_dbp1->db_next;
- kmem_free((caddr_t)did_dbp1->db_ptr,
- dbtob(did_dbp1->db_blkcnt));
- kmem_free((caddr_t)did_dbp1,
- sizeof (mddb_did_db_t));
- did_dbp1 = did_dbp2;
+ did_dbp2 = did_dbp1->db_next;
+ kmem_free((caddr_t)
+ did_dbp1->db_ptr,
+ dbtob(did_dbp1->db_blkcnt));
+ kmem_free((caddr_t)did_dbp1,
+ sizeof (mddb_did_db_t));
+ did_dbp1 = did_dbp2;
}
did_icp->did_ic_dbp =
- (mddb_did_db_t *)NULL;
+ (mddb_did_db_t *)NULL;
}
for (i = 0; i < MDDB_NLB; i++) {
did_icp->did_ic_devid[i] =
- (ddi_devid_t)NULL;
+ (ddi_devid_t)NULL;
}
}
@@ -4985,7 +4991,7 @@ get_mbs_n_lbs(
if (revchk(MDDB_REV_DI, did_blkp->blk_revision))
continue;
if (crcchk(did_blkp, &did_blkp->blk_checksum,
- dbtob(lbp->lb_didblkcnt), NULL))
+ dbtob(lbp->lb_didblkcnt), NULL))
continue;
/*
@@ -5037,82 +5043,106 @@ get_mbs_n_lbs(
* have been updated to match this valid device
* id information.
*/
- for (li = 0; li < lbp->lb_loccnt; li++) {
- did_info = &did_blkp->blk_info[li];
- if (did_info->info_flags & MDDB_DID_EXISTS)
- did_info->info_flags &=
- ~(MDDB_DID_VALID | MDDB_DID_UPDATED);
- }
+ for (li = 0; li < lbp->lb_loccnt; li++) {
+ did_info = &did_blkp->blk_info[li];
+ if (did_info->info_flags & MDDB_DID_EXISTS)
+ did_info->info_flags &=
+ ~(MDDB_DID_VALID |
+ MDDB_DID_UPDATED);
+ }
- cont_flag = 0;
- for (li = 0; li < lbp->lb_loccnt; li++) {
- did_info = &did_blkp->blk_info[li];
- did_block = (caddr_t)NULL;
- if (did_info->info_flags & MDDB_DID_EXISTS) {
- /* Check if block has already been read in */
- did_dbp = did_icp->did_ic_dbp;
- while (did_dbp != 0) {
- if (did_dbp->db_firstblk ==
- did_info->info_firstblk)
- break;
- else
- did_dbp = did_dbp->db_next;
- }
- /* if block not found, read it in */
- if (did_dbp == NULL) {
- did_block = (caddr_t)(kmem_zalloc(dbtob
- (did_info->info_blkcnt), KM_SLEEP));
- buffer = (caddr_t)did_block;
- for (blk = did_info->info_firstblk;
- blk < (did_info->info_firstblk +
- did_info->info_blkcnt); blk++) {
- physblk = getphysblk(blk, rip->ri_mbip);
- err = getblks(s, buffer, dev, physblk,
- btodb(MDDB_BSIZE), 0);
- if (err) {
- rip->ri_flags |= err;
+ cont_flag = 0;
+ for (li = 0; li < lbp->lb_loccnt; li++) {
+ did_info = &did_blkp->blk_info[li];
+ did_block = (caddr_t)NULL;
+ if (did_info->info_flags & MDDB_DID_EXISTS) {
+ /*
+ * Check if block has
+ * already been read in
+ */
+ did_dbp = did_icp->did_ic_dbp;
+ while (did_dbp != 0) {
+ if (did_dbp->db_firstblk ==
+ did_info->info_firstblk)
+ break;
+ else
+ did_dbp =
+ did_dbp->db_next;
+ }
+ /* if block not found, read it in */
+ if (did_dbp == NULL) {
+ did_block = (caddr_t)
+ (kmem_zalloc(dbtob(
+ did_info->info_blkcnt),
+ KM_SLEEP));
+ buffer = (caddr_t)did_block;
+ for (blk =
+ did_info->info_firstblk;
+ blk < (did_info->
+ info_firstblk +
+ did_info->info_blkcnt);
+ blk++) {
+ physblk =
+ getphysblk(blk,
+ rip->ri_mbip);
+ err = getblks(s,
+ buffer, dev,
+ physblk, btodb(
+ MDDB_BSIZE), 0);
+ if (err) {
+ rip->ri_flags |=
+ err;
+ break;
+ }
+ buffer += MDDB_BSIZE;
+ }
+ if (err) {
+ kmem_free(did_block,
+ dbtob(did_info->
+ info_blkcnt));
+ did_block =
+ (caddr_t)NULL;
+ cont_flag = 1;
+ break;
+ }
+
+ /*
+ * Block read in -
+ * alloc Disk Block area
+ */
+ did_dbp = (mddb_did_db_t *)
+ kmem_zalloc(
+ sizeof (mddb_did_db_t),
+ KM_SLEEP);
+ did_dbp->db_ptr = did_block;
+ did_dbp->db_firstblk =
+ did_info->info_firstblk;
+ did_dbp->db_blkcnt =
+ did_info->info_blkcnt;
+
+ /* Add to front of dbp list */
+ did_dbp->db_next =
+ did_icp->did_ic_dbp;
+ did_icp->did_ic_dbp = did_dbp;
+ }
+ /* Check validity of devid in block */
+ if (crcchk(((char *)did_dbp->db_ptr +
+ did_info->info_offset),
+ &did_info->info_checksum,
+ did_info->info_length, NULL)) {
+ cont_flag = 1;
break;
}
- buffer += MDDB_BSIZE;
- }
- if (err) {
- kmem_free(did_block,
- dbtob(did_info->info_blkcnt));
- did_block = (caddr_t)NULL;
- cont_flag = 1;
- break;
- }
- /*
- * Block read in - alloc Disk Block area
- */
- did_dbp = (mddb_did_db_t *)kmem_zalloc(
- sizeof (mddb_did_db_t), KM_SLEEP);
- did_dbp->db_ptr = did_block;
- did_dbp->db_firstblk = did_info->info_firstblk;
- did_dbp->db_blkcnt = did_info->info_blkcnt;
-
- /* Add to front of dbp list */
- did_dbp->db_next = did_icp->did_ic_dbp;
- did_icp->did_ic_dbp = did_dbp;
- }
- /* Check validity of devid in block */
- if (crcchk(((char *)did_dbp->db_ptr +
- did_info->info_offset),
- &did_info->info_checksum,
- did_info->info_length, NULL)) {
- cont_flag = 1;
- break;
- }
-
- /* Block now pointed to by did_dbp */
- did_icp->did_ic_devid[li] = (ddi_devid_t)
- ((char *)did_dbp->db_ptr +
- did_info->info_offset);
- }
- }
- if (cont_flag)
- continue;
+ /* Block now pointed to by did_dbp */
+ did_icp->did_ic_devid[li] =
+ (ddi_devid_t)((char *)
+ did_dbp->db_ptr +
+ did_info->info_offset);
+ }
+ }
+ if (cont_flag)
+ continue;
}
/*
@@ -5194,11 +5224,11 @@ get_mbs_n_lbs(
(rip->ri_old_devid != (ddi_devid_t)NULL)) {
if (ddi_devid_compare(rip->ri_old_devid,
did_icp->did_ic_devid[li]) != 0)
- continue;
+ continue;
} else {
if (ddi_devid_compare(rip->ri_devid,
did_icp->did_ic_devid[li]) != 0)
- continue;
+ continue;
}
if (strcmp(rip->ri_minor_name,
@@ -5214,64 +5244,74 @@ get_mbs_n_lbs(
* information about itself.
*/
if (!mn_set) {
- for (li = 0; li < lbp->lb_loccnt; li++) {
- mddb_drvnm_t *dn;
- mddb_sidelocator_t *slp;
+ for (li = 0; li < lbp->lb_loccnt; li++) {
+ mddb_drvnm_t *dn;
+ mddb_sidelocator_t *slp;
- lp = &lbp->lb_locators[li];
- slp = &lbp->lb_sidelocators[s->s_sideno][li];
- if (lp->l_flags & MDDB_F_DELETED)
- continue;
- if (slp->l_mnum != md_getminor(rip->ri_dev))
- continue;
- if (lp->l_blkno != rip->ri_blkno)
- continue;
- dn = &lbp->lb_drvnm[slp->l_drvnm_index];
- if (strncmp(dn->dn_data, rip->ri_driver,
- MD_MAXDRVNM) == 0)
- break;
- }
+ lp = &lbp->lb_locators[li];
+ slp = &lbp->
+ lb_sidelocators[s->s_sideno][li];
+ if (lp->l_flags & MDDB_F_DELETED)
+ continue;
+ if (slp->l_mnum != md_getminor(
+ rip->ri_dev))
+ continue;
+ if (lp->l_blkno != rip->ri_blkno)
+ continue;
+ dn = &lbp->lb_drvnm[slp->l_drvnm_index];
+ if (strncmp(dn->dn_data,
+ rip->ri_driver, MD_MAXDRVNM) == 0)
+ break;
+ }
} else {
- for (li = 0; li < lbp->lb_loccnt; li++) {
- mddb_drvnm_t *dn;
- mddb_mnsidelocator_t *mnslp;
- mddb_mnlb_t *mnlbp;
- int i;
+ for (li = 0; li < lbp->lb_loccnt; li++) {
+ mddb_drvnm_t *dn;
+ mddb_mnsidelocator_t *mnslp;
+ mddb_mnlb_t *mnlbp;
+ int i;
- /*
- * Check all possible locators locking for
- * match to the currently read-in locator,
- * must match on:
- * - blkno
- * - side locator for this node's side
- * - side locator minor number
- * - side locator driver name
- */
+ /*
+ * Check all possible locators locking
+ * for match to the currently read-in
+ * locator, must match on:
+ * - blkno
+ * - side locator for this
+ * node's side
+ * - side locator minor number
+ * - side locator driver name
+ */
- /* Looking at sidelocs - cast lbp -> mnlbp */
- mnlbp = (mddb_mnlb_t *)lbp;
- lp = &mnlbp->lb_locators[li];
- if (lp->l_flags & MDDB_F_DELETED)
- continue;
- if (lp->l_blkno != rip->ri_blkno)
- continue;
+ /*
+ * Looking at sidelocs:
+ * cast lbp -> mnlbp
+ */
+ mnlbp = (mddb_mnlb_t *)lbp;
+ lp = &mnlbp->lb_locators[li];
+ if (lp->l_flags & MDDB_F_DELETED)
+ continue;
+ if (lp->l_blkno != rip->ri_blkno)
+ continue;
- for (i = 0; i < MD_MNMAXSIDES; i++) {
- mnslp = &mnlbp->lb_mnsidelocators[i][li];
- if (mnslp->mnl_sideno == s->s_sideno) {
- break;
- }
+ for (i = 0; i < MD_MNMAXSIDES; i++) {
+ mnslp = &mnlbp->
+ lb_mnsidelocators[i][li];
+ if (mnslp->mnl_sideno ==
+ s->s_sideno) {
+ break;
+ }
+ }
+ /* No matching side found */
+ if (i == MD_MNMAXSIDES)
+ continue;
+ if (mnslp->mnl_mnum !=
+ md_getminor(rip->ri_dev))
+ continue;
+ dn = &lbp->
+ lb_drvnm[mnslp->mnl_drvnm_index];
+ if (strncmp(dn->dn_data,
+ rip->ri_driver, MD_MAXDRVNM) == 0)
+ break;
}
- /* No matching side found */
- if (i == MD_MNMAXSIDES)
- continue;
- if (mnslp->mnl_mnum != md_getminor(rip->ri_dev))
- continue;
- dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
- if (strncmp(dn->dn_data, rip->ri_driver,
- MD_MAXDRVNM) == 0)
- break;
- }
}
}
@@ -5549,7 +5589,7 @@ load_old_replicas(
did_dbp1 = did_icp->did_ic_dbp;
while (did_dbp1) {
if (mddb_devid_free_add(s, did_dbp1->db_firstblk,
- 0, dbtob(did_dbp1->db_blkcnt))) {
+ 0, dbtob(did_dbp1->db_blkcnt))) {
retval = MDDB_E_NOSPACE;
goto errout;
}
@@ -5904,9 +5944,9 @@ load_old_replicas(
/* Validate device id on current system */
newdev[li] = dev;
if (mddb_devid_validate(
- did_icp->did_ic_devid[li],
- &(newdev[li]),
- did_info->info_minor_name) == 0) {
+ did_icp->did_ic_devid[li],
+ &(newdev[li]),
+ did_info->info_minor_name) == 0) {
/* Set valid flag */
did_info->info_flags |= MDDB_DID_VALID;
} else {
@@ -5931,20 +5971,21 @@ load_old_replicas(
if (mddb_devid_add(s, li,
ret_devid, minor_name)) {
cmn_err(CE_WARN,
- "Not enough space in"
- " metadevice state"
- " database\n");
+ "Not enough space"
+ " in metadevice"
+ " state"
+ " database\n");
cmn_err(CE_WARN,
- "to add relocation"
- " information for"
- " device:\n");
+ "to add relocation"
+ " information for"
+ " device:\n");
cmn_err(CE_WARN,
- " major = %d, "
- " minor = %d\n",
- getmajor(ddi_dev),
- getminor(ddi_dev));
+ " major = %d, "
+ " minor = %d\n",
+ getmajor(ddi_dev),
+ getminor(ddi_dev));
} else {
- write_lb = 1;
+ write_lb = 1;
}
kmem_free(minor_name,
strlen(minor_name) + 1);
@@ -6509,7 +6550,7 @@ initit(
if (! s->s_mbiarray[i])
continue;
dev = md_expldev(
- s->s_lbp->lb_locators[i].l_dev);
+ s->s_lbp->lb_locators[i].l_dev);
dev = md_xlate_targ_2_mini(dev);
if (dev != NODEV64)
mddb_devclose(dev);
@@ -6518,7 +6559,7 @@ initit(
}
kmem_free((caddr_t)s->s_mbiarray,
- sizeof (mddb_mb_ic_t *) * mddb_maxcopies);
+ sizeof (mddb_mb_ic_t *) * mddb_maxcopies);
s->s_mbiarray = NULL;
}
@@ -6560,7 +6601,7 @@ initit(
*/
lb_blkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ?
- MDDB_LOCAL_LBCNT : MDDB_LBCNT);
+ MDDB_LOCAL_LBCNT : MDDB_LBCNT);
if (flag & MDDB_MULTINODE) {
lb_blkcnt = MDDB_MNLBCNT;
}
@@ -6623,7 +6664,7 @@ initit(
/* the btodb that follows is converting the directory block size */
/* Data tag part of mddb located after first block of mddb data */
lbp->lb_dtfirstblk = (mddb_block_t)(lbp->lb_dbfirstblk +
- btodb(MDDB_BSIZE));
+ btodb(MDDB_BSIZE));
/* Data tags are not used in MN diskset - so set count to 0 */
if (flag & MDDB_MULTINODE)
lbp->lb_dtblkcnt = (mddb_block_t)0;
@@ -6675,14 +6716,14 @@ initit(
devid_flag = 0;
if (devid_flag) {
lbp->lb_didfirstblk = lbp->lb_dtfirstblk +
- lbp->lb_dtblkcnt;
+ lbp->lb_dtblkcnt;
lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS;
lbp->lb_flags |= MDDB_DEVID_STYLE;
did_icp = (mddb_did_ic_t *)kmem_zalloc
- (sizeof (mddb_did_ic_t), KM_SLEEP);
+ (sizeof (mddb_did_ic_t), KM_SLEEP);
did_blkp = (mddb_did_blk_t *)
- kmem_zalloc(dbtob(lbp->lb_didblkcnt), KM_SLEEP);
+ kmem_zalloc(dbtob(lbp->lb_didblkcnt), KM_SLEEP);
did_blkp->blk_magic = MDDB_MAGIC_DI;
did_blkp->blk_revision = MDDB_REV_DI;
did_icp->did_ic_blkp = did_blkp;
@@ -6846,8 +6887,7 @@ mddb_setexit(
* re-grab mutex
* set s_mn_parseflags_sending to zero
*/
- mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t),
- KM_SLEEP);
+ mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t), KM_SLEEP);
while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) &&
(s->s_mn_parseflags & MDDB_PARSE_MASK) &&
(!(md_get_setstatus(s->s_setno) & MD_SET_MNPARSE_BLK))) {
@@ -6867,18 +6907,18 @@ mddb_setexit(
mddb_parse_msg->msg_parse_flags = s->s_mn_parseflags_sending;
for (i = 0; i < MDDB_NLB; i++) {
mddb_parse_msg->msg_lb_flags[i] =
- lbp->lb_locators[i].l_flags;
+ lbp->lb_locators[i].l_flags;
}
kresult = kmem_zalloc(sizeof (md_mn_kresult_t), KM_SLEEP);
while (rval != 0) {
rval = mdmn_ksend_message(s->s_setno,
- MD_MN_MSG_MDDB_PARSE, 0,
- (char *)mddb_parse_msg,
- sizeof (mddb_parse_msg), kresult);
+ MD_MN_MSG_MDDB_PARSE, 0, 0,
+ (char *)mddb_parse_msg,
+ sizeof (md_mn_msg_mddb_parse_t), kresult);
if (rval != 0)
cmn_err(CE_WARN, "mddb_setexit: Unable to send "
- "mddb update message to other nodes in "
- "diskset %s\n", s->s_setname);
+ "mddb update message to other nodes in "
+ "diskset %s\n", s->s_setname);
}
kmem_free(kresult, sizeof (md_mn_kresult_t));
@@ -6987,12 +7027,12 @@ mddb_lb_did_convert(mddb_set_t *s, uint_t doit, uint_t *blk_cnt)
if (mddb_devid_add(s, li, ret_devid,
minor_name)) {
cmn_err(CE_WARN,
- "Not enough space in metadb"
- " to add device id for"
- " dev: major = %d, "
- "minor = %d\n",
- getmajor(ddi_dev),
- getminor(ddi_dev));
+ "Not enough space in metadb"
+ " to add device id for"
+ " dev: major = %d, "
+ "minor = %d\n",
+ getmajor(ddi_dev),
+ getminor(ddi_dev));
}
sz = strlen(minor_name) + 1;
kmem_free(minor_name, sz);
@@ -7179,13 +7219,10 @@ mddb_unload_set(
}
md_clr_setstatus(setno, MD_SET_ACCOK | MD_SET_ACCEPT |
- MD_SET_TAGDATA | MD_SET_USETAG |
- MD_SET_TOOFEW | MD_SET_STALE |
- MD_SET_OWNERSHIP | MD_SET_BADTAG |
- MD_SET_CLRTAG | MD_SET_MNSET |
- MD_SET_DIDCLUP | MD_SET_MNPARSE_BLK |
- MD_SET_MN_MIR_STATE_RC | MD_SET_IMPORT |
- MD_SET_REPLICATED_IMPORT);
+ MD_SET_TAGDATA | MD_SET_USETAG | MD_SET_TOOFEW | MD_SET_STALE |
+ MD_SET_OWNERSHIP | MD_SET_BADTAG | MD_SET_CLRTAG | MD_SET_MNSET |
+ MD_SET_DIDCLUP | MD_SET_MNPARSE_BLK | MD_SET_MN_MIR_STATE_RC |
+ MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT);
mutex_exit(SETMUTEX(setno));
}
@@ -7286,13 +7323,13 @@ mddb_locatorblock2splitname(
SPN_SUFFIX(spn).suf_len = mnsn->mn_ln_suffix.suf_len;
bcopy(mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_data,
- SPN_SUFFIX(spn).suf_len);
+ SPN_SUFFIX(spn).suf_len);
iprefix = mnsn->mn_ln_suffix.suf_prefix;
} else {
sn = &lnp->ln_suffixes[sideno][li];
SPN_SUFFIX(spn).suf_len = sn->suf_len;
bcopy(sn->suf_data, SPN_SUFFIX(spn).suf_data,
- SPN_SUFFIX(spn).suf_len);
+ SPN_SUFFIX(spn).suf_len);
iprefix = sn->suf_prefix;
}
SPN_PREFIX(spn).pre_len = lnp->ln_prefixes[iprefix].pre_len;
@@ -7328,7 +7365,7 @@ getdeldev(
* Data checking
*/
if (setno >= md_nsets || cp->c_id < 0 ||
- cp->c_id > cp->c_dbmax) {
+ cp->c_id > cp->c_dbmax) {
return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
}
@@ -7377,14 +7414,14 @@ getdeldev(
if (cp->c_id < 0 || cp->c_id > cp->c_dbmax) {
mddb_setexit(s);
return (mdmddberror(ep, MDE_DB_INVALID, NODEV32,
- setno));
+ setno));
}
li = cp->c_id;
} else {
if (cp->c_id >= cp->c_dbcnt) {
mddb_setexit(s);
return (mdmddberror(ep, MDE_DB_INVALID, NODEV32,
- setno));
+ setno));
}
/* CSTYLED */
@@ -7446,7 +7483,7 @@ getdeldev(
* commitcnt to 0.
*/
(void) writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li,
- MDDB_WR_ONLY_MASTER);
+ MDDB_WR_ONLY_MASTER);
lbp->lb_commitcnt = commitcnt;
}
@@ -7689,7 +7726,7 @@ md_update_locator_namespace(
lnp->ln_revision = MDDB_REV_LN;
crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
- lbp->lb_lnblkcnt, 0);
+ lbp->lb_lnblkcnt, 0);
/*
* If a MN diskset and this is the master, set the PARSE_LOCNM
* flag in the mddb_set structure to show that the locator
@@ -7851,7 +7888,7 @@ update_mb_devid(
*/
if (devidptr != (ddi_devid_t)NULL) {
mb = (mddb_mb_t *)kmem_zalloc(MDDB_BSIZE,
- KM_SLEEP);
+ KM_SLEEP);
mb->mb_magic = MDDB_MAGIC_DU;
mb->mb_revision = MDDB_REV_MB;
mb2free = 1;
@@ -8077,7 +8114,7 @@ delnewside(
single_thread_end(s);
mddb_setexit(s);
return (mdmddberror(ep, MDE_DB_TOOSMALL,
- NODEV32, setno));
+ NODEV32, setno));
}
}
@@ -8095,7 +8132,7 @@ delnewside(
single_thread_end(s);
mddb_setexit(s);
return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
- setno));
+ setno));
}
if (cfgloc2locator(lbp, clp, li, cp->c_sideno, index)) {
@@ -8105,7 +8142,7 @@ delnewside(
single_thread_end(s);
mddb_setexit(s);
return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
- setno));
+ setno));
}
}
@@ -8119,9 +8156,9 @@ delnewside(
int j;
mnlbp = (mddb_mnlb_t *)lbp;
for (j = 0; j < MD_MNMAXSIDES; j++) {
- mnslp = &mnlbp->lb_mnsidelocators[j][i];
- if (mnslp->mnl_sideno == cp->c_sideno)
- break;
+ mnslp = &mnlbp->lb_mnsidelocators[j][i];
+ if (mnslp->mnl_sideno == cp->c_sideno)
+ break;
}
if (j < MD_MNMAXSIDES) {
mnslp->mnl_mnum = NODEV32;
@@ -8129,7 +8166,7 @@ delnewside(
mnlnp = (mddb_mnln_t *)lnp;
mnsn = &(mnlnp->ln_mnsuffixes[j][i]);
bzero((caddr_t)mnsn,
- sizeof (md_mnname_suffix_t));
+ sizeof (md_mnname_suffix_t));
}
} else {
slp = &lbp->lb_sidelocators[cp->c_sideno][i];
@@ -8148,7 +8185,7 @@ delnewside(
lnp->ln_revision = MDDB_REV_LN;
crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
- lbp->lb_lnblkcnt, 0);
+ lbp->lb_lnblkcnt, 0);
/*
* If a MN diskset and this is the master, set the PARSE_LOCNM
* flag in the mddb_set structure to show that the locator
@@ -8288,11 +8325,11 @@ newdev(
((daddr_t)lp->l_blkno == clp->l_blkno)) {
if (command == MDDB_NEWDEV) {
ddi_devid_free((ddi_devid_t)(uintptr_t)
- clp->l_devid);
+ clp->l_devid);
single_thread_end(s);
mddb_setexit(s);
return (mdmddberror(ep,
- MDE_DB_EXISTS, NODEV32, setno));
+ MDE_DB_EXISTS, NODEV32, setno));
}
}
} else {
@@ -8302,7 +8339,7 @@ newdev(
single_thread_end(s);
mddb_setexit(s);
return (mdmddberror(ep,
- MDE_DB_EXISTS, NODEV32, setno));
+ MDE_DB_EXISTS, NODEV32, setno));
}
}
}
@@ -8345,7 +8382,7 @@ newdev(
single_thread_end(s);
mddb_setexit(s);
return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
- setno));
+ setno));
}
}
@@ -8402,7 +8439,7 @@ newdev(
single_thread_end(s);
mddb_setexit(s);
return (mdmddberror(ep, MDE_DB_TOOSMALL,
- NODEV32, setno));
+ NODEV32, setno));
}
}
/*
@@ -8462,7 +8499,7 @@ newdev(
lnp->ln_revision = MDDB_REV_LN;
crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
- lbp->lb_lnblkcnt, 0);
+ lbp->lb_lnblkcnt, 0);
/*
* If a MN diskset and this is the master, set the PARSE_LOCNM
* flag in the mddb_set structure to show that the locator
@@ -8579,67 +8616,74 @@ mddb_configure(
mdclrerror(ep);
switch (command) {
- case MDDB_NEWDEV:
- err = newdev(cp, command, ep);
- break;
+ case MDDB_NEWDEV:
+ err = newdev(cp, command, ep);
+ break;
- case MDDB_NEWSIDE:
- case MDDB_DELSIDE:
- err = delnewside(cp, command, ep);
- break;
+ case MDDB_NEWSIDE:
+ case MDDB_DELSIDE:
+ err = delnewside(cp, command, ep);
+ break;
- case MDDB_GETDEV:
- case MDDB_DELDEV:
- case MDDB_ENDDEV:
- err = getdeldev(cp, command, ep);
- break;
+ case MDDB_GETDEV:
+ case MDDB_DELDEV:
+ case MDDB_ENDDEV:
+ err = getdeldev(cp, command, ep);
+ break;
- case MDDB_GETDRVRNAME:
- err = getdriver(&cp->c_locator);
- break;
+ case MDDB_GETDRVRNAME:
+ err = getdriver(&cp->c_locator);
+ break;
- case MDDB_USEDEV:
- /*
- * Note: must allow USEDEV ioctl during upgrade to support
- * auto-take disksets.
- *
- * Also during the set import if the md_devid_destroy
- * flag is set then error out
- */
+ case MDDB_USEDEV:
+ /*
+ * Note: must allow USEDEV ioctl during upgrade to
+ * support auto-take disksets.
+ *
+ * Also during the set import if the md_devid_destroy
+ * flag is set then error out
+ */
- if ((cp->c_flags & MDDB_C_IMPORT) && md_devid_destroy)
- return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
+ if ((cp->c_flags & MDDB_C_IMPORT) && md_devid_destroy)
+ return (mdmderror(ep, MDE_INVAL_UNIT,
+ MD_ADM_MINOR));
- if (setno >= md_nsets)
- return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
+ if (setno >= md_nsets)
+ return (mdmderror(ep, MDE_INVAL_UNIT,
+ MD_ADM_MINOR));
- if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) {
- if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) {
- err = mddbstatus2error(ep, err, NODEV32, setno);
- break;
+ if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) ==
+ NULL) {
+ if ((s = init_set(cp, MDDB_NOINIT, &err)) ==
+ NULL) {
+ err = mddbstatus2error(ep, err,
+ NODEV32, setno);
+ break;
+ }
}
- }
- if (setno == MD_LOCAL_SET)
- flag = MDDB_F_IOCTL;
- if (cp->c_locator.l_old_devid) {
- md_set_setstatus(setno, MD_SET_REPLICATED_IMPORT);
- }
- err = ridev(&s->s_rip, &cp->c_locator, NULL, flag);
- mddb_setexit(s);
- break;
+ if (setno == MD_LOCAL_SET)
+ flag = MDDB_F_IOCTL;
+ if (cp->c_locator.l_old_devid) {
+ md_set_setstatus(setno,
+ MD_SET_REPLICATED_IMPORT);
+ }
+ err = ridev(&s->s_rip, &cp->c_locator, NULL, flag);
+ mddb_setexit(s);
+ break;
- case MDDB_RELEASESET:
- mutex_enter(&mddb_lock);
- mddb_unload_set(cp->c_setno);
- mutex_exit(&mddb_lock);
- break;
+ case MDDB_RELEASESET:
+ mutex_enter(&mddb_lock);
+ mddb_unload_set(cp->c_setno);
+ mutex_exit(&mddb_lock);
+ break;
- case MDDB_SETDID:
- err = setdid(cp);
- break;
+ case MDDB_SETDID:
+ err = setdid(cp);
+ break;
- default:
- err = mdmddberror(ep, MDE_DB_INVALID, NODEV32, cp->c_setno);
+ default:
+ err = mdmddberror(ep, MDE_DB_INVALID, NODEV32,
+ cp->c_setno);
}
return (err);
@@ -8761,15 +8805,14 @@ mddb_createrec(
}
recsize = roundup((sizeof (*rbp) - sizeof (rbp->rb_data)) +
- usersize, MDDB_BSIZE);
+ usersize, MDDB_BSIZE);
blkcnt = btodb(recsize);
if (mddb_maxblocks)
maxblocks = mddb_maxblocks;
else
- maxblocks = (MDDB_BSIZE -
- (sizeof (*db32p) + sizeof (*de32p) -
- sizeof (de32p->de32_blks))) / sizeof (mddb_block_t);
+ maxblocks = (MDDB_BSIZE - (sizeof (*db32p) + sizeof (*de32p) -
+ sizeof (de32p->de32_blks))) / sizeof (mddb_block_t);
if (blkcnt > maxblocks) {
mddb_setexit(s);
@@ -8833,7 +8876,7 @@ mddb_createrec(
} while (dbp);
desize = (sizeof (*de32p) - sizeof (de32p->de32_blks)) +
- (sizeof (mddb_block_t) * blkcnt);
+ (sizeof (mddb_block_t) * blkcnt);
/*
* see if a directory block exists which will hold this entry
@@ -8872,7 +8915,8 @@ mddb_createrec(
mddb_setexit(s);
return (MDDB_E_NOSPACE);
}
- for (dbp = s->s_dbp; dbp->db_next; dbp = dbp->db_next);
+ for (dbp = s->s_dbp; dbp->db_next; dbp = dbp->db_next)
+ ;
dbp->db_next = newdbp;
bzero((caddr_t)dbp->db_next, sizeof (*newdbp));
dbp->db_nextblk = getfreeblks(s, 1);
@@ -8888,10 +8932,10 @@ mddb_createrec(
* ready to add record
*/
desize_ic = (sizeof (*dep) - sizeof (dep->de_blks)) +
- (sizeof (mddb_block_t) * blkcnt);
+ (sizeof (mddb_block_t) * blkcnt);
if (dbp->db_firstentry) {
- for (dep = dbp->db_firstentry; dep->de_next;
- dep = dep->de_next);
+ for (dep = dbp->db_firstentry; dep->de_next; dep = dep->de_next)
+ ;
dep->de_next = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP);
dep = dep->de_next;
} else {
@@ -8919,8 +8963,8 @@ mddb_createrec(
dep->de_blkcount = blkcnt;
flag_type = options &
(MD_CRO_OPTIMIZE | MD_CRO_STRIPE | MD_CRO_MIRROR | MD_CRO_RAID |
- MD_CRO_SOFTPART | MD_CRO_TRANS_MASTER | MD_CRO_TRANS_LOG |
- MD_CRO_HOTSPARE | MD_CRO_HOTSPARE_POOL | MD_CRO_CHANGELOG);
+ MD_CRO_SOFTPART | MD_CRO_TRANS_MASTER | MD_CRO_TRANS_LOG |
+ MD_CRO_HOTSPARE | MD_CRO_HOTSPARE_POOL | MD_CRO_CHANGELOG);
switch (flag_type) {
case MD_CRO_OPTIMIZE:
dep->de_flags = MDDB_F_OPT;
@@ -9003,7 +9047,7 @@ mddb_createrec(
if ((options & MD_CRO_OPTIMIZE) == 0) {
for (i = 0; i < blkcnt; i++) {
err |= writeall(s, (caddr_t)tmppnt,
- dep->de_blks[i], 1, 0);
+ dep->de_blks[i], 1, 0);
tmppnt += MDDB_BSIZE;
}
} else {
@@ -9310,10 +9354,10 @@ mddb_getrecaddr_resize(
mddb_rb32_t *nrbp;
recsize = roundup((sizeof (*nrbp) - sizeof (nrbp->rb_data)) +
- icsize, MDDB_BSIZE);
+ icsize, MDDB_BSIZE);
if (dep->de_recsize < recsize)
cmn_err(CE_PANIC, "mddb_getrecaddr_resize: only "
- "nonoptimized records can be resized\n");
+ "nonoptimized records can be resized\n");
}
mddb_setexit(s);
@@ -9673,26 +9717,29 @@ mddb_commitrec(
lbp = s->s_lbp;
mnlbp = (mddb_mnlb_t *)lbp;
for (i = 0; i < 2; i++) {
- li = dep->de_optinfo[i].o_li;
- lp = &lbp->lb_locators[li];
- for (j = 0; j < MD_MNMAXSIDES; j++) {
- mnslp =
- &mnlbp->lb_mnsidelocators[j][li];
- if (mnslp->mnl_sideno == s->s_sideno)
- break;
- }
- if (j == MD_MNMAXSIDES)
- continue;
+ li = dep->de_optinfo[i].o_li;
+ lp = &lbp->lb_locators[li];
+ for (j = 0; j < MD_MNMAXSIDES; j++) {
+ mnslp =
+ &mnlbp->
+ lb_mnsidelocators[j][li];
+ if (mnslp->mnl_sideno ==
+ s->s_sideno)
+ break;
+ }
+ if (j == MD_MNMAXSIDES)
+ continue;
- dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
- recerr = &msg_recerr->msg_recerr[i];
- recerr->r_li = li;
- recerr->r_flags =
- dep->de_optinfo[i].o_flags;
- recerr->r_blkno = lp->l_blkno;
- recerr->r_mnum = md_getminor(lp->l_dev);
- (void) strncpy(recerr->r_driver_name,
- dn->dn_data, MD_MAXDRVNM);
+ dn = &lbp->
+ lb_drvnm[mnslp->mnl_drvnm_index];
+ recerr = &msg_recerr->msg_recerr[i];
+ recerr->r_li = li;
+ recerr->r_flags =
+ dep->de_optinfo[i].o_flags;
+ recerr->r_blkno = lp->l_blkno;
+ recerr->r_mnum = md_getminor(lp->l_dev);
+ (void) strncpy(recerr->r_driver_name,
+ dn->dn_data, MD_MAXDRVNM);
}
/* Release locks */
@@ -9711,17 +9758,17 @@ mddb_commitrec(
* the optimized resync records it owns.
*/
rval = mdmn_ksend_message(s->s_setno,
- MD_MN_MSG_MDDB_OPTRECERR,
- MD_MSGF_NO_BCAST,
- (char *)msg_recerr,
- sizeof (md_mn_msg_mddb_optrecerr_t),
- kres);
+ MD_MN_MSG_MDDB_OPTRECERR,
+ MD_MSGF_NO_BCAST, 0,
+ (char *)msg_recerr,
+ sizeof (md_mn_msg_mddb_optrecerr_t),
+ kres);
if (!MDMN_KSEND_MSG_OK(rval, kres)) {
cmn_err(CE_WARN, "mddb_commitrec: "
- "Unable to send optimized "
- "resync record failure "
- "message to other nodes in "
- "diskset %s\n", s->s_setname);
+ "Unable to send optimized "
+ "resync record failure "
+ "message to other nodes in "
+ "diskset %s\n", s->s_setname);
mdmn_ksend_show_error(rval, kres,
"MD_MN_MSG_MDDB_OPTRECERR");
}
@@ -9758,7 +9805,7 @@ mddb_commitrec(
}
kmem_free(kres, sizeof (md_mn_kresult_t));
kmem_free(msg_recerr,
- sizeof (md_mn_msg_mddb_optrecerr_t));
+ sizeof (md_mn_msg_mddb_optrecerr_t));
/* Resync record should be fixed - if possible */
s->s_optwaiterr--;
@@ -10723,8 +10770,7 @@ mddb_validate_lb(
if ((ddi_lyr_get_devid(ddi_dev, &rtn_devid) == DDI_SUCCESS) &&
(ddi_devid_compare(rtn_devid, devid) == 0)) {
did_info->info_flags = MDDB_DID_VALID |
- MDDB_DID_EXISTS |
- MDDB_DID_UPDATED;
+ MDDB_DID_EXISTS | MDDB_DID_UPDATED;
} else {
cnt++;
/*
@@ -11051,7 +11097,7 @@ mddb_parse(mddb_parse_parm_t *mpp)
/* Assumes master blocks are already setup */
if (lbp == (mddb_lb_t *)NULL) {
lbp = (mddb_lb_t *)kmem_zalloc(
- dbtob(MDDB_MNLBCNT), KM_SLEEP);
+ dbtob(MDDB_MNLBCNT), KM_SLEEP);
}
err |= readblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, i);
@@ -11135,7 +11181,7 @@ mddb_parse(mddb_parse_parm_t *mpp)
/* Free this node's old view of mddb locator blocks */
kmem_free((caddr_t)s->s_lbp,
- dbtob(s->s_lbp->lb_blkcnt));
+ dbtob(s->s_lbp->lb_blkcnt));
s->s_lbp = lbp;
} else {
if (lbp)
@@ -11206,7 +11252,7 @@ mddb_parse(mddb_parse_parm_t *mpp)
* master could have rewritten in during fixoptrecord.
*/
db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE,
- KM_SLEEP);
+ KM_SLEEP);
create_db32rec(db32p, dbp);
for (li = 0; li < lbp->lb_loccnt; li++) {
lp = &lbp->lb_locators[li];
@@ -11216,16 +11262,16 @@ mddb_parse(mddb_parse_parm_t *mpp)
continue;
err = readblks(s, (caddr_t)db32p,
- db32p->db32_blknum, 1, li);
+ db32p->db32_blknum, 1, li);
if (err)
continue;
/* Reverify db; go to next mddb if bad */
if ((db32p->db32_magic != MDDB_MAGIC_DB) ||
(revchk(MDDB_REV_DB,
- db32p->db32_revision)) ||
+ db32p->db32_revision)) ||
(crcchk(db32p, &db32p->db32_checksum,
- MDDB_BSIZE, NULL))) {
+ MDDB_BSIZE, NULL))) {
continue;
} else {
break;
@@ -11254,9 +11300,8 @@ mddb_parse(mddb_parse_parm_t *mpp)
if (li == lbp->lb_loccnt) {
kmem_free((caddr_t)db32p, MDDB_BSIZE);
cmn_err(CE_PANIC, "md: mddb: Node unable to "
- "access any SVM state database "
- "replicas for diskset %s\n",
- s->s_setname);
+ "access any SVM state database "
+ "replicas for diskset %s\n", s->s_setname);
}
/*
* Setup temp copy of linked list of de's.
@@ -11505,45 +11550,53 @@ mddb_optrecfix(mddb_optrec_parm_t *mop)
lp->l_flags &= ~MDDB_F_ACTIVE;
}
} else {
- /*
- * Passed in li from slave does not match
- * the replica in the master's structures.
- * This could have occurred if a delete
- * mddb command was running when the
- * optimized resync record had a failure.
- * Search all replicas for this entry.
- * If no match, just ignore.
- * If a match, set replica in error.
- */
- for (li = 0; li < lbp->lb_loccnt; li++) {
- lp = &lbp->lb_locators[li];
- if (lp->l_flags & MDDB_F_DELETED)
- continue;
+ /*
+ * Passed in li from slave does not match
+ * the replica in the master's structures.
+ * This could have occurred if a delete
+ * mddb command was running when the
+ * optimized resync record had a failure.
+ * Search all replicas for this entry.
+ * If no match, just ignore.
+ * If a match, set replica in error.
+ */
+ for (li = 0; li < lbp->lb_loccnt; li++) {
+ lp = &lbp->lb_locators[li];
+ if (lp->l_flags & MDDB_F_DELETED)
+ continue;
- for (j = 0; j < MD_MNMAXSIDES; j++) {
- mnslp =
- &mnlbp->lb_mnsidelocators[j][li];
- if (mnslp->mnl_sideno == s->s_sideno)
- break;
- }
- if (j == MD_MNMAXSIDES)
- continue;
+ for (j = 0; j < MD_MNMAXSIDES; j++) {
+ mnslp =
+ &mnlbp->
+ lb_mnsidelocators[j][li];
+ if (mnslp->mnl_sideno ==
+ s->s_sideno)
+ break;
+ }
+ if (j == MD_MNMAXSIDES)
+ continue;
- dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
- if ((strncmp(dn->dn_data, recerr->r_driver_name,
- MD_MAXDRVNM) == 0) &&
- (recerr->r_blkno == lp->l_blkno) &&
- (recerr->r_mnum == mnslp->mnl_mnum)) {
- if ((lp->l_flags & MDDB_F_ACTIVE) ||
- ((lp->l_flags & MDDB_F_EWRITE)
- == 0)) {
- something_changed = 1;
- lp->l_flags |= MDDB_F_EWRITE;
- lp->l_flags &= ~MDDB_F_ACTIVE;
+ dn = &lbp->
+ lb_drvnm[mnslp->mnl_drvnm_index];
+ if ((strncmp(dn->dn_data,
+ recerr->r_driver_name,
+ MD_MAXDRVNM) == 0) &&
+ (recerr->r_blkno == lp->l_blkno) &&
+ (recerr->r_mnum ==
+ mnslp->mnl_mnum)) {
+ if ((lp->l_flags &
+ MDDB_F_ACTIVE) ||
+ ((lp->l_flags &
+ MDDB_F_EWRITE) == 0)) {
+ something_changed = 1;
+ lp->l_flags |=
+ MDDB_F_EWRITE;
+ lp->l_flags &=
+ ~MDDB_F_ACTIVE;
+ }
+ break;
}
- break;
}
- }
}
}
}
@@ -11693,8 +11746,7 @@ mddb_check_write_ioctl(mddb_config_t *info)
/* Re-verify that set is not stale */
if (md_get_setstatus(setno) & MD_SET_STALE) {
mddb_setexit(s);
- return (mdmddberror(ep, MDE_DB_STALE,
- NODEV32, setno));
+ return (mdmddberror(ep, MDE_DB_STALE, NODEV32, setno));
}
lbp = s->s_lbp;
@@ -11735,34 +11787,39 @@ mddb_check_write_ioctl(mddb_config_t *info)
* They may have been altered by the previous master
*/
for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
- for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
- if ((dep->de_flags & MDDB_F_CHANGELOG) == 0) {
- continue;
- }
- /* This has been alloc'ed while joining the set */
- if (dep->de_rb) {
- kmem_free(dep->de_rb, dep->de_recsize);
- dep->de_rb = (mddb_rb32_t *)NULL;
- }
- if (dep->de_rb_userdata) {
- kmem_free(dep->de_rb_userdata, dep->de_reqsize);
- dep->de_rb_userdata = (caddr_t)NULL;
- }
-
- err = getrecord(s, dep, li);
- if (err) {
+ for (dep = dbp->db_firstentry; dep; dep =
+ dep->de_next) {
+ if ((dep->de_flags & MDDB_F_CHANGELOG) == 0) {
+ continue;
+ }
/*
- * When we see on error while reading the
- * changelog entries, we move on to the next
- * mddb
+ * This has been alloc'ed while
+ * joining the set
*/
- err = 1;
- break; /* out of inner for-loop */
+ if (dep->de_rb) {
+ kmem_free(dep->de_rb, dep->de_recsize);
+ dep->de_rb = (mddb_rb32_t *)NULL;
+ }
+ if (dep->de_rb_userdata) {
+ kmem_free(dep->de_rb_userdata,
+ dep->de_reqsize);
+ dep->de_rb_userdata = (caddr_t)NULL;
+ }
+
+ err = getrecord(s, dep, li);
+ if (err) {
+ /*
+ * When we see an error while reading
+ * the changelog entries, we move on
+ * to the next mddb
+ */
+ err = 1;
+ break; /* out of inner for-loop */
+ }
+ allocuserdata(dep);
}
- allocuserdata(dep);
- }
- if (err)
- break; /* out of outer for-loop */
+ if (err)
+ break; /* out of outer for-loop */
}
/* If err, try next mddb */
@@ -11773,7 +11830,7 @@ mddb_check_write_ioctl(mddb_config_t *info)
/* Is incore locator block same as ondisk? */
if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT))
- == 1) {
+ == 1) {
write_out_mddb = 1;
kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
break;
@@ -11786,7 +11843,7 @@ mddb_check_write_ioctl(mddb_config_t *info)
KM_SLEEP);
/* read in on-disk locator names */
err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk,
- lbp->lb_lnblkcnt, li);
+ lbp->lb_lnblkcnt, li);
/* If err, try next mddb */
if (err) {
@@ -11796,7 +11853,7 @@ mddb_check_write_ioctl(mddb_config_t *info)
/* Are incore locator names same as ondisk? */
if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT))
- == 1) {
+ == 1) {
kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
write_out_mddb = 1;
break;
@@ -11885,7 +11942,7 @@ mddb_check_write_ioctl(mddb_config_t *info)
/* Is incore locator block same as ondisk? */
if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT))
- == 1) {
+ == 1) {
kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
write_out_mddb = 1;
break;
@@ -11909,7 +11966,7 @@ mddb_check_write_ioctl(mddb_config_t *info)
/* Are incore locator names same as ondisk? */
if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT))
- == 1) {
+ == 1) {
kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
write_out_mddb = 1;
break;
@@ -12322,8 +12379,7 @@ update_mb(
/* disk is powered off or not there */
continue;
- if (md_get_setstatus(s->s_setno) &
- MD_SET_REPLICATED_IMPORT) {
+ if (md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT) {
/*
* It is a replicated set
*/
diff --git a/usr/src/uts/common/io/lvm/md/md_subr.c b/usr/src/uts/common/io/lvm/md/md_subr.c
index 4ff713a78f..460547f957 100644
--- a/usr/src/uts/common/io/lvm/md/md_subr.c
+++ b/usr/src/uts/common/io/lvm/md/md_subr.c
@@ -86,6 +86,7 @@ extern md_set_io_t md_set_io[];
extern md_ops_t **md_ops;
extern md_ops_t *md_opslist;
extern ddi_modhandle_t *md_mods;
+extern dev_info_t *md_devinfo;
extern md_krwlock_t md_unit_array_rw;
extern kmutex_t md_mx;
@@ -113,7 +114,7 @@ extern void *lookup_entry(struct nm_next_hdr *, set_t,
extern struct nm_next_hdr *get_first_record(set_t, int, int);
struct mdq_anchor md_done_daemon; /* done request queue */
-struct mdq_anchor md_mstr_daemon; /* mirror timeout requests */
+struct mdq_anchor md_mstr_daemon; /* mirror error, WOW requests */
struct mdq_anchor md_mhs_daemon; /* mirror hotspare requests queue */
struct mdq_anchor md_hs_daemon; /* raid hotspare requests queue */
struct mdq_anchor md_ff_daemonq; /* failfast request queue */
@@ -121,6 +122,7 @@ struct mdq_anchor md_mirror_daemon; /* mirror owner queue */
struct mdq_anchor md_mirror_io_daemon; /* mirror owner i/o queue */
struct mdq_anchor md_mirror_rs_daemon; /* mirror resync done queue */
struct mdq_anchor md_sp_daemon; /* soft-part error daemon queue */
+struct mdq_anchor md_mto_daemon; /* mirror timeout daemon queue */
int md_done_daemon_threads = 1; /* threads for md_done_daemon requestq */
int md_mstr_daemon_threads = 1; /* threads for md_mstr_daemon requestq */
@@ -129,6 +131,7 @@ int md_hs_daemon_threads = 1; /* threads for md_hs_daemon requestq */
int md_ff_daemon_threads = 3; /* threads for md_ff_daemon requestq */
int md_mirror_daemon_threads = 1; /* threads for md_mirror_daemon requestq */
int md_sp_daemon_threads = 1; /* threads for md_sp_daemon requestq */
+int md_mto_daemon_threads = 1; /* threads for md_mto_daemon requestq */
#ifdef DEBUG
/* Flag to switch on debug messages */
@@ -146,7 +149,7 @@ int md_release_reacquire_debug = 0; /* debug flag */
*
*/
-#define MD_DAEMON_QUEUES 10
+#define MD_DAEMON_QUEUES 11
md_requestq_entry_t md_daemon_queues[MD_DAEMON_QUEUES] = {
{&md_done_daemon, &md_done_daemon_threads},
@@ -158,6 +161,7 @@ md_requestq_entry_t md_daemon_queues[MD_DAEMON_QUEUES] = {
{&md_mirror_rs_daemon, &md_mirror_daemon_threads},
{&md_sp_daemon, &md_sp_daemon_threads},
{&md_mhs_daemon, &md_mhs_daemon_threads},
+ {&md_mto_daemon, &md_mto_daemon_threads},
{0, 0}
};
@@ -176,6 +180,12 @@ md_requestq_entry_t md_daemon_queues[MD_DAEMON_QUEUES] = {
uint_t md_retry_cnt = 1; /* global so it can be patched */
/*
+ * Retry limit for the door upcall in mdmn_ksend_message(); patchable should
+ * it prove useful, but never zero: it is used as a modulus divisor there.
+ */
+uint_t md_send_retry_limit = 30;
+
+/*
* Bug # 1212146
* Before this change the user had to pass in a short aligned buffer because of
* problems in some underlying device drivers. This problem seems to have been
@@ -712,9 +722,9 @@ md_ioctl_lock_exit(int code, int flags, mdi_unit_t *ui, int ioctl_end)
if (status & MD_SET_STALE)
flag |= MD_MSGF_NO_LOG;
rval = mdmn_ksend_message(s->s_setno,
- MD_MN_MSG_MDDB_PARSE, flag,
+ MD_MN_MSG_MDDB_PARSE, flag, 0,
(char *)mddb_parse_msg,
- sizeof (mddb_parse_msg), kresult);
+ sizeof (md_mn_msg_mddb_parse_t), kresult);
/* if the node hasn't yet joined, it's Ok. */
if ((!MDMN_KSEND_MSG_OK(rval, kresult)) &&
(kresult->kmmr_comm_state !=
@@ -2817,6 +2827,15 @@ md_create_unit_incore(minor_t mnum, md_ops_t *ops, int alloc_lock)
mutex_init(&ui->ui_mx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&ui->ui_cv, NULL, CV_DEFAULT, NULL);
+ if (alloc_lock) {
+ ui->ui_io_lock = kmem_zalloc(sizeof (md_io_lock_t), KM_SLEEP);
+ mutex_init(&ui->ui_io_lock->io_mx, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&ui->ui_io_lock->io_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&ui->ui_io_lock->io_list_mutex, NULL,
+ MUTEX_DEFAULT, NULL);
+ ui->ui_io_lock->io_list_front = NULL;
+ ui->ui_io_lock->io_list_back = NULL;
+ }
if (! (md_get_setstatus(setno) & MD_SET_SNARFING)) {
rw_enter(&md_unit_array_rw.lock, RW_WRITER);
MDI_VOIDUNIT(mnum) = (void *) ui;
@@ -2829,15 +2848,6 @@ md_create_unit_incore(minor_t mnum, md_ops_t *ops, int alloc_lock)
ui->ui_link.ln_setno = setno;
ui->ui_link.ln_id = mnum;
ops->md_head = &ui->ui_link;
- if (alloc_lock) {
- ui->ui_io_lock = kmem_zalloc(sizeof (md_io_lock_t), KM_SLEEP);
- mutex_init(&ui->ui_io_lock->io_mx, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&ui->ui_io_lock->io_cv, NULL, CV_DEFAULT, NULL);
- mutex_init(&ui->ui_io_lock->io_list_mutex, NULL,
- MUTEX_DEFAULT, NULL);
- ui->ui_io_lock->io_list_front = NULL;
- ui->ui_io_lock->io_list_back = NULL;
- }
/* setup the unavailable field */
#if defined(_ILP32)
if (((md_unit_t *)MD_UNIT(mnum))->c.un_revision & MD_64BIT_META_DEV) {
@@ -3865,82 +3875,68 @@ md_vtoc_to_efi_record(mddb_recid_t vtoc_recid, set_t setno)
/*
* Send a kernel message.
* user has to provide for an allocated result structure
- * If the door handler disappears we retry forever emitting warnings every so
- * often.
- * TODO: make this a flaggable attribute so that the caller can decide if the
- * message is to be a 'one-shot' message or not.
+ * If the door handler disappears we retry, emitting warnings every so often.
+ *
+ * The recipient argument is almost always unused, and is therefore typically
+ * set to zero, as zero is an invalid cluster nodeid. The exceptions are the
+ * marking and clearing of the DRL from a node that is not currently the
+ * owner. In these cases, the recipient argument will be the nodeid of the
+ * mirror owner, and MD_MSGF_DIRECTED will be set in the flags. Non-owner
+ * nodes will not receive these messages.
+ *
+ * For the case where md_mn_is_commd_present() is false, we rely on the
+ * "result" having been kmem_zalloc()ed which, in effect, sets MDMNE_NULL for
+ * kmmr_comm_state making MDMN_KSEND_MSG_OK() result in 0.
*/
int
mdmn_ksend_message(
set_t setno,
md_mn_msgtype_t type,
uint_t flags,
+ md_mn_nodeid_t recipient,
char *data,
int size,
md_mn_kresult_t *result)
{
door_arg_t da;
md_mn_kmsg_t *kmsg;
- uint_t retry_cnt = 0;
+ uint_t send_try_cnt = 0;
+ uint_t retry_noise_cnt = 0;
int rval;
+ k_sigset_t oldmask, newmask;
if (size > MDMN_MAX_KMSG_DATA)
return (ENOMEM);
kmsg = kmem_zalloc(sizeof (md_mn_kmsg_t), KM_SLEEP);
kmsg->kmsg_flags = flags;
kmsg->kmsg_setno = setno;
+ kmsg->kmsg_recipient = recipient;
kmsg->kmsg_type = type;
kmsg->kmsg_size = size;
bcopy(data, &(kmsg->kmsg_data), size);
-#ifdef DEBUG_COMM
- printf("send msg: set=%d, flags=%d, type=%d, txid = 0x%llx,"
- " size=%d, data=%d, data2=%d\n",
- kmsg->kmsg_setno, kmsg->kmsg_flags, kmsg->kmsg_type,
- kmsg->kmsg_size, *(int *)data, *(int *)(char *)(&kmsg->kmsg_data));
-
-
-#endif /* DEBUG_COMM */
-
- da.data_ptr = (char *)(kmsg);
- da.data_size = sizeof (md_mn_kmsg_t);
- da.desc_ptr = NULL;
- da.desc_num = 0;
- da.rbuf = (char *)result;
- da.rsize = sizeof (*result);
-
/*
* Wait for the door handle to be established.
*/
-
while (mdmn_door_did == -1) {
- if ((++retry_cnt % MD_MN_WARN_INTVL) == 0) {
+ if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) {
cmn_err(CE_WARN, "door handle not yet ready. "
"Check if /usr/lib/lvm/mddoors is running");
}
delay(md_hz);
}
- retry_cnt = 0;
-
- while ((rval = door_ki_upcall_limited(mdmn_door_handle, &da, NULL,
- SIZE_MAX, 0)) != 0) {
- if (rval == EAGAIN) {
- if ((++retry_cnt % MD_MN_WARN_INTVL) == 0) {
- cmn_err(CE_WARN, "door call failed. "
- "Check if /usr/lib/lvm/mddoors is running");
- }
- } else {
- cmn_err(CE_WARN,
- "md door call failed. Returned %d", rval);
- }
- delay(md_hz);
+
+ /*
+ * If MD_MSGF_BLK_SIGNAL is set, mask out all signals so that we
+ * do not fail if the user process receives a signal while we're
+ * active in the door interface.
+ */
+ if (flags & MD_MSGF_BLK_SIGNAL) {
+ sigfillset(&newmask);
+ sigreplace(&newmask, &oldmask);
}
- kmem_free(kmsg, sizeof (md_mn_kmsg_t));
/*
- * Attempt to determine if the message failed (with an RPC_FAILURE)
- * because we are in the middle of shutting the system down.
- *
* If message failed with an RPC_FAILURE when rpc.mdcommd had
* been gracefully shutdown (md_mn_is_commd_present returns FALSE)
* then don't retry the message anymore. If message
@@ -3956,16 +3952,81 @@ mdmn_ksend_message(
*
*/
- retry_cnt = 0;
-
- if (result->kmmr_comm_state == MDMNE_RPC_FAIL) {
- while (md_mn_is_commd_present() == 1) {
- if ((++retry_cnt % MD_MN_WARN_INTVL) == 0)
+ retry_noise_cnt = send_try_cnt = 0;
+ while (md_mn_is_commd_present_lite()) {
+ /*
+ * data_ptr and data_size are initialized here because on
+ * return from the upcall, they contain data duplicated from
+ * rbuf and rsize. This causes subsequent upcalls to fail.
+ */
+ da.data_ptr = (char *)(kmsg);
+ da.data_size = sizeof (md_mn_kmsg_t);
+ da.desc_ptr = NULL;
+ da.desc_num = 0;
+ da.rbuf = (char *)result;
+ da.rsize = sizeof (*result);
+
+ while ((rval = door_ki_upcall_limited(mdmn_door_handle, &da,
+ NULL, SIZE_MAX, 0)) != 0) {
+ if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) {
+ if (rval == EAGAIN) {
+ cmn_err(CE_WARN,
+ "md: door_upcall failed. "
+ "Check if mddoors is running.");
+ } else if (rval == EINTR) {
+ cmn_err(CE_WARN,
+ "md: door_upcall failed. "
+ "Check if rpc.mdcommd is running.");
+ } else {
+ cmn_err(CE_WARN,
+ "md: door_upcall failed. "
+ "Returned %d",
+ rval);
+ }
+ }
+ if (++send_try_cnt >= md_send_retry_limit)
break;
+
delay(md_hz);
+
+ /*
+ * data_ptr and data_size are re-initialized here
+ * because on return from the upcall, they contain
+ * data duplicated from rbuf and rsize. This causes
+ * subsequent upcalls to fail.
+ */
+ da.data_ptr = (char *)(kmsg);
+ da.data_size = sizeof (md_mn_kmsg_t);
+ da.desc_ptr = NULL;
+ da.desc_num = 0;
+ da.rbuf = (char *)result;
+ da.rsize = sizeof (*result);
}
+
+
+ /*
+ * If:
+ * - the send succeeded (MDMNE_ACK)
+ * - we had an MDMNE_RPC_FAIL and commd is now gone
+ * (note: since the outer loop is commd-dependent,
+ * checking MDMNE_RPC_FAIL here is meaningless)
+ * - we were told not to retry
+ * - we exceeded the RPC failure send limit
+ * punch out of the outer loop prior to the delay()
+ */
+ if (result->kmmr_comm_state == MDMNE_ACK ||
+ (flags & MD_MSGF_KSEND_NORETRY) ||
+ (++send_try_cnt % md_send_retry_limit) == 0 ||
+ !md_mn_is_commd_present())
+ break;
+ delay(md_hz);
}
+ if (flags & MD_MSGF_BLK_SIGNAL) {
+ sigreplace(&oldmask, (k_sigset_t *)NULL);
+ }
+ kmem_free(kmsg, sizeof (md_mn_kmsg_t));
+
return (0);
}
@@ -4008,7 +4069,7 @@ mdmn_send_capability_message(minor_t mnum, volcap_t vc, IOLOCK *lockp)
sigfillset(&newmask);
sigreplace(&newmask, &oldmask);
ret = (mdmn_ksend_message(MD_MIN2SET(mnum), MD_MN_MSG_SET_CAP,
- MD_MSGF_NO_LOG, (char *)&msg, sizeof (md_mn_msg_setcap_t),
+ MD_MSGF_NO_LOG, 0, (char *)&msg, sizeof (md_mn_msg_setcap_t),
kres));
sigreplace(&oldmask, (k_sigset_t *)NULL);
@@ -4056,7 +4117,7 @@ mdmn_clear_all_capabilities(minor_t mnum)
sigreplace(&newmask, &oldmask);
ret = mdmn_ksend_message(MD_MIN2SET(mnum),
MD_MN_MSG_CLU_CHECK,
- MD_MSGF_STOP_ON_ERROR | MD_MSGF_NO_LOG | MD_MSGF_NO_MCT,
+ MD_MSGF_STOP_ON_ERROR | MD_MSGF_NO_LOG | MD_MSGF_NO_MCT, 0,
(char *)&clumsg, sizeof (clumsg), kresult);
sigreplace(&oldmask, (k_sigset_t *)NULL);
@@ -4212,3 +4273,23 @@ find_hot_spare_pool(set_t setno, int hsp_id)
return ((hot_spare_pool_t *)0);
}
+
+/*
+ * md_create_taskq:
+ *
+ * Create a kernel taskq for the given set/unit combination. This is typically
+ * used to complete a RR_CLEAN request when the callee is unable to obtain the
+ * mutex / condvar access required to update the DRL safely.
+ */
+void *
+md_create_taskq(set_t setno, minor_t mnum)
+{
+	char name[20];
+	ddi_taskq_t *tqp;
+
+	(void) snprintf(name, sizeof (name), "%d/d%d", setno,
+	    MD_MIN2UNIT(mnum));
+	/* Handle is returned unchecked; NOTE(review): may be NULL on failure */
+	tqp = ddi_taskq_create(md_devinfo, name, 1, TASKQ_DEFAULTPRI, 0);
+	return ((void *)tqp);
+}
diff --git a/usr/src/uts/common/io/lvm/mirror/mirror.c b/usr/src/uts/common/io/lvm/mirror/mirror.c
index e1c6cebf08..0097a0be57 100644
--- a/usr/src/uts/common/io/lvm/mirror/mirror.c
+++ b/usr/src/uts/common/io/lvm/mirror/mirror.c
@@ -173,6 +173,7 @@ static void
mirror_parent_init(md_mps_t *ps)
{
bzero(ps, offsetof(md_mps_t, ps_mx));
+ bzero(&ps->ps_overlap_node, sizeof (avl_node_t));
}
/*ARGSUSED1*/
@@ -223,11 +224,17 @@ send_poke_hotspares_msg(daemon_request_t *drq)
kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES,
- MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, (char *)&pokehsp,
+ MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0, (char *)&pokehsp,
sizeof (pokehsp), kresult);
if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES");
+ /* If we're shutting down already, pause things here. */
+ if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
+ while (!md_mn_is_commd_present()) {
+ delay(md_hz);
+ }
+ }
cmn_err(CE_PANIC,
"ksend_message failure: POKE_HOTSPARES");
}
@@ -468,7 +475,7 @@ check_comp_4_hotspares(
}
kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
- rval = mdmn_ksend_message(setno, msgtype, msgflags,
+ rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
(char *)&allochspmsg, sizeof (allochspmsg),
kresult);
@@ -491,6 +498,12 @@ check_comp_4_hotspares(
kmem_free(kresult, sizeof (md_mn_kresult_t));
return (1);
}
+ /* If we're shutting down already, pause things here. */
+ if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
+ while (!md_mn_is_commd_present()) {
+ delay(md_hz);
+ }
+ }
cmn_err(CE_PANIC,
"ksend_message failure: ALLOCATE_HOTSPARE");
}
@@ -1636,9 +1649,14 @@ fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs)
/*
* For directed mirror read (DMR) we only use the specified side and
* do not compute the source of the read.
+ * If we're running with MD_MPS_DIRTY_RD set we always return the
+ * first mirror side (this prevents unnecessary ownership switching).
+ * Otherwise we return the submirror according to the mirror read option
*/
if (ps->ps_flags & MD_MPS_DMR) {
sm_index = un->un_dmr_last_read;
+ } else if (ps->ps_flags & MD_MPS_DIRTY_RD) {
+ sm_index = md_find_nth_unit(running_bm, 0);
} else {
/* Normal (non-DMR) operation */
switch (un->un_read_option) {
@@ -1883,6 +1901,13 @@ mirror_build_incore(mm_unit_t *un, int snarfing)
mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL);
+ /*
+ * Initialize the rwlocks used when accessing un_pernode_dirty_bm.
+ */
+ for (i = 0; i < MD_MNMAXSIDES; i++) {
+ rw_init(&un->un_pernode_dirty_mx[i], NULL, RW_DEFAULT, NULL);
+ }
+
/* place various information in the in-core data structures */
md_nblocks_set(MD_SID(un), un->c.un_total_blocks);
MD_UNIT(MD_SID(un)) = un;
@@ -1903,6 +1928,7 @@ reset_mirror(struct mm_unit *un, minor_t mnum, int removing)
uint_t bits = 0;
minor_t selfid;
md_unit_t *su;
+ int i;
md_destroy_unit_incore(mnum, &mirror_md_ops);
@@ -1917,6 +1943,15 @@ reset_mirror(struct mm_unit *un, minor_t mnum, int removing)
kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt);
if (un->un_resync_bm)
kmem_free((caddr_t)un->un_resync_bm, bitcnt);
+ if (un->un_pernode_dirty_sum)
+ kmem_free((caddr_t)un->un_pernode_dirty_sum, un->un_rrd_num);
+
+ /*
+ * Destroy the taskq for deferred processing of DRL clean requests.
+ * This taskq will only be present for Multi Owner mirrors.
+ */
+ if (un->un_drl_task != NULL)
+ ddi_taskq_destroy(un->un_drl_task);
md_nblocks_set(mnum, -1ULL);
MD_UNIT(mnum) = NULL;
@@ -1965,6 +2000,12 @@ reset_mirror(struct mm_unit *un, minor_t mnum, int removing)
mutex_destroy(&un->un_dmr_mx);
cv_destroy(&un->un_dmr_cv);
+ for (i = 0; i < MD_MNMAXSIDES; i++) {
+ rw_destroy(&un->un_pernode_dirty_mx[i]);
+ if (un->un_pernode_dirty_bm[i])
+ kmem_free((caddr_t)un->un_pernode_dirty_bm[i], bitcnt);
+ }
+
/*
* Remove self from the namespace
*/
@@ -1972,7 +2013,9 @@ reset_mirror(struct mm_unit *un, minor_t mnum, int removing)
(void) md_rem_selfname(un->c.un_self_id);
}
+ /* This frees the unit structure. */
mddb_deleterec_wrapper(un->c.un_record_id);
+
if (recid != 0)
mddb_deleterec_wrapper(recid);
@@ -2430,11 +2473,17 @@ set_sm_comp_state(
}
kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
- rval = mdmn_ksend_message(setno, msgtype, msgflags,
+ rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
(char *)&stchmsg, sizeof (stchmsg), kresult);
if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
mdmn_ksend_show_error(rval, kresult, "STATE UPDATE");
+ /* If we're shutting down already, pause things here. */
+ if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
+ while (!md_mn_is_commd_present()) {
+ delay(md_hz);
+ }
+ }
cmn_err(CE_PANIC,
"ksend_message failure: STATE_UPDATE");
}
@@ -3435,11 +3484,12 @@ update_resync(daemon_queue_t *dq)
md_mps_t *ps = (md_mps_t *)dq;
buf_t *pb = ps->ps_bp;
mdi_unit_t *ui = ps->ps_ui;
- mm_unit_t *un;
+ mm_unit_t *un = MD_UNIT(ui->ui_link.ln_id);
set_t setno;
int restart_resync;
- un = md_unit_writerlock(ui);
+ mutex_enter(&un->un_rrp_inflight_mx);
+ (void) md_unit_writerlock(ui);
ps->ps_un = un;
setno = MD_MIN2SET(getminor(pb->b_edev));
if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) {
@@ -3447,15 +3497,14 @@ update_resync(daemon_queue_t *dq)
* Synchronize our in-core view of what regions need to be
* resync'd with the on-disk version.
*/
- mutex_enter(&un->un_rrp_inflight_mx);
mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm,
un->un_dirty_bm);
- mutex_exit(&un->un_rrp_inflight_mx);
/* Region dirty map is now up to date */
}
restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0;
md_unit_writerexit(ui);
+ mutex_exit(&un->un_rrp_inflight_mx);
/* Restart the resync thread if it was previously blocked */
if (restart_resync) {
@@ -3581,9 +3630,8 @@ become_owner(daemon_queue_t *dq)
kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
rval = mdmn_ksend_message(setno,
- MD_MN_MSG_REQUIRE_OWNER, msg_flags,
- /* flags */ (char *)msg,
- sizeof (md_mn_req_owner_t), kres);
+ MD_MN_MSG_REQUIRE_OWNER, msg_flags, 0,
+ (char *)msg, sizeof (md_mn_req_owner_t), kres);
kmem_free(msg, sizeof (md_mn_req_owner_t));
@@ -3890,19 +3938,19 @@ mirror_write_strategy(buf_t *pb, int flag, void *private)
}
/*
- * For Multinode mirrors with a Resync Region (not ABR) we need to
- * become the mirror owner before continuing with the write(). For ABR
- * mirrors we check that we 'own' the resync if we're in
- * write-after-read mode. We do this _after_ ensuring that there are no
- * overlaps to ensure that the once we know that we are the owner, the
- * readerlock will not released until the write is complete. As a
- * change of ownership in a MN set requires the writerlock, this
- * ensures that ownership cannot be changed until the write is
- * complete
+ * For Multinode mirrors with no owner and a Resync Region (not ABR)
+ * we need to become the mirror owner before continuing with the
+ * write(). For ABR mirrors we check that we 'own' the resync if
+ * we're in write-after-read mode. We do this _after_ ensuring that
+ * there are no overlaps to ensure that once we know that we are
+ * the owner, the readerlock will not be released until the write is
+ * complete. As a change of ownership in a MN set requires the
+ * writerlock, this ensures that ownership cannot be changed until
+ * the write is complete.
*/
if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) ||
(flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) {
- if (!MD_MN_MIRROR_OWNER(un)) {
+ if (MD_MN_NO_MIRROR_OWNER(un)) {
if (ps->ps_flags & MD_MPS_ON_OVERLAP)
mirror_overlap_tree_remove(ps);
md_kstat_waitq_exit(ui);
@@ -3922,10 +3970,11 @@ mirror_write_strategy(buf_t *pb, int flag, void *private)
if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) &&
!(flag & MD_STR_WAR)) {
if (mirror_mark_resync_region(un, ps->ps_firstblk,
- ps->ps_lastblk)) {
+ ps->ps_lastblk, md_mn_mynode_id)) {
pb->b_flags |= B_ERROR;
pb->b_resid = pb->b_bcount;
- ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
+ if (ps->ps_flags & MD_MPS_ON_OVERLAP)
+ mirror_overlap_tree_remove(ps);
kmem_cache_free(mirror_parent_cache, ps);
md_kstat_waitq_exit(ui);
md_unit_readerexit(ui);
@@ -4169,9 +4218,9 @@ mirror_read_strategy(buf_t *pb, int flag, void *private)
/*
* Before reading the buffer, see if
- * we are the owner
+ * there is an owner.
*/
- if (!MD_MN_MIRROR_OWNER(un)) {
+ if (MD_MN_NO_MIRROR_OWNER(un)) {
ps->ps_call = NULL;
mirror_overlap_tree_remove(ps);
md_kstat_waitq_exit(ui);
@@ -4506,6 +4555,7 @@ mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp)
md_error_t mde = mdnullerror;
md_mps_t *ps;
int rs_active;
+ int rr, rr_start, rr_end;
/* Check that the given device is part of a multi-node set */
setno = MD_MIN2SET(p->mnum);
@@ -4580,6 +4630,25 @@ mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp)
if (p->rs_originator != md_mn_mynode_id) {
/*
+ * Clear our un_resync_bm for the regions completed.
+ * The owner (originator) will take care of itself.
+ */
+ BLK_TO_RR(rr_end, ps->ps_lastblk, un);
+ BLK_TO_RR(rr_start, p->rs_start, un);
+ if (ps->ps_lastblk && rr_end < rr_start) {
+ BLK_TO_RR(rr_start, ps->ps_firstblk, un);
+ mutex_enter(&un->un_resync_mx);
+ /*
+ * Update our resync bitmap to reflect that
+ * another node has synchronized this range.
+ */
+ for (rr = rr_start; rr <= rr_end; rr++) {
+ CLR_KEEPDIRTY(rr, un);
+ }
+ mutex_exit(&un->un_resync_mx);
+ }
+
+ /*
* On all but the originating node, first update
* the resync state, then unblock the previous
* region and block the next one. No need
@@ -4654,6 +4723,7 @@ mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp)
&p->mde, lockp);
}
}
+
break;
case MD_MN_MSG_RESYNC_FINISH:
/*
@@ -4792,6 +4862,24 @@ mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp)
un->c.un_status &= ~MD_UN_KEEP_DIRTY;
if (!broke_out)
un->c.un_status &= ~MD_UN_WAR;
+
+ /*
+ * Clear our un_resync_bm for the regions
+ * completed. The owner (originator) will
+ * take care of itself.
+ */
+ if (p->rs_originator != md_mn_mynode_id &&
+ (ps = un->un_rs_prev_overlap) != NULL) {
+ BLK_TO_RR(rr_start, ps->ps_firstblk,
+ un);
+ BLK_TO_RR(rr_end, ps->ps_lastblk, un);
+ mutex_enter(&un->un_resync_mx);
+ for (rr = rr_start; rr <= rr_end;
+ rr++) {
+ CLR_KEEPDIRTY(rr, un);
+ }
+ mutex_exit(&un->un_resync_mx);
+ }
}
/*
diff --git a/usr/src/uts/common/io/lvm/mirror/mirror_ioctl.c b/usr/src/uts/common/io/lvm/mirror/mirror_ioctl.c
index 8e8d8dc496..2b8b0d09d8 100644
--- a/usr/src/uts/common/io/lvm/mirror/mirror_ioctl.c
+++ b/usr/src/uts/common/io/lvm/mirror/mirror_ioctl.c
@@ -1624,7 +1624,7 @@ mirror_choose_owner_thread(md_mn_msg_chooseid_t *msg)
kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
rval = mdmn_ksend_message(setno, MD_MN_MSG_CHOOSE_OWNER,
- MD_MSGF_NO_BCAST | MD_MSGF_NO_LOG, (char *)msg,
+ MD_MSGF_NO_BCAST | MD_MSGF_NO_LOG, 0, (char *)msg,
sizeof (md_mn_msg_chooseid_t), kres);
if (!MDMN_KSEND_MSG_OK(rval, kres)) {
mdmn_ksend_show_error(rval, kres, "CHOOSE OWNER");
@@ -1664,7 +1664,8 @@ mirror_owner_thread(md_mn_req_owner_t *ownp)
kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
rval = mdmn_ksend_message(setno, MD_MN_MSG_REQUIRE_OWNER,
- MD_MSGF_NO_LOG, (char *)ownp, sizeof (md_mn_req_owner_t), kresult);
+ MD_MSGF_NO_LOG, 0, (char *)ownp, sizeof (md_mn_req_owner_t),
+ kresult);
if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
/*
@@ -2358,7 +2359,7 @@ mirror_get_status(mm_unit_t *un, IOLOCK *lockp)
kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
rval = mdmn_ksend_message(setno, MD_MN_MSG_GET_MIRROR_STATE,
- MD_MSGF_NO_BCAST | MD_MSGF_NO_LOG, (char *)&msg,
+ MD_MSGF_NO_BCAST | MD_MSGF_NO_LOG, 0, (char *)&msg,
sizeof (msg), kres);
/* if the node hasn't yet joined, it's Ok. */
@@ -2949,6 +2950,42 @@ free_mem:
break;
}
+ case MD_MN_RR_DIRTY:
+ {
+ sz = sizeof (md_mn_rr_dirty_params_t);
+ d = kmem_zalloc(sz, KM_SLEEP);
+
+ if (ddi_copyin(data, d, sz, mode)) {
+ err = EFAULT;
+ break;
+ }
+
+ err = mirror_set_dirty_rr((md_mn_rr_dirty_params_t *)d);
+ break;
+ }
+
+ case MD_MN_RR_CLEAN:
+ {
+ md_mn_rr_clean_params_t tmp;
+
+ /* get the first part of the structure to find the size */
+ if (ddi_copyin(data, &tmp, sizeof (tmp), mode)) {
+ err = EFAULT;
+ break;
+ }
+
+ sz = MDMN_RR_CLEAN_PARAMS_SIZE(&tmp);
+ d = kmem_zalloc(sz, KM_SLEEP);
+
+ if (ddi_copyin(data, d, sz, mode)) {
+ err = EFAULT;
+ break;
+ }
+
+ err = mirror_set_clean_rr((md_mn_rr_clean_params_t *)d);
+ break;
+ }
+
default:
return (ENOTTY);
}
diff --git a/usr/src/uts/common/io/lvm/mirror/mirror_resync.c b/usr/src/uts/common/io/lvm/mirror/mirror_resync.c
index 4846cd4ad4..59785d670e 100644
--- a/usr/src/uts/common/io/lvm/mirror/mirror_resync.c
+++ b/usr/src/uts/common/io/lvm/mirror/mirror_resync.c
@@ -24,8 +24,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
@@ -67,7 +65,7 @@ extern major_t md_major;
extern md_ops_t mirror_md_ops;
extern kmem_cache_t *mirror_child_cache; /* mirror child memory pool */
-extern mdq_anchor_t md_mstr_daemon;
+extern mdq_anchor_t md_mto_daemon;
extern daemon_request_t mirror_timeout;
extern md_resync_t md_cpr_resync;
extern clock_t md_hz;
@@ -141,81 +139,365 @@ int md_mirror_rr_sleep_timo = 1;
*/
int md_max_xfer_bufsz = 2048;
+/*
+ * mirror_generate_rr_bitmap:
+ * -------------------
+ * Generate a compressed bitmap md_mn_msg_rr_clean_t for the given clean
+ * bitmap associated with mirror 'un'. Scanning starts at
+ * un->un_rr_clean_start_bit and covers at most
+ * MDMN_MSG_RR_CLEAN_DATA_MAX_BYTES bytes worth of regions; the start bit is
+ * advanced on return so that successive calls walk the whole bitmap.
+ *
+ * Input:
+ * un - mirror unit to get bitmap data from
+ * *msgp - location to return newly allocated md_mn_msg_rr_clean_t
+ * *activep- counter that is incremented for every region that can not
+ * be cleaned yet (it is never reset here)
+ *
+ * Returns:
+ * >0 => number of regions flagged in *msgp as ready to be cleared.
+ * *msgp is kmem-allocated here and must be freed by the caller.
+ * 0 => no bits flagged
+ * *msgp == NULL
+ */
static int
-process_resync_regions(mm_unit_t *un)
+mirror_generate_rr_bitmap(mm_unit_t *un, md_mn_msg_rr_clean_t **msgp,
+ int *activep)
{
- int i;
- int cleared_dirty = 0;
+ unsigned int i, next_bit, data_bytes, start_bit;
+ int cleared_dirty = 0;
+
+ /*
+ * Honour the documented contract on every return (0) path, including
+ * the early return below which is taken before anything is allocated.
+ */
+ *msgp = NULL;
+
+ /* Skip any initial 0s. */
+retry_dirty_scan:
+ if ((start_bit = un->un_rr_clean_start_bit) >= un->un_rrd_num)
+ un->un_rr_clean_start_bit = start_bit = 0;
+
/*
- * Number of reasons why we can not
- * proceed shutting down the mirror.
+ * Handle case where NO bits are set in PERNODE_DIRTY but the
+ * un_dirty_bm[] map does have entries set (after a 1st resync)
*/
+ for (; start_bit < un->un_rrd_num &&
+ !IS_PERNODE_DIRTY(md_mn_mynode_id, start_bit, un) &&
+ (un->un_pernode_dirty_sum[start_bit] != (uchar_t)0); start_bit++)
+ ;
+
+ if (start_bit >= un->un_rrd_num) {
+ if (un->un_rr_clean_start_bit == 0) {
+ return (0);
+ } else {
+ /* Wrapped without a hit: rescan once from bit 0. */
+ un->un_rr_clean_start_bit = 0;
+ goto retry_dirty_scan;
+ }
+ }
+
+ /* how much to fit into this message */
+ data_bytes = MIN(howmany(un->un_rrd_num - start_bit, NBBY),
+ MDMN_MSG_RR_CLEAN_DATA_MAX_BYTES);
+
+ (*msgp) = kmem_zalloc(MDMN_MSG_RR_CLEAN_SIZE_DATA(data_bytes),
+ KM_SLEEP);
+
+ (*msgp)->rr_nodeid = md_mn_mynode_id;
+ (*msgp)->rr_mnum = MD_SID(un);
+ MDMN_MSG_RR_CLEAN_START_SIZE_SET(*msgp, start_bit, data_bytes);
+
+ next_bit = MIN(start_bit + data_bytes * NBBY, un->un_rrd_num);
+
+ for (i = start_bit; i < next_bit; i++) {
+ if (un->c.un_status & MD_UN_KEEP_DIRTY && IS_KEEPDIRTY(i, un)) {
+ continue;
+ }
+ if (!IS_REGION_DIRTY(i, un)) {
+ continue;
+ }
+ if (un->un_outstanding_writes[i] != 0) {
+ (*activep)++;
+ continue;
+ }
+
+ /*
+ * Handle the case where a resync has completed and we still
+ * have the un_dirty_bm[] entries marked as dirty (these are
+ * the most recent DRL re-read from the replica). They need
+ * to be cleared from our un_dirty_bm[] but they will not have
+ * corresponding un_pernode_dirty[] entries set unless (and
+ * until) further write()s have been issued to the area.
+ * This handles the case where only the un_dirty_bm[] entry is
+ * set. Without this we'd not clear this region until a local
+ * write is issued to the affected area.
+ */
+ if (IS_PERNODE_DIRTY(md_mn_mynode_id, i, un) ||
+ (un->un_pernode_dirty_sum[i] == (uchar_t)0)) {
+ /* First sighting only marks the region going-clean. */
+ if (!IS_GOING_CLEAN(i, un)) {
+ SET_GOING_CLEAN(i, un);
+ (*activep)++;
+ continue;
+ }
+ /*
+ * Now we've got a flagged pernode_dirty, _or_ a clean
+ * bitmap entry to process. Update the bitmap to flush
+ * the REGION_DIRTY / GOING_CLEAN bits when we send the
+ * cross-cluster message.
+ */
+ cleared_dirty++;
+ setbit(MDMN_MSG_RR_CLEAN_DATA(*msgp), i - start_bit);
+ } else {
+ /*
+ * Not marked as active in the pernode bitmap, so skip
+ * any update to this. We just increment the 0 count
+ * and adjust the active count by any outstanding
+ * un_pernode_dirty_sum[] entries. This means we don't
+ * leave the mirror permanently dirty.
+ */
+ (*activep) += (int)un->un_pernode_dirty_sum[i];
+ }
+ }
+ if (!cleared_dirty) {
+ kmem_free(*msgp, MDMN_MSG_RR_CLEAN_SIZE_DATA(data_bytes));
+ *msgp = NULL;
+ }
+ un->un_rr_clean_start_bit = next_bit;
+ return (cleared_dirty);
+}
+
+/*
+ * process_resync_regions_non_owner:
+ * --------------------------------
+ * One cleaning pass over the dirty-region bitmap of a mirror that is owned
+ * by another node. The regions that are ready to be cleaned are collected
+ * by mirror_generate_rr_bitmap() and sent to the mirror owner in an
+ * MD_MN_MSG_RR_CLEAN message; only after a successful send are they cleared
+ * from the local per-node and un_dirty_bm bitmaps.
+ *
+ * There are three paths into here:
+ *
+ * md_daemon -> check_resync_regions -> prr
+ * mirror_internal_close -> mirror_process_unit_resync -> prr
+ * mirror_set_capability -> mirror_process_unit_resync -> prr
+ *
+ * The first one is a kernel daemon, the other two result from system calls.
+ * Thus, only the first case needs to deal with kernel CPR activity. This
+ * is indicated by the cprinfop being non-NULL for kernel daemon calls, and
+ * NULL for system call paths.
+ *
+ * The unit readerlock is dropped and re-acquired inside this routine (around
+ * taking un_rrp_inflight_mx and around the message send); it is held again
+ * on every return path.
+ *
+ * Returns the 'active' count accumulated by mirror_generate_rr_bitmap().
+ */
+static int
+process_resync_regions_non_owner(mm_unit_t *un, callb_cpr_t *cprinfop)
+{
+ int i, start, end;
+ int cleared_dirty = 0;
+ /* Number of reasons why we can not proceed shutting down the mirror. */
int active = 0;
set_t setno = MD_UN2SET(un);
+ md_mn_msg_rr_clean_t *rmsg;
+ md_mn_kresult_t *kres;
+ int rval;
+ minor_t mnum = MD_SID(un);
+ mdi_unit_t *ui = MDI_UNIT(mnum);
+ md_mn_nodeid_t owner_node;
/*
- * Resync region processing must be
- * single threaded. We can't use
- * un_resync_mx for this purpose
- * since this mutex gets released
+ * We drop the readerlock here to assist lock ordering with
+ * update_resync. Once we have the un_rrp_inflight_mx, we
+ * can re-acquire it.
+ */
+ md_unit_readerexit(ui);
+
+ /*
+ * Resync region processing must be single threaded. We can't use
+ * un_resync_mx for this purpose since this mutex gets released
* when blocking on un_resync_cv.
*/
mutex_enter(&un->un_rrp_inflight_mx);
+ (void) md_unit_readerlock(ui);
+
mutex_enter(&un->un_resync_mx);
- while (un->un_resync_flg & MM_RF_STALL_CLEAN)
- cv_wait(&un->un_resync_cv, &un->un_resync_mx);
- /*
- * For a mirror we can only update the resync-record if we currently
- * own the mirror. If we are called and we don't have ownership we bail
- * out before scanning the outstanding_writes[] array. This cannot be
- * set as we'd have become the owner before initiating the i/o to the
- * mirror.
- * NOTE: we only need to check here (before scanning the array) as we
- * are called with the readerlock held. This means that a change
- * of ownership away from us will block until this resync check
- * has completed.
- */
- if (MD_MNSET_SETNO(setno)) {
- if (!MD_MN_MIRROR_OWNER(un)) {
- mutex_exit(&un->un_resync_mx);
+ rw_enter(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1], RW_READER);
+ cleared_dirty = mirror_generate_rr_bitmap(un, &rmsg, &active);
+ rw_exit(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1]);
+
+ if (cleared_dirty) {
+ /* Remember the owner so a change during the send is noticed. */
+ owner_node = un->un_mirror_owner;
+ mutex_exit(&un->un_resync_mx);
+
+ /*
+ * Transmit the 'to-be-cleared' bitmap to all cluster nodes.
+ * Receipt of the message will cause the mirror owner to
+ * update the on-disk DRL.
+ */
+
+ kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
+
+ /* release readerlock before sending message */
+ md_unit_readerexit(ui);
+
+ if (cprinfop) {
+ mutex_enter(&un->un_prr_cpr_mx);
+ CALLB_CPR_SAFE_BEGIN(cprinfop);
+ }
+
+ rval = mdmn_ksend_message(setno, MD_MN_MSG_RR_CLEAN,
+ MD_MSGF_NO_LOG|MD_MSGF_BLK_SIGNAL|MD_MSGF_KSEND_NORETRY|
+ MD_MSGF_DIRECTED, un->un_mirror_owner,
+ (char *)rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg), kres);
+
+ if (cprinfop) {
+ CALLB_CPR_SAFE_END(cprinfop, &un->un_prr_cpr_mx);
+ mutex_exit(&un->un_prr_cpr_mx);
+ }
+
+ /* reacquire readerlock after message */
+ (void) md_unit_readerlock(ui);
+
+ if ((!MDMN_KSEND_MSG_OK(rval, kres)) &&
+ (kres->kmmr_comm_state != MDMNE_NOT_JOINED)) {
+ /* if commd is gone, no point in printing a message */
+ if (md_mn_is_commd_present())
+ mdmn_ksend_show_error(rval, kres, "RR_CLEAN");
+ kmem_free(kres, sizeof (md_mn_kresult_t));
+ kmem_free(rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg));
mutex_exit(&un->un_rrp_inflight_mx);
return (active);
}
+ kmem_free(kres, sizeof (md_mn_kresult_t));
+
+ /*
+ * If ownership changed while we were sending, we probably
+ * sent the message to the wrong node. Leave fixing that for
+ * the next cycle.
+ */
+ if (un->un_mirror_owner != owner_node) {
+ /*
+ * The bitmap message is not used any further on this
+ * path, so release it here just like the other exits
+ * do; otherwise it is leaked.
+ */
+ kmem_free(rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg));
+ mutex_exit(&un->un_rrp_inflight_mx);
+ return (active);
+ }
+
+ /*
+ * Now that we've sent the message, clear them from the
+ * pernode_dirty arrays. These are ONLY cleared on a
+ * successful send, and failure has no impact.
+ */
+ cleared_dirty = 0;
+ start = MDMN_MSG_RR_CLEAN_START_BIT(rmsg);
+ end = start + MDMN_MSG_RR_CLEAN_DATA_BYTES(rmsg) * NBBY;
+ mutex_enter(&un->un_resync_mx);
+ rw_enter(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1],
+ RW_READER);
+ for (i = start; i < end; i++) {
+ if (isset(MDMN_MSG_RR_CLEAN_DATA(rmsg),
+ i - start)) {
+ if (IS_PERNODE_DIRTY(md_mn_mynode_id, i, un)) {
+ un->un_pernode_dirty_sum[i]--;
+ CLR_PERNODE_DIRTY(md_mn_mynode_id, i,
+ un);
+ }
+ if (IS_REGION_DIRTY(i, un)) {
+ cleared_dirty++;
+ CLR_REGION_DIRTY(i, un);
+ CLR_GOING_CLEAN(i, un);
+ }
+ }
+ }
+ rw_exit(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1]);
+
+ kmem_free(rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg));
}
+ mutex_exit(&un->un_resync_mx);
- for (i = 0; i < un->un_rrd_num; i++) {
+ mutex_exit(&un->un_rrp_inflight_mx);
- if (un->c.un_status & MD_UN_KEEP_DIRTY)
- if (IS_KEEPDIRTY(i, un))
- continue;
+ return (active);
+}
- if (!IS_REGION_DIRTY(i, un))
- continue;
- if (un->un_outstanding_writes[i] != 0) {
- active++;
- continue;
+static int
+process_resync_regions_owner(mm_unit_t *un)
+{
+ int i, start, end;
+ int cleared_dirty = 0;
+ /* Number of reasons why we can not proceed shutting down the mirror. */
+ int active = 0;
+ set_t setno = MD_UN2SET(un);
+ int mnset = MD_MNSET_SETNO(setno);
+ md_mn_msg_rr_clean_t *rmsg;
+ minor_t mnum = MD_SID(un);
+ mdi_unit_t *ui = MDI_UNIT(mnum);
+
+ /*
+ * We drop the readerlock here to assist lock ordering with
+ * update_resync. Once we have the un_rrp_inflight_mx, we
+ * can re-acquire it.
+ */
+ md_unit_readerexit(ui);
+
+ /*
+ * Resync region processing must be single threaded. We can't use
+ * un_resync_mx for this purpose since this mutex gets released
+ * when blocking on un_resync_cv.
+ */
+ mutex_enter(&un->un_rrp_inflight_mx);
+
+ (void) md_unit_readerlock(ui);
+
+ mutex_enter(&un->un_resync_mx);
+ un->un_waiting_to_clear++;
+ while (un->un_resync_flg & MM_RF_STALL_CLEAN)
+ cv_wait(&un->un_resync_cv, &un->un_resync_mx);
+ un->un_waiting_to_clear--;
+
+ if (mnset) {
+ rw_enter(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1],
+ RW_READER);
+ cleared_dirty = mirror_generate_rr_bitmap(un, &rmsg, &active);
+
+ if (cleared_dirty) {
+ /*
+ * Clear the bits from the pernode_dirty arrays.
+ * If that results in any being cleared from the
+ * un_dirty_bm, commit it.
+ */
+ cleared_dirty = 0;
+ start = MDMN_MSG_RR_CLEAN_START_BIT(rmsg);
+ end = start + MDMN_MSG_RR_CLEAN_DATA_BYTES(rmsg) * NBBY;
+ for (i = start; i < end; i++) {
+ if (isset(MDMN_MSG_RR_CLEAN_DATA(rmsg),
+ i - start)) {
+ if (IS_PERNODE_DIRTY(md_mn_mynode_id, i,
+ un)) {
+ un->un_pernode_dirty_sum[i]--;
+ CLR_PERNODE_DIRTY(
+ md_mn_mynode_id, i, un);
+ }
+ if (un->un_pernode_dirty_sum[i] == 0) {
+ cleared_dirty++;
+ CLR_REGION_DIRTY(i, un);
+ CLR_GOING_CLEAN(i, un);
+ }
+ }
+ }
+ kmem_free(rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg));
}
+ rw_exit(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1]);
+ } else {
+ for (i = 0; i < un->un_rrd_num; i++) {
+ if (un->c.un_status & MD_UN_KEEP_DIRTY)
+ if (IS_KEEPDIRTY(i, un))
+ continue;
- if (!IS_GOING_CLEAN(i, un)) {
- SET_GOING_CLEAN(i, un);
- active++;
- continue;
+ if (!IS_REGION_DIRTY(i, un))
+ continue;
+ if (un->un_outstanding_writes[i] != 0) {
+ active++;
+ continue;
+ }
+
+ if (!IS_GOING_CLEAN(i, un)) {
+ SET_GOING_CLEAN(i, un);
+ active++;
+ continue;
+ }
+ CLR_REGION_DIRTY(i, un);
+ CLR_GOING_CLEAN(i, un);
+ cleared_dirty++;
}
- CLR_REGION_DIRTY(i, un);
- CLR_GOING_CLEAN(i, un);
- cleared_dirty = 1;
}
+
if (cleared_dirty) {
un->un_resync_flg |= MM_RF_GATECLOSED;
mutex_exit(&un->un_resync_mx);
-
mddb_commitrec_wrapper(un->un_rr_dirty_recid);
-
mutex_enter(&un->un_resync_mx);
un->un_resync_flg &= ~MM_RF_GATECLOSED;
- if (un->un_waiting_to_mark != 0) {
+
+ if (un->un_waiting_to_mark != 0 ||
+ un->un_waiting_to_clear != 0) {
active++;
cv_broadcast(&un->un_resync_cv);
}
@@ -227,6 +509,29 @@ process_resync_regions(mm_unit_t *un)
return (active);
}
+static int
+process_resync_regions(mm_unit_t *un, callb_cpr_t *cprinfop)
+{
+ /*
+ * Pick the cleaning routine for this mirror. We are called with the
+ * readerlock held, so a change of ownership away from us will block
+ * until this resync check has completed; checking ownership once,
+ * before the outstanding_writes[] array is scanned, is sufficient.
+ */
+
+ /* A set that is not multi-node always takes the owner path. */
+ if (!MD_MNSET_SETNO(MD_UN2SET(un)))
+ return (process_resync_regions_owner(un));
+
+ /*
+ * Only the mirror owner can update the on-disk resync-record, so
+ * with no owner at all there is nothing to do.
+ */
+ if (MD_MN_NO_MIRROR_OWNER(un))
+ return (0);
+
+ if (MD_MN_MIRROR_OWNER(un))
+ return (process_resync_regions_owner(un));
+
+ /* Another node owns the mirror; bail out if commd is not there. */
+ if (!md_mn_is_commd_present_lite())
+ return (0);
+
+ return (process_resync_regions_non_owner(un, cprinfop));
+}
+
/*
* Function that is callable from other modules to provide
* ability to cleanup dirty region bitmap on demand. Used
@@ -240,7 +545,7 @@ mirror_process_unit_resync(mm_unit_t *un)
{
int cleans = 0;
- while (process_resync_regions(un)) {
+ while (process_resync_regions(un, NULL)) {
cleans++;
if (cleans >= md_mirror_rr_cleans) {
@@ -265,6 +570,7 @@ check_resync_regions(daemon_request_t *timeout)
mdi_unit_t *ui;
mm_unit_t *un;
md_link_t *next;
+ callb_cpr_t cprinfo;
rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
@@ -272,8 +578,18 @@ check_resync_regions(daemon_request_t *timeout)
if (md_get_setstatus(next->ln_setno) & MD_SET_STALE)
continue;
+ un = MD_UNIT(next->ln_id);
+
+ /*
+ * Register this resync thread with the CPR mechanism. This
+ * allows us to detect when the system is suspended and so
+ * keep track of the RPC failure condition.
+ */
+ CALLB_CPR_INIT(&cprinfo, &un->un_prr_cpr_mx, callb_md_mrs_cpr,
+ "check_resync_regions");
+
ui = MDI_UNIT(next->ln_id);
- un = (mm_unit_t *)md_unit_readerlock(ui);
+ (void) md_unit_readerlock(ui);
/*
* Do not clean up resync regions if it is an ABR
@@ -287,8 +603,13 @@ check_resync_regions(daemon_request_t *timeout)
continue;
}
- (void) process_resync_regions(un);
+ (void) process_resync_regions(un, &cprinfo);
+
md_unit_readerexit(ui);
+
+ /* Remove this thread from the CPR callback table. */
+ mutex_enter(&un->un_prr_cpr_mx);
+ CALLB_CPR_EXIT(&cprinfo);
}
rw_exit(&mirror_md_ops.md_link_rw.lock);
@@ -306,7 +627,7 @@ md_mirror_timeout(void *throwaway)
mutex_enter(&mirror_timeout.dr_mx);
if (!mirror_timeout.dr_pending) {
mirror_timeout.dr_pending = 1;
- daemon_request(&md_mstr_daemon, check_resync_regions,
+ daemon_request(&md_mto_daemon, check_resync_regions,
(daemon_queue_t *)&mirror_timeout, REQ_OLD);
}
@@ -466,6 +787,7 @@ unit_setup_resync(mm_unit_t *un, int snarfing)
un->un_resync_flg = 0;
un->un_waiting_to_mark = 0;
un->un_waiting_to_commit = 0;
+ un->un_waiting_to_clear = 0;
un->un_goingclean_bm = NULL;
un->un_goingdirty_bm = NULL;
@@ -505,6 +827,27 @@ unit_setup_resync(mm_unit_t *un, int snarfing)
un->un_resync_bm = (uchar_t *)kmem_zalloc((uint_t)(howmany(
un->un_rrd_num, NBBY)), KM_SLEEP);
+ /*
+ * Allocate pernode bitmap for this node. All other nodes' maps will
+ * be created 'on-the-fly' in the ioctl message handler
+ */
+ if (MD_MNSET_SETNO(MD_UN2SET(un))) {
+ un->un_pernode_dirty_sum =
+ (uchar_t *)kmem_zalloc(un->un_rrd_num, KM_SLEEP);
+ if (md_mn_mynode_id > 0) {
+ un->un_pernode_dirty_bm[md_mn_mynode_id-1] = (uchar_t *)
+ kmem_zalloc((uint_t)(howmany(un->un_rrd_num, NBBY)),
+ KM_SLEEP);
+ }
+
+ /*
+ * Allocate taskq to process deferred (due to locking) RR_CLEAN
+ * requests.
+ */
+ un->un_drl_task = (ddi_taskq_t *)md_create_taskq(MD_UN2SET(un),
+ MD_SID(un));
+ }
+
if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
return (0);
@@ -734,7 +1077,7 @@ send_mn_resync_done_message(
CALLB_CPR_SAFE_BEGIN(&un->un_rs_cprinfo);
rval = mdmn_ksend_message(setno, MD_MN_MSG_RESYNC_PHASE_DONE,
- MD_MSGF_NO_LOG, (char *)rmsg, sizeof (md_mn_msg_resync_t), kres);
+ MD_MSGF_NO_LOG, 0, (char *)rmsg, sizeof (md_mn_msg_resync_t), kres);
CALLB_CPR_SAFE_END(&un->un_rs_cprinfo, &un->un_rs_cpr_mx);
mutex_exit(&un->un_rs_cpr_mx);
@@ -743,6 +1086,12 @@ send_mn_resync_done_message(
if ((!MDMN_KSEND_MSG_OK(rval, kres)) &&
(kres->kmmr_comm_state != MDMNE_NOT_JOINED)) {
mdmn_ksend_show_error(rval, kres, "RESYNC_PHASE_DONE");
+ /* If we're shutting down already, pause things here. */
+ if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
+ while (!md_mn_is_commd_present()) {
+ delay(md_hz);
+ }
+ }
cmn_err(CE_PANIC, "ksend_message failure: RESYNC_PHASE_DONE");
}
kmem_free(kres, sizeof (md_mn_kresult_t));
@@ -814,13 +1163,19 @@ send_mn_resync_next_message(
CALLB_CPR_SAFE_BEGIN(&un->un_rs_cprinfo);
rval = mdmn_ksend_message(setno, MD_MN_MSG_RESYNC_NEXT, MD_MSGF_NO_LOG,
- (char *)rmsg, sizeof (md_mn_msg_resync_t), kres);
+ 0, (char *)rmsg, sizeof (md_mn_msg_resync_t), kres);
CALLB_CPR_SAFE_END(&un->un_rs_cprinfo, &un->un_rs_cpr_mx);
mutex_exit(&un->un_rs_cpr_mx);
if (!MDMN_KSEND_MSG_OK(rval, kres)) {
mdmn_ksend_show_error(rval, kres, "RESYNC_NEXT");
+ /* If we're shutting down already, pause things here. */
+ if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
+ while (!md_mn_is_commd_present()) {
+ delay(md_hz);
+ }
+ }
cmn_err(CE_PANIC, "ksend_message failure: RESYNC_NEXT");
}
kmem_free(kres, sizeof (md_mn_kresult_t));
@@ -2301,7 +2656,7 @@ bail_out:
CALLB_CPR_SAFE_BEGIN(&un->un_rs_cprinfo);
rval = mdmn_ksend_message(setno,
- MD_MN_MSG_RESYNC_FINISH, MD_MSGF_NO_LOG,
+ MD_MN_MSG_RESYNC_FINISH, MD_MSGF_NO_LOG, 0,
(char *)rmsg, sizeof (md_mn_msg_resync_t), kres);
CALLB_CPR_SAFE_END(&un->un_rs_cprinfo,
@@ -2311,6 +2666,12 @@ bail_out:
if (!MDMN_KSEND_MSG_OK(rval, kres)) {
mdmn_ksend_show_error(rval, kres,
"RESYNC_FINISH");
+ /* If we're shutting down, pause things here. */
+ if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
+ while (!md_mn_is_commd_present()) {
+ delay(md_hz);
+ }
+ }
cmn_err(CE_PANIC,
"ksend_message failure: RESYNC_FINISH");
}
@@ -2693,30 +3054,209 @@ mirror_ioctl_resync(
}
int
-mirror_mark_resync_region(struct mm_unit *un,
- diskaddr_t startblk, diskaddr_t endblk)
+mirror_mark_resync_region_non_owner(struct mm_unit *un,
+ diskaddr_t startblk, diskaddr_t endblk, md_mn_nodeid_t source_node)
{
- int no_change;
- size_t start_rr;
- size_t current_rr;
- size_t end_rr;
+ int no_change;
+ size_t start_rr;
+ size_t current_rr;
+ size_t end_rr;
+ md_mn_msg_rr_dirty_t *rr;
+ md_mn_kresult_t *kres;
+ set_t setno = MD_UN2SET(un);
+ int rval;
+ md_mn_nodeid_t node_idx = source_node - 1;
+ mdi_unit_t *ui = MDI_UNIT(MD_SID(un));
+ md_mn_nodeid_t owner_node;
+ minor_t mnum = MD_SID(un);
if (un->un_nsm < 2)
return (0);
+ /*
+ * Non-owner side of mirror_mark_resync_region(): mark the regions
+ * covering [startblk..endblk] dirty in the in-core bitmaps on behalf
+ * of source_node (1-based) and, if anything changed, send an
+ * MD_MN_MSG_RR_DIRTY message to the mirror owner.
+ * Returns 0 on success (or when there is nothing to do) and 1 if the
+ * message could not be sent.
+ *
+ * Check to see if we have a un_pernode_dirty_bm[] entry allocated. If
+ * not, allocate it and then fill the [start..end] entries.
+ * Update un_pernode_dirty_sum if we've gone 0->1.
+ * Update un_dirty_bm if the corresponding entries are clear.
+ */
+ rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_WRITER);
+ if (un->un_pernode_dirty_bm[node_idx] == NULL) {
+ un->un_pernode_dirty_bm[node_idx] =
+ (uchar_t *)kmem_zalloc(
+ (uint_t)howmany(un->un_rrd_num, NBBY), KM_SLEEP);
+ }
+ rw_exit(&un->un_pernode_dirty_mx[node_idx]);
+
BLK_TO_RR(end_rr, endblk, un);
BLK_TO_RR(start_rr, startblk, un);
- mutex_enter(&un->un_resync_mx);
no_change = 1;
+
+ mutex_enter(&un->un_resync_mx);
+ rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_READER);
for (current_rr = start_rr; current_rr <= end_rr; current_rr++) {
un->un_outstanding_writes[current_rr]++;
+ if (!IS_PERNODE_DIRTY(source_node, current_rr, un)) {
+ un->un_pernode_dirty_sum[current_rr]++;
+ SET_PERNODE_DIRTY(source_node, current_rr, un);
+ }
+ CLR_GOING_CLEAN(current_rr, un);
+ if (!IS_REGION_DIRTY(current_rr, un)) {
+ no_change = 0;
+ SET_REGION_DIRTY(current_rr, un);
+ SET_GOING_DIRTY(current_rr, un);
+ } else if (IS_GOING_DIRTY(current_rr, un))
+ no_change = 0;
+ }
+ rw_exit(&un->un_pernode_dirty_mx[node_idx]);
+ mutex_exit(&un->un_resync_mx);
+
+ if (no_change) {
+ return (0);
+ }
+
+ /*
+ * If we have dirty regions to commit, send a
+ * message to the owning node so that the
+ * in-core bitmap gets updated appropriately.
+ * TODO: make this a kmem_cache pool to improve
+ * alloc/free performance ???
+ */
+ kres = (md_mn_kresult_t *)kmem_zalloc(sizeof (md_mn_kresult_t),
+ KM_SLEEP);
+ rr = (md_mn_msg_rr_dirty_t *)kmem_alloc(sizeof (md_mn_msg_rr_dirty_t),
+ KM_SLEEP);
+
+resend_mmrr:
+ owner_node = un->un_mirror_owner;
+
+ rr->rr_mnum = mnum;
+ rr->rr_nodeid = md_mn_mynode_id;
+ rr->rr_range = (ushort_t)start_rr << 16;
+ rr->rr_range |= (ushort_t)end_rr & 0xFFFF;
+
+ /* release readerlock before sending message */
+ md_unit_readerexit(ui);
+
+ rval = mdmn_ksend_message(setno, MD_MN_MSG_RR_DIRTY,
+ MD_MSGF_NO_LOG|MD_MSGF_BLK_SIGNAL|MD_MSGF_DIRECTED,
+ un->un_mirror_owner, (char *)rr,
+ sizeof (md_mn_msg_rr_dirty_t), kres);
+
+ /* reacquire readerlock on message completion */
+ (void) md_unit_readerlock(ui);
+
+ /* if the message send failed, note it, and pass an error back up */
+ if (!MDMN_KSEND_MSG_OK(rval, kres)) {
+ /* if commd is gone, no point in printing a message */
+ if (md_mn_is_commd_present())
+ mdmn_ksend_show_error(rval, kres, "RR_DIRTY");
+ kmem_free(kres, sizeof (md_mn_kresult_t));
+ kmem_free(rr, sizeof (md_mn_msg_rr_dirty_t));
+ return (1);
+ }
+
+ /*
+ * If the owner changed while we were sending the message, and it's
+ * not us, the new mirror owner won't yet have done the right thing
+ * with our data. Let him know. If we became the owner, we'll
+ * deal with that differently below. Note that receiving a message
+ * about another node twice won't hurt anything.
+ */
+ if (un->un_mirror_owner != owner_node && !MD_MN_MIRROR_OWNER(un))
+ goto resend_mmrr;
+
+ kmem_free(kres, sizeof (md_mn_kresult_t));
+ kmem_free(rr, sizeof (md_mn_msg_rr_dirty_t));
+
+ mutex_enter(&un->un_resync_mx);
+
+ /*
+ * If we became the owner while we were sending the message, we have
+ * dirty bits in the un_pernode_dirty_bm that aren't yet reflected
+ * in the un_dirty_bm, as it was re-read from disk, and our bits
+ * are also not reflected in the on-disk DRL. Fix that now.
+ */
+ if (MD_MN_MIRROR_OWNER(un)) {
+ rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_WRITER);
+ mirror_copy_rr(howmany(un->un_rrd_num, NBBY),
+ un->un_pernode_dirty_bm[node_idx], un->un_dirty_bm);
+ rw_exit(&un->un_pernode_dirty_mx[node_idx]);
+
+ un->un_resync_flg |= MM_RF_COMMITING | MM_RF_GATECLOSED;
+
+ mutex_exit(&un->un_resync_mx);
+ mddb_commitrec_wrapper(un->un_rr_dirty_recid);
+ mutex_enter(&un->un_resync_mx);
+
+ un->un_resync_flg &= ~(MM_RF_COMMITING | MM_RF_GATECLOSED);
+ cv_broadcast(&un->un_resync_cv);
+ }
+
+ for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
+ CLR_GOING_DIRTY(current_rr, un);
+
+ mutex_exit(&un->un_resync_mx);
+
+ return (0);
+}
+
+int
+mirror_mark_resync_region_owner(struct mm_unit *un,
+ diskaddr_t startblk, diskaddr_t endblk, md_mn_nodeid_t source_node)
+{
+ int no_change;
+ size_t start_rr;
+ size_t current_rr;
+ size_t end_rr;
+ int mnset = MD_MNSET_SETNO(MD_UN2SET(un));
+ md_mn_nodeid_t node_idx = source_node - 1;
+
+ if (un->un_nsm < 2)
+ return (0);
+
+ /*
+ * Check to see if we have a un_pernode_dirty_bm[] entry allocated. If
+ * not, allocate it and then fill the [start..end] entries.
+ * Update un_pernode_dirty_sum if we've gone 0->1.
+ * Update un_dirty_bm if the corresponding entries are clear.
+ */
+ if (mnset) {
+ rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_WRITER);
+ if (un->un_pernode_dirty_bm[node_idx] == NULL) {
+ un->un_pernode_dirty_bm[node_idx] =
+ (uchar_t *)kmem_zalloc(
+ (uint_t)howmany(un->un_rrd_num, NBBY), KM_SLEEP);
+ }
+ rw_exit(&un->un_pernode_dirty_mx[node_idx]);
+ }
+
+ mutex_enter(&un->un_resync_mx);
+
+ if (mnset)
+ rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_READER);
+
+ no_change = 1;
+ BLK_TO_RR(end_rr, endblk, un);
+ BLK_TO_RR(start_rr, startblk, un);
+ for (current_rr = start_rr; current_rr <= end_rr; current_rr++) {
+ if (!mnset || source_node == md_mn_mynode_id)
+ un->un_outstanding_writes[current_rr]++;
+ if (mnset) {
+ if (!IS_PERNODE_DIRTY(source_node, current_rr, un))
+ un->un_pernode_dirty_sum[current_rr]++;
+ SET_PERNODE_DIRTY(source_node, current_rr, un);
+ }
CLR_GOING_CLEAN(current_rr, un);
if (!IS_REGION_DIRTY(current_rr, un))
no_change = 0;
if (IS_GOING_DIRTY(current_rr, un))
no_change = 0;
}
+
+ if (mnset)
+ rw_exit(&un->un_pernode_dirty_mx[node_idx]);
+
if (no_change) {
mutex_exit(&un->un_resync_mx);
return (0);
@@ -2741,7 +3281,7 @@ mirror_mark_resync_region(struct mm_unit *un,
}
}
if (no_change) {
- if (un->un_waiting_to_mark == 0)
+ if (un->un_waiting_to_mark == 0 || un->un_waiting_to_clear != 0)
cv_broadcast(&un->un_resync_cv);
mutex_exit(&un->un_resync_mx);
return (0);
@@ -2749,19 +3289,21 @@ mirror_mark_resync_region(struct mm_unit *un,
un->un_resync_flg |= MM_RF_COMMIT_NEEDED;
un->un_waiting_to_commit++;
- while ((un->un_waiting_to_mark != 0) &&
- (!(un->un_resync_flg & MM_RF_GATECLOSED))) {
+ while (un->un_waiting_to_mark != 0 &&
+ !(un->un_resync_flg & MM_RF_GATECLOSED)) {
if (panicstr)
return (1);
cv_wait(&un->un_resync_cv, &un->un_resync_mx);
}
- if ((un->un_resync_flg & MM_RF_COMMIT_NEEDED)) {
+ if (un->un_resync_flg & MM_RF_COMMIT_NEEDED) {
un->un_resync_flg |= MM_RF_COMMITING | MM_RF_GATECLOSED;
un->un_resync_flg &= ~MM_RF_COMMIT_NEEDED;
+
mutex_exit(&un->un_resync_mx);
mddb_commitrec_wrapper(un->un_rr_dirty_recid);
mutex_enter(&un->un_resync_mx);
+
un->un_resync_flg &= ~MM_RF_COMMITING;
cv_broadcast(&un->un_resync_cv);
}
@@ -2779,10 +3321,26 @@ mirror_mark_resync_region(struct mm_unit *un,
cv_broadcast(&un->un_resync_cv);
}
mutex_exit(&un->un_resync_mx);
+
return (0);
}
int
+mirror_mark_resync_region(struct mm_unit *un,
+ diskaddr_t startblk, diskaddr_t endblk, md_mn_nodeid_t source_node)
+{
+ /*
+ * Dispatch on ownership: a multi-node mirror that this node does not
+ * own is handled by the non-owner routine, everything else (including
+ * sets that are not multi-node) by the owner routine.
+ */
+ if (MD_MNSET_SETNO(MD_UN2SET(un)) && !MD_MN_MIRROR_OWNER(un)) {
+ return (mirror_mark_resync_region_non_owner(un, startblk,
+ endblk, source_node));
+ }
+
+ return (mirror_mark_resync_region_owner(un, startblk, endblk,
+ source_node));
+}
+
+int
mirror_resize_resync_regions(mm_unit_t *un, diskaddr_t new_tb)
{
short *owp;
@@ -2793,9 +3351,10 @@ mirror_resize_resync_regions(mm_unit_t *un, diskaddr_t new_tb)
size_t size;
mddb_recid_t recid, old_recid;
uchar_t *old_dirty_bm;
- int i;
+ int i, j;
mddb_type_t typ1;
set_t setno = MD_UN2SET(un);
+ uchar_t *old_pns;
old_nregions = un->un_rrd_num;
new_nregions = (uint_t)((new_tb/un->un_rrd_blksize) + 1);
@@ -2840,6 +3399,11 @@ mirror_resize_resync_regions(mm_unit_t *un, diskaddr_t new_tb)
un->un_outstanding_writes = (short *)kmem_zalloc(
new_nregions * sizeof (short), KM_SLEEP);
+ old_pns = un->un_pernode_dirty_sum;
+ if (old_pns)
+ un->un_pernode_dirty_sum = (uchar_t *)kmem_zalloc(new_nregions,
+ KM_SLEEP);
+
/*
* Now translate the old records into the new
* records
@@ -2847,15 +3411,41 @@ mirror_resize_resync_regions(mm_unit_t *un, diskaddr_t new_tb)
for (i = 0; i < old_nregions; i++) {
/*
* only bring forward the
- * outstanding write counters and the dirty bits
+ * outstanding write counters and the dirty bits and also
+ * the pernode_summary counts
*/
if (!isset(old_dirty_bm, i))
continue;
setbit(un->un_dirty_bm, (i / rr_mult));
un->un_outstanding_writes[(i / rr_mult)] += owp[i];
+ if (old_pns)
+ un->un_pernode_dirty_sum[(i / rr_mult)] += old_pns[i];
}
kmem_free((caddr_t)owp, old_nregions * sizeof (short));
+ if (old_pns)
+ kmem_free((caddr_t)old_pns, old_nregions);
+
+ /*
+ * Copy all non-zero un_pernode_dirty_bm[] arrays to new versions
+ */
+ for (j = 0; j < MD_MNMAXSIDES; j++) {
+ rw_enter(&un->un_pernode_dirty_mx[j], RW_WRITER);
+ old_dirty_bm = un->un_pernode_dirty_bm[j];
+ if (old_dirty_bm) {
+ un->un_pernode_dirty_bm[j] = (uchar_t *)kmem_zalloc(
+ new_bm_size, KM_SLEEP);
+ for (i = 0; i < old_nregions; i++) {
+ if (!isset(old_dirty_bm, i))
+ continue;
+
+ setbit(un->un_pernode_dirty_bm[j],
+ (i / rr_mult));
+ }
+ kmem_free((caddr_t)old_dirty_bm, old_bm_size);
+ }
+ rw_exit(&un->un_pernode_dirty_mx[j]);
+ }
/* Save the old record id */
old_recid = un->un_rr_dirty_recid;
@@ -2891,6 +3481,7 @@ mirror_add_resync_regions(mm_unit_t *un, diskaddr_t new_tb)
mddb_recid_t recid, old_recid;
mddb_type_t typ1;
set_t setno = MD_UN2SET(un);
+ int i;
old_nregions = un->un_rrd_num;
new_nregions = (uint_t)((new_tb/un->un_rrd_blksize) + 1);
@@ -2924,6 +3515,8 @@ mirror_add_resync_regions(mm_unit_t *un, diskaddr_t new_tb)
* un_goingclean_bm
* un_resync_bm
* un_outstanding_writes
+ * un_pernode_dirty_sum
+ * un_pernode_dirty_bm[]
*/
old = un->un_goingdirty_bm;
un->un_goingdirty_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
@@ -2947,6 +3540,28 @@ mirror_add_resync_regions(mm_unit_t *un, diskaddr_t new_tb)
old_nregions * sizeof (short));
kmem_free((caddr_t)owp, (old_nregions * sizeof (short)));
+ old = un->un_pernode_dirty_sum;
+ if (old) {
+ un->un_pernode_dirty_sum = (uchar_t *)kmem_zalloc(
+ new_nregions, KM_SLEEP);
+ bcopy((caddr_t)old, (caddr_t)un->un_pernode_dirty_sum,
+ old_nregions);
+ kmem_free((caddr_t)old, old_nregions);
+ }
+
+ for (i = 0; i < MD_MNMAXSIDES; i++) {
+ rw_enter(&un->un_pernode_dirty_mx[i], RW_WRITER);
+ old = un->un_pernode_dirty_bm[i];
+ if (old) {
+ un->un_pernode_dirty_bm[i] = (uchar_t *)kmem_zalloc(
+ new_bm_size, KM_SLEEP);
+ bcopy((caddr_t)old, (caddr_t)un->un_pernode_dirty_bm[i],
+ old_bm_size);
+ kmem_free((caddr_t)old, old_bm_size);
+ }
+ rw_exit(&un->un_pernode_dirty_mx[i]);
+ }
+
/* Save the old record id */
old_recid = un->un_rr_dirty_recid;
@@ -2980,3 +3595,263 @@ mirror_copy_rr(int sz, uchar_t *src, uchar_t *dest)
for (i = 0; i < sz; i++)
*dest++ |= *src++;
}
+
+/*
+ * mirror_set_dirty_rr:
+ * -------------------
+ * Service a request to mark the resync regions rr_start..rr_end dirty on
+ * behalf of node rr_nodeid.  The request is initiated by a non-owning node
+ * on a clean->dirty transition; it is only the owning node (DRL/mirror
+ * owner) that has to process it, which it does by converting the region
+ * range to a block range and handing it to
+ * mirror_mark_resync_region_owner().
+ *
+ * Returns 0 if the request is ignored (fewer than two submirrors, or we are
+ * not the mirror owner), an mdmderror() value for an invalid unit or node
+ * id, otherwise whatever mirror_mark_resync_region_owner() returns.
+ */
+int
+mirror_set_dirty_rr(md_mn_rr_dirty_params_t *iocp)
+{
+	minor_t		mnum = iocp->rr_mnum;
+	set_t		set = MD_MIN2SET(mnum);
+	md_mn_nodeid_t	src_node = iocp->rr_nodeid;	/* 1-based */
+	int		first_rr = (int)iocp->rr_start;
+	int		last_rr = (int)iocp->rr_end;
+	diskaddr_t	first_blk, last_blk;
+	mm_unit_t	*un;
+
+	mdclrerror(&iocp->mde);
+
+	/* Reject minor numbers that fall outside the configured sets/units */
+	if ((set >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits))
+		return (mdmderror(&iocp->mde, MDE_INVAL_UNIT, mnum));
+
+	/* Must have _NO_ ioctl lock set if we update the RR on-disk */
+	un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL);
+	if (un == NULL)
+		return (mdmderror(&iocp->mde, MDE_UNIT_NOT_SETUP, mnum));
+
+	if (un->c.un_type != MD_METAMIRROR)
+		return (mdmderror(&iocp->mde, MDE_NOT_MM, mnum));
+
+	if (src_node < 1 || src_node >= MD_MNMAXSIDES)
+		return (mdmderror(&iocp->mde, MDE_INVAL_UNIT, mnum));
+
+	/*
+	 * Nothing to do for a mirror with fewer than two submirrors, and
+	 * only the owner of the mirror processes this message.
+	 */
+	if (un->un_nsm < 2 || !MD_MN_MIRROR_OWNER(un))
+		return (0);
+
+	RR_TO_BLK(first_blk, first_rr, un);
+	RR_TO_BLK(last_blk, last_rr, un);
+
+	return (mirror_mark_resync_region_owner(un, first_blk, last_blk,
+	    src_node));
+}
+
+/*
+ * mirror_clean_rr_bits:
+ * --------------------
+ * Clear the un_pernode_dirty_bm[node] entries which are passed in the bitmap
+ * that trails the request.  For each such region the originating node's
+ * reference is dropped (un_pernode_dirty_sum[x] is decremented); once _all_
+ * references are removed (un_pernode_dirty_sum[x] == 0) the region is
+ * 'cleanable' and its dirty / going-clean bits are cleared here.  If any
+ * region was cleaned and we are still the mirror owner, the DRL record is
+ * committed to the replica.
+ * Callable from ioctl / interrupt / whatever context.
+ * un_resync_mx is held on entry; it is dropped and re-acquired around the
+ * record commit, and is held again on return.
+ *
+ * The caller is expected to have validated the unit and rr_nodeid and to
+ * have made sure un_pernode_dirty_bm[rr_nodeid - 1] is allocated.
+ */
+static void
+mirror_clean_rr_bits(
+	md_mn_rr_clean_params_t *iocp)
+{
+	minor_t			mnum = iocp->rr_mnum;
+	mm_unit_t		*un;
+	uint_t			cleared_bits;
+	md_mn_nodeid_t		node = iocp->rr_nodeid - 1;	/* 0-based */
+	md_mn_nodeid_t		orignode = iocp->rr_nodeid;	/* 1-based */
+	int			i, start, end;
+
+	un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL);
+
+	cleared_bits = 0;
+	start = MDMN_RR_CLEAN_PARAMS_START_BIT(iocp);
+	end = start + MDMN_RR_CLEAN_PARAMS_DATA_BYTES(iocp) * NBBY;
+
+	/*
+	 * Taken as reader: this pins the per-node bitmap pointer against
+	 * the resize paths, which swap it under RW_WRITER.
+	 * NOTE(review): the bit updates themselves appear to rely on
+	 * un_resync_mx for mutual exclusion -- confirm.
+	 */
+	rw_enter(&un->un_pernode_dirty_mx[node], RW_READER);
+
+	/*
+	 * The range comes straight from the request.  Never walk past the
+	 * number of resync regions this unit actually has, as the per-node
+	 * bitmap and un_pernode_dirty_sum[] are sized from un_rrd_num.
+	 */
+	if (end > (int)un->un_rrd_num)
+		end = (int)un->un_rrd_num;
+
+	for (i = start; i < end; i++) {
+		if (isset(MDMN_RR_CLEAN_PARAMS_DATA(iocp), i - start)) {
+			/* Drop this node's reference, if it holds one */
+			if (IS_PERNODE_DIRTY(orignode, i, un)) {
+				un->un_pernode_dirty_sum[i]--;
+				CLR_PERNODE_DIRTY(orignode, i, un);
+			}
+			/* No node references the region any more */
+			if (un->un_pernode_dirty_sum[i] == 0) {
+				cleared_bits++;
+				CLR_REGION_DIRTY(i, un);
+				CLR_GOING_CLEAN(i, un);
+			}
+		}
+	}
+	rw_exit(&un->un_pernode_dirty_mx[node]);
+	if (cleared_bits) {
+		/*
+		 * We can only be called iff we are the mirror owner, however
+		 * as this is a (potentially) decoupled routine the ownership
+		 * may have moved from us by the time we get to execute the
+		 * bit clearing. Hence we still need to check for being the
+		 * owner before flushing the DRL to the replica.
+		 */
+		if (MD_MN_MIRROR_OWNER(un)) {
+			mutex_exit(&un->un_resync_mx);
+			mddb_commitrec_wrapper(un->un_rr_dirty_recid);
+			mutex_enter(&un->un_resync_mx);
+		}
+	}
+}
+
+/*
+ * mirror_drl_task:
+ * ---------------
+ * Service routine for clearing the DRL bits on a deferred MD_MN_RR_CLEAN call
+ * (dispatched by mirror_set_clean_rr() when it could not clear immediately).
+ * We take un_rrp_inflight_mx and un_resync_mx, wait on un_resync_cv until
+ * MM_RF_STALL_CLEAN is no longer set, and then clear the necessary bits with
+ * MM_RF_GATECLOSED set for the duration.
+ * On completion, we must also free the passed in argument as it is allocated
+ * at the end of the ioctl handler and won't be freed on completion.
+ *
+ * arg: a private copy of the md_mn_rr_clean_params_t request (header plus
+ * trailing bitmap); ownership passes to this routine.
+ */
+static void
+mirror_drl_task(void *arg)
+{
+ md_mn_rr_clean_params_t *iocp = (md_mn_rr_clean_params_t *)arg;
+ minor_t mnum = iocp->rr_mnum;
+ mm_unit_t *un;
+
+ /*
+ * NOTE(review): the result is used without a NULL check; this presumes
+ * the unit cannot go away while a task is queued on its taskq -- confirm.
+ */
+ un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL);
+
+ mutex_enter(&un->un_rrp_inflight_mx);
+ mutex_enter(&un->un_resync_mx);
+ /* Advertise that a clearer is blocked while we wait for the stall to end */
+ un->un_waiting_to_clear++;
+ while (un->un_resync_flg & MM_RF_STALL_CLEAN)
+ cv_wait(&un->un_resync_cv, &un->un_resync_mx);
+ un->un_waiting_to_clear--;
+
+ /* Close the gate around the bitmap update, then reopen it */
+ un->un_resync_flg |= MM_RF_GATECLOSED;
+ mirror_clean_rr_bits(iocp);
+ un->un_resync_flg &= ~MM_RF_GATECLOSED;
+ /* Wake anybody who queued up behind us to mark or clear regions */
+ if (un->un_waiting_to_mark != 0 || un->un_waiting_to_clear != 0) {
+ cv_broadcast(&un->un_resync_cv);
+ }
+ mutex_exit(&un->un_resync_mx);
+ mutex_exit(&un->un_rrp_inflight_mx);
+
+ /* The size is computed from iocp's own header before it is released */
+ kmem_free((caddr_t)iocp, MDMN_RR_CLEAN_PARAMS_SIZE(iocp));
+}
+
+/*
+ * mirror_set_clean_rr:
+ * -------------------
+ * Clear the un_pernode_dirty_bm[node] entries which are passed in the bitmap
+ * Once _all_ references are removed (un_pernode_dirty_sum[x] == 0) this region
+ * is 'cleanable' and will get flushed out by clearing un_dirty_bm[] on all
+ * nodes.
+ *
+ * Only the mirror-owner need process this message as it is the only RR updater.
+ * Non-owner nodes issue this request, but as we have no point-to-point message
+ * support we will receive the message on all nodes.
+ *
+ * Returns 0 on success or when the request is ignored (fewer than two
+ * submirrors, or we are not the mirror owner), an mdmderror() value for an
+ * invalid unit or node id, or ENOMEM if the deferred clean could not be
+ * dispatched.
+ */
+int
+mirror_set_clean_rr(md_mn_rr_clean_params_t *iocp)
+{
+
+	minor_t			mnum = iocp->rr_mnum;
+	mm_unit_t		*un;
+	set_t			setno = MD_MIN2SET(mnum);
+	md_mn_nodeid_t		node;		/* 0-based array index */
+	int			can_clear = 0;
+	md_mn_rr_clean_params_t	*newiocp;
+	int			rval = 0;
+
+	mdclrerror(&iocp->mde);
+
+	if ((setno >= md_nsets) ||
+	    (MD_MIN2UNIT(mnum) >= md_nunits)) {
+		return (mdmderror(&iocp->mde, MDE_INVAL_UNIT, mnum));
+	}
+
+	/* Must have _NO_ ioctl lock set if we update the RR on-disk */
+	un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL);
+
+	if (un == NULL) {
+		return (mdmderror(&iocp->mde, MDE_UNIT_NOT_SETUP, mnum));
+	}
+	if (un->c.un_type != MD_METAMIRROR) {
+		return (mdmderror(&iocp->mde, MDE_NOT_MM, mnum));
+	}
+
+	/*
+	 * rr_nodeid is 1-based and, less one, indexes the per-node lock and
+	 * bitmap arrays, which have MD_MNMAXSIDES entries.  Reject anything
+	 * that would index outside of them, as mirror_set_dirty_rr() does
+	 * for its node id.
+	 */
+	if (iocp->rr_nodeid < 1 || iocp->rr_nodeid > MD_MNMAXSIDES) {
+		return (mdmderror(&iocp->mde, MDE_INVAL_UNIT, mnum));
+	}
+	node = iocp->rr_nodeid - 1;
+
+	if (un->un_nsm < 2) {
+		return (0);
+	}
+
+	/*
+	 * Check to see if we're the mirror owner. If not, there's nothing
+	 * for us to do.
+	 */
+	if (!MD_MN_MIRROR_OWNER(un)) {
+		return (0);
+	}
+
+	/*
+	 * Process the to-be-cleaned bitmap. We need to update the pernode_dirty
+	 * bits and pernode_dirty_sum[n], and if, and only if, the sum goes 0
+	 * we can then mark the un_dirty_bm entry as GOINGCLEAN. Alternatively
+	 * we can just defer this cleaning until the next process_resync_regions
+	 * timeout.
+	 *
+	 * Make sure the issuing node has a bitmap before anything tests it.
+	 * NOTE(review): this allocates un_rrd_num bytes (one per region)
+	 * whereas the resize paths reallocate/free these bitmaps using the
+	 * bitmap byte size -- confirm the sizes agree.
+	 */
+	rw_enter(&un->un_pernode_dirty_mx[node], RW_WRITER);
+	if (un->un_pernode_dirty_bm[node] == NULL) {
+		un->un_pernode_dirty_bm[node] = (uchar_t *)kmem_zalloc(
+		    un->un_rrd_num, KM_SLEEP);
+	}
+	rw_exit(&un->un_pernode_dirty_mx[node]);
+
+	/*
+	 * See if we can simply clear the un_dirty_bm[] entries. If we're not
+	 * the issuing node _and_ we aren't in the process of marking/clearing
+	 * the RR bitmaps, we can simply update the bits as needed.
+	 * If we're the owning node and _not_ the issuing node, we should also
+	 * sync the RR if we clear any bits in it.
+	 */
+	mutex_enter(&un->un_resync_mx);
+	can_clear = (un->un_resync_flg & MM_RF_STALL_CLEAN) ? 0 : 1;
+	if (can_clear) {
+		un->un_resync_flg |= MM_RF_GATECLOSED;
+		mirror_clean_rr_bits(iocp);
+		un->un_resync_flg &= ~MM_RF_GATECLOSED;
+		if (un->un_waiting_to_mark != 0 ||
+		    un->un_waiting_to_clear != 0) {
+			cv_broadcast(&un->un_resync_cv);
+		}
+	}
+	mutex_exit(&un->un_resync_mx);
+
+	/*
+	 * If we couldn't clear the bits, due to DRL update from m_m_r_r / p_r_r
+	 * we must schedule a blocking call to update the DRL on this node.
+	 * As we're invoked from an ioctl we are going to have the original data
+	 * disappear (kmem_free) once we return. So, copy the data into a new
+	 * structure and let the taskq routine release it on completion.
+	 */
+	if (!can_clear) {
+		size_t	sz = MDMN_RR_CLEAN_PARAMS_SIZE(iocp);
+
+		newiocp = (md_mn_rr_clean_params_t *)kmem_alloc(sz, KM_SLEEP);
+
+		bcopy(iocp, newiocp, sz);
+
+		/* On a failed dispatch the copy is still ours to release */
+		if (ddi_taskq_dispatch(un->un_drl_task, mirror_drl_task,
+		    newiocp, DDI_NOSLEEP) != DDI_SUCCESS) {
+			kmem_free(newiocp, sz);
+			rval = ENOMEM;	/* probably starvation */
+		}
+	}
+
+	return (rval);
+}
diff --git a/usr/src/uts/common/io/lvm/softpart/sp.c b/usr/src/uts/common/io/lvm/softpart/sp.c
index 5c204341b8..f04fb2909f 100644
--- a/usr/src/uts/common/io/lvm/softpart/sp.c
+++ b/usr/src/uts/common/io/lvm/softpart/sp.c
@@ -118,6 +118,7 @@ extern mdq_anchor_t md_sp_daemon;
extern kmutex_t md_mx;
extern kcondvar_t md_cv;
extern md_krwlock_t md_unit_array_rw;
+extern clock_t md_hz;
static kmem_cache_t *sp_parent_cache = NULL;
static kmem_cache_t *sp_child_cache = NULL;
@@ -341,15 +342,19 @@ sp_send_stat_msg(mp_unit_t *un, sp_status_t status)
kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
rval = mdmn_ksend_message(setno, MD_MN_MSG_SP_SETSTAT2, MD_MSGF_NO_LOG,
- (char *)&sp_msg, sizeof (sp_msg), kres);
+ 0, (char *)&sp_msg, sizeof (sp_msg), kres);
if (!MDMN_KSEND_MSG_OK(rval, kres)) {
mdmn_ksend_show_error(rval, kres, "MD_MN_MSG_SP_SETSTAT2");
-
+ /* If we're shutting down already, pause things here. */
+ if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
+ while (!md_mn_is_commd_present()) {
+ delay(md_hz);
+ }
+ }
/*
* Panic as we are now in an inconsistent state.
*/
-
cmn_err(CE_PANIC, "md: %s: %s could not be set on all nodes\n",
md_shortname(MD_SID(un)), str);
}
diff --git a/usr/src/uts/common/io/lvm/softpart/sp_ioctl.c b/usr/src/uts/common/io/lvm/softpart/sp_ioctl.c
index 36fbf38793..f08c94e628 100644
--- a/usr/src/uts/common/io/lvm/softpart/sp_ioctl.c
+++ b/usr/src/uts/common/io/lvm/softpart/sp_ioctl.c
@@ -1150,6 +1150,7 @@ sp_admin_ioctl(int cmd, void *data, int mode, IOLOCK *lockp)
}
case MD_IOC_SPUPDATEWM:
+ case MD_MN_IOC_SPUPDATEWM:
{
if (! (mode & FWRITE))
return (EACCES);
diff --git a/usr/src/uts/common/sys/lvm/md_mirror.h b/usr/src/uts/common/sys/lvm/md_mirror.h
index f32b99a426..fc6bca9b07 100644
--- a/usr/src/uts/common/sys/lvm/md_mirror.h
+++ b/usr/src/uts/common/sys/lvm/md_mirror.h
@@ -18,6 +18,7 @@
*
* CDDL HEADER END
*/
+
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
@@ -30,6 +31,9 @@
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_mirror_shared.h>
#include <sys/lvm/md_rename.h>
+#ifdef _KERNEL
+#include <sys/sunddi.h>
+#endif
#ifdef __cplusplus
extern "C" {
@@ -331,9 +335,24 @@ typedef struct mm_mirror_ic {
kcondvar_t un_dmr_cv; /* condvar for DMR requests */
int un_dmr_last_read; /* last DMR submirror read */
callb_cpr_t un_rs_cprinfo; /* CPR info for resync thread */
- kmutex_t un_rs_cpr_mx; /* Mutex for CPR info */
+ kmutex_t un_rs_cpr_mx; /* mutex for resync CPR info */
+ kmutex_t un_prr_cpr_mx; /* mutex for prr CPR info */
uint_t un_resync_completed; /* type of last resync */
int un_abr_count; /* count of sp's with abr set */
+
+ uchar_t *un_pernode_dirty_bm[MD_MNMAXSIDES];
+ uchar_t *un_pernode_dirty_sum;
+
+ krwlock_t un_pernode_dirty_mx[MD_MNMAXSIDES];
+ ushort_t un_rr_clean_start_bit; /* where to start next clean */
+
+#ifdef _KERNEL
+ ddi_taskq_t *un_drl_task; /* deferred RR_CLEAN taskq */
+#else
+ void *un_drl_task; /* deferred RR_CLEAN taskq */
+#endif /* _KERNEL */
+ uint_t un_waiting_to_clear; /* Blocked waiting to clear */
+
}mm_mirror_ic_t;
#define MM_MN_OWNER_SENT 0x0001 /* RPC in progress */
@@ -416,9 +435,15 @@ typedef struct mm_unit {
#define un_dmr_last_read un_mmic.un_dmr_last_read
#define un_rs_cprinfo un_mmic.un_rs_cprinfo
#define un_rs_cpr_mx un_mmic.un_rs_cpr_mx
+#define un_prr_cpr_mx un_mmic.un_prr_cpr_mx
#define un_resync_completed un_mmic.un_resync_completed
#define un_abr_count un_mmic.un_abr_count
-
+#define un_pernode_dirty_bm un_mmic.un_pernode_dirty_bm
+#define un_pernode_dirty_sum un_mmic.un_pernode_dirty_sum
+#define un_pernode_dirty_mx un_mmic.un_pernode_dirty_mx
+#define un_rr_clean_start_bit un_mmic.un_rr_clean_start_bit
+#define un_drl_task un_mmic.un_drl_task
+#define un_waiting_to_clear un_mmic.un_waiting_to_clear
#define MM_RF_GATECLOSED 0x0001
#define MM_RF_COMMIT_NEEDED 0x0002
@@ -497,6 +522,12 @@ typedef struct optim_resync {
#define IS_KEEPDIRTY(i, un) (isset((un)->un_resync_bm, (i)))
#define CLR_KEEPDIRTY(i, un) (clrbit((un)->un_resync_bm, (i)))
+/*
+ * Per-node dirty bitmap accessors: test, clear or set bit 'i' (a resync
+ * region index) in the bitmap belonging to node 'n'.  'n' is a 1-based node
+ * id; the bitmap used is un_pernode_dirty_bm[n - 1].  The macros do not
+ * check that the selected bitmap has been allocated.
+ */
+#define IS_PERNODE_DIRTY(n, i, un) \
+ (isset((un)->un_pernode_dirty_bm[(n)-1], (i)))
+#define CLR_PERNODE_DIRTY(n, i, un) \
+ (clrbit((un)->un_pernode_dirty_bm[(n)-1], (i)))
+#define SET_PERNODE_DIRTY(n, i, un) \
+ (setbit((un)->un_pernode_dirty_bm[(n)-1], (i)))
/*
* Write-On-Write handling.
@@ -579,13 +610,15 @@ extern int mirror_resync_unit(minor_t mnum, md_resync_ioctl_t *ri,
md_error_t *ep, IOLOCK *);
extern int mirror_ioctl_resync(md_resync_ioctl_t *p, IOLOCK *);
extern int mirror_mark_resync_region(mm_unit_t *, diskaddr_t,
- diskaddr_t);
+ diskaddr_t, md_mn_nodeid_t);
extern void resync_start_timeout(set_t setno);
extern int mirror_resize_resync_regions(mm_unit_t *, diskaddr_t);
extern int mirror_add_resync_regions(mm_unit_t *, diskaddr_t);
extern int mirror_probedevs(md_probedev_t *, IOLOCK *);
extern void mirror_copy_rr(int, uchar_t *, uchar_t *);
extern void mirror_process_unit_resync(mm_unit_t *);
+extern int mirror_set_dirty_rr(md_mn_rr_dirty_params_t *);
+extern int mirror_set_clean_rr(md_mn_rr_clean_params_t *);
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/sys/lvm/md_sp.h b/usr/src/uts/common/sys/lvm/md_sp.h
index 4cf2725364..5aa3547b24 100644
--- a/usr/src/uts/common/sys/lvm/md_sp.h
+++ b/usr/src/uts/common/sys/lvm/md_sp.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,16 +18,15 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS__MD_SP_H
#define _SYS__MD_SP_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/lvm/mdvar.h>
#ifdef __cplusplus
@@ -99,6 +97,7 @@ typedef enum sp_ext_type {
#define MD_IOC_SPSTATUS (MDIOC_MISC|0)
#define MD_IOC_SPUPDATEWM (MDIOC_MISC|1)
#define MD_IOC_SPREADWM (MDIOC_MISC|2)
+#define MD_MN_IOC_SPUPDATEWM (MDIOC_MISC|3)
#ifdef _KERNEL
diff --git a/usr/src/uts/common/sys/lvm/mdio.h b/usr/src/uts/common/sys/lvm/mdio.h
index 1cedfe2bc6..e604a98795 100644
--- a/usr/src/uts/common/sys/lvm/mdio.h
+++ b/usr/src/uts/common/sys/lvm/mdio.h
@@ -18,16 +18,15 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS__MDIO_H
#define _SYS__MDIO_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/debug.h>
#include <sys/ioctl.h>
#include <sys/types.h>
@@ -433,6 +432,31 @@ typedef struct md_mkdev_params {
unit_t un;
} md_mkdev_params_t;
+/*
+ * Accessors for a md_mn_rr_clean_params_t request, which is a fixed header
+ * immediately followed by a variable-length bitmap:
+ * _DATA(x) address of the bitmap that trails the header
+ * _SIZE(x) total size in bytes of header plus bitmap
+ * _START_BIT(x) upper 16 bits of rr_start_size: first bit covered
+ * _DATA_BYTES(x) lower 16 bits of rr_start_size: bitmap length in bytes
+ */
+#define MDMN_RR_CLEAN_PARAMS_DATA(x) ((unsigned char *)(x) + \
+ sizeof (md_mn_rr_clean_params_t))
+#define MDMN_RR_CLEAN_PARAMS_SIZE(x) (sizeof (md_mn_rr_clean_params_t) + \
+ MDMN_RR_CLEAN_PARAMS_DATA_BYTES(x))
+#define MDMN_RR_CLEAN_PARAMS_START_BIT(x) ((x)->rr_start_size >> 16)
+#define MDMN_RR_CLEAN_PARAMS_DATA_BYTES(x) ((x)->rr_start_size & 0xffff)
+
+/*
+ * Parameter block for the MD_MN_RR_CLEAN request.  Only the fixed header is
+ * declared here; the bitmap of regions to clean follows it in memory and is
+ * reached through the MDMN_RR_CLEAN_PARAMS_* macros.
+ */
+typedef struct md_mn_rr_clean_params {
+ MD_DRIVER
+ md_error_t mde; /* error returned to the caller */
+ md_mn_nodeid_t rr_nodeid; /* issuing node, 1-based */
+ minor_t rr_mnum; /* minor number of the mirror */
+ unsigned int rr_start_size; /* start_bit (16b) | data_bytes (16b) */
+ /* actual data goes here */
+} md_mn_rr_clean_params_t;
+
+/*
+ * Parameter block for the MD_MN_RR_DIRTY request: asks for the resync
+ * regions rr_start..rr_end of mirror rr_mnum to be marked dirty on behalf
+ * of node rr_nodeid.
+ */
+typedef struct md_mn_rr_dirty_params {
+ MD_DRIVER
+ md_error_t mde; /* error returned to the caller */
+ minor_t rr_mnum; /* minor number of the mirror */
+ md_mn_nodeid_t rr_nodeid; /* originating node, 1-based */
+ ushort_t rr_start; /* First RR region to mark */
+ ushort_t rr_end; /* Last RR region to mark */
+} md_mn_rr_dirty_params_t;
+
/*
* Flags to coordinate sending device id between kernel and user space.
* To get devid from kernel:
@@ -756,7 +780,8 @@ typedef struct md_regen_param {
#define MD_IOCGET_HSP_NM (MDIOC|105) /* get hsp entry from namespace */
#define MD_IOCREM_DEV (MDIOC|106) /* remove device node for unit */
#define MD_IOCUPDATE_NM_RR_DID (MDIOC|107) /* update remotely repl did in NM */
-
+#define MD_MN_RR_DIRTY (MDIOC|108) /* Mark RR range as dirty */
+#define MD_MN_RR_CLEAN (MDIOC|109) /* Clean RR bits from bitmap */
#define MDIOC_MISC (MDIOC|128) /* misc module base */
/* Used in DEBUG_TEST code */
diff --git a/usr/src/uts/common/sys/lvm/mdmn_commd.x b/usr/src/uts/common/sys/lvm/mdmn_commd.x
index 3971eb9e00..3ec7b1fff4 100644
--- a/usr/src/uts/common/sys/lvm/mdmn_commd.x
+++ b/usr/src/uts/common/sys/lvm/mdmn_commd.x
@@ -20,11 +20,10 @@
% */
%
%/*
-% * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+% * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
% * Use is subject to license terms.
% */
%
-%#pragma ident "%Z%%M% %I% %E% SMI"
%#include <sys/types.h>
%#include <sys/types32.h>
@@ -103,6 +102,8 @@ enum md_mn_msgtype_t {
MD_MN_MSG_SETSYNC, /* Set resync status */
MD_MN_MSG_POKE_HOTSPARES, /* Call poke_hotspares */
MD_MN_MSG_ADDMDNAME, /* Add metadevice name */
+ MD_MN_MSG_RR_DIRTY, /* Mark RR range as dirty */
+ MD_MN_MSG_RR_CLEAN, /* Mark RR range as clean */
MD_MN_NMESSAGES /* insert elements before */
};
@@ -361,6 +362,39 @@ struct md_mn_msg_pokehsp_t {
minor_t pokehsp_setno;
};
+/*
+ * Message format for MD_MN_MSG_RR_DIRTY message.
+ * The start and end region numbers are packed into one 32-bit word,
+ * 16 bits each, in rr_range.
+ */
+struct md_mn_msg_rr_dirty_t {
+ minor_t rr_mnum; /* minor number of the mirror */
+ int rr_nodeid; /* node the request is made for */
+ u_int rr_range; /* Start(16bits) | End(16bits) */
+};
+
+/*
+ * Message format for MD_MN_MSG_RR_CLEAN message.
+ * The message is a fixed md_mn_msg_rr_clean_t header immediately followed
+ * by a variable-length bitmap; the macros below size and locate that
+ * trailing data.
+ *	_DATA_MAX_BYTES		most bitmap bytes that fit in one kernel message
+ *	_SIZE_DATA(x)		message size for a bitmap of x bytes
+ *	_MSG_SIZE(x)		message size of an existing message x
+ *	_DATA(x)		address of the bitmap within message x
+ */
+%#define	MDMN_MSG_RR_CLEAN_DATA_MAX_BYTES		\
+%		((MDMN_MAX_KMSG_DATA) - \
+%		    sizeof (struct md_mn_msg_rr_clean_t))
+%#define	MDMN_MSG_RR_CLEAN_SIZE_DATA(x)			\
+%		(sizeof (struct md_mn_msg_rr_clean_t) + (x))
+%#define	MDMN_MSG_RR_CLEAN_MSG_SIZE(x)			\
+%		(sizeof (struct md_mn_msg_rr_clean_t)	\
+%		    + MDMN_MSG_RR_CLEAN_DATA_BYTES(x))
+%#define	MDMN_MSG_RR_CLEAN_DATA(x)			\
+%		((unsigned char *)(x) +			\
+%		    sizeof (struct md_mn_msg_rr_clean_t))
+
+/*
+ * since we cannot use ushorts, some macros to extract the parts from an int.
+ * The arguments of the setter are parenthesised so that expressions such as
+ * (a + b) or (c ? d : e) pack correctly.
+ */
+%#define	MDMN_MSG_RR_CLEAN_START_BIT(x)	((x)->rr_start_size >> 16)
+%#define	MDMN_MSG_RR_CLEAN_DATA_BYTES(x)	((x)->rr_start_size & 0xffff)
+%#define	MDMN_MSG_RR_CLEAN_START_SIZE_SET(x, start, size)	\
+%		((x)->rr_start_size = ((start) << 16) | (size))
+
+struct md_mn_msg_rr_clean_t {
+	md_mn_nodeid_t	rr_nodeid;	/* node the request is made for */
+	unsigned int	rr_mnum;	/* minor number of the mirror */
+	unsigned int	rr_start_size;	/* start_bit (16b) | data_bytes (16b) */
+	/* actual data goes here */
+};
+
%#define MD_MSGF_NO_LOG 0x00000001
%#define MD_MSGF_NO_BCAST 0x00000002
%#define MD_MSGF_STOP_ON_ERROR 0x00000004
@@ -373,6 +407,9 @@ struct md_mn_msg_pokehsp_t {
%#define MD_MSGF_FAIL_ON_SUSPEND 0x00000200
%#define MD_MSGF_NO_MCT 0x00000400
%#define MD_MSGF_PANIC_WHEN_INCONSISTENT 0x00000800
+%#define MD_MSGF_BLK_SIGNAL 0x00001000
+%#define MD_MSGF_KSEND_NORETRY 0x00002000
+%#define MD_MSGF_DIRECTED 0x00004000
%#define MD_MSGF_VERBOSE 0x10000000
%#define MD_MSGF_VERBOSE_2 0x20000000
@@ -418,7 +455,8 @@ struct md_mn_msg_t {
u_int msg_flags; /* See MD_MSGF_* above */
set_t msg_setno; /* which set is involved */
md_mn_msgtype_t msg_type; /* what type of message */
- char msg_spare[32]; /* Always good to hav'em */
+ md_mn_nodeid_t msg_recipient; /* who to send DIRECTED message to */
+ char msg_spare[28]; /* Always good to hav'em */
opaque msg_event<>; /* the actual event wrapped up */
};
%#define msg_event_data msg_event.msg_event_val
@@ -435,7 +473,8 @@ struct md_mn_msg_od_t {
uint32_t msg_flags; /* See MD_MSGF_* above */
set_t msg_setno; /* which set is involved */
md_mn_msgtype_t msg_type; /* what type of message */
- char msg_spare[32]; /* Always good to hav'em */
+ md_mn_nodeid_t msg_recipient; /* who to send DIRECTED message to */
+ char msg_spare[28]; /* Always good to hav'em */
uint32_t msg_ev_len;
char msg_ev_val[MD_MN_MSG_MAXDATALEN];
};
@@ -450,6 +489,7 @@ struct md_mn_kmsg_t {
u_int kmsg_flags;
set_t kmsg_setno;
md_mn_msgtype_t kmsg_type;
+ md_mn_nodeid_t kmsg_recipient; /* who to send DIRECTED message to */
int kmsg_size;
char kmsg_data[MDMN_MAX_KMSG_DATA];
};
@@ -549,7 +589,7 @@ struct md_mn_type_and_lock_t {
program MDMN_COMMD {
- version ONE {
+ version TWO {
md_mn_result_t
mdmn_send(md_mn_msg_t) = 1;
@@ -579,5 +619,5 @@ program MDMN_COMMD {
int
mdmn_comm_msglock(md_mn_type_and_lock_t) = 10;
- } = 1;
+ } = 2;
} = 100422;
diff --git a/usr/src/uts/common/sys/lvm/mdvar.h b/usr/src/uts/common/sys/lvm/mdvar.h
index c2745f51b8..8f70a29e8b 100644
--- a/usr/src/uts/common/sys/lvm/mdvar.h
+++ b/usr/src/uts/common/sys/lvm/mdvar.h
@@ -744,8 +744,8 @@ extern void freestr(char *cp);
extern int md_check_ioctl_against_unit(int, mdc_unit_t);
extern mddb_recid_t md_vtoc_to_efi_record(mddb_recid_t, set_t);
-extern int mdmn_ksend_message(set_t, md_mn_msgtype_t, uint_t, char *, int,
- md_mn_kresult_t *);
+extern int mdmn_ksend_message(set_t, md_mn_msgtype_t, uint_t,
+ md_mn_nodeid_t, char *, int, md_mn_kresult_t *);
extern void mdmn_ksend_show_error(int, md_mn_kresult_t *, const char *);
extern int mdmn_send_capability_message(minor_t, volcap_t, IOLOCK *);
extern void mdmn_clear_all_capabilities(minor_t);
@@ -755,9 +755,11 @@ extern boolean_t callb_md_mrs_cpr(void *, int);
extern void md_upd_set_unnext(set_t, unit_t);
extern int md_rem_selfname(minor_t);
extern void md_rem_hspname(set_t, mdkey_t);
+extern void *md_create_taskq(set_t, minor_t);
/* Externals from md_ioctl.c */
extern int md_mn_is_commd_present(void);
+extern int md_mn_is_commd_present_lite(void);
extern void md_mn_clear_commd_present(void);
extern int md_admin_ioctl(md_dev64_t, int, caddr_t, int, IOLOCK *lockp);
extern void md_get_geom(md_unit_t *, struct dk_geom *);