summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/fs/nfs/nfs_server.c
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/uts/common/fs/nfs/nfs_server.c')
-rw-r--r--usr/src/uts/common/fs/nfs/nfs_server.c208
1 files changed, 192 insertions, 16 deletions
diff --git a/usr/src/uts/common/fs/nfs/nfs_server.c b/usr/src/uts/common/fs/nfs/nfs_server.c
index 42d7e071ab..7b382608f5 100644
--- a/usr/src/uts/common/fs/nfs/nfs_server.c
+++ b/usr/src/uts/common/fs/nfs/nfs_server.c
@@ -106,6 +106,9 @@ static struct modlinkage modlinkage = {
char _depends_on[] = "misc/klmmod";
+/* for testing RG failover code path on non-Cluster system */
+int hanfsv4_force = 0;
+
int
_init(void)
{
@@ -125,7 +128,19 @@ _init(void)
nfs_srvfini();
}
+ /*
+ * Initialise some placeholders for nfssys() calls. These have
+ * to be declared by the nfs module, since that handles nfssys()
+ * calls - also used by NFS clients - but are provided by this
+ * nfssrv module. These also then serve as confirmation to the
+ * relevant code in nfs that nfssrv has been loaded, as they're
+ * initially NULL.
+ */
nfs_srv_quiesce_func = nfs_srv_quiesce_all;
+ nfs_srv_dss_func = rfs4_dss_setpaths;
+
+ /* setup DSS paths here; must be done before initial server startup */
+ rfs4_dss_paths = rfs4_dss_oldpaths = NULL;
return (status);
}
@@ -166,6 +181,7 @@ static void acl_dispatch(struct svc_req *, SVCXPRT *);
static void common_dispatch(struct svc_req *, SVCXPRT *,
rpcvers_t, rpcvers_t, char *,
struct rpc_disptable *);
+static void hanfsv4_failover(void);
static int checkauth(struct exportinfo *, struct svc_req *, cred_t *, int,
bool_t);
static char *client_name(struct svc_req *req);
@@ -241,6 +257,12 @@ static nfs_server_running_t nfs_server_upordown;
static kmutex_t nfs_server_upordown_lock;
static kcondvar_t nfs_server_upordown_cv;
+/*
+ * DSS: distributed stable storage
+ * lists of all DSS paths: current, and before last warmstart
+ */
+nvlist_t *rfs4_dss_paths, *rfs4_dss_oldpaths;
+
int rfs4_dispatch(struct rpcdisp *, struct svc_req *, SVCXPRT *, char *);
/*
@@ -298,6 +320,11 @@ nfs_srv_shutdown_all(int quiesce) {
nfs_server_upordown == NFS_SERVER_OFFLINE) {
nfs_server_upordown = NFS_SERVER_QUIESCED;
cv_signal(&nfs_server_upordown_cv);
+
+ /* reset DSS state, for subsequent warm restart */
+ rfs4_dss_numnewpaths = 0;
+ rfs4_dss_newpaths = NULL;
+
cmn_err(CE_NOTE, "nfs_server: server is now quiesced; "
"NFSv4 state has been preserved");
}
@@ -458,7 +485,7 @@ nfs_svc(struct nfs_svc_args *arg, model_t model)
releasef(STRUCT_FGET(uap, fd));
- /* save the cluster nodeid */
+ /* HA-NFSv4: save the cluster nodeid */
if (cluster_bootflags & CLUSTER_BOOTED)
lm_global_nlmid = clconf_get_nodeid();
@@ -489,28 +516,20 @@ rfs4_server_start(int nfs4_srv_delegation)
/* is this an nfsd warm start? */
if (nfs_server_upordown == NFS_SERVER_QUIESCED) {
- int start_grace;
-
cmn_err(CE_NOTE, "nfs_server: "
"server was previously quiesced; "
"existing NFSv4 state will be re-used");
/*
- * Cluster: this is also the signal that
- * a failover has occurred, so create a new
- * server instance, and start its grace period.
- * We also need to reset all currently
- * active grace periods in case of multiple
- * failovers within the grace duration,
- * to avoid partitioning clients of the same
- * resource into different instances.
+ * HA-NFSv4: this is also the signal
+ * that a Resource Group failover has
+ * occurred.
*/
- if (cluster_bootflags & CLUSTER_BOOTED) {
- rfs4_grace_reset_all();
- start_grace = 1;
- rfs4_servinst_create(start_grace);
- }
+ if (cluster_bootflags & CLUSTER_BOOTED ||
+ hanfsv4_force)
+ hanfsv4_failover();
} else {
+ /* cold start */
rfs4_state_init();
nfs4_drc = rfs4_init_drc(nfs4_drc_max,
nfs4_drc_hash,
@@ -2836,3 +2855,160 @@ nfs_check_vpexi(vnode_t *mc_dvp, vnode_t *vp, cred_t *cr,
return (error);
}
+
+/*
+ * Do the main work of handling HA-NFSv4 Resource Group failover on
+ * Sun Cluster.
+ * We need to detect whether any RG admin paths have been added or removed,
+ * and adjust resources accordingly.
+ * Currently we're using a very inefficient algorithm, ~ 2 * O(n**2). In
+ * order to scale, the list and array of paths need to be held in more
+ * suitable data structures.
+ */
+static void
+hanfsv4_failover(void)
+{
+ int i, start_grace, numadded_paths = 0;
+ char **added_paths = NULL;
+ rfs4_dss_path_t *dss_path;
+
+ /*
+ * First, look for removed paths: RGs that have been failed-over
+ * away from this node.
+ * Walk the "currently-serving" rfs4_dss_pathlist and, for each
+ * path, check if it is on the "passed-in" rfs4_dss_newpaths array
+ * from nfsd. If not, that RG path has been removed.
+ *
+ * Note that nfsd has sorted rfs4_dss_newpaths for us, and removed
+ * any duplicates.
+ */
+ dss_path = rfs4_dss_pathlist;
+ do {
+ int found = 0;
+ char *path = dss_path->path;
+
+ /* used only for non-HA so may not be removed */
+ if (strcmp(path, NFS4_DSS_VAR_DIR) == 0) {
+ dss_path = dss_path->next;
+ continue;
+ }
+
+ for (i = 0; i < rfs4_dss_numnewpaths; i++) {
+ int cmpret;
+ size_t ncmp;
+ char *newpath = rfs4_dss_newpaths[i];
+
+ ncmp = MAX(strlen(path), strlen(newpath));
+ cmpret = strncmp(path, newpath, ncmp);
+
+ /*
+ * Since nfsd has sorted rfs4_dss_newpaths for us,
+ * once the return from strncmp is negative we know
+ * we've passed the point where "path" should be,
+ * and can stop searching: "path" has been removed.
+ */
+ if (cmpret < 0)
+ break;
+
+ if (cmpret == 0) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (found == 0) {
+ unsigned index = dss_path->index;
+ rfs4_servinst_t *sip = dss_path->sip;
+ rfs4_dss_path_t *path_next = dss_path->next;
+
+ /*
+ * This path has been removed.
+ * We must clear out the servinst reference to
+ * it, since it's now owned by another
+ * node: we should not attempt to touch it.
+ */
+ ASSERT(dss_path == sip->dss_paths[index]);
+ sip->dss_paths[index] = NULL;
+
+ /* remove from "currently-serving" list, and destroy */
+ remque(dss_path);
+ kmem_free(dss_path, sizeof (rfs4_dss_path_t));
+
+ dss_path = path_next;
+ } else {
+ /* path was found; not removed */
+ dss_path = dss_path->next;
+ }
+ } while (dss_path != rfs4_dss_pathlist);
+
+ /*
+ * Now, look for added paths: RGs that have been failed-over
+ * to this node.
+ * Walk the "passed-in" rfs4_dss_newpaths array from nfsd and,
+ * for each path, check if it is on the "currently-serving"
+ * rfs4_dss_pathlist. If not, that RG path has been added.
+ *
+ * Note: we don't do duplicate detection here; nfsd does that for us.
+ *
+ * Note: numadded_paths <= rfs4_dss_numnewpaths, which gives us
+ * an upper bound for the size needed for added_paths[numadded_paths].
+ */
+
+ /* probably more space than we need, but guaranteed to be enough */
+ if (rfs4_dss_numnewpaths > 0) {
+ size_t sz = rfs4_dss_numnewpaths * sizeof (char *);
+ added_paths = kmem_zalloc(sz, KM_SLEEP);
+ }
+
+ /* walk the "passed-in" rfs4_dss_newpaths array from nfsd */
+ for (i = 0; i < rfs4_dss_numnewpaths; i++) {
+ int found = 0;
+ char *newpath = rfs4_dss_newpaths[i];
+
+ dss_path = rfs4_dss_pathlist;
+ do {
+ char *path = dss_path->path;
+
+ /* used only for non-HA */
+ if (strcmp(path, NFS4_DSS_VAR_DIR) == 0) {
+ dss_path = dss_path->next;
+ continue;
+ }
+
+ if (strncmp(path, newpath, strlen(path)) == 0) {
+ found = 1;
+ break;
+ }
+
+ dss_path = dss_path->next;
+ } while (dss_path != rfs4_dss_pathlist);
+
+ if (found == 0) {
+ added_paths[numadded_paths] = newpath;
+ numadded_paths++;
+ }
+ }
+
+ /* did we find any added paths? */
+ if (numadded_paths > 0) {
+ /* create a new server instance, and start its grace period */
+ start_grace = 1;
+ rfs4_servinst_create(start_grace, numadded_paths, added_paths);
+
+ /* read in the stable storage state from these paths */
+ rfs4_dss_readstate(numadded_paths, added_paths);
+
+ /*
+ * Multiple failovers during a grace period will cause
+ * clients of the same resource group to be partitioned
+ * into different server instances, with different
+ * grace periods. Since clients of the same resource
+ * group must be subject to the same grace period,
+ * we need to reset all currently active grace periods.
+ */
+ rfs4_grace_reset_all();
+ }
+
+ if (rfs4_dss_numnewpaths > 0)
+ kmem_free(added_paths, rfs4_dss_numnewpaths * sizeof (char *));
+}