diff options
author | calum <none@none> | 2006-05-22 15:43:31 -0700 |
---|---|---|
committer | calum <none@none> | 2006-05-22 15:43:31 -0700 |
commit | cee8668251d5ec44fd1c6d6ddeb9c1d1821a57d2 (patch) | |
tree | 8d4780a53c331cd3c2879233916bc1608c9a6df0 /usr/src | |
parent | a4ac8bb3f5b2fff60581bee101792ac7a34bad8c (diff) | |
download | illumos-gate-cee8668251d5ec44fd1c6d6ddeb9c1d1821a57d2.tar.gz |
PSARC/2006/313 NFSv4: nfsd "-s" distributed stable storage
6244819 NFSv4 needs distributed stable storage to work on Cluster HA-NFS
Diffstat (limited to 'usr/src')
-rw-r--r-- | usr/src/cmd/fs.d/nfs/nfsd/Makefile | 9 | ||||
-rw-r--r-- | usr/src/cmd/fs.d/nfs/nfsd/nfsd.c | 283 | ||||
-rw-r--r-- | usr/src/uts/common/fs/nfs/nfs4_srv.c | 137 | ||||
-rw-r--r-- | usr/src/uts/common/fs/nfs/nfs4_state.c | 532 | ||||
-rw-r--r-- | usr/src/uts/common/fs/nfs/nfs_server.c | 208 | ||||
-rw-r--r-- | usr/src/uts/common/fs/nfs/nfs_sys.c | 87 | ||||
-rw-r--r-- | usr/src/uts/common/nfs/nfs.h | 5 | ||||
-rw-r--r-- | usr/src/uts/common/nfs/nfs4.h | 37 | ||||
-rw-r--r-- | usr/src/uts/common/nfs/nfssys.h | 11 |
9 files changed, 999 insertions, 310 deletions
diff --git a/usr/src/cmd/fs.d/nfs/nfsd/Makefile b/usr/src/cmd/fs.d/nfs/nfsd/Makefile index 81a54a572d..1061f6d286 100644 --- a/usr/src/cmd/fs.d/nfs/nfsd/Makefile +++ b/usr/src/cmd/fs.d/nfs/nfsd/Makefile @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 1989,2001-2003 Sun Microsystems, Inc. All rights reserved. +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # ident "%Z%%M% %I% %E% SMI" @@ -32,7 +31,7 @@ ATTMK= $(TYPEPROG) include ../../Makefile.fstype -LDLIBS += -lnsl -lcmd +LDLIBS += -lnsl -lcmd -lnvpair LOCAL= nfsd.o OBJS= $(LOCAL) nfs_tbind.o thrpool.o diff --git a/usr/src/cmd/fs.d/nfs/nfsd/nfsd.c b/usr/src/cmd/fs.d/nfs/nfsd/nfsd.c index 6dbb8053c4..169a3cd544 100644 --- a/usr/src/cmd/fs.d/nfs/nfsd/nfsd.c +++ b/usr/src/cmd/fs.d/nfs/nfsd/nfsd.c @@ -45,6 +45,7 @@ #include <sys/param.h> #include <sys/types.h> +#include <sys/stat.h> #include <syslog.h> #include <tiuser.h> #include <rpc/rpc.h> @@ -73,15 +74,22 @@ #include <deflt.h> #include <rpcsvc/daemon_utils.h> #include <rpcsvc/nfs4_prot.h> +#include <libnvpair.h> #include "nfs_tbind.h" #include "thrpool.h" /* quiesce requests will be ignored if nfs_server_vers_max < QUIESCE_VERSMIN */ #define QUIESCE_VERSMIN 4 +/* DSS: distributed stable storage */ +#define DSS_VERSMIN 4 static int nfssvc(int, struct netbuf, struct netconfig *); -static int nfssvcpool(int maxservers); +static int nfssvcpool(int maxservers); +static int dss_init(uint_t npaths, char **pathnames); +static void dss_mkleafdirs(uint_t npaths, char **pathnames); +static void dss_mkleafdir(char *dir, char *leaf, char *path); static void usage(void); +int qstrcmp(const void *s1, const void *s2); extern int _nfssys(int, void *); @@ -138,6 +146,8 @@ main(int ac, char *av[]) NETSELPDECL(providerp); char *defval; boolean_t can_do_mlp; + uint_t dss_npaths = 0; + char **dss_pathnames = NULL; MyName = *av; @@ -239,7 +249,7 @@ main(int ac, char *av[]) } opt_cnt = 0; - while ((i = getopt(ac, av, "ac:p:t:l:")) != EOF) { + while ((i = getopt(ac, av, "ac:p:s:t:l:")) != EOF) { switch (i) { case 'a': free(df_proto); @@ -261,6 +271,39 @@ main(int ac, char *av[]) opt_cnt++; break; + /* + * DSS: NFSv4 distributed stable storage. + * + * This is a Contracted Project Private interface, for + * the sole use of Sun Cluster HA-NFS. See PSARC/2006/313. + */ + case 's': + if (strlen(optarg) < MAXPATHLEN) { + /* first "-s" option encountered? */ + if (dss_pathnames == NULL) { + /* + * Allocate maximum possible space + * required given cmdline arg count; + * "-s <path>" consumes two args. + */ + size_t sz = (ac / 2) * sizeof (char *); + dss_pathnames = (char **)malloc(sz); + if (dss_pathnames == NULL) { + (void) fprintf(stderr, "%s: " + "dss paths malloc failed\n", + av[0]); + exit(1); + } + (void) memset(dss_pathnames, 0, sz); + } + dss_pathnames[dss_npaths] = optarg; + dss_npaths++; + } else { + (void) fprintf(stderr, + "%s: -s pathname too long.\n", av[0]); + } + break; + case 't': provider = optarg; df_allflag = 0; @@ -410,6 +453,18 @@ main(int ac, char *av[]) exit(0); } + /* + * If we've been given a list of paths to be used for distributed + * stable storage, and provided we're going to run a version + * that supports it, setup the DSS paths. + */ + if (dss_pathnames != NULL && nfs_server_vers_max >= DSS_VERSMIN) { + if (dss_init(dss_npaths, dss_pathnames) != 0) { + syslog(LOG_ERR, "dss_init failed. Exiting."); + exit(1); + } + } + sigset(SIGTERM, sigflush); sigset(SIGUSR1, quiesce); @@ -520,7 +575,7 @@ done: if (num_fds == 0) { (void) syslog(LOG_ERR, - "Could not start NFS service for any protocol. Exiting."); + "Could not start NFS service for any protocol. Exiting"); exit(1); } @@ -643,7 +698,12 @@ sigflush(int sig) /* * SIGUSR1 handler. - * Request server quiesce, then exit. For subsequent warm start. + * + * Request that server quiesce, then (nfsd) exit. For subsequent warm start. + * + * This is a Contracted Project Private interface, for the sole use + * of Sun Cluster HA-NFS. See PSARC/2004/497. + * * Equivalent to SIGTERM handler if nfs_server_vers_max < QUIESCE_VERSMIN. */ static void @@ -654,10 +714,10 @@ quiesce(int sig) if (nfs_server_vers_max >= QUIESCE_VERSMIN) { /* Request server quiesce at next shutdown */ - error = _nfssys(NFS_SVC_REQUEST_QUIESCE, &id); + error = _nfssys(NFS4_SVC_REQUEST_QUIESCE, &id); if (error) { syslog(LOG_ERR, - "_nfssys(NFS_SVC_REQUEST_QUIESCE) failed: %s\n", + "_nfssys(NFS4_SVC_REQUEST_QUIESCE) failed: %s", strerror(errno)); return; } @@ -668,3 +728,214 @@ quiesce(int sig) exit(0); } + +/* + * DSS: distributed stable storage. + * Create leaf directories as required, keeping an eye on path + * lengths. Calls exit(1) on failure. + * The pathnames passed in must already exist, and must be writeable by nfsd. + * Note: the leaf directories under NFS4_VAR_DIR are not created here; + * they're created at pkg install. + */ +static void +dss_mkleafdirs(uint_t npaths, char **pathnames) +{ + int i; + char *tmppath = NULL; + + /* + * Create the temporary storage used by dss_mkleafdir() here, + * rather than in that function, so that it only needs to be + * done once, rather than once for each call. Too big to put + * on the function's stack. + */ + tmppath = (char *)malloc(MAXPATHLEN); + if (tmppath == NULL) { + syslog(LOG_ERR, "tmppath malloc failed. Exiting"); + exit(1); + } + + for (i = 0; i < npaths; i++) { + char *p = pathnames[i]; + + dss_mkleafdir(p, NFS4_DSS_STATE_LEAF, tmppath); + dss_mkleafdir(p, NFS4_DSS_OLDSTATE_LEAF, tmppath); + } + + free(tmppath); +} + +/* + * Create "leaf" in "dir" (which must already exist). + * leaf: should start with a '/' + */ +static void +dss_mkleafdir(char *dir, char *leaf, char *tmppath) +{ + /* MAXPATHLEN includes the terminating NUL */ + if (strlen(dir) + strlen(leaf) > MAXPATHLEN - 1) { + syslog(LOG_ERR, "stable storage path too long: %s%s. Exiting", + dir, leaf); + exit(1); + } + + (void) snprintf(tmppath, MAXPATHLEN, "%s/%s", dir, leaf); + + /* the directory may already exist: that's OK */ + if (mkdir(tmppath, NFS4_DSS_DIR_MODE) == -1 && errno != EEXIST) { + syslog(LOG_ERR, "error creating stable storage directory: " + "%s: %s. Exiting", strerror(errno), tmppath); + exit(1); + } +} + +/* + * Create the storage dirs, and pass the path list to the kernel. + * This requires the nfssrv module to be loaded; the _nfssys() syscall + * will fail ENOTSUP if it is not. + * Use libnvpair(3LIB) to pass the data to the kernel. + */ +static int +dss_init(uint_t npaths, char **pathnames) +{ + int i, j, nskipped, error; + char *bufp; + uint32_t bufsize; + size_t buflen; + nvlist_t *nvl; + + if (npaths > 1) { + /* + * We need to remove duplicate paths; this might be user error + * in the general case, but HA-NFSv4 can also cause this. + * Sort the pathnames array, and NULL out duplicates, + * then write the non-NULL entries to a new array. + * Sorting will also allow the kernel to optimise its searches. + */ + + qsort(pathnames, npaths, sizeof (char *), qstrcmp); + + /* now NULL out any duplicates */ + i = 0; j = 1; nskipped = 0; + while (j < npaths) { + if (strcmp(pathnames[i], pathnames[j]) == NULL) { + pathnames[j] = NULL; + j++; + nskipped++; + continue; + } + + /* skip i over any of its NULLed duplicates */ + i = j++; + } + + /* finally, write the non-NULL entries to a new array */ + if (nskipped > 0) { + int nreal; + size_t sz; + char **tmp_pathnames; + + nreal = npaths - nskipped; + + sz = nreal * sizeof (char *); + tmp_pathnames = (char **)malloc(sz); + if (tmp_pathnames == NULL) { + syslog(LOG_ERR, "tmp_pathnames malloc failed"); + exit(1); + } + + for (i = 0, j = 0; i < npaths; i++) + if (pathnames[i] != NULL) + tmp_pathnames[j++] = pathnames[i]; + free(pathnames); + pathnames = tmp_pathnames; + npaths = nreal; + } + + } + + /* Create directories to store the distributed state files */ + dss_mkleafdirs(npaths, pathnames); + + /* Create the name-value pair list */ + error = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0); + if (error) { + syslog(LOG_ERR, "nvlist_alloc failed: %s.", strerror(errno)); + return (1); + } + + /* Add the pathnames array as a single name-value pair */ + error = nvlist_add_string_array(nvl, NFS4_DSS_NVPAIR_NAME, + pathnames, npaths); + if (error) { + syslog(LOG_ERR, "nvlist_add_string_array failed: %s.", + strerror(errno)); + nvlist_free(nvl); + return (1); + } + + /* + * Pack list into contiguous memory, for passing to kernel. + * nvlist_pack() will allocate the memory for the buffer, + * which we should free() when no longer needed. + * NV_ENCODE_XDR for safety across ILP32/LP64 kernel boundary. + */ + bufp = NULL; + error = nvlist_pack(nvl, &bufp, &buflen, NV_ENCODE_XDR, 0); + if (error) { + syslog(LOG_ERR, "nvlist_pack failed: %s.", strerror(errno)); + nvlist_free(nvl); + return (1); + } + + /* Now we have the packed buffer, we no longer need the list */ + nvlist_free(nvl); + + /* + * Let the kernel know in advance how big the buffer is. + * NOTE: we cannot just pass buflen, since size_t is a long, and + * thus a different size between ILP32 userland and LP64 kernel. + * Use an int for the transfer, since that should be big enough; + * this is a no-op at the moment, here, since nfsd is 32-bit, but + * that could change. + */ + bufsize = (uint32_t)buflen; + error = _nfssys(NFS4_DSS_SETPATHS_SIZE, &bufsize); + if (error) { + syslog(LOG_ERR, + "_nfssys(NFS4_DSS_SETPATHS_SIZE) failed: %s. ", + strerror(errno)); + free(bufp); + return (1); + } + + /* Pass the packed buffer to the kernel */ + error = _nfssys(NFS4_DSS_SETPATHS, bufp); + if (error) { + syslog(LOG_ERR, + "_nfssys(NFS4_DSS_SETPATHS) failed: %s. ", strerror(errno)); + free(bufp); + return (1); + } + + /* + * The kernel has now unpacked the buffer and extracted the + * pathnames array, we no longer need the buffer. + */ + free(bufp); + + return (0); +} + +/* + * Quick sort string compare routine, for qsort. + * Needed to make arg types correct. + */ +int +qstrcmp(const void *p1, const void *p2) +{ + char *s1 = *((char **)p1); + char *s2 = *((char **)p2); + + return (strcmp(s1, s2)); +} diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv.c b/usr/src/uts/common/fs/nfs/nfs4_srv.c index c0222cc6e2..0646bace0f 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_srv.c +++ b/usr/src/uts/common/fs/nfs/nfs4_srv.c @@ -54,6 +54,7 @@ #include <sys/policy.h> #include <sys/fem.h> #include <sys/sdt.h> +#include <sys/ddi.h> #include <rpc/types.h> #include <rpc/auth.h> @@ -272,10 +273,6 @@ rfs4_servinst_t *rfs4_cur_servinst = NULL; /* current server instance */ kmutex_t rfs4_servinst_lock; /* protects linked list */ int rfs4_seen_first_compound; /* set first time we see one */ -#ifdef DEBUG -int rfs4_servinst_debug = 0; -#endif - /* * NFS4 op dispatch table */ @@ -470,6 +467,8 @@ static char *rfs4_op_string[] = { void rfs4_ss_chkclid(rfs4_client_t *); +extern size_t strlcpy(char *dst, const char *src, size_t dstsize); + #ifdef nextdp #undef nextdp #endif @@ -601,9 +600,6 @@ rfs4_grace_start(rfs4_servinst_t *sip) { time_t now = gethrestime_sec(); - NFS4_DEBUG(rfs4_servinst_debug, (CE_NOTE, - "rfs4_grace_start: inst %p: 0x%lx", (void *)sip, now)); - rw_enter(&sip->rwlock, RW_WRITER); sip->start_time = now; sip->grace_period = rfs4_grace_period; @@ -655,24 +651,13 @@ rfs4_clnt_in_grace(rfs4_client_t *cp) void rfs4_grace_reset_all(void) { -#ifdef DEBUG - int n = 0; -#endif rfs4_servinst_t *sip; mutex_enter(&rfs4_servinst_lock); - for (sip = rfs4_cur_servinst; sip != NULL; sip = sip->prev) { - if (rfs4_servinst_in_grace(sip)) { + for (sip = rfs4_cur_servinst; sip != NULL; sip = sip->prev) + if (rfs4_servinst_in_grace(sip)) rfs4_grace_start(sip); -#ifdef DEBUG - n++; -#endif - } - } mutex_exit(&rfs4_servinst_lock); - - NFS4_DEBUG(rfs4_servinst_debug, (CE_NOTE, - "rfs4_grace_reset_all: reset %d instances", n)); } /* @@ -681,23 +666,52 @@ rfs4_grace_reset_all(void) void rfs4_grace_start_new(void) { -#ifdef DEBUG - int n = 0; -#endif rfs4_servinst_t *sip; mutex_enter(&rfs4_servinst_lock); - for (sip = rfs4_cur_servinst; sip != NULL; sip = sip->prev) { + for (sip = rfs4_cur_servinst; sip != NULL; sip = sip->prev) if (rfs4_servinst_grace_new(sip)) rfs4_grace_start(sip); -#ifdef DEBUG - n++; -#endif - } mutex_exit(&rfs4_servinst_lock); +} + +static rfs4_dss_path_t * +rfs4_dss_newpath(rfs4_servinst_t *sip, char *path, unsigned index) +{ + size_t len; + rfs4_dss_path_t *dss_path; + + dss_path = kmem_alloc(sizeof (rfs4_dss_path_t), KM_SLEEP); + + /* + * Take a copy of the string, since the original may be overwritten. + * Sadly, no strdup() in the kernel. + */ + /* allow for NUL */ + len = strlen(path) + 1; + dss_path->path = kmem_alloc(len, KM_SLEEP); + (void) strlcpy(dss_path->path, path, len); + + /* associate with servinst */ + dss_path->sip = sip; + dss_path->index = index; + + /* + * Add to list of served paths. + * No locking required, as we're only ever called at startup. + */ + if (rfs4_dss_pathlist == NULL) { + /* this is the first dss_path_t */ - NFS4_DEBUG(rfs4_servinst_debug, (CE_NOTE, - "rfs4_grace_start_new: started %d new instances", n)); + /* needed for insque/remque */ + dss_path->next = dss_path->prev = dss_path; + + rfs4_dss_pathlist = dss_path; + } else { + insque(dss_path, rfs4_dss_pathlist); + } + + return (dss_path); } /* @@ -706,9 +720,11 @@ rfs4_grace_start_new(void) * recovery window. */ void -rfs4_servinst_create(int start_grace) +rfs4_servinst_create(int start_grace, int dss_npaths, char **dss_paths) { + unsigned i; rfs4_servinst_t *sip; + rfs4_oldstate_t *oldstate; sip = kmem_alloc(sizeof (rfs4_servinst_t), KM_SLEEP); rw_init(&sip->rwlock, NULL, RW_DEFAULT, NULL); @@ -718,11 +734,28 @@ rfs4_servinst_create(int start_grace) sip->next = NULL; sip->prev = NULL; + rw_init(&sip->oldstate_lock, NULL, RW_DEFAULT, NULL); + /* + * This initial dummy entry is required to setup for insque/remque. + * It must be skipped over whenever the list is traversed. + */ + oldstate = kmem_alloc(sizeof (rfs4_oldstate_t), KM_SLEEP); + /* insque/remque require initial list entry to be self-terminated */ + oldstate->next = oldstate; + oldstate->prev = oldstate; + sip->oldstate = oldstate; + + + sip->dss_npaths = dss_npaths; + sip->dss_paths = kmem_alloc(dss_npaths * + sizeof (rfs4_dss_path_t *), KM_SLEEP); + + for (i = 0; i < dss_npaths; i++) { + sip->dss_paths[i] = rfs4_dss_newpath(sip, dss_paths[i], i); + } + mutex_enter(&rfs4_servinst_lock); - if (rfs4_cur_servinst == NULL) { - NFS4_DEBUG(rfs4_servinst_debug, (CE_NOTE, - "rfs4_servinst_create: creating first instance")); - } else { + if (rfs4_cur_servinst != NULL) { /* add to linked list */ sip->prev = rfs4_cur_servinst; rfs4_cur_servinst->next = sip; @@ -731,11 +764,8 @@ rfs4_servinst_create(int start_grace) rfs4_grace_start(sip); /* make the new instance "current" */ rfs4_cur_servinst = sip; - mutex_exit(&rfs4_servinst_lock); - NFS4_DEBUG(rfs4_servinst_debug, (CE_NOTE, - "rfs4_servinst_create: new current instance: %p; start_grace: %d", - (void *)sip, start_grace)); + mutex_exit(&rfs4_servinst_lock); } /* @@ -757,15 +787,17 @@ rfs4_servinst_destroy_all(void) for (sip = current; sip != NULL; sip = prev) { prev = sip->prev; rw_destroy(&sip->rwlock); + if (sip->oldstate) + kmem_free(sip->oldstate, sizeof (rfs4_oldstate_t)); + if (sip->dss_paths) + kmem_free(sip->dss_paths, + sip->dss_npaths * sizeof (rfs4_dss_path_t *)); kmem_free(sip, sizeof (rfs4_servinst_t)); #ifdef DEBUG n++; #endif } mutex_exit(&rfs4_servinst_lock); - - NFS4_DEBUG(rfs4_servinst_debug, (CE_NOTE, - "rfs4_servinst_destroy_all: destroyed %d instances", n)); } /* @@ -777,10 +809,6 @@ rfs4_servinst_assign(rfs4_client_t *cp, rfs4_servinst_t *sip) { ASSERT(rfs4_dbe_refcnt(cp->dbe) > 0); - NFS4_DEBUG(rfs4_servinst_debug, (CE_NOTE, - "rfs4_servinst_assign: client: %p, old: %p, new: %p", (void *)cp, - (void *)cp->server_instance, (void *)sip)); - /* * The lock ensures that if the current instance is in the process * of changing, we will see the new one. @@ -7486,7 +7514,15 @@ rfs4_op_setclientid_confirm(nfs_argop4 *argop, nfs_resop4 *resop, } /* - * Record clientid in stable storage + * Update the client's associated server instance, if it's changed + * since the client was created. + */ + if (rfs4_servinst(cp) != rfs4_cur_servinst) + rfs4_servinst_assign(cp, rfs4_cur_servinst); + + /* + * Record clientid in stable storage. + * Must be done after server instance has been assigned. */ rfs4_ss_clid(cp, req); @@ -7501,13 +7537,6 @@ rfs4_op_setclientid_confirm(nfs_argop4 *argop, nfs_resop4 *resop, rfs4_update_lease(cp); /* - * Update the client's associated server instance, if it's changed - * since the client was created. - */ - if (rfs4_servinst(cp) != rfs4_cur_servinst) - rfs4_servinst_assign(cp, rfs4_cur_servinst); - - /* * Check to see if client can perform reclaims */ rfs4_ss_chkclid(cp); diff --git a/usr/src/uts/common/fs/nfs/nfs4_state.c b/usr/src/uts/common/fs/nfs/nfs4_state.c index 463cc89e6f..dd78ad7b74 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_state.c +++ b/usr/src/uts/common/fs/nfs/nfs4_state.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -39,7 +38,7 @@ #include <nfs/nfssys.h> #include <nfs/lm.h> #include <sys/pathname.h> - +#include <sys/nvpair.h> extern time_t rfs4_start_time; @@ -72,6 +71,11 @@ int rfs4_debug; static uint32_t rfs4_database_debug = 0x00; +static void rfs4_ss_clid_write(rfs4_client_t *cp, char *leaf); +static void rfs4_ss_clid_write_one(rfs4_client_t *cp, char *dir, char *leaf); +static void rfs4_dss_clear_oldstate(rfs4_servinst_t *sip); +static void rfs4_ss_chkclid_sip(rfs4_client_t *cp, rfs4_servinst_t *sip); + /* * Couple of simple init/destroy functions for a general waiter */ @@ -333,6 +337,8 @@ static time_t rfs4_file_cache_time = 0; static time_t rfs4_deleg_state_cache_time = 0; static bool_t rfs4_client_create(rfs4_entry_t, void *); +static void rfs4_dss_remove_cpleaf(rfs4_client_t *); +static void rfs4_dss_remove_leaf(rfs4_servinst_t *, char *, char *); static void rfs4_client_destroy(rfs4_entry_t); static bool_t rfs4_client_expiry(rfs4_entry_t); static uint32_t clientid_hash(void *); @@ -394,15 +400,8 @@ static void *deleg_state_mkkey(rfs4_entry_t); static void rfs4_state_rele_nounlock(rfs4_state_t *); -static rfs4_oldstate_t *rfs4_oldstate = NULL; -static krwlock_t rfs4_oldstate_lock; static int rfs4_ss_enabled = 0; -#define NFS4_VAR_DIR "/var/nfs" -#define NFS4_STATE_DIR NFS4_VAR_DIR"/v4_state" -#define NFS4_OLDSTATE_DIR NFS4_VAR_DIR"/v4_oldstate" -#define NFS4_SS_DIR_MODE 0755 - extern void (*rfs4_client_clrst)(struct nfs4clrst_args *); void @@ -411,24 +410,6 @@ rfs4_ss_pnfree(rfs4_ss_pn_t *ss_pn) kmem_free(ss_pn, sizeof (rfs4_ss_pn_t)); } -/* - * Free all malloced rsf4_oldstate_t memory - */ -void -rfs4_oldstate_free(rfs4_oldstate_t *ros) -{ - if (ros == NULL) - return; - - if (ros->cl_id4.id_val) - kmem_free(ros->cl_id4.id_val, ros->cl_id4.id_len); - - if (ros->ss_pn) - kmem_free(ros->ss_pn, sizeof (rfs4_ss_pn_t)); - - kmem_free(ros, sizeof (rfs4_oldstate_t)); -} - static rfs4_ss_pn_t * rfs4_ss_pnalloc(char *dir, char *leaf) { @@ -465,9 +446,8 @@ rfs4_ss_movestate(char *sdir, char *ddir, char *leaf) { rfs4_ss_pn_t *src, *dst; - if ((src = rfs4_ss_pnalloc(sdir, leaf)) == NULL) { + if ((src = rfs4_ss_pnalloc(sdir, leaf)) == NULL) return (NULL); - } if ((dst = rfs4_ss_pnalloc(ddir, leaf)) == NULL) { rfs4_ss_pnfree(src); @@ -500,9 +480,8 @@ rfs4_ss_getstate(vnode_t *dvp, rfs4_ss_pn_t *ss_pn) uint_t id_len; int err, kill_file, file_vers; - if (ss_pn == NULL) { + if (ss_pn == NULL) return (NULL); - } /* * open the state file. @@ -554,7 +533,7 @@ rfs4_ss_getstate(vnode_t *dvp, rfs4_ss_pn_t *ss_pn) */ iov[0].iov_base = (caddr_t)&file_vers; iov[0].iov_len = sizeof (int); - iov[1].iov_base = (caddr_t)cl_ss; + iov[1].iov_base = (caddr_t)&cl_ss->cl_id4.verifier; iov[1].iov_len = NFS4_VERIFIER_SIZE; iov[2].iov_base = (caddr_t)&id_len; iov[2].iov_len = sizeof (uint_t); @@ -626,9 +605,11 @@ rfs4_ss_getstate(vnode_t *dvp, rfs4_ss_pn_t *ss_pn) #define nextdp(dp) ((struct dirent64 *)((char *)(dp) + (dp)->d_reclen)) /* + * Add entries from statedir to supplied oldstate list. + * Optionally, move all entries from statedir -> destdir. */ void -rfs4_ss_oldstate(char *dir, int do_move) +rfs4_ss_oldstate(rfs4_oldstate_t *oldstate, char *statedir, char *destdir) { rfs4_ss_pn_t *ss_pn; rfs4_oldstate_t *cl_ss = NULL; @@ -643,24 +624,11 @@ rfs4_ss_oldstate(char *dir, int do_move) /* * open the state directory */ - if (err = vn_open(dir, UIO_SYSSPACE, FREAD, 0, &dvp, 0, 0)) { - return; - } - - /* - * if this is not a directory return - */ - if (dvp->v_type != VDIR) { - (void) VOP_CLOSE(dvp, FREAD, 1, (offset_t)0, CRED()); - VN_RELE(dvp); + if (vn_open(statedir, UIO_SYSSPACE, FREAD, 0, &dvp, 0, 0)) return; - } - err = VOP_ACCESS(dvp, VREAD, 0, CRED()); - if (err) { - /* Can't read the directory. So get the heck out. */ + if (dvp->v_type != VDIR || VOP_ACCESS(dvp, VREAD, 0, CRED())) goto out; - } dirt = kmem_alloc(RFS4_SS_DIRSIZE, KM_SLEEP); @@ -678,12 +646,9 @@ rfs4_ss_oldstate(char *dir, int do_move) uio.uio_resid = RFS4_SS_DIRSIZE; err = VOP_READDIR(dvp, &uio, CRED(), &dir_eof); - VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL); - - if (err) { + if (err) goto out; - } size = RFS4_SS_DIRSIZE - uio.uio_resid; @@ -700,131 +665,136 @@ rfs4_ss_oldstate(char *dir, int do_move) /* * Skip '.' and '..' */ - if (NFS_IS_DOTNAME(dep->d_name)) { + if (NFS_IS_DOTNAME(dep->d_name)) continue; - } - if ((ss_pn = rfs4_ss_pnalloc(dir, dep->d_name)) - == NULL) { + ss_pn = rfs4_ss_pnalloc(statedir, dep->d_name); + if (ss_pn == NULL) continue; - } if (cl_ss = rfs4_ss_getstate(dvp, ss_pn)) { - if (do_move) { + if (destdir != NULL) { rfs4_ss_pnfree(ss_pn); cl_ss->ss_pn = rfs4_ss_movestate( - NFS4_STATE_DIR, - NFS4_OLDSTATE_DIR, - dep->d_name); + statedir, destdir, dep->d_name); } else { cl_ss->ss_pn = ss_pn; } - insque(cl_ss, rfs4_oldstate); + insque(cl_ss, oldstate); } else { rfs4_ss_pnfree(ss_pn); } } } -out: +out: (void) VOP_CLOSE(dvp, FREAD, 1, (offset_t)0, CRED()); VN_RELE(dvp); if (dirt) kmem_free((caddr_t)dirt, RFS4_SS_DIRSIZE); } -/* - * Validates that the needed directories exist - */ -bool_t -rfs4_validate_var(void) +static void +rfs4_ss_init(void) { - vnode_t *vp; - int i; - char *dnp; - bool_t ret_val = TRUE; - char *dir_names[] = { - NFS4_VAR_DIR, - NFS4_STATE_DIR, - NFS4_OLDSTATE_DIR, - NULL - }; + int npaths = 1; + char *default_dss_path = NFS4_DSS_VAR_DIR; - for (i = 0, dnp = dir_names[i]; dnp; i++) { - if (lookupname(dnp, UIO_SYSSPACE, - NO_FOLLOW, NULLVPP, &vp) != 0) { - cmn_err(CE_WARN, "!NFS4 stable storage directory " - "missing!: %s", dnp); - ret_val = FALSE; - } else { - VN_RELE(vp); - } - dnp = dir_names[i]; + /* read the default stable storage state */ + rfs4_dss_readstate(npaths, &default_dss_path); + + rfs4_ss_enabled = 1; +} + +static void +rfs4_ss_fini(void) +{ + rfs4_servinst_t *sip; + + mutex_enter(&rfs4_servinst_lock); + sip = rfs4_cur_servinst; + while (sip != NULL) { + rfs4_dss_clear_oldstate(sip); + sip = sip->next; } - return (ret_val); + mutex_exit(&rfs4_servinst_lock); } /* - * + * Remove all oldstate files referenced by this servinst. */ static void -rfs4_ss_init(void) +rfs4_dss_clear_oldstate(rfs4_servinst_t *sip) { - rw_init(&rfs4_oldstate_lock, NULL, RW_DEFAULT, NULL); + rfs4_oldstate_t *os_head, *osp; + + rw_enter(&sip->oldstate_lock, RW_WRITER); + os_head = sip->oldstate; - if (rfs4_validate_var() == FALSE) { - rfs4_oldstate = NULL; + if (os_head == NULL) return; - } - rfs4_oldstate = kmem_alloc(sizeof (rfs4_oldstate_t), KM_SLEEP); - rfs4_oldstate->next = rfs4_oldstate; - rfs4_oldstate->prev = rfs4_oldstate; + /* skip dummy entry */ + osp = os_head->next; + while (osp != os_head) { + char *leaf = osp->ss_pn->leaf; + rfs4_oldstate_t *os_next; - /* - * load info from the OLD directory - */ - rfs4_ss_oldstate(NFS4_OLDSTATE_DIR, 0); + rfs4_dss_remove_leaf(sip, NFS4_DSS_OLDSTATE_LEAF, leaf); - /* - * Gather and move NFS4_STATE_DIR to NFS4_OLDSTATE_DIR - */ - rfs4_ss_oldstate(NFS4_STATE_DIR, 1); + if (osp->cl_id4.id_val) + kmem_free(osp->cl_id4.id_val, osp->cl_id4.id_len); + if (osp->ss_pn) + kmem_free(osp->ss_pn, sizeof (rfs4_ss_pn_t)); - rfs4_ss_enabled = 1; + os_next = osp->next; + remque(osp); + kmem_free(osp, sizeof (rfs4_oldstate_t)); + osp = os_next; + } + + /* free dummy entry */ + kmem_free(osp, sizeof (rfs4_oldstate_t)); + + sip->oldstate = NULL; + + rw_exit(&sip->oldstate_lock); } -static void -rfs4_ss_fini(void) +/* + * Form the state and oldstate paths, and read in the stable storage files. + */ +void +rfs4_dss_readstate(int npaths, char **paths) { + int i; + char *state, *oldstate; - rfs4_oldstate_t *ost, *osp, *os_head; + state = kmem_alloc(MAXPATHLEN, KM_SLEEP); + oldstate = kmem_alloc(MAXPATHLEN, KM_SLEEP); - rw_destroy(&rfs4_oldstate_lock); + for (i = 0; i < npaths; i++) { + char *path = paths[i]; - /* - * short circuit everything if we have no - * remaining oldstate! - */ - if (rfs4_oldstate == NULL) { - return; - } - - /* - * It is possible to start and immediately stop the server - * in which case we would not have cleaned up the oldstate - * circular queue so we may do it here. - */ - os_head = rfs4_oldstate; - osp = os_head->next; + (void) sprintf(state, "%s/%s", path, NFS4_DSS_STATE_LEAF); + (void) sprintf(oldstate, "%s/%s", path, NFS4_DSS_OLDSTATE_LEAF); - while (osp != os_head) { - ost = osp->next; - remque(osp); - rfs4_oldstate_free(osp); - osp = ost; + /* + * Populate the current server instance's oldstate list. + * + * 1. Read stable storage data from old state directory, + * leaving its contents alone. + * + * 2. Read stable storage data from state directory, + * and move the latter's contents to old state + * directory. + */ + rfs4_ss_oldstate(rfs4_cur_servinst->oldstate, oldstate, NULL); + rfs4_ss_oldstate(rfs4_cur_servinst->oldstate, state, oldstate); } - kmem_free(os_head, sizeof (rfs4_oldstate_t)); + + kmem_free(state, MAXPATHLEN); + kmem_free(oldstate, MAXPATHLEN); } @@ -835,63 +805,63 @@ rfs4_ss_fini(void) void rfs4_ss_chkclid(rfs4_client_t *cp) { - rfs4_oldstate_t *ost, *osp, *os_head; + rfs4_servinst_t *sip; /* - * short circuit everything if we have no - * oldstate! + * It should be sufficient to check the oldstate data for just + * this client's instance. However, since our per-instance + * client grouping is solely temporal, HA-NFSv4 RG failover + * might result in clients of the same RG being partitioned into + * separate instances. + * + * Until the client grouping is improved, we must check the + * oldstate data for all instances with an active grace period. + * + * This also serves as the mechanism to remove stale oldstate data. + * The first time we check an instance after its grace period has + * expired, the oldstate data should be cleared. + * + * Start at the current instance, and walk the list backwards + * to the first. */ - if (rfs4_oldstate == NULL) { - return; - } + mutex_enter(&rfs4_servinst_lock); + for (sip = rfs4_cur_servinst; sip != NULL; sip = sip->prev) { + rfs4_ss_chkclid_sip(cp, sip); - /* - * if we are not in the grace_period then - * we can destroy and mutilate all the old state. - */ - if (!rfs4_clnt_in_grace(cp)) { - rw_enter(&rfs4_oldstate_lock, RW_WRITER); - if (rfs4_oldstate == NULL) { - /* - * some other thread is killing - * the state so we get to just return. - */ - rw_exit(&rfs4_oldstate_lock); - return; - } - - os_head = rfs4_oldstate; - rfs4_oldstate = NULL; - rw_exit(&rfs4_oldstate_lock); + /* if the above check found this client, we're done */ + if (cp->can_reclaim) + break; + } + mutex_exit(&rfs4_servinst_lock); +} - /* - * Now ditch the state files and structures - * we've malloc()'d - */ - osp = os_head->next; +static void +rfs4_ss_chkclid_sip(rfs4_client_t *cp, rfs4_servinst_t *sip) +{ + rfs4_oldstate_t *osp, *os_head; - while (osp != os_head) { - if (osp->ss_pn != NULL) { - (void) vn_remove(osp->ss_pn->pn, - UIO_SYSSPACE, RMFILE); - } - ost = osp->next; - remque(osp); - rfs4_oldstate_free(osp); - osp = ost; - } - kmem_free(os_head, sizeof (rfs4_oldstate_t)); + /* short circuit everything if this server instance has no oldstate */ + rw_enter(&sip->oldstate_lock, RW_READER); + os_head = sip->oldstate; + rw_exit(&sip->oldstate_lock); + if (os_head == NULL) return; - } /* - * we're still in grace, search for the clientid + * If this server instance is no longer in a grace period then + * the client won't be able to reclaim. No further need for this + * instance's oldstate data, so it can be cleared. */ - rw_enter(&rfs4_oldstate_lock, RW_READER); + if (!rfs4_servinst_in_grace(sip)) + return; - os_head = rfs4_oldstate; - osp = os_head->next; + /* this instance is still in grace; search for the clientid */ + + rw_enter(&sip->oldstate_lock, RW_READER); + os_head = sip->oldstate; + /* skip dummy entry */ + osp = os_head->next; while (osp != os_head) { if (osp->cl_id4.id_len == cp->nfs_client.id_len) { if (bcmp(osp->cl_id4.id_val, cp->nfs_client.id_val, @@ -903,25 +873,19 @@ rfs4_ss_chkclid(rfs4_client_t *cp) osp = osp->next; } - rw_exit(&rfs4_oldstate_lock); + rw_exit(&sip->oldstate_lock); } /* - * Place client information into stable storage. + * Place client information into stable storage: 1/3. + * First, generate the leaf filename, from the client's IP address and + * the server-generated short-hand clientid. */ void rfs4_ss_clid(rfs4_client_t *cp, struct svc_req *req) { const char *kinet_ntop6(uchar_t *, char *, size_t); - - nfs_client_id4 *cl_id4; - rfs4_ss_pn_t *ss_pn; char leaf[MAXNAMELEN], buf[INET6_ADDRSTRLEN]; - vnode_t *vp; - struct uio uio; - struct iovec iov[4]; - int file_vers = NFS4_SS_VERSION; - int ioflag; struct sockaddr *ca; uchar_t *b; @@ -959,10 +923,70 @@ rfs4_ss_clid(rfs4_client_t *cp, struct svc_req *req) (void) snprintf(leaf, MAXNAMELEN, "%s-%llx", buf, (longlong_t)cp->clientid); + rfs4_ss_clid_write(cp, leaf); +} - if ((ss_pn = rfs4_ss_pnalloc(NFS4_STATE_DIR, leaf)) == NULL) { - return; +/* + * Place client information into stable storage: 2/3. + * DSS: distributed stable storage: the file may need to be written to + * multiple directories. + */ +static void +rfs4_ss_clid_write(rfs4_client_t *cp, char *leaf) +{ + rfs4_servinst_t *sip; + + /* + * It should be sufficient to write the leaf file to (all) DSS paths + * associated with just this client's instance. However, since our + * per-instance client grouping is solely temporal, HA-NFSv4 RG + * failover might result in us losing DSS data. + * + * Until the client grouping is improved, we must write the DSS data + * to all instances' paths. Start at the current instance, and + * walk the list backwards to the first. + */ + mutex_enter(&rfs4_servinst_lock); + for (sip = rfs4_cur_servinst; sip != NULL; sip = sip->prev) { + int i, npaths = sip->dss_npaths; + + /* write the leaf file to all DSS paths */ + for (i = 0; i < npaths; i++) { + rfs4_dss_path_t *dss_path = sip->dss_paths[i]; + + /* HA-NFSv4 path might have been failed-away from us */ + if (dss_path == NULL) + continue; + + rfs4_ss_clid_write_one(cp, dss_path->path, leaf); + } } + mutex_exit(&rfs4_servinst_lock); +} + +/* + * Place client information into stable storage: 3/3. + * Write the stable storage data to the requested file. + */ +static void +rfs4_ss_clid_write_one(rfs4_client_t *cp, char *dss_path, char *leaf) +{ + int ioflag; + int file_vers = NFS4_SS_VERSION; + struct uio uio; + struct iovec iov[4]; + char *dir; + rfs4_ss_pn_t *ss_pn; + vnode_t *vp; + nfs_client_id4 *cl_id4 = &(cp->nfs_client); + + /* allow 2 extra bytes for '/' & NUL */ + dir = kmem_alloc(strlen(dss_path) + strlen(NFS4_DSS_STATE_LEAF) + 2, + KM_SLEEP); + (void) sprintf(dir, "%s/%s", dss_path, NFS4_DSS_STATE_LEAF); + + if ((ss_pn = rfs4_ss_pnalloc(dir, leaf)) == NULL) + return; if (vn_open(ss_pn->pn, UIO_SYSSPACE, FCREAT|FWRITE, 0600, &vp, CRCREAT, 0)) { @@ -970,19 +994,31 @@ rfs4_ss_clid(rfs4_client_t *cp, struct svc_req *req) return; } - if (cp->ss_pn) - rfs4_ss_pnfree(cp->ss_pn); - - cp->ss_pn = ss_pn; - - cl_id4 = &(cp->nfs_client); + /* + * We need to record leaf - i.e. the filename - so that we know + * what to remove, in the future. However, the dir part of cp->ss_pn + * should never be referenced directly, since it's potentially only + * one of several paths with this leaf in it. + */ + if (cp->ss_pn != NULL) { + if (strcmp(cp->ss_pn->leaf, leaf) == 0) { + /* we've already recorded *this* leaf */ + rfs4_ss_pnfree(ss_pn); + } else { + /* replace with this leaf */ + rfs4_ss_pnfree(cp->ss_pn); + cp->ss_pn = ss_pn; + } + } else { + cp->ss_pn = ss_pn; + } /* * Build a scatter list that points to the nfs_client_id4 */ iov[0].iov_base = (caddr_t)&file_vers; iov[0].iov_len = sizeof (int); - iov[1].iov_base = (caddr_t)cl_id4; + iov[1].iov_base = (caddr_t)&(cl_id4->verifier); iov[1].iov_len = NFS4_VERIFIER_SIZE; iov[2].iov_base = (caddr_t)&(cl_id4->id_len); iov[2].iov_len = sizeof (uint_t); @@ -1010,6 +1046,45 @@ rfs4_ss_clid(rfs4_client_t *cp, struct svc_req *req) } /* + * DSS: distributed stable storage. + * Unpack the list of paths passed by nfsd. + * Use nvlist_alloc(9F) to manage the data. + * The caller is responsible for allocating and freeing the buffer. + */ +int +rfs4_dss_setpaths(char *buf, size_t buflen) +{ + int error; + + /* + * If this is a "warm start", i.e. we previously had DSS paths, + * preserve the old paths. + */ + if (rfs4_dss_paths != NULL) { + /* + * Before we lose the ptr, destroy the nvlist and pathnames + * array from the warm start before this one. + */ + if (rfs4_dss_oldpaths) + nvlist_free(rfs4_dss_oldpaths); + rfs4_dss_oldpaths = rfs4_dss_paths; + } + + /* unpack the buffer into a searchable nvlist */ + error = nvlist_unpack(buf, buflen, &rfs4_dss_paths, KM_SLEEP); + if (error) + return (error); + + /* + * Search the nvlist for the pathnames nvpair (which is the only nvpair + * in the list, and record its location. + */ + error = nvlist_lookup_string_array(rfs4_dss_paths, NFS4_DSS_NVPAIR_NAME, + &rfs4_dss_newpaths, &rfs4_dss_numnewpaths); + return (error); +} + +/* * Ultimately the nfssys() call NFS4_CLR_STATE endsup here * to find and mark the client for forced expire. */ @@ -1089,6 +1164,7 @@ rfs4_state_init() { int start_grace; extern boolean_t rfs4_cpr_callb(void *, int); + char *dss_path = NFS4_DSS_VAR_DIR; mutex_enter(&rfs4_state_lock); @@ -1114,6 +1190,9 @@ rfs4_state_init() else rfs4_start_time++; + /* DSS: distributed stable storage: initialise served paths list */ + rfs4_dss_pathlist = NULL; + /* * Create the first server instance, or a new one if the server has * been restarted; see above comments on rfs4_start_time. Don't @@ -1121,7 +1200,7 @@ rfs4_state_init() * clients' recovery window. */ start_grace = 0; - rfs4_servinst_create(start_grace); + rfs4_servinst_create(start_grace, 1, &dss_path); /* reset the "first NFSv4 request" status */ rfs4_seen_first_compound = 0; @@ -1355,6 +1434,13 @@ rfs4_state_fini() /* reset the "first NFSv4 request" status */ rfs4_seen_first_compound = 0; + + /* DSS: distributed stable storage */ + if (rfs4_dss_oldpaths) + nvlist_free(rfs4_dss_oldpaths); + if (rfs4_dss_paths) + nvlist_free(rfs4_dss_paths); + rfs4_dss_paths = rfs4_dss_oldpaths = NULL; } typedef union { @@ -1455,11 +1541,50 @@ rfs4_client_expiry(rfs4_entry_t u_entry) cp_expired = (cp->forced_expire || (gethrestime_sec() - cp->last_access > rfs4_lease_time)); + if (!cp->ss_remove && cp_expired) cp->ss_remove = 1; return (cp_expired); } +/* + * Remove the leaf file from all distributed stable storage paths. + */ +static void +rfs4_dss_remove_cpleaf(rfs4_client_t *cp) +{ + char *leaf = cp->ss_pn->leaf; + + rfs4_dss_remove_leaf(cp->server_instance, NFS4_DSS_STATE_LEAF, leaf); +} + +static void +rfs4_dss_remove_leaf(rfs4_servinst_t *sip, char *dir_leaf, char *leaf) +{ + int i, npaths = sip->dss_npaths; + + for (i = 0; i < npaths; i++) { + rfs4_dss_path_t *dss_path = sip->dss_paths[i]; + char *path, *dir; + size_t pathlen; + + /* the HA-NFSv4 path might have been failed-over away from us */ + if (dss_path == NULL) + continue; + + dir = dss_path->path; + + /* allow 3 extra bytes for two '/' & a NUL */ + pathlen = strlen(dir) + strlen(dir_leaf) + strlen(leaf) + 3; + path = kmem_alloc(pathlen, KM_SLEEP); + (void) sprintf(path, "%s/%s/%s", dir, dir_leaf, leaf); + + (void) vn_remove(path, UIO_SYSSPACE, RMFILE); + + kmem_free(path, pathlen); + } +} + static void rfs4_client_destroy(rfs4_entry_t u_entry) { @@ -1476,12 +1601,9 @@ rfs4_client_destroy(rfs4_entry_t u_entry) rfs4_client_rele(cp->cp_confirmed); if (cp->ss_pn) { - /* - * check if the stable storage file needs - * to be removed - */ + /* check if the stable storage files need to be removed */ if (cp->ss_remove) - (void) vn_remove(cp->ss_pn->pn, UIO_SYSSPACE, RMFILE); + rfs4_dss_remove_cpleaf(cp); rfs4_ss_pnfree(cp->ss_pn); } diff --git a/usr/src/uts/common/fs/nfs/nfs_server.c b/usr/src/uts/common/fs/nfs/nfs_server.c index 42d7e071ab..7b382608f5 100644 --- a/usr/src/uts/common/fs/nfs/nfs_server.c +++ b/usr/src/uts/common/fs/nfs/nfs_server.c @@ -106,6 +106,9 @@ static struct modlinkage modlinkage = { char _depends_on[] = "misc/klmmod"; +/* for testing RG failover code path on non-Cluster system */ +int hanfsv4_force = 0; + int _init(void) { @@ -125,7 +128,19 @@ _init(void) nfs_srvfini(); } + /* + * Initialise some placeholders for nfssys() calls. These have + * to be declared by the nfs module, since that handles nfssys() + * calls - also used by NFS clients - but are provided by this + * nfssrv module. These also then serve as confirmation to the + * relevant code in nfs that nfssrv has been loaded, as they're + * initially NULL. + */ nfs_srv_quiesce_func = nfs_srv_quiesce_all; + nfs_srv_dss_func = rfs4_dss_setpaths; + + /* setup DSS paths here; must be done before initial server startup */ + rfs4_dss_paths = rfs4_dss_oldpaths = NULL; return (status); } @@ -166,6 +181,7 @@ static void acl_dispatch(struct svc_req *, SVCXPRT *); static void common_dispatch(struct svc_req *, SVCXPRT *, rpcvers_t, rpcvers_t, char *, struct rpc_disptable *); +static void hanfsv4_failover(void); static int checkauth(struct exportinfo *, struct svc_req *, cred_t *, int, bool_t); static char *client_name(struct svc_req *req); @@ -241,6 +257,12 @@ static nfs_server_running_t nfs_server_upordown; static kmutex_t nfs_server_upordown_lock; static kcondvar_t nfs_server_upordown_cv; +/* + * DSS: distributed stable storage + * lists of all DSS paths: current, and before last warmstart + */ +nvlist_t *rfs4_dss_paths, *rfs4_dss_oldpaths; + int rfs4_dispatch(struct rpcdisp *, struct svc_req *, SVCXPRT *, char *); /* @@ -298,6 +320,11 @@ nfs_srv_shutdown_all(int quiesce) { nfs_server_upordown == NFS_SERVER_OFFLINE) { nfs_server_upordown = NFS_SERVER_QUIESCED; cv_signal(&nfs_server_upordown_cv); + + /* reset DSS state, for subsequent warm restart */ + rfs4_dss_numnewpaths = 0; + rfs4_dss_newpaths = NULL; + cmn_err(CE_NOTE, "nfs_server: server is now quiesced; " "NFSv4 state has been preserved"); } @@ -458,7 +485,7 @@ nfs_svc(struct nfs_svc_args *arg, model_t model) releasef(STRUCT_FGET(uap, fd)); - /* save the cluster nodeid */ + /* HA-NFSv4: save the cluster nodeid */ if (cluster_bootflags & CLUSTER_BOOTED) lm_global_nlmid = clconf_get_nodeid(); @@ -489,28 +516,20 @@ rfs4_server_start(int nfs4_srv_delegation) /* is this an nfsd warm start? */ if (nfs_server_upordown == NFS_SERVER_QUIESCED) { - int start_grace; - cmn_err(CE_NOTE, "nfs_server: " "server was previously quiesced; " "existing NFSv4 state will be re-used"); /* - * Cluster: this is also the signal that - * a failover has occurred, so create a new - * server instance, and start its grace period. - * We also need to reset all currently - * active grace periods in case of multiple - * failovers within the grace duration, - * to avoid partitioning clients of the same - * resource into different instances. + * HA-NFSv4: this is also the signal + * that a Resource Group failover has + * occurred. */ - if (cluster_bootflags & CLUSTER_BOOTED) { - rfs4_grace_reset_all(); - start_grace = 1; - rfs4_servinst_create(start_grace); - } + if (cluster_bootflags & CLUSTER_BOOTED || + hanfsv4_force) + hanfsv4_failover(); } else { + /* cold start */ rfs4_state_init(); nfs4_drc = rfs4_init_drc(nfs4_drc_max, nfs4_drc_hash, @@ -2836,3 +2855,160 @@ nfs_check_vpexi(vnode_t *mc_dvp, vnode_t *vp, cred_t *cr, return (error); } + +/* + * Do the main work of handling HA-NFSv4 Resource Group failover on + * Sun Cluster. + * We need to detect whether any RG admin paths have been added or removed, + * and adjust resources accordingly. + * Currently we're using a very inefficient algorithm, ~ 2 * O(n**2). In + * order to scale, the list and array of paths need to be held in more + * suitable data structures. + */ +static void +hanfsv4_failover(void) +{ + int i, start_grace, numadded_paths = 0; + char **added_paths = NULL; + rfs4_dss_path_t *dss_path; + + /* + * First, look for removed paths: RGs that have been failed-over + * away from this node. + * Walk the "currently-serving" rfs4_dss_pathlist and, for each + * path, check if it is on the "passed-in" rfs4_dss_newpaths array + * from nfsd. If not, that RG path has been removed. + * + * Note that nfsd has sorted rfs4_dss_newpaths for us, and removed + * any duplicates. + */ + dss_path = rfs4_dss_pathlist; + do { + int found = 0; + char *path = dss_path->path; + + /* used only for non-HA so may not be removed */ + if (strcmp(path, NFS4_DSS_VAR_DIR) == 0) { + dss_path = dss_path->next; + continue; + } + + for (i = 0; i < rfs4_dss_numnewpaths; i++) { + int cmpret; + size_t ncmp; + char *newpath = rfs4_dss_newpaths[i]; + + ncmp = MAX(strlen(path), strlen(newpath)); + cmpret = strncmp(path, newpath, ncmp); + + /* + * Since nfsd has sorted rfs4_dss_newpaths for us, + * once the return from strncmp is negative we know + * we've passed the point where "path" should be, + * and can stop searching: "path" has been removed. + */ + if (cmpret < 0) + break; + + if (cmpret == 0) { + found = 1; + break; + } + } + + if (found == 0) { + unsigned index = dss_path->index; + rfs4_servinst_t *sip = dss_path->sip; + rfs4_dss_path_t *path_next = dss_path->next; + + /* + * This path has been removed. + * We must clear out the servinst reference to + * it, since it's now owned by another + * node: we should not attempt to touch it. + */ + ASSERT(dss_path == sip->dss_paths[index]); + sip->dss_paths[index] = NULL; + + /* remove from "currently-serving" list, and destroy */ + remque(dss_path); + kmem_free(dss_path, sizeof (rfs4_dss_path_t)); + + dss_path = path_next; + } else { + /* path was found; not removed */ + dss_path = dss_path->next; + } + } while (dss_path != rfs4_dss_pathlist); + + /* + * Now, look for added paths: RGs that have been failed-over + * to this node. + * Walk the "passed-in" rfs4_dss_newpaths array from nfsd and, + * for each path, check if it is on the "currently-serving" + * rfs4_dss_pathlist. If not, that RG path has been added. + * + * Note: we don't do duplicate detection here; nfsd does that for us. + * + * Note: numadded_paths <= rfs4_dss_numnewpaths, which gives us + * an upper bound for the size needed for added_paths[numadded_paths]. + */ + + /* probably more space than we need, but guaranteed to be enough */ + if (rfs4_dss_numnewpaths > 0) { + size_t sz = rfs4_dss_numnewpaths * sizeof (char *); + added_paths = kmem_zalloc(sz, KM_SLEEP); + } + + /* walk the "passed-in" rfs4_dss_newpaths array from nfsd */ + for (i = 0; i < rfs4_dss_numnewpaths; i++) { + int found = 0; + char *newpath = rfs4_dss_newpaths[i]; + + dss_path = rfs4_dss_pathlist; + do { + char *path = dss_path->path; + + /* used only for non-HA */ + if (strcmp(path, NFS4_DSS_VAR_DIR) == 0) { + dss_path = dss_path->next; + continue; + } + + if (strncmp(path, newpath, strlen(path)) == 0) { + found = 1; + break; + } + + dss_path = dss_path->next; + } while (dss_path != rfs4_dss_pathlist); + + if (found == 0) { + added_paths[numadded_paths] = newpath; + numadded_paths++; + } + } + + /* did we find any added paths? */ + if (numadded_paths > 0) { + /* create a new server instance, and start its grace period */ + start_grace = 1; + rfs4_servinst_create(start_grace, numadded_paths, added_paths); + + /* read in the stable storage state from these paths */ + rfs4_dss_readstate(numadded_paths, added_paths); + + /* + * Multiple failovers during a grace period will cause + * clients of the same resource group to be partitioned + * into different server instances, with different + * grace periods. Since clients of the same resource + * group must be subject to the same grace period, + * we need to reset all currently active grace periods. + */ + rfs4_grace_reset_all(); + } + + if (rfs4_dss_numnewpaths > 0) + kmem_free(added_paths, rfs4_dss_numnewpaths * sizeof (char *)); +} diff --git a/usr/src/uts/common/fs/nfs/nfs_sys.c b/usr/src/uts/common/fs/nfs/nfs_sys.c index af32a7a7f3..5101da02ed 100644 --- a/usr/src/uts/common/fs/nfs/nfs_sys.c +++ b/usr/src/uts/common/fs/nfs/nfs_sys.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. @@ -38,6 +37,7 @@ #include <sys/policy.h> #include <sys/siginfo.h> #include <sys/proc.h> /* for exit() declaration */ +#include <sys/kmem.h> #include <nfs/nfs4.h> #include <nfs/nfssys.h> #include <sys/thread.h> @@ -70,6 +70,12 @@ void (*nfs_srv_quiesce_func)(void) = NULL; time_t rfs4_lease_time = RFS4_LEASETIME; time_t rfs4_grace_period = RFS4_LEASETIME; +/* DSS: distributed stable storage */ +size_t nfs4_dss_buflen = 0; +/* This filled in by nfssrv:_init() */ +int (*nfs_srv_dss_func)(char *, size_t) = NULL; + + int nfssys(enum nfssys_op opcode, void *arg) { @@ -182,22 +188,6 @@ nfssys(enum nfssys_op opcode, void *arg) break; } - /* Request that NFS server quiesce on next shutdown */ - case NFS_SVC_REQUEST_QUIESCE: { - int id; - - /* check that nfssrv module is loaded */ - if (nfs_srv_quiesce_func == NULL) - return (set_errno(ENOTSUP)); - - if (copyin(arg, &id, sizeof (id))) - return (set_errno(EFAULT)); - - error = svc_pool_control(id, SVCPSET_SHUTDOWN_PROC, - (void *)nfs_srv_quiesce_func); - break; - } - case EXPORTFS: { /* export a file system */ STRUCT_DECL(exportfs_args, ea); @@ -295,6 +285,22 @@ nfssys(enum nfssys_op opcode, void *arg) break; } + /* Request that NFSv4 server quiesce on next shutdown */ + case NFS4_SVC_REQUEST_QUIESCE: { + int id; + + /* check that nfssrv module is loaded */ + if (nfs_srv_quiesce_func == NULL) + return (set_errno(ENOTSUP)); + + if (copyin(arg, &id, sizeof (id))) + return (set_errno(EFAULT)); + + error = svc_pool_control(id, SVCPSET_SHUTDOWN_PROC, + (void *)nfs_srv_quiesce_func); + break; + } + case NFS_IDMAP: { struct nfsidmap_args idm; @@ -306,6 +312,47 @@ nfssys(enum nfssys_op opcode, void *arg) break; } + case NFS4_DSS_SETPATHS_SIZE: { + /* crosses ILP32/LP64 boundary */ + uint32_t nfs4_dss_bufsize = 0; + + if (copyin(arg, &nfs4_dss_bufsize, sizeof (nfs4_dss_bufsize))) + return (set_errno(EFAULT)); + nfs4_dss_buflen = (long)nfs4_dss_bufsize; + error = 0; + break; + } + + case NFS4_DSS_SETPATHS: { + char *nfs4_dss_bufp; + + /* check that nfssrv module is loaded */ + if (nfs_srv_dss_func == NULL) + return (set_errno(ENOTSUP)); + + /* + * NFS4_DSS_SETPATHS_SIZE must be called before + * NFS4_DSS_SETPATHS, to tell us how big a buffer we need + * to allocate. + */ + if (nfs4_dss_buflen == 0) + return (set_errno(EINVAL)); + nfs4_dss_bufp = kmem_alloc(nfs4_dss_buflen, KM_SLEEP); + if (nfs4_dss_bufp == NULL) + return (set_errno(ENOMEM)); + + if (copyin(arg, nfs4_dss_bufp, nfs4_dss_buflen)) { + kmem_free(nfs4_dss_bufp, nfs4_dss_buflen); + return (set_errno(EFAULT)); + } + + /* unpack the buffer and extract the pathnames */ + error = nfs_srv_dss_func(nfs4_dss_bufp, nfs4_dss_buflen); + kmem_free(nfs4_dss_bufp, nfs4_dss_buflen); + + break; + } + default: error = EINVAL; break; diff --git a/usr/src/uts/common/nfs/nfs.h b/usr/src/uts/common/nfs/nfs.h index 043014ff37..eda293574e 100644 --- a/usr/src/uts/common/nfs/nfs.h +++ b/usr/src/uts/common/nfs/nfs.h @@ -44,6 +44,7 @@ #include <sys/dirent.h> #include <sys/zone.h> #include <sys/tsol/label.h> +#include <sys/nvpair.h> #include <nfs/mount.h> #endif #include <vm/page.h> @@ -933,8 +934,12 @@ extern int nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr, extern void nfs_srv_stop_all(void); extern void nfs_srv_quiesce_all(void); extern void (*nfs_srv_quiesce_func)(void); +extern int rfs4_dss_setpaths(char *, size_t); +extern int (*nfs_srv_dss_func)(char *, size_t); extern time_t rfs4_lease_time; extern time_t rfs4_grace_period; +extern nvlist_t *rfs4_dss_paths, *rfs4_dss_oldpaths; + extern kstat_named_t *global_svstat_ptr[]; diff --git a/usr/src/uts/common/nfs/nfs4.h b/usr/src/uts/common/nfs/nfs4.h index eacbefe747..0950547bc9 100644 --- a/usr/src/uts/common/nfs/nfs4.h +++ b/usr/src/uts/common/nfs/nfs4.h @@ -36,6 +36,7 @@ #ifdef _KERNEL #include <nfs/nfs4_kprot.h> +#include <sys/nvpair.h> #else #include <rpcsvc/nfs4_prot.h> #endif @@ -324,17 +325,46 @@ typedef struct { * * Currently used only for Sun Cluster HA-NFS support, to group clients * on NFS resource failover so each set of clients gets its own dedicated - * grace period. + * grace period and distributed stable storage data. */ typedef struct rfs4_servinst { + int dss_npaths; krwlock_t rwlock; + krwlock_t oldstate_lock; time_t start_time; time_t grace_period; + rfs4_oldstate_t *oldstate; + struct rfs4_dss_path **dss_paths; struct rfs4_servinst *next; struct rfs4_servinst *prev; } rfs4_servinst_t; /* + * DSS: distributed stable storage + */ + +typedef struct rfs4_dss_path { + struct rfs4_dss_path *next; /* for insque/remque */ + struct rfs4_dss_path *prev; /* for insque/remque */ + char *path; + struct rfs4_servinst *sip; + unsigned index; /* offset in servinst's array */ +} rfs4_dss_path_t; + +/* array of paths passed-in from nfsd command-line; stored in nvlist */ +char **rfs4_dss_newpaths; +uint_t rfs4_dss_numnewpaths; + +/* + * Circular doubly-linked list of paths for currently-served RGs. + * No locking required: only changed on warmstart. Managed with insque/remque. + */ +rfs4_dss_path_t *rfs4_dss_pathlist; + +/* nvlists of all DSS paths: current, and before last warmstart */ +nvlist_t *rfs4_dss_paths, *rfs4_dss_oldpaths; + +/* * List declarations (suitable for insque/remque) used to link the * various datastructs listed below. */ @@ -712,12 +742,11 @@ typedef struct rfs4_file { krwlock_t file_rwlock; } rfs4_file_t; -extern int rfs4_servinst_debug; extern int rfs4_seen_first_compound; /* set first time we see one */ extern rfs4_servinst_t *rfs4_cur_servinst; /* current server instance */ extern kmutex_t rfs4_servinst_lock; /* protects linked list */ -extern void rfs4_servinst_create(int); +extern void rfs4_servinst_create(int, int, char **); extern void rfs4_servinst_destroy_all(void); extern void rfs4_servinst_assign(rfs4_client_t *, rfs4_servinst_t *); @@ -728,6 +757,8 @@ extern int rfs4_servinst_grace_new(rfs4_servinst_t *); extern void rfs4_grace_start(rfs4_servinst_t *); extern void rfs4_grace_start_new(void); extern void rfs4_grace_reset_all(void); +extern void rfs4_ss_oldstate(rfs4_oldstate_t *, char *, char *); +extern void rfs4_dss_readstate(int, char **); /* * rfs4_deleg_policy is used to signify the server's global delegation diff --git a/usr/src/uts/common/nfs/nfssys.h b/usr/src/uts/common/nfs/nfssys.h index 4d3794f1a9..931990fcf5 100644 --- a/usr/src/uts/common/nfs/nfssys.h +++ b/usr/src/uts/common/nfs/nfssys.h @@ -50,7 +50,8 @@ enum nfssys_op { OLD_NFS_SVC, OLD_ASYNC_DAEMON, EXPORTFS, OLD_NFS_GETFH, OLD_NFS_CNVT, NFS_REVAUTH, OLD_NFS_FH_TO_FID, OLD_LM_SVC, KILL_LOCKMGR, LOG_FLUSH, SVCPOOL_CREATE, NFS_SVC, LM_SVC, SVCPOOL_WAIT, SVCPOOL_RUN, NFS4_SVC, RDMA_SVC_INIT, NFS4_CLR_STATE, NFS_IDMAP, - NFS_SVC_REQUEST_QUIESCE, NFS_GETFH }; + NFS4_SVC_REQUEST_QUIESCE, NFS_GETFH, NFS4_DSS_SETPATHS, + NFS4_DSS_SETPATHS_SIZE }; struct nfs_svc_args { int fd; /* Connection endpoint */ @@ -294,6 +295,14 @@ struct nfs4_svc_args32 { #define NFS4_SETPORT 2 #define NFS4_DQUERY 4 +/* DSS: distributed stable storage */ +#define NFS4_DSS_STATE_LEAF "v4_state" +#define NFS4_DSS_OLDSTATE_LEAF "v4_oldstate" +#define NFS4_DSS_DIR_MODE 0755 +#define NFS4_DSS_NVPAIR_NAME "dss_pathname_array" +/* default storage dir */ +#define NFS4_DSS_VAR_DIR "/var/nfs" + #ifdef _KERNEL #include <sys/systm.h> /* for rval_t typedef */ |