diff options
Diffstat (limited to 'usr/src/uts/common/fs')
34 files changed, 606 insertions, 276 deletions
diff --git a/usr/src/uts/common/fs/nfs/nfs3_srv.c b/usr/src/uts/common/fs/nfs/nfs3_srv.c index c72f823cd3..4acbe92ad9 100644 --- a/usr/src/uts/common/fs/nfs/nfs3_srv.c +++ b/usr/src/uts/common/fs/nfs/nfs3_srv.c @@ -433,16 +433,25 @@ rfs3_lookup(LOOKUP3args *args, LOOKUP3res *resp, struct exportinfo *exi, goto out1; } + exi_hold(exi); + /* * If the public filehandle is used then allow * a multi-component lookup */ if (PUBLIC_FH3(&args->what.dir)) { + struct exportinfo *new; + publicfh_flag = TRUE; + error = rfs_publicfh_mclookup(name, dvp, cr, &vp, - &exi, &sec); - if (error && exi != NULL) - exi_rele(exi); /* See comment below Re: publicfh_flag */ + &new, &sec); + + if (error == 0) { + exi_rele(exi); + exi = new; + } + /* * Since WebNFS may bypass MOUNT, we need to ensure this * request didn't come from an unlabeled admin_low client. @@ -464,8 +473,6 @@ rfs3_lookup(LOOKUP3args *args, LOOKUP3res *resp, struct exportinfo *exi, if (tp == NULL || tp->tpc_tp.tp_doi != l_admin_low->tsl_doi || tp->tpc_tp.host_type != SUN_CIPSO) { - if (exi != NULL) - exi_rele(exi); VN_RELE(vp); resp->status = NFS3ERR_ACCES; error = 1; @@ -491,8 +498,6 @@ rfs3_lookup(LOOKUP3args *args, LOOKUP3res *resp, struct exportinfo *exi, if (!blequal(&l_admin_low->tsl_label, clabel)) { if (!do_rfs_label_check(clabel, dvp, DOMINANCE_CHECK, exi)) { - if (publicfh_flag && exi != NULL) - exi_rele(exi); VN_RELE(vp); resp->status = NFS3ERR_ACCES; error = 1; @@ -519,18 +524,10 @@ rfs3_lookup(LOOKUP3args *args, LOOKUP3res *resp, struct exportinfo *exi, goto out; } - /* - * If publicfh_flag is true then we have called rfs_publicfh_mclookup - * and have obtained a new exportinfo in exi which needs to be - * released. Note the the original exportinfo pointed to by exi - * will be released by the caller, common_dispatch. - */ - if (publicfh_flag) - exi_rele(exi); - va.va_mask = AT_ALL; vap = rfs4_delegated_getattr(vp, &va, 0, cr) ? NULL : &va; + exi_rele(exi); VN_RELE(vp); resp->status = NFS3_OK; @@ -552,6 +549,12 @@ rfs3_lookup(LOOKUP3args *args, LOOKUP3res *resp, struct exportinfo *exi, return; out: + /* + * The passed argument exportinfo is released by the + * caller, common_dispatch + */ + exi_rele(exi); + if (curthread->t_flag & T_WOULDBLOCK) { curthread->t_flag &= ~T_WOULDBLOCK; resp->status = NFS3ERR_JUKEBOX; diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv.c b/usr/src/uts/common/fs/nfs/nfs4_srv.c index 29a9d67497..f2a9734541 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_srv.c +++ b/usr/src/uts/common/fs/nfs/nfs4_srv.c @@ -21,6 +21,9 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. + */ /* * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. @@ -1131,6 +1134,7 @@ rfs4_op_secinfo(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, char *nm; struct sockaddr *ca; char *name = NULL; + nfsstat4 status = NFS4_OK; DTRACE_NFSV4_2(op__secinfo__start, struct compound_state *, cs, SECINFO4args *, args); @@ -1154,11 +1158,12 @@ rfs4_op_secinfo(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, * do not error out if the component name is a "..". * SECINFO will return its parents secinfo data for SECINFO "..". */ - if (!utf8_dir_verify(utfnm)) { + status = utf8_dir_verify(utfnm); + if (status != NFS4_OK) { if (utfnm->utf8string_len != 2 || utfnm->utf8string_val[0] != '.' || utfnm->utf8string_val[1] != '.') { - *cs->statusp = resp->status = NFS4ERR_INVAL; + *cs->statusp = resp->status = status; goto out; } } @@ -1336,7 +1341,8 @@ rfs4_op_access(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, blequal(clabel, slabel))) resp->access |= (args->access & (ACCESS4_MODIFY | ACCESS4_EXTEND)); - resp->supported |= (ACCESS4_MODIFY | ACCESS4_EXTEND); + resp->supported |= + resp->access & (ACCESS4_MODIFY | ACCESS4_EXTEND); } if (checkwriteperm && @@ -1570,8 +1576,9 @@ rfs4_op_create(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, *cs->statusp = resp->status = NFS4ERR_NOTDIR; goto out; } - if (!utf8_dir_verify(&args->objname)) { - *cs->statusp = resp->status = NFS4ERR_INVAL; + status = utf8_dir_verify(&args->objname); + if (status != NFS4_OK) { + *cs->statusp = resp->status = status; goto out; } @@ -2446,6 +2453,7 @@ rfs4_op_link(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, uint_t len; struct sockaddr *ca; char *name = NULL; + nfsstat4 status; DTRACE_NFSV4_2(op__link__start, struct compound_state *, cs, LINK4args *, args); @@ -2495,8 +2503,9 @@ rfs4_op_link(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, goto out; } - if (!utf8_dir_verify(&args->newname)) { - *cs->statusp = resp->status = NFS4ERR_INVAL; + status = utf8_dir_verify(&args->newname); + if (status != NFS4_OK) { + *cs->statusp = resp->status = status; goto out; } @@ -2886,6 +2895,7 @@ rfs4_op_lookup(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, uint_t len; struct sockaddr *ca; char *name = NULL; + nfsstat4 status; DTRACE_NFSV4_2(op__lookup__start, struct compound_state *, cs, LOOKUP4args *, args); @@ -2905,8 +2915,9 @@ rfs4_op_lookup(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, goto out; } - if (!utf8_dir_verify(&args->objname)) { - *cs->statusp = resp->status = NFS4ERR_INVAL; + status = utf8_dir_verify(&args->objname); + if (status != NFS4_OK) { + *cs->statusp = resp->status = status; goto out; } @@ -3655,30 +3666,6 @@ out: } /* - * A directory entry is a valid nfsv4 entry if - * - it has a non-zero ino - * - it is not a dot or dotdot name - * - it is visible in a pseudo export or in a real export that can - * only have a limited view. - */ -static bool_t -valid_nfs4_entry(struct exportinfo *exi, struct dirent64 *dp, - int *expseudo, int check_visible) -{ - if (dp->d_ino == 0 || NFS_IS_DOTNAME(dp->d_name)) { - *expseudo = 0; - return (FALSE); - } - - if (! check_visible) { - *expseudo = 0; - return (TRUE); - } - - return (nfs_visible_inode(exi, dp->d_ino, expseudo)); -} - -/* * set_rdattr_params sets up the variables used to manage what information * to get for each directory entry. */ @@ -4101,6 +4088,7 @@ rfs4_op_remove(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, bslabel_t *clabel; struct sockaddr *ca; char *name = NULL; + nfsstat4 status; DTRACE_NFSV4_2(op__remove__start, struct compound_state *, cs, REMOVE4args *, args); @@ -4131,8 +4119,9 @@ rfs4_op_remove(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, goto out; } - if (!utf8_dir_verify(&args->target)) { - *cs->statusp = resp->status = NFS4ERR_INVAL; + status = utf8_dir_verify(&args->target); + if (status != NFS4_OK) { + *cs->statusp = resp->status = status; goto out; } @@ -4398,6 +4387,7 @@ rfs4_op_rename(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, struct sockaddr *ca; char *converted_onm = NULL; char *converted_nnm = NULL; + nfsstat4 status; DTRACE_NFSV4_2(op__rename__start, struct compound_state *, cs, RENAME4args *, args); @@ -4454,13 +4444,15 @@ rfs4_op_rename(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, goto out; } - if (!utf8_dir_verify(&args->oldname)) { - *cs->statusp = resp->status = NFS4ERR_INVAL; + status = utf8_dir_verify(&args->oldname); + if (status != NFS4_OK) { + *cs->statusp = resp->status = status; goto out; } - if (!utf8_dir_verify(&args->newname)) { - *cs->statusp = resp->status = NFS4ERR_INVAL; + status = utf8_dir_verify(&args->newname); + if (status != NFS4_OK) { + *cs->statusp = resp->status = status; goto out; } @@ -5789,6 +5781,8 @@ rfs4_compound(COMPOUND4args *args, COMPOUND4res *resp, struct exportinfo *exi, cs.statusp = &resp->status; cs.req = req; + resp->array = NULL; + resp->array_len = 0; /* * XXX for now, minorversion should be zero @@ -5796,14 +5790,17 @@ rfs4_compound(COMPOUND4args *args, COMPOUND4res *resp, struct exportinfo *exi, if (args->minorversion != NFS4_MINORVERSION) { DTRACE_NFSV4_2(compound__start, struct compound_state *, &cs, COMPOUND4args *, args); - resp->array_len = 0; - resp->array = NULL; resp->status = NFS4ERR_MINOR_VERS_MISMATCH; DTRACE_NFSV4_2(compound__done, struct compound_state *, &cs, COMPOUND4res *, resp); return; } + if (args->array_len == 0) { + resp->status = NFS4_OK; + return; + } + ASSERT(exi == NULL); ASSERT(cr == NULL); @@ -6079,8 +6076,9 @@ rfs4_lookup(component4 *component, struct svc_req *req, return (NFS4ERR_NOTDIR); } - if (!utf8_dir_verify(component)) - return (NFS4ERR_INVAL); + status = utf8_dir_verify(component); + if (status != NFS4_OK) + return (status); nm = utf8_to_fn(component, &len, NULL); if (nm == NULL) { @@ -6372,8 +6370,9 @@ rfs4_createfile(OPEN4args *args, struct svc_req *req, struct compound_state *cs, * the including directory on success. */ component = &args->open_claim4_u.file; - if (!utf8_dir_verify(component)) - return (NFS4ERR_INVAL); + status = utf8_dir_verify(component); + if (status != NFS4_OK) + return (status); nm = utf8_to_fn(component, &buflen, NULL); @@ -7594,6 +7593,12 @@ rfs4_op_open_confirm(nfs_argop4 *argop, nfs_resop4 *resop, goto out; } + if (cs->vp->v_type != VREG) { + *cs->statusp = resp->status = + cs->vp->v_type == VDIR ? NFS4ERR_ISDIR : NFS4ERR_INVAL; + return; + } + status = rfs4_get_state(&args->open_stateid, &sp, RFS4_DBS_VALID); if (status != NFS4_OK) { *cs->statusp = resp->status = status; @@ -7709,6 +7714,11 @@ rfs4_op_open_downgrade(nfs_argop4 *argop, nfs_resop4 *resop, goto out; } + if (cs->vp->v_type != VREG) { + *cs->statusp = resp->status = NFS4ERR_INVAL; + return; + } + status = rfs4_get_state(&args->open_stateid, &sp, RFS4_DBS_VALID); if (status != NFS4_OK) { *cs->statusp = resp->status = status; diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c b/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c index dbd3263608..855cd8cd92 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c +++ b/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c @@ -22,6 +22,9 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. + */ #include <sys/systm.h> #include <sys/cmn_err.h> @@ -1585,7 +1588,8 @@ rfs4_fattr4_fs_locations(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg, case NFS4ATTR_GETIT: fsl = fetch_referral(sarg->cs->vp, sarg->cs->cr); if (fsl == NULL) - error = EINVAL; + (void) memset(&(na->fs_locations), 0, + sizeof (fs_locations4)); else { na->fs_locations = *fsl; kmem_free(fsl, sizeof (fs_locations4)); diff --git a/usr/src/uts/common/fs/nfs/nfs4_subr.c b/usr/src/uts/common/fs/nfs/nfs4_subr.c index c14117c009..cfac742707 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_subr.c +++ b/usr/src/uts/common/fs/nfs/nfs4_subr.c @@ -22,6 +22,9 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. + */ /* * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. @@ -713,33 +716,33 @@ utf8_compare(const utf8string *a, const utf8string *b) /* * utf8_dir_verify - checks that the utf8 string is valid */ -int +nfsstat4 utf8_dir_verify(utf8string *str) { char *nm; int len; if (str == NULL) - return (0); + return (NFS4ERR_INVAL); nm = str->utf8string_val; len = str->utf8string_len; if (nm == NULL || len == 0) { - return (0); + return (NFS4ERR_INVAL); } if (len == 1 && nm[0] == '.') - return (0); + return (NFS4ERR_BADNAME); if (len == 2 && nm[0] == '.' && nm[1] == '.') - return (0); + return (NFS4ERR_BADNAME); if (utf8_strchr(str, '/') != NULL) - return (0); + return (NFS4ERR_BADNAME); if (utf8_strchr(str, '\0') != NULL) - return (0); + return (NFS4ERR_BADNAME); - return (1); + return (NFS4_OK); } /* diff --git a/usr/src/uts/common/fs/nfs/nfs_server.c b/usr/src/uts/common/fs/nfs/nfs_server.c index bb625bb175..22d1ad4d68 100644 --- a/usr/src/uts/common/fs/nfs/nfs_server.c +++ b/usr/src/uts/common/fs/nfs/nfs_server.c @@ -22,6 +22,7 @@ * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Bayard G. Bell. All rights reserved. * Copyright (c) 2012 Joyent, Inc. All rights reserved. + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. */ /* @@ -2804,8 +2805,8 @@ rfs_publicfh_mclookup(char *p, vnode_t *dvp, cred_t *cr, vnode_t **vpp, */ /* Release the reference on the old exi value */ - ASSERT(*exi != NULL); exi_rele(*exi); + *exi = NULL; if (error = nfs_check_vpexi(mc_dvp, *vpp, kcred, exi)) { VN_RELE(*vpp); @@ -2818,6 +2819,9 @@ publicfh_done: if (mc_dvp) VN_RELE(mc_dvp); + if (error && *exi != NULL) + exi_rele(*exi); + return (error); } @@ -2963,16 +2967,19 @@ URLparse(char *str) /* * Get the export information for the lookup vnode, and verify its * useable. + * + * Set @exip only in success */ int nfs_check_vpexi(vnode_t *mc_dvp, vnode_t *vp, cred_t *cr, - struct exportinfo **exi) + struct exportinfo **exip) { int walk; int error = 0; + struct exportinfo *exi; - *exi = nfs_vptoexi(mc_dvp, vp, cr, &walk, NULL, FALSE); - if (*exi == NULL) + exi = nfs_vptoexi(mc_dvp, vp, cr, &walk, NULL, FALSE); + if (exi == NULL) error = EACCES; else { /* @@ -2981,10 +2988,13 @@ nfs_check_vpexi(vnode_t *mc_dvp, vnode_t *vp, cred_t *cr, * must not terminate below the * exported directory. */ - if ((*exi)->exi_export.ex_flags & EX_NOSUB && walk > 0) + if (exi->exi_export.ex_flags & EX_NOSUB && walk > 0) { error = EACCES; + exi_rele(exi); + } } - + if (error == 0) + *exip = exi; return (error); } diff --git a/usr/src/uts/common/fs/nfs/nfs_srv.c b/usr/src/uts/common/fs/nfs/nfs_srv.c index 8ca8ee5d1d..f0cd9633aa 100644 --- a/usr/src/uts/common/fs/nfs/nfs_srv.c +++ b/usr/src/uts/common/fs/nfs/nfs_srv.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. */ /* @@ -399,6 +400,8 @@ rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr, return; } + exi_hold(exi); + /* * If the public filehandle is used then allow * a multi-component lookup, i.e. evaluate @@ -409,9 +412,16 @@ rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr, * which is OK as long as the filesystem is exported. */ if (PUBLIC_FH2(fhp)) { + struct exportinfo *new; + publicfh_flag = TRUE; - error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi, + error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &new, &sec); + + if (error == 0) { + exi_rele(exi); + exi = new; + } } else { /* * Do a normal single component lookup. @@ -452,13 +462,10 @@ rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr, VN_RELE(dvp); /* - * If publicfh_flag is true then we have called rfs_publicfh_mclookup - * and have obtained a new exportinfo in exi which needs to be - * released. Note the the original exportinfo pointed to by exi - * will be released by the caller, comon_dispatch. + * The passed argument exportinfo is released by the + * caller, comon_dispatch */ - if (publicfh_flag && exi != NULL) - exi_rele(exi); + exi_rele(exi); /* * If it's public fh, no 0x81, and client's flavor is diff --git a/usr/src/uts/common/fs/smbsrv/smb_delete.c b/usr/src/uts/common/fs/smbsrv/smb_delete.c index 43f6d733bd..8a27b7408e 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_delete.c +++ b/usr/src/uts/common/fs/smbsrv/smb_delete.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. */ #include <smbsrv/smb_kproto.h> @@ -553,7 +554,7 @@ smb_delete_check_path(smb_request_t *sr) /* fname component is, or resolves to, '.' (dot) */ if ((strcmp(pn->pn_fname, ".") == 0) || (SMB_SEARCH_DIRECTORY(fqi->fq_sattr) && - (smb_match(pn->pn_fname, ".")))) { + (smb_match(pn->pn_fname, ".", B_FALSE)))) { smbsr_error(sr, NT_STATUS_OBJECT_NAME_INVALID, ERRDOS, ERROR_INVALID_NAME); return (-1); diff --git a/usr/src/uts/common/fs/smbsrv/smb_kutil.c b/usr/src/uts/common/fs/smbsrv/smb_kutil.c index 5d45081e2e..aed58277be 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_kutil.c +++ b/usr/src/uts/common/fs/smbsrv/smb_kutil.c @@ -18,8 +18,10 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. */ #include <sys/param.h> @@ -98,116 +100,34 @@ smb_ascii_or_unicode_null_len(struct smb_request *sr) } /* - * Return B_TRUE if pattern contains wildcards - */ -boolean_t -smb_contains_wildcards(const char *pattern) -{ - static const char *wildcards = "*?"; - - return (strpbrk(pattern, wildcards) != NULL); -} - -/* - * When converting wildcards a '.' in a name is treated as a base and - * extension separator even if the name is longer than 8.3. - * - * The '*' character matches an entire part of the name. For example, - * "*.abc" matches any name with an extension of "abc". * - * The '?' character matches a single character. - * If the base contains all ? (8 or more) then it is treated as *. - * If the extension contains all ? (3 or more) then it is treated as *. - * - * Clients convert ASCII wildcards to Unicode wildcards as follows: + * Convert old-style (DOS, LanMan) wildcard strings to NT style. + * This should ONLY happen to patterns that come from old clients, + * meaning dialect LANMAN2_1 etc. (dialect < NT_LM_0_12). * * ? is converted to > - * . is converted to " if it is followed by ? or * * * is converted to < if it is followed by . + * . is converted to " if it is followed by ? or * or end of pattern * - * Note that clients convert "*." to '< and drop the '.' but "*.txt" - * is sent as "<.TXT", i.e. - * - * dir *. -> dir < - * dir *.txt -> dir <.TXT - * - * Since " and < are illegal in Windows file names, we always convert - * these Unicode wildcards without checking the following character. + * Note: modifies pattern in place. */ void smb_convert_wildcards(char *pattern) { - static char *match_all[] = { - "*.", - "*.*" - }; - char *extension; char *p; - int len; - int i; - /* - * Special case "<" for "dir *.", and fast-track for "*". - */ - if ((*pattern == '<') || (*pattern == '*')) { - if (*(pattern + 1) == '\0') { - *pattern = '*'; - return; - } - } - - for (p = pattern; *p != '\0'; ++p) { + for (p = pattern; *p != '\0'; p++) { switch (*p) { - case '<': - *p = '*'; - break; - case '>': - *p = '?'; + case '?': + *p = '>'; break; - case '\"': - *p = '.'; + case '*': + if (p[1] == '.') + *p = '<'; break; - default: - break; - } - } - - /* - * Replace "????????.ext" with "*.ext". - */ - p = pattern; - p += strspn(p, "?"); - if (*p == '.') { - *p = '\0'; - len = strlen(pattern); - *p = '.'; - if (len >= SMB_NAME83_BASELEN) { - *pattern = '*'; - (void) strlcpy(pattern + 1, p, MAXPATHLEN - 1); - } - } - - /* - * Replace "base.???" with 'base.*'. - */ - if ((extension = strrchr(pattern, '.')) != NULL) { - p = ++extension; - p += strspn(p, "?"); - if (*p == '\0') { - len = strlen(extension); - if (len >= SMB_NAME83_EXTLEN) { - *extension = '\0'; - (void) strlcat(pattern, "*", MAXPATHLEN); - } - } - } - - /* - * Replace anything that matches an entry in match_all with "*". - */ - for (i = 0; i < sizeof (match_all) / sizeof (match_all[0]); ++i) { - if (strcmp(pattern, match_all[i]) == 0) { - (void) strlcpy(pattern, "*", MAXPATHLEN); + case '.': + if (p[1] == '?' || p[1] == '*' || p[1] == '\0') + *p = '\"'; break; } } diff --git a/usr/src/uts/common/fs/smbsrv/smb_odir.c b/usr/src/uts/common/fs/smbsrv/smb_odir.c index ea9b505f0d..610126753b 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_odir.c +++ b/usr/src/uts/common/fs/smbsrv/smb_odir.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. */ /* @@ -286,7 +287,8 @@ smb_odir_open(smb_request_t *sr, char *path, uint16_t sattr, uint32_t flags) tree = sr->tid_tree; - smb_convert_wildcards(path); + if (sr->session->dialect < NT_LM_0_12) + smb_convert_wildcards(path); rc = smb_pathname_reduce(sr, sr->user_cr, path, tree->t_snode, tree->t_snode, &dnode, pattern); @@ -1278,22 +1280,23 @@ smb_odir_lookup_link(smb_request_t *sr, smb_odir_t *od, * - If shortnames are supported, generate the shortname from * odirent->od_name and check if it matches od->d_pattern. */ -boolean_t +static boolean_t smb_odir_match_name(smb_odir_t *od, smb_odirent_t *odirent) { char *name = odirent->od_name; char shortname[SMB_SHORTNAMELEN]; ino64_t ino = odirent->od_ino; + boolean_t ci = (od->d_flags & SMB_ODIR_FLAG_IGNORE_CASE) != 0; if (smb_is_reserved_dos_name(name)) return (B_FALSE); - if (smb_match_ci(od->d_pattern, name)) + if (smb_match(od->d_pattern, name, ci)) return (B_TRUE); if (od->d_flags & SMB_ODIR_FLAG_SHORTNAMES) { smb_mangle(name, ino, shortname, SMB_SHORTNAMELEN); - if (smb_match_ci(od->d_pattern, shortname)) + if (smb_match(od->d_pattern, shortname, ci)) return (B_TRUE); } diff --git a/usr/src/uts/common/fs/smbsrv/smb_pathname.c b/usr/src/uts/common/fs/smbsrv/smb_pathname.c index e3ae3ffba2..db9883667e 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_pathname.c +++ b/usr/src/uts/common/fs/smbsrv/smb_pathname.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. */ #include <smbsrv/smb_kproto.h> @@ -732,8 +733,8 @@ smb_pathname_preprocess(smb_request_t *sr, smb_pathname_t *pn) return; } - /* perform unicode wildcard conversion */ - smb_convert_wildcards(pn->pn_path); + if (sr->session->dialect < NT_LM_0_12) + smb_convert_wildcards(pn->pn_path); /* treat '/' as '\\' */ (void) strsubst(pn->pn_path, '/', '\\'); diff --git a/usr/src/uts/common/fs/vfs.c b/usr/src/uts/common/fs/vfs.c index abe3a23e75..8d5c741428 100644 --- a/usr/src/uts/common/fs/vfs.c +++ b/usr/src/uts/common/fs/vfs.c @@ -936,29 +936,33 @@ vfs_mountroot(void) } #endif /* __sparc */ - /* - * Look up the root device via devfs so that a dv_node is - * created for it. The vnode is never VN_RELE()ed. - * We allocate more than MAXPATHLEN so that the - * buffer passed to i_ddi_prompath_to_devfspath() is - * exactly MAXPATHLEN (the function expects a buffer - * of that length). - */ - plen = strlen("/devices"); - path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP); - (void) strcpy(path, "/devices"); + if (strcmp(rootfs.bo_fstype, "zfs") != 0) { + /* + * Look up the root device via devfs so that a dv_node is + * created for it. The vnode is never VN_RELE()ed. + * We allocate more than MAXPATHLEN so that the + * buffer passed to i_ddi_prompath_to_devfspath() is + * exactly MAXPATHLEN (the function expects a buffer + * of that length). + */ + plen = strlen("/devices"); + path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP); + (void) strcpy(path, "/devices"); - if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen) - != DDI_SUCCESS || - lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) { + if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen) + != DDI_SUCCESS || + lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) { - /* NUL terminate in case "path" has garbage */ - path[plen + MAXPATHLEN - 1] = '\0'; + /* NUL terminate in case "path" has garbage */ + path[plen + MAXPATHLEN - 1] = '\0'; #ifdef DEBUG - cmn_err(CE_WARN, "!Cannot lookup root device: %s", path); + cmn_err(CE_WARN, "!Cannot lookup root device: %s", + path); #endif + } + kmem_free(path, plen + MAXPATHLEN); } - kmem_free(path, plen + MAXPATHLEN); + vfs_mnttabvp_setup(); } diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 5caabf8260..d8e9f26bdb 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -190,6 +190,7 @@ uint64_t zfs_arc_meta_limit = 0; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; +int zfs_disable_dup_eviction = 0; /* * Note that buffers can be in one of 6 states: @@ -292,6 +293,9 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_size; kstat_named_t arcstat_l2_hdr_size; kstat_named_t arcstat_memory_throttle_count; + kstat_named_t arcstat_duplicate_buffers; + kstat_named_t arcstat_duplicate_buffers_size; + kstat_named_t arcstat_duplicate_reads; } arc_stats_t; static arc_stats_t arc_stats = { @@ -347,7 +351,10 @@ static arc_stats_t arc_stats = { { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, - { "memory_throttle_count", KSTAT_DATA_UINT64 } + { "memory_throttle_count", KSTAT_DATA_UINT64 }, + { "duplicate_buffers", KSTAT_DATA_UINT64 }, + { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, + { "duplicate_reads", KSTAT_DATA_UINT64 } }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) @@ -1362,6 +1369,17 @@ arc_buf_clone(arc_buf_t *from) hdr->b_buf = buf; arc_get_data_buf(buf); bcopy(from->b_data, buf->b_data, size); + + /* + * This buffer already exists in the arc so create a duplicate + * copy for the caller. If the buffer is associated with user data + * then track the size and number of duplicates. These stats will be + * updated as duplicate buffers are created and destroyed. + */ + if (hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMP(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); + } hdr->b_datacnt += 1; return (buf); } @@ -1460,6 +1478,16 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) ASSERT3U(state->arcs_size, >=, size); atomic_add_64(&state->arcs_size, -size); buf->b_data = NULL; + + /* + * If we're destroying a duplicate buffer make sure + * that the appropriate statistics are updated. + */ + if (buf->b_hdr->b_datacnt > 1 && + buf->b_hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); + } ASSERT(buf->b_hdr->b_datacnt > 0); buf->b_hdr->b_datacnt -= 1; } @@ -1644,6 +1672,48 @@ arc_buf_size(arc_buf_t *buf) } /* + * Called from the DMU to determine if the current buffer should be + * evicted. In order to ensure proper locking, the eviction must be initiated + * from the DMU. Return true if the buffer is associated with user data and + * duplicate buffers still exist. + */ +boolean_t +arc_buf_eviction_needed(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr; + boolean_t evict_needed = B_FALSE; + + if (zfs_disable_dup_eviction) + return (B_FALSE); + + mutex_enter(&buf->b_evict_lock); + hdr = buf->b_hdr; + if (hdr == NULL) { + /* + * We are in arc_do_user_evicts(); let that function + * perform the eviction. + */ + ASSERT(buf->b_data == NULL); + mutex_exit(&buf->b_evict_lock); + return (B_FALSE); + } else if (buf->b_data == NULL) { + /* + * We have already been added to the arc eviction list; + * recommend eviction. + */ + ASSERT3P(hdr, ==, &arc_eviction_hdr); + mutex_exit(&buf->b_evict_lock); + return (B_TRUE); + } + + if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA) + evict_needed = B_TRUE; + + mutex_exit(&buf->b_evict_lock); + return (evict_needed); +} + +/* * Evict buffers from list until we've removed the specified number of * bytes. Move the removed buffers to the appropriate evict state. * If the recycle flag is set, then attempt to "recycle" a buffer: @@ -2638,8 +2708,10 @@ arc_read_done(zio_t *zio) abuf = buf; for (acb = callback_list; acb; acb = acb->acb_next) { if (acb->acb_done) { - if (abuf == NULL) + if (abuf == NULL) { + ARCSTAT_BUMP(arcstat_duplicate_reads); abuf = arc_buf_clone(buf); + } acb->acb_buf = abuf; abuf = NULL; } @@ -3186,6 +3258,16 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT3U(*size, >=, hdr->b_size); atomic_add_64(size, -hdr->b_size); } + + /* + * We're releasing a duplicate user data buffer, update + * our statistics accordingly. + */ + if (hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, + -hdr->b_size); + } hdr->b_datacnt -= 1; arc_cksum_verify(buf); arc_buf_unwatch(buf); diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index 437e0ac85c..e8bf55c321 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -2089,7 +2089,24 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) dbuf_evict(db); } else { VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); - if (!DBUF_IS_CACHEABLE(db)) + + /* + * A dbuf will be eligible for eviction if either the + * 'primarycache' property is set or a duplicate + * copy of this buffer is already cached in the arc. + * + * In the case of the 'primarycache' a buffer + * is considered for eviction if it matches the + * criteria set in the property. + * + * To decide if our buffer is considered a + * duplicate, we must call into the arc to determine + * if multiple buffers are referencing the same + * block on-disk. If so, then we simply evict + * ourselves. + */ + if (!DBUF_IS_CACHEABLE(db) || + arc_buf_eviction_needed(db->db_buf)) dbuf_clear(db); else mutex_exit(&db->db_mtx); diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c index 190b26e5bf..a9308b0c08 100644 --- a/usr/src/uts/common/fs/zfs/dmu_tx.c +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c @@ -574,7 +574,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) (dn->dn_indblkshift - SPA_BLKPTRSHIFT); while (level++ < maxlevel) { - txh->txh_memory_tohold += MIN(blkcnt, (nl1blks >> epbs)) + txh->txh_memory_tohold += MAX(MIN(blkcnt, nl1blks), 1) << dn->dn_indblkshift; blkcnt = 1 + (blkcnt >> epbs); } diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index d9cd70f1c8..968fbd80d6 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -5983,6 +5983,10 @@ spa_sync(spa_t *spa, uint64_t txg) tx = dmu_tx_create_assigned(dp, txg); + spa->spa_sync_starttime = gethrtime(); + VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, + spa->spa_sync_starttime + spa->spa_deadman_synctime)); + /* * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, * set spa_deflate if we have no raid-z vdevs. @@ -6111,6 +6115,8 @@ spa_sync(spa_t *spa, uint64_t txg) } dmu_tx_commit(tx); + VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); + /* * Clear the dirty config list. */ diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index 30681b6464..a254c8d656 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -26,6 +26,7 @@ #include <sys/zfs_context.h> #include <sys/spa_impl.h> +#include <sys/spa_boot.h> #include <sys/zio.h> #include <sys/zio_checksum.h> #include <sys/zio_compress.h> @@ -249,6 +250,26 @@ int zfs_flags = 0; */ int zfs_recover = 0; +extern int zfs_txg_synctime_ms; + +/* + * Expiration time in units of zfs_txg_synctime_ms. This value has two + * meanings. First it is used to determine when the spa_deadman logic + * should fire. By default the spa_deadman will fire if spa_sync has + * not completed in 1000 * zfs_txg_synctime_ms (i.e. 1000 seconds). + * Secondly, the value determines if an I/O is considered "hung". + * Any I/O that has not completed in zfs_deadman_synctime is considered + * "hung" resulting in a system panic. + * 1000 zfs_txg_synctime_ms (i.e. 1000 seconds). + */ +uint64_t zfs_deadman_synctime = 1000ULL; + +/* + * Override the zfs deadman behavior via /etc/system. By default the + * deadman is enabled except on VMware and sparc deployments. + */ +int zfs_deadman_enabled = -1; + /* * ========================================================================== @@ -418,6 +439,23 @@ spa_lookup(const char *name) } /* + * Fires when spa_sync has not completed within zfs_deadman_synctime_ms. + * If the zfs_deadman_enabled flag is set then it inspects all vdev queues + * looking for potentially hung I/Os. + */ +void +spa_deadman(void *arg) +{ + spa_t *spa = arg; + + zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu", + (gethrtime() - spa->spa_sync_starttime) / NANOSEC, + ++spa->spa_deadman_calls); + if (zfs_deadman_enabled) + vdev_deadman(spa->spa_root_vdev); +} + +/* * Create an uninitialized spa_t with the given name. Requires * spa_namespace_lock. The caller must ensure that the spa_t doesn't already * exist by calling spa_lookup() first. @@ -427,6 +465,8 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) { spa_t *spa; spa_config_dirent_t *dp; + cyc_handler_t hdlr; + cyc_time_t when; ASSERT(MUTEX_HELD(&spa_namespace_lock)); @@ -458,6 +498,25 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_proc = &p0; spa->spa_proc_state = SPA_PROC_NONE; + hdlr.cyh_func = spa_deadman; + hdlr.cyh_arg = spa; + hdlr.cyh_level = CY_LOW_LEVEL; + + spa->spa_deadman_synctime = zfs_deadman_synctime * + zfs_txg_synctime_ms * MICROSEC; + + /* + * This determines how often we need to check for hung I/Os after + * the cyclic has already fired. Since checking for hung I/Os is + * an expensive operation we don't want to check too frequently. + * Instead wait for 5 synctimes before checking again. + */ + when.cyt_interval = 5ULL * zfs_txg_synctime_ms * MICROSEC; + when.cyt_when = CY_INFINITY; + mutex_enter(&cpu_lock); + spa->spa_deadman_cycid = cyclic_add(&hdlr, &when); + mutex_exit(&cpu_lock); + refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); @@ -540,6 +599,12 @@ spa_remove(spa_t *spa) nvlist_free(spa->spa_load_info); spa_config_set(spa, NULL); + mutex_enter(&cpu_lock); + if (spa->spa_deadman_cycid != CYCLIC_NONE) + cyclic_remove(spa->spa_deadman_cycid); + mutex_exit(&cpu_lock); + spa->spa_deadman_cycid = CYCLIC_NONE; + refcount_destroy(&spa->spa_refcount); spa_config_lock_destroy(spa); @@ -1507,6 +1572,12 @@ spa_prev_software_version(spa_t *spa) } uint64_t +spa_deadman_synctime(spa_t *spa) +{ + return (spa->spa_deadman_synctime); +} + +uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva) { uint64_t asize = DVA_GET_ASIZE(dva); @@ -1600,7 +1671,9 @@ spa_init(int mode) spa_mode_global = mode; -#ifndef _KERNEL +#ifdef _KERNEL + spa_arch_init(); +#else if (spa_mode_global != FREAD && dprintf_find_string("watch")) { arc_procfd = open("/proc/self/ctl", O_WRONLY); if (arc_procfd == -1) { diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h index 28dbc57275..b109dcafbc 100644 --- a/usr/src/uts/common/fs/zfs/sys/arc.h +++ b/usr/src/uts/common/fs/zfs/sys/arc.h @@ -99,6 +99,7 @@ int arc_released(arc_buf_t *buf); int arc_has_callback(arc_buf_t *buf); void arc_buf_freeze(arc_buf_t *buf); void arc_buf_thaw(arc_buf_t *buf); +boolean_t arc_buf_eviction_needed(arc_buf_t *buf); #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf); #endif diff --git a/usr/src/uts/common/fs/zfs/sys/sa_impl.h b/usr/src/uts/common/fs/zfs/sys/sa_impl.h index 6661e47cfc..8ae05ce364 100644 --- a/usr/src/uts/common/fs/zfs/sys/sa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/sa_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_SA_IMPL_H @@ -181,7 +182,7 @@ typedef struct sa_hdr_phys { */ #define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10) -#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 16, 3, 0) +#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 6, 3, 0) #define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \ { \ BF32_SET_SB(x, 10, 6, 3, 0, size); \ diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index 1043f4038a..172a9f141e 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -604,6 +604,7 @@ extern boolean_t spa_suspended(spa_t *spa); extern uint64_t spa_bootfs(spa_t *spa); extern uint64_t spa_delegation(spa_t *spa); extern objset_t *spa_meta_objset(spa_t *spa); +extern uint64_t spa_deadman_synctime(spa_t *spa); /* Miscellaneous support routines */ extern void spa_activate_mos_feature(spa_t *spa, const char *feature); diff --git a/usr/src/uts/common/fs/zfs/sys/spa_boot.h b/usr/src/uts/common/fs/zfs/sys/spa_boot.h index 1d3622f5a1..8df5072a55 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_boot.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_boot.h @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + #ifndef _SYS_SPA_BOOT_H #define _SYS_SPA_BOOT_H @@ -35,6 +39,8 @@ extern "C" { extern char *spa_get_bootprop(char *prop); extern void spa_free_bootprop(char *prop); +extern void spa_arch_init(void); + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h index 027832e858..42ce5556d3 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h @@ -227,6 +227,10 @@ struct spa { uint64_t spa_feat_for_write_obj; /* required to write to pool */ uint64_t spa_feat_for_read_obj; /* required to read from pool */ uint64_t spa_feat_desc_obj; /* Feature descriptions */ + cyclic_id_t spa_deadman_cycid; /* cyclic id */ + uint64_t spa_deadman_calls; /* number of deadman calls */ + uint64_t spa_sync_starttime; /* starting time fo spa_sync */ + uint64_t spa_deadman_synctime; /* deadman expiration timer */ /* * spa_refcnt & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h index 7e34889b61..5a7836612b 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h @@ -79,6 +79,7 @@ extern void vdev_metaslab_fini(vdev_t *vd); extern void vdev_metaslab_set_size(vdev_t *); extern void vdev_expand(vdev_t *vd, uint64_t txg); extern void vdev_split(vdev_t *vd); +extern void vdev_deadman(vdev_t *vd); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index c772d954bb..e4c02bde1d 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -105,6 +105,8 @@ struct vdev_queue { avl_tree_t vq_write_tree; avl_tree_t vq_pending_tree; zoneid_t vq_last_zone_id; + uint64_t vq_io_complete_ts; + uint64_t vq_io_delta_ts; kmutex_t vq_lock; }; @@ -321,6 +323,14 @@ extern void vdev_set_min_asize(vdev_t *vd); */ extern int zfs_vdev_cache_size; +/* + * The vdev_buf_t is used to translate between zio_t and buf_t, and back again. + */ +typedef struct vdev_buf { + buf_t vb_buf; /* buffer that describes the io */ + zio_t *vb_io; /* pointer back to the original zio_t */ +} vdev_buf_t; + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_context.h b/usr/src/uts/common/fs/zfs/sys/zfs_context.h index fdd0412fee..0dc8d8859c 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_context.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_context.h @@ -22,8 +22,10 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ + /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_ZFS_CONTEXT_H @@ -67,6 +69,7 @@ extern "C" { #include <sys/sysevent/dev.h> #include <sys/fm/util.h> #include <sys/sunddi.h> +#include <sys/cyclic.h> #define CPU_SEQID (CPU->cpu_seqid) diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h index 4d781ad2a4..86e901be0d 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -240,12 +240,24 @@ typedef struct zinject_record { uint32_t zi_iotype; int32_t zi_duration; uint64_t zi_timer; + uint32_t zi_cmd; + uint32_t zi_pad; } zinject_record_t; #define ZINJECT_NULL 0x1 #define ZINJECT_FLUSH_ARC 0x2 #define ZINJECT_UNLOAD_SPA 0x4 +typedef enum zinject_type { + ZINJECT_UNINITIALIZED, + ZINJECT_DATA_FAULT, + ZINJECT_DEVICE_FAULT, + ZINJECT_LABEL_FAULT, + ZINJECT_IGNORED_WRITES, + ZINJECT_PANIC, + ZINJECT_DELAY_IO, +} zinject_type_t; + typedef struct zfs_share { uint64_t z_exportdata; uint64_t z_sharedata; diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index ce3a983d9f..9c718f691a 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -21,8 +21,6 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - */ -/* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2011 Joyent, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. @@ -406,6 +404,7 @@ struct zio { uint64_t io_offset; uint64_t io_deadline; + uint64_t io_timestamp; avl_node_t io_offset_node; avl_node_t io_deadline_node; avl_tree_t *io_vdev_tree; @@ -554,6 +553,7 @@ extern int zio_handle_fault_injection(zio_t *zio, int error); extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); extern int zio_handle_label_injection(zio_t *zio, int error); extern void zio_handle_ignored_writes(zio_t *zio); +extern uint64_t zio_handle_io_delay(zio_t *zio); /* * Checksum ereport functions diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index fa0a579e66..18180ecad3 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -3153,3 +3153,41 @@ vdev_split(vdev_t *vd) } vdev_propagate_state(cvd); } + +void +vdev_deadman(vdev_t *vd) +{ + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + vdev_deadman(cvd); + } + + if (vd->vdev_ops->vdev_op_leaf) { + vdev_queue_t *vq = &vd->vdev_queue; + + mutex_enter(&vq->vq_lock); + if (avl_numnodes(&vq->vq_pending_tree) > 0) { + spa_t *spa = vd->vdev_spa; + zio_t *fio; + uint64_t delta; + + /* + * Look at the head of all the pending queues, + * if any I/O has been outstanding for longer than + * the spa_deadman_synctime we panic the system. + */ + fio = avl_first(&vq->vq_pending_tree); + delta = ddi_get_lbolt64() - fio->io_timestamp; + if (delta > NSEC_TO_TICK(spa_deadman_synctime(spa))) { + zfs_dbgmsg("SLOW IO: zio timestamp %llu, " + "delta %llu, last io %llu", + fio->io_timestamp, delta, + vq->vq_io_complete_ts); + fm_panic("I/O to pool '%s' appears to be " + "hung.", spa_name(spa)); + } + } + mutex_exit(&vq->vq_lock); + } +} diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index 1ba343226f..dfadeca9d4 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -42,11 +42,6 @@ extern ldi_ident_t zfs_li; -typedef struct vdev_disk_buf { - buf_t vdb_buf; - zio_t *vdb_io; -} vdev_disk_buf_t; - static void vdev_disk_hold(vdev_t *vd) { @@ -170,7 +165,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, /* * When opening a disk device, we want to preserve the user's original * intent. We always want to open the device by the path the user gave - * us, even if it is one of multiple paths to the save device. But we + * us, even if it is one of multiple paths to the same device. But we * also want to be able to survive disks being removed/recabled. * Therefore the sequence of opening devices is: * @@ -416,8 +411,8 @@ vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data, static void vdev_disk_io_intr(buf_t *bp) { - vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp; - zio_t *zio = vdb->vdb_io; + vdev_buf_t *vb = (vdev_buf_t *)bp; + zio_t *zio = vb->vb_io; /* * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO. @@ -429,7 +424,7 @@ vdev_disk_io_intr(buf_t *bp) if (zio->io_error == 0 && bp->b_resid != 0) zio->io_error = EIO; - kmem_free(vdb, sizeof (vdev_disk_buf_t)); + kmem_free(vb, sizeof (vdev_buf_t)); zio_interrupt(zio); } @@ -460,7 +455,7 @@ vdev_disk_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_disk_t *dvd = vd->vdev_tsd; - vdev_disk_buf_t *vdb; + vdev_buf_t *vb; struct dk_callback *dkc; buf_t *bp; int error; @@ -524,10 +519,10 @@ vdev_disk_io_start(zio_t *zio) return (ZIO_PIPELINE_CONTINUE); } - vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP); + vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP); - vdb->vdb_io = zio; - bp = &vdb->vdb_buf; + vb->vb_io = zio; + bp = &vb->vb_buf; bioinit(bp); bp->b_flags = B_BUSY | B_NOCACHE | diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c index 043fa51294..1fbce5e542 100644 --- a/usr/src/uts/common/fs/zfs/vdev_file.c +++ b/usr/src/uts/common/fs/zfs/vdev_file.c @@ -25,6 +25,7 @@ #include <sys/zfs_context.h> #include <sys/spa.h> +#include <sys/spa_impl.h> #include <sys/vdev_file.h> #include <sys/vdev_impl.h> #include <sys/zio.h> @@ -140,12 +141,55 @@ vdev_file_close(vdev_t *vd) vd->vdev_tsd = NULL; } +/* + * Implements the interrupt side for file vdev types. This routine will be + * called when the I/O completes allowing us to transfer the I/O to the + * interrupt taskqs. For consistency, the code structure mimics disk vdev + * types. + */ +static void +vdev_file_io_intr(buf_t *bp) +{ + vdev_buf_t *vb = (vdev_buf_t *)bp; + zio_t *zio = vb->vb_io; + + zio->io_error = (geterror(bp) != 0 ? EIO : 0); + if (zio->io_error == 0 && bp->b_resid != 0) + zio->io_error = ENOSPC; + + kmem_free(vb, sizeof (vdev_buf_t)); + zio_interrupt(zio); +} + +static void +vdev_file_io_strategy(void *arg) +{ + buf_t *bp = arg; + vnode_t *vp = bp->b_private; + ssize_t resid; + int error; + + error = vn_rdwr((bp->b_flags & B_READ) ? UIO_READ : UIO_WRITE, + vp, bp->b_un.b_addr, bp->b_bcount, ldbtob(bp->b_lblkno), + UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + + if (error == 0) { + bp->b_resid = resid; + biodone(bp); + } else { + bioerror(bp, error); + biodone(bp); + } +} + static int vdev_file_io_start(zio_t *zio) { + spa_t *spa = zio->io_spa; vdev_t *vd = zio->io_vd; vdev_file_t *vf = vd->vdev_tsd; - ssize_t resid; + vdev_buf_t *vb; + buf_t *bp; if (zio->io_type == ZIO_TYPE_IOCTL) { /* XXPOLICY */ @@ -166,15 +210,22 @@ vdev_file_io_start(zio_t *zio) return (ZIO_PIPELINE_CONTINUE); } - zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? - UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data, - zio->io_size, zio->io_offset, UIO_SYSSPACE, - 0, RLIM64_INFINITY, kcred, &resid); + vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP); - if (resid != 0 && zio->io_error == 0) - zio->io_error = ENOSPC; + vb->vb_io = zio; + bp = &vb->vb_buf; - zio_interrupt(zio); + bioinit(bp); + bp->b_flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); + bp->b_bcount = zio->io_size; + bp->b_un.b_addr = zio->io_data; + bp->b_lblkno = lbtodb(zio->io_offset); + bp->b_bufsize = zio->io_size; + bp->b_private = vf->vf_vnode; + bp->b_iodone = (int (*)())vdev_file_io_intr; + + taskq_dispatch_ent(spa->spa_zio_taskq[ZIO_TYPE_FREE][ZIO_TASKQ_ISSUE], + vdev_file_io_strategy, bp, 0, &zio->io_tqent); return (ZIO_PIPELINE_STOP); } diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index 4ea958a9f6..8dec283fee 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -24,6 +24,10 @@ * Copyright (c) 2011, Joyent, Inc. All rights reserved. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + #include <sys/zfs_context.h> #include <sys/vdev_impl.h> #include <sys/zio.h> @@ -298,6 +302,7 @@ again: zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); + aio->io_timestamp = fio->io_timestamp; nio = fio; do { @@ -369,7 +374,8 @@ vdev_queue_io(zio_t *zio) mutex_enter(&vq->vq_lock); - zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) + + zio->io_timestamp = ddi_get_lbolt64(); + zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) + zio->io_priority; vdev_queue_io_add(vq, zio); @@ -394,10 +400,16 @@ vdev_queue_io_done(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; + if (zio_injection_enabled) + delay(SEC_TO_TICK(zio_handle_io_delay(zio))); + mutex_enter(&vq->vq_lock); avl_remove(&vq->vq_pending_tree, zio); + vq->vq_io_complete_ts = ddi_get_lbolt64(); + vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp; + for (int i = 0; i < zfs_vdev_ramp_rate; i++) { zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending); if (nio == NULL) diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c index 2292f658b3..c7bfbbaec4 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -50,6 +50,7 @@ #include <sys/spa.h> #include <sys/zap.h> #include <sys/sa.h> +#include <sys/sa_impl.h> #include <sys/varargs.h> #include <sys/policy.h> #include <sys/atomic.h> @@ -64,7 +65,6 @@ #include <sys/dnlc.h> #include <sys/dmu_objset.h> #include <sys/spa_boot.h> -#include <sys/sa.h> #include "zfs_comutil.h" int zfsfstype; @@ -578,7 +578,6 @@ static int zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, uint64_t *userp, uint64_t *groupp) { - znode_phys_t *znp = data; int error = 0; /* @@ -597,20 +596,18 @@ zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, return (EEXIST); if (bonustype == DMU_OT_ZNODE) { + znode_phys_t *znp = data; *userp = znp->zp_uid; *groupp = znp->zp_gid; } else { int hdrsize; + sa_hdr_phys_t *sap = data; + sa_hdr_phys_t sa = *sap; + boolean_t swap = B_FALSE; ASSERT(bonustype == DMU_OT_SA); - hdrsize = sa_hdrsize(data); - if (hdrsize != 0) { - *userp = *((uint64_t *)((uintptr_t)data + hdrsize + - SA_UID_OFFSET)); - *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + - SA_GID_OFFSET)); - } else { + if (sa.sa_magic == 0) { /* * This should only happen for newly created * files that haven't had the znode data filled @@ -618,6 +615,25 @@ zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, */ *userp = 0; *groupp = 0; + return (0); + } + if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { + sa.sa_magic = SA_MAGIC; + sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); + swap = B_TRUE; + } else { + VERIFY3U(sa.sa_magic, ==, SA_MAGIC); + } + + hdrsize = sa_hdrsize(&sa); + VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); + *userp = *((uint64_t *)((uintptr_t)data + hdrsize + + SA_UID_OFFSET)); + *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + + SA_GID_OFFSET)); + if (swap) { + *userp = BSWAP_64(*userp); + *groupp = BSWAP_64(*groupp); } } return (error); diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c index 0c86cac427..92dc05f4a0 100644 --- a/usr/src/uts/common/fs/zfs/zfs_znode.c +++ b/usr/src/uts/common/fs/zfs/zfs_znode.c @@ -1947,13 +1947,16 @@ zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag) * or not the object is an extended attribute directory. */ static int -zfs_obj_to_pobj(sa_handle_t *hdl, sa_attr_type_t *sa_table, uint64_t *pobjp, - int *is_xattrdir) +zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, + uint64_t *pobjp, int *is_xattrdir) { uint64_t parent; uint64_t pflags; uint64_t mode; + uint64_t parent_mode; sa_bulk_attr_t bulk[3]; + sa_handle_t *sa_hdl; + dmu_buf_t *sa_db; int count = 0; int error; @@ -1967,9 +1970,32 @@ zfs_obj_to_pobj(sa_handle_t *hdl, sa_attr_type_t *sa_table, uint64_t *pobjp, if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) return (error); - *pobjp = parent; + /* + * When a link is removed its parent pointer is not changed and will + * be invalid. There are two cases where a link is removed but the + * file stays around, when it goes to the delete queue and when there + * are additional links. + */ + error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG); + if (error != 0) + return (error); + + error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode)); + zfs_release_sa_handle(sa_hdl, sa_db, FTAG); + if (error != 0) + return (error); + *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); + /* + * Extended attributes can be applied to files, directories, etc. + * Otherwise the parent must be a directory. + */ + if (!*is_xattrdir && !S_ISDIR(parent_mode)) + return (EINVAL); + + *pobjp = parent; + return (0); } @@ -2018,7 +2044,7 @@ zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, if (prevdb) zfs_release_sa_handle(prevhdl, prevdb, FTAG); - if ((error = zfs_obj_to_pobj(sa_hdl, sa_table, &pobj, + if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, &is_xattrdir)) != 0) break; diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index e2e98b7896..00964aa83f 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -2928,7 +2928,7 @@ zio_done(zio_t *zio) * Hand it off to the otherwise-unused claim taskq. */ ASSERT(zio->io_tqent.tqent_next == NULL); - (void) taskq_dispatch_ent( + taskq_dispatch_ent( spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], (task_func_t *)zio_reexecute, zio, 0, &zio->io_tqent); diff --git a/usr/src/uts/common/fs/zfs/zio_inject.c b/usr/src/uts/common/fs/zfs/zio_inject.c index 9ae7d1f697..a9d4ab4070 100644 --- a/usr/src/uts/common/fs/zfs/zio_inject.c +++ b/usr/src/uts/common/fs/zfs/zio_inject.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -147,14 +148,8 @@ zio_handle_fault_injection(zio_t *zio, int error) for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { - /* Ignore errors not destined for this pool */ - if (zio->io_spa != handler->zi_spa) - continue; - - /* Ignore device errors and panic injection */ - if (handler->zi_record.zi_guid != 0 || - handler->zi_record.zi_func[0] != '\0' || - handler->zi_record.zi_duration != 0) + if (zio->io_spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT) continue; /* If this handler matches, return EIO */ @@ -197,10 +192,7 @@ zio_handle_label_injection(zio_t *zio, int error) uint64_t start = handler->zi_record.zi_start; uint64_t end = handler->zi_record.zi_end; - /* Ignore device only faults or panic injection */ - if (handler->zi_record.zi_start == 0 || - handler->zi_record.zi_func[0] != '\0' || - handler->zi_record.zi_duration != 0) + if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT) continue; /* @@ -246,13 +238,7 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error) for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { - /* - * Ignore label specific faults, panic injection - * or fake writes - */ - if (handler->zi_record.zi_start != 0 || - handler->zi_record.zi_func[0] != '\0' || - handler->zi_record.zi_duration != 0) + if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT) continue; if (vd->vdev_guid == handler->zi_record.zi_guid) { @@ -316,10 +302,8 @@ zio_handle_ignored_writes(zio_t *zio) handler = list_next(&inject_handlers, handler)) { /* Ignore errors not destined for this pool */ - if (zio->io_spa != handler->zi_spa) - continue; - - if (handler->zi_record.zi_duration == 0) + if (zio->io_spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) continue; /* @@ -355,11 +339,8 @@ spa_handle_ignored_writes(spa_t *spa) for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { - /* Ignore errors not destined for this pool */ - if (spa != handler->zi_spa) - continue; - - if (handler->zi_record.zi_duration == 0) + if (spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) continue; if (handler->zi_record.zi_duration > 0) { @@ -379,6 +360,34 @@ spa_handle_ignored_writes(spa_t *spa) rw_exit(&inject_lock); } +uint64_t +zio_handle_io_delay(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + inject_handler_t *handler; + uint64_t seconds = 0; + + if (zio_injection_enabled == 0) + return (0); + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO) + continue; + + if (vd->vdev_guid == handler->zi_record.zi_guid) { + seconds = handler->zi_record.zi_timer; + break; + } + + } + rw_exit(&inject_lock); + return (seconds); +} + /* * Create a new handler for the given record. We add it to the list, adding * a reference to the spa_t in the process. We increment zio_injection_enabled, |