Diffstat (limited to 'usr/src/uts/common')
56 files changed, 3633 insertions, 339 deletions
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index d91379be96..fa9a3a4bf4 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -21,8 +21,8 @@ # # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. -# Copyright 2011 Nexenta Systems, Inc. All rights reserved. # Copyright (c) 2012 Joyent, Inc. All rights reserved. +# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved. # Copyright (c) 2012 by Delphix. All rights reserved. # @@ -1953,6 +1953,16 @@ NXGE_HCALL_OBJS = \ nxge_hcall.o # +# Virtio modules +# + +# Virtio core +VIRTIO_OBJS = virtio.o + +# Virtio block driver +VIOBLK_OBJS = vioblk.o + +# # kiconv modules # KICONV_EMEA_OBJS += kiconv_emea.o diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules index b420a7d8e1..27478a210d 100644 --- a/usr/src/uts/common/Makefile.rules +++ b/usr/src/uts/common/Makefile.rules @@ -24,8 +24,8 @@ # # -# Copyright 2011 Nexenta Systems, Inc. All rights reserved. # Copyright (c) 2012 Joyent, Inc. All rights reserved. +# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved. # # @@ -1420,6 +1420,14 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/yge/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/virtio/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/vioblk/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + # # krtld must refer to its own bzero/bcopy until the kernel is fully linked # @@ -2671,6 +2679,12 @@ $(LINTS_DIR)/%.ln: $(COMMONBASE)/iscsi/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/kifconf/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/virtio/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/vioblk/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + ZMODLINTFLAGS = -erroff=E_CONSTANT_CONDITION $(LINTS_DIR)/%.ln: $(UTSBASE)/common/zmod/%.c diff --git a/usr/src/uts/common/cpr/cpr_main.c b/usr/src/uts/common/cpr/cpr_main.c index 68a4040186..15e8c6c8d8 100644 --- a/usr/src/uts/common/cpr/cpr_main.c +++ b/usr/src/uts/common/cpr/cpr_main.c @@ -166,7 +166,7 @@ cpr_main(int sleeptype) */ rc = i_cpr_power_down(sleeptype); if (rc == 0) { - PMD(PMD_SX, ("back from succssful suspend\n")) + PMD(PMD_SX, ("back from successful suspend\n")) } /* * We do care about the return value from cpr_resume diff --git a/usr/src/uts/common/exec/elf/elf_notes.c b/usr/src/uts/common/exec/elf/elf_notes.c index 8649e64d48..719d215dd5 100644 --- a/usr/src/uts/common/exec/elf/elf_notes.c +++ b/usr/src/uts/common/exec/elf/elf_notes.c @@ -24,7 +24,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. 
+ */ #include <sys/types.h> #include <sys/param.h> @@ -34,8 +36,11 @@ #include <sys/cred.h> #include <sys/priv.h> #include <sys/user.h> +#include <sys/file.h> #include <sys/errno.h> #include <sys/vnode.h> +#include <sys/mode.h> +#include <sys/vfs.h> #include <sys/mman.h> #include <sys/kmem.h> #include <sys/proc.h> @@ -57,6 +62,7 @@ #include <sys/modctl.h> #include <sys/systeminfo.h> #include <sys/machelf.h> +#include <sys/sunddi.h> #include "elf_impl.h" #if defined(__i386) || defined(__i386_COMPAT) #include <sys/sysi86.h> @@ -67,12 +73,27 @@ setup_note_header(Phdr *v, proc_t *p) { int nlwp = p->p_lwpcnt; int nzomb = p->p_zombcnt; + int nfd; size_t size; prcred_t *pcrp; + uf_info_t *fip; + uf_entry_t *ufp; + int fd; + + fip = P_FINFO(p); + nfd = 0; + mutex_enter(&fip->fi_lock); + for (fd = 0; fd < fip->fi_nfiles; fd++) { + UF_ENTER(ufp, fip, fd); + if ((ufp->uf_file != NULL) && (ufp->uf_file->f_count > 0)) + nfd++; + UF_EXIT(ufp); + } + mutex_exit(&fip->fi_lock); v[0].p_type = PT_NOTE; v[0].p_flags = PF_R; - v[0].p_filesz = (sizeof (Note) * (9 + 2 * nlwp + nzomb)) + v[0].p_filesz = (sizeof (Note) * (9 + 2 * nlwp + nzomb + nfd)) + roundup(sizeof (psinfo_t), sizeof (Word)) + roundup(sizeof (pstatus_t), sizeof (Word)) + roundup(prgetprivsize(), sizeof (Word)) @@ -83,7 +104,8 @@ setup_note_header(Phdr *v, proc_t *p) + roundup(sizeof (utsname), sizeof (Word)) + roundup(sizeof (core_content_t), sizeof (Word)) + (nlwp + nzomb) * roundup(sizeof (lwpsinfo_t), sizeof (Word)) - + nlwp * roundup(sizeof (lwpstatus_t), sizeof (Word)); + + nlwp * roundup(sizeof (lwpstatus_t), sizeof (Word)) + + nfd * roundup(sizeof (prfdinfo_t), sizeof (Word)); size = sizeof (prcred_t) + sizeof (gid_t) * (ngroups_max - 1); pcrp = kmem_alloc(size, KM_SLEEP); @@ -97,6 +119,7 @@ setup_note_header(Phdr *v, proc_t *p) } kmem_free(pcrp, size); + #if defined(__i386) || defined(__i386_COMPAT) mutex_enter(&p->p_ldtlock); size = prnldt(p) * sizeof (struct ssd); @@ -159,7 +182,7 @@ write_elfnotes(proc_t *p, int sig, vnode_t *vp, offset_t offset, size_t crsize = sizeof (prcred_t) + sizeof (gid_t) * (ngroups_max - 1); size_t psize = prgetprivsize(); size_t bigsize = MAX(psize, MAX(sizeof (*bigwad), - MAX(xregsize, crsize))); + MAX(xregsize, crsize))); priv_impl_info_t *prii; @@ -173,6 +196,10 @@ write_elfnotes(proc_t *p, int sig, vnode_t *vp, offset_t offset, int nzomb; int error; uchar_t oldsig; + uf_info_t *fip; + int fd; + vnode_t *vroot; + #if defined(__i386) || defined(__i386_COMPAT) struct ssd *ssd; size_t ssdsize; @@ -293,6 +320,89 @@ write_elfnotes(proc_t *p, int sig, vnode_t *vp, offset_t offset, if (error) goto done; + + /* open file table */ + vroot = PTOU(p)->u_rdir; + if (vroot == NULL) + vroot = rootdir; + + VN_HOLD(vroot); + + fip = P_FINFO(p); + + for (fd = 0; fd < fip->fi_nfiles; fd++) { + uf_entry_t *ufp; + vnode_t *fvp; + struct file *fp; + vattr_t vattr; + prfdinfo_t fdinfo; + + bzero(&fdinfo, sizeof (fdinfo)); + + mutex_enter(&fip->fi_lock); + UF_ENTER(ufp, fip, fd); + if (((fp = ufp->uf_file) == NULL) || (fp->f_count < 1)) { + UF_EXIT(ufp); + mutex_exit(&fip->fi_lock); + continue; + } + + fdinfo.pr_fd = fd; + fdinfo.pr_fdflags = ufp->uf_flag; + fdinfo.pr_fileflags = fp->f_flag2; + fdinfo.pr_fileflags <<= 16; + fdinfo.pr_fileflags |= fp->f_flag; + if ((fdinfo.pr_fileflags & (FSEARCH | FEXEC)) == 0) + fdinfo.pr_fileflags += FOPEN; + fdinfo.pr_offset = fp->f_offset; + + + fvp = fp->f_vnode; + VN_HOLD(fvp); + UF_EXIT(ufp); + mutex_exit(&fip->fi_lock); + + /* + * There are some vnodes that have no corresponding + * 
path. It's reasonable for this to fail, in which + * case the path will remain an empty string. + */ + (void) vnodetopath(vroot, fvp, fdinfo.pr_path, + sizeof (fdinfo.pr_path), credp); + + error = VOP_GETATTR(fvp, &vattr, 0, credp, NULL); + if (error != 0) { + VN_RELE(fvp); + VN_RELE(vroot); + goto done; + } + + if (fvp->v_type == VSOCK) + fdinfo.pr_fileflags |= sock_getfasync(fvp); + + VN_RELE(fvp); + + /* + * This logic mirrors fstat(), which we cannot use + * directly, as it calls copyout(). + */ + fdinfo.pr_major = getmajor(vattr.va_fsid); + fdinfo.pr_minor = getminor(vattr.va_fsid); + fdinfo.pr_ino = (ino64_t)vattr.va_nodeid; + fdinfo.pr_mode = VTTOIF(vattr.va_type) | vattr.va_mode; + fdinfo.pr_uid = vattr.va_uid; + fdinfo.pr_gid = vattr.va_gid; + fdinfo.pr_rmajor = getmajor(vattr.va_rdev); + fdinfo.pr_rminor = getminor(vattr.va_rdev); + fdinfo.pr_size = (off64_t)vattr.va_size; + + error = elfnote(vp, &offset, NT_FDINFO, + sizeof (fdinfo), &fdinfo, rlimit, credp); + if (error) { + goto done; + } + } + #if defined(__i386) || defined(__i386_COMPAT) mutex_enter(&p->p_ldtlock); ssdsize = prnldt(p) * sizeof (struct ssd); diff --git a/usr/src/uts/common/fs/nfs/nfs3_srv.c b/usr/src/uts/common/fs/nfs/nfs3_srv.c index c72f823cd3..4acbe92ad9 100644 --- a/usr/src/uts/common/fs/nfs/nfs3_srv.c +++ b/usr/src/uts/common/fs/nfs/nfs3_srv.c @@ -433,16 +433,25 @@ rfs3_lookup(LOOKUP3args *args, LOOKUP3res *resp, struct exportinfo *exi, goto out1; } + exi_hold(exi); + /* * If the public filehandle is used then allow * a multi-component lookup */ if (PUBLIC_FH3(&args->what.dir)) { + struct exportinfo *new; + publicfh_flag = TRUE; + error = rfs_publicfh_mclookup(name, dvp, cr, &vp, - &exi, &sec); - if (error && exi != NULL) - exi_rele(exi); /* See comment below Re: publicfh_flag */ + &new, &sec); + + if (error == 0) { + exi_rele(exi); + exi = new; + } + /* * Since WebNFS may bypass MOUNT, we need to ensure this * request didn't come from an unlabeled admin_low client. @@ -464,8 +473,6 @@ rfs3_lookup(LOOKUP3args *args, LOOKUP3res *resp, struct exportinfo *exi, if (tp == NULL || tp->tpc_tp.tp_doi != l_admin_low->tsl_doi || tp->tpc_tp.host_type != SUN_CIPSO) { - if (exi != NULL) - exi_rele(exi); VN_RELE(vp); resp->status = NFS3ERR_ACCES; error = 1; @@ -491,8 +498,6 @@ rfs3_lookup(LOOKUP3args *args, LOOKUP3res *resp, struct exportinfo *exi, if (!blequal(&l_admin_low->tsl_label, clabel)) { if (!do_rfs_label_check(clabel, dvp, DOMINANCE_CHECK, exi)) { - if (publicfh_flag && exi != NULL) - exi_rele(exi); VN_RELE(vp); resp->status = NFS3ERR_ACCES; error = 1; @@ -519,18 +524,10 @@ rfs3_lookup(LOOKUP3args *args, LOOKUP3res *resp, struct exportinfo *exi, goto out; } - /* - * If publicfh_flag is true then we have called rfs_publicfh_mclookup - * and have obtained a new exportinfo in exi which needs to be - * released. Note the the original exportinfo pointed to by exi - * will be released by the caller, common_dispatch. - */ - if (publicfh_flag) - exi_rele(exi); - va.va_mask = AT_ALL; vap = rfs4_delegated_getattr(vp, &va, 0, cr) ? 
NULL : &va; + exi_rele(exi); VN_RELE(vp); resp->status = NFS3_OK; @@ -552,6 +549,12 @@ rfs3_lookup(LOOKUP3args *args, LOOKUP3res *resp, struct exportinfo *exi, return; out: + /* + * The passed argument exportinfo is released by the + * caller, common_dispatch + */ + exi_rele(exi); + if (curthread->t_flag & T_WOULDBLOCK) { curthread->t_flag &= ~T_WOULDBLOCK; resp->status = NFS3ERR_JUKEBOX; diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv.c b/usr/src/uts/common/fs/nfs/nfs4_srv.c index 29a9d67497..f2a9734541 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_srv.c +++ b/usr/src/uts/common/fs/nfs/nfs4_srv.c @@ -21,6 +21,9 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. */ +/* + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. + */ /* * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. @@ -1131,6 +1134,7 @@ rfs4_op_secinfo(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, char *nm; struct sockaddr *ca; char *name = NULL; + nfsstat4 status = NFS4_OK; DTRACE_NFSV4_2(op__secinfo__start, struct compound_state *, cs, SECINFO4args *, args); @@ -1154,11 +1158,12 @@ rfs4_op_secinfo(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, * do not error out if the component name is a "..". * SECINFO will return its parents secinfo data for SECINFO "..". */ - if (!utf8_dir_verify(utfnm)) { + status = utf8_dir_verify(utfnm); + if (status != NFS4_OK) { if (utfnm->utf8string_len != 2 || utfnm->utf8string_val[0] != '.' || utfnm->utf8string_val[1] != '.') { - *cs->statusp = resp->status = NFS4ERR_INVAL; + *cs->statusp = resp->status = status; goto out; } } @@ -1336,7 +1341,8 @@ rfs4_op_access(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, blequal(clabel, slabel))) resp->access |= (args->access & (ACCESS4_MODIFY | ACCESS4_EXTEND)); - resp->supported |= (ACCESS4_MODIFY | ACCESS4_EXTEND); + resp->supported |= + resp->access & (ACCESS4_MODIFY | ACCESS4_EXTEND); } if (checkwriteperm && @@ -1570,8 +1576,9 @@ rfs4_op_create(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, *cs->statusp = resp->status = NFS4ERR_NOTDIR; goto out; } - if (!utf8_dir_verify(&args->objname)) { - *cs->statusp = resp->status = NFS4ERR_INVAL; + status = utf8_dir_verify(&args->objname); + if (status != NFS4_OK) { + *cs->statusp = resp->status = status; goto out; } @@ -2446,6 +2453,7 @@ rfs4_op_link(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, uint_t len; struct sockaddr *ca; char *name = NULL; + nfsstat4 status; DTRACE_NFSV4_2(op__link__start, struct compound_state *, cs, LINK4args *, args); @@ -2495,8 +2503,9 @@ rfs4_op_link(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, goto out; } - if (!utf8_dir_verify(&args->newname)) { - *cs->statusp = resp->status = NFS4ERR_INVAL; + status = utf8_dir_verify(&args->newname); + if (status != NFS4_OK) { + *cs->statusp = resp->status = status; goto out; } @@ -2886,6 +2895,7 @@ rfs4_op_lookup(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, uint_t len; struct sockaddr *ca; char *name = NULL; + nfsstat4 status; DTRACE_NFSV4_2(op__lookup__start, struct compound_state *, cs, LOOKUP4args *, args); @@ -2905,8 +2915,9 @@ rfs4_op_lookup(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, goto out; } - if (!utf8_dir_verify(&args->objname)) { - *cs->statusp = resp->status = NFS4ERR_INVAL; + status = utf8_dir_verify(&args->objname); + if (status != NFS4_OK) { + *cs->statusp = resp->status = status; goto out; } @@ -3655,30 +3666,6 @@ out: } /* - * A directory entry is a valid nfsv4 entry if - * - it 
has a non-zero ino - * - it is not a dot or dotdot name - * - it is visible in a pseudo export or in a real export that can - * only have a limited view. - */ -static bool_t -valid_nfs4_entry(struct exportinfo *exi, struct dirent64 *dp, - int *expseudo, int check_visible) -{ - if (dp->d_ino == 0 || NFS_IS_DOTNAME(dp->d_name)) { - *expseudo = 0; - return (FALSE); - } - - if (! check_visible) { - *expseudo = 0; - return (TRUE); - } - - return (nfs_visible_inode(exi, dp->d_ino, expseudo)); -} - -/* * set_rdattr_params sets up the variables used to manage what information * to get for each directory entry. */ @@ -4101,6 +4088,7 @@ rfs4_op_remove(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, bslabel_t *clabel; struct sockaddr *ca; char *name = NULL; + nfsstat4 status; DTRACE_NFSV4_2(op__remove__start, struct compound_state *, cs, REMOVE4args *, args); @@ -4131,8 +4119,9 @@ rfs4_op_remove(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, goto out; } - if (!utf8_dir_verify(&args->target)) { - *cs->statusp = resp->status = NFS4ERR_INVAL; + status = utf8_dir_verify(&args->target); + if (status != NFS4_OK) { + *cs->statusp = resp->status = status; goto out; } @@ -4398,6 +4387,7 @@ rfs4_op_rename(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, struct sockaddr *ca; char *converted_onm = NULL; char *converted_nnm = NULL; + nfsstat4 status; DTRACE_NFSV4_2(op__rename__start, struct compound_state *, cs, RENAME4args *, args); @@ -4454,13 +4444,15 @@ rfs4_op_rename(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, goto out; } - if (!utf8_dir_verify(&args->oldname)) { - *cs->statusp = resp->status = NFS4ERR_INVAL; + status = utf8_dir_verify(&args->oldname); + if (status != NFS4_OK) { + *cs->statusp = resp->status = status; goto out; } - if (!utf8_dir_verify(&args->newname)) { - *cs->statusp = resp->status = NFS4ERR_INVAL; + status = utf8_dir_verify(&args->newname); + if (status != NFS4_OK) { + *cs->statusp = resp->status = status; goto out; } @@ -5789,6 +5781,8 @@ rfs4_compound(COMPOUND4args *args, COMPOUND4res *resp, struct exportinfo *exi, cs.statusp = &resp->status; cs.req = req; + resp->array = NULL; + resp->array_len = 0; /* * XXX for now, minorversion should be zero @@ -5796,14 +5790,17 @@ rfs4_compound(COMPOUND4args *args, COMPOUND4res *resp, struct exportinfo *exi, if (args->minorversion != NFS4_MINORVERSION) { DTRACE_NFSV4_2(compound__start, struct compound_state *, &cs, COMPOUND4args *, args); - resp->array_len = 0; - resp->array = NULL; resp->status = NFS4ERR_MINOR_VERS_MISMATCH; DTRACE_NFSV4_2(compound__done, struct compound_state *, &cs, COMPOUND4res *, resp); return; } + if (args->array_len == 0) { + resp->status = NFS4_OK; + return; + } + ASSERT(exi == NULL); ASSERT(cr == NULL); @@ -6079,8 +6076,9 @@ rfs4_lookup(component4 *component, struct svc_req *req, return (NFS4ERR_NOTDIR); } - if (!utf8_dir_verify(component)) - return (NFS4ERR_INVAL); + status = utf8_dir_verify(component); + if (status != NFS4_OK) + return (status); nm = utf8_to_fn(component, &len, NULL); if (nm == NULL) { @@ -6372,8 +6370,9 @@ rfs4_createfile(OPEN4args *args, struct svc_req *req, struct compound_state *cs, * the including directory on success. 
*/ component = &args->open_claim4_u.file; - if (!utf8_dir_verify(component)) - return (NFS4ERR_INVAL); + status = utf8_dir_verify(component); + if (status != NFS4_OK) + return (status); nm = utf8_to_fn(component, &buflen, NULL); @@ -7594,6 +7593,12 @@ rfs4_op_open_confirm(nfs_argop4 *argop, nfs_resop4 *resop, goto out; } + if (cs->vp->v_type != VREG) { + *cs->statusp = resp->status = + cs->vp->v_type == VDIR ? NFS4ERR_ISDIR : NFS4ERR_INVAL; + return; + } + status = rfs4_get_state(&args->open_stateid, &sp, RFS4_DBS_VALID); if (status != NFS4_OK) { *cs->statusp = resp->status = status; @@ -7709,6 +7714,11 @@ rfs4_op_open_downgrade(nfs_argop4 *argop, nfs_resop4 *resop, goto out; } + if (cs->vp->v_type != VREG) { + *cs->statusp = resp->status = NFS4ERR_INVAL; + return; + } + status = rfs4_get_state(&args->open_stateid, &sp, RFS4_DBS_VALID); if (status != NFS4_OK) { *cs->statusp = resp->status = status; diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c b/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c index dbd3263608..855cd8cd92 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c +++ b/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c @@ -22,6 +22,9 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. + */ #include <sys/systm.h> #include <sys/cmn_err.h> @@ -1585,7 +1588,8 @@ rfs4_fattr4_fs_locations(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg, case NFS4ATTR_GETIT: fsl = fetch_referral(sarg->cs->vp, sarg->cs->cr); if (fsl == NULL) - error = EINVAL; + (void) memset(&(na->fs_locations), 0, + sizeof (fs_locations4)); else { na->fs_locations = *fsl; kmem_free(fsl, sizeof (fs_locations4)); diff --git a/usr/src/uts/common/fs/nfs/nfs4_subr.c b/usr/src/uts/common/fs/nfs/nfs4_subr.c index c14117c009..cfac742707 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_subr.c +++ b/usr/src/uts/common/fs/nfs/nfs4_subr.c @@ -22,6 +22,9 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. + */ /* * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. @@ -713,33 +716,33 @@ utf8_compare(const utf8string *a, const utf8string *b) /* * utf8_dir_verify - checks that the utf8 string is valid */ -int +nfsstat4 utf8_dir_verify(utf8string *str) { char *nm; int len; if (str == NULL) - return (0); + return (NFS4ERR_INVAL); nm = str->utf8string_val; len = str->utf8string_len; if (nm == NULL || len == 0) { - return (0); + return (NFS4ERR_INVAL); } if (len == 1 && nm[0] == '.') - return (0); + return (NFS4ERR_BADNAME); if (len == 2 && nm[0] == '.' && nm[1] == '.') - return (0); + return (NFS4ERR_BADNAME); if (utf8_strchr(str, '/') != NULL) - return (0); + return (NFS4ERR_BADNAME); if (utf8_strchr(str, '\0') != NULL) - return (0); + return (NFS4ERR_BADNAME); - return (1); + return (NFS4_OK); } /* diff --git a/usr/src/uts/common/fs/nfs/nfs_server.c b/usr/src/uts/common/fs/nfs/nfs_server.c index bb625bb175..22d1ad4d68 100644 --- a/usr/src/uts/common/fs/nfs/nfs_server.c +++ b/usr/src/uts/common/fs/nfs/nfs_server.c @@ -22,6 +22,7 @@ * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Bayard G. Bell. All rights reserved. * Copyright (c) 2012 Joyent, Inc. All rights reserved. + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. 
*/ /* @@ -2804,8 +2805,8 @@ rfs_publicfh_mclookup(char *p, vnode_t *dvp, cred_t *cr, vnode_t **vpp, */ /* Release the reference on the old exi value */ - ASSERT(*exi != NULL); exi_rele(*exi); + *exi = NULL; if (error = nfs_check_vpexi(mc_dvp, *vpp, kcred, exi)) { VN_RELE(*vpp); @@ -2818,6 +2819,9 @@ publicfh_done: if (mc_dvp) VN_RELE(mc_dvp); + if (error && *exi != NULL) + exi_rele(*exi); + return (error); } @@ -2963,16 +2967,19 @@ URLparse(char *str) /* * Get the export information for the lookup vnode, and verify it's * usable. + * + * Set @exip only on success */ int nfs_check_vpexi(vnode_t *mc_dvp, vnode_t *vp, cred_t *cr, - struct exportinfo **exi) + struct exportinfo **exip) { int walk; int error = 0; + struct exportinfo *exi; - *exi = nfs_vptoexi(mc_dvp, vp, cr, &walk, NULL, FALSE); - if (*exi == NULL) + exi = nfs_vptoexi(mc_dvp, vp, cr, &walk, NULL, FALSE); + if (exi == NULL) error = EACCES; else { /* @@ -2981,10 +2988,13 @@ nfs_check_vpexi(vnode_t *mc_dvp, vnode_t *vp, cred_t *cr, * must not terminate below the * exported directory. */ - if ((*exi)->exi_export.ex_flags & EX_NOSUB && walk > 0) + if (exi->exi_export.ex_flags & EX_NOSUB && walk > 0) { error = EACCES; + exi_rele(exi); + } } - + if (error == 0) + *exip = exi; return (error); } diff --git a/usr/src/uts/common/fs/nfs/nfs_srv.c b/usr/src/uts/common/fs/nfs/nfs_srv.c index 8ca8ee5d1d..f0cd9633aa 100644 --- a/usr/src/uts/common/fs/nfs/nfs_srv.c +++ b/usr/src/uts/common/fs/nfs/nfs_srv.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. */ /* @@ -399,6 +400,8 @@ rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr, return; } + exi_hold(exi); + /* * If the public filehandle is used then allow * a multi-component lookup, i.e. evaluate @@ -409,9 +412,16 @@ rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr, * which is OK as long as the filesystem is exported. */ if (PUBLIC_FH2(fhp)) { + struct exportinfo *new; + publicfh_flag = TRUE; - error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi, + error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &new, &sec); + + if (error == 0) { + exi_rele(exi); + exi = new; + } } else { /* * Do a normal single component lookup. @@ -452,13 +462,10 @@ rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr, VN_RELE(dvp); /* - * If publicfh_flag is true then we have called rfs_publicfh_mclookup - * and have obtained a new exportinfo in exi which needs to be - * released. Note the the original exportinfo pointed to by exi - * will be released by the caller, comon_dispatch. + * The passed argument exportinfo is released by the + * caller, common_dispatch */ - if (publicfh_flag && exi != NULL) - exi_rele(exi); + exi_rele(exi); /* * If it's public fh, no 0x81, and client's flavor is diff --git a/usr/src/uts/common/fs/smbsrv/smb_delete.c b/usr/src/uts/common/fs/smbsrv/smb_delete.c index 43f6d733bd..8a27b7408e 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_delete.c +++ b/usr/src/uts/common/fs/smbsrv/smb_delete.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. */ #include <smbsrv/smb_kproto.h> @@ -553,7 +554,7 @@ smb_delete_check_path(smb_request_t *sr) /* fname component is, or resolves to, '.' 
(dot) */ if ((strcmp(pn->pn_fname, ".") == 0) || (SMB_SEARCH_DIRECTORY(fqi->fq_sattr) && - (smb_match(pn->pn_fname, ".")))) { + (smb_match(pn->pn_fname, ".", B_FALSE)))) { smbsr_error(sr, NT_STATUS_OBJECT_NAME_INVALID, ERRDOS, ERROR_INVALID_NAME); return (-1); diff --git a/usr/src/uts/common/fs/smbsrv/smb_kutil.c b/usr/src/uts/common/fs/smbsrv/smb_kutil.c index 5d45081e2e..aed58277be 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_kutil.c +++ b/usr/src/uts/common/fs/smbsrv/smb_kutil.c @@ -18,8 +18,10 @@ * * CDDL HEADER END */ + /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. */ #include <sys/param.h> @@ -98,116 +100,34 @@ smb_ascii_or_unicode_null_len(struct smb_request *sr) } /* - * Return B_TRUE if pattern contains wildcards - */ -boolean_t -smb_contains_wildcards(const char *pattern) -{ - static const char *wildcards = "*?"; - - return (strpbrk(pattern, wildcards) != NULL); -} - -/* - * When converting wildcards a '.' in a name is treated as a base and - * extension separator even if the name is longer than 8.3. - * - * The '*' character matches an entire part of the name. For example, - * "*.abc" matches any name with an extension of "abc". * - * The '?' character matches a single character. - * If the base contains all ? (8 or more) then it is treated as *. - * If the extension contains all ? (3 or more) then it is treated as *. - * - * Clients convert ASCII wildcards to Unicode wildcards as follows: + * Convert old-style (DOS, LanMan) wildcard strings to NT style. + * This should ONLY happen to patterns that come from old clients, + * meaning dialect LANMAN2_1 etc. (dialect < NT_LM_0_12). * * ? is converted to > - * . is converted to " if it is followed by ? or * * * is converted to < if it is followed by . + * . is converted to " if it is followed by ? or * or end of pattern * - * Note that clients convert "*." to '< and drop the '.' but "*.txt" - * is sent as "<.TXT", i.e. - * - * dir *. -> dir < - * dir *.txt -> dir <.TXT - * - * Since " and < are illegal in Windows file names, we always convert - * these Unicode wildcards without checking the following character. + * Note: modifies pattern in place. */ void smb_convert_wildcards(char *pattern) { - static char *match_all[] = { - "*.", - "*.*" - }; - char *extension; char *p; - int len; - int i; - /* - * Special case "<" for "dir *.", and fast-track for "*". - */ - if ((*pattern == '<') || (*pattern == '*')) { - if (*(pattern + 1) == '\0') { - *pattern = '*'; - return; - } - } - - for (p = pattern; *p != '\0'; ++p) { + for (p = pattern; *p != '\0'; p++) { switch (*p) { - case '<': - *p = '*'; - break; - case '>': - *p = '?'; + case '?': + *p = '>'; break; - case '\"': - *p = '.'; + case '*': + if (p[1] == '.') + *p = '<'; break; - default: - break; - } - } - - /* - * Replace "????????.ext" with "*.ext". - */ - p = pattern; - p += strspn(p, "?"); - if (*p == '.') { - *p = '\0'; - len = strlen(pattern); - *p = '.'; - if (len >= SMB_NAME83_BASELEN) { - *pattern = '*'; - (void) strlcpy(pattern + 1, p, MAXPATHLEN - 1); - } - } - - /* - * Replace "base.???" with 'base.*'. - */ - if ((extension = strrchr(pattern, '.')) != NULL) { - p = ++extension; - p += strspn(p, "?"); - if (*p == '\0') { - len = strlen(extension); - if (len >= SMB_NAME83_EXTLEN) { - *extension = '\0'; - (void) strlcat(pattern, "*", MAXPATHLEN); - } - } - } - - /* - * Replace anything that matches an entry in match_all with "*". 
- */ - for (i = 0; i < sizeof (match_all) / sizeof (match_all[0]); ++i) { - if (strcmp(pattern, match_all[i]) == 0) { - (void) strlcpy(pattern, "*", MAXPATHLEN); + case '.': + if (p[1] == '?' || p[1] == '*' || p[1] == '\0') + *p = '\"'; break; } } diff --git a/usr/src/uts/common/fs/smbsrv/smb_odir.c b/usr/src/uts/common/fs/smbsrv/smb_odir.c index ea9b505f0d..610126753b 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_odir.c +++ b/usr/src/uts/common/fs/smbsrv/smb_odir.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. */ /* @@ -286,7 +287,8 @@ smb_odir_open(smb_request_t *sr, char *path, uint16_t sattr, uint32_t flags) tree = sr->tid_tree; - smb_convert_wildcards(path); + if (sr->session->dialect < NT_LM_0_12) + smb_convert_wildcards(path); rc = smb_pathname_reduce(sr, sr->user_cr, path, tree->t_snode, tree->t_snode, &dnode, pattern); @@ -1278,22 +1280,23 @@ smb_odir_lookup_link(smb_request_t *sr, smb_odir_t *od, * - If shortnames are supported, generate the shortname from * odirent->od_name and check if it matches od->d_pattern. */ -boolean_t +static boolean_t smb_odir_match_name(smb_odir_t *od, smb_odirent_t *odirent) { char *name = odirent->od_name; char shortname[SMB_SHORTNAMELEN]; ino64_t ino = odirent->od_ino; + boolean_t ci = (od->d_flags & SMB_ODIR_FLAG_IGNORE_CASE) != 0; if (smb_is_reserved_dos_name(name)) return (B_FALSE); - if (smb_match_ci(od->d_pattern, name)) + if (smb_match(od->d_pattern, name, ci)) return (B_TRUE); if (od->d_flags & SMB_ODIR_FLAG_SHORTNAMES) { smb_mangle(name, ino, shortname, SMB_SHORTNAMELEN); - if (smb_match_ci(od->d_pattern, shortname)) + if (smb_match(od->d_pattern, shortname, ci)) return (B_TRUE); } diff --git a/usr/src/uts/common/fs/smbsrv/smb_pathname.c b/usr/src/uts/common/fs/smbsrv/smb_pathname.c index e3ae3ffba2..db9883667e 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_pathname.c +++ b/usr/src/uts/common/fs/smbsrv/smb_pathname.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. */ #include <smbsrv/smb_kproto.h> @@ -732,8 +733,8 @@ smb_pathname_preprocess(smb_request_t *sr, smb_pathname_t *pn) return; } - /* perform unicode wildcard conversion */ - smb_convert_wildcards(pn->pn_path); + if (sr->session->dialect < NT_LM_0_12) + smb_convert_wildcards(pn->pn_path); /* treat '/' as '\\' */ (void) strsubst(pn->pn_path, '/', '\\'); diff --git a/usr/src/uts/common/fs/vfs.c b/usr/src/uts/common/fs/vfs.c index abe3a23e75..8d5c741428 100644 --- a/usr/src/uts/common/fs/vfs.c +++ b/usr/src/uts/common/fs/vfs.c @@ -936,29 +936,33 @@ vfs_mountroot(void) } #endif /* __sparc */ - /* - * Look up the root device via devfs so that a dv_node is - * created for it. The vnode is never VN_RELE()ed. - * We allocate more than MAXPATHLEN so that the - * buffer passed to i_ddi_prompath_to_devfspath() is - * exactly MAXPATHLEN (the function expects a buffer - * of that length). - */ - plen = strlen("/devices"); - path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP); - (void) strcpy(path, "/devices"); + if (strcmp(rootfs.bo_fstype, "zfs") != 0) { + /* + * Look up the root device via devfs so that a dv_node is + * created for it. The vnode is never VN_RELE()ed. + * We allocate more than MAXPATHLEN so that the + * buffer passed to i_ddi_prompath_to_devfspath() is + * exactly MAXPATHLEN (the function expects a buffer + * of that length). 
+ */ + plen = strlen("/devices"); + path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP); + (void) strcpy(path, "/devices"); - if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen) - != DDI_SUCCESS || - lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) { + if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen) + != DDI_SUCCESS || + lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) { - /* NUL terminate in case "path" has garbage */ - path[plen + MAXPATHLEN - 1] = '\0'; + /* NUL terminate in case "path" has garbage */ + path[plen + MAXPATHLEN - 1] = '\0'; #ifdef DEBUG - cmn_err(CE_WARN, "!Cannot lookup root device: %s", path); + cmn_err(CE_WARN, "!Cannot lookup root device: %s", + path); #endif + } + kmem_free(path, plen + MAXPATHLEN); } - kmem_free(path, plen + MAXPATHLEN); + vfs_mnttabvp_setup(); } diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 5caabf8260..d8e9f26bdb 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -190,6 +190,7 @@ uint64_t zfs_arc_meta_limit = 0; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; +int zfs_disable_dup_eviction = 0; /* * Note that buffers can be in one of 6 states: @@ -292,6 +293,9 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_size; kstat_named_t arcstat_l2_hdr_size; kstat_named_t arcstat_memory_throttle_count; + kstat_named_t arcstat_duplicate_buffers; + kstat_named_t arcstat_duplicate_buffers_size; + kstat_named_t arcstat_duplicate_reads; } arc_stats_t; static arc_stats_t arc_stats = { @@ -347,7 +351,10 @@ static arc_stats_t arc_stats = { { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, - { "memory_throttle_count", KSTAT_DATA_UINT64 } + { "memory_throttle_count", KSTAT_DATA_UINT64 }, + { "duplicate_buffers", KSTAT_DATA_UINT64 }, + { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, + { "duplicate_reads", KSTAT_DATA_UINT64 } }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) @@ -1362,6 +1369,17 @@ arc_buf_clone(arc_buf_t *from) hdr->b_buf = buf; arc_get_data_buf(buf); bcopy(from->b_data, buf->b_data, size); + + /* + * This buffer already exists in the arc so create a duplicate + * copy for the caller. If the buffer is associated with user data + * then track the size and number of duplicates. These stats will be + * updated as duplicate buffers are created and destroyed. + */ + if (hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMP(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); + } hdr->b_datacnt += 1; return (buf); } @@ -1460,6 +1478,16 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) ASSERT3U(state->arcs_size, >=, size); atomic_add_64(&state->arcs_size, -size); buf->b_data = NULL; + + /* + * If we're destroying a duplicate buffer make sure + * that the appropriate statistics are updated. + */ + if (buf->b_hdr->b_datacnt > 1 && + buf->b_hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); + } ASSERT(buf->b_hdr->b_datacnt > 0); buf->b_hdr->b_datacnt -= 1; } @@ -1644,6 +1672,48 @@ arc_buf_size(arc_buf_t *buf) } /* + * Called from the DMU to determine if the current buffer should be + * evicted. In order to ensure proper locking, the eviction must be initiated + * from the DMU. Return true if the buffer is associated with user data and + * duplicate buffers still exist. 
+ */ +boolean_t +arc_buf_eviction_needed(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr; + boolean_t evict_needed = B_FALSE; + + if (zfs_disable_dup_eviction) + return (B_FALSE); + + mutex_enter(&buf->b_evict_lock); + hdr = buf->b_hdr; + if (hdr == NULL) { + /* + * We are in arc_do_user_evicts(); let that function + * perform the eviction. + */ + ASSERT(buf->b_data == NULL); + mutex_exit(&buf->b_evict_lock); + return (B_FALSE); + } else if (buf->b_data == NULL) { + /* + * We have already been added to the arc eviction list; + * recommend eviction. + */ + ASSERT3P(hdr, ==, &arc_eviction_hdr); + mutex_exit(&buf->b_evict_lock); + return (B_TRUE); + } + + if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA) + evict_needed = B_TRUE; + + mutex_exit(&buf->b_evict_lock); + return (evict_needed); +} + +/* * Evict buffers from list until we've removed the specified number of * bytes. Move the removed buffers to the appropriate evict state. * If the recycle flag is set, then attempt to "recycle" a buffer: @@ -2638,8 +2708,10 @@ arc_read_done(zio_t *zio) abuf = buf; for (acb = callback_list; acb; acb = acb->acb_next) { if (acb->acb_done) { - if (abuf == NULL) + if (abuf == NULL) { + ARCSTAT_BUMP(arcstat_duplicate_reads); abuf = arc_buf_clone(buf); + } acb->acb_buf = abuf; abuf = NULL; } @@ -3186,6 +3258,16 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT3U(*size, >=, hdr->b_size); atomic_add_64(size, -hdr->b_size); } + + /* + * We're releasing a duplicate user data buffer, update + * our statistics accordingly. + */ + if (hdr->b_type == ARC_BUFC_DATA) { + ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); + ARCSTAT_INCR(arcstat_duplicate_buffers_size, + -hdr->b_size); + } hdr->b_datacnt -= 1; arc_cksum_verify(buf); arc_buf_unwatch(buf); diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index 437e0ac85c..e8bf55c321 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -2089,7 +2089,24 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) dbuf_evict(db); } else { VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); - if (!DBUF_IS_CACHEABLE(db)) + + /* + * A dbuf will be eligible for eviction if either the + * 'primarycache' property is set or a duplicate + * copy of this buffer is already cached in the arc. + * + * In the case of the 'primarycache' a buffer + * is considered for eviction if it matches the + * criteria set in the property. + * + * To decide if our buffer is considered a + * duplicate, we must call into the arc to determine + * if multiple buffers are referencing the same + * block on-disk. If so, then we simply evict + * ourselves. 
+ */ + if (!DBUF_IS_CACHEABLE(db) || + arc_buf_eviction_needed(db->db_buf)) dbuf_clear(db); else mutex_exit(&db->db_mtx); } diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c index 190b26e5bf..a9308b0c08 100644 --- a/usr/src/uts/common/fs/zfs/dmu_tx.c +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c @@ -574,7 +574,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) (dn->dn_indblkshift - SPA_BLKPTRSHIFT); while (level++ < maxlevel) { - txh->txh_memory_tohold += MIN(blkcnt, (nl1blks >> epbs)) + txh->txh_memory_tohold += MAX(MIN(blkcnt, nl1blks), 1) << dn->dn_indblkshift; blkcnt = 1 + (blkcnt >> epbs); } diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index d9cd70f1c8..968fbd80d6 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -5983,6 +5983,10 @@ spa_sync(spa_t *spa, uint64_t txg) tx = dmu_tx_create_assigned(dp, txg); + spa->spa_sync_starttime = gethrtime(); + VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, + spa->spa_sync_starttime + spa->spa_deadman_synctime)); + /* * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, * set spa_deflate if we have no raid-z vdevs. @@ -6111,6 +6115,8 @@ spa_sync(spa_t *spa, uint64_t txg) } dmu_tx_commit(tx); + VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); + /* * Clear the dirty config list. */ diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index 30681b6464..a254c8d656 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -26,6 +26,7 @@ #include <sys/zfs_context.h> #include <sys/spa_impl.h> +#include <sys/spa_boot.h> #include <sys/zio.h> #include <sys/zio_checksum.h> #include <sys/zio_compress.h> @@ -249,6 +250,26 @@ int zfs_flags = 0; */ int zfs_recover = 0; +extern int zfs_txg_synctime_ms; + +/* + * Expiration time in units of zfs_txg_synctime_ms. This value has two + * meanings. First it is used to determine when the spa_deadman logic + * should fire. By default the spa_deadman will fire if spa_sync has + * not completed in 1000 * zfs_txg_synctime_ms (i.e. 1000 seconds). + * Secondly, the value determines if an I/O is considered "hung". + * Any I/O that has not completed in zfs_deadman_synctime is considered + * "hung" resulting in a system panic. + * 1000 zfs_txg_synctime_ms (i.e. 1000 seconds). + */ +uint64_t zfs_deadman_synctime = 1000ULL; + +/* + * Override the zfs deadman behavior via /etc/system. By default the + * deadman is enabled except on VMware and sparc deployments. + */ +int zfs_deadman_enabled = -1; + /* * ========================================================================== @@ -418,6 +439,23 @@ spa_lookup(const char *name) } /* + * Fires when spa_sync has not completed within zfs_deadman_synctime. + * If the zfs_deadman_enabled flag is set then it inspects all vdev queues + * looking for potentially hung I/Os. + */ +void +spa_deadman(void *arg) +{ + spa_t *spa = arg; + + zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu", + (gethrtime() - spa->spa_sync_starttime) / NANOSEC, + ++spa->spa_deadman_calls); + if (zfs_deadman_enabled) + vdev_deadman(spa->spa_root_vdev); +} + +/* * Create an uninitialized spa_t with the given name. Requires * spa_namespace_lock. The caller must ensure that the spa_t doesn't already * exist by calling spa_lookup() first. 
@@ -427,6 +465,8 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) { spa_t *spa; spa_config_dirent_t *dp; + cyc_handler_t hdlr; + cyc_time_t when; ASSERT(MUTEX_HELD(&spa_namespace_lock)); @@ -458,6 +498,25 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_proc = &p0; spa->spa_proc_state = SPA_PROC_NONE; + hdlr.cyh_func = spa_deadman; + hdlr.cyh_arg = spa; + hdlr.cyh_level = CY_LOW_LEVEL; + + spa->spa_deadman_synctime = zfs_deadman_synctime * + zfs_txg_synctime_ms * MICROSEC; + + /* + * This determines how often we need to check for hung I/Os after + * the cyclic has already fired. Since checking for hung I/Os is + * an expensive operation we don't want to check too frequently. + * Instead wait for 5 synctimes before checking again. + */ + when.cyt_interval = 5ULL * zfs_txg_synctime_ms * MICROSEC; + when.cyt_when = CY_INFINITY; + mutex_enter(&cpu_lock); + spa->spa_deadman_cycid = cyclic_add(&hdlr, &when); + mutex_exit(&cpu_lock); + refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); @@ -540,6 +599,12 @@ spa_remove(spa_t *spa) nvlist_free(spa->spa_load_info); spa_config_set(spa, NULL); + mutex_enter(&cpu_lock); + if (spa->spa_deadman_cycid != CYCLIC_NONE) + cyclic_remove(spa->spa_deadman_cycid); + mutex_exit(&cpu_lock); + spa->spa_deadman_cycid = CYCLIC_NONE; + refcount_destroy(&spa->spa_refcount); spa_config_lock_destroy(spa); @@ -1507,6 +1572,12 @@ spa_prev_software_version(spa_t *spa) } uint64_t +spa_deadman_synctime(spa_t *spa) +{ + return (spa->spa_deadman_synctime); +} + +uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva) { uint64_t asize = DVA_GET_ASIZE(dva); @@ -1600,7 +1671,9 @@ spa_init(int mode) spa_mode_global = mode; -#ifndef _KERNEL +#ifdef _KERNEL + spa_arch_init(); +#else if (spa_mode_global != FREAD && dprintf_find_string("watch")) { arc_procfd = open("/proc/self/ctl", O_WRONLY); if (arc_procfd == -1) { diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h index 28dbc57275..b109dcafbc 100644 --- a/usr/src/uts/common/fs/zfs/sys/arc.h +++ b/usr/src/uts/common/fs/zfs/sys/arc.h @@ -99,6 +99,7 @@ int arc_released(arc_buf_t *buf); int arc_has_callback(arc_buf_t *buf); void arc_buf_freeze(arc_buf_t *buf); void arc_buf_thaw(arc_buf_t *buf); +boolean_t arc_buf_eviction_needed(arc_buf_t *buf); #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf); #endif diff --git a/usr/src/uts/common/fs/zfs/sys/sa_impl.h b/usr/src/uts/common/fs/zfs/sys/sa_impl.h index 6661e47cfc..8ae05ce364 100644 --- a/usr/src/uts/common/fs/zfs/sys/sa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/sa_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #ifndef _SYS_SA_IMPL_H @@ -181,7 +182,7 @@ typedef struct sa_hdr_phys { */ #define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10) -#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 16, 3, 0) +#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 6, 3, 0) #define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \ { \ BF32_SET_SB(x, 10, 6, 3, 0, size); \ } diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index 1043f4038a..172a9f141e 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -604,6 +604,7 @@ extern boolean_t spa_suspended(spa_t *spa); extern uint64_t spa_bootfs(spa_t *spa); extern uint64_t spa_delegation(spa_t *spa); extern objset_t *spa_meta_objset(spa_t *spa); +extern uint64_t spa_deadman_synctime(spa_t *spa); /* Miscellaneous support routines */ extern void spa_activate_mos_feature(spa_t *spa, const char *feature); diff --git a/usr/src/uts/common/fs/zfs/sys/spa_boot.h b/usr/src/uts/common/fs/zfs/sys/spa_boot.h index 1d3622f5a1..8df5072a55 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_boot.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_boot.h @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + #ifndef _SYS_SPA_BOOT_H #define _SYS_SPA_BOOT_H @@ -35,6 +39,8 @@ extern "C" { extern char *spa_get_bootprop(char *prop); extern void spa_free_bootprop(char *prop); +extern void spa_arch_init(void); + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h index 027832e858..42ce5556d3 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h @@ -227,6 +227,10 @@ struct spa { uint64_t spa_feat_for_write_obj; /* required to write to pool */ uint64_t spa_feat_for_read_obj; /* required to read from pool */ uint64_t spa_feat_desc_obj; /* Feature descriptions */ + cyclic_id_t spa_deadman_cycid; /* cyclic id */ + uint64_t spa_deadman_calls; /* number of deadman calls */ + uint64_t spa_sync_starttime; /* starting time of spa_sync */ + uint64_t spa_deadman_synctime; /* deadman expiration timer */ /* * spa_refcnt & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h index 7e34889b61..5a7836612b 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h @@ -79,6 +79,7 @@ extern void vdev_metaslab_fini(vdev_t *vd); extern void vdev_metaslab_set_size(vdev_t *); extern void vdev_expand(vdev_t *vd, uint64_t txg); extern void vdev_split(vdev_t *vd); +extern void vdev_deadman(vdev_t *vd); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index c772d954bb..e4c02bde1d 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -105,6 +105,8 @@ struct vdev_queue { avl_tree_t vq_write_tree; avl_tree_t vq_pending_tree; zoneid_t vq_last_zone_id; + uint64_t vq_io_complete_ts; + uint64_t vq_io_delta_ts; kmutex_t vq_lock; }; @@ -321,6 +323,14 @@ extern void vdev_set_min_asize(vdev_t *vd); */ extern int zfs_vdev_cache_size; +/* + * The vdev_buf_t is used to translate between zio_t and buf_t, and back again. 
+ */ +typedef struct vdev_buf { + buf_t vb_buf; /* buffer that describes the io */ + zio_t *vb_io; /* pointer back to the original zio_t */ +} vdev_buf_t; + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_context.h b/usr/src/uts/common/fs/zfs/sys/zfs_context.h index fdd0412fee..0dc8d8859c 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_context.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_context.h @@ -22,8 +22,10 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ + /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ #ifndef _SYS_ZFS_CONTEXT_H @@ -67,6 +69,7 @@ extern "C" { #include <sys/sysevent/dev.h> #include <sys/fm/util.h> #include <sys/sunddi.h> +#include <sys/cyclic.h> #define CPU_SEQID (CPU->cpu_seqid) diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h index 4d781ad2a4..86e901be0d 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -240,12 +240,24 @@ typedef struct zinject_record { uint32_t zi_iotype; int32_t zi_duration; uint64_t zi_timer; + uint32_t zi_cmd; + uint32_t zi_pad; } zinject_record_t; #define ZINJECT_NULL 0x1 #define ZINJECT_FLUSH_ARC 0x2 #define ZINJECT_UNLOAD_SPA 0x4 +typedef enum zinject_type { + ZINJECT_UNINITIALIZED, + ZINJECT_DATA_FAULT, + ZINJECT_DEVICE_FAULT, + ZINJECT_LABEL_FAULT, + ZINJECT_IGNORED_WRITES, + ZINJECT_PANIC, + ZINJECT_DELAY_IO, +} zinject_type_t; + typedef struct zfs_share { uint64_t z_exportdata; uint64_t z_sharedata; diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index ce3a983d9f..9c718f691a 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -21,8 +21,6 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - */ -/* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2011 Joyent, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. @@ -406,6 +404,7 @@ struct zio { uint64_t io_offset; uint64_t io_deadline; + uint64_t io_timestamp; avl_node_t io_offset_node; avl_node_t io_deadline_node; avl_tree_t *io_vdev_tree; @@ -554,6 +553,7 @@ extern int zio_handle_fault_injection(zio_t *zio, int error); extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); extern int zio_handle_label_injection(zio_t *zio, int error); extern void zio_handle_ignored_writes(zio_t *zio); +extern uint64_t zio_handle_io_delay(zio_t *zio); /* * Checksum ereport functions diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index fa0a579e66..18180ecad3 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -3153,3 +3153,41 @@ vdev_split(vdev_t *vd) } vdev_propagate_state(cvd); } + +void +vdev_deadman(vdev_t *vd) +{ + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + vdev_deadman(cvd); + } + + if (vd->vdev_ops->vdev_op_leaf) { + vdev_queue_t *vq = &vd->vdev_queue; + + mutex_enter(&vq->vq_lock); + if (avl_numnodes(&vq->vq_pending_tree) > 0) { + spa_t *spa = vd->vdev_spa; + zio_t *fio; + uint64_t delta; + + /* + * Look at the head of all the pending queues, + * if any I/O has been outstanding for longer than + * the spa_deadman_synctime we panic the system. 
+ */ + fio = avl_first(&vq->vq_pending_tree); + delta = ddi_get_lbolt64() - fio->io_timestamp; + if (delta > NSEC_TO_TICK(spa_deadman_synctime(spa))) { + zfs_dbgmsg("SLOW IO: zio timestamp %llu, " + "delta %llu, last io %llu", + fio->io_timestamp, delta, + vq->vq_io_complete_ts); + fm_panic("I/O to pool '%s' appears to be " + "hung.", spa_name(spa)); + } + } + mutex_exit(&vq->vq_lock); + } +} diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index 1ba343226f..dfadeca9d4 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -42,11 +42,6 @@ extern ldi_ident_t zfs_li; -typedef struct vdev_disk_buf { - buf_t vdb_buf; - zio_t *vdb_io; -} vdev_disk_buf_t; - static void vdev_disk_hold(vdev_t *vd) { @@ -170,7 +165,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, /* * When opening a disk device, we want to preserve the user's original * intent. We always want to open the device by the path the user gave - * us, even if it is one of multiple paths to the save device. But we + * us, even if it is one of multiple paths to the same device. But we * also want to be able to survive disks being removed/recabled. * Therefore the sequence of opening devices is: * @@ -416,8 +411,8 @@ vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data, static void vdev_disk_io_intr(buf_t *bp) { - vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp; - zio_t *zio = vdb->vdb_io; + vdev_buf_t *vb = (vdev_buf_t *)bp; + zio_t *zio = vb->vb_io; /* * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO. @@ -429,7 +424,7 @@ vdev_disk_io_intr(buf_t *bp) if (zio->io_error == 0 && bp->b_resid != 0) zio->io_error = EIO; - kmem_free(vdb, sizeof (vdev_disk_buf_t)); + kmem_free(vb, sizeof (vdev_buf_t)); zio_interrupt(zio); } @@ -460,7 +455,7 @@ vdev_disk_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_disk_t *dvd = vd->vdev_tsd; - vdev_disk_buf_t *vdb; + vdev_buf_t *vb; struct dk_callback *dkc; buf_t *bp; int error; @@ -524,10 +519,10 @@ vdev_disk_io_start(zio_t *zio) return (ZIO_PIPELINE_CONTINUE); } - vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP); + vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP); - vdb->vdb_io = zio; - bp = &vdb->vdb_buf; + vb->vb_io = zio; + bp = &vb->vb_buf; bioinit(bp); bp->b_flags = B_BUSY | B_NOCACHE | diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c index 043fa51294..1fbce5e542 100644 --- a/usr/src/uts/common/fs/zfs/vdev_file.c +++ b/usr/src/uts/common/fs/zfs/vdev_file.c @@ -25,6 +25,7 @@ #include <sys/zfs_context.h> #include <sys/spa.h> +#include <sys/spa_impl.h> #include <sys/vdev_file.h> #include <sys/vdev_impl.h> #include <sys/zio.h> @@ -140,12 +141,55 @@ vdev_file_close(vdev_t *vd) vd->vdev_tsd = NULL; } +/* + * Implements the interrupt side for file vdev types. This routine will be + * called when the I/O completes allowing us to transfer the I/O to the + * interrupt taskqs. For consistency, the code structure mimics disk vdev + * types. + */ +static void +vdev_file_io_intr(buf_t *bp) +{ + vdev_buf_t *vb = (vdev_buf_t *)bp; + zio_t *zio = vb->vb_io; + + zio->io_error = (geterror(bp) != 0 ? EIO : 0); + if (zio->io_error == 0 && bp->b_resid != 0) + zio->io_error = ENOSPC; + + kmem_free(vb, sizeof (vdev_buf_t)); + zio_interrupt(zio); +} + +static void +vdev_file_io_strategy(void *arg) +{ + buf_t *bp = arg; + vnode_t *vp = bp->b_private; + ssize_t resid; + int error; + + error = vn_rdwr((bp->b_flags & B_READ) ? 
UIO_READ : UIO_WRITE, + vp, bp->b_un.b_addr, bp->b_bcount, ldbtob(bp->b_lblkno), + UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + + if (error == 0) { + bp->b_resid = resid; + biodone(bp); + } else { + bioerror(bp, error); + biodone(bp); + } +} + static int vdev_file_io_start(zio_t *zio) { + spa_t *spa = zio->io_spa; vdev_t *vd = zio->io_vd; vdev_file_t *vf = vd->vdev_tsd; - ssize_t resid; + vdev_buf_t *vb; + buf_t *bp; if (zio->io_type == ZIO_TYPE_IOCTL) { /* XXPOLICY */ @@ -166,15 +210,22 @@ vdev_file_io_start(zio_t *zio) return (ZIO_PIPELINE_CONTINUE); } - zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? - UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data, - zio->io_size, zio->io_offset, UIO_SYSSPACE, - 0, RLIM64_INFINITY, kcred, &resid); + vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP); - if (resid != 0 && zio->io_error == 0) - zio->io_error = ENOSPC; + vb->vb_io = zio; + bp = &vb->vb_buf; - zio_interrupt(zio); + bioinit(bp); + bp->b_flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); + bp->b_bcount = zio->io_size; + bp->b_un.b_addr = zio->io_data; + bp->b_lblkno = lbtodb(zio->io_offset); + bp->b_bufsize = zio->io_size; + bp->b_private = vf->vf_vnode; + bp->b_iodone = (int (*)())vdev_file_io_intr; + + taskq_dispatch_ent(spa->spa_zio_taskq[ZIO_TYPE_FREE][ZIO_TASKQ_ISSUE], + vdev_file_io_strategy, bp, 0, &zio->io_tqent); return (ZIO_PIPELINE_STOP); } diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index 4ea958a9f6..8dec283fee 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -24,6 +24,10 @@ * Copyright (c) 2011, Joyent, Inc. All rights reserved. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + #include <sys/zfs_context.h> #include <sys/vdev_impl.h> #include <sys/zio.h> @@ -298,6 +302,7 @@ again: zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); + aio->io_timestamp = fio->io_timestamp; nio = fio; do { @@ -369,7 +374,8 @@ vdev_queue_io(zio_t *zio) mutex_enter(&vq->vq_lock); - zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) + + zio->io_timestamp = ddi_get_lbolt64(); + zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) + zio->io_priority; vdev_queue_io_add(vq, zio); @@ -394,10 +400,16 @@ vdev_queue_io_done(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; + if (zio_injection_enabled) + delay(SEC_TO_TICK(zio_handle_io_delay(zio))); + mutex_enter(&vq->vq_lock); avl_remove(&vq->vq_pending_tree, zio); + vq->vq_io_complete_ts = ddi_get_lbolt64(); + vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp; + for (int i = 0; i < zfs_vdev_ramp_rate; i++) { zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending); if (nio == NULL) diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c index 2292f658b3..c7bfbbaec4 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -50,6 +50,7 @@ #include <sys/spa.h> #include <sys/zap.h> #include <sys/sa.h> +#include <sys/sa_impl.h> #include <sys/varargs.h> #include <sys/policy.h> #include <sys/atomic.h> @@ -64,7 +65,6 @@ #include <sys/dnlc.h> #include <sys/dmu_objset.h> #include <sys/spa_boot.h> -#include <sys/sa.h> #include "zfs_comutil.h" int zfsfstype; @@ -578,7 +578,6 @@ static int zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, uint64_t *userp, uint64_t *groupp) { - znode_phys_t *znp = data; int error 
= 0; /* @@ -597,20 +596,18 @@ zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, return (EEXIST); if (bonustype == DMU_OT_ZNODE) { + znode_phys_t *znp = data; *userp = znp->zp_uid; *groupp = znp->zp_gid; } else { int hdrsize; + sa_hdr_phys_t *sap = data; + sa_hdr_phys_t sa = *sap; + boolean_t swap = B_FALSE; ASSERT(bonustype == DMU_OT_SA); - hdrsize = sa_hdrsize(data); - if (hdrsize != 0) { - *userp = *((uint64_t *)((uintptr_t)data + hdrsize + - SA_UID_OFFSET)); - *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + - SA_GID_OFFSET)); - } else { + if (sa.sa_magic == 0) { /* * This should only happen for newly created * files that haven't had the znode data filled @@ -618,6 +615,25 @@ zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, */ *userp = 0; *groupp = 0; + return (0); + } + if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { + sa.sa_magic = SA_MAGIC; + sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); + swap = B_TRUE; + } else { + VERIFY3U(sa.sa_magic, ==, SA_MAGIC); + } + + hdrsize = sa_hdrsize(&sa); + VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); + *userp = *((uint64_t *)((uintptr_t)data + hdrsize + + SA_UID_OFFSET)); + *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + + SA_GID_OFFSET)); + if (swap) { + *userp = BSWAP_64(*userp); + *groupp = BSWAP_64(*groupp); } } return (error); diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c index 0c86cac427..92dc05f4a0 100644 --- a/usr/src/uts/common/fs/zfs/zfs_znode.c +++ b/usr/src/uts/common/fs/zfs/zfs_znode.c @@ -1947,13 +1947,16 @@ zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag) * or not the object is an extended attribute directory. */ static int -zfs_obj_to_pobj(sa_handle_t *hdl, sa_attr_type_t *sa_table, uint64_t *pobjp, - int *is_xattrdir) +zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, + uint64_t *pobjp, int *is_xattrdir) { uint64_t parent; uint64_t pflags; uint64_t mode; + uint64_t parent_mode; sa_bulk_attr_t bulk[3]; + sa_handle_t *sa_hdl; + dmu_buf_t *sa_db; int count = 0; int error; @@ -1967,9 +1970,32 @@ zfs_obj_to_pobj(sa_handle_t *hdl, sa_attr_type_t *sa_table, uint64_t *pobjp, if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) return (error); - *pobjp = parent; + /* + * When a link is removed its parent pointer is not changed and will + * be invalid. There are two cases where a link is removed but the + * file stays around, when it goes to the delete queue and when there + * are additional links. + */ + error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG); + if (error != 0) + return (error); + + error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode)); + zfs_release_sa_handle(sa_hdl, sa_db, FTAG); + if (error != 0) + return (error); + *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); + /* + * Extended attributes can be applied to files, directories, etc. + * Otherwise the parent must be a directory. 
+ */ + if (!*is_xattrdir && !S_ISDIR(parent_mode)) + return (EINVAL); + + *pobjp = parent; + return (0); } @@ -2018,7 +2044,7 @@ zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, if (prevdb) zfs_release_sa_handle(prevhdl, prevdb, FTAG); - if ((error = zfs_obj_to_pobj(sa_hdl, sa_table, &pobj, + if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, &is_xattrdir)) != 0) break; diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index e2e98b7896..00964aa83f 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -2928,7 +2928,7 @@ zio_done(zio_t *zio) * Hand it off to the otherwise-unused claim taskq. */ ASSERT(zio->io_tqent.tqent_next == NULL); - (void) taskq_dispatch_ent( + taskq_dispatch_ent( spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE], (task_func_t *)zio_reexecute, zio, 0, &zio->io_tqent); diff --git a/usr/src/uts/common/fs/zfs/zio_inject.c b/usr/src/uts/common/fs/zfs/zio_inject.c index 9ae7d1f697..a9d4ab4070 100644 --- a/usr/src/uts/common/fs/zfs/zio_inject.c +++ b/usr/src/uts/common/fs/zfs/zio_inject.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -147,14 +148,8 @@ zio_handle_fault_injection(zio_t *zio, int error) for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { - /* Ignore errors not destined for this pool */ - if (zio->io_spa != handler->zi_spa) - continue; - - /* Ignore device errors and panic injection */ - if (handler->zi_record.zi_guid != 0 || - handler->zi_record.zi_func[0] != '\0' || - handler->zi_record.zi_duration != 0) + if (zio->io_spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT) continue; /* If this handler matches, return EIO */ @@ -197,10 +192,7 @@ zio_handle_label_injection(zio_t *zio, int error) uint64_t start = handler->zi_record.zi_start; uint64_t end = handler->zi_record.zi_end; - /* Ignore device only faults or panic injection */ - if (handler->zi_record.zi_start == 0 || - handler->zi_record.zi_func[0] != '\0' || - handler->zi_record.zi_duration != 0) + if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT) continue; /* @@ -246,13 +238,7 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error) for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { - /* - * Ignore label specific faults, panic injection - * or fake writes - */ - if (handler->zi_record.zi_start != 0 || - handler->zi_record.zi_func[0] != '\0' || - handler->zi_record.zi_duration != 0) + if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT) continue; if (vd->vdev_guid == handler->zi_record.zi_guid) { @@ -316,10 +302,8 @@ zio_handle_ignored_writes(zio_t *zio) handler = list_next(&inject_handlers, handler)) { /* Ignore errors not destined for this pool */ - if (zio->io_spa != handler->zi_spa) - continue; - - if (handler->zi_record.zi_duration == 0) + if (zio->io_spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) continue; /* @@ -355,11 +339,8 @@ spa_handle_ignored_writes(spa_t *spa) for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { - /* Ignore errors not destined for this pool */ - if (spa != handler->zi_spa) - continue; - - if (handler->zi_record.zi_duration == 0) + if (spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) continue; 
if (handler->zi_record.zi_duration > 0) { @@ -379,6 +360,34 @@ spa_handle_ignored_writes(spa_t *spa) rw_exit(&inject_lock); } +uint64_t +zio_handle_io_delay(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + inject_handler_t *handler; + uint64_t seconds = 0; + + if (zio_injection_enabled == 0) + return (0); + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO) + continue; + + if (vd->vdev_guid == handler->zi_record.zi_guid) { + seconds = handler->zi_record.zi_timer; + break; + } + + } + rw_exit(&inject_lock); + return (seconds); +} + /* * Create a new handler for the given record. We add it to the list, adding * a reference to the spa_t in the process. We increment zio_injection_enabled, diff --git a/usr/src/uts/common/io/1394/adapters/hci1394_extern.c b/usr/src/uts/common/io/1394/adapters/hci1394_extern.c index 1da7580b6b..2faf274fb3 100644 --- a/usr/src/uts/common/io/1394/adapters/hci1394_extern.c +++ b/usr/src/uts/common/io/1394/adapters/hci1394_extern.c @@ -24,8 +24,6 @@ * All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * hci1394_extern.c * Central location for externs. There are two exceptions to this, @@ -52,7 +50,7 @@ uint32_t hci1394_split_timeout = 800; /* - * 1394 address map for OpenHCI adpaters. + * 1394 address map for OpenHCI adapters. * * This is what is reported to the services layer. The hci1394 driver does not * modify the HW to reflect this. This should reflect what the OpenHCI 1.0 HW diff --git a/usr/src/uts/common/io/blkdev/blkdev.c b/usr/src/uts/common/io/blkdev/blkdev.c index 3410fad1ec..20e3a5737e 100644 --- a/usr/src/uts/common/io/blkdev/blkdev.c +++ b/usr/src/uts/common/io/blkdev/blkdev.c @@ -20,8 +20,9 @@ */ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2011, 2012 Nexenta Systems, Inc. All rights reserved. + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. * Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved. + * Copyright 2012 Alexey Zaytsev <alexey.zaytsev@gmail.com> All rights reserved. */ #include <sys/types.h> @@ -503,7 +504,7 @@ bd_xfer_ctor(void *buf, void *arg, int kmflag) bd_t *bd = arg; int (*dcb)(caddr_t); - if (kmflag == KM_SLEEP) { + if (kmflag == KM_PUSHPAGE || kmflag == KM_SLEEP) { dcb = DDI_DMA_SLEEP; } else { dcb = DDI_DMA_DONTWAIT; diff --git a/usr/src/uts/common/io/e1000g/e1000g_tx.c b/usr/src/uts/common/io/e1000g/e1000g_tx.c index a696aec5a5..1f8a51d291 100644 --- a/usr/src/uts/common/io/e1000g/e1000g_tx.c +++ b/usr/src/uts/common/io/e1000g/e1000g_tx.c @@ -668,10 +668,12 @@ e1000g_fill_tx_ring(e1000g_tx_ring_t *tx_ring, LIST_DESCRIBER *pending_list, * drivers do not have this issue because they (harmlessly) set the * POPTS field on every data descriptor to be the intended options for * the entire packet. To circumvent this QEMU bug, we engage in this - * same behavior iff our type matches that which is emulated by QEMU - * (the 82540). + * same behavior iff the subsystem vendor and device IDs indicate that + * this is an emulated QEMU device (1af4,1100). 
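+ * (0x1af4 is the Red Hat/Qumranet PCI vendor ID; QEMU stamps its + * emulated devices with that subsystem vendor ID and subsystem device + * ID 0x1100, so real 82540 hardware no longer takes this path.)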
*/ - if (hw->mac.type == e1000_82540 && cur_context->cksum_flags) { + if (hw->subsystem_vendor_id == 0x1af4 && + hw->subsystem_device_id == 0x1100 && + cur_context->cksum_flags) { if (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM) zeroed.upper.fields.popts |= E1000_TXD_POPTS_IXSM; diff --git a/usr/src/uts/common/io/igb/igb_regs.h b/usr/src/uts/common/io/igb/igb_regs.h index a2b2450b67..b554ef6d23 100644 --- a/usr/src/uts/common/io/igb/igb_regs.h +++ b/usr/src/uts/common/io/igb/igb_regs.h @@ -216,7 +216,7 @@ extern "C" { /* Packet Buffer DWORD (_n) */ #define E1000_PBSLAD(_n) (0x03110 + (0x4 * (_n))) #define E1000_TXPBS 0x03404 /* Tx Packet Buffer Size - RW */ -/* Same as TXPBS, renamed for newer adpaters - RW */ +/* Same as TXPBS, renamed for newer adapters - RW */ #define E1000_ITPBS 0x03404 #define E1000_TDFH 0x03410 /* Tx Data FIFO Head - RW */ #define E1000_TDFT 0x03418 /* Tx Data FIFO Tail - RW */ diff --git a/usr/src/uts/common/io/ipw/ipw2100.c b/usr/src/uts/common/io/ipw/ipw2100.c index 2559c64762..8afe91725e 100644 --- a/usr/src/uts/common/io/ipw/ipw2100.c +++ b/usr/src/uts/common/io/ipw/ipw2100.c @@ -1273,7 +1273,7 @@ ipw2100_chip_reset(struct ipw2100_softc *sc) ipw2100_master_stop(sc); /* - * move adatper to DO state + * move adapter to DO state */ tmp = ipw2100_csr_get32(sc, IPW2100_CSR_CTL); ipw2100_csr_put32(sc, IPW2100_CSR_CTL, tmp | IPW2100_CTL_INIT); diff --git a/usr/src/uts/common/io/iwi/ipw2200.h b/usr/src/uts/common/io/iwi/ipw2200.h index b7676ffffa..58c3701ba2 100644 --- a/usr/src/uts/common/io/iwi/ipw2200.h +++ b/usr/src/uts/common/io/iwi/ipw2200.h @@ -38,7 +38,7 @@ extern "C" { #endif /* - * Intel Wireless PRO/2200 mini-pci adpater drier + * Intel Wireless PRO/2200 mini-pci adapter driver * ipw2200.h: common definitions and interface to user land application */ #include <sys/types.h> diff --git a/usr/src/uts/common/io/vioblk/vioblk.c b/usr/src/uts/common/io/vioblk/vioblk.c new file mode 100644 index 0000000000..4d63b7b7ea --- /dev/null +++ b/usr/src/uts/common/io/vioblk/vioblk.c @@ -0,0 +1,1072 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012, Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, Alexey Zaytsev <alexey.zaytsev@gmail.com> + */ + + +#include <sys/modctl.h> +#include <sys/blkdev.h> +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/param.h> +#include <sys/stropts.h> +#include <sys/stream.h> +#include <sys/strsubr.h> +#include <sys/kmem.h> +#include <sys/conf.h> +#include <sys/devops.h> +#include <sys/ksynch.h> +#include <sys/stat.h> +#include <sys/modctl.h> +#include <sys/debug.h> +#include <sys/pci.h> +#include <sys/sysmacros.h> +#include "virtiovar.h" +#include "virtioreg.h" + +/* Feature bits */ +#define VIRTIO_BLK_F_BARRIER (1<<0) +#define VIRTIO_BLK_F_SIZE_MAX (1<<1) +#define VIRTIO_BLK_F_SEG_MAX (1<<2) +#define VIRTIO_BLK_F_GEOMETRY (1<<4) +#define VIRTIO_BLK_F_RO (1<<5) +#define VIRTIO_BLK_F_BLK_SIZE (1<<6) +#define VIRTIO_BLK_F_SCSI (1<<7) +#define VIRTIO_BLK_F_FLUSH (1<<9) +#define VIRTIO_BLK_F_TOPOLOGY (1<<10) + +/* Configuration registers */ +#define VIRTIO_BLK_CONFIG_CAPACITY 0 /* 64bit */ +#define VIRTIO_BLK_CONFIG_SIZE_MAX 8 /* 32bit */ +#define VIRTIO_BLK_CONFIG_SEG_MAX 12 /* 32bit */ +#define VIRTIO_BLK_CONFIG_GEOMETRY_C 16 /* 16bit */ +#define VIRTIO_BLK_CONFIG_GEOMETRY_H 18 /* 8bit */ +#define VIRTIO_BLK_CONFIG_GEOMETRY_S 19 /* 8bit */ +#define VIRTIO_BLK_CONFIG_BLK_SIZE 20 /* 32bit */ +#define VIRTIO_BLK_CONFIG_TOPOLOGY 24 /* 32bit */ + +/* Command */ +#define VIRTIO_BLK_T_IN 0 +#define VIRTIO_BLK_T_OUT 1 +#define VIRTIO_BLK_T_SCSI_CMD 2 +#define VIRTIO_BLK_T_SCSI_CMD_OUT 3 +#define VIRTIO_BLK_T_FLUSH 4 +#define VIRTIO_BLK_T_FLUSH_OUT 5 +#define VIRTIO_BLK_T_GET_ID 8 +#define VIRTIO_BLK_T_BARRIER 0x80000000 + +#define VIRTIO_BLK_ID_BYTES 20 /* devid */ + +/* Statuses */ +#define VIRTIO_BLK_S_OK 0 +#define VIRTIO_BLK_S_IOERR 1 +#define VIRTIO_BLK_S_UNSUPP 2 + +#define DEF_MAXINDIRECT (128) +#define DEF_MAXSECTOR (4096) + +#define VIOBLK_POISON 0xdead0001dead0001 + +/* + * Static Variables. 
+ */ +static char vioblk_ident[] = "VirtIO block driver"; + +/* Request header structure */ +struct vioblk_req_hdr { + uint32_t type; /* VIRTIO_BLK_T_* */ + uint32_t ioprio; + uint64_t sector; +}; + +struct vioblk_req { + struct vioblk_req_hdr hdr; + uint8_t status; + uint8_t unused[3]; + unsigned int ndmac; + ddi_dma_handle_t dmah; + ddi_dma_handle_t bd_dmah; + ddi_dma_cookie_t dmac; + bd_xfer_t *xfer; +}; + +struct vioblk_stats { + struct kstat_named sts_rw_outofmemory; + struct kstat_named sts_rw_badoffset; + struct kstat_named sts_rw_queuemax; + struct kstat_named sts_rw_cookiesmax; + struct kstat_named sts_rw_cacheflush; + struct kstat_named sts_intr_queuemax; + struct kstat_named sts_intr_total; + struct kstat_named sts_io_errors; + struct kstat_named sts_unsupp_errors; + struct kstat_named sts_nxio_errors; +}; + +struct vioblk_lstats { + uint64_t rw_cacheflush; + uint64_t intr_total; + unsigned int rw_cookiesmax; + unsigned int intr_queuemax; + unsigned int io_errors; + unsigned int unsupp_errors; + unsigned int nxio_errors; +}; + +struct vioblk_softc { + dev_info_t *sc_dev; /* mirrors virtio_softc->sc_dev */ + struct virtio_softc sc_virtio; + struct virtqueue *sc_vq; + bd_handle_t bd_h; + struct vioblk_req *sc_reqs; + struct vioblk_stats *ks_data; + kstat_t *sc_intrstat; + uint64_t sc_capacity; + uint64_t sc_nblks; + struct vioblk_lstats sc_stats; + short sc_blkflags; + boolean_t sc_in_poll_mode; + boolean_t sc_readonly; + int sc_blk_size; + int sc_seg_max; + int sc_seg_size_max; + kmutex_t lock_devid; + kcondvar_t cv_devid; + char devid[VIRTIO_BLK_ID_BYTES + 1]; +}; + +static int vioblk_read(void *arg, bd_xfer_t *xfer); +static int vioblk_write(void *arg, bd_xfer_t *xfer); +static int vioblk_flush(void *arg, bd_xfer_t *xfer); +static void vioblk_driveinfo(void *arg, bd_drive_t *drive); +static int vioblk_mediainfo(void *arg, bd_media_t *media); +static int vioblk_devid_init(void *, dev_info_t *, ddi_devid_t *); +uint_t vioblk_int_handler(caddr_t arg1, caddr_t arg2); + +static bd_ops_t vioblk_ops = { + BD_OPS_VERSION_0, + vioblk_driveinfo, + vioblk_mediainfo, + vioblk_devid_init, + vioblk_flush, + vioblk_read, + vioblk_write, +}; + +static int vioblk_quiesce(dev_info_t *); +static int vioblk_attach(dev_info_t *, ddi_attach_cmd_t); +static int vioblk_detach(dev_info_t *, ddi_detach_cmd_t); + +static struct dev_ops vioblk_dev_ops = { + DEVO_REV, + 0, + ddi_no_info, + nulldev, /* identify */ + nulldev, /* probe */ + vioblk_attach, /* attach */ + vioblk_detach, /* detach */ + nodev, /* reset */ + NULL, /* cb_ops */ + NULL, /* bus_ops */ + NULL, /* power */ + vioblk_quiesce /* quiesce */ +}; + + + +/* Standard module linkage initialization for a block driver */ +extern struct mod_ops mod_driverops; + +static struct modldrv modldrv = { + &mod_driverops, /* Type of module. This one is a driver */ + vioblk_ident, /* short description */ + &vioblk_dev_ops /* driver specific ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + { + (void *)&modldrv, + NULL, + }, +}; + +ddi_device_acc_attr_t vioblk_attr = { + DDI_DEVICE_ATTR_V0, + DDI_NEVERSWAP_ACC, /* virtio is always native byte order */ + DDI_STORECACHING_OK_ACC, + DDI_DEFAULT_ACC +}; + +/* DMA attr for the header/status blocks.
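+ * The header and the one-byte status that follows it are bound with + * a single cookie (dma_attr_sgllen == 1), which is what allows + * vioblk_rw() to address the status byte at dmac_laddress + + * sizeof (struct vioblk_req_hdr).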
*/ +static ddi_dma_attr_t vioblk_req_dma_attr = { + DMA_ATTR_V0, /* dma_attr version */ + 0, /* dma_attr_addr_lo */ + 0xFFFFFFFFFFFFFFFFull, /* dma_attr_addr_hi */ + 0x00000000FFFFFFFFull, /* dma_attr_count_max */ + 1, /* dma_attr_align */ + 1, /* dma_attr_burstsizes */ + 1, /* dma_attr_minxfer */ + 0xFFFFFFFFull, /* dma_attr_maxxfer */ + 0xFFFFFFFFFFFFFFFFull, /* dma_attr_seg */ + 1, /* dma_attr_sgllen */ + 1, /* dma_attr_granular */ + 0, /* dma_attr_flags */ +}; + +/* DMA attr for the data blocks. */ +static ddi_dma_attr_t vioblk_bd_dma_attr = { + DMA_ATTR_V0, /* dma_attr version */ + 0, /* dma_attr_addr_lo */ + 0xFFFFFFFFFFFFFFFFull, /* dma_attr_addr_hi */ + 0x00000000FFFFFFFFull, /* dma_attr_count_max */ + 1, /* dma_attr_align */ + 1, /* dma_attr_burstsizes */ + 1, /* dma_attr_minxfer */ + 0, /* dma_attr_maxxfer, set in attach */ + 0xFFFFFFFFFFFFFFFFull, /* dma_attr_seg */ + 0, /* dma_attr_sgllen, set in attach */ + 1, /* dma_attr_granular */ + 0, /* dma_attr_flags */ +}; + +static int +vioblk_rw(struct vioblk_softc *sc, bd_xfer_t *xfer, int type, + uint32_t len) +{ + struct vioblk_req *req; + struct vq_entry *ve_hdr; + int total_cookies, write; + + write = (type == VIRTIO_BLK_T_OUT || + type == VIRTIO_BLK_T_FLUSH_OUT) ? 1 : 0; + total_cookies = 2; + + if ((xfer->x_blkno + xfer->x_nblks) > sc->sc_nblks) { + sc->ks_data->sts_rw_badoffset.value.ui64++; + return (EINVAL); + } + + /* allocate top entry */ + ve_hdr = vq_alloc_entry(sc->sc_vq); + if (!ve_hdr) { + sc->ks_data->sts_rw_outofmemory.value.ui64++; + return (ENOMEM); + } + + /* getting request */ + req = &sc->sc_reqs[ve_hdr->qe_index]; + req->hdr.type = type; + req->hdr.ioprio = 0; + req->hdr.sector = xfer->x_blkno; + req->xfer = xfer; + + /* Header */ + virtio_ve_add_indirect_buf(ve_hdr, req->dmac.dmac_laddress, + sizeof (struct vioblk_req_hdr), B_TRUE); + + /* Payload */ + if (len > 0) { + virtio_ve_add_cookie(ve_hdr, xfer->x_dmah, xfer->x_dmac, + xfer->x_ndmac, write ? B_TRUE : B_FALSE); + total_cookies += xfer->x_ndmac; + } + + /* Status */ + virtio_ve_add_indirect_buf(ve_hdr, + req->dmac.dmac_laddress + sizeof (struct vioblk_req_hdr), + sizeof (uint8_t), B_FALSE); + + /* sending the whole chain to the device */ + virtio_push_chain(ve_hdr, B_TRUE); + + if (sc->sc_stats.rw_cookiesmax < total_cookies) + sc->sc_stats.rw_cookiesmax = total_cookies; + + return (DDI_SUCCESS); +} + +/* + * Now in polling mode. Interrupts are off, so we + * 1) poll for the already queued requests to complete. + * 2) push our request. + * 3) wait for our request to complete. + */ +static int +vioblk_rw_poll(struct vioblk_softc *sc, bd_xfer_t *xfer, + int type, uint32_t len) +{ + clock_t tmout; + int ret; + + ASSERT(xfer->x_flags & BD_XFER_POLL); + + /* Prevent a hard hang. */ + tmout = drv_usectohz(30000000); + + /* Poll for an empty queue */ + while (vq_num_used(sc->sc_vq)) { + /* Check if any pending requests completed. */ + ret = vioblk_int_handler((caddr_t)&sc->sc_virtio, NULL); + if (ret != DDI_INTR_CLAIMED) { + drv_usecwait(10); + tmout -= 10; + return (ETIMEDOUT); + } + } + + ret = vioblk_rw(sc, xfer, type, len); + if (ret) + return (ret); + + tmout = drv_usectohz(30000000); + /* Poll for an empty queue again. */ + while (vq_num_used(sc->sc_vq)) { + /* Check if any pending requests completed. 
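+ * vioblk_int_handler() reaps every completed descriptor off the + * used ring, including our own request, so an empty queue here + * means our I/O has finished.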
*/ + ret = vioblk_int_handler((caddr_t)&sc->sc_virtio, NULL); + if (ret != DDI_INTR_CLAIMED) { + drv_usecwait(10); + tmout -= 10; + return (ETIMEDOUT); + } + } + + return (DDI_SUCCESS); +} + +static int +vioblk_read(void *arg, bd_xfer_t *xfer) +{ + int ret; + struct vioblk_softc *sc = (void *)arg; + + if (xfer->x_flags & BD_XFER_POLL) { + if (!sc->sc_in_poll_mode) { + virtio_stop_vq_intr(sc->sc_vq); + sc->sc_in_poll_mode = 1; + } + + ret = vioblk_rw_poll(sc, xfer, VIRTIO_BLK_T_IN, + xfer->x_nblks * DEV_BSIZE); + } else { + if (sc->sc_in_poll_mode) { + virtio_start_vq_intr(sc->sc_vq); + sc->sc_in_poll_mode = 0; + } + + ret = vioblk_rw(sc, xfer, VIRTIO_BLK_T_IN, + xfer->x_nblks * DEV_BSIZE); + } + + return (ret); +} + +static int +vioblk_write(void *arg, bd_xfer_t *xfer) +{ + int ret; + struct vioblk_softc *sc = (void *)arg; + + if (xfer->x_flags & BD_XFER_POLL) { + if (!sc->sc_in_poll_mode) { + virtio_stop_vq_intr(sc->sc_vq); + sc->sc_in_poll_mode = 1; + } + + ret = vioblk_rw_poll(sc, xfer, VIRTIO_BLK_T_OUT, + xfer->x_nblks * DEV_BSIZE); + } else { + if (sc->sc_in_poll_mode) { + virtio_start_vq_intr(sc->sc_vq); + sc->sc_in_poll_mode = 0; + } + + ret = vioblk_rw(sc, xfer, VIRTIO_BLK_T_OUT, + xfer->x_nblks * DEV_BSIZE); + } + return (ret); +} + +static int +vioblk_flush(void *arg, bd_xfer_t *xfer) +{ + int ret; + struct vioblk_softc *sc = (void *)arg; + + ASSERT((xfer->x_flags & BD_XFER_POLL) == 0); + + ret = vioblk_rw(sc, xfer, VIRTIO_BLK_T_FLUSH_OUT, + xfer->x_nblks * DEV_BSIZE); + + if (!ret) + sc->sc_stats.rw_cacheflush++; + + return (ret); +} + + +static void +vioblk_driveinfo(void *arg, bd_drive_t *drive) +{ + struct vioblk_softc *sc = (void *)arg; + + drive->d_qsize = sc->sc_vq->vq_num; + drive->d_removable = B_FALSE; + drive->d_hotpluggable = B_TRUE; + drive->d_target = 0; + drive->d_lun = 0; +} + +static int +vioblk_mediainfo(void *arg, bd_media_t *media) +{ + struct vioblk_softc *sc = (void *)arg; + + media->m_nblks = sc->sc_nblks; + media->m_blksize = DEV_BSIZE; + media->m_readonly = sc->sc_readonly; + return (0); +} + +static int +vioblk_devid_init(void *arg, dev_info_t *devinfo, ddi_devid_t *devid) +{ + struct vioblk_softc *sc = (void *)arg; + clock_t deadline; + int ret; + bd_xfer_t xfer; + + deadline = ddi_get_lbolt() + (clock_t)drv_usectohz(3 * 1000000); + (void) memset(&xfer, 0, sizeof (bd_xfer_t)); + xfer.x_nblks = 1; + + ret = ddi_dma_alloc_handle(sc->sc_dev, &vioblk_bd_dma_attr, + DDI_DMA_SLEEP, NULL, &xfer.x_dmah); + if (ret != DDI_SUCCESS) + goto out_alloc; + + ret = ddi_dma_addr_bind_handle(xfer.x_dmah, NULL, (caddr_t)&sc->devid, + VIRTIO_BLK_ID_BYTES, DDI_DMA_READ | DDI_DMA_CONSISTENT, + DDI_DMA_SLEEP, NULL, &xfer.x_dmac, &xfer.x_ndmac); + if (ret != DDI_DMA_MAPPED) { + ret = DDI_FAILURE; + goto out_map; + } + + mutex_enter(&sc->lock_devid); + + ret = vioblk_rw(sc, &xfer, VIRTIO_BLK_T_GET_ID, + VIRTIO_BLK_ID_BYTES); + if (ret) { + mutex_exit(&sc->lock_devid); + goto out_rw; + } + + /* wait for reply */ + ret = cv_timedwait(&sc->cv_devid, &sc->lock_devid, deadline); + mutex_exit(&sc->lock_devid); + + (void) ddi_dma_unbind_handle(xfer.x_dmah); + ddi_dma_free_handle(&xfer.x_dmah); + + /* timeout */ + if (ret < 0) { + dev_err(devinfo, CE_WARN, "Cannot get devid from the device"); + return (DDI_FAILURE); + } + + ret = ddi_devid_init(devinfo, DEVID_ATA_SERIAL, + VIRTIO_BLK_ID_BYTES, sc->devid, devid); + if (ret != DDI_SUCCESS) { + dev_err(devinfo, CE_WARN, "Cannot build devid from the device"); + return (ret); + } + + dev_debug(sc->sc_dev, CE_NOTE, + "devid 
%x%x%x%x%x%x%x%x%x%x%x%x%x%x%x%x%x%x%x%x", + sc->devid[0], sc->devid[1], sc->devid[2], sc->devid[3], + sc->devid[4], sc->devid[5], sc->devid[6], sc->devid[7], + sc->devid[8], sc->devid[9], sc->devid[10], sc->devid[11], + sc->devid[12], sc->devid[13], sc->devid[14], sc->devid[15], + sc->devid[16], sc->devid[17], sc->devid[18], sc->devid[19]); + + return (0); + +out_rw: + (void) ddi_dma_unbind_handle(xfer.x_dmah); +out_map: + ddi_dma_free_handle(&xfer.x_dmah); +out_alloc: + return (ret); +} + +static void +vioblk_show_features(struct vioblk_softc *sc, const char *prefix, + uint32_t features) +{ + char buf[512]; + char *bufp = buf; + char *bufend = buf + sizeof (buf); + + /* LINTED E_PTRDIFF_OVERFLOW */ + bufp += snprintf(bufp, bufend - bufp, prefix); + + /* LINTED E_PTRDIFF_OVERFLOW */ + bufp += virtio_show_features(features, bufp, bufend - bufp); + + + /* LINTED E_PTRDIFF_OVERFLOW */ + bufp += snprintf(bufp, bufend - bufp, "Vioblk ( "); + + if (features & VIRTIO_BLK_F_BARRIER) + /* LINTED E_PTRDIFF_OVERFLOW */ + bufp += snprintf(bufp, bufend - bufp, "BARRIER "); + if (features & VIRTIO_BLK_F_SIZE_MAX) + /* LINTED E_PTRDIFF_OVERFLOW */ + bufp += snprintf(bufp, bufend - bufp, "SIZE_MAX "); + if (features & VIRTIO_BLK_F_SEG_MAX) + /* LINTED E_PTRDIFF_OVERFLOW */ + bufp += snprintf(bufp, bufend - bufp, "SEG_MAX "); + if (features & VIRTIO_BLK_F_GEOMETRY) + /* LINTED E_PTRDIFF_OVERFLOW */ + bufp += snprintf(bufp, bufend - bufp, "GEOMETRY "); + if (features & VIRTIO_BLK_F_RO) + /* LINTED E_PTRDIFF_OVERFLOW */ + bufp += snprintf(bufp, bufend - bufp, "RO "); + if (features & VIRTIO_BLK_F_BLK_SIZE) + /* LINTED E_PTRDIFF_OVERFLOW */ + bufp += snprintf(bufp, bufend - bufp, "BLK_SIZE "); + if (features & VIRTIO_BLK_F_SCSI) + /* LINTED E_PTRDIFF_OVERFLOW */ + bufp += snprintf(bufp, bufend - bufp, "SCSI "); + if (features & VIRTIO_BLK_F_FLUSH) + /* LINTED E_PTRDIFF_OVERFLOW */ + bufp += snprintf(bufp, bufend - bufp, "FLUSH "); + if (features & VIRTIO_BLK_F_TOPOLOGY) + /* LINTED E_PTRDIFF_OVERFLOW */ + bufp += snprintf(bufp, bufend - bufp, "TOPOLOGY "); + + /* LINTED E_PTRDIFF_OVERFLOW */ + bufp += snprintf(bufp, bufend - bufp, ")"); + *bufp = '\0'; + + dev_debug(sc->sc_dev, CE_NOTE, "%s", buf); +} + +static int +vioblk_dev_features(struct vioblk_softc *sc) +{ + uint32_t host_features; + + host_features = virtio_negotiate_features(&sc->sc_virtio, + VIRTIO_BLK_F_RO | + VIRTIO_BLK_F_GEOMETRY | + VIRTIO_BLK_F_BLK_SIZE | + VIRTIO_BLK_F_FLUSH | + VIRTIO_BLK_F_SEG_MAX | + VIRTIO_BLK_F_SIZE_MAX | + VIRTIO_F_RING_INDIRECT_DESC); + + vioblk_show_features(sc, "Host features: ", host_features); + vioblk_show_features(sc, "Negotiated features: ", + sc->sc_virtio.sc_features); + + if (!(sc->sc_virtio.sc_features & VIRTIO_F_RING_INDIRECT_DESC)) { + dev_err(sc->sc_dev, CE_NOTE, + "Host does not support RING_INDIRECT_DESC, bye."); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* ARGSUSED */ +uint_t +vioblk_int_handler(caddr_t arg1, caddr_t arg2) +{ + struct virtio_softc *vsc = (void *)arg1; + struct vioblk_softc *sc = container_of(vsc, + struct vioblk_softc, sc_virtio); + struct vq_entry *ve; + uint32_t len; + int i = 0, error; + + while ((ve = virtio_pull_chain(sc->sc_vq, &len))) { + struct vioblk_req *req = &sc->sc_reqs[ve->qe_index]; + bd_xfer_t *xfer = req->xfer; + uint8_t status = req->status; + uint32_t type = req->hdr.type; + + if (req->xfer == (void *)VIOBLK_POISON) { + dev_err(sc->sc_dev, CE_WARN, "Poisoned descriptor!"); + virtio_free_chain(ve); + return (DDI_INTR_CLAIMED); + } + + req->xfer = 
(void *) VIOBLK_POISON; + + /* Note: blkdev tears down the payload mapping for us. */ + virtio_free_chain(ve); + + /* returning payload back to blkdev */ + switch (status) { + case VIRTIO_BLK_S_OK: + error = 0; + break; + case VIRTIO_BLK_S_IOERR: + error = EIO; + sc->sc_stats.io_errors++; + break; + case VIRTIO_BLK_S_UNSUPP: + sc->sc_stats.unsupp_errors++; + error = ENOTTY; + break; + default: + sc->sc_stats.nxio_errors++; + error = ENXIO; + break; + } + + if (type == VIRTIO_BLK_T_GET_ID) { + /* notify devid_init */ + mutex_enter(&sc->lock_devid); + cv_broadcast(&sc->cv_devid); + mutex_exit(&sc->lock_devid); + } else + bd_xfer_done(xfer, error); + + i++; + } + + /* update stats */ + if (sc->sc_stats.intr_queuemax < i) + sc->sc_stats.intr_queuemax = i; + sc->sc_stats.intr_total++; + + return (DDI_INTR_CLAIMED); +} + +/* ARGSUSED */ +uint_t +vioblk_config_handler(caddr_t arg1, caddr_t arg2) +{ + return (DDI_INTR_CLAIMED); +} + +static int +vioblk_register_ints(struct vioblk_softc *sc) +{ + int ret; + + struct virtio_int_handler vioblk_conf_h = { + vioblk_config_handler + }; + + struct virtio_int_handler vioblk_vq_h[] = { + { vioblk_int_handler }, + { NULL }, + }; + + ret = virtio_register_ints(&sc->sc_virtio, + &vioblk_conf_h, vioblk_vq_h); + + return (ret); +} + +static void +vioblk_free_reqs(struct vioblk_softc *sc) +{ + int i, qsize; + + qsize = sc->sc_vq->vq_num; + + for (i = 0; i < qsize; i++) { + struct vioblk_req *req = &sc->sc_reqs[i]; + + if (req->ndmac) + (void) ddi_dma_unbind_handle(req->dmah); + + if (req->dmah) + ddi_dma_free_handle(&req->dmah); + } + + kmem_free(sc->sc_reqs, sizeof (struct vioblk_req) * qsize); +} + +static int +vioblk_alloc_reqs(struct vioblk_softc *sc) +{ + int i, qsize; + int ret; + + qsize = sc->sc_vq->vq_num; + + sc->sc_reqs = kmem_zalloc(sizeof (struct vioblk_req) * qsize, KM_SLEEP); + + for (i = 0; i < qsize; i++) { + struct vioblk_req *req = &sc->sc_reqs[i]; + + ret = ddi_dma_alloc_handle(sc->sc_dev, &vioblk_req_dma_attr, + DDI_DMA_SLEEP, NULL, &req->dmah); + if (ret != DDI_SUCCESS) { + + dev_err(sc->sc_dev, CE_WARN, + "Can't allocate dma handle for req " + "buffer %d", i); + goto exit; + } + + ret = ddi_dma_addr_bind_handle(req->dmah, NULL, + (caddr_t)&req->hdr, + sizeof (struct vioblk_req_hdr) + sizeof (uint8_t), + DDI_DMA_RDWR | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, + NULL, &req->dmac, &req->ndmac); + if (ret != DDI_DMA_MAPPED) { + dev_err(sc->sc_dev, CE_WARN, + "Can't bind req buffer %d", i); + goto exit; + } + } + + return (0); + +exit: + vioblk_free_reqs(sc); + return (ENOMEM); +} + + +static int +vioblk_ksupdate(kstat_t *ksp, int rw) +{ + struct vioblk_softc *sc = ksp->ks_private; + + if (rw == KSTAT_WRITE) + return (EACCES); + + sc->ks_data->sts_rw_cookiesmax.value.ui32 = sc->sc_stats.rw_cookiesmax; + sc->ks_data->sts_intr_queuemax.value.ui32 = sc->sc_stats.intr_queuemax; + sc->ks_data->sts_unsupp_errors.value.ui32 = sc->sc_stats.unsupp_errors; + sc->ks_data->sts_nxio_errors.value.ui32 = sc->sc_stats.nxio_errors; + sc->ks_data->sts_io_errors.value.ui32 = sc->sc_stats.io_errors; + sc->ks_data->sts_rw_cacheflush.value.ui64 = sc->sc_stats.rw_cacheflush; + sc->ks_data->sts_intr_total.value.ui64 = sc->sc_stats.intr_total; + + + return (0); +} + +static int +vioblk_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd) +{ + int ret = DDI_SUCCESS; + int instance; + struct vioblk_softc *sc; + struct virtio_softc *vsc; + struct vioblk_stats *ks_data; + + instance = ddi_get_instance(devinfo); + + switch (cmd) { + case DDI_ATTACH: + break; + + case DDI_RESUME: + 
case DDI_PM_RESUME: + dev_err(devinfo, CE_WARN, "resume not supported yet"); + ret = DDI_FAILURE; + goto exit; + + default: + dev_err(devinfo, CE_WARN, "cmd 0x%x not recognized", cmd); + ret = DDI_FAILURE; + goto exit; + } + + sc = kmem_zalloc(sizeof (struct vioblk_softc), KM_SLEEP); + ddi_set_driver_private(devinfo, sc); + + vsc = &sc->sc_virtio; + + /* Duplicate for faster access / less typing */ + sc->sc_dev = devinfo; + vsc->sc_dev = devinfo; + + cv_init(&sc->cv_devid, NULL, CV_DRIVER, NULL); + mutex_init(&sc->lock_devid, NULL, MUTEX_DRIVER, NULL); + + /* + * Initialize interrupt kstat. This should not normally fail, since + * we don't use a persistent stat. We do it this way to avoid having + * to test for it at run time on the hot path. + */ + sc->sc_intrstat = kstat_create("vioblk", instance, + "intrs", "controller", KSTAT_TYPE_NAMED, + sizeof (struct vioblk_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_PERSISTENT); + if (sc->sc_intrstat == NULL) { + dev_err(devinfo, CE_WARN, "kstat_create failed"); + goto exit_intrstat; + } + ks_data = (struct vioblk_stats *)sc->sc_intrstat->ks_data; + kstat_named_init(&ks_data->sts_rw_outofmemory, + "total_rw_outofmemory", KSTAT_DATA_UINT64); + kstat_named_init(&ks_data->sts_rw_badoffset, + "total_rw_badoffset", KSTAT_DATA_UINT64); + kstat_named_init(&ks_data->sts_intr_total, + "total_intr", KSTAT_DATA_UINT64); + kstat_named_init(&ks_data->sts_io_errors, + "total_io_errors", KSTAT_DATA_UINT32); + kstat_named_init(&ks_data->sts_unsupp_errors, + "total_unsupp_errors", KSTAT_DATA_UINT32); + kstat_named_init(&ks_data->sts_nxio_errors, + "total_nxio_errors", KSTAT_DATA_UINT32); + kstat_named_init(&ks_data->sts_rw_cacheflush, + "total_rw_cacheflush", KSTAT_DATA_UINT64); + kstat_named_init(&ks_data->sts_rw_cookiesmax, + "max_rw_cookies", KSTAT_DATA_UINT32); + kstat_named_init(&ks_data->sts_intr_queuemax, + "max_intr_queue", KSTAT_DATA_UINT32); + sc->ks_data = ks_data; + sc->sc_intrstat->ks_private = sc; + sc->sc_intrstat->ks_update = vioblk_ksupdate; + kstat_install(sc->sc_intrstat); + + /* map BAR0 */ + ret = ddi_regs_map_setup(devinfo, 1, + (caddr_t *)&sc->sc_virtio.sc_io_addr, + 0, 0, &vioblk_attr, &sc->sc_virtio.sc_ioh); + if (ret != DDI_SUCCESS) { + dev_err(devinfo, CE_WARN, "unable to map bar0: [%d]", ret); + goto exit_map; + } + + virtio_device_reset(&sc->sc_virtio); + virtio_set_status(&sc->sc_virtio, VIRTIO_CONFIG_DEVICE_STATUS_ACK); + virtio_set_status(&sc->sc_virtio, VIRTIO_CONFIG_DEVICE_STATUS_DRIVER); + + if (vioblk_register_ints(sc)) { + dev_err(devinfo, CE_WARN, "Unable to add interrupt"); + goto exit_int; + } + + ret = vioblk_dev_features(sc); + if (ret) + goto exit_features; + + if (sc->sc_virtio.sc_features & VIRTIO_BLK_F_RO) + sc->sc_readonly = B_TRUE; + else + sc->sc_readonly = B_FALSE; + + sc->sc_capacity = virtio_read_device_config_8(&sc->sc_virtio, + VIRTIO_BLK_CONFIG_CAPACITY); + sc->sc_nblks = sc->sc_capacity; + + /* + * BLK_SIZE is just a hint for the optimal logical block + * granularity. Ignored for now. + */ + sc->sc_blk_size = DEV_BSIZE; + if (sc->sc_virtio.sc_features & VIRTIO_BLK_F_BLK_SIZE) { + sc->sc_blk_size = virtio_read_device_config_4(&sc->sc_virtio, + VIRTIO_BLK_CONFIG_BLK_SIZE); + } + + /* Flushing is not supported. 
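+ * With o_sync_cache cleared, blkdev knows the device has no write + * cache to flush and can reject cache-flush requests itself instead + * of passing them down to us.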
*/ + if (!(sc->sc_virtio.sc_features & VIRTIO_BLK_F_FLUSH)) { + vioblk_ops.o_sync_cache = NULL; + } + + sc->sc_seg_max = DEF_MAXINDIRECT; + /* The max number of segments (cookies) in a request */ + if (sc->sc_virtio.sc_features & VIRTIO_BLK_F_SEG_MAX) { + sc->sc_seg_max = virtio_read_device_config_4(&sc->sc_virtio, + VIRTIO_BLK_CONFIG_SEG_MAX); + + /* That's what Linux does. */ + if (!sc->sc_seg_max) + sc->sc_seg_max = 1; + + /* + * SEG_MAX corresponds to the number of _data_ + * blocks in a request + */ + sc->sc_seg_max += 2; + } + /* 2 descriptors taken for header/status */ + vioblk_bd_dma_attr.dma_attr_sgllen = sc->sc_seg_max - 2; + + + /* The maximum size for a cookie in a request. */ + sc->sc_seg_size_max = DEF_MAXSECTOR; + if (sc->sc_virtio.sc_features & VIRTIO_BLK_F_SIZE_MAX) { + sc->sc_seg_size_max = virtio_read_device_config_4( + &sc->sc_virtio, VIRTIO_BLK_CONFIG_SIZE_MAX); + } + + /* The maximum request size */ + vioblk_bd_dma_attr.dma_attr_maxxfer = + vioblk_bd_dma_attr.dma_attr_sgllen * sc->sc_seg_size_max; + + dev_debug(devinfo, CE_NOTE, + "nblks=%" PRIu64 " blksize=%d num_seg=%d, " + "seg_size=%d, maxxfer=%" PRIu64, + sc->sc_nblks, sc->sc_blk_size, + vioblk_bd_dma_attr.dma_attr_sgllen, + sc->sc_seg_size_max, + vioblk_bd_dma_attr.dma_attr_maxxfer); + + + sc->sc_vq = virtio_alloc_vq(&sc->sc_virtio, 0, 0, + sc->sc_seg_max, "I/O request"); + if (sc->sc_vq == NULL) { + goto exit_alloc1; + } + + ret = vioblk_alloc_reqs(sc); + if (ret) { + goto exit_alloc2; + } + + sc->bd_h = bd_alloc_handle(sc, &vioblk_ops, &vioblk_bd_dma_attr, + KM_SLEEP); + + + virtio_set_status(&sc->sc_virtio, + VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK); + virtio_start_vq_intr(sc->sc_vq); + + ret = virtio_enable_ints(&sc->sc_virtio); + if (ret) + goto exit_enable_ints; + + ret = bd_attach_handle(devinfo, sc->bd_h); + if (ret != DDI_SUCCESS) { + dev_err(devinfo, CE_WARN, "Failed to attach blkdev"); + goto exit_attach_bd; + } + + return (DDI_SUCCESS); + +exit_attach_bd: + /* + * There is no virtio_disable_ints(), it's done in virtio_release_ints. + * If they ever get split, don't forget to add a call here. 
+ */ +exit_enable_ints: + virtio_stop_vq_intr(sc->sc_vq); + bd_free_handle(sc->bd_h); + vioblk_free_reqs(sc); +exit_alloc2: + virtio_free_vq(sc->sc_vq); +exit_alloc1: +exit_features: + virtio_release_ints(&sc->sc_virtio); +exit_int: + virtio_set_status(&sc->sc_virtio, VIRTIO_CONFIG_DEVICE_STATUS_FAILED); + ddi_regs_map_free(&sc->sc_virtio.sc_ioh); +exit_map: + kstat_delete(sc->sc_intrstat); +exit_intrstat: + mutex_destroy(&sc->lock_devid); + cv_destroy(&sc->cv_devid); + kmem_free(sc, sizeof (struct vioblk_softc)); +exit: + return (ret); +} + +static int +vioblk_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd) +{ + struct vioblk_softc *sc = ddi_get_driver_private(devinfo); + + switch (cmd) { + case DDI_DETACH: + break; + + case DDI_PM_SUSPEND: + cmn_err(CE_WARN, "suspend not supported yet"); + return (DDI_FAILURE); + + default: + cmn_err(CE_WARN, "cmd 0x%x unrecognized", cmd); + return (DDI_FAILURE); + } + + (void) bd_detach_handle(sc->bd_h); + virtio_stop_vq_intr(sc->sc_vq); + virtio_release_ints(&sc->sc_virtio); + vioblk_free_reqs(sc); + virtio_free_vq(sc->sc_vq); + virtio_device_reset(&sc->sc_virtio); + ddi_regs_map_free(&sc->sc_virtio.sc_ioh); + kstat_delete(sc->sc_intrstat); + kmem_free(sc, sizeof (struct vioblk_softc)); + + return (DDI_SUCCESS); +} + +static int +vioblk_quiesce(dev_info_t *devinfo) +{ + struct vioblk_softc *sc = ddi_get_driver_private(devinfo); + + virtio_stop_vq_intr(sc->sc_vq); + virtio_device_reset(&sc->sc_virtio); + + return (DDI_SUCCESS); +} + +int +_init(void) +{ + int rv; + + bd_mod_init(&vioblk_dev_ops); + + if ((rv = mod_install(&modlinkage)) != 0) { + bd_mod_fini(&vioblk_dev_ops); + } + + return (rv); +} + +int +_fini(void) +{ + int rv; + + if ((rv = mod_remove(&modlinkage)) == 0) { + bd_mod_fini(&vioblk_dev_ops); + } + + return (rv); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/io/virtio/virtio.c b/usr/src/uts/common/io/virtio/virtio.c new file mode 100644 index 0000000000..320dc0666a --- /dev/null +++ b/usr/src/uts/common/io/virtio/virtio.c @@ -0,0 +1,1348 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2012 Nexenta Systems, Inc. + * Copyright 2012 Alexey Zaytsev <alexey.zaytsev@gmail.com> + */ + +/* Based on the NetBSD virtio driver by Minoura Makoto. */ +/* + * Copyright (c) 2010 Minoura Makoto. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <sys/conf.h> +#include <sys/kmem.h> +#include <sys/debug.h> +#include <sys/modctl.h> +#include <sys/autoconf.h> +#include <sys/ddi_impldefs.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/sunndi.h> +#include <sys/avintr.h> +#include <sys/spl.h> +#include <sys/promif.h> +#include <sys/list.h> +#include <sys/bootconf.h> +#include <sys/bootsvcs.h> +#include <sys/sysmacros.h> +#include <sys/pci.h> + +#include "virtiovar.h" +#include "virtioreg.h" +#define NDEVNAMES (sizeof (virtio_device_name) / sizeof (char *)) +#define MINSEG_INDIRECT 2 /* use indirect if nsegs >= this value */ +#define VIRTQUEUE_ALIGN(n) (((n)+(VIRTIO_PAGE_SIZE-1)) & \ + ~(VIRTIO_PAGE_SIZE-1)) + +void +virtio_set_status(struct virtio_softc *sc, unsigned int status) +{ + int old = 0; + + if (status != 0) + old = ddi_get8(sc->sc_ioh, + (uint8_t *)(sc->sc_io_addr + + VIRTIO_CONFIG_DEVICE_STATUS)); + + ddi_put8(sc->sc_ioh, + (uint8_t *)(sc->sc_io_addr + VIRTIO_CONFIG_DEVICE_STATUS), + status | old); +} + +/* + * Negotiate features, save the result in sc->sc_features + */ +uint32_t +virtio_negotiate_features(struct virtio_softc *sc, uint32_t guest_features) +{ + uint32_t host_features; + uint32_t features; + + host_features = ddi_get32(sc->sc_ioh, + /* LINTED E_BAD_PTR_CAST_ALIGN */ + (uint32_t *)(sc->sc_io_addr + VIRTIO_CONFIG_DEVICE_FEATURES)); + + dev_debug(sc->sc_dev, CE_NOTE, + "host features: %x, guest features: %x", + host_features, guest_features); + + features = host_features & guest_features; + ddi_put32(sc->sc_ioh, + /* LINTED E_BAD_PTR_CAST_ALIGN */ + (uint32_t *)(sc->sc_io_addr + VIRTIO_CONFIG_GUEST_FEATURES), + features); + + sc->sc_features = features; + + return (host_features); +} + +size_t +virtio_show_features(uint32_t features, + char *buf, size_t len) +{ + char *orig_buf = buf; + char *bufend = buf + len; + + /* LINTED E_PTRDIFF_OVERFLOW */ + buf += snprintf(buf, bufend - buf, "Generic ( "); + if (features & VIRTIO_F_RING_INDIRECT_DESC) + /* LINTED E_PTRDIFF_OVERFLOW */ + buf += snprintf(buf, bufend - buf, "INDIRECT_DESC "); + + /* LINTED E_PTRDIFF_OVERFLOW */ + buf += snprintf(buf, bufend - buf, ") "); + + /* LINTED E_PTRDIFF_OVERFLOW */ + return (buf - orig_buf); +} + +boolean_t +virtio_has_feature(struct virtio_softc *sc, uint32_t feature) +{ + return (sc->sc_features & feature); +} + +/* + * Device configuration registers. 
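+ * These helpers index off sc_config_offset, which differs between the + * MSI and non-MSI register layouts, so they must not be used before + * virtio_register_ints() has picked one. 64-bit fields are accessed + * as two 32-bit halves, as virtio_read_device_config_8() below shows.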
+ */ +uint8_t +virtio_read_device_config_1(struct virtio_softc *sc, unsigned int index) +{ + ASSERT(sc->sc_config_offset); + return ddi_get8(sc->sc_ioh, + (uint8_t *)(sc->sc_io_addr + sc->sc_config_offset + index)); +} + +uint16_t +virtio_read_device_config_2(struct virtio_softc *sc, unsigned int index) +{ + ASSERT(sc->sc_config_offset); + return ddi_get16(sc->sc_ioh, + /* LINTED E_BAD_PTR_CAST_ALIGN */ + (uint16_t *)(sc->sc_io_addr + sc->sc_config_offset + index)); +} + +uint32_t +virtio_read_device_config_4(struct virtio_softc *sc, unsigned int index) +{ + ASSERT(sc->sc_config_offset); + return ddi_get32(sc->sc_ioh, + /* LINTED E_BAD_PTR_CAST_ALIGN */ + (uint32_t *)(sc->sc_io_addr + sc->sc_config_offset + index)); +} + +uint64_t +virtio_read_device_config_8(struct virtio_softc *sc, unsigned int index) +{ + uint64_t r; + + ASSERT(sc->sc_config_offset); + r = ddi_get32(sc->sc_ioh, + /* LINTED E_BAD_PTR_CAST_ALIGN */ + (uint32_t *)(sc->sc_io_addr + sc->sc_config_offset + + index + sizeof (uint32_t))); + + r <<= 32; + + r += ddi_get32(sc->sc_ioh, + /* LINTED E_BAD_PTR_CAST_ALIGN */ + (uint32_t *)(sc->sc_io_addr + sc->sc_config_offset + index)); + return (r); +} + +void +virtio_write_device_config_1(struct virtio_softc *sc, + unsigned int index, uint8_t value) +{ + ASSERT(sc->sc_config_offset); + ddi_put8(sc->sc_ioh, + (uint8_t *)(sc->sc_io_addr + sc->sc_config_offset + index), value); +} + +void +virtio_write_device_config_2(struct virtio_softc *sc, + unsigned int index, uint16_t value) +{ + ASSERT(sc->sc_config_offset); + ddi_put16(sc->sc_ioh, + /* LINTED E_BAD_PTR_CAST_ALIGN */ + (uint16_t *)(sc->sc_io_addr + sc->sc_config_offset + index), value); +} + +void +virtio_write_device_config_4(struct virtio_softc *sc, + unsigned int index, uint32_t value) +{ + ASSERT(sc->sc_config_offset); + ddi_put32(sc->sc_ioh, + /* LINTED E_BAD_PTR_CAST_ALIGN */ + (uint32_t *)(sc->sc_io_addr + sc->sc_config_offset + index), value); +} + +void +virtio_write_device_config_8(struct virtio_softc *sc, + unsigned int index, uint64_t value) +{ + ASSERT(sc->sc_config_offset); + ddi_put32(sc->sc_ioh, + /* LINTED E_BAD_PTR_CAST_ALIGN */ + (uint32_t *)(sc->sc_io_addr + sc->sc_config_offset + index), + value & 0xFFFFFFFF); + ddi_put32(sc->sc_ioh, + /* LINTED E_BAD_PTR_CAST_ALIGN */ + (uint32_t *)(sc->sc_io_addr + sc->sc_config_offset + + index + sizeof (uint32_t)), value >> 32); +} + +/* + * Start/stop vq interrupt. No guarantee. + */ +void +virtio_stop_vq_intr(struct virtqueue *vq) +{ + vq->vq_avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; +} + +void +virtio_start_vq_intr(struct virtqueue *vq) +{ + vq->vq_avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT; +} + +static ddi_dma_attr_t virtio_vq_dma_attr = { + DMA_ATTR_V0, /* Version number */ + 0, /* low address */ + /* + * high address. 
Has to fit into 32 bits + * after page-shifting + */ + 0x00000FFFFFFFFFFF, + 0xFFFFFFFF, /* counter register max */ + VIRTIO_PAGE_SIZE, /* page alignment required */ + 0x3F, /* burst sizes: 1 - 32 */ + 0x1, /* minimum transfer size */ + 0xFFFFFFFF, /* max transfer size */ + 0xFFFFFFFF, /* address register max */ + 1, /* no scatter-gather */ + 1, /* device operates on bytes */ + 0, /* attr flag: set to 0 */ +}; + +static ddi_dma_attr_t virtio_vq_indirect_dma_attr = { + DMA_ATTR_V0, /* Version number */ + 0, /* low address */ + 0xFFFFFFFFFFFFFFFF, /* high address */ + 0xFFFFFFFF, /* counter register max */ + 1, /* No specific alignment */ + 0x3F, /* burst sizes: 1 - 32 */ + 0x1, /* minimum transfer size */ + 0xFFFFFFFF, /* max transfer size */ + 0xFFFFFFFF, /* address register max */ + 1, /* no scatter-gather */ + 1, /* device operates on bytes */ + 0, /* attr flag: set to 0 */ +}; + +/* Same for direct and indirect descriptors. */ +static ddi_device_acc_attr_t virtio_vq_devattr = { + DDI_DEVICE_ATTR_V0, + DDI_NEVERSWAP_ACC, + DDI_STORECACHING_OK_ACC, + DDI_DEFAULT_ACC +}; + +static void +virtio_free_indirect(struct vq_entry *entry) +{ + + (void) ddi_dma_unbind_handle(entry->qe_indirect_dma_handle); + ddi_dma_mem_free(&entry->qe_indirect_dma_acch); + ddi_dma_free_handle(&entry->qe_indirect_dma_handle); + + entry->qe_indirect_descs = NULL; +} + + +static int +virtio_alloc_indirect(struct virtio_softc *sc, struct vq_entry *entry) +{ + int allocsize, num; + size_t len; + unsigned int ncookies; + int ret; + + num = entry->qe_queue->vq_indirect_num; + ASSERT(num > 1); + + allocsize = sizeof (struct vring_desc) * num; + + ret = ddi_dma_alloc_handle(sc->sc_dev, &virtio_vq_indirect_dma_attr, + DDI_DMA_SLEEP, NULL, &entry->qe_indirect_dma_handle); + if (ret != DDI_SUCCESS) { + dev_err(sc->sc_dev, CE_WARN, + "Failed to allocate dma handle for indirect descriptors," + " entry %d, vq %d", entry->qe_index, + entry->qe_queue->vq_index); + goto out_alloc_handle; + } + + ret = ddi_dma_mem_alloc(entry->qe_indirect_dma_handle, + allocsize, &virtio_vq_devattr, + DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, + (caddr_t *)&entry->qe_indirect_descs, &len, + &entry->qe_indirect_dma_acch); + if (ret != DDI_SUCCESS) { + dev_err(sc->sc_dev, CE_WARN, + "Failed to allocate dma memory for indirect descriptors," + " entry %d, vq %d,", entry->qe_index, + entry->qe_queue->vq_index); + goto out_alloc; + } + + (void) memset(entry->qe_indirect_descs, 0xff, allocsize); + + ret = ddi_dma_addr_bind_handle(entry->qe_indirect_dma_handle, NULL, + (caddr_t)entry->qe_indirect_descs, len, + DDI_DMA_RDWR | DDI_DMA_CONSISTENT, + DDI_DMA_SLEEP, NULL, &entry->qe_indirect_dma_cookie, &ncookies); + if (ret != DDI_DMA_MAPPED) { + dev_err(sc->sc_dev, CE_WARN, + "Failed to bind dma memory for indirect descriptors," + " entry %d, vq %d", entry->qe_index, + entry->qe_queue->vq_index); + goto out_bind; + } + + /* We asked for a single segment */ + ASSERT(ncookies == 1); + + return (0); + +out_bind: + ddi_dma_mem_free(&entry->qe_indirect_dma_acch); +out_alloc: + ddi_dma_free_handle(&entry->qe_indirect_dma_handle); +out_alloc_handle: + + return (ret); +} + +/* + * Initialize the vq structure.
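+ * Each entry goes onto the freelist, and when indirect descriptors + * are in use it also gets a private table of vq_indirect_num + * descriptors, so in-flight requests need not compete for indirect + * space.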
+ */ +static int +virtio_init_vq(struct virtio_softc *sc, struct virtqueue *vq) +{ + int ret; + uint16_t i; + int vq_size = vq->vq_num; + int indirect_num = vq->vq_indirect_num; + + /* free slot management */ + list_create(&vq->vq_freelist, sizeof (struct vq_entry), + offsetof(struct vq_entry, qe_list)); + + for (i = 0; i < vq_size; i++) { + struct vq_entry *entry = &vq->vq_entries[i]; + list_insert_tail(&vq->vq_freelist, entry); + entry->qe_index = i; + entry->qe_desc = &vq->vq_descs[i]; + entry->qe_queue = vq; + + if (indirect_num) { + ret = virtio_alloc_indirect(sc, entry); + if (ret) + goto out_indirect; + } + } + + mutex_init(&vq->vq_freelist_lock, "virtio-freelist", + MUTEX_DRIVER, DDI_INTR_PRI(sc->sc_intr_prio)); + mutex_init(&vq->vq_avail_lock, "virtio-avail", + MUTEX_DRIVER, DDI_INTR_PRI(sc->sc_intr_prio)); + mutex_init(&vq->vq_used_lock, "virtio-used", + MUTEX_DRIVER, DDI_INTR_PRI(sc->sc_intr_prio)); + + return (0); + +out_indirect: + for (i = 0; i < vq_size; i++) { + struct vq_entry *entry = &vq->vq_entries[i]; + if (entry->qe_indirect_descs) + virtio_free_indirect(entry); + } + + return (ret); +} + + + +/* + * Allocate/free a vq. + */ +struct virtqueue * +virtio_alloc_vq(struct virtio_softc *sc, + unsigned int index, + unsigned int size, + unsigned int indirect_num, + const char *name) +{ + int vq_size, allocsize1, allocsize2, allocsize = 0; + int ret; + unsigned int ncookies; + size_t len; + struct virtqueue *vq; + + + ddi_put16(sc->sc_ioh, + /* LINTED E_BAD_PTR_CAST_ALIGN */ + (uint16_t *)(sc->sc_io_addr + VIRTIO_CONFIG_QUEUE_SELECT), index); + vq_size = ddi_get16(sc->sc_ioh, + /* LINTED E_BAD_PTR_CAST_ALIGN */ + (uint16_t *)(sc->sc_io_addr + VIRTIO_CONFIG_QUEUE_SIZE)); + if (vq_size == 0) { + dev_err(sc->sc_dev, CE_WARN, + "virtqueue does not exist, index %d for %s\n", index, name); + goto out; + } + + vq = kmem_zalloc(sizeof (struct virtqueue), KM_SLEEP); + + /* size 0 => use native vq size, good for receive queues. */ + if (size) + vq_size = MIN(vq_size, size); + + /* allocsize1: descriptor table + avail ring + pad */ + allocsize1 = VIRTQUEUE_ALIGN(sizeof (struct vring_desc) * vq_size + + sizeof (struct vring_avail) + + sizeof (uint16_t) * vq_size); + /* allocsize2: used ring + pad */ + allocsize2 = VIRTQUEUE_ALIGN(sizeof (struct vring_used) + + sizeof (struct vring_used_elem) * vq_size); + + allocsize = allocsize1 + allocsize2; + + ret = ddi_dma_alloc_handle(sc->sc_dev, &virtio_vq_dma_attr, + DDI_DMA_SLEEP, NULL, &vq->vq_dma_handle); + if (ret != DDI_SUCCESS) { + dev_err(sc->sc_dev, CE_WARN, + "Failed to allocate dma handle for vq %d", index); + goto out_alloc_handle; + } + + ret = ddi_dma_mem_alloc(vq->vq_dma_handle, allocsize, + &virtio_vq_devattr, DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, + (caddr_t *)&vq->vq_vaddr, &len, &vq->vq_dma_acch); + if (ret != DDI_SUCCESS) { + dev_err(sc->sc_dev, CE_WARN, + "Failed to allocate dma memory for vq %d", index); + goto out_alloc; + } + + + ret = ddi_dma_addr_bind_handle(vq->vq_dma_handle, NULL, + (caddr_t)vq->vq_vaddr, len, + DDI_DMA_RDWR | DDI_DMA_CONSISTENT, + DDI_DMA_SLEEP, NULL, &vq->vq_dma_cookie, &ncookies); + if (ret != DDI_DMA_MAPPED) { + dev_err(sc->sc_dev, CE_WARN, + "Failed to bind dma memory for vq %d", index); + goto out_bind; + } + + /* We asked for a single segment */ + ASSERT(ncookies == 1); + /* and page-aligned buffers.
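+ * (the device is handed a page frame number, dmac_laddress divided + * by VIRTIO_PAGE_SIZE, so an unaligned ring could not be described)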
*/ + ASSERT(vq->vq_dma_cookie.dmac_laddress % VIRTIO_PAGE_SIZE == 0); + + (void) memset(vq->vq_vaddr, 0, allocsize); + + /* Make sure all zeros hit the buffer before we point the host to it */ + membar_producer(); + + /* set the vq address */ + ddi_put32(sc->sc_ioh, + /* LINTED E_BAD_PTR_CAST_ALIGN */ + (uint32_t *)(sc->sc_io_addr + VIRTIO_CONFIG_QUEUE_ADDRESS), + (vq->vq_dma_cookie.dmac_laddress / VIRTIO_PAGE_SIZE)); + + /* remember addresses and offsets for later use */ + vq->vq_owner = sc; + vq->vq_num = vq_size; + vq->vq_index = index; + vq->vq_descs = vq->vq_vaddr; + vq->vq_availoffset = sizeof (struct vring_desc)*vq_size; + vq->vq_avail = (void *)(((char *)vq->vq_descs) + vq->vq_availoffset); + vq->vq_usedoffset = allocsize1; + vq->vq_used = (void *)(((char *)vq->vq_descs) + vq->vq_usedoffset); + + ASSERT(indirect_num == 0 || + virtio_has_feature(sc, VIRTIO_F_RING_INDIRECT_DESC)); + vq->vq_indirect_num = indirect_num; + + /* free slot management */ + vq->vq_entries = kmem_zalloc(sizeof (struct vq_entry) * vq_size, + KM_SLEEP); + + ret = virtio_init_vq(sc, vq); + if (ret) + goto out_init; + + dev_debug(sc->sc_dev, CE_NOTE, + "Allocated %d entries for vq %d:%s (%d indirect descs)", + vq_size, index, name, indirect_num * vq_size); + + return (vq); + +out_init: + kmem_free(vq->vq_entries, sizeof (struct vq_entry) * vq_size); + (void) ddi_dma_unbind_handle(vq->vq_dma_handle); +out_bind: + ddi_dma_mem_free(&vq->vq_dma_acch); +out_alloc: + ddi_dma_free_handle(&vq->vq_dma_handle); +out_alloc_handle: + kmem_free(vq, sizeof (struct virtqueue)); +out: + return (NULL); +} + + +void +virtio_free_vq(struct virtqueue *vq) +{ + struct virtio_softc *sc = vq->vq_owner; + int i; + + /* tell device that there's no virtqueue any longer */ + ddi_put16(sc->sc_ioh, + /* LINTED E_BAD_PTR_CAST_ALIGN */ + (uint16_t *)(sc->sc_io_addr + VIRTIO_CONFIG_QUEUE_SELECT), + vq->vq_index); + ddi_put32(sc->sc_ioh, + /* LINTED E_BAD_PTR_CAST_ALIGN */ + (uint32_t *)(sc->sc_io_addr + VIRTIO_CONFIG_QUEUE_ADDRESS), 0); + + /* Free the indirect descriptors, if any. */ + for (i = 0; i < vq->vq_num; i++) { + struct vq_entry *entry = &vq->vq_entries[i]; + if (entry->qe_indirect_descs) + virtio_free_indirect(entry); + } + + kmem_free(vq->vq_entries, sizeof (struct vq_entry) * vq->vq_num); + + (void) ddi_dma_unbind_handle(vq->vq_dma_handle); + ddi_dma_mem_free(&vq->vq_dma_acch); + ddi_dma_free_handle(&vq->vq_dma_handle); + + mutex_destroy(&vq->vq_used_lock); + mutex_destroy(&vq->vq_avail_lock); + mutex_destroy(&vq->vq_freelist_lock); + + kmem_free(vq, sizeof (struct virtqueue)); +} + +/* + * Free descriptor management.
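+ * A typical caller (cf. vioblk_rw()) pops an entry, fills it in and + * pushes it: + * + * qe = vq_alloc_entry(vq); + * virtio_ve_add_indirect_buf(qe, paddr, len, B_TRUE); + * virtio_push_chain(qe, B_TRUE); + * + * where paddr and len stand for any bound DMA cookie. The entry + * comes back via virtio_pull_chain() in the interrupt handler and + * is returned to the freelist with virtio_free_chain().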
+ */ +struct vq_entry * +vq_alloc_entry(struct virtqueue *vq) +{ + struct vq_entry *qe; + + mutex_enter(&vq->vq_freelist_lock); + if (list_is_empty(&vq->vq_freelist)) { + mutex_exit(&vq->vq_freelist_lock); + return (NULL); + } + qe = list_remove_head(&vq->vq_freelist); + + ASSERT(vq->vq_used_entries >= 0); + vq->vq_used_entries++; + + mutex_exit(&vq->vq_freelist_lock); + + qe->qe_next = NULL; + qe->qe_indirect_next = 0; + (void) memset(qe->qe_desc, 0, sizeof (struct vring_desc)); + + return (qe); +} + +void +vq_free_entry(struct virtqueue *vq, struct vq_entry *qe) +{ + mutex_enter(&vq->vq_freelist_lock); + + list_insert_head(&vq->vq_freelist, qe); + vq->vq_used_entries--; + ASSERT(vq->vq_used_entries >= 0); + mutex_exit(&vq->vq_freelist_lock); +} + +/* + * We (intentionally) don't have a global vq mutex, so you are + * responsible for external locking to avoid allocating/freeing any + * entries before using the returned value. Have fun. + */ +uint_t +vq_num_used(struct virtqueue *vq) +{ + /* vq->vq_freelist_lock would not help here. */ + return (vq->vq_used_entries); +} + +static inline void +virtio_ve_set_desc(struct vring_desc *desc, uint64_t paddr, uint32_t len, + boolean_t write) +{ + desc->addr = paddr; + desc->len = len; + desc->next = 0; + desc->flags = 0; + + /* 'write' - from the driver's point of view */ + if (!write) + desc->flags = VRING_DESC_F_WRITE; + + +} + +void +virtio_ve_set(struct vq_entry *qe, uint64_t paddr, uint32_t len, + boolean_t write) +{ + virtio_ve_set_desc(qe->qe_desc, paddr, len, write); +} + +void +virtio_ve_add_indirect_buf(struct vq_entry *qe, uint64_t paddr, uint32_t len, + boolean_t write) +{ + struct vring_desc *indirect_desc; + + ASSERT(qe->qe_queue->vq_indirect_num); + ASSERT(qe->qe_indirect_next < qe->qe_queue->vq_indirect_num); + + indirect_desc = &qe->qe_indirect_descs[qe->qe_indirect_next]; + virtio_ve_set_desc(indirect_desc, paddr, len, write); + qe->qe_indirect_next++; +} + +void +virtio_ve_add_cookie(struct vq_entry *qe, ddi_dma_handle_t dma_handle, + ddi_dma_cookie_t dma_cookie, unsigned int ncookies, boolean_t write) +{ + int i; + + for (i = 0; i < ncookies; i++) { + virtio_ve_add_indirect_buf(qe, dma_cookie.dmac_laddress, + dma_cookie.dmac_size, write); + ddi_dma_nextcookie(dma_handle, &dma_cookie); + } +} + +void +virtio_sync_vq(struct virtqueue *vq) +{ + struct virtio_softc *vsc = vq->vq_owner; + + /* Make sure the avail ring update hit the buffer */ + membar_producer(); + + vq->vq_avail->idx = vq->vq_avail_idx; + + /* Make sure the avail idx update hits the buffer */ + membar_producer(); + + /* Make sure we see the flags update */ + membar_consumer(); + + if (!(vq->vq_used->flags & VRING_USED_F_NO_NOTIFY)) + ddi_put16(vsc->sc_ioh, + /* LINTED E_BAD_PTR_CAST_ALIGN */ + (uint16_t *)(vsc->sc_io_addr + + VIRTIO_CONFIG_QUEUE_NOTIFY), + vq->vq_index); +} + +void +virtio_push_chain(struct vq_entry *qe, boolean_t sync) +{ + struct virtqueue *vq = qe->qe_queue; + struct vq_entry *head = qe; + struct vring_desc *desc; + int idx; + + ASSERT(qe); + + /* + * Bind the descs together, paddr and len should be already + * set with virtio_ve_set + */ + do { + /* Bind the indirect descriptors */ + if (qe->qe_indirect_next > 1) { + uint16_t i = 0; + + /* + * Set the pointer/flags to the + * first indirect descriptor + */ + virtio_ve_set_desc(qe->qe_desc, + qe->qe_indirect_dma_cookie.dmac_laddress, + sizeof (struct vring_desc) * qe->qe_indirect_next, + B_FALSE); + qe->qe_desc->flags |= VRING_DESC_F_INDIRECT; + + /* For all but the last one, add the next
index/flag */ + do { + desc = &qe->qe_indirect_descs[i]; + i++; + + desc->flags |= VRING_DESC_F_NEXT; + desc->next = i; + } while (i < qe->qe_indirect_next - 1); + + } + + if (qe->qe_next) { + qe->qe_desc->flags |= VRING_DESC_F_NEXT; + qe->qe_desc->next = qe->qe_next->qe_index; + } + + qe = qe->qe_next; + } while (qe); + + mutex_enter(&vq->vq_avail_lock); + idx = vq->vq_avail_idx; + vq->vq_avail_idx++; + + /* Make sure the bits hit the descriptor(s) */ + membar_producer(); + vq->vq_avail->ring[idx % vq->vq_num] = head->qe_index; + + /* Notify the device, if needed. */ + if (sync) + virtio_sync_vq(vq); + + mutex_exit(&vq->vq_avail_lock); +} + +/* Get a chain of descriptors from the used ring, if one is available. */ +struct vq_entry * +virtio_pull_chain(struct virtqueue *vq, uint32_t *len) +{ + struct vq_entry *head; + int slot; + int usedidx; + + mutex_enter(&vq->vq_used_lock); + + /* No used entries? Bye. */ + if (vq->vq_used_idx == vq->vq_used->idx) { + mutex_exit(&vq->vq_used_lock); + return (NULL); + } + + usedidx = vq->vq_used_idx; + vq->vq_used_idx++; + mutex_exit(&vq->vq_used_lock); + + usedidx %= vq->vq_num; + + /* Make sure we do the next step _after_ checking the idx. */ + membar_consumer(); + + slot = vq->vq_used->ring[usedidx].id; + *len = vq->vq_used->ring[usedidx].len; + + head = &vq->vq_entries[slot]; + + return (head); +} + +void +virtio_free_chain(struct vq_entry *qe) +{ + struct vq_entry *tmp; + struct virtqueue *vq = qe->qe_queue; + + ASSERT(qe); + + do { + ASSERT(qe->qe_queue == vq); + tmp = qe->qe_next; + vq_free_entry(vq, qe); + qe = tmp; + } while (tmp); +} + +void +virtio_ventry_stick(struct vq_entry *first, struct vq_entry *second) +{ + first->qe_next = second; +} + +static int +virtio_register_msi(struct virtio_softc *sc, + struct virtio_int_handler *config_handler, + struct virtio_int_handler vq_handlers[], + int intr_types) +{ + int count, actual; + int int_type; + int i; + int handler_count; + int ret; + + /* If both MSI and MSI-x are reported, prefer MSI-x. */ + int_type = DDI_INTR_TYPE_MSI; + if (intr_types & DDI_INTR_TYPE_MSIX) + int_type = DDI_INTR_TYPE_MSIX; + + /* Walk the handler table to get the number of handlers. */ + for (handler_count = 0; + vq_handlers && vq_handlers[handler_count].vh_func; + handler_count++) + ; + + /* +1 if there is a config change handler. */ + if (config_handler) + handler_count++; + + /* Number of MSIs supported by the device. */ + ret = ddi_intr_get_nintrs(sc->sc_dev, int_type, &count); + if (ret != DDI_SUCCESS) { + dev_err(sc->sc_dev, CE_WARN, "ddi_intr_get_nintrs failed"); + return (ret); + } + + /* + * Those who try to register more handlers than the device + * supports shall suffer.
+static int
+virtio_register_msi(struct virtio_softc *sc,
+    struct virtio_int_handler *config_handler,
+    struct virtio_int_handler vq_handlers[],
+    int intr_types)
+{
+	int count, actual;
+	int int_type;
+	int i;
+	int handler_count;
+	int ret;
+
+	/* If both MSI and MSI-X are reported, prefer MSI-X. */
+	int_type = DDI_INTR_TYPE_MSI;
+	if (intr_types & DDI_INTR_TYPE_MSIX)
+		int_type = DDI_INTR_TYPE_MSIX;
+
+	/* Walk the handler table to get the number of handlers. */
+	for (handler_count = 0;
+	    vq_handlers && vq_handlers[handler_count].vh_func;
+	    handler_count++)
+		;
+
+	/* +1 if there is a config change handler. */
+	if (config_handler)
+		handler_count++;
+
+	/* Number of MSIs supported by the device. */
+	ret = ddi_intr_get_nintrs(sc->sc_dev, int_type, &count);
+	if (ret != DDI_SUCCESS) {
+		dev_err(sc->sc_dev, CE_WARN, "ddi_intr_get_nintrs failed");
+		return (ret);
+	}
+
+	/*
+	 * Those who try to register more handlers than the device
+	 * supports shall suffer.
+	 */
+	ASSERT(handler_count <= count);
+
+	sc->sc_intr_htable = kmem_zalloc(
+	    sizeof (ddi_intr_handle_t) * handler_count, KM_SLEEP);
+
+	ret = ddi_intr_alloc(sc->sc_dev, sc->sc_intr_htable, int_type, 0,
+	    handler_count, &actual, DDI_INTR_ALLOC_NORMAL);
+	if (ret != DDI_SUCCESS) {
+		dev_err(sc->sc_dev, CE_WARN, "Failed to allocate MSI: %d", ret);
+		goto out_msi_alloc;
+	}
+
+	if (actual != handler_count) {
+		dev_err(sc->sc_dev, CE_WARN,
+		    "Not enough MSI available: need %d, available %d",
+		    handler_count, actual);
+		ret = DDI_FAILURE;
+		goto out_msi_available;
+	}
+
+	sc->sc_intr_num = handler_count;
+	sc->sc_intr_config = B_FALSE;
+	if (config_handler) {
+		sc->sc_intr_config = B_TRUE;
+	}
+
+	/* Assume they are all the same priority. */
+	ret = ddi_intr_get_pri(sc->sc_intr_htable[0], &sc->sc_intr_prio);
+	if (ret != DDI_SUCCESS) {
+		dev_err(sc->sc_dev, CE_WARN, "ddi_intr_get_pri failed");
+		goto out_msi_prio;
+	}
+
+	/* Add the vq handlers */
+	for (i = 0; vq_handlers && vq_handlers[i].vh_func; i++) {
+		ret = ddi_intr_add_handler(sc->sc_intr_htable[i],
+		    vq_handlers[i].vh_func,
+		    sc, vq_handlers[i].vh_priv);
+		if (ret != DDI_SUCCESS) {
+			dev_err(sc->sc_dev, CE_WARN,
+			    "ddi_intr_add_handler failed");
+			/* Remove the handlers that succeeded. */
+			while (--i >= 0) {
+				(void) ddi_intr_remove_handler(
+				    sc->sc_intr_htable[i]);
+			}
+			goto out_add_handlers;
+		}
+	}
+
+	/* Don't forget the config handler */
+	if (config_handler) {
+		ret = ddi_intr_add_handler(sc->sc_intr_htable[i],
+		    config_handler->vh_func,
+		    sc, config_handler->vh_priv);
+		if (ret != DDI_SUCCESS) {
+			dev_err(sc->sc_dev, CE_WARN,
+			    "ddi_intr_add_handler failed");
+			/* Remove the handlers that succeeded. */
+			while (--i >= 0) {
+				(void) ddi_intr_remove_handler(
+				    sc->sc_intr_htable[i]);
+			}
+			goto out_add_handlers;
+		}
+	}
+
+	/* We know we are using MSI, so set the config offset. */
+	sc->sc_config_offset = VIRTIO_CONFIG_DEVICE_CONFIG_MSI;
+
+	ret = ddi_intr_get_cap(sc->sc_intr_htable[0],
+	    &sc->sc_intr_cap);
+	/* Just in case. */
+	if (ret != DDI_SUCCESS)
+		sc->sc_intr_cap = 0;
+
+	return (DDI_SUCCESS);
+
+out_add_handlers:
+out_msi_prio:
+out_msi_available:
+	for (i = 0; i < actual; i++)
+		(void) ddi_intr_free(sc->sc_intr_htable[i]);
+out_msi_alloc:
+	kmem_free(sc->sc_intr_htable,
+	    sizeof (ddi_intr_handle_t) * handler_count);
+
+	return (ret);
+}
+
+struct virtio_handler_container {
+	int nhandlers;
+	struct virtio_int_handler config_handler;
+	struct virtio_int_handler vq_handlers[];
+};
+
+uint_t
+virtio_intx_dispatch(caddr_t arg1, caddr_t arg2)
+{
+	struct virtio_softc *sc = (void *)arg1;
+	struct virtio_handler_container *vhc = (void *)arg2;
+	uint8_t isr_status;
+	int i;
+
+	isr_status = ddi_get8(sc->sc_ioh, (uint8_t *)(sc->sc_io_addr +
+	    VIRTIO_CONFIG_ISR_STATUS));
+
+	if (!isr_status)
+		return (DDI_INTR_UNCLAIMED);
+
+	if ((isr_status & VIRTIO_CONFIG_ISR_CONFIG_CHANGE) &&
+	    vhc->config_handler.vh_func) {
+		vhc->config_handler.vh_func((void *)sc,
+		    vhc->config_handler.vh_priv);
+	}
+
+	/* Notify all handlers */
+	for (i = 0; i < vhc->nhandlers; i++) {
+		vhc->vq_handlers[i].vh_func((void *)sc,
+		    vhc->vq_handlers[i].vh_priv);
+	}
+
+	return (DDI_INTR_CLAIMED);
+}
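+/*
+ * Illustrative sketch (editor's addition, not part of the original
+ * change): vq_handlers is a NULL-terminated array; that terminator is
+ * how the registration routines count the handlers. The handler names
+ * and the VIRTIO_EXAMPLE guard are hypothetical.
+ */
+#ifdef VIRTIO_EXAMPLE
+static uint_t example_rx_intr(caddr_t, caddr_t);
+static uint_t example_tx_intr(caddr_t, caddr_t);
+
+static struct virtio_int_handler example_vq_handlers[] = {
+	{ example_rx_intr, NULL },	/* virtqueue 0 */
+	{ example_tx_intr, NULL },	/* virtqueue 1 */
+	{ NULL, NULL },			/* terminator: vh_func == NULL */
+};
+#endif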
+/*
+ * config_handler and vq_handlers may be allocated on the stack.
+ * Take precautions not to lose them.
+ */
+static int
+virtio_register_intx(struct virtio_softc *sc,
+    struct virtio_int_handler *config_handler,
+    struct virtio_int_handler vq_handlers[])
+{
+	int vq_handler_count;
+	int actual;
+	struct virtio_handler_container *vhc;
+	int ret = DDI_FAILURE;
+
+	/* Walk the handler table to get the number of handlers. */
+	for (vq_handler_count = 0;
+	    vq_handlers && vq_handlers[vq_handler_count].vh_func;
+	    vq_handler_count++)
+		;
+
+	vhc = kmem_zalloc(sizeof (struct virtio_handler_container) +
+	    sizeof (struct virtio_int_handler) * vq_handler_count,
+	    KM_SLEEP);
+
+	vhc->nhandlers = vq_handler_count;
+	(void) memcpy(vhc->vq_handlers, vq_handlers,
+	    sizeof (struct virtio_int_handler) * vq_handler_count);
+
+	if (config_handler) {
+		(void) memcpy(&vhc->config_handler, config_handler,
+		    sizeof (struct virtio_int_handler));
+	}
+
+	/* Just a single entry for a single interrupt. */
+	sc->sc_intr_htable = kmem_zalloc(sizeof (ddi_intr_handle_t), KM_SLEEP);
+
+	ret = ddi_intr_alloc(sc->sc_dev, sc->sc_intr_htable,
+	    DDI_INTR_TYPE_FIXED, 0, 1, &actual,
+	    DDI_INTR_ALLOC_NORMAL);
+	if (ret != DDI_SUCCESS) {
+		dev_err(sc->sc_dev, CE_WARN,
+		    "Failed to allocate a fixed interrupt: %d", ret);
+		goto out_int_alloc;
+	}
+
+	ASSERT(actual == 1);
+	sc->sc_intr_num = 1;
+
+	ret = ddi_intr_get_pri(sc->sc_intr_htable[0], &sc->sc_intr_prio);
+	if (ret != DDI_SUCCESS) {
+		dev_err(sc->sc_dev, CE_WARN, "ddi_intr_get_pri failed");
+		goto out_prio;
+	}
+
+	ret = ddi_intr_add_handler(sc->sc_intr_htable[0],
+	    virtio_intx_dispatch, sc, vhc);
+	if (ret != DDI_SUCCESS) {
+		dev_err(sc->sc_dev, CE_WARN, "ddi_intr_add_handler failed");
+		goto out_add_handlers;
+	}
+
+	/* We know we are not using MSI, so set the config offset. */
+	sc->sc_config_offset = VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI;
+
+	return (DDI_SUCCESS);
+
+out_add_handlers:
+out_prio:
+	(void) ddi_intr_free(sc->sc_intr_htable[0]);
+out_int_alloc:
+	kmem_free(sc->sc_intr_htable, sizeof (ddi_intr_handle_t));
+	kmem_free(vhc, sizeof (struct virtio_handler_container) +
+	    sizeof (struct virtio_int_handler) * vq_handler_count);
+	return (ret);
+}
+
+/*
+ * We find out whether the device supports MSI here, and the register
+ * layout depends on that (doh). Don't access the device-specific bits
+ * in BAR 0 before calling this function!
+ */
+int
+virtio_register_ints(struct virtio_softc *sc,
+    struct virtio_int_handler *config_handler,
+    struct virtio_int_handler vq_handlers[])
+{
+	int ret;
+	int intr_types;
+
+	/* Determine which types of interrupts are supported */
+	ret = ddi_intr_get_supported_types(sc->sc_dev, &intr_types);
+	if (ret != DDI_SUCCESS) {
+		dev_err(sc->sc_dev, CE_WARN, "Can't get supported int types");
+		goto out_inttype;
+	}
+
+	/* If we have MSIs, let's use them. */
+	if (intr_types & (DDI_INTR_TYPE_MSIX | DDI_INTR_TYPE_MSI)) {
+		ret = virtio_register_msi(sc, config_handler,
+		    vq_handlers, intr_types);
+		if (!ret)
+			return (0);
+	}
+
+	/* Fall back to old-fashioned interrupts. */
+	if (intr_types & DDI_INTR_TYPE_FIXED) {
+		dev_debug(sc->sc_dev, CE_WARN,
+		    "Using legacy interrupts");
+
+		return (virtio_register_intx(sc, config_handler, vq_handlers));
+	}
+
+	dev_err(sc->sc_dev, CE_WARN,
+	    "MSI failed and fixed interrupts not supported. Giving up.");
+	ret = DDI_FAILURE;
+
+out_inttype:
+	return (ret);
+}
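+/*
+ * Illustrative sketch (editor's addition, not part of the original
+ * change): the intended attach-time sequence for a leaf driver.
+ * Interrupts are registered before the device-specific config space is
+ * touched (see the comment above virtio_register_ints()) and enabled
+ * once setup is complete. example_vq_handlers and the VIRTIO_EXAMPLE
+ * guard are hypothetical.
+ */
+#ifdef VIRTIO_EXAMPLE
+static int
+virtio_example_attach_ints(struct virtio_softc *sc)
+{
+	int ret;
+
+	/* Prefers MSI-X/MSI, falls back to fixed interrupts. */
+	ret = virtio_register_ints(sc, NULL, example_vq_handlers);
+	if (ret != DDI_SUCCESS)
+		return (ret);
+
+	/* ... negotiate features and allocate virtqueues here ... */
+
+	return (virtio_enable_ints(sc));
+}
+#endif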
+static int
+virtio_enable_msi(struct virtio_softc *sc)
+{
+	int ret, i;
+	int vq_handler_count = sc->sc_intr_num;
+
+	/* Number of handlers, not counting the config. */
+	if (sc->sc_intr_config)
+		vq_handler_count--;
+
+	/* Enable the interrupts. Either the whole block, or one by one. */
+	if (sc->sc_intr_cap & DDI_INTR_FLAG_BLOCK) {
+		ret = ddi_intr_block_enable(sc->sc_intr_htable,
+		    sc->sc_intr_num);
+		if (ret != DDI_SUCCESS) {
+			dev_err(sc->sc_dev, CE_WARN,
+			    "Failed to enable MSI, falling back to INTx");
+			goto out_enable;
+		}
+	} else {
+		for (i = 0; i < sc->sc_intr_num; i++) {
+			ret = ddi_intr_enable(sc->sc_intr_htable[i]);
+			if (ret != DDI_SUCCESS) {
+				dev_err(sc->sc_dev, CE_WARN,
+				    "Failed to enable MSI %d, "
+				    "falling back to INTx", i);
+
+				while (--i >= 0) {
+					(void) ddi_intr_disable(
+					    sc->sc_intr_htable[i]);
+				}
+				goto out_enable;
+			}
+		}
+	}
+
+	/* Bind the allocated MSI to the queues and config */
+	for (i = 0; i < vq_handler_count; i++) {
+		int check;
+
+		ddi_put16(sc->sc_ioh,
+		    /* LINTED E_BAD_PTR_CAST_ALIGN */
+		    (uint16_t *)(sc->sc_io_addr +
+		    VIRTIO_CONFIG_QUEUE_SELECT), i);
+
+		ddi_put16(sc->sc_ioh,
+		    /* LINTED E_BAD_PTR_CAST_ALIGN */
+		    (uint16_t *)(sc->sc_io_addr +
+		    VIRTIO_CONFIG_QUEUE_VECTOR), i);
+
+		check = ddi_get16(sc->sc_ioh,
+		    /* LINTED E_BAD_PTR_CAST_ALIGN */
+		    (uint16_t *)(sc->sc_io_addr +
+		    VIRTIO_CONFIG_QUEUE_VECTOR));
+		if (check != i) {
+			dev_err(sc->sc_dev, CE_WARN, "Failed to bind handler "
+			    "for VQ %d, MSI %d. Check = %x", i, i, check);
+			ret = ENODEV;
+			goto out_bind;
+		}
+	}
+
+	if (sc->sc_intr_config) {
+		int check;
+
+		ddi_put16(sc->sc_ioh,
+		    /* LINTED E_BAD_PTR_CAST_ALIGN */
+		    (uint16_t *)(sc->sc_io_addr +
+		    VIRTIO_CONFIG_CONFIG_VECTOR), i);
+
+		check = ddi_get16(sc->sc_ioh,
+		    /* LINTED E_BAD_PTR_CAST_ALIGN */
+		    (uint16_t *)(sc->sc_io_addr +
+		    VIRTIO_CONFIG_CONFIG_VECTOR));
+		if (check != i) {
+			dev_err(sc->sc_dev, CE_WARN, "Failed to bind handler "
+			    "for Config updates, MSI %d", i);
+			ret = ENODEV;
+			goto out_bind;
+		}
+	}
+
+	return (DDI_SUCCESS);
+
+out_bind:
+	/* Unbind the vqs */
+	for (i = 0; i < vq_handler_count; i++) {
+		ddi_put16(sc->sc_ioh,
+		    /* LINTED E_BAD_PTR_CAST_ALIGN */
+		    (uint16_t *)(sc->sc_io_addr +
+		    VIRTIO_CONFIG_QUEUE_SELECT), i);
+
+		ddi_put16(sc->sc_ioh,
+		    /* LINTED E_BAD_PTR_CAST_ALIGN */
+		    (uint16_t *)(sc->sc_io_addr +
+		    VIRTIO_CONFIG_QUEUE_VECTOR),
+		    VIRTIO_MSI_NO_VECTOR);
+	}
+	/* And the config */
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	ddi_put16(sc->sc_ioh, (uint16_t *)(sc->sc_io_addr +
+	    VIRTIO_CONFIG_CONFIG_VECTOR), VIRTIO_MSI_NO_VECTOR);
+
+	ret = DDI_FAILURE;
+
+out_enable:
+	return (ret);
+}
+
+static int
+virtio_enable_intx(struct virtio_softc *sc)
+{
+	int ret;
+
+	ret = ddi_intr_enable(sc->sc_intr_htable[0]);
+	if (ret != DDI_SUCCESS)
+		dev_err(sc->sc_dev, CE_WARN,
+		    "Failed to enable interrupt: %d", ret);
+	return (ret);
+}
+
+/*
+ * We can't enable/disable individual handlers in the INTx case, so do
+ * the whole bunch even in the MSI case.
+ */
+int
+virtio_enable_ints(struct virtio_softc *sc)
+{
+	/* See if we are using MSI. */
+	if (sc->sc_config_offset == VIRTIO_CONFIG_DEVICE_CONFIG_MSI)
+		return (virtio_enable_msi(sc));
+
+	ASSERT(sc->sc_config_offset == VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI);
+
+	return (virtio_enable_intx(sc));
+}
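+/*
+ * Illustrative sketch (editor's addition, not part of the original
+ * change): the select/write/read-back pattern virtio_enable_msi() uses
+ * when binding a vector. Per the legacy virtio PCI spec, the device
+ * answers the read-back with VIRTIO_MSI_NO_VECTOR if it rejected the
+ * binding, which is why the value is compared. The VIRTIO_EXAMPLE
+ * guard is hypothetical.
+ */
+#ifdef VIRTIO_EXAMPLE
+static boolean_t
+virtio_example_bind_vector(struct virtio_softc *sc, uint16_t queue,
+    uint16_t vector)
+{
+	uint16_t check;
+
+	/* Select the queue, then offer it a vector. */
+	ddi_put16(sc->sc_ioh,
+	    /* LINTED E_BAD_PTR_CAST_ALIGN */
+	    (uint16_t *)(sc->sc_io_addr + VIRTIO_CONFIG_QUEUE_SELECT), queue);
+	ddi_put16(sc->sc_ioh,
+	    /* LINTED E_BAD_PTR_CAST_ALIGN */
+	    (uint16_t *)(sc->sc_io_addr + VIRTIO_CONFIG_QUEUE_VECTOR), vector);
+
+	/* The device reports the vector it actually accepted. */
+	check = ddi_get16(sc->sc_ioh,
+	    /* LINTED E_BAD_PTR_CAST_ALIGN */
+	    (uint16_t *)(sc->sc_io_addr + VIRTIO_CONFIG_QUEUE_VECTOR));
+
+	return (check == vector ? B_TRUE : B_FALSE);
+}
+#endif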
+void
+virtio_release_ints(struct virtio_softc *sc)
+{
+	int i;
+	int ret;
+
+	/* If we were running with MSI, unbind the vectors. */
+	if (sc->sc_config_offset == VIRTIO_CONFIG_DEVICE_CONFIG_MSI) {
+		/* Unbind all vqs */
+		for (i = 0; i < sc->sc_nvqs; i++) {
+			ddi_put16(sc->sc_ioh,
+			    /* LINTED E_BAD_PTR_CAST_ALIGN */
+			    (uint16_t *)(sc->sc_io_addr +
+			    VIRTIO_CONFIG_QUEUE_SELECT), i);
+
+			ddi_put16(sc->sc_ioh,
+			    /* LINTED E_BAD_PTR_CAST_ALIGN */
+			    (uint16_t *)(sc->sc_io_addr +
+			    VIRTIO_CONFIG_QUEUE_VECTOR),
+			    VIRTIO_MSI_NO_VECTOR);
+		}
+		/* And the config */
+		/* LINTED E_BAD_PTR_CAST_ALIGN */
+		ddi_put16(sc->sc_ioh, (uint16_t *)(sc->sc_io_addr +
+		    VIRTIO_CONFIG_CONFIG_VECTOR),
+		    VIRTIO_MSI_NO_VECTOR);
+	}
+
+	/* Disable the interrupts. Either the whole block, or one by one. */
+	if (sc->sc_intr_cap & DDI_INTR_FLAG_BLOCK) {
+		ret = ddi_intr_block_disable(sc->sc_intr_htable,
+		    sc->sc_intr_num);
+		if (ret != DDI_SUCCESS) {
+			dev_err(sc->sc_dev, CE_WARN,
+			    "Failed to disable MSIs, won't be able to "
+			    "reuse next time");
+		}
+	} else {
+		for (i = 0; i < sc->sc_intr_num; i++) {
+			ret = ddi_intr_disable(sc->sc_intr_htable[i]);
+			if (ret != DDI_SUCCESS) {
+				dev_err(sc->sc_dev, CE_WARN,
+				    "Failed to disable interrupt %d, "
+				    "won't be able to reuse", i);
+			}
+		}
+	}
+
+	for (i = 0; i < sc->sc_intr_num; i++) {
+		(void) ddi_intr_remove_handler(sc->sc_intr_htable[i]);
+	}
+
+	for (i = 0; i < sc->sc_intr_num; i++)
+		(void) ddi_intr_free(sc->sc_intr_htable[i]);
+
+	kmem_free(sc->sc_intr_htable,
+	    sizeof (ddi_intr_handle_t) * sc->sc_intr_num);
+
+	/* After disabling interrupts, the config offset is non-MSI. */
+	sc->sc_config_offset = VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI;
+}
+
+/*
+ * Module linkage information for the kernel.
+ */
+static struct modlmisc modlmisc = {
+	&mod_miscops,	/* Type of module */
+	"VirtIO common library module",
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1,
+	{
+		(void *)&modlmisc,
+		NULL
+	}
+};
+
+int
+_init(void)
+{
+	return (mod_install(&modlinkage));
+}
+
+int
+_fini(void)
+{
+	return (mod_remove(&modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
diff --git a/usr/src/uts/common/io/virtio/virtioreg.h b/usr/src/uts/common/io/virtio/virtioreg.h
new file mode 100644
index 0000000000..8cfcd59a47
--- /dev/null
+++ b/usr/src/uts/common/io/virtio/virtioreg.h
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2010 Minoura Makoto.
+ * Copyright (c) 2012 Nexenta Systems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Part of the file derived from `Virtio PCI Card Specification v0.8.6 DRAFT' + * Appendix A. + */ + +/* + * An interface for efficient virtio implementation. + * + * This header is BSD licensed so anyone can use the definitions + * to implement compatible drivers/servers. + * + * Copyright 2007, 2009, IBM Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' ANDANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+
+#ifndef __VIRTIOREG_H__
+#define __VIRTIOREG_H__
+
+#include <sys/types.h>
+
+#define PCI_VENDOR_QUMRANET	0x1af4
+#define PCI_DEV_VIRTIO_MIN	0x1000
+#define PCI_DEV_VIRTIO_MAX	0x103f
+#define VIRTIO_PCI_ABI_VERSION	0
+
+/* Virtio product id (subsystem) */
+#define PCI_PRODUCT_VIRTIO_NETWORK	1
+#define PCI_PRODUCT_VIRTIO_BLOCK	2
+#define PCI_PRODUCT_VIRTIO_CONSOLE	3
+#define PCI_PRODUCT_VIRTIO_ENTROPY	4
+#define PCI_PRODUCT_VIRTIO_BALLOON	5
+#define PCI_PRODUCT_VIRTIO_9P		9
+
+/* Virtio header */
+#define VIRTIO_CONFIG_DEVICE_FEATURES	0	/* 32bit */
+#define VIRTIO_CONFIG_GUEST_FEATURES	4	/* 32bit */
+
+#define VIRTIO_F_NOTIFY_ON_EMPTY	(1<<24)
+#define VIRTIO_F_RING_INDIRECT_DESC	(1<<28)
+#define VIRTIO_F_BAD_FEATURE		(1<<30)
+
+#define VIRTIO_CONFIG_QUEUE_ADDRESS	8	/* 32bit */
+#define VIRTIO_CONFIG_QUEUE_SIZE	12	/* 16bit */
+#define VIRTIO_CONFIG_QUEUE_SELECT	14	/* 16bit */
+#define VIRTIO_CONFIG_QUEUE_NOTIFY	16	/* 16bit */
+#define VIRTIO_CONFIG_DEVICE_STATUS	18	/* 8bit */
+
+#define VIRTIO_CONFIG_DEVICE_STATUS_RESET	0
+#define VIRTIO_CONFIG_DEVICE_STATUS_ACK	1
+#define VIRTIO_CONFIG_DEVICE_STATUS_DRIVER	2
+#define VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK	4
+#define VIRTIO_CONFIG_DEVICE_STATUS_FAILED	128
+
+#define VIRTIO_CONFIG_ISR_STATUS	19	/* 8bit */
+#define VIRTIO_CONFIG_ISR_CONFIG_CHANGE	2
+
+#define VIRTIO_CONFIG_CONFIG_VECTOR	20	/* 16bit, optional */
+#define VIRTIO_CONFIG_QUEUE_VECTOR	22	/* 16bit, optional */
+
+#define VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI	20
+#define VIRTIO_CONFIG_DEVICE_CONFIG_MSI		24
+
+#define VIRTIO_MSI_NO_VECTOR	0xffff
+
+/* Virtqueue */
+/* This marks a buffer as continuing via the next field. */
+#define VRING_DESC_F_NEXT	1
+/*
+ * This marks a buffer as write-only, from the device's perspective
+ * (otherwise it is read-only).
+ */
+#define VRING_DESC_F_WRITE	2
+/* This means the buffer contains a list of buffer descriptors. */
+#define VRING_DESC_F_INDIRECT	4
+
+/*
+ * The Host uses this in used->flags to advise the Guest: don't kick me
+ * when you add a buffer. It's unreliable, so it's simply an
+ * optimization. Guest will still kick if it's out of buffers.
+ */
+#define VRING_USED_F_NO_NOTIFY	1
+/*
+ * The Guest uses this in avail->flags to advise the Host: don't
+ * interrupt me when you consume a buffer. It's unreliable, so it's
+ * simply an optimization.
+ */
+#define VRING_AVAIL_F_NO_INTERRUPT	1
+
+/*
+ * Virtio ring descriptors: 16 bytes.
+ * These can chain together via "next".
+ */
+struct vring_desc {
+	/* Address (guest-physical). */
+	uint64_t addr;
+	/* Length. */
+	uint32_t len;
+	/* The flags as indicated above. */
+	uint16_t flags;
+	/* We chain unused descriptors via this, too */
+	uint16_t next;
+} __attribute__((packed));
+
+struct vring_avail {
+	uint16_t flags;
+	uint16_t idx;
+	uint16_t ring[];
+} __attribute__((packed));
+
+/* u32 is used here for ids for padding reasons. */
+struct vring_used_elem {
+	/* Index of start of used descriptor chain. */
+	uint32_t id;
+	/* Total length of the descriptor chain which was written to. */
+	uint32_t len;
+} __attribute__((packed));
+
+struct vring_used {
+	uint16_t flags;
+	uint16_t idx;
+	struct vring_used_elem ring[];
+} __attribute__((packed));
+
+
+/* Got nothing to do with the system page size, just a confusing name. 
*/ +#define VIRTIO_PAGE_SIZE (4096) + +#endif /* __VIRTIOREG_H__ */ diff --git a/usr/src/uts/common/io/virtio/virtiovar.h b/usr/src/uts/common/io/virtio/virtiovar.h new file mode 100644 index 0000000000..e1617feb5d --- /dev/null +++ b/usr/src/uts/common/io/virtio/virtiovar.h @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2010 Minoura Makoto. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Part of the file derived from `Virtio PCI Card Specification v0.8.6 DRAFT' + * Appendix A. + */ + +/* + * An interface for efficient virtio implementation. + * + * This header is BSD licensed so anyone can use the definitions + * to implement compatible drivers/servers. + * + * Copyright 2007, 2009, IBM Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' ANDANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. 
+ */
+
+#ifndef __VIRTIOVAR_H__
+#define __VIRTIOVAR_H__
+
+#include <sys/types.h>
+#include <sys/dditypes.h>
+#include <sys/cmn_err.h>
+#include <sys/list.h>
+
+#ifdef DEBUG
+#define dev_debug(dip, fmt, arg...) \
+	dev_err(dip, fmt, ##arg)
+#else
+#define dev_debug(dip, fmt, arg...)
+#endif
+
+struct vq_entry {
+	list_node_t qe_list;
+	struct virtqueue *qe_queue;
+	uint16_t qe_index; /* index in vq_desc array */
+	/* The following fields are used only when this is the `head' entry. */
+	struct vq_entry *qe_next;
+	struct vring_desc *qe_desc;
+	ddi_dma_cookie_t qe_indirect_dma_cookie;
+	ddi_dma_handle_t qe_indirect_dma_handle;
+	ddi_acc_handle_t qe_indirect_dma_acch;
+	struct vring_desc *qe_indirect_descs;
+	unsigned int qe_indirect_next;
+};
+
+struct virtqueue {
+	struct virtio_softc *vq_owner;
+	unsigned int vq_num; /* queue size (# of entries) */
+	unsigned int vq_indirect_num;
+	int vq_index; /* queue number (0, 1, ...) */
+
+	/* vring pointers (KVA) */
+	struct vring_desc *vq_descs;
+	struct vring_avail *vq_avail;
+	struct vring_used *vq_used;
+
+	/* virtqueue allocation info */
+	void *vq_vaddr;
+	int vq_availoffset;
+	int vq_usedoffset;
+	ddi_dma_cookie_t vq_dma_cookie;
+	ddi_dma_handle_t vq_dma_handle;
+	ddi_acc_handle_t vq_dma_acch;
+
+	int vq_maxsegsize;
+
+	/* free entry management */
+	struct vq_entry *vq_entries;
+	list_t vq_freelist;
+	kmutex_t vq_freelist_lock;
+	int vq_used_entries;
+
+	/* enqueue/dequeue status */
+	uint16_t vq_avail_idx;
+	kmutex_t vq_avail_lock;
+	uint16_t vq_used_idx;
+	kmutex_t vq_used_lock;
+};
+
+struct virtio_softc {
+	dev_info_t *sc_dev;
+
+	uint_t sc_intr_prio;
+
+	ddi_acc_handle_t sc_ioh;
+	caddr_t sc_io_addr;
+	int sc_config_offset;
+
+	uint32_t sc_features;
+
+	int sc_nvqs; /* set by the user */
+
+	ddi_intr_handle_t *sc_intr_htable;
+	int sc_intr_num;
+	boolean_t sc_intr_config;
+	int sc_intr_cap;
+};
+
+struct virtio_int_handler {
+	ddi_intr_handler_t *vh_func;
+	void *vh_priv;
+};
+
+/* public interface */
+uint32_t virtio_negotiate_features(struct virtio_softc *, uint32_t);
+size_t virtio_show_features(uint32_t features, char *buffer, size_t len);
+boolean_t virtio_has_feature(struct virtio_softc *sc, uint32_t feature);
+void virtio_set_status(struct virtio_softc *sc, unsigned int);
+#define virtio_device_reset(sc) virtio_set_status((sc), 0)
+
+uint8_t virtio_read_device_config_1(struct virtio_softc *sc,
+    unsigned int index);
+uint16_t virtio_read_device_config_2(struct virtio_softc *sc,
+    unsigned int index);
+uint32_t virtio_read_device_config_4(struct virtio_softc *sc,
+    unsigned int index);
+uint64_t virtio_read_device_config_8(struct virtio_softc *sc,
+    unsigned int index);
+void virtio_write_device_config_1(struct virtio_softc *sc,
+    unsigned int index, uint8_t value);
+void virtio_write_device_config_2(struct virtio_softc *sc,
+    unsigned int index, uint16_t value);
+void virtio_write_device_config_4(struct virtio_softc *sc,
+    unsigned int index, uint32_t value);
+void virtio_write_device_config_8(struct virtio_softc *sc,
+    unsigned int index, uint64_t value);
+
+struct virtqueue *virtio_alloc_vq(struct virtio_softc *sc,
+    unsigned int index, unsigned int size,
+    unsigned int indirect_num, const char *name);
+void virtio_free_vq(struct virtqueue *);
+void virtio_reset(struct virtio_softc *);
+struct vq_entry *vq_alloc_entry(struct virtqueue *vq);
+void vq_free_entry(struct virtqueue *vq, struct vq_entry *qe);
+uint_t vq_num_used(struct virtqueue *vq);
+
+void virtio_stop_vq_intr(struct virtqueue *);
+void virtio_start_vq_intr(struct virtqueue 
*); + +void virtio_ve_add_cookie(struct vq_entry *qe, ddi_dma_handle_t dma_handle, + ddi_dma_cookie_t dma_cookie, unsigned int ncookies, boolean_t write); +void virtio_ve_add_indirect_buf(struct vq_entry *qe, uint64_t paddr, + uint32_t len, boolean_t write); +void virtio_ve_set(struct vq_entry *qe, uint64_t paddr, uint32_t len, + boolean_t write); + +void virtio_push_chain(struct vq_entry *qe, boolean_t sync); +struct vq_entry *virtio_pull_chain(struct virtqueue *vq, uint32_t *len); +void virtio_free_chain(struct vq_entry *ve); +void virtio_sync_vq(struct virtqueue *vq); + +int virtio_register_ints(struct virtio_softc *sc, + struct virtio_int_handler *config_handler, + struct virtio_int_handler vq_handlers[]); +void virtio_release_ints(struct virtio_softc *sc); +int virtio_enable_ints(struct virtio_softc *sc); + +#endif /* __VIRTIOVAR_H__ */ diff --git a/usr/src/uts/common/nfs/nfs4.h b/usr/src/uts/common/nfs/nfs4.h index 829043bbe3..cf36c03d0a 100644 --- a/usr/src/uts/common/nfs/nfs4.h +++ b/usr/src/uts/common/nfs/nfs4.h @@ -22,6 +22,9 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. + */ #ifndef _NFS4_H #define _NFS4_H @@ -1307,7 +1310,7 @@ extern char *utf8_to_fn(utf8string *, uint_t *, char *); extern utf8string *str_to_utf8(char *, utf8string *); extern utf8string *utf8_copy(utf8string *, utf8string *); extern int utf8_compare(const utf8string *, const utf8string *); -extern int utf8_dir_verify(utf8string *); +extern nfsstat4 utf8_dir_verify(utf8string *); extern char *utf8_strchr(utf8string *, const char); extern int ln_ace4_cmp(nfsace4 *, nfsace4 *, int); extern int vs_aent_to_ace4(vsecattr_t *, vsecattr_t *, int, int); diff --git a/usr/src/uts/common/os/sunmdi.c b/usr/src/uts/common/os/sunmdi.c index a4ee88fef4..f174b4d4aa 100644 --- a/usr/src/uts/common/os/sunmdi.c +++ b/usr/src/uts/common/os/sunmdi.c @@ -23,8 +23,8 @@ */ /* - * Multipath driver interface (MDI) implementation; see mdi_impl.h for a more - * detailed discussion of the overall mpxio architecture. + * Multipath driver interface (MDI) implementation; see mdi_impldefs.h for a + * more detailed discussion of the overall mpxio architecture. * * Default locking order: * diff --git a/usr/src/uts/common/smbsrv/string.h b/usr/src/uts/common/smbsrv/string.h index ceeb8accde..14b9cac8b8 100644 --- a/usr/src/uts/common/smbsrv/string.h +++ b/usr/src/uts/common/smbsrv/string.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. */ #ifndef _SMBSRV_STRING_H @@ -123,8 +124,7 @@ int smb_isstrupr(const char *); int smb_isstrlwr(const char *); int smb_strcasecmp(const char *, const char *, size_t); -boolean_t smb_match(char *, char *); -boolean_t smb_match_ci(char *, char *); +boolean_t smb_match(const char *, const char *, boolean_t); size_t smb_mbstowcs(smb_wchar_t *, const char *, size_t); size_t smb_wcstombs(char *, const smb_wchar_t *, size_t); diff --git a/usr/src/uts/common/sys/elf.h b/usr/src/uts/common/sys/elf.h index 6d66401259..bc25aee9c4 100644 --- a/usr/src/uts/common/sys/elf.h +++ b/usr/src/uts/common/sys/elf.h @@ -19,6 +19,9 @@ * CDDL HEADER END */ /* + * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. + */ +/* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -812,7 +815,8 @@ typedef Elf64_Word Elf64_Capchain; #define NT_PRPRIVINFO 19 /* priv_impl_info_t <sys/priv.h> */ #define NT_CONTENT 20 /* core_content_t <sys/corectl.h> */ #define NT_ZONENAME 21 /* string from getzonenamebyid(3C) */ -#define NT_NUM 21 +#define NT_FDINFO 22 /* open fd info */ +#define NT_NUM 22 #ifdef _KERNEL diff --git a/usr/src/uts/common/sys/ipmi.h b/usr/src/uts/common/sys/ipmi.h index 9dafac407d..94a53392de 100644 --- a/usr/src/uts/common/sys/ipmi.h +++ b/usr/src/uts/common/sys/ipmi.h @@ -42,8 +42,8 @@ extern "C" { #define IPMI_MAX_ADDR_SIZE 0x20 #define IPMI_MAX_RX 1024 -#define IPMI_BMC_SLAVE_ADDR 0x20 /* Linux Default slave address */ -#define IPMI_BMC_CHANNEL 0x0f /* Linux BMC channel */ +#define IPMI_BMC_SLAVE_ADDR 0x20 /* Default slave address */ +#define IPMI_BMC_CHANNEL 0x0f /* BMC channel */ #define IPMI_BMC_SMS_LUN 0x02 diff --git a/usr/src/uts/common/sys/procfs.h b/usr/src/uts/common/sys/procfs.h index 12a6925368..f592fd9dcf 100644 --- a/usr/src/uts/common/sys/procfs.h +++ b/usr/src/uts/common/sys/procfs.h @@ -23,12 +23,13 @@ * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. + */ #ifndef _SYS_PROCFS_H #define _SYS_PROCFS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -61,6 +62,8 @@ extern "C" { #include <sys/pset.h> #include <sys/procfs_isa.h> #include <sys/priv.h> +#include <sys/stat.h> +#include <sys/param.h> /* * System call interfaces for /proc. @@ -488,6 +491,38 @@ typedef struct prasmap { #define PG_HWMAPPED 0x04 /* page is present and mapped */ /* + * Open files. Only in core files (for now). Note that we'd like to use + * the stat or stat64 structure, but both of these structures are unfortunately + * not consistent between 32 and 64 bit modes. To keep our lives simpler, we + * just define our own structure with types that are not sensitive to this + * difference. Also, it turns out that pfiles omits a lot of info from the + * struct stat (e.g. times, device sizes, etc.) so we don't bother adding those + * here. + */ +typedef struct prfdinfo { + int pr_fd; + mode_t pr_mode; + + uid_t pr_uid; + gid_t pr_gid; + + major_t pr_major; /* think stat.st_dev */ + minor_t pr_minor; + + major_t pr_rmajor; /* think stat.st_rdev */ + minor_t pr_rminor; + + ino64_t pr_ino; + off64_t pr_offset; + off64_t pr_size; + + int pr_fileflags; /* fcntl(F_GETXFL), etc */ + int pr_fdflags; /* fcntl(F_GETFD), etc. 
*/ + + char pr_path[MAXPATHLEN]; +} prfdinfo_t; + +/* * Header for /proc/<pid>/lstatus /proc/<pid>/lpsinfo /proc/<pid>/lusage */ typedef struct prheader { diff --git a/usr/src/uts/common/sys/utsname.h b/usr/src/uts/common/sys/utsname.h index 2b9cf0e33f..4a2aca442c 100644 --- a/usr/src/uts/common/sys/utsname.h +++ b/usr/src/uts/common/sys/utsname.h @@ -31,8 +31,6 @@ #ifndef _SYS_UTSNAME_H #define _SYS_UTSNAME_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/feature_tests.h> #ifdef __cplusplus @@ -67,13 +65,9 @@ extern struct utsname utsname; #if defined(__STDC__) -#if !defined(__lint) -static int uname(struct utsname *); -static int _uname(struct utsname *); -#else extern int uname(struct utsname *); extern int _uname(struct utsname *); -#endif + #if !defined(__XOPEN_OR_POSIX) || defined(__EXTENSIONS__) extern int nuname(struct utsname *); #endif /* !defined(__XOPEN_OR_POSIX) || defined(__EXTENSIONS__) */ @@ -81,13 +75,9 @@ extern int _nuname(struct utsname *); #else /* defined(__STDC__) */ -#if !defined(__lint) -static int uname(); -static int _uname(); -#else extern int uname(); extern int _uname(); -#endif + #if !defined(__XOPEN_OR_POSIX) || defined(__EXTENSIONS__) extern int nuname(); #endif /* !defined(__XOPEN_OR_POSIX) || defined(__EXTENSIONS__) */ @@ -95,30 +85,20 @@ extern int _nuname(); #endif /* defined(__STDC__) */ - -#if !defined(__lint) -static int -#if defined(__STDC__) -_uname(struct utsname *_buf) -#else -_uname(_buf) -struct utsname *_buf; -#endif -{ - return (_nuname(_buf)); -} - -static int -#if defined(__STDC__) -uname(struct utsname *_buf) +/* + * On i386 in SVID.2 uname() returns a utsname structure with 8 byte members, + * and nuname() returns the real struct utsname. In SVID.3 uname and nuname + * are equivalent. Anyone who includes this header gets the SVID.3 behaviour. + * The SVID.2 behaviour exists solely for compatibility, and is what is + * implemented by the libc uname/_uname entrypoints. + */ +#ifdef __PRAGMA_REDEFINE_EXTNAME +#pragma redefine_extname uname _nuname +#pragma redefine_extname _uname _nuname #else -uname(_buf) -struct utsname *_buf; +#define uname _nuname +#define _uname _nuname #endif -{ - return (_nuname(_buf)); -} -#endif /* !defined(__lint) */ #else /* defined(__i386) */ diff --git a/usr/src/uts/common/syscall/poll.c b/usr/src/uts/common/syscall/poll.c index 5e87d073e3..7f37529941 100644 --- a/usr/src/uts/common/syscall/poll.c +++ b/usr/src/uts/common/syscall/poll.c @@ -308,7 +308,7 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) deadline = 0; } else { /* They must wait at least a tick. */ - deadline = tsp->tv_sec * NANOSEC + tsp->tv_nsec; + deadline = ((hrtime_t)tsp->tv_sec * NANOSEC) + tsp->tv_nsec; deadline = MAX(deadline, nsec_per_tick); deadline += gethrtime(); } |
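
Why the (hrtime_t) cast in the poll.c hunk above matters: with a 32-bit
time_t, tv_sec * NANOSEC is evaluated in 32-bit arithmetic and overflows
before it is widened for the 64-bit deadline. A minimal user-level sketch
(editor's addition; the variable names are illustrative, and the
overflowing multiplication is undefined behavior shown only to make the
point):

	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		int32_t tv_sec = 3;	/* stands in for a 32-bit time_t */

		/* Both operands are 32-bit, so the product is, too. */
		int64_t bad = tv_sec * 1000000000;

		/* Widen first, then multiply in 64 bits. */
		int64_t good = (int64_t)tv_sec * 1000000000;

		(void) printf("bad=%lld good=%lld\n",
		    (long long)bad, (long long)good);
		return (0);
	}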