Diffstat (limited to 'usr/src/uts/common/fs')
80 files changed, 13534 insertions, 896 deletions
diff --git a/usr/src/uts/common/fs/bootfs/bootfs_construct.c b/usr/src/uts/common/fs/bootfs/bootfs_construct.c new file mode 100644 index 0000000000..b909b5d121 --- /dev/null +++ b/usr/src/uts/common/fs/bootfs/bootfs_construct.c @@ -0,0 +1,368 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 Joyent, Inc. All rights reserved. + */ + +/* + * This file takes care of reading the boot time modules and constructing them + * into the appropriate series of vnodes. + */ + +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/vfs.h> +#include <sys/sysmacros.h> +#include <sys/stat.h> + +#include <sys/fs/bootfs_impl.h> + +kmem_cache_t *bootfs_node_cache; + +static const vattr_t bootfs_vattr_dir = { + AT_ALL, /* va_mask */ + VDIR, /* va_type */ + S_IFDIR | 0555, /* va_mode */ + 0, /* va_uid */ + 0, /* va_gid */ + 0, /* va_fsid */ + 0, /* va_nodeid */ + 1, /* va_nlink */ + 0, /* va_size */ + 0, /* va_atime */ + 0, /* va_mtime */ + 0, /* va_ctime */ + 0, /* va_rdev */ + 0, /* va_blksize */ + 0, /* va_nblocks */ + 0 /* va_seq */ +}; + +static const vattr_t bootfs_vattr_reg = { + AT_ALL, /* va_mask */ + VREG, /* va_type */ + S_IFREG | 0555, /* va_mode */ + 0, /* va_uid */ + 0, /* va_gid */ + 0, /* va_fsid */ + 0, /* va_nodeid */ + 1, /* va_nlink */ + 0, /* va_size */ + 0, /* va_atime */ + 0, /* va_mtime */ + 0, /* va_ctime */ + 0, /* va_rdev */ + 0, /* va_blksize */ + 0, /* va_nblocks */ + 0 /* va_seq */ +}; + +/*ARGSUSED*/ +int +bootfs_node_constructor(void *buf, void *arg, int kmflags) +{ + bootfs_node_t *bnp = buf; + + bnp->bvn_vnp = vn_alloc(kmflags); + if (bnp->bvn_vnp == NULL) + return (-1); + + return (0); +} + +/*ARGSUSED*/ +void +bootfs_node_destructor(void *buf, void *arg) +{ + bootfs_node_t *bnp = buf; + + vn_free(bnp->bvn_vnp); +} + +static int +bootfs_comparator(const void *a, const void *b) +{ + const bootfs_node_t *lfs, *rfs; + int ret; + + lfs = a; + rfs = b; + + ret = strcmp(lfs->bvn_name, rfs->bvn_name); + if (ret > 0) + ret = 1; + if (ret < 0) + ret = -1; + return (ret); +} + +static void +bootfs_node_init(bootfs_t *bfs, bootfs_node_t *bnp, const struct vattr *vap, + const char *name, size_t namelen) +{ + timestruc_t now; + + vn_reinit(bnp->bvn_vnp); + + bnp->bvn_vnp->v_flag |= VNOSWAP; + bnp->bvn_vnp->v_type = vap->va_type; + bnp->bvn_vnp->v_vfsp = bfs->bfs_vfsp; + bnp->bvn_vnp->v_rdev = 0; + bnp->bvn_vnp->v_data = (caddr_t)bnp; + vn_setops(bnp->bvn_vnp, bootfs_vnodeops); + + bnp->bvn_name = kmem_alloc(namelen + 1, KM_SLEEP); + bcopy(name, bnp->bvn_name, namelen); + bnp->bvn_name[namelen] = '\0'; + if (vap->va_type == VDIR) { + avl_create(&bnp->bvn_dir, bootfs_comparator, + sizeof (bootfs_node_t), + offsetof(bootfs_node_t, bvn_link)); + } + bzero(&bnp->bvn_link, sizeof (avl_node_t)); + bcopy(vap, &bnp->bvn_attr, sizeof (vattr_t)); + + gethrestime(&now); + bnp->bvn_attr.va_atime = now; + bnp->bvn_attr.va_ctime = now; + bnp->bvn_attr.va_mtime = now; + bnp->bvn_attr.va_fsid = makedevice(bootfs_major, bfs->bfs_minor); + bnp->bvn_attr.va_nodeid = bfs->bfs_ninode; + bnp->bvn_attr.va_blksize = PAGESIZE; + bfs->bfs_ninode++; + list_insert_tail(&bfs->bfs_nodes, bnp); 
+} + +static void +bootfs_mkroot(bootfs_t *bfs) +{ + bootfs_node_t *bnp; + + bnp = kmem_cache_alloc(bootfs_node_cache, KM_SLEEP); + bootfs_node_init(bfs, bnp, &bootfs_vattr_dir, "/", 1); + bnp->bvn_vnp->v_flag |= VROOT; + bnp->bvn_parent = bnp; + bfs->bfs_rootvn = bnp; + bfs->bfs_stat.bfss_ndirs.value.ui32++; + vn_exists(bnp->bvn_vnp); +} + +static int +bootfs_mknode(bootfs_t *bfs, bootfs_node_t *parent, bootfs_node_t **outp, + const char *name, size_t namelen, const vattr_t *vap, uintptr_t addr, + uint64_t size) +{ + bootfs_node_t *bnp; + bootfs_node_t sn; + avl_index_t where; + char *buf; + + ASSERT(parent->bvn_attr.va_type == VDIR); + buf = kmem_alloc(namelen + 1, KM_SLEEP); + bcopy(name, buf, namelen); + buf[namelen] = '\0'; + sn.bvn_name = buf; + if ((bnp = avl_find(&parent->bvn_dir, &sn, &where)) != NULL) { + kmem_free(buf, namelen + 1); + /* Directories can collide, files cannot */ + if (vap->va_type == VDIR) { + *outp = bnp; + return (0); + } + return (EEXIST); + } + kmem_free(buf, namelen + 1); + + bnp = kmem_cache_alloc(bootfs_node_cache, KM_SLEEP); + bootfs_node_init(bfs, bnp, vap, name, namelen); + bnp->bvn_parent = parent; + avl_add(&parent->bvn_dir, bnp); + *outp = bnp; + + if (vap->va_type == VDIR) { + parent->bvn_attr.va_size++; + parent->bvn_attr.va_nlink++; + bfs->bfs_stat.bfss_ndirs.value.ui32++; + } else { + bnp->bvn_addr = addr; + bnp->bvn_size = size; + bfs->bfs_stat.bfss_nfiles.value.ui32++; + bfs->bfs_stat.bfss_nbytes.value.ui64 += size; + bnp->bvn_attr.va_nblocks = P2ROUNDUP(size, 512) >> 9; + bnp->bvn_attr.va_size = size; + } + + vn_exists(bnp->bvn_vnp); + + return (0); +} + +/* + * Given the address, size, and path a boot-time module would like, go through + * and create all of the directory entries that are required and then the file + * itself. If someone has passed in a module that has the same name as another + * one, we honor the first one. + */ +static int +bootfs_construct_entry(bootfs_t *bfs, uintptr_t addr, uint64_t size, + const char *mname) +{ + char *sp; + size_t nlen; + int ret; + bootfs_node_t *nbnp; + + const char *p = mname; + bootfs_node_t *bnp = bfs->bfs_rootvn; + + if (*p == '\0') + return (EINVAL); + + for (;;) { + /* First eliminate all leading / characters. */ + while (*p == '/') + p++; + + /* A name with all slashes or ending in a / */ + if (*p == '\0') + return (EINVAL); + + sp = strchr(p, '/'); + if (sp == NULL) + break; + nlen = (ptrdiff_t)sp - (ptrdiff_t)p; + if (strncmp(p, ".", nlen) == 0) { + p = sp + 1; + continue; + } + + if (strncmp(p, "..", nlen) == 0) { + bnp = bnp->bvn_parent; + p = sp + 1; + continue; + } + + VERIFY(bootfs_mknode(bfs, bnp, &nbnp, p, nlen, + &bootfs_vattr_dir, addr, size) == 0); + p = sp + 1; + bnp = nbnp; + } + + nlen = strlen(p); + ret = bootfs_mknode(bfs, bnp, &nbnp, p, nlen, &bootfs_vattr_reg, + addr, size); + if (ret != 0) + return (ret); + + return (0); +} + +/* + * We're going to go through every boot time module and construct the + * appropriate vnodes for them now. Because there are very few of these that + * exist, generally on the order of a handful, we're going to create them all + * when the file system is initialized and then tear them all down when the + * module gets unloaded. + * + * The information about the modules is contained in properties on the root of + * the devinfo tree. Specifically there are three properties per module: + * + * - module-size-%d int64_t size, in bytes, of the boot time module. 
+ * - module-addr-%d The address of the boot time module + * - module-name-%d The string name of the boot time module + * + * Note that the module-size and module-addr fields are always 64-bit values + * regardless of being on a 32-bit or 64-bit kernel. module-name is a string + * property. + * + * There is no property that indicates the total number of such modules. Modules + * start at 0 and work their way up incrementally. The first time we can't find + * a module or a property, then we stop. + */ +void +bootfs_construct(bootfs_t *bfs) +{ + uint_t id = 0, ndata; + char paddr[64], psize[64], pname[64], *mname; + dev_info_t *root; + uchar_t *datap; + uint64_t size = 0, addr = 0; + int ret; + + bootfs_mkroot(bfs); + root = ddi_root_node(); + + for (;;) { + if (id == UINT32_MAX) + break; + + if (snprintf(paddr, sizeof (paddr), "module-addr-%d", id) > + sizeof (paddr)) + break; + + if (snprintf(psize, sizeof (paddr), "module-size-%d", id) > + sizeof (paddr)) + break; + + if (snprintf(pname, sizeof (paddr), "module-name-%d", id) > + sizeof (paddr)) + break; + + if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, root, + DDI_PROP_DONTPASS, paddr, &datap, &ndata) != + DDI_PROP_SUCCESS) + break; + + if (ndata == 8) + bcopy(datap, &addr, sizeof (uint64_t)); + ddi_prop_free(datap); + if (ndata != 8) + break; + + if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, root, + DDI_PROP_DONTPASS, psize, &datap, &ndata) != + DDI_PROP_SUCCESS) + break; + if (ndata == 8) + bcopy(datap, &size, sizeof (uint64_t)); + ddi_prop_free(datap); + if (ndata != 8) + break; + + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, root, + DDI_PROP_DONTPASS, pname, &mname) != DDI_PROP_SUCCESS) + break; + + ret = bootfs_construct_entry(bfs, addr, size, mname); + if (ret == EINVAL) + bfs->bfs_stat.bfss_ndiscards.value.ui32++; + if (ret == EEXIST) + bfs->bfs_stat.bfss_ndups.value.ui32++; + ddi_prop_free(mname); + + id++; + } +} + +void +bootfs_destruct(bootfs_t *bfs) +{ + bootfs_node_t *bnp; + + while ((bnp = list_remove_head(&bfs->bfs_nodes)) != NULL) { + ASSERT(bnp->bvn_vnp->v_count == 1); + VN_RELE(bnp->bvn_vnp); + kmem_free(bnp->bvn_name, strlen(bnp->bvn_name) + 1); + kmem_cache_free(bootfs_node_cache, bnp); + } +} diff --git a/usr/src/uts/common/fs/bootfs/bootfs_vfsops.c b/usr/src/uts/common/fs/bootfs/bootfs_vfsops.c new file mode 100644 index 0000000000..e642e86169 --- /dev/null +++ b/usr/src/uts/common/fs/bootfs/bootfs_vfsops.c @@ -0,0 +1,321 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 Joyent, Inc. + */ + +#include <sys/errno.h> +#include <sys/modctl.h> +#include <sys/types.h> +#include <sys/mkdev.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/systm.h> +#include <sys/id_space.h> +#include <sys/cmn_err.h> +#include <sys/ksynch.h> +#include <sys/policy.h> +#include <sys/mount.h> +#include <sys/sysmacros.h> + +#include <sys/fs/bootfs_impl.h> + +/* + * While booting, additional types of modules and files can be passed in to the + * loader. These include the familiar boot archive, as well as, a module hash + * and additional modules that are interpreted as files. 
As part of the handoff + * in early boot, information about these modules are saved as properties on the + * root of the devinfo tree, similar to other boot-time properties. + * + * This file system provides a read-only view of those additional files. Due to + * its limited scope, it has a slightly simpler construction than several other + * file systems. When mounted, it looks for the corresponding properties and + * creates bootfs_node_t's and vnodes for all of the corresponding files and + * directories that exist along the way. At this time, there are currently a + * rather small number of files passed in this way. + * + * This does lead to one behavior that folks used to other file systems might + * find peculiar. Because we are not always actively creating and destroying the + * required vnodes on demand, the count on the root vnode will not be going up + * accordingly with the existence of other vnodes. This means that a bootfs file + * system that is not in use will have all of its vnodes exist with a v_count of + * one. + */ + +major_t bootfs_major; +static int bootfs_fstype; +static id_space_t *bootfs_idspace; +static uint64_t bootfs_nactive; +static kmutex_t bootfs_lock; + +static const char *bootfs_name = "bootfs"; + +static int +bootfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + int ret; + bootfs_t *bfs; + struct pathname dpn; + dev_t fsdev; + + if ((ret = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) + return (ret); + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (uap->flags & MS_REMOUNT) + return (EBUSY); + + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* + * We indicate that the backing store is bootfs. We don't want to use + * swap, because folks might think that this is putting all the data + * into memory ala tmpfs. Rather these modules are always in memory and + * there's nothing to be done about that. + */ + vfs_setresource(vfsp, bootfs_name, 0); + bfs = kmem_zalloc(sizeof (bootfs_t), KM_NOSLEEP | KM_NORMALPRI); + if (bfs == NULL) + return (ENOMEM); + + ret = pn_get(uap->dir, + (uap->flags & MS_SYSSPACE) ? 
UIO_SYSSPACE : UIO_USERSPACE, &dpn); + if (ret != 0) { + kmem_free(bfs, sizeof (bfs)); + return (ret); + } + + bfs->bfs_minor = id_alloc(bootfs_idspace); + bfs->bfs_kstat = kstat_create_zone("bootfs", bfs->bfs_minor, "bootfs", + "fs", KSTAT_TYPE_NAMED, + sizeof (bootfs_stat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID); + if (bfs->bfs_kstat == NULL) { + id_free(bootfs_idspace, bfs->bfs_minor); + pn_free(&dpn); + kmem_free(bfs, sizeof (bfs)); + return (ENOMEM); + } + bfs->bfs_kstat->ks_data = &bfs->bfs_stat; + + fsdev = makedevice(bootfs_major, bfs->bfs_minor); + bfs->bfs_vfsp = vfsp; + + vfsp->vfs_data = (caddr_t)bfs; + vfsp->vfs_fstype = bootfs_fstype; + vfsp->vfs_dev = fsdev; + vfsp->vfs_bsize = PAGESIZE; + vfsp->vfs_flag |= VFS_RDONLY | VFS_NOSETUID | VFS_NOTRUNC | + VFS_UNLINKABLE; + vfs_make_fsid(&vfsp->vfs_fsid, fsdev, bootfs_fstype); + bfs->bfs_mntpath = kmem_alloc(dpn.pn_pathlen + 1, KM_SLEEP); + bcopy(dpn.pn_path, bfs->bfs_mntpath, dpn.pn_pathlen); + bfs->bfs_mntpath[dpn.pn_pathlen] = '\0'; + pn_free(&dpn); + list_create(&bfs->bfs_nodes, sizeof (bootfs_node_t), + offsetof(bootfs_node_t, bvn_alink)); + + kstat_named_init(&bfs->bfs_stat.bfss_nfiles, "nfiles", + KSTAT_DATA_UINT32); + kstat_named_init(&bfs->bfs_stat.bfss_ndirs, "ndirs", + KSTAT_DATA_UINT32); + kstat_named_init(&bfs->bfs_stat.bfss_nbytes, "nbytes", + KSTAT_DATA_UINT64); + kstat_named_init(&bfs->bfs_stat.bfss_ndups, "ndup", + KSTAT_DATA_UINT32); + kstat_named_init(&bfs->bfs_stat.bfss_ndiscards, "ndiscard", + KSTAT_DATA_UINT32); + + bootfs_construct(bfs); + + kstat_install(bfs->bfs_kstat); + + return (0); +} + +static int +bootfs_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + int ret; + bootfs_t *bfs = vfsp->vfs_data; + bootfs_node_t *bnp; + + if ((ret = secpolicy_fs_unmount(cr, vfsp)) != 0) + return (ret); + + if (flag & MS_FORCE) + return (ENOTSUP); + + for (bnp = list_head(&bfs->bfs_nodes); bnp != NULL; + bnp = list_next(&bfs->bfs_nodes, bnp)) { + mutex_enter(&bnp->bvn_vnp->v_lock); + if (bnp->bvn_vnp->v_count > 1) { + mutex_exit(&bnp->bvn_vnp->v_lock); + return (EBUSY); + } + mutex_exit(&bnp->bvn_vnp->v_lock); + } + + kstat_delete(bfs->bfs_kstat); + bootfs_destruct(bfs); + list_destroy(&bfs->bfs_nodes); + kmem_free(bfs->bfs_mntpath, strlen(bfs->bfs_mntpath) + 1); + id_free(bootfs_idspace, bfs->bfs_minor); + kmem_free(bfs, sizeof (bootfs_t)); + return (0); +} + +static int +bootfs_root(vfs_t *vfsp, vnode_t **vpp) +{ + bootfs_t *bfs; + + bfs = (bootfs_t *)vfsp->vfs_data; + *vpp = bfs->bfs_rootvn->bvn_vnp; + VN_HOLD(*vpp) + + return (0); +} + +static int +bootfs_statvfs(vfs_t *vfsp, struct statvfs64 *sbp) +{ + const bootfs_t *bfs = (bootfs_t *)vfsp; + dev32_t d32; + + sbp->f_bsize = PAGESIZE; + sbp->f_frsize = PAGESIZE; + + sbp->f_blocks = bfs->bfs_stat.bfss_nbytes.value.ui64 >> PAGESHIFT; + sbp->f_bfree = 0; + sbp->f_bavail = 0; + + sbp->f_files = bfs->bfs_stat.bfss_nfiles.value.ui32 + + bfs->bfs_stat.bfss_ndirs.value.ui32; + sbp->f_ffree = 0; + sbp->f_favail = 0; + + (void) cmpldev(&d32, vfsp->vfs_dev); + sbp->f_fsid = d32; + (void) strlcpy(sbp->f_basetype, bootfs_name, FSTYPSZ); + bzero(sbp->f_fstr, sizeof (sbp->f_fstr)); + + return (0); +} + +static const fs_operation_def_t bootfs_vfsops_tmpl[] = { + VFSNAME_MOUNT, { .vfs_mount = bootfs_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = bootfs_unmount }, + VFSNAME_ROOT, { .vfs_root = bootfs_root }, + VFSNAME_STATVFS, { .vfs_statvfs = bootfs_statvfs }, + NULL, NULL +}; + +static int +bootfs_init(int fstype, char *name) +{ + int ret; + + bootfs_fstype = 
fstype; + ASSERT(bootfs_fstype != 0); + + ret = vfs_setfsops(fstype, bootfs_vfsops_tmpl, NULL); + if (ret != 0) + return (ret); + + ret = vn_make_ops(name, bootfs_vnodeops_template, &bootfs_vnodeops); + if (ret != 0) { + (void) vfs_freevfsops_by_type(bootfs_fstype); + return (ret); + } + + bootfs_major = getudev(); + if (bootfs_major == (major_t)-1) { + cmn_err(CE_WARN, "bootfs_init: Can't get unique device number"); + bootfs_major = 0; + } + + bootfs_nactive = 0; + return (0); +} + +static mntopts_t bootfs_mntopts = { + 0, NULL +}; + +static vfsdef_t bootfs_vfsdef = { + VFSDEF_VERSION, + "bootfs", + bootfs_init, + VSW_HASPROTO|VSW_STATS, + &bootfs_mntopts +}; + +static struct modlfs bootfs_modlfs = { + &mod_fsops, "boot-time modules file system", &bootfs_vfsdef +}; + +static struct modlinkage bootfs_modlinkage = { + MODREV_1, &bootfs_modlfs, NULL +}; + +int +_init(void) +{ + bootfs_node_cache = kmem_cache_create("bootfs_node_cache", + sizeof (bootfs_node_t), 0, bootfs_node_constructor, + bootfs_node_destructor, NULL, NULL, NULL, 0); + bootfs_idspace = id_space_create("bootfs_minors", 1, INT32_MAX); + mutex_init(&bootfs_lock, NULL, MUTEX_DEFAULT, NULL); + + return (mod_install(&bootfs_modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&bootfs_modlinkage, modinfop)); +} + +int +_fini(void) +{ + int err; + + mutex_enter(&bootfs_lock); + if (bootfs_nactive > 0) { + mutex_exit(&bootfs_lock); + return (EBUSY); + } + mutex_exit(&bootfs_lock); + + err = mod_remove(&bootfs_modlinkage); + if (err != 0) + return (err); + + (void) vfs_freevfsops_by_type(bootfs_fstype); + vn_freevnodeops(bootfs_vnodeops); + id_space_destroy(bootfs_idspace); + mutex_destroy(&bootfs_lock); + kmem_cache_destroy(bootfs_node_cache); + return (err); +} diff --git a/usr/src/uts/common/fs/bootfs/bootfs_vnops.c b/usr/src/uts/common/fs/bootfs/bootfs_vnops.c new file mode 100644 index 0000000000..f63d0a4f24 --- /dev/null +++ b/usr/src/uts/common/fs/bootfs/bootfs_vnops.c @@ -0,0 +1,544 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc. All rights reserved. 
+ */ + +/* + * bootfs vnode operations + */ + +#include <sys/types.h> +#include <sys/uio.h> +#include <sys/sunddi.h> +#include <sys/errno.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/mman.h> +#include <fs/fs_subr.h> +#include <sys/policy.h> +#include <sys/sysmacros.h> +#include <sys/dirent.h> +#include <sys/uio.h> +#include <vm/pvn.h> +#include <vm/hat.h> +#include <vm/seg_map.h> +#include <vm/seg_vn.h> +#include <sys/vmsystm.h> + +#include <sys/fs/bootfs_impl.h> + +struct vnodeops *bootfs_vnodeops; + +/*ARGSUSED*/ +static int +bootfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + return (0); +} + +/*ARGSUSED*/ +static int +bootfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + return (0); +} + +/*ARGSUSED*/ +static int +bootfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + int err; + ssize_t sres = uiop->uio_resid; + bootfs_node_t *bnp = vp->v_data; + + if (vp->v_type == VDIR) + return (EISDIR); + + if (vp->v_type != VREG) + return (EINVAL); + + if (uiop->uio_loffset < 0) + return (EINVAL); + + if (uiop->uio_loffset >= bnp->bvn_size) + return (0); + + err = 0; + while (uiop->uio_resid != 0) { + caddr_t base; + long offset, frem; + ulong_t poff, segoff; + size_t bytes; + int relerr; + + offset = uiop->uio_loffset; + poff = offset & PAGEOFFSET; + bytes = MIN(PAGESIZE - poff, uiop->uio_resid); + + frem = bnp->bvn_size - offset; + if (frem <= 0) { + err = 0; + break; + } + + /* Don't read past EOF */ + bytes = MIN(bytes, frem); + + /* + * Segmaps are likely larger than our page size, so make sure we + * have the proper offfset into the resulting segmap data. + */ + segoff = (offset & PAGEMASK) & MAXBOFFSET; + + base = segmap_getmapflt(segkmap, vp, offset & MAXBMASK, bytes, + 1, S_READ); + + err = uiomove(base + segoff + poff, bytes, UIO_READ, uiop); + relerr = segmap_release(segkmap, base, 0); + + if (err == 0) + err = relerr; + + if (err != 0) + break; + } + + /* Even if we had an error in a partial read, return success */ + if (uiop->uio_resid > sres) + err = 0; + + gethrestime(&bnp->bvn_attr.va_atime); + + return (err); +} + +/*ARGSUSED*/ +static int +bootfs_ioctl(vnode_t *vp, int cmd, intptr_t data, int flag, + cred_t *cr, int *rvalp, caller_context_t *ct) +{ + return (ENOTTY); +} + +/*ARGSUSED*/ +static int +bootfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + uint32_t mask; + bootfs_node_t *bpn = (bootfs_node_t *)vp->v_data; + + mask = vap->va_mask; + bcopy(&bpn->bvn_attr, vap, sizeof (vattr_t)); + vap->va_mask = mask; + return (0); +} + +/*ARGSUSED*/ +static int +bootfs_access(vnode_t *vp, int mode, int flags, cred_t *cr, + caller_context_t *ct) +{ + int shift = 0; + bootfs_node_t *bpn = (bootfs_node_t *)vp->v_data; + + if (crgetuid(cr) != bpn->bvn_attr.va_uid) { + shift += 3; + if (groupmember(bpn->bvn_attr.va_gid, cr) == 0) + shift += 3; + } + + return (secpolicy_vnode_access2(cr, vp, bpn->bvn_attr.va_uid, + bpn->bvn_attr.va_mode << shift, mode)); +} + +/*ARGSUSED*/ +static int +bootfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + avl_index_t where; + bootfs_node_t sn, *bnp; + bootfs_node_t *bpp = (bootfs_node_t *)dvp->v_data; + + if (flags & LOOKUP_XATTR) + return (EINVAL); + + if (bpp->bvn_attr.va_type != VDIR) + return (ENOTDIR); + + if (*nm == '\0' || strcmp(nm, ".") == 0) { 
+ VN_HOLD(dvp); + *vpp = dvp; + return (0); + } + + if (strcmp(nm, "..") == 0) { + VN_HOLD(bpp->bvn_parent->bvn_vnp); + *vpp = bpp->bvn_parent->bvn_vnp; + return (0); + } + + sn.bvn_name = nm; + bnp = avl_find(&bpp->bvn_dir, &sn, &where); + if (bnp == NULL) + return (ENOENT); + + VN_HOLD(bnp->bvn_vnp); + *vpp = bnp->bvn_vnp; + return (0); +} + +/*ARGSUSED*/ +static int +bootfs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + bootfs_node_t *bnp = (bootfs_node_t *)vp->v_data; + dirent64_t *dp; + void *buf; + ulong_t bsize, brem; + offset_t coff, roff; + int dlen, ret; + bootfs_node_t *dnp; + boolean_t first = B_TRUE; + + if (uiop->uio_loffset >= MAXOFF_T) { + if (eofp != NULL) + *eofp = 1; + return (0); + } + + if (uiop->uio_iovcnt != 1) + return (EINVAL); + + if (!(uiop->uio_iov->iov_len > 0)) + return (EINVAL); + + if (vp->v_type != VDIR) + return (ENOTDIR); + + roff = uiop->uio_loffset; + coff = 0; + brem = bsize = uiop->uio_iov->iov_len; + buf = kmem_alloc(bsize, KM_SLEEP); + dp = buf; + + /* + * Recall that offsets here are done based on the name of the dirent + * excluding the null terminator. Therefore `.` is always at 0, `..` is + * always at 1, and then the first real dirent is at 3. This offset is + * what's actually stored when we update the offset in the structure. + */ + if (roff == 0) { + dlen = DIRENT64_RECLEN(1); + if (first == B_TRUE) { + if (dlen > brem) { + kmem_free(buf, bsize); + return (EINVAL); + } + first = B_FALSE; + } + dp->d_ino = (ino64_t)bnp->bvn_attr.va_nodeid; + dp->d_off = 0; + dp->d_reclen = (ushort_t)dlen; + (void) strncpy(dp->d_name, ".", DIRENT64_NAMELEN(dlen)); + dp = (struct dirent64 *)((uintptr_t)dp + dp->d_reclen); + brem -= dlen; + } + + if (roff <= 1) { + dlen = DIRENT64_RECLEN(2); + if (first == B_TRUE) { + if (dlen > brem) { + kmem_free(buf, bsize); + return (EINVAL); + } + first = B_FALSE; + } + dp->d_ino = (ino64_t)bnp->bvn_parent->bvn_attr.va_nodeid; + dp->d_off = 1; + dp->d_reclen = (ushort_t)dlen; + (void) strncpy(dp->d_name, "..", DIRENT64_NAMELEN(dlen)); + dp = (struct dirent64 *)((uintptr_t)dp + dp->d_reclen); + brem -= dlen; + } + + coff = 3; + for (dnp = avl_first(&bnp->bvn_dir); dnp != NULL; + dnp = AVL_NEXT(&bnp->bvn_dir, dnp)) { + size_t nlen = strlen(dnp->bvn_name); + + if (roff > coff) { + coff += nlen; + continue; + } + + dlen = DIRENT64_RECLEN(nlen); + if (dlen > brem) { + if (first == B_TRUE) { + kmem_free(buf, bsize); + return (EINVAL); + } + break; + } + first = B_FALSE; + + dp->d_ino = (ino64_t)dnp->bvn_attr.va_nodeid; + dp->d_off = coff; + dp->d_reclen = (ushort_t)dlen; + (void) strncpy(dp->d_name, dnp->bvn_name, + DIRENT64_NAMELEN(dlen)); + dp = (struct dirent64 *)((uintptr_t)dp + dp->d_reclen); + brem -= dlen; + coff += nlen; + } + + ret = uiomove(buf, (bsize - brem), UIO_READ, uiop); + + if (ret == 0) { + if (dnp == NULL) { + coff++; + if (eofp != NULL) + *eofp = 1; + } else if (eofp != NULL) { + *eofp = 0; + } + uiop->uio_loffset = coff; + } + gethrestime(&bnp->bvn_attr.va_atime); + kmem_free(buf, bsize); + return (ret); +} + +/*ARGSUSED*/ +static void +bootfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ +} + +/*ARGSUSED*/ +static int +bootfs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + if (write_lock != 0) + return (EINVAL); + return (0); +} + +/*ARGSUSED*/ +static void +bootfs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ +} + +/*ARGSUSED*/ +static int +bootfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, + 
caller_context_t *ct) +{ + bootfs_node_t *bnp = (bootfs_node_t *)vp->v_data; + if (vp->v_type == VDIR) + return (0); + return ((*noffp < 0 || *noffp > bnp->bvn_size ? EINVAL : 0)); +} + +/* + * We need to fill in a single page of a vnode's memory based on the actual data + * from the kernel. We'll use this node's sliding window into physical memory + * and update one page at a time. + */ +/*ARGSUSED*/ +static int +bootfs_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp, + page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw, + cred_t *cr) +{ + bootfs_node_t *bnp = vp->v_data; + page_t *pp, *fpp; + pfn_t pfn; + + for (;;) { + /* Easy case where the page exists */ + pp = page_lookup(vp, off, rw == S_CREATE ? SE_EXCL : SE_SHARED); + if (pp != NULL) { + if (pl != NULL) { + pl[0] = pp; + pl[1] = NULL; + } else { + page_unlock(pp); + } + return (0); + } + + pp = page_create_va(vp, off, PAGESIZE, PG_EXCL | PG_WAIT, seg, + addr); + + /* + * If we didn't get the page, that means someone else beat us to + * creating this so we need to try again. + */ + if (pp != NULL) + break; + } + + pfn = btop((bnp->bvn_addr + off) & PAGEMASK); + fpp = page_numtopp_nolock(pfn); + + if (ppcopy(fpp, pp) == 0) { + pvn_read_done(pp, B_ERROR); + return (EIO); + } + + if (pl != NULL) { + pvn_plist_init(pp, pl, plsz, off, PAGESIZE, rw); + } else { + pvn_io_done(pp); + } + + return (0); +} + +/*ARGSUSED*/ +static int +bootfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, + page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw, + cred_t *cr, caller_context_t *ct) +{ + int err; + bootfs_node_t *bnp = vp->v_data; + + if (off + len > bnp->bvn_size + PAGEOFFSET) + return (EFAULT); + + if (len <= PAGESIZE) + err = bootfs_getapage(vp, (u_offset_t)off, len, protp, pl, + plsz, seg, addr, rw, cr); + else + err = pvn_getpages(bootfs_getapage, vp, (u_offset_t)off, len, + protp, pl, plsz, seg, addr, rw, cr); + + return (err); +} + +/*ARGSUSED*/ +static int +bootfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + int ret; + segvn_crargs_t vn_a; + +#ifdef _ILP32 + if (len > MAXOFF_T) + return (ENOMEM); +#endif + + if (vp->v_flag & VNOMAP) + return (ENOSYS); + + if (off < 0 || off > MAXOFFSET_T - off) + return (ENXIO); + + if (vp->v_type != VREG) + return (ENODEV); + + if (prot & PROT_WRITE) + return (ENOTSUP); + + as_rangelock(as); + ret = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); + if (ret != 0) { + as_rangeunlock(as); + return (ret); + } + + vn_a.vp = vp; + vn_a.offset = (u_offset_t)off; + vn_a.type = flags & MAP_TYPE; + vn_a.prot = prot; + vn_a.maxprot = maxprot; + vn_a.cred = cr; + vn_a.amp = NULL; + vn_a.flags = flags & ~MAP_TYPE; + vn_a.szc = 0; + vn_a.lgrp_mem_policy_flags = 0; + + ret = as_map(as, *addrp, len, segvn_create, &vn_a); + + as_rangeunlock(as); + return (ret); + +} + +/*ARGSUSED*/ +static int +bootfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + return (0); +} + +/*ARGSUSED*/ +static int +bootfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + return (0); +} + +static int +bootfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) +{ + int ret; + + switch (cmd) { + case 
_PC_TIMESTAMP_RESOLUTION: + *valp = 1L; + ret = 0; + break; + default: + ret = fs_pathconf(vp, cmd, valp, cr, ct); + } + + return (ret); +} + +const fs_operation_def_t bootfs_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = bootfs_open }, + VOPNAME_CLOSE, { .vop_close = bootfs_close }, + VOPNAME_READ, { .vop_read = bootfs_read }, + VOPNAME_IOCTL, { .vop_ioctl = bootfs_ioctl }, + VOPNAME_GETATTR, { .vop_getattr = bootfs_getattr }, + VOPNAME_ACCESS, { .vop_access = bootfs_access }, + VOPNAME_LOOKUP, { .vop_lookup = bootfs_lookup }, + VOPNAME_READDIR, { .vop_readdir = bootfs_readdir }, + VOPNAME_INACTIVE, { .vop_inactive = bootfs_inactive }, + VOPNAME_RWLOCK, { .vop_rwlock = bootfs_rwlock }, + VOPNAME_RWUNLOCK, { .vop_rwunlock = bootfs_rwunlock }, + VOPNAME_SEEK, { .vop_seek = bootfs_seek }, + VOPNAME_GETPAGE, { .vop_getpage = bootfs_getpage }, + VOPNAME_MAP, { .vop_map = bootfs_map }, + VOPNAME_ADDMAP, { .vop_addmap = bootfs_addmap }, + VOPNAME_DELMAP, { .vop_delmap = bootfs_delmap }, + VOPNAME_PATHCONF, { .vop_pathconf = bootfs_pathconf }, + VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_nosupport }, + NULL, NULL +}; diff --git a/usr/src/uts/common/fs/dev/sdev_netops.c b/usr/src/uts/common/fs/dev/sdev_netops.c index 4eaf38f484..41441ec52d 100644 --- a/usr/src/uts/common/fs/dev/sdev_netops.c +++ b/usr/src/uts/common/fs/dev/sdev_netops.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ /* @@ -41,8 +42,102 @@ #include <sys/zone.h> #include <sys/dls.h> +static const char *devnet_zpath = "/dev/net/zone/"; struct vnodeops *devnet_vnodeops; +static zoneid_t +devnet_nodetozone(sdev_node_t *dv) +{ + char *zname = NULL, *dup; + zone_t *zone; + int duplen; + zoneid_t zid; + + /* + * If in a non-global zone, always return it's zid no matter what the + * node is. + */ + zid = getzoneid(); + if (zid != GLOBAL_ZONEID) + return (zid); + + /* + * If it doesn't have /dev/net/zone/ then it can't be a specific zone + * we're targetting. + */ + if (strncmp(devnet_zpath, dv->sdev_path, strlen(devnet_zpath)) != 0) + return (GLOBAL_ZONEID); + + if (dv->sdev_vnode->v_type == VDIR) { + zone = zone_find_by_name(dv->sdev_name); + } else { + /* Non directories have the form /dev/net/zone/%z/%s */ + dup = strdup(dv->sdev_path); + duplen = strlen(dup); + zname = strrchr(dup, '/'); + *zname = '\0'; + zname--; + zname = strrchr(dup, '/'); + zname++; + zone = zone_find_by_name(zname); + kmem_free(dup, duplen + 1); + } + if (zone == NULL) + return (GLOBAL_ZONEID); + zid = zone->zone_id; + zone_rele(zone); + return (zid); +} + +static int +devnet_mkdir(struct sdev_node *ddv, char *name) +{ + sdev_node_t *dv; + struct vattr va; + int ret; + + ASSERT(RW_WRITE_HELD(&ddv->sdev_contents)); + dv = sdev_cache_lookup(ddv, name); + if (dv != NULL) { + SDEV_SIMPLE_RELE(dv); + return (EEXIST); + } + + va = *sdev_getdefault_attr(VDIR); + gethrestime(&va.va_atime); + va.va_mtime = va.va_atime; + va.va_ctime = va.va_atime; + + ret = sdev_mknode(ddv, name, &dv, &va, NULL, NULL, kcred, SDEV_READY); + if (ret != 0) + return (ret); + SDEV_SIMPLE_RELE(dv); + return (0); +} + +/* + * We basically need to walk down the directory path to determine what we should + * do. At the top level of /dev/net, only the directory /dev/net/zone is valid, + * and it is always valid. Following on that, /dev/net/zone/%zonename is valid + * if and only if we can look up that zone name. 
If it's not, or it's some other + * name, then it's SDEV_VTOR_INVALID. + */ +static int +devnet_dirvalidate(struct sdev_node *dv) +{ + zone_t *zonep; + char *path = "/dev/net/zone"; + + if (strcmp(path, dv->sdev_path) == 0) + return (SDEV_VTOR_VALID); + + zonep = zone_find_by_name(dv->sdev_name); + if (zonep == NULL) + return (SDEV_VTOR_INVALID); + zone_rele(zonep); + return (SDEV_VTOR_VALID); +} + /* * Check if a net sdev_node is still valid - i.e. it represents a current * network link. @@ -60,11 +155,20 @@ devnet_validate(struct sdev_node *dv) ASSERT(dv->sdev_state == SDEV_READY); - if (dls_mgmt_get_linkid(dv->sdev_name, &linkid) != 0) + if (dv->sdev_vnode->v_type == VDIR) + return (devnet_dirvalidate(dv)); + + if (strncmp(devnet_zpath, dv->sdev_path, strlen(devnet_zpath)) == 0) { + ASSERT(SDEV_IS_GLOBAL(dv)); + zoneid = devnet_nodetozone(dv); + } else { + zoneid = getzoneid(); + } + + if (dls_mgmt_get_linkid_in_zone(dv->sdev_name, &linkid, zoneid) != 0) return (SDEV_VTOR_INVALID); - if (SDEV_IS_GLOBAL(dv)) + if (zoneid == GLOBAL_ZONEID) return (SDEV_VTOR_VALID); - zoneid = getzoneid(); return (zone_check_datalink(&zoneid, linkid) == 0 ? SDEV_VTOR_VALID : SDEV_VTOR_INVALID); } @@ -74,13 +178,14 @@ devnet_validate(struct sdev_node *dv) * a net entry when the node is not found in the cache. */ static int -devnet_create_rvp(const char *nm, struct vattr *vap, dls_dl_handle_t *ddhp) +devnet_create_rvp(const char *nm, struct vattr *vap, dls_dl_handle_t *ddhp, + zoneid_t zid) { timestruc_t now; dev_t dev; int error; - if ((error = dls_devnet_open(nm, ddhp, &dev)) != 0) { + if ((error = dls_devnet_open_in_zone(nm, ddhp, &dev, zid)) != 0) { sdcmn_err12(("devnet_create_rvp: not a valid vanity name " "network node: %s\n", nm)); return (error); @@ -116,6 +221,7 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, struct sdev_node *ddv = VTOSDEV(dvp); struct sdev_node *dv = NULL; dls_dl_handle_t ddh = NULL; + zone_t *zone; struct vattr vattr; int nmlen; int error = ENOENT; @@ -123,6 +229,9 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, if (SDEVTOV(ddv)->v_type != VDIR) return (ENOTDIR); + if (!SDEV_IS_GLOBAL(ddv) && crgetzoneid(cred) == GLOBAL_ZONEID) + return (EPERM); + /* * Empty name or ., return node itself. */ @@ -145,6 +254,12 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, rw_enter(&ddv->sdev_contents, RW_WRITER); /* + * ZOMBIED parent does not allow new node creation, bail out early. + */ + if (ddv->sdev_state == SDEV_ZOMBIE) + goto failed; + + /* * directory cache lookup: */ if ((dv = sdev_cache_lookup(ddv, nm)) != NULL) { @@ -153,13 +268,42 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, goto found; } + if (SDEV_IS_GLOBAL(ddv)) { + /* + * Check for /dev/net/zone + */ + if (strcmp("zone", nm) == 0 && strcmp("/dev/net", + ddv->sdev_path) == 0) { + (void) devnet_mkdir(ddv, nm); + dv = sdev_cache_lookup(ddv, nm); + ASSERT(dv != NULL); + goto found; + } + + /* + * Check for /dev/net/zone/%z. We can't use devnet_zpath due to + * its trailing slash. + */ + if (strcmp("/dev/net/zone", ddv->sdev_path) == 0) { + zone = zone_find_by_name(nm); + if (zone == NULL) + goto failed; + (void) devnet_mkdir(ddv, nm); + zone_rele(zone); + dv = sdev_cache_lookup(ddv, nm); + ASSERT(dv != NULL); + goto found; + } + } else if (strcmp("/dev/net", ddv->sdev_path) != 0) { + goto failed; + } + /* - * ZOMBIED parent does not allow new node creation, bail out early. + * We didn't find what we were looking for. 
What that is depends a lot + * on what directory we're in. */ - if (ddv->sdev_state == SDEV_ZOMBIE) - goto failed; - error = devnet_create_rvp(nm, &vattr, &ddh); + error = devnet_create_rvp(nm, &vattr, &ddh, devnet_nodetozone(ddv)); if (error != 0) goto failed; @@ -219,7 +363,7 @@ devnet_filldir_datalink(datalink_id_t linkid, void *arg) if ((dv = sdev_cache_lookup(ddv, (char *)link)) != NULL) goto found; - if (devnet_create_rvp(link, &vattr, &ddh) != 0) + if (devnet_create_rvp(link, &vattr, &ddh, devnet_nodetozone(arg)) != 0) return (0); ASSERT(ddh != NULL); @@ -244,16 +388,77 @@ found: return (0); } +/* + * Fill in all the entries for the current zone. + */ static void -devnet_filldir(struct sdev_node *ddv) +devnet_fillzone(struct sdev_node *ddv, zoneid_t zid) { - sdev_node_t *dv, *next; datalink_id_t linkid; + ASSERT(RW_WRITE_HELD(&ddv->sdev_contents)); + if (zid == GLOBAL_ZONEID) { + ASSERT(SDEV_IS_GLOBAL(ddv)); + linkid = DATALINK_INVALID_LINKID; + do { + linkid = dls_mgmt_get_next(linkid, DATALINK_CLASS_ALL, + DATALINK_ANY_MEDIATYPE, DLMGMT_ACTIVE); + if (linkid != DATALINK_INVALID_LINKID) + (void) devnet_filldir_datalink(linkid, ddv); + } while (linkid != DATALINK_INVALID_LINKID); + } else { + (void) zone_datalink_walk(zid, devnet_filldir_datalink, ddv); + } +} + +/* + * Callback for zone_walk when filling up /dev/net/zone/... + */ +static int +devnet_fillzdir_cb(zone_t *zonep, void *arg) +{ + sdev_node_t *ddv = arg; + + ASSERT(RW_WRITE_HELD(&ddv->sdev_contents)); + (void) devnet_mkdir(ddv, zonep->zone_name); + return (0); +} + +/* + * Fill in a directory that isn't the top level /dev/net. + */ +static void +devnet_fillzdir(struct sdev_node *ddv) +{ + zone_t *zonep; + char *path = "/dev/net/zone"; + + if (strcmp(path, ddv->sdev_path) == 0) { + (void) zone_walk(devnet_fillzdir_cb, ddv); + return; + } + + zonep = zone_find_by_name(ddv->sdev_name); + if (zonep == NULL) + return; + devnet_fillzone(ddv, zonep->zone_id); + zone_rele(zonep); +} + +static void +devnet_filldir(struct sdev_node *ddv) +{ + int ret; + sdev_node_t *dv, *next; + ASSERT(RW_READ_HELD(&ddv->sdev_contents)); if (rw_tryupgrade(&ddv->sdev_contents) == NULL) { rw_exit(&ddv->sdev_contents); rw_enter(&ddv->sdev_contents, RW_WRITER); + if (ddv->sdev_state == SDEV_ZOMBIE) { + rw_exit(&ddv->sdev_contents); + return; + } } for (dv = SDEV_FIRST_ENTRY(ddv); dv; dv = next) { @@ -276,31 +481,38 @@ devnet_filldir(struct sdev_node *ddv) if (SDEVTOV(dv)->v_count > 0) continue; + SDEV_HOLD(dv); + + /* + * Clean out everything underneath before we remove ourselves. 
+ */ + if (SDEVTOV(ddv)->v_type == VDIR) { + ret = sdev_cleandir(dv, NULL, 0); + ASSERT(ret == 0); + } /* remove the cache node */ (void) sdev_cache_update(ddv, &dv, dv->sdev_name, SDEV_CACHE_DELETE); SDEV_RELE(dv); } + if (strcmp(ddv->sdev_path, "/dev/net") != 0) { + devnet_fillzdir(ddv); + goto done; + } + if (((ddv->sdev_flags & SDEV_BUILD) == 0) && !dls_devnet_rebuild()) goto done; if (SDEV_IS_GLOBAL(ddv)) { - linkid = DATALINK_INVALID_LINKID; - do { - linkid = dls_mgmt_get_next(linkid, DATALINK_CLASS_ALL, - DATALINK_ANY_MEDIATYPE, DLMGMT_ACTIVE); - if (linkid != DATALINK_INVALID_LINKID) - (void) devnet_filldir_datalink(linkid, ddv); - } while (linkid != DATALINK_INVALID_LINKID); + devnet_fillzone(ddv, GLOBAL_ZONEID); + (void) devnet_mkdir(ddv, "zone"); } else { - (void) zone_datalink_walk(getzoneid(), - devnet_filldir_datalink, ddv); + devnet_fillzone(ddv, getzoneid()); } ddv->sdev_flags &= ~SDEV_BUILD; - done: rw_downgrade(&ddv->sdev_contents); } @@ -319,6 +531,9 @@ devnet_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, ASSERT(sdvp); + if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp)) + return (EPERM); + if (uiop->uio_offset == 0) devnet_filldir(sdvp); diff --git a/usr/src/uts/common/fs/dev/sdev_plugin.c b/usr/src/uts/common/fs/dev/sdev_plugin.c new file mode 100644 index 0000000000..885191175f --- /dev/null +++ b/usr/src/uts/common/fs/dev/sdev_plugin.c @@ -0,0 +1,913 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + +/* + * Dynamic directory plugin interface for sdev. + * + * The sdev plugin interfaces provides a means for a dynamic directory based on + * in-kernel state to be simply created. Traditionally, dynamic directories were + * built into sdev itself. While these legacy plugins are useful, it makes more + * sense for these pieces of functionality to live with the individual drivers. + * + * The plugin interface requires folks to implement three interfaces and + * provides a series of callbacks that can be made in the context of those + * interfaces to interrogate the sdev_node_t without having to leak + * implementation details of the sdev_node_t. These interfaces are: + * + * o spo_validate + * + * Given a particular node, answer the question as to whether or not this + * entry is still valid. Here, plugins should use the name and the dev_t + * associated with the node to verify that it matches something that still + * exists. + * + * o spo_filldir + * + * Fill all the entries inside of a directory. Note that some of these entries + * may already exist. + * + * o spo_inactive + * + * The given node is no longer being used. This allows the consumer to + * potentially tear down anything that was being held open related to this. + * Note that this only fires when the given sdev_node_t becomes a zombie. + * + * During these callbacks a consumer is not allowed to register or unregister a + * plugin, especially their own. They may call the sdev_ctx style functions. All + * callbacks fire in a context where blocking is allowed (eg. the spl is below + * LOCK_LEVEL). 
+ * + * When a plugin is added, we create its directory in the global zone. By doing + * that, we ensure that something isn't already there and that nothing else can + * come along and try and create something without our knowledge. We only have + * to create it in the GZ and not for all other instances of sdev because an + * instance of sdev that isn't at /dev does not have dynamic directories, and + * second, any instance of sdev present in a non-global zone cannot create + * anything, therefore we know that by it not being in the global zone's + * instance of sdev that we're good to go. + * + * Lock Ordering + * ------------- + * + * The global sdev_plugin_lock must be held before any of the individual + * sdev_plugin_t`sp_lock. Further, once any plugin related lock has been held, + * it is not legal to take any holds on any sdev_node_t or to grab the + * sdev_node_t`contents_lock in any way. + */ + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/fs/sdev_impl.h> +#include <sys/fs/sdev_plugin.h> +#include <fs/fs_subr.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> +#include <sys/sysmacros.h> +#include <sys/list.h> +#include <sys/ctype.h> + +kmutex_t sdev_plugin_lock; +list_t sdev_plugin_list; +kmem_cache_t *sdev_plugin_cache; +struct vnodeops *sdev_plugin_vnops; + +#define SDEV_PLUGIN_NAMELEN 64 + +typedef struct sdev_plugin { + list_node_t sp_link; + char sp_name[SDEV_PLUGIN_NAMELEN]; /* E */ + int sp_nflags; /* E */ + struct vnodeops *sp_vnops; /* E */ + sdev_plugin_ops_t *sp_pops; /* E */ + boolean_t sp_islegacy; /* E */ + int (*sp_lvtor)(sdev_node_t *); /* E */ + kmutex_t sp_lock; /* Protects everything below */ + kcondvar_t sp_nodecv; + size_t sp_nnodes; +} sdev_plugin_t; + +/* ARGSUSED */ +static int +sdev_plugin_cache_constructor(void *buf, void *arg, int tags) +{ + sdev_plugin_t *spp = buf; + mutex_init(&spp->sp_lock, NULL, MUTEX_DRIVER, 0); + cv_init(&spp->sp_nodecv, NULL, CV_DRIVER, NULL); + return (0); +} + +/* ARGSUSED */ +static void +sdev_plugin_cache_destructor(void *buf, void *arg) +{ + sdev_plugin_t *spp = buf; + cv_destroy(&spp->sp_nodecv); + mutex_destroy(&spp->sp_lock); +} + +enum vtype +sdev_ctx_vtype(sdev_ctx_t ctx) +{ + sdev_node_t *sdp = (sdev_node_t *)ctx; + + ASSERT(RW_LOCK_HELD(&sdp->sdev_contents)); + return (sdp->sdev_vnode->v_type); +} + +const char * +sdev_ctx_path(sdev_ctx_t ctx) +{ + sdev_node_t *sdp = (sdev_node_t *)ctx; + + ASSERT(RW_LOCK_HELD(&sdp->sdev_contents)); + return (sdp->sdev_path); +} + +const char * +sdev_ctx_name(sdev_ctx_t ctx) +{ + sdev_node_t *sdp = (sdev_node_t *)ctx; + + ASSERT(RW_LOCK_HELD(&sdp->sdev_contents)); + return (sdp->sdev_name); +} + +/* + * Currently we only support psasing through a single flag -- SDEV_IS_GLOBAL. + */ +sdev_ctx_flags_t +sdev_ctx_flags(sdev_ctx_t ctx) +{ + sdev_node_t *sdp = (sdev_node_t *)ctx; + + ASSERT(RW_LOCK_HELD(&sdp->sdev_contents)); + return (sdp->sdev_flags & SDEV_GLOBAL); +} + +/* + * Return some amount of private data specific to the vtype. In the case of a + * character or block device this is the device number. + */ +const void * +sdev_ctx_vtype_data(sdev_ctx_t ctx) +{ + sdev_node_t *sdp = (sdev_node_t *)ctx; + void *ret; + + ASSERT(RW_LOCK_HELD(&sdp->sdev_contents)); + switch (sdp->sdev_vnode->v_type) { + case VCHR: + case VBLK: + ret = (void *)(uintptr_t)(sdp->sdev_vnode->v_rdev); + break; + default: + ret = NULL; + break; + } + + return (ret); +} + +/* + * Use the same rules as zones for a name. isalphanum + '-', '_', and '.'. 
+ */ +static int +sdev_plugin_name_isvalid(const char *c, int buflen) +{ + int i; + + for (i = 0; i < buflen; i++, c++) { + if (*c == '\0') + return (1); + + if (!isalnum(*c) && *c != '-' && *c != '_' && *c != '.') + return (0); + } + /* Never found a null terminator */ + return (0); +} + +static int +sdev_plugin_mknode(sdev_plugin_t *spp, sdev_node_t *sdvp, char *name, + vattr_t *vap) +{ + int ret; + sdev_node_t *svp; + + ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents)); + ASSERT(spp != NULL); + svp = sdev_cache_lookup(sdvp, name); + if (svp != NULL) { + SDEV_SIMPLE_RELE(svp); + return (EEXIST); + } + + ret = sdev_mknode(sdvp, name, &svp, vap, NULL, NULL, kcred, + SDEV_READY); + if (ret != 0) + return (ret); + SDEV_SIMPLE_RELE(svp); + + return (0); +} + +/* + * Plugin node creation callbacks + */ +int +sdev_plugin_mkdir(sdev_ctx_t ctx, char *name) +{ + sdev_node_t *sdvp; + timestruc_t now; + struct vattr vap; + + if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0) + return (EINVAL); + + sdvp = (sdev_node_t *)ctx; + ASSERT(sdvp->sdev_private != NULL); + ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents)); + + vap = *sdev_getdefault_attr(VDIR); + gethrestime(&now); + vap.va_atime = now; + vap.va_mtime = now; + vap.va_ctime = now; + + return (sdev_plugin_mknode(sdvp->sdev_private, sdvp, name, &vap)); +} + +int +sdev_plugin_mknod(sdev_ctx_t ctx, char *name, mode_t mode, dev_t dev) +{ + sdev_node_t *sdvp; + timestruc_t now; + struct vattr vap; + + if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0) + return (EINVAL); + + sdvp = (sdev_node_t *)ctx; + ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents)); + if (mode != S_IFCHR && mode != S_IFBLK) + return (EINVAL); + + ASSERT(sdvp->sdev_private != NULL); + + vap = *sdev_getdefault_attr(mode == S_IFCHR ? VCHR : VBLK); + gethrestime(&now); + vap.va_atime = now; + vap.va_mtime = now; + vap.va_ctime = now; + vap.va_rdev = dev; + vap.va_mode = mode | 0666; + + /* Despite the similar name, this is in fact a different function */ + return (sdev_plugin_mknode(sdvp->sdev_private, sdvp, name, &vap)); + +} + +static int +sdev_plugin_validate(sdev_node_t *sdp) +{ + int ret; + sdev_plugin_t *spp; + + ASSERT(sdp->sdev_private != NULL); + spp = sdp->sdev_private; + ASSERT(spp->sp_islegacy == B_FALSE); + ASSERT(spp->sp_pops != NULL); + rw_enter(&sdp->sdev_contents, RW_READER); + ret = spp->sp_pops->spo_validate((uintptr_t)sdp); + rw_exit(&sdp->sdev_contents); + return (ret); +} + +static void +sdev_plugin_validate_dir(sdev_node_t *sdvp) +{ + int ret; + sdev_node_t *svp, *next; + + ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents)); + + for (svp = SDEV_FIRST_ENTRY(sdvp); svp != NULL; svp = next) { + + next = SDEV_NEXT_ENTRY(sdvp, svp); + ASSERT(svp->sdev_state != SDEV_ZOMBIE); + /* skip nodes that aren't ready */ + if (svp->sdev_state == SDEV_INIT) + continue; + + switch (sdev_plugin_validate(svp)) { + case SDEV_VTOR_VALID: + case SDEV_VTOR_SKIP: + continue; + case SDEV_VTOR_INVALID: + case SDEV_VTOR_STALE: + break; + } + + SDEV_HOLD(svp); + + /* + * Clean out everything underneath this node before we + * remove it. 
+ */ + if (svp->sdev_vnode->v_type == VDIR) { + ret = sdev_cleandir(svp, NULL, 0); + ASSERT(ret == 0); + } + /* remove the cache node */ + (void) sdev_cache_update(sdvp, &svp, svp->sdev_name, + SDEV_CACHE_DELETE); + SDEV_RELE(svp); + } +} + +/* ARGSUSED */ +static int +sdev_plugin_vop_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, + int *eofp, caller_context_t *ct_unused, int flags_unused) +{ + int ret; + sdev_node_t *sdvp = VTOSDEV(dvp); + sdev_plugin_t *spp; + + ASSERT(RW_READ_HELD(&sdvp->sdev_contents)); + + /* Sanity check we're not a zombie before we do anyting else */ + if (sdvp->sdev_state == SDEV_ZOMBIE) + return (ENOENT); + + spp = sdvp->sdev_private; + ASSERT(spp != NULL); + ASSERT(spp->sp_islegacy == B_FALSE); + ASSERT(spp->sp_pops != NULL); + + if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp)) + return (EPERM); + + if (uiop->uio_offset == 0) { + /* + * We upgrade to a write lock and grab the plugin's lock along + * the way. We're almost certainly going to get creation + * callbacks, so this is the only safe way to go. + */ + if (rw_tryupgrade(&sdvp->sdev_contents) == 0) { + rw_exit(&sdvp->sdev_contents); + rw_enter(&sdvp->sdev_contents, RW_WRITER); + if (sdvp->sdev_state == SDEV_ZOMBIE) { + rw_downgrade(&sdvp->sdev_contents); + return (ENOENT); + } + } + + sdev_plugin_validate_dir(sdvp); + ret = spp->sp_pops->spo_filldir((uintptr_t)sdvp); + rw_downgrade(&sdvp->sdev_contents); + if (ret != 0) + return (ret); + } + + return (devname_readdir_func(dvp, uiop, cred, eofp, 0)); +} + +/* + * If we don't have a callback function that returns a failure, then sdev will + * try to create a node for us which violates all of our basic assertions. To + * work around that we create our own callback for devname_lookup_func which + * always returns ENOENT as at this point either it was created with the filldir + * callback or it was not. + */ +/*ARGSUSED*/ +static int +sdev_plugin_vop_lookup_cb(sdev_node_t *ddv, char *nm, void **arg, cred_t *cred, + void *unused, char *unused2) +{ + return (ENOENT); +} + +/* ARGSUSED */ +static int +sdev_plugin_vop_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, + struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred, + caller_context_t *ct, int *direntflags, pathname_t *realpnp) +{ + int ret; + sdev_node_t *sdvp; + sdev_plugin_t *spp; + + /* execute access is required to search the directory */ + if ((ret = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0) + return (ret); + + sdvp = VTOSDEV(dvp); + spp = sdvp->sdev_private; + ASSERT(spp != NULL); + ASSERT(spp->sp_islegacy == B_FALSE); + ASSERT(spp->sp_pops != NULL); + + if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp)) + return (EPERM); + + /* + * Go straight for the write lock. + */ + rw_enter(&sdvp->sdev_contents, RW_WRITER); + if (sdvp->sdev_state == SDEV_ZOMBIE) { + rw_exit(&sdvp->sdev_contents); + return (ENOENT); + } + sdev_plugin_validate_dir(sdvp); + ret = spp->sp_pops->spo_filldir((uintptr_t)sdvp); + rw_exit(&sdvp->sdev_contents); + if (ret != 0) + return (ret); + + return (devname_lookup_func(sdvp, nm, vpp, cred, + sdev_plugin_vop_lookup_cb, SDEV_VATTR)); +} + +/* + * sdev is not a good citizen. We get inactive callbacks whenever a vnode goes + * to zero, but isn't necessairily a zombie yet. As such, to make things easier + * for users, we only fire the inactive callback when the node becomes a zombie + * and thus will be torn down here. 
+ */ +static void +sdev_plugin_vop_inactive_cb(struct vnode *dvp) +{ + sdev_node_t *sdp = VTOSDEV(dvp); + sdev_plugin_t *spp = sdp->sdev_private; + + rw_enter(&sdp->sdev_contents, RW_READER); + if (sdp->sdev_state != SDEV_ZOMBIE) { + rw_exit(&sdp->sdev_contents); + return; + } + spp->sp_pops->spo_inactive((uintptr_t)sdp); + mutex_enter(&spp->sp_lock); + VERIFY(spp->sp_nnodes > 0); + spp->sp_nnodes--; + cv_signal(&spp->sp_nodecv); + mutex_exit(&spp->sp_lock); + rw_exit(&sdp->sdev_contents); +} + +/*ARGSUSED*/ +static void +sdev_plugin_vop_inactive(struct vnode *dvp, struct cred *cred, + caller_context_t *ct) +{ + sdev_node_t *sdp = VTOSDEV(dvp); + sdev_plugin_t *spp = sdp->sdev_private; + ASSERT(sdp->sdev_private != NULL); + ASSERT(spp->sp_islegacy == B_FALSE); + devname_inactive_func(dvp, cred, sdev_plugin_vop_inactive_cb); +} + +const fs_operation_def_t sdev_plugin_vnodeops_tbl[] = { + VOPNAME_READDIR, { .vop_readdir = sdev_plugin_vop_readdir }, + VOPNAME_LOOKUP, { .vop_lookup = sdev_plugin_vop_lookup }, + VOPNAME_INACTIVE, { .vop_inactive = sdev_plugin_vop_inactive }, + VOPNAME_CREATE, { .error = fs_nosys }, + VOPNAME_REMOVE, { .error = fs_nosys }, + VOPNAME_MKDIR, { .error = fs_nosys }, + VOPNAME_RMDIR, { .error = fs_nosys }, + VOPNAME_SYMLINK, { .error = fs_nosys }, + VOPNAME_SETSECATTR, { .error = fs_nosys }, + NULL, NULL +}; + +/* + * construct a new template with overrides from vtab + */ +static fs_operation_def_t * +sdev_merge_vtab(const fs_operation_def_t tab[]) +{ + fs_operation_def_t *new; + const fs_operation_def_t *tab_entry; + + /* make a copy of standard vnode ops table */ + new = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP); + bcopy((void *)sdev_vnodeops_tbl, new, sdev_vnodeops_tbl_size); + + /* replace the overrides from tab */ + for (tab_entry = tab; tab_entry->name != NULL; tab_entry++) { + fs_operation_def_t *std_entry = new; + while (std_entry->name) { + if (strcmp(tab_entry->name, std_entry->name) == 0) { + std_entry->func = tab_entry->func; + break; + } + std_entry++; + } + } + + return (new); +} + +/* free memory allocated by sdev_merge_vtab */ +static void +sdev_free_vtab(fs_operation_def_t *new) +{ + kmem_free(new, sdev_vnodeops_tbl_size); +} + +/* + * Register a new plugin. + */ +sdev_plugin_hdl_t +sdev_plugin_register(const char *name, sdev_plugin_ops_t *ops, int *errp) +{ + int ret, err; + sdev_plugin_t *spp, *iter; + vnode_t *vp, *nvp; + sdev_node_t *sdp, *slp; + timestruc_t now; + struct vattr vap; + + /* + * Some consumers don't care about why they failed. To keep the code + * simple, we'll just pretend they gave us something. 
+ */ + if (errp == NULL) + errp = &err; + + if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0) { + *errp = EINVAL; + return (NULL); + } + + if (ops->spo_version != 1) { + *errp = EINVAL; + return (NULL); + } + + if (ops->spo_validate == NULL || ops->spo_filldir == NULL || + ops->spo_inactive == NULL) { + *errp = EINVAL; + return (NULL); + } + + if ((ops->spo_flags & ~SDEV_PLUGIN_FLAGS_MASK) != 0) { + *errp = EINVAL; + return (NULL); + } + + spp = kmem_cache_alloc(sdev_plugin_cache, KM_SLEEP); + (void) strlcpy(spp->sp_name, name, SDEV_PLUGIN_NAMELEN); + + spp->sp_pops = ops; + spp->sp_nflags = SDEV_DYNAMIC | SDEV_VTOR; + if (ops->spo_flags & SDEV_PLUGIN_NO_NCACHE) + spp->sp_nflags |= SDEV_NO_NCACHE; + if (ops->spo_flags & SDEV_PLUGIN_SUBDIR) + spp->sp_nflags |= SDEV_SUBDIR; + spp->sp_vnops = sdev_plugin_vnops; + spp->sp_islegacy = B_FALSE; + spp->sp_lvtor = NULL; + spp->sp_nnodes = 0; + + /* + * Make sure it's unique, nothing exists with this name already, and add + * it to the list. We also need to go through and grab the sdev + * root node as we cannot grab any sdev node locks once we've grabbed + * the sdev_plugin_lock. We effectively assert that if a directory is + * not present in the GZ's /dev, then it doesn't exist in any of the + * local zones. + */ + ret = vn_openat("/dev", UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir, -1); + if (ret != 0) { + *errp = ret; + kmem_cache_free(sdev_plugin_cache, spp); + return (NULL); + } + /* Make sure we have the real vnode */ + if (VOP_REALVP(vp, &nvp, NULL) == 0) { + VN_HOLD(nvp); + VN_RELE(vp); + vp = nvp; + nvp = NULL; + } + VERIFY(vp->v_op == sdev_vnodeops); + sdp = VTOSDEV(vp); + rw_enter(&sdp->sdev_contents, RW_WRITER); + slp = sdev_cache_lookup(sdp, spp->sp_name); + if (slp != NULL) { + SDEV_RELE(slp); + rw_exit(&sdp->sdev_contents); + VN_RELE(vp); + *errp = EEXIST; + kmem_cache_free(sdev_plugin_cache, spp); + return (NULL); + } + + mutex_enter(&sdev_plugin_lock); + for (iter = list_head(&sdev_plugin_list); iter != NULL; + iter = list_next(&sdev_plugin_list, iter)) { + if (strcmp(spp->sp_name, iter->sp_name) == 0) { + mutex_exit(&sdev_plugin_lock); + rw_exit(&sdp->sdev_contents); + VN_RELE(vp); + *errp = EEXIST; + kmem_cache_free(sdev_plugin_cache, spp); + return (NULL); + } + } + + list_insert_tail(&sdev_plugin_list, spp); + mutex_exit(&sdev_plugin_lock); + + /* + * Now go ahead and create the top level directory for the global zone. + */ + vap = *sdev_getdefault_attr(VDIR); + gethrestime(&now); + vap.va_atime = now; + vap.va_mtime = now; + vap.va_ctime = now; + + (void) sdev_plugin_mknode(spp, sdp, spp->sp_name, &vap); + + rw_exit(&sdp->sdev_contents); + VN_RELE(vp); + + return ((sdev_plugin_hdl_t)spp); +} + +static void +sdev_plugin_unregister_cb(sdev_node_t *rdp, void *arg) +{ + sdev_plugin_t *spp = arg; + sdev_node_t *sdp; + + rw_enter(&rdp->sdev_contents, RW_WRITER); + sdp = sdev_cache_lookup(rdp, spp->sp_name); + /* If it doesn't exist, we're done here */ + if (sdp == NULL) { + rw_exit(&rdp->sdev_contents); + return; + } + + /* + * We first delete the directory before recursively marking everything + * else stale. This ordering should ensure that we don't accidentally + * miss anything. + */ + sdev_cache_update(rdp, &sdp, spp->sp_name, SDEV_CACHE_DELETE); + sdev_stale(sdp); + SDEV_RELE(sdp); + rw_exit(&rdp->sdev_contents); +} + +/* + * Remove a plugin. This will block until everything has become a zombie, thus + * guaranteeing the caller that nothing will call into them again once this call + * returns. 
While the call is ongoing, it could be called into. Note that while + * this is ongoing, it will block other mounts. + */ +int +sdev_plugin_unregister(sdev_plugin_hdl_t hdl) +{ + sdev_plugin_t *spp = (sdev_plugin_t *)hdl; + if (spp->sp_islegacy) + return (EINVAL); + + mutex_enter(&sdev_plugin_lock); + list_remove(&sdev_plugin_list, spp); + mutex_exit(&sdev_plugin_lock); + + sdev_mnt_walk(sdev_plugin_unregister_cb, spp); + mutex_enter(&spp->sp_lock); + while (spp->sp_nnodes > 0) + cv_wait(&spp->sp_nodecv, &spp->sp_lock); + mutex_exit(&spp->sp_lock); + kmem_cache_free(sdev_plugin_cache, spp); + return (0); +} + +/* + * Register an old sdev style plugin to deal with what used to be in the vtab. + */ +static int +sdev_plugin_register_legacy(struct sdev_vop_table *vtp) +{ + sdev_plugin_t *spp; + + spp = kmem_cache_alloc(sdev_plugin_cache, KM_SLEEP); + (void) strlcpy(spp->sp_name, vtp->vt_name, SDEV_PLUGIN_NAMELEN); + spp->sp_islegacy = B_TRUE; + spp->sp_pops = NULL; + spp->sp_nflags = vtp->vt_flags; + spp->sp_lvtor = vtp->vt_vtor; + spp->sp_nnodes = 0; + + if (vtp->vt_service != NULL) { + fs_operation_def_t *templ; + templ = sdev_merge_vtab(vtp->vt_service); + if (vn_make_ops(vtp->vt_name, + (const fs_operation_def_t *)templ, + &spp->sp_vnops) != 0) { + cmn_err(CE_WARN, "%s: malformed vnode ops\n", + vtp->vt_name); + sdev_free_vtab(templ); + kmem_cache_free(sdev_plugin_cache, spp); + return (1); + } + + if (vtp->vt_global_vops) { + *(vtp->vt_global_vops) = spp->sp_vnops; + } + + sdev_free_vtab(templ); + } else { + spp->sp_vnops = sdev_vnodeops; + } + + /* + * No need to check for EEXIST here. These are loaded as a part of the + * sdev's initialization function. Further, we don't have to create them + * as that's taken care of in sdev's mount for the GZ. + */ + mutex_enter(&sdev_plugin_lock); + list_insert_tail(&sdev_plugin_list, spp); + mutex_exit(&sdev_plugin_lock); + + return (0); +} + +/* + * We need to match off of the sdev_path, not the sdev_name. We are only allowed + * to exist directly under /dev. 
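The rule stated here -- a plugin owns exactly its directory under /dev, and with SDEV_SUBDIR also everything beneath it -- is what sdev_match() below implements against sdev_path. A small userland approximation of that test (illustrative names; the kernel additionally takes sdev_plugin_lock and walks the registered plugin list, and its prefix compare uses vlen - 1 bytes where this sketch compares the whole name):

	#include <stdio.h>
	#include <string.h>

	/*
	 * Rough model of the sdev_match() test: given a node's full /dev
	 * path and one plugin's name, decide whether the plugin claims it.
	 * "subdir" models the SDEV_SUBDIR flag.
	 */
	static int
	plugin_claims(const char *devpath, const char *name, int subdir)
	{
		size_t vlen = strlen(name);
		const char *path;

		if (strlen(devpath) <= 5 || strncmp(devpath, "/dev/", 5) != 0)
			return (0);
		path = devpath + 5;

		if (strcmp(name, path) == 0)
			return (1);		/* the directory itself */

		if (subdir && strncmp(name, path, vlen) == 0 &&
		    path[vlen] == '/')
			return (1);		/* something beneath it */

		return (0);
	}

	int
	main(void)
	{
		printf("%d\n", plugin_claims("/dev/zvol", "zvol", 1));		/* 1 */
		printf("%d\n", plugin_claims("/dev/zvol/dsk/p", "zvol", 1));	/* 1 */
		printf("%d\n", plugin_claims("/dev/zvolx", "zvol", 1));		/* 0 */
		printf("%d\n", plugin_claims("/dev/ipnet/x0", "ipnet", 0));	/* 0 */
		return (0);
	}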
+ */ +static sdev_plugin_t * +sdev_match(sdev_node_t *dv) +{ + int vlen; + const char *path; + sdev_plugin_t *spp; + + if (strlen(dv->sdev_path) <= 5) + return (NULL); + + if (strncmp(dv->sdev_path, "/dev/", 5) != 0) + return (NULL); + path = dv->sdev_path + 5; + + mutex_enter(&sdev_plugin_lock); + + for (spp = list_head(&sdev_plugin_list); spp != NULL; + spp = list_next(&sdev_plugin_list, spp)) { + if (strcmp(spp->sp_name, path) == 0) { + mutex_exit(&sdev_plugin_lock); + return (spp); + } + + if (spp->sp_nflags & SDEV_SUBDIR) { + vlen = strlen(spp->sp_name); + if ((strncmp(spp->sp_name, path, + vlen - 1) == 0) && path[vlen] == '/') { + mutex_exit(&sdev_plugin_lock); + return (spp); + } + + } + } + + mutex_exit(&sdev_plugin_lock); + return (NULL); +} + +void +sdev_set_no_negcache(sdev_node_t *dv) +{ + char *path; + sdev_plugin_t *spp; + + ASSERT(dv->sdev_path); + path = dv->sdev_path + strlen("/dev/"); + + mutex_enter(&sdev_plugin_lock); + for (spp = list_head(&sdev_plugin_list); spp != NULL; + spp = list_next(&sdev_plugin_list, spp)) { + if (strcmp(spp->sp_name, path) == 0) { + if (spp->sp_nflags & SDEV_NO_NCACHE) + dv->sdev_flags |= SDEV_NO_NCACHE; + break; + } + } + mutex_exit(&sdev_plugin_lock); +} + +struct vnodeops * +sdev_get_vop(sdev_node_t *dv) +{ + char *path; + sdev_plugin_t *spp; + + path = dv->sdev_path; + ASSERT(path); + + /* gets the relative path to /dev/ */ + path += 5; + + if ((spp = sdev_match(dv)) != NULL) { + dv->sdev_flags |= spp->sp_nflags; + if (SDEV_IS_PERSIST(dv->sdev_dotdot) && + (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv))) + dv->sdev_flags |= SDEV_PERSIST; + return (spp->sp_vnops); + } + + /* child inherits the persistence of the parent */ + if (SDEV_IS_PERSIST(dv->sdev_dotdot)) + dv->sdev_flags |= SDEV_PERSIST; + return (sdev_vnodeops); +} + +void * +sdev_get_vtor(sdev_node_t *dv) +{ + sdev_plugin_t *spp; + + if (dv->sdev_private == NULL) { + spp = sdev_match(dv); + if (spp == NULL) + return (NULL); + } else { + spp = dv->sdev_private; + } + + if (spp->sp_islegacy) + return ((void *)spp->sp_lvtor); + else + return ((void *)sdev_plugin_validate); +} + +void +sdev_plugin_nodeready(sdev_node_t *sdp) +{ + sdev_plugin_t *spp; + + ASSERT(RW_WRITE_HELD(&sdp->sdev_contents)); + ASSERT(sdp->sdev_private == NULL); + + spp = sdev_match(sdp); + if (spp == NULL) + return; + if (spp->sp_islegacy) + return; + sdp->sdev_private = spp; + mutex_enter(&spp->sp_lock); + spp->sp_nnodes++; + mutex_exit(&spp->sp_lock); +} + +int +sdev_plugin_init(void) +{ + sdev_vop_table_t *vtp; + fs_operation_def_t *templ; + + sdev_plugin_cache = kmem_cache_create("sdev_plugin", + sizeof (sdev_plugin_t), 0, sdev_plugin_cache_constructor, + sdev_plugin_cache_destructor, NULL, NULL, NULL, 0); + if (sdev_plugin_cache == NULL) + return (1); + mutex_init(&sdev_plugin_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&sdev_plugin_list, sizeof (sdev_plugin_t), + offsetof(sdev_plugin_t, sp_link)); + + /* + * Register all of the legacy vnops + */ + for (vtp = &vtab[0]; vtp->vt_name != NULL; vtp++) + if (sdev_plugin_register_legacy(vtp) != 0) + return (1); + + templ = sdev_merge_vtab(sdev_plugin_vnodeops_tbl); + if (vn_make_ops("sdev_plugin", + (const fs_operation_def_t *)templ, + &sdev_plugin_vnops) != 0) { + sdev_free_vtab(templ); + return (1); + } + + sdev_free_vtab(templ); + return (0); +} diff --git a/usr/src/uts/common/fs/dev/sdev_subr.c b/usr/src/uts/common/fs/dev/sdev_subr.c index b4b27e6285..16439a66a6 100644 --- a/usr/src/uts/common/fs/dev/sdev_subr.c +++ b/usr/src/uts/common/fs/dev/sdev_subr.c @@ 
-20,7 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ /* @@ -150,12 +150,6 @@ vattr_t sdev_vattr_chr = { kmem_cache_t *sdev_node_cache; /* sdev_node cache */ int devtype; /* fstype */ -/* static */ -static struct vnodeops *sdev_get_vop(struct sdev_node *); -static void sdev_set_no_negcache(struct sdev_node *); -static fs_operation_def_t *sdev_merge_vtab(const fs_operation_def_t []); -static void sdev_free_vtab(fs_operation_def_t *); - static void sdev_prof_free(struct sdev_node *dv) { @@ -318,6 +312,7 @@ sdev_nodeinit(struct sdev_node *ddv, char *nm, struct sdev_node **newdv, (void) snprintf(dv->sdev_path, len, "%s/%s", ddv->sdev_path, nm); /* overwritten for VLNK nodes */ dv->sdev_symlink = NULL; + list_link_init(&dv->sdev_plist); vp = SDEVTOV(dv); vn_reinit(vp); @@ -406,6 +401,7 @@ sdev_nodeready(struct sdev_node *dv, struct vattr *vap, struct vnode *avp, } else { dv->sdev_nlink = 1; } + sdev_plugin_nodeready(dv); if (!(SDEV_IS_GLOBAL(dv))) { dv->sdev_origin = (struct sdev_node *)args; @@ -502,37 +498,22 @@ sdev_mkroot(struct vfs *vfsp, dev_t devdev, struct vnode *mvp, return (dv); } -/* directory dependent vop table */ -struct sdev_vop_table { - char *vt_name; /* subdirectory name */ - const fs_operation_def_t *vt_service; /* vnodeops table */ - struct vnodeops *vt_vops; /* constructed vop */ - struct vnodeops **vt_global_vops; /* global container for vop */ - int (*vt_vtor)(struct sdev_node *); /* validate sdev_node */ - int vt_flags; -}; - -/* - * A nice improvement would be to provide a plug-in mechanism - * for this table instead of a const table. - */ -static struct sdev_vop_table vtab[] = -{ - { "pts", devpts_vnodeops_tbl, NULL, &devpts_vnodeops, devpts_validate, +struct sdev_vop_table vtab[] = { + { "pts", devpts_vnodeops_tbl, &devpts_vnodeops, devpts_validate, SDEV_DYNAMIC | SDEV_VTOR }, - { "vt", devvt_vnodeops_tbl, NULL, &devvt_vnodeops, devvt_validate, + { "vt", devvt_vnodeops_tbl, &devvt_vnodeops, devvt_validate, SDEV_DYNAMIC | SDEV_VTOR }, - { "zvol", devzvol_vnodeops_tbl, NULL, &devzvol_vnodeops, + { "zvol", devzvol_vnodeops_tbl, &devzvol_vnodeops, devzvol_validate, SDEV_ZONED | SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR }, - { "zcons", NULL, NULL, NULL, NULL, SDEV_NO_NCACHE }, + { "zcons", NULL, NULL, NULL, SDEV_NO_NCACHE }, - { "net", devnet_vnodeops_tbl, NULL, &devnet_vnodeops, devnet_validate, - SDEV_DYNAMIC | SDEV_VTOR }, + { "net", devnet_vnodeops_tbl, &devnet_vnodeops, devnet_validate, + SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR }, - { "ipnet", devipnet_vnodeops_tbl, NULL, &devipnet_vnodeops, + { "ipnet", devipnet_vnodeops_tbl, &devipnet_vnodeops, devipnet_validate, SDEV_DYNAMIC | SDEV_VTOR | SDEV_NO_NCACHE }, /* @@ -547,132 +528,14 @@ static struct sdev_vop_table vtab[] = * preventing a mkdir. */ - { "lofi", NULL, NULL, NULL, NULL, + { "lofi", NULL, NULL, NULL, SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST }, - { "rlofi", NULL, NULL, NULL, NULL, + { "rlofi", NULL, NULL, NULL, SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST }, - { NULL, NULL, NULL, NULL, NULL, 0} + { NULL, NULL, NULL, NULL, 0} }; -/* - * We need to match off of the sdev_path, not the sdev_name. We are only allowed - * to exist directly under /dev. 
- */ -struct sdev_vop_table * -sdev_match(struct sdev_node *dv) -{ - int vlen; - int i; - const char *path; - - if (strlen(dv->sdev_path) <= 5) - return (NULL); - - if (strncmp(dv->sdev_path, "/dev/", 5) != 0) - return (NULL); - path = dv->sdev_path + 5; - - for (i = 0; vtab[i].vt_name; i++) { - if (strcmp(vtab[i].vt_name, path) == 0) - return (&vtab[i]); - if (vtab[i].vt_flags & SDEV_SUBDIR) { - vlen = strlen(vtab[i].vt_name); - if ((strncmp(vtab[i].vt_name, path, - vlen - 1) == 0) && path[vlen] == '/') - return (&vtab[i]); - } - - } - return (NULL); -} - -/* - * sets a directory's vnodeops if the directory is in the vtab; - */ -static struct vnodeops * -sdev_get_vop(struct sdev_node *dv) -{ - struct sdev_vop_table *vtp; - char *path; - - path = dv->sdev_path; - ASSERT(path); - - /* gets the relative path to /dev/ */ - path += 5; - - /* gets the vtab entry it matches */ - if ((vtp = sdev_match(dv)) != NULL) { - dv->sdev_flags |= vtp->vt_flags; - if (SDEV_IS_PERSIST(dv->sdev_dotdot) && - (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv))) - dv->sdev_flags |= SDEV_PERSIST; - - if (vtp->vt_vops) { - if (vtp->vt_global_vops) - *(vtp->vt_global_vops) = vtp->vt_vops; - - return (vtp->vt_vops); - } - - if (vtp->vt_service) { - fs_operation_def_t *templ; - templ = sdev_merge_vtab(vtp->vt_service); - if (vn_make_ops(vtp->vt_name, - (const fs_operation_def_t *)templ, - &vtp->vt_vops) != 0) { - cmn_err(CE_PANIC, "%s: malformed vnode ops\n", - vtp->vt_name); - /*NOTREACHED*/ - } - if (vtp->vt_global_vops) { - *(vtp->vt_global_vops) = vtp->vt_vops; - } - sdev_free_vtab(templ); - - return (vtp->vt_vops); - } - - return (sdev_vnodeops); - } - - /* child inherits the persistence of the parent */ - if (SDEV_IS_PERSIST(dv->sdev_dotdot)) - dv->sdev_flags |= SDEV_PERSIST; - - return (sdev_vnodeops); -} - -static void -sdev_set_no_negcache(struct sdev_node *dv) -{ - int i; - char *path; - - ASSERT(dv->sdev_path); - path = dv->sdev_path + strlen("/dev/"); - - for (i = 0; vtab[i].vt_name; i++) { - if (strcmp(vtab[i].vt_name, path) == 0) { - if (vtab[i].vt_flags & SDEV_NO_NCACHE) - dv->sdev_flags |= SDEV_NO_NCACHE; - break; - } - } -} - -void * -sdev_get_vtor(struct sdev_node *dv) -{ - struct sdev_vop_table *vtp; - - vtp = sdev_match(dv); - if (vtp) - return ((void *)vtp->vt_vtor); - else - return (NULL); -} /* * Build the base root inode @@ -952,8 +815,11 @@ sdev_nodedestroy(struct sdev_node *dv, uint_t flags) dv->sdev_path = NULL; } - if (!SDEV_IS_GLOBAL(dv)) + if (!SDEV_IS_GLOBAL(dv)) { sdev_prof_free(dv); + if (dv->sdev_vnode->v_type != VLNK && dv->sdev_origin != NULL) + SDEV_RELE(dv->sdev_origin); + } if (SDEVTOV(dv)->v_type == VDIR) { ASSERT(SDEV_FIRST_ENTRY(dv) == NULL); @@ -967,6 +833,7 @@ sdev_nodedestroy(struct sdev_node *dv, uint_t flags) (void) memset((void *)&dv->sdev_instance_data, 0, sizeof (dv->sdev_instance_data)); vn_invalid(SDEVTOV(dv)); + dv->sdev_private = NULL; kmem_cache_free(sdev_node_cache, dv); } @@ -2948,46 +2815,6 @@ sdev_modctl_devexists(const char *path) return (error); } -extern int sdev_vnodeops_tbl_size; - -/* - * construct a new template with overrides from vtab - */ -static fs_operation_def_t * -sdev_merge_vtab(const fs_operation_def_t tab[]) -{ - fs_operation_def_t *new; - const fs_operation_def_t *tab_entry; - - /* make a copy of standard vnode ops table */ - new = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP); - bcopy((void *)sdev_vnodeops_tbl, new, sdev_vnodeops_tbl_size); - - /* replace the overrides from tab */ - for (tab_entry = tab; tab_entry->name != NULL; tab_entry++) { - 
fs_operation_def_t *std_entry = new; - while (std_entry->name) { - if (strcmp(tab_entry->name, std_entry->name) == 0) { - std_entry->func = tab_entry->func; - break; - } - std_entry++; - } - if (std_entry->name == NULL) - cmn_err(CE_NOTE, "sdev_merge_vtab: entry %s unused.", - tab_entry->name); - } - - return (new); -} - -/* free memory allocated by sdev_merge_vtab */ -static void -sdev_free_vtab(fs_operation_def_t *new) -{ - kmem_free(new, sdev_vnodeops_tbl_size); -} - /* * a generic setattr() function * diff --git a/usr/src/uts/common/fs/dev/sdev_vfsops.c b/usr/src/uts/common/fs/dev/sdev_vfsops.c index ea9cb6374a..6f32f47635 100644 --- a/usr/src/uts/common/fs/dev/sdev_vfsops.c +++ b/usr/src/uts/common/fs/dev/sdev_vfsops.c @@ -169,7 +169,13 @@ devinit(int fstype, char *name) if ((devmajor = getudev()) == (major_t)-1) { cmn_err(CE_WARN, "%s: can't get unique dev", sdev_vfssw.name); - return (1); + return (ENXIO); + } + + if (sdev_plugin_init() != 0) { + cmn_err(CE_WARN, "%s: failed to set init plugin subsystem", + sdev_vfssw.name); + return (EIO); } /* initialize negative cache */ @@ -332,6 +338,7 @@ sdev_mount(struct vfs *vfsp, struct vnode *mvp, struct mounta *uap, ASSERT(sdev_origins); dv->sdev_flags &= ~SDEV_GLOBAL; dv->sdev_origin = sdev_origins->sdev_root; + SDEV_HOLD(dv->sdev_origin); } else { sdev_ncache_setup(); rw_enter(&dv->sdev_contents, RW_WRITER); @@ -504,3 +511,17 @@ sdev_mntinfo_rele(struct sdev_data *mntinfo) SDEVTOV(mntinfo->sdev_root)->v_count--; mutex_exit(&sdev_lock); } + +void +sdev_mnt_walk(void (*func)(struct sdev_node *, void *), void *arg) +{ + struct sdev_data *mntinfo; + + mutex_enter(&sdev_lock); + mntinfo = sdev_mntinfo; + while (mntinfo != NULL) { + func(mntinfo->sdev_root, arg); + mntinfo = mntinfo->sdev_next; + } + mutex_exit(&sdev_lock); +} diff --git a/usr/src/uts/common/fs/dev/sdev_zvolops.c b/usr/src/uts/common/fs/dev/sdev_zvolops.c index e4c3acf787..495cf450de 100644 --- a/usr/src/uts/common/fs/dev/sdev_zvolops.c +++ b/usr/src/uts/common/fs/dev/sdev_zvolops.c @@ -21,7 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2013 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. All rights reserved. */ /* vnode ops for the /dev/zvol directory */ @@ -370,8 +370,10 @@ devzvol_create_pool_dirs(struct vnode *dvp) ASSERT(dvp->v_count > 0); rc = VOP_LOOKUP(dvp, nvpair_name(elem), &vp, NULL, 0, NULL, kcred, NULL, 0, NULL); - /* should either work, or not be visible from a zone */ - ASSERT(rc == 0 || rc == ENOENT); + /* + * should either work or we should get an error if this should + * not be visible from the zone, or disallowed in the zone + */ if (rc == 0) VN_RELE(vp); pools++; diff --git a/usr/src/uts/common/fs/dnlc.c b/usr/src/uts/common/fs/dnlc.c index 25327d2852..c949117da6 100644 --- a/usr/src/uts/common/fs/dnlc.c +++ b/usr/src/uts/common/fs/dnlc.c @@ -921,50 +921,6 @@ dnlc_fs_purge1(vnodeops_t *vop) } /* - * Perform a reverse lookup in the DNLC. This will find the first occurrence of - * the vnode. If successful, it will return the vnode of the parent, and the - * name of the entry in the given buffer. If it cannot be found, or the buffer - * is too small, then it will return NULL. Note that this is a highly - * inefficient function, since the DNLC is constructed solely for forward - * lookups. 
- */ -vnode_t * -dnlc_reverse_lookup(vnode_t *vp, char *buf, size_t buflen) -{ - nc_hash_t *nch; - ncache_t *ncp; - vnode_t *pvp; - - if (!doingcache) - return (NULL); - - for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) { - mutex_enter(&nch->hash_lock); - ncp = nch->hash_next; - while (ncp != (ncache_t *)nch) { - /* - * We ignore '..' entries since it can create - * confusion and infinite loops. - */ - if (ncp->vp == vp && !(ncp->namlen == 2 && - 0 == bcmp(ncp->name, "..", 2)) && - ncp->namlen < buflen) { - bcopy(ncp->name, buf, ncp->namlen); - buf[ncp->namlen] = '\0'; - pvp = ncp->dp; - /* VN_HOLD 2 of 2 in this file */ - VN_HOLD_CALLER(pvp); - mutex_exit(&nch->hash_lock); - return (pvp); - } - ncp = ncp->hash_next; - } - mutex_exit(&nch->hash_lock); - } - - return (NULL); -} -/* * Utility routine to search for a cache entry. Return the * ncache entry if found, NULL otherwise. */ diff --git a/usr/src/uts/common/fs/fem.c b/usr/src/uts/common/fs/fem.c index b4e28cc860..5f524def30 100644 --- a/usr/src/uts/common/fs/fem.c +++ b/usr/src/uts/common/fs/fem.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2015, Joyent, Inc. All rights reserved. + */ + #include <sys/types.h> #include <sys/atomic.h> #include <sys/kmem.h> @@ -33,11 +37,12 @@ #include <sys/systm.h> #include <sys/cmn_err.h> #include <sys/debug.h> - #include <sys/fem.h> #include <sys/vfs.h> #include <sys/vnode.h> #include <sys/vfs_opreg.h> +#include <sys/stack.h> +#include <sys/archsystm.h> #define NNODES_DEFAULT 8 /* Default number of nodes in a fem_list */ /* @@ -291,6 +296,536 @@ _op_find(femarg_t *ap, void **fp, int offs0, int offs1) } #endif +/* + * File event monitoring handoffs + * + * File event monitoring relies on being able to inject stack frames between + * vnode consumers and the underlying file systems. This becomes problematic + * when there exist many monitors, as kernel stack depth is finite. The model + * very much encodes this injected frame: the flow of control deliberately + * lies with the monitor, not with the monitoring system. While we could + * conceivably address this by allowing each subsystem to install at most + * one monitor per vnode (and impose on subsystems that they handle any + * of their own consumer multiplexing internally), this in fact exports a + * substantial amount of run-time complexity to deal with an uncommon case + * (and, it must be said, assumes a small number of consuming subsystems). + * To allow our abstraction to remain clean, we instead check our remaining + * stack in every vnext_*() call; if the amount of stack remaining is lower + * than a threshold (fem_stack_needed), we call thread_splitstack() to carry + * on the execution of the monitors and the underlying vnode operation on a + * split stack. Because we can only pass a single argument to our split stack + * function, we must marshal our arguments, the mechanics of which are somewhat + * ornate in terms of the code: to marshal in a type-safe manner, we define a + * baton that is a union of payload structures for each kind of operation, + * loading the per-operation payload explicitly and calling into common handoff + * code that itself calls thread_splitstack(). The function passed to + * thread_splitstack() is a per-entry point function that continues monitor + * processing given the specified (marshalled) arguments. 
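Stripped of the macros, the baton idea is just: marshal the arguments into a per-operation struct, pass a single pointer to a continuation, and read the return value back out of the struct. A toy userland rendition of that shape (the kernel hands the baton to thread_splitstack() so the continuation runs on a fresh stack; here the continuation is simply invoked directly):

	#include <stdio.h>

	/* Toy model of the fem baton: one payload struct per operation. */
	typedef struct {
		struct {
			const char *path;
			int	mode;
		} fb_open;
		int	(*fb_func)(const char *, int);
		void	(*fb_handoff)(void *);
		int	fb_rval;
	} baton_t;

	/* The FEM_VNEXTn_DECL-style continuation: unpack the baton and call. */
	static void
	handoff_open(void *arg)
	{
		baton_t *bp = arg;

		bp->fb_rval = bp->fb_func(bp->fb_open.path, bp->fb_open.mode);
	}

	static int
	real_open(const char *path, int mode)
	{
		printf("open(%s, %d)\n", path, mode);
		return (0);
	}

	/*
	 * The FEM_VNEXTn-style call site marshals the arguments and runs the
	 * continuation through one indirect call; the kernel does this only
	 * when FEM_TOODEEP() fires, and via thread_splitstack().
	 */
	int
	main(void)
	{
		baton_t b = { .fb_open = { "/tmp/x", 2 }, .fb_func = real_open,
		    .fb_handoff = handoff_open };

		b.fb_handoff(&b);
		return (b.fb_rval);
	}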
While this method + * is a little verbose to implement, it has the advantage of being relatively + * robust (that is, broadly type-safe) while imposing minimal burden on each + * vnext_*() entry point. + * + * In terms of the implementation: + * + * - The FEM_BATON_n macros define the per-entry point baton structures + * - The fem_baton_payload_t contains the union of these structures + * - The FEM_VNEXTn_DECL macros declare the post-handoff entry point + * - The FEM_VNEXTn macros constitute the per-handoff entry point + * + * Note that we don't use variadic macros -- we define a variant of these + * macros for each of our relevant argument counts. This may seem overly + * explicit, but it is deliberate: the object here is to minimize the + * future maintenance burden by minimizing the likelihood of introduced + * error -- not to minimize the number of characters in this source file. + */ + +#ifndef STACK_GROWTH_DOWN +#error Downward stack growth assumed. +#endif + +int fem_stack_toodeep; +uintptr_t fem_stack_needed = 8 * 1024; +size_t fem_handoff_stacksize = 128 * 1024; + +#define FEM_TOODEEP() (STACK_BIAS + (uintptr_t)getfp() - \ + (uintptr_t)curthread->t_stkbase < fem_stack_needed) + +#define FEM_BATON_1(what, t0, l0) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + } fb_##what + +#define FEM_BATON_2(what, t0, l0, t1, l1) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + } fb_##what + +#define FEM_BATON_3(what, t0, l0, t1, l1, t2, l2) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + } fb_##what + +#define FEM_BATON_4(what, t0, l0, t1, l1, t2, l2, t3, l3) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + } fb_##what + +#define FEM_BATON_5(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + } fb_##what + +#define FEM_BATON_6(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + t5 fb_##what##_##l5; \ + } fb_##what + +#define FEM_BATON_8(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5, \ + t6, l6, t7, l7) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + t5 fb_##what##_##l5; \ + t6 fb_##what##_##l6; \ + t7 fb_##what##_##l7; \ + } fb_##what + +#define FEM_BATON_9(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5, \ + t6, l6, t7, l7, t8, l8) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + t5 fb_##what##_##l5; \ + t6 fb_##what##_##l6; \ + t7 fb_##what##_##l7; \ + t8 fb_##what##_##l8; \ + } fb_##what + +typedef union { + FEM_BATON_2(open, int, mode, cred_t *, cr); + FEM_BATON_4(close, int, flag, 
int, count, + offset_t, offset, cred_t *, cr); + FEM_BATON_3(read, uio_t *, uiop, int, ioflag, cred_t *, cr); + FEM_BATON_3(write, uio_t *, uiop, int, ioflag, cred_t *, cr); + FEM_BATON_5(ioctl, int, cmd, intptr_t, arg, + int, flag, cred_t *, cr, int *, rvalp); + FEM_BATON_3(setfl, int, oflags, int, nflags, cred_t *, cr); + FEM_BATON_3(getattr, vattr_t *, vap, int, flags, cred_t *, cr); + FEM_BATON_3(setattr, vattr_t *, vap, int, flags, cred_t *, cr); + FEM_BATON_3(access, int, mode, int, flags, cred_t *, cr); + FEM_BATON_8(lookup, char *, nm, vnode_t **, vpp, + pathname_t *, pnp, int, flags, vnode_t *, rdir, + cred_t *, cr, int *, direntflags, pathname_t *, realpnp); + FEM_BATON_8(create, char *, name, vattr_t *, vap, + vcexcl_t, excl, int, mode, vnode_t **, vpp, + cred_t *, cr, int, flag, vsecattr_t *, vsecp); + FEM_BATON_3(remove, char *, nm, cred_t *, cr, int, flags); + FEM_BATON_4(link, vnode_t *, svp, char *, tnm, + cred_t *, cr, int, flags); + FEM_BATON_5(rename, char *, snm, vnode_t *, tdvp, + char *, tnm, cred_t *, cr, int, flags); + FEM_BATON_6(mkdir, char *, dirname, vattr_t *, vap, + vnode_t **, vpp, cred_t *, cr, int, flags, + vsecattr_t *, vsecp); + FEM_BATON_4(rmdir, char *, nm, vnode_t *, cdir, + cred_t *, cr, int, flags); + FEM_BATON_4(readdir, uio_t *, uiop, cred_t *, cr, + int *, eofp, int, flags); + FEM_BATON_5(symlink, char *, linkname, vattr_t *, vap, + char *, target, cred_t *, cr, int, flags); + FEM_BATON_2(readlink, uio_t *, uiop, cred_t *, cr); + FEM_BATON_2(fsync, int, syncflag, cred_t *, cr); + FEM_BATON_1(inactive, cred_t *, cr); + FEM_BATON_1(fid, fid_t *, fidp); + FEM_BATON_1(rwlock, int, write_lock); + FEM_BATON_1(rwunlock, int, write_lock); + FEM_BATON_2(seek, offset_t, ooff, offset_t *, noffp); + FEM_BATON_1(cmp, vnode_t *, vp2); + FEM_BATON_6(frlock, int, cmd, struct flock64 *, bfp, + int, flag, offset_t, offset, struct flk_callback *, flk_cbp, + cred_t *, cr); + FEM_BATON_5(space, int, cmd, struct flock64 *, bfp, + int, flag, offset_t, offset, cred_t *, cr); + FEM_BATON_1(realvp, vnode_t **, vpp); + FEM_BATON_9(getpage, offset_t, off, size_t, len, + uint_t *, protp, struct page **, plarr, size_t, plsz, + struct seg *, seg, caddr_t, addr, enum seg_rw, rw, + cred_t *, cr); + FEM_BATON_4(putpage, offset_t, off, size_t, len, + int, flags, cred_t *, cr); + FEM_BATON_8(map, offset_t, off, struct as *, as, + caddr_t *, addrp, size_t, len, uchar_t, prot, + uchar_t, maxprot, uint_t, flags, cred_t *, cr); + FEM_BATON_8(addmap, offset_t, off, struct as *, as, + caddr_t, addr, size_t, len, uchar_t, prot, + uchar_t, maxprot, uint_t, flags, cred_t *, cr); + FEM_BATON_8(delmap, offset_t, off, struct as *, as, + caddr_t, addr, size_t, len, uint_t, prot, + uint_t, maxprot, uint_t, flags, cred_t *, cr); + FEM_BATON_4(poll, short, events, int, anyyet, + short *, reventsp, struct pollhead **, phpp); + FEM_BATON_3(dump, caddr_t, addr, offset_t, lbdn, offset_t, dblks); + FEM_BATON_3(pathconf, int, cmd, ulong_t *, valp, cred_t *, cr); + FEM_BATON_5(pageio, struct page *, pp, u_offset_t, io_off, + size_t, io_len, int, flags, cred_t *, cr); + FEM_BATON_2(dumpctl, int, action, offset_t *, blkp); + FEM_BATON_4(dispose, struct page *, pp, int, flag, + int, dn, cred_t *, cr); + FEM_BATON_3(setsecattr, vsecattr_t *, vsap, int, flag, cred_t *, cr); + FEM_BATON_3(getsecattr, vsecattr_t *, vsap, int, flag, cred_t *, cr); + FEM_BATON_4(shrlock, int, cmd, struct shrlock *, shr, + int, flag, cred_t *, cr); + FEM_BATON_3(vnevent, vnevent_t, vnevent, vnode_t *, dvp, char *, cname); + 
FEM_BATON_3(reqzcbuf, enum uio_rw, ioflag, + xuio_t *, xuiop, cred_t *, cr); + FEM_BATON_2(retzcbuf, xuio_t *, xuiop, cred_t *, cr); +} fem_baton_payload_t; + +typedef struct { + fem_baton_payload_t fb_payload; + int (*fb_func)(); + void (*fb_handoff)(); + int fb_rval; +} fem_baton_t; + +static int +fem_handoff(fem_baton_t *bp) +{ + fem_stack_toodeep++; + thread_splitstack(bp->fb_handoff, bp, fem_handoff_stacksize); + + return (bp->fb_rval); +} + +#define FEM_VNEXT3_DECL(what, a0, a1, a2) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2); \ +} + +#define FEM_VNEXT4_DECL(what, a0, a1, a2, a3) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3); \ +} + +#define FEM_VNEXT5_DECL(what, a0, a1, a2, a3, a4) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4); \ +} + +#define FEM_VNEXT6_DECL(what, a0, a1, a2, a3, a4, a5) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5); \ +} + +#define FEM_VNEXT7_DECL(what, a0, a1, a2, a3, a4, a5, a6) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6); \ +} + +#define FEM_VNEXT8_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6, \ + bp->fb_payload.fb_##what.fb_##what##_##a7); \ +} + +#define FEM_VNEXT10_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6, \ + bp->fb_payload.fb_##what.fb_##what##_##a7, \ + bp->fb_payload.fb_##what.fb_##what##_##a8, \ + bp->fb_payload.fb_##what.fb_##what##_##a9); \ +} + +#define FEM_VNEXT11_DECL(what, a0, a1, a2, a3, a4, a5, 
a6, a7, a8, a9, a10) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6, \ + bp->fb_payload.fb_##what.fb_##what##_##a7, \ + bp->fb_payload.fb_##what.fb_##what##_##a8, \ + bp->fb_payload.fb_##what.fb_##what##_##a9, \ + bp->fb_payload.fb_##what.fb_##what##_##a10); \ +} + +#define FEM_VNEXT3(what, func, a0, a1, a2) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2)) + +#define FEM_VNEXT4(what, func, a0, a1, a2, a3) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3)) + +#define FEM_VNEXT5(what, func, a0, a1, a2, a3, a4) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4)) + +#define FEM_VNEXT6(what, func, a0, a1, a2, a3, a4, a5) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5)) + +#define FEM_VNEXT7(what, func, a0, a1, a2, a3, a4, a5, a6) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + 
baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6)) + +#define FEM_VNEXT8(what, func, a0, a1, a2, a3, a4, a5, a6, a7) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6, a7)) + +#define FEM_VNEXT10(what, func, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \ + baton->fb_payload.fb_##what.fb_##what##_##a8 = a8; \ + baton->fb_payload.fb_##what.fb_##what##_##a9 = a9; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9)) + +#define FEM_VNEXT11(what, func, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \ + baton->fb_payload.fb_##what.fb_##what##_##a8 = a8; \ + baton->fb_payload.fb_##what.fb_##what##_##a9 = a9; \ + baton->fb_payload.fb_##what.fb_##what##_##a10 = a10; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10)) + static fem_t * fem_alloc() { @@ -2036,10 +2571,60 @@ static struct fs_operation_def fshead_vfs_spec[] = { * 5. Return by invoking the base operation with the base object. 
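To make the macro machinery concrete: the FEM_VNEXT4(open, func, arg0, mode, cr, ct) call that vnext_open() below ends with expands to approximately the following (a paraphrase, not literal preprocessor output):

	if (FEM_TOODEEP()) {
		fem_baton_t *baton;
		int rval;

		baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP);
		baton->fb_payload.fb_open.fb_open_arg0 = arg0;
		baton->fb_payload.fb_open.fb_open_mode = mode;
		baton->fb_payload.fb_open.fb_open_cr = cr;
		baton->fb_payload.fb_open.fb_open_ct = ct;
		baton->fb_handoff = fem_handoff_open;
		baton->fb_func = func;

		rval = fem_handoff(baton);	/* runs on a split stack */
		kmem_free(baton, sizeof (fem_baton_t));

		return (rval);
	}
	return (func(arg0, mode, cr, ct));

The common case pays only for the stack-depth test; the allocation and the extra indirection happen solely when the remaining stack is already below fem_stack_needed.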
* * for each classification, there needs to be at least one "next" operation - * for each "head"operation. - * + * for each "head" operation. Note that we also use the FEM_VNEXTn_DECL macros + * to define the function to run when the stack is split; see the discussion + * on "File event monitoring handoffs", above. */ +FEM_VNEXT4_DECL(open, arg0, mode, cr, ct) +FEM_VNEXT6_DECL(close, arg0, flag, count, offset, cr, ct) +FEM_VNEXT5_DECL(read, arg0, uiop, ioflag, cr, ct) +FEM_VNEXT5_DECL(write, arg0, uiop, ioflag, cr, ct) +FEM_VNEXT7_DECL(ioctl, arg0, cmd, arg, flag, cr, rvalp, ct) +FEM_VNEXT5_DECL(setfl, arg0, oflags, nflags, cr, ct) +FEM_VNEXT5_DECL(getattr, arg0, vap, flags, cr, ct) +FEM_VNEXT5_DECL(setattr, arg0, vap, flags, cr, ct) +FEM_VNEXT5_DECL(access, arg0, mode, flags, cr, ct) +FEM_VNEXT10_DECL(lookup, arg0, nm, vpp, pnp, flags, rdir, + cr, ct, direntflags, realpnp) +FEM_VNEXT10_DECL(create, arg0, name, vap, excl, mode, vpp, cr, flag, ct, vsecp) +FEM_VNEXT5_DECL(remove, arg0, nm, cr, ct, flags) +FEM_VNEXT6_DECL(link, arg0, svp, tnm, cr, ct, flags) +FEM_VNEXT7_DECL(rename, arg0, snm, tdvp, tnm, cr, ct, flags) +FEM_VNEXT8_DECL(mkdir, arg0, dirname, vap, vpp, cr, ct, flags, vsecp) +FEM_VNEXT6_DECL(rmdir, arg0, nm, cdir, cr, ct, flags) +FEM_VNEXT6_DECL(readdir, arg0, uiop, cr, eofp, ct, flags) +FEM_VNEXT7_DECL(symlink, arg0, linkname, vap, target, cr, ct, flags) +FEM_VNEXT4_DECL(readlink, arg0, uiop, cr, ct) +FEM_VNEXT4_DECL(fsync, arg0, syncflag, cr, ct) +FEM_VNEXT3_DECL(fid, arg0, fidp, ct) +FEM_VNEXT3_DECL(rwlock, arg0, write_lock, ct) +FEM_VNEXT4_DECL(seek, arg0, ooff, noffp, ct) +FEM_VNEXT3_DECL(cmp, arg0, vp2, ct) +FEM_VNEXT8_DECL(frlock, arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct) +FEM_VNEXT7_DECL(space, arg0, cmd, bfp, flag, offset, cr, ct) +FEM_VNEXT3_DECL(realvp, arg0, vpp, ct) +FEM_VNEXT11_DECL(getpage, arg0, off, len, protp, plarr, plsz, + seg, addr, rw, cr, ct) +FEM_VNEXT6_DECL(putpage, arg0, off, len, flags, cr, ct) +FEM_VNEXT10_DECL(map, arg0, off, as, addrp, len, prot, maxprot, + flags, cr, ct) +FEM_VNEXT10_DECL(addmap, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct) +FEM_VNEXT10_DECL(delmap, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct) +FEM_VNEXT6_DECL(poll, arg0, events, anyyet, reventsp, phpp, ct) +FEM_VNEXT5_DECL(dump, arg0, addr, lbdn, dblks, ct) +FEM_VNEXT5_DECL(pathconf, arg0, cmd, valp, cr, ct) +FEM_VNEXT7_DECL(pageio, arg0, pp, io_off, io_len, flags, cr, ct) +FEM_VNEXT4_DECL(dumpctl, arg0, action, blkp, ct) +FEM_VNEXT5_DECL(setsecattr, arg0, vsap, flag, cr, ct) +FEM_VNEXT5_DECL(getsecattr, arg0, vsap, flag, cr, ct) +FEM_VNEXT6_DECL(shrlock, arg0, cmd, shr, flag, cr, ct) +FEM_VNEXT5_DECL(vnevent, arg0, vnevent, dvp, cname, ct) +FEM_VNEXT5_DECL(reqzcbuf, arg0, ioflag, xuiop, cr, ct) +FEM_VNEXT4_DECL(retzcbuf, arg0, xuiop, cr, ct) + int vnext_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct) { @@ -2051,7 +2636,7 @@ vnext_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_open, femop_open); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, mode, cr, ct)); + FEM_VNEXT4(open, func, arg0, mode, cr, ct); } int @@ -2066,7 +2651,7 @@ vnext_close(femarg_t *vf, int flag, int count, offset_t offset, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_close, femop_close); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, flag, count, offset, cr, ct)); + FEM_VNEXT6(close, func, arg0, flag, count, offset, cr, ct); } int @@ -2081,7 +2666,7 @@ 
vnext_read(femarg_t *vf, uio_t *uiop, int ioflag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_read, femop_read); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, ioflag, cr, ct)); + FEM_VNEXT5(read, func, arg0, uiop, ioflag, cr, ct); } int @@ -2096,7 +2681,7 @@ vnext_write(femarg_t *vf, uio_t *uiop, int ioflag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_write, femop_write); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, ioflag, cr, ct)); + FEM_VNEXT5(write, func, arg0, uiop, ioflag, cr, ct); } int @@ -2111,7 +2696,7 @@ vnext_ioctl(femarg_t *vf, int cmd, intptr_t arg, int flag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_ioctl, femop_ioctl); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, arg, flag, cr, rvalp, ct)); + FEM_VNEXT7(ioctl, func, arg0, cmd, arg, flag, cr, rvalp, ct); } int @@ -2126,7 +2711,7 @@ vnext_setfl(femarg_t *vf, int oflags, int nflags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_setfl, femop_setfl); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, oflags, nflags, cr, ct)); + FEM_VNEXT5(setfl, func, arg0, oflags, nflags, cr, ct); } int @@ -2141,7 +2726,7 @@ vnext_getattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_getattr, femop_getattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vap, flags, cr, ct)); + FEM_VNEXT5(getattr, func, arg0, vap, flags, cr, ct); } int @@ -2156,7 +2741,7 @@ vnext_setattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_setattr, femop_setattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vap, flags, cr, ct)); + FEM_VNEXT5(setattr, func, arg0, vap, flags, cr, ct); } int @@ -2171,7 +2756,7 @@ vnext_access(femarg_t *vf, int mode, int flags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_access, femop_access); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, mode, flags, cr, ct)); + FEM_VNEXT5(access, func, arg0, mode, flags, cr, ct); } int @@ -2187,8 +2772,8 @@ vnext_lookup(femarg_t *vf, char *nm, vnode_t **vpp, pathname_t *pnp, vsop_find(vf, &func, int, &arg0, vop_lookup, femop_lookup); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, nm, vpp, pnp, flags, rdir, cr, ct, - direntflags, realpnp)); + FEM_VNEXT10(lookup, func, arg0, nm, vpp, pnp, flags, rdir, cr, ct, + direntflags, realpnp); } int @@ -2204,7 +2789,8 @@ vnext_create(femarg_t *vf, char *name, vattr_t *vap, vcexcl_t excl, vsop_find(vf, &func, int, &arg0, vop_create, femop_create); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, name, vap, excl, mode, vpp, cr, flag, ct, vsecp)); + FEM_VNEXT10(create, func, arg0, name, vap, excl, + mode, vpp, cr, flag, ct, vsecp); } int @@ -2219,7 +2805,7 @@ vnext_remove(femarg_t *vf, char *nm, cred_t *cr, caller_context_t *ct, vsop_find(vf, &func, int, &arg0, vop_remove, femop_remove); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, nm, cr, ct, flags)); + FEM_VNEXT5(remove, func, arg0, nm, cr, ct, flags); } int @@ -2234,7 +2820,7 @@ vnext_link(femarg_t *vf, vnode_t *svp, char *tnm, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_link, femop_link); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, svp, tnm, cr, ct, flags)); + FEM_VNEXT6(link, func, arg0, svp, tnm, cr, ct, flags); } int @@ -2249,7 +2835,7 @@ vnext_rename(femarg_t *vf, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, vsop_find(vf, &func, int, &arg0, 
vop_rename, femop_rename); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, snm, tdvp, tnm, cr, ct, flags)); + FEM_VNEXT7(rename, func, arg0, snm, tdvp, tnm, cr, ct, flags); } int @@ -2264,7 +2850,7 @@ vnext_mkdir(femarg_t *vf, char *dirname, vattr_t *vap, vnode_t **vpp, vsop_find(vf, &func, int, &arg0, vop_mkdir, femop_mkdir); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, dirname, vap, vpp, cr, ct, flags, vsecp)); + FEM_VNEXT8(mkdir, func, arg0, dirname, vap, vpp, cr, ct, flags, vsecp); } int @@ -2279,7 +2865,7 @@ vnext_rmdir(femarg_t *vf, char *nm, vnode_t *cdir, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_rmdir, femop_rmdir); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, nm, cdir, cr, ct, flags)); + FEM_VNEXT6(rmdir, func, arg0, nm, cdir, cr, ct, flags); } int @@ -2294,7 +2880,7 @@ vnext_readdir(femarg_t *vf, uio_t *uiop, cred_t *cr, int *eofp, vsop_find(vf, &func, int, &arg0, vop_readdir, femop_readdir); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, cr, eofp, ct, flags)); + FEM_VNEXT6(readdir, func, arg0, uiop, cr, eofp, ct, flags); } int @@ -2309,7 +2895,7 @@ vnext_symlink(femarg_t *vf, char *linkname, vattr_t *vap, char *target, vsop_find(vf, &func, int, &arg0, vop_symlink, femop_symlink); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, linkname, vap, target, cr, ct, flags)); + FEM_VNEXT7(symlink, func, arg0, linkname, vap, target, cr, ct, flags); } int @@ -2323,7 +2909,7 @@ vnext_readlink(femarg_t *vf, uio_t *uiop, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_readlink, femop_readlink); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, cr, ct)); + FEM_VNEXT4(readlink, func, arg0, uiop, cr, ct); } int @@ -2337,7 +2923,7 @@ vnext_fsync(femarg_t *vf, int syncflag, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_fsync, femop_fsync); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, syncflag, cr, ct)); + FEM_VNEXT4(fsync, func, arg0, syncflag, cr, ct); } void @@ -2365,7 +2951,7 @@ vnext_fid(femarg_t *vf, fid_t *fidp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_fid, femop_fid); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, fidp, ct)); + FEM_VNEXT3(fid, func, arg0, fidp, ct); } int @@ -2379,7 +2965,7 @@ vnext_rwlock(femarg_t *vf, int write_lock, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_rwlock, femop_rwlock); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, write_lock, ct)); + FEM_VNEXT3(rwlock, func, arg0, write_lock, ct); } void @@ -2407,7 +2993,7 @@ vnext_seek(femarg_t *vf, offset_t ooff, offset_t *noffp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_seek, femop_seek); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, ooff, noffp, ct)); + FEM_VNEXT4(seek, func, arg0, ooff, noffp, ct); } int @@ -2421,7 +3007,7 @@ vnext_cmp(femarg_t *vf, vnode_t *vp2, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_cmp, femop_cmp); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vp2, ct)); + FEM_VNEXT3(cmp, func, arg0, vp2, ct); } int @@ -2437,7 +3023,7 @@ vnext_frlock(femarg_t *vf, int cmd, struct flock64 *bfp, int flag, vsop_find(vf, &func, int, &arg0, vop_frlock, femop_frlock); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct)); + FEM_VNEXT8(frlock, func, arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct); } int 
@@ -2452,7 +3038,7 @@ vnext_space(femarg_t *vf, int cmd, struct flock64 *bfp, int flag, vsop_find(vf, &func, int, &arg0, vop_space, femop_space); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, bfp, flag, offset, cr, ct)); + FEM_VNEXT7(space, func, arg0, cmd, bfp, flag, offset, cr, ct); } int @@ -2466,7 +3052,7 @@ vnext_realvp(femarg_t *vf, vnode_t **vpp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_realvp, femop_realvp); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vpp, ct)); + FEM_VNEXT3(realvp, func, arg0, vpp, ct); } int @@ -2482,8 +3068,8 @@ vnext_getpage(femarg_t *vf, offset_t off, size_t len, uint_t *protp, vsop_find(vf, &func, int, &arg0, vop_getpage, femop_getpage); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, len, protp, plarr, plsz, seg, addr, rw, - cr, ct)); + FEM_VNEXT11(getpage, func, arg0, off, len, protp, + plarr, plsz, seg, addr, rw, cr, ct); } int @@ -2498,7 +3084,7 @@ vnext_putpage(femarg_t *vf, offset_t off, size_t len, int flags, vsop_find(vf, &func, int, &arg0, vop_putpage, femop_putpage); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, len, flags, cr, ct)); + FEM_VNEXT6(putpage, func, arg0, off, len, flags, cr, ct); } int @@ -2514,8 +3100,8 @@ vnext_map(femarg_t *vf, offset_t off, struct as *as, caddr_t *addrp, vsop_find(vf, &func, int, &arg0, vop_map, femop_map); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, as, addrp, len, prot, maxprot, flags, - cr, ct)); + FEM_VNEXT10(map, func, arg0, off, as, addrp, len, prot, maxprot, flags, + cr, ct); } int @@ -2531,8 +3117,8 @@ vnext_addmap(femarg_t *vf, offset_t off, struct as *as, caddr_t addr, vsop_find(vf, &func, int, &arg0, vop_addmap, femop_addmap); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, as, addr, len, prot, maxprot, flags, - cr, ct)); + FEM_VNEXT10(addmap, func, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct); } int @@ -2548,8 +3134,8 @@ vnext_delmap(femarg_t *vf, offset_t off, struct as *as, caddr_t addr, vsop_find(vf, &func, int, &arg0, vop_delmap, femop_delmap); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, as, addr, len, prot, maxprot, flags, - cr, ct)); + FEM_VNEXT10(delmap, func, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct); } int @@ -2564,7 +3150,7 @@ vnext_poll(femarg_t *vf, short events, int anyyet, short *reventsp, vsop_find(vf, &func, int, &arg0, vop_poll, femop_poll); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, events, anyyet, reventsp, phpp, ct)); + FEM_VNEXT6(poll, func, arg0, events, anyyet, reventsp, phpp, ct); } int @@ -2579,7 +3165,7 @@ vnext_dump(femarg_t *vf, caddr_t addr, offset_t lbdn, offset_t dblks, vsop_find(vf, &func, int, &arg0, vop_dump, femop_dump); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, addr, lbdn, dblks, ct)); + FEM_VNEXT5(dump, func, arg0, addr, lbdn, dblks, ct); } int @@ -2594,7 +3180,7 @@ vnext_pathconf(femarg_t *vf, int cmd, ulong_t *valp, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_pathconf, femop_pathconf); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, valp, cr, ct)); + FEM_VNEXT5(pathconf, func, arg0, cmd, valp, cr, ct); } int @@ -2609,7 +3195,7 @@ vnext_pageio(femarg_t *vf, struct page *pp, u_offset_t io_off, vsop_find(vf, &func, int, &arg0, vop_pageio, femop_pageio); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, pp, io_off, io_len, flags, cr, ct)); + 
FEM_VNEXT7(pageio, func, arg0, pp, io_off, io_len, flags, cr, ct); } int @@ -2623,7 +3209,7 @@ vnext_dumpctl(femarg_t *vf, int action, offset_t *blkp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_dumpctl, femop_dumpctl); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, action, blkp, ct)); + FEM_VNEXT4(dumpctl, func, arg0, action, blkp, ct); } void @@ -2653,7 +3239,7 @@ vnext_setsecattr(femarg_t *vf, vsecattr_t *vsap, int flag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_setsecattr, femop_setsecattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vsap, flag, cr, ct)); + FEM_VNEXT5(setsecattr, func, arg0, vsap, flag, cr, ct); } int @@ -2668,7 +3254,7 @@ vnext_getsecattr(femarg_t *vf, vsecattr_t *vsap, int flag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_getsecattr, femop_getsecattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vsap, flag, cr, ct)); + FEM_VNEXT5(getsecattr, func, arg0, vsap, flag, cr, ct); } int @@ -2683,7 +3269,7 @@ vnext_shrlock(femarg_t *vf, int cmd, struct shrlock *shr, int flag, vsop_find(vf, &func, int, &arg0, vop_shrlock, femop_shrlock); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, shr, flag, cr, ct)); + FEM_VNEXT6(shrlock, func, arg0, cmd, shr, flag, cr, ct); } int @@ -2698,7 +3284,7 @@ vnext_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *cname, vsop_find(vf, &func, int, &arg0, vop_vnevent, femop_vnevent); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vnevent, dvp, cname, ct)); + FEM_VNEXT5(vnevent, func, arg0, vnevent, dvp, cname, ct); } int @@ -2713,7 +3299,7 @@ vnext_reqzcbuf(femarg_t *vf, enum uio_rw ioflag, xuio_t *xuiop, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_reqzcbuf, femop_reqzcbuf); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, ioflag, xuiop, cr, ct)); + FEM_VNEXT5(reqzcbuf, func, arg0, ioflag, xuiop, cr, ct); } int @@ -2727,7 +3313,7 @@ vnext_retzcbuf(femarg_t *vf, xuio_t *xuiop, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_retzcbuf, femop_retzcbuf); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, xuiop, cr, ct)); + FEM_VNEXT4(retzcbuf, func, arg0, xuiop, cr, ct); } int diff --git a/usr/src/uts/common/fs/fifofs/fifosubr.c b/usr/src/uts/common/fs/fifofs/fifosubr.c index 6e56000ffe..56204c6741 100644 --- a/usr/src/uts/common/fs/fifofs/fifosubr.c +++ b/usr/src/uts/common/fs/fifofs/fifosubr.c @@ -614,9 +614,12 @@ fifo_stropen(vnode_t **vpp, int flag, cred_t *crp, int dotwist, int lockheld) /* * The other end of the pipe is almost closed so * reject any other open on this end of the pipe - * This only happens with a pipe mounted under namefs + * This normally only happens with a pipe mounted under namefs, but + * we can also see an open via proc/fd, which should still succeed. + * To indicate the proc/fd case the FKLYR flag is passed. */ - if ((fnp->fn_flag & (FIFOCLOSE|ISPIPE)) == (FIFOCLOSE|ISPIPE)) { + if ((fnp->fn_flag & (FIFOCLOSE|ISPIPE)) == (FIFOCLOSE|ISPIPE) && + (flag & FKLYR) == 0) { fifo_cleanup(oldvp, flag); cv_broadcast(&fnp->fn_wait_cv); if (!lockheld) diff --git a/usr/src/uts/common/fs/fifofs/fifovnops.c b/usr/src/uts/common/fs/fifofs/fifovnops.c index ac89e430c7..fee2924093 100644 --- a/usr/src/uts/common/fs/fifofs/fifovnops.c +++ b/usr/src/uts/common/fs/fifofs/fifovnops.c @@ -27,7 +27,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2014, Joyent, Inc. 
All rights reserved. + */ /* * FIFOFS file system vnode operations. This file system @@ -1832,17 +1834,16 @@ fifo_poll(vnode_t *vp, short events, int anyyet, short *reventsp, } /* - * if we happened to get something, return + * if we happened to get something and we're not edge-triggered, return */ - - if ((*reventsp = (short)retevents) != 0) { + if ((*reventsp = (short)retevents) != 0 && !(events & POLLET)) { mutex_exit(&fnp->fn_lock->flk_lock); return (0); } /* - * If poll() has not found any events yet, set up event cell - * to wake up the poll if a requested event occurs on this + * If poll() has not found any events yet or we're edge-triggered, set + * up event cell to wake up the poll if a requested event occurs on this * pipe/fifo. */ if (!anyyet) { diff --git a/usr/src/uts/common/fs/fs_subr.c b/usr/src/uts/common/fs/fs_subr.c index 744d269716..ec5b145a86 100644 --- a/usr/src/uts/common/fs/fs_subr.c +++ b/usr/src/uts/common/fs/fs_subr.c @@ -25,6 +25,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ /* @@ -246,6 +247,7 @@ fs_frlock(register vnode_t *vp, int cmd, struct flock64 *bfp, int flag, int frcmd; int nlmid; int error = 0; + boolean_t skip_lock = B_FALSE; flk_callback_t serialize_callback; int serialize = 0; v_mode_t mode; @@ -265,6 +267,17 @@ fs_frlock(register vnode_t *vp, int cmd, struct flock64 *bfp, int flag, } break; + case F_OFD_GETLK: + /* + * TBD we do not support remote OFD locks at this time. + */ + if (flag & (F_REMOTELOCK | F_PXFSLOCK)) { + error = EINVAL; + goto done; + } + skip_lock = B_TRUE; + break; + case F_SETLK_NBMAND: /* * Are NBMAND locks allowed on this file? @@ -326,6 +339,20 @@ fs_frlock(register vnode_t *vp, int cmd, struct flock64 *bfp, int flag, } break; + case F_OFD_SETLK: + case F_OFD_SETLKW: + case F_FLOCK: + case F_FLOCKW: + /* + * TBD we do not support remote OFD locks at this time. + */ + if (flag & (F_REMOTELOCK | F_PXFSLOCK)) { + error = EINVAL; + goto done; + } + skip_lock = B_TRUE; + break; + case F_HASREMOTELOCKS: nlmid = GETNLMID(bfp->l_sysid); if (nlmid != 0) { /* booted as a cluster */ @@ -354,7 +381,8 @@ fs_frlock(register vnode_t *vp, int cmd, struct flock64 *bfp, int flag, flk_cbp = &serialize_callback; } - error = reclock(vp, bfp, frcmd, flag, offset, flk_cbp); + if (!skip_lock) + error = reclock(vp, bfp, frcmd, flag, offset, flk_cbp); done: if (serialize) diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c new file mode 100644 index 0000000000..05ee2c6e09 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c @@ -0,0 +1,640 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
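Stepping back to the fs_frlock() hunk above: the new F_OFD_* and F_FLOCK* commands carry locks owned by the open file description rather than by the process, and the TBD note restricts them to local files for now. From a program's point of view this is the ordinary fcntl() interface; a minimal userland sketch, on a system whose <fcntl.h> exposes F_OFD_SETLK:

	#define	_GNU_SOURCE	/* some systems hide F_OFD_* behind this */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		struct flock fl = { 0 };
		int fd = open("/tmp/lockfile", O_RDWR | O_CREAT, 0644);

		if (fd < 0)
			return (1);

		fl.l_type = F_WRLCK;	/* write lock ... */
		fl.l_whence = SEEK_SET;
		fl.l_start = 0;
		fl.l_len = 0;		/* ... over the whole file */
		fl.l_pid = 0;		/* l_pid is not used for OFD locks */

		/* Lock is tied to this open file description, not the pid. */
		if (fcntl(fd, F_OFD_SETLK, &fl) != 0)
			perror("F_OFD_SETLK");

		/* Closing the last descriptor for the OFD drops the lock. */
		(void) close(fd);
		return (0);
	}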
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2012, Joyent, Inc. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/stat.h> +#include <sys/policy.h> +#include <sys/fs/hyprlofs_info.h> + +static int hldir_make_hlnode(hlnode_t *, hlfsmount_t *, vattr_t *, enum de_op, + vnode_t *, hlnode_t **, cred_t *); +static int hldiraddentry(hlnode_t *, hlnode_t *, char *); + + +#define HL_HASH_SIZE 8192 /* must be power of 2 */ +#define HL_MUTEX_SIZE 64 + +static hldirent_t *hl_hashtable[HL_HASH_SIZE]; +static kmutex_t hl_hashmutex[HL_MUTEX_SIZE]; + +#define HL_HASH_INDEX(a) ((a) & (HL_HASH_SIZE-1)) +#define HL_MUTEX_INDEX(a) ((a) & (HL_MUTEX_SIZE-1)) + +#define HYPRLOFS_HASH(tp, name, hash) \ + { \ + char Xc, *Xcp; \ + hash = (uint_t)(uintptr_t)(tp) >> 8; \ + for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++) \ + hash = (hash << 4) + hash + (uint_t)Xc; \ + } + +void +hyprlofs_hash_init(void) +{ + int ix; + + for (ix = 0; ix < HL_MUTEX_SIZE; ix++) + mutex_init(&hl_hashmutex[ix], NULL, MUTEX_DEFAULT, NULL); +} + +static void +hyprlofs_hash_in(hldirent_t *h) +{ + uint_t hash; + hldirent_t **prevpp; + kmutex_t *hmtx; + + HYPRLOFS_HASH(h->hld_parent, h->hld_name, hash); + h->hld_hash = hash; + prevpp = &hl_hashtable[HL_HASH_INDEX(hash)]; + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + h->hld_link = *prevpp; + *prevpp = h; + mutex_exit(hmtx); +} + +/* Remove hldirent *h from the hash list. */ +static void +hyprlofs_hash_out(hldirent_t *h) +{ + uint_t hash; + hldirent_t **prevpp; + kmutex_t *hmtx; + + hash = h->hld_hash; + prevpp = &hl_hashtable[HL_HASH_INDEX(hash)]; + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + while (*prevpp != h) + prevpp = &(*prevpp)->hld_link; + *prevpp = h->hld_link; + mutex_exit(hmtx); +} + +static hldirent_t * +hyprlofs_hash_lookup(char *name, hlnode_t *parent, uint_t hold, + hlnode_t **found) +{ + hldirent_t *l; + uint_t hash; + kmutex_t *hmtx; + hlnode_t *hnp; + + HYPRLOFS_HASH(parent, name, hash); + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + l = hl_hashtable[HL_HASH_INDEX(hash)]; + while (l) { + if (l->hld_hash == hash && l->hld_parent == parent && + strcmp(l->hld_name, name) == 0) { + /* + * Ensure that the hlnode that we put a hold on is the + * same one that we pass back. Thus the temp. var + * hnp is necessary. + */ + hnp = l->hld_hlnode; + if (hold) { + ASSERT(hnp); + hlnode_hold(hnp); + } + if (found) + *found = hnp; + mutex_exit(hmtx); + return (l); + } else { + l = l->hld_link; + } + } + mutex_exit(hmtx); + return (NULL); +} + +/* + * Search directory 'parent' for entry 'name'. + * + * The calling thread can't hold the write version of the rwlock for the + * directory being searched + * + * On success *foundtp points to the found hlnode with its vnode held. 
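+ *
+ * A minimal sketch of the expected caller pattern (the name passed in is
+ * purely illustrative); the hold taken on a successful lookup is dropped
+ * with hlnode_rele() once the caller is finished with the node:
+ *
+ *	hlnode_t *hp;
+ *
+ *	if (hyprlofs_dirlookup(parent, "somefile", &hp, cr) == 0) {
+ *		vnode_t *vp = HLNTOV(hp);
+ *		...operate on vp...
+ *		hlnode_rele(hp);
+ *	}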
+ */ +int +hyprlofs_dirlookup(hlnode_t *parent, char *name, hlnode_t **foundtp, cred_t *cr) +{ + int error; + + *foundtp = NULL; + if (parent->hln_type != VDIR) + return (ENOTDIR); + + if ((error = hyprlofs_taccess(parent, VEXEC, cr))) + return (error); + + if (*name == '\0') { + hlnode_hold(parent); + *foundtp = parent; + return (0); + } + + /* + * Search the directory for the matching name. We need the lock + * protecting the hln_dir list so that it doesn't change out from + * underneath us. hyprlofs_hash_lookup() will pass back the hlnode + * with a hold on it. + */ + if (hyprlofs_hash_lookup(name, parent, 1, foundtp) != NULL) { + ASSERT(*foundtp); + return (0); + } + + return (ENOENT); +} + +/* + * Enter a directory entry (either a file or subdir, depending on op) for + * 'name' and 'hp' into directory 'dir' + */ +int +hyprlofs_direnter( + hlfsmount_t *hm, + hlnode_t *dir, /* target directory to make entry in */ + char *name, /* name of entry */ + enum de_op op, /* entry operation */ + vnode_t *realvp, /* real vnode */ + vattr_t *va, + hlnode_t **hpp, /* return hlnode */ + cred_t *cr) +{ + hldirent_t *hdp; + hlnode_t *found = NULL; + hlnode_t *hp; + int error = 0; + char *s; + + /* hln_rwlock is held to serialize direnter and dirdeletes */ + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + /* Don't allow '/' characters in pathname component */ + for (s = name; *s; s++) + if (*s == '/') + return (EACCES); + + if (name[0] == '\0') + panic("hyprlofs_direnter: NULL name"); + + /* + * This might be a "dangling detached directory". It could have been + * removed, but a reference to it kept in u_cwd. Don't bother searching + * it, and with any luck the user will get tired of dealing with us and + * cd to some absolute pathway. This is in ufs, too. + */ + if (dir->hln_nlink == 0) { + return (ENOENT); + } + + /* Search for the entry. Return "found" if it exists. */ + hdp = hyprlofs_hash_lookup(name, dir, 1, &found); + + if (hdp) { + ASSERT(found); + switch (op) { + case DE_CREATE: + case DE_MKDIR: + if (hpp) { + *hpp = found; + error = EEXIST; + } else { + hlnode_rele(found); + } + break; + } + } else { + + /* + * The entry does not exist. Check write perms in dir to see if + * entry can be created. + */ + if ((error = hyprlofs_taccess(dir, VWRITE, cr))) + return (error); + + /* Make new hlnode and directory entry as required. */ + if ((error = hldir_make_hlnode(dir, hm, va, op, realvp, &hp, + cr))) + return (error); + + if ((error = hldiraddentry(dir, hp, name))) { + /* Unmake the inode we just made. */ + rw_enter(&hp->hln_rwlock, RW_WRITER); + if ((hp->hln_type) == VDIR) { + ASSERT(hdp == NULL); + /* cleanup allocs made by hyprlofs_dirinit() */ + hyprlofs_dirtrunc(hp); + } + mutex_enter(&hp->hln_tlock); + hp->hln_nlink = 0; + mutex_exit(&hp->hln_tlock); + gethrestime(&hp->hln_ctime); + rw_exit(&hp->hln_rwlock); + hlnode_rele(hp); + hp = NULL; + } else if (hpp) { + *hpp = hp; + } else { + hlnode_rele(hp); + } + } + + return (error); +} + +/* + * Delete entry hp of name "nm" from dir. Free dir entry space and decrement + * link count on hlnode(s). + */ +int +hyprlofs_dirdelete(hlnode_t *dir, hlnode_t *hp, char *nm, enum dr_op op, + cred_t *cr) +{ + hldirent_t *hpdp; + int error; + size_t namelen; + hlnode_t *hnp; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(RW_WRITE_HELD(&hp->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + if (nm[0] == '\0') + panic("hyprlofs_dirdelete: NULL name for %p", (void *)hp); + + /* return error if removing . 
or .. */ + if (nm[0] == '.') { + if (nm[1] == '\0') + return (EINVAL); + if (nm[1] == '.' && nm[2] == '\0') + return (EEXIST); /* thus in ufs */ + } + + if ((error = hyprlofs_taccess(dir, VEXEC|VWRITE, cr)) != 0) + return (error); + + if (dir->hln_dir == NULL) + return (ENOENT); + + hpdp = hyprlofs_hash_lookup(nm, dir, 0, &hnp); + if (hpdp == NULL) { + /* + * If it is gone, some other thread got here first! + * Return error ENOENT. + */ + return (ENOENT); + } + + /* + * If the hlnode in the hldirent changed (shouldn't happen since we + * don't support rename) then original is gone, so return that status + * (same as UFS). + */ + if (hp != hnp) + return (ENOENT); + + hyprlofs_hash_out(hpdp); + + /* Take hpdp out of the directory list. */ + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + if (hpdp->hld_prev) { + hpdp->hld_prev->hld_next = hpdp->hld_next; + } + if (hpdp->hld_next) { + hpdp->hld_next->hld_prev = hpdp->hld_prev; + } + + /* + * If the roving slot pointer happens to match hpdp, point it at the + * previous dirent. + */ + if (dir->hln_dir->hld_prev == hpdp) { + dir->hln_dir->hld_prev = hpdp->hld_prev; + } + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + /* hpdp points to the correct directory entry */ + namelen = strlen(hpdp->hld_name) + 1; + + kmem_free(hpdp, sizeof (hldirent_t) + namelen); + dir->hln_size -= (sizeof (hldirent_t) + namelen); + dir->hln_dirents--; + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + hp->hln_ctime = now; + + ASSERT(hp->hln_nlink > 0); + DECR_COUNT(&hp->hln_nlink, &hp->hln_tlock); + if (op == DR_RMDIR && hp->hln_type == VDIR) { + hyprlofs_dirtrunc(hp); + ASSERT(hp->hln_nlink == 0); + } + return (0); +} + +/* + * hyprlofs_dirinit initializes a dir with '.' and '..' entries without + * checking perms and locking + */ +void +hyprlofs_dirinit( + hlnode_t *parent, /* parent of directory to initialize */ + hlnode_t *dir) /* the new directory */ +{ + hldirent_t *dot, *dotdot; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&parent->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + dot = kmem_zalloc(sizeof (hldirent_t) + 2, KM_SLEEP); + dotdot = kmem_zalloc(sizeof (hldirent_t) + 3, KM_SLEEP); + + /* Initialize the entries */ + dot->hld_hlnode = dir; + dot->hld_offset = 0; + dot->hld_name = (char *)dot + sizeof (hldirent_t); + dot->hld_name[0] = '.'; + dot->hld_parent = dir; + hyprlofs_hash_in(dot); + + dotdot->hld_hlnode = parent; + dotdot->hld_offset = 1; + dotdot->hld_name = (char *)dotdot + sizeof (hldirent_t); + dotdot->hld_name[0] = '.'; + dotdot->hld_name[1] = '.'; + dotdot->hld_parent = dir; + hyprlofs_hash_in(dotdot); + + /* Initialize directory entry list. */ + dot->hld_next = dotdot; + dot->hld_prev = dotdot; + dotdot->hld_next = NULL; + dotdot->hld_prev = dot; + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + /* + * Since hyprlofs_dirinit is called with both dir and parent being the + * same for the root vnode, we need to increment this before we set + * hln_nlink = 2 below. + */ + INCR_COUNT(&parent->hln_nlink, &parent->hln_tlock); + parent->hln_ctime = now; + + dir->hln_dir = dot; + dir->hln_size = 2 * sizeof (hldirent_t) + 5; /* dot and dotdot */ + dir->hln_dirents = 2; + dir->hln_nlink = 2; +} + + +/* + * hyprlofs_dirtrunc removes all dir entries under this dir. 
+ */ +void +hyprlofs_dirtrunc(hlnode_t *dir) +{ + hldirent_t *hdp; + hlnode_t *tp; + size_t namelen; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + if (dir->hln_looped) + return; + + for (hdp = dir->hln_dir; hdp; hdp = dir->hln_dir) { + ASSERT(hdp->hld_next != hdp); + ASSERT(hdp->hld_prev != hdp); + ASSERT(hdp->hld_hlnode); + + dir->hln_dir = hdp->hld_next; + namelen = strlen(hdp->hld_name) + 1; + + /* + * Adjust the link counts to account for this dir entry removal. + */ + tp = hdp->hld_hlnode; + + ASSERT(tp->hln_nlink > 0); + DECR_COUNT(&tp->hln_nlink, &tp->hln_tlock); + + hyprlofs_hash_out(hdp); + + kmem_free(hdp, sizeof (hldirent_t) + namelen); + dir->hln_size -= (sizeof (hldirent_t) + namelen); + dir->hln_dirents--; + } + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + ASSERT(dir->hln_dir == NULL); + ASSERT(dir->hln_size == 0); + ASSERT(dir->hln_dirents == 0); +} + +static int +hldiraddentry( + hlnode_t *dir, /* target directory to make entry in */ + hlnode_t *hp, /* new hlnode */ + char *name) +{ + hldirent_t *hdp, *hpdp; + size_t namelen, alloc_size; + timestruc_t now; + + /* + * Make sure the parent dir wasn't removed from underneath the caller. + */ + if (dir->hln_dir == NULL) + return (ENOENT); + + /* Check that everything is on the same FS. */ + if (hp->hln_vnode->v_vfsp != dir->hln_vnode->v_vfsp) + return (EXDEV); + + /* Alloc and init dir entry */ + namelen = strlen(name) + 1; + alloc_size = namelen + sizeof (hldirent_t); + hdp = kmem_zalloc(alloc_size, KM_NORMALPRI | KM_NOSLEEP); + if (hdp == NULL) + return (ENOSPC); + + dir->hln_size += alloc_size; + dir->hln_dirents++; + hdp->hld_hlnode = hp; + hdp->hld_parent = dir; + + /* The dir entry and its name were allocated sequentially. */ + hdp->hld_name = (char *)hdp + sizeof (hldirent_t); + (void) strcpy(hdp->hld_name, name); + + hyprlofs_hash_in(hdp); + + /* + * Some utilities expect the size of a directory to remain fairly + * static. For example, a routine which unlinks files between calls to + * readdir(); the size of the dir changes from underneath it and so the + * real dir offset in bytes is invalid. To circumvent this problem, we + * initialize a dir entry with a phony offset, and use this offset to + * determine end of file in hyprlofs_readdir. + */ + hpdp = dir->hln_dir->hld_prev; + /* + * Install at first empty "slot" in directory list. + */ + while (hpdp->hld_next != NULL && (hpdp->hld_next->hld_offset - + hpdp->hld_offset) <= 1) { + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + ASSERT(hpdp->hld_next->hld_offset > hpdp->hld_offset); + hpdp = hpdp->hld_next; + } + hdp->hld_offset = hpdp->hld_offset + 1; + + /* + * If we're at the end of the dirent list and the offset (which is + * necessarily the largest offset in this dir) is more than twice the + * number of dirents, that means the dir is 50% holes. At this point + * we reset the slot pointer back to the beginning of the dir so we + * start using the holes. The idea is that if there are N dirents, + * there must also be N holes, so we can satisfy the next N creates by + * walking at most 2N entries; thus the average cost of a create is + * constant. Note that we use the first dirent's hld_prev as the roving + * slot pointer. This saves a word in every dirent. 
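+ *
+ * A worked example with hypothetical offsets: if the list holds "." at 0,
+ * ".." at 1 and files at offsets 2 and 7, and the roving pointer sits at
+ * "..", the scan advances to the entry at offset 2 and stops because the
+ * next entry is more than one slot away, so the new dirent is installed at
+ * offset 3.  Only when the scan runs off the tail and the last offset
+ * exceeds twice the dirent count (say offset 11 with only 4 entries, since
+ * 11 > 2 * 4) is the roving pointer reset toward the head so that later
+ * creates reuse the accumulated holes.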
+ */ + if (hpdp->hld_next == NULL && hpdp->hld_offset > 2 * dir->hln_dirents) + dir->hln_dir->hld_prev = dir->hln_dir->hld_next; + else + dir->hln_dir->hld_prev = hdp; + + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + hdp->hld_next = hpdp->hld_next; + if (hdp->hld_next) { + hdp->hld_next->hld_prev = hdp; + } + hdp->hld_prev = hpdp; + hpdp->hld_next = hdp; + + ASSERT(hdp->hld_next != hdp); + ASSERT(hdp->hld_prev != hdp); + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + return (0); +} + +static int +hldir_make_hlnode(hlnode_t *dir, hlfsmount_t *hm, vattr_t *va, enum de_op op, + vnode_t *realvp, hlnode_t **newnode, cred_t *cr) +{ + hlnode_t *hp; + enum vtype type; + + ASSERT(va != NULL); + ASSERT(op == DE_CREATE || op == DE_MKDIR); + if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) || + ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime))) + return (EOVERFLOW); + type = va->va_type; + hp = kmem_zalloc(sizeof (hlnode_t), KM_SLEEP); + hyprlofs_node_init(hm, hp, va, cr); + + hp->hln_vnode->v_rdev = hp->hln_rdev = NODEV; + hp->hln_vnode->v_type = type; + hp->hln_uid = crgetuid(cr); + + /* + * To determine the gid of the created file: + * If the directory's set-gid bit is set, set the gid to the gid + * of the parent dir, otherwise, use the process's gid. + */ + if (dir->hln_mode & VSGID) + hp->hln_gid = dir->hln_gid; + else + hp->hln_gid = crgetgid(cr); + + /* + * If we're creating a dir and the parent dir has the set-GID bit set, + * set it on the new dir. Otherwise, if the user is neither privileged + * nor a member of the file's new group, clear the file's set-GID bit. + */ + if (dir->hln_mode & VSGID && type == VDIR) + hp->hln_mode |= VSGID; + else { + if ((hp->hln_mode & VSGID) && + secpolicy_vnode_setids_setgids(cr, hp->hln_gid) != 0) + hp->hln_mode &= ~VSGID; + } + + if (va->va_mask & AT_ATIME) + hp->hln_atime = va->va_atime; + if (va->va_mask & AT_MTIME) + hp->hln_mtime = va->va_mtime; + + if (op == DE_MKDIR) { + hyprlofs_dirinit(dir, hp); + hp->hln_looped = 0; + } else { + hp->hln_realvp = realvp; + hp->hln_size = va->va_size; + hp->hln_looped = 1; + } + + *newnode = hp; + return (0); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c new file mode 100644 index 0000000000..1d857309f3 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c @@ -0,0 +1,127 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/debug.h> +#include <sys/time.h> +#include <sys/cmn_err.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/vfs.h> +#include <sys/cred.h> +#include <sys/kmem.h> +#include <sys/atomic.h> +#include <sys/policy.h> +#include <sys/fs/hyprlofs_info.h> + +#define MODESHIFT 3 + +/* Initialize a hlnode and add it to file list under mount point. */ +void +hyprlofs_node_init(hlfsmount_t *hm, hlnode_t *h, vattr_t *vap, cred_t *cr) +{ + vnode_t *vp; + timestruc_t now; + + ASSERT(vap != NULL); + + rw_init(&h->hln_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&h->hln_tlock, NULL, MUTEX_DEFAULT, NULL); + h->hln_mode = MAKEIMODE(vap->va_type, vap->va_mode); + h->hln_mask = 0; + h->hln_type = vap->va_type; + h->hln_nodeid = (ino64_t)(uint32_t)((uintptr_t)h >> 3); + h->hln_nlink = 1; + h->hln_size = 0; + + if (cr == NULL) { + h->hln_uid = vap->va_uid; + h->hln_gid = vap->va_gid; + } else { + h->hln_uid = crgetuid(cr); + h->hln_gid = crgetgid(cr); + } + + h->hln_fsid = hm->hlm_dev; + h->hln_rdev = vap->va_rdev; + h->hln_blksize = PAGESIZE; + h->hln_nblocks = 0; + gethrestime(&now); + h->hln_atime = now; + h->hln_mtime = now; + h->hln_ctime = now; + h->hln_seq = 0; + h->hln_dir = NULL; + + h->hln_vnode = vn_alloc(KM_SLEEP); + vp = HLNTOV(h); + vn_setops(vp, hyprlofs_vnodeops); + vp->v_vfsp = hm->hlm_vfsp; + vp->v_type = vap->va_type; + vp->v_rdev = vap->va_rdev; + vp->v_data = (caddr_t)h; + mutex_enter(&hm->hlm_contents); + /* + * Increment the pseudo generation number for this hlnode. Since + * hlnodes are allocated and freed, there really is no particular + * generation number for a new hlnode. Just fake it by using a + * counter in each file system. + */ + h->hln_gen = hm->hlm_gen++; + + /* + * Add new hlnode to end of linked list of hlnodes for this hyprlofs + * Root dir is handled specially in hyprlofs_mount. + */ + if (hm->hlm_rootnode != (hlnode_t *)NULL) { + h->hln_forw = NULL; + h->hln_back = hm->hlm_rootnode->hln_back; + h->hln_back->hln_forw = hm->hlm_rootnode->hln_back = h; + } + mutex_exit(&hm->hlm_contents); + vn_exists(vp); +} + +int +hyprlofs_taccess(void *vtp, int mode, cred_t *cr) +{ + hlnode_t *hp = vtp; + int shift = 0; + + /* Check access based on owner, group and public perms in hlnode. */ + if (crgetuid(cr) != hp->hln_uid) { + shift += MODESHIFT; + if (groupmember(hp->hln_gid, cr) == 0) + shift += MODESHIFT; + } + + return (secpolicy_vnode_access2(cr, HLNTOV(hp), hp->hln_uid, + hp->hln_mode << shift, mode)); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c new file mode 100644 index 0000000000..c582a8cac2 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c @@ -0,0 +1,614 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +/* + * Hyperlofs is a hybrid file system combining features of the tmpfs(7FS) and + * lofs(7FS) file systems. It is modeled on code from both of these file + * systems. + * + * The purpose is to create a high performance name space for files on which + * applications will compute. Given a large number of data files with various + * owners, we want to construct a view onto those files such that only a subset + * is visible to the applications and such that the view can be changed very + * quickly as compute progresses. Entries in the name space are not mounts and + * thus do not appear in the mnttab. Entries in the name space are allowed to + * refer to files on different backing file systems. Intermediate directories + * in the name space exist only in-memory, ala tmpfs. There are no leaf nodes + * in the name space except for entries that refer to backing files ala lofs. + * + * The name space is managed via ioctls issued on the mounted file system and + * is mostly read-only for the compute applications. That is, applications + * cannot create new files in the name space. If a file is unlinked by an + * application, that only removes the file from the name space, the backing + * file remains in place. It is possible for applications to write-through to + * the backing files if the file system is mounted read-write. + * + * The name space is managed via the HYPRLOFS_ADD_ENTRIES, HYPRLOFS_RM_ENTRIES, + * and HYPRLOFS_RM_ALL ioctls on the top-level mount. + * + * The HYPRLOFS_ADD_ENTRIES ioctl specifies path(s) to the backing file(s) and + * the name(s) for the file(s) in the name space. The name(s) may be path(s) + * which will be relative to the root of the mount and thus cannot begin with + * a /. If the name is a path, it does not have to correspond to any backing + * path. The intermediate directories will only exist in the name space. The + * entry(ies) will be added to the name space. + * + * The HYPRLOFS_RM_ENTRIES ioctl specifies the name(s) of the file(s) in the + * name space which should be removed. The name(s) may be path(s) which will + * be relative to the root of the mount and thus cannot begin with a /. The + * named entry(ies) will be removed. + * + * The HYPRLOFS_RM_ALL ioctl will remove all mappings from the name space. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/time.h> +#include <sys/pathname.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/stat.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/statvfs.h> +#include <sys/mount.h> +#include <sys/debug.h> +#include <sys/systm.h> +#include <sys/mntent.h> +#include <fs/fs_subr.h> +#include <vm/page.h> +#include <vm/anon.h> +#include <sys/model.h> +#include <sys/policy.h> + +#include <sys/fs/swapnode.h> +#include <sys/fs/hyprlofs_info.h> + +static int hyprlofsfstype; + +/* + * hyprlofs vfs operations. 
+ */ +static int hyprlofsinit(int, char *); +static int hyprlofs_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *); +static int hyprlofs_unmount(vfs_t *, int, cred_t *); +static int hyprlofs_root(vfs_t *, vnode_t **); +static int hyprlofs_statvfs(vfs_t *, struct statvfs64 *); +static int hyprlofs_vget(vfs_t *, vnode_t **, struct fid *); + +/* + * Loadable module wrapper + */ +#include <sys/modctl.h> + +static mntopts_t hyprlofs_mntopts; + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "hyprlofs", + hyprlofsinit, + VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT, + &hyprlofs_mntopts +}; + +static mntopts_t hyprlofs_mntopts = { + 0, NULL +}; + +/* + * Module linkage information + */ +static struct modlfs modlfs = { + &mod_fsops, "filesystem for hyprlofs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modlfs, NULL +}; + +int +_init() +{ + return (mod_install(&modlinkage)); +} + +int +_fini() +{ + int error; + + error = mod_remove(&modlinkage); + if (error) + return (error); + /* + * Tear down the operations vectors + */ + (void) vfs_freevfsops_by_type(hyprlofsfstype); + vn_freevnodeops(hyprlofs_vnodeops); + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* + * The following are patchable variables limiting the amount of system + * resources hyprlofs can use. + * + * hyprlofs_maxkmem limits the amount of kernel kmem_alloc memory hyprlofs can + * use for it's data structures (e.g. hlnodes, directory entries). It is set + * as a percentage of physical memory which is determined when hyprlofs is + * first used in the system. + * + * hyprlofs_minfree is the minimum amount of swap space that hyprlofs leaves for + * the rest of the system. If the amount of free swap space in the system + * (i.e. anoninfo.ani_free) drops below hyprlofs_minfree, hyprlofs anon + * allocations will fail. + */ +size_t hyprlofs_maxkmem = 0; +size_t hyprlofs_minfree = 0; +size_t hyprlofs_kmemspace; /* bytes of kernel heap used by all hyprlofs */ + +static major_t hyprlofs_major; +static minor_t hyprlofs_minor; +static kmutex_t hyprlofs_minor_lock; + +/* + * initialize global hyprlofs locks and hashes when loading hyprlofs module + */ +static int +hyprlofsinit(int fstype, char *name) +{ + static const fs_operation_def_t hl_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = hyprlofs_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = hyprlofs_unmount }, + VFSNAME_ROOT, { .vfs_root = hyprlofs_root }, + VFSNAME_STATVFS, { .vfs_statvfs = hyprlofs_statvfs }, + VFSNAME_VGET, { .vfs_vget = hyprlofs_vget }, + NULL, NULL + }; + int error; + extern void hyprlofs_hash_init(); + + hyprlofs_hash_init(); + hyprlofsfstype = fstype; + ASSERT(hyprlofsfstype != 0); + + error = vfs_setfsops(fstype, hl_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "hyprlofsinit: bad vfs ops template"); + return (error); + } + + error = vn_make_ops(name, hyprlofs_vnodeops_template, + &hyprlofs_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "hyprlofsinit: bad vnode ops template"); + return (error); + } + + /* + * hyprlofs_minfree is an absolute limit of swap space which still + * allows other processes to execute. Set it if its not patched. 
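+ *
+ * Because hyprlofs_minfree is patchable, an administrator could override
+ * the default before the module is first used, e.g. with an /etc/system
+ * line along these lines (value in pages; shown only as an illustration):
+ *
+ *	set hyprlofs:hyprlofs_minfree = 0x2000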
+ */ + if (hyprlofs_minfree == 0) + hyprlofs_minfree = btopr(HYPRLOFSMINFREE); + + if ((hyprlofs_major = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, + "hyprlofsinit: Can't get unique device number."); + hyprlofs_major = 0; + } + mutex_init(&hyprlofs_minor_lock, NULL, MUTEX_DEFAULT, NULL); + return (0); +} + +static int +hyprlofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + hlfsmount_t *hm = NULL; + hlnode_t *hp; + struct pathname dpn; + int error; + vattr_t rattr; + int got_attrs; + + if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) + return (error); + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (uap->flags & MS_REMOUNT) + return (EBUSY); + + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* Having the resource be anything but "swap" doesn't make sense. */ + vfs_setresource(vfsp, "swap", 0); + + if ((error = pn_get(uap->dir, + (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, + &dpn)) != 0) + goto out; + + if ((hm = kmem_zalloc(sizeof (hlfsmount_t), + KM_NORMALPRI | KM_NOSLEEP)) == NULL) { + pn_free(&dpn); + error = ENOMEM; + goto out; + } + + /* Get an available minor device number for this mount */ + mutex_enter(&hyprlofs_minor_lock); + do { + hyprlofs_minor = (hyprlofs_minor + 1) & L_MAXMIN32; + hm->hlm_dev = makedevice(hyprlofs_major, hyprlofs_minor); + } while (vfs_devismounted(hm->hlm_dev)); + mutex_exit(&hyprlofs_minor_lock); + + /* + * Set but don't bother entering the mutex since hlfsmount is not on + * the mount list yet. + */ + mutex_init(&hm->hlm_contents, NULL, MUTEX_DEFAULT, NULL); + + hm->hlm_vfsp = vfsp; + + vfsp->vfs_data = (caddr_t)hm; + vfsp->vfs_fstype = hyprlofsfstype; + vfsp->vfs_dev = hm->hlm_dev; + vfsp->vfs_bsize = PAGESIZE; + vfsp->vfs_flag |= VFS_NOTRUNC; + vfs_make_fsid(&vfsp->vfs_fsid, hm->hlm_dev, hyprlofsfstype); + hm->hlm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP); + (void) strcpy(hm->hlm_mntpath, dpn.pn_path); + + /* allocate and initialize root hlnode structure */ + bzero(&rattr, sizeof (vattr_t)); + rattr.va_mode = (mode_t)(S_IFDIR | 0777); + rattr.va_type = VDIR; + rattr.va_rdev = 0; + hp = kmem_zalloc(sizeof (hlnode_t), KM_SLEEP); + hyprlofs_node_init(hm, hp, &rattr, cr); + + /* Get the mode, uid, and gid from the underlying mount point. */ + rattr.va_mask = AT_MODE|AT_UID|AT_GID; + got_attrs = VOP_GETATTR(mvp, &rattr, 0, cr, NULL); + + rw_enter(&hp->hln_rwlock, RW_WRITER); + HLNTOV(hp)->v_flag |= VROOT; + + /* + * If the getattr succeeded, use its results, otherwise allow the + * previously set defaults to prevail. 
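+ *
+ * For example, if the covered directory is owned by uid 100, gid 10 with
+ * mode 0750, the root of the new hyprlofs mount presents those same owner,
+ * group and permissions; only if the VOP_GETATTR() fails does the root
+ * keep the defaults set up above (mode 0777, owned by the mounting
+ * credential).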
+ */ + if (got_attrs == 0) { + hp->hln_mode = rattr.va_mode; + hp->hln_uid = rattr.va_uid; + hp->hln_gid = rattr.va_gid; + } + + /* + * Initialize linked list of hlnodes so that the back pointer of the + * root hlnode always points to the last one on the list and the + * forward pointer of the last node is null + */ + hp->hln_back = hp; + hp->hln_forw = NULL; + hp->hln_nlink = 0; + hm->hlm_rootnode = hp; + + hyprlofs_dirinit(hp, hp); + + rw_exit(&hp->hln_rwlock); + + pn_free(&dpn); + error = 0; + +out: + return (error); +} + +static int +hyprlofs_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hnp, *cancel; + vnode_t *vp; + int error; + + if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) + return (error); + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + /* + * forced unmount is not supported by this file system + * and thus, ENOTSUP, is being returned. + */ + if (flag & MS_FORCE) + return (ENOTSUP); + + mutex_enter(&hm->hlm_contents); + + /* + * If there are no open files, only the root node should have a ref cnt. + * With hlm_contents held, nothing can be added or removed. There may + * be some dirty pages. To prevent fsflush from disrupting the unmount, + * put a hold on each node while scanning. If we find a previously + * referenced node, undo the holds we have placed and fail EBUSY. + */ + hnp = hm->hlm_rootnode; + if (HLNTOV(hnp)->v_count > 1) { + mutex_exit(&hm->hlm_contents); + return (EBUSY); + } + + for (hnp = hnp->hln_forw; hnp; hnp = hnp->hln_forw) { + if ((vp = HLNTOV(hnp))->v_count > 0) { + cancel = hm->hlm_rootnode->hln_forw; + while (cancel != hnp) { + vp = HLNTOV(cancel); + ASSERT(vp->v_count > 0); + VN_RELE(vp); + cancel = cancel->hln_forw; + } + mutex_exit(&hm->hlm_contents); + return (EBUSY); + } + VN_HOLD(vp); + } + + /* We can drop the mutex now because no one can find this mount */ + mutex_exit(&hm->hlm_contents); + + /* + * Free all alloc'd memory associated with this FS. To do this, we go + * through the file list twice, once to remove all the dir entries, and + * then to remove all the files. + */ + + /* Remove all directory entries */ + for (hnp = hm->hlm_rootnode; hnp; hnp = hnp->hln_forw) { + rw_enter(&hnp->hln_rwlock, RW_WRITER); + if (hnp->hln_type == VDIR) + hyprlofs_dirtrunc(hnp); + rw_exit(&hnp->hln_rwlock); + } + + ASSERT(hm->hlm_rootnode); + + /* + * All links are gone, v_count is keeping nodes in place. VN_RELE + * should make the node disappear, unless somebody is holding pages + * against it. Wait and retry until it disappears. + * + * We re-acquire the lock to prevent others who have a HOLD on a hlnode + * from blowing it away (in hyprlofs_inactive) while we're trying to + * get to it here. Once we have a HOLD on it we know it'll stick around. + */ + mutex_enter(&hm->hlm_contents); + + /* Remove all the files (except the rootnode) backwards. */ + while ((hnp = hm->hlm_rootnode->hln_back) != hm->hlm_rootnode) { + mutex_exit(&hm->hlm_contents); + /* Note we handled the link count in pass 2 above. */ + vp = HLNTOV(hnp); + VN_RELE(vp); + mutex_enter(&hm->hlm_contents); + /* + * It's still there after the RELE. Someone else like pageout + * has a hold on it so wait a bit and then try again. 
+ */ + if (hnp == hm->hlm_rootnode->hln_back) { + VN_HOLD(vp); + mutex_exit(&hm->hlm_contents); + delay(hz / 4); + mutex_enter(&hm->hlm_contents); + } + } + mutex_exit(&hm->hlm_contents); + + VN_RELE(HLNTOV(hm->hlm_rootnode)); + + ASSERT(hm->hlm_mntpath); + + kmem_free(hm->hlm_mntpath, strlen(hm->hlm_mntpath) + 1); + + mutex_destroy(&hm->hlm_contents); + kmem_free(hm, sizeof (hlfsmount_t)); + + return (0); +} + +/* Return root hlnode for given vnode */ +static int +hyprlofs_root(vfs_t *vfsp, vnode_t **vpp) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hp = hm->hlm_rootnode; + vnode_t *vp; + + ASSERT(hp); + + vp = HLNTOV(hp); + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +hyprlofs_statvfs(vfs_t *vfsp, struct statvfs64 *sbp) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + ulong_t blocks; + dev32_t d32; + zoneid_t eff_zid; + struct zone *zp; + + /* + * The FS may have been mounted by the GZ on behalf of the NGZ. In + * that case, the hlfsmount zone_id will be the global zone. We want + * to show the swap cap inside the zone in this case, even though the + * FS was mounted by the GZ. + */ + if (curproc->p_zone->zone_id != GLOBAL_ZONEUNIQID) + zp = curproc->p_zone; + else + zp = hm->hlm_vfsp->vfs_zone; + + if (zp == NULL) + eff_zid = GLOBAL_ZONEUNIQID; + else + eff_zid = zp->zone_id; + + sbp->f_bsize = PAGESIZE; + sbp->f_frsize = PAGESIZE; + + /* + * Find the amount of available physical and memory swap + */ + mutex_enter(&anoninfo_lock); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP; + mutex_exit(&anoninfo_lock); + + if (blocks > hyprlofs_minfree) + sbp->f_bfree = blocks - hyprlofs_minfree; + else + sbp->f_bfree = 0; + + sbp->f_bavail = sbp->f_bfree; + + /* + * Total number of blocks is what's available plus what's been used + */ + sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree); + + if (eff_zid != GLOBAL_ZONEUNIQID && + zp->zone_max_swap_ctl != UINT64_MAX) { + /* + * If the fs is used by a NGZ with a swap cap, then report the + * capped size. + */ + rctl_qty_t cap, used; + pgcnt_t pgcap, pgused; + + mutex_enter(&zp->zone_mem_lock); + cap = zp->zone_max_swap_ctl; + used = zp->zone_max_swap; + mutex_exit(&zp->zone_mem_lock); + + pgcap = btop(cap); + pgused = btop(used); + + sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree); + sbp->f_bavail = sbp->f_bfree; + sbp->f_blocks = MIN(pgcap, sbp->f_blocks); + } + + /* + * This is fairly inaccurate since it doesn't take into account the + * names stored in the directory entries. 
+ */ + sbp->f_ffree = sbp->f_files = ptob(availrmem) / + (sizeof (hlnode_t) + sizeof (hldirent_t)); + + sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); + (void) cmpldev(&d32, vfsp->vfs_dev); + sbp->f_fsid = d32; + (void) strcpy(sbp->f_basetype, vfssw[hyprlofsfstype].vsw_name); + (void) strncpy(sbp->f_fstr, hm->hlm_mntpath, sizeof (sbp->f_fstr)); + /* + * ensure null termination + */ + sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0'; + sbp->f_flag = vf_to_stf(vfsp->vfs_flag); + sbp->f_namemax = MAXNAMELEN - 1; + return (0); +} + +static int +hyprlofs_vget(vfs_t *vfsp, vnode_t **vpp, struct fid *fidp) +{ + hlfid_t *hfid; + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hp = NULL; + + hfid = (hlfid_t *)fidp; + *vpp = NULL; + + mutex_enter(&hm->hlm_contents); + for (hp = hm->hlm_rootnode; hp; hp = hp->hln_forw) { + mutex_enter(&hp->hln_tlock); + if (hp->hln_nodeid == hfid->hlfid_ino) { + /* + * If the gen numbers don't match we know the file + * won't be found since only one hlnode can have this + * number at a time. + */ + if (hp->hln_gen != hfid->hlfid_gen || + hp->hln_nlink == 0) { + mutex_exit(&hp->hln_tlock); + mutex_exit(&hm->hlm_contents); + return (0); + } + *vpp = (vnode_t *)HLNTOV(hp); + + VN_HOLD(*vpp); + + if ((hp->hln_mode & S_ISVTX) && + !(hp->hln_mode & (S_IXUSR | S_IFDIR))) { + mutex_enter(&(*vpp)->v_lock); + (*vpp)->v_flag |= VISSWAP; + mutex_exit(&(*vpp)->v_lock); + } + mutex_exit(&hp->hln_tlock); + mutex_exit(&hm->hlm_contents); + return (0); + } + mutex_exit(&hp->hln_tlock); + } + mutex_exit(&hm->hlm_contents); + return (0); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c new file mode 100644 index 0000000000..a2064dfa1f --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c @@ -0,0 +1,1441 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2015 Joyent, Inc. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/user.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/fcntl.h> +#include <sys/flock.h> +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/cred.h> +#include <sys/dirent.h> +#include <sys/pathname.h> +#include <sys/fs/hyprlofs.h> +#include <sys/fs/hyprlofs_info.h> +#include <sys/mman.h> +#include <vm/pvn.h> +#include <sys/cmn_err.h> +#include <sys/buf.h> +#include <sys/policy.h> +#include <fs/fs_subr.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> + +static int hyprlofs_add_entry(vnode_t *, char *, char *, cred_t *, + caller_context_t *); +static int hyprlofs_rm_entry(vnode_t *, char *, cred_t *, caller_context_t *, + int); +static int hyprlofs_rm_all(vnode_t *, cred_t *, caller_context_t *, int); +static int hyprlofs_remove(vnode_t *, char *, cred_t *, caller_context_t *, + int); +static int hyprlofs_get_all(vnode_t *, intptr_t, cred_t *, caller_context_t *, + int); + +/* + * This is a somewhat arbitrary upper limit on the number of entries we can + * pass in on a single add/rm ioctl call. This is only used to validate that + * the input list looks sane. + */ +#define MAX_IOCTL_PARAMS 100000 + +static int +hyprlofs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + vnode_t *rvp; + int error; + + rvp = REALVP(*vpp); + + if (VTOHLN(*vpp)->hln_looped == 0) + return (0); + + /* + * looped back, pass through to real vnode. Need to hold new reference + * to vp since VOP_OPEN() may decide to release it. + */ + VN_HOLD(rvp); + error = VOP_OPEN(&rvp, flag, cr, ct); + ASSERT(rvp->v_count > 1); + VN_RELE(rvp); + + return (error); +} + +static int +hyprlofs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 0) { + cleanlocks(vp, ttoproc(curthread)->p_pid, 0); + cleanshares(vp, ttoproc(curthread)->p_pid); + return (0); + } + + return (VOP_CLOSE(REALVP(vp), flag, count, offset, cr, ct)); +} + +static int +hyprlofs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + if (vp->v_type == VDIR) + return (EISDIR); + return (VOP_READ(REALVP(vp), uiop, ioflag, cr, ct)); +} + +static int +hyprlofs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + /* We don't support writing to non-regular files */ + if (vp->v_type != VREG) + return (EINVAL); + + if (vn_is_readonly(vp)) + return (EROFS); + + return (VOP_WRITE(REALVP(vp), uiop, ioflag, cr, ct)); +} + +/* ARGSUSED */ +static int +hyprlofs_ioctl(vnode_t *vp, int cmd, intptr_t data, int flag, + cred_t *cr, int *rvalp, caller_context_t *ct) +{ + int len, cnt, error; + int i; + model_t model; + char path[MAXPATHLEN]; + char nm[MAXPATHLEN]; + + /* We only support the hyprlofs ioctls on the root vnode */ + if (!(vp->v_flag & VROOT)) + return (ENOTTY); + + /* + * Check if managing hyprlofs is allowed. 
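+ *
+ * As a rough userland sketch, a privileged consumer reaches this handler
+ * with something like the following, using the hle_* fields consumed below
+ * (see sys/fs/hyprlofs.h for the actual declarations; the paths and names
+ * here are purely illustrative):
+ *
+ *	hyprlofs_entry_t e;
+ *	hyprlofs_entries_t ents;
+ *	int fd = open("/my/hyprlofs/mount", O_RDONLY);
+ *
+ *	e.hle_path = "/real/backing/file";
+ *	e.hle_plen = strlen(e.hle_path);
+ *	e.hle_name = "dir/in/namespace/file";
+ *	e.hle_nlen = strlen(e.hle_name);
+ *	ents.hle_entries = &e;
+ *	ents.hle_len = 1;
+ *	(void) ioctl(fd, HYPRLOFS_ADD_ENTRIES, &ents);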
+ */ + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + if (cmd == HYPRLOFS_ADD_ENTRIES || cmd == HYPRLOFS_RM_ENTRIES) { + model = get_udatamodel(); + + if (model == DATAMODEL_NATIVE) { + hyprlofs_entries_t ebuf; + hyprlofs_entry_t *e; + + if (copyin((void *)data, &ebuf, sizeof (ebuf))) + return (EFAULT); + cnt = ebuf.hle_len; + if (cnt > MAX_IOCTL_PARAMS) + return (EINVAL); + len = sizeof (hyprlofs_entry_t) * cnt; + + e = kmem_alloc(len, KM_SLEEP); + if (copyin((void *)(ebuf.hle_entries), e, len)) { + kmem_free(e, len); + return (EFAULT); + } + + for (i = 0; i < cnt; i++) { + if (e[i].hle_nlen == 0 || + e[i].hle_nlen > MAXPATHLEN) + return (EINVAL); + + if (copyin(e[i].hle_name, nm, e[i].hle_nlen) + != 0) { + kmem_free(e, len); + return (EFAULT); + } + nm[e[i].hle_nlen] = '\0'; + + if (cmd == HYPRLOFS_ADD_ENTRIES) { + if (e[i].hle_plen == 0 || + e[i].hle_plen > MAXPATHLEN) + return (EINVAL); + + if (copyin(e[i].hle_path, path, + e[i].hle_plen) != 0) { + kmem_free(e, len); + return (EFAULT); + } + path[e[i].hle_plen] = '\0'; + + if ((error = hyprlofs_add_entry(vp, + path, nm, cr, ct)) != 0) { + kmem_free(e, len); + return (error); + } + } else { + if ((error = hyprlofs_rm_entry(vp, nm, + cr, ct, flag)) != 0) { + kmem_free(e, len); + return (error); + } + } + } + + kmem_free(e, len); + return (0); + + } else { + hyprlofs_entries32_t ebuf32; + hyprlofs_entry32_t *e32; + + if (copyin((void *)data, &ebuf32, sizeof (ebuf32))) + return (EFAULT); + + cnt = ebuf32.hle_len; + if (cnt > MAX_IOCTL_PARAMS) + return (EINVAL); + len = sizeof (hyprlofs_entry32_t) * cnt; + + e32 = kmem_alloc(len, KM_SLEEP); + if (copyin((void *)(unsigned long)(ebuf32.hle_entries), + e32, len)) { + kmem_free(e32, len); + return (EFAULT); + } + + for (i = 0; i < cnt; i++) { + if (e32[i].hle_nlen == 0 || + e32[i].hle_nlen > MAXPATHLEN) + return (EINVAL); + + if (copyin((void *)(unsigned long) + e32[i].hle_name, nm, + e32[i].hle_nlen) != 0) { + kmem_free(e32, len); + return (EFAULT); + } + nm[e32[i].hle_nlen] = '\0'; + + if (cmd == HYPRLOFS_ADD_ENTRIES) { + if (e32[i].hle_plen == 0 || + e32[i].hle_plen > MAXPATHLEN) + return (EINVAL); + + if (copyin((void *)(unsigned long) + e32[i].hle_path, path, + e32[i].hle_plen) != 0) { + kmem_free(e32, len); + return (EFAULT); + } + path[e32[i].hle_plen] = '\0'; + + if ((error = hyprlofs_add_entry(vp, + path, nm, cr, ct)) != 0) { + kmem_free(e32, len); + return (error); + } + } else { + if ((error = hyprlofs_rm_entry(vp, nm, + cr, ct, flag)) != 0) { + kmem_free(e32, len); + return (error); + } + } + } + + kmem_free(e32, len); + return (0); + } + } + + if (cmd == HYPRLOFS_RM_ALL) { + return (hyprlofs_rm_all(vp, cr, ct, flag)); + } + + if (cmd == HYPRLOFS_GET_ENTRIES) { + return (hyprlofs_get_all(vp, data, cr, ct, flag)); + } + + return (ENOTTY); +} + +static int +hyprlofs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + vattr_t tmp_va; + + if (tp->hln_looped == 1) { + int error; + + if ((error = VOP_GETATTR(REALVP(vp), &tmp_va, flags, cr, + ct)) != 0) + return (error); + } + + mutex_enter(&tp->hln_tlock); + vap->va_type = vp->v_type; + vap->va_mode = tp->hln_mode & MODEMASK; + vap->va_uid = tp->hln_uid; + vap->va_gid = tp->hln_gid; + vap->va_fsid = tp->hln_fsid; + vap->va_nodeid = (ino64_t)tp->hln_nodeid; + vap->va_nlink = tp->hln_nlink; + vap->va_size = (u_offset_t)tp->hln_size; + vap->va_atime = tp->hln_atime; + vap->va_mtime = tp->hln_mtime; + vap->va_ctime = tp->hln_ctime; + vap->va_blksize = 
PAGESIZE; + vap->va_rdev = tp->hln_rdev; + vap->va_seq = tp->hln_seq; + + if (tp->hln_looped == 1) { + vap->va_nblocks = tmp_va.va_nblocks; + } else { + vap->va_nblocks = + (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size))); + } + mutex_exit(&tp->hln_tlock); + return (0); +} + +/*ARGSUSED4*/ +static int +hyprlofs_setattr(vnode_t *vp, vattr_t *vap, int flags, + cred_t *cr, caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + int error = 0; + vattr_t *get; + long mask; + + /* + * Cannot set these attributes + */ + if ((vap->va_mask & AT_NOSET) || (vap->va_mask & AT_XVATTR)) + return (EINVAL); + + mutex_enter(&tp->hln_tlock); + + get = &tp->hln_attr; + /* + * Change file access modes. Must be owner or have sufficient + * privileges. + */ + error = secpolicy_vnode_setattr(cr, vp, vap, get, flags, + hyprlofs_taccess, tp); + + if (error) + goto out; + + mask = vap->va_mask; + + if (mask & AT_MODE) { + get->va_mode &= S_IFMT; + get->va_mode |= vap->va_mode & ~S_IFMT; + } + + if (mask & AT_UID) + get->va_uid = vap->va_uid; + if (mask & AT_GID) + get->va_gid = vap->va_gid; + if (mask & AT_ATIME) + get->va_atime = vap->va_atime; + if (mask & AT_MTIME) + get->va_mtime = vap->va_mtime; + + if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME)) + gethrestime(&tp->hln_ctime); + +out: + mutex_exit(&tp->hln_tlock); + return (error); +} + +static int +hyprlofs_access(vnode_t *vp, int mode, int flags, cred_t *cr, + caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + int error; + + if (mode & VWRITE) { + if (vp->v_type == VREG && vn_is_readonly(vp)) + return (EROFS); + } + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_ACCESS(REALVP(vp), mode, flags, cr, ct)); + + mutex_enter(&tp->hln_tlock); + error = hyprlofs_taccess(tp, mode, cr); + mutex_exit(&tp->hln_tlock); + return (error); +} + +/* ARGSUSED3 */ +static int +hyprlofs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(dvp); + hlnode_t *ntp = NULL; + int error; + + if (VTOHLN(dvp)->hln_looped == 1) + return (VOP_LOOKUP(REALVP(dvp), nm, vpp, pnp, flags, rdir, + cr, ct, direntflags, realpnp)); + + if (flags & LOOKUP_XATTR) + return (EINVAL); + + /* Null component name is a synonym for directory being searched. */ + if (*nm == '\0') { + VN_HOLD(dvp); + *vpp = dvp; + return (0); + } + ASSERT(tp); + + if ((error = hyprlofs_dirlookup(tp, nm, &ntp, cr)) == 0) { + ASSERT(ntp); + *vpp = HLNTOV(ntp); + } + return (error); +} + +/* + * Create the loopback from the hyprlofs vnode to the real vnode. + */ +static int +hyprlofs_loopback(vnode_t *dvp, vnode_t *rvp, char *nm, vattr_t *vap, + int mode, cred_t *cr, caller_context_t *ct) +{ + hlnode_t *parent; + hlfsmount_t *tm; + int error; + hlnode_t *oldtp; + vnode_t *vp; + + parent = (hlnode_t *)VTOHLN(dvp); + tm = (hlfsmount_t *)VTOHLM(dvp); + error = 0; + oldtp = NULL; + + if (vap->va_type == VREG && (vap->va_mode & VSVTX)) { + /* we don't support the sticky bit */ + vap->va_mode &= ~VSVTX; + } else if (vap->va_type == VNON) { + return (EINVAL); + } + + /* Null component name is a synonym for directory being searched. 
*/ + if (*nm == '\0') { + VN_HOLD(dvp); + oldtp = parent; + } else { + error = hyprlofs_dirlookup(parent, nm, &oldtp, cr); + } + + if (error == 0) { /* name found */ + ASSERT(oldtp); + + rw_enter(&oldtp->hln_rwlock, RW_WRITER); + + /* + * if create/read-only an existing directory, allow it + */ + if ((oldtp->hln_type == VDIR) && (mode & VWRITE)) + error = EISDIR; + else { + error = hyprlofs_taccess(oldtp, mode, cr); + } + + if (error) { + rw_exit(&oldtp->hln_rwlock); + hlnode_rele(oldtp); + return (error); + } + + vp = HLNTOV(oldtp); + rw_exit(&oldtp->hln_rwlock); + + if (vp->v_type == VREG) { + hlnode_rele(oldtp); + return (EEXIST); + } + + vnevent_create(vp, ct); + return (0); + } + + if (error != ENOENT) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + error = hyprlofs_direnter(tm, parent, nm, DE_CREATE, rvp, vap, NULL, + cr); + rw_exit(&parent->hln_rwlock); + + return (error); +} + +/* + * Create an in-memory directory based on the add-entry ioctl name. + * If the dir exists, return EEXIST but still also return node in vpp. + */ +static int +hyprlofs_mkdir(vnode_t *dvp, char *nm, vattr_t *va, vnode_t **vpp, cred_t *cr) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + hlnode_t *self = NULL; + hlfsmount_t *tm = (hlfsmount_t *)VTOHLM(dvp); + int error; + + /* + * Might be dangling directory. Catch it here, because a ENOENT return + * from hyprlofs_dirlookup() is a valid return. + */ + if (parent->hln_nlink == 0) + return (ENOENT); + + error = hyprlofs_dirlookup(parent, nm, &self, cr); + if (error == 0) { + ASSERT(self); + hlnode_rele(self); + /* We can't loop in under a looped in directory */ + if (self->hln_looped) + return (EACCES); + *vpp = HLNTOV(self); + return (EEXIST); + } + if (error != ENOENT) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + error = hyprlofs_direnter(tm, parent, nm, DE_MKDIR, (vnode_t *)NULL, + va, &self, cr); + rw_exit(&parent->hln_rwlock); + + if (error == 0 || error == EEXIST) { + hlnode_rele(self); + *vpp = HLNTOV(self); + } + + return (error); +} + +/* + * Loop in a file or directory into the namespace. + */ +static int +hyprlofs_add_entry(vnode_t *vp, char *fspath, char *fsname, + cred_t *cr, caller_context_t *ct) +{ + int error; + char *p, *pnm; + vnode_t *realvp, *dvp; + vattr_t va; + + /* + * Get vnode for the real file/dir. We'll have a hold on realvp which + * we won't vn_rele until hyprlofs_inactive. + */ + if ((error = lookupname(fspath, UIO_SYSSPACE, FOLLOW, NULLVPP, + &realvp)) != 0) + return (error); + + /* no devices allowed */ + if (IS_DEVVP(realvp)) { + VN_RELE(realvp); + return (ENODEV); + } + + /* + * realvp may be an AUTOFS node, in which case we perform a VOP_ACCESS + * to trigger the mount of the intended filesystem. This causes a + * loopback mount of the intended filesystem instead of the AUTOFS + * filesystem. + */ + if ((error = VOP_ACCESS(realvp, 0, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + /* + * We're interested in the top most filesystem. This is specially + * important when fspath is a trigger AUTOFS node, since we're really + * interested in mounting the filesystem AUTOFS mounted as result of + * the VOP_ACCESS() call not the AUTOFS node itself. + */ + if (vn_mountedvfs(realvp) != NULL) { + if ((error = traverse(&realvp)) != 0) { + VN_RELE(realvp); + return (error); + } + } + + va.va_type = VNON; + /* + * If the target name is a path, make sure we have all of the + * intermediate directories, creating them if necessary. 
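+ *
+ * For example, adding an entry under the name "a/b/c" creates the
+ * in-memory directories "a" and "a/b" beneath the mount root (reusing
+ * them if they already exist) and then loops the backing vnode in as "c";
+ * only that final component refers to the real backing file.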
+ */ + dvp = vp; + pnm = p = fsname; + + /* path cannot be absolute */ + if (*p == '/') { + VN_RELE(realvp); + return (EINVAL); + } + + for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) { + if (va.va_type == VNON) + /* use the top-level dir as the template va for mkdir */ + if ((error = VOP_GETATTR(vp, &va, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + *p = '\0'; + + /* Path component cannot be empty or relative */ + if (pnm[0] == '\0' || + (pnm[0] == '.' && pnm[1] == '.' && pnm[2] == '\0')) { + VN_RELE(realvp); + return (EINVAL); + } + + if ((error = hyprlofs_mkdir(dvp, pnm, &va, &dvp, cr)) != 0 && + error != EEXIST) { + VN_RELE(realvp); + return (error); + } + + *p = '/'; + pnm = p + 1; + } + + /* The file name is required */ + if (pnm[0] == '\0') { + VN_RELE(realvp); + return (EINVAL); + } + + /* Now use the real file's va as the template va */ + if ((error = VOP_GETATTR(realvp, &va, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + /* Make the vnode */ + error = hyprlofs_loopback(dvp, realvp, pnm, &va, va.va_mode, cr, ct); + if (error != 0) + VN_RELE(realvp); + return (error); +} + +/* + * Remove a looped in file from the namespace. + */ +static int +hyprlofs_rm_entry(vnode_t *dvp, char *fsname, cred_t *cr, caller_context_t *ct, + int flags) +{ + int error; + char *p, *pnm; + hlnode_t *parent; + hlnode_t *fndtp; + + pnm = p = fsname; + + /* path cannot be absolute */ + if (*p == '/') + return (EINVAL); + + /* + * If the target name is a path, get the containing dir and simple + * file name. + */ + parent = (hlnode_t *)VTOHLN(dvp); + for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) { + *p = '\0'; + + /* Path component cannot be empty or relative */ + if (pnm[0] == '\0' || + (pnm[0] == '.' && pnm[1] == '.' && pnm[2] == '\0')) + return (EINVAL); + + if ((error = hyprlofs_dirlookup(parent, pnm, &fndtp, cr)) != 0) + return (error); + + dvp = HLNTOV(fndtp); + parent = fndtp; + pnm = p + 1; + } + + /* The file name is required */ + if (pnm[0] == '\0') + return (EINVAL); + + /* Remove the entry from the parent dir */ + return (hyprlofs_remove(dvp, pnm, cr, ct, flags)); +} + +/* + * Remove all looped in files from the namespace. + */ +static int +hyprlofs_rm_all(vnode_t *dvp, cred_t *cr, caller_context_t *ct, + int flags) +{ + int error = 0; + hlnode_t *hp = (hlnode_t *)VTOHLN(dvp); + hldirent_t *hdp; + + hlnode_hold(hp); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. 
+ */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + goto done; + } + + hdp = hp->hln_dir; + while (hdp) { + hlnode_t *fndhp; + + if (strcmp(hdp->hld_name, ".") == 0 || + strcmp(hdp->hld_name, "..") == 0) { + hdp = hdp->hld_next; + continue; + } + + /* This holds the fndhp vnode */ + error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr); + if (error != 0) + goto done; + hlnode_rele(fndhp); + + if (fndhp->hln_looped == 0) { + /* recursively remove contents of this subdir */ + if (fndhp->hln_type == VDIR) { + vnode_t *tvp = HLNTOV(fndhp); + + error = hyprlofs_rm_all(tvp, cr, ct, flags); + if (error != 0) + goto done; + } + } + + /* remove the entry */ + error = hyprlofs_remove(dvp, hdp->hld_name, cr, ct, flags); + if (error != 0) + goto done; + + hdp = hp->hln_dir; + } + +done: + hlnode_rele(hp); + return (error); +} + +/* + * Get a list of all looped in files in the namespace. + */ +static int +hyprlofs_get_all_entries(vnode_t *dvp, hyprlofs_curr_entry_t *hcp, + char *prefix, int *pcnt, int n_max, + cred_t *cr, caller_context_t *ct, int flags) +{ + int error = 0; + int too_big = 0; + int cnt; + int len; + hlnode_t *hp = (hlnode_t *)VTOHLN(dvp); + hldirent_t *hdp; + char *path; + + cnt = *pcnt; + path = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + hlnode_hold(hp); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. + */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + goto done; + } + + hdp = hp->hln_dir; + while (hdp) { + hlnode_t *fndhp; + vnode_t *tvp; + + if (strcmp(hdp->hld_name, ".") == 0 || + strcmp(hdp->hld_name, "..") == 0) { + hdp = hdp->hld_next; + continue; + } + + /* This holds the fndhp vnode */ + error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr); + if (error != 0) + goto done; + hlnode_rele(fndhp); + + if (fndhp->hln_looped == 0) { + /* recursively get contents of this subdir */ + VERIFY(fndhp->hln_type == VDIR); + tvp = HLNTOV(fndhp); + + if (*prefix == '\0') + (void) strlcpy(path, hdp->hld_name, MAXPATHLEN); + else + (void) snprintf(path, MAXPATHLEN, "%s/%s", + prefix, hdp->hld_name); + + error = hyprlofs_get_all_entries(tvp, hcp, path, + &cnt, n_max, cr, ct, flags); + + if (error == E2BIG) { + too_big = 1; + error = 0; + } + if (error != 0) + goto done; + } else { + if (cnt < n_max) { + char *p; + + if (*prefix == '\0') + (void) strlcpy(path, hdp->hld_name, + MAXPATHLEN); + else + (void) snprintf(path, MAXPATHLEN, + "%s/%s", prefix, hdp->hld_name); + + len = strlen(path); + ASSERT(len <= MAXPATHLEN); + if (copyout(path, (void *)(hcp[cnt].hce_name), + len)) { + error = EFAULT; + goto done; + } + + tvp = REALVP(HLNTOV(fndhp)); + if (tvp->v_path == vn_vpath_empty) { + p = "<unknown>"; + } else { + p = tvp->v_path; + } + len = strlen(p); + ASSERT(len <= MAXPATHLEN); + if (copyout(p, (void *)(hcp[cnt].hce_path), + len)) { + error = EFAULT; + goto done; + } + } + + cnt++; + if (cnt > n_max) + too_big = 1; + } + + hdp = hdp->hld_next; + } + +done: + hlnode_rele(hp); + kmem_free(path, MAXPATHLEN); + + *pcnt = cnt; + if (error == 0 && too_big == 1) + error = E2BIG; + + return (error); +} + +/* + * Return a list of all looped in files in the namespace. 
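+ *
+ * A caller that does not know how many entries exist can use a sketched
+ * two-pass approach: issue HYPRLOFS_GET_ENTRIES with hce_cnt set to 0,
+ * and when E2BIG comes back hce_cnt has been rewritten with the number of
+ * looped-in files, so the hce_entries array can be sized appropriately and
+ * the ioctl reissued.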
+ */ +static int +hyprlofs_get_all(vnode_t *dvp, intptr_t data, cred_t *cr, caller_context_t *ct, + int flags) +{ + int limit, cnt, error; + model_t model; + hyprlofs_curr_entry_t *e; + + model = get_udatamodel(); + + if (model == DATAMODEL_NATIVE) { + hyprlofs_curr_entries_t ebuf; + + if (copyin((void *)data, &ebuf, sizeof (ebuf))) + return (EFAULT); + limit = ebuf.hce_cnt; + e = ebuf.hce_entries; + if (limit > MAX_IOCTL_PARAMS) + return (EINVAL); + + } else { + hyprlofs_curr_entries32_t ebuf32; + + if (copyin((void *)data, &ebuf32, sizeof (ebuf32))) + return (EFAULT); + + limit = ebuf32.hce_cnt; + e = (hyprlofs_curr_entry_t *)(unsigned long) + (ebuf32.hce_entries); + if (limit > MAX_IOCTL_PARAMS) + return (EINVAL); + } + + cnt = 0; + error = hyprlofs_get_all_entries(dvp, e, "", &cnt, limit, cr, ct, + flags); + + if (error == 0 || error == E2BIG) { + if (model == DATAMODEL_NATIVE) { + hyprlofs_curr_entries_t ebuf; + + ebuf.hce_cnt = cnt; + if (copyout(&ebuf, (void *)data, sizeof (ebuf))) + return (EFAULT); + + } else { + hyprlofs_curr_entries32_t ebuf32; + + ebuf32.hce_cnt = cnt; + if (copyout(&ebuf32, (void *)data, sizeof (ebuf32))) + return (EFAULT); + } + } + + return (error); +} + +/* ARGSUSED3 */ +static int +hyprlofs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, + int flags) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + int error; + hlnode_t *hp = NULL; + + /* This holds the hp vnode */ + error = hyprlofs_dirlookup(parent, nm, &hp, cr); + if (error) + return (error); + + ASSERT(hp); + rw_enter(&parent->hln_rwlock, RW_WRITER); + rw_enter(&hp->hln_rwlock, RW_WRITER); + + error = hyprlofs_dirdelete(parent, hp, nm, DR_REMOVE, cr); + + rw_exit(&hp->hln_rwlock); + rw_exit(&parent->hln_rwlock); + vnevent_remove(HLNTOV(hp), dvp, nm, ct); + + /* + * We've now dropped the dir link so by rele-ing our vnode we should + * clean up in hyprlofs_inactive. + */ + hlnode_rele(hp); + + return (error); +} + +/* ARGSUSED4 */ +static int +hyprlofs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr, + caller_context_t *ct, int flags) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + hlnode_t *self = NULL; + vnode_t *vp; + int error = 0; + + /* Return error if removing . or .. */ + if (strcmp(nm, ".") == 0) + return (EINVAL); + if (strcmp(nm, "..") == 0) + return (EEXIST); /* Should be ENOTEMPTY */ + error = hyprlofs_dirlookup(parent, nm, &self, cr); + if (error) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + rw_enter(&self->hln_rwlock, RW_WRITER); + + vp = HLNTOV(self); + if (vp == dvp || vp == cdir) { + error = EINVAL; + goto done1; + } + if (self->hln_type != VDIR) { + error = ENOTDIR; + goto done1; + } + + /* + * When a dir is looped in, we only remove the in-memory dir, not the + * backing dir. + */ + if (self->hln_looped == 0) { + mutex_enter(&self->hln_tlock); + if (self->hln_nlink > 2) { + mutex_exit(&self->hln_tlock); + error = EEXIST; + goto done1; + } + mutex_exit(&self->hln_tlock); + + if (vn_vfswlock(vp)) { + error = EBUSY; + goto done1; + } + if (vn_mountedvfs(vp) != NULL) { + error = EBUSY; + goto done; + } + + /* + * Check for an empty directory, i.e. only includes entries for + * "." and ".." 
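+		 * (hln_dirents counts "." and ".." as well, so anything
+		 * greater than two means real entries remain.)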
+ */ + if (self->hln_dirents > 2) { + error = EEXIST; /* SIGH should be ENOTEMPTY */ + /* + * Update atime because checking hln_dirents is + * equivalent to reading the directory + */ + gethrestime(&self->hln_atime); + goto done; + } + + error = hyprlofs_dirdelete(parent, self, nm, DR_RMDIR, cr); + } else { + error = hyprlofs_dirdelete(parent, self, nm, DR_REMOVE, cr); + } + +done: + if (self->hln_looped == 0) + vn_vfsunlock(vp); +done1: + rw_exit(&self->hln_rwlock); + rw_exit(&parent->hln_rwlock); + vnevent_rmdir(HLNTOV(self), dvp, nm, ct); + + /* + * We've now dropped the dir link so by rele-ing our vnode we should + * clean up in hyprlofs_inactive. + */ + hlnode_rele(self); + + return (error); +} + +static int +hyprlofs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hldirent_t *hdp; + int error = 0; + size_t namelen; + struct dirent64 *dp; + ulong_t offset; + ulong_t total_bytes_wanted; + long outcount = 0; + long bufsize; + int reclen; + caddr_t outbuf; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_READDIR(REALVP(vp), uiop, cr, eofp, ct, flags)); + + if (uiop->uio_loffset >= MAXOFF_T) { + if (eofp) + *eofp = 1; + return (0); + } + /* assuming syscall has already called hln_rwlock */ + ASSERT(RW_READ_HELD(&hp->hln_rwlock)); + + if (uiop->uio_iovcnt != 1) + return (EINVAL); + + if (vp->v_type != VDIR) + return (ENOTDIR); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. + */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + return (0); + } + + /* Get space for multiple dir entries */ + total_bytes_wanted = uiop->uio_iov->iov_len; + bufsize = total_bytes_wanted + sizeof (struct dirent64); + outbuf = kmem_alloc(bufsize, KM_SLEEP); + + dp = (struct dirent64 *)((uintptr_t)outbuf); + + offset = 0; + hdp = hp->hln_dir; + while (hdp) { + namelen = strlen(hdp->hld_name); /* no +1 needed */ + offset = hdp->hld_offset; + if (offset >= uiop->uio_offset) { + reclen = (int)DIRENT64_RECLEN(namelen); + if (outcount + reclen > total_bytes_wanted) { + if (!outcount) + /* Buffer too small for any entries. */ + error = EINVAL; + break; + } + ASSERT(hdp->hld_hlnode != NULL); + + /* zero out uninitialized bytes */ + (void) strncpy(dp->d_name, hdp->hld_name, + DIRENT64_NAMELEN(reclen)); + dp->d_reclen = (ushort_t)reclen; + dp->d_ino = (ino64_t)hdp->hld_hlnode->hln_nodeid; + dp->d_off = (offset_t)hdp->hld_offset + 1; + dp = (struct dirent64 *) + ((uintptr_t)dp + dp->d_reclen); + outcount += reclen; + ASSERT(outcount <= bufsize); + } + hdp = hdp->hld_next; + } + + if (!error) + error = uiomove(outbuf, outcount, UIO_READ, uiop); + + if (!error) { + /* + * If we reached the end of the list our offset should now be + * just past the end. 
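+		 * (offset still holds the hld_offset of the last entry in
+		 * the list, so adding one leaves uio_offset past every
+		 * entry and the next readdir call reports EOF immediately.)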
+ */ + if (!hdp) { + offset += 1; + if (eofp) + *eofp = 1; + } else if (eofp) + *eofp = 0; + uiop->uio_offset = offset; + } + gethrestime(&hp->hln_atime); + kmem_free(outbuf, bufsize); + return (error); +} + +static int +hyprlofs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_FSYNC(REALVP(vp), syncflag, cr, ct)); + return (0); +} + +/* ARGSUSED */ +static void +hyprlofs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vp->v_vfsp); + + rw_enter(&hp->hln_rwlock, RW_WRITER); + + mutex_enter(&hp->hln_tlock); + mutex_enter(&vp->v_lock); + ASSERT(vp->v_count >= 1); + + /* + * If we don't have the last hold or the link count is non-zero, + * there's nothing to do except drop our hold. + */ + if (vp->v_count > 1 || hp->hln_nlink != 0) { + vp->v_count--; + mutex_exit(&vp->v_lock); + mutex_exit(&hp->hln_tlock); + rw_exit(&hp->hln_rwlock); + return; + } + + mutex_exit(&vp->v_lock); + mutex_exit(&hp->hln_tlock); + + /* release hold on the real vnode now */ + if (hp->hln_looped == 1 && hp->hln_realvp != NULL) + VN_RELE(hp->hln_realvp); + + /* Here's our chance to send invalid event while we're between locks */ + vn_invalid(HLNTOV(hp)); + + mutex_enter(&hm->hlm_contents); + if (hp->hln_forw == NULL) + hm->hlm_rootnode->hln_back = hp->hln_back; + else + hp->hln_forw->hln_back = hp->hln_back; + hp->hln_back->hln_forw = hp->hln_forw; + mutex_exit(&hm->hlm_contents); + rw_exit(&hp->hln_rwlock); + rw_destroy(&hp->hln_rwlock); + mutex_destroy(&hp->hln_tlock); + vn_free(HLNTOV(hp)); + kmem_free(hp, sizeof (hlnode_t)); +} + +static int +hyprlofs_fid(vnode_t *vp, struct fid *fidp, caller_context_t *ct) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hlfid_t *hfid; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_FID(REALVP(vp), fidp, ct)); + + if (fidp->fid_len < (sizeof (hlfid_t) - sizeof (ushort_t))) { + fidp->fid_len = sizeof (hlfid_t) - sizeof (ushort_t); + return (ENOSPC); + } + + hfid = (hlfid_t *)fidp; + bzero(hfid, sizeof (hlfid_t)); + hfid->hlfid_len = (int)sizeof (hlfid_t) - sizeof (ushort_t); + + hfid->hlfid_ino = hp->hln_nodeid; + hfid->hlfid_gen = hp->hln_gen; + + return (0); +} + +static int +hyprlofs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, + page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw, + cred_t *cr, caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_GETPAGE(REALVP(vp), off, len, protp, pl, plsz, seg, addr, + rw, cr, ct)); +} + +int +hyprlofs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, + cred_t *cr, caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_PUTPAGE(REALVP(vp), off, len, flags, cr, ct)); +} + +static int +hyprlofs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_MAP(REALVP(vp), off, as, addrp, len, prot, maxprot, flags, + cr, ct)); +} + +static int +hyprlofs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap 
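+	 * (as with getpage/putpage/map above: only looped-in files are
+	 * backed by a real vnode with pages, so mapping operations on
+	 * plain hyprlofs nodes are refused)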
*/ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_ADDMAP(REALVP(vp), off, as, addr, len, prot, maxprot, + flags, cr, ct)); +} + +static int +hyprlofs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_DELMAP(REALVP(vp), off, as, addr, len, prot, maxprot, + flags, cr, ct)); +} + +static int +hyprlofs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, + offset_t offset, cred_t *cr, caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_SPACE(REALVP(vp), cmd, bfp, flag, offset, cr, ct)); +} + +static int +hyprlofs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, + caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 0) + return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); + + return (VOP_SEEK(REALVP(vp), ooff, noffp, ct)); +} + +static int +hyprlofs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + hlnode_t *hp = VTOHLN(vp); + + if (hp->hln_looped == 1) + return (VOP_RWLOCK(REALVP(vp), write_lock, ct)); + + if (write_lock) { + rw_enter(&hp->hln_rwlock, RW_WRITER); + } else { + rw_enter(&hp->hln_rwlock, RW_READER); + } + return (write_lock); +} + +static void +hyprlofs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + hlnode_t *hp = VTOHLN(vp); + + if (hp->hln_looped == 1) { + VOP_RWUNLOCK(REALVP(vp), write_lock, ct); + return; + } + + rw_exit(&hp->hln_rwlock); +} + +static int +hyprlofs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) +{ + int error; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_PATHCONF(REALVP(vp), cmd, valp, cr, ct)); + + switch (cmd) { + case _PC_XATTR_ENABLED: + case _PC_XATTR_EXISTS: + case _PC_SATTR_ENABLED: + case _PC_SATTR_EXISTS: + error = EINVAL; + break; + case _PC_TIMESTAMP_RESOLUTION: + /* nanosecond timestamp resolution */ + *valp = 1L; + error = 0; + break; + default: + error = fs_pathconf(vp, cmd, valp, cr, ct); + } + return (error); +} + + +struct vnodeops *hyprlofs_vnodeops; + +const fs_operation_def_t hyprlofs_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = hyprlofs_open }, + VOPNAME_CLOSE, { .vop_close = hyprlofs_close }, + VOPNAME_READ, { .vop_read = hyprlofs_read }, + VOPNAME_WRITE, { .vop_write = hyprlofs_write }, + VOPNAME_IOCTL, { .vop_ioctl = hyprlofs_ioctl }, + VOPNAME_GETATTR, { .vop_getattr = hyprlofs_getattr }, + VOPNAME_SETATTR, { .vop_setattr = hyprlofs_setattr }, + VOPNAME_ACCESS, { .vop_access = hyprlofs_access }, + VOPNAME_LOOKUP, { .vop_lookup = hyprlofs_lookup }, + VOPNAME_CREATE, { .error = fs_error }, + VOPNAME_REMOVE, { .vop_remove = hyprlofs_remove }, + VOPNAME_LINK, { .error = fs_error }, + VOPNAME_RENAME, { .error = fs_error }, + VOPNAME_MKDIR, { .error = fs_error }, + VOPNAME_RMDIR, { .vop_rmdir = hyprlofs_rmdir }, + VOPNAME_READDIR, { .vop_readdir = hyprlofs_readdir }, + VOPNAME_SYMLINK, { .error = fs_error }, + VOPNAME_READLINK, { .error = fs_error }, + VOPNAME_FSYNC, { .vop_fsync = hyprlofs_fsync }, + VOPNAME_INACTIVE, { .vop_inactive = hyprlofs_inactive }, + VOPNAME_FID, { .vop_fid = hyprlofs_fid }, + VOPNAME_RWLOCK, { .vop_rwlock = hyprlofs_rwlock }, + VOPNAME_RWUNLOCK, { .vop_rwunlock = hyprlofs_rwunlock }, + VOPNAME_SEEK, { .vop_seek = hyprlofs_seek }, + VOPNAME_SPACE, { .vop_space = hyprlofs_space }, + 
VOPNAME_GETPAGE, { .vop_getpage = hyprlofs_getpage }, + VOPNAME_PUTPAGE, { .vop_putpage = hyprlofs_putpage }, + VOPNAME_MAP, { .vop_map = hyprlofs_map }, + VOPNAME_ADDMAP, { .vop_addmap = hyprlofs_addmap }, + VOPNAME_DELMAP, { .vop_delmap = hyprlofs_delmap }, + VOPNAME_PATHCONF, { .vop_pathconf = hyprlofs_pathconf }, + VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, + NULL, NULL +}; diff --git a/usr/src/uts/common/fs/lookup.c b/usr/src/uts/common/fs/lookup.c index 55ffb94805..59ec5d1829 100644 --- a/usr/src/uts/common/fs/lookup.c +++ b/usr/src/uts/common/fs/lookup.c @@ -21,6 +21,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. */ @@ -57,6 +58,7 @@ #include <sys/zone.h> #include <sys/dnlc.h> #include <sys/fs/snode.h> +#include <sys/brand.h> /* Controls whether paths are stored with vnodes. */ int vfs_vnode_path = 1; @@ -977,6 +979,96 @@ localpath(char *path, struct vnode *vrootp, cred_t *cr) } /* + * Clean a stale v_path from a vnode. This is only performed if the v_path has + * not been altered since it was found to be stale + */ +static void +vnode_clear_vpath(vnode_t *vp, char *vpath_old) +{ + mutex_enter(&vp->v_lock); + if (vp->v_path != vn_vpath_empty && vp->v_path == vpath_old) { + vp->v_path = vn_vpath_empty; + mutex_exit(&vp->v_lock); + kmem_free(vpath_old, strlen(vpath_old) + 1); + } else { + mutex_exit(&vp->v_lock); + } +} + +/* + * Validate that a pathname refers to a given vnode. + */ +static int +vnode_valid_pn(vnode_t *vp, vnode_t *vrootp, pathname_t *pn, pathname_t *rpn, + int flags, cred_t *cr) +{ + vnode_t *compvp; + /* + * If we are in a zone or a chroot environment, then we have to + * take additional steps, since the path to the root might not + * be readable with the current credentials, even though the + * process can legitmately access the file. In this case, we + * do the following: + * + * lookuppnvp() with all privileges to get the resolved path. + * call localpath() to get the local portion of the path, and + * continue as normal. + * + * If the the conversion to a local path fails, then we continue + * as normal. This is a heuristic to make process object file + * paths available from within a zone. Because lofs doesn't + * support page operations, the vnode stored in the seg_t is + * actually the underlying real vnode, not the lofs node itself. + * Most of the time, the lofs path is the same as the underlying + * vnode (for example, /usr/lib/libc.so.1). + */ + if (vrootp != rootdir) { + char *local = NULL; + + VN_HOLD(rootdir); + if (lookuppnvp(pn, rpn, FOLLOW, NULL, &compvp, rootdir, + rootdir, kcred) == 0) { + local = localpath(rpn->pn_path, vrootp, kcred); + VN_RELE(compvp); + } + + /* + * The original pn was changed through lookuppnvp(). + * Set it to local for next validation attempt. + */ + if (local) { + (void) pn_set(pn, local); + } else { + return (1); + } + } + + /* + * We should have a local path at this point, so start the search from + * the root of the current process. + */ + VN_HOLD(vrootp); + if (vrootp != rootdir) + VN_HOLD(vrootp); + if (lookuppnvp(pn, rpn, FOLLOW | flags, NULL, &compvp, vrootp, vrootp, + cr) == 0) { + /* + * Check to see if the returned vnode is the same as the one we + * expect. 
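+		 * vn_compare() catches the case of the very same (or
+		 * VOP_CMP-equivalent) vnode; vnode_match() is the
+		 * attribute-based fallback for lookups that return a
+		 * distinct vnode over the same underlying file.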
+ */ + if (vn_compare(vp, compvp) || + vnode_match(vp, compvp, cr)) { + VN_RELE(compvp); + return (0); + } else { + VN_RELE(compvp); + } + } + + return (1); +} + +/* * Given a directory, return the full, resolved path. This looks up "..", * searches for the given vnode in the parent, appends the component, etc. It * is used to implement vnodetopath() and getcwd() when the cached path fails. @@ -995,6 +1087,8 @@ dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags, char *bufloc; size_t dlen = DIRENT64_RECLEN(MAXPATHLEN); refstr_t *mntpt; + char *vpath_cached; + boolean_t vpath_stale; /* Operation only allowed on directories */ ASSERT(vp->v_type == VDIR); @@ -1088,40 +1182,28 @@ dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags, * Shortcut: see if this vnode has correct v_path. If so, * we have the work done. */ + vpath_cached = NULL; + vpath_stale = B_FALSE; mutex_enter(&vp->v_lock); - if (vp->v_path != NULL) { - - if ((err = pn_set(&pn, vp->v_path)) == 0) { - mutex_exit(&vp->v_lock); - rpn.pn_path = rpn.pn_buf; - - /* - * Ensure the v_path pointing to correct vnode - */ - VN_HOLD(vrootp); - if (vrootp != rootdir) - VN_HOLD(vrootp); - if (lookuppnvp(&pn, &rpn, flags, NULL, - &cmpvp, vrootp, vrootp, cr) == 0) { - - if (VN_CMP(vp, cmpvp)) { - VN_RELE(cmpvp); + if (vp->v_path != vn_vpath_empty && + pn_set(&pn, vp->v_path) == 0) { + vpath_cached = vp->v_path; + mutex_exit(&vp->v_lock); + rpn.pn_path = rpn.pn_buf; - complen = strlen(rpn.pn_path); - bufloc -= complen; - if (bufloc < buf) { - err = ERANGE; - goto out; - } - bcopy(rpn.pn_path, bufloc, - complen); - break; - } else { - VN_RELE(cmpvp); - } + /* Ensure the v_path pointing to correct vnode */ + if (vnode_valid_pn(vp, vrootp, &pn, &rpn, flags, + cr) == 0) { + complen = strlen(rpn.pn_path); + bufloc -= complen; + if (bufloc < buf) { + err = ERANGE; + goto out; } + bcopy(rpn.pn_path, bufloc, complen); + break; } else { - mutex_exit(&vp->v_lock); + vpath_stale = B_TRUE; } } else { mutex_exit(&vp->v_lock); @@ -1166,38 +1248,6 @@ dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags, } /* - * Try to obtain the path component from dnlc cache - * before searching through the directory. - */ - if ((cmpvp = dnlc_reverse_lookup(vp, dbuf, dlen)) != NULL) { - /* - * If we got parent vnode as a result, - * then the answered path is correct. - */ - if (VN_CMP(cmpvp, pvp)) { - VN_RELE(cmpvp); - complen = strlen(dbuf); - bufloc -= complen; - if (bufloc <= buf) { - err = ENAMETOOLONG; - goto out; - } - bcopy(dbuf, bufloc, complen); - - /* Prepend a slash to the current path */ - *--bufloc = '/'; - - /* And continue with the next component */ - VN_RELE(vp); - vp = pvp; - pvp = NULL; - continue; - } else { - VN_RELE(cmpvp); - } - } - - /* * Search the parent directory for the entry corresponding to * this vnode. */ @@ -1215,6 +1265,11 @@ dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags, /* Prepend a slash to the current path. */ *--bufloc = '/'; + /* Clear vp->v_path if it was found to be stale. */ + if (vpath_stale == B_TRUE) { + vnode_clear_vpath(vp, vpath_cached); + } + /* And continue with the next component */ VN_RELE(vp); vp = pvp; @@ -1306,144 +1361,49 @@ vnodetopath_common(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, VN_RELE(vp); } - pn_alloc(&pn); /* - * Check to see if we have a cached path in the vnode. + * Check to see if we have a valid cached path in the vnode. 
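+	 * v_lock is held only long enough to copy v_path into the
+	 * pathname_t; validation then proceeds without the lock, since
+	 * the private copy cannot change underneath us.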
*/ + pn_alloc(&pn); mutex_enter(&vp->v_lock); - if (vp->v_path != NULL) { + if (vp->v_path != vn_vpath_empty) { (void) pn_set(&pn, vp->v_path); mutex_exit(&vp->v_lock); - pn_alloc(&rpn); - /* We should only cache absolute paths */ ASSERT(pn.pn_buf[0] == '/'); - /* - * If we are in a zone or a chroot environment, then we have to - * take additional steps, since the path to the root might not - * be readable with the current credentials, even though the - * process can legitmately access the file. In this case, we - * do the following: - * - * lookuppnvp() with all privileges to get the resolved path. - * call localpath() to get the local portion of the path, and - * continue as normal. - * - * If the the conversion to a local path fails, then we continue - * as normal. This is a heuristic to make process object file - * paths available from within a zone. Because lofs doesn't - * support page operations, the vnode stored in the seg_t is - * actually the underlying real vnode, not the lofs node itself. - * Most of the time, the lofs path is the same as the underlying - * vnode (for example, /usr/lib/libc.so.1). - */ - if (vrootp != rootdir) { - char *local = NULL; - VN_HOLD(rootdir); - if (lookuppnvp(&pn, &rpn, FOLLOW, - NULL, &compvp, rootdir, rootdir, kcred) == 0) { - local = localpath(rpn.pn_path, vrootp, - kcred); - VN_RELE(compvp); - } - - /* - * The original pn was changed through lookuppnvp(). - * Set it to local for next validation attempt. - */ - if (local) { - (void) pn_set(&pn, local); - } else { - goto notcached; + pn_alloc(&rpn); + if (vnode_valid_pn(vp, vrootp, &pn, &rpn, flags, cr) == 0) { + /* Return the result, if we're able. */ + if (buflen > rpn.pn_pathlen) { + bcopy(rpn.pn_path, buf, rpn.pn_pathlen + 1); + pn_free(&pn); + pn_free(&rpn); + VN_RELE(vrootp); + if (doclose) { + (void) VOP_CLOSE(vp, FREAD, 1, 0, cr, + NULL); + VN_RELE(vp); + } + return (0); } } - /* - * We should have a local path at this point, so start the - * search from the root of the current process. + * A stale v_path will be purged by the later dirtopath lookup. */ - VN_HOLD(vrootp); - if (vrootp != rootdir) - VN_HOLD(vrootp); - ret = lookuppnvp(&pn, &rpn, FOLLOW | flags, NULL, - &compvp, vrootp, vrootp, cr); - if (ret == 0) { - /* - * Check to see if the returned vnode is the same as - * the one we expect. If not, give up. - */ - if (!vn_compare(vp, compvp) && - !vnode_match(vp, compvp, cr)) { - VN_RELE(compvp); - goto notcached; - } - - VN_RELE(compvp); - - /* - * Return the result. - */ - if (buflen <= rpn.pn_pathlen) - goto notcached; - - bcopy(rpn.pn_path, buf, rpn.pn_pathlen + 1); - pn_free(&pn); - pn_free(&rpn); - VN_RELE(vrootp); - if (doclose) { - (void) VOP_CLOSE(vp, FREAD, 1, 0, cr, NULL); - VN_RELE(vp); - } - return (0); - } - -notcached: pn_free(&rpn); } else { mutex_exit(&vp->v_lock); } - pn_free(&pn); if (vp->v_type != VDIR) { - /* - * If we don't have a directory, try to find it in the dnlc via - * reverse lookup. Once this is found, we can use the regular - * directory search to find the full path. 
- */ - if ((pvp = dnlc_reverse_lookup(vp, path, MAXNAMELEN)) != NULL) { - /* - * Check if we have read privilege so, that - * we can lookup the path in the directory - */ - ret = 0; - if ((flags & LOOKUP_CHECKREAD)) { - ret = VOP_ACCESS(pvp, VREAD, 0, cr, NULL); - } - if (ret == 0) { - ret = dirtopath(vrootp, pvp, buf, buflen, - flags, cr); - } - if (ret == 0) { - len = strlen(buf); - if (len + strlen(path) + 1 >= buflen) { - ret = ENAMETOOLONG; - } else { - if (buf[len - 1] != '/') - buf[len++] = '/'; - bcopy(path, buf + len, - strlen(path) + 1); - } - } - - VN_RELE(pvp); - } else - ret = ENOENT; - } else + ret = ENOENT; + } else { ret = dirtopath(vrootp, vp, buf, buflen, flags, cr); + } VN_RELE(vrootp); if (doclose) { diff --git a/usr/src/uts/common/fs/lxproc/lxpr_subr.c b/usr/src/uts/common/fs/lxproc/lxpr_subr.c new file mode 100644 index 0000000000..3c1405d4af --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_subr.c @@ -0,0 +1,524 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + +#include <sys/varargs.h> +#include <sys/cpuvar.h> +#include <sys/mman.h> +#include <sys/vmsystm.h> +#include <sys/prsystm.h> + +#include "lxproc.h" + +#define LXPRCACHE_NAME "lxpr_cache" + +static int lxpr_node_constructor(void *, void *, int); +static void lxpr_node_destructor(void *, void *); + +static kmem_cache_t *lxpr_node_cache; + +struct lxpr_uiobuf { + uio_t *uiop; + char *buffer; + uint32_t buffsize; + char *pos; + size_t beg; + int error; +}; + +int lxpr_bufsize = 4000; + +struct lxpr_uiobuf * +lxpr_uiobuf_new(uio_t *uiop) +{ + /* Allocate memory for both lxpr_uiobuf and output buffer */ + int bufsize = lxpr_bufsize; + struct lxpr_uiobuf *uiobuf = + kmem_alloc(sizeof (struct lxpr_uiobuf) + bufsize, KM_SLEEP); + + uiobuf->uiop = uiop; + uiobuf->buffer = (char *)&uiobuf[1]; + uiobuf->buffsize = bufsize; + uiobuf->pos = uiobuf->buffer; + uiobuf->beg = 0; + uiobuf->error = 0; + + return (uiobuf); +} + +void +lxpr_uiobuf_free(struct lxpr_uiobuf *uiobuf) +{ + ASSERT(uiobuf != NULL); + ASSERT(uiobuf->pos == uiobuf->buffer); + + kmem_free(uiobuf, sizeof (struct lxpr_uiobuf) + uiobuf->buffsize); +} + +void +lxpr_uiobuf_seek(struct lxpr_uiobuf *uiobuf, offset_t offset) +{ + uiobuf->uiop->uio_offset = (off_t)offset; +} + +void +lxpr_uiobuf_seterr(struct lxpr_uiobuf *uiobuf, int err) +{ + ASSERT(uiobuf->error == 0); + + uiobuf->error = err; +} + +int +lxpr_uiobuf_flush(struct lxpr_uiobuf *uiobuf) +{ + off_t off = uiobuf->uiop->uio_offset; + caddr_t uaddr = uiobuf->buffer; + size_t beg = uiobuf->beg; + size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr; + + if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + ASSERT(off >= beg); + + if (beg + size > off && off >= 0) + uiobuf->error = + uiomove(uaddr + (off - beg), size - (off - beg), + UIO_READ, uiobuf->uiop); + + uiobuf->beg += size; + } + + uiobuf->pos = uaddr; + + return (uiobuf->error); +} + +void +lxpr_uiobuf_write(struct lxpr_uiobuf *uiobuf, const char *buf, size_t size) +{ + /* While we can still carry on */ + while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + uintptr_t remain = (uintptr_t)uiobuf->buffsize - + ((uintptr_t)uiobuf->pos - (uintptr_t)uiobuf->buffer); + + /* Enough space in buffer? */ + if (remain >= size) { + bcopy(buf, uiobuf->pos, size); + uiobuf->pos += size; + return; + } + + /* Not enough space, so copy all we can and try again */ + bcopy(buf, uiobuf->pos, remain); + uiobuf->pos += remain; + (void) lxpr_uiobuf_flush(uiobuf); + buf += remain; + size -= remain; + } +} + +#define TYPBUFFSIZE 256 + +void +lxpr_uiobuf_printf(struct lxpr_uiobuf *uiobuf, const char *fmt, ...) +{ + va_list args; + char buff[TYPBUFFSIZE]; + int len; + char *buffer; + + /* Can we still do any output */ + if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0) + return; + + va_start(args, fmt); + + /* Try using stack allocated buffer */ + len = vsnprintf(buff, TYPBUFFSIZE, fmt, args); + if (len < TYPBUFFSIZE) { + va_end(args); + lxpr_uiobuf_write(uiobuf, buff, len); + return; + } + + /* Not enough space in pre-allocated buffer */ + buffer = kmem_alloc(len + 1, KM_SLEEP); + + /* + * We know we allocated the correct amount of space + * so no check on the return value + */ + (void) vsnprintf(buffer, len+1, fmt, args); + lxpr_uiobuf_write(uiobuf, buffer, len); + va_end(args); + kmem_free(buffer, len+1); +} + +/* + * lxpr_lock(): + * + * Lookup process from pid and return with p_plock and P_PR_LOCK held. 
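+ *
+ * Typical use (error handling here is illustrative):
+ *
+ *	proc_t *p;
+ *
+ *	if ((p = lxpr_lock(pid)) == NULL)
+ *		return (ENOENT);	(process gone, exiting, or in SIDL)
+ *	... inspect or copy what is needed from p ...
+ *	lxpr_unlock(p);
+ *
+ * A pid of 1 is translated to the zone's init process, matching the
+ * view lxproc presents to its consumers.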
+ */ +proc_t * +lxpr_lock(pid_t pid) +{ + proc_t *p; + kmutex_t *mp; + + ASSERT(!MUTEX_HELD(&pidlock)); + + for (;;) { + mutex_enter(&pidlock); + + /* + * If the pid is 1, we really want the zone's init process + */ + p = prfind((pid == 1) ? + curproc->p_zone->zone_proc_initpid : pid); + + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (NULL); + } + + /* + * p_lock is persistent, but p itself is not -- it could + * vanish during cv_wait(). Load p->p_lock now so we can + * drop it after cv_wait() without referencing p. + */ + mp = &p->p_lock; + mutex_enter(mp); + + mutex_exit(&pidlock); + + if (p->p_flag & SEXITING) { + /* + * This process is exiting -- let it go. + */ + mutex_exit(mp); + return (NULL); + } + + if (!(p->p_proc_flag & P_PR_LOCK)) + break; + + cv_wait(&pr_pid_cv[p->p_slot], mp); + mutex_exit(mp); + } + + p->p_proc_flag |= P_PR_LOCK; + THREAD_KPRI_REQUEST(); + return (p); +} + +/* + * lxpr_unlock() + * + * Unlock locked process + */ +void +lxpr_unlock(proc_t *p) +{ + ASSERT(p->p_proc_flag & P_PR_LOCK); + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(!MUTEX_HELD(&pidlock)); + + cv_signal(&pr_pid_cv[p->p_slot]); + p->p_proc_flag &= ~P_PR_LOCK; + mutex_exit(&p->p_lock); + THREAD_KPRI_RELEASE(); +} + +void +lxpr_initnodecache() +{ + lxpr_node_cache = kmem_cache_create(LXPRCACHE_NAME, + sizeof (lxpr_node_t), 0, + lxpr_node_constructor, lxpr_node_destructor, NULL, NULL, NULL, 0); +} + +void +lxpr_fininodecache() +{ + kmem_cache_destroy(lxpr_node_cache); +} + +/* ARGSUSED */ +static int +lxpr_node_constructor(void *buf, void *un, int kmflags) +{ + lxpr_node_t *lxpnp = buf; + vnode_t *vp; + + vp = lxpnp->lxpr_vnode = vn_alloc(kmflags); + if (vp == NULL) + return (-1); + + (void) vn_setops(vp, lxpr_vnodeops); + vp->v_data = lxpnp; + + return (0); +} + +/* ARGSUSED */ +static void +lxpr_node_destructor(void *buf, void *un) +{ + lxpr_node_t *lxpnp = buf; + + vn_free(LXPTOV(lxpnp)); +} + +/* + * Calculate an inode number + * + * This takes various bits of info and munges them + * to give the inode number for an lxproc node + */ +ino_t +lxpr_inode(lxpr_nodetype_t type, pid_t pid, int fd) +{ + if (pid == 1) + pid = curproc->p_zone->zone_proc_initpid; + + switch (type) { + case LXPR_PIDDIR: + return (pid + 1); + case LXPR_PROCDIR: + return (maxpid + 2); + case LXPR_PID_FD_FD: + return (maxpid + 2 + + (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + LXPR_NFILES + fd); + default: + return (maxpid + 2 + + (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + type); + } +} + +/* + * Return inode number of parent (directory) + */ +ino_t +lxpr_parentinode(lxpr_node_t *lxpnp) +{ + /* + * If the input node is the root then the parent inode + * is the mounted on inode so just return our inode number + */ + if (lxpnp->lxpr_type != LXPR_PROCDIR) + return (VTOLXP(lxpnp->lxpr_parent)->lxpr_ino); + else + return (lxpnp->lxpr_ino); +} + +/* + * Allocate a new lxproc node + * + * This also allocates the vnode associated with it + */ +lxpr_node_t * +lxpr_getnode(vnode_t *dp, lxpr_nodetype_t type, proc_t *p, int fd) +{ + lxpr_node_t *lxpnp; + vnode_t *vp; + user_t *up; + timestruc_t now; + + /* + * Allocate a new node. It is deallocated in vop_innactive + */ + lxpnp = kmem_cache_alloc(lxpr_node_cache, KM_SLEEP); + + /* + * Set defaults (may be overridden below) + */ + gethrestime(&now); + lxpnp->lxpr_type = type; + lxpnp->lxpr_realvp = NULL; + lxpnp->lxpr_parent = dp; + VN_HOLD(dp); + if (p != NULL) { + lxpnp->lxpr_pid = ((p->p_pid == + curproc->p_zone->zone_proc_initpid) ? 
1 : p->p_pid); + + lxpnp->lxpr_time = PTOU(p)->u_start; + lxpnp->lxpr_uid = crgetruid(p->p_cred); + lxpnp->lxpr_gid = crgetrgid(p->p_cred); + lxpnp->lxpr_ino = lxpr_inode(type, p->p_pid, fd); + } else { + /* Pretend files without a proc belong to sched */ + lxpnp->lxpr_pid = 0; + lxpnp->lxpr_time = now; + lxpnp->lxpr_uid = lxpnp->lxpr_gid = 0; + lxpnp->lxpr_ino = lxpr_inode(type, 0, 0); + } + + /* initialize the vnode data */ + vp = lxpnp->lxpr_vnode; + vn_reinit(vp); + vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT; + vp->v_vfsp = dp->v_vfsp; + + /* + * Do node specific stuff + */ + switch (type) { + case LXPR_PROCDIR: + vp->v_flag |= VROOT; + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by everyone */ + break; + + case LXPR_PID_CURDIR: + ASSERT(p != NULL); + + /* + * Zombie check. p_stat is officially protected by pidlock, + * but we can't grab pidlock here because we already hold + * p_lock. Luckily if we look at the process exit code + * we see that p_stat only transisions from SRUN to SZOMB + * while p_lock is held. Aside from this, the only other + * p_stat transition that we need to be aware about is + * SIDL to SRUN, but that's not a problem since lxpr_lock() + * ignores nodes in the SIDL state so we'll never get a node + * that isn't already in the SRUN state. + */ + if (p->p_stat == SZOMB) { + lxpnp->lxpr_realvp = NULL; + } else { + up = PTOU(p); + lxpnp->lxpr_realvp = up->u_cdir; + ASSERT(lxpnp->lxpr_realvp != NULL); + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_ROOTDIR: + ASSERT(p != NULL); + /* Zombie check. see locking comment above */ + if (p->p_stat == SZOMB) { + lxpnp->lxpr_realvp = NULL; + } else { + up = PTOU(p); + lxpnp->lxpr_realvp = + up->u_rdir != NULL ? up->u_rdir : rootdir; + ASSERT(lxpnp->lxpr_realvp != NULL); + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_EXE: + ASSERT(p != NULL); + lxpnp->lxpr_realvp = p->p_exec; + if (lxpnp->lxpr_realvp != NULL) { + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; + break; + + case LXPR_SELF: + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_FD_FD: + ASSERT(p != NULL); + /* lxpr_realvp is set after we return */ + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0700; /* read-write-exe owner only */ + break; + + case LXPR_PID_FDDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0500; /* read-search by owner only */ + break; + + case LXPR_PIDDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0511; + break; + + case LXPR_NETDIR: + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by all */ + break; + + case LXPR_PID_ENV: + case LXPR_PID_MEM: + ASSERT(p != NULL); + /*FALLTHRU*/ + case LXPR_KCORE: + vp->v_type = VREG; + lxpnp->lxpr_mode = 0400; /* read-only by owner only */ + break; + + default: + vp->v_type = VREG; + lxpnp->lxpr_mode = 0444; /* read-only by all */ + break; + } + + return (lxpnp); +} + + +/* + * Free the storage obtained from lxpr_getnode(). 
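+ * The vnode itself is not freed here; it was allocated by the cache
+ * constructor and stays attached to the lxpr_node_t, to be vn_free()d
+ * by the cache destructor when the kmem cache releases the object.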
+ */ +void +lxpr_freenode(lxpr_node_t *lxpnp) +{ + ASSERT(lxpnp != NULL); + ASSERT(LXPTOV(lxpnp) != NULL); + + /* + * delete any association with realvp + */ + if (lxpnp->lxpr_realvp != NULL) + VN_RELE(lxpnp->lxpr_realvp); + + /* + * delete any association with parent vp + */ + if (lxpnp->lxpr_parent != NULL) + VN_RELE(lxpnp->lxpr_parent); + + /* + * Release the lxprnode. + */ + kmem_cache_free(lxpr_node_cache, lxpnp); +} diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c new file mode 100644 index 0000000000..1bb7bd3823 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c @@ -0,0 +1,367 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/stat.h> +#include <sys/statvfs.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/mode.h> +#include <sys/signal.h> +#include <sys/user.h> +#include <sys/mount.h> +#include <sys/bitmap.h> +#include <sys/kmem.h> +#include <sys/policy.h> +#include <sys/modctl.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> + +#include "lxproc.h" + +/* Module level parameters */ +static int lxprocfstype; +static dev_t lxprocdev; +static kmutex_t lxpr_mount_lock; + +int nproc_highbit; /* highbit(v.v_nproc) */ + +static int lxpr_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *); +static int lxpr_unmount(vfs_t *, int, cred_t *); +static int lxpr_root(vfs_t *, vnode_t **); +static int lxpr_statvfs(vfs_t *, statvfs64_t *); +static int lxpr_init(int, char *); + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "lxproc", + lxpr_init, + VSW_ZMOUNT, + NULL +}; + +/* + * Module linkage information for the kernel. 
+ */ +extern struct mod_ops mod_fsops; + +static struct modlfs modlfs = { + &mod_fsops, "generic linux procfs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlfs, NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int retval; + + /* + * attempt to unload the module + */ + if ((retval = mod_remove(&modlinkage)) != 0) + goto done; + + /* + * destroy lxpr_node cache + */ + lxpr_fininodecache(); + + /* + * clean out the vfsops and vnodeops + */ + (void) vfs_freevfsops_by_type(lxprocfstype); + vn_freevnodeops(lxpr_vnodeops); + + mutex_destroy(&lxpr_mount_lock); +done: + return (retval); +} + +static int +lxpr_init(int fstype, char *name) +{ + static const fs_operation_def_t lxpr_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = lxpr_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = lxpr_unmount }, + VFSNAME_ROOT, { .vfs_root = lxpr_root }, + VFSNAME_STATVFS, { .vfs_statvfs = lxpr_statvfs }, + NULL, NULL + }; + extern const fs_operation_def_t lxpr_vnodeops_template[]; + int error; + major_t dev; + + nproc_highbit = highbit(v.v_proc); + lxprocfstype = fstype; + ASSERT(lxprocfstype != 0); + + mutex_init(&lxpr_mount_lock, NULL, MUTEX_DEFAULT, NULL); + + /* + * Associate VFS ops vector with this fstype. + */ + error = vfs_setfsops(fstype, lxpr_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "lxpr_init: bad vfs ops template"); + return (error); + } + + /* + * Set up vnode ops vector too. + */ + error = vn_make_ops(name, lxpr_vnodeops_template, &lxpr_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "lxpr_init: bad vnode ops template"); + return (error); + } + + /* + * Assign a unique "device" number (used by stat(2)). 
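+	 * Failure here is not fatal: falling back to major number 0
+	 * merely risks st_dev colliding with another pseudo filesystem.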
+ */ + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "lxpr_init: can't get unique device number"); + dev = 0; + } + + /* + * Make the pseudo device + */ + lxprocdev = makedevice(dev, 0); + + /* + * Initialize cache for lxpr_nodes + */ + lxpr_initnodecache(); + + return (0); +} + +static int +lxpr_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt; + zone_t *zone = curproc->p_zone; + ldi_ident_t li; + int err; + + /* + * must be root to mount + */ + if (secpolicy_fs_mount(cr, mvp, vfsp) != 0) + return (EPERM); + + /* + * mount point must be a directory + */ + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (zone == global_zone) { + zone_t *mntzone; + + mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); + zone_rele(mntzone); + if (zone != mntzone) + return (EBUSY); + } + + /* + * Having the resource be anything but "lxproc" doesn't make sense + */ + vfs_setresource(vfsp, "lxproc", 0); + + lxpr_mnt = kmem_alloc(sizeof (*lxpr_mnt), KM_SLEEP); + + if ((err = ldi_ident_from_mod(&modlinkage, &li)) != 0) { + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + return (err); + } + + lxpr_mnt->lxprm_li = li; + + mutex_enter(&lxpr_mount_lock); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + mutex_exit(&lxpr_mount_lock); + kmem_free(lxpr_mnt, sizeof ((*lxpr_mnt))); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* + * allocate the first vnode + */ + zone_hold(lxpr_mnt->lxprm_zone = zone); + + /* Arbitrarily set the parent vnode to the mounted over directory */ + lxpr_mnt->lxprm_node = lxpr_getnode(mvp, LXPR_PROCDIR, NULL, 0); + + /* Correctly set the fs for the root node */ + lxpr_mnt->lxprm_node->lxpr_vnode->v_vfsp = vfsp; + + vfs_make_fsid(&vfsp->vfs_fsid, lxprocdev, lxprocfstype); + vfsp->vfs_bsize = DEV_BSIZE; + vfsp->vfs_fstype = lxprocfstype; + vfsp->vfs_data = (caddr_t)lxpr_mnt; + vfsp->vfs_dev = lxprocdev; + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt = (lxpr_mnt_t *)vfsp->vfs_data; + vnode_t *vp; + int count; + + ASSERT(lxpr_mnt != NULL); + vp = LXPTOV(lxpr_mnt->lxprm_node); + + mutex_enter(&lxpr_mount_lock); + + /* + * must be root to unmount + */ + if (secpolicy_fs_unmount(cr, vfsp) != 0) { + mutex_exit(&lxpr_mount_lock); + return (EPERM); + } + + /* + * forced unmount is not supported by this file system + */ + if (flag & MS_FORCE) { + mutex_exit(&lxpr_mount_lock); + return (ENOTSUP); + } + + /* + * Ensure that no vnodes are in use on this mount point. 
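+	 * Checking the root vnode's count is sufficient: every
+	 * lxpr_node_t holds its parent (see lxpr_getnode()), so any live
+	 * node elsewhere in the tree keeps the root's v_count above one.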
+ */ + mutex_enter(&vp->v_lock); + count = vp->v_count; + mutex_exit(&vp->v_lock); + if (count > 1) { + mutex_exit(&lxpr_mount_lock); + return (EBUSY); + } + + /* + * purge the dnlc cache for vnode entries + * associated with this file system + */ + count = dnlc_purge_vfsp(vfsp, 0); + + /* + * free up the lxprnode + */ + lxpr_freenode(lxpr_mnt->lxprm_node); + zone_rele(lxpr_mnt->lxprm_zone); + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_root(vfs_t *vfsp, vnode_t **vpp) +{ + lxpr_node_t *lxpnp = ((lxpr_mnt_t *)vfsp->vfs_data)->lxprm_node; + vnode_t *vp = LXPTOV(lxpnp); + + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +lxpr_statvfs(vfs_t *vfsp, statvfs64_t *sp) +{ + int n; + dev32_t d32; + extern uint_t nproc; + + n = v.v_proc - nproc; + + bzero((caddr_t)sp, sizeof (*sp)); + sp->f_bsize = DEV_BSIZE; + sp->f_frsize = DEV_BSIZE; + sp->f_blocks = (fsblkcnt64_t)0; + sp->f_bfree = (fsblkcnt64_t)0; + sp->f_bavail = (fsblkcnt64_t)0; + sp->f_files = (fsfilcnt64_t)v.v_proc + 2; + sp->f_ffree = (fsfilcnt64_t)n; + sp->f_favail = (fsfilcnt64_t)n; + (void) cmpldev(&d32, vfsp->vfs_dev); + sp->f_fsid = d32; + /* It is guaranteed that vsw_name will fit in f_basetype */ + (void) strcpy(sp->f_basetype, vfssw[lxprocfstype].vsw_name); + sp->f_flag = vf_to_stf(vfsp->vfs_flag); + sp->f_namemax = 64; /* quite arbitrary */ + + (void) strcpy(sp->f_fstr, "lxproc"); + + return (0); +} diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vnops.c b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c new file mode 100644 index 0000000000..48f4efc1bf --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c @@ -0,0 +1,3099 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2015 Joyent, Inc. All rights reserved. + */ + +/* + * lxproc -- a loosely Linux-compatible /proc + * + * We have -- confusingly -- two implementations of Linux /proc. One is to + * support the LX brand with a Linux /proc entirely compatible with the Linux + * world view; the other -- this one -- is to support native (but Linux-borne) + * programs that wish to view the native system via the Linux /proc model. So + * the aspiration here is to provide something that sufficiently approximates + * the Linux /proc implementation for purposes of offering some compatibility + * for simple Linux /proc readers (e.g., ps/top/htop). However, it is not + * intended to exactly mimic Linux semantics; when choosing between offering + * compatibility and telling the truth, we emphatically pick the truth. 
A + * particular glaring example of this is the Linux notion of "tasks" (that is, + * threads), which -- due to historical misadventures on Linux -- allocate their + * identifiers from the process identifier space. (That is, each thread has in + * effect a pid.) Some Linux /proc readers have come to depend on this + * attribute, and become confused when threads appear with proper identifiers, + * so we simply opt for the pre-2.6 behavior, and do not present the tasks + * directory at all. Similarly, when choosing between offering compatibility + * and remaining consistent with our broader security model, we (obviously) + * choose security over compatibility. In short, this is meant to be a best + * effort -- no more -- and as such, it should not be unified with the much + * more complete Linux /proc implementation found in the LX brand. + */ + +#include <sys/cpupart.h> +#include <sys/cpuvar.h> +#include <sys/session.h> +#include <sys/vmparam.h> +#include <sys/mman.h> +#include <vm/rm.h> +#include <vm/seg_vn.h> +#include <sys/sdt.h> +#include <sys/strlog.h> +#include <sys/stropts.h> +#include <sys/cmn_err.h> +#include <sys/x86_archext.h> +#include <sys/archsystm.h> +#include <sys/fp.h> +#include <sys/pool_pset.h> +#include <sys/pset.h> +#include <sys/zone.h> +#include <sys/pghw.h> +#include <sys/vfs_opreg.h> + +/* Dependent on procfs */ +extern kthread_t *prchoose(proc_t *); + +#include "lxproc.h" + +extern pgcnt_t swapfs_minfree; +extern time_t boot_time; + +/* + * Pointer to the vnode ops vector for this fs. + * This is instantiated in lxprinit() in lxpr_vfsops.c + */ +vnodeops_t *lxpr_vnodeops; + +static int lxpr_open(vnode_t **, int, cred_t *, caller_context_t *); +static int lxpr_close(vnode_t *, int, int, offset_t, cred_t *, + caller_context_t *); +static int lxpr_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *); +static int lxpr_getattr(vnode_t *, vattr_t *, int, cred_t *, + caller_context_t *); +static int lxpr_access(vnode_t *, int, int, cred_t *, caller_context_t *); +static int lxpr_lookup(vnode_t *, char *, vnode_t **, + pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *, + pathname_t *); +static int lxpr_readdir(vnode_t *, uio_t *, cred_t *, int *, + caller_context_t *, int); +static int lxpr_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *); +static int lxpr_cmp(vnode_t *, vnode_t *, caller_context_t *); +static int lxpr_realvp(vnode_t *, vnode_t **, caller_context_t *); +static int lxpr_sync(void); +static void lxpr_inactive(vnode_t *, cred_t *, caller_context_t *); + +static vnode_t *lxpr_lookup_procdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_piddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_not_a_dir(vnode_t *, char *); +static vnode_t *lxpr_lookup_fddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_netdir(vnode_t *, char *); + +static int lxpr_readdir_procdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_piddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_not_a_dir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_fddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_netdir(lxpr_node_t *, uio_t *, int *); + +static void lxpr_read_invalid(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_empty(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_cpuinfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_isdir(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_fd(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_kmsg(lxpr_node_t *, lxpr_uiobuf_t *, 
ldi_handle_t); +static void lxpr_read_loadavg(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_meminfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_mounts(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_partitions(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_uptime(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_version(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_pid_cmdline(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_maps(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_statm(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_status(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_net_arp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev_mcast(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_igmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_vif(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_mcfilter(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_netstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_raw(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_route(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rpc(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rt_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_sockstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_snmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_tcp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_udp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_unix(lxpr_node_t *, lxpr_uiobuf_t *); + +/* + * Simple conversion + */ +#define btok(x) ((x) >> 10) /* bytes to kbytes */ +#define ptok(x) ((x) << (PAGESHIFT - 10)) /* pages to kbytes */ + +/* + * The lxproc vnode operations vector + */ +const fs_operation_def_t lxpr_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = lxpr_open }, + VOPNAME_CLOSE, { .vop_close = lxpr_close }, + VOPNAME_READ, { .vop_read = lxpr_read }, + VOPNAME_GETATTR, { .vop_getattr = lxpr_getattr }, + VOPNAME_ACCESS, { .vop_access = lxpr_access }, + VOPNAME_LOOKUP, { .vop_lookup = lxpr_lookup }, + VOPNAME_READDIR, { .vop_readdir = lxpr_readdir }, + VOPNAME_READLINK, { .vop_readlink = lxpr_readlink }, + VOPNAME_FSYNC, { .error = lxpr_sync }, + VOPNAME_SEEK, { .error = lxpr_sync }, + VOPNAME_INACTIVE, { .vop_inactive = lxpr_inactive }, + VOPNAME_CMP, { .vop_cmp = lxpr_cmp }, + VOPNAME_REALVP, { .vop_realvp = lxpr_realvp }, + NULL, NULL +}; + +/* + * file contents of an lxproc directory. 
+ */ +static lxpr_dirent_t lxpr_dir[] = { + { LXPR_CMDLINE, "cmdline" }, + { LXPR_CPUINFO, "cpuinfo" }, + { LXPR_DEVICES, "devices" }, + { LXPR_DMA, "dma" }, + { LXPR_FILESYSTEMS, "filesystems" }, + { LXPR_INTERRUPTS, "interrupts" }, + { LXPR_IOPORTS, "ioports" }, + { LXPR_KCORE, "kcore" }, + { LXPR_KMSG, "kmsg" }, + { LXPR_LOADAVG, "loadavg" }, + { LXPR_MEMINFO, "meminfo" }, + { LXPR_MOUNTS, "mounts" }, + { LXPR_NETDIR, "net" }, + { LXPR_PARTITIONS, "partitions" }, + { LXPR_SELF, "self" }, + { LXPR_STAT, "stat" }, + { LXPR_UPTIME, "uptime" }, + { LXPR_VERSION, "version" } +}; + +#define PROCDIRFILES (sizeof (lxpr_dir) / sizeof (lxpr_dir[0])) + +/* + * Contents of an /lxproc/<pid> directory. + */ +static lxpr_dirent_t piddir[] = { + { LXPR_PID_CMDLINE, "cmdline" }, + { LXPR_PID_CPU, "cpu" }, + { LXPR_PID_CURDIR, "cwd" }, + { LXPR_PID_ENV, "environ" }, + { LXPR_PID_EXE, "exe" }, + { LXPR_PID_MAPS, "maps" }, + { LXPR_PID_MEM, "mem" }, + { LXPR_PID_ROOTDIR, "root" }, + { LXPR_PID_STAT, "stat" }, + { LXPR_PID_STATM, "statm" }, + { LXPR_PID_STATUS, "status" }, + { LXPR_PID_FDDIR, "fd" } +}; + +#define PIDDIRFILES (sizeof (piddir) / sizeof (piddir[0])) + +/* + * contents of /lxproc/net directory + */ +static lxpr_dirent_t netdir[] = { + { LXPR_NET_ARP, "arp" }, + { LXPR_NET_DEV, "dev" }, + { LXPR_NET_DEV_MCAST, "dev_mcast" }, + { LXPR_NET_IGMP, "igmp" }, + { LXPR_NET_IP_MR_CACHE, "ip_mr_cache" }, + { LXPR_NET_IP_MR_VIF, "ip_mr_vif" }, + { LXPR_NET_MCFILTER, "mcfilter" }, + { LXPR_NET_NETSTAT, "netstat" }, + { LXPR_NET_RAW, "raw" }, + { LXPR_NET_ROUTE, "route" }, + { LXPR_NET_RPC, "rpc" }, + { LXPR_NET_RT_CACHE, "rt_cache" }, + { LXPR_NET_SOCKSTAT, "sockstat" }, + { LXPR_NET_SNMP, "snmp" }, + { LXPR_NET_STAT, "stat" }, + { LXPR_NET_TCP, "tcp" }, + { LXPR_NET_UDP, "udp" }, + { LXPR_NET_UNIX, "unix" } +}; + +#define NETDIRFILES (sizeof (netdir) / sizeof (netdir[0])) + +/* + * These are the major signal number differences between Linux and native: + * + * ==================================== + * | Number | Linux | Native | + * | ====== | ========= | ========== | + * | 7 | SIGBUS | SIGEMT | + * | 10 | SIGUSR1 | SIGBUS | + * | 12 | SIGUSR2 | SIGSYS | + * | 16 | SIGSTKFLT | SIGUSR1 | + * | 17 | SIGCHLD | SIGUSR2 | + * | 18 | SIGCONT | SIGCHLD | + * | 19 | SIGSTOP | SIGPWR | + * | 20 | SIGTSTP | SIGWINCH | + * | 21 | SIGTTIN | SIGURG | + * | 22 | SIGTTOU | SIGPOLL | + * | 23 | SIGURG | SIGSTOP | + * | 24 | SIGXCPU | SIGTSTP | + * | 25 | SIGXFSZ | SIGCONT | + * | 26 | SIGVTALARM | SIGTTIN | + * | 27 | SIGPROF | SIGTTOU | + * | 28 | SIGWINCH | SIGVTALARM | + * | 29 | SIGPOLL | SIGPROF | + * | 30 | SIGPWR | SIGXCPU | + * | 31 | SIGSYS | SIGXFSZ | + * ==================================== + * + * Not every Linux signal maps to a native signal, nor does every native + * signal map to a Linux counterpart. However, when signals do map, the + * mapping is unique. 
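+ *
+ * lxpr_sigmap[] below is indexed by native signal number and yields the
+ * Linux number, with -1 for native signals that have no Linux
+ * counterpart; the real-time signals map linearly onto LX_SIGRTMIN
+ * through LX_SIGRTMAX.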
+ */ +static int +lxpr_sigmap[NSIG] = { + 0, + LX_SIGHUP, + LX_SIGINT, + LX_SIGQUIT, + LX_SIGILL, + LX_SIGTRAP, + LX_SIGABRT, + LX_SIGSTKFLT, + LX_SIGFPE, + LX_SIGKILL, + LX_SIGBUS, + LX_SIGSEGV, + LX_SIGSYS, + LX_SIGPIPE, + LX_SIGALRM, + LX_SIGTERM, + LX_SIGUSR1, + LX_SIGUSR2, + LX_SIGCHLD, + LX_SIGPWR, + LX_SIGWINCH, + LX_SIGURG, + LX_SIGPOLL, + LX_SIGSTOP, + LX_SIGTSTP, + LX_SIGCONT, + LX_SIGTTIN, + LX_SIGTTOU, + LX_SIGVTALRM, + LX_SIGPROF, + LX_SIGXCPU, + LX_SIGXFSZ, + -1, /* 32: illumos SIGWAITING */ + -1, /* 33: illumos SIGLWP */ + -1, /* 34: illumos SIGFREEZE */ + -1, /* 35: illumos SIGTHAW */ + -1, /* 36: illumos SIGCANCEL */ + -1, /* 37: illumos SIGLOST */ + -1, /* 38: illumos SIGXRES */ + -1, /* 39: illumos SIGJVM1 */ + -1, /* 40: illumos SIGJVM2 */ + -1, /* 41: illumos SIGINFO */ + LX_SIGRTMIN, /* 42: illumos _SIGRTMIN */ + LX_SIGRTMIN + 1, + LX_SIGRTMIN + 2, + LX_SIGRTMIN + 3, + LX_SIGRTMIN + 4, + LX_SIGRTMIN + 5, + LX_SIGRTMIN + 6, + LX_SIGRTMIN + 7, + LX_SIGRTMIN + 8, + LX_SIGRTMIN + 9, + LX_SIGRTMIN + 10, + LX_SIGRTMIN + 11, + LX_SIGRTMIN + 12, + LX_SIGRTMIN + 13, + LX_SIGRTMIN + 14, + LX_SIGRTMIN + 15, + LX_SIGRTMIN + 16, + LX_SIGRTMIN + 17, + LX_SIGRTMIN + 18, + LX_SIGRTMIN + 19, + LX_SIGRTMIN + 20, + LX_SIGRTMIN + 21, + LX_SIGRTMIN + 22, + LX_SIGRTMIN + 23, + LX_SIGRTMIN + 24, + LX_SIGRTMIN + 25, + LX_SIGRTMIN + 26, + LX_SIGRTMIN + 27, + LX_SIGRTMIN + 28, + LX_SIGRTMIN + 29, + LX_SIGRTMIN + 30, + LX_SIGRTMAX +}; + +/* + * lxpr_open(): Vnode operation for VOP_OPEN() + */ +static int +lxpr_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + vnode_t *vp = *vpp; + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + vnode_t *rvp; + int error = 0; + + /* + * We only allow reading in this file systrem + */ + if (flag & FWRITE) + return (EROFS); + + /* + * If we are opening an underlying file only allow regular files + * reject the open for anything but a regular file. + * Just do it if we are opening the current or root directory. + */ + if (lxpnp->lxpr_realvp != NULL) { + rvp = lxpnp->lxpr_realvp; + + if (type == LXPR_PID_FD_FD && rvp->v_type != VREG) + error = EACCES; + else { + /* + * Need to hold rvp since VOP_OPEN() may release it. 
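+		 * (VOP_OPEN() may also substitute a different vnode --
+		 * a device clone, for instance -- which is why *vpp is
+		 * replaced with rvp on success and the lxproc vnode
+		 * itself released.)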
+ */ + VN_HOLD(rvp); + error = VOP_OPEN(&rvp, flag, cr, ct); + if (error) { + VN_RELE(rvp); + } else { + *vpp = rvp; + VN_RELE(vp); + } + } + } + + return (error); +} + + +/* + * lxpr_close(): Vnode operation for VOP_CLOSE() + */ +/* ARGSUSED */ +static int +lxpr_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpr = VTOLXP(vp); + lxpr_nodetype_t type = lxpr->lxpr_type; + + /* + * we should never get here because the close is done on the realvp + * for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR && + type != LXPR_PID_EXE); + + return (0); +} + +static void (*lxpr_read_function[LXPR_NFILES])() = { + lxpr_read_isdir, /* /proc */ + lxpr_read_isdir, /* /proc/<pid> */ + lxpr_read_pid_cmdline, /* /proc/<pid>/cmdline */ + lxpr_read_empty, /* /proc/<pid>/cpu */ + lxpr_read_invalid, /* /proc/<pid>/cwd */ + lxpr_read_empty, /* /proc/<pid>/environ */ + lxpr_read_invalid, /* /proc/<pid>/exe */ + lxpr_read_pid_maps, /* /proc/<pid>/maps */ + lxpr_read_empty, /* /proc/<pid>/mem */ + lxpr_read_invalid, /* /proc/<pid>/root */ + lxpr_read_pid_stat, /* /proc/<pid>/stat */ + lxpr_read_pid_statm, /* /proc/<pid>/statm */ + lxpr_read_pid_status, /* /proc/<pid>/status */ + lxpr_read_isdir, /* /proc/<pid>/fd */ + lxpr_read_fd, /* /proc/<pid>/fd/nn */ + lxpr_read_empty, /* /proc/cmdline */ + lxpr_read_cpuinfo, /* /proc/cpuinfo */ + lxpr_read_empty, /* /proc/devices */ + lxpr_read_empty, /* /proc/dma */ + lxpr_read_empty, /* /proc/filesystems */ + lxpr_read_empty, /* /proc/interrupts */ + lxpr_read_empty, /* /proc/ioports */ + lxpr_read_empty, /* /proc/kcore */ + lxpr_read_invalid, /* /proc/kmsg -- see lxpr_read() */ + lxpr_read_loadavg, /* /proc/loadavg */ + lxpr_read_meminfo, /* /proc/meminfo */ + lxpr_read_mounts, /* /proc/mounts */ + lxpr_read_isdir, /* /proc/net */ + lxpr_read_net_arp, /* /proc/net/arp */ + lxpr_read_net_dev, /* /proc/net/dev */ + lxpr_read_net_dev_mcast, /* /proc/net/dev_mcast */ + lxpr_read_net_igmp, /* /proc/net/igmp */ + lxpr_read_net_ip_mr_cache, /* /proc/net/ip_mr_cache */ + lxpr_read_net_ip_mr_vif, /* /proc/net/ip_mr_vif */ + lxpr_read_net_mcfilter, /* /proc/net/mcfilter */ + lxpr_read_net_netstat, /* /proc/net/netstat */ + lxpr_read_net_raw, /* /proc/net/raw */ + lxpr_read_net_route, /* /proc/net/route */ + lxpr_read_net_rpc, /* /proc/net/rpc */ + lxpr_read_net_rt_cache, /* /proc/net/rt_cache */ + lxpr_read_net_sockstat, /* /proc/net/sockstat */ + lxpr_read_net_snmp, /* /proc/net/snmp */ + lxpr_read_net_stat, /* /proc/net/stat */ + lxpr_read_net_tcp, /* /proc/net/tcp */ + lxpr_read_net_udp, /* /proc/net/udp */ + lxpr_read_net_unix, /* /proc/net/unix */ + lxpr_read_partitions, /* /proc/partitions */ + lxpr_read_invalid, /* /proc/self */ + lxpr_read_stat, /* /proc/stat */ + lxpr_read_uptime, /* /proc/uptime */ + lxpr_read_version, /* /proc/version */ +}; + +/* + * Array of lookup functions, indexed by /lxproc file type. 
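+ *
+ * lxpr_lookup() dispatches through this table using the type of the
+ * directory being searched; for example, looking up "tcp" in a node of
+ * type LXPR_NETDIR is handled by lxpr_lookup_netdir(), while a lookup
+ * in any non-directory node fails via lxpr_lookup_not_a_dir().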
+ */ +static vnode_t *(*lxpr_lookup_function[LXPR_NFILES])() = { + lxpr_lookup_procdir, /* /proc */ + lxpr_lookup_piddir, /* /proc/<pid> */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/environ */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/exe */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/maps */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/mem */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/root */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/stat */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/statm */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/status */ + lxpr_lookup_fddir, /* /proc/<pid>/fd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_lookup_not_a_dir, /* /proc/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/cpuinfo */ + lxpr_lookup_not_a_dir, /* /proc/devices */ + lxpr_lookup_not_a_dir, /* /proc/dma */ + lxpr_lookup_not_a_dir, /* /proc/filesystems */ + lxpr_lookup_not_a_dir, /* /proc/interrupts */ + lxpr_lookup_not_a_dir, /* /proc/ioports */ + lxpr_lookup_not_a_dir, /* /proc/kcore */ + lxpr_lookup_not_a_dir, /* /proc/kmsg */ + lxpr_lookup_not_a_dir, /* /proc/loadavg */ + lxpr_lookup_not_a_dir, /* /proc/meminfo */ + lxpr_lookup_not_a_dir, /* /proc/mounts */ + lxpr_lookup_netdir, /* /proc/net */ + lxpr_lookup_not_a_dir, /* /proc/net/arp */ + lxpr_lookup_not_a_dir, /* /proc/net/dev */ + lxpr_lookup_not_a_dir, /* /proc/net/dev_mcast */ + lxpr_lookup_not_a_dir, /* /proc/net/igmp */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_lookup_not_a_dir, /* /proc/net/mcfilter */ + lxpr_lookup_not_a_dir, /* /proc/net/netstat */ + lxpr_lookup_not_a_dir, /* /proc/net/raw */ + lxpr_lookup_not_a_dir, /* /proc/net/route */ + lxpr_lookup_not_a_dir, /* /proc/net/rpc */ + lxpr_lookup_not_a_dir, /* /proc/net/rt_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/sockstat */ + lxpr_lookup_not_a_dir, /* /proc/net/snmp */ + lxpr_lookup_not_a_dir, /* /proc/net/stat */ + lxpr_lookup_not_a_dir, /* /proc/net/tcp */ + lxpr_lookup_not_a_dir, /* /proc/net/udp */ + lxpr_lookup_not_a_dir, /* /proc/net/unix */ + lxpr_lookup_not_a_dir, /* /proc/partitions */ + lxpr_lookup_not_a_dir, /* /proc/self */ + lxpr_lookup_not_a_dir, /* /proc/stat */ + lxpr_lookup_not_a_dir, /* /proc/uptime */ + lxpr_lookup_not_a_dir, /* /proc/version */ +}; + +/* + * Array of readdir functions, indexed by /proc file type. 
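+ *
+ * As with the lookup table above, lxpr_readdir() selects the handler by
+ * node type; a getdents() on /proc/<pid>/fd ends up in
+ * lxpr_readdir_fddir(), and one on a regular file returns ENOTDIR from
+ * lxpr_readdir_not_a_dir().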
+ */ +static int (*lxpr_readdir_function[LXPR_NFILES])() = { + lxpr_readdir_procdir, /* /proc */ + lxpr_readdir_piddir, /* /proc/<pid> */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/environ */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/exe */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/maps */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/mem */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/root */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/stat */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/statm */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/status */ + lxpr_readdir_fddir, /* /proc/<pid>/fd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_readdir_not_a_dir, /* /proc/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/cpuinfo */ + lxpr_readdir_not_a_dir, /* /proc/devices */ + lxpr_readdir_not_a_dir, /* /proc/dma */ + lxpr_readdir_not_a_dir, /* /proc/filesystems */ + lxpr_readdir_not_a_dir, /* /proc/interrupts */ + lxpr_readdir_not_a_dir, /* /proc/ioports */ + lxpr_readdir_not_a_dir, /* /proc/kcore */ + lxpr_readdir_not_a_dir, /* /proc/kmsg */ + lxpr_readdir_not_a_dir, /* /proc/loadavg */ + lxpr_readdir_not_a_dir, /* /proc/meminfo */ + lxpr_readdir_not_a_dir, /* /proc/mounts */ + lxpr_readdir_netdir, /* /proc/net */ + lxpr_readdir_not_a_dir, /* /proc/net/arp */ + lxpr_readdir_not_a_dir, /* /proc/net/dev */ + lxpr_readdir_not_a_dir, /* /proc/net/dev_mcast */ + lxpr_readdir_not_a_dir, /* /proc/net/igmp */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_readdir_not_a_dir, /* /proc/net/mcfilter */ + lxpr_readdir_not_a_dir, /* /proc/net/netstat */ + lxpr_readdir_not_a_dir, /* /proc/net/raw */ + lxpr_readdir_not_a_dir, /* /proc/net/route */ + lxpr_readdir_not_a_dir, /* /proc/net/rpc */ + lxpr_readdir_not_a_dir, /* /proc/net/rt_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/sockstat */ + lxpr_readdir_not_a_dir, /* /proc/net/snmp */ + lxpr_readdir_not_a_dir, /* /proc/net/stat */ + lxpr_readdir_not_a_dir, /* /proc/net/tcp */ + lxpr_readdir_not_a_dir, /* /proc/net/udp */ + lxpr_readdir_not_a_dir, /* /proc/net/unix */ + lxpr_readdir_not_a_dir, /* /proc/partitions */ + lxpr_readdir_not_a_dir, /* /proc/self */ + lxpr_readdir_not_a_dir, /* /proc/stat */ + lxpr_readdir_not_a_dir, /* /proc/uptime */ + lxpr_readdir_not_a_dir, /* /proc/version */ +}; + + +/* + * lxpr_read(): Vnode operation for VOP_READ() + * + * As the format of all the files that can be read in lxproc is human readable + * and not binary structures there do not have to be different read variants + * depending on whether the reading process model is 32- or 64-bit. + */ +/* ARGSUSED */ +static int +lxpr_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + lxpr_uiobuf_t *uiobuf = lxpr_uiobuf_new(uiop); + int error; + + ASSERT(type < LXPR_NFILES); + + if (type == LXPR_KMSG) { + ldi_ident_t li = VTOLXPM(vp)->lxprm_li; + ldi_handle_t ldih; + struct strioctl str; + int rv; + + /* + * Open the zone's console device using the layered driver + * interface. + */ + if ((error = + ldi_open_by_name("/dev/log", FREAD, cr, &ldih, li)) != 0) + return (error); + + /* + * Send an ioctl to the underlying console device, letting it + * know we're interested in getting console messages. 
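+ * The I_STR ioctl below simply wraps an I_CONSLOG request in a
+ * strioctl; once it succeeds, each ldi_getmsg() call made from
+ * lxpr_read_kmsg() yields one console message for this zone.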
+ */ + str.ic_cmd = I_CONSLOG; + str.ic_timout = 0; + str.ic_len = 0; + str.ic_dp = NULL; + if ((error = ldi_ioctl(ldih, I_STR, + (intptr_t)&str, FKIOCTL, cr, &rv)) != 0) + return (error); + + lxpr_read_kmsg(lxpnp, uiobuf, ldih); + + if ((error = ldi_close(ldih, FREAD, cr)) != 0) + return (error); + } else { + lxpr_read_function[type](lxpnp, uiobuf); + } + + error = lxpr_uiobuf_flush(uiobuf); + lxpr_uiobuf_free(uiobuf); + + return (error); +} + +/* + * lxpr_read_invalid(), lxpr_read_isdir(), lxpr_read_empty() + * + * Various special case reads: + * - trying to read a directory + * - invalid file (used to mean a file that should be implemented, + * but isn't yet) + * - empty file + * - wait to be able to read a file that will never have anything to read + */ +/* ARGSUSED */ +static void +lxpr_read_isdir(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EISDIR); +} + +/* ARGSUSED */ +static void +lxpr_read_invalid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EINVAL); +} + +/* ARGSUSED */ +static void +lxpr_read_empty(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* + * lxpr_read_pid_cmdline(): + * + * This is not precisely compatible with Linux: the Linux cmdline returns argv + * with the correct separation using \0 between the arguments, but we cannot do + * that without copying the real argv from the correct process context. This + * is too difficult to attempt so we pretend that the entire cmdline is just + * argv[0]. This is good enough for ps and htop to display correctly, but might + * cause some other things not to work correctly. + */ +static void +lxpr_read_pid_cmdline(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + char *buf; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_CMDLINE); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + buf = PTOU(p)->u_argv != 0 ? 
PTOU(p)->u_psargs : PTOU(p)->u_comm; + + lxpr_uiobuf_write(uiobuf, buf, strlen(buf) + 1); + lxpr_unlock(p); +} + +/* + * lxpr_read_pid_maps(): memory map file + */ +static void +lxpr_read_pid_maps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + struct seg *seg; + char *buf; + int buflen = MAXPATHLEN; + struct print_data { + caddr_t saddr; + caddr_t eaddr; + int type; + char prot[5]; + uint32_t offset; + vnode_t *vp; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + struct print_data *pbuf; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_MAPS); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + as = p->p_as; + + if (as == &kas) { + lxpr_unlock(p); + return; + } + + mutex_exit(&p->p_lock); + + /* Iterate over all segments in the address space */ + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + vnode_t *vp; + uint_t protbits; + + pbuf = kmem_alloc(sizeof (*pbuf), KM_SLEEP); + + pbuf->saddr = seg->s_base; + pbuf->eaddr = seg->s_base+seg->s_size; + pbuf->type = SEGOP_GETTYPE(seg, seg->s_base); + + /* + * Cheat and only use the protection bits of the first page + * in the segment + */ + (void) strncpy(pbuf->prot, "----", sizeof (pbuf->prot)); + (void) SEGOP_GETPROT(seg, seg->s_base, 0, &protbits); + + if (protbits & PROT_READ) pbuf->prot[0] = 'r'; + if (protbits & PROT_WRITE) pbuf->prot[1] = 'w'; + if (protbits & PROT_EXEC) pbuf->prot[2] = 'x'; + if (pbuf->type & MAP_SHARED) pbuf->prot[3] = 's'; + else if (pbuf->type & MAP_PRIVATE) pbuf->prot[3] = 'p'; + + if (seg->s_ops == &segvn_ops && + SEGOP_GETVP(seg, seg->s_base, &vp) == 0 && + vp != NULL && vp->v_type == VREG) { + VN_HOLD(vp); + pbuf->vp = vp; + } else { + pbuf->vp = NULL; + } + + pbuf->offset = (uint32_t)SEGOP_GETOFFSET(seg, pbuf->saddr); + + pbuf->next = NULL; + *print_tail = pbuf; + print_tail = &pbuf->next; + } + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + buf = kmem_alloc(buflen, KM_SLEEP); + + /* print the data we've extracted */ + pbuf = print_head; + while (pbuf != NULL) { + struct print_data *pbuf_next; + vattr_t vattr; + + int maj = 0; + int min = 0; + u_longlong_t inode = 0; + + *buf = '\0'; + if (pbuf->vp != NULL) { + vattr.va_mask = AT_FSID | AT_NODEID; + if (VOP_GETATTR(pbuf->vp, &vattr, 0, CRED(), + NULL) == 0) { + maj = getmajor(vattr.va_fsid); + min = getminor(vattr.va_fsid); + inode = vattr.va_nodeid; + } + (void) vnodetopath(NULL, pbuf->vp, buf, buflen, CRED()); + VN_RELE(pbuf->vp); + } + + if (*buf != '\0') { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02d:%03d %lld %s\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode, buf); + } else { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02d:%03d %lld\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode); + } + + pbuf_next = pbuf->next; + kmem_free(pbuf, sizeof (*pbuf)); + pbuf = pbuf_next; + } + + kmem_free(buf, buflen); +} + +/* + * lxpr_read_pid_statm(): memory status file + */ +static void +lxpr_read_pid_statm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + size_t vsize; + size_t rss; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATM); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + as = p->p_as; + + mutex_exit(&p->p_lock); + + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + vsize = 
btopr(as->a_resvsize); + rss = rm_asrss(as); + AS_LOCK_EXIT(as, &as->a_lock); + + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, + "%lu %lu %lu %lu %lu %lu %lu\n", + vsize, rss, 0l, rss, 0l, 0l, 0l); +} + +/* + * lxpr_read_pid_status(): status file + */ +static void +lxpr_read_pid_status(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + user_t *up; + cred_t *cr; + const gid_t *groups; + int ngroups; + struct as *as; + char *status; + pid_t pid, ppid; + size_t vsize; + size_t rss; + k_sigset_t current, ignore, handle; + int i, lx_sig; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATUS); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + pid = p->p_pid; + + /* + * Convert pid to the Linux default of 1 if we're the zone's init + * process + */ + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; + ppid = 0; /* parent pid for init is 0 */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) + ? curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + } + + t = prchoose(p); + if (t != NULL) { + switch (t->t_state) { + case TS_SLEEP: + status = "S (sleeping)"; + break; + case TS_RUN: + case TS_ONPROC: + status = "R (running)"; + break; + case TS_ZOMB: + status = "Z (zombie)"; + break; + case TS_STOPPED: + status = "T (stopped)"; + break; + default: + status = "! (unknown)"; + break; + } + thread_unlock(t); + } else { + /* + * there is a hole in the exit code, where a proc can have + * no threads but it is yet to be flagged SZOMB. 
We will + * assume we are about to become a zombie + */ + status = "Z (zombie)"; + } + + up = PTOU(p); + mutex_enter(&p->p_crlock); + crhold(cr = p->p_cred); + mutex_exit(&p->p_crlock); + + lxpr_uiobuf_printf(uiobuf, + "Name:\t%s\n" + "State:\t%s\n" + "Tgid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" + "TracerPid:\t%d\n" + "Uid:\t%u\t%u\t%u\t%u\n" + "Gid:\t%u\t%u\t%u\t%u\n" + "FDSize:\t%d\n" + "Groups:\t", + up->u_comm, + status, + pid, /* thread group id - same as pid */ + pid, + ppid, + 0, + crgetruid(cr), crgetuid(cr), crgetsuid(cr), crgetuid(cr), + crgetrgid(cr), crgetgid(cr), crgetsgid(cr), crgetgid(cr), + p->p_fno_ctl); + + ngroups = crgetngroups(cr); + groups = crgetgroups(cr); + for (i = 0; i < ngroups; i++) { + lxpr_uiobuf_printf(uiobuf, + "%u ", + groups[i]); + } + crfree(cr); + + as = p->p_as; + if ((p->p_stat != SZOMB) && !(p->p_flag & SSYS) && (as != &kas)) { + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + + lxpr_uiobuf_printf(uiobuf, + "\n" + "VmSize:\t%8lu kB\n" + "VmLck:\t%8lu kB\n" + "VmRSS:\t%8lu kB\n" + "VmData:\t%8lu kB\n" + "VmStk:\t%8lu kB\n" + "VmExe:\t%8lu kB\n" + "VmLib:\t%8lu kB", + btok(vsize), + 0l, + ptok(rss), + 0l, + btok(p->p_stksize), + ptok(rss), + 0l); + } + + sigemptyset(¤t); + sigemptyset(&ignore); + sigemptyset(&handle); + + for (i = 1; i < NSIG; i++) { + lx_sig = lxpr_sigmap[i]; + + if ((lx_sig > 0) && (lx_sig <= LX_NSIG)) { + if (sigismember(&p->p_sig, i)) + sigaddset(¤t, lx_sig); + + if (up->u_signal[i - 1] == SIG_IGN) + sigaddset(&ignore, lx_sig); + else if (up->u_signal[i - 1] != SIG_DFL) + sigaddset(&handle, lx_sig); + } + } + + lxpr_uiobuf_printf(uiobuf, + "\n" + "SigPnd:\t%08x%08x\n" + "SigBlk:\t%08x%08x\n" + "SigIgn:\t%08x%08x\n" + "SigCgt:\t%08x%08x\n" + "CapInh:\t%016x\n" + "CapPrm:\t%016x\n" + "CapEff:\t%016x\n", + current.__sigbits[1], current.__sigbits[0], + 0, 0, /* signals blocked on per thread basis */ + ignore.__sigbits[1], ignore.__sigbits[0], + handle.__sigbits[1], handle.__sigbits[0], + /* Can't do anything with linux capabilities */ + 0, + 0, + 0); + + lxpr_unlock(p); +} + + +/* + * lxpr_read_pid_stat(): pid stat file + */ +static void +lxpr_read_pid_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + struct as *as; + char stat; + pid_t pid, ppid, pgpid, spid; + gid_t psgid; + dev_t psdev; + size_t rss, vsize; + int nice, pri; + caddr_t wchan; + processorid_t cpu; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STAT); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + pid = p->p_pid; + + /* + * Set Linux defaults if we're the zone's init process + */ + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; /* PID for init */ + ppid = 0; /* parent PID for init is 0 */ + pgpid = 0; /* process group for init is 0 */ + psgid = (gid_t)-1; /* credential GID for init is -1 */ + spid = 0; /* session id for init is 0 */ + psdev = 0; /* session device for init is 0 */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) ? 
+ curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + + pgpid = p->p_pgrp; + + mutex_enter(&p->p_splock); + mutex_enter(&p->p_sessp->s_lock); + spid = p->p_sessp->s_sid; + psdev = p->p_sessp->s_dev; + if (p->p_sessp->s_cred) + psgid = crgetgid(p->p_sessp->s_cred); + else + psgid = crgetgid(p->p_cred); + + mutex_exit(&p->p_sessp->s_lock); + mutex_exit(&p->p_splock); + } + + t = prchoose(p); + if (t != NULL) { + switch (t->t_state) { + case TS_SLEEP: + stat = 'S'; break; + case TS_RUN: + case TS_ONPROC: + stat = 'R'; break; + case TS_ZOMB: + stat = 'Z'; break; + case TS_STOPPED: + stat = 'T'; break; + default: + stat = '!'; break; + } + + if (CL_DONICE(t, NULL, 0, &nice) != 0) + nice = 0; + + pri = t->t_pri; + wchan = t->t_wchan; + cpu = t->t_cpu->cpu_id; + thread_unlock(t); + } else { + /* Only zombies have no threads */ + stat = 'Z'; + nice = 0; + pri = 0; + wchan = 0; + cpu = 0; + } + as = p->p_as; + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + + lxpr_uiobuf_printf(uiobuf, + "%d (%s) %c %d %d %d %d %d " + "%lu %lu %lu %lu %lu " + "%lu %lu %ld %ld " + "%d %d %d " + "%lu " + "%lu " + "%lu %ld %llu " + "%lu %lu %u " + "%lu %lu " + "%lu %lu %lu %lu " + "%lu " + "%lu %lu " + "%d " + "%d" + "\n", + pid, PTOU(p)->u_comm, stat, ppid, pgpid, spid, psdev, psgid, + 0l, 0l, 0l, 0l, 0l, /* flags, minflt, cminflt, majflt, cmajflt */ + p->p_utime, p->p_stime, p->p_cutime, p->p_cstime, + pri, nice, p->p_lwpcnt, + 0l, /* itrealvalue (time before next SIGALRM) */ + PTOU(p)->u_ticks, + vsize, rss, p->p_vmem_ctl, + 0l, 0l, USRSTACK, /* startcode, endcode, startstack */ + 0l, 0l, /* kstkesp, kstkeip */ + 0l, 0l, 0l, 0l, /* signal, blocked, sigignore, sigcatch */ + wchan, + 0l, 0l, /* nswap, cnswap */ + 0, /* exit_signal */ + cpu); + + lxpr_unlock(p); +} + +/* ARGSUSED */ +static void +lxpr_read_net_arp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, "Inter-| Receive " + " | Transmit\n"); + lxpr_uiobuf_printf(uiobuf, " face |bytes packets errs drop fifo" + " frame compressed multicast|bytes packets errs drop fifo" + " colls carrier compressed\n"); + + /* + * Data about each interface should go here, but that shouldn't be added + * unless there is an lxproc reader that actually makes use of it (and + * doesn't need anything else that we refuse to provide)... 
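+ *
+ * If per-interface data is ever added, each interface would get one
+ * line matching the header above, roughly of the form (names and
+ * counts here are only illustrative):
+ *
+ *	lo0: 12345 67 0 0 0 0 0 0 12345 67 0 0 0 0 0 0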
+ */ +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev_mcast(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_igmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_vif(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_mcfilter(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_netstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_raw(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_rpc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_rt_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_sockstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_snmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_tcp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_udp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_unix(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* + * lxpr_read_kmsg(): read the contents of the kernel message queue. We + * translate this into the reception of console messages for this zone; each + * read copies out a single zone console message, or blocks until the next one + * is produced. + */ + +#define LX_KMSG_PRI "<0>" + +static void +lxpr_read_kmsg(lxpr_node_t *lxpnp, struct lxpr_uiobuf *uiobuf, ldi_handle_t lh) +{ + mblk_t *mp; + + ASSERT(lxpnp->lxpr_type == LXPR_KMSG); + + if (ldi_getmsg(lh, &mp, NULL) == 0) { + /* + * lxproc doesn't like successive reads to the same file + * descriptor unless we do an explicit rewind each time. + */ + lxpr_uiobuf_seek(uiobuf, 0); + + lxpr_uiobuf_printf(uiobuf, "%s%s", LX_KMSG_PRI, + mp->b_cont->b_rptr); + + freemsg(mp); + } +} + +/* + * lxpr_read_loadavg(): read the contents of the "loadavg" file. We do just + * enough for uptime and other simple lxproc readers to work + */ +extern int nthread; + +static void +lxpr_read_loadavg(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ulong_t avenrun1; + ulong_t avenrun5; + ulong_t avenrun15; + ulong_t avenrun1_cs; + ulong_t avenrun5_cs; + ulong_t avenrun15_cs; + int loadavg[3]; + int *loadbuf; + cpupart_t *cp; + zone_t *zone = LXPTOZ(lxpnp); + + uint_t nrunnable = 0; + rctl_qty_t nlwps; + + ASSERT(lxpnp->lxpr_type == LXPR_LOADAVG); + + mutex_enter(&cpu_lock); + + /* + * Need to add up values over all CPU partitions. If pools are active, + * only report the values of the zone's partition, which by definition + * includes the current CPU. 
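+ *
+ * The resulting line mimics the Linux format, e.g.
+ *
+ *	0.12 0.09 0.05 1/123 0
+ *
+ * i.e. the three averages, runnable/nlwps, and a final field (the most
+ * recently created pid on Linux) that we simply report as 0.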
+ */ + if (pool_pset_enabled()) { + psetid_t psetid = zone_pset_get(curproc->p_zone); + + ASSERT(curproc->p_zone != &zone0); + cp = CPU->cpu_part; + + nrunnable = cp->cp_nrunning + cp->cp_nrunnable; + (void) cpupart_get_loadavg(psetid, &loadavg[0], 3); + loadbuf = &loadavg[0]; + } else { + cp = cp_list_head; + do { + nrunnable += cp->cp_nrunning + cp->cp_nrunnable; + } while ((cp = cp->cp_next) != cp_list_head); + + loadbuf = zone == global_zone ? + &avenrun[0] : zone->zone_avenrun; + } + + /* + * If we're in the non-global zone, we'll report the total number of + * LWPs in the zone for the "nproc" parameter of /proc/loadavg, + * otherwise will just use nthread (which will include kernel threads, + * but should be good enough for lxproc). + */ + nlwps = zone == global_zone ? nthread : zone->zone_nlwps; + + mutex_exit(&cpu_lock); + + avenrun1 = loadbuf[0] >> FSHIFT; + avenrun1_cs = ((loadbuf[0] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun5 = loadbuf[1] >> FSHIFT; + avenrun5_cs = ((loadbuf[1] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun15 = loadbuf[2] >> FSHIFT; + avenrun15_cs = ((loadbuf[2] & (FSCALE-1)) * 100) >> FSHIFT; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d %ld.%02d %d/%d %d\n", + avenrun1, avenrun1_cs, + avenrun5, avenrun5_cs, + avenrun15, avenrun15_cs, + nrunnable, nlwps, 0); +} + +/* + * lxpr_read_meminfo(): read the contents of the "meminfo" file. + */ +static void +lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + int global = zone == global_zone; + long total_mem, free_mem, total_swap, used_swap; + + ASSERT(lxpnp->lxpr_type == LXPR_MEMINFO); + + if (global || zone->zone_phys_mem_ctl == UINT64_MAX) { + total_mem = physmem * PAGESIZE; + free_mem = freemem * PAGESIZE; + } else { + total_mem = zone->zone_phys_mem_ctl; + free_mem = zone->zone_phys_mem_ctl - zone->zone_phys_mem; + } + + if (global || zone->zone_max_swap_ctl == UINT64_MAX) { + total_swap = k_anoninfo.ani_max * PAGESIZE; + used_swap = k_anoninfo.ani_phys_resv * PAGESIZE; + } else { + mutex_enter(&zone->zone_mem_lock); + total_swap = zone->zone_max_swap_ctl; + used_swap = zone->zone_max_swap; + mutex_exit(&zone->zone_mem_lock); + } + + lxpr_uiobuf_printf(uiobuf, + " total: used: free: shared: buffers: cached:\n" + "Mem: %8lu %8lu %8lu %8u %8u %8u\n" + "Swap: %8lu %8lu %8lu\n" + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "MemShared: %8u kB\n" + "Buffers: %8u kB\n" + "Cached: %8u kB\n" + "SwapCached:%8u kB\n" + "Active: %8u kB\n" + "Inactive: %8u kB\n" + "HighTotal: %8u kB\n" + "HighFree: %8u kB\n" + "LowTotal: %8u kB\n" + "LowFree: %8u kB\n" + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n", + total_mem, total_mem - free_mem, free_mem, 0, 0, 0, + total_swap, used_swap, total_swap - used_swap, + btok(total_mem), /* MemTotal */ + btok(free_mem), /* MemFree */ + 0, /* MemShared */ + 0, /* Buffers */ + 0, /* Cached */ + 0, /* SwapCached */ + 0, /* Active */ + 0, /* Inactive */ + 0, /* HighTotal */ + 0, /* HighFree */ + btok(total_mem), /* LowTotal */ + btok(free_mem), /* LowFree */ + btok(total_swap), /* SwapTotal */ + btok(total_swap - used_swap)); /* SwapFree */ +} + +/* + * lxpr_read_mounts(): + */ +/* ARGSUSED */ +static void +lxpr_read_mounts(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + struct vfs *vfsp; + struct vfs *vfslist; + zone_t *zone = LXPTOZ(lxpnp); + struct print_data { + refstr_t *vfs_mntpt; + refstr_t *vfs_resource; + uint_t vfs_flag; + int vfs_fstype; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + 
struct print_data *printp; + + vfs_list_read_lock(); + + if (zone == global_zone) { + vfsp = vfslist = rootvfs; + } else { + vfsp = vfslist = zone->zone_vfslist; + /* + * If the zone has a root entry, it will be the first in + * the list. If it doesn't, we conjure one up. + */ + if (vfslist == NULL || strcmp(refstr_value(vfsp->vfs_mntpt), + zone->zone_rootpath) != 0) { + struct vfs *tvfsp; + /* + * The root of the zone is not a mount point. The vfs + * we want to report is that of the zone's root vnode. + */ + tvfsp = zone->zone_rootvp->v_vfsp; + + lxpr_uiobuf_printf(uiobuf, + "/ / %s %s 0 0\n", + vfssw[tvfsp->vfs_fstype].vsw_name, + tvfsp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + + } + if (vfslist == NULL) { + vfs_list_unlock(); + return; + } + } + + /* + * Later on we have to do a lookupname, which can end up causing + * another vfs_list_read_lock() to be called. Which can lead to a + * deadlock. To avoid this, we extract the data we need into a local + * list, then we can run this list without holding vfs_list_read_lock() + * We keep the list in the same order as the vfs_list + */ + do { + /* Skip mounts we shouldn't show */ + if (vfsp->vfs_flag & VFS_NOMNTTAB) { + goto nextfs; + } + + printp = kmem_alloc(sizeof (*printp), KM_SLEEP); + refstr_hold(vfsp->vfs_mntpt); + printp->vfs_mntpt = vfsp->vfs_mntpt; + refstr_hold(vfsp->vfs_resource); + printp->vfs_resource = vfsp->vfs_resource; + printp->vfs_flag = vfsp->vfs_flag; + printp->vfs_fstype = vfsp->vfs_fstype; + printp->next = NULL; + + *print_tail = printp; + print_tail = &printp->next; + +nextfs: + vfsp = (zone == global_zone) ? + vfsp->vfs_next : vfsp->vfs_zone_next; + + } while (vfsp != vfslist); + + vfs_list_unlock(); + + /* + * now we can run through what we've extracted without holding + * vfs_list_read_lock() + */ + printp = print_head; + while (printp != NULL) { + struct print_data *printp_next; + const char *resource; + char *mntpt; + struct vnode *vp; + int error; + + mntpt = (char *)refstr_value(printp->vfs_mntpt); + resource = refstr_value(printp->vfs_resource); + + if (mntpt != NULL && mntpt[0] != '\0') + mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); + else + mntpt = "-"; + + error = lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + + if (error != 0) + goto nextp; + + if (!(vp->v_flag & VROOT)) { + VN_RELE(vp); + goto nextp; + } + VN_RELE(vp); + + if (resource != NULL && resource[0] != '\0') { + if (resource[0] == '/') { + resource = ZONE_PATH_VISIBLE(resource, zone) ? + ZONE_PATH_TRANSLATE(resource, zone) : + mntpt; + } + } else { + resource = "-"; + } + + lxpr_uiobuf_printf(uiobuf, + "%s %s %s %s 0 0\n", + resource, mntpt, vfssw[printp->vfs_fstype].vsw_name, + printp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + +nextp: + printp_next = printp->next; + refstr_rele(printp->vfs_mntpt); + refstr_rele(printp->vfs_resource); + kmem_free(printp, sizeof (*printp)); + printp = printp_next; + + } +} + +/* + * lxpr_read_partitions(): + * + * We don't support partitions in a local zone because it requires access to + * physical devices. But we need to fake up enough of the file to show that we + * have no partitions. + */ +/* ARGSUSED */ +static void +lxpr_read_partitions(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, + "major minor #blocks name rio rmerge rsect ruse " + "wio wmerge wsect wuse running use aveq\n\n"); +} + +/* + * lxpr_read_version(): read the contents of the "version" file. Note that + * we don't lie here -- we don't pretend that we're Linux. 
If lxproc is to + * be used in a Linux-branded zone, there will need to be a mount option to + * indicate that Linux should be more fully mimicked. + */ +/* ARGSUSED */ +static void +lxpr_read_version(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, + "%s version %s (%s version %d.%d.%d) " + "#%s SMP %s\n", + utsname.sysname, utsname.release, +#if defined(__GNUC__) + "gcc", + __GNUC__, + __GNUC_MINOR__, + __GNUC_PATCHLEVEL__, +#else + "Sun C", + __SUNPRO_C / 0x100, + (__SUNPRO_C & 0xff) / 0x10, + __SUNPRO_C & 0xf, +#endif + utsname.version, + "00:00:00 00/00/00"); +} + +/* + * lxpr_read_stat(): read the contents of the "stat" file. + * + */ +/* ARGSUSED */ +static void +lxpr_read_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t sys_cum = 0; + ulong_t user_cum = 0; + ulong_t irq_cum = 0; + ulong_t cpu_nrunnable_cum = 0; + ulong_t w_io_cum = 0; + + ulong_t pgpgin_cum = 0; + ulong_t pgpgout_cum = 0; + ulong_t pgswapout_cum = 0; + ulong_t pgswapin_cum = 0; + ulong_t intr_cum = 0; + ulong_t pswitch_cum = 0; + ulong_t forks_cum = 0; + hrtime_t msnsecs[NCMSTATES]; + + /* temporary variable since scalehrtime modifies data in place */ + hrtime_t tmptime; + + ASSERT(lxpnp->lxpr_type == LXPR_STAT); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + /* Calculate cumulative stats */ + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + int i; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. + */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_cum += NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_cum += NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_cum += NSEC_TO_TICK(msnsecs[CMS_USER]); + + pgpgin_cum += CPU_STATS(cp, vm.pgpgin); + pgpgout_cum += CPU_STATS(cp, vm.pgpgout); + pgswapin_cum += CPU_STATS(cp, vm.pgswapin); + pgswapout_cum += CPU_STATS(cp, vm.pgswapout); + + cpu_nrunnable_cum += cp->cpu_disp->disp_nrunnable; + w_io_cum += CPU_STATS(cp, sys.iowait); + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_cum += NSEC_TO_TICK(tmptime); + } + + for (i = 0; i < PIL_MAX; i++) + intr_cum += CPU_STATS(cp, sys.intr[i]); + + pswitch_cum += CPU_STATS(cp, sys.pswitch); + forks_cum += CPU_STATS(cp, sys.sysfork); + forks_cum += CPU_STATS(cp, sys.sysvfork); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + lxpr_uiobuf_printf(uiobuf, "cpu %lu %lu %lu %lu %lu %lu %lu\n", + user_cum, 0L, sys_cum, idle_cum, 0L, irq_cum, 0L); + + /* Do per processor stats */ + do { + int i; + + ulong_t idle_ticks; + ulong_t sys_ticks; + ulong_t user_ticks; + ulong_t irq_ticks = 0; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. 
+ */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_ticks = NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_ticks = NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_ticks = NSEC_TO_TICK(msnsecs[CMS_USER]); + + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_ticks += NSEC_TO_TICK(tmptime); + } + + lxpr_uiobuf_printf(uiobuf, + "cpu%d %lu %lu %lu %lu %lu %lu %lu\n", + cp->cpu_id, user_ticks, 0L, sys_ticks, idle_ticks, + 0L, irq_ticks, 0L); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); + + lxpr_uiobuf_printf(uiobuf, + "page %lu %lu\n" + "swap %lu %lu\n" + "intr %lu\n" + "ctxt %lu\n" + "btime %lu\n" + "processes %lu\n" + "procs_running %lu\n" + "procs_blocked %lu\n", + pgpgin_cum, pgpgout_cum, + pgswapin_cum, pgswapout_cum, + intr_cum, + pswitch_cum, + boot_time, + forks_cum, + cpu_nrunnable_cum, + w_io_cum); +} + +/* + * lxpr_read_uptime(): read the contents of the "uptime" file. + * + * format is: "%.2lf, %.2lf",uptime_secs, idle_secs + * Use fixed point arithmetic to get 2 decimal places + */ +/* ARGSUSED */ +static void +lxpr_read_uptime(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t cpu_count = 0; + ulong_t idle_s; + ulong_t idle_cs; + ulong_t up_s; + ulong_t up_cs; + hrtime_t birthtime; + hrtime_t centi_sec = 10000000; /* 10^7 */ + + ASSERT(lxpnp->lxpr_type == LXPR_UPTIME); + + /* Calculate cumulative stats */ + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU; + do { + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. + */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + idle_cum += CPU_STATS(cp, sys.cpu_ticks_idle); + idle_cum += CPU_STATS(cp, sys.cpu_ticks_wait); + cpu_count += 1; + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + mutex_exit(&cpu_lock); + + /* Getting the Zone zsched process startup time */ + birthtime = LXPTOZ(lxpnp)->zone_zsched->p_mstart; + up_cs = (gethrtime() - birthtime) / centi_sec; + up_s = up_cs / 100; + up_cs %= 100; + + ASSERT(cpu_count > 0); + idle_cum /= cpu_count; + idle_s = idle_cum / hz; + idle_cs = idle_cum % hz; + idle_cs *= 100; + idle_cs /= hz; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d\n", up_s, up_cs, idle_s, idle_cs); +} + +static const char *amd_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "mp", + "nx", NULL, "mmxext", NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", "3dnowext", "3dnow" +}; + +static const char *amd_x_ecx[] = { + "lahf_lm", NULL, "svm", NULL, + "altmovcr8" +}; + +static const char *tm_x_edx[] = { + "recovery", "longrun", NULL, "lrti" +}; + +/* + * Intel calls no-execute "xd" in its docs, but Linux still reports it as "nx." 
+ */ +static const char *intc_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + "nx", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", NULL, NULL +}; + +static const char *intc_edx[] = { + "fpu", "vme", "de", "pse", + "tsc", "msr", "pae", "mce", + "cx8", "apic", NULL, "sep", + "mtrr", "pge", "mca", "cmov", + "pat", "pse36", "pn", "clflush", + NULL, "dts", "acpi", "mmx", + "fxsr", "sse", "sse2", "ss", + "ht", "tm", "ia64", "pbe" +}; + +/* + * "sse3" on linux is called "pni" (Prescott New Instructions). + */ +static const char *intc_ecx[] = { + "pni", NULL, NULL, "monitor", + "ds_cpl", NULL, NULL, "est", + "tm2", NULL, "cid", NULL, + NULL, "cx16", "xtpr" +}; + +static void +lxpr_read_cpuinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + int i; + uint32_t bits; + cpu_t *cp, *cpstart; + int pools_enabled; + const char **fp; + char brandstr[CPU_IDSTRLEN]; + struct cpuid_regs cpr; + int maxeax; + int std_ecx, std_edx, ext_ecx, ext_edx; + + ASSERT(lxpnp->lxpr_type == LXPR_CPUINFO); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU; + do { + /* + * This returns the maximum eax value for standard cpuid + * functions in eax. + */ + cpr.cp_eax = 0; + (void) cpuid_insn(cp, &cpr); + maxeax = cpr.cp_eax; + + /* + * Get standard x86 feature flags. + */ + cpr.cp_eax = 1; + (void) cpuid_insn(cp, &cpr); + std_ecx = cpr.cp_ecx; + std_edx = cpr.cp_edx; + + /* + * Now get extended feature flags. + */ + cpr.cp_eax = 0x80000001; + (void) cpuid_insn(cp, &cpr); + ext_ecx = cpr.cp_ecx; + ext_edx = cpr.cp_edx; + + (void) cpuid_getbrandstr(cp, brandstr, CPU_IDSTRLEN); + + lxpr_uiobuf_printf(uiobuf, + "processor\t: %d\n" + "vendor_id\t: %s\n" + "cpu family\t: %d\n" + "model\t\t: %d\n" + "model name\t: %s\n" + "stepping\t: %d\n" + "cpu MHz\t\t: %u.%03u\n", + cp->cpu_id, cpuid_getvendorstr(cp), cpuid_getfamily(cp), + cpuid_getmodel(cp), brandstr, cpuid_getstep(cp), + (uint32_t)(cpu_freq_hz / 1000000), + ((uint32_t)(cpu_freq_hz / 1000)) % 1000); + + lxpr_uiobuf_printf(uiobuf, "cache size\t: %u KB\n", + getl2cacheinfo(cp, NULL, NULL, NULL) / 1024); + + if (is_x86_feature(x86_featureset, X86FSET_HTT)) { + /* + * 'siblings' is used for HT-style threads + */ + lxpr_uiobuf_printf(uiobuf, + "physical id\t: %lu\n" + "siblings\t: %u\n", + pg_plat_hw_instance_id(cp, PGHW_CHIP), + cpuid_get_ncpu_per_chip(cp)); + } + + /* + * Since we're relatively picky about running on older hardware, + * we can be somewhat cavalier about the answers to these ones. + * + * In fact, given the hardware we support, we just say: + * + * fdiv_bug : no (if we're on a 64-bit kernel) + * hlt_bug : no + * f00f_bug : no + * coma_bug : no + * wp : yes (write protect in supervsr mode) + */ + lxpr_uiobuf_printf(uiobuf, + "fdiv_bug\t: %s\n" + "hlt_bug \t: no\n" + "f00f_bug\t: no\n" + "coma_bug\t: no\n" + "fpu\t\t: %s\n" + "fpu_exception\t: %s\n" + "cpuid level\t: %d\n" + "flags\t\t:", +#if defined(__i386) + fpu_pentium_fdivbug ? "yes" : "no", +#else + "no", +#endif /* __i386 */ + fpu_exists ? "yes" : "no", fpu_exists ? 
"yes" : "no", + maxeax); + + for (bits = std_edx, fp = intc_edx, i = 0; + i < sizeof (intc_edx) / sizeof (intc_edx[0]); fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + /* + * name additional features where appropriate + */ + switch (x86_vendor) { + case X86_VENDOR_Intel: + for (bits = ext_edx, fp = intc_x_edx, i = 0; + i < sizeof (intc_x_edx) / sizeof (intc_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + + case X86_VENDOR_AMD: + for (bits = ext_edx, fp = amd_x_edx, i = 0; + i < sizeof (amd_x_edx) / sizeof (amd_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + for (bits = ext_ecx, fp = amd_x_ecx, i = 0; + i < sizeof (amd_x_ecx) / sizeof (amd_x_ecx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + + case X86_VENDOR_TM: + for (bits = ext_edx, fp = tm_x_edx, i = 0; + i < sizeof (tm_x_edx) / sizeof (tm_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + default: + break; + } + + for (bits = std_ecx, fp = intc_ecx, i = 0; + i < sizeof (intc_ecx) / sizeof (intc_ecx[0]); fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + lxpr_uiobuf_printf(uiobuf, "\n\n"); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); +} + +/* ARGSUSED */ +static void +lxpr_read_fd(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_PID_FD_FD); + lxpr_uiobuf_seterr(uiobuf, EFAULT); +} + +/* + * lxpr_getattr(): Vnode operation for VOP_GETATTR() + */ +static int +lxpr_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + register lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + extern uint_t nproc; + int error; + + /* + * Return attributes of underlying vnode if ATTR_REAL + * + * but keep fd files with the symlink permissions + */ + if (lxpnp->lxpr_realvp != NULL && (flags & ATTR_REAL)) { + vnode_t *rvp = lxpnp->lxpr_realvp; + + /* + * withold attribute information to owner or root + */ + if ((error = VOP_ACCESS(rvp, 0, 0, cr, ct)) != 0) { + return (error); + } + + /* + * now its attributes + */ + if ((error = VOP_GETATTR(rvp, vap, flags, cr, ct)) != 0) { + return (error); + } + + /* + * if it's a file in lx /proc/pid/fd/xx then set its + * mode and keep it looking like a symlink + */ + if (type == LXPR_PID_FD_FD) { + vap->va_mode = lxpnp->lxpr_mode; + vap->va_type = vp->v_type; + vap->va_size = 0; + vap->va_nlink = 1; + } + return (0); + } + + /* Default attributes, that may be overridden below */ + bzero(vap, sizeof (*vap)); + vap->va_atime = vap->va_mtime = vap->va_ctime = lxpnp->lxpr_time; + vap->va_nlink = 1; + vap->va_type = vp->v_type; + vap->va_mode = lxpnp->lxpr_mode; + vap->va_fsid = vp->v_vfsp->vfs_dev; + vap->va_blksize = DEV_BSIZE; + vap->va_uid = lxpnp->lxpr_uid; + vap->va_gid = lxpnp->lxpr_gid; + vap->va_nodeid = lxpnp->lxpr_ino; + + switch (type) { + case LXPR_PROCDIR: + vap->va_nlink = nproc + 2 + PROCDIRFILES; + vap->va_size = (nproc + 2 + PROCDIRFILES) * LXPR_SDSIZE; + break; + case LXPR_PIDDIR: + vap->va_nlink = PIDDIRFILES; + vap->va_size = PIDDIRFILES * LXPR_SDSIZE; + break; + case LXPR_SELF: + vap->va_uid = crgetruid(curproc->p_cred); + vap->va_gid = crgetrgid(curproc->p_cred); + break; + default: + break; + } + 
+ vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size); + return (0); +} + +/* + * lxpr_access(): Vnode operation for VOP_ACCESS() + */ +static int +lxpr_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + int shift = 0; + proc_t *tp; + + /* lx /proc is a read only file system */ + if (mode & VWRITE) + return (EROFS); + + /* + * If this is a restricted file, check access permissions. + */ + switch (lxpnp->lxpr_type) { + case LXPR_PIDDIR: + return (0); + case LXPR_PID_CURDIR: + case LXPR_PID_ENV: + case LXPR_PID_EXE: + case LXPR_PID_MAPS: + case LXPR_PID_MEM: + case LXPR_PID_ROOTDIR: + case LXPR_PID_FDDIR: + case LXPR_PID_FD_FD: + if ((tp = lxpr_lock(lxpnp->lxpr_pid)) == NULL) + return (ENOENT); + if (tp != curproc && secpolicy_proc_access(cr) != 0 && + priv_proc_cred_perm(cr, tp, NULL, mode) != 0) { + lxpr_unlock(tp); + return (EACCES); + } + lxpr_unlock(tp); + default: + break; + } + + if (lxpnp->lxpr_realvp != NULL) { + /* + * For these we use the underlying vnode's accessibility. + */ + return (VOP_ACCESS(lxpnp->lxpr_realvp, mode, flags, cr, ct)); + } + + /* If user is root allow access regardless of permission bits */ + if (secpolicy_proc_access(cr) == 0) + return (0); + + /* + * Access check is based on only one of owner, group, public. If not + * owner, then check group. If not a member of the group, then check + * public access. + */ + if (crgetuid(cr) != lxpnp->lxpr_uid) { + shift += 3; + if (!groupmember((uid_t)lxpnp->lxpr_gid, cr)) + shift += 3; + } + + mode &= ~(lxpnp->lxpr_mode << shift); + + if (mode == 0) + return (0); + + return (EACCES); +} + +/* ARGSUSED */ +static vnode_t * +lxpr_lookup_not_a_dir(vnode_t *dp, char *comp) +{ + return (NULL); +} + +/* + * lxpr_lookup(): Vnode operation for VOP_LOOKUP() + */ +/* ARGSUSED */ +static int +lxpr_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type < LXPR_NFILES); + + /* + * we should never get here because the lookup + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict lookup permission to owner or root + */ + if ((error = lxpr_access(dp, VEXEC, 0, cr, ct)) != 0) { + return (error); + } + + /* + * Just return the parent vnode if that's where we are trying to go. + */ + if (strcmp(comp, "..") == 0) { + VN_HOLD(lxpnp->lxpr_parent); + *vpp = lxpnp->lxpr_parent; + return (0); + } + + /* + * Special handling for directory searches. Note: null component name + * denotes that the current directory is being searched. + */ + if ((dp->v_type == VDIR) && (*comp == '\0' || strcmp(comp, ".") == 0)) { + VN_HOLD(dp); + *vpp = dp; + return (0); + } + + *vpp = (lxpr_lookup_function[type](dp, comp)); + return ((*vpp == NULL) ? 
ENOENT : 0); +} + +/* + * Do a sequential search on the given directory table + */ +static vnode_t * +lxpr_lookup_common(vnode_t *dp, char *comp, proc_t *p, + lxpr_dirent_t *dirtab, int dirtablen) +{ + lxpr_node_t *lxpnp; + int count; + + for (count = 0; count < dirtablen; count++) { + if (strcmp(dirtab[count].d_name, comp) == 0) { + lxpnp = lxpr_getnode(dp, dirtab[count].d_type, p, 0); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + return (dp); + } + } + return (NULL); +} + +static vnode_t * +lxpr_lookup_piddir(vnode_t *dp, char *comp) +{ + proc_t *p; + + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PIDDIR); + + p = lxpr_lock(VTOLXP(dp)->lxpr_pid); + if (p == NULL) + return (NULL); + + dp = lxpr_lookup_common(dp, comp, p, piddir, PIDDIRFILES); + + lxpr_unlock(p); + + return (dp); +} + +/* + * Lookup one of the process's open files. + */ +static vnode_t * +lxpr_lookup_fddir(vnode_t *dp, char *comp) +{ + lxpr_node_t *dlxpnp = VTOLXP(dp); + lxpr_node_t *lxpnp; + vnode_t *vp = NULL; + proc_t *p; + file_t *fp; + uint_t fd; + int c; + uf_entry_t *ufp; + uf_info_t *fip; + + ASSERT(dlxpnp->lxpr_type == LXPR_PID_FDDIR); + + /* + * convert the string rendition of the filename + * to a file descriptor + */ + fd = 0; + while ((c = *comp++) != '\0') { + int ofd; + if (c < '0' || c > '9') + return (NULL); + + ofd = fd; + fd = 10*fd + c - '0'; + /* integer overflow */ + if (fd / 10 != ofd) + return (NULL); + } + + /* + * get the proc to work with and lock it + */ + p = lxpr_lock(dlxpnp->lxpr_pid); + if ((p == NULL)) + return (NULL); + + /* + * If the process is a zombie or system process + * it can't have any open files. + */ + if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas)) { + lxpr_unlock(p); + return (NULL); + } + + /* + * get us a fresh node/vnode + */ + lxpnp = lxpr_getnode(dp, LXPR_PID_FD_FD, p, fd); + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we dereference into fi_list. + */ + mutex_exit(&p->p_lock); + + /* + * get open file info + */ + fip = (&(p)->p_user.u_finfo); + mutex_enter(&fip->fi_lock); + + if (fd < fip->fi_nfiles) { + UF_ENTER(ufp, fip, fd); + /* + * ensure the fd is still kosher. + * it may have gone between the readdir and + * the lookup + */ + if (fip->fi_list[fd].uf_file == NULL) { + mutex_exit(&fip->fi_lock); + UF_EXIT(ufp); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + lxpr_freenode(lxpnp); + return (NULL); + } + + if ((fp = ufp->uf_file) != NULL) + vp = fp->f_vnode; + UF_EXIT(ufp); + } + mutex_exit(&fip->fi_lock); + + if (vp == NULL) { + mutex_enter(&p->p_lock); + lxpr_unlock(p); + lxpr_freenode(lxpnp); + return (NULL); + } else { + /* + * Fill in the lxpr_node so future references will be able to + * find the underlying vnode. The vnode is held on the realvp. + */ + lxpnp->lxpr_realvp = vp; + VN_HOLD(lxpnp->lxpr_realvp); + } + + mutex_enter(&p->p_lock); + lxpr_unlock(p); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + + return (dp); +} + +static vnode_t * +lxpr_lookup_netdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_NETDIR); + + dp = lxpr_lookup_common(dp, comp, NULL, netdir, NETDIRFILES); + + return (dp); +} + +static vnode_t * +lxpr_lookup_procdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PROCDIR); + + /* + * We know all the names of files & dirs in our file system structure + * except those that are pid names. These change as pids are created/ + * deleted etc., so we just look for a number as the first char to see + * if we are we doing pid lookups. 
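+ * For example, a lookup of "1234" is parsed into pid 1234 below and,
+ * provided that process exists and is visible to the caller, satisfied
+ * with a freshly allocated LXPR_PIDDIR node.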
+ * + * Don't need to check for "self" as it is implemented as a symlink + */ + if (*comp >= '0' && *comp <= '9') { + pid_t pid = 0; + lxpr_node_t *lxpnp = NULL; + proc_t *p; + int c; + + while ((c = *comp++) != '\0') + pid = 10 * pid + c - '0'; + + /* + * Can't continue if the process is still loading or it doesn't + * really exist yet (or maybe it just died!) + */ + p = lxpr_lock(pid); + if (p == NULL) + return (NULL); + + if (secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + lxpr_unlock(p); + return (NULL); + } + + /* + * allocate and fill in a new lxpr node + */ + lxpnp = lxpr_getnode(dp, LXPR_PIDDIR, p, 0); + + lxpr_unlock(p); + + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + + return (dp); + } + + /* Lookup fixed names */ + return (lxpr_lookup_common(dp, comp, NULL, lxpr_dir, PROCDIRFILES)); +} + +/* + * lxpr_readdir(): Vnode operation for VOP_READDIR() + */ +/* ARGSUSED */ +static int +lxpr_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + ssize_t uresid; + off_t uoffset; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type < LXPR_NFILES); + + /* + * we should never get here because the readdir + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict readdir permission to owner or root + */ + if ((error = lxpr_access(dp, VREAD, 0, cr, ct)) != 0) + return (error); + + uoffset = uiop->uio_offset; + uresid = uiop->uio_resid; + + /* can't do negative reads */ + if (uoffset < 0 || uresid <= 0) + return (EINVAL); + + /* can't read directory entries that don't exist! */ + if (uoffset % LXPR_SDSIZE) + return (ENOENT); + + return (lxpr_readdir_function[lxpnp->lxpr_type](lxpnp, uiop, eofp)); +} + +/* ARGSUSED */ +static int +lxpr_readdir_not_a_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + return (ENOTDIR); +} + +/* + * This has the common logic for returning directory entries + */ +static int +lxpr_readdir_common(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp, + lxpr_dirent_t *dirtab, int dirtablen) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + + oresid = uiop->uio_resid; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Satisfy user request + */ + while ((uresid = uiop->uio_resid) > 0) { + int dirindex; + off_t uoffset; + int reclen; + int error; + + uoffset = uiop->uio_offset; + dirindex = (uoffset / LXPR_SDSIZE) - 2; + + if (uoffset == 0) { + + dirent->d_ino = lxpnp->lxpr_ino; + dirent->d_name[0] = '.'; + dirent->d_name[1] = '\0'; + reclen = DIRENT64_RECLEN(1); + + } else if (uoffset == LXPR_SDSIZE) { + + dirent->d_ino = lxpr_parentinode(lxpnp); + dirent->d_name[0] = '.'; + dirent->d_name[1] = '.'; + dirent->d_name[2] = '\0'; + reclen = DIRENT64_RECLEN(2); + + } else if (dirindex >= 0 && dirindex < dirtablen) { + int slen = strlen(dirtab[dirindex].d_name); + + dirent->d_ino = lxpr_inode(dirtab[dirindex].d_type, + lxpnp->lxpr_pid, 0); + + VERIFY(slen < LXPNSIZ); + (void) strcpy(dirent->d_name, dirtab[dirindex].d_name); + reclen = DIRENT64_RECLEN(slen); + + } else { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the 
size of the data to transfer is greater + * that that requested then we can't do it this transfer. + */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) { + return (EINVAL); + } + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + return (error); + + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + /* Have run out of space, but could have just done last table entry */ + if (eofp) { + *eofp = + (uiop->uio_offset >= ((dirtablen+2) * LXPR_SDSIZE)) ? 1 : 0; + } + return (0); +} + + +static int +lxpr_readdir_procdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + zoneid_t zoneid; + pid_t pid; + int error; + int ceof; + + ASSERT(lxpnp->lxpr_type == LXPR_PROCDIR); + + oresid = uiop->uio_resid; + zoneid = LXPTOZ(lxpnp)->zone_id; + + /* + * We return directory entries in the order: "." and ".." then the + * unique lxproc files, then the directories corresponding to the + * running processes. We have defined this as the ordering because + * it allows us to more easily keep track of where we are betwen calls + * to getdents(). If the number of processes changes between calls + * then we can't lose track of where we are in the lxproc files. + */ + + /* Do the fixed entries */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, lxpr_dir, + PROCDIRFILES); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + return (error); + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* Do the process entries */ + while ((uresid = uiop->uio_resid) > 0) { + proc_t *p; + int len; + int reclen; + int i; + + uoffset = uiop->uio_offset; + + /* + * Stop when entire proc table has been examined. + */ + i = (uoffset / LXPR_SDSIZE) - 2 - PROCDIRFILES; + if (i < 0 || i >= v.v_proc) { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + mutex_enter(&pidlock); + + /* + * Skip indices for which there is no pid_entry, PIDs for + * which there is no corresponding process, a PID of 0, + * and anything the security policy doesn't allow + * us to look at. + */ + if ((p = pid_entry(i)) == NULL || p->p_stat == SIDL || + p->p_pid == 0 || + secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + mutex_exit(&pidlock); + goto next; + } + mutex_exit(&pidlock); + + /* + * Convert pid to the Linux default of 1 if we're the zone's + * init process, otherwise use the value from the proc + * structure + */ + pid = ((p->p_pid != curproc->p_zone->zone_proc_initpid) ? + p->p_pid : 1); + + /* + * If this /proc was mounted in the global zone, view + * all procs; otherwise, only view zone member procs. 
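+ * Processes that fail this test are silently skipped rather than
+ * terminating the readdir.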
+ */ + if (zoneid != GLOBAL_ZONEID && p->p_zone->zone_id != zoneid) { + goto next; + } + + ASSERT(p->p_stat != 0); + + dirent->d_ino = lxpr_inode(LXPR_PIDDIR, pid, 0); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", pid); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the size of the data to transfer is greater + * that that requested then we can't do it this transfer. + */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + return (EINVAL); + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, in the increment of this for + * the loop, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + return (error); +next: + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + if (eofp != NULL) { + *eofp = (uiop->uio_offset >= + ((v.v_proc + PROCDIRFILES + 2) * LXPR_SDSIZE)) ? 1 : 0; + } + + return (0); +} + +static int +lxpr_readdir_piddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + proc_t *p; + + ASSERT(lxpnp->lxpr_type == LXPR_PIDDIR); + + /* can't read its contents if it died */ + mutex_enter(&pidlock); + + p = prfind((lxpnp->lxpr_pid == 1) ? + curproc->p_zone->zone_proc_initpid : lxpnp->lxpr_pid); + + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (ENOENT); + } + mutex_exit(&pidlock); + + return (lxpr_readdir_common(lxpnp, uiop, eofp, piddir, PIDDIRFILES)); +} + +static int +lxpr_readdir_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_NETDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, netdir, NETDIRFILES)); +} + +static int +lxpr_readdir_fddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + int error; + int ceof; + proc_t *p; + int fddirsize = -1; + uf_info_t *fip; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_FDDIR); + + oresid = uiop->uio_resid; + + /* can't read its contents if it died */ + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) + return (ENOENT); + + if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas)) + fddirsize = 0; + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we iterate over its fi_list. + */ + mutex_exit(&p->p_lock); + + /* Get open file info */ + fip = (&(p)->p_user.u_finfo); + mutex_enter(&fip->fi_lock); + + if (fddirsize == -1) + fddirsize = fip->fi_nfiles; + + /* Do the fixed entries (in this case just "." & "..") */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, 0, 0); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + goto out; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Loop until user's request is satisfied or until + * all file descriptors have been examined. 
+ */ + for (; (uresid = uiop->uio_resid) > 0; + uiop->uio_offset = uoffset + LXPR_SDSIZE) { + int reclen; + int fd; + int len; + + uoffset = uiop->uio_offset; + + /* + * Stop at the end of the fd list + */ + fd = (uoffset / LXPR_SDSIZE) - 2; + if (fd < 0 || fd >= fddirsize) { + if (eofp) { + *eofp = 1; + } + goto out; + } + + if (fip->fi_list[fd].uf_file == NULL) + continue; + + dirent->d_ino = lxpr_inode(LXPR_PID_FD_FD, lxpnp->lxpr_pid, fd); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", fd); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + error = EINVAL; + goto out; + } + + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + goto out; + } + + if (eofp != NULL) { + *eofp = + (uiop->uio_offset >= ((fddirsize+2) * LXPR_SDSIZE)) ? 1 : 0; + } + +out: + mutex_exit(&fip->fi_lock); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + return (error); +} + + +/* + * lxpr_readlink(): Vnode operation for VOP_READLINK() + */ +/* ARGSUSED */ +static int +lxpr_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct) +{ + char bp[MAXPATHLEN + 1]; + size_t buflen = sizeof (bp); + lxpr_node_t *lxpnp = VTOLXP(vp); + vnode_t *rvp = lxpnp->lxpr_realvp; + pid_t pid; + int error = 0; + + /* must be a symbolic link file */ + if (vp->v_type != VLNK) + return (EINVAL); + + /* Try to produce a symlink name for anything that has a realvp */ + if (rvp != NULL) { + if ((error = lxpr_access(vp, VREAD, 0, CRED(), ct)) != 0) + return (error); + if ((error = vnodetopath(NULL, rvp, bp, buflen, CRED())) != 0) + return (error); + } else { + switch (lxpnp->lxpr_type) { + case LXPR_SELF: + /* + * Convert pid to the Linux default of 1 if we're the + * zone's init process + */ + pid = ((curproc->p_pid != + curproc->p_zone->zone_proc_initpid) + ? curproc->p_pid : 1); + + /* + * Don't need to check result as every possible int + * will fit within MAXPATHLEN bytes. + */ + (void) snprintf(bp, buflen, "%d", pid); + break; + case LXPR_PID_CURDIR: + case LXPR_PID_ROOTDIR: + case LXPR_PID_EXE: + return (EACCES); + default: + /* + * Need to return error so that nothing thinks + * that the symlink is empty and hence "." + */ + return (EINVAL); + } + } + + /* copy the link data to user space */ + return (uiomove(bp, strlen(bp), UIO_READ, uiop)); +} + +/* + * lxpr_inactive(): Vnode operation for VOP_INACTIVE() + * Vnode is no longer referenced, deallocate the file + * and all its resources. 
+ */ +/* ARGSUSED */ +static void +lxpr_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + lxpr_freenode(VTOLXP(vp)); +} + +/* + * lxpr_sync(): Vnode operation for VOP_SYNC() + */ +static int +lxpr_sync() +{ + /* + * Nothing to sync but this function must never fail + */ + return (0); +} + +/* + * lxpr_cmp(): Vnode operation for VOP_CMP() + */ +static int +lxpr_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) +{ + vnode_t *rvp; + + while (vn_matchops(vp1, lxpr_vnodeops) && + (rvp = VTOLXP(vp1)->lxpr_realvp) != NULL) { + vp1 = rvp; + } + + while (vn_matchops(vp2, lxpr_vnodeops) && + (rvp = VTOLXP(vp2)->lxpr_realvp) != NULL) { + vp2 = rvp; + } + + if (vn_matchops(vp1, lxpr_vnodeops) || vn_matchops(vp2, lxpr_vnodeops)) + return (vp1 == vp2); + + return (VOP_CMP(vp1, vp2, ct)); +} + +/* + * lxpr_realvp(): Vnode operation for VOP_REALVP() + */ +static int +lxpr_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) +{ + vnode_t *rvp; + + if ((rvp = VTOLXP(vp)->lxpr_realvp) != NULL) { + vp = rvp; + if (VOP_REALVP(vp, &rvp, ct) == 0) + vp = rvp; + } + + *vpp = vp; + return (0); +} diff --git a/usr/src/uts/common/fs/lxproc/lxproc.h b/usr/src/uts/common/fs/lxproc/lxproc.h new file mode 100644 index 0000000000..e3718372e0 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxproc.h @@ -0,0 +1,277 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2015 Joyent, Inc. All rights reserved. 
+ */ + +#ifdef _LXPROC_BRANDED_H +#error Attempted to include native lxproc.h after branded lx_proc.h +#endif + +#ifndef _LXPROC_H +#define _LXPROC_H +#define _LXPROC_NATIVE_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lxproc.h: declarations, data structures and macros for lxprocfs + */ +#include <sys/types.h> +#include <sys/param.h> +#include <sys/policy.h> +#include <sys/debug.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/user.h> +#include <sys/t_lock.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/priv.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/statvfs.h> +#include <sys/cmn_err.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/utsname.h> +#include <sys/dnlc.h> +#include <sys/atomic.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <vm/as.h> +#include <vm/anon.h> + +#define LX_SIGHUP 1 +#define LX_SIGINT 2 +#define LX_SIGQUIT 3 +#define LX_SIGILL 4 +#define LX_SIGTRAP 5 +#define LX_SIGABRT 6 +#define LX_SIGIOT 6 +#define LX_SIGBUS 7 +#define LX_SIGFPE 8 +#define LX_SIGKILL 9 +#define LX_SIGUSR1 10 +#define LX_SIGSEGV 11 +#define LX_SIGUSR2 12 +#define LX_SIGPIPE 13 +#define LX_SIGALRM 14 +#define LX_SIGTERM 15 +#define LX_SIGSTKFLT 16 +#define LX_SIGCHLD 17 +#define LX_SIGCONT 18 +#define LX_SIGSTOP 19 +#define LX_SIGTSTP 20 +#define LX_SIGTTIN 21 +#define LX_SIGTTOU 22 +#define LX_SIGURG 23 +#define LX_SIGXCPU 24 +#define LX_SIGXFSZ 25 +#define LX_SIGVTALRM 26 +#define LX_SIGPROF 27 +#define LX_SIGWINCH 28 +#define LX_SIGIO 29 +#define LX_SIGPOLL LX_SIGIO +#define LX_SIGPWR 30 +#define LX_SIGSYS 31 +#define LX_SIGUNUSED 31 + +#define LX_NSIG 64 /* Linux _NSIG */ + +#define LX_SIGRTMIN 32 +#define LX_SIGRTMAX LX_NSIG + +/* + * Convert a vnode into an lxpr_mnt_t + */ +#define VTOLXPM(vp) ((lxpr_mnt_t *)(vp)->v_vfsp->vfs_data) + +/* + * convert a vnode into an lxpr_node + */ +#define VTOLXP(vp) ((lxpr_node_t *)(vp)->v_data) + +/* + * convert a lxprnode into a vnode + */ +#define LXPTOV(lxpnp) ((lxpnp)->lxpr_vnode) + +/* + * convert a lxpr_node into zone for fs + */ +#define LXPTOZ(lxpnp) \ + (((lxpr_mnt_t *)(lxpnp)->lxpr_vnode->v_vfsp->vfs_data)->lxprm_zone) + +#define LXPNSIZ 256 /* max size of lx /proc file name entries */ + +/* + * Pretend that a directory entry takes 16 bytes + */ +#define LXPR_SDSIZE 16 + +/* + * Node/file types for lx /proc files + * (directories and files contained therein). 
+ */ +typedef enum lxpr_nodetype { + LXPR_PROCDIR, /* /proc */ + LXPR_PIDDIR, /* /proc/<pid> */ + LXPR_PID_CMDLINE, /* /proc/<pid>/cmdline */ + LXPR_PID_CPU, /* /proc/<pid>/cpu */ + LXPR_PID_CURDIR, /* /proc/<pid>/cwd */ + LXPR_PID_ENV, /* /proc/<pid>/environ */ + LXPR_PID_EXE, /* /proc/<pid>/exe */ + LXPR_PID_MAPS, /* /proc/<pid>/maps */ + LXPR_PID_MEM, /* /proc/<pid>/mem */ + LXPR_PID_ROOTDIR, /* /proc/<pid>/root */ + LXPR_PID_STAT, /* /proc/<pid>/stat */ + LXPR_PID_STATM, /* /proc/<pid>/statm */ + LXPR_PID_STATUS, /* /proc/<pid>/status */ + LXPR_PID_FDDIR, /* /proc/<pid>/fd */ + LXPR_PID_FD_FD, /* /proc/<pid>/fd/nn */ + LXPR_CMDLINE, /* /proc/cmdline */ + LXPR_CPUINFO, /* /proc/cpuinfo */ + LXPR_DEVICES, /* /proc/devices */ + LXPR_DMA, /* /proc/dma */ + LXPR_FILESYSTEMS, /* /proc/filesystems */ + LXPR_INTERRUPTS, /* /proc/interrupts */ + LXPR_IOPORTS, /* /proc/ioports */ + LXPR_KCORE, /* /proc/kcore */ + LXPR_KMSG, /* /proc/kmsg */ + LXPR_LOADAVG, /* /proc/loadavg */ + LXPR_MEMINFO, /* /proc/meminfo */ + LXPR_MOUNTS, /* /proc/mounts */ + LXPR_NETDIR, /* /proc/net */ + LXPR_NET_ARP, /* /proc/net/arp */ + LXPR_NET_DEV, /* /proc/net/dev */ + LXPR_NET_DEV_MCAST, /* /proc/net/dev_mcast */ + LXPR_NET_IGMP, /* /proc/net/igmp */ + LXPR_NET_IP_MR_CACHE, /* /proc/net/ip_mr_cache */ + LXPR_NET_IP_MR_VIF, /* /proc/net/ip_mr_vif */ + LXPR_NET_MCFILTER, /* /proc/net/mcfilter */ + LXPR_NET_NETSTAT, /* /proc/net/netstat */ + LXPR_NET_RAW, /* /proc/net/raw */ + LXPR_NET_ROUTE, /* /proc/net/route */ + LXPR_NET_RPC, /* /proc/net/rpc */ + LXPR_NET_RT_CACHE, /* /proc/net/rt_cache */ + LXPR_NET_SOCKSTAT, /* /proc/net/sockstat */ + LXPR_NET_SNMP, /* /proc/net/snmp */ + LXPR_NET_STAT, /* /proc/net/stat */ + LXPR_NET_TCP, /* /proc/net/tcp */ + LXPR_NET_UDP, /* /proc/net/udp */ + LXPR_NET_UNIX, /* /proc/net/unix */ + LXPR_PARTITIONS, /* /proc/partitions */ + LXPR_SELF, /* /proc/self */ + LXPR_STAT, /* /proc/stat */ + LXPR_UPTIME, /* /proc/uptime */ + LXPR_VERSION, /* /proc/version */ + LXPR_NFILES /* number of lx /proc file types */ +} lxpr_nodetype_t; + +/* + * Number of fds allowed for in the inode number calculation + * per process (if a process has more fds then inode numbers + * may be duplicated) + */ +#define LXPR_FD_PERPROC 2000 + +/* + * external dirent characteristics + */ +#define LXPRMAXNAMELEN 14 +typedef struct { + lxpr_nodetype_t d_type; + char d_name[LXPRMAXNAMELEN]; +} lxpr_dirent_t; + +/* + * This is the lxprocfs private data object + * which is attached to v_data in the vnode structure + */ +typedef struct lxpr_node { + lxpr_nodetype_t lxpr_type; /* type of this node */ + vnode_t *lxpr_vnode; /* vnode for the node */ + vnode_t *lxpr_parent; /* parent directory */ + vnode_t *lxpr_realvp; /* real vnode, file in dirs */ + timestruc_t lxpr_time; /* creation etc time for file */ + mode_t lxpr_mode; /* file mode bits */ + uid_t lxpr_uid; /* file owner */ + gid_t lxpr_gid; /* file group owner */ + pid_t lxpr_pid; /* pid of proc referred to */ + ino_t lxpr_ino; /* node id */ +} lxpr_node_t; + +struct zone; /* forward declaration */ + +/* + * This is the lxprocfs private data object + * which is attached to vfs_data in the vfs structure + */ +typedef struct lxpr_mnt { + lxpr_node_t *lxprm_node; /* node at root of proc mount */ + struct zone *lxprm_zone; /* zone for this mount */ + ldi_ident_t lxprm_li; /* ident for ldi */ +} lxpr_mnt_t; + +extern vnodeops_t *lxpr_vnodeops; +extern int nproc_highbit; /* highbit(v.v_nproc) */ + +typedef struct mounta mounta_t; + +extern void lxpr_initnodecache(); +extern 
void lxpr_fininodecache(); +extern void lxpr_initrootnode(lxpr_node_t **, vfs_t *); +extern ino_t lxpr_inode(lxpr_nodetype_t, pid_t, int); +extern ino_t lxpr_parentinode(lxpr_node_t *); +extern lxpr_node_t *lxpr_getnode(vnode_t *, lxpr_nodetype_t, proc_t *, int); +extern void lxpr_freenode(lxpr_node_t *); + +typedef struct lxpr_uiobuf lxpr_uiobuf_t; +extern lxpr_uiobuf_t *lxpr_uiobuf_new(uio_t *); +extern void lxpr_uiobuf_free(lxpr_uiobuf_t *); +extern int lxpr_uiobuf_flush(lxpr_uiobuf_t *); +extern void lxpr_uiobuf_seek(lxpr_uiobuf_t *, offset_t); +extern void lxpr_uiobuf_write(lxpr_uiobuf_t *, const char *, size_t); +extern void lxpr_uiobuf_printf(lxpr_uiobuf_t *, const char *, ...); +extern void lxpr_uiobuf_seterr(lxpr_uiobuf_t *, int); + +proc_t *lxpr_lock(pid_t); +void lxpr_unlock(proc_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LXPROC_H */ diff --git a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c index 207a708771..2176dcb9de 100644 --- a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c +++ b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, Joyent, Inc. All rights reserved. */ /* diff --git a/usr/src/uts/common/fs/nfs/nfs3_vnops.c b/usr/src/uts/common/fs/nfs/nfs3_vnops.c index b7354c168a..cd1e08d5d7 100644 --- a/usr/src/uts/common/fs/nfs/nfs3_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs3_vnops.c @@ -29,7 +29,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -3353,10 +3353,9 @@ nfs3rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, if (nvp) vnevent_rename_dest(nvp, ndvp, nnm, ct); - if (odvp != ndvp) - vnevent_rename_dest_dir(ndvp, ct); ASSERT(ovp != NULL); vnevent_rename_src(ovp, odvp, onm, ct); + vnevent_rename_dest_dir(ndvp, ovp, nnm, ct); } if (nvp) { diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c b/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c index ce0c9485a6..3ee41939ac 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c +++ b/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c @@ -22,6 +22,7 @@ /* * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. */ #include <sys/systm.h> @@ -178,12 +179,12 @@ pseudo_exportfs(vnode_t *vp, fid_t *fid, struct exp_visible *vis_head, kex = &exi->exi_export; kex->ex_flags = EX_PSEUDO; - vpathlen = vp->v_path ? strlen(vp->v_path) : 0; + vpathlen = strlen(vp->v_path); kex->ex_pathlen = vpathlen + strlen(PSEUDOFS_SUFFIX); kex->ex_path = kmem_alloc(kex->ex_pathlen + 1, KM_SLEEP); if (vpathlen) - (void) strcpy(kex->ex_path, vp->v_path); + (void) strncpy(kex->ex_path, vp->v_path, vpathlen); (void) strcpy(kex->ex_path + vpathlen, PSEUDOFS_SUFFIX); /* Transfer the secinfo data from exdata to this new pseudo node */ diff --git a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c index 151cb62403..55f6c95289 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c +++ b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c @@ -22,6 +22,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, Joyent, Inc. All rights reserved. 
*/ /* diff --git a/usr/src/uts/common/fs/nfs/nfs4_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_vnops.c index d6bf384a8b..2b3fdfdd55 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c @@ -34,7 +34,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ #include <sys/param.h> @@ -8061,8 +8061,9 @@ link_call: * vnode if it already existed. */ if (error == 0) { - vnode_t *tvp; + vnode_t *tvp, *tovp; rnode4_t *trp; + /* * Notify the vnode. Each links is represented by * a different vnode, in nfsv4. @@ -8075,23 +8076,20 @@ link_call: vnevent_rename_dest(tvp, ndvp, nnm, ct); } - /* - * if the source and destination directory are not the - * same notify the destination directory. - */ - if (VTOR4(odvp) != VTOR4(ndvp)) { - trp = VTOR4(ndvp); - tvp = ndvp; - if (IS_SHADOW(ndvp, trp)) - tvp = RTOV4(trp); - vnevent_rename_dest_dir(tvp, ct); - } - trp = VTOR4(ovp); - tvp = ovp; + tovp = ovp; if (IS_SHADOW(ovp, trp)) + tovp = RTOV4(trp); + + vnevent_rename_src(tovp, odvp, onm, ct); + + trp = VTOR4(ndvp); + tvp = ndvp; + + if (IS_SHADOW(ndvp, trp)) tvp = RTOV4(trp); - vnevent_rename_src(tvp, odvp, onm, ct); + + vnevent_rename_dest_dir(tvp, tovp, nnm, ct); } if (nvp) { diff --git a/usr/src/uts/common/fs/nfs/nfs_auth.c b/usr/src/uts/common/fs/nfs/nfs_auth.c index 3410340581..a8bcfdf438 100644 --- a/usr/src/uts/common/fs/nfs/nfs_auth.c +++ b/usr/src/uts/common/fs/nfs/nfs_auth.c @@ -22,6 +22,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 Joyent, Inc. All rights reserved. */ #include <sys/param.h> @@ -559,11 +560,16 @@ retry: *access = res.ares.auth_perm; *srv_uid = res.ares.auth_srv_uid; *srv_gid = res.ares.auth_srv_gid; - *srv_gids_cnt = res.ares.auth_srv_gids.len; - *srv_gids = kmem_alloc(*srv_gids_cnt * sizeof (gid_t), - KM_SLEEP); - bcopy(res.ares.auth_srv_gids.val, *srv_gids, - *srv_gids_cnt * sizeof (gid_t)); + + if ((*srv_gids_cnt = res.ares.auth_srv_gids.len) != 0) { + *srv_gids = kmem_alloc(*srv_gids_cnt * + sizeof (gid_t), KM_SLEEP); + bcopy(res.ares.auth_srv_gids.val, *srv_gids, + *srv_gids_cnt * sizeof (gid_t)); + } else { + *srv_gids = NULL; + } + break; case NFSAUTH_DR_EFAIL: @@ -1114,9 +1120,13 @@ wait: if (gid != NULL) *gid = p->auth_srv_gid; if (ngids != NULL && gids != NULL) { - *ngids = p->auth_srv_ngids; - *gids = kmem_alloc(*ngids * sizeof (gid_t), KM_SLEEP); - bcopy(p->auth_srv_gids, *gids, *ngids * sizeof (gid_t)); + if ((*ngids = p->auth_srv_ngids) != 0) { + size_t sz = *ngids * sizeof (gid_t); + *gids = kmem_alloc(sz, KM_SLEEP); + bcopy(p->auth_srv_gids, *gids, sz); + } else { + *gids = NULL; + } } access = p->auth_access; diff --git a/usr/src/uts/common/fs/nfs/nfs_server.c b/usr/src/uts/common/fs/nfs/nfs_server.c index 7e94c62734..dcc64def59 100644 --- a/usr/src/uts/common/fs/nfs/nfs_server.c +++ b/usr/src/uts/common/fs/nfs/nfs_server.c @@ -23,6 +23,7 @@ * Copyright (c) 2011 Bayard G. Bell. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. 
*/ /* @@ -2570,6 +2571,9 @@ nfs_srvinit(void) { int error; + if (getzoneid() != GLOBAL_ZONEID) + return (EACCES); + error = nfs_exportinit(); if (error != 0) return (error); @@ -3208,7 +3212,7 @@ nfs_getflabel(vnode_t *vp, struct exportinfo *exi) char *path; mutex_enter(&vp->v_lock); - if (vp->v_path != NULL) { + if (vp->v_path != vn_vpath_empty) { zone = zone_find_by_any_path(vp->v_path, B_FALSE); mutex_exit(&vp->v_lock); } else { diff --git a/usr/src/uts/common/fs/nfs/nfs_vfsops.c b/usr/src/uts/common/fs/nfs/nfs_vfsops.c index 57b21778b4..ffd5380a86 100644 --- a/usr/src/uts/common/fs/nfs/nfs_vfsops.c +++ b/usr/src/uts/common/fs/nfs/nfs_vfsops.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, Joyent, Inc. All rights reserved. * * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. * All rights reserved. diff --git a/usr/src/uts/common/fs/nfs/nfs_vnops.c b/usr/src/uts/common/fs/nfs/nfs_vnops.c index 1a1082bcb8..8c321c8aa3 100644 --- a/usr/src/uts/common/fs/nfs/nfs_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs_vnops.c @@ -26,7 +26,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -2688,11 +2688,9 @@ nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, if (nvp) vnevent_rename_dest(nvp, ndvp, nnm, ct); - if (odvp != ndvp) - vnevent_rename_dest_dir(ndvp, ct); - ASSERT(ovp != NULL); vnevent_rename_src(ovp, odvp, onm, ct); + vnevent_rename_dest_dir(ndvp, ovp, nnm, ct); } if (nvp) { diff --git a/usr/src/uts/common/fs/pcfs/pc_dir.c b/usr/src/uts/common/fs/pcfs/pc_dir.c index a9ee604b7c..21a0b1a4bd 100644 --- a/usr/src/uts/common/fs/pcfs/pc_dir.c +++ b/usr/src/uts/common/fs/pcfs/pc_dir.c @@ -24,6 +24,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + #include <sys/param.h> #include <sys/errno.h> #include <sys/systm.h> @@ -822,10 +826,10 @@ top: return (error); } } -out: - vnevent_rename_src(PCTOV(pcp), PCTOV(dp), snm, ctp); - if (dp != tdp) { - vnevent_rename_dest_dir(PCTOV(tdp), ctp); + + if (error == 0) { + vnevent_rename_src(PCTOV(pcp), PCTOV(dp), snm, ctp); + vnevent_rename_dest_dir(PCTOV(tdp), PCTOV(pcp), tnm, ctp); } VN_RELE(PCTOV(pcp)); diff --git a/usr/src/uts/common/fs/portfs/port.c b/usr/src/uts/common/fs/portfs/port.c index 14be8cbbae..11b7386269 100644 --- a/usr/src/uts/common/fs/portfs/port.c +++ b/usr/src/uts/common/fs/portfs/port.c @@ -24,7 +24,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2015 Joyent, Inc. All rights reserved. 
+ */ #include <sys/types.h> #include <sys/systm.h> @@ -1381,12 +1383,18 @@ portnowait: if (model == DATAMODEL_NATIVE) { eventsz = sizeof (port_event_t); - kevp = kmem_alloc(eventsz * nmax, KM_NOSLEEP); - if (kevp == NULL) { - if (nmax > pp->port_max_list) - nmax = pp->port_max_list; - kevp = kmem_alloc(eventsz * nmax, KM_SLEEP); + + if (nmax == 0) { + kevp = NULL; + } else { + kevp = kmem_alloc(eventsz * nmax, KM_NOSLEEP); + if (kevp == NULL) { + if (nmax > pp->port_max_list) + nmax = pp->port_max_list; + kevp = kmem_alloc(eventsz * nmax, KM_SLEEP); + } } + results = kevp; lev = NULL; /* start with first event in the queue */ for (nevents = 0; nevents < nmax; ) { @@ -1423,12 +1431,18 @@ portnowait: port_event32_t *kevp32; eventsz = sizeof (port_event32_t); - kevp32 = kmem_alloc(eventsz * nmax, KM_NOSLEEP); - if (kevp32 == NULL) { - if (nmax > pp->port_max_list) - nmax = pp->port_max_list; - kevp32 = kmem_alloc(eventsz * nmax, KM_SLEEP); + + if (nmax == 0) { + kevp32 = NULL; + } else { + kevp32 = kmem_alloc(eventsz * nmax, KM_NOSLEEP); + if (kevp32 == NULL) { + if (nmax > pp->port_max_list) + nmax = pp->port_max_list; + kevp32 = kmem_alloc(eventsz * nmax, KM_SLEEP); + } } + results = kevp32; lev = NULL; /* start with first event in the queue */ for (nevents = 0; nevents < nmax; ) { diff --git a/usr/src/uts/common/fs/portfs/port_vnops.c b/usr/src/uts/common/fs/portfs/port_vnops.c index b2f5088e06..ab95c0a1f8 100644 --- a/usr/src/uts/common/fs/portfs/port_vnops.c +++ b/usr/src/uts/common/fs/portfs/port_vnops.c @@ -24,6 +24,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + #include <sys/types.h> #include <sys/vnode.h> #include <sys/vfs_opreg.h> @@ -294,14 +298,10 @@ port_poll(vnode_t *vp, short events, int anyyet, short *reventsp, levents |= POLLOUT; levents &= events; *reventsp = levents; - if (levents == 0) { - if (!anyyet) { - *phpp = &pp->port_pollhd; - portq->portq_flags |= - events & POLLIN ? PORTQ_POLLIN : 0; - portq->portq_flags |= - events & POLLOUT ? PORTQ_POLLOUT : 0; - } + if ((levents == 0 && !anyyet) || (events & POLLET)) { + *phpp = &pp->port_pollhd; + portq->portq_flags |= events & POLLIN ? PORTQ_POLLIN : 0; + portq->portq_flags |= events & POLLOUT ? PORTQ_POLLOUT : 0; } mutex_exit(&portq->portq_mutex); return (0); diff --git a/usr/src/uts/common/fs/proc/prargv.c b/usr/src/uts/common/fs/proc/prargv.c new file mode 100644 index 0000000000..a4ad82d661 --- /dev/null +++ b/usr/src/uts/common/fs/proc/prargv.c @@ -0,0 +1,441 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015, Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/sunddi.h> +#include <sys/proc.h> +#include <sys/procfs.h> +#include <sys/sysmacros.h> +#include <vm/as.h> + +/* + * Safely read a contiguous region of memory from 'addr' in the address space + * of a particular process into the supplied kernel buffer (*buf, sz). + * Partially mapped regions will result in a partial read terminating at the + * first hole in the address space. The number of bytes actually read is + * returned to the caller via 'rdsz'. 
+ */ +static int +prreadbuf(proc_t *p, uintptr_t ustart, uint8_t *buf, size_t sz, size_t *rdsz) +{ + int error = 0; + size_t rem = sz; + off_t pos = 0; + + if (rdsz != NULL) + *rdsz = 0; + + while (rem != 0) { + uintptr_t addr = ustart + pos; + size_t len = MIN(rem, PAGESIZE - (addr & PAGEOFFSET)); + + if ((error = uread(p, buf + pos, len, addr)) != 0) { + if (error == ENXIO) { + /* + * ENXIO from uread() indicates that the page + * does not exist. This will simply be a + * partial read. + */ + error = 0; + } + break; + } + + rem -= len; + pos += len; + } + + if (rdsz != NULL) + *rdsz = pos; + + return (error); +} + +/* + * Attempt to read the argument vector (argv) from this process. The caller + * must hold the p_lock mutex, and have marked the process P_PR_LOCK (e.g. via + * prlock or lx_prlock). + * + * The caller must provide a buffer (buf, buflen). We will concatenate each + * argument string (including the NUL terminator) into this buffer. The number + * of characters written to this buffer (including the final NUL terminator) + * will be stored in 'slen'. + */ +int +prreadargv(proc_t *p, char *buf, size_t bufsz, size_t *slen) +{ + int error; + user_t *up; + struct as *as; + size_t pos = 0; + caddr_t *argv = NULL; + size_t argvsz = 0; + int i; + + VERIFY(MUTEX_HELD(&p->p_lock)); + VERIFY(p->p_proc_flag & P_PR_LOCK); + + up = PTOU(p); + as = p->p_as; + + if ((p->p_flag & SSYS) || as == &kas || up->u_argv == NULL) { + /* + * Return the regular psargs string to the caller. + */ + bcopy(up->u_psargs, buf, MIN(bufsz, sizeof (up->u_psargs))); + buf[bufsz - 1] = '\0'; + *slen = strlen(buf) + 1; + + return (0); + } + + /* + * Allocate space to store argv array. + */ + argvsz = up->u_argc * (p->p_model == DATAMODEL_ILP32 ? + sizeof (caddr32_t) : sizeof (caddr_t)); + argv = kmem_alloc(argvsz, KM_SLEEP); + + /* + * Extract the argv array from the target process. Drop p_lock + * while we do I/O to avoid deadlock with the clock thread. + */ + mutex_exit(&p->p_lock); + if ((error = prreadbuf(p, up->u_argv, (uint8_t *)argv, argvsz, + NULL)) != 0) { + kmem_free(argv, argvsz); + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (-1); + } + + /* + * Read each argument string from the pointers in the argv array. + */ + pos = 0; + for (i = 0; i < up->u_argc; i++) { + size_t rdsz, trysz; + uintptr_t arg; + off_t j; + boolean_t found_nul; + boolean_t do_retry = B_TRUE; + +#ifdef _SYSCALL32_IMPL + if (p->p_model == DATAMODEL_ILP32) { + arg = (uintptr_t)((caddr32_t *)argv)[i]; + } else { + arg = (uintptr_t)argv[i]; + } +#else + arg = (uintptr_t)argv[i]; +#endif + + /* + * Stop trying to read arguments if we reach a NULL + * pointer in the vector. + */ + if (arg == NULL) + break; + + /* + * Stop reading if we have read the maximum length + * we can return to the user. + */ + if (pos >= bufsz) + break; + + /* + * Initially we try a short read, on the assumption that + * most individual argument strings are less than 80 + * characters long. + */ + if ((trysz = MIN(80, bufsz - pos - 1)) < 80) { + /* + * We don't have room in the target buffer for even + * an entire short read, so there is no need to retry + * with a longer read. + */ + do_retry = B_FALSE; + } + +retry: + /* + * Read string data for this argument. Leave room + * in the buffer for a final NUL terminator. + */ + if ((error = prreadbuf(p, arg, (uint8_t *)&buf[pos], trysz, + &rdsz)) != 0) { + /* + * There was a problem reading this string + * from the process. Give up. + */ + break; + } + + /* + * Find the NUL terminator. 
+ */ + found_nul = B_FALSE; + for (j = 0; j < rdsz; j++) { + if (buf[pos + j] == '\0') { + found_nul = B_TRUE; + break; + } + } + + if (!found_nul && do_retry) { + /* + * We did not find a NUL terminator, but this + * was a first pass short read. Try once more + * with feeling. + */ + trysz = bufsz - pos - 1; + do_retry = B_FALSE; + goto retry; + } + + /* + * Commit the string we read to the buffer. + */ + pos += j + 1; + if (!found_nul && pos < bufsz) { + /* + * A NUL terminator was not found; add one. + */ + buf[pos++] = '\0'; + } + } + + /* + * Ensure the entire string is NUL-terminated. + */ + buf[bufsz - 1] = '\0'; + + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + kmem_free(argv, argvsz); + + /* + * If the operation was a success, return the copied string length + * to the caller. + */ + *slen = (error == 0) ? pos : 0; + + return (error); +} + +/* + * Similar to prreadargv except reads the env vector. This is slightly more + * complex because there is no count for the env vector that corresponds to + * u_argc. + */ +int +prreadenvv(proc_t *p, char *buf, size_t bufsz, size_t *slen) +{ + int error; + user_t *up; + struct as *as; + size_t pos = 0; + caddr_t *envp = NULL; + uintptr_t tmpp = NULL; + size_t envpsz = 0, rdsz = 0; + int i; + int cnt, bound; + + VERIFY(MUTEX_HELD(&p->p_lock)); + VERIFY(p->p_proc_flag & P_PR_LOCK); + + up = PTOU(p); + as = p->p_as; + + if ((p->p_flag & SSYS) || as == &kas || up->u_envp == NULL) { + /* + * Return empty string. + */ + buf[0] = '\0'; + *slen = 1; + + return (0); + } + + /* + * Drop p_lock while we do I/O to avoid deadlock with the clock thread. + */ + mutex_exit(&p->p_lock); + + /* + * We first have to count how many env entries we have. This is + * somewhat painful. We extract the env entries from the target process + * one entry at a time. Stop trying to read env entries if we reach a + * NULL pointer in the vector or hit our upper bound (which we take + * as the bufsz/4) to ensure we don't run off. + */ + rdsz = (p->p_model == DATAMODEL_ILP32 ? + sizeof (caddr32_t) : sizeof (caddr_t)); + bound = (int)(bufsz / 4); + for (cnt = 0, tmpp = up->u_envp; cnt < bound; cnt++, tmpp += rdsz) { + caddr_t tmp = NULL; + + if ((error = prreadbuf(p, tmpp, (uint8_t *)&tmp, rdsz, + NULL)) != 0) { + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (-1); + } + + if (tmp == NULL) + break; + } + if (cnt == 0) { + /* Return empty string. */ + buf[0] = '\0'; + *slen = 1; + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (0); + } + + /* + * Allocate space to store env array. + */ + envpsz = cnt * (p->p_model == DATAMODEL_ILP32 ? + sizeof (caddr32_t) : sizeof (caddr_t)); + envp = kmem_alloc(envpsz, KM_SLEEP); + + /* + * Extract the env array from the target process. + */ + if ((error = prreadbuf(p, up->u_envp, (uint8_t *)envp, envpsz, + NULL)) != 0) { + kmem_free(envp, envpsz); + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (-1); + } + + /* + * Read each env string from the pointers in the env array. + */ + pos = 0; + for (i = 0; i < cnt; i++) { + size_t rdsz, trysz; + uintptr_t ev; + off_t j; + boolean_t found_nul; + boolean_t do_retry = B_TRUE; + +#ifdef _SYSCALL32_IMPL + if (p->p_model == DATAMODEL_ILP32) { + ev = (uintptr_t)((caddr32_t *)envp)[i]; + } else { + ev = (uintptr_t)envp[i]; + } +#else + ev = (uintptr_t)envp[i]; +#endif + + /* + * Stop trying to read env entries if we reach a NULL + * pointer in the vector. 
+ */ + if (ev == NULL) + break; + + /* + * Stop reading if we have read the maximum length + * we can return to the user. + */ + if (pos >= bufsz) + break; + + /* + * Initially we try a short read, on the assumption that + * most individual env strings are less than 80 + * characters long. + */ + if ((trysz = MIN(80, bufsz - pos - 1)) < 80) { + /* + * We don't have room in the target buffer for even + * an entire short read, so there is no need to retry + * with a longer read. + */ + do_retry = B_FALSE; + } + +retry: + /* + * Read string data for this env var. Leave room + * in the buffer for a final NUL terminator. + */ + if ((error = prreadbuf(p, ev, (uint8_t *)&buf[pos], trysz, + &rdsz)) != 0) { + /* + * There was a problem reading this string + * from the process. Give up. + */ + break; + } + + /* + * Find the NUL terminator. + */ + found_nul = B_FALSE; + for (j = 0; j < rdsz; j++) { + if (buf[pos + j] == '\0') { + found_nul = B_TRUE; + break; + } + } + + if (!found_nul && do_retry) { + /* + * We did not find a NUL terminator, but this + * was a first pass short read. Try once more + * with feeling. + */ + trysz = bufsz - pos - 1; + do_retry = B_FALSE; + goto retry; + } + + /* + * Commit the string we read to the buffer. + */ + pos += j + 1; + if (!found_nul && pos < bufsz) { + /* + * A NUL terminator was not found; add one. + */ + buf[pos++] = '\0'; + } + } + + /* + * Ensure the entire string is NUL-terminated. + */ + buf[bufsz - 1] = '\0'; + + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + kmem_free(envp, envpsz); + + /* + * If the operation was a success, return the copied string length + * to the caller. + */ + *slen = (error == 0) ? pos : 0; + + return (error); +} diff --git a/usr/src/uts/common/fs/proc/prcontrol.c b/usr/src/uts/common/fs/proc/prcontrol.c index a73a64a4a4..7e99d23b97 100644 --- a/usr/src/uts/common/fs/proc/prcontrol.c +++ b/usr/src/uts/common/fs/proc/prcontrol.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #include <sys/types.h> @@ -1481,7 +1481,7 @@ pr_setsig(prnode_t *pnp, siginfo_t *sip) } else if (t->t_state == TS_STOPPED && sig == SIGKILL) { /* If SIGKILL, set stopped lwp running */ p->p_stopsig = 0; - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; t->t_dtrace_stop = 0; setrun_locked(t); } @@ -2276,9 +2276,17 @@ pr_szoneid(proc_t *p, zoneid_t zoneid, cred_t *cr) return (EPERM); if (zoneid != GLOBAL_ZONEID && zoneid != p->p_zone->zone_id) return (EINVAL); - if ((zptr = zone_find_by_id(zoneid)) == NULL) - return (EINVAL); + /* + * We cannot hold p_lock when we call zone_find_by_id since that can + * lead to a deadlock. zone_find_by_id() takes zonehash_lock. + * zone_enter() can hold the zonehash_lock and needs p_lock when it + * calls task_join. + */ mutex_exit(&p->p_lock); + if ((zptr = zone_find_by_id(zoneid)) == NULL) { + mutex_enter(&p->p_lock); + return (EINVAL); + } mutex_enter(&p->p_crlock); oldcred = p->p_cred; crhold(oldcred); diff --git a/usr/src/uts/common/fs/proc/prdata.h b/usr/src/uts/common/fs/proc/prdata.h index 8ea516bf82..72f26b3c05 100644 --- a/usr/src/uts/common/fs/proc/prdata.h +++ b/usr/src/uts/common/fs/proc/prdata.h @@ -27,7 +27,7 @@ /* All Rights Reserved */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. 
*/ #ifndef _SYS_PROC_PRDATA_H @@ -123,6 +123,7 @@ typedef enum prnodetype { #if defined(__i386) || defined(__amd64) PR_LDT, /* /proc/<pid>/ldt */ #endif + PR_ARGV, /* /proc/<pid>/argv */ PR_USAGE, /* /proc/<pid>/usage */ PR_LUSAGE, /* /proc/<pid>/lusage */ PR_PAGEDATA, /* /proc/<pid>/pagedata */ @@ -347,6 +348,8 @@ extern int pr_unset(proc_t *, long); extern void pr_sethold(prnode_t *, sigset_t *); extern void pr_setfault(proc_t *, fltset_t *); extern int prusrio(proc_t *, enum uio_rw, struct uio *, int); +extern int prreadargv(proc_t *, char *, size_t, size_t *); +extern int prreadenvv(proc_t *, char *, size_t, size_t *); extern int prwritectl(vnode_t *, struct uio *, cred_t *); extern int prlock(prnode_t *, int); extern void prunmark(proc_t *); diff --git a/usr/src/uts/common/fs/proc/prsubr.c b/usr/src/uts/common/fs/proc/prsubr.c index 7801fd0ac8..284bf8cb88 100644 --- a/usr/src/uts/common/fs/proc/prsubr.c +++ b/usr/src/uts/common/fs/proc/prsubr.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -201,6 +201,7 @@ prchoose(proc_t *p) case PR_SYSEXIT: case PR_SIGNALLED: case PR_FAULTED: + case PR_BRAND: /* * Make an lwp calling exit() be the * last lwp seen in the process. diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c index 411c9b8b0b..276f54ae3a 100644 --- a/usr/src/uts/common/fs/proc/prvnops.c +++ b/usr/src/uts/common/fs/proc/prvnops.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -96,6 +96,11 @@ struct prdirect { #define PRSDSIZE (sizeof (struct prdirect)) /* + * Maximum length of the /proc/$$/argv file: + */ +int prmaxargvlen = 4096; + +/* * Directory characteristics. */ typedef struct prdirent { @@ -166,6 +171,8 @@ static prdirent_t piddir[] = { { PR_LDT, 27 * sizeof (prdirent_t), sizeof (prdirent_t), "ldt" }, #endif + { PR_ARGV, 28 * sizeof (prdirent_t), sizeof (prdirent_t), + "argv" }, }; #define NPIDDIRFILES (sizeof (piddir) / sizeof (piddir[0]) - 2) @@ -582,6 +589,7 @@ static int pr_read_inval(), pr_read_as(), pr_read_status(), #if defined(__x86) pr_read_ldt(), #endif + pr_read_argv(), pr_read_usage(), pr_read_lusage(), pr_read_pagedata(), pr_read_watch(), pr_read_lwpstatus(), pr_read_lwpsinfo(), pr_read_lwpusage(), pr_read_xregs(), pr_read_priv(), @@ -610,6 +618,7 @@ static int (*pr_read_function[PR_NFILES])() = { #if defined(__x86) pr_read_ldt, /* /proc/<pid>/ldt */ #endif + pr_read_argv, /* /proc/<pid>/argv */ pr_read_usage, /* /proc/<pid>/usage */ pr_read_lusage, /* /proc/<pid>/lusage */ pr_read_pagedata, /* /proc/<pid>/pagedata */ @@ -672,6 +681,41 @@ pr_uioread(void *base, long count, uio_t *uiop) } static int +pr_read_argv(prnode_t *pnp, uio_t *uiop) +{ + char *args; + int error; + size_t asz = prmaxargvlen, sz; + + /* + * Allocate a scratch buffer for collection of the process arguments. 
+ */ + args = kmem_alloc(asz, KM_SLEEP); + + ASSERT(pnp->pr_type == PR_ARGV); + + if ((error = prlock(pnp, ZNO)) != 0) { + kmem_free(args, asz); + return (error); + } + + if ((error = prreadargv(pnp->pr_common->prc_proc, args, asz, + &sz)) != 0) { + prunlock(pnp); + kmem_free(args, asz); + return (error); + } + + prunlock(pnp); + + error = pr_uioread(args, sz, uiop); + + kmem_free(args, asz); + + return (error); +} + +static int pr_read_as(prnode_t *pnp, uio_t *uiop) { int error; @@ -1767,6 +1811,7 @@ static int (*pr_read_function_32[PR_NFILES])() = { #if defined(__x86) pr_read_ldt, /* /proc/<pid>/ldt */ #endif + pr_read_argv, /* /proc/<pid>/argv */ pr_read_usage_32, /* /proc/<pid>/usage */ pr_read_lusage_32, /* /proc/<pid>/lusage */ pr_read_pagedata_32, /* /proc/<pid>/pagedata */ @@ -2686,6 +2731,103 @@ prread(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct) #endif } +/* + * We make pr_write_psinfo_fname() somewhat simpler by asserting at compile + * time that PRFNSZ has the same definition as MAXCOMLEN. + */ +#if PRFNSZ != MAXCOMLEN +#error PRFNSZ/MAXCOMLEN mismatch +#endif + +static int +pr_write_psinfo_fname(prnode_t *pnp, uio_t *uiop) +{ + char fname[PRFNSZ]; + int offset = offsetof(psinfo_t, pr_fname), error; + +#ifdef _SYSCALL32_IMPL + if (curproc->p_model != DATAMODEL_LP64) + offset = offsetof(psinfo32_t, pr_fname); +#endif + + /* + * If this isn't a write to pr_fname (or if the size doesn't match + * PRFNSZ) return. + */ + if (uiop->uio_offset != offset || uiop->uio_resid != PRFNSZ) + return (0); + + if ((error = uiomove(fname, PRFNSZ, UIO_WRITE, uiop)) != 0) + return (error); + + fname[PRFNSZ - 1] = '\0'; + + if ((error = prlock(pnp, ZNO)) != 0) + return (error); + + bcopy(fname, pnp->pr_common->prc_proc->p_user.u_comm, PRFNSZ); + + prunlock(pnp); + + return (0); +} + +/* + * We make pr_write_psinfo_psargs() somewhat simpler by asserting at compile + * time that PRARGSZ has the same definition as PSARGSZ. + */ +#if PRARGSZ != PSARGSZ +#error PRARGSZ/PSARGSZ mismatch +#endif + +static int +pr_write_psinfo_psargs(prnode_t *pnp, uio_t *uiop) +{ + char psargs[PRARGSZ]; + int offset = offsetof(psinfo_t, pr_psargs), error; + +#ifdef _SYSCALL32_IMPL + if (curproc->p_model != DATAMODEL_LP64) + offset = offsetof(psinfo32_t, pr_psargs); +#endif + + /* + * If this isn't a write to pr_psargs (or if the size doesn't match + * PRARGSZ) return. + */ + if (uiop->uio_offset != offset || uiop->uio_resid != PRARGSZ) + return (0); + + if ((error = uiomove(psargs, PRARGSZ, UIO_WRITE, uiop)) != 0) + return (error); + + psargs[PRARGSZ - 1] = '\0'; + + if ((error = prlock(pnp, ZNO)) != 0) + return (error); + + bcopy(psargs, pnp->pr_common->prc_proc->p_user.u_psargs, PRARGSZ); + + prunlock(pnp); + + return (0); +} + +int +pr_write_psinfo(prnode_t *pnp, uio_t *uiop) +{ + int error; + + if ((error = pr_write_psinfo_fname(pnp, uiop)) != 0) + return (error); + + if ((error = pr_write_psinfo_psargs(pnp, uiop)) != 0) + return (error); + + return (0); +} + + /* ARGSUSED */ static int prwrite(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct) @@ -2764,6 +2906,9 @@ prwrite(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct) uiop->uio_resid = resid; return (error); + case PR_PSINFO: + return (pr_write_psinfo(pnp, uiop)); + default: return ((vp->v_type == VDIR)? 
EISDIR : EBADF); } @@ -3047,6 +3192,13 @@ prgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, case PR_AUXV: vap->va_size = __KERN_NAUXV_IMPL * PR_OBJSIZE(auxv32_t, auxv_t); break; + case PR_ARGV: + if ((p->p_flag & SSYS) || p->p_as == &kas) { + vap->va_size = PSARGSZ; + } else { + vap->va_size = prmaxargvlen; + } + break; #if defined(__x86) case PR_LDT: mutex_exit(&p->p_lock); @@ -3222,6 +3374,7 @@ praccess(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) case PR_USAGE: case PR_LUSAGE: case PR_LWPUSAGE: + case PR_ARGV: p = pr_p_lock(pnp); mutex_exit(&pr_pidlock); if (p == NULL) @@ -3307,6 +3460,7 @@ static vnode_t *(*pr_lookup_function[PR_NFILES])() = { #if defined(__x86) pr_lookup_notdir, /* /proc/<pid>/ldt */ #endif + pr_lookup_notdir, /* /proc/<pid>/argv */ pr_lookup_notdir, /* /proc/<pid>/usage */ pr_lookup_notdir, /* /proc/<pid>/lusage */ pr_lookup_notdir, /* /proc/<pid>/pagedata */ @@ -4546,11 +4700,15 @@ prgetnode(vnode_t *dp, prnodetype_t type) break; case PR_PSINFO: + pnp->pr_mode = 0644; /* readable by all + owner can write */ + break; + case PR_LPSINFO: case PR_LWPSINFO: case PR_USAGE: case PR_LUSAGE: case PR_LWPUSAGE: + case PR_ARGV: pnp->pr_mode = 0444; /* read-only by all */ break; @@ -4656,6 +4814,7 @@ static int (*pr_readdir_function[PR_NFILES])() = { #if defined(__x86) pr_readdir_notdir, /* /proc/<pid>/ldt */ #endif + pr_readdir_notdir, /* /proc/<pid>/argv */ pr_readdir_notdir, /* /proc/<pid>/usage */ pr_readdir_notdir, /* /proc/<pid>/lusage */ pr_readdir_notdir, /* /proc/<pid>/pagedata */ @@ -4805,6 +4964,7 @@ pr_readdir_piddir(prnode_t *pnp, uio_t *uiop, int *eofp) case PR_PROCDIR: case PR_PSINFO: case PR_USAGE: + case PR_ARGV: break; default: continue; @@ -6010,7 +6170,7 @@ prpoll(vnode_t *vp, short events, int anyyet, short *reventsp, } *reventsp = revents; - if (!anyyet && revents == 0) { + if ((!anyyet && revents == 0) || (events & POLLET)) { /* * Arrange to wake up the polling lwp when * the target process/lwp stops or terminates diff --git a/usr/src/uts/common/fs/sockfs/sockcommon.c b/usr/src/uts/common/fs/sockfs/sockcommon.c index 703e26ea61..682f1d867b 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #include <sys/types.h> @@ -501,6 +502,9 @@ sonode_constructor(void *buf, void *cdrarg, int kmflags) cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL); cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL); + so->so_krecv_cb = NULL; + so->so_krecv_arg = NULL; + return (0); } @@ -654,6 +658,10 @@ sonode_fini(struct sonode *so) if (so->so_filter_top != NULL) sof_sonode_cleanup(so); + /* Clean up any remnants of krecv callbacks */ + so->so_krecv_cb = NULL; + so->so_krecv_arg = NULL; + ASSERT(list_is_empty(&so->so_acceptq_list)); ASSERT(list_is_empty(&so->so_acceptq_defer)); ASSERT(!list_link_active(&so->so_acceptq_node)); diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c index 0be628f329..40c368736d 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. 
*/ #include <sys/types.h> @@ -128,7 +128,7 @@ so_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen, { int error; - SO_BLOCK_FALLBACK(so, SOP_BIND(so, name, namelen, flags, cr)); + SO_BLOCK_FALLBACK_SAFE(so, SOP_BIND(so, name, namelen, flags, cr)); ASSERT(flags == _SOBIND_XPG4_2 || flags == _SOBIND_SOCKBSD); @@ -586,11 +586,6 @@ so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag, SO_BLOCK_FALLBACK(so, SOP_SENDMBLK(so, msg, fflag, cr, mpp)); - if ((so->so_mode & SM_SENDFILESUPP) == 0) { - SO_UNBLOCK_FALLBACK(so); - return (EOPNOTSUPP); - } - error = so_sendmblk_impl(so, msg, fflag, cr, mpp, so->so_filter_top, B_FALSE); @@ -653,7 +648,7 @@ so_getsockname(struct sonode *so, struct sockaddr *addr, { int error; - SO_BLOCK_FALLBACK(so, SOP_GETSOCKNAME(so, addr, addrlen, cr)); + SO_BLOCK_FALLBACK_SAFE(so, SOP_GETSOCKNAME(so, addr, addrlen, cr)); if (so->so_filter_active == 0 || (error = sof_filter_getsockname(so, addr, addrlen, cr)) < 0) @@ -702,7 +697,7 @@ so_getsockopt(struct sonode *so, int level, int option_name, if (level == SOL_FILTER) return (sof_getsockopt(so, option_name, optval, optlenp, cr)); - SO_BLOCK_FALLBACK(so, + SO_BLOCK_FALLBACK_SAFE(so, SOP_GETSOCKOPT(so, level, option_name, optval, optlenp, flags, cr)); if ((so->so_filter_active == 0 || @@ -791,7 +786,7 @@ so_setsockopt(struct sonode *so, int level, int option_name, if (level == SOL_FILTER) return (sof_setsockopt(so, option_name, optval, optlen, cr)); - SO_BLOCK_FALLBACK(so, + SO_BLOCK_FALLBACK_SAFE(so, SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr)); /* X/Open requires this check */ @@ -953,6 +948,13 @@ so_poll(struct sonode *so, short events, int anyyet, short *reventsp, if (!list_is_empty(&so->so_acceptq_list)) *reventsp |= (POLLIN|POLLRDNORM) & events; + /* + * If we're looking for POLLRDHUP, indicate it if we have sent the + * last rx signal for the socket. 
+ */ + if ((events & POLLRDHUP) && (state & SS_SENTLASTREADSIG)) + *reventsp |= POLLRDHUP; + /* Data */ /* so_downcalls is null for sctp */ if (so->so_downcalls != NULL && so->so_downcalls->sd_poll != NULL) { @@ -988,14 +990,20 @@ so_poll(struct sonode *so, short events, int anyyet, short *reventsp, *reventsp |= POLLHUP; } - if (!*reventsp && !anyyet) { + if ((!*reventsp && !anyyet) || (events & POLLET)) { /* Check for read events again, but this time under lock */ if (events & (POLLIN|POLLRDNORM)) { mutex_enter(&so->so_lock); if (SO_HAVE_DATA(so) || !list_is_empty(&so->so_acceptq_list)) { + if (events & POLLET) { + so->so_pollev |= SO_POLLEV_IN; + *phpp = &so->so_poll_list; + } + mutex_exit(&so->so_lock); *reventsp |= (POLLIN|POLLRDNORM) & events; + return (0); } else { so->so_pollev |= SO_POLLEV_IN; @@ -1316,6 +1324,26 @@ so_queue_msg_impl(struct sonode *so, mblk_t *mp, } } + mutex_enter(&so->so_lock); + if (so->so_krecv_cb != NULL) { + boolean_t cont; + so_krecv_f func = so->so_krecv_cb; + void *arg = so->so_krecv_arg; + + mutex_exit(&so->so_lock); + cont = func(so, mp, msg_size, flags & MSG_OOB, arg); + mutex_enter(&so->so_lock); + if (cont == B_TRUE) { + space_left = so->so_rcvbuf; + } else { + so->so_rcv_queued = so->so_rcvlowat; + *errorp = ENOSPC; + space_left = -1; + } + goto done_unlock; + } + mutex_exit(&so->so_lock); + if (flags & MSG_OOB) { so_queue_oob(so, mp, msg_size); mutex_enter(&so->so_lock); @@ -1594,6 +1622,13 @@ so_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, return (ENOTCONN); } + mutex_enter(&so->so_lock); + if (so->so_krecv_cb != NULL) { + mutex_exit(&so->so_lock); + return (EOPNOTSUPP); + } + mutex_exit(&so->so_lock); + if (msg->msg_flags & MSG_PEEK) msg->msg_flags &= ~MSG_WAITALL; diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c index 957c8f93b4..9604ea5dba 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c @@ -24,6 +24,7 @@ */ /* * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/types.h> @@ -2276,9 +2277,9 @@ so_tpi_fallback(struct sonode *so, struct cred *cr) fbfunc = sp->sp_smod_info->smod_proto_fallback_func; /* - * Cannot fallback if the socket has active filters + * Cannot fallback if the socket has active filters or a krecv callback. */ - if (so->so_filter_active > 0) + if (so->so_filter_active > 0 || so->so_krecv_cb != NULL) return (EINVAL); switch (so->so_family) { @@ -2456,3 +2457,50 @@ out: return (error); } + +int +so_krecv_set(sonode_t *so, so_krecv_f cb, void *arg) +{ + int ret; + + if (cb == NULL && arg != NULL) + return (EINVAL); + + SO_BLOCK_FALLBACK(so, so_krecv_set(so, cb, arg)); + + mutex_enter(&so->so_lock); + if (so->so_state & SS_FALLBACK_COMP) { + mutex_exit(&so->so_lock); + SO_UNBLOCK_FALLBACK(so); + return (ENOTSUP); + } + + ret = so_lock_read(so, 0); + VERIFY(ret == 0); + /* + * Other consumers may actually care about getting extant data delivered + * to them, when they come along, they should figure out the best API + * for that. 
+ */ + so_rcv_flush(so); + + so->so_krecv_cb = cb; + so->so_krecv_arg = arg; + + so_unlock_read(so); + mutex_exit(&so->so_lock); + SO_UNBLOCK_FALLBACK(so); + + return (0); +} + +void +so_krecv_unblock(sonode_t *so) +{ + mutex_enter(&so->so_lock); + VERIFY(so->so_krecv_cb != NULL); + + so->so_rcv_queued = 0; + (void) so_check_flow_control(so); + mutex_exit(&so->so_lock); +} diff --git a/usr/src/uts/common/fs/sockfs/sockfilter.c b/usr/src/uts/common/fs/sockfs/sockfilter.c index 971523945e..7dca6ae6fc 100644 --- a/usr/src/uts/common/fs/sockfs/sockfilter.c +++ b/usr/src/uts/common/fs/sockfs/sockfilter.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/systm.h> @@ -246,6 +247,18 @@ sof_setsockopt_impl(struct sonode *so, int option_name, /* Module loaded OK, so there must be an ops vector */ ASSERT(ent->sofe_mod != NULL); + + /* + * Check again to confirm ATTACH is ok. See if the the module + * is not SOF_ATT_SAFE after an unsafe operation has taken + * place. + */ + if ((ent->sofe_mod->sofm_flags & SOF_ATT_SAFE) == 0 && + so->so_state & SS_FILOP_UNSF) { + sof_instance_destroy(inst); + return (EINVAL); + } + inst->sofi_ops = &ent->sofe_mod->sofm_ops; SOF_STAT_ADD(inst, tot_active_attach, 1); @@ -1444,7 +1457,13 @@ sof_filter_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode, * sof_register(version, name, ops, flags) * * Register a socket filter identified by name `name' and which should use - * the ops vector `ops' for event notification. `flags' should be set to 0. + * the ops vector `ops' for event notification. `flags' should be set to 0 + * by default for "unsafe" modules or SOF_ATT_SAFE for "safe" modules. An + * unsafe filter is one that cannot be attached after any socket operation has + * occured. This is the legacy default. A "safe" filter can be attached even + * after some basic initial socket operations have taken place. This set is + * currently bind, getsockname, getsockopt and setsockopt. The order in which + * a "safe" filter can be attached is more relaxed, and thus more flexible. * On success 0 is returned, otherwise an errno is returned. */ int @@ -1452,14 +1471,13 @@ sof_register(int version, const char *name, const sof_ops_t *ops, int flags) { sof_module_t *mod; - _NOTE(ARGUNUSED(flags)); - if (version != SOF_VERSION) return (EINVAL); mod = kmem_zalloc(sizeof (sof_module_t), KM_SLEEP); mod->sofm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP); (void) strcpy(mod->sofm_name, name); + mod->sofm_flags = flags; mod->sofm_ops = *ops; mutex_enter(&sof_module_lock); diff --git a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h index 7f7aece1f1..cf2ad8b20d 100644 --- a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h +++ b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. 
*/ #ifndef _SOCKFS_SOCKFILTER_H @@ -51,6 +52,7 @@ typedef struct sof_kstat sof_kstat_t; struct sof_module { char *sofm_name; + int sofm_flags; sof_ops_t sofm_ops; uint_t sofm_refcnt; list_node_t sofm_node; diff --git a/usr/src/uts/common/fs/sockfs/socknotify.c b/usr/src/uts/common/fs/sockfs/socknotify.c index 3d5ba2a7e8..3f858afecc 100644 --- a/usr/src/uts/common/fs/sockfs/socknotify.c +++ b/usr/src/uts/common/fs/sockfs/socknotify.c @@ -377,7 +377,7 @@ i_so_notify_last_rx(struct sonode *so, int *pollev, int *sigev) so->so_state |= SS_SENTLASTREADSIG; so->so_pollev &= ~SO_POLLEV_IN; - *pollev |= POLLIN|POLLRDNORM; + *pollev |= POLLIN|POLLRDNORM|POLLRDHUP; *sigev |= SOCKETSIG_READ; return (1); diff --git a/usr/src/uts/common/fs/sockfs/socksubr.c b/usr/src/uts/common/fs/sockfs/socksubr.c index 06d76044e5..095d84fc42 100644 --- a/usr/src/uts/common/fs/sockfs/socksubr.c +++ b/usr/src/uts/common/fs/sockfs/socksubr.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -1879,7 +1880,7 @@ ssize_t soreadfile(file_t *fp, uchar_t *buf, u_offset_t fileoff, int *err, size_t size) { struct uio auio; - struct iovec aiov[MSG_MAXIOVLEN]; + struct iovec aiov[1]; register vnode_t *vp; int ioflag, rwflag; ssize_t cnt; diff --git a/usr/src/uts/common/fs/sockfs/socksyscalls.c b/usr/src/uts/common/fs/sockfs/socksyscalls.c index a86cda937c..067eaedd7c 100644 --- a/usr/src/uts/common/fs/sockfs/socksyscalls.c +++ b/usr/src/uts/common/fs/sockfs/socksyscalls.c @@ -21,10 +21,10 @@ /* * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. */ -/* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */ - #include <sys/types.h> #include <sys/t_lock.h> #include <sys/param.h> @@ -51,6 +51,7 @@ #include <sys/cmn_err.h> #include <sys/vmsystm.h> #include <sys/policy.h> +#include <sys/limits.h> #include <sys/socket.h> #include <sys/socketvar.h> @@ -83,12 +84,6 @@ extern void nl7c_init(void); extern int sockfs_defer_nl7c_init; /* - * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c" - * as there isn't a formal definition of IOV_MAX ??? - */ -#define MSG_MAXIOVLEN 16 - -/* * Kernel component of socket creation. * * The socket library determines which version number to use. @@ -1023,9 +1018,10 @@ recvmsg(int sock, struct nmsghdr *msg, int flags) STRUCT_HANDLE(nmsghdr, umsgptr); struct nmsghdr lmsg; struct uio auio; - struct iovec aiov[MSG_MAXIOVLEN]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + ssize_t iovsize = 0; int iovcnt; - ssize_t len; + ssize_t len, rval; int i; int *flagsp; model_t model; @@ -1068,22 +1064,37 @@ recvmsg(int sock, struct nmsghdr *msg, int flags) iovcnt = lmsg.msg_iovlen; - if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) { + if (iovcnt <= 0 || iovcnt > IOV_MAX) { return (set_errno(EMSGSIZE)); } + if (iovcnt > IOV_MAX_STACK) { + iovsize = iovcnt * sizeof (struct iovec); + aiov = kmem_alloc(iovsize, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, while ensuring * that they can't move more than 2Gbytes of data in a single call. 
*/ if (model == DATAMODEL_ILP32) { - struct iovec32 aiov32[MSG_MAXIOVLEN]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + ssize_t iov32size; ssize32_t count32; - if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32, - iovcnt * sizeof (struct iovec32))) + iov32size = iovcnt * sizeof (struct iovec32); + if (iovsize != 0) + aiov32 = kmem_alloc(iov32size, KM_SLEEP); + + if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { @@ -1091,15 +1102,28 @@ recvmsg(int sock, struct nmsghdr *msg, int flags) iovlen32 = aiov32[i].iov_len; count32 += iovlen32; - if (iovlen32 < 0 || count32 < 0) + if (iovlen32 < 0 || count32 < 0) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + return (set_errno(EINVAL)); + } + aiov[i].iov_len = iovlen32; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + + if (iovsize != 0) + kmem_free(aiov32, iov32size); } else #endif /* _SYSCALL32_IMPL */ if (copyin(lmsg.msg_iov, aiov, iovcnt * sizeof (struct iovec))) { + if (iovsize != 0) + kmem_free(aiov, iovsize); + return (set_errno(EFAULT)); } len = 0; @@ -1107,6 +1131,9 @@ recvmsg(int sock, struct nmsghdr *msg, int flags) ssize_t iovlen = aiov[i].iov_len; len += iovlen; if (iovlen < 0 || len < 0) { + if (iovsize != 0) + kmem_free(aiov, iovsize); + return (set_errno(EINVAL)); } } @@ -1121,12 +1148,20 @@ recvmsg(int sock, struct nmsghdr *msg, int flags) (do_useracc == 0 || useracc(lmsg.msg_control, lmsg.msg_controllen, B_WRITE) != 0)) { + if (iovsize != 0) + kmem_free(aiov, iovsize); + return (set_errno(EFAULT)); } - return (recvit(sock, &lmsg, &auio, flags, + rval = recvit(sock, &lmsg, &auio, flags, STRUCT_FADDR(umsgptr, msg_namelen), - STRUCT_FADDR(umsgptr, msg_controllen), flagsp)); + STRUCT_FADDR(umsgptr, msg_controllen), flagsp); + + if (iovsize != 0) + kmem_free(aiov, iovsize); + + return (rval); } /* @@ -1264,9 +1299,10 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) struct nmsghdr lmsg; STRUCT_DECL(nmsghdr, u_lmsg); struct uio auio; - struct iovec aiov[MSG_MAXIOVLEN]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + ssize_t iovsize = 0; int iovcnt; - ssize_t len; + ssize_t len, rval; int i; model_t model; @@ -1309,7 +1345,7 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) iovcnt = lmsg.msg_iovlen; - if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) { + if (iovcnt <= 0 || iovcnt > IOV_MAX) { /* * Unless this is XPG 4.2 we allow iovcnt == 0 to * be compatible with SunOS 4.X and 4.4BSD. @@ -1318,19 +1354,34 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) return (set_errno(EMSGSIZE)); } + if (iovcnt > IOV_MAX_STACK) { + iovsize = iovcnt * sizeof (struct iovec); + aiov = kmem_alloc(iovsize, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, while ensuring * that they can't move more than 2Gbytes of data in a single call. 
*/ if (model == DATAMODEL_ILP32) { - struct iovec32 aiov32[MSG_MAXIOVLEN]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + ssize_t iov32size; ssize32_t count32; + iov32size = iovcnt * sizeof (struct iovec32); + if (iovsize != 0) + aiov32 = kmem_alloc(iov32size, KM_SLEEP); + if (iovcnt != 0 && - copyin((struct iovec32 *)lmsg.msg_iov, aiov32, - iovcnt * sizeof (struct iovec32))) + copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { @@ -1338,17 +1389,30 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) iovlen32 = aiov32[i].iov_len; count32 += iovlen32; - if (iovlen32 < 0 || count32 < 0) + if (iovlen32 < 0 || count32 < 0) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + return (set_errno(EINVAL)); + } + aiov[i].iov_len = iovlen32; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + + if (iovsize != 0) + kmem_free(aiov32, iov32size); } else #endif /* _SYSCALL32_IMPL */ if (iovcnt != 0 && copyin(lmsg.msg_iov, aiov, (unsigned)iovcnt * sizeof (struct iovec))) { + if (iovsize != 0) + kmem_free(aiov, iovsize); + return (set_errno(EFAULT)); } len = 0; @@ -1356,6 +1420,9 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) ssize_t iovlen = aiov[i].iov_len; len += iovlen; if (iovlen < 0 || len < 0) { + if (iovsize != 0) + kmem_free(aiov, iovsize); + return (set_errno(EINVAL)); } } @@ -1366,7 +1433,12 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) auio.uio_segflg = UIO_USERSPACE; auio.uio_limit = 0; - return (sendit(sock, &lmsg, &auio, flags)); + rval = sendit(sock, &lmsg, &auio, flags); + + if (iovsize != 0) + kmem_free(aiov, iovsize); + + return (rval); } ssize_t diff --git a/usr/src/uts/common/fs/sockfs/socktpi.c b/usr/src/uts/common/fs/sockfs/socktpi.c index d33f53f7a6..485e73eb02 100644 --- a/usr/src/uts/common/fs/sockfs/socktpi.c +++ b/usr/src/uts/common/fs/sockfs/socktpi.c @@ -6272,6 +6272,13 @@ sotpi_poll( if (sti->sti_conn_ind_head != NULL) *reventsp |= (POLLIN|POLLRDNORM) & events; + if (so->so_state & SS_CANTRCVMORE) { + *reventsp |= POLLRDHUP & events; + + if (so->so_state & SS_CANTSENDMORE) + *reventsp |= POLLHUP; + } + if (so->so_state & SS_OOBPEND) *reventsp |= POLLRDBAND & events; diff --git a/usr/src/uts/common/fs/swapfs/swap_subr.c b/usr/src/uts/common/fs/swapfs/swap_subr.c index 74c4302da9..a4d983665b 100644 --- a/usr/src/uts/common/fs/swapfs/swap_subr.c +++ b/usr/src/uts/common/fs/swapfs/swap_subr.c @@ -110,9 +110,11 @@ swapfs_recalc(pgcnt_t pgs) * memory that can be used as swap space should do so by * setting swapfs_desfree at boot time, not swapfs_minfree. * However, swapfs_minfree is tunable by install as a - * workaround for bugid 1147463. + * workaround for bugid 1147463. Note swapfs_minfree is set + * to 1/8th of memory, but clamped at the limit of 256 MB. 
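The recvmsg()/sendmsg() hunks above replace the old fixed MSG_MAXIOVLEN stack arrays with a small on-stack buffer that spills to a kmem_alloc()'d array whenever the caller passes more than IOV_MAX_STACK entries, with every error path now freeing that allocation before returning. A compressed userland analog of the idiom is sketched below; the SMALL_IOV constant and the malloc()/free() calls stand in for IOV_MAX_STACK and the kernel allocator and are illustrative only.

#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <errno.h>
#include <sys/uio.h>

#define	SMALL_IOV	16	/* stand-in for IOV_MAX_STACK */

/*
 * Copy a caller-supplied iovec array, using the provided stack buffer
 * for small counts and falling back to the heap for large ones.
 */
static int
copy_iov(const struct iovec *src, int iovcnt, struct iovec *stackbuf,
    struct iovec **outp)
{
	struct iovec *aiov = stackbuf;

	if (iovcnt <= 0 || iovcnt > IOV_MAX)
		return (EMSGSIZE);

	if (iovcnt > SMALL_IOV) {
		aiov = malloc(iovcnt * sizeof (struct iovec));
		if (aiov == NULL)
			return (ENOMEM);
	}

	(void) memcpy(aiov, src, iovcnt * sizeof (struct iovec));
	*outp = aiov;
	return (0);
}

/* Free only when the heap path was taken. */
static void
free_iov(struct iovec *aiov, const struct iovec *stackbuf)
{
	if (aiov != stackbuf)
		free(aiov);
}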
*/ - new_swapfs_minfree = MAX(btopr(2 * 1024 * 1024), pgs >> 3); + new_swapfs_minfree = MIN(MAX(btopr(2 * 1024 * 1024), pgs >> 3), + btopr(256 * 1024 * 1024)); } /* diff --git a/usr/src/uts/common/fs/tmpfs/tmp_dir.c b/usr/src/uts/common/fs/tmpfs/tmp_dir.c index f6621c8097..387cc6ae54 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_dir.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_dir.c @@ -516,7 +516,7 @@ tdirdelete( */ namelen = strlen(tpdp->td_name) + 1; - tmp_memfree(tpdp, sizeof (struct tdirent) + namelen); + kmem_free(tpdp, sizeof (struct tdirent) + namelen); dir->tn_size -= (sizeof (struct tdirent) + namelen); dir->tn_dirents--; @@ -549,8 +549,8 @@ tdirinit( ASSERT(RW_WRITE_HELD(&parent->tn_rwlock)); ASSERT(dir->tn_type == VDIR); - dot = tmp_memalloc(sizeof (struct tdirent) + 2, TMP_MUSTHAVE); - dotdot = tmp_memalloc(sizeof (struct tdirent) + 3, TMP_MUSTHAVE); + dot = kmem_zalloc(sizeof (struct tdirent) + 2, KM_SLEEP); + dotdot = kmem_zalloc(sizeof (struct tdirent) + 3, KM_SLEEP); /* * Initialize the entries @@ -650,7 +650,7 @@ tdirtrunc(struct tmpnode *dir) tmpfs_hash_out(tdp); - tmp_memfree(tdp, sizeof (struct tdirent) + namelen); + kmem_free(tdp, sizeof (struct tdirent) + namelen); dir->tn_size -= (sizeof (struct tdirent) + namelen); dir->tn_dirents--; } @@ -925,7 +925,7 @@ tdiraddentry( */ namelen = strlen(name) + 1; alloc_size = namelen + sizeof (struct tdirent); - tdp = tmp_memalloc(alloc_size, 0); + tdp = kmem_zalloc(alloc_size, KM_NOSLEEP | KM_NORMALPRI); if (tdp == NULL) return (ENOSPC); @@ -1025,7 +1025,7 @@ tdirmaketnode( ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime))) return (EOVERFLOW); type = va->va_type; - tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE); + tp = kmem_zalloc(sizeof (struct tmpnode), KM_SLEEP); tmpnode_init(tm, tp, va, cred); /* setup normal file/dir's extended attribute directory */ diff --git a/usr/src/uts/common/fs/tmpfs/tmp_subr.c b/usr/src/uts/common/fs/tmpfs/tmp_subr.c index 2e59d28d80..e6e2b392fe 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_subr.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_subr.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/types.h> @@ -40,9 +41,19 @@ #include <sys/policy.h> #include <sys/fs/tmp.h> #include <sys/fs/tmpnode.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> + +#define KILOBYTE 1024 +#define MEGABYTE (1024 * KILOBYTE) +#define GIGABYTE (1024 * MEGABYTE) #define MODESHIFT 3 +#define VALIDMODEBITS 07777 + +extern pgcnt_t swapfs_minfree; + int tmp_taccess(void *vtp, int mode, struct cred *cred) { @@ -71,7 +82,6 @@ tmp_taccess(void *vtp, int mode, struct cred *cred) * a plain file and you have write access to that file. * Function returns 0 if remove access is granted. */ - int tmp_sticky_remove_access(struct tmpnode *dir, struct tmpnode *entry, struct cred *cr) @@ -89,111 +99,122 @@ tmp_sticky_remove_access(struct tmpnode *dir, struct tmpnode *entry, } /* - * Allocate zeroed memory if tmpfs_maxkmem has not been exceeded - * or the 'musthave' flag is set. 'musthave' allocations should - * always be subordinate to normal allocations so that tmpfs_maxkmem - * can't be exceeded by more than a few KB. Example: when creating - * a new directory, the tmpnode is a normal allocation; if that - * succeeds, the dirents for "." and ".." are 'musthave' allocations. 
- */ -void * -tmp_memalloc(size_t size, int musthave) -{ - static time_t last_warning; - time_t now; - - if (atomic_add_long_nv(&tmp_kmemspace, size) < tmpfs_maxkmem || - musthave) - return (kmem_zalloc(size, KM_SLEEP)); - - atomic_add_long(&tmp_kmemspace, -size); - now = gethrestime_sec(); - if (last_warning != now) { - last_warning = now; - cmn_err(CE_WARN, "tmp_memalloc: tmpfs over memory limit"); - } - return (NULL); -} - -void -tmp_memfree(void *cp, size_t size) -{ - kmem_free(cp, size); - atomic_add_long(&tmp_kmemspace, -size); -} - -/* * Convert a string containing a number (number of bytes) to a pgcnt_t, * containing the corresponding number of pages. On 32-bit kernels, the * maximum value encoded in 'str' is PAGESIZE * ULONG_MAX, while the value * returned in 'maxpg' is at most ULONG_MAX. * - * If the number is followed by a "k" or "K", the value is converted from - * kilobytes to bytes. If it is followed by an "m" or "M" it is converted - * from megabytes to bytes. If it is not followed by a character it is - * assumed to be in bytes. Multiple letter options are allowed, so for instance - * '2mk' is interpreted as 2gb. + * The number may be followed by a magnitude suffix: "k" or "K" for kilobytes; + * "m" or "M" for megabytes; "g" or "G" for gigabytes. This interface allows + * for an arguably esoteric interpretation of multiple suffix characters: + * namely, they cascade. For example, the caller may specify "2mk", which is + * interpreted as 2 gigabytes. It would seem, at this late stage, that the + * horse has left not only the barn but indeed the country, and possibly the + * entire planetary system. Alternatively, the number may be followed by a + * single '%' sign, indicating the size is a percentage of either the zone's + * swap limit or the system's overall swap size. * * Parse and overflow errors are detected and a non-zero number returned on * error. */ - int tmp_convnum(char *str, pgcnt_t *maxpg) { - uint64_t num = 0, oldnum; + u_longlong_t num = 0; #ifdef _LP64 - uint64_t max_bytes = ULONG_MAX; + u_longlong_t max_bytes = ULONG_MAX; #else - uint64_t max_bytes = PAGESIZE * (uint64_t)ULONG_MAX; + u_longlong_t max_bytes = PAGESIZE * (uint64_t)ULONG_MAX; #endif char *c; - - if (str == NULL) + const struct convchar { + char *cc_char; + uint64_t cc_factor; + } convchars[] = { + { "kK", KILOBYTE }, + { "mM", MEGABYTE }, + { "gG", GIGABYTE }, + { NULL, 0 } + }; + + if (str == NULL) { return (EINVAL); + } c = str; /* - * Convert str to number + * Convert the initial numeric portion of the input string. */ - while ((*c >= '0') && (*c <= '9')) { - oldnum = num; - num = num * 10 + (*c++ - '0'); - if (oldnum > num) /* overflow */ + if (ddi_strtoull(str, &c, 10, &num) != 0) { + return (EINVAL); + } + + /* + * Handle a size in percent. Anything other than a single percent + * modifier is invalid. We use either the zone's swap limit or the + * system's total available swap size as the initial value. Perform the + * intermediate calculation in pages to avoid overflow. 
+ */ + if (*c == '\%') { + u_longlong_t cap; + + if (*(c + 1) != '\0') + return (EINVAL); + + if (num > 100) return (EINVAL); + + cap = (u_longlong_t)curproc->p_zone->zone_max_swap_ctl; + if (cap == UINT64_MAX) { + /* + * Use the amount of available physical and memory swap + */ + mutex_enter(&anoninfo_lock); + cap = TOTAL_AVAILABLE_SWAP; + mutex_exit(&anoninfo_lock); + } else { + cap = btop(cap); + } + + num = ptob(cap * num / 100); + goto done; } /* - * Terminate on null + * Apply the (potentially cascading) magnitude suffixes until an + * invalid character is found, or the string comes to an end. */ - while (*c != '\0') { - switch (*c++) { + for (; *c != '\0'; c++) { + int i; + + for (i = 0; convchars[i].cc_char != NULL; i++) { + /* + * Check if this character matches this multiplier + * class: + */ + if (strchr(convchars[i].cc_char, *c) != NULL) { + /* + * Check for overflow: + */ + if (num > max_bytes / convchars[i].cc_factor) { + return (EINVAL); + } + + num *= convchars[i].cc_factor; + goto valid_char; + } + } /* - * convert from kilobytes + * This was not a valid multiplier suffix character. */ - case 'k': - case 'K': - if (num > max_bytes / 1024) /* will overflow */ - return (EINVAL); - num *= 1024; - break; + return (EINVAL); - /* - * convert from megabytes - */ - case 'm': - case 'M': - if (num > max_bytes / (1024 * 1024)) /* will overflow */ - return (EINVAL); - num *= 1024 * 1024; - break; - - default: - return (EINVAL); - } +valid_char: + continue; } +done: /* * Since btopr() rounds up to page granularity, this round-up can * cause an overflow only if 'num' is between (max_bytes - PAGESIZE) @@ -204,3 +225,29 @@ tmp_convnum(char *str, pgcnt_t *maxpg) return (EINVAL); return (0); } + +/* + * Parse an octal mode string for use as the permissions set for the root + * of the tmpfs mount. + */ +int +tmp_convmode(char *str, mode_t *mode) +{ + ulong_t num; + char *c; + + if (str == NULL) { + return (EINVAL); + } + + if (ddi_strtoul(str, &c, 8, &num) != 0) { + return (EINVAL); + } + + if ((num & ~VALIDMODEBITS) != 0) { + return (EINVAL); + } + + *mode = VALIDMODEBITS & num; + return (0); +} diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c index f8a36a528f..3c088c442c 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/types.h> @@ -55,6 +56,15 @@ static int tmpfsfstype; /* + * tmpfs_mountcount is used to prevent module unloads while there is still + * state from a former mount hanging around. With forced umount support, the + * filesystem module must not be allowed to go away before the last + * VFS_FREEVFS() call has been made. Since this is just an atomic counter, + * there's no need for locking. + */ +static uint32_t tmpfs_mountcount; + +/* * tmpfs vfs operations. 
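The rewritten tmp_convnum() above accepts cascading k/m/g suffixes and a single trailing '%' that sizes the mount relative to the zone's swap cap or the system swap total, and the new tmp_convmode() parses an octal "mode=" mount option for the root directory (so a mount argument such as size=512m,mode=1777 or size=20% becomes possible). To make the suffix cascade concrete, here is a standalone userland analog of the conversion; it omits the percentage and page-rounding handling and its names are illustrative, not the kernel's.

#include <stdlib.h>
#include <stdint.h>
#include <errno.h>

/*
 * Convert "2m", "2mk" (= 2m * 1k = 2g), etc. to bytes, checking for
 * overflow at each multiplication.
 */
static int
convnum(const char *str, uint64_t *outp)
{
	char *c;
	uint64_t num, factor;

	errno = 0;
	num = strtoull(str, &c, 10);
	if (errno != 0 || c == str)
		return (EINVAL);

	for (; *c != '\0'; c++) {
		switch (*c) {
		case 'k': case 'K':
			factor = 1024ULL;
			break;
		case 'm': case 'M':
			factor = 1024ULL * 1024;
			break;
		case 'g': case 'G':
			factor = 1024ULL * 1024 * 1024;
			break;
		default:
			return (EINVAL);
		}
		if (num > UINT64_MAX / factor)
			return (EINVAL);
		num *= factor;
	}

	*outp = num;
	return (0);
}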
*/ static int tmpfsinit(int, char *); @@ -64,6 +74,7 @@ static int tmp_unmount(struct vfs *, int, struct cred *); static int tmp_root(struct vfs *, struct vnode **); static int tmp_statvfs(struct vfs *, struct statvfs64 *); static int tmp_vget(struct vfs *, struct vnode **, struct fid *); +static void tmp_freevfs(vfs_t *vfsp); /* * Loadable module wrapper @@ -76,7 +87,7 @@ static vfsdef_t vfw = { VFSDEF_VERSION, "tmpfs", tmpfsinit, - VSW_HASPROTO|VSW_STATS|VSW_ZMOUNT, + VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT, &tmpfs_proto_opttbl }; @@ -90,7 +101,8 @@ static mntopt_t tmpfs_options[] = { /* Option name Cancel Opt Arg Flags Data */ { MNTOPT_XATTR, xattr_cancel, NULL, MO_DEFAULT, NULL}, { MNTOPT_NOXATTR, noxattr_cancel, NULL, NULL, NULL}, - { "size", NULL, "0", MO_HASVALUE, NULL} + { "size", NULL, "0", MO_HASVALUE, NULL}, + { "mode", NULL, NULL, MO_HASVALUE, NULL} }; @@ -121,6 +133,14 @@ _fini() { int error; + /* + * If a forceably unmounted instance is still hanging around, we cannot + * allow the module to be unloaded because that would cause panics once + * the VFS framework decides it's time to call into VFS_FREEVFS(). + */ + if (tmpfs_mountcount) + return (EBUSY); + error = mod_remove(&modlinkage); if (error) return (error); @@ -139,14 +159,6 @@ _info(struct modinfo *modinfop) } /* - * The following are patchable variables limiting the amount of system - * resources tmpfs can use. - * - * tmpfs_maxkmem limits the amount of kernel kmem_alloc memory - * tmpfs can use for it's data structures (e.g. tmpnodes, directory entries) - * It is not determined by setting a hard limit but rather as a percentage of - * physical memory which is determined when tmpfs is first used in the system. - * * tmpfs_minfree is the minimum amount of swap space that tmpfs leaves for * the rest of the system. In other words, if the amount of free swap space * in the system (i.e. anoninfo.ani_free) drops below tmpfs_minfree, tmpfs @@ -155,9 +167,7 @@ _info(struct modinfo *modinfop) * There is also a per mount limit on the amount of swap space * (tmount.tm_anonmax) settable via a mount option. 
*/ -size_t tmpfs_maxkmem = 0; size_t tmpfs_minfree = 0; -size_t tmp_kmemspace; /* bytes of kernel heap used by all tmpfs */ static major_t tmpfs_major; static minor_t tmpfs_minor; @@ -176,6 +186,7 @@ tmpfsinit(int fstype, char *name) VFSNAME_ROOT, { .vfs_root = tmp_root }, VFSNAME_STATVFS, { .vfs_statvfs = tmp_statvfs }, VFSNAME_VGET, { .vfs_vget = tmp_vget }, + VFSNAME_FREEVFS, { .vfs_freevfs = tmp_freevfs }, NULL, NULL }; int error; @@ -210,27 +221,17 @@ tmpfsinit(int fstype, char *name) tmpfs_minfree = btopr(TMPMINFREE); } - /* - * The maximum amount of space tmpfs can allocate is - * TMPMAXPROCKMEM percent of kernel memory - */ - if (tmpfs_maxkmem == 0) - tmpfs_maxkmem = MAX(PAGESIZE, kmem_maxavail() / TMPMAXFRACKMEM); - if ((tmpfs_major = getudev()) == (major_t)-1) { cmn_err(CE_WARN, "tmpfsinit: Can't get unique device number."); tmpfs_major = 0; } mutex_init(&tmpfs_minor_lock, NULL, MUTEX_DEFAULT, NULL); + tmpfs_mountcount = 0; return (0); } static int -tmp_mount( - struct vfs *vfsp, - struct vnode *mvp, - struct mounta *uap, - struct cred *cr) +tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) { struct tmount *tm = NULL; struct tmpnode *tp; @@ -239,8 +240,9 @@ tmp_mount( pgcnt_t anonmax; struct vattr rattr; int got_attrs; - - char *sizestr; + boolean_t mode_arg = B_FALSE; + mode_t root_mode = 0777; + char *argstr; if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) return (error); @@ -249,7 +251,7 @@ tmp_mount( return (ENOTDIR); mutex_enter(&mvp->v_lock); - if ((uap->flags & MS_OVERLAY) == 0 && + if ((uap->flags & MS_REMOUNT) == 0 && (uap->flags & MS_OVERLAY) == 0 && (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { mutex_exit(&mvp->v_lock); return (EBUSY); @@ -275,18 +277,45 @@ tmp_mount( * tm_anonmax is set according to the mount arguments * if any. Otherwise, it is set to a maximum value. */ - if (vfs_optionisset(vfsp, "size", &sizestr)) { - if ((error = tmp_convnum(sizestr, &anonmax)) != 0) + if (vfs_optionisset(vfsp, "size", &argstr)) { + if ((error = tmp_convnum(argstr, &anonmax)) != 0) goto out; } else { anonmax = ULONG_MAX; } + /* + * The "mode" mount argument allows the operator to override the + * permissions of the root of the tmpfs mount. + */ + if (vfs_optionisset(vfsp, "mode", &argstr)) { + if ((error = tmp_convmode(argstr, &root_mode)) != 0) { + goto out; + } + mode_arg = B_TRUE; + } + if (error = pn_get(uap->dir, (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn)) goto out; - if ((tm = tmp_memalloc(sizeof (struct tmount), 0)) == NULL) { + if (uap->flags & MS_REMOUNT) { + tm = (struct tmount *)VFSTOTM(vfsp); + + /* + * If we change the size so its less than what is currently + * being used, we allow that. The file system will simply be + * full until enough files have been removed to get below the + * new max. 
+ */ + mutex_enter(&tm->tm_contents); + tm->tm_anonmax = anonmax; + mutex_exit(&tm->tm_contents); + goto out; + } + + if ((tm = kmem_zalloc(sizeof (struct tmount), + KM_NOSLEEP | KM_NORMALPRI)) == NULL) { pn_free(&dpn); error = ENOMEM; goto out; @@ -318,17 +347,17 @@ tmp_mount( vfsp->vfs_bsize = PAGESIZE; vfsp->vfs_flag |= VFS_NOTRUNC; vfs_make_fsid(&vfsp->vfs_fsid, tm->tm_dev, tmpfsfstype); - tm->tm_mntpath = tmp_memalloc(dpn.pn_pathlen + 1, TMP_MUSTHAVE); + tm->tm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP); (void) strcpy(tm->tm_mntpath, dpn.pn_path); /* * allocate and initialize root tmpnode structure */ bzero(&rattr, sizeof (struct vattr)); - rattr.va_mode = (mode_t)(S_IFDIR | 0777); /* XXX modes */ + rattr.va_mode = (mode_t)(S_IFDIR | root_mode); rattr.va_type = VDIR; rattr.va_rdev = 0; - tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE); + tp = kmem_zalloc(sizeof (struct tmpnode), KM_SLEEP); tmpnode_init(tm, tp, &rattr, cr); /* @@ -345,7 +374,14 @@ tmp_mount( * the previously set hardwired defaults to prevail. */ if (got_attrs == 0) { - tp->tn_mode = rattr.va_mode; + if (!mode_arg) { + /* + * Only use the underlying mount point for the + * mode if the "mode" mount argument was not + * provided. + */ + tp->tn_mode = rattr.va_mode; + } tp->tn_uid = rattr.va_uid; tp->tn_gid = rattr.va_gid; } @@ -366,6 +402,7 @@ tmp_mount( pn_free(&dpn); error = 0; + atomic_inc_32(&tmpfs_mountcount); out: if (error == 0) @@ -381,36 +418,107 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) struct tmpnode *tnp, *cancel; struct vnode *vp; int error; + uint_t cnt; + int i; if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) return (error); - /* - * forced unmount is not supported by this file system - * and thus, ENOTSUP, is being returned. - */ - if (flag & MS_FORCE) - return (ENOTSUP); - mutex_enter(&tm->tm_contents); /* - * If there are no open files, only the root node should have - * a reference count. + * In the normal unmount case (non-forced unmount), if there are no + * open files, only the root node should have a reference count. + * * With tm_contents held, nothing can be added or removed. * There may be some dirty pages. To prevent fsflush from * disrupting the unmount, put a hold on each node while scanning. * If we find a previously referenced node, undo the holds we have * placed and fail EBUSY. + * + * However, in the case of a forced umount, things are a bit different. + * An additional VFS_HOLD is added for each outstanding VN_HOLD to + * ensure that the file system is not cleaned up (tmp_freevfs) until + * the last vfs hold is dropped. This happens in tmp_inactive as the + * vnodes are released. Also, we can't add an additional VN_HOLD in + * this case since that would prevent tmp_inactive from ever being + * called. Finally, we do need to drop the zone ref now (zone_rele_ref) + * so that the zone is not blocked waiting for the final file system + * cleanup. */ tnp = tm->tm_rootnode; - if (TNTOV(tnp)->v_count > 1) { + + vp = TNTOV(tnp); + mutex_enter(&vp->v_lock); + cnt = vp->v_count; + if (flag & MS_FORCE) { + vfsp->vfs_flag |= VFS_UNMOUNTED; + /* Extra hold which we rele below when we drop the zone ref */ + VFS_HOLD(vfsp); + + for (i = 1; i < cnt; i++) + VFS_HOLD(vfsp); + + /* drop the mutex now because no one can find this mount */ + mutex_exit(&tm->tm_contents); + } else if (cnt > 1) { + mutex_exit(&vp->v_lock); mutex_exit(&tm->tm_contents); return (EBUSY); } + mutex_exit(&vp->v_lock); + /* + * Check for open files. 
An open file causes everything to unwind + * unless this is a forced umount. + */ for (tnp = tnp->tn_forw; tnp; tnp = tnp->tn_forw) { - if ((vp = TNTOV(tnp))->v_count > 0) { + vp = TNTOV(tnp); + mutex_enter(&vp->v_lock); + cnt = vp->v_count; + if (flag & MS_FORCE) { + for (i = 0; i < cnt; i++) + VFS_HOLD(vfsp); + + /* + * In the case of a forced umount don't add an + * additional VN_HOLD on the already held vnodes, like + * we do in the non-forced unmount case. If the + * cnt > 0, then the vnode already has at least one + * hold and we need tmp_inactive to get called when the + * last pre-existing hold on the node is released so + * that we can VFS_RELE the VFS holds we just added. + */ + if (cnt == 0) { + /* directly add VN_HOLD since have the lock */ + vp->v_count++; + } + + mutex_exit(&vp->v_lock); + + /* + * If the tmpnode has any pages associated with it + * (i.e. if it's a normal file with non-zero size), the + * tmpnode could still be discovered by pageout or + * fsflush via the page vnode pointers. To prevent this + * from interfering with the tmp_freevfs, truncate the + * tmpnode now. + */ + if (tnp->tn_size != 0 && tnp->tn_type == VREG) { + rw_enter(&tnp->tn_rwlock, RW_WRITER); + rw_enter(&tnp->tn_contents, RW_WRITER); + + (void) tmpnode_trunc(tm, tnp, 0); + + rw_exit(&tnp->tn_contents); + rw_exit(&tnp->tn_rwlock); + + ASSERT(tnp->tn_size == 0); + ASSERT(tnp->tn_nblocks == 0); + } + } else if (cnt > 0) { + /* An open file; unwind the holds we've been adding. */ + mutex_exit(&vp->v_lock); cancel = tm->tm_rootnode->tn_forw; while (cancel != tnp) { vp = TNTOV(cancel); @@ -420,14 +528,50 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) } mutex_exit(&tm->tm_contents); return (EBUSY); + } else { + /* directly add a VN_HOLD since we have the lock */ + vp->v_count++; + mutex_exit(&vp->v_lock); } - VN_HOLD(vp); } - /* - * We can drop the mutex now because no one can find this mount - */ - mutex_exit(&tm->tm_contents); + if (flag & MS_FORCE) { + /* + * Drop the zone ref now since we don't know how long it will + * be until the final vfs_rele is called by tmp_inactive. + */ + if (vfsp->vfs_zone) { + zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref, + ZONE_REF_VFS); + vfsp->vfs_zone = 0; + } + /* We can now drop the extra hold we added above. */ + VFS_RELE(vfsp); + } else { + /* + * For the non-forced case, we can drop the mutex now because + * no one can find this mount anymore + */ + vfsp->vfs_flag |= VFS_UNMOUNTED; + mutex_exit(&tm->tm_contents); + } + + return (0); +} + +/* + * Implementation of VFS_FREEVFS() to support forced umounts. This is called by + * the vfs framework after umount and the last VFS_RELE, to trigger the release + * of any resources still associated with the given vfs_t. We only add + * additional VFS_HOLDs during the forced umount case, so this is normally + * called immediately after tmp_umount. + */ +void +tmp_freevfs(vfs_t *vfsp) +{ + struct tmount *tm = (struct tmount *)VFSTOTM(vfsp); + struct tmpnode *tnp; + struct vnode *vp; /* * Free all kmemalloc'd and anonalloc'd memory associated with @@ -437,6 +581,16 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) * tmpnode_free which assumes that the directory entry has been * removed before the file. */ + + /* + * Now that we are tearing ourselves down we need to remove the + * UNMOUNTED flag. If we don't, we'll later hit a VN_RELE when we remove + * files from the system causing us to have a negative value. 
Doing this + * seems a bit better than trying to set a flag on the tmount that says + * we're tearing down. + */ + vfsp->vfs_flag &= ~VFS_UNMOUNTED; + /* * Remove all directory entries */ @@ -503,15 +657,16 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) ASSERT(tm->tm_mntpath); - tmp_memfree(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1); + kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1); ASSERT(tm->tm_anonmem == 0); mutex_destroy(&tm->tm_contents); mutex_destroy(&tm->tm_renamelck); - tmp_memfree(tm, sizeof (struct tmount)); + kmem_free(tm, sizeof (struct tmount)); - return (0); + /* Allow _fini() to succeed now */ + atomic_dec_32(&tmpfs_mountcount); } /* @@ -614,13 +769,7 @@ tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp) * available to tmpfs. This is fairly inaccurate since it doesn't * take into account the names stored in the directory entries. */ - if (tmpfs_maxkmem > tmp_kmemspace) - sbp->f_ffree = (tmpfs_maxkmem - tmp_kmemspace) / - (sizeof (struct tmpnode) + sizeof (struct tdirent)); - else - sbp->f_ffree = 0; - - sbp->f_files = tmpfs_maxkmem / + sbp->f_ffree = sbp->f_files = ptob(availrmem) / (sizeof (struct tmpnode) + sizeof (struct tdirent)); sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); (void) cmpldev(&d32, vfsp->vfs_dev); diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c index 61607a6dcc..464d638db0 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -584,6 +584,10 @@ tmp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred, struct tmount *tm = (struct tmount *)VTOTM(vp); int error; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + /* * We don't currently support reading non-regular files */ @@ -613,6 +617,10 @@ tmp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred, struct tmount *tm = (struct tmount *)VTOTM(vp); int error; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + /* * We don't currently support writing to non-regular files */ @@ -833,6 +841,9 @@ tmp_lookup( struct tmpnode *ntp = NULL; int error; + /* If the filesystem was umounted by force, return immediately. */ + if (dvp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); /* allow cd into @ dir */ if (flags & LOOKUP_XATTR) { @@ -871,8 +882,7 @@ tmp_lookup( return (error); } - xdp = tmp_memalloc(sizeof (struct tmpnode), - TMP_MUSTHAVE); + xdp = kmem_zalloc(sizeof (struct tmpnode), KM_SLEEP); tm = VTOTM(dvp); tmpnode_init(tm, xdp, &tp->tn_attr, NULL); /* @@ -1186,7 +1196,7 @@ tmp_rename( struct tmpnode *toparent; struct tmpnode *fromtp = NULL; /* source tmpnode */ struct tmount *tm = (struct tmount *)VTOTM(odvp); - int error; + int error, err; int samedir = 0; /* set if odvp == ndvp */ struct vnode *realvp; @@ -1262,15 +1272,6 @@ tmp_rename( error = 0; goto done; } - vnevent_rename_src(TNTOV(fromtp), odvp, onm, ct); - - /* - * Notify the target directory if not same as - * source directory. - */ - if (ndvp != odvp) { - vnevent_rename_dest_dir(ndvp, ct); - } /* * Unlink from source. 
@@ -1278,7 +1279,7 @@ tmp_rename( rw_enter(&fromparent->tn_rwlock, RW_WRITER); rw_enter(&fromtp->tn_rwlock, RW_WRITER); - error = tdirdelete(fromparent, fromtp, onm, DR_RENAME, cred); + error = err = tdirdelete(fromparent, fromtp, onm, DR_RENAME, cred); /* * The following handles the case where our source tmpnode was @@ -1293,6 +1294,12 @@ tmp_rename( rw_exit(&fromtp->tn_rwlock); rw_exit(&fromparent->tn_rwlock); + + if (err == 0) { + vnevent_rename_src(TNTOV(fromtp), odvp, onm, ct); + vnevent_rename_dest_dir(ndvp, TNTOV(fromtp), nnm, ct); + } + done: tmpnode_rele(fromtp); mutex_exit(&tm->tm_renamelck); @@ -1459,6 +1466,10 @@ tmp_readdir( int reclen; caddr_t outbuf; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + if (uiop->uio_loffset >= MAXOFF_T) { if (eofp) *eofp = 1; @@ -1597,7 +1608,7 @@ tmp_symlink( return (error); } len = strlen(tnm) + 1; - cp = tmp_memalloc(len, 0); + cp = kmem_alloc(len, KM_NOSLEEP | KM_NORMALPRI); if (cp == NULL) { tmpnode_rele(self); return (ENOSPC); @@ -1662,10 +1673,27 @@ top: * there's little to do -- just drop our hold. */ if (vp->v_count > 1 || tp->tn_nlink != 0) { - vp->v_count--; + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) { + /* + * Since the file system was forcibly unmounted, we can + * have a case (v_count == 1, tn_nlink != 0) where this + * file was open so we didn't add an extra hold on the + * file in tmp_unmount. We are counting on the + * interaction of the hold made in tmp_unmount and + * rele-ed in tmp_vfsfree so we need to be sure we + * don't decrement in this case. + */ + if (vp->v_count > 1) + vp->v_count--; + } else { + vp->v_count--; + } mutex_exit(&vp->v_lock); mutex_exit(&tp->tn_tlock); rw_exit(&tp->tn_rwlock); + /* If the filesystem was umounted by force, rele the vfs ref */ + if (tm->tm_vfsp->vfs_flag & VFS_UNMOUNTED) + VFS_RELE(tm->tm_vfsp); return; } @@ -1690,7 +1718,7 @@ top: goto top; } if (tp->tn_type == VLNK) - tmp_memfree(tp->tn_symlink, tp->tn_size + 1); + kmem_free(tp->tn_symlink, tp->tn_size + 1); } /* @@ -1724,7 +1752,11 @@ top: rw_destroy(&tp->tn_rwlock); mutex_destroy(&tp->tn_tlock); vn_free(TNTOV(tp)); - tmp_memfree(tp, sizeof (struct tmpnode)); + kmem_free(tp, sizeof (struct tmpnode)); + + /* If the filesystem was umounted by force, rele the vfs ref */ + if (tm->tm_vfsp->vfs_flag & VFS_UNMOUNTED) + VFS_RELE(tm->tm_vfsp); } /* ARGSUSED2 */ @@ -1846,6 +1878,10 @@ tmp_getapage( struct vnode *pvp; u_offset_t poff; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + if (protp != NULL) *protp = PROT_ALL; again: @@ -2067,6 +2103,10 @@ tmp_putapage( u_offset_t offset; u_offset_t tmpoff; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + ASSERT(PAGE_LOCKED(pp)); /* Kluster in tmp_klustsize chunks */ diff --git a/usr/src/uts/common/fs/udfs/udf_dir.c b/usr/src/uts/common/fs/udfs/udf_dir.c index c1e2c74a87..def046a0bf 100644 --- a/usr/src/uts/common/fs/udfs/udf_dir.c +++ b/usr/src/uts/common/fs/udfs/udf_dir.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
*/ #include <sys/types.h> @@ -562,9 +563,8 @@ out: namep, ctp); } - if (sdp != tdp) { - vnevent_rename_dest_dir(ITOV(tdp), ctp); - } + vnevent_rename_dest_dir(ITOV(tdp), ITOV(tip), + namep, ctp); } /* diff --git a/usr/src/uts/common/fs/udfs/udf_vnops.c b/usr/src/uts/common/fs/udfs/udf_vnops.c index 307d3987ed..2d8de23399 100644 --- a/usr/src/uts/common/fs/udfs/udf_vnops.c +++ b/usr/src/uts/common/fs/udfs/udf_vnops.c @@ -911,7 +911,7 @@ udf_rename( caller_context_t *ct, int flags) { - int32_t error = 0; + int32_t error = 0, err; struct udf_vfs *udf_vfsp; struct ud_inode *sip; /* source inode */ struct ud_inode *sdp, *tdp; /* source and target parent inode */ @@ -995,7 +995,6 @@ udf_rename( rw_exit(&tdp->i_rwlock); goto errout; } - vnevent_rename_src(ITOV(sip), sdvp, snm, ct); rw_exit(&tdp->i_rwlock); rw_enter(&sdp->i_rwlock, RW_WRITER); @@ -1006,11 +1005,15 @@ udf_rename( * If the entry has changed just forget about it. Release * the source inode. */ - if ((error = ud_dirremove(sdp, snm, sip, (struct vnode *)0, + if ((error = err = ud_dirremove(sdp, snm, sip, (struct vnode *)0, DR_RENAME, cr, ct)) == ENOENT) { error = 0; } rw_exit(&sdp->i_rwlock); + + if (err == 0) + vnevent_rename_src(ITOV(sip), sdvp, snm, ct); + errout: ITIMES(sdp); ITIMES(tdp); diff --git a/usr/src/uts/common/fs/ufs/ufs_vnops.c b/usr/src/uts/common/fs/ufs/ufs_vnops.c index fcffd952ed..c77872b11d 100644 --- a/usr/src/uts/common/fs/ufs/ufs_vnops.c +++ b/usr/src/uts/common/fs/ufs/ufs_vnops.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1984, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -3367,11 +3367,10 @@ ufs_rename( struct inode *ip = NULL; /* check inode */ struct inode *sdp; /* old (source) parent inode */ struct inode *tdp; /* new (target) parent inode */ - struct vnode *svp = NULL; /* source vnode */ struct vnode *tvp = NULL; /* target vnode, if it exists */ struct vnode *realvp; struct ufsvfs *ufsvfsp; - struct ulockfs *ulp; + struct ulockfs *ulp = NULL; struct ufs_slot slot; timestruc_t now; int error; @@ -3380,7 +3379,7 @@ ufs_rename( krwlock_t *first_lock; krwlock_t *second_lock; krwlock_t *reverse_lock; - int serr, terr; + int terr; sdp = VTOI(sdvp); slot.fbp = NULL; @@ -3389,34 +3388,13 @@ ufs_rename( if (VOP_REALVP(tdvp, &realvp, ct) == 0) tdvp = realvp; + /* Must do this before taking locks in case of DNLC miss */ terr = ufs_eventlookup(tdvp, tnm, cr, &tvp); - serr = ufs_eventlookup(sdvp, snm, cr, &svp); - - if ((serr == 0) && ((terr == 0) || (terr == ENOENT))) { - if (tvp != NULL) - vnevent_rename_dest(tvp, tdvp, tnm, ct); - - /* - * Notify the target directory of the rename event - * if source and target directories are not the same. - */ - if (sdvp != tdvp) - vnevent_rename_dest_dir(tdvp, ct); - - if (svp != NULL) - vnevent_rename_src(svp, sdvp, snm, ct); - } - - if (tvp != NULL) - VN_RELE(tvp); - - if (svp != NULL) - VN_RELE(svp); retry_rename: error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RENAME_MASK); if (error) - goto out; + goto unlock; if (ulp) TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_RENAME, @@ -3712,6 +3690,9 @@ retry_firstlock: goto errout; } + if (terr == 0 && tvp != NULL) + vnevent_rename_dest(tvp, tdvp, tnm, ct); + /* * Unlink the source. * Remove the source entry. 
ufs_dirremove() checks that the entry @@ -3723,6 +3704,9 @@ retry_firstlock: DR_RENAME, cr)) == ENOENT) error = 0; + vnevent_rename_src(ITOV(sip), sdvp, snm, ct); + vnevent_rename_dest_dir(tdvp, ITOV(sip), tnm, ct); + errout: if (slot.fbp) fbrelse(slot.fbp, S_OTHER); @@ -3732,15 +3716,17 @@ errout: rw_exit(&sdp->i_rwlock); } - VN_RELE(ITOV(sip)); - unlock: + if (tvp != NULL) + VN_RELE(tvp); + if (sip != NULL) + VN_RELE(ITOV(sip)); + if (ulp) { TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME, trans_size); ufs_lockfs_end(ulp); } -out: return (error); } diff --git a/usr/src/uts/common/fs/vfs.c b/usr/src/uts/common/fs/vfs.c index 6e93d056df..b680b1168c 100644 --- a/usr/src/uts/common/fs/vfs.c +++ b/usr/src/uts/common/fs/vfs.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -235,7 +235,8 @@ fsop_root(vfs_t *vfsp, vnode_t **vpp) * Make sure this root has a path. With lofs, it is possible to have * a NULL mountpoint. */ - if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) { + if (ret == 0 && vfsp->vfs_mntpt != NULL && + (*vpp)->v_path == vn_vpath_empty) { mntpt = vfs_getmntpoint(vfsp); vn_setpath_str(*vpp, refstr_value(mntpt), strlen(refstr_value(mntpt))); @@ -905,6 +906,7 @@ vfs_mountroot(void) vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab"); vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile"); vfs_mountfs("objfs", "objfs", OBJFS_ROOT); + vfs_mountfs("bootfs", "bootfs", "/system/boot"); if (getzoneid() == GLOBAL_ZONEID) { vfs_mountfs("sharefs", "sharefs", "/etc/dfs/sharetab"); @@ -3899,6 +3901,8 @@ vfs_to_modname(const char *vfstype) vfstype = "fdfs"; } else if (strncmp(vfstype, "nfs", 3) == 0) { vfstype = "nfs"; + } else if (strcmp(vfstype, "lxproc") == 0) { + vfstype = "lxprocfs"; } return (vfstype); diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c index 4abb040de0..9fb1ea702a 100644 --- a/usr/src/uts/common/fs/vnode.c +++ b/usr/src/uts/common/fs/vnode.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -66,6 +66,7 @@ #include <fs/fs_subr.h> #include <sys/taskq.h> #include <fs/fs_reparse.h> +#include <sys/time.h> /* Determine if this vnode is a file that is read-only */ #define ISROFILE(vp) \ @@ -102,6 +103,9 @@ kmutex_t vskstat_tree_lock; /* Global variable which enables/disables the vopstats collection */ int vopstats_enabled = 1; +/* Global used for empty/invalid v_path */ +char *vn_vpath_empty = ""; + /* * forward declarations for internal vnode specific data (vsd) */ @@ -200,6 +204,11 @@ static void (**vsd_destructor)(void *); cr = crgetmapped(cr); \ } +#define VOP_LATENCY_10MS 10000000 +#define VOP_LATENCY_100MS 100000000 +#define VOP_LATENCY_1S 1000000000 +#define VOP_LATENCY_10S 10000000000 + /* * Convert stat(2) formats to vnode types and vice versa. (Knows about * numerical order of S_IFMT and vnode types.) 
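The vnode.c hunks that follow retire NULL as the "no path" state for v_path in favor of the shared vn_vpath_empty sentinel, so consumers may always dereference the field and emptiness is tested by comparing against the sentinel pointer. The generic shape of that pattern, sketched in standalone C (the struct and function names here are illustrative, not the kernel's):

#include <stdlib.h>
#include <string.h>

/* One shared sentinel; the field is never NULL and is always printable. */
static char path_empty[] = "";

struct node {
	char *path;
};

static void
node_init(struct node *np)
{
	np->path = path_empty;
}

static void
node_set_path(struct node *np, const char *path)
{
	/* Only heap-allocated paths are freed; the sentinel never is. */
	if (np->path != path_empty)
		free(np->path);
	np->path = strdup(path);
	if (np->path == NULL)
		np->path = path_empty;
}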
@@ -2284,7 +2293,7 @@ vn_cache_constructor(void *buf, void *cdrarg, int kmflags) cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL); rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL); vp->v_femhead = NULL; /* Must be done before vn_reinit() */ - vp->v_path = NULL; + vp->v_path = vn_vpath_empty; vp->v_mpssdata = NULL; vp->v_vsd = NULL; vp->v_fopdata = NULL; @@ -2331,6 +2340,7 @@ void vn_recycle(vnode_t *vp) { ASSERT(vp->v_pages == NULL); + VERIFY(vp->v_path != NULL); /* * XXX - This really belongs in vn_reinit(), but we have some issues @@ -2353,9 +2363,9 @@ vn_recycle(vnode_t *vp) kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead))); vp->v_femhead = NULL; } - if (vp->v_path) { + if (vp->v_path != vn_vpath_empty) { kmem_free(vp->v_path, strlen(vp->v_path) + 1); - vp->v_path = NULL; + vp->v_path = vn_vpath_empty; } if (vp->v_fopdata != NULL) { @@ -2427,9 +2437,10 @@ vn_free(vnode_t *vp) */ ASSERT((vp->v_count == 0) || (vp->v_count == 1)); ASSERT(vp->v_count_dnlc == 0); - if (vp->v_path != NULL) { + VERIFY(vp->v_path != NULL); + if (vp->v_path != vn_vpath_empty) { kmem_free(vp->v_path, strlen(vp->v_path) + 1); - vp->v_path = NULL; + vp->v_path = vn_vpath_empty; } /* If FEM was in use, make sure everything gets cleaned up */ @@ -2516,6 +2527,7 @@ vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct) if (vp == NULL || vp->v_femhead == NULL) { return; } + (void) VOP_VNEVENT(dvp, VE_RENAME_SRC_DIR, vp, name, ct); (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct); } @@ -2530,12 +2542,13 @@ vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name, } void -vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct) +vnevent_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name, + caller_context_t *ct) { if (vp == NULL || vp->v_femhead == NULL) { return; } - (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct); + (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, nvp, name, ct); } void @@ -2951,7 +2964,7 @@ vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp, * the potential for deadlock. 
*/ mutex_enter(&base->v_lock); - if (base->v_path == NULL) { + if (base->v_path == vn_vpath_empty) { mutex_exit(&base->v_lock); return; } @@ -2978,7 +2991,8 @@ vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp, rpath = kmem_alloc(rpathalloc, KM_SLEEP); mutex_enter(&base->v_lock); - if (base->v_path == NULL || strlen(base->v_path) != rpathlen) { + if (base->v_path == vn_vpath_empty || + strlen(base->v_path) != rpathlen) { mutex_exit(&base->v_lock); kmem_free(rpath, rpathalloc); return; @@ -2992,7 +3006,7 @@ vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp, rpath[rpathlen + plen] = '\0'; mutex_enter(&vp->v_lock); - if (vp->v_path != NULL) { + if (vp->v_path != vn_vpath_empty) { mutex_exit(&vp->v_lock); kmem_free(rpath, rpathalloc); } else { @@ -3012,7 +3026,7 @@ vn_setpath_str(struct vnode *vp, const char *str, size_t len) char *buf = kmem_alloc(len + 1, KM_SLEEP); mutex_enter(&vp->v_lock); - if (vp->v_path != NULL) { + if (vp->v_path != vn_vpath_empty) { mutex_exit(&vp->v_lock); kmem_free(buf, len + 1); return; @@ -3036,10 +3050,10 @@ vn_renamepath(vnode_t *dvp, vnode_t *vp, const char *nm, size_t len) mutex_enter(&vp->v_lock); tmp = vp->v_path; - vp->v_path = NULL; + vp->v_path = vn_vpath_empty; mutex_exit(&vp->v_lock); vn_setpath(rootdir, dvp, vp, nm, len); - if (tmp != NULL) + if (tmp != vn_vpath_empty) kmem_free(tmp, strlen(tmp) + 1); } @@ -3054,7 +3068,7 @@ vn_copypath(struct vnode *src, struct vnode *dst) int alloc; mutex_enter(&src->v_lock); - if (src->v_path == NULL) { + if (src->v_path == vn_vpath_empty) { mutex_exit(&src->v_lock); return; } @@ -3064,7 +3078,7 @@ vn_copypath(struct vnode *src, struct vnode *dst) mutex_exit(&src->v_lock); buf = kmem_alloc(alloc, KM_SLEEP); mutex_enter(&src->v_lock); - if (src->v_path == NULL || strlen(src->v_path) + 1 != alloc) { + if (src->v_path == vn_vpath_empty || strlen(src->v_path) + 1 != alloc) { mutex_exit(&src->v_lock); kmem_free(buf, alloc); return; @@ -3073,7 +3087,7 @@ vn_copypath(struct vnode *src, struct vnode *dst) mutex_exit(&src->v_lock); mutex_enter(&dst->v_lock); - if (dst->v_path != NULL) { + if (dst->v_path != vn_vpath_empty) { mutex_exit(&dst->v_lock); kmem_free(buf, alloc); return; @@ -3231,14 +3245,57 @@ fop_read( cred_t *cr, caller_context_t *ct) { - int err; ssize_t resid_start = uiop->uio_resid; + zone_t *zonep = curzone; + zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats; + + hrtime_t start, lat; + ssize_t len; + int err; + + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + start = gethrtime(); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_runq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + } VOPXID_MAP_CR(vp, cr); err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct); - VOPSTATS_UPDATE_IO(vp, read, - read_bytes, (resid_start - uiop->uio_resid)); + len = resid_start - uiop->uio_resid; + + VOPSTATS_UPDATE_IO(vp, read, read_bytes, len); + + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.reads++; + zonep->zone_vfs_rwstats.nread += len; + kstat_runq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + if (lat < VOP_LATENCY_100MS) + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + 
atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + } + return (err); } @@ -3250,14 +3307,62 @@ fop_write( cred_t *cr, caller_context_t *ct) { - int err; ssize_t resid_start = uiop->uio_resid; + zone_t *zonep = curzone; + zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats; + + hrtime_t start, lat; + ssize_t len; + int err; + + /* + * For the purposes of VFS kstat consumers, the "waitq" calculation is + * repurposed as the active queue for VFS write operations. There's no + * actual wait queue for VFS operations. + */ + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + start = gethrtime(); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_waitq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + } VOPXID_MAP_CR(vp, cr); err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct); - VOPSTATS_UPDATE_IO(vp, write, - write_bytes, (resid_start - uiop->uio_resid)); + len = resid_start - uiop->uio_resid; + + VOPSTATS_UPDATE_IO(vp, write, write_bytes, len); + + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.writes++; + zonep->zone_vfs_rwstats.nwritten += len; + kstat_waitq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + if (lat < VOP_LATENCY_100MS) + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + } + return (err); } @@ -3421,7 +3526,7 @@ fop_lookup( } if (ret == 0 && *vpp) { VOPSTATS_UPDATE(*vpp, lookup); - if ((*vpp)->v_path == NULL) { + if ((*vpp)->v_path == vn_vpath_empty) { vn_setpath(rootdir, dvp, *vpp, nm, strlen(nm)); } } @@ -3463,7 +3568,7 @@ fop_create( (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp); if (ret == 0 && *vpp) { VOPSTATS_UPDATE(*vpp, create); - if ((*vpp)->v_path == NULL) { + if ((*vpp)->v_path == vn_vpath_empty) { vn_setpath(rootdir, dvp, *vpp, name, strlen(name)); } } @@ -3585,7 +3690,7 @@ fop_mkdir( (dvp, dirname, vap, vpp, cr, ct, flags, vsecp); if (ret == 0 && *vpp) { VOPSTATS_UPDATE(*vpp, mkdir); - if ((*vpp)->v_path == NULL) { + if ((*vpp)->v_path == vn_vpath_empty) { vn_setpath(rootdir, dvp, *vpp, dirname, strlen(dirname)); } diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 09d2e1dd8f..d2b584cf02 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -129,6 +129,7 @@ #include <sys/vdev.h> #include <sys/vdev_impl.h> #include <sys/dsl_pool.h> +#include <sys/zfs_zone.h> #include <sys/multilist.h> #ifdef _KERNEL #include <sys/vmsystm.h> @@ -4277,6 +4278,14 @@ top: rzio = zio_read(pio, spa, bp, buf->b_data, size, arc_read_done, buf, priority, zio_flags, zb); + /* + * At this point, this read I/O has 
already missed in the ARC + * and will be going through to the disk. The I/O throttle + * should delay this I/O if this zone is using more than its I/O + * priority allows. + */ + zfs_zone_io_throttle(ZFS_ZONE_IOP_READ); + if (*arc_flags & ARC_FLAG_WAIT) return (zio_wait(rzio)); diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index 465dfd08b2..4b644f7479 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -617,8 +617,19 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); if (bonuslen < DN_MAX_BONUSLEN) bzero(db->db.db_data, DN_MAX_BONUSLEN); - if (bonuslen) - bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); + + if (bonuslen) { + /* + * Absent byzantine on-disk corruption, we fully expect + * our bonuslen to be no more than DN_MAX_BONUSLEN -- + * but we nonetheless explicitly clamp it on the bcopy() + * to prevent any on-disk corruption from becoming + * rampant in-kernel corruption. + */ + bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, + MIN(bonuslen, DN_MAX_BONUSLEN)); + } + DB_DNODE_EXIT(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index 43bcfee91c..bfe5cda5e4 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -1799,7 +1799,6 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) if (!zio_checksum_table[checksum].ci_dedup) dedup_verify = B_TRUE; } - /* * Enable nopwrite if we have a cryptographically secure * checksum that has no known collisions (i.e. SHA-256) diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c index 56ce2f0c27..c2b1dca1c0 100644 --- a/usr/src/uts/common/fs/zfs/dmu_tx.c +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c @@ -38,11 +38,11 @@ #include <sys/sa_impl.h> #include <sys/zfs_context.h> #include <sys/varargs.h> +#include <sys/zfs_zone.h> typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, uint64_t arg1, uint64_t arg2); - dmu_tx_t * dmu_tx_create_dd(dsl_dir_t *dd) { @@ -223,6 +223,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) if (len == 0) return; + zfs_zone_io_throttle(ZFS_ZONE_IOP_LOGICAL_WRITE); + min_bs = SPA_MINBLOCKSHIFT; max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1; min_ibs = DN_MIN_INDBLKSHIFT; diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c index 6a20ab3ca2..1222b8933a 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dir.c +++ b/usr/src/uts/common/fs/zfs/dsl_dir.c @@ -42,6 +42,7 @@ #include <sys/zio.h> #include <sys/arc.h> #include <sys/sunddi.h> +#include <sys/zfs_zone.h> #include <sys/zfeature.h> #include <sys/policy.h> #include <sys/zfs_znode.h> @@ -1266,7 +1267,7 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, * locks are held. 
*/ txg_delay(dd->dd_pool, tx->tx_txg, - MSEC2NSEC(10), MSEC2NSEC(10)); + zfs_zone_txg_delay(), MSEC2NSEC(10)); err = SET_ERROR(ERESTART); } } diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c index 8c5d820ede..8ad7c789d3 100644 --- a/usr/src/uts/common/fs/zfs/dsl_pool.c +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -42,6 +42,7 @@ #include <sys/zfs_znode.h> #include <sys/spa_impl.h> #include <sys/dsl_deadlist.h> +#include <sys/zfs_zone.h> #include <sys/bptree.h> #include <sys/zfeature.h> #include <sys/zil_impl.h> diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 9ba9fd3841..02cccae29d 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -63,6 +63,11 @@ uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ int zfs_condense_pct = 200; /* + * Never condense any space map. This is for debugging/recovery only. + */ +int zfs_condense_never = 0; + +/* * Condensing a metaslab is not guaranteed to actually reduce the amount of * space used on disk. In particular, a space map uses data in increments of * MAX(1 << ashift, space_map_blksize), so a metaslab might use the @@ -1656,6 +1661,9 @@ metaslab_should_condense(metaslab_t *msp) ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); + if (zfs_condense_never != 0) + return (B_FALSE); + /* * Use the ms_size_tree range tree, which is ordered by size, to * obtain the largest segment in the free tree. We always condense diff --git a/usr/src/uts/common/fs/zfs/sa.c b/usr/src/uts/common/fs/zfs/sa.c index 0b5b37f5fb..5a2602271b 100644 --- a/usr/src/uts/common/fs/zfs/sa.c +++ b/usr/src/uts/common/fs/zfs/sa.c @@ -24,6 +24,7 @@ * Portions Copyright 2011 iXsystems, Inc * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2015 Joyent, Inc. All rights reserved. */ #include <sys/zfs_context.h> @@ -406,15 +407,18 @@ sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, { sa_os_t *sa = os->os_sa; sa_lot_t *tb, *findtb; - int i; + int i, size; avl_index_t loc; ASSERT(MUTEX_HELD(&sa->sa_lock)); tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP); tb->lot_attr_count = attr_count; - tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, - KM_SLEEP); - bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count); + + if ((size = sizeof (sa_attr_type_t) * attr_count) != 0) { + tb->lot_attrs = kmem_alloc(size, KM_SLEEP); + bcopy(attrs, tb->lot_attrs, size); + } + tb->lot_num = lot_num; tb->lot_hash = hash; tb->lot_instance = 0; diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 17a18a3199..bde9b0a7ab 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -116,6 +116,7 @@ struct vdev_queue { avl_tree_t vq_read_offset_tree; avl_tree_t vq_write_offset_tree; uint64_t vq_last_offset; + zoneid_t vq_last_zone_id; hrtime_t vq_io_complete_ts; /* time last i/o completed */ kmutex_t vq_lock; }; diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_zone.h b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h new file mode 100644 index 0000000000..f1431b3f55 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2015, Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_FS_ZFS_ZONE_H +#define _SYS_FS_ZFS_ZONE_H + +#ifdef _KERNEL +#include <sys/isa_defs.h> +#include <sys/types32.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + ZFS_ZONE_IOP_READ = 0, + ZFS_ZONE_IOP_WRITE, + ZFS_ZONE_IOP_LOGICAL_WRITE, +} zfs_zone_iop_type_t; + +extern void zfs_zone_io_throttle(zfs_zone_iop_type_t); + +extern void zfs_zone_zio_init(zio_t *); +extern void zfs_zone_zio_start(zio_t *); +extern void zfs_zone_zio_done(zio_t *); +extern void zfs_zone_zio_dequeue(zio_t *); +extern void zfs_zone_zio_enqueue(zio_t *); +extern void zfs_zone_report_txg_sync(void *); +extern hrtime_t zfs_zone_txg_delay(); +#ifdef _KERNEL +extern zio_t *zfs_zone_schedule(vdev_queue_t *, zio_priority_t, avl_index_t, + avl_tree_t *); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_ZONE_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index 198dc92387..78c9355438 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -421,7 +421,8 @@ struct zio { const zio_vsd_ops_t *io_vsd_ops; uint64_t io_offset; - hrtime_t io_timestamp; + hrtime_t io_timestamp; /* time I/O entered zio pipeline */ + hrtime_t io_dispatched; /* time I/O was dispatched to disk */ avl_node_t io_queue_node; avl_node_t io_offset_node; @@ -450,6 +451,7 @@ struct zio { zio_cksum_report_t *io_cksum_report; uint64_t io_ena; + zoneid_t io_zoneid; /* zone which originated this I/O */ /* Taskq dispatching state */ taskq_ent_t io_tqent; }; diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c index 191259e75b..915c9bb4b2 100644 --- a/usr/src/uts/common/fs/zfs/txg.c +++ b/usr/src/uts/common/fs/zfs/txg.c @@ -31,6 +31,7 @@ #include <sys/dsl_pool.h> #include <sys/dsl_scan.h> #include <sys/callb.h> +#include <sys/zfs_zone.h> /* * ZFS Transaction Groups @@ -506,6 +507,8 @@ txg_sync_thread(dsl_pool_t *dp) txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); mutex_exit(&tx->tx_sync_lock); + zfs_zone_report_txg_sync(dp); + start = ddi_get_lbolt(); spa_sync(spa, txg); delta = ddi_get_lbolt() - start; diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index ed4a8b773b..e4cc42452a 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -21,11 +21,13 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 Joyent, Inc. All rights reserved. 
*/ #include <sys/zfs_context.h> +#include <sys/zfs_zone.h> #include <sys/spa_impl.h> #include <sys/refcount.h> #include <sys/vdev_disk.h> @@ -44,6 +46,11 @@ extern ldi_ident_t zfs_li; static void vdev_disk_close(vdev_t *); +typedef struct vdev_disk_buf { + buf_t vdb_buf; + zio_t *vdb_io; +} vdev_disk_buf_t; + typedef struct vdev_disk_ldi_cb { list_node_t lcb_next; ldi_callback_id_t lcb_id; @@ -127,6 +134,8 @@ vdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie, int ldi_result, void *arg, void *ev_data) { vdev_t *vd = (vdev_t *)arg; + vdev_disk_t *dvd = vd->vdev_tsd; + vdev_disk_ldi_cb_t *lcb; /* * Ignore events other than offline. @@ -586,6 +595,7 @@ static void vdev_disk_close(vdev_t *vd) { vdev_disk_t *dvd = vd->vdev_tsd; + vdev_disk_ldi_cb_t *lcb; if (vd->vdev_reopening || dvd == NULL) return; @@ -812,6 +822,8 @@ vdev_disk_io_start(zio_t *zio) bp->b_bufsize = zio->io_size; bp->b_iodone = (int (*)())vdev_disk_io_intr; + zfs_zone_zio_start(zio); + /* ldi_strategy() will return non-zero only on programming errors */ VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0); } @@ -821,6 +833,8 @@ vdev_disk_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; + zfs_zone_zio_done(zio); + /* * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if * the device has been removed. If this is the case, then we trigger an diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index 79d6e13b3b..4c02013405 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2013, Joyent, Inc. All rights reserved. */ /* @@ -33,6 +34,7 @@ #include <sys/zio.h> #include <sys/avl.h> #include <sys/dsl_pool.h> +#include <sys/zfs_zone.h> /* * ZFS I/O Scheduler @@ -141,7 +143,7 @@ uint32_t zfs_vdev_sync_write_min_active = 10; uint32_t zfs_vdev_sync_write_max_active = 10; uint32_t zfs_vdev_async_read_min_active = 1; uint32_t zfs_vdev_async_read_max_active = 3; -uint32_t zfs_vdev_async_write_min_active = 1; +uint32_t zfs_vdev_async_write_min_active = 3; uint32_t zfs_vdev_async_write_max_active = 10; uint32_t zfs_vdev_scrub_min_active = 1; uint32_t zfs_vdev_scrub_max_active = 2; @@ -237,6 +239,8 @@ vdev_queue_init(vdev_t *vd) vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); + vq->vq_last_zone_id = 0; + for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { int (*compfn) (const void *, const void *); @@ -274,6 +278,7 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + zfs_zone_zio_enqueue(zio); avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); avl_add(vdev_queue_type_tree(vq, zio->io_type), zio); @@ -289,6 +294,7 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + zfs_zone_zio_dequeue(zio); avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio); @@ -648,7 +654,11 @@ again: search.io_timestamp = 0; search.io_offset = vq->vq_last_offset + 1; VERIFY3P(avl_find(tree, &search, &idx), ==, NULL); +#ifdef _KERNEL + zio = zfs_zone_schedule(vq, p, idx, tree); +#else zio = avl_nearest(tree, idx, AVL_AFTER); +#endif if (zio == NULL) zio = avl_first(tree); ASSERT3U(zio->io_priority, ==, p); diff --git a/usr/src/uts/common/fs/zfs/zfs_dir.c 
b/usr/src/uts/common/fs/zfs/zfs_dir.c index bd7424b55b..2adb297937 100644 --- a/usr/src/uts/common/fs/zfs/zfs_dir.c +++ b/usr/src/uts/common/fs/zfs/zfs_dir.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. */ #include <sys/types.h> @@ -847,9 +848,9 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, if (zp->z_links <= zp_is_dir) { zfs_panic_recover("zfs: link count on %s is %u, " "should be at least %u", - zp->z_vnode->v_path ? zp->z_vnode->v_path : - "<unknown>", (int)zp->z_links, - zp_is_dir + 1); + zp->z_vnode->v_path != vn_vpath_empty ? + zp->z_vnode->v_path : "<unknown>", + (int)zp->z_links, zp_is_dir + 1); zp->z_links = zp_is_dir + 1; } if (--zp->z_links == zp_is_dir) { diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index 692b49611d..47b0b6f650 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -22,8 +22,8 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska - * Copyright 2015 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. @@ -611,9 +611,10 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, * Check permissions for special properties. */ switch (prop) { + case ZFS_PROP_DEDUP: case ZFS_PROP_ZONED: /* - * Disallow setting of 'zoned' from within a local zone. + * Disallow setting these properties from within a local zone. 
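+ * (Dedup is grouped with 'zoned' here most likely because enabling
+ * deduplication has pool-wide memory and performance costs that a
+ * non-global zone should not be able to impose on the rest of the system.)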
*/ if (!INGLOBALZONE(curproc)) return (SET_ERROR(EPERM)); @@ -943,6 +944,9 @@ zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; + if (secpolicy_fs_import(cr) != 0) + return (set_errno(EPERM)); + if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_RECEIVE, cr)) != 0) return (error); @@ -2034,7 +2038,8 @@ zfs_ioc_vdev_setfru(zfs_cmd_t *zc) } static int -zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) +zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os, + boolean_t cachedpropsonly) { int error = 0; nvlist_t *nv; @@ -2052,7 +2057,8 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) * XXX reading with out owning */ if (!zc->zc_objset_stats.dds_inconsistent && - dmu_objset_type(os) == DMU_OST_ZVOL) { + dmu_objset_type(os) == DMU_OST_ZVOL && + !cachedpropsonly) { error = zvol_get_stats(os, nv); if (error == EIO) return (error); @@ -2079,11 +2085,24 @@ static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { objset_t *os; + nvlist_t *nvl = NULL; + boolean_t cachedpropsonly = B_FALSE; int error; + if (zc->zc_nvlist_src != NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl) != 0)) + return (error); + + if (nvl != NULL) { + (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly", + &cachedpropsonly); + nvlist_free(nvl); + } + error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error == 0) { - error = zfs_ioc_objset_stats_impl(zc, os); + error = zfs_ioc_objset_stats_impl(zc, os, cachedpropsonly); dmu_objset_rele(os, FTAG); } @@ -2277,8 +2296,21 @@ static int zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) { objset_t *os; + nvlist_t *nvl = NULL; + boolean_t cachedpropsonly = B_FALSE; int error; + if (zc->zc_nvlist_src != NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl) != 0)) + return (error); + + if (nvl != NULL) { + (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly", + &cachedpropsonly); + nvlist_free(nvl); + } + error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) { return (error == ENOENT ? 
ESRCH : error); @@ -2307,8 +2339,10 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) objset_t *ossnap; error = dmu_objset_from_ds(ds, &ossnap); - if (error == 0) - error = zfs_ioc_objset_stats_impl(zc, ossnap); + if (error == 0) { + error = zfs_ioc_objset_stats_impl(zc, + ossnap, cachedpropsonly); + } dsl_dataset_rele(ds, FTAG); } } else if (error == ENOENT) { @@ -3018,6 +3052,7 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; + int error; ASSERT(zplprops != NULL); @@ -3061,8 +3096,9 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); - if (norm == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0); + if (norm == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); @@ -3071,13 +3107,15 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, */ if (norm) u8 = 1; - if (u8 == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0); + if (u8 == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); - if (sense == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0); + if (sense == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c index 8cf83b399a..f9b7986c56 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -1950,6 +1951,17 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) if (zfsvfs->z_ctldir != NULL) zfsctl_destroy(zfsvfs); + /* + * If we're doing a forced unmount on a dataset which still has + * references and is in a zone, then we need to cleanup the zone + * reference at this point or else the zone will never be able to + * shutdown. + */ + if ((fflag & MS_FORCE) && vfsp->vfs_count > 1 && vfsp->vfs_zone) { + zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref, ZONE_REF_VFS); + vfsp->vfs_zone = NULL; + } + return (0); } diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c index bf75097d2a..ddec59ffc9 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vnops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c @@ -22,11 +22,16 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. All rights reserved. */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ +/* + * Copyright (c) 2013, Joyent, Inc. All rights reserved. 
+ */ + #include <sys/types.h> #include <sys/param.h> #include <sys/time.h> @@ -662,6 +667,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) rl_t *rl; int max_blksz = zfsvfs->z_max_blksz; int error = 0; + int prev_error; arc_buf_t *abuf; iovec_t *aiov = NULL; xuio_t *xuio = NULL; @@ -683,6 +689,17 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) limit = MAXOFFSET_T; + /* + * Pre-fault the pages to ensure slow (eg NFS) pages + * don't hold up txg. + * Skip this if uio contains loaned arc_buf. + */ + if ((uio->uio_extflg == UIO_XUIO) && + (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) + xuio = (xuio_t *)uio; + else + uio_prefaultpages(n, uio); + ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); @@ -735,17 +752,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) } /* - * Pre-fault the pages to ensure slow (eg NFS) pages - * don't hold up txg. - * Skip this if uio contains loaned arc_buf. - */ - if ((uio->uio_extflg == UIO_XUIO) && - (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) - xuio = (xuio_t *)uio; - else - uio_prefaultpages(MIN(n, max_blksz), uio); - - /* * If in append mode, set the io offset pointer to eof. */ if (ioflag & FAPPEND) { @@ -966,7 +972,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) while ((end_size = zp->z_size) < uio->uio_loffset) { (void) atomic_cas_64(&zp->z_size, end_size, uio->uio_loffset); - ASSERT(error == 0); } /* * If we are replaying and eof is non zero then force @@ -976,18 +981,20 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) zp->z_size = zfsvfs->z_replay_eof; + /* + * Keep track of a possible pre-existing error from a partial + * write via dmu_write_uio_dbuf above. + */ + prev_error = error; error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); dmu_tx_commit(tx); - if (error != 0) + if (prev_error != 0 || error != 0) break; ASSERT(tx_bytes == nbytes); n -= nbytes; - - if (!xuio && n > 0) - uio_prefaultpages(MIN(n, max_blksz), uio); } zfs_range_unlock(rl); @@ -3659,18 +3666,6 @@ top: } } - vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); - if (tzp) - vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct); - - /* - * notify the target directory if it is not the same - * as source directory. 
- */ - if (tdvp != sdvp) { - vnevent_rename_dest_dir(tdvp, ct); - } - tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); @@ -3711,8 +3706,12 @@ top: return (error); } - if (tzp) /* Attempt to remove the existing target */ + if (tzp) { + /* Attempt to remove the existing target */ error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); + if (error == 0) + vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct); + } if (error == 0) { error = zfs_link_create(tdl, szp, tx, ZRENAMING); @@ -3754,6 +3753,11 @@ top: } dmu_tx_commit(tx); + + if (error == 0) { + vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); + vnevent_rename_dest_dir(tdvp, ZTOV(szp), tnm, ct); + } out: if (zl != NULL) zfs_rename_unlock(&zl); @@ -4244,6 +4248,8 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); + err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); } dmu_tx_commit(tx); @@ -4779,10 +4785,6 @@ zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages); atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages); - if ((flags & MAP_SHARED) && (prot & PROT_WRITE) && - vn_has_cached_data(vp)) - (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct); - return (0); } diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c new file mode 100644 index 0000000000..4861c64f8e --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs_zone.c @@ -0,0 +1,1336 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015, Joyent, Inc. All rights reserved. + */ + +/* + * The ZFS/Zone I/O throttle and scheduler attempts to ensure fair access to + * ZFS I/O resources for each zone. + * + * I/O contention can be major pain point on a multi-tenant system. A single + * zone can issue a stream of I/O operations, usually synchronous writes, which + * disrupt I/O performance for all other zones. This problem is further + * exacerbated by ZFS, which buffers all asynchronous writes in a single TXG, + * a set of blocks which are atomically synced to disk. The process of + * syncing a TXG can occupy all of a device's I/O bandwidth, thereby starving + * out any pending read operations. + * + * There are two facets to this capability; the throttle and the scheduler. + * + * Throttle + * + * The requirements on the throttle are: + * + * 1) Ensure consistent and predictable I/O latency across all zones. + * 2) Sequential and random workloads have very different characteristics, + * so it is a non-starter to track IOPS or throughput. + * 3) A zone should be able to use the full disk bandwidth if no other zone + * is actively using the disk. + * + * The throttle has two components: one to track and account for each zone's + * I/O requests, and another to throttle each zone's operations when it + * exceeds its fair share of disk I/O. 
When the throttle detects that a zone is + * consuming more than is appropriate, each read or write system call is + * delayed by up to 100 microseconds, which we've found is sufficient to allow + * other zones to interleave I/O requests during those delays. + * + * Note: The throttle will delay each logical I/O (as opposed to the physical + * I/O which will likely be issued asynchronously), so it may be easier to + * think of the I/O throttle delaying each read/write syscall instead of the + * actual I/O operation. For each zone, the throttle tracks an ongoing average + * of read and write operations performed to determine the overall I/O + * utilization for each zone. + * + * The throttle calculates a I/O utilization metric for each zone using the + * following formula: + * + * (# of read syscalls) x (Average read latency) + + * (# of write syscalls) x (Average write latency) + * + * Once each zone has its utilization metric, the I/O throttle will compare I/O + * utilization across all zones, and if a zone has a higher-than-average I/O + * utilization, system calls from that zone are throttled. That is, if one + * zone has a much higher utilization, that zone's delay is increased by 5 + * microseconds, up to a maximum of 100 microseconds. Conversely, if a zone is + * already throttled and has a lower utilization than average, its delay will + * be lowered by 5 microseconds. + * + * The throttle calculation is driven by IO activity, but since IO does not + * happen at fixed intervals, timestamps are used to track when the last update + * was made and to drive recalculation. + * + * The throttle recalculates each zone's I/O usage and throttle delay (if any) + * on the zfs_zone_adjust_time interval. Overall I/O latency is maintained as + * a decayed average which is updated on the zfs_zone_sys_avg_cycle interval. + * + * Scheduler + * + * The I/O scheduler manages the vdev queues – the queues of pending I/Os to + * issue to the disks. It only makes scheduling decisions for the two + * synchronous I/O queues (read & write). + * + * The scheduler maintains how many I/Os in the queue are from each zone, and + * if one zone has a disproportionately large number of I/Os in the queue, the + * scheduler will allow certain I/Os from the underutilized zones to be "bumped" + * and pulled from the middle of the queue. This bump allows zones with a small + * number of I/Os (so small they may not even be taken into account by the + * throttle) to complete quickly instead of waiting behind dozens of I/Os from + * other zones. + */ + +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zfs_zone.h> + +#ifndef _KERNEL + +/* + * Stubs for when compiling for user-land. + */ + +void +zfs_zone_io_throttle(zfs_zone_iop_type_t type) +{ +} + +void +zfs_zone_zio_init(zio_t *zp) +{ +} + +void +zfs_zone_zio_start(zio_t *zp) +{ +} + +void +zfs_zone_zio_done(zio_t *zp) +{ +} + +void +zfs_zone_zio_dequeue(zio_t *zp) +{ +} + +void +zfs_zone_zio_enqueue(zio_t *zp) +{ +} + +/*ARGSUSED*/ +void +zfs_zone_report_txg_sync(void *dp) +{ +} + +hrtime_t +zfs_zone_txg_delay() +{ + return (MSEC2NSEC(10)); +} + +#else + +/* + * The real code. 
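+ *
+ * As a rough worked example of the utilization metric described in the block
+ * comment above (hypothetical numbers, not taken from this change): a zone
+ * that issued 1000 reads averaging 2000 usec and 200 writes averaging 500 usec
+ * over the last window scores (1000 x 2000) + (200 x 500) = 2,100,000. If that
+ * is above the zone's fair share while the disk is busy, its per-syscall delay
+ * is stepped up by zfs_zone_delay_step (5 usec) toward zfs_zone_delay_ceiling
+ * (100 usec); otherwise it is stepped back down.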
+ */ + +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/atomic.h> +#include <sys/zio.h> +#include <sys/zone.h> +#include <sys/avl.h> +#include <sys/sdt.h> +#include <sys/ddi.h> + +/* + * The zone throttle delays read and write operations from certain zones based + * on each zone's IO utilitzation. Once a cycle (defined by zfs_zone_cycle_time + * below), the delays for each zone are recalculated based on the utilization + * over the previous window. + */ +boolean_t zfs_zone_delay_enable = B_TRUE; /* enable IO throttle */ +uint16_t zfs_zone_delay_step = 5; /* usec amnt to change delay */ +uint16_t zfs_zone_delay_ceiling = 100; /* usec delay max */ + +boolean_t zfs_zone_priority_enable = B_TRUE; /* enable IO priority */ + +/* + * For certain workloads, one zone may be issuing primarily sequential I/O and + * another primarily random I/O. The sequential I/O will complete much more + * quickly than the random I/O, driving the average system latency for those + * operations way down. As a result, the random I/O may be throttled back, even + * though the sequential I/O should be throttled to allow the random I/O more + * access to the disk. + * + * This tunable limits the discrepancy between the read and write system + * latency. If one becomes excessively high, this tunable prevents the I/O + * throttler from exacerbating the imbalance. + */ +uint_t zfs_zone_rw_lat_limit = 10; + +/* + * The I/O throttle will only start delaying zones when it detects disk + * utilization has reached a certain level. This tunable controls the + * threshold at which the throttle will start delaying zones. When the number + * of vdevs is small, the calculation should correspond closely with the %b + * column from iostat -- but as the number of vdevs becomes large, it will + * correlate less and less to any single device (therefore making it a poor + * approximation for the actual I/O utilization on such systems). We + * therefore use our derived utilization conservatively: we know that low + * derived utilization does indeed correlate to low I/O use -- but that a high + * rate of derived utilization does not necesarily alone denote saturation; + * where we see a high rate of utilization, we also look for laggard I/Os to + * attempt to detect saturation. + */ +uint_t zfs_zone_util_threshold = 80; +uint_t zfs_zone_underutil_threshold = 60; + +/* + * There are three important tunables here: zfs_zone_laggard_threshold denotes + * the threshold at which an I/O is considered to be of notably high latency; + * zfs_zone_laggard_recent denotes the number of microseconds before the + * current time after which the last laggard is considered to be sufficiently + * recent to merit increasing the throttle; zfs_zone_laggard_ancient denotes + * the microseconds before the current time before which the last laggard is + * considered to be sufficiently old to merit decreasing the throttle. The + * most important tunable of these three is the zfs_zone_laggard_threshold: in + * modeling data from a large public cloud, this tunable was found to have a + * much greater effect on the throttle than the two time-based thresholds. + * This must be set high enough to not result in spurious throttling, but not + * so high as to allow pathological I/O to persist in the system. 
+ */ +uint_t zfs_zone_laggard_threshold = 50000; /* 50 ms */ +uint_t zfs_zone_laggard_recent = 1000000; /* 1000 ms */ +uint_t zfs_zone_laggard_ancient = 5000000; /* 5000 ms */ + +/* + * Throughout this subsystem, our timestamps are in microseconds. Our system + * average cycle is one second or 1 million microseconds. Our zone counter + * update cycle is two seconds or 2 million microseconds. We use a longer + * duration for that cycle because some ops can see a little over two seconds of + * latency when they are being starved by another zone. + */ +uint_t zfs_zone_sys_avg_cycle = 1000000; /* 1 s */ +uint_t zfs_zone_cycle_time = 2000000; /* 2 s */ + +/* + * How often the I/O throttle will reevaluate each zone's utilization, in + * microseconds. Default is 1/4 sec. + */ +uint_t zfs_zone_adjust_time = 250000; /* 250 ms */ + +typedef struct { + hrtime_t cycle_start; + int cycle_cnt; + hrtime_t cycle_lat; + hrtime_t sys_avg_lat; +} sys_lat_cycle_t; + +typedef struct { + hrtime_t zi_now; + uint_t zi_avgrlat; + uint_t zi_avgwlat; + uint64_t zi_totpri; + uint64_t zi_totutil; + int zi_active; + uint_t zi_diskutil; + boolean_t zi_underutil; + boolean_t zi_overutil; +} zoneio_stats_t; + +static sys_lat_cycle_t rd_lat; +static sys_lat_cycle_t wr_lat; + +/* + * Some basic disk stats to determine disk utilization. The utilization info + * for all disks on the system is aggregated into these values. + * + * Overall disk utilization for the current cycle is calculated as: + * + * ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) + * ---------------------------------------------- + * ((now - zfs_zone_last_checked) * 1000); + */ +kmutex_t zfs_disk_lock; /* protects the following: */ +uint_t zfs_disk_rcnt; /* Number of outstanding IOs */ +hrtime_t zfs_disk_rtime = 0; /* cummulative sum of time performing IO */ +hrtime_t zfs_disk_rlastupdate = 0; /* time last IO dispatched */ + +hrtime_t zfs_disk_last_rtime = 0; /* prev. cycle's zfs_disk_rtime val */ +/* time that we last updated per-zone throttle info */ +hrtime_t zfs_zone_last_checked = 0; +hrtime_t zfs_disk_last_laggard = 0; + +/* + * Data used to keep track of how often txg sync is running. + */ +extern int zfs_txg_timeout; +static uint_t txg_last_check; +static uint_t txg_cnt; +static uint_t txg_sync_rate; + +boolean_t zfs_zone_schedule_enable = B_TRUE; /* enable IO sched. */ +/* + * Threshold for when zio scheduling should kick in. + * + * This threshold is based on the zfs_vdev_sync_read_max_active value for the + * number of I/Os that can be pending on a device. If there are more than the + * max_active ops already queued up, beyond those already issued to the vdev, + * then use zone-based scheduling to get the next synchronous zio. + */ +uint32_t zfs_zone_schedule_thresh = 10; + +/* + * On each pass of the scheduler we increment the zone's weight (up to this + * maximum). The weight is used by the scheduler to prevent starvation so + * that zones which haven't been able to do any IO over many iterations + * will max out thier weight to this value. + */ +#define SCHED_WEIGHT_MAX 20 + +/* + * Tunables for delay throttling when TXG sync is occurring. + * + * If the zone is performing a write and we're doing above normal TXG syncing, + * then throttle for longer than normal. The zone's wait time is multiplied + * by the scale (zfs_zone_txg_throttle_scale). 
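+ * For instance (hypothetical figures): a zone currently being delayed 40 usec
+ * per write syscall would instead wait 40 * zfs_zone_txg_throttle_scale =
+ * 80 usec for as long as TXGs are syncing faster than the normal
+ * zfs_txg_timeout cadence.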
+ */ +int zfs_zone_txg_throttle_scale = 2; +hrtime_t zfs_zone_txg_delay_nsec = MSEC2NSEC(20); + +typedef struct { + int zq_qdepth; + zio_priority_t zq_queue; + int zq_priority; + int zq_wt; + zoneid_t zq_zoneid; +} zone_q_bump_t; + +/* + * This uses gethrtime() but returns a value in usecs. + */ +#define GET_USEC_TIME (gethrtime() / 1000) +#define NANO_TO_MICRO(x) (x / (NANOSEC / MICROSEC)) + +/* + * Keep track of the zone's ZFS IOPs. + * + * See the comment on the zfs_zone_io_throttle function for which/how IOPs are + * accounted for. + * + * If the number of ops is >1 then we can just use that value. However, + * if the number of ops is <2 then we might have a zone which is trying to do + * IO but is not able to get any ops through the system. We don't want to lose + * track of this zone so we factor in its decayed count into the current count. + * + * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed count. + * However, since this calculation is driven by IO activity and since IO does + * not happen at fixed intervals, we use a timestamp to see when the last update + * was made. If it was more than one cycle ago, then we need to decay the + * historical count by the proper number of additional cycles in which no IO was + * performed. + * + * Return a time delta indicating how far into the current cycle we are or 0 + * if the last IO was more than a cycle ago. + */ +static hrtime_t +compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp) +{ + hrtime_t delta; + int gen_cnt; + + /* + * Check if its time to recompute a new zone count. + * If we're still collecting data for the current cycle, return false. + */ + delta = unow - cp->cycle_start; + if (delta < zfs_zone_cycle_time) + return (delta); + + /* A previous cycle is past, compute the new zone count. */ + + /* + * Figure out how many generations we have to decay the historical + * count, since multiple cycles may have elapsed since our last IO. + * We depend on int rounding here. + */ + gen_cnt = (int)(delta / zfs_zone_cycle_time); + + /* If more than 5 cycles since last the IO, reset count. */ + if (gen_cnt > 5) { + cp->zone_avg_cnt = 0; + } else { + /* Update the count. */ + int i; + + /* + * If the zone did more than 1 IO, just use its current count + * as the historical value, otherwise decay the historical + * count and factor that into the new historical count. We + * pick a threshold > 1 so that we don't lose track of IO due + * to int rounding. + */ + if (cp->cycle_cnt > 1) + cp->zone_avg_cnt = cp->cycle_cnt; + else + cp->zone_avg_cnt = cp->cycle_cnt + + (cp->zone_avg_cnt / 2); + + /* + * If more than one generation has elapsed since the last + * update, decay the values further. + */ + for (i = 1; i < gen_cnt; i++) + cp->zone_avg_cnt = cp->zone_avg_cnt / 2; + } + + /* A new cycle begins. */ + cp->cycle_start = unow; + cp->cycle_cnt = 0; + + return (0); +} + +/* + * Add IO op data to the zone. + */ +static void +add_zone_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op) +{ + switch (op) { + case ZFS_ZONE_IOP_READ: + (void) compute_historical_zone_cnt(unow, &zonep->zone_rd_ops); + zonep->zone_rd_ops.cycle_cnt++; + break; + case ZFS_ZONE_IOP_WRITE: + (void) compute_historical_zone_cnt(unow, &zonep->zone_wr_ops); + zonep->zone_wr_ops.cycle_cnt++; + break; + case ZFS_ZONE_IOP_LOGICAL_WRITE: + (void) compute_historical_zone_cnt(unow, &zonep->zone_lwr_ops); + zonep->zone_lwr_ops.cycle_cnt++; + break; + } +} + +/* + * Use a decaying average to keep track of the overall system latency. 
+ * + * We want to have the recent activity heavily weighted, but if the + * activity decreases or stops, then the average should quickly decay + * down to the new value. + * + * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed average. + * However, since this calculation is driven by IO activity and since IO does + * not happen at fixed intervals, we use a timestamp to see when the last + * update was made. If it was more than one cycle ago, then we need to decay + * the average by the proper number of additional cycles in which no IO was + * performed. + * + * Return true if we actually computed a new system average. + * If we're still within an active cycle there is nothing to do, return false. + */ +static boolean_t +compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp) +{ + hrtime_t delta; + int gen_cnt; + + /* + * Check if its time to recompute a new average. + * If we're still collecting data for the current cycle, return false. + */ + delta = unow - cp->cycle_start; + if (delta < zfs_zone_sys_avg_cycle) + return (B_FALSE); + + /* A previous cycle is past, compute a new system average. */ + + /* + * Figure out how many generations we have to decay, since multiple + * cycles may have elapsed since our last IO. + * We count on int rounding here. + */ + gen_cnt = (int)(delta / zfs_zone_sys_avg_cycle); + + /* If more than 5 cycles since last the IO, reset average. */ + if (gen_cnt > 5) { + cp->sys_avg_lat = 0; + } else { + /* Update the average. */ + int i; + + cp->sys_avg_lat = + (cp->sys_avg_lat + cp->cycle_lat) / (1 + cp->cycle_cnt); + + /* + * If more than one generation has elapsed since the last + * update, decay the values further. + */ + for (i = 1; i < gen_cnt; i++) + cp->sys_avg_lat = cp->sys_avg_lat / 2; + } + + /* A new cycle begins. */ + cp->cycle_start = unow; + cp->cycle_cnt = 0; + cp->cycle_lat = 0; + + return (B_TRUE); +} + +static void +add_sys_iop(hrtime_t unow, int op, int lat) +{ + switch (op) { + case ZFS_ZONE_IOP_READ: + (void) compute_new_sys_avg(unow, &rd_lat); + rd_lat.cycle_cnt++; + rd_lat.cycle_lat += lat; + break; + case ZFS_ZONE_IOP_WRITE: + (void) compute_new_sys_avg(unow, &wr_lat); + wr_lat.cycle_cnt++; + wr_lat.cycle_lat += lat; + break; + } +} + +/* + * Get the zone IO counts. + */ +static uint_t +calc_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp) +{ + hrtime_t delta; + uint_t cnt; + + if ((delta = compute_historical_zone_cnt(unow, cp)) == 0) { + /* + * No activity in the current cycle, we already have the + * historical data so we'll use that. + */ + cnt = cp->zone_avg_cnt; + } else { + /* + * If we're less than half way through the cycle then use + * the current count plus half the historical count, otherwise + * just use the current count. + */ + if (delta < (zfs_zone_cycle_time / 2)) + cnt = cp->cycle_cnt + (cp->zone_avg_cnt / 2); + else + cnt = cp->cycle_cnt; + } + + return (cnt); +} + +/* + * Get the average read/write latency in usecs for the system. + */ +static uint_t +calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp) +{ + if (compute_new_sys_avg(unow, cp)) { + /* + * No activity in the current cycle, we already have the + * historical data so we'll use that. + */ + return (cp->sys_avg_lat); + } else { + /* + * We're within a cycle; weight the current activity higher + * compared to the historical data and use that. 
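+ * For example (hypothetical numbers): with a stale average of 4000 usec and
+ * 10 ops totalling 20000 usec of latency so far in this cycle, the weighted
+ * result is (4000 + 20000 * 8) / (1 + 10 * 8) = ~2024 usec, so in-cycle
+ * activity quickly dominates the old average.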
+ */ + DTRACE_PROBE3(zfs__zone__calc__wt__avg, + uintptr_t, cp->sys_avg_lat, + uintptr_t, cp->cycle_lat, + uintptr_t, cp->cycle_cnt); + + return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) / + (1 + (cp->cycle_cnt * 8))); + } +} + +/* + * Account for the current IOP on the zone and for the system as a whole. + * The latency parameter is in usecs. + */ +static void +add_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat) +{ + /* Add op to zone */ + add_zone_iop(zonep, unow, op); + + /* Track system latency */ + if (op != ZFS_ZONE_IOP_LOGICAL_WRITE) + add_sys_iop(unow, op, lat); +} + +/* + * Calculate and return the total number of read ops, write ops and logical + * write ops for the given zone. If the zone has issued operations of any type + * return a non-zero value, otherwise return 0. + */ +static int +get_zone_io_cnt(hrtime_t unow, zone_t *zonep, uint_t *rops, uint_t *wops, + uint_t *lwops) +{ + *rops = calc_zone_cnt(unow, &zonep->zone_rd_ops); + *wops = calc_zone_cnt(unow, &zonep->zone_wr_ops); + *lwops = calc_zone_cnt(unow, &zonep->zone_lwr_ops); + + DTRACE_PROBE4(zfs__zone__io__cnt, uintptr_t, zonep->zone_id, + uintptr_t, *rops, uintptr_t, *wops, uintptr_t, *lwops); + + return (*rops | *wops | *lwops); +} + +/* + * Get the average read/write latency in usecs for the system. + */ +static void +get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat) +{ + *rlat = calc_avg_lat(unow, &rd_lat); + *wlat = calc_avg_lat(unow, &wr_lat); + + /* + * In an attempt to improve the accuracy of the throttling algorithm, + * assume that IO operations can't have zero latency. Instead, assume + * a reasonable lower bound for each operation type. If the actual + * observed latencies are non-zero, use those latency values instead. + */ + if (*rlat == 0) + *rlat = 1000; + if (*wlat == 0) + *wlat = 1000; + + DTRACE_PROBE2(zfs__zone__sys__avg__lat, uintptr_t, *rlat, + uintptr_t, *wlat); +} + +/* + * Find disk utilization for each zone and average utilization for all active + * zones. + */ +static int +zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg) +{ + zoneio_stats_t *sp = arg; + uint_t rops, wops, lwops; + + if (zonep->zone_id == GLOBAL_ZONEID || + get_zone_io_cnt(sp->zi_now, zonep, &rops, &wops, &lwops) == 0) { + zonep->zone_io_util = 0; + return (0); + } + + zonep->zone_io_util = (rops * sp->zi_avgrlat) + + (wops * sp->zi_avgwlat) + (lwops * sp->zi_avgwlat); + sp->zi_totutil += zonep->zone_io_util; + + if (zonep->zone_io_util > 0) { + sp->zi_active++; + sp->zi_totpri += zonep->zone_zfs_io_pri; + } + + /* + * sdt:::zfs-zone-utilization + * + * arg0: zone ID + * arg1: read operations observed during time window + * arg2: physical write operations observed during time window + * arg3: logical write ops observed during time window + * arg4: calculated utilization given read and write ops + * arg5: I/O priority assigned to this zone + */ + DTRACE_PROBE6(zfs__zone__utilization, uint_t, zonep->zone_id, + uint_t, rops, uint_t, wops, uint_t, lwops, + uint_t, zonep->zone_io_util, uint_t, zonep->zone_zfs_io_pri); + + return (0); +} + +static void +zfs_zone_delay_inc(zone_t *zonep) +{ + if (zonep->zone_io_delay < zfs_zone_delay_ceiling) + zonep->zone_io_delay += zfs_zone_delay_step; +} + +static void +zfs_zone_delay_dec(zone_t *zonep) +{ + if (zonep->zone_io_delay > 0) + zonep->zone_io_delay -= zfs_zone_delay_step; +} + +/* + * For all zones "far enough" away from the average utilization, increase that + * zones delay. Otherwise, reduce its delay. 
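+ *
+ * To illustrate the fair-share arithmetic (numbers are hypothetical): with two
+ * active zones at I/O priorities 100 and 50 and a combined utilization of
+ * 3,000,000, the fair shares work out to (3,000,000 * 100) / 150 = 2,000,000
+ * and 1,000,000 respectively. A zone above its share while the disk is
+ * overutilized has zfs_zone_delay_step added to its delay; a zone below its
+ * share, or any zone when the disk is underutilized, has it subtracted.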
+ */ +static int +zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg) +{ + zoneio_stats_t *sp = arg; + uint16_t delay = zonep->zone_io_delay; + uint_t fairutil = 0; + + zonep->zone_io_util_above_avg = B_FALSE; + + /* + * Given the calculated total utilitzation for all zones, calculate the + * fair share of I/O for this zone. + */ + if (zfs_zone_priority_enable && sp->zi_totpri > 0) { + fairutil = (sp->zi_totutil * zonep->zone_zfs_io_pri) / + sp->zi_totpri; + } else if (sp->zi_active > 0) { + fairutil = sp->zi_totutil / sp->zi_active; + } + + /* + * Adjust each IO's delay. If the overall delay becomes too high, avoid + * increasing beyond the ceiling value. + */ + if (zonep->zone_io_util > fairutil && sp->zi_overutil) { + zonep->zone_io_util_above_avg = B_TRUE; + + if (sp->zi_active > 1) + zfs_zone_delay_inc(zonep); + } else if (zonep->zone_io_util < fairutil || sp->zi_underutil || + sp->zi_active <= 1) { + zfs_zone_delay_dec(zonep); + } + + /* + * sdt:::zfs-zone-throttle + * + * arg0: zone ID + * arg1: old delay for this zone + * arg2: new delay for this zone + * arg3: calculated fair I/O utilization + * arg4: actual I/O utilization + */ + DTRACE_PROBE5(zfs__zone__throttle, uintptr_t, zonep->zone_id, + uintptr_t, delay, uintptr_t, zonep->zone_io_delay, + uintptr_t, fairutil, uintptr_t, zonep->zone_io_util); + + return (0); +} + +/* + * Examine the utilization between different zones, and adjust the delay for + * each zone appropriately. + */ +static void +zfs_zone_wait_adjust(hrtime_t unow, hrtime_t last_checked) +{ + zoneio_stats_t stats; + hrtime_t laggard_udelta = 0; + + (void) bzero(&stats, sizeof (stats)); + + stats.zi_now = unow; + get_sys_avg_lat(unow, &stats.zi_avgrlat, &stats.zi_avgwlat); + + if (stats.zi_avgrlat > stats.zi_avgwlat * zfs_zone_rw_lat_limit) + stats.zi_avgrlat = stats.zi_avgwlat * zfs_zone_rw_lat_limit; + else if (stats.zi_avgrlat * zfs_zone_rw_lat_limit < stats.zi_avgwlat) + stats.zi_avgwlat = stats.zi_avgrlat * zfs_zone_rw_lat_limit; + + if (zone_walk(zfs_zone_wait_adjust_calculate_cb, &stats) != 0) + return; + + /* + * Calculate disk utilization for the most recent period. + */ + if (zfs_disk_last_rtime == 0 || unow - last_checked <= 0) { + stats.zi_diskutil = 0; + } else { + stats.zi_diskutil = + ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) / + ((unow - last_checked) * 1000); + } + zfs_disk_last_rtime = zfs_disk_rtime; + + if (unow > zfs_disk_last_laggard) + laggard_udelta = unow - zfs_disk_last_laggard; + + /* + * To minimize porpoising, we have three separate states for our + * assessment of I/O performance: overutilized, underutilized, and + * neither overutilized nor underutilized. We will increment the + * throttle if a zone is using more than its fair share _and_ I/O + * is overutilized; we will decrement the throttle if a zone is using + * less than its fair share _or_ I/O is underutilized. 
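+ *
+ * Concretely (hypothetical figures against the default tunables): a derived
+ * disk utilization of 90% with a laggard I/O seen within the last second is
+ * treated as overutilized; a utilization of 40%, or no laggard for more than
+ * five seconds, is treated as underutilized; in between, delays are only ever
+ * relaxed, and only for zones already below their fair share.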
+ */ + stats.zi_underutil = stats.zi_diskutil < zfs_zone_underutil_threshold || + laggard_udelta > zfs_zone_laggard_ancient; + + stats.zi_overutil = stats.zi_diskutil > zfs_zone_util_threshold && + laggard_udelta < zfs_zone_laggard_recent; + + /* + * sdt:::zfs-zone-stats + * + * Statistics observed over the last period: + * + * arg0: average system read latency + * arg1: average system write latency + * arg2: number of active zones + * arg3: total I/O 'utilization' for all zones + * arg4: total I/O priority of all active zones + * arg5: calculated disk utilization + */ + DTRACE_PROBE6(zfs__zone__stats, uintptr_t, stats.zi_avgrlat, + uintptr_t, stats.zi_avgwlat, uintptr_t, stats.zi_active, + uintptr_t, stats.zi_totutil, uintptr_t, stats.zi_totpri, + uintptr_t, stats.zi_diskutil); + + (void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats); +} + +/* + * Callback used to calculate a zone's IO schedule priority. + * + * We scan the zones looking for ones with ops in the queue. Out of those, + * we pick the one that calculates to the highest schedule priority. + */ +static int +get_sched_pri_cb(zone_t *zonep, void *arg) +{ + int pri; + uint_t cnt; + zone_q_bump_t *qbp = arg; + zio_priority_t p = qbp->zq_queue; + + cnt = zonep->zone_zfs_queued[p]; + if (cnt == 0) { + zonep->zone_zfs_weight = 0; + return (0); + } + + /* + * On each pass, increment the zone's weight. We use this as input + * to the calculation to prevent starvation. The value is reset + * each time we issue an IO for this zone so zones which haven't + * done any IO over several iterations will see their weight max + * out. + */ + if (zonep->zone_zfs_weight < SCHED_WEIGHT_MAX) + zonep->zone_zfs_weight++; + + /* + * This zone's IO priority is the inverse of the number of IOs + * the zone has enqueued * zone's configured priority * weight. + * The queue depth has already been scaled by 10 to avoid problems + * with int rounding. + * + * This means that zones with fewer IOs in the queue will get + * preference unless other zone's assigned priority pulls them + * ahead. The weight is factored in to help ensure that zones + * which haven't done IO in a while aren't getting starved. + */ + pri = (qbp->zq_qdepth / cnt) * + zonep->zone_zfs_io_pri * zonep->zone_zfs_weight; + + /* + * If this zone has a higher priority than what we found so far, + * it becomes the new leading contender. + */ + if (pri > qbp->zq_priority) { + qbp->zq_zoneid = zonep->zone_id; + qbp->zq_priority = pri; + qbp->zq_wt = zonep->zone_zfs_weight; + } + return (0); +} + +/* + * See if we need to bump a zone's zio to the head of the queue. This is only + * done on the two synchronous I/O queues (see the block comment on the + * zfs_zone_schedule function). We get the correct vdev_queue_class_t and + * queue depth from our caller. + * + * For single-threaded synchronous processes a zone cannot get more than + * 1 op into the queue at a time unless the zone is running multiple processes + * in parallel. This can cause an imbalance in performance if there are zones + * with many parallel processes (and ops in the queue) vs. other zones which + * are doing simple single-threaded processes, such as interactive tasks in the + * shell. These zones can get backed up behind a deep queue and their IO + * performance will appear to be very poor as a result. This can make the + * zone work badly for interactive behavior. + * + * The scheduling algorithm kicks in once we start to get a deeper queue. 
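+ * (As a hypothetical example of the weighting done in get_sched_pri_cb: with
+ * 40 I/Os queued -- a scaled depth of 400 -- a zone holding 2 of them at
+ * configured priority 100 and weight 3 scores (400 / 2) * 100 * 3 = 60,000,
+ * while a zone holding 20 at the same priority and weight 1 scores
+ * (400 / 20) * 100 * 1 = 2,000, so the lightly queued zone is bumped first.)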
+ * Once that occurs, we look at all of the zones to see which one calculates + * to the highest priority. We bump that zone's first zio to the head of the + * queue. + * + * We use a counter on the zone so that we can quickly find how many ops each + * zone has in the queue without having to search the entire queue itself. + * This scales better since the number of zones is expected to be on the + * order of 10-100 whereas the queue depth can be in the range of 50-2000. + * In addition, since the zio's in the queue only have the zoneid, we would + * have to look up the zone for each zio enqueued and that means the overhead + * for scanning the queue each time would be much higher. + * + * In all cases, we fall back to simply pulling the next op off the queue + * if something should go wrong. + */ +static zio_t * +get_next_zio(vdev_queue_class_t *vqc, int qdepth, zio_priority_t p, + avl_tree_t *tree) +{ + zone_q_bump_t qbump; + zio_t *zp = NULL, *zphead; + int cnt = 0; + + /* To avoid problems with int rounding, scale the queue depth by 10 */ + qbump.zq_qdepth = qdepth * 10; + qbump.zq_priority = 0; + qbump.zq_zoneid = 0; + qbump.zq_queue = p; + (void) zone_walk(get_sched_pri_cb, &qbump); + + zphead = avl_first(tree); + + /* Check if the scheduler didn't pick a zone for some reason!? */ + if (qbump.zq_zoneid != 0) { + for (zp = avl_first(tree); zp != NULL; + zp = avl_walk(tree, zp, AVL_AFTER)) { + if (zp->io_zoneid == qbump.zq_zoneid) + break; + cnt++; + } + } + + if (zp == NULL) { + zp = zphead; + } else if (zp != zphead) { + /* + * Only fire the probe if we actually picked a different zio + * than the one already at the head of the queue. + */ + DTRACE_PROBE4(zfs__zone__sched__bump, uint_t, zp->io_zoneid, + uint_t, cnt, int, qbump.zq_priority, int, qbump.zq_wt); + } + + return (zp); +} + +/* + * Add our zone ID to the zio so we can keep track of which zones are doing + * what, even when the current thread processing the zio is not associated + * with the zone (e.g. the kernel taskq which pushes out TX groups). + */ +void +zfs_zone_zio_init(zio_t *zp) +{ + zone_t *zonep = curzone; + + zp->io_zoneid = zonep->zone_id; +} + +/* + * Track and throttle IO operations per zone. Called from: + * - dmu_tx_count_write for (logical) write ops (both dataset and zvol writes + * go through this path) + * - arc_read for read ops that miss the ARC (both dataset and zvol) + * For each operation, increment that zone's counter based on the type of + * operation, then delay the operation, if necessary. + * + * There are three basic ways that we can see write ops: + * 1) An application does write syscalls. Those ops go into a TXG which + * we'll count here. Sometime later a kernel taskq thread (we'll see the + * vdev IO as zone 0) will perform some number of physical writes to commit + * the TXG to disk. Those writes are not associated with the zone which + * made the write syscalls and the number of operations is not correlated + * between the taskq and the zone. We only see logical writes in this + * function, we see the physcial writes in the zfs_zone_zio_start and + * zfs_zone_zio_done functions. + * 2) An application opens a file with O_SYNC. Each write will result in + * an operation which we'll see here plus a low-level vdev write from + * that zone. + * 3) An application does write syscalls followed by an fsync(). We'll + * count the writes going into a TXG here. 
We'll also see some number + * (usually much smaller, maybe only 1) of low-level vdev writes from this + * zone when the fsync is performed, plus some other low-level vdev writes + * from the taskq in zone 0 (are these metadata writes?). + * + * 4) In addition to the above, there are misc. system-level writes, such as + * writing out dirty pages to swap, or sync(2) calls, which will be handled + * by the global zone and which we count but don't generally worry about. + * + * Because of the above, we can see writes twice; first because this function + * is always called by a zone thread for logical writes, but then we also will + * count the physical writes that are performed at a low level via + * zfs_zone_zio_start. Without this, it can look like a non-global zone never + * writes (case 1). Depending on when the TXG is synced, the counts may be in + * the same sample bucket or in a different one. + * + * Tracking read operations is simpler due to their synchronous semantics. The + * zfs_read function -- called as a result of a read(2) syscall -- will always + * retrieve the data to be read through arc_read and we only come into this + * function when we have an arc miss. + */ +void +zfs_zone_io_throttle(zfs_zone_iop_type_t type) +{ + zone_t *zonep = curzone; + hrtime_t unow, last_checked; + uint16_t wait; + + unow = GET_USEC_TIME; + + /* + * Only bump the counter for logical writes here. The counters for + * tracking physical IO operations are handled in zfs_zone_zio_done. + */ + if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) { + mutex_enter(&zonep->zone_stg_io_lock); + add_iop(zonep, unow, type, 0); + mutex_exit(&zonep->zone_stg_io_lock); + } + + if (!zfs_zone_delay_enable) + return; + + /* + * If the zone's I/O priority is set to zero, don't throttle that zone's + * operations at all. + */ + if (zonep->zone_zfs_io_pri == 0) + return; + + /* + * XXX There's a potential race here in that more than one thread may + * update the zone delays concurrently. The worst outcome is corruption + * of our data to track each zone's IO, so the algorithm may make + * incorrect throttling decisions until the data is refreshed. + */ + last_checked = zfs_zone_last_checked; + if ((unow - last_checked) > zfs_zone_adjust_time) { + zfs_zone_last_checked = unow; + zfs_zone_wait_adjust(unow, last_checked); + } + + if ((wait = zonep->zone_io_delay) > 0) { + /* + * If this is a write and we're doing above normal TXG + * syncing, then throttle for longer than normal. + */ + if (type == ZFS_ZONE_IOP_LOGICAL_WRITE && + (txg_cnt > 1 || txg_sync_rate > 1)) + wait *= zfs_zone_txg_throttle_scale; + + /* + * sdt:::zfs-zone-wait + * + * arg0: zone ID + * arg1: type of IO operation + * arg2: time to delay (in us) + */ + DTRACE_PROBE3(zfs__zone__wait, uintptr_t, zonep->zone_id, + uintptr_t, type, uintptr_t, wait); + + drv_usecwait(wait); + + if (zonep->zone_vfs_stats != NULL) { + atomic_inc_64(&zonep->zone_vfs_stats-> + zv_delay_cnt.value.ui64); + atomic_add_64(&zonep->zone_vfs_stats-> + zv_delay_time.value.ui64, wait); + } + } +} + +/* + * XXX Ignore the pool pointer parameter for now. + * + * Keep track to see if the TXG sync rate is running above the expected rate. + * If so, this implies that we are filling TXG's at a high rate due to a heavy + * write workload. We use this as input into the zone throttle. + * + * This function is called every 5 seconds (zfs_txg_timeout) under a normal + * write load. In this case, the sync rate is going to be 1. 
When there + * is a heavy write load, TXG's fill up fast and the sync thread will write + * the TXG more frequently (perhaps once a second). In this case the rate + * will be > 1. The sync rate is a lagging indicator since it can be up + * to 5 seconds old. We use the txg_cnt to keep track of the rate in the + * current 5 second interval and txg_sync_rate to keep track of the previous + * 5 second interval. In that way we don't have a period (1 or more seconds) + * where the txg_cnt == 0 and we cut back on throttling even though the rate + * is still high. + */ +/*ARGSUSED*/ +void +zfs_zone_report_txg_sync(void *dp) +{ + uint_t now; + + txg_cnt++; + now = (uint_t)(gethrtime() / NANOSEC); + if ((now - txg_last_check) >= zfs_txg_timeout) { + txg_sync_rate = txg_cnt / 2; + txg_cnt = 0; + txg_last_check = now; + } +} + +hrtime_t +zfs_zone_txg_delay() +{ + if (curzone->zone_io_util_above_avg) + return (zfs_zone_txg_delay_nsec); + + return (MSEC2NSEC(10)); +} + +/* + * Called from vdev_disk_io_start when an IO hits the end of the zio pipeline + * and is issued. + * Keep track of start time for latency calculation in zfs_zone_zio_done. + */ +void +zfs_zone_zio_start(zio_t *zp) +{ + zone_t *zonep; + + /* + * I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not for + * an actual I/O operation. Ignore those operations as they relate to + * throttling and scheduling. + */ + if (zp->io_type == ZIO_TYPE_IOCTL) + return; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_zfs_lock); + if (zp->io_type == ZIO_TYPE_READ) + kstat_runq_enter(&zonep->zone_zfs_rwstats); + zonep->zone_zfs_weight = 0; + mutex_exit(&zonep->zone_zfs_lock); + + mutex_enter(&zfs_disk_lock); + zp->io_dispatched = gethrtime(); + + if (zfs_disk_rcnt++ != 0) + zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate); + zfs_disk_rlastupdate = zp->io_dispatched; + mutex_exit(&zfs_disk_lock); + + zone_rele(zonep); +} + +/* + * Called from vdev_disk_io_done when an IO completes. + * Increment our counter for zone ops. + * Calculate the IO latency avg. for this zone. + */ +void +zfs_zone_zio_done(zio_t *zp) +{ + zone_t *zonep; + hrtime_t now, unow, udelta; + + if (zp->io_type == ZIO_TYPE_IOCTL) + return; + + if (zp->io_dispatched == 0) + return; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + now = gethrtime(); + unow = NANO_TO_MICRO(now); + udelta = unow - NANO_TO_MICRO(zp->io_dispatched); + + mutex_enter(&zonep->zone_zfs_lock); + + /* + * To calculate the wsvc_t average, keep a cumulative sum of all the + * wait time before each I/O was dispatched. Since most writes are + * asynchronous, only track the wait time for read I/Os. + */ + if (zp->io_type == ZIO_TYPE_READ) { + zonep->zone_zfs_rwstats.reads++; + zonep->zone_zfs_rwstats.nread += zp->io_size; + + zonep->zone_zfs_stats->zz_waittime.value.ui64 += + zp->io_dispatched - zp->io_timestamp; + + kstat_runq_exit(&zonep->zone_zfs_rwstats); + } else { + zonep->zone_zfs_rwstats.writes++; + zonep->zone_zfs_rwstats.nwritten += zp->io_size; + } + + mutex_exit(&zonep->zone_zfs_lock); + + mutex_enter(&zfs_disk_lock); + zfs_disk_rcnt--; + zfs_disk_rtime += (now - zfs_disk_rlastupdate); + zfs_disk_rlastupdate = now; + + if (udelta > zfs_zone_laggard_threshold) + zfs_disk_last_laggard = unow; + + mutex_exit(&zfs_disk_lock); + + if (zfs_zone_delay_enable) { + mutex_enter(&zonep->zone_stg_io_lock); + add_iop(zonep, unow, zp->io_type == ZIO_TYPE_READ ? 
+ ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta); + mutex_exit(&zonep->zone_stg_io_lock); + } + + zone_rele(zonep); + + /* + * sdt:::zfs-zone-latency + * + * arg0: zone ID + * arg1: type of I/O operation + * arg2: I/O latency (in us) + */ + DTRACE_PROBE3(zfs__zone__latency, uintptr_t, zp->io_zoneid, + uintptr_t, zp->io_type, uintptr_t, udelta); +} + +void +zfs_zone_zio_dequeue(zio_t *zp) +{ + zio_priority_t p; + zone_t *zonep; + + p = zp->io_priority; + if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) + return; + + /* We depend on p being defined as either 0 or 1 */ + ASSERT(p < 2); + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_stg_io_lock); + ASSERT(zonep->zone_zfs_queued[p] > 0); + if (zonep->zone_zfs_queued[p] == 0) + cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0"); + else + zonep->zone_zfs_queued[p]--; + mutex_exit(&zonep->zone_stg_io_lock); + zone_rele(zonep); +} + +void +zfs_zone_zio_enqueue(zio_t *zp) +{ + zio_priority_t p; + zone_t *zonep; + + p = zp->io_priority; + if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) + return; + + /* We depend on p being defined as either 0 or 1 */ + ASSERT(p < 2); + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_stg_io_lock); + zonep->zone_zfs_queued[p]++; + mutex_exit(&zonep->zone_stg_io_lock); + zone_rele(zonep); +} + +/* + * Called from vdev_queue_io_to_issue. That function is where zio's are listed + * in FIFO order on one of the sync queues, then pulled off (by + * vdev_queue_io_remove) and issued. We potentially do zone-based scheduling + * here to find a zone's zio deeper in the sync queue and issue that instead + * of simply doing FIFO. + * + * We only do zone-based zio scheduling for the two synchronous I/O queues + * (read & write). These queues are normally serviced in FIFO order but we + * may decide to move a zone's zio to the head of the line. A typical I/O + * load will be mostly synchronous reads and some asynchronous writes (which + * are scheduled differently due to transaction groups). There will also be + * some synchronous writes for those apps which want to ensure their data is on + * disk. We want to make sure that a zone with a single-threaded app (e.g. the + * shell) that is doing synchronous I/O (typically reads) isn't penalized by + * other zones which are doing lots of synchronous I/O because they have many + * running threads. + * + * The vq->vq_lock mutex is held when we're executing this function so we + * can safely access the "last zone" variable on the queue. + */ +zio_t * +zfs_zone_schedule(vdev_queue_t *vq, zio_priority_t p, avl_index_t idx, + avl_tree_t *tree) +{ + vdev_queue_class_t *vqc = &vq->vq_class[p]; + uint_t cnt; + zoneid_t last_zone; + zio_t *zio; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + + /* Don't change the order on the LBA ordered queues. */ + if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) + return (avl_nearest(tree, idx, AVL_AFTER)); + + /* We depend on p being defined as either 0 or 1 */ + ASSERT(p < 2); + + cnt = avl_numnodes(tree); + last_zone = vq->vq_last_zone_id; + + /* + * If there are only a few zios in the queue then just issue the head. + * If there are more than a few zios already queued up, then use + * scheduling to get the next zio. 
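+ * ("A few" means fewer than zfs_zone_schedule_thresh entries, 10 by default to
+ * track the sync queue's max_active limit; below that depth plain FIFO order
+ * is assumed to be fair enough on its own.)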
+ */ + if (!zfs_zone_schedule_enable || cnt < zfs_zone_schedule_thresh) + zio = avl_nearest(tree, idx, AVL_AFTER); + else + zio = get_next_zio(vqc, cnt, p, tree); + + vq->vq_last_zone_id = zio->io_zoneid; + + /* + * Probe with 4 args; the number of IOs in the queue, the zone that + * was last scheduled off this queue, the zone that was associated + * with the next IO that is scheduled, and which queue (priority). + */ + DTRACE_PROBE4(zfs__zone__sched, uint_t, cnt, uint_t, last_zone, + uint_t, zio->io_zoneid, uint_t, p); + + return (zio); +} + +#endif diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c index 38b57e0123..f9f93999db 100644 --- a/usr/src/uts/common/fs/zfs/zil.c +++ b/usr/src/uts/common/fs/zfs/zil.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -1801,9 +1802,18 @@ zil_close(zilog_t *zilog) if (lwb != NULL) txg = lwb->lwb_max_txg; mutex_exit(&zilog->zl_lock); - if (txg) + + if (zilog_is_dirty(zilog)) { + /* + * If we're dirty, always wait for the current transaction -- + * our lwb_max_txg may be in the past. + */ + txg_wait_synced(zilog->zl_dmu_pool, 0); + } else if (txg) { txg_wait_synced(zilog->zl_dmu_pool, txg); - ASSERT(!zilog_is_dirty(zilog)); + } + + VERIFY(!zilog_is_dirty(zilog)); taskq_destroy(zilog->zl_clean_taskq); zilog->zl_clean_taskq = NULL; diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index aa0f2945dd..2fa3213be0 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ #include <sys/sysmacros.h> @@ -39,6 +40,7 @@ #include <sys/ddt.h> #include <sys/blkptr.h> #include <sys/zfeature.h> +#include <sys/zfs_zone.h> /* * ========================================================================== @@ -557,11 +559,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_bookmark = *zb; if (pio != NULL) { + zio->io_zoneid = pio->io_zoneid; if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child(pio, zio); + } else { + zfs_zone_zio_init(zio); } return (zio); diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c index 80888103fe..f681b1dc65 100644 --- a/usr/src/uts/common/fs/zfs/zvol.c +++ b/usr/src/uts/common/fs/zfs/zvol.c @@ -25,7 +25,7 @@ * * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
*/ /* @@ -83,6 +83,7 @@ #include <sys/zvol.h> #include <sys/dumphdr.h> #include <sys/zil_impl.h> +#include <sys/sdt.h> #include <sys/dbuf.h> #include <sys/dmu_tx.h> #include <sys/zfeature.h> @@ -137,6 +138,11 @@ typedef struct zvol_state { #define ZVOL_EXCL 0x4 #define ZVOL_WCE 0x8 +#define VOP_LATENCY_10MS 10000000 +#define VOP_LATENCY_100MS 100000000 +#define VOP_LATENCY_1S 1000000000 +#define VOP_LATENCY_10S 10000000000 + /* * zvol maximum transfer in one DMU tx. */ @@ -1378,6 +1384,9 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) uint64_t volsize; rl_t *rl; int error = 0; + zone_t *zonep = curzone; + uint64_t tot_bytes; + hrtime_t start, lat; zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); if (zv == NULL) @@ -1394,6 +1403,14 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) return (error); } + DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 0); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_runq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + start = gethrtime(); + tot_bytes = 0; + rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, RL_READER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { @@ -1403,6 +1420,7 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) if (bytes > volsize - uio->uio_loffset) bytes = volsize - uio->uio_loffset; + tot_bytes += bytes; error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes); if (error) { /* convert checksum errors into IO errors */ @@ -1412,6 +1430,39 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) } } zfs_range_unlock(rl); + + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.reads++; + zonep->zone_vfs_rwstats.nread += tot_bytes; + kstat_runq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + zone_vfs_kstat_t *zvp; + + zvp = zonep->zone_vfs_stats; + if (lat < VOP_LATENCY_100MS) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + + DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 0, int, + error); + return (error); } @@ -1425,6 +1476,9 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) rl_t *rl; int error = 0; boolean_t sync; + zone_t *zonep = curzone; + uint64_t tot_bytes; + hrtime_t start, lat; zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); if (zv == NULL) @@ -1441,6 +1495,19 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) return (error); } + DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 1); + + /* + * For the purposes of VFS kstat consumers, the "waitq" calculation is + * repurposed as the active queue for zvol write operations. There's no + * actual wait queue for zvol operations. 
+ */ + mutex_enter(&zonep->zone_vfs_lock); + kstat_waitq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + start = gethrtime(); + tot_bytes = 0; + sync = !(zv->zv_flags & ZVOL_WCE) || (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); @@ -1454,6 +1521,7 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) if (bytes > volsize - off) /* don't write past the end */ bytes = volsize - off; + tot_bytes += bytes; dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { @@ -1471,6 +1539,39 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) zfs_range_unlock(rl); if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); + + DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 1, int, + error); + + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.writes++; + zonep->zone_vfs_rwstats.nwritten += tot_bytes; + kstat_waitq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + zone_vfs_kstat_t *zvp; + + zvp = zonep->zone_vfs_stats; + if (lat < VOP_LATENCY_100MS) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + return (error); } |
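The rest of this page adds a few standalone userland sketches of mechanisms introduced in the hunks above; they are illustrations only, any name not present in the diff is a placeholder, and none of them is the kernel implementation. The first mirrors the TXG sync-rate window kept by zfs_zone_report_txg_sync(): a per-window count is folded into txg_sync_rate every zfs_txg_timeout seconds, so the throttle always has a (lagging) figure for the previous interval even while the current count is still small. Here a hard-coded 5-second window stands in for zfs_txg_timeout and time(2) stands in for gethrtime():

/*
 * Userland sketch of the TXG sync-rate window kept above.  Illustration
 * only: TXG_TIMEOUT stands in for zfs_txg_timeout, time(2) stands in for
 * gethrtime(), and there is no locking.
 */
#include <stdio.h>
#include <time.h>

#define	TXG_TIMEOUT	5	/* seconds per observation window */

static unsigned int txg_cnt;		/* syncs seen in the current window */
static unsigned int txg_sync_rate;	/* figure kept for the previous window */
static time_t txg_last_check;

/* Called once per TXG sync, as zfs_zone_report_txg_sync() is. */
static void
report_txg_sync(time_t now)
{
	txg_cnt++;
	if (now - txg_last_check >= TXG_TIMEOUT) {
		/* Mirror the kernel's txg_cnt / 2 for the completed window. */
		txg_sync_rate = txg_cnt / 2;
		txg_cnt = 0;
		txg_last_check = now;
	}
}

int
main(void)
{
	time_t base = time(NULL);

	txg_last_check = base;

	/* Simulate ten seconds of heavy load with one sync per second. */
	for (int t = 1; t <= 10; t++)
		report_txg_sync(base + t);

	(void) printf("previous-window figure: %u\n", txg_sync_rate);
	return (0);
}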
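zfs_zone_zio_start() and zfs_zone_zio_done() also maintain a global disk busy figure: zfs_disk_rtime advances only while at least one I/O is outstanding, in the usual kstat run-time style. A minimal sketch of that bookkeeping, with explicit timestamps in place of gethrtime() and no locking (the kernel holds zfs_disk_lock):

/*
 * Sketch of the global disk run-time accounting in zfs_zone_zio_start()
 * and zfs_zone_zio_done(): zfs_disk_rtime grows only while at least one
 * I/O is outstanding (the usual kstat "busy time" convention).  Explicit
 * timestamps and the lack of locking are simplifications.
 */
#include <stdio.h>

static unsigned int disk_rcnt;		/* I/Os currently outstanding */
static long long disk_rtime;		/* accumulated busy time */
static long long disk_rlastupdate;	/* time of the last state change */

static void
disk_io_start(long long now)
{
	/* Only accumulate if the disk was already busy before this I/O. */
	if (disk_rcnt++ != 0)
		disk_rtime += now - disk_rlastupdate;
	disk_rlastupdate = now;
}

static void
disk_io_done(long long now)
{
	disk_rcnt--;
	disk_rtime += now - disk_rlastupdate;
	disk_rlastupdate = now;
}

int
main(void)
{
	/*
	 * Two overlapping I/Os, 0-10 and 5-20 (arbitrary units): the disk
	 * is busy from 0 through 20, so the accumulated busy time is 20.
	 */
	disk_io_start(0);
	disk_io_start(5);
	disk_io_done(10);
	disk_io_done(20);

	(void) printf("busy time: %lld, outstanding: %u\n",
	    disk_rtime, disk_rcnt);
	return (0);
}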
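zfs_zone_schedule() departs from FIFO only on the two synchronous queues, and only once the queue is at least zfs_zone_schedule_thresh entries deep. The sketch below restates that gate with a plain array in place of the AVL tree; the zone-aware picker shown is a hypothetical stand-in for get_next_zio(), whose actual selection logic is not part of the hunks above:

/*
 * Sketch of the queue-depth gate in zfs_zone_schedule(): shallow queues
 * (or the feature being disabled) are serviced strictly FIFO; deeper
 * queues may hand the decision to a zone-aware picker.  The array queue
 * and pick_zone_aware() below are stand-ins; the kernel walks an AVL
 * tree and uses get_next_zio(), whose logic is not shown in this diff.
 */
#include <stdio.h>
#include <stddef.h>

typedef struct fake_zio {
	int	fz_zoneid;
} fake_zio_t;

/* Hypothetical picker: prefer the first entry from a zone other than last. */
static fake_zio_t *
pick_zone_aware(fake_zio_t *q, size_t cnt, int last_zone)
{
	for (size_t i = 0; i < cnt; i++) {
		if (q[i].fz_zoneid != last_zone)
			return (&q[i]);
	}
	return (&q[0]);
}

static fake_zio_t *
pick_next(fake_zio_t *q, size_t cnt, int last_zone, int enabled,
    size_t thresh)
{
	/* Shallow queue, or zone scheduling disabled: plain FIFO. */
	if (!enabled || cnt < thresh)
		return (&q[0]);
	return (pick_zone_aware(q, cnt, last_zone));
}

int
main(void)
{
	fake_zio_t q[] = {
		{ .fz_zoneid = 7 }, { .fz_zoneid = 7 }, { .fz_zoneid = 12 }
	};
	fake_zio_t *zp;

	/* Queue is at the threshold, so skip past zone 7, which just ran. */
	zp = pick_next(q, 3, 7, 1, 3);
	(void) printf("picked zio from zone %d\n", zp->fz_zoneid);
	return (0);
}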
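Finally, the zvol_read()/zvol_write() changes bucket each operation's latency cumulatively: anything at or above 10ms bumps zv_10ms_ops, anything at or above 100ms also bumps zv_100ms_ops, and so on, so a 12-second write increments all four counters. The helper below is an assumed refactoring of that if/else ladder, using plain counters rather than the per-zone kstats updated with atomic_inc_64():

/*
 * Sketch of the cumulative latency buckets used in zvol_read() and
 * zvol_write().  The struct and field names are illustrative; the kernel
 * keeps these as per-zone kstat values.
 */
#include <stdio.h>
#include <stdint.h>

#define	VOP_LATENCY_10MS	10000000LL	/* nanoseconds */
#define	VOP_LATENCY_100MS	100000000LL
#define	VOP_LATENCY_1S		1000000000LL
#define	VOP_LATENCY_10S		10000000000LL

typedef struct latency_buckets {
	uint64_t lb_10ms_ops;	/* ops that took at least 10ms */
	uint64_t lb_100ms_ops;	/* ops that took at least 100ms */
	uint64_t lb_1s_ops;	/* ops that took at least 1s */
	uint64_t lb_10s_ops;	/* ops that took at least 10s */
} latency_buckets_t;

static void
bucket_latency(latency_buckets_t *lb, long long lat)
{
	/* Buckets are cumulative: a slow op counts in every lower bucket. */
	if (lat < VOP_LATENCY_10MS)
		return;
	lb->lb_10ms_ops++;
	if (lat < VOP_LATENCY_100MS)
		return;
	lb->lb_100ms_ops++;
	if (lat < VOP_LATENCY_1S)
		return;
	lb->lb_1s_ops++;
	if (lat < VOP_LATENCY_10S)
		return;
	lb->lb_10s_ops++;
}

int
main(void)
{
	latency_buckets_t lb = { 0 };

	bucket_latency(&lb, 5000000LL);		/* 5ms: no bucket */
	bucket_latency(&lb, 250000000LL);	/* 250ms: 10ms and 100ms */
	bucket_latency(&lb, 12000000000LL);	/* 12s: all four buckets */

	(void) printf("10ms=%llu 100ms=%llu 1s=%llu 10s=%llu\n",
	    (unsigned long long)lb.lb_10ms_ops,
	    (unsigned long long)lb.lb_100ms_ops,
	    (unsigned long long)lb.lb_1s_ops,
	    (unsigned long long)lb.lb_10s_ops);
	return (0);
}

Keeping the buckets cumulative means a consumer can read "operations slower than N" straight from a single counter instead of summing the tail of a histogram.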