Diffstat (limited to 'usr/src/uts/common/fs')
-rw-r--r--  usr/src/uts/common/fs/bootfs/bootfs_construct.c      368
-rw-r--r--  usr/src/uts/common/fs/bootfs/bootfs_vfsops.c         321
-rw-r--r--  usr/src/uts/common/fs/bootfs/bootfs_vnops.c          544
-rw-r--r--  usr/src/uts/common/fs/dev/sdev_netops.c              259
-rw-r--r--  usr/src/uts/common/fs/dev/sdev_plugin.c              913
-rw-r--r--  usr/src/uts/common/fs/dev/sdev_subr.c                211
-rw-r--r--  usr/src/uts/common/fs/dev/sdev_vfsops.c               23
-rw-r--r--  usr/src/uts/common/fs/dev/sdev_zvolops.c               8
-rw-r--r--  usr/src/uts/common/fs/dnlc.c                          44
-rw-r--r--  usr/src/uts/common/fs/fem.c                          688
-rw-r--r--  usr/src/uts/common/fs/fifofs/fifosubr.c                7
-rw-r--r--  usr/src/uts/common/fs/fifofs/fifovnops.c              13
-rw-r--r--  usr/src/uts/common/fs/fs_subr.c                       30
-rw-r--r--  usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c        640
-rw-r--r--  usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c       127
-rw-r--r--  usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c     614
-rw-r--r--  usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c     1441
-rw-r--r--  usr/src/uts/common/fs/lookup.c                       316
-rw-r--r--  usr/src/uts/common/fs/lxproc/lxpr_subr.c             524
-rw-r--r--  usr/src/uts/common/fs/lxproc/lxpr_vfsops.c           367
-rw-r--r--  usr/src/uts/common/fs/lxproc/lxpr_vnops.c           3099
-rw-r--r--  usr/src/uts/common/fs/lxproc/lxproc.h                277
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs3_vfsops.c                1
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs3_vnops.c                 5
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs4_srv_ns.c                5
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs4_vfsops.c                1
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs4_vnops.c                30
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs_auth.c                  26
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs_server.c                 6
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs_vfsops.c                 1
-rw-r--r--  usr/src/uts/common/fs/nfs/nfs_vnops.c                  6
-rw-r--r--  usr/src/uts/common/fs/pcfs/pc_dir.c                   12
-rw-r--r--  usr/src/uts/common/fs/portfs/port.c                   36
-rw-r--r--  usr/src/uts/common/fs/portfs/port_vnops.c             16
-rw-r--r--  usr/src/uts/common/fs/proc/prargv.c                  441
-rw-r--r--  usr/src/uts/common/fs/proc/prcontrol.c                16
-rw-r--r--  usr/src/uts/common/fs/proc/prdata.h                    5
-rw-r--r--  usr/src/uts/common/fs/proc/prsubr.c                    3
-rw-r--r--  usr/src/uts/common/fs/proc/prvnops.c                 164
-rw-r--r--  usr/src/uts/common/fs/sockfs/sockcommon.c              8
-rw-r--r--  usr/src/uts/common/fs/sockfs/sockcommon_sops.c        57
-rw-r--r--  usr/src/uts/common/fs/sockfs/sockcommon_subr.c        52
-rw-r--r--  usr/src/uts/common/fs/sockfs/sockfilter.c             24
-rw-r--r--  usr/src/uts/common/fs/sockfs/sockfilter_impl.h         2
-rw-r--r--  usr/src/uts/common/fs/sockfs/socknotify.c              2
-rw-r--r--  usr/src/uts/common/fs/sockfs/socksubr.c                3
-rw-r--r--  usr/src/uts/common/fs/sockfs/socksyscalls.c          122
-rw-r--r--  usr/src/uts/common/fs/sockfs/socktpi.c                 7
-rw-r--r--  usr/src/uts/common/fs/swapfs/swap_subr.c               6
-rw-r--r--  usr/src/uts/common/fs/tmpfs/tmp_dir.c                 12
-rw-r--r--  usr/src/uts/common/fs/tmpfs/tmp_subr.c               195
-rw-r--r--  usr/src/uts/common/fs/tmpfs/tmp_vfsops.c             269
-rw-r--r--  usr/src/uts/common/fs/tmpfs/tmp_vnops.c               76
-rw-r--r--  usr/src/uts/common/fs/udfs/udf_dir.c                   6
-rw-r--r--  usr/src/uts/common/fs/udfs/udf_vnops.c                 9
-rw-r--r--  usr/src/uts/common/fs/ufs/ufs_vnops.c                 46
-rw-r--r--  usr/src/uts/common/fs/vfs.c                            8
-rw-r--r--  usr/src/uts/common/fs/vnode.c                        157
-rw-r--r--  usr/src/uts/common/fs/zfs/arc.c                        9
-rw-r--r--  usr/src/uts/common/fs/zfs/dbuf.c                      15
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu.c                        1
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_tx.c                     4
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_dir.c                    3
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_pool.c                   1
-rw-r--r--  usr/src/uts/common/fs/zfs/metaslab.c                   8
-rw-r--r--  usr/src/uts/common/fs/zfs/sa.c                        12
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev_impl.h              1
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_zone.h              63
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zio.h                    4
-rw-r--r--  usr/src/uts/common/fs/zfs/txg.c                        3
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_disk.c                 14
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_queue.c                12
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_dir.c                    7
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_ioctl.c                 66
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_vfsops.c                12
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_vnops.c                 68
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_zone.c                1336
-rw-r--r--  usr/src/uts/common/fs/zfs/zil.c                       14
-rw-r--r--  usr/src/uts/common/fs/zfs/zio.c                        5
-rw-r--r--  usr/src/uts/common/fs/zfs/zvol.c                     103
80 files changed, 13534 insertions, 896 deletions
diff --git a/usr/src/uts/common/fs/bootfs/bootfs_construct.c b/usr/src/uts/common/fs/bootfs/bootfs_construct.c
new file mode 100644
index 0000000000..b909b5d121
--- /dev/null
+++ b/usr/src/uts/common/fs/bootfs/bootfs_construct.c
@@ -0,0 +1,368 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2015 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * This file takes care of reading the boot time modules and constructing them
+ * into the appropriate series of vnodes.
+ */
+
+#include <sys/conf.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/vfs.h>
+#include <sys/sysmacros.h>
+#include <sys/stat.h>
+
+#include <sys/fs/bootfs_impl.h>
+
+kmem_cache_t *bootfs_node_cache;
+
+static const vattr_t bootfs_vattr_dir = {
+ AT_ALL, /* va_mask */
+ VDIR, /* va_type */
+ S_IFDIR | 0555, /* va_mode */
+ 0, /* va_uid */
+ 0, /* va_gid */
+ 0, /* va_fsid */
+ 0, /* va_nodeid */
+ 1, /* va_nlink */
+ 0, /* va_size */
+ 0, /* va_atime */
+ 0, /* va_mtime */
+ 0, /* va_ctime */
+ 0, /* va_rdev */
+ 0, /* va_blksize */
+ 0, /* va_nblocks */
+ 0 /* va_seq */
+};
+
+static const vattr_t bootfs_vattr_reg = {
+ AT_ALL, /* va_mask */
+ VREG, /* va_type */
+ S_IFREG | 0555, /* va_mode */
+ 0, /* va_uid */
+ 0, /* va_gid */
+ 0, /* va_fsid */
+ 0, /* va_nodeid */
+ 1, /* va_nlink */
+ 0, /* va_size */
+ 0, /* va_atime */
+ 0, /* va_mtime */
+ 0, /* va_ctime */
+ 0, /* va_rdev */
+ 0, /* va_blksize */
+ 0, /* va_nblocks */
+ 0 /* va_seq */
+};
+
+/*ARGSUSED*/
+int
+bootfs_node_constructor(void *buf, void *arg, int kmflags)
+{
+ bootfs_node_t *bnp = buf;
+
+ bnp->bvn_vnp = vn_alloc(kmflags);
+ if (bnp->bvn_vnp == NULL)
+ return (-1);
+
+ return (0);
+}
+
+/*ARGSUSED*/
+void
+bootfs_node_destructor(void *buf, void *arg)
+{
+ bootfs_node_t *bnp = buf;
+
+ vn_free(bnp->bvn_vnp);
+}
+
+static int
+bootfs_comparator(const void *a, const void *b)
+{
+ const bootfs_node_t *lfs, *rfs;
+ int ret;
+
+ lfs = a;
+ rfs = b;
+
+ ret = strcmp(lfs->bvn_name, rfs->bvn_name);
+ if (ret > 0)
+ ret = 1;
+ if (ret < 0)
+ ret = -1;
+ return (ret);
+}
+
+static void
+bootfs_node_init(bootfs_t *bfs, bootfs_node_t *bnp, const struct vattr *vap,
+ const char *name, size_t namelen)
+{
+ timestruc_t now;
+
+ vn_reinit(bnp->bvn_vnp);
+
+ bnp->bvn_vnp->v_flag |= VNOSWAP;
+ bnp->bvn_vnp->v_type = vap->va_type;
+ bnp->bvn_vnp->v_vfsp = bfs->bfs_vfsp;
+ bnp->bvn_vnp->v_rdev = 0;
+ bnp->bvn_vnp->v_data = (caddr_t)bnp;
+ vn_setops(bnp->bvn_vnp, bootfs_vnodeops);
+
+ bnp->bvn_name = kmem_alloc(namelen + 1, KM_SLEEP);
+ bcopy(name, bnp->bvn_name, namelen);
+ bnp->bvn_name[namelen] = '\0';
+ if (vap->va_type == VDIR) {
+ avl_create(&bnp->bvn_dir, bootfs_comparator,
+ sizeof (bootfs_node_t),
+ offsetof(bootfs_node_t, bvn_link));
+ }
+ bzero(&bnp->bvn_link, sizeof (avl_node_t));
+ bcopy(vap, &bnp->bvn_attr, sizeof (vattr_t));
+
+ gethrestime(&now);
+ bnp->bvn_attr.va_atime = now;
+ bnp->bvn_attr.va_ctime = now;
+ bnp->bvn_attr.va_mtime = now;
+ bnp->bvn_attr.va_fsid = makedevice(bootfs_major, bfs->bfs_minor);
+ bnp->bvn_attr.va_nodeid = bfs->bfs_ninode;
+ bnp->bvn_attr.va_blksize = PAGESIZE;
+ bfs->bfs_ninode++;
+ list_insert_tail(&bfs->bfs_nodes, bnp);
+}
+
+static void
+bootfs_mkroot(bootfs_t *bfs)
+{
+ bootfs_node_t *bnp;
+
+ bnp = kmem_cache_alloc(bootfs_node_cache, KM_SLEEP);
+ bootfs_node_init(bfs, bnp, &bootfs_vattr_dir, "/", 1);
+ bnp->bvn_vnp->v_flag |= VROOT;
+ bnp->bvn_parent = bnp;
+ bfs->bfs_rootvn = bnp;
+ bfs->bfs_stat.bfss_ndirs.value.ui32++;
+ vn_exists(bnp->bvn_vnp);
+}
+
+static int
+bootfs_mknode(bootfs_t *bfs, bootfs_node_t *parent, bootfs_node_t **outp,
+ const char *name, size_t namelen, const vattr_t *vap, uintptr_t addr,
+ uint64_t size)
+{
+ bootfs_node_t *bnp;
+ bootfs_node_t sn;
+ avl_index_t where;
+ char *buf;
+
+ ASSERT(parent->bvn_attr.va_type == VDIR);
+ buf = kmem_alloc(namelen + 1, KM_SLEEP);
+ bcopy(name, buf, namelen);
+ buf[namelen] = '\0';
+ sn.bvn_name = buf;
+ if ((bnp = avl_find(&parent->bvn_dir, &sn, &where)) != NULL) {
+ kmem_free(buf, namelen + 1);
+ /* Directories can collide, files cannot */
+ if (vap->va_type == VDIR) {
+ *outp = bnp;
+ return (0);
+ }
+ return (EEXIST);
+ }
+ kmem_free(buf, namelen + 1);
+
+ bnp = kmem_cache_alloc(bootfs_node_cache, KM_SLEEP);
+ bootfs_node_init(bfs, bnp, vap, name, namelen);
+ bnp->bvn_parent = parent;
+ avl_add(&parent->bvn_dir, bnp);
+ *outp = bnp;
+
+ if (vap->va_type == VDIR) {
+ parent->bvn_attr.va_size++;
+ parent->bvn_attr.va_nlink++;
+ bfs->bfs_stat.bfss_ndirs.value.ui32++;
+ } else {
+ bnp->bvn_addr = addr;
+ bnp->bvn_size = size;
+ bfs->bfs_stat.bfss_nfiles.value.ui32++;
+ bfs->bfs_stat.bfss_nbytes.value.ui64 += size;
+ bnp->bvn_attr.va_nblocks = P2ROUNDUP(size, 512) >> 9;
+ bnp->bvn_attr.va_size = size;
+ }
+
+ vn_exists(bnp->bvn_vnp);
+
+ return (0);
+}
+
+/*
+ * Given the address, size, and path a boot-time module would like, go through
+ * and create all of the directory entries that are required and then the file
+ * itself. If someone has passed in a module that has the same name as another
+ * one, we honor the first one.
+ */
+static int
+bootfs_construct_entry(bootfs_t *bfs, uintptr_t addr, uint64_t size,
+ const char *mname)
+{
+ char *sp;
+ size_t nlen;
+ int ret;
+ bootfs_node_t *nbnp;
+
+ const char *p = mname;
+ bootfs_node_t *bnp = bfs->bfs_rootvn;
+
+ if (*p == '\0')
+ return (EINVAL);
+
+ for (;;) {
+ /* First eliminate all leading / characters. */
+ while (*p == '/')
+ p++;
+
+ /* A name with all slashes or ending in a / */
+ if (*p == '\0')
+ return (EINVAL);
+
+ sp = strchr(p, '/');
+ if (sp == NULL)
+ break;
+ nlen = (ptrdiff_t)sp - (ptrdiff_t)p;
+ if (strncmp(p, ".", nlen) == 0) {
+ p = sp + 1;
+ continue;
+ }
+
+ if (strncmp(p, "..", nlen) == 0) {
+ bnp = bnp->bvn_parent;
+ p = sp + 1;
+ continue;
+ }
+
+ VERIFY(bootfs_mknode(bfs, bnp, &nbnp, p, nlen,
+ &bootfs_vattr_dir, addr, size) == 0);
+ p = sp + 1;
+ bnp = nbnp;
+ }
+
+ nlen = strlen(p);
+ ret = bootfs_mknode(bfs, bnp, &nbnp, p, nlen, &bootfs_vattr_reg,
+ addr, size);
+ if (ret != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * We're going to go through every boot time module and construct the
+ * appropriate vnodes for them now. Because there are very few of these that
+ * exist, generally on the order of a handful, we're going to create them all
+ * when the file system is initialized and then tear them all down when the
+ * module gets unloaded.
+ *
+ * The information about the modules is contained in properties on the root of
+ * the devinfo tree. Specifically there are three properties per module:
+ *
+ * - module-size-%d int64_t size, in bytes, of the boot time module.
+ * - module-addr-%d The address of the boot time module
+ * - module-name-%d The string name of the boot time module
+ *
+ * Note that the module-size and module-addr fields are always 64-bit values
+ * regardless of being on a 32-bit or 64-bit kernel. module-name is a string
+ * property.
+ *
+ * There is no property that indicates the total number of such modules.
+ * Module numbering starts at 0 and increases incrementally; the first time we
+ * fail to find a module or one of its properties, we stop.
+ */
+void
+bootfs_construct(bootfs_t *bfs)
+{
+ uint_t id = 0, ndata;
+ char paddr[64], psize[64], pname[64], *mname;
+ dev_info_t *root;
+ uchar_t *datap;
+ uint64_t size = 0, addr = 0;
+ int ret;
+
+ bootfs_mkroot(bfs);
+ root = ddi_root_node();
+
+ for (;;) {
+ if (id == UINT32_MAX)
+ break;
+
+ if (snprintf(paddr, sizeof (paddr), "module-addr-%d", id) >
+ sizeof (paddr))
+ break;
+
+ if (snprintf(psize, sizeof (paddr), "module-size-%d", id) >
+ sizeof (paddr))
+ break;
+
+ if (snprintf(pname, sizeof (paddr), "module-name-%d", id) >
+ sizeof (paddr))
+ break;
+
+ if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, root,
+ DDI_PROP_DONTPASS, paddr, &datap, &ndata) !=
+ DDI_PROP_SUCCESS)
+ break;
+
+ if (ndata == 8)
+ bcopy(datap, &addr, sizeof (uint64_t));
+ ddi_prop_free(datap);
+ if (ndata != 8)
+ break;
+
+ if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, root,
+ DDI_PROP_DONTPASS, psize, &datap, &ndata) !=
+ DDI_PROP_SUCCESS)
+ break;
+ if (ndata == 8)
+ bcopy(datap, &size, sizeof (uint64_t));
+ ddi_prop_free(datap);
+ if (ndata != 8)
+ break;
+
+ if (ddi_prop_lookup_string(DDI_DEV_T_ANY, root,
+ DDI_PROP_DONTPASS, pname, &mname) != DDI_PROP_SUCCESS)
+ break;
+
+ ret = bootfs_construct_entry(bfs, addr, size, mname);
+ if (ret == EINVAL)
+ bfs->bfs_stat.bfss_ndiscards.value.ui32++;
+ if (ret == EEXIST)
+ bfs->bfs_stat.bfss_ndups.value.ui32++;
+ ddi_prop_free(mname);
+
+ id++;
+ }
+}
+
+void
+bootfs_destruct(bootfs_t *bfs)
+{
+ bootfs_node_t *bnp;
+
+ while ((bnp = list_remove_head(&bfs->bfs_nodes)) != NULL) {
+ ASSERT(bnp->bvn_vnp->v_count == 1);
+ VN_RELE(bnp->bvn_vnp);
+ kmem_free(bnp->bvn_name, strlen(bnp->bvn_name) + 1);
+ kmem_cache_free(bootfs_node_cache, bnp);
+ }
+}
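The enumeration loop in bootfs_construct() above follows the convention documented in its block comment: three properties per module, no count property, and the walk ends at the first failed lookup. The following is a minimal sketch of that pattern in isolation, using the same DDI property routines that appear above; the bootmod_walk()/bootmod_get_u64() helpers and the callback type are hypothetical and error handling is reduced to the bare minimum.

/*
 * Sketch only (not part of bootfs): enumerate the module-addr-%d,
 * module-size-%d, and module-name-%d properties hanging off the devinfo
 * root node.
 */
#include <sys/types.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>

typedef void (*bootmod_cb_t)(uint64_t addr, uint64_t size, const char *name);

static boolean_t
bootmod_get_u64(dev_info_t *root, char *prop, uint64_t *valp)
{
	uchar_t *datap;
	uint_t ndata;

	if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, root,
	    DDI_PROP_DONTPASS, prop, &datap, &ndata) != DDI_PROP_SUCCESS)
		return (B_FALSE);
	if (ndata == sizeof (uint64_t))
		bcopy(datap, valp, sizeof (uint64_t));
	ddi_prop_free(datap);
	return (ndata == sizeof (uint64_t));
}

static void
bootmod_walk(bootmod_cb_t cb)
{
	dev_info_t *root = ddi_root_node();
	char paddr[64], psize[64], pname[64];
	char *mname;
	uint64_t addr, size;
	uint_t id;

	for (id = 0; ; id++) {
		(void) snprintf(paddr, sizeof (paddr), "module-addr-%d", id);
		(void) snprintf(psize, sizeof (psize), "module-size-%d", id);
		(void) snprintf(pname, sizeof (pname), "module-name-%d", id);

		/* The first missing property ends the walk. */
		if (!bootmod_get_u64(root, paddr, &addr) ||
		    !bootmod_get_u64(root, psize, &size))
			break;
		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, root,
		    DDI_PROP_DONTPASS, pname, &mname) != DDI_PROP_SUCCESS)
			break;

		cb(addr, size, mname);
		ddi_prop_free(mname);
	}
}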
diff --git a/usr/src/uts/common/fs/bootfs/bootfs_vfsops.c b/usr/src/uts/common/fs/bootfs/bootfs_vfsops.c
new file mode 100644
index 0000000000..e642e86169
--- /dev/null
+++ b/usr/src/uts/common/fs/bootfs/bootfs_vfsops.c
@@ -0,0 +1,321 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2015 Joyent, Inc.
+ */
+
+#include <sys/errno.h>
+#include <sys/modctl.h>
+#include <sys/types.h>
+#include <sys/mkdev.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/systm.h>
+#include <sys/id_space.h>
+#include <sys/cmn_err.h>
+#include <sys/ksynch.h>
+#include <sys/policy.h>
+#include <sys/mount.h>
+#include <sys/sysmacros.h>
+
+#include <sys/fs/bootfs_impl.h>
+
+/*
+ * While booting, additional types of modules and files can be passed in to the
+ * loader. These include the familiar boot archive, as well as a module hash
+ * and additional modules that are interpreted as files. As part of the handoff
+ * in early boot, information about these modules is saved as properties on the
+ * root of the devinfo tree, similar to other boot-time properties.
+ *
+ * This file system provides a read-only view of those additional files. Due to
+ * its limited scope, it has a slightly simpler construction than several other
+ * file systems. When mounted, it looks for the corresponding properties and
+ * creates bootfs_node_t's and vnodes for all of the corresponding files and
+ * directories that exist along the way. At this time, only a rather small
+ * number of files are passed in this way.
+ *
+ * This does lead to one behavior that folks used to other file systems might
+ * find peculiar. Because we are not always actively creating and destroying the
+ * required vnodes on demand, the count on the root vnode does not increase
+ * along with the number of other vnodes. This means that a bootfs file
+ * system that is not in use will have all of its vnodes exist with a v_count of
+ * one.
+ */
+
+major_t bootfs_major;
+static int bootfs_fstype;
+static id_space_t *bootfs_idspace;
+static uint64_t bootfs_nactive;
+static kmutex_t bootfs_lock;
+
+static const char *bootfs_name = "bootfs";
+
+static int
+bootfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
+{
+ int ret;
+ bootfs_t *bfs;
+ struct pathname dpn;
+ dev_t fsdev;
+
+ if ((ret = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
+ return (ret);
+
+ if (mvp->v_type != VDIR)
+ return (ENOTDIR);
+
+ if (uap->flags & MS_REMOUNT)
+ return (EBUSY);
+
+ mutex_enter(&mvp->v_lock);
+ if ((uap->flags & MS_OVERLAY) == 0 &&
+ (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
+ mutex_exit(&mvp->v_lock);
+ return (EBUSY);
+ }
+ mutex_exit(&mvp->v_lock);
+
+ /*
+ * We indicate that the backing store is bootfs. We don't want to use
+ * swap, because folks might think that this is putting all the data
+ * into memory a la tmpfs. Rather, these modules are always in memory and
+ * there's nothing to be done about that.
+ */
+ vfs_setresource(vfsp, bootfs_name, 0);
+ bfs = kmem_zalloc(sizeof (bootfs_t), KM_NOSLEEP | KM_NORMALPRI);
+ if (bfs == NULL)
+ return (ENOMEM);
+
+ ret = pn_get(uap->dir,
+ (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn);
+ if (ret != 0) {
+ kmem_free(bfs, sizeof (bootfs_t));
+ return (ret);
+ }
+
+ bfs->bfs_minor = id_alloc(bootfs_idspace);
+ bfs->bfs_kstat = kstat_create_zone("bootfs", bfs->bfs_minor, "bootfs",
+ "fs", KSTAT_TYPE_NAMED,
+ sizeof (bootfs_stat_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID);
+ if (bfs->bfs_kstat == NULL) {
+ id_free(bootfs_idspace, bfs->bfs_minor);
+ pn_free(&dpn);
+ kmem_free(bfs, sizeof (bootfs_t));
+ return (ENOMEM);
+ }
+ bfs->bfs_kstat->ks_data = &bfs->bfs_stat;
+
+ fsdev = makedevice(bootfs_major, bfs->bfs_minor);
+ bfs->bfs_vfsp = vfsp;
+
+ vfsp->vfs_data = (caddr_t)bfs;
+ vfsp->vfs_fstype = bootfs_fstype;
+ vfsp->vfs_dev = fsdev;
+ vfsp->vfs_bsize = PAGESIZE;
+ vfsp->vfs_flag |= VFS_RDONLY | VFS_NOSETUID | VFS_NOTRUNC |
+ VFS_UNLINKABLE;
+ vfs_make_fsid(&vfsp->vfs_fsid, fsdev, bootfs_fstype);
+ bfs->bfs_mntpath = kmem_alloc(dpn.pn_pathlen + 1, KM_SLEEP);
+ bcopy(dpn.pn_path, bfs->bfs_mntpath, dpn.pn_pathlen);
+ bfs->bfs_mntpath[dpn.pn_pathlen] = '\0';
+ pn_free(&dpn);
+ list_create(&bfs->bfs_nodes, sizeof (bootfs_node_t),
+ offsetof(bootfs_node_t, bvn_alink));
+
+ kstat_named_init(&bfs->bfs_stat.bfss_nfiles, "nfiles",
+ KSTAT_DATA_UINT32);
+ kstat_named_init(&bfs->bfs_stat.bfss_ndirs, "ndirs",
+ KSTAT_DATA_UINT32);
+ kstat_named_init(&bfs->bfs_stat.bfss_nbytes, "nbytes",
+ KSTAT_DATA_UINT64);
+ kstat_named_init(&bfs->bfs_stat.bfss_ndups, "ndup",
+ KSTAT_DATA_UINT32);
+ kstat_named_init(&bfs->bfs_stat.bfss_ndiscards, "ndiscard",
+ KSTAT_DATA_UINT32);
+
+ bootfs_construct(bfs);
+
+ kstat_install(bfs->bfs_kstat);
+
+ return (0);
+}
+
+static int
+bootfs_unmount(vfs_t *vfsp, int flag, cred_t *cr)
+{
+ int ret;
+ bootfs_t *bfs = vfsp->vfs_data;
+ bootfs_node_t *bnp;
+
+ if ((ret = secpolicy_fs_unmount(cr, vfsp)) != 0)
+ return (ret);
+
+ if (flag & MS_FORCE)
+ return (ENOTSUP);
+
+ for (bnp = list_head(&bfs->bfs_nodes); bnp != NULL;
+ bnp = list_next(&bfs->bfs_nodes, bnp)) {
+ mutex_enter(&bnp->bvn_vnp->v_lock);
+ if (bnp->bvn_vnp->v_count > 1) {
+ mutex_exit(&bnp->bvn_vnp->v_lock);
+ return (EBUSY);
+ }
+ mutex_exit(&bnp->bvn_vnp->v_lock);
+ }
+
+ kstat_delete(bfs->bfs_kstat);
+ bootfs_destruct(bfs);
+ list_destroy(&bfs->bfs_nodes);
+ kmem_free(bfs->bfs_mntpath, strlen(bfs->bfs_mntpath) + 1);
+ id_free(bootfs_idspace, bfs->bfs_minor);
+ kmem_free(bfs, sizeof (bootfs_t));
+ return (0);
+}
+
+static int
+bootfs_root(vfs_t *vfsp, vnode_t **vpp)
+{
+ bootfs_t *bfs;
+
+ bfs = (bootfs_t *)vfsp->vfs_data;
+ *vpp = bfs->bfs_rootvn->bvn_vnp;
+ VN_HOLD(*vpp)
+
+ return (0);
+}
+
+static int
+bootfs_statvfs(vfs_t *vfsp, struct statvfs64 *sbp)
+{
+ const bootfs_t *bfs = (bootfs_t *)vfsp->vfs_data;
+ dev32_t d32;
+
+ sbp->f_bsize = PAGESIZE;
+ sbp->f_frsize = PAGESIZE;
+
+ sbp->f_blocks = bfs->bfs_stat.bfss_nbytes.value.ui64 >> PAGESHIFT;
+ sbp->f_bfree = 0;
+ sbp->f_bavail = 0;
+
+ sbp->f_files = bfs->bfs_stat.bfss_nfiles.value.ui32 +
+ bfs->bfs_stat.bfss_ndirs.value.ui32;
+ sbp->f_ffree = 0;
+ sbp->f_favail = 0;
+
+ (void) cmpldev(&d32, vfsp->vfs_dev);
+ sbp->f_fsid = d32;
+ (void) strlcpy(sbp->f_basetype, bootfs_name, FSTYPSZ);
+ bzero(sbp->f_fstr, sizeof (sbp->f_fstr));
+
+ return (0);
+}
+
+static const fs_operation_def_t bootfs_vfsops_tmpl[] = {
+ VFSNAME_MOUNT, { .vfs_mount = bootfs_mount },
+ VFSNAME_UNMOUNT, { .vfs_unmount = bootfs_unmount },
+ VFSNAME_ROOT, { .vfs_root = bootfs_root },
+ VFSNAME_STATVFS, { .vfs_statvfs = bootfs_statvfs },
+ NULL, NULL
+};
+
+static int
+bootfs_init(int fstype, char *name)
+{
+ int ret;
+
+ bootfs_fstype = fstype;
+ ASSERT(bootfs_fstype != 0);
+
+ ret = vfs_setfsops(fstype, bootfs_vfsops_tmpl, NULL);
+ if (ret != 0)
+ return (ret);
+
+ ret = vn_make_ops(name, bootfs_vnodeops_template, &bootfs_vnodeops);
+ if (ret != 0) {
+ (void) vfs_freevfsops_by_type(bootfs_fstype);
+ return (ret);
+ }
+
+ bootfs_major = getudev();
+ if (bootfs_major == (major_t)-1) {
+ cmn_err(CE_WARN, "bootfs_init: Can't get unique device number");
+ bootfs_major = 0;
+ }
+
+ bootfs_nactive = 0;
+ return (0);
+}
+
+static mntopts_t bootfs_mntopts = {
+ 0, NULL
+};
+
+static vfsdef_t bootfs_vfsdef = {
+ VFSDEF_VERSION,
+ "bootfs",
+ bootfs_init,
+ VSW_HASPROTO|VSW_STATS,
+ &bootfs_mntopts
+};
+
+static struct modlfs bootfs_modlfs = {
+ &mod_fsops, "boot-time modules file system", &bootfs_vfsdef
+};
+
+static struct modlinkage bootfs_modlinkage = {
+ MODREV_1, &bootfs_modlfs, NULL
+};
+
+int
+_init(void)
+{
+ bootfs_node_cache = kmem_cache_create("bootfs_node_cache",
+ sizeof (bootfs_node_t), 0, bootfs_node_constructor,
+ bootfs_node_destructor, NULL, NULL, NULL, 0);
+ bootfs_idspace = id_space_create("bootfs_minors", 1, INT32_MAX);
+ mutex_init(&bootfs_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ return (mod_install(&bootfs_modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&bootfs_modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ int err;
+
+ mutex_enter(&bootfs_lock);
+ if (bootfs_nactive > 0) {
+ mutex_exit(&bootfs_lock);
+ return (EBUSY);
+ }
+ mutex_exit(&bootfs_lock);
+
+ err = mod_remove(&bootfs_modlinkage);
+ if (err != 0)
+ return (err);
+
+ (void) vfs_freevfsops_by_type(bootfs_fstype);
+ vn_freevnodeops(bootfs_vnodeops);
+ id_space_destroy(bootfs_idspace);
+ mutex_destroy(&bootfs_lock);
+ kmem_cache_destroy(bootfs_node_cache);
+ return (err);
+}
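The statvfs translation above can be checked from user land once a bootfs instance is mounted. The sketch below is hypothetical: the mount point is only an example, and the program simply prints the fields that bootfs_statvfs() actually fills in (block counts derived from bfss_nbytes, file counts from nfiles + ndirs). Because the file system is read-only, the free/available fields are expected to be zero.

/* User-land sketch; the mount point is an assumption, not a fixed path. */
#include <sys/statvfs.h>
#include <stdio.h>

int
main(void)
{
	struct statvfs vs;

	if (statvfs("/system/boot", &vs) != 0) {
		perror("statvfs");
		return (1);
	}

	(void) printf("fstype: %s\n", vs.f_basetype);
	(void) printf("bsize:  %lu\n", vs.f_bsize);
	(void) printf("blocks: %llu (bfss_nbytes >> PAGESHIFT)\n",
	    (unsigned long long)vs.f_blocks);
	(void) printf("files:  %llu (nfiles + ndirs)\n",
	    (unsigned long long)vs.f_files);
	return (0);
}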
diff --git a/usr/src/uts/common/fs/bootfs/bootfs_vnops.c b/usr/src/uts/common/fs/bootfs/bootfs_vnops.c
new file mode 100644
index 0000000000..f63d0a4f24
--- /dev/null
+++ b/usr/src/uts/common/fs/bootfs/bootfs_vnops.c
@@ -0,0 +1,544 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * bootfs vnode operations
+ */
+
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/sunddi.h>
+#include <sys/errno.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/mman.h>
+#include <fs/fs_subr.h>
+#include <sys/policy.h>
+#include <sys/sysmacros.h>
+#include <sys/dirent.h>
+#include <sys/uio.h>
+#include <vm/pvn.h>
+#include <vm/hat.h>
+#include <vm/seg_map.h>
+#include <vm/seg_vn.h>
+#include <sys/vmsystm.h>
+
+#include <sys/fs/bootfs_impl.h>
+
+struct vnodeops *bootfs_vnodeops;
+
+/*ARGSUSED*/
+static int
+bootfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
+{
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+bootfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
+ caller_context_t *ct)
+{
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+bootfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
+ caller_context_t *ct)
+{
+ int err;
+ ssize_t sres = uiop->uio_resid;
+ bootfs_node_t *bnp = vp->v_data;
+
+ if (vp->v_type == VDIR)
+ return (EISDIR);
+
+ if (vp->v_type != VREG)
+ return (EINVAL);
+
+ if (uiop->uio_loffset < 0)
+ return (EINVAL);
+
+ if (uiop->uio_loffset >= bnp->bvn_size)
+ return (0);
+
+ err = 0;
+ while (uiop->uio_resid != 0) {
+ caddr_t base;
+ long offset, frem;
+ ulong_t poff, segoff;
+ size_t bytes;
+ int relerr;
+
+ offset = uiop->uio_loffset;
+ poff = offset & PAGEOFFSET;
+ bytes = MIN(PAGESIZE - poff, uiop->uio_resid);
+
+ frem = bnp->bvn_size - offset;
+ if (frem <= 0) {
+ err = 0;
+ break;
+ }
+
+ /* Don't read past EOF */
+ bytes = MIN(bytes, frem);
+
+ /*
+ * Segmaps are likely larger than our page size, so make sure we
+ * have the proper offset into the resulting segmap data.
+ */
+ segoff = (offset & PAGEMASK) & MAXBOFFSET;
+
+ base = segmap_getmapflt(segkmap, vp, offset & MAXBMASK, bytes,
+ 1, S_READ);
+
+ err = uiomove(base + segoff + poff, bytes, UIO_READ, uiop);
+ relerr = segmap_release(segkmap, base, 0);
+
+ if (err == 0)
+ err = relerr;
+
+ if (err != 0)
+ break;
+ }
+
+ /* Even if we had an error in a partial read, return success */
+ if (uiop->uio_resid < sres)
+ err = 0;
+
+ gethrestime(&bnp->bvn_attr.va_atime);
+
+ return (err);
+}
+
+/*ARGSUSED*/
+static int
+bootfs_ioctl(vnode_t *vp, int cmd, intptr_t data, int flag,
+ cred_t *cr, int *rvalp, caller_context_t *ct)
+{
+ return (ENOTTY);
+}
+
+/*ARGSUSED*/
+static int
+bootfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ uint32_t mask;
+ bootfs_node_t *bpn = (bootfs_node_t *)vp->v_data;
+
+ mask = vap->va_mask;
+ bcopy(&bpn->bvn_attr, vap, sizeof (vattr_t));
+ vap->va_mask = mask;
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+bootfs_access(vnode_t *vp, int mode, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ int shift = 0;
+ bootfs_node_t *bpn = (bootfs_node_t *)vp->v_data;
+
+ if (crgetuid(cr) != bpn->bvn_attr.va_uid) {
+ shift += 3;
+ if (groupmember(bpn->bvn_attr.va_gid, cr) == 0)
+ shift += 3;
+ }
+
+ return (secpolicy_vnode_access2(cr, vp, bpn->bvn_attr.va_uid,
+ bpn->bvn_attr.va_mode << shift, mode));
+}
+
+/*ARGSUSED*/
+static int
+bootfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
+ int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
+ int *direntflags, pathname_t *realpnp)
+{
+ avl_index_t where;
+ bootfs_node_t sn, *bnp;
+ bootfs_node_t *bpp = (bootfs_node_t *)dvp->v_data;
+
+ if (flags & LOOKUP_XATTR)
+ return (EINVAL);
+
+ if (bpp->bvn_attr.va_type != VDIR)
+ return (ENOTDIR);
+
+ if (*nm == '\0' || strcmp(nm, ".") == 0) {
+ VN_HOLD(dvp);
+ *vpp = dvp;
+ return (0);
+ }
+
+ if (strcmp(nm, "..") == 0) {
+ VN_HOLD(bpp->bvn_parent->bvn_vnp);
+ *vpp = bpp->bvn_parent->bvn_vnp;
+ return (0);
+ }
+
+ sn.bvn_name = nm;
+ bnp = avl_find(&bpp->bvn_dir, &sn, &where);
+ if (bnp == NULL)
+ return (ENOENT);
+
+ VN_HOLD(bnp->bvn_vnp);
+ *vpp = bnp->bvn_vnp;
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+bootfs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
+ caller_context_t *ct, int flags)
+{
+ bootfs_node_t *bnp = (bootfs_node_t *)vp->v_data;
+ dirent64_t *dp;
+ void *buf;
+ ulong_t bsize, brem;
+ offset_t coff, roff;
+ int dlen, ret;
+ bootfs_node_t *dnp;
+ boolean_t first = B_TRUE;
+
+ if (uiop->uio_loffset >= MAXOFF_T) {
+ if (eofp != NULL)
+ *eofp = 1;
+ return (0);
+ }
+
+ if (uiop->uio_iovcnt != 1)
+ return (EINVAL);
+
+ if (!(uiop->uio_iov->iov_len > 0))
+ return (EINVAL);
+
+ if (vp->v_type != VDIR)
+ return (ENOTDIR);
+
+ roff = uiop->uio_loffset;
+ coff = 0;
+ brem = bsize = uiop->uio_iov->iov_len;
+ buf = kmem_alloc(bsize, KM_SLEEP);
+ dp = buf;
+
+ /*
+ * Recall that offsets here are done based on the name of the dirent
+ * excluding the null terminator. Therefore `.` is always at 0, `..` is
+ * always at 1, and then the first real dirent is at 3. This offset is
+ * what's actually stored when we update the offset in the structure.
+ */
+ if (roff == 0) {
+ dlen = DIRENT64_RECLEN(1);
+ if (first == B_TRUE) {
+ if (dlen > brem) {
+ kmem_free(buf, bsize);
+ return (EINVAL);
+ }
+ first = B_FALSE;
+ }
+ dp->d_ino = (ino64_t)bnp->bvn_attr.va_nodeid;
+ dp->d_off = 0;
+ dp->d_reclen = (ushort_t)dlen;
+ (void) strncpy(dp->d_name, ".", DIRENT64_NAMELEN(dlen));
+ dp = (struct dirent64 *)((uintptr_t)dp + dp->d_reclen);
+ brem -= dlen;
+ }
+
+ if (roff <= 1) {
+ dlen = DIRENT64_RECLEN(2);
+ if (first == B_TRUE) {
+ if (dlen > brem) {
+ kmem_free(buf, bsize);
+ return (EINVAL);
+ }
+ first = B_FALSE;
+ }
+ dp->d_ino = (ino64_t)bnp->bvn_parent->bvn_attr.va_nodeid;
+ dp->d_off = 1;
+ dp->d_reclen = (ushort_t)dlen;
+ (void) strncpy(dp->d_name, "..", DIRENT64_NAMELEN(dlen));
+ dp = (struct dirent64 *)((uintptr_t)dp + dp->d_reclen);
+ brem -= dlen;
+ }
+
+ coff = 3;
+ for (dnp = avl_first(&bnp->bvn_dir); dnp != NULL;
+ dnp = AVL_NEXT(&bnp->bvn_dir, dnp)) {
+ size_t nlen = strlen(dnp->bvn_name);
+
+ if (roff > coff) {
+ coff += nlen;
+ continue;
+ }
+
+ dlen = DIRENT64_RECLEN(nlen);
+ if (dlen > brem) {
+ if (first == B_TRUE) {
+ kmem_free(buf, bsize);
+ return (EINVAL);
+ }
+ break;
+ }
+ first = B_FALSE;
+
+ dp->d_ino = (ino64_t)dnp->bvn_attr.va_nodeid;
+ dp->d_off = coff;
+ dp->d_reclen = (ushort_t)dlen;
+ (void) strncpy(dp->d_name, dnp->bvn_name,
+ DIRENT64_NAMELEN(dlen));
+ dp = (struct dirent64 *)((uintptr_t)dp + dp->d_reclen);
+ brem -= dlen;
+ coff += nlen;
+ }
+
+ ret = uiomove(buf, (bsize - brem), UIO_READ, uiop);
+
+ if (ret == 0) {
+ if (dnp == NULL) {
+ coff++;
+ if (eofp != NULL)
+ *eofp = 1;
+ } else if (eofp != NULL) {
+ *eofp = 0;
+ }
+ uiop->uio_loffset = coff;
+ }
+ gethrestime(&bnp->bvn_attr.va_atime);
+ kmem_free(buf, bsize);
+ return (ret);
+}
+
+/*ARGSUSED*/
+static void
+bootfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+}
+
+/*ARGSUSED*/
+static int
+bootfs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct)
+{
+ if (write_lock != 0)
+ return (EINVAL);
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+bootfs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct)
+{
+}
+
+/*ARGSUSED*/
+static int
+bootfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
+ caller_context_t *ct)
+{
+ bootfs_node_t *bnp = (bootfs_node_t *)vp->v_data;
+ if (vp->v_type == VDIR)
+ return (0);
+ return ((*noffp < 0 || *noffp > bnp->bvn_size ? EINVAL : 0));
+}
+
+/*
+ * We need to fill in a single page of a vnode's memory based on the actual data
+ * from the kernel. We'll use this node's sliding window into physical memory
+ * and update one page at a time.
+ */
+/*ARGSUSED*/
+static int
+bootfs_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
+ page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw,
+ cred_t *cr)
+{
+ bootfs_node_t *bnp = vp->v_data;
+ page_t *pp, *fpp;
+ pfn_t pfn;
+
+ for (;;) {
+ /* Easy case where the page exists */
+ pp = page_lookup(vp, off, rw == S_CREATE ? SE_EXCL : SE_SHARED);
+ if (pp != NULL) {
+ if (pl != NULL) {
+ pl[0] = pp;
+ pl[1] = NULL;
+ } else {
+ page_unlock(pp);
+ }
+ return (0);
+ }
+
+ pp = page_create_va(vp, off, PAGESIZE, PG_EXCL | PG_WAIT, seg,
+ addr);
+
+ /*
+ * If we didn't get the page, that means someone else beat us to
+ * creating this so we need to try again.
+ */
+ if (pp != NULL)
+ break;
+ }
+
+ pfn = btop((bnp->bvn_addr + off) & PAGEMASK);
+ fpp = page_numtopp_nolock(pfn);
+
+ if (ppcopy(fpp, pp) == 0) {
+ pvn_read_done(pp, B_ERROR);
+ return (EIO);
+ }
+
+ if (pl != NULL) {
+ pvn_plist_init(pp, pl, plsz, off, PAGESIZE, rw);
+ } else {
+ pvn_io_done(pp);
+ }
+
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+bootfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
+ page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw,
+ cred_t *cr, caller_context_t *ct)
+{
+ int err;
+ bootfs_node_t *bnp = vp->v_data;
+
+ if (off + len > bnp->bvn_size + PAGEOFFSET)
+ return (EFAULT);
+
+ if (len <= PAGESIZE)
+ err = bootfs_getapage(vp, (u_offset_t)off, len, protp, pl,
+ plsz, seg, addr, rw, cr);
+ else
+ err = pvn_getpages(bootfs_getapage, vp, (u_offset_t)off, len,
+ protp, pl, plsz, seg, addr, rw, cr);
+
+ return (err);
+}
+
+/*ARGSUSED*/
+static int
+bootfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
+ size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ int ret;
+ segvn_crargs_t vn_a;
+
+#ifdef _ILP32
+ if (len > MAXOFF_T)
+ return (ENOMEM);
+#endif
+
+ if (vp->v_flag & VNOMAP)
+ return (ENOSYS);
+
+ if (off < 0 || off > MAXOFFSET_T - off)
+ return (ENXIO);
+
+ if (vp->v_type != VREG)
+ return (ENODEV);
+
+ if (prot & PROT_WRITE)
+ return (ENOTSUP);
+
+ as_rangelock(as);
+ ret = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
+ if (ret != 0) {
+ as_rangeunlock(as);
+ return (ret);
+ }
+
+ vn_a.vp = vp;
+ vn_a.offset = (u_offset_t)off;
+ vn_a.type = flags & MAP_TYPE;
+ vn_a.prot = prot;
+ vn_a.maxprot = maxprot;
+ vn_a.cred = cr;
+ vn_a.amp = NULL;
+ vn_a.flags = flags & ~MAP_TYPE;
+ vn_a.szc = 0;
+ vn_a.lgrp_mem_policy_flags = 0;
+
+ ret = as_map(as, *addrp, len, segvn_create, &vn_a);
+
+ as_rangeunlock(as);
+ return (ret);
+
+}
+
+/*ARGSUSED*/
+static int
+bootfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
+ size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+bootfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
+ size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ return (0);
+}
+
+static int
+bootfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
+ caller_context_t *ct)
+{
+ int ret;
+
+ switch (cmd) {
+ case _PC_TIMESTAMP_RESOLUTION:
+ *valp = 1L;
+ ret = 0;
+ break;
+ default:
+ ret = fs_pathconf(vp, cmd, valp, cr, ct);
+ }
+
+ return (ret);
+}
+
+const fs_operation_def_t bootfs_vnodeops_template[] = {
+ VOPNAME_OPEN, { .vop_open = bootfs_open },
+ VOPNAME_CLOSE, { .vop_close = bootfs_close },
+ VOPNAME_READ, { .vop_read = bootfs_read },
+ VOPNAME_IOCTL, { .vop_ioctl = bootfs_ioctl },
+ VOPNAME_GETATTR, { .vop_getattr = bootfs_getattr },
+ VOPNAME_ACCESS, { .vop_access = bootfs_access },
+ VOPNAME_LOOKUP, { .vop_lookup = bootfs_lookup },
+ VOPNAME_READDIR, { .vop_readdir = bootfs_readdir },
+ VOPNAME_INACTIVE, { .vop_inactive = bootfs_inactive },
+ VOPNAME_RWLOCK, { .vop_rwlock = bootfs_rwlock },
+ VOPNAME_RWUNLOCK, { .vop_rwunlock = bootfs_rwunlock },
+ VOPNAME_SEEK, { .vop_seek = bootfs_seek },
+ VOPNAME_GETPAGE, { .vop_getpage = bootfs_getpage },
+ VOPNAME_MAP, { .vop_map = bootfs_map },
+ VOPNAME_ADDMAP, { .vop_addmap = bootfs_addmap },
+ VOPNAME_DELMAP, { .vop_delmap = bootfs_delmap },
+ VOPNAME_PATHCONF, { .vop_pathconf = bootfs_pathconf },
+ VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_nosupport },
+ NULL, NULL
+};
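To make the offset scheme described in the bootfs_readdir() comment concrete, here is a short worked example; the directory contents are hypothetical, and the offsets shown are exactly what the loop above stores in d_off and, at the end of a pass, in uio_loffset.

/*
 * Worked example for the bootfs_readdir() offset scheme (entry names are
 * hypothetical). A directory containing "foo" and "barbaz" produces:
 *
 *	entry		d_off	running offset afterwards
 *	"."		0	-
 *	".."		1	-
 *	"foo"		3	3 + strlen("foo")    = 6
 *	"barbaz"	6	6 + strlen("barbaz") = 12
 *
 * After the last entry is emitted the running offset is incremented once
 * more, so a completed pass leaves uio_loffset at 13 and sets *eofp.
 */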
diff --git a/usr/src/uts/common/fs/dev/sdev_netops.c b/usr/src/uts/common/fs/dev/sdev_netops.c
index 4eaf38f484..41441ec52d 100644
--- a/usr/src/uts/common/fs/dev/sdev_netops.c
+++ b/usr/src/uts/common/fs/dev/sdev_netops.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
*/
/*
@@ -41,8 +42,102 @@
#include <sys/zone.h>
#include <sys/dls.h>
+static const char *devnet_zpath = "/dev/net/zone/";
struct vnodeops *devnet_vnodeops;
+static zoneid_t
+devnet_nodetozone(sdev_node_t *dv)
+{
+ char *zname = NULL, *dup;
+ zone_t *zone;
+ int duplen;
+ zoneid_t zid;
+
+ /*
+ * If in a non-global zone, always return its zid no matter what the
+ * node is.
+ */
+ zid = getzoneid();
+ if (zid != GLOBAL_ZONEID)
+ return (zid);
+
+ /*
+ * If it doesn't have /dev/net/zone/ then it can't be a specific zone
+ * we're targeting.
+ */
+ if (strncmp(devnet_zpath, dv->sdev_path, strlen(devnet_zpath)) != 0)
+ return (GLOBAL_ZONEID);
+
+ if (dv->sdev_vnode->v_type == VDIR) {
+ zone = zone_find_by_name(dv->sdev_name);
+ } else {
+ /* Non directories have the form /dev/net/zone/%z/%s */
+ dup = strdup(dv->sdev_path);
+ duplen = strlen(dup);
+ zname = strrchr(dup, '/');
+ *zname = '\0';
+ zname--;
+ zname = strrchr(dup, '/');
+ zname++;
+ zone = zone_find_by_name(zname);
+ kmem_free(dup, duplen + 1);
+ }
+ if (zone == NULL)
+ return (GLOBAL_ZONEID);
+ zid = zone->zone_id;
+ zone_rele(zone);
+ return (zid);
+}
+
+static int
+devnet_mkdir(struct sdev_node *ddv, char *name)
+{
+ sdev_node_t *dv;
+ struct vattr va;
+ int ret;
+
+ ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
+ dv = sdev_cache_lookup(ddv, name);
+ if (dv != NULL) {
+ SDEV_SIMPLE_RELE(dv);
+ return (EEXIST);
+ }
+
+ va = *sdev_getdefault_attr(VDIR);
+ gethrestime(&va.va_atime);
+ va.va_mtime = va.va_atime;
+ va.va_ctime = va.va_atime;
+
+ ret = sdev_mknode(ddv, name, &dv, &va, NULL, NULL, kcred, SDEV_READY);
+ if (ret != 0)
+ return (ret);
+ SDEV_SIMPLE_RELE(dv);
+ return (0);
+}
+
+/*
+ * We need to walk down the directory path to determine what we should do. At
+ * the top level of /dev/net, only the directory /dev/net/zone is valid, and
+ * it is always valid. Below that, /dev/net/zone/%zonename is valid if and
+ * only if we can look up that zone name; otherwise the node is
+ * SDEV_VTOR_INVALID.
+ */
+static int
+devnet_dirvalidate(struct sdev_node *dv)
+{
+ zone_t *zonep;
+ char *path = "/dev/net/zone";
+
+ if (strcmp(path, dv->sdev_path) == 0)
+ return (SDEV_VTOR_VALID);
+
+ zonep = zone_find_by_name(dv->sdev_name);
+ if (zonep == NULL)
+ return (SDEV_VTOR_INVALID);
+ zone_rele(zonep);
+ return (SDEV_VTOR_VALID);
+}
+
/*
* Check if a net sdev_node is still valid - i.e. it represents a current
* network link.
@@ -60,11 +155,20 @@ devnet_validate(struct sdev_node *dv)
ASSERT(dv->sdev_state == SDEV_READY);
- if (dls_mgmt_get_linkid(dv->sdev_name, &linkid) != 0)
+ if (dv->sdev_vnode->v_type == VDIR)
+ return (devnet_dirvalidate(dv));
+
+ if (strncmp(devnet_zpath, dv->sdev_path, strlen(devnet_zpath)) == 0) {
+ ASSERT(SDEV_IS_GLOBAL(dv));
+ zoneid = devnet_nodetozone(dv);
+ } else {
+ zoneid = getzoneid();
+ }
+
+ if (dls_mgmt_get_linkid_in_zone(dv->sdev_name, &linkid, zoneid) != 0)
return (SDEV_VTOR_INVALID);
- if (SDEV_IS_GLOBAL(dv))
+ if (zoneid == GLOBAL_ZONEID)
return (SDEV_VTOR_VALID);
- zoneid = getzoneid();
return (zone_check_datalink(&zoneid, linkid) == 0 ?
SDEV_VTOR_VALID : SDEV_VTOR_INVALID);
}
@@ -74,13 +178,14 @@ devnet_validate(struct sdev_node *dv)
* a net entry when the node is not found in the cache.
*/
static int
-devnet_create_rvp(const char *nm, struct vattr *vap, dls_dl_handle_t *ddhp)
+devnet_create_rvp(const char *nm, struct vattr *vap, dls_dl_handle_t *ddhp,
+ zoneid_t zid)
{
timestruc_t now;
dev_t dev;
int error;
- if ((error = dls_devnet_open(nm, ddhp, &dev)) != 0) {
+ if ((error = dls_devnet_open_in_zone(nm, ddhp, &dev, zid)) != 0) {
sdcmn_err12(("devnet_create_rvp: not a valid vanity name "
"network node: %s\n", nm));
return (error);
@@ -116,6 +221,7 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
struct sdev_node *ddv = VTOSDEV(dvp);
struct sdev_node *dv = NULL;
dls_dl_handle_t ddh = NULL;
+ zone_t *zone;
struct vattr vattr;
int nmlen;
int error = ENOENT;
@@ -123,6 +229,9 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
if (SDEVTOV(ddv)->v_type != VDIR)
return (ENOTDIR);
+ if (!SDEV_IS_GLOBAL(ddv) && crgetzoneid(cred) == GLOBAL_ZONEID)
+ return (EPERM);
+
/*
* Empty name or ., return node itself.
*/
@@ -145,6 +254,12 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
rw_enter(&ddv->sdev_contents, RW_WRITER);
/*
+ * ZOMBIED parent does not allow new node creation, bail out early.
+ */
+ if (ddv->sdev_state == SDEV_ZOMBIE)
+ goto failed;
+
+ /*
* directory cache lookup:
*/
if ((dv = sdev_cache_lookup(ddv, nm)) != NULL) {
@@ -153,13 +268,42 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
goto found;
}
+ if (SDEV_IS_GLOBAL(ddv)) {
+ /*
+ * Check for /dev/net/zone
+ */
+ if (strcmp("zone", nm) == 0 && strcmp("/dev/net",
+ ddv->sdev_path) == 0) {
+ (void) devnet_mkdir(ddv, nm);
+ dv = sdev_cache_lookup(ddv, nm);
+ ASSERT(dv != NULL);
+ goto found;
+ }
+
+ /*
+ * Check for /dev/net/zone/%z. We can't use devnet_zpath due to
+ * its trailing slash.
+ */
+ if (strcmp("/dev/net/zone", ddv->sdev_path) == 0) {
+ zone = zone_find_by_name(nm);
+ if (zone == NULL)
+ goto failed;
+ (void) devnet_mkdir(ddv, nm);
+ zone_rele(zone);
+ dv = sdev_cache_lookup(ddv, nm);
+ ASSERT(dv != NULL);
+ goto found;
+ }
+ } else if (strcmp("/dev/net", ddv->sdev_path) != 0) {
+ goto failed;
+ }
+
/*
- * ZOMBIED parent does not allow new node creation, bail out early.
+ * We didn't find what we were looking for. What that is depends a lot
+ * on what directory we're in.
*/
- if (ddv->sdev_state == SDEV_ZOMBIE)
- goto failed;
- error = devnet_create_rvp(nm, &vattr, &ddh);
+ error = devnet_create_rvp(nm, &vattr, &ddh, devnet_nodetozone(ddv));
if (error != 0)
goto failed;
@@ -219,7 +363,7 @@ devnet_filldir_datalink(datalink_id_t linkid, void *arg)
if ((dv = sdev_cache_lookup(ddv, (char *)link)) != NULL)
goto found;
- if (devnet_create_rvp(link, &vattr, &ddh) != 0)
+ if (devnet_create_rvp(link, &vattr, &ddh, devnet_nodetozone(arg)) != 0)
return (0);
ASSERT(ddh != NULL);
@@ -244,16 +388,77 @@ found:
return (0);
}
+/*
+ * Fill in all the entries for the current zone.
+ */
static void
-devnet_filldir(struct sdev_node *ddv)
+devnet_fillzone(struct sdev_node *ddv, zoneid_t zid)
{
- sdev_node_t *dv, *next;
datalink_id_t linkid;
+ ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
+ if (zid == GLOBAL_ZONEID) {
+ ASSERT(SDEV_IS_GLOBAL(ddv));
+ linkid = DATALINK_INVALID_LINKID;
+ do {
+ linkid = dls_mgmt_get_next(linkid, DATALINK_CLASS_ALL,
+ DATALINK_ANY_MEDIATYPE, DLMGMT_ACTIVE);
+ if (linkid != DATALINK_INVALID_LINKID)
+ (void) devnet_filldir_datalink(linkid, ddv);
+ } while (linkid != DATALINK_INVALID_LINKID);
+ } else {
+ (void) zone_datalink_walk(zid, devnet_filldir_datalink, ddv);
+ }
+}
+
+/*
+ * Callback for zone_walk when filling up /dev/net/zone/...
+ */
+static int
+devnet_fillzdir_cb(zone_t *zonep, void *arg)
+{
+ sdev_node_t *ddv = arg;
+
+ ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
+ (void) devnet_mkdir(ddv, zonep->zone_name);
+ return (0);
+}
+
+/*
+ * Fill in a directory that isn't the top level /dev/net.
+ */
+static void
+devnet_fillzdir(struct sdev_node *ddv)
+{
+ zone_t *zonep;
+ char *path = "/dev/net/zone";
+
+ if (strcmp(path, ddv->sdev_path) == 0) {
+ (void) zone_walk(devnet_fillzdir_cb, ddv);
+ return;
+ }
+
+ zonep = zone_find_by_name(ddv->sdev_name);
+ if (zonep == NULL)
+ return;
+ devnet_fillzone(ddv, zonep->zone_id);
+ zone_rele(zonep);
+}
+
+static void
+devnet_filldir(struct sdev_node *ddv)
+{
+ int ret;
+ sdev_node_t *dv, *next;
+
ASSERT(RW_READ_HELD(&ddv->sdev_contents));
if (rw_tryupgrade(&ddv->sdev_contents) == NULL) {
rw_exit(&ddv->sdev_contents);
rw_enter(&ddv->sdev_contents, RW_WRITER);
+ if (ddv->sdev_state == SDEV_ZOMBIE) {
+ rw_exit(&ddv->sdev_contents);
+ return;
+ }
}
for (dv = SDEV_FIRST_ENTRY(ddv); dv; dv = next) {
@@ -276,31 +481,38 @@ devnet_filldir(struct sdev_node *ddv)
if (SDEVTOV(dv)->v_count > 0)
continue;
+
SDEV_HOLD(dv);
+
+ /*
+ * Clean out everything underneath before we remove ourselves.
+ */
+ if (SDEVTOV(ddv)->v_type == VDIR) {
+ ret = sdev_cleandir(dv, NULL, 0);
+ ASSERT(ret == 0);
+ }
/* remove the cache node */
(void) sdev_cache_update(ddv, &dv, dv->sdev_name,
SDEV_CACHE_DELETE);
SDEV_RELE(dv);
}
+ if (strcmp(ddv->sdev_path, "/dev/net") != 0) {
+ devnet_fillzdir(ddv);
+ goto done;
+ }
+
if (((ddv->sdev_flags & SDEV_BUILD) == 0) && !dls_devnet_rebuild())
goto done;
if (SDEV_IS_GLOBAL(ddv)) {
- linkid = DATALINK_INVALID_LINKID;
- do {
- linkid = dls_mgmt_get_next(linkid, DATALINK_CLASS_ALL,
- DATALINK_ANY_MEDIATYPE, DLMGMT_ACTIVE);
- if (linkid != DATALINK_INVALID_LINKID)
- (void) devnet_filldir_datalink(linkid, ddv);
- } while (linkid != DATALINK_INVALID_LINKID);
+ devnet_fillzone(ddv, GLOBAL_ZONEID);
+ (void) devnet_mkdir(ddv, "zone");
} else {
- (void) zone_datalink_walk(getzoneid(),
- devnet_filldir_datalink, ddv);
+ devnet_fillzone(ddv, getzoneid());
}
ddv->sdev_flags &= ~SDEV_BUILD;
-
done:
rw_downgrade(&ddv->sdev_contents);
}
@@ -319,6 +531,9 @@ devnet_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred,
ASSERT(sdvp);
+ if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp))
+ return (EPERM);
+
if (uiop->uio_offset == 0)
devnet_filldir(sdvp);
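For reference, a worked trace of the string handling that devnet_nodetozone() above relies on for non-directory nodes: paths under /dev/net/zone have the form /dev/net/zone/%zonename/%linkname, and the zone name is recovered by truncating at the last '/' and then taking the final component of what remains. The zone and link names below are hypothetical.

/*
 * Sketch of the zone-name extraction in devnet_nodetozone() for a
 * non-directory node:
 *
 *	sdev_path:  "/dev/net/zone/webzone/net0"
 *	dup:        "/dev/net/zone/webzone/net0"	(strdup)
 *	strrchr(dup, '/') -> "/net0"; *zname = '\0'
 *	dup:        "/dev/net/zone/webzone"
 *	strrchr(dup, '/') -> "/webzone"; zname + 1 -> "webzone"
 *	zone_find_by_name("webzone") -> zone_id, or GLOBAL_ZONEID if the
 *	zone no longer exists.
 */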
diff --git a/usr/src/uts/common/fs/dev/sdev_plugin.c b/usr/src/uts/common/fs/dev/sdev_plugin.c
new file mode 100644
index 0000000000..885191175f
--- /dev/null
+++ b/usr/src/uts/common/fs/dev/sdev_plugin.c
@@ -0,0 +1,913 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Dynamic directory plugin interface for sdev.
+ *
+ * The sdev plugin interface provides a means for a dynamic directory based on
+ * in-kernel state to be simply created. Traditionally, dynamic directories were
+ * built into sdev itself. While these legacy plugins are useful, it makes more
+ * sense for these pieces of functionality to live with the individual drivers.
+ *
+ * The plugin interface requires folks to implement three interfaces and
+ * provides a series of callbacks that can be made in the context of those
+ * interfaces to interrogate the sdev_node_t without having to leak
+ * implementation details of the sdev_node_t. These interfaces are:
+ *
+ * o spo_validate
+ *
+ * Given a particular node, answer the question as to whether or not this
+ * entry is still valid. Here, plugins should use the name and the dev_t
+ * associated with the node to verify that it matches something that still
+ * exists.
+ *
+ * o spo_filldir
+ *
+ * Fill all the entries inside of a directory. Note that some of these entries
+ * may already exist.
+ *
+ * o spo_inactive
+ *
+ * The given node is no longer being used. This allows the consumer to
+ * potentially tear down anything that was being held open related to this.
+ * Note that this only fires when the given sdev_node_t becomes a zombie.
+ *
+ * During these callbacks a consumer is not allowed to register or unregister a
+ * plugin, especially their own. They may call the sdev_ctx style functions. All
+ * callbacks fire in a context where blocking is allowed (eg. the spl is below
+ * LOCK_LEVEL).
+ *
+ * When a plugin is added, we create its directory in the global zone. By doing
+ * that, we ensure that something isn't already there and that nothing else can
+ * come along and try and create something without our knowledge. We only have
+ * to create it in the GZ and not in every other instance of sdev because,
+ * first, an instance of sdev that isn't at /dev does not have dynamic
+ * directories, and second, any instance of sdev present in a non-global zone
+ * cannot create anything. Therefore, if the name is free in the global zone's
+ * instance of sdev, we're good to go.
+ *
+ * Lock Ordering
+ * -------------
+ *
+ * The global sdev_plugin_lock must be held before any of the individual
+ * sdev_plugin_t`sp_lock. Further, once any plugin related lock has been held,
+ * it is not legal to take any holds on any sdev_node_t or to grab the
+ * sdev_node_t`contents_lock in any way.
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/fs/sdev_impl.h>
+#include <sys/fs/sdev_plugin.h>
+#include <fs/fs_subr.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/ksynch.h>
+#include <sys/sysmacros.h>
+#include <sys/list.h>
+#include <sys/ctype.h>
+
+kmutex_t sdev_plugin_lock;
+list_t sdev_plugin_list;
+kmem_cache_t *sdev_plugin_cache;
+struct vnodeops *sdev_plugin_vnops;
+
+#define SDEV_PLUGIN_NAMELEN 64
+
+typedef struct sdev_plugin {
+ list_node_t sp_link;
+ char sp_name[SDEV_PLUGIN_NAMELEN]; /* E */
+ int sp_nflags; /* E */
+ struct vnodeops *sp_vnops; /* E */
+ sdev_plugin_ops_t *sp_pops; /* E */
+ boolean_t sp_islegacy; /* E */
+ int (*sp_lvtor)(sdev_node_t *); /* E */
+ kmutex_t sp_lock; /* Protects everything below */
+ kcondvar_t sp_nodecv;
+ size_t sp_nnodes;
+} sdev_plugin_t;
+
+/* ARGSUSED */
+static int
+sdev_plugin_cache_constructor(void *buf, void *arg, int tags)
+{
+ sdev_plugin_t *spp = buf;
+ mutex_init(&spp->sp_lock, NULL, MUTEX_DRIVER, 0);
+ cv_init(&spp->sp_nodecv, NULL, CV_DRIVER, NULL);
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+sdev_plugin_cache_destructor(void *buf, void *arg)
+{
+ sdev_plugin_t *spp = buf;
+ cv_destroy(&spp->sp_nodecv);
+ mutex_destroy(&spp->sp_lock);
+}
+
+enum vtype
+sdev_ctx_vtype(sdev_ctx_t ctx)
+{
+ sdev_node_t *sdp = (sdev_node_t *)ctx;
+
+ ASSERT(RW_LOCK_HELD(&sdp->sdev_contents));
+ return (sdp->sdev_vnode->v_type);
+}
+
+const char *
+sdev_ctx_path(sdev_ctx_t ctx)
+{
+ sdev_node_t *sdp = (sdev_node_t *)ctx;
+
+ ASSERT(RW_LOCK_HELD(&sdp->sdev_contents));
+ return (sdp->sdev_path);
+}
+
+const char *
+sdev_ctx_name(sdev_ctx_t ctx)
+{
+ sdev_node_t *sdp = (sdev_node_t *)ctx;
+
+ ASSERT(RW_LOCK_HELD(&sdp->sdev_contents));
+ return (sdp->sdev_name);
+}
+
+/*
+ * Currently we only support passing through a single flag -- SDEV_IS_GLOBAL.
+ */
+sdev_ctx_flags_t
+sdev_ctx_flags(sdev_ctx_t ctx)
+{
+ sdev_node_t *sdp = (sdev_node_t *)ctx;
+
+ ASSERT(RW_LOCK_HELD(&sdp->sdev_contents));
+ return (sdp->sdev_flags & SDEV_GLOBAL);
+}
+
+/*
+ * Return some amount of private data specific to the vtype. In the case of a
+ * character or block device this is the device number.
+ */
+const void *
+sdev_ctx_vtype_data(sdev_ctx_t ctx)
+{
+ sdev_node_t *sdp = (sdev_node_t *)ctx;
+ void *ret;
+
+ ASSERT(RW_LOCK_HELD(&sdp->sdev_contents));
+ switch (sdp->sdev_vnode->v_type) {
+ case VCHR:
+ case VBLK:
+ ret = (void *)(uintptr_t)(sdp->sdev_vnode->v_rdev);
+ break;
+ default:
+ ret = NULL;
+ break;
+ }
+
+ return (ret);
+}
+
+/*
+ * Use the same rules as zones for a name. isalphanum + '-', '_', and '.'.
+ */
+static int
+sdev_plugin_name_isvalid(const char *c, int buflen)
+{
+ int i;
+
+ for (i = 0; i < buflen; i++, c++) {
+ if (*c == '\0')
+ return (1);
+
+ if (!isalnum(*c) && *c != '-' && *c != '_' && *c != '.')
+ return (0);
+ }
+ /* Never found a null terminator */
+ return (0);
+}
+
+static int
+sdev_plugin_mknode(sdev_plugin_t *spp, sdev_node_t *sdvp, char *name,
+ vattr_t *vap)
+{
+ int ret;
+ sdev_node_t *svp;
+
+ ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents));
+ ASSERT(spp != NULL);
+ svp = sdev_cache_lookup(sdvp, name);
+ if (svp != NULL) {
+ SDEV_SIMPLE_RELE(svp);
+ return (EEXIST);
+ }
+
+ ret = sdev_mknode(sdvp, name, &svp, vap, NULL, NULL, kcred,
+ SDEV_READY);
+ if (ret != 0)
+ return (ret);
+ SDEV_SIMPLE_RELE(svp);
+
+ return (0);
+}
+
+/*
+ * Plugin node creation callbacks
+ */
+int
+sdev_plugin_mkdir(sdev_ctx_t ctx, char *name)
+{
+ sdev_node_t *sdvp;
+ timestruc_t now;
+ struct vattr vap;
+
+ if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0)
+ return (EINVAL);
+
+ sdvp = (sdev_node_t *)ctx;
+ ASSERT(sdvp->sdev_private != NULL);
+ ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents));
+
+ vap = *sdev_getdefault_attr(VDIR);
+ gethrestime(&now);
+ vap.va_atime = now;
+ vap.va_mtime = now;
+ vap.va_ctime = now;
+
+ return (sdev_plugin_mknode(sdvp->sdev_private, sdvp, name, &vap));
+}
+
+int
+sdev_plugin_mknod(sdev_ctx_t ctx, char *name, mode_t mode, dev_t dev)
+{
+ sdev_node_t *sdvp;
+ timestruc_t now;
+ struct vattr vap;
+
+ if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0)
+ return (EINVAL);
+
+ sdvp = (sdev_node_t *)ctx;
+ ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents));
+ if (mode != S_IFCHR && mode != S_IFBLK)
+ return (EINVAL);
+
+ ASSERT(sdvp->sdev_private != NULL);
+
+ vap = *sdev_getdefault_attr(mode == S_IFCHR ? VCHR : VBLK);
+ gethrestime(&now);
+ vap.va_atime = now;
+ vap.va_mtime = now;
+ vap.va_ctime = now;
+ vap.va_rdev = dev;
+ vap.va_mode = mode | 0666;
+
+ /* Despite the similar name, this is in fact a different function */
+ return (sdev_plugin_mknode(sdvp->sdev_private, sdvp, name, &vap));
+
+}
+
+static int
+sdev_plugin_validate(sdev_node_t *sdp)
+{
+ int ret;
+ sdev_plugin_t *spp;
+
+ ASSERT(sdp->sdev_private != NULL);
+ spp = sdp->sdev_private;
+ ASSERT(spp->sp_islegacy == B_FALSE);
+ ASSERT(spp->sp_pops != NULL);
+ rw_enter(&sdp->sdev_contents, RW_READER);
+ ret = spp->sp_pops->spo_validate((uintptr_t)sdp);
+ rw_exit(&sdp->sdev_contents);
+ return (ret);
+}
+
+static void
+sdev_plugin_validate_dir(sdev_node_t *sdvp)
+{
+ int ret;
+ sdev_node_t *svp, *next;
+
+ ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents));
+
+ for (svp = SDEV_FIRST_ENTRY(sdvp); svp != NULL; svp = next) {
+
+ next = SDEV_NEXT_ENTRY(sdvp, svp);
+ ASSERT(svp->sdev_state != SDEV_ZOMBIE);
+ /* skip nodes that aren't ready */
+ if (svp->sdev_state == SDEV_INIT)
+ continue;
+
+ switch (sdev_plugin_validate(svp)) {
+ case SDEV_VTOR_VALID:
+ case SDEV_VTOR_SKIP:
+ continue;
+ case SDEV_VTOR_INVALID:
+ case SDEV_VTOR_STALE:
+ break;
+ }
+
+ SDEV_HOLD(svp);
+
+ /*
+ * Clean out everything underneath this node before we
+ * remove it.
+ */
+ if (svp->sdev_vnode->v_type == VDIR) {
+ ret = sdev_cleandir(svp, NULL, 0);
+ ASSERT(ret == 0);
+ }
+ /* remove the cache node */
+ (void) sdev_cache_update(sdvp, &svp, svp->sdev_name,
+ SDEV_CACHE_DELETE);
+ SDEV_RELE(svp);
+ }
+}
+
+/* ARGSUSED */
+static int
+sdev_plugin_vop_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred,
+ int *eofp, caller_context_t *ct_unused, int flags_unused)
+{
+ int ret;
+ sdev_node_t *sdvp = VTOSDEV(dvp);
+ sdev_plugin_t *spp;
+
+ ASSERT(RW_READ_HELD(&sdvp->sdev_contents));
+
+ /* Sanity check we're not a zombie before we do anything else */
+ if (sdvp->sdev_state == SDEV_ZOMBIE)
+ return (ENOENT);
+
+ spp = sdvp->sdev_private;
+ ASSERT(spp != NULL);
+ ASSERT(spp->sp_islegacy == B_FALSE);
+ ASSERT(spp->sp_pops != NULL);
+
+ if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp))
+ return (EPERM);
+
+ if (uiop->uio_offset == 0) {
+ /*
+ * We upgrade to a write lock and grab the plugin's lock along
+ * the way. We're almost certainly going to get creation
+ * callbacks, so this is the only safe way to go.
+ */
+ if (rw_tryupgrade(&sdvp->sdev_contents) == 0) {
+ rw_exit(&sdvp->sdev_contents);
+ rw_enter(&sdvp->sdev_contents, RW_WRITER);
+ if (sdvp->sdev_state == SDEV_ZOMBIE) {
+ rw_downgrade(&sdvp->sdev_contents);
+ return (ENOENT);
+ }
+ }
+
+ sdev_plugin_validate_dir(sdvp);
+ ret = spp->sp_pops->spo_filldir((uintptr_t)sdvp);
+ rw_downgrade(&sdvp->sdev_contents);
+ if (ret != 0)
+ return (ret);
+ }
+
+ return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
+}
+
+/*
+ * If we don't have a callback function that returns a failure, then sdev will
+ * try to create a node for us which violates all of our basic assertions. To
+ * work around that we create our own callback for devname_lookup_func which
+ * always returns ENOENT as at this point either it was created with the filldir
+ * callback or it was not.
+ */
+/*ARGSUSED*/
+static int
+sdev_plugin_vop_lookup_cb(sdev_node_t *ddv, char *nm, void **arg, cred_t *cred,
+ void *unused, char *unused2)
+{
+ return (ENOENT);
+}
+
+/* ARGSUSED */
+static int
+sdev_plugin_vop_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
+ struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
+ caller_context_t *ct, int *direntflags, pathname_t *realpnp)
+{
+ int ret;
+ sdev_node_t *sdvp;
+ sdev_plugin_t *spp;
+
+ /* execute access is required to search the directory */
+ if ((ret = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
+ return (ret);
+
+ sdvp = VTOSDEV(dvp);
+ spp = sdvp->sdev_private;
+ ASSERT(spp != NULL);
+ ASSERT(spp->sp_islegacy == B_FALSE);
+ ASSERT(spp->sp_pops != NULL);
+
+ if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp))
+ return (EPERM);
+
+ /*
+ * Go straight for the write lock.
+ */
+ rw_enter(&sdvp->sdev_contents, RW_WRITER);
+ if (sdvp->sdev_state == SDEV_ZOMBIE) {
+ rw_exit(&sdvp->sdev_contents);
+ return (ENOENT);
+ }
+ sdev_plugin_validate_dir(sdvp);
+ ret = spp->sp_pops->spo_filldir((uintptr_t)sdvp);
+ rw_exit(&sdvp->sdev_contents);
+ if (ret != 0)
+ return (ret);
+
+ return (devname_lookup_func(sdvp, nm, vpp, cred,
+ sdev_plugin_vop_lookup_cb, SDEV_VATTR));
+}
+
+/*
+ * sdev is not a good citizen. We get inactive callbacks whenever a vnode's
+ * count goes to zero, but it isn't necessarily a zombie yet. To make it easier
+ * for users, we only fire the inactive callback when the node becomes a zombie
+ * and thus will be torn down here.
+ */
+static void
+sdev_plugin_vop_inactive_cb(struct vnode *dvp)
+{
+ sdev_node_t *sdp = VTOSDEV(dvp);
+ sdev_plugin_t *spp = sdp->sdev_private;
+
+ rw_enter(&sdp->sdev_contents, RW_READER);
+ if (sdp->sdev_state != SDEV_ZOMBIE) {
+ rw_exit(&sdp->sdev_contents);
+ return;
+ }
+ spp->sp_pops->spo_inactive((uintptr_t)sdp);
+ mutex_enter(&spp->sp_lock);
+ VERIFY(spp->sp_nnodes > 0);
+ spp->sp_nnodes--;
+ cv_signal(&spp->sp_nodecv);
+ mutex_exit(&spp->sp_lock);
+ rw_exit(&sdp->sdev_contents);
+}
+
+/*ARGSUSED*/
+static void
+sdev_plugin_vop_inactive(struct vnode *dvp, struct cred *cred,
+ caller_context_t *ct)
+{
+ sdev_node_t *sdp = VTOSDEV(dvp);
+ sdev_plugin_t *spp = sdp->sdev_private;
+ ASSERT(sdp->sdev_private != NULL);
+ ASSERT(spp->sp_islegacy == B_FALSE);
+ devname_inactive_func(dvp, cred, sdev_plugin_vop_inactive_cb);
+}
+
+const fs_operation_def_t sdev_plugin_vnodeops_tbl[] = {
+ VOPNAME_READDIR, { .vop_readdir = sdev_plugin_vop_readdir },
+ VOPNAME_LOOKUP, { .vop_lookup = sdev_plugin_vop_lookup },
+ VOPNAME_INACTIVE, { .vop_inactive = sdev_plugin_vop_inactive },
+ VOPNAME_CREATE, { .error = fs_nosys },
+ VOPNAME_REMOVE, { .error = fs_nosys },
+ VOPNAME_MKDIR, { .error = fs_nosys },
+ VOPNAME_RMDIR, { .error = fs_nosys },
+ VOPNAME_SYMLINK, { .error = fs_nosys },
+ VOPNAME_SETSECATTR, { .error = fs_nosys },
+ NULL, NULL
+};
+
+/*
+ * construct a new template with overrides from vtab
+ */
+static fs_operation_def_t *
+sdev_merge_vtab(const fs_operation_def_t tab[])
+{
+ fs_operation_def_t *new;
+ const fs_operation_def_t *tab_entry;
+
+ /* make a copy of standard vnode ops table */
+ new = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP);
+ bcopy((void *)sdev_vnodeops_tbl, new, sdev_vnodeops_tbl_size);
+
+ /* replace the overrides from tab */
+ for (tab_entry = tab; tab_entry->name != NULL; tab_entry++) {
+ fs_operation_def_t *std_entry = new;
+ while (std_entry->name) {
+ if (strcmp(tab_entry->name, std_entry->name) == 0) {
+ std_entry->func = tab_entry->func;
+ break;
+ }
+ std_entry++;
+ }
+ }
+
+ return (new);
+}
+
+/* free memory allocated by sdev_merge_vtab */
+static void
+sdev_free_vtab(fs_operation_def_t *new)
+{
+ kmem_free(new, sdev_vnodeops_tbl_size);
+}
+
+/*
+ * Register a new plugin.
+ */
+sdev_plugin_hdl_t
+sdev_plugin_register(const char *name, sdev_plugin_ops_t *ops, int *errp)
+{
+ int ret, err;
+ sdev_plugin_t *spp, *iter;
+ vnode_t *vp, *nvp;
+ sdev_node_t *sdp, *slp;
+ timestruc_t now;
+ struct vattr vap;
+
+ /*
+ * Some consumers don't care about why they failed. To keep the code
+ * simple, we'll just pretend they gave us something.
+ */
+ if (errp == NULL)
+ errp = &err;
+
+ if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0) {
+ *errp = EINVAL;
+ return (NULL);
+ }
+
+ if (ops->spo_version != 1) {
+ *errp = EINVAL;
+ return (NULL);
+ }
+
+ if (ops->spo_validate == NULL || ops->spo_filldir == NULL ||
+ ops->spo_inactive == NULL) {
+ *errp = EINVAL;
+ return (NULL);
+ }
+
+ if ((ops->spo_flags & ~SDEV_PLUGIN_FLAGS_MASK) != 0) {
+ *errp = EINVAL;
+ return (NULL);
+ }
+
+ spp = kmem_cache_alloc(sdev_plugin_cache, KM_SLEEP);
+ (void) strlcpy(spp->sp_name, name, SDEV_PLUGIN_NAMELEN);
+
+ spp->sp_pops = ops;
+ spp->sp_nflags = SDEV_DYNAMIC | SDEV_VTOR;
+ if (ops->spo_flags & SDEV_PLUGIN_NO_NCACHE)
+ spp->sp_nflags |= SDEV_NO_NCACHE;
+ if (ops->spo_flags & SDEV_PLUGIN_SUBDIR)
+ spp->sp_nflags |= SDEV_SUBDIR;
+ spp->sp_vnops = sdev_plugin_vnops;
+ spp->sp_islegacy = B_FALSE;
+ spp->sp_lvtor = NULL;
+ spp->sp_nnodes = 0;
+
+ /*
+ * Make sure the name is unique (nothing already exists with this name) and
+ * add it to the list. We also need to go through and grab the sdev
+ * root node as we cannot grab any sdev node locks once we've grabbed
+ * the sdev_plugin_lock. We effectively assert that if a directory is
+ * not present in the GZ's /dev, then it doesn't exist in any of the
+ * local zones.
+ */
+ ret = vn_openat("/dev", UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir, -1);
+ if (ret != 0) {
+ *errp = ret;
+ kmem_cache_free(sdev_plugin_cache, spp);
+ return (NULL);
+ }
+ /* Make sure we have the real vnode */
+ if (VOP_REALVP(vp, &nvp, NULL) == 0) {
+ VN_HOLD(nvp);
+ VN_RELE(vp);
+ vp = nvp;
+ nvp = NULL;
+ }
+ VERIFY(vp->v_op == sdev_vnodeops);
+ sdp = VTOSDEV(vp);
+ rw_enter(&sdp->sdev_contents, RW_WRITER);
+ slp = sdev_cache_lookup(sdp, spp->sp_name);
+ if (slp != NULL) {
+ SDEV_RELE(slp);
+ rw_exit(&sdp->sdev_contents);
+ VN_RELE(vp);
+ *errp = EEXIST;
+ kmem_cache_free(sdev_plugin_cache, spp);
+ return (NULL);
+ }
+
+ mutex_enter(&sdev_plugin_lock);
+ for (iter = list_head(&sdev_plugin_list); iter != NULL;
+ iter = list_next(&sdev_plugin_list, iter)) {
+ if (strcmp(spp->sp_name, iter->sp_name) == 0) {
+ mutex_exit(&sdev_plugin_lock);
+ rw_exit(&sdp->sdev_contents);
+ VN_RELE(vp);
+ *errp = EEXIST;
+ kmem_cache_free(sdev_plugin_cache, spp);
+ return (NULL);
+ }
+ }
+
+ list_insert_tail(&sdev_plugin_list, spp);
+ mutex_exit(&sdev_plugin_lock);
+
+ /*
+ * Now go ahead and create the top level directory for the global zone.
+ */
+ vap = *sdev_getdefault_attr(VDIR);
+ gethrestime(&now);
+ vap.va_atime = now;
+ vap.va_mtime = now;
+ vap.va_ctime = now;
+
+ (void) sdev_plugin_mknode(spp, sdp, spp->sp_name, &vap);
+
+ rw_exit(&sdp->sdev_contents);
+ VN_RELE(vp);
+
+ return ((sdev_plugin_hdl_t)spp);
+}
+
+static void
+sdev_plugin_unregister_cb(sdev_node_t *rdp, void *arg)
+{
+ sdev_plugin_t *spp = arg;
+ sdev_node_t *sdp;
+
+ rw_enter(&rdp->sdev_contents, RW_WRITER);
+ sdp = sdev_cache_lookup(rdp, spp->sp_name);
+ /* If it doesn't exist, we're done here */
+ if (sdp == NULL) {
+ rw_exit(&rdp->sdev_contents);
+ return;
+ }
+
+ /*
+ * We first delete the directory before recursively marking everything
+ * else stale. This ordering should ensure that we don't accidentally
+ * miss anything.
+ */
+ sdev_cache_update(rdp, &sdp, spp->sp_name, SDEV_CACHE_DELETE);
+ sdev_stale(sdp);
+ SDEV_RELE(sdp);
+ rw_exit(&rdp->sdev_contents);
+}
+
+/*
+ * Remove a plugin. This will block until every node has become a zombie, thus
+ * guaranteeing the caller that the plugin will not be called into again once
+ * this call returns; the plugin may still be called into while the unregister
+ * is in progress. Note that while this is ongoing, it will block other mounts.
+ */
+int
+sdev_plugin_unregister(sdev_plugin_hdl_t hdl)
+{
+ sdev_plugin_t *spp = (sdev_plugin_t *)hdl;
+ if (spp->sp_islegacy)
+ return (EINVAL);
+
+ mutex_enter(&sdev_plugin_lock);
+ list_remove(&sdev_plugin_list, spp);
+ mutex_exit(&sdev_plugin_lock);
+
+ sdev_mnt_walk(sdev_plugin_unregister_cb, spp);
+ mutex_enter(&spp->sp_lock);
+ while (spp->sp_nnodes > 0)
+ cv_wait(&spp->sp_nodecv, &spp->sp_lock);
+ mutex_exit(&spp->sp_lock);
+ kmem_cache_free(sdev_plugin_cache, spp);
+ return (0);
+}
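
For orientation only (this is editorial illustration, not part of the patch), a consumer of the new interface might register and later tear down a plugin roughly as sketched below. Only sdev_plugin_register(), sdev_plugin_unregister(), sdev_plugin_ops_t, sdev_plugin_hdl_t, the spo_* fields, spo_version == 1, and SDEV_PLUGIN_NO_NCACHE are taken from the patch; the header path, sdev_ctx_t, sdev_plugin_validate_t, SDEV_VTOR_VALID, and all example_* names are assumptions made for the sketch.

#include <sys/sdev_plugin.h>	/* assumed header providing the plugin API */

/* ARGSUSED */
static sdev_plugin_validate_t
example_validate(sdev_ctx_t ctx)
{
	return (SDEV_VTOR_VALID);	/* assumed validator return constant */
}

/* ARGSUSED */
static int
example_filldir(sdev_ctx_t ctx)
{
	/* Create any nodes that should exist under /dev/example. */
	return (0);
}

/* ARGSUSED */
static void
example_inactive(sdev_ctx_t ctx)
{
	/* Release any per-node state once the node becomes a zombie. */
}

static sdev_plugin_ops_t example_ops = {
	.spo_version = 1,		/* the only version accepted above */
	.spo_flags = SDEV_PLUGIN_NO_NCACHE,
	.spo_validate = example_validate,
	.spo_filldir = example_filldir,
	.spo_inactive = example_inactive
};

static sdev_plugin_hdl_t example_hdl;

static int
example_attach(void)
{
	int err;

	/* Creates /dev/example in the GZ; fails with EEXIST on a name clash. */
	example_hdl = sdev_plugin_register("example", &example_ops, &err);
	if (example_hdl == NULL)
		return (err);
	return (0);
}

static void
example_detach(void)
{
	/* Blocks until every node backed by the plugin has been torn down. */
	(void) sdev_plugin_unregister(example_hdl);
}

The global-zone directory is created at registration time, while zone-local copies are populated on demand through the readdir and lookup entry points above.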
+
+/*
+ * Register an old sdev style plugin to deal with what used to be in the vtab.
+ */
+static int
+sdev_plugin_register_legacy(struct sdev_vop_table *vtp)
+{
+ sdev_plugin_t *spp;
+
+ spp = kmem_cache_alloc(sdev_plugin_cache, KM_SLEEP);
+ (void) strlcpy(spp->sp_name, vtp->vt_name, SDEV_PLUGIN_NAMELEN);
+ spp->sp_islegacy = B_TRUE;
+ spp->sp_pops = NULL;
+ spp->sp_nflags = vtp->vt_flags;
+ spp->sp_lvtor = vtp->vt_vtor;
+ spp->sp_nnodes = 0;
+
+ if (vtp->vt_service != NULL) {
+ fs_operation_def_t *templ;
+ templ = sdev_merge_vtab(vtp->vt_service);
+ if (vn_make_ops(vtp->vt_name,
+ (const fs_operation_def_t *)templ,
+ &spp->sp_vnops) != 0) {
+ cmn_err(CE_WARN, "%s: malformed vnode ops\n",
+ vtp->vt_name);
+ sdev_free_vtab(templ);
+ kmem_cache_free(sdev_plugin_cache, spp);
+ return (1);
+ }
+
+ if (vtp->vt_global_vops) {
+ *(vtp->vt_global_vops) = spp->sp_vnops;
+ }
+
+ sdev_free_vtab(templ);
+ } else {
+ spp->sp_vnops = sdev_vnodeops;
+ }
+
+ /*
+ * No need to check for EEXIST here. These are loaded as part of sdev's
+ * initialization. Further, we don't have to create them here, as that's
+ * taken care of in sdev's mount for the GZ.
+ */
+ mutex_enter(&sdev_plugin_lock);
+ list_insert_tail(&sdev_plugin_list, spp);
+ mutex_exit(&sdev_plugin_lock);
+
+ return (0);
+}
+
+/*
+ * We need to match off of the sdev_path, not the sdev_name. We are only allowed
+ * to exist directly under /dev.
+ */
+static sdev_plugin_t *
+sdev_match(sdev_node_t *dv)
+{
+ int vlen;
+ const char *path;
+ sdev_plugin_t *spp;
+
+ if (strlen(dv->sdev_path) <= 5)
+ return (NULL);
+
+ if (strncmp(dv->sdev_path, "/dev/", 5) != 0)
+ return (NULL);
+ path = dv->sdev_path + 5;
+
+ mutex_enter(&sdev_plugin_lock);
+
+ for (spp = list_head(&sdev_plugin_list); spp != NULL;
+ spp = list_next(&sdev_plugin_list, spp)) {
+ if (strcmp(spp->sp_name, path) == 0) {
+ mutex_exit(&sdev_plugin_lock);
+ return (spp);
+ }
+
+ if (spp->sp_nflags & SDEV_SUBDIR) {
+ vlen = strlen(spp->sp_name);
+ if ((strncmp(spp->sp_name, path,
+ vlen - 1) == 0) && path[vlen] == '/') {
+ mutex_exit(&sdev_plugin_lock);
+ return (spp);
+ }
+
+ }
+ }
+
+ mutex_exit(&sdev_plugin_lock);
+ return (NULL);
+}
+
+void
+sdev_set_no_negcache(sdev_node_t *dv)
+{
+ char *path;
+ sdev_plugin_t *spp;
+
+ ASSERT(dv->sdev_path);
+ path = dv->sdev_path + strlen("/dev/");
+
+ mutex_enter(&sdev_plugin_lock);
+ for (spp = list_head(&sdev_plugin_list); spp != NULL;
+ spp = list_next(&sdev_plugin_list, spp)) {
+ if (strcmp(spp->sp_name, path) == 0) {
+ if (spp->sp_nflags & SDEV_NO_NCACHE)
+ dv->sdev_flags |= SDEV_NO_NCACHE;
+ break;
+ }
+ }
+ mutex_exit(&sdev_plugin_lock);
+}
+
+struct vnodeops *
+sdev_get_vop(sdev_node_t *dv)
+{
+ char *path;
+ sdev_plugin_t *spp;
+
+ path = dv->sdev_path;
+ ASSERT(path);
+
+ /* gets the relative path to /dev/ */
+ path += 5;
+
+ if ((spp = sdev_match(dv)) != NULL) {
+ dv->sdev_flags |= spp->sp_nflags;
+ if (SDEV_IS_PERSIST(dv->sdev_dotdot) &&
+ (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv)))
+ dv->sdev_flags |= SDEV_PERSIST;
+ return (spp->sp_vnops);
+ }
+
+ /* child inherits the persistence of the parent */
+ if (SDEV_IS_PERSIST(dv->sdev_dotdot))
+ dv->sdev_flags |= SDEV_PERSIST;
+ return (sdev_vnodeops);
+}
+
+void *
+sdev_get_vtor(sdev_node_t *dv)
+{
+ sdev_plugin_t *spp;
+
+ if (dv->sdev_private == NULL) {
+ spp = sdev_match(dv);
+ if (spp == NULL)
+ return (NULL);
+ } else {
+ spp = dv->sdev_private;
+ }
+
+ if (spp->sp_islegacy)
+ return ((void *)spp->sp_lvtor);
+ else
+ return ((void *)sdev_plugin_validate);
+}
+
+void
+sdev_plugin_nodeready(sdev_node_t *sdp)
+{
+ sdev_plugin_t *spp;
+
+ ASSERT(RW_WRITE_HELD(&sdp->sdev_contents));
+ ASSERT(sdp->sdev_private == NULL);
+
+ spp = sdev_match(sdp);
+ if (spp == NULL)
+ return;
+ if (spp->sp_islegacy)
+ return;
+ sdp->sdev_private = spp;
+ mutex_enter(&spp->sp_lock);
+ spp->sp_nnodes++;
+ mutex_exit(&spp->sp_lock);
+}
+
+int
+sdev_plugin_init(void)
+{
+ sdev_vop_table_t *vtp;
+ fs_operation_def_t *templ;
+
+ sdev_plugin_cache = kmem_cache_create("sdev_plugin",
+ sizeof (sdev_plugin_t), 0, sdev_plugin_cache_constructor,
+ sdev_plugin_cache_destructor, NULL, NULL, NULL, 0);
+ if (sdev_plugin_cache == NULL)
+ return (1);
+ mutex_init(&sdev_plugin_lock, NULL, MUTEX_DRIVER, NULL);
+ list_create(&sdev_plugin_list, sizeof (sdev_plugin_t),
+ offsetof(sdev_plugin_t, sp_link));
+
+ /*
+ * Register all of the legacy vnops
+ */
+ for (vtp = &vtab[0]; vtp->vt_name != NULL; vtp++)
+ if (sdev_plugin_register_legacy(vtp) != 0)
+ return (1);
+
+ templ = sdev_merge_vtab(sdev_plugin_vnodeops_tbl);
+ if (vn_make_ops("sdev_plugin",
+ (const fs_operation_def_t *)templ,
+ &sdev_plugin_vnops) != 0) {
+ sdev_free_vtab(templ);
+ return (1);
+ }
+
+ sdev_free_vtab(templ);
+ return (0);
+}
diff --git a/usr/src/uts/common/fs/dev/sdev_subr.c b/usr/src/uts/common/fs/dev/sdev_subr.c
index b4b27e6285..16439a66a6 100644
--- a/usr/src/uts/common/fs/dev/sdev_subr.c
+++ b/usr/src/uts/common/fs/dev/sdev_subr.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
*/
/*
@@ -150,12 +150,6 @@ vattr_t sdev_vattr_chr = {
kmem_cache_t *sdev_node_cache; /* sdev_node cache */
int devtype; /* fstype */
-/* static */
-static struct vnodeops *sdev_get_vop(struct sdev_node *);
-static void sdev_set_no_negcache(struct sdev_node *);
-static fs_operation_def_t *sdev_merge_vtab(const fs_operation_def_t []);
-static void sdev_free_vtab(fs_operation_def_t *);
-
static void
sdev_prof_free(struct sdev_node *dv)
{
@@ -318,6 +312,7 @@ sdev_nodeinit(struct sdev_node *ddv, char *nm, struct sdev_node **newdv,
(void) snprintf(dv->sdev_path, len, "%s/%s", ddv->sdev_path, nm);
/* overwritten for VLNK nodes */
dv->sdev_symlink = NULL;
+ list_link_init(&dv->sdev_plist);
vp = SDEVTOV(dv);
vn_reinit(vp);
@@ -406,6 +401,7 @@ sdev_nodeready(struct sdev_node *dv, struct vattr *vap, struct vnode *avp,
} else {
dv->sdev_nlink = 1;
}
+ sdev_plugin_nodeready(dv);
if (!(SDEV_IS_GLOBAL(dv))) {
dv->sdev_origin = (struct sdev_node *)args;
@@ -502,37 +498,22 @@ sdev_mkroot(struct vfs *vfsp, dev_t devdev, struct vnode *mvp,
return (dv);
}
-/* directory dependent vop table */
-struct sdev_vop_table {
- char *vt_name; /* subdirectory name */
- const fs_operation_def_t *vt_service; /* vnodeops table */
- struct vnodeops *vt_vops; /* constructed vop */
- struct vnodeops **vt_global_vops; /* global container for vop */
- int (*vt_vtor)(struct sdev_node *); /* validate sdev_node */
- int vt_flags;
-};
-
-/*
- * A nice improvement would be to provide a plug-in mechanism
- * for this table instead of a const table.
- */
-static struct sdev_vop_table vtab[] =
-{
- { "pts", devpts_vnodeops_tbl, NULL, &devpts_vnodeops, devpts_validate,
+struct sdev_vop_table vtab[] = {
+ { "pts", devpts_vnodeops_tbl, &devpts_vnodeops, devpts_validate,
SDEV_DYNAMIC | SDEV_VTOR },
- { "vt", devvt_vnodeops_tbl, NULL, &devvt_vnodeops, devvt_validate,
+ { "vt", devvt_vnodeops_tbl, &devvt_vnodeops, devvt_validate,
SDEV_DYNAMIC | SDEV_VTOR },
- { "zvol", devzvol_vnodeops_tbl, NULL, &devzvol_vnodeops,
+ { "zvol", devzvol_vnodeops_tbl, &devzvol_vnodeops,
devzvol_validate, SDEV_ZONED | SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR },
- { "zcons", NULL, NULL, NULL, NULL, SDEV_NO_NCACHE },
+ { "zcons", NULL, NULL, NULL, SDEV_NO_NCACHE },
- { "net", devnet_vnodeops_tbl, NULL, &devnet_vnodeops, devnet_validate,
- SDEV_DYNAMIC | SDEV_VTOR },
+ { "net", devnet_vnodeops_tbl, &devnet_vnodeops, devnet_validate,
+ SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR },
- { "ipnet", devipnet_vnodeops_tbl, NULL, &devipnet_vnodeops,
+ { "ipnet", devipnet_vnodeops_tbl, &devipnet_vnodeops,
devipnet_validate, SDEV_DYNAMIC | SDEV_VTOR | SDEV_NO_NCACHE },
/*
@@ -547,132 +528,14 @@ static struct sdev_vop_table vtab[] =
* preventing a mkdir.
*/
- { "lofi", NULL, NULL, NULL, NULL,
+ { "lofi", NULL, NULL, NULL,
SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST },
- { "rlofi", NULL, NULL, NULL, NULL,
+ { "rlofi", NULL, NULL, NULL,
SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST },
- { NULL, NULL, NULL, NULL, NULL, 0}
+ { NULL, NULL, NULL, NULL, 0}
};
-/*
- * We need to match off of the sdev_path, not the sdev_name. We are only allowed
- * to exist directly under /dev.
- */
-struct sdev_vop_table *
-sdev_match(struct sdev_node *dv)
-{
- int vlen;
- int i;
- const char *path;
-
- if (strlen(dv->sdev_path) <= 5)
- return (NULL);
-
- if (strncmp(dv->sdev_path, "/dev/", 5) != 0)
- return (NULL);
- path = dv->sdev_path + 5;
-
- for (i = 0; vtab[i].vt_name; i++) {
- if (strcmp(vtab[i].vt_name, path) == 0)
- return (&vtab[i]);
- if (vtab[i].vt_flags & SDEV_SUBDIR) {
- vlen = strlen(vtab[i].vt_name);
- if ((strncmp(vtab[i].vt_name, path,
- vlen - 1) == 0) && path[vlen] == '/')
- return (&vtab[i]);
- }
-
- }
- return (NULL);
-}
-
-/*
- * sets a directory's vnodeops if the directory is in the vtab;
- */
-static struct vnodeops *
-sdev_get_vop(struct sdev_node *dv)
-{
- struct sdev_vop_table *vtp;
- char *path;
-
- path = dv->sdev_path;
- ASSERT(path);
-
- /* gets the relative path to /dev/ */
- path += 5;
-
- /* gets the vtab entry it matches */
- if ((vtp = sdev_match(dv)) != NULL) {
- dv->sdev_flags |= vtp->vt_flags;
- if (SDEV_IS_PERSIST(dv->sdev_dotdot) &&
- (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv)))
- dv->sdev_flags |= SDEV_PERSIST;
-
- if (vtp->vt_vops) {
- if (vtp->vt_global_vops)
- *(vtp->vt_global_vops) = vtp->vt_vops;
-
- return (vtp->vt_vops);
- }
-
- if (vtp->vt_service) {
- fs_operation_def_t *templ;
- templ = sdev_merge_vtab(vtp->vt_service);
- if (vn_make_ops(vtp->vt_name,
- (const fs_operation_def_t *)templ,
- &vtp->vt_vops) != 0) {
- cmn_err(CE_PANIC, "%s: malformed vnode ops\n",
- vtp->vt_name);
- /*NOTREACHED*/
- }
- if (vtp->vt_global_vops) {
- *(vtp->vt_global_vops) = vtp->vt_vops;
- }
- sdev_free_vtab(templ);
-
- return (vtp->vt_vops);
- }
-
- return (sdev_vnodeops);
- }
-
- /* child inherits the persistence of the parent */
- if (SDEV_IS_PERSIST(dv->sdev_dotdot))
- dv->sdev_flags |= SDEV_PERSIST;
-
- return (sdev_vnodeops);
-}
-
-static void
-sdev_set_no_negcache(struct sdev_node *dv)
-{
- int i;
- char *path;
-
- ASSERT(dv->sdev_path);
- path = dv->sdev_path + strlen("/dev/");
-
- for (i = 0; vtab[i].vt_name; i++) {
- if (strcmp(vtab[i].vt_name, path) == 0) {
- if (vtab[i].vt_flags & SDEV_NO_NCACHE)
- dv->sdev_flags |= SDEV_NO_NCACHE;
- break;
- }
- }
-}
-
-void *
-sdev_get_vtor(struct sdev_node *dv)
-{
- struct sdev_vop_table *vtp;
-
- vtp = sdev_match(dv);
- if (vtp)
- return ((void *)vtp->vt_vtor);
- else
- return (NULL);
-}
/*
* Build the base root inode
@@ -952,8 +815,11 @@ sdev_nodedestroy(struct sdev_node *dv, uint_t flags)
dv->sdev_path = NULL;
}
- if (!SDEV_IS_GLOBAL(dv))
+ if (!SDEV_IS_GLOBAL(dv)) {
sdev_prof_free(dv);
+ if (dv->sdev_vnode->v_type != VLNK && dv->sdev_origin != NULL)
+ SDEV_RELE(dv->sdev_origin);
+ }
if (SDEVTOV(dv)->v_type == VDIR) {
ASSERT(SDEV_FIRST_ENTRY(dv) == NULL);
@@ -967,6 +833,7 @@ sdev_nodedestroy(struct sdev_node *dv, uint_t flags)
(void) memset((void *)&dv->sdev_instance_data, 0,
sizeof (dv->sdev_instance_data));
vn_invalid(SDEVTOV(dv));
+ dv->sdev_private = NULL;
kmem_cache_free(sdev_node_cache, dv);
}
@@ -2948,46 +2815,6 @@ sdev_modctl_devexists(const char *path)
return (error);
}
-extern int sdev_vnodeops_tbl_size;
-
-/*
- * construct a new template with overrides from vtab
- */
-static fs_operation_def_t *
-sdev_merge_vtab(const fs_operation_def_t tab[])
-{
- fs_operation_def_t *new;
- const fs_operation_def_t *tab_entry;
-
- /* make a copy of standard vnode ops table */
- new = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP);
- bcopy((void *)sdev_vnodeops_tbl, new, sdev_vnodeops_tbl_size);
-
- /* replace the overrides from tab */
- for (tab_entry = tab; tab_entry->name != NULL; tab_entry++) {
- fs_operation_def_t *std_entry = new;
- while (std_entry->name) {
- if (strcmp(tab_entry->name, std_entry->name) == 0) {
- std_entry->func = tab_entry->func;
- break;
- }
- std_entry++;
- }
- if (std_entry->name == NULL)
- cmn_err(CE_NOTE, "sdev_merge_vtab: entry %s unused.",
- tab_entry->name);
- }
-
- return (new);
-}
-
-/* free memory allocated by sdev_merge_vtab */
-static void
-sdev_free_vtab(fs_operation_def_t *new)
-{
- kmem_free(new, sdev_vnodeops_tbl_size);
-}
-
/*
* a generic setattr() function
*
diff --git a/usr/src/uts/common/fs/dev/sdev_vfsops.c b/usr/src/uts/common/fs/dev/sdev_vfsops.c
index ea9cb6374a..6f32f47635 100644
--- a/usr/src/uts/common/fs/dev/sdev_vfsops.c
+++ b/usr/src/uts/common/fs/dev/sdev_vfsops.c
@@ -169,7 +169,13 @@ devinit(int fstype, char *name)
if ((devmajor = getudev()) == (major_t)-1) {
cmn_err(CE_WARN, "%s: can't get unique dev", sdev_vfssw.name);
- return (1);
+ return (ENXIO);
+ }
+
+ if (sdev_plugin_init() != 0) {
+ cmn_err(CE_WARN, "%s: failed to initialize plugin subsystem",
+ sdev_vfssw.name);
+ return (EIO);
}
/* initialize negative cache */
@@ -332,6 +338,7 @@ sdev_mount(struct vfs *vfsp, struct vnode *mvp, struct mounta *uap,
ASSERT(sdev_origins);
dv->sdev_flags &= ~SDEV_GLOBAL;
dv->sdev_origin = sdev_origins->sdev_root;
+ SDEV_HOLD(dv->sdev_origin);
} else {
sdev_ncache_setup();
rw_enter(&dv->sdev_contents, RW_WRITER);
@@ -504,3 +511,17 @@ sdev_mntinfo_rele(struct sdev_data *mntinfo)
SDEVTOV(mntinfo->sdev_root)->v_count--;
mutex_exit(&sdev_lock);
}
+
+void
+sdev_mnt_walk(void (*func)(struct sdev_node *, void *), void *arg)
+{
+ struct sdev_data *mntinfo;
+
+ mutex_enter(&sdev_lock);
+ mntinfo = sdev_mntinfo;
+ while (mntinfo != NULL) {
+ func(mntinfo->sdev_root, arg);
+ mntinfo = mntinfo->sdev_next;
+ }
+ mutex_exit(&sdev_lock);
+}
diff --git a/usr/src/uts/common/fs/dev/sdev_zvolops.c b/usr/src/uts/common/fs/dev/sdev_zvolops.c
index e4c3acf787..495cf450de 100644
--- a/usr/src/uts/common/fs/dev/sdev_zvolops.c
+++ b/usr/src/uts/common/fs/dev/sdev_zvolops.c
@@ -21,7 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2013 Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc. All rights reserved.
*/
/* vnode ops for the /dev/zvol directory */
@@ -370,8 +370,10 @@ devzvol_create_pool_dirs(struct vnode *dvp)
ASSERT(dvp->v_count > 0);
rc = VOP_LOOKUP(dvp, nvpair_name(elem), &vp, NULL, 0,
NULL, kcred, NULL, 0, NULL);
- /* should either work, or not be visible from a zone */
- ASSERT(rc == 0 || rc == ENOENT);
+ /*
+ * The lookup should either work, or return an error if the pool is
+ * not visible from the zone or is disallowed in the zone.
+ */
if (rc == 0)
VN_RELE(vp);
pools++;
diff --git a/usr/src/uts/common/fs/dnlc.c b/usr/src/uts/common/fs/dnlc.c
index 25327d2852..c949117da6 100644
--- a/usr/src/uts/common/fs/dnlc.c
+++ b/usr/src/uts/common/fs/dnlc.c
@@ -921,50 +921,6 @@ dnlc_fs_purge1(vnodeops_t *vop)
}
/*
- * Perform a reverse lookup in the DNLC. This will find the first occurrence of
- * the vnode. If successful, it will return the vnode of the parent, and the
- * name of the entry in the given buffer. If it cannot be found, or the buffer
- * is too small, then it will return NULL. Note that this is a highly
- * inefficient function, since the DNLC is constructed solely for forward
- * lookups.
- */
-vnode_t *
-dnlc_reverse_lookup(vnode_t *vp, char *buf, size_t buflen)
-{
- nc_hash_t *nch;
- ncache_t *ncp;
- vnode_t *pvp;
-
- if (!doingcache)
- return (NULL);
-
- for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) {
- mutex_enter(&nch->hash_lock);
- ncp = nch->hash_next;
- while (ncp != (ncache_t *)nch) {
- /*
- * We ignore '..' entries since it can create
- * confusion and infinite loops.
- */
- if (ncp->vp == vp && !(ncp->namlen == 2 &&
- 0 == bcmp(ncp->name, "..", 2)) &&
- ncp->namlen < buflen) {
- bcopy(ncp->name, buf, ncp->namlen);
- buf[ncp->namlen] = '\0';
- pvp = ncp->dp;
- /* VN_HOLD 2 of 2 in this file */
- VN_HOLD_CALLER(pvp);
- mutex_exit(&nch->hash_lock);
- return (pvp);
- }
- ncp = ncp->hash_next;
- }
- mutex_exit(&nch->hash_lock);
- }
-
- return (NULL);
-}
-/*
* Utility routine to search for a cache entry. Return the
* ncache entry if found, NULL otherwise.
*/
diff --git a/usr/src/uts/common/fs/fem.c b/usr/src/uts/common/fs/fem.c
index b4e28cc860..5f524def30 100644
--- a/usr/src/uts/common/fs/fem.c
+++ b/usr/src/uts/common/fs/fem.c
@@ -23,6 +23,10 @@
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2015, Joyent, Inc. All rights reserved.
+ */
+
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
@@ -33,11 +37,12 @@
#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
-
#include <sys/fem.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/vfs_opreg.h>
+#include <sys/stack.h>
+#include <sys/archsystm.h>
#define NNODES_DEFAULT 8 /* Default number of nodes in a fem_list */
/*
@@ -291,6 +296,536 @@ _op_find(femarg_t *ap, void **fp, int offs0, int offs1)
}
#endif
+/*
+ * File event monitoring handoffs
+ *
+ * File event monitoring relies on being able to inject stack frames between
+ * vnode consumers and the underlying file systems. This becomes problematic
+ * when there exist many monitors, as kernel stack depth is finite. The model
+ * very much encodes this injected frame: the flow of control deliberately
+ * lies with the monitor, not with the monitoring system. While we could
+ * conceivably address this by allowing each subsystem to install at most
+ * one monitor per vnode (and impose on subsystems that they handle any
+ * of their own consumer multiplexing internally), this in fact exports a
+ * substantial amount of run-time complexity to deal with an uncommon case
+ * (and, it must be said, assumes a small number of consuming subsystems).
+ * To allow our abstraction to remain clean, we instead check our remaining
+ * stack in every vnext_*() call; if the amount of stack remaining is lower
+ * than a threshold (fem_stack_needed), we call thread_splitstack() to carry
+ * on the execution of the monitors and the underlying vnode operation on a
+ * split stack. Because we can only pass a single argument to our split stack
+ * function, we must marshal our arguments, the mechanics of which are somewhat
+ * ornate in terms of the code: to marshal in a type-safe manner, we define a
+ * baton that is a union of payload structures for each kind of operation,
+ * loading the per-operation payload explicitly and calling into common handoff
+ * code that itself calls thread_splitstack(). The function passed to
+ * thread_splitstack() is a per-entry point function that continues monitor
+ * processing given the specified (marshalled) arguments. While this method
+ * is a little verbose to implement, it has the advantage of being relatively
+ * robust (that is, broadly type-safe) while imposing minimal burden on each
+ * vnext_*() entry point.
+ *
+ * In terms of the implementation:
+ *
+ * - The FEM_BATON_n macros define the per-entry point baton structures
+ * - The fem_baton_payload_t contains the union of these structures
+ * - The FEM_VNEXTn_DECL macros declare the post-handoff entry point
+ * - The FEM_VNEXTn macros constitute the per-handoff entry point
+ *
+ * Note that we don't use variadic macros -- we define a variant of these
+ * macros for each of our relevant argument counts. This may seem overly
+ * explicit, but it is deliberate: the object here is to minimize the
+ * future maintenance burden by minimizing the likelihood of introduced
+ * error -- not to minimize the number of characters in this source file.
+ */
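
For illustration only (editorial sketch, not part of the patch): once the macros defined below are expanded, the handoff path for a four-argument operation such as vnext_open() reduces to roughly the following, where func and arg0 have already been located by vsop_find():

/* Generated by FEM_VNEXT4_DECL(open, arg0, mode, cr, ct): replay the baton. */
void
fem_handoff_open(fem_baton_t *bp)
{
	bp->fb_rval = bp->fb_func(
	    bp->fb_payload.fb_open.fb_open_arg0,
	    bp->fb_payload.fb_open.fb_open_mode,
	    bp->fb_payload.fb_open.fb_open_cr,
	    bp->fb_payload.fb_open.fb_open_ct);
}

/* Body of FEM_VNEXT4(open, func, arg0, mode, cr, ct) inside vnext_open(): */
	if (FEM_TOODEEP()) {
		fem_baton_t *baton;
		int rval;

		/* Marshal the arguments into a baton ... */
		baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP);
		baton->fb_payload.fb_open.fb_open_arg0 = arg0;
		baton->fb_payload.fb_open.fb_open_mode = mode;
		baton->fb_payload.fb_open.fb_open_cr = cr;
		baton->fb_payload.fb_open.fb_open_ct = ct;
		baton->fb_handoff = fem_handoff_open;
		baton->fb_func = func;

		/* ... and continue on a split stack, picking up the result. */
		rval = fem_handoff(baton);
		kmem_free(baton, sizeof (fem_baton_t));

		return (rval);
	}
	/* Plenty of stack left: call the next monitor (or the FS) directly. */
	return (func(arg0, mode, cr, ct));

The per-operation FEM_BATON_n payload keeps this marshalling type-safe, while fem_handoff() itself stays generic across all entry points.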
+
+#ifndef STACK_GROWTH_DOWN
+#error Downward stack growth assumed.
+#endif
+
+int fem_stack_toodeep;
+uintptr_t fem_stack_needed = 8 * 1024;
+size_t fem_handoff_stacksize = 128 * 1024;
+
+#define FEM_TOODEEP() (STACK_BIAS + (uintptr_t)getfp() - \
+ (uintptr_t)curthread->t_stkbase < fem_stack_needed)
+
+#define FEM_BATON_1(what, t0, l0) \
+ struct { \
+ void *fb_##what##_arg0; \
+ caller_context_t *fb_##what##_ct; \
+ t0 fb_##what##_##l0; \
+ } fb_##what
+
+#define FEM_BATON_2(what, t0, l0, t1, l1) \
+ struct { \
+ void *fb_##what##_arg0; \
+ caller_context_t *fb_##what##_ct; \
+ t0 fb_##what##_##l0; \
+ t1 fb_##what##_##l1; \
+ } fb_##what
+
+#define FEM_BATON_3(what, t0, l0, t1, l1, t2, l2) \
+ struct { \
+ void *fb_##what##_arg0; \
+ caller_context_t *fb_##what##_ct; \
+ t0 fb_##what##_##l0; \
+ t1 fb_##what##_##l1; \
+ t2 fb_##what##_##l2; \
+ } fb_##what
+
+#define FEM_BATON_4(what, t0, l0, t1, l1, t2, l2, t3, l3) \
+ struct { \
+ void *fb_##what##_arg0; \
+ caller_context_t *fb_##what##_ct; \
+ t0 fb_##what##_##l0; \
+ t1 fb_##what##_##l1; \
+ t2 fb_##what##_##l2; \
+ t3 fb_##what##_##l3; \
+ } fb_##what
+
+#define FEM_BATON_5(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4) \
+ struct { \
+ void *fb_##what##_arg0; \
+ caller_context_t *fb_##what##_ct; \
+ t0 fb_##what##_##l0; \
+ t1 fb_##what##_##l1; \
+ t2 fb_##what##_##l2; \
+ t3 fb_##what##_##l3; \
+ t4 fb_##what##_##l4; \
+ } fb_##what
+
+#define FEM_BATON_6(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5) \
+ struct { \
+ void *fb_##what##_arg0; \
+ caller_context_t *fb_##what##_ct; \
+ t0 fb_##what##_##l0; \
+ t1 fb_##what##_##l1; \
+ t2 fb_##what##_##l2; \
+ t3 fb_##what##_##l3; \
+ t4 fb_##what##_##l4; \
+ t5 fb_##what##_##l5; \
+ } fb_##what
+
+#define FEM_BATON_8(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5, \
+ t6, l6, t7, l7) \
+ struct { \
+ void *fb_##what##_arg0; \
+ caller_context_t *fb_##what##_ct; \
+ t0 fb_##what##_##l0; \
+ t1 fb_##what##_##l1; \
+ t2 fb_##what##_##l2; \
+ t3 fb_##what##_##l3; \
+ t4 fb_##what##_##l4; \
+ t5 fb_##what##_##l5; \
+ t6 fb_##what##_##l6; \
+ t7 fb_##what##_##l7; \
+ } fb_##what
+
+#define FEM_BATON_9(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5, \
+ t6, l6, t7, l7, t8, l8) \
+ struct { \
+ void *fb_##what##_arg0; \
+ caller_context_t *fb_##what##_ct; \
+ t0 fb_##what##_##l0; \
+ t1 fb_##what##_##l1; \
+ t2 fb_##what##_##l2; \
+ t3 fb_##what##_##l3; \
+ t4 fb_##what##_##l4; \
+ t5 fb_##what##_##l5; \
+ t6 fb_##what##_##l6; \
+ t7 fb_##what##_##l7; \
+ t8 fb_##what##_##l8; \
+ } fb_##what
+
+typedef union {
+ FEM_BATON_2(open, int, mode, cred_t *, cr);
+ FEM_BATON_4(close, int, flag, int, count,
+ offset_t, offset, cred_t *, cr);
+ FEM_BATON_3(read, uio_t *, uiop, int, ioflag, cred_t *, cr);
+ FEM_BATON_3(write, uio_t *, uiop, int, ioflag, cred_t *, cr);
+ FEM_BATON_5(ioctl, int, cmd, intptr_t, arg,
+ int, flag, cred_t *, cr, int *, rvalp);
+ FEM_BATON_3(setfl, int, oflags, int, nflags, cred_t *, cr);
+ FEM_BATON_3(getattr, vattr_t *, vap, int, flags, cred_t *, cr);
+ FEM_BATON_3(setattr, vattr_t *, vap, int, flags, cred_t *, cr);
+ FEM_BATON_3(access, int, mode, int, flags, cred_t *, cr);
+ FEM_BATON_8(lookup, char *, nm, vnode_t **, vpp,
+ pathname_t *, pnp, int, flags, vnode_t *, rdir,
+ cred_t *, cr, int *, direntflags, pathname_t *, realpnp);
+ FEM_BATON_8(create, char *, name, vattr_t *, vap,
+ vcexcl_t, excl, int, mode, vnode_t **, vpp,
+ cred_t *, cr, int, flag, vsecattr_t *, vsecp);
+ FEM_BATON_3(remove, char *, nm, cred_t *, cr, int, flags);
+ FEM_BATON_4(link, vnode_t *, svp, char *, tnm,
+ cred_t *, cr, int, flags);
+ FEM_BATON_5(rename, char *, snm, vnode_t *, tdvp,
+ char *, tnm, cred_t *, cr, int, flags);
+ FEM_BATON_6(mkdir, char *, dirname, vattr_t *, vap,
+ vnode_t **, vpp, cred_t *, cr, int, flags,
+ vsecattr_t *, vsecp);
+ FEM_BATON_4(rmdir, char *, nm, vnode_t *, cdir,
+ cred_t *, cr, int, flags);
+ FEM_BATON_4(readdir, uio_t *, uiop, cred_t *, cr,
+ int *, eofp, int, flags);
+ FEM_BATON_5(symlink, char *, linkname, vattr_t *, vap,
+ char *, target, cred_t *, cr, int, flags);
+ FEM_BATON_2(readlink, uio_t *, uiop, cred_t *, cr);
+ FEM_BATON_2(fsync, int, syncflag, cred_t *, cr);
+ FEM_BATON_1(inactive, cred_t *, cr);
+ FEM_BATON_1(fid, fid_t *, fidp);
+ FEM_BATON_1(rwlock, int, write_lock);
+ FEM_BATON_1(rwunlock, int, write_lock);
+ FEM_BATON_2(seek, offset_t, ooff, offset_t *, noffp);
+ FEM_BATON_1(cmp, vnode_t *, vp2);
+ FEM_BATON_6(frlock, int, cmd, struct flock64 *, bfp,
+ int, flag, offset_t, offset, struct flk_callback *, flk_cbp,
+ cred_t *, cr);
+ FEM_BATON_5(space, int, cmd, struct flock64 *, bfp,
+ int, flag, offset_t, offset, cred_t *, cr);
+ FEM_BATON_1(realvp, vnode_t **, vpp);
+ FEM_BATON_9(getpage, offset_t, off, size_t, len,
+ uint_t *, protp, struct page **, plarr, size_t, plsz,
+ struct seg *, seg, caddr_t, addr, enum seg_rw, rw,
+ cred_t *, cr);
+ FEM_BATON_4(putpage, offset_t, off, size_t, len,
+ int, flags, cred_t *, cr);
+ FEM_BATON_8(map, offset_t, off, struct as *, as,
+ caddr_t *, addrp, size_t, len, uchar_t, prot,
+ uchar_t, maxprot, uint_t, flags, cred_t *, cr);
+ FEM_BATON_8(addmap, offset_t, off, struct as *, as,
+ caddr_t, addr, size_t, len, uchar_t, prot,
+ uchar_t, maxprot, uint_t, flags, cred_t *, cr);
+ FEM_BATON_8(delmap, offset_t, off, struct as *, as,
+ caddr_t, addr, size_t, len, uint_t, prot,
+ uint_t, maxprot, uint_t, flags, cred_t *, cr);
+ FEM_BATON_4(poll, short, events, int, anyyet,
+ short *, reventsp, struct pollhead **, phpp);
+ FEM_BATON_3(dump, caddr_t, addr, offset_t, lbdn, offset_t, dblks);
+ FEM_BATON_3(pathconf, int, cmd, ulong_t *, valp, cred_t *, cr);
+ FEM_BATON_5(pageio, struct page *, pp, u_offset_t, io_off,
+ size_t, io_len, int, flags, cred_t *, cr);
+ FEM_BATON_2(dumpctl, int, action, offset_t *, blkp);
+ FEM_BATON_4(dispose, struct page *, pp, int, flag,
+ int, dn, cred_t *, cr);
+ FEM_BATON_3(setsecattr, vsecattr_t *, vsap, int, flag, cred_t *, cr);
+ FEM_BATON_3(getsecattr, vsecattr_t *, vsap, int, flag, cred_t *, cr);
+ FEM_BATON_4(shrlock, int, cmd, struct shrlock *, shr,
+ int, flag, cred_t *, cr);
+ FEM_BATON_3(vnevent, vnevent_t, vnevent, vnode_t *, dvp, char *, cname);
+ FEM_BATON_3(reqzcbuf, enum uio_rw, ioflag,
+ xuio_t *, xuiop, cred_t *, cr);
+ FEM_BATON_2(retzcbuf, xuio_t *, xuiop, cred_t *, cr);
+} fem_baton_payload_t;
+
+typedef struct {
+ fem_baton_payload_t fb_payload;
+ int (*fb_func)();
+ void (*fb_handoff)();
+ int fb_rval;
+} fem_baton_t;
+
+static int
+fem_handoff(fem_baton_t *bp)
+{
+ fem_stack_toodeep++;
+ thread_splitstack(bp->fb_handoff, bp, fem_handoff_stacksize);
+
+ return (bp->fb_rval);
+}
+
+#define FEM_VNEXT3_DECL(what, a0, a1, a2) \
+void \
+fem_handoff_##what(fem_baton_t *bp) \
+{ \
+ bp->fb_rval = bp->fb_func( \
+ bp->fb_payload.fb_##what.fb_##what##_##a0, \
+ bp->fb_payload.fb_##what.fb_##what##_##a1, \
+ bp->fb_payload.fb_##what.fb_##what##_##a2); \
+}
+
+#define FEM_VNEXT4_DECL(what, a0, a1, a2, a3) \
+void \
+fem_handoff_##what(fem_baton_t *bp) \
+{ \
+ bp->fb_rval = bp->fb_func( \
+ bp->fb_payload.fb_##what.fb_##what##_##a0, \
+ bp->fb_payload.fb_##what.fb_##what##_##a1, \
+ bp->fb_payload.fb_##what.fb_##what##_##a2, \
+ bp->fb_payload.fb_##what.fb_##what##_##a3); \
+}
+
+#define FEM_VNEXT5_DECL(what, a0, a1, a2, a3, a4) \
+void \
+fem_handoff_##what(fem_baton_t *bp) \
+{ \
+ bp->fb_rval = bp->fb_func( \
+ bp->fb_payload.fb_##what.fb_##what##_##a0, \
+ bp->fb_payload.fb_##what.fb_##what##_##a1, \
+ bp->fb_payload.fb_##what.fb_##what##_##a2, \
+ bp->fb_payload.fb_##what.fb_##what##_##a3, \
+ bp->fb_payload.fb_##what.fb_##what##_##a4); \
+}
+
+#define FEM_VNEXT6_DECL(what, a0, a1, a2, a3, a4, a5) \
+void \
+fem_handoff_##what(fem_baton_t *bp) \
+{ \
+ bp->fb_rval = bp->fb_func( \
+ bp->fb_payload.fb_##what.fb_##what##_##a0, \
+ bp->fb_payload.fb_##what.fb_##what##_##a1, \
+ bp->fb_payload.fb_##what.fb_##what##_##a2, \
+ bp->fb_payload.fb_##what.fb_##what##_##a3, \
+ bp->fb_payload.fb_##what.fb_##what##_##a4, \
+ bp->fb_payload.fb_##what.fb_##what##_##a5); \
+}
+
+#define FEM_VNEXT7_DECL(what, a0, a1, a2, a3, a4, a5, a6) \
+void \
+fem_handoff_##what(fem_baton_t *bp) \
+{ \
+ bp->fb_rval = bp->fb_func( \
+ bp->fb_payload.fb_##what.fb_##what##_##a0, \
+ bp->fb_payload.fb_##what.fb_##what##_##a1, \
+ bp->fb_payload.fb_##what.fb_##what##_##a2, \
+ bp->fb_payload.fb_##what.fb_##what##_##a3, \
+ bp->fb_payload.fb_##what.fb_##what##_##a4, \
+ bp->fb_payload.fb_##what.fb_##what##_##a5, \
+ bp->fb_payload.fb_##what.fb_##what##_##a6); \
+}
+
+#define FEM_VNEXT8_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7) \
+void \
+fem_handoff_##what(fem_baton_t *bp) \
+{ \
+ bp->fb_rval = bp->fb_func( \
+ bp->fb_payload.fb_##what.fb_##what##_##a0, \
+ bp->fb_payload.fb_##what.fb_##what##_##a1, \
+ bp->fb_payload.fb_##what.fb_##what##_##a2, \
+ bp->fb_payload.fb_##what.fb_##what##_##a3, \
+ bp->fb_payload.fb_##what.fb_##what##_##a4, \
+ bp->fb_payload.fb_##what.fb_##what##_##a5, \
+ bp->fb_payload.fb_##what.fb_##what##_##a6, \
+ bp->fb_payload.fb_##what.fb_##what##_##a7); \
+}
+
+#define FEM_VNEXT10_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \
+void \
+fem_handoff_##what(fem_baton_t *bp) \
+{ \
+ bp->fb_rval = bp->fb_func( \
+ bp->fb_payload.fb_##what.fb_##what##_##a0, \
+ bp->fb_payload.fb_##what.fb_##what##_##a1, \
+ bp->fb_payload.fb_##what.fb_##what##_##a2, \
+ bp->fb_payload.fb_##what.fb_##what##_##a3, \
+ bp->fb_payload.fb_##what.fb_##what##_##a4, \
+ bp->fb_payload.fb_##what.fb_##what##_##a5, \
+ bp->fb_payload.fb_##what.fb_##what##_##a6, \
+ bp->fb_payload.fb_##what.fb_##what##_##a7, \
+ bp->fb_payload.fb_##what.fb_##what##_##a8, \
+ bp->fb_payload.fb_##what.fb_##what##_##a9); \
+}
+
+#define FEM_VNEXT11_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \
+void \
+fem_handoff_##what(fem_baton_t *bp) \
+{ \
+ bp->fb_rval = bp->fb_func( \
+ bp->fb_payload.fb_##what.fb_##what##_##a0, \
+ bp->fb_payload.fb_##what.fb_##what##_##a1, \
+ bp->fb_payload.fb_##what.fb_##what##_##a2, \
+ bp->fb_payload.fb_##what.fb_##what##_##a3, \
+ bp->fb_payload.fb_##what.fb_##what##_##a4, \
+ bp->fb_payload.fb_##what.fb_##what##_##a5, \
+ bp->fb_payload.fb_##what.fb_##what##_##a6, \
+ bp->fb_payload.fb_##what.fb_##what##_##a7, \
+ bp->fb_payload.fb_##what.fb_##what##_##a8, \
+ bp->fb_payload.fb_##what.fb_##what##_##a9, \
+ bp->fb_payload.fb_##what.fb_##what##_##a10); \
+}
+
+#define FEM_VNEXT3(what, func, a0, a1, a2) \
+ if (FEM_TOODEEP()) { \
+ fem_baton_t *baton; \
+ int rval; \
+ \
+ baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \
+ baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \
+ baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \
+ baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \
+ baton->fb_handoff = fem_handoff_##what; \
+ baton->fb_func = func; \
+ \
+ rval = fem_handoff(baton); \
+ kmem_free(baton, sizeof (fem_baton_t)); \
+ \
+ return (rval); \
+ } \
+ return (func(a0, a1, a2))
+
+#define FEM_VNEXT4(what, func, a0, a1, a2, a3) \
+ if (FEM_TOODEEP()) { \
+ fem_baton_t *baton; \
+ int rval; \
+ \
+ baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \
+ baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \
+ baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \
+ baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \
+ baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \
+ baton->fb_handoff = fem_handoff_##what; \
+ baton->fb_func = func; \
+ \
+ rval = fem_handoff(baton); \
+ kmem_free(baton, sizeof (fem_baton_t)); \
+ \
+ return (rval); \
+ } \
+ return (func(a0, a1, a2, a3))
+
+#define FEM_VNEXT5(what, func, a0, a1, a2, a3, a4) \
+ if (FEM_TOODEEP()) { \
+ fem_baton_t *baton; \
+ int rval; \
+ \
+ baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \
+ baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \
+ baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \
+ baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \
+ baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \
+ baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \
+ baton->fb_handoff = fem_handoff_##what; \
+ baton->fb_func = func; \
+ \
+ rval = fem_handoff(baton); \
+ kmem_free(baton, sizeof (fem_baton_t)); \
+ \
+ return (rval); \
+ } \
+ return (func(a0, a1, a2, a3, a4))
+
+#define FEM_VNEXT6(what, func, a0, a1, a2, a3, a4, a5) \
+ if (FEM_TOODEEP()) { \
+ fem_baton_t *baton; \
+ int rval; \
+ \
+ baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \
+ baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \
+ baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \
+ baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \
+ baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \
+ baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \
+ baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \
+ baton->fb_handoff = fem_handoff_##what; \
+ baton->fb_func = func; \
+ \
+ rval = fem_handoff(baton); \
+ kmem_free(baton, sizeof (fem_baton_t)); \
+ \
+ return (rval); \
+ } \
+ return (func(a0, a1, a2, a3, a4, a5))
+
+#define FEM_VNEXT7(what, func, a0, a1, a2, a3, a4, a5, a6) \
+ if (FEM_TOODEEP()) { \
+ fem_baton_t *baton; \
+ int rval; \
+ \
+ baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \
+ baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \
+ baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \
+ baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \
+ baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \
+ baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \
+ baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \
+ baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \
+ baton->fb_handoff = fem_handoff_##what; \
+ baton->fb_func = func; \
+ \
+ rval = fem_handoff(baton); \
+ kmem_free(baton, sizeof (fem_baton_t)); \
+ \
+ return (rval); \
+ } \
+ return (func(a0, a1, a2, a3, a4, a5, a6))
+
+#define FEM_VNEXT8(what, func, a0, a1, a2, a3, a4, a5, a6, a7) \
+ if (FEM_TOODEEP()) { \
+ fem_baton_t *baton; \
+ int rval; \
+ \
+ baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \
+ baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \
+ baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \
+ baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \
+ baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \
+ baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \
+ baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \
+ baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \
+ baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \
+ baton->fb_handoff = fem_handoff_##what; \
+ baton->fb_func = func; \
+ \
+ rval = fem_handoff(baton); \
+ kmem_free(baton, sizeof (fem_baton_t)); \
+ \
+ return (rval); \
+ } \
+ return (func(a0, a1, a2, a3, a4, a5, a6, a7))
+
+#define FEM_VNEXT10(what, func, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \
+ if (FEM_TOODEEP()) { \
+ fem_baton_t *baton; \
+ int rval; \
+ \
+ baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \
+ baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \
+ baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \
+ baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \
+ baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \
+ baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \
+ baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \
+ baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \
+ baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \
+ baton->fb_payload.fb_##what.fb_##what##_##a8 = a8; \
+ baton->fb_payload.fb_##what.fb_##what##_##a9 = a9; \
+ baton->fb_handoff = fem_handoff_##what; \
+ baton->fb_func = func; \
+ \
+ rval = fem_handoff(baton); \
+ kmem_free(baton, sizeof (fem_baton_t)); \
+ \
+ return (rval); \
+ } \
+ return (func(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9))
+
+#define FEM_VNEXT11(what, func, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \
+ if (FEM_TOODEEP()) { \
+ fem_baton_t *baton; \
+ int rval; \
+ \
+ baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \
+ baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \
+ baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \
+ baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \
+ baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \
+ baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \
+ baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \
+ baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \
+ baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \
+ baton->fb_payload.fb_##what.fb_##what##_##a8 = a8; \
+ baton->fb_payload.fb_##what.fb_##what##_##a9 = a9; \
+ baton->fb_payload.fb_##what.fb_##what##_##a10 = a10; \
+ baton->fb_handoff = fem_handoff_##what; \
+ baton->fb_func = func; \
+ \
+ rval = fem_handoff(baton); \
+ kmem_free(baton, sizeof (fem_baton_t)); \
+ \
+ return (rval); \
+ } \
+ return (func(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10))
+
static fem_t *
fem_alloc()
{
@@ -2036,10 +2571,60 @@ static struct fs_operation_def fshead_vfs_spec[] = {
* 5. Return by invoking the base operation with the base object.
*
* for each classification, there needs to be at least one "next" operation
- * for each "head"operation.
- *
+ * for each "head" operation. Note that we also use the FEM_VNEXTn_DECL macros
+ * to define the function to run when the stack is split; see the discussion
+ * on "File event monitoring handoffs", above.
*/
+FEM_VNEXT4_DECL(open, arg0, mode, cr, ct)
+FEM_VNEXT6_DECL(close, arg0, flag, count, offset, cr, ct)
+FEM_VNEXT5_DECL(read, arg0, uiop, ioflag, cr, ct)
+FEM_VNEXT5_DECL(write, arg0, uiop, ioflag, cr, ct)
+FEM_VNEXT7_DECL(ioctl, arg0, cmd, arg, flag, cr, rvalp, ct)
+FEM_VNEXT5_DECL(setfl, arg0, oflags, nflags, cr, ct)
+FEM_VNEXT5_DECL(getattr, arg0, vap, flags, cr, ct)
+FEM_VNEXT5_DECL(setattr, arg0, vap, flags, cr, ct)
+FEM_VNEXT5_DECL(access, arg0, mode, flags, cr, ct)
+FEM_VNEXT10_DECL(lookup, arg0, nm, vpp, pnp, flags, rdir,
+ cr, ct, direntflags, realpnp)
+FEM_VNEXT10_DECL(create, arg0, name, vap, excl, mode, vpp, cr, flag, ct, vsecp)
+FEM_VNEXT5_DECL(remove, arg0, nm, cr, ct, flags)
+FEM_VNEXT6_DECL(link, arg0, svp, tnm, cr, ct, flags)
+FEM_VNEXT7_DECL(rename, arg0, snm, tdvp, tnm, cr, ct, flags)
+FEM_VNEXT8_DECL(mkdir, arg0, dirname, vap, vpp, cr, ct, flags, vsecp)
+FEM_VNEXT6_DECL(rmdir, arg0, nm, cdir, cr, ct, flags)
+FEM_VNEXT6_DECL(readdir, arg0, uiop, cr, eofp, ct, flags)
+FEM_VNEXT7_DECL(symlink, arg0, linkname, vap, target, cr, ct, flags)
+FEM_VNEXT4_DECL(readlink, arg0, uiop, cr, ct)
+FEM_VNEXT4_DECL(fsync, arg0, syncflag, cr, ct)
+FEM_VNEXT3_DECL(fid, arg0, fidp, ct)
+FEM_VNEXT3_DECL(rwlock, arg0, write_lock, ct)
+FEM_VNEXT4_DECL(seek, arg0, ooff, noffp, ct)
+FEM_VNEXT3_DECL(cmp, arg0, vp2, ct)
+FEM_VNEXT8_DECL(frlock, arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct)
+FEM_VNEXT7_DECL(space, arg0, cmd, bfp, flag, offset, cr, ct)
+FEM_VNEXT3_DECL(realvp, arg0, vpp, ct)
+FEM_VNEXT11_DECL(getpage, arg0, off, len, protp, plarr, plsz,
+ seg, addr, rw, cr, ct)
+FEM_VNEXT6_DECL(putpage, arg0, off, len, flags, cr, ct)
+FEM_VNEXT10_DECL(map, arg0, off, as, addrp, len, prot, maxprot,
+ flags, cr, ct)
+FEM_VNEXT10_DECL(addmap, arg0, off, as, addr, len, prot, maxprot,
+ flags, cr, ct)
+FEM_VNEXT10_DECL(delmap, arg0, off, as, addr, len, prot, maxprot,
+ flags, cr, ct)
+FEM_VNEXT6_DECL(poll, arg0, events, anyyet, reventsp, phpp, ct)
+FEM_VNEXT5_DECL(dump, arg0, addr, lbdn, dblks, ct)
+FEM_VNEXT5_DECL(pathconf, arg0, cmd, valp, cr, ct)
+FEM_VNEXT7_DECL(pageio, arg0, pp, io_off, io_len, flags, cr, ct)
+FEM_VNEXT4_DECL(dumpctl, arg0, action, blkp, ct)
+FEM_VNEXT5_DECL(setsecattr, arg0, vsap, flag, cr, ct)
+FEM_VNEXT5_DECL(getsecattr, arg0, vsap, flag, cr, ct)
+FEM_VNEXT6_DECL(shrlock, arg0, cmd, shr, flag, cr, ct)
+FEM_VNEXT5_DECL(vnevent, arg0, vnevent, dvp, cname, ct)
+FEM_VNEXT5_DECL(reqzcbuf, arg0, ioflag, xuiop, cr, ct)
+FEM_VNEXT4_DECL(retzcbuf, arg0, xuiop, cr, ct)
+
int
vnext_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct)
{
@@ -2051,7 +2636,7 @@ vnext_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct)
vsop_find(vf, &func, int, &arg0, vop_open, femop_open);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, mode, cr, ct));
+ FEM_VNEXT4(open, func, arg0, mode, cr, ct);
}
int
@@ -2066,7 +2651,7 @@ vnext_close(femarg_t *vf, int flag, int count, offset_t offset, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_close, femop_close);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, flag, count, offset, cr, ct));
+ FEM_VNEXT6(close, func, arg0, flag, count, offset, cr, ct);
}
int
@@ -2081,7 +2666,7 @@ vnext_read(femarg_t *vf, uio_t *uiop, int ioflag, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_read, femop_read);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, uiop, ioflag, cr, ct));
+ FEM_VNEXT5(read, func, arg0, uiop, ioflag, cr, ct);
}
int
@@ -2096,7 +2681,7 @@ vnext_write(femarg_t *vf, uio_t *uiop, int ioflag, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_write, femop_write);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, uiop, ioflag, cr, ct));
+ FEM_VNEXT5(write, func, arg0, uiop, ioflag, cr, ct);
}
int
@@ -2111,7 +2696,7 @@ vnext_ioctl(femarg_t *vf, int cmd, intptr_t arg, int flag, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_ioctl, femop_ioctl);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, cmd, arg, flag, cr, rvalp, ct));
+ FEM_VNEXT7(ioctl, func, arg0, cmd, arg, flag, cr, rvalp, ct);
}
int
@@ -2126,7 +2711,7 @@ vnext_setfl(femarg_t *vf, int oflags, int nflags, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_setfl, femop_setfl);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, oflags, nflags, cr, ct));
+ FEM_VNEXT5(setfl, func, arg0, oflags, nflags, cr, ct);
}
int
@@ -2141,7 +2726,7 @@ vnext_getattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_getattr, femop_getattr);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, vap, flags, cr, ct));
+ FEM_VNEXT5(getattr, func, arg0, vap, flags, cr, ct);
}
int
@@ -2156,7 +2741,7 @@ vnext_setattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_setattr, femop_setattr);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, vap, flags, cr, ct));
+ FEM_VNEXT5(setattr, func, arg0, vap, flags, cr, ct);
}
int
@@ -2171,7 +2756,7 @@ vnext_access(femarg_t *vf, int mode, int flags, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_access, femop_access);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, mode, flags, cr, ct));
+ FEM_VNEXT5(access, func, arg0, mode, flags, cr, ct);
}
int
@@ -2187,8 +2772,8 @@ vnext_lookup(femarg_t *vf, char *nm, vnode_t **vpp, pathname_t *pnp,
vsop_find(vf, &func, int, &arg0, vop_lookup, femop_lookup);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, nm, vpp, pnp, flags, rdir, cr, ct,
- direntflags, realpnp));
+ FEM_VNEXT10(lookup, func, arg0, nm, vpp, pnp, flags, rdir, cr, ct,
+ direntflags, realpnp);
}
int
@@ -2204,7 +2789,8 @@ vnext_create(femarg_t *vf, char *name, vattr_t *vap, vcexcl_t excl,
vsop_find(vf, &func, int, &arg0, vop_create, femop_create);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, name, vap, excl, mode, vpp, cr, flag, ct, vsecp));
+ FEM_VNEXT10(create, func, arg0, name, vap, excl,
+ mode, vpp, cr, flag, ct, vsecp);
}
int
@@ -2219,7 +2805,7 @@ vnext_remove(femarg_t *vf, char *nm, cred_t *cr, caller_context_t *ct,
vsop_find(vf, &func, int, &arg0, vop_remove, femop_remove);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, nm, cr, ct, flags));
+ FEM_VNEXT5(remove, func, arg0, nm, cr, ct, flags);
}
int
@@ -2234,7 +2820,7 @@ vnext_link(femarg_t *vf, vnode_t *svp, char *tnm, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_link, femop_link);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, svp, tnm, cr, ct, flags));
+ FEM_VNEXT6(link, func, arg0, svp, tnm, cr, ct, flags);
}
int
@@ -2249,7 +2835,7 @@ vnext_rename(femarg_t *vf, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_rename, femop_rename);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, snm, tdvp, tnm, cr, ct, flags));
+ FEM_VNEXT7(rename, func, arg0, snm, tdvp, tnm, cr, ct, flags);
}
int
@@ -2264,7 +2850,7 @@ vnext_mkdir(femarg_t *vf, char *dirname, vattr_t *vap, vnode_t **vpp,
vsop_find(vf, &func, int, &arg0, vop_mkdir, femop_mkdir);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, dirname, vap, vpp, cr, ct, flags, vsecp));
+ FEM_VNEXT8(mkdir, func, arg0, dirname, vap, vpp, cr, ct, flags, vsecp);
}
int
@@ -2279,7 +2865,7 @@ vnext_rmdir(femarg_t *vf, char *nm, vnode_t *cdir, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_rmdir, femop_rmdir);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, nm, cdir, cr, ct, flags));
+ FEM_VNEXT6(rmdir, func, arg0, nm, cdir, cr, ct, flags);
}
int
@@ -2294,7 +2880,7 @@ vnext_readdir(femarg_t *vf, uio_t *uiop, cred_t *cr, int *eofp,
vsop_find(vf, &func, int, &arg0, vop_readdir, femop_readdir);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, uiop, cr, eofp, ct, flags));
+ FEM_VNEXT6(readdir, func, arg0, uiop, cr, eofp, ct, flags);
}
int
@@ -2309,7 +2895,7 @@ vnext_symlink(femarg_t *vf, char *linkname, vattr_t *vap, char *target,
vsop_find(vf, &func, int, &arg0, vop_symlink, femop_symlink);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, linkname, vap, target, cr, ct, flags));
+ FEM_VNEXT7(symlink, func, arg0, linkname, vap, target, cr, ct, flags);
}
int
@@ -2323,7 +2909,7 @@ vnext_readlink(femarg_t *vf, uio_t *uiop, cred_t *cr, caller_context_t *ct)
vsop_find(vf, &func, int, &arg0, vop_readlink, femop_readlink);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, uiop, cr, ct));
+ FEM_VNEXT4(readlink, func, arg0, uiop, cr, ct);
}
int
@@ -2337,7 +2923,7 @@ vnext_fsync(femarg_t *vf, int syncflag, cred_t *cr, caller_context_t *ct)
vsop_find(vf, &func, int, &arg0, vop_fsync, femop_fsync);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, syncflag, cr, ct));
+ FEM_VNEXT4(fsync, func, arg0, syncflag, cr, ct);
}
void
@@ -2365,7 +2951,7 @@ vnext_fid(femarg_t *vf, fid_t *fidp, caller_context_t *ct)
vsop_find(vf, &func, int, &arg0, vop_fid, femop_fid);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, fidp, ct));
+ FEM_VNEXT3(fid, func, arg0, fidp, ct);
}
int
@@ -2379,7 +2965,7 @@ vnext_rwlock(femarg_t *vf, int write_lock, caller_context_t *ct)
vsop_find(vf, &func, int, &arg0, vop_rwlock, femop_rwlock);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, write_lock, ct));
+ FEM_VNEXT3(rwlock, func, arg0, write_lock, ct);
}
void
@@ -2407,7 +2993,7 @@ vnext_seek(femarg_t *vf, offset_t ooff, offset_t *noffp, caller_context_t *ct)
vsop_find(vf, &func, int, &arg0, vop_seek, femop_seek);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, ooff, noffp, ct));
+ FEM_VNEXT4(seek, func, arg0, ooff, noffp, ct);
}
int
@@ -2421,7 +3007,7 @@ vnext_cmp(femarg_t *vf, vnode_t *vp2, caller_context_t *ct)
vsop_find(vf, &func, int, &arg0, vop_cmp, femop_cmp);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, vp2, ct));
+ FEM_VNEXT3(cmp, func, arg0, vp2, ct);
}
int
@@ -2437,7 +3023,7 @@ vnext_frlock(femarg_t *vf, int cmd, struct flock64 *bfp, int flag,
vsop_find(vf, &func, int, &arg0, vop_frlock, femop_frlock);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct));
+ FEM_VNEXT8(frlock, func, arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct);
}
int
@@ -2452,7 +3038,7 @@ vnext_space(femarg_t *vf, int cmd, struct flock64 *bfp, int flag,
vsop_find(vf, &func, int, &arg0, vop_space, femop_space);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, cmd, bfp, flag, offset, cr, ct));
+ FEM_VNEXT7(space, func, arg0, cmd, bfp, flag, offset, cr, ct);
}
int
@@ -2466,7 +3052,7 @@ vnext_realvp(femarg_t *vf, vnode_t **vpp, caller_context_t *ct)
vsop_find(vf, &func, int, &arg0, vop_realvp, femop_realvp);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, vpp, ct));
+ FEM_VNEXT3(realvp, func, arg0, vpp, ct);
}
int
@@ -2482,8 +3068,8 @@ vnext_getpage(femarg_t *vf, offset_t off, size_t len, uint_t *protp,
vsop_find(vf, &func, int, &arg0, vop_getpage, femop_getpage);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, off, len, protp, plarr, plsz, seg, addr, rw,
- cr, ct));
+ FEM_VNEXT11(getpage, func, arg0, off, len, protp,
+ plarr, plsz, seg, addr, rw, cr, ct);
}
int
@@ -2498,7 +3084,7 @@ vnext_putpage(femarg_t *vf, offset_t off, size_t len, int flags,
vsop_find(vf, &func, int, &arg0, vop_putpage, femop_putpage);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, off, len, flags, cr, ct));
+ FEM_VNEXT6(putpage, func, arg0, off, len, flags, cr, ct);
}
int
@@ -2514,8 +3100,8 @@ vnext_map(femarg_t *vf, offset_t off, struct as *as, caddr_t *addrp,
vsop_find(vf, &func, int, &arg0, vop_map, femop_map);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, off, as, addrp, len, prot, maxprot, flags,
- cr, ct));
+ FEM_VNEXT10(map, func, arg0, off, as, addrp, len, prot, maxprot, flags,
+ cr, ct);
}
int
@@ -2531,8 +3117,8 @@ vnext_addmap(femarg_t *vf, offset_t off, struct as *as, caddr_t addr,
vsop_find(vf, &func, int, &arg0, vop_addmap, femop_addmap);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, off, as, addr, len, prot, maxprot, flags,
- cr, ct));
+ FEM_VNEXT10(addmap, func, arg0, off, as, addr, len, prot, maxprot,
+ flags, cr, ct);
}
int
@@ -2548,8 +3134,8 @@ vnext_delmap(femarg_t *vf, offset_t off, struct as *as, caddr_t addr,
vsop_find(vf, &func, int, &arg0, vop_delmap, femop_delmap);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, off, as, addr, len, prot, maxprot, flags,
- cr, ct));
+ FEM_VNEXT10(delmap, func, arg0, off, as, addr, len, prot, maxprot,
+ flags, cr, ct);
}
int
@@ -2564,7 +3150,7 @@ vnext_poll(femarg_t *vf, short events, int anyyet, short *reventsp,
vsop_find(vf, &func, int, &arg0, vop_poll, femop_poll);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, events, anyyet, reventsp, phpp, ct));
+ FEM_VNEXT6(poll, func, arg0, events, anyyet, reventsp, phpp, ct);
}
int
@@ -2579,7 +3165,7 @@ vnext_dump(femarg_t *vf, caddr_t addr, offset_t lbdn, offset_t dblks,
vsop_find(vf, &func, int, &arg0, vop_dump, femop_dump);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, addr, lbdn, dblks, ct));
+ FEM_VNEXT5(dump, func, arg0, addr, lbdn, dblks, ct);
}
int
@@ -2594,7 +3180,7 @@ vnext_pathconf(femarg_t *vf, int cmd, ulong_t *valp, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_pathconf, femop_pathconf);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, cmd, valp, cr, ct));
+ FEM_VNEXT5(pathconf, func, arg0, cmd, valp, cr, ct);
}
int
@@ -2609,7 +3195,7 @@ vnext_pageio(femarg_t *vf, struct page *pp, u_offset_t io_off,
vsop_find(vf, &func, int, &arg0, vop_pageio, femop_pageio);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, pp, io_off, io_len, flags, cr, ct));
+ FEM_VNEXT7(pageio, func, arg0, pp, io_off, io_len, flags, cr, ct);
}
int
@@ -2623,7 +3209,7 @@ vnext_dumpctl(femarg_t *vf, int action, offset_t *blkp, caller_context_t *ct)
vsop_find(vf, &func, int, &arg0, vop_dumpctl, femop_dumpctl);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, action, blkp, ct));
+ FEM_VNEXT4(dumpctl, func, arg0, action, blkp, ct);
}
void
@@ -2653,7 +3239,7 @@ vnext_setsecattr(femarg_t *vf, vsecattr_t *vsap, int flag, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_setsecattr, femop_setsecattr);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, vsap, flag, cr, ct));
+ FEM_VNEXT5(setsecattr, func, arg0, vsap, flag, cr, ct);
}
int
@@ -2668,7 +3254,7 @@ vnext_getsecattr(femarg_t *vf, vsecattr_t *vsap, int flag, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_getsecattr, femop_getsecattr);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, vsap, flag, cr, ct));
+ FEM_VNEXT5(getsecattr, func, arg0, vsap, flag, cr, ct);
}
int
@@ -2683,7 +3269,7 @@ vnext_shrlock(femarg_t *vf, int cmd, struct shrlock *shr, int flag,
vsop_find(vf, &func, int, &arg0, vop_shrlock, femop_shrlock);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, cmd, shr, flag, cr, ct));
+ FEM_VNEXT6(shrlock, func, arg0, cmd, shr, flag, cr, ct);
}
int
@@ -2698,7 +3284,7 @@ vnext_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *cname,
vsop_find(vf, &func, int, &arg0, vop_vnevent, femop_vnevent);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, vnevent, dvp, cname, ct));
+ FEM_VNEXT5(vnevent, func, arg0, vnevent, dvp, cname, ct);
}
int
@@ -2713,7 +3299,7 @@ vnext_reqzcbuf(femarg_t *vf, enum uio_rw ioflag, xuio_t *xuiop, cred_t *cr,
vsop_find(vf, &func, int, &arg0, vop_reqzcbuf, femop_reqzcbuf);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, ioflag, xuiop, cr, ct));
+ FEM_VNEXT5(reqzcbuf, func, arg0, ioflag, xuiop, cr, ct);
}
int
@@ -2727,7 +3313,7 @@ vnext_retzcbuf(femarg_t *vf, xuio_t *xuiop, cred_t *cr, caller_context_t *ct)
vsop_find(vf, &func, int, &arg0, vop_retzcbuf, femop_retzcbuf);
ASSERT(func != NULL);
ASSERT(arg0 != NULL);
- return ((*func)(arg0, xuiop, cr, ct));
+ FEM_VNEXT4(retzcbuf, func, arg0, xuiop, cr, ct);
}
int
diff --git a/usr/src/uts/common/fs/fifofs/fifosubr.c b/usr/src/uts/common/fs/fifofs/fifosubr.c
index 6e56000ffe..56204c6741 100644
--- a/usr/src/uts/common/fs/fifofs/fifosubr.c
+++ b/usr/src/uts/common/fs/fifofs/fifosubr.c
@@ -614,9 +614,12 @@ fifo_stropen(vnode_t **vpp, int flag, cred_t *crp, int dotwist, int lockheld)
/*
* The other end of the pipe is almost closed so
* reject any other open on this end of the pipe
- * This only happens with a pipe mounted under namefs
+ * This normally only happens with a pipe mounted under namefs, but
+ * we can also see an open via proc/fd, which should still succeed.
+	 * To indicate the proc/fd case, the FKLYR flag is passed.
*/
- if ((fnp->fn_flag & (FIFOCLOSE|ISPIPE)) == (FIFOCLOSE|ISPIPE)) {
+ if ((fnp->fn_flag & (FIFOCLOSE|ISPIPE)) == (FIFOCLOSE|ISPIPE) &&
+ (flag & FKLYR) == 0) {
fifo_cleanup(oldvp, flag);
cv_broadcast(&fnp->fn_wait_cv);
if (!lockheld)
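
The layered open that the new FKLYR carve-out allows comes from an in-kernel caller, such as the /proc fd code. A minimal sketch of such a caller is below; VOP_OPEN() and FKLYR are existing interfaces, while the wrapper function itself is illustrative only and not part of this change.

/*
 * Illustrative sketch: a layered, in-kernel open of one end of a pipe, e.g.
 * on behalf of /proc/<pid>/fd/<n>.  Passing FKLYR marks the open as layered
 * so fifo_stropen() does not reject it even when the other end is almost
 * closed (FIFOCLOSE|ISPIPE).
 */
static int
layered_fifo_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	return (VOP_OPEN(vpp, flag | FKLYR, cr, ct));
}
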
diff --git a/usr/src/uts/common/fs/fifofs/fifovnops.c b/usr/src/uts/common/fs/fifofs/fifovnops.c
index ac89e430c7..fee2924093 100644
--- a/usr/src/uts/common/fs/fifofs/fifovnops.c
+++ b/usr/src/uts/common/fs/fifofs/fifovnops.c
@@ -27,7 +27,9 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
+/*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
/*
* FIFOFS file system vnode operations. This file system
@@ -1832,17 +1834,16 @@ fifo_poll(vnode_t *vp, short events, int anyyet, short *reventsp,
}
/*
- * if we happened to get something, return
+ * if we happened to get something and we're not edge-triggered, return
*/
-
- if ((*reventsp = (short)retevents) != 0) {
+ if ((*reventsp = (short)retevents) != 0 && !(events & POLLET)) {
mutex_exit(&fnp->fn_lock->flk_lock);
return (0);
}
/*
- * If poll() has not found any events yet, set up event cell
- * to wake up the poll if a requested event occurs on this
+ * If poll() has not found any events yet or we're edge-triggered, set
+ * up event cell to wake up the poll if a requested event occurs on this
* pipe/fifo.
*/
if (!anyyet) {
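
The POLLET check above is the kernel half of edge-triggered polling. Assuming the epoll compatibility interfaces that accompany this work, a consumer asks for edge-triggered notification roughly as sketched below; the helper function is illustrative only.

#include <sys/epoll.h>
#include <unistd.h>

/*
 * Illustrative sketch: register the read side of a pipe for edge-triggered
 * polling.  With EPOLLET, fifo_poll() is asked (via POLLET) to arm the event
 * cell even when data is already pending, so the caller is only woken again
 * by a new arrival.
 */
static int
watch_pipe_edge_triggered(int epfd, int pipe_rd)
{
	struct epoll_event ev;

	ev.events = EPOLLIN | EPOLLET;
	ev.data.fd = pipe_rd;
	return (epoll_ctl(epfd, EPOLL_CTL_ADD, pipe_rd, &ev));
}
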
diff --git a/usr/src/uts/common/fs/fs_subr.c b/usr/src/uts/common/fs/fs_subr.c
index 744d269716..ec5b145a86 100644
--- a/usr/src/uts/common/fs/fs_subr.c
+++ b/usr/src/uts/common/fs/fs_subr.c
@@ -25,6 +25,7 @@
/*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
/*
@@ -246,6 +247,7 @@ fs_frlock(register vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
int frcmd;
int nlmid;
int error = 0;
+ boolean_t skip_lock = B_FALSE;
flk_callback_t serialize_callback;
int serialize = 0;
v_mode_t mode;
@@ -265,6 +267,17 @@ fs_frlock(register vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
}
break;
+ case F_OFD_GETLK:
+ /*
+	 * TBD: we do not support remote OFD locks at this time.
+ */
+ if (flag & (F_REMOTELOCK | F_PXFSLOCK)) {
+ error = EINVAL;
+ goto done;
+ }
+ skip_lock = B_TRUE;
+ break;
+
case F_SETLK_NBMAND:
/*
* Are NBMAND locks allowed on this file?
@@ -326,6 +339,20 @@ fs_frlock(register vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
}
break;
+ case F_OFD_SETLK:
+ case F_OFD_SETLKW:
+ case F_FLOCK:
+ case F_FLOCKW:
+ /*
+	 * TBD: we do not support remote OFD locks at this time.
+ */
+ if (flag & (F_REMOTELOCK | F_PXFSLOCK)) {
+ error = EINVAL;
+ goto done;
+ }
+ skip_lock = B_TRUE;
+ break;
+
case F_HASREMOTELOCKS:
nlmid = GETNLMID(bfp->l_sysid);
if (nlmid != 0) { /* booted as a cluster */
@@ -354,7 +381,8 @@ fs_frlock(register vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
flk_cbp = &serialize_callback;
}
- error = reclock(vp, bfp, frcmd, flag, offset, flk_cbp);
+ if (!skip_lock)
+ error = reclock(vp, bfp, frcmd, flag, offset, flk_cbp);
done:
if (serialize)
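
For reference, the OFD (open file description) lock commands handled above are issued from userland through fcntl(2). The sketch below shows basic usage; F_OFD_SETLK and struct flock are standard interfaces, and the helper plus its error handling are illustrative only.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/*
 * Minimal sketch: take a whole-file OFD write lock.  OFD locks belong to the
 * open file description rather than to the process, so they are not dropped
 * when an unrelated fd on the same file is closed.  As the change above
 * notes, the remote (NLM/PXFS) variants of these commands are rejected with
 * EINVAL.
 */
static int
lock_whole_file(int fd)
{
	struct flock fl;

	(void) memset(&fl, 0, sizeof (fl));
	fl.l_type = F_WRLCK;
	fl.l_whence = SEEK_SET;
	fl.l_start = 0;
	fl.l_len = 0;		/* 0 means "to end of file" */

	if (fcntl(fd, F_OFD_SETLK, &fl) == -1) {
		perror("F_OFD_SETLK");
		return (-1);
	}
	return (0);
}
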
diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c
new file mode 100644
index 0000000000..05ee2c6e09
--- /dev/null
+++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c
@@ -0,0 +1,640 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2012, Joyent, Inc. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/time.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/errno.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/stat.h>
+#include <sys/policy.h>
+#include <sys/fs/hyprlofs_info.h>
+
+static int hldir_make_hlnode(hlnode_t *, hlfsmount_t *, vattr_t *, enum de_op,
+ vnode_t *, hlnode_t **, cred_t *);
+static int hldiraddentry(hlnode_t *, hlnode_t *, char *);
+
+
+#define HL_HASH_SIZE 8192 /* must be power of 2 */
+#define HL_MUTEX_SIZE 64
+
+static hldirent_t *hl_hashtable[HL_HASH_SIZE];
+static kmutex_t hl_hashmutex[HL_MUTEX_SIZE];
+
+#define HL_HASH_INDEX(a) ((a) & (HL_HASH_SIZE-1))
+#define HL_MUTEX_INDEX(a) ((a) & (HL_MUTEX_SIZE-1))
+
+#define HYPRLOFS_HASH(tp, name, hash) \
+ { \
+ char Xc, *Xcp; \
+ hash = (uint_t)(uintptr_t)(tp) >> 8; \
+ for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++) \
+ hash = (hash << 4) + hash + (uint_t)Xc; \
+ }
+
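
HYPRLOFS_HASH() above is a shift-and-add string hash (hash * 17 + c per character) seeded with the parent node's address, so the same name under different parents usually lands in a different bucket. The same computation written as a plain function, purely as a sketch and not part of this change:

#include <stdint.h>

static unsigned int
hl_name_hash(const void *parent, const char *name)
{
	unsigned int hash = (unsigned int)(uintptr_t)parent >> 8;
	char c;

	for (; (c = *name) != 0; name++)
		hash = (hash << 4) + hash + (unsigned int)c;	/* hash * 17 + c */
	return (hash);
}

HL_HASH_INDEX() and HL_MUTEX_INDEX() can then reduce the hash with a simple mask because both table sizes are powers of two.
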
+void
+hyprlofs_hash_init(void)
+{
+ int ix;
+
+ for (ix = 0; ix < HL_MUTEX_SIZE; ix++)
+ mutex_init(&hl_hashmutex[ix], NULL, MUTEX_DEFAULT, NULL);
+}
+
+static void
+hyprlofs_hash_in(hldirent_t *h)
+{
+ uint_t hash;
+ hldirent_t **prevpp;
+ kmutex_t *hmtx;
+
+ HYPRLOFS_HASH(h->hld_parent, h->hld_name, hash);
+ h->hld_hash = hash;
+ prevpp = &hl_hashtable[HL_HASH_INDEX(hash)];
+ hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)];
+ mutex_enter(hmtx);
+ h->hld_link = *prevpp;
+ *prevpp = h;
+ mutex_exit(hmtx);
+}
+
+/* Remove hldirent *h from the hash list. */
+static void
+hyprlofs_hash_out(hldirent_t *h)
+{
+ uint_t hash;
+ hldirent_t **prevpp;
+ kmutex_t *hmtx;
+
+ hash = h->hld_hash;
+ prevpp = &hl_hashtable[HL_HASH_INDEX(hash)];
+ hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)];
+ mutex_enter(hmtx);
+ while (*prevpp != h)
+ prevpp = &(*prevpp)->hld_link;
+ *prevpp = h->hld_link;
+ mutex_exit(hmtx);
+}
+
+static hldirent_t *
+hyprlofs_hash_lookup(char *name, hlnode_t *parent, uint_t hold,
+ hlnode_t **found)
+{
+ hldirent_t *l;
+ uint_t hash;
+ kmutex_t *hmtx;
+ hlnode_t *hnp;
+
+ HYPRLOFS_HASH(parent, name, hash);
+ hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)];
+ mutex_enter(hmtx);
+ l = hl_hashtable[HL_HASH_INDEX(hash)];
+ while (l) {
+ if (l->hld_hash == hash && l->hld_parent == parent &&
+ strcmp(l->hld_name, name) == 0) {
+ /*
+ * Ensure that the hlnode that we put a hold on is the
+			 * same one that we pass back. Thus the temporary
+			 * variable hnp is necessary.
+ */
+ hnp = l->hld_hlnode;
+ if (hold) {
+ ASSERT(hnp);
+ hlnode_hold(hnp);
+ }
+ if (found)
+ *found = hnp;
+ mutex_exit(hmtx);
+ return (l);
+ } else {
+ l = l->hld_link;
+ }
+ }
+ mutex_exit(hmtx);
+ return (NULL);
+}
+
+/*
+ * Search directory 'parent' for entry 'name'.
+ *
+ * The calling thread can't hold the write version of the rwlock for the
+ * directory being searched
+ *
+ * On success *foundtp points to the found hlnode with its vnode held.
+ */
+int
+hyprlofs_dirlookup(hlnode_t *parent, char *name, hlnode_t **foundtp, cred_t *cr)
+{
+ int error;
+
+ *foundtp = NULL;
+ if (parent->hln_type != VDIR)
+ return (ENOTDIR);
+
+ if ((error = hyprlofs_taccess(parent, VEXEC, cr)))
+ return (error);
+
+ if (*name == '\0') {
+ hlnode_hold(parent);
+ *foundtp = parent;
+ return (0);
+ }
+
+ /*
+ * Search the directory for the matching name. We need the lock
+ * protecting the hln_dir list so that it doesn't change out from
+ * underneath us. hyprlofs_hash_lookup() will pass back the hlnode
+ * with a hold on it.
+ */
+ if (hyprlofs_hash_lookup(name, parent, 1, foundtp) != NULL) {
+ ASSERT(*foundtp);
+ return (0);
+ }
+
+ return (ENOENT);
+}
+
+/*
+ * Enter a directory entry (either a file or subdir, depending on op) for
+ * 'name' and 'hp' into directory 'dir'
+ */
+int
+hyprlofs_direnter(
+ hlfsmount_t *hm,
+ hlnode_t *dir, /* target directory to make entry in */
+ char *name, /* name of entry */
+ enum de_op op, /* entry operation */
+ vnode_t *realvp, /* real vnode */
+ vattr_t *va,
+ hlnode_t **hpp, /* return hlnode */
+ cred_t *cr)
+{
+ hldirent_t *hdp;
+ hlnode_t *found = NULL;
+ hlnode_t *hp;
+ int error = 0;
+ char *s;
+
+ /* hln_rwlock is held to serialize direnter and dirdeletes */
+ ASSERT(RW_WRITE_HELD(&dir->hln_rwlock));
+ ASSERT(dir->hln_type == VDIR);
+
+ /* Don't allow '/' characters in pathname component */
+ for (s = name; *s; s++)
+ if (*s == '/')
+ return (EACCES);
+
+ if (name[0] == '\0')
+ panic("hyprlofs_direnter: NULL name");
+
+ /*
+ * This might be a "dangling detached directory". It could have been
+	 * removed, but a reference to it is still kept in u_cwd. Don't bother
+	 * searching it, and with any luck the user will get tired of dealing
+	 * with us and cd to some absolute path. This is done in ufs, too.
+ */
+ if (dir->hln_nlink == 0) {
+ return (ENOENT);
+ }
+
+ /* Search for the entry. Return "found" if it exists. */
+ hdp = hyprlofs_hash_lookup(name, dir, 1, &found);
+
+ if (hdp) {
+ ASSERT(found);
+ switch (op) {
+ case DE_CREATE:
+ case DE_MKDIR:
+ if (hpp) {
+ *hpp = found;
+ error = EEXIST;
+ } else {
+ hlnode_rele(found);
+ }
+ break;
+ }
+ } else {
+
+ /*
+ * The entry does not exist. Check write perms in dir to see if
+ * entry can be created.
+ */
+ if ((error = hyprlofs_taccess(dir, VWRITE, cr)))
+ return (error);
+
+ /* Make new hlnode and directory entry as required. */
+ if ((error = hldir_make_hlnode(dir, hm, va, op, realvp, &hp,
+ cr)))
+ return (error);
+
+ if ((error = hldiraddentry(dir, hp, name))) {
+ /* Unmake the inode we just made. */
+ rw_enter(&hp->hln_rwlock, RW_WRITER);
+ if ((hp->hln_type) == VDIR) {
+ ASSERT(hdp == NULL);
+ /* cleanup allocs made by hyprlofs_dirinit() */
+ hyprlofs_dirtrunc(hp);
+ }
+ mutex_enter(&hp->hln_tlock);
+ hp->hln_nlink = 0;
+ mutex_exit(&hp->hln_tlock);
+ gethrestime(&hp->hln_ctime);
+ rw_exit(&hp->hln_rwlock);
+ hlnode_rele(hp);
+ hp = NULL;
+ } else if (hpp) {
+ *hpp = hp;
+ } else {
+ hlnode_rele(hp);
+ }
+ }
+
+ return (error);
+}
+
+/*
+ * Delete entry hp of name "nm" from dir. Free dir entry space and decrement
+ * link count on hlnode(s).
+ */
+int
+hyprlofs_dirdelete(hlnode_t *dir, hlnode_t *hp, char *nm, enum dr_op op,
+ cred_t *cr)
+{
+ hldirent_t *hpdp;
+ int error;
+ size_t namelen;
+ hlnode_t *hnp;
+ timestruc_t now;
+
+ ASSERT(RW_WRITE_HELD(&dir->hln_rwlock));
+ ASSERT(RW_WRITE_HELD(&hp->hln_rwlock));
+ ASSERT(dir->hln_type == VDIR);
+
+ if (nm[0] == '\0')
+ panic("hyprlofs_dirdelete: NULL name for %p", (void *)hp);
+
+ /* return error if removing . or .. */
+ if (nm[0] == '.') {
+ if (nm[1] == '\0')
+ return (EINVAL);
+ if (nm[1] == '.' && nm[2] == '\0')
+			return (EEXIST); /* as in ufs */
+ }
+
+ if ((error = hyprlofs_taccess(dir, VEXEC|VWRITE, cr)) != 0)
+ return (error);
+
+ if (dir->hln_dir == NULL)
+ return (ENOENT);
+
+ hpdp = hyprlofs_hash_lookup(nm, dir, 0, &hnp);
+ if (hpdp == NULL) {
+ /*
+ * If it is gone, some other thread got here first!
+ * Return error ENOENT.
+ */
+ return (ENOENT);
+ }
+
+ /*
+ * If the hlnode in the hldirent changed (shouldn't happen since we
+ * don't support rename) then original is gone, so return that status
+ * (same as UFS).
+ */
+ if (hp != hnp)
+ return (ENOENT);
+
+ hyprlofs_hash_out(hpdp);
+
+ /* Take hpdp out of the directory list. */
+ ASSERT(hpdp->hld_next != hpdp);
+ ASSERT(hpdp->hld_prev != hpdp);
+ if (hpdp->hld_prev) {
+ hpdp->hld_prev->hld_next = hpdp->hld_next;
+ }
+ if (hpdp->hld_next) {
+ hpdp->hld_next->hld_prev = hpdp->hld_prev;
+ }
+
+ /*
+ * If the roving slot pointer happens to match hpdp, point it at the
+ * previous dirent.
+ */
+ if (dir->hln_dir->hld_prev == hpdp) {
+ dir->hln_dir->hld_prev = hpdp->hld_prev;
+ }
+ ASSERT(hpdp->hld_next != hpdp);
+ ASSERT(hpdp->hld_prev != hpdp);
+
+ /* hpdp points to the correct directory entry */
+ namelen = strlen(hpdp->hld_name) + 1;
+
+ kmem_free(hpdp, sizeof (hldirent_t) + namelen);
+ dir->hln_size -= (sizeof (hldirent_t) + namelen);
+ dir->hln_dirents--;
+
+ gethrestime(&now);
+ dir->hln_mtime = now;
+ dir->hln_ctime = now;
+ hp->hln_ctime = now;
+
+ ASSERT(hp->hln_nlink > 0);
+ DECR_COUNT(&hp->hln_nlink, &hp->hln_tlock);
+ if (op == DR_RMDIR && hp->hln_type == VDIR) {
+ hyprlofs_dirtrunc(hp);
+ ASSERT(hp->hln_nlink == 0);
+ }
+ return (0);
+}
+
+/*
+ * hyprlofs_dirinit initializes a dir with '.' and '..' entries without
+ * checking perms and locking
+ */
+void
+hyprlofs_dirinit(
+ hlnode_t *parent, /* parent of directory to initialize */
+ hlnode_t *dir) /* the new directory */
+{
+ hldirent_t *dot, *dotdot;
+ timestruc_t now;
+
+ ASSERT(RW_WRITE_HELD(&parent->hln_rwlock));
+ ASSERT(dir->hln_type == VDIR);
+
+ dot = kmem_zalloc(sizeof (hldirent_t) + 2, KM_SLEEP);
+ dotdot = kmem_zalloc(sizeof (hldirent_t) + 3, KM_SLEEP);
+
+ /* Initialize the entries */
+ dot->hld_hlnode = dir;
+ dot->hld_offset = 0;
+ dot->hld_name = (char *)dot + sizeof (hldirent_t);
+ dot->hld_name[0] = '.';
+ dot->hld_parent = dir;
+ hyprlofs_hash_in(dot);
+
+ dotdot->hld_hlnode = parent;
+ dotdot->hld_offset = 1;
+ dotdot->hld_name = (char *)dotdot + sizeof (hldirent_t);
+ dotdot->hld_name[0] = '.';
+ dotdot->hld_name[1] = '.';
+ dotdot->hld_parent = dir;
+ hyprlofs_hash_in(dotdot);
+
+ /* Initialize directory entry list. */
+ dot->hld_next = dotdot;
+ dot->hld_prev = dotdot;
+ dotdot->hld_next = NULL;
+ dotdot->hld_prev = dot;
+
+ gethrestime(&now);
+ dir->hln_mtime = now;
+ dir->hln_ctime = now;
+
+ /*
+ * Since hyprlofs_dirinit is called with both dir and parent being the
+ * same for the root vnode, we need to increment this before we set
+ * hln_nlink = 2 below.
+ */
+ INCR_COUNT(&parent->hln_nlink, &parent->hln_tlock);
+ parent->hln_ctime = now;
+
+ dir->hln_dir = dot;
+ dir->hln_size = 2 * sizeof (hldirent_t) + 5; /* dot and dotdot */
+ dir->hln_dirents = 2;
+ dir->hln_nlink = 2;
+}
+
+
+/*
+ * hyprlofs_dirtrunc removes all dir entries under this dir.
+ */
+void
+hyprlofs_dirtrunc(hlnode_t *dir)
+{
+ hldirent_t *hdp;
+ hlnode_t *tp;
+ size_t namelen;
+ timestruc_t now;
+
+ ASSERT(RW_WRITE_HELD(&dir->hln_rwlock));
+ ASSERT(dir->hln_type == VDIR);
+
+ if (dir->hln_looped)
+ return;
+
+ for (hdp = dir->hln_dir; hdp; hdp = dir->hln_dir) {
+ ASSERT(hdp->hld_next != hdp);
+ ASSERT(hdp->hld_prev != hdp);
+ ASSERT(hdp->hld_hlnode);
+
+ dir->hln_dir = hdp->hld_next;
+ namelen = strlen(hdp->hld_name) + 1;
+
+ /*
+ * Adjust the link counts to account for this dir entry removal.
+ */
+ tp = hdp->hld_hlnode;
+
+ ASSERT(tp->hln_nlink > 0);
+ DECR_COUNT(&tp->hln_nlink, &tp->hln_tlock);
+
+ hyprlofs_hash_out(hdp);
+
+ kmem_free(hdp, sizeof (hldirent_t) + namelen);
+ dir->hln_size -= (sizeof (hldirent_t) + namelen);
+ dir->hln_dirents--;
+ }
+
+ gethrestime(&now);
+ dir->hln_mtime = now;
+ dir->hln_ctime = now;
+
+ ASSERT(dir->hln_dir == NULL);
+ ASSERT(dir->hln_size == 0);
+ ASSERT(dir->hln_dirents == 0);
+}
+
+static int
+hldiraddentry(
+ hlnode_t *dir, /* target directory to make entry in */
+ hlnode_t *hp, /* new hlnode */
+ char *name)
+{
+ hldirent_t *hdp, *hpdp;
+ size_t namelen, alloc_size;
+ timestruc_t now;
+
+ /*
+ * Make sure the parent dir wasn't removed from underneath the caller.
+ */
+ if (dir->hln_dir == NULL)
+ return (ENOENT);
+
+ /* Check that everything is on the same FS. */
+ if (hp->hln_vnode->v_vfsp != dir->hln_vnode->v_vfsp)
+ return (EXDEV);
+
+ /* Alloc and init dir entry */
+ namelen = strlen(name) + 1;
+ alloc_size = namelen + sizeof (hldirent_t);
+ hdp = kmem_zalloc(alloc_size, KM_NORMALPRI | KM_NOSLEEP);
+ if (hdp == NULL)
+ return (ENOSPC);
+
+ dir->hln_size += alloc_size;
+ dir->hln_dirents++;
+ hdp->hld_hlnode = hp;
+ hdp->hld_parent = dir;
+
+ /* The dir entry and its name were allocated sequentially. */
+ hdp->hld_name = (char *)hdp + sizeof (hldirent_t);
+ (void) strcpy(hdp->hld_name, name);
+
+ hyprlofs_hash_in(hdp);
+
+ /*
+ * Some utilities expect the size of a directory to remain fairly
+	 * static. For example, a routine that unlinks files between calls to
+	 * readdir() would see the size of the dir change from underneath it,
+	 * making the real dir offset in bytes invalid. To circumvent this, we
+ * initialize a dir entry with a phony offset, and use this offset to
+ * determine end of file in hyprlofs_readdir.
+ */
+ hpdp = dir->hln_dir->hld_prev;
+ /*
+ * Install at first empty "slot" in directory list.
+ */
+ while (hpdp->hld_next != NULL && (hpdp->hld_next->hld_offset -
+ hpdp->hld_offset) <= 1) {
+ ASSERT(hpdp->hld_next != hpdp);
+ ASSERT(hpdp->hld_prev != hpdp);
+ ASSERT(hpdp->hld_next->hld_offset > hpdp->hld_offset);
+ hpdp = hpdp->hld_next;
+ }
+ hdp->hld_offset = hpdp->hld_offset + 1;
+
+ /*
+ * If we're at the end of the dirent list and the offset (which is
+ * necessarily the largest offset in this dir) is more than twice the
+ * number of dirents, that means the dir is 50% holes. At this point
+ * we reset the slot pointer back to the beginning of the dir so we
+ * start using the holes. The idea is that if there are N dirents,
+ * there must also be N holes, so we can satisfy the next N creates by
+ * walking at most 2N entries; thus the average cost of a create is
+ * constant. Note that we use the first dirent's hld_prev as the roving
+ * slot pointer. This saves a word in every dirent.
+ */
+ if (hpdp->hld_next == NULL && hpdp->hld_offset > 2 * dir->hln_dirents)
+ dir->hln_dir->hld_prev = dir->hln_dir->hld_next;
+ else
+ dir->hln_dir->hld_prev = hdp;
+
+ ASSERT(hpdp->hld_next != hpdp);
+ ASSERT(hpdp->hld_prev != hpdp);
+
+ hdp->hld_next = hpdp->hld_next;
+ if (hdp->hld_next) {
+ hdp->hld_next->hld_prev = hdp;
+ }
+ hdp->hld_prev = hpdp;
+ hpdp->hld_next = hdp;
+
+ ASSERT(hdp->hld_next != hdp);
+ ASSERT(hdp->hld_prev != hdp);
+ ASSERT(hpdp->hld_next != hpdp);
+ ASSERT(hpdp->hld_prev != hpdp);
+
+ gethrestime(&now);
+ dir->hln_mtime = now;
+ dir->hln_ctime = now;
+
+ return (0);
+}
+
+static int
+hldir_make_hlnode(hlnode_t *dir, hlfsmount_t *hm, vattr_t *va, enum de_op op,
+ vnode_t *realvp, hlnode_t **newnode, cred_t *cr)
+{
+ hlnode_t *hp;
+ enum vtype type;
+
+ ASSERT(va != NULL);
+ ASSERT(op == DE_CREATE || op == DE_MKDIR);
+ if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) ||
+ ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime)))
+ return (EOVERFLOW);
+ type = va->va_type;
+ hp = kmem_zalloc(sizeof (hlnode_t), KM_SLEEP);
+ hyprlofs_node_init(hm, hp, va, cr);
+
+ hp->hln_vnode->v_rdev = hp->hln_rdev = NODEV;
+ hp->hln_vnode->v_type = type;
+ hp->hln_uid = crgetuid(cr);
+
+ /*
+ * To determine the gid of the created file:
+ * If the directory's set-gid bit is set, set the gid to the gid
+ * of the parent dir, otherwise, use the process's gid.
+ */
+ if (dir->hln_mode & VSGID)
+ hp->hln_gid = dir->hln_gid;
+ else
+ hp->hln_gid = crgetgid(cr);
+
+ /*
+ * If we're creating a dir and the parent dir has the set-GID bit set,
+ * set it on the new dir. Otherwise, if the user is neither privileged
+ * nor a member of the file's new group, clear the file's set-GID bit.
+ */
+ if (dir->hln_mode & VSGID && type == VDIR)
+ hp->hln_mode |= VSGID;
+ else {
+ if ((hp->hln_mode & VSGID) &&
+ secpolicy_vnode_setids_setgids(cr, hp->hln_gid) != 0)
+ hp->hln_mode &= ~VSGID;
+ }
+
+ if (va->va_mask & AT_ATIME)
+ hp->hln_atime = va->va_atime;
+ if (va->va_mask & AT_MTIME)
+ hp->hln_mtime = va->va_mtime;
+
+ if (op == DE_MKDIR) {
+ hyprlofs_dirinit(dir, hp);
+ hp->hln_looped = 0;
+ } else {
+ hp->hln_realvp = realvp;
+ hp->hln_size = va->va_size;
+ hp->hln_looped = 1;
+ }
+
+ *newnode = hp;
+ return (0);
+}
diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c
new file mode 100644
index 0000000000..1d857309f3
--- /dev/null
+++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c
@@ -0,0 +1,127 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/param.h>
+#include <sys/t_lock.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/debug.h>
+#include <sys/time.h>
+#include <sys/cmn_err.h>
+#include <sys/vnode.h>
+#include <sys/stat.h>
+#include <sys/mode.h>
+#include <sys/vfs.h>
+#include <sys/cred.h>
+#include <sys/kmem.h>
+#include <sys/atomic.h>
+#include <sys/policy.h>
+#include <sys/fs/hyprlofs_info.h>
+
+#define MODESHIFT 3
+
+/* Initialize a hlnode and add it to file list under mount point. */
+void
+hyprlofs_node_init(hlfsmount_t *hm, hlnode_t *h, vattr_t *vap, cred_t *cr)
+{
+ vnode_t *vp;
+ timestruc_t now;
+
+ ASSERT(vap != NULL);
+
+ rw_init(&h->hln_rwlock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&h->hln_tlock, NULL, MUTEX_DEFAULT, NULL);
+ h->hln_mode = MAKEIMODE(vap->va_type, vap->va_mode);
+ h->hln_mask = 0;
+ h->hln_type = vap->va_type;
+ h->hln_nodeid = (ino64_t)(uint32_t)((uintptr_t)h >> 3);
+ h->hln_nlink = 1;
+ h->hln_size = 0;
+
+ if (cr == NULL) {
+ h->hln_uid = vap->va_uid;
+ h->hln_gid = vap->va_gid;
+ } else {
+ h->hln_uid = crgetuid(cr);
+ h->hln_gid = crgetgid(cr);
+ }
+
+ h->hln_fsid = hm->hlm_dev;
+ h->hln_rdev = vap->va_rdev;
+ h->hln_blksize = PAGESIZE;
+ h->hln_nblocks = 0;
+ gethrestime(&now);
+ h->hln_atime = now;
+ h->hln_mtime = now;
+ h->hln_ctime = now;
+ h->hln_seq = 0;
+ h->hln_dir = NULL;
+
+ h->hln_vnode = vn_alloc(KM_SLEEP);
+ vp = HLNTOV(h);
+ vn_setops(vp, hyprlofs_vnodeops);
+ vp->v_vfsp = hm->hlm_vfsp;
+ vp->v_type = vap->va_type;
+ vp->v_rdev = vap->va_rdev;
+ vp->v_data = (caddr_t)h;
+ mutex_enter(&hm->hlm_contents);
+ /*
+ * Increment the pseudo generation number for this hlnode. Since
+ * hlnodes are allocated and freed, there really is no particular
+ * generation number for a new hlnode. Just fake it by using a
+ * counter in each file system.
+ */
+ h->hln_gen = hm->hlm_gen++;
+
+ /*
+ * Add new hlnode to end of linked list of hlnodes for this hyprlofs
+ * Root dir is handled specially in hyprlofs_mount.
+ */
+ if (hm->hlm_rootnode != (hlnode_t *)NULL) {
+ h->hln_forw = NULL;
+ h->hln_back = hm->hlm_rootnode->hln_back;
+ h->hln_back->hln_forw = hm->hlm_rootnode->hln_back = h;
+ }
+ mutex_exit(&hm->hlm_contents);
+ vn_exists(vp);
+}
+
+int
+hyprlofs_taccess(void *vtp, int mode, cred_t *cr)
+{
+ hlnode_t *hp = vtp;
+ int shift = 0;
+
+ /* Check access based on owner, group and public perms in hlnode. */
+ if (crgetuid(cr) != hp->hln_uid) {
+ shift += MODESHIFT;
+ if (groupmember(hp->hln_gid, cr) == 0)
+ shift += MODESHIFT;
+ }
+
+ return (secpolicy_vnode_access2(cr, HLNTOV(hp), hp->hln_uid,
+ hp->hln_mode << shift, mode));
+}
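
hyprlofs_taccess() uses the classic shift trick: the relevant rwx triple (owner, group, or other) is moved into the owner position before the mode is handed to secpolicy_vnode_access2(). A small user-level sketch of the arithmetic; the VREAD/VWRITE/VEXEC values match their <sys/vnode.h> definitions and everything else is illustrative:

#include <stdio.h>

#define	MODESHIFT	3
#define	VREAD		0400
#define	VWRITE		0200
#define	VEXEC		0100

int
main(void)
{
	int mode = 0750;	/* rwxr-x--- */
	int shift = MODESHIFT;	/* caller is in the group, but is not the owner */

	/* Shifting left by 3 moves the group bits into the owner position. */
	int effective = (mode << shift) & (VREAD | VWRITE | VEXEC);

	(void) printf("effective bits: %o\n", effective);	/* prints 500, i.e. r-x */
	return (0);
}
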
diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c
new file mode 100644
index 0000000000..c582a8cac2
--- /dev/null
+++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c
@@ -0,0 +1,614 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Hyperlofs is a hybrid file system combining features of the tmpfs(7FS) and
+ * lofs(7FS) file systems. It is modeled on code from both of these file
+ * systems.
+ *
+ * The purpose is to create a high performance name space for files on which
+ * applications will compute. Given a large number of data files with various
+ * owners, we want to construct a view onto those files such that only a subset
+ * is visible to the applications and such that the view can be changed very
+ * quickly as compute progresses. Entries in the name space are not mounts and
+ * thus do not appear in the mnttab. Entries in the name space are allowed to
+ * refer to files on different backing file systems. Intermediate directories
+ * in the name space exist only in-memory, ala tmpfs. There are no leaf nodes
+ * in the name space except for entries that refer to backing files ala lofs.
+ *
+ * The name space is managed via ioctls issued on the mounted file system and
+ * is mostly read-only for the compute applications. That is, applications
+ * cannot create new files in the name space. If a file is unlinked by an
+ * application, that only removes the file from the name space, the backing
+ * file remains in place. It is possible for applications to write-through to
+ * the backing files if the file system is mounted read-write.
+ *
+ * The name space is managed via the HYPRLOFS_ADD_ENTRIES, HYPRLOFS_RM_ENTRIES,
+ * and HYPRLOFS_RM_ALL ioctls on the top-level mount.
+ *
+ * The HYPRLOFS_ADD_ENTRIES ioctl specifies path(s) to the backing file(s) and
+ * the name(s) for the file(s) in the name space. The name(s) may be path(s)
+ * which will be relative to the root of the mount and thus cannot begin with
+ * a /. If the name is a path, it does not have to correspond to any backing
+ * path. The intermediate directories will only exist in the name space. The
+ * entry(ies) will be added to the name space.
+ *
+ * The HYPRLOFS_RM_ENTRIES ioctl specifies the name(s) of the file(s) in the
+ * name space which should be removed. The name(s) may be path(s) which will
+ * be relative to the root of the mount and thus cannot begin with a /. The
+ * named entry(ies) will be removed.
+ *
+ * The HYPRLOFS_RM_ALL ioctl will remove all mappings from the name space.
+ */
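
To make the ioctl interface described above concrete, here is a user-level sketch that loops one backing file into a hyprlofs mount. The hyprlofs_entry_t and hyprlofs_entries_t field names match those consumed by hyprlofs_ioctl() later in this change and are declared in <sys/fs/hyprlofs.h>; the mount point, backing path, and in-namespace name are made up, and the bare strlen() lengths match how the kernel re-terminates the copied-in strings.

#include <sys/types.h>
#include <sys/fs/hyprlofs.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <stropts.h>
#include <unistd.h>

int
main(void)
{
	hyprlofs_entry_t e;
	hyprlofs_entries_t eb;
	char path[] = "/data/backing/file0";	/* real backing file */
	char name[] = "a/b/data";		/* name relative to the mount root */
	int fd;

	if ((fd = open("/hyprlofs", O_RDONLY)) < 0) {
		perror("open");
		return (1);
	}

	e.hle_path = path;
	e.hle_plen = strlen(path);	/* length without the NUL */
	e.hle_name = name;
	e.hle_nlen = strlen(name);

	eb.hle_entries = &e;
	eb.hle_len = 1;			/* number of entries */

	if (ioctl(fd, HYPRLOFS_ADD_ENTRIES, &eb) != 0) {
		perror("HYPRLOFS_ADD_ENTRIES");
		return (1);
	}

	(void) close(fd);
	return (0);
}

HYPRLOFS_RM_ENTRIES takes the same structure but only consults the name fields, and HYPRLOFS_RM_ALL takes no entry list at all.
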
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/time.h>
+#include <sys/pathname.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/stat.h>
+#include <sys/errno.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/statvfs.h>
+#include <sys/mount.h>
+#include <sys/debug.h>
+#include <sys/systm.h>
+#include <sys/mntent.h>
+#include <fs/fs_subr.h>
+#include <vm/page.h>
+#include <vm/anon.h>
+#include <sys/model.h>
+#include <sys/policy.h>
+
+#include <sys/fs/swapnode.h>
+#include <sys/fs/hyprlofs_info.h>
+
+static int hyprlofsfstype;
+
+/*
+ * hyprlofs vfs operations.
+ */
+static int hyprlofsinit(int, char *);
+static int hyprlofs_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *);
+static int hyprlofs_unmount(vfs_t *, int, cred_t *);
+static int hyprlofs_root(vfs_t *, vnode_t **);
+static int hyprlofs_statvfs(vfs_t *, struct statvfs64 *);
+static int hyprlofs_vget(vfs_t *, vnode_t **, struct fid *);
+
+/*
+ * Loadable module wrapper
+ */
+#include <sys/modctl.h>
+
+static mntopts_t hyprlofs_mntopts;
+
+static vfsdef_t vfw = {
+ VFSDEF_VERSION,
+ "hyprlofs",
+ hyprlofsinit,
+ VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT,
+ &hyprlofs_mntopts
+};
+
+static mntopts_t hyprlofs_mntopts = {
+ 0, NULL
+};
+
+/*
+ * Module linkage information
+ */
+static struct modlfs modlfs = {
+ &mod_fsops, "filesystem for hyprlofs", &vfw
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, &modlfs, NULL
+};
+
+int
+_init()
+{
+ return (mod_install(&modlinkage));
+}
+
+int
+_fini()
+{
+ int error;
+
+ error = mod_remove(&modlinkage);
+ if (error)
+ return (error);
+ /*
+ * Tear down the operations vectors
+ */
+ (void) vfs_freevfsops_by_type(hyprlofsfstype);
+ vn_freevnodeops(hyprlofs_vnodeops);
+ return (0);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+/*
+ * The following are patchable variables limiting the amount of system
+ * resources hyprlofs can use.
+ *
+ * hyprlofs_maxkmem limits the amount of kernel kmem_alloc memory hyprlofs can
+ * use for its data structures (e.g. hlnodes, directory entries). It is set
+ * as a percentage of physical memory which is determined when hyprlofs is
+ * first used in the system.
+ *
+ * hyprlofs_minfree is the minimum amount of swap space that hyprlofs leaves for
+ * the rest of the system. If the amount of free swap space in the system
+ * (i.e. anoninfo.ani_free) drops below hyprlofs_minfree, hyprlofs anon
+ * allocations will fail.
+ */
+size_t hyprlofs_maxkmem = 0;
+size_t hyprlofs_minfree = 0;
+size_t hyprlofs_kmemspace; /* bytes of kernel heap used by all hyprlofs */
+
+static major_t hyprlofs_major;
+static minor_t hyprlofs_minor;
+static kmutex_t hyprlofs_minor_lock;
+
+/*
+ * initialize global hyprlofs locks and hashes when loading hyprlofs module
+ */
+static int
+hyprlofsinit(int fstype, char *name)
+{
+ static const fs_operation_def_t hl_vfsops_template[] = {
+ VFSNAME_MOUNT, { .vfs_mount = hyprlofs_mount },
+ VFSNAME_UNMOUNT, { .vfs_unmount = hyprlofs_unmount },
+ VFSNAME_ROOT, { .vfs_root = hyprlofs_root },
+ VFSNAME_STATVFS, { .vfs_statvfs = hyprlofs_statvfs },
+ VFSNAME_VGET, { .vfs_vget = hyprlofs_vget },
+ NULL, NULL
+ };
+ int error;
+ extern void hyprlofs_hash_init();
+
+ hyprlofs_hash_init();
+ hyprlofsfstype = fstype;
+ ASSERT(hyprlofsfstype != 0);
+
+ error = vfs_setfsops(fstype, hl_vfsops_template, NULL);
+ if (error != 0) {
+ cmn_err(CE_WARN, "hyprlofsinit: bad vfs ops template");
+ return (error);
+ }
+
+ error = vn_make_ops(name, hyprlofs_vnodeops_template,
+ &hyprlofs_vnodeops);
+ if (error != 0) {
+ (void) vfs_freevfsops_by_type(fstype);
+ cmn_err(CE_WARN, "hyprlofsinit: bad vnode ops template");
+ return (error);
+ }
+
+ /*
+ * hyprlofs_minfree is an absolute limit of swap space which still
+	 * allows other processes to execute. Set it if it has not been patched.
+ */
+ if (hyprlofs_minfree == 0)
+ hyprlofs_minfree = btopr(HYPRLOFSMINFREE);
+
+ if ((hyprlofs_major = getudev()) == (major_t)-1) {
+ cmn_err(CE_WARN,
+ "hyprlofsinit: Can't get unique device number.");
+ hyprlofs_major = 0;
+ }
+ mutex_init(&hyprlofs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
+ return (0);
+}
+
+static int
+hyprlofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
+{
+ hlfsmount_t *hm = NULL;
+ hlnode_t *hp;
+ struct pathname dpn;
+ int error;
+ vattr_t rattr;
+ int got_attrs;
+
+ if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
+ return (error);
+ if (secpolicy_hyprlofs_control(cr) != 0)
+ return (EPERM);
+
+ if (mvp->v_type != VDIR)
+ return (ENOTDIR);
+
+ if (uap->flags & MS_REMOUNT)
+ return (EBUSY);
+
+ mutex_enter(&mvp->v_lock);
+ if ((uap->flags & MS_OVERLAY) == 0 &&
+ (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
+ mutex_exit(&mvp->v_lock);
+ return (EBUSY);
+ }
+ mutex_exit(&mvp->v_lock);
+
+ /* Having the resource be anything but "swap" doesn't make sense. */
+ vfs_setresource(vfsp, "swap", 0);
+
+ if ((error = pn_get(uap->dir,
+ (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE,
+ &dpn)) != 0)
+ goto out;
+
+ if ((hm = kmem_zalloc(sizeof (hlfsmount_t),
+ KM_NORMALPRI | KM_NOSLEEP)) == NULL) {
+ pn_free(&dpn);
+ error = ENOMEM;
+ goto out;
+ }
+
+ /* Get an available minor device number for this mount */
+ mutex_enter(&hyprlofs_minor_lock);
+ do {
+ hyprlofs_minor = (hyprlofs_minor + 1) & L_MAXMIN32;
+ hm->hlm_dev = makedevice(hyprlofs_major, hyprlofs_minor);
+ } while (vfs_devismounted(hm->hlm_dev));
+ mutex_exit(&hyprlofs_minor_lock);
+
+ /*
+ * Set but don't bother entering the mutex since hlfsmount is not on
+ * the mount list yet.
+ */
+ mutex_init(&hm->hlm_contents, NULL, MUTEX_DEFAULT, NULL);
+
+ hm->hlm_vfsp = vfsp;
+
+ vfsp->vfs_data = (caddr_t)hm;
+ vfsp->vfs_fstype = hyprlofsfstype;
+ vfsp->vfs_dev = hm->hlm_dev;
+ vfsp->vfs_bsize = PAGESIZE;
+ vfsp->vfs_flag |= VFS_NOTRUNC;
+ vfs_make_fsid(&vfsp->vfs_fsid, hm->hlm_dev, hyprlofsfstype);
+ hm->hlm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP);
+ (void) strcpy(hm->hlm_mntpath, dpn.pn_path);
+
+ /* allocate and initialize root hlnode structure */
+ bzero(&rattr, sizeof (vattr_t));
+ rattr.va_mode = (mode_t)(S_IFDIR | 0777);
+ rattr.va_type = VDIR;
+ rattr.va_rdev = 0;
+ hp = kmem_zalloc(sizeof (hlnode_t), KM_SLEEP);
+ hyprlofs_node_init(hm, hp, &rattr, cr);
+
+ /* Get the mode, uid, and gid from the underlying mount point. */
+ rattr.va_mask = AT_MODE|AT_UID|AT_GID;
+ got_attrs = VOP_GETATTR(mvp, &rattr, 0, cr, NULL);
+
+ rw_enter(&hp->hln_rwlock, RW_WRITER);
+ HLNTOV(hp)->v_flag |= VROOT;
+
+ /*
+ * If the getattr succeeded, use its results, otherwise allow the
+ * previously set defaults to prevail.
+ */
+ if (got_attrs == 0) {
+ hp->hln_mode = rattr.va_mode;
+ hp->hln_uid = rattr.va_uid;
+ hp->hln_gid = rattr.va_gid;
+ }
+
+ /*
+ * Initialize linked list of hlnodes so that the back pointer of the
+ * root hlnode always points to the last one on the list and the
+ * forward pointer of the last node is null
+ */
+ hp->hln_back = hp;
+ hp->hln_forw = NULL;
+ hp->hln_nlink = 0;
+ hm->hlm_rootnode = hp;
+
+ hyprlofs_dirinit(hp, hp);
+
+ rw_exit(&hp->hln_rwlock);
+
+ pn_free(&dpn);
+ error = 0;
+
+out:
+ return (error);
+}
+
+static int
+hyprlofs_unmount(vfs_t *vfsp, int flag, cred_t *cr)
+{
+ hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp);
+ hlnode_t *hnp, *cancel;
+ vnode_t *vp;
+ int error;
+
+ if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0)
+ return (error);
+ if (secpolicy_hyprlofs_control(cr) != 0)
+ return (EPERM);
+
+ /*
+ * forced unmount is not supported by this file system
+ * and thus, ENOTSUP, is being returned.
+ */
+ if (flag & MS_FORCE)
+ return (ENOTSUP);
+
+ mutex_enter(&hm->hlm_contents);
+
+ /*
+ * If there are no open files, only the root node should have a ref cnt.
+ * With hlm_contents held, nothing can be added or removed. There may
+ * be some dirty pages. To prevent fsflush from disrupting the unmount,
+ * put a hold on each node while scanning. If we find a previously
+ * referenced node, undo the holds we have placed and fail EBUSY.
+ */
+ hnp = hm->hlm_rootnode;
+ if (HLNTOV(hnp)->v_count > 1) {
+ mutex_exit(&hm->hlm_contents);
+ return (EBUSY);
+ }
+
+ for (hnp = hnp->hln_forw; hnp; hnp = hnp->hln_forw) {
+ if ((vp = HLNTOV(hnp))->v_count > 0) {
+ cancel = hm->hlm_rootnode->hln_forw;
+ while (cancel != hnp) {
+ vp = HLNTOV(cancel);
+ ASSERT(vp->v_count > 0);
+ VN_RELE(vp);
+ cancel = cancel->hln_forw;
+ }
+ mutex_exit(&hm->hlm_contents);
+ return (EBUSY);
+ }
+ VN_HOLD(vp);
+ }
+
+ /* We can drop the mutex now because no one can find this mount */
+ mutex_exit(&hm->hlm_contents);
+
+ /*
+ * Free all alloc'd memory associated with this FS. To do this, we go
+ * through the file list twice, once to remove all the dir entries, and
+ * then to remove all the files.
+ */
+
+ /* Remove all directory entries */
+ for (hnp = hm->hlm_rootnode; hnp; hnp = hnp->hln_forw) {
+ rw_enter(&hnp->hln_rwlock, RW_WRITER);
+ if (hnp->hln_type == VDIR)
+ hyprlofs_dirtrunc(hnp);
+ rw_exit(&hnp->hln_rwlock);
+ }
+
+ ASSERT(hm->hlm_rootnode);
+
+ /*
+ * All links are gone, v_count is keeping nodes in place. VN_RELE
+ * should make the node disappear, unless somebody is holding pages
+ * against it. Wait and retry until it disappears.
+ *
+ * We re-acquire the lock to prevent others who have a HOLD on a hlnode
+ * from blowing it away (in hyprlofs_inactive) while we're trying to
+ * get to it here. Once we have a HOLD on it we know it'll stick around.
+ */
+ mutex_enter(&hm->hlm_contents);
+
+ /* Remove all the files (except the rootnode) backwards. */
+ while ((hnp = hm->hlm_rootnode->hln_back) != hm->hlm_rootnode) {
+ mutex_exit(&hm->hlm_contents);
+ /* Note we handled the link count in pass 2 above. */
+ vp = HLNTOV(hnp);
+ VN_RELE(vp);
+ mutex_enter(&hm->hlm_contents);
+ /*
+ * It's still there after the RELE. Someone else like pageout
+ * has a hold on it so wait a bit and then try again.
+ */
+ if (hnp == hm->hlm_rootnode->hln_back) {
+ VN_HOLD(vp);
+ mutex_exit(&hm->hlm_contents);
+ delay(hz / 4);
+ mutex_enter(&hm->hlm_contents);
+ }
+ }
+ mutex_exit(&hm->hlm_contents);
+
+ VN_RELE(HLNTOV(hm->hlm_rootnode));
+
+ ASSERT(hm->hlm_mntpath);
+
+ kmem_free(hm->hlm_mntpath, strlen(hm->hlm_mntpath) + 1);
+
+ mutex_destroy(&hm->hlm_contents);
+ kmem_free(hm, sizeof (hlfsmount_t));
+
+ return (0);
+}
+
+/* Return root hlnode for given vnode */
+static int
+hyprlofs_root(vfs_t *vfsp, vnode_t **vpp)
+{
+ hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp);
+ hlnode_t *hp = hm->hlm_rootnode;
+ vnode_t *vp;
+
+ ASSERT(hp);
+
+ vp = HLNTOV(hp);
+ VN_HOLD(vp);
+ *vpp = vp;
+ return (0);
+}
+
+static int
+hyprlofs_statvfs(vfs_t *vfsp, struct statvfs64 *sbp)
+{
+ hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp);
+ ulong_t blocks;
+ dev32_t d32;
+ zoneid_t eff_zid;
+ struct zone *zp;
+
+ /*
+ * The FS may have been mounted by the GZ on behalf of the NGZ. In
+ * that case, the hlfsmount zone_id will be the global zone. We want
+ * to show the swap cap inside the zone in this case, even though the
+ * FS was mounted by the GZ.
+ */
+ if (curproc->p_zone->zone_id != GLOBAL_ZONEUNIQID)
+ zp = curproc->p_zone;
+ else
+ zp = hm->hlm_vfsp->vfs_zone;
+
+ if (zp == NULL)
+ eff_zid = GLOBAL_ZONEUNIQID;
+ else
+ eff_zid = zp->zone_id;
+
+ sbp->f_bsize = PAGESIZE;
+ sbp->f_frsize = PAGESIZE;
+
+ /*
+ * Find the amount of available physical and memory swap
+ */
+ mutex_enter(&anoninfo_lock);
+ ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
+ blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP;
+ mutex_exit(&anoninfo_lock);
+
+ if (blocks > hyprlofs_minfree)
+ sbp->f_bfree = blocks - hyprlofs_minfree;
+ else
+ sbp->f_bfree = 0;
+
+ sbp->f_bavail = sbp->f_bfree;
+
+ /*
+ * Total number of blocks is what's available plus what's been used
+ */
+ sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree);
+
+ if (eff_zid != GLOBAL_ZONEUNIQID &&
+ zp->zone_max_swap_ctl != UINT64_MAX) {
+ /*
+ * If the fs is used by a NGZ with a swap cap, then report the
+ * capped size.
+ */
+ rctl_qty_t cap, used;
+ pgcnt_t pgcap, pgused;
+
+ mutex_enter(&zp->zone_mem_lock);
+ cap = zp->zone_max_swap_ctl;
+ used = zp->zone_max_swap;
+ mutex_exit(&zp->zone_mem_lock);
+
+ pgcap = btop(cap);
+ pgused = btop(used);
+
+ sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree);
+ sbp->f_bavail = sbp->f_bfree;
+ sbp->f_blocks = MIN(pgcap, sbp->f_blocks);
+ }
+
+ /*
+ * This is fairly inaccurate since it doesn't take into account the
+ * names stored in the directory entries.
+ */
+ sbp->f_ffree = sbp->f_files = ptob(availrmem) /
+ (sizeof (hlnode_t) + sizeof (hldirent_t));
+
+ sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree);
+ (void) cmpldev(&d32, vfsp->vfs_dev);
+ sbp->f_fsid = d32;
+ (void) strcpy(sbp->f_basetype, vfssw[hyprlofsfstype].vsw_name);
+ (void) strncpy(sbp->f_fstr, hm->hlm_mntpath, sizeof (sbp->f_fstr));
+ /*
+ * ensure null termination
+ */
+ sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0';
+ sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
+ sbp->f_namemax = MAXNAMELEN - 1;
+ return (0);
+}
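
As a worked example of the clamping above (all numbers hypothetical): with 4 KiB pages, a non-global zone with a 2 GiB swap cap has pgcap = btop(2 GiB) = 524288 pages; if 1.5 GiB of swap is already reserved, pgused = 393216, so f_bfree is clamped to MIN(524288 - 393216, system free swap) = 131072 pages and f_blocks to MIN(524288, the system figure). A statvfs(2) issued inside the zone therefore reflects the zone's cap rather than the raw system swap numbers.
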
+
+static int
+hyprlofs_vget(vfs_t *vfsp, vnode_t **vpp, struct fid *fidp)
+{
+ hlfid_t *hfid;
+ hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp);
+ hlnode_t *hp = NULL;
+
+ hfid = (hlfid_t *)fidp;
+ *vpp = NULL;
+
+ mutex_enter(&hm->hlm_contents);
+ for (hp = hm->hlm_rootnode; hp; hp = hp->hln_forw) {
+ mutex_enter(&hp->hln_tlock);
+ if (hp->hln_nodeid == hfid->hlfid_ino) {
+ /*
+ * If the gen numbers don't match we know the file
+ * won't be found since only one hlnode can have this
+ * number at a time.
+ */
+ if (hp->hln_gen != hfid->hlfid_gen ||
+ hp->hln_nlink == 0) {
+ mutex_exit(&hp->hln_tlock);
+ mutex_exit(&hm->hlm_contents);
+ return (0);
+ }
+ *vpp = (vnode_t *)HLNTOV(hp);
+
+ VN_HOLD(*vpp);
+
+ if ((hp->hln_mode & S_ISVTX) &&
+ !(hp->hln_mode & (S_IXUSR | S_IFDIR))) {
+ mutex_enter(&(*vpp)->v_lock);
+ (*vpp)->v_flag |= VISSWAP;
+ mutex_exit(&(*vpp)->v_lock);
+ }
+ mutex_exit(&hp->hln_tlock);
+ mutex_exit(&hm->hlm_contents);
+ return (0);
+ }
+ mutex_exit(&hp->hln_tlock);
+ }
+ mutex_exit(&hm->hlm_contents);
+ return (0);
+}
diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c
new file mode 100644
index 0000000000..a2064dfa1f
--- /dev/null
+++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c
@@ -0,0 +1,1441 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/t_lock.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/user.h>
+#include <sys/time.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <sys/flock.h>
+#include <sys/kmem.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/cred.h>
+#include <sys/dirent.h>
+#include <sys/pathname.h>
+#include <sys/fs/hyprlofs.h>
+#include <sys/fs/hyprlofs_info.h>
+#include <sys/mman.h>
+#include <vm/pvn.h>
+#include <sys/cmn_err.h>
+#include <sys/buf.h>
+#include <sys/policy.h>
+#include <fs/fs_subr.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+
+static int hyprlofs_add_entry(vnode_t *, char *, char *, cred_t *,
+ caller_context_t *);
+static int hyprlofs_rm_entry(vnode_t *, char *, cred_t *, caller_context_t *,
+ int);
+static int hyprlofs_rm_all(vnode_t *, cred_t *, caller_context_t *, int);
+static int hyprlofs_remove(vnode_t *, char *, cred_t *, caller_context_t *,
+ int);
+static int hyprlofs_get_all(vnode_t *, intptr_t, cred_t *, caller_context_t *,
+ int);
+
+/*
+ * This is a somewhat arbitrary upper limit on the number of entries we can
+ * pass in on a single add/rm ioctl call. This is only used to validate that
+ * the input list looks sane.
+ */
+#define MAX_IOCTL_PARAMS 100000
+
+static int
+hyprlofs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
+{
+ vnode_t *rvp;
+ int error;
+
+ rvp = REALVP(*vpp);
+
+ if (VTOHLN(*vpp)->hln_looped == 0)
+ return (0);
+
+ /*
+	 * Looped back, so pass through to the real vnode. We need to hold a
+	 * new reference to rvp since VOP_OPEN() may decide to release it.
+ */
+ VN_HOLD(rvp);
+ error = VOP_OPEN(&rvp, flag, cr, ct);
+ ASSERT(rvp->v_count > 1);
+ VN_RELE(rvp);
+
+ return (error);
+}
+
+static int
+hyprlofs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
+ caller_context_t *ct)
+{
+ if (VTOHLN(vp)->hln_looped == 0) {
+ cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
+ cleanshares(vp, ttoproc(curthread)->p_pid);
+ return (0);
+ }
+
+ return (VOP_CLOSE(REALVP(vp), flag, count, offset, cr, ct));
+}
+
+static int
+hyprlofs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
+ caller_context_t *ct)
+{
+ if (vp->v_type == VDIR)
+ return (EISDIR);
+ return (VOP_READ(REALVP(vp), uiop, ioflag, cr, ct));
+}
+
+static int
+hyprlofs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
+ caller_context_t *ct)
+{
+ /* We don't support writing to non-regular files */
+ if (vp->v_type != VREG)
+ return (EINVAL);
+
+ if (vn_is_readonly(vp))
+ return (EROFS);
+
+ return (VOP_WRITE(REALVP(vp), uiop, ioflag, cr, ct));
+}
+
+/* ARGSUSED */
+static int
+hyprlofs_ioctl(vnode_t *vp, int cmd, intptr_t data, int flag,
+ cred_t *cr, int *rvalp, caller_context_t *ct)
+{
+ int len, cnt, error;
+ int i;
+ model_t model;
+ char path[MAXPATHLEN];
+ char nm[MAXPATHLEN];
+
+ /* We only support the hyprlofs ioctls on the root vnode */
+ if (!(vp->v_flag & VROOT))
+ return (ENOTTY);
+
+ /*
+ * Check if managing hyprlofs is allowed.
+ */
+ if (secpolicy_hyprlofs_control(cr) != 0)
+ return (EPERM);
+
+ if (cmd == HYPRLOFS_ADD_ENTRIES || cmd == HYPRLOFS_RM_ENTRIES) {
+ model = get_udatamodel();
+
+ if (model == DATAMODEL_NATIVE) {
+ hyprlofs_entries_t ebuf;
+ hyprlofs_entry_t *e;
+
+ if (copyin((void *)data, &ebuf, sizeof (ebuf)))
+ return (EFAULT);
+ cnt = ebuf.hle_len;
+ if (cnt > MAX_IOCTL_PARAMS)
+ return (EINVAL);
+ len = sizeof (hyprlofs_entry_t) * cnt;
+
+ e = kmem_alloc(len, KM_SLEEP);
+ if (copyin((void *)(ebuf.hle_entries), e, len)) {
+ kmem_free(e, len);
+ return (EFAULT);
+ }
+
+ for (i = 0; i < cnt; i++) {
+				if (e[i].hle_nlen == 0 ||
+				    e[i].hle_nlen >= MAXPATHLEN) {
+					kmem_free(e, len);
+					return (EINVAL);
+				}
+
+ if (copyin(e[i].hle_name, nm, e[i].hle_nlen)
+ != 0) {
+ kmem_free(e, len);
+ return (EFAULT);
+ }
+ nm[e[i].hle_nlen] = '\0';
+
+ if (cmd == HYPRLOFS_ADD_ENTRIES) {
+					if (e[i].hle_plen == 0 ||
+					    e[i].hle_plen >= MAXPATHLEN) {
+						kmem_free(e, len);
+						return (EINVAL);
+					}
+
+ if (copyin(e[i].hle_path, path,
+ e[i].hle_plen) != 0) {
+ kmem_free(e, len);
+ return (EFAULT);
+ }
+ path[e[i].hle_plen] = '\0';
+
+ if ((error = hyprlofs_add_entry(vp,
+ path, nm, cr, ct)) != 0) {
+ kmem_free(e, len);
+ return (error);
+ }
+ } else {
+ if ((error = hyprlofs_rm_entry(vp, nm,
+ cr, ct, flag)) != 0) {
+ kmem_free(e, len);
+ return (error);
+ }
+ }
+ }
+
+ kmem_free(e, len);
+ return (0);
+
+ } else {
+ hyprlofs_entries32_t ebuf32;
+ hyprlofs_entry32_t *e32;
+
+ if (copyin((void *)data, &ebuf32, sizeof (ebuf32)))
+ return (EFAULT);
+
+ cnt = ebuf32.hle_len;
+ if (cnt > MAX_IOCTL_PARAMS)
+ return (EINVAL);
+ len = sizeof (hyprlofs_entry32_t) * cnt;
+
+ e32 = kmem_alloc(len, KM_SLEEP);
+ if (copyin((void *)(unsigned long)(ebuf32.hle_entries),
+ e32, len)) {
+ kmem_free(e32, len);
+ return (EFAULT);
+ }
+
+ for (i = 0; i < cnt; i++) {
+				if (e32[i].hle_nlen == 0 ||
+				    e32[i].hle_nlen >= MAXPATHLEN) {
+					kmem_free(e32, len);
+					return (EINVAL);
+				}
+
+ if (copyin((void *)(unsigned long)
+ e32[i].hle_name, nm,
+ e32[i].hle_nlen) != 0) {
+ kmem_free(e32, len);
+ return (EFAULT);
+ }
+ nm[e32[i].hle_nlen] = '\0';
+
+ if (cmd == HYPRLOFS_ADD_ENTRIES) {
+					if (e32[i].hle_plen == 0 ||
+					    e32[i].hle_plen >= MAXPATHLEN) {
+						kmem_free(e32, len);
+						return (EINVAL);
+					}
+
+ if (copyin((void *)(unsigned long)
+ e32[i].hle_path, path,
+ e32[i].hle_plen) != 0) {
+ kmem_free(e32, len);
+ return (EFAULT);
+ }
+ path[e32[i].hle_plen] = '\0';
+
+ if ((error = hyprlofs_add_entry(vp,
+ path, nm, cr, ct)) != 0) {
+ kmem_free(e32, len);
+ return (error);
+ }
+ } else {
+ if ((error = hyprlofs_rm_entry(vp, nm,
+ cr, ct, flag)) != 0) {
+ kmem_free(e32, len);
+ return (error);
+ }
+ }
+ }
+
+ kmem_free(e32, len);
+ return (0);
+ }
+ }
+
+ if (cmd == HYPRLOFS_RM_ALL) {
+ return (hyprlofs_rm_all(vp, cr, ct, flag));
+ }
+
+ if (cmd == HYPRLOFS_GET_ENTRIES) {
+ return (hyprlofs_get_all(vp, data, cr, ct, flag));
+ }
+
+ return (ENOTTY);
+}
+
+static int
+hyprlofs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ hlnode_t *tp = (hlnode_t *)VTOHLN(vp);
+ vattr_t tmp_va;
+
+ if (tp->hln_looped == 1) {
+ int error;
+
+ if ((error = VOP_GETATTR(REALVP(vp), &tmp_va, flags, cr,
+ ct)) != 0)
+ return (error);
+ }
+
+ mutex_enter(&tp->hln_tlock);
+ vap->va_type = vp->v_type;
+ vap->va_mode = tp->hln_mode & MODEMASK;
+ vap->va_uid = tp->hln_uid;
+ vap->va_gid = tp->hln_gid;
+ vap->va_fsid = tp->hln_fsid;
+ vap->va_nodeid = (ino64_t)tp->hln_nodeid;
+ vap->va_nlink = tp->hln_nlink;
+ vap->va_size = (u_offset_t)tp->hln_size;
+ vap->va_atime = tp->hln_atime;
+ vap->va_mtime = tp->hln_mtime;
+ vap->va_ctime = tp->hln_ctime;
+ vap->va_blksize = PAGESIZE;
+ vap->va_rdev = tp->hln_rdev;
+ vap->va_seq = tp->hln_seq;
+
+ if (tp->hln_looped == 1) {
+ vap->va_nblocks = tmp_va.va_nblocks;
+ } else {
+ vap->va_nblocks =
+ (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size)));
+ }
+ mutex_exit(&tp->hln_tlock);
+ return (0);
+}
+
+/*ARGSUSED4*/
+static int
+hyprlofs_setattr(vnode_t *vp, vattr_t *vap, int flags,
+ cred_t *cr, caller_context_t *ct)
+{
+ hlnode_t *tp = (hlnode_t *)VTOHLN(vp);
+ int error = 0;
+ vattr_t *get;
+ long mask;
+
+ /*
+ * Cannot set these attributes
+ */
+ if ((vap->va_mask & AT_NOSET) || (vap->va_mask & AT_XVATTR))
+ return (EINVAL);
+
+ mutex_enter(&tp->hln_tlock);
+
+ get = &tp->hln_attr;
+ /*
+ * Change file access modes. Must be owner or have sufficient
+ * privileges.
+ */
+ error = secpolicy_vnode_setattr(cr, vp, vap, get, flags,
+ hyprlofs_taccess, tp);
+
+ if (error)
+ goto out;
+
+ mask = vap->va_mask;
+
+ if (mask & AT_MODE) {
+ get->va_mode &= S_IFMT;
+ get->va_mode |= vap->va_mode & ~S_IFMT;
+ }
+
+ if (mask & AT_UID)
+ get->va_uid = vap->va_uid;
+ if (mask & AT_GID)
+ get->va_gid = vap->va_gid;
+ if (mask & AT_ATIME)
+ get->va_atime = vap->va_atime;
+ if (mask & AT_MTIME)
+ get->va_mtime = vap->va_mtime;
+
+ if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME))
+ gethrestime(&tp->hln_ctime);
+
+out:
+ mutex_exit(&tp->hln_tlock);
+ return (error);
+}
+
+static int
+hyprlofs_access(vnode_t *vp, int mode, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ hlnode_t *tp = (hlnode_t *)VTOHLN(vp);
+ int error;
+
+ if (mode & VWRITE) {
+ if (vp->v_type == VREG && vn_is_readonly(vp))
+ return (EROFS);
+ }
+ if (VTOHLN(vp)->hln_looped == 1)
+ return (VOP_ACCESS(REALVP(vp), mode, flags, cr, ct));
+
+ mutex_enter(&tp->hln_tlock);
+ error = hyprlofs_taccess(tp, mode, cr);
+ mutex_exit(&tp->hln_tlock);
+ return (error);
+}
+
+/* ARGSUSED3 */
+static int
+hyprlofs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
+ int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
+ int *direntflags, pathname_t *realpnp)
+{
+ hlnode_t *tp = (hlnode_t *)VTOHLN(dvp);
+ hlnode_t *ntp = NULL;
+ int error;
+
+ if (VTOHLN(dvp)->hln_looped == 1)
+ return (VOP_LOOKUP(REALVP(dvp), nm, vpp, pnp, flags, rdir,
+ cr, ct, direntflags, realpnp));
+
+ if (flags & LOOKUP_XATTR)
+ return (EINVAL);
+
+ /* Null component name is a synonym for directory being searched. */
+ if (*nm == '\0') {
+ VN_HOLD(dvp);
+ *vpp = dvp;
+ return (0);
+ }
+ ASSERT(tp);
+
+ if ((error = hyprlofs_dirlookup(tp, nm, &ntp, cr)) == 0) {
+ ASSERT(ntp);
+ *vpp = HLNTOV(ntp);
+ }
+ return (error);
+}
+
+/*
+ * Create the loopback from the hyprlofs vnode to the real vnode.
+ */
+static int
+hyprlofs_loopback(vnode_t *dvp, vnode_t *rvp, char *nm, vattr_t *vap,
+ int mode, cred_t *cr, caller_context_t *ct)
+{
+ hlnode_t *parent;
+ hlfsmount_t *tm;
+ int error;
+ hlnode_t *oldtp;
+ vnode_t *vp;
+
+ parent = (hlnode_t *)VTOHLN(dvp);
+ tm = (hlfsmount_t *)VTOHLM(dvp);
+ error = 0;
+ oldtp = NULL;
+
+ if (vap->va_type == VREG && (vap->va_mode & VSVTX)) {
+ /* we don't support the sticky bit */
+ vap->va_mode &= ~VSVTX;
+ } else if (vap->va_type == VNON) {
+ return (EINVAL);
+ }
+
+ /* Null component name is a synonym for directory being searched. */
+ if (*nm == '\0') {
+ VN_HOLD(dvp);
+ oldtp = parent;
+ } else {
+ error = hyprlofs_dirlookup(parent, nm, &oldtp, cr);
+ }
+
+ if (error == 0) { /* name found */
+ ASSERT(oldtp);
+
+ rw_enter(&oldtp->hln_rwlock, RW_WRITER);
+
+ /*
+ * if create/read-only an existing directory, allow it
+ */
+ if ((oldtp->hln_type == VDIR) && (mode & VWRITE))
+ error = EISDIR;
+ else {
+ error = hyprlofs_taccess(oldtp, mode, cr);
+ }
+
+ if (error) {
+ rw_exit(&oldtp->hln_rwlock);
+ hlnode_rele(oldtp);
+ return (error);
+ }
+
+ vp = HLNTOV(oldtp);
+ rw_exit(&oldtp->hln_rwlock);
+
+ if (vp->v_type == VREG) {
+ hlnode_rele(oldtp);
+ return (EEXIST);
+ }
+
+ vnevent_create(vp, ct);
+ return (0);
+ }
+
+ if (error != ENOENT)
+ return (error);
+
+ rw_enter(&parent->hln_rwlock, RW_WRITER);
+ error = hyprlofs_direnter(tm, parent, nm, DE_CREATE, rvp, vap, NULL,
+ cr);
+ rw_exit(&parent->hln_rwlock);
+
+ return (error);
+}
+
+/*
+ * Create an in-memory directory based on the add-entry ioctl name.
+ * If the dir exists, return EEXIST but still also return node in vpp.
+ */
+static int
+hyprlofs_mkdir(vnode_t *dvp, char *nm, vattr_t *va, vnode_t **vpp, cred_t *cr)
+{
+ hlnode_t *parent = (hlnode_t *)VTOHLN(dvp);
+ hlnode_t *self = NULL;
+ hlfsmount_t *tm = (hlfsmount_t *)VTOHLM(dvp);
+ int error;
+
+ /*
+	 * This might be a dangling directory. Catch it here, because an ENOENT
+	 * return from hyprlofs_dirlookup() is otherwise valid.
+ */
+ if (parent->hln_nlink == 0)
+ return (ENOENT);
+
+ error = hyprlofs_dirlookup(parent, nm, &self, cr);
+ if (error == 0) {
+ ASSERT(self);
+ hlnode_rele(self);
+ /* We can't loop in under a looped in directory */
+ if (self->hln_looped)
+ return (EACCES);
+ *vpp = HLNTOV(self);
+ return (EEXIST);
+ }
+ if (error != ENOENT)
+ return (error);
+
+ rw_enter(&parent->hln_rwlock, RW_WRITER);
+ error = hyprlofs_direnter(tm, parent, nm, DE_MKDIR, (vnode_t *)NULL,
+ va, &self, cr);
+ rw_exit(&parent->hln_rwlock);
+
+ if (error == 0 || error == EEXIST) {
+ hlnode_rele(self);
+ *vpp = HLNTOV(self);
+ }
+
+ return (error);
+}
+
+/*
+ * Loop in a file or directory into the namespace.
+ */
+static int
+hyprlofs_add_entry(vnode_t *vp, char *fspath, char *fsname,
+ cred_t *cr, caller_context_t *ct)
+{
+ int error;
+ char *p, *pnm;
+ vnode_t *realvp, *dvp;
+ vattr_t va;
+
+ /*
+ * Get the vnode for the real file/dir. We'll have a hold on realvp,
+ * which we won't vn_rele until hyprlofs_inactive.
+ */
+ if ((error = lookupname(fspath, UIO_SYSSPACE, FOLLOW, NULLVPP,
+ &realvp)) != 0)
+ return (error);
+
+ /* no devices allowed */
+ if (IS_DEVVP(realvp)) {
+ VN_RELE(realvp);
+ return (ENODEV);
+ }
+
+ /*
+ * realvp may be an AUTOFS node, in which case we perform a VOP_ACCESS
+ * to trigger the mount of the intended filesystem. This causes a
+ * loopback mount of the intended filesystem instead of the AUTOFS
+ * filesystem.
+ */
+ if ((error = VOP_ACCESS(realvp, 0, 0, cr, NULL)) != 0) {
+ VN_RELE(realvp);
+ return (error);
+ }
+
+ /*
+ * We're interested in the topmost filesystem. This is especially
+ * important when fspath is a trigger AUTOFS node, since we're really
+ * interested in mounting the filesystem AUTOFS mounted as a result of
+ * the VOP_ACCESS() call, not the AUTOFS node itself.
+ */
+ if (vn_mountedvfs(realvp) != NULL) {
+ if ((error = traverse(&realvp)) != 0) {
+ VN_RELE(realvp);
+ return (error);
+ }
+ }
+
+ va.va_type = VNON;
+ /*
+ * If the target name is a path, make sure we have all of the
+ * intermediate directories, creating them if necessary.
+ */
+ dvp = vp;
+ pnm = p = fsname;
+
+ /* path cannot be absolute */
+ if (*p == '/') {
+ VN_RELE(realvp);
+ return (EINVAL);
+ }
+
+ for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) {
+ if (va.va_type == VNON)
+ /* use the top-level dir as the template va for mkdir */
+ if ((error = VOP_GETATTR(vp, &va, 0, cr, NULL)) != 0) {
+ VN_RELE(realvp);
+ return (error);
+ }
+
+ *p = '\0';
+
+ /* Path component cannot be empty or relative */
+ if (pnm[0] == '\0' ||
+ (pnm[0] == '.' && pnm[1] == '.' && pnm[2] == '\0')) {
+ VN_RELE(realvp);
+ return (EINVAL);
+ }
+
+ if ((error = hyprlofs_mkdir(dvp, pnm, &va, &dvp, cr)) != 0 &&
+ error != EEXIST) {
+ VN_RELE(realvp);
+ return (error);
+ }
+
+ *p = '/';
+ pnm = p + 1;
+ }
+
+ /* The file name is required */
+ if (pnm[0] == '\0') {
+ VN_RELE(realvp);
+ return (EINVAL);
+ }
+
+ /* Now use the real file's va as the template va */
+ if ((error = VOP_GETATTR(realvp, &va, 0, cr, NULL)) != 0) {
+ VN_RELE(realvp);
+ return (error);
+ }
+
+ /* Make the vnode */
+ error = hyprlofs_loopback(dvp, realvp, pnm, &va, va.va_mode, cr, ct);
+ if (error != 0)
+ VN_RELE(realvp);
+ return (error);
+}
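
The component walk above can be illustrated with a small, purely userland
sketch (not part of the patch): every '/'-separated component except the
last names an intermediate in-memory directory to create via
hyprlofs_mkdir(), and the final component is the entry handed to
hyprlofs_loopback().

	#include <stdio.h>
	#include <string.h>

	int
	main(void)
	{
		char name[] = "a/b/c";	/* stands in for fsname */
		char *pnm = name, *p;

		while ((p = strchr(pnm, '/')) != NULL) {
			*p = '\0';	/* isolate the next component */
			printf("mkdir component: %s\n", pnm);
			*p = '/';	/* restore the separator */
			pnm = p + 1;	/* continue with the remainder */
		}
		printf("loopback target: %s\n", pnm);
		return (0);
	}
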
+
+/*
+ * Remove a looped in file from the namespace.
+ */
+static int
+hyprlofs_rm_entry(vnode_t *dvp, char *fsname, cred_t *cr, caller_context_t *ct,
+ int flags)
+{
+ int error;
+ char *p, *pnm;
+ hlnode_t *parent;
+ hlnode_t *fndtp;
+
+ pnm = p = fsname;
+
+ /* path cannot be absolute */
+ if (*p == '/')
+ return (EINVAL);
+
+ /*
+ * If the target name is a path, get the containing dir and simple
+ * file name.
+ */
+ parent = (hlnode_t *)VTOHLN(dvp);
+ for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) {
+ *p = '\0';
+
+ /* Path component cannot be empty or relative */
+ if (pnm[0] == '\0' ||
+ (pnm[0] == '.' && pnm[1] == '.' && pnm[2] == '\0'))
+ return (EINVAL);
+
+ if ((error = hyprlofs_dirlookup(parent, pnm, &fndtp, cr)) != 0)
+ return (error);
+
+ dvp = HLNTOV(fndtp);
+ parent = fndtp;
+ pnm = p + 1;
+ }
+
+ /* The file name is required */
+ if (pnm[0] == '\0')
+ return (EINVAL);
+
+ /* Remove the entry from the parent dir */
+ return (hyprlofs_remove(dvp, pnm, cr, ct, flags));
+}
+
+/*
+ * Remove all looped in files from the namespace.
+ */
+static int
+hyprlofs_rm_all(vnode_t *dvp, cred_t *cr, caller_context_t *ct,
+ int flags)
+{
+ int error = 0;
+ hlnode_t *hp = (hlnode_t *)VTOHLN(dvp);
+ hldirent_t *hdp;
+
+ hlnode_hold(hp);
+
+ /*
+ * There's a window here where someone could have removed
+ * all the entries in the directory after we put a hold on the
+ * vnode but before we grabbed the rwlock. Just return.
+ */
+ if (hp->hln_dir == NULL) {
+ if (hp->hln_nlink) {
+ panic("empty directory 0x%p", (void *)hp);
+ /*NOTREACHED*/
+ }
+ goto done;
+ }
+
+ hdp = hp->hln_dir;
+ while (hdp) {
+ hlnode_t *fndhp;
+
+ if (strcmp(hdp->hld_name, ".") == 0 ||
+ strcmp(hdp->hld_name, "..") == 0) {
+ hdp = hdp->hld_next;
+ continue;
+ }
+
+ /* This holds the fndhp vnode */
+ error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr);
+ if (error != 0)
+ goto done;
+ hlnode_rele(fndhp);
+
+ if (fndhp->hln_looped == 0) {
+ /* recursively remove contents of this subdir */
+ if (fndhp->hln_type == VDIR) {
+ vnode_t *tvp = HLNTOV(fndhp);
+
+ error = hyprlofs_rm_all(tvp, cr, ct, flags);
+ if (error != 0)
+ goto done;
+ }
+ }
+
+ /* remove the entry */
+ error = hyprlofs_remove(dvp, hdp->hld_name, cr, ct, flags);
+ if (error != 0)
+ goto done;
+
+ hdp = hp->hln_dir;
+ }
+
+done:
+ hlnode_rele(hp);
+ return (error);
+}
+
+/*
+ * Get a list of all looped in files in the namespace.
+ */
+static int
+hyprlofs_get_all_entries(vnode_t *dvp, hyprlofs_curr_entry_t *hcp,
+ char *prefix, int *pcnt, int n_max,
+ cred_t *cr, caller_context_t *ct, int flags)
+{
+ int error = 0;
+ int too_big = 0;
+ int cnt;
+ int len;
+ hlnode_t *hp = (hlnode_t *)VTOHLN(dvp);
+ hldirent_t *hdp;
+ char *path;
+
+ cnt = *pcnt;
+ path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ hlnode_hold(hp);
+
+ /*
+ * There's a window here where someone could have removed
+ * all the entries in the directory after we put a hold on the
+ * vnode but before we grabbed the rwlock. Just return.
+ */
+ if (hp->hln_dir == NULL) {
+ if (hp->hln_nlink) {
+ panic("empty directory 0x%p", (void *)hp);
+ /*NOTREACHED*/
+ }
+ goto done;
+ }
+
+ hdp = hp->hln_dir;
+ while (hdp) {
+ hlnode_t *fndhp;
+ vnode_t *tvp;
+
+ if (strcmp(hdp->hld_name, ".") == 0 ||
+ strcmp(hdp->hld_name, "..") == 0) {
+ hdp = hdp->hld_next;
+ continue;
+ }
+
+ /* This holds the fndhp vnode */
+ error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr);
+ if (error != 0)
+ goto done;
+ hlnode_rele(fndhp);
+
+ if (fndhp->hln_looped == 0) {
+ /* recursively get contents of this subdir */
+ VERIFY(fndhp->hln_type == VDIR);
+ tvp = HLNTOV(fndhp);
+
+ if (*prefix == '\0')
+ (void) strlcpy(path, hdp->hld_name, MAXPATHLEN);
+ else
+ (void) snprintf(path, MAXPATHLEN, "%s/%s",
+ prefix, hdp->hld_name);
+
+ error = hyprlofs_get_all_entries(tvp, hcp, path,
+ &cnt, n_max, cr, ct, flags);
+
+ if (error == E2BIG) {
+ too_big = 1;
+ error = 0;
+ }
+ if (error != 0)
+ goto done;
+ } else {
+ if (cnt < n_max) {
+ char *p;
+
+ if (*prefix == '\0')
+ (void) strlcpy(path, hdp->hld_name,
+ MAXPATHLEN);
+ else
+ (void) snprintf(path, MAXPATHLEN,
+ "%s/%s", prefix, hdp->hld_name);
+
+ len = strlen(path);
+ ASSERT(len <= MAXPATHLEN);
+ if (copyout(path, (void *)(hcp[cnt].hce_name),
+ len)) {
+ error = EFAULT;
+ goto done;
+ }
+
+ tvp = REALVP(HLNTOV(fndhp));
+ if (tvp->v_path == vn_vpath_empty) {
+ p = "<unknown>";
+ } else {
+ p = tvp->v_path;
+ }
+ len = strlen(p);
+ ASSERT(len <= MAXPATHLEN);
+ if (copyout(p, (void *)(hcp[cnt].hce_path),
+ len)) {
+ error = EFAULT;
+ goto done;
+ }
+ }
+
+ cnt++;
+ if (cnt > n_max)
+ too_big = 1;
+ }
+
+ hdp = hdp->hld_next;
+ }
+
+done:
+ hlnode_rele(hp);
+ kmem_free(path, MAXPATHLEN);
+
+ *pcnt = cnt;
+ if (error == 0 && too_big == 1)
+ error = E2BIG;
+
+ return (error);
+}
+
+/*
+ * Return a list of all looped in files in the namespace.
+ */
+static int
+hyprlofs_get_all(vnode_t *dvp, intptr_t data, cred_t *cr, caller_context_t *ct,
+ int flags)
+{
+ int limit, cnt, error;
+ model_t model;
+ hyprlofs_curr_entry_t *e;
+
+ model = get_udatamodel();
+
+ if (model == DATAMODEL_NATIVE) {
+ hyprlofs_curr_entries_t ebuf;
+
+ if (copyin((void *)data, &ebuf, sizeof (ebuf)))
+ return (EFAULT);
+ limit = ebuf.hce_cnt;
+ e = ebuf.hce_entries;
+ if (limit > MAX_IOCTL_PARAMS)
+ return (EINVAL);
+
+ } else {
+ hyprlofs_curr_entries32_t ebuf32;
+
+ if (copyin((void *)data, &ebuf32, sizeof (ebuf32)))
+ return (EFAULT);
+
+ limit = ebuf32.hce_cnt;
+ e = (hyprlofs_curr_entry_t *)(unsigned long)
+ (ebuf32.hce_entries);
+ if (limit > MAX_IOCTL_PARAMS)
+ return (EINVAL);
+ }
+
+ cnt = 0;
+ error = hyprlofs_get_all_entries(dvp, e, "", &cnt, limit, cr, ct,
+ flags);
+
+ if (error == 0 || error == E2BIG) {
+ if (model == DATAMODEL_NATIVE) {
+ hyprlofs_curr_entries_t ebuf;
+
+ ebuf.hce_cnt = cnt;
+ if (copyout(&ebuf, (void *)data, sizeof (ebuf)))
+ return (EFAULT);
+
+ } else {
+ hyprlofs_curr_entries32_t ebuf32;
+
+ ebuf32.hce_cnt = cnt;
+ if (copyout(&ebuf32, (void *)data, sizeof (ebuf32)))
+ return (EFAULT);
+ }
+ }
+
+ return (error);
+}
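
For readers unfamiliar with the ILP32/LP64 handling above, here is a hedged
userland illustration (the struct names are hypothetical, not the real ioctl
structures): a 32-bit caller supplies a 32-bit pointer field, which the
kernel widens through an unsigned long cast, just as hyprlofs_get_all()
does for hce_entries.

	#include <stdio.h>
	#include <stdint.h>

	struct entries_ilp32 {
		uint32_t	cnt;		/* number of entries supplied */
		uint32_t	entries;	/* 32-bit user pointer */
	};

	int
	main(void)
	{
		struct entries_ilp32 e32 = { 4, 0x08040000U };
		void *widened = (void *)(unsigned long)e32.entries;

		printf("cnt=%u entries=%p\n", (unsigned int)e32.cnt, widened);
		return (0);
	}
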
+
+/* ARGSUSED3 */
+static int
+hyprlofs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
+ int flags)
+{
+ hlnode_t *parent = (hlnode_t *)VTOHLN(dvp);
+ int error;
+ hlnode_t *hp = NULL;
+
+ /* This holds the hp vnode */
+ error = hyprlofs_dirlookup(parent, nm, &hp, cr);
+ if (error)
+ return (error);
+
+ ASSERT(hp);
+ rw_enter(&parent->hln_rwlock, RW_WRITER);
+ rw_enter(&hp->hln_rwlock, RW_WRITER);
+
+ error = hyprlofs_dirdelete(parent, hp, nm, DR_REMOVE, cr);
+
+ rw_exit(&hp->hln_rwlock);
+ rw_exit(&parent->hln_rwlock);
+ vnevent_remove(HLNTOV(hp), dvp, nm, ct);
+
+ /*
+ * We've now dropped the dir link, so by rele-ing our vnode we should
+ * clean up in hyprlofs_inactive.
+ */
+ hlnode_rele(hp);
+
+ return (error);
+}
+
+/* ARGSUSED4 */
+static int
+hyprlofs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
+ caller_context_t *ct, int flags)
+{
+ hlnode_t *parent = (hlnode_t *)VTOHLN(dvp);
+ hlnode_t *self = NULL;
+ vnode_t *vp;
+ int error = 0;
+
+ /* Return error if removing . or .. */
+ if (strcmp(nm, ".") == 0)
+ return (EINVAL);
+ if (strcmp(nm, "..") == 0)
+ return (EEXIST); /* Should be ENOTEMPTY */
+ error = hyprlofs_dirlookup(parent, nm, &self, cr);
+ if (error)
+ return (error);
+
+ rw_enter(&parent->hln_rwlock, RW_WRITER);
+ rw_enter(&self->hln_rwlock, RW_WRITER);
+
+ vp = HLNTOV(self);
+ if (vp == dvp || vp == cdir) {
+ error = EINVAL;
+ goto done1;
+ }
+ if (self->hln_type != VDIR) {
+ error = ENOTDIR;
+ goto done1;
+ }
+
+ /*
+ * When a dir is looped in, we only remove the in-memory dir, not the
+ * backing dir.
+ */
+ if (self->hln_looped == 0) {
+ mutex_enter(&self->hln_tlock);
+ if (self->hln_nlink > 2) {
+ mutex_exit(&self->hln_tlock);
+ error = EEXIST;
+ goto done1;
+ }
+ mutex_exit(&self->hln_tlock);
+
+ if (vn_vfswlock(vp)) {
+ error = EBUSY;
+ goto done1;
+ }
+ if (vn_mountedvfs(vp) != NULL) {
+ error = EBUSY;
+ goto done;
+ }
+
+ /*
+ * Check for an empty directory, i.e. one that only includes
+ * entries for "." and "..".
+ */
+ if (self->hln_dirents > 2) {
+ error = EEXIST; /* SIGH should be ENOTEMPTY */
+ /*
+ * Update atime because checking hln_dirents is
+ * equivalent to reading the directory
+ */
+ gethrestime(&self->hln_atime);
+ goto done;
+ }
+
+ error = hyprlofs_dirdelete(parent, self, nm, DR_RMDIR, cr);
+ } else {
+ error = hyprlofs_dirdelete(parent, self, nm, DR_REMOVE, cr);
+ }
+
+done:
+ if (self->hln_looped == 0)
+ vn_vfsunlock(vp);
+done1:
+ rw_exit(&self->hln_rwlock);
+ rw_exit(&parent->hln_rwlock);
+ vnevent_rmdir(HLNTOV(self), dvp, nm, ct);
+
+ /*
+ * We've now dropped the dir link, so by rele-ing our vnode we should
+ * clean up in hyprlofs_inactive.
+ */
+ hlnode_rele(self);
+
+ return (error);
+}
+
+static int
+hyprlofs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
+ caller_context_t *ct, int flags)
+{
+ hlnode_t *hp = (hlnode_t *)VTOHLN(vp);
+ hldirent_t *hdp;
+ int error = 0;
+ size_t namelen;
+ struct dirent64 *dp;
+ ulong_t offset;
+ ulong_t total_bytes_wanted;
+ long outcount = 0;
+ long bufsize;
+ int reclen;
+ caddr_t outbuf;
+
+ if (VTOHLN(vp)->hln_looped == 1)
+ return (VOP_READDIR(REALVP(vp), uiop, cr, eofp, ct, flags));
+
+ if (uiop->uio_loffset >= MAXOFF_T) {
+ if (eofp)
+ *eofp = 1;
+ return (0);
+ }
+ /* assuming syscall has already called hln_rwlock */
+ ASSERT(RW_READ_HELD(&hp->hln_rwlock));
+
+ if (uiop->uio_iovcnt != 1)
+ return (EINVAL);
+
+ if (vp->v_type != VDIR)
+ return (ENOTDIR);
+
+ /*
+ * There's a window here where someone could have removed
+ * all the entries in the directory after we put a hold on the
+ * vnode but before we grabbed the rwlock. Just return.
+ */
+ if (hp->hln_dir == NULL) {
+ if (hp->hln_nlink) {
+ panic("empty directory 0x%p", (void *)hp);
+ /*NOTREACHED*/
+ }
+ return (0);
+ }
+
+ /* Get space for multiple dir entries */
+ total_bytes_wanted = uiop->uio_iov->iov_len;
+ bufsize = total_bytes_wanted + sizeof (struct dirent64);
+ outbuf = kmem_alloc(bufsize, KM_SLEEP);
+
+ dp = (struct dirent64 *)((uintptr_t)outbuf);
+
+ offset = 0;
+ hdp = hp->hln_dir;
+ while (hdp) {
+ namelen = strlen(hdp->hld_name); /* no +1 needed */
+ offset = hdp->hld_offset;
+ if (offset >= uiop->uio_offset) {
+ reclen = (int)DIRENT64_RECLEN(namelen);
+ if (outcount + reclen > total_bytes_wanted) {
+ if (!outcount)
+ /* Buffer too small for any entries. */
+ error = EINVAL;
+ break;
+ }
+ ASSERT(hdp->hld_hlnode != NULL);
+
+ /* zero out uninitialized bytes */
+ (void) strncpy(dp->d_name, hdp->hld_name,
+ DIRENT64_NAMELEN(reclen));
+ dp->d_reclen = (ushort_t)reclen;
+ dp->d_ino = (ino64_t)hdp->hld_hlnode->hln_nodeid;
+ dp->d_off = (offset_t)hdp->hld_offset + 1;
+ dp = (struct dirent64 *)
+ ((uintptr_t)dp + dp->d_reclen);
+ outcount += reclen;
+ ASSERT(outcount <= bufsize);
+ }
+ hdp = hdp->hld_next;
+ }
+
+ if (!error)
+ error = uiomove(outbuf, outcount, UIO_READ, uiop);
+
+ if (!error) {
+ /*
+ * If we reached the end of the list, our offset should now be
+ * just past the end.
+ */
+ if (!hdp) {
+ offset += 1;
+ if (eofp)
+ *eofp = 1;
+ } else if (eofp)
+ *eofp = 0;
+ uiop->uio_offset = offset;
+ }
+ gethrestime(&hp->hln_atime);
+ kmem_free(outbuf, bufsize);
+ return (error);
+}
+
+static int
+hyprlofs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
+{
+ if (VTOHLN(vp)->hln_looped == 1)
+ return (VOP_FSYNC(REALVP(vp), syncflag, cr, ct));
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+hyprlofs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+ hlnode_t *hp = (hlnode_t *)VTOHLN(vp);
+ hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vp->v_vfsp);
+
+ rw_enter(&hp->hln_rwlock, RW_WRITER);
+
+ mutex_enter(&hp->hln_tlock);
+ mutex_enter(&vp->v_lock);
+ ASSERT(vp->v_count >= 1);
+
+ /*
+ * If we don't have the last hold or the link count is non-zero,
+ * there's nothing to do except drop our hold.
+ */
+ if (vp->v_count > 1 || hp->hln_nlink != 0) {
+ vp->v_count--;
+ mutex_exit(&vp->v_lock);
+ mutex_exit(&hp->hln_tlock);
+ rw_exit(&hp->hln_rwlock);
+ return;
+ }
+
+ mutex_exit(&vp->v_lock);
+ mutex_exit(&hp->hln_tlock);
+
+ /* release hold on the real vnode now */
+ if (hp->hln_looped == 1 && hp->hln_realvp != NULL)
+ VN_RELE(hp->hln_realvp);
+
+ /* Here's our chance to send invalid event while we're between locks */
+ vn_invalid(HLNTOV(hp));
+
+ mutex_enter(&hm->hlm_contents);
+ if (hp->hln_forw == NULL)
+ hm->hlm_rootnode->hln_back = hp->hln_back;
+ else
+ hp->hln_forw->hln_back = hp->hln_back;
+ hp->hln_back->hln_forw = hp->hln_forw;
+ mutex_exit(&hm->hlm_contents);
+ rw_exit(&hp->hln_rwlock);
+ rw_destroy(&hp->hln_rwlock);
+ mutex_destroy(&hp->hln_tlock);
+ vn_free(HLNTOV(hp));
+ kmem_free(hp, sizeof (hlnode_t));
+}
+
+static int
+hyprlofs_fid(vnode_t *vp, struct fid *fidp, caller_context_t *ct)
+{
+ hlnode_t *hp = (hlnode_t *)VTOHLN(vp);
+ hlfid_t *hfid;
+
+ if (VTOHLN(vp)->hln_looped == 1)
+ return (VOP_FID(REALVP(vp), fidp, ct));
+
+ if (fidp->fid_len < (sizeof (hlfid_t) - sizeof (ushort_t))) {
+ fidp->fid_len = sizeof (hlfid_t) - sizeof (ushort_t);
+ return (ENOSPC);
+ }
+
+ hfid = (hlfid_t *)fidp;
+ bzero(hfid, sizeof (hlfid_t));
+ hfid->hlfid_len = (int)sizeof (hlfid_t) - sizeof (ushort_t);
+
+ hfid->hlfid_ino = hp->hln_nodeid;
+ hfid->hlfid_gen = hp->hln_gen;
+
+ return (0);
+}
+
+static int
+hyprlofs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
+ page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw,
+ cred_t *cr, caller_context_t *ct)
+{
+ /* return EACCES to be consistent with mmap */
+ if (VTOHLN(vp)->hln_looped != 1)
+ return (EACCES);
+ return (VOP_GETPAGE(REALVP(vp), off, len, protp, pl, plsz, seg, addr,
+ rw, cr, ct));
+}
+
+int
+hyprlofs_putpage(vnode_t *vp, offset_t off, size_t len, int flags,
+ cred_t *cr, caller_context_t *ct)
+{
+ /* return EACCES to be consistent with mmap */
+ if (VTOHLN(vp)->hln_looped != 1)
+ return (EACCES);
+ return (VOP_PUTPAGE(REALVP(vp), off, len, flags, cr, ct));
+}
+
+static int
+hyprlofs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
+ size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ /* return EACCES to be consistent with mmap */
+ if (VTOHLN(vp)->hln_looped != 1)
+ return (EACCES);
+ return (VOP_MAP(REALVP(vp), off, as, addrp, len, prot, maxprot, flags,
+ cr, ct));
+}
+
+static int
+hyprlofs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
+ size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ /* return EACCES to be consistent with mmap */
+ if (VTOHLN(vp)->hln_looped != 1)
+ return (EACCES);
+ return (VOP_ADDMAP(REALVP(vp), off, as, addr, len, prot, maxprot,
+ flags, cr, ct));
+}
+
+static int
+hyprlofs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
+ size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ /* return EACCES to be consistent with mmap */
+ if (VTOHLN(vp)->hln_looped != 1)
+ return (EACCES);
+ return (VOP_DELMAP(REALVP(vp), off, as, addr, len, prot, maxprot,
+ flags, cr, ct));
+}
+
+static int
+hyprlofs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
+ offset_t offset, cred_t *cr, caller_context_t *ct)
+{
+ /* return EACCES to be consistent with mmap */
+ if (VTOHLN(vp)->hln_looped != 1)
+ return (EACCES);
+ return (VOP_SPACE(REALVP(vp), cmd, bfp, flag, offset, cr, ct));
+}
+
+static int
+hyprlofs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
+ caller_context_t *ct)
+{
+ if (VTOHLN(vp)->hln_looped == 0)
+ return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
+
+ return (VOP_SEEK(REALVP(vp), ooff, noffp, ct));
+}
+
+static int
+hyprlofs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct)
+{
+ hlnode_t *hp = VTOHLN(vp);
+
+ if (hp->hln_looped == 1)
+ return (VOP_RWLOCK(REALVP(vp), write_lock, ct));
+
+ if (write_lock) {
+ rw_enter(&hp->hln_rwlock, RW_WRITER);
+ } else {
+ rw_enter(&hp->hln_rwlock, RW_READER);
+ }
+ return (write_lock);
+}
+
+static void
+hyprlofs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct)
+{
+ hlnode_t *hp = VTOHLN(vp);
+
+ if (hp->hln_looped == 1) {
+ VOP_RWUNLOCK(REALVP(vp), write_lock, ct);
+ return;
+ }
+
+ rw_exit(&hp->hln_rwlock);
+}
+
+static int
+hyprlofs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
+ caller_context_t *ct)
+{
+ int error;
+
+ if (VTOHLN(vp)->hln_looped == 1)
+ return (VOP_PATHCONF(REALVP(vp), cmd, valp, cr, ct));
+
+ switch (cmd) {
+ case _PC_XATTR_ENABLED:
+ case _PC_XATTR_EXISTS:
+ case _PC_SATTR_ENABLED:
+ case _PC_SATTR_EXISTS:
+ error = EINVAL;
+ break;
+ case _PC_TIMESTAMP_RESOLUTION:
+ /* nanosecond timestamp resolution */
+ *valp = 1L;
+ error = 0;
+ break;
+ default:
+ error = fs_pathconf(vp, cmd, valp, cr, ct);
+ }
+ return (error);
+}
+
+
+struct vnodeops *hyprlofs_vnodeops;
+
+const fs_operation_def_t hyprlofs_vnodeops_template[] = {
+ VOPNAME_OPEN, { .vop_open = hyprlofs_open },
+ VOPNAME_CLOSE, { .vop_close = hyprlofs_close },
+ VOPNAME_READ, { .vop_read = hyprlofs_read },
+ VOPNAME_WRITE, { .vop_write = hyprlofs_write },
+ VOPNAME_IOCTL, { .vop_ioctl = hyprlofs_ioctl },
+ VOPNAME_GETATTR, { .vop_getattr = hyprlofs_getattr },
+ VOPNAME_SETATTR, { .vop_setattr = hyprlofs_setattr },
+ VOPNAME_ACCESS, { .vop_access = hyprlofs_access },
+ VOPNAME_LOOKUP, { .vop_lookup = hyprlofs_lookup },
+ VOPNAME_CREATE, { .error = fs_error },
+ VOPNAME_REMOVE, { .vop_remove = hyprlofs_remove },
+ VOPNAME_LINK, { .error = fs_error },
+ VOPNAME_RENAME, { .error = fs_error },
+ VOPNAME_MKDIR, { .error = fs_error },
+ VOPNAME_RMDIR, { .vop_rmdir = hyprlofs_rmdir },
+ VOPNAME_READDIR, { .vop_readdir = hyprlofs_readdir },
+ VOPNAME_SYMLINK, { .error = fs_error },
+ VOPNAME_READLINK, { .error = fs_error },
+ VOPNAME_FSYNC, { .vop_fsync = hyprlofs_fsync },
+ VOPNAME_INACTIVE, { .vop_inactive = hyprlofs_inactive },
+ VOPNAME_FID, { .vop_fid = hyprlofs_fid },
+ VOPNAME_RWLOCK, { .vop_rwlock = hyprlofs_rwlock },
+ VOPNAME_RWUNLOCK, { .vop_rwunlock = hyprlofs_rwunlock },
+ VOPNAME_SEEK, { .vop_seek = hyprlofs_seek },
+ VOPNAME_SPACE, { .vop_space = hyprlofs_space },
+ VOPNAME_GETPAGE, { .vop_getpage = hyprlofs_getpage },
+ VOPNAME_PUTPAGE, { .vop_putpage = hyprlofs_putpage },
+ VOPNAME_MAP, { .vop_map = hyprlofs_map },
+ VOPNAME_ADDMAP, { .vop_addmap = hyprlofs_addmap },
+ VOPNAME_DELMAP, { .vop_delmap = hyprlofs_delmap },
+ VOPNAME_PATHCONF, { .vop_pathconf = hyprlofs_pathconf },
+ VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
+ NULL, NULL
+};
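
The template above is only the declaration of the operations vector; the
registration presumably happens in hyprlofs_vfsops.c (not shown in this
hunk), following the same vn_make_ops() pattern that lxpr_init() uses
further below. A minimal sketch, assuming a filesystem name of "hyprlofs":

	int error;

	error = vn_make_ops("hyprlofs", hyprlofs_vnodeops_template,
	    &hyprlofs_vnodeops);
	if (error != 0)
		cmn_err(CE_WARN, "hyprlofs: bad vnode ops template");
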
diff --git a/usr/src/uts/common/fs/lookup.c b/usr/src/uts/common/fs/lookup.c
index 55ffb94805..59ec5d1829 100644
--- a/usr/src/uts/common/fs/lookup.c
+++ b/usr/src/uts/common/fs/lookup.c
@@ -21,6 +21,7 @@
/*
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc. All rights reserved.
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
*/
@@ -57,6 +58,7 @@
#include <sys/zone.h>
#include <sys/dnlc.h>
#include <sys/fs/snode.h>
+#include <sys/brand.h>
/* Controls whether paths are stored with vnodes. */
int vfs_vnode_path = 1;
@@ -977,6 +979,96 @@ localpath(char *path, struct vnode *vrootp, cred_t *cr)
}
/*
+ * Clean a stale v_path from a vnode. This is only performed if the v_path has
+ * not been altered since it was found to be stale.
+ */
+static void
+vnode_clear_vpath(vnode_t *vp, char *vpath_old)
+{
+ mutex_enter(&vp->v_lock);
+ if (vp->v_path != vn_vpath_empty && vp->v_path == vpath_old) {
+ vp->v_path = vn_vpath_empty;
+ mutex_exit(&vp->v_lock);
+ kmem_free(vpath_old, strlen(vpath_old) + 1);
+ } else {
+ mutex_exit(&vp->v_lock);
+ }
+}
+
+/*
+ * Validate that a pathname refers to a given vnode.
+ */
+static int
+vnode_valid_pn(vnode_t *vp, vnode_t *vrootp, pathname_t *pn, pathname_t *rpn,
+ int flags, cred_t *cr)
+{
+ vnode_t *compvp;
+ /*
+ * If we are in a zone or a chroot environment, then we have to
+ * take additional steps, since the path to the root might not
+ * be readable with the current credentials, even though the
+ * process can legitimately access the file. In this case, we
+ * do the following:
+ *
+ * lookuppnvp() with all privileges to get the resolved path.
+ * call localpath() to get the local portion of the path, and
+ * continue as normal.
+ *
+ * If the conversion to a local path fails, then we continue
+ * as normal. This is a heuristic to make process object file
+ * paths available from within a zone. Because lofs doesn't
+ * support page operations, the vnode stored in the seg_t is
+ * actually the underlying real vnode, not the lofs node itself.
+ * Most of the time, the lofs path is the same as the underlying
+ * vnode (for example, /usr/lib/libc.so.1).
+ */
+ if (vrootp != rootdir) {
+ char *local = NULL;
+
+ VN_HOLD(rootdir);
+ if (lookuppnvp(pn, rpn, FOLLOW, NULL, &compvp, rootdir,
+ rootdir, kcred) == 0) {
+ local = localpath(rpn->pn_path, vrootp, kcred);
+ VN_RELE(compvp);
+ }
+
+ /*
+ * The original pn was changed through lookuppnvp().
+ * Set it to local for next validation attempt.
+ */
+ if (local) {
+ (void) pn_set(pn, local);
+ } else {
+ return (1);
+ }
+ }
+
+ /*
+ * We should have a local path at this point, so start the search from
+ * the root of the current process.
+ */
+ VN_HOLD(vrootp);
+ if (vrootp != rootdir)
+ VN_HOLD(vrootp);
+ if (lookuppnvp(pn, rpn, FOLLOW | flags, NULL, &compvp, vrootp, vrootp,
+ cr) == 0) {
+ /*
+ * Check to see if the returned vnode is the same as the one we
+ * expect.
+ */
+ if (vn_compare(vp, compvp) ||
+ vnode_match(vp, compvp, cr)) {
+ VN_RELE(compvp);
+ return (0);
+ } else {
+ VN_RELE(compvp);
+ }
+ }
+
+ return (1);
+}
+
+/*
* Given a directory, return the full, resolved path. This looks up "..",
* searches for the given vnode in the parent, appends the component, etc. It
* is used to implement vnodetopath() and getcwd() when the cached path fails.
@@ -995,6 +1087,8 @@ dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags,
char *bufloc;
size_t dlen = DIRENT64_RECLEN(MAXPATHLEN);
refstr_t *mntpt;
+ char *vpath_cached;
+ boolean_t vpath_stale;
/* Operation only allowed on directories */
ASSERT(vp->v_type == VDIR);
@@ -1088,40 +1182,28 @@ dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags,
* Shortcut: see if this vnode has correct v_path. If so,
* we have the work done.
*/
+ vpath_cached = NULL;
+ vpath_stale = B_FALSE;
mutex_enter(&vp->v_lock);
- if (vp->v_path != NULL) {
-
- if ((err = pn_set(&pn, vp->v_path)) == 0) {
- mutex_exit(&vp->v_lock);
- rpn.pn_path = rpn.pn_buf;
-
- /*
- * Ensure the v_path pointing to correct vnode
- */
- VN_HOLD(vrootp);
- if (vrootp != rootdir)
- VN_HOLD(vrootp);
- if (lookuppnvp(&pn, &rpn, flags, NULL,
- &cmpvp, vrootp, vrootp, cr) == 0) {
-
- if (VN_CMP(vp, cmpvp)) {
- VN_RELE(cmpvp);
+ if (vp->v_path != vn_vpath_empty &&
+ pn_set(&pn, vp->v_path) == 0) {
+ vpath_cached = vp->v_path;
+ mutex_exit(&vp->v_lock);
+ rpn.pn_path = rpn.pn_buf;
- complen = strlen(rpn.pn_path);
- bufloc -= complen;
- if (bufloc < buf) {
- err = ERANGE;
- goto out;
- }
- bcopy(rpn.pn_path, bufloc,
- complen);
- break;
- } else {
- VN_RELE(cmpvp);
- }
+ /* Ensure the v_path points to the correct vnode */
+ if (vnode_valid_pn(vp, vrootp, &pn, &rpn, flags,
+ cr) == 0) {
+ complen = strlen(rpn.pn_path);
+ bufloc -= complen;
+ if (bufloc < buf) {
+ err = ERANGE;
+ goto out;
}
+ bcopy(rpn.pn_path, bufloc, complen);
+ break;
} else {
- mutex_exit(&vp->v_lock);
+ vpath_stale = B_TRUE;
}
} else {
mutex_exit(&vp->v_lock);
@@ -1166,38 +1248,6 @@ dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags,
}
/*
- * Try to obtain the path component from dnlc cache
- * before searching through the directory.
- */
- if ((cmpvp = dnlc_reverse_lookup(vp, dbuf, dlen)) != NULL) {
- /*
- * If we got parent vnode as a result,
- * then the answered path is correct.
- */
- if (VN_CMP(cmpvp, pvp)) {
- VN_RELE(cmpvp);
- complen = strlen(dbuf);
- bufloc -= complen;
- if (bufloc <= buf) {
- err = ENAMETOOLONG;
- goto out;
- }
- bcopy(dbuf, bufloc, complen);
-
- /* Prepend a slash to the current path */
- *--bufloc = '/';
-
- /* And continue with the next component */
- VN_RELE(vp);
- vp = pvp;
- pvp = NULL;
- continue;
- } else {
- VN_RELE(cmpvp);
- }
- }
-
- /*
* Search the parent directory for the entry corresponding to
* this vnode.
*/
@@ -1215,6 +1265,11 @@ dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags,
/* Prepend a slash to the current path. */
*--bufloc = '/';
+ /* Clear vp->v_path if it was found to be stale. */
+ if (vpath_stale == B_TRUE) {
+ vnode_clear_vpath(vp, vpath_cached);
+ }
+
/* And continue with the next component */
VN_RELE(vp);
vp = pvp;
@@ -1306,144 +1361,49 @@ vnodetopath_common(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen,
VN_RELE(vp);
}
- pn_alloc(&pn);
/*
- * Check to see if we have a cached path in the vnode.
+ * Check to see if we have a valid cached path in the vnode.
*/
+ pn_alloc(&pn);
mutex_enter(&vp->v_lock);
- if (vp->v_path != NULL) {
+ if (vp->v_path != vn_vpath_empty) {
(void) pn_set(&pn, vp->v_path);
mutex_exit(&vp->v_lock);
- pn_alloc(&rpn);
-
/* We should only cache absolute paths */
ASSERT(pn.pn_buf[0] == '/');
- /*
- * If we are in a zone or a chroot environment, then we have to
- * take additional steps, since the path to the root might not
- * be readable with the current credentials, even though the
- * process can legitmately access the file. In this case, we
- * do the following:
- *
- * lookuppnvp() with all privileges to get the resolved path.
- * call localpath() to get the local portion of the path, and
- * continue as normal.
- *
- * If the the conversion to a local path fails, then we continue
- * as normal. This is a heuristic to make process object file
- * paths available from within a zone. Because lofs doesn't
- * support page operations, the vnode stored in the seg_t is
- * actually the underlying real vnode, not the lofs node itself.
- * Most of the time, the lofs path is the same as the underlying
- * vnode (for example, /usr/lib/libc.so.1).
- */
- if (vrootp != rootdir) {
- char *local = NULL;
- VN_HOLD(rootdir);
- if (lookuppnvp(&pn, &rpn, FOLLOW,
- NULL, &compvp, rootdir, rootdir, kcred) == 0) {
- local = localpath(rpn.pn_path, vrootp,
- kcred);
- VN_RELE(compvp);
- }
-
- /*
- * The original pn was changed through lookuppnvp().
- * Set it to local for next validation attempt.
- */
- if (local) {
- (void) pn_set(&pn, local);
- } else {
- goto notcached;
+ pn_alloc(&rpn);
+ if (vnode_valid_pn(vp, vrootp, &pn, &rpn, flags, cr) == 0) {
+ /* Return the result, if we're able. */
+ if (buflen > rpn.pn_pathlen) {
+ bcopy(rpn.pn_path, buf, rpn.pn_pathlen + 1);
+ pn_free(&pn);
+ pn_free(&rpn);
+ VN_RELE(vrootp);
+ if (doclose) {
+ (void) VOP_CLOSE(vp, FREAD, 1, 0, cr,
+ NULL);
+ VN_RELE(vp);
+ }
+ return (0);
}
}
-
/*
- * We should have a local path at this point, so start the
- * search from the root of the current process.
+ * A stale v_path will be purged by the later dirtopath lookup.
*/
- VN_HOLD(vrootp);
- if (vrootp != rootdir)
- VN_HOLD(vrootp);
- ret = lookuppnvp(&pn, &rpn, FOLLOW | flags, NULL,
- &compvp, vrootp, vrootp, cr);
- if (ret == 0) {
- /*
- * Check to see if the returned vnode is the same as
- * the one we expect. If not, give up.
- */
- if (!vn_compare(vp, compvp) &&
- !vnode_match(vp, compvp, cr)) {
- VN_RELE(compvp);
- goto notcached;
- }
-
- VN_RELE(compvp);
-
- /*
- * Return the result.
- */
- if (buflen <= rpn.pn_pathlen)
- goto notcached;
-
- bcopy(rpn.pn_path, buf, rpn.pn_pathlen + 1);
- pn_free(&pn);
- pn_free(&rpn);
- VN_RELE(vrootp);
- if (doclose) {
- (void) VOP_CLOSE(vp, FREAD, 1, 0, cr, NULL);
- VN_RELE(vp);
- }
- return (0);
- }
-
-notcached:
pn_free(&rpn);
} else {
mutex_exit(&vp->v_lock);
}
-
pn_free(&pn);
if (vp->v_type != VDIR) {
- /*
- * If we don't have a directory, try to find it in the dnlc via
- * reverse lookup. Once this is found, we can use the regular
- * directory search to find the full path.
- */
- if ((pvp = dnlc_reverse_lookup(vp, path, MAXNAMELEN)) != NULL) {
- /*
- * Check if we have read privilege so, that
- * we can lookup the path in the directory
- */
- ret = 0;
- if ((flags & LOOKUP_CHECKREAD)) {
- ret = VOP_ACCESS(pvp, VREAD, 0, cr, NULL);
- }
- if (ret == 0) {
- ret = dirtopath(vrootp, pvp, buf, buflen,
- flags, cr);
- }
- if (ret == 0) {
- len = strlen(buf);
- if (len + strlen(path) + 1 >= buflen) {
- ret = ENAMETOOLONG;
- } else {
- if (buf[len - 1] != '/')
- buf[len++] = '/';
- bcopy(path, buf + len,
- strlen(path) + 1);
- }
- }
-
- VN_RELE(pvp);
- } else
- ret = ENOENT;
- } else
+ ret = ENOENT;
+ } else {
ret = dirtopath(vrootp, vp, buf, buflen, flags, cr);
+ }
VN_RELE(vrootp);
if (doclose) {
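
In short, the flow introduced above is (simplified sketch; locking and hold
management omitted): try the cached v_path first, and fall back to the slow
directory walk only when validation fails, at which point the stale cache
entry is purged.

	if (vp->v_path != vn_vpath_empty && pn_set(&pn, vp->v_path) == 0 &&
	    vnode_valid_pn(vp, vrootp, &pn, &rpn, flags, cr) == 0) {
		/* cached path still resolves to vp: use rpn.pn_path */
	} else {
		/*
		 * Stale or missing: dirtopath() rebuilds the path and
		 * vnode_clear_vpath() discards the bad cache entry.
		 */
	}
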
diff --git a/usr/src/uts/common/fs/lxproc/lxpr_subr.c b/usr/src/uts/common/fs/lxproc/lxpr_subr.c
new file mode 100644
index 0000000000..3c1405d4af
--- /dev/null
+++ b/usr/src/uts/common/fs/lxproc/lxpr_subr.c
@@ -0,0 +1,524 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+#include <sys/varargs.h>
+#include <sys/cpuvar.h>
+#include <sys/mman.h>
+#include <sys/vmsystm.h>
+#include <sys/prsystm.h>
+
+#include "lxproc.h"
+
+#define LXPRCACHE_NAME "lxpr_cache"
+
+static int lxpr_node_constructor(void *, void *, int);
+static void lxpr_node_destructor(void *, void *);
+
+static kmem_cache_t *lxpr_node_cache;
+
+struct lxpr_uiobuf {
+ uio_t *uiop;
+ char *buffer;
+ uint32_t buffsize;
+ char *pos;
+ size_t beg;
+ int error;
+};
+
+int lxpr_bufsize = 4000;
+
+struct lxpr_uiobuf *
+lxpr_uiobuf_new(uio_t *uiop)
+{
+ /* Allocate memory for both lxpr_uiobuf and output buffer */
+ int bufsize = lxpr_bufsize;
+ struct lxpr_uiobuf *uiobuf =
+ kmem_alloc(sizeof (struct lxpr_uiobuf) + bufsize, KM_SLEEP);
+
+ uiobuf->uiop = uiop;
+ uiobuf->buffer = (char *)&uiobuf[1];
+ uiobuf->buffsize = bufsize;
+ uiobuf->pos = uiobuf->buffer;
+ uiobuf->beg = 0;
+ uiobuf->error = 0;
+
+ return (uiobuf);
+}
+
+void
+lxpr_uiobuf_free(struct lxpr_uiobuf *uiobuf)
+{
+ ASSERT(uiobuf != NULL);
+ ASSERT(uiobuf->pos == uiobuf->buffer);
+
+ kmem_free(uiobuf, sizeof (struct lxpr_uiobuf) + uiobuf->buffsize);
+}
+
+void
+lxpr_uiobuf_seek(struct lxpr_uiobuf *uiobuf, offset_t offset)
+{
+ uiobuf->uiop->uio_offset = (off_t)offset;
+}
+
+void
+lxpr_uiobuf_seterr(struct lxpr_uiobuf *uiobuf, int err)
+{
+ ASSERT(uiobuf->error == 0);
+
+ uiobuf->error = err;
+}
+
+int
+lxpr_uiobuf_flush(struct lxpr_uiobuf *uiobuf)
+{
+ off_t off = uiobuf->uiop->uio_offset;
+ caddr_t uaddr = uiobuf->buffer;
+ size_t beg = uiobuf->beg;
+ size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr;
+
+ if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) {
+ ASSERT(off >= beg);
+
+ if (beg + size > off && off >= 0)
+ uiobuf->error =
+ uiomove(uaddr + (off - beg), size - (off - beg),
+ UIO_READ, uiobuf->uiop);
+
+ uiobuf->beg += size;
+ }
+
+ uiobuf->pos = uaddr;
+
+ return (uiobuf->error);
+}
+
+void
+lxpr_uiobuf_write(struct lxpr_uiobuf *uiobuf, const char *buf, size_t size)
+{
+ /* While we can still carry on */
+ while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) {
+ uintptr_t remain = (uintptr_t)uiobuf->buffsize -
+ ((uintptr_t)uiobuf->pos - (uintptr_t)uiobuf->buffer);
+
+ /* Enough space in buffer? */
+ if (remain >= size) {
+ bcopy(buf, uiobuf->pos, size);
+ uiobuf->pos += size;
+ return;
+ }
+
+ /* Not enough space, so copy all we can and try again */
+ bcopy(buf, uiobuf->pos, remain);
+ uiobuf->pos += remain;
+ (void) lxpr_uiobuf_flush(uiobuf);
+ buf += remain;
+ size -= remain;
+ }
+}
+
+#define TYPBUFFSIZE 256
+
+void
+lxpr_uiobuf_printf(struct lxpr_uiobuf *uiobuf, const char *fmt, ...)
+{
+ va_list args;
+ char buff[TYPBUFFSIZE];
+ int len;
+ char *buffer;
+
+ /* Can we still do any output? */
+ if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0)
+ return;
+
+ va_start(args, fmt);
+
+ /* Try using stack allocated buffer */
+ len = vsnprintf(buff, TYPBUFFSIZE, fmt, args);
+ if (len < TYPBUFFSIZE) {
+ va_end(args);
+ lxpr_uiobuf_write(uiobuf, buff, len);
+ return;
+ }
+
+ /* Not enough space in pre-allocated buffer */
+ buffer = kmem_alloc(len + 1, KM_SLEEP);
+
+ /*
+ * We know we allocated the correct amount of space,
+ * so there is no need to check the return value.
+ */
+ (void) vsnprintf(buffer, len+1, fmt, args);
+ lxpr_uiobuf_write(uiobuf, buffer, len);
+ va_end(args);
+ kmem_free(buffer, len+1);
+}
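
A hypothetical read handler (the real lxpr_read_* callers live in
lxpr_vnops.c) would drive these helpers roughly as follows; this is a
sketch, not code from the patch:

	static int
	lxpr_example_read(uio_t *uiop)
	{
		struct lxpr_uiobuf *uiobuf = lxpr_uiobuf_new(uiop);
		int error;

		lxpr_uiobuf_printf(uiobuf, "value:\t%d\n", 42);
		error = lxpr_uiobuf_flush(uiobuf);	/* uiomove to caller */
		lxpr_uiobuf_free(uiobuf);	/* pos is back at buffer */
		return (error);
	}
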
+
+/*
+ * lxpr_lock():
+ *
+ * Lookup process from pid and return with p_plock and P_PR_LOCK held.
+ */
+proc_t *
+lxpr_lock(pid_t pid)
+{
+ proc_t *p;
+ kmutex_t *mp;
+
+ ASSERT(!MUTEX_HELD(&pidlock));
+
+ for (;;) {
+ mutex_enter(&pidlock);
+
+ /*
+ * If the pid is 1, we really want the zone's init process
+ */
+ p = prfind((pid == 1) ?
+ curproc->p_zone->zone_proc_initpid : pid);
+
+ if (p == NULL || p->p_stat == SIDL) {
+ mutex_exit(&pidlock);
+ return (NULL);
+ }
+
+ /*
+ * p_lock is persistent, but p itself is not -- it could
+ * vanish during cv_wait(). Load p->p_lock now so we can
+ * drop it after cv_wait() without referencing p.
+ */
+ mp = &p->p_lock;
+ mutex_enter(mp);
+
+ mutex_exit(&pidlock);
+
+ if (p->p_flag & SEXITING) {
+ /*
+ * This process is exiting -- let it go.
+ */
+ mutex_exit(mp);
+ return (NULL);
+ }
+
+ if (!(p->p_proc_flag & P_PR_LOCK))
+ break;
+
+ cv_wait(&pr_pid_cv[p->p_slot], mp);
+ mutex_exit(mp);
+ }
+
+ p->p_proc_flag |= P_PR_LOCK;
+ THREAD_KPRI_REQUEST();
+ return (p);
+}
+
+/*
+ * lxpr_unlock()
+ *
+ * Unlock locked process
+ */
+void
+lxpr_unlock(proc_t *p)
+{
+ ASSERT(p->p_proc_flag & P_PR_LOCK);
+ ASSERT(MUTEX_HELD(&p->p_lock));
+ ASSERT(!MUTEX_HELD(&pidlock));
+
+ cv_signal(&pr_pid_cv[p->p_slot]);
+ p->p_proc_flag &= ~P_PR_LOCK;
+ mutex_exit(&p->p_lock);
+ THREAD_KPRI_RELEASE();
+}
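
The expected usage of the pair above is (sketch only; the ENOENT choice is
hypothetical): lxpr_lock() returns the process with p_lock and P_PR_LOCK
held, or NULL if it is gone or exiting, and every successful call must be
balanced by lxpr_unlock().

	proc_t *p;

	if ((p = lxpr_lock(pid)) == NULL)
		return (ENOENT);
	/* ... examine p while it is locked ... */
	lxpr_unlock(p);
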
+
+void
+lxpr_initnodecache()
+{
+ lxpr_node_cache = kmem_cache_create(LXPRCACHE_NAME,
+ sizeof (lxpr_node_t), 0,
+ lxpr_node_constructor, lxpr_node_destructor, NULL, NULL, NULL, 0);
+}
+
+void
+lxpr_fininodecache()
+{
+ kmem_cache_destroy(lxpr_node_cache);
+}
+
+/* ARGSUSED */
+static int
+lxpr_node_constructor(void *buf, void *un, int kmflags)
+{
+ lxpr_node_t *lxpnp = buf;
+ vnode_t *vp;
+
+ vp = lxpnp->lxpr_vnode = vn_alloc(kmflags);
+ if (vp == NULL)
+ return (-1);
+
+ (void) vn_setops(vp, lxpr_vnodeops);
+ vp->v_data = lxpnp;
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+lxpr_node_destructor(void *buf, void *un)
+{
+ lxpr_node_t *lxpnp = buf;
+
+ vn_free(LXPTOV(lxpnp));
+}
+
+/*
+ * Calculate an inode number
+ *
+ * This takes various bits of info and munges them
+ * to give the inode number for an lxproc node
+ */
+ino_t
+lxpr_inode(lxpr_nodetype_t type, pid_t pid, int fd)
+{
+ if (pid == 1)
+ pid = curproc->p_zone->zone_proc_initpid;
+
+ switch (type) {
+ case LXPR_PIDDIR:
+ return (pid + 1);
+ case LXPR_PROCDIR:
+ return (maxpid + 2);
+ case LXPR_PID_FD_FD:
+ return (maxpid + 2 +
+ (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) +
+ LXPR_NFILES + fd);
+ default:
+ return (maxpid + 2 +
+ (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) +
+ type);
+ }
+}
+
+/*
+ * Return inode number of parent (directory)
+ */
+ino_t
+lxpr_parentinode(lxpr_node_t *lxpnp)
+{
+ /*
+ * If the input node is the root, then the parent inode
+ * is the mounted-on inode, so just return our own inode number.
+ */
+ if (lxpnp->lxpr_type != LXPR_PROCDIR)
+ return (VTOLXP(lxpnp->lxpr_parent)->lxpr_ino);
+ else
+ return (lxpnp->lxpr_ino);
+}
+
+/*
+ * Allocate a new lxproc node
+ *
+ * This also allocates the vnode associated with it
+ */
+lxpr_node_t *
+lxpr_getnode(vnode_t *dp, lxpr_nodetype_t type, proc_t *p, int fd)
+{
+ lxpr_node_t *lxpnp;
+ vnode_t *vp;
+ user_t *up;
+ timestruc_t now;
+
+ /*
+ * Allocate a new node. It is deallocated in vop_inactive.
+ */
+ lxpnp = kmem_cache_alloc(lxpr_node_cache, KM_SLEEP);
+
+ /*
+ * Set defaults (may be overridden below)
+ */
+ gethrestime(&now);
+ lxpnp->lxpr_type = type;
+ lxpnp->lxpr_realvp = NULL;
+ lxpnp->lxpr_parent = dp;
+ VN_HOLD(dp);
+ if (p != NULL) {
+ lxpnp->lxpr_pid = ((p->p_pid ==
+ curproc->p_zone->zone_proc_initpid) ? 1 : p->p_pid);
+
+ lxpnp->lxpr_time = PTOU(p)->u_start;
+ lxpnp->lxpr_uid = crgetruid(p->p_cred);
+ lxpnp->lxpr_gid = crgetrgid(p->p_cred);
+ lxpnp->lxpr_ino = lxpr_inode(type, p->p_pid, fd);
+ } else {
+ /* Pretend files without a proc belong to sched */
+ lxpnp->lxpr_pid = 0;
+ lxpnp->lxpr_time = now;
+ lxpnp->lxpr_uid = lxpnp->lxpr_gid = 0;
+ lxpnp->lxpr_ino = lxpr_inode(type, 0, 0);
+ }
+
+ /* initialize the vnode data */
+ vp = lxpnp->lxpr_vnode;
+ vn_reinit(vp);
+ vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT;
+ vp->v_vfsp = dp->v_vfsp;
+
+ /*
+ * Do node specific stuff
+ */
+ switch (type) {
+ case LXPR_PROCDIR:
+ vp->v_flag |= VROOT;
+ vp->v_type = VDIR;
+ lxpnp->lxpr_mode = 0555; /* read-search by everyone */
+ break;
+
+ case LXPR_PID_CURDIR:
+ ASSERT(p != NULL);
+
+ /*
+ * Zombie check. p_stat is officially protected by pidlock,
+ * but we can't grab pidlock here because we already hold
+ * p_lock. Luckily, if we look at the process exit code,
+ * we see that p_stat only transitions from SRUN to SZOMB
+ * while p_lock is held. Aside from this, the only other
+ * p_stat transition that we need to be aware of is
+ * SIDL to SRUN, but that's not a problem since lxpr_lock()
+ * ignores nodes in the SIDL state so we'll never get a node
+ * that isn't already in the SRUN state.
+ */
+ if (p->p_stat == SZOMB) {
+ lxpnp->lxpr_realvp = NULL;
+ } else {
+ up = PTOU(p);
+ lxpnp->lxpr_realvp = up->u_cdir;
+ ASSERT(lxpnp->lxpr_realvp != NULL);
+ VN_HOLD(lxpnp->lxpr_realvp);
+ }
+ vp->v_type = VLNK;
+ lxpnp->lxpr_mode = 0777; /* anyone does anything ! */
+ break;
+
+ case LXPR_PID_ROOTDIR:
+ ASSERT(p != NULL);
+ /* Zombie check. see locking comment above */
+ if (p->p_stat == SZOMB) {
+ lxpnp->lxpr_realvp = NULL;
+ } else {
+ up = PTOU(p);
+ lxpnp->lxpr_realvp =
+ up->u_rdir != NULL ? up->u_rdir : rootdir;
+ ASSERT(lxpnp->lxpr_realvp != NULL);
+ VN_HOLD(lxpnp->lxpr_realvp);
+ }
+ vp->v_type = VLNK;
+ lxpnp->lxpr_mode = 0777; /* anyone does anything ! */
+ break;
+
+ case LXPR_PID_EXE:
+ ASSERT(p != NULL);
+ lxpnp->lxpr_realvp = p->p_exec;
+ if (lxpnp->lxpr_realvp != NULL) {
+ VN_HOLD(lxpnp->lxpr_realvp);
+ }
+ vp->v_type = VLNK;
+ lxpnp->lxpr_mode = 0777;
+ break;
+
+ case LXPR_SELF:
+ vp->v_type = VLNK;
+ lxpnp->lxpr_mode = 0777; /* anyone does anything ! */
+ break;
+
+ case LXPR_PID_FD_FD:
+ ASSERT(p != NULL);
+ /* lxpr_realvp is set after we return */
+ vp->v_type = VLNK;
+ lxpnp->lxpr_mode = 0700; /* read-write-exe owner only */
+ break;
+
+ case LXPR_PID_FDDIR:
+ ASSERT(p != NULL);
+ vp->v_type = VDIR;
+ lxpnp->lxpr_mode = 0500; /* read-search by owner only */
+ break;
+
+ case LXPR_PIDDIR:
+ ASSERT(p != NULL);
+ vp->v_type = VDIR;
+ lxpnp->lxpr_mode = 0511;
+ break;
+
+ case LXPR_NETDIR:
+ vp->v_type = VDIR;
+ lxpnp->lxpr_mode = 0555; /* read-search by all */
+ break;
+
+ case LXPR_PID_ENV:
+ case LXPR_PID_MEM:
+ ASSERT(p != NULL);
+ /*FALLTHRU*/
+ case LXPR_KCORE:
+ vp->v_type = VREG;
+ lxpnp->lxpr_mode = 0400; /* read-only by owner only */
+ break;
+
+ default:
+ vp->v_type = VREG;
+ lxpnp->lxpr_mode = 0444; /* read-only by all */
+ break;
+ }
+
+ return (lxpnp);
+}
+
+
+/*
+ * Free the storage obtained from lxpr_getnode().
+ */
+void
+lxpr_freenode(lxpr_node_t *lxpnp)
+{
+ ASSERT(lxpnp != NULL);
+ ASSERT(LXPTOV(lxpnp) != NULL);
+
+ /*
+ * delete any association with realvp
+ */
+ if (lxpnp->lxpr_realvp != NULL)
+ VN_RELE(lxpnp->lxpr_realvp);
+
+ /*
+ * delete any association with parent vp
+ */
+ if (lxpnp->lxpr_parent != NULL)
+ VN_RELE(lxpnp->lxpr_parent);
+
+ /*
+ * Release the lxprnode.
+ */
+ kmem_cache_free(lxpr_node_cache, lxpnp);
+}
diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c
new file mode 100644
index 0000000000..1bb7bd3823
--- /dev/null
+++ b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c
@@ -0,0 +1,367 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/debug.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/stat.h>
+#include <sys/statvfs.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/mode.h>
+#include <sys/signal.h>
+#include <sys/user.h>
+#include <sys/mount.h>
+#include <sys/bitmap.h>
+#include <sys/kmem.h>
+#include <sys/policy.h>
+#include <sys/modctl.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+
+#include "lxproc.h"
+
+/* Module level parameters */
+static int lxprocfstype;
+static dev_t lxprocdev;
+static kmutex_t lxpr_mount_lock;
+
+int nproc_highbit; /* highbit(v.v_proc) */
+
+static int lxpr_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *);
+static int lxpr_unmount(vfs_t *, int, cred_t *);
+static int lxpr_root(vfs_t *, vnode_t **);
+static int lxpr_statvfs(vfs_t *, statvfs64_t *);
+static int lxpr_init(int, char *);
+
+static vfsdef_t vfw = {
+ VFSDEF_VERSION,
+ "lxproc",
+ lxpr_init,
+ VSW_ZMOUNT,
+ NULL
+};
+
+/*
+ * Module linkage information for the kernel.
+ */
+extern struct mod_ops mod_fsops;
+
+static struct modlfs modlfs = {
+ &mod_fsops, "generic linux procfs", &vfw
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, (void *)&modlfs, NULL
+};
+
+int
+_init(void)
+{
+ return (mod_install(&modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+ int retval;
+
+ /*
+ * attempt to unload the module
+ */
+ if ((retval = mod_remove(&modlinkage)) != 0)
+ goto done;
+
+ /*
+ * destroy lxpr_node cache
+ */
+ lxpr_fininodecache();
+
+ /*
+ * clean out the vfsops and vnodeops
+ */
+ (void) vfs_freevfsops_by_type(lxprocfstype);
+ vn_freevnodeops(lxpr_vnodeops);
+
+ mutex_destroy(&lxpr_mount_lock);
+done:
+ return (retval);
+}
+
+static int
+lxpr_init(int fstype, char *name)
+{
+ static const fs_operation_def_t lxpr_vfsops_template[] = {
+ VFSNAME_MOUNT, { .vfs_mount = lxpr_mount },
+ VFSNAME_UNMOUNT, { .vfs_unmount = lxpr_unmount },
+ VFSNAME_ROOT, { .vfs_root = lxpr_root },
+ VFSNAME_STATVFS, { .vfs_statvfs = lxpr_statvfs },
+ NULL, NULL
+ };
+ extern const fs_operation_def_t lxpr_vnodeops_template[];
+ int error;
+ major_t dev;
+
+ nproc_highbit = highbit(v.v_proc);
+ lxprocfstype = fstype;
+ ASSERT(lxprocfstype != 0);
+
+ mutex_init(&lxpr_mount_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ /*
+ * Associate VFS ops vector with this fstype.
+ */
+ error = vfs_setfsops(fstype, lxpr_vfsops_template, NULL);
+ if (error != 0) {
+ cmn_err(CE_WARN, "lxpr_init: bad vfs ops template");
+ return (error);
+ }
+
+ /*
+ * Set up vnode ops vector too.
+ */
+ error = vn_make_ops(name, lxpr_vnodeops_template, &lxpr_vnodeops);
+ if (error != 0) {
+ (void) vfs_freevfsops_by_type(fstype);
+ cmn_err(CE_WARN, "lxpr_init: bad vnode ops template");
+ return (error);
+ }
+
+ /*
+ * Assign a unique "device" number (used by stat(2)).
+ */
+ if ((dev = getudev()) == (major_t)-1) {
+ cmn_err(CE_WARN, "lxpr_init: can't get unique device number");
+ dev = 0;
+ }
+
+ /*
+ * Make the pseudo device
+ */
+ lxprocdev = makedevice(dev, 0);
+
+ /*
+ * Initialize cache for lxpr_nodes
+ */
+ lxpr_initnodecache();
+
+ return (0);
+}
+
+static int
+lxpr_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr)
+{
+ lxpr_mnt_t *lxpr_mnt;
+ zone_t *zone = curproc->p_zone;
+ ldi_ident_t li;
+ int err;
+
+ /*
+ * must be root to mount
+ */
+ if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
+ return (EPERM);
+
+ /*
+ * mount point must be a directory
+ */
+ if (mvp->v_type != VDIR)
+ return (ENOTDIR);
+
+ if (zone == global_zone) {
+ zone_t *mntzone;
+
+ mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
+ zone_rele(mntzone);
+ if (zone != mntzone)
+ return (EBUSY);
+ }
+
+ /*
+ * Having the resource be anything but "lxproc" doesn't make sense
+ */
+ vfs_setresource(vfsp, "lxproc", 0);
+
+ lxpr_mnt = kmem_alloc(sizeof (*lxpr_mnt), KM_SLEEP);
+
+ if ((err = ldi_ident_from_mod(&modlinkage, &li)) != 0) {
+ kmem_free(lxpr_mnt, sizeof (*lxpr_mnt));
+ return (err);
+ }
+
+ lxpr_mnt->lxprm_li = li;
+
+ mutex_enter(&lxpr_mount_lock);
+
+ /*
+ * Ensure we don't allow overlaying mounts
+ */
+ mutex_enter(&mvp->v_lock);
+ if ((uap->flags & MS_OVERLAY) == 0 &&
+ (mvp->v_count > 1 || (mvp->v_flag & VROOT))) {
+ mutex_exit(&mvp->v_lock);
+ mutex_exit(&lxpr_mount_lock);
+ kmem_free(lxpr_mnt, sizeof (*lxpr_mnt));
+ return (EBUSY);
+ }
+ mutex_exit(&mvp->v_lock);
+
+ /*
+ * allocate the first vnode
+ */
+ zone_hold(lxpr_mnt->lxprm_zone = zone);
+
+ /* Arbitrarily set the parent vnode to the mounted over directory */
+ lxpr_mnt->lxprm_node = lxpr_getnode(mvp, LXPR_PROCDIR, NULL, 0);
+
+ /* Correctly set the fs for the root node */
+ lxpr_mnt->lxprm_node->lxpr_vnode->v_vfsp = vfsp;
+
+ vfs_make_fsid(&vfsp->vfs_fsid, lxprocdev, lxprocfstype);
+ vfsp->vfs_bsize = DEV_BSIZE;
+ vfsp->vfs_fstype = lxprocfstype;
+ vfsp->vfs_data = (caddr_t)lxpr_mnt;
+ vfsp->vfs_dev = lxprocdev;
+
+ mutex_exit(&lxpr_mount_lock);
+
+ return (0);
+}
+
+static int
+lxpr_unmount(vfs_t *vfsp, int flag, cred_t *cr)
+{
+ lxpr_mnt_t *lxpr_mnt = (lxpr_mnt_t *)vfsp->vfs_data;
+ vnode_t *vp;
+ int count;
+
+ ASSERT(lxpr_mnt != NULL);
+ vp = LXPTOV(lxpr_mnt->lxprm_node);
+
+ mutex_enter(&lxpr_mount_lock);
+
+ /*
+ * must be root to unmount
+ */
+ if (secpolicy_fs_unmount(cr, vfsp) != 0) {
+ mutex_exit(&lxpr_mount_lock);
+ return (EPERM);
+ }
+
+ /*
+ * forced unmount is not supported by this file system
+ */
+ if (flag & MS_FORCE) {
+ mutex_exit(&lxpr_mount_lock);
+ return (ENOTSUP);
+ }
+
+ /*
+ * Ensure that no vnodes are in use on this mount point.
+ */
+ mutex_enter(&vp->v_lock);
+ count = vp->v_count;
+ mutex_exit(&vp->v_lock);
+ if (count > 1) {
+ mutex_exit(&lxpr_mount_lock);
+ return (EBUSY);
+ }
+
+ /*
+ * purge the dnlc cache for vnode entries
+ * associated with this file system
+ */
+ count = dnlc_purge_vfsp(vfsp, 0);
+
+ /*
+ * free up the lxprnode
+ */
+ lxpr_freenode(lxpr_mnt->lxprm_node);
+ zone_rele(lxpr_mnt->lxprm_zone);
+ kmem_free(lxpr_mnt, sizeof (*lxpr_mnt));
+
+ mutex_exit(&lxpr_mount_lock);
+
+ return (0);
+}
+
+static int
+lxpr_root(vfs_t *vfsp, vnode_t **vpp)
+{
+ lxpr_node_t *lxpnp = ((lxpr_mnt_t *)vfsp->vfs_data)->lxprm_node;
+ vnode_t *vp = LXPTOV(lxpnp);
+
+ VN_HOLD(vp);
+ *vpp = vp;
+ return (0);
+}
+
+static int
+lxpr_statvfs(vfs_t *vfsp, statvfs64_t *sp)
+{
+ int n;
+ dev32_t d32;
+ extern uint_t nproc;
+
+ n = v.v_proc - nproc;
+
+ bzero((caddr_t)sp, sizeof (*sp));
+ sp->f_bsize = DEV_BSIZE;
+ sp->f_frsize = DEV_BSIZE;
+ sp->f_blocks = (fsblkcnt64_t)0;
+ sp->f_bfree = (fsblkcnt64_t)0;
+ sp->f_bavail = (fsblkcnt64_t)0;
+ sp->f_files = (fsfilcnt64_t)v.v_proc + 2;
+ sp->f_ffree = (fsfilcnt64_t)n;
+ sp->f_favail = (fsfilcnt64_t)n;
+ (void) cmpldev(&d32, vfsp->vfs_dev);
+ sp->f_fsid = d32;
+ /* It is guaranteed that vsw_name will fit in f_basetype */
+ (void) strcpy(sp->f_basetype, vfssw[lxprocfstype].vsw_name);
+ sp->f_flag = vf_to_stf(vfsp->vfs_flag);
+ sp->f_namemax = 64; /* quite arbitrary */
+
+ (void) strcpy(sp->f_fstr, "lxproc");
+
+ return (0);
+}
diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vnops.c b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c
new file mode 100644
index 0000000000..48f4efc1bf
--- /dev/null
+++ b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c
@@ -0,0 +1,3099 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * lxproc -- a loosely Linux-compatible /proc
+ *
+ * We have -- confusingly -- two implementations of Linux /proc. One is to
+ * support the LX brand with a Linux /proc entirely compatible with the Linux
+ * world view; the other -- this one -- is to support native (but Linux-borne)
+ * programs that wish to view the native system via the Linux /proc model. So
+ * the aspiration here is to provide something that sufficiently approximates
+ * the Linux /proc implementation for purposes of offering some compatibility
+ * for simple Linux /proc readers (e.g., ps/top/htop). However, it is not
+ * intended to exactly mimic Linux semantics; when choosing between offering
+ * compatibility and telling the truth, we emphatically pick the truth. A
+ * particular glaring example of this is the Linux notion of "tasks" (that is,
+ * threads), which -- due to historical misadventures on Linux -- allocate their
+ * identifiers from the process identifier space. (That is, each thread has in
+ * effect a pid.) Some Linux /proc readers have come to depend on this
+ * attribute, and become confused when threads appear with proper identifiers,
+ * so we simply opt for the pre-2.6 behavior, and do not present the tasks
+ * directory at all. Similarly, when choosing between offering compatibility
+ * and remaining consistent with our broader security model, we (obviously)
+ * choose security over compatibility. In short, this is meant to be a best
+ * effort -- no more -- and as such, it should not be unified with the much
+ * more complete Linux /proc implementation found in the LX brand.
+ */
+
+#include <sys/cpupart.h>
+#include <sys/cpuvar.h>
+#include <sys/session.h>
+#include <sys/vmparam.h>
+#include <sys/mman.h>
+#include <vm/rm.h>
+#include <vm/seg_vn.h>
+#include <sys/sdt.h>
+#include <sys/strlog.h>
+#include <sys/stropts.h>
+#include <sys/cmn_err.h>
+#include <sys/x86_archext.h>
+#include <sys/archsystm.h>
+#include <sys/fp.h>
+#include <sys/pool_pset.h>
+#include <sys/pset.h>
+#include <sys/zone.h>
+#include <sys/pghw.h>
+#include <sys/vfs_opreg.h>
+
+/* Dependent on procfs */
+extern kthread_t *prchoose(proc_t *);
+
+#include "lxproc.h"
+
+extern pgcnt_t swapfs_minfree;
+extern time_t boot_time;
+
+/*
+ * Pointer to the vnode ops vector for this fs.
+ * This is instantiated in lxpr_init() in lxpr_vfsops.c
+ */
+vnodeops_t *lxpr_vnodeops;
+
+static int lxpr_open(vnode_t **, int, cred_t *, caller_context_t *);
+static int lxpr_close(vnode_t *, int, int, offset_t, cred_t *,
+ caller_context_t *);
+static int lxpr_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *);
+static int lxpr_getattr(vnode_t *, vattr_t *, int, cred_t *,
+ caller_context_t *);
+static int lxpr_access(vnode_t *, int, int, cred_t *, caller_context_t *);
+static int lxpr_lookup(vnode_t *, char *, vnode_t **,
+ pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *,
+ pathname_t *);
+static int lxpr_readdir(vnode_t *, uio_t *, cred_t *, int *,
+ caller_context_t *, int);
+static int lxpr_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *);
+static int lxpr_cmp(vnode_t *, vnode_t *, caller_context_t *);
+static int lxpr_realvp(vnode_t *, vnode_t **, caller_context_t *);
+static int lxpr_sync(void);
+static void lxpr_inactive(vnode_t *, cred_t *, caller_context_t *);
+
+static vnode_t *lxpr_lookup_procdir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_piddir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_not_a_dir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_fddir(vnode_t *, char *);
+static vnode_t *lxpr_lookup_netdir(vnode_t *, char *);
+
+static int lxpr_readdir_procdir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_piddir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_not_a_dir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_fddir(lxpr_node_t *, uio_t *, int *);
+static int lxpr_readdir_netdir(lxpr_node_t *, uio_t *, int *);
+
+static void lxpr_read_invalid(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_empty(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_cpuinfo(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_isdir(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_fd(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_kmsg(lxpr_node_t *, lxpr_uiobuf_t *, ldi_handle_t);
+static void lxpr_read_loadavg(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_meminfo(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_mounts(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_partitions(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_stat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_uptime(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_version(lxpr_node_t *, lxpr_uiobuf_t *);
+
+static void lxpr_read_pid_cmdline(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_maps(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_stat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_statm(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_pid_status(lxpr_node_t *, lxpr_uiobuf_t *);
+
+static void lxpr_read_net_arp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_dev(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_dev_mcast(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_igmp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_ip_mr_cache(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_ip_mr_vif(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_mcfilter(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_netstat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_raw(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_route(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_rpc(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_rt_cache(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_sockstat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_snmp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_stat(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_tcp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_udp(lxpr_node_t *, lxpr_uiobuf_t *);
+static void lxpr_read_net_unix(lxpr_node_t *, lxpr_uiobuf_t *);
+
+/*
+ * Simple conversion
+ */
+#define btok(x) ((x) >> 10) /* bytes to kbytes */
+#define ptok(x) ((x) << (PAGESHIFT - 10)) /* pages to kbytes */
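+/* e.g., with 4 KB pages (PAGESHIFT == 12): btok(4096) == 4 and ptok(1) == 4 */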
+
+/*
+ * The lxproc vnode operations vector
+ */
+const fs_operation_def_t lxpr_vnodeops_template[] = {
+ VOPNAME_OPEN, { .vop_open = lxpr_open },
+ VOPNAME_CLOSE, { .vop_close = lxpr_close },
+ VOPNAME_READ, { .vop_read = lxpr_read },
+ VOPNAME_GETATTR, { .vop_getattr = lxpr_getattr },
+ VOPNAME_ACCESS, { .vop_access = lxpr_access },
+ VOPNAME_LOOKUP, { .vop_lookup = lxpr_lookup },
+ VOPNAME_READDIR, { .vop_readdir = lxpr_readdir },
+ VOPNAME_READLINK, { .vop_readlink = lxpr_readlink },
+ VOPNAME_FSYNC, { .error = lxpr_sync },
+ VOPNAME_SEEK, { .error = lxpr_sync },
+ VOPNAME_INACTIVE, { .vop_inactive = lxpr_inactive },
+ VOPNAME_CMP, { .vop_cmp = lxpr_cmp },
+ VOPNAME_REALVP, { .vop_realvp = lxpr_realvp },
+ NULL, NULL
+};
+
+/*
+ * Contents of the lxproc root directory.
+ */
+static lxpr_dirent_t lxpr_dir[] = {
+ { LXPR_CMDLINE, "cmdline" },
+ { LXPR_CPUINFO, "cpuinfo" },
+ { LXPR_DEVICES, "devices" },
+ { LXPR_DMA, "dma" },
+ { LXPR_FILESYSTEMS, "filesystems" },
+ { LXPR_INTERRUPTS, "interrupts" },
+ { LXPR_IOPORTS, "ioports" },
+ { LXPR_KCORE, "kcore" },
+ { LXPR_KMSG, "kmsg" },
+ { LXPR_LOADAVG, "loadavg" },
+ { LXPR_MEMINFO, "meminfo" },
+ { LXPR_MOUNTS, "mounts" },
+ { LXPR_NETDIR, "net" },
+ { LXPR_PARTITIONS, "partitions" },
+ { LXPR_SELF, "self" },
+ { LXPR_STAT, "stat" },
+ { LXPR_UPTIME, "uptime" },
+ { LXPR_VERSION, "version" }
+};
+
+#define PROCDIRFILES (sizeof (lxpr_dir) / sizeof (lxpr_dir[0]))
+
+/*
+ * Contents of an /lxproc/<pid> directory.
+ */
+static lxpr_dirent_t piddir[] = {
+ { LXPR_PID_CMDLINE, "cmdline" },
+ { LXPR_PID_CPU, "cpu" },
+ { LXPR_PID_CURDIR, "cwd" },
+ { LXPR_PID_ENV, "environ" },
+ { LXPR_PID_EXE, "exe" },
+ { LXPR_PID_MAPS, "maps" },
+ { LXPR_PID_MEM, "mem" },
+ { LXPR_PID_ROOTDIR, "root" },
+ { LXPR_PID_STAT, "stat" },
+ { LXPR_PID_STATM, "statm" },
+ { LXPR_PID_STATUS, "status" },
+ { LXPR_PID_FDDIR, "fd" }
+};
+
+#define PIDDIRFILES (sizeof (piddir) / sizeof (piddir[0]))
+
+/*
+ * contents of /lxproc/net directory
+ */
+static lxpr_dirent_t netdir[] = {
+ { LXPR_NET_ARP, "arp" },
+ { LXPR_NET_DEV, "dev" },
+ { LXPR_NET_DEV_MCAST, "dev_mcast" },
+ { LXPR_NET_IGMP, "igmp" },
+ { LXPR_NET_IP_MR_CACHE, "ip_mr_cache" },
+ { LXPR_NET_IP_MR_VIF, "ip_mr_vif" },
+ { LXPR_NET_MCFILTER, "mcfilter" },
+ { LXPR_NET_NETSTAT, "netstat" },
+ { LXPR_NET_RAW, "raw" },
+ { LXPR_NET_ROUTE, "route" },
+ { LXPR_NET_RPC, "rpc" },
+ { LXPR_NET_RT_CACHE, "rt_cache" },
+ { LXPR_NET_SOCKSTAT, "sockstat" },
+ { LXPR_NET_SNMP, "snmp" },
+ { LXPR_NET_STAT, "stat" },
+ { LXPR_NET_TCP, "tcp" },
+ { LXPR_NET_UDP, "udp" },
+ { LXPR_NET_UNIX, "unix" }
+};
+
+#define NETDIRFILES (sizeof (netdir) / sizeof (netdir[0]))
+
+/*
+ * These are the major signal number differences between Linux and native:
+ *
+ * ====================================
+ * | Number | Linux | Native |
+ * | ====== | ========= | ========== |
+ * | 7 | SIGBUS | SIGEMT |
+ * | 10 | SIGUSR1 | SIGBUS |
+ * | 12 | SIGUSR2 | SIGSYS |
+ * | 16 | SIGSTKFLT | SIGUSR1 |
+ * | 17 | SIGCHLD | SIGUSR2 |
+ * | 18 | SIGCONT | SIGCHLD |
+ * | 19 | SIGSTOP | SIGPWR |
+ * | 20 | SIGTSTP | SIGWINCH |
+ * | 21 | SIGTTIN | SIGURG |
+ * | 22 | SIGTTOU | SIGPOLL |
+ * | 23 | SIGURG | SIGSTOP |
+ * | 24 | SIGXCPU | SIGTSTP |
+ * | 25 | SIGXFSZ | SIGCONT |
+ * | 26 | SIGVTALARM | SIGTTIN |
+ * | 27 | SIGPROF | SIGTTOU |
+ * | 28 | SIGWINCH | SIGVTALARM |
+ * | 29 | SIGPOLL | SIGPROF |
+ * | 30 | SIGPWR | SIGXCPU |
+ * | 31 | SIGSYS | SIGXFSZ |
+ * ====================================
+ *
+ * Not every Linux signal maps to a native signal, nor does every native
+ * signal map to a Linux counterpart. However, when signals do map, the
+ * mapping is unique.
+ */
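+/*
+ * lxpr_sigmap[] is indexed by the native signal number; -1 entries mark
+ * native signals with no Linux equivalent.
+ */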
+static int
+lxpr_sigmap[NSIG] = {
+ 0,
+ LX_SIGHUP,
+ LX_SIGINT,
+ LX_SIGQUIT,
+ LX_SIGILL,
+ LX_SIGTRAP,
+ LX_SIGABRT,
+ LX_SIGSTKFLT,
+ LX_SIGFPE,
+ LX_SIGKILL,
+ LX_SIGBUS,
+ LX_SIGSEGV,
+ LX_SIGSYS,
+ LX_SIGPIPE,
+ LX_SIGALRM,
+ LX_SIGTERM,
+ LX_SIGUSR1,
+ LX_SIGUSR2,
+ LX_SIGCHLD,
+ LX_SIGPWR,
+ LX_SIGWINCH,
+ LX_SIGURG,
+ LX_SIGPOLL,
+ LX_SIGSTOP,
+ LX_SIGTSTP,
+ LX_SIGCONT,
+ LX_SIGTTIN,
+ LX_SIGTTOU,
+ LX_SIGVTALRM,
+ LX_SIGPROF,
+ LX_SIGXCPU,
+ LX_SIGXFSZ,
+ -1, /* 32: illumos SIGWAITING */
+ -1, /* 33: illumos SIGLWP */
+ -1, /* 34: illumos SIGFREEZE */
+ -1, /* 35: illumos SIGTHAW */
+ -1, /* 36: illumos SIGCANCEL */
+ -1, /* 37: illumos SIGLOST */
+ -1, /* 38: illumos SIGXRES */
+ -1, /* 39: illumos SIGJVM1 */
+ -1, /* 40: illumos SIGJVM2 */
+ -1, /* 41: illumos SIGINFO */
+ LX_SIGRTMIN, /* 42: illumos _SIGRTMIN */
+ LX_SIGRTMIN + 1,
+ LX_SIGRTMIN + 2,
+ LX_SIGRTMIN + 3,
+ LX_SIGRTMIN + 4,
+ LX_SIGRTMIN + 5,
+ LX_SIGRTMIN + 6,
+ LX_SIGRTMIN + 7,
+ LX_SIGRTMIN + 8,
+ LX_SIGRTMIN + 9,
+ LX_SIGRTMIN + 10,
+ LX_SIGRTMIN + 11,
+ LX_SIGRTMIN + 12,
+ LX_SIGRTMIN + 13,
+ LX_SIGRTMIN + 14,
+ LX_SIGRTMIN + 15,
+ LX_SIGRTMIN + 16,
+ LX_SIGRTMIN + 17,
+ LX_SIGRTMIN + 18,
+ LX_SIGRTMIN + 19,
+ LX_SIGRTMIN + 20,
+ LX_SIGRTMIN + 21,
+ LX_SIGRTMIN + 22,
+ LX_SIGRTMIN + 23,
+ LX_SIGRTMIN + 24,
+ LX_SIGRTMIN + 25,
+ LX_SIGRTMIN + 26,
+ LX_SIGRTMIN + 27,
+ LX_SIGRTMIN + 28,
+ LX_SIGRTMIN + 29,
+ LX_SIGRTMIN + 30,
+ LX_SIGRTMAX
+};
+
+/*
+ * lxpr_open(): Vnode operation for VOP_OPEN()
+ */
+static int
+lxpr_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
+{
+ vnode_t *vp = *vpp;
+ lxpr_node_t *lxpnp = VTOLXP(vp);
+ lxpr_nodetype_t type = lxpnp->lxpr_type;
+ vnode_t *rvp;
+ int error = 0;
+
+ /*
+ * We only allow reading in this file system
+ */
+ if (flag & FWRITE)
+ return (EROFS);
+
+ /*
+ * If we are opening an underlying file, only allow regular files;
+ * reject the open for anything else. If we are opening the current
+ * or root directory, simply pass the open through to the real vnode.
+ */
+ if (lxpnp->lxpr_realvp != NULL) {
+ rvp = lxpnp->lxpr_realvp;
+
+ if (type == LXPR_PID_FD_FD && rvp->v_type != VREG)
+ error = EACCES;
+ else {
+ /*
+ * Need to hold rvp since VOP_OPEN() may release it.
+ */
+ VN_HOLD(rvp);
+ error = VOP_OPEN(&rvp, flag, cr, ct);
+ if (error) {
+ VN_RELE(rvp);
+ } else {
+ *vpp = rvp;
+ VN_RELE(vp);
+ }
+ }
+ }
+
+ return (error);
+}
+
+
+/*
+ * lxpr_close(): Vnode operation for VOP_CLOSE()
+ */
+/* ARGSUSED */
+static int
+lxpr_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
+ caller_context_t *ct)
+{
+ lxpr_node_t *lxpr = VTOLXP(vp);
+ lxpr_nodetype_t type = lxpr->lxpr_type;
+
+ /*
+ * we should never get here because the close is done on the realvp
+ * for these nodes
+ */
+ ASSERT(type != LXPR_PID_FD_FD &&
+ type != LXPR_PID_CURDIR &&
+ type != LXPR_PID_ROOTDIR &&
+ type != LXPR_PID_EXE);
+
+ return (0);
+}
+
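+/*
+ * Array of read functions, indexed by /lxproc file type.
+ */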
+static void (*lxpr_read_function[LXPR_NFILES])() = {
+ lxpr_read_isdir, /* /proc */
+ lxpr_read_isdir, /* /proc/<pid> */
+ lxpr_read_pid_cmdline, /* /proc/<pid>/cmdline */
+ lxpr_read_empty, /* /proc/<pid>/cpu */
+ lxpr_read_invalid, /* /proc/<pid>/cwd */
+ lxpr_read_empty, /* /proc/<pid>/environ */
+ lxpr_read_invalid, /* /proc/<pid>/exe */
+ lxpr_read_pid_maps, /* /proc/<pid>/maps */
+ lxpr_read_empty, /* /proc/<pid>/mem */
+ lxpr_read_invalid, /* /proc/<pid>/root */
+ lxpr_read_pid_stat, /* /proc/<pid>/stat */
+ lxpr_read_pid_statm, /* /proc/<pid>/statm */
+ lxpr_read_pid_status, /* /proc/<pid>/status */
+ lxpr_read_isdir, /* /proc/<pid>/fd */
+ lxpr_read_fd, /* /proc/<pid>/fd/nn */
+ lxpr_read_empty, /* /proc/cmdline */
+ lxpr_read_cpuinfo, /* /proc/cpuinfo */
+ lxpr_read_empty, /* /proc/devices */
+ lxpr_read_empty, /* /proc/dma */
+ lxpr_read_empty, /* /proc/filesystems */
+ lxpr_read_empty, /* /proc/interrupts */
+ lxpr_read_empty, /* /proc/ioports */
+ lxpr_read_empty, /* /proc/kcore */
+ lxpr_read_invalid, /* /proc/kmsg -- see lxpr_read() */
+ lxpr_read_loadavg, /* /proc/loadavg */
+ lxpr_read_meminfo, /* /proc/meminfo */
+ lxpr_read_mounts, /* /proc/mounts */
+ lxpr_read_isdir, /* /proc/net */
+ lxpr_read_net_arp, /* /proc/net/arp */
+ lxpr_read_net_dev, /* /proc/net/dev */
+ lxpr_read_net_dev_mcast, /* /proc/net/dev_mcast */
+ lxpr_read_net_igmp, /* /proc/net/igmp */
+ lxpr_read_net_ip_mr_cache, /* /proc/net/ip_mr_cache */
+ lxpr_read_net_ip_mr_vif, /* /proc/net/ip_mr_vif */
+ lxpr_read_net_mcfilter, /* /proc/net/mcfilter */
+ lxpr_read_net_netstat, /* /proc/net/netstat */
+ lxpr_read_net_raw, /* /proc/net/raw */
+ lxpr_read_net_route, /* /proc/net/route */
+ lxpr_read_net_rpc, /* /proc/net/rpc */
+ lxpr_read_net_rt_cache, /* /proc/net/rt_cache */
+ lxpr_read_net_sockstat, /* /proc/net/sockstat */
+ lxpr_read_net_snmp, /* /proc/net/snmp */
+ lxpr_read_net_stat, /* /proc/net/stat */
+ lxpr_read_net_tcp, /* /proc/net/tcp */
+ lxpr_read_net_udp, /* /proc/net/udp */
+ lxpr_read_net_unix, /* /proc/net/unix */
+ lxpr_read_partitions, /* /proc/partitions */
+ lxpr_read_invalid, /* /proc/self */
+ lxpr_read_stat, /* /proc/stat */
+ lxpr_read_uptime, /* /proc/uptime */
+ lxpr_read_version, /* /proc/version */
+};
+
+/*
+ * Array of lookup functions, indexed by /lxproc file type.
+ */
+static vnode_t *(*lxpr_lookup_function[LXPR_NFILES])() = {
+ lxpr_lookup_procdir, /* /proc */
+ lxpr_lookup_piddir, /* /proc/<pid> */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/cmdline */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/cpu */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/cwd */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/environ */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/exe */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/maps */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/mem */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/root */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/stat */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/statm */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/status */
+ lxpr_lookup_fddir, /* /proc/<pid>/fd */
+ lxpr_lookup_not_a_dir, /* /proc/<pid>/fd/nn */
+ lxpr_lookup_not_a_dir, /* /proc/cmdline */
+ lxpr_lookup_not_a_dir, /* /proc/cpuinfo */
+ lxpr_lookup_not_a_dir, /* /proc/devices */
+ lxpr_lookup_not_a_dir, /* /proc/dma */
+ lxpr_lookup_not_a_dir, /* /proc/filesystems */
+ lxpr_lookup_not_a_dir, /* /proc/interrupts */
+ lxpr_lookup_not_a_dir, /* /proc/ioports */
+ lxpr_lookup_not_a_dir, /* /proc/kcore */
+ lxpr_lookup_not_a_dir, /* /proc/kmsg */
+ lxpr_lookup_not_a_dir, /* /proc/loadavg */
+ lxpr_lookup_not_a_dir, /* /proc/meminfo */
+ lxpr_lookup_not_a_dir, /* /proc/mounts */
+ lxpr_lookup_netdir, /* /proc/net */
+ lxpr_lookup_not_a_dir, /* /proc/net/arp */
+ lxpr_lookup_not_a_dir, /* /proc/net/dev */
+ lxpr_lookup_not_a_dir, /* /proc/net/dev_mcast */
+ lxpr_lookup_not_a_dir, /* /proc/net/igmp */
+ lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_cache */
+ lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_vif */
+ lxpr_lookup_not_a_dir, /* /proc/net/mcfilter */
+ lxpr_lookup_not_a_dir, /* /proc/net/netstat */
+ lxpr_lookup_not_a_dir, /* /proc/net/raw */
+ lxpr_lookup_not_a_dir, /* /proc/net/route */
+ lxpr_lookup_not_a_dir, /* /proc/net/rpc */
+ lxpr_lookup_not_a_dir, /* /proc/net/rt_cache */
+ lxpr_lookup_not_a_dir, /* /proc/net/sockstat */
+ lxpr_lookup_not_a_dir, /* /proc/net/snmp */
+ lxpr_lookup_not_a_dir, /* /proc/net/stat */
+ lxpr_lookup_not_a_dir, /* /proc/net/tcp */
+ lxpr_lookup_not_a_dir, /* /proc/net/udp */
+ lxpr_lookup_not_a_dir, /* /proc/net/unix */
+ lxpr_lookup_not_a_dir, /* /proc/partitions */
+ lxpr_lookup_not_a_dir, /* /proc/self */
+ lxpr_lookup_not_a_dir, /* /proc/stat */
+ lxpr_lookup_not_a_dir, /* /proc/uptime */
+ lxpr_lookup_not_a_dir, /* /proc/version */
+};
+
+/*
+ * Array of readdir functions, indexed by /proc file type.
+ */
+static int (*lxpr_readdir_function[LXPR_NFILES])() = {
+ lxpr_readdir_procdir, /* /proc */
+ lxpr_readdir_piddir, /* /proc/<pid> */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/cmdline */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/cpu */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/cwd */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/environ */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/exe */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/maps */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/mem */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/root */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/stat */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/statm */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/status */
+ lxpr_readdir_fddir, /* /proc/<pid>/fd */
+ lxpr_readdir_not_a_dir, /* /proc/<pid>/fd/nn */
+ lxpr_readdir_not_a_dir, /* /proc/cmdline */
+ lxpr_readdir_not_a_dir, /* /proc/cpuinfo */
+ lxpr_readdir_not_a_dir, /* /proc/devices */
+ lxpr_readdir_not_a_dir, /* /proc/dma */
+ lxpr_readdir_not_a_dir, /* /proc/filesystems */
+ lxpr_readdir_not_a_dir, /* /proc/interrupts */
+ lxpr_readdir_not_a_dir, /* /proc/ioports */
+ lxpr_readdir_not_a_dir, /* /proc/kcore */
+ lxpr_readdir_not_a_dir, /* /proc/kmsg */
+ lxpr_readdir_not_a_dir, /* /proc/loadavg */
+ lxpr_readdir_not_a_dir, /* /proc/meminfo */
+ lxpr_readdir_not_a_dir, /* /proc/mounts */
+ lxpr_readdir_netdir, /* /proc/net */
+ lxpr_readdir_not_a_dir, /* /proc/net/arp */
+ lxpr_readdir_not_a_dir, /* /proc/net/dev */
+ lxpr_readdir_not_a_dir, /* /proc/net/dev_mcast */
+ lxpr_readdir_not_a_dir, /* /proc/net/igmp */
+ lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_cache */
+ lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_vif */
+ lxpr_readdir_not_a_dir, /* /proc/net/mcfilter */
+ lxpr_readdir_not_a_dir, /* /proc/net/netstat */
+ lxpr_readdir_not_a_dir, /* /proc/net/raw */
+ lxpr_readdir_not_a_dir, /* /proc/net/route */
+ lxpr_readdir_not_a_dir, /* /proc/net/rpc */
+ lxpr_readdir_not_a_dir, /* /proc/net/rt_cache */
+ lxpr_readdir_not_a_dir, /* /proc/net/sockstat */
+ lxpr_readdir_not_a_dir, /* /proc/net/snmp */
+ lxpr_readdir_not_a_dir, /* /proc/net/stat */
+ lxpr_readdir_not_a_dir, /* /proc/net/tcp */
+ lxpr_readdir_not_a_dir, /* /proc/net/udp */
+ lxpr_readdir_not_a_dir, /* /proc/net/unix */
+ lxpr_readdir_not_a_dir, /* /proc/partitions */
+ lxpr_readdir_not_a_dir, /* /proc/self */
+ lxpr_readdir_not_a_dir, /* /proc/stat */
+ lxpr_readdir_not_a_dir, /* /proc/uptime */
+ lxpr_readdir_not_a_dir, /* /proc/version */
+};
+
+
+/*
+ * lxpr_read(): Vnode operation for VOP_READ()
+ *
+ * Because all of the files that can be read in lxproc are human-readable text
+ * rather than binary structures, there is no need for different read variants
+ * depending on whether the reading process model is 32- or 64-bit.
+ */
+/* ARGSUSED */
+static int
+lxpr_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
+ caller_context_t *ct)
+{
+ lxpr_node_t *lxpnp = VTOLXP(vp);
+ lxpr_nodetype_t type = lxpnp->lxpr_type;
+ lxpr_uiobuf_t *uiobuf = lxpr_uiobuf_new(uiop);
+ int error;
+
+ ASSERT(type < LXPR_NFILES);
+
+ if (type == LXPR_KMSG) {
+ ldi_ident_t li = VTOLXPM(vp)->lxprm_li;
+ ldi_handle_t ldih;
+ struct strioctl str;
+ int rv;
+
+ /*
+ * Open the zone's console device using the layered driver
+ * interface.
+ */
+ if ((error =
+ ldi_open_by_name("/dev/log", FREAD, cr, &ldih, li)) != 0)
+ return (error);
+
+ /*
+ * Send an ioctl to the underlying console device, letting it
+ * know we're interested in getting console messages.
+ */
+ str.ic_cmd = I_CONSLOG;
+ str.ic_timout = 0;
+ str.ic_len = 0;
+ str.ic_dp = NULL;
+ if ((error = ldi_ioctl(ldih, I_STR,
+ (intptr_t)&str, FKIOCTL, cr, &rv)) != 0)
+ return (error);
+
+ lxpr_read_kmsg(lxpnp, uiobuf, ldih);
+
+ if ((error = ldi_close(ldih, FREAD, cr)) != 0)
+ return (error);
+ } else {
+ lxpr_read_function[type](lxpnp, uiobuf);
+ }
+
+ error = lxpr_uiobuf_flush(uiobuf);
+ lxpr_uiobuf_free(uiobuf);
+
+ return (error);
+}
+
+/*
+ * lxpr_read_invalid(), lxpr_read_isdir(), lxpr_read_empty()
+ *
+ * Various special case reads:
+ * - trying to read a directory
+ * - invalid file (used to mean a file that should be implemented,
+ * but isn't yet)
+ * - empty file
+ * - wait to be able to read a file that will never have anything to read
+ */
+/* ARGSUSED */
+static void
+lxpr_read_isdir(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lxpr_uiobuf_seterr(uiobuf, EISDIR);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_invalid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_empty(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/*
+ * lxpr_read_pid_cmdline():
+ *
+ * This is not precisely compatible with Linux: the Linux cmdline returns argv
+ * with the correct separation using \0 between the arguments, but we cannot do
+ * that without copying the real argv from the correct process context. This
+ * is too difficult to attempt so we pretend that the entire cmdline is just
+ * argv[0]. This is good enough for ps and htop to display correctly, but might
+ * cause some other things not to work correctly.
+ */
+static void
+lxpr_read_pid_cmdline(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+ char *buf;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_CMDLINE);
+
+ p = lxpr_lock(lxpnp->lxpr_pid);
+ if (p == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ return;
+ }
+
+ buf = PTOU(p)->u_argv != 0 ? PTOU(p)->u_psargs : PTOU(p)->u_comm;
+
+ lxpr_uiobuf_write(uiobuf, buf, strlen(buf) + 1);
+ lxpr_unlock(p);
+}
+
+/*
+ * lxpr_read_pid_maps(): memory map file
+ */
+static void
+lxpr_read_pid_maps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+ struct as *as;
+ struct seg *seg;
+ char *buf;
+ int buflen = MAXPATHLEN;
+ struct print_data {
+ caddr_t saddr;
+ caddr_t eaddr;
+ int type;
+ char prot[5];
+ uint32_t offset;
+ vnode_t *vp;
+ struct print_data *next;
+ } *print_head = NULL;
+ struct print_data **print_tail = &print_head;
+ struct print_data *pbuf;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_MAPS);
+
+ p = lxpr_lock(lxpnp->lxpr_pid);
+ if (p == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ return;
+ }
+
+ as = p->p_as;
+
+ if (as == &kas) {
+ lxpr_unlock(p);
+ return;
+ }
+
+ mutex_exit(&p->p_lock);
+
+ /* Iterate over all segments in the address space */
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+ for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
+ vnode_t *vp;
+ uint_t protbits;
+
+ pbuf = kmem_alloc(sizeof (*pbuf), KM_SLEEP);
+
+ pbuf->saddr = seg->s_base;
+ pbuf->eaddr = seg->s_base+seg->s_size;
+ pbuf->type = SEGOP_GETTYPE(seg, seg->s_base);
+
+ /*
+ * Cheat and only use the protection bits of the first page
+ * in the segment
+ */
+ (void) strncpy(pbuf->prot, "----", sizeof (pbuf->prot));
+ (void) SEGOP_GETPROT(seg, seg->s_base, 0, &protbits);
+
+ if (protbits & PROT_READ) pbuf->prot[0] = 'r';
+ if (protbits & PROT_WRITE) pbuf->prot[1] = 'w';
+ if (protbits & PROT_EXEC) pbuf->prot[2] = 'x';
+ if (pbuf->type & MAP_SHARED) pbuf->prot[3] = 's';
+ else if (pbuf->type & MAP_PRIVATE) pbuf->prot[3] = 'p';
+
+ if (seg->s_ops == &segvn_ops &&
+ SEGOP_GETVP(seg, seg->s_base, &vp) == 0 &&
+ vp != NULL && vp->v_type == VREG) {
+ VN_HOLD(vp);
+ pbuf->vp = vp;
+ } else {
+ pbuf->vp = NULL;
+ }
+
+ pbuf->offset = (uint32_t)SEGOP_GETOFFSET(seg, pbuf->saddr);
+
+ pbuf->next = NULL;
+ *print_tail = pbuf;
+ print_tail = &pbuf->next;
+ }
+ AS_LOCK_EXIT(as, &as->a_lock);
+ mutex_enter(&p->p_lock);
+ lxpr_unlock(p);
+
+ buf = kmem_alloc(buflen, KM_SLEEP);
+
+ /* print the data we've extracted */
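+ /*
+ * Each line follows the Linux maps layout:
+ * start-end perms offset major:minor inode [path]
+ */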
+ pbuf = print_head;
+ while (pbuf != NULL) {
+ struct print_data *pbuf_next;
+ vattr_t vattr;
+
+ int maj = 0;
+ int min = 0;
+ u_longlong_t inode = 0;
+
+ *buf = '\0';
+ if (pbuf->vp != NULL) {
+ vattr.va_mask = AT_FSID | AT_NODEID;
+ if (VOP_GETATTR(pbuf->vp, &vattr, 0, CRED(),
+ NULL) == 0) {
+ maj = getmajor(vattr.va_fsid);
+ min = getminor(vattr.va_fsid);
+ inode = vattr.va_nodeid;
+ }
+ (void) vnodetopath(NULL, pbuf->vp, buf, buflen, CRED());
+ VN_RELE(pbuf->vp);
+ }
+
+ if (*buf != '\0') {
+ lxpr_uiobuf_printf(uiobuf,
+ "%08x-%08x %s %08x %02d:%03d %lld %s\n",
+ pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset,
+ maj, min, inode, buf);
+ } else {
+ lxpr_uiobuf_printf(uiobuf,
+ "%08x-%08x %s %08x %02d:%03d %lld\n",
+ pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset,
+ maj, min, inode);
+ }
+
+ pbuf_next = pbuf->next;
+ kmem_free(pbuf, sizeof (*pbuf));
+ pbuf = pbuf_next;
+ }
+
+ kmem_free(buf, buflen);
+}
+
+/*
+ * lxpr_read_pid_statm(): memory status file
+ */
+static void
+lxpr_read_pid_statm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+ struct as *as;
+ size_t vsize;
+ size_t rss;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_STATM);
+
+ p = lxpr_lock(lxpnp->lxpr_pid);
+ if (p == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ return;
+ }
+
+ as = p->p_as;
+
+ mutex_exit(&p->p_lock);
+
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+ vsize = btopr(as->a_resvsize);
+ rss = rm_asrss(as);
+ AS_LOCK_EXIT(as, &as->a_lock);
+
+ mutex_enter(&p->p_lock);
+ lxpr_unlock(p);
+
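+ /*
+ * Linux statm fields: size resident shared text lib data dt.
+ * We report vsize for size, rss for resident and text, and zero
+ * for the remaining fields.
+ */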
+ lxpr_uiobuf_printf(uiobuf,
+ "%lu %lu %lu %lu %lu %lu %lu\n",
+ vsize, rss, 0l, rss, 0l, 0l, 0l);
+}
+
+/*
+ * lxpr_read_pid_status(): status file
+ */
+static void
+lxpr_read_pid_status(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+ kthread_t *t;
+ user_t *up;
+ cred_t *cr;
+ const gid_t *groups;
+ int ngroups;
+ struct as *as;
+ char *status;
+ pid_t pid, ppid;
+ size_t vsize;
+ size_t rss;
+ k_sigset_t current, ignore, handle;
+ int i, lx_sig;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_STATUS);
+
+ p = lxpr_lock(lxpnp->lxpr_pid);
+ if (p == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ return;
+ }
+
+ pid = p->p_pid;
+
+ /*
+ * Convert pid to the Linux default of 1 if we're the zone's init
+ * process
+ */
+ if (pid == curproc->p_zone->zone_proc_initpid) {
+ pid = 1;
+ ppid = 0; /* parent pid for init is 0 */
+ } else {
+ /*
+ * Make sure not to reference parent PIDs that reside outside
+ * the zone
+ */
+ ppid = ((p->p_flag & SZONETOP)
+ ? curproc->p_zone->zone_zsched->p_pid : p->p_ppid);
+
+ /*
+ * Convert ppid to the Linux default of 1 if our parent is the
+ * zone's init process
+ */
+ if (ppid == curproc->p_zone->zone_proc_initpid)
+ ppid = 1;
+ }
+
+ t = prchoose(p);
+ if (t != NULL) {
+ switch (t->t_state) {
+ case TS_SLEEP:
+ status = "S (sleeping)";
+ break;
+ case TS_RUN:
+ case TS_ONPROC:
+ status = "R (running)";
+ break;
+ case TS_ZOMB:
+ status = "Z (zombie)";
+ break;
+ case TS_STOPPED:
+ status = "T (stopped)";
+ break;
+ default:
+ status = "! (unknown)";
+ break;
+ }
+ thread_unlock(t);
+ } else {
+ /*
+ * There is a hole in the exit code where a proc can have
+ * no threads but has not yet been flagged SZOMB. We
+ * assume it is about to become a zombie.
+ */
+ status = "Z (zombie)";
+ }
+
+ up = PTOU(p);
+ mutex_enter(&p->p_crlock);
+ crhold(cr = p->p_cred);
+ mutex_exit(&p->p_crlock);
+
+ lxpr_uiobuf_printf(uiobuf,
+ "Name:\t%s\n"
+ "State:\t%s\n"
+ "Tgid:\t%d\n"
+ "Pid:\t%d\n"
+ "PPid:\t%d\n"
+ "TracerPid:\t%d\n"
+ "Uid:\t%u\t%u\t%u\t%u\n"
+ "Gid:\t%u\t%u\t%u\t%u\n"
+ "FDSize:\t%d\n"
+ "Groups:\t",
+ up->u_comm,
+ status,
+ pid, /* thread group id - same as pid */
+ pid,
+ ppid,
+ 0,
+ crgetruid(cr), crgetuid(cr), crgetsuid(cr), crgetuid(cr),
+ crgetrgid(cr), crgetgid(cr), crgetsgid(cr), crgetgid(cr),
+ p->p_fno_ctl);
+
+ ngroups = crgetngroups(cr);
+ groups = crgetgroups(cr);
+ for (i = 0; i < ngroups; i++) {
+ lxpr_uiobuf_printf(uiobuf,
+ "%u ",
+ groups[i]);
+ }
+ crfree(cr);
+
+ as = p->p_as;
+ if ((p->p_stat != SZOMB) && !(p->p_flag & SSYS) && (as != &kas)) {
+ mutex_exit(&p->p_lock);
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+ vsize = as->a_resvsize;
+ rss = rm_asrss(as);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ mutex_enter(&p->p_lock);
+
+ lxpr_uiobuf_printf(uiobuf,
+ "\n"
+ "VmSize:\t%8lu kB\n"
+ "VmLck:\t%8lu kB\n"
+ "VmRSS:\t%8lu kB\n"
+ "VmData:\t%8lu kB\n"
+ "VmStk:\t%8lu kB\n"
+ "VmExe:\t%8lu kB\n"
+ "VmLib:\t%8lu kB",
+ btok(vsize),
+ 0l,
+ ptok(rss),
+ 0l,
+ btok(p->p_stksize),
+ ptok(rss),
+ 0l);
+ }
+
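+ /*
+ * Translate the native pending, ignored and caught signal sets
+ * into Linux signal numbering using lxpr_sigmap.
+ */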
+ sigemptyset(&current);
+ sigemptyset(&ignore);
+ sigemptyset(&handle);
+
+ for (i = 1; i < NSIG; i++) {
+ lx_sig = lxpr_sigmap[i];
+
+ if ((lx_sig > 0) && (lx_sig <= LX_NSIG)) {
+ if (sigismember(&p->p_sig, i))
+ sigaddset(&current, lx_sig);
+
+ if (up->u_signal[i - 1] == SIG_IGN)
+ sigaddset(&ignore, lx_sig);
+ else if (up->u_signal[i - 1] != SIG_DFL)
+ sigaddset(&handle, lx_sig);
+ }
+ }
+
+ lxpr_uiobuf_printf(uiobuf,
+ "\n"
+ "SigPnd:\t%08x%08x\n"
+ "SigBlk:\t%08x%08x\n"
+ "SigIgn:\t%08x%08x\n"
+ "SigCgt:\t%08x%08x\n"
+ "CapInh:\t%016x\n"
+ "CapPrm:\t%016x\n"
+ "CapEff:\t%016x\n",
+ current.__sigbits[1], current.__sigbits[0],
+ 0, 0, /* signals blocked on per thread basis */
+ ignore.__sigbits[1], ignore.__sigbits[0],
+ handle.__sigbits[1], handle.__sigbits[0],
+ /* Can't do anything with linux capabilities */
+ 0,
+ 0,
+ 0);
+
+ lxpr_unlock(p);
+}
+
+
+/*
+ * lxpr_read_pid_stat(): pid stat file
+ */
+static void
+lxpr_read_pid_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ proc_t *p;
+ kthread_t *t;
+ struct as *as;
+ char stat;
+ pid_t pid, ppid, pgpid, spid;
+ gid_t psgid;
+ dev_t psdev;
+ size_t rss, vsize;
+ int nice, pri;
+ caddr_t wchan;
+ processorid_t cpu;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_STAT);
+
+ p = lxpr_lock(lxpnp->lxpr_pid);
+ if (p == NULL) {
+ lxpr_uiobuf_seterr(uiobuf, EINVAL);
+ return;
+ }
+
+ pid = p->p_pid;
+
+ /*
+ * Set Linux defaults if we're the zone's init process
+ */
+ if (pid == curproc->p_zone->zone_proc_initpid) {
+ pid = 1; /* PID for init */
+ ppid = 0; /* parent PID for init is 0 */
+ pgpid = 0; /* process group for init is 0 */
+ psgid = (gid_t)-1; /* credential GID for init is -1 */
+ spid = 0; /* session id for init is 0 */
+ psdev = 0; /* session device for init is 0 */
+ } else {
+ /*
+ * Make sure not to reference parent PIDs that reside outside
+ * the zone
+ */
+ ppid = ((p->p_flag & SZONETOP) ?
+ curproc->p_zone->zone_zsched->p_pid : p->p_ppid);
+
+ /*
+ * Convert ppid to the Linux default of 1 if our parent is the
+ * zone's init process
+ */
+ if (ppid == curproc->p_zone->zone_proc_initpid)
+ ppid = 1;
+
+ pgpid = p->p_pgrp;
+
+ mutex_enter(&p->p_splock);
+ mutex_enter(&p->p_sessp->s_lock);
+ spid = p->p_sessp->s_sid;
+ psdev = p->p_sessp->s_dev;
+ if (p->p_sessp->s_cred)
+ psgid = crgetgid(p->p_sessp->s_cred);
+ else
+ psgid = crgetgid(p->p_cred);
+
+ mutex_exit(&p->p_sessp->s_lock);
+ mutex_exit(&p->p_splock);
+ }
+
+ t = prchoose(p);
+ if (t != NULL) {
+ switch (t->t_state) {
+ case TS_SLEEP:
+ stat = 'S'; break;
+ case TS_RUN:
+ case TS_ONPROC:
+ stat = 'R'; break;
+ case TS_ZOMB:
+ stat = 'Z'; break;
+ case TS_STOPPED:
+ stat = 'T'; break;
+ default:
+ stat = '!'; break;
+ }
+
+ if (CL_DONICE(t, NULL, 0, &nice) != 0)
+ nice = 0;
+
+ pri = t->t_pri;
+ wchan = t->t_wchan;
+ cpu = t->t_cpu->cpu_id;
+ thread_unlock(t);
+ } else {
+ /* Only zombies have no threads */
+ stat = 'Z';
+ nice = 0;
+ pri = 0;
+ wchan = 0;
+ cpu = 0;
+ }
+ as = p->p_as;
+ mutex_exit(&p->p_lock);
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+ vsize = as->a_resvsize;
+ rss = rm_asrss(as);
+ AS_LOCK_EXIT(as, &as->a_lock);
+ mutex_enter(&p->p_lock);
+
+ lxpr_uiobuf_printf(uiobuf,
+ "%d (%s) %c %d %d %d %d %d "
+ "%lu %lu %lu %lu %lu "
+ "%lu %lu %ld %ld "
+ "%d %d %d "
+ "%lu "
+ "%lu "
+ "%lu %ld %llu "
+ "%lu %lu %u "
+ "%lu %lu "
+ "%lu %lu %lu %lu "
+ "%lu "
+ "%lu %lu "
+ "%d "
+ "%d"
+ "\n",
+ pid, PTOU(p)->u_comm, stat, ppid, pgpid, spid, psdev, psgid,
+ 0l, 0l, 0l, 0l, 0l, /* flags, minflt, cminflt, majflt, cmajflt */
+ p->p_utime, p->p_stime, p->p_cutime, p->p_cstime,
+ pri, nice, p->p_lwpcnt,
+ 0l, /* itrealvalue (time before next SIGALRM) */
+ PTOU(p)->u_ticks,
+ vsize, rss, p->p_vmem_ctl,
+ 0l, 0l, USRSTACK, /* startcode, endcode, startstack */
+ 0l, 0l, /* kstkesp, kstkeip */
+ 0l, 0l, 0l, 0l, /* signal, blocked, sigignore, sigcatch */
+ wchan,
+ 0l, 0l, /* nswap, cnswap */
+ 0, /* exit_signal */
+ cpu);
+
+ lxpr_unlock(p);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_arp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_dev(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lxpr_uiobuf_printf(uiobuf, "Inter-| Receive "
+ " | Transmit\n");
+ lxpr_uiobuf_printf(uiobuf, " face |bytes packets errs drop fifo"
+ " frame compressed multicast|bytes packets errs drop fifo"
+ " colls carrier compressed\n");
+
+ /*
+ * Data about each interface should go here, but that shouldn't be added
+ * unless there is an lxproc reader that actually makes use of it (and
+ * doesn't need anything else that we refuse to provide)...
+ */
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_dev_mcast(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_igmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_ip_mr_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_ip_mr_vif(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_mcfilter(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_netstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_raw(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_rpc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_rt_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_sockstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_snmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_tcp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_udp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_unix(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+/*
+ * lxpr_read_kmsg(): read the contents of the kernel message queue. We
+ * translate this into the reception of console messages for this zone; each
+ * read copies out a single zone console message, or blocks until the next one
+ * is produced.
+ */
+
+#define LX_KMSG_PRI "<0>"
+
+static void
+lxpr_read_kmsg(lxpr_node_t *lxpnp, struct lxpr_uiobuf *uiobuf, ldi_handle_t lh)
+{
+ mblk_t *mp;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_KMSG);
+
+ if (ldi_getmsg(lh, &mp, NULL) == 0) {
+ /*
+ * lxproc doesn't like successive reads to the same file
+ * descriptor unless we do an explicit rewind each time.
+ */
+ lxpr_uiobuf_seek(uiobuf, 0);
+
+ lxpr_uiobuf_printf(uiobuf, "%s%s", LX_KMSG_PRI,
+ mp->b_cont->b_rptr);
+
+ freemsg(mp);
+ }
+}
+
+/*
+ * lxpr_read_loadavg(): read the contents of the "loadavg" file. We do just
+ * enough for uptime and other simple lxproc readers to work
+ */
+extern int nthread;
+
+static void
+lxpr_read_loadavg(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ ulong_t avenrun1;
+ ulong_t avenrun5;
+ ulong_t avenrun15;
+ ulong_t avenrun1_cs;
+ ulong_t avenrun5_cs;
+ ulong_t avenrun15_cs;
+ int loadavg[3];
+ int *loadbuf;
+ cpupart_t *cp;
+ zone_t *zone = LXPTOZ(lxpnp);
+
+ uint_t nrunnable = 0;
+ rctl_qty_t nlwps;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_LOADAVG);
+
+ mutex_enter(&cpu_lock);
+
+ /*
+ * Need to add up values over all CPU partitions. If pools are active,
+ * only report the values of the zone's partition, which by definition
+ * includes the current CPU.
+ */
+ if (pool_pset_enabled()) {
+ psetid_t psetid = zone_pset_get(curproc->p_zone);
+
+ ASSERT(curproc->p_zone != &zone0);
+ cp = CPU->cpu_part;
+
+ nrunnable = cp->cp_nrunning + cp->cp_nrunnable;
+ (void) cpupart_get_loadavg(psetid, &loadavg[0], 3);
+ loadbuf = &loadavg[0];
+ } else {
+ cp = cp_list_head;
+ do {
+ nrunnable += cp->cp_nrunning + cp->cp_nrunnable;
+ } while ((cp = cp->cp_next) != cp_list_head);
+
+ loadbuf = zone == global_zone ?
+ &avenrun[0] : zone->zone_avenrun;
+ }
+
+ /*
+ * If we're in the non-global zone, we'll report the total number of
+ * LWPs in the zone for the "nproc" parameter of /proc/loadavg,
+ * otherwise we will just use nthread (which will include kernel threads,
+ * but should be good enough for lxproc).
+ */
+ nlwps = zone == global_zone ? nthread : zone->zone_nlwps;
+
+ mutex_exit(&cpu_lock);
+
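+ /*
+ * The load averages are fixed-point values with FSHIFT fractional
+ * bits; split each into its whole part and hundredths.
+ */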
+ avenrun1 = loadbuf[0] >> FSHIFT;
+ avenrun1_cs = ((loadbuf[0] & (FSCALE-1)) * 100) >> FSHIFT;
+ avenrun5 = loadbuf[1] >> FSHIFT;
+ avenrun5_cs = ((loadbuf[1] & (FSCALE-1)) * 100) >> FSHIFT;
+ avenrun15 = loadbuf[2] >> FSHIFT;
+ avenrun15_cs = ((loadbuf[2] & (FSCALE-1)) * 100) >> FSHIFT;
+
+ lxpr_uiobuf_printf(uiobuf,
+ "%ld.%02d %ld.%02d %ld.%02d %d/%d %d\n",
+ avenrun1, avenrun1_cs,
+ avenrun5, avenrun5_cs,
+ avenrun15, avenrun15_cs,
+ nrunnable, nlwps, 0);
+}
+
+/*
+ * lxpr_read_meminfo(): read the contents of the "meminfo" file.
+ */
+static void
+lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ zone_t *zone = LXPTOZ(lxpnp);
+ int global = zone == global_zone;
+ long total_mem, free_mem, total_swap, used_swap;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_MEMINFO);
+
+ if (global || zone->zone_phys_mem_ctl == UINT64_MAX) {
+ total_mem = physmem * PAGESIZE;
+ free_mem = freemem * PAGESIZE;
+ } else {
+ total_mem = zone->zone_phys_mem_ctl;
+ free_mem = zone->zone_phys_mem_ctl - zone->zone_phys_mem;
+ }
+
+ if (global || zone->zone_max_swap_ctl == UINT64_MAX) {
+ total_swap = k_anoninfo.ani_max * PAGESIZE;
+ used_swap = k_anoninfo.ani_phys_resv * PAGESIZE;
+ } else {
+ mutex_enter(&zone->zone_mem_lock);
+ total_swap = zone->zone_max_swap_ctl;
+ used_swap = zone->zone_max_swap;
+ mutex_exit(&zone->zone_mem_lock);
+ }
+
+ lxpr_uiobuf_printf(uiobuf,
+ " total: used: free: shared: buffers: cached:\n"
+ "Mem: %8lu %8lu %8lu %8u %8u %8u\n"
+ "Swap: %8lu %8lu %8lu\n"
+ "MemTotal: %8lu kB\n"
+ "MemFree: %8lu kB\n"
+ "MemShared: %8u kB\n"
+ "Buffers: %8u kB\n"
+ "Cached: %8u kB\n"
+ "SwapCached:%8u kB\n"
+ "Active: %8u kB\n"
+ "Inactive: %8u kB\n"
+ "HighTotal: %8u kB\n"
+ "HighFree: %8u kB\n"
+ "LowTotal: %8u kB\n"
+ "LowFree: %8u kB\n"
+ "SwapTotal: %8lu kB\n"
+ "SwapFree: %8lu kB\n",
+ total_mem, total_mem - free_mem, free_mem, 0, 0, 0,
+ total_swap, used_swap, total_swap - used_swap,
+ btok(total_mem), /* MemTotal */
+ btok(free_mem), /* MemFree */
+ 0, /* MemShared */
+ 0, /* Buffers */
+ 0, /* Cached */
+ 0, /* SwapCached */
+ 0, /* Active */
+ 0, /* Inactive */
+ 0, /* HighTotal */
+ 0, /* HighFree */
+ btok(total_mem), /* LowTotal */
+ btok(free_mem), /* LowFree */
+ btok(total_swap), /* SwapTotal */
+ btok(total_swap - used_swap)); /* SwapFree */
+}
+
+/*
+ * lxpr_read_mounts():
+ */
+/* ARGSUSED */
+static void
+lxpr_read_mounts(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ struct vfs *vfsp;
+ struct vfs *vfslist;
+ zone_t *zone = LXPTOZ(lxpnp);
+ struct print_data {
+ refstr_t *vfs_mntpt;
+ refstr_t *vfs_resource;
+ uint_t vfs_flag;
+ int vfs_fstype;
+ struct print_data *next;
+ } *print_head = NULL;
+ struct print_data **print_tail = &print_head;
+ struct print_data *printp;
+
+ vfs_list_read_lock();
+
+ if (zone == global_zone) {
+ vfsp = vfslist = rootvfs;
+ } else {
+ vfsp = vfslist = zone->zone_vfslist;
+ /*
+ * If the zone has a root entry, it will be the first in
+ * the list. If it doesn't, we conjure one up.
+ */
+ if (vfslist == NULL || strcmp(refstr_value(vfsp->vfs_mntpt),
+ zone->zone_rootpath) != 0) {
+ struct vfs *tvfsp;
+ /*
+ * The root of the zone is not a mount point. The vfs
+ * we want to report is that of the zone's root vnode.
+ */
+ tvfsp = zone->zone_rootvp->v_vfsp;
+
+ lxpr_uiobuf_printf(uiobuf,
+ "/ / %s %s 0 0\n",
+ vfssw[tvfsp->vfs_fstype].vsw_name,
+ tvfsp->vfs_flag & VFS_RDONLY ? "ro" : "rw");
+
+ }
+ if (vfslist == NULL) {
+ vfs_list_unlock();
+ return;
+ }
+ }
+
+ /*
+ * Later on we have to do a lookupname(), which can end up causing
+ * another vfs_list_read_lock() to be taken, which can lead to a
+ * deadlock. To avoid this, we extract the data we need into a local
+ * list; we can then walk that list without holding vfs_list_read_lock().
+ * We keep the list in the same order as the vfs_list.
+ */
+ do {
+ /* Skip mounts we shouldn't show */
+ if (vfsp->vfs_flag & VFS_NOMNTTAB) {
+ goto nextfs;
+ }
+
+ printp = kmem_alloc(sizeof (*printp), KM_SLEEP);
+ refstr_hold(vfsp->vfs_mntpt);
+ printp->vfs_mntpt = vfsp->vfs_mntpt;
+ refstr_hold(vfsp->vfs_resource);
+ printp->vfs_resource = vfsp->vfs_resource;
+ printp->vfs_flag = vfsp->vfs_flag;
+ printp->vfs_fstype = vfsp->vfs_fstype;
+ printp->next = NULL;
+
+ *print_tail = printp;
+ print_tail = &printp->next;
+
+nextfs:
+ vfsp = (zone == global_zone) ?
+ vfsp->vfs_next : vfsp->vfs_zone_next;
+
+ } while (vfsp != vfslist);
+
+ vfs_list_unlock();
+
+ /*
+ * now we can run through what we've extracted without holding
+ * vfs_list_read_lock()
+ */
+ printp = print_head;
+ while (printp != NULL) {
+ struct print_data *printp_next;
+ const char *resource;
+ char *mntpt;
+ struct vnode *vp;
+ int error;
+
+ mntpt = (char *)refstr_value(printp->vfs_mntpt);
+ resource = refstr_value(printp->vfs_resource);
+
+ if (mntpt != NULL && mntpt[0] != '\0')
+ mntpt = ZONE_PATH_TRANSLATE(mntpt, zone);
+ else
+ mntpt = "-";
+
+ error = lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
+
+ if (error != 0)
+ goto nextp;
+
+ if (!(vp->v_flag & VROOT)) {
+ VN_RELE(vp);
+ goto nextp;
+ }
+ VN_RELE(vp);
+
+ if (resource != NULL && resource[0] != '\0') {
+ if (resource[0] == '/') {
+ resource = ZONE_PATH_VISIBLE(resource, zone) ?
+ ZONE_PATH_TRANSLATE(resource, zone) :
+ mntpt;
+ }
+ } else {
+ resource = "-";
+ }
+
+ lxpr_uiobuf_printf(uiobuf,
+ "%s %s %s %s 0 0\n",
+ resource, mntpt, vfssw[printp->vfs_fstype].vsw_name,
+ printp->vfs_flag & VFS_RDONLY ? "ro" : "rw");
+
+nextp:
+ printp_next = printp->next;
+ refstr_rele(printp->vfs_mntpt);
+ refstr_rele(printp->vfs_resource);
+ kmem_free(printp, sizeof (*printp));
+ printp = printp_next;
+
+ }
+}
+
+/*
+ * lxpr_read_partitions():
+ *
+ * We don't support partitions in a local zone because it requires access to
+ * physical devices. But we need to fake up enough of the file to show that we
+ * have no partitions.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_partitions(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lxpr_uiobuf_printf(uiobuf,
+ "major minor #blocks name rio rmerge rsect ruse "
+ "wio wmerge wsect wuse running use aveq\n\n");
+}
+
+/*
+ * lxpr_read_version(): read the contents of the "version" file. Note that
+ * we don't lie here -- we don't pretend that we're Linux. If lxproc is to
+ * be used in a Linux-branded zone, there will need to be a mount option to
+ * indicate that Linux should be more fully mimicked.
+ */
+/* ARGSUSED */
+static void
+lxpr_read_version(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ lxpr_uiobuf_printf(uiobuf,
+ "%s version %s (%s version %d.%d.%d) "
+ "#%s SMP %s\n",
+ utsname.sysname, utsname.release,
+#if defined(__GNUC__)
+ "gcc",
+ __GNUC__,
+ __GNUC_MINOR__,
+ __GNUC_PATCHLEVEL__,
+#else
+ "Sun C",
+ __SUNPRO_C / 0x100,
+ (__SUNPRO_C & 0xff) / 0x10,
+ __SUNPRO_C & 0xf,
+#endif
+ utsname.version,
+ "00:00:00 00/00/00");
+}
+
+/*
+ * lxpr_read_stat(): read the contents of the "stat" file.
+ *
+ */
+/* ARGSUSED */
+static void
+lxpr_read_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ cpu_t *cp, *cpstart;
+ int pools_enabled;
+ ulong_t idle_cum = 0;
+ ulong_t sys_cum = 0;
+ ulong_t user_cum = 0;
+ ulong_t irq_cum = 0;
+ ulong_t cpu_nrunnable_cum = 0;
+ ulong_t w_io_cum = 0;
+
+ ulong_t pgpgin_cum = 0;
+ ulong_t pgpgout_cum = 0;
+ ulong_t pgswapout_cum = 0;
+ ulong_t pgswapin_cum = 0;
+ ulong_t intr_cum = 0;
+ ulong_t pswitch_cum = 0;
+ ulong_t forks_cum = 0;
+ hrtime_t msnsecs[NCMSTATES];
+
+ /* temporary variable since scalehrtime modifies data in place */
+ hrtime_t tmptime;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_STAT);
+
+ mutex_enter(&cpu_lock);
+ pools_enabled = pool_pset_enabled();
+
+ /* Calculate cumulative stats */
+ cp = cpstart = CPU->cpu_part->cp_cpulist;
+ do {
+ int i;
+
+ /*
+ * Don't count CPUs that aren't even in the system
+ * or aren't up yet.
+ */
+ if ((cp->cpu_flags & CPU_EXISTS) == 0) {
+ continue;
+ }
+
+ get_cpu_mstate(cp, msnsecs);
+
+ idle_cum += NSEC_TO_TICK(msnsecs[CMS_IDLE]);
+ sys_cum += NSEC_TO_TICK(msnsecs[CMS_SYSTEM]);
+ user_cum += NSEC_TO_TICK(msnsecs[CMS_USER]);
+
+ pgpgin_cum += CPU_STATS(cp, vm.pgpgin);
+ pgpgout_cum += CPU_STATS(cp, vm.pgpgout);
+ pgswapin_cum += CPU_STATS(cp, vm.pgswapin);
+ pgswapout_cum += CPU_STATS(cp, vm.pgswapout);
+
+ cpu_nrunnable_cum += cp->cpu_disp->disp_nrunnable;
+ w_io_cum += CPU_STATS(cp, sys.iowait);
+ for (i = 0; i < NCMSTATES; i++) {
+ tmptime = cp->cpu_intracct[i];
+ scalehrtime(&tmptime);
+ irq_cum += NSEC_TO_TICK(tmptime);
+ }
+
+ for (i = 0; i < PIL_MAX; i++)
+ intr_cum += CPU_STATS(cp, sys.intr[i]);
+
+ pswitch_cum += CPU_STATS(cp, sys.pswitch);
+ forks_cum += CPU_STATS(cp, sys.sysfork);
+ forks_cum += CPU_STATS(cp, sys.sysvfork);
+
+ if (pools_enabled)
+ cp = cp->cpu_next_part;
+ else
+ cp = cp->cpu_next;
+ } while (cp != cpstart);
+
+ lxpr_uiobuf_printf(uiobuf, "cpu %lu %lu %lu %lu %lu %lu %lu\n",
+ user_cum, 0L, sys_cum, idle_cum, 0L, irq_cum, 0L);
+
+ /* Do per processor stats */
+ do {
+ int i;
+
+ ulong_t idle_ticks;
+ ulong_t sys_ticks;
+ ulong_t user_ticks;
+ ulong_t irq_ticks = 0;
+
+ /*
+ * Don't count CPUs that aren't even in the system
+ * or aren't up yet.
+ */
+ if ((cp->cpu_flags & CPU_EXISTS) == 0) {
+ continue;
+ }
+
+ get_cpu_mstate(cp, msnsecs);
+
+ idle_ticks = NSEC_TO_TICK(msnsecs[CMS_IDLE]);
+ sys_ticks = NSEC_TO_TICK(msnsecs[CMS_SYSTEM]);
+ user_ticks = NSEC_TO_TICK(msnsecs[CMS_USER]);
+
+ for (i = 0; i < NCMSTATES; i++) {
+ tmptime = cp->cpu_intracct[i];
+ scalehrtime(&tmptime);
+ irq_ticks += NSEC_TO_TICK(tmptime);
+ }
+
+ lxpr_uiobuf_printf(uiobuf,
+ "cpu%d %lu %lu %lu %lu %lu %lu %lu\n",
+ cp->cpu_id, user_ticks, 0L, sys_ticks, idle_ticks,
+ 0L, irq_ticks, 0L);
+
+ if (pools_enabled)
+ cp = cp->cpu_next_part;
+ else
+ cp = cp->cpu_next;
+ } while (cp != cpstart);
+
+ mutex_exit(&cpu_lock);
+
+ lxpr_uiobuf_printf(uiobuf,
+ "page %lu %lu\n"
+ "swap %lu %lu\n"
+ "intr %lu\n"
+ "ctxt %lu\n"
+ "btime %lu\n"
+ "processes %lu\n"
+ "procs_running %lu\n"
+ "procs_blocked %lu\n",
+ pgpgin_cum, pgpgout_cum,
+ pgswapin_cum, pgswapout_cum,
+ intr_cum,
+ pswitch_cum,
+ boot_time,
+ forks_cum,
+ cpu_nrunnable_cum,
+ w_io_cum);
+}
+
+/*
+ * lxpr_read_uptime(): read the contents of the "uptime" file.
+ *
+ * format is: "%.2lf %.2lf", uptime_secs, idle_secs
+ * Use fixed point arithmetic to get 2 decimal places
+ */
+/* ARGSUSED */
+static void
+lxpr_read_uptime(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ cpu_t *cp, *cpstart;
+ int pools_enabled;
+ ulong_t idle_cum = 0;
+ ulong_t cpu_count = 0;
+ ulong_t idle_s;
+ ulong_t idle_cs;
+ ulong_t up_s;
+ ulong_t up_cs;
+ hrtime_t birthtime;
+ hrtime_t centi_sec = 10000000; /* 10^7 */
+
+ ASSERT(lxpnp->lxpr_type == LXPR_UPTIME);
+
+ /* Calculate cumulative stats */
+ mutex_enter(&cpu_lock);
+ pools_enabled = pool_pset_enabled();
+
+ cp = cpstart = CPU;
+ do {
+ /*
+ * Don't count CPUs that aren't even in the system
+ * or aren't up yet.
+ */
+ if ((cp->cpu_flags & CPU_EXISTS) == 0) {
+ continue;
+ }
+
+ idle_cum += CPU_STATS(cp, sys.cpu_ticks_idle);
+ idle_cum += CPU_STATS(cp, sys.cpu_ticks_wait);
+ cpu_count += 1;
+
+ if (pools_enabled)
+ cp = cp->cpu_next_part;
+ else
+ cp = cp->cpu_next;
+ } while (cp != cpstart);
+ mutex_exit(&cpu_lock);
+
+ /* Getting the Zone zsched process startup time */
+ birthtime = LXPTOZ(lxpnp)->zone_zsched->p_mstart;
+ up_cs = (gethrtime() - birthtime) / centi_sec;
+ up_s = up_cs / 100;
+ up_cs %= 100;
+
+ ASSERT(cpu_count > 0);
+ idle_cum /= cpu_count;
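+ /* idle_cum is in clock ticks; convert to seconds and hundredths using hz */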
+ idle_s = idle_cum / hz;
+ idle_cs = idle_cum % hz;
+ idle_cs *= 100;
+ idle_cs /= hz;
+
+ lxpr_uiobuf_printf(uiobuf,
+ "%ld.%02d %ld.%02d\n", up_s, up_cs, idle_s, idle_cs);
+}
+
+static const char *amd_x_edx[] = {
+ NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, "syscall",
+ NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, "mp",
+ "nx", NULL, "mmxext", NULL,
+ NULL, NULL, NULL, NULL,
+ NULL, "lm", "3dnowext", "3dnow"
+};
+
+static const char *amd_x_ecx[] = {
+ "lahf_lm", NULL, "svm", NULL,
+ "altmovcr8"
+};
+
+static const char *tm_x_edx[] = {
+ "recovery", "longrun", NULL, "lrti"
+};
+
+/*
+ * Intel calls no-execute "xd" in its docs, but Linux still reports it as "nx."
+ */
+static const char *intc_x_edx[] = {
+ NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, "syscall",
+ NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL,
+ "nx", NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL,
+ NULL, "lm", NULL, NULL
+};
+
+static const char *intc_edx[] = {
+ "fpu", "vme", "de", "pse",
+ "tsc", "msr", "pae", "mce",
+ "cx8", "apic", NULL, "sep",
+ "mtrr", "pge", "mca", "cmov",
+ "pat", "pse36", "pn", "clflush",
+ NULL, "dts", "acpi", "mmx",
+ "fxsr", "sse", "sse2", "ss",
+ "ht", "tm", "ia64", "pbe"
+};
+
+/*
+ * "sse3" on linux is called "pni" (Prescott New Instructions).
+ */
+static const char *intc_ecx[] = {
+ "pni", NULL, NULL, "monitor",
+ "ds_cpl", NULL, NULL, "est",
+ "tm2", NULL, "cid", NULL,
+ NULL, "cx16", "xtpr"
+};
+
+static void
+lxpr_read_cpuinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ int i;
+ uint32_t bits;
+ cpu_t *cp, *cpstart;
+ int pools_enabled;
+ const char **fp;
+ char brandstr[CPU_IDSTRLEN];
+ struct cpuid_regs cpr;
+ int maxeax;
+ int std_ecx, std_edx, ext_ecx, ext_edx;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_CPUINFO);
+
+ mutex_enter(&cpu_lock);
+ pools_enabled = pool_pset_enabled();
+
+ cp = cpstart = CPU;
+ do {
+ /*
+ * This returns the maximum eax value for standard cpuid
+ * functions in eax.
+ */
+ cpr.cp_eax = 0;
+ (void) cpuid_insn(cp, &cpr);
+ maxeax = cpr.cp_eax;
+
+ /*
+ * Get standard x86 feature flags.
+ */
+ cpr.cp_eax = 1;
+ (void) cpuid_insn(cp, &cpr);
+ std_ecx = cpr.cp_ecx;
+ std_edx = cpr.cp_edx;
+
+ /*
+ * Now get extended feature flags.
+ */
+ cpr.cp_eax = 0x80000001;
+ (void) cpuid_insn(cp, &cpr);
+ ext_ecx = cpr.cp_ecx;
+ ext_edx = cpr.cp_edx;
+
+ (void) cpuid_getbrandstr(cp, brandstr, CPU_IDSTRLEN);
+
+ lxpr_uiobuf_printf(uiobuf,
+ "processor\t: %d\n"
+ "vendor_id\t: %s\n"
+ "cpu family\t: %d\n"
+ "model\t\t: %d\n"
+ "model name\t: %s\n"
+ "stepping\t: %d\n"
+ "cpu MHz\t\t: %u.%03u\n",
+ cp->cpu_id, cpuid_getvendorstr(cp), cpuid_getfamily(cp),
+ cpuid_getmodel(cp), brandstr, cpuid_getstep(cp),
+ (uint32_t)(cpu_freq_hz / 1000000),
+ ((uint32_t)(cpu_freq_hz / 1000)) % 1000);
+
+ lxpr_uiobuf_printf(uiobuf, "cache size\t: %u KB\n",
+ getl2cacheinfo(cp, NULL, NULL, NULL) / 1024);
+
+ if (is_x86_feature(x86_featureset, X86FSET_HTT)) {
+ /*
+ * 'siblings' is used for HT-style threads
+ */
+ lxpr_uiobuf_printf(uiobuf,
+ "physical id\t: %lu\n"
+ "siblings\t: %u\n",
+ pg_plat_hw_instance_id(cp, PGHW_CHIP),
+ cpuid_get_ncpu_per_chip(cp));
+ }
+
+ /*
+ * Since we're relatively picky about running on older hardware,
+ * we can be somewhat cavalier about the answers to these ones.
+ *
+ * In fact, given the hardware we support, we just say:
+ *
+ * fdiv_bug : no (if we're on a 64-bit kernel)
+ * hlt_bug : no
+ * f00f_bug : no
+ * coma_bug : no
+ * wp : yes (write protect in supervisor mode)
+ */
+ lxpr_uiobuf_printf(uiobuf,
+ "fdiv_bug\t: %s\n"
+ "hlt_bug \t: no\n"
+ "f00f_bug\t: no\n"
+ "coma_bug\t: no\n"
+ "fpu\t\t: %s\n"
+ "fpu_exception\t: %s\n"
+ "cpuid level\t: %d\n"
+ "flags\t\t:",
+#if defined(__i386)
+ fpu_pentium_fdivbug ? "yes" : "no",
+#else
+ "no",
+#endif /* __i386 */
+ fpu_exists ? "yes" : "no", fpu_exists ? "yes" : "no",
+ maxeax);
+
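+ /*
+ * Print the name of each standard edx feature whose bit is set;
+ * NULL table entries are bits we do not report.
+ */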
+ for (bits = std_edx, fp = intc_edx, i = 0;
+ i < sizeof (intc_edx) / sizeof (intc_edx[0]); fp++, i++)
+ if ((bits & (1 << i)) != 0 && *fp)
+ lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+
+ /*
+ * name additional features where appropriate
+ */
+ switch (x86_vendor) {
+ case X86_VENDOR_Intel:
+ for (bits = ext_edx, fp = intc_x_edx, i = 0;
+ i < sizeof (intc_x_edx) / sizeof (intc_x_edx[0]);
+ fp++, i++)
+ if ((bits & (1 << i)) != 0 && *fp)
+ lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+ break;
+
+ case X86_VENDOR_AMD:
+ for (bits = ext_edx, fp = amd_x_edx, i = 0;
+ i < sizeof (amd_x_edx) / sizeof (amd_x_edx[0]);
+ fp++, i++)
+ if ((bits & (1 << i)) != 0 && *fp)
+ lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+
+ for (bits = ext_ecx, fp = amd_x_ecx, i = 0;
+ i < sizeof (amd_x_ecx) / sizeof (amd_x_ecx[0]);
+ fp++, i++)
+ if ((bits & (1 << i)) != 0 && *fp)
+ lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+ break;
+
+ case X86_VENDOR_TM:
+ for (bits = ext_edx, fp = tm_x_edx, i = 0;
+ i < sizeof (tm_x_edx) / sizeof (tm_x_edx[0]);
+ fp++, i++)
+ if ((bits & (1 << i)) != 0 && *fp)
+ lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+ break;
+ default:
+ break;
+ }
+
+ for (bits = std_ecx, fp = intc_ecx, i = 0;
+ i < sizeof (intc_ecx) / sizeof (intc_ecx[0]); fp++, i++)
+ if ((bits & (1 << i)) != 0 && *fp)
+ lxpr_uiobuf_printf(uiobuf, " %s", *fp);
+
+ lxpr_uiobuf_printf(uiobuf, "\n\n");
+
+ if (pools_enabled)
+ cp = cp->cpu_next_part;
+ else
+ cp = cp->cpu_next;
+ } while (cp != cpstart);
+
+ mutex_exit(&cpu_lock);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_fd(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_FD_FD);
+ lxpr_uiobuf_seterr(uiobuf, EFAULT);
+}
+
+/*
+ * lxpr_getattr(): Vnode operation for VOP_GETATTR()
+ */
+static int
+lxpr_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+ caller_context_t *ct)
+{
+ register lxpr_node_t *lxpnp = VTOLXP(vp);
+ lxpr_nodetype_t type = lxpnp->lxpr_type;
+ extern uint_t nproc;
+ int error;
+
+ /*
+ * Return the attributes of the underlying vnode if ATTR_REAL is set,
+ * but keep fd files with the symlink permissions.
+ */
+ if (lxpnp->lxpr_realvp != NULL && (flags & ATTR_REAL)) {
+ vnode_t *rvp = lxpnp->lxpr_realvp;
+
+ /*
+ * withhold attribute information from all but the owner or root
+ */
+ if ((error = VOP_ACCESS(rvp, 0, 0, cr, ct)) != 0) {
+ return (error);
+ }
+
+ /*
+ * now get its attributes
+ */
+ if ((error = VOP_GETATTR(rvp, vap, flags, cr, ct)) != 0) {
+ return (error);
+ }
+
+ /*
+ * if it's a file in lx /proc/pid/fd/xx then set its
+ * mode and keep it looking like a symlink
+ */
+ if (type == LXPR_PID_FD_FD) {
+ vap->va_mode = lxpnp->lxpr_mode;
+ vap->va_type = vp->v_type;
+ vap->va_size = 0;
+ vap->va_nlink = 1;
+ }
+ return (0);
+ }
+
+ /* Default attributes, that may be overridden below */
+ bzero(vap, sizeof (*vap));
+ vap->va_atime = vap->va_mtime = vap->va_ctime = lxpnp->lxpr_time;
+ vap->va_nlink = 1;
+ vap->va_type = vp->v_type;
+ vap->va_mode = lxpnp->lxpr_mode;
+ vap->va_fsid = vp->v_vfsp->vfs_dev;
+ vap->va_blksize = DEV_BSIZE;
+ vap->va_uid = lxpnp->lxpr_uid;
+ vap->va_gid = lxpnp->lxpr_gid;
+ vap->va_nodeid = lxpnp->lxpr_ino;
+
+ switch (type) {
+ case LXPR_PROCDIR:
+ vap->va_nlink = nproc + 2 + PROCDIRFILES;
+ vap->va_size = (nproc + 2 + PROCDIRFILES) * LXPR_SDSIZE;
+ break;
+ case LXPR_PIDDIR:
+ vap->va_nlink = PIDDIRFILES;
+ vap->va_size = PIDDIRFILES * LXPR_SDSIZE;
+ break;
+ case LXPR_SELF:
+ vap->va_uid = crgetruid(curproc->p_cred);
+ vap->va_gid = crgetrgid(curproc->p_cred);
+ break;
+ default:
+ break;
+ }
+
+ vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size);
+ return (0);
+}
+
+/*
+ * lxpr_access(): Vnode operation for VOP_ACCESS()
+ */
+static int
+lxpr_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
+{
+ lxpr_node_t *lxpnp = VTOLXP(vp);
+ int shift = 0;
+ proc_t *tp;
+
+ /* lx /proc is a read only file system */
+ if (mode & VWRITE)
+ return (EROFS);
+
+ /*
+ * If this is a restricted file, check access permissions.
+ */
+ switch (lxpnp->lxpr_type) {
+ case LXPR_PIDDIR:
+ return (0);
+ case LXPR_PID_CURDIR:
+ case LXPR_PID_ENV:
+ case LXPR_PID_EXE:
+ case LXPR_PID_MAPS:
+ case LXPR_PID_MEM:
+ case LXPR_PID_ROOTDIR:
+ case LXPR_PID_FDDIR:
+ case LXPR_PID_FD_FD:
+ if ((tp = lxpr_lock(lxpnp->lxpr_pid)) == NULL)
+ return (ENOENT);
+ if (tp != curproc && secpolicy_proc_access(cr) != 0 &&
+ priv_proc_cred_perm(cr, tp, NULL, mode) != 0) {
+ lxpr_unlock(tp);
+ return (EACCES);
+ }
+ lxpr_unlock(tp);
+ default:
+ break;
+ }
+
+ if (lxpnp->lxpr_realvp != NULL) {
+ /*
+ * For these we use the underlying vnode's accessibility.
+ */
+ return (VOP_ACCESS(lxpnp->lxpr_realvp, mode, flags, cr, ct));
+ }
+
+ /* If user is root allow access regardless of permission bits */
+ if (secpolicy_proc_access(cr) == 0)
+ return (0);
+
+ /*
+ * Access check is based on only one of owner, group, public. If not
+ * owner, then check group. If not a member of the group, then check
+ * public access.
+ */
+ if (crgetuid(cr) != lxpnp->lxpr_uid) {
+ shift += 3;
+ if (!groupmember((uid_t)lxpnp->lxpr_gid, cr))
+ shift += 3;
+ }
+
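+ /*
+ * The requested bits are in the owner position, so shift the file
+ * mode left by 3 (group) or 6 (other) to line the relevant
+ * permission triplet up with the request.
+ */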
+ mode &= ~(lxpnp->lxpr_mode << shift);
+
+ if (mode == 0)
+ return (0);
+
+ return (EACCES);
+}
+
+/* ARGSUSED */
+static vnode_t *
+lxpr_lookup_not_a_dir(vnode_t *dp, char *comp)
+{
+ return (NULL);
+}
+
+/*
+ * lxpr_lookup(): Vnode operation for VOP_LOOKUP()
+ */
+/* ARGSUSED */
+static int
+lxpr_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp,
+ int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
+ int *direntflags, pathname_t *realpnp)
+{
+ lxpr_node_t *lxpnp = VTOLXP(dp);
+ lxpr_nodetype_t type = lxpnp->lxpr_type;
+ int error;
+
+ ASSERT(dp->v_type == VDIR);
+ ASSERT(type < LXPR_NFILES);
+
+ /*
+ * we should never get here because the lookup
+ * is done on the realvp for these nodes
+ */
+ ASSERT(type != LXPR_PID_FD_FD &&
+ type != LXPR_PID_CURDIR &&
+ type != LXPR_PID_ROOTDIR);
+
+ /*
+ * restrict lookup permission to owner or root
+ */
+ if ((error = lxpr_access(dp, VEXEC, 0, cr, ct)) != 0) {
+ return (error);
+ }
+
+ /*
+ * Just return the parent vnode if that's where we are trying to go.
+ */
+ if (strcmp(comp, "..") == 0) {
+ VN_HOLD(lxpnp->lxpr_parent);
+ *vpp = lxpnp->lxpr_parent;
+ return (0);
+ }
+
+ /*
+ * Special handling for directory searches. Note: null component name
+ * denotes that the current directory is being searched.
+ */
+ if ((dp->v_type == VDIR) && (*comp == '\0' || strcmp(comp, ".") == 0)) {
+ VN_HOLD(dp);
+ *vpp = dp;
+ return (0);
+ }
+
+ *vpp = (lxpr_lookup_function[type](dp, comp));
+ return ((*vpp == NULL) ? ENOENT : 0);
+}
+
+/*
+ * Do a sequential search on the given directory table
+ */
+static vnode_t *
+lxpr_lookup_common(vnode_t *dp, char *comp, proc_t *p,
+ lxpr_dirent_t *dirtab, int dirtablen)
+{
+ lxpr_node_t *lxpnp;
+ int count;
+
+ for (count = 0; count < dirtablen; count++) {
+ if (strcmp(dirtab[count].d_name, comp) == 0) {
+ lxpnp = lxpr_getnode(dp, dirtab[count].d_type, p, 0);
+ dp = LXPTOV(lxpnp);
+ ASSERT(dp != NULL);
+ return (dp);
+ }
+ }
+ return (NULL);
+}
+
+static vnode_t *
+lxpr_lookup_piddir(vnode_t *dp, char *comp)
+{
+ proc_t *p;
+
+ ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PIDDIR);
+
+ p = lxpr_lock(VTOLXP(dp)->lxpr_pid);
+ if (p == NULL)
+ return (NULL);
+
+ dp = lxpr_lookup_common(dp, comp, p, piddir, PIDDIRFILES);
+
+ lxpr_unlock(p);
+
+ return (dp);
+}
+
+/*
+ * Lookup one of the process's open files.
+ */
+static vnode_t *
+lxpr_lookup_fddir(vnode_t *dp, char *comp)
+{
+ lxpr_node_t *dlxpnp = VTOLXP(dp);
+ lxpr_node_t *lxpnp;
+ vnode_t *vp = NULL;
+ proc_t *p;
+ file_t *fp;
+ uint_t fd;
+ int c;
+ uf_entry_t *ufp;
+ uf_info_t *fip;
+
+ ASSERT(dlxpnp->lxpr_type == LXPR_PID_FDDIR);
+
+ /*
+ * convert the string rendition of the filename
+ * to a file descriptor
+ */
+ fd = 0;
+ while ((c = *comp++) != '\0') {
+ int ofd;
+ if (c < '0' || c > '9')
+ return (NULL);
+
+ ofd = fd;
+ fd = 10*fd + c - '0';
+ /* integer overflow */
+ if (fd / 10 != ofd)
+ return (NULL);
+ }
+
+ /*
+ * get the proc to work with and lock it
+ */
+ p = lxpr_lock(dlxpnp->lxpr_pid);
+	if (p == NULL)
+ return (NULL);
+
+ /*
+ * If the process is a zombie or system process
+ * it can't have any open files.
+ */
+ if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas)) {
+ lxpr_unlock(p);
+ return (NULL);
+ }
+
+ /*
+ * get us a fresh node/vnode
+ */
+ lxpnp = lxpr_getnode(dp, LXPR_PID_FD_FD, p, fd);
+
+ /*
+ * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from
+ * going away while we dereference into fi_list.
+ */
+ mutex_exit(&p->p_lock);
+
+ /*
+ * get open file info
+ */
+ fip = (&(p)->p_user.u_finfo);
+ mutex_enter(&fip->fi_lock);
+
+ if (fd < fip->fi_nfiles) {
+ UF_ENTER(ufp, fip, fd);
+ /*
+		 * ensure the fd is still kosher: it may have been
+		 * closed between the readdir and the lookup
+ */
+ if (fip->fi_list[fd].uf_file == NULL) {
+ mutex_exit(&fip->fi_lock);
+ UF_EXIT(ufp);
+ mutex_enter(&p->p_lock);
+ lxpr_unlock(p);
+ lxpr_freenode(lxpnp);
+ return (NULL);
+ }
+
+ if ((fp = ufp->uf_file) != NULL)
+ vp = fp->f_vnode;
+ UF_EXIT(ufp);
+ }
+ mutex_exit(&fip->fi_lock);
+
+ if (vp == NULL) {
+ mutex_enter(&p->p_lock);
+ lxpr_unlock(p);
+ lxpr_freenode(lxpnp);
+ return (NULL);
+ } else {
+ /*
+ * Fill in the lxpr_node so future references will be able to
+ * find the underlying vnode. The vnode is held on the realvp.
+ */
+ lxpnp->lxpr_realvp = vp;
+ VN_HOLD(lxpnp->lxpr_realvp);
+ }
+
+ mutex_enter(&p->p_lock);
+ lxpr_unlock(p);
+ dp = LXPTOV(lxpnp);
+ ASSERT(dp != NULL);
+
+ return (dp);
+}
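
The digit loop at the top of lxpr_lookup_fddir() converts the component name to a descriptor number while rejecting non-digits and multiplication overflow. A stand-alone sketch of that conversion (hypothetical helper, not part of the module):

#include <stdio.h>

/*
 * Illustrative only: convert a name such as "17" to a descriptor number the
 * way lxpr_lookup_fddir() does, rejecting non-digits and overflow by
 * checking that the multiply-by-ten step is reversible.
 */
static int
parse_fd(const char *comp, unsigned int *fdp)
{
	unsigned int fd = 0;
	int c;

	while ((c = *comp++) != '\0') {
		unsigned int ofd;

		if (c < '0' || c > '9')
			return (-1);
		ofd = fd;
		fd = 10 * fd + (c - '0');
		if (fd / 10 != ofd)	/* overflowed */
			return (-1);
	}
	*fdp = fd;
	return (0);
}

int
main(void)
{
	unsigned int fd = 0;
	int ok = parse_fd("42", &fd);

	printf("\"42\" -> %d (fd %u)\n", ok, fd);
	printf("\"99999999999\" -> %d\n", parse_fd("99999999999", &fd));
	return (0);
}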
+
+static vnode_t *
+lxpr_lookup_netdir(vnode_t *dp, char *comp)
+{
+ ASSERT(VTOLXP(dp)->lxpr_type == LXPR_NETDIR);
+
+ dp = lxpr_lookup_common(dp, comp, NULL, netdir, NETDIRFILES);
+
+ return (dp);
+}
+
+static vnode_t *
+lxpr_lookup_procdir(vnode_t *dp, char *comp)
+{
+ ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PROCDIR);
+
+ /*
+ * We know all the names of files & dirs in our file system structure
+ * except those that are pid names. These change as pids are created/
+ * deleted etc., so we just look for a number as the first char to see
+	 * if we are doing pid lookups.
+ *
+ * Don't need to check for "self" as it is implemented as a symlink
+ */
+ if (*comp >= '0' && *comp <= '9') {
+ pid_t pid = 0;
+ lxpr_node_t *lxpnp = NULL;
+ proc_t *p;
+ int c;
+
+ while ((c = *comp++) != '\0')
+ pid = 10 * pid + c - '0';
+
+ /*
+ * Can't continue if the process is still loading or it doesn't
+ * really exist yet (or maybe it just died!)
+ */
+ p = lxpr_lock(pid);
+ if (p == NULL)
+ return (NULL);
+
+ if (secpolicy_basic_procinfo(CRED(), p, curproc) != 0) {
+ lxpr_unlock(p);
+ return (NULL);
+ }
+
+ /*
+ * allocate and fill in a new lxpr node
+ */
+ lxpnp = lxpr_getnode(dp, LXPR_PIDDIR, p, 0);
+
+ lxpr_unlock(p);
+
+ dp = LXPTOV(lxpnp);
+ ASSERT(dp != NULL);
+
+ return (dp);
+ }
+
+ /* Lookup fixed names */
+ return (lxpr_lookup_common(dp, comp, NULL, lxpr_dir, PROCDIRFILES));
+}
+
+/*
+ * lxpr_readdir(): Vnode operation for VOP_READDIR()
+ */
+/* ARGSUSED */
+static int
+lxpr_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp,
+ caller_context_t *ct, int flags)
+{
+ lxpr_node_t *lxpnp = VTOLXP(dp);
+ lxpr_nodetype_t type = lxpnp->lxpr_type;
+ ssize_t uresid;
+ off_t uoffset;
+ int error;
+
+ ASSERT(dp->v_type == VDIR);
+ ASSERT(type < LXPR_NFILES);
+
+ /*
+ * we should never get here because the readdir
+ * is done on the realvp for these nodes
+ */
+ ASSERT(type != LXPR_PID_FD_FD &&
+ type != LXPR_PID_CURDIR &&
+ type != LXPR_PID_ROOTDIR);
+
+ /*
+ * restrict readdir permission to owner or root
+ */
+ if ((error = lxpr_access(dp, VREAD, 0, cr, ct)) != 0)
+ return (error);
+
+ uoffset = uiop->uio_offset;
+ uresid = uiop->uio_resid;
+
+ /* can't do negative reads */
+ if (uoffset < 0 || uresid <= 0)
+ return (EINVAL);
+
+ /* can't read directory entries that don't exist! */
+ if (uoffset % LXPR_SDSIZE)
+ return (ENOENT);
+
+ return (lxpr_readdir_function[lxpnp->lxpr_type](lxpnp, uiop, eofp));
+}
+
+/* ARGSUSED */
+static int
+lxpr_readdir_not_a_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ return (ENOTDIR);
+}
+
+/*
+ * This has the common logic for returning directory entries
+ */
+static int
+lxpr_readdir_common(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp,
+ lxpr_dirent_t *dirtab, int dirtablen)
+{
+ /* bp holds one dirent64 structure */
+ longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)];
+ dirent64_t *dirent = (dirent64_t *)bp;
+ ssize_t oresid; /* save a copy for testing later */
+ ssize_t uresid;
+
+ oresid = uiop->uio_resid;
+
+ /* clear out the dirent buffer */
+ bzero(bp, sizeof (bp));
+
+ /*
+ * Satisfy user request
+ */
+ while ((uresid = uiop->uio_resid) > 0) {
+ int dirindex;
+ off_t uoffset;
+ int reclen;
+ int error;
+
+ uoffset = uiop->uio_offset;
+ dirindex = (uoffset / LXPR_SDSIZE) - 2;
+
+ if (uoffset == 0) {
+
+ dirent->d_ino = lxpnp->lxpr_ino;
+ dirent->d_name[0] = '.';
+ dirent->d_name[1] = '\0';
+ reclen = DIRENT64_RECLEN(1);
+
+ } else if (uoffset == LXPR_SDSIZE) {
+
+ dirent->d_ino = lxpr_parentinode(lxpnp);
+ dirent->d_name[0] = '.';
+ dirent->d_name[1] = '.';
+ dirent->d_name[2] = '\0';
+ reclen = DIRENT64_RECLEN(2);
+
+ } else if (dirindex >= 0 && dirindex < dirtablen) {
+ int slen = strlen(dirtab[dirindex].d_name);
+
+ dirent->d_ino = lxpr_inode(dirtab[dirindex].d_type,
+ lxpnp->lxpr_pid, 0);
+
+ VERIFY(slen < LXPNSIZ);
+ (void) strcpy(dirent->d_name, dirtab[dirindex].d_name);
+ reclen = DIRENT64_RECLEN(slen);
+
+ } else {
+ /* Run out of table entries */
+ if (eofp) {
+ *eofp = 1;
+ }
+ return (0);
+ }
+
+ dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE);
+ dirent->d_reclen = (ushort_t)reclen;
+
+ /*
+		 * if the size of the data to transfer is greater than
+		 * that requested, then we can't do it in this transfer.
+ */
+ if (reclen > uresid) {
+ /*
+ * Error if no entries have been returned yet.
+ */
+ if (uresid == oresid) {
+ return (EINVAL);
+ }
+ break;
+ }
+
+ /*
+ * uiomove() updates both uiop->uio_resid and uiop->uio_offset
+ * by the same amount. But we want uiop->uio_offset to change
+ * in increments of LXPR_SDSIZE, which is different from the
+ * number of bytes being returned to the user. So we set
+ * uiop->uio_offset separately, ignoring what uiomove() does.
+ */
+ if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ,
+ uiop)) != 0)
+ return (error);
+
+ uiop->uio_offset = uoffset + LXPR_SDSIZE;
+ }
+
+ /* Have run out of space, but could have just done last table entry */
+ if (eofp) {
+ *eofp =
+ (uiop->uio_offset >= ((dirtablen+2) * LXPR_SDSIZE)) ? 1 : 0;
+ }
+ return (0);
+}
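
Because every entry is pretended to occupy LXPR_SDSIZE bytes, lxpr_readdir_common() can derive the slot being asked for directly from the uio offset. A small sketch of that mapping (illustrative only):

#include <stdio.h>

#define	LXPR_SDSIZE	16	/* same pretend directory entry size */

/*
 * Illustrative only: map a uio offset to the directory slot it denotes,
 * mirroring the arithmetic in lxpr_readdir_common().  Offset 0 is ".",
 * LXPR_SDSIZE is "..", and later slots index the fixed dirent table.
 */
static int
slot_for_offset(long uoffset)
{
	return ((int)(uoffset / LXPR_SDSIZE) - 2);	/* -2 = ".", -1 = ".." */
}

int
main(void)
{
	long off;

	for (off = 0; off <= 4 * LXPR_SDSIZE; off += LXPR_SDSIZE)
		printf("offset %3ld -> slot %d\n", off, slot_for_offset(off));
	return (0);
}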
+
+
+static int
+lxpr_readdir_procdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ /* bp holds one dirent64 structure */
+ longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)];
+ dirent64_t *dirent = (dirent64_t *)bp;
+ ssize_t oresid; /* save a copy for testing later */
+ ssize_t uresid;
+ off_t uoffset;
+ zoneid_t zoneid;
+ pid_t pid;
+ int error;
+ int ceof;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PROCDIR);
+
+ oresid = uiop->uio_resid;
+ zoneid = LXPTOZ(lxpnp)->zone_id;
+
+ /*
+ * We return directory entries in the order: "." and ".." then the
+ * unique lxproc files, then the directories corresponding to the
+ * running processes. We have defined this as the ordering because
+	 * it allows us to more easily keep track of where we are between
+	 * calls to getdents(). If the number of processes changes between
+	 * calls, we can't lose track of where we are in the lxproc files.
+ */
+
+ /* Do the fixed entries */
+ error = lxpr_readdir_common(lxpnp, uiop, &ceof, lxpr_dir,
+ PROCDIRFILES);
+
+ /* Finished if we got an error or if we couldn't do all the table */
+ if (error != 0 || ceof == 0)
+ return (error);
+
+ /* clear out the dirent buffer */
+ bzero(bp, sizeof (bp));
+
+ /* Do the process entries */
+ while ((uresid = uiop->uio_resid) > 0) {
+ proc_t *p;
+ int len;
+ int reclen;
+ int i;
+
+ uoffset = uiop->uio_offset;
+
+ /*
+ * Stop when entire proc table has been examined.
+ */
+ i = (uoffset / LXPR_SDSIZE) - 2 - PROCDIRFILES;
+ if (i < 0 || i >= v.v_proc) {
+ /* Run out of table entries */
+ if (eofp) {
+ *eofp = 1;
+ }
+ return (0);
+ }
+ mutex_enter(&pidlock);
+
+ /*
+ * Skip indices for which there is no pid_entry, PIDs for
+ * which there is no corresponding process, a PID of 0,
+ * and anything the security policy doesn't allow
+ * us to look at.
+ */
+ if ((p = pid_entry(i)) == NULL || p->p_stat == SIDL ||
+ p->p_pid == 0 ||
+ secpolicy_basic_procinfo(CRED(), p, curproc) != 0) {
+ mutex_exit(&pidlock);
+ goto next;
+ }
+ mutex_exit(&pidlock);
+
+ /*
+		 * Convert pid to the Linux default of 1 if this is the zone's
+ * init process, otherwise use the value from the proc
+ * structure
+ */
+ pid = ((p->p_pid != curproc->p_zone->zone_proc_initpid) ?
+ p->p_pid : 1);
+
+ /*
+ * If this /proc was mounted in the global zone, view
+ * all procs; otherwise, only view zone member procs.
+ */
+ if (zoneid != GLOBAL_ZONEID && p->p_zone->zone_id != zoneid) {
+ goto next;
+ }
+
+ ASSERT(p->p_stat != 0);
+
+ dirent->d_ino = lxpr_inode(LXPR_PIDDIR, pid, 0);
+ len = snprintf(dirent->d_name, LXPNSIZ, "%d", pid);
+ ASSERT(len < LXPNSIZ);
+ reclen = DIRENT64_RECLEN(len);
+
+ dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE);
+ dirent->d_reclen = (ushort_t)reclen;
+
+ /*
+		 * if the size of the data to transfer is greater than
+		 * that requested, then we can't do it in this transfer.
+ */
+ if (reclen > uresid) {
+ /*
+ * Error if no entries have been returned yet.
+ */
+ if (uresid == oresid)
+ return (EINVAL);
+ break;
+ }
+
+ /*
+ * uiomove() updates both uiop->uio_resid and uiop->uio_offset
+ * by the same amount. But we want uiop->uio_offset to change
+ * in increments of LXPR_SDSIZE, which is different from the
+ * number of bytes being returned to the user. So we set
+		 * uiop->uio_offset separately, at the bottom of each loop
+		 * iteration, ignoring what uiomove() does.
+ */
+ if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ,
+ uiop)) != 0)
+ return (error);
+next:
+ uiop->uio_offset = uoffset + LXPR_SDSIZE;
+ }
+
+ if (eofp != NULL) {
+ *eofp = (uiop->uio_offset >=
+ ((v.v_proc + PROCDIRFILES + 2) * LXPR_SDSIZE)) ? 1 : 0;
+ }
+
+ return (0);
+}
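
From user level, the ordering described above is simply what a directory walk of the mount returns. A sketch of such a consumer, assuming an lxproc instance is mounted at /proc inside the zone (the mount point is an assumption):

#include <stdio.h>
#include <dirent.h>
#include <ctype.h>

int
main(void)
{
	DIR *d = opendir("/proc");	/* assumed lxproc mount point */
	struct dirent *de;

	if (d == NULL) {
		perror("opendir");
		return (1);
	}

	/* fixed lxproc files come back first, then one directory per pid */
	while ((de = readdir(d)) != NULL) {
		if (isdigit((unsigned char)de->d_name[0]))
			printf("pid  %s\n", de->d_name);
		else
			printf("file %s\n", de->d_name);
	}
	(void) closedir(d);
	return (0);
}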
+
+static int
+lxpr_readdir_piddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ proc_t *p;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PIDDIR);
+
+ /* can't read its contents if it died */
+ mutex_enter(&pidlock);
+
+ p = prfind((lxpnp->lxpr_pid == 1) ?
+ curproc->p_zone->zone_proc_initpid : lxpnp->lxpr_pid);
+
+ if (p == NULL || p->p_stat == SIDL) {
+ mutex_exit(&pidlock);
+ return (ENOENT);
+ }
+ mutex_exit(&pidlock);
+
+ return (lxpr_readdir_common(lxpnp, uiop, eofp, piddir, PIDDIRFILES));
+}
+
+static int
+lxpr_readdir_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ ASSERT(lxpnp->lxpr_type == LXPR_NETDIR);
+ return (lxpr_readdir_common(lxpnp, uiop, eofp, netdir, NETDIRFILES));
+}
+
+static int
+lxpr_readdir_fddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp)
+{
+ /* bp holds one dirent64 structure */
+ longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)];
+ dirent64_t *dirent = (dirent64_t *)bp;
+ ssize_t oresid; /* save a copy for testing later */
+ ssize_t uresid;
+ off_t uoffset;
+ int error;
+ int ceof;
+ proc_t *p;
+ int fddirsize = -1;
+ uf_info_t *fip;
+
+ ASSERT(lxpnp->lxpr_type == LXPR_PID_FDDIR);
+
+ oresid = uiop->uio_resid;
+
+ /* can't read its contents if it died */
+ p = lxpr_lock(lxpnp->lxpr_pid);
+ if (p == NULL)
+ return (ENOENT);
+
+ if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas))
+ fddirsize = 0;
+
+ /*
+ * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from
+ * going away while we iterate over its fi_list.
+ */
+ mutex_exit(&p->p_lock);
+
+ /* Get open file info */
+ fip = (&(p)->p_user.u_finfo);
+ mutex_enter(&fip->fi_lock);
+
+ if (fddirsize == -1)
+ fddirsize = fip->fi_nfiles;
+
+ /* Do the fixed entries (in this case just "." & "..") */
+ error = lxpr_readdir_common(lxpnp, uiop, &ceof, 0, 0);
+
+ /* Finished if we got an error or if we couldn't do all the table */
+ if (error != 0 || ceof == 0)
+ goto out;
+
+ /* clear out the dirent buffer */
+ bzero(bp, sizeof (bp));
+
+ /*
+ * Loop until user's request is satisfied or until
+ * all file descriptors have been examined.
+ */
+ for (; (uresid = uiop->uio_resid) > 0;
+ uiop->uio_offset = uoffset + LXPR_SDSIZE) {
+ int reclen;
+ int fd;
+ int len;
+
+ uoffset = uiop->uio_offset;
+
+ /*
+ * Stop at the end of the fd list
+ */
+ fd = (uoffset / LXPR_SDSIZE) - 2;
+ if (fd < 0 || fd >= fddirsize) {
+ if (eofp) {
+ *eofp = 1;
+ }
+ goto out;
+ }
+
+ if (fip->fi_list[fd].uf_file == NULL)
+ continue;
+
+ dirent->d_ino = lxpr_inode(LXPR_PID_FD_FD, lxpnp->lxpr_pid, fd);
+ len = snprintf(dirent->d_name, LXPNSIZ, "%d", fd);
+ ASSERT(len < LXPNSIZ);
+ reclen = DIRENT64_RECLEN(len);
+
+ dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE);
+ dirent->d_reclen = (ushort_t)reclen;
+
+ if (reclen > uresid) {
+ /*
+ * Error if no entries have been returned yet.
+ */
+ if (uresid == oresid)
+ error = EINVAL;
+ goto out;
+ }
+
+ if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ,
+ uiop)) != 0)
+ goto out;
+ }
+
+ if (eofp != NULL) {
+ *eofp =
+ (uiop->uio_offset >= ((fddirsize+2) * LXPR_SDSIZE)) ? 1 : 0;
+ }
+
+out:
+ mutex_exit(&fip->fi_lock);
+ mutex_enter(&p->p_lock);
+ lxpr_unlock(p);
+ return (error);
+}
+
+
+/*
+ * lxpr_readlink(): Vnode operation for VOP_READLINK()
+ */
+/* ARGSUSED */
+static int
+lxpr_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct)
+{
+ char bp[MAXPATHLEN + 1];
+ size_t buflen = sizeof (bp);
+ lxpr_node_t *lxpnp = VTOLXP(vp);
+ vnode_t *rvp = lxpnp->lxpr_realvp;
+ pid_t pid;
+ int error = 0;
+
+ /* must be a symbolic link file */
+ if (vp->v_type != VLNK)
+ return (EINVAL);
+
+ /* Try to produce a symlink name for anything that has a realvp */
+ if (rvp != NULL) {
+ if ((error = lxpr_access(vp, VREAD, 0, CRED(), ct)) != 0)
+ return (error);
+ if ((error = vnodetopath(NULL, rvp, bp, buflen, CRED())) != 0)
+ return (error);
+ } else {
+ switch (lxpnp->lxpr_type) {
+ case LXPR_SELF:
+ /*
+ * Convert pid to the Linux default of 1 if we're the
+ * zone's init process
+ */
+ pid = ((curproc->p_pid !=
+ curproc->p_zone->zone_proc_initpid)
+ ? curproc->p_pid : 1);
+
+ /*
+ * Don't need to check result as every possible int
+ * will fit within MAXPATHLEN bytes.
+ */
+ (void) snprintf(bp, buflen, "%d", pid);
+ break;
+ case LXPR_PID_CURDIR:
+ case LXPR_PID_ROOTDIR:
+ case LXPR_PID_EXE:
+ return (EACCES);
+ default:
+ /*
+			 * Return an error so that nothing treats the
+			 * symlink as empty and hence "."
+ */
+ return (EINVAL);
+ }
+ }
+
+ /* copy the link data to user space */
+ return (uiomove(bp, strlen(bp), UIO_READ, uiop));
+}
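
For LXPR_SELF the link text produced above is just the reader's pid, remapped to 1 for the zone's init. A user-level sketch of reading it (again assuming the mount lives at /proc):

#include <stdio.h>
#include <unistd.h>
#include <limits.h>

int
main(void)
{
	char buf[PATH_MAX];
	ssize_t n = readlink("/proc/self", buf, sizeof (buf) - 1);

	if (n < 0) {
		perror("readlink");
		return (1);
	}
	buf[n] = '\0';

	/* prints this process's pid, or "1" if it is the zone's init */
	printf("/proc/self -> %s\n", buf);
	return (0);
}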
+
+/*
+ * lxpr_inactive(): Vnode operation for VOP_INACTIVE()
+ * Vnode is no longer referenced, deallocate the file
+ * and all its resources.
+ */
+/* ARGSUSED */
+static void
+lxpr_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+ lxpr_freenode(VTOLXP(vp));
+}
+
+/*
+ * lxpr_sync(): Vnode operation for VOP_SYNC()
+ */
+static int
+lxpr_sync()
+{
+ /*
+ * Nothing to sync but this function must never fail
+ */
+ return (0);
+}
+
+/*
+ * lxpr_cmp(): Vnode operation for VOP_CMP()
+ */
+static int
+lxpr_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
+{
+ vnode_t *rvp;
+
+ while (vn_matchops(vp1, lxpr_vnodeops) &&
+ (rvp = VTOLXP(vp1)->lxpr_realvp) != NULL) {
+ vp1 = rvp;
+ }
+
+ while (vn_matchops(vp2, lxpr_vnodeops) &&
+ (rvp = VTOLXP(vp2)->lxpr_realvp) != NULL) {
+ vp2 = rvp;
+ }
+
+ if (vn_matchops(vp1, lxpr_vnodeops) || vn_matchops(vp2, lxpr_vnodeops))
+ return (vp1 == vp2);
+
+ return (VOP_CMP(vp1, vp2, ct));
+}
+
+/*
+ * lxpr_realvp(): Vnode operation for VOP_REALVP()
+ */
+static int
+lxpr_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
+{
+ vnode_t *rvp;
+
+ if ((rvp = VTOLXP(vp)->lxpr_realvp) != NULL) {
+ vp = rvp;
+ if (VOP_REALVP(vp, &rvp, ct) == 0)
+ vp = rvp;
+ }
+
+ *vpp = vp;
+ return (0);
+}
diff --git a/usr/src/uts/common/fs/lxproc/lxproc.h b/usr/src/uts/common/fs/lxproc/lxproc.h
new file mode 100644
index 0000000000..e3718372e0
--- /dev/null
+++ b/usr/src/uts/common/fs/lxproc/lxproc.h
@@ -0,0 +1,277 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc. All rights reserved.
+ */
+
+#ifdef _LXPROC_BRANDED_H
+#error Attempted to include native lxproc.h after branded lx_proc.h
+#endif
+
+#ifndef _LXPROC_H
+#define _LXPROC_H
+#define _LXPROC_NATIVE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * lxproc.h: declarations, data structures and macros for lxprocfs
+ */
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/policy.h>
+#include <sys/debug.h>
+#include <sys/dirent.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/pathname.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/user.h>
+#include <sys/t_lock.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/statvfs.h>
+#include <sys/cmn_err.h>
+#include <sys/zone.h>
+#include <sys/uio.h>
+#include <sys/utsname.h>
+#include <sys/dnlc.h>
+#include <sys/atomic.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <vm/as.h>
+#include <vm/anon.h>
+
+#define LX_SIGHUP 1
+#define LX_SIGINT 2
+#define LX_SIGQUIT 3
+#define LX_SIGILL 4
+#define LX_SIGTRAP 5
+#define LX_SIGABRT 6
+#define LX_SIGIOT 6
+#define LX_SIGBUS 7
+#define LX_SIGFPE 8
+#define LX_SIGKILL 9
+#define LX_SIGUSR1 10
+#define LX_SIGSEGV 11
+#define LX_SIGUSR2 12
+#define LX_SIGPIPE 13
+#define LX_SIGALRM 14
+#define LX_SIGTERM 15
+#define LX_SIGSTKFLT 16
+#define LX_SIGCHLD 17
+#define LX_SIGCONT 18
+#define LX_SIGSTOP 19
+#define LX_SIGTSTP 20
+#define LX_SIGTTIN 21
+#define LX_SIGTTOU 22
+#define LX_SIGURG 23
+#define LX_SIGXCPU 24
+#define LX_SIGXFSZ 25
+#define LX_SIGVTALRM 26
+#define LX_SIGPROF 27
+#define LX_SIGWINCH 28
+#define LX_SIGIO 29
+#define LX_SIGPOLL LX_SIGIO
+#define LX_SIGPWR 30
+#define LX_SIGSYS 31
+#define LX_SIGUNUSED 31
+
+#define LX_NSIG 64 /* Linux _NSIG */
+
+#define LX_SIGRTMIN 32
+#define LX_SIGRTMAX LX_NSIG
+
+/*
+ * Convert a vnode into an lxpr_mnt_t
+ */
+#define VTOLXPM(vp) ((lxpr_mnt_t *)(vp)->v_vfsp->vfs_data)
+
+/*
+ * convert a vnode into an lxpr_node
+ */
+#define VTOLXP(vp) ((lxpr_node_t *)(vp)->v_data)
+
+/*
+ * convert a lxprnode into a vnode
+ */
+#define LXPTOV(lxpnp) ((lxpnp)->lxpr_vnode)
+
+/*
+ * convert a lxpr_node into zone for fs
+ */
+#define LXPTOZ(lxpnp) \
+ (((lxpr_mnt_t *)(lxpnp)->lxpr_vnode->v_vfsp->vfs_data)->lxprm_zone)
+
+#define LXPNSIZ 256 /* max size of lx /proc file name entries */
+
+/*
+ * Pretend that a directory entry takes 16 bytes
+ */
+#define LXPR_SDSIZE 16
+
+/*
+ * Node/file types for lx /proc files
+ * (directories and files contained therein).
+ */
+typedef enum lxpr_nodetype {
+ LXPR_PROCDIR, /* /proc */
+ LXPR_PIDDIR, /* /proc/<pid> */
+ LXPR_PID_CMDLINE, /* /proc/<pid>/cmdline */
+ LXPR_PID_CPU, /* /proc/<pid>/cpu */
+ LXPR_PID_CURDIR, /* /proc/<pid>/cwd */
+ LXPR_PID_ENV, /* /proc/<pid>/environ */
+ LXPR_PID_EXE, /* /proc/<pid>/exe */
+ LXPR_PID_MAPS, /* /proc/<pid>/maps */
+ LXPR_PID_MEM, /* /proc/<pid>/mem */
+ LXPR_PID_ROOTDIR, /* /proc/<pid>/root */
+ LXPR_PID_STAT, /* /proc/<pid>/stat */
+ LXPR_PID_STATM, /* /proc/<pid>/statm */
+ LXPR_PID_STATUS, /* /proc/<pid>/status */
+ LXPR_PID_FDDIR, /* /proc/<pid>/fd */
+ LXPR_PID_FD_FD, /* /proc/<pid>/fd/nn */
+ LXPR_CMDLINE, /* /proc/cmdline */
+ LXPR_CPUINFO, /* /proc/cpuinfo */
+ LXPR_DEVICES, /* /proc/devices */
+ LXPR_DMA, /* /proc/dma */
+ LXPR_FILESYSTEMS, /* /proc/filesystems */
+ LXPR_INTERRUPTS, /* /proc/interrupts */
+ LXPR_IOPORTS, /* /proc/ioports */
+ LXPR_KCORE, /* /proc/kcore */
+ LXPR_KMSG, /* /proc/kmsg */
+ LXPR_LOADAVG, /* /proc/loadavg */
+ LXPR_MEMINFO, /* /proc/meminfo */
+ LXPR_MOUNTS, /* /proc/mounts */
+ LXPR_NETDIR, /* /proc/net */
+ LXPR_NET_ARP, /* /proc/net/arp */
+ LXPR_NET_DEV, /* /proc/net/dev */
+ LXPR_NET_DEV_MCAST, /* /proc/net/dev_mcast */
+ LXPR_NET_IGMP, /* /proc/net/igmp */
+ LXPR_NET_IP_MR_CACHE, /* /proc/net/ip_mr_cache */
+ LXPR_NET_IP_MR_VIF, /* /proc/net/ip_mr_vif */
+ LXPR_NET_MCFILTER, /* /proc/net/mcfilter */
+ LXPR_NET_NETSTAT, /* /proc/net/netstat */
+ LXPR_NET_RAW, /* /proc/net/raw */
+ LXPR_NET_ROUTE, /* /proc/net/route */
+ LXPR_NET_RPC, /* /proc/net/rpc */
+ LXPR_NET_RT_CACHE, /* /proc/net/rt_cache */
+ LXPR_NET_SOCKSTAT, /* /proc/net/sockstat */
+ LXPR_NET_SNMP, /* /proc/net/snmp */
+ LXPR_NET_STAT, /* /proc/net/stat */
+ LXPR_NET_TCP, /* /proc/net/tcp */
+ LXPR_NET_UDP, /* /proc/net/udp */
+ LXPR_NET_UNIX, /* /proc/net/unix */
+ LXPR_PARTITIONS, /* /proc/partitions */
+ LXPR_SELF, /* /proc/self */
+ LXPR_STAT, /* /proc/stat */
+ LXPR_UPTIME, /* /proc/uptime */
+ LXPR_VERSION, /* /proc/version */
+ LXPR_NFILES /* number of lx /proc file types */
+} lxpr_nodetype_t;
+
+/*
+ * Number of fds allowed for in the inode number calculation
+ * per process (if a process has more fds than this, inode numbers
+ * may be duplicated)
+ */
+#define LXPR_FD_PERPROC 2000
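
The real inode encoding lives in lxpr_inode(), which is not part of this header; the sketch below is a purely hypothetical scheme, shown only to illustrate why a process with more than LXPR_FD_PERPROC open files could end up with duplicated inode numbers:

#include <stdio.h>

#define	LXPR_FD_PERPROC	2000

/*
 * HYPOTHETICAL encoding, not the real lxpr_inode(): give each process a
 * window of LXPR_FD_PERPROC inode numbers for its fd entries.
 */
static unsigned long
fd_ino(long pid, int fd)
{
	return ((unsigned long)pid * LXPR_FD_PERPROC + fd + 1);
}

int
main(void)
{
	/* fd 2000 of pid 5 aliases fd 0 of pid 6 under this scheme */
	printf("%lu %lu\n", fd_ino(5, 2000), fd_ino(6, 0));
	return (0);
}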
+
+/*
+ * external dirent characteristics
+ */
+#define LXPRMAXNAMELEN 14
+typedef struct {
+ lxpr_nodetype_t d_type;
+ char d_name[LXPRMAXNAMELEN];
+} lxpr_dirent_t;
+
+/*
+ * This is the lxprocfs private data object
+ * which is attached to v_data in the vnode structure
+ */
+typedef struct lxpr_node {
+ lxpr_nodetype_t lxpr_type; /* type of this node */
+ vnode_t *lxpr_vnode; /* vnode for the node */
+ vnode_t *lxpr_parent; /* parent directory */
+ vnode_t *lxpr_realvp; /* real vnode, file in dirs */
+ timestruc_t lxpr_time; /* creation etc time for file */
+ mode_t lxpr_mode; /* file mode bits */
+ uid_t lxpr_uid; /* file owner */
+ gid_t lxpr_gid; /* file group owner */
+ pid_t lxpr_pid; /* pid of proc referred to */
+ ino_t lxpr_ino; /* node id */
+} lxpr_node_t;
+
+struct zone; /* forward declaration */
+
+/*
+ * This is the lxprocfs private data object
+ * which is attached to vfs_data in the vfs structure
+ */
+typedef struct lxpr_mnt {
+ lxpr_node_t *lxprm_node; /* node at root of proc mount */
+ struct zone *lxprm_zone; /* zone for this mount */
+ ldi_ident_t lxprm_li; /* ident for ldi */
+} lxpr_mnt_t;
+
+extern vnodeops_t *lxpr_vnodeops;
+extern int nproc_highbit; /* highbit(v.v_nproc) */
+
+typedef struct mounta mounta_t;
+
+extern void lxpr_initnodecache();
+extern void lxpr_fininodecache();
+extern void lxpr_initrootnode(lxpr_node_t **, vfs_t *);
+extern ino_t lxpr_inode(lxpr_nodetype_t, pid_t, int);
+extern ino_t lxpr_parentinode(lxpr_node_t *);
+extern lxpr_node_t *lxpr_getnode(vnode_t *, lxpr_nodetype_t, proc_t *, int);
+extern void lxpr_freenode(lxpr_node_t *);
+
+typedef struct lxpr_uiobuf lxpr_uiobuf_t;
+extern lxpr_uiobuf_t *lxpr_uiobuf_new(uio_t *);
+extern void lxpr_uiobuf_free(lxpr_uiobuf_t *);
+extern int lxpr_uiobuf_flush(lxpr_uiobuf_t *);
+extern void lxpr_uiobuf_seek(lxpr_uiobuf_t *, offset_t);
+extern void lxpr_uiobuf_write(lxpr_uiobuf_t *, const char *, size_t);
+extern void lxpr_uiobuf_printf(lxpr_uiobuf_t *, const char *, ...);
+extern void lxpr_uiobuf_seterr(lxpr_uiobuf_t *, int);
+
+proc_t *lxpr_lock(pid_t);
+void lxpr_unlock(proc_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LXPROC_H */
diff --git a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c
index 207a708771..2176dcb9de 100644
--- a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c
+++ b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2013, Joyent, Inc. All rights reserved.
*/
/*
diff --git a/usr/src/uts/common/fs/nfs/nfs3_vnops.c b/usr/src/uts/common/fs/nfs/nfs3_vnops.c
index b7354c168a..cd1e08d5d7 100644
--- a/usr/src/uts/common/fs/nfs/nfs3_vnops.c
+++ b/usr/src/uts/common/fs/nfs/nfs3_vnops.c
@@ -29,7 +29,7 @@
*/
/*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
@@ -3353,10 +3353,9 @@ nfs3rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
if (nvp)
vnevent_rename_dest(nvp, ndvp, nnm, ct);
- if (odvp != ndvp)
- vnevent_rename_dest_dir(ndvp, ct);
ASSERT(ovp != NULL);
vnevent_rename_src(ovp, odvp, onm, ct);
+ vnevent_rename_dest_dir(ndvp, ovp, nnm, ct);
}
if (nvp) {
diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c b/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c
index ce0c9485a6..3ee41939ac 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c
@@ -22,6 +22,7 @@
/*
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc.
*/
#include <sys/systm.h>
@@ -178,12 +179,12 @@ pseudo_exportfs(vnode_t *vp, fid_t *fid, struct exp_visible *vis_head,
kex = &exi->exi_export;
kex->ex_flags = EX_PSEUDO;
- vpathlen = vp->v_path ? strlen(vp->v_path) : 0;
+ vpathlen = strlen(vp->v_path);
kex->ex_pathlen = vpathlen + strlen(PSEUDOFS_SUFFIX);
kex->ex_path = kmem_alloc(kex->ex_pathlen + 1, KM_SLEEP);
if (vpathlen)
- (void) strcpy(kex->ex_path, vp->v_path);
+ (void) strncpy(kex->ex_path, vp->v_path, vpathlen);
(void) strcpy(kex->ex_path + vpathlen, PSEUDOFS_SUFFIX);
/* Transfer the secinfo data from exdata to this new pseudo node */
diff --git a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c
index 151cb62403..55f6c95289 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c
@@ -22,6 +22,7 @@
/*
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2013, Joyent, Inc. All rights reserved.
*/
/*
diff --git a/usr/src/uts/common/fs/nfs/nfs4_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_vnops.c
index d6bf384a8b..2b3fdfdd55 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c
@@ -34,7 +34,7 @@
*/
/*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
*/
#include <sys/param.h>
@@ -8061,8 +8061,9 @@ link_call:
* vnode if it already existed.
*/
if (error == 0) {
- vnode_t *tvp;
+ vnode_t *tvp, *tovp;
rnode4_t *trp;
+
/*
* Notify the vnode. Each links is represented by
* a different vnode, in nfsv4.
@@ -8075,23 +8076,20 @@ link_call:
vnevent_rename_dest(tvp, ndvp, nnm, ct);
}
- /*
- * if the source and destination directory are not the
- * same notify the destination directory.
- */
- if (VTOR4(odvp) != VTOR4(ndvp)) {
- trp = VTOR4(ndvp);
- tvp = ndvp;
- if (IS_SHADOW(ndvp, trp))
- tvp = RTOV4(trp);
- vnevent_rename_dest_dir(tvp, ct);
- }
-
trp = VTOR4(ovp);
- tvp = ovp;
+ tovp = ovp;
if (IS_SHADOW(ovp, trp))
+ tovp = RTOV4(trp);
+
+ vnevent_rename_src(tovp, odvp, onm, ct);
+
+ trp = VTOR4(ndvp);
+ tvp = ndvp;
+
+ if (IS_SHADOW(ndvp, trp))
tvp = RTOV4(trp);
- vnevent_rename_src(tvp, odvp, onm, ct);
+
+ vnevent_rename_dest_dir(tvp, tovp, nnm, ct);
}
if (nvp) {
diff --git a/usr/src/uts/common/fs/nfs/nfs_auth.c b/usr/src/uts/common/fs/nfs/nfs_auth.c
index 3410340581..a8bcfdf438 100644
--- a/usr/src/uts/common/fs/nfs/nfs_auth.c
+++ b/usr/src/uts/common/fs/nfs/nfs_auth.c
@@ -22,6 +22,7 @@
/*
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015 Joyent, Inc. All rights reserved.
*/
#include <sys/param.h>
@@ -559,11 +560,16 @@ retry:
*access = res.ares.auth_perm;
*srv_uid = res.ares.auth_srv_uid;
*srv_gid = res.ares.auth_srv_gid;
- *srv_gids_cnt = res.ares.auth_srv_gids.len;
- *srv_gids = kmem_alloc(*srv_gids_cnt * sizeof (gid_t),
- KM_SLEEP);
- bcopy(res.ares.auth_srv_gids.val, *srv_gids,
- *srv_gids_cnt * sizeof (gid_t));
+
+ if ((*srv_gids_cnt = res.ares.auth_srv_gids.len) != 0) {
+ *srv_gids = kmem_alloc(*srv_gids_cnt *
+ sizeof (gid_t), KM_SLEEP);
+ bcopy(res.ares.auth_srv_gids.val, *srv_gids,
+ *srv_gids_cnt * sizeof (gid_t));
+ } else {
+ *srv_gids = NULL;
+ }
+
break;
case NFSAUTH_DR_EFAIL:
@@ -1114,9 +1120,13 @@ wait:
if (gid != NULL)
*gid = p->auth_srv_gid;
if (ngids != NULL && gids != NULL) {
- *ngids = p->auth_srv_ngids;
- *gids = kmem_alloc(*ngids * sizeof (gid_t), KM_SLEEP);
- bcopy(p->auth_srv_gids, *gids, *ngids * sizeof (gid_t));
+ if ((*ngids = p->auth_srv_ngids) != 0) {
+ size_t sz = *ngids * sizeof (gid_t);
+ *gids = kmem_alloc(sz, KM_SLEEP);
+ bcopy(p->auth_srv_gids, *gids, sz);
+ } else {
+ *gids = NULL;
+ }
}
access = p->auth_access;
diff --git a/usr/src/uts/common/fs/nfs/nfs_server.c b/usr/src/uts/common/fs/nfs/nfs_server.c
index 7e94c62734..dcc64def59 100644
--- a/usr/src/uts/common/fs/nfs/nfs_server.c
+++ b/usr/src/uts/common/fs/nfs/nfs_server.c
@@ -23,6 +23,7 @@
* Copyright (c) 2011 Bayard G. Bell. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc.
*/
/*
@@ -2570,6 +2571,9 @@ nfs_srvinit(void)
{
int error;
+ if (getzoneid() != GLOBAL_ZONEID)
+ return (EACCES);
+
error = nfs_exportinit();
if (error != 0)
return (error);
@@ -3208,7 +3212,7 @@ nfs_getflabel(vnode_t *vp, struct exportinfo *exi)
char *path;
mutex_enter(&vp->v_lock);
- if (vp->v_path != NULL) {
+ if (vp->v_path != vn_vpath_empty) {
zone = zone_find_by_any_path(vp->v_path, B_FALSE);
mutex_exit(&vp->v_lock);
} else {
diff --git a/usr/src/uts/common/fs/nfs/nfs_vfsops.c b/usr/src/uts/common/fs/nfs/nfs_vfsops.c
index 57b21778b4..ffd5380a86 100644
--- a/usr/src/uts/common/fs/nfs/nfs_vfsops.c
+++ b/usr/src/uts/common/fs/nfs/nfs_vfsops.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2013, Joyent, Inc. All rights reserved.
*
* Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
* All rights reserved.
diff --git a/usr/src/uts/common/fs/nfs/nfs_vnops.c b/usr/src/uts/common/fs/nfs/nfs_vnops.c
index 1a1082bcb8..8c321c8aa3 100644
--- a/usr/src/uts/common/fs/nfs/nfs_vnops.c
+++ b/usr/src/uts/common/fs/nfs/nfs_vnops.c
@@ -26,7 +26,7 @@
*/
/*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
@@ -2688,11 +2688,9 @@ nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
if (nvp)
vnevent_rename_dest(nvp, ndvp, nnm, ct);
- if (odvp != ndvp)
- vnevent_rename_dest_dir(ndvp, ct);
-
ASSERT(ovp != NULL);
vnevent_rename_src(ovp, odvp, onm, ct);
+ vnevent_rename_dest_dir(ndvp, ovp, nnm, ct);
}
if (nvp) {
diff --git a/usr/src/uts/common/fs/pcfs/pc_dir.c b/usr/src/uts/common/fs/pcfs/pc_dir.c
index a9ee604b7c..21a0b1a4bd 100644
--- a/usr/src/uts/common/fs/pcfs/pc_dir.c
+++ b/usr/src/uts/common/fs/pcfs/pc_dir.c
@@ -24,6 +24,10 @@
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
+
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/systm.h>
@@ -822,10 +826,10 @@ top:
return (error);
}
}
-out:
- vnevent_rename_src(PCTOV(pcp), PCTOV(dp), snm, ctp);
- if (dp != tdp) {
- vnevent_rename_dest_dir(PCTOV(tdp), ctp);
+
+ if (error == 0) {
+ vnevent_rename_src(PCTOV(pcp), PCTOV(dp), snm, ctp);
+ vnevent_rename_dest_dir(PCTOV(tdp), PCTOV(pcp), tnm, ctp);
}
VN_RELE(PCTOV(pcp));
diff --git a/usr/src/uts/common/fs/portfs/port.c b/usr/src/uts/common/fs/portfs/port.c
index 14be8cbbae..11b7386269 100644
--- a/usr/src/uts/common/fs/portfs/port.c
+++ b/usr/src/uts/common/fs/portfs/port.c
@@ -24,7 +24,9 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
+/*
+ * Copyright (c) 2015 Joyent, Inc. All rights reserved.
+ */
#include <sys/types.h>
#include <sys/systm.h>
@@ -1381,12 +1383,18 @@ portnowait:
if (model == DATAMODEL_NATIVE) {
eventsz = sizeof (port_event_t);
- kevp = kmem_alloc(eventsz * nmax, KM_NOSLEEP);
- if (kevp == NULL) {
- if (nmax > pp->port_max_list)
- nmax = pp->port_max_list;
- kevp = kmem_alloc(eventsz * nmax, KM_SLEEP);
+
+ if (nmax == 0) {
+ kevp = NULL;
+ } else {
+ kevp = kmem_alloc(eventsz * nmax, KM_NOSLEEP);
+ if (kevp == NULL) {
+ if (nmax > pp->port_max_list)
+ nmax = pp->port_max_list;
+ kevp = kmem_alloc(eventsz * nmax, KM_SLEEP);
+ }
}
+
results = kevp;
lev = NULL; /* start with first event in the queue */
for (nevents = 0; nevents < nmax; ) {
@@ -1423,12 +1431,18 @@ portnowait:
port_event32_t *kevp32;
eventsz = sizeof (port_event32_t);
- kevp32 = kmem_alloc(eventsz * nmax, KM_NOSLEEP);
- if (kevp32 == NULL) {
- if (nmax > pp->port_max_list)
- nmax = pp->port_max_list;
- kevp32 = kmem_alloc(eventsz * nmax, KM_SLEEP);
+
+ if (nmax == 0) {
+ kevp32 = NULL;
+ } else {
+ kevp32 = kmem_alloc(eventsz * nmax, KM_NOSLEEP);
+ if (kevp32 == NULL) {
+ if (nmax > pp->port_max_list)
+ nmax = pp->port_max_list;
+ kevp32 = kmem_alloc(eventsz * nmax, KM_SLEEP);
+ }
}
+
results = kevp32;
lev = NULL; /* start with first event in the queue */
for (nevents = 0; nevents < nmax; ) {
diff --git a/usr/src/uts/common/fs/portfs/port_vnops.c b/usr/src/uts/common/fs/portfs/port_vnops.c
index b2f5088e06..ab95c0a1f8 100644
--- a/usr/src/uts/common/fs/portfs/port_vnops.c
+++ b/usr/src/uts/common/fs/portfs/port_vnops.c
@@ -24,6 +24,10 @@
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
+
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/vfs_opreg.h>
@@ -294,14 +298,10 @@ port_poll(vnode_t *vp, short events, int anyyet, short *reventsp,
levents |= POLLOUT;
levents &= events;
*reventsp = levents;
- if (levents == 0) {
- if (!anyyet) {
- *phpp = &pp->port_pollhd;
- portq->portq_flags |=
- events & POLLIN ? PORTQ_POLLIN : 0;
- portq->portq_flags |=
- events & POLLOUT ? PORTQ_POLLOUT : 0;
- }
+ if ((levents == 0 && !anyyet) || (events & POLLET)) {
+ *phpp = &pp->port_pollhd;
+ portq->portq_flags |= events & POLLIN ? PORTQ_POLLIN : 0;
+ portq->portq_flags |= events & POLLOUT ? PORTQ_POLLOUT : 0;
}
mutex_exit(&portq->portq_mutex);
return (0);
diff --git a/usr/src/uts/common/fs/proc/prargv.c b/usr/src/uts/common/fs/proc/prargv.c
new file mode 100644
index 0000000000..a4ad82d661
--- /dev/null
+++ b/usr/src/uts/common/fs/proc/prargv.c
@@ -0,0 +1,441 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015, Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/sunddi.h>
+#include <sys/proc.h>
+#include <sys/procfs.h>
+#include <sys/sysmacros.h>
+#include <vm/as.h>
+
+/*
+ * Safely read a contiguous region of memory from 'ustart' in the address
+ * space of a particular process into the supplied kernel buffer (*buf, sz).
+ * Partially mapped regions will result in a partial read terminating at the
+ * first hole in the address space. The number of bytes actually read is
+ * returned to the caller via 'rdsz'.
+ */
+static int
+prreadbuf(proc_t *p, uintptr_t ustart, uint8_t *buf, size_t sz, size_t *rdsz)
+{
+ int error = 0;
+ size_t rem = sz;
+ off_t pos = 0;
+
+ if (rdsz != NULL)
+ *rdsz = 0;
+
+ while (rem != 0) {
+ uintptr_t addr = ustart + pos;
+ size_t len = MIN(rem, PAGESIZE - (addr & PAGEOFFSET));
+
+ if ((error = uread(p, buf + pos, len, addr)) != 0) {
+ if (error == ENXIO) {
+ /*
+ * ENXIO from uread() indicates that the page
+ * does not exist. This will simply be a
+ * partial read.
+ */
+ error = 0;
+ }
+ break;
+ }
+
+ rem -= len;
+ pos += len;
+ }
+
+ if (rdsz != NULL)
+ *rdsz = pos;
+
+ return (error);
+}
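
prreadbuf() never lets a single uread() cross a page boundary, so a hole in the mapping only truncates the read at that page. The chunking arithmetic on its own, as a user-level sketch (PAGESIZE hard-coded here for illustration):

#include <stdio.h>

#define	PAGESIZE	4096		/* illustration only */
#define	PAGEOFFSET	(PAGESIZE - 1)
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	unsigned long ustart = 0x10ff0;	/* deliberately unaligned start */
	unsigned long rem = 10000, pos = 0;

	while (rem != 0) {
		unsigned long addr = ustart + pos;
		/* never let one chunk cross a page boundary */
		unsigned long len = MIN(rem, PAGESIZE - (addr & PAGEOFFSET));

		printf("read %4lu bytes at 0x%lx\n", len, addr);
		rem -= len;
		pos += len;
	}
	return (0);
}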
+
+/*
+ * Attempt to read the argument vector (argv) from this process. The caller
+ * must hold the p_lock mutex, and have marked the process P_PR_LOCK (e.g. via
+ * prlock or lx_prlock).
+ *
+ * The caller must provide a buffer (buf, buflen). We will concatenate each
+ * argument string (including the NUL terminator) into this buffer. The number
+ * of characters written to this buffer (including the final NUL terminator)
+ * will be stored in 'slen'.
+ */
+int
+prreadargv(proc_t *p, char *buf, size_t bufsz, size_t *slen)
+{
+ int error;
+ user_t *up;
+ struct as *as;
+ size_t pos = 0;
+ caddr_t *argv = NULL;
+ size_t argvsz = 0;
+ int i;
+
+ VERIFY(MUTEX_HELD(&p->p_lock));
+ VERIFY(p->p_proc_flag & P_PR_LOCK);
+
+ up = PTOU(p);
+ as = p->p_as;
+
+ if ((p->p_flag & SSYS) || as == &kas || up->u_argv == NULL) {
+ /*
+ * Return the regular psargs string to the caller.
+ */
+ bcopy(up->u_psargs, buf, MIN(bufsz, sizeof (up->u_psargs)));
+ buf[bufsz - 1] = '\0';
+ *slen = strlen(buf) + 1;
+
+ return (0);
+ }
+
+ /*
+ * Allocate space to store argv array.
+ */
+ argvsz = up->u_argc * (p->p_model == DATAMODEL_ILP32 ?
+ sizeof (caddr32_t) : sizeof (caddr_t));
+ argv = kmem_alloc(argvsz, KM_SLEEP);
+
+ /*
+ * Extract the argv array from the target process. Drop p_lock
+ * while we do I/O to avoid deadlock with the clock thread.
+ */
+ mutex_exit(&p->p_lock);
+ if ((error = prreadbuf(p, up->u_argv, (uint8_t *)argv, argvsz,
+ NULL)) != 0) {
+ kmem_free(argv, argvsz);
+ mutex_enter(&p->p_lock);
+ VERIFY(p->p_proc_flag & P_PR_LOCK);
+ return (-1);
+ }
+
+ /*
+ * Read each argument string from the pointers in the argv array.
+ */
+ pos = 0;
+ for (i = 0; i < up->u_argc; i++) {
+ size_t rdsz, trysz;
+ uintptr_t arg;
+ off_t j;
+ boolean_t found_nul;
+ boolean_t do_retry = B_TRUE;
+
+#ifdef _SYSCALL32_IMPL
+ if (p->p_model == DATAMODEL_ILP32) {
+ arg = (uintptr_t)((caddr32_t *)argv)[i];
+ } else {
+ arg = (uintptr_t)argv[i];
+ }
+#else
+ arg = (uintptr_t)argv[i];
+#endif
+
+ /*
+ * Stop trying to read arguments if we reach a NULL
+ * pointer in the vector.
+ */
+ if (arg == NULL)
+ break;
+
+ /*
+ * Stop reading if we have read the maximum length
+ * we can return to the user.
+ */
+ if (pos >= bufsz)
+ break;
+
+ /*
+ * Initially we try a short read, on the assumption that
+ * most individual argument strings are less than 80
+ * characters long.
+ */
+ if ((trysz = MIN(80, bufsz - pos - 1)) < 80) {
+ /*
+ * We don't have room in the target buffer for even
+ * an entire short read, so there is no need to retry
+ * with a longer read.
+ */
+ do_retry = B_FALSE;
+ }
+
+retry:
+ /*
+ * Read string data for this argument. Leave room
+ * in the buffer for a final NUL terminator.
+ */
+ if ((error = prreadbuf(p, arg, (uint8_t *)&buf[pos], trysz,
+ &rdsz)) != 0) {
+ /*
+ * There was a problem reading this string
+ * from the process. Give up.
+ */
+ break;
+ }
+
+ /*
+ * Find the NUL terminator.
+ */
+ found_nul = B_FALSE;
+ for (j = 0; j < rdsz; j++) {
+ if (buf[pos + j] == '\0') {
+ found_nul = B_TRUE;
+ break;
+ }
+ }
+
+ if (!found_nul && do_retry) {
+ /*
+ * We did not find a NUL terminator, but this
+ * was a first pass short read. Try once more
+ * with feeling.
+ */
+ trysz = bufsz - pos - 1;
+ do_retry = B_FALSE;
+ goto retry;
+ }
+
+ /*
+ * Commit the string we read to the buffer.
+ */
+ pos += j + 1;
+ if (!found_nul && pos < bufsz) {
+ /*
+ * A NUL terminator was not found; add one.
+ */
+ buf[pos++] = '\0';
+ }
+ }
+
+ /*
+ * Ensure the entire string is NUL-terminated.
+ */
+ buf[bufsz - 1] = '\0';
+
+ mutex_enter(&p->p_lock);
+ VERIFY(p->p_proc_flag & P_PR_LOCK);
+ kmem_free(argv, argvsz);
+
+ /*
+ * If the operation was a success, return the copied string length
+ * to the caller.
+ */
+ *slen = (error == 0) ? pos : 0;
+
+ return (error);
+}
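
The net effect is a buffer holding the argument strings concatenated with their NUL terminators. A user-level sketch of consuming such a file through a /proc/<pid>/argv path (the path and buffer size are assumptions about the resulting system):

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

int
main(int argc, char *argv[])
{
	char path[64], buf[4096];
	ssize_t n, i;
	int fd;

	if (argc != 2) {
		(void) fprintf(stderr, "usage: %s pid\n", argv[0]);
		return (1);
	}
	(void) snprintf(path, sizeof (path), "/proc/%s/argv", argv[1]);

	if ((fd = open(path, O_RDONLY)) < 0) {
		perror(path);
		return (1);
	}
	if ((n = read(fd, buf, sizeof (buf))) <= 0) {
		perror("read");
		(void) close(fd);
		return (1);
	}
	(void) close(fd);
	buf[n - 1] = '\0';	/* be defensive about termination */

	/* the file is a run of NUL-terminated argument strings */
	for (i = 0; i < n; i += strlen(&buf[i]) + 1)
		(void) printf("argv[]: %s\n", &buf[i]);
	return (0);
}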
+
+/*
+ * Similar to prreadargv except reads the env vector. This is slightly more
+ * complex because there is no count for the env vector that corresponds to
+ * u_argc.
+ */
+int
+prreadenvv(proc_t *p, char *buf, size_t bufsz, size_t *slen)
+{
+ int error;
+ user_t *up;
+ struct as *as;
+ size_t pos = 0;
+ caddr_t *envp = NULL;
+ uintptr_t tmpp = NULL;
+ size_t envpsz = 0, rdsz = 0;
+ int i;
+ int cnt, bound;
+
+ VERIFY(MUTEX_HELD(&p->p_lock));
+ VERIFY(p->p_proc_flag & P_PR_LOCK);
+
+ up = PTOU(p);
+ as = p->p_as;
+
+ if ((p->p_flag & SSYS) || as == &kas || up->u_envp == NULL) {
+ /*
+ * Return empty string.
+ */
+ buf[0] = '\0';
+ *slen = 1;
+
+ return (0);
+ }
+
+ /*
+ * Drop p_lock while we do I/O to avoid deadlock with the clock thread.
+ */
+ mutex_exit(&p->p_lock);
+
+ /*
+ * We first have to count how many env entries we have. This is
+ * somewhat painful. We extract the env entries from the target process
+ * one entry at a time. Stop trying to read env entries if we reach a
+ * NULL pointer in the vector or hit our upper bound (which we take to
+ * be bufsz / 4) to ensure we don't run off the end.
+ */
+ rdsz = (p->p_model == DATAMODEL_ILP32 ?
+ sizeof (caddr32_t) : sizeof (caddr_t));
+ bound = (int)(bufsz / 4);
+ for (cnt = 0, tmpp = up->u_envp; cnt < bound; cnt++, tmpp += rdsz) {
+ caddr_t tmp = NULL;
+
+ if ((error = prreadbuf(p, tmpp, (uint8_t *)&tmp, rdsz,
+ NULL)) != 0) {
+ mutex_enter(&p->p_lock);
+ VERIFY(p->p_proc_flag & P_PR_LOCK);
+ return (-1);
+ }
+
+ if (tmp == NULL)
+ break;
+ }
+ if (cnt == 0) {
+ /* Return empty string. */
+ buf[0] = '\0';
+ *slen = 1;
+ mutex_enter(&p->p_lock);
+ VERIFY(p->p_proc_flag & P_PR_LOCK);
+ return (0);
+ }
+
+ /*
+ * Allocate space to store env array.
+ */
+ envpsz = cnt * (p->p_model == DATAMODEL_ILP32 ?
+ sizeof (caddr32_t) : sizeof (caddr_t));
+ envp = kmem_alloc(envpsz, KM_SLEEP);
+
+ /*
+ * Extract the env array from the target process.
+ */
+ if ((error = prreadbuf(p, up->u_envp, (uint8_t *)envp, envpsz,
+ NULL)) != 0) {
+ kmem_free(envp, envpsz);
+ mutex_enter(&p->p_lock);
+ VERIFY(p->p_proc_flag & P_PR_LOCK);
+ return (-1);
+ }
+
+ /*
+ * Read each env string from the pointers in the env array.
+ */
+ pos = 0;
+ for (i = 0; i < cnt; i++) {
+ size_t rdsz, trysz;
+ uintptr_t ev;
+ off_t j;
+ boolean_t found_nul;
+ boolean_t do_retry = B_TRUE;
+
+#ifdef _SYSCALL32_IMPL
+ if (p->p_model == DATAMODEL_ILP32) {
+ ev = (uintptr_t)((caddr32_t *)envp)[i];
+ } else {
+ ev = (uintptr_t)envp[i];
+ }
+#else
+ ev = (uintptr_t)envp[i];
+#endif
+
+ /*
+ * Stop trying to read env entries if we reach a NULL
+ * pointer in the vector.
+ */
+ if (ev == NULL)
+ break;
+
+ /*
+ * Stop reading if we have read the maximum length
+ * we can return to the user.
+ */
+ if (pos >= bufsz)
+ break;
+
+ /*
+ * Initially we try a short read, on the assumption that
+ * most individual env strings are less than 80
+ * characters long.
+ */
+ if ((trysz = MIN(80, bufsz - pos - 1)) < 80) {
+ /*
+ * We don't have room in the target buffer for even
+ * an entire short read, so there is no need to retry
+ * with a longer read.
+ */
+ do_retry = B_FALSE;
+ }
+
+retry:
+ /*
+ * Read string data for this env var. Leave room
+ * in the buffer for a final NUL terminator.
+ */
+ if ((error = prreadbuf(p, ev, (uint8_t *)&buf[pos], trysz,
+ &rdsz)) != 0) {
+ /*
+ * There was a problem reading this string
+ * from the process. Give up.
+ */
+ break;
+ }
+
+ /*
+ * Find the NUL terminator.
+ */
+ found_nul = B_FALSE;
+ for (j = 0; j < rdsz; j++) {
+ if (buf[pos + j] == '\0') {
+ found_nul = B_TRUE;
+ break;
+ }
+ }
+
+ if (!found_nul && do_retry) {
+ /*
+ * We did not find a NUL terminator, but this
+ * was a first pass short read. Try once more
+ * with feeling.
+ */
+ trysz = bufsz - pos - 1;
+ do_retry = B_FALSE;
+ goto retry;
+ }
+
+ /*
+ * Commit the string we read to the buffer.
+ */
+ pos += j + 1;
+ if (!found_nul && pos < bufsz) {
+ /*
+ * A NUL terminator was not found; add one.
+ */
+ buf[pos++] = '\0';
+ }
+ }
+
+ /*
+ * Ensure the entire string is NUL-terminated.
+ */
+ buf[bufsz - 1] = '\0';
+
+ mutex_enter(&p->p_lock);
+ VERIFY(p->p_proc_flag & P_PR_LOCK);
+ kmem_free(envp, envpsz);
+
+ /*
+ * If the operation was a success, return the copied string length
+ * to the caller.
+ */
+ *slen = (error == 0) ? pos : 0;
+
+ return (error);
+}
diff --git a/usr/src/uts/common/fs/proc/prcontrol.c b/usr/src/uts/common/fs/proc/prcontrol.c
index a73a64a4a4..7e99d23b97 100644
--- a/usr/src/uts/common/fs/proc/prcontrol.c
+++ b/usr/src/uts/common/fs/proc/prcontrol.c
@@ -25,7 +25,7 @@
*/
/*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
#include <sys/types.h>
@@ -1481,7 +1481,7 @@ pr_setsig(prnode_t *pnp, siginfo_t *sip)
} else if (t->t_state == TS_STOPPED && sig == SIGKILL) {
/* If SIGKILL, set stopped lwp running */
p->p_stopsig = 0;
- t->t_schedflag |= TS_XSTART | TS_PSTART;
+ t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART;
t->t_dtrace_stop = 0;
setrun_locked(t);
}
@@ -2276,9 +2276,17 @@ pr_szoneid(proc_t *p, zoneid_t zoneid, cred_t *cr)
return (EPERM);
if (zoneid != GLOBAL_ZONEID && zoneid != p->p_zone->zone_id)
return (EINVAL);
- if ((zptr = zone_find_by_id(zoneid)) == NULL)
- return (EINVAL);
+ /*
+ * We cannot hold p_lock when we call zone_find_by_id since that can
+ * lead to a deadlock. zone_find_by_id() takes zonehash_lock.
+ * zone_enter() can hold the zonehash_lock and needs p_lock when it
+ * calls task_join.
+ */
mutex_exit(&p->p_lock);
+ if ((zptr = zone_find_by_id(zoneid)) == NULL) {
+ mutex_enter(&p->p_lock);
+ return (EINVAL);
+ }
mutex_enter(&p->p_crlock);
oldcred = p->p_cred;
crhold(oldcred);
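
The deadlock note in pr_szoneid() above boils down to a lock-ordering rule: never call into code that takes zonehash_lock while holding p_lock. A generic user-level sketch of that drop-and-retake pattern, using pthreads stand-ins for the two locks (all names hypothetical):

#include <pthread.h>

static pthread_mutex_t p_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;

/* stand-in for zone_find_by_id(): takes hash_lock internally */
static void *
find_by_id(int id)
{
	void *res;

	pthread_mutex_lock(&hash_lock);
	res = (id == 42) ? (void *)&hash_lock : NULL;	/* dummy lookup */
	pthread_mutex_unlock(&hash_lock);
	return (res);
}

/*
 * Illustrative only: another path takes hash_lock and then p_lock, so we
 * must not call find_by_id() while holding p_lock; drop it around the
 * call and re-take it afterwards, as pr_szoneid() now does.
 */
static int
set_id(int id)
{
	void *zptr;

	pthread_mutex_lock(&p_lock);
	/* ... checks done under p_lock ... */
	pthread_mutex_unlock(&p_lock);		/* drop before crossing order */
	zptr = find_by_id(id);
	pthread_mutex_lock(&p_lock);		/* re-take once lookup is done */

	if (zptr == NULL) {
		pthread_mutex_unlock(&p_lock);
		return (-1);
	}
	pthread_mutex_unlock(&p_lock);
	return (0);
}

int
main(void)
{
	return (set_id(42) == 0 ? 0 : 1);
}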
diff --git a/usr/src/uts/common/fs/proc/prdata.h b/usr/src/uts/common/fs/proc/prdata.h
index 8ea516bf82..72f26b3c05 100644
--- a/usr/src/uts/common/fs/proc/prdata.h
+++ b/usr/src/uts/common/fs/proc/prdata.h
@@ -27,7 +27,7 @@
/* All Rights Reserved */
/*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
#ifndef _SYS_PROC_PRDATA_H
@@ -123,6 +123,7 @@ typedef enum prnodetype {
#if defined(__i386) || defined(__amd64)
PR_LDT, /* /proc/<pid>/ldt */
#endif
+ PR_ARGV, /* /proc/<pid>/argv */
PR_USAGE, /* /proc/<pid>/usage */
PR_LUSAGE, /* /proc/<pid>/lusage */
PR_PAGEDATA, /* /proc/<pid>/pagedata */
@@ -347,6 +348,8 @@ extern int pr_unset(proc_t *, long);
extern void pr_sethold(prnode_t *, sigset_t *);
extern void pr_setfault(proc_t *, fltset_t *);
extern int prusrio(proc_t *, enum uio_rw, struct uio *, int);
+extern int prreadargv(proc_t *, char *, size_t, size_t *);
+extern int prreadenvv(proc_t *, char *, size_t, size_t *);
extern int prwritectl(vnode_t *, struct uio *, cred_t *);
extern int prlock(prnode_t *, int);
extern void prunmark(proc_t *);
diff --git a/usr/src/uts/common/fs/proc/prsubr.c b/usr/src/uts/common/fs/proc/prsubr.c
index 7801fd0ac8..284bf8cb88 100644
--- a/usr/src/uts/common/fs/proc/prsubr.c
+++ b/usr/src/uts/common/fs/proc/prsubr.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -201,6 +201,7 @@ prchoose(proc_t *p)
case PR_SYSEXIT:
case PR_SIGNALLED:
case PR_FAULTED:
+ case PR_BRAND:
/*
* Make an lwp calling exit() be the
* last lwp seen in the process.
diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c
index 411c9b8b0b..276f54ae3a 100644
--- a/usr/src/uts/common/fs/proc/prvnops.c
+++ b/usr/src/uts/common/fs/proc/prvnops.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -96,6 +96,11 @@ struct prdirect {
#define PRSDSIZE (sizeof (struct prdirect))
/*
+ * Maximum length of the /proc/$$/argv file:
+ */
+int prmaxargvlen = 4096;
+
+/*
* Directory characteristics.
*/
typedef struct prdirent {
@@ -166,6 +171,8 @@ static prdirent_t piddir[] = {
{ PR_LDT, 27 * sizeof (prdirent_t), sizeof (prdirent_t),
"ldt" },
#endif
+ { PR_ARGV, 28 * sizeof (prdirent_t), sizeof (prdirent_t),
+ "argv" },
};
#define NPIDDIRFILES (sizeof (piddir) / sizeof (piddir[0]) - 2)
@@ -582,6 +589,7 @@ static int pr_read_inval(), pr_read_as(), pr_read_status(),
#if defined(__x86)
pr_read_ldt(),
#endif
+ pr_read_argv(),
pr_read_usage(), pr_read_lusage(), pr_read_pagedata(),
pr_read_watch(), pr_read_lwpstatus(), pr_read_lwpsinfo(),
pr_read_lwpusage(), pr_read_xregs(), pr_read_priv(),
@@ -610,6 +618,7 @@ static int (*pr_read_function[PR_NFILES])() = {
#if defined(__x86)
pr_read_ldt, /* /proc/<pid>/ldt */
#endif
+ pr_read_argv, /* /proc/<pid>/argv */
pr_read_usage, /* /proc/<pid>/usage */
pr_read_lusage, /* /proc/<pid>/lusage */
pr_read_pagedata, /* /proc/<pid>/pagedata */
@@ -672,6 +681,41 @@ pr_uioread(void *base, long count, uio_t *uiop)
}
static int
+pr_read_argv(prnode_t *pnp, uio_t *uiop)
+{
+ char *args;
+ int error;
+ size_t asz = prmaxargvlen, sz;
+
+ /*
+ * Allocate a scratch buffer for collection of the process arguments.
+ */
+ args = kmem_alloc(asz, KM_SLEEP);
+
+ ASSERT(pnp->pr_type == PR_ARGV);
+
+ if ((error = prlock(pnp, ZNO)) != 0) {
+ kmem_free(args, asz);
+ return (error);
+ }
+
+ if ((error = prreadargv(pnp->pr_common->prc_proc, args, asz,
+ &sz)) != 0) {
+ prunlock(pnp);
+ kmem_free(args, asz);
+ return (error);
+ }
+
+ prunlock(pnp);
+
+ error = pr_uioread(args, sz, uiop);
+
+ kmem_free(args, asz);
+
+ return (error);
+}
+
+static int
pr_read_as(prnode_t *pnp, uio_t *uiop)
{
int error;
@@ -1767,6 +1811,7 @@ static int (*pr_read_function_32[PR_NFILES])() = {
#if defined(__x86)
pr_read_ldt, /* /proc/<pid>/ldt */
#endif
+ pr_read_argv, /* /proc/<pid>/argv */
pr_read_usage_32, /* /proc/<pid>/usage */
pr_read_lusage_32, /* /proc/<pid>/lusage */
pr_read_pagedata_32, /* /proc/<pid>/pagedata */
@@ -2686,6 +2731,103 @@ prread(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct)
#endif
}
+/*
+ * We make pr_write_psinfo_fname() somewhat simpler by asserting at compile
+ * time that PRFNSZ has the same definition as MAXCOMLEN.
+ */
+#if PRFNSZ != MAXCOMLEN
+#error PRFNSZ/MAXCOMLEN mismatch
+#endif
+
+static int
+pr_write_psinfo_fname(prnode_t *pnp, uio_t *uiop)
+{
+ char fname[PRFNSZ];
+ int offset = offsetof(psinfo_t, pr_fname), error;
+
+#ifdef _SYSCALL32_IMPL
+ if (curproc->p_model != DATAMODEL_LP64)
+ offset = offsetof(psinfo32_t, pr_fname);
+#endif
+
+ /*
+ * If this isn't a write to pr_fname (or if the size doesn't match
+ * PRFNSZ) return.
+ */
+ if (uiop->uio_offset != offset || uiop->uio_resid != PRFNSZ)
+ return (0);
+
+ if ((error = uiomove(fname, PRFNSZ, UIO_WRITE, uiop)) != 0)
+ return (error);
+
+ fname[PRFNSZ - 1] = '\0';
+
+ if ((error = prlock(pnp, ZNO)) != 0)
+ return (error);
+
+ bcopy(fname, pnp->pr_common->prc_proc->p_user.u_comm, PRFNSZ);
+
+ prunlock(pnp);
+
+ return (0);
+}
+
+/*
+ * We make pr_write_psinfo_psargs() somewhat simpler by asserting at compile
+ * time that PRARGSZ has the same definition as PSARGSZ.
+ */
+#if PRARGSZ != PSARGSZ
+#error PRARGSZ/PSARGSZ mismatch
+#endif
+
+static int
+pr_write_psinfo_psargs(prnode_t *pnp, uio_t *uiop)
+{
+ char psargs[PRARGSZ];
+ int offset = offsetof(psinfo_t, pr_psargs), error;
+
+#ifdef _SYSCALL32_IMPL
+ if (curproc->p_model != DATAMODEL_LP64)
+ offset = offsetof(psinfo32_t, pr_psargs);
+#endif
+
+ /*
+ * If this isn't a write to pr_psargs (or if the size doesn't match
+ * PRARGSZ) return.
+ */
+ if (uiop->uio_offset != offset || uiop->uio_resid != PRARGSZ)
+ return (0);
+
+ if ((error = uiomove(psargs, PRARGSZ, UIO_WRITE, uiop)) != 0)
+ return (error);
+
+ psargs[PRARGSZ - 1] = '\0';
+
+ if ((error = prlock(pnp, ZNO)) != 0)
+ return (error);
+
+ bcopy(psargs, pnp->pr_common->prc_proc->p_user.u_psargs, PRARGSZ);
+
+ prunlock(pnp);
+
+ return (0);
+}
+
+int
+pr_write_psinfo(prnode_t *pnp, uio_t *uiop)
+{
+ int error;
+
+ if ((error = pr_write_psinfo_fname(pnp, uiop)) != 0)
+ return (error);
+
+ if ((error = pr_write_psinfo_psargs(pnp, uiop)) != 0)
+ return (error);
+
+ return (0);
+}
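
With psinfo now writable, a process can rewrite its own pr_fname (and, via the sibling routine, pr_psargs). A user-level sketch of the write side, assuming this change is in place; per the checks above, the write must be exactly PRFNSZ bytes at the pr_fname offset or it is ignored:

#include <stdio.h>
#include <stddef.h>
#include <fcntl.h>
#include <unistd.h>
#include <procfs.h>

int
main(void)
{
	char fname[PRFNSZ] = "renamed";	/* zero-padded to PRFNSZ */
	int fd;

	if ((fd = open("/proc/self/psinfo", O_WRONLY)) < 0) {
		perror("open");
		return (1);
	}

	/* the kernel only accepts a PRFNSZ-byte write at pr_fname */
	if (pwrite(fd, fname, sizeof (fname),
	    offsetof(psinfo_t, pr_fname)) != sizeof (fname)) {
		perror("pwrite");
		(void) close(fd);
		return (1);
	}
	(void) close(fd);

	(void) printf("renamed; check with: ps -o fname -p %ld\n",
	    (long)getpid());
	return (0);
}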
+
+
/* ARGSUSED */
static int
prwrite(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct)
@@ -2764,6 +2906,9 @@ prwrite(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct)
uiop->uio_resid = resid;
return (error);
+ case PR_PSINFO:
+ return (pr_write_psinfo(pnp, uiop));
+
default:
return ((vp->v_type == VDIR)? EISDIR : EBADF);
}
@@ -3047,6 +3192,13 @@ prgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
case PR_AUXV:
vap->va_size = __KERN_NAUXV_IMPL * PR_OBJSIZE(auxv32_t, auxv_t);
break;
+ case PR_ARGV:
+ if ((p->p_flag & SSYS) || p->p_as == &kas) {
+ vap->va_size = PSARGSZ;
+ } else {
+ vap->va_size = prmaxargvlen;
+ }
+ break;
#if defined(__x86)
case PR_LDT:
mutex_exit(&p->p_lock);
@@ -3222,6 +3374,7 @@ praccess(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
case PR_USAGE:
case PR_LUSAGE:
case PR_LWPUSAGE:
+ case PR_ARGV:
p = pr_p_lock(pnp);
mutex_exit(&pr_pidlock);
if (p == NULL)
@@ -3307,6 +3460,7 @@ static vnode_t *(*pr_lookup_function[PR_NFILES])() = {
#if defined(__x86)
pr_lookup_notdir, /* /proc/<pid>/ldt */
#endif
+ pr_lookup_notdir, /* /proc/<pid>/argv */
pr_lookup_notdir, /* /proc/<pid>/usage */
pr_lookup_notdir, /* /proc/<pid>/lusage */
pr_lookup_notdir, /* /proc/<pid>/pagedata */
@@ -4546,11 +4700,15 @@ prgetnode(vnode_t *dp, prnodetype_t type)
break;
case PR_PSINFO:
+ pnp->pr_mode = 0644; /* readable by all + owner can write */
+ break;
+
case PR_LPSINFO:
case PR_LWPSINFO:
case PR_USAGE:
case PR_LUSAGE:
case PR_LWPUSAGE:
+ case PR_ARGV:
pnp->pr_mode = 0444; /* read-only by all */
break;
@@ -4656,6 +4814,7 @@ static int (*pr_readdir_function[PR_NFILES])() = {
#if defined(__x86)
pr_readdir_notdir, /* /proc/<pid>/ldt */
#endif
+ pr_readdir_notdir, /* /proc/<pid>/argv */
pr_readdir_notdir, /* /proc/<pid>/usage */
pr_readdir_notdir, /* /proc/<pid>/lusage */
pr_readdir_notdir, /* /proc/<pid>/pagedata */
@@ -4805,6 +4964,7 @@ pr_readdir_piddir(prnode_t *pnp, uio_t *uiop, int *eofp)
case PR_PROCDIR:
case PR_PSINFO:
case PR_USAGE:
+ case PR_ARGV:
break;
default:
continue;
@@ -6010,7 +6170,7 @@ prpoll(vnode_t *vp, short events, int anyyet, short *reventsp,
}
*reventsp = revents;
- if (!anyyet && revents == 0) {
+ if ((!anyyet && revents == 0) || (events & POLLET)) {
/*
* Arrange to wake up the polling lwp when
* the target process/lwp stops or terminates
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon.c b/usr/src/uts/common/fs/sockfs/sockcommon.c
index 703e26ea61..682f1d867b 100644
--- a/usr/src/uts/common/fs/sockfs/sockcommon.c
+++ b/usr/src/uts/common/fs/sockfs/sockcommon.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
#include <sys/types.h>
@@ -501,6 +502,9 @@ sonode_constructor(void *buf, void *cdrarg, int kmflags)
cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL);
cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL);
+ so->so_krecv_cb = NULL;
+ so->so_krecv_arg = NULL;
+
return (0);
}
@@ -654,6 +658,10 @@ sonode_fini(struct sonode *so)
if (so->so_filter_top != NULL)
sof_sonode_cleanup(so);
+ /* Clean up any remnants of krecv callbacks */
+ so->so_krecv_cb = NULL;
+ so->so_krecv_arg = NULL;
+
ASSERT(list_is_empty(&so->so_acceptq_list));
ASSERT(list_is_empty(&so->so_acceptq_defer));
ASSERT(!list_link_active(&so->so_acceptq_node));
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
index 0be628f329..40c368736d 100644
--- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
+++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc. All rights reserved.
*/
#include <sys/types.h>
@@ -128,7 +128,7 @@ so_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
{
int error;
- SO_BLOCK_FALLBACK(so, SOP_BIND(so, name, namelen, flags, cr));
+ SO_BLOCK_FALLBACK_SAFE(so, SOP_BIND(so, name, namelen, flags, cr));
ASSERT(flags == _SOBIND_XPG4_2 || flags == _SOBIND_SOCKBSD);
@@ -586,11 +586,6 @@ so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
SO_BLOCK_FALLBACK(so, SOP_SENDMBLK(so, msg, fflag, cr, mpp));
- if ((so->so_mode & SM_SENDFILESUPP) == 0) {
- SO_UNBLOCK_FALLBACK(so);
- return (EOPNOTSUPP);
- }
-
error = so_sendmblk_impl(so, msg, fflag, cr, mpp, so->so_filter_top,
B_FALSE);
@@ -653,7 +648,7 @@ so_getsockname(struct sonode *so, struct sockaddr *addr,
{
int error;
- SO_BLOCK_FALLBACK(so, SOP_GETSOCKNAME(so, addr, addrlen, cr));
+ SO_BLOCK_FALLBACK_SAFE(so, SOP_GETSOCKNAME(so, addr, addrlen, cr));
if (so->so_filter_active == 0 ||
(error = sof_filter_getsockname(so, addr, addrlen, cr)) < 0)
@@ -702,7 +697,7 @@ so_getsockopt(struct sonode *so, int level, int option_name,
if (level == SOL_FILTER)
return (sof_getsockopt(so, option_name, optval, optlenp, cr));
- SO_BLOCK_FALLBACK(so,
+ SO_BLOCK_FALLBACK_SAFE(so,
SOP_GETSOCKOPT(so, level, option_name, optval, optlenp, flags, cr));
if ((so->so_filter_active == 0 ||
@@ -791,7 +786,7 @@ so_setsockopt(struct sonode *so, int level, int option_name,
if (level == SOL_FILTER)
return (sof_setsockopt(so, option_name, optval, optlen, cr));
- SO_BLOCK_FALLBACK(so,
+ SO_BLOCK_FALLBACK_SAFE(so,
SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
/* X/Open requires this check */
@@ -953,6 +948,13 @@ so_poll(struct sonode *so, short events, int anyyet, short *reventsp,
if (!list_is_empty(&so->so_acceptq_list))
*reventsp |= (POLLIN|POLLRDNORM) & events;
+ /*
+ * If we're looking for POLLRDHUP, indicate it if we have sent the
+ * last rx signal for the socket.
+ */
+ if ((events & POLLRDHUP) && (state & SS_SENTLASTREADSIG))
+ *reventsp |= POLLRDHUP;
+
/* Data */
/* so_downcalls is null for sctp */
if (so->so_downcalls != NULL && so->so_downcalls->sd_poll != NULL) {
@@ -988,14 +990,20 @@ so_poll(struct sonode *so, short events, int anyyet, short *reventsp,
*reventsp |= POLLHUP;
}
- if (!*reventsp && !anyyet) {
+ if ((!*reventsp && !anyyet) || (events & POLLET)) {
/* Check for read events again, but this time under lock */
if (events & (POLLIN|POLLRDNORM)) {
mutex_enter(&so->so_lock);
if (SO_HAVE_DATA(so) ||
!list_is_empty(&so->so_acceptq_list)) {
+ if (events & POLLET) {
+ so->so_pollev |= SO_POLLEV_IN;
+ *phpp = &so->so_poll_list;
+ }
+
mutex_exit(&so->so_lock);
*reventsp |= (POLLIN|POLLRDNORM) & events;
+
return (0);
} else {
so->so_pollev |= SO_POLLEV_IN;
@@ -1316,6 +1324,26 @@ so_queue_msg_impl(struct sonode *so, mblk_t *mp,
}
}
+ mutex_enter(&so->so_lock);
+ if (so->so_krecv_cb != NULL) {
+ boolean_t cont;
+ so_krecv_f func = so->so_krecv_cb;
+ void *arg = so->so_krecv_arg;
+
+ mutex_exit(&so->so_lock);
+ cont = func(so, mp, msg_size, flags & MSG_OOB, arg);
+ mutex_enter(&so->so_lock);
+ if (cont == B_TRUE) {
+ space_left = so->so_rcvbuf;
+ } else {
+ so->so_rcv_queued = so->so_rcvlowat;
+ *errorp = ENOSPC;
+ space_left = -1;
+ }
+ goto done_unlock;
+ }
+ mutex_exit(&so->so_lock);
+
if (flags & MSG_OOB) {
so_queue_oob(so, mp, msg_size);
mutex_enter(&so->so_lock);
@@ -1594,6 +1622,13 @@ so_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
return (ENOTCONN);
}
+ mutex_enter(&so->so_lock);
+ if (so->so_krecv_cb != NULL) {
+ mutex_exit(&so->so_lock);
+ return (EOPNOTSUPP);
+ }
+ mutex_exit(&so->so_lock);
+
if (msg->msg_flags & MSG_PEEK)
msg->msg_flags &= ~MSG_WAITALL;
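
Both poll-related changes in this file are consumer-visible: POLLRDHUP is now reported once the last read-side notification has been sent, so a poller can learn that no more data will arrive without reading a zero-length result, and POLLET forces the pollhead to be re-registered even when events are already pending, which is what an edge-triggered consumer (for example an epoll compatibility layer built on event ports) needs. A minimal sketch of the POLLRDHUP side, assuming the flag is exposed through <poll.h> after this change; the helper is hypothetical:

#include <poll.h>

int
wait_for_data_or_hup(int fd, int timeout_ms)
{
	struct pollfd pfd;

	pfd.fd = fd;
	pfd.events = POLLIN | POLLRDHUP;
	pfd.revents = 0;

	if (poll(&pfd, 1, timeout_ms) <= 0)
		return (-1);

	if (pfd.revents & POLLRDHUP)
		return (1);	/* peer has finished sending */

	return (0);		/* ordinary data is available */
}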
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c
index 957c8f93b4..9604ea5dba 100644
--- a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c
+++ b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c
@@ -24,6 +24,7 @@
*/
/*
* Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#include <sys/types.h>
@@ -2276,9 +2277,9 @@ so_tpi_fallback(struct sonode *so, struct cred *cr)
fbfunc = sp->sp_smod_info->smod_proto_fallback_func;
/*
- * Cannot fallback if the socket has active filters
+ * Cannot fall back if the socket has active filters or a krecv callback.
*/
- if (so->so_filter_active > 0)
+ if (so->so_filter_active > 0 || so->so_krecv_cb != NULL)
return (EINVAL);
switch (so->so_family) {
@@ -2456,3 +2457,50 @@ out:
return (error);
}
+
+int
+so_krecv_set(sonode_t *so, so_krecv_f cb, void *arg)
+{
+ int ret;
+
+ if (cb == NULL && arg != NULL)
+ return (EINVAL);
+
+ SO_BLOCK_FALLBACK(so, so_krecv_set(so, cb, arg));
+
+ mutex_enter(&so->so_lock);
+ if (so->so_state & SS_FALLBACK_COMP) {
+ mutex_exit(&so->so_lock);
+ SO_UNBLOCK_FALLBACK(so);
+ return (ENOTSUP);
+ }
+
+ ret = so_lock_read(so, 0);
+ VERIFY(ret == 0);
+ /*
+ * Other consumers may actually care about having extant data delivered
+ * to them; when such consumers come along, they should figure out the
+ * best API for that.
+ */
+ so_rcv_flush(so);
+
+ so->so_krecv_cb = cb;
+ so->so_krecv_arg = arg;
+
+ so_unlock_read(so);
+ mutex_exit(&so->so_lock);
+ SO_UNBLOCK_FALLBACK(so);
+
+ return (0);
+}
+
+void
+so_krecv_unblock(sonode_t *so)
+{
+ mutex_enter(&so->so_lock);
+ VERIFY(so->so_krecv_cb != NULL);
+
+ so->so_rcv_queued = 0;
+ (void) so_check_flow_control(so);
+ mutex_exit(&so->so_lock);
+}
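
so_krecv_set() and so_krecv_unblock() give an in-kernel consumer a way to divert a socket's inbound data before it reaches the receive queue. The sketch below is hypothetical: the exact so_krecv_f prototype is inferred from the call site in so_queue_msg_impl() and should be checked against the header that declares it, and krecv_count()/krecv_attach() are invented names.

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/socketvar.h>

static uint64_t krecv_bytes;

/*
 * Count inbound bytes and discard the data.  Returning B_TRUE keeps the
 * receive window open; returning B_FALSE asserts flow control until
 * so_krecv_unblock() is called.
 */
static boolean_t
krecv_count(sonode_t *so, mblk_t *mp, size_t len, int oob, void *arg)
{
	(void) so;
	(void) oob;
	(void) arg;

	krecv_bytes += len;
	freemsg(mp);
	return (B_TRUE);
}

static int
krecv_attach(sonode_t *so)
{
	/* Flushes anything already queued and diverts future inbound data. */
	return (so_krecv_set(so, krecv_count, NULL));
}

Note that once a callback is installed, so_recvmsg() returns EOPNOTSUPP and the socket can no longer fall back to TPI, per the checks added above.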
diff --git a/usr/src/uts/common/fs/sockfs/sockfilter.c b/usr/src/uts/common/fs/sockfs/sockfilter.c
index 971523945e..7dca6ae6fc 100644
--- a/usr/src/uts/common/fs/sockfs/sockfilter.c
+++ b/usr/src/uts/common/fs/sockfs/sockfilter.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#include <sys/systm.h>
@@ -246,6 +247,18 @@ sof_setsockopt_impl(struct sonode *so, int option_name,
/* Module loaded OK, so there must be an ops vector */
ASSERT(ent->sofe_mod != NULL);
+
+ /*
+ * Check again to confirm that attaching is still permitted: fail if
+ * the module is not SOF_ATT_SAFE and an unsafe operation has already
+ * taken place.
+ */
+ if ((ent->sofe_mod->sofm_flags & SOF_ATT_SAFE) == 0 &&
+ so->so_state & SS_FILOP_UNSF) {
+ sof_instance_destroy(inst);
+ return (EINVAL);
+ }
+
inst->sofi_ops = &ent->sofe_mod->sofm_ops;
SOF_STAT_ADD(inst, tot_active_attach, 1);
@@ -1444,7 +1457,13 @@ sof_filter_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
* sof_register(version, name, ops, flags)
*
* Register a socket filter identified by name `name' and which should use
- * the ops vector `ops' for event notification. `flags' should be set to 0.
+ * the ops vector `ops' for event notification. `flags' should be set to 0
+ * by default for "unsafe" modules or SOF_ATT_SAFE for "safe" modules. An
+ * unsafe filter is one that cannot be attached after any socket operation has
+ * occurred. This is the legacy default. A "safe" filter can be attached even
+ * after some basic initial socket operations have taken place. This set is
+ * currently bind, getsockname, getsockopt and setsockopt. The order in which
+ * a "safe" filter can be attached is more relaxed, and thus more flexible.
* On success 0 is returned, otherwise an errno is returned.
*/
int
@@ -1452,14 +1471,13 @@ sof_register(int version, const char *name, const sof_ops_t *ops, int flags)
{
sof_module_t *mod;
- _NOTE(ARGUNUSED(flags));
-
if (version != SOF_VERSION)
return (EINVAL);
mod = kmem_zalloc(sizeof (sof_module_t), KM_SLEEP);
mod->sofm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
(void) strcpy(mod->sofm_name, name);
+ mod->sofm_flags = flags;
mod->sofm_ops = *ops;
mutex_enter(&sof_module_lock);
diff --git a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h
index 7f7aece1f1..cf2ad8b20d 100644
--- a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h
+++ b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#ifndef _SOCKFS_SOCKFILTER_H
@@ -51,6 +52,7 @@ typedef struct sof_kstat sof_kstat_t;
struct sof_module {
char *sofm_name;
+ int sofm_flags;
sof_ops_t sofm_ops;
uint_t sofm_refcnt;
list_node_t sofm_node;
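
A filter module that wants the relaxed attach behavior passes the new flag when it registers. A hedged sketch; myfilt_ops is a stand-in for whatever sof_ops_t the module already defines, and SOF_ATT_SAFE is assumed to live alongside the other filter definitions:

#include <sys/sockfilter.h>

static sof_ops_t myfilt_ops;	/* hypothetical ops vector */

int
myfilt_register(void)
{
	/*
	 * SOF_ATT_SAFE allows attach after bind(), getsockname(),
	 * getsockopt() and setsockopt(); passing 0 keeps the legacy
	 * attach-before-any-operation behavior.
	 */
	return (sof_register(SOF_VERSION, "myfilt", &myfilt_ops,
	    SOF_ATT_SAFE));
}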
diff --git a/usr/src/uts/common/fs/sockfs/socknotify.c b/usr/src/uts/common/fs/sockfs/socknotify.c
index 3d5ba2a7e8..3f858afecc 100644
--- a/usr/src/uts/common/fs/sockfs/socknotify.c
+++ b/usr/src/uts/common/fs/sockfs/socknotify.c
@@ -377,7 +377,7 @@ i_so_notify_last_rx(struct sonode *so, int *pollev, int *sigev)
so->so_state |= SS_SENTLASTREADSIG;
so->so_pollev &= ~SO_POLLEV_IN;
- *pollev |= POLLIN|POLLRDNORM;
+ *pollev |= POLLIN|POLLRDNORM|POLLRDHUP;
*sigev |= SOCKETSIG_READ;
return (1);
diff --git a/usr/src/uts/common/fs/sockfs/socksubr.c b/usr/src/uts/common/fs/sockfs/socksubr.c
index 06d76044e5..095d84fc42 100644
--- a/usr/src/uts/common/fs/sockfs/socksubr.c
+++ b/usr/src/uts/common/fs/sockfs/socksubr.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015, Joyent, Inc. All rights reserved.
*/
#include <sys/types.h>
@@ -1879,7 +1880,7 @@ ssize_t
soreadfile(file_t *fp, uchar_t *buf, u_offset_t fileoff, int *err, size_t size)
{
struct uio auio;
- struct iovec aiov[MSG_MAXIOVLEN];
+ struct iovec aiov[1];
register vnode_t *vp;
int ioflag, rwflag;
ssize_t cnt;
diff --git a/usr/src/uts/common/fs/sockfs/socksyscalls.c b/usr/src/uts/common/fs/sockfs/socksyscalls.c
index a86cda937c..067eaedd7c 100644
--- a/usr/src/uts/common/fs/sockfs/socksyscalls.c
+++ b/usr/src/uts/common/fs/sockfs/socksyscalls.c
@@ -21,10 +21,10 @@
/*
* Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc. All rights reserved.
*/
-/* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */
-
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
@@ -51,6 +51,7 @@
#include <sys/cmn_err.h>
#include <sys/vmsystm.h>
#include <sys/policy.h>
+#include <sys/limits.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
@@ -83,12 +84,6 @@ extern void nl7c_init(void);
extern int sockfs_defer_nl7c_init;
/*
- * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c"
- * as there isn't a formal definition of IOV_MAX ???
- */
-#define MSG_MAXIOVLEN 16
-
-/*
* Kernel component of socket creation.
*
* The socket library determines which version number to use.
@@ -1023,9 +1018,10 @@ recvmsg(int sock, struct nmsghdr *msg, int flags)
STRUCT_HANDLE(nmsghdr, umsgptr);
struct nmsghdr lmsg;
struct uio auio;
- struct iovec aiov[MSG_MAXIOVLEN];
+ struct iovec buf[IOV_MAX_STACK], *aiov = buf;
+ ssize_t iovsize = 0;
int iovcnt;
- ssize_t len;
+ ssize_t len, rval;
int i;
int *flagsp;
model_t model;
@@ -1068,22 +1064,37 @@ recvmsg(int sock, struct nmsghdr *msg, int flags)
iovcnt = lmsg.msg_iovlen;
- if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
+ if (iovcnt <= 0 || iovcnt > IOV_MAX) {
return (set_errno(EMSGSIZE));
}
+ if (iovcnt > IOV_MAX_STACK) {
+ iovsize = iovcnt * sizeof (struct iovec);
+ aiov = kmem_alloc(iovsize, KM_SLEEP);
+ }
+
#ifdef _SYSCALL32_IMPL
/*
* 32-bit callers need to have their iovec expanded, while ensuring
* that they can't move more than 2Gbytes of data in a single call.
*/
if (model == DATAMODEL_ILP32) {
- struct iovec32 aiov32[MSG_MAXIOVLEN];
+ struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
+ ssize_t iov32size;
ssize32_t count32;
- if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
- iovcnt * sizeof (struct iovec32)))
+ iov32size = iovcnt * sizeof (struct iovec32);
+ if (iovsize != 0)
+ aiov32 = kmem_alloc(iov32size, KM_SLEEP);
+
+ if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) {
+ if (iovsize != 0) {
+ kmem_free(aiov32, iov32size);
+ kmem_free(aiov, iovsize);
+ }
+
return (set_errno(EFAULT));
+ }
count32 = 0;
for (i = 0; i < iovcnt; i++) {
@@ -1091,15 +1102,28 @@ recvmsg(int sock, struct nmsghdr *msg, int flags)
iovlen32 = aiov32[i].iov_len;
count32 += iovlen32;
- if (iovlen32 < 0 || count32 < 0)
+ if (iovlen32 < 0 || count32 < 0) {
+ if (iovsize != 0) {
+ kmem_free(aiov32, iov32size);
+ kmem_free(aiov, iovsize);
+ }
+
return (set_errno(EINVAL));
+ }
+
aiov[i].iov_len = iovlen32;
aiov[i].iov_base =
(caddr_t)(uintptr_t)aiov32[i].iov_base;
}
+
+ if (iovsize != 0)
+ kmem_free(aiov32, iov32size);
} else
#endif /* _SYSCALL32_IMPL */
if (copyin(lmsg.msg_iov, aiov, iovcnt * sizeof (struct iovec))) {
+ if (iovsize != 0)
+ kmem_free(aiov, iovsize);
+
return (set_errno(EFAULT));
}
len = 0;
@@ -1107,6 +1131,9 @@ recvmsg(int sock, struct nmsghdr *msg, int flags)
ssize_t iovlen = aiov[i].iov_len;
len += iovlen;
if (iovlen < 0 || len < 0) {
+ if (iovsize != 0)
+ kmem_free(aiov, iovsize);
+
return (set_errno(EINVAL));
}
}
@@ -1121,12 +1148,20 @@ recvmsg(int sock, struct nmsghdr *msg, int flags)
(do_useracc == 0 ||
useracc(lmsg.msg_control, lmsg.msg_controllen,
B_WRITE) != 0)) {
+ if (iovsize != 0)
+ kmem_free(aiov, iovsize);
+
return (set_errno(EFAULT));
}
- return (recvit(sock, &lmsg, &auio, flags,
+ rval = recvit(sock, &lmsg, &auio, flags,
STRUCT_FADDR(umsgptr, msg_namelen),
- STRUCT_FADDR(umsgptr, msg_controllen), flagsp));
+ STRUCT_FADDR(umsgptr, msg_controllen), flagsp);
+
+ if (iovsize != 0)
+ kmem_free(aiov, iovsize);
+
+ return (rval);
}
/*
@@ -1264,9 +1299,10 @@ sendmsg(int sock, struct nmsghdr *msg, int flags)
struct nmsghdr lmsg;
STRUCT_DECL(nmsghdr, u_lmsg);
struct uio auio;
- struct iovec aiov[MSG_MAXIOVLEN];
+ struct iovec buf[IOV_MAX_STACK], *aiov = buf;
+ ssize_t iovsize = 0;
int iovcnt;
- ssize_t len;
+ ssize_t len, rval;
int i;
model_t model;
@@ -1309,7 +1345,7 @@ sendmsg(int sock, struct nmsghdr *msg, int flags)
iovcnt = lmsg.msg_iovlen;
- if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
+ if (iovcnt <= 0 || iovcnt > IOV_MAX) {
/*
* Unless this is XPG 4.2 we allow iovcnt == 0 to
* be compatible with SunOS 4.X and 4.4BSD.
@@ -1318,19 +1354,34 @@ sendmsg(int sock, struct nmsghdr *msg, int flags)
return (set_errno(EMSGSIZE));
}
+ if (iovcnt > IOV_MAX_STACK) {
+ iovsize = iovcnt * sizeof (struct iovec);
+ aiov = kmem_alloc(iovsize, KM_SLEEP);
+ }
+
#ifdef _SYSCALL32_IMPL
/*
* 32-bit callers need to have their iovec expanded, while ensuring
* that they can't move more than 2Gbytes of data in a single call.
*/
if (model == DATAMODEL_ILP32) {
- struct iovec32 aiov32[MSG_MAXIOVLEN];
+ struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
+ ssize_t iov32size;
ssize32_t count32;
+ iov32size = iovcnt * sizeof (struct iovec32);
+ if (iovsize != 0)
+ aiov32 = kmem_alloc(iov32size, KM_SLEEP);
+
if (iovcnt != 0 &&
- copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
- iovcnt * sizeof (struct iovec32)))
+ copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) {
+ if (iovsize != 0) {
+ kmem_free(aiov32, iov32size);
+ kmem_free(aiov, iovsize);
+ }
+
return (set_errno(EFAULT));
+ }
count32 = 0;
for (i = 0; i < iovcnt; i++) {
@@ -1338,17 +1389,30 @@ sendmsg(int sock, struct nmsghdr *msg, int flags)
iovlen32 = aiov32[i].iov_len;
count32 += iovlen32;
- if (iovlen32 < 0 || count32 < 0)
+ if (iovlen32 < 0 || count32 < 0) {
+ if (iovsize != 0) {
+ kmem_free(aiov32, iov32size);
+ kmem_free(aiov, iovsize);
+ }
+
return (set_errno(EINVAL));
+ }
+
aiov[i].iov_len = iovlen32;
aiov[i].iov_base =
(caddr_t)(uintptr_t)aiov32[i].iov_base;
}
+
+ if (iovsize != 0)
+ kmem_free(aiov32, iov32size);
} else
#endif /* _SYSCALL32_IMPL */
if (iovcnt != 0 &&
copyin(lmsg.msg_iov, aiov,
(unsigned)iovcnt * sizeof (struct iovec))) {
+ if (iovsize != 0)
+ kmem_free(aiov, iovsize);
+
return (set_errno(EFAULT));
}
len = 0;
@@ -1356,6 +1420,9 @@ sendmsg(int sock, struct nmsghdr *msg, int flags)
ssize_t iovlen = aiov[i].iov_len;
len += iovlen;
if (iovlen < 0 || len < 0) {
+ if (iovsize != 0)
+ kmem_free(aiov, iovsize);
+
return (set_errno(EINVAL));
}
}
@@ -1366,7 +1433,12 @@ sendmsg(int sock, struct nmsghdr *msg, int flags)
auio.uio_segflg = UIO_USERSPACE;
auio.uio_limit = 0;
- return (sendit(sock, &lmsg, &auio, flags));
+ rval = sendit(sock, &lmsg, &auio, flags);
+
+ if (iovsize != 0)
+ kmem_free(aiov, iovsize);
+
+ return (rval);
}
ssize_t
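
recvmsg() and sendmsg() now share the same allocation shape: a fixed on-stack iovec array for the common case, a kmem allocation once the count exceeds IOV_MAX_STACK, and a matching free on every exit path. A condensed, hypothetical helper showing just that pattern (the real code keeps it inline so it can also drive the 32-bit iovec expansion):

#include <sys/types.h>
#include <sys/uio.h>
#include <sys/limits.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/errno.h>

static int
copyin_iov(const struct iovec *uiov, int iovcnt, struct iovec *stackbuf,
    struct iovec **aiovp, ssize_t *iovsizep)
{
	struct iovec *aiov = stackbuf;	/* caller provides IOV_MAX_STACK slots */
	ssize_t iovsize = 0;

	if (iovcnt <= 0 || iovcnt > IOV_MAX)
		return (EMSGSIZE);

	if (iovcnt > IOV_MAX_STACK) {
		iovsize = iovcnt * sizeof (struct iovec);
		aiov = kmem_alloc(iovsize, KM_SLEEP);
	}

	if (copyin(uiov, aiov, iovcnt * sizeof (struct iovec)) != 0) {
		if (iovsize != 0)
			kmem_free(aiov, iovsize);
		return (EFAULT);
	}

	*aiovp = aiov;
	*iovsizep = iovsize;	/* nonzero means the caller must kmem_free */
	return (0);
}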
diff --git a/usr/src/uts/common/fs/sockfs/socktpi.c b/usr/src/uts/common/fs/sockfs/socktpi.c
index d33f53f7a6..485e73eb02 100644
--- a/usr/src/uts/common/fs/sockfs/socktpi.c
+++ b/usr/src/uts/common/fs/sockfs/socktpi.c
@@ -6272,6 +6272,13 @@ sotpi_poll(
if (sti->sti_conn_ind_head != NULL)
*reventsp |= (POLLIN|POLLRDNORM) & events;
+ if (so->so_state & SS_CANTRCVMORE) {
+ *reventsp |= POLLRDHUP & events;
+
+ if (so->so_state & SS_CANTSENDMORE)
+ *reventsp |= POLLHUP;
+ }
+
if (so->so_state & SS_OOBPEND)
*reventsp |= POLLRDBAND & events;
diff --git a/usr/src/uts/common/fs/swapfs/swap_subr.c b/usr/src/uts/common/fs/swapfs/swap_subr.c
index 74c4302da9..a4d983665b 100644
--- a/usr/src/uts/common/fs/swapfs/swap_subr.c
+++ b/usr/src/uts/common/fs/swapfs/swap_subr.c
@@ -110,9 +110,11 @@ swapfs_recalc(pgcnt_t pgs)
* memory that can be used as swap space should do so by
* setting swapfs_desfree at boot time, not swapfs_minfree.
* However, swapfs_minfree is tunable by install as a
- * workaround for bugid 1147463.
+ * workaround for bugid 1147463. Note that swapfs_minfree is set
+ * to 1/8th of memory, but is capped at 256 MB.
*/
- new_swapfs_minfree = MAX(btopr(2 * 1024 * 1024), pgs >> 3);
+ new_swapfs_minfree = MIN(MAX(btopr(2 * 1024 * 1024), pgs >> 3),
+ btopr(256 * 1024 * 1024));
}
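
Concretely (assuming 4 KiB pages), pgs >> 3 exceeds the new clamp once a system has more than 2 GiB of memory, so an 8 GiB machine now reserves 256 MB for the rest of the system instead of 1 GiB; smaller systems still get the 1/8th figure.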
/*
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_dir.c b/usr/src/uts/common/fs/tmpfs/tmp_dir.c
index f6621c8097..387cc6ae54 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_dir.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_dir.c
@@ -516,7 +516,7 @@ tdirdelete(
*/
namelen = strlen(tpdp->td_name) + 1;
- tmp_memfree(tpdp, sizeof (struct tdirent) + namelen);
+ kmem_free(tpdp, sizeof (struct tdirent) + namelen);
dir->tn_size -= (sizeof (struct tdirent) + namelen);
dir->tn_dirents--;
@@ -549,8 +549,8 @@ tdirinit(
ASSERT(RW_WRITE_HELD(&parent->tn_rwlock));
ASSERT(dir->tn_type == VDIR);
- dot = tmp_memalloc(sizeof (struct tdirent) + 2, TMP_MUSTHAVE);
- dotdot = tmp_memalloc(sizeof (struct tdirent) + 3, TMP_MUSTHAVE);
+ dot = kmem_zalloc(sizeof (struct tdirent) + 2, KM_SLEEP);
+ dotdot = kmem_zalloc(sizeof (struct tdirent) + 3, KM_SLEEP);
/*
* Initialize the entries
@@ -650,7 +650,7 @@ tdirtrunc(struct tmpnode *dir)
tmpfs_hash_out(tdp);
- tmp_memfree(tdp, sizeof (struct tdirent) + namelen);
+ kmem_free(tdp, sizeof (struct tdirent) + namelen);
dir->tn_size -= (sizeof (struct tdirent) + namelen);
dir->tn_dirents--;
}
@@ -925,7 +925,7 @@ tdiraddentry(
*/
namelen = strlen(name) + 1;
alloc_size = namelen + sizeof (struct tdirent);
- tdp = tmp_memalloc(alloc_size, 0);
+ tdp = kmem_zalloc(alloc_size, KM_NOSLEEP | KM_NORMALPRI);
if (tdp == NULL)
return (ENOSPC);
@@ -1025,7 +1025,7 @@ tdirmaketnode(
((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime)))
return (EOVERFLOW);
type = va->va_type;
- tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE);
+ tp = kmem_zalloc(sizeof (struct tmpnode), KM_SLEEP);
tmpnode_init(tm, tp, va, cred);
/* setup normal file/dir's extended attribute directory */
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_subr.c b/usr/src/uts/common/fs/tmpfs/tmp_subr.c
index 2e59d28d80..e6e2b392fe 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_subr.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_subr.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#include <sys/types.h>
@@ -40,9 +41,19 @@
#include <sys/policy.h>
#include <sys/fs/tmp.h>
#include <sys/fs/tmpnode.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+
+#define KILOBYTE 1024
+#define MEGABYTE (1024 * KILOBYTE)
+#define GIGABYTE (1024 * MEGABYTE)
#define MODESHIFT 3
+#define VALIDMODEBITS 07777
+
+extern pgcnt_t swapfs_minfree;
+
int
tmp_taccess(void *vtp, int mode, struct cred *cred)
{
@@ -71,7 +82,6 @@ tmp_taccess(void *vtp, int mode, struct cred *cred)
* a plain file and you have write access to that file.
* Function returns 0 if remove access is granted.
*/
-
int
tmp_sticky_remove_access(struct tmpnode *dir, struct tmpnode *entry,
struct cred *cr)
@@ -89,111 +99,122 @@ tmp_sticky_remove_access(struct tmpnode *dir, struct tmpnode *entry,
}
/*
- * Allocate zeroed memory if tmpfs_maxkmem has not been exceeded
- * or the 'musthave' flag is set. 'musthave' allocations should
- * always be subordinate to normal allocations so that tmpfs_maxkmem
- * can't be exceeded by more than a few KB. Example: when creating
- * a new directory, the tmpnode is a normal allocation; if that
- * succeeds, the dirents for "." and ".." are 'musthave' allocations.
- */
-void *
-tmp_memalloc(size_t size, int musthave)
-{
- static time_t last_warning;
- time_t now;
-
- if (atomic_add_long_nv(&tmp_kmemspace, size) < tmpfs_maxkmem ||
- musthave)
- return (kmem_zalloc(size, KM_SLEEP));
-
- atomic_add_long(&tmp_kmemspace, -size);
- now = gethrestime_sec();
- if (last_warning != now) {
- last_warning = now;
- cmn_err(CE_WARN, "tmp_memalloc: tmpfs over memory limit");
- }
- return (NULL);
-}
-
-void
-tmp_memfree(void *cp, size_t size)
-{
- kmem_free(cp, size);
- atomic_add_long(&tmp_kmemspace, -size);
-}
-
-/*
* Convert a string containing a number (number of bytes) to a pgcnt_t,
* containing the corresponding number of pages. On 32-bit kernels, the
* maximum value encoded in 'str' is PAGESIZE * ULONG_MAX, while the value
* returned in 'maxpg' is at most ULONG_MAX.
*
- * If the number is followed by a "k" or "K", the value is converted from
- * kilobytes to bytes. If it is followed by an "m" or "M" it is converted
- * from megabytes to bytes. If it is not followed by a character it is
- * assumed to be in bytes. Multiple letter options are allowed, so for instance
- * '2mk' is interpreted as 2gb.
+ * The number may be followed by a magnitude suffix: "k" or "K" for kilobytes;
+ * "m" or "M" for megabytes; "g" or "G" for gigabytes. This interface allows
+ * for an arguably esoteric interpretation of multiple suffix characters:
+ * namely, they cascade. For example, the caller may specify "2mk", which is
+ * interpreted as 2 gigabytes. It would seem, at this late stage, that the
+ * horse has left not only the barn but indeed the country, and possibly the
+ * entire planetary system. Alternatively, the number may be followed by a
+ * single '%' sign, indicating the size is a percentage of either the zone's
+ * swap limit or the system's overall swap size.
*
* Parse and overflow errors are detected and a non-zero number returned on
* error.
*/
-
int
tmp_convnum(char *str, pgcnt_t *maxpg)
{
- uint64_t num = 0, oldnum;
+ u_longlong_t num = 0;
#ifdef _LP64
- uint64_t max_bytes = ULONG_MAX;
+ u_longlong_t max_bytes = ULONG_MAX;
#else
- uint64_t max_bytes = PAGESIZE * (uint64_t)ULONG_MAX;
+ u_longlong_t max_bytes = PAGESIZE * (uint64_t)ULONG_MAX;
#endif
char *c;
-
- if (str == NULL)
+ const struct convchar {
+ char *cc_char;
+ uint64_t cc_factor;
+ } convchars[] = {
+ { "kK", KILOBYTE },
+ { "mM", MEGABYTE },
+ { "gG", GIGABYTE },
+ { NULL, 0 }
+ };
+
+ if (str == NULL) {
return (EINVAL);
+ }
c = str;
/*
- * Convert str to number
+ * Convert the initial numeric portion of the input string.
*/
- while ((*c >= '0') && (*c <= '9')) {
- oldnum = num;
- num = num * 10 + (*c++ - '0');
- if (oldnum > num) /* overflow */
+ if (ddi_strtoull(str, &c, 10, &num) != 0) {
+ return (EINVAL);
+ }
+
+ /*
+ * Handle a size in percent. Anything other than a single percent
+ * modifier is invalid. We use either the zone's swap limit or the
+ * system's total available swap size as the initial value. Perform the
+ * intermediate calculation in pages to avoid overflow.
+ */
+ if (*c == '%') {
+ u_longlong_t cap;
+
+ if (*(c + 1) != '\0')
+ return (EINVAL);
+
+ if (num > 100)
return (EINVAL);
+
+ cap = (u_longlong_t)curproc->p_zone->zone_max_swap_ctl;
+ if (cap == UINT64_MAX) {
+ /*
+ * Use the amount of available physical swap and memory swap.
+ */
+ mutex_enter(&anoninfo_lock);
+ cap = TOTAL_AVAILABLE_SWAP;
+ mutex_exit(&anoninfo_lock);
+ } else {
+ cap = btop(cap);
+ }
+
+ num = ptob(cap * num / 100);
+ goto done;
}
/*
- * Terminate on null
+ * Apply the (potentially cascading) magnitude suffixes until an
+ * invalid character is found, or the string comes to an end.
*/
- while (*c != '\0') {
- switch (*c++) {
+ for (; *c != '\0'; c++) {
+ int i;
+
+ for (i = 0; convchars[i].cc_char != NULL; i++) {
+ /*
+ * Check if this character matches this multiplier
+ * class:
+ */
+ if (strchr(convchars[i].cc_char, *c) != NULL) {
+ /*
+ * Check for overflow:
+ */
+ if (num > max_bytes / convchars[i].cc_factor) {
+ return (EINVAL);
+ }
+
+ num *= convchars[i].cc_factor;
+ goto valid_char;
+ }
+ }
/*
- * convert from kilobytes
+ * This was not a valid multiplier suffix character.
*/
- case 'k':
- case 'K':
- if (num > max_bytes / 1024) /* will overflow */
- return (EINVAL);
- num *= 1024;
- break;
+ return (EINVAL);
- /*
- * convert from megabytes
- */
- case 'm':
- case 'M':
- if (num > max_bytes / (1024 * 1024)) /* will overflow */
- return (EINVAL);
- num *= 1024 * 1024;
- break;
-
- default:
- return (EINVAL);
- }
+valid_char:
+ continue;
}
+done:
/*
* Since btopr() rounds up to page granularity, this round-up can
* cause an overflow only if 'num' is between (max_bytes - PAGESIZE)
@@ -204,3 +225,29 @@ tmp_convnum(char *str, pgcnt_t *maxpg)
return (EINVAL);
return (0);
}
+
+/*
+ * Parse an octal mode string for use as the permissions set for the root
+ * of the tmpfs mount.
+ */
+int
+tmp_convmode(char *str, mode_t *mode)
+{
+ ulong_t num;
+ char *c;
+
+ if (str == NULL) {
+ return (EINVAL);
+ }
+
+ if (ddi_strtoul(str, &c, 8, &num) != 0) {
+ return (EINVAL);
+ }
+
+ if ((num & ~VALIDMODEBITS) != 0) {
+ return (EINVAL);
+ }
+
+ *mode = VALIDMODEBITS & num;
+ return (0);
+}
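
Since tmp_convnum() and tmp_convmode() back the user-visible "size" and "mode" mount options, a few worked conversions may help. The figures below assume 4 KiB pages and a zone swap cap of 1 GiB, and are illustrative only:

/*
 *	"2g"	-> tmp_convnum() yields 524288 pages (2 GiB)
 *	"2mk"	-> also 524288 pages; magnitude suffixes cascade
 *	"50%"	-> 131072 pages (half of the 1 GiB swap cap)
 *	"1777"	-> tmp_convmode() yields S_ISVTX | 0777 for the root node
 */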
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c
index f8a36a528f..3c088c442c 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#include <sys/types.h>
@@ -55,6 +56,15 @@
static int tmpfsfstype;
/*
+ * tmpfs_mountcount is used to prevent module unloads while there is still
+ * state from a former mount hanging around. With forced umount support, the
+ * filesystem module must not be allowed to go away before the last
+ * VFS_FREEVFS() call has been made. Since this is just an atomic counter,
+ * there's no need for locking.
+ */
+static uint32_t tmpfs_mountcount;
+
+/*
* tmpfs vfs operations.
*/
static int tmpfsinit(int, char *);
@@ -64,6 +74,7 @@ static int tmp_unmount(struct vfs *, int, struct cred *);
static int tmp_root(struct vfs *, struct vnode **);
static int tmp_statvfs(struct vfs *, struct statvfs64 *);
static int tmp_vget(struct vfs *, struct vnode **, struct fid *);
+static void tmp_freevfs(vfs_t *vfsp);
/*
* Loadable module wrapper
@@ -76,7 +87,7 @@ static vfsdef_t vfw = {
VFSDEF_VERSION,
"tmpfs",
tmpfsinit,
- VSW_HASPROTO|VSW_STATS|VSW_ZMOUNT,
+ VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT,
&tmpfs_proto_opttbl
};
@@ -90,7 +101,8 @@ static mntopt_t tmpfs_options[] = {
/* Option name Cancel Opt Arg Flags Data */
{ MNTOPT_XATTR, xattr_cancel, NULL, MO_DEFAULT, NULL},
{ MNTOPT_NOXATTR, noxattr_cancel, NULL, NULL, NULL},
- { "size", NULL, "0", MO_HASVALUE, NULL}
+ { "size", NULL, "0", MO_HASVALUE, NULL},
+ { "mode", NULL, NULL, MO_HASVALUE, NULL}
};
@@ -121,6 +133,14 @@ _fini()
{
int error;
+ /*
+ * If a forcibly unmounted instance is still hanging around, we cannot
+ * allow the module to be unloaded because that would cause panics once
+ * the VFS framework decides it's time to call into VFS_FREEVFS().
+ */
+ if (tmpfs_mountcount)
+ return (EBUSY);
+
error = mod_remove(&modlinkage);
if (error)
return (error);
@@ -139,14 +159,6 @@ _info(struct modinfo *modinfop)
}
/*
- * The following are patchable variables limiting the amount of system
- * resources tmpfs can use.
- *
- * tmpfs_maxkmem limits the amount of kernel kmem_alloc memory
- * tmpfs can use for it's data structures (e.g. tmpnodes, directory entries)
- * It is not determined by setting a hard limit but rather as a percentage of
- * physical memory which is determined when tmpfs is first used in the system.
- *
* tmpfs_minfree is the minimum amount of swap space that tmpfs leaves for
* the rest of the system. In other words, if the amount of free swap space
* in the system (i.e. anoninfo.ani_free) drops below tmpfs_minfree, tmpfs
@@ -155,9 +167,7 @@ _info(struct modinfo *modinfop)
* There is also a per mount limit on the amount of swap space
* (tmount.tm_anonmax) settable via a mount option.
*/
-size_t tmpfs_maxkmem = 0;
size_t tmpfs_minfree = 0;
-size_t tmp_kmemspace; /* bytes of kernel heap used by all tmpfs */
static major_t tmpfs_major;
static minor_t tmpfs_minor;
@@ -176,6 +186,7 @@ tmpfsinit(int fstype, char *name)
VFSNAME_ROOT, { .vfs_root = tmp_root },
VFSNAME_STATVFS, { .vfs_statvfs = tmp_statvfs },
VFSNAME_VGET, { .vfs_vget = tmp_vget },
+ VFSNAME_FREEVFS, { .vfs_freevfs = tmp_freevfs },
NULL, NULL
};
int error;
@@ -210,27 +221,17 @@ tmpfsinit(int fstype, char *name)
tmpfs_minfree = btopr(TMPMINFREE);
}
- /*
- * The maximum amount of space tmpfs can allocate is
- * TMPMAXPROCKMEM percent of kernel memory
- */
- if (tmpfs_maxkmem == 0)
- tmpfs_maxkmem = MAX(PAGESIZE, kmem_maxavail() / TMPMAXFRACKMEM);
-
if ((tmpfs_major = getudev()) == (major_t)-1) {
cmn_err(CE_WARN, "tmpfsinit: Can't get unique device number.");
tmpfs_major = 0;
}
mutex_init(&tmpfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
+ tmpfs_mountcount = 0;
return (0);
}
static int
-tmp_mount(
- struct vfs *vfsp,
- struct vnode *mvp,
- struct mounta *uap,
- struct cred *cr)
+tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
{
struct tmount *tm = NULL;
struct tmpnode *tp;
@@ -239,8 +240,9 @@ tmp_mount(
pgcnt_t anonmax;
struct vattr rattr;
int got_attrs;
-
- char *sizestr;
+ boolean_t mode_arg = B_FALSE;
+ mode_t root_mode = 0777;
+ char *argstr;
if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
return (error);
@@ -249,7 +251,7 @@ tmp_mount(
return (ENOTDIR);
mutex_enter(&mvp->v_lock);
- if ((uap->flags & MS_OVERLAY) == 0 &&
+ if ((uap->flags & MS_REMOUNT) == 0 && (uap->flags & MS_OVERLAY) == 0 &&
(mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
mutex_exit(&mvp->v_lock);
return (EBUSY);
@@ -275,18 +277,45 @@ tmp_mount(
* tm_anonmax is set according to the mount arguments
* if any. Otherwise, it is set to a maximum value.
*/
- if (vfs_optionisset(vfsp, "size", &sizestr)) {
- if ((error = tmp_convnum(sizestr, &anonmax)) != 0)
+ if (vfs_optionisset(vfsp, "size", &argstr)) {
+ if ((error = tmp_convnum(argstr, &anonmax)) != 0)
goto out;
} else {
anonmax = ULONG_MAX;
}
+ /*
+ * The "mode" mount argument allows the operator to override the
+ * permissions of the root of the tmpfs mount.
+ */
+ if (vfs_optionisset(vfsp, "mode", &argstr)) {
+ if ((error = tmp_convmode(argstr, &root_mode)) != 0) {
+ goto out;
+ }
+ mode_arg = B_TRUE;
+ }
+
if (error = pn_get(uap->dir,
(uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn))
goto out;
- if ((tm = tmp_memalloc(sizeof (struct tmount), 0)) == NULL) {
+ if (uap->flags & MS_REMOUNT) {
+ tm = (struct tmount *)VFSTOTM(vfsp);
+
+ /*
+ * If we change the size so that it is less than what is
+ * currently being used, we allow that. The file system will
+ * simply be full until enough files have been removed to get
+ * below the new maximum.
+ */
+ mutex_enter(&tm->tm_contents);
+ tm->tm_anonmax = anonmax;
+ mutex_exit(&tm->tm_contents);
+ goto out;
+ }
+
+ if ((tm = kmem_zalloc(sizeof (struct tmount),
+ KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
pn_free(&dpn);
error = ENOMEM;
goto out;
@@ -318,17 +347,17 @@ tmp_mount(
vfsp->vfs_bsize = PAGESIZE;
vfsp->vfs_flag |= VFS_NOTRUNC;
vfs_make_fsid(&vfsp->vfs_fsid, tm->tm_dev, tmpfsfstype);
- tm->tm_mntpath = tmp_memalloc(dpn.pn_pathlen + 1, TMP_MUSTHAVE);
+ tm->tm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP);
(void) strcpy(tm->tm_mntpath, dpn.pn_path);
/*
* allocate and initialize root tmpnode structure
*/
bzero(&rattr, sizeof (struct vattr));
- rattr.va_mode = (mode_t)(S_IFDIR | 0777); /* XXX modes */
+ rattr.va_mode = (mode_t)(S_IFDIR | root_mode);
rattr.va_type = VDIR;
rattr.va_rdev = 0;
- tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE);
+ tp = kmem_zalloc(sizeof (struct tmpnode), KM_SLEEP);
tmpnode_init(tm, tp, &rattr, cr);
/*
@@ -345,7 +374,14 @@ tmp_mount(
* the previously set hardwired defaults to prevail.
*/
if (got_attrs == 0) {
- tp->tn_mode = rattr.va_mode;
+ if (!mode_arg) {
+ /*
+ * Only use the underlying mount point for the
+ * mode if the "mode" mount argument was not
+ * provided.
+ */
+ tp->tn_mode = rattr.va_mode;
+ }
tp->tn_uid = rattr.va_uid;
tp->tn_gid = rattr.va_gid;
}
@@ -366,6 +402,7 @@ tmp_mount(
pn_free(&dpn);
error = 0;
+ atomic_inc_32(&tmpfs_mountcount);
out:
if (error == 0)
@@ -381,36 +418,107 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
struct tmpnode *tnp, *cancel;
struct vnode *vp;
int error;
+ uint_t cnt;
+ int i;
if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0)
return (error);
- /*
- * forced unmount is not supported by this file system
- * and thus, ENOTSUP, is being returned.
- */
- if (flag & MS_FORCE)
- return (ENOTSUP);
-
mutex_enter(&tm->tm_contents);
/*
- * If there are no open files, only the root node should have
- * a reference count.
+ * In the normal unmount case (non-forced unmount), if there are no
+ * open files, only the root node should have a reference count.
+ *
* With tm_contents held, nothing can be added or removed.
* There may be some dirty pages. To prevent fsflush from
* disrupting the unmount, put a hold on each node while scanning.
* If we find a previously referenced node, undo the holds we have
* placed and fail EBUSY.
+ *
+ * However, in the case of a forced umount, things are a bit different.
+ * An additional VFS_HOLD is added for each outstanding VN_HOLD to
+ * ensure that the file system is not cleaned up (tmp_freevfs) until
+ * the last vfs hold is dropped. This happens in tmp_inactive as the
+ * vnodes are released. Also, we can't add an additional VN_HOLD in
+ * this case since that would prevent tmp_inactive from ever being
+ * called. Finally, we do need to drop the zone ref now (zone_rele_ref)
+ * so that the zone is not blocked waiting for the final file system
+ * cleanup.
*/
tnp = tm->tm_rootnode;
- if (TNTOV(tnp)->v_count > 1) {
+
+ vp = TNTOV(tnp);
+ mutex_enter(&vp->v_lock);
+ cnt = vp->v_count;
+ if (flag & MS_FORCE) {
+ vfsp->vfs_flag |= VFS_UNMOUNTED;
+ /* Extra hold which we rele below when we drop the zone ref */
+ VFS_HOLD(vfsp);
+
+ for (i = 1; i < cnt; i++)
+ VFS_HOLD(vfsp);
+
+ /* drop the mutex now because no one can find this mount */
+ mutex_exit(&tm->tm_contents);
+ } else if (cnt > 1) {
+ mutex_exit(&vp->v_lock);
mutex_exit(&tm->tm_contents);
return (EBUSY);
}
+ mutex_exit(&vp->v_lock);
+ /*
+ * Check for open files. An open file causes everything to unwind
+ * unless this is a forced umount.
+ */
for (tnp = tnp->tn_forw; tnp; tnp = tnp->tn_forw) {
- if ((vp = TNTOV(tnp))->v_count > 0) {
+ vp = TNTOV(tnp);
+ mutex_enter(&vp->v_lock);
+ cnt = vp->v_count;
+ if (flag & MS_FORCE) {
+ for (i = 0; i < cnt; i++)
+ VFS_HOLD(vfsp);
+
+ /*
+ * In the case of a forced umount don't add an
+ * additional VN_HOLD on the already held vnodes, like
+ * we do in the non-forced unmount case. If the
+ * cnt > 0, then the vnode already has at least one
+ * hold and we need tmp_inactive to get called when the
+ * last pre-existing hold on the node is released so
+ * that we can VFS_RELE the VFS holds we just added.
+ */
+ if (cnt == 0) {
+ /* directly add VN_HOLD since have the lock */
+ vp->v_count++;
+ }
+
+ mutex_exit(&vp->v_lock);
+
+ /*
+ * If the tmpnode has any pages associated with it
+ * (i.e. if it's a normal file with non-zero size), the
+ * tmpnode could still be discovered by pageout or
+ * fsflush via the page vnode pointers. To prevent this
+ * from interfering with tmp_freevfs(), truncate the
+ * tmpnode now.
+ */
+ if (tnp->tn_size != 0 && tnp->tn_type == VREG) {
+ rw_enter(&tnp->tn_rwlock, RW_WRITER);
+ rw_enter(&tnp->tn_contents, RW_WRITER);
+
+ (void) tmpnode_trunc(tm, tnp, 0);
+
+ rw_exit(&tnp->tn_contents);
+ rw_exit(&tnp->tn_rwlock);
+
+ ASSERT(tnp->tn_size == 0);
+ ASSERT(tnp->tn_nblocks == 0);
+ }
+ } else if (cnt > 0) {
+ /* An open file; unwind the holds we've been adding. */
+ mutex_exit(&vp->v_lock);
cancel = tm->tm_rootnode->tn_forw;
while (cancel != tnp) {
vp = TNTOV(cancel);
@@ -420,14 +528,50 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
}
mutex_exit(&tm->tm_contents);
return (EBUSY);
+ } else {
+ /* directly add a VN_HOLD since we have the lock */
+ vp->v_count++;
+ mutex_exit(&vp->v_lock);
}
- VN_HOLD(vp);
}
- /*
- * We can drop the mutex now because no one can find this mount
- */
- mutex_exit(&tm->tm_contents);
+ if (flag & MS_FORCE) {
+ /*
+ * Drop the zone ref now since we don't know how long it will
+ * be until the final vfs_rele is called by tmp_inactive.
+ */
+ if (vfsp->vfs_zone) {
+ zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref,
+ ZONE_REF_VFS);
+ vfsp->vfs_zone = 0;
+ }
+ /* We can now drop the extra hold we added above. */
+ VFS_RELE(vfsp);
+ } else {
+ /*
+ * For the non-forced case, we can drop the mutex now because
+ * no one can find this mount anymore
+ */
+ vfsp->vfs_flag |= VFS_UNMOUNTED;
+ mutex_exit(&tm->tm_contents);
+ }
+
+ return (0);
+}
+
+/*
+ * Implementation of VFS_FREEVFS() to support forced umounts. This is called by
+ * the vfs framework after umount and the last VFS_RELE, to trigger the release
+ * of any resources still associated with the given vfs_t. We only add
+ * additional VFS_HOLDs during the forced umount case, so this is normally
+ * called immediately after tmp_unmount.
+ */
+void
+tmp_freevfs(vfs_t *vfsp)
+{
+ struct tmount *tm = (struct tmount *)VFSTOTM(vfsp);
+ struct tmpnode *tnp;
+ struct vnode *vp;
/*
* Free all kmemalloc'd and anonalloc'd memory associated with
@@ -437,6 +581,16 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
* tmpnode_free which assumes that the directory entry has been
* removed before the file.
*/
+
+ /*
+ * Now that we are tearing ourselves down, we need to remove the
+ * UNMOUNTED flag. If we don't, we'll later hit a VN_RELE while removing
+ * files from the system, which would leave us with a negative count.
+ * Doing this seems a bit better than trying to set a flag on the tmount
+ * that says we're tearing down.
+ */
+ vfsp->vfs_flag &= ~VFS_UNMOUNTED;
+
/*
* Remove all directory entries
*/
@@ -503,15 +657,16 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
ASSERT(tm->tm_mntpath);
- tmp_memfree(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1);
+ kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1);
ASSERT(tm->tm_anonmem == 0);
mutex_destroy(&tm->tm_contents);
mutex_destroy(&tm->tm_renamelck);
- tmp_memfree(tm, sizeof (struct tmount));
+ kmem_free(tm, sizeof (struct tmount));
- return (0);
+ /* Allow _fini() to succeed now */
+ atomic_dec_32(&tmpfs_mountcount);
}
/*
@@ -614,13 +769,7 @@ tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp)
* available to tmpfs. This is fairly inaccurate since it doesn't
* take into account the names stored in the directory entries.
*/
- if (tmpfs_maxkmem > tmp_kmemspace)
- sbp->f_ffree = (tmpfs_maxkmem - tmp_kmemspace) /
- (sizeof (struct tmpnode) + sizeof (struct tdirent));
- else
- sbp->f_ffree = 0;
-
- sbp->f_files = tmpfs_maxkmem /
+ sbp->f_ffree = sbp->f_files = ptob(availrmem) /
(sizeof (struct tmpnode) + sizeof (struct tdirent));
sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree);
(void) cmpldev(&d32, vfsp->vfs_dev);
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c
index 61607a6dcc..464d638db0 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c
@@ -25,7 +25,7 @@
*/
/*
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
@@ -584,6 +584,10 @@ tmp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred,
struct tmount *tm = (struct tmount *)VTOTM(vp);
int error;
+ /* If the filesystem was umounted by force, return immediately. */
+ if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
+ return (EIO);
+
/*
* We don't currently support reading non-regular files
*/
@@ -613,6 +617,10 @@ tmp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
struct tmount *tm = (struct tmount *)VTOTM(vp);
int error;
+ /* If the filesystem was umounted by force, return immediately. */
+ if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
+ return (EIO);
+
/*
* We don't currently support writing to non-regular files
*/
@@ -833,6 +841,9 @@ tmp_lookup(
struct tmpnode *ntp = NULL;
int error;
+ /* If the filesystem was umounted by force, return immediately. */
+ if (dvp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
+ return (EIO);
/* allow cd into @ dir */
if (flags & LOOKUP_XATTR) {
@@ -871,8 +882,7 @@ tmp_lookup(
return (error);
}
- xdp = tmp_memalloc(sizeof (struct tmpnode),
- TMP_MUSTHAVE);
+ xdp = kmem_zalloc(sizeof (struct tmpnode), KM_SLEEP);
tm = VTOTM(dvp);
tmpnode_init(tm, xdp, &tp->tn_attr, NULL);
/*
@@ -1186,7 +1196,7 @@ tmp_rename(
struct tmpnode *toparent;
struct tmpnode *fromtp = NULL; /* source tmpnode */
struct tmount *tm = (struct tmount *)VTOTM(odvp);
- int error;
+ int error, err;
int samedir = 0; /* set if odvp == ndvp */
struct vnode *realvp;
@@ -1262,15 +1272,6 @@ tmp_rename(
error = 0;
goto done;
}
- vnevent_rename_src(TNTOV(fromtp), odvp, onm, ct);
-
- /*
- * Notify the target directory if not same as
- * source directory.
- */
- if (ndvp != odvp) {
- vnevent_rename_dest_dir(ndvp, ct);
- }
/*
* Unlink from source.
@@ -1278,7 +1279,7 @@ tmp_rename(
rw_enter(&fromparent->tn_rwlock, RW_WRITER);
rw_enter(&fromtp->tn_rwlock, RW_WRITER);
- error = tdirdelete(fromparent, fromtp, onm, DR_RENAME, cred);
+ error = err = tdirdelete(fromparent, fromtp, onm, DR_RENAME, cred);
/*
* The following handles the case where our source tmpnode was
@@ -1293,6 +1294,12 @@ tmp_rename(
rw_exit(&fromtp->tn_rwlock);
rw_exit(&fromparent->tn_rwlock);
+
+ if (err == 0) {
+ vnevent_rename_src(TNTOV(fromtp), odvp, onm, ct);
+ vnevent_rename_dest_dir(ndvp, TNTOV(fromtp), nnm, ct);
+ }
+
done:
tmpnode_rele(fromtp);
mutex_exit(&tm->tm_renamelck);
@@ -1459,6 +1466,10 @@ tmp_readdir(
int reclen;
caddr_t outbuf;
+ /* If the filesystem was umounted by force, return immediately. */
+ if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
+ return (EIO);
+
if (uiop->uio_loffset >= MAXOFF_T) {
if (eofp)
*eofp = 1;
@@ -1597,7 +1608,7 @@ tmp_symlink(
return (error);
}
len = strlen(tnm) + 1;
- cp = tmp_memalloc(len, 0);
+ cp = kmem_alloc(len, KM_NOSLEEP | KM_NORMALPRI);
if (cp == NULL) {
tmpnode_rele(self);
return (ENOSPC);
@@ -1662,10 +1673,27 @@ top:
* there's little to do -- just drop our hold.
*/
if (vp->v_count > 1 || tp->tn_nlink != 0) {
- vp->v_count--;
+ if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) {
+ /*
+ * Since the file system was forcibly unmounted, we can
+ * have a case (v_count == 1, tn_nlink != 0) where this
+ * file was open so we didn't add an extra hold on the
+ * file in tmp_unmount. We are counting on the
+ * interaction of the hold made in tmp_unmount and
+ * released in tmp_freevfs, so we need to be sure we
+ * don't decrement in this case.
+ */
+ if (vp->v_count > 1)
+ vp->v_count--;
+ } else {
+ vp->v_count--;
+ }
mutex_exit(&vp->v_lock);
mutex_exit(&tp->tn_tlock);
rw_exit(&tp->tn_rwlock);
+ /* If the filesystem was umounted by force, rele the vfs ref */
+ if (tm->tm_vfsp->vfs_flag & VFS_UNMOUNTED)
+ VFS_RELE(tm->tm_vfsp);
return;
}
@@ -1690,7 +1718,7 @@ top:
goto top;
}
if (tp->tn_type == VLNK)
- tmp_memfree(tp->tn_symlink, tp->tn_size + 1);
+ kmem_free(tp->tn_symlink, tp->tn_size + 1);
}
/*
@@ -1724,7 +1752,11 @@ top:
rw_destroy(&tp->tn_rwlock);
mutex_destroy(&tp->tn_tlock);
vn_free(TNTOV(tp));
- tmp_memfree(tp, sizeof (struct tmpnode));
+ kmem_free(tp, sizeof (struct tmpnode));
+
+ /* If the filesystem was umounted by force, rele the vfs ref */
+ if (tm->tm_vfsp->vfs_flag & VFS_UNMOUNTED)
+ VFS_RELE(tm->tm_vfsp);
}
/* ARGSUSED2 */
@@ -1846,6 +1878,10 @@ tmp_getapage(
struct vnode *pvp;
u_offset_t poff;
+ /* If the filesystem was umounted by force, return immediately. */
+ if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
+ return (EIO);
+
if (protp != NULL)
*protp = PROT_ALL;
again:
@@ -2067,6 +2103,10 @@ tmp_putapage(
u_offset_t offset;
u_offset_t tmpoff;
+ /* If the filesystem was umounted by force, return immediately. */
+ if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
+ return (EIO);
+
ASSERT(PAGE_LOCKED(pp));
/* Kluster in tmp_klustsize chunks */
diff --git a/usr/src/uts/common/fs/udfs/udf_dir.c b/usr/src/uts/common/fs/udfs/udf_dir.c
index c1e2c74a87..def046a0bf 100644
--- a/usr/src/uts/common/fs/udfs/udf_dir.c
+++ b/usr/src/uts/common/fs/udfs/udf_dir.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
*/
#include <sys/types.h>
@@ -562,9 +563,8 @@ out:
namep, ctp);
}
- if (sdp != tdp) {
- vnevent_rename_dest_dir(ITOV(tdp), ctp);
- }
+ vnevent_rename_dest_dir(ITOV(tdp), ITOV(tip),
+ namep, ctp);
}
/*
diff --git a/usr/src/uts/common/fs/udfs/udf_vnops.c b/usr/src/uts/common/fs/udfs/udf_vnops.c
index 307d3987ed..2d8de23399 100644
--- a/usr/src/uts/common/fs/udfs/udf_vnops.c
+++ b/usr/src/uts/common/fs/udfs/udf_vnops.c
@@ -911,7 +911,7 @@ udf_rename(
caller_context_t *ct,
int flags)
{
- int32_t error = 0;
+ int32_t error = 0, err;
struct udf_vfs *udf_vfsp;
struct ud_inode *sip; /* source inode */
struct ud_inode *sdp, *tdp; /* source and target parent inode */
@@ -995,7 +995,6 @@ udf_rename(
rw_exit(&tdp->i_rwlock);
goto errout;
}
- vnevent_rename_src(ITOV(sip), sdvp, snm, ct);
rw_exit(&tdp->i_rwlock);
rw_enter(&sdp->i_rwlock, RW_WRITER);
@@ -1006,11 +1005,15 @@ udf_rename(
* If the entry has changed just forget about it. Release
* the source inode.
*/
- if ((error = ud_dirremove(sdp, snm, sip, (struct vnode *)0,
+ if ((error = err = ud_dirremove(sdp, snm, sip, (struct vnode *)0,
DR_RENAME, cr, ct)) == ENOENT) {
error = 0;
}
rw_exit(&sdp->i_rwlock);
+
+ if (err == 0)
+ vnevent_rename_src(ITOV(sip), sdvp, snm, ct);
+
errout:
ITIMES(sdp);
ITIMES(tdp);
diff --git a/usr/src/uts/common/fs/ufs/ufs_vnops.c b/usr/src/uts/common/fs/ufs/ufs_vnops.c
index fcffd952ed..c77872b11d 100644
--- a/usr/src/uts/common/fs/ufs/ufs_vnops.c
+++ b/usr/src/uts/common/fs/ufs/ufs_vnops.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1984, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -3367,11 +3367,10 @@ ufs_rename(
struct inode *ip = NULL; /* check inode */
struct inode *sdp; /* old (source) parent inode */
struct inode *tdp; /* new (target) parent inode */
- struct vnode *svp = NULL; /* source vnode */
struct vnode *tvp = NULL; /* target vnode, if it exists */
struct vnode *realvp;
struct ufsvfs *ufsvfsp;
- struct ulockfs *ulp;
+ struct ulockfs *ulp = NULL;
struct ufs_slot slot;
timestruc_t now;
int error;
@@ -3380,7 +3379,7 @@ ufs_rename(
krwlock_t *first_lock;
krwlock_t *second_lock;
krwlock_t *reverse_lock;
- int serr, terr;
+ int terr;
sdp = VTOI(sdvp);
slot.fbp = NULL;
@@ -3389,34 +3388,13 @@ ufs_rename(
if (VOP_REALVP(tdvp, &realvp, ct) == 0)
tdvp = realvp;
+ /* Must do this before taking locks in case of DNLC miss */
terr = ufs_eventlookup(tdvp, tnm, cr, &tvp);
- serr = ufs_eventlookup(sdvp, snm, cr, &svp);
-
- if ((serr == 0) && ((terr == 0) || (terr == ENOENT))) {
- if (tvp != NULL)
- vnevent_rename_dest(tvp, tdvp, tnm, ct);
-
- /*
- * Notify the target directory of the rename event
- * if source and target directories are not the same.
- */
- if (sdvp != tdvp)
- vnevent_rename_dest_dir(tdvp, ct);
-
- if (svp != NULL)
- vnevent_rename_src(svp, sdvp, snm, ct);
- }
-
- if (tvp != NULL)
- VN_RELE(tvp);
-
- if (svp != NULL)
- VN_RELE(svp);
retry_rename:
error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RENAME_MASK);
if (error)
- goto out;
+ goto unlock;
if (ulp)
TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_RENAME,
@@ -3712,6 +3690,9 @@ retry_firstlock:
goto errout;
}
+ if (terr == 0 && tvp != NULL)
+ vnevent_rename_dest(tvp, tdvp, tnm, ct);
+
/*
* Unlink the source.
* Remove the source entry. ufs_dirremove() checks that the entry
@@ -3723,6 +3704,9 @@ retry_firstlock:
DR_RENAME, cr)) == ENOENT)
error = 0;
+ vnevent_rename_src(ITOV(sip), sdvp, snm, ct);
+ vnevent_rename_dest_dir(tdvp, ITOV(sip), tnm, ct);
+
errout:
if (slot.fbp)
fbrelse(slot.fbp, S_OTHER);
@@ -3732,15 +3716,17 @@ errout:
rw_exit(&sdp->i_rwlock);
}
- VN_RELE(ITOV(sip));
-
unlock:
+ if (tvp != NULL)
+ VN_RELE(tvp);
+ if (sip != NULL)
+ VN_RELE(ITOV(sip));
+
if (ulp) {
TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME, trans_size);
ufs_lockfs_end(ulp);
}
-out:
return (error);
}
diff --git a/usr/src/uts/common/fs/vfs.c b/usr/src/uts/common/fs/vfs.c
index 6e93d056df..b680b1168c 100644
--- a/usr/src/uts/common/fs/vfs.c
+++ b/usr/src/uts/common/fs/vfs.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -235,7 +235,8 @@ fsop_root(vfs_t *vfsp, vnode_t **vpp)
* Make sure this root has a path. With lofs, it is possible to have
* a NULL mountpoint.
*/
- if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) {
+ if (ret == 0 && vfsp->vfs_mntpt != NULL &&
+ (*vpp)->v_path == vn_vpath_empty) {
mntpt = vfs_getmntpoint(vfsp);
vn_setpath_str(*vpp, refstr_value(mntpt),
strlen(refstr_value(mntpt)));
@@ -905,6 +906,7 @@ vfs_mountroot(void)
vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab");
vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile");
vfs_mountfs("objfs", "objfs", OBJFS_ROOT);
+ vfs_mountfs("bootfs", "bootfs", "/system/boot");
if (getzoneid() == GLOBAL_ZONEID) {
vfs_mountfs("sharefs", "sharefs", "/etc/dfs/sharetab");
@@ -3899,6 +3901,8 @@ vfs_to_modname(const char *vfstype)
vfstype = "fdfs";
} else if (strncmp(vfstype, "nfs", 3) == 0) {
vfstype = "nfs";
+ } else if (strcmp(vfstype, "lxproc") == 0) {
+ vfstype = "lxprocfs";
}
return (vfstype);
diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c
index 4abb040de0..9fb1ea702a 100644
--- a/usr/src/uts/common/fs/vnode.c
+++ b/usr/src/uts/common/fs/vnode.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -66,6 +66,7 @@
#include <fs/fs_subr.h>
#include <sys/taskq.h>
#include <fs/fs_reparse.h>
+#include <sys/time.h>
/* Determine if this vnode is a file that is read-only */
#define ISROFILE(vp) \
@@ -102,6 +103,9 @@ kmutex_t vskstat_tree_lock;
/* Global variable which enables/disables the vopstats collection */
int vopstats_enabled = 1;
+/* Global used for empty/invalid v_path */
+char *vn_vpath_empty = "";
+
/*
* forward declarations for internal vnode specific data (vsd)
*/
@@ -200,6 +204,11 @@ static void (**vsd_destructor)(void *);
cr = crgetmapped(cr); \
}
+#define VOP_LATENCY_10MS 10000000
+#define VOP_LATENCY_100MS 100000000
+#define VOP_LATENCY_1S 1000000000
+#define VOP_LATENCY_10S 10000000000
+
/*
* Convert stat(2) formats to vnode types and vice versa. (Knows about
* numerical order of S_IFMT and vnode types.)
@@ -2284,7 +2293,7 @@ vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
vp->v_femhead = NULL; /* Must be done before vn_reinit() */
- vp->v_path = NULL;
+ vp->v_path = vn_vpath_empty;
vp->v_mpssdata = NULL;
vp->v_vsd = NULL;
vp->v_fopdata = NULL;
@@ -2331,6 +2340,7 @@ void
vn_recycle(vnode_t *vp)
{
ASSERT(vp->v_pages == NULL);
+ VERIFY(vp->v_path != NULL);
/*
* XXX - This really belongs in vn_reinit(), but we have some issues
@@ -2353,9 +2363,9 @@ vn_recycle(vnode_t *vp)
kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
vp->v_femhead = NULL;
}
- if (vp->v_path) {
+ if (vp->v_path != vn_vpath_empty) {
kmem_free(vp->v_path, strlen(vp->v_path) + 1);
- vp->v_path = NULL;
+ vp->v_path = vn_vpath_empty;
}
if (vp->v_fopdata != NULL) {
@@ -2427,9 +2437,10 @@ vn_free(vnode_t *vp)
*/
ASSERT((vp->v_count == 0) || (vp->v_count == 1));
ASSERT(vp->v_count_dnlc == 0);
- if (vp->v_path != NULL) {
+ VERIFY(vp->v_path != NULL);
+ if (vp->v_path != vn_vpath_empty) {
kmem_free(vp->v_path, strlen(vp->v_path) + 1);
- vp->v_path = NULL;
+ vp->v_path = vn_vpath_empty;
}
/* If FEM was in use, make sure everything gets cleaned up */
@@ -2516,6 +2527,7 @@ vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
if (vp == NULL || vp->v_femhead == NULL) {
return;
}
+ (void) VOP_VNEVENT(dvp, VE_RENAME_SRC_DIR, vp, name, ct);
(void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
}
@@ -2530,12 +2542,13 @@ vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
}
void
-vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct)
+vnevent_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
+ caller_context_t *ct)
{
if (vp == NULL || vp->v_femhead == NULL) {
return;
}
- (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct);
+ (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, nvp, name, ct);
}
void
@@ -2951,7 +2964,7 @@ vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp,
* the potential for deadlock.
*/
mutex_enter(&base->v_lock);
- if (base->v_path == NULL) {
+ if (base->v_path == vn_vpath_empty) {
mutex_exit(&base->v_lock);
return;
}
@@ -2978,7 +2991,8 @@ vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp,
rpath = kmem_alloc(rpathalloc, KM_SLEEP);
mutex_enter(&base->v_lock);
- if (base->v_path == NULL || strlen(base->v_path) != rpathlen) {
+ if (base->v_path == vn_vpath_empty ||
+ strlen(base->v_path) != rpathlen) {
mutex_exit(&base->v_lock);
kmem_free(rpath, rpathalloc);
return;
@@ -2992,7 +3006,7 @@ vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp,
rpath[rpathlen + plen] = '\0';
mutex_enter(&vp->v_lock);
- if (vp->v_path != NULL) {
+ if (vp->v_path != vn_vpath_empty) {
mutex_exit(&vp->v_lock);
kmem_free(rpath, rpathalloc);
} else {
@@ -3012,7 +3026,7 @@ vn_setpath_str(struct vnode *vp, const char *str, size_t len)
char *buf = kmem_alloc(len + 1, KM_SLEEP);
mutex_enter(&vp->v_lock);
- if (vp->v_path != NULL) {
+ if (vp->v_path != vn_vpath_empty) {
mutex_exit(&vp->v_lock);
kmem_free(buf, len + 1);
return;
@@ -3036,10 +3050,10 @@ vn_renamepath(vnode_t *dvp, vnode_t *vp, const char *nm, size_t len)
mutex_enter(&vp->v_lock);
tmp = vp->v_path;
- vp->v_path = NULL;
+ vp->v_path = vn_vpath_empty;
mutex_exit(&vp->v_lock);
vn_setpath(rootdir, dvp, vp, nm, len);
- if (tmp != NULL)
+ if (tmp != vn_vpath_empty)
kmem_free(tmp, strlen(tmp) + 1);
}
@@ -3054,7 +3068,7 @@ vn_copypath(struct vnode *src, struct vnode *dst)
int alloc;
mutex_enter(&src->v_lock);
- if (src->v_path == NULL) {
+ if (src->v_path == vn_vpath_empty) {
mutex_exit(&src->v_lock);
return;
}
@@ -3064,7 +3078,7 @@ vn_copypath(struct vnode *src, struct vnode *dst)
mutex_exit(&src->v_lock);
buf = kmem_alloc(alloc, KM_SLEEP);
mutex_enter(&src->v_lock);
- if (src->v_path == NULL || strlen(src->v_path) + 1 != alloc) {
+ if (src->v_path == vn_vpath_empty || strlen(src->v_path) + 1 != alloc) {
mutex_exit(&src->v_lock);
kmem_free(buf, alloc);
return;
@@ -3073,7 +3087,7 @@ vn_copypath(struct vnode *src, struct vnode *dst)
mutex_exit(&src->v_lock);
mutex_enter(&dst->v_lock);
- if (dst->v_path != NULL) {
+ if (dst->v_path != vn_vpath_empty) {
mutex_exit(&dst->v_lock);
kmem_free(buf, alloc);
return;
@@ -3231,14 +3245,57 @@ fop_read(
cred_t *cr,
caller_context_t *ct)
{
- int err;
ssize_t resid_start = uiop->uio_resid;
+ zone_t *zonep = curzone;
+ zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
+
+ hrtime_t start, lat;
+ ssize_t len;
+ int err;
+
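+	/*
+	 * Only account for data-bearing vnode types (regular files,
+	 * directories, and block devices) in the per-zone VFS kstats.
+	 * The kstat "runq" tracks in-flight VFS reads for this zone;
+	 * the write side in fop_write() uses the "waitq" instead (see
+	 * the comment there).
+	 */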
+ if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) {
+ start = gethrtime();
+
+ mutex_enter(&zonep->zone_vfs_lock);
+ kstat_runq_enter(&zonep->zone_vfs_rwstats);
+ mutex_exit(&zonep->zone_vfs_lock);
+ }
VOPXID_MAP_CR(vp, cr);
err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
- VOPSTATS_UPDATE_IO(vp, read,
- read_bytes, (resid_start - uiop->uio_resid));
+ len = resid_start - uiop->uio_resid;
+
+ VOPSTATS_UPDATE_IO(vp, read, read_bytes, len);
+
+ if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) {
+ mutex_enter(&zonep->zone_vfs_lock);
+ zonep->zone_vfs_rwstats.reads++;
+ zonep->zone_vfs_rwstats.nread += len;
+ kstat_runq_exit(&zonep->zone_vfs_rwstats);
+ mutex_exit(&zonep->zone_vfs_lock);
+
+ lat = gethrtime() - start;
+
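+	/*
+	 * The latency buckets below are cumulative: an operation slower
+	 * than one second increments the 10ms, 100ms, and 1s counters,
+	 * so each counter reads as "operations at least this slow".
+	 */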
+ if (lat >= VOP_LATENCY_10MS) {
+ if (lat < VOP_LATENCY_100MS)
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ else if (lat < VOP_LATENCY_1S) {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ } else if (lat < VOP_LATENCY_10S) {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+ } else {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_10s_ops.value.ui64);
+ }
+ }
+ }
+
return (err);
}
@@ -3250,14 +3307,62 @@ fop_write(
cred_t *cr,
caller_context_t *ct)
{
- int err;
ssize_t resid_start = uiop->uio_resid;
+ zone_t *zonep = curzone;
+ zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
+
+ hrtime_t start, lat;
+ ssize_t len;
+ int err;
+
+ /*
+ * For the purposes of VFS kstat consumers, the "waitq" calculation is
+ * repurposed as the active queue for VFS write operations. There's no
+ * actual wait queue for VFS operations.
+ */
+ if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) {
+ start = gethrtime();
+
+ mutex_enter(&zonep->zone_vfs_lock);
+ kstat_waitq_enter(&zonep->zone_vfs_rwstats);
+ mutex_exit(&zonep->zone_vfs_lock);
+ }
VOPXID_MAP_CR(vp, cr);
err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
- VOPSTATS_UPDATE_IO(vp, write,
- write_bytes, (resid_start - uiop->uio_resid));
+ len = resid_start - uiop->uio_resid;
+
+ VOPSTATS_UPDATE_IO(vp, write, write_bytes, len);
+
+ if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) {
+ mutex_enter(&zonep->zone_vfs_lock);
+ zonep->zone_vfs_rwstats.writes++;
+ zonep->zone_vfs_rwstats.nwritten += len;
+ kstat_waitq_exit(&zonep->zone_vfs_rwstats);
+ mutex_exit(&zonep->zone_vfs_lock);
+
+ lat = gethrtime() - start;
+
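+	/* As in fop_read(), the latency buckets below are cumulative. */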
+ if (lat >= VOP_LATENCY_10MS) {
+ if (lat < VOP_LATENCY_100MS)
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ else if (lat < VOP_LATENCY_1S) {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ } else if (lat < VOP_LATENCY_10S) {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+ } else {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_10s_ops.value.ui64);
+ }
+ }
+ }
+
return (err);
}
@@ -3421,7 +3526,7 @@ fop_lookup(
}
if (ret == 0 && *vpp) {
VOPSTATS_UPDATE(*vpp, lookup);
- if ((*vpp)->v_path == NULL) {
+ if ((*vpp)->v_path == vn_vpath_empty) {
vn_setpath(rootdir, dvp, *vpp, nm, strlen(nm));
}
}
@@ -3463,7 +3568,7 @@ fop_create(
(dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
if (ret == 0 && *vpp) {
VOPSTATS_UPDATE(*vpp, create);
- if ((*vpp)->v_path == NULL) {
+ if ((*vpp)->v_path == vn_vpath_empty) {
vn_setpath(rootdir, dvp, *vpp, name, strlen(name));
}
}
@@ -3585,7 +3690,7 @@ fop_mkdir(
(dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
if (ret == 0 && *vpp) {
VOPSTATS_UPDATE(*vpp, mkdir);
- if ((*vpp)->v_path == NULL) {
+ if ((*vpp)->v_path == vn_vpath_empty) {
vn_setpath(rootdir, dvp, *vpp, dirname,
strlen(dirname));
}
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index 09d2e1dd8f..d2b584cf02 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -129,6 +129,7 @@
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
+#include <sys/zfs_zone.h>
#include <sys/multilist.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
@@ -4277,6 +4278,14 @@ top:
rzio = zio_read(pio, spa, bp, buf->b_data, size,
arc_read_done, buf, priority, zio_flags, zb);
+ /*
+ * At this point, this read I/O has already missed in the ARC
+ * and will be going through to the disk. The I/O throttle
+ * should delay this I/O if this zone is using more than its I/O
+ * priority allows.
+ */
+ zfs_zone_io_throttle(ZFS_ZONE_IOP_READ);
+
if (*arc_flags & ARC_FLAG_WAIT)
return (zio_wait(rzio));
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index 465dfd08b2..4b644f7479 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -617,8 +617,19 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
if (bonuslen < DN_MAX_BONUSLEN)
bzero(db->db.db_data, DN_MAX_BONUSLEN);
- if (bonuslen)
- bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
+
+ if (bonuslen) {
+ /*
+ * Absent byzantine on-disk corruption, we fully expect
+ * our bonuslen to be no more than DN_MAX_BONUSLEN --
+ * but we nonetheless explicitly clamp it on the bcopy()
+ * to prevent any on-disk corruption from becoming
+ * rampant in-kernel corruption.
+ */
+ bcopy(DN_BONUS(dn->dn_phys), db->db.db_data,
+ MIN(bonuslen, DN_MAX_BONUSLEN));
+ }
+
DB_DNODE_EXIT(db);
db->db_state = DB_CACHED;
mutex_exit(&db->db_mtx);
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 43bcfee91c..bfe5cda5e4 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -1799,7 +1799,6 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
if (!zio_checksum_table[checksum].ci_dedup)
dedup_verify = B_TRUE;
}
-
/*
* Enable nopwrite if we have a cryptographically secure
* checksum that has no known collisions (i.e. SHA-256)
diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c
index 56ce2f0c27..c2b1dca1c0 100644
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c
@@ -38,11 +38,11 @@
#include <sys/sa_impl.h>
#include <sys/zfs_context.h>
#include <sys/varargs.h>
+#include <sys/zfs_zone.h>
typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
uint64_t arg1, uint64_t arg2);
-
dmu_tx_t *
dmu_tx_create_dd(dsl_dir_t *dd)
{
@@ -223,6 +223,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
if (len == 0)
return;
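+	/*
+	 * Throttle the zone at the logical write: the physical writes for
+	 * this data may be issued later, from TXG sync context, where they
+	 * can no longer be attributed to the originating zone.
+	 */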
+ zfs_zone_io_throttle(ZFS_ZONE_IOP_LOGICAL_WRITE);
+
min_bs = SPA_MINBLOCKSHIFT;
max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1;
min_ibs = DN_MIN_INDBLKSHIFT;
diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c
index 6a20ab3ca2..1222b8933a 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c
@@ -42,6 +42,7 @@
#include <sys/zio.h>
#include <sys/arc.h>
#include <sys/sunddi.h>
+#include <sys/zfs_zone.h>
#include <sys/zfeature.h>
#include <sys/policy.h>
#include <sys/zfs_znode.h>
@@ -1266,7 +1267,7 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
* locks are held.
*/
txg_delay(dd->dd_pool, tx->tx_txg,
- MSEC2NSEC(10), MSEC2NSEC(10));
+ zfs_zone_txg_delay(), MSEC2NSEC(10));
err = SET_ERROR(ERESTART);
}
}
diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c
index 8c5d820ede..8ad7c789d3 100644
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c
@@ -42,6 +42,7 @@
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/dsl_deadlist.h>
+#include <sys/zfs_zone.h>
#include <sys/bptree.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 9ba9fd3841..02cccae29d 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -63,6 +63,11 @@ uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
int zfs_condense_pct = 200;
/*
+ * Never condense any space map. This is for debugging/recovery only.
+ */
+int zfs_condense_never = 0;
+
+/*
* Condensing a metaslab is not guaranteed to actually reduce the amount of
* space used on disk. In particular, a space map uses data in increments of
* MAX(1 << ashift, space_map_blksize), so a metaslab might use the
@@ -1656,6 +1661,9 @@ metaslab_should_condense(metaslab_t *msp)
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(msp->ms_loaded);
+ if (zfs_condense_never != 0)
+ return (B_FALSE);
+
/*
* Use the ms_size_tree range tree, which is ordered by size, to
* obtain the largest segment in the free tree. We always condense
diff --git a/usr/src/uts/common/fs/zfs/sa.c b/usr/src/uts/common/fs/zfs/sa.c
index 0b5b37f5fb..5a2602271b 100644
--- a/usr/src/uts/common/fs/zfs/sa.c
+++ b/usr/src/uts/common/fs/zfs/sa.c
@@ -24,6 +24,7 @@
* Portions Copyright 2011 iXsystems, Inc
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2015 Joyent, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -406,15 +407,18 @@ sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count,
{
sa_os_t *sa = os->os_sa;
sa_lot_t *tb, *findtb;
- int i;
+ int i, size;
avl_index_t loc;
ASSERT(MUTEX_HELD(&sa->sa_lock));
tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP);
tb->lot_attr_count = attr_count;
- tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
- KM_SLEEP);
- bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count);
+
+ if ((size = sizeof (sa_attr_type_t) * attr_count) != 0) {
+ tb->lot_attrs = kmem_alloc(size, KM_SLEEP);
+ bcopy(attrs, tb->lot_attrs, size);
+ }
+
tb->lot_num = lot_num;
tb->lot_hash = hash;
tb->lot_instance = 0;
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 17a18a3199..bde9b0a7ab 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -116,6 +116,7 @@ struct vdev_queue {
avl_tree_t vq_read_offset_tree;
avl_tree_t vq_write_offset_tree;
uint64_t vq_last_offset;
+ zoneid_t vq_last_zone_id; /* zone of last scheduled i/o */
hrtime_t vq_io_complete_ts; /* time last i/o completed */
kmutex_t vq_lock;
};
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_zone.h b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h
new file mode 100644
index 0000000000..f1431b3f55
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h
@@ -0,0 +1,63 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2015, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_FS_ZFS_ZONE_H
+#define _SYS_FS_ZFS_ZONE_H
+
+#ifdef _KERNEL
+#include <sys/isa_defs.h>
+#include <sys/types32.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+ ZFS_ZONE_IOP_READ = 0,
+ ZFS_ZONE_IOP_WRITE,
+ ZFS_ZONE_IOP_LOGICAL_WRITE,
+} zfs_zone_iop_type_t;
+
+extern void zfs_zone_io_throttle(zfs_zone_iop_type_t);
+
+extern void zfs_zone_zio_init(zio_t *);
+extern void zfs_zone_zio_start(zio_t *);
+extern void zfs_zone_zio_done(zio_t *);
+extern void zfs_zone_zio_dequeue(zio_t *);
+extern void zfs_zone_zio_enqueue(zio_t *);
+extern void zfs_zone_report_txg_sync(void *);
+extern hrtime_t zfs_zone_txg_delay();
+#ifdef _KERNEL
+extern zio_t *zfs_zone_schedule(vdev_queue_t *, zio_priority_t, avl_index_t,
+ avl_tree_t *);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_ZONE_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index 198dc92387..78c9355438 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -421,7 +421,8 @@ struct zio {
const zio_vsd_ops_t *io_vsd_ops;
uint64_t io_offset;
- hrtime_t io_timestamp;
+ hrtime_t io_timestamp; /* time I/O entered zio pipeline */
+ hrtime_t io_dispatched; /* time I/O was dispatched to disk */
avl_node_t io_queue_node;
avl_node_t io_offset_node;
@@ -450,6 +451,7 @@ struct zio {
zio_cksum_report_t *io_cksum_report;
uint64_t io_ena;
+ zoneid_t io_zoneid; /* zone which originated this I/O */
/* Taskq dispatching state */
taskq_ent_t io_tqent;
};
diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c
index 191259e75b..915c9bb4b2 100644
--- a/usr/src/uts/common/fs/zfs/txg.c
+++ b/usr/src/uts/common/fs/zfs/txg.c
@@ -31,6 +31,7 @@
#include <sys/dsl_pool.h>
#include <sys/dsl_scan.h>
#include <sys/callb.h>
+#include <sys/zfs_zone.h>
/*
* ZFS Transaction Groups
@@ -506,6 +507,8 @@ txg_sync_thread(dsl_pool_t *dp)
txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
mutex_exit(&tx->tx_sync_lock);
+ zfs_zone_report_txg_sync(dp);
+
start = ddi_get_lbolt();
spa_sync(spa, txg);
delta = ddi_get_lbolt() - start;
diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c
index ed4a8b773b..e4cc42452a 100644
--- a/usr/src/uts/common/fs/zfs/vdev_disk.c
+++ b/usr/src/uts/common/fs/zfs/vdev_disk.c
@@ -21,11 +21,13 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2013 Joyent, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
+#include <sys/zfs_zone.h>
#include <sys/spa_impl.h>
#include <sys/refcount.h>
#include <sys/vdev_disk.h>
@@ -44,6 +46,11 @@ extern ldi_ident_t zfs_li;
static void vdev_disk_close(vdev_t *);
+typedef struct vdev_disk_buf {
+ buf_t vdb_buf;
+ zio_t *vdb_io;
+} vdev_disk_buf_t;
+
typedef struct vdev_disk_ldi_cb {
list_node_t lcb_next;
ldi_callback_id_t lcb_id;
@@ -127,6 +134,8 @@ vdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie,
int ldi_result, void *arg, void *ev_data)
{
vdev_t *vd = (vdev_t *)arg;
+ vdev_disk_t *dvd = vd->vdev_tsd;
+ vdev_disk_ldi_cb_t *lcb;
/*
* Ignore events other than offline.
@@ -586,6 +595,7 @@ static void
vdev_disk_close(vdev_t *vd)
{
vdev_disk_t *dvd = vd->vdev_tsd;
+ vdev_disk_ldi_cb_t *lcb;
if (vd->vdev_reopening || dvd == NULL)
return;
@@ -812,6 +822,8 @@ vdev_disk_io_start(zio_t *zio)
bp->b_bufsize = zio->io_size;
bp->b_iodone = (int (*)())vdev_disk_io_intr;
+ zfs_zone_zio_start(zio);
+
/* ldi_strategy() will return non-zero only on programming errors */
VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0);
}
@@ -821,6 +833,8 @@ vdev_disk_io_done(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
+ zfs_zone_zio_done(zio);
+
/*
* If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if
* the device has been removed. If this is the case, then we trigger an
diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c
index 79d6e13b3b..4c02013405 100644
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2013, Joyent, Inc. All rights reserved.
*/
/*
@@ -33,6 +34,7 @@
#include <sys/zio.h>
#include <sys/avl.h>
#include <sys/dsl_pool.h>
+#include <sys/zfs_zone.h>
/*
* ZFS I/O Scheduler
@@ -141,7 +143,7 @@ uint32_t zfs_vdev_sync_write_min_active = 10;
uint32_t zfs_vdev_sync_write_max_active = 10;
uint32_t zfs_vdev_async_read_min_active = 1;
uint32_t zfs_vdev_async_read_max_active = 3;
-uint32_t zfs_vdev_async_write_min_active = 1;
+uint32_t zfs_vdev_async_write_min_active = 3;
uint32_t zfs_vdev_async_write_max_active = 10;
uint32_t zfs_vdev_scrub_min_active = 1;
uint32_t zfs_vdev_scrub_max_active = 2;
@@ -237,6 +239,8 @@ vdev_queue_init(vdev_t *vd)
vdev_queue_offset_compare, sizeof (zio_t),
offsetof(struct zio, io_offset_node));
+ vq->vq_last_zone_id = 0;
+
for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
int (*compfn) (const void *, const void *);
@@ -274,6 +278,7 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ zfs_zone_zio_enqueue(zio);
avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
@@ -289,6 +294,7 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ zfs_zone_zio_dequeue(zio);
avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
@@ -648,7 +654,11 @@ again:
search.io_timestamp = 0;
search.io_offset = vq->vq_last_offset + 1;
VERIFY3P(avl_find(tree, &search, &idx), ==, NULL);
+#ifdef _KERNEL
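+	/*
+	 * Let the per-zone I/O scheduler pick the next zio; when zone
+	 * scheduling is disabled or the queue is shallow it falls back to
+	 * the same nearest-offset choice used below.
+	 */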
+ zio = zfs_zone_schedule(vq, p, idx, tree);
+#else
zio = avl_nearest(tree, idx, AVL_AFTER);
+#endif
if (zio == NULL)
zio = avl_first(tree);
ASSERT3U(zio->io_priority, ==, p);
diff --git a/usr/src/uts/common/fs/zfs/zfs_dir.c b/usr/src/uts/common/fs/zfs/zfs_dir.c
index bd7424b55b..2adb297937 100644
--- a/usr/src/uts/common/fs/zfs/zfs_dir.c
+++ b/usr/src/uts/common/fs/zfs/zfs_dir.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc.
*/
#include <sys/types.h>
@@ -847,9 +848,9 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
if (zp->z_links <= zp_is_dir) {
zfs_panic_recover("zfs: link count on %s is %u, "
"should be at least %u",
- zp->z_vnode->v_path ? zp->z_vnode->v_path :
- "<unknown>", (int)zp->z_links,
- zp_is_dir + 1);
+ zp->z_vnode->v_path != vn_vpath_empty ?
+ zp->z_vnode->v_path : "<unknown>",
+ (int)zp->z_links, zp_is_dir + 1);
zp->z_links = zp_is_dir + 1;
}
if (--zp->z_links == zp_is_dir) {
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
index 692b49611d..47b0b6f650 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -22,8 +22,8 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Portions Copyright 2011 Martin Matuska
- * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
@@ -611,9 +611,10 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval,
* Check permissions for special properties.
*/
switch (prop) {
+ case ZFS_PROP_DEDUP:
case ZFS_PROP_ZONED:
/*
- * Disallow setting of 'zoned' from within a local zone.
+ * Disallow setting these properties from within a local zone.
*/
if (!INGLOBALZONE(curproc))
return (SET_ERROR(EPERM));
@@ -943,6 +944,9 @@ zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
int error;
+ if (secpolicy_fs_import(cr) != 0)
+ return (set_errno(EPERM));
+
if ((error = zfs_secpolicy_write_perms(zc->zc_name,
ZFS_DELEG_PERM_RECEIVE, cr)) != 0)
return (error);
@@ -2034,7 +2038,8 @@ zfs_ioc_vdev_setfru(zfs_cmd_t *zc)
}
static int
-zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
+zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os,
+ boolean_t cachedpropsonly)
{
int error = 0;
nvlist_t *nv;
@@ -2052,7 +2057,8 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
* XXX reading with out owning
*/
if (!zc->zc_objset_stats.dds_inconsistent &&
- dmu_objset_type(os) == DMU_OST_ZVOL) {
+ dmu_objset_type(os) == DMU_OST_ZVOL &&
+ !cachedpropsonly) {
error = zvol_get_stats(os, nv);
if (error == EIO)
return (error);
@@ -2079,11 +2085,24 @@ static int
zfs_ioc_objset_stats(zfs_cmd_t *zc)
{
objset_t *os;
+ nvlist_t *nvl = NULL;
+ boolean_t cachedpropsonly = B_FALSE;
int error;
+ if (zc->zc_nvlist_src != NULL &&
+ (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &nvl) != 0))
+ return (error);
+
+ if (nvl != NULL) {
+ (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly",
+ &cachedpropsonly);
+ nvlist_free(nvl);
+ }
+
error = dmu_objset_hold(zc->zc_name, FTAG, &os);
if (error == 0) {
- error = zfs_ioc_objset_stats_impl(zc, os);
+ error = zfs_ioc_objset_stats_impl(zc, os, cachedpropsonly);
dmu_objset_rele(os, FTAG);
}
@@ -2277,8 +2296,21 @@ static int
zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
{
objset_t *os;
+ nvlist_t *nvl = NULL;
+ boolean_t cachedpropsonly = B_FALSE;
int error;
+ if (zc->zc_nvlist_src != NULL &&
+ (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &nvl) != 0))
+ return (error);
+
+ if (nvl != NULL) {
+ (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly",
+ &cachedpropsonly);
+ nvlist_free(nvl);
+ }
+
error = dmu_objset_hold(zc->zc_name, FTAG, &os);
if (error != 0) {
return (error == ENOENT ? ESRCH : error);
@@ -2307,8 +2339,10 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
objset_t *ossnap;
error = dmu_objset_from_ds(ds, &ossnap);
- if (error == 0)
- error = zfs_ioc_objset_stats_impl(zc, ossnap);
+ if (error == 0) {
+ error = zfs_ioc_objset_stats_impl(zc,
+ ossnap, cachedpropsonly);
+ }
dsl_dataset_rele(ds, FTAG);
}
} else if (error == ENOENT) {
@@ -3018,6 +3052,7 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
uint64_t sense = ZFS_PROP_UNDEFINED;
uint64_t norm = ZFS_PROP_UNDEFINED;
uint64_t u8 = ZFS_PROP_UNDEFINED;
+ int error;
ASSERT(zplprops != NULL);
@@ -3061,8 +3096,9 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
VERIFY(nvlist_add_uint64(zplprops,
zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0);
- if (norm == ZFS_PROP_UNDEFINED)
- VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0);
+ if (norm == ZFS_PROP_UNDEFINED &&
+ (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0)
+ return (error);
VERIFY(nvlist_add_uint64(zplprops,
zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0);
@@ -3071,13 +3107,15 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
*/
if (norm)
u8 = 1;
- if (u8 == ZFS_PROP_UNDEFINED)
- VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0);
+ if (u8 == ZFS_PROP_UNDEFINED &&
+ (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0)
+ return (error);
VERIFY(nvlist_add_uint64(zplprops,
zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0);
- if (sense == ZFS_PROP_UNDEFINED)
- VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0);
+ if (sense == ZFS_PROP_UNDEFINED &&
+ (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0)
+ return (error);
VERIFY(nvlist_add_uint64(zplprops,
zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0);
diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
index 8cf83b399a..f9b7986c56 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -1950,6 +1951,17 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
if (zfsvfs->z_ctldir != NULL)
zfsctl_destroy(zfsvfs);
+ /*
+ * If we're doing a forced unmount on a dataset which still has
+ * references and is in a zone, then we need to cleanup the zone
+ * reference at this point or else the zone will never be able to
+ * shutdown.
+ */
+ if ((fflag & MS_FORCE) && vfsp->vfs_count > 1 && vfsp->vfs_zone) {
+ zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref, ZONE_REF_VFS);
+ vfsp->vfs_zone = NULL;
+ }
+
return (0);
}
diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c
index bf75097d2a..ddec59ffc9 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c
@@ -22,11 +22,16 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc. All rights reserved.
*/
/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */
+/*
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ */
+
#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
@@ -662,6 +667,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
rl_t *rl;
int max_blksz = zfsvfs->z_max_blksz;
int error = 0;
+ int prev_error;
arc_buf_t *abuf;
iovec_t *aiov = NULL;
xuio_t *xuio = NULL;
@@ -683,6 +689,17 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
limit = MAXOFFSET_T;
+ /*
+ * Pre-fault the pages to ensure slow (eg NFS) pages
+ * don't hold up txg.
+ * Skip this if uio contains loaned arc_buf.
+ */
+ if ((uio->uio_extflg == UIO_XUIO) &&
+ (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
+ xuio = (xuio_t *)uio;
+ else
+ uio_prefaultpages(n, uio);
+
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
@@ -735,17 +752,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
}
/*
- * Pre-fault the pages to ensure slow (eg NFS) pages
- * don't hold up txg.
- * Skip this if uio contains loaned arc_buf.
- */
- if ((uio->uio_extflg == UIO_XUIO) &&
- (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
- xuio = (xuio_t *)uio;
- else
- uio_prefaultpages(MIN(n, max_blksz), uio);
-
- /*
* If in append mode, set the io offset pointer to eof.
*/
if (ioflag & FAPPEND) {
@@ -966,7 +972,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
while ((end_size = zp->z_size) < uio->uio_loffset) {
(void) atomic_cas_64(&zp->z_size, end_size,
uio->uio_loffset);
- ASSERT(error == 0);
}
/*
* If we are replaying and eof is non zero then force
@@ -976,18 +981,20 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
zp->z_size = zfsvfs->z_replay_eof;
+ /*
+ * Keep track of a possible pre-existing error from a partial
+ * write via dmu_write_uio_dbuf above.
+ */
+ prev_error = error;
error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
dmu_tx_commit(tx);
- if (error != 0)
+ if (prev_error != 0 || error != 0)
break;
ASSERT(tx_bytes == nbytes);
n -= nbytes;
-
- if (!xuio && n > 0)
- uio_prefaultpages(MIN(n, max_blksz), uio);
}
zfs_range_unlock(rl);
@@ -3659,18 +3666,6 @@ top:
}
}
- vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
- if (tzp)
- vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
-
- /*
- * notify the target directory if it is not the same
- * as source directory.
- */
- if (tdvp != sdvp) {
- vnevent_rename_dest_dir(tdvp, ct);
- }
-
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
@@ -3711,8 +3706,12 @@ top:
return (error);
}
- if (tzp) /* Attempt to remove the existing target */
+ if (tzp) {
+ /* Attempt to remove the existing target */
error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
+ if (error == 0)
+ vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
+ }
if (error == 0) {
error = zfs_link_create(tdl, szp, tx, ZRENAMING);
@@ -3754,6 +3753,11 @@ top:
}
dmu_tx_commit(tx);
+
+ if (error == 0) {
+ vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
+ vnevent_rename_dest_dir(tdvp, ZTOV(szp), tnm, ct);
+ }
out:
if (zl != NULL)
zfs_rename_unlock(&zl);
@@ -4244,6 +4248,8 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
&zp->z_pflags, 8);
zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
B_TRUE);
+ err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+
zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
}
dmu_tx_commit(tx);
@@ -4779,10 +4785,6 @@ zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
- if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
- vn_has_cached_data(vp))
- (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
-
return (0);
}
diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c
new file mode 100644
index 0000000000..4861c64f8e
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_zone.c
@@ -0,0 +1,1336 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015, Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * The ZFS/Zone I/O throttle and scheduler attempts to ensure fair access to
+ * ZFS I/O resources for each zone.
+ *
+ * I/O contention can be major pain point on a multi-tenant system. A single
+ * zone can issue a stream of I/O operations, usually synchronous writes, which
+ * disrupt I/O performance for all other zones. This problem is further
+ * exacerbated by ZFS, which buffers all asynchronous writes in a single TXG,
+ * a set of blocks which are atomically synced to disk. The process of
+ * syncing a TXG can occupy all of a device's I/O bandwidth, thereby starving
+ * out any pending read operations.
+ *
+ * There are two facets to this capability; the throttle and the scheduler.
+ *
+ * Throttle
+ *
+ * The requirements on the throttle are:
+ *
+ * 1) Ensure consistent and predictable I/O latency across all zones.
+ * 2) Sequential and random workloads have very different characteristics,
+ * so it is a non-starter to track IOPS or throughput.
+ * 3) A zone should be able to use the full disk bandwidth if no other zone
+ * is actively using the disk.
+ *
+ * The throttle has two components: one to track and account for each zone's
+ * I/O requests, and another to throttle each zone's operations when it
+ * exceeds its fair share of disk I/O. When the throttle detects that a zone is
+ * consuming more than is appropriate, each read or write system call is
+ * delayed by up to 100 microseconds, which we've found is sufficient to allow
+ * other zones to interleave I/O requests during those delays.
+ *
+ * Note: The throttle will delay each logical I/O (as opposed to the physical
+ * I/O which will likely be issued asynchronously), so it may be easier to
+ * think of the I/O throttle delaying each read/write syscall instead of the
+ * actual I/O operation. For each zone, the throttle tracks an ongoing average
+ * of read and write operations performed to determine the overall I/O
+ * utilization for each zone.
+ *
+ * The throttle calculates an I/O utilization metric for each zone using the
+ * following formula:
+ *
+ * (# of read syscalls) x (Average read latency) +
+ * (# of write syscalls) x (Average write latency)
+ *
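+ * As an illustrative example, a zone that issued 200 reads with an average
+ * read latency of 1000 usecs and 100 writes with an average write latency of
+ * 2000 usecs over the last interval has a utilization of (200 x 1000) +
+ * (100 x 2000) = 400,000. The absolute value matters less than how it
+ * compares to the other zones' utilization for the same interval.
+ *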
+ * Once each zone has its utilization metric, the I/O throttle will compare I/O
+ * utilization across all zones, and if a zone has a higher-than-average I/O
+ * utilization, system calls from that zone are throttled. That is, if one
+ * zone has a much higher utilization, that zone's delay is increased by 5
+ * microseconds, up to a maximum of 100 microseconds. Conversely, if a zone is
+ * already throttled and has a lower utilization than average, its delay will
+ * be lowered by 5 microseconds.
+ *
+ * The throttle calculation is driven by IO activity, but since IO does not
+ * happen at fixed intervals, timestamps are used to track when the last update
+ * was made and to drive recalculation.
+ *
+ * The throttle recalculates each zone's I/O usage and throttle delay (if any)
+ * on the zfs_zone_adjust_time interval. Overall I/O latency is maintained as
+ * a decayed average which is updated on the zfs_zone_sys_avg_cycle interval.
+ *
+ * Scheduler
+ *
+ * The I/O scheduler manages the vdev queues – the queues of pending I/Os to
+ * issue to the disks. It only makes scheduling decisions for the two
+ * synchronous I/O queues (read & write).
+ *
+ * The scheduler maintains how many I/Os in the queue are from each zone, and
+ * if one zone has a disproportionately large number of I/Os in the queue, the
+ * scheduler will allow certain I/Os from the underutilized zones to be "bumped"
+ * and pulled from the middle of the queue. This bump allows zones with a small
+ * number of I/Os (so small they may not even be taken into account by the
+ * throttle) to complete quickly instead of waiting behind dozens of I/Os from
+ * other zones.
+ */
+
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zfs_zone.h>
+
+#ifndef _KERNEL
+
+/*
+ * Stubs for when compiling for user-land.
+ */
+
+void
+zfs_zone_io_throttle(zfs_zone_iop_type_t type)
+{
+}
+
+void
+zfs_zone_zio_init(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_start(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_done(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_dequeue(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_enqueue(zio_t *zp)
+{
+}
+
+/*ARGSUSED*/
+void
+zfs_zone_report_txg_sync(void *dp)
+{
+}
+
+hrtime_t
+zfs_zone_txg_delay()
+{
+ return (MSEC2NSEC(10));
+}
+
+#else
+
+/*
+ * The real code.
+ */
+
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/atomic.h>
+#include <sys/zio.h>
+#include <sys/zone.h>
+#include <sys/avl.h>
+#include <sys/sdt.h>
+#include <sys/ddi.h>
+
+/*
+ * The zone throttle delays read and write operations from certain zones based
+ * on each zone's IO utilization. Once per cycle (defined by zfs_zone_cycle_time
+ * below), the delays for each zone are recalculated based on the utilization
+ * over the previous window.
+ */
+boolean_t zfs_zone_delay_enable = B_TRUE; /* enable IO throttle */
+uint16_t zfs_zone_delay_step = 5; /* usec amnt to change delay */
+uint16_t zfs_zone_delay_ceiling = 100; /* usec delay max */
+
+boolean_t zfs_zone_priority_enable = B_TRUE; /* enable IO priority */
+
+/*
+ * For certain workloads, one zone may be issuing primarily sequential I/O and
+ * another primarily random I/O. The sequential I/O will complete much more
+ * quickly than the random I/O, driving the average system latency for those
+ * operations way down. As a result, the random I/O may be throttled back, even
+ * though the sequential I/O should be throttled to allow the random I/O more
+ * access to the disk.
+ *
+ * This tunable limits the discrepancy between the read and write system
+ * latency. If one becomes excessively high, this tunable prevents the I/O
+ * throttler from exacerbating the imbalance.
+ */
+uint_t zfs_zone_rw_lat_limit = 10;
+
+/*
+ * The I/O throttle will only start delaying zones when it detects disk
+ * utilization has reached a certain level. This tunable controls the
+ * threshold at which the throttle will start delaying zones. When the number
+ * of vdevs is small, the calculation should correspond closely with the %b
+ * column from iostat -- but as the number of vdevs becomes large, it will
+ * correlate less and less to any single device (therefore making it a poor
+ * approximation for the actual I/O utilization on such systems). We
+ * therefore use our derived utilization conservatively: we know that low
+ * derived utilization does indeed correlate to low I/O use -- but that a high
+ * rate of derived utilization does not necessarily alone denote saturation;
+ * where we see a high rate of utilization, we also look for laggard I/Os to
+ * attempt to detect saturation.
+ */
+uint_t zfs_zone_util_threshold = 80;
+uint_t zfs_zone_underutil_threshold = 60;
+
+/*
+ * There are three important tunables here: zfs_zone_laggard_threshold denotes
+ * the threshold at which an I/O is considered to be of notably high latency;
+ * zfs_zone_laggard_recent denotes the number of microseconds before the
+ * current time after which the last laggard is considered to be sufficiently
+ * recent to merit increasing the throttle; zfs_zone_laggard_ancient denotes
+ * the microseconds before the current time before which the last laggard is
+ * considered to be sufficiently old to merit decreasing the throttle. The
+ * most important tunable of these three is the zfs_zone_laggard_threshold: in
+ * modeling data from a large public cloud, this tunable was found to have a
+ * much greater effect on the throttle than the two time-based thresholds.
+ * This must be set high enough to not result in spurious throttling, but not
+ * so high as to allow pathological I/O to persist in the system.
+ */
+uint_t zfs_zone_laggard_threshold = 50000; /* 50 ms */
+uint_t zfs_zone_laggard_recent = 1000000; /* 1000 ms */
+uint_t zfs_zone_laggard_ancient = 5000000; /* 5000 ms */
+
+/*
+ * Throughout this subsystem, our timestamps are in microseconds. Our system
+ * average cycle is one second or 1 million microseconds. Our zone counter
+ * update cycle is two seconds or 2 million microseconds. We use a longer
+ * duration for that cycle because some ops can see a little over two seconds of
+ * latency when they are being starved by another zone.
+ */
+uint_t zfs_zone_sys_avg_cycle = 1000000; /* 1 s */
+uint_t zfs_zone_cycle_time = 2000000; /* 2 s */
+
+/*
+ * How often the I/O throttle will reevaluate each zone's utilization, in
+ * microseconds. Default is 1/4 sec.
+ */
+uint_t zfs_zone_adjust_time = 250000; /* 250 ms */
+
+typedef struct {
+ hrtime_t cycle_start;
+ int cycle_cnt;
+ hrtime_t cycle_lat;
+ hrtime_t sys_avg_lat;
+} sys_lat_cycle_t;
+
+typedef struct {
+ hrtime_t zi_now;
+ uint_t zi_avgrlat;
+ uint_t zi_avgwlat;
+ uint64_t zi_totpri;
+ uint64_t zi_totutil;
+ int zi_active;
+ uint_t zi_diskutil;
+ boolean_t zi_underutil;
+ boolean_t zi_overutil;
+} zoneio_stats_t;
+
+static sys_lat_cycle_t rd_lat;
+static sys_lat_cycle_t wr_lat;
+
+/*
+ * Some basic disk stats to determine disk utilization. The utilization info
+ * for all disks on the system is aggregated into these values.
+ *
+ * Overall disk utilization for the current cycle is calculated as:
+ *
+ * ((zfs_disk_rtime - zfs_disk_last_rtime) * 100)
+ * ----------------------------------------------
+ * ((now - zfs_zone_last_checked) * 1000);
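+ *
+ * The factor of 100 turns the ratio into a percentage of wall-clock time
+ * during which at least one I/O was outstanding; the factor of 1000 bridges
+ * the microsecond timestamps used in this subsystem and the finer-grained
+ * accumulation of zfs_disk_rtime.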
+ */
+kmutex_t zfs_disk_lock; /* protects the following: */
+uint_t zfs_disk_rcnt; /* Number of outstanding IOs */
+hrtime_t zfs_disk_rtime = 0; /* cumulative sum of time performing IO */
+hrtime_t zfs_disk_rlastupdate = 0; /* time last IO dispatched */
+
+hrtime_t zfs_disk_last_rtime = 0; /* prev. cycle's zfs_disk_rtime val */
+/* time that we last updated per-zone throttle info */
+hrtime_t zfs_zone_last_checked = 0;
+hrtime_t zfs_disk_last_laggard = 0;
+
+/*
+ * Data used to keep track of how often txg sync is running.
+ */
+extern int zfs_txg_timeout;
+static uint_t txg_last_check;
+static uint_t txg_cnt;
+static uint_t txg_sync_rate;
+
+boolean_t zfs_zone_schedule_enable = B_TRUE; /* enable IO sched. */
+/*
+ * Threshold for when zio scheduling should kick in.
+ *
+ * This threshold is based on the zfs_vdev_sync_read_max_active value for the
+ * number of I/Os that can be pending on a device. If there are more than the
+ * max_active ops already queued up, beyond those already issued to the vdev,
+ * then use zone-based scheduling to get the next synchronous zio.
+ */
+uint32_t zfs_zone_schedule_thresh = 10;
+
+/*
+ * On each pass of the scheduler we increment the zone's weight (up to this
+ * maximum). The weight is used by the scheduler to prevent starvation so
+ * that zones which haven't been able to do any IO over many iterations
+ * will max out their weight to this value.
+ */
+#define SCHED_WEIGHT_MAX 20
+
+/*
+ * Tunables for delay throttling when TXG sync is occurring.
+ *
+ * If the zone is performing a write and we're doing above normal TXG syncing,
+ * then throttle for longer than normal. The zone's wait time is multiplied
+ * by the scale (zfs_zone_txg_throttle_scale).
+ */
+int zfs_zone_txg_throttle_scale = 2;
+hrtime_t zfs_zone_txg_delay_nsec = MSEC2NSEC(20);
+
+typedef struct {
+ int zq_qdepth;
+ zio_priority_t zq_queue;
+ int zq_priority;
+ int zq_wt;
+ zoneid_t zq_zoneid;
+} zone_q_bump_t;
+
+/*
+ * This uses gethrtime() but returns a value in usecs.
+ */
+#define GET_USEC_TIME (gethrtime() / 1000)
+#define NANO_TO_MICRO(x) ((x) / (NANOSEC / MICROSEC))
+
+/*
+ * Keep track of the zone's ZFS IOPs.
+ *
+ * See the comment on the zfs_zone_io_throttle function for which/how IOPs are
+ * accounted for.
+ *
+ * If the number of ops is >1 then we can just use that value. However,
+ * if the number of ops is <2 then we might have a zone which is trying to do
+ * IO but is not able to get any ops through the system. We don't want to lose
+ * track of this zone so we factor in its decayed count into the current count.
+ *
+ * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed count.
+ * However, since this calculation is driven by IO activity and since IO does
+ * not happen at fixed intervals, we use a timestamp to see when the last update
+ * was made. If it was more than one cycle ago, then we need to decay the
+ * historical count by the proper number of additional cycles in which no IO was
+ * performed.
+ *
+ * Return a time delta indicating how far into the current cycle we are or 0
+ * if the last IO was more than a cycle ago.
+ */
+static hrtime_t
+compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
+{
+ hrtime_t delta;
+ int gen_cnt;
+
+ /*
+ * Check if it's time to recompute a new zone count.
+ * If we're still collecting data for the current cycle, return false.
+ */
+ delta = unow - cp->cycle_start;
+ if (delta < zfs_zone_cycle_time)
+ return (delta);
+
+ /* A previous cycle is past, compute the new zone count. */
+
+ /*
+ * Figure out how many generations we have to decay the historical
+ * count, since multiple cycles may have elapsed since our last IO.
+ * We depend on int rounding here.
+ */
+ gen_cnt = (int)(delta / zfs_zone_cycle_time);
+
+ /* If more than 5 cycles since the last IO, reset the count. */
+ if (gen_cnt > 5) {
+ cp->zone_avg_cnt = 0;
+ } else {
+ /* Update the count. */
+ int i;
+
+ /*
+ * If the zone did more than 1 IO, just use its current count
+ * as the historical value, otherwise decay the historical
+ * count and factor that into the new historical count. We
+ * pick a threshold > 1 so that we don't lose track of IO due
+ * to int rounding.
+ */
+ if (cp->cycle_cnt > 1)
+ cp->zone_avg_cnt = cp->cycle_cnt;
+ else
+ cp->zone_avg_cnt = cp->cycle_cnt +
+ (cp->zone_avg_cnt / 2);
+
+ /*
+ * If more than one generation has elapsed since the last
+ * update, decay the values further.
+ */
+ for (i = 1; i < gen_cnt; i++)
+ cp->zone_avg_cnt = cp->zone_avg_cnt / 2;
+ }
+
+ /* A new cycle begins. */
+ cp->cycle_start = unow;
+ cp->cycle_cnt = 0;
+
+ return (0);
+}
+
+/*
+ * Add IO op data to the zone.
+ */
+static void
+add_zone_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op)
+{
+ switch (op) {
+ case ZFS_ZONE_IOP_READ:
+ (void) compute_historical_zone_cnt(unow, &zonep->zone_rd_ops);
+ zonep->zone_rd_ops.cycle_cnt++;
+ break;
+ case ZFS_ZONE_IOP_WRITE:
+ (void) compute_historical_zone_cnt(unow, &zonep->zone_wr_ops);
+ zonep->zone_wr_ops.cycle_cnt++;
+ break;
+ case ZFS_ZONE_IOP_LOGICAL_WRITE:
+ (void) compute_historical_zone_cnt(unow, &zonep->zone_lwr_ops);
+ zonep->zone_lwr_ops.cycle_cnt++;
+ break;
+ }
+}
+
+/*
+ * Use a decaying average to keep track of the overall system latency.
+ *
+ * We want to have the recent activity heavily weighted, but if the
+ * activity decreases or stops, then the average should quickly decay
+ * down to the new value.
+ *
+ * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed average.
+ * However, since this calculation is driven by IO activity and since IO does
+ * not happen at fixed intervals, we use a timestamp to see when the last
+ * update was made. If it was more than one cycle ago, then we need to decay
+ * the average by the proper number of additional cycles in which no IO was
+ * performed.
+ *
+ * Return true if we actually computed a new system average.
+ * If we're still within an active cycle there is nothing to do, return false.
+ */
+static boolean_t
+compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp)
+{
+ hrtime_t delta;
+ int gen_cnt;
+
+ /*
+ * Check if it's time to recompute a new average.
+ * If we're still collecting data for the current cycle, return false.
+ */
+ delta = unow - cp->cycle_start;
+ if (delta < zfs_zone_sys_avg_cycle)
+ return (B_FALSE);
+
+ /* A previous cycle is past, compute a new system average. */
+
+ /*
+ * Figure out how many generations we have to decay, since multiple
+ * cycles may have elapsed since our last IO.
+ * We count on int rounding here.
+ */
+ gen_cnt = (int)(delta / zfs_zone_sys_avg_cycle);
+
+ /* If more than 5 cycles since the last IO, reset the average. */
+ if (gen_cnt > 5) {
+ cp->sys_avg_lat = 0;
+ } else {
+ /* Update the average. */
+ int i;
+
+ cp->sys_avg_lat =
+ (cp->sys_avg_lat + cp->cycle_lat) / (1 + cp->cycle_cnt);
+
+ /*
+ * If more than one generation has elapsed since the last
+ * update, decay the values further.
+ */
+ for (i = 1; i < gen_cnt; i++)
+ cp->sys_avg_lat = cp->sys_avg_lat / 2;
+ }
+
+ /* A new cycle begins. */
+ cp->cycle_start = unow;
+ cp->cycle_cnt = 0;
+ cp->cycle_lat = 0;
+
+ return (B_TRUE);
+}
+
+static void
+add_sys_iop(hrtime_t unow, int op, int lat)
+{
+ switch (op) {
+ case ZFS_ZONE_IOP_READ:
+ (void) compute_new_sys_avg(unow, &rd_lat);
+ rd_lat.cycle_cnt++;
+ rd_lat.cycle_lat += lat;
+ break;
+ case ZFS_ZONE_IOP_WRITE:
+ (void) compute_new_sys_avg(unow, &wr_lat);
+ wr_lat.cycle_cnt++;
+ wr_lat.cycle_lat += lat;
+ break;
+ }
+}
+
+/*
+ * Get the zone IO counts.
+ */
+static uint_t
+calc_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
+{
+ hrtime_t delta;
+ uint_t cnt;
+
+ if ((delta = compute_historical_zone_cnt(unow, cp)) == 0) {
+ /*
+ * No activity in the current cycle, we already have the
+ * historical data so we'll use that.
+ */
+ cnt = cp->zone_avg_cnt;
+ } else {
+ /*
+ * If we're less than halfway through the cycle then use
+ * the current count plus half the historical count, otherwise
+ * just use the current count.
+ */
+ if (delta < (zfs_zone_cycle_time / 2))
+ cnt = cp->cycle_cnt + (cp->zone_avg_cnt / 2);
+ else
+ cnt = cp->cycle_cnt;
+ }
+
+ return (cnt);
+}
+
+/*
+ * Get the average read/write latency in usecs for the system.
+ */
+static uint_t
+calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp)
+{
+ if (compute_new_sys_avg(unow, cp)) {
+ /*
+ * No activity in the current cycle, we already have the
+ * historical data so we'll use that.
+ */
+ return (cp->sys_avg_lat);
+ } else {
+ /*
+ * We're within a cycle; weight the current activity higher
+ * compared to the historical data and use that.
+ */
+ DTRACE_PROBE3(zfs__zone__calc__wt__avg,
+ uintptr_t, cp->sys_avg_lat,
+ uintptr_t, cp->cycle_lat,
+ uintptr_t, cp->cycle_cnt);
+
+ return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) /
+ (1 + (cp->cycle_cnt * 8)));
+ }
+}
+
+/*
+ * Account for the current IOP on the zone and for the system as a whole.
+ * The latency parameter is in usecs.
+ */
+static void
+add_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat)
+{
+ /* Add op to zone */
+ add_zone_iop(zonep, unow, op);
+
+ /* Track system latency */
+ if (op != ZFS_ZONE_IOP_LOGICAL_WRITE)
+ add_sys_iop(unow, op, lat);
+}
+
+/*
+ * Calculate and return the total number of read ops, write ops and logical
+ * write ops for the given zone. If the zone has issued operations of any type
+ * return a non-zero value, otherwise return 0.
+ */
+static int
+get_zone_io_cnt(hrtime_t unow, zone_t *zonep, uint_t *rops, uint_t *wops,
+ uint_t *lwops)
+{
+ *rops = calc_zone_cnt(unow, &zonep->zone_rd_ops);
+ *wops = calc_zone_cnt(unow, &zonep->zone_wr_ops);
+ *lwops = calc_zone_cnt(unow, &zonep->zone_lwr_ops);
+
+ DTRACE_PROBE4(zfs__zone__io__cnt, uintptr_t, zonep->zone_id,
+ uintptr_t, *rops, uintptr_t, *wops, uintptr_t, *lwops);
+
+ return (*rops | *wops | *lwops);
+}
+
+/*
+ * Get the average read/write latency in usecs for the system.
+ */
+static void
+get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat)
+{
+ *rlat = calc_avg_lat(unow, &rd_lat);
+ *wlat = calc_avg_lat(unow, &wr_lat);
+
+ /*
+ * In an attempt to improve the accuracy of the throttling algorithm,
+ * assume that IO operations can't have zero latency. Instead, assume
+ * a reasonable lower bound for each operation type. If the actual
+ * observed latencies are non-zero, use those latency values instead.
+ */
+ if (*rlat == 0)
+ *rlat = 1000;
+ if (*wlat == 0)
+ *wlat = 1000;
+
+ DTRACE_PROBE2(zfs__zone__sys__avg__lat, uintptr_t, *rlat,
+ uintptr_t, *wlat);
+}
+
+/*
+ * Find disk utilization for each zone and average utilization for all active
+ * zones.
+ */
+static int
+zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg)
+{
+ zoneio_stats_t *sp = arg;
+ uint_t rops, wops, lwops;
+
+ if (zonep->zone_id == GLOBAL_ZONEID ||
+ get_zone_io_cnt(sp->zi_now, zonep, &rops, &wops, &lwops) == 0) {
+ zonep->zone_io_util = 0;
+ return (0);
+ }
+
+ zonep->zone_io_util = (rops * sp->zi_avgrlat) +
+ (wops * sp->zi_avgwlat) + (lwops * sp->zi_avgwlat);
+ sp->zi_totutil += zonep->zone_io_util;
+
+ if (zonep->zone_io_util > 0) {
+ sp->zi_active++;
+ sp->zi_totpri += zonep->zone_zfs_io_pri;
+ }
+
+ /*
+ * sdt:::zfs-zone-utilization
+ *
+ * arg0: zone ID
+ * arg1: read operations observed during time window
+ * arg2: physical write operations observed during time window
+ * arg3: logical write ops observed during time window
+ * arg4: calculated utilization given read and write ops
+ * arg5: I/O priority assigned to this zone
+ */
+ DTRACE_PROBE6(zfs__zone__utilization, uint_t, zonep->zone_id,
+ uint_t, rops, uint_t, wops, uint_t, lwops,
+ uint_t, zonep->zone_io_util, uint_t, zonep->zone_zfs_io_pri);
+
+ return (0);
+}
+
+static void
+zfs_zone_delay_inc(zone_t *zonep)
+{
+ if (zonep->zone_io_delay < zfs_zone_delay_ceiling)
+ zonep->zone_io_delay += zfs_zone_delay_step;
+}
+
+static void
+zfs_zone_delay_dec(zone_t *zonep)
+{
+ if (zonep->zone_io_delay > 0)
+ zonep->zone_io_delay -= zfs_zone_delay_step;
+}
+
+/*
+ * For all zones "far enough" away from the average utilization, increase that
+ * zone's delay. Otherwise, reduce its delay.
+ */
+static int
+zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg)
+{
+ zoneio_stats_t *sp = arg;
+ uint16_t delay = zonep->zone_io_delay;
+ uint_t fairutil = 0;
+
+ zonep->zone_io_util_above_avg = B_FALSE;
+
+ /*
+ * Given the calculated total utilization for all zones, calculate the
+ * fair share of I/O for this zone.
+ */
+ if (zfs_zone_priority_enable && sp->zi_totpri > 0) {
+ fairutil = (sp->zi_totutil * zonep->zone_zfs_io_pri) /
+ sp->zi_totpri;
+ } else if (sp->zi_active > 0) {
+ fairutil = sp->zi_totutil / sp->zi_active;
+ }
+
+ /*
+ * Adjust each IO's delay. If the overall delay becomes too high, avoid
+ * increasing beyond the ceiling value.
+ */
+ if (zonep->zone_io_util > fairutil && sp->zi_overutil) {
+ zonep->zone_io_util_above_avg = B_TRUE;
+
+ if (sp->zi_active > 1)
+ zfs_zone_delay_inc(zonep);
+ } else if (zonep->zone_io_util < fairutil || sp->zi_underutil ||
+ sp->zi_active <= 1) {
+ zfs_zone_delay_dec(zonep);
+ }
+
+ /*
+ * sdt:::zfs-zone-throttle
+ *
+ * arg0: zone ID
+ * arg1: old delay for this zone
+ * arg2: new delay for this zone
+ * arg3: calculated fair I/O utilization
+ * arg4: actual I/O utilization
+ */
+ DTRACE_PROBE5(zfs__zone__throttle, uintptr_t, zonep->zone_id,
+ uintptr_t, delay, uintptr_t, zonep->zone_io_delay,
+ uintptr_t, fairutil, uintptr_t, zonep->zone_io_util);
+
+ return (0);
+}
+
+/*
+ * Examine the utilization between different zones, and adjust the delay for
+ * each zone appropriately.
+ */
+static void
+zfs_zone_wait_adjust(hrtime_t unow, hrtime_t last_checked)
+{
+ zoneio_stats_t stats;
+ hrtime_t laggard_udelta = 0;
+
+ (void) bzero(&stats, sizeof (stats));
+
+ stats.zi_now = unow;
+ get_sys_avg_lat(unow, &stats.zi_avgrlat, &stats.zi_avgwlat);
+
+ if (stats.zi_avgrlat > stats.zi_avgwlat * zfs_zone_rw_lat_limit)
+ stats.zi_avgrlat = stats.zi_avgwlat * zfs_zone_rw_lat_limit;
+ else if (stats.zi_avgrlat * zfs_zone_rw_lat_limit < stats.zi_avgwlat)
+ stats.zi_avgwlat = stats.zi_avgrlat * zfs_zone_rw_lat_limit;
+
+ if (zone_walk(zfs_zone_wait_adjust_calculate_cb, &stats) != 0)
+ return;
+
+ /*
+ * Calculate disk utilization for the most recent period.
+ */
+ if (zfs_disk_last_rtime == 0 || unow - last_checked <= 0) {
+ stats.zi_diskutil = 0;
+ } else {
+ stats.zi_diskutil =
+ ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) /
+ ((unow - last_checked) * 1000);
+ }
+ zfs_disk_last_rtime = zfs_disk_rtime;
+
+ if (unow > zfs_disk_last_laggard)
+ laggard_udelta = unow - zfs_disk_last_laggard;
+
+ /*
+ * To minimize porpoising, we have three separate states for our
+ * assessment of I/O performance: overutilized, underutilized, and
+ * neither overutilized nor underutilized. We will increment the
+ * throttle if a zone is using more than its fair share _and_ I/O
+ * is overutilized; we will decrement the throttle if a zone is using
+ * less than its fair share _or_ I/O is underutilized.
+ */
+ stats.zi_underutil = stats.zi_diskutil < zfs_zone_underutil_threshold ||
+ laggard_udelta > zfs_zone_laggard_ancient;
+
+ stats.zi_overutil = stats.zi_diskutil > zfs_zone_util_threshold &&
+ laggard_udelta < zfs_zone_laggard_recent;
+
+ /*
+ * sdt:::zfs-zone-stats
+ *
+ * Statistics observed over the last period:
+ *
+ * arg0: average system read latency
+ * arg1: average system write latency
+ * arg2: number of active zones
+ * arg3: total I/O 'utilization' for all zones
+ * arg4: total I/O priority of all active zones
+ * arg5: calculated disk utilization
+ */
+ DTRACE_PROBE6(zfs__zone__stats, uintptr_t, stats.zi_avgrlat,
+ uintptr_t, stats.zi_avgwlat, uintptr_t, stats.zi_active,
+ uintptr_t, stats.zi_totutil, uintptr_t, stats.zi_totpri,
+ uintptr_t, stats.zi_diskutil);
+
+ (void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats);
+}
+
+/*
+ * Callback used to calculate a zone's IO schedule priority.
+ *
+ * We scan the zones looking for ones with ops in the queue. Out of those,
+ * we pick the one that calculates to the highest schedule priority.
+ */
+static int
+get_sched_pri_cb(zone_t *zonep, void *arg)
+{
+ int pri;
+ uint_t cnt;
+ zone_q_bump_t *qbp = arg;
+ zio_priority_t p = qbp->zq_queue;
+
+ cnt = zonep->zone_zfs_queued[p];
+ if (cnt == 0) {
+ zonep->zone_zfs_weight = 0;
+ return (0);
+ }
+
+ /*
+ * On each pass, increment the zone's weight. We use this as input
+ * to the calculation to prevent starvation. The value is reset
+ * each time we issue an IO for this zone so zones which haven't
+ * done any IO over several iterations will see their weight max
+ * out.
+ */
+ if (zonep->zone_zfs_weight < SCHED_WEIGHT_MAX)
+ zonep->zone_zfs_weight++;
+
+ /*
+ * This zone's IO priority is inversely proportional to the number
+ * of IOs the zone has enqueued, and directly proportional to the
+ * zone's configured priority and its current weight. The queue
+ * depth has already been scaled by 10 to avoid problems with int
+ * rounding.
+ *
+ * This means that zones with fewer IOs in the queue will get
+ * preference unless another zone's assigned priority pulls it
+ * ahead. The weight is factored in to help ensure that zones
+ * which haven't done IO in a while aren't starved.
+ */
+ pri = (qbp->zq_qdepth / cnt) *
+ zonep->zone_zfs_io_pri * zonep->zone_zfs_weight;
+
+ /*
+ * If this zone has a higher priority than what we found so far,
+ * it becomes the new leading contender.
+ */
+ if (pri > qbp->zq_priority) {
+ qbp->zq_zoneid = zonep->zone_id;
+ qbp->zq_priority = pri;
+ qbp->zq_wt = zonep->zone_zfs_weight;
+ }
+ return (0);
+}
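
A worked example of the formula above (values hypothetical, not part of the patch): with a queue depth of 20, zq_qdepth is 200 after the x10 scaling. A zone with 2 queued ops, configured priority 100 and weight 5 scores (200 / 2) * 100 * 5 == 50000, while a zone with 20 queued ops and the same priority and weight scores (200 / 20) * 100 * 5 == 5000, so the lightly queued zone wins. As a sketch:

	/* Sketch of the priority computation used in get_sched_pri_cb(). */
	static int
	sched_pri_sketch(int scaled_qdepth, uint_t queued, int cfg_pri,
	    uint16_t weight)
	{
		return ((scaled_qdepth / queued) * cfg_pri * weight);
	}
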
+
+/*
+ * See if we need to bump a zone's zio to the head of the queue. This is only
+ * done on the two synchronous I/O queues (see the block comment on the
+ * zfs_zone_schedule function). We get the correct vdev_queue_class_t and
+ * queue depth from our caller.
+ *
+ * A zone running only single-threaded synchronous processes cannot get more
+ * than 1 op into the queue at a time. This can cause an imbalance in
+ * performance if there are zones with many parallel processes (and many ops
+ * in the queue) vs. other zones which are running simple single-threaded
+ * processes, such as interactive tasks in the shell. Those zones can get
+ * backed up behind a deep queue and their IO performance will appear to be
+ * very poor as a result, making the zone feel unresponsive for interactive
+ * work.
+ *
+ * The scheduling algorithm kicks in once we start to get a deeper queue.
+ * Once that occurs, we look at all of the zones to see which one calculates
+ * to the highest priority. We bump that zone's first zio to the head of the
+ * queue.
+ *
+ * We use a counter on the zone so that we can quickly find how many ops each
+ * zone has in the queue without having to search the entire queue itself.
+ * This scales better since the number of zones is expected to be on the
+ * order of 10-100 whereas the queue depth can be in the range of 50-2000.
+ * In addition, since the zios in the queue only carry the zoneid, we would
+ * have to look up the zone for each zio enqueued and that means the overhead
+ * for scanning the queue each time would be much higher.
+ *
+ * In all cases, we fall back to simply pulling the next op off the queue
+ * if something should go wrong.
+ */
+static zio_t *
+get_next_zio(vdev_queue_class_t *vqc, int qdepth, zio_priority_t p,
+ avl_tree_t *tree)
+{
+ zone_q_bump_t qbump;
+ zio_t *zp = NULL, *zphead;
+ int cnt = 0;
+
+ /* To avoid problems with int rounding, scale the queue depth by 10 */
+ qbump.zq_qdepth = qdepth * 10;
+ qbump.zq_priority = 0;
+ qbump.zq_zoneid = 0;
+ qbump.zq_queue = p;
+ (void) zone_walk(get_sched_pri_cb, &qbump);
+
+ zphead = avl_first(tree);
+
+ /* If the scheduler failed to pick a zone, we fall back to the queue head. */
+ if (qbump.zq_zoneid != 0) {
+ for (zp = avl_first(tree); zp != NULL;
+ zp = avl_walk(tree, zp, AVL_AFTER)) {
+ if (zp->io_zoneid == qbump.zq_zoneid)
+ break;
+ cnt++;
+ }
+ }
+
+ if (zp == NULL) {
+ zp = zphead;
+ } else if (zp != zphead) {
+ /*
+ * Only fire the probe if we actually picked a different zio
+ * than the one already at the head of the queue.
+ */
+ DTRACE_PROBE4(zfs__zone__sched__bump, uint_t, zp->io_zoneid,
+ uint_t, cnt, int, qbump.zq_priority, int, qbump.zq_wt);
+ }
+
+ return (zp);
+}
+
+/*
+ * Add our zone ID to the zio so we can keep track of which zones are doing
+ * what, even when the current thread processing the zio is not associated
+ * with the zone (e.g. the kernel taskq which pushes out TX groups).
+ */
+void
+zfs_zone_zio_init(zio_t *zp)
+{
+ zone_t *zonep = curzone;
+
+ zp->io_zoneid = zonep->zone_id;
+}
+
+/*
+ * Track and throttle IO operations per zone. Called from:
+ * - dmu_tx_count_write for (logical) write ops (both dataset and zvol writes
+ * go through this path)
+ * - arc_read for read ops that miss the ARC (both dataset and zvol)
+ * For each operation, increment that zone's counter based on the type of
+ * operation, then delay the operation, if necessary.
+ *
+ * There are three basic ways that we can see write ops:
+ * 1) An application does write syscalls. Those ops go into a TXG which
+ * we'll count here. Sometime later a kernel taskq thread (we'll see the
+ * vdev IO as zone 0) will perform some number of physical writes to commit
+ * the TXG to disk. Those writes are not associated with the zone which
+ * made the write syscalls and the number of operations is not correlated
+ * between the taskq and the zone. We only see logical writes in this
+ *    function; we see the physical writes in the zfs_zone_zio_start and
+ * zfs_zone_zio_done functions.
+ * 2) An application opens a file with O_SYNC. Each write will result in
+ * an operation which we'll see here plus a low-level vdev write from
+ * that zone.
+ * 3) An application does write syscalls followed by an fsync(). We'll
+ * count the writes going into a TXG here. We'll also see some number
+ * (usually much smaller, maybe only 1) of low-level vdev writes from this
+ * zone when the fsync is performed, plus some other low-level vdev writes
+ * from the taskq in zone 0 (are these metadata writes?).
+ *
+ * In addition to the above, there are misc. system-level writes, such as
+ * writing out dirty pages to swap, or sync(2) calls, which will be handled
+ * by the global zone and which we count but don't generally worry about.
+ *
+ * Because of the above, we can see writes twice; first because this function
+ * is always called by a zone thread for logical writes, but then we also will
+ * count the physical writes that are performed at a low level via
+ * zfs_zone_zio_start. Without this, it can look like a non-global zone never
+ * writes (case 1). Depending on when the TXG is synced, the counts may be in
+ * the same sample bucket or in a different one.
+ *
+ * Tracking read operations is simpler due to their synchronous semantics. The
+ * zfs_read function -- called as a result of a read(2) syscall -- will always
+ * retrieve the data to be read through arc_read and we only come into this
+ * function when we have an ARC miss.
+ */
+void
+zfs_zone_io_throttle(zfs_zone_iop_type_t type)
+{
+ zone_t *zonep = curzone;
+ hrtime_t unow, last_checked;
+ uint16_t wait;
+
+ unow = GET_USEC_TIME;
+
+ /*
+ * Only bump the counter for logical writes here. The counters for
+ * tracking physical IO operations are handled in zfs_zone_zio_done.
+ */
+ if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) {
+ mutex_enter(&zonep->zone_stg_io_lock);
+ add_iop(zonep, unow, type, 0);
+ mutex_exit(&zonep->zone_stg_io_lock);
+ }
+
+ if (!zfs_zone_delay_enable)
+ return;
+
+ /*
+ * If the zone's I/O priority is set to zero, don't throttle that zone's
+ * operations at all.
+ */
+ if (zonep->zone_zfs_io_pri == 0)
+ return;
+
+ /*
+ * XXX There's a potential race here in that more than one thread may
+ * update the zone delays concurrently. The worst outcome is corruption
+ * of the per-zone IO tracking data, in which case the algorithm may make
+ * incorrect throttling decisions until the data is refreshed.
+ */
+ last_checked = zfs_zone_last_checked;
+ if ((unow - last_checked) > zfs_zone_adjust_time) {
+ zfs_zone_last_checked = unow;
+ zfs_zone_wait_adjust(unow, last_checked);
+ }
+
+ if ((wait = zonep->zone_io_delay) > 0) {
+ /*
+ * If this is a write and we're doing above normal TXG
+ * syncing, then throttle for longer than normal.
+ */
+ if (type == ZFS_ZONE_IOP_LOGICAL_WRITE &&
+ (txg_cnt > 1 || txg_sync_rate > 1))
+ wait *= zfs_zone_txg_throttle_scale;
+
+ /*
+ * sdt:::zfs-zone-wait
+ *
+ * arg0: zone ID
+ * arg1: type of IO operation
+ * arg2: time to delay (in us)
+ */
+ DTRACE_PROBE3(zfs__zone__wait, uintptr_t, zonep->zone_id,
+ uintptr_t, type, uintptr_t, wait);
+
+ drv_usecwait(wait);
+
+ if (zonep->zone_vfs_stats != NULL) {
+ atomic_inc_64(&zonep->zone_vfs_stats->
+ zv_delay_cnt.value.ui64);
+ atomic_add_64(&zonep->zone_vfs_stats->
+ zv_delay_time.value.ui64, wait);
+ }
+ }
+}
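
For a rough sense of what the drv_usecwait() throttle above costs a zone, the added latency scales linearly with the zone's op rate. A minimal sketch with illustrative numbers (helper name hypothetical, not part of the patch):

	/*
	 * A delay of `wait' microseconds applied to `iops' ops per second adds
	 * roughly wait * iops microseconds of waiting per second of workload,
	 * e.g. 100us * 5000 IOPS ~= 0.5s.
	 */
	static uint64_t
	throttle_cost_us_sketch(uint16_t wait, uint64_t iops)
	{
		return ((uint64_t)wait * iops);
	}
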
+
+/*
+ * XXX Ignore the pool pointer parameter for now.
+ *
+ * Keep track to see if the TXG sync rate is running above the expected rate.
+ * If so, this implies that we are filling TXG's at a high rate due to a heavy
+ * write workload. We use this as input into the zone throttle.
+ *
+ * This function is called every 5 seconds (zfs_txg_timeout) under a normal
+ * write load. In this case, the sync rate is going to be 1. When there
+ * is a heavy write load, TXGs fill up fast and the sync thread will write
+ * the TXG more frequently (perhaps once a second). In this case the rate
+ * will be > 1. The sync rate is a lagging indicator since it can be up
+ * to 5 seconds old. We use the txg_cnt to keep track of the rate in the
+ * current 5 second interval and txg_sync_rate to keep track of the previous
+ * 5 second interval. In that way we don't have a period (1 or more seconds)
+ * where the txg_cnt == 0 and we cut back on throttling even though the rate
+ * is still high.
+ */
+/*ARGSUSED*/
+void
+zfs_zone_report_txg_sync(void *dp)
+{
+ uint_t now;
+
+ txg_cnt++;
+ now = (uint_t)(gethrtime() / NANOSEC);
+ if ((now - txg_last_check) >= zfs_txg_timeout) {
+ txg_sync_rate = txg_cnt / 2;
+ txg_cnt = 0;
+ txg_last_check = now;
+ }
+}
+
+hrtime_t
+zfs_zone_txg_delay()
+{
+ if (curzone->zone_io_util_above_avg)
+ return (zfs_zone_txg_delay_nsec);
+
+ return (MSEC2NSEC(10));
+}
+
+/*
+ * Called from vdev_disk_io_start when an IO hits the end of the zio pipeline
+ * and is issued.
+ * Keep track of start time for latency calculation in zfs_zone_zio_done.
+ */
+void
+zfs_zone_zio_start(zio_t *zp)
+{
+ zone_t *zonep;
+
+ /*
+ * I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not for
+ * an actual I/O operation. Ignore those operations for the purposes of
+ * throttling and scheduling.
+ */
+ if (zp->io_type == ZIO_TYPE_IOCTL)
+ return;
+
+ if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
+ return;
+
+ mutex_enter(&zonep->zone_zfs_lock);
+ if (zp->io_type == ZIO_TYPE_READ)
+ kstat_runq_enter(&zonep->zone_zfs_rwstats);
+ zonep->zone_zfs_weight = 0;
+ mutex_exit(&zonep->zone_zfs_lock);
+
+ mutex_enter(&zfs_disk_lock);
+ zp->io_dispatched = gethrtime();
+
+ if (zfs_disk_rcnt++ != 0)
+ zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate);
+ zfs_disk_rlastupdate = zp->io_dispatched;
+ mutex_exit(&zfs_disk_lock);
+
+ zone_rele(zonep);
+}
+
+/*
+ * Called from vdev_disk_io_done when an IO completes.
+ * Increment our counter for zone ops.
+ * Calculate the IO latency avg. for this zone.
+ */
+void
+zfs_zone_zio_done(zio_t *zp)
+{
+ zone_t *zonep;
+ hrtime_t now, unow, udelta;
+
+ if (zp->io_type == ZIO_TYPE_IOCTL)
+ return;
+
+ if (zp->io_dispatched == 0)
+ return;
+
+ if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
+ return;
+
+ now = gethrtime();
+ unow = NANO_TO_MICRO(now);
+ udelta = unow - NANO_TO_MICRO(zp->io_dispatched);
+
+ mutex_enter(&zonep->zone_zfs_lock);
+
+ /*
+ * To calculate the wsvc_t average, keep a cumulative sum of all the
+ * wait time before each I/O was dispatched. Since most writes are
+ * asynchronous, only track the wait time for read I/Os.
+ */
+ if (zp->io_type == ZIO_TYPE_READ) {
+ zonep->zone_zfs_rwstats.reads++;
+ zonep->zone_zfs_rwstats.nread += zp->io_size;
+
+ zonep->zone_zfs_stats->zz_waittime.value.ui64 +=
+ zp->io_dispatched - zp->io_timestamp;
+
+ kstat_runq_exit(&zonep->zone_zfs_rwstats);
+ } else {
+ zonep->zone_zfs_rwstats.writes++;
+ zonep->zone_zfs_rwstats.nwritten += zp->io_size;
+ }
+
+ mutex_exit(&zonep->zone_zfs_lock);
+
+ mutex_enter(&zfs_disk_lock);
+ zfs_disk_rcnt--;
+ zfs_disk_rtime += (now - zfs_disk_rlastupdate);
+ zfs_disk_rlastupdate = now;
+
+ if (udelta > zfs_zone_laggard_threshold)
+ zfs_disk_last_laggard = unow;
+
+ mutex_exit(&zfs_disk_lock);
+
+ if (zfs_zone_delay_enable) {
+ mutex_enter(&zonep->zone_stg_io_lock);
+ add_iop(zonep, unow, zp->io_type == ZIO_TYPE_READ ?
+ ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta);
+ mutex_exit(&zonep->zone_stg_io_lock);
+ }
+
+ zone_rele(zonep);
+
+ /*
+ * sdt:::zfs-zone-latency
+ *
+ * arg0: zone ID
+ * arg1: type of I/O operation
+ * arg2: I/O latency (in us)
+ */
+ DTRACE_PROBE3(zfs__zone__latency, uintptr_t, zp->io_zoneid,
+ uintptr_t, zp->io_type, uintptr_t, udelta);
+}
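
The zfs_disk_rcnt/zfs_disk_rtime updates split between zfs_zone_zio_start and zfs_zone_zio_done follow the usual run-queue accounting pattern: busy time accrues only while at least one I/O is outstanding. A combined sketch of the two halves (names hypothetical, not part of the patch):

	static uint_t rcnt_sketch;
	static hrtime_t rtime_sketch, rlast_sketch;

	static void
	disk_busy_sketch(boolean_t start, hrtime_t now)
	{
		if (start) {
			/* Elapsed time only counts as busy if an I/O was already out. */
			if (rcnt_sketch++ != 0)
				rtime_sketch += now - rlast_sketch;
		} else {
			rcnt_sketch--;
			rtime_sketch += now - rlast_sketch;
		}
		rlast_sketch = now;
	}
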
+
+void
+zfs_zone_zio_dequeue(zio_t *zp)
+{
+ zio_priority_t p;
+ zone_t *zonep;
+
+ p = zp->io_priority;
+ if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE)
+ return;
+
+ /* We depend on p being defined as either 0 or 1 */
+ ASSERT(p < 2);
+
+ if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
+ return;
+
+ mutex_enter(&zonep->zone_stg_io_lock);
+ ASSERT(zonep->zone_zfs_queued[p] > 0);
+ if (zonep->zone_zfs_queued[p] == 0)
+ cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0");
+ else
+ zonep->zone_zfs_queued[p]--;
+ mutex_exit(&zonep->zone_stg_io_lock);
+ zone_rele(zonep);
+}
+
+void
+zfs_zone_zio_enqueue(zio_t *zp)
+{
+ zio_priority_t p;
+ zone_t *zonep;
+
+ p = zp->io_priority;
+ if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE)
+ return;
+
+ /* We depend on p being defined as either 0 or 1 */
+ ASSERT(p < 2);
+
+ if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL)
+ return;
+
+ mutex_enter(&zonep->zone_stg_io_lock);
+ zonep->zone_zfs_queued[p]++;
+ mutex_exit(&zonep->zone_stg_io_lock);
+ zone_rele(zonep);
+}
+
+/*
+ * Called from vdev_queue_io_to_issue. That function is where zio's are listed
+ * in FIFO order on one of the sync queues, then pulled off (by
+ * vdev_queue_io_remove) and issued. We potentially do zone-based scheduling
+ * here to find a zone's zio deeper in the sync queue and issue that instead
+ * of simply doing FIFO.
+ *
+ * We only do zone-based zio scheduling for the two synchronous I/O queues
+ * (read & write). These queues are normally serviced in FIFO order but we
+ * may decide to move a zone's zio to the head of the line. A typical I/O
+ * load will be mostly synchronous reads and some asynchronous writes (which
+ * are scheduled differently due to transaction groups). There will also be
+ * some synchronous writes for those apps which want to ensure their data is on
+ * disk. We want to make sure that a zone with a single-threaded app (e.g. the
+ * shell) that is doing synchronous I/O (typically reads) isn't penalized by
+ * other zones which are doing lots of synchronous I/O because they have many
+ * running threads.
+ *
+ * The vq->vq_lock mutex is held when we're executing this function so we
+ * can safely access the "last zone" variable on the queue.
+ */
+zio_t *
+zfs_zone_schedule(vdev_queue_t *vq, zio_priority_t p, avl_index_t idx,
+ avl_tree_t *tree)
+{
+ vdev_queue_class_t *vqc = &vq->vq_class[p];
+ uint_t cnt;
+ zoneid_t last_zone;
+ zio_t *zio;
+
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+ /* Don't change the order on the LBA ordered queues. */
+ if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE)
+ return (avl_nearest(tree, idx, AVL_AFTER));
+
+ /* We depend on p being defined as either 0 or 1 */
+ ASSERT(p < 2);
+
+ cnt = avl_numnodes(tree);
+ last_zone = vq->vq_last_zone_id;
+
+ /*
+ * If there are only a few zios in the queue then just issue the head.
+ * If there are more than a few zios already queued up, then use
+ * scheduling to get the next zio.
+ */
+ if (!zfs_zone_schedule_enable || cnt < zfs_zone_schedule_thresh)
+ zio = avl_nearest(tree, idx, AVL_AFTER);
+ else
+ zio = get_next_zio(vqc, cnt, p, tree);
+
+ vq->vq_last_zone_id = zio->io_zoneid;
+
+ /*
+ * Probe with 4 args; the number of IOs in the queue, the zone that
+ * was last scheduled off this queue, the zone that was associated
+ * with the next IO that is scheduled, and which queue (priority).
+ */
+ DTRACE_PROBE4(zfs__zone__sched, uint_t, cnt, uint_t, last_zone,
+ uint_t, zio->io_zoneid, uint_t, p);
+
+ return (zio);
+}
+
+#endif
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
index 38b57e0123..f9f93999db 100644
--- a/usr/src/uts/common/fs/zfs/zil.c
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -1801,9 +1802,18 @@ zil_close(zilog_t *zilog)
if (lwb != NULL)
txg = lwb->lwb_max_txg;
mutex_exit(&zilog->zl_lock);
- if (txg)
+
+ if (zilog_is_dirty(zilog)) {
+ /*
+ * If we're dirty, always wait for the current transaction --
+ * our lwb_max_txg may be in the past.
+ */
+ txg_wait_synced(zilog->zl_dmu_pool, 0);
+ } else if (txg) {
txg_wait_synced(zilog->zl_dmu_pool, txg);
- ASSERT(!zilog_is_dirty(zilog));
+ }
+
+ VERIFY(!zilog_is_dirty(zilog));
taskq_destroy(zilog->zl_clean_taskq);
zilog->zl_clean_taskq = NULL;
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index aa0f2945dd..2fa3213be0 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
*/
#include <sys/sysmacros.h>
@@ -39,6 +40,7 @@
#include <sys/ddt.h>
#include <sys/blkptr.h>
#include <sys/zfeature.h>
+#include <sys/zfs_zone.h>
/*
* ==========================================================================
@@ -557,11 +559,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_bookmark = *zb;
if (pio != NULL) {
+ zio->io_zoneid = pio->io_zoneid;
if (zio->io_logical == NULL)
zio->io_logical = pio->io_logical;
if (zio->io_child_type == ZIO_CHILD_GANG)
zio->io_gang_leader = pio->io_gang_leader;
zio_add_child(pio, zio);
+ } else {
+ zfs_zone_zio_init(zio);
}
return (zio);
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
index 80888103fe..f681b1dc65 100644
--- a/usr/src/uts/common/fs/zfs/zvol.c
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -25,7 +25,7 @@
*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
*/
/*
@@ -83,6 +83,7 @@
#include <sys/zvol.h>
#include <sys/dumphdr.h>
#include <sys/zil_impl.h>
+#include <sys/sdt.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
@@ -137,6 +138,11 @@ typedef struct zvol_state {
#define ZVOL_EXCL 0x4
#define ZVOL_WCE 0x8
+#define VOP_LATENCY_10MS 10000000
+#define VOP_LATENCY_100MS 100000000
+#define VOP_LATENCY_1S 1000000000
+#define VOP_LATENCY_10S 10000000000
+
/*
* zvol maximum transfer in one DMU tx.
*/
@@ -1378,6 +1384,9 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
uint64_t volsize;
rl_t *rl;
int error = 0;
+ zone_t *zonep = curzone;
+ uint64_t tot_bytes;
+ hrtime_t start, lat;
zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
if (zv == NULL)
@@ -1394,6 +1403,14 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
return (error);
}
+ DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 0);
+
+ mutex_enter(&zonep->zone_vfs_lock);
+ kstat_runq_enter(&zonep->zone_vfs_rwstats);
+ mutex_exit(&zonep->zone_vfs_lock);
+ start = gethrtime();
+ tot_bytes = 0;
+
rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
RL_READER);
while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
@@ -1403,6 +1420,7 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
if (bytes > volsize - uio->uio_loffset)
bytes = volsize - uio->uio_loffset;
+ tot_bytes += bytes;
error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
if (error) {
/* convert checksum errors into IO errors */
@@ -1412,6 +1430,39 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
}
}
zfs_range_unlock(rl);
+
+ mutex_enter(&zonep->zone_vfs_lock);
+ zonep->zone_vfs_rwstats.reads++;
+ zonep->zone_vfs_rwstats.nread += tot_bytes;
+ kstat_runq_exit(&zonep->zone_vfs_rwstats);
+ mutex_exit(&zonep->zone_vfs_lock);
+
+ lat = gethrtime() - start;
+
+ if (lat >= VOP_LATENCY_10MS) {
+ zone_vfs_kstat_t *zvp;
+
+ zvp = zonep->zone_vfs_stats;
+ if (lat < VOP_LATENCY_100MS) {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ } else if (lat < VOP_LATENCY_1S) {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ } else if (lat < VOP_LATENCY_10S) {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+ } else {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_10s_ops.value.ui64);
+ }
+ }
+
+ DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 0, int,
+ error);
+
return (error);
}
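
The VOP_LATENCY_* kstat updates above (and the identical block in zvol_write below) are cumulative: an operation is counted in every bucket whose threshold it meets, so a 1.5s read bumps the 10ms, 100ms and 1s counters but not the 10s one. The same logic expressed as a loop (sketch only, not part of the patch):

	static void
	latency_buckets_sketch(hrtime_t lat, uint64_t buckets[4])
	{
		const hrtime_t lim[4] = { VOP_LATENCY_10MS, VOP_LATENCY_100MS,
		    VOP_LATENCY_1S, VOP_LATENCY_10S };
		int i;

		/* Bump every bucket whose threshold this op met or exceeded. */
		for (i = 0; i < 4; i++) {
			if (lat >= lim[i])
				buckets[i]++;
		}
	}
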
@@ -1425,6 +1476,9 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
rl_t *rl;
int error = 0;
boolean_t sync;
+ zone_t *zonep = curzone;
+ uint64_t tot_bytes;
+ hrtime_t start, lat;
zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
if (zv == NULL)
@@ -1441,6 +1495,19 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
return (error);
}
+ DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 1);
+
+ /*
+ * For the purposes of VFS kstat consumers, the "waitq" calculation is
+ * repurposed as the active queue for zvol write operations. There's no
+ * actual wait queue for zvol operations.
+ */
+ mutex_enter(&zonep->zone_vfs_lock);
+ kstat_waitq_enter(&zonep->zone_vfs_rwstats);
+ mutex_exit(&zonep->zone_vfs_lock);
+ start = gethrtime();
+ tot_bytes = 0;
+
sync = !(zv->zv_flags & ZVOL_WCE) ||
(zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
@@ -1454,6 +1521,7 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
if (bytes > volsize - off) /* don't write past the end */
bytes = volsize - off;
+ tot_bytes += bytes;
dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
@@ -1471,6 +1539,39 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
zfs_range_unlock(rl);
if (sync)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
+ DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 1, int,
+ error);
+
+ mutex_enter(&zonep->zone_vfs_lock);
+ zonep->zone_vfs_rwstats.writes++;
+ zonep->zone_vfs_rwstats.nwritten += tot_bytes;
+ kstat_waitq_exit(&zonep->zone_vfs_rwstats);
+ mutex_exit(&zonep->zone_vfs_lock);
+
+ lat = gethrtime() - start;
+
+ if (lat >= VOP_LATENCY_10MS) {
+ zone_vfs_kstat_t *zvp;
+
+ zvp = zonep->zone_vfs_stats;
+ if (lat < VOP_LATENCY_100MS) {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ } else if (lat < VOP_LATENCY_1S) {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ } else if (lat < VOP_LATENCY_10S) {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+ } else {
+ atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
+ atomic_inc_64(&zvp->zv_10s_ops.value.ui64);
+ }
+ }
+
return (error);
}