diff options
Diffstat (limited to 'usr/src/uts/common/brand/lx/devfs/lxd_vfsops.c')
-rw-r--r-- | usr/src/uts/common/brand/lx/devfs/lxd_vfsops.c | 860 |
1 files changed, 860 insertions, 0 deletions
diff --git a/usr/src/uts/common/brand/lx/devfs/lxd_vfsops.c b/usr/src/uts/common/brand/lx/devfs/lxd_vfsops.c new file mode 100644 index 0000000000..69c131d886 --- /dev/null +++ b/usr/src/uts/common/brand/lx/devfs/lxd_vfsops.c @@ -0,0 +1,860 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/* + * The lx devfs (lxd) file system is used within lx branded zones to provide + * the Linux view of /dev. + * + * In the past, the Linux /dev was simply a lofs mount pointing at /native/dev. + * lxd now provides the Linux /dev. + * + * The lxd file system is a hybrid of lofs and tmpfs. It supports a "back" file + * system which is the special device and corresponds to the special device in + * a lofs mount. As with lofs, all files in the special device are accessible + * through the lxd mount. Because the zone's devfs is not directly modifiable + * within the zone (also mknod(2) is not generally allowed within a zone) it is + * impossible to create files in devfs. For lx, in some cases it's useful to be + * able to make new symlinks or new directories under /dev. lxd implements + * these operations by creating "files" in memory in the same way as tmpfs + * does. Within lxd these are referred to as "front" files. For operations such + * as lookup or readdir, lxd provides a merged view of both the front and back + * files. lxd does not support regular front files or simple I/O (read/write) + * to front files, since there is no need for that. For back files, all + * operations are simply passed through to the real vnode, as is done with + * lofs. Front files are not allowed to mask back files. + * + * The Linux /dev is now a lxd mount with the special file (i.e. the back + * file system) as /native/dev. + * + * In addition, lx has a need for some illumos/Linux translation for the + * various *stat(2) system calls when used on a device. This translation can + * be centralized within lxd's getattr vnode entry point. + * + * Because the front file system only exists in memory and the back file + * system is the zone's devfs, which is not persistent across reboots, we + * track any device uid/gid/mode changes in a per-zone /etc/.lxd_dev_attr + * file and re-apply those changes when the lx devfs file system is mounted. + * Currently only changes to block device nodes are persistent. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/time.h> +#include <sys/pathname.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/stat.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/statvfs.h> +#include <sys/mount.h> +#include <sys/systm.h> +#include <sys/mntent.h> +#include <sys/policy.h> +#include <sys/sdt.h> +#include <sys/ddi.h> +#include <sys/lx_brand.h> +#include <sys/lx_ptm.h> +#include <sys/lx_impl.h> + +#include "lxd.h" + +/* Module level parameters */ +static int lxd_fstype; +static dev_t lxd_dev; + +/* + * lxd_mountcount is used to prevent module unloads while there is still + * state from a former mount hanging around. The filesystem module must not be + * allowed to go away before the last VFS_FREEVFS() call has been made. Since + * this is just an atomic counter, there's no need for locking. + */ +static uint32_t lxd_mountcount; + +/* + * lxd_minfree is the minimum amount of swap space that lx devfs leaves for + * the rest of the zone. + */ +size_t lxd_minfree = 0; + +/* + * LXDMINFREE -- the value from which lxd_minfree is derived -- should be + * configured to a value that is roughly the smallest practical value for + * memory + swap minus the largest reasonable size for lxd in such + * a configuration. As of this writing, the smallest practical memory + swap + * configuration is 128MB, and it seems reasonable to allow lxd to consume + * no more than ~10% of this, yielding a LXDMINFREE of 12MB. + */ +#define LXDMINFREE 12 * 1024 * 1024 /* 12 Megabytes */ + +extern pgcnt_t swapfs_minfree; + +extern int lxd_symlink(vnode_t *, char *, struct vattr *, char *, cred_t *, + caller_context_t *, int); +extern int stat64(char *, struct stat64 *); + +/* + * lxd vfs operations. + */ +static int lxd_init(int, char *); +static int lxd_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *); +static int lxd_unmount(vfs_t *, int, cred_t *); +static int lxd_root(vfs_t *, vnode_t **); +static int lxd_statvfs(vfs_t *, statvfs64_t *); +static void lxd_freevfs(vfs_t *vfsp); + +/* + * Loadable module wrapper + */ +#include <sys/modctl.h> + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "lx_devfs", + lxd_init, + VSW_ZMOUNT, + NULL +}; + +/* + * Module linkage information + */ +static struct modlfs modlfs = { + &mod_fsops, "lx brand devfs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modlfs, NULL +}; + +/* + * Definitions and translators for devt's. + */ +static void lxd_pts_devt_translator(dev_t, dev_t *); +static void lxd_ptm_devt_translator(dev_t, dev_t *); + +static kmutex_t lxd_xlate_lock; +static boolean_t lxd_xlate_initialized = B_FALSE; + +static lxd_minor_translator_t lxd_mtranslator_mm[] = { + { "/dev/null", 0, 1, 3 }, + { "/dev/zero", 0, 1, 5 }, + { NULL, 0, 0, 0 } +}; +static lxd_minor_translator_t lxd_mtranslator_random[] = { + { "/dev/random", 0, 1, 8 }, + { "/dev/urandom", 0, 1, 9 }, + { NULL, 0, 0, 0 } +}; +static lxd_minor_translator_t lxd_mtranslator_sy[] = { + { "/dev/tty", 0, LX_TTY_MAJOR, 0 }, + { NULL, 0, 0, 0 } +}; +static lxd_minor_translator_t lxd_mtranslator_zcons[] = { + { "/dev/console", 0, LX_TTY_MAJOR, 1 }, + { NULL, 0, 0, 0 } +}; +lxd_devt_translator_t lxd_devt_translators[] = { + { "mm", 0, DTT_LIST, (uintptr_t)&lxd_mtranslator_mm }, + { "random", 0, DTT_LIST, (uintptr_t)&lxd_mtranslator_random }, + { "sy", 0, DTT_LIST, (uintptr_t)&lxd_mtranslator_sy }, + { "zcons", 0, DTT_LIST, (uintptr_t)&lxd_mtranslator_zcons }, + { LX_PTM_DRV, 0, DTT_CUSTOM, (uintptr_t)lxd_ptm_devt_translator }, + { "pts", 0, DTT_CUSTOM, (uintptr_t)lxd_pts_devt_translator }, + { NULL, 0, DTT_INVALID, (uintptr_t)NULL } +}; + +int +_init() +{ + return (mod_install(&modlinkage)); +} + +int +_fini() +{ + int error; + + if (lxd_mountcount > 0) + return (EBUSY); + + if ((error = mod_remove(&modlinkage)) != 0) + return (error); + + /* + * Tear down the operations vectors + */ + (void) vfs_freevfsops_by_type(lxd_fstype); + vn_freevnodeops(lxd_vnodeops); + mutex_destroy(&lxd_xlate_lock); + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* + * Initialize global locks, etc. Called when loading lxd module. + */ +static int +lxd_init(int fstype, char *name) +{ + static const fs_operation_def_t lxd_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = lxd_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = lxd_unmount }, + VFSNAME_ROOT, { .vfs_root = lxd_root }, + VFSNAME_STATVFS, { .vfs_statvfs = lxd_statvfs }, + VFSNAME_FREEVFS, { .vfs_freevfs = lxd_freevfs }, + NULL, NULL + }; + extern const struct fs_operation_def lxd_vnodeops_template[]; + int error; + major_t dev; + + lxd_fstype = fstype; + ASSERT(lxd_fstype != 0); + + error = vfs_setfsops(fstype, lxd_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "lxd_init: bad vfs ops template"); + return (error); + } + + error = vn_make_ops(name, lxd_vnodeops_template, &lxd_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "lxd_init: bad vnode ops template"); + return (error); + } + + /* + * lxd_minfree doesn't need to be some function of configured + * swap space since it really is an absolute limit of swap space + * which still allows other processes to execute. + */ + if (lxd_minfree == 0) { + /* Set if not patched */ + lxd_minfree = btopr(LXDMINFREE); + } + + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "lxd_init: Can't get unique device number."); + dev = 0; + } + + /* + * Make the pseudo device + */ + lxd_dev = makedevice(dev, 0); + + mutex_init(&lxd_xlate_lock, NULL, MUTEX_DEFAULT, NULL); + + return (0); +} + +/* + * Initialize device translator mapping table. + * + * Note that we cannot do this in lxd_init since that can lead to a recursive + * rw_enter while we're doing lookupnameat (via sdev_lookup/prof_make_maps/ + * devi_attach_node/modload). Thus we do it in the mount path and keep track + * so that we only initialize the table once. + */ +static void +lxd_xlate_init() +{ + int i; + + mutex_enter(&lxd_xlate_lock); + if (lxd_xlate_initialized) { + mutex_exit(&lxd_xlate_lock); + return; + } + + for (i = 0; lxd_devt_translators[i].lxd_xl_driver != NULL; i++) { + lxd_minor_translator_t *mt; + int j; + + lxd_devt_translators[i].lxd_xl_major = + mod_name_to_major(lxd_devt_translators[i].lxd_xl_driver); + + /* if this translator doesn't use a list mapping we're done. */ + if (lxd_devt_translators[i].lxd_xl_type != DTT_LIST) + continue; + + /* for each device listed, lookup the minor node number */ + mt = lxd_devt_translators[i].xl_list; + for (j = 0; mt[j].lxd_mt_path != NULL; j++) { + vnode_t *vp; + struct vattr va; + char *tpath; + char tnm[MAXPATHLEN]; + + /* + * The attach might be triggered in either the global + * zone or in a non-global zone, so we may need to + * adjust the path if we're in a NGZ. + */ + if (curproc->p_zone->zone_id == GLOBAL_ZONEUNIQID) { + tpath = mt[j].lxd_mt_path; + } else { + (void) snprintf(tnm, sizeof (tnm), "/native%s", + mt[j].lxd_mt_path); + tpath = tnm; + } + + if (lookupnameat(tpath, UIO_SYSSPACE, FOLLOW, NULL, + &vp, NULL) != 0) { + mt[j].lxd_mt_minor = UINT_MAX; + continue; + } + + va.va_mask = AT_RDEV; + if (VOP_GETATTR(vp, &va, 0, kcred, NULL) != 0) { + va.va_rdev = NODEV; + } else { + ASSERT(getmajor(va.va_rdev) == + lxd_devt_translators[i].lxd_xl_major); + ASSERT(mt[j].lxd_mt_lx_minor < LX_MAXMIN); + } + + mt[j].lxd_mt_minor = getminor(va.va_rdev); + + VN_RELE(vp); + } + } + + lxd_xlate_initialized = B_TRUE; + mutex_exit(&lxd_xlate_lock); +} + +static int +lxd_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + lxd_mnt_t *lxdm = NULL; + struct lxd_node *ldn; + struct pathname dpn; + int error; + int i; + int nodev; + struct vattr rattr; + vnode_t *realrootvp; + vnode_t *tvp; + lx_zone_data_t *lxzdata; + lx_virt_disk_t *vd; + vattr_t vattr; + + nodev = vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL); + + if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) + return (error); + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + lxd_xlate_init(); + + /* + * This is the same behavior as with lofs. + * Loopback devices which get "nodevices" added can be done without + * "nodevices" set because we cannot import devices into a zone + * with loopback. Note that we have all zone privileges when + * this happens; if not, we'd have gotten "nosuid". + */ + if (!nodev && vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) + vfs_setmntopt(vfsp, MNTOPT_DEVICES, NULL, VFS_NODISPLAY); + + /* + * Only allow mounting within lx zones. + */ + if (curproc->p_zone->zone_brand != &lx_brand) + return (EINVAL); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* lxd doesn't support read-only mounts */ + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { + error = EINVAL; + goto out; + } + + error = pn_get(uap->dir, + (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn); + if (error != 0) + goto out; + + /* + * Find real root + */ + if ((error = lookupname(uap->spec, (uap->flags & MS_SYSSPACE) ? + UIO_SYSSPACE : UIO_USERSPACE, FOLLOW, NULLVPP, &realrootvp))) { + pn_free(&dpn); + return (error); + } + + if ((error = VOP_ACCESS(realrootvp, 0, 0, cr, NULL)) != 0) { + pn_free(&dpn); + VN_RELE(realrootvp); + return (error); + } + + /* If realroot is not a devfs, error out */ + if (strcmp(realrootvp->v_op->vnop_name, "dev") != 0) { + pn_free(&dpn); + VN_RELE(realrootvp); + return (EINVAL); + } + + lxdm = kmem_zalloc(sizeof (*lxdm), KM_SLEEP); + + /* init but don't bother entering the mutex (not on mount list yet) */ + mutex_init(&lxdm->lxdm_contents, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&lxdm->lxdm_renamelck, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&lxdm->lxdm_attrlck, NULL, MUTEX_DEFAULT, NULL); + + list_create(&lxdm->lxdm_devattrs, sizeof (lxd_dev_attr_t), + offsetof(lxd_dev_attr_t, lxda_link)); + + /* Initialize the hash table mutexes */ + for (i = 0; i < LXD_HASH_SZ; i++) { + mutex_init(&lxdm->lxdm_hash_mutex[i], NULL, MUTEX_DEFAULT, + NULL); + } + + lxdm->lxdm_vfsp = vfsp; + lxdm->lxdm_gen = 1; /* start inode counter at 1 */ + + vfsp->vfs_data = (caddr_t)lxdm; + vfsp->vfs_fstype = lxd_fstype; + vfsp->vfs_dev = lxd_dev; + vfsp->vfs_bsize = PAGESIZE; + vfsp->vfs_flag |= VFS_NOTRUNC; + vfs_make_fsid(&vfsp->vfs_fsid, lxd_dev, lxd_fstype); + lxdm->lxdm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP); + (void) strcpy(lxdm->lxdm_mntpath, dpn.pn_path); + + /* allocate and initialize root lxd_node structure */ + bzero(&rattr, sizeof (struct vattr)); + rattr.va_mode = (mode_t)(S_IFDIR | 0755); + rattr.va_type = VDIR; + rattr.va_rdev = 0; + + tvp = lxd_make_back_node(realrootvp, lxdm); + ldn = VTOLDN(tvp); + + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + LDNTOV(ldn)->v_flag |= VROOT; + + /* + * initialize linked list of lxd_nodes so that the back pointer of + * the root lxd_node always points to the last one on the list + * and the forward pointer of the last node is null + */ + ldn->lxdn_prev = ldn; + ldn->lxdn_next = NULL; + ldn->lxdn_nlink = 0; + lxdm->lxdm_rootnode = ldn; + + ldn->lxdn_nodeid = lxdm->lxdm_gen++; + lxd_dirinit(ldn, ldn); + + rw_exit(&ldn->lxdn_rwlock); + + pn_free(&dpn); + error = 0; + atomic_inc_32(&lxd_mountcount); + + lxzdata = ztolxzd(curproc->p_zone); + ASSERT(lxzdata->lxzd_vdisks != NULL); + + vattr.va_mask = AT_TYPE | AT_MODE; + vattr.va_type = VLNK; + vattr.va_mode = 0777; + + vd = list_head(lxzdata->lxzd_vdisks); + while (vd != NULL) { + if (vd->lxvd_type == LXVD_ZVOL) { + char lnknm[MAXPATHLEN]; + + /* Create a symlink for the actual zvol. */ + (void) snprintf(lnknm, sizeof (lnknm), + "./zvol/dsk/%s", vd->lxvd_real_name); + (void) lxd_symlink(LDNTOV(ldn), vd->lxvd_name, &vattr, + lnknm, cr, NULL, 0); + } else if (vd->lxvd_type == LXVD_ZFS_DS) { + /* + * Create a symlink for the root "disk" using /dev/zfs + * as the target device. + */ + (void) lxd_symlink(LDNTOV(ldn), vd->lxvd_name, &vattr, + "./zfs", cr, NULL, 0); + } + + vd = list_next(lxzdata->lxzd_vdisks, vd); + } + + /* Apply any persistent attribute changes. */ + lxd_apply_db(lxdm); + +out: + if (error == 0) + vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS); + + return (error); +} + +static int +lxd_unmount(struct vfs *vfsp, int flag, struct cred *cr) +{ + lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp); + lxd_node_t *ldn, *cancel; + struct vnode *vp; + int error; + uint_t cnt; + + if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) + return (error); + + mutex_enter(&lxdm->lxdm_contents); + + /* + * In the normal unmount case only the root node would have a reference + * count. + * + * With lxdm_contents held, nothing can be added or removed. + * If we find a previously referenced node, undo the holds we have + * placed and fail EBUSY. + */ + ldn = lxdm->lxdm_rootnode; + + vp = LDNTOV(ldn); + mutex_enter(&vp->v_lock); + + if (flag & MS_FORCE) { + mutex_exit(&vp->v_lock); + mutex_exit(&lxdm->lxdm_contents); + return (EINVAL); + } + + cnt = vp->v_count; + if (cnt > 1) { + mutex_exit(&vp->v_lock); + mutex_exit(&lxdm->lxdm_contents); + return (EBUSY); + } + + mutex_exit(&vp->v_lock); + + /* + * Check for open files. An open file causes everything to unwind. + */ + for (ldn = ldn->lxdn_next; ldn; ldn = ldn->lxdn_next) { + vp = LDNTOV(ldn); + mutex_enter(&vp->v_lock); + cnt = vp->v_count; + if (cnt > 0) { + /* An open file; unwind the holds we've been adding. */ + mutex_exit(&vp->v_lock); + cancel = lxdm->lxdm_rootnode->lxdn_next; + while (cancel != ldn) { + vp = LDNTOV(cancel); + ASSERT(vp->v_count > 0); + VN_RELE(vp); + cancel = cancel->lxdn_next; + } + mutex_exit(&lxdm->lxdm_contents); + return (EBUSY); + } else { + /* + * It may seem incorrect for us to have a vnode with + * a count of 0, but this is modeled on tmpfs and works + * the same way. See lxd_front_inactive. There we allow + * the v_count to go to 0 but rely on the link count to + * keep the vnode alive. Since we now want to cleanup + * these vnodes we manually add a VN_HOLD so that the + * VN_RELEs that occur in the lxd_freevfs() cleanup + * will take us down the lxd_inactive code path. We + * can directly add a VN_HOLD since we have the lock. + */ + vp->v_count++; + mutex_exit(&vp->v_lock); + } + } + + /* + * We can drop the mutex now because + * no one can find this mount anymore + */ + vfsp->vfs_flag |= VFS_UNMOUNTED; + mutex_exit(&lxdm->lxdm_contents); + + return (0); +} + +/* + * Implementation of VFS_FREEVFS(). This is called by the vfs framework after + * umount and the last VFS_RELE, to trigger the release of any resources still + * associated with the given vfs_t. This is normally called immediately after + * lxd_unmount. + */ +void +lxd_freevfs(vfs_t *vfsp) +{ + lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp); + lxd_node_t *ldn; + struct vnode *vp; + lxd_dev_attr_t *da; + + /* + * Free all kmemalloc'd and anonalloc'd memory associated with + * this filesystem. To do this, we go through the file list twice, + * once to remove all the directory entries, and then to remove + * all the pseudo files. + */ + + /* + * Now that we are tearing ourselves down we need to remove the + * UNMOUNTED flag. If we don't, we'll later hit a VN_RELE when we remove + * files from the system causing us to have a negative value. Doing this + * seems a bit better than trying to set a flag on the lxd_mnt_t that + * says we're tearing down. + */ + vfsp->vfs_flag &= ~VFS_UNMOUNTED; + + /* + * Remove all directory entries (this doesn't remove top-level dirs). + */ + for (ldn = lxdm->lxdm_rootnode; ldn; ldn = ldn->lxdn_next) { + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + if (ldn->lxdn_vnode->v_type == VDIR) + lxd_dirtrunc(ldn); + rw_exit(&ldn->lxdn_rwlock); + } + + ASSERT(lxdm->lxdm_rootnode != NULL); + + /* + * All links are gone, v_count is keeping nodes in place. + * VN_RELE should make the node disappear, unless somebody + * is holding pages against it. Nap and retry until it disappears. + * + * We re-acquire the lock to prevent others who have a HOLD on a + * lxd_node from blowing it away (in lxd_inactive) while we're trying + * to get to it here. Once we have a HOLD on it we know it'll stick + * around. + */ + mutex_enter(&lxdm->lxdm_contents); + + /* + * Remove all the files (except the rootnode) backwards. + */ + while ((ldn = lxdm->lxdm_rootnode->lxdn_prev) != lxdm->lxdm_rootnode) { + mutex_exit(&lxdm->lxdm_contents); + /* + * All nodes will be released here. Note we handled the link + * count above. + */ + vp = LDNTOV(ldn); + ASSERT(vp->v_type == VLNK || vp->v_type == VDIR || + vp->v_type == VSOCK); + VN_RELE(vp); + mutex_enter(&lxdm->lxdm_contents); + /* + * It's still there after the RELE. Someone else like pageout + * has a hold on it so wait a bit and then try again - we know + * they'll give it up soon. + */ + if (ldn == lxdm->lxdm_rootnode->lxdn_prev) { + VN_HOLD(vp); + mutex_exit(&lxdm->lxdm_contents); + delay(hz / 4); + mutex_enter(&lxdm->lxdm_contents); + } + } + mutex_exit(&lxdm->lxdm_contents); + + ASSERT(lxdm->lxdm_back_refcnt == 1); + ASSERT(lxdm->lxdm_dent_refcnt == 0); + + VN_RELE(LDNTOV(lxdm->lxdm_rootnode)); + + ASSERT(lxdm->lxdm_mntpath != NULL); + kmem_free(lxdm->lxdm_mntpath, strlen(lxdm->lxdm_mntpath) + 1); + + da = list_remove_head(&lxdm->lxdm_devattrs); + while (da != NULL) { + kmem_free(da, sizeof (lxd_dev_attr_t)); + da = list_remove_head(&lxdm->lxdm_devattrs); + } + list_destroy(&lxdm->lxdm_devattrs); + + mutex_destroy(&lxdm->lxdm_contents); + mutex_destroy(&lxdm->lxdm_renamelck); + mutex_destroy(&lxdm->lxdm_attrlck); + kmem_free(lxdm, sizeof (lxd_mnt_t)); + + /* Allow _fini() to succeed now */ + atomic_dec_32(&lxd_mountcount); +} + +/* + * return root lxdnode for given vnode + */ +static int +lxd_root(struct vfs *vfsp, struct vnode **vpp) +{ + lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp); + lxd_node_t *ldn = lxdm->lxdm_rootnode; + struct vnode *vp; + + ASSERT(ldn != NULL); + + vp = LDNTOV(ldn); + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +lxd_statvfs(struct vfs *vfsp, statvfs64_t *sbp) +{ + lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp); + ulong_t blocks; + dev32_t d32; + zoneid_t eff_zid; + struct zone *zp; + + zp = lxdm->lxdm_vfsp->vfs_zone; + + if (zp == NULL) + eff_zid = GLOBAL_ZONEUNIQID; + else + eff_zid = zp->zone_id; + + sbp->f_bsize = PAGESIZE; + sbp->f_frsize = PAGESIZE; + + /* + * Find the amount of available physical and memory swap + */ + mutex_enter(&anoninfo_lock); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP; + mutex_exit(&anoninfo_lock); + + if (blocks > lxd_minfree) + sbp->f_bfree = blocks - lxd_minfree; + else + sbp->f_bfree = 0; + + sbp->f_bavail = sbp->f_bfree; + + /* + * Total number of blocks is just what's available + */ + sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree); + + if (eff_zid != GLOBAL_ZONEUNIQID && + zp->zone_max_swap_ctl != UINT64_MAX) { + /* + * If the fs is used by a zone with a swap cap, + * then report the capped size. + */ + rctl_qty_t cap, used; + pgcnt_t pgcap, pgused; + + mutex_enter(&zp->zone_mem_lock); + cap = zp->zone_max_swap_ctl; + used = zp->zone_max_swap; + mutex_exit(&zp->zone_mem_lock); + + pgcap = btop(cap); + pgused = btop(used); + + sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree); + sbp->f_bavail = sbp->f_bfree; + sbp->f_blocks = MIN(pgcap, sbp->f_blocks); + } + + /* + * The maximum number of files available is approximately the number + * of lxd_nodes we can allocate from the remaining kernel memory + * available to lxdevfs in this zone. This is fairly inaccurate since + * it doesn't take into account the names stored in the directory + * entries. + */ + sbp->f_ffree = sbp->f_files = ptob(availrmem) / + (sizeof (lxd_node_t) + sizeof (lxd_dirent_t)); + sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); + (void) cmpldev(&d32, vfsp->vfs_dev); + sbp->f_fsid = d32; + (void) strcpy(sbp->f_basetype, vfssw[lxd_fstype].vsw_name); + (void) strncpy(sbp->f_fstr, lxdm->lxdm_mntpath, sizeof (sbp->f_fstr)); + /* ensure null termination */ + sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0'; + sbp->f_flag = vf_to_stf(vfsp->vfs_flag); + sbp->f_namemax = MAXNAMELEN - 1; + return (0); +} + +static void +lxd_pts_devt_translator(dev_t dev, dev_t *jdev) +{ + minor_t min = getminor(dev); + int lx_maj, lx_min; + + /* + * Linux uses a range of major numbers for pts devices to address the + * relatively small minor number space (20 bits). + */ + + lx_maj = LX_PTS_MAJOR_MIN + (min / LX_MAXMIN); + lx_min = min % LX_MAXMIN; + if (lx_maj > LX_PTS_MAJOR_MAX) { + /* + * The major is outside the acceptable range but there's little + * we can presently do about it short of overhauling the + * translation logic. + */ + lx_unsupported("pts major out of translation range"); + } + + *jdev = LX_MAKEDEVICE(lx_maj, lx_min); +} + +/* ARGSUSED */ +static void +lxd_ptm_devt_translator(dev_t dev, dev_t *jdev) +{ + *jdev = LX_MAKEDEVICE(LX_PTM_MAJOR, LX_PTM_MINOR); +} |