/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* * Hyperlofs is a hybrid file system combining features of the tmpfs(7FS) and * lofs(7FS) file systems. It is modeled on code from both of these file * systems. * * The purpose is to create a high performance name space for files on which * applications will compute. Given a large number of data files with various * owners, we want to construct a view onto those files such that only a subset * is visible to the applications and such that the view can be changed very * quickly as compute progresses. Entries in the name space are not mounts and * thus do not appear in the mnttab. Entries in the name space are allowed to * refer to files on different backing file systems. Intermediate directories * in the name space exist only in-memory, ala tmpfs. There are no leaf nodes * in the name space except for entries that refer to backing files ala lofs. * * The name space is managed via ioctls issued on the mounted file system and * is mostly read-only for the compute applications. That is, applications * cannot create new files in the name space. If a file is unlinked by an * application, that only removes the file from the name space, the backing * file remains in place. It is possible for applications to write-through to * the backing files if the file system is mounted read-write. * * The name space is managed via the HYPRLOFS_ADD_ENTRIES, HYPRLOFS_RM_ENTRIES, * and HYPRLOFS_RM_ALL ioctls on the top-level mount. * * The HYPRLOFS_ADD_ENTRIES ioctl specifies path(s) to the backing file(s) and * the name(s) for the file(s) in the name space. The name(s) may be path(s) * which will be relative to the root of the mount and thus cannot begin with * a /. If the name is a path, it does not have to correspond to any backing * path. The intermediate directories will only exist in the name space. The * entry(ies) will be added to the name space. * * The HYPRLOFS_RM_ENTRIES ioctl specifies the name(s) of the file(s) in the * name space which should be removed. The name(s) may be path(s) which will * be relative to the root of the mount and thus cannot begin with a /. The * named entry(ies) will be removed. * * The HYPRLOFS_RM_ALL ioctl will remove all mappings from the name space. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int hyprlofsfstype; /* * hyprlofs vfs operations. */ static int hyprlofsinit(int, char *); static int hyprlofs_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *); static int hyprlofs_unmount(vfs_t *, int, cred_t *); static int hyprlofs_root(vfs_t *, vnode_t **); static int hyprlofs_statvfs(vfs_t *, struct statvfs64 *); static int hyprlofs_vget(vfs_t *, vnode_t **, struct fid *); /* * Loadable module wrapper */ #include static mntopts_t hyprlofs_mntopts; static vfsdef_t vfw = { VFSDEF_VERSION, "hyprlofs", hyprlofsinit, VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT, &hyprlofs_mntopts }; static mntopts_t hyprlofs_mntopts = { 0, NULL }; /* * Module linkage information */ static struct modlfs modlfs = { &mod_fsops, "filesystem for hyprlofs", &vfw }; static struct modlinkage modlinkage = { MODREV_1, &modlfs, NULL }; int _init() { return (mod_install(&modlinkage)); } int _fini() { int error; error = mod_remove(&modlinkage); if (error) return (error); /* * Tear down the operations vectors */ (void) vfs_freevfsops_by_type(hyprlofsfstype); vn_freevnodeops(hyprlofs_vnodeops); return (0); } int _info(struct modinfo *modinfop) { return (mod_info(&modlinkage, modinfop)); } /* * The following are patchable variables limiting the amount of system * resources hyprlofs can use. * * hyprlofs_maxkmem limits the amount of kernel kmem_alloc memory hyprlofs can * use for it's data structures (e.g. hlnodes, directory entries). It is set * as a percentage of physical memory which is determined when hyprlofs is * first used in the system. * * hyprlofs_minfree is the minimum amount of swap space that hyprlofs leaves for * the rest of the system. If the amount of free swap space in the system * (i.e. anoninfo.ani_free) drops below hyprlofs_minfree, hyprlofs anon * allocations will fail. */ size_t hyprlofs_maxkmem = 0; size_t hyprlofs_minfree = 0; size_t hyprlofs_kmemspace; /* bytes of kernel heap used by all hyprlofs */ static major_t hyprlofs_major; static minor_t hyprlofs_minor; static kmutex_t hyprlofs_minor_lock; /* * initialize global hyprlofs locks and hashes when loading hyprlofs module */ static int hyprlofsinit(int fstype, char *name) { static const fs_operation_def_t hl_vfsops_template[] = { VFSNAME_MOUNT, { .vfs_mount = hyprlofs_mount }, VFSNAME_UNMOUNT, { .vfs_unmount = hyprlofs_unmount }, VFSNAME_ROOT, { .vfs_root = hyprlofs_root }, VFSNAME_STATVFS, { .vfs_statvfs = hyprlofs_statvfs }, VFSNAME_VGET, { .vfs_vget = hyprlofs_vget }, NULL, NULL }; int error; extern void hyprlofs_hash_init(); hyprlofs_hash_init(); hyprlofsfstype = fstype; ASSERT(hyprlofsfstype != 0); error = vfs_setfsops(fstype, hl_vfsops_template, NULL); if (error != 0) { cmn_err(CE_WARN, "hyprlofsinit: bad vfs ops template"); return (error); } error = vn_make_ops(name, hyprlofs_vnodeops_template, &hyprlofs_vnodeops); if (error != 0) { (void) vfs_freevfsops_by_type(fstype); cmn_err(CE_WARN, "hyprlofsinit: bad vnode ops template"); return (error); } /* * hyprlofs_minfree is an absolute limit of swap space which still * allows other processes to execute. Set it if its not patched. */ if (hyprlofs_minfree == 0) hyprlofs_minfree = btopr(HYPRLOFSMINFREE); if ((hyprlofs_major = getudev()) == (major_t)-1) { cmn_err(CE_WARN, "hyprlofsinit: Can't get unique device number."); hyprlofs_major = 0; } mutex_init(&hyprlofs_minor_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } static int hyprlofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) { hlfsmount_t *hm = NULL; hlnode_t *hp; struct pathname dpn; int error; vattr_t rattr; int got_attrs; if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) return (error); if (secpolicy_hyprlofs_control(cr) != 0) return (EPERM); if (mvp->v_type != VDIR) return (ENOTDIR); if (uap->flags & MS_REMOUNT) return (EBUSY); mutex_enter(&mvp->v_lock); if ((uap->flags & MS_OVERLAY) == 0 && (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { mutex_exit(&mvp->v_lock); return (EBUSY); } mutex_exit(&mvp->v_lock); /* Having the resource be anything but "swap" doesn't make sense. */ vfs_setresource(vfsp, "swap", 0); if ((error = pn_get(uap->dir, (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn)) != 0) goto out; if ((hm = kmem_zalloc(sizeof (hlfsmount_t), KM_NORMALPRI | KM_NOSLEEP)) == NULL) { pn_free(&dpn); error = ENOMEM; goto out; } /* Get an available minor device number for this mount */ mutex_enter(&hyprlofs_minor_lock); do { hyprlofs_minor = (hyprlofs_minor + 1) & L_MAXMIN32; hm->hlm_dev = makedevice(hyprlofs_major, hyprlofs_minor); } while (vfs_devismounted(hm->hlm_dev)); mutex_exit(&hyprlofs_minor_lock); /* * Set but don't bother entering the mutex since hlfsmount is not on * the mount list yet. */ mutex_init(&hm->hlm_contents, NULL, MUTEX_DEFAULT, NULL); hm->hlm_vfsp = vfsp; vfsp->vfs_data = (caddr_t)hm; vfsp->vfs_fstype = hyprlofsfstype; vfsp->vfs_dev = hm->hlm_dev; vfsp->vfs_bsize = PAGESIZE; vfsp->vfs_flag |= VFS_NOTRUNC; vfs_make_fsid(&vfsp->vfs_fsid, hm->hlm_dev, hyprlofsfstype); hm->hlm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP); (void) strcpy(hm->hlm_mntpath, dpn.pn_path); /* allocate and initialize root hlnode structure */ bzero(&rattr, sizeof (vattr_t)); rattr.va_mode = (mode_t)(S_IFDIR | 0777); rattr.va_type = VDIR; rattr.va_rdev = 0; hp = kmem_zalloc(sizeof (hlnode_t), KM_SLEEP); hyprlofs_node_init(hm, hp, &rattr, cr); /* Get the mode, uid, and gid from the underlying mount point. */ rattr.va_mask = AT_MODE|AT_UID|AT_GID; got_attrs = VOP_GETATTR(mvp, &rattr, 0, cr, NULL); rw_enter(&hp->hln_rwlock, RW_WRITER); HLNTOV(hp)->v_flag |= VROOT; /* * If the getattr succeeded, use its results, otherwise allow the * previously set defaults to prevail. */ if (got_attrs == 0) { hp->hln_mode = rattr.va_mode; hp->hln_uid = rattr.va_uid; hp->hln_gid = rattr.va_gid; } /* * Initialize linked list of hlnodes so that the back pointer of the * root hlnode always points to the last one on the list and the * forward pointer of the last node is null */ hp->hln_back = hp; hp->hln_forw = NULL; hp->hln_nlink = 0; hm->hlm_rootnode = hp; hyprlofs_dirinit(hp, hp); rw_exit(&hp->hln_rwlock); pn_free(&dpn); error = 0; out: return (error); } static int hyprlofs_unmount(vfs_t *vfsp, int flag, cred_t *cr) { hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); hlnode_t *hnp, *cancel; vnode_t *vp; int error; if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) return (error); if (secpolicy_hyprlofs_control(cr) != 0) return (EPERM); /* * forced unmount is not supported by this file system * and thus, ENOTSUP, is being returned. */ if (flag & MS_FORCE) return (ENOTSUP); mutex_enter(&hm->hlm_contents); /* * If there are no open files, only the root node should have a ref cnt. * With hlm_contents held, nothing can be added or removed. There may * be some dirty pages. To prevent fsflush from disrupting the unmount, * put a hold on each node while scanning. If we find a previously * referenced node, undo the holds we have placed and fail EBUSY. */ hnp = hm->hlm_rootnode; if (HLNTOV(hnp)->v_count > 1) { mutex_exit(&hm->hlm_contents); return (EBUSY); } for (hnp = hnp->hln_forw; hnp; hnp = hnp->hln_forw) { if ((vp = HLNTOV(hnp))->v_count > 0) { cancel = hm->hlm_rootnode->hln_forw; while (cancel != hnp) { vp = HLNTOV(cancel); ASSERT(vp->v_count > 0); VN_RELE(vp); cancel = cancel->hln_forw; } mutex_exit(&hm->hlm_contents); return (EBUSY); } VN_HOLD(vp); } /* We can drop the mutex now because no one can find this mount */ mutex_exit(&hm->hlm_contents); /* * Free all alloc'd memory associated with this FS. To do this, we go * through the file list twice, once to remove all the dir entries, and * then to remove all the files. */ /* Remove all directory entries */ for (hnp = hm->hlm_rootnode; hnp; hnp = hnp->hln_forw) { rw_enter(&hnp->hln_rwlock, RW_WRITER); if (hnp->hln_type == VDIR) hyprlofs_dirtrunc(hnp); rw_exit(&hnp->hln_rwlock); } ASSERT(hm->hlm_rootnode); /* * All links are gone, v_count is keeping nodes in place. VN_RELE * should make the node disappear, unless somebody is holding pages * against it. Wait and retry until it disappears. * * We re-acquire the lock to prevent others who have a HOLD on a hlnode * from blowing it away (in hyprlofs_inactive) while we're trying to * get to it here. Once we have a HOLD on it we know it'll stick around. */ mutex_enter(&hm->hlm_contents); /* Remove all the files (except the rootnode) backwards. */ while ((hnp = hm->hlm_rootnode->hln_back) != hm->hlm_rootnode) { mutex_exit(&hm->hlm_contents); /* Note we handled the link count in pass 2 above. */ vp = HLNTOV(hnp); VN_RELE(vp); mutex_enter(&hm->hlm_contents); /* * It's still there after the RELE. Someone else like pageout * has a hold on it so wait a bit and then try again. */ if (hnp == hm->hlm_rootnode->hln_back) { VN_HOLD(vp); mutex_exit(&hm->hlm_contents); delay(hz / 4); mutex_enter(&hm->hlm_contents); } } mutex_exit(&hm->hlm_contents); VN_RELE(HLNTOV(hm->hlm_rootnode)); ASSERT(hm->hlm_mntpath); kmem_free(hm->hlm_mntpath, strlen(hm->hlm_mntpath) + 1); mutex_destroy(&hm->hlm_contents); kmem_free(hm, sizeof (hlfsmount_t)); return (0); } /* Return root hlnode for given vnode */ static int hyprlofs_root(vfs_t *vfsp, vnode_t **vpp) { hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); hlnode_t *hp = hm->hlm_rootnode; vnode_t *vp; ASSERT(hp); vp = HLNTOV(hp); VN_HOLD(vp); *vpp = vp; return (0); } static int hyprlofs_statvfs(vfs_t *vfsp, struct statvfs64 *sbp) { hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); ulong_t blocks; dev32_t d32; zoneid_t eff_zid; struct zone *zp; /* * The FS may have been mounted by the GZ on behalf of the NGZ. In * that case, the hlfsmount zone_id will be the global zone. We want * to show the swap cap inside the zone in this case, even though the * FS was mounted by the GZ. */ if (curproc->p_zone->zone_id != GLOBAL_ZONEUNIQID) zp = curproc->p_zone; else zp = hm->hlm_vfsp->vfs_zone; if (zp == NULL) eff_zid = GLOBAL_ZONEUNIQID; else eff_zid = zp->zone_id; sbp->f_bsize = PAGESIZE; sbp->f_frsize = PAGESIZE; /* * Find the amount of available physical and memory swap */ mutex_enter(&anoninfo_lock); ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP; mutex_exit(&anoninfo_lock); if (blocks > hyprlofs_minfree) sbp->f_bfree = blocks - hyprlofs_minfree; else sbp->f_bfree = 0; sbp->f_bavail = sbp->f_bfree; /* * Total number of blocks is what's available plus what's been used */ sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree); if (eff_zid != GLOBAL_ZONEUNIQID && zp->zone_max_swap_ctl != UINT64_MAX) { /* * If the fs is used by a NGZ with a swap cap, then report the * capped size. */ rctl_qty_t cap, used; pgcnt_t pgcap, pgused; mutex_enter(&zp->zone_mem_lock); cap = zp->zone_max_swap_ctl; used = zp->zone_max_swap; mutex_exit(&zp->zone_mem_lock); pgcap = btop(cap); pgused = btop(used); sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree); sbp->f_bavail = sbp->f_bfree; sbp->f_blocks = MIN(pgcap, sbp->f_blocks); } /* * This is fairly inaccurate since it doesn't take into account the * names stored in the directory entries. */ sbp->f_ffree = sbp->f_files = ptob(availrmem) / (sizeof (hlnode_t) + sizeof (hldirent_t)); sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); (void) cmpldev(&d32, vfsp->vfs_dev); sbp->f_fsid = d32; (void) strcpy(sbp->f_basetype, vfssw[hyprlofsfstype].vsw_name); (void) strncpy(sbp->f_fstr, hm->hlm_mntpath, sizeof (sbp->f_fstr)); /* * ensure null termination */ sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0'; sbp->f_flag = vf_to_stf(vfsp->vfs_flag); sbp->f_namemax = MAXNAMELEN - 1; return (0); } static int hyprlofs_vget(vfs_t *vfsp, vnode_t **vpp, struct fid *fidp) { hlfid_t *hfid; hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); hlnode_t *hp = NULL; hfid = (hlfid_t *)fidp; *vpp = NULL; mutex_enter(&hm->hlm_contents); for (hp = hm->hlm_rootnode; hp; hp = hp->hln_forw) { mutex_enter(&hp->hln_tlock); if (hp->hln_nodeid == hfid->hlfid_ino) { /* * If the gen numbers don't match we know the file * won't be found since only one hlnode can have this * number at a time. */ if (hp->hln_gen != hfid->hlfid_gen || hp->hln_nlink == 0) { mutex_exit(&hp->hln_tlock); mutex_exit(&hm->hlm_contents); return (0); } *vpp = (vnode_t *)HLNTOV(hp); VN_HOLD(*vpp); if ((hp->hln_mode & S_ISVTX) && !(hp->hln_mode & (S_IXUSR | S_IFDIR))) { mutex_enter(&(*vpp)->v_lock); (*vpp)->v_flag |= VISSWAP; mutex_exit(&(*vpp)->v_lock); } mutex_exit(&hp->hln_tlock); mutex_exit(&hm->hlm_contents); return (0); } mutex_exit(&hp->hln_tlock); } mutex_exit(&hm->hlm_contents); return (0); }