diff options
author | Jerry Jelinek <jerry.jelinek@joyent.com> | 2015-12-28 18:00:01 +0000 |
---|---|---|
committer | Jerry Jelinek <jerry.jelinek@joyent.com> | 2015-12-28 18:00:01 +0000 |
commit | 673d120c71e85f5f52136d47d1e44282d0e41632 (patch) | |
tree | dbca459bc3367d492053c4c00975c1343742e9ce | |
parent | 180116059a3bc57660669b92f988a75f95d54257 (diff) | |
download | illumos-joyent-673d120c71e85f5f52136d47d1e44282d0e41632.tar.gz |
OS-4665 LX brand want devfs which allows symlinks to devices in root of /dev
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Alex Wilson <alex.wilson@joyent.com>
-rw-r--r-- | manifest | 1 | ||||
-rw-r--r-- | usr/src/lib/brand/lx/zone/platform.xml | 2 | ||||
-rw-r--r-- | usr/src/uts/common/brand/lx/devfs/lxd.h | 237 | ||||
-rw-r--r-- | usr/src/uts/common/brand/lx/devfs/lxd_node.c | 1004 | ||||
-rw-r--r-- | usr/src/uts/common/brand/lx/devfs/lxd_vfsops.c | 781 | ||||
-rw-r--r-- | usr/src/uts/common/brand/lx/devfs/lxd_vnops.c | 1453 | ||||
-rw-r--r-- | usr/src/uts/intel/Makefile.files | 5 | ||||
-rw-r--r-- | usr/src/uts/intel/Makefile.intel | 2 | ||||
-rw-r--r-- | usr/src/uts/intel/lx_devfs/Makefile | 57 | ||||
-rw-r--r-- | usr/src/uts/intel/lx_devfs/Makefile.rules | 21 |
10 files changed, 3561 insertions, 2 deletions
@@ -4571,6 +4571,7 @@ d usr/kernel/fs/amd64 0755 root sys f usr/kernel/fs/amd64/fdfs 0755 root sys f usr/kernel/fs/amd64/lxautofs 0755 root sys f usr/kernel/fs/amd64/lx_cgroup 0755 root sys +f usr/kernel/fs/amd64/lx_devfs 0755 root sys f usr/kernel/fs/amd64/lx_proc 0755 root sys f usr/kernel/fs/amd64/lx_sysfs 0755 root sys f usr/kernel/fs/amd64/pcfs 0755 root sys diff --git a/usr/src/lib/brand/lx/zone/platform.xml b/usr/src/lib/brand/lx/zone/platform.xml index cb3c9bb124..049ebbfd18 100644 --- a/usr/src/lib/brand/lx/zone/platform.xml +++ b/usr/src/lib/brand/lx/zone/platform.xml @@ -57,7 +57,7 @@ directory="/native/etc/zones/%z.xml" opt="ro" type="lofs" /> <!-- Local filesystems to mount when booting the zone --> - <mount special="/native/dev" directory="/dev" type="lofs" /> + <mount special="/native/dev" directory="/dev" type="lx_devfs" /> <mount special="proc" directory="/native/proc" type="proc" /> <mount special="swap" directory="/native/etc/svc/volatile" type="tmpfs" /> diff --git a/usr/src/uts/common/brand/lx/devfs/lxd.h b/usr/src/uts/common/brand/lx/devfs/lxd.h new file mode 100644 index 0000000000..add9515891 --- /dev/null +++ b/usr/src/uts/common/brand/lx/devfs/lxd.h @@ -0,0 +1,237 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _LXD_H +#define _LXD_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lxd.h: declarations, data structures and macros for lxd (lxd devfs). 
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/policy.h>
+#include <sys/dirent.h>
+#include <sys/errno.h>
+#include <sys/kmem.h>
+#include <sys/pathname.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/statvfs.h>
+#include <sys/cmn_err.h>
+#include <sys/zone.h>
+#include <sys/uio.h>
+#include <sys/utsname.h>
+#include <sys/atomic.h>
+#include <vm/anon.h>
+#include <sys/lx_types.h>
+
+#if defined(_KERNEL)
+
+#include <sys/lx_brand.h>
+
+/*
+ * It's unlikely that we need to create more than 50-60 subdirs/symlinks
+ * as front files so we size the file system hash for 2x that number.
+ * The back devfs typically has ~80 nodes so this is also a comfortable size
+ * for the back hash table.
+ *
+ * LXD_BACK_HASH maps a "back" (real) vnode pointer to a bucket index by
+ * shifting the pointer value right 10 bits and masking with
+ * (LXD_HASH_SZ - 1). Both hash macros reduce with that mask, so they only
+ * cover every bucket while LXD_HASH_SZ remains a power of two.
+ *
+ * LXD_NM_HASH computes the bucket index for a (parent node, name) pair. It
+ * expands to a brace-enclosed block rather than an expression and stores its
+ * result in the 'hash' argument: the value is seeded from the parent node
+ * pointer (converted to uint_t, then shifted right 8 bits) and each character
+ * of the name is folded in as hash = hash * 17 + c before the final mask.
+ * The 'name' and 'hash' arguments are evaluated more than once.
+ */
+#define	LXD_HASH_SZ	128
+
+#define	LXD_BACK_HASH(v)	((((intptr_t)(v)) >> 10) & ((LXD_HASH_SZ) - 1))
+
+#define	LXD_NM_HASH(ldn, name, hash)				\
+	{							\
+		char Xc, *Xcp;					\
+		hash = (uint_t)(uintptr_t)(ldn) >> 8;		\
+		for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++)	\
+			hash = (hash << 4) + hash + (uint_t)Xc;	\
+		hash &= (LXD_HASH_SZ - 1);			\
+	}
+
+
+enum lxd_node_type { LXDNT_NONE, LXDNT_BACK, LXDNT_FRONT };
+
+/*
+ * lxd per-mount data structure.
+ *
+ * All fields are protected by lxdm_contents.
+ * File renames on a specific file system are protected by lxdm_renamelck.
+ */
+typedef struct lxd_mnt {
+	struct vfs	*lxdm_vfsp;	/* filesystem's vfs struct */
+	struct lxd_node *lxdm_rootnode;	/* root lxd_node */
+	char		*lxdm_mntpath;	/* name of lxd mount point */
+	dev_t		lxdm_dev;	/* unique dev # of mounted `device' */
+	kmutex_t	lxdm_contents;	/* per-mount lock */
+	kmutex_t	lxdm_renamelck;	/* rename lock for this mount */
+	uint_t		lxdm_gen;	/* node ID source for files */
+
+	/* protects buckets in both "dir ent" and "back" hash tables */
+	kmutex_t	lxdm_hash_mutex[LXD_HASH_SZ];
+
+	/* per-mount data for "back" vnodes in the fs */
+	uint_t		lxdm_back_refcnt;	/* # outstanding "back" vnodes */
+	struct lxd_node	*lxdm_back_htable[LXD_HASH_SZ];
+
+	/*
+	 * Per-mount directory data for "front" nodes in the fs.
+	 * Each front node has a directory entry but directory entries can live
+	 * on either front or back nodes.
+	 */
+	uint_t		lxdm_dent_refcnt;	/* # outstanding dir ents */
+	struct lxd_dirent *lxdm_dent_htable[LXD_HASH_SZ];
+} lxd_mnt_t;
+
+/*
+ * lxd_node is the file system dependent node for lxd.
+ *
+ * The node is used to represent both front and back files. For front files
+ * the node can represent either a directory or symlink. 
+ */ +typedef struct lxd_node { + enum lxd_node_type lxdn_type; + + /* Data for "front" nodes */ + struct lxd_node *lxdn_prev; /* lnked lst of lxd nodes */ + struct lxd_node *lxdn_next; /* lnked lst of lxd nodes */ + struct lxd_node *lxdn_parent; /* dir containing this node */ + krwlock_t lxdn_rwlock; /* serialize mods/dir updates */ + kmutex_t lxdn_tlock; /* time, flag, and nlink lock */ + + /* these could be in a union ala tmpfs but not really necessary */ + uint_t lxdn_dirents; /* number of dirents */ + struct lxd_dirent *lxdn_dir; /* dirent list */ + char *lxdn_symlink; /* pointer to symlink */ + struct vattr lxdn_attr; /* attributes */ + + /* Hash table link */ + struct lxd_node *lxdn_hnxt; /* link in per-mount entry */ + /* hash table */ + vnode_t *lxdn_vnode; /* vnode for this lxd_node */ + + vnode_t *lxdn_real_vp; /* back file - real vnode */ +} lxd_node_t; + +/* + * Attributes + */ +#define lxdn_mask lxdn_attr.va_mask +#define lxdn_mode lxdn_attr.va_mode +#define lxdn_uid lxdn_attr.va_uid +#define lxdn_gid lxdn_attr.va_gid +#define lxdn_fsid lxdn_attr.va_fsid +#define lxdn_nodeid lxdn_attr.va_nodeid +#define lxdn_nlink lxdn_attr.va_nlink +#define lxdn_size lxdn_attr.va_size +#define lxdn_atime lxdn_attr.va_atime +#define lxdn_mtime lxdn_attr.va_mtime +#define lxdn_ctime lxdn_attr.va_ctime +#define lxdn_rdev lxdn_attr.va_rdev +#define lxdn_blksize lxdn_attr.va_blksize +#define lxdn_nblocks lxdn_attr.va_nblocks +#define lxdn_seq lxdn_attr.va_seq + +/* + * lx devfs conversion macros + */ +#define VFSTOLXDM(vfsp) ((lxd_mnt_t *)(vfsp)->vfs_data) +#define VTOLXDM(vp) ((lxd_mnt_t *)(vp)->v_vfsp->vfs_data) +#define VTOLDN(vp) ((lxd_node_t *)(vp)->v_data) +#define LDNTOV(ln) ((ln)->lxdn_vnode) +#define ldnode_hold(ln) VN_HOLD(LDNTOV(ln)) +#define ldnode_rele(ln) VN_RELE(LDNTOV(ln)) + +#define REALVP(vp) (VTOLDN(vp)->lxdn_real_vp) + +/* + * front directories are made up of a linked list of lxd_dirent structures + * hanging off directory lxdn_nodes. 
File names are not fixed length, but are + * null terminated. + */ +typedef struct lxd_dirent { + lxd_node_t *lddir_node; /* lxd node for this file */ + struct lxd_dirent *lddir_next; /* next directory entry */ + struct lxd_dirent *lddir_prev; /* prev directory entry */ + uint_t lddir_offset; /* "offset" of dir entry */ + uint_t lddir_hash; /* a hash of lddir_name */ + struct lxd_dirent *lddir_link; /* linked via hash table */ + lxd_node_t *lddir_parent; /* parent, dir we are in */ + char *lddir_name; /* null terminated */ +} lxd_dirent_t; + +enum de_op { DE_CREATE, DE_MKDIR, DE_RENAME }; /* direnter ops */ +enum dr_op { DR_REMOVE, DR_RMDIR, DR_RENAME }; /* dirremove ops */ + +#define LX_MAJORSHIFT 8 +#define LX_MINORMASK ((1 << LX_MAJORSHIFT) - 1) +#define LX_MAKEDEVICE(lx_maj, lx_min) \ + ((lx_dev_t)((lx_maj) << LX_MAJORSHIFT | ((lx_min) & LX_MINORMASK))) + +typedef struct lxd_minor_translator { + char *lxd_mt_path; /* illumos minor node path */ + minor_t lxd_mt_minor; /* illumos minor node number */ + int lxd_mt_lx_major; /* linux major node number */ + int lxd_mt_lx_minor; /* linux minor node number */ +} lxd_minor_translator_t; + +enum lxd_xl_tp { DTT_INVALID, DTT_LIST, DTT_CUSTOM }; + +#define xl_list lxd_xl_minor.lxd_xl_list +#define xl_custom lxd_xl_minor.lxd_xl_custom + +typedef struct lxd_devt_translator { + char *lxd_xl_driver; /* driver name */ + major_t lxd_xl_major; /* driver number */ + + enum lxd_xl_tp lxd_xl_type; /* dictates how we intrep. 
xl_minor */ + union { + uintptr_t lxd_xl_foo; /* required to compile */ + lxd_minor_translator_t *lxd_xl_list; + int (*lxd_xl_custom)(dev_t, lx_dev_t *); + } lxd_xl_minor; +} lxd_devt_translator_t; + +extern struct vnodeops *lxd_vnodeops; +extern lxd_devt_translator_t lxd_devt_translators[]; + +vnode_t *lxd_make_back_node(vnode_t *, lxd_mnt_t *); +void lxd_free_back_node(lxd_node_t *); +int lxd_dirdelete(lxd_node_t *, lxd_node_t *, char *, enum dr_op, cred_t *); +int lxd_direnter(lxd_mnt_t *, lxd_node_t *, char *, enum de_op, lxd_node_t *, + lxd_node_t *, struct vattr *, lxd_node_t **, cred_t *, + caller_context_t *); +void lxd_dirinit(lxd_node_t *, lxd_node_t *, cred_t *); +int lxd_dirlookup(lxd_node_t *, char *, lxd_node_t **, cred_t *); +void lxd_dirtrunc(lxd_node_t *); +void lxd_node_init(lxd_mnt_t *, lxd_node_t *, vnode_t *, vattr_t *, cred_t *); +int lxd_naccess(void *, int, cred_t *); + +#endif /* KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _LXD_H */ diff --git a/usr/src/uts/common/brand/lx/devfs/lxd_node.c b/usr/src/uts/common/brand/lx/devfs/lxd_node.c new file mode 100644 index 0000000000..9e67f988bc --- /dev/null +++ b/usr/src/uts/common/brand/lx/devfs/lxd_node.c @@ -0,0 +1,1004 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/policy.h> +#include <sys/sdt.h> + +#include "lxd.h" + +#define LXD_HASH_SIZE 8192 /* must be power of 2 */ +#define LXD_MUTEX_SIZE 64 + + +#define MODESHIFT 3 + +typedef enum lxd_nodehold { + NOHOLD, + HOLD +} lxd_nodehold_t; + +/* + * The following functions maintain the per-mount "front" files. + */ +static void +lxd_save_dirent(lxd_dirent_t *de) +{ + lxd_mnt_t *lxdm = VTOLXDM(LDNTOV(de->lddir_parent)); + uint_t hash; + kmutex_t *hmtx; + + LXD_NM_HASH(de->lddir_parent, de->lddir_name, hash); + de->lddir_hash = hash; + + hmtx = &lxdm->lxdm_hash_mutex[hash]; + + mutex_enter(hmtx); + ASSERT(de->lddir_link == NULL); + de->lddir_link = lxdm->lxdm_dent_htable[hash]; + lxdm->lxdm_dent_htable[hash] = de; + mutex_exit(hmtx); + + atomic_inc_32(&lxdm->lxdm_dent_refcnt); +} + +static void +lxd_rm_dirent(lxd_dirent_t *de) +{ + lxd_mnt_t *lxdm = VTOLXDM(LDNTOV(de->lddir_parent)); + uint_t hash; + lxd_dirent_t **prevpp; + kmutex_t *hmtx; + + hash = de->lddir_hash; + hmtx = &lxdm->lxdm_hash_mutex[hash]; + + mutex_enter(hmtx); + prevpp = &lxdm->lxdm_dent_htable[hash]; + while (*prevpp != de) + prevpp = &(*prevpp)->lddir_link; + *prevpp = de->lddir_link; + de->lddir_link = NULL; + mutex_exit(hmtx); + + ASSERT(lxdm->lxdm_dent_refcnt > 0); + atomic_dec_32(&lxdm->lxdm_dent_refcnt); +} + +static lxd_dirent_t * +lxd_find_dirent(char *name, lxd_node_t *parent, lxd_nodehold_t do_hold, + lxd_node_t **found) +{ + lxd_mnt_t *lxdm = VTOLXDM(LDNTOV(parent)); + lxd_dirent_t *de; + uint_t hash; + kmutex_t *hmtx; + + LXD_NM_HASH(parent, name, hash); + hmtx = &lxdm->lxdm_hash_mutex[hash]; + + mutex_enter(hmtx); + de = lxdm->lxdm_dent_htable[hash]; + while (de) { + if (de->lddir_hash == hash 
&& de->lddir_parent == parent && + strcmp(de->lddir_name, name) == 0) { + lxd_node_t *ldn = de->lddir_node; + + if (do_hold == HOLD) { + ASSERT(ldn != NULL); + ldnode_hold(ldn); + } + if (found != NULL) + *found = ldn; + mutex_exit(hmtx); + return (de); + } + + de = de->lddir_link; + } + mutex_exit(hmtx); + return (NULL); +} + +int +lxd_naccess(void *vcp, int mode, cred_t *cr) +{ + lxd_node_t *ldn = vcp; + int shift = 0; + /* + * Check access based on owner, group and public perms in lxd_node. + */ + if (crgetuid(cr) != ldn->lxdn_uid) { + shift += MODESHIFT; + if (groupmember(ldn->lxdn_gid, cr) == 0) + shift += MODESHIFT; + } + + if (ldn->lxdn_type == LXDNT_FRONT) + return (secpolicy_vnode_access2(cr, LDNTOV(ldn), + ldn->lxdn_uid, ldn->lxdn_mode << shift, mode)); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + return (VOP_ACCESS(ldn->lxdn_real_vp, mode, 0, cr, NULL)); +} + +static lxd_node_t * +lxd_find_back(struct vnode *vp, uint_t hash, lxd_mnt_t *lxdm) +{ + lxd_node_t *l; + + ASSERT(MUTEX_HELD(&lxdm->lxdm_hash_mutex[hash])); + + for (l = lxdm->lxdm_back_htable[hash]; l != NULL; l = l->lxdn_hnxt) { + if (l->lxdn_real_vp == vp) { + ASSERT(l->lxdn_type == LXDNT_BACK); + + VN_HOLD(LDNTOV(l)); + return (l); + } + } + return (NULL); +} + +static void +lxd_save_back(lxd_node_t *l, uint_t hash, lxd_mnt_t *lxdm) +{ + ASSERT(l->lxdn_type == LXDNT_BACK); + ASSERT(l->lxdn_real_vp != NULL); + ASSERT(MUTEX_HELD(&lxdm->lxdm_hash_mutex[hash])); + + atomic_inc_32(&lxdm->lxdm_back_refcnt); + + l->lxdn_hnxt = lxdm->lxdm_back_htable[hash]; + lxdm->lxdm_back_htable[hash] = l; +} + + +struct vnode * +lxd_make_back_node(struct vnode *vp, lxd_mnt_t *lxdm) +{ + uint_t hash; + kmutex_t *hmtx; + lxd_node_t *l; + + hash = LXD_BACK_HASH(vp); /* Note: hashing with realvp */ + hmtx = &lxdm->lxdm_hash_mutex[hash]; + mutex_enter(hmtx); + + l = lxd_find_back(vp, hash, lxdm); + if (l == NULL) { + vnode_t *nvp; + + l = kmem_zalloc(sizeof (lxd_node_t), KM_SLEEP); + nvp = vn_alloc(KM_SLEEP); + + 
rw_init(&l->lxdn_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&l->lxdn_tlock, NULL, MUTEX_DEFAULT, NULL); + + l->lxdn_vnode = nvp; + l->lxdn_type = LXDNT_BACK; + l->lxdn_real_vp = vp; + + VN_SET_VFS_TYPE_DEV(nvp, lxdm->lxdm_vfsp, vp->v_type, + vp->v_rdev); + nvp->v_flag |= (vp->v_flag & (VNOMOUNT|VNOMAP|VDIROPEN)); + vn_setops(nvp, lxd_vnodeops); + nvp->v_data = (caddr_t)l; + + lxd_save_back(l, hash, lxdm); + vn_exists(vp); + } else { + VN_RELE(vp); + } + + mutex_exit(hmtx); + return (LDNTOV(l)); +} + +void +lxd_free_back_node(lxd_node_t *lp) +{ + uint_t hash; + kmutex_t *hmtx; + lxd_node_t *l; + lxd_node_t *lprev = NULL; + vnode_t *vp = LDNTOV(lp); + vnode_t *realvp = REALVP(vp); + lxd_mnt_t *lxdm = VTOLXDM(vp); + + /* in lxd_make_back_node we call lxd_find_back with the realvp */ + hash = LXD_BACK_HASH(realvp); + hmtx = &lxdm->lxdm_hash_mutex[hash]; + mutex_enter(hmtx); + + mutex_enter(&vp->v_lock); + if (vp->v_count > 1) { + vp->v_count--; /* release our hold from vn_rele */ + mutex_exit(&vp->v_lock); + mutex_exit(hmtx); + return; + } + mutex_exit(&vp->v_lock); + + for (l = lxdm->lxdm_back_htable[hash]; l != NULL; + lprev = l, l = l->lxdn_hnxt) { + + if (l != lp) + continue; + + ASSERT(l->lxdn_type == LXDNT_BACK); + ASSERT(lxdm->lxdm_back_refcnt > 0); + + atomic_dec_32(&lxdm->lxdm_back_refcnt); + vn_invalid(vp); + + if (lprev == NULL) { + lxdm->lxdm_back_htable[hash] = l->lxdn_hnxt; + } else { + lprev->lxdn_hnxt = l->lxdn_hnxt; + } + + mutex_exit(hmtx); + rw_destroy(&l->lxdn_rwlock); + mutex_destroy(&l->lxdn_tlock); + kmem_free(l, sizeof (lxd_node_t)); + vn_free(vp); + VN_RELE(realvp); + return; + } + + panic("lxd_free_back_node"); + /*NOTREACHED*/ +} +/* + * Search directory 'parent' for entry 'name'. + * + * 0 is returned on success and *foundcp points + * to the found lxd_node with its vnode held. 
+ */ +int +lxd_dirlookup(lxd_node_t *parent, char *name, lxd_node_t **foundnp, cred_t *cr) +{ + int error; + + *foundnp = NULL; + if (parent->lxdn_vnode->v_type != VDIR) + return (ENOTDIR); + + if ((error = lxd_naccess(parent, VEXEC, cr))) + return (error); + + if (*name == '\0') { + ldnode_hold(parent); + *foundnp = parent; + return (0); + } + + /* + * Search the directory for the matching name + * We need the lock protecting the lxdn_dir list + * so that it doesn't change out from underneath us. + * lxd_find_dirent() will pass back the lxd_node + * with a hold on it. + */ + + if (lxd_find_dirent(name, parent, HOLD, foundnp) != NULL) { + ASSERT(*foundnp); + return (0); + } + + return (ENOENT); +} + +/* + * Check if the source directory is in the path of the target directory. + * The target directory is locked by the caller. + */ +static int +lxd_dircheckpath(lxd_node_t *fromnode, lxd_node_t *toparent, cred_t *cr) +{ + int error = 0; + lxd_node_t *dir, *dotdot; + + ASSERT(RW_WRITE_HELD(&toparent->lxdn_rwlock)); + ASSERT(toparent->lxdn_vnode->v_type == VDIR); + + dotdot = toparent->lxdn_parent; + if (dotdot == NULL) + return (ENOENT); + ldnode_hold(dotdot); + + if (dotdot == toparent) { + /* root of fs. search trivially satisfied. */ + ldnode_rele(dotdot); + return (0); + } + + for (;;) { + /* + * Return error for cases like "mv c c/d", + * "mv c c/d/e" and so on. + */ + if (dotdot == fromnode) { + ldnode_rele(dotdot); + error = EINVAL; + break; + } + + dir = dotdot; + dotdot = dir->lxdn_parent; + if (dotdot == NULL) { + ldnode_rele(dir); + error = ENOENT; + break; + } + ldnode_hold(dotdot); + + /* + * We're okay if we traverse the directory tree up to + * the root directory and don't run into the + * parent directory. 
+ */ + if (dir == dotdot) { + ldnode_rele(dir); + ldnode_rele(dotdot); + break; + } + ldnode_rele(dir); + } + + return (error); +} + +static int +lxd_dir_make_node(lxd_node_t *dir, lxd_mnt_t *lxdm, struct vattr *va, + enum de_op op, lxd_node_t **newnode, struct cred *cred) +{ + lxd_node_t *ldn; + + ASSERT(va != NULL); + + if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) || + ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime))) + return (EOVERFLOW); + + ldn = kmem_zalloc(sizeof (lxd_node_t), KM_SLEEP); + + ldn->lxdn_type = LXDNT_FRONT; + lxd_node_init(lxdm, ldn, NULL, va, cred); + + ldn->lxdn_vnode->v_rdev = ldn->lxdn_rdev = NODEV; + ldn->lxdn_vnode->v_type = va->va_type; + ldn->lxdn_uid = crgetuid(cred); + ldn->lxdn_gid = crgetgid(cred); + ldn->lxdn_nodeid = lxdm->lxdm_gen++; + + if (va->va_mask & AT_ATIME) + ldn->lxdn_atime = va->va_atime; + if (va->va_mask & AT_MTIME) + ldn->lxdn_mtime = va->va_mtime; + + if (op == DE_MKDIR) { + lxd_dirinit(dir, ldn, cred); + } + + *newnode = ldn; + return (0); +} + +static int +lxd_diraddentry(lxd_node_t *dir, lxd_node_t *ldn, char *name, enum de_op op) +{ + lxd_dirent_t *dp, *pdp; + size_t namelen, alloc_size; + timestruc_t now; + + /* + * Make sure the parent directory wasn't removed from + * underneath the caller. + */ + if (dir->lxdn_dir == NULL) + return (ENOENT); + + /* Check that everything is on the same filesystem. */ + if (ldn->lxdn_vnode->v_vfsp != dir->lxdn_vnode->v_vfsp) + return (EXDEV); + + /* Allocate and initialize directory entry */ + namelen = strlen(name) + 1; + alloc_size = namelen + sizeof (lxd_dirent_t); + dp = kmem_zalloc(alloc_size, KM_NOSLEEP | KM_NORMALPRI); + if (dp == NULL) + return (ENOSPC); + + ldn->lxdn_parent = dir; + + dir->lxdn_size += alloc_size; + dir->lxdn_dirents++; + dp->lddir_node = ldn; + dp->lddir_parent = dir; + + /* The directory entry and its name were allocated sequentially. 
*/ + dp->lddir_name = (char *)dp + sizeof (lxd_dirent_t); + (void) strcpy(dp->lddir_name, name); + + lxd_save_dirent(dp); + + /* + * Some utilities expect the size of a directory to remain + * somewhat static. For example, a routine which removes + * subdirectories between calls to readdir(); the size of the + * directory changes from underneath it and so the real + * directory offset in bytes is invalid. To circumvent + * this problem, we initialize a directory entry with an + * phony offset, and use this offset to determine end of + * file in lxd_readdir. + */ + pdp = dir->lxdn_dir->lddir_prev; + /* + * Install at first empty "slot" in directory list. + */ + while (pdp->lddir_next != NULL && + (pdp->lddir_next->lddir_offset - pdp->lddir_offset) <= 1) { + ASSERT(pdp->lddir_next != pdp); + ASSERT(pdp->lddir_prev != pdp); + ASSERT(pdp->lddir_next->lddir_offset > pdp->lddir_offset); + pdp = pdp->lddir_next; + } + dp->lddir_offset = pdp->lddir_offset + 1; + + /* + * If we're at the end of the dirent list and the offset (which + * is necessarily the largest offset in this directory) is more + * than twice the number of dirents, that means the directory is + * 50% holes. At this point we reset the slot pointer back to + * the beginning of the directory so we start using the holes. + * The idea is that if there are N dirents, there must also be + * N holes, so we can satisfy the next N creates by walking at + * most 2N entries; thus the average cost of a create is constant. + * Note that we use the first dirent's lddir_prev as the roving + * slot pointer; it's ugly, but it saves a word in every dirent. 
+ */ + if (pdp->lddir_next == NULL && + pdp->lddir_offset > 2 * dir->lxdn_dirents) + dir->lxdn_dir->lddir_prev = dir->lxdn_dir->lddir_next; + else + dir->lxdn_dir->lddir_prev = dp; + + ASSERT(pdp->lddir_next != pdp); + ASSERT(pdp->lddir_prev != pdp); + + dp->lddir_next = pdp->lddir_next; + if (dp->lddir_next) { + dp->lddir_next->lddir_prev = dp; + } + dp->lddir_prev = pdp; + pdp->lddir_next = dp; + + ASSERT(dp->lddir_next != dp); + ASSERT(dp->lddir_prev != dp); + ASSERT(pdp->lddir_next != pdp); + ASSERT(pdp->lddir_prev != pdp); + + gethrestime(&now); + dir->lxdn_mtime = now; + dir->lxdn_ctime = now; + + return (0); +} + +/* + * Enter a directory entry for 'name' into directory 'dir' + * + * Returns 0 on success. + */ +int +lxd_direnter( + lxd_mnt_t *lxdm, + lxd_node_t *dir, /* target directory to make entry in */ + char *name, /* name of entry */ + enum de_op op, /* entry operation */ + lxd_node_t *fromparent, /* original directory if rename */ + lxd_node_t *ldn, /* existing lxd_node, if rename */ + struct vattr *va, + lxd_node_t **rnp, /* return lxd_node, if create/mkdir */ + cred_t *cr, + caller_context_t *ctp) +{ + lxd_dirent_t *dirp; + lxd_node_t *found = NULL; + int error = 0; + char *s; + + /* lxdn_rwlock is held to serialize direnter and dirdeletes */ + ASSERT(RW_WRITE_HELD(&dir->lxdn_rwlock)); + ASSERT(dir->lxdn_vnode->v_type == VDIR); + + /* + * Don't allow '/' characters in pathname component, + */ + for (s = name; *s; s++) + if (*s == '/') + return (EACCES); + + if (name[0] == '\0') + panic("lxd_direnter: NULL name"); + + /* + * For rename lock the source entry and check the link count + * to see if it has been removed while it was unlocked. 
+ */ + if (op == DE_RENAME) { + mutex_enter(&ldn->lxdn_tlock); + if (ldn->lxdn_nlink == 0) { + mutex_exit(&ldn->lxdn_tlock); + return (ENOENT); + } + + if (ldn->lxdn_nlink == MAXLINK) { + mutex_exit(&ldn->lxdn_tlock); + return (EMLINK); + } + ldn->lxdn_nlink++; + gethrestime(&ldn->lxdn_ctime); + mutex_exit(&ldn->lxdn_tlock); + } + + /* + * This might be a "dangling detached directory" (it could have been + * removed, but a reference to it kept in u_cwd). Don't bother + * searching it, and with any luck the user will get tired of dealing + * with us and cd to some absolute pathway (thus in ufs, too). + */ + if (dir->lxdn_nlink == 0) { + error = ENOENT; + goto out; + } + + /* + * If this is a rename of a directory and the parent is different + * (".." must be changed), then the source directory must not be in the + * directory hierarchy above the target, as this would orphan + * everything below the source directory. + */ + if (op == DE_RENAME) { + if (ldn == dir) { + error = EINVAL; + goto out; + } + if ((ldn->lxdn_vnode->v_type) == VDIR) { + if ((fromparent != dir) && + (error = lxd_dircheckpath(ldn, dir, cr)) != 0) { + goto out; + } + } + } + + /* Search for an existing entry. */ + dirp = lxd_find_dirent(name, dir, HOLD, &found); + if (dirp != NULL) { + ASSERT(found != NULL); + switch (op) { + case DE_CREATE: + case DE_MKDIR: + if (rnp != NULL) { + *rnp = found; + error = EEXIST; + } else { + ldnode_rele(found); + } + break; + + case DE_RENAME: + /* + * Note that we only hit this path when we're renaming + * a symlink from one directory to another and there is + * a pre-existing symlink as the target. lxd_rename + * will unlink the src from the original directory but + * here we need to unlink the dest that we collided + * with, then create the new directory entry as we do + * below when there is no pre-existing symlink. 
+ */ + if ((error = lxd_naccess(dir, VWRITE, cr)) != 0) + goto out; + + ASSERT(found->lxdn_vnode->v_type == VLNK); + /* dir rw lock is already held and asserted above */ + rw_enter(&found->lxdn_rwlock, RW_WRITER); + error = lxd_dirdelete(dir, found, name, DR_RENAME, cr); + rw_exit(&found->lxdn_rwlock); + ldnode_rele(found); + if (error != 0) + goto out; + + error = lxd_diraddentry(dir, ldn, name, op); + if (error == 0 && rnp != NULL) + *rnp = ldn; + break; + } + } else { + + /* + * The directory entry does not exist, but the node might if + * this is a rename. Check write permission in directory to + * see if entry can be created. + */ + if ((error = lxd_naccess(dir, VWRITE, cr)) != 0) + goto out; + if (op == DE_CREATE || op == DE_MKDIR) { + /* + * Make new lxd_node and directory entry as required. + */ + error = lxd_dir_make_node(dir, lxdm, va, op, &ldn, cr); + if (error) + goto out; + } + + error = lxd_diraddentry(dir, ldn, name, op); + if (error != 0) { + if (op == DE_CREATE || op == DE_MKDIR) { + /* + * Unmake the inode we just made. + */ + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + if ((ldn->lxdn_vnode->v_type) == VDIR) { + ASSERT(dirp == NULL); + /* + * cleanup allocs made by lxd_dirinit + */ + lxd_dirtrunc(ldn); + } + mutex_enter(&ldn->lxdn_tlock); + ldn->lxdn_nlink = 0; + gethrestime(&ldn->lxdn_ctime); + mutex_exit(&ldn->lxdn_tlock); + rw_exit(&ldn->lxdn_rwlock); + ldnode_rele(ldn); + ldn = NULL; + } + } else if (rnp != NULL) { + *rnp = ldn; + } else if (op == DE_CREATE || op == DE_MKDIR) { + ldnode_rele(ldn); + } + } + +out: + if (error && op == DE_RENAME) { + /* Undo bumped link count. */ + mutex_enter(&ldn->lxdn_tlock); + ldn->lxdn_nlink--; + gethrestime(&ldn->lxdn_ctime); + mutex_exit(&ldn->lxdn_tlock); + } + return (error); +} + +/* + * Delete entry ldn of name "nm" from parent dir. This is used to both remove + * a directory and to remove file nodes within the directory (by recursively + * calling itself). 
It frees the dir entry space and decrements link count on
+ * lxd_node(s).
+ *
+ * Return 0 on success.
+ */
+int
+lxd_dirdelete(lxd_node_t *dir, lxd_node_t *ldn, char *nm, enum dr_op op,
+    cred_t *cred)
+{
+	lxd_dirent_t *dirp;
+	int error;
+	size_t namelen;
+	lxd_node_t *fndnp;
+	timestruc_t now;
+
+	ASSERT(RW_WRITE_HELD(&dir->lxdn_rwlock));
+	ASSERT(RW_WRITE_HELD(&ldn->lxdn_rwlock));
+	ASSERT(dir->lxdn_vnode->v_type == VDIR);
+
+	if (nm[0] == '\0')
+		panic("lxd_dirdelete: empty name for 0x%p", (void *)ldn);
+
+	/*
+	 * return error when removing . and ..
+	 * ("." yields EINVAL, ".." yields EEXIST)
+	 */
+	if (nm[0] == '.') {
+		if (nm[1] == '\0')
+			return (EINVAL);
+		if (nm[1] == '.' && nm[2] == '\0')
+			return (EEXIST); /* thus in ufs */
+	}
+
+	/* The caller needs both search and write permission on dir. */
+	if ((error = lxd_naccess(dir, VEXEC|VWRITE, cred)) != 0)
+		return (error);
+
+	if (dir->lxdn_dir == NULL)	/* dir has no entry list at all */
+		return (ENOENT);
+
+	if (op == DR_RMDIR) {
+		/*
+		 * This is the top-level removal of a directory. Start by
+		 * removing any file entries from the dir. We do this by
+		 * recursively calling back into this function with a different
+		 * op code. The caller of this function has already verified
+		 * that it is safe to remove this directory.
+		 *
+		 * The "." and ".." entries are skipped here; they are
+		 * released by the lxd_dirtrunc() call at the end of this
+		 * function. The next pointer is saved before recursing
+		 * because the recursive call frees the current entry.
+		 *
+		 * NOTE(review): the value returned by the recursive call is
+		 * stored in 'error' but never examined, and this function
+		 * returns 0 regardless -- confirm that is intended.
+		 *
+		 * NOTE(review): the recursive call asserts that the child's
+		 * lxdn_rwlock is write-held, but no rw_enter() on the child
+		 * is visible in this function -- confirm who takes it.
+		 */
+		lxd_dirent_t *dirp;
+
+		ASSERT(ldn->lxdn_vnode->v_type == VDIR);
+
+		dirp = ldn->lxdn_dir;
+		while (dirp) {
+			lxd_node_t *dn;
+			lxd_dirent_t *nextp;
+
+			if (strcmp(dirp->lddir_name, ".") == 0 ||
+			    strcmp(dirp->lddir_name, "..") == 0) {
+				dirp = dirp->lddir_next;
+				continue;
+			}
+
+			dn = dirp->lddir_node;
+			nextp = dirp->lddir_next;
+
+			ldnode_hold(dn);	/* keep dn's vnode held across the call */
+			error = lxd_dirdelete(ldn, dn, dirp->lddir_name,
+			    DR_REMOVE, cred);
+			ldnode_rele(dn);
+
+			dirp = nextp;
+		}
+	}
+
+	/* The entry must exist in the hash and must refer to ldn. */
+	dirp = lxd_find_dirent(nm, dir, NOHOLD, &fndnp);
+	VERIFY(dirp != NULL);
+	VERIFY(ldn == fndnp);
+
+	lxd_rm_dirent(dirp);	/* unhash; also drops lxdm_dent_refcnt */
+
+	/* Take dirp out of the directory list. */
+	ASSERT(dirp->lddir_next != dirp);
+	ASSERT(dirp->lddir_prev != dirp);
+	if (dirp->lddir_prev) {
+		dirp->lddir_prev->lddir_next = dirp->lddir_next;
+	}
+	if (dirp->lddir_next) {
+		dirp->lddir_next->lddir_prev = dirp->lddir_prev;
+	}
+
+	/*
+	 * If the roving slot pointer happens to match dirp,
+	 * point it at the previous dirent.
+	 * (The roving slot pointer is stored in the lddir_prev field of the
+	 * first entry on the list.)
+	 */
+	if (dir->lxdn_dir->lddir_prev == dirp) {
+		dir->lxdn_dir->lddir_prev = dirp->lddir_prev;
+	}
+	ASSERT(dirp->lddir_next != dirp);
+	ASSERT(dirp->lddir_prev != dirp);
+
+	/* dirp points to the correct directory entry */
+	namelen = strlen(dirp->lddir_name) + 1;
+
+	/* The entry and its name were allocated as a single buffer. */
+	kmem_free(dirp, sizeof (lxd_dirent_t) + namelen);
+	dir->lxdn_size -= (sizeof (lxd_dirent_t) + namelen);
+	dir->lxdn_dirents--;
+
+	gethrestime(&now);
+	dir->lxdn_mtime = now;
+	dir->lxdn_ctime = now;
+	ldn->lxdn_ctime = now;
+
+	ASSERT(ldn->lxdn_nlink > 0);
+	mutex_enter(&ldn->lxdn_tlock);	/* lxdn_tlock guards the link count */
+	ldn->lxdn_nlink--;
+	mutex_exit(&ldn->lxdn_tlock);
+	if (op == DR_RMDIR && ldn->lxdn_vnode->v_type == VDIR) {
+		lxd_dirtrunc(ldn);	/* releases the remaining entries */
+		ASSERT(ldn->lxdn_nlink == 0);
+	}
+	return (0);
+}
+
+/*
+ * Initialize a lxd_node and add it to file list under mount point. 
+ */ +void +lxd_node_init(lxd_mnt_t *lxdm, lxd_node_t *ldn, vnode_t *realvp, vattr_t *vap, + cred_t *cred) +{ + struct vnode *vp; + timestruc_t now; + + ASSERT(vap != NULL); + + rw_init(&ldn->lxdn_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&ldn->lxdn_tlock, NULL, MUTEX_DEFAULT, NULL); + ldn->lxdn_mode = MAKEIMODE(vap->va_type, vap->va_mode); + ldn->lxdn_mask = 0; + ldn->lxdn_attr.va_type = vap->va_type; + ldn->lxdn_nlink = 1; + ldn->lxdn_size = 0; + + if (cred == NULL) { + ldn->lxdn_uid = vap->va_uid; + ldn->lxdn_gid = vap->va_gid; + } else { + ldn->lxdn_uid = crgetuid(cred); + ldn->lxdn_gid = crgetgid(cred); + } + + ldn->lxdn_fsid = lxdm->lxdm_dev; + ldn->lxdn_rdev = vap->va_rdev; + ldn->lxdn_blksize = PAGESIZE; + ldn->lxdn_nblocks = 0; + gethrestime(&now); + ldn->lxdn_atime = now; + ldn->lxdn_mtime = now; + ldn->lxdn_ctime = now; + ldn->lxdn_seq = 0; + ldn->lxdn_dir = NULL; + + ldn->lxdn_real_vp = realvp; + + ldn->lxdn_vnode = vn_alloc(KM_SLEEP); + vp = LDNTOV(ldn); + vn_setops(vp, lxd_vnodeops); + vp->v_vfsp = lxdm->lxdm_vfsp; + vp->v_type = vap->va_type; + vp->v_rdev = vap->va_rdev; + vp->v_data = (caddr_t)ldn; + + mutex_enter(&lxdm->lxdm_contents); + ldn->lxdn_nodeid = lxdm->lxdm_gen++; + + /* + * Add new lxd_node to end of linked list of lxd_nodes for this + * lxdevfs. Root directory is handled specially in lxd_mount. + */ + if (lxdm->lxdm_rootnode != (lxd_node_t *)NULL) { + ldn->lxdn_next = NULL; + ldn->lxdn_prev = lxdm->lxdm_rootnode->lxdn_prev; + ldn->lxdn_prev->lxdn_next = lxdm->lxdm_rootnode->lxdn_prev = + ldn; + } + mutex_exit(&lxdm->lxdm_contents); + vn_exists(vp); +} + +/* + * lxd_dirinit is used internally to initialize a directory (dir) + * with '.' and '..' entries without checking permissions and locking + * It also creates the entries for the pseudo file nodes that reside in the + * directory. 
+ */ +void +lxd_dirinit(lxd_node_t *parent, lxd_node_t *dir, cred_t *cr) +{ + lxd_dirent_t *dot, *dotdot; + timestruc_t now; + lxd_mnt_t *lxdm = VTOLXDM(dir->lxdn_vnode); + struct vattr nattr; + + ASSERT(RW_WRITE_HELD(&parent->lxdn_rwlock)); + ASSERT(dir->lxdn_vnode->v_type == VDIR); + + dir->lxdn_nodeid = lxdm->lxdm_gen++; + + /* + * Initialize the entries + */ + dot = kmem_zalloc(sizeof (lxd_dirent_t) + 2, KM_SLEEP); + dot->lddir_node = dir; + dot->lddir_offset = 0; + dot->lddir_name = (char *)dot + sizeof (lxd_dirent_t); + dot->lddir_name[0] = '.'; + dot->lddir_parent = dir; + lxd_save_dirent(dot); + + dotdot = kmem_zalloc(sizeof (lxd_dirent_t) + 3, KM_SLEEP); + dotdot->lddir_node = parent; + dotdot->lddir_offset = 1; + dotdot->lddir_name = (char *)dotdot + sizeof (lxd_dirent_t); + dotdot->lddir_name[0] = '.'; + dotdot->lddir_name[1] = '.'; + dotdot->lddir_parent = dir; + lxd_save_dirent(dotdot); + + /* + * Initialize directory entry list. + */ + dot->lddir_next = dotdot; + dot->lddir_prev = dotdot; /* dot's lddir_prev holds roving slot ptr */ + dotdot->lddir_next = NULL; + dotdot->lddir_prev = dot; + + gethrestime(&now); + dir->lxdn_mtime = now; + dir->lxdn_ctime = now; + + parent->lxdn_nlink++; + parent->lxdn_ctime = now; + + dir->lxdn_dir = dot; + dir->lxdn_size = 2 * sizeof (lxd_dirent_t) + 5; /* dot and dotdot */ + dir->lxdn_dirents = 2; + dir->lxdn_nlink = 2; + dir->lxdn_parent = parent; + + bzero(&nattr, sizeof (struct vattr)); + nattr.va_mode = (mode_t)(0644); + nattr.va_type = VREG; + nattr.va_rdev = 0; +} + +/* + * lxd_dirtrunc is called to remove all directory entries under this directory. 
+ */ +void +lxd_dirtrunc(lxd_node_t *dir) +{ + lxd_dirent_t *ldp; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->lxdn_rwlock)); + ASSERT(dir->lxdn_vnode->v_type == VDIR); + + for (ldp = dir->lxdn_dir; ldp; ldp = dir->lxdn_dir) { + size_t namelen; + lxd_node_t *ldn; + + ASSERT(ldp->lddir_next != ldp); + ASSERT(ldp->lddir_prev != ldp); + ASSERT(ldp->lddir_node); + + dir->lxdn_dir = ldp->lddir_next; + namelen = strlen(ldp->lddir_name) + 1; + + /* + * Adjust the link counts to account for this directory entry + * removal. We do hold/rele operations to free up these nodes. + */ + ldn = ldp->lddir_node; + + ASSERT(ldn->lxdn_nlink > 0); + mutex_enter(&ldn->lxdn_tlock); + ldn->lxdn_nlink--; + mutex_exit(&ldn->lxdn_tlock); + + lxd_rm_dirent(ldp); + kmem_free(ldp, sizeof (lxd_dirent_t) + namelen); + dir->lxdn_size -= (sizeof (lxd_dirent_t) + namelen); + dir->lxdn_dirents--; + } + + gethrestime(&now); + dir->lxdn_mtime = now; + dir->lxdn_ctime = now; + + ASSERT(dir->lxdn_dir == NULL); + ASSERT(dir->lxdn_size == 0); + ASSERT(dir->lxdn_dirents == 0); +} diff --git a/usr/src/uts/common/brand/lx/devfs/lxd_vfsops.c b/usr/src/uts/common/brand/lx/devfs/lxd_vfsops.c new file mode 100644 index 0000000000..bf5913f025 --- /dev/null +++ b/usr/src/uts/common/brand/lx/devfs/lxd_vfsops.c @@ -0,0 +1,781 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +/* + * The lx devfs (lxd) file system is used within lx branded zones to provide + * the Linux view of /dev. + * + * In the past, the Linux /dev was simply a lofs mount pointing at /native/dev. 
+ * lxd now provides the Linux /dev. + * + * The lxd file system is a hybrid of lofs and tmpfs. It supports a "back" file + * system which is the special device and corresponds to the special device in + * a lofs mount. As with lofs, all files in the special device are accessible + * through the lxd mount. Because the zone's devfs is not directly modifiable + * within the zone (also mknod(2) is not generally allowed within a zone) it is + * impossible to create files in devfs. For lx, in some cases it's useful to be + * able to make new symlinks or new directories under /dev. lxd implements + * these operations by creating "files" in memory in the same way as tmpfs + * does. Within lxd these are referred to as "front" files. For operations such + * as lookup or readdir, lxd provides a merged view of both the front and back + * files. lxd does not support regular front files or simple I/O (read/write) + * to front files, since there is no need for that. For back files, all + * operations are simply passed through to the real vnode, as is done with + * lofs. Front files are not allowed to mask back files. + * + * The Linux /dev is now a lxd mount with the special file (i.e. the back + * file system) as /native/dev. + * + * In addition, lx has a need for some illumos/Linux translation for the + * various *stat(2) system calls when used on a device. This translation can + * be centralized within lxd's getattr vnode entry point. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/time.h> +#include <sys/pathname.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/stat.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/statvfs.h> +#include <sys/mount.h> +#include <sys/systm.h> +#include <sys/mntent.h> +#include <sys/policy.h> +#include <sys/sdt.h> +#include <sys/ddi.h> +#include <sys/lx_brand.h> +#include <sys/lx_ptm.h> + +#include "lxd.h" + +/* Module level parameters */ +static int lxd_fstype; +static dev_t lxd_dev; + +/* + * lxd_mountcount is used to prevent module unloads while there is still + * state from a former mount hanging around. The filesystem module must not be + * allowed to go away before the last VFS_FREEVFS() call has been made. Since + * this is just an atomic counter, there's no need for locking. + */ +static uint32_t lxd_mountcount; + +/* + * lxd_minfree is the minimum amount of swap space that lx devfs leaves for + * the rest of the zone. + */ +size_t lxd_minfree = 0; + +/* + * LXDMINFREE -- the value from which lxd_minfree is derived -- should be + * configured to a value that is roughly the smallest practical value for + * memory + swap minus the largest reasonable size for lxd in such + * a configuration. As of this writing, the smallest practical memory + swap + * configuration is 128MB, and it seems reasonable to allow lxd to consume + * no more than ~10% of this, yielding a LXDMINFREE of 12MB. + */ +#define LXDMINFREE 12 * 1024 * 1024 /* 12 Megabytes */ + +extern pgcnt_t swapfs_minfree; + +extern int stat64(char *, struct stat64 *); + +/* + * lxd vfs operations. 
+ */ +static int lxd_init(int, char *); +static int lxd_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *); +static int lxd_unmount(vfs_t *, int, cred_t *); +static int lxd_root(vfs_t *, vnode_t **); +static int lxd_statvfs(vfs_t *, statvfs64_t *); +static void lxd_freevfs(vfs_t *vfsp); + +/* + * Loadable module wrapper + */ +#include <sys/modctl.h> + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "lx_devfs", + lxd_init, + VSW_ZMOUNT, + NULL +}; + +/* + * Module linkage information + */ +static struct modlfs modlfs = { + &mod_fsops, "lx brand devfs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modlfs, NULL +}; + +/* + * Definitions and translators for devt's. + */ +static int lxd_pts_devt_translator(dev_t, lx_dev_t *); +static int lxd_ptm_devt_translator(dev_t, lx_dev_t *); + +#define LX_PTS_MAJOR_MIN 136 +#define LX_PTS_MAJOR_MAX 143 +#define LX_PTS_MAX \ + ((LX_PTS_MAJOR_MAX - LX_PTS_MAJOR_MIN + 1) * LX_MINORMASK) + +#define LX_PTM_MAJOR 5 +#define LX_PTM_MINOR 2 + +static lxd_minor_translator_t lxd_mtranslator_mm[] = { + { "/dev/null", 0, 1, 3 }, + { "/dev/zero", 0, 1, 5 }, + { NULL, 0, 0, 0 } +}; +static lxd_minor_translator_t lxd_mtranslator_random[] = { + { "/dev/random", 0, 1, 8 }, + { "/dev/urandom", 0, 1, 9 }, + { NULL, 0, 0, 0 } +}; +static lxd_minor_translator_t lxd_mtranslator_sy[] = { + { "/dev/tty", 0, 5, 0 }, + { NULL, 0, 0, 0 } +}; +static lxd_minor_translator_t lxd_mtranslator_zcons[] = { + { "/dev/console", 0, 5, 1 }, + { NULL, 0, 0, 0 } +}; +lxd_devt_translator_t lxd_devt_translators[] = { + { "mm", 0, DTT_LIST, (uintptr_t)&lxd_mtranslator_mm }, + { "random", 0, DTT_LIST, (uintptr_t)&lxd_mtranslator_random }, + { "sy", 0, DTT_LIST, (uintptr_t)&lxd_mtranslator_sy }, + { "zcons", 0, DTT_LIST, (uintptr_t)&lxd_mtranslator_zcons }, + { LX_PTM_DRV, 0, DTT_CUSTOM, (uintptr_t)lxd_ptm_devt_translator }, + { "pts", 0, DTT_CUSTOM, (uintptr_t)lxd_pts_devt_translator }, + { NULL, 0, DTT_INVALID, NULL } +}; + +int +_init() +{ + return 
(mod_install(&modlinkage)); +} + +int +_fini() +{ + int error; + + if (lxd_mountcount > 0) + return (EBUSY); + + if ((error = mod_remove(&modlinkage)) != 0) + return (error); + + /* + * Tear down the operations vectors + */ + (void) vfs_freevfsops_by_type(lxd_fstype); + vn_freevnodeops(lxd_vnodeops); + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* + * Initialize global locks, etc. Called when loading lxd module. + */ +static int +lxd_init(int fstype, char *name) +{ + static const fs_operation_def_t lxd_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = lxd_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = lxd_unmount }, + VFSNAME_ROOT, { .vfs_root = lxd_root }, + VFSNAME_STATVFS, { .vfs_statvfs = lxd_statvfs }, + VFSNAME_FREEVFS, { .vfs_freevfs = lxd_freevfs }, + NULL, NULL + }; + extern const struct fs_operation_def lxd_vnodeops_template[]; + int error; + major_t dev; + int i; + + lxd_fstype = fstype; + ASSERT(lxd_fstype != 0); + + error = vfs_setfsops(fstype, lxd_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "lxd_init: bad vfs ops template"); + return (error); + } + + error = vn_make_ops(name, lxd_vnodeops_template, &lxd_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "lxd_init: bad vnode ops template"); + return (error); + } + + /* + * lxd_minfree doesn't need to be some function of configured + * swap space since it really is an absolute limit of swap space + * which still allows other processes to execute. + */ + if (lxd_minfree == 0) { + /* Set if not patched */ + lxd_minfree = btopr(LXDMINFREE); + } + + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "lxd_init: Can't get unique device number."); + dev = 0; + } + + /* + * Make the pseudo device + */ + lxd_dev = makedevice(dev, 0); + + /* + * Initialize device translator mapping table. 
+ */ + for (i = 0; lxd_devt_translators[i].lxd_xl_driver != NULL; i++) { + lxd_minor_translator_t *mt; + int j; + + lxd_devt_translators[i].lxd_xl_major = + mod_name_to_major(lxd_devt_translators[i].lxd_xl_driver); + + /* if this translator doesn't use a list mapping we're done. */ + if (lxd_devt_translators[i].lxd_xl_type != DTT_LIST) + continue; + + /* for each device listed, lookup the minor node number */ + mt = lxd_devt_translators[i].xl_list; + for (j = 0; mt[j].lxd_mt_path != NULL; j++) { + vnode_t *vp; + struct vattr va; + char *tpath; + char tnm[MAXPATHLEN]; + + /* + * The attach might be triggered in either the global + * zone or in a non-global zone, so we may need to + * adjust the path if we're in a NGZ. + */ + if (curproc->p_zone->zone_id == GLOBAL_ZONEUNIQID) { + tpath = mt[j].lxd_mt_path; + } else { + (void) snprintf(tnm, sizeof (tnm), "/native%s", + mt[j].lxd_mt_path); + tpath = tnm; + } + + if (lookupnameat(tpath, UIO_SYSSPACE, FOLLOW, NULL, + &vp, NULL) != 0) { + mt[j].lxd_mt_minor = -1; + continue; + } + + va.va_mask = AT_RDEV; + if (VOP_GETATTR(vp, &va, 0, kcred, NULL) != 0) { + va.va_rdev = NODEV; + } else { + ASSERT(getmajor(va.va_rdev) == + lxd_devt_translators[i].lxd_xl_major); + ASSERT(mt[j].lxd_mt_lx_minor < LX_MINORMASK); + } + + mt[j].lxd_mt_minor = getminor(va.va_rdev); + + VN_RELE(vp); + } + } + + return (0); +} + +static int +lxd_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + lxd_mnt_t *lxdm = NULL; + struct lxd_node *ldn; + struct pathname dpn; + int error; + int i; + int nodev; + struct vattr rattr; + vnode_t *realrootvp; + vnode_t *tvp; + + nodev = vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL); + + if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) + return (error); + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + /* + * This is the same behavior as with lofs. 
+ * Loopback devices which get "nodevices" added can be done without + * "nodevices" set because we cannot import devices into a zone + * with loopback. Note that we have all zone privileges when + * this happens; if not, we'd have gotten "nosuid". + */ + if (!nodev && vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) + vfs_setmntopt(vfsp, MNTOPT_DEVICES, NULL, VFS_NODISPLAY); + + /* + * Only allow mounting within lx zones. + */ + if (curproc->p_zone->zone_brand != &lx_brand) + return (EINVAL); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* lxd doesn't support read-only mounts */ + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { + error = EINVAL; + goto out; + } + + error = pn_get(uap->dir, + (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn); + if (error != 0) + goto out; + + /* + * Find real root + */ + if ((error = lookupname(uap->spec, (uap->flags & MS_SYSSPACE) ? 
+ UIO_SYSSPACE : UIO_USERSPACE, FOLLOW, NULLVPP, &realrootvp))) { + pn_free(&dpn); + return (error); + } + + if ((error = VOP_ACCESS(realrootvp, 0, 0, cr, NULL)) != 0) { + pn_free(&dpn); + VN_RELE(realrootvp); + return (error); + } + + /* If realroot is not a devfs, error out */ + if (strcmp(realrootvp->v_op->vnop_name, "dev") != 0) { + pn_free(&dpn); + VN_RELE(realrootvp); + return (EINVAL); + } + + lxdm = kmem_zalloc(sizeof (*lxdm), KM_SLEEP); + + /* init but don't bother entering the mutex (not on mount list yet) */ + mutex_init(&lxdm->lxdm_contents, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&lxdm->lxdm_renamelck, NULL, MUTEX_DEFAULT, NULL); + + /* Initialize the hash table mutexes */ + for (i = 0; i < LXD_HASH_SZ; i++) { + mutex_init(&lxdm->lxdm_hash_mutex[i], NULL, MUTEX_DEFAULT, + NULL); + } + + lxdm->lxdm_vfsp = vfsp; + lxdm->lxdm_gen = 1; /* start inode counter at 1 */ + + vfsp->vfs_data = (caddr_t)lxdm; + vfsp->vfs_fstype = lxd_fstype; + vfsp->vfs_dev = lxd_dev; + vfsp->vfs_bsize = PAGESIZE; + vfsp->vfs_flag |= VFS_NOTRUNC; + vfs_make_fsid(&vfsp->vfs_fsid, lxd_dev, lxd_fstype); + lxdm->lxdm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP); + (void) strcpy(lxdm->lxdm_mntpath, dpn.pn_path); + + /* allocate and initialize root lxd_node structure */ + bzero(&rattr, sizeof (struct vattr)); + rattr.va_mode = (mode_t)(S_IFDIR | 0755); + rattr.va_type = VDIR; + rattr.va_rdev = 0; + + tvp = lxd_make_back_node(realrootvp, lxdm); + ldn = VTOLDN(tvp); + + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + LDNTOV(ldn)->v_flag |= VROOT; + + /* + * initialize linked list of lxd_nodes so that the back pointer of + * the root lxd_node always points to the last one on the list + * and the forward pointer of the last node is null + */ + ldn->lxdn_prev = ldn; + ldn->lxdn_next = NULL; + ldn->lxdn_nlink = 0; + lxdm->lxdm_rootnode = ldn; + + ldn->lxdn_nodeid = lxdm->lxdm_gen++; + lxd_dirinit(ldn, ldn, cr); + + rw_exit(&ldn->lxdn_rwlock); + + pn_free(&dpn); + error = 0; + 
atomic_inc_32(&lxd_mountcount); + +out: + if (error == 0) + vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS); + + return (error); +} + +static int +lxd_unmount(struct vfs *vfsp, int flag, struct cred *cr) +{ + lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp); + lxd_node_t *ldn, *cancel; + struct vnode *vp; + int error; + uint_t cnt; + + if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) + return (error); + + mutex_enter(&lxdm->lxdm_contents); + + /* + * In the normal unmount case only the root node would have a reference + * count. + * + * With lxdm_contents held, nothing can be added or removed. + * If we find a previously referenced node, undo the holds we have + * placed and fail EBUSY. + */ + ldn = lxdm->lxdm_rootnode; + + vp = LDNTOV(ldn); + mutex_enter(&vp->v_lock); + + if (flag & MS_FORCE) { + mutex_exit(&vp->v_lock); + mutex_exit(&lxdm->lxdm_contents); + return (EINVAL); + } + + cnt = vp->v_count; + if (cnt > 1) { + mutex_exit(&vp->v_lock); + mutex_exit(&lxdm->lxdm_contents); + return (EBUSY); + } + + mutex_exit(&vp->v_lock); + + /* + * Check for open files. An open file causes everything to unwind. + */ + for (ldn = ldn->lxdn_next; ldn; ldn = ldn->lxdn_next) { + vp = LDNTOV(ldn); + mutex_enter(&vp->v_lock); + cnt = vp->v_count; + if (cnt > 0) { + /* An open file; unwind the holds we've been adding. */ + mutex_exit(&vp->v_lock); + cancel = lxdm->lxdm_rootnode->lxdn_next; + while (cancel != ldn) { + vp = LDNTOV(cancel); + ASSERT(vp->v_count > 0); + VN_RELE(vp); + cancel = cancel->lxdn_next; + } + mutex_exit(&lxdm->lxdm_contents); + return (EBUSY); + } else { + /* + * It may seem incorrect for us to have a vnode with + * a count of 0, but this is modeled on tmpfs and works + * the same way. See lxd_front_inactive. There we allow + * the v_count to go to 0 but rely on the link count to + * keep the vnode alive. 
Since we now want to cleanup + * these vnodes we manually add a VN_HOLD so that the + * VN_RELEs that occur in the lxd_freevfs() cleanup + * will take us down the lxd_inactive code path. We + * can directly add a VN_HOLD since we have the lock. + */ + vp->v_count++; + mutex_exit(&vp->v_lock); + } + } + + /* + * We can drop the mutex now because + * no one can find this mount anymore + */ + vfsp->vfs_flag |= VFS_UNMOUNTED; + mutex_exit(&lxdm->lxdm_contents); + + return (0); +} + +/* + * Implementation of VFS_FREEVFS(). This is called by the vfs framework after + * umount and the last VFS_RELE, to trigger the release of any resources still + * associated with the given vfs_t. This is normally called immediately after + * lxd_unmount. + */ +void +lxd_freevfs(vfs_t *vfsp) +{ + lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp); + lxd_node_t *ldn; + struct vnode *vp; + + /* + * Free all kmemalloc'd and anonalloc'd memory associated with + * this filesystem. To do this, we go through the file list twice, + * once to remove all the directory entries, and then to remove + * all the pseudo files. + */ + + /* + * Now that we are tearing ourselves down we need to remove the + * UNMOUNTED flag. If we don't, we'll later hit a VN_RELE when we remove + * files from the system causing us to have a negative value. Doing this + * seems a bit better than trying to set a flag on the lxd_mnt_t that + * says we're tearing down. + */ + vfsp->vfs_flag &= ~VFS_UNMOUNTED; + + /* + * Remove all directory entries (this doesn't remove top-level dirs). + */ + for (ldn = lxdm->lxdm_rootnode; ldn; ldn = ldn->lxdn_next) { + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + if (ldn->lxdn_vnode->v_type == VDIR) + lxd_dirtrunc(ldn); + rw_exit(&ldn->lxdn_rwlock); + } + + ASSERT(lxdm->lxdm_rootnode != NULL); + + /* + * All links are gone, v_count is keeping nodes in place. + * VN_RELE should make the node disappear, unless somebody + * is holding pages against it. Nap and retry until it disappears. 
+ * + * We re-acquire the lock to prevent others who have a HOLD on a + * lxd_node from blowing it away (in lxd_inactive) while we're trying + * to get to it here. Once we have a HOLD on it we know it'll stick + * around. + */ + mutex_enter(&lxdm->lxdm_contents); + + /* + * Remove all the files (except the rootnode) backwards. + */ + while ((ldn = lxdm->lxdm_rootnode->lxdn_prev) != lxdm->lxdm_rootnode) { + mutex_exit(&lxdm->lxdm_contents); + /* + * All nodes will be released here. Note we handled the link + * count above. + */ + vp = LDNTOV(ldn); + ASSERT(vp->v_type == VLNK || vp->v_type == VDIR); + VN_RELE(vp); + mutex_enter(&lxdm->lxdm_contents); + /* + * It's still there after the RELE. Someone else like pageout + * has a hold on it so wait a bit and then try again - we know + * they'll give it up soon. + */ + if (ldn == lxdm->lxdm_rootnode->lxdn_prev) { + VN_HOLD(vp); + mutex_exit(&lxdm->lxdm_contents); + delay(hz / 4); + mutex_enter(&lxdm->lxdm_contents); + } + } + mutex_exit(&lxdm->lxdm_contents); + + ASSERT(lxdm->lxdm_back_refcnt == 1); + ASSERT(lxdm->lxdm_dent_refcnt == 0); + + VN_RELE(LDNTOV(lxdm->lxdm_rootnode)); + + ASSERT(lxdm->lxdm_mntpath != NULL); + kmem_free(lxdm->lxdm_mntpath, strlen(lxdm->lxdm_mntpath) + 1); + + mutex_destroy(&lxdm->lxdm_contents); + mutex_destroy(&lxdm->lxdm_renamelck); + kmem_free(lxdm, sizeof (lxd_mnt_t)); + + /* Allow _fini() to succeed now */ + atomic_dec_32(&lxd_mountcount); +} + +/* + * return root lxdnode for given vnode + */ +static int +lxd_root(struct vfs *vfsp, struct vnode **vpp) +{ + lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp); + lxd_node_t *ldn = lxdm->lxdm_rootnode; + struct vnode *vp; + + ASSERT(ldn != NULL); + + vp = LDNTOV(ldn); + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +lxd_statvfs(struct vfs *vfsp, statvfs64_t *sbp) +{ + lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp); + ulong_t blocks; + dev32_t d32; + zoneid_t eff_zid; + struct zone *zp; + + zp = lxdm->lxdm_vfsp->vfs_zone; + + if (zp == 
NULL) + eff_zid = GLOBAL_ZONEUNIQID; + else + eff_zid = zp->zone_id; + + sbp->f_bsize = PAGESIZE; + sbp->f_frsize = PAGESIZE; + + /* + * Find the amount of available physical and memory swap + */ + mutex_enter(&anoninfo_lock); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP; + mutex_exit(&anoninfo_lock); + + if (blocks > lxd_minfree) + sbp->f_bfree = blocks - lxd_minfree; + else + sbp->f_bfree = 0; + + sbp->f_bavail = sbp->f_bfree; + + /* + * Total number of blocks is just what's available + */ + sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree); + + if (eff_zid != GLOBAL_ZONEUNIQID && + zp->zone_max_swap_ctl != UINT64_MAX) { + /* + * If the fs is used by a zone with a swap cap, + * then report the capped size. + */ + rctl_qty_t cap, used; + pgcnt_t pgcap, pgused; + + mutex_enter(&zp->zone_mem_lock); + cap = zp->zone_max_swap_ctl; + used = zp->zone_max_swap; + mutex_exit(&zp->zone_mem_lock); + + pgcap = btop(cap); + pgused = btop(used); + + sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree); + sbp->f_bavail = sbp->f_bfree; + sbp->f_blocks = MIN(pgcap, sbp->f_blocks); + } + + /* + * The maximum number of files available is approximately the number + * of lxd_nodes we can allocate from the remaining kernel memory + * available to lxdevfs in this zone. This is fairly inaccurate since + * it doesn't take into account the names stored in the directory + * entries. 
+ */ + sbp->f_ffree = sbp->f_files = ptob(availrmem) / + (sizeof (lxd_node_t) + sizeof (lxd_dirent_t)); + sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); + (void) cmpldev(&d32, vfsp->vfs_dev); + sbp->f_fsid = d32; + (void) strcpy(sbp->f_basetype, vfssw[lxd_fstype].vsw_name); + (void) strncpy(sbp->f_fstr, lxdm->lxdm_mntpath, sizeof (sbp->f_fstr)); + /* ensure null termination */ + sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0'; + sbp->f_flag = vf_to_stf(vfsp->vfs_flag); + sbp->f_namemax = MAXNAMELEN - 1; + return (0); +} + +static int +lxd_pts_devt_translator(dev_t dev, lx_dev_t *jdev) +{ + minor_t min = getminor(dev); + int lx_maj; + int lx_min; + + /* + * linux has a really small minor number name space (8 bits). + * so if pts devices are limited to one major number you could + * only have 256 of them. linux addresses this issue by using + * multiple major numbers for pts devices. + */ + if (min >= LX_PTS_MAX) + return (EOVERFLOW); + + lx_maj = LX_PTS_MAJOR_MIN + (min / LX_MINORMASK); + lx_min = min % LX_MINORMASK; + + *jdev = LX_MAKEDEVICE(lx_maj, lx_min); + return (0); +} + +static int +lxd_ptm_devt_translator(dev_t dev, lx_dev_t *jdev) +{ + *jdev = LX_MAKEDEVICE(LX_PTM_MAJOR, LX_PTM_MINOR); + return (0); +} diff --git a/usr/src/uts/common/brand/lx/devfs/lxd_vnops.c b/usr/src/uts/common/brand/lx/devfs/lxd_vnops.c new file mode 100644 index 0000000000..05ca0400ad --- /dev/null +++ b/usr/src/uts/common/brand/lx/devfs/lxd_vnops.c @@ -0,0 +1,1453 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/cred.h> +#include <sys/pathname.h> +#include <sys/debug.h> +#include <sys/sdt.h> +#include <fs/fs_subr.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <sys/lx_brand.h> + +#include "lxd.h" + +static int +lxd_open(vnode_t **vpp, int flag, struct cred *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(*vpp); + vnode_t *vp = *vpp; + vnode_t *rvp; + vnode_t *oldvp; + int error; + + if (ldn->lxdn_type == LXDNT_FRONT) + return (0); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + oldvp = vp; + vp = rvp = REALVP(vp); + /* + * Need to hold new reference to vp since VOP_OPEN() may + * decide to release it. + */ + VN_HOLD(vp); + error = VOP_OPEN(&rvp, flag, cr, ct); + + if (!error && rvp != vp) { + /* + * the FS which we called should have released the + * new reference on vp + */ + *vpp = lxd_make_back_node(rvp, VFSTOLXDM(oldvp->v_vfsp)); + + if (IS_DEVVP(*vpp)) { + vnode_t *svp; + + svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); + VN_RELE(*vpp); + if (svp == NULL) + error = ENOSYS; + else + *vpp = svp; + } + VN_RELE(oldvp); + } else { + ASSERT(rvp->v_count > 1); + VN_RELE(rvp); + } + + return (error); +} + +static int +lxd_close(vnode_t *vp, int flag, int count, offset_t offset, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (0); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_CLOSE(vp, flag, count, offset, cr, ct)); +} + +static int +lxd_read(vnode_t *vp, struct uio *uiop, int ioflag, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (ENOTSUP); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_READ(vp, uiop, ioflag, cr, ct)); +} + +static int 
+lxd_write(vnode_t *vp, struct uio *uiop, int ioflag, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (ENOTSUP); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_WRITE(vp, uiop, ioflag, cr, ct)); +} + +static int +lxd_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, struct cred *cr, + int *rvalp, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (ENOTSUP); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_IOCTL(vp, cmd, arg, flag, cr, rvalp, ct)); +} + +static int +lxd_setfl(vnode_t *vp, int oflags, int nflags, cred_t *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (ENOTSUP); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_SETFL(vp, oflags, nflags, cr, ct)); +} + +/* + * Translate SunOS devt to Linux devt. + */ +static int +lxd_s2l_devt(dev_t dev, lx_dev_t *rdev) +{ + lxd_minor_translator_t *mt; + int i, j; + major_t maj = getmajor(dev); + minor_t min = getminor(dev); + + /* look for a devt translator for this major number */ + for (i = 0; lxd_devt_translators[i].lxd_xl_driver != NULL; i++) { + if (lxd_devt_translators[i].lxd_xl_major == maj) + break; + } + + if (lxd_devt_translators[i].lxd_xl_driver != NULL) { + /* try to translate the illumos devt to a linux devt */ + switch (lxd_devt_translators[i].lxd_xl_type) { + case DTT_INVALID: + ASSERT(0); + break; + + case DTT_LIST: + mt = lxd_devt_translators[i].xl_list; + for (j = 0; mt[j].lxd_mt_path != NULL; j++) { + if (mt[j].lxd_mt_minor == min) { + ASSERT(mt[j].lxd_mt_minor < + LX_MINORMASK); + + /* found a translation */ + *rdev = LX_MAKEDEVICE( + mt[j].lxd_mt_lx_major, + mt[j].lxd_mt_lx_minor); + return (0); + } + } + break; + + case DTT_CUSTOM: + return (lxd_devt_translators[i].xl_custom(dev, rdev)); + } + } + + /* we don't have a translator 
for this device */ + *rdev = LX_MAKEDEVICE(maj, min); + return (0); +} + +static int +lxd_getattr(vnode_t *vp, struct vattr *vap, int flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + int error; + vnode_t *rvp; + + if (ldn->lxdn_type == LXDNT_FRONT) { + mutex_enter(&ldn->lxdn_tlock); + + vap->va_type = vp->v_type; + vap->va_mode = ldn->lxdn_mode & MODEMASK; + vap->va_uid = ldn->lxdn_uid; + vap->va_gid = ldn->lxdn_gid; + vap->va_fsid = ldn->lxdn_fsid; + vap->va_nodeid = (ino64_t)ldn->lxdn_nodeid; + vap->va_nlink = ldn->lxdn_nlink; + vap->va_size = (u_offset_t)ldn->lxdn_size; + vap->va_atime = ldn->lxdn_atime; + vap->va_mtime = ldn->lxdn_mtime; + vap->va_ctime = ldn->lxdn_ctime; + vap->va_blksize = PAGESIZE; + vap->va_rdev = 0; /* no devs in front */ + vap->va_seq = ldn->lxdn_seq; + + vap->va_nblocks = (fsblkcnt64_t)btodb(ptob(btopr( + vap->va_size))); + mutex_exit(&ldn->lxdn_tlock); + return (0); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + rvp = REALVP(vp); + if ((error = VOP_GETATTR(rvp, vap, flags, cr, ct))) + return (error); + + /* Skip devt translation for native programs */ + if (curproc->p_brand != &lx_brand) + return (0); + + if (rvp->v_type == VCHR) { + major_t major; + int i; + + major = getmajor(vap->va_rdev); + for (i = 0; lxd_devt_translators[i].lxd_xl_driver != NULL; + i++) { + if (lxd_devt_translators[i].lxd_xl_major == major) { + lx_dev_t ldev; + + (void) lxd_s2l_devt(vap->va_rdev, &ldev); + DTRACE_PROBE3(lxd__devxl, void *, rvp, + void *, vap, int, ldev); + /* + * TBD: enable device translation for back + * nodes. 
+ */ + /* vap->va_rdev = ldev; */ + break; + } + } + } + + return (0); +} + +static int +lxd_setattr(vnode_t *vp, struct vattr *vap, int flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + int error = 0; + struct vattr *set; + long mask = vap->va_mask; + + /* Cannot set these attributes */ + if ((mask & AT_NOSET) || (mask & AT_XVATTR) || + (mask & AT_MODE && vap->va_mode & (S_ISUID | S_ISGID)) || + (mask & AT_SIZE)) + return (EINVAL); + + mutex_enter(&ldn->lxdn_tlock); + + set = &ldn->lxdn_attr; + /* + * Change file access modes. Must be owner or have sufficient + * privileges. + */ + error = secpolicy_vnode_setattr(cr, vp, vap, set, flags, + lxd_naccess, ldn); + if (error) { + mutex_exit(&ldn->lxdn_tlock); + return (error); + } + + if (mask & AT_MODE) { + set->va_mode &= S_IFMT; + set->va_mode |= vap->va_mode & ~S_IFMT; + } + + if (mask & AT_UID) + set->va_uid = vap->va_uid; + if (mask & AT_GID) + set->va_gid = vap->va_gid; + if (mask & AT_ATIME) + set->va_atime = vap->va_atime; + if (mask & AT_MTIME) + set->va_mtime = vap->va_mtime; + + if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME)) + gethrestime(&ldn->lxdn_ctime); + + mutex_exit(&ldn->lxdn_tlock); + return (error); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_SETATTR(vp, vap, flags, cr, ct)); +} + +static int +lxd_access(vnode_t *vp, int mode, int flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + int error; + + mutex_enter(&ldn->lxdn_tlock); + error = lxd_naccess(ldn, mode, cr); + mutex_exit(&ldn->lxdn_tlock); + return (error); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + if (mode & VWRITE) { + if (vp->v_type == VREG && vn_is_readonly(vp)) + return (EROFS); + } + vp = REALVP(vp); + return (VOP_ACCESS(vp, mode, flags, cr, ct)); +} + +static int +lxd_fsync(vnode_t *vp, int syncflag, struct cred *cr, caller_context_t *ct) +{ + 
lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (0); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_FSYNC(vp, syncflag, cr, ct)); +} + +static void +lxd_front_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + lxd_mnt_t *lxdm = VTOLXDM(vp); + + ASSERT(ldn->lxdn_type == LXDNT_FRONT); + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + + mutex_enter(&ldn->lxdn_tlock); + mutex_enter(&vp->v_lock); + ASSERT(vp->v_count >= 1); + + /* + * If we don't have the last hold or the link count is non-zero, + * there's little to do -- just drop our hold. + */ + if (vp->v_count > 1 || ldn->lxdn_nlink != 0) { + vp->v_count--; + + mutex_exit(&vp->v_lock); + mutex_exit(&ldn->lxdn_tlock); + rw_exit(&ldn->lxdn_rwlock); + return; + } + + /* + * We have the last hold *and* the link count is zero, so this node is + * dead from the filesystem's viewpoint. + */ + if (ldn->lxdn_size != 0) { + if (ldn->lxdn_vnode->v_type == VLNK) + kmem_free(ldn->lxdn_symlink, ldn->lxdn_size + 1); + } + + mutex_exit(&vp->v_lock); + mutex_exit(&ldn->lxdn_tlock); + + vn_invalid(LDNTOV(ldn)); + + mutex_enter(&lxdm->lxdm_contents); + if (ldn->lxdn_next == NULL) + lxdm->lxdm_rootnode->lxdn_prev = ldn->lxdn_prev; + else + ldn->lxdn_next->lxdn_prev = ldn->lxdn_prev; + ldn->lxdn_prev->lxdn_next = ldn->lxdn_next; + + mutex_exit(&lxdm->lxdm_contents); + rw_exit(&ldn->lxdn_rwlock); + rw_destroy(&ldn->lxdn_rwlock); + mutex_destroy(&ldn->lxdn_tlock); + + vn_free(LDNTOV(ldn)); + kmem_free(ldn, sizeof (lxd_node_t)); +} + +/*ARGSUSED*/ +static void +lxd_inactive(vnode_t *vp, struct cred *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + lxd_front_inactive(vp, cr, ct); + return; + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + lxd_free_back_node(ldn); +} + +/* ARGSUSED */ +static int +lxd_fid(vnode_t *vp, struct fid *fidp, caller_context_t *ct) +{ + lxd_node_t *ldn = 
VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (ENOTSUP); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_FID(vp, fidp, ct)); +} + +/* + * For a front node lookup in the dirent hash table and return a shadow vnode + * (lxd_node_t type) of type LXDNT_FRONT. + * + * For a back node, lookup nm name and return a shadow vnode (lxd_node_t type) + * of the real vnode found. + */ +static int +lxd_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, + int flags, vnode_t *rdir, struct cred *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + vnode_t *vp = NULL; + int error; + vnode_t *realdvp; + lxd_mnt_t *lxdm = VTOLXDM(dvp); + int doingdotdot = 0; + lxd_node_t *ldn = VTOLDN(dvp); + lxd_node_t *nldn = NULL; + + /* + * First check for front file which could be instantiated on either a + * front or back node (e.g. the top-level mount point directory node is + * a back node which can have front files created in it). + */ + + /* disallow extended attrs */ + if (flags & LOOKUP_XATTR) + return (EINVAL); + + /* Null component name is a synonym for dir being searched. */ + if (*nm == '\0') { + VN_HOLD(dvp); + *vpp = dvp; + return (0); + } + + rw_enter(&ldn->lxdn_rwlock, RW_READER); + error = lxd_dirlookup(ldn, nm, &nldn, cr); + rw_exit(&ldn->lxdn_rwlock); + + if (error == 0) { + /* found */ + ASSERT(nldn != NULL); + *vpp = LDNTOV(nldn); + return (0); + } + + /* At this point, if dir node is a front node, error */ + if (ldn->lxdn_type == LXDNT_FRONT) { + return (ENOENT); + } + + realdvp = REALVP(dvp); + + if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') { + doingdotdot++; + /* + * Handle ".." 
out of mounted filesystem + */ + while ((realdvp->v_flag & VROOT) && realdvp != rootdir) { + realdvp = realdvp->v_vfsp->vfs_vnodecovered; + ASSERT(realdvp != NULL); + } + } + + *vpp = NULL; /* default(error) case */ + + /* + * Do the normal lookup + */ + if ((error = VOP_LOOKUP(realdvp, nm, &vp, pnp, flags, rdir, cr, + ct, direntflags, realpnp)) != 0) { + vp = NULL; + goto out; + } + + /* + * We do this check here to avoid returning a stale file handle to the + * caller. + */ + if (nm[0] == '.' && nm[1] == '\0') { + ASSERT(vp == realdvp); + VN_HOLD(dvp); + VN_RELE(vp); + *vpp = dvp; + return (0); + } + + if (doingdotdot) { + *vpp = lxd_make_back_node(vp, lxdm); + return (0); + } + + /* + * If this vnode is mounted on, then we + * traverse to the vnode which is the root of + * the mounted file system. + */ + if ((error = traverse(&vp)) != 0) + goto out; + + /* + * Make a lxd node for the real vnode. + */ + *vpp = lxd_make_back_node(vp, lxdm); + if (vp->v_type != VDIR) { + if (IS_DEVVP(*vpp)) { + vnode_t *svp; + + svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); + VN_RELE(*vpp); + if (svp == NULL) { + VN_RELE(vp); + error = ENOSYS; + } else { + *vpp = svp; + } + } + return (error); + } + +out: + if (error != 0 && vp != NULL) + VN_RELE(vp); + + return (error); +} + +/*ARGSUSED*/ +static int +lxd_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, + int mode, vnode_t **vpp, struct cred *cr, int flag, caller_context_t *ct, + vsecattr_t *vsecp) +{ + int error; + vnode_t *vp = NULL; + lxd_node_t *parent = VTOLDN(dvp); + + /* + * We currently don't support creating simple files under lx devfs + * (i.e. Create front nodes. We only allow directories and symlinks). + */ + if (parent->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + /* + * We cannot create files in the back devfs but we want to allow for + * o_creat on existing files, so pass this through and let the back + * file system allow or deny it. 
+ */ + + ASSERT(parent->lxdn_type == LXDNT_BACK); + if (*nm == '\0') { + ASSERT(vpp && dvp == *vpp); + vp = REALVP(*vpp); + } + + error = VOP_CREATE(REALVP(dvp), nm, va, exclusive, mode, &vp, cr, flag, + ct, vsecp); + if (!error) { + *vpp = lxd_make_back_node(vp, VFSTOLXDM(dvp->v_vfsp)); + if (IS_DEVVP(*vpp)) { + vnode_t *svp; + + svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); + VN_RELE(*vpp); + if (svp == NULL) + error = ENOSYS; + else + *vpp = svp; + } + } + + return (error); +} + +static int +lxd_remove(vnode_t *dvp, char *nm, struct cred *cr, caller_context_t *ct, + int flags) +{ + lxd_node_t *parent = VTOLDN(dvp); + lxd_node_t *ldn = NULL; + int error; + + /* can only remove existing front nodes */ + error = lxd_dirlookup(parent, nm, &ldn, cr); + if (error) { + return (error); + } + + ASSERT(ldn != NULL); + ASSERT(ldn->lxdn_type == LXDNT_FRONT); + rw_enter(&parent->lxdn_rwlock, RW_WRITER); + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + + error = lxd_dirdelete(parent, ldn, nm, DR_REMOVE, cr); + + rw_exit(&ldn->lxdn_rwlock); + rw_exit(&parent->lxdn_rwlock); + + ldnode_rele(ldn); + + return (error); +} + +static int +lxd_link(vnode_t *tdvp, vnode_t *vp, char *tnm, struct cred *cr, + caller_context_t *ct, int flags) +{ + return (ENOTSUP); +} + +static int +lxd_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, struct cred *cr, + caller_context_t *ct, int flags) +{ + lxd_node_t *oldparent = VTOLDN(odvp); + lxd_node_t *newparent; + lxd_mnt_t *lxdm = VTOLXDM(oldparent->lxdn_vnode); + lxd_node_t *fromnode = NULL; + int error; + int samedir = 0; + + if (!vn_matchops(ndvp, lxd_vnodeops)) { + /* cannot rename out of this file system */ + return (EACCES); + } + + mutex_enter(&lxdm->lxdm_renamelck); + + newparent = VTOLDN(ndvp); + + /* + * We can only rename front nodes. 
+ */ + error = lxd_dirlookup(oldparent, onm, &fromnode, cr); + if (error != 0) { + /* not found in front */ + mutex_exit(&lxdm->lxdm_renamelck); + return (error); + } + + /* + * Make sure we can delete the old (source) entry. This + * requires write permission on the containing directory. If + * that directory is "sticky" it requires further checks. + */ + if ((error = lxd_naccess(oldparent, VWRITE, cr)) != 0) + goto done; + + /* + * Check for renaming to or from '.' or '..' or that + * fromnode == oldparent + */ + if ((onm[0] == '.' && + (onm[1] == '\0' || (onm[1] == '.' && onm[2] == '\0'))) || + (nnm[0] == '.' && + (nnm[1] == '\0' || (nnm[1] == '.' && nnm[2] == '\0'))) || + (oldparent == fromnode)) { + error = EINVAL; + goto done; + } + + samedir = (oldparent == newparent); + + /* + * Make sure we can search and rename into the destination directory. + */ + if (!samedir) { + if ((error = lxd_naccess(newparent, VEXEC|VWRITE, cr)) != 0) + goto done; + } + + /* + * Link source to new target + */ + rw_enter(&newparent->lxdn_rwlock, RW_WRITER); + error = lxd_direnter(lxdm, newparent, nnm, DE_RENAME, + oldparent, fromnode, (struct vattr *)NULL, (lxd_node_t **)NULL, + cr, ct); + rw_exit(&newparent->lxdn_rwlock); + + if (error) + goto done; + + /* + * Unlink from source. + */ + rw_enter(&oldparent->lxdn_rwlock, RW_WRITER); + rw_enter(&fromnode->lxdn_rwlock, RW_WRITER); + + error = lxd_dirdelete(oldparent, fromnode, onm, DR_RENAME, cr); + + /* + * The following handles the case where our source node was + * removed before we got to it. 
+ */ + if (error == ENOENT) + error = 0; + + rw_exit(&fromnode->lxdn_rwlock); + rw_exit(&oldparent->lxdn_rwlock); + +done: + ldnode_rele(fromnode); + mutex_exit(&lxdm->lxdm_renamelck); + return (error); +} + +static int +lxd_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, + struct cred *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp) +{ + int error; + vnode_t *tvp; + lxd_node_t *ndir = NULL; + lxd_node_t *parent = VTOLDN(dvp); + lxd_mnt_t *lxdm = VTOLXDM(parent->lxdn_vnode); + + /* check for existence in both front and back */ + if (lxd_lookup(dvp, nm, &tvp, NULL, 0, NULL, cr, ct, NULL, NULL) == 0) { + /* The entry already exists */ + VN_RELE(tvp); + return (EEXIST); + } + + /* make front directory */ + rw_enter(&parent->lxdn_rwlock, RW_WRITER); + error = lxd_direnter(lxdm, parent, nm, DE_MKDIR, NULL, NULL, + va, &ndir, cr, ct); + rw_exit(&parent->lxdn_rwlock); + + if (error != 0) { + if (ndir != NULL) + ldnode_rele(ndir); + } else { + *vpp = LDNTOV(ndir); + } + + return (error); +} + +static int +lxd_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + *vpp = vp; + return (0); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + while (vn_matchops(vp, lxd_vnodeops)) + vp = REALVP(vp); + + if (VOP_REALVP(vp, vpp, ct) != 0) + *vpp = vp; + return (0); +} + +static int +lxd_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, struct cred *cr, + caller_context_t *ct, int flags) +{ + int error; + lxd_node_t *ldn; + struct vnode *vp; + lxd_node_t *parent = VTOLDN(dvp); + + /* + * Return error if trying to remove . or .. 
+ */ + if (strcmp(nm, ".") == 0) + return (EINVAL); + if (strcmp(nm, "..") == 0) + return (EEXIST); + + error = lxd_dirlookup(VTOLDN(dvp), nm, &ldn, cr); + if (error != 0) { + /* not found in front */ + return (error); + } + + rw_enter(&parent->lxdn_rwlock, RW_WRITER); + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + + vp = LDNTOV(ldn); + if (vp == dvp || vp == cdir) { + error = EINVAL; + goto err; + } + + if (ldn->lxdn_vnode->v_type != VDIR) { + error = ENOTDIR; + goto err; + } + + mutex_enter(&ldn->lxdn_tlock); + if (ldn->lxdn_nlink > 2) { + mutex_exit(&ldn->lxdn_tlock); + error = EEXIST; + goto err; + } + mutex_exit(&ldn->lxdn_tlock); + + /* Check for an empty directory */ + if (ldn->lxdn_dirents > 2) { + error = EEXIST; + gethrestime(&ldn->lxdn_atime); + goto err; + } + + if (vn_vfswlock(vp)) { + error = EBUSY; + goto err; + } + if (vn_mountedvfs(vp) != NULL) { + error = EBUSY; + vn_vfsunlock(vp); + goto err; + } + + error = lxd_dirdelete(parent, ldn, nm, DR_RMDIR, cr); + vn_vfsunlock(vp); + +err: + rw_exit(&ldn->lxdn_rwlock); + rw_exit(&parent->lxdn_rwlock); + ldnode_rele(ldn); + + return (error); +} + +static int +lxd_symlink(vnode_t *dvp, char *nm, struct vattr *tva, char *tnm, + struct cred *cr, caller_context_t *ct, int flags) +{ + lxd_node_t *parent = VTOLDN(dvp); + lxd_mnt_t *lxdm = VTOLXDM(parent->lxdn_vnode); + lxd_node_t *self = NULL; + vnode_t *tvp; + char *cp = NULL; + int error; + size_t len; + + /* this will check for existence in both front and back */ + if (lxd_lookup(dvp, nm, &tvp, NULL, 0, NULL, cr, ct, NULL, NULL) == 0) { + /* The entry already exists */ + VN_RELE(tvp); + return (EEXIST); + } + + /* make symlink in the front */ + rw_enter(&parent->lxdn_rwlock, RW_WRITER); + error = lxd_direnter(lxdm, parent, nm, DE_CREATE, NULL, NULL, + tva, &self, cr, ct); + rw_exit(&parent->lxdn_rwlock); + + if (error) { + if (self != NULL) + ldnode_rele(self); + return (error); + } + + len = strlen(tnm) + 1; + cp = kmem_alloc(len, KM_NOSLEEP | KM_NORMALPRI); + 
if (cp == NULL) { + ldnode_rele(self); + return (ENOSPC); + } + (void) strcpy(cp, tnm); + + self->lxdn_symlink = cp; + self->lxdn_size = len - 1; + ldnode_rele(self); + + return (error); +} + +static int +lxd_readlink(vnode_t *vp, struct uio *uiop, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + int error; + + if (vp->v_type != VLNK) + return (EINVAL); + + rw_enter(&ldn->lxdn_rwlock, RW_READER); + error = uiomove(ldn->lxdn_symlink, ldn->lxdn_size, UIO_READ, + uiop); + gethrestime(&ldn->lxdn_atime); + rw_exit(&ldn->lxdn_rwlock); + return (error); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_READLINK(vp, uiop, cr, ct)); +} + +static int +lx_merge_front(vnode_t *vp, struct uio *uiop, off_t req_off, int *eofp) +{ + lxd_node_t *ldn = VTOLDN(vp); + struct dirent *sd; + lxd_dirent_t *ldp; + enum lxd_node_type type = ldn->lxdn_type; + ssize_t uresid; + off_t front_off; + int error = 0; + int sdlen; + + /* skip the front entries if the back read was incomplete */ + if (*eofp == 0) + return (0); + + /* + * If this was a back node then reading that node has completed and we + * may have a partially full uio struct. eof should be set to true. + * Leave it set since we're likely to hit eof for the front nodes (if + * any). + */ + + front_off = uiop->uio_offset + 1; + sdlen = sizeof (struct dirent) + MAXPATHLEN; + /* zalloc to ensure we don't have anything in the d_name buffer */ + sd = (struct dirent *)kmem_zalloc(sdlen, KM_SLEEP); + ldp = ldn->lxdn_dir; + while (ldp != NULL && (uresid = uiop->uio_resid) > 0) { + int namelen; + int reclen; + + /* + * Skip dot and dotdot for back nodes since we have them + * already. 
+ */ + if (type == LXDNT_BACK && + (strcmp(ldp->lddir_name, ".") == 0 || + strcmp(ldp->lddir_name, "..") == 0)) { + ldp = ldp->lddir_next; + continue; + } + + /* + * Might have previously had a partial readdir of the front + * nodes, and now we're back for more, or we may just be + * doing a follow-up readdir after we've previously + * returned all front and back nodes. + */ + if (front_off > req_off) { + namelen = strlen(ldp->lddir_name); /* no +1 needed */ + reclen = (int)DIRENT64_RECLEN(namelen); + + /* + * If the size of the data to transfer is greater + * than that requested, then we can't do it this + * transfer. + */ + if (reclen > uresid) { + *eofp = 0; + /* Buffer too small for any entries. */ + if (front_off == 0) + error = EINVAL; + break; + } + + (void) strncpy(sd->d_name, ldp->lddir_name, + DIRENT64_NAMELEN(reclen)); + sd->d_reclen = (ushort_t)reclen; + sd->d_ino = (ino_t)ldp->lddir_node->lxdn_nodeid; + sd->d_off = front_off; + + /* uiomove will adjust iov_base properly */ + if ((error = uiomove((caddr_t)sd, reclen, UIO_READ, + uiop)) != 0) { + *eofp = 0; + break; + } + } + + /* + * uiomove() above updates both uio_resid and uio_offset by the + * same amount but we want uio_offset to change in increments + * of 1, which is different from the number of bytes being + * returned to the caller, so we set uio_offset explicitly, + * ignoring what uiomove() did. 
+ */ + uiop->uio_offset = front_off; + front_off++; + + ldp = ldp->lddir_next; + } + + kmem_free(sd, sdlen); + return (error); +} + +static int +lxd_readdir(vnode_t *vp, struct uio *uiop, struct cred *cr, int *eofp, + caller_context_t *ct, int flags) +{ + lxd_node_t *ldn = VTOLDN(vp); + vnode_t *rvp; + int res; + caddr_t base; + off_t req_off; + + if (uiop->uio_iovcnt != 1) + return (EINVAL); + + if (vp->v_type != VDIR) + return (ENOTDIR); + + base = uiop->uio_iov->iov_base; + req_off = uiop->uio_offset; + + /* First read the back node (if it is one) */ + if (ldn->lxdn_type == LXDNT_BACK) { + rvp = REALVP(vp); + res = VOP_READDIR(rvp, uiop, cr, eofp, ct, flags); + if (res != 0) + return (res); + } else { + /* setup for merge_front */ + ASSERT(ldn->lxdn_type == LXDNT_FRONT); + /* caller should have already called lxd_rwlock */ + ASSERT(RW_READ_HELD(&ldn->lxdn_rwlock)); + + *eofp = 1; + /* + * The merge code starts the offset calculation from uio_offset, + * which is normally already set to the high value by the back + * code, but in this case we need to count up from 0. + */ + uiop->uio_offset = 0; + } + + /* + * Our back nodes can also have front entries hanging on them so we + * need to merge those in. Or, we may simply have a front node (i.e. a + * front subdir). 
+ */ + res = lx_merge_front(vp, uiop, req_off, eofp); + return (res); +} + +static int +lxd_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + if (write_lock) { + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + } else { + rw_enter(&ldn->lxdn_rwlock, RW_READER); + } + return (write_lock); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_RWLOCK(vp, write_lock, ct)); +} + +static void +lxd_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + rw_exit(&ldn->lxdn_rwlock); + return; + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + VOP_RWUNLOCK(vp, write_lock, ct); +} + +static int +lxd_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_SEEK(vp, ooff, noffp, ct)); +} + +static int +lxd_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) +{ + while (vn_matchops(vp1, lxd_vnodeops) && + VTOLDN(vp1)->lxdn_type == LXDNT_BACK) { + vp1 = REALVP(vp1); + } + while (vn_matchops(vp2, lxd_vnodeops) && + VTOLDN(vp2)->lxdn_type == LXDNT_BACK) { + vp2 = REALVP(vp2); + } + + if (vn_matchops(vp1, lxd_vnodeops) || vn_matchops(vp2, lxd_vnodeops)) + return (vp1 == vp2); + + return (VOP_CMP(vp1, vp2, ct)); +} + +static int +lxd_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, offset_t offset, + struct flk_callback *flk_cbp, cred_t *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_FRLOCK(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); +} + +static int +lxd_space(vnode_t *vp, int cmd, struct 
flock64 *bfp, int flag, offset_t offset, + struct cred *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_SPACE(vp, cmd, bfp, flag, offset, cr, ct)); +} + +static int +lxd_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *prot, + struct page *parr[], size_t psz, struct seg *seg, caddr_t addr, + enum seg_rw rw, struct cred *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_GETPAGE(vp, off, len, prot, parr, psz, seg, addr, rw, cr, + ct)); +} + +static int +lxd_putpage(vnode_t *vp, offset_t off, size_t len, int flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_PUTPAGE(vp, off, len, flags, cr, ct)); +} + +static int +lxd_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, size_t len, + uchar_t prot, uchar_t maxprot, uint_t flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_MAP(vp, off, as, addrp, len, prot, maxprot, flags, cr, ct)); +} + +static int +lxd_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, size_t len, + uchar_t prot, uchar_t maxprot, uint_t flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_ADDMAP(vp, off, as, addr, len, prot, maxprot, flags, cr, + ct)); +} + +static int +lxd_delmap(vnode_t *vp, offset_t off, struct as *as, 
caddr_t addr, size_t len, + uint_t prot, uint_t maxprot, uint_t flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_DELMAP(vp, off, as, addr, len, prot, maxprot, flags, cr, + ct)); +} + +static int +lxd_poll(vnode_t *vp, short events, int anyyet, short *reventsp, + struct pollhead **phpp, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_POLL(vp, events, anyyet, reventsp, phpp, ct)); +} + +static int +lxd_dump(vnode_t *vp, caddr_t addr, offset_t bn, offset_t count, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_DUMP(vp, addr, bn, count, ct)); +} + +static int +lxd_pathconf(vnode_t *vp, int cmd, ulong_t *valp, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_PATHCONF(vp, cmd, valp, cr, ct)); +} + +static int +lxd_pageio(vnode_t *vp, struct page *pp, u_offset_t io_off, size_t io_len, + int flags, cred_t *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_PAGEIO(vp, pp, io_off, io_len, flags, cr, ct)); +} + +static void +lxd_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return; + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + if (vp != NULL && !VN_ISKAS(vp)) + 
VOP_DISPOSE(vp, pp, fl, dn, cr, ct); +} + +static int +lxd_setsecattr(vnode_t *vp, vsecattr_t *secattr, int flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + if (ldn->lxdn_type == LXDNT_FRONT) { + return (ENOSYS); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + if (vn_is_readonly(vp)) + return (EROFS); + + vp = REALVP(vp); + return (VOP_SETSECATTR(vp, secattr, flags, cr, ct)); +} + +static int +lxd_getsecattr(vnode_t *vp, vsecattr_t *secattr, int flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (ENOSYS); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_GETSECATTR(vp, secattr, flags, cr, ct)); +} + +static int +lxd_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_SHRLOCK(vp, cmd, shr, flag, cr, ct)); +} + +/* + * lxd (lx devfs) vnode operations vector. 
+ */ + +struct vnodeops *lxd_vnodeops; + +const fs_operation_def_t lxd_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = lxd_open }, + VOPNAME_CLOSE, { .vop_close = lxd_close }, + VOPNAME_READ, { .vop_read = lxd_read }, + VOPNAME_WRITE, { .vop_write = lxd_write }, + VOPNAME_IOCTL, { .vop_ioctl = lxd_ioctl }, + VOPNAME_SETFL, { .vop_setfl = lxd_setfl }, + VOPNAME_GETATTR, { .vop_getattr = lxd_getattr }, + VOPNAME_SETATTR, { .vop_setattr = lxd_setattr }, + VOPNAME_ACCESS, { .vop_access = lxd_access }, + VOPNAME_LOOKUP, { .vop_lookup = lxd_lookup }, + VOPNAME_CREATE, { .vop_create = lxd_create }, + VOPNAME_REMOVE, { .vop_remove = lxd_remove }, + VOPNAME_LINK, { .vop_link = lxd_link }, + VOPNAME_RENAME, { .vop_rename = lxd_rename }, + VOPNAME_MKDIR, { .vop_mkdir = lxd_mkdir }, + VOPNAME_RMDIR, { .vop_rmdir = lxd_rmdir }, + VOPNAME_READDIR, { .vop_readdir = lxd_readdir }, + VOPNAME_SYMLINK, { .vop_symlink = lxd_symlink }, + VOPNAME_READLINK, { .vop_readlink = lxd_readlink }, + VOPNAME_FSYNC, { .vop_fsync = lxd_fsync }, + VOPNAME_INACTIVE, { .vop_inactive = lxd_inactive }, + VOPNAME_FID, { .vop_fid = lxd_fid }, + VOPNAME_RWLOCK, { .vop_rwlock = lxd_rwlock }, + VOPNAME_RWUNLOCK, { .vop_rwunlock = lxd_rwunlock }, + VOPNAME_SEEK, { .vop_seek = lxd_seek }, + VOPNAME_CMP, { .vop_cmp = lxd_cmp }, + VOPNAME_FRLOCK, { .vop_frlock = lxd_frlock }, + VOPNAME_SPACE, { .vop_space = lxd_space }, + VOPNAME_REALVP, { .vop_realvp = lxd_realvp }, + VOPNAME_GETPAGE, { .vop_getpage = lxd_getpage }, + VOPNAME_PUTPAGE, { .vop_putpage = lxd_putpage }, + VOPNAME_MAP, { .vop_map = lxd_map }, + VOPNAME_ADDMAP, { .vop_addmap = lxd_addmap }, + VOPNAME_DELMAP, { .vop_delmap = lxd_delmap }, + VOPNAME_POLL, { .vop_poll = lxd_poll }, + VOPNAME_DUMP, { .vop_dump = lxd_dump }, + VOPNAME_DUMPCTL, { .error = fs_error }, + VOPNAME_PATHCONF, { .vop_pathconf = lxd_pathconf }, + VOPNAME_PAGEIO, { .vop_pageio = lxd_pageio }, + VOPNAME_DISPOSE, { .vop_dispose = lxd_dispose }, + VOPNAME_SETSECATTR, { 
.vop_setsecattr = lxd_setsecattr }, + VOPNAME_GETSECATTR, { .vop_getsecattr = lxd_getsecattr }, + VOPNAME_SHRLOCK, { .vop_shrlock = lxd_shrlock }, + NULL, NULL +}; diff --git a/usr/src/uts/intel/Makefile.files b/usr/src/uts/intel/Makefile.files index 6ec848acf9..337ad94679 100644 --- a/usr/src/uts/intel/Makefile.files +++ b/usr/src/uts/intel/Makefile.files @@ -105,6 +105,11 @@ LX_CGROUP_OBJS += \ cgrps_vfsops.o \ cgrps_vnops.o +LX_DEVFS_OBJS += \ + lxd_node.o \ + lxd_vfsops.o \ + lxd_vnops.o + LX_PROC_OBJS += \ lx_prsubr.o \ lx_prvfsops.o \ diff --git a/usr/src/uts/intel/Makefile.intel b/usr/src/uts/intel/Makefile.intel index b1d41d1e88..7972c61df2 100644 --- a/usr/src/uts/intel/Makefile.intel +++ b/usr/src/uts/intel/Makefile.intel @@ -545,7 +545,7 @@ SCHED_KMODS += IA RT TS RT_DPTBL TS_DPTBL FSS FX FX_DPTBL SDC FS_KMODS += autofs ctfs dcfs dev devfs fdfs fifofs hsfs hyprlofs FS_KMODS += lofs lxautofs lx_proc lxprocfs mntfs namefs nfs objfs zfs zut FS_KMODS += pcfs procfs sockfs specfs tmpfs udfs ufs sharefs lx_sysfs -FS_KMODS += smbfs bootfs lx_cgroup +FS_KMODS += smbfs bootfs lx_cgroup lx_devfs # # Streams Modules (/kernel/strmod): diff --git a/usr/src/uts/intel/lx_devfs/Makefile b/usr/src/uts/intel/lx_devfs/Makefile new file mode 100644 index 0000000000..1254f596eb --- /dev/null +++ b/usr/src/uts/intel/lx_devfs/Makefile @@ -0,0 +1,57 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. +# + +UTSBASE = ../.. 
+ +LX_CMN = $(SRC)/common/brand/lx + +MODULE = lx_devfs +OBJECTS = $(LX_DEVFS_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(LX_DEVFS_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(USR_FS_DIR)/$(MODULE) + +INC_PATH += -I$(UTSBASE)/common/brand/lx -I$(LX_CMN) + +include $(UTSBASE)/intel/Makefile.intel + +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +CFLAGS += $(CCVERBOSE) + +LDFLAGS += -dy -Nbrand/lx_brand + +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +include $(UTSBASE)/intel/Makefile.targ + +include $(UTSBASE)/intel/lx_devfs/Makefile.rules diff --git a/usr/src/uts/intel/lx_devfs/Makefile.rules b/usr/src/uts/intel/lx_devfs/Makefile.rules new file mode 100644 index 0000000000..4b9748314c --- /dev/null +++ b/usr/src/uts/intel/lx_devfs/Makefile.rules @@ -0,0 +1,21 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. All rights reserved. +# + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/brand/lx/devfs/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/brand/lx/devfs/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) |