Diffstat (limited to 'usr/src/uts/common/fs/tmpfs')

 usr/src/uts/common/fs/tmpfs/tmp_dir.c    |  61 ++--
 usr/src/uts/common/fs/tmpfs/tmp_subr.c   | 162 +++++--
 usr/src/uts/common/fs/tmpfs/tmp_tnode.c  |  70 ++-
 usr/src/uts/common/fs/tmpfs/tmp_vfsops.c | 278 ++++++++---
 usr/src/uts/common/fs/tmpfs/tmp_vnops.c  |  99 ++--
 5 files changed, 521 insertions(+), 149 deletions(-)
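Most of the hunks below are mechanical conversions from the old tmpfs_maxkmem/tmp_kmemspace kmem accounting to byte-granular swap accounting through the new tmp_kmem_zalloc()/tmp_kmem_free() wrappers, plus forced-unmount (MS_FORCE) support built on VFS_FREEVFS() and a new "mode" mount option. A recurring idiom in the new limit checks is the unsigned-overflow guard: for unsigned integers, a + b < a holds exactly when the addition wrapped. A standalone sketch of that idiom (the values are invented for illustration and are not from the patch):

	#include <stdio.h>
	#include <stdint.h>

	/* Returns 1 if a + b would wrap around SIZE_MAX, else 0. */
	static int
	add_would_overflow(size_t a, size_t b)
	{
		return (a + b < a);
	}

	int
	main(void)
	{
		/* usage near the limit versus a small request */
		printf("%d\n", add_would_overflow(SIZE_MAX - 10, 5));	/* 0 */
		printf("%d\n", add_would_overflow(SIZE_MAX - 10, 11));	/* 1 */
		return (0);
	}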
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_dir.c b/usr/src/uts/common/fs/tmpfs/tmp_dir.c
index f6621c8097..1a620642cc 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_dir.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_dir.c
@@ -21,10 +21,9 @@
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/sysmacros.h>
@@ -383,20 +382,7 @@ tdirenter(
 			/*
 			 * Unmake the inode we just made.
 			 */
-			rw_enter(&tp->tn_rwlock, RW_WRITER);
-			if ((tp->tn_type) == VDIR) {
-				ASSERT(tdp == NULL);
-				/*
-				 * cleanup allocs made by tdirinit()
-				 */
-				tdirtrunc(tp);
-			}
-			mutex_enter(&tp->tn_tlock);
-			tp->tn_nlink = 0;
-			mutex_exit(&tp->tn_tlock);
-			gethrestime(&tp->tn_ctime);
-			rw_exit(&tp->tn_rwlock);
-			tmpnode_rele(tp);
+			tmpnode_cleanup(tp);
 			tp = NULL;
 		}
 	} else if (tpp) {
@@ -431,6 +417,7 @@ tdirdelete(
 	enum dr_op op,
 	struct cred *cred)
 {
+	struct tmount *tm;
 	struct tdirent *tpdp;
 	int error;
 	size_t namelen;
@@ -516,7 +503,8 @@ tdirdelete(
 	 */
 	namelen = strlen(tpdp->td_name) + 1;
 
-	tmp_memfree(tpdp, sizeof (struct tdirent) + namelen);
+	tm = TNTOTM(dir);
+	tmp_kmem_free(tm, tpdp, sizeof (struct tdirent) + namelen);
 	dir->tn_size -= (sizeof (struct tdirent) + namelen);
 	dir->tn_dirents--;
@@ -538,19 +526,27 @@ tdirdelete(
  * tdirinit is used internally to initialize a directory (dir)
  * with '.' and '..' entries without checking permissions and locking
  */
-void
+int
 tdirinit(
 	struct tmpnode *parent,	/* parent of directory to initialize */
 	struct tmpnode *dir)	/* the new directory */
 {
+	struct tmount *tm;
 	struct tdirent *dot, *dotdot;
 	timestruc_t now;
 
 	ASSERT(RW_WRITE_HELD(&parent->tn_rwlock));
 	ASSERT(dir->tn_type == VDIR);
 
-	dot = tmp_memalloc(sizeof (struct tdirent) + 2, TMP_MUSTHAVE);
-	dotdot = tmp_memalloc(sizeof (struct tdirent) + 3, TMP_MUSTHAVE);
+	tm = TNTOTM(parent);
+	dot = tmp_kmem_zalloc(tm, sizeof (struct tdirent) + 2, KM_SLEEP);
+	if (dot == NULL)
+		return (ENOSPC);
+	dotdot = tmp_kmem_zalloc(tm, sizeof (struct tdirent) + 3, KM_SLEEP);
+	if (dotdot == NULL) {
+		tmp_kmem_free(tm, dot, sizeof (struct tdirent) + 2);
+		return (ENOSPC);
+	}
 
 	/*
 	 * Initialize the entries
@@ -601,6 +597,8 @@ tdirinit(
 	dir->tn_size = 2 * sizeof (struct tdirent) + 5;	/* dot and dotdot */
 	dir->tn_dirents = 2;
 	dir->tn_nlink = 2;
+
+	return (0);
 }
 
 
@@ -612,6 +610,7 @@ tdirtrunc(struct tmpnode *dir)
 {
 	struct tdirent *tdp;
 	struct tmpnode *tp;
+	struct tmount *tm;
 	size_t namelen;
 	timestruc_t now;
 	int isvattrdir, isdotdot, skip_decr;
@@ -619,6 +618,8 @@ tdirtrunc(struct tmpnode *dir)
 	ASSERT(RW_WRITE_HELD(&dir->tn_rwlock));
 	ASSERT(dir->tn_type == VDIR);
 
+	tm = TNTOTM(dir);
+
 	isvattrdir = (dir->tn_vnode->v_flag & V_XATTRDIR) ? 1 : 0;
 	for (tdp = dir->tn_dir; tdp; tdp = dir->tn_dir) {
 		ASSERT(tdp->td_next != tdp);
@@ -650,7 +651,7 @@ tdirtrunc(struct tmpnode *dir)
 
 		tmpfs_hash_out(tdp);
 
-		tmp_memfree(tdp, sizeof (struct tdirent) + namelen);
+		tmp_kmem_free(tm, tdp, sizeof (struct tdirent) + namelen);
 		dir->tn_size -= (sizeof (struct tdirent) + namelen);
 		dir->tn_dirents--;
 	}
@@ -903,6 +904,7 @@ tdiraddentry(
 	enum de_op op,
 	struct tmpnode *fromtp)
 {
+	struct tmount *tm;
 	struct tdirent *tdp, *tpdp;
 	size_t namelen, alloc_size;
 	timestruc_t now;
@@ -923,9 +925,10 @@ tdiraddentry(
 	/*
 	 * Allocate and initialize directory entry
 	 */
+	tm = TNTOTM(dir);
 	namelen = strlen(name) + 1;
 	alloc_size = namelen + sizeof (struct tdirent);
-	tdp = tmp_memalloc(alloc_size, 0);
+	tdp = tmp_kmem_zalloc(tm, alloc_size, KM_NOSLEEP | KM_NORMALPRI);
 	if (tdp == NULL)
 		return (ENOSPC);
 
@@ -1025,7 +1028,10 @@ tdirmaketnode(
 	    ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime)))
 		return (EOVERFLOW);
 	type = va->va_type;
-	tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE);
+	tp = tmp_kmem_zalloc(tm, sizeof (struct tmpnode), KM_SLEEP);
+	if (tp == NULL) {
+		return (ENOSPC);
+	}
 	tmpnode_init(tm, tp, va, cred);
 
 	/* setup normal file/dir's extended attribute directory */
@@ -1087,8 +1093,13 @@ tdirmaketnode(
 	if (va->va_mask & AT_MTIME)
 		tp->tn_mtime = va->va_mtime;
 
-	if (op == DE_MKDIR)
-		tdirinit(dir, tp);
+	if (op == DE_MKDIR) {
+		int ret;
+		if ((ret = tdirinit(dir, tp)) != 0) {
+			tmpnode_cleanup(tp);
+			return (ret);
+		}
+	}
 
 	*newnode = tp;
 	return (0);
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_subr.c b/usr/src/uts/common/fs/tmpfs/tmp_subr.c
index 8723631555..150ce2a220 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_subr.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_subr.c
@@ -20,7 +20,7 @@
 */
/*
 * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2016 Joyent, Inc.
 */
 
 #include <sys/types.h>
@@ -43,6 +43,7 @@
 #include <sys/fs/tmpnode.h>
 #include <sys/ddi.h>
 #include <sys/sunddi.h>
+#include <vm/anon.h>
 
 #define	KILOBYTE	1024
 #define	MEGABYTE	(1024 * KILOBYTE)
@@ -54,6 +55,80 @@
 
 extern pgcnt_t swapfs_minfree;
 
+void *
+tmp_kmem_zalloc(struct tmount *tm, size_t size, int flag)
+{
+	void *buf;
+	zone_t *zone;
+	size_t pages;
+
+	mutex_enter(&tm->tm_contents);
+	zone = tm->tm_vfsp->vfs_zone;
+	if (tm->tm_anonmem + size > tm->tm_anonmax ||
+	    tm->tm_anonmem + size < tm->tm_anonmem ||
+	    size + ptob(tmpfs_minfree) <= size ||
+	    !anon_checkspace(size + ptob(tmpfs_minfree), zone)) {
+		mutex_exit(&tm->tm_contents);
+		return (NULL);
+	}
+
+	/*
+	 * Only make anonymous memory reservations when a page boundary is
+	 * crossed. This is necessary since the anon_resv functions round up
+	 * to PAGESIZE internally.
+	 */
+	pages = btopr(tm->tm_allocmem + size);
+	pages -= btopr(tm->tm_allocmem);
+	if (pages > 0 && anon_try_resv_zone(ptob(pages), zone) == 0) {
+		mutex_exit(&tm->tm_contents);
+		return (NULL);
+	}
+
+	tm->tm_allocmem += size;
+	tm->tm_anonmem += size;
+	mutex_exit(&tm->tm_contents);
+
+	buf = kmem_zalloc(size, flag);
+	if (buf == NULL) {
+		mutex_enter(&tm->tm_contents);
+		ASSERT(tm->tm_anonmem > tm->tm_anonmem - size);
+		tm->tm_anonmem -= size;
+		if (pages > 0) {
+			/*
+			 * Re-chasing the zone pointer is necessary since a
+			 * forced umount could have been performed while the
+			 * tm_contents lock was dropped during allocation.
+			 */
+			anon_unresv_zone(ptob(pages), tm->tm_vfsp->vfs_zone);
+		}
+		mutex_exit(&tm->tm_contents);
+	}
+
+	return (buf);
+}
+
+void
+tmp_kmem_free(struct tmount *tm, void *buf, size_t size)
+{
+	size_t pages;
+
+	kmem_free(buf, size);
+	mutex_enter(&tm->tm_contents);
+	ASSERT(tm->tm_anonmem > tm->tm_anonmem - size);
+	tm->tm_anonmem -= size;
+	pages = btopr(tm->tm_allocmem);
+	tm->tm_allocmem -= size;
+	pages -= btopr(tm->tm_allocmem);
+	/*
+	 * Like the tmp_kmem_zalloc case, only unreserve anonymous memory when
+	 * a page boundary has been crossed.
+	 */
+	if (pages > 0) {
+		anon_unresv_zone(size, tm->tm_vfsp->vfs_zone);
+	}
+	mutex_exit(&tm->tm_contents);
+}
+
 int
 tmp_taccess(void *vtp, int mode, struct cred *cred)
 {
@@ -99,42 +174,8 @@ tmp_sticky_remove_access(struct tmpnode *dir, struct tmpnode *entry,
 }
 
 /*
- * Allocate zeroed memory if tmpfs_maxkmem has not been exceeded
- * or the 'musthave' flag is set.  'musthave' allocations should
- * always be subordinate to normal allocations so that tmpfs_maxkmem
- * can't be exceeded by more than a few KB.  Example: when creating
- * a new directory, the tmpnode is a normal allocation; if that
- * succeeds, the dirents for "." and ".." are 'musthave' allocations.
- */
-void *
-tmp_memalloc(size_t size, int musthave)
-{
-	static time_t last_warning;
-	time_t now;
-
-	if (atomic_add_long_nv(&tmp_kmemspace, size) < tmpfs_maxkmem ||
-	    musthave)
-		return (kmem_zalloc(size, KM_SLEEP));
-
-	atomic_add_long(&tmp_kmemspace, -size);
-	now = gethrestime_sec();
-	if (last_warning != now) {
-		last_warning = now;
-		cmn_err(CE_WARN, "tmp_memalloc: tmpfs over memory limit");
-	}
-	return (NULL);
-}
-
-void
-tmp_memfree(void *cp, size_t size)
-{
-	kmem_free(cp, size);
-	atomic_add_long(&tmp_kmemspace, -size);
-}
-
-/*
- * Convert a string containing a number (number of bytes) to a pgcnt_t,
- * containing the corresponding number of pages. On 32-bit kernels, the
+ * Convert a string containing a number (number of bytes) to a size_t,
+ * containing the corresponding number of bytes. On 32-bit kernels, the
  * maximum value encoded in 'str' is PAGESIZE * ULONG_MAX, while the value
  * returned in 'maxpg' is at most ULONG_MAX.
 *
@@ -152,7 +193,7 @@ tmp_memfree(void *cp, size_t size)
 * error.
 */
 int
-tmp_convnum(char *str, pgcnt_t *maxpg)
+tmp_convnum(char *str, size_t *maxbytes)
 {
 	u_longlong_t num = 0;
 #ifdef _LP64
 	u_longlong_t max_bytes = ULONG_MAX;
@@ -160,6 +201,7 @@ tmp_convnum(char *str, pgcnt_t *maxpg)
 #else
 	u_longlong_t max_bytes = PAGESIZE * (uint64_t)ULONG_MAX;
 #endif
+	size_t pages;
 	char *c;
 	const struct convchar {
 		char *cc_char;
@@ -250,13 +292,47 @@ valid_char:
 
 done:
 	/*
-	 * Since btopr() rounds up to page granularity, this round-up can
-	 * cause an overflow only if 'num' is between (max_bytes - PAGESIZE)
-	 * and (max_bytes). In this case the resulting number is zero, which
-	 * is what we check for below.
+	 * We've been given a size in bytes; however, we want to make sure that
+	 * we have at least one page worth no matter what. Therefore we use
+	 * btopr to round up. However, this may cause an overflow only if 'num'
+	 * is between (max_bytes - PAGESIZE) and (max_bytes). In this case the
+	 * resulting number is zero, which is what we check for below. Note
+	 * that we require at least one page, so if pages is zero, it wasn't
+	 * going to work anyway.
 	 */
-	if ((*maxpg = (pgcnt_t)btopr(num)) == 0 && num != 0)
+	pages = btopr(num);
+	if (pages == 0) {
 		return (EINVAL);
+	}
+
+	*maxbytes = ptob(pages);
+
+	return (0);
+}
+
+/*
+ * Parse an octal mode string for use as the permissions set for the root
+ * of the tmpfs mount.
+ */
+int
+tmp_convmode(char *str, mode_t *mode)
+{
+	ulong_t num;
+	char *c;
+
+	if (str == NULL) {
+		return (EINVAL);
+	}
+
+	if (ddi_strtoul(str, &c, 8, &num) != 0) {
+		return (EINVAL);
+	}
+
+	if ((num & ~VALIDMODEBITS) != 0) {
+		return (EINVAL);
+	}
+
+	*mode = VALIDMODEBITS & num;
 	return (0);
 }
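tmp_kmem_zalloc() and tmp_kmem_free() above only reserve and unreserve anonymous memory when the running byte total tm_allocmem crosses a page boundary, because the anon reservation layer works in whole pages. A userland sketch of just that arithmetic, with btopr()/ptob() modeled here for an assumed 4096-byte page (everything else is invented for illustration):

	#include <stdio.h>
	#include <stddef.h>

	#define	PAGESIZE	4096UL

	/* Round a byte count up to pages (btopr) and back to bytes (ptob). */
	static size_t
	btopr(size_t bytes)
	{
		return ((bytes + PAGESIZE - 1) / PAGESIZE);
	}

	static size_t
	ptob(size_t pages)
	{
		return (pages * PAGESIZE);
	}

	int
	main(void)
	{
		size_t allocmem = 0;	/* running total of allocated bytes */
		size_t sizes[] = { 100, 3000, 2000, 100 };

		for (int i = 0; i < 4; i++) {
			/* pages newly crossed, as in tmp_kmem_zalloc */
			size_t pages = btopr(allocmem + sizes[i]) -
			    btopr(allocmem);
			if (pages > 0) {
				/* anon_try_resv_zone(ptob(pages)) would run here */
				printf("alloc %zu: reserve %zu bytes\n",
				    sizes[i], ptob(pages));
			} else {
				printf("alloc %zu: no new reservation\n",
				    sizes[i]);
			}
			allocmem += sizes[i];
		}
		return (0);
	}

Running this shows that the 100- and 3000-byte allocations share one reserved page, and only the allocation that spills into a second page triggers another reservation.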
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_tnode.c b/usr/src/uts/common/fs/tmpfs/tmp_tnode.c
index 51e57b2611..13ea356924 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_tnode.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_tnode.c
@@ -21,6 +21,7 @@
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
  */
 
 #include <sys/types.h>
@@ -64,21 +65,35 @@ tmp_resv(
 	int pagecreate)		/* call anon_resv if set */
 {
 	pgcnt_t pages = btopr(delta);
+	size_t pbytes = ptob(pages);
 	zone_t *zone;
 
 	ASSERT(RW_WRITE_HELD(&tp->tn_rwlock));
 	ASSERT(tp->tn_type == VREG);
+
 	/*
-	 * pagecreate is set only if we actually need to call anon_resv
-	 * to reserve an additional page of anonymous memory.
-	 * Since anon_resv always reserves a page at a time,
-	 * it should only get called when we know we're growing the
-	 * file into a new page or filling a hole.
+	 * pagecreate is set only if we actually need to call anon_resv to
+	 * reserve an additional page of anonymous memory. Since anon_resv
+	 * always reserves a page at a time, it should only get called when we
+	 * know we're growing the file into a new page or filling a hole. This
+	 * is why we transform delta into a number of pages. However, because
+	 * we track bytes and not pages, we convert that back to a number of
+	 * bytes that we allocate against.
 	 *
-	 * Deny if trying to reserve more than tmpfs can allocate
+	 * Deny if trying to reserve more than tmpfs can allocate, the
+	 * allocation causes an overflow, or the delta round-up overflowed.
+	 * Note that btopr rounds up, so we need to catch the unsigned
+	 * overflow: rounding up when we are within a page of SIZE_MAX is
+	 * done by adding a page, overflowing, which will then be rounded back
+	 * to zero. Hence the following check.
 	 */
+	if (pages == 0 && delta != 0)
+		return (1);
+
 	zone = tm->tm_vfsp->vfs_zone;
-	if (pagecreate && ((tm->tm_anonmem + pages > tm->tm_anonmax) ||
+	if (pagecreate && ((tm->tm_anonmem + pbytes > tm->tm_anonmax) ||
+	    (tm->tm_anonmem + pbytes < tm->tm_anonmem) ||
+	    (ptob(pages + tmpfs_minfree) <= pbytes) ||
 	    (!anon_checkspace(ptob(pages + tmpfs_minfree), zone)) ||
 	    (anon_try_resv_zone(delta, zone) == 0))) {
 		return (1);
@@ -89,7 +104,7 @@ tmp_resv(
 	 */
 	if (pagecreate) {
 		mutex_enter(&tm->tm_contents);
-		tm->tm_anonmem += pages;
+		tm->tm_anonmem += pbytes;
 		mutex_exit(&tm->tm_contents);
 
 		TRACE_2(TR_FAC_VM, TR_ANON_TMPFS, "anon tmpfs:%p %lu",
@@ -110,13 +125,27 @@ tmp_unresv(
 	struct tmpnode *tp,
 	size_t delta)
 {
+	size_t pages, pbytes;
+
 	ASSERT(RW_WRITE_HELD(&tp->tn_rwlock));
 	ASSERT(tp->tn_type == VREG);
 
+	/*
+	 * If this is true, we have a grievous overflow bug and some size
+	 * accounting has been messed with, as having an amount to truncate at
+	 * this size would imply that all of memory was used for this file. No
+	 * matter how small the kernel, it will always need at least one page.
+	 */
+	pages = btopr(delta);
+	if (pages == 0 && delta != 0)
+		panic("tmpfs unsigned overflow detected");
+	pbytes = ptob(pages);
+
 	anon_unresv_zone(delta, tm->tm_vfsp->vfs_zone);
 
 	mutex_enter(&tm->tm_contents);
-	tm->tm_anonmem -= btopr(delta);
+	ASSERT(tm->tm_anonmem > tm->tm_anonmem - pbytes);
+	tm->tm_anonmem -= pbytes;
 	mutex_exit(&tm->tm_contents);
 
 	TRACE_2(TR_FAC_VM, TR_ANON_TMPFS, "anon tmpfs:%p %lu", tp, delta);
@@ -154,6 +183,26 @@ tmpnode_growmap(struct tmpnode *tp, ulong_t newsize)
 }
 
 /*
+ * This is used to clean up a tmpnode that hasn't made it out the door. In
+ * other words, we allocated it and did a tmpnode_init; however, before it
+ * could get fully inserted into a directory, bad things happened and it
+ * failed.
+ */
+void
+tmpnode_cleanup(struct tmpnode *tp)
+{
+	rw_enter(&tp->tn_rwlock, RW_WRITER);
+	if ((tp->tn_type) == VDIR) {
+		tdirtrunc(tp);
+	}
+	mutex_enter(&tp->tn_tlock);
+	tp->tn_nlink = 0;
+	mutex_exit(&tp->tn_tlock);
+	gethrestime(&tp->tn_ctime);
+	rw_exit(&tp->tn_rwlock);
+	tmpnode_rele(tp);
+}
+
+/*
  * Initialize a tmpnode and add it to file list under mount point.
  */
 void
@@ -232,7 +281,6 @@ tmpnode_trunc(
 {
 	size_t oldsize = tp->tn_size;
 	size_t delta;
-	struct vnode *vp = TNTOV(tp);
 	timestruc_t now;
 	int error = 0;
@@ -316,7 +364,7 @@ tmpnode_trunc(
 		/* Delete anon array for tmpnode */
 		ASSERT(tp->tn_nblocks == 0);
 		ASSERT(anon_get_ptr(tp->tn_anon, 0) == NULL);
-		ASSERT(!vn_has_cached_data(vp));
+		ASSERT(!vn_has_cached_data(TNTOV(tp)));
 
 		anon_release(tp->tn_anon, tp->tn_asize);
 		tp->tn_anon = NULL;
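tmp_resv() and tmp_unresv() above lean on a property of btopr(): rounding up a byte count that lies within one page of SIZE_MAX wraps past zero, so the result is 0 even though delta is nonzero. A sketch of that detection, using the same modeled btopr() as before:

	#include <stdio.h>
	#include <stdint.h>

	#define	PAGESIZE	4096UL

	static size_t
	btopr(size_t bytes)
	{
		/* bytes + PAGESIZE - 1 wraps for values near SIZE_MAX */
		return ((bytes + PAGESIZE - 1) / PAGESIZE);
	}

	int
	main(void)
	{
		size_t deltas[] = { 0, 1, PAGESIZE, SIZE_MAX - 100, SIZE_MAX };

		for (int i = 0; i < 5; i++) {
			size_t pages = btopr(deltas[i]);
			/* the tmp_resv rejection / tmp_unresv panic condition */
			if (pages == 0 && deltas[i] != 0)
				printf("delta %zu: overflow detected\n",
				    deltas[i]);
			else
				printf("delta %zu: %zu page(s)\n",
				    deltas[i], pages);
		}
		return (0);
	}

The last two entries round up to zero pages, which is exactly the condition tmp_resv() rejects and tmp_unresv() treats as a fatal accounting bug.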
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c
index f0e0c54d3e..cef582ab86 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c
@@ -20,7 +20,7 @@
 */
/*
 * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2016 Joyent, Inc.
 */
 
 #include <sys/types.h>
@@ -56,6 +56,15 @@
 static int tmpfsfstype;
 
 /*
+ * tmpfs_mountcount is used to prevent module unloads while there is still
+ * state from a former mount hanging around. With forced umount support, the
+ * filesystem module must not be allowed to go away before the last
+ * VFS_FREEVFS() call has been made. Since this is just an atomic counter,
+ * there's no need for locking.
+ */
+static uint32_t tmpfs_mountcount;
+
+/*
 * tmpfs vfs operations.
 */
 static int tmpfsinit(int, char *);
@@ -65,6 +74,7 @@ static int tmp_unmount(struct vfs *, int, struct cred *);
 static int tmp_root(struct vfs *, struct vnode **);
 static int tmp_statvfs(struct vfs *, struct statvfs64 *);
 static int tmp_vget(struct vfs *, struct vnode **, struct fid *);
+static void tmp_freevfs(vfs_t *vfsp);
 
 /*
 * Loadable module wrapper
@@ -123,6 +133,14 @@ _fini()
 {
 	int error;
 
+	/*
+	 * If a forcibly unmounted instance is still hanging around, we cannot
+	 * allow the module to be unloaded because that would cause panics
+	 * once the VFS framework decides it's time to call into VFS_FREEVFS().
+	 */
+	if (tmpfs_mountcount)
+		return (EBUSY);
+
 	error = mod_remove(&modlinkage);
 	if (error)
 		return (error);
@@ -141,14 +159,6 @@ _info(struct modinfo *modinfop)
 }
 
 /*
- * The following are patchable variables limiting the amount of system
- * resources tmpfs can use.
- *
- * tmpfs_maxkmem limits the amount of kernel kmem_alloc memory
- * tmpfs can use for it's data structures (e.g. tmpnodes, directory entries)
- * It is not determined by setting a hard limit but rather as a percentage of
- * physical memory which is determined when tmpfs is first used in the system.
- *
 * tmpfs_minfree is the minimum amount of swap space that tmpfs leaves for
 * the rest of the system. In other words, if the amount of free swap space
 * in the system (i.e. anoninfo.ani_free) drops below tmpfs_minfree, tmpfs
@@ -157,9 +167,7 @@ _info(struct modinfo *modinfop)
 * There is also a per mount limit on the amount of swap space
 * (tmount.tm_anonmax) settable via a mount option.
 */
-size_t tmpfs_maxkmem = 0;
 size_t tmpfs_minfree = 0;
-size_t tmp_kmemspace;	/* bytes of kernel heap used by all tmpfs */
 
 static major_t tmpfs_major;
 static minor_t tmpfs_minor;
@@ -178,6 +186,7 @@ tmpfsinit(int fstype, char *name)
 		VFSNAME_ROOT,		{ .vfs_root = tmp_root },
 		VFSNAME_STATVFS,	{ .vfs_statvfs = tmp_statvfs },
 		VFSNAME_VGET,		{ .vfs_vget = tmp_vget },
+		VFSNAME_FREEVFS,	{ .vfs_freevfs = tmp_freevfs },
 		NULL,			NULL
 	};
 	int error;
@@ -212,18 +221,12 @@ tmpfsinit(int fstype, char *name)
 		tmpfs_minfree = btopr(TMPMINFREE);
 	}
 
-	/*
-	 * The maximum amount of space tmpfs can allocate is
-	 * TMPMAXPROCKMEM percent of kernel memory
-	 */
-	if (tmpfs_maxkmem == 0)
-		tmpfs_maxkmem = MAX(PAGESIZE, kmem_maxavail() / TMPMAXFRACKMEM);
-
 	if ((tmpfs_major = getudev()) == (major_t)-1) {
 		cmn_err(CE_WARN, "tmpfsinit: Can't get unique device number.");
 		tmpfs_major = 0;
 	}
 	mutex_init(&tmpfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
+	tmpfs_mountcount = 0;
 	return (0);
 }
 
@@ -234,7 +237,7 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 	struct tmpnode *tp;
 	struct pathname dpn;
 	int error;
-	pgcnt_t anonmax;
+	size_t anonmax;
 	struct vattr rattr;
 	int got_attrs;
 	boolean_t mode_arg = B_FALSE;
@@ -278,7 +281,18 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 		if ((error = tmp_convnum(argstr, &anonmax)) != 0)
 			goto out;
 	} else {
-		anonmax = ULONG_MAX;
+		anonmax = SIZE_MAX;
+	}
+
+	/*
+	 * The "mode" mount argument allows the operator to override the
+	 * permissions of the root of the tmpfs mount.
+	 */
+	if (vfs_optionisset(vfsp, "mode", &argstr)) {
+		if ((error = tmp_convmode(argstr, &root_mode)) != 0) {
+			goto out;
+		}
+		mode_arg = B_TRUE;
 	}
 
 	/*
@@ -311,7 +325,8 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 		goto out;
 	}
 
-	if ((tm = tmp_memalloc(sizeof (struct tmount), 0)) == NULL) {
+	if ((tm = kmem_zalloc(sizeof (struct tmount),
+	    KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
 		pn_free(&dpn);
 		error = ENOMEM;
 		goto out;
@@ -343,17 +358,37 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 	vfsp->vfs_bsize = PAGESIZE;
 	vfsp->vfs_flag |= VFS_NOTRUNC;
 	vfs_make_fsid(&vfsp->vfs_fsid, tm->tm_dev, tmpfsfstype);
-	tm->tm_mntpath = tmp_memalloc(dpn.pn_pathlen + 1, TMP_MUSTHAVE);
+	tm->tm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP);
 	(void) strcpy(tm->tm_mntpath, dpn.pn_path);
 
 	/*
+	 * Preemptively set vfs_zone before any of the tmp_kmem_* functions are
+	 * called. That field is not populated until after a successful
+	 * VFS_MOUNT when domount() sets vfsp metadata via vfs_add(). An
+	 * accurate value is required for proper swap usage accounting.
+	 */
+	ASSERT0(uap->flags & MS_REMOUNT);
+	ASSERT(vfsp->vfs_zone == NULL);
+	vfsp->vfs_zone = curproc->p_zone;
+
+	/*
 	 * allocate and initialize root tmpnode structure
 	 */
 	bzero(&rattr, sizeof (struct vattr));
 	rattr.va_mode = (mode_t)(S_IFDIR | root_mode);
 	rattr.va_type = VDIR;
 	rattr.va_rdev = 0;
-	tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE);
+	tp = tmp_kmem_zalloc(tm, sizeof (struct tmpnode), KM_SLEEP);
+	if (tp == NULL) {
+		kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1);
+		mutex_destroy(&tm->tm_contents);
+		mutex_destroy(&tm->tm_renamelck);
+		kmem_free(tm, sizeof (struct tmount));
+
+		pn_free(&dpn);
+		error = ENOMEM;
+		goto out;
+	}
 	tmpnode_init(tm, tp, &rattr, cr);
 
 	/*
@@ -392,12 +427,34 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 	tp->tn_nlink = 0;
 	tm->tm_rootnode = tp;
 
-	tdirinit(tp, tp);
+	if (tdirinit(tp, tp) != 0) {
+		/*
+		 * While we would normally let our VOP_INACTIVE function take
+		 * care of cleaning up here, we're in a bit of a delicate
+		 * situation, so we do so manually. While it's tempting to try
+		 * and rely upon tmp_freevfs() and others, it's probably safer
+		 * for the time being to do this manually at the cost of
+		 * duplication.
+		 */
+		vn_invalid(TNTOV(tp));
+		rw_destroy(&tp->tn_rwlock);
+		mutex_destroy(&tp->tn_tlock);
+		vn_free(TNTOV(tp));
+		tmp_kmem_free(tm, tp, sizeof (struct tmpnode));
+
+		kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1);
+		mutex_destroy(&tm->tm_contents);
+		mutex_destroy(&tm->tm_renamelck);
+		kmem_free(tm, sizeof (struct tmount));
+		pn_free(&dpn);
+		error = ENOMEM;
+		goto out;
+	}
 
 	rw_exit(&tp->tn_rwlock);
 
 	pn_free(&dpn);
 	error = 0;
+	atomic_inc_32(&tmpfs_mountcount);
 
 out:
 	if (error == 0)
@@ -413,36 +470,107 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
 	struct tmpnode *tnp, *cancel;
 	struct vnode	*vp;
 	int error;
+	uint_t cnt;
+	int i;
 
 	if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0)
 		return (error);
 
-	/*
-	 * forced unmount is not supported by this file system
-	 * and thus, ENOTSUP, is being returned.
-	 */
-	if (flag & MS_FORCE)
-		return (ENOTSUP);
-
 	mutex_enter(&tm->tm_contents);
 
 	/*
-	 * If there are no open files, only the root node should have
-	 * a reference count.
+	 * In the normal unmount case (non-forced unmount), if there are no
+	 * open files, only the root node should have a reference count.
+	 *
 	 * With tm_contents held, nothing can be added or removed.
 	 * There may be some dirty pages.  To prevent fsflush from
 	 * disrupting the unmount, put a hold on each node while scanning.
 	 * If we find a previously referenced node, undo the holds we have
 	 * placed and fail EBUSY.
+	 *
+	 * However, in the case of a forced umount, things are a bit different.
+	 * An additional VFS_HOLD is added for each outstanding VN_HOLD to
+	 * ensure that the file system is not cleaned up (tmp_freevfs) until
+	 * the last vfs hold is dropped. This happens in tmp_inactive as the
+	 * vnodes are released. Also, we can't add an additional VN_HOLD in
+	 * this case since that would prevent tmp_inactive from ever being
+	 * called. Finally, we do need to drop the zone ref now (zone_rele_ref)
+	 * so that the zone is not blocked waiting for the final file system
+	 * cleanup.
 	 */
 	tnp = tm->tm_rootnode;
-	if (TNTOV(tnp)->v_count > 1) {
+
+	vp = TNTOV(tnp);
+	mutex_enter(&vp->v_lock);
+	cnt = vp->v_count;
+	if (flag & MS_FORCE) {
+		vfsp->vfs_flag |= VFS_UNMOUNTED;
+		/* Extra hold which we rele below when we drop the zone ref */
+		VFS_HOLD(vfsp);
+
+		for (i = 1; i < cnt; i++)
+			VFS_HOLD(vfsp);
+
+		/* drop the mutex now because no one can find this mount */
+		mutex_exit(&tm->tm_contents);
+	} else if (cnt > 1) {
+		mutex_exit(&vp->v_lock);
 		mutex_exit(&tm->tm_contents);
 		return (EBUSY);
 	}
+	mutex_exit(&vp->v_lock);
 
+	/*
+	 * Check for open files. An open file causes everything to unwind
+	 * unless this is a forced umount.
+	 */
 	for (tnp = tnp->tn_forw; tnp; tnp = tnp->tn_forw) {
-		if ((vp = TNTOV(tnp))->v_count > 0) {
+		vp = TNTOV(tnp);
+		mutex_enter(&vp->v_lock);
+		cnt = vp->v_count;
+		if (flag & MS_FORCE) {
+			for (i = 0; i < cnt; i++)
+				VFS_HOLD(vfsp);
+
+			/*
+			 * In the case of a forced umount don't add an
+			 * additional VN_HOLD on the already held vnodes, like
+			 * we do in the non-forced unmount case. If the
+			 * cnt > 0, then the vnode already has at least one
+			 * hold and we need tmp_inactive to get called when the
+			 * last pre-existing hold on the node is released so
+			 * that we can VFS_RELE the VFS holds we just added.
+			 */
+			if (cnt == 0) {
+				/* directly add VN_HOLD since have the lock */
+				vp->v_count++;
+			}
+
+			mutex_exit(&vp->v_lock);
+
+			/*
+			 * If the tmpnode has any pages associated with it
+			 * (i.e. if it's a normal file with non-zero size), the
+			 * tmpnode could still be discovered by pageout or
+			 * fsflush via the page vnode pointers. To prevent this
+			 * from interfering with tmp_freevfs, truncate the
+			 * tmpnode now.
+			 */
+			if (tnp->tn_size != 0 && tnp->tn_type == VREG) {
+				rw_enter(&tnp->tn_rwlock, RW_WRITER);
+				rw_enter(&tnp->tn_contents, RW_WRITER);
+
+				(void) tmpnode_trunc(tm, tnp, 0);
+
+				rw_exit(&tnp->tn_contents);
+				rw_exit(&tnp->tn_rwlock);
+
+				ASSERT(tnp->tn_size == 0);
+				ASSERT(tnp->tn_nblocks == 0);
+			}
+		} else if (cnt > 0) {
+			/* An open file; unwind the holds we've been adding. */
+			mutex_exit(&vp->v_lock);
 			cancel = tm->tm_rootnode->tn_forw;
 			while (cancel != tnp) {
 				vp = TNTOV(cancel);
@@ -452,14 +580,50 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
 			}
 			mutex_exit(&tm->tm_contents);
 			return (EBUSY);
+		} else {
+			/* directly add a VN_HOLD since we have the lock */
+			vp->v_count++;
+			mutex_exit(&vp->v_lock);
 		}
-		VN_HOLD(vp);
 	}
 
-	/*
-	 * We can drop the mutex now because no one can find this mount
-	 */
-	mutex_exit(&tm->tm_contents);
+	if (flag & MS_FORCE) {
+		/*
+		 * Drop the zone ref now since we don't know how long it will
+		 * be until the final vfs_rele is called by tmp_inactive.
+		 */
+		if (vfsp->vfs_zone) {
+			zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref,
+			    ZONE_REF_VFS);
+			vfsp->vfs_zone = 0;
+		}
+		/* We can now drop the extra hold we added above. */
+		VFS_RELE(vfsp);
+	} else {
+		/*
+		 * For the non-forced case, we can drop the mutex now because
+		 * no one can find this mount anymore
+		 */
+		vfsp->vfs_flag |= VFS_UNMOUNTED;
+		mutex_exit(&tm->tm_contents);
+	}
+
+	return (0);
+}
+
+/*
+ * Implementation of VFS_FREEVFS() to support forced umounts. This is called
+ * by the vfs framework after umount and the last VFS_RELE, to trigger the
+ * release of any resources still associated with the given vfs_t. We only
+ * add additional VFS_HOLDs during the forced umount case, so this is
+ * normally called immediately after tmp_unmount.
+ */
+void
+tmp_freevfs(vfs_t *vfsp)
+{
+	struct tmount *tm = (struct tmount *)VFSTOTM(vfsp);
+	struct tmpnode *tnp;
+	struct vnode *vp;
 
 	/*
 	 * Free all kmemalloc'd and anonalloc'd memory associated with
 	 * this filesystem.  To do this, we go through the file list twice,
 	 * once to remove all the directory entries, and then to remove
 	 * all the files.  We do this because there is useful code in
@@ -469,6 +633,16 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
 	 * tmpnode_free which assumes that the directory entry has been
 	 * removed before the file.
 	 */
+
+	/*
+	 * Now that we are tearing ourselves down we need to remove the
+	 * UNMOUNTED flag. If we don't, we'll later hit a VN_RELE when we
+	 * remove files from the system, causing us to have a negative value.
+	 * Doing this seems a bit better than trying to set a flag on the
+	 * tmount that says we're tearing down.
+	 */
+	vfsp->vfs_flag &= ~VFS_UNMOUNTED;
+
 	/*
 	 * Remove all directory entries
 	 */
@@ -535,15 +709,16 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr)
 
 	ASSERT(tm->tm_mntpath);
 
-	tmp_memfree(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1);
+	kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1);
 
 	ASSERT(tm->tm_anonmem == 0);
 
 	mutex_destroy(&tm->tm_contents);
 	mutex_destroy(&tm->tm_renamelck);
-	tmp_memfree(tm, sizeof (struct tmount));
+	kmem_free(tm, sizeof (struct tmount));
 
-	return (0);
+	/* Allow _fini() to succeed now */
+	atomic_dec_32(&tmpfs_mountcount);
 }
 
 /*
@@ -605,18 +780,19 @@ tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp)
 	 * If tm_anonmax for this mount is less than the available swap space
	 * (minus the amount tmpfs can't use), use that instead
	 */
-	if (blocks > tmpfs_minfree)
+	if (blocks > tmpfs_minfree && tm->tm_anonmax > tm->tm_anonmem) {
 		sbp->f_bfree = MIN(blocks - tmpfs_minfree,
-		    tm->tm_anonmax - tm->tm_anonmem);
-	else
+		    btop(tm->tm_anonmax) - btopr(tm->tm_anonmem));
+	} else {
 		sbp->f_bfree = 0;
+	}
 
 	sbp->f_bavail = sbp->f_bfree;
 
 	/*
	 * Total number of blocks is what's available plus what's been used
	 */
-	sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree + tm->tm_anonmem);
+	sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree + btopr(tm->tm_anonmem));
 
 	if (eff_zid != GLOBAL_ZONEUNIQID &&
 	    zp->zone_max_swap_ctl != UINT64_MAX) {
@@ -646,13 +822,7 @@ tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp)
	 * available to tmpfs.  This is fairly inaccurate since it doesn't
	 * take into account the names stored in the directory entries.
	 */
-	if (tmpfs_maxkmem > tmp_kmemspace)
-		sbp->f_ffree = (tmpfs_maxkmem - tmp_kmemspace) /
-		    (sizeof (struct tmpnode) + sizeof (struct tdirent));
-	else
-		sbp->f_ffree = 0;
-
-	sbp->f_files = tmpfs_maxkmem /
+	sbp->f_ffree = sbp->f_files = ptob(availrmem) /
 	    (sizeof (struct tmpnode) + sizeof (struct tdirent));
 	sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree);
 	(void) cmpldev(&d32, vfsp->vfs_dev);
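The forced-unmount scheme in tmp_unmount() above converts each outstanding vnode hold into a VFS_HOLD so that teardown (tmp_freevfs) can only run once tmp_inactive has released the last straggler. A toy, single-threaded model of that counting invariant; vfs_hold()/vfs_rele() here are stand-ins for the real VFS_HOLD/VFS_RELE macros, and the numbers are invented:

	#include <stdio.h>

	/* one counter stands in for the vfs_t reference count */
	static int vfs_refs = 1;	/* the mount's own reference */
	static int freed;

	static void
	vfs_hold(void)
	{
		vfs_refs++;
	}

	static void
	vfs_rele(void)
	{
		if (--vfs_refs == 0) {
			freed = 1;	/* tmp_freevfs would run here */
			printf("freevfs: tearing down mount state\n");
		}
	}

	int
	main(void)
	{
		int open_vnodes = 3;

		/* forced umount: one VFS_HOLD per outstanding vnode hold */
		for (int i = 0; i < open_vnodes; i++)
			vfs_hold();
		vfs_rele();	/* the unmount drops the mount's own ref */

		/* later, each tmp_inactive releases one vfs reference */
		for (int i = 0; i < open_vnodes; i++)
			vfs_rele();

		printf("freed = %d\n", freed);	/* 1: ran after last rele */
		return (0);
	}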
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c
index a8eadfa6db..747d280915 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c
@@ -25,7 +25,7 @@
 */
 
 /*
- * Copyright (c) 2015, Joyent, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
 * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
 * Copyright 2016 RackTop Systems.
 */
@@ -585,6 +585,10 @@ tmp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred,
 	struct tmount *tm = (struct tmount *)VTOTM(vp);
 	int error;
 
+	/* If the filesystem was umounted by force, return immediately. */
+	if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
+		return (EIO);
+
 	/*
	 * We don't currently support reading non-regular files
	 */
@@ -614,6 +618,10 @@ tmp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
 	struct tmount *tm = (struct tmount *)VTOTM(vp);
 	int error;
 
+	/* If the filesystem was umounted by force, return immediately. */
+	if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
+		return (EIO);
+
 	/*
	 * We don't currently support writing to non-regular files
	 */
@@ -787,8 +795,13 @@ tmp_setattr(
 	rw_exit(&tp->tn_contents);
 	rw_exit(&tp->tn_rwlock);
 
-	if (error == 0 && vap->va_size == 0)
-		vnevent_truncate(vp, ct);
+	if (error == 0) {
+		if (vap->va_size == 0) {
+			vnevent_truncate(vp, ct);
+		} else {
+			vnevent_resize(vp, ct);
+		}
+	}
 
 	goto out1;
 }
@@ -834,6 +847,9 @@ tmp_lookup(
 	struct tmpnode *ntp = NULL;
 	int error;
 
+	/* If the filesystem was umounted by force, return immediately. */
+	if (dvp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
+		return (EIO);
 
 	/* allow cd into @ dir */
 	if (flags & LOOKUP_XATTR) {
@@ -852,6 +868,8 @@ tmp_lookup(
 		rw_enter(&tp->tn_rwlock, RW_WRITER);
 
 		if (tp->tn_xattrdp == NULL) {
+			int err;
+
 			if (!(flags & CREATE_XATTR_DIR)) {
 				rw_exit(&tp->tn_rwlock);
 				return (ENOENT);
@@ -872,9 +890,13 @@ tmp_lookup(
 				return (error);
 			}
 
-			xdp = tmp_memalloc(sizeof (struct tmpnode),
-			    TMP_MUSTHAVE);
 			tm = VTOTM(dvp);
+			xdp = tmp_kmem_zalloc(tm, sizeof (struct tmpnode),
+			    KM_SLEEP);
+			if (xdp == NULL) {
+				rw_exit(&tp->tn_rwlock);
+				return (ENOSPC);
+			}
 			tmpnode_init(tm, xdp, &tp->tn_attr, NULL);
 			/*
			 * Fix-up fields unique to attribute directories.
@@ -892,7 +914,16 @@ tmp_lookup(
 			}
 			xdp->tn_vnode->v_type = VDIR;
 			xdp->tn_vnode->v_flag |= V_XATTRDIR;
-			tdirinit(tp, xdp);
+			if ((err = tdirinit(tp, xdp)) != 0) {
+				rw_exit(&tp->tn_rwlock);
+				/*
+				 * This never got properly initialized so we
+				 * can just clean it up.
+				 */
+				xdp->tn_vnode->v_flag &= ~V_XATTRDIR;
+				tmpnode_cleanup(xdp);
+				return (err);
+			}
 			tp->tn_xattrdp = xdp;
 		} else {
 			VN_HOLD(tp->tn_xattrdp->tn_vnode);
@@ -1301,10 +1332,8 @@ tmp_rename(
 		vnevent_rename_src(TNTOV(fromtp), odvp, onm, ct);
 		/*
		 * vnevent_rename_dest is called in tdirenter().
-		 * Notify the target dir if not same as source dir.
		 */
-		if (ndvp != odvp)
-			vnevent_rename_dest_dir(ndvp, ct);
+		vnevent_rename_dest_dir(ndvp, TNTOV(fromtp), nnm, ct);
 	}
 
 done:
@@ -1473,6 +1502,10 @@ tmp_readdir(
 	int reclen;
 	caddr_t outbuf;
 
+	/* If the filesystem was umounted by force, return immediately. */
+	if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
+		return (EIO);
+
 	if (uiop->uio_loffset >= MAXOFF_T) {
 		if (eofp)
 			*eofp = 1;
@@ -1606,12 +1639,12 @@ tmp_symlink(
 	rw_exit(&parent->tn_rwlock);
 
 	if (error) {
-		if (self)
+		if (self != NULL)
 			tmpnode_rele(self);
 		return (error);
 	}
 	len = strlen(tnm) + 1;
-	cp = tmp_memalloc(len, 0);
+	cp = tmp_kmem_zalloc(tm, len, KM_NOSLEEP | KM_NORMALPRI);
 	if (cp == NULL) {
 		tmpnode_rele(self);
 		return (ENOSPC);
@@ -1676,10 +1709,27 @@ top:
	 * there's little to do -- just drop our hold.
	 */
 	if (vp->v_count > 1 || tp->tn_nlink != 0) {
-		vp->v_count--;
+		if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) {
+			/*
+			 * Since the file system was forcibly unmounted, we can
+			 * have a case (v_count == 1, tn_nlink != 0) where this
+			 * file was open so we didn't add an extra hold on the
+			 * file in tmp_unmount. We are counting on the
+			 * interaction of the hold made in tmp_unmount and
+			 * rele'd in tmp_freevfs so we need to be sure we
+			 * don't decrement in this case.
+			 */
+			if (vp->v_count > 1)
+				vp->v_count--;
+		} else {
+			vp->v_count--;
+		}
 		mutex_exit(&vp->v_lock);
 		mutex_exit(&tp->tn_tlock);
 		rw_exit(&tp->tn_rwlock);
+		/* If the filesystem was umounted by force, rele the vfs ref */
+		if (tm->tm_vfsp->vfs_flag & VFS_UNMOUNTED)
+			VFS_RELE(tm->tm_vfsp);
 		return;
 	}
@@ -1704,7 +1754,7 @@ top:
 			goto top;
 		}
 		if (tp->tn_type == VLNK)
-			tmp_memfree(tp->tn_symlink, tp->tn_size + 1);
+			tmp_kmem_free(tm, tp->tn_symlink, tp->tn_size + 1);
 	}
 
 	/*
@@ -1738,7 +1788,11 @@ top:
 	rw_destroy(&tp->tn_rwlock);
 	mutex_destroy(&tp->tn_tlock);
 	vn_free(TNTOV(tp));
-	tmp_memfree(tp, sizeof (struct tmpnode));
+	tmp_kmem_free(tm, tp, sizeof (struct tmpnode));
+
+	/* If the filesystem was umounted by force, rele the vfs ref */
+	if (tm->tm_vfsp->vfs_flag & VFS_UNMOUNTED)
+		VFS_RELE(tm->tm_vfsp);
 }
 
 /* ARGSUSED2 */
@@ -1860,6 +1914,10 @@ tmp_getapage(
 	struct vnode *pvp;
 	u_offset_t poff;
 
+	/* If the filesystem was umounted by force, return immediately. */
+	if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
+		return (EIO);
+
 	if (protp != NULL)
 		*protp = PROT_ALL;
again:
@@ -2081,6 +2139,10 @@ tmp_putapage(
 	u_offset_t offset;
 	u_offset_t tmpoff;
 
+	/* If the filesystem was umounted by force, return immediately. */
+	if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
+		return (EIO);
+
 	ASSERT(PAGE_LOCKED(pp));
 
 	/* Kluster in tmp_klustsize chunks */
@@ -2341,8 +2403,13 @@ tmp_space(
 			return (EFBIG);
 		error = tmp_freesp(vp, bfp, flag);
 
-		if (error == 0 && bfp->l_start == 0)
-			vnevent_truncate(vp, ct);
+		if (error == 0) {
+			if (bfp->l_start == 0) {
+				vnevent_truncate(vp, ct);
+			} else {
+				vnevent_resize(vp, ct);
+			}
+		}
 	}
 	return (error);
 }
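To close, a note on the new "mode" option that tmp_convmode() (in the tmp_subr.c hunks above) parses: it accepts an octal string whose bits must all fall within VALIDMODEBITS (07777). A userland approximation using strtoul() in place of ddi_strtoul(); the strict end-of-string check is this sketch's choice, not necessarily the kernel's:

	#include <stdio.h>
	#include <stdlib.h>
	#include <errno.h>
	#include <sys/types.h>

	#define	VALIDMODEBITS	07777	/* rwxrwxrwx plus setuid/setgid/sticky */

	static int
	conv_mode(const char *str, mode_t *mode)
	{
		char *end;
		unsigned long num;

		if (str == NULL)
			return (EINVAL);
		errno = 0;
		num = strtoul(str, &end, 8);
		if (errno != 0 || end == str || *end != '\0')
			return (EINVAL);
		if ((num & ~VALIDMODEBITS) != 0)
			return (EINVAL);
		*mode = (mode_t)(num & VALIDMODEBITS);
		return (0);
	}

	int
	main(void)
	{
		mode_t m;

		printf("%d\n", conv_mode("1777", &m));	/* 0: sticky, rwxrwxrwx */
		printf("%d\n", conv_mode("9999", &m));	/* EINVAL: not octal */
		printf("%d\n", conv_mode("17777", &m));	/* EINVAL: extra bits */
		return (0);
	}

With this option in place, a mount such as "mount -F tmpfs -o mode=1777 swap /a/tmp" (illumos syntax) should override the permissions on the root of the mount.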
