diff options
Diffstat (limited to 'usr/src/uts/common/fs/vnode.c')
-rw-r--r-- | usr/src/uts/common/fs/vnode.c | 460 |
1 files changed, 345 insertions, 115 deletions
diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c index e6b6adf56b..5a54ef8eee 100644 --- a/usr/src/uts/common/fs/vnode.c +++ b/usr/src/uts/common/fs/vnode.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2016, Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -66,6 +66,8 @@ #include <fs/fs_subr.h> #include <sys/taskq.h> #include <fs/fs_reparse.h> +#include <sys/time.h> +#include <sys/sdt.h> /* Determine if this vnode is a file that is read-only */ #define ISROFILE(vp) \ @@ -102,6 +104,9 @@ kmutex_t vskstat_tree_lock; /* Global variable which enables/disables the vopstats collection */ int vopstats_enabled = 1; +/* Global used for empty/invalid v_path */ +char *vn_vpath_empty = ""; + /* * forward declarations for internal vnode specific data (vsd) */ @@ -200,6 +205,11 @@ static void (**vsd_destructor)(void *); cr = crgetmapped(cr); \ } +#define VOP_LATENCY_10MS 10000000 +#define VOP_LATENCY_100MS 100000000 +#define VOP_LATENCY_1S 1000000000 +#define VOP_LATENCY_10S 10000000000 + /* * Convert stat(2) formats to vnode types and vice versa. (Knows about * numerical order of S_IFMT and vnode types.) 
@@ -1643,7 +1653,7 @@ vn_rename(char *from, char *to, enum uio_seg seg) int vn_renameat(vnode_t *fdvp, char *fname, vnode_t *tdvp, - char *tname, enum uio_seg seg) + char *tname, enum uio_seg seg) { int error; struct vattr vattr; @@ -2284,7 +2294,8 @@ vn_cache_constructor(void *buf, void *cdrarg, int kmflags) cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL); rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL); vp->v_femhead = NULL; /* Must be done before vn_reinit() */ - vp->v_path = NULL; + vp->v_path = vn_vpath_empty; + vp->v_path_stamp = 0; vp->v_mpssdata = NULL; vp->v_vsd = NULL; vp->v_fopdata = NULL; @@ -2331,6 +2342,7 @@ void vn_recycle(vnode_t *vp) { ASSERT(vp->v_pages == NULL); + VERIFY(vp->v_path != NULL); /* * XXX - This really belongs in vn_reinit(), but we have some issues @@ -2353,10 +2365,11 @@ vn_recycle(vnode_t *vp) kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead))); vp->v_femhead = NULL; } - if (vp->v_path) { + if (vp->v_path != vn_vpath_empty) { kmem_free(vp->v_path, strlen(vp->v_path) + 1); - vp->v_path = NULL; + vp->v_path = vn_vpath_empty; } + vp->v_path_stamp = 0; if (vp->v_fopdata != NULL) { free_fopdata(vp); @@ -2427,9 +2440,10 @@ vn_free(vnode_t *vp) */ ASSERT((vp->v_count == 0) || (vp->v_count == 1)); ASSERT(vp->v_count_dnlc == 0); - if (vp->v_path != NULL) { + VERIFY(vp->v_path != NULL); + if (vp->v_path != vn_vpath_empty) { kmem_free(vp->v_path, strlen(vp->v_path) + 1); - vp->v_path = NULL; + vp->v_path = vn_vpath_empty; } /* If FEM was in use, make sure everything gets cleaned up */ @@ -2516,6 +2530,7 @@ vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct) if (vp == NULL || vp->v_femhead == NULL) { return; } + (void) VOP_VNEVENT(dvp, VE_RENAME_SRC_DIR, vp, name, ct); (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct); } @@ -2530,12 +2545,13 @@ vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name, } void -vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct) +vnevent_rename_dest_dir(vnode_t *vp, vnode_t 
*nvp, char *name, + caller_context_t *ct) { if (vp == NULL || vp->v_femhead == NULL) { return; } - (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct); + (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, nvp, name, ct); } void @@ -2622,6 +2638,15 @@ vnevent_truncate(vnode_t *vp, caller_context_t *ct) (void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct); } +void +vnevent_resize(vnode_t *vp, caller_context_t *ct) +{ + if (vp == NULL || vp->v_femhead == NULL) { + return; + } + (void) VOP_VNEVENT(vp, VE_RESIZE, NULL, NULL, ct); +} + /* * Vnode accessors. */ @@ -2952,107 +2977,229 @@ fs_new_caller_id() } /* - * Given a starting vnode and a path, updates the path in the target vnode in - * a safe manner. If the vnode already has path information embedded, then the - * cached path is left untouched. + * The value stored in v_path is relative to rootdir, located in the global + * zone. Zones or chroot environments which reside deeper inside the VFS + * hierarchy will have a relative view of MAXPATHLEN since they are unaware of + * what lies below their perceived root. In order to keep v_path usable for + * these child environments, its allocations are allowed to exceed MAXPATHLEN. + * + * An upper bound of max_vnode_path is placed upon v_path allocations to + * prevent the system from going too wild at the behest of pathological + * behavior from the operator. */ - size_t max_vnode_path = 4 * MAXPATHLEN; + void -vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp, - const char *path, size_t plen) -{ - char *rpath; - vnode_t *base; - size_t rpathlen, rpathalloc; - int doslash = 1; - - if (*path == '/') { - base = rootvp; - path++; - plen--; - } else { - base = startvp; - } +vn_clearpath(vnode_t *vp, hrtime_t compare_stamp) +{ + char *buf; + mutex_enter(&vp->v_lock); /* - * We cannot grab base->v_lock while we hold vp->v_lock because of - * the potential for deadlock. 
+ * If the snapshot of v_path_stamp passed in via compare_stamp does not + * match the present value on the vnode, it indicates that subsequent + * changes have occurred. The v_path value is not cleared in this case + * since the new value may be valid. */ - mutex_enter(&base->v_lock); - if (base->v_path == NULL) { - mutex_exit(&base->v_lock); + if (compare_stamp != 0 && vp->v_path_stamp != compare_stamp) { + mutex_exit(&vp->v_lock); return; } + buf = vp->v_path; + vp->v_path = vn_vpath_empty; + vp->v_path_stamp = 0; + mutex_exit(&vp->v_lock); + if (buf != vn_vpath_empty) { + kmem_free(buf, strlen(buf) + 1); + } +} - rpathlen = strlen(base->v_path); - rpathalloc = rpathlen + plen + 1; - /* Avoid adding a slash if there's already one there */ - if (base->v_path[rpathlen-1] == '/') - doslash = 0; - else - rpathalloc++; - - /* - * We don't want to call kmem_alloc(KM_SLEEP) with kernel locks held, - * so we must do this dance. If, by chance, something changes the path, - * just give up since there is no real harm. - */ - mutex_exit(&base->v_lock); +static void +vn_setpath_common(vnode_t *pvp, vnode_t *vp, const char *name, size_t len, + boolean_t is_rename) +{ + char *buf, *oldbuf; + hrtime_t pstamp; + size_t baselen, buflen = 0; + + /* Handle the vn_setpath_str case. 
*/ + if (pvp == NULL) { + if (len + 1 > max_vnode_path) { + DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp, + vnode_t *, vp, char *, name, size_t, len + 1); + return; + } + buf = kmem_alloc(len + 1, KM_SLEEP); + bcopy(name, buf, len); + buf[len] = '\0'; - /* Paths should stay within reason */ - if (rpathalloc > max_vnode_path) + mutex_enter(&vp->v_lock); + oldbuf = vp->v_path; + vp->v_path = buf; + vp->v_path_stamp = gethrtime(); + mutex_exit(&vp->v_lock); + if (oldbuf != vn_vpath_empty) { + kmem_free(oldbuf, strlen(oldbuf) + 1); + } return; + } + + /* Take snapshot of parent dir */ + mutex_enter(&pvp->v_lock); +retrybuf: + if (pvp->v_path == vn_vpath_empty) { + /* + * Without v_path from the parent directory, generating a child + * path from the name is impossible. + */ + if (len > 0) { + pstamp = pvp->v_path_stamp; + mutex_exit(&pvp->v_lock); + vn_clearpath(vp, pstamp); + return; + } - rpath = kmem_alloc(rpathalloc, KM_SLEEP); + /* + * The only feasible case here is where a NUL lookup is being + * performed on rootdir prior to its v_path being populated. 
+ */ + ASSERT(pvp->v_path_stamp == 0); + baselen = 0; + pstamp = 0; + } else { + pstamp = pvp->v_path_stamp; + baselen = strlen(pvp->v_path); + /* ignore a trailing slash if present */ + if (pvp->v_path[baselen - 1] == '/') { + /* This should only be the case for rootdir */ + ASSERT(baselen == 1 && pvp == rootdir); + baselen--; + } + } + mutex_exit(&pvp->v_lock); - mutex_enter(&base->v_lock); - if (base->v_path == NULL || strlen(base->v_path) != rpathlen) { - mutex_exit(&base->v_lock); - kmem_free(rpath, rpathalloc); + if (buflen != 0) { + /* Free the existing (mis-sized) buffer in case of retry */ + kmem_free(buf, buflen); + } + /* base, '/', name and trailing NUL */ + buflen = baselen + len + 2; + if (buflen > max_vnode_path) { + DTRACE_PROBE4(vn__setpath_too__long, vnode_t *, pvp, + vnode_t *, vp, char *, name, size_t, buflen); return; } - bcopy(base->v_path, rpath, rpathlen); - mutex_exit(&base->v_lock); + buf = kmem_alloc(buflen, KM_SLEEP); - if (doslash) - rpath[rpathlen++] = '/'; - bcopy(path, rpath + rpathlen, plen); - rpath[rpathlen + plen] = '\0'; + mutex_enter(&pvp->v_lock); + if (pvp->v_path_stamp != pstamp) { + size_t vlen; + + /* + * Since v_path_stamp changed on the parent, it is likely that + * v_path has been altered as well. If the length does not + * exactly match what was previously measured, the buffer + * allocation must be repeated for proper sizing. 
+ */ + if (pvp->v_path == vn_vpath_empty) { + /* Give up if parent lacks v_path */ + mutex_exit(&pvp->v_lock); + kmem_free(buf, buflen); + return; + } + vlen = strlen(pvp->v_path); + if (pvp->v_path[vlen - 1] == '/') { + vlen--; + } + if (vlen != baselen) { + goto retrybuf; + } + } + bcopy(pvp->v_path, buf, baselen); + mutex_exit(&pvp->v_lock); + + buf[baselen] = '/'; + baselen++; + bcopy(name, &buf[baselen], len + 1); mutex_enter(&vp->v_lock); - if (vp->v_path != NULL) { + if (vp->v_path_stamp == 0) { + /* never-visited vnode can inherit stamp from parent */ + ASSERT(vp->v_path == vn_vpath_empty); + vp->v_path_stamp = pstamp; + vp->v_path = buf; mutex_exit(&vp->v_lock); - kmem_free(rpath, rpathalloc); + } else if (vp->v_path_stamp < pstamp || is_rename) { + /* + * Install the updated path and stamp, ensuring that the v_path + * pointer is valid at all times for dtrace. + */ + oldbuf = vp->v_path; + vp->v_path = buf; + vp->v_path_stamp = gethrtime(); + mutex_exit(&vp->v_lock); + kmem_free(oldbuf, strlen(oldbuf) + 1); } else { - vp->v_path = rpath; + /* + * If the timestamp matches or is greater, it means another + * thread performed the update first while locks were dropped + * here to make the allocation. We defer to the newer value. + */ mutex_exit(&vp->v_lock); + kmem_free(buf, buflen); } + ASSERT(MUTEX_NOT_HELD(&vp->v_lock)); } -/* - * Sets the path to the vnode to be the given string, regardless of current - * context. The string must be a complete path from rootdir. This is only used - * by fsop_root() for setting the path based on the mountpoint. - */ void -vn_setpath_str(struct vnode *vp, const char *str, size_t len) +vn_updatepath(vnode_t *pvp, vnode_t *vp, const char *name) { - char *buf = kmem_alloc(len + 1, KM_SLEEP); + size_t len; - mutex_enter(&vp->v_lock); - if (vp->v_path != NULL) { - mutex_exit(&vp->v_lock); - kmem_free(buf, len + 1); + /* + * If the parent is older or empty, there's nothing further to do. 
+ */ + if (pvp->v_path == vn_vpath_empty || + pvp->v_path_stamp <= vp->v_path_stamp) { + return; + } + + /* + * Given the lack of appropriate context, meaningful updates to v_path + * cannot be made during lookups for the '.' or '..' entries. + */ + len = strlen(name); + if (len == 0 || (len == 1 && name[0] == '.') || + (len == 2 && name[0] == '.' && name[1] == '.')) { return; } - vp->v_path = buf; - bcopy(str, vp->v_path, len); - vp->v_path[len] = '\0'; + vn_setpath_common(pvp, vp, name, len, B_FALSE); +} - mutex_exit(&vp->v_lock); +/* + * Given a parent vnode and a component name, updates the path in the target + * vnode in a safe manner. An existing cached path is only replaced when the + * parent's path stamp is newer than the target's. + */ +/* ARGSUSED */ +void +vn_setpath(vnode_t *rootvp, vnode_t *pvp, vnode_t *vp, const char *name, + size_t len) +{ + vn_setpath_common(pvp, vp, name, len, B_FALSE); +} + +/* + * Sets the path to the vnode to be the given string, regardless of current + * context. The string must be a complete path from rootdir. This is only used + * by fsop_root() for setting the path based on the mountpoint. + */ +void +vn_setpath_str(vnode_t *vp, const char *str, size_t len) +{ + vn_setpath_common(NULL, vp, str, len, B_FALSE); +} /* @@ -3060,17 +3207,9 @@ vn_setpath_str(struct vnode *vp, const char *str, size_t len) * target vnode is available. 
*/ void -vn_renamepath(vnode_t *dvp, vnode_t *vp, const char *nm, size_t len) +vn_renamepath(vnode_t *pvp, vnode_t *vp, const char *name, size_t len) { - char *tmp; - - mutex_enter(&vp->v_lock); - tmp = vp->v_path; - vp->v_path = NULL; - mutex_exit(&vp->v_lock); - vn_setpath(rootdir, dvp, vp, nm, len); - if (tmp != NULL) - kmem_free(tmp, strlen(tmp) + 1); + vn_setpath_common(pvp, vp, name, len, B_TRUE); } /* @@ -3081,37 +3220,42 @@ void vn_copypath(struct vnode *src, struct vnode *dst) { char *buf; - int alloc; + hrtime_t stamp; + size_t buflen; mutex_enter(&src->v_lock); - if (src->v_path == NULL) { + if (src->v_path == vn_vpath_empty) { mutex_exit(&src->v_lock); return; } - alloc = strlen(src->v_path) + 1; - - /* avoid kmem_alloc() with lock held */ + buflen = strlen(src->v_path) + 1; mutex_exit(&src->v_lock); - buf = kmem_alloc(alloc, KM_SLEEP); + + buf = kmem_alloc(buflen, KM_SLEEP); + mutex_enter(&src->v_lock); - if (src->v_path == NULL || strlen(src->v_path) + 1 != alloc) { + if (src->v_path == vn_vpath_empty || + strlen(src->v_path) + 1 != buflen) { mutex_exit(&src->v_lock); - kmem_free(buf, alloc); + kmem_free(buf, buflen); return; } - bcopy(src->v_path, buf, alloc); + bcopy(src->v_path, buf, buflen); + stamp = src->v_path_stamp; mutex_exit(&src->v_lock); mutex_enter(&dst->v_lock); - if (dst->v_path != NULL) { + if (dst->v_path != vn_vpath_empty) { mutex_exit(&dst->v_lock); - kmem_free(buf, alloc); + kmem_free(buf, buflen); return; } dst->v_path = buf; + dst->v_path_stamp = stamp; mutex_exit(&dst->v_lock); } + /* * XXX Private interface for segvn routines that handle vnode * large page segments. 
@@ -3261,14 +3405,58 @@ fop_read( cred_t *cr, caller_context_t *ct) { - int err; ssize_t resid_start = uiop->uio_resid; + zone_t *zonep = curzone; + zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats; + + hrtime_t start = 0, lat; + ssize_t len; + int err; + + if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) && + vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) { + start = gethrtime(); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_runq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + } VOPXID_MAP_CR(vp, cr); err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct); - VOPSTATS_UPDATE_IO(vp, read, - read_bytes, (resid_start - uiop->uio_resid)); + len = resid_start - uiop->uio_resid; + + VOPSTATS_UPDATE_IO(vp, read, read_bytes, len); + + if (start != 0) { + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.reads++; + zonep->zone_vfs_rwstats.nread += len; + kstat_runq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + if (lat < VOP_LATENCY_100MS) + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + } + return (err); } @@ -3280,14 +3468,63 @@ fop_write( cred_t *cr, caller_context_t *ct) { - int err; ssize_t resid_start = uiop->uio_resid; + zone_t *zonep = curzone; + zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats; + + hrtime_t start = 0, lat; + ssize_t len; + int err; + + /* + * For the purposes of VFS kstat consumers, the "waitq" 
calculation is + * repurposed as the active queue for VFS write operations. There's no + * actual wait queue for VFS operations. + */ + if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) && + vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) { + start = gethrtime(); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_waitq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + } VOPXID_MAP_CR(vp, cr); err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct); - VOPSTATS_UPDATE_IO(vp, write, - write_bytes, (resid_start - uiop->uio_resid)); + len = resid_start - uiop->uio_resid; + + VOPSTATS_UPDATE_IO(vp, write, write_bytes, len); + + if (start != 0) { + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.writes++; + zonep->zone_vfs_rwstats.nwritten += len; + kstat_waitq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + if (lat < VOP_LATENCY_100MS) + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + } + return (err); } @@ -3451,9 +3688,7 @@ fop_lookup( } if (ret == 0 && *vpp) { VOPSTATS_UPDATE(*vpp, lookup); - if ((*vpp)->v_path == NULL) { - vn_setpath(rootdir, dvp, *vpp, nm, strlen(nm)); - } + vn_updatepath(dvp, *vpp, nm); } return (ret); @@ -3493,9 +3728,7 @@ fop_create( (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp); if (ret == 0 && *vpp) { VOPSTATS_UPDATE(*vpp, create); - if ((*vpp)->v_path == NULL) { - 
vn_setpath(rootdir, dvp, *vpp, name, strlen(name)); - } + vn_updatepath(dvp, *vpp, name); } return (ret); @@ -3615,10 +3848,7 @@ fop_mkdir( (dvp, dirname, vap, vpp, cr, ct, flags, vsecp); if (ret == 0 && *vpp) { VOPSTATS_UPDATE(*vpp, mkdir); - if ((*vpp)->v_path == NULL) { - vn_setpath(rootdir, dvp, *vpp, dirname, - strlen(dirname)); - } + vn_updatepath(dvp, *vpp, dirname); } return (ret); |