diff options
author | Jason King <jasonbking@users.noreply.github.com> | 2020-06-03 15:03:49 -0500 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-06-03 15:03:49 -0500 |
commit | 71b43f2a12f58ef8bc5a1965a3b742749bb49231 (patch) | |
tree | 4369300a9f43681a8ea759142d919982a0db2a31 | |
parent | 1522aad90cd52cad5f5eb1fae5521a9630ae60f0 (diff) | |
download | illumos-joyent-release-20200604.tar.gz |
OS-8054 inotify watches lead to EBUSY during zfs mount (#305)release-20200604
Portions contributed by: Mike Gerdts <mike.gerdts@joyent.com>
Reviewed by: John Levon <john.levon@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Approved by: Dan McDonald <danmcd@joyent.com>
-rw-r--r-- | usr/src/pkg/manifests/system-test-zfstest.mf | 7 | ||||
-rw-r--r-- | usr/src/test/zfs-tests/cmd/watch_dir/Makefile | 19 | ||||
-rw-r--r-- | usr/src/test/zfs-tests/cmd/watch_dir/watch_dir.c | 151 | ||||
-rw-r--r-- | usr/src/test/zfs-tests/include/commands.cfg | 3 | ||||
-rw-r--r-- | usr/src/test/zfs-tests/runfiles/smartos.run | 3 | ||||
-rw-r--r-- | usr/src/test/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_watched_inotify.ksh | 74 | ||||
-rw-r--r-- | usr/src/test/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_watched_portfs.ksh | 74 | ||||
-rw-r--r-- | usr/src/uts/common/fs/portfs/port_fop.c | 101 | ||||
-rw-r--r-- | usr/src/uts/common/fs/vnode.c | 35 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/zfs_vfsops.c | 4 | ||||
-rw-r--r-- | usr/src/uts/common/io/inotify.c | 18 | ||||
-rw-r--r-- | usr/src/uts/common/sys/vnode.h | 46 |
12 files changed, 471 insertions, 64 deletions
diff --git a/usr/src/pkg/manifests/system-test-zfstest.mf b/usr/src/pkg/manifests/system-test-zfstest.mf index 0f05230a9b..9f78c29bb5 100644 --- a/usr/src/pkg/manifests/system-test-zfstest.mf +++ b/usr/src/pkg/manifests/system-test-zfstest.mf @@ -203,6 +203,7 @@ file path=opt/zfs-tests/bin/randwritecomp mode=0555 file path=opt/zfs-tests/bin/readmmap mode=0555 file path=opt/zfs-tests/bin/rename_dir mode=0555 file path=opt/zfs-tests/bin/rm_lnkcnt_zero_file mode=0555 +file path=opt/zfs-tests/bin/watch_dir mode=0555 file path=opt/zfs-tests/bin/zfstest mode=0555 file path=opt/zfs-tests/callbacks/zfs_dbgmsg mode=0555 file path=opt/zfs-tests/callbacks/zfs_mmp mode=0555 @@ -1027,6 +1028,12 @@ file \ file \ path=opt/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_encrypted \ mode=0555 +file \ + path=opt/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_watched_inotify \ + mode=0555 +file \ + path=opt/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_watched_portfs \ + mode=0555 file path=opt/zfs-tests/tests/functional/cli_root/zfs_program/cleanup \ mode=0555 file path=opt/zfs-tests/tests/functional/cli_root/zfs_program/setup mode=0555 diff --git a/usr/src/test/zfs-tests/cmd/watch_dir/Makefile b/usr/src/test/zfs-tests/cmd/watch_dir/Makefile new file mode 100644 index 0000000000..517dff64af --- /dev/null +++ b/usr/src/test/zfs-tests/cmd/watch_dir/Makefile @@ -0,0 +1,19 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2020 Joyent, Inc. +# + +PROG = watch_dir + +include $(SRC)/cmd/Makefile.cmd +include ../Makefile.subdirs diff --git a/usr/src/test/zfs-tests/cmd/watch_dir/watch_dir.c b/usr/src/test/zfs-tests/cmd/watch_dir/watch_dir.c new file mode 100644 index 0000000000..f6d8445162 --- /dev/null +++ b/usr/src/test/zfs-tests/cmd/watch_dir/watch_dir.c @@ -0,0 +1,151 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2020 Joyent, Inc. + */ + +/* + * This program watches a directory with portfs or inotify, exiting when the + * directory is removed. It is useful in tests that ensure that watching a + * directory does not prevent it from being used as a mount point. + */ +#include <limits.h> +#include <port.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/inotify.h> +#include <unistd.h> + +void +fail_usage(void) +{ + (void) fprintf(stderr, "Usage: watch <portfs|inotify> directory\n"); + exit(1); +} + +#define MAX_PES 8 + +void +watch_port(const char *path) +{ + int port; + struct file_obj fobj = {0}; + + if ((port = port_create()) < 0) { + perror("port_create"); + exit(1); + } + + fobj.fo_name = (char *)path; + for (;;) { + timespec_t ts = {300, 0}; + port_event_t pe; + + if (port_associate(port, PORT_SOURCE_FILE, (uintptr_t)&fobj, + 0, (char *)path) != 0) { + perror("port_associate"); + exit(1); + } + + if (port_get(port, &pe, &ts) != 0) { + perror("port_get"); + exit(1); + } + + if (pe.portev_events & FILE_DELETE) { + (void) printf("DELETE\t%s\n", path); + exit(0); + } + if (pe.portev_events & MOUNTEDOVER) { + (void) printf("MOUNTEDOVER\t%s\n", path); + } + } +} + +void +watch_inotify(const char *path) +{ + int in, wd; + struct inotify_event ev; + + if ((in = inotify_init()) < 0) { + perror("inotify_init"); + exit(1); + } + if ((wd = inotify_add_watch(in, path, IN_DELETE_SELF)) == -1) { + perror("inotify_add_watch"); + exit(1); + } + + for (;;) { + ssize_t cnt; + char evpath[PATH_MAX]; + + cnt = read(in, &ev, sizeof (ev)); + if (cnt != sizeof (ev)) { + (void) fprintf(stderr, + "read: expected %ld bytes got %ld\n", + sizeof (ev), cnt); + exit(1); + } + if (ev.len != 0) { + if (ev.len > sizeof (evpath)) { + (void) fprintf(stderr, "read: oversize " + "path (%u bytes)\n", ev.len); + exit(1); + } + cnt = read(in, evpath, ev.len); + if (cnt != ev.len) { + (void) fprintf(stderr, "read: expected %ld " + "bytes for path, got %ld\n", ev.len, cnt); + exit(1); + } + evpath[ev.len - 1] = '\0'; + } else { + evpath[0] = '\0'; + } + if (ev.mask & IN_DELETE_SELF) { + /* + * IN_DELETE_SELF events don't appear to include + * the path in the event. + */ + (void) printf("DELETE_SELF\n"); + exit(0); + } else { + (void) printf("EVENT_%08x\t%s\n", ev.mask, evpath); + } + + } +} + +int +main(int argc, char **argv) +{ + const char *watcher, *path; + + if (argc != 3) { + fail_usage(); + } + watcher = argv[1]; + path = argv[2]; + + if (strcmp(watcher, "portfs") == 0) { + watch_port(path); + } else if (strcmp(watcher, "inotify") == 0) { + watch_inotify(path); + } else { + fail_usage(); + } + + return (0); +} diff --git a/usr/src/test/zfs-tests/include/commands.cfg b/usr/src/test/zfs-tests/include/commands.cfg index f9b0bdf7ac..0ad892e2b2 100644 --- a/usr/src/test/zfs-tests/include/commands.cfg +++ b/usr/src/test/zfs-tests/include/commands.cfg @@ -196,4 +196,5 @@ export ZFSTEST_FILES='chg_usr_exec randwritecomp readmmap rename_dir - rm_lnkcnt_zero_file' + rm_lnkcnt_zero_file + watch_dir' diff --git a/usr/src/test/zfs-tests/runfiles/smartos.run b/usr/src/test/zfs-tests/runfiles/smartos.run index 47059b0b52..18e819a301 100644 --- a/usr/src/test/zfs-tests/runfiles/smartos.run +++ b/usr/src/test/zfs-tests/runfiles/smartos.run @@ -130,7 +130,8 @@ tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos', 'zfs_mount_007_pos', 'zfs_mount_008_pos', 'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_neg', 'zfs_mount_all_001_pos', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', - 'zfs_mount_encrypted'] + 'zfs_mount_encrypted', 'zfs_mount_watched_inotify', + 'zfs_mount_watched_portfs' ] [/opt/zfs-tests/tests/functional/cli_root/zfs_program] tests = ['zfs_program_json'] diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_watched_inotify.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_watched_inotify.ksh new file mode 100644 index 0000000000..0f181621e8 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_watched_inotify.ksh @@ -0,0 +1,74 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright 2020 Joyent, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# 'zfs mount' should not get EBUSY due to inotify(5) watching a directory +# +# STRATEGY: +# 1. Create a directory +# 2. Start watching the directory with inotify(5). +# 3. Create a filesystem +# 4. Mount the filesystem at the directory created in step 1 +# 5. Destroy the filesystem +# 6. Remove the directory +# 7. Verify the watcher saw the directory removal +# + +verify_runnable "both" + +function cleanup +{ + datasetexists $TESTPOOL/$TESTFS1 && \ + log_must zfs destroy -f $TESTPOOL/$TESTFS1 + log_must rm -rf "$TESTDIR/mntpt" "$TESTDIR/watch_dir.log" +} + +log_onexit cleanup + +log_assert "'zfs mount' should not get EBUSY due to inotify(5) watching a directory" + +# 1. Create a directory. +log_must mkdir -p "$TESTDIR/mntpt" + +# 2. Start watching the directory with inotify(5). +watch_dir inotify $TESTDIR/mntpt > $TESTDIR/watch_dir.log & + +# 3. Create a filesystem +log_must zfs create $TESTPOOL/$TESTFS1 + +# 4. Mount the file system at the directory created in step 1 +log_must zfs set mountpoint=$TESTDIR/mntpt $TESTPOOL/$TESTFS1 + +# 5. Destroy the filesystem +log_must zfs destroy $TESTPOOL/$TESTFS1 + +# 6. Remove the directory. The corresponding inotify event will cause the +# watcher to exit. +log_must rmdir $TESTDIR/mntpt + +# 7. Verify the watcher saw the directory removal. This ensures that the watcher +# was watching the directory we are interested in. +wait +log_must grep -q DELETE_SELF $TESTDIR/watch_dir.log + +log_pass "'zfs mount' should not get EBUSY due to inotify(5) watching a directory" diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_watched_portfs.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_watched_portfs.ksh new file mode 100644 index 0000000000..3135203973 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_watched_portfs.ksh @@ -0,0 +1,74 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright 2020 Joyent, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# 'zfs mount' should not get EBUSY due to portfs watching a directory +# +# STRATEGY: +# 1. Create a directory +# 2. Start watching the directory with port_associate +# 3. Create a filesystem +# 4. Mount the filesystem at the directory created in step 1 +# 5. Destroy the filesystem +# 6. Remove the directory +# 7. Verify the watcher saw the directory removal +# + +verify_runnable "both" + +function cleanup +{ + datasetexists $TESTPOOL/$TESTFS1 && \ + log_must zfs destroy -f $TESTPOOL/$TESTFS1 + log_must rm -rf "$TESTDIR/mntpt" "$TESTDIR/watch_dir.log" +} + +log_onexit cleanup + +log_assert "'zfs mount' should not get EBUSY due to portfs watching a directory" + +# 1. Create a directory. +log_must mkdir -p "$TESTDIR/mntpt" + +# 2. Start watching the directory with port_associate +watch_dir portfs $TESTDIR/mntpt > $TESTDIR/watch_dir.log & + +# 3. Create a filesystem +log_must zfs create $TESTPOOL/$TESTFS1 + +# 4. Mount the file system at the directory created in step 1 +log_must zfs set mountpoint=$TESTDIR/mntpt $TESTPOOL/$TESTFS1 + +# 5. Destroy the filesystem +log_must zfs destroy $TESTPOOL/$TESTFS1 + +# 6. Remove the directory. The corresponding portfs event will cause the +# watcher to exit. +log_must rmdir $TESTDIR/mntpt + +# 7. Verify the watcher saw the directory removal. This ensures that the watcher +# was watching the directory we are interested in. +wait +log_must grep -q DELETE.$TESTDIR/mntpt $TESTDIR/watch_dir.log + +log_pass "'zfs mount' should not get EBUSY due to portfs watching a directory" diff --git a/usr/src/uts/common/fs/portfs/port_fop.c b/usr/src/uts/common/fs/portfs/port_fop.c index 019de0540a..252ff2d9ac 100644 --- a/usr/src/uts/common/fs/portfs/port_fop.c +++ b/usr/src/uts/common/fs/portfs/port_fop.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2018, Joyent, Inc. + * Copyright 2020 Joyent, Inc. */ /* @@ -257,7 +257,7 @@ const fs_operation_def_t port_vnodesrc_template[] = { VOPNAME_READ, { .femop_read = port_fop_read }, VOPNAME_WRITE, { .femop_write = port_fop_write }, VOPNAME_MAP, { .femop_map = port_fop_map }, - VOPNAME_SETATTR, { .femop_setattr = port_fop_setattr }, + VOPNAME_SETATTR, { .femop_setattr = port_fop_setattr }, VOPNAME_CREATE, { .femop_create = port_fop_create }, VOPNAME_REMOVE, { .femop_remove = port_fop_remove }, VOPNAME_LINK, { .femop_link = port_fop_link }, @@ -266,7 +266,7 @@ const fs_operation_def_t port_vnodesrc_template[] = { VOPNAME_RMDIR, { .femop_rmdir = port_fop_rmdir }, VOPNAME_READDIR, { .femop_readdir = port_fop_readdir }, VOPNAME_SYMLINK, { .femop_symlink = port_fop_symlink }, - VOPNAME_SETSECATTR, { .femop_setsecattr = port_fop_setsecattr }, + VOPNAME_SETSECATTR, { .femop_setsecattr = port_fop_setsecattr }, VOPNAME_VNEVENT, { .femop_vnevent = port_fop_vnevent }, NULL, NULL }; @@ -275,7 +275,7 @@ const fs_operation_def_t port_vnodesrc_template[] = { * Fsem - vfs ops hooks */ const fs_operation_def_t port_vfssrc_template[] = { - VFSNAME_UNMOUNT, { .fsemop_unmount = port_fop_unmount }, + VFSNAME_UNMOUNT, { .fsemop_unmount = port_fop_unmount }, NULL, NULL }; @@ -539,14 +539,14 @@ port_fop_trimpfplist(vnode_t *vp) port_pcache_remove_fop(pfcp, pfp); mutex_exit(&pfcp->pfc_lock); if (tdvp != NULL) - VN_RELE(tdvp); + VN_PHANTOM_RELE(tdvp); } } } /* * This routine returns 1, if the vnode can be rele'ed by the caller. - * The caller has to VN_RELE the vnode with out holding any + * The caller has to VN_PHANTOM_RELE the vnode with out holding any * locks. */ int @@ -616,7 +616,7 @@ port_fop_femuninstall(vnode_t *vp) * able to remove it from the port's queue). * * vpp and dvpp will point to the vnode and directory vnode which the caller - * is required to VN_RELE without holding any locks. + * is required to VN_PHANTOM_RELE without holding any locks. */ int port_remove_fop(portfop_t *pfp, portfop_cache_t *pfcp, int cleanup, @@ -726,12 +726,12 @@ port_cache_lookup_fop(portfop_cache_t *pfcp, pid_t pid, uintptr_t obj) /* * Given the file name, get the vnode and also the directory vnode - * On return, the vnodes are held (VN_HOLD). The caller has to VN_RELE - * the vnode(s). + * On return, the vnodes are held with phantom holds (VN_PHANTOM_HOLD). The + * caller has to VN_PHANTOM_RELE the vnode(s). */ int port_fop_getdvp(void *objptr, vnode_t **vp, vnode_t **dvp, - char **cname, int *len, int follow) + char **cname, int *len, int follow) { int error = 0; struct pathname pn; @@ -777,6 +777,17 @@ port_fop_getdvp(void *objptr, vnode_t **vp, vnode_t **dvp, } } + /* Trade VN_HOLD()s from lookuppn with VN_PHANTOM_HOLD()s */ + if (dvp != NULL) { + VN_PHANTOM_HOLD(*dvp); + VN_RELE(*dvp); + } + + if (vp != NULL) { + VN_PHANTOM_HOLD(*vp); + VN_RELE(*vp); + } + pn_free(&pn); return (error); } @@ -815,7 +826,7 @@ port_getsrc(port_t *pp, int source) */ static void port_check_timestamp(portfop_cache_t *pfcp, vnode_t *vp, vnode_t *dvp, - portfop_t *pfp, void *objptr, uintptr_t object) + portfop_t *pfp, void *objptr, uintptr_t object) { vattr_t vatt; portfop_vp_t *pvp = vp->v_fopdata; @@ -1102,8 +1113,8 @@ port_install_fopdata(vnode_t *vp) */ int port_pfp_setup(portfop_t **pfpp, port_t *pp, vnode_t *vp, portfop_cache_t *pfcp, - uintptr_t object, int events, void *user, char *cname, int clen, - vnode_t *dvp) + uintptr_t object, int events, void *user, char *cname, int clen, + vnode_t *dvp) { portfop_t *pfp = NULL; port_kevent_t *pkevp; @@ -1176,7 +1187,7 @@ port_pfp_setup(portfop_t **pfpp, port_t *pp, vnode_t *vp, portfop_cache_t *pfcp, * Hold a reference to the vnode since * we successfully installed the hooks. */ - VN_HOLD(vp); + VN_PHANTOM_HOLD(vp); } else { (void) fem_uninstall(vp, femp, vp); pvp->pvp_femp = NULL; @@ -1209,7 +1220,7 @@ port_pfp_setup(portfop_t **pfpp, port_t *pp, vnode_t *vp, portfop_cache_t *pfcp, * Hold the directory vnode since we have a reference now. */ if (dvp != NULL) - VN_HOLD(dvp); + VN_PHANTOM_HOLD(dvp); *pfpp = pfp; return (0); } @@ -1224,9 +1235,9 @@ port_resolve_vp(vnode_t *vp) */ if (vfs_mntdummyvp && mntfstype != 0 && vp->v_vfsp->vfs_fstype == mntfstype) { - VN_RELE(vp); + VN_PHANTOM_RELE(vp); vp = vfs_mntdummyvp; - VN_HOLD(vfs_mntdummyvp); + VN_PHANTOM_HOLD(vfs_mntdummyvp); } /* @@ -1234,8 +1245,8 @@ port_resolve_vp(vnode_t *vp) * hardlinks. */ if ((VOP_REALVP(vp, &rvp, NULL) == 0) && vp != rvp) { - VN_HOLD(rvp); - VN_RELE(vp); + VN_PHANTOM_HOLD(rvp); + VN_PHANTOM_RELE(vp); vp = rvp; } return (vp); @@ -1247,10 +1258,10 @@ port_resolve_vp(vnode_t *vp) * The association is identified by the object pointer and the pid. * The events argument contains the events to be monitored for. * - * The vnode will have a VN_HOLD once the fem hooks are installed. + * The vnode will have a VN_PHANTOM_HOLD once the fem hooks are installed. * - * Every reference(pfp) to the directory vnode will have a VN_HOLD to ensure - * that the directory vnode pointer does not change. + * Every reference(pfp) to the directory vnode will have a VN_PHANTOM_HOLD to + * ensure that the directory vnode pointer does not change. */ int port_associate_fop(port_t *pp, int source, uintptr_t object, int events, @@ -1330,7 +1341,7 @@ port_associate_fop(port_t *pp, int source, uintptr_t object, int events, */ if (dvp != NULL && dvp->v_vfsp != vp->v_vfsp && !(orig->v_type == VPROC && vp != NULL && vp->v_type != VPROC)) { - VN_RELE(dvp); + VN_PHANTOM_RELE(dvp); dvp = NULL; } @@ -1350,8 +1361,8 @@ port_associate_fop(port_t *pp, int source, uintptr_t object, int events, pfp = port_cache_lookup_fop(pfcp, curproc->p_pid, object); /* - * If it is not the same vnode, just discard it. VN_RELE needs to be - * called with no locks held, therefore save vnode pointers and + * If it is not the same vnode, just discard it. VN_PHANTOM_RELE needs + * to be called with no locks held, therefore save vnode pointers and * vn_rele them later. */ if (pfp != NULL && (pfp->pfop_vp != vp || pfp->pfop_dvp != dvp)) { @@ -1404,7 +1415,7 @@ port_associate_fop(port_t *pp, int source, uintptr_t object, int events, * This vnode pointer is just used * for comparison, so rele it */ - VN_RELE(tvp); + VN_PHANTOM_RELE(tvp); } } @@ -1437,8 +1448,8 @@ port_associate_fop(port_t *pp, int source, uintptr_t object, int events, * active and it is not being removed from * the vnode list. This is checked in * port_remove_fop with the vnode lock held. - * The vnode returned is VN_RELE'ed after dropping - * the locks. + * The vnode returned is VN_PHANTOM_RELE'ed after + * dropping the locks. */ tdvp = tvp = NULL; if (port_remove_fop(pfp, pfcp, 0, NULL, &tvp, &tdvp)) { @@ -1451,9 +1462,9 @@ port_associate_fop(port_t *pp, int source, uintptr_t object, int events, } mutex_exit(&pfcp->pfc_lock); if (tvp != NULL) - VN_RELE(tvp); + VN_PHANTOM_RELE(tvp); if (tdvp != NULL) - VN_RELE(tdvp); + VN_PHANTOM_RELE(tdvp); goto errout; } } else { @@ -1519,14 +1530,14 @@ errout: * Release the hold acquired due to the lookup operation. */ if (vp != NULL) - VN_RELE(vp); + VN_PHANTOM_RELE(vp); if (dvp != NULL) - VN_RELE(dvp); + VN_PHANTOM_RELE(dvp); if (oldvp != NULL) - VN_RELE(oldvp); + VN_PHANTOM_RELE(oldvp); if (olddvp != NULL) - VN_RELE(olddvp); + VN_PHANTOM_RELE(olddvp); /* * copied file name not used, free it. @@ -1587,9 +1598,9 @@ port_dissociate_fop(port_t *pp, uintptr_t object) (void) port_remove_fop(pfp, pfcp, 1, &active, &tvp, &tdvp); mutex_exit(&pfcp->pfc_lock); if (tvp != NULL) - VN_RELE(tvp); + VN_PHANTOM_RELE(tvp); if (tdvp != NULL) - VN_RELE(tdvp); + VN_PHANTOM_RELE(tdvp); return (active ? 0 : ENOENT); } @@ -1610,7 +1621,7 @@ port_close_fop(void *arg, int port, pid_t pid, int lastclose) portfop_t *pfpnext; int index, i; port_source_t *pse; - vnode_t *tdvp = NULL; + vnode_t *tdvp = NULL; vnode_t *vpl[PORTFOP_NVP]; pse = port_getsrc(pp, PORT_SOURCE_FILE); @@ -1627,7 +1638,7 @@ port_close_fop(void *arg, int port, pid_t pid, int lastclose) * be possible as the port is being closed. * * The common case is that the port is not shared and all the entries - * are of this pid and have to be freed. Since VN_RELE has to be + * are of this pid and have to be freed. Since VN_PHANTOM_RELE has to be * called outside the lock, we do it in batches. */ hashtbl = (portfop_t **)pfcp->pfc_hash; @@ -1654,14 +1665,14 @@ port_close_fop(void *arg, int port, pid_t pid, int lastclose) if (pfp == NULL) index++; /* - * Now call VN_RELE if we have collected enough vnodes or - * we have reached the end of the hash table. + * Now call VN_PHANTOM_RELE if we have collected enough vnodes + * or we have reached the end of the hash table. */ if (i >= (PORTFOP_NVP - 1) || (i > 0 && index == PORTFOP_HASHSIZE)) { mutex_exit(&pfcp->pfc_lock); while (i > 0) { - VN_RELE(vpl[--i]); + VN_PHANTOM_RELE(vpl[--i]); vpl[i] = NULL; } mutex_enter(&pfcp->pfc_lock); @@ -1769,7 +1780,7 @@ port_fop_excep(list_t *tlist, int op) port_pcache_remove_fop(pfcp, pfp); mutex_exit(&pfcp->pfc_lock); if (tdvp != NULL) - VN_RELE(tdvp); + VN_PHANTOM_RELE(tdvp); } } @@ -1933,7 +1944,7 @@ port_fop_sendevent(vnode_t *vp, int events, vnode_t *dvp, char *cname) * that may be attempting to remove an object from the vnode's. */ if (port_fop_femuninstall(vp)) - VN_RELE(vp); + VN_PHANTOM_RELE(vp); /* * Send exception events and discard the watch entries. @@ -1980,7 +1991,7 @@ port_fop(vnode_t *vp, int op, int retval) event |= FILE_TRUNC; } if (event) { - port_fop_sendevent(vp, event, NULL, NULL); + port_fop_sendevent(vp, event, NULL, NULL); } } @@ -2068,7 +2079,7 @@ port_fop_unmount(fsemarg_t *vf, int flag, cred_t *cr) * unmount is in process. */ port_fop_sendevent(pvp->pvp_vp, UNMOUNTED, NULL, NULL); - VN_RELE(pvp->pvp_vp); + VN_PHANTOM_RELE(pvp->pvp_vp); } error = vfsnext_unmount(vf, flag, cr); diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c index 9e0b071999..c57884a940 100644 --- a/usr/src/uts/common/fs/vnode.c +++ b/usr/src/uts/common/fs/vnode.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2018, Joyent, Inc. + * Copyright 2020 Joyent, Inc. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. */ @@ -852,6 +852,37 @@ vn_rele(vnode_t *vp) mutex_exit(&vp->v_lock); } +void +vn_phantom_rele(vnode_t *vp) +{ + VERIFY(vp->v_count > 0); + + mutex_enter(&vp->v_lock); + VERIFY3U(vp->v_count, >=, vp->v_phantom_count); + vp->v_phantom_count--; + DTRACE_PROBE1(vn__phantom_rele, vnode_t *, vp); + if (vp->v_count == 1) { + ASSERT0(vp->v_phantom_count); + mutex_exit(&vp->v_lock); + VOP_INACTIVE(vp, CRED(), NULL); + return; + } + VN_RELE_LOCKED(vp); + mutex_exit(&vp->v_lock); +} + +/* + * Return the number of non-phantom holds. Things such as portfs will use + * phantom holds to prevent it from blocking filesystems from mounting over + * watched directories. + */ +uint_t +vn_count(vnode_t *vp) +{ + ASSERT(MUTEX_HELD(&vp->v_lock)); + return (vp->v_count - vp->v_phantom_count); +} + /* * Release a vnode referenced by the DNLC. Multiple DNLC references are treated * as a single reference, so v_count is not decremented until the last DNLC hold @@ -2428,6 +2459,7 @@ vn_reinit(vnode_t *vp) { vp->v_count = 1; vp->v_count_dnlc = 0; + vp->v_phantom_count = 0; vp->v_vfsp = NULL; vp->v_stream = NULL; vp->v_vfsmountedhere = NULL; @@ -2484,6 +2516,7 @@ vn_free(vnode_t *vp) */ ASSERT((vp->v_count == 0) || (vp->v_count == 1)); ASSERT(vp->v_count_dnlc == 0); + ASSERT0(vp->v_phantom_count); VERIFY(vp->v_path != NULL); if (vp->v_path != vn_vpath_empty) { kmem_free(vp->v_path, strlen(vp->v_path) + 1); diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c index d06cb875cf..e78907b5a6 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -24,7 +24,7 @@ * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. All rights reserved. - * Copyright 2019 Joyent, Inc. + * Copyright 2020 Joyent, Inc. * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org> * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. */ @@ -1893,7 +1893,7 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) mutex_enter(&mvp->v_lock); if ((uap->flags & MS_REMOUNT) == 0 && (uap->flags & MS_OVERLAY) == 0 && - (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { + (vn_count(mvp) != 1 || (mvp->v_flag & VROOT))) { mutex_exit(&mvp->v_lock); return (SET_ERROR(EBUSY)); } diff --git a/usr/src/uts/common/io/inotify.c b/usr/src/uts/common/io/inotify.c index b20c63481f..67bf55f213 100644 --- a/usr/src/uts/common/io/inotify.c +++ b/usr/src/uts/common/io/inotify.c @@ -10,7 +10,7 @@ */ /* - * Copyright 2019 Joyent, Inc. + * Copyright 2020 Joyent, Inc. * Copyright (c) 2015 The MathWorks, Inc. All rights reserved. */ @@ -728,7 +728,7 @@ inotify_watch_remove(inotify_state_t *state, inotify_watch_t *watch) crfree(child->inw_cred); } - VN_RELE(child->inw_vp); + VN_PHANTOM_RELE(child->inw_vp); /* * We're down (or should be down) to a single reference to @@ -738,7 +738,7 @@ inotify_watch_remove(inotify_state_t *state, inotify_watch_t *watch) } inotify_watch_event(watch, IN_IGNORED | IN_REMOVAL, NULL); - VN_RELE(watch->inw_vp); + VN_PHANTOM_RELE(watch->inw_vp); /* * It's now safe to zombify the watch -- we know that the only reference @@ -830,7 +830,7 @@ inotify_watch_delete(inotify_watch_t *watch, uint32_t event) err = inotify_fem_uninstall(watch->inw_vp, watch); VERIFY(err == 0); - VN_RELE(watch->inw_vp); + VN_PHANTOM_RELE(watch->inw_vp); /* * It's now safe to zombify the watch -- which won't actually delete @@ -865,7 +865,7 @@ inotify_watch_insert(inotify_watch_t *watch, vnode_t *vp, char *name) return; } - VN_HOLD(vp); + VN_PHANTOM_HOLD(vp); watch = inotify_watch_add(state, watch, name, vp, watch->inw_mask); VERIFY(watch != NULL); @@ -897,7 +897,7 @@ inotify_add_watch(inotify_state_t *state, vnode_t *vp, uint32_t mask, return (ENOSPC); } - VN_HOLD(vp); + VN_PHANTOM_HOLD(vp); watch = inotify_watch_add(state, NULL, NULL, vp, set); *wdp = watch->inw_wd; mutex_exit(&state->ins_lock); @@ -978,6 +978,10 @@ inotify_add_child(inotify_state_t *state, vnode_t *vp, char *name) return (0); } + /* Trade the plain hold from lookupnameat() for a phantom hold */ + VN_PHANTOM_HOLD(cvp); + VN_RELE(cvp); + watch = inotify_watch_add(state, watch, name, cvp, watch->inw_mask); VERIFY(watch != NULL); mutex_exit(&state->ins_lock); @@ -1065,7 +1069,7 @@ inotify_clean(void *arg) */ savecred = curthread->t_cred; curthread->t_cred = watch->inw_cred; - VN_RELE(watch->inw_vp); + VN_PHANTOM_RELE(watch->inw_vp); crfree(watch->inw_cred); curthread->t_cred = savecred; diff --git a/usr/src/uts/common/sys/vnode.h b/usr/src/uts/common/sys/vnode.h index 494264731b..df5da6c2e7 100644 --- a/usr/src/uts/common/sys/vnode.h +++ b/usr/src/uts/common/sys/vnode.h @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2018, Joyent, Inc. + * Copyright 2020 Joyent, Inc. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright 2017 RackTop Systems. */ @@ -197,6 +197,7 @@ struct vsd_node { * v_count * v_shrlocks * v_path + * v_phantom_count * v_vsd * v_xattrdir * @@ -214,6 +215,7 @@ struct vsd_node { * v_lock * v_flag * v_count + * v_phantom_count * v_data * v_vfsp * v_stream @@ -285,6 +287,8 @@ typedef struct vnode { kmutex_t v_lock; /* protects vnode fields */ uint_t v_flag; /* vnode flags (see below) */ uint_t v_count; /* reference count */ + /* non vn_count() ref count (see below) */ + uint_t v_phantom_count; void *v_data; /* private data for fs */ struct vfs *v_vfsp; /* ptr to containing VFS */ struct stdata *v_stream; /* associated stream */ @@ -811,9 +815,9 @@ typedef enum vnevent { VE_REMOVE = 3, /* Remove of vnode's name */ VE_RMDIR = 4, /* Remove of directory vnode's name */ VE_CREATE = 5, /* Create with vnode's name which exists */ - VE_LINK = 6, /* Link with vnode's name as source */ + VE_LINK = 6, /* Link with vnode's name as source */ VE_RENAME_DEST_DIR = 7, /* Rename with vnode as target dir */ - VE_MOUNTEDOVER = 8, /* File or Filesystem got mounted over vnode */ + VE_MOUNTEDOVER = 8, /* File or Filesystem got mounted over vnode */ VE_TRUNCATE = 9, /* Truncate */ VE_PRE_RENAME_SRC = 10, /* Pre-rename, with vnode as source */ VE_PRE_RENAME_DEST = 11, /* Pre-rename, with vnode as target/dest. */ @@ -1294,9 +1298,9 @@ void vn_recycle(vnode_t *); void vn_free(vnode_t *); int vn_is_readonly(vnode_t *); -int vn_is_opened(vnode_t *, v_mode_t); -int vn_is_mapped(vnode_t *, v_mode_t); -int vn_has_other_opens(vnode_t *, v_mode_t); +int vn_is_opened(vnode_t *, v_mode_t); +int vn_is_mapped(vnode_t *, v_mode_t); +int vn_has_other_opens(vnode_t *, v_mode_t); void vn_open_upgrade(vnode_t *, int); void vn_open_downgrade(vnode_t *, int); @@ -1335,10 +1339,12 @@ int vn_createat(char *pnamep, enum uio_seg seg, struct vattr *vap, int vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, ssize_t len, offset_t offset, enum uio_seg seg, int ioflag, rlim64_t ulimit, cred_t *cr, ssize_t *residp); +uint_t vn_count(struct vnode *vp); void vn_rele(struct vnode *vp); void vn_rele_async(struct vnode *vp, struct taskq *taskq); void vn_rele_dnlc(struct vnode *vp); void vn_rele_stream(struct vnode *vp); +void vn_phantom_rele(struct vnode *vp); int vn_link(char *from, char *to, enum uio_seg seg); int vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow, vnode_t *tstartvp, char *to, enum uio_seg seg); @@ -1443,6 +1449,16 @@ extern uint_t pvn_vmodsort_supported; * this->vp->v_path == NULL ? "NULL" : stringof(this->vp->v_path), * this->vp->v_count) * }' + * + * There are some situations where we don't want a hold to make the vnode + * 'busy'. For example, watching a directory via port events or inotify + * should not prevent a filesystem from mounting on a watched directory. + * For those instances, a phantom hold is used via VN_PHANTOM_HOLD(). + * + * A phantom hold works identically to regular hold, except that those holds + * are excluded from the return value of vn_count(). + * + * A phantom hold must be released by VN_PHANTOM_RELE(). */ #define VN_HOLD_LOCKED(vp) { \ ASSERT(mutex_owned(&(vp)->v_lock)); \ @@ -1471,6 +1487,22 @@ extern uint_t pvn_vmodsort_supported; DTRACE_PROBE1(vn__rele, vnode_t *, vp); \ } +#define VN_PHANTOM_HOLD_LOCKED(vp) { \ + VN_HOLD_LOCKED(vp); \ + (vp)->v_phantom_count++; \ + DTRACE_PROBE1(vn__phantom_hold, vnode_t *, vp); \ +} + +#define VN_PHANTOM_HOLD(vp) { \ + mutex_enter(&(vp)->v_lock); \ + VN_PHANTOM_HOLD_LOCKED(vp); \ + mutex_exit(&(vp)->v_lock); \ +} + +#define VN_PHANTOM_RELE(vp) { \ + vn_phantom_rele(vp); \ +} + #define VN_SET_VFS_TYPE_DEV(vp, vfsp, type, dev) { \ (vp)->v_vfsp = (vfsp); \ (vp)->v_type = (type); \ @@ -1481,7 +1513,7 @@ extern uint_t pvn_vmodsort_supported; * Compare two vnodes for equality. In general this macro should be used * in preference to calling VOP_CMP directly. */ -#define VN_CMP(VP1, VP2) ((VP1) == (VP2) ? 1 : \ +#define VN_CMP(VP1, VP2) ((VP1) == (VP2) ? 1 : \ ((VP1) && (VP2) && (vn_getops(VP1) == vn_getops(VP2)) ? \ VOP_CMP(VP1, VP2, NULL) : 0)) |