summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/fs
diff options
context:
space:
mode:
authorDan McDonald <danmcd@mnx.io>2022-07-06 16:58:29 -0400
committerDan McDonald <danmcd@mnx.io>2022-08-31 13:44:03 -0400
commitda29c6a3afe4aa1e02c76bdf0f78a1ee76ee9ac8 (patch)
treef1c17e94e7085184904a359fa16c88b1be5c2605 /usr/src/uts/common/fs
parent2570281cf351044b6936651ce26dbe1f801dcbd8 (diff)
downloadillumos-gate-da29c6a3afe4aa1e02c76bdf0f78a1ee76ee9ac8.tar.gz
14788 FDINFO misbehaves in multiple ways
Portions contributed by: Gordon Ross <gordon.w.ross@gmail.com> Reviewed by: Robert Mustacchi <rm@fingolfin.org> Reviewed by: Igor Kozhukhov <igor@dilos.org> Approved by: Patrick Mooney <pmooney@pfmooney.com>
Diffstat (limited to 'usr/src/uts/common/fs')
-rw-r--r--usr/src/uts/common/fs/proc/prdata.h1
-rw-r--r--usr/src/uts/common/fs/proc/prsubr.c105
-rw-r--r--usr/src/uts/common/fs/proc/prvnops.c52
3 files changed, 114 insertions, 44 deletions
diff --git a/usr/src/uts/common/fs/proc/prdata.h b/usr/src/uts/common/fs/proc/prdata.h
index 59315368d7..45219eccd5 100644
--- a/usr/src/uts/common/fs/proc/prdata.h
+++ b/usr/src/uts/common/fs/proc/prdata.h
@@ -352,7 +352,6 @@ extern int pr_set(proc_t *, long);
extern int pr_unset(proc_t *, long);
extern void pr_sethold(prnode_t *, sigset_t *);
extern file_t *pr_getf(proc_t *, uint_t, short *);
-extern void pr_releasef(proc_t *, uint_t);
extern void pr_setfault(proc_t *, fltset_t *);
extern int prusrio(proc_t *, enum uio_rw, struct uio *, int);
extern int prwritectl(vnode_t *, struct uio *, cred_t *);
diff --git a/usr/src/uts/common/fs/proc/prsubr.c b/usr/src/uts/common/fs/proc/prsubr.c
index 76a645f931..021d2b4b49 100644
--- a/usr/src/uts/common/fs/proc/prsubr.c
+++ b/usr/src/uts/common/fs/proc/prsubr.c
@@ -23,6 +23,7 @@
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2017, Joyent, Inc.
* Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
+ * Copyright 2022 MNX Cloud, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -1492,15 +1493,56 @@ pr_u64tos(uint64_t n, char *s)
return (len);
}
+/*
+ * Similar to getf() / getf_gen(), but for the specified process. On success,
+ * returns the fp with fp->f_count incremented. The caller MUST call
+ * closef(fp) on the returned fp after completing any actions using that fp.
+ * We return a reference-held (fp->f_count bumped) file_t so no other closef()
+ * can invoke destructive VOP_CLOSE actions while we're inspecting the
+ * process's FD.
+ *
+ * Returns NULL for errors: either an empty process-table slot post-fi_lock
+ * and UF_ENTER, or too many mutex_tryenter() failures on the file_t's f_tlock.
+ * Both failure modes have DTrace probes.
+ *
+ * The current design of the procfs "close" code path uses the following lock
+ * order of:
+ *
+ * 1: (file_t) f_tlock
+ * 2: (proc_t) p_lock AND setting p->p_proc_flag's P_PR_LOCK
+ *
+ * That happens because closef() holds f_tlock while calling fop_close(),
+ * which can be prclose(), which currently waits on and sets P_PR_LOCK at its
+ * beginning.
+ *
+ * That lock order creates a challenge for pr_getf, which needs to take those
+ * locks in the opposite order when the fd points to a procfs file descriptor.
+ * The solution chosen here is to use mutex_tryenter on f_tlock and retry some
+ * (limited) number of times, failing if we don't get both locks.
+ *
+ * The cases where this can fail are rare, and all involve a procfs caller
+ * asking for info (eg. FDINFO) on another procfs FD. In these cases,
+ * returning EBADF (which results from a NULL return from pr_getf()) is
+ * acceptable.
+ *
+ * One can increase the number of tries in pr_getf_maxtries if one is worried
+ * about the contentuous case.
+ */
+
+uint64_t pr_getf_tryfails; /* Bumped for statistic purposes. */
+int pr_getf_maxtries = 3; /* So you can tune it from /etc/system */
+
file_t *
pr_getf(proc_t *p, uint_t fd, short *flag)
{
uf_entry_t *ufp;
uf_info_t *fip;
file_t *fp;
+ int tries = 0;
ASSERT(MUTEX_HELD(&p->p_lock) && (p->p_proc_flag & P_PR_LOCK));
+retry:
fip = P_FINFO(p);
if (fd >= fip->fi_nfiles)
@@ -1510,37 +1552,54 @@ pr_getf(proc_t *p, uint_t fd, short *flag)
mutex_enter(&fip->fi_lock);
UF_ENTER(ufp, fip, fd);
if ((fp = ufp->uf_file) != NULL && fp->f_count > 0) {
- if (flag != NULL)
- *flag = ufp->uf_flag;
- ufp->uf_refcnt++;
+ if (mutex_tryenter(&fp->f_tlock)) {
+ ASSERT(fp->f_count > 0);
+ fp->f_count++;
+ mutex_exit(&fp->f_tlock);
+ if (flag != NULL)
+ *flag = ufp->uf_flag;
+ } else {
+ /*
+ * Note the number of mutex_trylock attempts.
+ *
+ * The exit path will catch this and try again if we
+ * are below the retry threshhold (pr_getf_maxtries).
+ */
+ tries++;
+ pr_getf_tryfails++;
+ /*
+ * If we hit pr_getf_maxtries, we'll return NULL.
+ * DTrace scripts looking for this sort of failure
+ * should check when arg1 is pr_getf_maxtries.
+ */
+ DTRACE_PROBE2(pr_getf_tryfail, file_t *, fp, int,
+ tries);
+ fp = NULL;
+ }
} else {
fp = NULL;
+ /* If we fail here, someone else closed this FD. */
+ DTRACE_PROBE1(pr_getf_emptyslot, int, tries);
+ tries = pr_getf_maxtries; /* Don't bother retrying. */
}
UF_EXIT(ufp);
mutex_exit(&fip->fi_lock);
mutex_enter(&p->p_lock);
- return (fp);
-}
-
-void
-pr_releasef(proc_t *p, uint_t fd)
-{
- uf_entry_t *ufp;
- uf_info_t *fip;
-
- ASSERT(MUTEX_HELD(&p->p_lock) && (p->p_proc_flag & P_PR_LOCK));
-
- fip = P_FINFO(p);
+ /* Use goto instead of tail-recursion so we can keep "tries" around. */
+ if (fp == NULL) {
+ /* "tries" starts at 1. */
+ if (tries < pr_getf_maxtries)
+ goto retry;
+ } else {
+ /*
+ * Probes here will detect successes after arg1's number of
+ * mutex_tryenter() calls.
+ */
+ DTRACE_PROBE2(pr_getf_trysuccess, file_t *, fp, int, tries + 1);
+ }
- mutex_exit(&p->p_lock);
- mutex_enter(&fip->fi_lock);
- UF_ENTER(ufp, fip, fd);
- ASSERT3U(ufp->uf_refcnt, >, 0);
- ufp->uf_refcnt--;
- UF_EXIT(ufp);
- mutex_exit(&fip->fi_lock);
- mutex_enter(&p->p_lock);
+ return (fp);
}
void
diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c
index 621116fd01..f2a924095e 100644
--- a/usr/src/uts/common/fs/proc/prvnops.c
+++ b/usr/src/uts/common/fs/proc/prvnops.c
@@ -24,6 +24,7 @@
* Copyright (c) 2018, Joyent, Inc.
* Copyright (c) 2017 by Delphix. All rights reserved.
* Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
+ * Copyright 2022 MNX Cloud, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -824,10 +825,8 @@ pr_read_fdinfo(prnode_t *pnp, uio_t *uiop, cred_t *cr)
prfdinfo_t *fdinfo;
list_t data;
proc_t *p;
- vnode_t *vp;
uint_t fd;
file_t *fp;
- cred_t *file_cred;
short ufp_flag;
int error = 0;
@@ -865,9 +864,6 @@ pr_read_fdinfo(prnode_t *pnp, uio_t *uiop, cred_t *cr)
goto out;
}
- vp = fp->f_vnode;
- VN_HOLD(vp);
-
/*
* For fdinfo, we don't want to include the placeholder pr_misc at the
* end of the struct. We'll terminate the data with an empty pr_misc
@@ -881,21 +877,16 @@ pr_read_fdinfo(prnode_t *pnp, uio_t *uiop, cred_t *cr)
if ((fdinfo->pr_fileflags & (FSEARCH | FEXEC)) == 0)
fdinfo->pr_fileflags += FOPEN;
fdinfo->pr_offset = fp->f_offset;
- file_cred = fp->f_cred;
- crhold(file_cred);
/*
* Information from the vnode (rather than the file_t) is retrieved
* later, in prgetfdinfo() - for example sock_getfasync()
*/
- pr_releasef(p, fd);
prunlock(pnp);
- error = prgetfdinfo(p, vp, fdinfo, cr, file_cred, &data);
+ error = prgetfdinfo(p, fp->f_vnode, fdinfo, cr, fp->f_cred, &data);
- crfree(file_cred);
-
- VN_RELE(vp);
+ (void) closef(fp);
out:
if (error == 0)
@@ -3105,12 +3096,36 @@ prgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
return (0);
}
+ /* A subset of prlock(pnp...) */
p = pr_p_lock(pnp);
mutex_exit(&pr_pidlock);
if (p == NULL)
return (ENOENT);
pcp = pnp->pr_common;
+ /*
+ * Because we're performing a subset of prlock() inline here, we must
+ * follow prlock's semantics when encountering a zombie process
+ * (PRC_DESTROY flag is set) or an exiting process (SEXITING flag is
+ * set). Those semantics indicate acting as if the process is no
+ * longer there (return ENOENT).
+ *
+ * If we chose to proceed here regardless, we may encounter issues
+ * when we drop the p_lock (see PR_OBJECTDIR, PR_PATHDIR, PR_*MAP,
+ * PR_LDT, and PR_*PAGEDATA below). A process-cleanup which was
+ * blocked on p_lock may ignore the P_PR_LOCK flag we set above, since
+ * it set one of PRC_DESTROY or SEXITING. If the process then gets
+ * destroyed our "p" will be useless, as will its p_lock.
+ *
+ * It may be desirable to move this check to only places further down
+ * prior to actual droppages of p->p_lock, but for now, we're playing
+ * it safe and checking here immediately, like prlock() does..
+ */
+ if (((pcp->prc_flags & PRC_DESTROY) || (p->p_flag & SEXITING))) {
+ prunlock(pnp);
+ return (ENOENT);
+ }
+
mutex_enter(&p->p_crlock);
vap->va_uid = crgetruid(p->p_cred);
vap->va_gid = crgetrgid(p->p_cred);
@@ -3182,7 +3197,6 @@ prgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
break;
case PR_FDINFO: {
file_t *fp;
- vnode_t *vp;
int fd = pnp->pr_index;
fp = pr_getf(p, fd, NULL);
@@ -3190,13 +3204,10 @@ prgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
prunlock(pnp);
return (ENOENT);
}
- vp = fp->f_vnode;
- VN_HOLD(vp);
- pr_releasef(p, fd);
prunlock(pnp);
- vap->va_size = prgetfdinfosize(p, vp, cr);
- VN_RELE(vp);
+ vap->va_size = prgetfdinfosize(p, fp->f_vnode, cr);
vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size);
+ (void) closef(fp);
return (0);
}
case PR_LWPDIR:
@@ -4245,7 +4256,7 @@ pr_lookup_lwpiddir(vnode_t *dp, char *comp)
}
/*
- * Lookup one of the process's open files.
+ * Lookup one of the process's file vnodes.
*/
static vnode_t *
pr_lookup_fddir(vnode_t *dp, char *comp)
@@ -4292,10 +4303,11 @@ pr_lookup_fddir(vnode_t *dp, char *comp)
pnp->pr_mode |= 0222;
vp = fp->f_vnode;
VN_HOLD(vp);
- pr_releasef(p, fd);
}
prunlock(dpnp);
+ if (fp != NULL)
+ (void) closef(fp);
if (vp == NULL) {
prfreenode(pnp);