diff options
author | Dan McDonald <danmcd@mnx.io> | 2022-07-06 16:58:29 -0400 |
---|---|---|
committer | Dan McDonald <danmcd@mnx.io> | 2022-08-31 13:44:03 -0400 |
commit | da29c6a3afe4aa1e02c76bdf0f78a1ee76ee9ac8 (patch) | |
tree | f1c17e94e7085184904a359fa16c88b1be5c2605 /usr/src/uts/common/fs | |
parent | 2570281cf351044b6936651ce26dbe1f801dcbd8 (diff) | |
download | illumos-gate-da29c6a3afe4aa1e02c76bdf0f78a1ee76ee9ac8.tar.gz |
14788 FDINFO misbehaves in multiple ways
Portions contributed by: Gordon Ross <gordon.w.ross@gmail.com>
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Reviewed by: Igor Kozhukhov <igor@dilos.org>
Approved by: Patrick Mooney <pmooney@pfmooney.com>
Diffstat (limited to 'usr/src/uts/common/fs')
-rw-r--r-- | usr/src/uts/common/fs/proc/prdata.h | 1 | ||||
-rw-r--r-- | usr/src/uts/common/fs/proc/prsubr.c | 105 | ||||
-rw-r--r-- | usr/src/uts/common/fs/proc/prvnops.c | 52 |
3 files changed, 114 insertions, 44 deletions
diff --git a/usr/src/uts/common/fs/proc/prdata.h b/usr/src/uts/common/fs/proc/prdata.h index 59315368d7..45219eccd5 100644 --- a/usr/src/uts/common/fs/proc/prdata.h +++ b/usr/src/uts/common/fs/proc/prdata.h @@ -352,7 +352,6 @@ extern int pr_set(proc_t *, long); extern int pr_unset(proc_t *, long); extern void pr_sethold(prnode_t *, sigset_t *); extern file_t *pr_getf(proc_t *, uint_t, short *); -extern void pr_releasef(proc_t *, uint_t); extern void pr_setfault(proc_t *, fltset_t *); extern int prusrio(proc_t *, enum uio_rw, struct uio *, int); extern int prwritectl(vnode_t *, struct uio *, cred_t *); diff --git a/usr/src/uts/common/fs/proc/prsubr.c b/usr/src/uts/common/fs/proc/prsubr.c index 76a645f931..021d2b4b49 100644 --- a/usr/src/uts/common/fs/proc/prsubr.c +++ b/usr/src/uts/common/fs/proc/prsubr.c @@ -23,6 +23,7 @@ * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2017, Joyent, Inc. * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. + * Copyright 2022 MNX Cloud, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -1492,15 +1493,56 @@ pr_u64tos(uint64_t n, char *s) return (len); } +/* + * Similar to getf() / getf_gen(), but for the specified process. On success, + * returns the fp with fp->f_count incremented. The caller MUST call + * closef(fp) on the returned fp after completing any actions using that fp. + * We return a reference-held (fp->f_count bumped) file_t so no other closef() + * can invoke destructive VOP_CLOSE actions while we're inspecting the + * process's FD. + * + * Returns NULL for errors: either an empty process-table slot post-fi_lock + * and UF_ENTER, or too many mutex_tryenter() failures on the file_t's f_tlock. + * Both failure modes have DTrace probes. + * + * The current design of the procfs "close" code path uses the following lock + * order of: + * + * 1: (file_t) f_tlock + * 2: (proc_t) p_lock AND setting p->p_proc_flag's P_PR_LOCK + * + * That happens because closef() holds f_tlock while calling fop_close(), + * which can be prclose(), which currently waits on and sets P_PR_LOCK at its + * beginning. + * + * That lock order creates a challenge for pr_getf, which needs to take those + * locks in the opposite order when the fd points to a procfs file descriptor. + * The solution chosen here is to use mutex_tryenter on f_tlock and retry some + * (limited) number of times, failing if we don't get both locks. + * + * The cases where this can fail are rare, and all involve a procfs caller + * asking for info (eg. FDINFO) on another procfs FD. In these cases, + * returning EBADF (which results from a NULL return from pr_getf()) is + * acceptable. + * + * One can increase the number of tries in pr_getf_maxtries if one is worried + * about the contentuous case. + */ + +uint64_t pr_getf_tryfails; /* Bumped for statistic purposes. */ +int pr_getf_maxtries = 3; /* So you can tune it from /etc/system */ + file_t * pr_getf(proc_t *p, uint_t fd, short *flag) { uf_entry_t *ufp; uf_info_t *fip; file_t *fp; + int tries = 0; ASSERT(MUTEX_HELD(&p->p_lock) && (p->p_proc_flag & P_PR_LOCK)); +retry: fip = P_FINFO(p); if (fd >= fip->fi_nfiles) @@ -1510,37 +1552,54 @@ pr_getf(proc_t *p, uint_t fd, short *flag) mutex_enter(&fip->fi_lock); UF_ENTER(ufp, fip, fd); if ((fp = ufp->uf_file) != NULL && fp->f_count > 0) { - if (flag != NULL) - *flag = ufp->uf_flag; - ufp->uf_refcnt++; + if (mutex_tryenter(&fp->f_tlock)) { + ASSERT(fp->f_count > 0); + fp->f_count++; + mutex_exit(&fp->f_tlock); + if (flag != NULL) + *flag = ufp->uf_flag; + } else { + /* + * Note the number of mutex_trylock attempts. + * + * The exit path will catch this and try again if we + * are below the retry threshhold (pr_getf_maxtries). + */ + tries++; + pr_getf_tryfails++; + /* + * If we hit pr_getf_maxtries, we'll return NULL. + * DTrace scripts looking for this sort of failure + * should check when arg1 is pr_getf_maxtries. + */ + DTRACE_PROBE2(pr_getf_tryfail, file_t *, fp, int, + tries); + fp = NULL; + } } else { fp = NULL; + /* If we fail here, someone else closed this FD. */ + DTRACE_PROBE1(pr_getf_emptyslot, int, tries); + tries = pr_getf_maxtries; /* Don't bother retrying. */ } UF_EXIT(ufp); mutex_exit(&fip->fi_lock); mutex_enter(&p->p_lock); - return (fp); -} - -void -pr_releasef(proc_t *p, uint_t fd) -{ - uf_entry_t *ufp; - uf_info_t *fip; - - ASSERT(MUTEX_HELD(&p->p_lock) && (p->p_proc_flag & P_PR_LOCK)); - - fip = P_FINFO(p); + /* Use goto instead of tail-recursion so we can keep "tries" around. */ + if (fp == NULL) { + /* "tries" starts at 1. */ + if (tries < pr_getf_maxtries) + goto retry; + } else { + /* + * Probes here will detect successes after arg1's number of + * mutex_tryenter() calls. + */ + DTRACE_PROBE2(pr_getf_trysuccess, file_t *, fp, int, tries + 1); + } - mutex_exit(&p->p_lock); - mutex_enter(&fip->fi_lock); - UF_ENTER(ufp, fip, fd); - ASSERT3U(ufp->uf_refcnt, >, 0); - ufp->uf_refcnt--; - UF_EXIT(ufp); - mutex_exit(&fip->fi_lock); - mutex_enter(&p->p_lock); + return (fp); } void diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c index 621116fd01..f2a924095e 100644 --- a/usr/src/uts/common/fs/proc/prvnops.c +++ b/usr/src/uts/common/fs/proc/prvnops.c @@ -24,6 +24,7 @@ * Copyright (c) 2018, Joyent, Inc. * Copyright (c) 2017 by Delphix. All rights reserved. * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. + * Copyright 2022 MNX Cloud, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -824,10 +825,8 @@ pr_read_fdinfo(prnode_t *pnp, uio_t *uiop, cred_t *cr) prfdinfo_t *fdinfo; list_t data; proc_t *p; - vnode_t *vp; uint_t fd; file_t *fp; - cred_t *file_cred; short ufp_flag; int error = 0; @@ -865,9 +864,6 @@ pr_read_fdinfo(prnode_t *pnp, uio_t *uiop, cred_t *cr) goto out; } - vp = fp->f_vnode; - VN_HOLD(vp); - /* * For fdinfo, we don't want to include the placeholder pr_misc at the * end of the struct. We'll terminate the data with an empty pr_misc @@ -881,21 +877,16 @@ pr_read_fdinfo(prnode_t *pnp, uio_t *uiop, cred_t *cr) if ((fdinfo->pr_fileflags & (FSEARCH | FEXEC)) == 0) fdinfo->pr_fileflags += FOPEN; fdinfo->pr_offset = fp->f_offset; - file_cred = fp->f_cred; - crhold(file_cred); /* * Information from the vnode (rather than the file_t) is retrieved * later, in prgetfdinfo() - for example sock_getfasync() */ - pr_releasef(p, fd); prunlock(pnp); - error = prgetfdinfo(p, vp, fdinfo, cr, file_cred, &data); + error = prgetfdinfo(p, fp->f_vnode, fdinfo, cr, fp->f_cred, &data); - crfree(file_cred); - - VN_RELE(vp); + (void) closef(fp); out: if (error == 0) @@ -3105,12 +3096,36 @@ prgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, return (0); } + /* A subset of prlock(pnp...) */ p = pr_p_lock(pnp); mutex_exit(&pr_pidlock); if (p == NULL) return (ENOENT); pcp = pnp->pr_common; + /* + * Because we're performing a subset of prlock() inline here, we must + * follow prlock's semantics when encountering a zombie process + * (PRC_DESTROY flag is set) or an exiting process (SEXITING flag is + * set). Those semantics indicate acting as if the process is no + * longer there (return ENOENT). + * + * If we chose to proceed here regardless, we may encounter issues + * when we drop the p_lock (see PR_OBJECTDIR, PR_PATHDIR, PR_*MAP, + * PR_LDT, and PR_*PAGEDATA below). A process-cleanup which was + * blocked on p_lock may ignore the P_PR_LOCK flag we set above, since + * it set one of PRC_DESTROY or SEXITING. If the process then gets + * destroyed our "p" will be useless, as will its p_lock. + * + * It may be desirable to move this check to only places further down + * prior to actual droppages of p->p_lock, but for now, we're playing + * it safe and checking here immediately, like prlock() does.. + */ + if (((pcp->prc_flags & PRC_DESTROY) || (p->p_flag & SEXITING))) { + prunlock(pnp); + return (ENOENT); + } + mutex_enter(&p->p_crlock); vap->va_uid = crgetruid(p->p_cred); vap->va_gid = crgetrgid(p->p_cred); @@ -3182,7 +3197,6 @@ prgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, break; case PR_FDINFO: { file_t *fp; - vnode_t *vp; int fd = pnp->pr_index; fp = pr_getf(p, fd, NULL); @@ -3190,13 +3204,10 @@ prgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, prunlock(pnp); return (ENOENT); } - vp = fp->f_vnode; - VN_HOLD(vp); - pr_releasef(p, fd); prunlock(pnp); - vap->va_size = prgetfdinfosize(p, vp, cr); - VN_RELE(vp); + vap->va_size = prgetfdinfosize(p, fp->f_vnode, cr); vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size); + (void) closef(fp); return (0); } case PR_LWPDIR: @@ -4245,7 +4256,7 @@ pr_lookup_lwpiddir(vnode_t *dp, char *comp) } /* - * Lookup one of the process's open files. + * Lookup one of the process's file vnodes. */ static vnode_t * pr_lookup_fddir(vnode_t *dp, char *comp) @@ -4292,10 +4303,11 @@ pr_lookup_fddir(vnode_t *dp, char *comp) pnp->pr_mode |= 0222; vp = fp->f_vnode; VN_HOLD(vp); - pr_releasef(p, fd); } prunlock(dpnp); + if (fp != NULL) + (void) closef(fp); if (vp == NULL) { prfreenode(pnp); |