Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/uts/common/fs/fifofs/fifovnops.c    7
-rw-r--r--  usr/src/uts/common/fs/proc/prvnops.c        6
-rw-r--r--  usr/src/uts/common/io/devpoll.c           517
-rw-r--r--  usr/src/uts/common/io/tty_pty.c             6
-rw-r--r--  usr/src/uts/common/os/streamio.c           30
-rw-r--r--  usr/src/uts/common/sys/devpoll.h            5
-rw-r--r--  usr/src/uts/common/sys/poll.h               6
-rw-r--r--  usr/src/uts/common/sys/poll_impl.h        106
-rw-r--r--  usr/src/uts/common/syscall/poll.c         298
9 files changed, 811 insertions(+), 170 deletions(-)
diff --git a/usr/src/uts/common/fs/fifofs/fifovnops.c b/usr/src/uts/common/fs/fifofs/fifovnops.c
index fee2924093..61edfab76c 100644
--- a/usr/src/uts/common/fs/fifofs/fifovnops.c
+++ b/usr/src/uts/common/fs/fifofs/fifovnops.c
@@ -28,7 +28,7 @@
*/
/*
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
/*
@@ -1775,7 +1775,10 @@ fifo_poll(vnode_t *vp, short events, int anyyet, short *reventsp,
fn_dest = fnp->fn_dest;
fn_lock = fnp->fn_lock;
- polllock(&stp->sd_pollist, &fn_lock->flk_lock);
+ if (polllock(&stp->sd_pollist, &fn_lock->flk_lock) != 0) {
+ *reventsp = POLLNVAL;
+ return (0);
+ }
/*
* see if FIFO/pipe open
diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c
index e392ce4b14..b60f938c9b 100644
--- a/usr/src/uts/common/fs/proc/prvnops.c
+++ b/usr/src/uts/common/fs/proc/prvnops.c
@@ -5939,7 +5939,11 @@ prpoll(vnode_t *vp, short events, int anyyet, short *reventsp,
return (0);
}
- lockstate = pollunlock(); /* avoid deadlock with prnotify() */
+ /* avoid deadlock with prnotify() */
+ if (pollunlock(&lockstate) != 0) {
+ *reventsp = POLLNVAL;
+ return (0);
+ }
if ((error = prlock(pnp, ZNO)) != 0) {
pollrelock(lockstate);
diff --git a/usr/src/uts/common/io/devpoll.c b/usr/src/uts/common/io/devpoll.c
index 7b3454f89c..a63e1f1a08 100644
--- a/usr/src/uts/common/io/devpoll.c
+++ b/usr/src/uts/common/io/devpoll.c
@@ -123,6 +123,12 @@ static struct modlinkage modlinkage = {
NULL
};
+static void pcachelink_assoc(pollcache_t *, pollcache_t *);
+static void pcachelink_mark_stale(pollcache_t *);
+static void pcachelink_purge_stale(pollcache_t *);
+static void pcachelink_purge_all(pollcache_t *);
+
+
/*
* Locking Design
*
@@ -157,7 +163,6 @@ _init()
mutex_init(&devpoll_lock, NULL, MUTEX_DEFAULT, NULL);
devpoll_init = 1;
if ((error = mod_install(&modlinkage)) != 0) {
- mutex_destroy(&devpoll_lock);
kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize);
devpoll_init = 0;
}
@@ -255,6 +260,7 @@ dp_pcache_poll(dp_entry_t *dpep, void *dpbuf,
epoll_event_t *epoll;
int error = 0;
short mask = POLLRDHUP | POLLWRBAND;
+ boolean_t is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;
ASSERT(MUTEX_HELD(&pcp->pc_lock));
if (pcp->pc_bitmap == NULL) {
@@ -265,7 +271,7 @@ dp_pcache_poll(dp_entry_t *dpep, void *dpbuf,
return (error);
}
- if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) {
+ if (is_epoll) {
pfdp = NULL;
epoll = (epoll_event_t *)dpbuf;
} else {
@@ -331,7 +337,7 @@ repoll:
* polling a closed fd. Hope this will remind
* user to do a POLLREMOVE.
*/
- if (pfdp != NULL) {
+ if (!is_epoll && pfdp != NULL) {
pfdp[fdcnt].fd = fd;
pfdp[fdcnt].revents = POLLNVAL;
fdcnt++;
@@ -343,18 +349,18 @@ repoll:
* perform the implicit removal to remain
* closer to the epoll semantics.
*/
- ASSERT(epoll != NULL);
+ if (is_epoll) {
+ pdp->pd_fp = NULL;
+ pdp->pd_events = 0;
- pdp->pd_fp = NULL;
- pdp->pd_events = 0;
+ if (php != NULL) {
+ pollhead_delete(php, pdp);
+ pdp->pd_php = NULL;
+ }
- if (php != NULL) {
- pollhead_delete(php, pdp);
- pdp->pd_php = NULL;
+ BT_CLEAR(pcp->pc_bitmap, fd);
+ continue;
}
-
- BT_CLEAR(pcp->pc_bitmap, fd);
- continue;
}
if (fp != pdp->pd_fp) {
@@ -394,6 +400,7 @@ repoll:
if (error != 0) {
break;
}
+
/*
* layered devices (e.g. console driver)
* may change the vnode and thus the pollhead
@@ -416,7 +423,7 @@ repoll:
pfdp[fdcnt].fd = fd;
pfdp[fdcnt].events = pdp->pd_events;
pfdp[fdcnt].revents = revent;
- } else {
+ } else if (epoll != NULL) {
epoll_event_t *ep = &epoll[fdcnt];
ASSERT(epoll != NULL);
@@ -449,6 +456,35 @@ repoll:
(pdp->pd_events & EPOLLWRNORM)) {
ep->events |= EPOLLWRNORM;
}
+ } else {
+ pollstate_t *ps =
+ curthread->t_pollstate;
+ /*
+ * The devpoll handle itself is being
+ * polled. Notify the caller of any
+ * readable event(s), leaving as much
+ * state as possible untouched.
+ */
+ VERIFY(fdcnt == 0);
+ VERIFY(ps != NULL);
+
+ /*
+ * If a call to pollunlock() fails
+ * during VOP_POLL, skip over the fd
+ * and continue polling.
+ *
+ * Otherwise, report that there is an
+ * event pending.
+ */
+ if ((ps->ps_flags & POLLSTATE_ULFAIL)
+ != 0) {
+ ps->ps_flags &=
+ ~POLLSTATE_ULFAIL;
+ continue;
+ } else {
+ fdcnt++;
+ break;
+ }
}
/*
@@ -608,6 +644,7 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
polldat_t *pdp;
int fd;
file_t *fp;
+ boolean_t is_epoll, fds_added = B_FALSE;
minor = getminor(dev);
@@ -616,22 +653,21 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
dpep = devpolltbl[minor];
ASSERT(dpep != NULL);
mutex_exit(&devpoll_lock);
+
+ mutex_enter(&dpep->dpe_lock);
pcp = dpep->dpe_pcache;
+ is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;
+ size = (is_epoll) ? sizeof (dvpoll_epollfd_t) : sizeof (pollfd_t);
+ mutex_exit(&dpep->dpe_lock);
- if (!(dpep->dpe_flag & DP_ISEPOLLCOMPAT) &&
- curproc->p_pid != pcp->pc_pid) {
- if (pcp->pc_pid != -1)
+ if (!is_epoll && curproc->p_pid != pcp->pc_pid) {
+ if (pcp->pc_pid != -1) {
return (EACCES);
+ }
pcp->pc_pid = curproc->p_pid;
}
- if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) {
- size = sizeof (dvpoll_epollfd_t);
- } else {
- size = sizeof (pollfd_t);
- }
-
uiosize = uiop->uio_resid;
pollfdnum = uiosize / size;
mutex_enter(&curproc->p_lock);
@@ -640,7 +676,7 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
curproc->p_rctls, curproc, RCA_SAFE);
mutex_exit(&curproc->p_lock);
- return (set_errno(EINVAL));
+ return (EINVAL);
}
mutex_exit(&curproc->p_lock);
/*
@@ -665,44 +701,44 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
/*
* We are about to enter the core portion of dpwrite(). Make sure this
* write has exclusive access in this portion of the code, i.e., no
- * other writers in this code and no other readers in dpioctl.
+ * other writers in this code.
+ *
+ * Waiting for all readers to drop their references to the dpe is
+ * unnecessary since the pollcache itself is protected by pc_lock.
*/
mutex_enter(&dpep->dpe_lock);
dpep->dpe_writerwait++;
- while (dpep->dpe_refcnt != 0) {
- /*
- * We need to do a bit of a dance here: we need to drop
- * our dpe_lock and grab the pc_lock to broadcast the pc_cv to
- * kick any DP_POLL/DP_PPOLL sleepers.
- */
- mutex_exit(&dpep->dpe_lock);
- mutex_enter(&pcp->pc_lock);
- pcp->pc_flag |= PC_WRITEWANTED;
- cv_broadcast(&pcp->pc_cv);
- mutex_exit(&pcp->pc_lock);
- mutex_enter(&dpep->dpe_lock);
-
- if (dpep->dpe_refcnt == 0)
- break;
+ while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0) {
+ ASSERT(dpep->dpe_refcnt != 0);
if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) {
dpep->dpe_writerwait--;
mutex_exit(&dpep->dpe_lock);
- mutex_enter(&pcp->pc_lock);
- pcp->pc_flag &= ~PC_WRITEWANTED;
- mutex_exit(&pcp->pc_lock);
kmem_free(pollfdp, uiosize);
- return (set_errno(EINTR));
+ return (EINTR);
}
}
dpep->dpe_writerwait--;
dpep->dpe_flag |= DP_WRITER_PRESENT;
dpep->dpe_refcnt++;
+ if (!is_epoll && (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0) {
+ /*
+ * The epoll compat mode was enabled while we were waiting to
+ * establish write access. It is not safe to continue since
+ * state was prepared for non-epoll operation.
+ */
+ error = EBUSY;
+ goto bypass;
+ }
mutex_exit(&dpep->dpe_lock);
- mutex_enter(&pcp->pc_lock);
- pcp->pc_flag &= ~PC_WRITEWANTED;
+ /*
+ * Since the dpwrite() may recursively walk an added /dev/poll handle,
+ * pollstate_enter() deadlock and loop detection must be used.
+ */
+ (void) pollstate_create();
+ VERIFY(pollstate_enter(pcp) == PSE_SUCCESS);
if (pcp->pc_bitmap == NULL) {
pcache_create(pcp, pollfdnum);
@@ -715,7 +751,7 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
* epoll semantics demand that we return EBADF if our
* specified fd is invalid.
*/
- if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) {
+ if (is_epoll) {
error = EBADF;
break;
}
@@ -736,7 +772,7 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
* we return EBADF if our specified fd is
* invalid.
*/
- if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) {
+ if (is_epoll) {
if ((fp = getf(fd)) == NULL) {
error = EBADF;
break;
@@ -771,7 +807,7 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
* then, the file descriptor must be closed and
* reused in a relatively tight time span.)
*/
- if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) {
+ if (is_epoll) {
if (pdp->pd_fp != NULL &&
(fp = getf(fd)) != NULL &&
fp == pdp->pd_fp &&
@@ -794,7 +830,7 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
}
}
- if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) {
+ if (is_epoll) {
epfdp = (dvpoll_epollfd_t *)pfdp;
pdp->pd_epolldata = epfdp->dpep_data;
}
@@ -886,12 +922,12 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
pdp->pd_php = php;
}
}
-
}
+ fds_added = B_TRUE;
releasef(fd);
} else {
if (pdp == NULL || pdp->pd_fp == NULL) {
- if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) {
+ if (is_epoll) {
/*
* As with the add case (above), epoll
* semantics demand that we error out
@@ -914,10 +950,19 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
BT_CLEAR(pcp->pc_bitmap, fd);
}
}
- mutex_exit(&pcp->pc_lock);
+ /*
+ * Any fds added to a recursion-capable pollcache could themselves be
+ * /dev/poll handles. To ensure that proper event propagation occurs,
+ * parent pollcaches are woken so that they can create any needed
+ * pollcache links.
+ */
+ if (fds_added) {
+ pcache_wake_parents(pcp);
+ }
+ pollstate_exit(pcp);
mutex_enter(&dpep->dpe_lock);
+bypass:
dpep->dpe_flag &= ~DP_WRITER_PRESENT;
- ASSERT(dpep->dpe_refcnt == 1);
dpep->dpe_refcnt--;
cv_broadcast(&dpep->dpe_cv);
mutex_exit(&dpep->dpe_lock);
@@ -945,6 +990,7 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
pollcache_t *pcp;
hrtime_t now;
int error = 0;
+ boolean_t is_epoll;
STRUCT_DECL(dvpoll, dvpoll);
if (cmd == DP_POLL || cmd == DP_PPOLL) {
@@ -961,6 +1007,7 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
pcp = dpep->dpe_pcache;
mutex_enter(&dpep->dpe_lock);
+ is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;
if (cmd == DP_EPOLLCOMPAT) {
if (dpep->dpe_refcnt != 0) {
@@ -982,8 +1029,7 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
return (0);
}
- if (!(dpep->dpe_flag & DP_ISEPOLLCOMPAT) &&
- curproc->p_pid != pcp->pc_pid) {
+ if (!is_epoll && curproc->p_pid != pcp->pc_pid) {
if (pcp->pc_pid != -1) {
mutex_exit(&dpep->dpe_lock);
return (EACCES);
@@ -992,7 +1038,8 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
pcp->pc_pid = curproc->p_pid;
}
- while ((dpep->dpe_flag & DP_WRITER_PRESENT) ||
+ /* Wait until all writers have cleared the handle before continuing */
+ while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0 ||
(dpep->dpe_writerwait != 0)) {
if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) {
mutex_exit(&dpep->dpe_lock);
@@ -1128,7 +1175,7 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
return (error == 0 ? EINTR : 0);
}
- if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) {
+ if (is_epoll) {
size = nfds * (fdsize = sizeof (epoll_event_t));
} else {
size = nfds * (fdsize = sizeof (pollfd_t));
@@ -1139,10 +1186,7 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
* requires another per thread structure hook. This can be
* implemented later if data suggests that it's necessary.
*/
- if ((ps = curthread->t_pollstate) == NULL) {
- curthread->t_pollstate = pollstate_create();
- ps = curthread->t_pollstate;
- }
+ ps = pollstate_create();
if (ps->ps_dpbufsize < size) {
/*
@@ -1169,15 +1213,25 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
}
}
- mutex_enter(&pcp->pc_lock);
+ VERIFY(pollstate_enter(pcp) == PSE_SUCCESS);
for (;;) {
pcp->pc_flag &= ~PC_POLLWAKE;
+ /*
+ * Mark all child pcachelinks as stale.
+ * Those which are still part of the tree will be
+ * marked as valid during the poll.
+ */
+ pcachelink_mark_stale(pcp);
+
error = dp_pcache_poll(dpep, ps->ps_dpbuf,
pcp, nfds, &fdcnt);
if (fdcnt > 0 || error != 0)
break;
+ /* Purge still-stale child pcachelinks */
+ pcachelink_purge_stale(pcp);
+
/*
* A pollwake has happened since we polled cache.
*/
@@ -1192,42 +1246,12 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
break;
}
- if (!(pcp->pc_flag & PC_WRITEWANTED)) {
- error = cv_timedwait_sig_hrtime(&pcp->pc_cv,
- &pcp->pc_lock, deadline);
- } else {
- error = 1;
- }
-
- if (error > 0 && (pcp->pc_flag & PC_WRITEWANTED)) {
- /*
- * We've been kicked off of our cv because a
- * writer wants in. We're going to drop our
- * reference count and then wait until the
- * writer is gone -- at which point we'll
- * reacquire the pc_lock and call into
- * dp_pcache_poll() to get the updated state.
- */
- mutex_exit(&pcp->pc_lock);
-
- mutex_enter(&dpep->dpe_lock);
- dpep->dpe_refcnt--;
- cv_broadcast(&dpep->dpe_cv);
-
- while ((dpep->dpe_flag & DP_WRITER_PRESENT) ||
- (dpep->dpe_writerwait != 0)) {
- error = cv_wait_sig_swap(&dpep->dpe_cv,
- &dpep->dpe_lock);
- }
-
- dpep->dpe_refcnt++;
- mutex_exit(&dpep->dpe_lock);
- mutex_enter(&pcp->pc_lock);
- }
+ error = cv_timedwait_sig_hrtime(&pcp->pc_cv,
+ &pcp->pc_lock, deadline);
/*
- * If we were awakened by a signal or timeout
- * then break the loop, else poll again.
+ * If we were awakened by a signal or timeout then
+ * break the loop, else poll again.
*/
if (error <= 0) {
error = (error == 0) ? EINTR : 0;
@@ -1236,7 +1260,7 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
error = 0;
}
}
- mutex_exit(&pcp->pc_lock);
+ pollstate_exit(pcp);
DP_SIGMASK_RESTORE(ksetp);
@@ -1299,6 +1323,66 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
return (error);
}
+/*
+ * Overview of Recursive Polling
+ *
+ * It is possible for /dev/poll to poll for events on file descriptors which
+ * themselves are /dev/poll handles. Pending events in the child handle are
+ * represented as readable data via the POLLIN flag. To limit surface area,
+ * this recursion is presently allowed on only /dev/poll handles which have
+ * been placed in epoll mode via the DP_EPOLLCOMPAT ioctl. Recursion depth is
+ * limited to 5 in order to be consistent with Linux epoll.
+ *
+ * Extending dppoll() for VOP_POLL:
+ *
+ * The recursive /dev/poll implementation begins by extending dppoll() to
+ * report when resources contained in the pollcache have relevant event state.
+ * At the highest level, it means calling dp_pcache_poll() so it indicates if
+ * fd events are present without consuming them or altering the pollcache
+ * bitmap. This ensures that a subsequent DP_POLL operation on the bitmap will
+ * yield the initiating event. Additionally, the VOP_POLL should return in
+ * such a way that dp_pcache_poll() does not clear the parent bitmap entry
+ * which corresponds to the child /dev/poll fd. This means that child
+ * pollcaches will be checked during every poll which facilitates wake-up
+ * behavior detailed below.
+ *
+ * Pollcache Links and Wake Events:
+ *
+ * Recursive /dev/poll avoids complicated pollcache locking constraints during
+ * pollwakeup events by eschewing the traditional pollhead mechanism in favor
+ * of a different approach. For each pollcache at the root of a recursive
+ * /dev/poll "tree", pcachelink_t structures are established to all child
+ * /dev/poll pollcaches. During pollnotify() in a child pollcache, the
+ * linked list of pcachelink_t entries is walked, where those marked as valid
+ * incur a cv_broadcast to their parent pollcache. Most notably, these
+ * pcachelink_t cv wakeups are performed without acquiring pc_lock on the
+ * parent pollcache (which would require careful deadlock avoidance). This
+ * still allows the woken poll on the parent to discover the pertinent events
+ * due to the fact that bitmap entries for the child pollcache are always
+ * maintained by the dppoll() logic above.
+ *
+ * Depth Limiting and Loop Prevention:
+ *
+ * As each pollcache is encountered (either via DP_POLL or dppoll()), depth and
+ * loop constraints are enforced via pollstate_enter(). The pollcache_t
+ * pointer is compared against any existing entries in ps_pc_stack and is added
+ * to the end if no match (and therefore loop) is found. Once poll operations
+ * for a given pollcache_t are complete, pollstate_exit() clears the pointer
+ * from the list. The pollstate_enter() and pollstate_exit() functions are
+ * responsible for acquiring and releasing pc_lock, respectively.
+ *
+ * Deadlock Safety:
+ *
+ * Descending through a tree of recursive /dev/poll handles involves the tricky
+ * business of sequentially entering multiple pollcache locks. This tree
+ * topology cannot define a lock acquisition order in such a way that it is
+ * immune to deadlocks between threads. The pollstate_enter() and
+ * pollstate_exit() functions provide an interface for recursive /dev/poll
+ * operations to safely lock pollcaches while failing gracefully in the face of
+ * deadlocking topologies. (See pollstate_contend() for more detail about how
+ * deadlocks are detected and resolved.)
+ */
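
[Illustration, not part of the patch] The recursion described in the overview
above is easiest to see from userland. Assuming this change is applied and
that the system's epoll emulation (which sits atop these epoll-mode /dev/poll
handles) is in use, one epoll handle may be registered with another, and an
event on the innermost fd propagates to a wait on the outer handle. A minimal
sketch, with all names local to the example:

/*
 * Hypothetical userland demonstration of recursive polling: the outer
 * epoll handle watches the inner epoll handle, which watches a pipe.
 */
#include <sys/epoll.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct epoll_event ev = { 0 };
	int pfd[2];
	int inner, outer;

	(void) pipe(pfd);
	inner = epoll_create1(0);
	outer = epoll_create1(0);

	/* The inner handle watches the pipe's read end. */
	ev.events = EPOLLIN;
	ev.data.fd = pfd[0];
	(void) epoll_ctl(inner, EPOLL_CTL_ADD, pfd[0], &ev);

	/* The outer handle watches the inner epoll handle (recursion). */
	ev.events = EPOLLIN;
	ev.data.fd = inner;
	(void) epoll_ctl(outer, EPOLL_CTL_ADD, inner, &ev);

	/* A write to the pipe should wake a wait on the outer handle. */
	(void) write(pfd[1], "x", 1);
	if (epoll_wait(outer, &ev, 1, 1000) == 1)
		(void) printf("outer handle saw a readable child\n");

	return (0);
}
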
+
/*ARGSUSED*/
static int
dppoll(dev_t dev, short events, int anyyet, short *reventsp,
@@ -1306,24 +1390,63 @@ dppoll(dev_t dev, short events, int anyyet, short *reventsp,
{
minor_t minor;
dp_entry_t *dpep;
+ pollcache_t *pcp;
+ int res, rc = 0;
minor = getminor(dev);
-
mutex_enter(&devpoll_lock);
+ ASSERT(minor < dptblsize);
dpep = devpolltbl[minor];
ASSERT(dpep != NULL);
mutex_exit(&devpoll_lock);
- /*
- * Polling on a /dev/poll fd is not fully supported yet.
- */
- if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) {
- /* no error in epoll compat. mode */
- *reventsp = 0;
- } else {
+ mutex_enter(&dpep->dpe_lock);
+ if ((dpep->dpe_flag & DP_ISEPOLLCOMPAT) == 0) {
+ /* Poll recursion is not yet supported for non-epoll handles */
*reventsp = POLLERR;
+ mutex_exit(&dpep->dpe_lock);
+ return (0);
+ } else {
+ dpep->dpe_refcnt++;
+ pcp = dpep->dpe_pcache;
+ mutex_exit(&dpep->dpe_lock);
}
- return (0);
+
+ res = pollstate_enter(pcp);
+ if (res == PSE_SUCCESS) {
+ nfds_t nfds = 1;
+ int fdcnt = 0;
+ pollstate_t *ps = curthread->t_pollstate;
+
+ rc = dp_pcache_poll(dpep, NULL, pcp, nfds, &fdcnt);
+ if (rc == 0) {
+ *reventsp = (fdcnt > 0) ? POLLIN : 0;
+ }
+ pcachelink_assoc(pcp, ps->ps_pc_stack[0]);
+ pollstate_exit(pcp);
+ } else {
+ switch (res) {
+ case PSE_FAIL_DEPTH:
+ rc = EINVAL;
+ break;
+ case PSE_FAIL_LOOP:
+ case PSE_FAIL_DEADLOCK:
+ rc = ELOOP;
+ break;
+ default:
+ /*
+ * If anything else has gone awry, such as being polled
+ * from an unexpected context, fall back to the
+ * recursion-intolerant response.
+ */
+ *reventsp = POLLERR;
+ rc = 0;
+ break;
+ }
+ }
+
+ DP_REFRELE(dpep);
+ return (rc);
}
/*
@@ -1376,8 +1499,190 @@ dpclose(dev_t dev, int flag, int otyp, cred_t *credp)
while (pcp->pc_busy > 0)
cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit);
mutex_exit(&pcp->pc_no_exit);
+
+ /* Clean up any pollcache links created via recursive /dev/poll */
+ if (pcp->pc_parents != NULL || pcp->pc_children != NULL) {
+ /*
+ * Because of the locking rules for pcachelink manipulation,
+ * acquiring pc_lock is required for this step.
+ */
+ mutex_enter(&pcp->pc_lock);
+ pcachelink_purge_all(pcp);
+ mutex_exit(&pcp->pc_lock);
+ }
+
pcache_destroy(pcp);
ASSERT(dpep->dpe_refcnt == 0);
kmem_free(dpep, sizeof (dp_entry_t));
return (0);
}
+
+static void
+pcachelink_locked_rele(pcachelink_t *pl)
+{
+ ASSERT(MUTEX_HELD(&pl->pcl_lock));
+ VERIFY(pl->pcl_refcnt >= 1);
+
+ pl->pcl_refcnt--;
+ if (pl->pcl_refcnt == 0) {
+ VERIFY(pl->pcl_state == PCL_INVALID);
+ ASSERT(pl->pcl_parent_pc == NULL);
+ ASSERT(pl->pcl_child_pc == NULL);
+ ASSERT(pl->pcl_parent_next == NULL);
+ ASSERT(pl->pcl_child_next == NULL);
+
+ pl->pcl_state = PCL_FREE;
+ mutex_destroy(&pl->pcl_lock);
+ kmem_free(pl, sizeof (pcachelink_t));
+ } else {
+ mutex_exit(&pl->pcl_lock);
+ }
+}
+
+/*
+ * Associate parent and child pollcaches via a pcachelink_t. If an existing
+ * link (stale or valid) between the two is found, it will be reused. If a
+ * suitable link is not found for reuse, a new one will be allocated.
+ */
+static void
+pcachelink_assoc(pollcache_t *child, pollcache_t *parent)
+{
+ pcachelink_t *pl, **plpn;
+
+ ASSERT(MUTEX_HELD(&child->pc_lock));
+ ASSERT(MUTEX_HELD(&parent->pc_lock));
+
+ /* Search for an existing link we can reuse. */
+ plpn = &child->pc_parents;
+ for (pl = child->pc_parents; pl != NULL; pl = *plpn) {
+ mutex_enter(&pl->pcl_lock);
+ if (pl->pcl_state == PCL_INVALID) {
+ /* Clean any invalid links while walking the list */
+ *plpn = pl->pcl_parent_next;
+ pl->pcl_child_pc = NULL;
+ pl->pcl_parent_next = NULL;
+ pcachelink_locked_rele(pl);
+ } else if (pl->pcl_parent_pc == parent) {
+ /* Successfully found parent link */
+ ASSERT(pl->pcl_state == PCL_VALID ||
+ pl->pcl_state == PCL_STALE);
+ pl->pcl_state = PCL_VALID;
+ mutex_exit(&pl->pcl_lock);
+ return;
+ } else {
+ plpn = &pl->pcl_parent_next;
+ mutex_exit(&pl->pcl_lock);
+ }
+ }
+
+ /* No existing link to the parent was found. Create a fresh one. */
+ pl = kmem_zalloc(sizeof (pcachelink_t), KM_SLEEP);
+ mutex_init(&pl->pcl_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ pl->pcl_parent_pc = parent;
+ pl->pcl_child_next = parent->pc_children;
+ parent->pc_children = pl;
+ pl->pcl_refcnt++;
+
+ pl->pcl_child_pc = child;
+ pl->pcl_parent_next = child->pc_parents;
+ child->pc_parents = pl;
+ pl->pcl_refcnt++;
+
+ pl->pcl_state = PCL_VALID;
+}
+
+/*
+ * Mark all child links in a pollcache as stale. Any invalid child links found
+ * during iteration are purged.
+ */
+static void
+pcachelink_mark_stale(pollcache_t *pcp)
+{
+ pcachelink_t *pl, **plpn;
+
+ ASSERT(MUTEX_HELD(&pcp->pc_lock));
+
+ plpn = &pcp->pc_children;
+ for (pl = pcp->pc_children; pl != NULL; pl = *plpn) {
+ mutex_enter(&pl->pcl_lock);
+ if (pl->pcl_state == PCL_INVALID) {
+ /*
+ * Remove any invalid links while we are going to the
+ * trouble of walking the list.
+ */
+ *plpn = pl->pcl_child_next;
+ pl->pcl_parent_pc = NULL;
+ pl->pcl_child_next = NULL;
+ pcachelink_locked_rele(pl);
+ } else {
+ pl->pcl_state = PCL_STALE;
+ plpn = &pl->pcl_child_next;
+ mutex_exit(&pl->pcl_lock);
+ }
+ }
+}
+
+/*
+ * Purge all stale (or invalid) child links from a pollcache.
+ */
+static void
+pcachelink_purge_stale(pollcache_t *pcp)
+{
+ pcachelink_t *pl, **plpn;
+
+ ASSERT(MUTEX_HELD(&pcp->pc_lock));
+
+ plpn = &pcp->pc_children;
+ for (pl = pcp->pc_children; pl != NULL; pl = *plpn) {
+ mutex_enter(&pl->pcl_lock);
+ switch (pl->pcl_state) {
+ case PCL_STALE:
+ pl->pcl_state = PCL_INVALID;
+ /* FALLTHROUGH */
+ case PCL_INVALID:
+ *plpn = pl->pcl_child_next;
+ pl->pcl_parent_pc = NULL;
+ pl->pcl_child_next = NULL;
+ pcachelink_locked_rele(pl);
+ break;
+ default:
+ plpn = &pl->pcl_child_next;
+ mutex_exit(&pl->pcl_lock);
+ }
+ }
+}
+
+/*
+ * Purge all child and parent links from a pollcache, regardless of status.
+ */
+static void
+pcachelink_purge_all(pollcache_t *pcp)
+{
+ pcachelink_t *pl, **plpn;
+
+ ASSERT(MUTEX_HELD(&pcp->pc_lock));
+
+ plpn = &pcp->pc_parents;
+ for (pl = pcp->pc_parents; pl != NULL; pl = *plpn) {
+ mutex_enter(&pl->pcl_lock);
+ pl->pcl_state = PCL_INVALID;
+ *plpn = pl->pcl_parent_next;
+ pl->pcl_child_pc = NULL;
+ pl->pcl_parent_next = NULL;
+ pcachelink_locked_rele(pl);
+ }
+
+ plpn = &pcp->pc_children;
+ for (pl = pcp->pc_children; pl != NULL; pl = *plpn) {
+ mutex_enter(&pl->pcl_lock);
+ pl->pcl_state = PCL_INVALID;
+ *plpn = pl->pcl_child_next;
+ pl->pcl_parent_pc = NULL;
+ pl->pcl_child_next = NULL;
+ pcachelink_locked_rele(pl);
+ }
+
+ ASSERT(pcp->pc_parents == NULL);
+ ASSERT(pcp->pc_children == NULL);
+}
diff --git a/usr/src/uts/common/io/tty_pty.c b/usr/src/uts/common/io/tty_pty.c
index 6c829dcd21..a8eea823be 100644
--- a/usr/src/uts/common/io/tty_pty.c
+++ b/usr/src/uts/common/io/tty_pty.c
@@ -1,6 +1,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2015, Joyent, Inc.
*/
/*
@@ -988,7 +989,10 @@ ptcpoll(dev_t dev,
#ifdef lint
anyyet = anyyet;
#endif
- polllock(php, &pty->ptc_lock);
+ if (polllock(php, &pty->ptc_lock) != 0) {
+ *reventsp = POLLNVAL;
+ return (0);
+ }
ASSERT(MUTEX_HELD(&pty->ptc_lock));
diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c
index 18a5ded1c6..62f94729cf 100644
--- a/usr/src/uts/common/os/streamio.c
+++ b/usr/src/uts/common/os/streamio.c
@@ -8218,7 +8218,11 @@ strpoll(
tq = qp->q_next->q_nfsrv;
ASSERT(tq != NULL);
- polllock(&stp->sd_pollist, QLOCK(tq));
+ if (polllock(&stp->sd_pollist, QLOCK(tq)) != 0) {
+ releasestr(qp);
+ *reventsp = POLLNVAL;
+ return (0);
+ }
if (events & POLLWRNORM) {
queue_t *sqp;
@@ -8228,7 +8232,12 @@ strpoll(
else if ((sqp = stp->sd_struiowrq) != NULL) {
/* Check sync stream barrier write q */
mutex_exit(QLOCK(tq));
- polllock(&stp->sd_pollist, QLOCK(sqp));
+ if (polllock(&stp->sd_pollist,
+ QLOCK(sqp)) != 0) {
+ releasestr(qp);
+ *reventsp = POLLNVAL;
+ return (0);
+ }
if (sqp->q_flag & QFULL)
/* ensure pollwakeup() is done */
sqp->q_flag |= QWANTWSYNC;
@@ -8241,7 +8250,12 @@ strpoll(
goto chkrd;
}
mutex_exit(QLOCK(sqp));
- polllock(&stp->sd_pollist, QLOCK(tq));
+ if (polllock(&stp->sd_pollist,
+ QLOCK(tq)) != 0) {
+ releasestr(qp);
+ *reventsp = POLLNVAL;
+ return (0);
+ }
} else
retevents |= POLLOUT;
}
@@ -8273,7 +8287,10 @@ chkrd:
* Note: Need to do polllock() here since ps_lock may be
* held. See bug 4191544.
*/
- polllock(&stp->sd_pollist, &stp->sd_lock);
+ if (polllock(&stp->sd_pollist, &stp->sd_lock) != 0) {
+ *reventsp = POLLNVAL;
+ return (0);
+ }
headlocked = 1;
mp = qp->q_first;
while (mp) {
@@ -8326,7 +8343,10 @@ chkrd:
if (!anyyet) {
*phpp = &stp->sd_pollist;
if (headlocked == 0) {
- polllock(&stp->sd_pollist, &stp->sd_lock);
+ if (polllock(&stp->sd_pollist, &stp->sd_lock) != 0) {
+ *reventsp = POLLNVAL;
+ return (0);
+ }
headlocked = 1;
}
stp->sd_rput_opt |= SR_POLLIN;
diff --git a/usr/src/uts/common/sys/devpoll.h b/usr/src/uts/common/sys/devpoll.h
index 4e4c76d9b0..3b6bd159c3 100644
--- a/usr/src/uts/common/sys/devpoll.h
+++ b/usr/src/uts/common/sys/devpoll.h
@@ -25,7 +25,7 @@
*/
/*
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
#ifndef _SYS_DEVPOLL_H
@@ -88,9 +88,6 @@ typedef struct dp_entry {
mutex_enter(&(dpep)->dpe_lock); \
ASSERT((dpep)->dpe_refcnt > 0); \
(dpep)->dpe_refcnt--; \
- if ((dpep)->dpe_refcnt == 0) { \
- cv_broadcast(&(dpep)->dpe_cv); \
- } \
mutex_exit(&(dpep)->dpe_lock); \
}
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/sys/poll.h b/usr/src/uts/common/sys/poll.h
index efc8457a6a..75a588533f 100644
--- a/usr/src/uts/common/sys/poll.h
+++ b/usr/src/uts/common/sys/poll.h
@@ -31,7 +31,7 @@
*/
/*
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
#ifndef _SYS_POLL_H
@@ -130,8 +130,8 @@ extern void pollwakeup(pollhead_t *, short);
/*
* Internal routines.
*/
-extern void polllock(pollhead_t *, kmutex_t *);
-extern int pollunlock(void);
+extern int polllock(pollhead_t *, kmutex_t *);
+extern int pollunlock(int *);
extern void pollrelock(int);
extern void pollcleanup(void);
extern void pollblockexit(struct fpollinfo *);
diff --git a/usr/src/uts/common/sys/poll_impl.h b/usr/src/uts/common/sys/poll_impl.h
index 2e866ec4d4..67b47f9a1e 100644
--- a/usr/src/uts/common/sys/poll_impl.h
+++ b/usr/src/uts/common/sys/poll_impl.h
@@ -25,7 +25,7 @@
*/
/*
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
#ifndef _SYS_POLL_IMPL_H
@@ -36,7 +36,7 @@
*
* Each kernel thread (1), if engaged in poll system call, has a reference to
* a pollstate_t (2), which contains relevant flags and locks. The pollstate_t
- * contains a pointer to a pcache_t (3), which caches the state of previous
+ * contains a pointer to a pollcache_t (3), which caches the state of previous
* calls to poll. A bitmap (4) is stored inside the poll cache, where each
* bit represents a file descriptor. The bits are set if the corresponding
* device has a polled event pending. Only fds with their bit set will be
@@ -45,7 +45,7 @@
* structures keep track of the pollfd_t arrays (6) passed in from userland.
* Each polled file descriptor has a corresponding polldat_t which can be
* chained onto a device's pollhead, and these are kept in a hash table (7)
- * inside the pcache_t. The hash table allows efficient conversion of a
+ * inside the pollcache_t. The hash table allows efficient conversion of a
* given fd to its corresponding polldat_t.
*
* (1) (2)
@@ -76,7 +76,7 @@
* Both poll system call and /dev/poll use the pollcache_t structure
* definition and the routines managing the structure. But poll(2) and
* /dev/poll have their own copy of the structures. The /dev/poll driver
- * table (1a) contains an array of pointers, each pointing at a pcache_t
+ * table (1a) contains an array of pointers, each pointing at a pollcache_t
* struct (3). A device minor number is used as an device table index.
*
*/
@@ -86,12 +86,26 @@
#include <sys/thread.h>
#include <sys/file.h>
+#include <sys/port_kernel.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
+ * Typedefs
+ */
+struct pollcache;
+struct pollstate;
+struct pcachelink;
+struct polldat;
+
+typedef struct pollcache pollcache_t;
+typedef struct pollstate pollstate_t;
+typedef struct pcachelink pcachelink_t;
+typedef struct polldat polldat_t;
+
+/*
* description of pollcacheset structure
*/
typedef struct pollcacheset {
@@ -104,18 +118,40 @@ typedef struct pollcacheset {
#define POLLFDSETS 2
/*
+ * Maximum depth for recursive poll operations.
+ */
+#define POLLMAXDEPTH 5
+
+/*
* State information kept by each polling thread
*/
-typedef struct pollstate {
+struct pollstate {
pollfd_t *ps_pollfd; /* hold the current poll list */
size_t ps_nfds; /* size of ps_pollfd */
kmutex_t ps_lock; /* mutex for sleep/wakeup */
- struct pollcache *ps_pcache; /* cached poll fd set */
+ pollcache_t *ps_pcache; /* cached poll fd set */
pollcacheset_t *ps_pcacheset; /* cached poll lists */
int ps_nsets; /* no. of cached poll sets */
pollfd_t *ps_dpbuf; /* return pollfd buf used by devpoll */
size_t ps_dpbufsize; /* size of ps_dpbuf */
-} pollstate_t;
+ int ps_depth; /* epoll recursion depth */
+ pollcache_t *ps_pc_stack[POLLMAXDEPTH]; /* epoll recursion state */
+ pollcache_t *ps_contend_pc; /* pollcache waited on */
+ pollstate_t *ps_contend_nextp; /* next in contender list */
+ pollstate_t **ps_contend_pnextp; /* pointer-to-previous-next */
+ int ps_flags; /* state flags */
+};
+
+/* pollstate flags */
+#define POLLSTATE_STALEMATE 0x1
+#define POLLSTATE_ULFAIL 0x2
+
+/* pollstate_enter results */
+#define PSE_SUCCESS 0
+#define PSE_FAIL_DEPTH 1
+#define PSE_FAIL_LOOP 2
+#define PSE_FAIL_DEADLOCK 3
+#define PSE_FAIL_POLLSTATE 4
/*
* poll cache size defines
@@ -143,27 +179,54 @@ typedef struct xref {
#define POLLPOSINVAL (-1L) /* xf_position is invalid */
#define POLLPOSTRANS (-2L) /* xf_position is transient state */
+
+typedef enum pclstate {
+ PCL_INIT = 0, /* just allocated/zeroed, prior */
+ PCL_VALID, /* linked with both parent and child pollcaches */
+ PCL_STALE, /* still linked but marked stale, pending refresh */
+ PCL_INVALID, /* dissociated from one pollcache, awaiting cleanup */
+ PCL_FREE /* only meant to indicate use-after-free */
+} pclstate_t;
+
+/*
+ * The pcachelink struct creates an association between parent and child
+ * pollcaches in a recursive /dev/poll operation. Fields are protected by
+ * pcl_lock although manipulation of pcl_child_next or pcl_parent_next also
+ * requires holding pc_lock in the respective pcl_parent_pc or pcl_child_pc
+ * pollcache.
+ */
+struct pcachelink {
+ kmutex_t pcl_lock; /* protects contents */
+ pclstate_t pcl_state; /* status of link entry */
+ int pcl_refcnt; /* ref cnt of linked pcaches */
+ pollcache_t *pcl_child_pc; /* child pollcache */
+ pollcache_t *pcl_parent_pc; /* parent pollcache */
+ pcachelink_t *pcl_child_next; /* next in child list */
+ pcachelink_t *pcl_parent_next; /* next in parents list */
+};
+
+
/*
* polldat is an entry for a cached poll fd. A polldat struct can be in
* poll cache table as well as on pollhead ph_list, which is used by
* pollwakeup to wake up a sleeping poller. There should be one polldat
* per polled fd hanging off pollstate struct.
*/
-typedef struct polldat {
+struct polldat {
int pd_fd; /* cached poll fd */
int pd_events; /* union of all polled events */
file_t *pd_fp; /* used to detect fd reuse */
pollhead_t *pd_php; /* used to undo poll registration */
kthread_t *pd_thread; /* used for waking up a sleep thrd */
- struct pollcache *pd_pcache; /* a ptr to the pollcache of this fd */
- struct polldat *pd_next; /* next on pollhead's ph_list */
- struct polldat *pd_hashnext; /* next on pollhead's ph_list */
+ pollcache_t *pd_pcache; /* a ptr to the pollcache of this fd */
+ polldat_t *pd_next; /* next on pollhead's ph_list */
+ polldat_t *pd_hashnext; /* next on pollhead's ph_list */
int pd_count; /* total count from all ref'ed sets */
int pd_nsets; /* num of xref sets, used by poll(2) */
xref_t *pd_ref; /* ptr to xref info, 1 for each set */
- struct port_kevent *pd_portev; /* associated port event struct */
+ port_kevent_t *pd_portev; /* associated port event struct */
uint64_t pd_epolldata; /* epoll data, if any */
-} polldat_t;
+};
/*
* One cache for each thread that polls. Points to a bitmap (used by pollwakeup)
@@ -172,7 +235,7 @@ typedef struct polldat {
* of port_fdcache_t, both structs implement pc_lock with offset 0 (see also
* pollrelock()).
*/
-typedef struct pollcache {
+struct pollcache {
kmutex_t pc_lock; /* lock to protect pollcache */
ulong_t *pc_bitmap; /* point to poll fd bitmap */
polldat_t **pc_hash; /* points to a hash table of ptrs */
@@ -187,11 +250,12 @@ typedef struct pollcache {
kcondvar_t pc_cv; /* cv to wait on if needed */
pid_t pc_pid; /* for check acc rights, devpoll only */
int pc_mapstart; /* where search start, devpoll only */
-} pollcache_t;
+ pcachelink_t *pc_parents; /* linked list of epoll parents */
+ pcachelink_t *pc_children; /* linked list of epoll children */
+};
/* pc_flag */
#define PC_POLLWAKE 0x02 /* pollwakeup() occurred */
-#define PC_WRITEWANTED 0x04 /* writer wishes to modify the pollcache_t */
#if defined(_KERNEL)
/*
@@ -218,11 +282,15 @@ extern void pollhead_delete(pollhead_t *, polldat_t *);
/*
* poll state interfaces:
*
- * pollstate_create creates per-thread pollstate
- * pollstate_destroy cleans up per-thread pollstate
+ * pollstate_create initializes per-thread pollstate
+ * pollstate_destroy cleans up per-thread pollstate
+ * pollstate_enter safely locks pollcache for pollstate
+ * pollstate_exit unlocks pollcache from pollstate
*/
extern pollstate_t *pollstate_create(void);
extern void pollstate_destroy(pollstate_t *);
+extern int pollstate_enter(pollcache_t *);
+extern void pollstate_exit(pollcache_t *);
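
[Illustration, not part of the patch] A rough consumer-side sketch of the two
new interfaces above, condensing the pattern used by dpwrite() and dpioctl()
in devpoll.c; the wrapper function and its error mapping are invented for
illustration only:

/*
 * Sketch: recursion-aware locking of a pollcache. pollstate_enter()
 * acquires pc_lock on success; pollstate_exit() releases it.
 */
static int
pcp_with_lock_sketch(pollcache_t *pcp)
{
	int res;

	(void) pollstate_create();	/* ensure curthread->t_pollstate exists */

	res = pollstate_enter(pcp);
	if (res != PSE_SUCCESS)
		return (res == PSE_FAIL_DEPTH ? EINVAL : ELOOP);

	/* ... inspect or update the pollcache while holding pc_lock ... */

	pollstate_exit(pcp);
	return (0);
}
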
/*
* public pcache interfaces:
@@ -254,6 +322,7 @@ extern void pcache_destroy(pollcache_t *);
* pcache_grow_map grows the pollcache bitmap
* pcache_update_xref update cross ref (from polldat back to cacheset) info
* pcache_clean_entry cleanup an entry in pcache and more...
+ * pcache_wake_parents wake linked parent pollcaches
*/
extern polldat_t *pcache_lookup_fd(pollcache_t *, int);
extern polldat_t *pcache_alloc_fd(int);
@@ -263,6 +332,7 @@ extern void pcache_grow_hashtbl(pollcache_t *, nfds_t);
extern void pcache_grow_map(pollcache_t *, int);
extern void pcache_update_xref(pollcache_t *, int, ssize_t, int);
extern void pcache_clean_entry(pollstate_t *, int);
+extern void pcache_wake_parents(pollcache_t *);
/*
* pcacheset interfaces:
diff --git a/usr/src/uts/common/syscall/poll.c b/usr/src/uts/common/syscall/poll.c
index c33156a4fc..cc125f127a 100644
--- a/usr/src/uts/common/syscall/poll.c
+++ b/usr/src/uts/common/syscall/poll.c
@@ -29,7 +29,7 @@
/*
* Copyright (c) 2012 by Delphix. All rights reserved.
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
/*
@@ -77,11 +77,13 @@ static struct {
kstat_named_t pollcachehit; /* list matched 100% w/ cached one */
kstat_named_t pollcachephit; /* list matched < 100% w/ cached one */
kstat_named_t pollcachemiss; /* every list entry is dif from cache */
+ kstat_named_t pollunlockfail; /* failed to perform pollunlock */
} pollstats = {
{ "polllistmiss", KSTAT_DATA_UINT64 },
{ "pollcachehit", KSTAT_DATA_UINT64 },
{ "pollcachephit", KSTAT_DATA_UINT64 },
- { "pollcachemiss", KSTAT_DATA_UINT64 }
+ { "pollcachemiss", KSTAT_DATA_UINT64 },
+ { "pollunlockfail", KSTAT_DATA_UINT64 }
};
kstat_named_t *pollstats_ptr = (kstat_named_t *)&pollstats;
@@ -96,6 +98,10 @@ struct pplock {
static struct pplock plocks[NPHLOCKS]; /* Hash array of pollhead locks */
+/* Contention lock & list for preventing deadlocks in recursive /dev/poll. */
+static kmutex_t pollstate_contenders_lock;
+static pollstate_t *pollstate_contenders = NULL;
+
#ifdef DEBUG
static int pollchecksanity(pollstate_t *, nfds_t);
static int pollcheckxref(pollstate_t *, int);
@@ -223,19 +229,35 @@ static int plist_chkdupfd(file_t *, polldat_t *, pollstate_t *, pollfd_t *, int,
* (which hold poll locks on entry to xx_poll(), then acquire foo)
* and pollwakeup() threads (which hold foo, then acquire poll locks).
*
- * pollunlock(void) releases whatever poll locks the current thread holds,
- * returning a cookie for use by pollrelock();
+ * pollunlock(*cookie) releases whatever poll locks the current thread holds,
+ * setting a cookie for use by pollrelock();
*
* pollrelock(cookie) reacquires previously dropped poll locks;
*
* polllock(php, mutex) does the common case: pollunlock(),
* acquire the problematic mutex, pollrelock().
+ *
+ * If polllock() or pollunlock() return non-zero, it indicates that a recursive
+ * /dev/poll is in progress and pollcache locks cannot be dropped. Callers
+ * must handle this by indicating a POLLNVAL in the revents of the VOP_POLL.
*/
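
[Illustration, not part of the patch] Under this new contract, a driver poll
entry point is expected to handle a polllock() failure roughly as follows; the
sketch restates the fifofs, tty_pty, and streamio changes above, with a
simplified signature and placeholder names:

/*
 * Sketch: how a poll routine consumes the new non-void polllock() return
 * value. "xx_pollhead" and "xx_lock" are placeholders for driver state.
 */
static int
xx_poll_sketch(short events, short *reventsp, pollhead_t *xx_pollhead,
    kmutex_t *xx_lock)
{
	if (polllock(xx_pollhead, xx_lock) != 0) {
		/*
		 * A recursive /dev/poll operation is in progress and the
		 * pollcache lock could not be dropped; report POLLNVAL
		 * rather than risk deadlock.
		 */
		*reventsp = POLLNVAL;
		return (0);
	}

	/* ... evaluate device state against 'events' under xx_lock ... */
	*reventsp = 0;

	mutex_exit(xx_lock);
	return (0);
}
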
int
-pollunlock(void)
+pollunlock(int *lockstate)
{
+ pollstate_t *ps = curthread->t_pollstate;
pollcache_t *pcp;
- int lockstate = 0;
+
+ ASSERT(lockstate != NULL);
+
+ /*
+ * There is no way to safely perform a pollunlock() while in the depths
+ * of a recursive /dev/poll operation.
+ */
+ if (ps != NULL && ps->ps_depth > 1) {
+ ps->ps_flags |= POLLSTATE_ULFAIL;
+ pollstats.pollunlockfail.value.ui64++;
+ return (-1);
+ }
/*
* t_pollcache is set by /dev/poll and event ports (port_fd.c).
@@ -243,45 +265,56 @@ pollunlock(void)
* the t_pollcache should be NULL.
*/
if (curthread->t_pollcache == NULL)
- pcp = curthread->t_pollstate->ps_pcache;
+ pcp = ps->ps_pcache;
else
pcp = curthread->t_pollcache;
- if (mutex_owned(&pcp->pc_lock)) {
- lockstate = 1;
+ if (!mutex_owned(&pcp->pc_lock)) {
+ *lockstate = 0;
+ } else {
+ *lockstate = 1;
mutex_exit(&pcp->pc_lock);
}
- return (lockstate);
+ return (0);
}
void
pollrelock(int lockstate)
{
+ pollstate_t *ps = curthread->t_pollstate;
pollcache_t *pcp;
+ /* Skip this whole ordeal if the pollcache was not locked to begin with */
+ if (lockstate == 0)
+ return;
+
/*
* t_pollcache is set by /dev/poll and event ports (port_fd.c).
* If the pollrelock/pollunlock is called as a result of poll(2),
* the t_pollcache should be NULL.
*/
if (curthread->t_pollcache == NULL)
- pcp = curthread->t_pollstate->ps_pcache;
+ pcp = ps->ps_pcache;
else
pcp = curthread->t_pollcache;
- if (lockstate > 0)
- mutex_enter(&pcp->pc_lock);
+ mutex_enter(&pcp->pc_lock);
}
/* ARGSUSED */
-void
+int
polllock(pollhead_t *php, kmutex_t *lp)
{
- if (!mutex_tryenter(lp)) {
- int lockstate = pollunlock();
+ if (mutex_tryenter(lp) == 0) {
+ int state;
+
+ if (pollunlock(&state) != 0) {
+ return (-1);
+ }
mutex_enter(lp);
- pollrelock(lockstate);
+ pollrelock(state);
}
+ return (0);
}
static int
@@ -370,10 +403,7 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
* Need to allocate memory for pollstate before anything because
* the mutex and cv are created in this space
*/
- if ((ps = t->t_pollstate) == NULL) {
- t->t_pollstate = pollstate_create();
- ps = t->t_pollstate;
- }
+ ps = pollstate_create();
if (ps->ps_pcache == NULL)
ps->ps_pcache = pcache_alloc();
@@ -899,6 +929,7 @@ pollnotify(pollcache_t *pcp, int fd)
BT_SET(pcp->pc_bitmap, fd);
pcp->pc_flag |= PC_POLLWAKE;
cv_broadcast(&pcp->pc_cv);
+ pcache_wake_parents(pcp);
}
/*
@@ -2221,20 +2252,47 @@ pcache_clean_entry(pollstate_t *ps, int fd)
}
}
+void
+pcache_wake_parents(pollcache_t *pcp)
+{
+ pcachelink_t *pl, *pln;
+
+ ASSERT(MUTEX_HELD(&pcp->pc_lock));
+
+ for (pl = pcp->pc_parents; pl != NULL; pl = pln) {
+ mutex_enter(&pl->pcl_lock);
+ if (pl->pcl_state == PCL_VALID) {
+ ASSERT(pl->pcl_parent_pc != NULL);
+ cv_broadcast(&pl->pcl_parent_pc->pc_cv);
+ }
+ pln = pl->pcl_parent_next;
+ mutex_exit(&pl->pcl_lock);
+ }
+}
+
/*
- * This is the first time this thread has ever polled,
- * so we have to create its pollstate structure.
- * This will persist for the life of the thread,
- * until it calls pollcleanup().
+ * Initialize thread pollstate structure.
+ * It will persist for the life of the thread, until it calls pollcleanup().
*/
pollstate_t *
-pollstate_create(void)
+pollstate_create()
{
- pollstate_t *ps;
+ pollstate_t *ps = curthread->t_pollstate;
- ps = kmem_zalloc(sizeof (pollstate_t), KM_SLEEP);
- ps->ps_nsets = POLLFDSETS;
- ps->ps_pcacheset = pcacheset_create(ps->ps_nsets);
+ if (ps == NULL) {
+ /*
+ * This is the first time this thread has ever polled, so we
+ * have to create its pollstate structure.
+ */
+ ps = kmem_zalloc(sizeof (pollstate_t), KM_SLEEP);
+ ps->ps_nsets = POLLFDSETS;
+ ps->ps_pcacheset = pcacheset_create(ps->ps_nsets);
+ curthread->t_pollstate = ps;
+ } else {
+ ASSERT(ps->ps_depth == 0);
+ ASSERT(ps->ps_flags == 0);
+ ASSERT(ps->ps_pc_stack[0] == 0);
+ }
return (ps);
}
@@ -2259,6 +2317,186 @@ pollstate_destroy(pollstate_t *ps)
kmem_free(ps, sizeof (pollstate_t));
}
+static int
+pollstate_contend(pollstate_t *ps, pollcache_t *pcp)
+{
+ pollstate_t *rem, *next;
+ pollcache_t *desired_pc;
+ int result = 0, depth_total;
+
+ mutex_enter(&pollstate_contenders_lock);
+ /*
+ * There is a small chance that the pollcache of interest became
+ * available while we were waiting on the contenders lock.
+ */
+ if (mutex_tryenter(&pcp->pc_lock) != 0) {
+ goto out;
+ }
+
+ /*
+ * Walk the list of contended pollstates, searching for evidence of a
+ * deadlock condition.
+ */
+ depth_total = ps->ps_depth;
+ desired_pc = pcp;
+ for (rem = pollstate_contenders; rem != NULL; rem = next) {
+ int i, j;
+ next = rem->ps_contend_nextp;
+
+ /* Is this pollstate holding the pollcache of interest? */
+ for (i = 0; i < rem->ps_depth; i++) {
+ if (rem->ps_pc_stack[i] != desired_pc) {
+ continue;
+ }
+
+ /*
+ * The remote pollstate holds the pollcache lock we
+ * desire. If it is waiting on a pollcache we hold,
+ * then we can report the obvious deadlock.
+ */
+ ASSERT(rem->ps_contend_pc != NULL);
+ for (j = 0; j < ps->ps_depth; j++) {
+ if (rem->ps_contend_pc == ps->ps_pc_stack[j]) {
+ rem->ps_flags |= POLLSTATE_STALEMATE;
+ result = -1;
+ goto out;
+ }
+ }
+
+ /*
+ * The remote pollstate is not blocking on a pollcache
+ * which would deadlock against us. That pollcache
+ * may, however, be held by a pollstate which would
+ * result in a deadlock.
+ *
+ * To detect such a condition, we continue walking
+ * through the list using the pollcache blocking the
+ * remote thread as our new search target.
+ *
+ * Return to the front of pollstate_contenders since it
+ * is not ordered to guarantee complete dependency
+ * traversal. The below depth tracking places an upper
+ * bound on iterations.
+ */
+ desired_pc = rem->ps_contend_pc;
+ next = pollstate_contenders;
+
+ /*
+ * The recursion depth of the remote pollstate is used
+ * to calculate a final depth for the local /dev/poll
+ * recursion, since those locks will be acquired
+ * eventually. If that value exceeds the defined
+ * limit, we can report the failure now instead of
+ * recursing to that failure depth.
+ */
+ depth_total += (rem->ps_depth - i);
+ if (depth_total >= POLLMAXDEPTH) {
+ result = -1;
+ goto out;
+ }
+ }
+ }
+
+ /*
+ * No deadlock partner was found. The only course of action is to
+ * record ourself as a contended pollstate and wait for the pollcache
+ * mutex to become available.
+ */
+ ps->ps_contend_pc = pcp;
+ ps->ps_contend_nextp = pollstate_contenders;
+ ps->ps_contend_pnextp = &pollstate_contenders;
+ if (pollstate_contenders != NULL) {
+ pollstate_contenders->ps_contend_pnextp =
+ &ps->ps_contend_nextp;
+ }
+ pollstate_contenders = ps;
+
+ mutex_exit(&pollstate_contenders_lock);
+ mutex_enter(&pcp->pc_lock);
+ mutex_enter(&pollstate_contenders_lock);
+
+ /*
+ * Our acquisition of the pollcache mutex may be due to another thread
+ * giving up in the face of deadlock with us. If that is the case,
+ * we too should report the failure.
+ */
+ if ((ps->ps_flags & POLLSTATE_STALEMATE) != 0) {
+ result = -1;
+ ps->ps_flags &= ~POLLSTATE_STALEMATE;
+ mutex_exit(&pcp->pc_lock);
+ }
+
+ /* Remove ourself from the contenders list. */
+ if (ps->ps_contend_nextp != NULL) {
+ ps->ps_contend_nextp->ps_contend_pnextp =
+ ps->ps_contend_pnextp;
+ }
+ *ps->ps_contend_pnextp = ps->ps_contend_nextp;
+ ps->ps_contend_pc = NULL;
+ ps->ps_contend_nextp = NULL;
+ ps->ps_contend_pnextp = NULL;
+
+out:
+ mutex_exit(&pollstate_contenders_lock);
+ return (result);
+}
+
+int
+pollstate_enter(pollcache_t *pcp)
+{
+ pollstate_t *ps = curthread->t_pollstate;
+ int i;
+
+ if (ps == NULL) {
+ /*
+ * The thread pollstate may not be initialized if VOP_POLL is
+ * called on a recursion-enabled /dev/poll handle from outside
+ * the poll() or /dev/poll codepaths.
+ */
+ return (PSE_FAIL_POLLSTATE);
+ }
+ if (ps->ps_depth >= POLLMAXDEPTH) {
+ return (PSE_FAIL_DEPTH);
+ }
+ /*
+ * Check the desired pollcache against pollcaches we already have
+ * locked. Such a loop is the most simple deadlock scenario.
+ */
+ for (i = 0; i < ps->ps_depth; i++) {
+ if (ps->ps_pc_stack[i] == pcp) {
+ return (PSE_FAIL_LOOP);
+ }
+ }
+ ASSERT(ps->ps_pc_stack[i] == NULL);
+
+ if (ps->ps_depth == 0) {
+ /* Locking the initial pollcache requires no caution */
+ mutex_enter(&pcp->pc_lock);
+ } else if (mutex_tryenter(&pcp->pc_lock) == 0) {
+ if (pollstate_contend(ps, pcp) != 0) {
+ /* This pollcache cannot safely be locked. */
+ return (PSE_FAIL_DEADLOCK);
+ }
+ }
+
+ ps->ps_pc_stack[ps->ps_depth++] = pcp;
+ return (PSE_SUCCESS);
+}
+
+void
+pollstate_exit(pollcache_t *pcp)
+{
+ pollstate_t *ps = curthread->t_pollstate;
+
+ VERIFY(ps != NULL);
+ VERIFY(ps->ps_pc_stack[ps->ps_depth - 1] == pcp);
+
+ mutex_exit(&pcp->pc_lock);
+ ps->ps_pc_stack[--ps->ps_depth] = NULL;
+ VERIFY(ps->ps_depth >= 0);
+}
+
+
/*
* We are holding the appropriate uf_lock entering this routine.
* Bump up the ps_busy count to prevent the thread from exiting.