Diffstat (limited to 'usr/src')
-rw-r--r--   usr/src/uts/common/fs/fifofs/fifovnops.c   |   7
-rw-r--r--   usr/src/uts/common/fs/proc/prvnops.c       |   6
-rw-r--r--   usr/src/uts/common/io/devpoll.c            | 517
-rw-r--r--   usr/src/uts/common/io/tty_pty.c            |   6
-rw-r--r--   usr/src/uts/common/os/streamio.c           |  30
-rw-r--r--   usr/src/uts/common/sys/devpoll.h           |   5
-rw-r--r--   usr/src/uts/common/sys/poll.h              |   6
-rw-r--r--   usr/src/uts/common/sys/poll_impl.h         | 106
-rw-r--r--   usr/src/uts/common/syscall/poll.c          | 298
9 files changed, 811 insertions(+), 170 deletions(-)
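
The bulk of this change teaches /dev/poll handles in epoll compatibility mode to be polled recursively: pending events in a child handle surface as POLLIN on the parent. Before the per-file hunks, here is a minimal userland sketch (not part of the patch) of the behavior being enabled, written against the illumos epoll(5) compatibility interfaces; error handling is abbreviated.

/*
 * Illustrative only: nest one epoll handle inside another and observe
 * readiness propagate from a pipe, through the inner handle, to the
 * outer one.  Prior to this patch, dppoll() reported no events for an
 * epoll-mode handle, so the outer epoll_wait() would never fire.
 */
#include <sys/epoll.h>
#include <unistd.h>
#include <stdio.h>

int
main(void)
{
	int outer, inner, pfd[2];
	struct epoll_event ev = { 0 };

	if ((outer = epoll_create1(0)) < 0 ||
	    (inner = epoll_create1(0)) < 0 || pipe(pfd) != 0) {
		perror("setup");
		return (1);
	}

	/* Watch the read end of the pipe via the inner epoll handle. */
	ev.events = EPOLLIN;
	ev.data.fd = pfd[0];
	(void) epoll_ctl(inner, EPOLL_CTL_ADD, pfd[0], &ev);

	/* Nest: watch the inner epoll handle via the outer one. */
	ev.events = EPOLLIN;
	ev.data.fd = inner;
	(void) epoll_ctl(outer, EPOLL_CTL_ADD, inner, &ev);

	/* Readiness propagates from the pipe up through both handles. */
	(void) write(pfd[1], "x", 1);
	if (epoll_wait(outer, &ev, 1, -1) == 1)
		(void) printf("fd %d readable via outer\n", ev.data.fd);
	return (0);
}

The kernel hunks below supply the machinery for this: dppoll() reports child readiness without consuming events or clearing bitmap state, pcachelink_t structures propagate wakeups from child to parent pollcaches, and pollstate_enter() enforces the POLLMAXDEPTH (5) recursion limit while detecting loops and deadlocks.
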
diff --git a/usr/src/uts/common/fs/fifofs/fifovnops.c b/usr/src/uts/common/fs/fifofs/fifovnops.c index fee2924093..61edfab76c 100644 --- a/usr/src/uts/common/fs/fifofs/fifovnops.c +++ b/usr/src/uts/common/fs/fifofs/fifovnops.c @@ -28,7 +28,7 @@ */ /* - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* @@ -1775,7 +1775,10 @@ fifo_poll(vnode_t *vp, short events, int anyyet, short *reventsp, fn_dest = fnp->fn_dest; fn_lock = fnp->fn_lock; - polllock(&stp->sd_pollist, &fn_lock->flk_lock); + if (polllock(&stp->sd_pollist, &fn_lock->flk_lock) != 0) { + *reventsp = POLLNVAL; + return (0); + } /* * see if FIFO/pipe open diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c index e392ce4b14..b60f938c9b 100644 --- a/usr/src/uts/common/fs/proc/prvnops.c +++ b/usr/src/uts/common/fs/proc/prvnops.c @@ -5939,7 +5939,11 @@ prpoll(vnode_t *vp, short events, int anyyet, short *reventsp, return (0); } - lockstate = pollunlock(); /* avoid deadlock with prnotify() */ + /* avoid deadlock with prnotify() */ + if (pollunlock(&lockstate) != 0) { + *reventsp = POLLNVAL; + return (0); + } if ((error = prlock(pnp, ZNO)) != 0) { pollrelock(lockstate); diff --git a/usr/src/uts/common/io/devpoll.c b/usr/src/uts/common/io/devpoll.c index 7b3454f89c..a63e1f1a08 100644 --- a/usr/src/uts/common/io/devpoll.c +++ b/usr/src/uts/common/io/devpoll.c @@ -123,6 +123,12 @@ static struct modlinkage modlinkage = { NULL }; +static void pcachelink_assoc(pollcache_t *, pollcache_t *); +static void pcachelink_mark_stale(pollcache_t *); +static void pcachelink_purge_stale(pollcache_t *); +static void pcachelink_purge_all(pollcache_t *); + + /* * Locking Design * @@ -157,7 +163,6 @@ _init() mutex_init(&devpoll_lock, NULL, MUTEX_DEFAULT, NULL); devpoll_init = 1; if ((error = mod_install(&modlinkage)) != 0) { - mutex_destroy(&devpoll_lock); kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize); devpoll_init = 0; } @@ -255,6 +260,7 @@ dp_pcache_poll(dp_entry_t *dpep, void *dpbuf, epoll_event_t *epoll; int error = 0; short mask = POLLRDHUP | POLLWRBAND; + boolean_t is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0; ASSERT(MUTEX_HELD(&pcp->pc_lock)); if (pcp->pc_bitmap == NULL) { @@ -265,7 +271,7 @@ dp_pcache_poll(dp_entry_t *dpep, void *dpbuf, return (error); } - if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + if (is_epoll) { pfdp = NULL; epoll = (epoll_event_t *)dpbuf; } else { @@ -331,7 +337,7 @@ repoll: * polling a closed fd. Hope this will remind * user to do a POLLREMOVE. */ - if (pfdp != NULL) { + if (!is_epoll && pfdp != NULL) { pfdp[fdcnt].fd = fd; pfdp[fdcnt].revents = POLLNVAL; fdcnt++; @@ -343,18 +349,18 @@ repoll: * perform the implicit removal to remain * closer to the epoll semantics. */ - ASSERT(epoll != NULL); + if (is_epoll) { + pdp->pd_fp = NULL; + pdp->pd_events = 0; - pdp->pd_fp = NULL; - pdp->pd_events = 0; + if (php != NULL) { + pollhead_delete(php, pdp); + pdp->pd_php = NULL; + } - if (php != NULL) { - pollhead_delete(php, pdp); - pdp->pd_php = NULL; + BT_CLEAR(pcp->pc_bitmap, fd); + continue; } - - BT_CLEAR(pcp->pc_bitmap, fd); - continue; } if (fp != pdp->pd_fp) { @@ -394,6 +400,7 @@ repoll: if (error != 0) { break; } + /* * layered devices (e.g. 
console driver) * may change the vnode and thus the pollhead @@ -416,7 +423,7 @@ repoll: pfdp[fdcnt].fd = fd; pfdp[fdcnt].events = pdp->pd_events; pfdp[fdcnt].revents = revent; - } else { + } else if (epoll != NULL) { epoll_event_t *ep = &epoll[fdcnt]; ASSERT(epoll != NULL); @@ -449,6 +456,35 @@ repoll: (pdp->pd_events & EPOLLWRNORM)) { ep->events |= EPOLLWRNORM; } + } else { + pollstate_t *ps = + curthread->t_pollstate; + /* + * The devpoll handle itself is being + * polled. Notify the caller of any + * readable event(s), leaving as much + * state as possible untouched. + */ + VERIFY(fdcnt == 0); + VERIFY(ps != NULL); + + /* + * If a call to pollunlock() fails + * during VOP_POLL, skip over the fd + * and continue polling. + * + * Otherwise, report that there is an + * event pending. + */ + if ((ps->ps_flags & POLLSTATE_ULFAIL) + != 0) { + ps->ps_flags &= + ~POLLSTATE_ULFAIL; + continue; + } else { + fdcnt++; + break; + } } /* @@ -608,6 +644,7 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) polldat_t *pdp; int fd; file_t *fp; + boolean_t is_epoll, fds_added = B_FALSE; minor = getminor(dev); @@ -616,22 +653,21 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) dpep = devpolltbl[minor]; ASSERT(dpep != NULL); mutex_exit(&devpoll_lock); + + mutex_enter(&dpep->dpe_lock); pcp = dpep->dpe_pcache; + is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0; + size = (is_epoll) ? sizeof (dvpoll_epollfd_t) : sizeof (pollfd_t); + mutex_exit(&dpep->dpe_lock); - if (!(dpep->dpe_flag & DP_ISEPOLLCOMPAT) && - curproc->p_pid != pcp->pc_pid) { - if (pcp->pc_pid != -1) + if (!is_epoll && curproc->p_pid != pcp->pc_pid) { + if (pcp->pc_pid != -1) { return (EACCES); + } pcp->pc_pid = curproc->p_pid; } - if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { - size = sizeof (dvpoll_epollfd_t); - } else { - size = sizeof (pollfd_t); - } - uiosize = uiop->uio_resid; pollfdnum = uiosize / size; mutex_enter(&curproc->p_lock); @@ -640,7 +676,7 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE], curproc->p_rctls, curproc, RCA_SAFE); mutex_exit(&curproc->p_lock); - return (set_errno(EINVAL)); + return (EINVAL); } mutex_exit(&curproc->p_lock); /* @@ -665,44 +701,44 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) /* * We are about to enter the core portion of dpwrite(). Make sure this * write has exclusive access in this portion of the code, i.e., no - * other writers in this code and no other readers in dpioctl. + * other writers in this code. + * + * Waiting for all readers to drop their references to the dpe is + * unecessary since the pollcache itself is protected by pc_lock. */ mutex_enter(&dpep->dpe_lock); dpep->dpe_writerwait++; - while (dpep->dpe_refcnt != 0) { - /* - * We need to do a bit of a dance here: we need to drop - * our dpe_lock and grab the pc_lock to broadcast the pc_cv to - * kick any DP_POLL/DP_PPOLL sleepers. 
- */ - mutex_exit(&dpep->dpe_lock); - mutex_enter(&pcp->pc_lock); - pcp->pc_flag |= PC_WRITEWANTED; - cv_broadcast(&pcp->pc_cv); - mutex_exit(&pcp->pc_lock); - mutex_enter(&dpep->dpe_lock); - - if (dpep->dpe_refcnt == 0) - break; + while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0) { + ASSERT(dpep->dpe_refcnt != 0); if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) { dpep->dpe_writerwait--; mutex_exit(&dpep->dpe_lock); - mutex_enter(&pcp->pc_lock); - pcp->pc_flag &= ~PC_WRITEWANTED; - mutex_exit(&pcp->pc_lock); kmem_free(pollfdp, uiosize); - return (set_errno(EINTR)); + return (EINTR); } } dpep->dpe_writerwait--; dpep->dpe_flag |= DP_WRITER_PRESENT; dpep->dpe_refcnt++; + if (!is_epoll && (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0) { + /* + * The epoll compat mode was enabled while we were waiting to + * establish write access. It is not safe to continue since + * state was prepared for non-epoll operation. + */ + error = EBUSY; + goto bypass; + } mutex_exit(&dpep->dpe_lock); - mutex_enter(&pcp->pc_lock); - pcp->pc_flag &= ~PC_WRITEWANTED; + /* + * Since the dpwrite() may recursively walk an added /dev/poll handle, + * pollstate_enter() deadlock and loop detection must be used. + */ + (void) pollstate_create(); + VERIFY(pollstate_enter(pcp) == PSE_SUCCESS); if (pcp->pc_bitmap == NULL) { pcache_create(pcp, pollfdnum); @@ -715,7 +751,7 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) * epoll semantics demand that we return EBADF if our * specified fd is invalid. */ - if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + if (is_epoll) { error = EBADF; break; } @@ -736,7 +772,7 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) * we return EBADF if our specified fd is * invalid. */ - if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + if (is_epoll) { if ((fp = getf(fd)) == NULL) { error = EBADF; break; @@ -771,7 +807,7 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) * then, the file descriptor must be closed and * reused in a relatively tight time span.) */ - if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + if (is_epoll) { if (pdp->pd_fp != NULL && (fp = getf(fd)) != NULL && fp == pdp->pd_fp && @@ -794,7 +830,7 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) } } - if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + if (is_epoll) { epfdp = (dvpoll_epollfd_t *)pfdp; pdp->pd_epolldata = epfdp->dpep_data; } @@ -886,12 +922,12 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) pdp->pd_php = php; } } - } + fds_added = B_TRUE; releasef(fd); } else { if (pdp == NULL || pdp->pd_fp == NULL) { - if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + if (is_epoll) { /* * As with the add case (above), epoll * semantics demand that we error out @@ -914,10 +950,19 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) BT_CLEAR(pcp->pc_bitmap, fd); } } - mutex_exit(&pcp->pc_lock); + /* + * Any fds added to an recursive-capable pollcache could themselves be + * /dev/poll handles. To ensure that proper event propagation occurs, + * parent pollcaches are woken so that they can create any needed + * pollcache links. 
+ */ + if (fds_added) { + pcache_wake_parents(pcp); + } + pollstate_exit(pcp); mutex_enter(&dpep->dpe_lock); +bypass: dpep->dpe_flag &= ~DP_WRITER_PRESENT; - ASSERT(dpep->dpe_refcnt == 1); dpep->dpe_refcnt--; cv_broadcast(&dpep->dpe_cv); mutex_exit(&dpep->dpe_lock); @@ -945,6 +990,7 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) pollcache_t *pcp; hrtime_t now; int error = 0; + boolean_t is_epoll; STRUCT_DECL(dvpoll, dvpoll); if (cmd == DP_POLL || cmd == DP_PPOLL) { @@ -961,6 +1007,7 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) pcp = dpep->dpe_pcache; mutex_enter(&dpep->dpe_lock); + is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0; if (cmd == DP_EPOLLCOMPAT) { if (dpep->dpe_refcnt != 0) { @@ -982,8 +1029,7 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) return (0); } - if (!(dpep->dpe_flag & DP_ISEPOLLCOMPAT) && - curproc->p_pid != pcp->pc_pid) { + if (!is_epoll && curproc->p_pid != pcp->pc_pid) { if (pcp->pc_pid != -1) { mutex_exit(&dpep->dpe_lock); return (EACCES); @@ -992,7 +1038,8 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) pcp->pc_pid = curproc->p_pid; } - while ((dpep->dpe_flag & DP_WRITER_PRESENT) || + /* Wait until all writers have cleared the handle before continuing */ + while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0 || (dpep->dpe_writerwait != 0)) { if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) { mutex_exit(&dpep->dpe_lock); @@ -1128,7 +1175,7 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) return (error == 0 ? EINTR : 0); } - if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + if (is_epoll) { size = nfds * (fdsize = sizeof (epoll_event_t)); } else { size = nfds * (fdsize = sizeof (pollfd_t)); @@ -1139,10 +1186,7 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) * requires another per thread structure hook. This can be * implemented later if data suggests that it's necessary. */ - if ((ps = curthread->t_pollstate) == NULL) { - curthread->t_pollstate = pollstate_create(); - ps = curthread->t_pollstate; - } + ps = pollstate_create(); if (ps->ps_dpbufsize < size) { /* @@ -1169,15 +1213,25 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) } } - mutex_enter(&pcp->pc_lock); + VERIFY(pollstate_enter(pcp) == PSE_SUCCESS); for (;;) { pcp->pc_flag &= ~PC_POLLWAKE; + /* + * Mark all child pcachelinks as stale. + * Those which are still part of the tree will be + * marked as valid during the poll. + */ + pcachelink_mark_stale(pcp); + error = dp_pcache_poll(dpep, ps->ps_dpbuf, pcp, nfds, &fdcnt); if (fdcnt > 0 || error != 0) break; + /* Purge still-stale child pcachelinks */ + pcachelink_purge_stale(pcp); + /* * A pollwake has happened since we polled cache. */ @@ -1192,42 +1246,12 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) break; } - if (!(pcp->pc_flag & PC_WRITEWANTED)) { - error = cv_timedwait_sig_hrtime(&pcp->pc_cv, - &pcp->pc_lock, deadline); - } else { - error = 1; - } - - if (error > 0 && (pcp->pc_flag & PC_WRITEWANTED)) { - /* - * We've been kicked off of our cv because a - * writer wants in. We're going to drop our - * reference count and then wait until the - * writer is gone -- at which point we'll - * reacquire the pc_lock and call into - * dp_pcache_poll() to get the updated state. 
- */ - mutex_exit(&pcp->pc_lock); - - mutex_enter(&dpep->dpe_lock); - dpep->dpe_refcnt--; - cv_broadcast(&dpep->dpe_cv); - - while ((dpep->dpe_flag & DP_WRITER_PRESENT) || - (dpep->dpe_writerwait != 0)) { - error = cv_wait_sig_swap(&dpep->dpe_cv, - &dpep->dpe_lock); - } - - dpep->dpe_refcnt++; - mutex_exit(&dpep->dpe_lock); - mutex_enter(&pcp->pc_lock); - } + error = cv_timedwait_sig_hrtime(&pcp->pc_cv, + &pcp->pc_lock, deadline); /* - * If we were awakened by a signal or timeout - * then break the loop, else poll again. + * If we were awakened by a signal or timeout then + * break the loop, else poll again. */ if (error <= 0) { error = (error == 0) ? EINTR : 0; @@ -1236,7 +1260,7 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) error = 0; } } - mutex_exit(&pcp->pc_lock); + pollstate_exit(pcp); DP_SIGMASK_RESTORE(ksetp); @@ -1299,6 +1323,66 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) return (error); } +/* + * Overview of Recursive Polling + * + * It is possible for /dev/poll to poll for events on file descriptors which + * themselves are /dev/poll handles. Pending events in the child handle are + * represented as readable data via the POLLIN flag. To limit surface area, + * this recursion is presently allowed on only /dev/poll handles which have + * been placed in epoll mode via the DP_EPOLLCOMPAT ioctl. Recursion depth is + * limited to 5 in order to be consistent with Linux epoll. + * + * Extending dppoll() for VOP_POLL: + * + * The recursive /dev/poll implementation begins by extending dppoll() to + * report when resources contained in the pollcache have relevant event state. + * At the highest level, it means calling dp_pcache_poll() so it indicates if + * fd events are present without consuming them or altering the pollcache + * bitmap. This ensures that a subsequent DP_POLL operation on the bitmap will + * yield the initiating event. Additionally, the VOP_POLL should return in + * such a way that dp_pcache_poll() does not clear the parent bitmap entry + * which corresponds to the child /dev/poll fd. This means that child + * pollcaches will be checked during every poll which facilitates wake-up + * behavior detailed below. + * + * Pollcache Links and Wake Events: + * + * Recursive /dev/poll avoids complicated pollcache locking constraints during + * pollwakeup events by eschewing the traditional pollhead mechanism in favor + * of a different approach. For each pollcache at the root of a recursive + * /dev/poll "tree", pcachelink_t structures are established to all child + * /dev/poll pollcaches. During pollnotify() in a child pollcache, the + * linked list of pcachelink_t entries is walked, where those marked as valid + * incur a cv_broadcast to their parent pollcache. Most notably, these + * pcachelink_t cv wakeups are performed without acquiring pc_lock on the + * parent pollcache (which would require careful deadlock avoidance). This + * still allows the woken poll on the parent to discover the pertinent events + * due to the fact that bitmap entires for the child pollcache are always + * maintained by the dppoll() logic above. + * + * Depth Limiting and Loop Prevention: + * + * As each pollcache is encountered (either via DP_POLL or dppoll()), depth and + * loop constraints are enforced via pollstate_enter(). The pollcache_t + * pointer is compared against any existing entries in ps_pc_stack and is added + * to the end if no match (and therefore loop) is found. 
Once poll operations + * for a given pollcache_t are complete, pollstate_exit() clears the pointer + * from the list. The pollstate_enter() and pollstate_exit() functions are + * responsible for acquiring and releasing pc_lock, respectively. + * + * Deadlock Safety: + * + * Descending through a tree of recursive /dev/poll handles involves the tricky + * business of sequentially entering multiple pollcache locks. This tree + * topology cannot define a lock acquisition order in such a way that it is + * immune to deadlocks between threads. The pollstate_enter() and + * pollstate_exit() functions provide an interface for recursive /dev/poll + * operations to safely lock pollcaches while failing gracefully in the face of + * deadlocking topologies. (See pollstate_contend() for more detail about how + * deadlocks are detected and resolved.) + */ + /*ARGSUSED*/ static int dppoll(dev_t dev, short events, int anyyet, short *reventsp, @@ -1306,24 +1390,63 @@ dppoll(dev_t dev, short events, int anyyet, short *reventsp, { minor_t minor; dp_entry_t *dpep; + pollcache_t *pcp; + int res, rc = 0; minor = getminor(dev); - mutex_enter(&devpoll_lock); + ASSERT(minor < dptblsize); dpep = devpolltbl[minor]; ASSERT(dpep != NULL); mutex_exit(&devpoll_lock); - /* - * Polling on a /dev/poll fd is not fully supported yet. - */ - if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { - /* no error in epoll compat. mode */ - *reventsp = 0; - } else { + mutex_enter(&dpep->dpe_lock); + if ((dpep->dpe_flag & DP_ISEPOLLCOMPAT) == 0) { + /* Poll recursion is not yet supported for non-epoll handles */ *reventsp = POLLERR; + mutex_exit(&dpep->dpe_lock); + return (0); + } else { + dpep->dpe_refcnt++; + pcp = dpep->dpe_pcache; + mutex_exit(&dpep->dpe_lock); } - return (0); + + res = pollstate_enter(pcp); + if (res == PSE_SUCCESS) { + nfds_t nfds = 1; + int fdcnt = 0; + pollstate_t *ps = curthread->t_pollstate; + + rc = dp_pcache_poll(dpep, NULL, pcp, nfds, &fdcnt); + if (rc == 0) { + *reventsp = (fdcnt > 0) ? POLLIN : 0; + } + pcachelink_assoc(pcp, ps->ps_pc_stack[0]); + pollstate_exit(pcp); + } else { + switch (res) { + case PSE_FAIL_DEPTH: + rc = EINVAL; + break; + case PSE_FAIL_LOOP: + case PSE_FAIL_DEADLOCK: + rc = ELOOP; + break; + default: + /* + * If anything else has gone awry, such as being polled + * from an unexpected context, fall back to the + * recursion-intolerant response. + */ + *reventsp = POLLERR; + rc = 0; + break; + } + } + + DP_REFRELE(dpep); + return (rc); } /* @@ -1376,8 +1499,190 @@ dpclose(dev_t dev, int flag, int otyp, cred_t *credp) while (pcp->pc_busy > 0) cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit); mutex_exit(&pcp->pc_no_exit); + + /* Clean up any pollcache links created via recursive /dev/poll */ + if (pcp->pc_parents != NULL || pcp->pc_children != NULL) { + /* + * Because of the locking rules for pcachelink manipulation, + * acquring pc_lock is required for this step. 
+ */ + mutex_enter(&pcp->pc_lock); + pcachelink_purge_all(pcp); + mutex_exit(&pcp->pc_lock); + } + pcache_destroy(pcp); ASSERT(dpep->dpe_refcnt == 0); kmem_free(dpep, sizeof (dp_entry_t)); return (0); } + +static void +pcachelink_locked_rele(pcachelink_t *pl) +{ + ASSERT(MUTEX_HELD(&pl->pcl_lock)); + VERIFY(pl->pcl_refcnt >= 1); + + pl->pcl_refcnt--; + if (pl->pcl_refcnt == 0) { + VERIFY(pl->pcl_state == PCL_INVALID); + ASSERT(pl->pcl_parent_pc == NULL); + ASSERT(pl->pcl_child_pc == NULL); + ASSERT(pl->pcl_parent_next == NULL); + ASSERT(pl->pcl_child_next == NULL); + + pl->pcl_state = PCL_FREE; + mutex_destroy(&pl->pcl_lock); + kmem_free(pl, sizeof (pcachelink_t)); + } else { + mutex_exit(&pl->pcl_lock); + } +} + +/* + * Associate parent and child pollcaches via a pcachelink_t. If an existing + * link (stale or valid) between the two is found, it will be reused. If a + * suitable link is not found for reuse, a new one will be allocated. + */ +static void +pcachelink_assoc(pollcache_t *child, pollcache_t *parent) +{ + pcachelink_t *pl, **plpn; + + ASSERT(MUTEX_HELD(&child->pc_lock)); + ASSERT(MUTEX_HELD(&parent->pc_lock)); + + /* Search for an existing link we can reuse. */ + plpn = &child->pc_parents; + for (pl = child->pc_parents; pl != NULL; pl = *plpn) { + mutex_enter(&pl->pcl_lock); + if (pl->pcl_state == PCL_INVALID) { + /* Clean any invalid links while walking the list */ + *plpn = pl->pcl_parent_next; + pl->pcl_child_pc = NULL; + pl->pcl_parent_next = NULL; + pcachelink_locked_rele(pl); + } else if (pl->pcl_parent_pc == parent) { + /* Successfully found parent link */ + ASSERT(pl->pcl_state == PCL_VALID || + pl->pcl_state == PCL_STALE); + pl->pcl_state = PCL_VALID; + mutex_exit(&pl->pcl_lock); + return; + } else { + plpn = &pl->pcl_parent_next; + mutex_exit(&pl->pcl_lock); + } + } + + /* No existing link to the parent was found. Create a fresh one. */ + pl = kmem_zalloc(sizeof (pcachelink_t), KM_SLEEP); + mutex_init(&pl->pcl_lock, NULL, MUTEX_DEFAULT, NULL); + + pl->pcl_parent_pc = parent; + pl->pcl_child_next = parent->pc_children; + parent->pc_children = pl; + pl->pcl_refcnt++; + + pl->pcl_child_pc = child; + pl->pcl_parent_next = child->pc_parents; + child->pc_parents = pl; + pl->pcl_refcnt++; + + pl->pcl_state = PCL_VALID; +} + +/* + * Mark all child links in a pollcache as stale. Any invalid child links found + * during iteration are purged. + */ +static void +pcachelink_mark_stale(pollcache_t *pcp) +{ + pcachelink_t *pl, **plpn; + + ASSERT(MUTEX_HELD(&pcp->pc_lock)); + + plpn = &pcp->pc_children; + for (pl = pcp->pc_children; pl != NULL; pl = *plpn) { + mutex_enter(&pl->pcl_lock); + if (pl->pcl_state == PCL_INVALID) { + /* + * Remove any invalid links while we are going to the + * trouble of walking the list. + */ + *plpn = pl->pcl_child_next; + pl->pcl_parent_pc = NULL; + pl->pcl_child_next = NULL; + pcachelink_locked_rele(pl); + } else { + pl->pcl_state = PCL_STALE; + plpn = &pl->pcl_child_next; + mutex_exit(&pl->pcl_lock); + } + } +} + +/* + * Purge all stale (or invalid) child links from a pollcache. 
+ */ +static void +pcachelink_purge_stale(pollcache_t *pcp) +{ + pcachelink_t *pl, **plpn; + + ASSERT(MUTEX_HELD(&pcp->pc_lock)); + + plpn = &pcp->pc_children; + for (pl = pcp->pc_children; pl != NULL; pl = *plpn) { + mutex_enter(&pl->pcl_lock); + switch (pl->pcl_state) { + case PCL_STALE: + pl->pcl_state = PCL_INVALID; + /* FALLTHROUGH */ + case PCL_INVALID: + *plpn = pl->pcl_child_next; + pl->pcl_parent_pc = NULL; + pl->pcl_child_next = NULL; + pcachelink_locked_rele(pl); + break; + default: + plpn = &pl->pcl_child_next; + mutex_exit(&pl->pcl_lock); + } + } +} + +/* + * Purge all child and parent links from a pollcache, regardless of status. + */ +static void +pcachelink_purge_all(pollcache_t *pcp) +{ + pcachelink_t *pl, **plpn; + + ASSERT(MUTEX_HELD(&pcp->pc_lock)); + + plpn = &pcp->pc_parents; + for (pl = pcp->pc_parents; pl != NULL; pl = *plpn) { + mutex_enter(&pl->pcl_lock); + pl->pcl_state = PCL_INVALID; + *plpn = pl->pcl_parent_next; + pl->pcl_child_pc = NULL; + pl->pcl_parent_next = NULL; + pcachelink_locked_rele(pl); + } + + plpn = &pcp->pc_children; + for (pl = pcp->pc_children; pl != NULL; pl = *plpn) { + mutex_enter(&pl->pcl_lock); + pl->pcl_state = PCL_INVALID; + *plpn = pl->pcl_child_next; + pl->pcl_parent_pc = NULL; + pl->pcl_child_next = NULL; + pcachelink_locked_rele(pl); + } + + ASSERT(pcp->pc_parents == NULL); + ASSERT(pcp->pc_children == NULL); +} diff --git a/usr/src/uts/common/io/tty_pty.c b/usr/src/uts/common/io/tty_pty.c index 6c829dcd21..a8eea823be 100644 --- a/usr/src/uts/common/io/tty_pty.c +++ b/usr/src/uts/common/io/tty_pty.c @@ -1,6 +1,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. */ /* @@ -988,7 +989,10 @@ ptcpoll(dev_t dev, #ifdef lint anyyet = anyyet; #endif - polllock(php, &pty->ptc_lock); + if (polllock(php, &pty->ptc_lock) != 0) { + *reventsp = POLLNVAL; + return (0); + } ASSERT(MUTEX_HELD(&pty->ptc_lock)); diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c index 18a5ded1c6..62f94729cf 100644 --- a/usr/src/uts/common/os/streamio.c +++ b/usr/src/uts/common/os/streamio.c @@ -8218,7 +8218,11 @@ strpoll( tq = qp->q_next->q_nfsrv; ASSERT(tq != NULL); - polllock(&stp->sd_pollist, QLOCK(tq)); + if (polllock(&stp->sd_pollist, QLOCK(tq)) != 0) { + releasestr(qp); + *reventsp = POLLNVAL; + return (0); + } if (events & POLLWRNORM) { queue_t *sqp; @@ -8228,7 +8232,12 @@ strpoll( else if ((sqp = stp->sd_struiowrq) != NULL) { /* Check sync stream barrier write q */ mutex_exit(QLOCK(tq)); - polllock(&stp->sd_pollist, QLOCK(sqp)); + if (polllock(&stp->sd_pollist, + QLOCK(sqp)) != 0) { + releasestr(qp); + *reventsp = POLLNVAL; + return (0); + } if (sqp->q_flag & QFULL) /* ensure pollwakeup() is done */ sqp->q_flag |= QWANTWSYNC; @@ -8241,7 +8250,12 @@ strpoll( goto chkrd; } mutex_exit(QLOCK(sqp)); - polllock(&stp->sd_pollist, QLOCK(tq)); + if (polllock(&stp->sd_pollist, + QLOCK(tq)) != 0) { + releasestr(qp); + *reventsp = POLLNVAL; + return (0); + } } else retevents |= POLLOUT; } @@ -8273,7 +8287,10 @@ chkrd: * Note: Need to do polllock() here since ps_lock may be * held. See bug 4191544. 
*/ - polllock(&stp->sd_pollist, &stp->sd_lock); + if (polllock(&stp->sd_pollist, &stp->sd_lock) != 0) { + *reventsp = POLLNVAL; + return (0); + } headlocked = 1; mp = qp->q_first; while (mp) { @@ -8326,7 +8343,10 @@ chkrd: if (!anyyet) { *phpp = &stp->sd_pollist; if (headlocked == 0) { - polllock(&stp->sd_pollist, &stp->sd_lock); + if (polllock(&stp->sd_pollist, &stp->sd_lock) != 0) { + *reventsp = POLLNVAL; + return (0); + } headlocked = 1; } stp->sd_rput_opt |= SR_POLLIN; diff --git a/usr/src/uts/common/sys/devpoll.h b/usr/src/uts/common/sys/devpoll.h index 4e4c76d9b0..3b6bd159c3 100644 --- a/usr/src/uts/common/sys/devpoll.h +++ b/usr/src/uts/common/sys/devpoll.h @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #ifndef _SYS_DEVPOLL_H @@ -88,9 +88,6 @@ typedef struct dp_entry { mutex_enter(&(dpep)->dpe_lock); \ ASSERT((dpep)->dpe_refcnt > 0); \ (dpep)->dpe_refcnt--; \ - if ((dpep)->dpe_refcnt == 0) { \ - cv_broadcast(&(dpep)->dpe_cv); \ - } \ mutex_exit(&(dpep)->dpe_lock); \ } #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/poll.h b/usr/src/uts/common/sys/poll.h index efc8457a6a..75a588533f 100644 --- a/usr/src/uts/common/sys/poll.h +++ b/usr/src/uts/common/sys/poll.h @@ -31,7 +31,7 @@ */ /* - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #ifndef _SYS_POLL_H @@ -130,8 +130,8 @@ extern void pollwakeup(pollhead_t *, short); /* * Internal routines. */ -extern void polllock(pollhead_t *, kmutex_t *); -extern int pollunlock(void); +extern int polllock(pollhead_t *, kmutex_t *); +extern int pollunlock(int *); extern void pollrelock(int); extern void pollcleanup(void); extern void pollblockexit(struct fpollinfo *); diff --git a/usr/src/uts/common/sys/poll_impl.h b/usr/src/uts/common/sys/poll_impl.h index 2e866ec4d4..67b47f9a1e 100644 --- a/usr/src/uts/common/sys/poll_impl.h +++ b/usr/src/uts/common/sys/poll_impl.h @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #ifndef _SYS_POLL_IMPL_H @@ -36,7 +36,7 @@ * * Each kernel thread (1), if engaged in poll system call, has a reference to * a pollstate_t (2), which contains relevant flags and locks. The pollstate_t - * contains a pointer to a pcache_t (3), which caches the state of previous + * contains a pointer to a pollcache_t (3), which caches the state of previous * calls to poll. A bitmap (4) is stored inside the poll cache, where each * bit represents a file descriptor. The bits are set if the corresponding * device has a polled event pending. Only fds with their bit set will be @@ -45,7 +45,7 @@ * structures keep track of the pollfd_t arrays (6) passed in from userland. * Each polled file descriptor has a corresponding polldat_t which can be * chained onto a device's pollhead, and these are kept in a hash table (7) - * inside the pcache_t. The hash table allows efficient conversion of a + * inside the pollcache_t. The hash table allows efficient conversion of a * given fd to its corresponding polldat_t. * * (1) (2) @@ -76,7 +76,7 @@ * Both poll system call and /dev/poll use the pollcache_t structure * definition and the routines managing the structure. But poll(2) and * /dev/poll have their own copy of the structures. The /dev/poll driver - * table (1a) contains an array of pointers, each pointing at a pcache_t + * table (1a) contains an array of pointers, each pointing at a pollcache_t * struct (3). A device minor number is used as an device table index. 
* */ @@ -86,12 +86,26 @@ #include <sys/thread.h> #include <sys/file.h> +#include <sys/port_kernel.h> #ifdef __cplusplus extern "C" { #endif /* + * Typedefs + */ +struct pollcache; +struct pollstate; +struct pcachelink; +struct polldat; + +typedef struct pollcache pollcache_t; +typedef struct pollstate pollstate_t; +typedef struct pcachelink pcachelink_t; +typedef struct polldat polldat_t; + +/* * description of pollcacheset structure */ typedef struct pollcacheset { @@ -104,18 +118,40 @@ typedef struct pollcacheset { #define POLLFDSETS 2 /* + * Maximum depth for recusive poll operations. + */ +#define POLLMAXDEPTH 5 + +/* * State information kept by each polling thread */ -typedef struct pollstate { +struct pollstate { pollfd_t *ps_pollfd; /* hold the current poll list */ size_t ps_nfds; /* size of ps_pollfd */ kmutex_t ps_lock; /* mutex for sleep/wakeup */ - struct pollcache *ps_pcache; /* cached poll fd set */ + pollcache_t *ps_pcache; /* cached poll fd set */ pollcacheset_t *ps_pcacheset; /* cached poll lists */ int ps_nsets; /* no. of cached poll sets */ pollfd_t *ps_dpbuf; /* return pollfd buf used by devpoll */ size_t ps_dpbufsize; /* size of ps_dpbuf */ -} pollstate_t; + int ps_depth; /* epoll recursion depth */ + pollcache_t *ps_pc_stack[POLLMAXDEPTH]; /* epoll recursion state */ + pollcache_t *ps_contend_pc; /* pollcache waited on */ + pollstate_t *ps_contend_nextp; /* next in contender list */ + pollstate_t **ps_contend_pnextp; /* pointer-to-previous-next */ + int ps_flags; /* state flags */ +}; + +/* pollstate flags */ +#define POLLSTATE_STALEMATE 0x1 +#define POLLSTATE_ULFAIL 0x2 + +/* pollstate_enter results */ +#define PSE_SUCCESS 0 +#define PSE_FAIL_DEPTH 1 +#define PSE_FAIL_LOOP 2 +#define PSE_FAIL_DEADLOCK 3 +#define PSE_FAIL_POLLSTATE 4 /* * poll cache size defines @@ -143,27 +179,54 @@ typedef struct xref { #define POLLPOSINVAL (-1L) /* xf_position is invalid */ #define POLLPOSTRANS (-2L) /* xf_position is transient state */ + +typedef enum pclstate { + PCL_INIT = 0, /* just allocated/zeroed, prior */ + PCL_VALID, /* linked with both parent and child pollcaches */ + PCL_STALE, /* still linked but marked stale, pending refresh */ + PCL_INVALID, /* dissociated from one pollcache, awaiting cleanup */ + PCL_FREE /* only meant to indicate use-after-free */ +} pclstate_t; + +/* + * The pcachelink struct creates an association between parent and child + * pollcaches in a recursive /dev/poll operation. Fields are protected by + * pcl_lock although manipulation of pcl_child_next or pcl_parent_next also + * requires holding pc_lock in the respective pcl_parent_pc or pcl_child_pc + * pollcache. + */ +struct pcachelink { + kmutex_t pcl_lock; /* protects contents */ + pclstate_t pcl_state; /* status of link entry */ + int pcl_refcnt; /* ref cnt of linked pcaches */ + pollcache_t *pcl_child_pc; /* child pollcache */ + pollcache_t *pcl_parent_pc; /* parent pollcache */ + pcachelink_t *pcl_child_next; /* next in child list */ + pcachelink_t *pcl_parent_next; /* next in parents list */ +}; + + /* * polldat is an entry for a cached poll fd. A polldat struct can be in * poll cache table as well as on pollhead ph_list, which is used by * pollwakeup to wake up a sleeping poller. There should be one polldat * per polled fd hanging off pollstate struct. 
*/ -typedef struct polldat { +struct polldat { int pd_fd; /* cached poll fd */ int pd_events; /* union of all polled events */ file_t *pd_fp; /* used to detect fd reuse */ pollhead_t *pd_php; /* used to undo poll registration */ kthread_t *pd_thread; /* used for waking up a sleep thrd */ - struct pollcache *pd_pcache; /* a ptr to the pollcache of this fd */ - struct polldat *pd_next; /* next on pollhead's ph_list */ - struct polldat *pd_hashnext; /* next on pollhead's ph_list */ + pollcache_t *pd_pcache; /* a ptr to the pollcache of this fd */ + polldat_t *pd_next; /* next on pollhead's ph_list */ + polldat_t *pd_hashnext; /* next on pollhead's ph_list */ int pd_count; /* total count from all ref'ed sets */ int pd_nsets; /* num of xref sets, used by poll(2) */ xref_t *pd_ref; /* ptr to xref info, 1 for each set */ - struct port_kevent *pd_portev; /* associated port event struct */ + port_kevent_t *pd_portev; /* associated port event struct */ uint64_t pd_epolldata; /* epoll data, if any */ -} polldat_t; +}; /* * One cache for each thread that polls. Points to a bitmap (used by pollwakeup) @@ -172,7 +235,7 @@ typedef struct polldat { * of port_fdcache_t, both structs implement pc_lock with offset 0 (see also * pollrelock()). */ -typedef struct pollcache { +struct pollcache { kmutex_t pc_lock; /* lock to protect pollcache */ ulong_t *pc_bitmap; /* point to poll fd bitmap */ polldat_t **pc_hash; /* points to a hash table of ptrs */ @@ -187,11 +250,12 @@ typedef struct pollcache { kcondvar_t pc_cv; /* cv to wait on if needed */ pid_t pc_pid; /* for check acc rights, devpoll only */ int pc_mapstart; /* where search start, devpoll only */ -} pollcache_t; + pcachelink_t *pc_parents; /* linked list of epoll parents */ + pcachelink_t *pc_children; /* linked list of epoll children */ +}; /* pc_flag */ #define PC_POLLWAKE 0x02 /* pollwakeup() occurred */ -#define PC_WRITEWANTED 0x04 /* writer wishes to modify the pollcache_t */ #if defined(_KERNEL) /* @@ -218,11 +282,15 @@ extern void pollhead_delete(pollhead_t *, polldat_t *); /* * poll state interfaces: * - * pollstate_create creates per-thread pollstate - * pollstate_destroy cleans up per-thread pollstate + * pollstate_create initializes per-thread pollstate + * pollstate_destroy cleans up per-thread pollstate + * pollstate_enter safely lock pollcache for pollstate + * pollstate_exit unlock pollcache from pollstate */ extern pollstate_t *pollstate_create(void); extern void pollstate_destroy(pollstate_t *); +extern int pollstate_enter(pollcache_t *); +extern void pollstate_exit(pollcache_t *); /* * public pcache interfaces: @@ -254,6 +322,7 @@ extern void pcache_destroy(pollcache_t *); * pcache_grow_map grows the pollcache bitmap * pcache_update_xref update cross ref (from polldat back to cacheset) info * pcache_clean_entry cleanup an entry in pcache and more... 
+ * pcache_wake_parents wake linked parent pollcaches */ extern polldat_t *pcache_lookup_fd(pollcache_t *, int); extern polldat_t *pcache_alloc_fd(int); @@ -263,6 +332,7 @@ extern void pcache_grow_hashtbl(pollcache_t *, nfds_t); extern void pcache_grow_map(pollcache_t *, int); extern void pcache_update_xref(pollcache_t *, int, ssize_t, int); extern void pcache_clean_entry(pollstate_t *, int); +extern void pcache_wake_parents(pollcache_t *); /* * pcacheset interfaces: diff --git a/usr/src/uts/common/syscall/poll.c b/usr/src/uts/common/syscall/poll.c index c33156a4fc..cc125f127a 100644 --- a/usr/src/uts/common/syscall/poll.c +++ b/usr/src/uts/common/syscall/poll.c @@ -29,7 +29,7 @@ /* * Copyright (c) 2012 by Delphix. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* @@ -77,11 +77,13 @@ static struct { kstat_named_t pollcachehit; /* list matched 100% w/ cached one */ kstat_named_t pollcachephit; /* list matched < 100% w/ cached one */ kstat_named_t pollcachemiss; /* every list entry is dif from cache */ + kstat_named_t pollunlockfail; /* failed to perform pollunlock */ } pollstats = { { "polllistmiss", KSTAT_DATA_UINT64 }, { "pollcachehit", KSTAT_DATA_UINT64 }, { "pollcachephit", KSTAT_DATA_UINT64 }, - { "pollcachemiss", KSTAT_DATA_UINT64 } + { "pollcachemiss", KSTAT_DATA_UINT64 }, + { "pollunlockfail", KSTAT_DATA_UINT64 } }; kstat_named_t *pollstats_ptr = (kstat_named_t *)&pollstats; @@ -96,6 +98,10 @@ struct pplock { static struct pplock plocks[NPHLOCKS]; /* Hash array of pollhead locks */ +/* Contention lock & list for preventing deadlocks in recursive /dev/poll. */ +static kmutex_t pollstate_contenders_lock; +static pollstate_t *pollstate_contenders = NULL; + #ifdef DEBUG static int pollchecksanity(pollstate_t *, nfds_t); static int pollcheckxref(pollstate_t *, int); @@ -223,19 +229,35 @@ static int plist_chkdupfd(file_t *, polldat_t *, pollstate_t *, pollfd_t *, int, * (which hold poll locks on entry to xx_poll(), then acquire foo) * and pollwakeup() threads (which hold foo, then acquire poll locks). * - * pollunlock(void) releases whatever poll locks the current thread holds, - * returning a cookie for use by pollrelock(); + * pollunlock(*cookie) releases whatever poll locks the current thread holds, + * setting a cookie for use by pollrelock(); * * pollrelock(cookie) reacquires previously dropped poll locks; * * polllock(php, mutex) does the common case: pollunlock(), * acquire the problematic mutex, pollrelock(). + * + * If polllock() or pollunlock() return non-zero, it indicates that a recursive + * /dev/poll is in progress and pollcache locks cannot be dropped. Callers + * must handle this by indicating a POLLNVAL in the revents of the VOP_POLL. */ int -pollunlock(void) +pollunlock(int *lockstate) { + pollstate_t *ps = curthread->t_pollstate; pollcache_t *pcp; - int lockstate = 0; + + ASSERT(lockstate != NULL); + + /* + * There is no way to safely perform a pollunlock() while in the depths + * of a recursive /dev/poll operation. + */ + if (ps != NULL && ps->ps_depth > 1) { + ps->ps_flags |= POLLSTATE_ULFAIL; + pollstats.pollunlockfail.value.ui64++; + return (-1); + } /* * t_pollcache is set by /dev/poll and event ports (port_fd.c). @@ -243,45 +265,56 @@ pollunlock(void) * the t_pollcache should be NULL. 
*/ if (curthread->t_pollcache == NULL) - pcp = curthread->t_pollstate->ps_pcache; + pcp = ps->ps_pcache; else pcp = curthread->t_pollcache; - if (mutex_owned(&pcp->pc_lock)) { - lockstate = 1; + if (!mutex_owned(&pcp->pc_lock)) { + *lockstate = 0; + } else { + *lockstate = 1; mutex_exit(&pcp->pc_lock); } - return (lockstate); + return (0); } void pollrelock(int lockstate) { + pollstate_t *ps = curthread->t_pollstate; pollcache_t *pcp; + /* Skip this whole ordeal if the pollcache was not locked to begin */ + if (lockstate == 0) + return; + /* * t_pollcache is set by /dev/poll and event ports (port_fd.c). * If the pollrelock/pollunlock is called as a result of poll(2), * the t_pollcache should be NULL. */ if (curthread->t_pollcache == NULL) - pcp = curthread->t_pollstate->ps_pcache; + pcp = ps->ps_pcache; else pcp = curthread->t_pollcache; - if (lockstate > 0) - mutex_enter(&pcp->pc_lock); + mutex_enter(&pcp->pc_lock); } /* ARGSUSED */ -void +int polllock(pollhead_t *php, kmutex_t *lp) { - if (!mutex_tryenter(lp)) { - int lockstate = pollunlock(); + if (mutex_tryenter(lp) == 0) { + int state; + + if (pollunlock(&state) != 0) { + return (-1); + } mutex_enter(lp); - pollrelock(lockstate); + pollrelock(state); } + return (0); } static int @@ -370,10 +403,7 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) * Need to allocate memory for pollstate before anything because * the mutex and cv are created in this space */ - if ((ps = t->t_pollstate) == NULL) { - t->t_pollstate = pollstate_create(); - ps = t->t_pollstate; - } + ps = pollstate_create(); if (ps->ps_pcache == NULL) ps->ps_pcache = pcache_alloc(); @@ -899,6 +929,7 @@ pollnotify(pollcache_t *pcp, int fd) BT_SET(pcp->pc_bitmap, fd); pcp->pc_flag |= PC_POLLWAKE; cv_broadcast(&pcp->pc_cv); + pcache_wake_parents(pcp); } /* @@ -2221,20 +2252,47 @@ pcache_clean_entry(pollstate_t *ps, int fd) } } +void +pcache_wake_parents(pollcache_t *pcp) +{ + pcachelink_t *pl, *pln; + + ASSERT(MUTEX_HELD(&pcp->pc_lock)); + + for (pl = pcp->pc_parents; pl != NULL; pl = pln) { + mutex_enter(&pl->pcl_lock); + if (pl->pcl_state == PCL_VALID) { + ASSERT(pl->pcl_parent_pc != NULL); + cv_broadcast(&pl->pcl_parent_pc->pc_cv); + } + pln = pl->pcl_parent_next; + mutex_exit(&pl->pcl_lock); + } +} + /* - * This is the first time this thread has ever polled, - * so we have to create its pollstate structure. - * This will persist for the life of the thread, - * until it calls pollcleanup(). + * Initialize thread pollstate structure. + * It will persist for the life of the thread, until it calls pollcleanup(). */ pollstate_t * -pollstate_create(void) +pollstate_create() { - pollstate_t *ps; + pollstate_t *ps = curthread->t_pollstate; - ps = kmem_zalloc(sizeof (pollstate_t), KM_SLEEP); - ps->ps_nsets = POLLFDSETS; - ps->ps_pcacheset = pcacheset_create(ps->ps_nsets); + if (ps == NULL) { + /* + * This is the first time this thread has ever polled, so we + * have to create its pollstate structure. 
+ */ + ps = kmem_zalloc(sizeof (pollstate_t), KM_SLEEP); + ps->ps_nsets = POLLFDSETS; + ps->ps_pcacheset = pcacheset_create(ps->ps_nsets); + curthread->t_pollstate = ps; + } else { + ASSERT(ps->ps_depth == 0); + ASSERT(ps->ps_flags == 0); + ASSERT(ps->ps_pc_stack[0] == 0); + } return (ps); } @@ -2259,6 +2317,186 @@ pollstate_destroy(pollstate_t *ps) kmem_free(ps, sizeof (pollstate_t)); } +static int +pollstate_contend(pollstate_t *ps, pollcache_t *pcp) +{ + pollstate_t *rem, *next; + pollcache_t *desired_pc; + int result = 0, depth_total; + + mutex_enter(&pollstate_contenders_lock); + /* + * There is a small chance that the pollcache of interest became + * available while we were waiting on the contenders lock. + */ + if (mutex_tryenter(&pcp->pc_lock) != 0) { + goto out; + } + + /* + * Walk the list of contended pollstates, searching for evidence of a + * deadlock condition. + */ + depth_total = ps->ps_depth; + desired_pc = pcp; + for (rem = pollstate_contenders; rem != NULL; rem = next) { + int i, j; + next = rem->ps_contend_nextp; + + /* Is this pollstate holding the pollcache of interest? */ + for (i = 0; i < rem->ps_depth; i++) { + if (rem->ps_pc_stack[i] != desired_pc) { + continue; + } + + /* + * The remote pollstate holds the pollcache lock we + * desire. If it is waiting on a pollcache we hold, + * then we can report the obvious deadlock. + */ + ASSERT(rem->ps_contend_pc != NULL); + for (j = 0; j < ps->ps_depth; j++) { + if (rem->ps_contend_pc == ps->ps_pc_stack[j]) { + rem->ps_flags |= POLLSTATE_STALEMATE; + result = -1; + goto out; + } + } + + /* + * The remote pollstate is not blocking on a pollcache + * which would deadlock against us. That pollcache + * may, however, be held by a pollstate which would + * result in a deadlock. + * + * To detect such a condition, we continue walking + * through the list using the pollcache blocking the + * remote thread as our new search target. + * + * Return to the front of pollstate_contenders since it + * is not ordered to guarantee complete dependency + * traversal. The below depth tracking places an upper + * bound on iterations. + */ + desired_pc = rem->ps_contend_pc; + next = pollstate_contenders; + + /* + * The recursion depth of the remote pollstate is used + * to calculate a final depth for the local /dev/poll + * recursion, since those locks will be acquired + * eventually. If that value exceeds the defined + * limit, we can report the failure now instead of + * recursing to that failure depth. + */ + depth_total += (rem->ps_depth - i); + if (depth_total >= POLLMAXDEPTH) { + result = -1; + goto out; + } + } + } + + /* + * No deadlock partner was found. The only course of action is to + * record ourself as a contended pollstate and wait for the pollcache + * mutex to become available. + */ + ps->ps_contend_pc = pcp; + ps->ps_contend_nextp = pollstate_contenders; + ps->ps_contend_pnextp = &pollstate_contenders; + if (pollstate_contenders != NULL) { + pollstate_contenders->ps_contend_pnextp = + &ps->ps_contend_nextp; + } + pollstate_contenders = ps; + + mutex_exit(&pollstate_contenders_lock); + mutex_enter(&pcp->pc_lock); + mutex_enter(&pollstate_contenders_lock); + + /* + * Our acquisition of the pollcache mutex may be due to another thread + * giving up in the face of deadlock with us. If that is the case, + * we too should report the failure. + */ + if ((ps->ps_flags & POLLSTATE_STALEMATE) != 0) { + result = -1; + ps->ps_flags &= ~POLLSTATE_STALEMATE; + mutex_exit(&pcp->pc_lock); + } + + /* Remove ourself from the contenders list. 
*/ + if (ps->ps_contend_nextp != NULL) { + ps->ps_contend_nextp->ps_contend_pnextp = + ps->ps_contend_pnextp; + } + *ps->ps_contend_pnextp = ps->ps_contend_nextp; + ps->ps_contend_pc = NULL; + ps->ps_contend_nextp = NULL; + ps->ps_contend_pnextp = NULL; + +out: + mutex_exit(&pollstate_contenders_lock); + return (result); +} + +int +pollstate_enter(pollcache_t *pcp) +{ + pollstate_t *ps = curthread->t_pollstate; + int i; + + if (ps == NULL) { + /* + * The thread pollstate may not be initialized if VOP_POLL is + * called on a recursion-enabled /dev/poll handle from outside + * the poll() or /dev/poll codepaths. + */ + return (PSE_FAIL_POLLSTATE); + } + if (ps->ps_depth >= POLLMAXDEPTH) { + return (PSE_FAIL_DEPTH); + } + /* + * Check the desired pollcache against pollcaches we already have + * locked. Such a loop is the most simple deadlock scenario. + */ + for (i = 0; i < ps->ps_depth; i++) { + if (ps->ps_pc_stack[i] == pcp) { + return (PSE_FAIL_LOOP); + } + } + ASSERT(ps->ps_pc_stack[i] == NULL); + + if (ps->ps_depth == 0) { + /* Locking initial the pollcache requires no caution */ + mutex_enter(&pcp->pc_lock); + } else if (mutex_tryenter(&pcp->pc_lock) == 0) { + if (pollstate_contend(ps, pcp) != 0) { + /* This pollcache cannot safely be locked. */ + return (PSE_FAIL_DEADLOCK); + } + } + + ps->ps_pc_stack[ps->ps_depth++] = pcp; + return (PSE_SUCCESS); +} + +void +pollstate_exit(pollcache_t *pcp) +{ + pollstate_t *ps = curthread->t_pollstate; + + VERIFY(ps != NULL); + VERIFY(ps->ps_pc_stack[ps->ps_depth - 1] == pcp); + + mutex_exit(&pcp->pc_lock); + ps->ps_pc_stack[--ps->ps_depth] = NULL; + VERIFY(ps->ps_depth >= 0); +} + + /* * We are holding the appropriate uf_lock entering this routine. * Bump up the ps_busy count to prevent the thread from exiting. |
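
To summarize the new locking protocol, the condensed sketch below (not from the patch; the function name is hypothetical) shows the calling pattern that pollstate_enter() and pollstate_exit() establish for recursive /dev/poll consumers, mirroring the switch in dppoll() above. The error mappings follow dppoll(); other callers may choose differently.

static int
recursive_poll_example(pollcache_t *pcp)
{
	(void) pollstate_create();	/* idempotent per-thread setup */

	switch (pollstate_enter(pcp)) {
	case PSE_SUCCESS:
		/*
		 * pc_lock is now held and pcp is recorded on the
		 * ps_pc_stack recursion stack, allowing nested entries
		 * to detect loops and bound their depth.
		 */
		/* ... e.g. dp_pcache_poll() against pcp ... */
		pollstate_exit(pcp);
		return (0);
	case PSE_FAIL_DEPTH:
		/* Nested more than POLLMAXDEPTH (5) handles deep. */
		return (EINVAL);
	case PSE_FAIL_LOOP:
	case PSE_FAIL_DEADLOCK:
		/* Cycle in the handle graph or deadlocking topology. */
		return (ELOOP);
	default:
		/* PSE_FAIL_POLLSTATE: called from an unexpected context. */
		return (EINVAL);
	}
}

Note that pollstate_exit() must be called exactly once for each successful pollstate_enter(), in LIFO order, since it pops the top entry of ps_pc_stack and drops pc_lock.
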