Diffstat (limited to 'usr/src/uts/common/io/devpoll.c')
-rw-r--r--  usr/src/uts/common/io/devpoll.c | 465
1 file changed, 295 insertions(+), 170 deletions(-)
diff --git a/usr/src/uts/common/io/devpoll.c b/usr/src/uts/common/io/devpoll.c
index 4fce431e00..040a1f9190 100644
--- a/usr/src/uts/common/io/devpoll.c
+++ b/usr/src/uts/common/io/devpoll.c
@@ -25,7 +25,7 @@
/*
* Copyright (c) 2012 by Delphix. All rights reserved.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2017 Joyent, Inc.
*/
#include <sys/types.h>
@@ -245,30 +245,20 @@ dpinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
* stale entries!
*/
static int
-dp_pcache_poll(dp_entry_t *dpep, void *dpbuf,
- pollcache_t *pcp, nfds_t nfds, int *fdcntp)
+dp_pcache_poll(dp_entry_t *dpep, void *dpbuf, pollcache_t *pcp, nfds_t nfds,
+ int *fdcntp)
{
- int start, ostart, end;
- int fdcnt, fd;
- boolean_t done;
- file_t *fp;
- short revent;
- boolean_t no_wrap;
- pollhead_t *php;
- polldat_t *pdp;
+ int start, ostart, end, fdcnt, error = 0;
+ boolean_t done, no_wrap;
pollfd_t *pfdp;
epoll_event_t *epoll;
- int error = 0;
- short mask = POLLRDHUP | POLLWRBAND;
- boolean_t is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;
+ const short mask = POLLRDHUP | POLLWRBAND;
+ const boolean_t is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;
ASSERT(MUTEX_HELD(&pcp->pc_lock));
if (pcp->pc_bitmap == NULL) {
- /*
- * No Need to search because no poll fd
- * has been cached.
- */
- return (error);
+		/* No need to search because no poll fd has been cached. */
+ return (0);
}
if (is_epoll) {
@@ -281,7 +271,6 @@ dp_pcache_poll(dp_entry_t *dpep, void *dpbuf,
retry:
start = ostart = pcp->pc_mapstart;
end = pcp->pc_mapend;
- php = NULL;
if (start == 0) {
/*
@@ -294,8 +283,11 @@ retry:
done = B_FALSE;
fdcnt = 0;
while ((fdcnt < nfds) && !done) {
- php = NULL;
- revent = 0;
+ pollhead_t *php = NULL;
+ short revent = 0;
+ uf_entry_gen_t gen;
+ int fd;
+
/*
* Examine the bit map in a circular fashion
* to avoid starvation. Always resume from
@@ -305,6 +297,9 @@ retry:
fd = bt_getlowbit(pcp->pc_bitmap, start, end);
ASSERT(fd <= end);
if (fd >= 0) {
+ file_t *fp;
+ polldat_t *pdp;
+
if (fd == end) {
if (no_wrap) {
done = B_TRUE;
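
The circular scan above is what prevents starvation: the search begins at the remembered position and wraps to the cache start at most once per pass. A rough userspace model of that walk, with bt_getlowbit() reduced to a naive loop (names hypothetical, not the kernel's bitmap routines):

    #include <stdio.h>

    #define	NBITS	(int)(sizeof (unsigned long) * 8)

    /* naive stand-in for bt_getlowbit(): lowest set bit in [lo, hi] */
    static int
    next_set(const unsigned long *map, int lo, int hi)
    {
    	for (int i = lo; i <= hi; i++) {
    		if (map[i / NBITS] & (1UL << (i % NBITS)))
    			return (i);
    	}
    	return (-1);
    }

    int
    main(void)
    {
    	unsigned long map[2] = { 0, 0 };
    	int mapstart = 3, start = 70, end = 100;

    	map[0] |= 1UL << 5;	/* only fd 5 is set, below `start` */

    	int fd = next_set(map, start, end);
    	if (fd < 0 && start != mapstart) {
    		/* wrap to the cache start, as the retry path does */
    		fd = next_set(map, mapstart, start - 1);
    	}
    	(void) printf("found fd %d\n", fd);	/* found fd 5 */
    	return (0);
    }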
@@ -328,28 +323,14 @@ repoll:
*/
continue;
}
- if ((fp = getf(fd)) == NULL) {
- /*
- * The fd has been closed, but user has not
- * done a POLLREMOVE on this fd yet. Instead
- * of cleaning it here implicitly, we return
- * POLLNVAL. This is consistent with poll(2)
- * polling a closed fd. Hope this will remind
- * user to do a POLLREMOVE.
- */
- if (!is_epoll && pfdp != NULL) {
- pfdp[fdcnt].fd = fd;
- pfdp[fdcnt].revents = POLLNVAL;
- fdcnt++;
- continue;
- }
-
- /*
- * In the epoll compatibility case, we actually
- * perform the implicit removal to remain
- * closer to the epoll semantics.
- */
+ if ((fp = getf_gen(fd, &gen)) == NULL) {
if (is_epoll) {
+ /*
+ * In the epoll compatibility case, we
+ * actually perform the implicit
+ * removal to remain closer to the
+ * epoll semantics.
+ */
pdp->pd_fp = NULL;
pdp->pd_events = 0;
@@ -360,27 +341,71 @@ repoll:
}
BT_CLEAR(pcp->pc_bitmap, fd);
- continue;
+ } else if (pfdp != NULL) {
+ /*
+				 * The fd has been closed, but the user
+				 * has not done a POLLREMOVE on it yet.
+				 * Instead of cleaning it up implicitly,
+				 * we return POLLNVAL, consistent with
+				 * poll(2) polling a closed fd; this will
+				 * hopefully remind the user to do a
+				 * POLLREMOVE.
+ */
+ pfdp[fdcnt].fd = fd;
+ pfdp[fdcnt].revents = POLLNVAL;
+ fdcnt++;
}
+ continue;
}
- if (fp != pdp->pd_fp) {
+ /*
+ * Detect a change to the resource underlying a cached
+ * file descriptor. While the fd generation comparison
+ * will catch nearly all cases, the file_t comparison
+ * is maintained as a failsafe as well.
+ */
+ if (gen != pdp->pd_gen || fp != pdp->pd_fp) {
/*
- * user is polling on a cached fd which was
- * closed and then reused. Unfortunately
- * there is no good way to inform user.
- * If the file struct is also reused, we
- * may not be able to detect the fd reuse
- * at all. As long as this does not
- * cause system failure and/or memory leak,
- * we will play along. Man page states if
- * user does not clean up closed fds, polling
- * results will be indeterministic.
+ * The user is polling on a cached fd which was
+ * closed and then reused. Unfortunately there
+ * is no good way to communicate this fact to
+ * the consumer.
*
- * XXX - perhaps log the detection of fd
- * reuse?
+ * When this situation has been detected, it's
+ * likely that any existing pollhead is
+ * ill-suited to perform proper wake-ups.
+ *
+ * Clean up the old entry under the expectation
+ * that a valid one will be provided as part of
+ * the later VOP_POLL.
+ */
+ if (pdp->pd_php != NULL) {
+ pollhead_delete(pdp->pd_php, pdp);
+ pdp->pd_php = NULL;
+ }
+
+ /*
+ * Since epoll is expected to act on the
+ * underlying 'struct file' (in Linux terms,
+ * our vnode_t would be a closer analog) rather
+ * than the fd itself, an implicit remove
+ * is necessary under these circumstances to
+ * suppress any results (or errors) from the
+ * new resource occupying the fd.
*/
- pdp->pd_fp = fp;
+ if (is_epoll) {
+ pdp->pd_fp = NULL;
+ pdp->pd_events = 0;
+ BT_CLEAR(pcp->pc_bitmap, fd);
+ releasef(fd);
+ continue;
+ } else {
+ /*
+ * Regular /dev/poll is unbothered
+ * about the fd reassignment.
+ */
+ pdp->pd_fp = fp;
+ }
}
/*
* XXX - pollrelock() logic needs to know which
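
getf_gen() is the key addition in this hunk: each fd slot carries a generation count that is bumped whenever the slot is reassigned, so a cached (file_t, generation) pair detects fd reuse even when a recycled file_t lands at the same address. A minimal userspace model of the comparison (types and names hypothetical, not the kernel's uf_entry_gen_t machinery):

    #include <stdio.h>

    typedef struct fd_slot {
    	void		*fs_fp;		/* file currently at this fd */
    	unsigned long	fs_gen;		/* bumped on every reassignment */
    } fd_slot_t;

    typedef struct cached_fd {
    	void		*cf_fp;		/* file observed at cache time */
    	unsigned long	cf_gen;		/* generation at cache time */
    } cached_fd_t;

    static int
    cache_is_stale(const fd_slot_t *slot, const cached_fd_t *c)
    {
    	/* generation check first; file identity is the failsafe */
    	return (slot->fs_gen != c->cf_gen || slot->fs_fp != c->cf_fp);
    }

    int
    main(void)
    {
    	fd_slot_t slot = { (void *)0x1000, 41 };
    	cached_fd_t cached = { (void *)0x1000, 41 };

    	(void) printf("fresh: %d\n", cache_is_stale(&slot, &cached));
    	slot.fs_gen = 42;	/* same address reused by a new file */
    	(void) printf("reused: %d\n", cache_is_stale(&slot, &cached));
    	return (0);
    }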
@@ -396,6 +421,27 @@ repoll:
curthread->t_pollcache = pcp;
error = VOP_POLL(fp->f_vnode, pdp->pd_events, 0,
&revent, &php, NULL);
+
+ /*
+ * Recheck edge-triggered descriptors which lack a
+ * pollhead. While this check is performed when an fd
+ * is added to the pollcache in dpwrite(), subsequent
+ * descriptor manipulation could cause a different
+ * resource to be present now.
+ */
+ if ((pdp->pd_events & POLLET) && error == 0 &&
+ pdp->pd_php == NULL && php == NULL && revent != 0) {
+ short levent = 0;
+
+ /*
+ * The same POLLET-only VOP_POLL is used in an
+ * attempt to coax a pollhead from older
+ * driver logic.
+ */
+ error = VOP_POLL(fp->f_vnode, POLLET,
+ 0, &levent, &php, NULL);
+ }
+
curthread->t_pollcache = NULL;
releasef(fd);
if (error != 0) {
@@ -431,6 +477,16 @@ repoll:
ep->data.u64 = pdp->pd_epolldata;
/*
+					 * Since POLLNVAL is a legal event
+					 * for VOP_POLL handlers to emit, it
+					 * must be translated to an
+					 * epoll-legal equivalent.
+ */
+ if (revent & POLLNVAL) {
+ revent &= ~POLLNVAL;
+ revent |= POLLERR;
+ }
+
+ /*
* If any of the event bits are set for
* which poll and epoll representations
* differ, swizzle in the native epoll
@@ -488,19 +544,12 @@ repoll:
}
}
- /*
- * If POLLET is set, clear the bit in the
- * bitmap -- which effectively latches the
- * edge on a pollwakeup() from the driver.
- */
- if (pdp->pd_events & POLLET)
- BT_CLEAR(pcp->pc_bitmap, fd);
-
- /*
- * If POLLONESHOT is set, perform the implicit
- * POLLREMOVE.
- */
+ /* Handle special polling modes. */
if (pdp->pd_events & POLLONESHOT) {
+ /*
+ * If POLLONESHOT is set, perform the
+ * implicit POLLREMOVE.
+ */
pdp->pd_fp = NULL;
pdp->pd_events = 0;
@@ -511,6 +560,28 @@ repoll:
}
BT_CLEAR(pcp->pc_bitmap, fd);
+ } else if (pdp->pd_events & POLLET) {
+ /*
+ * Wire up the pollhead which should
+ * have been provided. Edge-triggered
+ * polling cannot function properly
+ * with drivers which do not emit one.
+ */
+ if (php != NULL &&
+ pdp->pd_php == NULL) {
+ pollhead_insert(php, pdp);
+ pdp->pd_php = php;
+ }
+
+ /*
+ * If the driver has emitted a pollhead,
+ * clear the bit in the bitmap which
+ * effectively latches the edge on a
+ * pollwakeup() from the driver.
+ */
+ if (pdp->pd_php != NULL) {
+ BT_CLEAR(pcp->pc_bitmap, fd);
+ }
}
fdcnt++;
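
This latch-on-clear is what gives the /dev/poll-backed epoll its edge-triggered semantics: once an event is harvested, the fd stays silent until the driver issues another pollwakeup(). A small sketch of the observable effect through the epoll(3C)/epoll(7) interface (error handling elided):

    #include <sys/epoll.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    int
    main(void)
    {
    	int pfd[2], epfd;
    	struct epoll_event ev, out;

    	if (pipe(pfd) != 0 || (epfd = epoll_create1(0)) < 0)
    		exit(1);

    	ev.events = EPOLLIN | EPOLLET;
    	ev.data.fd = pfd[0];
    	(void) epoll_ctl(epfd, EPOLL_CTL_ADD, pfd[0], &ev);

    	(void) write(pfd[1], "x", 1);
    	/* The write latched an edge: this wait reports the fd. */
    	(void) printf("%d\n", epoll_wait(epfd, &out, 1, 0));	/* 1 */
    	/* No new wakeup since the harvest: nothing is reported. */
    	(void) printf("%d\n", epoll_wait(epfd, &out, 1, 0));	/* 0 */
    	return (0);
    }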
@@ -639,14 +710,10 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
pollfd_t *pollfdp, *pfdp;
dvpoll_epollfd_t *epfdp;
uintptr_t limit;
- int error, size;
- ssize_t uiosize;
- size_t copysize;
+ int error;
+ uint_t size;
+ size_t copysize, uiosize;
nfds_t pollfdnum;
- struct pollhead *php = NULL;
- polldat_t *pdp;
- int fd;
- file_t *fp;
boolean_t is_epoll, fds_added = B_FALSE;
minor = getminor(dev);
@@ -671,10 +738,27 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
pcp->pc_pid = curproc->p_pid;
}
- uiosize = uiop->uio_resid;
+ if (uiop->uio_resid < 0) {
+ /* No one else is this careful, but maybe they should be. */
+ return (EINVAL);
+ }
+
+ uiosize = (size_t)uiop->uio_resid;
pollfdnum = uiosize / size;
/*
+	 * For epoll-enabled handles, restrict the allowed write size to two
+	 * entries. This corresponds to an epoll_ctl(3C) EPOLL_CTL_MOD
+	 * operation, which is expanded into two operations (DEL and ADD).
+ *
+ * All other operations performed through epoll_ctl(3C) will consist of
+ * a single entry.
+ */
+ if (is_epoll && pollfdnum > 2) {
+ return (EINVAL);
+ }
+
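
To illustrate the two-entry case: an EPOLL_CTL_MOD arriving from the libc emulation layer would land here as a DEL/ADD pair in a single write. A hypothetical sketch of such a writer (not the actual libc code; field names as in sys/devpoll.h, error handling reduced):

    #include <sys/devpoll.h>
    #include <sys/poll.h>
    #include <string.h>
    #include <unistd.h>

    static int
    epoll_mod(int dpfd, int fd, short events, uint64_t data)
    {
    	dvpoll_epollfd_t ops[2];

    	(void) memset(ops, 0, sizeof (ops));
    	ops[0].dpep_pollfd.fd = fd;		/* DEL ... */
    	ops[0].dpep_pollfd.events = POLLREMOVE;
    	ops[1].dpep_pollfd.fd = fd;		/* ... then ADD */
    	ops[1].dpep_pollfd.events = events;
    	ops[1].dpep_data = data;

    	return (write(dpfd, ops, sizeof (ops)) == sizeof (ops) ? 0 : -1);
    }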
+ /*
* We want to make sure that pollfdnum isn't large enough to DoS us,
* but we also don't want to grab p_lock unnecessarily -- so we
* perform the full check against our resource limits if and only if
@@ -733,6 +817,21 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0) {
ASSERT(dpep->dpe_refcnt != 0);
+ /*
+ * The epoll API does not allow EINTR as a result when making
+ * modifications to the set of polled fds. Given that write
+ * activity is relatively quick and the size of accepted writes
+ * is limited above to two entries, a signal-ignorant wait is
+ * used here to avoid the EINTR.
+ */
+ if (is_epoll) {
+ cv_wait(&dpep->dpe_cv, &dpep->dpe_lock);
+ continue;
+ }
+
+ /*
+ * Non-epoll writers to /dev/poll handles can tolerate EINTR.
+ */
if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) {
dpep->dpe_writerwait--;
mutex_exit(&dpep->dpe_lock);
@@ -767,7 +866,9 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
}
for (pfdp = pollfdp; (uintptr_t)pfdp < limit;
pfdp = (pollfd_t *)((uintptr_t)pfdp + size)) {
- fd = pfdp->fd;
+ int fd = pfdp->fd;
+ polldat_t *pdp;
+
if ((uint_t)fd >= P_FINFO(curproc)->fi_nfiles) {
/*
* epoll semantics demand that we return EBADF if our
@@ -783,78 +884,61 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
pdp = pcache_lookup_fd(pcp, fd);
if (pfdp->events != POLLREMOVE) {
+ uf_entry_gen_t gen;
+ file_t *fp = NULL;
+ struct pollhead *php = NULL;
- fp = NULL;
-
- if (pdp == NULL) {
- /*
- * If we're in epoll compatibility mode, check
- * that the fd is valid before allocating
- * anything for it; epoll semantics demand that
- * we return EBADF if our specified fd is
- * invalid.
- */
- if (is_epoll) {
- if ((fp = getf(fd)) == NULL) {
- error = EBADF;
- break;
- }
+ /*
+ * If we're in epoll compatibility mode, check that the
+ * fd is valid before allocating anything for it; epoll
+ * semantics demand that we return EBADF if our
+ * specified fd is invalid.
+ */
+ if (is_epoll) {
+ if ((fp = getf_gen(fd, &gen)) == NULL) {
+ error = EBADF;
+ break;
}
-
+ }
+ if (pdp == NULL) {
pdp = pcache_alloc_fd(0);
pdp->pd_fd = fd;
pdp->pd_pcache = pcp;
pcache_insert_fd(pcp, pdp, pollfdnum);
- } else {
+ }
+
+ if (is_epoll) {
/*
- * epoll semantics demand that we error out if
- * a file descriptor is added twice, which we
- * check (imperfectly) by checking if we both
- * have the file descriptor cached and the
- * file pointer that correponds to the file
- * descriptor matches our cached value. If
- * there is a pointer mismatch, the file
- * descriptor was closed without being removed.
- * The converse is clearly not true, however,
- * so to narrow the window by which a spurious
- * EEXIST may be returned, we also check if
- * this fp has been added to an epoll control
- * descriptor in the past; if it hasn't, we
- * know that this is due to fp reuse -- it's
- * not a true EEXIST case. (By performing this
- * additional check, we limit the window of
- * spurious EEXIST to situations where a single
- * file descriptor is being used across two or
- * more epoll control descriptors -- and even
- * then, the file descriptor must be closed and
- * reused in a relatively tight time span.)
+ * If the fd is already a member of the epoll
+ * set, error emission is needed only when the
+ * fd assignment generation matches the one
+ * recorded in the polldat_t. Absence of such
+ * a generation match indicates that a new
+ * resource has been assigned at that fd.
+ *
+				 * Caveat: it is possible to force a
+				 * generation update while keeping the
+				 * same backing resource (via dup2), but
+				 * this does not represent a real-world
+				 * use case, so the missing EEXIST is
+				 * acceptable.
*/
- if (is_epoll) {
- if (pdp->pd_fp != NULL &&
- (fp = getf(fd)) != NULL &&
- fp == pdp->pd_fp &&
- (fp->f_flag2 & FEPOLLED)) {
- error = EEXIST;
- releasef(fd);
- break;
- }
-
- /*
- * We have decided that the cached
- * information was stale: it either
- * didn't match, or the fp had never
- * actually been epoll()'d on before.
- * We need to now clear our pd_events
- * to assure that we don't mistakenly
- * operate on cached event disposition.
- */
- pdp->pd_events = 0;
+ if (pdp->pd_fp != NULL && pdp->pd_gen == gen) {
+ error = EEXIST;
+ releasef(fd);
+ break;
}
- }
- if (is_epoll) {
+ /*
+ * We have decided that the cached information
+ * was stale. Clear pd_events to assure that
+ * we don't mistakenly operate on cached event
+ * disposition.
+ */
+ pdp->pd_events = 0;
+
epfdp = (dvpoll_epollfd_t *)pfdp;
pdp->pd_epolldata = epfdp->dpep_data;
}
ASSERT(pdp->pd_fd == fd);
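
The dup2() caveat noted above can be pictured in a few lines of userspace code: the dup2() call installs the very same open file back at `fd`, bumping the slot's generation without changing the backing resource (a hypothetical illustration, error handling elided):

    #include <fcntl.h>
    #include <unistd.h>

    int
    main(void)
    {
    	int fd = open("/dev/null", O_RDONLY);
    	int tmp = dup(fd);	/* second slot, same open file */

    	/* re-occupies fd: new generation, identical file underneath */
    	(void) dup2(tmp, fd);
    	(void) close(tmp);
    	return (0);
    }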
@@ -867,39 +951,36 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
if (fd > pcp->pc_mapend) {
pcp->pc_mapend = fd;
}
- if (fp == NULL && (fp = getf(fd)) == NULL) {
- /*
- * The fd is not valid. Since we can't pass
- * this error back in the write() call, set
- * the bit in bitmap to force DP_POLL ioctl
- * to examine it.
- */
- BT_SET(pcp->pc_bitmap, fd);
- pdp->pd_events |= pfdp->events;
- continue;
- }
- /*
- * To (greatly) reduce EEXIST false positives, we
- * denote that this fp has been epoll()'d. We do this
- * regardless of epoll compatibility mode, as the flag
- * is harmless if not in epoll compatibility mode.
- */
- fp->f_flag2 |= FEPOLLED;
+ if (!is_epoll) {
+ ASSERT(fp == NULL);
- /*
- * Don't do VOP_POLL for an already cached fd with
- * same poll events.
- */
- if ((pdp->pd_events == pfdp->events) &&
- (pdp->pd_fp == fp)) {
+ if ((fp = getf_gen(fd, &gen)) == NULL) {
+ /*
+ * The fd is not valid. Since we can't
+ * pass this error back in the write()
+ * call, set the bit in bitmap to force
+ * DP_POLL ioctl to examine it.
+ */
+ BT_SET(pcp->pc_bitmap, fd);
+ pdp->pd_events |= pfdp->events;
+ continue;
+ }
/*
- * the events are already cached
+ * Don't do VOP_POLL for an already cached fd
+ * with same poll events.
*/
- releasef(fd);
- continue;
+ if ((pdp->pd_events == pfdp->events) &&
+ (pdp->pd_fp == fp)) {
+ /*
+					 * The events are already cached.
+ */
+ releasef(fd);
+ continue;
+ }
}
+
/*
* do VOP_POLL and cache this poll fd.
*/
@@ -917,6 +998,32 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
curthread->t_pollcache = pcp;
error = VOP_POLL(fp->f_vnode, pfdp->events, 0,
&pfdp->revents, &php, NULL);
+
+ /*
+ * Edge-triggered polling requires a pollhead in order
+ * to initiate wake-ups properly. Drivers which are
+ * savvy to POLLET presence, which should include
+ * everything in-gate, will always emit one, regardless
+ * of revent status. Older drivers which only emit a
+ * pollhead if 'revents == 0' are given a second chance
+ * here via a second VOP_POLL, with only POLLET set in
+ * the events of interest. These circumstances should
+ * induce any cacheable drivers to emit a pollhead for
+ * wake-ups.
+ *
+ * Drivers which never emit a pollhead will simply
+			 * disobey the expectation of edge-triggered behavior.
+ * This includes recursive epoll which, even on Linux,
+ * yields its events in a level-triggered fashion only.
+ */
+ if ((pdp->pd_events & POLLET) && error == 0 &&
+ php == NULL) {
+ short levent = 0;
+
+ error = VOP_POLL(fp->f_vnode, POLLET, 0,
+ &levent, &php, NULL);
+ }
+
curthread->t_pollcache = NULL;
/*
* We always set the bit when this fd is cached;
@@ -931,6 +1038,7 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
break;
}
pdp->pd_fp = fp;
+ pdp->pd_gen = gen;
pdp->pd_events |= pfdp->events;
if (php != NULL) {
if (pdp->pd_php == NULL) {
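
The "older driver" pattern being accommodated here is the classic chpoll(9E) shape that returns a pollhead only when nothing is ready and no other thread is polling. A hypothetical kernel-context entry point showing both behaviors, with the driver state and event computation stubbed out (a sketch, not buildable outside a driver):

    #include <sys/types.h>
    #include <sys/poll.h>

    static struct pollhead mydrv_pollhead;	/* hypothetical driver pollhead */

    static int
    mydrv_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
        struct pollhead **phpp)
    {
    	short ready = 0;	/* would be computed from device state */

    	*reventsp = ready & events;
    	if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
    		/*
    		 * Legacy drivers stopped at the !anyyet clause; the
    		 * POLLET clause is what the second-chance VOP_POLL
    		 * above is trying to coax out of them.
    		 */
    		*phpp = &mydrv_pollhead;
    	}
    	return (0);
    }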
@@ -1056,8 +1164,13 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
* to turn it off for a particular open.
*/
dpep->dpe_flag |= DP_ISEPOLLCOMPAT;
- mutex_exit(&dpep->dpe_lock);
+ /* Record the epoll-enabled nature in the pollcache too */
+ mutex_enter(&pcp->pc_lock);
+ pcp->pc_flag |= PC_EPOLL;
+ mutex_exit(&pcp->pc_lock);
+
+ mutex_exit(&dpep->dpe_lock);
return (0);
}
@@ -1458,9 +1571,21 @@ dppoll(dev_t dev, short events, int anyyet, short *reventsp,
int fdcnt = 0;
pollstate_t *ps = curthread->t_pollstate;
- rc = dp_pcache_poll(dpep, NULL, pcp, nfds, &fdcnt);
- if (rc == 0) {
- *reventsp = (fdcnt > 0) ? POLLIN : 0;
+ /*
+ * Recursive polling will only emit certain events. Skip a
+ * scan of the pollcache if those events are not of interest.
+ */
+ if (events & (POLLIN|POLLRDNORM)) {
+ rc = dp_pcache_poll(dpep, NULL, pcp, nfds, &fdcnt);
+ } else {
+ rc = 0;
+ fdcnt = 0;
+ }
+
+ if (rc == 0 && fdcnt > 0) {
+ *reventsp = POLLIN|POLLRDNORM;
+ } else {
+ *reventsp = 0;
}
pcachelink_assoc(pcp, ps->ps_pc_stack[0]);
pollstate_exit(pcp);
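
The restriction mirrors what recursive polling can actually deliver: an epoll fd placed inside another set only ever reports readability. A small userspace sketch of that nesting (error handling elided):

    #include <sys/epoll.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    int
    main(void)
    {
    	int pfd[2], inner, outer;
    	struct epoll_event ev, out;

    	if (pipe(pfd) != 0)
    		exit(1);
    	inner = epoll_create1(0);
    	outer = epoll_create1(0);

    	ev.events = EPOLLIN;
    	ev.data.fd = pfd[0];
    	(void) epoll_ctl(inner, EPOLL_CTL_ADD, pfd[0], &ev);

    	ev.events = EPOLLIN;	/* only POLLIN/POLLRDNORM applies */
    	ev.data.fd = inner;
    	(void) epoll_ctl(outer, EPOLL_CTL_ADD, inner, &ev);

    	(void) write(pfd[1], "x", 1);
    	/* The outer set sees the inner epoll fd become readable. */
    	(void) printf("%d\n", epoll_wait(outer, &out, 1, 0));	/* 1 */
    	return (0);
    }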