Diffstat (limited to 'usr/src/uts/common/io/devpoll.c')
-rw-r--r--	usr/src/uts/common/io/devpoll.c	465
1 file changed, 295 insertions, 170 deletions
diff --git a/usr/src/uts/common/io/devpoll.c b/usr/src/uts/common/io/devpoll.c
index 4fce431e00..040a1f9190 100644
--- a/usr/src/uts/common/io/devpoll.c
+++ b/usr/src/uts/common/io/devpoll.c
@@ -25,7 +25,7 @@
 /*
  * Copyright (c) 2012 by Delphix. All rights reserved.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2017 Joyent, Inc.
  */
 
 #include <sys/types.h>
@@ -245,30 +245,20 @@ dpinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
  * stale entries!
  */
 static int
-dp_pcache_poll(dp_entry_t *dpep, void *dpbuf,
-    pollcache_t *pcp, nfds_t nfds, int *fdcntp)
+dp_pcache_poll(dp_entry_t *dpep, void *dpbuf, pollcache_t *pcp, nfds_t nfds,
+    int *fdcntp)
 {
-	int start, ostart, end;
-	int fdcnt, fd;
-	boolean_t done;
-	file_t *fp;
-	short revent;
-	boolean_t no_wrap;
-	pollhead_t *php;
-	polldat_t *pdp;
+	int start, ostart, end, fdcnt, error = 0;
+	boolean_t done, no_wrap;
 	pollfd_t *pfdp;
 	epoll_event_t *epoll;
-	int error = 0;
-	short mask = POLLRDHUP | POLLWRBAND;
-	boolean_t is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;
+	const short mask = POLLRDHUP | POLLWRBAND;
+	const boolean_t is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;
 
 	ASSERT(MUTEX_HELD(&pcp->pc_lock));
 	if (pcp->pc_bitmap == NULL) {
-		/*
-		 * No need to search because no poll fd
-		 * has been cached.
-		 */
-		return (error);
+		/* No need to search because no poll fd has been cached. */
+		return (0);
 	}
 
 	if (is_epoll) {
@@ -281,7 +271,6 @@ dp_pcache_poll(dp_entry_t *dpep, void *dpbuf,
 retry:
 	start = ostart = pcp->pc_mapstart;
 	end = pcp->pc_mapend;
-	php = NULL;
 
 	if (start == 0) {
 		/*
@@ -294,8 +283,11 @@ retry:
 	done = B_FALSE;
 	fdcnt = 0;
 	while ((fdcnt < nfds) && !done) {
-		php = NULL;
-		revent = 0;
+		pollhead_t *php = NULL;
+		short revent = 0;
+		uf_entry_gen_t gen;
+		int fd;
+
 		/*
 		 * Examine the bit map in a circular fashion
 		 * to avoid starvation. Always resume from
@@ -305,6 +297,9 @@ retry:
 		fd = bt_getlowbit(pcp->pc_bitmap, start, end);
 		ASSERT(fd <= end);
 		if (fd >= 0) {
+			file_t *fp;
+			polldat_t *pdp;
+
 			if (fd == end) {
 				if (no_wrap) {
 					done = B_TRUE;
@@ -328,28 +323,14 @@ repoll:
 				 */
 				continue;
 			}
-			if ((fp = getf(fd)) == NULL) {
-				/*
-				 * The fd has been closed, but user has not
-				 * done a POLLREMOVE on this fd yet. Instead
-				 * of cleaning it here implicitly, we return
-				 * POLLNVAL. This is consistent with poll(2)
-				 * polling a closed fd. Hope this will remind
-				 * user to do a POLLREMOVE.
-				 */
-				if (!is_epoll && pfdp != NULL) {
-					pfdp[fdcnt].fd = fd;
-					pfdp[fdcnt].revents = POLLNVAL;
-					fdcnt++;
-					continue;
-				}
-
-				/*
-				 * In the epoll compatibility case, we actually
-				 * perform the implicit removal to remain
-				 * closer to the epoll semantics.
-				 */
+			if ((fp = getf_gen(fd, &gen)) == NULL) {
 				if (is_epoll) {
+					/*
+					 * In the epoll compatibility case, we
+					 * actually perform the implicit
+					 * removal to remain closer to the
+					 * epoll semantics.
+					 */
 					pdp->pd_fp = NULL;
 					pdp->pd_events = 0;
 
@@ -360,27 +341,71 @@ repoll:
 					}
 
 					BT_CLEAR(pcp->pc_bitmap, fd);
-					continue;
+				} else if (pfdp != NULL) {
+					/*
+					 * The fd has been closed, but user has
+					 * not done a POLLREMOVE on this fd
+					 * yet. Instead of cleaning it here
+					 * implicitly, we return POLLNVAL. This
+					 * is consistent with poll(2) polling a
+					 * closed fd. Hope this will remind
+					 * user to do a POLLREMOVE.
+					 */
+					pfdp[fdcnt].fd = fd;
+					pfdp[fdcnt].revents = POLLNVAL;
+					fdcnt++;
 				}
+				continue;
 			}
 
-			if (fp != pdp->pd_fp) {
+			/*
+			 * Detect a change to the resource underlying a cached
+			 * file descriptor. While the fd generation comparison
+			 * will catch nearly all cases, the file_t comparison
+			 * is maintained as a failsafe as well.
+			 */
+			if (gen != pdp->pd_gen || fp != pdp->pd_fp) {
 				/*
-				 * user is polling on a cached fd which was
-				 * closed and then reused. Unfortunately
-				 * there is no good way to inform user.
-				 * If the file struct is also reused, we
-				 * may not be able to detect the fd reuse
-				 * at all. As long as this does not
-				 * cause system failure and/or memory leak,
-				 * we will play along. Man page states if
-				 * user does not clean up closed fds, polling
-				 * results will be indeterministic.
+				 * The user is polling on a cached fd which was
+				 * closed and then reused. Unfortunately there
+				 * is no good way to communicate this fact to
+				 * the consumer.
 				 *
-				 * XXX - perhaps log the detection of fd
-				 * reuse?
+				 * When this situation has been detected, it's
+				 * likely that any existing pollhead is
+				 * ill-suited to perform proper wake-ups.
+				 *
+				 * Clean up the old entry under the expectation
+				 * that a valid one will be provided as part of
+				 * the later VOP_POLL.
+				 */
+				if (pdp->pd_php != NULL) {
+					pollhead_delete(pdp->pd_php, pdp);
+					pdp->pd_php = NULL;
+				}
+
+				/*
+				 * Since epoll is expected to act on the
+				 * underlying 'struct file' (in Linux terms,
+				 * our vnode_t would be a closer analog) rather
+				 * than the fd itself, an implicit remove
+				 * is necessary under these circumstances to
+				 * suppress any results (or errors) from the
+				 * new resource occupying the fd.
 				 */
-				pdp->pd_fp = fp;
+				if (is_epoll) {
+					pdp->pd_fp = NULL;
+					pdp->pd_events = 0;
+					BT_CLEAR(pcp->pc_bitmap, fd);
+					releasef(fd);
+					continue;
+				} else {
+					/*
+					 * Regular /dev/poll is unbothered
+					 * about the fd reassignment.
+					 */
+					pdp->pd_fp = fp;
+				}
 			}
 			/*
 			 * XXX - pollrelock() logic needs to know which
@@ -396,6 +421,27 @@ repoll:
 			curthread->t_pollcache = pcp;
 			error = VOP_POLL(fp->f_vnode, pdp->pd_events, 0,
 			    &revent, &php, NULL);
+
+			/*
+			 * Recheck edge-triggered descriptors which lack a
+			 * pollhead. While this check is performed when an fd
+			 * is added to the pollcache in dpwrite(), subsequent
+			 * descriptor manipulation could cause a different
+			 * resource to be present now.
+			 */
+			if ((pdp->pd_events & POLLET) && error == 0 &&
+			    pdp->pd_php == NULL && php == NULL && revent != 0) {
+				short levent = 0;
+
+				/*
+				 * The same POLLET-only VOP_POLL is used in an
+				 * attempt to coax a pollhead from older
+				 * driver logic.
+				 */
+				error = VOP_POLL(fp->f_vnode, POLLET,
+				    0, &levent, &php, NULL);
+			}
+
 			curthread->t_pollcache = NULL;
 			releasef(fd);
 			if (error != 0) {
@@ -431,6 +477,16 @@ repoll:
 					ep->data.u64 = pdp->pd_epolldata;
 
 					/*
+					 * Since POLLNVAL is a legal event for
+					 * VOP_POLL handlers to emit, it must
+					 * be translated to be epoll-legal.
+					 */
+					if (revent & POLLNVAL) {
+						revent &= ~POLLNVAL;
+						revent |= POLLERR;
+					}
+
+					/*
 					 * If any of the event bits are set for
 					 * which poll and epoll representations
 					 * differ, swizzle in the native epoll
@@ -488,19 +544,12 @@ repoll:
 				}
 			}
 
-			/*
-			 * If POLLET is set, clear the bit in the
-			 * bitmap -- which effectively latches the
-			 * edge on a pollwakeup() from the driver.
-			 */
-			if (pdp->pd_events & POLLET)
-				BT_CLEAR(pcp->pc_bitmap, fd);
-
-			/*
-			 * If POLLONESHOT is set, perform the implicit
-			 * POLLREMOVE.
-			 */
+			/* Handle special polling modes. */
 			if (pdp->pd_events & POLLONESHOT) {
+				/*
+				 * If POLLONESHOT is set, perform the
+				 * implicit POLLREMOVE.
+				 */
 				pdp->pd_fp = NULL;
 				pdp->pd_events = 0;
 
@@ -511,6 +560,28 @@ repoll:
 				}
 
 				BT_CLEAR(pcp->pc_bitmap, fd);
+			} else if (pdp->pd_events & POLLET) {
+				/*
+				 * Wire up the pollhead which should
+				 * have been provided. Edge-triggered
+				 * polling cannot function properly
+				 * with drivers which do not emit one.
+				 */
+				if (php != NULL &&
+				    pdp->pd_php == NULL) {
+					pollhead_insert(php, pdp);
+					pdp->pd_php = php;
+				}
+
+				/*
+				 * If the driver has emitted a pollhead,
+				 * clear the bit in the bitmap which
+				 * effectively latches the edge on a
+				 * pollwakeup() from the driver.
+				 */
+				if (pdp->pd_php != NULL) {
+					BT_CLEAR(pcp->pc_bitmap, fd);
+				}
 			}
 
 			fdcnt++;
@@ -639,14 +710,10 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
 	pollfd_t *pollfdp, *pfdp;
 	dvpoll_epollfd_t *epfdp;
 	uintptr_t limit;
-	int error, size;
-	ssize_t uiosize;
-	size_t copysize;
+	int error;
+	uint_t size;
+	size_t copysize, uiosize;
 	nfds_t pollfdnum;
-	struct pollhead *php = NULL;
-	polldat_t *pdp;
-	int fd;
-	file_t *fp;
 	boolean_t is_epoll, fds_added = B_FALSE;
 
 	minor = getminor(dev);
@@ -671,10 +738,27 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
 		pcp->pc_pid = curproc->p_pid;
 	}
 
-	uiosize = uiop->uio_resid;
+	if (uiop->uio_resid < 0) {
+		/* No one else is this careful, but maybe they should be. */
+		return (EINVAL);
+	}
+
+	uiosize = (size_t)uiop->uio_resid;
 	pollfdnum = uiosize / size;
+
 	/*
+	 * For epoll-enabled handles, restrict the allowed write size to 2.
+	 * This corresponds to an epoll_ctl(3C) performing an EPOLL_CTL_MOD
+	 * operation which is expanded into two operations (DEL and ADD).
+	 *
+	 * All other operations performed through epoll_ctl(3C) will consist of
+	 * a single entry.
+	 */
+	if (is_epoll && pollfdnum > 2) {
+		return (EINVAL);
+	}
+
+	/*
 	 * We want to make sure that pollfdnum isn't large enough to DoS us,
 	 * but we also don't want to grab p_lock unnecessarily -- so we
 	 * perform the full check against our resource limits if and only if
@@ -733,6 +817,21 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
 	while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0) {
 		ASSERT(dpep->dpe_refcnt != 0);
 
+		/*
+		 * The epoll API does not allow EINTR as a result when making
+		 * modifications to the set of polled fds. Given that write
+		 * activity is relatively quick and the size of accepted writes
+		 * is limited above to two entries, a signal-ignorant wait is
+		 * used here to avoid the EINTR.
+		 */
+		if (is_epoll) {
+			cv_wait(&dpep->dpe_cv, &dpep->dpe_lock);
+			continue;
+		}
+
+		/*
+		 * Non-epoll writers to /dev/poll handles can tolerate EINTR.
+		 */
 		if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) {
 			dpep->dpe_writerwait--;
 			mutex_exit(&dpep->dpe_lock);
@@ -767,7 +866,9 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
 	}
 	for (pfdp = pollfdp; (uintptr_t)pfdp < limit;
 	    pfdp = (pollfd_t *)((uintptr_t)pfdp + size)) {
-		fd = pfdp->fd;
+		int fd = pfdp->fd;
+		polldat_t *pdp;
+
 		if ((uint_t)fd >= P_FINFO(curproc)->fi_nfiles) {
 			/*
 			 * epoll semantics demand that we return EBADF if our
@@ -783,78 +884,61 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
 
 		pdp = pcache_lookup_fd(pcp, fd);
 		if (pfdp->events != POLLREMOVE) {
+			uf_entry_gen_t gen;
+			file_t *fp = NULL;
+			struct pollhead *php = NULL;
 
-			fp = NULL;
-
-			if (pdp == NULL) {
-				/*
-				 * If we're in epoll compatibility mode, check
-				 * that the fd is valid before allocating
-				 * anything for it; epoll semantics demand that
-				 * we return EBADF if our specified fd is
-				 * invalid.
-				 */
-				if (is_epoll) {
-					if ((fp = getf(fd)) == NULL) {
-						error = EBADF;
-						break;
-					}
+			/*
+			 * If we're in epoll compatibility mode, check that the
+			 * fd is valid before allocating anything for it; epoll
+			 * semantics demand that we return EBADF if our
+			 * specified fd is invalid.
+			 */
+			if (is_epoll) {
+				if ((fp = getf_gen(fd, &gen)) == NULL) {
					error = EBADF;
+					break;
 				}
-
+			}
+			if (pdp == NULL) {
 				pdp = pcache_alloc_fd(0);
 				pdp->pd_fd = fd;
 				pdp->pd_pcache = pcp;
 				pcache_insert_fd(pcp, pdp, pollfdnum);
-			} else {
+			}
+
+			if (is_epoll) {
 				/*
-				 * epoll semantics demand that we error out if
-				 * a file descriptor is added twice, which we
-				 * check (imperfectly) by checking if we both
-				 * have the file descriptor cached and the
-				 * file pointer that correponds to the file
-				 * descriptor matches our cached value. If
-				 * there is a pointer mismatch, the file
-				 * descriptor was closed without being removed.
-				 * The converse is clearly not true, however,
-				 * so to narrow the window by which a spurious
-				 * EEXIST may be returned, we also check if
-				 * this fp has been added to an epoll control
-				 * descriptor in the past; if it hasn't, we
-				 * know that this is due to fp reuse -- it's
-				 * not a true EEXIST case. (By performing this
-				 * additional check, we limit the window of
-				 * spurious EEXIST to situations where a single
-				 * file descriptor is being used across two or
-				 * more epoll control descriptors -- and even
-				 * then, the file descriptor must be closed and
-				 * reused in a relatively tight time span.)
+				 * If the fd is already a member of the epoll
+				 * set, error emission is needed only when the
+				 * fd assignment generation matches the one
+				 * recorded in the polldat_t. Absence of such
+				 * a generation match indicates that a new
+				 * resource has been assigned at that fd.
+				 *
+				 * Caveat: It is possible to force a generation
+				 * update while keeping the same backing
+				 * resource. This is possible via dup2, but
+				 * does not represent real-world use cases,
+				 * making the lack of error acceptable.
 				 */
-				if (is_epoll) {
-					if (pdp->pd_fp != NULL &&
-					    (fp = getf(fd)) != NULL &&
-					    fp == pdp->pd_fp &&
-					    (fp->f_flag2 & FEPOLLED)) {
-						error = EEXIST;
-						releasef(fd);
-						break;
-					}
-
-					/*
-					 * We have decided that the cached
-					 * information was stale: it either
-					 * didn't match, or the fp had never
-					 * actually been epoll()'d on before.
-					 * We need to now clear our pd_events
-					 * to assure that we don't mistakenly
-					 * operate on cached event disposition.
-					 */
-					pdp->pd_events = 0;
+				if (pdp->pd_fp != NULL && pdp->pd_gen == gen) {
+					error = EEXIST;
+					releasef(fd);
+					break;
 				}
-			}
 
-			if (is_epoll) {
+				/*
+				 * We have decided that the cached information
+				 * was stale. Clear pd_events to assure that
+				 * we don't mistakenly operate on cached event
+				 * disposition.
+				 */
+				pdp->pd_events = 0;
+
 				epfdp = (dvpoll_epollfd_t *)pfdp;
 				pdp->pd_epolldata = epfdp->dpep_data;
+			}
 
 			ASSERT(pdp->pd_fd == fd);
@@ -867,39 +951,36 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
 			if (fd > pcp->pc_mapend) {
 				pcp->pc_mapend = fd;
 			}
-			if (fp == NULL && (fp = getf(fd)) == NULL) {
-				/*
-				 * The fd is not valid. Since we can't pass
-				 * this error back in the write() call, set
-				 * the bit in bitmap to force DP_POLL ioctl
-				 * to examine it.
-				 */
-				BT_SET(pcp->pc_bitmap, fd);
-				pdp->pd_events |= pfdp->events;
-				continue;
-			}
 
-			/*
-			 * To (greatly) reduce EEXIST false positives, we
-			 * denote that this fp has been epoll()'d. We do this
-			 * regardless of epoll compatibility mode, as the flag
-			 * is harmless if not in epoll compatibility mode.
-			 */
-			fp->f_flag2 |= FEPOLLED;
+			if (!is_epoll) {
+				ASSERT(fp == NULL);
 
-			/*
-			 * Don't do VOP_POLL for an already cached fd with
-			 * same poll events.
-			 */
-			if ((pdp->pd_events == pfdp->events) &&
-			    (pdp->pd_fp == fp)) {
+				if ((fp = getf_gen(fd, &gen)) == NULL) {
+					/*
+					 * The fd is not valid. Since we can't
+					 * pass this error back in the write()
+					 * call, set the bit in bitmap to force
+					 * DP_POLL ioctl to examine it.
+					 */
+					BT_SET(pcp->pc_bitmap, fd);
+					pdp->pd_events |= pfdp->events;
+					continue;
+				}
+
 				/*
-				 * the events are already cached
+				 * Don't do VOP_POLL for an already cached fd
+				 * with same poll events.
 				 */
-				releasef(fd);
-				continue;
+				if ((pdp->pd_events == pfdp->events) &&
+				    (pdp->pd_fp == fp)) {
+					/*
+					 * the events are already cached
+					 */
+					releasef(fd);
+					continue;
+				}
 			}
+
 			/*
 			 * do VOP_POLL and cache this poll fd.
 			 */
@@ -917,6 +998,32 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
 			curthread->t_pollcache = pcp;
 			error = VOP_POLL(fp->f_vnode, pfdp->events, 0,
 			    &pfdp->revents, &php, NULL);
+
+			/*
+			 * Edge-triggered polling requires a pollhead in order
+			 * to initiate wake-ups properly. Drivers which are
+			 * savvy to POLLET presence, which should include
+			 * everything in-gate, will always emit one, regardless
+			 * of revent status. Older drivers which only emit a
+			 * pollhead if 'revents == 0' are given a second chance
+			 * here via a second VOP_POLL, with only POLLET set in
+			 * the events of interest. These circumstances should
+			 * induce any cacheable drivers to emit a pollhead for
+			 * wake-ups.
+			 *
+			 * Drivers which never emit a pollhead will simply
+			 * disobey the expectation of edge-triggered behavior.
+			 * This includes recursive epoll which, even on Linux,
+			 * yields its events in a level-triggered fashion only.
+			 */
+			if ((pdp->pd_events & POLLET) && error == 0 &&
+			    php == NULL) {
+				short levent = 0;
+
+				error = VOP_POLL(fp->f_vnode, POLLET, 0,
+				    &levent, &php, NULL);
+			}
+
 			curthread->t_pollcache = NULL;
 			/*
 			 * We always set the bit when this fd is cached;
@@ -931,6 +1038,7 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
 				break;
 			}
 			pdp->pd_fp = fp;
+			pdp->pd_gen = gen;
 			pdp->pd_events |= pfdp->events;
 			if (php != NULL) {
 				if (pdp->pd_php == NULL) {
@@ -1056,8 +1164,13 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
 		 * to turn it off for a particular open.
 		 */
 		dpep->dpe_flag |= DP_ISEPOLLCOMPAT;
-		mutex_exit(&dpep->dpe_lock);
 
+		/* Record the epoll-enabled nature in the pollcache too */
+		mutex_enter(&pcp->pc_lock);
+		pcp->pc_flag |= PC_EPOLL;
+		mutex_exit(&pcp->pc_lock);
+
+		mutex_exit(&dpep->dpe_lock);
 		return (0);
 	}
@@ -1458,9 +1571,21 @@ dppoll(dev_t dev, short events, int anyyet, short *reventsp,
 		int fdcnt = 0;
 		pollstate_t *ps = curthread->t_pollstate;
 
-		rc = dp_pcache_poll(dpep, NULL, pcp, nfds, &fdcnt);
-		if (rc == 0) {
-			*reventsp = (fdcnt > 0) ? POLLIN : 0;
+		/*
+		 * Recursive polling will only emit certain events. Skip a
+		 * scan of the pollcache if those events are not of interest.
+		 */
+		if (events & (POLLIN|POLLRDNORM)) {
+			rc = dp_pcache_poll(dpep, NULL, pcp, nfds, &fdcnt);
+		} else {
+			rc = 0;
+			fdcnt = 0;
+		}
+
+		if (rc == 0 && fdcnt > 0) {
+			*reventsp = POLLIN|POLLRDNORM;
+		} else {
+			*reventsp = 0;
 		}
 		pcachelink_assoc(pcp, ps->ps_pc_stack[0]);
 		pollstate_exit(pcp);
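For context on the interfaces this change exercises, below is a minimal sketch of the classic /dev/poll consumer pattern documented in poll(7D): interest is registered by write()ing pollfd entries to the handle (handled by dpwrite() above), and readiness is harvested with the DP_POLL ioctl (which drives dp_pcache_poll()). The function name watch_one() is illustrative only, and error handling is elided.

/*
 * Minimal /dev/poll consumer, per poll(7D).  A sketch for illustration,
 * not part of this change.
 */
#include <sys/devpoll.h>
#include <poll.h>
#include <fcntl.h>
#include <unistd.h>

void
watch_one(int fd)
{
	int dpfd = open("/dev/poll", O_RDWR);
	pollfd_t pfd, result;
	dvpoll_t dvp;

	/* Register interest in fd; this write lands in dpwrite(). */
	pfd.fd = fd;
	pfd.events = POLLIN;
	(void) write(dpfd, &pfd, sizeof (pfd));

	/* Harvest readiness; DP_POLL drives dp_pcache_poll(). */
	dvp.dp_fds = &result;
	dvp.dp_nfds = 1;
	dvp.dp_timeout = -1;	/* wait indefinitely */
	if (ioctl(dpfd, DP_POLL, &dvp) == 1 &&
	    (result.revents & POLLNVAL) != 0) {
		/*
		 * The fd was closed without a POLLREMOVE -- the situation
		 * the non-epoll branch of dp_pcache_poll() reports.  Clean
		 * it up explicitly, as the kernel comment suggests.
		 */
		pfd.events = POLLREMOVE;
		(void) write(dpfd, &pfd, sizeof (pfd));
	}
	(void) close(dpfd);
}

The epoll compatibility mode reuses this same write()/ioctl() plumbing, which is why the POLLNVAL and fd-reuse handling above must diverge between the two personalities.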

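The new two-entry ceiling in dpwrite() is sized to the largest request the epoll emulation needs to issue. The following is a hypothetical sketch (not the actual libc implementation) of how an EPOLL_CTL_MOD decomposes into a DEL-plus-ADD pair; the dvpoll_epollfd_t layout -- a pollfd followed by dpep_data -- is visible in the diff above, while the EPOLL*-to-POLL* event-flag translation the real epoll_ctl(3C) performs is omitted here.

#include <sys/devpoll.h>
#include <sys/epoll.h>
#include <poll.h>
#include <unistd.h>

/*
 * Hypothetical illustration: an EPOLL_CTL_MOD against an epoll-enabled
 * /dev/poll handle can be expressed as two entries -- a removal followed
 * by a re-add -- which is exactly the pollfdnum > 2 limit that dpwrite()
 * now enforces for epoll handles.
 */
static int
ctl_mod_sketch(int epfd, int fd, struct epoll_event *ev)
{
	dvpoll_epollfd_t ops[2];

	ops[0].dpep_pollfd.fd = fd;
	ops[0].dpep_pollfd.events = POLLREMOVE;	/* the DEL half */
	ops[0].dpep_data = 0;

	ops[1].dpep_pollfd.fd = fd;
	/* The real code maps EPOLL* event flags onto POLL* values here. */
	ops[1].dpep_pollfd.events = (short)ev->events;	/* the ADD half */
	ops[1].dpep_data = ev->data.u64;

	if (write(epfd, ops, sizeof (ops)) != (ssize_t)sizeof (ops))
		return (-1);
	return (0);
}

Single-entry operations (plain ADD or DEL) stay under the limit, so only MOD needed the two-entry allowance; anything larger is rejected with EINVAL before the signal-ignorant cv_wait() path is ever reached.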