diff options
author | Bryan Cantrill <bryan@joyent.com> | 2014-04-23 07:41:10 +0000 |
---|---|---|
committer | Bryan Cantrill <bryan@joyent.com> | 2014-04-23 07:44:59 +0000 |
commit | 7f98ab2316da18bdec16f363a24fffa33594b4f2 (patch) | |
tree | 03b60e0981e86e005101b073f9b0a537bada1e31 /usr/src | |
parent | e1bd8a8a019df3353d0acd7f92dd710f887ef90b (diff) | |
download | illumos-joyent-7f98ab2316da18bdec16f363a24fffa33594b4f2.tar.gz |
OS-2893 add support for epoll
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Diffstat (limited to 'usr/src')
32 files changed, 1421 insertions, 99 deletions
diff --git a/usr/src/cmd/truss/codes.c b/usr/src/cmd/truss/codes.c index 069268dc05..4e453da0c1 100644 --- a/usr/src/cmd/truss/codes.c +++ b/usr/src/cmd/truss/codes.c @@ -23,7 +23,7 @@ * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2014, OmniTI Computer Consulting, Inc. All rights reserved. */ @@ -703,6 +703,8 @@ const struct ioc { /* /dev/poll ioctl() control codes */ { (uint_t)DP_POLL, "DP_POLL", NULL }, { (uint_t)DP_ISPOLLED, "DP_ISPOLLED", NULL }, + { (uint_t)DP_PPOLL, "DP_PPOLL", NULL }, + { (uint_t)DP_EPOLLCOMPAT, "DP_EPOLLCOMPAT", NULL }, /* the old /proc ioctl() control codes */ #define PIOC ('q'<<8) { (uint_t)(PIOC|1), "PIOCSTATUS", NULL }, diff --git a/usr/src/lib/libc/amd64/Makefile b/usr/src/lib/libc/amd64/Makefile index 873c2ded87..cec30a4fbd 100644 --- a/usr/src/lib/libc/amd64/Makefile +++ b/usr/src/lib/libc/amd64/Makefile @@ -20,7 +20,7 @@ # # # Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. -# Copyright (c) 2012, Joyent, Inc. All rights reserved. +# Copyright (c) 2014, Joyent, Inc. All rights reserved. # # Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. # Copyright 2011 Nexenta Systems, Inc. All rights reserved. @@ -851,6 +851,7 @@ PORTSYS= \ chmod.o \ chown.o \ corectl.o \ + epoll.o \ exacctsys.o \ execl.o \ execle.o \ diff --git a/usr/src/lib/libc/i386/Makefile.com b/usr/src/lib/libc/i386/Makefile.com index d91540d0ae..ddbed44735 100644 --- a/usr/src/lib/libc/i386/Makefile.com +++ b/usr/src/lib/libc/i386/Makefile.com @@ -20,9 +20,8 @@ # # # Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. -# Copyright (c) 2013, Joyent, Inc. All rights reserved. +# Copyright (c) 2014, Joyent, Inc. 
All rights reserved. # Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. -# Copyright (c) 2012, Joyent, Inc. All rights reserved. # # Copyright 2011 Nexenta Systems, Inc. All rights reserved. # Use is subject to license terms. @@ -890,6 +889,7 @@ PORTSYS= \ chmod.o \ chown.o \ corectl.o \ + epoll.o \ exacctsys.o \ execl.o \ execle.o \ diff --git a/usr/src/lib/libc/port/mapfile-vers b/usr/src/lib/libc/port/mapfile-vers index 86adb85d94..c08f234962 100644 --- a/usr/src/lib/libc/port/mapfile-vers +++ b/usr/src/lib/libc/port/mapfile-vers @@ -24,9 +24,8 @@ # Copyright 2010 Nexenta Systems, Inc. All rights reserved. # Use is subject to license terms. # -# Copyright (c) 2012, Joyent, Inc. All rights reserved. +# Copyright (c) 2014, Joyent, Inc. All rights reserved. # Copyright (c) 2012 by Delphix. All rights reserved. -# Copyright (c) 2012, Joyent, Inc. All rights reserved. # Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. # Copyright (c) 2013 Gary Mills # @@ -2721,6 +2720,11 @@ $endif _dgettext; _doprnt; _doscan; + epoll_create; + epoll_create1; + epoll_ctl; + epoll_wait; + epoll_pwait; _errfp; _errxfp; exportfs; diff --git a/usr/src/lib/libc/port/sys/epoll.c b/usr/src/lib/libc/port/sys/epoll.c new file mode 100644 index 0000000000..d90b625293 --- /dev/null +++ b/usr/src/lib/libc/port/sys/epoll.c @@ -0,0 +1,207 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/epoll.h> +#include <sys/devpoll.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <poll.h> + +/* + * Events that match their epoll(7) equivalents. + */ +#if EPOLLIN != POLLIN +#error value of EPOLLIN does not match value of POLLIN +#endif + +#if EPOLLPRI != POLLPRI +#error value of EPOLLPRI does not match value of POLLPRI +#endif + +#if EPOLLOUT != POLLOUT +#error value of EPOLLOUT does not match value of POLLOUT +#endif + +#if EPOLLRDNORM != POLLRDNORM +#error value of EPOLLRDNORM does not match value of POLLRDNORM +#endif + +#if EPOLLRDBAND != POLLRDBAND +#error value of EPOLLRDBAND does not match value of POLLRDBAND +#endif + +#if EPOLLERR != POLLERR +#error value of EPOLLERR does not match value of POLLERR +#endif + +#if EPOLLHUP != POLLHUP +#error value of EPOLLHUP does not match value of POLLHUP +#endif + +/* + * Events that we ignore entirely. They can be set in events, but they will + * never be returned. + */ +#define EPOLLIGNORED (EPOLLMSG | EPOLLWAKEUP) + +/* + * Events that we swizzle into other bit positions. + */ +#define EPOLLSWIZZLED \ + (EPOLLRDHUP | EPOLLONESHOT | EPOLLET | EPOLLWRBAND | EPOLLWRNORM) + +int +epoll_create(int size) +{ + int fd; + + /* + * From the epoll_create() man page: "Since Linux 2.6.8, the size + * argument is ignored, but must be greater than zero." You keep using + * that word "ignored"... 
+ */ + if (size <= 0) { + errno = EINVAL; + return (-1); + } + + if ((fd = open("/dev/poll", O_RDWR)) == -1) + return (-1); + + if (ioctl(fd, DP_EPOLLCOMPAT, 0) == -1) { + (void) close(fd); + return (-1); + } + + return (fd); +} + +int +epoll_create1(int flags) +{ + int fd, oflags = O_RDWR; + + if (flags & EPOLL_CLOEXEC) + oflags |= O_CLOEXEC; + + if ((fd = open("/dev/poll", oflags)) == -1) + return (-1); + + if (ioctl(fd, DP_EPOLLCOMPAT, 0) == -1) { + (void) close(fd); + return (-1); + } + + return (fd); +} + +int +epoll_ctl(int epfd, int op, int fd, struct epoll_event *event) +{ + dvpoll_epollfd_t epoll[2]; + uint32_t events = event->events; + uint32_t ev; + int i = 0; + + epoll[i].dpep_pollfd.fd = fd; + + switch (op) { + case EPOLL_CTL_DEL: + ev = POLLREMOVE; + break; + + case EPOLL_CTL_MOD: + /* + * In the modify case, we pass down two events: one to + * remove the event and another to add it back. + */ + epoll[i++].dpep_pollfd.events = POLLREMOVE; + epoll[i].dpep_pollfd.fd = fd; + /* FALLTHROUGH */ + + case EPOLL_CTL_ADD: + /* + * Mask off the events that we ignore, and then swizzle the + * events for which our values differ from their epoll(7) + * equivalents. + */ + ev = events & ~(EPOLLIGNORED | EPOLLSWIZZLED); + + if (events & EPOLLRDHUP) + ev |= POLLRDHUP; + + if (events & EPOLLET) + ev |= POLLET; + + if (events & EPOLLONESHOT) + ev |= POLLONESHOT; + + if (events & EPOLLWRNORM) + ev |= POLLWRNORM; + + if (events & EPOLLWRBAND) + ev |= POLLWRBAND; + + epoll[i].dpep_data = event->data.u64; + break; + + default: + errno = EOPNOTSUPP; + return (-1); + } + + epoll[i].dpep_pollfd.events = ev; + + return (write(epfd, epoll, sizeof (epoll[0]) * (i + 1)) == -1 ? 
-1 : 0); +} + +int +epoll_wait(int epfd, struct epoll_event *events, + int maxevents, int timeout) +{ + struct dvpoll arg; + + if (maxevents <= 0) { + errno = EINVAL; + return (-1); + } + + arg.dp_nfds = maxevents; + arg.dp_timeout = timeout; + arg.dp_fds = (pollfd_t *)events; + + return (ioctl(epfd, DP_POLL, &arg)); +} + +int +epoll_pwait(int epfd, struct epoll_event *events, + int maxevents, int timeout, const sigset_t *sigmask) +{ + struct dvpoll arg; + + if (maxevents <= 0) { + errno = EINVAL; + return (-1); + } + + arg.dp_nfds = maxevents; + arg.dp_timeout = timeout; + arg.dp_fds = (pollfd_t *)events; + arg.dp_setp = (sigset_t *)sigmask; + + return (ioctl(epfd, DP_PPOLL, &arg)); +} diff --git a/usr/src/lib/libc/sparc/Makefile.com b/usr/src/lib/libc/sparc/Makefile.com index c8779479d3..34d35699d0 100644 --- a/usr/src/lib/libc/sparc/Makefile.com +++ b/usr/src/lib/libc/sparc/Makefile.com @@ -20,9 +20,8 @@ # # # Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. -# Copyright (c) 2012, Joyent, Inc. All rights reserved. # Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. -# Copyright (c) 2012, Joyent, Inc. All rights reserved. +# Copyright (c) 2014, Joyent, Inc. All rights reserved. # # Copyright 2011 Nexenta Systems, Inc. All rights reserved. # Use is subject to license terms. @@ -925,6 +924,7 @@ PORTSYS= \ chmod.o \ chown.o \ corectl.o \ + epoll.o \ exacctsys.o \ execl.o \ execle.o \ diff --git a/usr/src/lib/libc/sparcv9/Makefile.com b/usr/src/lib/libc/sparcv9/Makefile.com index 7f4a6e4e38..65ac2dedc7 100644 --- a/usr/src/lib/libc/sparcv9/Makefile.com +++ b/usr/src/lib/libc/sparcv9/Makefile.com @@ -20,9 +20,8 @@ # # # Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. -# Copyright (c) 2012, Joyent, Inc. All rights reserved. # Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. -# Copyright (c) 2012, Joyent, Inc. All rights reserved. +# Copyright (c) 2014, Joyent, Inc. 
All rights reserved. # # Copyright 2011 Nexenta Systems, Inc. All rights reserved. # Use is subject to license terms. @@ -871,6 +870,7 @@ PORTSYS= \ chmod.o \ chown.o \ corectl.o \ + epoll.o \ exacctsys.o \ execl.o \ execle.o \ diff --git a/usr/src/man/man3c/Makefile b/usr/src/man/man3c/Makefile index f3b6f18a7f..3056f2c923 100644 --- a/usr/src/man/man3c/Makefile +++ b/usr/src/man/man3c/Makefile @@ -13,6 +13,7 @@ # Copyright 2011, Richard Lowe # Copyright 2013 Nexenta Systems, Inc. All rights reserved. # Copyright 2013, OmniTI Computer Consulting, Inc. All rights reserved. +# Copyright (c) 2014, Joyent, Inc. All rights reserved. # include $(SRC)/Makefile.master @@ -106,6 +107,9 @@ MANFILES= __fbufsize.3c \ enable_extended_FILE_stdio.3c \ encrypt.3c \ end.3c \ + epoll_create.3c \ + epoll_ctl.3c \ + epoll_wait.3c \ err.3c \ euclen.3c \ exit.3c \ @@ -706,6 +710,8 @@ MANLINKS= FD_CLR.3c \ endusershell.3c \ endutent.3c \ endutxent.3c \ + epoll_create1.3c \ + epoll_pwait.3c \ erand48.3c \ errno.3c \ errx.3c \ @@ -1435,6 +1441,9 @@ _etext.3c := LINKSRC = end.3c edata.3c := LINKSRC = end.3c etext.3c := LINKSRC = end.3c +epoll_create1.3c := LINKSRC = epoll_create.3c +epoll_pwait.3c := LINKSRC = epoll_wait.3c + errx.3c := LINKSRC = err.3c verr.3c := LINKSRC = err.3c verrx.3c := LINKSRC = err.3c diff --git a/usr/src/man/man3c/epoll_create.3c b/usr/src/man/man3c/epoll_create.3c new file mode 100644 index 0000000000..87d0a85c38 --- /dev/null +++ b/usr/src/man/man3c/epoll_create.3c @@ -0,0 +1,106 @@ +'\" te +.\" Copyright (c) 2014, Joyent, Inc. All Rights Reserved. +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. 
+.TH EPOLL_CREATE 3C "Apr 17, 2014" +.SH NAME +epoll_create, epoll_create1 \- create an epoll instance +.SH SYNOPSIS + +.LP +.nf +#include <sys/epoll.h> + +\fBint\fR \fBepoll_create\fR(\fBint\fR \fIsize\fR); +.fi + +.LP +.nf +\fBint\fR \fBepoll_create1\fR(\fBint\fR \fIflags\fR); +.fi + +.SH DESCRIPTION +.sp +.LP +The \fBepoll_create()\fR and \fBepoll_create1()\fR functions both create an +\fBepoll\fR(5) instance that can be operated upon via \fBepoll_ctl\fR(3C), +\fBepoll_wait\fR(3C) and \fBepoll_pwait\fR(3C). \fBepoll\fR instances are +represented as file descriptors, and should be closed via \fBclose\fR(2). + +The only difference between the two functions is their signature; +\fBepoll_create()\fR takes a size argument that +is vestigial and is only meaningful in as much as it must be greater than +zero, while \fBepoll_create1()\fR takes a flags argument that can have +any of the following values: + +.sp +.ne 2 +.na +\fBEPOLL_CLOEXEC\fR +.ad +.RS 12n +Instance should be closed upon an +\fBexec\fR(2); see \fBopen\fR(2)'s description of \fBO_CLOEXEC\fR. +.RE + +.SH RETURN VALUES +.sp +.LP +Upon successful completion, a file descriptor is returned. Otherwise, -1 is returned and errno +is set to indicate the error. +.SH ERRORS +.sp +.LP +The \fBepoll_create()\fR and \fBepoll_create1()\fR functions will fail if: +.sp +.ne 2 +.na +\fB\fBEINVAL\fR\fR +.ad +.RS 10n +Either the \fIsize\fR is zero (\fBepoll_create()\fR) or the \fIflags\fR +are invalid (\fBepoll_create1()\fR). +.RE + +.sp +.ne 2 +.na +\fB\fBEMFILE\fR\fR +.ad +.RS 10n +There are currently {\fBOPEN_MAX\fR} file descriptors open in the calling +process. +.RE + +.sp +.ne 2 +.na +\fB\fBENFILE\fR\fR +.ad +.RS 10n +The maximum allowable number of files is currently open in the system.
+.RE + +.sp +.LP +.SH NOTES +.sp +.LP + +The \fBepoll\fR(5) facility is implemented for purposes of offering +compatibility for Linux-borne applications; native +applications should continue to prefer using event ports via the +\fBport_create\fR(3C), \fBport_associate\fR(3C) and \fBport_get\fR(3C) +interfaces. See \fBepoll\fR(5) for compatibility details and restrictions. +.RE + +.SH SEE ALSO +.sp +.LP +\fBepoll_ctl\fR(3C), \fBepoll_wait\fR(3C), \fBepoll\fR(5) diff --git a/usr/src/man/man3c/epoll_ctl.3c b/usr/src/man/man3c/epoll_ctl.3c new file mode 100644 index 0000000000..0cce6affa4 --- /dev/null +++ b/usr/src/man/man3c/epoll_ctl.3c @@ -0,0 +1,316 @@ +'\" te +.\" Copyright (c) 2014, Joyent, Inc. All Rights Reserved. +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. +.TH EPOLL_CTL 3C "Apr 17, 2014" +.SH NAME +epoll_ctl \- control an epoll instance +.SH SYNOPSIS + +.LP +.nf +#include <sys/epoll.h> + +\fBint\fR \fBepoll_ctl\fR(\fBint\fR \fIepfd\fR, \fBint\fR \fIop\fR, \fBint\fR \fIfd\fR, \fBstruct epoll_event *\fR\fIevent\fR); +.fi + +.SH DESCRIPTION +.sp +.LP +The \fBepoll_ctl()\fR function executes the operation specified by +\fIop\fR (as parameterized by \fIevent\fR) on the \fIepfd\fR epoll instance. +Valid values for \fIop\fR: + +.sp +.ne 2 +.na +\fBEPOLL_CTL_ADD\fR +.ad +.RS 12n +For the \fBepoll\fR(5) instance specified by \fIepfd\fR, +associate the file descriptor specified by \fIfd\fR with the event specified +by \fIevent\fR. 
+.RE + +.sp +.ne 2 +.na +\fBEPOLL_CTL_DEL\fR +.ad +.RS 12n +For the \fBepoll\fR(5) instance specified by \fIepfd\fR, +remove all event associations for the file descriptor specified by \fIfd\fR. +\fIevent\fR is ignored, and may be NULL. +.RE + +.sp +.ne 2 +.na +\fBEPOLL_CTL_MOD\fR +.ad +.RS 12n +For the \fBepoll\fR(5) instance specified by \fIepfd\fR, modify the event +association for the file descriptor specified by \fIfd\fR to be that +specified by \fIevent\fR. + +.RE + +The \fIevent\fR parameter has the following structure: + +.in +4 +.nf +typedef union epoll_data { + void *ptr; + int fd; + uint32_t u32; + uint64_t u64; +} epoll_data_t; + +struct epoll_event { + uint32_t events; + epoll_data_t data; +}; +.fi +.in -4 + +The \fIdata\fR field specifies the datum to +be associated with the event and +will be returned via \fBepoll_wait\fR(3C). +The \fIevents\fR field denotes both the desired events (when specified via +\fBepoll_ctl()\fR) and the events that have occurred (when returned via +\fBepoll_wait\fR(3C)). +In either case, the +\fIevents\fR field is a bitmask constructed by a logical \fBOR\fR operation +of any combination of the following event flags: + +.sp +.ne 2 +.na +\fBEPOLLIN\fR +.ad +.RS 14n +Data other than high priority data may be read without blocking. For streams, +this flag is set in the returned \fIevents\fR even if the message is of +zero length. +.RE + +.sp +.ne 2 +.na +\fBEPOLLPRI\fR +.ad +.RS 14n +High priority data may be read without blocking. For streams, +this flag is set in the returned \fIevents\fR even if the message is of zero +length. +.RE + +.sp +.ne 2 +.na +\fBEPOLLOUT\fR +.ad +.RS 14n +Normal data (priority band equals 0) may be written without blocking. +.RE + +.sp +.ne 2 +.na +\fBEPOLLRDNORM\fR +.ad +.RS 14n +Normal data (priority band equals 0) may be read without blocking. For streams, +this flag is set in the returned \fIevents\fR even if the message is of +zero length.
+.RE + +.sp +.ne 2 +.na +\fBEPOLLRDBAND\fR +.ad +.RS 14n +Data from a non-zero priority band may be read without blocking. For streams, +this flag is set in the returned \fIevents\fR even if the message is of +zero length. +.RE + +.sp +.ne 2 +.na +\fBEPOLLWRNORM\fR +.ad +.RS 14n +The same as \fBEPOLLOUT\fR. +.RE + +.sp +.ne 2 +.na +\fBEPOLLWRBAND\fR +.ad +.RS 14n +Priority data (priority band > 0) may be written. This event only examines +bands that have been written to at least once. +.RE + +.sp +.ne 2 +.na +\fBEPOLLMSG\fR +.ad +.RS 14n +This exists only for backwards binary and source compatibility with Linux; +it has no meaning and is ignored. +.RE + +.sp +.ne 2 +.na +\fBEPOLLERR\fR +.ad +.RS 14n +An error has occurred on the device or stream. This flag is only valid in the +returned \fIevents\fR field. +.RE + +.sp +.ne 2 +.na +\fBEPOLLHUP\fR +.ad +.RS 14n +A hangup has occurred on the stream. This event and \fBEPOLLOUT\fR are mutually +exclusive; a stream can never be writable if a hangup has occurred. However, +this event and \fBEPOLLIN\fR, \fBEPOLLRDNORM\fR, \fBEPOLLRDBAND\fR, +\fBEPOLLRDHUP\fR or +\fBEPOLLPRI\fR are not mutually exclusive. This flag is only valid in +the \fIevents\fR field returned from \fBepoll_wait\fR(3C); it is not used +in the \fIevents\fR field specified via \fBepoll_ctl()\fR. +.RE + +.sp +.ne 2 +.na +\fBEPOLLRDHUP\fR +.ad +.RS 14n +The stream socket peer shut down the writing half of the connection and no +further data will be readable via the socket. This event is not mutually +exclusive with \fBEPOLLIN\fR. +.RE + +.sp +.ne 2 +.na +\fBEPOLLWAKEUP\fR +.ad +.RS 14n +This exists only for backwards binary and source compatibility with Linux; +it has no meaning and is ignored.
+.RE + +.sp +.ne 2 +.na +\fBEPOLLONESHOT\fR +.ad +.RS 14n +Sets the specified event to be in one-shot mode, whereby the event association +with the \fBepoll\fR(5) instance specified by \fIepfd\fR is removed atomically +as the event is returned via \fBepoll_wait\fR(3C). Use of this mode allows +for resolution of some of the +races inherent in multithreaded use of \fBepoll_wait\fR(3C). +.RE + +.sp +.ne 2 +.na +\fBEPOLLET\fR +.ad +.RS 14n +Sets the specified event to be edge-triggered mode instead of the default +mode of level-triggered. In this mode, events will be induced by +transitions on an event source rather than the state of the event source. +While perhaps superficially appealing, this mode introduces several new +potential failure modes for user-level software and should be used +with caution. +.RE + +.SH RETURN VALUES +.sp +.LP +Upon successful completion, \fBepoll_ctl()\fR returns 0. +If an error occurs, -1 is returned and errno is set to indicate +the error. + +.SH ERRORS +.sp +.LP +\fBepoll_ctl()\fR will fail if: +.sp +.ne 2 +.na +\fB\fBEBADF\fR\fR +.ad +.RS 10n +\fIepfd\fR is not a valid file descriptor. +.RE + +.sp +.ne 2 +.na +\fB\fBEFAULT\fR\fR +.ad +.RS 10n +The memory associated with \fIevent\fR was not mapped. +.RE + +.sp +.ne 2 +.na +\fB\fBEEXIST\fR\fR +.ad +.RS 10n +The operation specified was \fBEPOLL_CTL_ADD\fR and the specified file +descriptor is already associated with an event for the specified +\fBepoll\fR(5) instance. +.RE + +.sp +.ne 2 +.na +\fB\fBENOENT\fR\fR +.ad +.RS 10n +The operation specified was \fBEPOLL_CTL_MOD\fR or \fBEPOLL_CTL_DEL\fR and +the specified file descriptor is not associated with an event for the +specified \fBepoll\fR(5) instance.
+.RE + +.sp +.LP +.SH NOTES +.sp +.LP + +The \fBepoll\fR(5) facility is implemented for purposes of offering +compatibility for Linux-borne applications; native +applications should continue to prefer using event ports via the +\fBport_create\fR(3C), \fBport_associate\fR(3C) and \fBport_get\fR(3C) +interfaces. See \fBepoll\fR(5) for compatibility details and restrictions. +.RE + +.SH SEE ALSO +.sp +.LP +\fBepoll_create\fR(3C), \fBepoll_wait\fR(3C), +\fBport_create\fR(3C), \fBport_associate\fR(3C), \fBport_get\fR(3C), +\fBepoll\fR(5) diff --git a/usr/src/man/man3c/epoll_wait.3c b/usr/src/man/man3c/epoll_wait.3c new file mode 100644 index 0000000000..6405c30b5d --- /dev/null +++ b/usr/src/man/man3c/epoll_wait.3c @@ -0,0 +1,115 @@ +'\" te +.\" Copyright (c) 2014, Joyent, Inc. All Rights Reserved. +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. +.TH EPOLL_WAIT 3C "Apr 17, 2014" +.SH NAME +epoll_wait, epoll_pwait \- wait for epoll events +.SH SYNOPSIS + +.LP +.nf +#include <sys/epoll.h> + +\fBint\fR \fBepoll_wait\fR(\fBint\fR \fIepfd\fR, \fBstruct epoll_event *\fR\fIevents\fR, + \fBint\fR \fImaxevents\fR, \fBint\fR \fItimeout\fR); +.fi + +.LP +.nf +\fBint\fR \fBepoll_pwait\fR(\fBint\fR \fIepfd\fR, \fBstruct epoll_event *\fR\fIevents\fR, + \fBint\fR \fImaxevents\fR, \fBint\fR \fItimeout\fR, + \fBconst sigset_t *\fR\fIsigmask\fR); +.fi + +.SH DESCRIPTION +.sp +.LP +The \fBepoll_wait()\fR function waits for events on the \fBepoll\fR(5) +instance specified by \fIepfd\fR. The \fIevents\fR parameter must point to +an array of \fImaxevents\fR \fIepoll_event\fR structures to be +filled in with pending events. 
The \fItimeout\fR argument specifies the +number of milliseconds to wait for an event if none is pending. A +\fItimeout\fR of -1 denotes an infinite timeout. + +The \fBepoll_pwait()\fR is similar to \fBepoll_wait()\fR, but takes an +additional \fIsigmask\fR argument that specifies the desired signal mask +when \fBepoll_pwait()\fR is blocked. It is equivalent to atomically +setting the signal mask, calling \fBepoll_wait()\fR, and restoring the +signal mask upon return, and is therefore similar to the relationship +between \fBselect\fR(3C) and \fBpselect\fR(3C). + +.SH RETURN VALUES +.sp +.LP +Upon successful completion, \fBepoll_wait()\fR and \fBepoll_pwait()\fR return +the number of events, or 0 if none was pending and \fItimeout\fR milliseconds +elapsed. If an error occurs, -1 is returned and errno is set to indicate +the error. + +.SH ERRORS +.sp +.LP +The \fBepoll_wait()\fR and \fBepoll_pwait()\fR functions will fail if: +.sp +.ne 2 +.na +\fB\fBEBADF\fR\fR +.ad +.RS 10n +\fIepfd\fR is not a valid file descriptor. +.RE + +.sp +.ne 2 +.na +\fB\fBEFAULT\fR\fR +.ad +.RS 10n +The memory associated with \fIevents\fR was not mapped or was not writable. +.RE + +.sp +.ne 2 +.na +\fB\fBEINTR\fR\fR +.ad +.RS 10n +A signal was received during the \fBepoll_wait()\fR or \fBepoll_pwait()\fR. +.RE + +.sp +.ne 2 +.na +\fB\fBEINVAL\fR\fR +.ad +.RS 10n +Either \fIepfd\fR is not a valid \fBepoll\fR(5) instance or \fImaxevents\fR +is not greater than zero. +.RE + +.sp +.LP +.SH NOTES +.sp +.LP + +The \fBepoll\fR(5) facility is implemented for purposes of offering +compatibility for Linux-borne applications; native +applications should continue to prefer using event ports via the +\fBport_create\fR(3C), \fBport_associate\fR(3C) and \fBport_get\fR(3C) +interfaces. See \fBepoll\fR(5) for compatibility details and restrictions. 
+.RE + +.SH SEE ALSO +.sp +.LP +\fBepoll_create\fR(3C), \fBepoll_ctl\fR(3C), +\fBport_create\fR(3C), \fBport_associate\fR(3C), \fBport_get\fR(3C), +\fBpselect\fR(3C), \fBepoll\fR(5) diff --git a/usr/src/man/man5/Makefile b/usr/src/man/man5/Makefile index eff831c3d4..8e805da008 100644 --- a/usr/src/man/man5/Makefile +++ b/usr/src/man/man5/Makefile @@ -13,6 +13,7 @@ # Copyright 2011, Richard Lowe # Copyright (c) 2012 by Delphix. All rights reserved. # Copyright 2013 Nexenta Systems, Inc. All rights reserved. +# Copyright (c) 2014, Joyent, Inc. All rights reserved. # include $(SRC)/Makefile.master @@ -41,6 +42,7 @@ MANFILES= Intro.5 \ dhcp.5 \ dhcp_modules.5 \ environ.5 \ + epoll.5 \ eqnchar.5 \ extendedFILE.5 \ filesystem.5 \ diff --git a/usr/src/man/man5/epoll.5 b/usr/src/man/man5/epoll.5 new file mode 100644 index 0000000000..bba13e15d2 --- /dev/null +++ b/usr/src/man/man5/epoll.5 @@ -0,0 +1,115 @@ +'\" te +.\" Copyright (c) 2014, Joyent, Inc. All Rights Reserved. +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. +.TH EPOLL 5 "Apr 17, 2014" +.SH NAME +epoll \- Linux-compatible I/O event notification facility +.SH SYNOPSIS + +.LP +.nf +#include <sys/epoll.h> +.fi + +.SH DESCRIPTION +.sp +.LP + +\fBepoll\fR is a facility for efficient event-oriented I/O that has a +similar model to \fBpoll\fR(2), but does not necessitate rescanning a +set of file descriptors to wait for an event. 
\fBepoll\fR is of Linux +origins, and this facility is designed to be binary-compatible with +the Linux facility, including the following interfaces: + +.RS +4 +.TP +.ie t \(bu +.el o +\fBepoll_create\fR(3C) creates an \fBepoll\fR instance, returning a file +descriptor. It contains a size argument which is meaningful only in as +much as it cannot be 0. +.RE +.RS +4 +.TP +.ie t \(bu +.el o +\fBepoll_create1\fR(3C) also creates an \fBepoll\fR instance, but eliminates +the meaningless size argument -- replacing it instead with a flags +argument. +.RE +.RS +4 +.TP +.ie t \(bu +.el o +\fBepoll_ctl\fR(3C) allows file descriptors to be added +(via \fBEPOLL_CTL_ADD\fR), deleted (via \fBEPOLL_CTL_DEL\fR) or +modified (via \fBEPOLL_CTL_MOD\fR) with respect to the \fBepoll\fR'd set +of file descriptors. +.RE +.RS +4 +.TP +.ie t \(bu +.el o +\fBepoll_wait\fR(3C) fetches pending events for file descriptors added +via \fBepoll_ctl\fR(3C), blocking the caller if no such events are pending. +.RE +.RS +4 +.TP +.ie t \(bu +.el o +\fBepoll_pwait\fR(3C) operates in a similar manner to \fBepoll_wait\fR(3C), but +allows the caller to specify a signal mask to be set atomically with respect +to waiting for events. +.RE + +.sp +.LP +.SH NOTES +.sp +.LP + +The \fBepoll\fR facility is implemented +for purposes of offering compatibility to and portability of Linux-borne +applications; native applications should continue to prefer using event ports +via the \fBport_create\fR(3C), +\fBport_associate\fR(3C) and \fBport_getn\fR(3C) interfaces. +In particular, use of \fBepoll\fR in a multithreaded environment is fraught +with peril; even when using \fBEPOLLONESHOT\fR for one-shot events, +there are race conditions with respect to \fBclose\fR(2) that are unresolvable. +(For more details, see the aborted effort in Linux to resolve this via the +proposed +\fBEPOLL_CTL_DISABLE\fR operation.)
+The event port facility -- like the BSD kqueue facility that inspired it -- +is designed to deal with such issues via explicit event source dissociation. + +While a best effort has been made to mimic the Linux semantics, there +are some semantics that are too peculiar or ill-conceived to merit +accommodation. In particular, the Linux \fBepoll\fR facility will -- by +design -- continue to generate events for closed file descriptors where/when +the underlying file description remains open. For example, if one were +to \fBfork\fR(2) and subsequently close an actively \fBepoll\fR'd file +descriptor in the parent, +any events generated in the child on the implicitly duplicated file descriptor +will continue to be delivered to the parent -- despite the fact that the +parent itself no longer has any notion of the file description! +This \fBepoll\fR facility refuses to honor +these semantics; closing the \fBEPOLL_CTL_ADD\fR'd file descriptor +will always result in no further +events being generated for that event description. + +.RE +.SH SEE ALSO +.sp +.LP +\fBepoll_create\fR(3C), \fBepoll_create1\fR(3C), \fBepoll_ctl\fR(3C), +\fBepoll_wait\fR(3C), \fBepoll_pwait\fR(3C), +\fBport_create\fR(3C), \fBport_associate\fR(3C), \fBport_dissociate\fR(3C), +\fBport_get\fR(3C), +\fBpselect\fR(3C) diff --git a/usr/src/man/man7d/poll.7d b/usr/src/man/man7d/poll.7d index cd3db77de9..7a3292eb97 100644 --- a/usr/src/man/man7d/poll.7d +++ b/usr/src/man/man7d/poll.7d @@ -73,15 +73,6 @@ Pointer to \fBpollfd\fR structure. .SH DESCRIPTION .LP -Note - -.sp -.RS 2 -The \fB/dev/poll\fR device, associated driver and corresponding manpages may be -removed in a future Solaris release. For similar functionality in the event -ports framework, see \fBport_create\fR(3C). -.RE -.sp -.LP The \fB/dev/poll\fR driver is a special driver that enables you to monitor multiple sets of polled file descriptors. By using the \fB/dev/poll\fR driver, you can efficiently poll large numbers of file descriptors. 
Access to diff --git a/usr/src/man/man9e/chpoll.9e b/usr/src/man/man9e/chpoll.9e index a2adaf7a9c..24e3379861 100644 --- a/usr/src/man/man9e/chpoll.9e +++ b/usr/src/man/man9e/chpoll.9e @@ -123,6 +123,17 @@ The same as \fBPOLLOUT\fR. Priority data (priority band > 0) may be written. .RE +.sp +.ne 2 +.na +\fB\fBPOLLET\fR\fR +.ad +.RS 14n +The desired event is to be edge-triggered; calls to \fBpollwakeup\fR(9F) +should not be suppressed, even if the event is pending at the time of +call to the \fBchpoll()\fR function. +.RE + .RE .sp @@ -200,7 +211,11 @@ be called with multiple events at one time. The \fBpollwakup()\fR can be called regardless of whether or not the \fBchpoll()\fR entry is called; it should be called every time the driver detects the pollable event. The driver must not hold any mutex across the call to \fBpollwakeup\fR(9F) that is acquired in its -\fBchpoll()\fR entry point, or a deadlock may result. +\fBchpoll()\fR entry point, or a deadlock may result. Note that if +\fBPOLLET\fR is set in the specified events, the driver must call +\fBpollwakeup\fR(9F) on subsequent events, even if events are pending at +the time of the call to \fBchpoll()\fR. + .RE .SH RETURN VALUES .sp diff --git a/usr/src/uts/common/fs/fifofs/fifovnops.c b/usr/src/uts/common/fs/fifofs/fifovnops.c index ac89e430c7..fee2924093 100644 --- a/usr/src/uts/common/fs/fifofs/fifovnops.c +++ b/usr/src/uts/common/fs/fifofs/fifovnops.c @@ -27,7 +27,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ /* * FIFOFS file system vnode operations. 
This file system @@ -1832,17 +1834,16 @@ fifo_poll(vnode_t *vp, short events, int anyyet, short *reventsp, } /* - * if we happened to get something, return + * if we happened to get something and we're not edge-triggered, return */ - - if ((*reventsp = (short)retevents) != 0) { + if ((*reventsp = (short)retevents) != 0 && !(events & POLLET)) { mutex_exit(&fnp->fn_lock->flk_lock); return (0); } /* - * If poll() has not found any events yet, set up event cell - * to wake up the poll if a requested event occurs on this + * If poll() has not found any events yet or we're edge-triggered, set + * up event cell to wake up the poll if a requested event occurs on this * pipe/fifo. */ if (!anyyet) { diff --git a/usr/src/uts/common/fs/portfs/port_vnops.c b/usr/src/uts/common/fs/portfs/port_vnops.c index b2f5088e06..ab95c0a1f8 100644 --- a/usr/src/uts/common/fs/portfs/port_vnops.c +++ b/usr/src/uts/common/fs/portfs/port_vnops.c @@ -24,6 +24,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + #include <sys/types.h> #include <sys/vnode.h> #include <sys/vfs_opreg.h> @@ -294,14 +298,10 @@ port_poll(vnode_t *vp, short events, int anyyet, short *reventsp, levents |= POLLOUT; levents &= events; *reventsp = levents; - if (levents == 0) { - if (!anyyet) { - *phpp = &pp->port_pollhd; - portq->portq_flags |= - events & POLLIN ? PORTQ_POLLIN : 0; - portq->portq_flags |= - events & POLLOUT ? PORTQ_POLLOUT : 0; - } + if ((levents == 0 && !anyyet) || (events & POLLET)) { + *phpp = &pp->port_pollhd; + portq->portq_flags |= events & POLLIN ? PORTQ_POLLIN : 0; + portq->portq_flags |= events & POLLOUT ? 
PORTQ_POLLOUT : 0; } mutex_exit(&portq->portq_mutex); return (0); diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c index c84b9d3726..84a8ae4d31 100644 --- a/usr/src/uts/common/fs/proc/prvnops.c +++ b/usr/src/uts/common/fs/proc/prvnops.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -6010,7 +6010,7 @@ prpoll(vnode_t *vp, short events, int anyyet, short *reventsp, } *reventsp = revents; - if (!anyyet && revents == 0) { + if ((!anyyet && revents == 0) || (events & POLLET)) { /* * Arrange to wake up the polling lwp when * the target process/lwp stops or terminates diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c index 0be628f329..d3ff264eef 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c @@ -953,6 +953,13 @@ so_poll(struct sonode *so, short events, int anyyet, short *reventsp, if (!list_is_empty(&so->so_acceptq_list)) *reventsp |= (POLLIN|POLLRDNORM) & events; + /* + * If we're looking for POLLRDHUP, indicate it if we have sent the + * last rx signal for the socket. 
+ */ + if ((events & POLLRDHUP) && (state & SS_SENTLASTREADSIG)) + *reventsp |= POLLRDHUP; + /* Data */ /* so_downcalls is null for sctp */ if (so->so_downcalls != NULL && so->so_downcalls->sd_poll != NULL) { @@ -988,14 +995,18 @@ so_poll(struct sonode *so, short events, int anyyet, short *reventsp, *reventsp |= POLLHUP; } - if (!*reventsp && !anyyet) { + if ((!*reventsp && !anyyet) || (events & POLLET)) { /* Check for read events again, but this time under lock */ if (events & (POLLIN|POLLRDNORM)) { mutex_enter(&so->so_lock); if (SO_HAVE_DATA(so) || !list_is_empty(&so->so_acceptq_list)) { + if (events & POLLET) + so->so_pollev |= SO_POLLEV_IN; + mutex_exit(&so->so_lock); *reventsp |= (POLLIN|POLLRDNORM) & events; + return (0); } else { so->so_pollev |= SO_POLLEV_IN; diff --git a/usr/src/uts/common/fs/sockfs/socknotify.c b/usr/src/uts/common/fs/sockfs/socknotify.c index 3d5ba2a7e8..3f858afecc 100644 --- a/usr/src/uts/common/fs/sockfs/socknotify.c +++ b/usr/src/uts/common/fs/sockfs/socknotify.c @@ -377,7 +377,7 @@ i_so_notify_last_rx(struct sonode *so, int *pollev, int *sigev) so->so_state |= SS_SENTLASTREADSIG; so->so_pollev &= ~SO_POLLEV_IN; - *pollev |= POLLIN|POLLRDNORM; + *pollev |= POLLIN|POLLRDNORM|POLLRDHUP; *sigev |= SOCKETSIG_READ; return (1); diff --git a/usr/src/uts/common/io/1394/targets/av1394/av1394_async.c b/usr/src/uts/common/io/1394/targets/av1394/av1394_async.c index 94323582d6..c2c16e848b 100644 --- a/usr/src/uts/common/io/1394/targets/av1394/av1394_async.c +++ b/usr/src/uts/common/io/1394/targets/av1394/av1394_async.c @@ -24,7 +24,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
+ */ /* * av1394 asynchronous module @@ -359,9 +361,10 @@ av1394_async_poll(av1394_inst_t *avp, short events, int anyyet, short *reventsp, AV1394_TNF_ENTER(av1394_async_poll); if (events & POLLIN) { - if (av1394_peekq(rq)) { + if (av1394_peekq(rq)) *reventsp |= POLLIN; - } else if (!anyyet) { + + if ((!*reventsp && !anyyet) || (events & POLLET)) { mutex_enter(&ap->a_mutex); ap->a_pollevents |= POLLIN; *phpp = &ap->a_pollhead; diff --git a/usr/src/uts/common/io/devpoll.c b/usr/src/uts/common/io/devpoll.c index 7d311aaa10..14c2f47fac 100644 --- a/usr/src/uts/common/io/devpoll.c +++ b/usr/src/uts/common/io/devpoll.c @@ -25,6 +25,7 @@ /* * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -45,6 +46,8 @@ #include <sys/devpoll.h> #include <sys/rctl.h> #include <sys/resource.h> +#include <sys/schedctl.h> +#include <sys/epoll.h> #define RESERVED 1 @@ -237,7 +240,8 @@ dpinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) * stale entries! */ static int -dp_pcache_poll(pollfd_t *pfdp, pollcache_t *pcp, nfds_t nfds, int *fdcntp) +dp_pcache_poll(dp_entry_t *dpep, void *dpbuf, + pollcache_t *pcp, nfds_t nfds, int *fdcntp) { int start, ostart, end; int fdcnt, fd; @@ -247,7 +251,10 @@ dp_pcache_poll(pollfd_t *pfdp, pollcache_t *pcp, nfds_t nfds, int *fdcntp) boolean_t no_wrap; pollhead_t *php; polldat_t *pdp; + pollfd_t *pfdp; + epoll_event_t *epoll; int error = 0; + short mask = POLLRDHUP | POLLWRNORM | POLLWRBAND; ASSERT(MUTEX_HELD(&pcp->pc_lock)); if (pcp->pc_bitmap == NULL) { @@ -257,6 +264,14 @@ dp_pcache_poll(pollfd_t *pfdp, pollcache_t *pcp, nfds_t nfds, int *fdcntp) */ return (error); } + + if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + pfdp = NULL; + epoll = (epoll_event_t *)dpbuf; + } else { + pfdp = (pollfd_t *)dpbuf; + epoll = NULL; + } retry: start = ostart = pcp->pc_mapstart; end = pcp->pc_mapend; @@ -316,11 +331,32 @@ repoll: * polling a closed fd. 
Hope this will remind * user to do a POLLREMOVE. */ - pfdp[fdcnt].fd = fd; - pfdp[fdcnt].revents = POLLNVAL; - fdcnt++; + if (pfdp != NULL) { + pfdp[fdcnt].fd = fd; + pfdp[fdcnt].revents = POLLNVAL; + fdcnt++; + continue; + } + + /* + * In the epoll compatibility case, we actually + * perform the implicit removal to remain + * closer to the epoll semantics. + */ + ASSERT(epoll != NULL); + + pdp->pd_fp = NULL; + pdp->pd_events = 0; + + if (php != NULL) { + pollhead_delete(php, pdp); + pdp->pd_php = NULL; + } + + BT_CLEAR(pcp->pc_bitmap, fd); continue; } + if (fp != pdp->pd_fp) { /* * user is polling on a cached fd which was @@ -376,9 +412,59 @@ repoll: } if (revent != 0) { - pfdp[fdcnt].fd = fd; - pfdp[fdcnt].events = pdp->pd_events; - pfdp[fdcnt].revents = revent; + if (pfdp != NULL) { + pfdp[fdcnt].fd = fd; + pfdp[fdcnt].events = pdp->pd_events; + pfdp[fdcnt].revents = revent; + } else { + epoll_event_t *ep = &epoll[fdcnt]; + + ASSERT(epoll != NULL); + ep->data.u64 = pdp->pd_epolldata; + + /* + * If any of the event bits are set for + * which poll and epoll representations + * differ, swizzle in the native epoll + * values. + */ + if (revent & mask) { + ep->events = (revent & ~mask) | + ((revent & POLLRDHUP) ? + EPOLLRDHUP : 0) | + ((revent & POLLWRNORM) ? + EPOLLWRNORM : 0) | + ((revent & POLLWRBAND) ? + EPOLLWRBAND : 0); + } else { + ep->events = revent; + } + } + + /* + * If POLLET is set, clear the bit in the + * bitmap -- which effectively latches the + * edge on a pollwakeup() from the driver. + */ + if (pdp->pd_events & POLLET) + BT_CLEAR(pcp->pc_bitmap, fd); + + /* + * If POLLONESHOT is set, perform the implicit + * POLLREMOVE. + */ + if (pdp->pd_events & POLLONESHOT) { + pdp->pd_fp = NULL; + pdp->pd_events = 0; + + if (php != NULL) { + pollhead_delete(php, pdp); + pdp->pd_php = NULL; + } + + BT_CLEAR(pcp->pc_bitmap, fd); + } + fdcnt++; } else if (php != NULL) { /* @@ -392,7 +478,7 @@ repoll: * in bitmap. 
*/ if ((pdp->pd_php != NULL) && - ((pcp->pc_flag & T_POLLWAKE) == 0)) { + ((pcp->pc_flag & PC_POLLWAKE) == 0)) { BT_CLEAR(pcp->pc_bitmap, fd); } if (pdp->pd_php == NULL) { @@ -499,7 +585,9 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) dp_entry_t *dpep; pollcache_t *pcp; pollfd_t *pollfdp, *pfdp; - int error; + dvpoll_epollfd_t *epfdp; + uintptr_t limit; + int error, size; ssize_t uiosize; nfds_t pollfdnum; struct pollhead *php = NULL; @@ -515,11 +603,18 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) ASSERT(dpep != NULL); mutex_exit(&devpoll_lock); pcp = dpep->dpe_pcache; - if (curproc->p_pid != pcp->pc_pid) { + + if (curproc->p_pid != pcp->pc_pid) return (EACCES); + + if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + size = sizeof (dvpoll_epollfd_t); + } else { + size = sizeof (pollfd_t); } + uiosize = uiop->uio_resid; - pollfdnum = uiosize / sizeof (pollfd_t); + pollfdnum = uiosize / size; mutex_enter(&curproc->p_lock); if (pollfdnum > (uint_t)rctl_enforced_value( rctlproc_legacy[RLIMIT_NOFILE], curproc->p_rctls, curproc)) { @@ -534,6 +629,7 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) * each polled fd to the cached set. */ pollfdp = kmem_alloc(uiosize, KM_SLEEP); + limit = (uintptr_t)pollfdp + (pollfdnum * size); /* * Although /dev/poll uses the write(2) interface to cache fds, it's @@ -555,9 +651,27 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) mutex_enter(&dpep->dpe_lock); dpep->dpe_writerwait++; while (dpep->dpe_refcnt != 0) { + /* + * We need to do a bit of a dance here: we need to drop + * our dpe_lock and grab the pc_lock to broadcast the pc_cv to + * kick any DP_POLL/DP_PPOLL sleepers. 
+ */ + mutex_exit(&dpep->dpe_lock); + mutex_enter(&pcp->pc_lock); + pcp->pc_flag |= PC_WRITEWANTED; + cv_broadcast(&pcp->pc_cv); + mutex_exit(&pcp->pc_lock); + mutex_enter(&dpep->dpe_lock); + + if (dpep->dpe_refcnt == 0) + break; + if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) { dpep->dpe_writerwait--; mutex_exit(&dpep->dpe_lock); + mutex_enter(&pcp->pc_lock); + pcp->pc_flag &= ~PC_WRITEWANTED; + mutex_exit(&pcp->pc_lock); kmem_free(pollfdp, uiosize); return (set_errno(EINTR)); } @@ -565,13 +679,17 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) dpep->dpe_writerwait--; dpep->dpe_flag |= DP_WRITER_PRESENT; dpep->dpe_refcnt++; + mutex_exit(&dpep->dpe_lock); mutex_enter(&pcp->pc_lock); + pcp->pc_flag &= ~PC_WRITEWANTED; + if (pcp->pc_bitmap == NULL) { pcache_create(pcp, pollfdnum); } - for (pfdp = pollfdp; pfdp < pollfdp + pollfdnum; pfdp++) { + for (pfdp = pollfdp; (uintptr_t)pfdp < limit; + pfdp = (pollfd_t *)((uintptr_t)pfdp + size)) { fd = pfdp->fd; if ((uint_t)fd >= P_FINFO(curproc)->fi_nfiles) continue; @@ -580,9 +698,27 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) if (pdp == NULL) { pdp = pcache_alloc_fd(0); pdp->pd_fd = fd; + + if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + /* LINTED pointer alignment */ + epfdp = (dvpoll_epollfd_t *)pfdp; + pdp->pd_epolldata = epfdp->dpep_data; + } + pdp->pd_pcache = pcp; pcache_insert_fd(pcp, pdp, pollfdnum); + } else { + if ((dpep->dpe_flag & DP_ISEPOLLCOMPAT) && + pdp->pd_fp != NULL) { + /* + * epoll semantics demand that we error + * out in this case. + */ + error = EEXIST; + break; + } } + ASSERT(pdp->pd_fd == fd); ASSERT(pdp->pd_pcache == pcp); if (fd >= pcp->pc_mapsize) { @@ -665,7 +801,17 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) } releasef(fd); } else { - if (pdp == NULL) { + if (pdp == NULL || pdp->pd_fp == NULL) { + if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + /* + * As with the add case (above), epoll + * semantics demand that we error out + * in this case. 
+ */ + error = ENOENT; + break; + } + continue; } ASSERT(pdp->pd_fd == fd); @@ -690,6 +836,17 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) return (error); } +#define DP_SIGMASK_RESTORE(ksetp) { \ + if (ksetp != NULL) { \ + mutex_enter(&p->p_lock); \ + if (lwp->lwp_cursig == 0) { \ + t->t_hold = lwp->lwp_sigoldmask; \ + t->t_flag &= ~T_TOMASK; \ + } \ + mutex_exit(&p->p_lock); \ + } \ +} + /*ARGSUSED*/ static int dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) @@ -701,7 +858,7 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) int error = 0; STRUCT_DECL(dvpoll, dvpoll); - if (cmd == DP_POLL) { + if (cmd == DP_POLL || cmd == DP_PPOLL) { /* do this now, before we sleep on DP_WRITER_PRESENT */ now = gethrtime(); } @@ -717,6 +874,27 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) return (EACCES); mutex_enter(&dpep->dpe_lock); + + if (cmd == DP_EPOLLCOMPAT) { + if (dpep->dpe_refcnt != 0) { + /* + * We can't turn on epoll compatibility while there + * are outstanding operations. + */ + mutex_exit(&dpep->dpe_lock); + return (EBUSY); + } + + /* + * epoll compatibility is a one-way street: there's no way + * to turn it off for a particular open. 
+ */ + dpep->dpe_flag |= DP_ISEPOLLCOMPAT; + mutex_exit(&dpep->dpe_lock); + + return (0); + } + while ((dpep->dpe_flag & DP_WRITER_PRESENT) || (dpep->dpe_writerwait != 0)) { if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) { @@ -729,15 +907,36 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) switch (cmd) { case DP_POLL: + case DP_PPOLL: { pollstate_t *ps; nfds_t nfds; int fdcnt = 0; + size_t size, fdsize, dpsize; hrtime_t deadline = 0; + k_sigset_t *ksetp = NULL; + k_sigset_t kset; + sigset_t set; + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + struct proc *p = ttoproc(curthread); STRUCT_INIT(dvpoll, mode); - error = copyin((caddr_t)arg, STRUCT_BUF(dvpoll), - STRUCT_SIZE(dvpoll)); + + /* + * The dp_setp member is only required/consumed for DP_PPOLL, + * which otherwise uses the same structure as DP_POLL. + */ + if (cmd == DP_POLL) { + dpsize = (uintptr_t)STRUCT_FADDR(dvpoll, dp_setp) - + (uintptr_t)STRUCT_FADDR(dvpoll, dp_fds); + } else { + ASSERT(cmd == DP_PPOLL); + dpsize = STRUCT_SIZE(dvpoll); + } + + error = copyin((caddr_t)arg, STRUCT_BUF(dvpoll), dpsize); + if (error) { DP_REFRELE(dpep); return (EFAULT); @@ -755,6 +954,52 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) deadline += now; } + if (cmd == DP_PPOLL) { + void *setp = STRUCT_FGETP(dvpoll, dp_setp); + + if (setp != NULL) { + if (copyin(setp, &set, sizeof (set))) { + DP_REFRELE(dpep); + return (EFAULT); + } + + sigutok(&set, &kset); + ksetp = &kset; + + mutex_enter(&p->p_lock); + schedctl_finish_sigblock(t); + lwp->lwp_sigoldmask = t->t_hold; + t->t_hold = *ksetp; + t->t_flag |= T_TOMASK; + + /* + * Like ppoll() with a non-NULL sigset, we'll + * call cv_reltimedwait_sig() just to check for + * signals. This call will return immediately + * with either 0 (signalled) or -1 (no signal). 
+ * There are some conditions whereby we can + * get 0 from cv_reltimedwait_sig() without + * a true signal (e.g., a directed stop), so + * we restore our signal mask in the unlikely + * event that lwp_cursig is 0. + */ + if (!cv_reltimedwait_sig(&t->t_delay_cv, + &p->p_lock, 0, TR_CLOCK_TICK)) { + if (lwp->lwp_cursig == 0) { + t->t_hold = lwp->lwp_sigoldmask; + t->t_flag &= ~T_TOMASK; + } + + mutex_exit(&p->p_lock); + + DP_REFRELE(dpep); + return (EINTR); + } + + mutex_exit(&p->p_lock); + } + } + if ((nfds = STRUCT_FGET(dvpoll, dp_nfds)) == 0) { /* * We are just using DP_POLL to sleep, so @@ -762,17 +1007,29 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) * Do not check for signals if we have a zero timeout. */ DP_REFRELE(dpep); - if (deadline == 0) + if (deadline == 0) { + DP_SIGMASK_RESTORE(ksetp); return (0); + } + mutex_enter(&curthread->t_delay_lock); while ((error = cv_timedwait_sig_hrtime(&curthread->t_delay_cv, &curthread->t_delay_lock, deadline)) > 0) continue; mutex_exit(&curthread->t_delay_lock); + + DP_SIGMASK_RESTORE(ksetp); + return (error == 0 ? EINTR : 0); } + if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + size = nfds * (fdsize = sizeof (epoll_event_t)); + } else { + size = nfds * (fdsize = sizeof (pollfd_t)); + } + /* * XXX It would be nice not to have to alloc each time, but it * requires another per thread structure hook. This can be @@ -782,8 +1039,7 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) curthread->t_pollstate = pollstate_create(); ps = curthread->t_pollstate; } - if (ps->ps_dpbufsize < nfds) { - struct proc *p = ttoproc(curthread); + if (ps->ps_dpbufsize < size) { /* * The maximum size should be no large than * current maximum open file count. 
@@ -792,27 +1048,28 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) if (nfds > p->p_fno_ctl) { mutex_exit(&p->p_lock); DP_REFRELE(dpep); + DP_SIGMASK_RESTORE(ksetp); return (EINVAL); } mutex_exit(&p->p_lock); - kmem_free(ps->ps_dpbuf, sizeof (pollfd_t) * - ps->ps_dpbufsize); - ps->ps_dpbuf = kmem_zalloc(sizeof (pollfd_t) * - nfds, KM_SLEEP); - ps->ps_dpbufsize = nfds; + kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize); + ps->ps_dpbuf = kmem_zalloc(size, KM_SLEEP); + ps->ps_dpbufsize = size; } mutex_enter(&pcp->pc_lock); for (;;) { - pcp->pc_flag = 0; - error = dp_pcache_poll(ps->ps_dpbuf, pcp, nfds, &fdcnt); + pcp->pc_flag &= ~PC_POLLWAKE; + + error = dp_pcache_poll(dpep, ps->ps_dpbuf, + pcp, nfds, &fdcnt); if (fdcnt > 0 || error != 0) break; /* * A pollwake has happened since we polled cache. */ - if (pcp->pc_flag & T_POLLWAKE) + if (pcp->pc_flag & PC_POLLWAKE) continue; /* @@ -822,8 +1079,40 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) /* immediate timeout; do not check signals */ break; } - error = cv_timedwait_sig_hrtime(&pcp->pc_cv, - &pcp->pc_lock, deadline); + + if (!(pcp->pc_flag & PC_WRITEWANTED)) { + error = cv_timedwait_sig_hrtime(&pcp->pc_cv, + &pcp->pc_lock, deadline); + } else { + error = 1; + } + + if (error > 0 && (pcp->pc_flag & PC_WRITEWANTED)) { + /* + * We've been kicked off of our cv because a + * writer wants in. We're going to drop our + * reference count and then wait until the + * writer is gone -- at which point we'll + * reacquire the pc_lock and call into + * dp_pcache_poll() to get the updated state. 
+ */ + mutex_exit(&pcp->pc_lock); + + mutex_enter(&dpep->dpe_lock); + dpep->dpe_refcnt--; + cv_broadcast(&dpep->dpe_cv); + + while ((dpep->dpe_flag & DP_WRITER_PRESENT) || + (dpep->dpe_writerwait != 0)) { + error = cv_wait_sig_swap(&dpep->dpe_cv, + &dpep->dpe_lock); + } + + dpep->dpe_refcnt++; + mutex_exit(&dpep->dpe_lock); + mutex_enter(&pcp->pc_lock); + } + /* * If we were awakened by a signal or timeout * then break the loop, else poll again. @@ -837,9 +1126,11 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) } mutex_exit(&pcp->pc_lock); + DP_SIGMASK_RESTORE(ksetp); + if (error == 0 && fdcnt > 0) { - if (copyout(ps->ps_dpbuf, STRUCT_FGETP(dvpoll, - dp_fds), sizeof (pollfd_t) * fdcnt)) { + if (copyout(ps->ps_dpbuf, + STRUCT_FGETP(dvpoll, dp_fds), fdcnt * fdsize)) { DP_REFRELE(dpep); return (EFAULT); } diff --git a/usr/src/uts/common/io/ksocket/ksocket.c b/usr/src/uts/common/io/ksocket/ksocket.c index 49ca6f0475..8944fcbff3 100644 --- a/usr/src/uts/common/io/ksocket/ksocket.c +++ b/usr/src/uts/common/io/ksocket/ksocket.c @@ -22,6 +22,7 @@ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ #include <sys/file.h> @@ -820,7 +821,7 @@ ksocket_spoll(ksocket_t ks, int timo, short events, short *revents, if (error != 0 || *revents != 0) break; - if (pcp->pc_flag & T_POLLWAKE) + if (pcp->pc_flag & PC_POLLWAKE) continue; if (timo == -1) { diff --git a/usr/src/uts/common/io/usb/usba/usba_ugen.c b/usr/src/uts/common/io/usb/usba/usba_ugen.c index cb20c24270..5852e40799 100644 --- a/usr/src/uts/common/io/usb/usba/usba_ugen.c +++ b/usr/src/uts/common/io/usb/usba/usba_ugen.c @@ -23,6 +23,10 @@ */ /* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
+ */ + +/* * UGEN: USB Generic Driver support code * * This code provides entry points called by the ugen driver or other @@ -1082,7 +1086,10 @@ usb_ugen_poll(usb_ugen_hdl_t usb_ugen_hdl, dev_t dev, short events, ((epp->ep_state & UGEN_EP_STATE_INTR_IN_POLLING_ON) == 0)) { *reventsp |= POLLIN; - } else if (!anyyet) { + } + + if ((!*reventsp && !anyyet) || + (events & POLLET)) { *phpp = &epp->ep_pollhead; epp->ep_state |= UGEN_EP_STATE_INTR_IN_POLL_PENDING; @@ -1101,7 +1108,10 @@ usb_ugen_poll(usb_ugen_hdl_t usb_ugen_hdl, dev_t dev, short events, ((epp->ep_state & UGEN_EP_STATE_ISOC_IN_POLLING_ON) == 0)) { *reventsp |= POLLIN; - } else if (!anyyet) { + } + + if ((!*reventsp && !anyyet) || + (events & POLLET)) { *phpp = &epp->ep_pollhead; epp->ep_state |= UGEN_EP_STATE_ISOC_IN_POLL_PENDING; @@ -1115,9 +1125,10 @@ usb_ugen_poll(usb_ugen_hdl_t usb_ugen_hdl, dev_t dev, short events, break; case UGEN_MINOR_DEV_STAT_NODE: - if (ugenp->ug_ds.dev_stat & UGEN_DEV_STATUS_CHANGED) { + if (ugenp->ug_ds.dev_stat & UGEN_DEV_STATUS_CHANGED) *reventsp |= POLLIN; - } else if (!anyyet) { + + if ((!*reventsp && !anyyet) || (events & POLLET)) { *phpp = &ugenp->ug_ds.dev_pollhead; ugenp->ug_ds.dev_stat |= UGEN_DEV_STATUS_POLL_PENDING; @@ -1131,9 +1142,10 @@ usb_ugen_poll(usb_ugen_hdl_t usb_ugen_hdl, dev_t dev, short events, break; } } else { - if (ugenp->ug_ds.dev_stat & UGEN_DEV_STATUS_CHANGED) { + if (ugenp->ug_ds.dev_stat & UGEN_DEV_STATUS_CHANGED) *reventsp |= POLLHUP|POLLIN; - } else if (!anyyet) { + + if ((!*reventsp && !anyyet) || (events & POLLET)) { *phpp = &ugenp->ug_ds.dev_pollhead; ugenp->ug_ds.dev_stat |= UGEN_DEV_STATUS_POLL_PENDING; diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c index f9df89923f..4c99f92411 100644 --- a/usr/src/uts/common/os/streamio.c +++ b/usr/src/uts/common/os/streamio.c @@ -24,6 +24,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. 
All rights reserved. */ #include <sys/types.h> @@ -8312,7 +8313,7 @@ chkrd: } *reventsp = (short)retevents; - if (retevents) { + if (retevents && !(events & POLLET)) { if (headlocked) mutex_exit(&stp->sd_lock); return (0); diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index 0a72d3d882..08b2488b97 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -22,6 +22,7 @@ # Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. # Copyright 2013, Joyent, Inc. All rights reserved. # Copyright 2013 Garrett D'Amore <garrett@damore.org> +# Copyright (c) 2014, Joyent, Inc. All rights reserved. # include $(SRC)/uts/Makefile.uts @@ -212,6 +213,7 @@ CHKHDRS= \ emul64cmd.h \ emul64var.h \ epm.h \ + epoll.h \ errno.h \ errorq.h \ errorq_impl.h \ diff --git a/usr/src/uts/common/sys/devpoll.h b/usr/src/uts/common/sys/devpoll.h index 36c815c69f..4e4c76d9b0 100644 --- a/usr/src/uts/common/sys/devpoll.h +++ b/usr/src/uts/common/sys/devpoll.h @@ -24,11 +24,13 @@ * All rights reserved. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
+ */ + #ifndef _SYS_DEVPOLL_H #define _SYS_DEVPOLL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/poll_impl.h> #include <sys/types32.h> @@ -39,8 +41,10 @@ extern "C" { /* /dev/poll ioctl */ #define DPIOC (0xD0 << 8) -#define DP_POLL (DPIOC | 1) /* poll on fds in cached in /dev/poll */ +#define DP_POLL (DPIOC | 1) /* poll on fds cached via /dev/poll */ #define DP_ISPOLLED (DPIOC | 2) /* is this fd cached in /dev/poll */ +#define DP_PPOLL (DPIOC | 3) /* ppoll on fds cached via /dev/poll */ +#define DP_EPOLLCOMPAT (DPIOC | 4) /* turn on epoll compatibility */ #define DEVPOLLSIZE 1000 /* /dev/poll table size increment */ @@ -51,14 +55,21 @@ typedef struct dvpoll { pollfd_t *dp_fds; /* pollfd array */ nfds_t dp_nfds; /* num of pollfd's in dp_fds[] */ int dp_timeout; /* time out in milisec */ + sigset_t *dp_setp; /* sigset, if any */ } dvpoll_t; typedef struct dvpoll32 { caddr32_t dp_fds; /* pollfd array */ uint32_t dp_nfds; /* num of pollfd's in dp_fds[] */ int32_t dp_timeout; /* time out in milisec */ + caddr32_t dp_setp; /* sigset, if any */ } dvpoll32_t; +typedef struct dvpoll_epollfd { + pollfd_t dpep_pollfd; /* must be first member */ + uint64_t dpep_data; /* data payload */ +} dvpoll_epollfd_t; + #ifdef _KERNEL typedef struct dp_entry { @@ -71,6 +82,7 @@ typedef struct dp_entry { } dp_entry_t; #define DP_WRITER_PRESENT 0x1 /* a write is in progress */ +#define DP_ISEPOLLCOMPAT 0x2 /* epoll compatibility mode */ #define DP_REFRELE(dpep) { \ mutex_enter(&(dpep)->dpe_lock); \ diff --git a/usr/src/uts/common/sys/epoll.h b/usr/src/uts/common/sys/epoll.h new file mode 100644 index 0000000000..9b1c37f0dd --- /dev/null +++ b/usr/src/uts/common/sys/epoll.h @@ -0,0 +1,88 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_EPOLL_H +#define _SYS_EPOLL_H + +#include <sys/poll.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef union epoll_data { + void *ptr; + int fd; + uint32_t u32; + uint64_t u64; +} epoll_data_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack(4) +#endif + +typedef struct epoll_event { + uint32_t events; /* events */ + epoll_data_t data; /* user-specified data */ +} epoll_event_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif + +/* + * Define the EPOLL* constants in terms of their poll(2)/poll(7) equivalents. + * Note that the values match the equivalents in Linux to allow for any binary + * compatibility layers to not need to translate them. 
+ */ +#define EPOLLIN 0x0001 +#define EPOLLPRI 0x0002 +#define EPOLLOUT 0x0004 +#define EPOLLRDNORM 0x0040 +#define EPOLLRDBAND 0x0080 +#define EPOLLWRNORM 0x0100 +#define EPOLLWRBAND 0x0200 +#define EPOLLMSG 0x0400 /* not used */ +#define EPOLLERR 0x0008 +#define EPOLLHUP 0x0010 +#define EPOLLRDHUP 0x2000 + +#define EPOLLWAKEUP (1UL << 29) /* no meaning; silently ignored */ +#define EPOLLONESHOT (1UL << 30) /* translated to POLLONESHOT */ +#define EPOLLET (1UL << 31) /* translated to POLLET */ + +#define EPOLL_CTL_ADD 1 +#define EPOLL_CTL_DEL 2 +#define EPOLL_CTL_MOD 3 + +#define EPOLL_CLOEXEC 02000000 + +#if !defined(_KERNEL) + +extern int epoll_create(int size); +extern int epoll_create1(int flags); +extern int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event); +extern int epoll_wait(int epfd, struct epoll_event *events, + int maxevents, int timeout); +extern int epoll_pwait(int epfd, struct epoll_event *events, + int maxevents, int timeout, const sigset_t *sigmask); + +#endif /* !_KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_EPOLL_H */ diff --git a/usr/src/uts/common/sys/poll.h b/usr/src/uts/common/sys/poll.h index b1e222a3a3..4e3673753e 100644 --- a/usr/src/uts/common/sys/poll.h +++ b/usr/src/uts/common/sys/poll.h @@ -28,11 +28,13 @@ * All rights reserved. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
+ */ + #ifndef _SYS_POLL_H #define _SYS_POLL_H -#pragma ident "%Z%%M% %I% %E% SMI" /* SVr4.0 11.9 */ - #ifdef __cplusplus extern "C" { #endif @@ -59,6 +61,7 @@ typedef unsigned long nfds_t; #define POLLWRNORM POLLOUT #define POLLRDBAND 0x0080 /* out-of-band data is readable */ #define POLLWRBAND 0x0100 /* out-of-band data is writeable */ +#define POLLRDHUP 0x4000 /* read-side hangup */ #define POLLNORM POLLRDNORM @@ -70,7 +73,13 @@ typedef unsigned long nfds_t; #define POLLHUP 0x0010 /* fd has been hung up on */ #define POLLNVAL 0x0020 /* invalid pollfd entry */ -#define POLLREMOVE 0x0800 /* remove a cached poll fd from /dev/poll */ +/* + * These events will never be specified in revents, but may be specified in + * events to control /dev/poll behavior. + */ +#define POLLREMOVE 0x0800 /* remove cached /dev/poll fd */ +#define POLLONESHOT 0x1000 /* /dev/poll should one-shot this fd */ +#define POLLET 0x2000 /* edge-triggered /dev/poll fd */ #ifdef _KERNEL diff --git a/usr/src/uts/common/sys/poll_impl.h b/usr/src/uts/common/sys/poll_impl.h index ede99d0df2..2e866ec4d4 100644 --- a/usr/src/uts/common/sys/poll_impl.h +++ b/usr/src/uts/common/sys/poll_impl.h @@ -24,11 +24,13 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
+ */ + #ifndef _SYS_POLL_IMPL_H #define _SYS_POLL_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Caching Poll Subsystem: * @@ -160,6 +162,7 @@ typedef struct polldat { int pd_nsets; /* num of xref sets, used by poll(2) */ xref_t *pd_ref; /* ptr to xref info, 1 for each set */ struct port_kevent *pd_portev; /* associated port event struct */ + uint64_t pd_epolldata; /* epoll data, if any */ } polldat_t; /* @@ -187,7 +190,8 @@ typedef struct pollcache { } pollcache_t; /* pc_flag */ -#define T_POLLWAKE 0x02 /* pollwakeup() occurred */ +#define PC_POLLWAKE 0x02 /* pollwakeup() occurred */ +#define PC_WRITEWANTED 0x04 /* writer wishes to modify the pollcache_t */ #if defined(_KERNEL) /* diff --git a/usr/src/uts/common/syscall/poll.c b/usr/src/uts/common/syscall/poll.c index 7f37529941..c33156a4fc 100644 --- a/usr/src/uts/common/syscall/poll.c +++ b/usr/src/uts/common/syscall/poll.c @@ -29,6 +29,7 @@ /* * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ /* @@ -525,13 +526,13 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) } /* - * If T_POLLWAKE is set, a pollwakeup() was performed on + * If PC_POLLWAKE is set, a pollwakeup() was performed on * one of the file descriptors. This can happen only if * one of the VOP_POLL() functions dropped pcp->pc_lock. * The only current cases of this is in procfs (prpoll()) * and STREAMS (strpoll()). */ - if (pcp->pc_flag & T_POLLWAKE) + if (pcp->pc_flag & PC_POLLWAKE) continue; /* @@ -886,9 +887,9 @@ retry: } /* - * This function is called to inform a thread that - * an event being polled for has occurred. - * The pollstate lock on the thread should be held on entry. + * This function is called to inform a thread (or threads) that an event being + * polled on has occurred. The pollstate lock on the thread should be held + * on entry. 
*/ void pollnotify(pollcache_t *pcp, int fd) @@ -896,8 +897,8 @@ pollnotify(pollcache_t *pcp, int fd) ASSERT(fd < pcp->pc_mapsize); ASSERT(MUTEX_HELD(&pcp->pc_lock)); BT_SET(pcp->pc_bitmap, fd); - pcp->pc_flag |= T_POLLWAKE; - cv_signal(&pcp->pc_cv); + pcp->pc_flag |= PC_POLLWAKE; + cv_broadcast(&pcp->pc_cv); } /* @@ -2024,7 +2025,7 @@ retry: */ if ((pdp->pd_php != NULL) && (pollfdp[entry].events == pdp->pd_events) && - ((pcp->pc_flag & T_POLLWAKE) == 0)) { + ((pcp->pc_flag & PC_POLLWAKE) == 0)) { BT_CLEAR(pcp->pc_bitmap, fd); } /* @@ -2251,7 +2252,7 @@ pollstate_destroy(pollstate_t *ps) pcacheset_destroy(ps->ps_pcacheset, ps->ps_nsets); ps->ps_pcacheset = NULL; if (ps->ps_dpbuf != NULL) { - kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize * sizeof (pollfd_t)); + kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize); ps->ps_dpbuf = NULL; } mutex_destroy(&ps->ps_lock); diff --git a/usr/src/uts/sun4v/io/vcc.c b/usr/src/uts/sun4v/io/vcc.c index feeaf03e8f..85f722e467 100644 --- a/usr/src/uts/sun4v/io/vcc.c +++ b/usr/src/uts/sun4v/io/vcc.c @@ -24,6 +24,9 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ #include <sys/types.h> #include <sys/file.h> @@ -2456,7 +2459,7 @@ vcc_chpoll(dev_t dev, short events, int anyyet, short *reventsp, *reventsp |= (events & POLLIN); } - if (((*reventsp) == 0) && (!anyyet)) { + if ((((*reventsp) == 0) && (!anyyet)) || (events & POLLET)) { *phpp = &vport->poll; if (events & POLLIN) { mutex_enter(&vport->lock); |