diff options
author | Bryan Cantrill <bryan@joyent.com> | 2015-02-14 16:55:35 -0800 |
---|---|---|
committer | Robert Mustacchi <rm@joyent.com> | 2015-10-16 11:59:14 -0700 |
commit | a5eb7107f06a6e23e8e77e8d3a84c1ff90a73ac6 (patch) | |
tree | 70fac1fa3fb719f5145ff6db721af2c343faa4f2 /usr/src | |
parent | 7509ca605713ac7f244b0e812b1712dd25f04da1 (diff) | |
download | illumos-joyent-a5eb7107f06a6e23e8e77e8d3a84c1ff90a73ac6.tar.gz |
5640 want epoll support
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Diffstat (limited to 'usr/src')
39 files changed, 1568 insertions, 113 deletions
diff --git a/usr/src/cmd/truss/codes.c b/usr/src/cmd/truss/codes.c index 069268dc05..4e453da0c1 100644 --- a/usr/src/cmd/truss/codes.c +++ b/usr/src/cmd/truss/codes.c @@ -23,7 +23,7 @@ * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2014, OmniTI Computer Consulting, Inc. All rights reserved. */ @@ -703,6 +703,8 @@ const struct ioc { /* /dev/poll ioctl() control codes */ { (uint_t)DP_POLL, "DP_POLL", NULL }, { (uint_t)DP_ISPOLLED, "DP_ISPOLLED", NULL }, + { (uint_t)DP_PPOLL, "DP_PPOLL", NULL }, + { (uint_t)DP_EPOLLCOMPAT, "DP_EPOLLCOMPAT", NULL }, /* the old /proc ioctl() control codes */ #define PIOC ('q'<<8) { (uint_t)(PIOC|1), "PIOCSTATUS", NULL }, diff --git a/usr/src/lib/libc/amd64/Makefile b/usr/src/lib/libc/amd64/Makefile index 0c1421bbf2..b5e54b19fa 100644 --- a/usr/src/lib/libc/amd64/Makefile +++ b/usr/src/lib/libc/amd64/Makefile @@ -863,6 +863,7 @@ PORTSYS= \ chmod.o \ chown.o \ corectl.o \ + epoll.o \ exacctsys.o \ execl.o \ execle.o \ diff --git a/usr/src/lib/libc/i386/Makefile.com b/usr/src/lib/libc/i386/Makefile.com index 9a76280c0a..d7e77502f2 100644 --- a/usr/src/lib/libc/i386/Makefile.com +++ b/usr/src/lib/libc/i386/Makefile.com @@ -903,6 +903,7 @@ PORTSYS= \ chmod.o \ chown.o \ corectl.o \ + epoll.o \ eventfd.o \ exacctsys.o \ execl.o \ diff --git a/usr/src/lib/libc/port/mapfile-vers b/usr/src/lib/libc/port/mapfile-vers index 0b683fc923..017c7c31bc 100644 --- a/usr/src/lib/libc/port/mapfile-vers +++ b/usr/src/lib/libc/port/mapfile-vers @@ -93,6 +93,15 @@ $if _x86 && _ELF64 $add amd64 $endif +SYMBOL_VERSION ILLUMOS_0.15 { # epoll(3C) + protected: + epoll_create; + epoll_create1; + epoll_ctl; + epoll_wait; + epoll_pwait; +} ILLUMOS_0.14; + SYMBOL_VERSION ILLUMOS_0.14 { # 
strerror_l protected: strerror_l; diff --git a/usr/src/lib/libc/port/sys/epoll.c b/usr/src/lib/libc/port/sys/epoll.c new file mode 100644 index 0000000000..93379b583e --- /dev/null +++ b/usr/src/lib/libc/port/sys/epoll.c @@ -0,0 +1,207 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/epoll.h> +#include <sys/devpoll.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <poll.h> + +/* + * Events that match their epoll(7) equivalents. + */ +#if EPOLLIN != POLLIN +#error value of EPOLLIN does not match value of POLLIN +#endif + +#if EPOLLPRI != POLLPRI +#error value of EPOLLPRI does not match value of POLLPRI +#endif + +#if EPOLLOUT != POLLOUT +#error value of EPOLLOUT does not match value of POLLOUT +#endif + +#if EPOLLRDNORM != POLLRDNORM +#error value of EPOLLRDNORM does not match value of POLLRDNORM +#endif + +#if EPOLLRDBAND != POLLRDBAND +#error value of EPOLLRDBAND does not match value of POLLRDBAND +#endif + +#if EPOLLERR != POLLERR +#error value of EPOLLERR does not match value of POLLERR +#endif + +#if EPOLLHUP != POLLHUP +#error value of EPOLLHUP does not match value of POLLHUP +#endif + +/* + * Events that we ignore entirely. They can be set in events, but they will + * never be returned. + */ +#define EPOLLIGNORED (EPOLLMSG | EPOLLWAKEUP) + +/* + * Events that we swizzle into other bit positions. 
+ */ +#define EPOLLSWIZZLED \ + (EPOLLRDHUP | EPOLLONESHOT | EPOLLET | EPOLLWRBAND | EPOLLWRNORM) + +int +epoll_create(int size) +{ + int fd; + + /* + * From the epoll_create() man page: "Since Linux 2.6.8, the size + * argument is ignored, but must be greater than zero." You keep using + * that word "ignored"... + */ + if (size <= 0) { + errno = EINVAL; + return (-1); + } + + if ((fd = open("/dev/poll", O_RDWR)) == -1) + return (-1); + + if (ioctl(fd, DP_EPOLLCOMPAT, 0) == -1) { + (void) close(fd); + return (-1); + } + + return (fd); +} + +int +epoll_create1(int flags) +{ + int fd, oflags = O_RDWR; + + if (flags & EPOLL_CLOEXEC) + oflags |= O_CLOEXEC; + + if ((fd = open("/dev/poll", oflags)) == -1) + return (-1); + + if (ioctl(fd, DP_EPOLLCOMPAT, 0) == -1) { + (void) close(fd); + return (-1); + } + + return (fd); +} + +int +epoll_ctl(int epfd, int op, int fd, struct epoll_event *event) +{ + dvpoll_epollfd_t epoll[2]; + uint32_t events, ev = 0; + int i = 0; + + epoll[i].dpep_pollfd.fd = fd; + + switch (op) { + case EPOLL_CTL_DEL: + ev = POLLREMOVE; + break; + + case EPOLL_CTL_MOD: + /* + * In the modify case, we pass down two events: one to + * remove the event and another to add it back. + */ + epoll[i++].dpep_pollfd.events = POLLREMOVE; + epoll[i].dpep_pollfd.fd = fd; + /* FALLTHROUGH */ + + case EPOLL_CTL_ADD: + /* + * Mask off the events that we ignore, and then swizzle the + * events for which our values differ from their epoll(7) + * equivalents. 
+ */ + events = event->events; + ev = events & ~(EPOLLIGNORED | EPOLLSWIZZLED); + + if (events & EPOLLRDHUP) + ev |= POLLRDHUP; + + if (events & EPOLLET) + ev |= POLLET; + + if (events & EPOLLONESHOT) + ev |= POLLONESHOT; + + if (events & EPOLLWRNORM) + ev |= POLLWRNORM; + + if (events & EPOLLWRBAND) + ev |= POLLWRBAND; + + epoll[i].dpep_data = event->data.u64; + break; + + default: + errno = EOPNOTSUPP; + return (-1); + } + + epoll[i].dpep_pollfd.events = ev; + + return (write(epfd, epoll, sizeof (epoll[0]) * (i + 1)) == -1 ? -1 : 0); +} + +int +epoll_wait(int epfd, struct epoll_event *events, + int maxevents, int timeout) +{ + struct dvpoll arg; + + if (maxevents <= 0) { + errno = EINVAL; + return (-1); + } + + arg.dp_nfds = maxevents; + arg.dp_timeout = timeout; + arg.dp_fds = (pollfd_t *)events; + + return (ioctl(epfd, DP_POLL, &arg)); +} + +int +epoll_pwait(int epfd, struct epoll_event *events, + int maxevents, int timeout, const sigset_t *sigmask) +{ + struct dvpoll arg; + + if (maxevents <= 0) { + errno = EINVAL; + return (-1); + } + + arg.dp_nfds = maxevents; + arg.dp_timeout = timeout; + arg.dp_fds = (pollfd_t *)events; + arg.dp_setp = (sigset_t *)sigmask; + + return (ioctl(epfd, DP_PPOLL, &arg)); +} diff --git a/usr/src/lib/libc/sparc/Makefile.com b/usr/src/lib/libc/sparc/Makefile.com index 3856c5332c..dc965fe6ac 100644 --- a/usr/src/lib/libc/sparc/Makefile.com +++ b/usr/src/lib/libc/sparc/Makefile.com @@ -937,6 +937,7 @@ PORTSYS= \ chmod.o \ chown.o \ corectl.o \ + epoll.o \ eventfd.o \ exacctsys.o \ execl.o \ diff --git a/usr/src/lib/libc/sparcv9/Makefile.com b/usr/src/lib/libc/sparcv9/Makefile.com index 1a65ab7680..415aaf2be2 100644 --- a/usr/src/lib/libc/sparcv9/Makefile.com +++ b/usr/src/lib/libc/sparcv9/Makefile.com @@ -882,6 +882,7 @@ PORTSYS= \ chown.o \ corectl.o \ eventfd.o \ + epoll.o \ exacctsys.o \ execl.o \ execle.o \ diff --git a/usr/src/man/man3c/Makefile b/usr/src/man/man3c/Makefile index eeb6267192..f6cadebe95 100644 --- 
a/usr/src/man/man3c/Makefile +++ b/usr/src/man/man3c/Makefile @@ -110,6 +110,9 @@ MANFILES= __fbufsize.3c \ enable_extended_FILE_stdio.3c \ encrypt.3c \ end.3c \ + epoll_create.3c \ + epoll_ctl.3c \ + epoll_wait.3c \ err.3c \ euclen.3c \ eventfd.3c \ @@ -733,6 +736,8 @@ MANLINKS= FD_CLR.3c \ endusershell.3c \ endutent.3c \ endutxent.3c \ + epoll_create1.3c \ + epoll_pwait.3c \ erand48.3c \ errno.3c \ errx.3c \ @@ -1582,6 +1587,9 @@ _etext.3c := LINKSRC = end.3c edata.3c := LINKSRC = end.3c etext.3c := LINKSRC = end.3c +epoll_create1.3c := LINKSRC = epoll_create.3c +epoll_pwait.3c := LINKSRC = epoll_wait.3c + errx.3c := LINKSRC = err.3c verr.3c := LINKSRC = err.3c verrx.3c := LINKSRC = err.3c diff --git a/usr/src/man/man3c/epoll_create.3c b/usr/src/man/man3c/epoll_create.3c new file mode 100644 index 0000000000..6f54f638f7 --- /dev/null +++ b/usr/src/man/man3c/epoll_create.3c @@ -0,0 +1,99 @@ +'\" te +.\" Copyright (c) 2014, Joyent, Inc. All Rights Reserved. +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. +.TH EPOLL_CREATE 3C "Apr 17, 2014" +.SH NAME +epoll_create, epoll_create1 \- create an epoll instance +.SH SYNOPSIS + +.LP +.nf +#include <sys/epoll.h> + +\fBint\fR \fBepoll_create\fR(\fBint\fR \fIsize\fR); +.fi + +.LP +.nf +\fBint\fR \fBepoll_create1\fR(\fBint\fR \fIflags\fR); +.fi + +.SH DESCRIPTION +.LP +The \fBepoll_create()\fR and \fBepoll_create1()\fR functions both create an +\fBepoll\fR(5) instance that can be operated upon via \fBepoll_ctl\fR(3C), +\fBepoll_wait\fR(3C) and \fBepoll_pwait\fR(3C). 
\fBepoll\fR instances are +represented as file descriptors, and should be closed via \fBclose\fR(2). + +The only difference between the two functions is their signature; +\fBepoll_create()\fR takes a size argument that +is vestigial and is only meaningful in as much as it must be greater than +zero, while \fBepoll_create1()\fR takes a flags argument that can have +any of the following values: + +.sp +.ne 2 +.na +\fBEPOLL_CLOEXEC\fR +.ad +.RS 12n +Instance should be closed upon an +\fBexec\fR(2); see \fBopen\fR(2)'s description of \fBO_CLOEXEC\fR. +.RE + +.SH RETURN VALUES +.LP +Upon successful completion, a file descriptor for the new \fBepoll\fR instance is returned. Otherwise, -1 is returned and errno +is set to indicate the error. +.SH ERRORS +.LP +The \fBepoll_create()\fR and \fBepoll_create1()\fR functions will fail if: +.sp +.ne 2 +.na +\fB\fBEINVAL\fR\fR +.ad +.RS 10n +Either the \fIsize\fR is not greater than zero (\fBepoll_create()\fR) or the \fIflags\fR +are invalid (\fBepoll_create1()\fR). +.RE + +.sp +.ne 2 +.na +\fB\fBEMFILE\fR\fR +.ad +.RS 10n +There are currently {\fBOPEN_MAX\fR} file descriptors open in the calling +process. +.RE + +.sp +.ne 2 +.na +\fB\fBENFILE\fR\fR +.ad +.RS 10n +The maximum allowable number of files is currently open in the system. +.RE + +.sp +.SH NOTES +.LP + +The \fBepoll\fR(5) facility is implemented for purposes of offering +compatibility for Linux-borne applications; native +applications should continue to prefer using event ports via the +\fBport_create\fR(3C), \fBport_associate\fR(3C) and \fBport_get\fR(3C) +interfaces. See \fBepoll\fR(5) for compatibility details and restrictions. + +.SH SEE ALSO +.LP +\fBepoll_ctl\fR(3C), \fBepoll_wait\fR(3C), \fBepoll\fR(5) diff --git a/usr/src/man/man3c/epoll_ctl.3c b/usr/src/man/man3c/epoll_ctl.3c new file mode 100644 index 0000000000..19c02f2abb --- /dev/null +++ b/usr/src/man/man3c/epoll_ctl.3c @@ -0,0 +1,309 @@ +'\" te +.\" Copyright (c) 2014, Joyent, Inc. All Rights Reserved. 
+.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. +.TH EPOLL_CTL 3C "Apr 17, 2014" +.SH NAME +epoll_ctl \- control an epoll instance +.SH SYNOPSIS + +.LP +.nf +#include <sys/epoll.h> + +\fBint\fR \fBepoll_ctl\fR(\fBint\fR \fIepfd\fR, \fBint\fR \fIop\fR, \fBint\fR \fIfd\fR, \fBstruct epoll_event *\fR\fIevent\fR); +.fi + +.SH DESCRIPTION +.LP +The \fBepoll_ctl()\fR function executes the operation specified by +\fIop\fR (as parameterized by \fIevent\fR) on the \fIepfd\fR epoll instance. +Valid values for \fIop\fR: + +.sp +.ne 2 +.na +\fBEPOLL_CTL_ADD\fR +.ad +.RS 12n +For the \fBepoll\fR(5) instance specified by \fIepfd\fR, +associate the file descriptor specified by \fIfd\fR with the event specified +by \fIevent\fR. +.RE + +.sp +.ne 2 +.na +\fBEPOLL_CTL_DEL\fR +.ad +.RS 12n +For the \fBepoll\fR(5) instance specified by \fIepfd\fR, +remove all event associations for the file descriptor specified by \fIfd\fR. +\fIevent\fR is ignored, and may be NULL. +.RE + +.sp +.ne 2 +.na +\fBEPOLL_CTL_MOD\fR +.ad +.RS 12n +For the \fBepoll\fR(5) instance specified by \fIepfd\fR, modify the event +association for the file descriptor specified by \fIfd\fR to be that +specified by \fIevent\fR. + +.RE + +The \fIevent\fR parameter has the following structure: + +.in +4 +.nf +typedef union epoll_data { + void *ptr; + int fd; + uint32_t u32; + uint64_t u64; +} epoll_data_t; + +struct epoll_event { + uint32_t events; + epoll_data_t data; +}; +.fi +.in -4 + +The \fIdata\fR field specifies the datum to +be associated with the event and +will be returned via \fBepoll_wait\fR(3C). 
+The \fIevents\fR field denotes both the desired events (when specified via +\fBepoll_ctl()\fR) and the events that have occurred (when returned via +\fBepoll_wait\fR(3C)). +In either case, the +\fIevents\fR field is a bitmask constructed by a logical \fBOR\fR operation +of any combination of the following event flags: + +.sp +.ne 2 +.na +\fBEPOLLIN\fR +.ad +.RS 14n +Data other than high priority data may be read without blocking. For streams, +this flag is set in the returned \fIevents\fR even if the message is of +zero length. +.RE + +.sp +.ne 2 +.na +\fBEPOLLPRI\fR +.ad +.RS 14n +High priority data may be read without blocking. For streams, +this flag is set in the returned \fIevents\fR even if the message is of zero +length. +.RE + +.sp +.ne 2 +.na +\fBEPOLLOUT\fR +.ad +.RS 14n +Normal data (priority band equals 0) may be written without blocking. +.RE + +.sp +.ne 2 +.na +\fBEPOLLRDNORM\fR +.ad +.RS 14n +Normal data (priority band equals 0) may be read without blocking. For streams, +this flag is set in the returned \fIevents\fR even if the message is of +zero length. +.RE + +.sp +.ne 2 +.na +\fBEPOLLRDBAND\fR +.ad +.RS 14n +Data from a non-zero priority band may be read without blocking. For streams, +this flag is set in the returned \fIevents\fR even if the message is of +zero length. +.RE + +.sp +.ne 2 +.na +\fBEPOLLWRNORM\fR +.ad +.RS 14n +The same as \fBEPOLLOUT\fR. +.RE + +.sp +.ne 2 +.na +\fBEPOLLWRBAND\fR +.ad +.RS 14n +Priority data (priority band > 0) may be written. This event only examines +bands that have been written to at least once. +.RE + +.sp +.ne 2 +.na +\fBEPOLLMSG\fR +.ad +.RS 14n +This exists only for backwards binary and source compatibility with Linux; +it has no meaning and is ignored. +.RE + +.sp +.ne 2 +.na +\fBEPOLLERR\fR +.ad +.RS 14n +An error has occurred on the device or stream. This flag is only valid in the +returned \fIevents\fR field. 
+.RE + +.sp +.ne 2 +.na +\fBEPOLLHUP\fR +.ad +.RS 14n +A hangup has occurred on the stream. This event and \fBEPOLLOUT\fR are mutually +exclusive; a stream can never be writable if a hangup has occurred. However, +this event and \fBEPOLLIN\fR, \fBEPOLLRDNORM\fR, \fBEPOLLRDBAND\fR, +\fBEPOLLRDHUP\fR or +\fBEPOLLPRI\fR are not mutually exclusive. This flag is only valid in +the \fIevents\fR field returned from \fBepoll_wait\fR(3C); it is not used +in the \fIevents\fR field specified via \fBepoll_ctl()\fR. +.RE + +.sp +.ne 2 +.na +\fBEPOLLRDHUP\fR +.ad +.RS 14n +The stream socket peer shut down the writing half of the connection and no +further data will be readable via the socket. This event is not mutually +exclusive with \fBEPOLLIN\fR. +.RE + +.sp +.ne 2 +.na +\fBEPOLLWAKEUP\fR +.ad +.RS 14n +This exists only for backwards binary and source compatibility with Linux; +it has no meaning and is ignored. +.RE + +.sp +.ne 2 +.na +\fBEPOLLONESHOT\fR +.ad +.RS 14n +Sets the specified event to be in one-shot mode, whereby the event association +with the \fBepoll\fR(5) instance specified by \fIepfd\fR is removed atomically +as the event is returned via \fBepoll_wait\fR(3C). Use of this mode allows +for resolution of some of the +races inherent in multithreaded use of \fBepoll_wait\fR(3C). +.RE + +.sp +.ne 2 +.na +\fBEPOLLET\fR +.ad +.RS 14n +Sets the specified event to be in edge-triggered mode instead of the default +mode of level-triggered. In this mode, events will be induced by +transitions on an event source rather than the state of the event source. +While perhaps superficially appealing, this mode introduces several new +potential failure modes for user-level software and should be used +with caution. +.RE + +.SH RETURN VALUES +.LP +Upon successful completion, \fBepoll_ctl()\fR returns 0. +If an error occurs, -1 is returned and errno is set to indicate +the error. 
+ +.SH ERRORS +.LP +\fBepoll_ctl()\fR will fail if: +.sp +.ne 2 +.na +\fB\fBEBADF\fR\fR +.ad +.RS 10n +\fIepfd\fR is not a valid file descriptor. +.RE + +.sp +.ne 2 +.na +\fB\fBEFAULT\fR\fR +.ad +.RS 10n +The memory associated with \fIevent\fR was not mapped. +.RE + +.sp +.ne 2 +.na +\fB\fBEEXIST\fR\fR +.ad +.RS 10n +The operation specified was \fBEPOLL_CTL_ADD\fR and the specified file +descriptor is already associated with an event for the specified +\fBepoll\fR(5) instance. +.RE + +.sp +.ne 2 +.na +\fB\fBENOENT\fR\fR +.ad +.RS 10n +The operation specified was \fBEPOLL_CTL_MOD\fR or \fBEPOLL_CTL_DEL\fR and +the specified file descriptor is not associated with an event for the +specified \fBepoll\fR(5) instance. +.RE + +.sp +.SH NOTES +.LP + +The \fBepoll\fR(5) facility is implemented for purposes of offering +compatibility for Linux-borne applications; native +applications should continue to prefer using event ports via the +\fBport_create\fR(3C), \fBport_associate\fR(3C) and \fBport_get\fR(3C) +interfaces. See \fBepoll\fR(5) for compatibility details and restrictions. + +.SH SEE ALSO +.LP +\fBepoll_create\fR(3C), \fBepoll_wait\fR(3C), +\fBport_create\fR(3C), \fBport_associate\fR(3C), \fBport_get\fR(3C), +\fBepoll\fR(5) diff --git a/usr/src/man/man3c/epoll_wait.3c b/usr/src/man/man3c/epoll_wait.3c new file mode 100644 index 0000000000..6ae9e0f9c4 --- /dev/null +++ b/usr/src/man/man3c/epoll_wait.3c @@ -0,0 +1,108 @@ +'\" te +.\" Copyright (c) 2014, Joyent, Inc. All Rights Reserved. +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. 
+.TH EPOLL_WAIT 3C "Apr 17, 2014" +.SH NAME +epoll_wait, epoll_pwait \- wait for epoll events +.SH SYNOPSIS + +.LP +.nf +#include <sys/epoll.h> + +\fBint\fR \fBepoll_wait\fR(\fBint\fR \fIepfd\fR, \fBstruct epoll_event *\fR\fIevents\fR, + \fBint\fR \fImaxevents\fR, \fBint\fR \fItimeout\fR); +.fi + +.LP +.nf +\fBint\fR \fBepoll_pwait\fR(\fBint\fR \fIepfd\fR, \fBstruct epoll_event *\fR\fIevents\fR, + \fBint\fR \fImaxevents\fR, \fBint\fR \fItimeout\fR, + \fBconst sigset_t *\fR\fIsigmask\fR); +.fi + +.SH DESCRIPTION +.LP +The \fBepoll_wait()\fR function waits for events on the \fBepoll\fR(5) +instance specified by \fIepfd\fR. The \fIevents\fR parameter must point to +an array of \fImaxevents\fR \fIepoll_event\fR structures to be +filled in with pending events. The \fItimeout\fR argument specifies the +number of milliseconds to wait for an event if none is pending. A +\fItimeout\fR of -1 denotes an infinite timeout. + +The \fBepoll_pwait()\fR is similar to \fBepoll_wait()\fR, but takes an +additional \fIsigmask\fR argument that specifies the desired signal mask +when \fBepoll_pwait()\fR is blocked. It is equivalent to atomically +setting the signal mask, calling \fBepoll_wait()\fR, and restoring the +signal mask upon return, and is therefore similar to the relationship +between \fBselect\fR(3C) and \fBpselect\fR(3C). + +.SH RETURN VALUES +.LP +Upon successful completion, \fBepoll_wait()\fR and \fBepoll_pwait()\fR return +the number of events, or 0 if none was pending and \fItimeout\fR milliseconds +elapsed. If an error occurs, -1 is returned and errno is set to indicate +the error. + +.SH ERRORS +.LP +The \fBepoll_wait()\fR and \fBepoll_pwait()\fR functions will fail if: +.sp +.ne 2 +.na +\fB\fBEBADF\fR\fR +.ad +.RS 10n +\fIepfd\fR is not a valid file descriptor. +.RE + +.sp +.ne 2 +.na +\fB\fBEFAULT\fR\fR +.ad +.RS 10n +The memory associated with \fIevents\fR was not mapped or was not writable. 
+.RE + +.sp +.ne 2 +.na +\fB\fBEINTR\fR\fR +.ad +.RS 10n +A signal was received during the \fBepoll_wait()\fR or \fBepoll_pwait()\fR. +.RE + +.sp +.ne 2 +.na +\fB\fBEINVAL\fR\fR +.ad +.RS 10n +Either \fIepfd\fR is not a valid \fBepoll\fR(5) instance or \fImaxevents\fR +is not greater than zero. +.RE + +.sp +.SH NOTES +.LP + +The \fBepoll\fR(5) facility is implemented for purposes of offering +compatibility for Linux-borne applications; native +applications should continue to prefer using event ports via the +\fBport_create\fR(3C), \fBport_associate\fR(3C) and \fBport_get\fR(3C) +interfaces. See \fBepoll\fR(5) for compatibility details and restrictions. + +.SH SEE ALSO +.LP +\fBepoll_create\fR(3C), \fBepoll_ctl\fR(3C), +\fBport_create\fR(3C), \fBport_associate\fR(3C), \fBport_get\fR(3C), +\fBpselect\fR(3C), \fBepoll\fR(5) diff --git a/usr/src/man/man5/Makefile b/usr/src/man/man5/Makefile index 7c928f3473..4784603013 100644 --- a/usr/src/man/man5/Makefile +++ b/usr/src/man/man5/Makefile @@ -41,6 +41,7 @@ MANFILES= Intro.5 \ device_clean.5 \ dhcp.5 \ environ.5 \ + epoll.5 \ eqn.5 \ eqnchar.5 \ eventfd.5 \ diff --git a/usr/src/man/man5/epoll.5 b/usr/src/man/man5/epoll.5 new file mode 100644 index 0000000000..860b2bb91f --- /dev/null +++ b/usr/src/man/man5/epoll.5 @@ -0,0 +1,110 @@ +'\" te +.\" Copyright (c) 2014, Joyent, Inc. All Rights Reserved. +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. 
+.TH EPOLL 5 "Apr 17, 2014" +.SH NAME +epoll \- Linux-compatible I/O event notification facility +.SH SYNOPSIS + +.LP +.nf +#include <sys/epoll.h> +.fi + +.SH DESCRIPTION +.LP + +\fBepoll\fR is a facility for efficient event-oriented I/O that has a +similar model to \fBpoll\fR(2), but does not necessitate rescanning a +set of file descriptors to wait for an event. \fBepoll\fR is of Linux +origins, and this facility is designed to be binary-compatible with +the Linux facility, including the following interfaces: + +.RS +4 +.TP +.ie t \(bu +.el o +\fBepoll_create\fR(3C) creates an \fBepoll\fR instance, returning a file +descriptor. It contains a size argument which is meaningful only in as +much as it must be greater than 0. +.RE +.RS +4 +.TP +.ie t \(bu +.el o +\fBepoll_create1\fR(3C) also creates an \fBepoll\fR instance, but eliminates +the meaningless size argument -- replacing it instead with a flags +argument. +.RE +.RS +4 +.TP +.ie t \(bu +.el o +\fBepoll_ctl\fR(3C) allows file descriptors to be added +(via \fBEPOLL_CTL_ADD\fR), deleted (via \fBEPOLL_CTL_DEL\fR) or +modified (via \fBEPOLL_CTL_MOD\fR) with respect to the \fBepoll\fR'd set +of file descriptors. +.RE +.RS +4 +.TP +.ie t \(bu +.el o +\fBepoll_wait\fR(3C) fetches pending events for file descriptors added +via \fBepoll_ctl\fR(3C), blocking the caller if no such events are pending. +.RE +.RS +4 +.TP +.ie t \(bu +.el o +\fBepoll_pwait\fR(3C) operates in a similar manner to \fBepoll_wait\fR(3C), but +allows the caller to specify a signal mask to be set atomically with respect +to waiting for events. +.RE + +.sp +.SH NOTES +.LP + +The \fBepoll\fR facility is implemented +for purposes of offering compatibility to and portability of Linux-borne +applications; native applications should continue to prefer using event ports +via the \fBport_create\fR(3C), +\fBport_associate\fR(3C) and \fBport_getn\fR(3C) interfaces. 
+In particular, use of \fBepoll\fR in a multithreaded environment is fraught +with peril; even when using \fBEPOLLONESHOT\fR for one-shot events, +there are race conditions with respect to \fBclose\fR(2) that are unresolvable. +(For more details, see the aborted effort in Linux to resolve this via the +proposed +\fBEPOLL_CTL_DISABLE\fR operation.) +The event port facility -- like the BSD kqueue facility that inspired it -- +is designed to deal with such issues via explicit event source dissociation. + +While a best effort has been made to mimic the Linux semantics, there +are some semantics that are too peculiar or ill-conceived to merit +accommodation. In particular, the Linux \fBepoll\fR facility will -- by +design -- continue to generate events for closed file descriptors where/when +the underlying file description remains open. For example, if one were +to \fBfork\fR(2) and subsequently close an actively \fBepoll\fR'd file +descriptor in the parent, +any events generated in the child on the implicitly duplicated file descriptor +will continue to be delivered to the parent -- despite the fact that the +parent itself no longer has any notion of the file description! +This \fBepoll\fR facility refuses to honor +these semantics; closing the \fBEPOLL_CTL_ADD\fR'd file descriptor +will always result in no further +events being generated for that event description. + +.SH SEE ALSO +.LP +\fBepoll_create\fR(3C), \fBepoll_create1\fR(3C), \fBepoll_ctl\fR(3C), +\fBepoll_wait\fR(3C), \fBepoll_pwait\fR(3C), +\fBport_create\fR(3C), \fBport_associate\fR(3C), \fBport_dissociate\fR(3C), +\fBport_get\fR(3C), +\fBpselect\fR(3C) diff --git a/usr/src/man/man7d/poll.7d b/usr/src/man/man7d/poll.7d index cd3db77de9..e3d9e074aa 100644 --- a/usr/src/man/man7d/poll.7d +++ b/usr/src/man/man7d/poll.7d @@ -17,7 +17,6 @@ int n = ioctl(int fd, DP_ISPOLLED, struct pollfd* pfd);\fR .fi .SH PARAMETERS -.sp .ne 2 .na \fB\fIfd\fR \fR @@ -73,15 +72,6 @@ Pointer to \fBpollfd\fR structure. 
.SH DESCRIPTION .LP -Note - -.sp -.RS 2 -The \fB/dev/poll\fR device, associated driver and corresponding manpages may be -removed in a future Solaris release. For similar functionality in the event -ports framework, see \fBport_create\fR(3C). -.RE -.sp -.LP The \fB/dev/poll\fR driver is a special driver that enables you to monitor multiple sets of polled file descriptors. By using the \fB/dev/poll\fR driver, you can efficiently poll large numbers of file descriptors. Access to @@ -165,7 +155,6 @@ currently polled \fBevents\fR. The ioctl returns \fB0\fR if the file descriptor is not in the set. The \fBpollfd\fR structure pointed by \fIpfd\fR is not modified. The ioctl returns a \fB-1\fR if the call fails. .SH EXAMPLES -.sp .LP The following example shows how \fB/dev/poll\fR may be used. .sp @@ -293,7 +282,6 @@ The following example is part of a test program which shows how .in -2 .SH ERRORS -.sp .ne 2 .na \fB\fBEACCES\fR \fR @@ -347,7 +335,6 @@ special file does not exist. .RE .SH ATTRIBUTES -.sp .LP See \fBattributes\fR(5) for a description of the following attributes: .sp @@ -364,11 +351,9 @@ MT-Level Safe .TE .SH SEE ALSO -.sp .LP \fBopen\fR(2), \fBpoll\fR(2), \fBwrite\fR(2), \fBattributes\fR(5) .SH NOTES -.sp .LP The \fB/dev/poll\fR API is particularly beneficial to applications that poll a large number of file descriptors repeatedly. Applications will exhibit the diff --git a/usr/src/man/man9e/chpoll.9e b/usr/src/man/man9e/chpoll.9e index 27fe2a20e9..468ef7b53f 100644 --- a/usr/src/man/man9e/chpoll.9e +++ b/usr/src/man/man9e/chpoll.9e @@ -121,6 +121,17 @@ The same as \fBPOLLOUT\fR. Priority data (priority band > 0) may be written. .RE +.sp +.ne 2 +.na +\fB\fBPOLLET\fR\fR +.ad +.RS 14n +The desired event is to be edge-triggered; calls to \fBpollwakeup\fR(9F) +should not be suppressed, even if the event is pending at the time of +call to the \fBchpoll()\fR function. +.RE + .RE .sp @@ -197,7 +208,11 @@ be called with multiple events at one time. 
The \fBpollwakup()\fR can be called regardless of whether or not the \fBchpoll()\fR entry is called; it should be called every time the driver detects the pollable event. The driver must not hold any mutex across the call to \fBpollwakeup\fR(9F) that is acquired in its -\fBchpoll()\fR entry point, or a deadlock may result. +\fBchpoll()\fR entry point, or a deadlock may result. Note that if +\fBPOLLET\fR is set in the specified events, the driver must call +\fBpollwakeup\fR(9F) on subsequent events, even if events are pending at +the time of the call to \fBchpoll()\fR. + .RE .SH RETURN VALUES .LP diff --git a/usr/src/pkg/manifests/system-header.mf b/usr/src/pkg/manifests/system-header.mf index 54ba88c061..b72d713cd8 100644 --- a/usr/src/pkg/manifests/system-header.mf +++ b/usr/src/pkg/manifests/system-header.mf @@ -954,6 +954,7 @@ file path=usr/include/sys/elf_amd64.h file path=usr/include/sys/elf_notes.h file path=usr/include/sys/elftypes.h file path=usr/include/sys/epm.h +file path=usr/include/sys/epoll.h file path=usr/include/sys/errno.h file path=usr/include/sys/errorq.h file path=usr/include/sys/errorq_impl.h diff --git a/usr/src/pkg/manifests/system-kernel.man5.inc b/usr/src/pkg/manifests/system-kernel.man5.inc index ab1d31cea6..090344a9c3 100644 --- a/usr/src/pkg/manifests/system-kernel.man5.inc +++ b/usr/src/pkg/manifests/system-kernel.man5.inc @@ -12,6 +12,7 @@ # Copyright 2011, Richard Lowe # Copyright 2014 Garrett D'Amore <garrett@damore.org> +file path=usr/share/man/man5/epoll.5 file path=usr/share/man/man5/fsattr.5 file path=usr/share/man/man5/ieee802.11.5 file path=usr/share/man/man5/ieee802.3.5 diff --git a/usr/src/pkg/manifests/system-library.man3c.inc b/usr/src/pkg/manifests/system-library.man3c.inc index 3b67d7408b..27268505b3 100644 --- a/usr/src/pkg/manifests/system-library.man3c.inc +++ b/usr/src/pkg/manifests/system-library.man3c.inc @@ -105,6 +105,9 @@ file path=usr/share/man/man3c/ecvt.3c file 
path=usr/share/man/man3c/enable_extended_FILE_stdio.3c file path=usr/share/man/man3c/encrypt.3c file path=usr/share/man/man3c/end.3c +file path=usr/share/man/man3c/epoll_create.3c +file path=usr/share/man/man3c/epoll_ctl.3c +file path=usr/share/man/man3c/epoll_wait.3c file path=usr/share/man/man3c/err.3c file path=usr/share/man/man3c/euclen.3c file path=usr/share/man/man3c/eventfd.3c @@ -732,6 +735,8 @@ link path=usr/share/man/man3c/endspent.3c target=getspnam.3c link path=usr/share/man/man3c/endusershell.3c target=getusershell.3c link path=usr/share/man/man3c/endutent.3c target=getutent.3c link path=usr/share/man/man3c/endutxent.3c target=getutxent.3c +link path=usr/share/man/man3c/epoll_create1.3c target=epoll_create.3c +link path=usr/share/man/man3c/epoll_pwait.3c target=epoll_wait.3c link path=usr/share/man/man3c/erand48.3c target=drand48.3c link path=usr/share/man/man3c/errno.3c target=perror.3c link path=usr/share/man/man3c/errx.3c target=err.3c diff --git a/usr/src/uts/common/fs/fifofs/fifovnops.c b/usr/src/uts/common/fs/fifofs/fifovnops.c index ac89e430c7..fee2924093 100644 --- a/usr/src/uts/common/fs/fifofs/fifovnops.c +++ b/usr/src/uts/common/fs/fifofs/fifovnops.c @@ -27,7 +27,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ /* * FIFOFS file system vnode operations. 
This file system @@ -1832,17 +1834,16 @@ fifo_poll(vnode_t *vp, short events, int anyyet, short *reventsp, } /* - * if we happened to get something, return + * if we happened to get something and we're not edge-triggered, return */ - - if ((*reventsp = (short)retevents) != 0) { + if ((*reventsp = (short)retevents) != 0 && !(events & POLLET)) { mutex_exit(&fnp->fn_lock->flk_lock); return (0); } /* - * If poll() has not found any events yet, set up event cell - * to wake up the poll if a requested event occurs on this + * If poll() has not found any events yet or we're edge-triggered, set + * up event cell to wake up the poll if a requested event occurs on this * pipe/fifo. */ if (!anyyet) { diff --git a/usr/src/uts/common/fs/portfs/port_vnops.c b/usr/src/uts/common/fs/portfs/port_vnops.c index b2f5088e06..ab95c0a1f8 100644 --- a/usr/src/uts/common/fs/portfs/port_vnops.c +++ b/usr/src/uts/common/fs/portfs/port_vnops.c @@ -24,6 +24,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + #include <sys/types.h> #include <sys/vnode.h> #include <sys/vfs_opreg.h> @@ -294,14 +298,10 @@ port_poll(vnode_t *vp, short events, int anyyet, short *reventsp, levents |= POLLOUT; levents &= events; *reventsp = levents; - if (levents == 0) { - if (!anyyet) { - *phpp = &pp->port_pollhd; - portq->portq_flags |= - events & POLLIN ? PORTQ_POLLIN : 0; - portq->portq_flags |= - events & POLLOUT ? PORTQ_POLLOUT : 0; - } + if ((levents == 0 && !anyyet) || (events & POLLET)) { + *phpp = &pp->port_pollhd; + portq->portq_flags |= events & POLLIN ? PORTQ_POLLIN : 0; + portq->portq_flags |= events & POLLOUT ? 
PORTQ_POLLOUT : 0; } mutex_exit(&portq->portq_mutex); return (0); diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c index 411c9b8b0b..e392ce4b14 100644 --- a/usr/src/uts/common/fs/proc/prvnops.c +++ b/usr/src/uts/common/fs/proc/prvnops.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -6010,7 +6010,7 @@ prpoll(vnode_t *vp, short events, int anyyet, short *reventsp, } *reventsp = revents; - if (!anyyet && revents == 0) { + if ((!anyyet && revents == 0) || (events & POLLET)) { /* * Arrange to wake up the polling lwp when * the target process/lwp stops or terminates diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c index 0be628f329..e5bc6dc845 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c @@ -953,6 +953,13 @@ so_poll(struct sonode *so, short events, int anyyet, short *reventsp, if (!list_is_empty(&so->so_acceptq_list)) *reventsp |= (POLLIN|POLLRDNORM) & events; + /* + * If we're looking for POLLRDHUP, indicate it if we have sent the + * last rx signal for the socket. 
+ */ + if ((events & POLLRDHUP) && (state & SS_SENTLASTREADSIG)) + *reventsp |= POLLRDHUP; + /* Data */ /* so_downcalls is null for sctp */ if (so->so_downcalls != NULL && so->so_downcalls->sd_poll != NULL) { @@ -988,14 +995,20 @@ so_poll(struct sonode *so, short events, int anyyet, short *reventsp, *reventsp |= POLLHUP; } - if (!*reventsp && !anyyet) { + if ((!*reventsp && !anyyet) || (events & POLLET)) { /* Check for read events again, but this time under lock */ if (events & (POLLIN|POLLRDNORM)) { mutex_enter(&so->so_lock); if (SO_HAVE_DATA(so) || !list_is_empty(&so->so_acceptq_list)) { + if (events & POLLET) { + so->so_pollev |= SO_POLLEV_IN; + *phpp = &so->so_poll_list; + } + mutex_exit(&so->so_lock); *reventsp |= (POLLIN|POLLRDNORM) & events; + return (0); } else { so->so_pollev |= SO_POLLEV_IN; diff --git a/usr/src/uts/common/fs/sockfs/socknotify.c b/usr/src/uts/common/fs/sockfs/socknotify.c index 3d5ba2a7e8..3f858afecc 100644 --- a/usr/src/uts/common/fs/sockfs/socknotify.c +++ b/usr/src/uts/common/fs/sockfs/socknotify.c @@ -377,7 +377,7 @@ i_so_notify_last_rx(struct sonode *so, int *pollev, int *sigev) so->so_state |= SS_SENTLASTREADSIG; so->so_pollev &= ~SO_POLLEV_IN; - *pollev |= POLLIN|POLLRDNORM; + *pollev |= POLLIN|POLLRDNORM|POLLRDHUP; *sigev |= SOCKETSIG_READ; return (1); diff --git a/usr/src/uts/common/io/1394/targets/av1394/av1394_async.c b/usr/src/uts/common/io/1394/targets/av1394/av1394_async.c index 94323582d6..4a2556177e 100644 --- a/usr/src/uts/common/io/1394/targets/av1394/av1394_async.c +++ b/usr/src/uts/common/io/1394/targets/av1394/av1394_async.c @@ -24,7 +24,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
+ */ /* * av1394 asynchronous module @@ -359,9 +361,10 @@ av1394_async_poll(av1394_inst_t *avp, short events, int anyyet, short *reventsp, AV1394_TNF_ENTER(av1394_async_poll); if (events & POLLIN) { - if (av1394_peekq(rq)) { + if (av1394_peekq(rq)) *reventsp |= POLLIN; - } else if (!anyyet) { + + if ((!*reventsp && !anyyet) || (events & POLLET)) { mutex_enter(&ap->a_mutex); ap->a_pollevents |= POLLIN; *phpp = &ap->a_pollhead; @@ -438,8 +441,8 @@ av1394_async_create_minor_node(av1394_inst_t *avp) int ret; ret = ddi_create_minor_node(avp->av_dip, "async", - S_IFCHR, AV1394_ASYNC_INST2MINOR(avp->av_instance), - DDI_NT_AV_ASYNC, NULL); + S_IFCHR, AV1394_ASYNC_INST2MINOR(avp->av_instance), + DDI_NT_AV_ASYNC, NULL); if (ret != DDI_SUCCESS) { TNF_PROBE_0(av1394_async_create_minor_node_error, AV1394_TNF_ASYNC_ERROR, ""); diff --git a/usr/src/uts/common/io/devpoll.c b/usr/src/uts/common/io/devpoll.c index a3fcbbba03..7b3454f89c 100644 --- a/usr/src/uts/common/io/devpoll.c +++ b/usr/src/uts/common/io/devpoll.c @@ -25,6 +25,7 @@ /* * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -45,6 +46,8 @@ #include <sys/devpoll.h> #include <sys/rctl.h> #include <sys/resource.h> +#include <sys/schedctl.h> +#include <sys/epoll.h> #define RESERVED 1 @@ -237,7 +240,8 @@ dpinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) * stale entries! 
*/ static int -dp_pcache_poll(pollfd_t *pfdp, pollcache_t *pcp, nfds_t nfds, int *fdcntp) +dp_pcache_poll(dp_entry_t *dpep, void *dpbuf, + pollcache_t *pcp, nfds_t nfds, int *fdcntp) { int start, ostart, end; int fdcnt, fd; @@ -247,7 +251,10 @@ dp_pcache_poll(pollfd_t *pfdp, pollcache_t *pcp, nfds_t nfds, int *fdcntp) boolean_t no_wrap; pollhead_t *php; polldat_t *pdp; + pollfd_t *pfdp; + epoll_event_t *epoll; int error = 0; + short mask = POLLRDHUP | POLLWRBAND; ASSERT(MUTEX_HELD(&pcp->pc_lock)); if (pcp->pc_bitmap == NULL) { @@ -257,6 +264,14 @@ dp_pcache_poll(pollfd_t *pfdp, pollcache_t *pcp, nfds_t nfds, int *fdcntp) */ return (error); } + + if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + pfdp = NULL; + epoll = (epoll_event_t *)dpbuf; + } else { + pfdp = (pollfd_t *)dpbuf; + epoll = NULL; + } retry: start = ostart = pcp->pc_mapstart; end = pcp->pc_mapend; @@ -316,11 +331,32 @@ repoll: * polling a closed fd. Hope this will remind * user to do a POLLREMOVE. */ - pfdp[fdcnt].fd = fd; - pfdp[fdcnt].revents = POLLNVAL; - fdcnt++; + if (pfdp != NULL) { + pfdp[fdcnt].fd = fd; + pfdp[fdcnt].revents = POLLNVAL; + fdcnt++; + continue; + } + + /* + * In the epoll compatibility case, we actually + * perform the implicit removal to remain + * closer to the epoll semantics. 
+ */ + ASSERT(epoll != NULL); + + pdp->pd_fp = NULL; + pdp->pd_events = 0; + + if (php != NULL) { + pollhead_delete(php, pdp); + pdp->pd_php = NULL; + } + + BT_CLEAR(pcp->pc_bitmap, fd); continue; } + if (fp != pdp->pd_fp) { /* * user is polling on a cached fd which was @@ -376,9 +412,69 @@ repoll: } if (revent != 0) { - pfdp[fdcnt].fd = fd; - pfdp[fdcnt].events = pdp->pd_events; - pfdp[fdcnt].revents = revent; + if (pfdp != NULL) { + pfdp[fdcnt].fd = fd; + pfdp[fdcnt].events = pdp->pd_events; + pfdp[fdcnt].revents = revent; + } else { + epoll_event_t *ep = &epoll[fdcnt]; + + ASSERT(epoll != NULL); + ep->data.u64 = pdp->pd_epolldata; + + /* + * If any of the event bits are set for + * which poll and epoll representations + * differ, swizzle in the native epoll + * values. + */ + if (revent & mask) { + ep->events = (revent & ~mask) | + ((revent & POLLRDHUP) ? + EPOLLRDHUP : 0) | + ((revent & POLLWRBAND) ? + EPOLLWRBAND : 0); + } else { + ep->events = revent; + } + + /* + * We define POLLWRNORM to be POLLOUT, + * but epoll has separate definitions + * for them; if POLLOUT is set and the + * user has asked for EPOLLWRNORM, set + * that as well. + */ + if ((revent & POLLOUT) && + (pdp->pd_events & EPOLLWRNORM)) { + ep->events |= EPOLLWRNORM; + } + } + + /* + * If POLLET is set, clear the bit in the + * bitmap -- which effectively latches the + * edge on a pollwakeup() from the driver. + */ + if (pdp->pd_events & POLLET) + BT_CLEAR(pcp->pc_bitmap, fd); + + /* + * If POLLONESHOT is set, perform the implicit + * POLLREMOVE. + */ + if (pdp->pd_events & POLLONESHOT) { + pdp->pd_fp = NULL; + pdp->pd_events = 0; + + if (php != NULL) { + pollhead_delete(php, pdp); + pdp->pd_php = NULL; + } + + BT_CLEAR(pcp->pc_bitmap, fd); + } + fdcnt++; } else if (php != NULL) { /* @@ -392,7 +488,7 @@ repoll: * in bitmap. 
*/ if ((pdp->pd_php != NULL) && - ((pcp->pc_flag & T_POLLWAKE) == 0)) { + ((pcp->pc_flag & PC_POLLWAKE) == 0)) { BT_CLEAR(pcp->pc_bitmap, fd); } if (pdp->pd_php == NULL) { @@ -473,11 +569,15 @@ dpopen(dev_t *devp, int flag, int otyp, cred_t *credp) /* * allocate a pollcache skeleton here. Delay allocating bitmap * structures until dpwrite() time, since we don't know the - * optimal size yet. + * optimal size yet. We also delay setting the pid until either + * dpwrite() or attempt to poll on the instance, allowing parents + * to create instances of /dev/poll for their children. (In the + * epoll compatibility case, this check isn't performed to maintain + * semantic compatibility.) */ pcp = pcache_alloc(); dpep->dpe_pcache = pcp; - pcp->pc_pid = curproc->p_pid; + pcp->pc_pid = -1; *devp = makedevice(getmajor(*devp), minordev); /* clone the driver */ mutex_enter(&devpoll_lock); ASSERT(minordev < dptblsize); @@ -499,7 +599,9 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) dp_entry_t *dpep; pollcache_t *pcp; pollfd_t *pollfdp, *pfdp; - int error; + dvpoll_epollfd_t *epfdp; + uintptr_t limit; + int error, size; ssize_t uiosize; nfds_t pollfdnum; struct pollhead *php = NULL; @@ -515,11 +617,23 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) ASSERT(dpep != NULL); mutex_exit(&devpoll_lock); pcp = dpep->dpe_pcache; - if (curproc->p_pid != pcp->pc_pid) { - return (EACCES); + + if (!(dpep->dpe_flag & DP_ISEPOLLCOMPAT) && + curproc->p_pid != pcp->pc_pid) { + if (pcp->pc_pid != -1) + return (EACCES); + + pcp->pc_pid = curproc->p_pid; } + + if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + size = sizeof (dvpoll_epollfd_t); + } else { + size = sizeof (pollfd_t); + } + uiosize = uiop->uio_resid; - pollfdnum = uiosize / sizeof (pollfd_t); + pollfdnum = uiosize / size; mutex_enter(&curproc->p_lock); if (pollfdnum > (uint_t)rctl_enforced_value( rctlproc_legacy[RLIMIT_NOFILE], curproc->p_rctls, curproc)) { @@ -534,6 +648,7 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) 
* each polled fd to the cached set. */ pollfdp = kmem_alloc(uiosize, KM_SLEEP); + limit = (uintptr_t)pollfdp + (pollfdnum * size); /* * Although /dev/poll uses the write(2) interface to cache fds, it's @@ -555,9 +670,27 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) mutex_enter(&dpep->dpe_lock); dpep->dpe_writerwait++; while (dpep->dpe_refcnt != 0) { + /* + * We need to do a bit of a dance here: we need to drop + * our dpe_lock and grab the pc_lock to broadcast the pc_cv to + * kick any DP_POLL/DP_PPOLL sleepers. + */ + mutex_exit(&dpep->dpe_lock); + mutex_enter(&pcp->pc_lock); + pcp->pc_flag |= PC_WRITEWANTED; + cv_broadcast(&pcp->pc_cv); + mutex_exit(&pcp->pc_lock); + mutex_enter(&dpep->dpe_lock); + + if (dpep->dpe_refcnt == 0) + break; + if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) { dpep->dpe_writerwait--; mutex_exit(&dpep->dpe_lock); + mutex_enter(&pcp->pc_lock); + pcp->pc_flag &= ~PC_WRITEWANTED; + mutex_exit(&pcp->pc_lock); kmem_free(pollfdp, uiosize); return (set_errno(EINTR)); } @@ -565,24 +698,107 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) dpep->dpe_writerwait--; dpep->dpe_flag |= DP_WRITER_PRESENT; dpep->dpe_refcnt++; + mutex_exit(&dpep->dpe_lock); mutex_enter(&pcp->pc_lock); + pcp->pc_flag &= ~PC_WRITEWANTED; + if (pcp->pc_bitmap == NULL) { pcache_create(pcp, pollfdnum); } - for (pfdp = pollfdp; pfdp < pollfdp + pollfdnum; pfdp++) { + for (pfdp = pollfdp; (uintptr_t)pfdp < limit; + pfdp = (pollfd_t *)((uintptr_t)pfdp + size)) { fd = pfdp->fd; - if ((uint_t)fd >= P_FINFO(curproc)->fi_nfiles) + if ((uint_t)fd >= P_FINFO(curproc)->fi_nfiles) { + /* + * epoll semantics demand that we return EBADF if our + * specified fd is invalid. 
+ */ + if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + error = EBADF; + break; + } + continue; + } + pdp = pcache_lookup_fd(pcp, fd); if (pfdp->events != POLLREMOVE) { + + fp = NULL; + if (pdp == NULL) { + /* + * If we're in epoll compatibility mode, check + * that the fd is valid before allocating + * anything for it; epoll semantics demand that + * we return EBADF if our specified fd is + * invalid. + */ + if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + if ((fp = getf(fd)) == NULL) { + error = EBADF; + break; + } + } + pdp = pcache_alloc_fd(0); pdp->pd_fd = fd; pdp->pd_pcache = pcp; pcache_insert_fd(pcp, pdp, pollfdnum); + } else { + /* + * epoll semantics demand that we error out if + * a file descriptor is added twice, which we + * check (imperfectly) by checking if we both + * have the file descriptor cached and the + * file pointer that corresponds to the file + * descriptor matches our cached value. If + * there is a pointer mismatch, the file + * descriptor was closed without being removed. + * The converse is clearly not true, however, + * so to narrow the window by which a spurious + * EEXIST may be returned, we also check if + * this fp has been added to an epoll control + * descriptor in the past; if it hasn't, we + * know that this is due to fp reuse -- it's + * not a true EEXIST case. (By performing this + * additional check, we limit the window of + * spurious EEXIST to situations where a single + * file descriptor is being used across two or + * more epoll control descriptors -- and even + * then, the file descriptor must be closed and + * reused in a relatively tight time span.) + */ + if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + if (pdp->pd_fp != NULL && + (fp = getf(fd)) != NULL && + fp == pdp->pd_fp && + (fp->f_flag2 & FEPOLLED)) { + error = EEXIST; + releasef(fd); + break; + } + + /* + * We have decided that the cached + * information was stale: it either + * didn't match, or the fp had never + * actually been epoll()'d on before. 
+ * We need to now clear our pd_events + * to assure that we don't mistakenly + * operate on cached event disposition. + */ + pdp->pd_events = 0; + } } + + if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + epfdp = (dvpoll_epollfd_t *)pfdp; + pdp->pd_epolldata = epfdp->dpep_data; + } + ASSERT(pdp->pd_fd == fd); ASSERT(pdp->pd_pcache == pcp); if (fd >= pcp->pc_mapsize) { @@ -593,7 +809,7 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) if (fd > pcp->pc_mapend) { pcp->pc_mapend = fd; } - if ((fp = getf(fd)) == NULL) { + if (fp == NULL && (fp = getf(fd)) == NULL) { /* * The fd is not valid. Since we can't pass * this error back in the write() call, set @@ -604,12 +820,21 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) pdp->pd_events |= pfdp->events; continue; } + + /* + * To (greatly) reduce EEXIST false positives, we + * denote that this fp has been epoll()'d. We do this + * regardless of epoll compatibility mode, as the flag + * is harmless if not in epoll compatibility mode. + */ + fp->f_flag2 |= FEPOLLED; + /* * Don't do VOP_POLL for an already cached fd with * same poll events. */ if ((pdp->pd_events == pfdp->events) && - (pdp->pd_fp != NULL)) { + (pdp->pd_fp == fp)) { /* * the events are already cached */ @@ -665,7 +890,17 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) } releasef(fd); } else { - if (pdp == NULL) { + if (pdp == NULL || pdp->pd_fp == NULL) { + if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + /* + * As with the add case (above), epoll + * semantics demand that we error out + * in this case. 
+ */ + error = ENOENT; + break; + } + continue; } ASSERT(pdp->pd_fd == fd); @@ -690,6 +925,17 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) return (error); } +#define DP_SIGMASK_RESTORE(ksetp) { \ + if (ksetp != NULL) { \ + mutex_enter(&p->p_lock); \ + if (lwp->lwp_cursig == 0) { \ + t->t_hold = lwp->lwp_sigoldmask; \ + t->t_flag &= ~T_TOMASK; \ + } \ + mutex_exit(&p->p_lock); \ + } \ +} + /*ARGSUSED*/ static int dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) @@ -701,7 +947,7 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) int error = 0; STRUCT_DECL(dvpoll, dvpoll); - if (cmd == DP_POLL) { + if (cmd == DP_POLL || cmd == DP_PPOLL) { /* do this now, before we sleep on DP_WRITER_PRESENT */ now = gethrtime(); } @@ -713,10 +959,39 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) mutex_exit(&devpoll_lock); ASSERT(dpep != NULL); pcp = dpep->dpe_pcache; - if (curproc->p_pid != pcp->pc_pid) - return (EACCES); mutex_enter(&dpep->dpe_lock); + + if (cmd == DP_EPOLLCOMPAT) { + if (dpep->dpe_refcnt != 0) { + /* + * We can't turn on epoll compatibility while there + * are outstanding operations. + */ + mutex_exit(&dpep->dpe_lock); + return (EBUSY); + } + + /* + * epoll compatibility is a one-way street: there's no way + * to turn it off for a particular open. 
+ */ + dpep->dpe_flag |= DP_ISEPOLLCOMPAT; + mutex_exit(&dpep->dpe_lock); + + return (0); + } + + if (!(dpep->dpe_flag & DP_ISEPOLLCOMPAT) && + curproc->p_pid != pcp->pc_pid) { + if (pcp->pc_pid != -1) { + mutex_exit(&dpep->dpe_lock); + return (EACCES); + } + + pcp->pc_pid = curproc->p_pid; + } + while ((dpep->dpe_flag & DP_WRITER_PRESENT) || (dpep->dpe_writerwait != 0)) { if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) { @@ -729,15 +1004,43 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) switch (cmd) { case DP_POLL: + case DP_PPOLL: { pollstate_t *ps; nfds_t nfds; int fdcnt = 0; + size_t size, fdsize, dpsize; hrtime_t deadline = 0; + k_sigset_t *ksetp = NULL; + k_sigset_t kset; + sigset_t set; + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + struct proc *p = ttoproc(curthread); STRUCT_INIT(dvpoll, mode); - error = copyin((caddr_t)arg, STRUCT_BUF(dvpoll), - STRUCT_SIZE(dvpoll)); + + /* + * The dp_setp member is only required/consumed for DP_PPOLL, + * which otherwise uses the same structure as DP_POLL. 
+ */ + if (cmd == DP_POLL) { + dpsize = (uintptr_t)STRUCT_FADDR(dvpoll, dp_setp) - + (uintptr_t)STRUCT_FADDR(dvpoll, dp_fds); + } else { + ASSERT(cmd == DP_PPOLL); + dpsize = STRUCT_SIZE(dvpoll); + } + + if ((mode & FKIOCTL) != 0) { + /* Kernel-internal ioctl call */ + bcopy((caddr_t)arg, STRUCT_BUF(dvpoll), dpsize); + error = 0; + } else { + error = copyin((caddr_t)arg, STRUCT_BUF(dvpoll), + dpsize); + } + if (error) { DP_REFRELE(dpep); return (EFAULT); @@ -755,6 +1058,52 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) deadline += now; } + if (cmd == DP_PPOLL) { + void *setp = STRUCT_FGETP(dvpoll, dp_setp); + + if (setp != NULL) { + if (copyin(setp, &set, sizeof (set))) { + DP_REFRELE(dpep); + return (EFAULT); + } + + sigutok(&set, &kset); + ksetp = &kset; + + mutex_enter(&p->p_lock); + schedctl_finish_sigblock(t); + lwp->lwp_sigoldmask = t->t_hold; + t->t_hold = *ksetp; + t->t_flag |= T_TOMASK; + + /* + * Like ppoll() with a non-NULL sigset, we'll + * call cv_reltimedwait_sig() just to check for + * signals. This call will return immediately + * with either 0 (signalled) or -1 (no signal). + * There are some conditions whereby we can + * get 0 from cv_reltimedwait_sig() without + * a true signal (e.g., a directed stop), so + * we restore our signal mask in the unlikely + * event that lwp_cursig is 0. + */ + if (!cv_reltimedwait_sig(&t->t_delay_cv, + &p->p_lock, 0, TR_CLOCK_TICK)) { + if (lwp->lwp_cursig == 0) { + t->t_hold = lwp->lwp_sigoldmask; + t->t_flag &= ~T_TOMASK; + } + + mutex_exit(&p->p_lock); + + DP_REFRELE(dpep); + return (EINTR); + } + + mutex_exit(&p->p_lock); + } + } + if ((nfds = STRUCT_FGET(dvpoll, dp_nfds)) == 0) { /* * We are just using DP_POLL to sleep, so @@ -762,17 +1111,29 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) * Do not check for signals if we have a zero timeout. 
*/ DP_REFRELE(dpep); - if (deadline == 0) + if (deadline == 0) { + DP_SIGMASK_RESTORE(ksetp); return (0); + } + mutex_enter(&curthread->t_delay_lock); while ((error = cv_timedwait_sig_hrtime(&curthread->t_delay_cv, &curthread->t_delay_lock, deadline)) > 0) continue; mutex_exit(&curthread->t_delay_lock); + + DP_SIGMASK_RESTORE(ksetp); + return (error == 0 ? EINTR : 0); } + if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + size = nfds * (fdsize = sizeof (epoll_event_t)); + } else { + size = nfds * (fdsize = sizeof (pollfd_t)); + } + /* * XXX It would be nice not to have to alloc each time, but it * requires another per thread structure hook. This can be @@ -782,37 +1143,45 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) curthread->t_pollstate = pollstate_create(); ps = curthread->t_pollstate; } - if (ps->ps_dpbufsize < nfds) { - struct proc *p = ttoproc(curthread); + + if (ps->ps_dpbufsize < size) { /* - * The maximum size should be no large than - * current maximum open file count. + * If nfds is larger than twice the current maximum + * open file count, we'll silently clamp it. This + * only limits our exposure to allocating an + * inordinate amount of kernel memory; it doesn't + * otherwise affect the semantics. (We have this + * check at twice the maximum instead of merely the + * maximum because some applications pass an nfds that + * is only slightly larger than their limit.) 
*/ mutex_enter(&p->p_lock); - if (nfds > p->p_fno_ctl) { - mutex_exit(&p->p_lock); - DP_REFRELE(dpep); - return (EINVAL); + if ((nfds >> 1) > p->p_fno_ctl) { + nfds = p->p_fno_ctl; + size = nfds * fdsize; } mutex_exit(&p->p_lock); - kmem_free(ps->ps_dpbuf, sizeof (pollfd_t) * - ps->ps_dpbufsize); - ps->ps_dpbuf = kmem_zalloc(sizeof (pollfd_t) * - nfds, KM_SLEEP); - ps->ps_dpbufsize = nfds; + + if (ps->ps_dpbufsize < size) { + kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize); + ps->ps_dpbuf = kmem_zalloc(size, KM_SLEEP); + ps->ps_dpbufsize = size; + } } mutex_enter(&pcp->pc_lock); for (;;) { - pcp->pc_flag = 0; - error = dp_pcache_poll(ps->ps_dpbuf, pcp, nfds, &fdcnt); + pcp->pc_flag &= ~PC_POLLWAKE; + + error = dp_pcache_poll(dpep, ps->ps_dpbuf, + pcp, nfds, &fdcnt); if (fdcnt > 0 || error != 0) break; /* * A pollwake has happened since we polled cache. */ - if (pcp->pc_flag & T_POLLWAKE) + if (pcp->pc_flag & PC_POLLWAKE) continue; /* @@ -822,8 +1191,40 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) /* immediate timeout; do not check signals */ break; } - error = cv_timedwait_sig_hrtime(&pcp->pc_cv, - &pcp->pc_lock, deadline); + + if (!(pcp->pc_flag & PC_WRITEWANTED)) { + error = cv_timedwait_sig_hrtime(&pcp->pc_cv, + &pcp->pc_lock, deadline); + } else { + error = 1; + } + + if (error > 0 && (pcp->pc_flag & PC_WRITEWANTED)) { + /* + * We've been kicked off of our cv because a + * writer wants in. We're going to drop our + * reference count and then wait until the + * writer is gone -- at which point we'll + * reacquire the pc_lock and call into + * dp_pcache_poll() to get the updated state. 
+ */ + mutex_exit(&pcp->pc_lock); + + mutex_enter(&dpep->dpe_lock); + dpep->dpe_refcnt--; + cv_broadcast(&dpep->dpe_cv); + + while ((dpep->dpe_flag & DP_WRITER_PRESENT) || + (dpep->dpe_writerwait != 0)) { + error = cv_wait_sig_swap(&dpep->dpe_cv, + &dpep->dpe_lock); + } + + dpep->dpe_refcnt++; + mutex_exit(&dpep->dpe_lock); + mutex_enter(&pcp->pc_lock); + } + /* * If we were awakened by a signal or timeout * then break the loop, else poll again. @@ -837,9 +1238,11 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) } mutex_exit(&pcp->pc_lock); + DP_SIGMASK_RESTORE(ksetp); + if (error == 0 && fdcnt > 0) { - if (copyout(ps->ps_dpbuf, STRUCT_FGETP(dvpoll, - dp_fds), sizeof (pollfd_t) * fdcnt)) { + if (copyout(ps->ps_dpbuf, + STRUCT_FGETP(dvpoll, dp_fds), fdcnt * fdsize)) { DP_REFRELE(dpep); return (EFAULT); } @@ -901,10 +1304,25 @@ static int dppoll(dev_t dev, short events, int anyyet, short *reventsp, struct pollhead **phpp) { + minor_t minor; + dp_entry_t *dpep; + + minor = getminor(dev); + + mutex_enter(&devpoll_lock); + dpep = devpolltbl[minor]; + ASSERT(dpep != NULL); + mutex_exit(&devpoll_lock); + /* * Polling on a /dev/poll fd is not fully supported yet. */ - *reventsp = POLLERR; + if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + /* no error in epoll compat. mode */ + *reventsp = 0; + } else { + *reventsp = POLLERR; + } return (0); } diff --git a/usr/src/uts/common/io/ksocket/ksocket.c b/usr/src/uts/common/io/ksocket/ksocket.c index 49ca6f0475..8944fcbff3 100644 --- a/usr/src/uts/common/io/ksocket/ksocket.c +++ b/usr/src/uts/common/io/ksocket/ksocket.c @@ -22,6 +22,7 @@ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
*/ #include <sys/file.h> @@ -820,7 +821,7 @@ ksocket_spoll(ksocket_t ks, int timo, short events, short *revents, if (error != 0 || *revents != 0) break; - if (pcp->pc_flag & T_POLLWAKE) + if (pcp->pc_flag & PC_POLLWAKE) continue; if (timo == -1) { diff --git a/usr/src/uts/common/io/usb/usba/usba_ugen.c b/usr/src/uts/common/io/usb/usba/usba_ugen.c index cb20c24270..5852e40799 100644 --- a/usr/src/uts/common/io/usb/usba/usba_ugen.c +++ b/usr/src/uts/common/io/usb/usba/usba_ugen.c @@ -23,6 +23,10 @@ */ /* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + +/* * UGEN: USB Generic Driver support code * * This code provides entry points called by the ugen driver or other @@ -1082,7 +1086,10 @@ usb_ugen_poll(usb_ugen_hdl_t usb_ugen_hdl, dev_t dev, short events, ((epp->ep_state & UGEN_EP_STATE_INTR_IN_POLLING_ON) == 0)) { *reventsp |= POLLIN; - } else if (!anyyet) { + } + + if ((!*reventsp && !anyyet) || + (events & POLLET)) { *phpp = &epp->ep_pollhead; epp->ep_state |= UGEN_EP_STATE_INTR_IN_POLL_PENDING; @@ -1101,7 +1108,10 @@ usb_ugen_poll(usb_ugen_hdl_t usb_ugen_hdl, dev_t dev, short events, ((epp->ep_state & UGEN_EP_STATE_ISOC_IN_POLLING_ON) == 0)) { *reventsp |= POLLIN; - } else if (!anyyet) { + } + + if ((!*reventsp && !anyyet) || + (events & POLLET)) { *phpp = &epp->ep_pollhead; epp->ep_state |= UGEN_EP_STATE_ISOC_IN_POLL_PENDING; @@ -1115,9 +1125,10 @@ usb_ugen_poll(usb_ugen_hdl_t usb_ugen_hdl, dev_t dev, short events, break; case UGEN_MINOR_DEV_STAT_NODE: - if (ugenp->ug_ds.dev_stat & UGEN_DEV_STATUS_CHANGED) { + if (ugenp->ug_ds.dev_stat & UGEN_DEV_STATUS_CHANGED) *reventsp |= POLLIN; - } else if (!anyyet) { + + if ((!*reventsp && !anyyet) || (events & POLLET)) { *phpp = &ugenp->ug_ds.dev_pollhead; ugenp->ug_ds.dev_stat |= UGEN_DEV_STATUS_POLL_PENDING; @@ -1131,9 +1142,10 @@ usb_ugen_poll(usb_ugen_hdl_t usb_ugen_hdl, dev_t dev, short events, break; } } else { - if (ugenp->ug_ds.dev_stat & UGEN_DEV_STATUS_CHANGED) { + if (ugenp->ug_ds.dev_stat & 
UGEN_DEV_STATUS_CHANGED) *reventsp |= POLLHUP|POLLIN; - } else if (!anyyet) { + + if ((!*reventsp && !anyyet) || (events & POLLET)) { *phpp = &ugenp->ug_ds.dev_pollhead; ugenp->ug_ds.dev_stat |= UGEN_DEV_STATUS_POLL_PENDING; diff --git a/usr/src/uts/common/os/fio.c b/usr/src/uts/common/os/fio.c index 6dc0d00011..98ca32332f 100644 --- a/usr/src/uts/common/os/fio.c +++ b/usr/src/uts/common/os/fio.c @@ -1209,7 +1209,8 @@ f_getfl(int fd, int *flagp) error = EBADF; else { vnode_t *vp = fp->f_vnode; - int flag = fp->f_flag | (fp->f_flag2 << 16); + int flag = fp->f_flag | + ((fp->f_flag2 & ~FEPOLLED) << 16); /* * BSD fcntl() FASYNC compatibility. diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c index c6ebe8b110..18a5ded1c6 100644 --- a/usr/src/uts/common/os/streamio.c +++ b/usr/src/uts/common/os/streamio.c @@ -24,6 +24,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -8311,7 +8312,7 @@ chkrd: } *reventsp = (short)retevents; - if (retevents) { + if (retevents && !(events & POLLET)) { if (headlocked) mutex_exit(&stp->sd_lock); return (0); diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index ee396632ad..c2bf2f0483 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -20,7 +20,7 @@ # # # Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. -# Copyright 2013, Joyent, Inc. All rights reserved. +# Copyright 2014, Joyent, Inc. All rights reserved. # Copyright 2013 Garrett D'Amore <garrett@damore.org> # Copyright 2013 Saso Kiselkov. All rights reserved. # Copyright 2015 Nexenta Systems, Inc. All rights reserved. 
@@ -215,6 +215,7 @@ CHKHDRS= \ emul64cmd.h \ emul64var.h \ epm.h \ + epoll.h \ errno.h \ errorq.h \ errorq_impl.h \ diff --git a/usr/src/uts/common/sys/devpoll.h b/usr/src/uts/common/sys/devpoll.h index 36c815c69f..4e4c76d9b0 100644 --- a/usr/src/uts/common/sys/devpoll.h +++ b/usr/src/uts/common/sys/devpoll.h @@ -24,11 +24,13 @@ * All rights reserved. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + #ifndef _SYS_DEVPOLL_H #define _SYS_DEVPOLL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/poll_impl.h> #include <sys/types32.h> @@ -39,8 +41,10 @@ extern "C" { /* /dev/poll ioctl */ #define DPIOC (0xD0 << 8) -#define DP_POLL (DPIOC | 1) /* poll on fds in cached in /dev/poll */ +#define DP_POLL (DPIOC | 1) /* poll on fds cached via /dev/poll */ #define DP_ISPOLLED (DPIOC | 2) /* is this fd cached in /dev/poll */ +#define DP_PPOLL (DPIOC | 3) /* ppoll on fds cached via /dev/poll */ +#define DP_EPOLLCOMPAT (DPIOC | 4) /* turn on epoll compatibility */ #define DEVPOLLSIZE 1000 /* /dev/poll table size increment */ @@ -51,14 +55,21 @@ typedef struct dvpoll { pollfd_t *dp_fds; /* pollfd array */ nfds_t dp_nfds; /* num of pollfd's in dp_fds[] */ int dp_timeout; /* time out in milisec */ + sigset_t *dp_setp; /* sigset, if any */ } dvpoll_t; typedef struct dvpoll32 { caddr32_t dp_fds; /* pollfd array */ uint32_t dp_nfds; /* num of pollfd's in dp_fds[] */ int32_t dp_timeout; /* time out in milisec */ + caddr32_t dp_setp; /* sigset, if any */ } dvpoll32_t; +typedef struct dvpoll_epollfd { + pollfd_t dpep_pollfd; /* must be first member */ + uint64_t dpep_data; /* data payload */ +} dvpoll_epollfd_t; + #ifdef _KERNEL typedef struct dp_entry { @@ -71,6 +82,7 @@ typedef struct dp_entry { } dp_entry_t; #define DP_WRITER_PRESENT 0x1 /* a write is in progress */ +#define DP_ISEPOLLCOMPAT 0x2 /* epoll compatibility mode */ #define DP_REFRELE(dpep) { \ mutex_enter(&(dpep)->dpe_lock); \ diff --git a/usr/src/uts/common/sys/epoll.h 
b/usr/src/uts/common/sys/epoll.h new file mode 100644 index 0000000000..f2e4b90ab7 --- /dev/null +++ b/usr/src/uts/common/sys/epoll.h @@ -0,0 +1,89 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_EPOLL_H +#define _SYS_EPOLL_H + +#include <sys/types.h> +#include <sys/poll.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef union epoll_data { + void *ptr; + int fd; + uint32_t u32; + uint64_t u64; +} epoll_data_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack(4) +#endif + +typedef struct epoll_event { + uint32_t events; /* events */ + epoll_data_t data; /* user-specified data */ +} epoll_event_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif + +/* + * Define the EPOLL* constants in terms of their poll(2)/poll(7) equivalents. + * Note that the values match the equivalents in Linux to allow for any binary + * compatibility layers to not need to translate them. 
+ */ +#define EPOLLIN 0x0001 +#define EPOLLPRI 0x0002 +#define EPOLLOUT 0x0004 +#define EPOLLRDNORM 0x0040 +#define EPOLLRDBAND 0x0080 +#define EPOLLWRNORM 0x0100 +#define EPOLLWRBAND 0x0200 +#define EPOLLMSG 0x0400 /* not used */ +#define EPOLLERR 0x0008 +#define EPOLLHUP 0x0010 +#define EPOLLRDHUP 0x2000 + +#define EPOLLWAKEUP (1UL << 29) /* no meaning; silently ignored */ +#define EPOLLONESHOT (1UL << 30) /* translated to POLLONESHOT */ +#define EPOLLET (1UL << 31) /* translated to POLLET */ + +#define EPOLL_CTL_ADD 1 +#define EPOLL_CTL_DEL 2 +#define EPOLL_CTL_MOD 3 + +#define EPOLL_CLOEXEC 02000000 + +#if !defined(_KERNEL) + +extern int epoll_create(int size); +extern int epoll_create1(int flags); +extern int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event); +extern int epoll_wait(int epfd, struct epoll_event *events, + int maxevents, int timeout); +extern int epoll_pwait(int epfd, struct epoll_event *events, + int maxevents, int timeout, const sigset_t *sigmask); + +#endif /* !_KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_EPOLL_H */ diff --git a/usr/src/uts/common/sys/file.h b/usr/src/uts/common/sys/file.h index 03acc088c2..1f736d3d01 100644 --- a/usr/src/uts/common/sys/file.h +++ b/usr/src/uts/common/sys/file.h @@ -120,6 +120,11 @@ typedef struct fpollinfo { #ifdef _KERNEL /* + * This is a flag that is set on f_flag2, but is never user-visible + */ +#define FEPOLLED 0x8000 + +/* * Fake flags for driver ioctl calls to inform them of the originating * process' model. See <sys/model.h> * diff --git a/usr/src/uts/common/sys/poll.h b/usr/src/uts/common/sys/poll.h index 9fff78a966..efc8457a6a 100644 --- a/usr/src/uts/common/sys/poll.h +++ b/usr/src/uts/common/sys/poll.h @@ -30,6 +30,10 @@ * All rights reserved. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
+ */ + #ifndef _SYS_POLL_H #define _SYS_POLL_H @@ -59,6 +63,7 @@ typedef unsigned long nfds_t; #define POLLWRNORM POLLOUT #define POLLRDBAND 0x0080 /* out-of-band data is readable */ #define POLLWRBAND 0x0100 /* out-of-band data is writeable */ +#define POLLRDHUP 0x4000 /* read-side hangup */ #define POLLNORM POLLRDNORM @@ -70,7 +75,13 @@ typedef unsigned long nfds_t; #define POLLHUP 0x0010 /* fd has been hung up on */ #define POLLNVAL 0x0020 /* invalid pollfd entry */ -#define POLLREMOVE 0x0800 /* remove a cached poll fd from /dev/poll */ +/* + * These events will never be specified in revents, but may be specified in + * events to control /dev/poll behavior. + */ +#define POLLREMOVE 0x0800 /* remove cached /dev/poll fd */ +#define POLLONESHOT 0x1000 /* /dev/poll should one-shot this fd */ +#define POLLET 0x2000 /* edge-triggered /dev/poll fd */ #ifdef _KERNEL diff --git a/usr/src/uts/common/sys/poll_impl.h b/usr/src/uts/common/sys/poll_impl.h index ede99d0df2..2e866ec4d4 100644 --- a/usr/src/uts/common/sys/poll_impl.h +++ b/usr/src/uts/common/sys/poll_impl.h @@ -24,11 +24,13 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
+ */ + #ifndef _SYS_POLL_IMPL_H #define _SYS_POLL_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Caching Poll Subsystem: * @@ -160,6 +162,7 @@ typedef struct polldat { int pd_nsets; /* num of xref sets, used by poll(2) */ xref_t *pd_ref; /* ptr to xref info, 1 for each set */ struct port_kevent *pd_portev; /* associated port event struct */ + uint64_t pd_epolldata; /* epoll data, if any */ } polldat_t; /* @@ -187,7 +190,8 @@ typedef struct pollcache { } pollcache_t; /* pc_flag */ -#define T_POLLWAKE 0x02 /* pollwakeup() occurred */ +#define PC_POLLWAKE 0x02 /* pollwakeup() occurred */ +#define PC_WRITEWANTED 0x04 /* writer wishes to modify the pollcache_t */ #if defined(_KERNEL) /* diff --git a/usr/src/uts/common/syscall/poll.c b/usr/src/uts/common/syscall/poll.c index 7f37529941..c33156a4fc 100644 --- a/usr/src/uts/common/syscall/poll.c +++ b/usr/src/uts/common/syscall/poll.c @@ -29,6 +29,7 @@ /* * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ /* @@ -525,13 +526,13 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) } /* - * If T_POLLWAKE is set, a pollwakeup() was performed on + * If PC_POLLWAKE is set, a pollwakeup() was performed on * one of the file descriptors. This can happen only if * one of the VOP_POLL() functions dropped pcp->pc_lock. * The only current cases of this is in procfs (prpoll()) * and STREAMS (strpoll()). */ - if (pcp->pc_flag & T_POLLWAKE) + if (pcp->pc_flag & PC_POLLWAKE) continue; /* @@ -886,9 +887,9 @@ retry: } /* - * This function is called to inform a thread that - * an event being polled for has occurred. - * The pollstate lock on the thread should be held on entry. + * This function is called to inform a thread (or threads) that an event being + * polled on has occurred. The pollstate lock on the thread should be held + * on entry. 
*/ void pollnotify(pollcache_t *pcp, int fd) @@ -896,8 +897,8 @@ pollnotify(pollcache_t *pcp, int fd) ASSERT(fd < pcp->pc_mapsize); ASSERT(MUTEX_HELD(&pcp->pc_lock)); BT_SET(pcp->pc_bitmap, fd); - pcp->pc_flag |= T_POLLWAKE; - cv_signal(&pcp->pc_cv); + pcp->pc_flag |= PC_POLLWAKE; + cv_broadcast(&pcp->pc_cv); } /* @@ -2024,7 +2025,7 @@ retry: */ if ((pdp->pd_php != NULL) && (pollfdp[entry].events == pdp->pd_events) && - ((pcp->pc_flag & T_POLLWAKE) == 0)) { + ((pcp->pc_flag & PC_POLLWAKE) == 0)) { BT_CLEAR(pcp->pc_bitmap, fd); } /* @@ -2251,7 +2252,7 @@ pollstate_destroy(pollstate_t *ps) pcacheset_destroy(ps->ps_pcacheset, ps->ps_nsets); ps->ps_pcacheset = NULL; if (ps->ps_dpbuf != NULL) { - kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize * sizeof (pollfd_t)); + kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize); ps->ps_dpbuf = NULL; } mutex_destroy(&ps->ps_lock); diff --git a/usr/src/uts/intel/poll/Makefile b/usr/src/uts/intel/poll/Makefile index fe16be6421..b4be5deb63 100644 --- a/usr/src/uts/intel/poll/Makefile +++ b/usr/src/uts/intel/poll/Makefile @@ -52,6 +52,16 @@ include $(UTSBASE)/intel/Makefile.intel CERRWARN += -_gcc=-Wno-uninitialized # +# It's unfortunate that we have to disable this; however, it's lint's fault. We +# have a line which only causes a lint warning on a 64-bit build. If we suppress +# it, then the 32-bit lint build complains about it being unnecessarily +# suppressed. Therefore, the only thing it seems like we can do is disable the +# lint warning completely. +# + +LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN + +# # Define targets # ALL_TARGET = $(BINARY) $(SRC_CONFILE) diff --git a/usr/src/uts/sparc/poll/Makefile b/usr/src/uts/sparc/poll/Makefile index f49278c1c5..c8722105ee 100644 --- a/usr/src/uts/sparc/poll/Makefile +++ b/usr/src/uts/sparc/poll/Makefile @@ -57,6 +57,11 @@ CFLAGS += $(CCVERBOSE) CERRWARN += -_gcc=-Wno-uninitialized # +# See uts/intel/poll/Makefile for why this is necessary. 
+# +LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN + +# # Default build targets. # .KEEP_STATE: diff --git a/usr/src/uts/sun4v/io/vcc.c b/usr/src/uts/sun4v/io/vcc.c index feeaf03e8f..85f722e467 100644 --- a/usr/src/uts/sun4v/io/vcc.c +++ b/usr/src/uts/sun4v/io/vcc.c @@ -24,6 +24,9 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ #include <sys/types.h> #include <sys/file.h> @@ -2456,7 +2459,7 @@ vcc_chpoll(dev_t dev, short events, int anyyet, short *reventsp, *reventsp |= (events & POLLIN); } - if (((*reventsp) == 0) && (!anyyet)) { + if ((((*reventsp) == 0) && (!anyyet)) || (events & POLLET)) { *phpp = &vport->poll; if (events & POLLIN) { mutex_enter(&vport->lock); |