summaryrefslogtreecommitdiff
path: root/usr/src
diff options
context:
space:
mode:
authorBryan Cantrill <bryan@joyent.com>2014-04-23 07:41:10 +0000
committerBryan Cantrill <bryan@joyent.com>2014-04-23 07:44:59 +0000
commit7f98ab2316da18bdec16f363a24fffa33594b4f2 (patch)
tree03b60e0981e86e005101b073f9b0a537bada1e31 /usr/src
parente1bd8a8a019df3353d0acd7f92dd710f887ef90b (diff)
downloadillumos-joyent-7f98ab2316da18bdec16f363a24fffa33594b4f2.tar.gz
OS-2893 add support for epoll
Reviewed by: Robert Mustacchi <rm@joyent.com> Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Diffstat (limited to 'usr/src')
-rw-r--r--usr/src/cmd/truss/codes.c4
-rw-r--r--usr/src/lib/libc/amd64/Makefile3
-rw-r--r--usr/src/lib/libc/i386/Makefile.com4
-rw-r--r--usr/src/lib/libc/port/mapfile-vers8
-rw-r--r--usr/src/lib/libc/port/sys/epoll.c207
-rw-r--r--usr/src/lib/libc/sparc/Makefile.com4
-rw-r--r--usr/src/lib/libc/sparcv9/Makefile.com4
-rw-r--r--usr/src/man/man3c/Makefile9
-rw-r--r--usr/src/man/man3c/epoll_create.3c106
-rw-r--r--usr/src/man/man3c/epoll_ctl.3c316
-rw-r--r--usr/src/man/man3c/epoll_wait.3c115
-rw-r--r--usr/src/man/man5/Makefile2
-rw-r--r--usr/src/man/man5/epoll.5115
-rw-r--r--usr/src/man/man7d/poll.7d9
-rw-r--r--usr/src/man/man9e/chpoll.9e17
-rw-r--r--usr/src/uts/common/fs/fifofs/fifovnops.c13
-rw-r--r--usr/src/uts/common/fs/portfs/port_vnops.c16
-rw-r--r--usr/src/uts/common/fs/proc/prvnops.c4
-rw-r--r--usr/src/uts/common/fs/sockfs/sockcommon_sops.c13
-rw-r--r--usr/src/uts/common/fs/sockfs/socknotify.c2
-rw-r--r--usr/src/uts/common/io/1394/targets/av1394/av1394_async.c9
-rw-r--r--usr/src/uts/common/io/devpoll.c353
-rw-r--r--usr/src/uts/common/io/ksocket/ksocket.c3
-rw-r--r--usr/src/uts/common/io/usb/usba/usba_ugen.c24
-rw-r--r--usr/src/uts/common/os/streamio.c3
-rw-r--r--usr/src/uts/common/sys/Makefile2
-rw-r--r--usr/src/uts/common/sys/devpoll.h18
-rw-r--r--usr/src/uts/common/sys/epoll.h88
-rw-r--r--usr/src/uts/common/sys/poll.h15
-rw-r--r--usr/src/uts/common/sys/poll_impl.h10
-rw-r--r--usr/src/uts/common/syscall/poll.c19
-rw-r--r--usr/src/uts/sun4v/io/vcc.c5
32 files changed, 1421 insertions, 99 deletions
diff --git a/usr/src/cmd/truss/codes.c b/usr/src/cmd/truss/codes.c
index 069268dc05..4e453da0c1 100644
--- a/usr/src/cmd/truss/codes.c
+++ b/usr/src/cmd/truss/codes.c
@@ -23,7 +23,7 @@
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright (c) 2014, OmniTI Computer Consulting, Inc. All rights reserved.
*/
@@ -703,6 +703,8 @@ const struct ioc {
/* /dev/poll ioctl() control codes */
{ (uint_t)DP_POLL, "DP_POLL", NULL },
{ (uint_t)DP_ISPOLLED, "DP_ISPOLLED", NULL },
+ { (uint_t)DP_PPOLL, "DP_PPOLL", NULL },
+ { (uint_t)DP_EPOLLCOMPAT, "DP_EPOLLCOMPAT", NULL },
/* the old /proc ioctl() control codes */
#define PIOC ('q'<<8)
{ (uint_t)(PIOC|1), "PIOCSTATUS", NULL },
diff --git a/usr/src/lib/libc/amd64/Makefile b/usr/src/lib/libc/amd64/Makefile
index 873c2ded87..cec30a4fbd 100644
--- a/usr/src/lib/libc/amd64/Makefile
+++ b/usr/src/lib/libc/amd64/Makefile
@@ -20,7 +20,7 @@
#
#
# Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
-# Copyright (c) 2012, Joyent, Inc. All rights reserved.
+# Copyright (c) 2014, Joyent, Inc. All rights reserved.
#
# Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
# Copyright 2011 Nexenta Systems, Inc. All rights reserved.
@@ -851,6 +851,7 @@ PORTSYS= \
chmod.o \
chown.o \
corectl.o \
+ epoll.o \
exacctsys.o \
execl.o \
execle.o \
diff --git a/usr/src/lib/libc/i386/Makefile.com b/usr/src/lib/libc/i386/Makefile.com
index d91540d0ae..ddbed44735 100644
--- a/usr/src/lib/libc/i386/Makefile.com
+++ b/usr/src/lib/libc/i386/Makefile.com
@@ -20,9 +20,8 @@
#
#
# Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
-# Copyright (c) 2013, Joyent, Inc. All rights reserved.
+# Copyright (c) 2014, Joyent, Inc. All rights reserved.
# Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
-# Copyright (c) 2012, Joyent, Inc. All rights reserved.
#
# Copyright 2011 Nexenta Systems, Inc. All rights reserved.
# Use is subject to license terms.
@@ -890,6 +889,7 @@ PORTSYS= \
chmod.o \
chown.o \
corectl.o \
+ epoll.o \
exacctsys.o \
execl.o \
execle.o \
diff --git a/usr/src/lib/libc/port/mapfile-vers b/usr/src/lib/libc/port/mapfile-vers
index 86adb85d94..c08f234962 100644
--- a/usr/src/lib/libc/port/mapfile-vers
+++ b/usr/src/lib/libc/port/mapfile-vers
@@ -24,9 +24,8 @@
# Copyright 2010 Nexenta Systems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# Copyright (c) 2012, Joyent, Inc. All rights reserved.
+# Copyright (c) 2014, Joyent, Inc. All rights reserved.
# Copyright (c) 2012 by Delphix. All rights reserved.
-# Copyright (c) 2012, Joyent, Inc. All rights reserved.
# Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
# Copyright (c) 2013 Gary Mills
#
@@ -2721,6 +2720,11 @@ $endif
_dgettext;
_doprnt;
_doscan;
+ epoll_create;
+ epoll_create1;
+ epoll_ctl;
+ epoll_wait;
+ epoll_pwait;
_errfp;
_errxfp;
exportfs;
diff --git a/usr/src/lib/libc/port/sys/epoll.c b/usr/src/lib/libc/port/sys/epoll.c
new file mode 100644
index 0000000000..d90b625293
--- /dev/null
+++ b/usr/src/lib/libc/port/sys/epoll.c
@@ -0,0 +1,207 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/epoll.h>
+#include <sys/devpoll.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+
+/*
+ * Events that match their epoll(7) equivalents.
+ */
+#if EPOLLIN != POLLIN
+#error value of EPOLLIN does not match value of POLLIN
+#endif
+
+#if EPOLLPRI != POLLPRI
+#error value of EPOLLPRI does not match value of POLLPRI
+#endif
+
+#if EPOLLOUT != POLLOUT
+#error value of EPOLLOUT does not match value of POLLOUT
+#endif
+
+#if EPOLLRDNORM != POLLRDNORM
+#error value of EPOLLRDNORM does not match value of POLLRDNORM
+#endif
+
+#if EPOLLRDBAND != POLLRDBAND
+#error value of EPOLLRDBAND does not match value of POLLRDBAND
+#endif
+
+#if EPOLLERR != POLLERR
+#error value of EPOLLERR does not match value of POLLERR
+#endif
+
+#if EPOLLHUP != POLLHUP
+#error value of EPOLLHUP does not match value of POLLHUP
+#endif
+
+/*
+ * Events that we ignore entirely. They can be set in events, but they will
+ * never be returned.
+ */
+#define EPOLLIGNORED (EPOLLMSG | EPOLLWAKEUP)
+
+/*
+ * Events that we swizzle into other bit positions.
+ */
+#define EPOLLSWIZZLED \
+ (EPOLLRDHUP | EPOLLONESHOT | EPOLLET | EPOLLWRBAND | EPOLLWRNORM)
+
+int
+epoll_create(int size)
+{
+ int fd;
+
+ /*
+ * From the epoll_create() man page: "Since Linux 2.6.8, the size
+ * argument is ignored, but must be greater than zero." You keep using
+ * that word "ignored"...
+ */
+ if (size <= 0) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ if ((fd = open("/dev/poll", O_RDWR)) == -1)
+ return (-1);
+
+ if (ioctl(fd, DP_EPOLLCOMPAT, 0) == -1) {
+ (void) close(fd);
+ return (-1);
+ }
+
+ return (fd);
+}
+
+int
+epoll_create1(int flags)
+{
+ int fd, oflags = O_RDWR;
+
+ if (flags & EPOLL_CLOEXEC)
+ oflags |= O_CLOEXEC;
+
+ if ((fd = open("/dev/poll", oflags)) == -1)
+ return (-1);
+
+ if (ioctl(fd, DP_EPOLLCOMPAT, 0) == -1) {
+ (void) close(fd);
+ return (-1);
+ }
+
+ return (fd);
+}
+
+int
+epoll_ctl(int epfd, int op, int fd, struct epoll_event *event)
+{
+ dvpoll_epollfd_t epoll[2];
+ uint32_t events = event->events;
+ uint32_t ev;
+ int i = 0;
+
+ epoll[i].dpep_pollfd.fd = fd;
+
+ switch (op) {
+ case EPOLL_CTL_DEL:
+ ev = POLLREMOVE;
+ break;
+
+ case EPOLL_CTL_MOD:
+ /*
+ * In the modify case, we pass down two events: one to
+ * remove the event and another to add it back.
+ */
+ epoll[i++].dpep_pollfd.events = POLLREMOVE;
+ epoll[i].dpep_pollfd.fd = fd;
+ /* FALLTHROUGH */
+
+ case EPOLL_CTL_ADD:
+ /*
+ * Mask off the events that we ignore, and then swizzle the
+ * events for which our values differ from their epoll(7)
+ * equivalents.
+ */
+ ev = events & ~(EPOLLIGNORED | EPOLLSWIZZLED);
+
+ if (events & EPOLLRDHUP)
+ ev |= POLLRDHUP;
+
+ if (events & EPOLLET)
+ ev |= POLLET;
+
+ if (events & EPOLLONESHOT)
+ ev |= POLLONESHOT;
+
+ if (events & EPOLLWRNORM)
+ ev |= POLLWRNORM;
+
+ if (events & EPOLLWRBAND)
+ ev |= POLLWRBAND;
+
+ epoll[i].dpep_data = event->data.u64;
+ break;
+
+ default:
+ errno = EOPNOTSUPP;
+ return (-1);
+ }
+
+ epoll[i].dpep_pollfd.events = ev;
+
+ return (write(epfd, epoll, sizeof (epoll[0]) * (i + 1)) == -1 ? -1 : 0);
+}
+
+int
+epoll_wait(int epfd, struct epoll_event *events,
+ int maxevents, int timeout)
+{
+ struct dvpoll arg;
+
+ if (maxevents <= 0) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ arg.dp_nfds = maxevents;
+ arg.dp_timeout = timeout;
+ arg.dp_fds = (pollfd_t *)events;
+
+ return (ioctl(epfd, DP_POLL, &arg));
+}
+
+int
+epoll_pwait(int epfd, struct epoll_event *events,
+ int maxevents, int timeout, const sigset_t *sigmask)
+{
+ struct dvpoll arg;
+
+ if (maxevents <= 0) {
+ errno = EINVAL;
+ return (-1);
+ }
+
+ arg.dp_nfds = maxevents;
+ arg.dp_timeout = timeout;
+ arg.dp_fds = (pollfd_t *)events;
+ arg.dp_setp = (sigset_t *)sigmask;
+
+ return (ioctl(epfd, DP_PPOLL, &arg));
+}
diff --git a/usr/src/lib/libc/sparc/Makefile.com b/usr/src/lib/libc/sparc/Makefile.com
index c8779479d3..34d35699d0 100644
--- a/usr/src/lib/libc/sparc/Makefile.com
+++ b/usr/src/lib/libc/sparc/Makefile.com
@@ -20,9 +20,8 @@
#
#
# Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
-# Copyright (c) 2012, Joyent, Inc. All rights reserved.
# Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
-# Copyright (c) 2012, Joyent, Inc. All rights reserved.
+# Copyright (c) 2014, Joyent, Inc. All rights reserved.
#
# Copyright 2011 Nexenta Systems, Inc. All rights reserved.
# Use is subject to license terms.
@@ -925,6 +924,7 @@ PORTSYS= \
chmod.o \
chown.o \
corectl.o \
+ epoll.o \
exacctsys.o \
execl.o \
execle.o \
diff --git a/usr/src/lib/libc/sparcv9/Makefile.com b/usr/src/lib/libc/sparcv9/Makefile.com
index 7f4a6e4e38..65ac2dedc7 100644
--- a/usr/src/lib/libc/sparcv9/Makefile.com
+++ b/usr/src/lib/libc/sparcv9/Makefile.com
@@ -20,9 +20,8 @@
#
#
# Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
-# Copyright (c) 2012, Joyent, Inc. All rights reserved.
# Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
-# Copyright (c) 2012, Joyent, Inc. All rights reserved.
+# Copyright (c) 2014, Joyent, Inc. All rights reserved.
#
# Copyright 2011 Nexenta Systems, Inc. All rights reserved.
# Use is subject to license terms.
@@ -871,6 +870,7 @@ PORTSYS= \
chmod.o \
chown.o \
corectl.o \
+ epoll.o \
exacctsys.o \
execl.o \
execle.o \
diff --git a/usr/src/man/man3c/Makefile b/usr/src/man/man3c/Makefile
index f3b6f18a7f..3056f2c923 100644
--- a/usr/src/man/man3c/Makefile
+++ b/usr/src/man/man3c/Makefile
@@ -13,6 +13,7 @@
# Copyright 2011, Richard Lowe
# Copyright 2013 Nexenta Systems, Inc. All rights reserved.
# Copyright 2013, OmniTI Computer Consulting, Inc. All rights reserved.
+# Copyright (c) 2014, Joyent, Inc. All rights reserved.
#
include $(SRC)/Makefile.master
@@ -106,6 +107,9 @@ MANFILES= __fbufsize.3c \
enable_extended_FILE_stdio.3c \
encrypt.3c \
end.3c \
+ epoll_create.3c \
+ epoll_ctl.3c \
+ epoll_wait.3c \
err.3c \
euclen.3c \
exit.3c \
@@ -706,6 +710,8 @@ MANLINKS= FD_CLR.3c \
endusershell.3c \
endutent.3c \
endutxent.3c \
+ epoll_create1.3c \
+ epoll_pwait.3c \
erand48.3c \
errno.3c \
errx.3c \
@@ -1435,6 +1441,9 @@ _etext.3c := LINKSRC = end.3c
edata.3c := LINKSRC = end.3c
etext.3c := LINKSRC = end.3c
+epoll_create1.3c := LINKSRC = epoll_create.3c
+epoll_pwait.3c := LINKSRC = epoll_wait.3c
+
errx.3c := LINKSRC = err.3c
verr.3c := LINKSRC = err.3c
verrx.3c := LINKSRC = err.3c
diff --git a/usr/src/man/man3c/epoll_create.3c b/usr/src/man/man3c/epoll_create.3c
new file mode 100644
index 0000000000..87d0a85c38
--- /dev/null
+++ b/usr/src/man/man3c/epoll_create.3c
@@ -0,0 +1,106 @@
+'\" te
+.\" Copyright (c) 2014, Joyent, Inc. All Rights Reserved.
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source. A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.TH EPOLL_CREATE 3C "Apr 17, 2014"
+.SH NAME
+epoll_create, epoll_create1 \- create an epoll instance
+.SH SYNOPSIS
+
+.LP
+.nf
+#include <sys/epoll.h>
+
+\fBint\fR \fBepoll_create\fR(\fBint\fR \fIsize\fR);
+.fi
+
+.LP
+.nf
+\fBint\fR \fBepoll_create1\fR(\fBint\fR \fIflags\fR);
+.fi
+
+.SH DESCRIPTION
+.sp
+.LP
+The \fBepoll_create()\fR and \fBepoll_create1()\fR functions both create an
+\fBepoll\fR(5) instance that can be operated upon via \fBepoll_ctl\fR(3C),
+\fBepoll_wait\fR(3C) and \fBepoll_pwait\fR(3C). \fBepoll\fR instances are
+represented as file descriptors, and should be closed via \fBclose\fR(2).
+
+The only difference between the two functions is their signature;
+\fBepoll_create()\fR takes a size argument that
+is vestigal and is only meaningful in as much as it must be greater than
+zero, while \fBepoll_create1()\fR takes a flags argument that can have
+any of the following values:
+
+.sp
+.ne 2
+.na
+\fBEPOLL_CLOEXEC\fR
+.ad
+.RS 12n
+Instance should be closed upon an
+\fBexec\fR(2); see \fBopen\fR(2)'s description of \fBO_CLOEXEC\fR.
+.RE
+
+.SH RETURN VALUES
+.sp
+.LP
+Upon succesful completion, 0 is returned. Otherwise, -1 is returned and errno
+is set to indicate the error.
+.SH ERRORS
+.sp
+.LP
+The \fBepoll_create()\fR and \fBepoll_create1()\fR functions will fail if:
+.sp
+.ne 2
+.na
+\fB\fBEINVAL\fR\fR
+.ad
+.RS 10n
+Either the \fIsize\fR is zero (\fBepoll_create()\fR) or the \fIflags\fR
+are invalid (\fBepoll_create1()\fR).
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBEMFILE\fR\fR
+.ad
+.RS 10n
+There are currently {\fBOPEN_MAX\fR} file descriptors open in the calling
+process.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBENFILE\fR\fR
+.ad
+.RS 10n
+The maximum allowable number of files is currently open in the system.
+.RE
+
+.sp
+.LP
+.SH NOTES
+.sp
+.LP
+
+The \fBepoll\fR(5) facility is implemented for purposes of offering
+compatibility for Linux-borne applications; native
+applications should continue to prefer using event ports via the
+\fBport_create\fR(3C), \fBport_associate\fR(3C) and \fBport_get\fR(3C)
+interfaces. See \fBepoll\fR(5) for compatibility details and restrictions.
+.RE
+
+.SH SEE ALSO
+.sp
+.LP
+\fBepoll_ctl\fR(3C), \fBepoll_wait\fR(3C), \fBepoll\fR(5)
diff --git a/usr/src/man/man3c/epoll_ctl.3c b/usr/src/man/man3c/epoll_ctl.3c
new file mode 100644
index 0000000000..0cce6affa4
--- /dev/null
+++ b/usr/src/man/man3c/epoll_ctl.3c
@@ -0,0 +1,316 @@
+'\" te
+.\" Copyright (c) 2014, Joyent, Inc. All Rights Reserved.
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source. A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.TH EPOLL_CTL 3C "Apr 17, 2014"
+.SH NAME
+epoll_ctl \- control an epoll instance
+.SH SYNOPSIS
+
+.LP
+.nf
+#include <sys/epoll.h>
+
+\fBint\fR \fBepoll_ctl\fR(\fBint\fR \fIepfd\fR, \fBint\fR \fIop\fR, \fBint\fR \fIfd\fR, \fBstruct epoll_event *\fR\fIevent\fR);
+.fi
+
+.SH DESCRIPTION
+.sp
+.LP
+The \fBepoll_ctl()\fR function executes the operation specified by
+\fIop\fR (as parameterized by \fIevent\fR) on the \fIepfd\fR epoll instance.
+Valid values for \fIop\fR:
+
+.sp
+.ne 2
+.na
+\fBEPOLL_CTL_ADD\fR
+.ad
+.RS 12n
+For the \fBepoll\fR(5) instance specified by \fIepfd\fR,
+associate the file descriptor specified by \fIfd\fR with the event specified
+by \fIevent\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBEPOLL_CTL_DEL\fR
+.ad
+.RS 12n
+For the \fBepoll\fR(5) instance specified by \fIepfd\fR,
+remove all event associations for the file descriptor specified by \fIfd\fR.
+\fIevent\fR is ignored, and may be NULL.
+.RE
+
+.sp
+.ne 2
+.na
+\fBEPOLL_CTL_MOD\fR
+.ad
+.RS 12n
+For the \fBepoll\fR(5) instance specified by \fIepfd\fR, modify the event
+association for the file descriptor specified by \fIfd\fR to be that
+specified by \fIevent\fR.
+
+.RE
+
+The \fIevent\fR parameter has the following structure:
+
+.in +4
+.nf
+typedef union epoll_data {
+ void *ptr;
+ int fd;
+ uint32_t u32;
+ uint64_t u64;
+} epoll_data_t;
+
+struct epoll_event {
+ uint32_t events;
+ epoll_data_t data;
+};
+.fi
+.in -4
+
+The \fIdata\fR field specifies the datum to
+be associated with the event and
+will be returned via \fBepoll_wait\fR(3C).
+The \fIevents\fR field denotes both the desired events (when specified via
+\fBepoll_ctl()\fR) and the events that have occurred (when returned via
+\fBepoll_wait\fR(3C)).
+In either case, the
+\fIevents\fR field is a bitmask constructed by a logical \fBOR\fR operation
+of any combination of the following event flags:
+
+.sp
+.ne 2
+.na
+\fBEPOLLIN\fR
+.as
+.RS 14n
+Data other than high priority data may be read without blocking. For streams,
+this flag is set in the returned \fIevents\fR even if the message is of
+zero length.
+.RE
+
+.sp
+.ne 2
+.na
+\fBEPOLLPRI\fR
+.as
+.RS 14n
+Normal data (priority band equals 0) may be read without blocking. For streams,
+this flag is set in the returned \fIevents\fR even if the message is of zero
+length.
+.RE
+
+.sp
+.ne 2
+.na
+\fBEPOLLOUT\fR
+.as
+.RS 14n
+Normal data (priority band equals 0) may be written without blocking.
+.RE
+
+.sp
+.ne 2
+.na
+\fBEPOLLRDNORM\fR
+.as
+.RS 14n
+Normal data (priority band equals 0) may be read without blocking. For streams,
+this flag is set in the returned \fIrevents\fR even if the message is of
+zero length.
+.RE
+
+.sp
+.ne 2
+.na
+\fBEPOLLRDBAND\fR
+.as
+.RS 14n
+Data from a non-zero priority band may be read without blocking. For streams,
+this flag is set in the returned \fIrevents\fR even if the message is of
+zero length.
+.RE
+
+.sp
+.ne 2
+.na
+\fBEPOLLWRNORM\fR
+.as
+.RS 14n
+The same as \fBEPOLLOUT\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBEPOLLWRBAND\fR
+.as
+.RS 14n
+Priority data (priority band > 0) may be written. This event only examines
+bands that have been written to at least once.
+.RE
+
+.sp
+.ne 2
+.na
+\fBEPOLLMSG\fR
+.as
+.RS 14n
+This exists only for backwards binary and source compatibility with Linux;
+it has no meaning and is ignored.
+.RE
+
+.sp
+.ne 2
+.na
+\fBEPOLLERR\fR
+.as
+.RS 14n
+An error has occurred on the device or stream. This flag is only valid in the
+returned \fIevents\fR field.
+.RE
+
+.sp
+.ne 2
+.na
+\fBEPOLLHUP\fR
+.as
+.RS 14n
+A hangup has occurred on the stream. This event and \fBEPOLLOUT\fR are mutually
+exclusive; a stream can never be writable if a hangup has occurred. However,
+this event and \fBEPOLLIN\fR, \fBEPOLLRDNORM\fR, \fBEPOLLRDBAND\fR,
+\fBEPOLLRDHUP\fR or
+\fBEPOLLPRI\fR are not mutually exclusive. This flag is only valid in the
+the \fIevents\fR field returned from \fBepoll_wait\fR(3C); it is not used
+in the \fIevents\fR field specified via \fBepoll_ctl()\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBEPOLLRDHUP\fR
+.as
+.RS 14n
+The stream socket peer shutdown the writing half of the connection and no
+further data will be readable via the socket. This event is not mutually
+exclusive with \fBEPOLLIN\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBEPOLLWAKEUP\fR
+.as
+.RS 14n
+This exists only for backwards binary and source compatibility with Linux;
+it has no meaning and is ignored.
+.RE
+
+.sp
+.ne 2
+.na
+\fBEPOLLONESHOT\fR
+.as
+.RS 14n
+Sets the specified event to be in one-shot mode, whereby the event association
+with the \fBepoll\fR(5) instance specified by \fIepfd\fR is removed atomically
+as the event is returned via \fBepoll_wait\fR(3C). Use of this mode allows
+for resolution of some of the
+races inherent in multithreaded use of \fBepoll_wait\fR(3C).
+.RE
+
+.sp
+.ne 2
+.na
+\fBEPOLLET\fR
+.as
+.RS 14n
+Sets the specified event to be edge-triggered mode instead of the default
+mode of level-triggered. In this mode, events will be induced by
+transitions on an event source rather than the state of the event source.
+While perhaps superficially appealing, this mode introduces several new
+potential failure modes for user-level software and should be used
+with caution.
+.RE
+
+.SH RETURN VALUES
+.sp
+.LP
+Upon succesful completion, \fBepoll_ctl()\fR returns 0.
+If an error occurs, -1 is returned and errno is set to indicate
+the error.
+
+.SH ERRORS
+.sp
+.LP
+\fBepoll_ctl()\fR will fail if:
+.sp
+.ne 2
+.na
+\fB\fBEBADF\fR\fR
+.ad
+.RS 10n
+\fIepfd\fR is not a valid file descriptor.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBEFAULT\fR\fR
+.ad
+.RS 10n
+The memory associated with \fIevent\fR was not mapped.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBEEXIST\fR\fR
+.ad
+.RS 10n
+The operation specified was \fBEPOLL_CTL_ADD\fR and the specified file
+descriptor is already associated with an event for the specified
+\fBepoll\fR(5) instance.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBENOENT\fR\fR
+.ad
+.RS 10n
+The operation specified was \fBEPOLL_CTL_MOD\fR or \fBEPOLL_CTL_DEL\fR and
+the specified file descriptor is not associated with an event for the
+specified \fBepoll\fR(5) instance.
+.RE
+
+.sp
+.LP
+.SH NOTES
+.sp
+.LP
+
+The \fBepoll\fR(5) facility is implemented for purposes of offering
+compatibility for Linux-borne applications; native
+applications should continue to prefer using event ports via the
+\fBport_create\fR(3C), \fBport_associate\fR(3C) and \fBport_get\fR(3C)
+interfaces. See \fBepoll\fR(5) for compatibility details and restrictions.
+.RE
+
+.SH SEE ALSO
+.sp
+.LP
+\fBepoll_create\fR(3C), \fBepoll_wait\fR(3C),
+\fBport_create\fR(3C), \fBport_associate\fR(3C), \fBport_get\fR(3C),
+\fBepoll\fR(5)
diff --git a/usr/src/man/man3c/epoll_wait.3c b/usr/src/man/man3c/epoll_wait.3c
new file mode 100644
index 0000000000..6405c30b5d
--- /dev/null
+++ b/usr/src/man/man3c/epoll_wait.3c
@@ -0,0 +1,115 @@
+'\" te
+.\" Copyright (c) 2014, Joyent, Inc. All Rights Reserved.
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source. A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.TH EPOLL_WAIT 3C "Apr 17, 2014"
+.SH NAME
+epoll_wait, epoll_pwait \- wait for epoll events
+.SH SYNOPSIS
+
+.LP
+.nf
+#include <sys/epoll.h>
+
+\fBint\fR \fBepoll_wait\fR(\fBint\fR \fIepfd\fR, \fBstruct epoll_event *\fR\fIevents\fR,
+ \fBint\fR \fImaxevents\fR, \fBint\fR \fItimeout\fR);
+.fi
+
+.LP
+.nf
+\fBint\fR \fBepoll_pwait\fR(\fBint\fR \fIepfd\fR, \fBstruct epoll_event *\fR\fIevents\fR,
+ \fBint\fR \fImaxevents\fR, \fBint\fR \fItimeout\fR,
+ \fBconst sigset_t *\fR\fIsigmask\fR);
+.fi
+
+.SH DESCRIPTION
+.sp
+.LP
+The \fBepoll_wait()\fR function waits for events on the \fBepoll\fR(5)
+instance specified by \fIepfd\fR. The \fIevents\fR parameter must point to
+an array of \fImaxevents\fR \fIepoll_event\fR structures to be
+filled in with pending events. The \fItimeout\fR argument specifies the
+number of milliseconds to wait for an event if none is pending. A
+\fItimeout\fR of -1 denotes an infinite timeout.
+
+The \fBepoll_pwait()\fR is similar to \fBepoll_wait()\fR, but takes an
+additional \fIsigmask\fR argument that specifies the desired signal mask
+when \fBepoll_pwait()\fR is blocked. It is equivalent to atomically
+setting the signal mask, calling \fBepoll_wait()\fR, and restoring the
+signal mask upon return, and is therefore similar to the relationship
+between \fBselect\fR(3C) and \fBpselect\fR(3C).
+
+.SH RETURN VALUES
+.sp
+.LP
+Upon successful completion, \fBepoll_wait()\fR and \fBepoll_pwait()\fR return
+the number of events, or 0 if none was pending and \fItimeout\fR milliseconds
+elapsed. If an error occurs, -1 is returned and errno is set to indicate
+the error.
+
+.SH ERRORS
+.sp
+.LP
+The \fBepoll_wait()\fR and \fBepoll_pwait()\fR functions will fail if:
+.sp
+.ne 2
+.na
+\fB\fBEBADF\fR\fR
+.ad
+.RS 10n
+\fIepfd\fR is not a valid file descriptor.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBEFAULT\fR\fR
+.ad
+.RS 10n
+The memory associated with \fIevents\fR was not mapped or was not writable.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBEINTR\fR\fR
+.ad
+.RS 10n
+A signal was received during the \fBepoll_wait()\fR or \fBepoll_pwait()\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBEINVAL\fR\fR
+.ad
+.RS 10n
+Either \fIepfd\fR is not a valid \fBepoll\fR(5) instance or \fImaxevents\fR
+is not greater than zero.
+.RE
+
+.sp
+.LP
+.SH NOTES
+.sp
+.LP
+
+The \fBepoll\fR(5) facility is implemented for purposes of offering
+compatibility for Linux-borne applications; native
+applications should continue to prefer using event ports via the
+\fBport_create\fR(3C), \fBport_associate\fR(3C) and \fBport_get\fR(3C)
+interfaces. See \fBepoll\fR(5) for compatibility details and restrictions.
+.RE
+
+.SH SEE ALSO
+.sp
+.LP
+\fBepoll_create\fR(3C), \fBepoll_ctl\fR(3C),
+\fBport_create\fR(3C), \fBport_associate\fR(3C), \fBport_get\fR(3C),
+\fBpselect\fR(3C), \fBepoll\fR(5)
diff --git a/usr/src/man/man5/Makefile b/usr/src/man/man5/Makefile
index eff831c3d4..8e805da008 100644
--- a/usr/src/man/man5/Makefile
+++ b/usr/src/man/man5/Makefile
@@ -13,6 +13,7 @@
# Copyright 2011, Richard Lowe
# Copyright (c) 2012 by Delphix. All rights reserved.
# Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+# Copyright (c) 2014, Joyent, Inc. All rights reserved.
#
include $(SRC)/Makefile.master
@@ -41,6 +42,7 @@ MANFILES= Intro.5 \
dhcp.5 \
dhcp_modules.5 \
environ.5 \
+ epoll.5 \
eqnchar.5 \
extendedFILE.5 \
filesystem.5 \
diff --git a/usr/src/man/man5/epoll.5 b/usr/src/man/man5/epoll.5
new file mode 100644
index 0000000000..bba13e15d2
--- /dev/null
+++ b/usr/src/man/man5/epoll.5
@@ -0,0 +1,115 @@
+'\" te
+.\" Copyright (c) 2014, Joyent, Inc. All Rights Reserved.
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source. A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.TH EPOLL 5 "Apr 17, 2014"
+.SH NAME
+epoll \- Linux-compatible I/O event notification facility
+.SH SYNOPSIS
+
+.LP
+.nf
+#include <sys/epoll.h>
+.fi
+
+.SH DESCRIPTION
+.sp
+.LP
+
+\fBepoll\fR is a facility for efficient event-oriented I/O that has a
+similar model to \fBpoll\fR(2), but does not necessitate rescanning a
+set of file descriptors to wait for an event. \fBepoll\fR is of Linux
+origins, and this facility is designed to be binary-compatible with
+the Linux facility, including the following interfaces:
+
+.RS +4
+.TP
+.ie t \(bu
+.el o
+\fBepoll_create\fR(3C) creates an \fBepoll\fR instance, returning a file
+descriptor. It contains a size arugment which is meaningful only in as
+much as it cannot be 0.
+.RE
+.RS +4
+.TP
+.ie t \(bu
+.el o
+\fBepoll_create1\fR(3C) also creates an \fBepoll\fR instance, but eliminates
+the meaningless size argument -- replacing it instead with a flags
+argument.
+.RE
+.RS +4
+.TP
+.ie t \(bu
+.el o
+\fBepoll_ctl\fR(3C) allows file descriptors to be added
+(via \fBEPOLL_CTL_ADD\fR), deleted (via \fBEPOLL_CTL_DEL\fR) or
+modified (via \fBEPOLL_CTL_MOD\fR) with respect to the \fBepoll\fR'd set
+of file descriptors.
+.RE
+.RS +4
+.TP
+.ie t \(bu
+.el o
+\fBepoll_wait\fR(3C) fetches pending events for file descriptors added
+via \fBepoll_ctl\fR(3C), blocking the caller if no such events are pending.
+.RE
+.RS +4
+.TP
+.ie t \(bu
+.el o
+\fBepoll_pwait\fR(3C) opeates in a similar manner to \fBepoll_wait\fR(3C), but
+allows the caller to specify a signal mask to be set atomically with respect
+to waiting for events.
+.RE
+
+.sp
+.LP
+.SH NOTES
+.sp
+.LP
+
+The \fBepoll\fR facility is implemented
+for purposes of offering compatibility to and portability of Linux-borne
+applications; native applications should continue to prefer using event ports
+via the \fBport_create\fR(3C),
+\fBport_associate\fR(3C) and \fBport_getn\fR(3C) interfaces.
+In particular, use of \fBepoll\fR in a multithreaded environment is fraught
+with peril; even when using \fBEPOLLONESHOT\fR for one-shot events,
+there are race conditions with respect to \fBclose\fR(2) that are unresolvable.
+(For more details, see the aborted effort in Linux to resolve this via the
+proposed
+\fBEPOLL_CTL_DISABLE\fR operation.)
+The event port facility -- like the BSD kqueue facility that inspired it --
+is designed to deal with such issues via explicit event source dissociation.
+
+While a best effort has been made to mimic the Linux semantics, there
+are some semantics that are too peculiar or ill-conceived to merit
+accommodation. In particular, the Linux \fBepoll\fR facility will -- by
+design -- continue to generate events for closed file descriptors where/when
+the underlying file description remains open. For example, if one were
+to \fBfork\fR(2) and subsequently close an actively \fBepoll\fR'd file
+descriptor in the parent,
+any events generated in the child on the implicitly duplicated file descriptor
+will continue to be delivered to the parent -- despite the fact that the
+parent itself no longer has any notion of the file description!
+This \fBepoll\fR facility refuses to honor
+these semantics; closing the \fBEPOLL_CTL_ADD\fR'd file descriptor
+will always result in no further
+events being generated for that event description.
+
+.RE
+.SH SEE ALSO
+.sp
+.LP
+\fBepoll_create\fR(3C), \fBepoll_create1\fR(3C), \fBepoll_ctl\fR(3C),
+\fBepoll_wait\fR(3C), \fBepoll_pwait\fR(3C),
+\fBport_create\fR(3C), \fBport_associate\fR(3C), \fBport_dissociate\fR(3C),
+\fBport_get\fR(3C),
+\fBpselect\fR(3C)
diff --git a/usr/src/man/man7d/poll.7d b/usr/src/man/man7d/poll.7d
index cd3db77de9..7a3292eb97 100644
--- a/usr/src/man/man7d/poll.7d
+++ b/usr/src/man/man7d/poll.7d
@@ -73,15 +73,6 @@ Pointer to \fBpollfd\fR structure.
.SH DESCRIPTION
.LP
-Note -
-.sp
-.RS 2
-The \fB/dev/poll\fR device, associated driver and corresponding manpages may be
-removed in a future Solaris release. For similar functionality in the event
-ports framework, see \fBport_create\fR(3C).
-.RE
-.sp
-.LP
The \fB/dev/poll\fR driver is a special driver that enables you to monitor
multiple sets of polled file descriptors. By using the \fB/dev/poll\fR
driver, you can efficiently poll large numbers of file descriptors. Access to
diff --git a/usr/src/man/man9e/chpoll.9e b/usr/src/man/man9e/chpoll.9e
index a2adaf7a9c..24e3379861 100644
--- a/usr/src/man/man9e/chpoll.9e
+++ b/usr/src/man/man9e/chpoll.9e
@@ -123,6 +123,17 @@ The same as \fBPOLLOUT\fR.
Priority data (priority band > 0) may be written.
.RE
+.sp
+.ne 2
+.na
+\fB\fBPOLLET\fR\fR
+.ad
+.RS 14n
+The desired event is to be edge-triggered; calls to \fBpollwakeup\fR(9F)
+should not be suppressed, even if the event is pending at the time of
+call to the \fBchpoll()\fR function.
+.RE
+
.RE
.sp
@@ -200,7 +211,11 @@ be called with multiple events at one time. The \fBpollwakup()\fR can be called
regardless of whether or not the \fBchpoll()\fR entry is called; it should be
called every time the driver detects the pollable event. The driver must not
hold any mutex across the call to \fBpollwakeup\fR(9F) that is acquired in its
-\fBchpoll()\fR entry point, or a deadlock may result.
+\fBchpoll()\fR entry point, or a deadlock may result. Note that if
+\fBPOLLET\fR is set in the specified events, the driver must call
+\fBpollwakeup\fR(9F) on subsequent events, even if events are pending at
+the time of the call to \fBchpoll()\fR.
+
.RE
.SH RETURN VALUES
.sp
diff --git a/usr/src/uts/common/fs/fifofs/fifovnops.c b/usr/src/uts/common/fs/fifofs/fifovnops.c
index ac89e430c7..fee2924093 100644
--- a/usr/src/uts/common/fs/fifofs/fifovnops.c
+++ b/usr/src/uts/common/fs/fifofs/fifovnops.c
@@ -27,7 +27,9 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
+/*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
/*
* FIFOFS file system vnode operations. This file system
@@ -1832,17 +1834,16 @@ fifo_poll(vnode_t *vp, short events, int anyyet, short *reventsp,
}
/*
- * if we happened to get something, return
+ * if we happened to get something and we're not edge-triggered, return
*/
-
- if ((*reventsp = (short)retevents) != 0) {
+ if ((*reventsp = (short)retevents) != 0 && !(events & POLLET)) {
mutex_exit(&fnp->fn_lock->flk_lock);
return (0);
}
/*
- * If poll() has not found any events yet, set up event cell
- * to wake up the poll if a requested event occurs on this
+ * If poll() has not found any events yet or we're edge-triggered, set
+ * up event cell to wake up the poll if a requested event occurs on this
* pipe/fifo.
*/
if (!anyyet) {
diff --git a/usr/src/uts/common/fs/portfs/port_vnops.c b/usr/src/uts/common/fs/portfs/port_vnops.c
index b2f5088e06..ab95c0a1f8 100644
--- a/usr/src/uts/common/fs/portfs/port_vnops.c
+++ b/usr/src/uts/common/fs/portfs/port_vnops.c
@@ -24,6 +24,10 @@
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
+
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/vfs_opreg.h>
@@ -294,14 +298,10 @@ port_poll(vnode_t *vp, short events, int anyyet, short *reventsp,
levents |= POLLOUT;
levents &= events;
*reventsp = levents;
- if (levents == 0) {
- if (!anyyet) {
- *phpp = &pp->port_pollhd;
- portq->portq_flags |=
- events & POLLIN ? PORTQ_POLLIN : 0;
- portq->portq_flags |=
- events & POLLOUT ? PORTQ_POLLOUT : 0;
- }
+ if ((levents == 0 && !anyyet) || (events & POLLET)) {
+ *phpp = &pp->port_pollhd;
+ portq->portq_flags |= events & POLLIN ? PORTQ_POLLIN : 0;
+ portq->portq_flags |= events & POLLOUT ? PORTQ_POLLOUT : 0;
}
mutex_exit(&portq->portq_mutex);
return (0);
diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c
index c84b9d3726..84a8ae4d31 100644
--- a/usr/src/uts/common/fs/proc/prvnops.c
+++ b/usr/src/uts/common/fs/proc/prvnops.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -6010,7 +6010,7 @@ prpoll(vnode_t *vp, short events, int anyyet, short *reventsp,
}
*reventsp = revents;
- if (!anyyet && revents == 0) {
+ if ((!anyyet && revents == 0) || (events & POLLET)) {
/*
* Arrange to wake up the polling lwp when
* the target process/lwp stops or terminates
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
index 0be628f329..d3ff264eef 100644
--- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
+++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
@@ -953,6 +953,13 @@ so_poll(struct sonode *so, short events, int anyyet, short *reventsp,
if (!list_is_empty(&so->so_acceptq_list))
*reventsp |= (POLLIN|POLLRDNORM) & events;
+ /*
+ * If we're looking for POLLRDHUP, indicate it if we have sent the
+ * last rx signal for the socket.
+ */
+ if ((events & POLLRDHUP) && (state & SS_SENTLASTREADSIG))
+ *reventsp |= POLLRDHUP;
+
/* Data */
/* so_downcalls is null for sctp */
if (so->so_downcalls != NULL && so->so_downcalls->sd_poll != NULL) {
@@ -988,14 +995,18 @@ so_poll(struct sonode *so, short events, int anyyet, short *reventsp,
*reventsp |= POLLHUP;
}
- if (!*reventsp && !anyyet) {
+ if ((!*reventsp && !anyyet) || (events & POLLET)) {
/* Check for read events again, but this time under lock */
if (events & (POLLIN|POLLRDNORM)) {
mutex_enter(&so->so_lock);
if (SO_HAVE_DATA(so) ||
!list_is_empty(&so->so_acceptq_list)) {
+ if (events & POLLET)
+ so->so_pollev |= SO_POLLEV_IN;
+
mutex_exit(&so->so_lock);
*reventsp |= (POLLIN|POLLRDNORM) & events;
+
return (0);
} else {
so->so_pollev |= SO_POLLEV_IN;
diff --git a/usr/src/uts/common/fs/sockfs/socknotify.c b/usr/src/uts/common/fs/sockfs/socknotify.c
index 3d5ba2a7e8..3f858afecc 100644
--- a/usr/src/uts/common/fs/sockfs/socknotify.c
+++ b/usr/src/uts/common/fs/sockfs/socknotify.c
@@ -377,7 +377,7 @@ i_so_notify_last_rx(struct sonode *so, int *pollev, int *sigev)
so->so_state |= SS_SENTLASTREADSIG;
so->so_pollev &= ~SO_POLLEV_IN;
- *pollev |= POLLIN|POLLRDNORM;
+ *pollev |= POLLIN|POLLRDNORM|POLLRDHUP;
*sigev |= SOCKETSIG_READ;
return (1);
diff --git a/usr/src/uts/common/io/1394/targets/av1394/av1394_async.c b/usr/src/uts/common/io/1394/targets/av1394/av1394_async.c
index 94323582d6..c2c16e848b 100644
--- a/usr/src/uts/common/io/1394/targets/av1394/av1394_async.c
+++ b/usr/src/uts/common/io/1394/targets/av1394/av1394_async.c
@@ -24,7 +24,9 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
+/*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
/*
* av1394 asynchronous module
@@ -359,9 +361,10 @@ av1394_async_poll(av1394_inst_t *avp, short events, int anyyet, short *reventsp,
AV1394_TNF_ENTER(av1394_async_poll);
if (events & POLLIN) {
- if (av1394_peekq(rq)) {
+ if (av1394_peekq(rq))
*reventsp |= POLLIN;
- } else if (!anyyet) {
+
+ if ((!*reventsp && !anyyet) || (events & POLLET)) {
mutex_enter(&ap->a_mutex);
ap->a_pollevents |= POLLIN;
*phpp = &ap->a_pollhead;
diff --git a/usr/src/uts/common/io/devpoll.c b/usr/src/uts/common/io/devpoll.c
index 7d311aaa10..14c2f47fac 100644
--- a/usr/src/uts/common/io/devpoll.c
+++ b/usr/src/uts/common/io/devpoll.c
@@ -25,6 +25,7 @@
/*
* Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
*/
#include <sys/types.h>
@@ -45,6 +46,8 @@
#include <sys/devpoll.h>
#include <sys/rctl.h>
#include <sys/resource.h>
+#include <sys/schedctl.h>
+#include <sys/epoll.h>
#define RESERVED 1
@@ -237,7 +240,8 @@ dpinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
* stale entries!
*/
static int
-dp_pcache_poll(pollfd_t *pfdp, pollcache_t *pcp, nfds_t nfds, int *fdcntp)
+dp_pcache_poll(dp_entry_t *dpep, void *dpbuf,
+ pollcache_t *pcp, nfds_t nfds, int *fdcntp)
{
int start, ostart, end;
int fdcnt, fd;
@@ -247,7 +251,10 @@ dp_pcache_poll(pollfd_t *pfdp, pollcache_t *pcp, nfds_t nfds, int *fdcntp)
boolean_t no_wrap;
pollhead_t *php;
polldat_t *pdp;
+ pollfd_t *pfdp;
+ epoll_event_t *epoll;
int error = 0;
+ short mask = POLLRDHUP | POLLWRNORM | POLLWRBAND;
ASSERT(MUTEX_HELD(&pcp->pc_lock));
if (pcp->pc_bitmap == NULL) {
@@ -257,6 +264,14 @@ dp_pcache_poll(pollfd_t *pfdp, pollcache_t *pcp, nfds_t nfds, int *fdcntp)
*/
return (error);
}
+
+ if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) {
+ pfdp = NULL;
+ epoll = (epoll_event_t *)dpbuf;
+ } else {
+ pfdp = (pollfd_t *)dpbuf;
+ epoll = NULL;
+ }
retry:
start = ostart = pcp->pc_mapstart;
end = pcp->pc_mapend;
@@ -316,11 +331,32 @@ repoll:
* polling a closed fd. Hope this will remind
* user to do a POLLREMOVE.
*/
- pfdp[fdcnt].fd = fd;
- pfdp[fdcnt].revents = POLLNVAL;
- fdcnt++;
+ if (pfdp != NULL) {
+ pfdp[fdcnt].fd = fd;
+ pfdp[fdcnt].revents = POLLNVAL;
+ fdcnt++;
+ continue;
+ }
+
+ /*
+ * In the epoll compatibility case, we actually
+ * perform the implicit removal to remain
+ * closer to the epoll semantics.
+ */
+ ASSERT(epoll != NULL);
+
+ pdp->pd_fp = NULL;
+ pdp->pd_events = 0;
+
+ if (php != NULL) {
+ pollhead_delete(php, pdp);
+ pdp->pd_php = NULL;
+ }
+
+ BT_CLEAR(pcp->pc_bitmap, fd);
continue;
}
+
if (fp != pdp->pd_fp) {
/*
* user is polling on a cached fd which was
@@ -376,9 +412,59 @@ repoll:
}
if (revent != 0) {
- pfdp[fdcnt].fd = fd;
- pfdp[fdcnt].events = pdp->pd_events;
- pfdp[fdcnt].revents = revent;
+ if (pfdp != NULL) {
+ pfdp[fdcnt].fd = fd;
+ pfdp[fdcnt].events = pdp->pd_events;
+ pfdp[fdcnt].revents = revent;
+ } else {
+ epoll_event_t *ep = &epoll[fdcnt];
+
+ ASSERT(epoll != NULL);
+ ep->data.u64 = pdp->pd_epolldata;
+
+ /*
+ * If any of the event bits are set for
+ * which poll and epoll representations
+ * differ, swizzle in the native epoll
+ * values.
+ */
+ if (revent & mask) {
+ ep->events = (revent & ~mask) |
+ ((revent & POLLRDHUP) ?
+ EPOLLRDHUP : 0) |
+ ((revent & POLLWRNORM) ?
+ EPOLLWRNORM : 0) |
+ ((revent & POLLWRBAND) ?
+ EPOLLWRBAND : 0);
+ } else {
+ ep->events = revent;
+ }
+ }
+
+ /*
+ * If POLLET is set, clear the bit in the
+ * bitmap -- which effectively latches the
+ * edge on a pollwakeup() from the driver.
+ */
+ if (pdp->pd_events & POLLET)
+ BT_CLEAR(pcp->pc_bitmap, fd);
+
+ /*
+ * If POLLONESHOT is set, perform the implicit
+ * POLLREMOVE.
+ */
+ if (pdp->pd_events & POLLONESHOT) {
+ pdp->pd_fp = NULL;
+ pdp->pd_events = 0;
+
+ if (php != NULL) {
+ pollhead_delete(php, pdp);
+ pdp->pd_php = NULL;
+ }
+
+ BT_CLEAR(pcp->pc_bitmap, fd);
+ }
+
fdcnt++;
} else if (php != NULL) {
/*
@@ -392,7 +478,7 @@ repoll:
* in bitmap.
*/
if ((pdp->pd_php != NULL) &&
- ((pcp->pc_flag & T_POLLWAKE) == 0)) {
+ ((pcp->pc_flag & PC_POLLWAKE) == 0)) {
BT_CLEAR(pcp->pc_bitmap, fd);
}
if (pdp->pd_php == NULL) {
@@ -499,7 +585,9 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
dp_entry_t *dpep;
pollcache_t *pcp;
pollfd_t *pollfdp, *pfdp;
- int error;
+ dvpoll_epollfd_t *epfdp;
+ uintptr_t limit;
+ int error, size;
ssize_t uiosize;
nfds_t pollfdnum;
struct pollhead *php = NULL;
@@ -515,11 +603,18 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
ASSERT(dpep != NULL);
mutex_exit(&devpoll_lock);
pcp = dpep->dpe_pcache;
- if (curproc->p_pid != pcp->pc_pid) {
+
+ if (curproc->p_pid != pcp->pc_pid)
return (EACCES);
+
+ if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) {
+ size = sizeof (dvpoll_epollfd_t);
+ } else {
+ size = sizeof (pollfd_t);
}
+
uiosize = uiop->uio_resid;
- pollfdnum = uiosize / sizeof (pollfd_t);
+ pollfdnum = uiosize / size;
mutex_enter(&curproc->p_lock);
if (pollfdnum > (uint_t)rctl_enforced_value(
rctlproc_legacy[RLIMIT_NOFILE], curproc->p_rctls, curproc)) {
@@ -534,6 +629,7 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
* each polled fd to the cached set.
*/
pollfdp = kmem_alloc(uiosize, KM_SLEEP);
+ limit = (uintptr_t)pollfdp + (pollfdnum * size);
/*
* Although /dev/poll uses the write(2) interface to cache fds, it's
@@ -555,9 +651,27 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
mutex_enter(&dpep->dpe_lock);
dpep->dpe_writerwait++;
while (dpep->dpe_refcnt != 0) {
+ /*
+ * We need to do a bit of a dance here: we need to drop
+ * our dpe_lock and grab the pc_lock to broadcast the pc_cv to
+ * kick any DP_POLL/DP_PPOLL sleepers.
+ */
+ mutex_exit(&dpep->dpe_lock);
+ mutex_enter(&pcp->pc_lock);
+ pcp->pc_flag |= PC_WRITEWANTED;
+ cv_broadcast(&pcp->pc_cv);
+ mutex_exit(&pcp->pc_lock);
+ mutex_enter(&dpep->dpe_lock);
+
+ if (dpep->dpe_refcnt == 0)
+ break;
+
if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) {
dpep->dpe_writerwait--;
mutex_exit(&dpep->dpe_lock);
+ mutex_enter(&pcp->pc_lock);
+ pcp->pc_flag &= ~PC_WRITEWANTED;
+ mutex_exit(&pcp->pc_lock);
kmem_free(pollfdp, uiosize);
return (set_errno(EINTR));
}
@@ -565,13 +679,17 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
dpep->dpe_writerwait--;
dpep->dpe_flag |= DP_WRITER_PRESENT;
dpep->dpe_refcnt++;
+
mutex_exit(&dpep->dpe_lock);
mutex_enter(&pcp->pc_lock);
+ pcp->pc_flag &= ~PC_WRITEWANTED;
+
if (pcp->pc_bitmap == NULL) {
pcache_create(pcp, pollfdnum);
}
- for (pfdp = pollfdp; pfdp < pollfdp + pollfdnum; pfdp++) {
+ for (pfdp = pollfdp; (uintptr_t)pfdp < limit;
+ pfdp = (pollfd_t *)((uintptr_t)pfdp + size)) {
fd = pfdp->fd;
if ((uint_t)fd >= P_FINFO(curproc)->fi_nfiles)
continue;
@@ -580,9 +698,27 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
if (pdp == NULL) {
pdp = pcache_alloc_fd(0);
pdp->pd_fd = fd;
+
+ if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) {
+ /* LINTED pointer alignment */
+ epfdp = (dvpoll_epollfd_t *)pfdp;
+ pdp->pd_epolldata = epfdp->dpep_data;
+ }
+
pdp->pd_pcache = pcp;
pcache_insert_fd(pcp, pdp, pollfdnum);
+ } else {
+ if ((dpep->dpe_flag & DP_ISEPOLLCOMPAT) &&
+ pdp->pd_fp != NULL) {
+ /*
+ * epoll semantics demand that we error
+ * out in this case.
+ */
+ error = EEXIST;
+ break;
+ }
}
+
ASSERT(pdp->pd_fd == fd);
ASSERT(pdp->pd_pcache == pcp);
if (fd >= pcp->pc_mapsize) {
@@ -665,7 +801,17 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
}
releasef(fd);
} else {
- if (pdp == NULL) {
+ if (pdp == NULL || pdp->pd_fp == NULL) {
+ if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) {
+ /*
+ * As with the add case (above), epoll
+ * semantics demand that we error out
+ * in this case.
+ */
+ error = ENOENT;
+ break;
+ }
+
continue;
}
ASSERT(pdp->pd_fd == fd);
@@ -690,6 +836,17 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
return (error);
}
+#define DP_SIGMASK_RESTORE(ksetp) { \
+ if (ksetp != NULL) { \
+ mutex_enter(&p->p_lock); \
+ if (lwp->lwp_cursig == 0) { \
+ t->t_hold = lwp->lwp_sigoldmask; \
+ t->t_flag &= ~T_TOMASK; \
+ } \
+ mutex_exit(&p->p_lock); \
+ } \
+}
+
/*ARGSUSED*/
static int
dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
@@ -701,7 +858,7 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
int error = 0;
STRUCT_DECL(dvpoll, dvpoll);
- if (cmd == DP_POLL) {
+ if (cmd == DP_POLL || cmd == DP_PPOLL) {
/* do this now, before we sleep on DP_WRITER_PRESENT */
now = gethrtime();
}
@@ -717,6 +874,27 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
return (EACCES);
mutex_enter(&dpep->dpe_lock);
+
+ if (cmd == DP_EPOLLCOMPAT) {
+ if (dpep->dpe_refcnt != 0) {
+ /*
+ * We can't turn on epoll compatibility while there
+ * are outstanding operations.
+ */
+ mutex_exit(&dpep->dpe_lock);
+ return (EBUSY);
+ }
+
+ /*
+ * epoll compatibility is a one-way street: there's no way
+ * to turn it off for a particular open.
+ */
+ dpep->dpe_flag |= DP_ISEPOLLCOMPAT;
+ mutex_exit(&dpep->dpe_lock);
+
+ return (0);
+ }
+
while ((dpep->dpe_flag & DP_WRITER_PRESENT) ||
(dpep->dpe_writerwait != 0)) {
if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) {
@@ -729,15 +907,36 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
switch (cmd) {
case DP_POLL:
+ case DP_PPOLL:
{
pollstate_t *ps;
nfds_t nfds;
int fdcnt = 0;
+ size_t size, fdsize, dpsize;
hrtime_t deadline = 0;
+ k_sigset_t *ksetp = NULL;
+ k_sigset_t kset;
+ sigset_t set;
+ kthread_t *t = curthread;
+ klwp_t *lwp = ttolwp(t);
+ struct proc *p = ttoproc(curthread);
STRUCT_INIT(dvpoll, mode);
- error = copyin((caddr_t)arg, STRUCT_BUF(dvpoll),
- STRUCT_SIZE(dvpoll));
+
+ /*
+ * The dp_setp member is only required/consumed for DP_PPOLL,
+ * which otherwise uses the same structure as DP_POLL.
+ */
+ if (cmd == DP_POLL) {
+ dpsize = (uintptr_t)STRUCT_FADDR(dvpoll, dp_setp) -
+ (uintptr_t)STRUCT_FADDR(dvpoll, dp_fds);
+ } else {
+ ASSERT(cmd == DP_PPOLL);
+ dpsize = STRUCT_SIZE(dvpoll);
+ }
+
+ error = copyin((caddr_t)arg, STRUCT_BUF(dvpoll), dpsize);
+
if (error) {
DP_REFRELE(dpep);
return (EFAULT);
@@ -755,6 +954,52 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
deadline += now;
}
+ if (cmd == DP_PPOLL) {
+ void *setp = STRUCT_FGETP(dvpoll, dp_setp);
+
+ if (setp != NULL) {
+ if (copyin(setp, &set, sizeof (set))) {
+ DP_REFRELE(dpep);
+ return (EFAULT);
+ }
+
+ sigutok(&set, &kset);
+ ksetp = &kset;
+
+ mutex_enter(&p->p_lock);
+ schedctl_finish_sigblock(t);
+ lwp->lwp_sigoldmask = t->t_hold;
+ t->t_hold = *ksetp;
+ t->t_flag |= T_TOMASK;
+
+ /*
+ * Like ppoll() with a non-NULL sigset, we'll
+ * call cv_reltimedwait_sig() just to check for
+ * signals. This call will return immediately
+ * with either 0 (signalled) or -1 (no signal).
+ * There are some conditions whereby we can
+ * get 0 from cv_reltimedwait_sig() without
+ * a true signal (e.g., a directed stop), so
+ * we restore our signal mask in the unlikely
+ * event that lwp_cursig is 0.
+ */
+ if (!cv_reltimedwait_sig(&t->t_delay_cv,
+ &p->p_lock, 0, TR_CLOCK_TICK)) {
+ if (lwp->lwp_cursig == 0) {
+ t->t_hold = lwp->lwp_sigoldmask;
+ t->t_flag &= ~T_TOMASK;
+ }
+
+ mutex_exit(&p->p_lock);
+
+ DP_REFRELE(dpep);
+ return (EINTR);
+ }
+
+ mutex_exit(&p->p_lock);
+ }
+ }
+
if ((nfds = STRUCT_FGET(dvpoll, dp_nfds)) == 0) {
/*
* We are just using DP_POLL to sleep, so
@@ -762,17 +1007,29 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
* Do not check for signals if we have a zero timeout.
*/
DP_REFRELE(dpep);
- if (deadline == 0)
+ if (deadline == 0) {
+ DP_SIGMASK_RESTORE(ksetp);
return (0);
+ }
+
mutex_enter(&curthread->t_delay_lock);
while ((error =
cv_timedwait_sig_hrtime(&curthread->t_delay_cv,
&curthread->t_delay_lock, deadline)) > 0)
continue;
mutex_exit(&curthread->t_delay_lock);
+
+ DP_SIGMASK_RESTORE(ksetp);
+
return (error == 0 ? EINTR : 0);
}
+ if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) {
+ size = nfds * (fdsize = sizeof (epoll_event_t));
+ } else {
+ size = nfds * (fdsize = sizeof (pollfd_t));
+ }
+
/*
* XXX It would be nice not to have to alloc each time, but it
* requires another per thread structure hook. This can be
@@ -782,8 +1039,7 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
curthread->t_pollstate = pollstate_create();
ps = curthread->t_pollstate;
}
- if (ps->ps_dpbufsize < nfds) {
- struct proc *p = ttoproc(curthread);
+ if (ps->ps_dpbufsize < size) {
/*
* The maximum size should be no large than
* current maximum open file count.
@@ -792,27 +1048,28 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
if (nfds > p->p_fno_ctl) {
mutex_exit(&p->p_lock);
DP_REFRELE(dpep);
+ DP_SIGMASK_RESTORE(ksetp);
return (EINVAL);
}
mutex_exit(&p->p_lock);
- kmem_free(ps->ps_dpbuf, sizeof (pollfd_t) *
- ps->ps_dpbufsize);
- ps->ps_dpbuf = kmem_zalloc(sizeof (pollfd_t) *
- nfds, KM_SLEEP);
- ps->ps_dpbufsize = nfds;
+ kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize);
+ ps->ps_dpbuf = kmem_zalloc(size, KM_SLEEP);
+ ps->ps_dpbufsize = size;
}
mutex_enter(&pcp->pc_lock);
for (;;) {
- pcp->pc_flag = 0;
- error = dp_pcache_poll(ps->ps_dpbuf, pcp, nfds, &fdcnt);
+ pcp->pc_flag &= ~PC_POLLWAKE;
+
+ error = dp_pcache_poll(dpep, ps->ps_dpbuf,
+ pcp, nfds, &fdcnt);
if (fdcnt > 0 || error != 0)
break;
/*
* A pollwake has happened since we polled cache.
*/
- if (pcp->pc_flag & T_POLLWAKE)
+ if (pcp->pc_flag & PC_POLLWAKE)
continue;
/*
@@ -822,8 +1079,40 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
/* immediate timeout; do not check signals */
break;
}
- error = cv_timedwait_sig_hrtime(&pcp->pc_cv,
- &pcp->pc_lock, deadline);
+
+ if (!(pcp->pc_flag & PC_WRITEWANTED)) {
+ error = cv_timedwait_sig_hrtime(&pcp->pc_cv,
+ &pcp->pc_lock, deadline);
+ } else {
+ error = 1;
+ }
+
+ if (error > 0 && (pcp->pc_flag & PC_WRITEWANTED)) {
+ /*
+ * We've been kicked off of our cv because a
+ * writer wants in. We're going to drop our
+ * reference count and then wait until the
+ * writer is gone -- at which point we'll
+ * reacquire the pc_lock and call into
+ * dp_pcache_poll() to get the updated state.
+ */
+ mutex_exit(&pcp->pc_lock);
+
+ mutex_enter(&dpep->dpe_lock);
+ dpep->dpe_refcnt--;
+ cv_broadcast(&dpep->dpe_cv);
+
+ while ((dpep->dpe_flag & DP_WRITER_PRESENT) ||
+ (dpep->dpe_writerwait != 0)) {
+ error = cv_wait_sig_swap(&dpep->dpe_cv,
+ &dpep->dpe_lock);
+ }
+
+ dpep->dpe_refcnt++;
+ mutex_exit(&dpep->dpe_lock);
+ mutex_enter(&pcp->pc_lock);
+ }
+
/*
* If we were awakened by a signal or timeout
* then break the loop, else poll again.
@@ -837,9 +1126,11 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
}
mutex_exit(&pcp->pc_lock);
+ DP_SIGMASK_RESTORE(ksetp);
+
if (error == 0 && fdcnt > 0) {
- if (copyout(ps->ps_dpbuf, STRUCT_FGETP(dvpoll,
- dp_fds), sizeof (pollfd_t) * fdcnt)) {
+ if (copyout(ps->ps_dpbuf,
+ STRUCT_FGETP(dvpoll, dp_fds), fdcnt * fdsize)) {
DP_REFRELE(dpep);
return (EFAULT);
}
diff --git a/usr/src/uts/common/io/ksocket/ksocket.c b/usr/src/uts/common/io/ksocket/ksocket.c
index 49ca6f0475..8944fcbff3 100644
--- a/usr/src/uts/common/io/ksocket/ksocket.c
+++ b/usr/src/uts/common/io/ksocket/ksocket.c
@@ -22,6 +22,7 @@
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
*/
#include <sys/file.h>
@@ -820,7 +821,7 @@ ksocket_spoll(ksocket_t ks, int timo, short events, short *revents,
if (error != 0 || *revents != 0)
break;
- if (pcp->pc_flag & T_POLLWAKE)
+ if (pcp->pc_flag & PC_POLLWAKE)
continue;
if (timo == -1) {
diff --git a/usr/src/uts/common/io/usb/usba/usba_ugen.c b/usr/src/uts/common/io/usb/usba/usba_ugen.c
index cb20c24270..5852e40799 100644
--- a/usr/src/uts/common/io/usb/usba/usba_ugen.c
+++ b/usr/src/uts/common/io/usb/usba/usba_ugen.c
@@ -23,6 +23,10 @@
*/
/*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
+
+/*
* UGEN: USB Generic Driver support code
*
* This code provides entry points called by the ugen driver or other
@@ -1082,7 +1086,10 @@ usb_ugen_poll(usb_ugen_hdl_t usb_ugen_hdl, dev_t dev, short events,
((epp->ep_state &
UGEN_EP_STATE_INTR_IN_POLLING_ON) == 0)) {
*reventsp |= POLLIN;
- } else if (!anyyet) {
+ }
+
+ if ((!*reventsp && !anyyet) ||
+ (events & POLLET)) {
*phpp = &epp->ep_pollhead;
epp->ep_state |=
UGEN_EP_STATE_INTR_IN_POLL_PENDING;
@@ -1101,7 +1108,10 @@ usb_ugen_poll(usb_ugen_hdl_t usb_ugen_hdl, dev_t dev, short events,
((epp->ep_state &
UGEN_EP_STATE_ISOC_IN_POLLING_ON) == 0)) {
*reventsp |= POLLIN;
- } else if (!anyyet) {
+ }
+
+ if ((!*reventsp && !anyyet) ||
+ (events & POLLET)) {
*phpp = &epp->ep_pollhead;
epp->ep_state |=
UGEN_EP_STATE_ISOC_IN_POLL_PENDING;
@@ -1115,9 +1125,10 @@ usb_ugen_poll(usb_ugen_hdl_t usb_ugen_hdl, dev_t dev, short events,
break;
case UGEN_MINOR_DEV_STAT_NODE:
- if (ugenp->ug_ds.dev_stat & UGEN_DEV_STATUS_CHANGED) {
+ if (ugenp->ug_ds.dev_stat & UGEN_DEV_STATUS_CHANGED)
*reventsp |= POLLIN;
- } else if (!anyyet) {
+
+ if ((!*reventsp && !anyyet) || (events & POLLET)) {
*phpp = &ugenp->ug_ds.dev_pollhead;
ugenp->ug_ds.dev_stat |=
UGEN_DEV_STATUS_POLL_PENDING;
@@ -1131,9 +1142,10 @@ usb_ugen_poll(usb_ugen_hdl_t usb_ugen_hdl, dev_t dev, short events,
break;
}
} else {
- if (ugenp->ug_ds.dev_stat & UGEN_DEV_STATUS_CHANGED) {
+ if (ugenp->ug_ds.dev_stat & UGEN_DEV_STATUS_CHANGED)
*reventsp |= POLLHUP|POLLIN;
- } else if (!anyyet) {
+
+ if ((!*reventsp && !anyyet) || (events & POLLET)) {
*phpp = &ugenp->ug_ds.dev_pollhead;
ugenp->ug_ds.dev_stat |=
UGEN_DEV_STATUS_POLL_PENDING;
diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c
index f9df89923f..4c99f92411 100644
--- a/usr/src/uts/common/os/streamio.c
+++ b/usr/src/uts/common/os/streamio.c
@@ -24,6 +24,7 @@
/*
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
*/
#include <sys/types.h>
@@ -8312,7 +8313,7 @@ chkrd:
}
*reventsp = (short)retevents;
- if (retevents) {
+ if (retevents && !(events & POLLET)) {
if (headlocked)
mutex_exit(&stp->sd_lock);
return (0);
diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile
index 0a72d3d882..08b2488b97 100644
--- a/usr/src/uts/common/sys/Makefile
+++ b/usr/src/uts/common/sys/Makefile
@@ -22,6 +22,7 @@
# Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
# Copyright 2013, Joyent, Inc. All rights reserved.
# Copyright 2013 Garrett D'Amore <garrett@damore.org>
+# Copyright (c) 2014, Joyent, Inc. All rights reserved.
#
include $(SRC)/uts/Makefile.uts
@@ -212,6 +213,7 @@ CHKHDRS= \
emul64cmd.h \
emul64var.h \
epm.h \
+ epoll.h \
errno.h \
errorq.h \
errorq_impl.h \
diff --git a/usr/src/uts/common/sys/devpoll.h b/usr/src/uts/common/sys/devpoll.h
index 36c815c69f..4e4c76d9b0 100644
--- a/usr/src/uts/common/sys/devpoll.h
+++ b/usr/src/uts/common/sys/devpoll.h
@@ -24,11 +24,13 @@
* All rights reserved.
*/
+/*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
+
#ifndef _SYS_DEVPOLL_H
#define _SYS_DEVPOLL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/poll_impl.h>
#include <sys/types32.h>
@@ -39,8 +41,10 @@ extern "C" {
/* /dev/poll ioctl */
#define DPIOC (0xD0 << 8)
-#define DP_POLL (DPIOC | 1) /* poll on fds in cached in /dev/poll */
+#define DP_POLL (DPIOC | 1) /* poll on fds cached via /dev/poll */
#define DP_ISPOLLED (DPIOC | 2) /* is this fd cached in /dev/poll */
+#define DP_PPOLL (DPIOC | 3) /* ppoll on fds cached via /dev/poll */
+#define DP_EPOLLCOMPAT (DPIOC | 4) /* turn on epoll compatibility */
#define DEVPOLLSIZE 1000 /* /dev/poll table size increment */
@@ -51,14 +55,21 @@ typedef struct dvpoll {
pollfd_t *dp_fds; /* pollfd array */
nfds_t dp_nfds; /* num of pollfd's in dp_fds[] */
int dp_timeout; /* time out in milisec */
+ sigset_t *dp_setp; /* sigset, if any */
} dvpoll_t;
typedef struct dvpoll32 {
caddr32_t dp_fds; /* pollfd array */
uint32_t dp_nfds; /* num of pollfd's in dp_fds[] */
int32_t dp_timeout; /* time out in milisec */
+ caddr32_t dp_setp; /* sigset, if any */
} dvpoll32_t;
+typedef struct dvpoll_epollfd {
+ pollfd_t dpep_pollfd; /* must be first member */
+ uint64_t dpep_data; /* data payload */
+} dvpoll_epollfd_t;
+
#ifdef _KERNEL
typedef struct dp_entry {
@@ -71,6 +82,7 @@ typedef struct dp_entry {
} dp_entry_t;
#define DP_WRITER_PRESENT 0x1 /* a write is in progress */
+#define DP_ISEPOLLCOMPAT 0x2 /* epoll compatibility mode */
#define DP_REFRELE(dpep) { \
mutex_enter(&(dpep)->dpe_lock); \
diff --git a/usr/src/uts/common/sys/epoll.h b/usr/src/uts/common/sys/epoll.h
new file mode 100644
index 0000000000..9b1c37f0dd
--- /dev/null
+++ b/usr/src/uts/common/sys/epoll.h
@@ -0,0 +1,88 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_EPOLL_H
+#define _SYS_EPOLL_H
+
+#include <sys/poll.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef union epoll_data {
+ void *ptr;
+ int fd;
+ uint32_t u32;
+ uint64_t u64;
+} epoll_data_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
+typedef struct epoll_event {
+ uint32_t events; /* events */
+ epoll_data_t data; /* user-specified data */
+} epoll_event_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
+/*
+ * Define the EPOLL* constants in terms of their poll(2)/poll(7) equivalents.
+ * Note that the values match the equivalents in Linux to allow for any binary
+ * compatibility layers to not need to translate them.
+ */
+#define EPOLLIN 0x0001
+#define EPOLLPRI 0x0002
+#define EPOLLOUT 0x0004
+#define EPOLLRDNORM 0x0040
+#define EPOLLRDBAND 0x0080
+#define EPOLLWRNORM 0x0100
+#define EPOLLWRBAND 0x0200
+#define EPOLLMSG 0x0400 /* not used */
+#define EPOLLERR 0x0008
+#define EPOLLHUP 0x0010
+#define EPOLLRDHUP 0x2000
+
+#define EPOLLWAKEUP (1UL << 29) /* no meaning; silently ignored */
+#define EPOLLONESHOT (1UL << 30) /* translated to POLLONESHOT */
+#define EPOLLET (1UL << 31) /* translated to POLLET */
+
+#define EPOLL_CTL_ADD 1
+#define EPOLL_CTL_DEL 2
+#define EPOLL_CTL_MOD 3
+
+#define EPOLL_CLOEXEC 02000000
+
+#if !defined(_KERNEL)
+
+extern int epoll_create(int size);
+extern int epoll_create1(int flags);
+extern int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
+extern int epoll_wait(int epfd, struct epoll_event *events,
+ int maxevents, int timeout);
+extern int epoll_pwait(int epfd, struct epoll_event *events,
+ int maxevents, int timeout, const sigset_t *sigmask);
+
+#endif /* !_KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_EPOLL_H */
diff --git a/usr/src/uts/common/sys/poll.h b/usr/src/uts/common/sys/poll.h
index b1e222a3a3..4e3673753e 100644
--- a/usr/src/uts/common/sys/poll.h
+++ b/usr/src/uts/common/sys/poll.h
@@ -28,11 +28,13 @@
* All rights reserved.
*/
+/*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
+
#ifndef _SYS_POLL_H
#define _SYS_POLL_H
-#pragma ident "%Z%%M% %I% %E% SMI" /* SVr4.0 11.9 */
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -59,6 +61,7 @@ typedef unsigned long nfds_t;
#define POLLWRNORM POLLOUT
#define POLLRDBAND 0x0080 /* out-of-band data is readable */
#define POLLWRBAND 0x0100 /* out-of-band data is writeable */
+#define POLLRDHUP 0x4000 /* read-side hangup */
#define POLLNORM POLLRDNORM
@@ -70,7 +73,13 @@ typedef unsigned long nfds_t;
#define POLLHUP 0x0010 /* fd has been hung up on */
#define POLLNVAL 0x0020 /* invalid pollfd entry */
-#define POLLREMOVE 0x0800 /* remove a cached poll fd from /dev/poll */
+/*
+ * These events will never be specified in revents, but may be specified in
+ * events to control /dev/poll behavior.
+ */
+#define POLLREMOVE 0x0800 /* remove cached /dev/poll fd */
+#define POLLONESHOT 0x1000 /* /dev/poll should one-shot this fd */
+#define POLLET 0x2000 /* edge-triggered /dev/poll fd */
#ifdef _KERNEL
diff --git a/usr/src/uts/common/sys/poll_impl.h b/usr/src/uts/common/sys/poll_impl.h
index ede99d0df2..2e866ec4d4 100644
--- a/usr/src/uts/common/sys/poll_impl.h
+++ b/usr/src/uts/common/sys/poll_impl.h
@@ -24,11 +24,13 @@
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
+
#ifndef _SYS_POLL_IMPL_H
#define _SYS_POLL_IMPL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Caching Poll Subsystem:
*
@@ -160,6 +162,7 @@ typedef struct polldat {
int pd_nsets; /* num of xref sets, used by poll(2) */
xref_t *pd_ref; /* ptr to xref info, 1 for each set */
struct port_kevent *pd_portev; /* associated port event struct */
+ uint64_t pd_epolldata; /* epoll data, if any */
} polldat_t;
/*
@@ -187,7 +190,8 @@ typedef struct pollcache {
} pollcache_t;
/* pc_flag */
-#define T_POLLWAKE 0x02 /* pollwakeup() occurred */
+#define PC_POLLWAKE 0x02 /* pollwakeup() occurred */
+#define PC_WRITEWANTED 0x04 /* writer wishes to modify the pollcache_t */
#if defined(_KERNEL)
/*
diff --git a/usr/src/uts/common/syscall/poll.c b/usr/src/uts/common/syscall/poll.c
index 7f37529941..c33156a4fc 100644
--- a/usr/src/uts/common/syscall/poll.c
+++ b/usr/src/uts/common/syscall/poll.c
@@ -29,6 +29,7 @@
/*
* Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
*/
/*
@@ -525,13 +526,13 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
}
/*
- * If T_POLLWAKE is set, a pollwakeup() was performed on
+ * If PC_POLLWAKE is set, a pollwakeup() was performed on
* one of the file descriptors. This can happen only if
* one of the VOP_POLL() functions dropped pcp->pc_lock.
* The only current cases of this is in procfs (prpoll())
* and STREAMS (strpoll()).
*/
- if (pcp->pc_flag & T_POLLWAKE)
+ if (pcp->pc_flag & PC_POLLWAKE)
continue;
/*
@@ -886,9 +887,9 @@ retry:
}
/*
- * This function is called to inform a thread that
- * an event being polled for has occurred.
- * The pollstate lock on the thread should be held on entry.
+ * This function is called to inform a thread (or threads) that an event being
+ * polled on has occurred. The pollstate lock on the thread should be held
+ * on entry.
*/
void
pollnotify(pollcache_t *pcp, int fd)
@@ -896,8 +897,8 @@ pollnotify(pollcache_t *pcp, int fd)
ASSERT(fd < pcp->pc_mapsize);
ASSERT(MUTEX_HELD(&pcp->pc_lock));
BT_SET(pcp->pc_bitmap, fd);
- pcp->pc_flag |= T_POLLWAKE;
- cv_signal(&pcp->pc_cv);
+ pcp->pc_flag |= PC_POLLWAKE;
+ cv_broadcast(&pcp->pc_cv);
}
/*
@@ -2024,7 +2025,7 @@ retry:
*/
if ((pdp->pd_php != NULL) &&
(pollfdp[entry].events == pdp->pd_events) &&
- ((pcp->pc_flag & T_POLLWAKE) == 0)) {
+ ((pcp->pc_flag & PC_POLLWAKE) == 0)) {
BT_CLEAR(pcp->pc_bitmap, fd);
}
/*
@@ -2251,7 +2252,7 @@ pollstate_destroy(pollstate_t *ps)
pcacheset_destroy(ps->ps_pcacheset, ps->ps_nsets);
ps->ps_pcacheset = NULL;
if (ps->ps_dpbuf != NULL) {
- kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize * sizeof (pollfd_t));
+ kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize);
ps->ps_dpbuf = NULL;
}
mutex_destroy(&ps->ps_lock);
diff --git a/usr/src/uts/sun4v/io/vcc.c b/usr/src/uts/sun4v/io/vcc.c
index feeaf03e8f..85f722e467 100644
--- a/usr/src/uts/sun4v/io/vcc.c
+++ b/usr/src/uts/sun4v/io/vcc.c
@@ -24,6 +24,9 @@
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
#include <sys/types.h>
#include <sys/file.h>
@@ -2456,7 +2459,7 @@ vcc_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
*reventsp |= (events & POLLIN);
}
- if (((*reventsp) == 0) && (!anyyet)) {
+ if ((((*reventsp) == 0) && (!anyyet)) || (events & POLLET)) {
*phpp = &vport->poll;
if (events & POLLIN) {
mutex_enter(&vport->lock);