From f9eb9fdf196b6ed476e4ffc69cecd8b0da3cb7e7 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Sun, 15 Nov 2015 13:22:42 -0800 Subject: 6451 ztest fails due to checksum errors Reviewed by: George Wilson Reviewed by: Prakash Surya Reviewed by: Jorgen Lundman Approved by: Dan McDonald --- usr/src/cmd/ztest/ztest.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c index c5eae695c9..764008f5b8 100644 --- a/usr/src/cmd/ztest/ztest.c +++ b/usr/src/cmd/ztest/ztest.c @@ -4780,7 +4780,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) char path0[MAXPATHLEN]; char pathrand[MAXPATHLEN]; size_t fsize; - int bshift = SPA_OLD_MAXBLOCKSHIFT + 2; /* don't scrog all labels */ + int bshift = SPA_MAXBLOCKSHIFT + 2; /* don't scrog all labels */ int iters = 1000; int maxfaults; int mirror_save; @@ -4941,6 +4941,31 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) fsize = lseek(fd, 0, SEEK_END); while (--iters != 0) { + /* + * The offset must be chosen carefully to ensure that + * we do not inject a given logical block with errors + * on two different leaf devices, because ZFS can not + * tolerate that (if maxfaults==1). + * + * We divide each leaf into chunks of size + * (# leaves * SPA_MAXBLOCKSIZE * 4). Within each chunk + * there is a series of ranges to which we can inject errors. + * Each range can accept errors on only a single leaf vdev. + * The error injection ranges are separated by ranges + * which we will not inject errors on any device (DMZs). + * Each DMZ must be large enough such that a single block + * can not straddle it, so that a single block can not be + * a target in two different injection ranges (on different + * leaf vdevs). 
+ * + * For example, with 3 leaves, each chunk looks like: + * 0 to 32M: injection range for leaf 0 + * 32M to 64M: DMZ - no injection allowed + * 64M to 96M: injection range for leaf 1 + * 96M to 128M: DMZ - no injection allowed + * 128M to 160M: injection range for leaf 2 + * 160M to 192M: DMZ - no injection allowed + */ offset = ztest_random(fsize / (leaves << bshift)) * (leaves << bshift) + (leaf << bshift) + (ztest_random(1ULL << (bshift - 1)) & -8ULL); -- cgit v1.2.3 From 3d729aecc03ea6ebb9bd5d56b8dccd24f57daa41 Mon Sep 17 00:00:00 2001 From: Jerry Jelinek Date: Thu, 15 Oct 2015 16:26:52 -0700 Subject: 6342 want signalfd support Reviewed by: Patrick Mooney Reviewed by: Robert Mustacchi Reviewed by: Igor Kozhukhov Reviewed by: Garrett D'Amore Approved by: Dan McDonald --- usr/src/cmd/devfsadm/misc_link.c | 3 + usr/src/lib/libc/amd64/Makefile | 1 + usr/src/lib/libc/i386/Makefile.com | 1 + usr/src/lib/libc/port/mapfile-vers | 5 + usr/src/lib/libc/port/sys/signalfd.c | 60 ++ usr/src/lib/libc/sparc/Makefile.com | 1 + usr/src/man/man3c/Makefile | 1 + usr/src/man/man3c/signalfd.3c | 192 ++++++ usr/src/pkg/manifests/system-header.mf | 1 + usr/src/pkg/manifests/system-kernel.mf | 7 + usr/src/pkg/manifests/system-library.man3c.inc | 1 + usr/src/uts/common/Makefile.files | 2 + usr/src/uts/common/io/signalfd.c | 774 +++++++++++++++++++++++++ usr/src/uts/common/io/signalfd.conf | 16 + usr/src/uts/common/os/exit.c | 8 + usr/src/uts/common/os/sig.c | 16 + usr/src/uts/common/sys/Makefile | 1 + usr/src/uts/common/sys/proc.h | 2 + usr/src/uts/common/sys/signalfd.h | 100 ++++ usr/src/uts/common/sys/thread.h | 2 +- usr/src/uts/intel/Makefile.intel | 1 + usr/src/uts/intel/signalfd/Makefile | 68 +++ usr/src/uts/sparc/Makefile.sparc | 1 + usr/src/uts/sparc/signalfd/Makefile | 68 +++ 24 files changed, 1331 insertions(+), 1 deletion(-) create mode 100644 usr/src/lib/libc/port/sys/signalfd.c create mode 100644 usr/src/man/man3c/signalfd.3c create mode 100644 
usr/src/uts/common/io/signalfd.c create mode 100644 usr/src/uts/common/io/signalfd.conf create mode 100644 usr/src/uts/common/sys/signalfd.h create mode 100644 usr/src/uts/intel/signalfd/Makefile create mode 100644 usr/src/uts/sparc/signalfd/Makefile diff --git a/usr/src/cmd/devfsadm/misc_link.c b/usr/src/cmd/devfsadm/misc_link.c index bf59fb5e6b..5f241df296 100644 --- a/usr/src/cmd/devfsadm/misc_link.c +++ b/usr/src/cmd/devfsadm/misc_link.c @@ -92,6 +92,9 @@ static devfsadm_create_t misc_cbt[] = { { "pseudo", "ddi_pseudo", "eventfd", TYPE_EXACT | DRV_EXACT, ILEVEL_0, minor_name }, + { "pseudo", "ddi_pseudo", "signalfd", + TYPE_EXACT | DRV_EXACT, ILEVEL_0, minor_name + }, { "pseudo", "ddi_pseudo", "rsm", TYPE_EXACT | DRV_EXACT, ILEVEL_0, minor_name }, diff --git a/usr/src/lib/libc/amd64/Makefile b/usr/src/lib/libc/amd64/Makefile index dbda6c0c31..a968fa45f7 100644 --- a/usr/src/lib/libc/amd64/Makefile +++ b/usr/src/lib/libc/amd64/Makefile @@ -902,6 +902,7 @@ PORTSYS= \ sidsys.o \ siginterrupt.o \ signal.o \ + signalfd.o \ sigpending.o \ sigstack.o \ stat.o \ diff --git a/usr/src/lib/libc/i386/Makefile.com b/usr/src/lib/libc/i386/Makefile.com index 4ebd6473a9..c0f74678f8 100644 --- a/usr/src/lib/libc/i386/Makefile.com +++ b/usr/src/lib/libc/i386/Makefile.com @@ -942,6 +942,7 @@ PORTSYS= \ sidsys.o \ siginterrupt.o \ signal.o \ + signalfd.o \ sigpending.o \ sigstack.o \ stat.o \ diff --git a/usr/src/lib/libc/port/mapfile-vers b/usr/src/lib/libc/port/mapfile-vers index f7f6e6a137..d62c86b364 100644 --- a/usr/src/lib/libc/port/mapfile-vers +++ b/usr/src/lib/libc/port/mapfile-vers @@ -93,6 +93,11 @@ $if _x86 && _ELF64 $add amd64 $endif +SYMBOL_VERSION ILLUMOS_0.18 { # signalfd + protected: + signalfd; +} ILLUMOS_0.17; + SYMBOL_VERSION ILLUMOS_0.17 { # glob(3C) LFS $if lf64 protected: diff --git a/usr/src/lib/libc/port/sys/signalfd.c b/usr/src/lib/libc/port/sys/signalfd.c new file mode 100644 index 0000000000..0080c52bdf --- /dev/null +++ 
b/usr/src/lib/libc/port/sys/signalfd.c @@ -0,0 +1,60 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015, Joyent, Inc. + */ + +#include +#include +#include +#include +#include + +int +signalfd(int fd, const sigset_t *mask, int flags) +{ + int origfd = fd; + + if (fd == -1) { + int oflags = O_RDONLY; + + if (flags & ~(SFD_NONBLOCK | SFD_CLOEXEC)) { + errno = EINVAL; + return (-1); + } + + if (flags & SFD_NONBLOCK) + oflags |= O_NONBLOCK; + + if (flags & SFD_CLOEXEC) + oflags |= O_CLOEXEC; + + if ((fd = open("/dev/signalfd", oflags)) < 0) + return (-1); + } + + if (ioctl(fd, SIGNALFDIOC_MASK, mask) != 0) { + if (origfd == -1) { + int old = errno; + (void) close(fd); + errno = old; + } + /* + * Trying to modify an existing sigfd so if this failed + * it's because it's not a valid fd or not a sigfd. ioctl + * returns the correct errno for these cases. 
+ */ + return (-1); + } + + return (fd); +} diff --git a/usr/src/lib/libc/sparc/Makefile.com b/usr/src/lib/libc/sparc/Makefile.com index 2228bc848a..dc50cf3b38 100644 --- a/usr/src/lib/libc/sparc/Makefile.com +++ b/usr/src/lib/libc/sparc/Makefile.com @@ -976,6 +976,7 @@ PORTSYS= \ sidsys.o \ siginterrupt.o \ signal.o \ + signalfd.o \ sigpending.o \ sigstack.o \ stat.o \ diff --git a/usr/src/man/man3c/Makefile b/usr/src/man/man3c/Makefile index c38d65a57a..9ba004eff7 100644 --- a/usr/src/man/man3c/Makefile +++ b/usr/src/man/man3c/Makefile @@ -411,6 +411,7 @@ MANFILES= __fbufsize.3c \ sigfpe.3c \ siginterrupt.3c \ signal.3c \ + signalfd.3c \ sigqueue.3c \ sigsetops.3c \ sigstack.3c \ diff --git a/usr/src/man/man3c/signalfd.3c b/usr/src/man/man3c/signalfd.3c new file mode 100644 index 0000000000..43699a50a5 --- /dev/null +++ b/usr/src/man/man3c/signalfd.3c @@ -0,0 +1,192 @@ +.\" +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. +.\" +.\" Copyright 2015, Joyent, Inc. +.\" +.Dd "Jun 15, 2015" +.Dt SIGNALFD 3C +.Os +.Sh NAME +.Nm signalfd +.Nd create or modify a file descriptor for signal handling +.Sh SYNOPSIS +.In sys/signalfd.h +. +.Ft int +.Fo signalfd +.Fa "int fd" +.Fa "const sigset_t *mask" +.Fa "int flags" +.Fc +. +.Sh DESCRIPTION +The +.Fn signalfd +function returns a file descriptor that can be used +for synchronous consumption of signals. The file descriptor can be operated +upon via +.Xr read 2 +and the facilities that notify of file descriptor activity (e.g. +.Xr poll 2 , +.Xr port_get 3C , +.Xr epoll_wait 3C +). To dispose of the instance +.Xr close 2 +should be called on the file descriptor. 
+.Pp +If the +.Va fd +argument is -1, a new signalfd file descriptor will be +returned, otherwise the +.Va fd +argument should be an existing signalfd file descriptor whose signal mask will +be updated. +.Pp +The +.Va mask +argument specifies the set of signals that are relevant to the file descriptor. +It may be manipulated with the standard signal set manipulation functions +documented in +.Xr sigsetops 3C . +Signals in the mask which cannot be caught (e.g. +.Fa SIGKILL ) +are ignored. +.Pp +The +.Va flags +argument specifies additional parameters for the instance, and can have any of +the following values: +.Bl -tag -width Dv +.It Sy SFD_CLOEXEC +Instance will be closed upon an +.Xr exec 2 ; +see description for +.Fa O_CLOEXEC +in +.Xr open 2 . +.It Sy SFD_NONBLOCK +Instance will be set to be non-blocking. A +.Xr read 2 +on a signalfd instance that has been initialized with +.Fa SFD_NONBLOCK , +or made non-blocking in other ways, will return +.Er EAGAIN +in lieu of blocking if there are no signals from the +.Va mask +that are pending. +.El +.Pp +As with +.Xr sigwait 2 , +reading a signal from the file descriptor will consume the signal. The signals +used with signalfd file descriptors are normally first blocked so that their +handler does not run when a signal arrives. If the signal is not blocked the +behavior matches that of +.Xr sigwait 2 ; +if a +.Xr read 2 +is pending then the signal is consumed by the read, otherwise the signal is +consumed by the handler. +.Pp +The following operations can be performed upon a signalfd file descriptor: +.Bl -tag -width Dv +.It Sy read(2) +Reads and consumes one or more of the pending signals that match the file +descriptor's +.Va mask . +The read buffer must be large enough to hold one or more +.Vt signalfd_siginfo +structures, which is described below. +.Xr read 2 +will block if there are no matching signals pending, or return +.Er EAGAIN +if the instance was created with +.Fa SFD_NONBLOCK . 
+After a +.Xr fork 2 , +if the child reads from the descriptor it will only consume signals from itself. +.It Sy poll(2) +Provide notification when one of the signals from the +.Va mask +arrives. +.Fa POLLIN +and +.Fa POLLRDNORM +will be set. +.It Sy close(2) +Closes the descriptor. +.El +.Pp +The +.Vt signalfd_siginfo +structure returned from +.Xr read 2 +is a fixed size 128 byte structure defined as follows: +.Bd -literal +typedef struct signalfd_siginfo { + uint32_t ssi_signo; /* signal from signal.h */ + int32_t ssi_errno; /* error from errno.h */ + int32_t ssi_code; /* signal code */ + uint32_t ssi_pid; /* PID of sender */ + uint32_t ssi_uid; /* real UID of sender */ + int32_t ssi_fd; /* file descriptor (SIGIO) */ + uint32_t ssi_tid; /* unused */ + uint32_t ssi_band; /* band event (SIGIO) */ + uint32_t ssi_overrun; /* unused */ + uint32_t ssi_trapno; /* trap number that caused signal */ + int32_t ssi_status; /* exit status or signal (SIGCHLD) */ + int32_t ssi_int; /* unused */ + uint64_t ssi_ptr; /* unused */ + uint64_t ssi_utime; /* user CPU time consumed (SIGCHLD) */ + uint64_t ssi_stime; /* system CPU time consumed (SIGCHLD) */ + uint64_t ssi_addr; /* address that generated signal */ + uint8_t ssi_pad[48]; /* pad size to 128 bytes */ +} signalfd_siginfo_t; +.Ed +.Sh RETURN VALUES +Upon successful completion, a file descriptor associated with the instance +is returned. Otherwise, -1 is returned and errno is set to indicate the error. +When +.Va fd +is not -1 and there is no error, the value of +.Va fd +is returned. +.Sh ERRORS +The +.Fn signalfd +function will fail if: +.Bl -tag -width Er +.It Er EBADF +The +.Va fd +descriptor is invalid. +.It Er EFAULT +The +.Va mask +address is invalid. +.It Er EINVAL +The +.Va fd +descriptor is not a signalfd descriptor or the +.Va flags +are invalid. +.It Er EMFILE +There are currently +.Va OPEN_MAX +file descriptors open in the calling process. +.It Er ENODEV +Unable to allocate state for the file descriptor. 
+.El +.Sh SEE ALSO +.Xr poll 2 , +.Xr sigwait 2 , +.Xr sigsetops 3C , +.Xr sigwaitinfo 3C , +.Xr signal.h 3HEAD diff --git a/usr/src/pkg/manifests/system-header.mf b/usr/src/pkg/manifests/system-header.mf index 4551ca095c..697b75b6c2 100644 --- a/usr/src/pkg/manifests/system-header.mf +++ b/usr/src/pkg/manifests/system-header.mf @@ -1428,6 +1428,7 @@ file path=usr/include/sys/shm_impl.h file path=usr/include/sys/sid.h file path=usr/include/sys/siginfo.h file path=usr/include/sys/signal.h +file path=usr/include/sys/signalfd.h file path=usr/include/sys/skein.h file path=usr/include/sys/sleepq.h file path=usr/include/sys/smbios.h diff --git a/usr/src/pkg/manifests/system-kernel.mf b/usr/src/pkg/manifests/system-kernel.mf index d3cf047cb9..a00bb109cc 100644 --- a/usr/src/pkg/manifests/system-kernel.mf +++ b/usr/src/pkg/manifests/system-kernel.mf @@ -94,6 +94,9 @@ dir path=lib/svc dir path=lib/svc/manifest group=sys dir path=lib/svc/manifest/system group=sys dir path=lib/svc/method +dir path=usr/kernel group=sys +dir path=usr/kernel/drv group=sys +dir path=usr/kernel/drv/$(ARCH64) group=sys dir path=usr/share/man dir path=usr/share/man/man1m dir path=usr/share/man/man2 @@ -254,6 +257,7 @@ $(i386_ONLY)driver name=sd perms="* 0640 root sys" \ driver name=sgen perms="* 0600 root sys" \ alias=scsa,08.bfcp \ alias=scsa,08.bvhci +driver name=signalfd perms="* 0666 root sys" driver name=simnet clone_perms="simnet 0666 root sys" perms="* 0666 root sys" $(i386_ONLY)driver name=smbios perms="smbios 0444 root sys" driver name=softmac @@ -820,6 +824,9 @@ file path=lib/svc/manifest/system/scheduler.xml group=sys mode=0444 file path=lib/svc/method/svc-dumpadm mode=0555 file path=lib/svc/method/svc-intrd mode=0555 file path=lib/svc/method/svc-scheduler mode=0555 +file path=usr/kernel/drv/$(ARCH64)/signalfd group=sys +$(i386_ONLY)file path=usr/kernel/drv/signalfd group=sys +file path=usr/kernel/drv/signalfd.conf group=sys $(sparc_ONLY)file path=usr/share/man/man1m/monitor.1m 
$(sparc_ONLY)file path=usr/share/man/man1m/obpsym.1m # On SPARC driver/bscv is Serverblade1 specific, and in system/kernel/platform diff --git a/usr/src/pkg/manifests/system-library.man3c.inc b/usr/src/pkg/manifests/system-library.man3c.inc index ae061edac9..30999ee484 100644 --- a/usr/src/pkg/manifests/system-library.man3c.inc +++ b/usr/src/pkg/manifests/system-library.man3c.inc @@ -406,6 +406,7 @@ file path=usr/share/man/man3c/shm_unlink.3c file path=usr/share/man/man3c/sigfpe.3c file path=usr/share/man/man3c/siginterrupt.3c file path=usr/share/man/man3c/signal.3c +file path=usr/share/man/man3c/signalfd.3c file path=usr/share/man/man3c/sigqueue.3c file path=usr/share/man/man3c/sigsetops.3c file path=usr/share/man/man3c/sigstack.3c diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index e64cf0db35..e0530c886f 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -993,6 +993,8 @@ DEVPOOL_OBJS += devpool.o EVENTFD_OBJS += eventfd.o +SIGNALFD_OBJS += signalfd.o + I8042_OBJS += i8042.o KB8042_OBJS += \ diff --git a/usr/src/uts/common/io/signalfd.c b/usr/src/uts/common/io/signalfd.c new file mode 100644 index 0000000000..32f8f85f7a --- /dev/null +++ b/usr/src/uts/common/io/signalfd.c @@ -0,0 +1,774 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +/* + * Support for the signalfd facility, a Linux-borne facility for + * file descriptor-based synchronous signal consumption. 
+ * + * As described on the signalfd(3C) man page, the general idea behind these + * file descriptors is that they can be used to synchronously consume signals + * via the read(2) syscall. That capability already exists with the + * sigwaitinfo(3C) function but the key advantage of signalfd is that, because + * it is file descriptor based, poll(2) can be used to determine when signals + * are available to be consumed. + * + * The general implementation uses signalfd_state to hold both the signal set + * and poll head for an open file descriptor. Because a process can be using + * different sigfds with different signal sets, each signalfd_state poll head + * can be thought of as an independent signal stream and the thread(s) waiting + * on that stream will get poll notification when any signal in the + * corresponding set is received. + * + * The sigfd_proc_state_t struct lives on the proc_t and maintains per-proc + * state for function callbacks and data when the proc needs to do work during + * signal delivery for pollwakeup. + * + * The read side of the implementation is straightforward and mimics the + * kernel behavior for sigtimedwait(). Signals continue to live on either + * the proc's p_sig, or thread's t_sig, member. Read consumes the signal so + * that it is no longer pending. + * + * The poll side is more complex since all of the sigfds on the process need + * to be examined every time a signal is delivered to the process in order to + * pollwake any thread waiting in poll for that signal. + * + * Because it is likely that a process will only be using one, or a few, sigfds, + * but many total file descriptors, we maintain a list of sigfds which need + * pollwakeup. The list lives on the proc's p_sigfd struct. In this way only + * zero, or a few, of the state structs will need to be examined every time a + * signal is delivered to the process, instead of having to examine all of the + * file descriptors to find the state structs. 
When a state struct with a + * matching signal set is found then pollwakeup is called. + * + * The sigfd_list is self-cleaning; as signalfd_pollwake_cb is called, the list + * will clear out on its own. There is an exit helper (signalfd_exit_helper) + * which cleans up any remaining per-proc state when the process exits. + * + * The main complexity with signalfd is the interaction of forking and polling. + * This interaction is complex because now two processes have a fd that + * references the same dev_t (and its associated signalfd_state), but signals + * go to only one of those processes. Also, we don't know when one of the + * processes closes its fd because our 'close' entry point is only called when + * the last fd is closed (which could be by either process). + * + * Because the state struct is referenced by both file descriptors, and the + * state struct represents a signal stream needing a pollwakeup, if both + * processes were polling then both processes would get a pollwakeup when a + * signal arrives for either process (that is, the pollhead is associated with + * our dev_t so when a signal arrives the pollwakeup wakes up all waiters). + * + * Fortunately this is not a common problem in practice, but the implementation + * attempts to mitigate unexpected behavior. The typical behavior is that the + * parent has been polling the signalfd (which is why it was open in the first + * place) and the parent might have a pending signalfd_state (with the + * pollhead) on its per-process sigfd_list. After the fork the child will + * simply close that fd (among others) as part of the typical fork/close/exec + * pattern. Because the child will never poll that fd, it will never get any + * state onto its own sigfd_list (the child starts with a null list). The + * intention is that the child sees no pollwakeup activity for signals unless + * it explicitly reinvokes poll on the sigfd. 
+ * + * As background, there are two primary polling cases to consider when the + * parent process forks: + * 1) If any thread is blocked in poll(2) then both the parent and child will + * return from the poll syscall with EINTR. This means that if either + * process wants to re-poll on a sigfd then it needs to re-run poll and + * would come back in to the signalfd_poll entry point. The parent would + * already have the dev_t's state on its sigfd_list and the child would not + * have anything there unless it called poll again on its fd. + * 2) If the process is using /dev/poll(7D) then the polling info is being + * cached by the poll device and the process might not currently be blocked + * on anything polling related. A subsequent DP_POLL ioctl will not invoke + * our signalfd_poll entry point again. Because the parent still has its + * sigfd_list setup, an incoming signal will hit our signalfd_pollwake_cb + * entry point, which in turn calls pollwake, and /dev/poll will do the + * right thing on DP_POLL. The child will not have a sigfd_list yet so the + * signal will not cause a pollwakeup. The dp code does its own handling for + * cleaning up its cache. + * + * This leaves only one odd corner case. If the parent and child both use + * the dup-ed sigfd to poll then when a signal is delivered to either process + * there is no way to determine which one should get the pollwakeup (since + * both processes will be queued on the same signal stream poll head). What + * happens in this case is that both processes will return from poll, but only + * one of them will actually have a signal to read. The other will return + * from read with EAGAIN, or block. This case is actually similar to the + * situation within a single process which got two different sigfd's with the + * same mask (or poll on two fd's that are dup-ed). Both would return from poll + * when a signal arrives but only one read would consume the signal and the + * other read would fail or block. 
Applications which poll on shared fd's + * cannot assume that a subsequent read will actually obtain data. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct signalfd_state signalfd_state_t; + +struct signalfd_state { + kmutex_t sfd_lock; /* lock protecting state */ + pollhead_t sfd_pollhd; /* poll head */ + k_sigset_t sfd_set; /* signals for this fd */ + signalfd_state_t *sfd_next; /* next state on global list */ +}; + +/* + * Internal global variables. + */ +static kmutex_t signalfd_lock; /* lock protecting state */ +static dev_info_t *signalfd_devi; /* device info */ +static id_space_t *signalfd_minor; /* minor number arena */ +static void *signalfd_softstate; /* softstate pointer */ +static signalfd_state_t *signalfd_state; /* global list of state */ + +/* + * If we don't already have an entry in the proc's list for this state, add one. + */ +static void +signalfd_wake_list_add(signalfd_state_t *state) +{ + proc_t *p = curproc; + list_t *lst; + sigfd_wake_list_t *wlp; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(p->p_sigfd != NULL); + + lst = &((sigfd_proc_state_t *)p->p_sigfd)->sigfd_list; + for (wlp = list_head(lst); wlp != NULL; wlp = list_next(lst, wlp)) { + if (wlp->sigfd_wl_state == state) + break; + } + + if (wlp == NULL) { + wlp = kmem_zalloc(sizeof (sigfd_wake_list_t), KM_SLEEP); + wlp->sigfd_wl_state = state; + list_insert_head(lst, wlp); + } +} + +static void +signalfd_wake_rm(list_t *lst, sigfd_wake_list_t *wlp) +{ + list_remove(lst, wlp); + kmem_free(wlp, sizeof (sigfd_wake_list_t)); +} + +static void +signalfd_wake_list_rm(proc_t *p, signalfd_state_t *state) +{ + sigfd_wake_list_t *wlp; + list_t *lst; + + ASSERT(MUTEX_HELD(&p->p_lock)); + + if (p->p_sigfd == NULL) + return; + + lst = &((sigfd_proc_state_t *)p->p_sigfd)->sigfd_list; + for (wlp = list_head(lst); wlp != NULL; wlp = list_next(lst, wlp)) { + if (wlp->sigfd_wl_state == state) { + signalfd_wake_rm(lst, 
wlp); + break; + } + } + + if (list_is_empty(lst)) { + ((sigfd_proc_state_t *)p->p_sigfd)->sigfd_pollwake_cb = NULL; + list_destroy(lst); + kmem_free(p->p_sigfd, sizeof (sigfd_proc_state_t)); + p->p_sigfd = NULL; + } +} + +static void +signalfd_wake_list_cleanup(proc_t *p) +{ + sigfd_wake_list_t *wlp; + list_t *lst; + + ASSERT(MUTEX_HELD(&p->p_lock)); + + ((sigfd_proc_state_t *)p->p_sigfd)->sigfd_pollwake_cb = NULL; + + lst = &((sigfd_proc_state_t *)p->p_sigfd)->sigfd_list; + while (!list_is_empty(lst)) { + wlp = (sigfd_wake_list_t *)list_remove_head(lst); + kmem_free(wlp, sizeof (sigfd_wake_list_t)); + } +} + +static void +signalfd_exit_helper(void) +{ + proc_t *p = curproc; + list_t *lst; + + /* This being non-null is the only way we can get here */ + ASSERT(p->p_sigfd != NULL); + + mutex_enter(&p->p_lock); + lst = &((sigfd_proc_state_t *)p->p_sigfd)->sigfd_list; + + signalfd_wake_list_cleanup(p); + list_destroy(lst); + kmem_free(p->p_sigfd, sizeof (sigfd_proc_state_t)); + p->p_sigfd = NULL; + mutex_exit(&p->p_lock); +} + +/* + * Called every time a signal is delivered to the process so that we can + * see if any signal stream needs a pollwakeup. We maintain a list of + * signal state elements so that we don't have to look at every file descriptor + * on the process. If necessary, a further optimization would be to maintain a + * signal set mask that is a union of all of the sets in the list so that + * we don't even traverse the list if the signal is not in one of the elements. + * However, since the list is likely to be very short, this is not currently + * being done. A more complex data structure might also be used, but it is + * unclear what that would be since each signal set needs to be checked for a + * match. 
+ */ +static void +signalfd_pollwake_cb(void *arg0, int sig) +{ + proc_t *p = (proc_t *)arg0; + list_t *lst; + sigfd_wake_list_t *wlp; + + ASSERT(MUTEX_HELD(&p->p_lock)); + + if (p->p_sigfd == NULL) + return; + + lst = &((sigfd_proc_state_t *)p->p_sigfd)->sigfd_list; + wlp = list_head(lst); + while (wlp != NULL) { + signalfd_state_t *state = wlp->sigfd_wl_state; + + mutex_enter(&state->sfd_lock); + + if (sigismember(&state->sfd_set, sig) && + state->sfd_pollhd.ph_list != NULL) { + sigfd_wake_list_t *tmp = wlp; + + /* remove it from the list */ + wlp = list_next(lst, wlp); + signalfd_wake_rm(lst, tmp); + + mutex_exit(&state->sfd_lock); + pollwakeup(&state->sfd_pollhd, POLLRDNORM | POLLIN); + } else { + mutex_exit(&state->sfd_lock); + wlp = list_next(lst, wlp); + } + } +} + +_NOTE(ARGSUSED(1)) +static int +signalfd_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) +{ + signalfd_state_t *state; + major_t major = getemajor(*devp); + minor_t minor = getminor(*devp); + + if (minor != SIGNALFDMNRN_SIGNALFD) + return (ENXIO); + + mutex_enter(&signalfd_lock); + + minor = (minor_t)id_allocff(signalfd_minor); + + if (ddi_soft_state_zalloc(signalfd_softstate, minor) != DDI_SUCCESS) { + id_free(signalfd_minor, minor); + mutex_exit(&signalfd_lock); + return (ENODEV); + } + + state = ddi_get_soft_state(signalfd_softstate, minor); + *devp = makedevice(major, minor); + + state->sfd_next = signalfd_state; + signalfd_state = state; + + mutex_exit(&signalfd_lock); + + return (0); +} + +/* + * Consume one signal from our set in a manner similar to sigtimedwait(). + * The block parameter is used to control whether we wait for a signal or + * return immediately if no signal is pending. We use the thread's t_sigwait + * member in the same way that it is used by sigtimedwait. + * + * Return 0 if we successfully consumed a signal or an errno if not. 
+ */ +static int +consume_signal(k_sigset_t set, uio_t *uio, boolean_t block) +{ + k_sigset_t oldmask; + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + proc_t *p = ttoproc(t); + timespec_t now; + timespec_t *rqtp = NULL; /* null means blocking */ + int timecheck = 0; + int ret = 0; + k_siginfo_t info, *infop; + signalfd_siginfo_t ssi, *ssp = &ssi; + + if (block == B_FALSE) { + timecheck = timechanged; + gethrestime(&now); + rqtp = &now; /* non-blocking check for pending signals */ + } + + t->t_sigwait = set; + + mutex_enter(&p->p_lock); + /* + * set the thread's signal mask to unmask those signals in the + * specified set. + */ + schedctl_finish_sigblock(t); + oldmask = t->t_hold; + sigdiffset(&t->t_hold, &t->t_sigwait); + + /* + * Based on rqtp, wait indefinitely until we take a signal in our set + * or return immediately if there are no signals pending from our set. + */ + while ((ret = cv_waituntil_sig(&t->t_delay_cv, &p->p_lock, rqtp, + timecheck)) > 0) + continue; + + /* Restore thread's signal mask to its previous value. */ + t->t_hold = oldmask; + t->t_sig_check = 1; /* so post_syscall sees new t_hold mask */ + + if (ret == -1) { + /* no signals pending */ + mutex_exit(&p->p_lock); + sigemptyset(&t->t_sigwait); + return (EAGAIN); /* no signals pending */ + } + + /* Don't bother with signal if it is not in request set. */ + if (lwp->lwp_cursig == 0 || + !sigismember(&t->t_sigwait, lwp->lwp_cursig)) { + mutex_exit(&p->p_lock); + /* + * lwp_cursig is zero if pokelwps() awakened cv_wait_sig(). + * This happens if some other thread in this process called + * forkall() or exit(). 
+ */ + sigemptyset(&t->t_sigwait); + return (EINTR); + } + + if (lwp->lwp_curinfo) { + infop = &lwp->lwp_curinfo->sq_info; + } else { + infop = &info; + bzero(infop, sizeof (info)); + infop->si_signo = lwp->lwp_cursig; + infop->si_code = SI_NOINFO; + } + + lwp->lwp_ru.nsignals++; + + DTRACE_PROC2(signal__clear, int, ret, ksiginfo_t *, infop); + lwp->lwp_cursig = 0; + lwp->lwp_extsig = 0; + mutex_exit(&p->p_lock); + + /* Convert k_siginfo into external, datamodel independent, struct. */ + bzero(ssp, sizeof (*ssp)); + ssp->ssi_signo = infop->si_signo; + ssp->ssi_errno = infop->si_errno; + ssp->ssi_code = infop->si_code; + ssp->ssi_pid = infop->si_pid; + ssp->ssi_uid = infop->si_uid; + ssp->ssi_fd = infop->si_fd; + ssp->ssi_band = infop->si_band; + ssp->ssi_trapno = infop->si_trapno; + ssp->ssi_status = infop->si_status; + ssp->ssi_utime = infop->si_utime; + ssp->ssi_stime = infop->si_stime; + ssp->ssi_addr = (uint64_t)(intptr_t)infop->si_addr; + + ret = uiomove(ssp, sizeof (*ssp), UIO_READ, uio); + + if (lwp->lwp_curinfo) { + siginfofree(lwp->lwp_curinfo); + lwp->lwp_curinfo = NULL; + } + sigemptyset(&t->t_sigwait); + return (ret); +} + +/* + * This is similar to sigtimedwait. Based on the fd mode we may wait until a + * signal within our specified set is posted. We consume as many available + * signals within our set as we can. 
+ */ +_NOTE(ARGSUSED(2)) +static int +signalfd_read(dev_t dev, uio_t *uio, cred_t *cr) +{ + signalfd_state_t *state; + minor_t minor = getminor(dev); + boolean_t block = B_TRUE; + k_sigset_t set; + boolean_t got_one = B_FALSE; + int res; + + if (uio->uio_resid < sizeof (signalfd_siginfo_t)) + return (EINVAL); + + state = ddi_get_soft_state(signalfd_softstate, minor); + + if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) + block = B_FALSE; + + mutex_enter(&state->sfd_lock); + set = state->sfd_set; + mutex_exit(&state->sfd_lock); + + if (sigisempty(&set)) + return (set_errno(EINVAL)); + + do { + res = consume_signal(state->sfd_set, uio, block); + if (res == 0) + got_one = B_TRUE; + + /* + * After consuming one signal we won't block trying to consume + * further signals. + */ + block = B_FALSE; + } while (res == 0 && uio->uio_resid >= sizeof (signalfd_siginfo_t)); + + if (got_one) + res = 0; + + return (res); +} + +/* + * If ksigset_t's were a single word, we would do: + * return (((p->p_sig | t->t_sig) & set) & fillset); + */ +static int +signalfd_sig_pending(proc_t *p, kthread_t *t, k_sigset_t set) +{ + return (((p->p_sig.__sigbits[0] | t->t_sig.__sigbits[0]) & + set.__sigbits[0]) | + ((p->p_sig.__sigbits[1] | t->t_sig.__sigbits[1]) & + set.__sigbits[1]) | + (((p->p_sig.__sigbits[2] | t->t_sig.__sigbits[2]) & + set.__sigbits[2]) & FILLSET2)); +} + +_NOTE(ARGSUSED(4)) +static int +signalfd_poll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + signalfd_state_t *state; + minor_t minor = getminor(dev); + kthread_t *t = curthread; + proc_t *p = ttoproc(t); + short revents = 0; + + state = ddi_get_soft_state(signalfd_softstate, minor); + + mutex_enter(&state->sfd_lock); + + if (signalfd_sig_pending(p, t, state->sfd_set) != 0) + revents |= POLLRDNORM | POLLIN; + + mutex_exit(&state->sfd_lock); + + if (!(*reventsp = revents & events) && !anyyet) { + *phpp = &state->sfd_pollhd; + + /* + * Enable pollwakeup handling. 
+ */ + if (p->p_sigfd == NULL) { + sigfd_proc_state_t *pstate; + + pstate = kmem_zalloc(sizeof (sigfd_proc_state_t), + KM_SLEEP); + list_create(&pstate->sigfd_list, + sizeof (sigfd_wake_list_t), + offsetof(sigfd_wake_list_t, sigfd_wl_lst)); + + mutex_enter(&p->p_lock); + /* check again now that we're locked */ + if (p->p_sigfd == NULL) { + p->p_sigfd = pstate; + } else { + /* someone beat us to it */ + list_destroy(&pstate->sigfd_list); + kmem_free(pstate, sizeof (sigfd_proc_state_t)); + } + mutex_exit(&p->p_lock); + } + + mutex_enter(&p->p_lock); + if (((sigfd_proc_state_t *)p->p_sigfd)->sigfd_pollwake_cb == + NULL) { + ((sigfd_proc_state_t *)p->p_sigfd)->sigfd_pollwake_cb = + signalfd_pollwake_cb; + } + signalfd_wake_list_add(state); + mutex_exit(&p->p_lock); + } + + return (0); +} + +_NOTE(ARGSUSED(4)) +static int +signalfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) +{ + signalfd_state_t *state; + minor_t minor = getminor(dev); + sigset_t mask; + + state = ddi_get_soft_state(signalfd_softstate, minor); + + switch (cmd) { + case SIGNALFDIOC_MASK: + if (ddi_copyin((caddr_t)arg, (caddr_t)&mask, sizeof (sigset_t), + md) != 0) + return (set_errno(EFAULT)); + + mutex_enter(&state->sfd_lock); + sigutok(&mask, &state->sfd_set); + mutex_exit(&state->sfd_lock); + + return (0); + + default: + break; + } + + return (ENOTTY); +} + +_NOTE(ARGSUSED(1)) +static int +signalfd_close(dev_t dev, int flag, int otyp, cred_t *cred_p) +{ + signalfd_state_t *state, **sp; + minor_t minor = getminor(dev); + proc_t *p = curproc; + + state = ddi_get_soft_state(signalfd_softstate, minor); + + if (state->sfd_pollhd.ph_list != NULL) { + pollwakeup(&state->sfd_pollhd, POLLERR); + pollhead_clean(&state->sfd_pollhd); + } + + /* Make sure our state is removed from our proc's pollwake list. */ + mutex_enter(&p->p_lock); + signalfd_wake_list_rm(p, state); + mutex_exit(&p->p_lock); + + mutex_enter(&signalfd_lock); + + /* Remove our state from our global list. 
*/ + for (sp = &signalfd_state; *sp != state; sp = &((*sp)->sfd_next)) + VERIFY(*sp != NULL); + + *sp = (*sp)->sfd_next; + + ddi_soft_state_free(signalfd_softstate, minor); + id_free(signalfd_minor, minor); + + mutex_exit(&signalfd_lock); + + return (0); +} + +static int +signalfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) +{ + if (cmd != DDI_ATTACH || signalfd_devi != NULL) + return (DDI_FAILURE); + + mutex_enter(&signalfd_lock); + + signalfd_minor = id_space_create("signalfd_minor", 1, L_MAXMIN32 + 1); + if (signalfd_minor == NULL) { + cmn_err(CE_WARN, "signalfd couldn't create id space"); + mutex_exit(&signalfd_lock); + return (DDI_FAILURE); + } + + if (ddi_soft_state_init(&signalfd_softstate, + sizeof (signalfd_state_t), 0) != 0) { + cmn_err(CE_WARN, "signalfd failed to create soft state"); + id_space_destroy(signalfd_minor); + mutex_exit(&signalfd_lock); + return (DDI_FAILURE); + } + + if (ddi_create_minor_node(devi, "signalfd", S_IFCHR, + SIGNALFDMNRN_SIGNALFD, DDI_PSEUDO, NULL) == DDI_FAILURE) { + cmn_err(CE_NOTE, "/dev/signalfd couldn't create minor node"); + ddi_soft_state_fini(&signalfd_softstate); + id_space_destroy(signalfd_minor); + mutex_exit(&signalfd_lock); + return (DDI_FAILURE); + } + + ddi_report_dev(devi); + signalfd_devi = devi; + + sigfd_exit_helper = signalfd_exit_helper; + + mutex_exit(&signalfd_lock); + + return (DDI_SUCCESS); +} + +_NOTE(ARGSUSED(0)) +static int +signalfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + switch (cmd) { + case DDI_DETACH: + break; + + default: + return (DDI_FAILURE); + } + + /* list should be empty */ + VERIFY(signalfd_state == NULL); + + mutex_enter(&signalfd_lock); + id_space_destroy(signalfd_minor); + + ddi_remove_minor_node(signalfd_devi, NULL); + signalfd_devi = NULL; + sigfd_exit_helper = NULL; + + ddi_soft_state_fini(&signalfd_softstate); + mutex_exit(&signalfd_lock); + + return (DDI_SUCCESS); +} + +_NOTE(ARGSUSED(0)) +static int +signalfd_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, 
void **result) +{ + int error; + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)signalfd_devi; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + } + return (error); +} + +static struct cb_ops signalfd_cb_ops = { + signalfd_open, /* open */ + signalfd_close, /* close */ + nulldev, /* strategy */ + nulldev, /* print */ + nodev, /* dump */ + signalfd_read, /* read */ + nodev, /* write */ + signalfd_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + signalfd_poll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + 0, /* streamtab */ + D_NEW | D_MP /* Driver compatibility flag */ +}; + +static struct dev_ops signalfd_ops = { + DEVO_REV, /* devo_rev */ + 0, /* refcnt */ + signalfd_info, /* get_dev_info */ + nulldev, /* identify */ + nulldev, /* probe */ + signalfd_attach, /* attach */ + signalfd_detach, /* detach */ + nodev, /* reset */ + &signalfd_cb_ops, /* driver operations */ + NULL, /* bus operations */ + nodev, /* dev power */ + ddi_quiesce_not_needed, /* quiesce */ +}; + +static struct modldrv modldrv = { + &mod_driverops, /* module type (this is a pseudo driver) */ + "signalfd support", /* name of module */ + &signalfd_ops, /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&modldrv, + NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} diff --git a/usr/src/uts/common/io/signalfd.conf b/usr/src/uts/common/io/signalfd.conf new file mode 100644 index 0000000000..de44738a14 --- /dev/null +++ b/usr/src/uts/common/io/signalfd.conf @@ -0,0 +1,16 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. 
+# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. +# + +name="signalfd" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c index c5d54b5978..f0c0983a3a 100644 --- a/usr/src/uts/common/os/exit.c +++ b/usr/src/uts/common/os/exit.c @@ -455,6 +455,14 @@ proc_exit(int why, int what) (*dtrace_helpers_cleanup)(); } + /* + * Clean up any signalfd state for the process. + */ + if (p->p_sigfd != NULL) { + VERIFY(sigfd_exit_helper != NULL); + (*sigfd_exit_helper)(); + } + /* untimeout the realtime timers */ if (p->p_itimer != NULL) timer_exit(); diff --git a/usr/src/uts/common/os/sig.c b/usr/src/uts/common/os/sig.c index 0b79c3765a..453b1f22d4 100644 --- a/usr/src/uts/common/os/sig.c +++ b/usr/src/uts/common/os/sig.c @@ -60,6 +60,7 @@ #include #include #include +#include const k_sigset_t nullsmask = {0, 0, 0}; @@ -94,6 +95,12 @@ const k_sigset_t holdvfork = static int isjobstop(int); static void post_sigcld(proc_t *, sigqueue_t *); + +/* + * signalfd helper function which is set when the signalfd driver loads. + */ +void (*sigfd_exit_helper)(); + /* * Internal variables for counting number of user thread stop requests posted. 
* They may not be accurate at some special situation such as that a virtually @@ -307,6 +314,11 @@ sigtoproc(proc_t *p, kthread_t *t, int sig) (void) eat_signal(t, sig); thread_unlock(t); DTRACE_PROC2(signal__send, kthread_t *, t, int, sig); + if (p->p_sigfd != NULL && ((sigfd_proc_state_t *) + (p->p_sigfd))->sigfd_pollwake_cb != NULL) + (*((sigfd_proc_state_t *)(p->p_sigfd))-> + sigfd_pollwake_cb)(p, sig); + } else if ((tt = p->p_tlist) != NULL) { /* * Make sure that some lwp that already exists @@ -345,6 +357,10 @@ sigtoproc(proc_t *p, kthread_t *t, int sig) } DTRACE_PROC2(signal__send, kthread_t *, tt, int, sig); + if (p->p_sigfd != NULL && ((sigfd_proc_state_t *) + (p->p_sigfd))->sigfd_pollwake_cb != NULL) + (*((sigfd_proc_state_t *)(p->p_sigfd))-> + sigfd_pollwake_cb)(p, sig); } } diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index ed02832500..94c09029b5 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -509,6 +509,7 @@ CHKHDRS= \ sid.h \ siginfo.h \ signal.h \ + signalfd.h \ skein.h \ sleepq.h \ smbios.h \ diff --git a/usr/src/uts/common/sys/proc.h b/usr/src/uts/common/sys/proc.h index f1a2fc5485..5abf8fd3cd 100644 --- a/usr/src/uts/common/sys/proc.h +++ b/usr/src/uts/common/sys/proc.h @@ -47,6 +47,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -197,6 +198,7 @@ typedef struct proc { k_sigset_t p_extsig; /* signals sent from another contract */ k_sigset_t p_ignore; /* ignore when generated */ k_sigset_t p_siginfo; /* gets signal info with signal */ + void *p_sigfd; /* signalfd support state */ struct sigqueue *p_sigqueue; /* queued siginfo structures */ struct sigqhdr *p_sigqhdr; /* hdr to sigqueue structure pool */ struct sigqhdr *p_signhdr; /* hdr to signotify structure pool */ diff --git a/usr/src/uts/common/sys/signalfd.h b/usr/src/uts/common/sys/signalfd.h new file mode 100644 index 0000000000..2661d5a05f --- /dev/null +++ b/usr/src/uts/common/sys/signalfd.h 
@@ -0,0 +1,100 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +/* + * Header file to support the signalfd facility. Note that this facility + * is designed to be binary compatible with the Linux signalfd facility, modulo + * the signals themselves; values for constants here should therefore exactly + * match those found in Linux, and this facility shouldn't be extended + * independently of Linux. + */ + +#ifndef _SYS_SIGNALFD_H +#define _SYS_SIGNALFD_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * To assure binary compatibility with Linux, these values are fixed at their + * Linux equivalents, not their native ones. + */ +#define SFD_CLOEXEC 02000000 /* LX_O_CLOEXEC */ +#define SFD_NONBLOCK 04000 /* LX_O_NONBLOCK */ + +/* + * These ioctl values are specific to the native implementation; applications + * shouldn't be using them directly, and they should therefore be safe to + * change without breaking apps. 
+ */ +#define SIGNALFDIOC (('s' << 24) | ('f' << 16) | ('d' << 8)) +#define SIGNALFDIOC_MASK (SIGNALFDIOC | 1) /* set mask */ + +typedef struct signalfd_siginfo { + uint32_t ssi_signo; /* signal from signal.h */ + int32_t ssi_errno; /* error from errno.h */ + int32_t ssi_code; /* signal code */ + uint32_t ssi_pid; /* PID of sender */ + uint32_t ssi_uid; /* real UID of sender */ + int32_t ssi_fd; /* File descriptor (SIGIO) */ + uint32_t ssi_tid; /* unused */ + uint32_t ssi_band; /* band event (SIGIO) */ + uint32_t ssi_overrun; /* unused */ + uint32_t ssi_trapno; /* trap number that caused signal */ + int32_t ssi_status; /* exit status or signal (SIGCHLD) */ + int32_t ssi_int; /* unused */ + uint64_t ssi_ptr; /* unused */ + uint64_t ssi_utime; /* user CPU time consumed (SIGCHLD) */ + uint64_t ssi_stime; /* system CPU time consumed (SIGCHLD) */ + uint64_t ssi_addr; /* address that generated signal */ + uint8_t ssi_pad[48]; /* Pad size to 128 bytes to allow for */ + /* additional fields in the future. */ +} signalfd_siginfo_t; + +#ifndef _KERNEL + +extern int signalfd(int, const sigset_t *, int); + +#else + +#define SIGNALFDMNRN_SIGNALFD 0 +#define SIGNALFDMNRN_CLONE 1 + +typedef struct sigfd_wake_list { + list_node_t sigfd_wl_lst; + void *sigfd_wl_state; +} sigfd_wake_list_t; + +/* + * This holds the proc_t state for a process which is using signalfd. 
+ */ +typedef struct sigfd_proc_state { + void (*sigfd_pollwake_cb)(void *, int); + list_t sigfd_list; +} sigfd_proc_state_t; + + +extern void (*sigfd_exit_helper)(); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SIGNALFD_H */ diff --git a/usr/src/uts/common/sys/thread.h b/usr/src/uts/common/sys/thread.h index 188230d61e..fd6a60c65e 100644 --- a/usr/src/uts/common/sys/thread.h +++ b/usr/src/uts/common/sys/thread.h @@ -164,7 +164,7 @@ typedef struct _kthread { k_sigset_t t_sig; /* signals pending to this process */ k_sigset_t t_extsig; /* signals sent from another contract */ k_sigset_t t_hold; /* hold signal bit mask */ - k_sigset_t t_sigwait; /* sigtimedwait() is accepting these */ + k_sigset_t t_sigwait; /* sigtimedwait/sigfd accepting these */ struct _kthread *t_forw; /* process's forward thread link */ struct _kthread *t_back; /* process's backward thread link */ struct _kthread *t_thlink; /* tid (lwpid) lookup hash link */ diff --git a/usr/src/uts/intel/Makefile.intel b/usr/src/uts/intel/Makefile.intel index d055d0a8d1..79aa34879f 100644 --- a/usr/src/uts/intel/Makefile.intel +++ b/usr/src/uts/intel/Makefile.intel @@ -324,6 +324,7 @@ DRV_KMODS += sd DRV_KMODS += sdhost DRV_KMODS += sgen DRV_KMODS += si3124 +DRV_KMODS += signalfd DRV_KMODS += smbios DRV_KMODS += skd DRV_KMODS += softmac diff --git a/usr/src/uts/intel/signalfd/Makefile b/usr/src/uts/intel/signalfd/Makefile new file mode 100644 index 0000000000..d1a461c2f1 --- /dev/null +++ b/usr/src/uts/intel/signalfd/Makefile @@ -0,0 +1,68 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. 
+# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = signalfd +OBJECTS = $(SIGNALFD_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(SIGNALFD_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/common/io + +# +# Include common rules. +# +include $(UTSBASE)/intel/Makefile.intel + +CERRWARN += -_gcc=-Wno-parentheses + +# +# Define targets +# +ALL_TARGET = $(BINARY) $(SRC_CONFILE) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/intel/Makefile.targ diff --git a/usr/src/uts/sparc/Makefile.sparc b/usr/src/uts/sparc/Makefile.sparc index 41a57721e8..e677f5363f 100644 --- a/usr/src/uts/sparc/Makefile.sparc +++ b/usr/src/uts/sparc/Makefile.sparc @@ -239,6 +239,7 @@ DRV_KMODS += bridge trill DRV_KMODS += bpf DRV_KMODS += dca DRV_KMODS += eventfd +DRV_KMODS += signalfd # # Hardware Drivers in common space diff --git a/usr/src/uts/sparc/signalfd/Makefile b/usr/src/uts/sparc/signalfd/Makefile new file mode 100644 index 0000000000..a60bc617e1 --- /dev/null +++ b/usr/src/uts/sparc/signalfd/Makefile @@ -0,0 +1,68 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. 
+# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = signalfd +OBJECTS = $(SIGNALFD_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(SIGNALFD_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/common/io + +# +# Include common rules. +# +include $(UTSBASE)/sparc/Makefile.sparc + +CERRWARN += -_gcc=-Wno-parentheses + +# +# Define targets +# +ALL_TARGET = $(BINARY) $(SRC_CONFILE) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/sparc/Makefile.targ -- cgit v1.2.3 From 68ecb2ec930c4b0f00acaf8e0abb2b19c4b8b76f Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Sun, 15 Nov 2015 13:06:56 -0800 Subject: 6393 zfs receive a full send as a clone Reviewed by: Matthew Ahrens Reviewed by: Prakash Surya Reviewed by: Richard Elling Approved by: Dan McDonald --- usr/src/man/man1m/zfs.1m | 9 +- usr/src/pkg/manifests/system-test-zfstest.mf | 5 +- usr/src/test/zfs-tests/runfiles/delphix.run | 3 +- .../tests/functional/cli_root/zfs_receive/Makefile | 5 +- .../cli_root/zfs_receive/zfs_receive_010_pos.ksh | 172 +++++++++++++++++++++ usr/src/uts/common/fs/zfs/dmu_send.c | 158 ++++++++++++------- usr/src/uts/common/fs/zfs/sys/dmu_impl.h | 3 +- usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h | 12 +- 8 files changed, 304 insertions(+), 63 deletions(-) create mode 100644 usr/src/test/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_010_pos.ksh diff --git a/usr/src/man/man1m/zfs.1m b/usr/src/man/man1m/zfs.1m index 1fdadc346a..63cb918480 100644 --- a/usr/src/man/man1m/zfs.1m +++ b/usr/src/man/man1m/zfs.1m @@ -21,7 
+21,7 @@ .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow -.\" Copyright (c) 2011, 2014 by Delphix. All rights reserved. +.\" Copyright (c) 2011, 2015 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. @@ -2775,8 +2775,11 @@ Do not actually receive the stream. This can be useful in conjunction with the option to verify the name the receive operation would use. .It Fl o Sy origin Ns = Ns Ar snapshot Forces the stream to be received as a clone of the given snapshot. -This is only valid if the stream is an incremental stream whose source -is the same as the provided origin. +If the stream is a full send stream, this will create the filesystem +described by the stream as a clone of the specified snapshot. Which +snapshot was specified will not affect the success or failure of the +receive, as long as the snapshot does exist. If the stream is an +incremental send stream, all the normal verification will be performed. .It Fl u File system that is associated with the received stream is not mounted. .It Fl v diff --git a/usr/src/pkg/manifests/system-test-zfstest.mf b/usr/src/pkg/manifests/system-test-zfstest.mf index ffea12e25b..70e5ff602e 100644 --- a/usr/src/pkg/manifests/system-test-zfstest.mf +++ b/usr/src/pkg/manifests/system-test-zfstest.mf @@ -10,7 +10,7 @@ # # -# Copyright (c) 2012, 2014 by Delphix. All rights reserved. +# Copyright (c) 2012, 2015 by Delphix. All rights reserved. # Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. # Copyright 2015, Nexenta Systems Inc. All rights reserved. 
# @@ -662,6 +662,9 @@ file \ file \ path=opt/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_009_neg \ mode=0555 +file \ + path=opt/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_010_pos \ + mode=0555 file path=opt/zfs-tests/tests/functional/cli_root/zfs_rename/cleanup mode=0555 file path=opt/zfs-tests/tests/functional/cli_root/zfs_rename/setup mode=0555 file path=opt/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename.cfg \ diff --git a/usr/src/test/zfs-tests/runfiles/delphix.run b/usr/src/test/zfs-tests/runfiles/delphix.run index 0bac84c12e..266247bd4a 100644 --- a/usr/src/test/zfs-tests/runfiles/delphix.run +++ b/usr/src/test/zfs-tests/runfiles/delphix.run @@ -139,7 +139,8 @@ tests = ['zfs_written_property_001_pos'] [/opt/zfs-tests/tests/functional/cli_root/zfs_receive] tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos', 'zfs_receive_005_neg', 'zfs_receive_006_pos', - 'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg'] + 'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg', + 'zfs_receive_010_pos'] [/opt/zfs-tests/tests/functional/cli_root/zfs_rename] tests = ['zfs_rename_001_pos', 'zfs_rename_002_pos', 'zfs_rename_003_pos', diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile b/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile index 3e9f28238b..f203bfc344 100644 --- a/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile @@ -10,7 +10,7 @@ # # -# Copyright (c) 2012 by Delphix. All rights reserved. +# Copyright (c) 2012, 2015 by Delphix. All rights reserved. 
# include $(SRC)/Makefile.master @@ -28,7 +28,8 @@ PROGS = cleanup \ zfs_receive_006_pos \ zfs_receive_007_neg \ zfs_receive_008_pos \ - zfs_receive_009_neg + zfs_receive_009_neg \ + zfs_receive_010_pos CMDS = $(PROGS:%=$(TESTDIR)/%) $(CMDS) := FILEMODE = 0555 diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_010_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_010_pos.ksh new file mode 100644 index 0000000000..f9c1ec4a5f --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_010_pos.ksh @@ -0,0 +1,172 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Test that receiving a full send as a clone works correctly. +# +# STRATEGY: +# 1. Create pool and filesystems. +# 2. Send filesystem, receive as clone of itself. +# 3. Verify that nop-write saves space. +# 4. Send filesystem, receive as clone of other filesystem. +# 5. Verify that contents are correct. +# 6. Repeat steps 4 and 5 with filesystems swapped. 
+# + +verify_runnable "both" + +fs=$TESTPOOL/$TESTFS/base/fs +fs2=$TESTPOOL/$TESTFS/base/fs2 +rfs=$TESTPOOL/$TESTFS/base/rfs + +function make_object +{ + local objnum=$1 + local mntpnt=$2 + local type=$3 + if [[ $type == "file" ]]; then + $DD if=/dev/urandom of=${mntpnt}/f$objnum bs=512 count=16 + elif [[ $type == "hole1" ]]; then + $DD if=/dev/urandom of=${mntpnt}/fh$objnum bs=512 count=5 stride=4 + elif [[ $type == "hole2" ]]; then + $DD if=/dev/urandom of=${mntpnt}/fh$objnum bs=512 count=4 stride=5 + elif [[ $type == "directory" ]]; then + $MKDIR ${mntpnt}/d$objnum + elif [[ $type == "missing" ]]; then + $TOUCH ${mntpnt}/h$objnum + fi +} + +function create_pair +{ + local objnum=$1 + local mntpnt1=$2 + local mntpnt2=$3 + local type1=$4 + local type2=$5 + make_object $objnum $mntpnt1 $type1 + make_object $objnum $mntpnt2 $type2 +} + +function cleanup +{ + $ZFS destroy -Rf $TESTPOOL/$TESTFS/base + rm /tmp/zr010p* +} + +log_assert "zfs receive of full send as clone should work" +log_onexit cleanup +log_must $ZFS create -o checksum=sha256 -o compression=gzip -o recordsize=512 \ + $TESTPOOL/$TESTFS/base + +log_must $ZFS create $fs +log_must $ZFS create $fs2 +mntpnt=$(get_prop mountpoint $fs) +mntpnt2=$(get_prop mountpoint $fs2) + +# +# Now, we create the two filesystems. By creating objects with +# different types and the same object number in each filesystem, we +# create a situation where, when you receive the full send of each as +# a clone of the other, we will test to ensure that the code correctly +# handles receiving all object types onto all other object types. +# + +# Receive a file onto a file (and vice versa). +create_pair 8 $mntpnt $mntpnt2 "file" "file" + +# Receive a file onto a file with holes (and vice versa). +create_pair 9 $mntpnt $mntpnt2 "file" "hole1" + +# Receive a file onto a directory (and vice versa). +create_pair 10 $mntpnt $mntpnt2 "file" "directory" + +# Receive a file onto a missing object (and vice versa). 
+create_pair 11 $mntpnt $mntpnt2 "file" "missing" + +# Receive a file with holes onto a file with holes (and vice versa). +create_pair 12 $mntpnt $mntpnt2 "hole1" "hole2" + +# Receive a file with holes onto a directory (and vice versa). +create_pair 13 $mntpnt $mntpnt2 "hole1" "directory" + +# Receive a file with holes onto a missing object (and vice versa). +create_pair 14 $mntpnt $mntpnt2 "hole1" "missing" + +# Receive a directory onto a directory (and vice versa). +create_pair 15 $mntpnt $mntpnt2 "directory" "directory" + +# Receive a directory onto a missing object (and vice versa). +create_pair 16 $mntpnt $mntpnt2 "directory" "missing" + +# Receive a missing object onto a missing object (and vice versa). +create_pair 17 $mntpnt $mntpnt2 "missing" "missing" + +# Receive a file with a different record size onto a file (and vice versa). +log_must $ZFS set recordsize=128k $fs +$DD if=/dev/urandom of=$mntpnt/f18 bs=128k count=64 +$TOUCH $mntpnt2/f18 + +# Remove objects that are intended to be missing. +$RM $mntpnt/h17 +$RM $mntpnt2/h* + +log_must $ZFS snapshot $fs@s1 +log_must $ZFS snapshot $fs2@s1 + +log_must $ZFS send $fs@s1 > /tmp/zr010p +log_must $ZFS send $fs2@s1 > /tmp/zr010p2 + + +# +# Test that, when we receive a full send as a clone of itself, +# nop-write saves us all the space used by data blocks. +# +cat /tmp/zr010p | log_must $ZFS receive -o origin=$fs@s1 $rfs +size=$(get_prop used $rfs) +size2=$(get_prop used $fs) +if [[ $size -ge $(($size2 / 10)) ]] then + log_fail "nop-write failure; expected usage less than "\ + "$(($size2 / 10)), but is using $size" +fi +log_must $ZFS destroy -fr $rfs + +# Correctness testing: receive each full send as a clone of the other filesystem. 
+cat /tmp/zr010p | log_must $ZFS receive -o origin=$fs2@s1 $rfs +mntpnt_old=$(get_prop mountpoint $fs) +mntpnt_new=$(get_prop mountpoint $rfs) +log_must $DIFF -r $mntpnt_old $mntpnt_new +log_must $ZFS destroy -r $rfs + +cat /tmp/zr010p2 | log_must $ZFS receive -o origin=$fs@s1 $rfs +mntpnt_old=$(get_prop mountpoint $fs2) +mntpnt_new=$(get_prop mountpoint $rfs) +log_must $DIFF -r $mntpnt_old $mntpnt_new + +log_pass "zfs receive of full send as clone works" diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c index dea691d603..8358f2a0dc 100644 --- a/usr/src/uts/common/fs/zfs/dmu_send.c +++ b/usr/src/uts/common/fs/zfs/dmu_send.c @@ -137,6 +137,14 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len) return (0); } +/* + * Fill in the drr_free struct, or perform aggregation if the previous record is + * also a free record, and the two are adjacent. + * + * Note that we send free records even for a full send, because we want to be + * able to receive a full send as a clone, which requires a list of all the free + * and freeobject records that were generated on the source. + */ static int dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, uint64_t length) @@ -160,15 +168,6 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, (object == dsp->dsa_last_data_object && offset > dsp->dsa_last_data_offset)); - /* - * If we are doing a non-incremental send, then there can't - * be any data in the dataset we're receiving into. Therefore - * a free record would simply be a no-op. Save space by not - * sending it to begin with. - */ - if (!dsp->dsa_incremental) - return (0); - if (length != -1ULL && offset + length < offset) length = -1ULL; @@ -347,10 +346,6 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) { struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); - /* See comment in dump_free(). 
*/ - if (!dsp->dsa_incremental) - return (0); - /* * If there is a pending op, but it's not PENDING_FREEOBJECTS, * push it out, since free block aggregation can only be done for @@ -750,6 +745,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid; if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; + drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS; if (ancestor_zb != NULL) { drr->drr_u.drr_begin.drr_fromguid = @@ -772,7 +768,6 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, dsp->dsa_off = off; dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid; dsp->dsa_pending_op = PENDING_NONE; - dsp->dsa_incremental = (ancestor_zb != NULL); dsp->dsa_featureflags = featureflags; dsp->dsa_resume_object = resumeobj; dsp->dsa_resume_offset = resumeoff; @@ -1286,7 +1281,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) /* target fs already exists; recv into temp clone */ /* Can't recv a clone into an existing fs */ - if (flags & DRR_FLAG_CLONE) { + if (flags & DRR_FLAG_CLONE || drba->drba_origin) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } @@ -1305,6 +1300,15 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) drba->drba_origin)) return (SET_ERROR(ENOENT)); + /* + * If we're receiving a full send as a clone, and it doesn't + * contain all the necessary free records and freeobject + * records, reject it. 
+ */ + if (fromguid == 0 && drba->drba_origin && + !(flags & DRR_FLAG_FREERECORDS)) + return (SET_ERROR(EINVAL)); + /* Open the parent of tofs */ ASSERT3U(strlen(tofs), <, MAXNAMELEN); (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); @@ -1344,7 +1348,8 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } - if (dsl_dataset_phys(origin)->ds_guid != fromguid) { + if (dsl_dataset_phys(origin)->ds_guid != fromguid && + fromguid != 0) { dsl_dataset_rele(origin, FTAG); dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENODEV)); @@ -1674,6 +1679,20 @@ struct receive_writer_arg { uint64_t bytes_read; /* bytes read when current record created */ }; +struct objlist { + list_t list; /* List of struct receive_objnode. */ + /* + * Last object looked up. Used to assert that objects are being looked + * up in ascending order. + */ + uint64_t last_lookup; +}; + +struct receive_objnode { + list_node_t node; + uint64_t object; +}; + struct receive_arg { objset_t *os; vnode_t *vp; /* The vnode to read the stream from */ @@ -1691,12 +1710,7 @@ struct receive_arg { int err; boolean_t byteswap; /* Sorted list of objects not to issue prefetches for. 
*/ - list_t ignore_obj_list; -}; - -struct receive_ign_obj_node { - list_node_t node; - uint64_t object; + struct objlist ignore_objlist; }; typedef struct guid_map_entry { @@ -2008,13 +2022,14 @@ receive_freeobjects(struct receive_writer_arg *rwa, struct drr_freeobjects *drrfo) { uint64_t obj; + int next_err = 0; if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) return (SET_ERROR(EINVAL)); for (obj = drrfo->drr_firstobj; - obj < drrfo->drr_firstobj + drrfo->drr_numobjs; - (void) dmu_object_next(rwa->os, &obj, FALSE, 0)) { + obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0; + next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) { int err; if (dmu_object_info(rwa->os, obj, NULL) != 0) @@ -2024,7 +2039,8 @@ receive_freeobjects(struct receive_writer_arg *rwa, if (err != 0) return (err); } - + if (next_err != ESRCH) + return (next_err); return (0); } @@ -2354,6 +2370,66 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf) return (0); } +static void +objlist_create(struct objlist *list) +{ + list_create(&list->list, sizeof (struct receive_objnode), + offsetof(struct receive_objnode, node)); + list->last_lookup = 0; +} + +static void +objlist_destroy(struct objlist *list) +{ + for (struct receive_objnode *n = list_remove_head(&list->list); + n != NULL; n = list_remove_head(&list->list)) { + kmem_free(n, sizeof (*n)); + } + list_destroy(&list->list); +} + +/* + * This function looks through the objlist to see if the specified object number + * is contained in the objlist. In the process, it will remove all object + * numbers in the list that are smaller than the specified object number. Thus, + * any lookup of an object number smaller than a previously looked up object + * number will always return false; therefore, all lookups should be done in + * ascending order. 
+ */ +static boolean_t +objlist_exists(struct objlist *list, uint64_t object) +{ + struct receive_objnode *node = list_head(&list->list); + /* check the ascending-lookup requirement, then record this lookup */ ASSERT3U(object, >=, list->last_lookup); + list->last_lookup = object; + /* drop and free head entries whose object number is below the requested one */ while (node != NULL && node->object < object) { + VERIFY3P(node, ==, list_remove_head(&list->list)); + kmem_free(node, sizeof (*node)); + node = list_head(&list->list); + } + /* the head, if any, now has an object number >= object; it matches only on equality */ return (node != NULL && node->object == object); +} + +/* + * The objlist is a list of object numbers stored in ascending order. However, + * the insertion of new object numbers does not seek out the correct location to + * store a new object number; instead, it appends it to the list for simplicity. + * Thus, any users must take care to only insert new object numbers in ascending + * order. + */ +static void +objlist_insert(struct objlist *list, uint64_t object) +{ + struct receive_objnode *node = kmem_zalloc(sizeof (*node), KM_SLEEP); + node->object = object; /* debug builds below assert that object exceeds the current tail, with 0 standing in for an empty list */ +#ifdef ZFS_DEBUG + struct receive_objnode *last_object = list_tail(&list->list); + uint64_t last_objnum = (last_object != NULL ? last_object->object : 0); + ASSERT3U(node->object, >, last_objnum); +#endif + list_insert_tail(&list->list, node); +} + /* * Issue the prefetch reads for any necessary indirect blocks.
* @@ -2376,13 +2452,7 @@ static void receive_read_prefetch(struct receive_arg *ra, uint64_t object, uint64_t offset, uint64_t length) { - struct receive_ign_obj_node *node = list_head(&ra->ignore_obj_list); - while (node != NULL && node->object < object) { - VERIFY3P(node, ==, list_remove_head(&ra->ignore_obj_list)); - kmem_free(node, sizeof (*node)); - node = list_head(&ra->ignore_obj_list); - } - if (node == NULL || node->object > object) { + if (!objlist_exists(&ra->ignore_objlist, object)) { dmu_prefetch(ra->os, object, 1, offset, length, ZIO_PRIORITY_SYNC_READ); } @@ -2415,18 +2485,7 @@ receive_read_record(struct receive_arg *ra) */ if (err == ENOENT || (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) { - struct receive_ign_obj_node *node = - kmem_zalloc(sizeof (*node), - KM_SLEEP); - node->object = drro->drr_object; -#ifdef ZFS_DEBUG - struct receive_ign_obj_node *last_object = - list_tail(&ra->ignore_obj_list); - uint64_t last_objnum = (last_object != NULL ? - last_object->object : 0); - ASSERT3U(node->object, >, last_objnum); -#endif - list_insert_tail(&ra->ignore_obj_list, node); + objlist_insert(&ra->ignore_objlist, drro->drr_object); err = 0; } return (err); @@ -2643,7 +2702,6 @@ resume_check(struct receive_arg *ra, nvlist_t *begin_nvl) return (0); } - /* * Read in the stream's records, one by one, and apply them to the pool. 
There * are two threads involved; the thread that calls this function will spin up a @@ -2677,8 +2735,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, sizeof (ra.bytes_read), 1, &ra.bytes_read); } - list_create(&ra.ignore_obj_list, sizeof (struct receive_ign_obj_node), - offsetof(struct receive_ign_obj_node, node)); + objlist_create(&ra.ignore_objlist); /* these were verified in dmu_recv_begin */ ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, @@ -2832,12 +2889,7 @@ out: } *voffp = ra.voff; - for (struct receive_ign_obj_node *n = - list_remove_head(&ra.ignore_obj_list); n != NULL; - n = list_remove_head(&ra.ignore_obj_list)) { - kmem_free(n, sizeof (*n)); - } - list_destroy(&ra.ignore_obj_list); + objlist_destroy(&ra.ignore_objlist); return (err); } diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h index 00be9dc725..8f3b27ff3f 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h @@ -24,7 +24,7 @@ */ /* * Copyright (c) 2012, Joyent, Inc. All rights reserved. - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. */ #ifndef _SYS_DMU_IMPL_H @@ -293,7 +293,6 @@ typedef struct dmu_sendarg { uint64_t dsa_toguid; int dsa_err; dmu_pendop_t dsa_pending_op; - boolean_t dsa_incremental; uint64_t dsa_featureflags; uint64_t dsa_last_data_object; uint64_t dsa_last_data_offset; diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h index 47799ff657..8fc49c7fd4 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 
*/ #ifndef _SYS_ZFS_IOCTL_H @@ -126,6 +126,16 @@ typedef enum dmu_send_resume_token_version { #define DRR_FLAG_CLONE (1<<0) #define DRR_FLAG_CI_DATA (1<<1) +/* + * This send stream, if it is a full send, includes the FREE and FREEOBJECT + * records that are created by the sending process. This means that the send + * stream can be received as a clone, even though it is not an incremental. + * This is not implemented as a feature flag, because the receiving side does + * not need to have implemented it to receive this stream; it is fully backwards + * compatible. We need a flag, though, because full send streams without it + * cannot necessarily be received as a clone correctly. + */ +#define DRR_FLAG_FREERECORDS (1<<2) /* * flags in the drr_checksumflags field in the DRR_WRITE and -- cgit v1.2.3