diff options
author | Jerry Jelinek <jerry.jelinek@joyent.com> | 2017-04-24 12:05:06 +0000 |
---|---|---|
committer | Jerry Jelinek <jerry.jelinek@joyent.com> | 2017-04-24 12:05:06 +0000 |
commit | fbe38b28735cd7e7715a61f2d1066dbbf1327b5c (patch) | |
tree | 82b5600384e40993ef24fee34a90e234f4520c7f | |
parent | 6f1d54d1657b9c469ae08d5ee733e114fe36c690 (diff) | |
parent | ff7af0d3beb1bddf8bb93afc2e9042dc3828be3d (diff) | |
download | illumos-joyent-fbe38b28735cd7e7715a61f2d1066dbbf1327b5c.tar.gz |
[illumos-gate merge]
commit ff7af0d3beb1bddf8bb93afc2e9042dc3828be3d
8007 want sys/stddef.h for offsetof and container_of macros
commit 63982b82e639bf9f496423925738dd3f86bda7aa
7976 libstand/dosfs: cache FAT32 in 128 Kb blocks to save loader memory
commit f289ce6eb03db0584699ec4fed88ef795a33dd79
7839 uts: implement boot environment support
commit 76608ff7a54afda798e7fdc98681fb6d37322109
7838 loader: pass loader environment to kernel as module
commit 14ee0d29c415966483c8c602b05bf27669c29497
7462 loader should support multiboot2 protocol
commit 1738dd6ec94e36a9828d13a6e52ac7fb68cb52ed
7461 illumos should support multiboot2 protocol
commit 660946868929e02041af7b5b1c3e14f547c53f11
8021 ARC buf data scatter-ization
commit df950592be5771afa05177cbbef90ff275f2526f
8088 Add support for LSI Intruder and Cutlass cards.
Conflicts:
usr/src/uts/common/fs/zfs/arc.c
98 files changed, 6173 insertions, 1518 deletions
diff --git a/usr/src/boot/Makefile.version b/usr/src/boot/Makefile.version index 6e85cbfb08..018e7e7816 100644 --- a/usr/src/boot/Makefile.version +++ b/usr/src/boot/Makefile.version @@ -33,4 +33,4 @@ LOADER_VERSION = 1.1 # Use date like formatting here, YYYY.MM.DD.XX, without leading zeroes. # The version is processed from left to right, the version number can only # be increased. -BOOT_VERSION = $(LOADER_VERSION)-2017.4.1.1 +BOOT_VERSION = $(LOADER_VERSION)-2017.4.22.1 diff --git a/usr/src/boot/lib/libstand/dosfs.c b/usr/src/boot/lib/libstand/dosfs.c index 6cf50b8ba2..617041566e 100644 --- a/usr/src/boot/lib/libstand/dosfs.c +++ b/usr/src/boot/lib/libstand/dosfs.c @@ -26,7 +26,6 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); /* * Readonly filesystem for Microsoft FAT12/FAT16/FAT32 filesystems, @@ -65,6 +64,7 @@ struct fs_ops dosfs_fsops = { #define DEPSEC 16 /* directory entries per sector */ #define DSHIFT 4 /* DEPSEC shift */ #define LOCLUS 2 /* lowest cluster number */ +#define FATBLKSZ 0x20000 /* size of block in the FAT cache buffer */ /* DOS "BIOS Parameter Block" */ typedef struct { @@ -132,18 +132,6 @@ static DOS_DE dot[2] = { ((u_int)cv2((de)->dex.h_clus) << 16) | \ cv2((de)->clus)) -/* - * fat cache metadata - */ -struct fatcache { - int unit; /* disk unit number */ - int size; /* buffer (and fat) size in sectors */ - u_char *buf; -}; - -static struct fatcache fat; - -static int dosunmount(DOS_FS *); static int parsebs(DOS_FS *, DOS_BS *); static int namede(DOS_FS *, const char *, DOS_DE **); static int lookup(DOS_FS *, u_int, const char *, DOS_DE **); @@ -153,36 +141,38 @@ static off_t fsize(DOS_FS *, DOS_DE *); static int fatcnt(DOS_FS *, u_int); static int fatget(DOS_FS *, u_int *); static int fatend(u_int, u_int); -static int ioread(DOS_FS *, u_int, void *, u_int); -static int ioget(struct open_file *, daddr_t, void *, u_int); +static int ioread(DOS_FS *, u_int, void *, size_t); +static int ioget(struct open_file *, daddr_t, void *, size_t); 
-static void -dos_read_fat(DOS_FS *fs, struct open_file *fd) +static int +dos_read_fatblk(DOS_FS *fs, struct open_file *fd, u_int blknum) { - struct devdesc *dd = fd->f_devdata; - - if (fat.buf != NULL) { /* can we reuse old buffer? */ - if (fat.size != fs->spf) { - free(fat.buf); /* no, free old buffer */ - fat.buf = NULL; + int err; + size_t io_size; + daddr_t offset_in_fat, max_offset_in_fat; + + offset_in_fat = ((daddr_t)blknum) * FATBLKSZ; + max_offset_in_fat = secbyt(fs->spf); + io_size = FATBLKSZ; + if (offset_in_fat > max_offset_in_fat) + offset_in_fat = max_offset_in_fat; + if (offset_in_fat + io_size > max_offset_in_fat) + io_size = ((size_t)(max_offset_in_fat - offset_in_fat)); + + if (io_size != 0) { + err = ioget(fd, fs->lsnfat + bytsec(offset_in_fat), + fs->fatbuf, io_size); + if (err != 0) { + fs->fatbuf_blknum = ((u_int)(-1)); + return (err); } } - if (fat.buf == NULL) - fat.buf = malloc(secbyt(fs->spf)); + if (io_size < FATBLKSZ) + memset(fs->fatbuf + io_size, 0, FATBLKSZ - io_size); - if (fat.buf != NULL) { - if (ioget(fd, fs->lsnfat, fat.buf, secbyt(fs->spf)) == 0) { - fat.size = fs->spf; - fat.unit = dd->d_unit; - return; - } - } - if (fat.buf != NULL) /* got IO error */ - free(fat.buf); - fat.buf = NULL; - fat.unit = -1; /* impossible unit */ - fat.size = 0; + fs->fatbuf_blknum = blknum; + return (0); } /* @@ -192,24 +182,27 @@ static int dos_mount(DOS_FS *fs, struct open_file *fd) { int err; - struct devdesc *dd = fd->f_devdata; u_char *buf; bzero(fs, sizeof(DOS_FS)); fs->fd = fd; - if ((err = !(buf = malloc(secbyt(1))) ? 
errno : 0) || - (err = ioget(fs->fd, 0, buf, secbyt(1))) || + if ((buf = malloc(secbyt(1))) == NULL) + return (errno); + if ((err = ioget(fs->fd, 0, buf, secbyt(1))) || (err = parsebs(fs, (DOS_BS *)buf))) { - if (buf != NULL) - free(buf); - (void)dosunmount(fs); + free(buf); return (err); } free(buf); - if (fat.buf == NULL || fat.unit != dd->d_unit) - dos_read_fat(fs, fd); + if ((fs->fatbuf = malloc(FATBLKSZ)) == NULL) + return (errno); + err = dos_read_fatblk(fs, fd, 0); + if (err != 0) { + free(fs->fatbuf); + return (err); + } fs->root = dot[0]; fs->root.name[0] = ' '; @@ -228,21 +221,9 @@ dos_mount(DOS_FS *fs, struct open_file *fd) static int dos_unmount(DOS_FS *fs) { - int err; - if (fs->links) return (EBUSY); - if ((err = dosunmount(fs))) - return (err); - return (0); -} - -/* - * Common code shared by dos_mount() and dos_unmount() - */ -static int -dosunmount(DOS_FS *fs) -{ + free(fs->fatbuf); free(fs); return (0); } @@ -257,16 +238,20 @@ dos_open(const char *path, struct open_file *fd) DOS_FILE *f; DOS_FS *fs; u_int size, clus; - int err = 0; + int err; /* Allocate mount structure, associate with open */ - fs = malloc(sizeof(DOS_FS)); - - if ((err = dos_mount(fs, fd))) - goto out; + if ((fs = malloc(sizeof(DOS_FS))) == NULL) + return (errno); + if ((err = dos_mount(fs, fd))) { + free(fs); + return (err); + } - if ((err = namede(fs, path, &de))) - goto out; + if ((err = namede(fs, path, &de))) { + dos_unmount(fs); + return (err); + } clus = stclus(fs->fatsz, de); size = cv4(de->size); @@ -274,18 +259,20 @@ dos_open(const char *path, struct open_file *fd) if ((!(de->attr & FA_DIR) && (!clus != !size)) || ((de->attr & FA_DIR) && size) || (clus && !okclus(fs, clus))) { - err = EINVAL; - goto out; + dos_unmount(fs); + return (EINVAL); + } + if ((f = malloc(sizeof(DOS_FILE))) == NULL) { + err = errno; + dos_unmount(fs); + return (err); } - f = malloc(sizeof(DOS_FILE)); bzero(f, sizeof(DOS_FILE)); f->fs = fs; fs->links++; f->de = *de; fd->f_fsdata = (void *)f; - - 
out: - return (err); + return (0); } /* @@ -761,34 +748,57 @@ fatcnt(DOS_FS *fs, u_int c) } /* - * Get next cluster in cluster chain. Use in core fat cache unless another - * device replaced it. + * Get next cluster in cluster chain. Use in core fat cache unless + * the number of current 128K block in FAT has changed. */ static int fatget(DOS_FS *fs, u_int *c) { - u_char buf[4]; - u_int x, offset, n, nbyte; - struct devdesc *dd = fs->fd->f_devdata; - int err = 0; + u_int val_in, val_out, offset, blknum, nbyte; + const u_char *p_entry; + int err; - if (fat.unit != dd->d_unit) { - /* fat cache was changed to another device, don't use it */ - err = ioread(fs, secbyt(fs->lsnfat) + fatoff(fs->fatsz, *c), buf, - fs->fatsz != 32 ? 2 : 4); - if (err) - return (err); - } else { - offset = fatoff(fs->fatsz, *c); - nbyte = fs->fatsz != 32 ? 2 : 4; + /* check input value to prevent overflow in fatoff() */ + val_in = *c; + if (val_in & 0xf0000000) + return (EINVAL); - if (offset + nbyte > secbyt(fat.size)) - return (EINVAL); - memcpy(buf, fat.buf + offset, nbyte); + /* ensure that current 128K FAT block is cached */ + offset = fatoff(fs->fatsz, val_in); + nbyte = fs->fatsz != 32 ? 2 : 4; + if (offset + nbyte > secbyt(fs->spf)) + return (EINVAL); + blknum = offset / FATBLKSZ; + offset %= FATBLKSZ; + if (offset + nbyte > FATBLKSZ) + return (EINVAL); + if (blknum != fs->fatbuf_blknum) { + err = dos_read_fatblk(fs, fs->fd, blknum); + if (err != 0) + return (err); } - - x = fs->fatsz != 32 ? cv2(buf) : cv4(buf); - *c = fs->fatsz == 12 ? *c & 1 ? 
x >> 4 : x & 0xfff : x; + p_entry = fs->fatbuf + offset; + + /* extract cluster number from FAT entry */ + switch (fs->fatsz) { + case 32: + val_out = cv4(p_entry); + val_out &= 0x0fffffff; + break; + case 16: + val_out = cv2(p_entry); + break; + case 12: + val_out = cv2(p_entry); + if (val_in & 1) + val_out >>= 4; + else + val_out &= 0xfff; + break; + default: + return (EINVAL); + } + *c = val_out; return (0); } @@ -805,7 +815,7 @@ fatend(u_int sz, u_int c) * Offset-based I/O primitive */ static int -ioread(DOS_FS *fs, u_int offset, void *buf, u_int nbyte) +ioread(DOS_FS *fs, u_int offset, void *buf, size_t nbyte) { char *s; u_int off, n; @@ -843,8 +853,16 @@ ioread(DOS_FS *fs, u_int offset, void *buf, u_int nbyte) * Sector-based I/O primitive */ static int -ioget(struct open_file *fd, daddr_t lsec, void *buf, u_int size) +ioget(struct open_file *fd, daddr_t lsec, void *buf, size_t size) { - return ((fd->f_dev->dv_strategy)(fd->f_devdata, F_READ, lsec, - size, buf, NULL)); + size_t rsize; + int rv; + + /* Make sure we get full read or error. */ + rsize = 0; + rv = (fd->f_dev->dv_strategy)(fd->f_devdata, F_READ, lsec, + size, buf, &rsize); + if ((rv == 0) && (size != rsize)) + rv = EIO; + return (rv); } diff --git a/usr/src/boot/lib/libstand/dosfs.h b/usr/src/boot/lib/libstand/dosfs.h index f2370ee502..0915c70930 100644 --- a/usr/src/boot/lib/libstand/dosfs.h +++ b/usr/src/boot/lib/libstand/dosfs.h @@ -24,7 +24,6 @@ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* - * $FreeBSD$ */ #ifndef DOSIO_H @@ -96,6 +95,8 @@ typedef union { typedef struct { struct open_file *fd; /* file descriptor */ + u_char *fatbuf; /* FAT cache buffer */ + u_int fatbuf_blknum; /* number of 128K block in FAT cache buffer */ u_int links; /* active links to structure */ u_int spc; /* sectors per cluster */ u_int bsize; /* cluster size in bytes */ diff --git a/usr/src/boot/sys/boot/common/bootstrap.h b/usr/src/boot/sys/boot/common/bootstrap.h index d228875f7f..010dda130e 100644 --- a/usr/src/boot/sys/boot/common/bootstrap.h +++ b/usr/src/boot/sys/boot/common/bootstrap.h @@ -22,8 +22,6 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. - * - * $FreeBSD$ */ #ifndef _BOOTSTRAP_H_ @@ -234,6 +232,8 @@ void file_discard(struct preloaded_file *fp); void file_addmetadata(struct preloaded_file *fp, int type, size_t size, void *p); int file_addmodule(struct preloaded_file *fp, char *modname, int version, struct kernel_module **newmp); +void build_environment_module(void); +vm_offset_t bi_copyenv(vm_offset_t); /* MI module loaders */ #ifdef __elfN @@ -306,7 +306,13 @@ struct arch_switch */ uint64_t (*arch_loadaddr)(u_int type, void *data, uint64_t addr); #define LOAD_ELF 1 /* data points to the ELF header. */ -#define LOAD_RAW 2 /* data points to the file name. */ +#define LOAD_RAW 2 /* data points to the module file name. */ +#define LOAD_KERN 3 /* data points to the kernel file name. */ +#define LOAD_MEM 4 /* data points to int for buffer size. */ + /* + * Interface to release the load address. + */ + void (*arch_free_loadaddr)(uint64_t addr, uint64_t pages); /* * Interface to inform MD code about a loaded (ELF) segment. 
This diff --git a/usr/src/boot/sys/boot/common/load_elf.c b/usr/src/boot/sys/boot/common/load_elf.c index 287bfac56a..b7fc4bea09 100644 --- a/usr/src/boot/sys/boot/common/load_elf.c +++ b/usr/src/boot/sys/boot/common/load_elf.c @@ -251,11 +251,15 @@ __elfN(loadfile_raw)(char *filename, u_int64_t dest, if (ef.kernel == 1 && multiboot == 0) setenv("kernelname", filename, 1); fp->f_name = strdup(filename); - if (multiboot == 0) - fp->f_type = strdup(ef.kernel ? - __elfN(kerneltype) : __elfN(moduletype)); - else - fp->f_type = strdup("elf multiboot kernel"); + if (multiboot == 0) { + fp->f_type = strdup(ef.kernel ? + __elfN(kerneltype) : __elfN(moduletype)); + } else { + if (multiboot == 1) + fp->f_type = strdup("elf multiboot kernel"); + else + fp->f_type = strdup("elf multiboot2 kernel"); + } #ifdef ELF_VERBOSE if (ef.kernel) diff --git a/usr/src/boot/sys/boot/common/module.c b/usr/src/boot/sys/boot/common/module.c index b091cf23b6..50afdbef7f 100644 --- a/usr/src/boot/sys/boot/common/module.c +++ b/usr/src/boot/sys/boot/common/module.c @@ -25,7 +25,6 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); /* * file/module function dispatcher, support, etc. @@ -403,6 +402,88 @@ file_load_dependencies(struct preloaded_file *base_file) } /* + * Calculate the size of the environment module. + * The environment is list of name=value C strings, ending with a '\0' byte. + */ +static size_t +env_get_size(void) +{ + size_t size = 0; + struct env_var *ep; + + /* Traverse the environment. */ + for (ep = environ; ep != NULL; ep = ep->ev_next) { + size += strlen(ep->ev_name); + size++; /* "=" */ + if (ep->ev_value != NULL) + size += strlen(ep->ev_value); + size++; /* nul byte */ + } + size++; /* nul byte */ + return (size); +} + +/* + * Create virtual module for environment variables. + * This module should be created as late as possible before executing + * the OS kernel, or we may miss some environment variable updates. 
+ */ +void +build_environment_module(void) +{ + struct preloaded_file *fp; + size_t size; + char *name = "environment"; + vm_offset_t laddr; + + /* We can't load first */ + if ((file_findfile(NULL, NULL)) == NULL) { + printf("Can not load environment module: %s\n", + "the kernel is not loaded"); + return; + } + + size = env_get_size(); + + fp = file_alloc(); + if (fp != NULL) { + fp->f_name = strdup(name); + fp->f_type = strdup(name); + } + + if (fp == NULL || fp->f_name == NULL || fp->f_type == NULL) { + printf("Can not load environment module: %s\n", + "out of memory"); + if (fp != NULL) + file_discard(fp); + return; + } + + + if (archsw.arch_loadaddr != NULL) + loadaddr = archsw.arch_loadaddr(LOAD_MEM, &size, loadaddr); + + if (loadaddr == 0) { + printf("Can not load environment module: %s\n", + "out of memory"); + file_discard(fp); + return; + } + + laddr = bi_copyenv(loadaddr); + + /* Looks OK so far; populate control structure */ + fp->f_loader = -1; + fp->f_addr = loadaddr; + fp->f_size = laddr - loadaddr; + + /* recognise space consumption */ + loadaddr = laddr; + + file_insert_tail(fp); +} + +/* * We've been asked to load (fname) as (type), so just suck it in, * no arguments or anything. 
*/ @@ -413,6 +494,7 @@ file_loadraw(const char *fname, char *type, int argc, char **argv, int insert) char *name; int fd, got; vm_offset_t laddr; + struct stat st; /* We can't load first */ if ((file_findfile(NULL, NULL)) == NULL) { @@ -434,12 +516,25 @@ file_loadraw(const char *fname, char *type, int argc, char **argv, int insert) free(name); return(NULL); } + if (fstat(fd, &st) < 0) { + close(fd); + snprintf(command_errbuf, sizeof (command_errbuf), + "stat error '%s': %s", name, strerror(errno)); + free(name); + return(NULL); + } if (archsw.arch_loadaddr != NULL) loadaddr = archsw.arch_loadaddr(LOAD_RAW, name, loadaddr); + if (loadaddr == 0) { + close(fd); + snprintf(command_errbuf, sizeof (command_errbuf), + "no memory to load %s", name); + free(name); + return(NULL); + } - laddr = roundup(loadaddr, PAGE_SIZE); - loadaddr = laddr; + laddr = loadaddr; for (;;) { /* read in 4k chunks; size is not really important */ got = archsw.arch_readin(fd, laddr, 4096); @@ -450,6 +545,9 @@ file_loadraw(const char *fname, char *type, int argc, char **argv, int insert) "error reading '%s': %s", name, strerror(errno)); free(name); close(fd); + if (archsw.arch_free_loadaddr != NULL) + archsw.arch_free_loadaddr(loadaddr, + (uint64_t)(roundup2(st.st_size, PAGE_SIZE) >> 12)); return(NULL); } laddr += got; @@ -893,6 +991,11 @@ file_discard(struct preloaded_file *fp) struct kernel_module *mp, *mp1; if (fp == NULL) return; + + if (archsw.arch_free_loadaddr != NULL && fp->f_addr) + archsw.arch_free_loadaddr(fp->f_addr, + (uint64_t)(roundup2(fp->f_size, PAGE_SIZE) >> 12)); + md = fp->f_metadata; while (md) { md1 = md; @@ -906,13 +1009,10 @@ file_discard(struct preloaded_file *fp) mp1 = mp; mp = mp->m_next; free(mp1); - } - if (fp->f_name != NULL) - free(fp->f_name); - if (fp->f_type != NULL) - free(fp->f_type); - if (fp->f_args != NULL) - free(fp->f_args); + } + free(fp->f_name); + free(fp->f_type); + free(fp->f_args); free(fp); } diff --git a/usr/src/boot/sys/boot/common/multiboot2.c 
b/usr/src/boot/sys/boot/common/multiboot2.c new file mode 100644 index 0000000000..adf1b7e2b4 --- /dev/null +++ b/usr/src/boot/sys/boot/common/multiboot2.c @@ -0,0 +1,894 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Toomas Soome <tsoome@me.com> + */ + +/* + * This module adds support for loading and booting illumos multiboot2 + * kernel. This code is only built to support the illumos kernel, it does + * not support xen. + */ +#include <sys/cdefs.h> + +#include <sys/param.h> +#include <sys/exec.h> +#include <sys/linker.h> +#include <sys/module.h> +#include <sys/stdint.h> +#include <sys/multiboot2.h> +#include <stand.h> +#include <stdbool.h> +#include "libzfs.h" + +#include "bootstrap.h" + +#include <machine/metadata.h> +#include <machine/pc/bios.h> + +#include "../i386/libi386/libi386.h" +#include "../i386/btx/lib/btxv86.h" +#include "pxe.h" + +extern BOOTPLAYER bootplayer; /* dhcp info */ +extern void multiboot_tramp(); + +#include "platform/acfreebsd.h" +#include "acconfig.h" +#define ACPI_SYSTEM_XFACE +#include "actypes.h" +#include "actbl.h" + +extern ACPI_TABLE_RSDP *rsdp; + +/* MB data heap pointer. 
*/ +static vm_offset_t last_addr; +extern char bootprog_info[]; + +extern int elf32_loadfile_raw(char *filename, u_int64_t dest, + struct preloaded_file **result, int multiboot); +static int multiboot2_loadfile(char *, u_int64_t, struct preloaded_file **); +static int multiboot2_exec(struct preloaded_file *); + +struct file_format multiboot2 = { multiboot2_loadfile, multiboot2_exec }; +static bool keep_bs = false; +static bool have_framebuffer = false; +static vm_offset_t load_addr; +static vm_offset_t entry_addr; + +/* + * Validate tags in info request. This function is provided just to + * recognize the current tag list and only serves as a limited + * safe guard against possibly corrupt information. + */ +static bool +is_info_request_valid(multiboot_header_tag_information_request_t *rtag) +{ + int i; + + /* + * If the tag is optional and we do not support it, we do not + * have to do anything special, so we skip optional tags. + */ + if (rtag->mbh_flags & MULTIBOOT_HEADER_TAG_OPTIONAL) + return (true); + + for (i = 0; i < (rtag->mbh_size - sizeof (*rtag)) / + sizeof (rtag->mbh_requests[0]); i++) + switch (rtag->mbh_requests[i]) { + case MULTIBOOT_TAG_TYPE_END: + case MULTIBOOT_TAG_TYPE_CMDLINE: + case MULTIBOOT_TAG_TYPE_BOOT_LOADER_NAME: + case MULTIBOOT_TAG_TYPE_MODULE: + case MULTIBOOT_TAG_TYPE_BASIC_MEMINFO: + case MULTIBOOT_TAG_TYPE_BOOTDEV: + case MULTIBOOT_TAG_TYPE_MMAP: + case MULTIBOOT_TAG_TYPE_FRAMEBUFFER: + case MULTIBOOT_TAG_TYPE_VBE: + case MULTIBOOT_TAG_TYPE_ELF_SECTIONS: + case MULTIBOOT_TAG_TYPE_APM: + case MULTIBOOT_TAG_TYPE_EFI32: + case MULTIBOOT_TAG_TYPE_EFI64: + case MULTIBOOT_TAG_TYPE_ACPI_OLD: + case MULTIBOOT_TAG_TYPE_ACPI_NEW: + case MULTIBOOT_TAG_TYPE_NETWORK: + case MULTIBOOT_TAG_TYPE_EFI_MMAP: + case MULTIBOOT_TAG_TYPE_EFI_BS: + case MULTIBOOT_TAG_TYPE_EFI32_IH: + case MULTIBOOT_TAG_TYPE_EFI64_IH: + case MULTIBOOT_TAG_TYPE_LOAD_BASE_ADDR: + break; + default: + printf("unsupported information tag: 0x%x\n", + rtag->mbh_requests[i]); + 
return (false); + } + return (true); +} + +static int +multiboot2_loadfile(char *filename, u_int64_t dest, + struct preloaded_file **result) +{ + int fd, error; + uint32_t i; + struct stat st; + caddr_t header_search; + multiboot2_header_t *header; + multiboot_header_tag_t *tag; + multiboot_header_tag_address_t *addr_tag = NULL; + multiboot_header_tag_entry_address_t *entry_tag = NULL; + struct preloaded_file *fp; + + /* This allows to check other file formats from file_formats array. */ + error = EFTYPE; + if (filename == NULL) + return (error); + + /* is kernel already loaded? */ + fp = file_findfile(NULL, NULL); + if (fp != NULL) + return (error); + + if ((fd = open(filename, O_RDONLY)) == -1) + return (errno); + + /* + * Read MULTIBOOT_SEARCH size in order to search for the + * multiboot magic header. + */ + header_search = malloc(MULTIBOOT_SEARCH); + if (header_search == NULL) { + close(fd); + return (ENOMEM); + } + + if (read(fd, header_search, MULTIBOOT_SEARCH) != MULTIBOOT_SEARCH) + goto out; + + header = NULL; + for (i = 0; i <= (MULTIBOOT_SEARCH - sizeof (multiboot2_header_t)); + i += MULTIBOOT_HEADER_ALIGN) { + header = (multiboot2_header_t *)(header_search + i); + + /* Do we have match on magic? */ + if (header->mb2_magic != MULTIBOOT2_HEADER_MAGIC) { + header = NULL; + continue; + } + /* + * Validate checksum, the sum of magic + architecture + + * header_length + checksum must equal 0. + */ + if (header->mb2_magic + header->mb2_architecture + + header->mb2_header_length + header->mb2_checksum != 0) { + header = NULL; + continue; + } + /* + * Finally, the entire header must fit within MULTIBOOT_SEARCH. 
+ */ + if (i + header->mb2_header_length > MULTIBOOT_SEARCH) { + header = NULL; + continue; + } + break; + } + + if (header == NULL) + goto out; + + for (tag = header->mb2_tags; tag->mbh_type != MULTIBOOT_TAG_TYPE_END; + tag = (multiboot_header_tag_t *)((uintptr_t)tag + + roundup2(tag->mbh_size, MULTIBOOT_TAG_ALIGN))) { + switch (tag->mbh_type) { + case MULTIBOOT_HEADER_TAG_INFORMATION_REQUEST: + if (is_info_request_valid((void*)tag) == false) + goto out; + break; + case MULTIBOOT_HEADER_TAG_ADDRESS: + addr_tag = (multiboot_header_tag_address_t *)tag; + break; + case MULTIBOOT_HEADER_TAG_ENTRY_ADDRESS: + entry_tag = + (multiboot_header_tag_entry_address_t *)tag; + break; + case MULTIBOOT_HEADER_TAG_CONSOLE_FLAGS: + break; + case MULTIBOOT_HEADER_TAG_FRAMEBUFFER: + have_framebuffer = true; + break; + case MULTIBOOT_HEADER_TAG_MODULE_ALIGN: + /* we always align modules */ + break; + case MULTIBOOT_HEADER_TAG_EFI_BS: + keep_bs = true; + break; + default: + if (!(tag->mbh_flags & MULTIBOOT_HEADER_TAG_OPTIONAL)) { + printf("unsupported tag: 0x%x\n", + tag->mbh_type); + goto out; + } + } + } + + /* + * We must have addr_tag and entry_tag to load a 64-bit kernel. + * If these tags are missing, we either have a 32-bit kernel, or + * this is not our kernel at all. 
+ */ + if (addr_tag != NULL && entry_tag != NULL) { + fp = file_alloc(); + if (fp == NULL) { + error = ENOMEM; + goto out; + } + if (lseek(fd, 0, SEEK_SET) == -1) { + printf("lseek failed\n"); + error = EIO; + file_discard(fp); + goto out; + } + if (fstat(fd, &st) < 0) { + printf("fstat failed\n"); + error = EIO; + file_discard(fp); + goto out; + } + + load_addr = addr_tag->mbh_load_addr; + entry_addr = entry_tag->mbh_entry_addr; + fp->f_addr = archsw.arch_loadaddr(LOAD_KERN, filename, + addr_tag->mbh_load_addr); + if (fp->f_addr == 0) { + error = ENOMEM; + file_discard(fp); + goto out; + } + fp->f_size = archsw.arch_readin(fd, fp->f_addr, st.st_size); + + if (fp->f_size != st.st_size) { + printf("error reading: %s", strerror(errno)); + file_discard(fp); + error = EIO; + goto out; + } + + fp->f_name = strdup(filename); + fp->f_type = strdup("aout multiboot2 kernel"); + if (fp->f_name == NULL || fp->f_type == NULL) { + error = ENOMEM; + file_discard(fp); + goto out; + } + + fp->f_metadata = NULL; + error = 0; + } else { + /* elf32_loadfile_raw will fill the attributes in fp. */ + error = elf32_loadfile_raw(filename, dest, &fp, 2); + if (error != 0) { + printf("elf32_loadfile_raw failed: %d unable to " + "load multiboot2 kernel\n", error); + goto out; + } + entry_addr = fp->f_addr; + /* + * We want the load_addr to have some legal value, + * so we set it same as the entry_addr. + * The distinction is important with UEFI, but not + * with BIOS version, because BIOS version does not use + * staging area. + */ + load_addr = fp->f_addr; + } + + setenv("kernelname", fp->f_name, 1); + bios_addsmapdata(fp); + *result = fp; +out: + free(header_search); + close(fd); + return (error); +} + +/* + * Since for now we have no way to pass the environment to the kernel other than + * through arguments, we need to take care of console setup. + * + * If the console is in mirror mode, set the kernel console from $os_console. + * If it's unset, use first item from $console. 
+ * If $console is "ttyX", also pass $ttyX-mode, since it may have been set by + * the user. + * + * In case of memory allocation errors, just return the original command line + * so we have a chance of booting. + * + * On success, cl will be freed and a new, allocated command line string is + * returned. + */ +static char * +update_cmdline(char *cl) +{ + char *os_console = getenv("os_console"); + char *ttymode = NULL; + char mode[10]; + char *tmp; + int len; + + if (os_console == NULL) { + tmp = strdup(getenv("console")); + os_console = strsep(&tmp, ", "); + } else { + os_console = strdup(os_console); + } + + if (os_console == NULL) + return (cl); + + if (strncmp(os_console, "tty", 3) == 0) { + snprintf(mode, sizeof (mode), "%s-mode", os_console); + ttymode = getenv(mode); /* We will never get NULL. */ + } + + if (strstr(cl, "-B") != NULL) { + len = strlen(cl) + 1; + /* + * If console is not present, add it. + * If console is ttyX, add ttymode. + */ + tmp = strstr(cl, "console"); + if (tmp == NULL) { + len += 12; /* " -B console=" */ + len += strlen(os_console); + if (ttymode != NULL) { + len += 13; /* ",ttyX-mode=\"\"" */ + len += strlen(ttymode); + } + tmp = malloc(len); + if (tmp == NULL) { + free(os_console); + return (cl); + } + if (ttymode != NULL) { + snprintf(tmp, len, + "%s -B console=%s,%s-mode=\"%s\"", + cl, os_console, os_console, ttymode); + } else { + snprintf(tmp, len, "%s -B console=%s", + cl, os_console); + } + } else { + /* console is set, do we need tty mode? 
*/ + tmp += 8; + if (strstr(tmp, "tty") == tmp) { + strncpy(mode, tmp, 4); + mode[4] = '\0'; + strncat(mode, "-mode", 5); + ttymode = getenv(mode); + } else { /* nope */ + free(os_console); + return (cl); + } + len = strlen(cl) + 1; + len += 13; /* ",ttyX-mode=\"\"" */ + len += strlen(ttymode); + tmp = malloc(len); + if (tmp == NULL) { + free(os_console); + return (cl); + } + snprintf(tmp, len, "%s,%s=\"%s\"", cl, mode, ttymode); + } + } else { + /* + * no -B, so we need to add " -B console=%s[,ttyX-mode=\"%s\"]" + */ + len = strlen(cl) + 1; + len += 12; /* " -B console=" */ + len += strlen(os_console); + if (ttymode != NULL) { + len += 13; /* ",ttyX-mode=\"\"" */ + len += strlen(ttymode); + } + tmp = malloc(len); + if (tmp == NULL) { + free(os_console); + return (cl); + } + if (ttymode != NULL) { + snprintf(tmp, len, "%s -B console=%s,%s-mode=\"%s\"", + cl, os_console, os_console, ttymode); + } else { + snprintf(tmp, len, "%s -B console=%s", cl, os_console); + } + } + free(os_console); + free(cl); + return (tmp); +} + +/* + * Search the command line for named property. + * + * Return codes: + * 0 The name is found, we return the data in value and len. + * ENOENT The name is not found. + * EINVAL The provided command line is badly formed. + */ +static int +find_property_value(const char *cmd, const char *name, const char **value, + size_t *len) +{ + const char *namep, *valuep; + size_t name_len, value_len; + int quoted; + + *value = NULL; + *len = 0; + + if (cmd == NULL) + return (ENOENT); + + while (*cmd != '\0') { + if (cmd[0] != '-' || cmd[1] != 'B') { + cmd++; + continue; + } + cmd += 2; /* Skip -B */ + while (cmd[0] == ' ' || cmd[0] == '\t') + cmd++; /* Skip whitespaces. 
*/ + while (*cmd != '\0' && cmd[0] != ' ' && cmd[0] != '\t') { + namep = cmd; + valuep = strchr(cmd, '='); + if (valuep == NULL) + break; + name_len = valuep - namep; + valuep++; + value_len = 0; + quoted = 0; + for (; ; ++value_len) { + if (valuep[value_len] == '\0') + break; + + /* Is this value quoted? */ + if (value_len == 0 && + (valuep[0] == '\'' || valuep[0] == '"')) { + quoted = valuep[0]; + ++value_len; + } + + /* + * In the quote accept any character, + * but look for ending quote. + */ + if (quoted != 0) { + if (valuep[value_len] == quoted) + quoted = 0; + continue; + } + + /* A comma or white space ends the value. */ + if (valuep[value_len] == ',' || + valuep[value_len] == ' ' || + valuep[value_len] == '\t') + break; + } + if (quoted != 0) { + printf("Missing closing '%c' in \"%s\"\n", + quoted, valuep); + return (EINVAL); + } + if (value_len != 0) { + if (strncmp(namep, name, name_len) == 0) { + *value = valuep; + *len = value_len; + return (0); + } + } + cmd = valuep + value_len; + while (*cmd == ',') + cmd++; + } + } + return (ENOENT); +} + +/* + * Build the kernel command line. Shared function between MB1 and MB2. + */ +int +mb_kernel_cmdline(struct preloaded_file *fp, struct devdesc *rootdev, + char **line) +{ + const char *fs = getenv("fstype"); + char *cmdline = NULL; + size_t len; + bool zfs_root = false; + int rv = 0; + + if (rootdev->d_type == DEVT_ZFS) + zfs_root = true; + + /* If we have fstype set in env, reset zfs_root if needed. */ + if (fs != NULL && strcmp(fs, "zfs") != 0) + zfs_root = false; + + /* + * If we have fstype set on the command line, + * reset zfs_root if needed. 
+ */ + rv = find_property_value(fp->f_args, "fstype", &fs, &len); + switch (rv) { + case EINVAL: /* invalid command line */ + default: + return (rv); + case ENOENT: /* fall through */ + case 0: + break; + } + + if (fs != NULL && strncmp(fs, "zfs", len) != 0) + zfs_root = false; + + len = strlen(fp->f_name) + 1; + + if (fp->f_args != NULL) + len += strlen(fp->f_args) + 1; + + if (zfs_root == true) + len += 3 + strlen(zfs_bootfs(rootdev)) + 1; + + cmdline = malloc(len); + if (cmdline == NULL) + return (ENOMEM); + + if (zfs_root == true) { + if (fp->f_args != NULL) { + snprintf(cmdline, len, "%s %s -B %s", fp->f_name, + fp->f_args, zfs_bootfs(rootdev)); + } else { + snprintf(cmdline, len, "%s -B %s", fp->f_name, + zfs_bootfs(rootdev)); + } + } else if (fp->f_args != NULL) + snprintf(cmdline, len, "%s %s", fp->f_name, fp->f_args); + else + snprintf(cmdline, len, "%s", fp->f_name); + + *line = update_cmdline(cmdline); + return (0); +} + +/* + * Returns allocated virtual address from MB info area. + */ +static vm_offset_t +mb_malloc(size_t n) +{ + vm_offset_t ptr = last_addr; + last_addr = roundup(last_addr + n, MULTIBOOT_TAG_ALIGN); + return (ptr); +} + +/* + * Calculate size for module tag list. + */ +static size_t +module_size(struct preloaded_file *fp) +{ + size_t len, size; + struct preloaded_file *mfp; + + size = 0; + for (mfp = fp->f_next; mfp != NULL; mfp = mfp->f_next) { + len = strlen(mfp->f_name) + 1; + len += strlen(mfp->f_type) + 5 + 1; /* 5 is for "type=" */ + if (mfp->f_args != NULL) + len += strlen(mfp->f_args) + 1; + size += sizeof (multiboot_tag_module_t) + len; + size = roundup(size, MULTIBOOT_TAG_ALIGN); + } + return (size); +} + +/* + * Calculate size for bios smap tag. 
+ */ +static size_t +biossmap_size(struct preloaded_file *fp) +{ + int num; + struct file_metadata *md; + + md = file_findmetadata(fp, MODINFOMD_SMAP); + if (md == NULL) + return (0); + + num = md->md_size / sizeof(struct bios_smap); /* number of entries */ + return (sizeof (multiboot_tag_mmap_t) + + num * sizeof (multiboot_mmap_entry_t)); +} + +static size_t +mbi_size(struct preloaded_file *fp, char *cmdline) +{ + size_t size; + + size = sizeof (uint32_t) * 2; /* first 2 fields from MBI header */ + size += sizeof (multiboot_tag_string_t) + strlen(cmdline) + 1; + size = roundup2(size, MULTIBOOT_TAG_ALIGN); + size += sizeof (multiboot_tag_string_t) + strlen(bootprog_info) + 1; + size = roundup2(size, MULTIBOOT_TAG_ALIGN); + size += sizeof (multiboot_tag_basic_meminfo_t); + size = roundup2(size, MULTIBOOT_TAG_ALIGN); + size += module_size(fp); + size = roundup2(size, MULTIBOOT_TAG_ALIGN); + size += biossmap_size(fp); + size = roundup2(size, MULTIBOOT_TAG_ALIGN); + + if (strstr(getenv("loaddev"), "pxe") != NULL) { + size += sizeof(multiboot_tag_network_t) + sizeof (BOOTPLAYER); + size = roundup2(size, MULTIBOOT_TAG_ALIGN); + } + + if (rsdp != NULL) { + if (rsdp->Revision == 0) { + size += sizeof (multiboot_tag_old_acpi_t) + + sizeof(ACPI_RSDP_COMMON); + } else { + size += sizeof (multiboot_tag_new_acpi_t) + + rsdp->Length; + } + size = roundup2(size, MULTIBOOT_TAG_ALIGN); + } + size += sizeof(multiboot_tag_t); + + return (size); +} + +static int +multiboot2_exec(struct preloaded_file *fp) +{ + struct preloaded_file *mfp; + multiboot2_info_header_t *mbi; + char *cmdline = NULL; + struct devdesc *rootdev; + struct file_metadata *md; + int i, error, num; + int rootfs = 0; + size_t size; + struct bios_smap *smap; + vm_offset_t tmp; + i386_getdev((void **)(&rootdev), NULL, NULL); + + error = EINVAL; + if (rootdev == NULL) { + printf("can't determine root device\n"); + goto error; + } + + /* + * Set the image command line. 
+ */ + if (fp->f_args == NULL) { + cmdline = getenv("boot-args"); + if (cmdline != NULL) { + fp->f_args = strdup(cmdline); + if (fp->f_args == NULL) { + error = ENOMEM; + goto error; + } + } + } + + error = mb_kernel_cmdline(fp, rootdev, &cmdline); + if (error != 0) + goto error; + + /* mb_kernel_cmdline() updates the environment. */ + build_environment_module(); + + size = mbi_size(fp, cmdline); /* Get the size for MBI. */ + + /* Set up the base for mb_malloc. */ + for (mfp = fp; mfp->f_next != NULL; mfp = mfp->f_next); + + /* Start info block from the new page. */ + last_addr = roundup(mfp->f_addr + mfp->f_size, MULTIBOOT_MOD_ALIGN); + + /* Do we have space for multiboot info? */ + if (last_addr + size >= memtop_copyin) { + error = ENOMEM; + goto error; + } + + mbi = (multiboot2_info_header_t *)PTOV(last_addr); + last_addr = (vm_offset_t)mbi->mbi_tags; + + { + multiboot_tag_string_t *tag; + i = sizeof (multiboot_tag_string_t) + strlen(cmdline) + 1; + tag = (multiboot_tag_string_t *) mb_malloc(i); + + tag->mb_type = MULTIBOOT_TAG_TYPE_CMDLINE; + tag->mb_size = i; + memcpy(tag->mb_string, cmdline, strlen(cmdline) + 1); + free(cmdline); + cmdline = NULL; + } + + { + multiboot_tag_string_t *tag; + i = sizeof (multiboot_tag_string_t) + strlen(bootprog_info) + 1; + tag = (multiboot_tag_string_t *) mb_malloc(i); + + tag->mb_type = MULTIBOOT_TAG_TYPE_BOOT_LOADER_NAME; + tag->mb_size = i; + memcpy(tag->mb_string, bootprog_info, + strlen(bootprog_info) + 1); + } + + { + multiboot_tag_basic_meminfo_t *tag; + tag = (multiboot_tag_basic_meminfo_t *) + mb_malloc(sizeof (*tag)); + + tag->mb_type = MULTIBOOT_TAG_TYPE_BASIC_MEMINFO; + tag->mb_size = sizeof (*tag); + tag->mb_mem_lower = bios_basemem / 1024; + tag->mb_mem_upper = bios_extmem / 1024; + } + + num = 0; + for (mfp = fp->f_next; mfp != NULL; mfp = mfp->f_next) { + num++; + if (mfp->f_type != NULL && strcmp(mfp->f_type, "rootfs") == 0) + rootfs++; + } + + if (num == 0 || rootfs == 0) { + /* We need at least one module - 
rootfs. */ + printf("No rootfs module provided, aborting\n"); + error = EINVAL; + goto error; + } + + /* + * Set the stage for physical memory layout: + * - We have kernel at load_addr. + * - Modules are aligned to page boundary. + * - MBI is aligned to page boundary. + * - Set the tmp to point to physical address of the first module. + */ + tmp = roundup2(load_addr + fp->f_size, MULTIBOOT_MOD_ALIGN); + + for (mfp = fp->f_next; mfp != NULL; mfp = mfp->f_next) { + multiboot_tag_module_t *tag; + + num = strlen(mfp->f_name) + 1; + num += strlen(mfp->f_type) + 5 + 1; + if (mfp->f_args != NULL) { + num += strlen(mfp->f_args) + 1; + } + cmdline = malloc(num); + if (cmdline == NULL) { + error = ENOMEM; + goto error; + } + + if (mfp->f_args != NULL) + snprintf(cmdline, num, "%s type=%s %s", + mfp->f_name, mfp->f_type, mfp->f_args); + else + snprintf(cmdline, num, "%s type=%s", + mfp->f_name, mfp->f_type); + + tag = (multiboot_tag_module_t *)mb_malloc(sizeof (*tag) + num); + + tag->mb_type = MULTIBOOT_TAG_TYPE_MODULE; + tag->mb_size = sizeof (*tag) + num; + tag->mb_mod_start = tmp; + tag->mb_mod_end = tmp + mfp->f_size; + tmp = roundup2(tag->mb_mod_end, MULTIBOOT_MOD_ALIGN); + memcpy(tag->mb_cmdline, cmdline, num); + free(cmdline); + cmdline = NULL; + } + + md = file_findmetadata(fp, MODINFOMD_SMAP); + if (md == NULL) { + printf("no memory smap\n"); + error = EINVAL; + goto error; + } + + smap = (struct bios_smap *)md->md_data; + num = md->md_size / sizeof(struct bios_smap); /* number of entries */ + + { + multiboot_tag_mmap_t *tag; + multiboot_mmap_entry_t *mmap_entry; + + tag = (multiboot_tag_mmap_t *) + mb_malloc(sizeof (*tag) + + num * sizeof (multiboot_mmap_entry_t)); + + tag->mb_type = MULTIBOOT_TAG_TYPE_MMAP; + tag->mb_size = sizeof (*tag) + + num * sizeof (multiboot_mmap_entry_t); + tag->mb_entry_size = sizeof (multiboot_mmap_entry_t); + tag->mb_entry_version = 0; + mmap_entry = (multiboot_mmap_entry_t *)tag->mb_entries; + + for (i = 0; i < num; i++) { + 
mmap_entry[i].mmap_addr = smap[i].base; + mmap_entry[i].mmap_len = smap[i].length; + mmap_entry[i].mmap_type = smap[i].type; + mmap_entry[i].mmap_reserved = 0; + } + } + + if (strstr(getenv("loaddev"), "pxe") != NULL) { + multiboot_tag_network_t *tag; + tag = (multiboot_tag_network_t *) + mb_malloc(sizeof(*tag) + sizeof (BOOTPLAYER)); + + tag->mb_type = MULTIBOOT_TAG_TYPE_NETWORK; + tag->mb_size = sizeof(*tag) + sizeof (BOOTPLAYER); + memcpy(tag->mb_dhcpack, &bootplayer, sizeof (BOOTPLAYER)); + } + + if (rsdp != NULL) { + multiboot_tag_new_acpi_t *ntag; + multiboot_tag_old_acpi_t *otag; + int size; + + if (rsdp->Revision == 0) { + size = sizeof (*otag) + rsdp->Length; + otag = (multiboot_tag_old_acpi_t *)mb_malloc(size); + otag->mb_type = MULTIBOOT_TAG_TYPE_ACPI_OLD; + otag->mb_size = size; + memcpy(otag->mb_rsdp, rsdp, sizeof (ACPI_RSDP_COMMON)); + } else { + size = sizeof (*ntag) + rsdp->Length; + ntag = (multiboot_tag_new_acpi_t *)mb_malloc(size); + ntag->mb_type = MULTIBOOT_TAG_TYPE_ACPI_NEW; + ntag->mb_size = size; + memcpy(ntag->mb_rsdp, rsdp, rsdp->Length); + } + } + + /* + * MB tag list end marker. 
+ */ + { + multiboot_tag_t *tag = (multiboot_tag_t *) + mb_malloc(sizeof(*tag)); + tag->mb_type = MULTIBOOT_TAG_TYPE_END; + tag->mb_size = sizeof(*tag); + } + + mbi->mbi_total_size = last_addr - (vm_offset_t)mbi; + mbi->mbi_reserved = 0; + + dev_cleanup(); + __exec((void *)VTOP(multiboot_tramp), MULTIBOOT2_BOOTLOADER_MAGIC, + (void *)entry_addr, (void *)VTOP(mbi)); + panic("exec returned"); + +error: + if (cmdline != NULL) + free(cmdline); + return (error); +} diff --git a/usr/src/boot/sys/boot/efi/loader/bootinfo.c b/usr/src/boot/sys/boot/efi/loader/bootinfo.c index 1f45ea3493..6c90871c06 100644 --- a/usr/src/boot/sys/boot/efi/loader/bootinfo.c +++ b/usr/src/boot/sys/boot/efi/loader/bootinfo.c @@ -27,7 +27,6 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); #include <stand.h> #include <string.h> @@ -112,7 +111,7 @@ bi_getboothowto(char *kargs) * Each variable is formatted as <name>=<value>, with a single nul * separating each variable, and a double nul terminating the environment. */ -static vm_offset_t +vm_offset_t bi_copyenv(vm_offset_t start) { struct env_var *ep; diff --git a/usr/src/boot/sys/boot/i386/libi386/biosacpi.c b/usr/src/boot/sys/boot/i386/libi386/biosacpi.c index 18f8050783..a82862dd3f 100644 --- a/usr/src/boot/sys/boot/i386/libi386/biosacpi.c +++ b/usr/src/boot/sys/boot/i386/libi386/biosacpi.c @@ -43,6 +43,7 @@ * environment. 
*/ +ACPI_TABLE_RSDP *rsdp; static ACPI_TABLE_RSDP *biosacpi_find_rsdp(void); static ACPI_TABLE_RSDP *biosacpi_search_rsdp(char *base, int length); @@ -51,7 +52,6 @@ static ACPI_TABLE_RSDP *biosacpi_search_rsdp(char *base, int length); void biosacpi_detect(void) { - ACPI_TABLE_RSDP *rsdp; char buf[24]; int revision; diff --git a/usr/src/boot/sys/boot/i386/libi386/multiboot.c b/usr/src/boot/sys/boot/i386/libi386/multiboot.c index 899f75bca6..32e4fe3b07 100644 --- a/usr/src/boot/sys/boot/i386/libi386/multiboot.c +++ b/usr/src/boot/sys/boot/i386/libi386/multiboot.c @@ -41,7 +41,6 @@ #include <sys/linker.h> #include <sys/module.h> #include <sys/stdint.h> -#include <stdbool.h> #define _MACHINE_ELF_WANT_32BIT #include <machine/elf.h> #include <machine/metadata.h> @@ -58,7 +57,6 @@ #define MULTIBOOT_SUPPORTED_FLAGS \ (MULTIBOOT_AOUT_KLUDGE|MULTIBOOT_PAGE_ALIGN|MULTIBOOT_MEMORY_INFO) -#define NUM_MODULES 2 #define METADATA_FIXED_SIZE (PAGE_SIZE*4) #define METADATA_MODULE_SIZE PAGE_SIZE @@ -67,12 +65,17 @@ /* MB data heap pointer */ static vm_offset_t last_addr; +extern char bootprog_info[]; extern int elf32_loadfile_raw(char *filename, u_int64_t dest, struct preloaded_file **result, int multiboot); extern int elf64_load_modmetadata(struct preloaded_file *fp, u_int64_t dest); extern int elf64_obj_loadfile(char *filename, u_int64_t dest, struct preloaded_file **result); +extern int mb_kernel_cmdline(struct preloaded_file *, struct devdesc *, + char **); + +extern void multiboot_tramp(); static int multiboot_loadfile(char *, u_int64_t, struct preloaded_file **); static int multiboot_exec(struct preloaded_file *); @@ -84,10 +87,6 @@ struct file_format multiboot = { multiboot_loadfile, multiboot_exec }; struct file_format multiboot_obj = { multiboot_obj_loadfile, multiboot_obj_exec }; -extern void multiboot_tramp(); - -static const char mbl_name[] = "illumos Loader"; - static int num_modules(struct preloaded_file *kfp) { @@ -124,7 +123,6 @@ multiboot_loadfile(char *filename, 
u_int64_t dest, ssize_t search_size; int fd; struct multiboot_header *header; - char *cmdline; struct preloaded_file *fp; if (filename == NULL) @@ -219,12 +217,9 @@ multiboot_loadfile(char *filename, u_int64_t dest, goto out; } fp->f_metadata = NULL; - - *result = fp; error = 0; } else { - - error = elf32_loadfile_raw(filename, dest, result, 1); + error = elf32_loadfile_raw(filename, dest, &fp, 1); if (error != 0) { printf("elf32_loadfile_raw failed: %d unable to " "load multiboot kernel\n", error); @@ -232,8 +227,9 @@ multiboot_loadfile(char *filename, u_int64_t dest, } } - setenv("kernelname", (*result)->f_name, 1); - bios_addsmapdata(*result); + setenv("kernelname", fp->f_name, 1); + bios_addsmapdata(fp); + *result = fp; out: free(header_search); close(fd); @@ -253,267 +249,6 @@ mb_malloc(size_t n) return (ptr); } -/* - * Since for now we have no way to pass the environment to the kernel other than - * through arguments, we need to take care of console setup. - * - * If the console is in mirror mode, set the kernel console from $os_console. - * If it's unset, use first item from $console. - * If $console is "ttyX", also pass $ttyX-mode, since it may have been set by - * the user. - * - * In case of memory allocation errors, just return original command line, - * so we have chance of booting. - * - * On success, cl will be freed and a new, allocated command line string is - * returned. 
- */ -static char * -update_cmdline(char *cl) -{ - char *os_console = getenv("os_console"); - char *ttymode = NULL; - char mode[10]; - char *tmp; - int len; - - if (os_console == NULL) { - tmp = strdup(getenv("console")); - os_console = strsep(&tmp, ", "); - } else - os_console = strdup(os_console); - - if (os_console == NULL) - return (cl); - - if (strncmp(os_console, "tty", 3) == 0) { - snprintf(mode, sizeof (mode), "%s-mode", os_console); - ttymode = getenv(mode); /* never NULL */ - } - - if (strstr(cl, "-B") != NULL) { - len = strlen(cl) + 1; - /* - * if console is not present, add it - * if console is ttyX, add ttymode - */ - tmp = strstr(cl, "console"); - if (tmp == NULL) { - len += 12; /* " -B console=" */ - len += strlen(os_console); - if (ttymode != NULL) { - len += 13; /* ",ttyX-mode=\"\"" */ - len += strlen(ttymode); - } - tmp = malloc(len); - if (tmp == NULL) { - free(os_console); - return (cl); - } - if (ttymode != NULL) - sprintf(tmp, - "%s -B console=%s,%s-mode=\"%s\"", - cl, os_console, os_console, ttymode); - else - sprintf(tmp, "%s -B console=%s", - cl, os_console); - } else { - /* console is set, do we need tty mode? 
*/ - tmp += 8; - if (strstr(tmp, "tty") == tmp) { - strncpy(mode, tmp, 4); - mode[4] = '\0'; - strcat(mode, "-mode"); - ttymode = getenv(mode); /* never NULL */ - } else { /* nope */ - free(os_console); - return (cl); - } - len = strlen(cl) + 1; - len += 13; /* ",ttyX-mode=\"\"" */ - len += strlen(ttymode); - tmp = malloc(len); - if (tmp == NULL) { - free(os_console); - return (cl); - } - sprintf(tmp, "%s,%s=\"%s\"", cl, mode, ttymode); - } - } else { - /* - * no -B, so we need to add " -B console=%s[,ttyX-mode=\"%s\"]" - */ - len = strlen(cl) + 1; - len += 12; /* " -B console=" */ - len += strlen(os_console); - if (ttymode != NULL) { - len += 13; /* ",ttyX-mode=\"\"" */ - len += strlen(ttymode); - } - tmp = malloc(len); - if (tmp == NULL) { - free(os_console); - return (cl); - } - if (ttymode != NULL) - sprintf(tmp, "%s -B console=%s,%s-mode=\"%s\"", cl, - os_console, os_console, ttymode); - else - sprintf(tmp, "%s -B console=%s", cl, os_console); - } - free(os_console); - free(cl); - return (tmp); -} - -/* - * Search the command line for named property. - * - * Return codes: - * 0 The name is found, we return the data in value and len. - * ENOENT The name is not found. - * EINVAL The provided command line is badly formed. - */ -static int -find_property_value(const char *cmd, const char *name, const char **value, - size_t *len) -{ - const char *namep, *valuep; - size_t name_len, value_len; - int quoted; - - *value = NULL; - *len = 0; - - if (cmd == NULL) - return (ENOENT); - - while (*cmd != '\0') { - if (cmd[0] != '-' || cmd[1] != 'B') { - cmd++; - continue; - } - cmd += 2; /* Skip -B */ - while (cmd[0] == ' ' || cmd[0] == '\t') - cmd++; /* Skip whitespaces. */ - while (*cmd != '\0' && cmd[0] != ' ' && cmd[0] != '\t') { - namep = cmd; - valuep = strchr(cmd, '='); - if (valuep == NULL) - break; - name_len = valuep - namep; - valuep++; - value_len = 0; - quoted = 0; - for (; ; ++value_len) { - if (valuep[value_len] == '\0') - break; - - /* Is this value quoted? 
*/ - if (value_len == 0 && - (valuep[0] == '\'' || valuep[0] == '"')) { - quoted = valuep[0]; - ++value_len; - } - - /* - * In the quote accept any character, - * but look for ending quote. - */ - if (quoted != 0) { - if (valuep[value_len] == quoted) - quoted = 0; - continue; - } - - /* A comma or white space ends the value. */ - if (valuep[value_len] == ',' || - valuep[value_len] == ' ' || - valuep[value_len] == '\t') - break; - } - if (quoted != 0) { - printf("Missing closing '%c' in \"%s\"\n", - quoted, valuep); - return (EINVAL); - } - - if (value_len != 0) { - if (strncmp(namep, name, name_len) == 0) { - *value = valuep; - *len = value_len; - return (0); - } - } - cmd = valuep + value_len; - while (*cmd == ',') - cmd++; - } - } - return (ENOENT); -} - -static int -kernel_cmdline(struct preloaded_file *fp, struct i386_devdesc *rootdev, - char **line) -{ - const char *fs = getenv("fstype"); - char *cmdline = NULL; - size_t len; - bool zfs_root = false; - int rv = 0; - - if (rootdev->d_type == DEVT_ZFS) - zfs_root = true; - - /* If we have fstype set in env, reset zfs_root if needed. */ - if (fs != NULL && strcmp(fs, "zfs") != 0) - zfs_root = false; - - /* - * If we have fstype set on the command line, - * reset zfs_root if needed. 
- */ - rv = find_property_value(fp->f_args, "fstype", &fs, &len); - switch (rv) { - case EINVAL: /* invalid command line */ - return (rv); - case ENOENT: /* fall through */ - case 0: - break; - } - - if (fs != NULL && strncmp(fs, "zfs", len) != 0) - zfs_root = false; - - len = strlen(fp->f_name) + 1; - - if (fp->f_args != NULL) - len += strlen(fp->f_args) + 1; - - if (zfs_root == true) - len += 3 + strlen(zfs_bootfs(rootdev)) + 1; - - cmdline = malloc(len); - if (cmdline == NULL) - return (ENOMEM); - - if (zfs_root == true) { - if (fp->f_args != NULL) - snprintf(cmdline, len, "%s %s -B %s", fp->f_name, - fp->f_args, zfs_bootfs(rootdev)); - else - snprintf(cmdline, len, "%s -B %s", fp->f_name, - zfs_bootfs(rootdev)); - } else if (fp->f_args != NULL) - snprintf(cmdline, len, "%s %s", fp->f_name, fp->f_args); - else - snprintf(cmdline, len, "%s", fp->f_name); - - *line = update_cmdline(cmdline); - return (0); -} - static int multiboot_exec(struct preloaded_file *fp) { @@ -521,13 +256,12 @@ multiboot_exec(struct preloaded_file *fp) vm_offset_t module_start, metadata_size; vm_offset_t modulep, kernend, entry; struct file_metadata *md; - Elf_Ehdr *ehdr; struct multiboot_info *mb_info = NULL; struct multiboot_mod_list *mb_mod = NULL; multiboot_memory_map_t *mmap; struct bios_smap *smap; - struct i386_devdesc *rootdev; - extern BOOTPLAYER bootplayer; /* dhcp info */ + struct devdesc *rootdev; + extern BOOTPLAYER bootplayer; /* dhcp info */ char *cmdline = NULL; size_t len; int error, num, i; @@ -535,10 +269,10 @@ multiboot_exec(struct preloaded_file *fp) int xen = 0; /* flag for xen */ int kernel = 0; /* flag for kernel */ - /* set up base for mb_malloc */ + /* Set up base for mb_malloc. */ for (mfp = fp; mfp->f_next != NULL; mfp = mfp->f_next); - /* start info block from new page */ + /* Start info block from new page. */ last_addr = roundup(mfp->f_addr + mfp->f_size, MULTIBOOT_MOD_ALIGN); /* Allocate the multiboot struct and fill the basic details. 
*/ @@ -548,9 +282,10 @@ multiboot_exec(struct preloaded_file *fp) mb_info->flags = MULTIBOOT_INFO_MEMORY|MULTIBOOT_INFO_BOOT_LOADER_NAME; mb_info->mem_lower = bios_basemem / 1024; mb_info->mem_upper = bios_extmem / 1024; - mb_info->boot_loader_name = mb_malloc(strlen(mbl_name) + 1); + mb_info->boot_loader_name = mb_malloc(strlen(bootprog_info) + 1); - i386_copyin(mbl_name, mb_info->boot_loader_name, strlen(mbl_name)+1); + i386_copyin(bootprog_info, mb_info->boot_loader_name, + strlen(bootprog_info) + 1); i386_getdev((void **)(&rootdev), NULL, NULL); if (rootdev == NULL) { @@ -560,12 +295,12 @@ multiboot_exec(struct preloaded_file *fp) } /* - * boot image command line. if args were not provided, we need to set + * Boot image command line. If args were not provided, we need to set * args here, and that depends on image type... - * fortunately we only have following options: - * 64 or 32 bit unix or xen. so we just check if f_name has unix. + * Fortunately we only have following options: + * 64 or 32 bit unix or xen. So we just check if f_name has unix. */ - /* do we boot xen? */ + /* Do we boot xen? */ if (strstr(fp->f_name, "unix") == NULL) xen = 1; @@ -581,7 +316,7 @@ multiboot_exec(struct preloaded_file *fp) } if (num == 0 || rootfs == 0) { - /* need at least one module - rootfs */ + /* We need at least one module - rootfs. */ printf("No rootfs module provided, aborting\n"); error = EINVAL; goto error; @@ -603,7 +338,7 @@ multiboot_exec(struct preloaded_file *fp) if (strcmp(mfp->f_type, "kernel") == 0) { cmdline = NULL; - error = kernel_cmdline(mfp, rootdev, &cmdline); + error = mb_kernel_cmdline(mfp, rootdev, &cmdline); if (error != 0) goto error; } else { @@ -667,7 +402,7 @@ multiboot_exec(struct preloaded_file *fp) } /* * Set the image command line. Need to do this as last thing, - * as Illumos kernel dboot_startkern will check cmdline + * as illumos kernel dboot_startkern will check cmdline * address as last check to find first free address. 
*/ if (fp->f_args == NULL) { @@ -685,7 +420,7 @@ multiboot_exec(struct preloaded_file *fp) } /* - * if image is xen, we just use f_name + f_args for commandline + * If the image is xen, we just use f_name + f_args for commandline * for unix, we need to add zfs-bootfs. */ if (xen) { @@ -708,7 +443,7 @@ multiboot_exec(struct preloaded_file *fp) } } else { cmdline = NULL; - if ((error = kernel_cmdline(fp, rootdev, &cmdline)) != 0) + if ((error = mb_kernel_cmdline(fp, rootdev, &cmdline)) != 0) goto error; } @@ -719,8 +454,8 @@ multiboot_exec(struct preloaded_file *fp) cmdline = NULL; dev_cleanup(); - __exec((void *)VTOP(multiboot_tramp), (void *)entry, - (void *)VTOP(mb_info)); + __exec((void *)VTOP(multiboot_tramp), MULTIBOOT_BOOTLOADER_MAGIC, + (void *)entry, (void *)VTOP(mb_info)); panic("exec returned"); diff --git a/usr/src/boot/sys/boot/i386/libi386/multiboot_tramp.S b/usr/src/boot/sys/boot/i386/libi386/multiboot_tramp.S index 0bd604365f..452a86bbb8 100644 --- a/usr/src/boot/sys/boot/i386/libi386/multiboot_tramp.S +++ b/usr/src/boot/sys/boot/i386/libi386/multiboot_tramp.S @@ -26,9 +26,6 @@ * $FreeBSD$ */ -#define ASM_FILE -#include "multiboot.h" - /* * The multiboot specification requires the executable to be launched * with %cs set to a flat read/execute segment with offset 0 and limit @@ -43,9 +40,9 @@ multiboot_tramp: /* Be sure that interrupts are disabled. */ cli - movl $MULTIBOOT_BOOTLOADER_MAGIC, %eax + movl 4(%esp), %eax /* bootloader magic */ /* Get the entry point and address of the multiboot_info parameter. 
*/ - movl 8(%esp), %ebx - movl 4(%esp), %ecx + movl 12(%esp), %ebx /* multiboot_info */ + movl 8(%esp), %ecx /* entry */ call *%ecx diff --git a/usr/src/boot/sys/boot/i386/loader/Makefile b/usr/src/boot/sys/boot/i386/loader/Makefile index f2dac3bfc9..1275085c18 100644 --- a/usr/src/boot/sys/boot/i386/loader/Makefile +++ b/usr/src/boot/sys/boot/i386/loader/Makefile @@ -17,8 +17,9 @@ include $(SRC)/Makefile.master include $(SRC)/boot/Makefile.version -CFLAGS= -O2 -CPPFLAGS= -DSTAND -nostdinc -I../../../../include -I../../.. +CFLAGS= -O2 +CPPFLAGS= -DSTAND -nostdinc -I../../../../include -I../../.. +CPPFLAGS += -I$(SRC)/uts/intel/sys/acpi LOADER= zfsloader NEWVERSWHAT= "ZFS enabled bootstrap loader" x86 MAN= @@ -63,7 +64,7 @@ LIBFICL= ../../ficl/i386/libficl.a # Always add MI sources SRCS += boot.c commands.c console.c devopen.c interp.c SRCS += interp_backslash.c interp_parse.c ls.c misc.c -SRCS += module.c panic.c linenoise.c +SRCS += module.c panic.c linenoise.c multiboot2.c SRCS += load_elf32.c load_elf32_obj.c reloc_elf32.c SRCS += load_elf64.c load_elf64_obj.c reloc_elf64.c @@ -107,6 +108,9 @@ CPPFLAGS += -I../btx/lib include ../Makefile.inc +# For multiboot2.h, must be last, to avoid conflicts +CPPFLAGS += -I$(SRC)/uts/common + vers.c: ../../common/newvers.sh $(SRC)/boot/Makefile.version $(SH) ../../common/newvers.sh ${LOADER_VERSION} ${NEWVERSWHAT} @@ -135,7 +139,6 @@ DPADD= ${LIBFICL} ${LIBZFSBOOT} ${LIBI386} ${LIBSTAND} LDADD= ${LIBFICL} ${LIBZFSBOOT} ${LIBI386} ${LIBSTAND} CLEANFILES += machine x86 -CFLAGS += -DLOADER_PREFER_AMD64 machine: $(RM) machine diff --git a/usr/src/boot/sys/boot/i386/loader/conf.c b/usr/src/boot/sys/boot/i386/loader/conf.c index b47c9219e8..d99c3a4b49 100644 --- a/usr/src/boot/sys/boot/i386/loader/conf.c +++ b/usr/src/boot/sys/boot/i386/loader/conf.c @@ -101,10 +101,12 @@ extern struct file_format amd64_elf; extern struct file_format amd64_elf_obj; extern struct file_format multiboot; extern struct file_format multiboot_obj; 
+extern struct file_format multiboot2; extern struct file_format linux; extern struct file_format linux_initrd; struct file_format *file_formats[] = { + &multiboot2, &multiboot, &multiboot_obj, &amd64_elf, diff --git a/usr/src/boot/sys/boot/i386/loader/main.c b/usr/src/boot/sys/boot/i386/loader/main.c index be092c552f..9f9d69f0c2 100644 --- a/usr/src/boot/sys/boot/i386/loader/main.c +++ b/usr/src/boot/sys/boot/i386/loader/main.c @@ -38,7 +38,9 @@ #include <machine/cpufunc.h> #include <machine/psl.h> #include <sys/disk.h> +#include <sys/param.h> #include <sys/reboot.h> +#include <sys/multiboot2.h> #include "bootstrap.h" #include "common/bootargs.h" @@ -81,6 +83,18 @@ extern char end[]; static void *heap_top; static void *heap_bottom; +static uint64_t +i386_loadaddr(u_int type, void *data, uint64_t addr) +{ + /* + * Our modules are page aligned. + */ + if (type == LOAD_RAW || type == LOAD_MEM) + return (roundup2(addr, MULTIBOOT_MOD_ALIGN)); + + return (addr); +} + int main(void) { @@ -162,6 +176,7 @@ main(void) archsw.arch_readin = i386_readin; archsw.arch_isainb = isa_inb; archsw.arch_isaoutb = isa_outb; + archsw.arch_loadaddr = i386_loadaddr; #ifdef LOADER_ZFS_SUPPORT archsw.arch_zfs_probe = i386_zfs_probe; #endif diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c index 0658d7c639..10a2f5a4f7 100644 --- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c +++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c @@ -3553,7 +3553,6 @@ typedef struct mdb_arc_buf_hdr_t { struct { uint32_t b_bufcnt; uintptr_t b_state; - uintptr_t b_pdata; } b_l1hdr; } mdb_arc_buf_hdr_t; diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c index 9ddf5e1021..0137e6f448 100644 --- a/usr/src/cmd/zdb/zdb.c +++ b/usr/src/cmd/zdb/zdb.c @@ -60,6 +60,7 @@ #include <sys/arc.h> #include <sys/ddt.h> #include <sys/zfeature.h> +#include <sys/abd.h> #include <zfs_comutil.h> #undef verify #include <libzfs.h> @@ -2537,7 +2538,7 @@ zdb_blkptr_done(zio_t *zio) zdb_cb_t *zcb 
= zio->io_private; zbookmark_phys_t *zb = &zio->io_bookmark; - zio_data_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; @@ -2603,7 +2604,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (!BP_IS_EMBEDDED(bp) && (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { size_t size = BP_GET_PSIZE(bp); - void *data = zio_data_buf_alloc(size); + abd_t *abd = abd_alloc(size, B_FALSE); int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; /* If it's an intent log block, failure is expected. */ @@ -2616,7 +2617,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, spa->spa_scrub_inflight++; mutex_exit(&spa->spa_scrub_lock); - zio_nowait(zio_read(NULL, spa, bp, data, size, + zio_nowait(zio_read(NULL, spa, bp, abd, size, zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); } @@ -3397,6 +3398,13 @@ name: return (NULL); } +/* ARGSUSED */ +static int +random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused) +{ + return (random_get_pseudo_bytes(buf, len)); +} + /* * Read a block from a pool and print it out. The syntax of the * block descriptor is: @@ -3428,7 +3436,8 @@ zdb_read_block(char *thing, spa_t *spa) uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0; zio_t *zio; vdev_t *vd; - void *pbuf, *lbuf, *buf; + abd_t *pabd; + void *lbuf, *buf; char *s, *p, *dup, *vdev, *flagstr; int i, error; @@ -3499,7 +3508,7 @@ zdb_read_block(char *thing, spa_t *spa) psize = size; lsize = size; - pbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); + pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE); lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); BP_ZERO(bp); @@ -3527,15 +3536,15 @@ zdb_read_block(char *thing, spa_t *spa) /* * Treat this as a normal block read. 
*/ - zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL, + zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); } else { /* * Treat this as a vdev child I/O. */ - zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize, - ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, + zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd, + psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL)); @@ -3558,21 +3567,21 @@ zdb_read_block(char *thing, spa_t *spa) void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); - bcopy(pbuf, pbuf2, psize); + abd_copy_to_buf(pbuf2, pabd, psize); - VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize, - SPA_MAXBLOCKSIZE - psize) == 0); + VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize, + random_get_pseudo_bytes_cb, NULL)); - VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize, - SPA_MAXBLOCKSIZE - psize) == 0); + VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize, + SPA_MAXBLOCKSIZE - psize)); for (lsize = SPA_MAXBLOCKSIZE; lsize > psize; lsize -= SPA_MINBLOCKSIZE) { for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) { - if (zio_decompress_data(c, pbuf, lbuf, - psize, lsize) == 0 && - zio_decompress_data(c, pbuf2, lbuf2, - psize, lsize) == 0 && + if (zio_decompress_data(c, pabd, + lbuf, psize, lsize) == 0 && + zio_decompress_data_buf(c, pbuf2, + lbuf2, psize, lsize) == 0 && bcmp(lbuf, lbuf2, lsize) == 0) break; } @@ -3591,7 +3600,7 @@ zdb_read_block(char *thing, spa_t *spa) buf = lbuf; size = lsize; } else { - buf = pbuf; + buf = abd_to_buf(pabd); size = psize; } @@ -3609,7 +3618,7 @@ zdb_read_block(char *thing, spa_t *spa) zdb_dump_block(thing, buf, size, flags); out: - umem_free(pbuf, SPA_MAXBLOCKSIZE); + abd_free(pabd); umem_free(lbuf, SPA_MAXBLOCKSIZE); 
free(dup); } diff --git a/usr/src/cmd/zdb/zdb_il.c b/usr/src/cmd/zdb/zdb_il.c index 583e422286..bc02b1b670 100644 --- a/usr/src/cmd/zdb/zdb_il.c +++ b/usr/src/cmd/zdb/zdb_il.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. */ /* @@ -41,6 +41,7 @@ #include <sys/resource.h> #include <sys/zil.h> #include <sys/zil_impl.h> +#include <sys/abd.h> extern uint8_t dump_opt[256]; @@ -117,13 +118,27 @@ zil_prt_rec_rename(zilog_t *zilog, int txtype, lr_rename_t *lr) } /* ARGSUSED */ +static int +zil_prt_rec_write_cb(void *data, size_t len, void *unused) +{ + char *cdata = data; + for (int i = 0; i < len; i++) { + if (isprint(*cdata)) + (void) printf("%c ", *cdata); + else + (void) printf("%2X", *cdata); + cdata++; + } + return (0); +} + +/* ARGSUSED */ static void zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) { - char *data, *dlimit; + abd_t *data; blkptr_t *bp = &lr->lr_blkptr; zbookmark_phys_t zb; - char buf[SPA_MAXBLOCKSIZE]; int verbose = MAX(dump_opt['d'], dump_opt['i']); int error; @@ -144,7 +159,6 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) if (BP_IS_HOLE(bp)) { (void) printf("\t\t\tLSIZE 0x%llx\n", (u_longlong_t)BP_GET_LSIZE(bp)); - bzero(buf, sizeof (buf)); (void) printf("%s<hole>\n", prefix); return; } @@ -157,28 +171,26 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); + data = abd_alloc(BP_GET_LSIZE(bp), B_FALSE); error = zio_wait(zio_read(NULL, zilog->zl_spa, - bp, buf, BP_GET_LSIZE(bp), NULL, NULL, + bp, data, BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb)); if (error) - return; - data = buf; + goto out; } else { - data = (char *)(lr + 1); + /* data is stored after the end of the lr_write record */ + data = abd_alloc(lr->lr_length, B_FALSE); + abd_copy_from_buf(data, lr + 1, lr->lr_length); } - dlimit = data + 
MIN(lr->lr_length, - (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)); - (void) printf("%s", prefix); - while (data < dlimit) { - if (isprint(*data)) - (void) printf("%c ", *data); - else - (void) printf("%2X", *data); - data++; - } + (void) abd_iterate_func(data, + 0, MIN(lr->lr_length, (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)), + zil_prt_rec_write_cb, NULL); (void) printf("\n"); + +out: + abd_free(data); } /* ARGSUSED */ diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c index 75a3d5245f..16f79b52ef 100644 --- a/usr/src/cmd/ztest/ztest.c +++ b/usr/src/cmd/ztest/ztest.c @@ -111,6 +111,7 @@ #include <sys/refcount.h> #include <sys/zfeature.h> #include <sys/dsl_userhold.h> +#include <sys/abd.h> #include <stdio.h> #include <stdio_ext.h> #include <stdlib.h> @@ -188,6 +189,7 @@ extern uint64_t metaslab_df_alloc_threshold; extern uint64_t zfs_deadman_synctime_ms; extern int metaslab_preload_limit; extern boolean_t zfs_compressed_arc_enabled; +extern boolean_t zfs_abd_scatter_enabled; static ztest_shared_opts_t *ztest_shared_opts; static ztest_shared_opts_t ztest_opts; @@ -5051,7 +5053,7 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) enum zio_checksum checksum = spa_dedup_checksum(spa); dmu_buf_t *db; dmu_tx_t *tx; - void *buf; + abd_t *abd; blkptr_t blk; int copies = 2 * ZIO_DEDUPDITTO_MIN; @@ -5131,14 +5133,14 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) * Damage the block. Dedup-ditto will save us when we read it later. 
*/ psize = BP_GET_PSIZE(&blk); - buf = zio_buf_alloc(psize); - ztest_pattern_set(buf, psize, ~pattern); + abd = abd_alloc_linear(psize, B_TRUE); + ztest_pattern_set(abd_to_buf(abd), psize, ~pattern); (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk, - buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, + abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL)); - zio_buf_free(buf, psize); + abd_free(abd); (void) rw_unlock(&ztest_name_lock); } @@ -5421,6 +5423,12 @@ ztest_resume_thread(void *arg) */ if (ztest_random(10) == 0) zfs_compressed_arc_enabled = ztest_random(2); + + /* + * Periodically change the zfs_abd_scatter_enabled setting. + */ + if (ztest_random(10) == 0) + zfs_abd_scatter_enabled = ztest_random(2); } return (NULL); } diff --git a/usr/src/common/zfs/zfs_fletcher.c b/usr/src/common/zfs/zfs_fletcher.c index a58fa14b7c..c889169b42 100644 --- a/usr/src/common/zfs/zfs_fletcher.c +++ b/usr/src/common/zfs/zfs_fletcher.c @@ -24,6 +24,7 @@ */ /* * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. 
*/ /* @@ -133,17 +134,29 @@ #include <sys/byteorder.h> #include <sys/zio.h> #include <sys/spa.h> +#include <zfs_fletcher.h> -/*ARGSUSED*/ void -fletcher_2_native(const void *buf, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) +fletcher_init(zio_cksum_t *zcp) { + ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); +} + +int +fletcher_2_incremental_native(void *buf, size_t size, void *data) +{ + zio_cksum_t *zcp = data; + const uint64_t *ip = buf; const uint64_t *ipend = ip + (size / sizeof (uint64_t)); uint64_t a0, b0, a1, b1; - for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { + a0 = zcp->zc_word[0]; + a1 = zcp->zc_word[1]; + b0 = zcp->zc_word[2]; + b1 = zcp->zc_word[3]; + + for (; ip < ipend; ip += 2) { a0 += ip[0]; a1 += ip[1]; b0 += a0; @@ -151,18 +164,33 @@ fletcher_2_native(const void *buf, uint64_t size, } ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); + return (0); } /*ARGSUSED*/ void -fletcher_2_byteswap(const void *buf, uint64_t size, +fletcher_2_native(const void *buf, size_t size, const void *ctx_template, zio_cksum_t *zcp) { + fletcher_init(zcp); + (void) fletcher_2_incremental_native((void *) buf, size, zcp); +} + +int +fletcher_2_incremental_byteswap(void *buf, size_t size, void *data) +{ + zio_cksum_t *zcp = data; + const uint64_t *ip = buf; const uint64_t *ipend = ip + (size / sizeof (uint64_t)); uint64_t a0, b0, a1, b1; - for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { + a0 = zcp->zc_word[0]; + a1 = zcp->zc_word[1]; + b0 = zcp->zc_word[2]; + b1 = zcp->zc_word[3]; + + for (; ip < ipend; ip += 2) { a0 += BSWAP_64(ip[0]); a1 += BSWAP_64(ip[1]); b0 += a0; @@ -170,50 +198,23 @@ fletcher_2_byteswap(const void *buf, uint64_t size, } ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); + return (0); } /*ARGSUSED*/ void -fletcher_4_native(const void *buf, uint64_t size, +fletcher_2_byteswap(const void *buf, size_t size, const void *ctx_template, zio_cksum_t *zcp) { - const uint32_t *ip = buf; - const uint32_t *ipend = ip + (size / sizeof (uint32_t)); - uint64_t a, b, c, d; - - 
for (a = b = c = d = 0; ip < ipend; ip++) { - a += ip[0]; - b += a; - c += b; - d += c; - } - - ZIO_SET_CHECKSUM(zcp, a, b, c, d); + fletcher_init(zcp); + (void) fletcher_2_incremental_byteswap((void *) buf, size, zcp); } -/*ARGSUSED*/ -void -fletcher_4_byteswap(const void *buf, uint64_t size, - const void *ctx_template, zio_cksum_t *zcp) +int +fletcher_4_incremental_native(void *buf, size_t size, void *data) { - const uint32_t *ip = buf; - const uint32_t *ipend = ip + (size / sizeof (uint32_t)); - uint64_t a, b, c, d; + zio_cksum_t *zcp = data; - for (a = b = c = d = 0; ip < ipend; ip++) { - a += BSWAP_32(ip[0]); - b += a; - c += b; - d += c; - } - - ZIO_SET_CHECKSUM(zcp, a, b, c, d); -} - -void -fletcher_4_incremental_native(const void *buf, uint64_t size, - zio_cksum_t *zcp) -{ const uint32_t *ip = buf; const uint32_t *ipend = ip + (size / sizeof (uint32_t)); uint64_t a, b, c, d; @@ -231,12 +232,23 @@ fletcher_4_incremental_native(const void *buf, uint64_t size, } ZIO_SET_CHECKSUM(zcp, a, b, c, d); + return (0); } +/*ARGSUSED*/ void -fletcher_4_incremental_byteswap(const void *buf, uint64_t size, - zio_cksum_t *zcp) +fletcher_4_native(const void *buf, size_t size, + const void *ctx_template, zio_cksum_t *zcp) { + fletcher_init(zcp); + (void) fletcher_4_incremental_native((void *) buf, size, zcp); +} + +int +fletcher_4_incremental_byteswap(void *buf, size_t size, void *data) +{ + zio_cksum_t *zcp = data; + const uint32_t *ip = buf; const uint32_t *ipend = ip + (size / sizeof (uint32_t)); uint64_t a, b, c, d; @@ -254,4 +266,14 @@ fletcher_4_incremental_byteswap(const void *buf, uint64_t size, } ZIO_SET_CHECKSUM(zcp, a, b, c, d); + return (0); +} + +/*ARGSUSED*/ +void +fletcher_4_byteswap(const void *buf, size_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) fletcher_4_incremental_byteswap((void *) buf, size, zcp); } diff --git a/usr/src/common/zfs/zfs_fletcher.h b/usr/src/common/zfs/zfs_fletcher.h index a920cc816d..33c6c728cf 
100644 --- a/usr/src/common/zfs/zfs_fletcher.h +++ b/usr/src/common/zfs/zfs_fletcher.h @@ -24,6 +24,7 @@ */ /* * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. */ #ifndef _ZFS_FLETCHER_H @@ -40,12 +41,15 @@ extern "C" { * fletcher checksum functions */ -void fletcher_2_native(const void *, uint64_t, const void *, zio_cksum_t *); -void fletcher_2_byteswap(const void *, uint64_t, const void *, zio_cksum_t *); -void fletcher_4_native(const void *, uint64_t, const void *, zio_cksum_t *); -void fletcher_4_byteswap(const void *, uint64_t, const void *, zio_cksum_t *); -void fletcher_4_incremental_native(const void *, uint64_t, zio_cksum_t *); -void fletcher_4_incremental_byteswap(const void *, uint64_t, zio_cksum_t *); +void fletcher_init(zio_cksum_t *); +void fletcher_2_native(const void *, size_t, const void *, zio_cksum_t *); +void fletcher_2_byteswap(const void *, size_t, const void *, zio_cksum_t *); +int fletcher_2_incremental_native(void *, size_t, void *); +int fletcher_2_incremental_byteswap(void *, size_t, void *); +void fletcher_4_native(const void *, size_t, const void *, zio_cksum_t *); +void fletcher_4_byteswap(const void *, size_t, const void *, zio_cksum_t *); +int fletcher_4_incremental_native(void *, size_t, void *); +int fletcher_4_incremental_byteswap(void *, size_t, void *); #ifdef __cplusplus } diff --git a/usr/src/head/iso/stddef_iso.h b/usr/src/head/iso/stddef_iso.h index b94960793c..37e10aec4b 100644 --- a/usr/src/head/iso/stddef_iso.h +++ b/usr/src/head/iso/stddef_iso.h @@ -82,16 +82,6 @@ typedef unsigned int size_t; /* (historical version) */ } #endif /* end of namespace std */ -#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5) -#define offsetof(s, m) __builtin_offsetof(s, m) -#else -#if __cplusplus >= 199711L -#define offsetof(s, m) (std::size_t)(&(((s *)0)->m)) -#else -#define offsetof(s, m) (size_t)(&(((s *)0)->m)) -#endif -#endif /* GNUC, etc. 
*/ - #if !defined(_MAX_ALIGN_T) #if !defined(_STRICT_SYMBOLS) || defined(_STDC_C11) #define _MAX_ALIGN_T diff --git a/usr/src/head/stddef.h b/usr/src/head/stddef.h index 1e3d016048..6f04b7f7c9 100644 --- a/usr/src/head/stddef.h +++ b/usr/src/head/stddef.h @@ -31,10 +31,9 @@ #ifndef _STDDEF_H #define _STDDEF_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/isa_defs.h> #include <iso/stddef_iso.h> +#include <sys/stddef.h> /* * Allow global visibility for symbols defined in diff --git a/usr/src/lib/libzfs/common/libzfs_sendrecv.c b/usr/src/lib/libzfs/common/libzfs_sendrecv.c index 2641d53e00..4e89dc053d 100644 --- a/usr/src/lib/libzfs/common/libzfs_sendrecv.c +++ b/usr/src/lib/libzfs/common/libzfs_sendrecv.c @@ -192,19 +192,19 @@ dump_record(dmu_replay_record_t *drr, void *payload, int payload_len, { ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); - fletcher_4_incremental_native(drr, + (void) fletcher_4_incremental_native(drr, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc); if (drr->drr_type != DRR_BEGIN) { ASSERT(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u. 
drr_checksum.drr_checksum)); drr->drr_u.drr_checksum.drr_checksum = *zc; } - fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum, - sizeof (zio_cksum_t), zc); + (void) fletcher_4_incremental_native( + &drr->drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), zc); if (write(outfd, drr, sizeof (*drr)) == -1) return (errno); if (payload_len != 0) { - fletcher_4_incremental_native(payload, payload_len, zc); + (void) fletcher_4_incremental_native(payload, payload_len, zc); if (write(outfd, payload, payload_len) == -1) return (errno); } @@ -2093,9 +2093,9 @@ recv_read(libzfs_handle_t *hdl, int fd, void *buf, int ilen, if (zc) { if (byteswap) - fletcher_4_incremental_byteswap(buf, ilen, zc); + (void) fletcher_4_incremental_byteswap(buf, ilen, zc); else - fletcher_4_incremental_native(buf, ilen, zc); + (void) fletcher_4_incremental_native(buf, ilen, zc); } return (0); } @@ -3649,7 +3649,8 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, * recv_read() above; do it again correctly. 
*/ bzero(&zcksum, sizeof (zio_cksum_t)); - fletcher_4_incremental_byteswap(&drr, sizeof (drr), &zcksum); + (void) fletcher_4_incremental_byteswap(&drr, + sizeof (drr), &zcksum); flags->byteswap = B_TRUE; drr.drr_type = BSWAP_32(drr.drr_type); diff --git a/usr/src/lib/libzpool/common/llib-lzpool b/usr/src/lib/libzpool/common/llib-lzpool index d0421bea94..3636b4e76e 100644 --- a/usr/src/lib/libzpool/common/llib-lzpool +++ b/usr/src/lib/libzpool/common/llib-lzpool @@ -61,6 +61,7 @@ #include <sys/dsl_destroy.h> #include <sys/dsl_userhold.h> #include <sys/blkptr.h> +#include <sys/abd.h> extern uint64_t metaslab_gang_bang; extern uint64_t metaslab_df_alloc_threshold; @@ -68,3 +69,4 @@ extern boolean_t zfeature_checks_disable; extern uint64_t zfs_deadman_synctime_ms; extern int metaslab_preload_limit; extern boolean_t zfs_compressed_arc_enabled; +extern boolean_t zfs_abd_scatter_enabled; diff --git a/usr/src/pkg/manifests/driver-storage-mr_sas.mf b/usr/src/pkg/manifests/driver-storage-mr_sas.mf index 32a138a184..d1b39a659e 100644 --- a/usr/src/pkg/manifests/driver-storage-mr_sas.mf +++ b/usr/src/pkg/manifests/driver-storage-mr_sas.mf @@ -23,6 +23,7 @@ # Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. # Copyright 2013 Nexenta Systems, Inc. All rights reserved. # Copyright 2015 Garrett D'Amore <garrett@damore.org> +# Copyright 2017 Citrus IT Limited. All rights reserved. 
# # @@ -45,21 +46,29 @@ dir path=usr/share/man/man7d $(sparc_ONLY)driver name=mr_sas class=scsi-self-identifying \ alias=pci1000,78 \ alias=pci1000,79 \ + alias=pciex1000,52 \ + alias=pciex1000,53 \ alias=pciex1000,5b \ alias=pciex1000,5d \ alias=pciex1000,5f \ alias=pciex1000,71 \ alias=pciex1000,73 \ alias=pciex1000,78 \ - alias=pciex1000,79 + alias=pciex1000,79 \ + alias=pciex1000,ce \ + alias=pciex1000,cf $(i386_ONLY)driver name=mr_sas class=scsi-self-identifying \ + alias=pciex1000,52 \ + alias=pciex1000,53 \ alias=pciex1000,5b \ alias=pciex1000,5d \ alias=pciex1000,5f \ alias=pciex1000,71 \ alias=pciex1000,73 \ alias=pciex1000,78 \ - alias=pciex1000,79 + alias=pciex1000,79 \ + alias=pciex1000,ce \ + alias=pciex1000,cf file path=kernel/drv/$(ARCH64)/mr_sas group=sys $(i386_ONLY)file path=kernel/drv/mr_sas group=sys file path=kernel/drv/mr_sas.conf group=sys diff --git a/usr/src/pkg/manifests/system-header.mf b/usr/src/pkg/manifests/system-header.mf index 7fec376963..4e135e7751 100644 --- a/usr/src/pkg/manifests/system-header.mf +++ b/usr/src/pkg/manifests/system-header.mf @@ -1433,6 +1433,7 @@ file path=usr/include/sys/stat_impl.h file path=usr/include/sys/statfs.h file path=usr/include/sys/statvfs.h file path=usr/include/sys/stdbool.h +file path=usr/include/sys/stddef.h file path=usr/include/sys/stdint.h file path=usr/include/sys/stermio.h file path=usr/include/sys/stream.h diff --git a/usr/src/pkg/manifests/system-test-zfstest.mf b/usr/src/pkg/manifests/system-test-zfstest.mf index 114e9108df..1010ad94ed 100644 --- a/usr/src/pkg/manifests/system-test-zfstest.mf +++ b/usr/src/pkg/manifests/system-test-zfstest.mf @@ -158,6 +158,7 @@ file path=opt/zfs-tests/bin/file_trunc mode=0555 file path=opt/zfs-tests/bin/file_write mode=0555 file path=opt/zfs-tests/bin/getholes mode=0555 file path=opt/zfs-tests/bin/largest_file mode=0555 +file path=opt/zfs-tests/bin/memory_balloon mode=0555 file path=opt/zfs-tests/bin/mkbusy mode=0555 file path=opt/zfs-tests/bin/mkfiles 
mode=0555 file path=opt/zfs-tests/bin/mkholes mode=0555 @@ -2302,14 +2303,18 @@ file path=opt/zfs-tests/tests/perf/regression/random_reads mode=0555 file path=opt/zfs-tests/tests/perf/regression/random_readwrite mode=0555 file path=opt/zfs-tests/tests/perf/regression/random_writes mode=0555 file path=opt/zfs-tests/tests/perf/regression/sequential_reads mode=0555 -file path=opt/zfs-tests/tests/perf/regression/sequential_reads_cached \ +file path=opt/zfs-tests/tests/perf/regression/sequential_reads_arc_cached \ mode=0555 -file path=opt/zfs-tests/tests/perf/regression/sequential_reads_cached_clone \ +file \ + path=opt/zfs-tests/tests/perf/regression/sequential_reads_arc_cached_clone \ + mode=0555 +file path=opt/zfs-tests/tests/perf/regression/sequential_reads_dbuf_cached \ mode=0555 file path=opt/zfs-tests/tests/perf/regression/sequential_writes mode=0555 file path=opt/zfs-tests/tests/perf/regression/setup mode=0555 file path=opt/zfs-tests/tests/perf/scripts/io.d mode=0444 file path=opt/zfs-tests/tests/perf/scripts/prefetch_io.d mode=0444 +file path=opt/zfs-tests/tests/perf/scripts/profile.d mode=0444 license cr_Sun license=cr_Sun license lic_CDDL license=lic_CDDL depend fmri=system/file-system/zfs/tests type=require diff --git a/usr/src/test/zfs-tests/cmd/memory_balloon/Makefile b/usr/src/test/zfs-tests/cmd/memory_balloon/Makefile new file mode 100644 index 0000000000..df1d370356 --- /dev/null +++ b/usr/src/test/zfs-tests/cmd/memory_balloon/Makefile @@ -0,0 +1,22 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. 
+# + +PROG = memory_balloon + +include $(SRC)/cmd/Makefile.cmd + +LINTFLAGS += -erroff=E_FUNC_SET_NOT_USED + +include ../Makefile.subdirs diff --git a/usr/src/test/zfs-tests/cmd/memory_balloon/memory_balloon.c b/usr/src/test/zfs-tests/cmd/memory_balloon/memory_balloon.c new file mode 100644 index 0000000000..958f6e6609 --- /dev/null +++ b/usr/src/test/zfs-tests/cmd/memory_balloon/memory_balloon.c @@ -0,0 +1,103 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +/* + * Steal memory from the kernel, forcing the ARC to decrease in size, and hold + * it until the process receives a signal. 
+ */ + +#include <stdio.h> +#include <sys/types.h> +#include <sys/shm.h> +#include <strings.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> + +static void +usage(char *progname) +{ + (void) fprintf(stderr, "Usage: %s -f <bytes>\n", progname); + exit(1); +} + +static void +fail(char *err, int rval) +{ + perror(err); + exit(rval); +} + +static void +daemonize(void) +{ + pid_t pid; + + if ((pid = fork()) < 0) { + fail("fork", 1); + } else if (pid != 0) { + (void) fprintf(stdout, "%ld\n", pid); + exit(0); + } + + (void) setsid(); + (void) close(0); + (void) close(1); + (void) close(2); +} + +int +main(int argc, char *argv[]) +{ + int c; + boolean_t fflag = B_FALSE; + char *prog = argv[0]; + long long size; + char *stroll_leftovers; + int shm_id; + void *shm_attached; + + while ((c = getopt(argc, argv, "f")) != -1) { + switch (c) { + /* Run in the foreground */ + case 'f': + fflag = B_TRUE; + break; + default: + usage(prog); + } + } + + argc -= optind; + argv += optind; + + if (argc != 1) + usage(prog); + size = strtoll(argv[0], &stroll_leftovers, 10); + if (size <= 0) + fail("invalid size in bytes", 1); + + if ((shm_id = shmget(IPC_PRIVATE, size, IPC_CREAT|IPC_EXCL)) == -1) + fail("shmget", 1); + if ((shm_attached = shmat(shm_id, NULL, SHM_SHARE_MMU)) == (void *)-1) + fail("shmat", 1); + + if (fflag == B_FALSE) + daemonize(); + (void) pause(); + + /* NOTREACHED */ + return (0); +} diff --git a/usr/src/test/zfs-tests/runfiles/perf-regression.run b/usr/src/test/zfs-tests/runfiles/perf-regression.run index 0095931ad5..dbb30f0327 100644 --- a/usr/src/test/zfs-tests/runfiles/perf-regression.run +++ b/usr/src/test/zfs-tests/runfiles/perf-regression.run @@ -10,7 +10,7 @@ # # -# Copyright (c) 2015 by Delphix. All rights reserved. +# Copyright (c) 2015, 2016 by Delphix. All rights reserved. 
# [DEFAULT] @@ -24,7 +24,7 @@ post = cleanup outputdir = /var/tmp/test_results [/opt/zfs-tests/tests/perf/regression] -tests = ['sequential_writes', 'sequential_reads', 'sequential_reads_cached', - 'sequential_reads_cached_clone', 'random_reads', 'random_writes', - 'random_readwrite'] +tests = ['sequential_writes', 'sequential_reads', 'sequential_reads_arc_cached', + 'sequential_reads_arc_cached_clone', 'sequential_reads_dbuf_cached', + 'random_reads', 'random_writes', 'random_readwrite'] post = diff --git a/usr/src/test/zfs-tests/tests/functional/mdb/mdb_001_pos.ksh b/usr/src/test/zfs-tests/tests/functional/mdb/mdb_001_pos.ksh index a4f90be49b..7f6faf690e 100644 --- a/usr/src/test/zfs-tests/tests/functional/mdb/mdb_001_pos.ksh +++ b/usr/src/test/zfs-tests/tests/functional/mdb/mdb_001_pos.ksh @@ -49,26 +49,51 @@ function cleanup verify_runnable "global" log_onexit cleanup -OUTFILE='/var/tmp/mdb-outfile' -set -A dcmds "::walk spa" \ - "::walk spa | ::spa " \ - "::walk spa | ::spa -c" \ - "::walk spa | ::spa -v" \ - "::walk spa | ::spa_config" \ - "::walk spa | ::spa_space" \ - "::walk spa | ::spa_space -b" \ - "::walk spa | ::spa_vdevs" \ - "::walk spa | ::walk metaslab" \ - "::walk spa | ::print struct spa spa_root_vdev | ::vdev" \ - "::walk spa | ::print struct spa spa_root_vdev | ::vdev -re" \ +tmpfile=$(mktemp) +log_must zpool scrub $TESTPOOL + +typeset spa=$(mdb -ke "::spa" | awk "/$TESTPOOL/ {print \$1}") +typeset off_ub=$(mdb -ke "::offsetof spa_t spa_uberblock | =J") +typeset off_rbp=$(mdb -ke "::offsetof uberblock_t ub_rootbp | =J") +typeset bp=$(mdb -ke "$spa + $off_ub + $off_rbp =J") + +# dcmds and walkers skipped due to being DEBUG only or difficult to run: +# ::zfs_params +# ::refcount + +set -A dcmds "::abuf_find 1 2" \ + "::arc" \ + "::arc -b" \ + "::arc_compression_stats" \ + "$bp ::blkptr" \ + "$bp ::dva" \ + "::walk spa" \ + "::spa" \ + "$spa ::spa " \ + "$spa ::spa -c" \ + "$spa ::spa -h" \ + "$spa ::spa -v" \ + "$spa ::spa -Mmh" \ + "$spa 
::spa_config" \ + "$spa ::spa_space" \ + "$spa ::spa_space -b" \ + "$spa ::spa_vdevs" \ + "$spa ::print spa_t spa_root_vdev | ::vdev" \ + "$spa ::print spa_t spa_root_vdev | ::vdev -re" \ + "$spa ::print -a spa_t spa_dsl_pool->dp_dirty_datasets | ::walk txg_list" \ + "$spa ::print -a spa_t spa_uberblock.ub_rootbp | ::blkptr" \ + "$spa ::walk metaslab" \ + "$spa ::walk metaslab | ::head -1 | ::metaslab_weight" \ + "$spa ::walk metaslab | ::head -1 | ::metaslab_trace" \ + "$spa ::walk zio_root | ::zio -c" \ + "$spa ::walk zio_root | ::zio -r" \ + "$spa ::walk zms_freelist" + "$spa ::zfs_blkstats -v" \ "::dbufs" \ "::dbufs -n mos -o mdn -l 0 -b 0" \ "::dbufs | ::dbuf" \ "::dbuf_stats" \ - "::abuf_find 1 2" \ - "::walk spa | ::print -a struct spa spa_uberblock.ub_rootbp | ::blkptr" \ - "::walk spa | ::print -a struct spa spa_dsl_pool->dp_dirty_datasets | ::walk txg_list" \ - "::walk spa | ::walk zms_freelist" + "dbuf_cache ::walk multilist" # # The commands above were supplied by the ZFS development team. The idea is to # do as much checking as possible without the need to hardcode addresses. diff --git a/usr/src/test/zfs-tests/tests/perf/perf.shlib b/usr/src/test/zfs-tests/tests/perf/perf.shlib index 38e30f255d..ff980c0e6e 100644 --- a/usr/src/test/zfs-tests/tests/perf/perf.shlib +++ b/usr/src/test/zfs-tests/tests/perf/perf.shlib @@ -182,6 +182,18 @@ function get_max_arc_size echo $max_arc_size } +function get_max_dbuf_cache_size +{ + typeset -l max_dbuf_cache_size=$(dtrace -qn 'BEGIN { + printf("%u\n", `dbuf_cache_max_bytes); + exit(0); + }') + + [[ $? -eq 0 ]] || log_fail "get_max_dbuf_cache_size failed" + + echo $max_dbuf_cache_size +} + # Create a file with some information about how this system is configured. 
function get_system_config { diff --git a/usr/src/test/zfs-tests/tests/perf/regression/random_reads.ksh b/usr/src/test/zfs-tests/tests/perf/regression/random_reads.ksh index abf05ca719..655366e00c 100644 --- a/usr/src/test/zfs-tests/tests/perf/regression/random_reads.ksh +++ b/usr/src/test/zfs-tests/tests/perf/regression/random_reads.ksh @@ -69,8 +69,9 @@ log_must fio $FIO_SCRIPTS/mkfiles.fio # Set up the scripts and output files that will log performance data. lun_list=$(pool_to_lun_list $PERFPOOL) log_note "Collecting backend IO stats with lun list $lun_list" -export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" - "vmstat 1" "vmstat" "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat") +export collect_scripts=("dtrace -s $PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" + "io" "vmstat 1" "vmstat" "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat" + "dtrace -s $PERF_SCRIPTS/profile.d" "profile" "kstat zfs:0 1" "kstat") log_note "Random reads with $PERF_RUNTYPE settings" do_fio_run random_reads.fio false true diff --git a/usr/src/test/zfs-tests/tests/perf/regression/random_readwrite.ksh b/usr/src/test/zfs-tests/tests/perf/regression/random_readwrite.ksh index 2422f9c658..f41a2b526e 100644 --- a/usr/src/test/zfs-tests/tests/perf/regression/random_readwrite.ksh +++ b/usr/src/test/zfs-tests/tests/perf/regression/random_readwrite.ksh @@ -69,8 +69,9 @@ log_must fio $FIO_SCRIPTS/mkfiles.fio # Set up the scripts and output files that will log performance data. 
lun_list=$(pool_to_lun_list $PERFPOOL) log_note "Collecting backend IO stats with lun list $lun_list" -export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" - "vmstat 1" "vmstat" "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat") +export collect_scripts=("dtrace -s $PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" + "io" "vmstat 1" "vmstat" "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat" + "dtrace -s $PERF_SCRIPTS/profile.d" "profile" "kstat zfs:0 1" "kstat") log_note "Random reads and writes with $PERF_RUNTYPE settings" do_fio_run random_readwrite.fio false true diff --git a/usr/src/test/zfs-tests/tests/perf/regression/random_writes.ksh b/usr/src/test/zfs-tests/tests/perf/regression/random_writes.ksh index c48ae76140..9e201a827c 100644 --- a/usr/src/test/zfs-tests/tests/perf/regression/random_writes.ksh +++ b/usr/src/test/zfs-tests/tests/perf/regression/random_writes.ksh @@ -61,8 +61,9 @@ fi # Set up the scripts and output files that will log performance data. lun_list=$(pool_to_lun_list $PERFPOOL) log_note "Collecting backend IO stats with lun list $lun_list" -export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" - "vmstat 1" "vmstat" "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat") +export collect_scripts=("dtrace -s $PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" + "io" "vmstat 1" "vmstat" "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat" + "dtrace -s $PERF_SCRIPTS/profile.d" "profile" "kstat zfs:0 1" "kstat") log_note "Random writes with $PERF_RUNTYPE settings" do_fio_run random_writes.fio true false diff --git a/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads.ksh b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads.ksh index 60083c8673..580f2d94e4 100644 --- a/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads.ksh +++ b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads.ksh @@ -69,9 +69,10 @@ log_must fio $FIO_SCRIPTS/mkfiles.fio # Set up the scripts and output files that will log performance data. 
lun_list=$(pool_to_lun_list $PERFPOOL) log_note "Collecting backend IO stats with lun list $lun_list" -export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" - "$PERF_SCRIPTS/prefetch_io.d $PERFPOOL 1" "prefetch" "vmstat 1" "vmstat" - "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat") +export collect_scripts=("dtrace -s $PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" + "io" "dtrace -Cs $PERF_SCRIPTS/prefetch_io.d $PERFPOOL 1" "prefetch" + "vmstat 1" "vmstat" "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat" + "dtrace -s $PERF_SCRIPTS/profile.d" "profile" "kstat zfs:0 1" "kstat") log_note "Sequential reads with $PERF_RUNTYPE settings" do_fio_run sequential_reads.fio false true diff --git a/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_cached.ksh b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_arc_cached.ksh index b4365c0871..97bb8bdc31 100644 --- a/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_cached.ksh +++ b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_arc_cached.ksh @@ -68,9 +68,10 @@ log_must fio $FIO_SCRIPTS/mkfiles.fio # Set up the scripts and output files that will log performance data. 
lun_list=$(pool_to_lun_list $PERFPOOL) log_note "Collecting backend IO stats with lun list $lun_list" -export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" - "$PERF_SCRIPTS/prefetch_io.d $PERFPOOL 1" "prefetch" "vmstat 1" "vmstat" - "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat") +export collect_scripts=("dtrace -s $PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" + "io" "dtrace -Cs $PERF_SCRIPTS/prefetch_io.d $PERFPOOL 1" "prefetch" + "vmstat 1" "vmstat" "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat" + "dtrace -s $PERF_SCRIPTS/profile.d" "profile" "kstat zfs:0 1" "kstat") log_note "Sequential cached reads with $PERF_RUNTYPE settings" do_fio_run sequential_reads.fio false false diff --git a/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_cached_clone.ksh b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_arc_cached_clone.ksh index c656eb4643..cfc748c843 100644 --- a/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_cached_clone.ksh +++ b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_arc_cached_clone.ksh @@ -84,9 +84,10 @@ export TESTFS=$PERFPOOL/$TESTCLONE # Set up the scripts and output files that will log performance data. 
lun_list=$(pool_to_lun_list $PERFPOOL) log_note "Collecting backend IO stats with lun list $lun_list" -export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" - "$PERF_SCRIPTS/prefetch_io.d $PERFPOOL 1" "prefetch" "vmstat 1" "vmstat" - "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat") +export collect_scripts=("dtrace -s $PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" + "io" "dtrace -Cs $PERF_SCRIPTS/prefetch_io.d $PERFPOOL 1" "prefetch" + "vmstat 1" "vmstat" "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat" + "dtrace -s $PERF_SCRIPTS/profile.d" "profile" "kstat zfs:0 1" "kstat") log_note "Sequential cached reads from $TESTFS with $PERF_RUNTYPE settings" do_fio_run sequential_reads.fio false false diff --git a/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_dbuf_cached.ksh b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_dbuf_cached.ksh new file mode 100644 index 0000000000..f7ea4b75c6 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_dbuf_cached.ksh @@ -0,0 +1,82 @@ +#!/usr/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# + +# +# Description: +# Trigger fio runs using the sequential_reads job file. The number of runs and +# data collected is determined by the PERF_* variables. See do_fio_run for +# details about these variables. +# +# The files to read from are created prior to the first fio run, and used +# for all fio runs. The ARC is not cleared to ensure that all data is cached. 
+# +# This is basically a copy of the sequential_reads_cached test case, but with +# a smaller dataset so that we can fit everything into the decompressed, linear +# space in the dbuf cache. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/perf/perf.shlib + +function cleanup +{ + log_must zfs destroy $TESTFS +} + +log_assert "Measure IO stats during sequential read load" +log_onexit cleanup + +export TESTFS=$PERFPOOL/testfs +recreate_perfpool +log_must zfs create $PERF_FS_OPTS $TESTFS + +# Ensure the working set can be cached in the dbuf cache. +export TOTAL_SIZE=$(($(get_max_dbuf_cache_size) * 3 / 4)) + +# Variables for use by fio. +if [[ -n $PERF_REGRESSION_WEEKLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_WEEKLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'weekly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'16 64'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'64k'} +elif [[ -n $PERF_REGRESSION_NIGHTLY ]]; then + export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_NIGHTLY} + export PERF_RUNTYPE=${PERF_RUNTYPE:-'nightly'} + export PERF_NTHREADS=${PERF_NTHREADS:-'64'} + export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'} + export PERF_IOSIZES=${PERF_IOSIZES:-'64k'} +fi + +# Layout the files to be used by the read tests. Create as many files as the +# largest number of threads. An fio run with fewer threads will use a subset +# of the available files. +export NUMJOBS=$(get_max $PERF_NTHREADS) +export FILE_SIZE=$((TOTAL_SIZE / NUMJOBS)) +log_must fio $FIO_SCRIPTS/mkfiles.fio + +# Set up the scripts and output files that will log performance data. 
+lun_list=$(pool_to_lun_list $PERFPOOL) +log_note "Collecting backend IO stats with lun list $lun_list" +export collect_scripts=("dtrace -s $PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" + "io" "dtrace -Cs $PERF_SCRIPTS/prefetch_io.d $PERFPOOL 1" "prefetch" + "vmstat 1" "vmstat" "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat" + "dtrace -s $PERF_SCRIPTS/profile.d" "profile" "kstat zfs:0 1" "kstat") + +log_note "Sequential cached reads with $PERF_RUNTYPE settings" +do_fio_run sequential_reads.fio false false +log_pass "Measure IO stats during sequential cached read load" diff --git a/usr/src/test/zfs-tests/tests/perf/regression/sequential_writes.ksh b/usr/src/test/zfs-tests/tests/perf/regression/sequential_writes.ksh index e2f2cca0d2..493a3d18b7 100644 --- a/usr/src/test/zfs-tests/tests/perf/regression/sequential_writes.ksh +++ b/usr/src/test/zfs-tests/tests/perf/regression/sequential_writes.ksh @@ -61,8 +61,9 @@ fi # Set up the scripts and output files that will log performance data. lun_list=$(pool_to_lun_list $PERFPOOL) log_note "Collecting backend IO stats with lun list $lun_list" -export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io" - "vmstat 1" "vmstat" "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat") +export collect_scripts=("dtrace -s $PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" + "io" "vmstat 1" "vmstat" "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat" + "dtrace -s $PERF_SCRIPTS/profile.d" "profile" "kstat zfs:0 1" "kstat") log_note "Sequential writes with $PERF_RUNTYPE settings" do_fio_run sequential_writes.fio true false diff --git a/usr/src/test/zfs-tests/tests/perf/scripts/profile.d b/usr/src/test/zfs-tests/tests/perf/scripts/profile.d new file mode 100644 index 0000000000..e7fbd1fca5 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/perf/scripts/profile.d @@ -0,0 +1,37 @@ +#!/usr/sbin/dtrace -s + +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +#pragma D option stackframes=100 + +/* + * @stacks: The number of times a stack has been recorded + */ + +profile-997 +/ arg0 / +{ + @stacks[stack()] = count(); +} + +ERROR +{ + trace(arg1); + trace(arg2); + trace(arg3); + trace(arg4); + trace(arg5); +} diff --git a/usr/src/tools/mbh_patch/Makefile b/usr/src/tools/mbh_patch/Makefile index 68d2559864..95e8442340 100644 --- a/usr/src/tools/mbh_patch/Makefile +++ b/usr/src/tools/mbh_patch/Makefile @@ -23,7 +23,6 @@ # Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" include ../Makefile.tools diff --git a/usr/src/tools/mbh_patch/mbh_patch.c b/usr/src/tools/mbh_patch/mbh_patch.c index 8a5fa4cd31..204009ab4d 100644 --- a/usr/src/tools/mbh_patch/mbh_patch.c +++ b/usr/src/tools/mbh_patch/mbh_patch.c @@ -24,8 +24,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <stdlib.h> #include <errno.h> #include <fcntl.h> @@ -37,7 +35,9 @@ #include <sys/elf_notes.h> #include <sys/mman.h> #include <sys/stat.h> +#include <sys/sysmacros.h> #include "sys/multiboot.h" +#include "sys/multiboot2.h" static char *pname; static char *fname; @@ -46,7 +46,57 @@ static char *image; /* pointer to the ELF file in memory */ #define ELFSEEK(offset) ((void *)(image + offset)) /* - * patch the load address / entry address + * Find MB2 header tags for entry and patch it. + * The first tag is right after header. 
+ */ +static int +patch64_mb2(multiboot2_header_t *mbh2, int file_offset, + Elf64_Addr ptload_start, Elf32_Off ptload_offset) +{ + multiboot_header_tag_t *tagp = mbh2->mb2_tags; + multiboot_header_tag_address_t *mbaddr = NULL; + multiboot_header_tag_entry_address_t *mbentry = NULL; + + /* + * Loop until we get end TAG or we have both tags. + */ + while (tagp->mbh_type != MULTIBOOT_HEADER_TAG_END && + (mbaddr == NULL || mbentry == NULL)) { + switch (tagp->mbh_type) { + case MULTIBOOT_HEADER_TAG_ADDRESS: + mbaddr = (multiboot_header_tag_address_t *)tagp; + break; + case MULTIBOOT_HEADER_TAG_ENTRY_ADDRESS: + mbentry = (multiboot_header_tag_entry_address_t *)tagp; + break; + } + tagp = (multiboot_header_tag_t *) + ((uintptr_t)tagp + + P2ROUNDUP(tagp->mbh_size, MULTIBOOT_TAG_ALIGN)); + } + + if (mbaddr == NULL || mbentry == NULL) { + (void) fprintf(stderr, "Missing multiboot2 %s tag\n", + (mbaddr == NULL)? "address" : "entry"); + return (1); + } + + /* Patch it. */ + mbaddr->mbh_load_addr = ptload_start - ptload_offset; + mbaddr->mbh_header_addr = mbaddr->mbh_load_addr + file_offset; + mbentry->mbh_entry_addr = ptload_start; + +#ifdef VERBOSE + (void) printf(" ELF64 MB2 header patched\n"); + (void) printf("\tload_addr now: 0x%x\n", mbaddr->mbh_load_addr); + (void) printf("\theader_addr now: 0x%x\n", mbaddr->mbh_header_addr); + (void) printf("\tentry_addr now: 0x%x\n", mbentry->mbh_entry_addr); +#endif + return (0); +} + +/* + * Patch the load address / entry address for MB1 and MB2 if present. * Find the physical load address of the 1st PT_LOAD segment. * Find the amount that e_entry exceeds that amount. * Now go back and subtract the excess from the p_paddr of the LOAD segment. 
@@ -56,8 +106,9 @@ patch64(Elf64_Ehdr *eh) { Elf64_Phdr *phdr; caddr_t phdrs = NULL; - int ndx, mem; + int ndx, mem, mem2; multiboot_header_t *mbh; + multiboot2_header_t *mbh2; /* * Verify some ELF basics - this must be an executable with program @@ -84,7 +135,7 @@ patch64(Elf64_Ehdr *eh) } /* - * Look for multiboot header. It must be 32-bit aligned and + * Look for multiboot1 header. It must be 32-bit aligned and * completely contained in the 1st 8K of the file. */ for (mem = 0; mem < 8192 - sizeof (multiboot_header_t); mem += 4) { @@ -100,6 +151,30 @@ patch64(Elf64_Ehdr *eh) } /* + * Look for multiboot2 header. It must be 64-bit aligned and + * completely contained in the 1st 32K of the file. + * We do not require it to be present. + */ + ndx = 0; + for (mem2 = 0; + mem2 <= MULTIBOOT_SEARCH - sizeof (multiboot2_header_t); + mem2 += MULTIBOOT_HEADER_ALIGN) { + mbh2 = ELFSEEK(mem2); + ndx = mbh2->mb2_header_length; + if (mbh2->mb2_magic == MULTIBOOT2_HEADER_MAGIC) + break; + ndx = 0; + } + + if (ndx == 0 || mem2 + ndx > MULTIBOOT_SEARCH) { +#ifdef VERBOSE + (void) fprintf(stderr, "%s: %s: Didn't find multiboot2 " + "header\n", pname, fname); +#endif + mbh2 = NULL; + } + + /* * Find the 1:1 mapped PT_LOAD section */ for (ndx = 0; ndx < eh->e_phnum; ndx++) { @@ -135,6 +210,16 @@ patch64(Elf64_Ehdr *eh) return (1); } + if (mbh2 != NULL && ((mem2 < phdr->p_offset) || + (mem2 >= (phdr->p_offset + phdr->p_filesz)))) { +#ifdef VERBOSE + (void) fprintf(stderr, "%s: %s: multiboot2 header not" + " in 1st PT_LOAD\n", pname, fname); +#endif + mem2 = 0; + mbh2 = NULL; + } + /* * Patch the multiboot header fields to get entire file loaded. * Grub uses the MB header for 64 bit loading. 
@@ -148,6 +233,9 @@ patch64(Elf64_Ehdr *eh) (void) printf("\tentry_addr now: 0x%x\n", mbh->entry_addr); (void) printf("\theader_addr now: 0x%x\n", mbh->header_addr); #endif + if (mbh2 != NULL) + return (patch64_mb2(mbh2, mem2, phdr->p_paddr, + phdr->p_offset)); return (0); } @@ -162,9 +250,10 @@ main(int argc, char **argv) int fd; uchar_t *ident; void *hdr = NULL; + struct stat sb; /* - * we expect one argument -- the elf file + * We expect one argument -- the elf file. */ if (argc != 2) { (void) fprintf(stderr, "usage: %s <unix-elf-file>\n", argv[0]); @@ -184,11 +273,25 @@ main(int argc, char **argv) return (1); } + if (fstat(fd, &sb) != 0) { + (void) fprintf(stderr, "%s: fstat failed: %s\n", + pname, strerror(errno)); + return (1); + } + + /* Make sure we have at least MULTIBOOT_SEARCH bytes. */ + if (sb.st_size < MULTIBOOT_SEARCH) { + (void) fprintf(stderr, "%s: %s is too small for a kernel\n", + pname, fname); + return (1); + } + /* - * mmap just the 1st 8K -- since that's where the GRUB - * multiboot header must be located. + * mmap the 1st 32K -- MB1 header is within first 8k and MB2 header + * is within 32k. 
*/ - image = mmap(NULL, 8192, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + image = mmap(NULL, MULTIBOOT_SEARCH, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); if (image == MAP_FAILED) { (void) fprintf(stderr, "%s: mmap() of %s failed: %s\n", pname, fname, strerror(errno)); diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 4aaa968965..450c903674 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -1378,6 +1378,7 @@ SMBFS_OBJS += smbfs_vfsops.o smbfs_vnops.o smbfs_node.o \ BOOTFS_OBJS += bootfs_construct.o bootfs_vfsops.o bootfs_vnops.o ZFS_COMMON_OBJS += \ + abd.o \ arc.o \ blkptr.o \ bplist.o \ diff --git a/usr/src/uts/common/fs/zfs/abd.c b/usr/src/uts/common/fs/zfs/abd.c new file mode 100644 index 0000000000..932ba800ed --- /dev/null +++ b/usr/src/uts/common/fs/zfs/abd.c @@ -0,0 +1,940 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +/* + * ARC buffer data (ABD). + * + * ABDs are an abstract data structure for the ARC which can use two + * different ways of storing the underlying data: + * + * (a) Linear buffer. In this case, all the data in the ABD is stored in one + * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache). + * + * +-------------------+ + * | ABD (linear) | + * | abd_flags = ... | + * | abd_size = ... 
| +--------------------------------+ + * | abd_buf ------------->| raw buffer of size abd_size | + * +-------------------+ +--------------------------------+ + * no abd_chunks + * + * (b) Scattered buffer. In this case, the data in the ABD is split into + * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers + * to the chunks recorded in an array at the end of the ABD structure. + * + * +-------------------+ + * | ABD (scattered) | + * | abd_flags = ... | + * | abd_size = ... | + * | abd_offset = 0 | +-----------+ + * | abd_chunks[0] ----------------------------->| chunk 0 | + * | abd_chunks[1] ---------------------+ +-----------+ + * | ... | | +-----------+ + * | abd_chunks[N-1] ---------+ +------->| chunk 1 | + * +-------------------+ | +-----------+ + * | ... + * | +-----------+ + * +----------------->| chunk N-1 | + * +-----------+ + * + * Using a large proportion of scattered ABDs decreases ARC fragmentation since + * when we are at the limit of allocatable space, using equal-size chunks will + * allow us to quickly reclaim enough space for a new large allocation (assuming + * it is also scattered). + * + * In addition to directly allocating a linear or scattered ABD, it is also + * possible to create an ABD by requesting the "sub-ABD" starting at an offset + * within an existing ABD. In linear buffers this is simple (set abd_buf of + * the new ABD to the starting point within the original raw buffer), but + * scattered ABDs are a little more complex. The new ABD makes a copy of the + * relevant abd_chunks pointers (but not the underlying data). However, to + * provide arbitrary rather than only chunk-aligned starting offsets, it also + * tracks an abd_offset field which represents the starting point of the data + * within the first chunk in abd_chunks. For both linear and scattered ABDs, + * creating an offset ABD marks the original ABD as the offset's parent, and the + * original ABD's abd_children refcount is incremented. 
This data allows us to + * ensure the root ABD isn't deleted before its children. + * + * Most consumers should never need to know what type of ABD they're using -- + * the ABD public API ensures that it's possible to transparently switch from + * using a linear ABD to a scattered one when doing so would be beneficial. + * + * If you need to use the data within an ABD directly, if you know it's linear + * (because you allocated it) you can use abd_to_buf() to access the underlying + * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions + * which will allocate a raw buffer if necessary. Use the abd_return_buf* + * functions to return any raw buffers that are no longer necessary when you're + * done using them. + * + * There are a variety of ABD APIs that implement basic buffer operations: + * compare, copy, read, write, and fill with zeroes. If you need a custom + * function which progressively accesses the whole ABD, use the abd_iterate_* + * functions. + */ + +#include <sys/abd.h> +#include <sys/param.h> +#include <sys/zio.h> +#include <sys/zfs_context.h> +#include <sys/zfs_znode.h> + +typedef struct abd_stats { + kstat_named_t abdstat_struct_size; + kstat_named_t abdstat_scatter_cnt; + kstat_named_t abdstat_scatter_data_size; + kstat_named_t abdstat_scatter_chunk_waste; + kstat_named_t abdstat_linear_cnt; + kstat_named_t abdstat_linear_data_size; +} abd_stats_t; + +static abd_stats_t abd_stats = { + /* Amount of memory occupied by all of the abd_t struct allocations */ + { "struct_size", KSTAT_DATA_UINT64 }, + /* + * The number of scatter ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset()). 
+ */ + { "scatter_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ + { "scatter_data_size", KSTAT_DATA_UINT64 }, + /* + * The amount of space wasted at the end of the last chunk across all + * scatter ABDs tracked by scatter_cnt. + */ + { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, + /* + * The number of linear ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset() and abd_get_from_buf()). If an + * ABD takes ownership of its buf then it will become tracked. + */ + { "linear_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all linear ABDs tracked by linear_cnt */ + { "linear_data_size", KSTAT_DATA_UINT64 }, +}; + +#define ABDSTAT(stat) (abd_stats.stat.value.ui64) +#define ABDSTAT_INCR(stat, val) \ + atomic_add_64(&abd_stats.stat.value.ui64, (val)) +#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) +#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) + +/* + * It is possible to make all future ABDs be linear by setting this to B_FALSE. + * Otherwise, ABDs are allocated scattered by default unless the caller uses + * abd_alloc_linear(). + */ +boolean_t zfs_abd_scatter_enabled = B_TRUE; + +/* + * The size of the chunks ABD allocates. Because the sizes allocated from the + * kmem_cache can't change, this tunable can only be modified at boot. Changing + * it at runtime would cause ABD iteration to work incorrectly for ABDs which + * were allocated with the old size, so a safeguard has been put in place which + * will cause the machine to panic if you change it and try to access the data + * within a scattered ABD. 
+ */ +size_t zfs_abd_chunk_size = 4096; + +#ifdef _KERNEL +extern vmem_t *zio_alloc_arena; +#endif + +kmem_cache_t *abd_chunk_cache; +static kstat_t *abd_ksp; + +static void * +abd_alloc_chunk() +{ + void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE); + ASSERT3P(c, !=, NULL); + return (c); +} + +static void +abd_free_chunk(void *c) +{ + kmem_cache_free(abd_chunk_cache, c); +} + +void +abd_init(void) +{ + vmem_t *data_alloc_arena = NULL; + +#ifdef _KERNEL + data_alloc_arena = zio_alloc_arena; +#endif + + /* + * Since ABD chunks do not appear in crash dumps, we pass KMC_NOTOUCH + * so that no allocator metadata is stored with the buffers. + */ + abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0, + NULL, NULL, NULL, NULL, data_alloc_arena, KMC_NOTOUCH); + + abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, + sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (abd_ksp != NULL) { + abd_ksp->ks_data = &abd_stats; + kstat_install(abd_ksp); + } +} + +void +abd_fini(void) +{ + if (abd_ksp != NULL) { + kstat_delete(abd_ksp); + abd_ksp = NULL; + } + + kmem_cache_destroy(abd_chunk_cache); + abd_chunk_cache = NULL; +} + +static inline size_t +abd_chunkcnt_for_bytes(size_t size) +{ + return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size); +} + +static inline size_t +abd_scatter_chunkcnt(abd_t *abd) +{ + ASSERT(!abd_is_linear(abd)); + return (abd_chunkcnt_for_bytes( + abd->abd_u.abd_scatter.abd_offset + abd->abd_size)); +} + +static inline void +abd_verify(abd_t *abd) +{ + ASSERT3U(abd->abd_size, >, 0); + ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | + ABD_FLAG_OWNER | ABD_FLAG_META)); + IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); + IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); + if (abd_is_linear(abd)) { + ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL); + } else { + 
ASSERT3U(abd->abd_u.abd_scatter.abd_offset, <, + zfs_abd_chunk_size); + size_t n = abd_scatter_chunkcnt(abd); + for (int i = 0; i < n; i++) { + ASSERT3P( + abd->abd_u.abd_scatter.abd_chunks[i], !=, NULL); + } + } +} + +static inline abd_t * +abd_alloc_struct(size_t chunkcnt) +{ + size_t size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); + abd_t *abd = kmem_alloc(size, KM_PUSHPAGE); + ASSERT3P(abd, !=, NULL); + ABDSTAT_INCR(abdstat_struct_size, size); + + return (abd); +} + +static inline void +abd_free_struct(abd_t *abd) +{ + size_t chunkcnt = abd_is_linear(abd) ? 0 : abd_scatter_chunkcnt(abd); + int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); + kmem_free(abd, size); + ABDSTAT_INCR(abdstat_struct_size, -size); +} + +/* + * Allocate an ABD, along with its own underlying data buffers. Use this if you + * don't care whether the ABD is linear or not. + */ +abd_t * +abd_alloc(size_t size, boolean_t is_metadata) +{ + if (!zfs_abd_scatter_enabled) + return (abd_alloc_linear(size, is_metadata)); + + VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); + + size_t n = abd_chunkcnt_for_bytes(size); + abd_t *abd = abd_alloc_struct(n); + + abd->abd_flags = ABD_FLAG_OWNER; + if (is_metadata) { + abd->abd_flags |= ABD_FLAG_META; + } + abd->abd_size = size; + abd->abd_parent = NULL; + refcount_create(&abd->abd_children); + + abd->abd_u.abd_scatter.abd_offset = 0; + abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size; + + for (int i = 0; i < n; i++) { + void *c = abd_alloc_chunk(); + ASSERT3P(c, !=, NULL); + abd->abd_u.abd_scatter.abd_chunks[i] = c; + } + + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + n * zfs_abd_chunk_size - size); + + return (abd); +} + +static void +abd_free_scatter(abd_t *abd) +{ + size_t n = abd_scatter_chunkcnt(abd); + for (int i = 0; i < n; i++) { + abd_free_chunk(abd->abd_u.abd_scatter.abd_chunks[i]); + } + + refcount_destroy(&abd->abd_children); + 
ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + abd->abd_size - n * zfs_abd_chunk_size); + + abd_free_struct(abd); +} + +/* + * Allocate an ABD that must be linear, along with its own underlying data + * buffer. Only use this when it would be very annoying to write your ABD + * consumer with a scattered ABD. + */ +abd_t * +abd_alloc_linear(size_t size, boolean_t is_metadata) +{ + abd_t *abd = abd_alloc_struct(0); + + VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); + + abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER; + if (is_metadata) { + abd->abd_flags |= ABD_FLAG_META; + } + abd->abd_size = size; + abd->abd_parent = NULL; + refcount_create(&abd->abd_children); + + if (is_metadata) { + abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size); + } else { + abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size); + } + + ABDSTAT_BUMP(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, size); + + return (abd); +} + +static void +abd_free_linear(abd_t *abd) +{ + if (abd->abd_flags & ABD_FLAG_META) { + zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); + } else { + zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); + } + + refcount_destroy(&abd->abd_children); + ABDSTAT_BUMPDOWN(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); + + abd_free_struct(abd); +} + +/* + * Free an ABD. Only use this on ABDs allocated with abd_alloc() or + * abd_alloc_linear(). + */ +void +abd_free(abd_t *abd) +{ + abd_verify(abd); + ASSERT3P(abd->abd_parent, ==, NULL); + ASSERT(abd->abd_flags & ABD_FLAG_OWNER); + if (abd_is_linear(abd)) + abd_free_linear(abd); + else + abd_free_scatter(abd); +} + +/* + * Allocate an ABD of the same format (same metadata flag, same scatterize + * setting) as another ABD. 
+ */ +abd_t * +abd_alloc_sametype(abd_t *sabd, size_t size) +{ + boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0; + if (abd_is_linear(sabd)) { + return (abd_alloc_linear(size, is_metadata)); + } else { + return (abd_alloc(size, is_metadata)); + } +} + +/* + * If we're going to use this ABD for doing I/O using the block layer, the + * consumer of the ABD data doesn't care if it's scattered or not, and we don't + * plan to store this ABD in memory for a long period of time, we should + * allocate the ABD type that requires the least data copying to do the I/O. + * + * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os + * using a scatter/gather list we should switch to that and replace this call + * with vanilla abd_alloc(). + */ +abd_t * +abd_alloc_for_io(size_t size, boolean_t is_metadata) +{ + return (abd_alloc_linear(size, is_metadata)); +} + +/* + * Allocate a new ABD to point to offset off of sabd. It shares the underlying + * buffer data with sabd. Use abd_put() to free. sabd must not be freed while + * any derived ABDs exist. + */ +abd_t * +abd_get_offset(abd_t *sabd, size_t off) +{ + abd_t *abd; + + abd_verify(sabd); + ASSERT3U(off, <=, sabd->abd_size); + + if (abd_is_linear(sabd)) { + abd = abd_alloc_struct(0); + + /* + * Even if this buf is filesystem metadata, we only track that + * if we own the underlying data buffer, which is not true in + * this case. Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = ABD_FLAG_LINEAR; + + abd->abd_u.abd_linear.abd_buf = + (char *)sabd->abd_u.abd_linear.abd_buf + off; + } else { + size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off; + size_t chunkcnt = abd_scatter_chunkcnt(sabd) - + (new_offset / zfs_abd_chunk_size); + + abd = abd_alloc_struct(chunkcnt); + + /* + * Even if this buf is filesystem metadata, we only track that + * if we own the underlying data buffer, which is not true in + * this case. Therefore, we don't ever use ABD_FLAG_META here. 
+ */ + abd->abd_flags = 0; + + abd->abd_u.abd_scatter.abd_offset = + new_offset % zfs_abd_chunk_size; + abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size; + + /* Copy the scatterlist starting at the correct offset */ + (void) memcpy(&abd->abd_u.abd_scatter.abd_chunks, + &sabd->abd_u.abd_scatter.abd_chunks[new_offset / + zfs_abd_chunk_size], + chunkcnt * sizeof (void *)); + } + + abd->abd_size = sabd->abd_size - off; + abd->abd_parent = sabd; + refcount_create(&abd->abd_children); + (void) refcount_add_many(&sabd->abd_children, abd->abd_size, abd); + + return (abd); +} + +/* + * Allocate a linear ABD structure for buf. You must free this with abd_put() + * since the resulting ABD doesn't own its own buffer. + */ +abd_t * +abd_get_from_buf(void *buf, size_t size) +{ + abd_t *abd = abd_alloc_struct(0); + + VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); + + /* + * Even if this buf is filesystem metadata, we only track that if we + * own the underlying data buffer, which is not true in this case. + * Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = ABD_FLAG_LINEAR; + abd->abd_size = size; + abd->abd_parent = NULL; + refcount_create(&abd->abd_children); + + abd->abd_u.abd_linear.abd_buf = buf; + + return (abd); +} + +/* + * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not + * free the underlying scatterlist or buffer. + */ +void +abd_put(abd_t *abd) +{ + abd_verify(abd); + ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); + + if (abd->abd_parent != NULL) { + (void) refcount_remove_many(&abd->abd_parent->abd_children, + abd->abd_size, abd); + } + + refcount_destroy(&abd->abd_children); + abd_free_struct(abd); +} + +/* + * Get the raw buffer associated with a linear ABD. + */ +void * +abd_to_buf(abd_t *abd) +{ + ASSERT(abd_is_linear(abd)); + abd_verify(abd); + return (abd->abd_u.abd_linear.abd_buf); +} + +/* + * Borrow a raw buffer from an ABD without copying the contents of the ABD + * into the buffer. 
If the ABD is scattered, this will allocate a raw buffer + * whose contents are undefined. To copy over the existing data in the ABD, use + * abd_borrow_buf_copy() instead. + */ +void * +abd_borrow_buf(abd_t *abd, size_t n) +{ + void *buf; + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, n); + if (abd_is_linear(abd)) { + buf = abd_to_buf(abd); + } else { + buf = zio_buf_alloc(n); + } + (void) refcount_add_many(&abd->abd_children, n, buf); + + return (buf); +} + +void * +abd_borrow_buf_copy(abd_t *abd, size_t n) +{ + void *buf = abd_borrow_buf(abd, n); + if (!abd_is_linear(abd)) { + abd_copy_to_buf(buf, abd, n); + } + return (buf); +} + +/* + * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will + * not change the contents of the ABD and will ASSERT that you didn't modify + * the buffer since it was borrowed. If you want any changes you made to buf to + * be copied back to abd, use abd_return_buf_copy() instead. + */ +void +abd_return_buf(abd_t *abd, void *buf, size_t n) +{ + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, n); + if (abd_is_linear(abd)) { + ASSERT3P(buf, ==, abd_to_buf(abd)); + } else { + ASSERT0(abd_cmp_buf(abd, buf, n)); + zio_buf_free(buf, n); + } + (void) refcount_remove_many(&abd->abd_children, n, buf); +} + +void +abd_return_buf_copy(abd_t *abd, void *buf, size_t n) +{ + if (!abd_is_linear(abd)) { + abd_copy_from_buf(abd, buf, n); + } + abd_return_buf(abd, buf, n); +} + +/* + * Give this ABD ownership of the buffer that it's storing. Can only be used on + * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated + * with abd_alloc_linear() which subsequently released ownership of their buf + * with abd_release_ownership_of_buf(). 
+ */ +void +abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata) +{ + ASSERT(abd_is_linear(abd)); + ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); + abd_verify(abd); + + abd->abd_flags |= ABD_FLAG_OWNER; + if (is_metadata) { + abd->abd_flags |= ABD_FLAG_META; + } + + ABDSTAT_BUMP(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); +} + +void +abd_release_ownership_of_buf(abd_t *abd) +{ + ASSERT(abd_is_linear(abd)); + ASSERT(abd->abd_flags & ABD_FLAG_OWNER); + abd_verify(abd); + + abd->abd_flags &= ~ABD_FLAG_OWNER; + /* Disable this flag since we no longer own the data buffer */ + abd->abd_flags &= ~ABD_FLAG_META; + + ABDSTAT_BUMPDOWN(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); +} + +struct abd_iter { + abd_t *iter_abd; /* ABD being iterated through */ + size_t iter_pos; /* position (relative to abd_offset) */ + void *iter_mapaddr; /* addr corresponding to iter_pos */ + size_t iter_mapsize; /* length of data valid at mapaddr */ +}; + +static inline size_t +abd_iter_scatter_chunk_offset(struct abd_iter *aiter) +{ + ASSERT(!abd_is_linear(aiter->iter_abd)); + return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset + + aiter->iter_pos) % zfs_abd_chunk_size); +} + +static inline size_t +abd_iter_scatter_chunk_index(struct abd_iter *aiter) +{ + ASSERT(!abd_is_linear(aiter->iter_abd)); + return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset + + aiter->iter_pos) / zfs_abd_chunk_size); +} + +/* + * Initialize the abd_iter. + */ +static void +abd_iter_init(struct abd_iter *aiter, abd_t *abd) +{ + abd_verify(abd); + aiter->iter_abd = abd; + aiter->iter_pos = 0; + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; +} + +/* + * Advance the iterator by a certain amount. Cannot be called when a chunk is + * in use. This can be safely called when the aiter has already exhausted, in + * which case this does nothing. 
+ */ +static void +abd_iter_advance(struct abd_iter *aiter, size_t amount) +{ + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* There's nothing left to advance to, so do nothing */ + if (aiter->iter_pos == aiter->iter_abd->abd_size) + return; + + aiter->iter_pos += amount; +} + +/* + * Map the current chunk into aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. + */ +static void +abd_iter_map(struct abd_iter *aiter) +{ + void *paddr; + size_t offset = 0; + + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* Panic if someone has changed zfs_abd_chunk_size */ + IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size == + aiter->iter_abd->abd_u.abd_scatter.abd_chunk_size); + + /* There's nothing left to iterate over, so do nothing */ + if (aiter->iter_pos == aiter->iter_abd->abd_size) + return; + + if (abd_is_linear(aiter->iter_abd)) { + offset = aiter->iter_pos; + aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; + paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf; + } else { + size_t index = abd_iter_scatter_chunk_index(aiter); + offset = abd_iter_scatter_chunk_offset(aiter); + aiter->iter_mapsize = zfs_abd_chunk_size - offset; + paddr = aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index]; + } + aiter->iter_mapaddr = (char *)paddr + offset; +} + +/* + * Unmap the current chunk from aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. 
+ */ +static void +abd_iter_unmap(struct abd_iter *aiter) +{ + /* There's nothing left to unmap, so do nothing */ + if (aiter->iter_pos == aiter->iter_abd->abd_size) + return; + + ASSERT3P(aiter->iter_mapaddr, !=, NULL); + ASSERT3U(aiter->iter_mapsize, >, 0); + + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; +} + +int +abd_iterate_func(abd_t *abd, size_t off, size_t size, + abd_iter_func_t *func, void *private) +{ + int ret = 0; + struct abd_iter aiter; + + abd_verify(abd); + ASSERT3U(off + size, <=, abd->abd_size); + + abd_iter_init(&aiter, abd); + abd_iter_advance(&aiter, off); + + while (size > 0) { + abd_iter_map(&aiter); + + size_t len = MIN(aiter.iter_mapsize, size); + ASSERT3U(len, >, 0); + + ret = func(aiter.iter_mapaddr, len, private); + + abd_iter_unmap(&aiter); + + if (ret != 0) + break; + + size -= len; + abd_iter_advance(&aiter, len); + } + + return (ret); +} + +struct buf_arg { + void *arg_buf; +}; + +static int +abd_copy_to_buf_off_cb(void *buf, size_t size, void *private) +{ + struct buf_arg *ba_ptr = private; + + (void) memcpy(ba_ptr->arg_buf, buf, size); + ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; + + return (0); +} + +/* + * Copy abd to buf. (off is the offset in abd.) + */ +void +abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size) +{ + struct buf_arg ba_ptr = { buf }; + + (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb, + &ba_ptr); +} + +static int +abd_cmp_buf_off_cb(void *buf, size_t size, void *private) +{ + int ret; + struct buf_arg *ba_ptr = private; + + ret = memcmp(buf, ba_ptr->arg_buf, size); + ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; + + return (ret); +} + +/* + * Compare the contents of abd to buf. (off is the offset in abd.) 
+ */ +int +abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) +{ + struct buf_arg ba_ptr = { (void *) buf }; + + return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr)); +} + +static int +abd_copy_from_buf_off_cb(void *buf, size_t size, void *private) +{ + struct buf_arg *ba_ptr = private; + + (void) memcpy(buf, ba_ptr->arg_buf, size); + ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; + + return (0); +} + +/* + * Copy from buf to abd. (off is the offset in abd.) + */ +void +abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) +{ + struct buf_arg ba_ptr = { (void *) buf }; + + (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb, + &ba_ptr); +} + +/*ARGSUSED*/ +static int +abd_zero_off_cb(void *buf, size_t size, void *private) +{ + (void) memset(buf, 0, size); + return (0); +} + +/* + * Zero out size bytes of the abd, starting at offset off. + */ +void +abd_zero_off(abd_t *abd, size_t off, size_t size) +{ + (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL); +} + +/* + * Iterate over two ABDs and call func incrementally on the two ABDs' data in + * equal-sized chunks (passed to func as raw buffers). func could be called many + * times during this iteration. 
+ */ +int +abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, + size_t size, abd_iter_func2_t *func, void *private) +{ + int ret = 0; + struct abd_iter daiter, saiter; + + abd_verify(dabd); + abd_verify(sabd); + + ASSERT3U(doff + size, <=, dabd->abd_size); + ASSERT3U(soff + size, <=, sabd->abd_size); + + abd_iter_init(&daiter, dabd); + abd_iter_init(&saiter, sabd); + abd_iter_advance(&daiter, doff); + abd_iter_advance(&saiter, soff); + + while (size > 0) { + abd_iter_map(&daiter); + abd_iter_map(&saiter); + + size_t dlen = MIN(daiter.iter_mapsize, size); + size_t slen = MIN(saiter.iter_mapsize, size); + size_t len = MIN(dlen, slen); + ASSERT(dlen > 0 || slen > 0); + + ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len, + private); + + abd_iter_unmap(&saiter); + abd_iter_unmap(&daiter); + + if (ret != 0) + break; + + size -= len; + abd_iter_advance(&daiter, len); + abd_iter_advance(&saiter, len); + } + + return (ret); +} + +/*ARGSUSED*/ +static int +abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private) +{ + (void) memcpy(dbuf, sbuf, size); + return (0); +} + +/* + * Copy from sabd to dabd starting from soff and doff. + */ +void +abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size) +{ + (void) abd_iterate_func2(dabd, sabd, doff, soff, size, + abd_copy_off_cb, NULL); +} + +/*ARGSUSED*/ +static int +abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private) +{ + return (memcmp(bufa, bufb, size)); +} + +/* + * Compares the first size bytes of two ABDs. + */ +int +abd_cmp(abd_t *dabd, abd_t *sabd, size_t size) +{ + return (abd_iterate_func2(dabd, sabd, 0, 0, size, abd_cmp_cb, NULL)); +} diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 73c568c0b6..b10ea0bc69 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -128,14 +128,14 @@ * the arc_buf_hdr_t that will point to the data block in memory. 
A block can * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and - * also in the arc_buf_hdr_t's private physical data block pointer (b_pdata). + * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd). * * The L1ARC's data pointer may or may not be uncompressed. The ARC has the - * ability to store the physical data (b_pdata) associated with the DVA of the - * arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk physical block, + * ability to store the physical data (b_pabd) associated with the DVA of the + * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block, * it will match its on-disk compression characteristics. This behavior can be * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the - * compressed ARC functionality is disabled, the b_pdata will point to an + * compressed ARC functionality is disabled, the b_pabd will point to an * uncompressed version of the on-disk data. * * Data in the L1ARC is not accessed by consumers of the ARC directly. Each @@ -174,7 +174,7 @@ * | l1arc_buf_hdr_t * | | arc_buf_t * | b_buf +------------>+-----------+ arc_buf_t - * | b_pdata +-+ |b_next +---->+-----------+ + * | b_pabd +-+ |b_next +---->+-----------+ * +-----------+ | |-----------| |b_next +-->NULL * | |b_comp = T | +-----------+ * | |b_data +-+ |b_comp = F | @@ -191,8 +191,8 @@ * When a consumer reads a block, the ARC must first look to see if the * arc_buf_hdr_t is cached. 
If the hdr is cached then the ARC allocates a new * arc_buf_t and either copies uncompressed data into a new data buffer from an - * existing uncompressed arc_buf_t, decompresses the hdr's b_pdata buffer into a - * new data buffer, or shares the hdr's b_pdata buffer, depending on whether the + * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a + * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the * hdr is compressed and the desired compression characteristics of the * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be @@ -216,7 +216,7 @@ * | | arc_buf_t (shared) * | b_buf +------------>+---------+ arc_buf_t * | | |b_next +---->+---------+ - * | b_pdata +-+ |---------| |b_next +-->NULL + * | b_pabd +-+ |---------| |b_next +-->NULL * +-----------+ | | | +---------+ * | |b_data +-+ | | * | +---------+ | |b_data +-+ @@ -230,19 +230,19 @@ * | +------+ | * +---------------------------------+ * - * Writing to the ARC requires that the ARC first discard the hdr's b_pdata + * Writing to the ARC requires that the ARC first discard the hdr's b_pabd * since the physical block is about to be rewritten. The new data contents * will be contained in the arc_buf_t. As the I/O pipeline performs the write, * it may compress the data before writing it to disk. The ARC will be called * with the transformed data and will bcopy the transformed on-disk block into - * a newly allocated b_pdata. Writes are always done into buffers which have + * a newly allocated b_pabd. Writes are always done into buffers which have * either been loaned (and hence are new and don't have other readers) or * buffers which have been released (and hence have their own hdr, if there * were originally other readers of the buf's original hdr). This ensures that * the ARC only needs to update a single buf and its hdr after a write occurs. 
* - * When the L2ARC is in use, it will also take advantage of the b_pdata. The - * L2ARC will always write the contents of b_pdata to the L2ARC. This means + * When the L2ARC is in use, it will also take advantage of the b_pabd. The + * L2ARC will always write the contents of b_pabd to the L2ARC. This means * that when compressed ARC is enabled that the L2ARC blocks are identical * to the on-disk block in the main data pool. This provides a significant * advantage since the ARC can leverage the bp's checksum when reading from the @@ -264,7 +264,9 @@ #include <sys/vdev_impl.h> #include <sys/dsl_pool.h> #include <sys/zfs_zone.h> +#include <sys/zio_checksum.h> #include <sys/multilist.h> +#include <sys/abd.h> #ifdef _KERNEL #include <sys/vmsystm.h> #include <vm/anon.h> @@ -300,7 +302,7 @@ int zfs_arc_evict_batch_limit = 10; /* number of seconds before growing cache again */ static int arc_grow_retry = 60; -/* shift of arc_c for calculating overflow limit in arc_get_data_buf */ +/* shift of arc_c for calculating overflow limit in arc_get_data_impl */ int zfs_arc_overflow_shift = 8; /* shift of arc_c for calculating both min and max arc_p */ @@ -463,13 +465,13 @@ typedef struct arc_stats { kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; /* - * Number of compressed bytes stored in the arc_buf_hdr_t's b_pdata. + * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. * Note that the compressed bytes may match the uncompressed bytes * if the block is either not compressed or compressed arc is disabled. */ kstat_named_t arcstat_compressed_size; /* - * Uncompressed size of the data stored in b_pdata. If compressed + * Uncompressed size of the data stored in b_pabd. If compressed * arc is disabled then this value will be identical to the stat * above. 
*/ @@ -883,7 +885,7 @@ typedef struct l1arc_buf_hdr { refcount_t b_refcnt; arc_callback_t *b_acb; - void *b_pdata; + abd_t *b_pabd; } l1arc_buf_hdr_t; typedef struct l2arc_dev l2arc_dev_t; @@ -1083,7 +1085,7 @@ typedef struct l2arc_write_callback { typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ - void *l2df_data; + abd_t *l2df_abd; size_t l2df_size; arc_buf_contents_t l2df_type; list_node_t l2df_list_node; @@ -1093,10 +1095,14 @@ static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; +static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *); static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); +static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *); +static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); -static void arc_hdr_free_pdata(arc_buf_hdr_t *hdr); -static void arc_hdr_alloc_pdata(arc_buf_hdr_t *); +static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); +static void arc_hdr_free_pabd(arc_buf_hdr_t *); +static void arc_hdr_alloc_pabd(arc_buf_hdr_t *); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static boolean_t arc_is_overflowing(); static void arc_buf_watch(arc_buf_t *); @@ -1436,7 +1442,9 @@ static inline boolean_t arc_buf_is_shared(arc_buf_t *buf) { boolean_t shared = (buf->b_data != NULL && - buf->b_data == buf->b_hdr->b_l1hdr.b_pdata); + buf->b_hdr->b_l1hdr.b_pabd != NULL && + abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && + buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); IMPLY(shared, ARC_BUF_SHARED(buf)); IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); @@ -1540,7 +1548,8 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) uint64_t csize; void *cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr)); - csize = zio_compress_data(compress, zio->io_data, cbuf, lsize); + 
csize = zio_compress_data(compress, zio->io_abd, cbuf, lsize); + ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); if (csize < HDR_GET_PSIZE(hdr)) { /* @@ -1575,7 +1584,7 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) * logical I/O size and not just a gang fragment. */ valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, - BP_GET_CHECKSUM(zio->io_bp), zio->io_data, zio->io_size, + BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, zio->io_offset, NULL) == 0); zio_pop_transforms(zio); return (valid_cksum); @@ -1873,7 +1882,7 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed) if (hdr_compressed == compressed) { if (!arc_buf_is_shared(buf)) { - bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, + abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, arc_buf_size(buf)); } } else { @@ -1925,7 +1934,7 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed) return (0); } else { int error = zio_decompress_data(HDR_GET_COMPRESS(hdr), - hdr->b_l1hdr.b_pdata, buf->b_data, + hdr->b_l1hdr.b_pabd, buf->b_data, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); /* @@ -1962,7 +1971,7 @@ arc_decompress(arc_buf_t *buf) } /* - * Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t. + * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. 
*/ static uint64_t arc_hdr_size(arc_buf_hdr_t *hdr) @@ -1994,14 +2003,14 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); (void) refcount_add_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } ASSERT(!GHOST_STATE(state)); - if (hdr->b_l1hdr.b_pdata != NULL) { + if (hdr->b_l1hdr.b_pabd != NULL) { (void) refcount_add_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } @@ -2029,14 +2038,14 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); (void) refcount_remove_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } ASSERT(!GHOST_STATE(state)); - if (hdr->b_l1hdr.b_pdata != NULL) { + if (hdr->b_l1hdr.b_pabd != NULL) { (void) refcount_remove_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } @@ -2133,7 +2142,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, old_state = hdr->b_l1hdr.b_state; refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); bufcnt = hdr->b_l1hdr.b_bufcnt; - update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pdata != NULL); + update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL); } else { old_state = arc_l2c_only; refcnt = 0; @@ -2203,7 +2212,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, */ (void) refcount_add_many(&new_state->arcs_size, HDR_GET_LSIZE(hdr), hdr); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); } else { uint32_t buffers = 0; @@ -2232,7 +2241,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, } ASSERT3U(bufcnt, ==, buffers); - if (hdr->b_l1hdr.b_pdata != NULL) { + if (hdr->b_l1hdr.b_pabd != NULL) { (void) 
refcount_add_many(&new_state->arcs_size, arc_hdr_size(hdr), hdr); } else { @@ -2245,7 +2254,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); /* * When moving a header off of a ghost state, @@ -2285,7 +2294,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, buf); } ASSERT3U(bufcnt, ==, buffers); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); (void) refcount_remove_many( &old_state->arcs_size, arc_hdr_size(hdr), hdr); } @@ -2367,7 +2376,7 @@ arc_space_return(uint64_t space, arc_space_type_t type) /* * Given a hdr and a buf, returns whether that buf can share its b_data buffer - * with the hdr's b_pdata. + * with the hdr's b_pabd. */ static boolean_t arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) @@ -2444,20 +2453,23 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed, /* * If the hdr's data can be shared then we share the data buffer and * set the appropriate bit in the hdr's b_flags to indicate the hdr is - * sharing it's b_pdata with the arc_buf_t. Otherwise, we allocate a new + * sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new * buffer to store the buf's data. * - * There is one additional restriction here because we're sharing - * hdr -> buf instead of the usual buf -> hdr: the hdr can't be actively - * involved in an L2ARC write, because if this buf is used by an - * arc_write() then the hdr's data buffer will be released when the + * There are two additional restrictions here because we're sharing + * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be + * actively involved in an L2ARC write, because if this buf is used by + * an arc_write() then the hdr's data buffer will be released when the * write completes, even though the L2ARC write might still be using it. 
+ * Second, the hdr's ABD must be linear so that the buf's user doesn't + * need to be ABD-aware. */ - boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr); + boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && + abd_is_linear(hdr->b_l1hdr.b_pabd); /* Set up b_data and sharing */ if (can_share) { - buf->b_data = hdr->b_l1hdr.b_pdata; + buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd); buf->b_flags |= ARC_BUF_FLAG_SHARED; arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); } else { @@ -2553,11 +2565,11 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag) } static void -l2arc_free_data_on_write(void *data, size_t size, arc_buf_contents_t type) +l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) { l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); - df->l2df_data = data; + df->l2df_abd = abd; df->l2df_size = size; df->l2df_type = type; mutex_enter(&l2arc_free_on_write_mtx); @@ -2588,7 +2600,7 @@ arc_hdr_free_on_write(arc_buf_hdr_t *hdr) arc_space_return(size, ARC_SPACE_DATA); } - l2arc_free_data_on_write(hdr->b_l1hdr.b_pdata, size, type); + l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); } /* @@ -2602,7 +2614,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(arc_can_share(hdr, buf)); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* @@ -2611,7 +2623,9 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) * the refcount whenever an arc_buf_t is shared. 
*/ refcount_transfer_ownership(&state->arcs_size, buf, hdr); - hdr->b_l1hdr.b_pdata = buf->b_data; + hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); + abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, + HDR_ISTYPE_METADATA(hdr)); arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); buf->b_flags |= ARC_BUF_FLAG_SHARED; @@ -2631,7 +2645,7 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(arc_buf_is_shared(buf)); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* @@ -2640,7 +2654,9 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) */ refcount_transfer_ownership(&state->arcs_size, hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); - hdr->b_l1hdr.b_pdata = NULL; + abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); + abd_put(hdr->b_l1hdr.b_pabd); + hdr->b_l1hdr.b_pabd = NULL; buf->b_flags &= ~ARC_BUF_FLAG_SHARED; /* @@ -2735,7 +2751,7 @@ arc_buf_destroy_impl(arc_buf_t *buf) if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { /* * If the current arc_buf_t is sharing its data buffer with the - * hdr, then reassign the hdr's b_pdata to share it with the new + * hdr, then reassign the hdr's b_pabd to share it with the new * buffer at the end of the list. The shared buffer is always * the last one on the hdr's buffer list. 
* @@ -2750,8 +2766,8 @@ arc_buf_destroy_impl(arc_buf_t *buf) /* hdr is uncompressed so can't have compressed buf */ VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); - arc_hdr_free_pdata(hdr); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + arc_hdr_free_pabd(hdr); /* * We must setup a new shared block between the @@ -2789,26 +2805,26 @@ arc_buf_destroy_impl(arc_buf_t *buf) } static void -arc_hdr_alloc_pdata(arc_buf_hdr_t *hdr) +arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr) { ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_SHARED_DATA(hdr)); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); - hdr->b_l1hdr.b_pdata = arc_get_data_buf(hdr, arc_hdr_size(hdr), hdr); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); } static void -arc_hdr_free_pdata(arc_buf_hdr_t *hdr) +arc_hdr_free_pabd(arc_buf_hdr_t *hdr) { ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); /* * If the hdr is currently being written to the l2arc then @@ -2820,10 +2836,10 @@ arc_hdr_free_pdata(arc_buf_hdr_t *hdr) arc_hdr_free_on_write(hdr); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else { - arc_free_data_buf(hdr, hdr->b_l1hdr.b_pdata, + arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); } - hdr->b_l1hdr.b_pdata = NULL; + hdr->b_l1hdr.b_pabd = NULL; hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); @@ -2860,7 +2876,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, * the compressed or uncompressed data depending on the block * it references and compressed arc enablement. 
*/ - arc_hdr_alloc_pdata(hdr); + arc_hdr_alloc_pabd(hdr); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); return (hdr); @@ -2901,7 +2917,7 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) nhdr->b_l1hdr.b_state = arc_l2c_only; /* Verify previous threads set to NULL before freeing */ - ASSERT3P(nhdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL); } else { ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(hdr->b_l1hdr.b_bufcnt); @@ -2919,11 +2935,11 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) /* * A buffer must not be moved into the arc_l2c_only * state if it's not finished being written out to the - * l2arc device. Otherwise, the b_l1hdr.b_pdata field + * l2arc device. Otherwise, the b_l1hdr.b_pabd field * might try to be accessed, even though it was removed. */ VERIFY(!HDR_L2_WRITING(hdr)); - VERIFY3P(hdr->b_l1hdr.b_pdata, ==, NULL); + VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL); #ifdef ZFS_DEBUG if (hdr->b_l1hdr.b_thawed != NULL) { @@ -3012,6 +3028,18 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, arc_buf_thaw(buf); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + if (!arc_buf_is_shared(buf)) { + /* + * To ensure that the hdr has the correct data in it if we call + * arc_decompress() on this buf before it's been written to + * disk, it's easiest if we just set up sharing between the + * buf and the hdr. + */ + ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd)); + arc_hdr_free_pabd(hdr); + arc_share_buf(hdr, buf); + } + return (buf); } @@ -3087,8 +3115,8 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) } #endif - if (hdr->b_l1hdr.b_pdata != NULL) { - arc_hdr_free_pdata(hdr); + if (hdr->b_l1hdr.b_pabd != NULL) { + arc_hdr_free_pabd(hdr); } } @@ -3156,7 +3184,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) /* * l2arc_write_buffers() relies on a header's L1 portion - * (i.e. its b_pdata field) during its write phase. + * (i.e. 
its b_pabd field) during it's write phase. * Thus, we cannot push a header onto the arc_l2c_only * state (removing it's L1 piece) until the header is * done being written to the l2arc. @@ -3171,7 +3199,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); if (HDR_HAS_L2HDR(hdr)) { /* * This buffer is cached on the 2nd Level ARC; @@ -3237,9 +3265,9 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * If this hdr is being evicted and has a compressed * buffer then we discard it here before we change states. * This ensures that the accounting is updated correctly - * in arc_free_data_buf(). + * in arc_free_data_impl(). */ - arc_hdr_free_pdata(hdr); + arc_hdr_free_pabd(hdr); arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); @@ -3337,7 +3365,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, * thread. 
If we used cv_broadcast, we could * wake up "too many" threads causing arc_size * to significantly overflow arc_c; since - * arc_get_data_buf() doesn't check for overflow + * arc_get_data_impl() doesn't check for overflow * when it's woken up (it doesn't because it's * possible for the ARC to be overflowing while * full of un-evictable buffers, and the @@ -4000,6 +4028,7 @@ arc_kmem_reap_now(void) extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; extern kmem_cache_t *range_seg_cache; + extern kmem_cache_t *abd_chunk_cache; #ifdef _KERNEL if (arc_meta_used >= arc_meta_limit) { @@ -4027,6 +4056,7 @@ arc_kmem_reap_now(void) kmem_cache_reap_now(zio_data_buf_cache[i]); } } + kmem_cache_reap_now(abd_chunk_cache); kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_full_cache); kmem_cache_reap_now(hdr_l2only_cache); @@ -4042,13 +4072,13 @@ arc_kmem_reap_now(void) } /* - * Threads can block in arc_get_data_buf() waiting for this thread to evict + * Threads can block in arc_get_data_impl() waiting for this thread to evict * enough data and signal them to proceed. When this happens, the threads in - * arc_get_data_buf() are sleeping while holding the hash lock for their + * arc_get_data_impl() are sleeping while holding the hash lock for their * particular arc header. Thus, we must be careful to never sleep on a * hash lock in this thread. This is to prevent the following deadlock: * - * - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L", + * - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L", * waiting for the reclaim thread to signal it. * * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, @@ -4088,7 +4118,7 @@ arc_reclaim_thread(void) /* * We call arc_adjust() before (possibly) calling * arc_kmem_reap_now(), so that we can wake up - * arc_get_data_buf() sooner. + * arc_get_data_impl() sooner. 
*/ evicted = arc_adjust(); @@ -4245,18 +4275,45 @@ arc_is_overflowing(void) return (arc_size >= arc_c + overflow); } +static abd_t * +arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + + arc_get_data_impl(hdr, size, tag); + if (type == ARC_BUFC_METADATA) { + return (abd_alloc(size, B_TRUE)); + } else { + ASSERT(type == ARC_BUFC_DATA); + return (abd_alloc(size, B_FALSE)); + } +} + +static void * +arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + + arc_get_data_impl(hdr, size, tag); + if (type == ARC_BUFC_METADATA) { + return (zio_buf_alloc(size)); + } else { + ASSERT(type == ARC_BUFC_DATA); + return (zio_data_buf_alloc(size)); + } +} + /* * Allocate a block and return it to the caller. If we are hitting the * hard limit for the cache size, we must sleep, waiting for the eviction * thread to catch up. If we're past the target size but below the hard * limit, we'll only signal the reclaim thread and continue on. 
*/ -static void * -arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) +static void +arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { - void *datap = NULL; - arc_state_t *state = hdr->b_l1hdr.b_state; - arc_buf_contents_t type = arc_buf_type(hdr); + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); arc_adapt(size, state); @@ -4298,11 +4355,8 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { - datap = zio_buf_alloc(size); arc_space_consume(size, ARC_SPACE_META); } else { - ASSERT(type == ARC_BUFC_DATA); - datap = zio_data_buf_alloc(size); arc_space_consume(size, ARC_SPACE_DATA); } @@ -4338,14 +4392,34 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) refcount_count(&arc_mru->arcs_size) > arc_p)) arc_p = MIN(arc_c, arc_p + size); } - return (datap); +} + +static void +arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag) +{ + arc_free_data_impl(hdr, size, tag); + abd_free(abd); +} + +static void +arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + + arc_free_data_impl(hdr, size, tag); + if (type == ARC_BUFC_METADATA) { + zio_buf_free(buf, size); + } else { + ASSERT(type == ARC_BUFC_DATA); + zio_data_buf_free(buf, size); + } } /* * Free the arc data buffer. 
*/ static void -arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag) +arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); @@ -4362,11 +4436,9 @@ arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag) VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { - zio_buf_free(data, size); arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); - zio_data_buf_free(data, size); arc_space_return(size, ARC_SPACE_DATA); } } @@ -4639,7 +4711,7 @@ arc_read_done(zio_t *zio) if (callback_cnt == 0) { ASSERT(HDR_PREFETCH(hdr)); ASSERT0(hdr->b_l1hdr.b_bufcnt); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); } ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || @@ -4735,7 +4807,7 @@ top: hdr = buf_hash_find(guid, bp, &hash_lock); } - if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pdata != NULL) { + if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) { arc_buf_t *buf = NULL; *arc_flags |= ARC_FLAG_CACHED; @@ -4878,7 +4950,7 @@ top: hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache); } - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); @@ -4896,9 +4968,9 @@ top: * avoid hitting an assert in remove_reference(). 
*/ arc_access(hdr, hash_lock); - arc_hdr_alloc_pdata(hdr); + arc_hdr_alloc_pabd(hdr); } - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); size = arc_hdr_size(hdr); /* @@ -5001,7 +5073,7 @@ top: ASSERT3U(HDR_GET_COMPRESS(hdr), !=, ZIO_COMPRESS_EMPTY); rzio = zio_read_phys(pio, vd, addr, - size, hdr->b_l1hdr.b_pdata, + size, hdr->b_l1hdr.b_pabd, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, zio_flags | ZIO_FLAG_DONT_CACHE | @@ -5040,7 +5112,7 @@ top: } } - rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pdata, size, + rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size, arc_read_done, hdr, priority, zio_flags, zb); /* @@ -5232,16 +5304,17 @@ arc_release(arc_buf_t *buf, void *tag) arc_unshare_buf(hdr, buf); /* - * Now we need to recreate the hdr's b_pdata. Since we + * Now we need to recreate the hdr's b_pabd. Since we * have lastbuf handy, we try to share with it, but if - * we can't then we allocate a new b_pdata and copy the + * we can't then we allocate a new b_pabd and copy the * data from buf into it. */ if (arc_can_share(hdr, lastbuf)) { arc_share_buf(hdr, lastbuf); } else { - arc_hdr_alloc_pdata(hdr); - bcopy(buf->b_data, hdr->b_l1hdr.b_pdata, psize); + arc_hdr_alloc_pabd(hdr); + abd_copy_from_buf(hdr->b_l1hdr.b_pabd, + buf->b_data, psize); } VERIFY3P(lastbuf->b_data, !=, NULL); } else if (HDR_SHARED_DATA(hdr)) { @@ -5257,7 +5330,7 @@ arc_release(arc_buf_t *buf, void *tag) HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); ASSERT(!ARC_BUF_SHARED(buf)); } - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT3P(state, !=, arc_l2c_only); (void) refcount_remove_many(&state->arcs_size, @@ -5276,7 +5349,7 @@ arc_release(arc_buf_t *buf, void *tag) mutex_exit(hash_lock); /* - * Allocate a new hdr. The new hdr will contain a b_pdata + * Allocate a new hdr. The new hdr will contain a b_pabd * buffer which will be freed in arc_write(). 
*/ nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); @@ -5354,15 +5427,15 @@ arc_write_ready(zio_t *zio) if (zio->io_flags & ZIO_FLAG_REEXECUTED) { arc_cksum_free(hdr); arc_buf_unwatch(buf); - if (hdr->b_l1hdr.b_pdata != NULL) { + if (hdr->b_l1hdr.b_pabd != NULL) { if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { - arc_hdr_free_pdata(hdr); + arc_hdr_free_pabd(hdr); } } } - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT(!arc_buf_is_shared(buf)); @@ -5384,33 +5457,47 @@ arc_write_ready(zio_t *zio) HDR_SET_PSIZE(hdr, psize); arc_hdr_set_compress(hdr, compress); + /* - * If the hdr is compressed, then copy the compressed - * zio contents into arc_buf_hdr_t. Otherwise, copy the original - * data buf into the hdr. Ideally, we would like to always copy the - * io_data into b_pdata but the user may have disabled compressed - * arc thus the on-disk block may or may not match what we maintain - * in the hdr's b_pdata field. + * Fill the hdr with data. If the hdr is compressed, the data we want + * is available from the zio, otherwise we can take it from the buf. + * + * We might be able to share the buf's data with the hdr here. However, + * doing so would cause the ARC to be full of linear ABDs if we write a + * lot of shareable data. As a compromise, we check whether scattered + * ABDs are allowed, and assume that if they are then the user wants + * the ARC to be primarily filled with them regardless of the data being + * written. Therefore, if they're allowed then we allocate one and copy + * the data into it; otherwise, we share the data directly if we can. 
*/ - if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && - !ARC_BUF_COMPRESSED(buf)) { - ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, ZIO_COMPRESS_OFF); - ASSERT3U(psize, >, 0); - arc_hdr_alloc_pdata(hdr); - bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize); + if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { + arc_hdr_alloc_pabd(hdr); + + /* + * Ideally, we would always copy the io_abd into b_pabd, but the + * user may have disabled compressed ARC, thus we must check the + * hdr's compression setting rather than the io_bp's. + */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, + ZIO_COMPRESS_OFF); + ASSERT3U(psize, >, 0); + + abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); + } else { + ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); + + abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, + arc_buf_size(buf)); + } } else { - ASSERT3P(buf->b_data, ==, zio->io_orig_data); + ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); - /* - * This hdr is not compressed so we're able to share - * the arc_buf_t data buffer with the hdr. - */ arc_share_buf(hdr, buf); - ASSERT0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata, - HDR_GET_LSIZE(hdr))); } + arc_hdr_verify(hdr, zio->io_bp); } @@ -5515,6 +5602,7 @@ arc_write_done(zio_t *zio) ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); callback->awcb_done(zio, buf, callback->awcb_private); + abd_put(zio->io_abd); kmem_free(callback, sizeof (arc_write_callback_t)); } @@ -5551,10 +5639,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, callback->awcb_buf = buf; /* - * The hdr's b_pdata is now stale, free it now. A new data block + * The hdr's b_pabd is now stale, free it now. A new data block * will be allocated when the zio pipeline calls arc_write_ready(). 
*/ - if (hdr->b_l1hdr.b_pdata != NULL) { + if (hdr->b_l1hdr.b_pabd != NULL) { /* * If the buf is currently sharing the data block with * the hdr then we need to break that relationship here. @@ -5564,15 +5652,16 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { - arc_hdr_free_pdata(hdr); + arc_hdr_free_pabd(hdr); } VERIFY3P(buf->b_data, !=, NULL); arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); } ASSERT(!arc_buf_is_shared(buf)); - ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - zio = zio_write(pio, spa, txg, bp, buf->b_data, + zio = zio_write(pio, spa, txg, bp, + abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), HDR_GET_LSIZE(hdr), arc_buf_size(buf), zp, arc_write_ready, (children_ready != NULL) ? arc_write_children_ready : NULL, arc_write_physdone, arc_write_done, callback, @@ -6328,13 +6417,8 @@ l2arc_do_free_on_write() for (df = list_tail(buflist); df; df = df_prev) { df_prev = list_prev(buflist, df); - ASSERT3P(df->l2df_data, !=, NULL); - if (df->l2df_type == ARC_BUFC_METADATA) { - zio_buf_free(df->l2df_data, df->l2df_size); - } else { - ASSERT(df->l2df_type == ARC_BUFC_DATA); - zio_data_buf_free(df->l2df_data, df->l2df_size); - } + ASSERT3P(df->l2df_abd, !=, NULL); + abd_free(df->l2df_abd); list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } @@ -6484,12 +6568,12 @@ l2arc_read_done(zio_t *zio) mutex_enter(hash_lock); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT3P(zio->io_data, !=, NULL); + ASSERT3P(zio->io_abd, !=, NULL); /* * Check this survived the L2ARC journey. 
*/ - ASSERT3P(zio->io_data, ==, hdr->b_l1hdr.b_pdata); + ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd); zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ @@ -6523,7 +6607,7 @@ l2arc_read_done(zio_t *zio) ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, - hdr->b_l1hdr.b_pdata, zio->io_size, arc_read_done, + hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done, hdr, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); } @@ -6811,7 +6895,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); - ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT3U(arc_hdr_size(hdr), >, 0); uint64_t size = arc_hdr_size(hdr); @@ -6826,20 +6910,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) * lifetime of the ZIO and be cleaned up afterwards, we * add it to the l2arc_free_on_write queue. 
*/ - void *to_write; + abd_t *to_write; if (!HDR_SHARED_DATA(hdr)) { - to_write = hdr->b_l1hdr.b_pdata; + to_write = hdr->b_l1hdr.b_pabd; } else { - arc_buf_contents_t type = arc_buf_type(hdr); - if (type == ARC_BUFC_METADATA) { - to_write = zio_buf_alloc(size); - } else { - ASSERT3U(type, ==, ARC_BUFC_DATA); - to_write = zio_data_buf_alloc(size); - } - - bcopy(hdr->b_l1hdr.b_pdata, to_write, size); - l2arc_free_data_on_write(to_write, size, type); + to_write = abd_alloc_for_io(size, + HDR_ISTYPE_METADATA(hdr)); + abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); + l2arc_free_abd_on_write(to_write, size, + arc_buf_type(hdr)); } wzio = zio_write_phys(pio, dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, size, to_write, diff --git a/usr/src/uts/common/fs/zfs/blkptr.c b/usr/src/uts/common/fs/zfs/blkptr.c index 7e61dc96ff..ff93ff4456 100644 --- a/usr/src/uts/common/fs/zfs/blkptr.c +++ b/usr/src/uts/common/fs/zfs/blkptr.c @@ -14,7 +14,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index 8d42481ea0..812ff3badd 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -46,6 +46,7 @@ #include <sys/blkptr.h> #include <sys/range_tree.h> #include <sys/callb.h> +#include <sys/abd.h> uint_t zfs_dbuf_evict_key; @@ -3463,8 +3464,10 @@ dbuf_write_override_done(zio_t *zio) arc_release(dr->dt.dl.dr_data, db); } mutex_exit(&db->db_mtx); - dbuf_write_done(zio, NULL, db); + + if (zio->io_abd != NULL) + abd_put(zio->io_abd); } /* Issue I/O to commit a dirty buffer to disk. */ @@ -3557,7 +3560,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) * The BP for this block has been provided by open context * (by dmu_sync() or dmu_buf_write_embedded()). */ - void *contents = (data != NULL) ? data->b_data : NULL; + abd_t *contents = (data != NULL) ? 
+ abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL; dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy, contents, db->db.db_size, db->db.db_size, &zp, diff --git a/usr/src/uts/common/fs/zfs/ddt.c b/usr/src/uts/common/fs/zfs/ddt.c index 9955f89e77..ba3e02cfb5 100644 --- a/usr/src/uts/common/fs/zfs/ddt.c +++ b/usr/src/uts/common/fs/zfs/ddt.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -36,6 +36,7 @@ #include <sys/zio_checksum.h> #include <sys/zio_compress.h> #include <sys/dsl_scan.h> +#include <sys/abd.h> /* * Enable/disable prefetching of dedup-ed blocks which are going to be freed. @@ -651,9 +652,8 @@ ddt_free(ddt_entry_t *dde) for (int p = 0; p < DDT_PHYS_TYPES; p++) ASSERT(dde->dde_lead_zio[p] == NULL); - if (dde->dde_repair_data != NULL) - zio_buf_free(dde->dde_repair_data, - DDK_GET_PSIZE(&dde->dde_key)); + if (dde->dde_repair_abd != NULL) + abd_free(dde->dde_repair_abd); cv_destroy(&dde->dde_cv); kmem_free(dde, sizeof (*dde)); @@ -917,7 +917,7 @@ ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) ddt_enter(ddt); - if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) && + if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) && avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) avl_insert(&ddt->ddt_repair_tree, dde, where); else @@ -954,7 +954,7 @@ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) continue; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, - rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL, + rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); } diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index f7ef30548d..7769003c43 
100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -46,6 +46,7 @@ #include <sys/zio_compress.h> #include <sys/sa.h> #include <sys/zfeature.h> +#include <sys/abd.h> #ifdef _KERNEL #include <sys/vmsystm.h> #include <sys/zfs_znode.h> @@ -1632,6 +1633,7 @@ dmu_sync_late_arrival_done(zio_t *zio) dsa->dsa_done(dsa->dsa_zgd, zio->io_error); + abd_put(zio->io_abd); kmem_free(dsa, sizeof (*dsa)); } @@ -1657,10 +1659,10 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, dsa->dsa_tx = tx; zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, - zgd->zgd_db->db_data, zgd->zgd_db->db_size, zgd->zgd_db->db_size, - zp, dmu_sync_late_arrival_ready, NULL, - NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE, - ZIO_FLAG_CANFAIL, zb)); + abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), + zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, + dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done, + dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); return (0); } @@ -2192,6 +2194,7 @@ byteswap_uint8_array(void *vbuf, size_t size) void dmu_init(void) { + abd_init(); zfs_dbgmsg_init(); sa_cache_init(); xuio_stat_init(); @@ -2215,4 +2218,5 @@ dmu_fini(void) xuio_stat_fini(); sa_cache_fini(); zfs_dbgmsg_fini(); + abd_fini(); } diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c index e40b9f88b6..c9a79b94e8 100644 --- a/usr/src/uts/common/fs/zfs/dmu_send.c +++ b/usr/src/uts/common/fs/zfs/dmu_send.c @@ -132,7 +132,7 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len) { ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); - fletcher_4_incremental_native(dsp->dsa_drr, + (void) fletcher_4_incremental_native(dsp->dsa_drr, offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), &dsp->dsa_zc); if (dsp->dsa_drr->drr_type == DRR_BEGIN) { @@ 
-145,13 +145,13 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len) if (dsp->dsa_drr->drr_type == DRR_END) { dsp->dsa_sent_end = B_TRUE; } - fletcher_4_incremental_native(&dsp->dsa_drr-> + (void) fletcher_4_incremental_native(&dsp->dsa_drr-> drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), &dsp->dsa_zc); if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); if (payload_len != 0) { - fletcher_4_incremental_native(payload, payload_len, + (void) fletcher_4_incremental_native(payload, payload_len, &dsp->dsa_zc); if (dump_bytes(dsp, payload, payload_len) != 0) return (SET_ERROR(EINTR)); @@ -1742,11 +1742,11 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { drc->drc_byteswap = B_TRUE; - fletcher_4_incremental_byteswap(drr_begin, + (void) fletcher_4_incremental_byteswap(drr_begin, sizeof (dmu_replay_record_t), &drc->drc_cksum); byteswap_record(drr_begin); } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) { - fletcher_4_incremental_native(drr_begin, + (void) fletcher_4_incremental_native(drr_begin, sizeof (dmu_replay_record_t), &drc->drc_cksum); } else { return (SET_ERROR(EINVAL)); @@ -2419,9 +2419,9 @@ static void receive_cksum(struct receive_arg *ra, int len, void *buf) { if (ra->byteswap) { - fletcher_4_incremental_byteswap(buf, len, &ra->cksum); + (void) fletcher_4_incremental_byteswap(buf, len, &ra->cksum); } else { - fletcher_4_incremental_native(buf, len, &ra->cksum); + (void) fletcher_4_incremental_native(buf, len, &ra->cksum); } } diff --git a/usr/src/uts/common/fs/zfs/dsl_scan.c b/usr/src/uts/common/fs/zfs/dsl_scan.c index c672128744..1963f15385 100644 --- a/usr/src/uts/common/fs/zfs/dsl_scan.c +++ b/usr/src/uts/common/fs/zfs/dsl_scan.c @@ -20,8 +20,8 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. 
* Copyright 2016 Gary Mills + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. */ #include <sys/dsl_scan.h> @@ -47,6 +47,7 @@ #include <sys/sa.h> #include <sys/sa_impl.h> #include <sys/zfeature.h> +#include <sys/abd.h> #ifdef _KERNEL #include <sys/zfs_vfsops.h> #endif @@ -1756,7 +1757,7 @@ dsl_scan_scrub_done(zio_t *zio) { spa_t *spa = zio->io_spa; - zio_data_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; @@ -1839,7 +1840,6 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, if (needs_io && !zfs_no_scrub_io) { vdev_t *rvd = spa->spa_root_vdev; uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight; - void *data = zio_data_buf_alloc(size); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight >= maxinflight) @@ -1854,9 +1854,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle) delay(scan_delay); - zio_nowait(zio_read(NULL, spa, bp, data, size, - dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB, - zio_flags, zb)); + zio_nowait(zio_read(NULL, spa, bp, + abd_alloc_for_io(size, B_FALSE), size, dsl_scan_scrub_done, + NULL, ZIO_PRIORITY_SCRUB, zio_flags, zb)); } /* do not relocate this block */ diff --git a/usr/src/uts/common/fs/zfs/edonr_zfs.c b/usr/src/uts/common/fs/zfs/edonr_zfs.c index 93f1221fd5..9a3430d946 100644 --- a/usr/src/uts/common/fs/zfs/edonr_zfs.c +++ b/usr/src/uts/common/fs/zfs/edonr_zfs.c @@ -22,19 +22,31 @@ * Copyright 2013 Saso Kiselkov. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2016 by Delphix. All rights reserved. 
+ */ #include <sys/zfs_context.h> #include <sys/zio.h> #include <sys/edonr.h> +#include <sys/abd.h> #define EDONR_MODE 512 #define EDONR_BLOCK_SIZE EdonR512_BLOCK_SIZE +static int +edonr_incremental(void *buf, size_t size, void *arg) +{ + EdonRState *ctx = arg; + EdonRUpdate(ctx, buf, size * 8); + return (0); +} + /* * Native zio_checksum interface for the Edon-R hash function. */ /*ARGSUSED*/ void -zio_checksum_edonr_native(const void *buf, uint64_t size, +abd_checksum_edonr_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { uint8_t digest[EDONR_MODE / 8]; @@ -42,7 +54,7 @@ zio_checksum_edonr_native(const void *buf, uint64_t size, ASSERT(ctx_template != NULL); bcopy(ctx_template, &ctx, sizeof (ctx)); - EdonRUpdate(&ctx, buf, size * 8); + (void) abd_iterate_func(abd, 0, size, edonr_incremental, &ctx); EdonRFinal(&ctx, digest); bcopy(digest, zcp->zc_word, sizeof (zcp->zc_word)); } @@ -51,12 +63,12 @@ zio_checksum_edonr_native(const void *buf, uint64_t size, * Byteswapped zio_checksum interface for the Edon-R hash function. 
*/ void -zio_checksum_edonr_byteswap(const void *buf, uint64_t size, +abd_checksum_edonr_byteswap(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { zio_cksum_t tmp; - zio_checksum_edonr_native(buf, size, ctx_template, &tmp); + abd_checksum_edonr_native(abd, size, ctx_template, &tmp); zcp->zc_word[0] = BSWAP_64(zcp->zc_word[0]); zcp->zc_word[1] = BSWAP_64(zcp->zc_word[1]); zcp->zc_word[2] = BSWAP_64(zcp->zc_word[2]); @@ -64,7 +76,7 @@ zio_checksum_edonr_byteswap(const void *buf, uint64_t size, } void * -zio_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt) +abd_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt) { EdonRState *ctx; uint8_t salt_block[EDONR_BLOCK_SIZE]; @@ -93,7 +105,7 @@ zio_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt) } void -zio_checksum_edonr_tmpl_free(void *ctx_template) +abd_checksum_edonr_tmpl_free(void *ctx_template) { EdonRState *ctx = ctx_template; diff --git a/usr/src/uts/common/fs/zfs/lz4.c b/usr/src/uts/common/fs/zfs/lz4.c index 3aa1b74ef3..82a08939dc 100644 --- a/usr/src/uts/common/fs/zfs/lz4.c +++ b/usr/src/uts/common/fs/zfs/lz4.c @@ -31,6 +31,9 @@ * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html * - LZ4 source repository : http://code.google.com/p/lz4/ */ +/* + * Copyright (c) 2016 by Delphix. All rights reserved. + */ #include <sys/zfs_context.h> diff --git a/usr/src/uts/common/fs/zfs/sha256.c b/usr/src/uts/common/fs/zfs/sha256.c index 81a7f6b1c2..23a97aa3de 100644 --- a/usr/src/uts/common/fs/zfs/sha256.c +++ b/usr/src/uts/common/fs/zfs/sha256.c @@ -24,29 +24,39 @@ */ /* * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. 
*/ #include <sys/zfs_context.h> #include <sys/zio.h> #include <sys/sha2.h> +#include <sys/abd.h> + +static int +sha_incremental(void *buf, size_t size, void *arg) +{ + SHA2_CTX *ctx = arg; + SHA2Update(ctx, buf, size); + return (0); +} /*ARGSUSED*/ void -zio_checksum_SHA256(const void *buf, uint64_t size, +abd_checksum_SHA256(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { SHA2_CTX ctx; zio_cksum_t tmp; SHA2Init(SHA256, &ctx); - SHA2Update(&ctx, buf, size); + (void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx); SHA2Final(&tmp, &ctx); /* * A prior implementation of this function had a * private SHA256 implementation always wrote things out in * Big Endian and there wasn't a byteswap variant of it. - * To preseve on disk compatibility we need to force that - * behaviour. + * To preserve on disk compatibility we need to force that + * behavior. */ zcp->zc_word[0] = BE_64(tmp.zc_word[0]); zcp->zc_word[1] = BE_64(tmp.zc_word[1]); @@ -56,24 +66,24 @@ zio_checksum_SHA256(const void *buf, uint64_t size, /*ARGSUSED*/ void -zio_checksum_SHA512_native(const void *buf, uint64_t size, +abd_checksum_SHA512_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { SHA2_CTX ctx; SHA2Init(SHA512_256, &ctx); - SHA2Update(&ctx, buf, size); + (void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx); SHA2Final(zcp, &ctx); } /*ARGSUSED*/ void -zio_checksum_SHA512_byteswap(const void *buf, uint64_t size, +abd_checksum_SHA512_byteswap(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { zio_cksum_t tmp; - zio_checksum_SHA512_native(buf, size, ctx_template, &tmp); + abd_checksum_SHA512_native(abd, size, ctx_template, &tmp); zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); diff --git a/usr/src/uts/common/fs/zfs/skein_zfs.c b/usr/src/uts/common/fs/zfs/skein_zfs.c index 6592340396..340da7adfb 100644 --- 
a/usr/src/uts/common/fs/zfs/skein_zfs.c +++ b/usr/src/uts/common/fs/zfs/skein_zfs.c @@ -20,42 +20,52 @@ */ /* * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> #include <sys/zio.h> #include <sys/skein.h> +#include <sys/abd.h> + +static int +skein_incremental(void *buf, size_t size, void *arg) +{ + Skein_512_Ctxt_t *ctx = arg; + (void) Skein_512_Update(ctx, buf, size); + return (0); +} /* * Computes a native 256-bit skein MAC checksum. Please note that this * function requires the presence of a ctx_template that should be allocated - * using zio_checksum_skein_tmpl_init. + * using abd_checksum_skein_tmpl_init. */ /*ARGSUSED*/ void -zio_checksum_skein_native(const void *buf, uint64_t size, +abd_checksum_skein_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { Skein_512_Ctxt_t ctx; ASSERT(ctx_template != NULL); bcopy(ctx_template, &ctx, sizeof (ctx)); - (void) Skein_512_Update(&ctx, buf, size); + (void) abd_iterate_func(abd, 0, size, skein_incremental, &ctx); (void) Skein_512_Final(&ctx, (uint8_t *)zcp); bzero(&ctx, sizeof (ctx)); } /* - * Byteswapped version of zio_checksum_skein_native. This just invokes + * Byteswapped version of abd_checksum_skein_native. This just invokes * the native checksum function and byteswaps the resulting checksum (since * skein is internally endian-insensitive). */ void -zio_checksum_skein_byteswap(const void *buf, uint64_t size, +abd_checksum_skein_byteswap(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { zio_cksum_t tmp; - zio_checksum_skein_native(buf, size, ctx_template, &tmp); + abd_checksum_skein_native(abd, size, ctx_template, &tmp); zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); @@ -67,7 +77,7 @@ zio_checksum_skein_byteswap(const void *buf, uint64_t size, * computations and returns a pointer to it. 
*/ void * -zio_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt) +abd_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt) { Skein_512_Ctxt_t *ctx; @@ -79,10 +89,10 @@ zio_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt) /* * Frees a skein context template previously allocated using - * zio_checksum_skein_tmpl_init. + * abd_checksum_skein_tmpl_init. */ void -zio_checksum_skein_tmpl_free(void *ctx_template) +abd_checksum_skein_tmpl_free(void *ctx_template) { Skein_512_Ctxt_t *ctx = ctx_template; diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index 5764d47c33..be5b66fd3b 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -70,6 +70,7 @@ #include <sys/dsl_scan.h> #include <sys/zfeature.h> #include <sys/dsl_destroy.h> +#include <sys/abd.h> #ifdef _KERNEL #include <sys/bootprops.h> @@ -1876,6 +1877,7 @@ spa_load_verify_done(zio_t *zio) int error = zio->io_error; spa_t *spa = zio->io_spa; + abd_free(zio->io_abd); if (error) { if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && type != DMU_OT_INTENT_LOG) @@ -1883,7 +1885,6 @@ spa_load_verify_done(zio_t *zio) else atomic_inc_64(&sle->sle_data_count); } - zio_data_buf_free(zio->io_data, zio->io_size); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; @@ -1913,12 +1914,11 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, */ if (!spa_load_verify_metadata) return (0); - if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data) + if (!BP_IS_METADATA(bp) && !spa_load_verify_data) return (0); zio_t *rio = arg; size_t size = BP_GET_PSIZE(bp); - void *data = zio_data_buf_alloc(size); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight) @@ -1926,7 +1926,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, spa->spa_scrub_inflight++; mutex_exit(&spa->spa_scrub_lock); - zio_nowait(zio_read(rio, spa, bp, data, size, + zio_nowait(zio_read(rio, spa, 
bp, abd_alloc_for_io(size, B_FALSE), size, spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); diff --git a/usr/src/uts/common/fs/zfs/sys/abd.h b/usr/src/uts/common/fs/zfs/sys/abd.h new file mode 100644 index 0000000000..308f021b76 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/abd.h @@ -0,0 +1,150 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +#ifndef _ABD_H +#define _ABD_H + +#include <sys/isa_defs.h> +#include <sys/int_types.h> +#include <sys/debug.h> +#include <sys/refcount.h> +#ifdef _KERNEL +#include <sys/uio.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum abd_flags { + ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */ + ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */ + ABD_FLAG_META = 1 << 2 /* does this represent FS metadata? 
*/ +} abd_flags_t; + +typedef struct abd { + abd_flags_t abd_flags; + uint_t abd_size; /* excludes scattered abd_offset */ + struct abd *abd_parent; + refcount_t abd_children; + union { + struct abd_scatter { + uint_t abd_offset; + uint_t abd_chunk_size; + void *abd_chunks[]; + } abd_scatter; + struct abd_linear { + void *abd_buf; + } abd_linear; + } abd_u; +} abd_t; + +typedef int abd_iter_func_t(void *, size_t, void *); +typedef int abd_iter_func2_t(void *, void *, size_t, void *); + +extern boolean_t zfs_abd_scatter_enabled; + +inline boolean_t +abd_is_linear(abd_t *abd) +{ + return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0); +} + +/* + * Allocations and deallocations + */ + +abd_t *abd_alloc(size_t, boolean_t); +abd_t *abd_alloc_linear(size_t, boolean_t); +abd_t *abd_alloc_for_io(size_t, boolean_t); +abd_t *abd_alloc_sametype(abd_t *, size_t); +void abd_free(abd_t *); +abd_t *abd_get_offset(abd_t *, size_t); +abd_t *abd_get_from_buf(void *, size_t); +void abd_put(abd_t *); + +/* + * Conversion to and from a normal buffer + */ + +void *abd_to_buf(abd_t *); +void *abd_borrow_buf(abd_t *, size_t); +void *abd_borrow_buf_copy(abd_t *, size_t); +void abd_return_buf(abd_t *, void *, size_t); +void abd_return_buf_copy(abd_t *, void *, size_t); +void abd_take_ownership_of_buf(abd_t *, boolean_t); +void abd_release_ownership_of_buf(abd_t *); + +/* + * ABD operations + */ + +int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *); +int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t, + abd_iter_func2_t *, void *); +void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t); +void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t); +void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); +int abd_cmp(abd_t *, abd_t *, size_t); +int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t); +void abd_zero_off(abd_t *, size_t, size_t); + +/* + * Wrappers for calls with offsets of 0 + */ + +inline void +abd_copy(abd_t *dabd, abd_t *sabd, 
size_t size) +{ + abd_copy_off(dabd, sabd, 0, 0, size); +} + +inline void +abd_copy_from_buf(abd_t *abd, void *buf, size_t size) +{ + abd_copy_from_buf_off(abd, buf, 0, size); +} + +inline void +abd_copy_to_buf(void* buf, abd_t *abd, size_t size) +{ + abd_copy_to_buf_off(buf, abd, 0, size); +} + +inline int +abd_cmp_buf(abd_t *abd, void *buf, size_t size) +{ + return (abd_cmp_buf_off(abd, buf, 0, size)); +} + +inline void +abd_zero(abd_t *abd, size_t size) +{ + abd_zero_off(abd, 0, size); +} + +/* + * Module lifecycle + */ + +void abd_init(void); +void abd_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _ABD_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/ddt.h b/usr/src/uts/common/fs/zfs/sys/ddt.h index 771610677e..15d2a9a7ad 100644 --- a/usr/src/uts/common/fs/zfs/sys/ddt.h +++ b/usr/src/uts/common/fs/zfs/sys/ddt.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. */ #ifndef _SYS_DDT_H @@ -35,6 +36,8 @@ extern "C" { #endif +struct abd; + /* * On-disk DDT formats, in the desired search order (newest version first). */ @@ -108,7 +111,7 @@ struct ddt_entry { ddt_key_t dde_key; ddt_phys_t dde_phys[DDT_PHYS_TYPES]; zio_t *dde_lead_zio[DDT_PHYS_TYPES]; - void *dde_repair_data; + struct abd *dde_repair_abd; enum ddt_type dde_type; enum ddt_class dde_class; uint8_t dde_loading; diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index d0bb431866..0caefcd153 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -419,15 +419,17 @@ _NOTE(CONSTCOND) } while (0) #define BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill) +#define BP_IS_METADATA(bp) \ + (BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) + #define BP_GET_ASIZE(bp) \ (BP_IS_EMBEDDED(bp) ? 
0 : \ DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ DVA_GET_ASIZE(&(bp)->blk_dva[2])) -#define BP_GET_UCSIZE(bp) \ - ((BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) ? \ - BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) +#define BP_GET_UCSIZE(bp) \ + (BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) #define BP_GET_NDVAS(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ @@ -597,8 +599,7 @@ _NOTE(CONSTCOND) } while (0) } #define BP_GET_BUFC_TYPE(bp) \ - (((BP_GET_LEVEL(bp) > 0) || (DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))) ? \ - ARC_BUFC_METADATA : ARC_BUFC_DATA) + (BP_IS_METADATA(bp) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) typedef enum spa_import_type { SPA_IMPORT_EXISTING, diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 8df5b3b785..931c42f2be 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -52,6 +52,7 @@ extern "C" { typedef struct vdev_queue vdev_queue_t; typedef struct vdev_cache vdev_cache_t; typedef struct vdev_cache_entry vdev_cache_entry_t; +struct abd; extern int zfs_vdev_queue_depth_pct; extern uint32_t zfs_vdev_async_write_max_active; @@ -86,7 +87,7 @@ typedef struct vdev_ops { * Virtual device properties */ struct vdev_cache_entry { - char *ve_data; + struct abd *ve_abd; uint64_t ve_offset; uint64_t ve_lastused; avl_node_t ve_offset_node; diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index 5a6bd3c329..b50df27774 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -306,6 +306,7 @@ typedef void zio_cksum_free_f(void *cbdata, size_t size); struct zio_bad_cksum; /* defined in zio_checksum.h */ struct dnode_phys; +struct abd; struct zio_cksum_report { struct zio_cksum_report *zcr_next; @@ -338,12 +339,12 @@ typedef struct zio_gang_node { } zio_gang_node_t; typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, - zio_gang_node_t *gn, void 
*data); + zio_gang_node_t *gn, struct abd *data, uint64_t offset); -typedef void zio_transform_func_t(zio_t *zio, void *data, uint64_t size); +typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size); typedef struct zio_transform { - void *zt_orig_data; + struct abd *zt_orig_abd; uint64_t zt_orig_size; uint64_t zt_bufsize; zio_transform_func_t *zt_transform; @@ -404,8 +405,8 @@ struct zio { blkptr_t io_bp_orig; /* Data represented by this I/O */ - void *io_data; - void *io_orig_data; + struct abd *io_abd; + struct abd *io_orig_abd; uint64_t io_size; uint64_t io_orig_size; /* io_lsize != io_orig_size iff this is a raw write */ @@ -465,19 +466,19 @@ extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, extern zio_t *zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags); -extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, - uint64_t lsize, zio_done_func_t *done, void *private, +extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, + struct abd *data, uint64_t lsize, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, uint64_t psize, const zio_prop_t *zp, + struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, zio_done_func_t *done, void *private, + struct abd *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb); extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, @@ -493,12 +494,12 @@ extern zio_t 
*zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_done_func_t *done, void *private, enum zio_flag flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, - uint64_t size, void *data, int checksum, + uint64_t size, struct abd *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels); extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, - uint64_t size, void *data, int checksum, + uint64_t size, struct abd *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels); @@ -528,19 +529,19 @@ extern void zio_buf_free(void *buf, size_t size); extern void *zio_data_buf_alloc(size_t size); extern void zio_data_buf_free(void *buf, size_t size); -extern void zio_push_transform(zio_t *zio, void *data, uint64_t size, +extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform); extern void zio_pop_transforms(zio_t *zio); extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, - uint64_t offset, void *data, uint64_t size, int type, + uint64_t offset, struct abd *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private); extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, - void *data, uint64_t size, int type, zio_priority_t priority, + struct abd *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private); extern void zio_vdev_io_bypass(zio_t *zio); diff --git a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h index 2f7579fd73..3eda057eae 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h +++ b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its 
affiliates. All rights reserved. - * Copyright (c) 2014, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014, 2016 by Delphix. All rights reserved. * Copyright Saso Kiselkov 2013, All rights reserved. */ @@ -34,10 +34,12 @@ extern "C" { #endif +struct abd; + /* * Signature for checksum functions. */ -typedef void zio_checksum_t(const void *data, uint64_t size, +typedef void zio_checksum_t(struct abd *, uint64_t size, const void *ctx_template, zio_cksum_t *zcp); typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt); typedef void zio_checksum_tmpl_free_t(void *ctx_template); @@ -81,28 +83,28 @@ extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; /* * Checksum routines. */ -extern zio_checksum_t zio_checksum_SHA256; -extern zio_checksum_t zio_checksum_SHA512_native; -extern zio_checksum_t zio_checksum_SHA512_byteswap; +extern zio_checksum_t abd_checksum_SHA256; +extern zio_checksum_t abd_checksum_SHA512_native; +extern zio_checksum_t abd_checksum_SHA512_byteswap; /* Skein */ -extern zio_checksum_t zio_checksum_skein_native; -extern zio_checksum_t zio_checksum_skein_byteswap; -extern zio_checksum_tmpl_init_t zio_checksum_skein_tmpl_init; -extern zio_checksum_tmpl_free_t zio_checksum_skein_tmpl_free; +extern zio_checksum_t abd_checksum_skein_native; +extern zio_checksum_t abd_checksum_skein_byteswap; +extern zio_checksum_tmpl_init_t abd_checksum_skein_tmpl_init; +extern zio_checksum_tmpl_free_t abd_checksum_skein_tmpl_free; /* Edon-R */ -extern zio_checksum_t zio_checksum_edonr_native; -extern zio_checksum_t zio_checksum_edonr_byteswap; -extern zio_checksum_tmpl_init_t zio_checksum_edonr_tmpl_init; -extern zio_checksum_tmpl_free_t zio_checksum_edonr_tmpl_free; +extern zio_checksum_t abd_checksum_edonr_native; +extern zio_checksum_t abd_checksum_edonr_byteswap; +extern zio_checksum_tmpl_init_t abd_checksum_edonr_tmpl_init; +extern zio_checksum_tmpl_free_t abd_checksum_edonr_tmpl_free; extern int zio_checksum_equal(spa_t *, 
blkptr_t *, enum zio_checksum, void *, uint64_t, uint64_t, zio_bad_cksum_t *); -extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, - void *data, uint64_t size); +extern void zio_checksum_compute(zio_t *, enum zio_checksum, + struct abd *, uint64_t); extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum, - void *, uint64_t, uint64_t, zio_bad_cksum_t *); + struct abd *, uint64_t, uint64_t, zio_bad_cksum_t *); extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); extern enum zio_checksum spa_dedup_checksum(spa_t *spa); extern void zio_checksum_templates_free(spa_t *spa); diff --git a/usr/src/uts/common/fs/zfs/sys/zio_compress.h b/usr/src/uts/common/fs/zfs/sys/zio_compress.h index 0c1783b140..bcffa699b5 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio_compress.h +++ b/usr/src/uts/common/fs/zfs/sys/zio_compress.h @@ -25,12 +25,14 @@ */ /* * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - * Copyright (c) 2015 by Delphix. All rights reserved. + * Copyright (c) 2015, 2016 by Delphix. All rights reserved. */ #ifndef _SYS_ZIO_COMPRESS_H #define _SYS_ZIO_COMPRESS_H +#include <sys/abd.h> + #ifdef __cplusplus extern "C" { #endif @@ -61,15 +63,22 @@ typedef size_t zio_compress_func_t(void *src, void *dst, /* Common signature for all zio decompress functions. */ typedef int zio_decompress_func_t(void *src, void *dst, size_t s_len, size_t d_len, int); +/* + * Common signature for all zio decompress functions using an ABD as input. + * This is helpful if you have both compressed ARC and scatter ABDs enabled, + * but is not a requirement for all compression algorithms. + */ +typedef int zio_decompress_abd_func_t(abd_t *src, void *dst, + size_t s_len, size_t d_len, int); /* * Information about each compression function. 
*/ typedef struct zio_compress_info { - zio_compress_func_t *ci_compress; /* compression function */ - zio_decompress_func_t *ci_decompress; /* decompression function */ - int ci_level; /* level parameter */ - char *ci_name; /* algorithm name */ + char *ci_name; + int ci_level; + zio_compress_func_t *ci_compress; + zio_decompress_func_t *ci_decompress; } zio_compress_info_t; extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS]; @@ -97,9 +106,11 @@ extern int lz4_decompress(void *src, void *dst, size_t s_len, size_t d_len, /* * Compress and decompress data if necessary. */ -extern size_t zio_compress_data(enum zio_compress c, void *src, void *dst, +extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len); -extern int zio_decompress_data(enum zio_compress c, void *src, void *dst, +extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, + size_t s_len, size_t d_len); +extern int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst, size_t s_len, size_t d_len); #ifdef __cplusplus diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index a081deb7ea..d43745cd11 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -45,6 +45,7 @@ #include <sys/arc.h> #include <sys/zil.h> #include <sys/dsl_scan.h> +#include <sys/abd.h> /* * Virtual device management. 
@@ -961,16 +962,16 @@ vdev_probe_done(zio_t *zio) vps->vps_readable = 1; if (zio->io_error == 0 && spa_writeable(spa)) { zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, - zio->io_offset, zio->io_size, zio->io_data, + zio->io_offset, zio->io_size, zio->io_abd, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); } else { - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); } } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_error == 0) vps->vps_writeable = 1; - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; @@ -1086,8 +1087,8 @@ vdev_probe(vdev_t *vd, zio_t *zio) for (int l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, - offsetof(vdev_label_t, vl_pad2)), - VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE), + offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE, + abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } diff --git a/usr/src/uts/common/fs/zfs/vdev_cache.c b/usr/src/uts/common/fs/zfs/vdev_cache.c index a6d6cfa61b..9b4755321d 100644 --- a/usr/src/uts/common/fs/zfs/vdev_cache.c +++ b/usr/src/uts/common/fs/zfs/vdev_cache.c @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -31,6 +31,7 @@ #include <sys/vdev_impl.h> #include <sys/zio.h> #include <sys/kstat.h> +#include <sys/abd.h> /* * Virtual device read-ahead caching. 
@@ -141,12 +142,12 @@ static void vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) { ASSERT(MUTEX_HELD(&vc->vc_lock)); - ASSERT(ve->ve_fill_io == NULL); - ASSERT(ve->ve_data != NULL); + ASSERT3P(ve->ve_fill_io, ==, NULL); + ASSERT3P(ve->ve_abd, !=, NULL); avl_remove(&vc->vc_lastused_tree, ve); avl_remove(&vc->vc_offset_tree, ve); - zio_buf_free(ve->ve_data, VCBS); + abd_free(ve->ve_abd); kmem_free(ve, sizeof (vdev_cache_entry_t)); } @@ -176,14 +177,14 @@ vdev_cache_allocate(zio_t *zio) ve = avl_first(&vc->vc_lastused_tree); if (ve->ve_fill_io != NULL) return (NULL); - ASSERT(ve->ve_hits != 0); + ASSERT3U(ve->ve_hits, !=, 0); vdev_cache_evict(vc, ve); } ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); ve->ve_offset = offset; ve->ve_lastused = ddi_get_lbolt(); - ve->ve_data = zio_buf_alloc(VCBS); + ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE); avl_add(&vc->vc_offset_tree, ve); avl_add(&vc->vc_lastused_tree, ve); @@ -197,7 +198,7 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); ASSERT(MUTEX_HELD(&vc->vc_lock)); - ASSERT(ve->ve_fill_io == NULL); + ASSERT3P(ve->ve_fill_io, ==, NULL); if (ve->ve_lastused != ddi_get_lbolt()) { avl_remove(&vc->vc_lastused_tree, ve); @@ -206,7 +207,7 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) } ve->ve_hits++; - bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size); + abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size); } /* @@ -220,16 +221,16 @@ vdev_cache_fill(zio_t *fio) vdev_cache_entry_t *ve = fio->io_private; zio_t *pio; - ASSERT(fio->io_size == VCBS); + ASSERT3U(fio->io_size, ==, VCBS); /* * Add data to the cache. 
*/ mutex_enter(&vc->vc_lock); - ASSERT(ve->ve_fill_io == fio); - ASSERT(ve->ve_offset == fio->io_offset); - ASSERT(ve->ve_data == fio->io_data); + ASSERT3P(ve->ve_fill_io, ==, fio); + ASSERT3U(ve->ve_offset, ==, fio->io_offset); + ASSERT3P(ve->ve_abd, ==, fio->io_abd); ve->ve_fill_io = NULL; @@ -260,7 +261,7 @@ vdev_cache_read(zio_t *zio) uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); zio_t *fio; - ASSERT(zio->io_type == ZIO_TYPE_READ); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); if (zio->io_flags & ZIO_FLAG_DONT_CACHE) return (B_FALSE); @@ -274,7 +275,7 @@ vdev_cache_read(zio_t *zio) if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) return (B_FALSE); - ASSERT(cache_phase + zio->io_size <= VCBS); + ASSERT3U(cache_phase + zio->io_size, <=, VCBS); mutex_enter(&vc->vc_lock); @@ -311,7 +312,7 @@ vdev_cache_read(zio_t *zio) } fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, - ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW, + ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); ve->ve_fill_io = fio; @@ -339,7 +340,7 @@ vdev_cache_write(zio_t *zio) uint64_t max_offset = P2ROUNDUP(io_end, VCBS); avl_index_t where; - ASSERT(zio->io_type == ZIO_TYPE_WRITE); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); mutex_enter(&vc->vc_lock); @@ -356,8 +357,9 @@ vdev_cache_write(zio_t *zio) if (ve->ve_fill_io != NULL) { ve->ve_missed_update = 1; } else { - bcopy((char *)zio->io_data + start - io_start, - ve->ve_data + start - ve->ve_offset, end - start); + abd_copy_off(ve->ve_abd, zio->io_abd, + start - ve->ve_offset, start - io_start, + end - start); } ve = AVL_NEXT(&vc->vc_offset_tree, ve); } diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index d6c16fce75..056d356f27 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -31,6 +31,7 @@ #include <sys/refcount.h> #include <sys/vdev_disk.h> #include <sys/vdev_impl.h> +#include <sys/abd.h> 
#include <sys/fs/zfs.h> #include <sys/zio.h> #include <sys/sunldi.h> @@ -667,6 +668,12 @@ vdev_disk_io_intr(buf_t *bp) if (zio->io_error == 0 && bp->b_resid != 0) zio->io_error = SET_ERROR(EIO); + if (zio->io_type == ZIO_TYPE_READ) { + abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size); + } else { + abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size); + } + kmem_free(vb, sizeof (vdev_buf_t)); zio_delay_interrupt(zio); @@ -778,7 +785,15 @@ vdev_disk_io_start(zio_t *zio) if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) bp->b_flags |= B_FAILFAST; bp->b_bcount = zio->io_size; - bp->b_un.b_addr = zio->io_data; + + if (zio->io_type == ZIO_TYPE_READ) { + bp->b_un.b_addr = + abd_borrow_buf(zio->io_abd, zio->io_size); + } else { + bp->b_un.b_addr = + abd_borrow_buf_copy(zio->io_abd, zio->io_size); + } + bp->b_lblkno = lbtodb(zio->io_offset); bp->b_bufsize = zio->io_size; bp->b_iodone = (int (*)())vdev_disk_io_intr; diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c index 633621b0dd..147e693967 100644 --- a/usr/src/uts/common/fs/zfs/vdev_file.c +++ b/usr/src/uts/common/fs/zfs/vdev_file.c @@ -31,6 +31,7 @@ #include <sys/zio.h> #include <sys/fs/zfs.h> #include <sys/fm/fs/zfs.h> +#include <sys/abd.h> /* * Virtual device vector for files. @@ -157,6 +158,12 @@ vdev_file_io_intr(buf_t *bp) if (zio->io_error == 0 && bp->b_resid != 0) zio->io_error = SET_ERROR(ENOSPC); + if (zio->io_type == ZIO_TYPE_READ) { + abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size); + } else { + abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size); + } + kmem_free(vb, sizeof (vdev_buf_t)); zio_delay_interrupt(zio); } @@ -222,7 +229,15 @@ vdev_file_io_start(zio_t *zio) bioinit(bp); bp->b_flags = (zio->io_type == ZIO_TYPE_READ ? 
B_READ : B_WRITE); bp->b_bcount = zio->io_size; - bp->b_un.b_addr = zio->io_data; + + if (zio->io_type == ZIO_TYPE_READ) { + bp->b_un.b_addr = + abd_borrow_buf(zio->io_abd, zio->io_size); + } else { + bp->b_un.b_addr = + abd_borrow_buf_copy(zio->io_abd, zio->io_size); + } + bp->b_lblkno = lbtodb(zio->io_offset); bp->b_bufsize = zio->io_size; bp->b_private = vf->vf_vnode; diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c index 866046315c..b76589f0f6 100644 --- a/usr/src/uts/common/fs/zfs/vdev_label.c +++ b/usr/src/uts/common/fs/zfs/vdev_label.c @@ -145,6 +145,7 @@ #include <sys/metaslab.h> #include <sys/zio.h> #include <sys/dsl_scan.h> +#include <sys/abd.h> #include <sys/fs/zfs.h> /* @@ -178,7 +179,7 @@ vdev_label_number(uint64_t psize, uint64_t offset) } static void -vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, +vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) == @@ -192,7 +193,7 @@ vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, } static void -vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, +vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL || @@ -444,6 +445,7 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) spa_t *spa = vd->vdev_spa; nvlist_t *config = NULL; vdev_phys_t *vp; + abd_t *vp_abd; zio_t *zio; uint64_t best_txg = 0; int error = 0; @@ -455,7 +457,8 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) if (!vdev_readable(vd)) return (NULL); - vp = zio_buf_alloc(sizeof (vdev_phys_t)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); + vp = abd_to_buf(vp_abd); retry: for (int l = 0; l < VDEV_LABELS; l++) { @@ 
-463,7 +466,7 @@ retry: zio = zio_root(spa, NULL, NULL, flags); - vdev_label_read(zio, vd, l, vp, + vdev_label_read(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); @@ -502,7 +505,7 @@ retry: goto retry; } - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(vp_abd); return (config); } @@ -636,8 +639,10 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) spa_t *spa = vd->vdev_spa; nvlist_t *label; vdev_phys_t *vp; - char *pad2; + abd_t *vp_abd; + abd_t *pad2; uberblock_t *ub; + abd_t *ub_abd; zio_t *zio; char *buf; size_t buflen; @@ -719,8 +724,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * Initialize its label. */ - vp = zio_buf_alloc(sizeof (vdev_phys_t)); - bzero(vp, sizeof (vdev_phys_t)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); + abd_zero(vp_abd, sizeof (vdev_phys_t)); + vp = abd_to_buf(vp_abd); /* * Generate a label describing the pool and our top-level vdev. @@ -780,7 +786,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); if (error != 0) { nvlist_free(label); - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(vp_abd); /* EFAULT means nvlist_pack ran out of room */ return (error == EFAULT ? ENAMETOOLONG : EINVAL); } @@ -788,14 +794,15 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * Initialize uberblock template. */ - ub = zio_buf_alloc(VDEV_UBERBLOCK_RING); - bzero(ub, VDEV_UBERBLOCK_RING); - *ub = spa->spa_uberblock; + ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE); + abd_zero(ub_abd, VDEV_UBERBLOCK_RING); + abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t)); + ub = abd_to_buf(ub_abd); ub->ub_txg = 0; /* Initialize the 2nd padding area. 
*/ - pad2 = zio_buf_alloc(VDEV_PAD_SIZE); - bzero(pad2, VDEV_PAD_SIZE); + pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); + abd_zero(pad2, VDEV_PAD_SIZE); /* * Write everything in parallel. @@ -805,7 +812,7 @@ retry: for (int l = 0; l < VDEV_LABELS; l++) { - vdev_label_write(zio, vd, l, vp, + vdev_label_write(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); @@ -818,7 +825,7 @@ retry: offsetof(vdev_label_t, vl_pad2), VDEV_PAD_SIZE, NULL, NULL, flags); - vdev_label_write(zio, vd, l, ub, + vdev_label_write(zio, vd, l, ub_abd, offsetof(vdev_label_t, vl_uberblock), VDEV_UBERBLOCK_RING, NULL, NULL, flags); } @@ -831,9 +838,9 @@ retry: } nvlist_free(label); - zio_buf_free(pad2, VDEV_PAD_SIZE); - zio_buf_free(ub, VDEV_UBERBLOCK_RING); - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(pad2); + abd_free(ub_abd); + abd_free(vp_abd); /* * If this vdev hasn't been previously identified as a spare, then we @@ -897,7 +904,7 @@ vdev_uberblock_load_done(zio_t *zio) vdev_t *vd = zio->io_vd; spa_t *spa = zio->io_spa; zio_t *rio = zio->io_private; - uberblock_t *ub = zio->io_data; + uberblock_t *ub = abd_to_buf(zio->io_abd); struct ubl_cbdata *cbp = rio->io_private; ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd)); @@ -918,7 +925,7 @@ vdev_uberblock_load_done(zio_t *zio) mutex_exit(&rio->io_lock); } - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); } static void @@ -932,8 +939,8 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, for (int l = 0; l < VDEV_LABELS; l++) { for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_read(zio, vd, l, - zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)), - VDEV_UBERBLOCK_OFFSET(vd, n), + abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), + B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), vdev_uberblock_load_done, zio, flags); } @@ -1000,9 +1007,6 @@ vdev_uberblock_sync_done(zio_t *zio) static void vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, 
int flags) { - uberblock_t *ubbuf; - int n; - for (int c = 0; c < vd->vdev_children; c++) vdev_uberblock_sync(zio, ub, vd->vdev_child[c], flags); @@ -1012,19 +1016,20 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) if (!vdev_writeable(vd)) return; - n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1); + int n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1); - ubbuf = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)); - bzero(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); - *ubbuf = *ub; + /* Copy the uberblock_t into the ABD */ + abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); + abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); + abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t)); for (int l = 0; l < VDEV_LABELS; l++) - vdev_label_write(zio, vd, l, ubbuf, + vdev_label_write(zio, vd, l, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), vdev_uberblock_sync_done, zio->io_private, flags | ZIO_FLAG_DONT_PROPAGATE); - zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); + abd_free(ub_abd); } /* Sync the uberblocks to all vdevs in svd[] */ @@ -1100,6 +1105,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) { nvlist_t *label; vdev_phys_t *vp; + abd_t *vp_abd; char *buf; size_t buflen; @@ -1117,15 +1123,16 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) */ label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE); - vp = zio_buf_alloc(sizeof (vdev_phys_t)); - bzero(vp, sizeof (vdev_phys_t)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); + abd_zero(vp_abd, sizeof (vdev_phys_t)); + vp = abd_to_buf(vp_abd); buf = vp->vp_nvlist; buflen = sizeof (vp->vp_nvlist); if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0) { for (; l < VDEV_LABELS; l += 2) { - vdev_label_write(zio, vd, l, vp, + vdev_label_write(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), vdev_label_sync_done, zio->io_private, @@ -1133,7 +1140,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, 
int l, uint64_t txg, int flags) } } - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(vp_abd); nvlist_free(label); } diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c index b038ef6f67..a57bd6c73b 100644 --- a/usr/src/uts/common/fs/zfs/vdev_mirror.c +++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c @@ -31,6 +31,7 @@ #include <sys/spa.h> #include <sys/vdev_impl.h> #include <sys/zio.h> +#include <sys/abd.h> #include <sys/fs/zfs.h> /* @@ -196,13 +197,12 @@ vdev_mirror_scrub_done(zio_t *zio) while ((pio = zio_walk_parents(zio, &zl)) != NULL) { mutex_enter(&pio->io_lock); ASSERT3U(zio->io_size, >=, pio->io_size); - bcopy(zio->io_data, pio->io_data, pio->io_size); + abd_copy(pio->io_abd, zio->io_abd, pio->io_size); mutex_exit(&pio->io_lock); } mutex_exit(&zio->io_lock); } - - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); mc->mc_error = zio->io_error; mc->mc_tried = 1; @@ -282,7 +282,8 @@ vdev_mirror_io_start(zio_t *zio) mc = &mm->mm_child[c]; zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, - zio_buf_alloc(zio->io_size), zio->io_size, + abd_alloc_sametype(zio->io_abd, + zio->io_size), zio->io_size, zio->io_type, zio->io_priority, 0, vdev_mirror_scrub_done, mc)); } @@ -307,7 +308,7 @@ vdev_mirror_io_start(zio_t *zio) while (children--) { mc = &mm->mm_child[c]; zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, + mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, zio->io_type, zio->io_priority, 0, vdev_mirror_child_done, mc)); c++; @@ -392,7 +393,7 @@ vdev_mirror_io_done(zio_t *zio) mc = &mm->mm_child[c]; zio_vdev_io_redone(zio); zio_nowait(zio_vdev_child_io(zio, zio->io_bp, - mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, + mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, ZIO_TYPE_READ, zio->io_priority, 0, vdev_mirror_child_done, mc)); return; @@ -433,7 +434,7 @@ vdev_mirror_io_done(zio_t *zio) 
zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, - zio->io_data, zio->io_size, + zio->io_abd, zio->io_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index 5211996f6a..9665ce9957 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -37,6 +37,7 @@ #include <sys/dsl_pool.h> #include <sys/zfs_zone.h> #include <sys/metaslab_impl.h> +#include <sys/abd.h> /* * ZFS I/O Scheduler @@ -377,12 +378,12 @@ vdev_queue_agg_io_done(zio_t *aio) zio_t *pio; zio_link_t *zl = NULL; while ((pio = zio_walk_parents(aio, &zl)) != NULL) { - bcopy((char *)aio->io_data + (pio->io_offset - - aio->io_offset), pio->io_data, pio->io_size); + abd_copy_off(pio->io_abd, aio->io_abd, + 0, pio->io_offset - aio->io_offset, pio->io_size); } } - zio_buf_free(aio->io_data, aio->io_size); + abd_free(aio->io_abd); } static int @@ -617,8 +618,8 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) ASSERT3U(size, <=, zfs_vdev_aggregation_limit); aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, - zio_buf_alloc(size), size, first->io_type, zio->io_priority, - flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, + abd_alloc_for_io(size, B_TRUE), size, first->io_type, + zio->io_priority, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); aio->io_timestamp = first->io_timestamp; @@ -630,12 +631,11 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) if (dio->io_flags & ZIO_FLAG_NODATA) { ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); - bzero((char *)aio->io_data + (dio->io_offset - - aio->io_offset), dio->io_size); + abd_zero_off(aio->io_abd, + dio->io_offset - aio->io_offset, dio->io_size); } else if (dio->io_type == ZIO_TYPE_WRITE) { - bcopy(dio->io_data, (char *)aio->io_data + - (dio->io_offset - aio->io_offset), - dio->io_size); + 
abd_copy_off(aio->io_abd, dio->io_abd, + dio->io_offset - aio->io_offset, 0, dio->io_size); } zio_add_child(dio, aio); diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c index ff06896e8d..4b77438877 100644 --- a/usr/src/uts/common/fs/zfs/vdev_raidz.c +++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c @@ -34,6 +34,7 @@ #include <sys/vdev_raidz.h> #include <sys/zio.h> #include <sys/zio_checksum.h> +#include <sys/abd.h> #include <sys/fs/zfs.h> #include <sys/fm/fs/zfs.h> @@ -108,7 +109,7 @@ typedef struct raidz_col { uint64_t rc_devidx; /* child device index for I/O */ uint64_t rc_offset; /* device offset */ uint64_t rc_size; /* I/O size */ - void *rc_data; /* I/O data */ + abd_t *rc_abd; /* I/O data */ void *rc_gdata; /* used to store the "good" version */ int rc_error; /* I/O error for this device */ uint8_t rc_tried; /* Did we attempt this I/O column? */ @@ -125,7 +126,7 @@ typedef struct raidz_map { uint64_t rm_firstdatacol; /* First data column/parity count */ uint64_t rm_nskip; /* Skipped sectors for padding */ uint64_t rm_skipstart; /* Column index of padding start */ - void *rm_datacopy; /* rm_asize-buffer of copied data */ + abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */ uintptr_t rm_reports; /* # of referencing checksum reports */ uint8_t rm_freed; /* map no longer has referencing ZIO */ uint8_t rm_ecksuminjected; /* checksum error was injected */ @@ -265,7 +266,7 @@ vdev_raidz_map_free(raidz_map_t *rm) size_t size; for (c = 0; c < rm->rm_firstdatacol; c++) { - zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); + abd_free(rm->rm_col[c].rc_abd); if (rm->rm_col[c].rc_gdata != NULL) zio_buf_free(rm->rm_col[c].rc_gdata, @@ -273,11 +274,13 @@ vdev_raidz_map_free(raidz_map_t *rm) } size = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + abd_put(rm->rm_col[c].rc_abd); size += rm->rm_col[c].rc_size; + } - if (rm->rm_datacopy != NULL) - 
zio_buf_free(rm->rm_datacopy, size); + if (rm->rm_abd_copy != NULL) + abd_free(rm->rm_abd_copy); kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); } @@ -314,7 +317,7 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) size_t x; const char *good = NULL; - const char *bad = rm->rm_col[c].rc_data; + char *bad; if (good_data == NULL) { zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); @@ -328,8 +331,9 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) * data never changes for a given logical ZIO) */ if (rm->rm_col[0].rc_gdata == NULL) { - char *bad_parity[VDEV_RAIDZ_MAXPARITY]; + abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; char *buf; + int offset; /* * Set up the rm_col[]s to generate the parity for @@ -337,15 +341,20 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) * replacing them with buffers to hold the result. */ for (x = 0; x < rm->rm_firstdatacol; x++) { - bad_parity[x] = rm->rm_col[x].rc_data; - rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = + bad_parity[x] = rm->rm_col[x].rc_abd; + rm->rm_col[x].rc_gdata = zio_buf_alloc(rm->rm_col[x].rc_size); + rm->rm_col[x].rc_abd = + abd_get_from_buf(rm->rm_col[x].rc_gdata, + rm->rm_col[x].rc_size); } /* fill in the data columns from good_data */ buf = (char *)good_data; for (; x < rm->rm_cols; x++) { - rm->rm_col[x].rc_data = buf; + abd_put(rm->rm_col[x].rc_abd); + rm->rm_col[x].rc_abd = abd_get_from_buf(buf, + rm->rm_col[x].rc_size); buf += rm->rm_col[x].rc_size; } @@ -355,13 +364,17 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) vdev_raidz_generate_parity(rm); /* restore everything back to its original state */ - for (x = 0; x < rm->rm_firstdatacol; x++) - rm->rm_col[x].rc_data = bad_parity[x]; + for (x = 0; x < rm->rm_firstdatacol; x++) { + abd_put(rm->rm_col[x].rc_abd); + rm->rm_col[x].rc_abd = bad_parity[x]; + } - buf = rm->rm_datacopy; + offset = 0; for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { - 
rm->rm_col[x].rc_data = buf; - buf += rm->rm_col[x].rc_size; + abd_put(rm->rm_col[x].rc_abd); + rm->rm_col[x].rc_abd = abd_get_offset( + rm->rm_abd_copy, offset); + offset += rm->rm_col[x].rc_size; } } @@ -375,8 +388,10 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) good += rm->rm_col[x].rc_size; } + bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size); /* we drop the ereport if it ends up that the data was good */ zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); + abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size); } /* @@ -389,7 +404,7 @@ static void vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) { size_t c = (size_t)(uintptr_t)arg; - caddr_t buf; + size_t offset; raidz_map_t *rm = zio->io_vsd; size_t size; @@ -403,7 +418,7 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) rm->rm_reports++; ASSERT3U(rm->rm_reports, >, 0); - if (rm->rm_datacopy != NULL) + if (rm->rm_abd_copy != NULL) return; /* @@ -419,17 +434,20 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) size += rm->rm_col[c].rc_size; - buf = rm->rm_datacopy = zio_buf_alloc(size); + rm->rm_abd_copy = + abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { raidz_col_t *col = &rm->rm_col[c]; + abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset); - bcopy(col->rc_data, buf, col->rc_size); - col->rc_data = buf; + abd_copy(tmp, col->rc_abd, col->rc_size); + abd_put(col->rc_abd); + col->rc_abd = tmp; - buf += col->rc_size; + offset += col->rc_size; } - ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); + ASSERT3U(offset, ==, size); } static const zio_vsd_ops_t vdev_raidz_vsd_ops = { @@ -442,7 +460,7 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = { * the number of children in the target 
vdev. */ static raidz_map_t * -vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, +vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, uint64_t unit_shift, uint64_t dcols, uint64_t nparity) { raidz_map_t *rm; @@ -455,6 +473,7 @@ vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, /* The starting byte offset on each child vdev. */ uint64_t o = (b / dcols) << unit_shift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; + uint64_t off = 0; /* * "Quotient": The number of data sectors for this stripe on all but @@ -499,7 +518,7 @@ vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, rm->rm_missingdata = 0; rm->rm_missingparity = 0; rm->rm_firstdatacol = nparity; - rm->rm_datacopy = NULL; + rm->rm_abd_copy = NULL; rm->rm_reports = 0; rm->rm_freed = 0; rm->rm_ecksuminjected = 0; @@ -515,7 +534,7 @@ vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, } rm->rm_col[c].rc_devidx = col; rm->rm_col[c].rc_offset = coff; - rm->rm_col[c].rc_data = NULL; + rm->rm_col[c].rc_abd = NULL; rm->rm_col[c].rc_gdata = NULL; rm->rm_col[c].rc_error = 0; rm->rm_col[c].rc_tried = 0; @@ -538,13 +557,16 @@ vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, ASSERT3U(rm->rm_nskip, <=, nparity); for (c = 0; c < rm->rm_firstdatacol; c++) - rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); + rm->rm_col[c].rc_abd = + abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE); - rm->rm_col[c].rc_data = data; + rm->rm_col[c].rc_abd = abd_get_offset(abd, 0); + off = rm->rm_col[c].rc_size; - for (c = c + 1; c < acols; c++) - rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + - rm->rm_col[c - 1].rc_size; + for (c = c + 1; c < acols; c++) { + rm->rm_col[c].rc_abd = abd_get_offset(abd, off); + off += rm->rm_col[c].rc_size; + } /* * If all data stored spans all columns, there's a danger that parity @@ -584,29 +606,84 @@ vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, return 
(rm); } +struct pqr_struct { + uint64_t *p; + uint64_t *q; + uint64_t *r; +}; + +static int +vdev_raidz_p_func(void *buf, size_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && !pqr->q && !pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++) + *pqr->p ^= *src; + + return (0); +} + +static int +vdev_raidz_pq_func(void *buf, size_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + uint64_t mask; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && pqr->q && !pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { + *pqr->p ^= *src; + VDEV_RAIDZ_64MUL_2(*pqr->q, mask); + *pqr->q ^= *src; + } + + return (0); +} + +static int +vdev_raidz_pqr_func(void *buf, size_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + uint64_t mask; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && pqr->q && pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { + *pqr->p ^= *src; + VDEV_RAIDZ_64MUL_2(*pqr->q, mask); + *pqr->q ^= *src; + VDEV_RAIDZ_64MUL_4(*pqr->r, mask); + *pqr->r ^= *src; + } + + return (0); +} + static void vdev_raidz_generate_parity_p(raidz_map_t *rm) { - uint64_t *p, *src, pcount, ccount, i; + uint64_t *p; int c; - - pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + abd_t *src; for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_data; - p = rm->rm_col[VDEV_RAIDZ_P].rc_data; - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + src = rm->rm_col[c].rc_abd; + p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); if (c == rm->rm_firstdatacol) { - ASSERT(ccount == pcount); - for (i = 0; i < ccount; i++, src++, p++) { - *p = *src; - } + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); } else { - ASSERT(ccount <= pcount); - for (i = 0; i < ccount; i++, src++, p++) { - *p ^= *src; - } + struct pqr_struct pqr = 
{ p, NULL, NULL }; + (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + vdev_raidz_p_func, &pqr); } } } @@ -614,50 +691,43 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm) static void vdev_raidz_generate_parity_pq(raidz_map_t *rm) { - uint64_t *p, *q, *src, pcnt, ccnt, mask, i; + uint64_t *p, *q, pcnt, ccnt, mask, i; int c; + abd_t *src; - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_Q].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_data; - p = rm->rm_col[VDEV_RAIDZ_P].rc_data; - q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + src = rm->rm_col[c].rc_abd; + p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + + ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); - ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); if (c == rm->rm_firstdatacol) { - ASSERT(ccnt == pcnt || ccnt == 0); - for (i = 0; i < ccnt; i++, src++, p++, q++) { - *p = *src; - *q = *src; - } - for (; i < pcnt; i++, src++, p++, q++) { - *p = 0; - *q = 0; - } + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + (void) memcpy(q, p, rm->rm_col[c].rc_size); } else { - ASSERT(ccnt <= pcnt); - - /* - * Apply the algorithm described above by multiplying - * the previous result and adding in the new value. - */ - for (i = 0; i < ccnt; i++, src++, p++, q++) { - *p ^= *src; + struct pqr_struct pqr = { p, q, NULL }; + (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + vdev_raidz_pq_func, &pqr); + } - VDEV_RAIDZ_64MUL_2(*q, mask); - *q ^= *src; + if (c == rm->rm_firstdatacol) { + for (i = ccnt; i < pcnt; i++) { + p[i] = 0; + q[i] = 0; } - + } else { /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. 
*/ - for (; i < pcnt; i++, q++) { - VDEV_RAIDZ_64MUL_2(*q, mask); + for (i = ccnt; i < pcnt; i++) { + VDEV_RAIDZ_64MUL_2(q[i], mask); } } } @@ -666,59 +736,48 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm) static void vdev_raidz_generate_parity_pqr(raidz_map_t *rm) { - uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; + uint64_t *p, *q, *r, pcnt, ccnt, mask, i; int c; + abd_t *src; - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_Q].rc_size); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_R].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_data; - p = rm->rm_col[VDEV_RAIDZ_P].rc_data; - q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; - r = rm->rm_col[VDEV_RAIDZ_R].rc_data; + src = rm->rm_col[c].rc_abd; + p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd); - ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); + ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); if (c == rm->rm_firstdatacol) { - ASSERT(ccnt == pcnt || ccnt == 0); - for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { - *p = *src; - *q = *src; - *r = *src; - } - for (; i < pcnt; i++, src++, p++, q++, r++) { - *p = 0; - *q = 0; - *r = 0; - } + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + (void) memcpy(q, p, rm->rm_col[c].rc_size); + (void) memcpy(r, p, rm->rm_col[c].rc_size); } else { - ASSERT(ccnt <= pcnt); - - /* - * Apply the algorithm described above by multiplying - * the previous result and adding in the new value. 
- */ - for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { - *p ^= *src; - - VDEV_RAIDZ_64MUL_2(*q, mask); - *q ^= *src; + struct pqr_struct pqr = { p, q, r }; + (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + vdev_raidz_pqr_func, &pqr); + } - VDEV_RAIDZ_64MUL_4(*r, mask); - *r ^= *src; + if (c == rm->rm_firstdatacol) { + for (i = ccnt; i < pcnt; i++) { + p[i] = 0; + q[i] = 0; + r[i] = 0; } - + } else { /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. */ - for (; i < pcnt; i++, q++, r++) { - VDEV_RAIDZ_64MUL_2(*q, mask); - VDEV_RAIDZ_64MUL_4(*r, mask); + for (i = ccnt; i < pcnt; i++) { + VDEV_RAIDZ_64MUL_2(q[i], mask); + VDEV_RAIDZ_64MUL_4(r[i], mask); } } } @@ -746,40 +805,153 @@ vdev_raidz_generate_parity(raidz_map_t *rm) } } +/* ARGSUSED */ +static int +vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) +{ + uint64_t *dst = dbuf; + uint64_t *src = sbuf; + int cnt = size / sizeof (src[0]); + + for (int i = 0; i < cnt; i++) { + dst[i] ^= src[i]; + } + + return (0); +} + +/* ARGSUSED */ +static int +vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, + void *private) +{ + uint64_t *dst = dbuf; + uint64_t *src = sbuf; + uint64_t mask; + int cnt = size / sizeof (dst[0]); + + for (int i = 0; i < cnt; i++, dst++, src++) { + VDEV_RAIDZ_64MUL_2(*dst, mask); + *dst ^= *src; + } + + return (0); +} + +/* ARGSUSED */ +static int +vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) +{ + uint64_t *dst = buf; + uint64_t mask; + int cnt = size / sizeof (dst[0]); + + for (int i = 0; i < cnt; i++, dst++) { + /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ + VDEV_RAIDZ_64MUL_2(*dst, mask); + } + + return (0); +} + +struct reconst_q_struct { + uint64_t *q; + int exp; +}; + +static int +vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) +{ + struct reconst_q_struct *rq = private; + uint64_t *dst = buf; + int cnt = size / 
sizeof (dst[0]); + + for (int i = 0; i < cnt; i++, dst++, rq->q++) { + *dst ^= *rq->q; + + int j; + uint8_t *b; + for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { + *b = vdev_raidz_exp2(*b, rq->exp); + } + } + + return (0); +} + +struct reconst_pq_struct { + uint8_t *p; + uint8_t *q; + uint8_t *pxy; + uint8_t *qxy; + int aexp; + int bexp; +}; + +static int +vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) +{ + struct reconst_pq_struct *rpq = private; + uint8_t *xd = xbuf; + uint8_t *yd = ybuf; + + for (int i = 0; i < size; + i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { + *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ + vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); + *yd = *rpq->p ^ *rpq->pxy ^ *xd; + } + + return (0); +} + +static int +vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) +{ + struct reconst_pq_struct *rpq = private; + uint8_t *xd = xbuf; + + for (int i = 0; i < size; + i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { + /* same operation as vdev_raidz_reconst_pq_func() on xd */ + *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ + vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); + } + + return (0); +} + static int vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) { - uint64_t *dst, *src, xcount, ccount, count, i; int x = tgts[0]; int c; + abd_t *dst, *src; ASSERT(ntgts == 1); ASSERT(x >= rm->rm_firstdatacol); ASSERT(x < rm->rm_cols); - xcount = rm->rm_col[x].rc_size / sizeof (src[0]); - ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); - ASSERT(xcount > 0); + ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size); + ASSERT(rm->rm_col[x].rc_size > 0); - src = rm->rm_col[VDEV_RAIDZ_P].rc_data; - dst = rm->rm_col[x].rc_data; - for (i = 0; i < xcount; i++, dst++, src++) { - *dst = *src; - } + src = rm->rm_col[VDEV_RAIDZ_P].rc_abd; + dst = rm->rm_col[x].rc_abd; + + abd_copy_from_buf(dst, abd_to_buf(src), 
rm->rm_col[x].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_data; - dst = rm->rm_col[x].rc_data; + uint64_t size = MIN(rm->rm_col[x].rc_size, + rm->rm_col[c].rc_size); + + src = rm->rm_col[c].rc_abd; + dst = rm->rm_col[x].rc_abd; if (c == x) continue; - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); - count = MIN(ccount, xcount); - - for (i = 0; i < count; i++, dst++, src++) { - *dst ^= *src; - } + (void) abd_iterate_func2(dst, src, 0, 0, size, + vdev_raidz_reconst_p_func, NULL); } return (1 << VDEV_RAIDZ_P); @@ -788,57 +960,43 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) static int vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) { - uint64_t *dst, *src, xcount, ccount, count, mask, i; - uint8_t *b; int x = tgts[0]; - int c, j, exp; + int c, exp; + abd_t *dst, *src; ASSERT(ntgts == 1); - xcount = rm->rm_col[x].rc_size / sizeof (src[0]); - ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); + ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_data; - dst = rm->rm_col[x].rc_data; - - if (c == x) - ccount = 0; - else - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + uint64_t size = (c == x) ? 
0 : MIN(rm->rm_col[x].rc_size, + rm->rm_col[c].rc_size); - count = MIN(ccount, xcount); + src = rm->rm_col[c].rc_abd; + dst = rm->rm_col[x].rc_abd; if (c == rm->rm_firstdatacol) { - for (i = 0; i < count; i++, dst++, src++) { - *dst = *src; - } - for (; i < xcount; i++, dst++) { - *dst = 0; - } - + abd_copy(dst, src, size); + if (rm->rm_col[x].rc_size > size) + abd_zero_off(dst, size, + rm->rm_col[x].rc_size - size); } else { - for (i = 0; i < count; i++, dst++, src++) { - VDEV_RAIDZ_64MUL_2(*dst, mask); - *dst ^= *src; - } - - for (; i < xcount; i++, dst++) { - VDEV_RAIDZ_64MUL_2(*dst, mask); - } + ASSERT3U(size, <=, rm->rm_col[x].rc_size); + (void) abd_iterate_func2(dst, src, 0, 0, size, + vdev_raidz_reconst_q_pre_func, NULL); + (void) abd_iterate_func(dst, + size, rm->rm_col[x].rc_size - size, + vdev_raidz_reconst_q_pre_tail_func, NULL); } } - src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; - dst = rm->rm_col[x].rc_data; + src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; + dst = rm->rm_col[x].rc_abd; exp = 255 - (rm->rm_cols - 1 - x); - for (i = 0; i < xcount; i++, dst++, src++) { - *dst ^= *src; - for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { - *b = vdev_raidz_exp2(*b, exp); - } - } + struct reconst_q_struct rq = { abd_to_buf(src), exp }; + (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size, + vdev_raidz_reconst_q_post_func, &rq); return (1 << VDEV_RAIDZ_Q); } @@ -846,11 +1004,12 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) static int vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) { - uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; - void *pdata, *qdata; - uint64_t xsize, ysize, i; + uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; + abd_t *pdata, *qdata; + uint64_t xsize, ysize; int x = tgts[0]; int y = tgts[1]; + abd_t *xd, *yd; ASSERT(ntgts == 2); ASSERT(x < y); @@ -866,15 +1025,15 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) * parity so we make those columns appear to be full of zeros by * 
setting their lengths to zero. */ - pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; - qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd; + qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; xsize = rm->rm_col[x].rc_size; ysize = rm->rm_col[y].rc_size; - rm->rm_col[VDEV_RAIDZ_P].rc_data = - zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); - rm->rm_col[VDEV_RAIDZ_Q].rc_data = - zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); + rm->rm_col[VDEV_RAIDZ_P].rc_abd = + abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE); + rm->rm_col[VDEV_RAIDZ_Q].rc_abd = + abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); rm->rm_col[x].rc_size = 0; rm->rm_col[y].rc_size = 0; @@ -883,12 +1042,12 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) rm->rm_col[x].rc_size = xsize; rm->rm_col[y].rc_size = ysize; - p = pdata; - q = qdata; - pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; - qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; - xd = rm->rm_col[x].rc_data; - yd = rm->rm_col[y].rc_data; + p = abd_to_buf(pdata); + q = abd_to_buf(qdata); + pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + xd = rm->rm_col[x].rc_abd; + yd = rm->rm_col[y].rc_abd; /* * We now have: @@ -912,24 +1071,21 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; - for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { - *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ - vdev_raidz_exp2(*q ^ *qxy, bexp); + ASSERT3U(xsize, >=, ysize); + struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; + (void) abd_iterate_func2(xd, yd, 0, 0, ysize, + vdev_raidz_reconst_pq_func, &rpq); + (void) abd_iterate_func(xd, ysize, xsize - ysize, + vdev_raidz_reconst_pq_tail_func, &rpq); - if (i < ysize) - *yd = *p ^ *pxy ^ *xd; - } - - zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, - rm->rm_col[VDEV_RAIDZ_P].rc_size); - 
zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, - rm->rm_col[VDEV_RAIDZ_Q].rc_size); + abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); /* * Restore the saved parity data. */ - rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; - rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; + rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata; + rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata; return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); } @@ -1244,7 +1400,7 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, c = used[i]; ASSERT3U(c, <, rm->rm_cols); - src = rm->rm_col[c].rc_data; + src = abd_to_buf(rm->rm_col[c].rc_abd); ccount = rm->rm_col[c].rc_size; for (j = 0; j < nmissing; j++) { cc = missing[j] + rm->rm_firstdatacol; @@ -1252,7 +1408,7 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, ASSERT3U(cc, <, rm->rm_cols); ASSERT3U(cc, !=, c); - dst[j] = rm->rm_col[cc].rc_data; + dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd); dcount[j] = rm->rm_col[cc].rc_size; } @@ -1300,8 +1456,25 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; uint8_t *used; + abd_t **bufs = NULL; + int code = 0; + /* + * Matrix reconstruction can't use scatter ABDs yet, so we allocate + * temporary linear ABDs. 
+ */ + if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) { + bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + raidz_col_t *col = &rm->rm_col[c]; + + bufs[c] = col->rc_abd; + col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE); + abd_copy(col->rc_abd, bufs[c], col->rc_size); + } + } n = rm->rm_cols - rm->rm_firstdatacol; @@ -1388,6 +1561,20 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) kmem_free(p, psize); + /* + * copy back from temporary linear abds and free them + */ + if (bufs) { + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + raidz_col_t *col = &rm->rm_col[c]; + + abd_copy(bufs[c], col->rc_abd, col->rc_size); + abd_free(col->rc_abd); + col->rc_abd = bufs[c]; + } + kmem_free(bufs, rm->rm_cols * sizeof (abd_t *)); + } + return (code); } @@ -1619,7 +1806,9 @@ vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size, * treat the on-disk format as if the only blocks are the complete 128 * KB size. */ - rm = vdev_raidz_map_alloc(data - (offset - origoffset), + abd_t *abd = abd_get_from_buf(data - (offset - origoffset), + SPA_OLD_MAXBLOCKSIZE); + rm = vdev_raidz_map_alloc(abd, SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift, vd->vdev_children, vd->vdev_nparity); @@ -1658,13 +1847,14 @@ vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size, * example of why this calculation is needed. 
*/ if ((err = vdev_disk_physio(cvd, - ((char *)rc->rc_data) + colskip, colsize, + ((char *)rc->rc_abd) + colskip, colsize, VDEV_LABEL_OFFSET(rc->rc_offset) + colskip, flags, isdump)) != 0) break; } vdev_raidz_map_free(rm); + abd_put(abd); #endif /* KERNEL */ return (err); @@ -1722,7 +1912,7 @@ vdev_raidz_io_start(zio_t *zio) raidz_col_t *rc; int c, i; - rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset, + rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset, tvd->vdev_ashift, vd->vdev_children, vd->vdev_nparity); @@ -1738,7 +1928,7 @@ vdev_raidz_io_start(zio_t *zio) rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_data, rc->rc_size, + rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } @@ -1795,7 +1985,7 @@ vdev_raidz_io_start(zio_t *zio) if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_data, rc->rc_size, + rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } @@ -1811,6 +2001,7 @@ vdev_raidz_io_start(zio_t *zio) static void raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) { + void *buf; vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { @@ -1824,9 +2015,11 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; + buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size); zfs_ereport_post_checksum(zio->io_spa, vd, zio, - rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, + rc->rc_offset, rc->rc_size, buf, bad_data, &zbc); + abd_return_buf(rc->rc_abd, buf, rc->rc_size); } } @@ -1872,7 +2065,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) if (!rc->rc_tried || rc->rc_error != 0) 
continue; orig[c] = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig[c], rc->rc_size); + abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size); } vdev_raidz_generate_parity(rm); @@ -1881,7 +2074,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; - if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { + if (bcmp(orig[c], abd_to_buf(rc->rc_abd), rc->rc_size) != 0) { raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); ret++; @@ -1989,7 +2182,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) ASSERT3S(c, >=, 0); ASSERT3S(c, <, rm->rm_cols); rc = &rm->rm_col[c]; - bcopy(rc->rc_data, orig[i], rc->rc_size); + abd_copy_to_buf(orig[i], rc->rc_abd, + rc->rc_size); } /* @@ -2020,7 +2214,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) for (i = 0; i < n; i++) { c = tgts[i]; rc = &rm->rm_col[c]; - bcopy(orig[i], rc->rc_data, rc->rc_size); + abd_copy_from_buf(rc->rc_abd, orig[i], + rc->rc_size); } do { @@ -2261,7 +2456,7 @@ vdev_raidz_io_done(zio_t *zio) continue; zio_nowait(zio_vdev_child_io(zio, NULL, vd->vdev_child[rc->rc_devidx], - rc->rc_offset, rc->rc_data, rc->rc_size, + rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } while (++c < rm->rm_cols); @@ -2341,7 +2536,7 @@ done: continue; zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_data, rc->rc_size, + rc->rc_offset, rc->rc_abd, rc->rc_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c index 26c8d793dc..ae72c667cb 100644 --- a/usr/src/uts/common/fs/zfs/zil.c +++ b/usr/src/uts/common/fs/zfs/zil.c @@ -40,6 +40,7 @@ #include <sys/vdev_impl.h> #include <sys/dmu_tx.h> #include <sys/dsl_pool.h> +#include <sys/abd.h> /* * The zfs intent log (ZIL) saves transaction records of system calls @@ -878,6 +879,7 @@ zil_lwb_write_done(zio_t *zio) * one in zil_commit_writer(). zil_sync() will only remove * the lwb if lwb_buf is null. */ + abd_put(zio->io_abd); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); mutex_enter(&zilog->zl_lock); lwb->lwb_buf = NULL; @@ -909,8 +911,10 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) ZIO_FLAG_CANFAIL); } if (lwb->lwb_zio == NULL) { + abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, + BP_GET_LSIZE(&lwb->lwb_blk)); lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, - 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk), + 0, &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb); } diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 4bef635b0a..da09434078 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -43,6 +43,7 @@ #include <sys/zfeature.h> #include <sys/zfs_zone.h> #include <sys/metaslab_impl.h> +#include <sys/abd.h> /* * ========================================================================== @@ -274,12 +275,18 @@ zio_data_buf_free(void *buf, size_t size) * ========================================================================== */ void -zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, +zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); - zt->zt_orig_data = zio->io_data; 
+ /* + * Ensure that anyone expecting this zio to contain a linear ABD isn't + * going to get a nasty surprise when they try to access the data. + */ + IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data)); + + zt->zt_orig_abd = zio->io_abd; zt->zt_orig_size = zio->io_size; zt->zt_bufsize = bufsize; zt->zt_transform = transform; @@ -287,7 +294,7 @@ zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, zt->zt_next = zio->io_transform_stack; zio->io_transform_stack = zt; - zio->io_data = data; + zio->io_abd = data; zio->io_size = size; } @@ -299,12 +306,12 @@ zio_pop_transforms(zio_t *zio) while ((zt = zio->io_transform_stack) != NULL) { if (zt->zt_transform != NULL) zt->zt_transform(zio, - zt->zt_orig_data, zt->zt_orig_size); + zt->zt_orig_abd, zt->zt_orig_size); if (zt->zt_bufsize != 0) - zio_buf_free(zio->io_data, zt->zt_bufsize); + abd_free(zio->io_abd); - zio->io_data = zt->zt_orig_data; + zio->io_abd = zt->zt_orig_abd; zio->io_size = zt->zt_orig_size; zio->io_transform_stack = zt->zt_next; @@ -318,21 +325,26 @@ zio_pop_transforms(zio_t *zio) * ========================================================================== */ static void -zio_subblock(zio_t *zio, void *data, uint64_t size) +zio_subblock(zio_t *zio, abd_t *data, uint64_t size) { ASSERT(zio->io_size > size); if (zio->io_type == ZIO_TYPE_READ) - bcopy(zio->io_data, data, size); + abd_copy(data, zio->io_abd, size); } static void -zio_decompress(zio_t *zio, void *data, uint64_t size) +zio_decompress(zio_t *zio, abd_t *data, uint64_t size) { - if (zio->io_error == 0 && - zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), - zio->io_data, data, zio->io_size, size) != 0) - zio->io_error = SET_ERROR(EIO); + if (zio->io_error == 0) { + void *tmp = abd_borrow_buf(data, size); + int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), + zio->io_abd, tmp, zio->io_size, size); + abd_return_buf_copy(data, tmp, size); + + if (ret != 0) + zio->io_error = SET_ERROR(EIO); + } } /* @@ -530,7 +542,7 
@@ zio_bookmark_compare(const void *x1, const void *x2) */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - void *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, + abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, enum zio_flag flags, vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) @@ -589,7 +601,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_priority = priority; zio->io_vd = vd; zio->io_offset = offset; - zio->io_orig_data = zio->io_data = data; + zio->io_orig_abd = zio->io_abd = data; zio->io_orig_size = zio->io_size = psize; zio->io_lsize = lsize; zio->io_orig_flags = zio->io_flags = flags; @@ -731,7 +743,7 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp) zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, - void *data, uint64_t size, zio_done_func_t *done, void *private, + abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; @@ -749,7 +761,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, + abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, @@ -790,7 +802,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, } zio_t * -zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, +zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) { @@ -943,7 
+955,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, - void *data, int checksum, zio_done_func_t *done, void *private, + abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; @@ -964,7 +976,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, - void *data, int checksum, zio_done_func_t *done, void *private, + abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; @@ -987,8 +999,9 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, * Therefore, we must make a local copy in case the data is * being written to multiple places in parallel. */ - void *wbuf = zio_buf_alloc(size); - bcopy(data, wbuf, size); + abd_t *wbuf = abd_alloc_sametype(data, size); + abd_copy(wbuf, data, size); + zio_push_transform(zio, wbuf, size, size, NULL); } @@ -1000,7 +1013,7 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, - void *data, uint64_t size, int type, zio_priority_t priority, + abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; @@ -1065,7 +1078,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, } zio_t * -zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, +zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { @@ -1126,14 +1139,17 @@ zio_read_bp_init(zio_t *zio) !(zio->io_flags & ZIO_FLAG_RAW)) { 
uint64_t psize = BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); - void *cbuf = zio_buf_alloc(psize); - - zio_push_transform(zio, cbuf, psize, psize, zio_decompress); + zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), + psize, psize, zio_decompress); } if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - decode_embedded_bp_compressed(bp, zio->io_data); + + int psize = BPE_GET_PSIZE(bp); + void *data = abd_borrow_buf(zio->io_abd, psize); + decode_embedded_bp_compressed(bp, data); + abd_return_buf_copy(zio->io_abd, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); } @@ -1273,7 +1289,7 @@ zio_write_compress(zio_t *zio) /* If it's a compressed write that is not raw, compress the buffer. */ if (compress != ZIO_COMPRESS_OFF && psize == lsize) { void *cbuf = zio_buf_alloc(lsize); - psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); + psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize); if (psize == 0 || psize == lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); @@ -1308,9 +1324,11 @@ zio_write_compress(zio_t *zio) zio_buf_free(cbuf, lsize); psize = lsize; } else { - bzero((char *)cbuf + psize, rounded - psize); + abd_t *cdata = abd_get_from_buf(cbuf, lsize); + abd_take_ownership_of_buf(cdata, B_TRUE); + abd_zero_off(cdata, psize, rounded - psize); psize = rounded; - zio_push_transform(zio, cbuf, + zio_push_transform(zio, cdata, psize, lsize, NULL); } } @@ -1830,26 +1848,38 @@ zio_resume_wait(spa_t *spa) * ========================================================================== */ +static void +zio_gang_issue_func_done(zio_t *zio) +{ + abd_put(zio->io_abd); +} + static zio_t * -zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { if (gn != NULL) return (pio); - return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), - 
NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), + BP_GET_PSIZE(bp), zio_gang_issue_func_done, + NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } -zio_t * -zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +static zio_t * +zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { zio_t *zio; if (gn != NULL) { + abd_t *gbh_abd = + abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, - gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, - ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, + pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute * a new gang block header checksum for it; but no one will @@ -1860,8 +1890,12 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) * this is just good hygiene.) 
*/ if (gn != pio->io_gang_leader->io_gang_tree) { + abd_t *buf = abd_get_offset(data, offset); + zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), - data, BP_GET_PSIZE(bp)); + buf, BP_GET_PSIZE(bp)); + + abd_put(buf); } /* * If we are here to damage data for testing purposes, @@ -1871,7 +1905,8 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, - data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, + abd_get_offset(data, offset), BP_GET_PSIZE(bp), + zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); } @@ -1879,16 +1914,18 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) } /* ARGSUSED */ -zio_t * -zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +static zio_t * +zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, ZIO_GANG_CHILD_FLAGS(pio))); } /* ARGSUSED */ -zio_t * -zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +static zio_t * +zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); @@ -1950,13 +1987,14 @@ static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); + abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); - zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, - SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, - gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); + zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio_gang_tree_assemble_done, gn, gio->io_priority, + 
ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void @@ -1972,13 +2010,16 @@ zio_gang_tree_assemble_done(zio_t *zio) if (zio->io_error) return; + /* this ABD was created from a linear buf in zio_gang_tree_assemble */ if (BP_SHOULD_BYTESWAP(bp)) - byteswap_uint64_array(zio->io_data, zio->io_size); + byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); - ASSERT(zio->io_data == gn->gn_gbh); + ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); + abd_put(zio->io_abd); + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) @@ -1988,7 +2029,8 @@ zio_gang_tree_assemble_done(zio_t *zio) } static void -zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) +zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, + uint64_t offset) { zio_t *gio = pio->io_gang_leader; zio_t *zio; @@ -2001,7 +2043,7 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) * If you're a gang header, your data is in gn->gn_gbh. * If you're a gang member, your data is in 'data' and gn == NULL. 
*/ - zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); + zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); if (gn != NULL) { ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); @@ -2010,13 +2052,14 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (BP_IS_HOLE(gbp)) continue; - zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); - data = (char *)data + BP_GET_PSIZE(gbp); + zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, + offset); + offset += BP_GET_PSIZE(gbp); } } if (gn == gio->io_gang_tree) - ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); + ASSERT3U(gio->io_size, ==, offset); if (zio != pio) zio_nowait(zio); @@ -2049,7 +2092,8 @@ zio_gang_issue(zio_t *zio) ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) - zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); + zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd, + 0); else zio_gang_tree_free(&zio->io_gang_tree); @@ -2088,6 +2132,12 @@ zio_write_gang_member_ready(zio_t *zio) mutex_exit(&pio->io_lock); } +static void +zio_write_gang_done(zio_t *zio) +{ + abd_put(zio->io_abd); +} + static int zio_write_gang_block(zio_t *pio) { @@ -2098,6 +2148,7 @@ zio_write_gang_block(zio_t *pio) zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; + abd_t *gbh_abd; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; @@ -2158,12 +2209,14 @@ zio_write_gang_block(zio_t *pio) gn = zio_gang_node_alloc(gnpp); gbh = gn->gn_gbh; bzero(gbh, SPA_GANGBLOCKSIZE); + gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); /* * Create the gang header. 
*/ - zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, - pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio_write_gang_done, NULL, pio->io_priority, + ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * Create and nowait the gang children. @@ -2183,9 +2236,9 @@ zio_write_gang_block(zio_t *pio) zp.zp_nopwrite = B_FALSE; zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], - (char *)pio->io_data + (pio->io_size - resid), lsize, lsize, - &zp, zio_write_gang_member_ready, NULL, NULL, NULL, - &gn->gn_child[g], pio->io_priority, + abd_get_offset(pio->io_abd, pio->io_size - resid), lsize, + lsize, &zp, zio_write_gang_member_ready, NULL, NULL, + zio_write_gang_done, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { @@ -2298,10 +2351,11 @@ zio_ddt_child_read_done(zio_t *zio) ddp = ddt_phys_select(dde, bp); if (zio->io_error == 0) ddt_phys_clear(ddp); /* this ddp doesn't need repair */ - if (zio->io_error == 0 && dde->dde_repair_data == NULL) - dde->dde_repair_data = zio->io_data; + + if (zio->io_error == 0 && dde->dde_repair_abd == NULL) + dde->dde_repair_abd = zio->io_abd; else - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_abd); mutex_exit(&pio->io_lock); } @@ -2333,16 +2387,16 @@ zio_ddt_read_start(zio_t *zio) ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, - zio_buf_alloc(zio->io_size), zio->io_size, - zio_ddt_child_read_done, dde, zio->io_priority, - ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, - &zio->io_bookmark)); + abd_alloc_for_io(zio->io_size, B_TRUE), + zio->io_size, zio_ddt_child_read_done, dde, + zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | + ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } return (ZIO_PIPELINE_CONTINUE); } zio_nowait(zio_read(zio, zio->io_spa, bp, - zio->io_data, 
zio->io_size, NULL, NULL, zio->io_priority, + zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); return (ZIO_PIPELINE_CONTINUE); @@ -2372,8 +2426,9 @@ zio_ddt_read_done(zio_t *zio) zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (ZIO_PIPELINE_STOP); } - if (dde->dde_repair_data != NULL) { - bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); + if (dde->dde_repair_abd != NULL) { + abd_copy(zio->io_abd, dde->dde_repair_abd, + zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } ddt_repair_done(ddt, dde); @@ -2405,7 +2460,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) if (lio != NULL) { return (lio->io_orig_size != zio->io_orig_size || - bcmp(zio->io_orig_data, lio->io_orig_data, + abd_cmp(zio->io_orig_abd, lio->io_orig_abd, zio->io_orig_size) != 0); } } @@ -2426,17 +2481,17 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) /* * Intuitively, it would make more sense to compare - * io_data than io_orig_data in the raw case since you + * io_abd than io_orig_abd in the raw case since you * don't want to look at any transformations that have * happened to the data. However, for raw I/Os the - * data will actually be the same in io_data and - * io_orig_data, so all we have to do is issue this as + * data will actually be the same in io_abd and + * io_orig_abd, so all we have to do is issue this as * a raw ARC read. 
*/ if (do_raw) { zio_flags |= ZIO_FLAG_RAW; ASSERT3U(zio->io_size, ==, zio->io_orig_size); - ASSERT0(bcmp(zio->io_data, zio->io_orig_data, + ASSERT0(abd_cmp(zio->io_abd, zio->io_orig_abd, zio->io_size)); ASSERT3P(zio->io_transform_stack, ==, NULL); } @@ -2447,7 +2502,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) if (error == 0) { if (arc_buf_size(abuf) != zio->io_orig_size || - bcmp(abuf->b_data, zio->io_orig_data, + abd_cmp_buf(zio->io_orig_abd, abuf->b_data, zio->io_orig_size) != 0) error = SET_ERROR(EEXIST); arc_buf_destroy(abuf, &abuf); @@ -2613,12 +2668,12 @@ zio_ddt_write(zio_t *zio) return (ZIO_PIPELINE_CONTINUE); } - dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, + dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL, NULL, zio_ddt_ditto_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); - zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); + zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL); dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; } @@ -2635,13 +2690,13 @@ zio_ddt_write(zio_t *zio) ddt_phys_fill(ddp, bp); ddt_phys_addref(ddp); } else { - cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, + cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL, NULL, zio_ddt_child_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); - zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); + zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); dde->dde_lead_zio[p] = cio; } @@ -2981,11 +3036,11 @@ zio_vdev_io_start(zio_t *zio) P2PHASE(zio->io_size, align) != 0) { /* Transform logical writes to be a full physical block size. 
*/ uint64_t asize = P2ROUNDUP(zio->io_size, align); - char *abuf = zio_buf_alloc(asize); + abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize); ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { - bcopy(zio->io_data, abuf, zio->io_size); - bzero(abuf + zio->io_size, asize - zio->io_size); + abd_copy(abuf, zio->io_abd, zio->io_size); + abd_zero_off(abuf, zio->io_size, asize - zio->io_size); } zio_push_transform(zio, abuf, asize, asize, zio_subblock); } @@ -3111,7 +3166,7 @@ zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) { void *buf = zio_buf_alloc(zio->io_size); - bcopy(zio->io_data, buf, zio->io_size); + abd_copy_to_buf(buf, zio->io_abd, zio->io_size); zcr->zcr_cbinfo = zio->io_size; zcr->zcr_cbdata = buf; @@ -3255,7 +3310,7 @@ zio_checksum_generate(zio_t *zio) } } - zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size); + zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size); return (ZIO_PIPELINE_CONTINUE); } @@ -3394,7 +3449,7 @@ zio_ready(zio_t *zio) if (BP_IS_GANG(bp)) { zio->io_flags &= ~ZIO_FLAG_NODATA; } else { - ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); + ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE); zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } } @@ -3549,21 +3604,28 @@ zio_done(zio_t *zio) zio_cksum_report_t *zcr = zio->io_cksum_report; uint64_t align = zcr->zcr_align; uint64_t asize = P2ROUNDUP(psize, align); - char *abuf = zio->io_data; + char *abuf = NULL; + abd_t *adata = zio->io_abd; if (asize != psize) { - abuf = zio_buf_alloc(asize); - bcopy(zio->io_data, abuf, psize); - bzero(abuf + psize, asize - psize); + adata = abd_alloc_linear(asize, B_TRUE); + abd_copy(adata, zio->io_abd, psize); + abd_zero_off(adata, psize, asize - psize); } + if (adata != NULL) + abuf = abd_borrow_buf_copy(adata, asize); + zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, abuf); zfs_ereport_free_checksum(zcr); + if (adata != NULL) + abd_return_buf(adata, abuf, 
asize); + if (asize != psize) - zio_buf_free(abuf, asize); + abd_free(adata); } } diff --git a/usr/src/uts/common/fs/zfs/zio_checksum.c b/usr/src/uts/common/fs/zfs/zio_checksum.c index 2bd9001456..e1c98b0b99 100644 --- a/usr/src/uts/common/fs/zfs/zio_checksum.c +++ b/usr/src/uts/common/fs/zfs/zio_checksum.c @@ -31,6 +31,7 @@ #include <sys/zio.h> #include <sys/zio_checksum.h> #include <sys/zil.h> +#include <sys/abd.h> #include <zfs_fletcher.h> /* @@ -93,45 +94,85 @@ /*ARGSUSED*/ static void -zio_checksum_off(const void *buf, uint64_t size, +abd_checksum_off(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); } +/*ARGSUSED*/ +void +abd_fletcher_2_native(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) abd_iterate_func(abd, 0, size, + fletcher_2_incremental_native, zcp); +} + +/*ARGSUSED*/ +void +abd_fletcher_2_byteswap(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) abd_iterate_func(abd, 0, size, + fletcher_2_incremental_byteswap, zcp); +} + +/*ARGSUSED*/ +void +abd_fletcher_4_native(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) abd_iterate_func(abd, 0, size, + fletcher_4_incremental_native, zcp); +} + +/*ARGSUSED*/ +void +abd_fletcher_4_byteswap(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) abd_iterate_func(abd, 0, size, + fletcher_4_incremental_byteswap, zcp); +} + zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { {{NULL, NULL}, NULL, NULL, 0, "inherit"}, {{NULL, NULL}, NULL, NULL, 0, "on"}, - {{zio_checksum_off, zio_checksum_off}, + {{abd_checksum_off, abd_checksum_off}, NULL, NULL, 0, "off"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, + {{abd_checksum_SHA256, abd_checksum_SHA256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"}, 
- {{zio_checksum_SHA256, zio_checksum_SHA256}, + {{abd_checksum_SHA256, abd_checksum_SHA256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"}, - {{fletcher_2_native, fletcher_2_byteswap}, + {{abd_fletcher_2_native, abd_fletcher_2_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"}, - {{fletcher_2_native, fletcher_2_byteswap}, + {{abd_fletcher_2_native, abd_fletcher_2_byteswap}, NULL, NULL, 0, "fletcher2"}, - {{fletcher_4_native, fletcher_4_byteswap}, + {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, + {{abd_checksum_SHA256, abd_checksum_SHA256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_NOPWRITE, "sha256"}, - {{fletcher_4_native, fletcher_4_byteswap}, + {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"}, - {{zio_checksum_off, zio_checksum_off}, + {{abd_checksum_off, abd_checksum_off}, NULL, NULL, 0, "noparity"}, - {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap}, + {{abd_checksum_SHA512_native, abd_checksum_SHA512_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_NOPWRITE, "sha512"}, - {{zio_checksum_skein_native, zio_checksum_skein_byteswap}, - zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free, + {{abd_checksum_skein_native, abd_checksum_skein_byteswap}, + abd_checksum_skein_tmpl_init, abd_checksum_skein_tmpl_free, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"}, - {{zio_checksum_edonr_native, zio_checksum_edonr_byteswap}, - zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free, + {{abd_checksum_edonr_native, abd_checksum_edonr_byteswap}, + abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}, }; @@ -251,7 +292,7 @@ 
zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa) */ void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, - void *data, uint64_t size) + abd_t *abd, uint64_t size) { blkptr_t *bp = zio->io_bp; uint64_t offset = zio->io_offset; @@ -266,6 +307,7 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { zio_eck_t *eck; + void *data = abd_to_buf(abd); if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = data; @@ -283,18 +325,18 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, else bp->blk_cksum = eck->zec_cksum; eck->zec_magic = ZEC_MAGIC; - ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum], + ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], &cksum); eck->zec_cksum = cksum; } else { - ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum], + ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], &bp->blk_cksum); } } int zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, - void *data, uint64_t size, uint64_t offset, zio_bad_cksum_t *info) + abd_t *abd, uint64_t size, uint64_t offset, zio_bad_cksum_t *info) { zio_checksum_info_t *ci = &zio_checksum_table[checksum]; zio_cksum_t actual_cksum, expected_cksum; @@ -308,25 +350,31 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { zio_eck_t *eck; zio_cksum_t verifier; + uint64_t data_size = size; + void *data = abd_borrow_buf_copy(abd, data_size); if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = data; uint64_t nused; eck = &zilc->zc_eck; - if (eck->zec_magic == ZEC_MAGIC) + if (eck->zec_magic == ZEC_MAGIC) { nused = zilc->zc_nused; - else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC)) + } else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC)) { nused = BSWAP_64(zilc->zc_nused); - else + } else { + abd_return_buf(abd, data, data_size); return (SET_ERROR(ECKSUM)); + } - if (nused > size) + if (nused > 
data_size) { + abd_return_buf(abd, data, data_size); return (SET_ERROR(ECKSUM)); + } size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); } else { - eck = (zio_eck_t *)((char *)data + size) - 1; + eck = (zio_eck_t *)((char *)data + data_size) - 1; } if (checksum == ZIO_CHECKSUM_GANG_HEADER) @@ -341,11 +389,15 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, if (byteswap) byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); + size_t eck_offset = (size_t)(&eck->zec_cksum) - (size_t)data; expected_cksum = eck->zec_cksum; eck->zec_cksum = verifier; - ci->ci_func[byteswap](data, size, + abd_return_buf_copy(abd, data, data_size); + + ci->ci_func[byteswap](abd, size, spa->spa_cksum_tmpls[checksum], &actual_cksum); - eck->zec_cksum = expected_cksum; + abd_copy_from_buf_off(abd, &expected_cksum, + eck_offset, sizeof (zio_cksum_t)); if (byteswap) { byteswap_uint64_array(&expected_cksum, @@ -354,7 +406,7 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, } else { byteswap = BP_SHOULD_BYTESWAP(bp); expected_cksum = bp->blk_cksum; - ci->ci_func[byteswap](data, size, + ci->ci_func[byteswap](abd, size, spa->spa_cksum_tmpls[checksum], &actual_cksum); } @@ -383,7 +435,7 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) uint64_t size = (bp == NULL ? zio->io_size : (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); uint64_t offset = zio->io_offset; - void *data = zio->io_data; + abd_t *data = zio->io_abd; spa_t *spa = zio->io_spa; error = zio_checksum_error_impl(spa, bp, checksum, data, size, diff --git a/usr/src/uts/common/fs/zfs/zio_compress.c b/usr/src/uts/common/fs/zfs/zio_compress.c index 4e2d645572..8d0a33de69 100644 --- a/usr/src/uts/common/fs/zfs/zio_compress.c +++ b/usr/src/uts/common/fs/zfs/zio_compress.c @@ -25,10 +25,7 @@ */ /* * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. - */ - -/* - * Copyright (c) 2013 by Delphix. All rights reserved. 
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -41,24 +38,23 @@ /* * Compression vectors. */ - zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { - {NULL, NULL, 0, "inherit"}, - {NULL, NULL, 0, "on"}, - {NULL, NULL, 0, "uncompressed"}, - {lzjb_compress, lzjb_decompress, 0, "lzjb"}, - {NULL, NULL, 0, "empty"}, - {gzip_compress, gzip_decompress, 1, "gzip-1"}, - {gzip_compress, gzip_decompress, 2, "gzip-2"}, - {gzip_compress, gzip_decompress, 3, "gzip-3"}, - {gzip_compress, gzip_decompress, 4, "gzip-4"}, - {gzip_compress, gzip_decompress, 5, "gzip-5"}, - {gzip_compress, gzip_decompress, 6, "gzip-6"}, - {gzip_compress, gzip_decompress, 7, "gzip-7"}, - {gzip_compress, gzip_decompress, 8, "gzip-8"}, - {gzip_compress, gzip_decompress, 9, "gzip-9"}, - {zle_compress, zle_decompress, 64, "zle"}, - {lz4_compress, lz4_decompress, 0, "lz4"}, + {"inherit", 0, NULL, NULL}, + {"on", 0, NULL, NULL}, + {"uncompressed", 0, NULL, NULL}, + {"lzjb", 0, lzjb_compress, lzjb_decompress}, + {"empty", 0, NULL, NULL}, + {"gzip-1", 1, gzip_compress, gzip_decompress}, + {"gzip-2", 2, gzip_compress, gzip_decompress}, + {"gzip-3", 3, gzip_compress, gzip_decompress}, + {"gzip-4", 4, gzip_compress, gzip_decompress}, + {"gzip-5", 5, gzip_compress, gzip_decompress}, + {"gzip-6", 6, gzip_compress, gzip_decompress}, + {"gzip-7", 7, gzip_compress, gzip_decompress}, + {"gzip-8", 8, gzip_compress, gzip_decompress}, + {"gzip-9", 9, gzip_compress, gzip_decompress}, + {"zle", 64, zle_compress, zle_decompress}, + {"lz4", 0, lz4_compress, lz4_decompress} }; enum zio_compress @@ -85,10 +81,21 @@ zio_compress_select(spa_t *spa, enum zio_compress child, return (result); } +/*ARGSUSED*/ +static int +zio_compress_zeroed_cb(void *data, size_t len, void *private) +{ + uint64_t *end = (uint64_t *)((char *)data + len); + for (uint64_t *word = (uint64_t *)data; word < end; word++) + if (*word != 0) + return (1); + + return (0); +} + size_t 
-zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len) +zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len) { - uint64_t *word, *word_end; size_t c_len, d_len; zio_compress_info_t *ci = &zio_compress_table[c]; @@ -99,12 +106,7 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len) * If the data is all zeroes, we don't even need to allocate * a block for it. We indicate this by returning zero size. */ - word_end = (uint64_t *)((char *)src + s_len); - for (word = src; word < word_end; word++) - if (*word != 0) - break; - - if (word == word_end) + if (abd_iterate_func(src, 0, s_len, zio_compress_zeroed_cb, NULL) == 0) return (0); if (c == ZIO_COMPRESS_EMPTY) @@ -112,7 +114,11 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len) /* Compress at least 12.5% */ d_len = s_len - (s_len >> 3); - c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level); + + /* No compression algorithms can read from ABDs directly */ + void *tmp = abd_borrow_buf_copy(src, s_len); + c_len = ci->ci_compress(tmp, dst, s_len, d_len, ci->ci_level); + abd_return_buf(src, tmp, s_len); if (c_len > d_len) return (s_len); @@ -122,13 +128,23 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len) } int -zio_decompress_data(enum zio_compress c, void *src, void *dst, +zio_decompress_data_buf(enum zio_compress c, void *src, void *dst, size_t s_len, size_t d_len) { zio_compress_info_t *ci = &zio_compress_table[c]; - if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL) return (SET_ERROR(EINVAL)); return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level)); } + +int +zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, + size_t s_len, size_t d_len) +{ + void *tmp = abd_borrow_buf_copy(src, s_len); + int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len); + abd_return_buf(src, tmp, s_len); + + return (ret); +} diff --git 
a/usr/src/uts/common/io/mr_sas/ld_pd_map.c b/usr/src/uts/common/io/mr_sas/ld_pd_map.c index 829904afc0..0f2798a790 100644 --- a/usr/src/uts/common/io/mr_sas/ld_pd_map.c +++ b/usr/src/uts/common/io/mr_sas/ld_pd_map.c @@ -23,6 +23,7 @@ */ /* * Copyright 2015 Garrett D'Amore <garrett@damore.org> + * Copyright 2017 Citrus IT Limited. All rights reserved. */ #include <sys/scsi/scsi.h> @@ -212,7 +213,6 @@ MR_GetPhyParams(struct mrsas_instance *instance, U32 ld, U64 stripRow, U32 rowMod; U32 armQ; U32 arm; - U16 devid = instance->device_id; ASSERT(raid->rowDataSize != 0); @@ -254,11 +254,8 @@ MR_GetPhyParams(struct mrsas_instance *instance, U32 ld, U64 stripRow, *pDevHandle = MR_PdDevHandleGet(pd, map); } else { *pDevHandle = MR_PD_INVALID; /* set dev handle as invalid. */ - if ((raid->level >= 5) && - ((devid != PCI_DEVICE_ID_LSI_INVADER) || - ((devid == PCI_DEVICE_ID_LSI_INVADER || - (devid == PCI_DEVICE_ID_LSI_FURY)) && - raid->regTypeReqOnRead != REGION_TYPE_UNUSED))) { + if (raid->level >= 5 && (!instance->gen3 || + raid->regTypeReqOnRead != REGION_TYPE_UNUSED)) { pRAID_Context->regLockFlags = REGION_TYPE_EXCLUSIVE; } else if (raid->level == 1) { /* Get Alternate Pd. */ @@ -403,8 +400,7 @@ MR_BuildRaidContext(struct mrsas_instance *instance, pRAID_Context->timeoutValue = map->raidMap.fpPdIoTimeoutSec; - if ((instance->device_id == PCI_DEVICE_ID_LSI_INVADER) || - (instance->device_id == PCI_DEVICE_ID_LSI_FURY)) { + if (instance->gen3) { pRAID_Context->regLockFlags = (isRead) ? raid->regTypeReqOnRead : raid->regTypeReqOnWrite; } else { diff --git a/usr/src/uts/common/io/mr_sas/mr_sas.c b/usr/src/uts/common/io/mr_sas/mr_sas.c index 1bb6cec1d2..4e3fe4dcce 100644 --- a/usr/src/uts/common/io/mr_sas/mr_sas.c +++ b/usr/src/uts/common/io/mr_sas/mr_sas.c @@ -45,7 +45,7 @@ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Bayard G. Bell. All rights reserved. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 
- * Copyright 2015 Citrus IT Limited. All rights reserved. + * Copyright 2015, 2017 Citrus IT Limited. All rights reserved. * Copyright 2015 Garrett D'Amore <garrett@damore.org> */ @@ -565,9 +565,16 @@ mrsas_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) /* initialize function pointers */ switch (device_id) { - case PCI_DEVICE_ID_LSI_TBOLT: case PCI_DEVICE_ID_LSI_INVADER: case PCI_DEVICE_ID_LSI_FURY: + case PCI_DEVICE_ID_LSI_INTRUDER: + case PCI_DEVICE_ID_LSI_INTRUDER_24: + case PCI_DEVICE_ID_LSI_CUTLASS_52: + case PCI_DEVICE_ID_LSI_CUTLASS_53: + dev_err(dip, CE_CONT, "?Gen3 device detected\n"); + instance->gen3 = 1; + /* FALLTHROUGH */ + case PCI_DEVICE_ID_LSI_TBOLT: dev_err(dip, CE_CONT, "?TBOLT device detected\n"); instance->func_ptr = @@ -584,6 +591,7 @@ mrsas_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) * certain other features are available to a Skinny * HBA. */ + dev_err(dip, CE_CONT, "?Skinny device detected\n"); instance->skinny = 1; /* FALLTHRU */ @@ -1596,7 +1604,7 @@ mrsas_quiesce(dev_info_t *dip) /*ARGSUSED*/ static int mrsas_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip, - scsi_hba_tran_t *tran, struct scsi_device *sd) + scsi_hba_tran_t *tran, struct scsi_device *sd) { struct mrsas_instance *instance; uint16_t tgt = sd->sd_address.a_target; @@ -1772,8 +1780,8 @@ mrsas_name_node(dev_info_t *dip, char *name, int len) */ static struct scsi_pkt * mrsas_tran_init_pkt(struct scsi_address *ap, register struct scsi_pkt *pkt, - struct buf *bp, int cmdlen, int statuslen, int tgtlen, - int flags, int (*callback)(), caddr_t arg) + struct buf *bp, int cmdlen, int statuslen, int tgtlen, + int flags, int (*callback)(), caddr_t arg) { struct scsa_cmd *acmd; struct mrsas_instance *instance; diff --git a/usr/src/uts/common/io/mr_sas/mr_sas.h b/usr/src/uts/common/io/mr_sas/mr_sas.h index 8f27cbdf21..fe4c3659af 100644 --- a/usr/src/uts/common/io/mr_sas/mr_sas.h +++ b/usr/src/uts/common/io/mr_sas/mr_sas.h @@ -45,6 +45,7 @@ * Copyright (c) 2009, 2010, Oracle 
and/or its affiliates. All rights reserved. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. * Copyright 2015 Garrett D'Amore <garrett@damore.org> + * Copyright 2017 Citrus IT Limited. All rights reserved. */ #ifndef _MR_SAS_H_ @@ -61,8 +62,8 @@ extern "C" { /* * MegaRAID SAS2.0 Driver meta data */ -#define MRSAS_VERSION "6.503.00.00ILLUMOS" -#define MRSAS_RELDATE "July 30, 2012" +#define MRSAS_VERSION "6.503.00.00ILLUMOS-20170421" +#define MRSAS_RELDATE "April 21, 2017" #define MRSAS_TRUE 1 #define MRSAS_FALSE 0 @@ -90,13 +91,23 @@ extern "C" { /* * MegaRAID SAS2.0 supported controllers */ -#define PCI_DEVICE_ID_LSI_2108VDE 0x0078 -#define PCI_DEVICE_ID_LSI_2108V 0x0079 + +/* Skinny */ #define PCI_DEVICE_ID_LSI_SKINNY 0x0071 #define PCI_DEVICE_ID_LSI_SKINNY_NEW 0x0073 +/* Liberator series (Gen2) */ +#define PCI_DEVICE_ID_LSI_2108VDE 0x0078 +#define PCI_DEVICE_ID_LSI_2108V 0x0079 +/* Thunderbolt series */ #define PCI_DEVICE_ID_LSI_TBOLT 0x005b +/* Invader series (Gen3) */ #define PCI_DEVICE_ID_LSI_INVADER 0x005d #define PCI_DEVICE_ID_LSI_FURY 0x005f +#define PCI_DEVICE_ID_LSI_INTRUDER 0x00ce +#define PCI_DEVICE_ID_LSI_INTRUDER_24 0x00cf +#define PCI_DEVICE_ID_LSI_CUTLASS_52 0x0052 +#define PCI_DEVICE_ID_LSI_CUTLASS_53 0x0053 +/* Ventura series not yet supported */ /* * Register Index for 2108 Controllers. @@ -602,6 +613,7 @@ typedef struct mrsas_instance { uint8_t skinny; uint8_t tbolt; + uint8_t gen3; uint16_t reply_read_index; uint16_t reply_size; /* Single Reply struct size */ uint16_t raid_io_msg_size; /* Single message size */ diff --git a/usr/src/uts/common/io/mr_sas/mr_sas_tbolt.c b/usr/src/uts/common/io/mr_sas/mr_sas_tbolt.c index 929ae8056e..9ff12ffb07 100644 --- a/usr/src/uts/common/io/mr_sas/mr_sas_tbolt.c +++ b/usr/src/uts/common/io/mr_sas/mr_sas_tbolt.c @@ -17,7 +17,7 @@ /* * Copyright 2013 Nexenta Systems, Inc. All rights reserved. - * Copyright 2015 Citrus IT Limited. All rights reserved. + * Copyright 2015, 2017 Citrus IT Limited. 
All rights reserved. * Copyright 2015 Garrett D'Amore <garrett@damore.org> */ @@ -1252,7 +1252,6 @@ mr_sas_tbolt_build_sgl(struct mrsas_instance *instance, Mpi25IeeeSgeChain64_t *scsi_raid_io_sgl_ieee = NULL; ddi_acc_handle_t acc_handle = instance->mpi2_frame_pool_dma_obj.acc_handle; - uint16_t devid = instance->device_id; con_log(CL_ANN1, (CE_NOTE, "chkpnt: Building Chained SGL :%d", __LINE__)); @@ -1296,8 +1295,7 @@ mr_sas_tbolt_build_sgl(struct mrsas_instance *instance, scsi_raid_io_sgl_ieee = (Mpi25IeeeSgeChain64_t *)&scsi_raid_io->SGL.IeeeChain; - if ((devid == PCI_DEVICE_ID_LSI_INVADER) || - (devid == PCI_DEVICE_ID_LSI_FURY)) { + if (instance->gen3) { Mpi25IeeeSgeChain64_t *sgl_ptr_end = scsi_raid_io_sgl_ieee; sgl_ptr_end += instance->max_sge_in_main_msg - 1; @@ -1313,8 +1311,7 @@ mr_sas_tbolt_build_sgl(struct mrsas_instance *instance, ddi_put8(acc_handle, &scsi_raid_io_sgl_ieee->Flags, 0); - if ((devid == PCI_DEVICE_ID_LSI_INVADER) || - (devid == PCI_DEVICE_ID_LSI_FURY)) { + if (instance->gen3) { if (i == (numElements - 1)) { ddi_put8(acc_handle, &scsi_raid_io_sgl_ieee->Flags, @@ -1342,8 +1339,7 @@ mr_sas_tbolt_build_sgl(struct mrsas_instance *instance, con_log(CL_ANN1, (CE_NOTE, "[Chain Element index]:%x", i)); - if ((devid == PCI_DEVICE_ID_LSI_INVADER) || - (devid == PCI_DEVICE_ID_LSI_FURY)) { + if (instance->gen3) { uint16_t ioFlags = ddi_get16(acc_handle, &scsi_raid_io->IoFlags); @@ -1366,8 +1362,7 @@ mr_sas_tbolt_build_sgl(struct mrsas_instance *instance, ddi_put8(acc_handle, &ieeeChainElement->NextChainOffset, 0); - if ((devid == PCI_DEVICE_ID_LSI_INVADER) || - (devid == PCI_DEVICE_ID_LSI_FURY)) { + if (instance->gen3) { ddi_put8(acc_handle, &ieeeChainElement->Flags, IEEE_SGE_FLAGS_CHAIN_ELEMENT); } else { @@ -1402,8 +1397,7 @@ mr_sas_tbolt_build_sgl(struct mrsas_instance *instance, ddi_put8(acc_handle, &scsi_raid_io_sgl_ieee->Flags, 0); - if ((devid == PCI_DEVICE_ID_LSI_INVADER) || - (devid == PCI_DEVICE_ID_LSI_FURY)) { + if (instance->gen3) { if (i 
== (numElements - 1)) { ddi_put8(acc_handle, &scsi_raid_io_sgl_ieee->Flags, @@ -1443,7 +1437,6 @@ mrsas_tbolt_build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, uint32_t lba_count = 0; uint32_t start_lba_hi = 0; uint32_t start_lba_lo = 0; - uint16_t devid = instance->device_id; ddi_acc_handle_t acc_handle = instance->mpi2_frame_pool_dma_obj.acc_handle; struct mrsas_cmd *cmd = NULL; @@ -1678,8 +1671,7 @@ mrsas_tbolt_build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, (MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY << MPI2_REQ_DESCRIPT_FLAGS_TYPE_SHIFT); - if ((devid == PCI_DEVICE_ID_LSI_INVADER) || - (devid == PCI_DEVICE_ID_LSI_FURY)) { + if (instance->gen3) { uint8_t regLockFlags = ddi_get8(acc_handle, &scsi_raid_io->RaidContext.regLockFlags); uint16_t IoFlags = ddi_get16(acc_handle, @@ -1743,8 +1735,7 @@ mrsas_tbolt_build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, &scsi_raid_io->RaidContext.timeoutValue, local_map_ptr->raidMap.fpPdIoTimeoutSec); - if ((devid == PCI_DEVICE_ID_LSI_INVADER) || - (devid == PCI_DEVICE_ID_LSI_FURY)) { + if (instance->gen3) { uint8_t regLockFlags = ddi_get8(acc_handle, &scsi_raid_io->RaidContext.regLockFlags); @@ -1849,9 +1840,7 @@ mrsas_tbolt_build_cmd(struct mrsas_instance *instance, struct scsi_address *ap, ddi_put8(acc_handle, &scsi_raid_io->LUN[1], acmd->lun); - if (instance->fast_path_io && - ((instance->device_id == PCI_DEVICE_ID_LSI_INVADER) || - (instance->device_id == PCI_DEVICE_ID_LSI_FURY))) { + if (instance->fast_path_io && instance->gen3) { uint16_t IoFlags = ddi_get16(acc_handle, &scsi_raid_io->IoFlags); IoFlags |= MPI25_SAS_DEVICE0_FLAGS_ENABLED_FAST_PATH; @@ -2269,8 +2258,7 @@ mr_sas_tbolt_build_mfi_cmd(struct mrsas_instance *instance, /* get raid message frame pointer */ scsi_raid_io = (Mpi2RaidSCSIIORequest_t *)cmd->scsi_io_request; - if ((instance->device_id == PCI_DEVICE_ID_LSI_INVADER) || - (instance->device_id == PCI_DEVICE_ID_LSI_FURY)) { + if (instance->gen3) { 
Mpi25IeeeSgeChain64_t *sgl_ptr_end = (Mpi25IeeeSgeChain64_t *) &scsi_raid_io->SGL.IeeeChain; sgl_ptr_end += instance->max_sge_in_main_msg - 1; diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index 80d344523e..c44301765b 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -545,6 +545,7 @@ CHKHDRS= \ statfs.h \ statvfs.h \ stdbool.h \ + stddef.h \ stdint.h \ stermio.h \ stmf.h \ diff --git a/usr/src/uts/common/sys/multiboot2.h b/usr/src/uts/common/sys/multiboot2.h new file mode 100644 index 0000000000..556b0217a3 --- /dev/null +++ b/usr/src/uts/common/sys/multiboot2.h @@ -0,0 +1,418 @@ +/* + * Copyright (C) 1999,2003,2007,2008,2009,2010 Free Software Foundation, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ANY + * DEVELOPER OR DISTRIBUTOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR + * IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +/* + * Copyright 2016 Toomas Soome <tsoome@me.com> + */ + +/* + * This header contains definitions for Multiboot 2 boot protocol, based on + * the reference implementation by grub 2. + * + * At the time this was written (Jan 2017), the Multiboot 2 documentation is in + * process of being rewritten and the information in the specification is not + * entirely correct. Instead, you must rely on grub 2 source code. + * + * This header provides essential support for the Multiboot 2 specification + * for illumos and makes it possible to pass the needed structures from the + * boot loader to the kernel. + */ + +#ifndef _SYS_MULTIBOOT2_H +#define _SYS_MULTIBOOT2_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* How many bytes from the start of the file we search for the header. */ +#define MULTIBOOT_SEARCH 32768 +#define MULTIBOOT_HEADER_ALIGN 8 + +/* The magic field should contain this. */ +#define MULTIBOOT2_HEADER_MAGIC 0xe85250d6 + +/* This should be in %eax. */ +#define MULTIBOOT2_BOOTLOADER_MAGIC 0x36d76289 + +/* Alignment of multiboot modules. */ +#if defined(__i386) || defined(__amd64) +#define MULTIBOOT_MOD_ALIGN 0x00001000 +#else +#error No architecture defined +#endif + +/* Alignment of the multiboot info structure. */ +#define MULTIBOOT_INFO_ALIGN 0x00000008 + +/* Flags set in the 'flags' member of the multiboot header. 
*/ + +#define MULTIBOOT_TAG_ALIGN 8 +#define MULTIBOOT_TAG_TYPE_END 0 +#define MULTIBOOT_TAG_TYPE_CMDLINE 1 +#define MULTIBOOT_TAG_TYPE_BOOT_LOADER_NAME 2 +#define MULTIBOOT_TAG_TYPE_MODULE 3 +#define MULTIBOOT_TAG_TYPE_BASIC_MEMINFO 4 +#define MULTIBOOT_TAG_TYPE_BOOTDEV 5 +#define MULTIBOOT_TAG_TYPE_MMAP 6 +#define MULTIBOOT_TAG_TYPE_VBE 7 +#define MULTIBOOT_TAG_TYPE_FRAMEBUFFER 8 +#define MULTIBOOT_TAG_TYPE_ELF_SECTIONS 9 +#define MULTIBOOT_TAG_TYPE_APM 10 +#define MULTIBOOT_TAG_TYPE_EFI32 11 +#define MULTIBOOT_TAG_TYPE_EFI64 12 +#define MULTIBOOT_TAG_TYPE_SMBIOS 13 +#define MULTIBOOT_TAG_TYPE_ACPI_OLD 14 +#define MULTIBOOT_TAG_TYPE_ACPI_NEW 15 +#define MULTIBOOT_TAG_TYPE_NETWORK 16 +#define MULTIBOOT_TAG_TYPE_EFI_MMAP 17 +#define MULTIBOOT_TAG_TYPE_EFI_BS 18 +#define MULTIBOOT_TAG_TYPE_EFI32_IH 19 +#define MULTIBOOT_TAG_TYPE_EFI64_IH 20 +#define MULTIBOOT_TAG_TYPE_LOAD_BASE_ADDR 21 + +#define MULTIBOOT_HEADER_TAG_END 0 +#define MULTIBOOT_HEADER_TAG_INFORMATION_REQUEST 1 +#define MULTIBOOT_HEADER_TAG_ADDRESS 2 +#define MULTIBOOT_HEADER_TAG_ENTRY_ADDRESS 3 +#define MULTIBOOT_HEADER_TAG_CONSOLE_FLAGS 4 +#define MULTIBOOT_HEADER_TAG_FRAMEBUFFER 5 +#define MULTIBOOT_HEADER_TAG_MODULE_ALIGN 6 +#define MULTIBOOT_HEADER_TAG_EFI_BS 7 +#define MULTIBOOT_HEADER_TAG_ENTRY_ADDRESS_EFI32 8 +#define MULTIBOOT_HEADER_TAG_ENTRY_ADDRESS_EFI64 9 +#define MULTIBOOT_HEADER_TAG_RELOCATABLE 10 + +#define MULTIBOOT_ARCHITECTURE_I386 0 +#define MULTIBOOT_ARCHITECTURE_MIPS32 4 +#define MULTIBOOT_HEADER_TAG_OPTIONAL 1 + +/* Hints for relocatable kernel load preference */ +#define MULTIBOOT_LOAD_PREFERENCE_NONE 0 +#define MULTIBOOT_LOAD_PREFERENCE_LOW 1 +#define MULTIBOOT_LOAD_PREFERENCE_HIGH 2 + +/* Values for console_flags field in tag multiboot_header_tag_console_flags. 
*/ +#define MULTIBOOT_CONSOLE_FLAGS_CONSOLE_REQUIRED 1 +#define MULTIBOOT_CONSOLE_FLAGS_EGA_TEXT_SUPPORTED 2 + +#ifndef _ASM + +#include <sys/stdint.h> + +#pragma pack(1) + +typedef struct multiboot_header_tag { + uint16_t mbh_type; + uint16_t mbh_flags; + uint32_t mbh_size; +} multiboot_header_tag_t; + +typedef struct multiboot2_header { + /* Must be MULTIBOOT2_MAGIC - see above. */ + uint32_t mb2_magic; + + /* ISA */ + uint32_t mb2_architecture; + + /* Total header length. */ + uint32_t mb2_header_length; + + /* The above fields plus this one must equal 0 mod 2^32. */ + uint32_t mb2_checksum; + multiboot_header_tag_t mb2_tags[]; +} multiboot2_header_t; + +typedef struct multiboot_header_tag_information_request { + uint16_t mbh_type; + uint16_t mbh_flags; + uint32_t mbh_size; + uint32_t mbh_requests[]; +} multiboot_header_tag_information_request_t; + +typedef struct multiboot_header_tag_address { + uint16_t mbh_type; + uint16_t mbh_flags; + uint32_t mbh_size; + uint32_t mbh_header_addr; + uint32_t mbh_load_addr; + uint32_t mbh_load_end_addr; + uint32_t mbh_bss_end_addr; +} multiboot_header_tag_address_t; + +typedef struct multiboot_header_tag_entry_address { + uint16_t mbh_type; + uint16_t mbh_flags; + uint32_t mbh_size; + uint32_t mbh_entry_addr; +} multiboot_header_tag_entry_address_t; + +typedef struct multiboot_header_tag_console_flags { + uint16_t mbh_type; + uint16_t mbh_flags; + uint32_t mbh_size; + uint32_t mbh_console_flags; +} multiboot_header_tag_console_flags_t; + +typedef struct multiboot_header_tag_framebuffer { + uint16_t mbh_type; + uint16_t mbh_flags; + uint32_t mbh_size; + uint32_t mbh_width; + uint32_t mbh_height; + uint32_t mbh_depth; +} multiboot_header_tag_framebuffer_t; + +typedef struct multiboot_header_tag_module_align { + uint16_t mbh_type; + uint16_t mbh_flags; + uint32_t mbh_size; +} multiboot_header_tag_module_align_t; + +typedef struct multiboot_header_tag_relocatable { + uint16_t mbh_type; + uint16_t mbh_flags; + uint32_t mbh_size; + 
uint32_t mbh_min_addr; + uint32_t mbh_max_addr; + uint32_t mbh_align; + uint32_t mbh_preference; +} multiboot_header_tag_relocatable_t; + +typedef struct multiboot_color { + uint8_t mb_red; + uint8_t mb_green; + uint8_t mb_blue; +} multiboot_color_t; + +typedef struct multiboot_mmap_entry { + uint64_t mmap_addr; + uint64_t mmap_len; +#define MULTIBOOT_MEMORY_AVAILABLE 1 +#define MULTIBOOT_MEMORY_RESERVED 2 +#define MULTIBOOT_MEMORY_ACPI_RECLAIMABLE 3 +#define MULTIBOOT_MEMORY_NVS 4 +#define MULTIBOOT_MEMORY_BADRAM 5 + uint32_t mmap_type; + uint32_t mmap_reserved; +} multiboot_mmap_entry_t; + +typedef struct multiboot_tag { + uint32_t mb_type; + uint32_t mb_size; +} multiboot_tag_t; + +typedef struct multiboot2_info_header { + uint32_t mbi_total_size; + uint32_t mbi_reserved; + multiboot_tag_t mbi_tags[]; +} multiboot2_info_header_t; + +typedef struct multiboot_tag_string { + uint32_t mb_type; + uint32_t mb_size; + char mb_string[]; +} multiboot_tag_string_t; + +typedef struct multiboot_tag_module { + uint32_t mb_type; + uint32_t mb_size; + uint32_t mb_mod_start; + uint32_t mb_mod_end; + char mb_cmdline[]; +} multiboot_tag_module_t; + +typedef struct multiboot_tag_basic_meminfo { + uint32_t mb_type; + uint32_t mb_size; + uint32_t mb_mem_lower; + uint32_t mb_mem_upper; +} multiboot_tag_basic_meminfo_t; + +typedef struct multiboot_tag_bootdev { + uint32_t mb_type; + uint32_t mb_size; + uint32_t mb_biosdev; + uint32_t mb_slice; + uint32_t mb_part; +} multiboot_tag_bootdev_t; + +typedef struct multiboot_tag_mmap { + uint32_t mb_type; + uint32_t mb_size; + uint32_t mb_entry_size; + uint32_t mb_entry_version; + uint8_t mb_entries[]; +} multiboot_tag_mmap_t; + +struct multiboot_vbe_info_block { + uint8_t vbe_external_specification[512]; +}; + +struct multiboot_vbe_mode_info_block { + uint8_t vbe_external_specification[256]; +}; + +typedef struct multiboot_tag_vbe { + uint32_t mb_type; + uint32_t mb_size; + + uint16_t vbe_mode; + uint16_t vbe_interface_seg; + uint16_t 
vbe_interface_off; + uint16_t vbe_interface_len; + + struct multiboot_vbe_info_block vbe_control_info; + struct multiboot_vbe_mode_info_block vbe_mode_info; +} multiboot_tag_vbe_t; + +struct multiboot_tag_framebuffer_common { + uint32_t mb_type; + uint32_t mb_size; + + uint64_t framebuffer_addr; + uint32_t framebuffer_pitch; + uint32_t framebuffer_width; + uint32_t framebuffer_height; + uint8_t framebuffer_bpp; +#define MULTIBOOT_FRAMEBUFFER_TYPE_INDEXED 0 +#define MULTIBOOT_FRAMEBUFFER_TYPE_RGB 1 +#define MULTIBOOT_FRAMEBUFFER_TYPE_EGA_TEXT 2 + uint8_t framebuffer_type; + uint16_t mb_reserved; +}; + +typedef struct multiboot_tag_framebuffer { + struct multiboot_tag_framebuffer_common framebuffer_common; + + union { + struct { + uint16_t framebuffer_palette_num_colors; + multiboot_color_t framebuffer_palette[]; + } fb1; + struct { + uint8_t framebuffer_red_field_position; + uint8_t framebuffer_red_mask_size; + uint8_t framebuffer_green_field_position; + uint8_t framebuffer_green_mask_size; + uint8_t framebuffer_blue_field_position; + uint8_t framebuffer_blue_mask_size; + } fb2; + } u; +} multiboot_tag_framebuffer_t; + +typedef struct multiboot_tag_elf_sections { + uint32_t mb_type; + uint32_t mb_size; + uint32_t mb_num; + uint32_t mb_entsize; + uint32_t mb_shndx; + char mb_sections[]; +} multiboot_tag_elf_sections_t; + +typedef struct multiboot_tag_apm { + uint32_t mb_type; + uint32_t mb_size; + uint16_t mb_version; + uint16_t mb_cseg; + uint32_t mb_offset; + uint16_t mb_cseg_16; + uint16_t mb_dseg; + uint16_t mb_flags; + uint16_t mb_cseg_len; + uint16_t mb_cseg_16_len; + uint16_t mb_dseg_len; +} multiboot_tag_apm_t; + +typedef struct multiboot_tag_efi32 { + uint32_t mb_type; + uint32_t mb_size; + uint32_t mb_pointer; +} multiboot_tag_efi32_t; + +typedef struct multiboot_tag_efi64 { + uint32_t mb_type; + uint32_t mb_size; + uint64_t mb_pointer; +} multiboot_tag_efi64_t; + +typedef struct multiboot_tag_smbios { + uint32_t mb_type; + uint32_t mb_size; + uint8_t 
mb_major; + uint8_t mb_minor; + uint8_t mb_reserved[6]; + uint8_t mb_tables[]; +} multiboot_tag_smbios_t; + +typedef struct multiboot_tag_old_acpi { + uint32_t mb_type; + uint32_t mb_size; + uint8_t mb_rsdp[]; +} multiboot_tag_old_acpi_t; + +typedef struct multiboot_tag_new_acpi { + uint32_t mb_type; + uint32_t mb_size; + uint8_t mb_rsdp[]; +} multiboot_tag_new_acpi_t; + +typedef struct multiboot_tag_network { + uint32_t mb_type; + uint32_t mb_size; + uint8_t mb_dhcpack[]; +} multiboot_tag_network_t; + +typedef struct multiboot_tag_efi_mmap { + uint32_t mb_type; + uint32_t mb_size; + uint32_t mb_descr_size; + uint32_t mb_descr_vers; + uint8_t mb_efi_mmap[]; +} multiboot_tag_efi_mmap_t; + +typedef struct multiboot_tag_efi32_ih { + uint32_t mb_type; + uint32_t mb_size; + uint32_t mb_pointer; +} multiboot_tag_efi32_ih_t; + +typedef struct multiboot_tag_efi64_ih { + uint32_t mb_type; + uint32_t mb_size; + uint64_t mb_pointer; +} multiboot_tag_efi64_ih_t; + +typedef struct multiboot_tag_load_base_addr { + uint32_t mb_type; + uint32_t mb_size; + uint32_t mb_load_base_addr; +} multiboot_tag_load_base_addr_t; + +#pragma pack() + +#endif /* !_ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* !_SYS_MULTIBOOT2_H */ diff --git a/usr/src/uts/common/sys/multiboot2_impl.h b/usr/src/uts/common/sys/multiboot2_impl.h new file mode 100644 index 0000000000..d90ed0e8ee --- /dev/null +++ b/usr/src/uts/common/sys/multiboot2_impl.h @@ -0,0 +1,53 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ */ + +/* + * Copyright 2016 Toomas Soome <tsoome@me.com> + */ + +#ifndef _SYS_MULTIBOOT2_IMPL_H +#define _SYS_MULTIBOOT2_IMPL_H + +/* + * Multiboot 2 protocol implementation for dboot. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/multiboot2.h> + +extern void *dboot_multiboot2_find_tag(multiboot2_info_header_t *, uint32_t); +extern char *dboot_multiboot2_cmdline(multiboot2_info_header_t *); +extern int dboot_multiboot2_modcount(multiboot2_info_header_t *); +extern uint32_t dboot_multiboot2_modstart(multiboot2_info_header_t *, int); +extern uint32_t dboot_multiboot2_modend(multiboot2_info_header_t *, int); +extern char *dboot_multiboot2_modcmdline(multiboot2_info_header_t *, int); +extern multiboot_tag_mmap_t * + dboot_multiboot2_get_mmap_tagp(multiboot2_info_header_t *); +extern boolean_t dboot_multiboot2_basicmeminfo(multiboot2_info_header_t *, + uint32_t *, uint32_t *); +extern uint64_t dboot_multiboot2_mmap_get_length(multiboot2_info_header_t *, + multiboot_tag_mmap_t *, int); +extern uint64_t dboot_multiboot2_mmap_get_base(multiboot2_info_header_t *, + multiboot_tag_mmap_t *, int); +extern uint32_t dboot_multiboot2_mmap_get_type(multiboot2_info_header_t *, + multiboot_tag_mmap_t *, int); +extern int dboot_multiboot2_mmap_nentries(multiboot2_info_header_t *, + multiboot_tag_mmap_t *); +extern paddr_t dboot_multiboot2_highest_addr(multiboot2_info_header_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MULTIBOOT2_IMPL_H */ diff --git a/usr/src/uts/common/sys/stddef.h b/usr/src/uts/common/sys/stddef.h new file mode 100644 index 0000000000..9dc9736241 --- /dev/null +++ b/usr/src/uts/common/sys/stddef.h @@ -0,0 +1,48 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Toomas Soome <tsoome@me.com> + */ + +#ifndef _SYS_STDDEF_H +#define _SYS_STDDEF_H + +/* + * Commonly used macros and definitions. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#if !defined(offsetof) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5) +#define offsetof(s, m) __builtin_offsetof(s, m) +#else +#if __cplusplus >= 199711L +#define offsetof(s, m) (std::size_t)(&(((s *)NULL)->m)) +#else +#define offsetof(s, m) ((size_t)(&(((s *)NULL)->m))) +#endif +#endif +#endif /* !offsetof */ + +#if !defined(container_of) +#define container_of(m, s, name) \ + (void *)((uintptr_t)(m) - (uintptr_t)offsetof(s, name)) +#endif /* !container_of */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_STDDEF_H */ diff --git a/usr/src/uts/common/sys/sysmacros.h b/usr/src/uts/common/sys/sysmacros.h index 2e895a8daf..03be89f461 100644 --- a/usr/src/uts/common/sys/sysmacros.h +++ b/usr/src/uts/common/sys/sysmacros.h @@ -33,6 +33,7 @@ #define _SYS_SYSMACROS_H #include <sys/param.h> +#include <sys/stddef.h> #ifdef __cplusplus extern "C" { @@ -369,18 +370,8 @@ extern unsigned char bcd_to_byte[256]; /* avoid any possibility of clashing with <stddef.h> version */ #if (defined(_KERNEL) || defined(_FAKE_KERNEL)) && !defined(_KMEMUSER) -#if !defined(offsetof) -#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5) -#define offsetof(s, m) __builtin_offsetof(s, m) -#else -#define offsetof(s, m) ((size_t)(&(((s *)0)->m))) -#endif -#endif /* !offsetof */ - -#define container_of(m, s, name) \ - (void *)((uintptr_t)(m) - (uintptr_t)offsetof(s, name)) - #define ARRAY_SIZE(x) (sizeof (x) / sizeof (x[0])) + #endif /* _KERNEL, !_KMEMUSER */ #ifdef __cplusplus diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files index 7e782f3f61..f3f10e1b6c 100644 --- a/usr/src/uts/i86pc/Makefile.files +++ b/usr/src/uts/i86pc/Makefile.files @@ 
-139,6 +139,7 @@ BOOT_DRIVER_OBJS = \ boot_keyboard_table.o \ boot_vga.o \ boot_mmu.o \ + dboot_multiboot2.o \ $(FONT_OBJS) CORE_OBJS += $(BOOT_DRIVER_OBJS) diff --git a/usr/src/uts/i86pc/Makefile.rules b/usr/src/uts/i86pc/Makefile.rules index 2d55410f33..a3bf823c69 100644 --- a/usr/src/uts/i86pc/Makefile.rules +++ b/usr/src/uts/i86pc/Makefile.rules @@ -211,6 +211,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/xen/os/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/dboot/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + # # dboot stuff is always 32 bit, linked to run with phys_addr == virt_addr # @@ -422,6 +426,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/os/cpupm/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/boot/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/dboot/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/vm/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) diff --git a/usr/src/uts/i86pc/boot/boot_console.c b/usr/src/uts/i86pc/boot/boot_console.c index cadc735588..6b0873d656 100644 --- a/usr/src/uts/i86pc/boot/boot_console.c +++ b/usr/src/uts/i86pc/boot/boot_console.c @@ -61,6 +61,22 @@ static int cons_color = CONS_COLOR; static int console = CONS_SCREEN_TEXT; static int tty_num = 0; static int tty_addr[] = {0x3f8, 0x2f8, 0x3e8, 0x2e8}; +static char *boot_line; +static struct boot_env { + char *be_env; /* ends with double ascii nul */ + size_t be_size; /* size of the environment, including nul */ +} boot_env; + +static int serial_ischar(void); +static int serial_getchar(void); +static void serial_putchar(int); +static void serial_adjust_prop(void); + +#if !defined(_BOOT) +/* Set if the console or mode are expressed in the boot line */ +static int console_set, console_mode_set; +#endif + #if defined(__xpv) static int console_hypervisor_redirect = B_FALSE; static int console_hypervisor_device = CONS_INVALID; @@ -76,18 +92,6 @@ console_hypervisor_dev_type(int *tnum) } #endif /* __xpv */ -static int 
serial_ischar(void); -static int serial_getchar(void); -static void serial_putchar(int); -static void serial_adjust_prop(void); - -static char *boot_line = NULL; - -#if !defined(_BOOT) -/* Set if the console or mode are expressed in the boot line */ -static int console_set, console_mode_set; -#endif - /* Clear the screen and initialize VIDEO, XPOS and YPOS. */ void clear_screen(void) @@ -328,6 +332,67 @@ out: return (ret); } +/* + * Find prop from boot env module. The data in module is list of C strings + * name=value, the list is terminated by double nul. + */ +static const char * +find_boot_env_prop(const char *name) +{ + char *ptr; + size_t len; + uintptr_t size; + + if (boot_env.be_env == NULL) + return (NULL); + + ptr = boot_env.be_env; + len = strlen(name); + + /* + * Make sure we have at least len + 2 bytes in the environment. + * We are looking for name=value\0 constructs, and the environment + * itself is terminated by '\0'. + */ + if (boot_env.be_size < len + 2) + return (NULL); + + do { + if ((strncmp(ptr, name, len) == 0) && (ptr[len] == '=')) { + ptr += len + 1; + return (ptr); + } + /* find the first '\0' */ + while (*ptr != '\0') { + ptr++; + size = (uintptr_t)ptr - (uintptr_t)boot_env.be_env; + if (size >= boot_env.be_size) + return (NULL); + } + ptr++; + + /* If the remainder is shorter than name + 2, get out. */ + size = (uintptr_t)ptr - (uintptr_t)boot_env.be_env; + if (boot_env.be_size - size < len + 2) + return (NULL); + } while (*ptr != '\0'); + return (NULL); +} + +/* + * Get prop value from either command line or boot environment. + * We always check kernel command line first, as this will keep the + * functionality and will allow user to override the values in environment. + */ +const char * +find_boot_prop(const char *name) +{ + const char *value = find_boot_line_prop(name); + + if (value == NULL) + value = find_boot_env_prop(name); + return (value); +} #define MATCHES(p, pat) \ (strncmp(p, pat, strlen(pat)) == 0 ? 
(p += strlen(pat), 1) : 0) @@ -341,14 +406,14 @@ out: /* * find a tty mode property either from cmdline or from boot properties */ -static char * +static const char * get_mode_value(char *name) { /* * when specified on boot line it looks like "name" "=".... */ if (boot_line != NULL) { - return (find_boot_line_prop(name)); + return (find_boot_prop(name)); } #if defined(_BOOT) @@ -377,8 +442,8 @@ static void serial_adjust_prop(void) { char propname[20]; - char *propval; - char *p; + const char *propval; + const char *p; ulong_t baud; uchar_t lcr = 0; uchar_t mcr = DTR | RTS; @@ -522,27 +587,47 @@ console_value_t console_devices[] = { { NULL, CONS_INVALID } }; +static void +bcons_init_env(struct xboot_info *xbi) +{ + uint32_t i; + struct boot_modules *modules; + + modules = (struct boot_modules *)(uintptr_t)xbi->bi_modules; + for (i = 0; i < xbi->bi_module_cnt; i++) { + if (modules[i].bm_type == BMT_ENV) + break; + } + if (i == xbi->bi_module_cnt) + return; + + boot_env.be_env = (char *)(uintptr_t)modules[i].bm_addr; + boot_env.be_size = modules[i].bm_size; +} + void -bcons_init(char *bootstr) +bcons_init(struct xboot_info *xbi) { console_value_t *consolep; size_t len, cons_len; - char *cons_str; + const char *cons_str; #if !defined(_BOOT) static char console_text[] = "text"; extern int post_fastreboot; #endif - boot_line = bootstr; + /* Set up data to fetch properties from command line and boot env. 
 */ + boot_line = (char *)(uintptr_t)xbi->bi_cmdline; + bcons_init_env(xbi); console = CONS_INVALID; #if defined(__xpv) - bcons_init_xen(bootstr); + bcons_init_xen(boot_line); #endif /* __xpv */ - cons_str = find_boot_line_prop("console"); + cons_str = find_boot_prop("console"); if (cons_str == NULL) - cons_str = find_boot_line_prop("output-device"); + cons_str = find_boot_prop("output-device"); #if !defined(_BOOT) if (post_fastreboot && strcmp(cons_str, "graphics") == 0) @@ -657,7 +742,6 @@ bcons_init(char *bootstr) kb_init(); break; } - boot_line = NULL; } #if !defined(_BOOT) diff --git a/usr/src/uts/i86pc/dboot/dboot_grub.s b/usr/src/uts/i86pc/dboot/dboot_grub.s index 92cacc4983..7409c12998 100644 --- a/usr/src/uts/i86pc/dboot/dboot_grub.s +++ b/usr/src/uts/i86pc/dboot/dboot_grub.s @@ -1,4 +1,3 @@ - /* * CDDL HEADER START * @@ -32,6 +31,7 @@ int silence_lint_warnings = 0; #else /* __lint */ #include <sys/multiboot.h> +#include <sys/multiboot2.h> #include <sys/asm_linkage.h> #include <sys/segments.h> #include <sys/controlregs.h> @@ -76,6 +76,103 @@ mb_header: .long 0 /* height 0 == don't care */ .long 0 /* depth 0 == don't care */ +#if defined(_BOOT_TARGET_i386) + /* + * The MB2 header must be 8 byte aligned relative to the beginning of + * the in-memory ELF object. The 32-bit kernel ELF file has sections + * which are 4-byte aligned, and as .align family directives only do + * control the alignment inside the section, we need to construct the + * image manually, by inserting the padding where needed. The alignment + * setup here depends on the first PT_LOAD section of the ELF file, if + * this section offset will change, this code must be reviewed. + * Similarly, if we add extra tag types into the information request + * or add tags into the tag list. 
+ */ + .long 0 /* padding */ +#else + .balign MULTIBOOT_HEADER_ALIGN +#endif +mb2_header: + .long MULTIBOOT2_HEADER_MAGIC + .long MULTIBOOT_ARCHITECTURE_I386 + .long mb2_header_end - mb2_header + .long -(MULTIBOOT2_HEADER_MAGIC + MULTIBOOT_ARCHITECTURE_I386 + (mb2_header_end - mb2_header)) + + /* + * Multiboot 2 tags follow. Note, the first tag immediately follows + * the header. Subsequent tags must be aligned by MULTIBOOT_TAG_ALIGN. + * + * MB information request tag. + */ +information_request_tag_start: + .word MULTIBOOT_HEADER_TAG_INFORMATION_REQUEST + .word 0 + .long information_request_tag_end - information_request_tag_start + .long MULTIBOOT_TAG_TYPE_CMDLINE + .long MULTIBOOT_TAG_TYPE_MODULE + .long MULTIBOOT_TAG_TYPE_BOOTDEV + .long MULTIBOOT_TAG_TYPE_MMAP + .long MULTIBOOT_TAG_TYPE_BASIC_MEMINFO +information_request_tag_end: + .long 0 /* padding */ + +#if defined (_BOOT_TARGET_amd64) + /* + * The following values are patched by mbh_patch for the 64-bit kernel, + * so we only provide this tag for the 64-bit kernel. 
+ */ + .balign MULTIBOOT_TAG_ALIGN +address_tag_start: + .word MULTIBOOT_HEADER_TAG_ADDRESS + .word 0 + .long address_tag_end - address_tag_start + .long mb2_header + .globl mb2_load_addr +mb2_load_addr: + .long 0 /* load addr */ + .long 0 /* load_end_addr */ + .long 0 /* bss_end_addr */ +address_tag_end: + /* + * entry address tag + */ + .balign MULTIBOOT_TAG_ALIGN +entry_address_tag_start: + .word MULTIBOOT_HEADER_TAG_ENTRY_ADDRESS + .word 0 + .long entry_address_tag_end - entry_address_tag_start + .long 0 /* entry addr */ +entry_address_tag_end: + + .balign MULTIBOOT_TAG_ALIGN /* Alignment for the next tag */ +#endif + /* + * MB console flags tag + */ +console_tag_start: + .word MULTIBOOT_HEADER_TAG_CONSOLE_FLAGS + .word 0 + .long console_tag_end - console_tag_start + .long MULTIBOOT_CONSOLE_FLAGS_EGA_TEXT_SUPPORTED +console_tag_end: + .long 0 /* padding */ + + /* + * Tell the bootloader to load the modules page aligned to + * the specified alignment. + */ + .word MULTIBOOT_HEADER_TAG_MODULE_ALIGN + .word 0 + .long 8 + + /* + * Termination tag. + */ + .word MULTIBOOT_HEADER_TAG_END + .word 0 + .long 8 +mb2_header_end: + /* * At entry we are in protected mode, 32 bit execution, paging and * interrupts are disabled. @@ -85,7 +182,8 @@ mb_header: * segment registers all have segments with base 0, limit == 0xffffffff */ code_start: - movl %ebx, mb_info + movl %eax, mb_magic + movl %ebx, mb_addr movl $stack_space, %esp /* load my stack pointer */ addl $STACK_SIZE, %esp diff --git a/usr/src/uts/i86pc/dboot/dboot_multiboot2.c b/usr/src/uts/i86pc/dboot/dboot_multiboot2.c new file mode 100644 index 0000000000..ccf81cf773 --- /dev/null +++ b/usr/src/uts/i86pc/dboot/dboot_multiboot2.c @@ -0,0 +1,341 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Toomas Soome <tsoome@me.com> + */ + +/* + * dboot module utility functions for multiboot 2 tags processing. + */ + +#include <sys/inttypes.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/multiboot2.h> +#include <sys/multiboot2_impl.h> + +struct dboot_multiboot2_iterate_ctx; + +typedef boolean_t (*dboot_multiboot2_iterate_cb_t) + (int, multiboot_tag_t *, struct dboot_multiboot2_iterate_ctx *); + +struct dboot_multiboot2_iterate_ctx { + dboot_multiboot2_iterate_cb_t dboot_iter_callback; + int dboot_iter_index; /* item from set */ + uint32_t dboot_iter_tag; /* tag to search */ + multiboot_tag_t *dboot_iter_tagp; /* search result */ +}; + +/* + * Multiboot2 tag list elements are aligned to MULTIBOOT_TAG_ALIGN. + * To get the next item from the list, we first add the tag's size + * to the start of the current tag. Next, we round up that address to the + * nearest MULTIBOOT_TAG_ALIGN address. + */ + +static multiboot_tag_t * +dboot_multiboot2_first_tag(multiboot2_info_header_t *mbi) +{ + return (&mbi->mbi_tags[0]); +} + +static multiboot_tag_t * +dboot_multiboot2_next_tag(multiboot_tag_t *tag) +{ + if (tag == NULL || tag->mb_type == MULTIBOOT_TAG_TYPE_END) + return (NULL); + + return ((multiboot_tag_t *)P2ROUNDUP((uintptr_t)tag + + tag->mb_size, MULTIBOOT_TAG_ALIGN)); +} + +/* + * Walk the tag list until we hit the first instance of a given tag or + * the end of the list. + * MB2_NEXT_TAG() will return NULL on end of list. + */ +static void * +dboot_multiboot2_find_tag_impl(multiboot_tag_t *tagp, uint32_t tag) +{ + while (tagp != NULL && tagp->mb_type != tag) { + tagp = dboot_multiboot2_next_tag(tagp); + } + return (tagp); +} + +/* + * Walk the entire list to find the first instance of the given tag. 
+ */ +void * +dboot_multiboot2_find_tag(multiboot2_info_header_t *mbi, uint32_t tag) +{ + multiboot_tag_t *tagp = dboot_multiboot2_first_tag(mbi); + + return (dboot_multiboot2_find_tag_impl(tagp, tag)); +} + +/* + * dboot_multiboot2_iterate() + * + * While most tags in tag list are unique, the modules are specified + * one module per tag and therefore we need an mechanism to process + * tags in set. + * + * Arguments: + * mbi: multiboot info header + * data: callback context. + * + * Return value: + * Processed item count. + * Callback returning B_TRUE will terminate the iteration. + */ +static int +dboot_multiboot2_iterate(multiboot2_info_header_t *mbi, + struct dboot_multiboot2_iterate_ctx *ctx) +{ + dboot_multiboot2_iterate_cb_t callback = ctx->dboot_iter_callback; + multiboot_tag_t *tagp; + uint32_t tag = ctx->dboot_iter_tag; + int index = 0; + + tagp = dboot_multiboot2_find_tag(mbi, tag); + while (tagp != NULL) { + if (callback != NULL) { + if (callback(index, tagp, ctx) == B_TRUE) { + return (index + 1); + } + } + tagp = dboot_multiboot2_next_tag(tagp); + tagp = dboot_multiboot2_find_tag_impl(tagp, tag); + index++; + } + return (index); +} + +char * +dboot_multiboot2_cmdline(multiboot2_info_header_t *mbi) +{ + multiboot_tag_string_t *tag; + + tag = dboot_multiboot2_find_tag(mbi, MULTIBOOT_TAG_TYPE_CMDLINE); + + if (tag != NULL) + return (&tag->mb_string[0]); + else + return (NULL); +} + +/* + * Simple callback to index item in set. + * Terminates iteration if the indexed item is found. 
+ */ +static boolean_t +dboot_multiboot2_iterate_callback(int index, multiboot_tag_t *tagp, + struct dboot_multiboot2_iterate_ctx *ctx) +{ + if (index == ctx->dboot_iter_index) { + ctx->dboot_iter_tagp = tagp; + return (B_TRUE); + } + return (B_FALSE); +} + +int +dboot_multiboot2_modcount(multiboot2_info_header_t *mbi) +{ + struct dboot_multiboot2_iterate_ctx ctx = { + .dboot_iter_callback = NULL, + .dboot_iter_index = 0, + .dboot_iter_tag = MULTIBOOT_TAG_TYPE_MODULE, + .dboot_iter_tagp = NULL + }; + + return (dboot_multiboot2_iterate(mbi, &ctx)); +} + +uint32_t +dboot_multiboot2_modstart(multiboot2_info_header_t *mbi, int index) +{ + multiboot_tag_module_t *tagp; + struct dboot_multiboot2_iterate_ctx ctx = { + .dboot_iter_callback = dboot_multiboot2_iterate_callback, + .dboot_iter_index = index, + .dboot_iter_tag = MULTIBOOT_TAG_TYPE_MODULE, + .dboot_iter_tagp = NULL + }; + + if (dboot_multiboot2_iterate(mbi, &ctx) != 0) { + tagp = (multiboot_tag_module_t *)ctx.dboot_iter_tagp; + + if (tagp != NULL) + return (tagp->mb_mod_start); + } + return (0); +} + +uint32_t +dboot_multiboot2_modend(multiboot2_info_header_t *mbi, int index) +{ + multiboot_tag_module_t *tagp; + struct dboot_multiboot2_iterate_ctx ctx = { + .dboot_iter_callback = dboot_multiboot2_iterate_callback, + .dboot_iter_index = index, + .dboot_iter_tag = MULTIBOOT_TAG_TYPE_MODULE, + .dboot_iter_tagp = NULL + }; + + if (dboot_multiboot2_iterate(mbi, &ctx) != 0) { + tagp = (multiboot_tag_module_t *)ctx.dboot_iter_tagp; + + if (tagp != NULL) + return (tagp->mb_mod_end); + } + return (0); +} + +char * +dboot_multiboot2_modcmdline(multiboot2_info_header_t *mbi, int index) +{ + multiboot_tag_module_t *tagp; + struct dboot_multiboot2_iterate_ctx ctx = { + .dboot_iter_callback = dboot_multiboot2_iterate_callback, + .dboot_iter_index = index, + .dboot_iter_tag = MULTIBOOT_TAG_TYPE_MODULE, + .dboot_iter_tagp = NULL + }; + + if (dboot_multiboot2_iterate(mbi, &ctx) != 0) { + tagp = (multiboot_tag_module_t 
*)ctx.dboot_iter_tagp; + + if (tagp != NULL) + return (&tagp->mb_cmdline[0]); + } + return (NULL); +} + +multiboot_tag_mmap_t * +dboot_multiboot2_get_mmap_tagp(multiboot2_info_header_t *mbi) +{ + return (dboot_multiboot2_find_tag(mbi, MULTIBOOT_TAG_TYPE_MMAP)); +} + +boolean_t +dboot_multiboot2_basicmeminfo(multiboot2_info_header_t *mbi, + uint32_t *lower, uint32_t *upper) +{ + multiboot_tag_basic_meminfo_t *mip; + + mip = dboot_multiboot2_find_tag(mbi, MULTIBOOT_TAG_TYPE_BASIC_MEMINFO); + if (mip != NULL) { + *lower = mip->mb_mem_lower; + *upper = mip->mb_mem_upper; + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Return the type of mmap entry referenced by index, or 0 if there is none. + */ +uint32_t +dboot_multiboot2_mmap_get_type(multiboot2_info_header_t *mbi, + multiboot_tag_mmap_t *mb2_mmap_tagp, int index) +{ + multiboot_mmap_entry_t *mapentp; + + if (mb2_mmap_tagp == NULL) + mb2_mmap_tagp = dboot_multiboot2_get_mmap_tagp(mbi); + + if (mb2_mmap_tagp == NULL) + return (0); + + if (dboot_multiboot2_mmap_nentries(mbi, mb2_mmap_tagp) <= index) + return (0); + + mapentp = (multiboot_mmap_entry_t *)(mb2_mmap_tagp->mb_entries + + index * mb2_mmap_tagp->mb_entry_size); + return (mapentp->mmap_type); +} + +/* + * Return the length of mmap entry referenced by index, or 0 if there is none. + */ +uint64_t +dboot_multiboot2_mmap_get_length(multiboot2_info_header_t *mbi, + multiboot_tag_mmap_t *mb2_mmap_tagp, int index) +{ + multiboot_mmap_entry_t *mapentp; + + if (mb2_mmap_tagp == NULL) + mb2_mmap_tagp = dboot_multiboot2_get_mmap_tagp(mbi); + + if (mb2_mmap_tagp == NULL) + return (0); + + if (dboot_multiboot2_mmap_nentries(mbi, mb2_mmap_tagp) <= index) + return (0); + + mapentp = (multiboot_mmap_entry_t *)(mb2_mmap_tagp->mb_entries + + index * mb2_mmap_tagp->mb_entry_size); + return (mapentp->mmap_len); +} + +/* + * Return the address from mmap entry referenced by index.
+ */ +uint64_t +dboot_multiboot2_mmap_get_base(multiboot2_info_header_t *mbi, + multiboot_tag_mmap_t *mb2_mmap_tagp, int index) +{ + multiboot_mmap_entry_t *mapentp; + + if (mb2_mmap_tagp == NULL) + mb2_mmap_tagp = dboot_multiboot2_get_mmap_tagp(mbi); + + if (mb2_mmap_tagp == NULL) + return (0); + + if (dboot_multiboot2_mmap_nentries(mbi, mb2_mmap_tagp) <= index) + return (0); + + mapentp = (multiboot_mmap_entry_t *)(mb2_mmap_tagp->mb_entries + + index * mb2_mmap_tagp->mb_entry_size); + return (mapentp->mmap_addr); +} + +/* + * Count and return the number of mmap entries provided by the tag. + */ +int +dboot_multiboot2_mmap_nentries(multiboot2_info_header_t *mbi, + multiboot_tag_mmap_t *mb2_mmap_tagp) +{ + if (mb2_mmap_tagp == NULL) + mb2_mmap_tagp = dboot_multiboot2_get_mmap_tagp(mbi); + + if (mb2_mmap_tagp != NULL && mb2_mmap_tagp->mb_entry_size != 0) { + return ((mb2_mmap_tagp->mb_size - + offsetof(multiboot_tag_mmap_t, mb_entries)) / + mb2_mmap_tagp->mb_entry_size); + } + return (0); +} + +/* + * Return the highest address used by info header. + */ +paddr_t +dboot_multiboot2_highest_addr(multiboot2_info_header_t *mbi) +{ + return ((paddr_t)(uintptr_t)mbi + mbi->mbi_total_size); +} diff --git a/usr/src/uts/i86pc/dboot/dboot_startkern.c b/usr/src/uts/i86pc/dboot/dboot_startkern.c index 6abb7c6349..344665bf1f 100644 --- a/usr/src/uts/i86pc/dboot/dboot_startkern.c +++ b/usr/src/uts/i86pc/dboot/dboot_startkern.c @@ -33,6 +33,9 @@ #include <sys/systm.h> #include <sys/mach_mmu.h> #include <sys/multiboot.h> +#include <sys/multiboot2.h> +#include <sys/multiboot2_impl.h> +#include <sys/sysmacros.h> #include <sys/sha1.h> #include <util/string.h> #include <util/strtolctype.h> @@ -46,6 +49,7 @@ pfn_t *mfn_to_pfn_mapping; #else /* !__xpv */ extern multiboot_header_t mb_header; +extern uint32_t mb2_load_addr; extern int have_cpuid(void); #endif /* !__xpv */ @@ -132,7 +136,15 @@ start_info_t *xen_info; /* * If on the metal, then we have a multiboot loader.
*/ +uint32_t mb_magic; /* magic from boot loader */ +uint32_t mb_addr; /* multiboot info package from loader */ +int multiboot_version; multiboot_info_t *mb_info; +multiboot2_info_header_t *mb2_info; +multiboot_tag_mmap_t *mb2_mmap_tagp; +int num_entries; /* mmap entry count */ +boolean_t num_entries_set; /* is mmap entry count set */ +uintptr_t load_addr; #endif /* __xpv */ @@ -181,6 +193,30 @@ uint_t rsvdmemlists_used = 0; struct boot_modules modules[MAX_BOOT_MODULES]; uint_t modules_used = 0; +#ifdef __xpv +/* + * Xen strips the size field out of the mb_memory_map_t, see struct e820entry + * definition in Xen source. + */ +typedef struct { + uint32_t base_addr_low; + uint32_t base_addr_high; + uint32_t length_low; + uint32_t length_high; + uint32_t type; +} mmap_t; + +/* + * There is 512KB of scratch area after the boot stack page. + * We'll use that for everything except the kernel nucleus pages which are too + * big to fit there and are allocated last anyway. + */ +#define MAXMAPS 100 +static mmap_t map_buffer[MAXMAPS]; +#else +typedef mb_memory_map_t mmap_t; +#endif + /* * Debugging macros */ @@ -616,29 +652,182 @@ exclude_from_pci(uint64_t start, uint64_t end) } /* - * Xen strips the size field out of the mb_memory_map_t, see struct e820entry - * definition in Xen source. + * During memory allocation, find the highest address not used yet. 
*/ -#ifdef __xpv -typedef struct { - uint32_t base_addr_low; - uint32_t base_addr_high; - uint32_t length_low; - uint32_t length_high; - uint32_t type; -} mmap_t; +static void +check_higher(paddr_t a) +{ + if (a < next_avail_addr) + return; + next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE); + DBG(next_avail_addr); +} + +static int +dboot_loader_mmap_entries(void) +{ +#if !defined(__xpv) + if (num_entries_set == B_TRUE) + return (num_entries); + + switch (multiboot_version) { + case 1: + DBG(mb_info->flags); + if (mb_info->flags & 0x40) { + mb_memory_map_t *mmap; + + DBG(mb_info->mmap_addr); + DBG(mb_info->mmap_length); + check_higher(mb_info->mmap_addr + mb_info->mmap_length); + + for (mmap = (mb_memory_map_t *)mb_info->mmap_addr; + (uint32_t)mmap < mb_info->mmap_addr + + mb_info->mmap_length; + mmap = (mb_memory_map_t *)((uint32_t)mmap + + mmap->size + sizeof (mmap->size))) + ++num_entries; + + num_entries_set = B_TRUE; + } + break; + case 2: + num_entries_set = B_TRUE; + num_entries = dboot_multiboot2_mmap_nentries(mb2_info, + mb2_mmap_tagp); + break; + default: + dboot_panic("Unknown multiboot version: %d\n", + multiboot_version); + break; + } + return (num_entries); #else -typedef mb_memory_map_t mmap_t; + return (MAXMAPS); +#endif +} + +static uint32_t +dboot_loader_mmap_get_type(int index) +{ +#if !defined(__xpv) + mb_memory_map_t *mp, *mpend; + int i; + + switch (multiboot_version) { + case 1: + mp = (mb_memory_map_t *)mb_info->mmap_addr; + mpend = (mb_memory_map_t *) + (mb_info->mmap_addr + mb_info->mmap_length); + + for (i = 0; mp < mpend && i != index; i++) + mp = (mb_memory_map_t *)((uint32_t)mp + mp->size + + sizeof (mp->size)); + if (mp >= mpend) { + dboot_panic("dboot_loader_mmap_get_type(): index " + "out of bounds: %d\n", index); + } + return (mp->type); + + case 2: + return (dboot_multiboot2_mmap_get_type(mb2_info, + mb2_mmap_tagp, index)); + + default: + dboot_panic("Unknown multiboot version: %d\n", + multiboot_version); + break; + } + return (0); 
+#else + return (map_buffer[index].type); +#endif +} + +static uint64_t +dboot_loader_mmap_get_base(int index) +{ +#if !defined(__xpv) + mb_memory_map_t *mp, *mpend; + int i; + + switch (multiboot_version) { + case 1: + mp = (mb_memory_map_t *)mb_info->mmap_addr; + mpend = (mb_memory_map_t *) + (mb_info->mmap_addr + mb_info->mmap_length); + + for (i = 0; mp < mpend && i != index; i++) + mp = (mb_memory_map_t *)((uint32_t)mp + mp->size + + sizeof (mp->size)); + if (mp >= mpend) { + dboot_panic("dboot_loader_mmap_get_base(): index " + "out of bounds: %d\n", index); + } + return (((uint64_t)mp->base_addr_high << 32) + + (uint64_t)mp->base_addr_low); + + case 2: + return (dboot_multiboot2_mmap_get_base(mb2_info, + mb2_mmap_tagp, index)); + + default: + dboot_panic("Unknown multiboot version: %d\n", + multiboot_version); + break; + } + return (0); +#else + return (((uint64_t)map_buffer[index].base_addr_high << 32) + + (uint64_t)map_buffer[index].base_addr_low); +#endif +} + +static uint64_t +dboot_loader_mmap_get_length(int index) +{ +#if !defined(__xpv) + mb_memory_map_t *mp, *mpend; + int i; + + switch (multiboot_version) { + case 1: + mp = (mb_memory_map_t *)mb_info->mmap_addr; + mpend = (mb_memory_map_t *) + (mb_info->mmap_addr + mb_info->mmap_length); + + for (i = 0; mp < mpend && i != index; i++) + mp = (mb_memory_map_t *)((uint32_t)mp + mp->size + + sizeof (mp->size)); + if (mp >= mpend) { + dboot_panic("dboot_loader_mmap_get_length(): index " + "out of bounds: %d\n", index); + } + return (((uint64_t)mp->length_high << 32) + + (uint64_t)mp->length_low); + + case 2: + return (dboot_multiboot2_mmap_get_length(mb2_info, + mb2_mmap_tagp, index)); + + default: + dboot_panic("Unknown multiboot version: %d\n", + multiboot_version); + break; + } + return (0); +#else + return (((uint64_t)map_buffer[index].length_high << 32) + + (uint64_t)map_buffer[index].length_low); #endif +} static void -build_pcimemlists(mmap_t *mem, int num) +build_pcimemlists(void) { - mmap_t *mmap; 
uint64_t page_offset = MMU_PAGEOFFSET; /* needs to be 64 bits */ uint64_t start; uint64_t end; - int i; + int i, num; /* * initialize @@ -647,18 +836,18 @@ build_pcimemlists(mmap_t *mem, int num) pcimemlists[0].size = pci_hi_limit - pci_lo_limit; pcimemlists_used = 1; + num = dboot_loader_mmap_entries(); /* * Fill in PCI memlists. */ - for (mmap = mem, i = 0; i < num; ++i, ++mmap) { - start = ((uint64_t)mmap->base_addr_high << 32) + - mmap->base_addr_low; - end = start + ((uint64_t)mmap->length_high << 32) + - mmap->length_low; + for (i = 0; i < num; ++i) { + start = dboot_loader_mmap_get_base(i); + end = start + dboot_loader_mmap_get_length(i); if (prom_debug) dboot_printf("\ttype: %d %" PRIx64 "..%" - PRIx64 "\n", mmap->type, start, end); + PRIx64 "\n", dboot_loader_mmap_get_type(i), + start, end); /* * page align start and end @@ -697,13 +886,7 @@ build_pcimemlists(mmap_t *mem, int num) #if defined(__xpv) /* * Initialize memory allocator stuff from hypervisor-supplied start info. - * - * There is 512KB of scratch area after the boot stack page. - * We'll use that for everything except the kernel nucleus pages which are too - * big to fit there and are allocated last anyway. 
*/ -#define MAXMAPS 100 -static mmap_t map_buffer[MAXMAPS]; static void init_mem_alloc(void) { @@ -783,12 +966,159 @@ init_mem_alloc(void) set_xen_guest_handle(map.buffer, map_buffer); if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0) dboot_panic("getting XENMEM_machine_memory_map failed"); - build_pcimemlists(map_buffer, map.nr_entries); + build_pcimemlists(); } } #else /* !__xpv */ +static void +dboot_multiboot1_xboot_consinfo(void) +{ +} + +static void +dboot_multiboot2_xboot_consinfo(void) +{ +} + +static int +dboot_multiboot_modcount(void) +{ + switch (multiboot_version) { + case 1: + return (mb_info->mods_count); + + case 2: + return (dboot_multiboot2_modcount(mb2_info)); + + default: + dboot_panic("Unknown multiboot version: %d\n", + multiboot_version); + break; + } + return (0); +} + +static uint32_t +dboot_multiboot_modstart(int index) +{ + switch (multiboot_version) { + case 1: + return (((mb_module_t *)mb_info->mods_addr)[index].mod_start); + + case 2: + return (dboot_multiboot2_modstart(mb2_info, index)); + + default: + dboot_panic("Unknown multiboot version: %d\n", + multiboot_version); + break; + } + return (0); +} + +static uint32_t +dboot_multiboot_modend(int index) +{ + switch (multiboot_version) { + case 1: + return (((mb_module_t *)mb_info->mods_addr)[index].mod_end); + + case 2: + return (dboot_multiboot2_modend(mb2_info, index)); + + default: + dboot_panic("Unknown multiboot version: %d\n", + multiboot_version); + break; + } + return (0); +} + +static char * +dboot_multiboot_modcmdline(int index) +{ + switch (multiboot_version) { + case 1: + return ((char *)((mb_module_t *) + mb_info->mods_addr)[index].mod_name); + + case 2: + return (dboot_multiboot2_modcmdline(mb2_info, index)); + + default: + dboot_panic("Unknown multiboot version: %d\n", + multiboot_version); + break; + } + return (0); +} + +/* + * Find the environment module for console setup. 
+ * Since we need the console to print early boot messages, the console is set up + * before anything else and therefore we need to pick up the environment module + * early too. + * + * Note, we just will search for and if found, will pass the env + * module to console setup, the proper module list processing will happen later. + */ +static void +dboot_find_env(void) +{ + int i, modcount; + uint32_t mod_start, mod_end; + char *cmdline; + + modcount = dboot_multiboot_modcount(); + + for (i = 0; i < modcount; ++i) { + cmdline = dboot_multiboot_modcmdline(i); + if (cmdline == NULL) + continue; + + if (strstr(cmdline, "type=environment") == NULL) + continue; + + mod_start = dboot_multiboot_modstart(i); + mod_end = dboot_multiboot_modend(i); + modules[0].bm_addr = mod_start; + modules[0].bm_size = mod_end - mod_start; + modules[0].bm_name = NULL; + modules[0].bm_hash = NULL; + modules[0].bm_type = BMT_ENV; + bi->bi_modules = (native_ptr_t)(uintptr_t)modules; + bi->bi_module_cnt = 1; + return; + } +} + +static boolean_t +dboot_multiboot_basicmeminfo(uint32_t *lower, uint32_t *upper) +{ + boolean_t rv = B_FALSE; + + switch (multiboot_version) { + case 1: + if (mb_info->flags & 0x01) { + *lower = mb_info->mem_lower; + *upper = mb_info->mem_upper; + rv = B_TRUE; + } + break; + + case 2: + return (dboot_multiboot2_basicmeminfo(mb2_info, lower, upper)); + + default: + dboot_panic("Unknown multiboot version: %d\n", + multiboot_version); + break; + } + return (rv); +} + static uint8_t dboot_a2h(char v) { @@ -860,6 +1190,8 @@ type_to_str(boot_module_type_t type) return ("file"); case BMT_HASH: return ("hash"); + case BMT_ENV: + return ("environment"); default: return ("unknown"); } @@ -908,21 +1240,23 @@ check_images(void) * hashes which are checked prior to transferring control to the kernel. 
*/ static void -process_module(mb_module_t *mod) +process_module(int midx) { - int midx = modules_used++; + uint32_t mod_start = dboot_multiboot_modstart(midx); + uint32_t mod_end = dboot_multiboot_modend(midx); + char *cmdline = dboot_multiboot_modcmdline(midx); char *p, *q; + check_higher(mod_end); if (prom_debug) { dboot_printf("\tmodule #%d: '%s' at 0x%lx, end 0x%lx\n", - midx, (char *)(mod->mod_name), - (ulong_t)mod->mod_start, (ulong_t)mod->mod_end); + midx, cmdline, (ulong_t)mod_start, (ulong_t)mod_end); } - if (mod->mod_start > mod->mod_end) { + if (mod_start > mod_end) { dboot_panic("module #%d: module start address 0x%lx greater " "than end address 0x%lx", midx, - (ulong_t)mod->mod_start, (ulong_t)mod->mod_end); + (ulong_t)mod_start, (ulong_t)mod_end); } /* @@ -943,18 +1277,18 @@ process_module(mb_module_t *mod) * correct number of bytes in each module, achieving exactly this. */ - modules[midx].bm_addr = mod->mod_start; - modules[midx].bm_size = mod->mod_end - mod->mod_start; - modules[midx].bm_name = mod->mod_name; + modules[midx].bm_addr = mod_start; + modules[midx].bm_size = mod_end - mod_start; + modules[midx].bm_name = (native_ptr_t)(uintptr_t)cmdline; modules[midx].bm_hash = NULL; modules[midx].bm_type = BMT_FILE; - if (mod->mod_name == NULL) { + if (cmdline == NULL) { modules[midx].bm_name = (native_ptr_t)(uintptr_t)noname; return; } - p = (char *)(uintptr_t)mod->mod_name; + p = cmdline; modules[midx].bm_name = (native_ptr_t)(uintptr_t)strsep(&p, " \t\f\n\r"); @@ -976,6 +1310,8 @@ process_module(mb_module_t *mod) modules[midx].bm_type = BMT_ROOTFS; } else if (strcmp(q, "hash") == 0) { modules[midx].bm_type = BMT_HASH; + } else if (strcmp(q, "environment") == 0) { + modules[midx].bm_type = BMT_ENV; } else if (strcmp(q, "file") != 0) { dboot_printf("\tmodule #%d: unknown module " "type '%s'; defaulting to 'file'", @@ -1065,89 +1401,69 @@ assign_module_hashes(void) } /* - * During memory allocation, find the highest address not used yet. 
- */ -static void -check_higher(paddr_t a) -{ - if (a < next_avail_addr) - return; - next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE); - DBG(next_avail_addr); -} - -/* * Walk through the module information finding the last used address. * The first available address will become the top level page table. - * - * We then build the phys_install memlist from the multiboot information. */ static void -init_mem_alloc(void) +dboot_process_modules(void) { - mb_memory_map_t *mmap; - mb_module_t *mod; - uint64_t start; - uint64_t end; - uint64_t page_offset = MMU_PAGEOFFSET; /* needs to be 64 bits */ + int i, modcount; extern char _end[]; - int i; - - DBG_MSG("Entered init_mem_alloc()\n"); - DBG((uintptr_t)mb_info); - if (mb_info->mods_count > MAX_BOOT_MODULES) { + DBG_MSG("\nFinding Modules\n"); + modcount = dboot_multiboot_modcount(); + if (modcount > MAX_BOOT_MODULES) { dboot_panic("Too many modules (%d) -- the maximum is %d.", - mb_info->mods_count, MAX_BOOT_MODULES); + modcount, MAX_BOOT_MODULES); } /* * search the modules to find the last used address * we'll build the module list while we're walking through here */ - DBG_MSG("\nFinding Modules\n"); check_higher((paddr_t)(uintptr_t)&_end); - for (mod = (mb_module_t *)(mb_info->mods_addr), i = 0; - i < mb_info->mods_count; - ++mod, ++i) { - process_module(mod); - check_higher(mod->mod_end); + for (i = 0; i < modcount; ++i) { + process_module(i); + modules_used++; } bi->bi_modules = (native_ptr_t)(uintptr_t)modules; DBG(bi->bi_modules); - bi->bi_module_cnt = mb_info->mods_count; + bi->bi_module_cnt = modcount; DBG(bi->bi_module_cnt); fixup_modules(); assign_module_hashes(); check_images(); +} + +/* + * We then build the phys_install memlist from the multiboot information. 
+ */ +static void +dboot_process_mmap(void) +{ + uint64_t start; + uint64_t end; + uint64_t page_offset = MMU_PAGEOFFSET; /* needs to be 64 bits */ + uint32_t lower, upper; + int i, mmap_entries; /* * Walk through the memory map from multiboot and build our memlist * structures. Note these will have native format pointers. */ DBG_MSG("\nFinding Memory Map\n"); - DBG(mb_info->flags); + num_entries = 0; + num_entries_set = B_FALSE; max_mem = 0; - if (mb_info->flags & 0x40) { - int cnt = 0; - - DBG(mb_info->mmap_addr); - DBG(mb_info->mmap_length); - check_higher(mb_info->mmap_addr + mb_info->mmap_length); - - for (mmap = (mb_memory_map_t *)mb_info->mmap_addr; - (uint32_t)mmap < mb_info->mmap_addr + mb_info->mmap_length; - mmap = (mb_memory_map_t *)((uint32_t)mmap + mmap->size - + sizeof (mmap->size))) { - ++cnt; - start = ((uint64_t)mmap->base_addr_high << 32) + - mmap->base_addr_low; - end = start + ((uint64_t)mmap->length_high << 32) + - mmap->length_low; + if ((mmap_entries = dboot_loader_mmap_entries()) > 0) { + for (i = 0; i < mmap_entries; i++) { + uint32_t type = dboot_loader_mmap_get_type(i); + start = dboot_loader_mmap_get_base(i); + end = start + dboot_loader_mmap_get_length(i); if (prom_debug) dboot_printf("\ttype: %d %" PRIx64 "..%" - PRIx64 "\n", mmap->type, start, end); + PRIx64 "\n", type, start, end); /* * page align start and end @@ -1160,7 +1476,7 @@ init_mem_alloc(void) /* * only type 1 is usable RAM */ - switch (mmap->type) { + switch (type) { case 1: if (end > max_mem) max_mem = end; @@ -1214,22 +1530,21 @@ init_mem_alloc(void) continue; } } - build_pcimemlists((mb_memory_map_t *)mb_info->mmap_addr, cnt); - } else if (mb_info->flags & 0x01) { - DBG(mb_info->mem_lower); + build_pcimemlists(); + } else if (dboot_multiboot_basicmeminfo(&lower, &upper)) { + DBG(lower); memlists[memlists_used].addr = 0; - memlists[memlists_used].size = mb_info->mem_lower * 1024; + memlists[memlists_used].size = lower * 1024; ++memlists_used; - DBG(mb_info->mem_upper); 
+ DBG(upper); memlists[memlists_used].addr = 1024 * 1024; - memlists[memlists_used].size = mb_info->mem_upper * 1024; + memlists[memlists_used].size = upper * 1024; ++memlists_used; /* * Old platform - assume I/O space at the end of memory. */ - pcimemlists[0].addr = - (mb_info->mem_upper * 1024) + (1024 * 1024); + pcimemlists[0].addr = (upper * 1024) + (1024 * 1024); pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr; pcimemlists[0].next = 0; pcimemlists[0].prev = 0; @@ -1239,8 +1554,6 @@ init_mem_alloc(void) dboot_panic("No memory info from boot loader!!!"); } - check_higher(bi->bi_cmdline); - /* * finish processing the physinstall list */ @@ -1251,6 +1564,102 @@ init_mem_alloc(void) */ build_rsvdmemlists(); } + +/* + * The highest address is used as the starting point for dboot's simple + * memory allocator. + * + * Finding the highest address in case of Multiboot 1 protocol is + * quite painful in the sense that some information provided by + * the multiboot info structure points to BIOS data, and some to RAM. + * + * The module list was processed and checked already by dboot_process_modules(), + * so we will check the command line string and the memory map. + * + * This list of to be checked items is based on our current knowledge of + * allocations made by grub1 and will need to be reviewed if there + * are updates about the information provided by Multiboot 1. + * + * In the case of the Multiboot 2, our life is much simpler, as the MB2 + * information tag list is one contiguous chunk of memory. 
+ */ +static paddr_t +dboot_multiboot1_highest_addr(void) +{ + paddr_t addr = NULL; + char *cmdl = (char *)mb_info->cmdline; + + if (mb_info->flags & MB_INFO_CMDLINE) + addr = ((paddr_t)((uintptr_t)cmdl + strlen(cmdl) + 1)); + + if (mb_info->flags & MB_INFO_MEM_MAP) + addr = MAX(addr, + ((paddr_t)(mb_info->mmap_addr + mb_info->mmap_length))); + return (addr); +} + +static void +dboot_multiboot_highest_addr(void) +{ + paddr_t addr; + + switch (multiboot_version) { + case 1: + addr = dboot_multiboot1_highest_addr(); + if (addr != NULL) + check_higher(addr); + break; + case 2: + addr = dboot_multiboot2_highest_addr(mb2_info); + if (addr != NULL) + check_higher(addr); + break; + default: + dboot_panic("Unknown multiboot version: %d\n", + multiboot_version); + break; + } +} + +/* + * Walk the boot loader provided information and find the highest free address. + */ +static void +init_mem_alloc(void) +{ + DBG_MSG("Entered init_mem_alloc()\n"); + dboot_process_modules(); + dboot_process_mmap(); + dboot_multiboot_highest_addr(); +} + +static void +dboot_multiboot_get_fwtables(void) +{ + multiboot_tag_new_acpi_t *nacpitagp; + multiboot_tag_old_acpi_t *oacpitagp; + + /* no fw tables from multiboot 1 */ + if (multiboot_version != 2) + return; + + nacpitagp = (multiboot_tag_new_acpi_t *) + dboot_multiboot2_find_tag(mb2_info, + MULTIBOOT_TAG_TYPE_ACPI_NEW); + oacpitagp = (multiboot_tag_old_acpi_t *) + dboot_multiboot2_find_tag(mb2_info, + MULTIBOOT_TAG_TYPE_ACPI_OLD); + + if (nacpitagp != NULL) { + bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t) + &nacpitagp->mb_rsdp[0]; + } else if (oacpitagp != NULL) { + bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t) + &oacpitagp->mb_rsdp[0]; + } else { + bi->bi_acpi_rsdp = NULL; + } +} #endif /* !__xpv */ /* @@ -1438,6 +1847,140 @@ kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\ module$ /platform/i86pc/$ISADIR/boot_archive\n\ See http://illumos.org/msg/SUNOS-8000-AK for details.\n" +static void +dboot_init_xboot_consinfo(void) +{ + uintptr_t addr; 
+ /* + * boot info must be 16 byte aligned for 64 bit kernel ABI + */ + addr = (uintptr_t)boot_info; + addr = (addr + 0xf) & ~0xf; + bi = (struct xboot_info *)addr; + +#if !defined(__xpv) + switch (multiboot_version) { + case 1: + dboot_multiboot1_xboot_consinfo(); + break; + case 2: + dboot_multiboot2_xboot_consinfo(); + break; + default: + dboot_panic("Unknown multiboot version: %d\n", + multiboot_version); + break; + } + /* + * Lookup environment module for the console. Complete module list + * will be built after console setup. + */ + dboot_find_env(); +#endif +} + +/* + * Set up basic data from the boot loader. + * The load_addr is part of AOUT kludge setup in dboot_grub.s, to support + * 32-bit dboot code setup used to set up and start 64-bit kernel. + * AOUT kludge does allow 32-bit boot loader, such as grub1, to load and + * start 64-bit illumos kernel. + */ +static void +dboot_loader_init(void) +{ +#if !defined(__xpv) + mb_info = NULL; + mb2_info = NULL; + + switch (mb_magic) { + case MB_BOOTLOADER_MAGIC: + multiboot_version = 1; + mb_info = (multiboot_info_t *)(uintptr_t)mb_addr; +#if defined(_BOOT_TARGET_amd64) + load_addr = mb_header.load_addr; +#endif + break; + + case MULTIBOOT2_BOOTLOADER_MAGIC: + multiboot_version = 2; + mb2_info = (multiboot2_info_header_t *)(uintptr_t)mb_addr; + mb2_mmap_tagp = dboot_multiboot2_get_mmap_tagp(mb2_info); +#if defined(_BOOT_TARGET_amd64) + load_addr = mb2_load_addr; +#endif + break; + + default: + dboot_panic("Unknown bootloader magic: 0x%x\n", mb_magic); + break; + } +#endif /* !defined(__xpv) */ +} + +/* Extract the kernel command line from [multi]boot information. 
*/
+static char *
+dboot_loader_cmdline(void)
+{
+	char *line = NULL;
+
+#if defined(__xpv)
+	line = (char *)xen_info->cmd_line;
+#else /* __xpv */
+
+	switch (multiboot_version) {
+	case 1:
+		/* The cmdline field is only read when its flag bit is set. */
+		if (mb_info->flags & MB_INFO_CMDLINE)
+			line = (char *)mb_info->cmdline;
+		break;
+
+	case 2:
+		line = dboot_multiboot2_cmdline(mb2_info);
+		break;
+
+	default:
+		dboot_panic("Unknown multiboot version: %d\n",
+		    multiboot_version);
+		break;
+	}
+
+#endif /* __xpv */
+
+	/*
+	 * Make sure we have valid pointer so the string operations
+	 * will not crash us.
+	 */
+	if (line == NULL)
+		line = "";
+
+	return (line);
+}
+
+/*
+ * Return the name string the boot loader reported for itself, or NULL when
+ * no name is available (always NULL under the hypervisor).  The caller must
+ * be prepared for a NULL result.
+ */
+static char *
+dboot_loader_name(void)
+{
+#if defined(__xpv)
+	return (NULL);
+#else /* __xpv */
+	multiboot_tag_string_t *tag;
+
+	switch (multiboot_version) {
+	case 1:
+		return ((char *)mb_info->boot_loader_name);
+
+	case 2:
+		tag = dboot_multiboot2_find_tag(mb2_info,
+		    MULTIBOOT_TAG_TYPE_BOOT_LOADER_NAME);
+		/*
+		 * The tag lookup yields NULL when the loader did not supply
+		 * this tag; report "no name" instead of dereferencing it.
+		 */
+		if (tag == NULL)
+			return (NULL);
+		return (tag->mb_string);
+	default:
+		dboot_panic("Unknown multiboot version: %d\n",
+		    multiboot_version);
+		break;
+	}
+
+	return (NULL);
+#endif /* __xpv */
+}
 /*
  * startup_kernel has a pretty simple job. It builds pagetables which reflect
  * 1:1 mappings for all memory in use. It then also adds mappings for
@@ -1450,22 +1993,18 @@ void
 startup_kernel(void)
 {
 	char *cmdline;
-	uintptr_t addr;
+	char *bootloader;
 #if defined(__xpv)
 	physdev_set_iopl_t set_iopl;
 #endif /* __xpv */
 
+	dboot_loader_init();
 	/*
 	 * At this point we are executing in a 32 bit real mode.
*/ -#if defined(__xpv) - cmdline = (char *)xen_info->cmd_line; -#else /* __xpv */ - cmdline = (char *)mb_info->cmdline; -#endif /* __xpv */ - prom_debug = (strstr(cmdline, "prom_debug") != NULL); - map_debug = (strstr(cmdline, "map_debug") != NULL); + bootloader = dboot_loader_name(); + cmdline = dboot_loader_cmdline(); #if defined(__xpv) /* @@ -1478,23 +2017,40 @@ startup_kernel(void) } #endif /* __xpv */ - bcons_init(cmdline); - DBG_MSG("\n\nSolaris prekernel set: "); + dboot_init_xboot_consinfo(); + bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline; + bcons_init(bi); + + prom_debug = (find_boot_prop("prom_debug") != NULL); + map_debug = (find_boot_prop("map_debug") != NULL); + +#if !defined(__xpv) + dboot_multiboot_get_fwtables(); +#endif + DBG_MSG("\n\nillumos prekernel set: "); DBG_MSG(cmdline); DBG_MSG("\n"); + if (bootloader != NULL && prom_debug) { + dboot_printf("Kernel loaded by: %s\n", bootloader); +#if !defined(__xpv) + dboot_printf("Using multiboot %d boot protocol.\n", + multiboot_version); +#endif + } + if (strstr(cmdline, "multiboot") != NULL) { dboot_panic(NO_MULTIBOOT); } - /* - * boot info must be 16 byte aligned for 64 bit kernel ABI - */ - addr = (uintptr_t)boot_info; - addr = (addr + 0xf) & ~0xf; - bi = (struct xboot_info *)addr; DBG((uintptr_t)bi); - bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline; +#if !defined(__xpv) + DBG((uintptr_t)mb_info); + DBG((uintptr_t)mb2_info); + if (mb2_info != NULL) + DBG(mb2_info->mbi_total_size); + DBG(bi->bi_acpi_rsdp); +#endif /* * Need correct target_kernel_text value @@ -1709,7 +2265,8 @@ startup_kernel(void) ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG); if (ktext_phys == 0) dboot_panic("failed to allocate aligned kernel memory"); - if (dboot_elfload64(mb_header.load_addr) != 0) + DBG(load_addr); + if (dboot_elfload64(load_addr) != 0) dboot_panic("failed to parse kernel ELF image, rebooting"); #endif @@ -1757,7 +2314,20 @@ startup_kernel(void) DBG(bi->bi_next_paddr); bi->bi_next_vaddr = 
(uintptr_t)next_avail_addr; DBG(bi->bi_next_vaddr); - bi->bi_mb_info = (uintptr_t)mb_info; + bi->bi_mb_version = multiboot_version; + + switch (multiboot_version) { + case 1: + bi->bi_mb_info = (uintptr_t)mb_info; + break; + case 2: + bi->bi_mb_info = (uintptr_t)mb2_info; + break; + default: + dboot_panic("Unknown multiboot version: %d\n", + multiboot_version); + break; + } bi->bi_top_page_table = (uintptr_t)top_page_table; #endif /* __xpv */ diff --git a/usr/src/uts/i86pc/os/ddi_impl.c b/usr/src/uts/i86pc/os/ddi_impl.c index 84b41cfdad..fa07371303 100644 --- a/usr/src/uts/i86pc/os/ddi_impl.c +++ b/usr/src/uts/i86pc/os/ddi_impl.c @@ -1903,6 +1903,9 @@ get_boot_properties(void) copy_boot_str(bop_staging_area, property_val, 50); (void) ndi_prop_update_string(DDI_DEV_T_NONE, devi, property_name, property_val); + } else if (strcmp(name, "acpi-root-tab") == 0) { + (void) ndi_prop_update_int64(DDI_DEV_T_NONE, devi, + property_name, *((int64_t *)bop_staging_area)); } else if (strcmp(name, "stdout") == 0) { (void) ndi_prop_update_int(DDI_DEV_T_NONE, devi, property_name, *((int *)bop_staging_area)); diff --git a/usr/src/uts/i86pc/os/fakebop.c b/usr/src/uts/i86pc/os/fakebop.c index 2a1c65d4b6..8616ef9f40 100644 --- a/usr/src/uts/i86pc/os/fakebop.c +++ b/usr/src/uts/i86pc/os/fakebop.c @@ -40,6 +40,8 @@ #include <sys/bootsvcs.h> #include <sys/bootinfo.h> #include <sys/multiboot.h> +#include <sys/multiboot2.h> +#include <sys/multiboot2_impl.h> #include <sys/bootvfs.h> #include <sys/bootprops.h> #include <sys/varargs.h> @@ -132,7 +134,7 @@ shared_info_t *HYPERVISOR_shared_info; static ulong_t total_bop_alloc_scratch = 0; static ulong_t total_bop_alloc_kernel = 0; -static void build_firmware_properties(void); +static void build_firmware_properties(struct xboot_info *); static int early_allocation = 1; @@ -791,7 +793,7 @@ done: bcons_init2(inputdev, outputdev, consoledev); } - if (strstr((char *)xbootp->bi_cmdline, "prom_debug") || kbm_debug) + if (find_boot_prop("prom_debug") || 
kbm_debug) boot_prop_display(line); } @@ -1130,7 +1132,8 @@ build_panic_cmdline(const char *cmd, int cmdlen) #ifndef __xpv /* - * Construct boot command line for Fast Reboot + * Construct boot command line for Fast Reboot. The saved_cmdline + * is also reported by "eeprom bootcmd". */ static void build_fastboot_cmdline(struct xboot_info *xbp) @@ -1199,6 +1202,125 @@ save_boot_info(struct xboot_info *xbi) } #endif /* __xpv */ +/* + * Import boot environment module variables as properties, applying + * blacklist filter for variables we know we will not use. + * + * Since the environment can be relatively large, containing many variables + * used only for boot loader purposes, we will use a blacklist based filter. + * To keep the blacklist from growing too large, we use prefix based filtering. + * This is possible because in many cases, the loader variable names are + * using a structured layout. + * + * We will not overwrite already set properties. + */ +static struct bop_blacklist { + const char *bl_name; + int bl_name_len; +} bop_prop_blacklist[] = { + { "ISADIR", sizeof ("ISADIR") }, + { "acpi", sizeof ("acpi") }, + { "autoboot_delay", sizeof ("autoboot_delay") }, + { "autoboot_delay", sizeof ("autoboot_delay") }, + { "beansi_", sizeof ("beansi_") }, + { "beastie", sizeof ("beastie") }, + { "bemenu", sizeof ("bemenu") }, + { "boot.", sizeof ("boot.") }, + { "bootenv", sizeof ("bootenv") }, + { "currdev", sizeof ("currdev") }, + { "dhcp.", sizeof ("dhcp.") }, + { "interpret", sizeof ("interpret") }, + { "kernel", sizeof ("kernel") }, + { "loaddev", sizeof ("loaddev") }, + { "loader_", sizeof ("loader_") }, + { "module_path", sizeof ("module_path") }, + { "nfs.", sizeof ("nfs.") }, + { "pcibios", sizeof ("pcibios") }, + { "prompt", sizeof ("prompt") }, + { "smbios", sizeof ("smbios") }, + { "tem", sizeof ("tem") }, + { "twiddle_divisor", sizeof ("twiddle_divisor") }, + { "zfs_be", sizeof ("zfs_be") }, +}; + +/* + * Match the name against prefixes in above blacklist. 
If the match was
+ * found, this name is blacklisted.
+ */
+static boolean_t
+name_is_blacklisted(const char *name)
+{
+	int i, n;
+
+	n = sizeof (bop_prop_blacklist) / sizeof (bop_prop_blacklist[0]);
+	for (i = 0; i < n; i++) {
+		/*
+		 * bl_name_len was filled in with sizeof (string), which counts
+		 * the terminating NUL; drop it so only the prefix is compared.
+		 */
+		if (strncmp(bop_prop_blacklist[i].bl_name, name,
+		    bop_prop_blacklist[i].bl_name_len - 1) == 0) {
+			return (B_TRUE);
+		}
+	}
+	return (B_FALSE);
+}
+
+/*
+ * Walk the environment module, a sequence of NUL terminated "name=value"
+ * strings ended by an empty string or by the end of the module, and create
+ * a boot property for every variable that has a non-empty value, is not
+ * already set and is not blacklisted.  Parsing stops at the first entry that
+ * is not completely contained in the module.
+ */
+static void
+process_boot_environment(struct boot_modules *benv)
+{
+	char *env, *end, *ptr, *name, *value;
+	uint32_t name_len;
+
+	if (benv == NULL || benv->bm_type != BMT_ENV)
+		return;
+	ptr = env = benv->bm_addr;
+	end = env + benv->bm_size;
+
+	/* Every read of *ptr below is preceded by a ptr < end check. */
+	while (ptr < end && *ptr != '\0') {
+		/* find '=' */
+		name = ptr;
+		while (ptr < end && *ptr != '=')
+			ptr++;
+		if (ptr >= end)		/* Something is very wrong. */
+			return;
+		name_len = ptr - name;
+
+		/* Find the NUL terminating the value. */
+		value = ++ptr;
+		while (ptr < end && *ptr != '\0')
+			ptr++;
+
+		/* Did we reach the end of the module? */
+		if (ptr >= end)
+			return;
+
+		/*
+		 * Step over the terminator now, so that every "continue"
+		 * below resumes at the start of the next variable.
+		 */
+		ptr++;
+
+		/* Skip variables whose name does not fit the buffer. */
+		if (sizeof (buffer) <= name_len)
+			continue;
+
+		if (*value == '\0')
+			continue;
+
+		/* Copy just the name; it is not NUL terminated in place. */
+		(void) strncpy(buffer, name, name_len);
+		buffer[name_len] = '\0';
+		name = buffer;
+
+		/* Is this property already set? */
+		if (do_bsys_getproplen(NULL, name) >= 0)
+			continue;
+
+		if (name_is_blacklisted(name) == B_TRUE)
+			continue;
+
+		/* Create new property. */
+		bsetprops(name, value);
+	}
+}
 /*
  * 1st pass at building the table of boot properties.
This includes: @@ -1218,7 +1340,7 @@ build_boot_properties(struct xboot_info *xbp) int name_len; char *value; int value_len; - struct boot_modules *bm, *rdbm; + struct boot_modules *bm, *rdbm, *benv = NULL; char *propbuf; int quoted = 0; int boot_arg_len; @@ -1228,9 +1350,6 @@ build_boot_properties(struct xboot_info *xbp) static int stdout_val = 0; uchar_t boot_device; char str[3]; - multiboot_info_t *mbi; - int netboot; - struct sol_netinfo *sip; #endif /* @@ -1250,6 +1369,13 @@ build_boot_properties(struct xboot_info *xbp) if (bm[i].bm_type == BMT_HASH || bm[i].bm_name == NULL) continue; + if (bm[i].bm_type == BMT_ENV) { + if (benv == NULL) + benv = &bm[i]; + else + continue; + } + (void) snprintf(modid, sizeof (modid), "module-name-%u", midx); bsetprops(modid, (char *)bm[i].bm_name); @@ -1277,6 +1403,19 @@ build_boot_properties(struct xboot_info *xbp) fastreboot_disable(FBNS_BOOTMOD); } +#ifndef __xpv + /* + * Disable fast reboot if we're using the Multiboot 2 boot protocol, + * since we don't currently support MB2 info and module relocation. + * Note that fast reboot will have already been disabled if multiple + * modules are present, since the current implementation assumes that + * we only have a single module, the boot_archive. 
+ */ + if (xbp->bi_mb_version != 1) { + fastreboot_disable(FBNS_MULTIBOOT2); + } +#endif + DBG_MSG("Parsing command line for boot properties\n"); value = xbp->bi_cmdline; @@ -1470,48 +1609,83 @@ build_boot_properties(struct xboot_info *xbp) bsetprops("boot-args", boot_args); bsetprops("bootargs", boot_args); -#ifndef __xpv - /* - * set the BIOS boot device from GRUB - */ - netboot = 0; - mbi = xbp->bi_mb_info; + process_boot_environment(benv); +#ifndef __xpv /* * Build boot command line for Fast Reboot */ build_fastboot_cmdline(xbp); - /* - * Save various boot information for Fast Reboot - */ - save_boot_info(xbp); - - if (mbi != NULL && mbi->flags & MB_INFO_BOOTDEV) { - boot_device = mbi->boot_device >> 24; - if (boot_device == 0x20) - netboot++; - str[0] = (boot_device >> 4) + '0'; - str[1] = (boot_device & 0xf) + '0'; - str[2] = 0; - bsetprops("bios-boot-device", str); - } else { - netboot = 1; - } + if (xbp->bi_mb_version == 1) { + multiboot_info_t *mbi = xbp->bi_mb_info; + int netboot; + struct sol_netinfo *sip; - /* - * In the netboot case, drives_info is overloaded with the dhcp ack. - * This is not multiboot compliant and requires special pxegrub! - */ - if (netboot && mbi->drives_length != 0) { - sip = (struct sol_netinfo *)(uintptr_t)mbi->drives_addr; - if (sip->sn_infotype == SN_TYPE_BOOTP) + /* + * set the BIOS boot device from GRUB + */ + netboot = 0; + + /* + * Save various boot information for Fast Reboot + */ + save_boot_info(xbp); + + if (mbi != NULL && mbi->flags & MB_INFO_BOOTDEV) { + boot_device = mbi->boot_device >> 24; + if (boot_device == 0x20) + netboot++; + str[0] = (boot_device >> 4) + '0'; + str[1] = (boot_device & 0xf) + '0'; + str[2] = 0; + bsetprops("bios-boot-device", str); + } else { + netboot = 1; + } + + /* + * In the netboot case, drives_info is overloaded with the + * dhcp ack. This is not multiboot compliant and requires + * special pxegrub! 
+ */ + if (netboot && mbi->drives_length != 0) { + sip = (struct sol_netinfo *)(uintptr_t)mbi->drives_addr; + if (sip->sn_infotype == SN_TYPE_BOOTP) + bsetprop("bootp-response", + sizeof ("bootp-response"), + (void *)(uintptr_t)mbi->drives_addr, + mbi->drives_length); + else if (sip->sn_infotype == SN_TYPE_RARP) + setup_rarp_props(sip); + } + } else { + multiboot2_info_header_t *mbi = xbp->bi_mb_info; + multiboot_tag_bootdev_t *bootdev = NULL; + multiboot_tag_network_t *netdev = NULL; + + if (mbi != NULL) { + bootdev = dboot_multiboot2_find_tag(mbi, + MULTIBOOT_TAG_TYPE_BOOTDEV); + netdev = dboot_multiboot2_find_tag(mbi, + MULTIBOOT_TAG_TYPE_NETWORK); + } + if (bootdev != NULL) { + DBG(bootdev->mb_biosdev); + boot_device = bootdev->mb_biosdev; + str[0] = (boot_device >> 4) + '0'; + str[1] = (boot_device & 0xf) + '0'; + str[2] = 0; + bsetprops("bios-boot-device", str); + } + if (netdev != NULL) { bsetprop("bootp-response", sizeof ("bootp-response"), - (void *)(uintptr_t)mbi->drives_addr, - mbi->drives_length); - else if (sip->sn_infotype == SN_TYPE_RARP) - setup_rarp_props(sip); + (void *)(uintptr_t)netdev->mb_dhcpack, + netdev->mb_size - + sizeof (multiboot_tag_network_t)); + } } + bsetprop("stdout", strlen("stdout"), &stdout_val, sizeof (stdout_val)); #endif /* __xpv */ @@ -1530,7 +1704,7 @@ build_boot_properties(struct xboot_info *xbp) /* * Build firmware-provided system properties */ - build_firmware_properties(); + build_firmware_properties(xbp); /* * XXPV @@ -1812,13 +1986,13 @@ _start(struct xboot_info *xbp) } #endif - bcons_init((void *)xbp->bi_cmdline); + bcons_init(xbp); have_console = 1; /* * enable debugging */ - if (strstr((char *)xbp->bi_cmdline, "kbm_debug")) + if (find_boot_prop("kbm_debug") != NULL) kbm_debug = 1; DBG_MSG("\n\n*** Entered Solaris in _start() cmdline is: "); @@ -1897,7 +2071,7 @@ _start(struct xboot_info *xbp) DBG_MSG("Initializing boot properties:\n"); build_boot_properties(xbp); - if (strstr((char *)xbp->bi_cmdline, "prom_debug") 
|| kbm_debug) { + if (find_boot_prop("prom_debug") || kbm_debug) { char *value; value = do_bsys_alloc(NULL, NULL, MMU_PAGESIZE, MMU_PAGESIZE); @@ -2019,9 +2193,26 @@ static ACPI_TABLE_RSDP * find_rsdp() { ACPI_TABLE_RSDP *rsdp; + uint64_t rsdp_val = 0; uint16_t *ebda_seg; paddr_t ebda_addr; + /* check for "acpi-root-tab" property */ + if (do_bsys_getproplen(NULL, "acpi-root-tab") == sizeof (uint64_t)) { + (void) do_bsys_getprop(NULL, "acpi-root-tab", &rsdp_val); + if (rsdp_val != 0) { + rsdp = scan_rsdp(rsdp_val, rsdp_val + sizeof (*rsdp)); + if (rsdp != NULL) { + if (kbm_debug) { + bop_printf(NULL, + "Using RSDP from bootloader: " + "0x%p\n", (void *)rsdp); + } + return (rsdp); + } + } + } + /* * Get the EBDA segment and scan the first 1K */ @@ -2536,12 +2727,18 @@ enumerate_xen_cpus() } #endif /* __xpv */ +/*ARGSUSED*/ static void -build_firmware_properties(void) +build_firmware_properties(struct xboot_info *xbp) { ACPI_TABLE_HEADER *tp = NULL; #ifndef __xpv + if (xbp->bi_acpi_rsdp) { + bsetprop64("acpi-root-tab", + (uint64_t)(uintptr_t)xbp->bi_acpi_rsdp); + } + if ((tp = find_fw_table(ACPI_SIG_MSCT)) != NULL) msct_ptr = process_msct((ACPI_TABLE_MSCT *)tp); else diff --git a/usr/src/uts/i86pc/sys/boot_console.h b/usr/src/uts/i86pc/sys/boot_console.h index b2fcf98f97..187733615c 100644 --- a/usr/src/uts/i86pc/sys/boot_console.h +++ b/usr/src/uts/i86pc/sys/boot_console.h @@ -36,6 +36,8 @@ extern "C" { #endif +#include <sys/bootinfo.h> + #define CONS_INVALID -1 #define CONS_SCREEN_TEXT 0 #define CONS_TTY 1 @@ -53,9 +55,12 @@ extern void kb_init(void); extern int kb_getchar(void); extern int kb_ischar(void); +/* Read property from command line or environment. 
*/ +extern const char *find_boot_prop(const char *); + extern int boot_console_type(int *); -extern void bcons_init(char *); +extern void bcons_init(struct xboot_info *); extern void bcons_putchar(int); extern int bcons_getchar(void); extern int bcons_ischar(void); diff --git a/usr/src/uts/i86pc/sys/fastboot_msg.h b/usr/src/uts/i86pc/sys/fastboot_msg.h index 9a1c9bd878..5643d65b29 100644 --- a/usr/src/uts/i86pc/sys/fastboot_msg.h +++ b/usr/src/uts/i86pc/sys/fastboot_msg.h @@ -42,17 +42,20 @@ #define fastboot_nosup_msg_end(id) #endif /* fastboot_nosup_msg_end */ +/* BEGIN CSTYLED */ fastboot_nosup_msg(FBNS_DEFAULT, "") fastboot_nosup_msg(FBNS_SUSPEND, " after suspend/resume") fastboot_nosup_msg(FBNS_FMAHWERR, " due to FMA recovery from hardware error") fastboot_nosup_msg(FBNS_HOTPLUG, " after DR operations") fastboot_nosup_msg(FBNS_BOOTMOD, " due to presence of boot-time modules") +fastboot_nosup_msg(FBNS_MULTIBOOT2, " due to multiboot2 boot protocol") /* * Should ALWAYS be the last one. * No fastboot_nosup_msg() after that line. */ fastboot_nosup_msg_end(FBNS_END) +/* END CSTYLED */ #undef fastboot_nosup_msg #undef fastboot_nosup_msg_end diff --git a/usr/src/uts/i86xpv/Makefile.files b/usr/src/uts/i86xpv/Makefile.files index 1da8a3813c..8fdda3652d 100644 --- a/usr/src/uts/i86xpv/Makefile.files +++ b/usr/src/uts/i86xpv/Makefile.files @@ -119,6 +119,7 @@ BOOT_DRIVER_OBJS = \ boot_mmu.o \ boot_vga.o \ boot_xconsole.o \ + dboot_multiboot2.o \ $(FONT_OBJS) CORE_OBJS += $(BOOT_DRIVER_OBJS) diff --git a/usr/src/uts/intel/io/acpica/osl.c b/usr/src/uts/intel/io/acpica/osl.c index 5bc1b855fd..5c32604088 100644 --- a/usr/src/uts/intel/io/acpica/osl.c +++ b/usr/src/uts/intel/io/acpica/osl.c @@ -229,7 +229,7 @@ AcpiOsGetRootPointer() * The boot code process the table and put the physical address * in the acpi-root-tab property. 
*/ - Address = ddi_prop_get_int(DDI_DEV_T_ANY, ddi_root_node(), + Address = ddi_prop_get_int64(DDI_DEV_T_ANY, ddi_root_node(), DDI_PROP_DONTPASS, "acpi-root-tab", NULL); if ((Address == NULL) && ACPI_FAILURE(AcpiFindRootPointer(&Address))) @@ -241,7 +241,7 @@ AcpiOsGetRootPointer() /*ARGSUSED*/ ACPI_STATUS AcpiOsPredefinedOverride(const ACPI_PREDEFINED_NAMES *InitVal, - ACPI_STRING *NewVal) + ACPI_STRING *NewVal) { *NewVal = 0; @@ -260,7 +260,7 @@ acpica_strncpy(char *dest, const char *src, int len) ACPI_STATUS AcpiOsTableOverride(ACPI_TABLE_HEADER *ExistingTable, - ACPI_TABLE_HEADER **NewTable) + ACPI_TABLE_HEADER **NewTable) { char signature[5]; char oemid[7]; @@ -418,7 +418,7 @@ acpi_sema_v(acpi_sema_t *sp, unsigned count) ACPI_STATUS AcpiOsCreateSemaphore(UINT32 MaxUnits, UINT32 InitialUnits, -ACPI_HANDLE *OutHandle) + ACPI_HANDLE *OutHandle) { acpi_sema_t *sp; @@ -622,7 +622,7 @@ AcpiOsUnmapMemory(void *LogicalAddress, ACPI_SIZE Size) /*ARGSUSED*/ ACPI_STATUS AcpiOsGetPhysicalAddress(void *LogicalAddress, - ACPI_PHYSICAL_ADDRESS *PhysicalAddress) + ACPI_PHYSICAL_ADDRESS *PhysicalAddress) { /* UNIMPLEMENTED: not invoked by ACPI CA code */ @@ -653,8 +653,8 @@ static int acpi_intr_hooked = 0; ACPI_STATUS AcpiOsInstallInterruptHandler(UINT32 InterruptNumber, - ACPI_OSD_HANDLER ServiceRoutine, - void *Context) + ACPI_OSD_HANDLER ServiceRoutine, + void *Context) { _NOTE(ARGUNUSED(InterruptNumber)) @@ -687,7 +687,7 @@ AcpiOsInstallInterruptHandler(UINT32 InterruptNumber, ACPI_STATUS AcpiOsRemoveInterruptHandler(UINT32 InterruptNumber, - ACPI_OSD_HANDLER ServiceRoutine) + ACPI_OSD_HANDLER ServiceRoutine) { _NOTE(ARGUNUSED(ServiceRoutine)) @@ -931,7 +931,7 @@ osl_rw_memory(ACPI_PHYSICAL_ADDRESS Address, UINT64 *Value, ACPI_STATUS AcpiOsReadMemory(ACPI_PHYSICAL_ADDRESS Address, - UINT64 *Value, UINT32 Width) + UINT64 *Value, UINT32 Width) { osl_rw_memory(Address, Value, Width, 0); return (AE_OK); @@ -939,7 +939,7 @@ AcpiOsReadMemory(ACPI_PHYSICAL_ADDRESS Address, 
ACPI_STATUS AcpiOsWriteMemory(ACPI_PHYSICAL_ADDRESS Address, - UINT64 Value, UINT32 Width) + UINT64 Value, UINT32 Width) { osl_rw_memory(Address, &Value, Width, 1); return (AE_OK); @@ -948,7 +948,7 @@ AcpiOsWriteMemory(ACPI_PHYSICAL_ADDRESS Address, ACPI_STATUS AcpiOsReadPciConfiguration(ACPI_PCI_ID *PciId, UINT32 Reg, - UINT64 *Value, UINT32 Width) + UINT64 *Value, UINT32 Width) { switch (Width) { @@ -980,7 +980,7 @@ int acpica_write_pci_config_ok = 1; ACPI_STATUS AcpiOsWritePciConfiguration(ACPI_PCI_ID *PciId, UINT32 Reg, - UINT64 Value, UINT32 Width) + UINT64 Value, UINT32 Width) { if (!acpica_write_pci_config_ok) { @@ -1034,7 +1034,7 @@ AcpiOsWritePciConfiguration(ACPI_PCI_ID *PciId, UINT32 Reg, */ void AcpiOsDerivePciId(ACPI_HANDLE rhandle, ACPI_HANDLE chandle, - ACPI_PCI_ID **PciId) + ACPI_PCI_ID **PciId) { ACPI_HANDLE handle; dev_info_t *dip; diff --git a/usr/src/uts/intel/sys/bootinfo.h b/usr/src/uts/intel/sys/bootinfo.h index 3adce64fc4..fa60e6ac41 100644 --- a/usr/src/uts/intel/sys/bootinfo.h +++ b/usr/src/uts/intel/sys/bootinfo.h @@ -61,7 +61,8 @@ typedef void *native_ptr_t; typedef enum boot_module_type { BMT_ROOTFS, BMT_FILE, - BMT_HASH + BMT_HASH, + BMT_ENV } boot_module_type_t; struct boot_memlist { @@ -107,7 +108,9 @@ struct xboot_info { native_ptr_t bi_xen_start_info; native_ptr_t bi_shared_info; /* VA for shared_info */ #else - native_ptr_t bi_mb_info; + native_ptr_t bi_mb_info; /* multiboot 1 or 2 info */ + int bi_mb_version; /* multiboot version */ + native_ptr_t bi_acpi_rsdp; #endif }; #pragma pack() |