summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJerry Jelinek <jerry.jelinek@joyent.com>2017-04-24 12:05:06 +0000
committerJerry Jelinek <jerry.jelinek@joyent.com>2017-04-24 12:05:06 +0000
commitfbe38b28735cd7e7715a61f2d1066dbbf1327b5c (patch)
tree82b5600384e40993ef24fee34a90e234f4520c7f
parent6f1d54d1657b9c469ae08d5ee733e114fe36c690 (diff)
parentff7af0d3beb1bddf8bb93afc2e9042dc3828be3d (diff)
downloadillumos-joyent-fbe38b28735cd7e7715a61f2d1066dbbf1327b5c.tar.gz
[illumos-gate merge]
commit ff7af0d3beb1bddf8bb93afc2e9042dc3828be3d 8007 want sys/stddef.h for offsetof and container_of macros commit 63982b82e639bf9f496423925738dd3f86bda7aa 7976 libstand/dosfs: cache FAT32 in 128 Kb blocks to save loader memory commit f289ce6eb03db0584699ec4fed88ef795a33dd79 7839 uts: implement boot environment support commit 76608ff7a54afda798e7fdc98681fb6d37322109 7838 loader: pass loader environment to kernel as module commit 14ee0d29c415966483c8c602b05bf27669c29497 7462 loader should support multiboot2 protocol commit 1738dd6ec94e36a9828d13a6e52ac7fb68cb52ed 7461 illumos should support multiboot2 protocol commit 660946868929e02041af7b5b1c3e14f547c53f11 8021 ARC buf data scatter-ization commit df950592be5771afa05177cbbef90ff275f2526f 8088 Add support for LSI Intruder and Cutlass cards. Conflicts: usr/src/uts/common/fs/zfs/arc.c
-rw-r--r--usr/src/boot/Makefile.version2
-rw-r--r--usr/src/boot/lib/libstand/dosfs.c210
-rw-r--r--usr/src/boot/lib/libstand/dosfs.h3
-rw-r--r--usr/src/boot/sys/boot/common/bootstrap.h12
-rw-r--r--usr/src/boot/sys/boot/common/load_elf.c14
-rw-r--r--usr/src/boot/sys/boot/common/module.c120
-rw-r--r--usr/src/boot/sys/boot/common/multiboot2.c894
-rw-r--r--usr/src/boot/sys/boot/efi/loader/bootinfo.c3
-rw-r--r--usr/src/boot/sys/boot/i386/libi386/biosacpi.c2
-rw-r--r--usr/src/boot/sys/boot/i386/libi386/multiboot.c319
-rw-r--r--usr/src/boot/sys/boot/i386/libi386/multiboot_tramp.S9
-rw-r--r--usr/src/boot/sys/boot/i386/loader/Makefile11
-rw-r--r--usr/src/boot/sys/boot/i386/loader/conf.c2
-rw-r--r--usr/src/boot/sys/boot/i386/loader/main.c15
-rw-r--r--usr/src/cmd/mdb/common/modules/zfs/zfs.c1
-rw-r--r--usr/src/cmd/zdb/zdb.c47
-rw-r--r--usr/src/cmd/zdb/zdb_il.c48
-rw-r--r--usr/src/cmd/ztest/ztest.c18
-rw-r--r--usr/src/common/zfs/zfs_fletcher.c106
-rw-r--r--usr/src/common/zfs/zfs_fletcher.h16
-rw-r--r--usr/src/head/iso/stddef_iso.h10
-rw-r--r--usr/src/head/stddef.h3
-rw-r--r--usr/src/lib/libzfs/common/libzfs_sendrecv.c15
-rw-r--r--usr/src/lib/libzpool/common/llib-lzpool2
-rw-r--r--usr/src/pkg/manifests/driver-storage-mr_sas.mf13
-rw-r--r--usr/src/pkg/manifests/system-header.mf1
-rw-r--r--usr/src/pkg/manifests/system-test-zfstest.mf9
-rw-r--r--usr/src/test/zfs-tests/cmd/memory_balloon/Makefile22
-rw-r--r--usr/src/test/zfs-tests/cmd/memory_balloon/memory_balloon.c103
-rw-r--r--usr/src/test/zfs-tests/runfiles/perf-regression.run8
-rw-r--r--usr/src/test/zfs-tests/tests/functional/mdb/mdb_001_pos.ksh57
-rw-r--r--usr/src/test/zfs-tests/tests/perf/perf.shlib12
-rw-r--r--usr/src/test/zfs-tests/tests/perf/regression/random_reads.ksh5
-rw-r--r--usr/src/test/zfs-tests/tests/perf/regression/random_readwrite.ksh5
-rw-r--r--usr/src/test/zfs-tests/tests/perf/regression/random_writes.ksh5
-rw-r--r--usr/src/test/zfs-tests/tests/perf/regression/sequential_reads.ksh7
-rw-r--r--usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_arc_cached.ksh (renamed from usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_cached.ksh)7
-rw-r--r--usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_arc_cached_clone.ksh (renamed from usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_cached_clone.ksh)7
-rw-r--r--usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_dbuf_cached.ksh82
-rw-r--r--usr/src/test/zfs-tests/tests/perf/regression/sequential_writes.ksh5
-rw-r--r--usr/src/test/zfs-tests/tests/perf/scripts/profile.d37
-rw-r--r--usr/src/tools/mbh_patch/Makefile1
-rw-r--r--usr/src/tools/mbh_patch/mbh_patch.c121
-rw-r--r--usr/src/uts/common/Makefile.files1
-rw-r--r--usr/src/uts/common/fs/zfs/abd.c940
-rw-r--r--usr/src/uts/common/fs/zfs/arc.c379
-rw-r--r--usr/src/uts/common/fs/zfs/blkptr.c2
-rw-r--r--usr/src/uts/common/fs/zfs/dbuf.c8
-rw-r--r--usr/src/uts/common/fs/zfs/ddt.c12
-rw-r--r--usr/src/uts/common/fs/zfs/dmu.c12
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_send.c14
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_scan.c12
-rw-r--r--usr/src/uts/common/fs/zfs/edonr_zfs.c24
-rw-r--r--usr/src/uts/common/fs/zfs/lz4.c3
-rw-r--r--usr/src/uts/common/fs/zfs/sha256.c26
-rw-r--r--usr/src/uts/common/fs/zfs/skein_zfs.c28
-rw-r--r--usr/src/uts/common/fs/zfs/spa.c8
-rw-r--r--usr/src/uts/common/fs/zfs/sys/abd.h150
-rw-r--r--usr/src/uts/common/fs/zfs/sys/ddt.h5
-rw-r--r--usr/src/uts/common/fs/zfs/sys/spa.h11
-rw-r--r--usr/src/uts/common/fs/zfs/sys/vdev_impl.h3
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zio.h29
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zio_checksum.h34
-rw-r--r--usr/src/uts/common/fs/zfs/sys/zio_compress.h25
-rw-r--r--usr/src/uts/common/fs/zfs/vdev.c11
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_cache.c38
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_disk.c17
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_file.c17
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_label.c79
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_mirror.c15
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_queue.c20
-rw-r--r--usr/src/uts/common/fs/zfs/vdev_raidz.c603
-rw-r--r--usr/src/uts/common/fs/zfs/zil.c6
-rw-r--r--usr/src/uts/common/fs/zfs/zio.c244
-rw-r--r--usr/src/uts/common/fs/zfs/zio_checksum.c108
-rw-r--r--usr/src/uts/common/fs/zfs/zio_compress.c80
-rw-r--r--usr/src/uts/common/io/mr_sas/ld_pd_map.c12
-rw-r--r--usr/src/uts/common/io/mr_sas/mr_sas.c18
-rw-r--r--usr/src/uts/common/io/mr_sas/mr_sas.h20
-rw-r--r--usr/src/uts/common/io/mr_sas/mr_sas_tbolt.c32
-rw-r--r--usr/src/uts/common/sys/Makefile1
-rw-r--r--usr/src/uts/common/sys/multiboot2.h418
-rw-r--r--usr/src/uts/common/sys/multiboot2_impl.h53
-rw-r--r--usr/src/uts/common/sys/stddef.h48
-rw-r--r--usr/src/uts/common/sys/sysmacros.h13
-rw-r--r--usr/src/uts/i86pc/Makefile.files1
-rw-r--r--usr/src/uts/i86pc/Makefile.rules7
-rw-r--r--usr/src/uts/i86pc/boot/boot_console.c130
-rw-r--r--usr/src/uts/i86pc/dboot/dboot_grub.s102
-rw-r--r--usr/src/uts/i86pc/dboot/dboot_multiboot2.c341
-rw-r--r--usr/src/uts/i86pc/dboot/dboot_startkern.c808
-rw-r--r--usr/src/uts/i86pc/os/ddi_impl.c3
-rw-r--r--usr/src/uts/i86pc/os/fakebop.c287
-rw-r--r--usr/src/uts/i86pc/sys/boot_console.h7
-rw-r--r--usr/src/uts/i86pc/sys/fastboot_msg.h3
-rw-r--r--usr/src/uts/i86xpv/Makefile.files1
-rw-r--r--usr/src/uts/intel/io/acpica/osl.c26
-rw-r--r--usr/src/uts/intel/sys/bootinfo.h7
98 files changed, 6173 insertions, 1518 deletions
diff --git a/usr/src/boot/Makefile.version b/usr/src/boot/Makefile.version
index 6e85cbfb08..018e7e7816 100644
--- a/usr/src/boot/Makefile.version
+++ b/usr/src/boot/Makefile.version
@@ -33,4 +33,4 @@ LOADER_VERSION = 1.1
# Use date like formatting here, YYYY.MM.DD.XX, without leading zeroes.
# The version is processed from left to right, the version number can only
# be increased.
-BOOT_VERSION = $(LOADER_VERSION)-2017.4.1.1
+BOOT_VERSION = $(LOADER_VERSION)-2017.4.22.1
diff --git a/usr/src/boot/lib/libstand/dosfs.c b/usr/src/boot/lib/libstand/dosfs.c
index 6cf50b8ba2..617041566e 100644
--- a/usr/src/boot/lib/libstand/dosfs.c
+++ b/usr/src/boot/lib/libstand/dosfs.c
@@ -26,7 +26,6 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
/*
* Readonly filesystem for Microsoft FAT12/FAT16/FAT32 filesystems,
@@ -65,6 +64,7 @@ struct fs_ops dosfs_fsops = {
#define DEPSEC 16 /* directory entries per sector */
#define DSHIFT 4 /* DEPSEC shift */
#define LOCLUS 2 /* lowest cluster number */
+#define FATBLKSZ 0x20000 /* size of block in the FAT cache buffer */
/* DOS "BIOS Parameter Block" */
typedef struct {
@@ -132,18 +132,6 @@ static DOS_DE dot[2] = {
((u_int)cv2((de)->dex.h_clus) << 16) | \
cv2((de)->clus))
-/*
- * fat cache metadata
- */
-struct fatcache {
- int unit; /* disk unit number */
- int size; /* buffer (and fat) size in sectors */
- u_char *buf;
-};
-
-static struct fatcache fat;
-
-static int dosunmount(DOS_FS *);
static int parsebs(DOS_FS *, DOS_BS *);
static int namede(DOS_FS *, const char *, DOS_DE **);
static int lookup(DOS_FS *, u_int, const char *, DOS_DE **);
@@ -153,36 +141,38 @@ static off_t fsize(DOS_FS *, DOS_DE *);
static int fatcnt(DOS_FS *, u_int);
static int fatget(DOS_FS *, u_int *);
static int fatend(u_int, u_int);
-static int ioread(DOS_FS *, u_int, void *, u_int);
-static int ioget(struct open_file *, daddr_t, void *, u_int);
+static int ioread(DOS_FS *, u_int, void *, size_t);
+static int ioget(struct open_file *, daddr_t, void *, size_t);
-static void
-dos_read_fat(DOS_FS *fs, struct open_file *fd)
+static int
+dos_read_fatblk(DOS_FS *fs, struct open_file *fd, u_int blknum)
{
- struct devdesc *dd = fd->f_devdata;
-
- if (fat.buf != NULL) { /* can we reuse old buffer? */
- if (fat.size != fs->spf) {
- free(fat.buf); /* no, free old buffer */
- fat.buf = NULL;
+ int err;
+ size_t io_size;
+ daddr_t offset_in_fat, max_offset_in_fat;
+
+ offset_in_fat = ((daddr_t)blknum) * FATBLKSZ;
+ max_offset_in_fat = secbyt(fs->spf);
+ io_size = FATBLKSZ;
+ if (offset_in_fat > max_offset_in_fat)
+ offset_in_fat = max_offset_in_fat;
+ if (offset_in_fat + io_size > max_offset_in_fat)
+ io_size = ((size_t)(max_offset_in_fat - offset_in_fat));
+
+ if (io_size != 0) {
+ err = ioget(fd, fs->lsnfat + bytsec(offset_in_fat),
+ fs->fatbuf, io_size);
+ if (err != 0) {
+ fs->fatbuf_blknum = ((u_int)(-1));
+ return (err);
}
}
- if (fat.buf == NULL)
- fat.buf = malloc(secbyt(fs->spf));
+ if (io_size < FATBLKSZ)
+ memset(fs->fatbuf + io_size, 0, FATBLKSZ - io_size);
- if (fat.buf != NULL) {
- if (ioget(fd, fs->lsnfat, fat.buf, secbyt(fs->spf)) == 0) {
- fat.size = fs->spf;
- fat.unit = dd->d_unit;
- return;
- }
- }
- if (fat.buf != NULL) /* got IO error */
- free(fat.buf);
- fat.buf = NULL;
- fat.unit = -1; /* impossible unit */
- fat.size = 0;
+ fs->fatbuf_blknum = blknum;
+ return (0);
}
/*
@@ -192,24 +182,27 @@ static int
dos_mount(DOS_FS *fs, struct open_file *fd)
{
int err;
- struct devdesc *dd = fd->f_devdata;
u_char *buf;
bzero(fs, sizeof(DOS_FS));
fs->fd = fd;
- if ((err = !(buf = malloc(secbyt(1))) ? errno : 0) ||
- (err = ioget(fs->fd, 0, buf, secbyt(1))) ||
+ if ((buf = malloc(secbyt(1))) == NULL)
+ return (errno);
+ if ((err = ioget(fs->fd, 0, buf, secbyt(1))) ||
(err = parsebs(fs, (DOS_BS *)buf))) {
- if (buf != NULL)
- free(buf);
- (void)dosunmount(fs);
+ free(buf);
return (err);
}
free(buf);
- if (fat.buf == NULL || fat.unit != dd->d_unit)
- dos_read_fat(fs, fd);
+ if ((fs->fatbuf = malloc(FATBLKSZ)) == NULL)
+ return (errno);
+ err = dos_read_fatblk(fs, fd, 0);
+ if (err != 0) {
+ free(fs->fatbuf);
+ return (err);
+ }
fs->root = dot[0];
fs->root.name[0] = ' ';
@@ -228,21 +221,9 @@ dos_mount(DOS_FS *fs, struct open_file *fd)
static int
dos_unmount(DOS_FS *fs)
{
- int err;
-
if (fs->links)
return (EBUSY);
- if ((err = dosunmount(fs)))
- return (err);
- return (0);
-}
-
-/*
- * Common code shared by dos_mount() and dos_unmount()
- */
-static int
-dosunmount(DOS_FS *fs)
-{
+ free(fs->fatbuf);
free(fs);
return (0);
}
@@ -257,16 +238,20 @@ dos_open(const char *path, struct open_file *fd)
DOS_FILE *f;
DOS_FS *fs;
u_int size, clus;
- int err = 0;
+ int err;
/* Allocate mount structure, associate with open */
- fs = malloc(sizeof(DOS_FS));
-
- if ((err = dos_mount(fs, fd)))
- goto out;
+ if ((fs = malloc(sizeof(DOS_FS))) == NULL)
+ return (errno);
+ if ((err = dos_mount(fs, fd))) {
+ free(fs);
+ return (err);
+ }
- if ((err = namede(fs, path, &de)))
- goto out;
+ if ((err = namede(fs, path, &de))) {
+ dos_unmount(fs);
+ return (err);
+ }
clus = stclus(fs->fatsz, de);
size = cv4(de->size);
@@ -274,18 +259,20 @@ dos_open(const char *path, struct open_file *fd)
if ((!(de->attr & FA_DIR) && (!clus != !size)) ||
((de->attr & FA_DIR) && size) ||
(clus && !okclus(fs, clus))) {
- err = EINVAL;
- goto out;
+ dos_unmount(fs);
+ return (EINVAL);
+ }
+ if ((f = malloc(sizeof(DOS_FILE))) == NULL) {
+ err = errno;
+ dos_unmount(fs);
+ return (err);
}
- f = malloc(sizeof(DOS_FILE));
bzero(f, sizeof(DOS_FILE));
f->fs = fs;
fs->links++;
f->de = *de;
fd->f_fsdata = (void *)f;
-
- out:
- return (err);
+ return (0);
}
/*
@@ -761,34 +748,57 @@ fatcnt(DOS_FS *fs, u_int c)
}
/*
- * Get next cluster in cluster chain. Use in core fat cache unless another
- * device replaced it.
+ * Get next cluster in cluster chain. Use the in-core FAT cache, reloading
+ * it when the entry lies in a different 128K FAT block than the cached one.
*/
static int
fatget(DOS_FS *fs, u_int *c)
{
- u_char buf[4];
- u_int x, offset, n, nbyte;
- struct devdesc *dd = fs->fd->f_devdata;
- int err = 0;
+ u_int val_in, val_out, offset, blknum, nbyte;
+ const u_char *p_entry;
+ int err;
- if (fat.unit != dd->d_unit) {
- /* fat cache was changed to another device, don't use it */
- err = ioread(fs, secbyt(fs->lsnfat) + fatoff(fs->fatsz, *c), buf,
- fs->fatsz != 32 ? 2 : 4);
- if (err)
- return (err);
- } else {
- offset = fatoff(fs->fatsz, *c);
- nbyte = fs->fatsz != 32 ? 2 : 4;
+ /* check input value to prevent overflow in fatoff() */
+ val_in = *c;
+ if (val_in & 0xf0000000)
+ return (EINVAL);
- if (offset + nbyte > secbyt(fat.size))
- return (EINVAL);
- memcpy(buf, fat.buf + offset, nbyte);
+ /* ensure that current 128K FAT block is cached */
+ offset = fatoff(fs->fatsz, val_in);
+ nbyte = fs->fatsz != 32 ? 2 : 4;
+ if (offset + nbyte > secbyt(fs->spf))
+ return (EINVAL);
+ blknum = offset / FATBLKSZ;
+ offset %= FATBLKSZ;
+ if (offset + nbyte > FATBLKSZ)
+ return (EINVAL);
+ if (blknum != fs->fatbuf_blknum) {
+ err = dos_read_fatblk(fs, fs->fd, blknum);
+ if (err != 0)
+ return (err);
}
-
- x = fs->fatsz != 32 ? cv2(buf) : cv4(buf);
- *c = fs->fatsz == 12 ? *c & 1 ? x >> 4 : x & 0xfff : x;
+ p_entry = fs->fatbuf + offset;
+
+ /* extract cluster number from FAT entry */
+ switch (fs->fatsz) {
+ case 32:
+ val_out = cv4(p_entry);
+ val_out &= 0x0fffffff;
+ break;
+ case 16:
+ val_out = cv2(p_entry);
+ break;
+ case 12:
+ val_out = cv2(p_entry);
+ if (val_in & 1)
+ val_out >>= 4;
+ else
+ val_out &= 0xfff;
+ break;
+ default:
+ return (EINVAL);
+ }
+ *c = val_out;
return (0);
}
@@ -805,7 +815,7 @@ fatend(u_int sz, u_int c)
* Offset-based I/O primitive
*/
static int
-ioread(DOS_FS *fs, u_int offset, void *buf, u_int nbyte)
+ioread(DOS_FS *fs, u_int offset, void *buf, size_t nbyte)
{
char *s;
u_int off, n;
@@ -843,8 +853,16 @@ ioread(DOS_FS *fs, u_int offset, void *buf, u_int nbyte)
* Sector-based I/O primitive
*/
static int
-ioget(struct open_file *fd, daddr_t lsec, void *buf, u_int size)
+ioget(struct open_file *fd, daddr_t lsec, void *buf, size_t size)
{
- return ((fd->f_dev->dv_strategy)(fd->f_devdata, F_READ, lsec,
- size, buf, NULL));
+ size_t rsize;
+ int rv;
+
+ /* Make sure we get full read or error. */
+ rsize = 0;
+ rv = (fd->f_dev->dv_strategy)(fd->f_devdata, F_READ, lsec,
+ size, buf, &rsize);
+ if ((rv == 0) && (size != rsize))
+ rv = EIO;
+ return (rv);
}
diff --git a/usr/src/boot/lib/libstand/dosfs.h b/usr/src/boot/lib/libstand/dosfs.h
index f2370ee502..0915c70930 100644
--- a/usr/src/boot/lib/libstand/dosfs.h
+++ b/usr/src/boot/lib/libstand/dosfs.h
@@ -24,7 +24,6 @@
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
- * $FreeBSD$
*/
#ifndef DOSIO_H
@@ -96,6 +95,8 @@ typedef union {
typedef struct {
struct open_file *fd; /* file descriptor */
+ u_char *fatbuf; /* FAT cache buffer */
+ u_int fatbuf_blknum; /* number of 128K block in FAT cache buffer */
u_int links; /* active links to structure */
u_int spc; /* sectors per cluster */
u_int bsize; /* cluster size in bytes */
diff --git a/usr/src/boot/sys/boot/common/bootstrap.h b/usr/src/boot/sys/boot/common/bootstrap.h
index d228875f7f..010dda130e 100644
--- a/usr/src/boot/sys/boot/common/bootstrap.h
+++ b/usr/src/boot/sys/boot/common/bootstrap.h
@@ -22,8 +22,6 @@
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
- *
- * $FreeBSD$
*/
#ifndef _BOOTSTRAP_H_
@@ -234,6 +232,8 @@ void file_discard(struct preloaded_file *fp);
void file_addmetadata(struct preloaded_file *fp, int type, size_t size, void *p);
int file_addmodule(struct preloaded_file *fp, char *modname, int version,
struct kernel_module **newmp);
+void build_environment_module(void);
+vm_offset_t bi_copyenv(vm_offset_t);
/* MI module loaders */
#ifdef __elfN
@@ -306,7 +306,13 @@ struct arch_switch
*/
uint64_t (*arch_loadaddr)(u_int type, void *data, uint64_t addr);
#define LOAD_ELF 1 /* data points to the ELF header. */
-#define LOAD_RAW 2 /* data points to the file name. */
+#define LOAD_RAW 2 /* data points to the module file name. */
+#define LOAD_KERN 3 /* data points to the kernel file name. */
+#define LOAD_MEM 4 /* data points to int for buffer size. */
+ /*
+ * Interface to release the load address.
+ */
+ void (*arch_free_loadaddr)(uint64_t addr, uint64_t pages);
/*
* Interface to inform MD code about a loaded (ELF) segment. This
diff --git a/usr/src/boot/sys/boot/common/load_elf.c b/usr/src/boot/sys/boot/common/load_elf.c
index 287bfac56a..b7fc4bea09 100644
--- a/usr/src/boot/sys/boot/common/load_elf.c
+++ b/usr/src/boot/sys/boot/common/load_elf.c
@@ -251,11 +251,15 @@ __elfN(loadfile_raw)(char *filename, u_int64_t dest,
if (ef.kernel == 1 && multiboot == 0)
setenv("kernelname", filename, 1);
fp->f_name = strdup(filename);
- if (multiboot == 0)
- fp->f_type = strdup(ef.kernel ?
- __elfN(kerneltype) : __elfN(moduletype));
- else
- fp->f_type = strdup("elf multiboot kernel");
+ if (multiboot == 0) {
+ fp->f_type = strdup(ef.kernel ?
+ __elfN(kerneltype) : __elfN(moduletype));
+ } else {
+ if (multiboot == 1)
+ fp->f_type = strdup("elf multiboot kernel");
+ else
+ fp->f_type = strdup("elf multiboot2 kernel");
+ }
#ifdef ELF_VERBOSE
if (ef.kernel)
diff --git a/usr/src/boot/sys/boot/common/module.c b/usr/src/boot/sys/boot/common/module.c
index b091cf23b6..50afdbef7f 100644
--- a/usr/src/boot/sys/boot/common/module.c
+++ b/usr/src/boot/sys/boot/common/module.c
@@ -25,7 +25,6 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
/*
* file/module function dispatcher, support, etc.
@@ -403,6 +402,88 @@ file_load_dependencies(struct preloaded_file *base_file)
}
/*
+ * Calculate the size of the environment module.
+ * The environment is a list of name=value C strings, ending with a '\0' byte.
+ */
+static size_t
+env_get_size(void)
+{
+ size_t size = 0;
+ struct env_var *ep;
+
+ /* Traverse the environment. */
+ for (ep = environ; ep != NULL; ep = ep->ev_next) {
+ size += strlen(ep->ev_name);
+ size++; /* "=" */
+ if (ep->ev_value != NULL)
+ size += strlen(ep->ev_value);
+ size++; /* nul byte */
+ }
+ size++; /* nul byte */
+ return (size);
+}
+
+/*
+ * Create virtual module for environment variables.
+ * This module should be created as late as possible before executing
+ * the OS kernel, or we may miss some environment variable updates.
+ */
+void
+build_environment_module(void)
+{
+ struct preloaded_file *fp;
+ size_t size;
+ char *name = "environment";
+ vm_offset_t laddr;
+
+ /* The kernel must be loaded before the environment module. */
+ if ((file_findfile(NULL, NULL)) == NULL) {
+ printf("Can not load environment module: %s\n",
+ "the kernel is not loaded");
+ return;
+ }
+
+ size = env_get_size();
+
+ fp = file_alloc();
+ if (fp != NULL) {
+ fp->f_name = strdup(name);
+ fp->f_type = strdup(name);
+ }
+
+ if (fp == NULL || fp->f_name == NULL || fp->f_type == NULL) {
+ printf("Can not load environment module: %s\n",
+ "out of memory");
+ if (fp != NULL)
+ file_discard(fp);
+ return;
+ }
+
+
+ if (archsw.arch_loadaddr != NULL)
+ loadaddr = archsw.arch_loadaddr(LOAD_MEM, &size, loadaddr);
+
+ if (loadaddr == 0) {
+ printf("Can not load environment module: %s\n",
+ "out of memory");
+ file_discard(fp);
+ return;
+ }
+
+ laddr = bi_copyenv(loadaddr);
+
+ /* Looks OK so far; populate control structure */
+ fp->f_loader = -1;
+ fp->f_addr = loadaddr;
+ fp->f_size = laddr - loadaddr;
+
+ /* recognise space consumption */
+ loadaddr = laddr;
+
+ file_insert_tail(fp);
+}
+
+/*
* We've been asked to load (fname) as (type), so just suck it in,
* no arguments or anything.
*/
@@ -413,6 +494,7 @@ file_loadraw(const char *fname, char *type, int argc, char **argv, int insert)
char *name;
int fd, got;
vm_offset_t laddr;
+ struct stat st;
/* We can't load first */
if ((file_findfile(NULL, NULL)) == NULL) {
@@ -434,12 +516,25 @@ file_loadraw(const char *fname, char *type, int argc, char **argv, int insert)
free(name);
return(NULL);
}
+ if (fstat(fd, &st) < 0) {
+ close(fd);
+ snprintf(command_errbuf, sizeof (command_errbuf),
+ "stat error '%s': %s", name, strerror(errno));
+ free(name);
+ return(NULL);
+ }
if (archsw.arch_loadaddr != NULL)
loadaddr = archsw.arch_loadaddr(LOAD_RAW, name, loadaddr);
+ if (loadaddr == 0) {
+ close(fd);
+ snprintf(command_errbuf, sizeof (command_errbuf),
+ "no memory to load %s", name);
+ free(name);
+ return(NULL);
+ }
- laddr = roundup(loadaddr, PAGE_SIZE);
- loadaddr = laddr;
+ laddr = loadaddr;
for (;;) {
/* read in 4k chunks; size is not really important */
got = archsw.arch_readin(fd, laddr, 4096);
@@ -450,6 +545,9 @@ file_loadraw(const char *fname, char *type, int argc, char **argv, int insert)
"error reading '%s': %s", name, strerror(errno));
free(name);
close(fd);
+ if (archsw.arch_free_loadaddr != NULL)
+ archsw.arch_free_loadaddr(loadaddr,
+ (uint64_t)(roundup2(st.st_size, PAGE_SIZE) >> 12));
return(NULL);
}
laddr += got;
@@ -893,6 +991,11 @@ file_discard(struct preloaded_file *fp)
struct kernel_module *mp, *mp1;
if (fp == NULL)
return;
+
+ if (archsw.arch_free_loadaddr != NULL && fp->f_addr)
+ archsw.arch_free_loadaddr(fp->f_addr,
+ (uint64_t)(roundup2(fp->f_size, PAGE_SIZE) >> 12));
+
md = fp->f_metadata;
while (md) {
md1 = md;
@@ -906,13 +1009,10 @@ file_discard(struct preloaded_file *fp)
mp1 = mp;
mp = mp->m_next;
free(mp1);
- }
- if (fp->f_name != NULL)
- free(fp->f_name);
- if (fp->f_type != NULL)
- free(fp->f_type);
- if (fp->f_args != NULL)
- free(fp->f_args);
+ }
+ free(fp->f_name);
+ free(fp->f_type);
+ free(fp->f_args);
free(fp);
}
diff --git a/usr/src/boot/sys/boot/common/multiboot2.c b/usr/src/boot/sys/boot/common/multiboot2.c
new file mode 100644
index 0000000000..adf1b7e2b4
--- /dev/null
+++ b/usr/src/boot/sys/boot/common/multiboot2.c
@@ -0,0 +1,894 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Toomas Soome <tsoome@me.com>
+ */
+
+/*
+ * This module adds support for loading and booting an illumos multiboot2
+ * kernel. This code is only built to support the illumos kernel; it does
+ * not support Xen.
+ */
+#include <sys/cdefs.h>
+
+#include <sys/param.h>
+#include <sys/exec.h>
+#include <sys/linker.h>
+#include <sys/module.h>
+#include <sys/stdint.h>
+#include <sys/multiboot2.h>
+#include <stand.h>
+#include <stdbool.h>
+#include "libzfs.h"
+
+#include "bootstrap.h"
+
+#include <machine/metadata.h>
+#include <machine/pc/bios.h>
+
+#include "../i386/libi386/libi386.h"
+#include "../i386/btx/lib/btxv86.h"
+#include "pxe.h"
+
+extern BOOTPLAYER bootplayer; /* dhcp info */
+extern void multiboot_tramp();
+
+#include "platform/acfreebsd.h"
+#include "acconfig.h"
+#define ACPI_SYSTEM_XFACE
+#include "actypes.h"
+#include "actbl.h"
+
+extern ACPI_TABLE_RSDP *rsdp;
+
+/* MB data heap pointer. */
+static vm_offset_t last_addr;
+extern char bootprog_info[];
+
+extern int elf32_loadfile_raw(char *filename, u_int64_t dest,
+ struct preloaded_file **result, int multiboot);
+static int multiboot2_loadfile(char *, u_int64_t, struct preloaded_file **);
+static int multiboot2_exec(struct preloaded_file *);
+
+struct file_format multiboot2 = { multiboot2_loadfile, multiboot2_exec };
+static bool keep_bs = false;
+static bool have_framebuffer = false;
+static vm_offset_t load_addr;
+static vm_offset_t entry_addr;
+
+/*
+ * Validate tags in info request. This function is provided just to
+ * recognize the current tag list and only serves as a limited
+ * safeguard against possibly corrupt information.
+ */
+static bool
+is_info_request_valid(multiboot_header_tag_information_request_t *rtag)
+{
+ int i;
+
+ /*
+ * If the tag is optional and we do not support it, we do not
+ * have to do anything special, so we skip optional tags.
+ */
+ if (rtag->mbh_flags & MULTIBOOT_HEADER_TAG_OPTIONAL)
+ return (true);
+
+ for (i = 0; i < (rtag->mbh_size - sizeof (*rtag)) /
+ sizeof (rtag->mbh_requests[0]); i++)
+ switch (rtag->mbh_requests[i]) {
+ case MULTIBOOT_TAG_TYPE_END:
+ case MULTIBOOT_TAG_TYPE_CMDLINE:
+ case MULTIBOOT_TAG_TYPE_BOOT_LOADER_NAME:
+ case MULTIBOOT_TAG_TYPE_MODULE:
+ case MULTIBOOT_TAG_TYPE_BASIC_MEMINFO:
+ case MULTIBOOT_TAG_TYPE_BOOTDEV:
+ case MULTIBOOT_TAG_TYPE_MMAP:
+ case MULTIBOOT_TAG_TYPE_FRAMEBUFFER:
+ case MULTIBOOT_TAG_TYPE_VBE:
+ case MULTIBOOT_TAG_TYPE_ELF_SECTIONS:
+ case MULTIBOOT_TAG_TYPE_APM:
+ case MULTIBOOT_TAG_TYPE_EFI32:
+ case MULTIBOOT_TAG_TYPE_EFI64:
+ case MULTIBOOT_TAG_TYPE_ACPI_OLD:
+ case MULTIBOOT_TAG_TYPE_ACPI_NEW:
+ case MULTIBOOT_TAG_TYPE_NETWORK:
+ case MULTIBOOT_TAG_TYPE_EFI_MMAP:
+ case MULTIBOOT_TAG_TYPE_EFI_BS:
+ case MULTIBOOT_TAG_TYPE_EFI32_IH:
+ case MULTIBOOT_TAG_TYPE_EFI64_IH:
+ case MULTIBOOT_TAG_TYPE_LOAD_BASE_ADDR:
+ break;
+ default:
+ printf("unsupported information tag: 0x%x\n",
+ rtag->mbh_requests[i]);
+ return (false);
+ }
+ return (true);
+}
+
+static int
+multiboot2_loadfile(char *filename, u_int64_t dest,
+ struct preloaded_file **result)
+{
+ int fd, error;
+ uint32_t i;
+ struct stat st;
+ caddr_t header_search;
+ multiboot2_header_t *header;
+ multiboot_header_tag_t *tag;
+ multiboot_header_tag_address_t *addr_tag = NULL;
+ multiboot_header_tag_entry_address_t *entry_tag = NULL;
+ struct preloaded_file *fp;
+
+ /* EFTYPE lets the other formats in the file_formats array be tried. */
+ error = EFTYPE;
+ if (filename == NULL)
+ return (error);
+
+ /* is kernel already loaded? */
+ fp = file_findfile(NULL, NULL);
+ if (fp != NULL)
+ return (error);
+
+ if ((fd = open(filename, O_RDONLY)) == -1)
+ return (errno);
+
+ /*
+ * Read MULTIBOOT_SEARCH size in order to search for the
+ * multiboot magic header.
+ */
+ header_search = malloc(MULTIBOOT_SEARCH);
+ if (header_search == NULL) {
+ close(fd);
+ return (ENOMEM);
+ }
+
+ if (read(fd, header_search, MULTIBOOT_SEARCH) != MULTIBOOT_SEARCH)
+ goto out;
+
+ header = NULL;
+ for (i = 0; i <= (MULTIBOOT_SEARCH - sizeof (multiboot2_header_t));
+ i += MULTIBOOT_HEADER_ALIGN) {
+ header = (multiboot2_header_t *)(header_search + i);
+
+ /* Do we have match on magic? */
+ if (header->mb2_magic != MULTIBOOT2_HEADER_MAGIC) {
+ header = NULL;
+ continue;
+ }
+ /*
+ * Validate checksum, the sum of magic + architecture +
+ * header_length + checksum must equal 0.
+ */
+ if (header->mb2_magic + header->mb2_architecture +
+ header->mb2_header_length + header->mb2_checksum != 0) {
+ header = NULL;
+ continue;
+ }
+ /*
+ * Finally, the entire header must fit within MULTIBOOT_SEARCH.
+ */
+ if (i + header->mb2_header_length > MULTIBOOT_SEARCH) {
+ header = NULL;
+ continue;
+ }
+ break;
+ }
+
+ if (header == NULL)
+ goto out;
+
+ for (tag = header->mb2_tags; tag->mbh_type != MULTIBOOT_TAG_TYPE_END;
+ tag = (multiboot_header_tag_t *)((uintptr_t)tag +
+ roundup2(tag->mbh_size, MULTIBOOT_TAG_ALIGN))) {
+ switch (tag->mbh_type) {
+ case MULTIBOOT_HEADER_TAG_INFORMATION_REQUEST:
+ if (is_info_request_valid((void*)tag) == false)
+ goto out;
+ break;
+ case MULTIBOOT_HEADER_TAG_ADDRESS:
+ addr_tag = (multiboot_header_tag_address_t *)tag;
+ break;
+ case MULTIBOOT_HEADER_TAG_ENTRY_ADDRESS:
+ entry_tag =
+ (multiboot_header_tag_entry_address_t *)tag;
+ break;
+ case MULTIBOOT_HEADER_TAG_CONSOLE_FLAGS:
+ break;
+ case MULTIBOOT_HEADER_TAG_FRAMEBUFFER:
+ have_framebuffer = true;
+ break;
+ case MULTIBOOT_HEADER_TAG_MODULE_ALIGN:
+ /* we always align modules */
+ break;
+ case MULTIBOOT_HEADER_TAG_EFI_BS:
+ keep_bs = true;
+ break;
+ default:
+ if (!(tag->mbh_flags & MULTIBOOT_HEADER_TAG_OPTIONAL)) {
+ printf("unsupported tag: 0x%x\n",
+ tag->mbh_type);
+ goto out;
+ }
+ }
+ }
+
+ /*
+ * We must have addr_tag and entry_tag to load a 64-bit kernel.
+ * If these tags are missing, we either have a 32-bit kernel, or
+ * this is not our kernel at all.
+ */
+ if (addr_tag != NULL && entry_tag != NULL) {
+ fp = file_alloc();
+ if (fp == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ if (lseek(fd, 0, SEEK_SET) == -1) {
+ printf("lseek failed\n");
+ error = EIO;
+ file_discard(fp);
+ goto out;
+ }
+ if (fstat(fd, &st) < 0) {
+ printf("fstat failed\n");
+ error = EIO;
+ file_discard(fp);
+ goto out;
+ }
+
+ load_addr = addr_tag->mbh_load_addr;
+ entry_addr = entry_tag->mbh_entry_addr;
+ fp->f_addr = archsw.arch_loadaddr(LOAD_KERN, filename,
+ addr_tag->mbh_load_addr);
+ if (fp->f_addr == 0) {
+ error = ENOMEM;
+ file_discard(fp);
+ goto out;
+ }
+ fp->f_size = archsw.arch_readin(fd, fp->f_addr, st.st_size);
+
+ if (fp->f_size != st.st_size) {
+ printf("error reading: %s", strerror(errno));
+ file_discard(fp);
+ error = EIO;
+ goto out;
+ }
+
+ fp->f_name = strdup(filename);
+ fp->f_type = strdup("aout multiboot2 kernel");
+ if (fp->f_name == NULL || fp->f_type == NULL) {
+ error = ENOMEM;
+ file_discard(fp);
+ goto out;
+ }
+
+ fp->f_metadata = NULL;
+ error = 0;
+ } else {
+ /* elf32_loadfile_raw will fill the attributes in fp. */
+ error = elf32_loadfile_raw(filename, dest, &fp, 2);
+ if (error != 0) {
+ printf("elf32_loadfile_raw failed: %d unable to "
+ "load multiboot2 kernel\n", error);
+ goto out;
+ }
+ entry_addr = fp->f_addr;
+ /*
+ * We want load_addr to have some legal value,
+ * so we set it to the same value as entry_addr.
+ * The distinction is important with UEFI, but not
+ * with the BIOS version, because the BIOS version does
+ * not use a staging area.
+ */
+ load_addr = fp->f_addr;
+ }
+
+ setenv("kernelname", fp->f_name, 1);
+ bios_addsmapdata(fp);
+ *result = fp;
+out:
+ free(header_search);
+ close(fd);
+ return (error);
+}
+
+/*
+ * Since for now we have no way to pass the environment to the kernel other than
+ * through arguments, we need to take care of console setup.
+ *
+ * If the console is in mirror mode, set the kernel console from $os_console.
+ * If it's unset, use first item from $console.
+ * If the console is "ttyX", also pass $ttyX-mode when it is set, since it may
+ * have been set by the user.
+ *
+ * Three cases are handled:
+ * - cl has no "-B", or has "-B" but no "console=" property: append
+ *   " -B console=NAME[,NAME-mode=\"MODE\"]".
+ * - cl has "-B" and "console=tty...": append ",ttyX-mode=\"MODE\"" to the
+ *   end of cl. NOTE(review): this presumes the console property is the last
+ *   item on the command line.
+ * - cl has "-B" and a non-tty console: return cl unchanged.
+ *
+ * If neither $os_console nor $console is set, if the tty mode needed for an
+ * already present console property is not set, or in case of memory
+ * allocation errors, just return the original command line so we have a
+ * chance of booting.
+ *
+ * On success, cl will be freed and a new, allocated command line string is
+ * returned.
+ */
+static char *
+update_cmdline(char *cl)
+{
+	const char *env;
+	char *os_console;
+	char *ttymode = NULL;
+	char mode[10];
+	char *tmp;
+	size_t len;
+
+	env = getenv("os_console");
+	if (env != NULL) {
+		os_console = strdup(env);
+	} else {
+		/* getenv() may return NULL; never hand that to strdup(). */
+		env = getenv("console");
+		if (env == NULL)
+			return (cl);
+		tmp = strdup(env);
+		/* strsep() returns NULL when strdup() has failed. */
+		os_console = strsep(&tmp, ", ");
+	}
+
+	if (os_console == NULL)
+		return (cl);
+
+	if (strncmp(os_console, "tty", 3) == 0) {
+		snprintf(mode, sizeof (mode), "%s-mode", os_console);
+		ttymode = getenv(mode);	/* NULL if the mode was never set. */
+	}
+
+	/*
+	 * Look for an existing console property. We search for "console="
+	 * so that skipping the 8 characters below can not step over the
+	 * end of the string.
+	 */
+	tmp = NULL;
+	if (strstr(cl, "-B") != NULL)
+		tmp = strstr(cl, "console=");
+
+	if (tmp == NULL) {
+		/*
+		 * No console property, so we need to add
+		 * " -B console=%s[,%s-mode=\"%s\"]"
+		 */
+		len = strlen(cl) + 1;
+		len += 12;			/* " -B console=" */
+		len += strlen(os_console);
+		if (ttymode != NULL) {
+			/* "," + name + "-mode=" + two quotes */
+			len += 9 + strlen(os_console);
+			len += strlen(ttymode);
+		}
+		tmp = malloc(len);
+		if (tmp == NULL) {
+			free(os_console);
+			return (cl);
+		}
+		if (ttymode != NULL) {
+			snprintf(tmp, len,
+			    "%s -B console=%s,%s-mode=\"%s\"",
+			    cl, os_console, os_console, ttymode);
+		} else {
+			snprintf(tmp, len, "%s -B console=%s",
+			    cl, os_console);
+		}
+	} else {
+		/* console is set, do we need tty mode? */
+		tmp += 8;			/* Skip "console=" */
+		if (strncmp(tmp, "tty", 3) != 0) {	/* nope */
+			free(os_console);
+			return (cl);
+		}
+		strncpy(mode, tmp, 4);
+		mode[4] = '\0';
+		strncat(mode, "-mode", 5);
+		ttymode = getenv(mode);
+		if (ttymode == NULL) {
+			/* No mode to pass on, keep the command line. */
+			free(os_console);
+			return (cl);
+		}
+		len = strlen(cl) + 1;
+		len += 4 + strlen(mode);	/* ",", "=" and two quotes */
+		len += strlen(ttymode);
+		tmp = malloc(len);
+		if (tmp == NULL) {
+			free(os_console);
+			return (cl);
+		}
+		snprintf(tmp, len, "%s,%s=\"%s\"", cl, mode, ttymode);
+	}
+	free(os_console);
+	free(cl);
+	return (tmp);
+}
+
+/*
+ * Search the command line for named property.
+ *
+ * Only "-B name=value[,name=value...]" groups are examined. A value may be
+ * enclosed in single or double quotes, in which case it may contain commas
+ * and white space; the quotes are returned as part of the value. Properties
+ * with an empty value are ignored.
+ *
+ * Arguments:
+ *	cmd	Command line to search, may be NULL.
+ *	name	Property name; the whole name has to match.
+ *	value	Set to point at the value inside cmd (not NUL terminated),
+ *		or to NULL if the name is not found.
+ *	len	Set to the length of the value, or to 0.
+ *
+ * Return codes:
+ *	0	The name is found, we return the data in value and len.
+ *	ENOENT	The name is not found.
+ *	EINVAL	The provided command line is badly formed.
+ */
+static int
+find_property_value(const char *cmd, const char *name, const char **value,
+    size_t *len)
+{
+	const char *namep, *valuep;
+	size_t name_len, value_len, want_len;
+	int quoted;
+
+	*value = NULL;
+	*len = 0;
+
+	if (cmd == NULL)
+		return (ENOENT);
+
+	want_len = strlen(name);
+
+	while (*cmd != '\0') {
+		if (cmd[0] != '-' || cmd[1] != 'B') {
+			cmd++;
+			continue;
+		}
+		cmd += 2;	/* Skip -B */
+		while (cmd[0] == ' ' || cmd[0] == '\t')
+			cmd++;	/* Skip whitespaces. */
+		while (*cmd != '\0' && cmd[0] != ' ' && cmd[0] != '\t') {
+			/*
+			 * The name ends at '='. Do not look for '=' past
+			 * the current word, or we would glue unrelated
+			 * arguments into the name.
+			 */
+			namep = cmd;
+			name_len = 0;
+			while (namep[name_len] != '\0' &&
+			    namep[name_len] != '=' &&
+			    namep[name_len] != ',' &&
+			    namep[name_len] != ' ' &&
+			    namep[name_len] != '\t')
+				name_len++;
+			if (namep[name_len] != '=') {
+				/* Not a name=value pair, skip it. */
+				cmd = namep + name_len;
+				while (*cmd == ',')
+					cmd++;
+				continue;
+			}
+			valuep = namep + name_len + 1;
+
+			/* Is this value quoted? */
+			value_len = 0;
+			quoted = 0;
+			if (valuep[0] == '\'' || valuep[0] == '"') {
+				quoted = valuep[0];
+				value_len = 1;
+			}
+
+			/*
+			 * Every character is tested against the terminating
+			 * NUL before we step over it, so an unterminated
+			 * quote can not take us past the end of the string.
+			 */
+			while (valuep[value_len] != '\0') {
+				/*
+				 * In the quote accept any character,
+				 * but look for ending quote.
+				 */
+				if (quoted != 0) {
+					if (valuep[value_len] == quoted)
+						quoted = 0;
+					value_len++;
+					continue;
+				}
+
+				/* A comma or white space ends the value. */
+				if (valuep[value_len] == ',' ||
+				    valuep[value_len] == ' ' ||
+				    valuep[value_len] == '\t')
+					break;
+				value_len++;
+			}
+			if (quoted != 0) {
+				printf("Missing closing '%c' in \"%s\"\n",
+				    quoted, valuep);
+				return (EINVAL);
+			}
+			/* Require the complete name to match, not a prefix. */
+			if (value_len != 0 && name_len == want_len &&
+			    strncmp(namep, name, name_len) == 0) {
+				*value = valuep;
+				*len = value_len;
+				return (0);
+			}
+			cmd = valuep + value_len;
+			while (*cmd == ',')
+				cmd++;
+		}
+	}
+	return (ENOENT);
+}
+
+/*
+ * Build the kernel command line. Shared function between MB1 and MB2.
+ *
+ * The command line is "f_name[ f_args][ -B bootfs]", passed through
+ * update_cmdline() to add the console setup. The zfs_bootfs() string is only
+ * added when the root device is of type DEVT_ZFS and neither $fstype nor an
+ * "fstype" property in f_args names a file system other than zfs.
+ *
+ * Return codes:
+ *	0	Success, *line is set to an allocated string which the
+ *		caller has to free.
+ *	ENOMEM	Memory allocation failed, *line is not set.
+ *	other	Error from find_property_value(), *line is not set.
+ */
+int
+mb_kernel_cmdline(struct preloaded_file *fp, struct devdesc *rootdev,
+    char **line)
+{
+	const char *fs = getenv("fstype");
+	const char *bootfs = NULL;
+	char *cmdline = NULL;
+	size_t len;
+	bool zfs_root = false;
+	int rv = 0;
+
+	if (rootdev->d_type == DEVT_ZFS)
+		zfs_root = true;
+
+	/* If we have fstype set in env, reset zfs_root if needed. */
+	if (fs != NULL && strcmp(fs, "zfs") != 0)
+		zfs_root = false;
+
+	/*
+	 * If we have fstype set on the command line,
+	 * reset zfs_root if needed.
+	 */
+	rv = find_property_value(fp->f_args, "fstype", &fs, &len);
+	if (rv != 0 && rv != ENOENT)
+		return (rv);	/* invalid command line */
+
+	if (fs != NULL) {
+		/* The value is returned with its quotes, drop them. */
+		if (len >= 2 && (fs[0] == '"' || fs[0] == '\'') &&
+		    fs[len - 1] == fs[0]) {
+			fs++;
+			len -= 2;
+		}
+		/*
+		 * The value is not NUL terminated; compare the length as
+		 * well, so that a prefix such as "z" is not taken for zfs.
+		 */
+		if (len != 3 || strncmp(fs, "zfs", 3) != 0)
+			zfs_root = false;
+	}
+
+	len = strlen(fp->f_name) + 1;
+
+	if (fp->f_args != NULL)
+		len += strlen(fp->f_args) + 1;
+
+	if (zfs_root == true) {
+		/* Ask once, so sizing and formatting use the same string. */
+		bootfs = zfs_bootfs(rootdev);
+		len += 3 + strlen(bootfs) + 1;	/* 3 is for "-B " */
+	}
+
+	cmdline = malloc(len);
+	if (cmdline == NULL)
+		return (ENOMEM);
+
+	if (zfs_root == true) {
+		if (fp->f_args != NULL) {
+			snprintf(cmdline, len, "%s %s -B %s", fp->f_name,
+			    fp->f_args, bootfs);
+		} else {
+			snprintf(cmdline, len, "%s -B %s", fp->f_name,
+			    bootfs);
+		}
+	} else if (fp->f_args != NULL) {
+		snprintf(cmdline, len, "%s %s", fp->f_name, fp->f_args);
+	} else {
+		snprintf(cmdline, len, "%s", fp->f_name);
+	}
+
+	/* update_cmdline() either returns cmdline or frees and replaces it. */
+	*line = update_cmdline(cmdline);
+	return (0);
+}
+
+/*
+ * Hand out n bytes from the MB info area: the current value of last_addr is
+ * returned and last_addr is moved past the allocation, rounded up to
+ * MULTIBOOT_TAG_ALIGN.
+ */
+static vm_offset_t
+mb_malloc(size_t n)
+{
+	vm_offset_t start;
+
+	start = last_addr;
+	last_addr = roundup(start + n, MULTIBOOT_TAG_ALIGN);
+	return (start);
+}
+
+/*
+ * Sum up the space needed for the module tags of all files following fp.
+ * Every tag carries the string "name type=TYPE[ args]" and is rounded up
+ * to MULTIBOOT_TAG_ALIGN.
+ */
+static size_t
+module_size(struct preloaded_file *fp)
+{
+	struct preloaded_file *mod;
+	size_t total = 0;
+
+	mod = fp->f_next;
+	while (mod != NULL) {
+		/* Name and its separator, then "type=" (5), type and NUL. */
+		size_t cmdlen = strlen(mod->f_name) + 1;
+
+		cmdlen += strlen(mod->f_type) + 5 + 1;
+		if (mod->f_args != NULL)
+			cmdlen += strlen(mod->f_args) + 1;
+
+		total += sizeof (multiboot_tag_module_t) + cmdlen;
+		total = roundup(total, MULTIBOOT_TAG_ALIGN);
+		mod = mod->f_next;
+	}
+	return (total);
+}
+
+/*
+ * Space needed for the memory map tag: the tag header plus one
+ * multiboot_mmap_entry_t for every struct bios_smap found in the
+ * MODINFOMD_SMAP metadata of fp. Without that metadata the size is 0.
+ */
+static size_t
+biossmap_size(struct preloaded_file *fp)
+{
+	struct file_metadata *smap_md;
+	int entries;
+
+	smap_md = file_findmetadata(fp, MODINFOMD_SMAP);
+	if (smap_md == NULL)
+		return (0);
+
+	/* number of entries */
+	entries = smap_md->md_size / sizeof (struct bios_smap);
+
+	return (sizeof (multiboot_tag_mmap_t) +
+	    entries * sizeof (multiboot_mmap_entry_t));
+}
+
+/*
+ * Calculate the size of the multiboot2 information block for the kernel fp
+ * and its command line. The tags are accounted in this order: command line,
+ * boot loader name, basic memory info, modules, memory map, network (only
+ * when $loaddev contains "pxe"), ACPI RSDP (only when rsdp is set) and the
+ * end tag. Each tag is rounded up to MULTIBOOT_TAG_ALIGN.
+ */
+static size_t
+mbi_size(struct preloaded_file *fp, char *cmdline)
+{
+	const char *loaddev;
+	size_t size;
+
+	size = sizeof (uint32_t) * 2; /* first 2 fields from MBI header */
+	size += sizeof (multiboot_tag_string_t) + strlen(cmdline) + 1;
+	size = roundup2(size, MULTIBOOT_TAG_ALIGN);
+	size += sizeof (multiboot_tag_string_t) + strlen(bootprog_info) + 1;
+	size = roundup2(size, MULTIBOOT_TAG_ALIGN);
+	size += sizeof (multiboot_tag_basic_meminfo_t);
+	size = roundup2(size, MULTIBOOT_TAG_ALIGN);
+	size += module_size(fp);
+	size = roundup2(size, MULTIBOOT_TAG_ALIGN);
+	size += biossmap_size(fp);
+	size = roundup2(size, MULTIBOOT_TAG_ALIGN);
+
+	/* getenv() returns NULL for unset variable, do not search in it. */
+	loaddev = getenv("loaddev");
+	if (loaddev != NULL && strstr(loaddev, "pxe") != NULL) {
+		size += sizeof (multiboot_tag_network_t) + sizeof (BOOTPLAYER);
+		size = roundup2(size, MULTIBOOT_TAG_ALIGN);
+	}
+
+	if (rsdp != NULL) {
+		if (rsdp->Revision == 0) {
+			size += sizeof (multiboot_tag_old_acpi_t) +
+			    sizeof (ACPI_RSDP_COMMON);
+		} else {
+			size += sizeof (multiboot_tag_new_acpi_t) +
+			    rsdp->Length;
+		}
+		size = roundup2(size, MULTIBOOT_TAG_ALIGN);
+	}
+	size += sizeof (multiboot_tag_t);	/* end tag */
+
+	return (size);
+}
+
+/*
+ * Build the multiboot2 information block and start the kernel fp.
+ *
+ * The block is placed on the first MULTIBOOT_MOD_ALIGN boundary after the
+ * last loaded file and is filled with the command line, boot loader name,
+ * basic memory info, one module tag per file following fp, the memory map
+ * from the MODINFOMD_SMAP metadata, the network tag when $loaddev contains
+ * "pxe", the ACPI RSDP tag when rsdp is set, and the end tag. Control is
+ * then passed to the kernel entry point via multiboot_tramp.
+ *
+ * Does not return on success. On failure an errno value is returned:
+ * EINVAL when the root device, the rootfs module or the memory map is
+ * missing, ENOMEM when memory is short, or the error from
+ * mb_kernel_cmdline().
+ */
+static int
+multiboot2_exec(struct preloaded_file *fp)
+{
+	struct preloaded_file *mfp;
+	multiboot2_info_header_t *mbi;
+	const char *bootargs, *loaddev;
+	char *cmdline = NULL;
+	struct devdesc *rootdev = NULL;
+	struct file_metadata *md;
+	int i, error, num;
+	int rootfs = 0;
+	size_t size;
+	struct bios_smap *smap;
+	vm_offset_t tmp;
+
+	i386_getdev((void **)(&rootdev), NULL, NULL);
+
+	error = EINVAL;
+	if (rootdev == NULL) {
+		printf("can't determine root device\n");
+		goto error;
+	}
+
+	/*
+	 * Set the image command line.
+	 * The string returned by getenv() belongs to the environment, so it
+	 * is kept out of cmdline, which is freed on the error path.
+	 */
+	if (fp->f_args == NULL) {
+		bootargs = getenv("boot-args");
+		if (bootargs != NULL) {
+			fp->f_args = strdup(bootargs);
+			if (fp->f_args == NULL) {
+				error = ENOMEM;
+				goto error;
+			}
+		}
+	}
+
+	error = mb_kernel_cmdline(fp, rootdev, &cmdline);
+	if (error != 0)
+		goto error;
+
+	/* mb_kernel_cmdline() updates the environment. */
+	build_environment_module();
+
+	size = mbi_size(fp, cmdline);	/* Get the size for MBI. */
+
+	/* Set up the base for mb_malloc. */
+	for (mfp = fp; mfp->f_next != NULL; mfp = mfp->f_next)
+		;
+
+	/* Start info block from the new page. */
+	last_addr = roundup(mfp->f_addr + mfp->f_size, MULTIBOOT_MOD_ALIGN);
+
+	/* Do we have space for multiboot info? */
+	if (last_addr + size >= memtop_copyin) {
+		error = ENOMEM;
+		goto error;
+	}
+
+	mbi = (multiboot2_info_header_t *)PTOV(last_addr);
+	last_addr = (vm_offset_t)mbi->mbi_tags;
+
+	{
+		multiboot_tag_string_t *tag;
+		i = sizeof (multiboot_tag_string_t) + strlen(cmdline) + 1;
+		tag = (multiboot_tag_string_t *) mb_malloc(i);
+
+		tag->mb_type = MULTIBOOT_TAG_TYPE_CMDLINE;
+		tag->mb_size = i;
+		memcpy(tag->mb_string, cmdline, strlen(cmdline) + 1);
+		free(cmdline);
+		cmdline = NULL;
+	}
+
+	{
+		multiboot_tag_string_t *tag;
+		i = sizeof (multiboot_tag_string_t) + strlen(bootprog_info) + 1;
+		tag = (multiboot_tag_string_t *) mb_malloc(i);
+
+		tag->mb_type = MULTIBOOT_TAG_TYPE_BOOT_LOADER_NAME;
+		tag->mb_size = i;
+		memcpy(tag->mb_string, bootprog_info,
+		    strlen(bootprog_info) + 1);
+	}
+
+	{
+		multiboot_tag_basic_meminfo_t *tag;
+		tag = (multiboot_tag_basic_meminfo_t *)
+		    mb_malloc(sizeof (*tag));
+
+		tag->mb_type = MULTIBOOT_TAG_TYPE_BASIC_MEMINFO;
+		tag->mb_size = sizeof (*tag);
+		tag->mb_mem_lower = bios_basemem / 1024;
+		tag->mb_mem_upper = bios_extmem / 1024;
+	}
+
+	num = 0;
+	for (mfp = fp->f_next; mfp != NULL; mfp = mfp->f_next) {
+		num++;
+		if (mfp->f_type != NULL && strcmp(mfp->f_type, "rootfs") == 0)
+			rootfs++;
+	}
+
+	if (num == 0 || rootfs == 0) {
+		/* We need at least one module - rootfs. */
+		printf("No rootfs module provided, aborting\n");
+		error = EINVAL;
+		goto error;
+	}
+
+	/*
+	 * Set the stage for physical memory layout:
+	 * - We have kernel at load_addr.
+	 * - Modules are aligned to page boundary.
+	 * - MBI is aligned to page boundary.
+	 * - Set the tmp to point to physical address of the first module.
+	 */
+	tmp = roundup2(load_addr + fp->f_size, MULTIBOOT_MOD_ALIGN);
+
+	for (mfp = fp->f_next; mfp != NULL; mfp = mfp->f_next) {
+		multiboot_tag_module_t *tag;
+
+		/* The same string size is accounted in module_size(). */
+		num = strlen(mfp->f_name) + 1;
+		num += strlen(mfp->f_type) + 5 + 1;
+		if (mfp->f_args != NULL) {
+			num += strlen(mfp->f_args) + 1;
+		}
+		cmdline = malloc(num);
+		if (cmdline == NULL) {
+			error = ENOMEM;
+			goto error;
+		}
+
+		if (mfp->f_args != NULL)
+			snprintf(cmdline, num, "%s type=%s %s",
+			    mfp->f_name, mfp->f_type, mfp->f_args);
+		else
+			snprintf(cmdline, num, "%s type=%s",
+			    mfp->f_name, mfp->f_type);
+
+		tag = (multiboot_tag_module_t *)mb_malloc(sizeof (*tag) + num);
+
+		tag->mb_type = MULTIBOOT_TAG_TYPE_MODULE;
+		tag->mb_size = sizeof (*tag) + num;
+		tag->mb_mod_start = tmp;
+		tag->mb_mod_end = tmp + mfp->f_size;
+		tmp = roundup2(tag->mb_mod_end, MULTIBOOT_MOD_ALIGN);
+		memcpy(tag->mb_cmdline, cmdline, num);
+		free(cmdline);
+		cmdline = NULL;
+	}
+
+	md = file_findmetadata(fp, MODINFOMD_SMAP);
+	if (md == NULL) {
+		printf("no memory smap\n");
+		error = EINVAL;
+		goto error;
+	}
+
+	smap = (struct bios_smap *)md->md_data;
+	num = md->md_size / sizeof (struct bios_smap); /* number of entries */
+
+	{
+		multiboot_tag_mmap_t *tag;
+		multiboot_mmap_entry_t *mmap_entry;
+
+		tag = (multiboot_tag_mmap_t *)
+		    mb_malloc(sizeof (*tag) +
+		    num * sizeof (multiboot_mmap_entry_t));
+
+		tag->mb_type = MULTIBOOT_TAG_TYPE_MMAP;
+		tag->mb_size = sizeof (*tag) +
+		    num * sizeof (multiboot_mmap_entry_t);
+		tag->mb_entry_size = sizeof (multiboot_mmap_entry_t);
+		tag->mb_entry_version = 0;
+		mmap_entry = (multiboot_mmap_entry_t *)tag->mb_entries;
+
+		for (i = 0; i < num; i++) {
+			mmap_entry[i].mmap_addr = smap[i].base;
+			mmap_entry[i].mmap_len = smap[i].length;
+			mmap_entry[i].mmap_type = smap[i].type;
+			mmap_entry[i].mmap_reserved = 0;
+		}
+	}
+
+	/* getenv() returns NULL for unset variable, do not search in it. */
+	loaddev = getenv("loaddev");
+	if (loaddev != NULL && strstr(loaddev, "pxe") != NULL) {
+		multiboot_tag_network_t *tag;
+		tag = (multiboot_tag_network_t *)
+		    mb_malloc(sizeof (*tag) + sizeof (BOOTPLAYER));
+
+		tag->mb_type = MULTIBOOT_TAG_TYPE_NETWORK;
+		tag->mb_size = sizeof (*tag) + sizeof (BOOTPLAYER);
+		memcpy(tag->mb_dhcpack, &bootplayer, sizeof (BOOTPLAYER));
+	}
+
+	if (rsdp != NULL) {
+		multiboot_tag_new_acpi_t *ntag;
+		multiboot_tag_old_acpi_t *otag;
+		size_t tsize;
+
+		if (rsdp->Revision == 0) {
+			/*
+			 * Only the common part is copied for revision 0,
+			 * so the tag size is built from that part too; it
+			 * is also what mbi_size() has reserved.
+			 */
+			tsize = sizeof (*otag) + sizeof (ACPI_RSDP_COMMON);
+			otag = (multiboot_tag_old_acpi_t *)mb_malloc(tsize);
+			otag->mb_type = MULTIBOOT_TAG_TYPE_ACPI_OLD;
+			otag->mb_size = tsize;
+			memcpy(otag->mb_rsdp, rsdp, sizeof (ACPI_RSDP_COMMON));
+		} else {
+			tsize = sizeof (*ntag) + rsdp->Length;
+			ntag = (multiboot_tag_new_acpi_t *)mb_malloc(tsize);
+			ntag->mb_type = MULTIBOOT_TAG_TYPE_ACPI_NEW;
+			ntag->mb_size = tsize;
+			memcpy(ntag->mb_rsdp, rsdp, rsdp->Length);
+		}
+	}
+
+	/*
+	 * MB tag list end marker.
+	 */
+	{
+		multiboot_tag_t *tag = (multiboot_tag_t *)
+		    mb_malloc(sizeof (*tag));
+		tag->mb_type = MULTIBOOT_TAG_TYPE_END;
+		tag->mb_size = sizeof (*tag);
+	}
+
+	mbi->mbi_total_size = last_addr - (vm_offset_t)mbi;
+	mbi->mbi_reserved = 0;
+
+	dev_cleanup();
+	__exec((void *)VTOP(multiboot_tramp), MULTIBOOT2_BOOTLOADER_MAGIC,
+	    (void *)entry_addr, (void *)VTOP(mbi));
+	panic("exec returned");
+
+error:
+	/* cmdline is either NULL or memory allocated by us. */
+	if (cmdline != NULL)
+		free(cmdline);
+	return (error);
+}
diff --git a/usr/src/boot/sys/boot/efi/loader/bootinfo.c b/usr/src/boot/sys/boot/efi/loader/bootinfo.c
index 1f45ea3493..6c90871c06 100644
--- a/usr/src/boot/sys/boot/efi/loader/bootinfo.c
+++ b/usr/src/boot/sys/boot/efi/loader/bootinfo.c
@@ -27,7 +27,6 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
#include <stand.h>
#include <string.h>
@@ -112,7 +111,7 @@ bi_getboothowto(char *kargs)
* Each variable is formatted as <name>=<value>, with a single nul
* separating each variable, and a double nul terminating the environment.
*/
-static vm_offset_t
+vm_offset_t
bi_copyenv(vm_offset_t start)
{
struct env_var *ep;
diff --git a/usr/src/boot/sys/boot/i386/libi386/biosacpi.c b/usr/src/boot/sys/boot/i386/libi386/biosacpi.c
index 18f8050783..a82862dd3f 100644
--- a/usr/src/boot/sys/boot/i386/libi386/biosacpi.c
+++ b/usr/src/boot/sys/boot/i386/libi386/biosacpi.c
@@ -43,6 +43,7 @@
* environment.
*/
+ACPI_TABLE_RSDP *rsdp;
static ACPI_TABLE_RSDP *biosacpi_find_rsdp(void);
static ACPI_TABLE_RSDP *biosacpi_search_rsdp(char *base, int length);
@@ -51,7 +52,6 @@ static ACPI_TABLE_RSDP *biosacpi_search_rsdp(char *base, int length);
void
biosacpi_detect(void)
{
- ACPI_TABLE_RSDP *rsdp;
char buf[24];
int revision;
diff --git a/usr/src/boot/sys/boot/i386/libi386/multiboot.c b/usr/src/boot/sys/boot/i386/libi386/multiboot.c
index 899f75bca6..32e4fe3b07 100644
--- a/usr/src/boot/sys/boot/i386/libi386/multiboot.c
+++ b/usr/src/boot/sys/boot/i386/libi386/multiboot.c
@@ -41,7 +41,6 @@
#include <sys/linker.h>
#include <sys/module.h>
#include <sys/stdint.h>
-#include <stdbool.h>
#define _MACHINE_ELF_WANT_32BIT
#include <machine/elf.h>
#include <machine/metadata.h>
@@ -58,7 +57,6 @@
#define MULTIBOOT_SUPPORTED_FLAGS \
(MULTIBOOT_AOUT_KLUDGE|MULTIBOOT_PAGE_ALIGN|MULTIBOOT_MEMORY_INFO)
-#define NUM_MODULES 2
#define METADATA_FIXED_SIZE (PAGE_SIZE*4)
#define METADATA_MODULE_SIZE PAGE_SIZE
@@ -67,12 +65,17 @@
/* MB data heap pointer */
static vm_offset_t last_addr;
+extern char bootprog_info[];
extern int elf32_loadfile_raw(char *filename, u_int64_t dest,
struct preloaded_file **result, int multiboot);
extern int elf64_load_modmetadata(struct preloaded_file *fp, u_int64_t dest);
extern int elf64_obj_loadfile(char *filename, u_int64_t dest,
struct preloaded_file **result);
+extern int mb_kernel_cmdline(struct preloaded_file *, struct devdesc *,
+ char **);
+
+extern void multiboot_tramp();
static int multiboot_loadfile(char *, u_int64_t, struct preloaded_file **);
static int multiboot_exec(struct preloaded_file *);
@@ -84,10 +87,6 @@ struct file_format multiboot = { multiboot_loadfile, multiboot_exec };
struct file_format multiboot_obj =
{ multiboot_obj_loadfile, multiboot_obj_exec };
-extern void multiboot_tramp();
-
-static const char mbl_name[] = "illumos Loader";
-
static int
num_modules(struct preloaded_file *kfp)
{
@@ -124,7 +123,6 @@ multiboot_loadfile(char *filename, u_int64_t dest,
ssize_t search_size;
int fd;
struct multiboot_header *header;
- char *cmdline;
struct preloaded_file *fp;
if (filename == NULL)
@@ -219,12 +217,9 @@ multiboot_loadfile(char *filename, u_int64_t dest,
goto out;
}
fp->f_metadata = NULL;
-
- *result = fp;
error = 0;
} else {
-
- error = elf32_loadfile_raw(filename, dest, result, 1);
+ error = elf32_loadfile_raw(filename, dest, &fp, 1);
if (error != 0) {
printf("elf32_loadfile_raw failed: %d unable to "
"load multiboot kernel\n", error);
@@ -232,8 +227,9 @@ multiboot_loadfile(char *filename, u_int64_t dest,
}
}
- setenv("kernelname", (*result)->f_name, 1);
- bios_addsmapdata(*result);
+ setenv("kernelname", fp->f_name, 1);
+ bios_addsmapdata(fp);
+ *result = fp;
out:
free(header_search);
close(fd);
@@ -253,267 +249,6 @@ mb_malloc(size_t n)
return (ptr);
}
-/*
- * Since for now we have no way to pass the environment to the kernel other than
- * through arguments, we need to take care of console setup.
- *
- * If the console is in mirror mode, set the kernel console from $os_console.
- * If it's unset, use first item from $console.
- * If $console is "ttyX", also pass $ttyX-mode, since it may have been set by
- * the user.
- *
- * In case of memory allocation errors, just return original command line,
- * so we have chance of booting.
- *
- * On success, cl will be freed and a new, allocated command line string is
- * returned.
- */
-static char *
-update_cmdline(char *cl)
-{
- char *os_console = getenv("os_console");
- char *ttymode = NULL;
- char mode[10];
- char *tmp;
- int len;
-
- if (os_console == NULL) {
- tmp = strdup(getenv("console"));
- os_console = strsep(&tmp, ", ");
- } else
- os_console = strdup(os_console);
-
- if (os_console == NULL)
- return (cl);
-
- if (strncmp(os_console, "tty", 3) == 0) {
- snprintf(mode, sizeof (mode), "%s-mode", os_console);
- ttymode = getenv(mode); /* never NULL */
- }
-
- if (strstr(cl, "-B") != NULL) {
- len = strlen(cl) + 1;
- /*
- * if console is not present, add it
- * if console is ttyX, add ttymode
- */
- tmp = strstr(cl, "console");
- if (tmp == NULL) {
- len += 12; /* " -B console=" */
- len += strlen(os_console);
- if (ttymode != NULL) {
- len += 13; /* ",ttyX-mode=\"\"" */
- len += strlen(ttymode);
- }
- tmp = malloc(len);
- if (tmp == NULL) {
- free(os_console);
- return (cl);
- }
- if (ttymode != NULL)
- sprintf(tmp,
- "%s -B console=%s,%s-mode=\"%s\"",
- cl, os_console, os_console, ttymode);
- else
- sprintf(tmp, "%s -B console=%s",
- cl, os_console);
- } else {
- /* console is set, do we need tty mode? */
- tmp += 8;
- if (strstr(tmp, "tty") == tmp) {
- strncpy(mode, tmp, 4);
- mode[4] = '\0';
- strcat(mode, "-mode");
- ttymode = getenv(mode); /* never NULL */
- } else { /* nope */
- free(os_console);
- return (cl);
- }
- len = strlen(cl) + 1;
- len += 13; /* ",ttyX-mode=\"\"" */
- len += strlen(ttymode);
- tmp = malloc(len);
- if (tmp == NULL) {
- free(os_console);
- return (cl);
- }
- sprintf(tmp, "%s,%s=\"%s\"", cl, mode, ttymode);
- }
- } else {
- /*
- * no -B, so we need to add " -B console=%s[,ttyX-mode=\"%s\"]"
- */
- len = strlen(cl) + 1;
- len += 12; /* " -B console=" */
- len += strlen(os_console);
- if (ttymode != NULL) {
- len += 13; /* ",ttyX-mode=\"\"" */
- len += strlen(ttymode);
- }
- tmp = malloc(len);
- if (tmp == NULL) {
- free(os_console);
- return (cl);
- }
- if (ttymode != NULL)
- sprintf(tmp, "%s -B console=%s,%s-mode=\"%s\"", cl,
- os_console, os_console, ttymode);
- else
- sprintf(tmp, "%s -B console=%s", cl, os_console);
- }
- free(os_console);
- free(cl);
- return (tmp);
-}
-
-/*
- * Search the command line for named property.
- *
- * Return codes:
- * 0 The name is found, we return the data in value and len.
- * ENOENT The name is not found.
- * EINVAL The provided command line is badly formed.
- */
-static int
-find_property_value(const char *cmd, const char *name, const char **value,
- size_t *len)
-{
- const char *namep, *valuep;
- size_t name_len, value_len;
- int quoted;
-
- *value = NULL;
- *len = 0;
-
- if (cmd == NULL)
- return (ENOENT);
-
- while (*cmd != '\0') {
- if (cmd[0] != '-' || cmd[1] != 'B') {
- cmd++;
- continue;
- }
- cmd += 2; /* Skip -B */
- while (cmd[0] == ' ' || cmd[0] == '\t')
- cmd++; /* Skip whitespaces. */
- while (*cmd != '\0' && cmd[0] != ' ' && cmd[0] != '\t') {
- namep = cmd;
- valuep = strchr(cmd, '=');
- if (valuep == NULL)
- break;
- name_len = valuep - namep;
- valuep++;
- value_len = 0;
- quoted = 0;
- for (; ; ++value_len) {
- if (valuep[value_len] == '\0')
- break;
-
- /* Is this value quoted? */
- if (value_len == 0 &&
- (valuep[0] == '\'' || valuep[0] == '"')) {
- quoted = valuep[0];
- ++value_len;
- }
-
- /*
- * In the quote accept any character,
- * but look for ending quote.
- */
- if (quoted != 0) {
- if (valuep[value_len] == quoted)
- quoted = 0;
- continue;
- }
-
- /* A comma or white space ends the value. */
- if (valuep[value_len] == ',' ||
- valuep[value_len] == ' ' ||
- valuep[value_len] == '\t')
- break;
- }
- if (quoted != 0) {
- printf("Missing closing '%c' in \"%s\"\n",
- quoted, valuep);
- return (EINVAL);
- }
-
- if (value_len != 0) {
- if (strncmp(namep, name, name_len) == 0) {
- *value = valuep;
- *len = value_len;
- return (0);
- }
- }
- cmd = valuep + value_len;
- while (*cmd == ',')
- cmd++;
- }
- }
- return (ENOENT);
-}
-
-static int
-kernel_cmdline(struct preloaded_file *fp, struct i386_devdesc *rootdev,
- char **line)
-{
- const char *fs = getenv("fstype");
- char *cmdline = NULL;
- size_t len;
- bool zfs_root = false;
- int rv = 0;
-
- if (rootdev->d_type == DEVT_ZFS)
- zfs_root = true;
-
- /* If we have fstype set in env, reset zfs_root if needed. */
- if (fs != NULL && strcmp(fs, "zfs") != 0)
- zfs_root = false;
-
- /*
- * If we have fstype set on the command line,
- * reset zfs_root if needed.
- */
- rv = find_property_value(fp->f_args, "fstype", &fs, &len);
- switch (rv) {
- case EINVAL: /* invalid command line */
- return (rv);
- case ENOENT: /* fall through */
- case 0:
- break;
- }
-
- if (fs != NULL && strncmp(fs, "zfs", len) != 0)
- zfs_root = false;
-
- len = strlen(fp->f_name) + 1;
-
- if (fp->f_args != NULL)
- len += strlen(fp->f_args) + 1;
-
- if (zfs_root == true)
- len += 3 + strlen(zfs_bootfs(rootdev)) + 1;
-
- cmdline = malloc(len);
- if (cmdline == NULL)
- return (ENOMEM);
-
- if (zfs_root == true) {
- if (fp->f_args != NULL)
- snprintf(cmdline, len, "%s %s -B %s", fp->f_name,
- fp->f_args, zfs_bootfs(rootdev));
- else
- snprintf(cmdline, len, "%s -B %s", fp->f_name,
- zfs_bootfs(rootdev));
- } else if (fp->f_args != NULL)
- snprintf(cmdline, len, "%s %s", fp->f_name, fp->f_args);
- else
- snprintf(cmdline, len, "%s", fp->f_name);
-
- *line = update_cmdline(cmdline);
- return (0);
-}
-
static int
multiboot_exec(struct preloaded_file *fp)
{
@@ -521,13 +256,12 @@ multiboot_exec(struct preloaded_file *fp)
vm_offset_t module_start, metadata_size;
vm_offset_t modulep, kernend, entry;
struct file_metadata *md;
- Elf_Ehdr *ehdr;
struct multiboot_info *mb_info = NULL;
struct multiboot_mod_list *mb_mod = NULL;
multiboot_memory_map_t *mmap;
struct bios_smap *smap;
- struct i386_devdesc *rootdev;
- extern BOOTPLAYER bootplayer; /* dhcp info */
+ struct devdesc *rootdev;
+ extern BOOTPLAYER bootplayer; /* dhcp info */
char *cmdline = NULL;
size_t len;
int error, num, i;
@@ -535,10 +269,10 @@ multiboot_exec(struct preloaded_file *fp)
int xen = 0; /* flag for xen */
int kernel = 0; /* flag for kernel */
- /* set up base for mb_malloc */
+ /* Set up base for mb_malloc. */
for (mfp = fp; mfp->f_next != NULL; mfp = mfp->f_next);
- /* start info block from new page */
+ /* Start info block from new page. */
last_addr = roundup(mfp->f_addr + mfp->f_size, MULTIBOOT_MOD_ALIGN);
/* Allocate the multiboot struct and fill the basic details. */
@@ -548,9 +282,10 @@ multiboot_exec(struct preloaded_file *fp)
mb_info->flags = MULTIBOOT_INFO_MEMORY|MULTIBOOT_INFO_BOOT_LOADER_NAME;
mb_info->mem_lower = bios_basemem / 1024;
mb_info->mem_upper = bios_extmem / 1024;
- mb_info->boot_loader_name = mb_malloc(strlen(mbl_name) + 1);
+ mb_info->boot_loader_name = mb_malloc(strlen(bootprog_info) + 1);
- i386_copyin(mbl_name, mb_info->boot_loader_name, strlen(mbl_name)+1);
+ i386_copyin(bootprog_info, mb_info->boot_loader_name,
+ strlen(bootprog_info) + 1);
i386_getdev((void **)(&rootdev), NULL, NULL);
if (rootdev == NULL) {
@@ -560,12 +295,12 @@ multiboot_exec(struct preloaded_file *fp)
}
/*
- * boot image command line. if args were not provided, we need to set
+ * Boot image command line. If args were not provided, we need to set
* args here, and that depends on image type...
- * fortunately we only have following options:
- * 64 or 32 bit unix or xen. so we just check if f_name has unix.
+ * Fortunately we only have following options:
+ * 64 or 32 bit unix or xen. So we just check if f_name has unix.
*/
- /* do we boot xen? */
+ /* Do we boot xen? */
if (strstr(fp->f_name, "unix") == NULL)
xen = 1;
@@ -581,7 +316,7 @@ multiboot_exec(struct preloaded_file *fp)
}
if (num == 0 || rootfs == 0) {
- /* need at least one module - rootfs */
+ /* We need at least one module - rootfs. */
printf("No rootfs module provided, aborting\n");
error = EINVAL;
goto error;
@@ -603,7 +338,7 @@ multiboot_exec(struct preloaded_file *fp)
if (strcmp(mfp->f_type, "kernel") == 0) {
cmdline = NULL;
- error = kernel_cmdline(mfp, rootdev, &cmdline);
+ error = mb_kernel_cmdline(mfp, rootdev, &cmdline);
if (error != 0)
goto error;
} else {
@@ -667,7 +402,7 @@ multiboot_exec(struct preloaded_file *fp)
}
/*
* Set the image command line. Need to do this as last thing,
- * as Illumos kernel dboot_startkern will check cmdline
+ * as illumos kernel dboot_startkern will check cmdline
* address as last check to find first free address.
*/
if (fp->f_args == NULL) {
@@ -685,7 +420,7 @@ multiboot_exec(struct preloaded_file *fp)
}
/*
- * if image is xen, we just use f_name + f_args for commandline
+ * If the image is xen, we just use f_name + f_args for commandline
* for unix, we need to add zfs-bootfs.
*/
if (xen) {
@@ -708,7 +443,7 @@ multiboot_exec(struct preloaded_file *fp)
}
} else {
cmdline = NULL;
- if ((error = kernel_cmdline(fp, rootdev, &cmdline)) != 0)
+ if ((error = mb_kernel_cmdline(fp, rootdev, &cmdline)) != 0)
goto error;
}
@@ -719,8 +454,8 @@ multiboot_exec(struct preloaded_file *fp)
cmdline = NULL;
dev_cleanup();
- __exec((void *)VTOP(multiboot_tramp), (void *)entry,
- (void *)VTOP(mb_info));
+ __exec((void *)VTOP(multiboot_tramp), MULTIBOOT_BOOTLOADER_MAGIC,
+ (void *)entry, (void *)VTOP(mb_info));
panic("exec returned");
diff --git a/usr/src/boot/sys/boot/i386/libi386/multiboot_tramp.S b/usr/src/boot/sys/boot/i386/libi386/multiboot_tramp.S
index 0bd604365f..452a86bbb8 100644
--- a/usr/src/boot/sys/boot/i386/libi386/multiboot_tramp.S
+++ b/usr/src/boot/sys/boot/i386/libi386/multiboot_tramp.S
@@ -26,9 +26,6 @@
* $FreeBSD$
*/
-#define ASM_FILE
-#include "multiboot.h"
-
/*
* The multiboot specification requires the executable to be launched
* with %cs set to a flat read/execute segment with offset 0 and limit
@@ -43,9 +40,9 @@ multiboot_tramp:
/* Be sure that interrupts are disabled. */
cli
- movl $MULTIBOOT_BOOTLOADER_MAGIC, %eax
+ movl 4(%esp), %eax /* bootloader magic */
/* Get the entry point and address of the multiboot_info parameter. */
- movl 8(%esp), %ebx
- movl 4(%esp), %ecx
+ movl 12(%esp), %ebx /* multiboot_info */
+ movl 8(%esp), %ecx /* entry */
call *%ecx
diff --git a/usr/src/boot/sys/boot/i386/loader/Makefile b/usr/src/boot/sys/boot/i386/loader/Makefile
index f2dac3bfc9..1275085c18 100644
--- a/usr/src/boot/sys/boot/i386/loader/Makefile
+++ b/usr/src/boot/sys/boot/i386/loader/Makefile
@@ -17,8 +17,9 @@
include $(SRC)/Makefile.master
include $(SRC)/boot/Makefile.version
-CFLAGS= -O2
-CPPFLAGS= -DSTAND -nostdinc -I../../../../include -I../../..
+CFLAGS= -O2
+CPPFLAGS= -DSTAND -nostdinc -I../../../../include -I../../..
+CPPFLAGS += -I$(SRC)/uts/intel/sys/acpi
LOADER= zfsloader
NEWVERSWHAT= "ZFS enabled bootstrap loader" x86
MAN=
@@ -63,7 +64,7 @@ LIBFICL= ../../ficl/i386/libficl.a
# Always add MI sources
SRCS += boot.c commands.c console.c devopen.c interp.c
SRCS += interp_backslash.c interp_parse.c ls.c misc.c
-SRCS += module.c panic.c linenoise.c
+SRCS += module.c panic.c linenoise.c multiboot2.c
SRCS += load_elf32.c load_elf32_obj.c reloc_elf32.c
SRCS += load_elf64.c load_elf64_obj.c reloc_elf64.c
@@ -107,6 +108,9 @@ CPPFLAGS += -I../btx/lib
include ../Makefile.inc
+# For multiboot2.h, must be last, to avoid conflicts
+CPPFLAGS += -I$(SRC)/uts/common
+
vers.c: ../../common/newvers.sh $(SRC)/boot/Makefile.version
$(SH) ../../common/newvers.sh ${LOADER_VERSION} ${NEWVERSWHAT}
@@ -135,7 +139,6 @@ DPADD= ${LIBFICL} ${LIBZFSBOOT} ${LIBI386} ${LIBSTAND}
LDADD= ${LIBFICL} ${LIBZFSBOOT} ${LIBI386} ${LIBSTAND}
CLEANFILES += machine x86
-CFLAGS += -DLOADER_PREFER_AMD64
machine:
$(RM) machine
diff --git a/usr/src/boot/sys/boot/i386/loader/conf.c b/usr/src/boot/sys/boot/i386/loader/conf.c
index b47c9219e8..d99c3a4b49 100644
--- a/usr/src/boot/sys/boot/i386/loader/conf.c
+++ b/usr/src/boot/sys/boot/i386/loader/conf.c
@@ -101,10 +101,12 @@ extern struct file_format amd64_elf;
extern struct file_format amd64_elf_obj;
extern struct file_format multiboot;
extern struct file_format multiboot_obj;
+extern struct file_format multiboot2;
extern struct file_format linux;
extern struct file_format linux_initrd;
struct file_format *file_formats[] = {
+ &multiboot2,
&multiboot,
&multiboot_obj,
&amd64_elf,
diff --git a/usr/src/boot/sys/boot/i386/loader/main.c b/usr/src/boot/sys/boot/i386/loader/main.c
index be092c552f..9f9d69f0c2 100644
--- a/usr/src/boot/sys/boot/i386/loader/main.c
+++ b/usr/src/boot/sys/boot/i386/loader/main.c
@@ -38,7 +38,9 @@
#include <machine/cpufunc.h>
#include <machine/psl.h>
#include <sys/disk.h>
+#include <sys/param.h>
#include <sys/reboot.h>
+#include <sys/multiboot2.h>
#include "bootstrap.h"
#include "common/bootargs.h"
@@ -81,6 +83,18 @@ extern char end[];
static void *heap_top;
static void *heap_bottom;
+static uint64_t
+i386_loadaddr(u_int type, void *data, uint64_t addr)
+{
+	/*
+	 * Only LOAD_RAW and LOAD_MEM addresses are aligned for modules.
+	 */
+	if (type != LOAD_RAW && type != LOAD_MEM)
+		return (addr);
+
+	return (roundup2(addr, MULTIBOOT_MOD_ALIGN));
+}
+
int
main(void)
{
@@ -162,6 +176,7 @@ main(void)
archsw.arch_readin = i386_readin;
archsw.arch_isainb = isa_inb;
archsw.arch_isaoutb = isa_outb;
+ archsw.arch_loadaddr = i386_loadaddr;
#ifdef LOADER_ZFS_SUPPORT
archsw.arch_zfs_probe = i386_zfs_probe;
#endif
diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
index 0658d7c639..10a2f5a4f7 100644
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c
@@ -3553,7 +3553,6 @@ typedef struct mdb_arc_buf_hdr_t {
struct {
uint32_t b_bufcnt;
uintptr_t b_state;
- uintptr_t b_pdata;
} b_l1hdr;
} mdb_arc_buf_hdr_t;
diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c
index 9ddf5e1021..0137e6f448 100644
--- a/usr/src/cmd/zdb/zdb.c
+++ b/usr/src/cmd/zdb/zdb.c
@@ -60,6 +60,7 @@
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/zfeature.h>
+#include <sys/abd.h>
#include <zfs_comutil.h>
#undef verify
#include <libzfs.h>
@@ -2537,7 +2538,7 @@ zdb_blkptr_done(zio_t *zio)
zdb_cb_t *zcb = zio->io_private;
zbookmark_phys_t *zb = &zio->io_bookmark;
- zio_data_buf_free(zio->io_data, zio->io_size);
+ abd_free(zio->io_abd);
mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_inflight--;
@@ -2603,7 +2604,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
if (!BP_IS_EMBEDDED(bp) &&
(dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
size_t size = BP_GET_PSIZE(bp);
- void *data = zio_data_buf_alloc(size);
+ abd_t *abd = abd_alloc(size, B_FALSE);
int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
/* If it's an intent log block, failure is expected. */
@@ -2616,7 +2617,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
spa->spa_scrub_inflight++;
mutex_exit(&spa->spa_scrub_lock);
- zio_nowait(zio_read(NULL, spa, bp, data, size,
+ zio_nowait(zio_read(NULL, spa, bp, abd, size,
zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
}
@@ -3397,6 +3398,13 @@ name:
return (NULL);
}
+/*
+ * Adapter that gives random_get_pseudo_bytes() the three-argument callback
+ * signature passed to abd_iterate_func() in zdb_read_block().  The third
+ * argument is ignored and the wrapped call's result is returned unchanged.
+ */
+/* ARGSUSED */
+static int
+random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused)
+{
+ return (random_get_pseudo_bytes(buf, len));
+}
+
/*
* Read a block from a pool and print it out. The syntax of the
* block descriptor is:
@@ -3428,7 +3436,8 @@ zdb_read_block(char *thing, spa_t *spa)
uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
zio_t *zio;
vdev_t *vd;
- void *pbuf, *lbuf, *buf;
+ abd_t *pabd;
+ void *lbuf, *buf;
char *s, *p, *dup, *vdev, *flagstr;
int i, error;
@@ -3499,7 +3508,7 @@ zdb_read_block(char *thing, spa_t *spa)
psize = size;
lsize = size;
- pbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+ pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE);
lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
BP_ZERO(bp);
@@ -3527,15 +3536,15 @@ zdb_read_block(char *thing, spa_t *spa)
/*
* Treat this as a normal block read.
*/
- zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL,
+ zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
} else {
/*
* Treat this as a vdev child I/O.
*/
- zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize,
- ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
+ zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
+ psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL));
@@ -3558,21 +3567,21 @@ zdb_read_block(char *thing, spa_t *spa)
void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
- bcopy(pbuf, pbuf2, psize);
+ abd_copy_to_buf(pbuf2, pabd, psize);
- VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize,
- SPA_MAXBLOCKSIZE - psize) == 0);
+ VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize,
+ random_get_pseudo_bytes_cb, NULL));
- VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
- SPA_MAXBLOCKSIZE - psize) == 0);
+ VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
+ SPA_MAXBLOCKSIZE - psize));
for (lsize = SPA_MAXBLOCKSIZE; lsize > psize;
lsize -= SPA_MINBLOCKSIZE) {
for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
- if (zio_decompress_data(c, pbuf, lbuf,
- psize, lsize) == 0 &&
- zio_decompress_data(c, pbuf2, lbuf2,
- psize, lsize) == 0 &&
+ if (zio_decompress_data(c, pabd,
+ lbuf, psize, lsize) == 0 &&
+ zio_decompress_data_buf(c, pbuf2,
+ lbuf2, psize, lsize) == 0 &&
bcmp(lbuf, lbuf2, lsize) == 0)
break;
}
@@ -3591,7 +3600,7 @@ zdb_read_block(char *thing, spa_t *spa)
buf = lbuf;
size = lsize;
} else {
- buf = pbuf;
+ buf = abd_to_buf(pabd);
size = psize;
}
@@ -3609,7 +3618,7 @@ zdb_read_block(char *thing, spa_t *spa)
zdb_dump_block(thing, buf, size, flags);
out:
- umem_free(pbuf, SPA_MAXBLOCKSIZE);
+ abd_free(pabd);
umem_free(lbuf, SPA_MAXBLOCKSIZE);
free(dup);
}
diff --git a/usr/src/cmd/zdb/zdb_il.c b/usr/src/cmd/zdb/zdb_il.c
index 583e422286..bc02b1b670 100644
--- a/usr/src/cmd/zdb/zdb_il.c
+++ b/usr/src/cmd/zdb/zdb_il.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
*/
/*
@@ -41,6 +41,7 @@
#include <sys/resource.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
+#include <sys/abd.h>
extern uint8_t dump_opt[256];
@@ -117,13 +118,27 @@ zil_prt_rec_rename(zilog_t *zilog, int txtype, lr_rename_t *lr)
}
/* ARGSUSED */
+/*
+ * Callback handed to abd_iterate_func() by zil_prt_rec_write(): print every
+ * byte of the chunk, as "%c " when isprint() accepts it and as "%2X"
+ * otherwise.  Bytes are read as unsigned char so that values >= 0x80 are
+ * valid arguments to isprint() and are not sign-extended when formatted as
+ * hex.  The third argument is unused.  Always returns 0.
+ */
+static int
+zil_prt_rec_write_cb(void *data, size_t len, void *unused)
+{
+	const unsigned char *cdata = data;
+
+	for (size_t i = 0; i < len; i++) {
+		if (isprint(cdata[i]))
+			(void) printf("%c ", cdata[i]);
+		else
+			(void) printf("%2X", cdata[i]);
+	}
+	return (0);
+}
+
+/* ARGSUSED */
static void
zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
{
- char *data, *dlimit;
+ abd_t *data;
blkptr_t *bp = &lr->lr_blkptr;
zbookmark_phys_t zb;
- char buf[SPA_MAXBLOCKSIZE];
int verbose = MAX(dump_opt['d'], dump_opt['i']);
int error;
@@ -144,7 +159,6 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
if (BP_IS_HOLE(bp)) {
(void) printf("\t\t\tLSIZE 0x%llx\n",
(u_longlong_t)BP_GET_LSIZE(bp));
- bzero(buf, sizeof (buf));
(void) printf("%s<hole>\n", prefix);
return;
}
@@ -157,28 +171,26 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
lr->lr_foid, ZB_ZIL_LEVEL,
lr->lr_offset / BP_GET_LSIZE(bp));
+ data = abd_alloc(BP_GET_LSIZE(bp), B_FALSE);
error = zio_wait(zio_read(NULL, zilog->zl_spa,
- bp, buf, BP_GET_LSIZE(bp), NULL, NULL,
+ bp, data, BP_GET_LSIZE(bp), NULL, NULL,
ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
if (error)
- return;
- data = buf;
+ goto out;
} else {
- data = (char *)(lr + 1);
+ /* data is stored after the end of the lr_write record */
+ data = abd_alloc(lr->lr_length, B_FALSE);
+ abd_copy_from_buf(data, lr + 1, lr->lr_length);
}
- dlimit = data + MIN(lr->lr_length,
- (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE));
-
(void) printf("%s", prefix);
- while (data < dlimit) {
- if (isprint(*data))
- (void) printf("%c ", *data);
- else
- (void) printf("%2X", *data);
- data++;
- }
+ (void) abd_iterate_func(data,
+ 0, MIN(lr->lr_length, (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)),
+ zil_prt_rec_write_cb, NULL);
(void) printf("\n");
+
+out:
+ abd_free(data);
}
/* ARGSUSED */
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c
index 75a3d5245f..16f79b52ef 100644
--- a/usr/src/cmd/ztest/ztest.c
+++ b/usr/src/cmd/ztest/ztest.c
@@ -111,6 +111,7 @@
#include <sys/refcount.h>
#include <sys/zfeature.h>
#include <sys/dsl_userhold.h>
+#include <sys/abd.h>
#include <stdio.h>
#include <stdio_ext.h>
#include <stdlib.h>
@@ -188,6 +189,7 @@ extern uint64_t metaslab_df_alloc_threshold;
extern uint64_t zfs_deadman_synctime_ms;
extern int metaslab_preload_limit;
extern boolean_t zfs_compressed_arc_enabled;
+extern boolean_t zfs_abd_scatter_enabled;
static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts;
@@ -5051,7 +5053,7 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
enum zio_checksum checksum = spa_dedup_checksum(spa);
dmu_buf_t *db;
dmu_tx_t *tx;
- void *buf;
+ abd_t *abd;
blkptr_t blk;
int copies = 2 * ZIO_DEDUPDITTO_MIN;
@@ -5131,14 +5133,14 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
* Damage the block. Dedup-ditto will save us when we read it later.
*/
psize = BP_GET_PSIZE(&blk);
- buf = zio_buf_alloc(psize);
- ztest_pattern_set(buf, psize, ~pattern);
+ abd = abd_alloc_linear(psize, B_TRUE);
+ ztest_pattern_set(abd_to_buf(abd), psize, ~pattern);
(void) zio_wait(zio_rewrite(NULL, spa, 0, &blk,
- buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
+ abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));
- zio_buf_free(buf, psize);
+ abd_free(abd);
(void) rw_unlock(&ztest_name_lock);
}
@@ -5421,6 +5423,12 @@ ztest_resume_thread(void *arg)
*/
if (ztest_random(10) == 0)
zfs_compressed_arc_enabled = ztest_random(2);
+
+ /*
+ * Periodically change the zfs_abd_scatter_enabled setting.
+ */
+ if (ztest_random(10) == 0)
+ zfs_abd_scatter_enabled = ztest_random(2);
}
return (NULL);
}
diff --git a/usr/src/common/zfs/zfs_fletcher.c b/usr/src/common/zfs/zfs_fletcher.c
index a58fa14b7c..c889169b42 100644
--- a/usr/src/common/zfs/zfs_fletcher.c
+++ b/usr/src/common/zfs/zfs_fletcher.c
@@ -24,6 +24,7 @@
*/
/*
* Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
*/
/*
@@ -133,17 +134,29 @@
#include <sys/byteorder.h>
#include <sys/zio.h>
#include <sys/spa.h>
+#include <zfs_fletcher.h>
-/*ARGSUSED*/
void
-fletcher_2_native(const void *buf, uint64_t size,
- const void *ctx_template, zio_cksum_t *zcp)
+fletcher_init(zio_cksum_t *zcp)
{
+ ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
+}
+
+int
+fletcher_2_incremental_native(void *buf, size_t size, void *data)
+{
+ zio_cksum_t *zcp = data;
+
const uint64_t *ip = buf;
const uint64_t *ipend = ip + (size / sizeof (uint64_t));
uint64_t a0, b0, a1, b1;
- for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
+ a0 = zcp->zc_word[0];
+ a1 = zcp->zc_word[1];
+ b0 = zcp->zc_word[2];
+ b1 = zcp->zc_word[3];
+
+ for (; ip < ipend; ip += 2) {
a0 += ip[0];
a1 += ip[1];
b0 += a0;
@@ -151,18 +164,33 @@ fletcher_2_native(const void *buf, uint64_t size,
}
ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+ return (0);
}
/*ARGSUSED*/
void
-fletcher_2_byteswap(const void *buf, uint64_t size,
+fletcher_2_native(const void *buf, size_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
+ fletcher_init(zcp);
+ (void) fletcher_2_incremental_native((void *) buf, size, zcp);
+}
+
+int
+fletcher_2_incremental_byteswap(void *buf, size_t size, void *data)
+{
+ zio_cksum_t *zcp = data;
+
const uint64_t *ip = buf;
const uint64_t *ipend = ip + (size / sizeof (uint64_t));
uint64_t a0, b0, a1, b1;
- for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
+ a0 = zcp->zc_word[0];
+ a1 = zcp->zc_word[1];
+ b0 = zcp->zc_word[2];
+ b1 = zcp->zc_word[3];
+
+ for (; ip < ipend; ip += 2) {
a0 += BSWAP_64(ip[0]);
a1 += BSWAP_64(ip[1]);
b0 += a0;
@@ -170,50 +198,23 @@ fletcher_2_byteswap(const void *buf, uint64_t size,
}
ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+ return (0);
}
/*ARGSUSED*/
void
-fletcher_4_native(const void *buf, uint64_t size,
+fletcher_2_byteswap(const void *buf, size_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
- const uint32_t *ip = buf;
- const uint32_t *ipend = ip + (size / sizeof (uint32_t));
- uint64_t a, b, c, d;
-
- for (a = b = c = d = 0; ip < ipend; ip++) {
- a += ip[0];
- b += a;
- c += b;
- d += c;
- }
-
- ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+ fletcher_init(zcp);
+ (void) fletcher_2_incremental_byteswap((void *) buf, size, zcp);
}
-/*ARGSUSED*/
-void
-fletcher_4_byteswap(const void *buf, uint64_t size,
- const void *ctx_template, zio_cksum_t *zcp)
+int
+fletcher_4_incremental_native(void *buf, size_t size, void *data)
{
- const uint32_t *ip = buf;
- const uint32_t *ipend = ip + (size / sizeof (uint32_t));
- uint64_t a, b, c, d;
+ zio_cksum_t *zcp = data;
- for (a = b = c = d = 0; ip < ipend; ip++) {
- a += BSWAP_32(ip[0]);
- b += a;
- c += b;
- d += c;
- }
-
- ZIO_SET_CHECKSUM(zcp, a, b, c, d);
-}
-
-void
-fletcher_4_incremental_native(const void *buf, uint64_t size,
- zio_cksum_t *zcp)
-{
const uint32_t *ip = buf;
const uint32_t *ipend = ip + (size / sizeof (uint32_t));
uint64_t a, b, c, d;
@@ -231,12 +232,23 @@ fletcher_4_incremental_native(const void *buf, uint64_t size,
}
ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+ return (0);
}
+/*ARGSUSED*/
void
-fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
- zio_cksum_t *zcp)
+fletcher_4_native(const void *buf, size_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
{
+ fletcher_init(zcp);
+ (void) fletcher_4_incremental_native((void *) buf, size, zcp);
+}
+
+int
+fletcher_4_incremental_byteswap(void *buf, size_t size, void *data)
+{
+ zio_cksum_t *zcp = data;
+
const uint32_t *ip = buf;
const uint32_t *ipend = ip + (size / sizeof (uint32_t));
uint64_t a, b, c, d;
@@ -254,4 +266,14 @@ fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
}
ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+ return (0);
+}
+
+/*
+ * One-shot form of fletcher_4_incremental_byteswap(): reset *zcp with
+ * fletcher_init() and then fold all size bytes of buf into it.  ctx_template
+ * is unused.  The const qualifier on buf is cast away only because the
+ * incremental routine is declared with a non-const buffer pointer.
+ */
+/*ARGSUSED*/
+void
+fletcher_4_byteswap(const void *buf, size_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) fletcher_4_incremental_byteswap((void *) buf, size, zcp);
}
diff --git a/usr/src/common/zfs/zfs_fletcher.h b/usr/src/common/zfs/zfs_fletcher.h
index a920cc816d..33c6c728cf 100644
--- a/usr/src/common/zfs/zfs_fletcher.h
+++ b/usr/src/common/zfs/zfs_fletcher.h
@@ -24,6 +24,7 @@
*/
/*
* Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
*/
#ifndef _ZFS_FLETCHER_H
@@ -40,12 +41,15 @@ extern "C" {
* fletcher checksum functions
*/
-void fletcher_2_native(const void *, uint64_t, const void *, zio_cksum_t *);
-void fletcher_2_byteswap(const void *, uint64_t, const void *, zio_cksum_t *);
-void fletcher_4_native(const void *, uint64_t, const void *, zio_cksum_t *);
-void fletcher_4_byteswap(const void *, uint64_t, const void *, zio_cksum_t *);
-void fletcher_4_incremental_native(const void *, uint64_t, zio_cksum_t *);
-void fletcher_4_incremental_byteswap(const void *, uint64_t, zio_cksum_t *);
+void fletcher_init(zio_cksum_t *);
+void fletcher_2_native(const void *, size_t, const void *, zio_cksum_t *);
+void fletcher_2_byteswap(const void *, size_t, const void *, zio_cksum_t *);
+int fletcher_2_incremental_native(void *, size_t, void *);
+int fletcher_2_incremental_byteswap(void *, size_t, void *);
+void fletcher_4_native(const void *, size_t, const void *, zio_cksum_t *);
+void fletcher_4_byteswap(const void *, size_t, const void *, zio_cksum_t *);
+int fletcher_4_incremental_native(void *, size_t, void *);
+int fletcher_4_incremental_byteswap(void *, size_t, void *);
#ifdef __cplusplus
}
diff --git a/usr/src/head/iso/stddef_iso.h b/usr/src/head/iso/stddef_iso.h
index b94960793c..37e10aec4b 100644
--- a/usr/src/head/iso/stddef_iso.h
+++ b/usr/src/head/iso/stddef_iso.h
@@ -82,16 +82,6 @@ typedef unsigned int size_t; /* (historical version) */
}
#endif /* end of namespace std */
-#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)
-#define offsetof(s, m) __builtin_offsetof(s, m)
-#else
-#if __cplusplus >= 199711L
-#define offsetof(s, m) (std::size_t)(&(((s *)0)->m))
-#else
-#define offsetof(s, m) (size_t)(&(((s *)0)->m))
-#endif
-#endif /* GNUC, etc. */
-
#if !defined(_MAX_ALIGN_T)
#if !defined(_STRICT_SYMBOLS) || defined(_STDC_C11)
#define _MAX_ALIGN_T
diff --git a/usr/src/head/stddef.h b/usr/src/head/stddef.h
index 1e3d016048..6f04b7f7c9 100644
--- a/usr/src/head/stddef.h
+++ b/usr/src/head/stddef.h
@@ -31,10 +31,9 @@
#ifndef _STDDEF_H
#define _STDDEF_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/isa_defs.h>
#include <iso/stddef_iso.h>
+#include <sys/stddef.h>
/*
* Allow global visibility for symbols defined in
diff --git a/usr/src/lib/libzfs/common/libzfs_sendrecv.c b/usr/src/lib/libzfs/common/libzfs_sendrecv.c
index 2641d53e00..4e89dc053d 100644
--- a/usr/src/lib/libzfs/common/libzfs_sendrecv.c
+++ b/usr/src/lib/libzfs/common/libzfs_sendrecv.c
@@ -192,19 +192,19 @@ dump_record(dmu_replay_record_t *drr, void *payload, int payload_len,
{
ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
- fletcher_4_incremental_native(drr,
+ (void) fletcher_4_incremental_native(drr,
offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc);
if (drr->drr_type != DRR_BEGIN) {
ASSERT(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.
drr_checksum.drr_checksum));
drr->drr_u.drr_checksum.drr_checksum = *zc;
}
- fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum,
- sizeof (zio_cksum_t), zc);
+ (void) fletcher_4_incremental_native(
+ &drr->drr_u.drr_checksum.drr_checksum, sizeof (zio_cksum_t), zc);
if (write(outfd, drr, sizeof (*drr)) == -1)
return (errno);
if (payload_len != 0) {
- fletcher_4_incremental_native(payload, payload_len, zc);
+ (void) fletcher_4_incremental_native(payload, payload_len, zc);
if (write(outfd, payload, payload_len) == -1)
return (errno);
}
@@ -2093,9 +2093,9 @@ recv_read(libzfs_handle_t *hdl, int fd, void *buf, int ilen,
if (zc) {
if (byteswap)
- fletcher_4_incremental_byteswap(buf, ilen, zc);
+ (void) fletcher_4_incremental_byteswap(buf, ilen, zc);
else
- fletcher_4_incremental_native(buf, ilen, zc);
+ (void) fletcher_4_incremental_native(buf, ilen, zc);
}
return (0);
}
@@ -3649,7 +3649,8 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap,
* recv_read() above; do it again correctly.
*/
bzero(&zcksum, sizeof (zio_cksum_t));
- fletcher_4_incremental_byteswap(&drr, sizeof (drr), &zcksum);
+ (void) fletcher_4_incremental_byteswap(&drr,
+ sizeof (drr), &zcksum);
flags->byteswap = B_TRUE;
drr.drr_type = BSWAP_32(drr.drr_type);
diff --git a/usr/src/lib/libzpool/common/llib-lzpool b/usr/src/lib/libzpool/common/llib-lzpool
index d0421bea94..3636b4e76e 100644
--- a/usr/src/lib/libzpool/common/llib-lzpool
+++ b/usr/src/lib/libzpool/common/llib-lzpool
@@ -61,6 +61,7 @@
#include <sys/dsl_destroy.h>
#include <sys/dsl_userhold.h>
#include <sys/blkptr.h>
+#include <sys/abd.h>
extern uint64_t metaslab_gang_bang;
extern uint64_t metaslab_df_alloc_threshold;
@@ -68,3 +69,4 @@ extern boolean_t zfeature_checks_disable;
extern uint64_t zfs_deadman_synctime_ms;
extern int metaslab_preload_limit;
extern boolean_t zfs_compressed_arc_enabled;
+extern boolean_t zfs_abd_scatter_enabled;
diff --git a/usr/src/pkg/manifests/driver-storage-mr_sas.mf b/usr/src/pkg/manifests/driver-storage-mr_sas.mf
index 32a138a184..d1b39a659e 100644
--- a/usr/src/pkg/manifests/driver-storage-mr_sas.mf
+++ b/usr/src/pkg/manifests/driver-storage-mr_sas.mf
@@ -23,6 +23,7 @@
# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
# Copyright 2013 Nexenta Systems, Inc. All rights reserved.
# Copyright 2015 Garrett D'Amore <garrett@damore.org>
+# Copyright 2017 Citrus IT Limited. All rights reserved.
#
#
@@ -45,21 +46,29 @@ dir path=usr/share/man/man7d
$(sparc_ONLY)driver name=mr_sas class=scsi-self-identifying \
alias=pci1000,78 \
alias=pci1000,79 \
+ alias=pciex1000,52 \
+ alias=pciex1000,53 \
alias=pciex1000,5b \
alias=pciex1000,5d \
alias=pciex1000,5f \
alias=pciex1000,71 \
alias=pciex1000,73 \
alias=pciex1000,78 \
- alias=pciex1000,79
+ alias=pciex1000,79 \
+ alias=pciex1000,ce \
+ alias=pciex1000,cf
$(i386_ONLY)driver name=mr_sas class=scsi-self-identifying \
+ alias=pciex1000,52 \
+ alias=pciex1000,53 \
alias=pciex1000,5b \
alias=pciex1000,5d \
alias=pciex1000,5f \
alias=pciex1000,71 \
alias=pciex1000,73 \
alias=pciex1000,78 \
- alias=pciex1000,79
+ alias=pciex1000,79 \
+ alias=pciex1000,ce \
+ alias=pciex1000,cf
file path=kernel/drv/$(ARCH64)/mr_sas group=sys
$(i386_ONLY)file path=kernel/drv/mr_sas group=sys
file path=kernel/drv/mr_sas.conf group=sys
diff --git a/usr/src/pkg/manifests/system-header.mf b/usr/src/pkg/manifests/system-header.mf
index 7fec376963..4e135e7751 100644
--- a/usr/src/pkg/manifests/system-header.mf
+++ b/usr/src/pkg/manifests/system-header.mf
@@ -1433,6 +1433,7 @@ file path=usr/include/sys/stat_impl.h
file path=usr/include/sys/statfs.h
file path=usr/include/sys/statvfs.h
file path=usr/include/sys/stdbool.h
+file path=usr/include/sys/stddef.h
file path=usr/include/sys/stdint.h
file path=usr/include/sys/stermio.h
file path=usr/include/sys/stream.h
diff --git a/usr/src/pkg/manifests/system-test-zfstest.mf b/usr/src/pkg/manifests/system-test-zfstest.mf
index 114e9108df..1010ad94ed 100644
--- a/usr/src/pkg/manifests/system-test-zfstest.mf
+++ b/usr/src/pkg/manifests/system-test-zfstest.mf
@@ -158,6 +158,7 @@ file path=opt/zfs-tests/bin/file_trunc mode=0555
file path=opt/zfs-tests/bin/file_write mode=0555
file path=opt/zfs-tests/bin/getholes mode=0555
file path=opt/zfs-tests/bin/largest_file mode=0555
+file path=opt/zfs-tests/bin/memory_balloon mode=0555
file path=opt/zfs-tests/bin/mkbusy mode=0555
file path=opt/zfs-tests/bin/mkfiles mode=0555
file path=opt/zfs-tests/bin/mkholes mode=0555
@@ -2302,14 +2303,18 @@ file path=opt/zfs-tests/tests/perf/regression/random_reads mode=0555
file path=opt/zfs-tests/tests/perf/regression/random_readwrite mode=0555
file path=opt/zfs-tests/tests/perf/regression/random_writes mode=0555
file path=opt/zfs-tests/tests/perf/regression/sequential_reads mode=0555
-file path=opt/zfs-tests/tests/perf/regression/sequential_reads_cached \
+file path=opt/zfs-tests/tests/perf/regression/sequential_reads_arc_cached \
mode=0555
-file path=opt/zfs-tests/tests/perf/regression/sequential_reads_cached_clone \
+file \
+ path=opt/zfs-tests/tests/perf/regression/sequential_reads_arc_cached_clone \
+ mode=0555
+file path=opt/zfs-tests/tests/perf/regression/sequential_reads_dbuf_cached \
mode=0555
file path=opt/zfs-tests/tests/perf/regression/sequential_writes mode=0555
file path=opt/zfs-tests/tests/perf/regression/setup mode=0555
file path=opt/zfs-tests/tests/perf/scripts/io.d mode=0444
file path=opt/zfs-tests/tests/perf/scripts/prefetch_io.d mode=0444
+file path=opt/zfs-tests/tests/perf/scripts/profile.d mode=0444
license cr_Sun license=cr_Sun
license lic_CDDL license=lic_CDDL
depend fmri=system/file-system/zfs/tests type=require
diff --git a/usr/src/test/zfs-tests/cmd/memory_balloon/Makefile b/usr/src/test/zfs-tests/cmd/memory_balloon/Makefile
new file mode 100644
index 0000000000..df1d370356
--- /dev/null
+++ b/usr/src/test/zfs-tests/cmd/memory_balloon/Makefile
@@ -0,0 +1,22 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+
+PROG = memory_balloon
+
+include $(SRC)/cmd/Makefile.cmd
+
+LINTFLAGS += -erroff=E_FUNC_SET_NOT_USED
+
+include ../Makefile.subdirs
diff --git a/usr/src/test/zfs-tests/cmd/memory_balloon/memory_balloon.c b/usr/src/test/zfs-tests/cmd/memory_balloon/memory_balloon.c
new file mode 100644
index 0000000000..958f6e6609
--- /dev/null
+++ b/usr/src/test/zfs-tests/cmd/memory_balloon/memory_balloon.c
@@ -0,0 +1,103 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+/*
+ * Steal memory from the kernel, forcing the ARC to decrease in size, and hold
+ * it until the process receives a signal.
+ */
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/shm.h>
+#include <strings.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+
+/*
+ * Print the command synopsis to stderr and exit with status 1.  -f is an
+ * optional flag that takes no argument; the byte count is a required operand.
+ */
+static void
+usage(char *progname)
+{
+	(void) fprintf(stderr, "Usage: %s [-f] <bytes>\n", progname);
+	exit(1);
+}
+
+/*
+ * Report a failure with perror(), using err as the message prefix, and exit
+ * with status rval.  Does not return.
+ */
+static void
+fail(char *err, int rval)
+{
+ perror(err);
+ exit(rval);
+}
+
+/*
+ * Detach from the invoking process: fork, have the parent print the child's
+ * pid on stdout and exit 0, and have the child start a new session and close
+ * file descriptors 0, 1 and 2.  Exits with status 1 via fail() if fork()
+ * fails.
+ */
+static void
+daemonize(void)
+{
+	pid_t pid;
+
+	if ((pid = fork()) < 0) {
+		fail("fork", 1);
+	} else if (pid != 0) {
+		/* pid_t need not be long; cast to match the %ld conversion. */
+		(void) fprintf(stdout, "%ld\n", (long)pid);
+		exit(0);
+	}
+
+	(void) setsid();
+	(void) close(0);
+	(void) close(1);
+	(void) close(2);
+}
+
+/*
+ * Parse "[-f] <bytes>", create a private System V shared memory segment of
+ * that many bytes and attach it, then (unless -f was given) daemonize, and
+ * finally block in pause() until a signal arrives.
+ *
+ * The byte count must be a complete, in-range, positive decimal number;
+ * anything else is reported through fail() with exit status 1.
+ */
+int
+main(int argc, char *argv[])
+{
+	int c;
+	boolean_t fflag = B_FALSE;
+	char *prog = argv[0];
+	long long size;
+	char *stroll_leftovers;
+	int shm_id;
+	void *shm_attached;
+
+	while ((c = getopt(argc, argv, "f")) != -1) {
+		switch (c) {
+		/* Run in the foreground */
+		case 'f':
+			fflag = B_TRUE;
+			break;
+		default:
+			usage(prog);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc != 1)
+		usage(prog);
+
+	/*
+	 * strtoll() reports overflow only through errno, and reports "no
+	 * digits" or trailing junk only through the end pointer, so check
+	 * both before trusting the value.
+	 */
+	errno = 0;
+	size = strtoll(argv[0], &stroll_leftovers, 10);
+	if (errno != 0 || stroll_leftovers == argv[0] ||
+	    *stroll_leftovers != '\0' || size <= 0) {
+		/* fail() uses perror(); give it a meaningful errno. */
+		if (errno == 0)
+			errno = EINVAL;
+		fail("invalid size in bytes", 1);
+	}
+
+	if ((shm_id = shmget(IPC_PRIVATE, size, IPC_CREAT|IPC_EXCL)) == -1)
+		fail("shmget", 1);
+	if ((shm_attached = shmat(shm_id, NULL, SHM_SHARE_MMU)) == (void *)-1)
+		fail("shmat", 1);
+
+	if (fflag == B_FALSE)
+		daemonize();
+	(void) pause();
+
+	/* NOTREACHED */
+	return (0);
+}
diff --git a/usr/src/test/zfs-tests/runfiles/perf-regression.run b/usr/src/test/zfs-tests/runfiles/perf-regression.run
index 0095931ad5..dbb30f0327 100644
--- a/usr/src/test/zfs-tests/runfiles/perf-regression.run
+++ b/usr/src/test/zfs-tests/runfiles/perf-regression.run
@@ -10,7 +10,7 @@
#
#
-# Copyright (c) 2015 by Delphix. All rights reserved.
+# Copyright (c) 2015, 2016 by Delphix. All rights reserved.
#
[DEFAULT]
@@ -24,7 +24,7 @@ post = cleanup
outputdir = /var/tmp/test_results
[/opt/zfs-tests/tests/perf/regression]
-tests = ['sequential_writes', 'sequential_reads', 'sequential_reads_cached',
- 'sequential_reads_cached_clone', 'random_reads', 'random_writes',
- 'random_readwrite']
+tests = ['sequential_writes', 'sequential_reads', 'sequential_reads_arc_cached',
+ 'sequential_reads_arc_cached_clone', 'sequential_reads_dbuf_cached',
+ 'random_reads', 'random_writes', 'random_readwrite']
post =
diff --git a/usr/src/test/zfs-tests/tests/functional/mdb/mdb_001_pos.ksh b/usr/src/test/zfs-tests/tests/functional/mdb/mdb_001_pos.ksh
index a4f90be49b..7f6faf690e 100644
--- a/usr/src/test/zfs-tests/tests/functional/mdb/mdb_001_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/mdb/mdb_001_pos.ksh
@@ -49,26 +49,51 @@ function cleanup
verify_runnable "global"
log_onexit cleanup
-OUTFILE='/var/tmp/mdb-outfile'
-set -A dcmds "::walk spa" \
- "::walk spa | ::spa " \
- "::walk spa | ::spa -c" \
- "::walk spa | ::spa -v" \
- "::walk spa | ::spa_config" \
- "::walk spa | ::spa_space" \
- "::walk spa | ::spa_space -b" \
- "::walk spa | ::spa_vdevs" \
- "::walk spa | ::walk metaslab" \
- "::walk spa | ::print struct spa spa_root_vdev | ::vdev" \
- "::walk spa | ::print struct spa spa_root_vdev | ::vdev -re" \
+tmpfile=$(mktemp)
+log_must zpool scrub $TESTPOOL
+
+typeset spa=$(mdb -ke "::spa" | awk "/$TESTPOOL/ {print \$1}")
+typeset off_ub=$(mdb -ke "::offsetof spa_t spa_uberblock | =J")
+typeset off_rbp=$(mdb -ke "::offsetof uberblock_t ub_rootbp | =J")
+typeset bp=$(mdb -ke "$spa + $off_ub + $off_rbp =J")
+
+# dcmds and walkers skipped due to being DEBUG only or difficult to run:
+# ::zfs_params
+# ::refcount
+
+# Every element except the last must end in a line continuation so that the
+# whole list stays part of the single "set -A" command.
+set -A dcmds "::abuf_find 1 2" \
+ "::arc" \
+ "::arc -b" \
+ "::arc_compression_stats" \
+ "$bp ::blkptr" \
+ "$bp ::dva" \
+ "::walk spa" \
+ "::spa" \
+ "$spa ::spa " \
+ "$spa ::spa -c" \
+ "$spa ::spa -h" \
+ "$spa ::spa -v" \
+ "$spa ::spa -Mmh" \
+ "$spa ::spa_config" \
+ "$spa ::spa_space" \
+ "$spa ::spa_space -b" \
+ "$spa ::spa_vdevs" \
+ "$spa ::print spa_t spa_root_vdev | ::vdev" \
+ "$spa ::print spa_t spa_root_vdev | ::vdev -re" \
+ "$spa ::print -a spa_t spa_dsl_pool->dp_dirty_datasets | ::walk txg_list" \
+ "$spa ::print -a spa_t spa_uberblock.ub_rootbp | ::blkptr" \
+ "$spa ::walk metaslab" \
+ "$spa ::walk metaslab | ::head -1 | ::metaslab_weight" \
+ "$spa ::walk metaslab | ::head -1 | ::metaslab_trace" \
+ "$spa ::walk zio_root | ::zio -c" \
+ "$spa ::walk zio_root | ::zio -r" \
+ "$spa ::walk zms_freelist" \
+ "$spa ::zfs_blkstats -v" \
"::dbufs" \
"::dbufs -n mos -o mdn -l 0 -b 0" \
"::dbufs | ::dbuf" \
"::dbuf_stats" \
- "::abuf_find 1 2" \
- "::walk spa | ::print -a struct spa spa_uberblock.ub_rootbp | ::blkptr" \
- "::walk spa | ::print -a struct spa spa_dsl_pool->dp_dirty_datasets | ::walk txg_list" \
- "::walk spa | ::walk zms_freelist"
+ "dbuf_cache ::walk multilist"
#
# The commands above were supplied by the ZFS development team. The idea is to
# do as much checking as possible without the need to hardcode addresses.
diff --git a/usr/src/test/zfs-tests/tests/perf/perf.shlib b/usr/src/test/zfs-tests/tests/perf/perf.shlib
index 38e30f255d..ff980c0e6e 100644
--- a/usr/src/test/zfs-tests/tests/perf/perf.shlib
+++ b/usr/src/test/zfs-tests/tests/perf/perf.shlib
@@ -182,6 +182,18 @@ function get_max_arc_size
echo $max_arc_size
}
+# Print the value of the kernel variable dbuf_cache_max_bytes, as read with
+# dtrace, on stdout.  Calls log_fail when the status checked after the
+# command substitution is non-zero.
+# NOTE(review): $? below is tested after a typeset assignment; confirm that it
+# reflects dtrace's exit status rather than typeset's own.
+function get_max_dbuf_cache_size
+{
+ typeset -l max_dbuf_cache_size=$(dtrace -qn 'BEGIN {
+ printf("%u\n", `dbuf_cache_max_bytes);
+ exit(0);
+ }')
+
+ [[ $? -eq 0 ]] || log_fail "get_max_dbuf_cache_size failed"
+
+ echo $max_dbuf_cache_size
+}
+
# Create a file with some information about how this system is configured.
function get_system_config
{
diff --git a/usr/src/test/zfs-tests/tests/perf/regression/random_reads.ksh b/usr/src/test/zfs-tests/tests/perf/regression/random_reads.ksh
index abf05ca719..655366e00c 100644
--- a/usr/src/test/zfs-tests/tests/perf/regression/random_reads.ksh
+++ b/usr/src/test/zfs-tests/tests/perf/regression/random_reads.ksh
@@ -69,8 +69,9 @@ log_must fio $FIO_SCRIPTS/mkfiles.fio
# Set up the scripts and output files that will log performance data.
lun_list=$(pool_to_lun_list $PERFPOOL)
log_note "Collecting backend IO stats with lun list $lun_list"
-export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io"
- "vmstat 1" "vmstat" "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat")
+export collect_scripts=("dtrace -s $PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1"
+ "io" "vmstat 1" "vmstat" "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat"
+ "dtrace -s $PERF_SCRIPTS/profile.d" "profile" "kstat zfs:0 1" "kstat")
log_note "Random reads with $PERF_RUNTYPE settings"
do_fio_run random_reads.fio false true
diff --git a/usr/src/test/zfs-tests/tests/perf/regression/random_readwrite.ksh b/usr/src/test/zfs-tests/tests/perf/regression/random_readwrite.ksh
index 2422f9c658..f41a2b526e 100644
--- a/usr/src/test/zfs-tests/tests/perf/regression/random_readwrite.ksh
+++ b/usr/src/test/zfs-tests/tests/perf/regression/random_readwrite.ksh
@@ -69,8 +69,9 @@ log_must fio $FIO_SCRIPTS/mkfiles.fio
# Set up the scripts and output files that will log performance data.
lun_list=$(pool_to_lun_list $PERFPOOL)
log_note "Collecting backend IO stats with lun list $lun_list"
-export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io"
- "vmstat 1" "vmstat" "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat")
+export collect_scripts=("dtrace -s $PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1"
+ "io" "vmstat 1" "vmstat" "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat"
+ "dtrace -s $PERF_SCRIPTS/profile.d" "profile" "kstat zfs:0 1" "kstat")
log_note "Random reads and writes with $PERF_RUNTYPE settings"
do_fio_run random_readwrite.fio false true
diff --git a/usr/src/test/zfs-tests/tests/perf/regression/random_writes.ksh b/usr/src/test/zfs-tests/tests/perf/regression/random_writes.ksh
index c48ae76140..9e201a827c 100644
--- a/usr/src/test/zfs-tests/tests/perf/regression/random_writes.ksh
+++ b/usr/src/test/zfs-tests/tests/perf/regression/random_writes.ksh
@@ -61,8 +61,9 @@ fi
# Set up the scripts and output files that will log performance data.
lun_list=$(pool_to_lun_list $PERFPOOL)
log_note "Collecting backend IO stats with lun list $lun_list"
-export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io"
- "vmstat 1" "vmstat" "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat")
+export collect_scripts=("dtrace -s $PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1"
+ "io" "vmstat 1" "vmstat" "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat"
+ "dtrace -s $PERF_SCRIPTS/profile.d" "profile" "kstat zfs:0 1" "kstat")
log_note "Random writes with $PERF_RUNTYPE settings"
do_fio_run random_writes.fio true false
diff --git a/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads.ksh b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads.ksh
index 60083c8673..580f2d94e4 100644
--- a/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads.ksh
+++ b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads.ksh
@@ -69,9 +69,10 @@ log_must fio $FIO_SCRIPTS/mkfiles.fio
# Set up the scripts and output files that will log performance data.
lun_list=$(pool_to_lun_list $PERFPOOL)
log_note "Collecting backend IO stats with lun list $lun_list"
-export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io"
- "$PERF_SCRIPTS/prefetch_io.d $PERFPOOL 1" "prefetch" "vmstat 1" "vmstat"
- "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat")
+export collect_scripts=("dtrace -s $PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1"
+ "io" "dtrace -Cs $PERF_SCRIPTS/prefetch_io.d $PERFPOOL 1" "prefetch"
+ "vmstat 1" "vmstat" "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat"
+ "dtrace -s $PERF_SCRIPTS/profile.d" "profile" "kstat zfs:0 1" "kstat")
log_note "Sequential reads with $PERF_RUNTYPE settings"
do_fio_run sequential_reads.fio false true
diff --git a/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_cached.ksh b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_arc_cached.ksh
index b4365c0871..97bb8bdc31 100644
--- a/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_cached.ksh
+++ b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_arc_cached.ksh
@@ -68,9 +68,10 @@ log_must fio $FIO_SCRIPTS/mkfiles.fio
# Set up the scripts and output files that will log performance data.
lun_list=$(pool_to_lun_list $PERFPOOL)
log_note "Collecting backend IO stats with lun list $lun_list"
-export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io"
- "$PERF_SCRIPTS/prefetch_io.d $PERFPOOL 1" "prefetch" "vmstat 1" "vmstat"
- "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat")
+export collect_scripts=("dtrace -s $PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1"
+ "io" "dtrace -Cs $PERF_SCRIPTS/prefetch_io.d $PERFPOOL 1" "prefetch"
+ "vmstat 1" "vmstat" "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat"
+ "dtrace -s $PERF_SCRIPTS/profile.d" "profile" "kstat zfs:0 1" "kstat")
log_note "Sequential cached reads with $PERF_RUNTYPE settings"
do_fio_run sequential_reads.fio false false
diff --git a/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_cached_clone.ksh b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_arc_cached_clone.ksh
index c656eb4643..cfc748c843 100644
--- a/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_cached_clone.ksh
+++ b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_arc_cached_clone.ksh
@@ -84,9 +84,10 @@ export TESTFS=$PERFPOOL/$TESTCLONE
# Set up the scripts and output files that will log performance data.
lun_list=$(pool_to_lun_list $PERFPOOL)
log_note "Collecting backend IO stats with lun list $lun_list"
-export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io"
- "$PERF_SCRIPTS/prefetch_io.d $PERFPOOL 1" "prefetch" "vmstat 1" "vmstat"
- "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat")
+export collect_scripts=("dtrace -s $PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1"
+ "io" "dtrace -Cs $PERF_SCRIPTS/prefetch_io.d $PERFPOOL 1" "prefetch"
+ "vmstat 1" "vmstat" "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat"
+ "dtrace -s $PERF_SCRIPTS/profile.d" "profile" "kstat zfs:0 1" "kstat")
log_note "Sequential cached reads from $TESTFS with $PERF_RUNTYPE settings"
do_fio_run sequential_reads.fio false false
diff --git a/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_dbuf_cached.ksh b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_dbuf_cached.ksh
new file mode 100644
index 0000000000..f7ea4b75c6
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/perf/regression/sequential_reads_dbuf_cached.ksh
@@ -0,0 +1,82 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2016 by Delphix. All rights reserved.
+#
+
+#
+# Description:
+# Trigger fio runs using the sequential_reads job file. The number of runs and
+# data collected is determined by the PERF_* variables. See do_fio_run for
+# details about these variables.
+#
+# The files to read from are created prior to the first fio run, and used
+# for all fio runs. The ARC is not cleared to ensure that all data is cached.
+#
+# This is basically a copy of the sequential_reads_cached test case, but with
+# a smaller dataset so that we can fit everything into the decompressed, linear
+# space in the dbuf cache.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/perf/perf.shlib
+
+function cleanup
+{
+ log_must zfs destroy $TESTFS
+}
+
+log_assert "Measure IO stats during sequential read load"
+log_onexit cleanup
+
+export TESTFS=$PERFPOOL/testfs
+recreate_perfpool
+log_must zfs create $PERF_FS_OPTS $TESTFS
+
+# Ensure the working set can be cached in the dbuf cache.
+export TOTAL_SIZE=$(($(get_max_dbuf_cache_size) * 3 / 4))
+
+# Variables for use by fio.
+if [[ -n $PERF_REGRESSION_WEEKLY ]]; then
+ export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_WEEKLY}
+ export PERF_RUNTYPE=${PERF_RUNTYPE:-'weekly'}
+ export PERF_NTHREADS=${PERF_NTHREADS:-'16 64'}
+ export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'}
+ export PERF_IOSIZES=${PERF_IOSIZES:-'64k'}
+elif [[ -n $PERF_REGRESSION_NIGHTLY ]]; then
+ export PERF_RUNTIME=${PERF_RUNTIME:-$PERF_RUNTIME_NIGHTLY}
+ export PERF_RUNTYPE=${PERF_RUNTYPE:-'nightly'}
+ export PERF_NTHREADS=${PERF_NTHREADS:-'64'}
+ export PERF_SYNC_TYPES=${PERF_SYNC_TYPES:-'1'}
+ export PERF_IOSIZES=${PERF_IOSIZES:-'64k'}
+fi
+
+# Lay out the files to be used by the read tests. Create as many files as the
+# largest number of threads. An fio run with fewer threads will use a subset
+# of the available files.
+export NUMJOBS=$(get_max $PERF_NTHREADS)
+export FILE_SIZE=$((TOTAL_SIZE / NUMJOBS))
+log_must fio $FIO_SCRIPTS/mkfiles.fio
+
+# Set up the scripts and output files that will log performance data.
+lun_list=$(pool_to_lun_list $PERFPOOL)
+log_note "Collecting backend IO stats with lun list $lun_list"
+export collect_scripts=("dtrace -s $PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1"
+ "io" "dtrace -Cs $PERF_SCRIPTS/prefetch_io.d $PERFPOOL 1" "prefetch"
+ "vmstat 1" "vmstat" "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat"
+ "dtrace -s $PERF_SCRIPTS/profile.d" "profile" "kstat zfs:0 1" "kstat")
+
+log_note "Sequential cached reads with $PERF_RUNTYPE settings"
+do_fio_run sequential_reads.fio false false
+log_pass "Measure IO stats during sequential cached read load"
diff --git a/usr/src/test/zfs-tests/tests/perf/regression/sequential_writes.ksh b/usr/src/test/zfs-tests/tests/perf/regression/sequential_writes.ksh
index e2f2cca0d2..493a3d18b7 100644
--- a/usr/src/test/zfs-tests/tests/perf/regression/sequential_writes.ksh
+++ b/usr/src/test/zfs-tests/tests/perf/regression/sequential_writes.ksh
@@ -61,8 +61,9 @@ fi
# Set up the scripts and output files that will log performance data.
lun_list=$(pool_to_lun_list $PERFPOOL)
log_note "Collecting backend IO stats with lun list $lun_list"
-export collect_scripts=("$PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1" "io"
- "vmstat 1" "vmstat" "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat")
+export collect_scripts=("dtrace -s $PERF_SCRIPTS/io.d $PERFPOOL $lun_list 1"
+ "io" "vmstat 1" "vmstat" "mpstat 1" "mpstat" "iostat -xcnz 1" "iostat"
+ "dtrace -s $PERF_SCRIPTS/profile.d" "profile" "kstat zfs:0 1" "kstat")
log_note "Sequential writes with $PERF_RUNTYPE settings"
do_fio_run sequential_writes.fio true false
diff --git a/usr/src/test/zfs-tests/tests/perf/scripts/profile.d b/usr/src/test/zfs-tests/tests/perf/scripts/profile.d
new file mode 100644
index 0000000000..e7fbd1fca5
--- /dev/null
+++ b/usr/src/test/zfs-tests/tests/perf/scripts/profile.d
@@ -0,0 +1,37 @@
+#!/usr/sbin/dtrace -s
+
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#pragma D option stackframes=100
+
+/*
+ * @stacks: The number of times a stack has been recorded
+ */
+
+profile-997
+/ arg0 /
+{
+ @stacks[stack()] = count();
+}
+
+ERROR
+{
+ trace(arg1);
+ trace(arg2);
+ trace(arg3);
+ trace(arg4);
+ trace(arg5);
+}
diff --git a/usr/src/tools/mbh_patch/Makefile b/usr/src/tools/mbh_patch/Makefile
index 68d2559864..95e8442340 100644
--- a/usr/src/tools/mbh_patch/Makefile
+++ b/usr/src/tools/mbh_patch/Makefile
@@ -23,7 +23,6 @@
# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#ident "%Z%%M% %I% %E% SMI"
include ../Makefile.tools
diff --git a/usr/src/tools/mbh_patch/mbh_patch.c b/usr/src/tools/mbh_patch/mbh_patch.c
index 8a5fa4cd31..204009ab4d 100644
--- a/usr/src/tools/mbh_patch/mbh_patch.c
+++ b/usr/src/tools/mbh_patch/mbh_patch.c
@@ -24,8 +24,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <stdlib.h>
#include <errno.h>
#include <fcntl.h>
@@ -37,7 +35,9 @@
#include <sys/elf_notes.h>
#include <sys/mman.h>
#include <sys/stat.h>
+#include <sys/sysmacros.h>
#include "sys/multiboot.h"
+#include "sys/multiboot2.h"
static char *pname;
static char *fname;
@@ -46,7 +46,57 @@ static char *image; /* pointer to the ELF file in memory */
#define ELFSEEK(offset) ((void *)(image + offset))
/*
- * patch the load address / entry address
+ * Find the MB2 address and entry address header tags and patch them.
+ * The first tag immediately follows the header.
+ */
+static int
+patch64_mb2(multiboot2_header_t *mbh2, int file_offset,
+ Elf64_Addr ptload_start, Elf32_Off ptload_offset)
+{
+ multiboot_header_tag_t *tagp = mbh2->mb2_tags;
+ multiboot_header_tag_address_t *mbaddr = NULL;
+ multiboot_header_tag_entry_address_t *mbentry = NULL;
+
+ /*
+ * Loop until we get end TAG or we have both tags.
+ */
+ while (tagp->mbh_type != MULTIBOOT_HEADER_TAG_END &&
+ (mbaddr == NULL || mbentry == NULL)) {
+ switch (tagp->mbh_type) {
+ case MULTIBOOT_HEADER_TAG_ADDRESS:
+ mbaddr = (multiboot_header_tag_address_t *)tagp;
+ break;
+ case MULTIBOOT_HEADER_TAG_ENTRY_ADDRESS:
+ mbentry = (multiboot_header_tag_entry_address_t *)tagp;
+ break;
+ }
+ tagp = (multiboot_header_tag_t *)
+ ((uintptr_t)tagp +
+ P2ROUNDUP(tagp->mbh_size, MULTIBOOT_TAG_ALIGN));
+ }
+
+ if (mbaddr == NULL || mbentry == NULL) {
+ (void) fprintf(stderr, "Missing multiboot2 %s tag\n",
+ (mbaddr == NULL)? "address" : "entry");
+ return (1);
+ }
+
+ /* Patch it. */
+ mbaddr->mbh_load_addr = ptload_start - ptload_offset;
+ mbaddr->mbh_header_addr = mbaddr->mbh_load_addr + file_offset;
+ mbentry->mbh_entry_addr = ptload_start;
+
+#ifdef VERBOSE
+ (void) printf(" ELF64 MB2 header patched\n");
+ (void) printf("\tload_addr now: 0x%x\n", mbaddr->mbh_load_addr);
+ (void) printf("\theader_addr now: 0x%x\n", mbaddr->mbh_header_addr);
+ (void) printf("\tentry_addr now: 0x%x\n", mbentry->mbh_entry_addr);
+#endif
+ return (0);
+}
+
+/*
+ * Patch the load address / entry address for MB1 and MB2 if present.
* Find the physical load address of the 1st PT_LOAD segment.
* Find the amount that e_entry exceeds that amount.
* Now go back and subtract the excess from the p_paddr of the LOAD segment.
@@ -56,8 +106,9 @@ patch64(Elf64_Ehdr *eh)
{
Elf64_Phdr *phdr;
caddr_t phdrs = NULL;
- int ndx, mem;
+ int ndx, mem, mem2;
multiboot_header_t *mbh;
+ multiboot2_header_t *mbh2;
/*
* Verify some ELF basics - this must be an executable with program
@@ -84,7 +135,7 @@ patch64(Elf64_Ehdr *eh)
}
/*
- * Look for multiboot header. It must be 32-bit aligned and
+ * Look for multiboot1 header. It must be 32-bit aligned and
* completely contained in the 1st 8K of the file.
*/
for (mem = 0; mem < 8192 - sizeof (multiboot_header_t); mem += 4) {
@@ -100,6 +151,30 @@ patch64(Elf64_Ehdr *eh)
}
/*
+ * Look for multiboot2 header. It must be 64-bit aligned and
+ * completely contained in the 1st 32K of the file.
+ * We do not require it to be present.
+ */
+ ndx = 0;
+ for (mem2 = 0;
+ mem2 <= MULTIBOOT_SEARCH - sizeof (multiboot2_header_t);
+ mem2 += MULTIBOOT_HEADER_ALIGN) {
+ mbh2 = ELFSEEK(mem2);
+ ndx = mbh2->mb2_header_length;
+ if (mbh2->mb2_magic == MULTIBOOT2_HEADER_MAGIC)
+ break;
+ ndx = 0;
+ }
+
+ if (ndx == 0 || mem2 + ndx > MULTIBOOT_SEARCH) {
+#ifdef VERBOSE
+ (void) fprintf(stderr, "%s: %s: Didn't find multiboot2 "
+ "header\n", pname, fname);
+#endif
+ mbh2 = NULL;
+ }
+
+ /*
* Find the 1:1 mapped PT_LOAD section
*/
for (ndx = 0; ndx < eh->e_phnum; ndx++) {
@@ -135,6 +210,16 @@ patch64(Elf64_Ehdr *eh)
return (1);
}
+ if (mbh2 != NULL && ((mem2 < phdr->p_offset) ||
+ (mem2 >= (phdr->p_offset + phdr->p_filesz)))) {
+#ifdef VERBOSE
+ (void) fprintf(stderr, "%s: %s: multiboot2 header not"
+ " in 1st PT_LOAD\n", pname, fname);
+#endif
+ mem2 = 0;
+ mbh2 = NULL;
+ }
+
/*
* Patch the multiboot header fields to get entire file loaded.
* Grub uses the MB header for 64 bit loading.
@@ -148,6 +233,9 @@ patch64(Elf64_Ehdr *eh)
(void) printf("\tentry_addr now: 0x%x\n", mbh->entry_addr);
(void) printf("\theader_addr now: 0x%x\n", mbh->header_addr);
#endif
+ if (mbh2 != NULL)
+ return (patch64_mb2(mbh2, mem2, phdr->p_paddr,
+ phdr->p_offset));
return (0);
}
@@ -162,9 +250,10 @@ main(int argc, char **argv)
int fd;
uchar_t *ident;
void *hdr = NULL;
+ struct stat sb;
/*
- * we expect one argument -- the elf file
+ * We expect one argument -- the elf file.
*/
if (argc != 2) {
(void) fprintf(stderr, "usage: %s <unix-elf-file>\n", argv[0]);
@@ -184,11 +273,25 @@ main(int argc, char **argv)
return (1);
}
+ if (fstat(fd, &sb) != 0) {
+ (void) fprintf(stderr, "%s: fstat failed: %s\n",
+ pname, strerror(errno));
+ return (1);
+ }
+
+ /* Make sure we have at least MULTIBOOT_SEARCH bytes. */
+ if (sb.st_size < MULTIBOOT_SEARCH) {
+ (void) fprintf(stderr, "%s: %s is too small for a kernel\n",
+ pname, fname);
+ return (1);
+ }
+
/*
- * mmap just the 1st 8K -- since that's where the GRUB
- * multiboot header must be located.
+ * mmap the 1st 32K -- MB1 header is within first 8k and MB2 header
+ * is within 32k.
*/
- image = mmap(NULL, 8192, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ image = mmap(NULL, MULTIBOOT_SEARCH, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
if (image == MAP_FAILED) {
(void) fprintf(stderr, "%s: mmap() of %s failed: %s\n",
pname, fname, strerror(errno));
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 4aaa968965..450c903674 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -1378,6 +1378,7 @@ SMBFS_OBJS += smbfs_vfsops.o smbfs_vnops.o smbfs_node.o \
BOOTFS_OBJS += bootfs_construct.o bootfs_vfsops.o bootfs_vnops.o
ZFS_COMMON_OBJS += \
+ abd.o \
arc.o \
blkptr.o \
bplist.o \
diff --git a/usr/src/uts/common/fs/zfs/abd.c b/usr/src/uts/common/fs/zfs/abd.c
new file mode 100644
index 0000000000..932ba800ed
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/abd.c
@@ -0,0 +1,940 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+/*
+ * ARC buffer data (ABD).
+ *
+ * ABDs are an abstract data structure for the ARC which can use two
+ * different ways of storing the underlying data:
+ *
+ * (a) Linear buffer. In this case, all the data in the ABD is stored in one
+ * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache).
+ *
+ * +-------------------+
+ * | ABD (linear) |
+ * | abd_flags = ... |
+ * | abd_size = ... | +--------------------------------+
+ * | abd_buf ------------->| raw buffer of size abd_size |
+ * +-------------------+ +--------------------------------+
+ * no abd_chunks
+ *
+ * (b) Scattered buffer. In this case, the data in the ABD is split into
+ * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers
+ * to the chunks recorded in an array at the end of the ABD structure.
+ *
+ * +-------------------+
+ * | ABD (scattered) |
+ * | abd_flags = ... |
+ * | abd_size = ... |
+ * | abd_offset = 0 | +-----------+
+ * | abd_chunks[0] ----------------------------->| chunk 0 |
+ * | abd_chunks[1] ---------------------+ +-----------+
+ * | ... | | +-----------+
+ * | abd_chunks[N-1] ---------+ +------->| chunk 1 |
+ * +-------------------+ | +-----------+
+ * | ...
+ * | +-----------+
+ * +----------------->| chunk N-1 |
+ * +-----------+
+ *
+ * Using a large proportion of scattered ABDs decreases ARC fragmentation since
+ * when we are at the limit of allocatable space, using equal-size chunks will
+ * allow us to quickly reclaim enough space for a new large allocation (assuming
+ * it is also scattered).
+ *
+ * In addition to directly allocating a linear or scattered ABD, it is also
+ * possible to create an ABD by requesting the "sub-ABD" starting at an offset
+ * within an existing ABD. In linear buffers this is simple (set abd_buf of
+ * the new ABD to the starting point within the original raw buffer), but
+ * scattered ABDs are a little more complex. The new ABD makes a copy of the
+ * relevant abd_chunks pointers (but not the underlying data). However, to
+ * provide arbitrary rather than only chunk-aligned starting offsets, it also
+ * tracks an abd_offset field which represents the starting point of the data
+ * within the first chunk in abd_chunks. For both linear and scattered ABDs,
+ * creating an offset ABD marks the original ABD as the offset's parent, and the
+ * original ABD's abd_children refcount is incremented. This data allows us to
+ * ensure the root ABD isn't deleted before its children.
+ *
+ * Most consumers should never need to know what type of ABD they're using --
+ * the ABD public API ensures that it's possible to transparently switch from
+ * using a linear ABD to a scattered one when doing so would be beneficial.
+ *
+ * If you need to use the data within an ABD directly and you know it's linear
+ * (because you allocated it) you can use abd_to_buf() to access the underlying
+ * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions
+ * which will allocate a raw buffer if necessary. Use the abd_return_buf*
+ * functions to return any raw buffers that are no longer necessary when you're
+ * done using them.
+ *
+ * There are a variety of ABD APIs that implement basic buffer operations:
+ * compare, copy, read, write, and fill with zeroes. If you need a custom
+ * function which progressively accesses the whole ABD, use the abd_iterate_*
+ * functions.
+ */
+
+#include <sys/abd.h>
+#include <sys/param.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_znode.h>
+
+typedef struct abd_stats {
+ kstat_named_t abdstat_struct_size;
+ kstat_named_t abdstat_scatter_cnt;
+ kstat_named_t abdstat_scatter_data_size;
+ kstat_named_t abdstat_scatter_chunk_waste;
+ kstat_named_t abdstat_linear_cnt;
+ kstat_named_t abdstat_linear_data_size;
+} abd_stats_t;
+
+static abd_stats_t abd_stats = {
+ /* Amount of memory occupied by all of the abd_t struct allocations */
+ { "struct_size", KSTAT_DATA_UINT64 },
+ /*
+ * The number of scatter ABDs which are currently allocated, excluding
+ * ABDs which don't own their data (for instance the ones which were
+ * allocated through abd_get_offset()).
+ */
+ { "scatter_cnt", KSTAT_DATA_UINT64 },
+ /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
+ { "scatter_data_size", KSTAT_DATA_UINT64 },
+ /*
+ * The amount of space wasted at the end of the last chunk across all
+ * scatter ABDs tracked by scatter_cnt.
+ */
+ { "scatter_chunk_waste", KSTAT_DATA_UINT64 },
+ /*
+ * The number of linear ABDs which are currently allocated, excluding
+ * ABDs which don't own their data (for instance the ones which were
+ * allocated through abd_get_offset() and abd_get_from_buf()). If an
+ * ABD takes ownership of its buf then it will become tracked.
+ */
+ { "linear_cnt", KSTAT_DATA_UINT64 },
+ /* Amount of data stored in all linear ABDs tracked by linear_cnt */
+ { "linear_data_size", KSTAT_DATA_UINT64 },
+};
+
+#define ABDSTAT(stat) (abd_stats.stat.value.ui64)
+#define ABDSTAT_INCR(stat, val) \
+ atomic_add_64(&abd_stats.stat.value.ui64, (val))
+#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1)
+#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1)
+
+/*
+ * It is possible to make all future ABDs be linear by setting this to B_FALSE.
+ * Otherwise, ABDs are allocated scattered by default unless the caller uses
+ * abd_alloc_linear().
+ */
+boolean_t zfs_abd_scatter_enabled = B_TRUE;
+
+/*
+ * The size of the chunks ABD allocates. Because the sizes allocated from the
+ * kmem_cache can't change, this tunable can only be modified at boot. Changing
+ * it at runtime would cause ABD iteration to work incorrectly for ABDs which
+ * were allocated with the old size, so a safeguard has been put in place which
+ * will cause the machine to panic if you change it and try to access the data
+ * within a scattered ABD.
+ */
+size_t zfs_abd_chunk_size = 4096;
+
+#ifdef _KERNEL
+extern vmem_t *zio_alloc_arena;
+#endif
+
+kmem_cache_t *abd_chunk_cache;
+static kstat_t *abd_ksp;
+
+static void *
+abd_alloc_chunk()
+{
+ void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
+ ASSERT3P(c, !=, NULL);
+ return (c);
+}
+
+static void
+abd_free_chunk(void *c)
+{
+ kmem_cache_free(abd_chunk_cache, c);
+}
+
+void
+abd_init(void)
+{
+ vmem_t *data_alloc_arena = NULL;
+
+#ifdef _KERNEL
+ data_alloc_arena = zio_alloc_arena;
+#endif
+
+ /*
+ * Since ABD chunks do not appear in crash dumps, we pass KMC_NOTOUCH
+ * so that no allocator metadata is stored with the buffers.
+ */
+ abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0,
+ NULL, NULL, NULL, NULL, data_alloc_arena, KMC_NOTOUCH);
+
+ abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
+ sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+ if (abd_ksp != NULL) {
+ abd_ksp->ks_data = &abd_stats;
+ kstat_install(abd_ksp);
+ }
+}
+
+void
+abd_fini(void)
+{
+ if (abd_ksp != NULL) {
+ kstat_delete(abd_ksp);
+ abd_ksp = NULL;
+ }
+
+ kmem_cache_destroy(abd_chunk_cache);
+ abd_chunk_cache = NULL;
+}
+
+static inline size_t
+abd_chunkcnt_for_bytes(size_t size)
+{
+ return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size);
+}
+
+static inline size_t
+abd_scatter_chunkcnt(abd_t *abd)
+{
+ ASSERT(!abd_is_linear(abd));
+ return (abd_chunkcnt_for_bytes(
+ abd->abd_u.abd_scatter.abd_offset + abd->abd_size));
+}
+
+static inline void
+abd_verify(abd_t *abd)
+{
+ ASSERT3U(abd->abd_size, >, 0);
+ ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
+ ABD_FLAG_OWNER | ABD_FLAG_META));
+ IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
+ IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
+ if (abd_is_linear(abd)) {
+ ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL);
+ } else {
+ ASSERT3U(abd->abd_u.abd_scatter.abd_offset, <,
+ zfs_abd_chunk_size);
+ size_t n = abd_scatter_chunkcnt(abd);
+ for (int i = 0; i < n; i++) {
+ ASSERT3P(
+ abd->abd_u.abd_scatter.abd_chunks[i], !=, NULL);
+ }
+ }
+}
+
+static inline abd_t *
+abd_alloc_struct(size_t chunkcnt)
+{
+ size_t size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
+ abd_t *abd = kmem_alloc(size, KM_PUSHPAGE);
+ ASSERT3P(abd, !=, NULL);
+ ABDSTAT_INCR(abdstat_struct_size, size);
+
+ return (abd);
+}
+
+static inline void
+abd_free_struct(abd_t *abd)
+{
+ size_t chunkcnt = abd_is_linear(abd) ? 0 : abd_scatter_chunkcnt(abd);
+ int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
+ kmem_free(abd, size);
+ ABDSTAT_INCR(abdstat_struct_size, -size);
+}
+
+/*
+ * Allocate an ABD, along with its own underlying data buffers. Use this if you
+ * don't care whether the ABD is linear or not.
+ */
+abd_t *
+abd_alloc(size_t size, boolean_t is_metadata)
+{
+ if (!zfs_abd_scatter_enabled)
+ return (abd_alloc_linear(size, is_metadata));
+
+ VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+ size_t n = abd_chunkcnt_for_bytes(size);
+ abd_t *abd = abd_alloc_struct(n);
+
+ abd->abd_flags = ABD_FLAG_OWNER;
+ if (is_metadata) {
+ abd->abd_flags |= ABD_FLAG_META;
+ }
+ abd->abd_size = size;
+ abd->abd_parent = NULL;
+ refcount_create(&abd->abd_children);
+
+ abd->abd_u.abd_scatter.abd_offset = 0;
+ abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size;
+
+ for (int i = 0; i < n; i++) {
+ void *c = abd_alloc_chunk();
+ ASSERT3P(c, !=, NULL);
+ abd->abd_u.abd_scatter.abd_chunks[i] = c;
+ }
+
+ ABDSTAT_BUMP(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, size);
+ ABDSTAT_INCR(abdstat_scatter_chunk_waste,
+ n * zfs_abd_chunk_size - size);
+
+ return (abd);
+}
+
+static void
+abd_free_scatter(abd_t *abd)
+{
+ size_t n = abd_scatter_chunkcnt(abd);
+ for (int i = 0; i < n; i++) {
+ abd_free_chunk(abd->abd_u.abd_scatter.abd_chunks[i]);
+ }
+
+ refcount_destroy(&abd->abd_children);
+ ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
+ ABDSTAT_INCR(abdstat_scatter_chunk_waste,
+ abd->abd_size - n * zfs_abd_chunk_size);
+
+ abd_free_struct(abd);
+}
+
+/*
+ * Allocate an ABD that must be linear, along with its own underlying data
+ * buffer. Only use this when it would be very annoying to write your ABD
+ * consumer with a scattered ABD.
+ */
+abd_t *
+abd_alloc_linear(size_t size, boolean_t is_metadata)
+{
+ abd_t *abd = abd_alloc_struct(0);
+
+ VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+ abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
+ if (is_metadata) {
+ abd->abd_flags |= ABD_FLAG_META;
+ }
+ abd->abd_size = size;
+ abd->abd_parent = NULL;
+ refcount_create(&abd->abd_children);
+
+ if (is_metadata) {
+ abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size);
+ } else {
+ abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size);
+ }
+
+ ABDSTAT_BUMP(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, size);
+
+ return (abd);
+}
+
+static void
+abd_free_linear(abd_t *abd)
+{
+ if (abd->abd_flags & ABD_FLAG_META) {
+ zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
+ } else {
+ zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
+ }
+
+ refcount_destroy(&abd->abd_children);
+ ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
+
+ abd_free_struct(abd);
+}
+
+/*
+ * Free an ABD. Only use this on ABDs allocated with abd_alloc() or
+ * abd_alloc_linear().
+ */
+void
+abd_free(abd_t *abd)
+{
+ abd_verify(abd);
+ ASSERT3P(abd->abd_parent, ==, NULL);
+ ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
+ if (abd_is_linear(abd))
+ abd_free_linear(abd);
+ else
+ abd_free_scatter(abd);
+}
+
+/*
+ * Allocate an ABD of the same format (same metadata flag, same scatterize
+ * setting) as another ABD.
+ */
+abd_t *
+abd_alloc_sametype(abd_t *sabd, size_t size)
+{
+ boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
+ if (abd_is_linear(sabd)) {
+ return (abd_alloc_linear(size, is_metadata));
+ } else {
+ return (abd_alloc(size, is_metadata));
+ }
+}
+
+/*
+ * If we're going to use this ABD for doing I/O using the block layer, the
+ * consumer of the ABD data doesn't care if it's scattered or not, and we don't
+ * plan to store this ABD in memory for a long period of time, we should
+ * allocate the ABD type that requires the least data copying to do the I/O.
+ *
+ * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os
+ * using a scatter/gather list we should switch to that and replace this call
+ * with vanilla abd_alloc().
+ */
+abd_t *
+abd_alloc_for_io(size_t size, boolean_t is_metadata)
+{
+ return (abd_alloc_linear(size, is_metadata));
+}
+
+/*
+ * Allocate a new ABD to point to offset off of sabd. It shares the underlying
+ * buffer data with sabd. Use abd_put() to free. sabd must not be freed while
+ * any derived ABDs exist.
+ */
+abd_t *
+abd_get_offset(abd_t *sabd, size_t off)
+{
+ abd_t *abd;
+
+ abd_verify(sabd);
+ ASSERT3U(off, <=, sabd->abd_size);
+
+ if (abd_is_linear(sabd)) {
+ abd = abd_alloc_struct(0);
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that
+ * if we own the underlying data buffer, which is not true in
+ * this case. Therefore, we don't ever use ABD_FLAG_META here.
+ */
+ abd->abd_flags = ABD_FLAG_LINEAR;
+
+ abd->abd_u.abd_linear.abd_buf =
+ (char *)sabd->abd_u.abd_linear.abd_buf + off;
+ } else {
+ size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off;
+ size_t chunkcnt = abd_scatter_chunkcnt(sabd) -
+ (new_offset / zfs_abd_chunk_size);
+
+ abd = abd_alloc_struct(chunkcnt);
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that
+ * if we own the underlying data buffer, which is not true in
+ * this case. Therefore, we don't ever use ABD_FLAG_META here.
+ */
+ abd->abd_flags = 0;
+
+ abd->abd_u.abd_scatter.abd_offset =
+ new_offset % zfs_abd_chunk_size;
+ abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size;
+
+ /* Copy the scatterlist starting at the correct offset */
+ (void) memcpy(&abd->abd_u.abd_scatter.abd_chunks,
+ &sabd->abd_u.abd_scatter.abd_chunks[new_offset /
+ zfs_abd_chunk_size],
+ chunkcnt * sizeof (void *));
+ }
+
+ abd->abd_size = sabd->abd_size - off;
+ abd->abd_parent = sabd;
+ refcount_create(&abd->abd_children);
+ (void) refcount_add_many(&sabd->abd_children, abd->abd_size, abd);
+
+ return (abd);
+}
+
+/*
+ * Allocate a linear ABD structure for buf. You must free this with abd_put()
+ * since the resulting ABD doesn't own its own buffer.
+ */
+abd_t *
+abd_get_from_buf(void *buf, size_t size)
+{
+ abd_t *abd = abd_alloc_struct(0);
+
+ VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that if we
+ * own the underlying data buffer, which is not true in this case.
+ * Therefore, we don't ever use ABD_FLAG_META here.
+ */
+ abd->abd_flags = ABD_FLAG_LINEAR;
+ abd->abd_size = size;
+ abd->abd_parent = NULL;
+ refcount_create(&abd->abd_children);
+
+ abd->abd_u.abd_linear.abd_buf = buf;
+
+ return (abd);
+}
+
+/*
+ * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not
+ * free the underlying scatterlist or buffer.
+ */
+void
+abd_put(abd_t *abd)
+{
+ abd_verify(abd);
+ ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
+
+ if (abd->abd_parent != NULL) {
+ (void) refcount_remove_many(&abd->abd_parent->abd_children,
+ abd->abd_size, abd);
+ }
+
+ refcount_destroy(&abd->abd_children);
+ abd_free_struct(abd);
+}
+
+/*
+ * Get the raw buffer associated with a linear ABD.
+ */
+void *
+abd_to_buf(abd_t *abd)
+{
+ ASSERT(abd_is_linear(abd));
+ abd_verify(abd);
+ return (abd->abd_u.abd_linear.abd_buf);
+}
+
+/*
+ * Borrow a raw buffer from an ABD without copying the contents of the ABD
+ * into the buffer. If the ABD is scattered, this will allocate a raw buffer
+ * whose contents are undefined. To copy over the existing data in the ABD, use
+ * abd_borrow_buf_copy() instead.
+ */
+void *
+abd_borrow_buf(abd_t *abd, size_t n)
+{
+ void *buf;
+ abd_verify(abd);
+ ASSERT3U(abd->abd_size, >=, n);
+ if (abd_is_linear(abd)) {
+ buf = abd_to_buf(abd);
+ } else {
+ buf = zio_buf_alloc(n);
+ }
+ (void) refcount_add_many(&abd->abd_children, n, buf);
+
+ return (buf);
+}
+
+void *
+abd_borrow_buf_copy(abd_t *abd, size_t n)
+{
+ void *buf = abd_borrow_buf(abd, n);
+ if (!abd_is_linear(abd)) {
+ abd_copy_to_buf(buf, abd, n);
+ }
+ return (buf);
+}
+
+/*
+ * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
+ * not change the contents of the ABD and will ASSERT that you didn't modify
+ * the buffer since it was borrowed. If you want any changes you made to buf to
+ * be copied back to abd, use abd_return_buf_copy() instead.
+ */
+void
+abd_return_buf(abd_t *abd, void *buf, size_t n)
+{
+ abd_verify(abd);
+ ASSERT3U(abd->abd_size, >=, n);
+ if (abd_is_linear(abd)) {
+ ASSERT3P(buf, ==, abd_to_buf(abd));
+ } else {
+ ASSERT0(abd_cmp_buf(abd, buf, n));
+ zio_buf_free(buf, n);
+ }
+ (void) refcount_remove_many(&abd->abd_children, n, buf);
+}
+
+void
+abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
+{
+ if (!abd_is_linear(abd)) {
+ abd_copy_from_buf(abd, buf, n);
+ }
+ abd_return_buf(abd, buf, n);
+}
+
+/*
+ * Make this ABD the owner of the buffer it stores. Only valid for linear ABDs
+ * that do not currently own their buffer: ones created by abd_get_from_buf(),
+ * or ones allocated with abd_alloc_linear() that later gave up ownership via
+ * abd_release_ownership_of_buf().
+ */
+void
+abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
+{
+	ASSERT(abd_is_linear(abd));
+	ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
+	abd_verify(abd);
+
+	if (is_metadata)
+		abd->abd_flags |= (ABD_FLAG_OWNER | ABD_FLAG_META);
+	else
+		abd->abd_flags |= ABD_FLAG_OWNER;
+
+	/* The buffer is now counted in the linear ABD statistics. */
+	ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
+	ABDSTAT_BUMP(abdstat_linear_cnt);
+}
+
+/*
+ * Give up this linear ABD's ownership of its buffer and remove the buffer
+ * from the linear ABD statistics.
+ */
+void
+abd_release_ownership_of_buf(abd_t *abd)
+{
+	ASSERT(abd_is_linear(abd));
+	ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
+	abd_verify(abd);
+
+	/*
+	 * ABD_FLAG_META is cleared together with ABD_FLAG_OWNER since we no
+	 * longer own the data buffer.
+	 */
+	abd->abd_flags &= ~(ABD_FLAG_OWNER | ABD_FLAG_META);
+
+	ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
+	ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
+}
+
+/*
+ * Iteration state for walking the data of an ABD: the ABD itself, the current
+ * position, and the region that is currently mapped (iter_mapaddr is NULL and
+ * iter_mapsize is 0 while nothing is mapped).
+ */
+struct abd_iter {
+ abd_t *iter_abd; /* ABD being iterated through */
+ size_t iter_pos; /* position (relative to abd_offset) */
+ void *iter_mapaddr; /* addr corresponding to iter_pos */
+ size_t iter_mapsize; /* length of data valid at mapaddr */
+};
+
+/*
+ * Byte offset, within its chunk, of the iterator's current position in a
+ * scattered ABD.
+ */
+static inline size_t
+abd_iter_scatter_chunk_offset(struct abd_iter *aiter)
+{
+	abd_t *abd = aiter->iter_abd;
+	size_t abs_pos;
+
+	ASSERT(!abd_is_linear(abd));
+
+	abs_pos = abd->abd_u.abd_scatter.abd_offset + aiter->iter_pos;
+	return (abs_pos % zfs_abd_chunk_size);
+}
+
+/*
+ * Index of the chunk that holds the iterator's current position in a
+ * scattered ABD.
+ */
+static inline size_t
+abd_iter_scatter_chunk_index(struct abd_iter *aiter)
+{
+	abd_t *abd = aiter->iter_abd;
+	size_t abs_pos;
+
+	ASSERT(!abd_is_linear(abd));
+
+	abs_pos = abd->abd_u.abd_scatter.abd_offset + aiter->iter_pos;
+	return (abs_pos / zfs_abd_chunk_size);
+}
+
+/*
+ * Set up aiter to walk abd from position 0 with nothing mapped.
+ */
+static void
+abd_iter_init(struct abd_iter *aiter, abd_t *abd)
+{
+	abd_verify(abd);
+
+	aiter->iter_mapsize = 0;
+	aiter->iter_mapaddr = NULL;
+	aiter->iter_pos = 0;
+	aiter->iter_abd = abd;
+}
+
+/*
+ * Move the iterator forward by amount bytes. Nothing may be mapped when this
+ * is called. Calling it on an iterator that is already exhausted is safe and
+ * leaves the iterator unchanged.
+ */
+static void
+abd_iter_advance(struct abd_iter *aiter, size_t amount)
+{
+	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+	ASSERT0(aiter->iter_mapsize);
+
+	/* Only advance while there is still something left to advance to. */
+	if (aiter->iter_pos != aiter->iter_abd->abd_size)
+		aiter->iter_pos += amount;
+}
+
+/*
+ * Map the data at the iterator's current position into aiter by setting
+ * iter_mapaddr and iter_mapsize. Calling this on an iterator that is already
+ * exhausted is safe and maps nothing.
+ */
+static void
+abd_iter_map(struct abd_iter *aiter)
+{
+	abd_t *abd = aiter->iter_abd;
+
+	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+	ASSERT0(aiter->iter_mapsize);
+
+	/* Panic if someone has changed zfs_abd_chunk_size */
+	IMPLY(!abd_is_linear(abd), zfs_abd_chunk_size ==
+	    abd->abd_u.abd_scatter.abd_chunk_size);
+
+	/* There's nothing left to iterate over, so do nothing */
+	if (aiter->iter_pos == abd->abd_size)
+		return;
+
+	if (abd_is_linear(abd)) {
+		/* Everything from the current position to the end is valid. */
+		size_t pos = aiter->iter_pos;
+
+		aiter->iter_mapsize = abd->abd_size - pos;
+		aiter->iter_mapaddr =
+		    (char *)abd->abd_u.abd_linear.abd_buf + pos;
+	} else {
+		/* Map from the current position to the end of its chunk. */
+		size_t chunk = abd_iter_scatter_chunk_index(aiter);
+		size_t within = abd_iter_scatter_chunk_offset(aiter);
+
+		aiter->iter_mapsize = zfs_abd_chunk_size - within;
+		aiter->iter_mapaddr =
+		    (char *)abd->abd_u.abd_scatter.abd_chunks[chunk] + within;
+	}
+}
+
+/*
+ * Drop the current mapping from aiter. Calling this on an iterator that is
+ * already exhausted is safe and does nothing.
+ */
+static void
+abd_iter_unmap(struct abd_iter *aiter)
+{
+	/* abd_iter_map() maps nothing once the iterator is exhausted. */
+	if (aiter->iter_pos != aiter->iter_abd->abd_size) {
+		ASSERT3P(aiter->iter_mapaddr, !=, NULL);
+		ASSERT3U(aiter->iter_mapsize, >, 0);
+
+		aiter->iter_mapsize = 0;
+		aiter->iter_mapaddr = NULL;
+	}
+}
+
+/*
+ * Call func on each contiguous piece of abd in the range [off, off + size).
+ * Iteration stops early if func returns nonzero, and that value is returned;
+ * otherwise 0 is returned.
+ */
+int
+abd_iterate_func(abd_t *abd, size_t off, size_t size,
+ abd_iter_func_t *func, void *private)
+{
+ int ret = 0;
+ struct abd_iter aiter;
+
+ abd_verify(abd);
+ ASSERT3U(off + size, <=, abd->abd_size);
+
+ abd_iter_init(&aiter, abd);
+ abd_iter_advance(&aiter, off);
+
+ while (size > 0) {
+ abd_iter_map(&aiter);
+
+ /* Never hand func more than the caller asked for. */
+ size_t len = MIN(aiter.iter_mapsize, size);
+ ASSERT3U(len, >, 0);
+
+ ret = func(aiter.iter_mapaddr, len, private);
+
+ /* Unmap before advancing; abd_iter_advance() requires it. */
+ abd_iter_unmap(&aiter);
+
+ if (ret != 0)
+ break;
+
+ size -= len;
+ abd_iter_advance(&aiter, len);
+ }
+
+ return (ret);
+}
+
+/*
+ * Private argument for the raw-buffer iteration callbacks: arg_buf is the
+ * current position in the raw buffer, moved forward by each callback call.
+ */
+struct buf_arg {
+ void *arg_buf;
+};
+
+/*
+ * abd_iterate_func() callback: copy one piece of ABD data (buf, size) to the
+ * raw buffer position held in private, then move that position forward.
+ */
+static int
+abd_copy_to_buf_off_cb(void *buf, size_t size, void *private)
+{
+	struct buf_arg *dst = private;
+	char *out = dst->arg_buf;
+
+	(void) memcpy(out, buf, size);
+	dst->arg_buf = out + size;
+
+	return (0);
+}
+
+/*
+ * Copy size bytes from abd, starting at offset off within abd, into buf.
+ */
+void
+abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size)
+{
+	struct buf_arg dst;
+
+	dst.arg_buf = buf;
+	(void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb,
+	    &dst);
+}
+
+/*
+ * abd_iterate_func() callback: compare one piece of ABD data (buf, size) with
+ * the raw buffer position held in private, then move that position forward.
+ * Returns the memcmp() result, so a nonzero value stops the iteration.
+ */
+static int
+abd_cmp_buf_off_cb(void *buf, size_t size, void *private)
+{
+	struct buf_arg *rhs = private;
+	const void *other = rhs->arg_buf;
+
+	rhs->arg_buf = (char *)rhs->arg_buf + size;
+
+	return (memcmp(buf, other, size));
+}
+
+/*
+ * Compare size bytes of abd, starting at offset off within abd, against buf.
+ * Returns 0 when they match, otherwise the nonzero memcmp() result of the
+ * first piece that differs.
+ */
+int
+abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
+{
+	struct buf_arg rhs;
+
+	rhs.arg_buf = (void *) buf;
+
+	return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &rhs));
+}
+
+/*
+ * abd_iterate_func() callback: fill one piece of ABD data (buf, size) from
+ * the raw buffer position held in private, then move that position forward.
+ */
+static int
+abd_copy_from_buf_off_cb(void *buf, size_t size, void *private)
+{
+	struct buf_arg *src = private;
+	char *in = src->arg_buf;
+
+	(void) memcpy(buf, in, size);
+	src->arg_buf = in + size;
+
+	return (0);
+}
+
+/*
+ * Copy size bytes from buf into abd, starting at offset off within abd.
+ */
+void
+abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
+{
+	struct buf_arg src;
+
+	src.arg_buf = (void *) buf;
+	(void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb,
+	    &src);
+}
+
+/*
+ * abd_iterate_func() callback: zero one piece of ABD data (buf, size). The
+ * private argument is unused.
+ */
+/*ARGSUSED*/
+static int
+abd_zero_off_cb(void *buf, size_t size, void *private)
+{
+ (void) memset(buf, 0, size);
+ return (0);
+}
+
+/*
+ * Zero out size bytes of the abd, starting at offset off. Only the range
+ * [off, off + size) is touched, not everything up to the end of the abd.
+ */
+void
+abd_zero_off(abd_t *abd, size_t off, size_t size)
+{
+ (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL);
+}
+
+/*
+ * Iterate over two ABDs in lockstep and call func incrementally on the two
+ * ABDs' data in equal-sized pieces (passed to func as raw buffers). func may
+ * be called many times during this iteration.
+ *
+ * doff and soff are the starting offsets within dabd and sabd, and size is
+ * the number of bytes to cover in each. Iteration stops early if func returns
+ * nonzero, and that value is returned; otherwise 0 is returned.
+ */
+int
+abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
+    size_t size, abd_iter_func2_t *func, void *private)
+{
+	int ret = 0;
+	struct abd_iter daiter, saiter;
+
+	abd_verify(dabd);
+	abd_verify(sabd);
+
+	ASSERT3U(doff + size, <=, dabd->abd_size);
+	ASSERT3U(soff + size, <=, sabd->abd_size);
+
+	abd_iter_init(&daiter, dabd);
+	abd_iter_init(&saiter, sabd);
+	abd_iter_advance(&daiter, doff);
+	abd_iter_advance(&saiter, soff);
+
+	while (size > 0) {
+		abd_iter_map(&daiter);
+		abd_iter_map(&saiter);
+
+		/* Use the largest piece that is mapped on both sides. */
+		size_t dlen = MIN(daiter.iter_mapsize, size);
+		size_t slen = MIN(saiter.iter_mapsize, size);
+		size_t len = MIN(dlen, slen);
+
+		/*
+		 * Both sides must have data mapped. If either were empty,
+		 * len would be 0, size would never shrink, and this loop
+		 * would not terminate.
+		 */
+		ASSERT3U(len, >, 0);
+
+		ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len,
+		    private);
+
+		/* Unmap before advancing; abd_iter_advance() requires it. */
+		abd_iter_unmap(&saiter);
+		abd_iter_unmap(&daiter);
+
+		if (ret != 0)
+			break;
+
+		size -= len;
+		abd_iter_advance(&daiter, len);
+		abd_iter_advance(&saiter, len);
+	}
+
+	return (ret);
+}
+
+/*
+ * abd_iterate_func2() callback: copy one piece of size bytes from sbuf to
+ * dbuf. The private argument is unused.
+ */
+/*ARGSUSED*/
+static int
+abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
+{
+ (void) memcpy(dbuf, sbuf, size);
+ return (0);
+}
+
+/*
+ * Copy size bytes from sabd to dabd, reading from offset soff in sabd and
+ * writing at offset doff in dabd.
+ */
+void
+abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
+{
+ (void) abd_iterate_func2(dabd, sabd, doff, soff, size,
+ abd_copy_off_cb, NULL);
+}
+
+/*
+ * abd_iterate_func2() callback: compare one piece of size bytes from each
+ * ABD. Returns the memcmp() result, so a nonzero value stops the iteration.
+ * The private argument is unused.
+ */
+/*ARGSUSED*/
+static int
+abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
+{
+ return (memcmp(bufa, bufb, size));
+}
+
+/*
+ * Compares the first size bytes of two ABDs. Returns 0 when they are equal,
+ * otherwise the nonzero memcmp() result of the first piece that differs.
+ */
+int
+abd_cmp(abd_t *dabd, abd_t *sabd, size_t size)
+{
+ return (abd_iterate_func2(dabd, sabd, 0, 0, size, abd_cmp_cb, NULL));
+}
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index 73c568c0b6..b10ea0bc69 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -128,14 +128,14 @@
* the arc_buf_hdr_t that will point to the data block in memory. A block can
* only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
* caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
- * also in the arc_buf_hdr_t's private physical data block pointer (b_pdata).
+ * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
*
* The L1ARC's data pointer may or may not be uncompressed. The ARC has the
- * ability to store the physical data (b_pdata) associated with the DVA of the
- * arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk physical block,
+ * ability to store the physical data (b_pabd) associated with the DVA of the
+ * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
* it will match its on-disk compression characteristics. This behavior can be
* disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
- * compressed ARC functionality is disabled, the b_pdata will point to an
+ * compressed ARC functionality is disabled, the b_pabd will point to an
* uncompressed version of the on-disk data.
*
* Data in the L1ARC is not accessed by consumers of the ARC directly. Each
@@ -174,7 +174,7 @@
* | l1arc_buf_hdr_t
* | | arc_buf_t
* | b_buf +------------>+-----------+ arc_buf_t
- * | b_pdata +-+ |b_next +---->+-----------+
+ * | b_pabd +-+ |b_next +---->+-----------+
* +-----------+ | |-----------| |b_next +-->NULL
* | |b_comp = T | +-----------+
* | |b_data +-+ |b_comp = F |
@@ -191,8 +191,8 @@
* When a consumer reads a block, the ARC must first look to see if the
* arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
* arc_buf_t and either copies uncompressed data into a new data buffer from an
- * existing uncompressed arc_buf_t, decompresses the hdr's b_pdata buffer into a
- * new data buffer, or shares the hdr's b_pdata buffer, depending on whether the
+ * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
+ * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
* hdr is compressed and the desired compression characteristics of the
* arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
* arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
@@ -216,7 +216,7 @@
* | | arc_buf_t (shared)
* | b_buf +------------>+---------+ arc_buf_t
* | | |b_next +---->+---------+
- * | b_pdata +-+ |---------| |b_next +-->NULL
+ * | b_pabd +-+ |---------| |b_next +-->NULL
* +-----------+ | | | +---------+
* | |b_data +-+ | |
* | +---------+ | |b_data +-+
@@ -230,19 +230,19 @@
* | +------+ |
* +---------------------------------+
*
- * Writing to the ARC requires that the ARC first discard the hdr's b_pdata
+ * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
* since the physical block is about to be rewritten. The new data contents
* will be contained in the arc_buf_t. As the I/O pipeline performs the write,
* it may compress the data before writing it to disk. The ARC will be called
* with the transformed data and will bcopy the transformed on-disk block into
- * a newly allocated b_pdata. Writes are always done into buffers which have
+ * a newly allocated b_pabd. Writes are always done into buffers which have
* either been loaned (and hence are new and don't have other readers) or
* buffers which have been released (and hence have their own hdr, if there
* were originally other readers of the buf's original hdr). This ensures that
* the ARC only needs to update a single buf and its hdr after a write occurs.
*
- * When the L2ARC is in use, it will also take advantage of the b_pdata. The
- * L2ARC will always write the contents of b_pdata to the L2ARC. This means
+ * When the L2ARC is in use, it will also take advantage of the b_pabd. The
+ * L2ARC will always write the contents of b_pabd to the L2ARC. This means
* that when compressed ARC is enabled that the L2ARC blocks are identical
* to the on-disk block in the main data pool. This provides a significant
* advantage since the ARC can leverage the bp's checksum when reading from the
@@ -264,7 +264,9 @@
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
#include <sys/zfs_zone.h>
+#include <sys/zio_checksum.h>
#include <sys/multilist.h>
+#include <sys/abd.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
@@ -300,7 +302,7 @@ int zfs_arc_evict_batch_limit = 10;
/* number of seconds before growing cache again */
static int arc_grow_retry = 60;
-/* shift of arc_c for calculating overflow limit in arc_get_data_buf */
+/* shift of arc_c for calculating overflow limit in arc_get_data_impl */
int zfs_arc_overflow_shift = 8;
/* shift of arc_c for calculating both min and max arc_p */
@@ -463,13 +465,13 @@ typedef struct arc_stats {
kstat_named_t arcstat_c_max;
kstat_named_t arcstat_size;
/*
- * Number of compressed bytes stored in the arc_buf_hdr_t's b_pdata.
+ * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
* Note that the compressed bytes may match the uncompressed bytes
* if the block is either not compressed or compressed arc is disabled.
*/
kstat_named_t arcstat_compressed_size;
/*
- * Uncompressed size of the data stored in b_pdata. If compressed
+ * Uncompressed size of the data stored in b_pabd. If compressed
* arc is disabled then this value will be identical to the stat
* above.
*/
@@ -883,7 +885,7 @@ typedef struct l1arc_buf_hdr {
refcount_t b_refcnt;
arc_callback_t *b_acb;
- void *b_pdata;
+ abd_t *b_pabd;
} l1arc_buf_hdr_t;
typedef struct l2arc_dev l2arc_dev_t;
@@ -1083,7 +1085,7 @@ typedef struct l2arc_write_callback {
typedef struct l2arc_data_free {
/* protected by l2arc_free_on_write_mtx */
- void *l2df_data;
+ abd_t *l2df_abd;
size_t l2df_size;
arc_buf_contents_t l2df_type;
list_node_t l2df_list_node;
@@ -1093,10 +1095,14 @@ static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;
+static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *);
static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
+static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *);
+static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
-static void arc_hdr_free_pdata(arc_buf_hdr_t *hdr);
-static void arc_hdr_alloc_pdata(arc_buf_hdr_t *);
+static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
+static void arc_hdr_free_pabd(arc_buf_hdr_t *);
+static void arc_hdr_alloc_pabd(arc_buf_hdr_t *);
static void arc_access(arc_buf_hdr_t *, kmutex_t *);
static boolean_t arc_is_overflowing();
static void arc_buf_watch(arc_buf_t *);
@@ -1436,7 +1442,9 @@ static inline boolean_t
arc_buf_is_shared(arc_buf_t *buf)
{
boolean_t shared = (buf->b_data != NULL &&
- buf->b_data == buf->b_hdr->b_l1hdr.b_pdata);
+ buf->b_hdr->b_l1hdr.b_pabd != NULL &&
+ abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
+ buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
IMPLY(shared, ARC_BUF_SHARED(buf));
IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
@@ -1540,7 +1548,8 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
uint64_t csize;
void *cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr));
- csize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
+ csize = zio_compress_data(compress, zio->io_abd, cbuf, lsize);
+
ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr));
if (csize < HDR_GET_PSIZE(hdr)) {
/*
@@ -1575,7 +1584,7 @@ arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
* logical I/O size and not just a gang fragment.
*/
valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
- BP_GET_CHECKSUM(zio->io_bp), zio->io_data, zio->io_size,
+ BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
zio->io_offset, NULL) == 0);
zio_pop_transforms(zio);
return (valid_cksum);
@@ -1873,7 +1882,7 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed)
if (hdr_compressed == compressed) {
if (!arc_buf_is_shared(buf)) {
- bcopy(hdr->b_l1hdr.b_pdata, buf->b_data,
+ abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
arc_buf_size(buf));
}
} else {
@@ -1925,7 +1934,7 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed)
return (0);
} else {
int error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
- hdr->b_l1hdr.b_pdata, buf->b_data,
+ hdr->b_l1hdr.b_pabd, buf->b_data,
HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
/*
@@ -1962,7 +1971,7 @@ arc_decompress(arc_buf_t *buf)
}
/*
- * Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t.
+ * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
*/
static uint64_t
arc_hdr_size(arc_buf_hdr_t *hdr)
@@ -1994,14 +2003,14 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
if (GHOST_STATE(state)) {
ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
(void) refcount_add_many(&state->arcs_esize[type],
HDR_GET_LSIZE(hdr), hdr);
return;
}
ASSERT(!GHOST_STATE(state));
- if (hdr->b_l1hdr.b_pdata != NULL) {
+ if (hdr->b_l1hdr.b_pabd != NULL) {
(void) refcount_add_many(&state->arcs_esize[type],
arc_hdr_size(hdr), hdr);
}
@@ -2029,14 +2038,14 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
if (GHOST_STATE(state)) {
ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
(void) refcount_remove_many(&state->arcs_esize[type],
HDR_GET_LSIZE(hdr), hdr);
return;
}
ASSERT(!GHOST_STATE(state));
- if (hdr->b_l1hdr.b_pdata != NULL) {
+ if (hdr->b_l1hdr.b_pabd != NULL) {
(void) refcount_remove_many(&state->arcs_esize[type],
arc_hdr_size(hdr), hdr);
}
@@ -2133,7 +2142,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
old_state = hdr->b_l1hdr.b_state;
refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
bufcnt = hdr->b_l1hdr.b_bufcnt;
- update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pdata != NULL);
+ update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL);
} else {
old_state = arc_l2c_only;
refcnt = 0;
@@ -2203,7 +2212,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
*/
(void) refcount_add_many(&new_state->arcs_size,
HDR_GET_LSIZE(hdr), hdr);
- ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
} else {
uint32_t buffers = 0;
@@ -2232,7 +2241,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
}
ASSERT3U(bufcnt, ==, buffers);
- if (hdr->b_l1hdr.b_pdata != NULL) {
+ if (hdr->b_l1hdr.b_pabd != NULL) {
(void) refcount_add_many(&new_state->arcs_size,
arc_hdr_size(hdr), hdr);
} else {
@@ -2245,7 +2254,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
ASSERT(HDR_HAS_L1HDR(hdr));
if (GHOST_STATE(old_state)) {
ASSERT0(bufcnt);
- ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
/*
* When moving a header off of a ghost state,
@@ -2285,7 +2294,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
buf);
}
ASSERT3U(bufcnt, ==, buffers);
- ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
(void) refcount_remove_many(
&old_state->arcs_size, arc_hdr_size(hdr), hdr);
}
@@ -2367,7 +2376,7 @@ arc_space_return(uint64_t space, arc_space_type_t type)
/*
* Given a hdr and a buf, returns whether that buf can share its b_data buffer
- * with the hdr's b_pdata.
+ * with the hdr's b_pabd.
*/
static boolean_t
arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
@@ -2444,20 +2453,23 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed,
/*
* If the hdr's data can be shared then we share the data buffer and
* set the appropriate bit in the hdr's b_flags to indicate the hdr is
- * sharing it's b_pdata with the arc_buf_t. Otherwise, we allocate a new
+ * sharing its b_pabd with the arc_buf_t. Otherwise, we allocate a new
* buffer to store the buf's data.
*
- * There is one additional restriction here because we're sharing
- * hdr -> buf instead of the usual buf -> hdr: the hdr can't be actively
- * involved in an L2ARC write, because if this buf is used by an
- * arc_write() then the hdr's data buffer will be released when the
+ * There are two additional restrictions here because we're sharing
+ * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
+ * actively involved in an L2ARC write, because if this buf is used by
+ * an arc_write() then the hdr's data buffer will be released when the
* write completes, even though the L2ARC write might still be using it.
+ * Second, the hdr's ABD must be linear so that the buf's user doesn't
+ * need to be ABD-aware.
*/
- boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr);
+ boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
+ abd_is_linear(hdr->b_l1hdr.b_pabd);
/* Set up b_data and sharing */
if (can_share) {
- buf->b_data = hdr->b_l1hdr.b_pdata;
+ buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
buf->b_flags |= ARC_BUF_FLAG_SHARED;
arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
} else {
@@ -2553,11 +2565,11 @@ arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
}
static void
-l2arc_free_data_on_write(void *data, size_t size, arc_buf_contents_t type)
+l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
{
l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
- df->l2df_data = data;
+ df->l2df_abd = abd;
df->l2df_size = size;
df->l2df_type = type;
mutex_enter(&l2arc_free_on_write_mtx);
@@ -2588,7 +2600,7 @@ arc_hdr_free_on_write(arc_buf_hdr_t *hdr)
arc_space_return(size, ARC_SPACE_DATA);
}
- l2arc_free_data_on_write(hdr->b_l1hdr.b_pdata, size, type);
+ l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type);
}
/*
@@ -2602,7 +2614,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
arc_state_t *state = hdr->b_l1hdr.b_state;
ASSERT(arc_can_share(hdr, buf));
- ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
/*
@@ -2611,7 +2623,9 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
* the refcount whenever an arc_buf_t is shared.
*/
refcount_transfer_ownership(&state->arcs_size, buf, hdr);
- hdr->b_l1hdr.b_pdata = buf->b_data;
+ hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
+ abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
+ HDR_ISTYPE_METADATA(hdr));
arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
buf->b_flags |= ARC_BUF_FLAG_SHARED;
@@ -2631,7 +2645,7 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
arc_state_t *state = hdr->b_l1hdr.b_state;
ASSERT(arc_buf_is_shared(buf));
- ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
/*
@@ -2640,7 +2654,9 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
*/
refcount_transfer_ownership(&state->arcs_size, hdr, buf);
arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
- hdr->b_l1hdr.b_pdata = NULL;
+ abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
+ abd_put(hdr->b_l1hdr.b_pabd);
+ hdr->b_l1hdr.b_pabd = NULL;
buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
/*
@@ -2735,7 +2751,7 @@ arc_buf_destroy_impl(arc_buf_t *buf)
if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
/*
* If the current arc_buf_t is sharing its data buffer with the
- * hdr, then reassign the hdr's b_pdata to share it with the new
+ * hdr, then reassign the hdr's b_pabd to share it with the new
* buffer at the end of the list. The shared buffer is always
* the last one on the hdr's buffer list.
*
@@ -2750,8 +2766,8 @@ arc_buf_destroy_impl(arc_buf_t *buf)
/* hdr is uncompressed so can't have compressed buf */
VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
- ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
- arc_hdr_free_pdata(hdr);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ arc_hdr_free_pabd(hdr);
/*
* We must setup a new shared block between the
@@ -2789,26 +2805,26 @@ arc_buf_destroy_impl(arc_buf_t *buf)
}
static void
-arc_hdr_alloc_pdata(arc_buf_hdr_t *hdr)
+arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr)
{
ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(!HDR_SHARED_DATA(hdr));
- ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
- hdr->b_l1hdr.b_pdata = arc_get_data_buf(hdr, arc_hdr_size(hdr), hdr);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr);
hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
- ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
}
static void
-arc_hdr_free_pdata(arc_buf_hdr_t *hdr)
+arc_hdr_free_pabd(arc_buf_hdr_t *hdr)
{
ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
/*
* If the hdr is currently being written to the l2arc then
@@ -2820,10 +2836,10 @@ arc_hdr_free_pdata(arc_buf_hdr_t *hdr)
arc_hdr_free_on_write(hdr);
ARCSTAT_BUMP(arcstat_l2_free_on_write);
} else {
- arc_free_data_buf(hdr, hdr->b_l1hdr.b_pdata,
+ arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
arc_hdr_size(hdr), hdr);
}
- hdr->b_l1hdr.b_pdata = NULL;
+ hdr->b_l1hdr.b_pabd = NULL;
hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
@@ -2860,7 +2876,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
* the compressed or uncompressed data depending on the block
* it references and compressed arc enablement.
*/
- arc_hdr_alloc_pdata(hdr);
+ arc_hdr_alloc_pabd(hdr);
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
return (hdr);
@@ -2901,7 +2917,7 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
nhdr->b_l1hdr.b_state = arc_l2c_only;
/* Verify previous threads set to NULL before freeing */
- ASSERT3P(nhdr->b_l1hdr.b_pdata, ==, NULL);
+ ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
} else {
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT0(hdr->b_l1hdr.b_bufcnt);
@@ -2919,11 +2935,11 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
/*
* A buffer must not be moved into the arc_l2c_only
* state if it's not finished being written out to the
- * l2arc device. Otherwise, the b_l1hdr.b_pdata field
+ * l2arc device. Otherwise, the b_l1hdr.b_pabd field
* might try to be accessed, even though it was removed.
*/
VERIFY(!HDR_L2_WRITING(hdr));
- VERIFY3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);
#ifdef ZFS_DEBUG
if (hdr->b_l1hdr.b_thawed != NULL) {
@@ -3012,6 +3028,18 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
arc_buf_thaw(buf);
ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+ if (!arc_buf_is_shared(buf)) {
+ /*
+ * To ensure that the hdr has the correct data in it if we call
+ * arc_decompress() on this buf before it's been written to
+ * disk, it's easiest if we just set up sharing between the
+ * buf and the hdr.
+ */
+ ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd));
+ arc_hdr_free_pabd(hdr);
+ arc_share_buf(hdr, buf);
+ }
+
return (buf);
}
@@ -3087,8 +3115,8 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
}
#endif
- if (hdr->b_l1hdr.b_pdata != NULL) {
- arc_hdr_free_pdata(hdr);
+ if (hdr->b_l1hdr.b_pabd != NULL) {
+ arc_hdr_free_pabd(hdr);
}
}
@@ -3156,7 +3184,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
/*
* l2arc_write_buffers() relies on a header's L1 portion
- * (i.e. its b_pdata field) during its write phase.
+ * (i.e. its b_pabd field) during its write phase.
* Thus, we cannot push a header onto the arc_l2c_only
* state (removing it's L1 piece) until the header is
* done being written to the l2arc.
@@ -3171,7 +3199,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
- ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
if (HDR_HAS_L2HDR(hdr)) {
/*
* This buffer is cached on the 2nd Level ARC;
@@ -3237,9 +3265,9 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* If this hdr is being evicted and has a compressed
* buffer then we discard it here before we change states.
* This ensures that the accounting is updated correctly
- * in arc_free_data_buf().
+ * in arc_free_data_impl().
*/
- arc_hdr_free_pdata(hdr);
+ arc_hdr_free_pabd(hdr);
arc_change_state(evicted_state, hdr, hash_lock);
ASSERT(HDR_IN_HASH_TABLE(hdr));
@@ -3337,7 +3365,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
* thread. If we used cv_broadcast, we could
* wake up "too many" threads causing arc_size
* to significantly overflow arc_c; since
- * arc_get_data_buf() doesn't check for overflow
+ * arc_get_data_impl() doesn't check for overflow
* when it's woken up (it doesn't because it's
* possible for the ARC to be overflowing while
* full of un-evictable buffers, and the
@@ -4000,6 +4028,7 @@ arc_kmem_reap_now(void)
extern kmem_cache_t *zio_buf_cache[];
extern kmem_cache_t *zio_data_buf_cache[];
extern kmem_cache_t *range_seg_cache;
+ extern kmem_cache_t *abd_chunk_cache;
#ifdef _KERNEL
if (arc_meta_used >= arc_meta_limit) {
@@ -4027,6 +4056,7 @@ arc_kmem_reap_now(void)
kmem_cache_reap_now(zio_data_buf_cache[i]);
}
}
+ kmem_cache_reap_now(abd_chunk_cache);
kmem_cache_reap_now(buf_cache);
kmem_cache_reap_now(hdr_full_cache);
kmem_cache_reap_now(hdr_l2only_cache);
@@ -4042,13 +4072,13 @@ arc_kmem_reap_now(void)
}
/*
- * Threads can block in arc_get_data_buf() waiting for this thread to evict
+ * Threads can block in arc_get_data_impl() waiting for this thread to evict
* enough data and signal them to proceed. When this happens, the threads in
- * arc_get_data_buf() are sleeping while holding the hash lock for their
+ * arc_get_data_impl() are sleeping while holding the hash lock for their
* particular arc header. Thus, we must be careful to never sleep on a
* hash lock in this thread. This is to prevent the following deadlock:
*
- * - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L",
+ * - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L",
* waiting for the reclaim thread to signal it.
*
* - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
@@ -4088,7 +4118,7 @@ arc_reclaim_thread(void)
/*
* We call arc_adjust() before (possibly) calling
* arc_kmem_reap_now(), so that we can wake up
- * arc_get_data_buf() sooner.
+ * arc_get_data_impl() sooner.
*/
evicted = arc_adjust();
@@ -4245,18 +4275,45 @@ arc_is_overflowing(void)
return (arc_size >= arc_c + overflow);
}
+/*
+ * Run the ARC space accounting for size bytes via arc_get_data_impl(), then
+ * allocate and return an ABD of that size. The second argument to
+ * abd_alloc() is B_TRUE when the hdr's buffer type is ARC_BUFC_METADATA and
+ * B_FALSE when it is ARC_BUFC_DATA.
+ */
+static abd_t *
+arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
+{
+	arc_buf_contents_t type = arc_buf_type(hdr);
+
+	arc_get_data_impl(hdr, size, tag);
+
+	if (type != ARC_BUFC_METADATA) {
+		ASSERT(type == ARC_BUFC_DATA);
+		return (abd_alloc(size, B_FALSE));
+	}
+
+	return (abd_alloc(size, B_TRUE));
+}
+
+/*
+ * Run the ARC space accounting for size bytes via arc_get_data_impl(), then
+ * allocate and return a raw buffer of that size: zio_buf_alloc() for
+ * ARC_BUFC_METADATA hdrs, zio_data_buf_alloc() for ARC_BUFC_DATA hdrs.
+ */
+static void *
+arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
+{
+	arc_buf_contents_t type = arc_buf_type(hdr);
+
+	arc_get_data_impl(hdr, size, tag);
+
+	if (type != ARC_BUFC_METADATA) {
+		ASSERT(type == ARC_BUFC_DATA);
+		return (zio_data_buf_alloc(size));
+	}
+
+	return (zio_buf_alloc(size));
+}
+
/*
* Allocate a block and return it to the caller. If we are hitting the
* hard limit for the cache size, we must sleep, waiting for the eviction
* thread to catch up. If we're past the target size but below the hard
* limit, we'll only signal the reclaim thread and continue on.
*/
-static void *
-arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
+static void
+arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
{
- void *datap = NULL;
- arc_state_t *state = hdr->b_l1hdr.b_state;
- arc_buf_contents_t type = arc_buf_type(hdr);
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ arc_buf_contents_t type = arc_buf_type(hdr);
arc_adapt(size, state);
@@ -4298,11 +4355,8 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
VERIFY3U(hdr->b_type, ==, type);
if (type == ARC_BUFC_METADATA) {
- datap = zio_buf_alloc(size);
arc_space_consume(size, ARC_SPACE_META);
} else {
- ASSERT(type == ARC_BUFC_DATA);
- datap = zio_data_buf_alloc(size);
arc_space_consume(size, ARC_SPACE_DATA);
}
@@ -4338,14 +4392,34 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
refcount_count(&arc_mru->arcs_size) > arc_p))
arc_p = MIN(arc_c, arc_p + size);
}
- return (datap);
+}
+
+/*
+ * Undo the ARC bookkeeping for `size' bytes of hdr's data through
+ * arc_free_data_impl(), then free the ABD itself with abd_free().
+ */
+static void
+arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag)
+{
+ arc_free_data_impl(hdr, size, tag);
+ abd_free(abd);
+}
+
+/*
+ * Flat-buffer counterpart of arc_free_data_abd(): undo the ARC bookkeeping
+ * for `size' bytes through arc_free_data_impl(), then release buf with
+ * zio_buf_free() for a metadata header or zio_data_buf_free() for a data
+ * header.
+ */
+static void
+arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag)
+{
+	/* Sample the contents type before the bookkeeping call, as before. */
+	arc_buf_contents_t type = arc_buf_type(hdr);
+
+	arc_free_data_impl(hdr, size, tag);
+
+	if (type == ARC_BUFC_METADATA) {
+		zio_buf_free(buf, size);
+		return;
+	}
+
+	/* Anything that is not metadata must be plain data. */
+	ASSERT(type == ARC_BUFC_DATA);
+	zio_data_buf_free(buf, size);
}
/*
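* Give back the ARC space accounting for a data buffer. The buffer itself
* is freed by the callers, arc_free_data_abd() and arc_free_data_buf().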
*/
static void
-arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag)
+arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
{
arc_state_t *state = hdr->b_l1hdr.b_state;
arc_buf_contents_t type = arc_buf_type(hdr);
@@ -4362,11 +4436,9 @@ arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag)
VERIFY3U(hdr->b_type, ==, type);
if (type == ARC_BUFC_METADATA) {
- zio_buf_free(data, size);
arc_space_return(size, ARC_SPACE_META);
} else {
ASSERT(type == ARC_BUFC_DATA);
- zio_data_buf_free(data, size);
arc_space_return(size, ARC_SPACE_DATA);
}
}
@@ -4639,7 +4711,7 @@ arc_read_done(zio_t *zio)
if (callback_cnt == 0) {
ASSERT(HDR_PREFETCH(hdr));
ASSERT0(hdr->b_l1hdr.b_bufcnt);
- ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
}
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
@@ -4735,7 +4807,7 @@ top:
hdr = buf_hash_find(guid, bp, &hash_lock);
}
- if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pdata != NULL) {
+ if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) {
arc_buf_t *buf = NULL;
*arc_flags |= ARC_FLAG_CACHED;
@@ -4878,7 +4950,7 @@ top:
hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
hdr_full_cache);
}
- ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
@@ -4896,9 +4968,9 @@ top:
* avoid hitting an assert in remove_reference().
*/
arc_access(hdr, hash_lock);
- arc_hdr_alloc_pdata(hdr);
+ arc_hdr_alloc_pabd(hdr);
}
- ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
size = arc_hdr_size(hdr);
/*
@@ -5001,7 +5073,7 @@ top:
ASSERT3U(HDR_GET_COMPRESS(hdr), !=,
ZIO_COMPRESS_EMPTY);
rzio = zio_read_phys(pio, vd, addr,
- size, hdr->b_l1hdr.b_pdata,
+ size, hdr->b_l1hdr.b_pabd,
ZIO_CHECKSUM_OFF,
l2arc_read_done, cb, priority,
zio_flags | ZIO_FLAG_DONT_CACHE |
@@ -5040,7 +5112,7 @@ top:
}
}
- rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pdata, size,
+ rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size,
arc_read_done, hdr, priority, zio_flags, zb);
/*
@@ -5232,16 +5304,17 @@ arc_release(arc_buf_t *buf, void *tag)
arc_unshare_buf(hdr, buf);
/*
- * Now we need to recreate the hdr's b_pdata. Since we
+ * Now we need to recreate the hdr's b_pabd. Since we
* have lastbuf handy, we try to share with it, but if
- * we can't then we allocate a new b_pdata and copy the
+ * we can't then we allocate a new b_pabd and copy the
* data from buf into it.
*/
if (arc_can_share(hdr, lastbuf)) {
arc_share_buf(hdr, lastbuf);
} else {
- arc_hdr_alloc_pdata(hdr);
- bcopy(buf->b_data, hdr->b_l1hdr.b_pdata, psize);
+ arc_hdr_alloc_pabd(hdr);
+ abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
+ buf->b_data, psize);
}
VERIFY3P(lastbuf->b_data, !=, NULL);
} else if (HDR_SHARED_DATA(hdr)) {
@@ -5257,7 +5330,7 @@ arc_release(arc_buf_t *buf, void *tag)
HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
ASSERT(!ARC_BUF_SHARED(buf));
}
- ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
ASSERT3P(state, !=, arc_l2c_only);
(void) refcount_remove_many(&state->arcs_size,
@@ -5276,7 +5349,7 @@ arc_release(arc_buf_t *buf, void *tag)
mutex_exit(hash_lock);
/*
- * Allocate a new hdr. The new hdr will contain a b_pdata
+ * Allocate a new hdr. The new hdr will contain a b_pabd
* buffer which will be freed in arc_write().
*/
nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type);
@@ -5354,15 +5427,15 @@ arc_write_ready(zio_t *zio)
if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
arc_cksum_free(hdr);
arc_buf_unwatch(buf);
- if (hdr->b_l1hdr.b_pdata != NULL) {
+ if (hdr->b_l1hdr.b_pabd != NULL) {
if (arc_buf_is_shared(buf)) {
arc_unshare_buf(hdr, buf);
} else {
- arc_hdr_free_pdata(hdr);
+ arc_hdr_free_pabd(hdr);
}
}
}
- ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(!HDR_SHARED_DATA(hdr));
ASSERT(!arc_buf_is_shared(buf));
@@ -5384,33 +5457,47 @@ arc_write_ready(zio_t *zio)
HDR_SET_PSIZE(hdr, psize);
arc_hdr_set_compress(hdr, compress);
+
/*
- * If the hdr is compressed, then copy the compressed
- * zio contents into arc_buf_hdr_t. Otherwise, copy the original
- * data buf into the hdr. Ideally, we would like to always copy the
- * io_data into b_pdata but the user may have disabled compressed
- * arc thus the on-disk block may or may not match what we maintain
- * in the hdr's b_pdata field.
+ * Fill the hdr with data. If the hdr is compressed, the data we want
+ * is available from the zio, otherwise we can take it from the buf.
+ *
+ * We might be able to share the buf's data with the hdr here. However,
+ * doing so would cause the ARC to be full of linear ABDs if we write a
+ * lot of shareable data. As a compromise, we check whether scattered
+ * ABDs are allowed, and assume that if they are then the user wants
+ * the ARC to be primarily filled with them regardless of the data being
+ * written. Therefore, if they're allowed then we allocate one and copy
+ * the data into it; otherwise, we share the data directly if we can.
*/
- if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
- !ARC_BUF_COMPRESSED(buf)) {
- ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, ZIO_COMPRESS_OFF);
- ASSERT3U(psize, >, 0);
- arc_hdr_alloc_pdata(hdr);
- bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize);
+ if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) {
+ arc_hdr_alloc_pabd(hdr);
+
+ /*
+ * Ideally, we would always copy the io_abd into b_pabd, but the
+ * user may have disabled compressed ARC, thus we must check the
+ * hdr's compression setting rather than the io_bp's.
+ */
+ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
+ ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=,
+ ZIO_COMPRESS_OFF);
+ ASSERT3U(psize, >, 0);
+
+ abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
+ } else {
+ ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
+
+ abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
+ arc_buf_size(buf));
+ }
} else {
- ASSERT3P(buf->b_data, ==, zio->io_orig_data);
+ ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
- /*
- * This hdr is not compressed so we're able to share
- * the arc_buf_t data buffer with the hdr.
- */
arc_share_buf(hdr, buf);
- ASSERT0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata,
- HDR_GET_LSIZE(hdr)));
}
+
arc_hdr_verify(hdr, zio->io_bp);
}
@@ -5515,6 +5602,7 @@ arc_write_done(zio_t *zio)
ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
callback->awcb_done(zio, buf, callback->awcb_private);
+ abd_put(zio->io_abd);
kmem_free(callback, sizeof (arc_write_callback_t));
}
@@ -5551,10 +5639,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
callback->awcb_buf = buf;
/*
- * The hdr's b_pdata is now stale, free it now. A new data block
+ * The hdr's b_pabd is now stale, free it now. A new data block
* will be allocated when the zio pipeline calls arc_write_ready().
*/
- if (hdr->b_l1hdr.b_pdata != NULL) {
+ if (hdr->b_l1hdr.b_pabd != NULL) {
/*
* If the buf is currently sharing the data block with
* the hdr then we need to break that relationship here.
@@ -5564,15 +5652,16 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
if (arc_buf_is_shared(buf)) {
arc_unshare_buf(hdr, buf);
} else {
- arc_hdr_free_pdata(hdr);
+ arc_hdr_free_pabd(hdr);
}
VERIFY3P(buf->b_data, !=, NULL);
arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
}
ASSERT(!arc_buf_is_shared(buf));
- ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
- zio = zio_write(pio, spa, txg, bp, buf->b_data,
+ zio = zio_write(pio, spa, txg, bp,
+ abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
HDR_GET_LSIZE(hdr), arc_buf_size(buf), zp, arc_write_ready,
(children_ready != NULL) ? arc_write_children_ready : NULL,
arc_write_physdone, arc_write_done, callback,
@@ -6328,13 +6417,8 @@ l2arc_do_free_on_write()
for (df = list_tail(buflist); df; df = df_prev) {
df_prev = list_prev(buflist, df);
- ASSERT3P(df->l2df_data, !=, NULL);
- if (df->l2df_type == ARC_BUFC_METADATA) {
- zio_buf_free(df->l2df_data, df->l2df_size);
- } else {
- ASSERT(df->l2df_type == ARC_BUFC_DATA);
- zio_data_buf_free(df->l2df_data, df->l2df_size);
- }
+ ASSERT3P(df->l2df_abd, !=, NULL);
+ abd_free(df->l2df_abd);
list_remove(buflist, df);
kmem_free(df, sizeof (l2arc_data_free_t));
}
@@ -6484,12 +6568,12 @@ l2arc_read_done(zio_t *zio)
mutex_enter(hash_lock);
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
- ASSERT3P(zio->io_data, !=, NULL);
+ ASSERT3P(zio->io_abd, !=, NULL);
/*
* Check this survived the L2ARC journey.
*/
- ASSERT3P(zio->io_data, ==, hdr->b_l1hdr.b_pdata);
+ ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd);
zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
@@ -6523,7 +6607,7 @@ l2arc_read_done(zio_t *zio)
ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp,
- hdr->b_l1hdr.b_pdata, zio->io_size, arc_read_done,
+ hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done,
hdr, zio->io_priority, cb->l2rcb_flags,
&cb->l2rcb_zb));
}
@@ -6811,7 +6895,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
- ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
ASSERT3U(arc_hdr_size(hdr), >, 0);
uint64_t size = arc_hdr_size(hdr);
@@ -6826,20 +6910,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
* lifetime of the ZIO and be cleaned up afterwards, we
* add it to the l2arc_free_on_write queue.
*/
- void *to_write;
+ abd_t *to_write;
if (!HDR_SHARED_DATA(hdr)) {
- to_write = hdr->b_l1hdr.b_pdata;
+ to_write = hdr->b_l1hdr.b_pabd;
} else {
- arc_buf_contents_t type = arc_buf_type(hdr);
- if (type == ARC_BUFC_METADATA) {
- to_write = zio_buf_alloc(size);
- } else {
- ASSERT3U(type, ==, ARC_BUFC_DATA);
- to_write = zio_data_buf_alloc(size);
- }
-
- bcopy(hdr->b_l1hdr.b_pdata, to_write, size);
- l2arc_free_data_on_write(to_write, size, type);
+ to_write = abd_alloc_for_io(size,
+ HDR_ISTYPE_METADATA(hdr));
+ abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
+ l2arc_free_abd_on_write(to_write, size,
+ arc_buf_type(hdr));
}
wzio = zio_write_phys(pio, dev->l2ad_vdev,
hdr->b_l2hdr.b_daddr, size, to_write,
diff --git a/usr/src/uts/common/fs/zfs/blkptr.c b/usr/src/uts/common/fs/zfs/blkptr.c
index 7e61dc96ff..ff93ff4456 100644
--- a/usr/src/uts/common/fs/zfs/blkptr.c
+++ b/usr/src/uts/common/fs/zfs/blkptr.c
@@ -14,7 +14,7 @@
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index 8d42481ea0..812ff3badd 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -46,6 +46,7 @@
#include <sys/blkptr.h>
#include <sys/range_tree.h>
#include <sys/callb.h>
+#include <sys/abd.h>
uint_t zfs_dbuf_evict_key;
@@ -3463,8 +3464,10 @@ dbuf_write_override_done(zio_t *zio)
arc_release(dr->dt.dl.dr_data, db);
}
mutex_exit(&db->db_mtx);
-
dbuf_write_done(zio, NULL, db);
+
+ if (zio->io_abd != NULL)
+ abd_put(zio->io_abd);
}
/* Issue I/O to commit a dirty buffer to disk. */
@@ -3557,7 +3560,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
* The BP for this block has been provided by open context
* (by dmu_sync() or dmu_buf_write_embedded()).
*/
- void *contents = (data != NULL) ? data->b_data : NULL;
+ abd_t *contents = (data != NULL) ?
+ abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;
dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy,
contents, db->db.db_size, db->db.db_size, &zp,
diff --git a/usr/src/uts/common/fs/zfs/ddt.c b/usr/src/uts/common/fs/zfs/ddt.c
index 9955f89e77..ba3e02cfb5 100644
--- a/usr/src/uts/common/fs/zfs/ddt.c
+++ b/usr/src/uts/common/fs/zfs/ddt.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -36,6 +36,7 @@
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dsl_scan.h>
+#include <sys/abd.h>
/*
* Enable/disable prefetching of dedup-ed blocks which are going to be freed.
@@ -651,9 +652,8 @@ ddt_free(ddt_entry_t *dde)
for (int p = 0; p < DDT_PHYS_TYPES; p++)
ASSERT(dde->dde_lead_zio[p] == NULL);
- if (dde->dde_repair_data != NULL)
- zio_buf_free(dde->dde_repair_data,
- DDK_GET_PSIZE(&dde->dde_key));
+ if (dde->dde_repair_abd != NULL)
+ abd_free(dde->dde_repair_abd);
cv_destroy(&dde->dde_cv);
kmem_free(dde, sizeof (*dde));
@@ -917,7 +917,7 @@ ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
ddt_enter(ddt);
- if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) &&
+ if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) &&
avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
avl_insert(&ddt->ddt_repair_tree, dde, where);
else
@@ -954,7 +954,7 @@ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
continue;
ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
- rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL,
+ rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL,
ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
}
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index f7ef30548d..7769003c43 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -46,6 +46,7 @@
#include <sys/zio_compress.h>
#include <sys/sa.h>
#include <sys/zfeature.h>
+#include <sys/abd.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <sys/zfs_znode.h>
@@ -1632,6 +1633,7 @@ dmu_sync_late_arrival_done(zio_t *zio)
dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
+ abd_put(zio->io_abd);
kmem_free(dsa, sizeof (*dsa));
}
@@ -1657,10 +1659,10 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
dsa->dsa_tx = tx;
zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
- zgd->zgd_db->db_data, zgd->zgd_db->db_size, zgd->zgd_db->db_size,
- zp, dmu_sync_late_arrival_ready, NULL,
- NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE,
- ZIO_FLAG_CANFAIL, zb));
+ abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
+ zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
+ dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
+ dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
return (0);
}
@@ -2192,6 +2194,7 @@ byteswap_uint8_array(void *vbuf, size_t size)
void
dmu_init(void)
{
+ abd_init();
zfs_dbgmsg_init();
sa_cache_init();
xuio_stat_init();
@@ -2215,4 +2218,5 @@ dmu_fini(void)
xuio_stat_fini();
sa_cache_fini();
zfs_dbgmsg_fini();
+ abd_fini();
}
diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c
index e40b9f88b6..c9a79b94e8 100644
--- a/usr/src/uts/common/fs/zfs/dmu_send.c
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c
@@ -132,7 +132,7 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
{
ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
- fletcher_4_incremental_native(dsp->dsa_drr,
+ (void) fletcher_4_incremental_native(dsp->dsa_drr,
offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
&dsp->dsa_zc);
if (dsp->dsa_drr->drr_type == DRR_BEGIN) {
@@ -145,13 +145,13 @@ dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
if (dsp->dsa_drr->drr_type == DRR_END) {
dsp->dsa_sent_end = B_TRUE;
}
- fletcher_4_incremental_native(&dsp->dsa_drr->
+ (void) fletcher_4_incremental_native(&dsp->dsa_drr->
drr_u.drr_checksum.drr_checksum,
sizeof (zio_cksum_t), &dsp->dsa_zc);
if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
return (SET_ERROR(EINTR));
if (payload_len != 0) {
- fletcher_4_incremental_native(payload, payload_len,
+ (void) fletcher_4_incremental_native(payload, payload_len,
&dsp->dsa_zc);
if (dump_bytes(dsp, payload, payload_len) != 0)
return (SET_ERROR(EINTR));
@@ -1742,11 +1742,11 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
drc->drc_byteswap = B_TRUE;
- fletcher_4_incremental_byteswap(drr_begin,
+ (void) fletcher_4_incremental_byteswap(drr_begin,
sizeof (dmu_replay_record_t), &drc->drc_cksum);
byteswap_record(drr_begin);
} else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) {
- fletcher_4_incremental_native(drr_begin,
+ (void) fletcher_4_incremental_native(drr_begin,
sizeof (dmu_replay_record_t), &drc->drc_cksum);
} else {
return (SET_ERROR(EINVAL));
@@ -2419,9 +2419,9 @@ static void
receive_cksum(struct receive_arg *ra, int len, void *buf)
{
if (ra->byteswap) {
- fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
+ (void) fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
} else {
- fletcher_4_incremental_native(buf, len, &ra->cksum);
+ (void) fletcher_4_incremental_native(buf, len, &ra->cksum);
}
}
diff --git a/usr/src/uts/common/fs/zfs/dsl_scan.c b/usr/src/uts/common/fs/zfs/dsl_scan.c
index c672128744..1963f15385 100644
--- a/usr/src/uts/common/fs/zfs/dsl_scan.c
+++ b/usr/src/uts/common/fs/zfs/dsl_scan.c
@@ -20,8 +20,8 @@
*/
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright 2016 Gary Mills
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
*/
#include <sys/dsl_scan.h>
@@ -47,6 +47,7 @@
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfeature.h>
+#include <sys/abd.h>
#ifdef _KERNEL
#include <sys/zfs_vfsops.h>
#endif
@@ -1756,7 +1757,7 @@ dsl_scan_scrub_done(zio_t *zio)
{
spa_t *spa = zio->io_spa;
- zio_data_buf_free(zio->io_data, zio->io_size);
+ abd_free(zio->io_abd);
mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_inflight--;
@@ -1839,7 +1840,6 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
if (needs_io && !zfs_no_scrub_io) {
vdev_t *rvd = spa->spa_root_vdev;
uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight;
- void *data = zio_data_buf_alloc(size);
mutex_enter(&spa->spa_scrub_lock);
while (spa->spa_scrub_inflight >= maxinflight)
@@ -1854,9 +1854,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
delay(scan_delay);
- zio_nowait(zio_read(NULL, spa, bp, data, size,
- dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB,
- zio_flags, zb));
+ zio_nowait(zio_read(NULL, spa, bp,
+ abd_alloc_for_io(size, B_FALSE), size, dsl_scan_scrub_done,
+ NULL, ZIO_PRIORITY_SCRUB, zio_flags, zb));
}
/* do not relocate this block */
diff --git a/usr/src/uts/common/fs/zfs/edonr_zfs.c b/usr/src/uts/common/fs/zfs/edonr_zfs.c
index 93f1221fd5..9a3430d946 100644
--- a/usr/src/uts/common/fs/zfs/edonr_zfs.c
+++ b/usr/src/uts/common/fs/zfs/edonr_zfs.c
@@ -22,19 +22,31 @@
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
#include <sys/zfs_context.h>
#include <sys/zio.h>
#include <sys/edonr.h>
+#include <sys/abd.h>
#define EDONR_MODE 512
#define EDONR_BLOCK_SIZE EdonR512_BLOCK_SIZE
+/*
+ * Callback handed to abd_iterate_func(): folds one piece of an ABD
+ * (`size' bytes at buf) into the Edon-R state passed through arg.
+ * The length given to EdonRUpdate() is scaled by 8 (presumably bytes to
+ * bits; confirm against the EdonRUpdate() contract).  Always returns 0.
+ */
+static int
+edonr_incremental(void *buf, size_t size, void *arg)
+{
+	EdonRUpdate((EdonRState *)arg, buf, size * 8);
+	return (0);
+}
+
/*
* Native zio_checksum interface for the Edon-R hash function.
*/
/*ARGSUSED*/
void
-zio_checksum_edonr_native(const void *buf, uint64_t size,
+abd_checksum_edonr_native(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
uint8_t digest[EDONR_MODE / 8];
@@ -42,7 +54,7 @@ zio_checksum_edonr_native(const void *buf, uint64_t size,
ASSERT(ctx_template != NULL);
bcopy(ctx_template, &ctx, sizeof (ctx));
- EdonRUpdate(&ctx, buf, size * 8);
+ (void) abd_iterate_func(abd, 0, size, edonr_incremental, &ctx);
EdonRFinal(&ctx, digest);
bcopy(digest, zcp->zc_word, sizeof (zcp->zc_word));
}
@@ -51,12 +63,12 @@ zio_checksum_edonr_native(const void *buf, uint64_t size,
* Byteswapped zio_checksum interface for the Edon-R hash function.
*/
void
-zio_checksum_edonr_byteswap(const void *buf, uint64_t size,
+abd_checksum_edonr_byteswap(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
zio_cksum_t tmp;
- zio_checksum_edonr_native(buf, size, ctx_template, &tmp);
+ abd_checksum_edonr_native(abd, size, ctx_template, &tmp);
zcp->zc_word[0] = BSWAP_64(zcp->zc_word[0]);
zcp->zc_word[1] = BSWAP_64(zcp->zc_word[1]);
zcp->zc_word[2] = BSWAP_64(zcp->zc_word[2]);
@@ -64,7 +76,7 @@ zio_checksum_edonr_byteswap(const void *buf, uint64_t size,
}
void *
-zio_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt)
+abd_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt)
{
EdonRState *ctx;
uint8_t salt_block[EDONR_BLOCK_SIZE];
@@ -93,7 +105,7 @@ zio_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt)
}
void
-zio_checksum_edonr_tmpl_free(void *ctx_template)
+abd_checksum_edonr_tmpl_free(void *ctx_template)
{
EdonRState *ctx = ctx_template;
diff --git a/usr/src/uts/common/fs/zfs/lz4.c b/usr/src/uts/common/fs/zfs/lz4.c
index 3aa1b74ef3..82a08939dc 100644
--- a/usr/src/uts/common/fs/zfs/lz4.c
+++ b/usr/src/uts/common/fs/zfs/lz4.c
@@ -31,6 +31,9 @@
* - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
* - LZ4 source repository : http://code.google.com/p/lz4/
*/
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
#include <sys/zfs_context.h>
diff --git a/usr/src/uts/common/fs/zfs/sha256.c b/usr/src/uts/common/fs/zfs/sha256.c
index 81a7f6b1c2..23a97aa3de 100644
--- a/usr/src/uts/common/fs/zfs/sha256.c
+++ b/usr/src/uts/common/fs/zfs/sha256.c
@@ -24,29 +24,39 @@
*/
/*
* Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/zio.h>
#include <sys/sha2.h>
+#include <sys/abd.h>
+
+/*
+ * Callback handed to abd_iterate_func(): folds one piece of an ABD
+ * (`size' bytes at buf) into the SHA2 context passed through arg.
+ * Always returns 0.
+ */
+static int
+sha_incremental(void *buf, size_t size, void *arg)
+{
+	SHA2Update((SHA2_CTX *)arg, buf, size);
+	return (0);
+}
/*ARGSUSED*/
void
-zio_checksum_SHA256(const void *buf, uint64_t size,
+abd_checksum_SHA256(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
SHA2_CTX ctx;
zio_cksum_t tmp;
SHA2Init(SHA256, &ctx);
- SHA2Update(&ctx, buf, size);
+ (void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx);
SHA2Final(&tmp, &ctx);
/*
* A prior implementation of this function had a
* private SHA256 implementation that always wrote things out in
* Big Endian and there wasn't a byteswap variant of it.
- * To preseve on disk compatibility we need to force that
- * behaviour.
+ * To preserve on disk compatibility we need to force that
+ * behavior.
*/
zcp->zc_word[0] = BE_64(tmp.zc_word[0]);
zcp->zc_word[1] = BE_64(tmp.zc_word[1]);
@@ -56,24 +66,24 @@ zio_checksum_SHA256(const void *buf, uint64_t size,
/*ARGSUSED*/
void
-zio_checksum_SHA512_native(const void *buf, uint64_t size,
+abd_checksum_SHA512_native(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
SHA2_CTX ctx;
SHA2Init(SHA512_256, &ctx);
- SHA2Update(&ctx, buf, size);
+ (void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx);
SHA2Final(zcp, &ctx);
}
/*ARGSUSED*/
void
-zio_checksum_SHA512_byteswap(const void *buf, uint64_t size,
+abd_checksum_SHA512_byteswap(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
zio_cksum_t tmp;
- zio_checksum_SHA512_native(buf, size, ctx_template, &tmp);
+ abd_checksum_SHA512_native(abd, size, ctx_template, &tmp);
zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
diff --git a/usr/src/uts/common/fs/zfs/skein_zfs.c b/usr/src/uts/common/fs/zfs/skein_zfs.c
index 6592340396..340da7adfb 100644
--- a/usr/src/uts/common/fs/zfs/skein_zfs.c
+++ b/usr/src/uts/common/fs/zfs/skein_zfs.c
@@ -20,42 +20,52 @@
*/
/*
* Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/zio.h>
#include <sys/skein.h>
+#include <sys/abd.h>
+
+/*
+ * Callback handed to abd_iterate_func(): folds one piece of an ABD
+ * (`size' bytes at buf) into the Skein-512 context passed through arg.
+ * The result of Skein_512_Update() is deliberately discarded and 0 is
+ * always returned.
+ */
+static int
+skein_incremental(void *buf, size_t size, void *arg)
+{
+	(void) Skein_512_Update((Skein_512_Ctxt_t *)arg, buf, size);
+	return (0);
+}
/*
* Computes a native 256-bit skein MAC checksum. Please note that this
* function requires the presence of a ctx_template that should be allocated
- * using zio_checksum_skein_tmpl_init.
+ * using abd_checksum_skein_tmpl_init.
*/
/*ARGSUSED*/
void
-zio_checksum_skein_native(const void *buf, uint64_t size,
+abd_checksum_skein_native(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
Skein_512_Ctxt_t ctx;
ASSERT(ctx_template != NULL);
bcopy(ctx_template, &ctx, sizeof (ctx));
- (void) Skein_512_Update(&ctx, buf, size);
+ (void) abd_iterate_func(abd, 0, size, skein_incremental, &ctx);
(void) Skein_512_Final(&ctx, (uint8_t *)zcp);
bzero(&ctx, sizeof (ctx));
}
/*
- * Byteswapped version of zio_checksum_skein_native. This just invokes
+ * Byteswapped version of abd_checksum_skein_native. This just invokes
* the native checksum function and byteswaps the resulting checksum (since
* skein is internally endian-insensitive).
*/
void
-zio_checksum_skein_byteswap(const void *buf, uint64_t size,
+abd_checksum_skein_byteswap(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
zio_cksum_t tmp;
- zio_checksum_skein_native(buf, size, ctx_template, &tmp);
+ abd_checksum_skein_native(abd, size, ctx_template, &tmp);
zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
@@ -67,7 +77,7 @@ zio_checksum_skein_byteswap(const void *buf, uint64_t size,
* computations and returns a pointer to it.
*/
void *
-zio_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt)
+abd_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt)
{
Skein_512_Ctxt_t *ctx;
@@ -79,10 +89,10 @@ zio_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt)
/*
* Frees a skein context template previously allocated using
- * zio_checksum_skein_tmpl_init.
+ * abd_checksum_skein_tmpl_init.
*/
void
-zio_checksum_skein_tmpl_free(void *ctx_template)
+abd_checksum_skein_tmpl_free(void *ctx_template)
{
Skein_512_Ctxt_t *ctx = ctx_template;
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index 5764d47c33..be5b66fd3b 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -70,6 +70,7 @@
#include <sys/dsl_scan.h>
#include <sys/zfeature.h>
#include <sys/dsl_destroy.h>
+#include <sys/abd.h>
#ifdef _KERNEL
#include <sys/bootprops.h>
@@ -1876,6 +1877,7 @@ spa_load_verify_done(zio_t *zio)
int error = zio->io_error;
spa_t *spa = zio->io_spa;
+ abd_free(zio->io_abd);
if (error) {
if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
type != DMU_OT_INTENT_LOG)
@@ -1883,7 +1885,6 @@ spa_load_verify_done(zio_t *zio)
else
atomic_inc_64(&sle->sle_data_count);
}
- zio_data_buf_free(zio->io_data, zio->io_size);
mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_inflight--;
@@ -1913,12 +1914,11 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
*/
if (!spa_load_verify_metadata)
return (0);
- if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data)
+ if (!BP_IS_METADATA(bp) && !spa_load_verify_data)
return (0);
zio_t *rio = arg;
size_t size = BP_GET_PSIZE(bp);
- void *data = zio_data_buf_alloc(size);
mutex_enter(&spa->spa_scrub_lock);
while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
@@ -1926,7 +1926,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
spa->spa_scrub_inflight++;
mutex_exit(&spa->spa_scrub_lock);
- zio_nowait(zio_read(rio, spa, bp, data, size,
+ zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
diff --git a/usr/src/uts/common/fs/zfs/sys/abd.h b/usr/src/uts/common/fs/zfs/sys/abd.h
new file mode 100644
index 0000000000..308f021b76
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/abd.h
@@ -0,0 +1,150 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#ifndef _ABD_H
+#define _ABD_H
+
+#include <sys/isa_defs.h>
+#include <sys/int_types.h>
+#include <sys/debug.h>
+#include <sys/refcount.h>
+#ifdef _KERNEL
+#include <sys/uio.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Bit flags kept in the abd_flags member of abd_t.  Each value is a
+ * distinct bit, so they may be combined and tested with bitwise operators
+ * (see abd_is_linear()).
+ */
+typedef enum abd_flags {
+ ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */
+ ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */
+ ABD_FLAG_META = 1 << 2 /* does this represent FS metadata? */
+} abd_flags_t;
+
+/*
+ * Descriptor for an ABD.  The abd_u union carries one of two layouts:
+ *
+ *   abd_scatter - an offset, a chunk size and a flexible array of chunk
+ *                 pointers (abd_chunks[]), so the descriptor's allocated
+ *                 size varies with the number of chunks;
+ *   abd_linear  - a single buffer pointer (abd_buf).
+ *
+ * ABD_FLAG_LINEAR in abd_flags tells which layout is in use; test it with
+ * abd_is_linear().
+ *
+ * NOTE(review): abd_parent and abd_children presumably track ABDs derived
+ * from one another (e.g. through abd_get_offset()); confirm in abd.c.
+ */
+typedef struct abd {
+ abd_flags_t abd_flags;
+ uint_t abd_size; /* excludes scattered abd_offset */
+ struct abd *abd_parent;
+ refcount_t abd_children;
+ union {
+ struct abd_scatter {
+ uint_t abd_offset;
+ uint_t abd_chunk_size;
+ void *abd_chunks[];
+ } abd_scatter;
+ struct abd_linear {
+ void *abd_buf;
+ } abd_linear;
+ } abd_u;
+} abd_t;
+
+typedef int abd_iter_func_t(void *, size_t, void *);
+typedef int abd_iter_func2_t(void *, void *, size_t, void *);
+
+extern boolean_t zfs_abd_scatter_enabled;
+
+/*
+ * Report whether abd uses the linear layout, i.e. whether ABD_FLAG_LINEAR
+ * is set in its abd_flags.  Yields a non-zero (true) value when the bit is
+ * set and zero otherwise.
+ */
+inline boolean_t
+abd_is_linear(abd_t *abd)
+{
+	/* ABD_FLAG_LINEAR is a single bit, so equality means "bit is set". */
+	return ((abd->abd_flags & ABD_FLAG_LINEAR) == ABD_FLAG_LINEAR);
+}
+
+/*
+ * Allocations and deallocations
+ */
+
+abd_t *abd_alloc(size_t, boolean_t);
+abd_t *abd_alloc_linear(size_t, boolean_t);
+abd_t *abd_alloc_for_io(size_t, boolean_t);
+abd_t *abd_alloc_sametype(abd_t *, size_t);
+void abd_free(abd_t *);
+abd_t *abd_get_offset(abd_t *, size_t);
+abd_t *abd_get_from_buf(void *, size_t);
+void abd_put(abd_t *);
+
+/*
+ * Conversion to and from a normal buffer
+ */
+
+void *abd_to_buf(abd_t *);
+void *abd_borrow_buf(abd_t *, size_t);
+void *abd_borrow_buf_copy(abd_t *, size_t);
+void abd_return_buf(abd_t *, void *, size_t);
+void abd_return_buf_copy(abd_t *, void *, size_t);
+void abd_take_ownership_of_buf(abd_t *, boolean_t);
+void abd_release_ownership_of_buf(abd_t *);
+
+/*
+ * ABD operations
+ */
+
+int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
+int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
+ abd_iter_func2_t *, void *);
+void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
+void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
+void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
+int abd_cmp(abd_t *, abd_t *, size_t);
+int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t);
+void abd_zero_off(abd_t *, size_t, size_t);
+
+/*
+ * Wrappers for calls with offsets of 0
+ */
+
+/*
+ * Copy size bytes from sabd into dabd, using an offset of 0 in both ABDs.
+ * Thin wrapper around abd_copy_off().
+ */
+inline void
+abd_copy(abd_t *dabd, abd_t *sabd, size_t size)
+{
+ abd_copy_off(dabd, sabd, 0, 0, size);
+}
+
+/*
+ * Copy size bytes from the flat buffer buf into abd, starting at ABD
+ * offset 0.  Thin wrapper around abd_copy_from_buf_off().
+ */
+inline void
+abd_copy_from_buf(abd_t *abd, void *buf, size_t size)
+{
+ abd_copy_from_buf_off(abd, buf, 0, size);
+}
+
+/*
+ * Copy size bytes out of abd, starting at ABD offset 0, into the flat
+ * buffer buf.  Thin wrapper around abd_copy_to_buf_off().
+ */
+inline void
+abd_copy_to_buf(void *buf, abd_t *abd, size_t size)
+{
+	abd_copy_to_buf_off(buf, abd, 0, size);
+}
+
+/*
+ * Compare size bytes of abd, starting at ABD offset 0, against the flat
+ * buffer buf.  Returns whatever abd_cmp_buf_off() returns.
+ */
+inline int
+abd_cmp_buf(abd_t *abd, void *buf, size_t size)
+{
+ return (abd_cmp_buf_off(abd, buf, 0, size));
+}
+
+/*
+ * Apply abd_zero_off() to abd with an offset of 0 and the given size.
+ */
+inline void
+abd_zero(abd_t *abd, size_t size)
+{
+ abd_zero_off(abd, 0, size);
+}
+
+/*
+ * Module lifecycle
+ */
+
+void abd_init(void);
+void abd_fini(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ABD_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/ddt.h b/usr/src/uts/common/fs/zfs/sys/ddt.h
index 771610677e..15d2a9a7ad 100644
--- a/usr/src/uts/common/fs/zfs/sys/ddt.h
+++ b/usr/src/uts/common/fs/zfs/sys/ddt.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
*/
#ifndef _SYS_DDT_H
@@ -35,6 +36,8 @@
extern "C" {
#endif
+struct abd;
+
/*
* On-disk DDT formats, in the desired search order (newest version first).
*/
@@ -108,7 +111,7 @@ struct ddt_entry {
ddt_key_t dde_key;
ddt_phys_t dde_phys[DDT_PHYS_TYPES];
zio_t *dde_lead_zio[DDT_PHYS_TYPES];
- void *dde_repair_data;
+ struct abd *dde_repair_abd;
enum ddt_type dde_type;
enum ddt_class dde_class;
uint8_t dde_loading;
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index d0bb431866..0caefcd153 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -419,15 +419,17 @@ _NOTE(CONSTCOND) } while (0)
#define BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill)
+#define BP_IS_METADATA(bp) \
+ (BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
+
#define BP_GET_ASIZE(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : \
DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
DVA_GET_ASIZE(&(bp)->blk_dva[2]))
-#define BP_GET_UCSIZE(bp) \
- ((BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) ? \
- BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
+#define BP_GET_UCSIZE(bp) \
+ (BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
#define BP_GET_NDVAS(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : \
@@ -597,8 +599,7 @@ _NOTE(CONSTCOND) } while (0)
}
#define BP_GET_BUFC_TYPE(bp) \
- (((BP_GET_LEVEL(bp) > 0) || (DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))) ? \
- ARC_BUFC_METADATA : ARC_BUFC_DATA)
+ (BP_IS_METADATA(bp) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
typedef enum spa_import_type {
SPA_IMPORT_EXISTING,
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 8df5b3b785..931c42f2be 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -52,6 +52,7 @@ extern "C" {
typedef struct vdev_queue vdev_queue_t;
typedef struct vdev_cache vdev_cache_t;
typedef struct vdev_cache_entry vdev_cache_entry_t;
+struct abd;
extern int zfs_vdev_queue_depth_pct;
extern uint32_t zfs_vdev_async_write_max_active;
@@ -86,7 +87,7 @@ typedef struct vdev_ops {
* Virtual device properties
*/
struct vdev_cache_entry {
- char *ve_data;
+ struct abd *ve_abd;
uint64_t ve_offset;
uint64_t ve_lastused;
avl_node_t ve_offset_node;
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index 5a6bd3c329..b50df27774 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -306,6 +306,7 @@ typedef void zio_cksum_free_f(void *cbdata, size_t size);
struct zio_bad_cksum; /* defined in zio_checksum.h */
struct dnode_phys;
+struct abd;
struct zio_cksum_report {
struct zio_cksum_report *zcr_next;
@@ -338,12 +339,12 @@ typedef struct zio_gang_node {
} zio_gang_node_t;
typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp,
- zio_gang_node_t *gn, void *data);
+ zio_gang_node_t *gn, struct abd *data, uint64_t offset);
-typedef void zio_transform_func_t(zio_t *zio, void *data, uint64_t size);
+typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size);
typedef struct zio_transform {
- void *zt_orig_data;
+ struct abd *zt_orig_abd;
uint64_t zt_orig_size;
uint64_t zt_bufsize;
zio_transform_func_t *zt_transform;
@@ -404,8 +405,8 @@ struct zio {
blkptr_t io_bp_orig;
/* Data represented by this I/O */
- void *io_data;
- void *io_orig_data;
+ struct abd *io_abd;
+ struct abd *io_orig_abd;
uint64_t io_size;
uint64_t io_orig_size;
/* io_lsize != io_orig_size iff this is a raw write */
@@ -465,19 +466,19 @@ extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
extern zio_t *zio_root(spa_t *spa,
zio_done_func_t *done, void *private, enum zio_flag flags);
-extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data,
- uint64_t lsize, zio_done_func_t *done, void *private,
+extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
+ struct abd *data, uint64_t lsize, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb);
extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- void *data, uint64_t size, uint64_t psize, const zio_prop_t *zp,
+ struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *children_ready,
zio_done_func_t *physdone, zio_done_func_t *done,
void *private, zio_priority_t priority, enum zio_flag flags,
const zbookmark_phys_t *zb);
extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- void *data, uint64_t size, zio_done_func_t *done, void *private,
+ struct abd *data, uint64_t size, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb);
extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
@@ -493,12 +494,12 @@ extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
zio_done_func_t *done, void *private, enum zio_flag flags);
extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
- uint64_t size, void *data, int checksum,
+ uint64_t size, struct abd *data, int checksum,
zio_done_func_t *done, void *private, zio_priority_t priority,
enum zio_flag flags, boolean_t labels);
extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
- uint64_t size, void *data, int checksum,
+ uint64_t size, struct abd *data, int checksum,
zio_done_func_t *done, void *private, zio_priority_t priority,
enum zio_flag flags, boolean_t labels);
@@ -528,19 +529,19 @@ extern void zio_buf_free(void *buf, size_t size);
extern void *zio_data_buf_alloc(size_t size);
extern void zio_data_buf_free(void *buf, size_t size);
-extern void zio_push_transform(zio_t *zio, void *data, uint64_t size,
+extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size,
uint64_t bufsize, zio_transform_func_t *transform);
extern void zio_pop_transforms(zio_t *zio);
extern void zio_resubmit_stage_async(void *);
extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
- uint64_t offset, void *data, uint64_t size, int type,
+ uint64_t offset, struct abd *data, uint64_t size, int type,
zio_priority_t priority, enum zio_flag flags,
zio_done_func_t *done, void *private);
extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
- void *data, uint64_t size, int type, zio_priority_t priority,
+ struct abd *data, uint64_t size, int type, zio_priority_t priority,
enum zio_flag flags, zio_done_func_t *done, void *private);
extern void zio_vdev_io_bypass(zio_t *zio);
diff --git a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h
index 2f7579fd73..3eda057eae 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
* Copyright Saso Kiselkov 2013, All rights reserved.
*/
@@ -34,10 +34,12 @@
extern "C" {
#endif
+struct abd;
+
/*
* Signature for checksum functions.
*/
-typedef void zio_checksum_t(const void *data, uint64_t size,
+typedef void zio_checksum_t(struct abd *, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp);
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template);
@@ -81,28 +83,28 @@ extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS];
/*
* Checksum routines.
*/
-extern zio_checksum_t zio_checksum_SHA256;
-extern zio_checksum_t zio_checksum_SHA512_native;
-extern zio_checksum_t zio_checksum_SHA512_byteswap;
+extern zio_checksum_t abd_checksum_SHA256;
+extern zio_checksum_t abd_checksum_SHA512_native;
+extern zio_checksum_t abd_checksum_SHA512_byteswap;
/* Skein */
-extern zio_checksum_t zio_checksum_skein_native;
-extern zio_checksum_t zio_checksum_skein_byteswap;
-extern zio_checksum_tmpl_init_t zio_checksum_skein_tmpl_init;
-extern zio_checksum_tmpl_free_t zio_checksum_skein_tmpl_free;
+extern zio_checksum_t abd_checksum_skein_native;
+extern zio_checksum_t abd_checksum_skein_byteswap;
+extern zio_checksum_tmpl_init_t abd_checksum_skein_tmpl_init;
+extern zio_checksum_tmpl_free_t abd_checksum_skein_tmpl_free;
/* Edon-R */
-extern zio_checksum_t zio_checksum_edonr_native;
-extern zio_checksum_t zio_checksum_edonr_byteswap;
-extern zio_checksum_tmpl_init_t zio_checksum_edonr_tmpl_init;
-extern zio_checksum_tmpl_free_t zio_checksum_edonr_tmpl_free;
+extern zio_checksum_t abd_checksum_edonr_native;
+extern zio_checksum_t abd_checksum_edonr_byteswap;
+extern zio_checksum_tmpl_init_t abd_checksum_edonr_tmpl_init;
+extern zio_checksum_tmpl_free_t abd_checksum_edonr_tmpl_free;
extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum,
void *, uint64_t, uint64_t, zio_bad_cksum_t *);
-extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
- void *data, uint64_t size);
+extern void zio_checksum_compute(zio_t *, enum zio_checksum,
+ struct abd *, uint64_t);
extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum,
- void *, uint64_t, uint64_t, zio_bad_cksum_t *);
+ struct abd *, uint64_t, uint64_t, zio_bad_cksum_t *);
extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out);
extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
extern void zio_checksum_templates_free(spa_t *spa);
diff --git a/usr/src/uts/common/fs/zfs/sys/zio_compress.h b/usr/src/uts/common/fs/zfs/sys/zio_compress.h
index 0c1783b140..bcffa699b5 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio_compress.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio_compress.h
@@ -25,12 +25,14 @@
*/
/*
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
- * Copyright (c) 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2015, 2016 by Delphix. All rights reserved.
*/
#ifndef _SYS_ZIO_COMPRESS_H
#define _SYS_ZIO_COMPRESS_H
+#include <sys/abd.h>
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -61,15 +63,22 @@ typedef size_t zio_compress_func_t(void *src, void *dst,
/* Common signature for all zio decompress functions. */
typedef int zio_decompress_func_t(void *src, void *dst,
size_t s_len, size_t d_len, int);
+/*
+ * Common signature for all zio decompress functions using an ABD as input.
+ * This is helpful if you have both compressed ARC and scatter ABDs enabled,
+ * but is not a requirement for all compression algorithms.
+ */
+typedef int zio_decompress_abd_func_t(abd_t *src, void *dst,
+ size_t s_len, size_t d_len, int);
/*
* Information about each compression function.
*/
typedef struct zio_compress_info {
- zio_compress_func_t *ci_compress; /* compression function */
- zio_decompress_func_t *ci_decompress; /* decompression function */
- int ci_level; /* level parameter */
- char *ci_name; /* algorithm name */
+ char *ci_name;
+ int ci_level;
+ zio_compress_func_t *ci_compress;
+ zio_decompress_func_t *ci_decompress;
} zio_compress_info_t;
extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS];
@@ -97,9 +106,11 @@ extern int lz4_decompress(void *src, void *dst, size_t s_len, size_t d_len,
/*
* Compress and decompress data if necessary.
*/
-extern size_t zio_compress_data(enum zio_compress c, void *src, void *dst,
+extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void *dst,
size_t s_len);
-extern int zio_decompress_data(enum zio_compress c, void *src, void *dst,
+extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
+ size_t s_len, size_t d_len);
+extern int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
size_t s_len, size_t d_len);
#ifdef __cplusplus
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index a081deb7ea..d43745cd11 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -45,6 +45,7 @@
#include <sys/arc.h>
#include <sys/zil.h>
#include <sys/dsl_scan.h>
+#include <sys/abd.h>
/*
* Virtual device management.
@@ -961,16 +962,16 @@ vdev_probe_done(zio_t *zio)
vps->vps_readable = 1;
if (zio->io_error == 0 && spa_writeable(spa)) {
zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
- zio->io_offset, zio->io_size, zio->io_data,
+ zio->io_offset, zio->io_size, zio->io_abd,
ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
} else {
- zio_buf_free(zio->io_data, zio->io_size);
+ abd_free(zio->io_abd);
}
} else if (zio->io_type == ZIO_TYPE_WRITE) {
if (zio->io_error == 0)
vps->vps_writeable = 1;
- zio_buf_free(zio->io_data, zio->io_size);
+ abd_free(zio->io_abd);
} else if (zio->io_type == ZIO_TYPE_NULL) {
zio_t *pio;
@@ -1086,8 +1087,8 @@ vdev_probe(vdev_t *vd, zio_t *zio)
for (int l = 1; l < VDEV_LABELS; l++) {
zio_nowait(zio_read_phys(pio, vd,
vdev_label_offset(vd->vdev_psize, l,
- offsetof(vdev_label_t, vl_pad2)),
- VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE),
+ offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE,
+ abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
}
diff --git a/usr/src/uts/common/fs/zfs/vdev_cache.c b/usr/src/uts/common/fs/zfs/vdev_cache.c
index a6d6cfa61b..9b4755321d 100644
--- a/usr/src/uts/common/fs/zfs/vdev_cache.c
+++ b/usr/src/uts/common/fs/zfs/vdev_cache.c
@@ -23,7 +23,7 @@
* Use is subject to license terms.
*/
/*
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -31,6 +31,7 @@
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/kstat.h>
+#include <sys/abd.h>
/*
* Virtual device read-ahead caching.
@@ -141,12 +142,12 @@ static void
vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
{
ASSERT(MUTEX_HELD(&vc->vc_lock));
- ASSERT(ve->ve_fill_io == NULL);
- ASSERT(ve->ve_data != NULL);
+ ASSERT3P(ve->ve_fill_io, ==, NULL);
+ ASSERT3P(ve->ve_abd, !=, NULL);
avl_remove(&vc->vc_lastused_tree, ve);
avl_remove(&vc->vc_offset_tree, ve);
- zio_buf_free(ve->ve_data, VCBS);
+ abd_free(ve->ve_abd);
kmem_free(ve, sizeof (vdev_cache_entry_t));
}
@@ -176,14 +177,14 @@ vdev_cache_allocate(zio_t *zio)
ve = avl_first(&vc->vc_lastused_tree);
if (ve->ve_fill_io != NULL)
return (NULL);
- ASSERT(ve->ve_hits != 0);
+ ASSERT3U(ve->ve_hits, !=, 0);
vdev_cache_evict(vc, ve);
}
ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
ve->ve_offset = offset;
ve->ve_lastused = ddi_get_lbolt();
- ve->ve_data = zio_buf_alloc(VCBS);
+ ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE);
avl_add(&vc->vc_offset_tree, ve);
avl_add(&vc->vc_lastused_tree, ve);
@@ -197,7 +198,7 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
ASSERT(MUTEX_HELD(&vc->vc_lock));
- ASSERT(ve->ve_fill_io == NULL);
+ ASSERT3P(ve->ve_fill_io, ==, NULL);
if (ve->ve_lastused != ddi_get_lbolt()) {
avl_remove(&vc->vc_lastused_tree, ve);
@@ -206,7 +207,7 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
}
ve->ve_hits++;
- bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size);
+ abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size);
}
/*
@@ -220,16 +221,16 @@ vdev_cache_fill(zio_t *fio)
vdev_cache_entry_t *ve = fio->io_private;
zio_t *pio;
- ASSERT(fio->io_size == VCBS);
+ ASSERT3U(fio->io_size, ==, VCBS);
/*
* Add data to the cache.
*/
mutex_enter(&vc->vc_lock);
- ASSERT(ve->ve_fill_io == fio);
- ASSERT(ve->ve_offset == fio->io_offset);
- ASSERT(ve->ve_data == fio->io_data);
+ ASSERT3P(ve->ve_fill_io, ==, fio);
+ ASSERT3U(ve->ve_offset, ==, fio->io_offset);
+ ASSERT3P(ve->ve_abd, ==, fio->io_abd);
ve->ve_fill_io = NULL;
@@ -260,7 +261,7 @@ vdev_cache_read(zio_t *zio)
uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
zio_t *fio;
- ASSERT(zio->io_type == ZIO_TYPE_READ);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
return (B_FALSE);
@@ -274,7 +275,7 @@ vdev_cache_read(zio_t *zio)
if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
return (B_FALSE);
- ASSERT(cache_phase + zio->io_size <= VCBS);
+ ASSERT3U(cache_phase + zio->io_size, <=, VCBS);
mutex_enter(&vc->vc_lock);
@@ -311,7 +312,7 @@ vdev_cache_read(zio_t *zio)
}
fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
- ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW,
+ ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW,
ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);
ve->ve_fill_io = fio;
@@ -339,7 +340,7 @@ vdev_cache_write(zio_t *zio)
uint64_t max_offset = P2ROUNDUP(io_end, VCBS);
avl_index_t where;
- ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
mutex_enter(&vc->vc_lock);
@@ -356,8 +357,9 @@ vdev_cache_write(zio_t *zio)
if (ve->ve_fill_io != NULL) {
ve->ve_missed_update = 1;
} else {
- bcopy((char *)zio->io_data + start - io_start,
- ve->ve_data + start - ve->ve_offset, end - start);
+ abd_copy_off(ve->ve_abd, zio->io_abd,
+ start - ve->ve_offset, start - io_start,
+ end - start);
}
ve = AVL_NEXT(&vc->vc_offset_tree, ve);
}
diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c
index d6c16fce75..056d356f27 100644
--- a/usr/src/uts/common/fs/zfs/vdev_disk.c
+++ b/usr/src/uts/common/fs/zfs/vdev_disk.c
@@ -31,6 +31,7 @@
#include <sys/refcount.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
+#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/sunldi.h>
@@ -667,6 +668,12 @@ vdev_disk_io_intr(buf_t *bp)
if (zio->io_error == 0 && bp->b_resid != 0)
zio->io_error = SET_ERROR(EIO);
+ if (zio->io_type == ZIO_TYPE_READ) {
+ abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size);
+ } else {
+ abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size);
+ }
+
kmem_free(vb, sizeof (vdev_buf_t));
zio_delay_interrupt(zio);
@@ -778,7 +785,15 @@ vdev_disk_io_start(zio_t *zio)
if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
bp->b_flags |= B_FAILFAST;
bp->b_bcount = zio->io_size;
- bp->b_un.b_addr = zio->io_data;
+
+ if (zio->io_type == ZIO_TYPE_READ) {
+ bp->b_un.b_addr =
+ abd_borrow_buf(zio->io_abd, zio->io_size);
+ } else {
+ bp->b_un.b_addr =
+ abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+ }
+
bp->b_lblkno = lbtodb(zio->io_offset);
bp->b_bufsize = zio->io_size;
bp->b_iodone = (int (*)())vdev_disk_io_intr;
diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c
index 633621b0dd..147e693967 100644
--- a/usr/src/uts/common/fs/zfs/vdev_file.c
+++ b/usr/src/uts/common/fs/zfs/vdev_file.c
@@ -31,6 +31,7 @@
#include <sys/zio.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
+#include <sys/abd.h>
/*
* Virtual device vector for files.
@@ -157,6 +158,12 @@ vdev_file_io_intr(buf_t *bp)
if (zio->io_error == 0 && bp->b_resid != 0)
zio->io_error = SET_ERROR(ENOSPC);
+ if (zio->io_type == ZIO_TYPE_READ) {
+ abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size);
+ } else {
+ abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size);
+ }
+
kmem_free(vb, sizeof (vdev_buf_t));
zio_delay_interrupt(zio);
}
@@ -222,7 +229,15 @@ vdev_file_io_start(zio_t *zio)
bioinit(bp);
bp->b_flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
bp->b_bcount = zio->io_size;
- bp->b_un.b_addr = zio->io_data;
+
+ if (zio->io_type == ZIO_TYPE_READ) {
+ bp->b_un.b_addr =
+ abd_borrow_buf(zio->io_abd, zio->io_size);
+ } else {
+ bp->b_un.b_addr =
+ abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+ }
+
bp->b_lblkno = lbtodb(zio->io_offset);
bp->b_bufsize = zio->io_size;
bp->b_private = vf->vf_vnode;
diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c
index 866046315c..b76589f0f6 100644
--- a/usr/src/uts/common/fs/zfs/vdev_label.c
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c
@@ -145,6 +145,7 @@
#include <sys/metaslab.h>
#include <sys/zio.h>
#include <sys/dsl_scan.h>
+#include <sys/abd.h>
#include <sys/fs/zfs.h>
/*
@@ -178,7 +179,7 @@ vdev_label_number(uint64_t psize, uint64_t offset)
}
static void
-vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
+vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
uint64_t size, zio_done_func_t *done, void *private, int flags)
{
ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) ==
@@ -192,7 +193,7 @@ vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
}
static void
-vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
+vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
uint64_t size, zio_done_func_t *done, void *private, int flags)
{
ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL ||
@@ -444,6 +445,7 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg)
spa_t *spa = vd->vdev_spa;
nvlist_t *config = NULL;
vdev_phys_t *vp;
+ abd_t *vp_abd;
zio_t *zio;
uint64_t best_txg = 0;
int error = 0;
@@ -455,7 +457,8 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg)
if (!vdev_readable(vd))
return (NULL);
- vp = zio_buf_alloc(sizeof (vdev_phys_t));
+ vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
+ vp = abd_to_buf(vp_abd);
retry:
for (int l = 0; l < VDEV_LABELS; l++) {
@@ -463,7 +466,7 @@ retry:
zio = zio_root(spa, NULL, NULL, flags);
- vdev_label_read(zio, vd, l, vp,
+ vdev_label_read(zio, vd, l, vp_abd,
offsetof(vdev_label_t, vl_vdev_phys),
sizeof (vdev_phys_t), NULL, NULL, flags);
@@ -502,7 +505,7 @@ retry:
goto retry;
}
- zio_buf_free(vp, sizeof (vdev_phys_t));
+ abd_free(vp_abd);
return (config);
}
@@ -636,8 +639,10 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
spa_t *spa = vd->vdev_spa;
nvlist_t *label;
vdev_phys_t *vp;
- char *pad2;
+ abd_t *vp_abd;
+ abd_t *pad2;
uberblock_t *ub;
+ abd_t *ub_abd;
zio_t *zio;
char *buf;
size_t buflen;
@@ -719,8 +724,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
/*
* Initialize its label.
*/
- vp = zio_buf_alloc(sizeof (vdev_phys_t));
- bzero(vp, sizeof (vdev_phys_t));
+ vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
+ abd_zero(vp_abd, sizeof (vdev_phys_t));
+ vp = abd_to_buf(vp_abd);
/*
* Generate a label describing the pool and our top-level vdev.
@@ -780,7 +786,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
if (error != 0) {
nvlist_free(label);
- zio_buf_free(vp, sizeof (vdev_phys_t));
+ abd_free(vp_abd);
/* EFAULT means nvlist_pack ran out of room */
return (error == EFAULT ? ENAMETOOLONG : EINVAL);
}
@@ -788,14 +794,15 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
/*
* Initialize uberblock template.
*/
- ub = zio_buf_alloc(VDEV_UBERBLOCK_RING);
- bzero(ub, VDEV_UBERBLOCK_RING);
- *ub = spa->spa_uberblock;
+ ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE);
+ abd_zero(ub_abd, VDEV_UBERBLOCK_RING);
+ abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t));
+ ub = abd_to_buf(ub_abd);
ub->ub_txg = 0;
/* Initialize the 2nd padding area. */
- pad2 = zio_buf_alloc(VDEV_PAD_SIZE);
- bzero(pad2, VDEV_PAD_SIZE);
+ pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
+ abd_zero(pad2, VDEV_PAD_SIZE);
/*
* Write everything in parallel.
@@ -805,7 +812,7 @@ retry:
for (int l = 0; l < VDEV_LABELS; l++) {
- vdev_label_write(zio, vd, l, vp,
+ vdev_label_write(zio, vd, l, vp_abd,
offsetof(vdev_label_t, vl_vdev_phys),
sizeof (vdev_phys_t), NULL, NULL, flags);
@@ -818,7 +825,7 @@ retry:
offsetof(vdev_label_t, vl_pad2),
VDEV_PAD_SIZE, NULL, NULL, flags);
- vdev_label_write(zio, vd, l, ub,
+ vdev_label_write(zio, vd, l, ub_abd,
offsetof(vdev_label_t, vl_uberblock),
VDEV_UBERBLOCK_RING, NULL, NULL, flags);
}
@@ -831,9 +838,9 @@ retry:
}
nvlist_free(label);
- zio_buf_free(pad2, VDEV_PAD_SIZE);
- zio_buf_free(ub, VDEV_UBERBLOCK_RING);
- zio_buf_free(vp, sizeof (vdev_phys_t));
+ abd_free(pad2);
+ abd_free(ub_abd);
+ abd_free(vp_abd);
/*
* If this vdev hasn't been previously identified as a spare, then we
@@ -897,7 +904,7 @@ vdev_uberblock_load_done(zio_t *zio)
vdev_t *vd = zio->io_vd;
spa_t *spa = zio->io_spa;
zio_t *rio = zio->io_private;
- uberblock_t *ub = zio->io_data;
+ uberblock_t *ub = abd_to_buf(zio->io_abd);
struct ubl_cbdata *cbp = rio->io_private;
ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd));
@@ -918,7 +925,7 @@ vdev_uberblock_load_done(zio_t *zio)
mutex_exit(&rio->io_lock);
}
- zio_buf_free(zio->io_data, zio->io_size);
+ abd_free(zio->io_abd);
}
static void
@@ -932,8 +939,8 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
for (int l = 0; l < VDEV_LABELS; l++) {
for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
vdev_label_read(zio, vd, l,
- zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)),
- VDEV_UBERBLOCK_OFFSET(vd, n),
+ abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd),
+ B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n),
VDEV_UBERBLOCK_SIZE(vd),
vdev_uberblock_load_done, zio, flags);
}
@@ -1000,9 +1007,6 @@ vdev_uberblock_sync_done(zio_t *zio)
static void
vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
{
- uberblock_t *ubbuf;
- int n;
-
for (int c = 0; c < vd->vdev_children; c++)
vdev_uberblock_sync(zio, ub, vd->vdev_child[c], flags);
@@ -1012,19 +1016,20 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
if (!vdev_writeable(vd))
return;
- n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1);
+ int n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1);
- ubbuf = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd));
- bzero(ubbuf, VDEV_UBERBLOCK_SIZE(vd));
- *ubbuf = *ub;
+ /* Copy the uberblock_t into the ABD */
+ abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
+ abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
+ abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
for (int l = 0; l < VDEV_LABELS; l++)
- vdev_label_write(zio, vd, l, ubbuf,
+ vdev_label_write(zio, vd, l, ub_abd,
VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd),
vdev_uberblock_sync_done, zio->io_private,
flags | ZIO_FLAG_DONT_PROPAGATE);
- zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd));
+ abd_free(ub_abd);
}
/* Sync the uberblocks to all vdevs in svd[] */
@@ -1100,6 +1105,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
{
nvlist_t *label;
vdev_phys_t *vp;
+ abd_t *vp_abd;
char *buf;
size_t buflen;
@@ -1117,15 +1123,16 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
*/
label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE);
- vp = zio_buf_alloc(sizeof (vdev_phys_t));
- bzero(vp, sizeof (vdev_phys_t));
+ vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
+ abd_zero(vp_abd, sizeof (vdev_phys_t));
+ vp = abd_to_buf(vp_abd);
buf = vp->vp_nvlist;
buflen = sizeof (vp->vp_nvlist);
if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0) {
for (; l < VDEV_LABELS; l += 2) {
- vdev_label_write(zio, vd, l, vp,
+ vdev_label_write(zio, vd, l, vp_abd,
offsetof(vdev_label_t, vl_vdev_phys),
sizeof (vdev_phys_t),
vdev_label_sync_done, zio->io_private,
@@ -1133,7 +1140,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
}
}
- zio_buf_free(vp, sizeof (vdev_phys_t));
+ abd_free(vp_abd);
nvlist_free(label);
}
diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c
index b038ef6f67..a57bd6c73b 100644
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c
@@ -31,6 +31,7 @@
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
+#include <sys/abd.h>
#include <sys/fs/zfs.h>
/*
@@ -196,13 +197,12 @@ vdev_mirror_scrub_done(zio_t *zio)
while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
mutex_enter(&pio->io_lock);
ASSERT3U(zio->io_size, >=, pio->io_size);
- bcopy(zio->io_data, pio->io_data, pio->io_size);
+ abd_copy(pio->io_abd, zio->io_abd, pio->io_size);
mutex_exit(&pio->io_lock);
}
mutex_exit(&zio->io_lock);
}
-
- zio_buf_free(zio->io_data, zio->io_size);
+ abd_free(zio->io_abd);
mc->mc_error = zio->io_error;
mc->mc_tried = 1;
@@ -282,7 +282,8 @@ vdev_mirror_io_start(zio_t *zio)
mc = &mm->mm_child[c];
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset,
- zio_buf_alloc(zio->io_size), zio->io_size,
+ abd_alloc_sametype(zio->io_abd,
+ zio->io_size), zio->io_size,
zio->io_type, zio->io_priority, 0,
vdev_mirror_scrub_done, mc));
}
@@ -307,7 +308,7 @@ vdev_mirror_io_start(zio_t *zio)
while (children--) {
mc = &mm->mm_child[c];
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
- mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
+ mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
zio->io_type, zio->io_priority, 0,
vdev_mirror_child_done, mc));
c++;
@@ -392,7 +393,7 @@ vdev_mirror_io_done(zio_t *zio)
mc = &mm->mm_child[c];
zio_vdev_io_redone(zio);
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
- mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
+ mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
ZIO_TYPE_READ, zio->io_priority, 0,
vdev_mirror_child_done, mc));
return;
@@ -433,7 +434,7 @@ vdev_mirror_io_done(zio_t *zio)
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset,
- zio->io_data, zio->io_size,
+ zio->io_abd, zio->io_size,
ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c
index 5211996f6a..9665ce9957 100644
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c
@@ -37,6 +37,7 @@
#include <sys/dsl_pool.h>
#include <sys/zfs_zone.h>
#include <sys/metaslab_impl.h>
+#include <sys/abd.h>
/*
* ZFS I/O Scheduler
@@ -377,12 +378,12 @@ vdev_queue_agg_io_done(zio_t *aio)
zio_t *pio;
zio_link_t *zl = NULL;
while ((pio = zio_walk_parents(aio, &zl)) != NULL) {
- bcopy((char *)aio->io_data + (pio->io_offset -
- aio->io_offset), pio->io_data, pio->io_size);
+ abd_copy_off(pio->io_abd, aio->io_abd,
+ 0, pio->io_offset - aio->io_offset, pio->io_size);
}
}
- zio_buf_free(aio->io_data, aio->io_size);
+ abd_free(aio->io_abd);
}
static int
@@ -617,8 +618,8 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
ASSERT3U(size, <=, zfs_vdev_aggregation_limit);
aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
- zio_buf_alloc(size), size, first->io_type, zio->io_priority,
- flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
+ abd_alloc_for_io(size, B_TRUE), size, first->io_type,
+ zio->io_priority, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
vdev_queue_agg_io_done, NULL);
aio->io_timestamp = first->io_timestamp;
@@ -630,12 +631,11 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
if (dio->io_flags & ZIO_FLAG_NODATA) {
ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
- bzero((char *)aio->io_data + (dio->io_offset -
- aio->io_offset), dio->io_size);
+ abd_zero_off(aio->io_abd,
+ dio->io_offset - aio->io_offset, dio->io_size);
} else if (dio->io_type == ZIO_TYPE_WRITE) {
- bcopy(dio->io_data, (char *)aio->io_data +
- (dio->io_offset - aio->io_offset),
- dio->io_size);
+ abd_copy_off(aio->io_abd, dio->io_abd,
+ dio->io_offset - aio->io_offset, 0, dio->io_size);
}
zio_add_child(dio, aio);
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c
index ff06896e8d..4b77438877 100644
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c
@@ -34,6 +34,7 @@
#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
+#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
@@ -108,7 +109,7 @@ typedef struct raidz_col {
uint64_t rc_devidx; /* child device index for I/O */
uint64_t rc_offset; /* device offset */
uint64_t rc_size; /* I/O size */
- void *rc_data; /* I/O data */
+ abd_t *rc_abd; /* I/O data */
void *rc_gdata; /* used to store the "good" version */
int rc_error; /* I/O error for this device */
uint8_t rc_tried; /* Did we attempt this I/O column? */
@@ -125,7 +126,7 @@ typedef struct raidz_map {
uint64_t rm_firstdatacol; /* First data column/parity count */
uint64_t rm_nskip; /* Skipped sectors for padding */
uint64_t rm_skipstart; /* Column index of padding start */
- void *rm_datacopy; /* rm_asize-buffer of copied data */
+ abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */
uintptr_t rm_reports; /* # of referencing checksum reports */
uint8_t rm_freed; /* map no longer has referencing ZIO */
uint8_t rm_ecksuminjected; /* checksum error was injected */
@@ -265,7 +266,7 @@ vdev_raidz_map_free(raidz_map_t *rm)
size_t size;
for (c = 0; c < rm->rm_firstdatacol; c++) {
- zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
+ abd_free(rm->rm_col[c].rc_abd);
if (rm->rm_col[c].rc_gdata != NULL)
zio_buf_free(rm->rm_col[c].rc_gdata,
@@ -273,11 +274,13 @@ vdev_raidz_map_free(raidz_map_t *rm)
}
size = 0;
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ abd_put(rm->rm_col[c].rc_abd);
size += rm->rm_col[c].rc_size;
+ }
- if (rm->rm_datacopy != NULL)
- zio_buf_free(rm->rm_datacopy, size);
+ if (rm->rm_abd_copy != NULL)
+ abd_free(rm->rm_abd_copy);
kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}
@@ -314,7 +317,7 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
size_t x;
const char *good = NULL;
- const char *bad = rm->rm_col[c].rc_data;
+ char *bad;
if (good_data == NULL) {
zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
@@ -328,8 +331,9 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
* data never changes for a given logical ZIO)
*/
if (rm->rm_col[0].rc_gdata == NULL) {
- char *bad_parity[VDEV_RAIDZ_MAXPARITY];
+ abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
char *buf;
+ int offset;
/*
* Set up the rm_col[]s to generate the parity for
@@ -337,15 +341,20 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
* replacing them with buffers to hold the result.
*/
for (x = 0; x < rm->rm_firstdatacol; x++) {
- bad_parity[x] = rm->rm_col[x].rc_data;
- rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
+ bad_parity[x] = rm->rm_col[x].rc_abd;
+ rm->rm_col[x].rc_gdata =
zio_buf_alloc(rm->rm_col[x].rc_size);
+ rm->rm_col[x].rc_abd =
+ abd_get_from_buf(rm->rm_col[x].rc_gdata,
+ rm->rm_col[x].rc_size);
}
/* fill in the data columns from good_data */
buf = (char *)good_data;
for (; x < rm->rm_cols; x++) {
- rm->rm_col[x].rc_data = buf;
+ abd_put(rm->rm_col[x].rc_abd);
+ rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
+ rm->rm_col[x].rc_size);
buf += rm->rm_col[x].rc_size;
}
@@ -355,13 +364,17 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
vdev_raidz_generate_parity(rm);
/* restore everything back to its original state */
- for (x = 0; x < rm->rm_firstdatacol; x++)
- rm->rm_col[x].rc_data = bad_parity[x];
+ for (x = 0; x < rm->rm_firstdatacol; x++) {
+ abd_put(rm->rm_col[x].rc_abd);
+ rm->rm_col[x].rc_abd = bad_parity[x];
+ }
- buf = rm->rm_datacopy;
+ offset = 0;
for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
- rm->rm_col[x].rc_data = buf;
- buf += rm->rm_col[x].rc_size;
+ abd_put(rm->rm_col[x].rc_abd);
+ rm->rm_col[x].rc_abd = abd_get_offset(
+ rm->rm_abd_copy, offset);
+ offset += rm->rm_col[x].rc_size;
}
}
@@ -375,8 +388,10 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
good += rm->rm_col[x].rc_size;
}
+ bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
/* we drop the ereport if it ends up that the data was good */
zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
+ abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
}
/*
@@ -389,7 +404,7 @@ static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
size_t c = (size_t)(uintptr_t)arg;
- caddr_t buf;
+ size_t offset;
raidz_map_t *rm = zio->io_vsd;
size_t size;
@@ -403,7 +418,7 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
rm->rm_reports++;
ASSERT3U(rm->rm_reports, >, 0);
- if (rm->rm_datacopy != NULL)
+ if (rm->rm_abd_copy != NULL)
return;
/*
@@ -419,17 +434,20 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
size += rm->rm_col[c].rc_size;
- buf = rm->rm_datacopy = zio_buf_alloc(size);
+ rm->rm_abd_copy =
+ abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);
- for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
raidz_col_t *col = &rm->rm_col[c];
+ abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);
- bcopy(col->rc_data, buf, col->rc_size);
- col->rc_data = buf;
+ abd_copy(tmp, col->rc_abd, col->rc_size);
+ abd_put(col->rc_abd);
+ col->rc_abd = tmp;
- buf += col->rc_size;
+ offset += col->rc_size;
}
- ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
+ ASSERT3U(offset, ==, size);
}
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
@@ -442,7 +460,7 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
* the number of children in the target vdev.
*/
static raidz_map_t *
-vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
+vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
raidz_map_t *rm;
@@ -455,6 +473,7 @@ vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
/* The starting byte offset on each child vdev. */
uint64_t o = (b / dcols) << unit_shift;
uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
+ uint64_t off = 0;
/*
* "Quotient": The number of data sectors for this stripe on all but
@@ -499,7 +518,7 @@ vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
rm->rm_missingdata = 0;
rm->rm_missingparity = 0;
rm->rm_firstdatacol = nparity;
- rm->rm_datacopy = NULL;
+ rm->rm_abd_copy = NULL;
rm->rm_reports = 0;
rm->rm_freed = 0;
rm->rm_ecksuminjected = 0;
@@ -515,7 +534,7 @@ vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
}
rm->rm_col[c].rc_devidx = col;
rm->rm_col[c].rc_offset = coff;
- rm->rm_col[c].rc_data = NULL;
+ rm->rm_col[c].rc_abd = NULL;
rm->rm_col[c].rc_gdata = NULL;
rm->rm_col[c].rc_error = 0;
rm->rm_col[c].rc_tried = 0;
@@ -538,13 +557,16 @@ vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
ASSERT3U(rm->rm_nskip, <=, nparity);
for (c = 0; c < rm->rm_firstdatacol; c++)
- rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
+ rm->rm_col[c].rc_abd =
+ abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
- rm->rm_col[c].rc_data = data;
+ rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
+ off = rm->rm_col[c].rc_size;
- for (c = c + 1; c < acols; c++)
- rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
- rm->rm_col[c - 1].rc_size;
+ for (c = c + 1; c < acols; c++) {
+ rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
+ off += rm->rm_col[c].rc_size;
+ }
/*
* If all data stored spans all columns, there's a danger that parity
@@ -584,29 +606,84 @@ vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
return (rm);
}
+/*
+ * Cursor state passed through the "private" argument to vdev_raidz_p_func(),
+ * vdev_raidz_pq_func() and vdev_raidz_pqr_func().  Each callback advances
+ * the pointers it uses, so the position within the parity buffers carries
+ * over from one callback invocation to the next.  Pointers for parity
+ * columns that are not being generated are left NULL (the callbacks ASSERT
+ * this).
+ */
+struct pqr_struct {
+ uint64_t *p;
+ uint64_t *q;
+ uint64_t *r;
+};
+
+/*
+ * abd_iterate_func() callback (see vdev_raidz_generate_parity_p()): XOR one
+ * chunk of data-column bytes into the P parity buffer.
+ *
+ * buf and size describe the source chunk; private points at a
+ * struct pqr_struct in which only p is set.  The chunk is processed as
+ * size / sizeof (uint64_t) whole words.  pqr->p is advanced in place, so the
+ * next invocation continues at the following parity word.  Always returns 0.
+ */
+static int
+vdev_raidz_p_func(void *buf, size_t size, void *private)
+{
+ struct pqr_struct *pqr = private;
+ const uint64_t *src = buf;
+ int i, cnt = size / sizeof (src[0]);
+
+ ASSERT(pqr->p && !pqr->q && !pqr->r);
+
+ for (i = 0; i < cnt; i++, src++, pqr->p++)
+ *pqr->p ^= *src;
+
+ return (0);
+}
+
+/*
+ * abd_iterate_func() callback (see vdev_raidz_generate_parity_pq()): fold one
+ * chunk of data-column bytes into both the P and Q parity buffers.
+ *
+ * private points at a struct pqr_struct with p and q set and r NULL.  For
+ * every uint64_t word of the chunk, P is XORed with the source word, while Q
+ * is first run through VDEV_RAIDZ_64MUL_2() and then XORed with the source
+ * word.  NOTE(review): the macro is defined outside this view; presumably it
+ * is the "multiply the previous result" step of the parity algorithm.
+ * pqr->p and pqr->q are advanced in place.  Always returns 0.
+ */
+static int
+vdev_raidz_pq_func(void *buf, size_t size, void *private)
+{
+ struct pqr_struct *pqr = private;
+ const uint64_t *src = buf;
+ uint64_t mask;
+ int i, cnt = size / sizeof (src[0]);
+
+ ASSERT(pqr->p && pqr->q && !pqr->r);
+
+ for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
+ *pqr->p ^= *src;
+ VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
+ *pqr->q ^= *src;
+ }
+
+ return (0);
+}
+
+/*
+ * abd_iterate_func() callback (see vdev_raidz_generate_parity_pqr()): fold
+ * one chunk of data-column bytes into the P, Q and R parity buffers.
+ *
+ * private points at a struct pqr_struct with all three pointers set.  For
+ * every uint64_t word of the chunk, P is XORed with the source word; Q is
+ * run through VDEV_RAIDZ_64MUL_2() and R through VDEV_RAIDZ_64MUL_4() before
+ * each is XORed with the source word.  All three cursors are advanced in
+ * place.  Always returns 0.
+ */
+static int
+vdev_raidz_pqr_func(void *buf, size_t size, void *private)
+{
+ struct pqr_struct *pqr = private;
+ const uint64_t *src = buf;
+ uint64_t mask;
+ int i, cnt = size / sizeof (src[0]);
+
+ ASSERT(pqr->p && pqr->q && pqr->r);
+
+ for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
+ *pqr->p ^= *src;
+ VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
+ *pqr->q ^= *src;
+ VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
+ *pqr->r ^= *src;
+ }
+
+ return (0);
+}
+
static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
- uint64_t *p, *src, pcount, ccount, i;
+ uint64_t *p;
int c;
-
- pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+ abd_t *src;
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_data;
- p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
+ src = rm->rm_col[c].rc_abd;
+ p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
if (c == rm->rm_firstdatacol) {
- ASSERT(ccount == pcount);
- for (i = 0; i < ccount; i++, src++, p++) {
- *p = *src;
- }
+ abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
} else {
- ASSERT(ccount <= pcount);
- for (i = 0; i < ccount; i++, src++, p++) {
- *p ^= *src;
- }
+ struct pqr_struct pqr = { p, NULL, NULL };
+ (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
+ vdev_raidz_p_func, &pqr);
}
}
}
@@ -614,50 +691,43 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm)
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
- uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
+ uint64_t *p, *q, pcnt, ccnt, mask, i;
int c;
+ abd_t *src;
- pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+ pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
rm->rm_col[VDEV_RAIDZ_Q].rc_size);
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_data;
- p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+ src = rm->rm_col[c].rc_abd;
+ p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
+ q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
+
+ ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
- ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
if (c == rm->rm_firstdatacol) {
- ASSERT(ccnt == pcnt || ccnt == 0);
- for (i = 0; i < ccnt; i++, src++, p++, q++) {
- *p = *src;
- *q = *src;
- }
- for (; i < pcnt; i++, src++, p++, q++) {
- *p = 0;
- *q = 0;
- }
+ abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
+ (void) memcpy(q, p, rm->rm_col[c].rc_size);
} else {
- ASSERT(ccnt <= pcnt);
-
- /*
- * Apply the algorithm described above by multiplying
- * the previous result and adding in the new value.
- */
- for (i = 0; i < ccnt; i++, src++, p++, q++) {
- *p ^= *src;
+ struct pqr_struct pqr = { p, q, NULL };
+ (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
+ vdev_raidz_pq_func, &pqr);
+ }
- VDEV_RAIDZ_64MUL_2(*q, mask);
- *q ^= *src;
+ if (c == rm->rm_firstdatacol) {
+ for (i = ccnt; i < pcnt; i++) {
+ p[i] = 0;
+ q[i] = 0;
}
-
+ } else {
/*
* Treat short columns as though they are full of 0s.
* Note that there's therefore nothing needed for P.
*/
- for (; i < pcnt; i++, q++) {
- VDEV_RAIDZ_64MUL_2(*q, mask);
+ for (i = ccnt; i < pcnt; i++) {
+ VDEV_RAIDZ_64MUL_2(q[i], mask);
}
}
}
@@ -666,59 +736,48 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm)
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
- uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
+ uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
int c;
+ abd_t *src;
- pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
+ pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
rm->rm_col[VDEV_RAIDZ_Q].rc_size);
ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
rm->rm_col[VDEV_RAIDZ_R].rc_size);
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_data;
- p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
- r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
+ src = rm->rm_col[c].rc_abd;
+ p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
+ q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
+ r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
- ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
+ ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
if (c == rm->rm_firstdatacol) {
- ASSERT(ccnt == pcnt || ccnt == 0);
- for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
- *p = *src;
- *q = *src;
- *r = *src;
- }
- for (; i < pcnt; i++, src++, p++, q++, r++) {
- *p = 0;
- *q = 0;
- *r = 0;
- }
+ abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
+ (void) memcpy(q, p, rm->rm_col[c].rc_size);
+ (void) memcpy(r, p, rm->rm_col[c].rc_size);
} else {
- ASSERT(ccnt <= pcnt);
-
- /*
- * Apply the algorithm described above by multiplying
- * the previous result and adding in the new value.
- */
- for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
- *p ^= *src;
-
- VDEV_RAIDZ_64MUL_2(*q, mask);
- *q ^= *src;
+ struct pqr_struct pqr = { p, q, r };
+ (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
+ vdev_raidz_pqr_func, &pqr);
+ }
- VDEV_RAIDZ_64MUL_4(*r, mask);
- *r ^= *src;
+ if (c == rm->rm_firstdatacol) {
+ for (i = ccnt; i < pcnt; i++) {
+ p[i] = 0;
+ q[i] = 0;
+ r[i] = 0;
}
-
+ } else {
/*
* Treat short columns as though they are full of 0s.
* Note that there's therefore nothing needed for P.
*/
- for (; i < pcnt; i++, q++, r++) {
- VDEV_RAIDZ_64MUL_2(*q, mask);
- VDEV_RAIDZ_64MUL_4(*r, mask);
+ for (i = ccnt; i < pcnt; i++) {
+ VDEV_RAIDZ_64MUL_2(q[i], mask);
+ VDEV_RAIDZ_64MUL_4(r[i], mask);
}
}
}
@@ -746,40 +805,153 @@ vdev_raidz_generate_parity(raidz_map_t *rm)
}
}
+/*
+ * abd_iterate_func2() callback used by vdev_raidz_reconstruct_p(): XOR one
+ * chunk of a surviving data column (sbuf) into the matching chunk of the
+ * column being rebuilt (dbuf), size / sizeof (uint64_t) words at a time.
+ * private is unused (the caller passes NULL).  Always returns 0.
+ */
+/* ARGSUSED */
+static int
+vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
+{
+ uint64_t *dst = dbuf;
+ uint64_t *src = sbuf;
+ int cnt = size / sizeof (src[0]);
+
+ for (int i = 0; i < cnt; i++) {
+ dst[i] ^= src[i];
+ }
+
+ return (0);
+}
+
+/*
+ * abd_iterate_func2() callback used by vdev_raidz_reconstruct_q() for the
+ * part of the target column that overlaps a source column.  For every
+ * uint64_t word, the running value in dbuf is passed through
+ * VDEV_RAIDZ_64MUL_2() and then XORed with the corresponding word of sbuf.
+ * private is unused (the caller passes NULL).  Always returns 0.
+ */
+/* ARGSUSED */
+static int
+vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
+ void *private)
+{
+ uint64_t *dst = dbuf;
+ uint64_t *src = sbuf;
+ uint64_t mask;
+ int cnt = size / sizeof (dst[0]);
+
+ for (int i = 0; i < cnt; i++, dst++, src++) {
+ VDEV_RAIDZ_64MUL_2(*dst, mask);
+ *dst ^= *src;
+ }
+
+ return (0);
+}
+
+/*
+ * abd_iterate_func() callback used by vdev_raidz_reconstruct_q() for the
+ * tail of the target column that extends past the end of a shorter source
+ * column.  There is no source word to XOR in, so only the
+ * VDEV_RAIDZ_64MUL_2() step is applied to each uint64_t word of buf.
+ * private is unused (the caller passes NULL).  Always returns 0.
+ */
+/* ARGSUSED */
+static int
+vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
+{
+ uint64_t *dst = buf;
+ uint64_t mask;
+ int cnt = size / sizeof (dst[0]);
+
+ for (int i = 0; i < cnt; i++, dst++) {
+ /* same operation as vdev_raidz_reconst_q_pre_func() on dst */
+ VDEV_RAIDZ_64MUL_2(*dst, mask);
+ }
+
+ return (0);
+}
+
+/*
+ * State passed to vdev_raidz_reconst_q_post_func() via "private".
+ *   q   - cursor into the Q parity buffer; advanced by the callback
+ *   exp - exponent handed to vdev_raidz_exp2() for every rebuilt byte
+ */
+struct reconst_q_struct {
+ uint64_t *q;
+ int exp;
+};
+
+/*
+ * abd_iterate_func() callback for the final pass of
+ * vdev_raidz_reconstruct_q().  For every uint64_t word of buf, XOR in the
+ * corresponding word of Q parity (rq->q), then replace each of the word's
+ * eight bytes with vdev_raidz_exp2(byte, rq->exp).  rq->q is advanced in
+ * place so successive invocations walk through the Q buffer.  Always
+ * returns 0.
+ */
+static int
+vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
+{
+ struct reconst_q_struct *rq = private;
+ uint64_t *dst = buf;
+ int cnt = size / sizeof (dst[0]);
+
+ for (int i = 0; i < cnt; i++, dst++, rq->q++) {
+ *dst ^= *rq->q;
+
+ int j;
+ uint8_t *b;
+ for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
+ *b = vdev_raidz_exp2(*b, rq->exp);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * State passed to vdev_raidz_reconst_pq_func() and
+ * vdev_raidz_reconst_pq_tail_func() via "private".  All four byte cursors
+ * are advanced by the callbacks.
+ *   p, q       - the saved P and Q parity buffers
+ *   pxy, qxy   - the temporary P and Q buffers that
+ *                vdev_raidz_reconstruct_pq() allocates
+ *   aexp, bexp - exponents handed to vdev_raidz_exp2()
+ */
+struct reconst_pq_struct {
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *pxy;
+ uint8_t *qxy;
+ int aexp;
+ int bexp;
+};
+
+/*
+ * abd_iterate_func2() callback used by vdev_raidz_reconstruct_pq() for the
+ * range in which both missing columns have data.  Working one byte at a
+ * time, it computes the x-column byte from (p ^ pxy) and (q ^ qxy) via
+ * vdev_raidz_exp2(), then derives the y-column byte as p ^ pxy ^ x.  The
+ * four cursors in the struct reconst_pq_struct are advanced in place.
+ * Always returns 0.
+ */
+static int
+vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
+{
+ struct reconst_pq_struct *rpq = private;
+ uint8_t *xd = xbuf;
+ uint8_t *yd = ybuf;
+
+ for (int i = 0; i < size;
+ i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
+ *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
+ vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
+ *yd = *rpq->p ^ *rpq->pxy ^ *xd;
+ }
+
+ return (0);
+}
+
+/*
+ * abd_iterate_func() callback used by vdev_raidz_reconstruct_pq() for the
+ * part of column x that extends beyond the end of the shorter column y.
+ * Only the x byte is computed; there is no y byte to store.  The cursors in
+ * the struct reconst_pq_struct continue from where
+ * vdev_raidz_reconst_pq_func() left them and are advanced in place.  Always
+ * returns 0.
+ */
+static int
+vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
+{
+ struct reconst_pq_struct *rpq = private;
+ uint8_t *xd = xbuf;
+
+ for (int i = 0; i < size;
+ i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
+ /* same operation as vdev_raidz_reconst_pq_func() on xd */
+ *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
+ vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
+ }
+
+ return (0);
+}
+
static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
- uint64_t *dst, *src, xcount, ccount, count, i;
int x = tgts[0];
int c;
+ abd_t *dst, *src;
ASSERT(ntgts == 1);
ASSERT(x >= rm->rm_firstdatacol);
ASSERT(x < rm->rm_cols);
- xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
- ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
- ASSERT(xcount > 0);
+ ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
+ ASSERT(rm->rm_col[x].rc_size > 0);
- src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- dst = rm->rm_col[x].rc_data;
- for (i = 0; i < xcount; i++, dst++, src++) {
- *dst = *src;
- }
+ src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
+ dst = rm->rm_col[x].rc_abd;
+
+ abd_copy_from_buf(dst, abd_to_buf(src), rm->rm_col[x].rc_size);
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_data;
- dst = rm->rm_col[x].rc_data;
+ uint64_t size = MIN(rm->rm_col[x].rc_size,
+ rm->rm_col[c].rc_size);
+
+ src = rm->rm_col[c].rc_abd;
+ dst = rm->rm_col[x].rc_abd;
if (c == x)
continue;
- ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
- count = MIN(ccount, xcount);
-
- for (i = 0; i < count; i++, dst++, src++) {
- *dst ^= *src;
- }
+ (void) abd_iterate_func2(dst, src, 0, 0, size,
+ vdev_raidz_reconst_p_func, NULL);
}
return (1 << VDEV_RAIDZ_P);
@@ -788,57 +960,43 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
- uint64_t *dst, *src, xcount, ccount, count, mask, i;
- uint8_t *b;
int x = tgts[0];
- int c, j, exp;
+ int c, exp;
+ abd_t *dst, *src;
ASSERT(ntgts == 1);
- xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
- ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
+ ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
- src = rm->rm_col[c].rc_data;
- dst = rm->rm_col[x].rc_data;
-
- if (c == x)
- ccount = 0;
- else
- ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
+ uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
+ rm->rm_col[c].rc_size);
- count = MIN(ccount, xcount);
+ src = rm->rm_col[c].rc_abd;
+ dst = rm->rm_col[x].rc_abd;
if (c == rm->rm_firstdatacol) {
- for (i = 0; i < count; i++, dst++, src++) {
- *dst = *src;
- }
- for (; i < xcount; i++, dst++) {
- *dst = 0;
- }
-
+ abd_copy(dst, src, size);
+ if (rm->rm_col[x].rc_size > size)
+ abd_zero_off(dst, size,
+ rm->rm_col[x].rc_size - size);
} else {
- for (i = 0; i < count; i++, dst++, src++) {
- VDEV_RAIDZ_64MUL_2(*dst, mask);
- *dst ^= *src;
- }
-
- for (; i < xcount; i++, dst++) {
- VDEV_RAIDZ_64MUL_2(*dst, mask);
- }
+ ASSERT3U(size, <=, rm->rm_col[x].rc_size);
+ (void) abd_iterate_func2(dst, src, 0, 0, size,
+ vdev_raidz_reconst_q_pre_func, NULL);
+ (void) abd_iterate_func(dst,
+ size, rm->rm_col[x].rc_size - size,
+ vdev_raidz_reconst_q_pre_tail_func, NULL);
}
}
- src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
- dst = rm->rm_col[x].rc_data;
+ src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
+ dst = rm->rm_col[x].rc_abd;
exp = 255 - (rm->rm_cols - 1 - x);
- for (i = 0; i < xcount; i++, dst++, src++) {
- *dst ^= *src;
- for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
- *b = vdev_raidz_exp2(*b, exp);
- }
- }
+ struct reconst_q_struct rq = { abd_to_buf(src), exp };
+ (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
+ vdev_raidz_reconst_q_post_func, &rq);
return (1 << VDEV_RAIDZ_Q);
}
@@ -846,11 +1004,12 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
- uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
- void *pdata, *qdata;
- uint64_t xsize, ysize, i;
+ uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
+ abd_t *pdata, *qdata;
+ uint64_t xsize, ysize;
int x = tgts[0];
int y = tgts[1];
+ abd_t *xd, *yd;
ASSERT(ntgts == 2);
ASSERT(x < y);
@@ -866,15 +1025,15 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
* parity so we make those columns appear to be full of zeros by
* setting their lengths to zero.
*/
- pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
+ pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
+ qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
xsize = rm->rm_col[x].rc_size;
ysize = rm->rm_col[y].rc_size;
- rm->rm_col[VDEV_RAIDZ_P].rc_data =
- zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
- rm->rm_col[VDEV_RAIDZ_Q].rc_data =
- zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+ rm->rm_col[VDEV_RAIDZ_P].rc_abd =
+ abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
+ rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
+ abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
rm->rm_col[x].rc_size = 0;
rm->rm_col[y].rc_size = 0;
@@ -883,12 +1042,12 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
rm->rm_col[x].rc_size = xsize;
rm->rm_col[y].rc_size = ysize;
- p = pdata;
- q = qdata;
- pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
- qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
- xd = rm->rm_col[x].rc_data;
- yd = rm->rm_col[y].rc_data;
+ p = abd_to_buf(pdata);
+ q = abd_to_buf(qdata);
+ pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
+ qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
+ xd = rm->rm_col[x].rc_abd;
+ yd = rm->rm_col[y].rc_abd;
/*
* We now have:
@@ -912,24 +1071,21 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
- for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
- *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
- vdev_raidz_exp2(*q ^ *qxy, bexp);
+ ASSERT3U(xsize, >=, ysize);
+ struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
+ (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
+ vdev_raidz_reconst_pq_func, &rpq);
+ (void) abd_iterate_func(xd, ysize, xsize - ysize,
+ vdev_raidz_reconst_pq_tail_func, &rpq);
- if (i < ysize)
- *yd = *p ^ *pxy ^ *xd;
- }
-
- zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
- rm->rm_col[VDEV_RAIDZ_P].rc_size);
- zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
- rm->rm_col[VDEV_RAIDZ_Q].rc_size);
+ abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
+ abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
/*
* Restore the saved parity data.
*/
- rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
- rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
+ rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
+ rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
@@ -1244,7 +1400,7 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
c = used[i];
ASSERT3U(c, <, rm->rm_cols);
- src = rm->rm_col[c].rc_data;
+ src = abd_to_buf(rm->rm_col[c].rc_abd);
ccount = rm->rm_col[c].rc_size;
for (j = 0; j < nmissing; j++) {
cc = missing[j] + rm->rm_firstdatacol;
@@ -1252,7 +1408,7 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
ASSERT3U(cc, <, rm->rm_cols);
ASSERT3U(cc, !=, c);
- dst[j] = rm->rm_col[cc].rc_data;
+ dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
dcount[j] = rm->rm_col[cc].rc_size;
}
@@ -1300,8 +1456,25 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
uint8_t *used;
+ abd_t **bufs = NULL;
+
int code = 0;
+ /*
+ * Matrix reconstruction can't use scatter ABDs yet, so we allocate
+ * temporary linear ABDs.
+ */
+ if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
+ bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ raidz_col_t *col = &rm->rm_col[c];
+
+ bufs[c] = col->rc_abd;
+ col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
+ abd_copy(col->rc_abd, bufs[c], col->rc_size);
+ }
+ }
n = rm->rm_cols - rm->rm_firstdatacol;
@@ -1388,6 +1561,20 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
kmem_free(p, psize);
+ /*
+ * copy back from temporary linear abds and free them
+ */
+ if (bufs) {
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
+ raidz_col_t *col = &rm->rm_col[c];
+
+ abd_copy(bufs[c], col->rc_abd, col->rc_size);
+ abd_free(col->rc_abd);
+ col->rc_abd = bufs[c];
+ }
+ kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
+ }
+
return (code);
}
@@ -1619,7 +1806,9 @@ vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
* treat the on-disk format as if the only blocks are the complete 128
* KB size.
*/
- rm = vdev_raidz_map_alloc(data - (offset - origoffset),
+ abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
+ SPA_OLD_MAXBLOCKSIZE);
+ rm = vdev_raidz_map_alloc(abd,
SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
vd->vdev_children, vd->vdev_nparity);
@@ -1658,13 +1847,14 @@ vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
* example of why this calculation is needed.
*/
+ /*
+ * rc_abd is an abd_t handle, not the column data itself, so
+ * hand vdev_disk_physio() the underlying buffer.  The columns
+ * of this map come from abd_get_from_buf()/abd_alloc_linear().
+ */
if ((err = vdev_disk_physio(cvd,
- ((char *)rc->rc_data) + colskip, colsize,
+ ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
flags, isdump)) != 0)
break;
}
vdev_raidz_map_free(rm);
+ abd_put(abd);
#endif /* KERNEL */
return (err);
@@ -1722,7 +1912,7 @@ vdev_raidz_io_start(zio_t *zio)
raidz_col_t *rc;
int c, i;
- rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
+ rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
tvd->vdev_ashift, vd->vdev_children,
vd->vdev_nparity);
@@ -1738,7 +1928,7 @@ vdev_raidz_io_start(zio_t *zio)
rc = &rm->rm_col[c];
cvd = vd->vdev_child[rc->rc_devidx];
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_data, rc->rc_size,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
zio->io_type, zio->io_priority, 0,
vdev_raidz_child_done, rc));
}
@@ -1795,7 +1985,7 @@ vdev_raidz_io_start(zio_t *zio)
if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_data, rc->rc_size,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
zio->io_type, zio->io_priority, 0,
vdev_raidz_child_done, rc));
}
@@ -1811,6 +2001,7 @@ vdev_raidz_io_start(zio_t *zio)
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
+ void *buf;
vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
@@ -1824,9 +2015,11 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
zbc.zbc_has_cksum = 0;
zbc.zbc_injected = rm->rm_ecksuminjected;
+ buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
zfs_ereport_post_checksum(zio->io_spa, vd, zio,
- rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
+ rc->rc_offset, rc->rc_size, buf, bad_data,
&zbc);
+ abd_return_buf(rc->rc_abd, buf, rc->rc_size);
}
}
@@ -1872,7 +2065,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
if (!rc->rc_tried || rc->rc_error != 0)
continue;
orig[c] = zio_buf_alloc(rc->rc_size);
- bcopy(rc->rc_data, orig[c], rc->rc_size);
+ abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
}
vdev_raidz_generate_parity(rm);
@@ -1881,7 +2074,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
rc = &rm->rm_col[c];
if (!rc->rc_tried || rc->rc_error != 0)
continue;
- if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
+ if (bcmp(orig[c], abd_to_buf(rc->rc_abd), rc->rc_size) != 0) {
raidz_checksum_error(zio, rc, orig[c]);
rc->rc_error = SET_ERROR(ECKSUM);
ret++;
@@ -1989,7 +2182,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
ASSERT3S(c, >=, 0);
ASSERT3S(c, <, rm->rm_cols);
rc = &rm->rm_col[c];
- bcopy(rc->rc_data, orig[i], rc->rc_size);
+ abd_copy_to_buf(orig[i], rc->rc_abd,
+ rc->rc_size);
}
/*
@@ -2020,7 +2214,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
for (i = 0; i < n; i++) {
c = tgts[i];
rc = &rm->rm_col[c];
- bcopy(orig[i], rc->rc_data, rc->rc_size);
+ abd_copy_from_buf(rc->rc_abd, orig[i],
+ rc->rc_size);
}
do {
@@ -2261,7 +2456,7 @@ vdev_raidz_io_done(zio_t *zio)
continue;
zio_nowait(zio_vdev_child_io(zio, NULL,
vd->vdev_child[rc->rc_devidx],
- rc->rc_offset, rc->rc_data, rc->rc_size,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
zio->io_type, zio->io_priority, 0,
vdev_raidz_child_done, rc));
} while (++c < rm->rm_cols);
@@ -2341,7 +2536,7 @@ done:
continue;
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
- rc->rc_offset, rc->rc_data, rc->rc_size,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
index 26c8d793dc..ae72c667cb 100644
--- a/usr/src/uts/common/fs/zfs/zil.c
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -40,6 +40,7 @@
#include <sys/vdev_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
+#include <sys/abd.h>
/*
* The zfs intent log (ZIL) saves transaction records of system calls
@@ -878,6 +879,7 @@ zil_lwb_write_done(zio_t *zio)
* one in zil_commit_writer(). zil_sync() will only remove
* the lwb if lwb_buf is null.
*/
+ abd_put(zio->io_abd);
zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
mutex_enter(&zilog->zl_lock);
lwb->lwb_buf = NULL;
@@ -909,8 +911,10 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
ZIO_FLAG_CANFAIL);
}
if (lwb->lwb_zio == NULL) {
+ abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
+ BP_GET_LSIZE(&lwb->lwb_blk));
lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
- 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
+ 0, &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk),
zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE,
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
}
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index 4bef635b0a..da09434078 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -43,6 +43,7 @@
#include <sys/zfeature.h>
#include <sys/zfs_zone.h>
#include <sys/metaslab_impl.h>
+#include <sys/abd.h>
/*
* ==========================================================================
@@ -274,12 +275,18 @@ zio_data_buf_free(void *buf, size_t size)
* ==========================================================================
*/
void
-zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
+zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
zio_transform_func_t *transform)
{
zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
- zt->zt_orig_data = zio->io_data;
+ /*
+ * Ensure that anyone expecting this zio to contain a linear ABD isn't
+ * going to get a nasty surprise when they try to access the data.
+ */
+ IMPLY(abd_is_linear(zio->io_abd), abd_is_linear(data));
+
+ zt->zt_orig_abd = zio->io_abd;
zt->zt_orig_size = zio->io_size;
zt->zt_bufsize = bufsize;
zt->zt_transform = transform;
@@ -287,7 +294,7 @@ zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
zt->zt_next = zio->io_transform_stack;
zio->io_transform_stack = zt;
- zio->io_data = data;
+ zio->io_abd = data;
zio->io_size = size;
}
@@ -299,12 +306,12 @@ zio_pop_transforms(zio_t *zio)
while ((zt = zio->io_transform_stack) != NULL) {
if (zt->zt_transform != NULL)
zt->zt_transform(zio,
- zt->zt_orig_data, zt->zt_orig_size);
+ zt->zt_orig_abd, zt->zt_orig_size);
if (zt->zt_bufsize != 0)
- zio_buf_free(zio->io_data, zt->zt_bufsize);
+ abd_free(zio->io_abd);
- zio->io_data = zt->zt_orig_data;
+ zio->io_abd = zt->zt_orig_abd;
zio->io_size = zt->zt_orig_size;
zio->io_transform_stack = zt->zt_next;
@@ -318,21 +325,26 @@ zio_pop_transforms(zio_t *zio)
* ==========================================================================
*/
static void
-zio_subblock(zio_t *zio, void *data, uint64_t size)
+zio_subblock(zio_t *zio, abd_t *data, uint64_t size)
{
ASSERT(zio->io_size > size);
if (zio->io_type == ZIO_TYPE_READ)
- bcopy(zio->io_data, data, size);
+ abd_copy(data, zio->io_abd, size);
}
static void
-zio_decompress(zio_t *zio, void *data, uint64_t size)
+zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
{
- if (zio->io_error == 0 &&
- zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
- zio->io_data, data, zio->io_size, size) != 0)
- zio->io_error = SET_ERROR(EIO);
+ if (zio->io_error == 0) {
+ void *tmp = abd_borrow_buf(data, size);
+ int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
+ zio->io_abd, tmp, zio->io_size, size);
+ abd_return_buf_copy(data, tmp, size);
+
+ if (ret != 0)
+ zio->io_error = SET_ERROR(EIO);
+ }
}
/*
@@ -530,7 +542,7 @@ zio_bookmark_compare(const void *x1, const void *x2)
*/
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
- void *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
+ abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
void *private, zio_type_t type, zio_priority_t priority,
enum zio_flag flags, vdev_t *vd, uint64_t offset,
const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline)
@@ -589,7 +601,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_priority = priority;
zio->io_vd = vd;
zio->io_offset = offset;
- zio->io_orig_data = zio->io_data = data;
+ zio->io_orig_abd = zio->io_abd = data;
zio->io_orig_size = zio->io_size = psize;
zio->io_lsize = lsize;
zio->io_orig_flags = zio->io_flags = flags;
@@ -731,7 +743,7 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
- void *data, uint64_t size, zio_done_func_t *done, void *private,
+ abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
{
zio_t *zio;
@@ -749,7 +761,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
- void *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
+ abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
zio_done_func_t *ready, zio_done_func_t *children_ready,
zio_done_func_t *physdone, zio_done_func_t *done,
void *private, zio_priority_t priority, enum zio_flag flags,
@@ -790,7 +802,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
}
zio_t *
-zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
+zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
uint64_t size, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
{
@@ -943,7 +955,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
- void *data, int checksum, zio_done_func_t *done, void *private,
+ abd_t *data, int checksum, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
zio_t *zio;
@@ -964,7 +976,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
- void *data, int checksum, zio_done_func_t *done, void *private,
+ abd_t *data, int checksum, zio_done_func_t *done, void *private,
zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
zio_t *zio;
@@ -987,8 +999,9 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
* Therefore, we must make a local copy in case the data is
* being written to multiple places in parallel.
*/
- void *wbuf = zio_buf_alloc(size);
- bcopy(data, wbuf, size);
+ abd_t *wbuf = abd_alloc_sametype(data, size);
+ abd_copy(wbuf, data, size);
+
zio_push_transform(zio, wbuf, size, size, NULL);
}
@@ -1000,7 +1013,7 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
*/
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
- void *data, uint64_t size, int type, zio_priority_t priority,
+ abd_t *data, uint64_t size, int type, zio_priority_t priority,
enum zio_flag flags, zio_done_func_t *done, void *private)
{
enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
@@ -1065,7 +1078,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
}
zio_t *
-zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
+zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
int type, zio_priority_t priority, enum zio_flag flags,
zio_done_func_t *done, void *private)
{
@@ -1126,14 +1139,17 @@ zio_read_bp_init(zio_t *zio)
!(zio->io_flags & ZIO_FLAG_RAW)) {
uint64_t psize =
BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
- void *cbuf = zio_buf_alloc(psize);
-
- zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
+ zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
+ psize, psize, zio_decompress);
}
if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
- decode_embedded_bp_compressed(bp, zio->io_data);
+
+ int psize = BPE_GET_PSIZE(bp);
+ void *data = abd_borrow_buf(zio->io_abd, psize);
+ decode_embedded_bp_compressed(bp, data);
+ abd_return_buf_copy(zio->io_abd, data, psize);
} else {
ASSERT(!BP_IS_EMBEDDED(bp));
}
@@ -1273,7 +1289,7 @@ zio_write_compress(zio_t *zio)
/* If it's a compressed write that is not raw, compress the buffer. */
if (compress != ZIO_COMPRESS_OFF && psize == lsize) {
void *cbuf = zio_buf_alloc(lsize);
- psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
+ psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize);
if (psize == 0 || psize == lsize) {
compress = ZIO_COMPRESS_OFF;
zio_buf_free(cbuf, lsize);
@@ -1308,9 +1324,11 @@ zio_write_compress(zio_t *zio)
zio_buf_free(cbuf, lsize);
psize = lsize;
} else {
- bzero((char *)cbuf + psize, rounded - psize);
+ abd_t *cdata = abd_get_from_buf(cbuf, lsize);
+ abd_take_ownership_of_buf(cdata, B_TRUE);
+ abd_zero_off(cdata, psize, rounded - psize);
psize = rounded;
- zio_push_transform(zio, cbuf,
+ zio_push_transform(zio, cdata,
psize, lsize, NULL);
}
}
@@ -1830,26 +1848,38 @@ zio_resume_wait(spa_t *spa)
* ==========================================================================
*/
+static void
+zio_gang_issue_func_done(zio_t *zio)
+{
+ abd_put(zio->io_abd);
+}
+
static zio_t *
-zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
+zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
+ uint64_t offset)
{
if (gn != NULL)
return (pio);
- return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
- NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
+ return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset),
+ BP_GET_PSIZE(bp), zio_gang_issue_func_done,
+ NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
&pio->io_bookmark));
}
-zio_t *
-zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
+static zio_t *
+zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
+ uint64_t offset)
{
zio_t *zio;
if (gn != NULL) {
+ abd_t *gbh_abd =
+ abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
- gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
- ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+ gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL,
+ pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
+ &pio->io_bookmark);
/*
* As we rewrite each gang header, the pipeline will compute
* a new gang block header checksum for it; but no one will
@@ -1860,8 +1890,12 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
* this is just good hygiene.)
*/
if (gn != pio->io_gang_leader->io_gang_tree) {
+ abd_t *buf = abd_get_offset(data, offset);
+
zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
- data, BP_GET_PSIZE(bp));
+ buf, BP_GET_PSIZE(bp));
+
+ abd_put(buf);
}
/*
* If we are here to damage data for testing purposes,
@@ -1871,7 +1905,8 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
} else {
zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
- data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
+ abd_get_offset(data, offset), BP_GET_PSIZE(bp),
+ zio_gang_issue_func_done, NULL, pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
}
@@ -1879,16 +1914,18 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
}
/* ARGSUSED */
-zio_t *
-zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
+static zio_t *
+zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
+ uint64_t offset)
{
return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
ZIO_GANG_CHILD_FLAGS(pio)));
}
/* ARGSUSED */
-zio_t *
-zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
+static zio_t *
+zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
+ uint64_t offset)
{
return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
@@ -1950,13 +1987,14 @@ static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
+ abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
ASSERT(gio->io_gang_leader == gio);
ASSERT(BP_IS_GANG(bp));
- zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
- SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
- gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
+ zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE,
+ zio_gang_tree_assemble_done, gn, gio->io_priority,
+ ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}
static void
@@ -1972,13 +2010,16 @@ zio_gang_tree_assemble_done(zio_t *zio)
if (zio->io_error)
return;
+ /* this ABD was created from a linear buf in zio_gang_tree_assemble */
if (BP_SHOULD_BYTESWAP(bp))
- byteswap_uint64_array(zio->io_data, zio->io_size);
+ byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);
- ASSERT(zio->io_data == gn->gn_gbh);
+ ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
+ abd_put(zio->io_abd);
+
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
if (!BP_IS_GANG(gbp))
@@ -1988,7 +2029,8 @@ zio_gang_tree_assemble_done(zio_t *zio)
}
static void
-zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
+zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
+ uint64_t offset)
{
zio_t *gio = pio->io_gang_leader;
zio_t *zio;
@@ -2001,7 +2043,7 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
* If you're a gang header, your data is in gn->gn_gbh.
* If you're a gang member, your data is in 'data' and gn == NULL.
*/
- zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
+ zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset);
if (gn != NULL) {
ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
@@ -2010,13 +2052,14 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
if (BP_IS_HOLE(gbp))
continue;
- zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
- data = (char *)data + BP_GET_PSIZE(gbp);
+ zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data,
+ offset);
+ offset += BP_GET_PSIZE(gbp);
}
}
if (gn == gio->io_gang_tree)
- ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
+ ASSERT3U(gio->io_size, ==, offset);
if (zio != pio)
zio_nowait(zio);
@@ -2049,7 +2092,8 @@ zio_gang_issue(zio_t *zio)
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
- zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
+ zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd,
+ 0);
else
zio_gang_tree_free(&zio->io_gang_tree);
@@ -2088,6 +2132,12 @@ zio_write_gang_member_ready(zio_t *zio)
mutex_exit(&pio->io_lock);
}
+static void
+zio_write_gang_done(zio_t *zio)
+{
+ abd_put(zio->io_abd);
+}
+
static int
zio_write_gang_block(zio_t *pio)
{
@@ -2098,6 +2148,7 @@ zio_write_gang_block(zio_t *pio)
zio_t *zio;
zio_gang_node_t *gn, **gnpp;
zio_gbh_phys_t *gbh;
+ abd_t *gbh_abd;
uint64_t txg = pio->io_txg;
uint64_t resid = pio->io_size;
uint64_t lsize;
@@ -2158,12 +2209,14 @@ zio_write_gang_block(zio_t *pio)
gn = zio_gang_node_alloc(gnpp);
gbh = gn->gn_gbh;
bzero(gbh, SPA_GANGBLOCKSIZE);
+ gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
/*
* Create the gang header.
*/
- zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
- pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+ zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
+ zio_write_gang_done, NULL, pio->io_priority,
+ ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
/*
* Create and nowait the gang children.
@@ -2183,9 +2236,9 @@ zio_write_gang_block(zio_t *pio)
zp.zp_nopwrite = B_FALSE;
zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
- (char *)pio->io_data + (pio->io_size - resid), lsize, lsize,
- &zp, zio_write_gang_member_ready, NULL, NULL, NULL,
- &gn->gn_child[g], pio->io_priority,
+ abd_get_offset(pio->io_abd, pio->io_size - resid), lsize,
+ lsize, &zp, zio_write_gang_member_ready, NULL, NULL,
+ zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
@@ -2298,10 +2351,11 @@ zio_ddt_child_read_done(zio_t *zio)
ddp = ddt_phys_select(dde, bp);
if (zio->io_error == 0)
ddt_phys_clear(ddp); /* this ddp doesn't need repair */
- if (zio->io_error == 0 && dde->dde_repair_data == NULL)
- dde->dde_repair_data = zio->io_data;
+
+ if (zio->io_error == 0 && dde->dde_repair_abd == NULL)
+ dde->dde_repair_abd = zio->io_abd;
else
- zio_buf_free(zio->io_data, zio->io_size);
+ abd_free(zio->io_abd);
mutex_exit(&pio->io_lock);
}
@@ -2333,16 +2387,16 @@ zio_ddt_read_start(zio_t *zio)
ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
&blk);
zio_nowait(zio_read(zio, zio->io_spa, &blk,
- zio_buf_alloc(zio->io_size), zio->io_size,
- zio_ddt_child_read_done, dde, zio->io_priority,
- ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
- &zio->io_bookmark));
+ abd_alloc_for_io(zio->io_size, B_TRUE),
+ zio->io_size, zio_ddt_child_read_done, dde,
+ zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) |
+ ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark));
}
return (ZIO_PIPELINE_CONTINUE);
}
zio_nowait(zio_read(zio, zio->io_spa, bp,
- zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
+ zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
return (ZIO_PIPELINE_CONTINUE);
@@ -2372,8 +2426,9 @@ zio_ddt_read_done(zio_t *zio)
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
return (ZIO_PIPELINE_STOP);
}
- if (dde->dde_repair_data != NULL) {
- bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
+ if (dde->dde_repair_abd != NULL) {
+ abd_copy(zio->io_abd, dde->dde_repair_abd,
+ zio->io_size);
zio->io_child_error[ZIO_CHILD_DDT] = 0;
}
ddt_repair_done(ddt, dde);
@@ -2405,7 +2460,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
if (lio != NULL) {
return (lio->io_orig_size != zio->io_orig_size ||
- bcmp(zio->io_orig_data, lio->io_orig_data,
+ abd_cmp(zio->io_orig_abd, lio->io_orig_abd,
zio->io_orig_size) != 0);
}
}
@@ -2426,17 +2481,17 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
/*
* Intuitively, it would make more sense to compare
- * io_data than io_orig_data in the raw case since you
+ * io_abd than io_orig_abd in the raw case since you
* don't want to look at any transformations that have
* happened to the data. However, for raw I/Os the
- * data will actually be the same in io_data and
- * io_orig_data, so all we have to do is issue this as
+ * data will actually be the same in io_abd and
+ * io_orig_abd, so all we have to do is issue this as
* a raw ARC read.
*/
if (do_raw) {
zio_flags |= ZIO_FLAG_RAW;
ASSERT3U(zio->io_size, ==, zio->io_orig_size);
- ASSERT0(bcmp(zio->io_data, zio->io_orig_data,
+ ASSERT0(abd_cmp(zio->io_abd, zio->io_orig_abd,
zio->io_size));
ASSERT3P(zio->io_transform_stack, ==, NULL);
}
@@ -2447,7 +2502,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
if (error == 0) {
if (arc_buf_size(abuf) != zio->io_orig_size ||
- bcmp(abuf->b_data, zio->io_orig_data,
+ abd_cmp_buf(zio->io_orig_abd, abuf->b_data,
zio->io_orig_size) != 0)
error = SET_ERROR(EEXIST);
arc_buf_destroy(abuf, &abuf);
@@ -2613,12 +2668,12 @@ zio_ddt_write(zio_t *zio)
return (ZIO_PIPELINE_CONTINUE);
}
- dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
+ dio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
zio->io_orig_size, zio->io_orig_size, &czp, NULL, NULL,
NULL, zio_ddt_ditto_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
- zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
+ zio_push_transform(dio, zio->io_abd, zio->io_size, 0, NULL);
dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
}
@@ -2635,13 +2690,13 @@ zio_ddt_write(zio_t *zio)
ddt_phys_fill(ddp, bp);
ddt_phys_addref(ddp);
} else {
- cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
+ cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
zio->io_orig_size, zio->io_orig_size, zp,
zio_ddt_child_write_ready, NULL, NULL,
zio_ddt_child_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
- zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
+ zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
dde->dde_lead_zio[p] = cio;
}
@@ -2981,11 +3036,11 @@ zio_vdev_io_start(zio_t *zio)
P2PHASE(zio->io_size, align) != 0) {
/* Transform logical writes to be a full physical block size. */
uint64_t asize = P2ROUNDUP(zio->io_size, align);
- char *abuf = zio_buf_alloc(asize);
+ abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize);
ASSERT(vd == vd->vdev_top);
if (zio->io_type == ZIO_TYPE_WRITE) {
- bcopy(zio->io_data, abuf, zio->io_size);
- bzero(abuf + zio->io_size, asize - zio->io_size);
+ abd_copy(abuf, zio->io_abd, zio->io_size);
+ abd_zero_off(abuf, zio->io_size, asize - zio->io_size);
}
zio_push_transform(zio, abuf, asize, asize, zio_subblock);
}
@@ -3111,7 +3166,7 @@ zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
{
void *buf = zio_buf_alloc(zio->io_size);
- bcopy(zio->io_data, buf, zio->io_size);
+ abd_copy_to_buf(buf, zio->io_abd, zio->io_size);
zcr->zcr_cbinfo = zio->io_size;
zcr->zcr_cbdata = buf;
@@ -3255,7 +3310,7 @@ zio_checksum_generate(zio_t *zio)
}
}
- zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
+ zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size);
return (ZIO_PIPELINE_CONTINUE);
}
@@ -3394,7 +3449,7 @@ zio_ready(zio_t *zio)
if (BP_IS_GANG(bp)) {
zio->io_flags &= ~ZIO_FLAG_NODATA;
} else {
- ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
+ ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE);
zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
}
}
@@ -3549,21 +3604,28 @@ zio_done(zio_t *zio)
zio_cksum_report_t *zcr = zio->io_cksum_report;
uint64_t align = zcr->zcr_align;
uint64_t asize = P2ROUNDUP(psize, align);
- char *abuf = zio->io_data;
+ char *abuf = NULL;
+ abd_t *adata = zio->io_abd;
if (asize != psize) {
- abuf = zio_buf_alloc(asize);
- bcopy(zio->io_data, abuf, psize);
- bzero(abuf + psize, asize - psize);
+ adata = abd_alloc_linear(asize, B_TRUE);
+ abd_copy(adata, zio->io_abd, psize);
+ abd_zero_off(adata, psize, asize - psize);
}
+ if (adata != NULL)
+ abuf = abd_borrow_buf_copy(adata, asize);
+
zio->io_cksum_report = zcr->zcr_next;
zcr->zcr_next = NULL;
zcr->zcr_finish(zcr, abuf);
zfs_ereport_free_checksum(zcr);
+ if (adata != NULL)
+ abd_return_buf(adata, abuf, asize);
+
if (asize != psize)
- zio_buf_free(abuf, asize);
+ abd_free(adata);
}
}
diff --git a/usr/src/uts/common/fs/zfs/zio_checksum.c b/usr/src/uts/common/fs/zfs/zio_checksum.c
index 2bd9001456..e1c98b0b99 100644
--- a/usr/src/uts/common/fs/zfs/zio_checksum.c
+++ b/usr/src/uts/common/fs/zfs/zio_checksum.c
@@ -31,6 +31,7 @@
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zil.h>
+#include <sys/abd.h>
#include <zfs_fletcher.h>
/*
@@ -93,45 +94,85 @@
/*ARGSUSED*/
static void
-zio_checksum_off(const void *buf, uint64_t size,
+abd_checksum_off(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
+/*ARGSUSED*/
+void
+abd_fletcher_2_native(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) abd_iterate_func(abd, 0, size,
+ fletcher_2_incremental_native, zcp);
+}
+
+/*ARGSUSED*/
+void
+abd_fletcher_2_byteswap(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) abd_iterate_func(abd, 0, size,
+ fletcher_2_incremental_byteswap, zcp);
+}
+
+/*ARGSUSED*/
+void
+abd_fletcher_4_native(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) abd_iterate_func(abd, 0, size,
+ fletcher_4_incremental_native, zcp);
+}
+
+/*ARGSUSED*/
+void
+abd_fletcher_4_byteswap(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) abd_iterate_func(abd, 0, size,
+ fletcher_4_incremental_byteswap, zcp);
+}
+
zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
{{NULL, NULL}, NULL, NULL, 0, "inherit"},
{{NULL, NULL}, NULL, NULL, 0, "on"},
- {{zio_checksum_off, zio_checksum_off},
+ {{abd_checksum_off, abd_checksum_off},
NULL, NULL, 0, "off"},
- {{zio_checksum_SHA256, zio_checksum_SHA256},
+ {{abd_checksum_SHA256, abd_checksum_SHA256},
NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
"label"},
- {{zio_checksum_SHA256, zio_checksum_SHA256},
+ {{abd_checksum_SHA256, abd_checksum_SHA256},
NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
"gang_header"},
- {{fletcher_2_native, fletcher_2_byteswap},
+ {{abd_fletcher_2_native, abd_fletcher_2_byteswap},
NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
- {{fletcher_2_native, fletcher_2_byteswap},
+ {{abd_fletcher_2_native, abd_fletcher_2_byteswap},
NULL, NULL, 0, "fletcher2"},
- {{fletcher_4_native, fletcher_4_byteswap},
+ {{abd_fletcher_4_native, abd_fletcher_4_byteswap},
NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"},
- {{zio_checksum_SHA256, zio_checksum_SHA256},
+ {{abd_checksum_SHA256, abd_checksum_SHA256},
NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
ZCHECKSUM_FLAG_NOPWRITE, "sha256"},
- {{fletcher_4_native, fletcher_4_byteswap},
+ {{abd_fletcher_4_native, abd_fletcher_4_byteswap},
NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
- {{zio_checksum_off, zio_checksum_off},
+ {{abd_checksum_off, abd_checksum_off},
NULL, NULL, 0, "noparity"},
- {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
+ {{abd_checksum_SHA512_native, abd_checksum_SHA512_byteswap},
NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
ZCHECKSUM_FLAG_NOPWRITE, "sha512"},
- {{zio_checksum_skein_native, zio_checksum_skein_byteswap},
- zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
+ {{abd_checksum_skein_native, abd_checksum_skein_byteswap},
+ abd_checksum_skein_tmpl_init, abd_checksum_skein_tmpl_free,
ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
- {{zio_checksum_edonr_native, zio_checksum_edonr_byteswap},
- zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
+ {{abd_checksum_edonr_native, abd_checksum_edonr_byteswap},
+ abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free,
ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
};
@@ -251,7 +292,7 @@ zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
*/
void
zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
- void *data, uint64_t size)
+ abd_t *abd, uint64_t size)
{
blkptr_t *bp = zio->io_bp;
uint64_t offset = zio->io_offset;
@@ -266,6 +307,7 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
zio_eck_t *eck;
+ void *data = abd_to_buf(abd);
if (checksum == ZIO_CHECKSUM_ZILOG2) {
zil_chain_t *zilc = data;
@@ -283,18 +325,18 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
else
bp->blk_cksum = eck->zec_cksum;
eck->zec_magic = ZEC_MAGIC;
- ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum],
+ ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
&cksum);
eck->zec_cksum = cksum;
} else {
- ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum],
+ ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
&bp->blk_cksum);
}
}
int
zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
- void *data, uint64_t size, uint64_t offset, zio_bad_cksum_t *info)
+ abd_t *abd, uint64_t size, uint64_t offset, zio_bad_cksum_t *info)
{
zio_checksum_info_t *ci = &zio_checksum_table[checksum];
zio_cksum_t actual_cksum, expected_cksum;
@@ -308,25 +350,31 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
zio_eck_t *eck;
zio_cksum_t verifier;
+ uint64_t data_size = size;
+ void *data = abd_borrow_buf_copy(abd, data_size);
if (checksum == ZIO_CHECKSUM_ZILOG2) {
zil_chain_t *zilc = data;
uint64_t nused;
eck = &zilc->zc_eck;
- if (eck->zec_magic == ZEC_MAGIC)
+ if (eck->zec_magic == ZEC_MAGIC) {
nused = zilc->zc_nused;
- else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC))
+ } else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC)) {
nused = BSWAP_64(zilc->zc_nused);
- else
+ } else {
+ abd_return_buf(abd, data, data_size);
return (SET_ERROR(ECKSUM));
+ }
- if (nused > size)
+ if (nused > data_size) {
+ abd_return_buf(abd, data, data_size);
return (SET_ERROR(ECKSUM));
+ }
size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
} else {
- eck = (zio_eck_t *)((char *)data + size) - 1;
+ eck = (zio_eck_t *)((char *)data + data_size) - 1;
}
if (checksum == ZIO_CHECKSUM_GANG_HEADER)
@@ -341,11 +389,15 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
if (byteswap)
byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
+ size_t eck_offset = (size_t)(&eck->zec_cksum) - (size_t)data;
expected_cksum = eck->zec_cksum;
eck->zec_cksum = verifier;
- ci->ci_func[byteswap](data, size,
+ abd_return_buf_copy(abd, data, data_size);
+
+ ci->ci_func[byteswap](abd, size,
spa->spa_cksum_tmpls[checksum], &actual_cksum);
- eck->zec_cksum = expected_cksum;
+ abd_copy_from_buf_off(abd, &expected_cksum,
+ eck_offset, sizeof (zio_cksum_t));
if (byteswap) {
byteswap_uint64_array(&expected_cksum,
@@ -354,7 +406,7 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
} else {
byteswap = BP_SHOULD_BYTESWAP(bp);
expected_cksum = bp->blk_cksum;
- ci->ci_func[byteswap](data, size,
+ ci->ci_func[byteswap](abd, size,
spa->spa_cksum_tmpls[checksum], &actual_cksum);
}
@@ -383,7 +435,7 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
uint64_t size = (bp == NULL ? zio->io_size :
(BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
uint64_t offset = zio->io_offset;
- void *data = zio->io_data;
+ abd_t *data = zio->io_abd;
spa_t *spa = zio->io_spa;
error = zio_checksum_error_impl(spa, bp, checksum, data, size,
diff --git a/usr/src/uts/common/fs/zfs/zio_compress.c b/usr/src/uts/common/fs/zfs/zio_compress.c
index 4e2d645572..8d0a33de69 100644
--- a/usr/src/uts/common/fs/zfs/zio_compress.c
+++ b/usr/src/uts/common/fs/zfs/zio_compress.c
@@ -25,10 +25,7 @@
*/
/*
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
- */
-
-/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -41,24 +38,23 @@
/*
* Compression vectors.
*/
-
zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
- {NULL, NULL, 0, "inherit"},
- {NULL, NULL, 0, "on"},
- {NULL, NULL, 0, "uncompressed"},
- {lzjb_compress, lzjb_decompress, 0, "lzjb"},
- {NULL, NULL, 0, "empty"},
- {gzip_compress, gzip_decompress, 1, "gzip-1"},
- {gzip_compress, gzip_decompress, 2, "gzip-2"},
- {gzip_compress, gzip_decompress, 3, "gzip-3"},
- {gzip_compress, gzip_decompress, 4, "gzip-4"},
- {gzip_compress, gzip_decompress, 5, "gzip-5"},
- {gzip_compress, gzip_decompress, 6, "gzip-6"},
- {gzip_compress, gzip_decompress, 7, "gzip-7"},
- {gzip_compress, gzip_decompress, 8, "gzip-8"},
- {gzip_compress, gzip_decompress, 9, "gzip-9"},
- {zle_compress, zle_decompress, 64, "zle"},
- {lz4_compress, lz4_decompress, 0, "lz4"},
+ {"inherit", 0, NULL, NULL},
+ {"on", 0, NULL, NULL},
+ {"uncompressed", 0, NULL, NULL},
+ {"lzjb", 0, lzjb_compress, lzjb_decompress},
+ {"empty", 0, NULL, NULL},
+ {"gzip-1", 1, gzip_compress, gzip_decompress},
+ {"gzip-2", 2, gzip_compress, gzip_decompress},
+ {"gzip-3", 3, gzip_compress, gzip_decompress},
+ {"gzip-4", 4, gzip_compress, gzip_decompress},
+ {"gzip-5", 5, gzip_compress, gzip_decompress},
+ {"gzip-6", 6, gzip_compress, gzip_decompress},
+ {"gzip-7", 7, gzip_compress, gzip_decompress},
+ {"gzip-8", 8, gzip_compress, gzip_decompress},
+ {"gzip-9", 9, gzip_compress, gzip_decompress},
+ {"zle", 64, zle_compress, zle_decompress},
+ {"lz4", 0, lz4_compress, lz4_decompress}
};
enum zio_compress
@@ -85,10 +81,21 @@ zio_compress_select(spa_t *spa, enum zio_compress child,
return (result);
}
+/*ARGSUSED*/
+static int
+zio_compress_zeroed_cb(void *data, size_t len, void *private)
+{
+ uint64_t *end = (uint64_t *)((char *)data + len);
+ for (uint64_t *word = (uint64_t *)data; word < end; word++)
+ if (*word != 0)
+ return (1);
+
+ return (0);
+}
+
size_t
-zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
+zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len)
{
- uint64_t *word, *word_end;
size_t c_len, d_len;
zio_compress_info_t *ci = &zio_compress_table[c];
@@ -99,12 +106,7 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
* If the data is all zeroes, we don't even need to allocate
* a block for it. We indicate this by returning zero size.
*/
- word_end = (uint64_t *)((char *)src + s_len);
- for (word = src; word < word_end; word++)
- if (*word != 0)
- break;
-
- if (word == word_end)
+ if (abd_iterate_func(src, 0, s_len, zio_compress_zeroed_cb, NULL) == 0)
return (0);
if (c == ZIO_COMPRESS_EMPTY)
@@ -112,7 +114,11 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
/* Compress at least 12.5% */
d_len = s_len - (s_len >> 3);
- c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level);
+
+ /* No compression algorithms can read from ABDs directly */
+ void *tmp = abd_borrow_buf_copy(src, s_len);
+ c_len = ci->ci_compress(tmp, dst, s_len, d_len, ci->ci_level);
+ abd_return_buf(src, tmp, s_len);
if (c_len > d_len)
return (s_len);
@@ -122,13 +128,23 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
}
int
-zio_decompress_data(enum zio_compress c, void *src, void *dst,
+zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
size_t s_len, size_t d_len)
{
zio_compress_info_t *ci = &zio_compress_table[c];
-
if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL)
return (SET_ERROR(EINVAL));
return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level));
}
+
+int
+zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
+ size_t s_len, size_t d_len)
+{
+ void *tmp = abd_borrow_buf_copy(src, s_len);
+ int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len);
+ abd_return_buf(src, tmp, s_len);
+
+ return (ret);
+}
diff --git a/usr/src/uts/common/io/mr_sas/ld_pd_map.c b/usr/src/uts/common/io/mr_sas/ld_pd_map.c
index 829904afc0..0f2798a790 100644
--- a/usr/src/uts/common/io/mr_sas/ld_pd_map.c
+++ b/usr/src/uts/common/io/mr_sas/ld_pd_map.c
@@ -23,6 +23,7 @@
*/
/*
* Copyright 2015 Garrett D'Amore <garrett@damore.org>
+ * Copyright 2017 Citrus IT Limited. All rights reserved.
*/
#include <sys/scsi/scsi.h>
@@ -212,7 +213,6 @@ MR_GetPhyParams(struct mrsas_instance *instance, U32 ld, U64 stripRow,
U32 rowMod;
U32 armQ;
U32 arm;
- U16 devid = instance->device_id;
ASSERT(raid->rowDataSize != 0);
@@ -254,11 +254,8 @@ MR_GetPhyParams(struct mrsas_instance *instance, U32 ld, U64 stripRow,
*pDevHandle = MR_PdDevHandleGet(pd, map);
} else {
*pDevHandle = MR_PD_INVALID; /* set dev handle as invalid. */
- if ((raid->level >= 5) &&
- ((devid != PCI_DEVICE_ID_LSI_INVADER) ||
- ((devid == PCI_DEVICE_ID_LSI_INVADER ||
- (devid == PCI_DEVICE_ID_LSI_FURY)) &&
- raid->regTypeReqOnRead != REGION_TYPE_UNUSED))) {
+ if (raid->level >= 5 && (!instance->gen3 ||
+ raid->regTypeReqOnRead != REGION_TYPE_UNUSED)) {
pRAID_Context->regLockFlags = REGION_TYPE_EXCLUSIVE;
} else if (raid->level == 1) {
/* Get Alternate Pd. */
@@ -403,8 +400,7 @@ MR_BuildRaidContext(struct mrsas_instance *instance,
pRAID_Context->timeoutValue = map->raidMap.fpPdIoTimeoutSec;
- if ((instance->device_id == PCI_DEVICE_ID_LSI_INVADER) ||
- (instance->device_id == PCI_DEVICE_ID_LSI_FURY)) {
+ if (instance->gen3) {
pRAID_Context->regLockFlags = (isRead) ?
raid->regTypeReqOnRead : raid->regTypeReqOnWrite;
} else {
diff --git a/usr/src/uts/common/io/mr_sas/mr_sas.c b/usr/src/uts/common/io/mr_sas/mr_sas.c
index 1bb6cec1d2..4e3fe4dcce 100644
--- a/usr/src/uts/common/io/mr_sas/mr_sas.c
+++ b/usr/src/uts/common/io/mr_sas/mr_sas.c
@@ -45,7 +45,7 @@
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Bayard G. Bell. All rights reserved.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
- * Copyright 2015 Citrus IT Limited. All rights reserved.
+ * Copyright 2015, 2017 Citrus IT Limited. All rights reserved.
* Copyright 2015 Garrett D'Amore <garrett@damore.org>
*/
@@ -565,9 +565,16 @@ mrsas_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
/* initialize function pointers */
switch (device_id) {
- case PCI_DEVICE_ID_LSI_TBOLT:
case PCI_DEVICE_ID_LSI_INVADER:
case PCI_DEVICE_ID_LSI_FURY:
+ case PCI_DEVICE_ID_LSI_INTRUDER:
+ case PCI_DEVICE_ID_LSI_INTRUDER_24:
+ case PCI_DEVICE_ID_LSI_CUTLASS_52:
+ case PCI_DEVICE_ID_LSI_CUTLASS_53:
+ dev_err(dip, CE_CONT, "?Gen3 device detected\n");
+ instance->gen3 = 1;
+ /* FALLTHROUGH */
+ case PCI_DEVICE_ID_LSI_TBOLT:
dev_err(dip, CE_CONT, "?TBOLT device detected\n");
instance->func_ptr =
@@ -584,6 +591,7 @@ mrsas_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
* certain other features are available to a Skinny
* HBA.
*/
+ dev_err(dip, CE_CONT, "?Skinny device detected\n");
instance->skinny = 1;
/* FALLTHRU */
@@ -1596,7 +1604,7 @@ mrsas_quiesce(dev_info_t *dip)
/*ARGSUSED*/
static int
mrsas_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip,
- scsi_hba_tran_t *tran, struct scsi_device *sd)
+ scsi_hba_tran_t *tran, struct scsi_device *sd)
{
struct mrsas_instance *instance;
uint16_t tgt = sd->sd_address.a_target;
@@ -1772,8 +1780,8 @@ mrsas_name_node(dev_info_t *dip, char *name, int len)
*/
static struct scsi_pkt *
mrsas_tran_init_pkt(struct scsi_address *ap, register struct scsi_pkt *pkt,
- struct buf *bp, int cmdlen, int statuslen, int tgtlen,
- int flags, int (*callback)(), caddr_t arg)
+ struct buf *bp, int cmdlen, int statuslen, int tgtlen,
+ int flags, int (*callback)(), caddr_t arg)
{
struct scsa_cmd *acmd;
struct mrsas_instance *instance;
diff --git a/usr/src/uts/common/io/mr_sas/mr_sas.h b/usr/src/uts/common/io/mr_sas/mr_sas.h
index 8f27cbdf21..fe4c3659af 100644
--- a/usr/src/uts/common/io/mr_sas/mr_sas.h
+++ b/usr/src/uts/common/io/mr_sas/mr_sas.h
@@ -45,6 +45,7 @@
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
* Copyright 2015 Garrett D'Amore <garrett@damore.org>
+ * Copyright 2017 Citrus IT Limited. All rights reserved.
*/
#ifndef _MR_SAS_H_
@@ -61,8 +62,8 @@ extern "C" {
/*
* MegaRAID SAS2.0 Driver meta data
*/
-#define MRSAS_VERSION "6.503.00.00ILLUMOS"
-#define MRSAS_RELDATE "July 30, 2012"
+#define MRSAS_VERSION "6.503.00.00ILLUMOS-20170421"
+#define MRSAS_RELDATE "April 21, 2017"
#define MRSAS_TRUE 1
#define MRSAS_FALSE 0
@@ -90,13 +91,23 @@ extern "C" {
/*
* MegaRAID SAS2.0 supported controllers
*/
-#define PCI_DEVICE_ID_LSI_2108VDE 0x0078
-#define PCI_DEVICE_ID_LSI_2108V 0x0079
+
+/* Skinny */
#define PCI_DEVICE_ID_LSI_SKINNY 0x0071
#define PCI_DEVICE_ID_LSI_SKINNY_NEW 0x0073
+/* Liberator series (Gen2) */
+#define PCI_DEVICE_ID_LSI_2108VDE 0x0078
+#define PCI_DEVICE_ID_LSI_2108V 0x0079
+/* Thunderbolt series */
#define PCI_DEVICE_ID_LSI_TBOLT 0x005b
+/* Invader series (Gen3) */
#define PCI_DEVICE_ID_LSI_INVADER 0x005d
#define PCI_DEVICE_ID_LSI_FURY 0x005f
+#define PCI_DEVICE_ID_LSI_INTRUDER 0x00ce
+#define PCI_DEVICE_ID_LSI_INTRUDER_24 0x00cf
+#define PCI_DEVICE_ID_LSI_CUTLASS_52 0x0052
+#define PCI_DEVICE_ID_LSI_CUTLASS_53 0x0053
+/* Ventura series not yet supported */
/*
* Register Index for 2108 Controllers.
@@ -602,6 +613,7 @@ typedef struct mrsas_instance {
uint8_t skinny;
uint8_t tbolt;
+ uint8_t gen3;
uint16_t reply_read_index;
uint16_t reply_size; /* Single Reply struct size */
uint16_t raid_io_msg_size; /* Single message size */
diff --git a/usr/src/uts/common/io/mr_sas/mr_sas_tbolt.c b/usr/src/uts/common/io/mr_sas/mr_sas_tbolt.c
index 929ae8056e..9ff12ffb07 100644
--- a/usr/src/uts/common/io/mr_sas/mr_sas_tbolt.c
+++ b/usr/src/uts/common/io/mr_sas/mr_sas_tbolt.c
@@ -17,7 +17,7 @@
/*
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
- * Copyright 2015 Citrus IT Limited. All rights reserved.
+ * Copyright 2015, 2017 Citrus IT Limited. All rights reserved.
* Copyright 2015 Garrett D'Amore <garrett@damore.org>
*/
@@ -1252,7 +1252,6 @@ mr_sas_tbolt_build_sgl(struct mrsas_instance *instance,
Mpi25IeeeSgeChain64_t *scsi_raid_io_sgl_ieee = NULL;
ddi_acc_handle_t acc_handle =
instance->mpi2_frame_pool_dma_obj.acc_handle;
- uint16_t devid = instance->device_id;
con_log(CL_ANN1, (CE_NOTE,
"chkpnt: Building Chained SGL :%d", __LINE__));
@@ -1296,8 +1295,7 @@ mr_sas_tbolt_build_sgl(struct mrsas_instance *instance,
scsi_raid_io_sgl_ieee =
(Mpi25IeeeSgeChain64_t *)&scsi_raid_io->SGL.IeeeChain;
- if ((devid == PCI_DEVICE_ID_LSI_INVADER) ||
- (devid == PCI_DEVICE_ID_LSI_FURY)) {
+ if (instance->gen3) {
Mpi25IeeeSgeChain64_t *sgl_ptr_end = scsi_raid_io_sgl_ieee;
sgl_ptr_end += instance->max_sge_in_main_msg - 1;
@@ -1313,8 +1311,7 @@ mr_sas_tbolt_build_sgl(struct mrsas_instance *instance,
ddi_put8(acc_handle, &scsi_raid_io_sgl_ieee->Flags, 0);
- if ((devid == PCI_DEVICE_ID_LSI_INVADER) ||
- (devid == PCI_DEVICE_ID_LSI_FURY)) {
+ if (instance->gen3) {
if (i == (numElements - 1)) {
ddi_put8(acc_handle,
&scsi_raid_io_sgl_ieee->Flags,
@@ -1342,8 +1339,7 @@ mr_sas_tbolt_build_sgl(struct mrsas_instance *instance,
con_log(CL_ANN1, (CE_NOTE, "[Chain Element index]:%x", i));
- if ((devid == PCI_DEVICE_ID_LSI_INVADER) ||
- (devid == PCI_DEVICE_ID_LSI_FURY)) {
+ if (instance->gen3) {
uint16_t ioFlags =
ddi_get16(acc_handle, &scsi_raid_io->IoFlags);
@@ -1366,8 +1362,7 @@ mr_sas_tbolt_build_sgl(struct mrsas_instance *instance,
ddi_put8(acc_handle, &ieeeChainElement->NextChainOffset, 0);
- if ((devid == PCI_DEVICE_ID_LSI_INVADER) ||
- (devid == PCI_DEVICE_ID_LSI_FURY)) {
+ if (instance->gen3) {
ddi_put8(acc_handle, &ieeeChainElement->Flags,
IEEE_SGE_FLAGS_CHAIN_ELEMENT);
} else {
@@ -1402,8 +1397,7 @@ mr_sas_tbolt_build_sgl(struct mrsas_instance *instance,
ddi_put8(acc_handle, &scsi_raid_io_sgl_ieee->Flags, 0);
- if ((devid == PCI_DEVICE_ID_LSI_INVADER) ||
- (devid == PCI_DEVICE_ID_LSI_FURY)) {
+ if (instance->gen3) {
if (i == (numElements - 1)) {
ddi_put8(acc_handle,
&scsi_raid_io_sgl_ieee->Flags,
@@ -1443,7 +1437,6 @@ mrsas_tbolt_build_cmd(struct mrsas_instance *instance, struct scsi_address *ap,
uint32_t lba_count = 0;
uint32_t start_lba_hi = 0;
uint32_t start_lba_lo = 0;
- uint16_t devid = instance->device_id;
ddi_acc_handle_t acc_handle =
instance->mpi2_frame_pool_dma_obj.acc_handle;
struct mrsas_cmd *cmd = NULL;
@@ -1678,8 +1671,7 @@ mrsas_tbolt_build_cmd(struct mrsas_instance *instance, struct scsi_address *ap,
(MPI2_REQ_DESCRIPT_FLAGS_HIGH_PRIORITY <<
MPI2_REQ_DESCRIPT_FLAGS_TYPE_SHIFT);
- if ((devid == PCI_DEVICE_ID_LSI_INVADER) ||
- (devid == PCI_DEVICE_ID_LSI_FURY)) {
+ if (instance->gen3) {
uint8_t regLockFlags = ddi_get8(acc_handle,
&scsi_raid_io->RaidContext.regLockFlags);
uint16_t IoFlags = ddi_get16(acc_handle,
@@ -1743,8 +1735,7 @@ mrsas_tbolt_build_cmd(struct mrsas_instance *instance, struct scsi_address *ap,
&scsi_raid_io->RaidContext.timeoutValue,
local_map_ptr->raidMap.fpPdIoTimeoutSec);
- if ((devid == PCI_DEVICE_ID_LSI_INVADER) ||
- (devid == PCI_DEVICE_ID_LSI_FURY)) {
+ if (instance->gen3) {
uint8_t regLockFlags = ddi_get8(acc_handle,
&scsi_raid_io->RaidContext.regLockFlags);
@@ -1849,9 +1840,7 @@ mrsas_tbolt_build_cmd(struct mrsas_instance *instance, struct scsi_address *ap,
ddi_put8(acc_handle,
&scsi_raid_io->LUN[1], acmd->lun);
- if (instance->fast_path_io &&
- ((instance->device_id == PCI_DEVICE_ID_LSI_INVADER) ||
- (instance->device_id == PCI_DEVICE_ID_LSI_FURY))) {
+ if (instance->fast_path_io && instance->gen3) {
uint16_t IoFlags = ddi_get16(acc_handle,
&scsi_raid_io->IoFlags);
IoFlags |= MPI25_SAS_DEVICE0_FLAGS_ENABLED_FAST_PATH;
@@ -2269,8 +2258,7 @@ mr_sas_tbolt_build_mfi_cmd(struct mrsas_instance *instance,
/* get raid message frame pointer */
scsi_raid_io = (Mpi2RaidSCSIIORequest_t *)cmd->scsi_io_request;
- if ((instance->device_id == PCI_DEVICE_ID_LSI_INVADER) ||
- (instance->device_id == PCI_DEVICE_ID_LSI_FURY)) {
+ if (instance->gen3) {
Mpi25IeeeSgeChain64_t *sgl_ptr_end = (Mpi25IeeeSgeChain64_t *)
&scsi_raid_io->SGL.IeeeChain;
sgl_ptr_end += instance->max_sge_in_main_msg - 1;
diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile
index 80d344523e..c44301765b 100644
--- a/usr/src/uts/common/sys/Makefile
+++ b/usr/src/uts/common/sys/Makefile
@@ -545,6 +545,7 @@ CHKHDRS= \
statfs.h \
statvfs.h \
stdbool.h \
+ stddef.h \
stdint.h \
stermio.h \
stmf.h \
diff --git a/usr/src/uts/common/sys/multiboot2.h b/usr/src/uts/common/sys/multiboot2.h
new file mode 100644
index 0000000000..556b0217a3
--- /dev/null
+++ b/usr/src/uts/common/sys/multiboot2.h
@@ -0,0 +1,418 @@
+/*
+ * Copyright (C) 1999,2003,2007,2008,2009,2010 Free Software Foundation, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ANY
+ * DEVELOPER OR DISTRIBUTOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+ * IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Copyright 2016 Toomas Soome <tsoome@me.com>
+ */
+
+/*
+ * This header contains definitions for Multiboot 2 boot protocol, based on
+ * the reference implementation by grub 2.
+ *
+ * At the time this was written (Jan 2017), the Multiboot 2 documentation was
+ * in the process of being rewritten and the information in the specification
+ * was not entirely correct. Instead, you must rely on grub 2 source code.
+ *
+ * This header provides essential support for the Multiboot 2 specification
+ * for illumos and makes it possible to pass the needed structures from the
+ * boot loader to the kernel.
+ */
+
+#ifndef _SYS_MULTIBOOT2_H
+#define _SYS_MULTIBOOT2_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* How many bytes from the start of the file we search for the header. */
+#define MULTIBOOT_SEARCH 32768
+#define MULTIBOOT_HEADER_ALIGN 8
+
+/* The magic field should contain this. */
+#define MULTIBOOT2_HEADER_MAGIC 0xe85250d6
+
+/* This should be in %eax. */
+#define MULTIBOOT2_BOOTLOADER_MAGIC 0x36d76289
+
+/* Alignment of multiboot modules. */
+#if defined(__i386) || defined(__amd64)
+#define MULTIBOOT_MOD_ALIGN 0x00001000
+#else
+#error No architecture defined
+#endif
+
+/* Alignment of the multiboot info structure. */
+#define MULTIBOOT_INFO_ALIGN 0x00000008
+
+/* Flags set in the 'flags' member of the multiboot header. */
+
+#define MULTIBOOT_TAG_ALIGN 8
+#define MULTIBOOT_TAG_TYPE_END 0
+#define MULTIBOOT_TAG_TYPE_CMDLINE 1
+#define MULTIBOOT_TAG_TYPE_BOOT_LOADER_NAME 2
+#define MULTIBOOT_TAG_TYPE_MODULE 3
+#define MULTIBOOT_TAG_TYPE_BASIC_MEMINFO 4
+#define MULTIBOOT_TAG_TYPE_BOOTDEV 5
+#define MULTIBOOT_TAG_TYPE_MMAP 6
+#define MULTIBOOT_TAG_TYPE_VBE 7
+#define MULTIBOOT_TAG_TYPE_FRAMEBUFFER 8
+#define MULTIBOOT_TAG_TYPE_ELF_SECTIONS 9
+#define MULTIBOOT_TAG_TYPE_APM 10
+#define MULTIBOOT_TAG_TYPE_EFI32 11
+#define MULTIBOOT_TAG_TYPE_EFI64 12
+#define MULTIBOOT_TAG_TYPE_SMBIOS 13
+#define MULTIBOOT_TAG_TYPE_ACPI_OLD 14
+#define MULTIBOOT_TAG_TYPE_ACPI_NEW 15
+#define MULTIBOOT_TAG_TYPE_NETWORK 16
+#define MULTIBOOT_TAG_TYPE_EFI_MMAP 17
+#define MULTIBOOT_TAG_TYPE_EFI_BS 18
+#define MULTIBOOT_TAG_TYPE_EFI32_IH 19
+#define MULTIBOOT_TAG_TYPE_EFI64_IH 20
+#define MULTIBOOT_TAG_TYPE_LOAD_BASE_ADDR 21
+
+#define MULTIBOOT_HEADER_TAG_END 0
+#define MULTIBOOT_HEADER_TAG_INFORMATION_REQUEST 1
+#define MULTIBOOT_HEADER_TAG_ADDRESS 2
+#define MULTIBOOT_HEADER_TAG_ENTRY_ADDRESS 3
+#define MULTIBOOT_HEADER_TAG_CONSOLE_FLAGS 4
+#define MULTIBOOT_HEADER_TAG_FRAMEBUFFER 5
+#define MULTIBOOT_HEADER_TAG_MODULE_ALIGN 6
+#define MULTIBOOT_HEADER_TAG_EFI_BS 7
+#define MULTIBOOT_HEADER_TAG_ENTRY_ADDRESS_EFI32 8
+#define MULTIBOOT_HEADER_TAG_ENTRY_ADDRESS_EFI64 9
+#define MULTIBOOT_HEADER_TAG_RELOCATABLE 10
+
+#define MULTIBOOT_ARCHITECTURE_I386 0
+#define MULTIBOOT_ARCHITECTURE_MIPS32 4
+#define MULTIBOOT_HEADER_TAG_OPTIONAL 1
+
+/* Hints for relocatable kernel load preference */
+#define MULTIBOOT_LOAD_PREFERENCE_NONE 0
+#define MULTIBOOT_LOAD_PREFERENCE_LOW 1
+#define MULTIBOOT_LOAD_PREFERENCE_HIGH 2
+
+/* Values for console_flags field in tag multiboot_header_tag_console_flags. */
+#define MULTIBOOT_CONSOLE_FLAGS_CONSOLE_REQUIRED 1
+#define MULTIBOOT_CONSOLE_FLAGS_EGA_TEXT_SUPPORTED 2
+
+#ifndef _ASM
+
+#include <sys/stdint.h>
+
+#pragma pack(1)
+
+typedef struct multiboot_header_tag {
+ uint16_t mbh_type;
+ uint16_t mbh_flags;
+ uint32_t mbh_size;
+} multiboot_header_tag_t;
+
+typedef struct multiboot2_header {
+ /* Must be MULTIBOOT2_HEADER_MAGIC - see above. */
+ uint32_t mb2_magic;
+
+ /* ISA */
+ uint32_t mb2_architecture;
+
+ /* Total header length. */
+ uint32_t mb2_header_length;
+
+ /* The above fields plus this one must equal 0 mod 2^32. */
+ uint32_t mb2_checksum;
+ multiboot_header_tag_t mb2_tags[];
+} multiboot2_header_t;
+
+typedef struct multiboot_header_tag_information_request {
+ uint16_t mbh_type;
+ uint16_t mbh_flags;
+ uint32_t mbh_size;
+ uint32_t mbh_requests[];
+} multiboot_header_tag_information_request_t;
+
+typedef struct multiboot_header_tag_address {
+ uint16_t mbh_type;
+ uint16_t mbh_flags;
+ uint32_t mbh_size;
+ uint32_t mbh_header_addr;
+ uint32_t mbh_load_addr;
+ uint32_t mbh_load_end_addr;
+ uint32_t mbh_bss_end_addr;
+} multiboot_header_tag_address_t;
+
+typedef struct multiboot_header_tag_entry_address {
+ uint16_t mbh_type;
+ uint16_t mbh_flags;
+ uint32_t mbh_size;
+ uint32_t mbh_entry_addr;
+} multiboot_header_tag_entry_address_t;
+
+typedef struct multiboot_header_tag_console_flags {
+ uint16_t mbh_type;
+ uint16_t mbh_flags;
+ uint32_t mbh_size;
+ uint32_t mbh_console_flags;
+} multiboot_header_tag_console_flags_t;
+
+typedef struct multiboot_header_tag_framebuffer {
+ uint16_t mbh_type;
+ uint16_t mbh_flags;
+ uint32_t mbh_size;
+ uint32_t mbh_width;
+ uint32_t mbh_height;
+ uint32_t mbh_depth;
+} multiboot_header_tag_framebuffer_t;
+
+typedef struct multiboot_header_tag_module_align {
+ uint16_t mbh_type;
+ uint16_t mbh_flags;
+ uint32_t mbh_size;
+} multiboot_header_tag_module_align_t;
+
+typedef struct multiboot_header_tag_relocatable {
+ uint16_t mbh_type;
+ uint16_t mbh_flags;
+ uint32_t mbh_size;
+ uint32_t mbh_min_addr;
+ uint32_t mbh_max_addr;
+ uint32_t mbh_align;
+ uint32_t mbh_preference;
+} multiboot_header_tag_relocatable_t;
+
+typedef struct multiboot_color {
+ uint8_t mb_red;
+ uint8_t mb_green;
+ uint8_t mb_blue;
+} multiboot_color_t;
+
+typedef struct multiboot_mmap_entry {
+ uint64_t mmap_addr;
+ uint64_t mmap_len;
+#define MULTIBOOT_MEMORY_AVAILABLE 1
+#define MULTIBOOT_MEMORY_RESERVED 2
+#define MULTIBOOT_MEMORY_ACPI_RECLAIMABLE 3
+#define MULTIBOOT_MEMORY_NVS 4
+#define MULTIBOOT_MEMORY_BADRAM 5
+ uint32_t mmap_type;
+ uint32_t mmap_reserved;
+} multiboot_mmap_entry_t;
+
+typedef struct multiboot_tag {
+ uint32_t mb_type;
+ uint32_t mb_size;
+} multiboot_tag_t;
+
+typedef struct multiboot2_info_header {
+ uint32_t mbi_total_size;
+ uint32_t mbi_reserved;
+ multiboot_tag_t mbi_tags[];
+} multiboot2_info_header_t;
+
+typedef struct multiboot_tag_string {
+ uint32_t mb_type;
+ uint32_t mb_size;
+ char mb_string[];
+} multiboot_tag_string_t;
+
+typedef struct multiboot_tag_module {
+ uint32_t mb_type;
+ uint32_t mb_size;
+ uint32_t mb_mod_start;
+ uint32_t mb_mod_end;
+ char mb_cmdline[];
+} multiboot_tag_module_t;
+
+typedef struct multiboot_tag_basic_meminfo {
+ uint32_t mb_type;
+ uint32_t mb_size;
+ uint32_t mb_mem_lower;
+ uint32_t mb_mem_upper;
+} multiboot_tag_basic_meminfo_t;
+
+typedef struct multiboot_tag_bootdev {
+ uint32_t mb_type;
+ uint32_t mb_size;
+ uint32_t mb_biosdev;
+ uint32_t mb_slice;
+ uint32_t mb_part;
+} multiboot_tag_bootdev_t;
+
+typedef struct multiboot_tag_mmap {
+ uint32_t mb_type;
+ uint32_t mb_size;
+ uint32_t mb_entry_size;
+ uint32_t mb_entry_version;
+ uint8_t mb_entries[];
+} multiboot_tag_mmap_t;
+
+struct multiboot_vbe_info_block {
+ uint8_t vbe_external_specification[512];
+};
+
+struct multiboot_vbe_mode_info_block {
+ uint8_t vbe_external_specification[256];
+};
+
+typedef struct multiboot_tag_vbe {
+ uint32_t mb_type;
+ uint32_t mb_size;
+
+ uint16_t vbe_mode;
+ uint16_t vbe_interface_seg;
+ uint16_t vbe_interface_off;
+ uint16_t vbe_interface_len;
+
+ struct multiboot_vbe_info_block vbe_control_info;
+ struct multiboot_vbe_mode_info_block vbe_mode_info;
+} multiboot_tag_vbe_t;
+
+struct multiboot_tag_framebuffer_common {
+ uint32_t mb_type;
+ uint32_t mb_size;
+
+ uint64_t framebuffer_addr;
+ uint32_t framebuffer_pitch;
+ uint32_t framebuffer_width;
+ uint32_t framebuffer_height;
+ uint8_t framebuffer_bpp;
+#define MULTIBOOT_FRAMEBUFFER_TYPE_INDEXED 0
+#define MULTIBOOT_FRAMEBUFFER_TYPE_RGB 1
+#define MULTIBOOT_FRAMEBUFFER_TYPE_EGA_TEXT 2
+ uint8_t framebuffer_type;
+ uint16_t mb_reserved;
+};
+
+typedef struct multiboot_tag_framebuffer {
+ struct multiboot_tag_framebuffer_common framebuffer_common;
+
+ union {
+ struct {
+ uint16_t framebuffer_palette_num_colors;
+ multiboot_color_t framebuffer_palette[];
+ } fb1;
+ struct {
+ uint8_t framebuffer_red_field_position;
+ uint8_t framebuffer_red_mask_size;
+ uint8_t framebuffer_green_field_position;
+ uint8_t framebuffer_green_mask_size;
+ uint8_t framebuffer_blue_field_position;
+ uint8_t framebuffer_blue_mask_size;
+ } fb2;
+ } u;
+} multiboot_tag_framebuffer_t;
+
+typedef struct multiboot_tag_elf_sections {
+ uint32_t mb_type;
+ uint32_t mb_size;
+ uint32_t mb_num;
+ uint32_t mb_entsize;
+ uint32_t mb_shndx;
+ char mb_sections[];
+} multiboot_tag_elf_sections_t;
+
+typedef struct multiboot_tag_apm {
+ uint32_t mb_type;
+ uint32_t mb_size;
+ uint16_t mb_version;
+ uint16_t mb_cseg;
+ uint32_t mb_offset;
+ uint16_t mb_cseg_16;
+ uint16_t mb_dseg;
+ uint16_t mb_flags;
+ uint16_t mb_cseg_len;
+ uint16_t mb_cseg_16_len;
+ uint16_t mb_dseg_len;
+} multiboot_tag_apm_t;
+
+typedef struct multiboot_tag_efi32 {
+ uint32_t mb_type;
+ uint32_t mb_size;
+ uint32_t mb_pointer;
+} multiboot_tag_efi32_t;
+
+typedef struct multiboot_tag_efi64 {
+ uint32_t mb_type;
+ uint32_t mb_size;
+ uint64_t mb_pointer;
+} multiboot_tag_efi64_t;
+
+typedef struct multiboot_tag_smbios {
+ uint32_t mb_type;
+ uint32_t mb_size;
+ uint8_t mb_major;
+ uint8_t mb_minor;
+ uint8_t mb_reserved[6];
+ uint8_t mb_tables[];
+} multiboot_tag_smbios_t;
+
+typedef struct multiboot_tag_old_acpi {
+ uint32_t mb_type;
+ uint32_t mb_size;
+ uint8_t mb_rsdp[];
+} multiboot_tag_old_acpi_t;
+
+typedef struct multiboot_tag_new_acpi {
+ uint32_t mb_type;
+ uint32_t mb_size;
+ uint8_t mb_rsdp[];
+} multiboot_tag_new_acpi_t;
+
+typedef struct multiboot_tag_network {
+ uint32_t mb_type;
+ uint32_t mb_size;
+ uint8_t mb_dhcpack[];
+} multiboot_tag_network_t;
+
+typedef struct multiboot_tag_efi_mmap {
+ uint32_t mb_type;
+ uint32_t mb_size;
+ uint32_t mb_descr_size;
+ uint32_t mb_descr_vers;
+ uint8_t mb_efi_mmap[];
+} multiboot_tag_efi_mmap_t;
+
+typedef struct multiboot_tag_efi32_ih {
+ uint32_t mb_type;
+ uint32_t mb_size;
+ uint32_t mb_pointer;
+} multiboot_tag_efi32_ih_t;
+
+typedef struct multiboot_tag_efi64_ih {
+ uint32_t mb_type;
+ uint32_t mb_size;
+ uint64_t mb_pointer;
+} multiboot_tag_efi64_ih_t;
+
+typedef struct multiboot_tag_load_base_addr {
+ uint32_t mb_type;
+ uint32_t mb_size;
+ uint32_t mb_load_base_addr;
+} multiboot_tag_load_base_addr_t;
+
+#pragma pack()
+
+#endif /* !_ASM */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !_SYS_MULTIBOOT2_H */
diff --git a/usr/src/uts/common/sys/multiboot2_impl.h b/usr/src/uts/common/sys/multiboot2_impl.h
new file mode 100644
index 0000000000..d90ed0e8ee
--- /dev/null
+++ b/usr/src/uts/common/sys/multiboot2_impl.h
@@ -0,0 +1,53 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Toomas Soome <tsoome@me.com>
+ */
+
+#ifndef _SYS_MULTIBOOT2_IMPL_H
+#define _SYS_MULTIBOOT2_IMPL_H
+
+/*
+ * Multiboot 2 protocol implementation for dboot.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/multiboot2.h>
+
+extern void *dboot_multiboot2_find_tag(multiboot2_info_header_t *, uint32_t);
+extern char *dboot_multiboot2_cmdline(multiboot2_info_header_t *);
+extern int dboot_multiboot2_modcount(multiboot2_info_header_t *);
+extern uint32_t dboot_multiboot2_modstart(multiboot2_info_header_t *, int);
+extern uint32_t dboot_multiboot2_modend(multiboot2_info_header_t *, int);
+extern char *dboot_multiboot2_modcmdline(multiboot2_info_header_t *, int);
+extern multiboot_tag_mmap_t *
+ dboot_multiboot2_get_mmap_tagp(multiboot2_info_header_t *);
+extern boolean_t dboot_multiboot2_basicmeminfo(multiboot2_info_header_t *,
+ uint32_t *, uint32_t *);
+extern uint64_t dboot_multiboot2_mmap_get_length(multiboot2_info_header_t *,
+ multiboot_tag_mmap_t *, int);
+extern uint64_t dboot_multiboot2_mmap_get_base(multiboot2_info_header_t *,
+ multiboot_tag_mmap_t *, int);
+extern uint32_t dboot_multiboot2_mmap_get_type(multiboot2_info_header_t *,
+ multiboot_tag_mmap_t *, int);
+extern int dboot_multiboot2_mmap_nentries(multiboot2_info_header_t *,
+ multiboot_tag_mmap_t *);
+extern paddr_t dboot_multiboot2_highest_addr(multiboot2_info_header_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_MULTIBOOT2_IMPL_H */
diff --git a/usr/src/uts/common/sys/stddef.h b/usr/src/uts/common/sys/stddef.h
new file mode 100644
index 0000000000..9dc9736241
--- /dev/null
+++ b/usr/src/uts/common/sys/stddef.h
@@ -0,0 +1,48 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Toomas Soome <tsoome@me.com>
+ */
+
+#ifndef _SYS_STDDEF_H
+#define _SYS_STDDEF_H
+
+/*
+ * Commonly used macros and definitions.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if !defined(offsetof)
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)
+#define offsetof(s, m) __builtin_offsetof(s, m)
+#else
+#if __cplusplus >= 199711L
+#define offsetof(s, m) (std::size_t)(&(((s *)NULL)->m))
+#else
+#define offsetof(s, m) ((size_t)(&(((s *)NULL)->m)))
+#endif
+#endif
+#endif /* !offsetof */
+
+#if !defined(container_of)
+#define container_of(m, s, name) \
+ (void *)((uintptr_t)(m) - (uintptr_t)offsetof(s, name))
+#endif /* !container_of */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_STDDEF_H */
diff --git a/usr/src/uts/common/sys/sysmacros.h b/usr/src/uts/common/sys/sysmacros.h
index 2e895a8daf..03be89f461 100644
--- a/usr/src/uts/common/sys/sysmacros.h
+++ b/usr/src/uts/common/sys/sysmacros.h
@@ -33,6 +33,7 @@
#define _SYS_SYSMACROS_H
#include <sys/param.h>
+#include <sys/stddef.h>
#ifdef __cplusplus
extern "C" {
@@ -369,18 +370,8 @@ extern unsigned char bcd_to_byte[256];
/* avoid any possibility of clashing with <stddef.h> version */
#if (defined(_KERNEL) || defined(_FAKE_KERNEL)) && !defined(_KMEMUSER)
-#if !defined(offsetof)
-#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)
-#define offsetof(s, m) __builtin_offsetof(s, m)
-#else
-#define offsetof(s, m) ((size_t)(&(((s *)0)->m)))
-#endif
-#endif /* !offsetof */
-
-#define container_of(m, s, name) \
- (void *)((uintptr_t)(m) - (uintptr_t)offsetof(s, name))
-
#define ARRAY_SIZE(x) (sizeof (x) / sizeof (x[0]))
+
#endif /* _KERNEL, !_KMEMUSER */
#ifdef __cplusplus
diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files
index 7e782f3f61..f3f10e1b6c 100644
--- a/usr/src/uts/i86pc/Makefile.files
+++ b/usr/src/uts/i86pc/Makefile.files
@@ -139,6 +139,7 @@ BOOT_DRIVER_OBJS = \
boot_keyboard_table.o \
boot_vga.o \
boot_mmu.o \
+ dboot_multiboot2.o \
$(FONT_OBJS)
CORE_OBJS += $(BOOT_DRIVER_OBJS)
diff --git a/usr/src/uts/i86pc/Makefile.rules b/usr/src/uts/i86pc/Makefile.rules
index 2d55410f33..a3bf823c69 100644
--- a/usr/src/uts/i86pc/Makefile.rules
+++ b/usr/src/uts/i86pc/Makefile.rules
@@ -211,6 +211,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/xen/os/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/dboot/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
#
# dboot stuff is always 32 bit, linked to run with phys_addr == virt_addr
#
@@ -422,6 +426,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/os/cpupm/%.c
$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/boot/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/dboot/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
$(LINTS_DIR)/%.ln: $(UTSBASE)/i86pc/vm/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
diff --git a/usr/src/uts/i86pc/boot/boot_console.c b/usr/src/uts/i86pc/boot/boot_console.c
index cadc735588..6b0873d656 100644
--- a/usr/src/uts/i86pc/boot/boot_console.c
+++ b/usr/src/uts/i86pc/boot/boot_console.c
@@ -61,6 +61,22 @@ static int cons_color = CONS_COLOR;
static int console = CONS_SCREEN_TEXT;
static int tty_num = 0;
static int tty_addr[] = {0x3f8, 0x2f8, 0x3e8, 0x2e8};
+static char *boot_line;
+static struct boot_env {
+ char *be_env; /* ends with double ascii nul */
+ size_t be_size; /* size of the environment, including nul */
+} boot_env;
+
+static int serial_ischar(void);
+static int serial_getchar(void);
+static void serial_putchar(int);
+static void serial_adjust_prop(void);
+
+#if !defined(_BOOT)
+/* Set if the console or mode are expressed in the boot line */
+static int console_set, console_mode_set;
+#endif
+
#if defined(__xpv)
static int console_hypervisor_redirect = B_FALSE;
static int console_hypervisor_device = CONS_INVALID;
@@ -76,18 +92,6 @@ console_hypervisor_dev_type(int *tnum)
}
#endif /* __xpv */
-static int serial_ischar(void);
-static int serial_getchar(void);
-static void serial_putchar(int);
-static void serial_adjust_prop(void);
-
-static char *boot_line = NULL;
-
-#if !defined(_BOOT)
-/* Set if the console or mode are expressed in the boot line */
-static int console_set, console_mode_set;
-#endif
-
/* Clear the screen and initialize VIDEO, XPOS and YPOS. */
void
clear_screen(void)
@@ -328,6 +332,67 @@ out:
return (ret);
}
+/*
+ * Look up a property in the boot environment module.  The module data is a
+ * list of NUL-terminated "name=value" C strings; the list itself is
+ * terminated by an additional NUL (i.e. it ends with a double NUL).
+ *
+ * Returns a pointer to the value (the text following "name=") inside the
+ * environment, or NULL when no environment has been registered or the name
+ * is not present.  This function never reads beyond the boot_env.be_size
+ * bytes of the module, even if the list is missing its terminator.
+ */
+static const char *
+find_boot_env_prop(const char *name)
+{
+	char *ptr;
+	size_t len;
+	uintptr_t size;
+
+	if (boot_env.be_env == NULL)
+		return (NULL);
+
+	ptr = boot_env.be_env;
+	len = strlen(name);
+
+	/*
+	 * Make sure we have at least len + 2 bytes in the environment.
+	 * We are looking for name=value\0 constructs, and the environment
+	 * itself is terminated by '\0'.
+	 */
+	if (boot_env.be_size < len + 2)
+		return (NULL);
+
+	do {
+		if ((strncmp(ptr, name, len) == 0) && (ptr[len] == '=')) {
+			ptr += len + 1;
+			return (ptr);
+		}
+		/*
+		 * Find the first '\0'.  Stop as soon as ptr reaches the end
+		 * of the environment: be_env[be_size] is already outside the
+		 * module and must not be dereferenced.
+		 */
+		while (*ptr != '\0') {
+			ptr++;
+			size = (uintptr_t)ptr - (uintptr_t)boot_env.be_env;
+			if (size >= boot_env.be_size)
+				return (NULL);
+		}
+		ptr++;
+
+		/*
+		 * If the remainder is shorter than name + 2, get out.  size
+		 * is at most be_size here, so the subtraction cannot wrap.
+		 */
+		size = (uintptr_t)ptr - (uintptr_t)boot_env.be_env;
+		if (boot_env.be_size - size < len + 2)
+			return (NULL);
+	} while (*ptr != '\0');
+	return (NULL);
+}
+
+/*
+ * Resolve a boot property by name.  The kernel command line is consulted
+ * first, which preserves the previous behaviour and lets the user override
+ * anything set in the boot environment; the environment is only searched
+ * when the command line has no such property.
+ */
+const char *
+find_boot_prop(const char *name)
+{
+	const char *from_cmdline = find_boot_line_prop(name);
+
+	if (from_cmdline != NULL)
+		return (from_cmdline);
+	return (find_boot_env_prop(name));
+}
#define MATCHES(p, pat) \
(strncmp(p, pat, strlen(pat)) == 0 ? (p += strlen(pat), 1) : 0)
@@ -341,14 +406,14 @@ out:
/*
* find a tty mode property either from cmdline or from boot properties
*/
-static char *
+static const char *
get_mode_value(char *name)
{
/*
* when specified on boot line it looks like "name" "="....
*/
if (boot_line != NULL) {
- return (find_boot_line_prop(name));
+ return (find_boot_prop(name));
}
#if defined(_BOOT)
@@ -377,8 +442,8 @@ static void
serial_adjust_prop(void)
{
char propname[20];
- char *propval;
- char *p;
+ const char *propval;
+ const char *p;
ulong_t baud;
uchar_t lcr = 0;
uchar_t mcr = DTR | RTS;
@@ -522,27 +587,47 @@ console_value_t console_devices[] = {
{ NULL, CONS_INVALID }
};
+/*
+ * Scan the boot module list for the first module of type BMT_ENV and
+ * record its address and size in boot_env.  When no such module exists,
+ * boot_env is left untouched.
+ */
+static void
+bcons_init_env(struct xboot_info *xbi)
+{
+	struct boot_modules *bm =
+	    (struct boot_modules *)(uintptr_t)xbi->bi_modules;
+	uint32_t idx;
+
+	for (idx = 0; idx < xbi->bi_module_cnt; idx++) {
+		if (bm[idx].bm_type != BMT_ENV)
+			continue;
+
+		boot_env.be_env = (char *)(uintptr_t)bm[idx].bm_addr;
+		boot_env.be_size = bm[idx].bm_size;
+		return;
+	}
+}
+
void
-bcons_init(char *bootstr)
+bcons_init(struct xboot_info *xbi)
{
console_value_t *consolep;
size_t len, cons_len;
- char *cons_str;
+ const char *cons_str;
#if !defined(_BOOT)
static char console_text[] = "text";
extern int post_fastreboot;
#endif
- boot_line = bootstr;
+ /* Set up data to fetch properties from command line and boot env. */
+ boot_line = (char *)(uintptr_t)xbi->bi_cmdline;
+ bcons_init_env(xbi);
console = CONS_INVALID;
#if defined(__xpv)
- bcons_init_xen(bootstr);
+ bcons_init_xen(boot_line);
#endif /* __xpv */
- cons_str = find_boot_line_prop("console");
+ cons_str = find_boot_prop("console");
if (cons_str == NULL)
- cons_str = find_boot_line_prop("output-device");
+ cons_str = find_boot_prop("output-device");
#if !defined(_BOOT)
if (post_fastreboot && strcmp(cons_str, "graphics") == 0)
@@ -657,7 +742,6 @@ bcons_init(char *bootstr)
kb_init();
break;
}
- boot_line = NULL;
}
#if !defined(_BOOT)
diff --git a/usr/src/uts/i86pc/dboot/dboot_grub.s b/usr/src/uts/i86pc/dboot/dboot_grub.s
index 92cacc4983..7409c12998 100644
--- a/usr/src/uts/i86pc/dboot/dboot_grub.s
+++ b/usr/src/uts/i86pc/dboot/dboot_grub.s
@@ -1,4 +1,3 @@
-
/*
* CDDL HEADER START
*
@@ -32,6 +31,7 @@ int silence_lint_warnings = 0;
#else /* __lint */
#include <sys/multiboot.h>
+#include <sys/multiboot2.h>
#include <sys/asm_linkage.h>
#include <sys/segments.h>
#include <sys/controlregs.h>
@@ -76,6 +76,103 @@ mb_header:
.long 0 /* height 0 == don't care */
.long 0 /* depth 0 == don't care */
+#if defined(_BOOT_TARGET_i386)
+ /*
+ * The MB2 header must be 8 byte aligned relative to the beginning of
+ * the in-memory ELF object. The 32-bit kernel ELF file has sections
+ * which are 4-byte aligned, and as the .align family of directives only
+ * controls the alignment inside the section, we need to construct the
+ * image manually, by inserting the padding where needed. The alignment
+ * setup here depends on the first PT_LOAD section of the ELF file, if
+ * this section offset will change, this code must be reviewed.
+ * Similarly, if we add extra tag types into the information request
+ * or add tags into the tag list.
+ */
+ .long 0 /* padding */
+#else
+ .balign MULTIBOOT_HEADER_ALIGN
+#endif
+mb2_header:
+ .long MULTIBOOT2_HEADER_MAGIC
+ .long MULTIBOOT_ARCHITECTURE_I386
+ .long mb2_header_end - mb2_header
+ .long -(MULTIBOOT2_HEADER_MAGIC + MULTIBOOT_ARCHITECTURE_I386 + (mb2_header_end - mb2_header))
+
+ /*
+ * Multiboot 2 tags follow. Note, the first tag immediately follows
+ * the header. Subsequent tags must be aligned by MULTIBOOT_TAG_ALIGN.
+ *
+ * MB information request tag.
+ */
+information_request_tag_start:
+ .word MULTIBOOT_HEADER_TAG_INFORMATION_REQUEST
+ .word 0
+ .long information_request_tag_end - information_request_tag_start
+ .long MULTIBOOT_TAG_TYPE_CMDLINE
+ .long MULTIBOOT_TAG_TYPE_MODULE
+ .long MULTIBOOT_TAG_TYPE_BOOTDEV
+ .long MULTIBOOT_TAG_TYPE_MMAP
+ .long MULTIBOOT_TAG_TYPE_BASIC_MEMINFO
+information_request_tag_end:
+ .long 0 /* padding */
+
+#if defined (_BOOT_TARGET_amd64)
+ /*
+ * The following values are patched by mbh_patch for the 64-bit kernel,
+ * so we only provide this tag for the 64-bit kernel.
+ */
+ .balign MULTIBOOT_TAG_ALIGN
+address_tag_start:
+ .word MULTIBOOT_HEADER_TAG_ADDRESS
+ .word 0
+ .long address_tag_end - address_tag_start
+ .long mb2_header
+ .globl mb2_load_addr
+mb2_load_addr:
+ .long 0 /* load addr */
+ .long 0 /* load_end_addr */
+ .long 0 /* bss_end_addr */
+address_tag_end:
+ /*
+ * entry address tag
+ */
+ .balign MULTIBOOT_TAG_ALIGN
+entry_address_tag_start:
+ .word MULTIBOOT_HEADER_TAG_ENTRY_ADDRESS
+ .word 0
+ .long entry_address_tag_end - entry_address_tag_start
+ .long 0 /* entry addr */
+entry_address_tag_end:
+
+ .balign MULTIBOOT_TAG_ALIGN /* Alignment for the next tag */
+#endif
+ /*
+ * MB console flags tag
+ */
+console_tag_start:
+ .word MULTIBOOT_HEADER_TAG_CONSOLE_FLAGS
+ .word 0
+ .long console_tag_end - console_tag_start
+ .long MULTIBOOT_CONSOLE_FLAGS_EGA_TEXT_SUPPORTED
+console_tag_end:
+ .long 0 /* padding */
+
+ /*
+ * Tell the bootloader to load the modules page aligned. This tag
+ * carries no payload; the .long below is the tag size.
+ */
+ .word MULTIBOOT_HEADER_TAG_MODULE_ALIGN
+ .word 0
+ .long 8
+
+ /*
+ * Termination tag.
+ */
+ .word MULTIBOOT_HEADER_TAG_END
+ .word 0
+ .long 8
+mb2_header_end:
+
/*
* At entry we are in protected mode, 32 bit execution, paging and
* interrupts are disabled.
@@ -85,7 +182,8 @@ mb_header:
* segment registers all have segments with base 0, limit == 0xffffffff
*/
code_start:
- movl %ebx, mb_info
+ movl %eax, mb_magic
+ movl %ebx, mb_addr
movl $stack_space, %esp /* load my stack pointer */
addl $STACK_SIZE, %esp
diff --git a/usr/src/uts/i86pc/dboot/dboot_multiboot2.c b/usr/src/uts/i86pc/dboot/dboot_multiboot2.c
new file mode 100644
index 0000000000..ccf81cf773
--- /dev/null
+++ b/usr/src/uts/i86pc/dboot/dboot_multiboot2.c
@@ -0,0 +1,341 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Toomas Soome <tsoome@me.com>
+ */
+
+/*
+ * dboot module utility functions for multiboot 2 tags processing.
+ */
+
+#include <sys/inttypes.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/multiboot2.h>
+#include <sys/multiboot2_impl.h>
+
+struct dboot_multiboot2_iterate_ctx;
+
+typedef boolean_t (*dboot_multiboot2_iterate_cb_t)
+ (int, multiboot_tag_t *, struct dboot_multiboot2_iterate_ctx *);
+
+struct dboot_multiboot2_iterate_ctx {
+ dboot_multiboot2_iterate_cb_t dboot_iter_callback;
+ int dboot_iter_index; /* item from set */
+ uint32_t dboot_iter_tag; /* tag to search */
+ multiboot_tag_t *dboot_iter_tagp; /* search result */
+};
+
+/*
+ * Multiboot2 tag list elements are aligned to MULTIBOOT_TAG_ALIGN.
+ * To get the next item from the list, we first add the tag's size
+ * to the start of the current tag. Next, we round up that address to the
+ * nearest MULTIBOOT_TAG_ALIGN address.
+ */
+
+static multiboot_tag_t *
+dboot_multiboot2_first_tag(multiboot2_info_header_t *mbi)
+{
+ return (&mbi->mbi_tags[0]);
+}
+
+static multiboot_tag_t *
+dboot_multiboot2_next_tag(multiboot_tag_t *tag)
+{
+ if (tag == NULL || tag->mb_type == MULTIBOOT_TAG_TYPE_END)
+ return (NULL);
+
+ return ((multiboot_tag_t *)P2ROUNDUP((uintptr_t)tag +
+ tag->mb_size, MULTIBOOT_TAG_ALIGN));
+}
+
+/*
+ * Walk the tag list until we hit the first instance of a given tag or
+ * the end of the list.
+ * dboot_multiboot2_next_tag() will return NULL on end of list.
+ */
+static void *
+dboot_multiboot2_find_tag_impl(multiboot_tag_t *tagp, uint32_t tag)
+{
+ while (tagp != NULL && tagp->mb_type != tag) {
+ tagp = dboot_multiboot2_next_tag(tagp);
+ }
+ return (tagp);
+}
+
+/*
+ * Walk the entire list to find the first instance of the given tag.
+ */
+void *
+dboot_multiboot2_find_tag(multiboot2_info_header_t *mbi, uint32_t tag)
+{
+ multiboot_tag_t *tagp = dboot_multiboot2_first_tag(mbi);
+
+ return (dboot_multiboot2_find_tag_impl(tagp, tag));
+}
+
+/*
+ * dboot_multiboot2_iterate()
+ *
+ * While most tags in tag list are unique, the modules are specified
+ * one module per tag and therefore we need a mechanism to process
+ * tags in set.
+ *
+ * Arguments:
+ * mbi: multiboot info header
+ * ctx: callback context.
+ *
+ * Return value:
+ * Processed item count.
+ * Callback returning B_TRUE will terminate the iteration.
+ */
+static int
+dboot_multiboot2_iterate(multiboot2_info_header_t *mbi,
+ struct dboot_multiboot2_iterate_ctx *ctx)
+{
+ dboot_multiboot2_iterate_cb_t callback = ctx->dboot_iter_callback;
+ multiboot_tag_t *tagp;
+ uint32_t tag = ctx->dboot_iter_tag;
+ int index = 0;
+
+ tagp = dboot_multiboot2_find_tag(mbi, tag);
+ while (tagp != NULL) {
+ if (callback != NULL) {
+ if (callback(index, tagp, ctx) == B_TRUE) {
+ return (index + 1);
+ }
+ }
+ tagp = dboot_multiboot2_next_tag(tagp);
+ tagp = dboot_multiboot2_find_tag_impl(tagp, tag);
+ index++;
+ }
+ return (index);
+}
+
+char *
+dboot_multiboot2_cmdline(multiboot2_info_header_t *mbi)
+{
+ multiboot_tag_string_t *tag;
+
+ tag = dboot_multiboot2_find_tag(mbi, MULTIBOOT_TAG_TYPE_CMDLINE);
+
+ if (tag != NULL)
+ return (&tag->mb_string[0]);
+ else
+ return (NULL);
+}
+
+/*
+ * Simple callback to index item in set.
+ * Terminates iteration if the indexed item is found.
+ */
+static boolean_t
+dboot_multiboot2_iterate_callback(int index, multiboot_tag_t *tagp,
+ struct dboot_multiboot2_iterate_ctx *ctx)
+{
+ if (index == ctx->dboot_iter_index) {
+ ctx->dboot_iter_tagp = tagp;
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+int
+dboot_multiboot2_modcount(multiboot2_info_header_t *mbi)
+{
+ struct dboot_multiboot2_iterate_ctx ctx = {
+ .dboot_iter_callback = NULL,
+ .dboot_iter_index = 0,
+ .dboot_iter_tag = MULTIBOOT_TAG_TYPE_MODULE,
+ .dboot_iter_tagp = NULL
+ };
+
+ return (dboot_multiboot2_iterate(mbi, &ctx));
+}
+
+uint32_t
+dboot_multiboot2_modstart(multiboot2_info_header_t *mbi, int index)
+{
+ multiboot_tag_module_t *tagp;
+ struct dboot_multiboot2_iterate_ctx ctx = {
+ .dboot_iter_callback = dboot_multiboot2_iterate_callback,
+ .dboot_iter_index = index,
+ .dboot_iter_tag = MULTIBOOT_TAG_TYPE_MODULE,
+ .dboot_iter_tagp = NULL
+ };
+
+ if (dboot_multiboot2_iterate(mbi, &ctx) != 0) {
+ tagp = (multiboot_tag_module_t *)ctx.dboot_iter_tagp;
+
+ if (tagp != NULL)
+ return (tagp->mb_mod_start);
+ }
+ return (0);
+}
+
+uint32_t
+dboot_multiboot2_modend(multiboot2_info_header_t *mbi, int index)
+{
+ multiboot_tag_module_t *tagp;
+ struct dboot_multiboot2_iterate_ctx ctx = {
+ .dboot_iter_callback = dboot_multiboot2_iterate_callback,
+ .dboot_iter_index = index,
+ .dboot_iter_tag = MULTIBOOT_TAG_TYPE_MODULE,
+ .dboot_iter_tagp = NULL
+ };
+
+ if (dboot_multiboot2_iterate(mbi, &ctx) != 0) {
+ tagp = (multiboot_tag_module_t *)ctx.dboot_iter_tagp;
+
+ if (tagp != NULL)
+ return (tagp->mb_mod_end);
+ }
+ return (0);
+}
+
+char *
+dboot_multiboot2_modcmdline(multiboot2_info_header_t *mbi, int index)
+{
+ multiboot_tag_module_t *tagp;
+ struct dboot_multiboot2_iterate_ctx ctx = {
+ .dboot_iter_callback = dboot_multiboot2_iterate_callback,
+ .dboot_iter_index = index,
+ .dboot_iter_tag = MULTIBOOT_TAG_TYPE_MODULE,
+ .dboot_iter_tagp = NULL
+ };
+
+ if (dboot_multiboot2_iterate(mbi, &ctx) != 0) {
+ tagp = (multiboot_tag_module_t *)ctx.dboot_iter_tagp;
+
+ if (tagp != NULL)
+ return (&tagp->mb_cmdline[0]);
+ }
+ return (NULL);
+}
+
+multiboot_tag_mmap_t *
+dboot_multiboot2_get_mmap_tagp(multiboot2_info_header_t *mbi)
+{
+ return (dboot_multiboot2_find_tag(mbi, MULTIBOOT_TAG_TYPE_MMAP));
+}
+
+boolean_t
+dboot_multiboot2_basicmeminfo(multiboot2_info_header_t *mbi,
+ uint32_t *lower, uint32_t *upper)
+{
+ multiboot_tag_basic_meminfo_t *mip;
+
+ mip = dboot_multiboot2_find_tag(mbi, MULTIBOOT_TAG_TYPE_BASIC_MEMINFO);
+ if (mip != NULL) {
+ *lower = mip->mb_mem_lower;
+ *upper = mip->mb_mem_upper;
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+/*
+ * Return the type of mmap entry referenced by index (0 if index >= count).
+ */
+uint32_t
+dboot_multiboot2_mmap_get_type(multiboot2_info_header_t *mbi,
+ multiboot_tag_mmap_t *mb2_mmap_tagp, int index)
+{
+ multiboot_mmap_entry_t *mapentp;
+
+ if (mb2_mmap_tagp == NULL)
+ mb2_mmap_tagp = dboot_multiboot2_get_mmap_tagp(mbi);
+
+ if (mb2_mmap_tagp == NULL)
+ return (0);
+
+ if (dboot_multiboot2_mmap_nentries(mbi, mb2_mmap_tagp) <= index)
+ return (0);
+
+ mapentp = (multiboot_mmap_entry_t *)(mb2_mmap_tagp->mb_entries +
+ index * mb2_mmap_tagp->mb_entry_size);
+ return (mapentp->mmap_type);
+}
+
+/*
+ * Return the length of mmap entry referenced by index (0 if index >= count).
+ */
+uint64_t
+dboot_multiboot2_mmap_get_length(multiboot2_info_header_t *mbi,
+ multiboot_tag_mmap_t *mb2_mmap_tagp, int index)
+{
+ multiboot_mmap_entry_t *mapentp;
+
+ if (mb2_mmap_tagp == NULL)
+ mb2_mmap_tagp = dboot_multiboot2_get_mmap_tagp(mbi);
+
+ if (mb2_mmap_tagp == NULL)
+ return (0);
+
+ if (dboot_multiboot2_mmap_nentries(mbi, mb2_mmap_tagp) <= index)
+ return (0);
+
+ mapentp = (multiboot_mmap_entry_t *)(mb2_mmap_tagp->mb_entries +
+ index * mb2_mmap_tagp->mb_entry_size);
+ return (mapentp->mmap_len);
+}
+
+/*
+ * Return the address from mmap entry referenced by index (0 if index >= count).
+ */
+uint64_t
+dboot_multiboot2_mmap_get_base(multiboot2_info_header_t *mbi,
+ multiboot_tag_mmap_t *mb2_mmap_tagp, int index)
+{
+ multiboot_mmap_entry_t *mapentp;
+
+ if (mb2_mmap_tagp == NULL)
+ mb2_mmap_tagp = dboot_multiboot2_get_mmap_tagp(mbi);
+
+ if (mb2_mmap_tagp == NULL)
+ return (0);
+
+ if (dboot_multiboot2_mmap_nentries(mbi, mb2_mmap_tagp) <= index)
+ return (0);
+
+ mapentp = (multiboot_mmap_entry_t *)(mb2_mmap_tagp->mb_entries +
+ index * mb2_mmap_tagp->mb_entry_size);
+ return (mapentp->mmap_addr);
+}
+
+/*
+ * Return the number of mmap entries in the tag (0 if absent or entry size 0).
+ */
+int
+dboot_multiboot2_mmap_nentries(multiboot2_info_header_t *mbi,
+ multiboot_tag_mmap_t *mb2_mmap_tagp)
+{
+ if (mb2_mmap_tagp == NULL)
+ mb2_mmap_tagp = dboot_multiboot2_get_mmap_tagp(mbi);
+
+ if (mb2_mmap_tagp != NULL && mb2_mmap_tagp->mb_entry_size != 0) {
+ return ((mb2_mmap_tagp->mb_size -
+ offsetof(multiboot_tag_mmap_t, mb_entries)) /
+ mb2_mmap_tagp->mb_entry_size);
+ }
+ return (0);
+}
+
+/*
+ * Return the highest address used by info header.
+ */
+paddr_t
+dboot_multiboot2_highest_addr(multiboot2_info_header_t *mbi)
+{
+ return ((paddr_t)(uintptr_t)mbi + mbi->mbi_total_size);
+}
diff --git a/usr/src/uts/i86pc/dboot/dboot_startkern.c b/usr/src/uts/i86pc/dboot/dboot_startkern.c
index 6abb7c6349..344665bf1f 100644
--- a/usr/src/uts/i86pc/dboot/dboot_startkern.c
+++ b/usr/src/uts/i86pc/dboot/dboot_startkern.c
@@ -33,6 +33,9 @@
#include <sys/systm.h>
#include <sys/mach_mmu.h>
#include <sys/multiboot.h>
+#include <sys/multiboot2.h>
+#include <sys/multiboot2_impl.h>
+#include <sys/sysmacros.h>
#include <sys/sha1.h>
#include <util/string.h>
#include <util/strtolctype.h>
@@ -46,6 +49,7 @@ pfn_t *mfn_to_pfn_mapping;
#else /* !__xpv */
extern multiboot_header_t mb_header;
+extern uint32_t mb2_load_addr;
extern int have_cpuid(void);
#endif /* !__xpv */
@@ -132,7 +136,15 @@ start_info_t *xen_info;
/*
* If on the metal, then we have a multiboot loader.
*/
+uint32_t mb_magic; /* magic from boot loader */
+uint32_t mb_addr; /* multiboot info package from loader */
+int multiboot_version;
multiboot_info_t *mb_info;
+multiboot2_info_header_t *mb2_info;
+multiboot_tag_mmap_t *mb2_mmap_tagp;
+int num_entries; /* mmap entry count */
+boolean_t num_entries_set; /* is mmap entry count set */
+uintptr_t load_addr;
#endif /* __xpv */
@@ -181,6 +193,30 @@ uint_t rsvdmemlists_used = 0;
struct boot_modules modules[MAX_BOOT_MODULES];
uint_t modules_used = 0;
+#ifdef __xpv
+/*
+ * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
+ * definition in Xen source.
+ */
+typedef struct {
+ uint32_t base_addr_low;
+ uint32_t base_addr_high;
+ uint32_t length_low;
+ uint32_t length_high;
+ uint32_t type;
+} mmap_t;
+
+/*
+ * There is 512KB of scratch area after the boot stack page.
+ * We'll use that for everything except the kernel nucleus pages which are too
+ * big to fit there and are allocated last anyway.
+ */
+#define MAXMAPS 100
+static mmap_t map_buffer[MAXMAPS];
+#else
+typedef mb_memory_map_t mmap_t;
+#endif
+
/*
* Debugging macros
*/
@@ -616,29 +652,182 @@ exclude_from_pci(uint64_t start, uint64_t end)
}
/*
- * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
- * definition in Xen source.
+ * During memory allocation, find the highest address not used yet.
*/
-#ifdef __xpv
-typedef struct {
- uint32_t base_addr_low;
- uint32_t base_addr_high;
- uint32_t length_low;
- uint32_t length_high;
- uint32_t type;
-} mmap_t;
+static void
+check_higher(paddr_t a)
+{
+ if (a < next_avail_addr)
+ return;
+ next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
+ DBG(next_avail_addr);
+}
+
+static int
+dboot_loader_mmap_entries(void)
+{
+#if !defined(__xpv)
+ if (num_entries_set == B_TRUE)
+ return (num_entries);
+
+ switch (multiboot_version) {
+ case 1:
+ DBG(mb_info->flags);
+ if (mb_info->flags & 0x40) {
+ mb_memory_map_t *mmap;
+
+ DBG(mb_info->mmap_addr);
+ DBG(mb_info->mmap_length);
+ check_higher(mb_info->mmap_addr + mb_info->mmap_length);
+
+ for (mmap = (mb_memory_map_t *)mb_info->mmap_addr;
+ (uint32_t)mmap < mb_info->mmap_addr +
+ mb_info->mmap_length;
+ mmap = (mb_memory_map_t *)((uint32_t)mmap +
+ mmap->size + sizeof (mmap->size)))
+ ++num_entries;
+
+ num_entries_set = B_TRUE;
+ }
+ break;
+ case 2:
+ num_entries_set = B_TRUE;
+ num_entries = dboot_multiboot2_mmap_nentries(mb2_info,
+ mb2_mmap_tagp);
+ break;
+ default:
+ dboot_panic("Unknown multiboot version: %d\n",
+ multiboot_version);
+ break;
+ }
+ return (num_entries);
#else
-typedef mb_memory_map_t mmap_t;
+ return (MAXMAPS);
+#endif
+}
+
+static uint32_t
+dboot_loader_mmap_get_type(int index)
+{
+#if !defined(__xpv)
+ mb_memory_map_t *mp, *mpend;
+ int i;
+
+ switch (multiboot_version) {
+ case 1:
+ mp = (mb_memory_map_t *)mb_info->mmap_addr;
+ mpend = (mb_memory_map_t *)
+ (mb_info->mmap_addr + mb_info->mmap_length);
+
+ for (i = 0; mp < mpend && i != index; i++)
+ mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
+ sizeof (mp->size));
+ if (mp >= mpend) {
+ dboot_panic("dboot_loader_mmap_get_type(): index "
+ "out of bounds: %d\n", index);
+ }
+ return (mp->type);
+
+ case 2:
+ return (dboot_multiboot2_mmap_get_type(mb2_info,
+ mb2_mmap_tagp, index));
+
+ default:
+ dboot_panic("Unknown multiboot version: %d\n",
+ multiboot_version);
+ break;
+ }
+ return (0);
+#else
+ return (map_buffer[index].type);
+#endif
+}
+
+static uint64_t
+dboot_loader_mmap_get_base(int index)
+{
+#if !defined(__xpv)
+ mb_memory_map_t *mp, *mpend;
+ int i;
+
+ switch (multiboot_version) {
+ case 1:
+ mp = (mb_memory_map_t *)mb_info->mmap_addr;
+ mpend = (mb_memory_map_t *)
+ (mb_info->mmap_addr + mb_info->mmap_length);
+
+ for (i = 0; mp < mpend && i != index; i++)
+ mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
+ sizeof (mp->size));
+ if (mp >= mpend) {
+ dboot_panic("dboot_loader_mmap_get_base(): index "
+ "out of bounds: %d\n", index);
+ }
+ return (((uint64_t)mp->base_addr_high << 32) +
+ (uint64_t)mp->base_addr_low);
+
+ case 2:
+ return (dboot_multiboot2_mmap_get_base(mb2_info,
+ mb2_mmap_tagp, index));
+
+ default:
+ dboot_panic("Unknown multiboot version: %d\n",
+ multiboot_version);
+ break;
+ }
+ return (0);
+#else
+ return (((uint64_t)map_buffer[index].base_addr_high << 32) +
+ (uint64_t)map_buffer[index].base_addr_low);
+#endif
+}
+
+static uint64_t
+dboot_loader_mmap_get_length(int index)
+{
+#if !defined(__xpv)
+ mb_memory_map_t *mp, *mpend;
+ int i;
+
+ switch (multiboot_version) {
+ case 1:
+ mp = (mb_memory_map_t *)mb_info->mmap_addr;
+ mpend = (mb_memory_map_t *)
+ (mb_info->mmap_addr + mb_info->mmap_length);
+
+ for (i = 0; mp < mpend && i != index; i++)
+ mp = (mb_memory_map_t *)((uint32_t)mp + mp->size +
+ sizeof (mp->size));
+ if (mp >= mpend) {
+ dboot_panic("dboot_loader_mmap_get_length(): index "
+ "out of bounds: %d\n", index);
+ }
+ return (((uint64_t)mp->length_high << 32) +
+ (uint64_t)mp->length_low);
+
+ case 2:
+ return (dboot_multiboot2_mmap_get_length(mb2_info,
+ mb2_mmap_tagp, index));
+
+ default:
+ dboot_panic("Unknown multiboot version: %d\n",
+ multiboot_version);
+ break;
+ }
+ return (0);
+#else
+ return (((uint64_t)map_buffer[index].length_high << 32) +
+ (uint64_t)map_buffer[index].length_low);
#endif
+}
static void
-build_pcimemlists(mmap_t *mem, int num)
+build_pcimemlists(void)
{
- mmap_t *mmap;
uint64_t page_offset = MMU_PAGEOFFSET; /* needs to be 64 bits */
uint64_t start;
uint64_t end;
- int i;
+ int i, num;
/*
* initialize
@@ -647,18 +836,18 @@ build_pcimemlists(mmap_t *mem, int num)
pcimemlists[0].size = pci_hi_limit - pci_lo_limit;
pcimemlists_used = 1;
+ num = dboot_loader_mmap_entries();
/*
* Fill in PCI memlists.
*/
- for (mmap = mem, i = 0; i < num; ++i, ++mmap) {
- start = ((uint64_t)mmap->base_addr_high << 32) +
- mmap->base_addr_low;
- end = start + ((uint64_t)mmap->length_high << 32) +
- mmap->length_low;
+ for (i = 0; i < num; ++i) {
+ start = dboot_loader_mmap_get_base(i);
+ end = start + dboot_loader_mmap_get_length(i);
if (prom_debug)
dboot_printf("\ttype: %d %" PRIx64 "..%"
- PRIx64 "\n", mmap->type, start, end);
+ PRIx64 "\n", dboot_loader_mmap_get_type(i),
+ start, end);
/*
* page align start and end
@@ -697,13 +886,7 @@ build_pcimemlists(mmap_t *mem, int num)
#if defined(__xpv)
/*
* Initialize memory allocator stuff from hypervisor-supplied start info.
- *
- * There is 512KB of scratch area after the boot stack page.
- * We'll use that for everything except the kernel nucleus pages which are too
- * big to fit there and are allocated last anyway.
*/
-#define MAXMAPS 100
-static mmap_t map_buffer[MAXMAPS];
static void
init_mem_alloc(void)
{
@@ -783,12 +966,159 @@ init_mem_alloc(void)
set_xen_guest_handle(map.buffer, map_buffer);
if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &map) != 0)
dboot_panic("getting XENMEM_machine_memory_map failed");
- build_pcimemlists(map_buffer, map.nr_entries);
+ build_pcimemlists();
}
}
#else /* !__xpv */
+static void
+dboot_multiboot1_xboot_consinfo(void)
+{
+}
+
+static void
+dboot_multiboot2_xboot_consinfo(void)
+{
+}
+
+static int
+dboot_multiboot_modcount(void)
+{
+ switch (multiboot_version) {
+ case 1:
+ return (mb_info->mods_count);
+
+ case 2:
+ return (dboot_multiboot2_modcount(mb2_info));
+
+ default:
+ dboot_panic("Unknown multiboot version: %d\n",
+ multiboot_version);
+ break;
+ }
+ return (0);
+}
+
+static uint32_t
+dboot_multiboot_modstart(int index)
+{
+ switch (multiboot_version) {
+ case 1:
+ return (((mb_module_t *)mb_info->mods_addr)[index].mod_start);
+
+ case 2:
+ return (dboot_multiboot2_modstart(mb2_info, index));
+
+ default:
+ dboot_panic("Unknown multiboot version: %d\n",
+ multiboot_version);
+ break;
+ }
+ return (0);
+}
+
+static uint32_t
+dboot_multiboot_modend(int index)
+{
+ switch (multiboot_version) {
+ case 1:
+ return (((mb_module_t *)mb_info->mods_addr)[index].mod_end);
+
+ case 2:
+ return (dboot_multiboot2_modend(mb2_info, index));
+
+ default:
+ dboot_panic("Unknown multiboot version: %d\n",
+ multiboot_version);
+ break;
+ }
+ return (0);
+}
+
+static char *
+dboot_multiboot_modcmdline(int index)
+{
+ switch (multiboot_version) {
+ case 1:
+ return ((char *)((mb_module_t *)
+ mb_info->mods_addr)[index].mod_name);
+
+ case 2:
+ return (dboot_multiboot2_modcmdline(mb2_info, index));
+
+ default:
+ dboot_panic("Unknown multiboot version: %d\n",
+ multiboot_version);
+ break;
+ }
+ return (NULL); /* pointer return: NULL means no command line */
+}
+
+/*
+ * Find the environment module for console setup.
+ * Since we need the console to print early boot messages, the console is set up
+ * before anything else and therefore we need to pick up the environment module
+ * early too.
+ *
+ * Note: here we only search for the env module and, if found, pass it
+ * to console setup; the proper module list processing will happen later.
+ */
+static void
+dboot_find_env(void)
+{
+ int i, modcount;
+ uint32_t mod_start, mod_end;
+ char *cmdline;
+
+ modcount = dboot_multiboot_modcount();
+
+ for (i = 0; i < modcount; ++i) {
+ cmdline = dboot_multiboot_modcmdline(i);
+ if (cmdline == NULL)
+ continue;
+
+ if (strstr(cmdline, "type=environment") == NULL)
+ continue;
+
+ mod_start = dboot_multiboot_modstart(i);
+ mod_end = dboot_multiboot_modend(i);
+ modules[0].bm_addr = mod_start;
+ modules[0].bm_size = mod_end - mod_start;
+ modules[0].bm_name = NULL;
+ modules[0].bm_hash = NULL;
+ modules[0].bm_type = BMT_ENV;
+ bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
+ bi->bi_module_cnt = 1;
+ return;
+ }
+}
+
+static boolean_t
+dboot_multiboot_basicmeminfo(uint32_t *lower, uint32_t *upper)
+{
+ boolean_t rv = B_FALSE;
+
+ switch (multiboot_version) {
+ case 1:
+ if (mb_info->flags & 0x01) {
+ *lower = mb_info->mem_lower;
+ *upper = mb_info->mem_upper;
+ rv = B_TRUE;
+ }
+ break;
+
+ case 2:
+ return (dboot_multiboot2_basicmeminfo(mb2_info, lower, upper));
+
+ default:
+ dboot_panic("Unknown multiboot version: %d\n",
+ multiboot_version);
+ break;
+ }
+ return (rv);
+}
+
static uint8_t
dboot_a2h(char v)
{
@@ -860,6 +1190,8 @@ type_to_str(boot_module_type_t type)
return ("file");
case BMT_HASH:
return ("hash");
+ case BMT_ENV:
+ return ("environment");
default:
return ("unknown");
}
@@ -908,21 +1240,23 @@ check_images(void)
* hashes which are checked prior to transferring control to the kernel.
*/
static void
-process_module(mb_module_t *mod)
+process_module(int midx)
{
- int midx = modules_used++;
+ uint32_t mod_start = dboot_multiboot_modstart(midx);
+ uint32_t mod_end = dboot_multiboot_modend(midx);
+ char *cmdline = dboot_multiboot_modcmdline(midx);
char *p, *q;
+ check_higher(mod_end);
if (prom_debug) {
dboot_printf("\tmodule #%d: '%s' at 0x%lx, end 0x%lx\n",
- midx, (char *)(mod->mod_name),
- (ulong_t)mod->mod_start, (ulong_t)mod->mod_end);
+ midx, cmdline, (ulong_t)mod_start, (ulong_t)mod_end);
}
- if (mod->mod_start > mod->mod_end) {
+ if (mod_start > mod_end) {
dboot_panic("module #%d: module start address 0x%lx greater "
"than end address 0x%lx", midx,
- (ulong_t)mod->mod_start, (ulong_t)mod->mod_end);
+ (ulong_t)mod_start, (ulong_t)mod_end);
}
/*
@@ -943,18 +1277,18 @@ process_module(mb_module_t *mod)
* correct number of bytes in each module, achieving exactly this.
*/
- modules[midx].bm_addr = mod->mod_start;
- modules[midx].bm_size = mod->mod_end - mod->mod_start;
- modules[midx].bm_name = mod->mod_name;
+ modules[midx].bm_addr = mod_start;
+ modules[midx].bm_size = mod_end - mod_start;
+ modules[midx].bm_name = (native_ptr_t)(uintptr_t)cmdline;
modules[midx].bm_hash = NULL;
modules[midx].bm_type = BMT_FILE;
- if (mod->mod_name == NULL) {
+ if (cmdline == NULL) {
modules[midx].bm_name = (native_ptr_t)(uintptr_t)noname;
return;
}
- p = (char *)(uintptr_t)mod->mod_name;
+ p = cmdline;
modules[midx].bm_name =
(native_ptr_t)(uintptr_t)strsep(&p, " \t\f\n\r");
@@ -976,6 +1310,8 @@ process_module(mb_module_t *mod)
modules[midx].bm_type = BMT_ROOTFS;
} else if (strcmp(q, "hash") == 0) {
modules[midx].bm_type = BMT_HASH;
+ } else if (strcmp(q, "environment") == 0) {
+ modules[midx].bm_type = BMT_ENV;
} else if (strcmp(q, "file") != 0) {
dboot_printf("\tmodule #%d: unknown module "
"type '%s'; defaulting to 'file'",
@@ -1065,89 +1401,69 @@ assign_module_hashes(void)
}
/*
- * During memory allocation, find the highest address not used yet.
- */
-static void
-check_higher(paddr_t a)
-{
- if (a < next_avail_addr)
- return;
- next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE);
- DBG(next_avail_addr);
-}
-
-/*
* Walk through the module information finding the last used address.
* The first available address will become the top level page table.
- *
- * We then build the phys_install memlist from the multiboot information.
*/
static void
-init_mem_alloc(void)
+dboot_process_modules(void)
{
- mb_memory_map_t *mmap;
- mb_module_t *mod;
- uint64_t start;
- uint64_t end;
- uint64_t page_offset = MMU_PAGEOFFSET; /* needs to be 64 bits */
+ int i, modcount;
extern char _end[];
- int i;
-
- DBG_MSG("Entered init_mem_alloc()\n");
- DBG((uintptr_t)mb_info);
- if (mb_info->mods_count > MAX_BOOT_MODULES) {
+ DBG_MSG("\nFinding Modules\n");
+ modcount = dboot_multiboot_modcount();
+ if (modcount > MAX_BOOT_MODULES) {
dboot_panic("Too many modules (%d) -- the maximum is %d.",
- mb_info->mods_count, MAX_BOOT_MODULES);
+ modcount, MAX_BOOT_MODULES);
}
/*
* search the modules to find the last used address
* we'll build the module list while we're walking through here
*/
- DBG_MSG("\nFinding Modules\n");
check_higher((paddr_t)(uintptr_t)&_end);
- for (mod = (mb_module_t *)(mb_info->mods_addr), i = 0;
- i < mb_info->mods_count;
- ++mod, ++i) {
- process_module(mod);
- check_higher(mod->mod_end);
+ for (i = 0; i < modcount; ++i) {
+ process_module(i);
+ modules_used++;
}
bi->bi_modules = (native_ptr_t)(uintptr_t)modules;
DBG(bi->bi_modules);
- bi->bi_module_cnt = mb_info->mods_count;
+ bi->bi_module_cnt = modcount;
DBG(bi->bi_module_cnt);
fixup_modules();
assign_module_hashes();
check_images();
+}
+
+/*
+ * Build the phys_install memlist from the multiboot information.
+ */
+static void
+dboot_process_mmap(void)
+{
+ uint64_t start;
+ uint64_t end;
+ uint64_t page_offset = MMU_PAGEOFFSET; /* needs to be 64 bits */
+ uint32_t lower, upper;
+ int i, mmap_entries;
/*
* Walk through the memory map from multiboot and build our memlist
* structures. Note these will have native format pointers.
*/
DBG_MSG("\nFinding Memory Map\n");
- DBG(mb_info->flags);
+ num_entries = 0;
+ num_entries_set = B_FALSE;
max_mem = 0;
- if (mb_info->flags & 0x40) {
- int cnt = 0;
-
- DBG(mb_info->mmap_addr);
- DBG(mb_info->mmap_length);
- check_higher(mb_info->mmap_addr + mb_info->mmap_length);
-
- for (mmap = (mb_memory_map_t *)mb_info->mmap_addr;
- (uint32_t)mmap < mb_info->mmap_addr + mb_info->mmap_length;
- mmap = (mb_memory_map_t *)((uint32_t)mmap + mmap->size
- + sizeof (mmap->size))) {
- ++cnt;
- start = ((uint64_t)mmap->base_addr_high << 32) +
- mmap->base_addr_low;
- end = start + ((uint64_t)mmap->length_high << 32) +
- mmap->length_low;
+ if ((mmap_entries = dboot_loader_mmap_entries()) > 0) {
+ for (i = 0; i < mmap_entries; i++) {
+ uint32_t type = dboot_loader_mmap_get_type(i);
+ start = dboot_loader_mmap_get_base(i);
+ end = start + dboot_loader_mmap_get_length(i);
if (prom_debug)
dboot_printf("\ttype: %d %" PRIx64 "..%"
- PRIx64 "\n", mmap->type, start, end);
+ PRIx64 "\n", type, start, end);
/*
* page align start and end
@@ -1160,7 +1476,7 @@ init_mem_alloc(void)
/*
* only type 1 is usable RAM
*/
- switch (mmap->type) {
+ switch (type) {
case 1:
if (end > max_mem)
max_mem = end;
@@ -1214,22 +1530,21 @@ init_mem_alloc(void)
continue;
}
}
- build_pcimemlists((mb_memory_map_t *)mb_info->mmap_addr, cnt);
- } else if (mb_info->flags & 0x01) {
- DBG(mb_info->mem_lower);
+ build_pcimemlists();
+ } else if (dboot_multiboot_basicmeminfo(&lower, &upper)) {
+ DBG(lower);
memlists[memlists_used].addr = 0;
- memlists[memlists_used].size = mb_info->mem_lower * 1024;
+ memlists[memlists_used].size = lower * 1024;
++memlists_used;
- DBG(mb_info->mem_upper);
+ DBG(upper);
memlists[memlists_used].addr = 1024 * 1024;
- memlists[memlists_used].size = mb_info->mem_upper * 1024;
+ memlists[memlists_used].size = upper * 1024;
++memlists_used;
/*
* Old platform - assume I/O space at the end of memory.
*/
- pcimemlists[0].addr =
- (mb_info->mem_upper * 1024) + (1024 * 1024);
+ pcimemlists[0].addr = (upper * 1024) + (1024 * 1024);
pcimemlists[0].size = pci_hi_limit - pcimemlists[0].addr;
pcimemlists[0].next = 0;
pcimemlists[0].prev = 0;
@@ -1239,8 +1554,6 @@ init_mem_alloc(void)
dboot_panic("No memory info from boot loader!!!");
}
- check_higher(bi->bi_cmdline);
-
/*
* finish processing the physinstall list
*/
@@ -1251,6 +1564,102 @@ init_mem_alloc(void)
*/
build_rsvdmemlists();
}
+
+/*
+ * The highest address is used as the starting point for dboot's simple
+ * memory allocator.
+ *
+ * Finding the highest address in case of Multiboot 1 protocol is
+ * quite painful in the sense that some information provided by
+ * the multiboot info structure points to BIOS data, and some to RAM.
+ *
+ * The module list was processed and checked already by dboot_process_modules(),
+ * so we will check the command line string and the memory map.
+ *
+ * This list of items to check is based on our current knowledge of
+ * allocations made by grub1 and will need to be reviewed if there
+ * are updates about the information provided by Multiboot 1.
+ *
+ * In the case of the Multiboot 2, our life is much simpler, as the MB2
+ * information tag list is one contiguous chunk of memory.
+ *
+ * Returns the first address past the highest of the checked items, or 0
+ * when neither the command line nor the memory map is flagged as present.
+ */
+static paddr_t
+dboot_multiboot1_highest_addr(void)
+{
+	/* paddr_t is an integer type, so "nothing found" is 0, not NULL. */
+	paddr_t addr = 0;
+	char *cmdl;
+
+	/* Only trust the cmdline field when the loader says it is valid. */
+	if (mb_info->flags & MB_INFO_CMDLINE) {
+		cmdl = (char *)(uintptr_t)mb_info->cmdline;
+		/* One past the terminating NUL of the command line. */
+		addr = (paddr_t)((uintptr_t)cmdl + strlen(cmdl) + 1);
+	}
+
+	/* One past the end of the loader supplied memory map. */
+	if (mb_info->flags & MB_INFO_MEM_MAP) {
+		addr = MAX(addr,
+		    (paddr_t)(mb_info->mmap_addr + mb_info->mmap_length));
+	}
+
+	return (addr);
+}
+
+/*
+ * Ask the protocol specific code for the highest address used by the
+ * boot loader supplied information and, when one is reported, hand it
+ * to check_higher().  A result of 0 means nothing was found and is
+ * ignored.  An unrecognized multiboot version is fatal.
+ */
+static void
+dboot_multiboot_highest_addr(void)
+{
+	/* paddr_t is an integer type; 0 (not NULL) marks "no address". */
+	paddr_t addr = 0;
+
+	switch (multiboot_version) {
+	case 1:
+		addr = dboot_multiboot1_highest_addr();
+		break;
+	case 2:
+		addr = dboot_multiboot2_highest_addr(mb2_info);
+		break;
+	default:
+		dboot_panic("Unknown multiboot version: %d\n",
+		    multiboot_version);
+		break;
+	}
+
+	if (addr != 0)
+		check_higher(addr);
+}
+
+/*
+ * Walk the boot loader provided information and find the highest free address.
+ *
+ * The work is delegated to three helpers which are called in a fixed order:
+ * modules first, then the memory map, then the remaining multiboot
+ * information.  The comment above dboot_multiboot1_highest_addr() notes that
+ * it expects the module list to have been processed already.
+ */
+static void
+init_mem_alloc(void)
+{
+ DBG_MSG("Entered init_mem_alloc()\n");
+ /* Process the boot modules handed over by the loader. */
+ dboot_process_modules();
+ /* Process the loader supplied memory map. */
+ dboot_process_mmap();
+ /* Account for the multiboot information itself. */
+ dboot_multiboot_highest_addr();
+}
+
+/*
+ * Record the ACPI RSDP supplied by a Multiboot 2 loader in
+ * bi->bi_acpi_rsdp.  The new style ACPI tag is preferred over the old
+ * one; when neither tag is present the field is cleared.  Nothing is
+ * done for other multiboot versions.
+ */
+static void
+dboot_multiboot_get_fwtables(void)
+{
+	multiboot_tag_new_acpi_t *new_tag;
+	multiboot_tag_old_acpi_t *old_tag;
+	void *rsdp = NULL;
+
+	/* no fw tables from multiboot 1 */
+	if (multiboot_version != 2)
+		return;
+
+	new_tag = (multiboot_tag_new_acpi_t *)
+	    dboot_multiboot2_find_tag(mb2_info, MULTIBOOT_TAG_TYPE_ACPI_NEW);
+	old_tag = (multiboot_tag_old_acpi_t *)
+	    dboot_multiboot2_find_tag(mb2_info, MULTIBOOT_TAG_TYPE_ACPI_OLD);
+
+	/* Pick the RSDP copy embedded in whichever tag we found. */
+	if (new_tag != NULL)
+		rsdp = &new_tag->mb_rsdp[0];
+	else if (old_tag != NULL)
+		rsdp = &old_tag->mb_rsdp[0];
+
+	bi->bi_acpi_rsdp = (native_ptr_t)(uintptr_t)rsdp;
+}
#endif /* !__xpv */
/*
@@ -1438,6 +1847,140 @@ kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\
module$ /platform/i86pc/$ISADIR/boot_archive\n\
See http://illumos.org/msg/SUNOS-8000-AK for details.\n"
+/*
+ * Set up the bi pointer inside the boot_info area and, on bare metal,
+ * run the console information setup matching the detected multiboot
+ * version before looking up the environment module.
+ */
+static void
+dboot_init_xboot_consinfo(void)
+{
+	/*
+	 * boot info must be 16 byte aligned for 64 bit kernel ABI
+	 */
+	uintptr_t aligned = ((uintptr_t)boot_info + 0xf) & ~0xf;
+
+	bi = (struct xboot_info *)aligned;
+
+#if !defined(__xpv)
+	if (multiboot_version == 1) {
+		dboot_multiboot1_xboot_consinfo();
+	} else if (multiboot_version == 2) {
+		dboot_multiboot2_xboot_consinfo();
+	} else {
+		dboot_panic("Unknown multiboot version: %d\n",
+		    multiboot_version);
+	}
+
+	/*
+	 * Lookup environment module for the console. Complete module list
+	 * will be built after console setup.
+	 */
+	dboot_find_env();
+#endif
+}
+
+/*
+ * Set up basic data from the boot loader.
+ *
+ * The magic value handed over by the loader selects the multiboot
+ * protocol version and which of mb_info / mb2_info gets pointed at the
+ * loader's information structure; the other one stays NULL.
+ *
+ * The load_addr is part of the AOUT kludge setup in dboot_grub.s, which
+ * supports the 32-bit dboot code used to set up and start the 64-bit
+ * kernel.  The AOUT kludge allows a 32-bit boot loader, such as grub1,
+ * to load and start the 64-bit illumos kernel.
+ */
+static void
+dboot_loader_init(void)
+{
+#if !defined(__xpv)
+	mb_info = NULL;
+	mb2_info = NULL;
+
+	if (mb_magic == MB_BOOTLOADER_MAGIC) {
+		multiboot_version = 1;
+		mb_info = (multiboot_info_t *)(uintptr_t)mb_addr;
+#if defined(_BOOT_TARGET_amd64)
+		load_addr = mb_header.load_addr;
+#endif
+	} else if (mb_magic == MULTIBOOT2_BOOTLOADER_MAGIC) {
+		multiboot_version = 2;
+		mb2_info = (multiboot2_info_header_t *)(uintptr_t)mb_addr;
+		mb2_mmap_tagp = dboot_multiboot2_get_mmap_tagp(mb2_info);
+#if defined(_BOOT_TARGET_amd64)
+		load_addr = mb2_load_addr;
+#endif
+	} else {
+		dboot_panic("Unknown bootloader magic: 0x%x\n", mb_magic);
+	}
+#endif /* !defined(__xpv) */
+}
+
+/*
+ * Extract the kernel command line from [multi]boot information.
+ *
+ * Never returns NULL: when the loader supplied no command line an empty
+ * string is returned instead, so callers may use string functions on the
+ * result without checking it.
+ */
+static char *
+dboot_loader_cmdline(void)
+{
+	char *cmdline = NULL;
+
+#if defined(__xpv)
+	cmdline = (char *)xen_info->cmd_line;
+#else /* __xpv */
+	if (multiboot_version == 1) {
+		/* The cmdline field is only valid when flagged as such. */
+		if (mb_info->flags & MB_INFO_CMDLINE)
+			cmdline = (char *)mb_info->cmdline;
+	} else if (multiboot_version == 2) {
+		cmdline = dboot_multiboot2_cmdline(mb2_info);
+	} else {
+		dboot_panic("Unknown multiboot version: %d\n",
+		    multiboot_version);
+	}
+#endif /* __xpv */
+
+	/*
+	 * Make sure we have valid pointer so the string operations
+	 * will not crash us.
+	 */
+	return (cmdline != NULL ? cmdline : "");
+}
+
+/*
+ * Return the boot loader's name string as reported through the multiboot
+ * information, or NULL when no name is available (always the case for
+ * the __xpv build).  An unrecognized multiboot version is fatal.
+ */
+static char *
+dboot_loader_name(void)
+{
+#if defined(__xpv)
+	return (NULL);
+#else /* __xpv */
+	multiboot_tag_string_t *tag;
+
+	switch (multiboot_version) {
+	case 1:
+		/*
+		 * NOTE(review): the field is used without consulting
+		 * mb_info->flags; confirm whether a "loader name valid"
+		 * flag should gate this the way MB_INFO_CMDLINE gates the
+		 * command line.
+		 */
+		return ((char *)mb_info->boot_loader_name);
+
+	case 2:
+		tag = dboot_multiboot2_find_tag(mb2_info,
+		    MULTIBOOT_TAG_TYPE_BOOT_LOADER_NAME);
+		/* The loader name tag is optional; do not dereference NULL. */
+		if (tag == NULL)
+			return (NULL);
+		return (tag->mb_string);
+	default:
+		dboot_panic("Unknown multiboot version: %d\n",
+		    multiboot_version);
+		break;
+	}
+
+	return (NULL);
+#endif /* __xpv */
+}
/*
* startup_kernel has a pretty simple job. It builds pagetables which reflect
* 1:1 mappings for all memory in use. It then also adds mappings for
@@ -1450,22 +1993,18 @@ void
startup_kernel(void)
{
char *cmdline;
- uintptr_t addr;
+ char *bootloader;
#if defined(__xpv)
physdev_set_iopl_t set_iopl;
#endif /* __xpv */
+ dboot_loader_init();
/*
* At this point we are executing in a 32 bit real mode.
*/
-#if defined(__xpv)
- cmdline = (char *)xen_info->cmd_line;
-#else /* __xpv */
- cmdline = (char *)mb_info->cmdline;
-#endif /* __xpv */
- prom_debug = (strstr(cmdline, "prom_debug") != NULL);
- map_debug = (strstr(cmdline, "map_debug") != NULL);
+ bootloader = dboot_loader_name();
+ cmdline = dboot_loader_cmdline();
#if defined(__xpv)
/*
@@ -1478,23 +2017,40 @@ startup_kernel(void)
}
#endif /* __xpv */
- bcons_init(cmdline);
- DBG_MSG("\n\nSolaris prekernel set: ");
+ dboot_init_xboot_consinfo();
+ bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;
+ bcons_init(bi);
+
+ prom_debug = (find_boot_prop("prom_debug") != NULL);
+ map_debug = (find_boot_prop("map_debug") != NULL);
+
+#if !defined(__xpv)
+ dboot_multiboot_get_fwtables();
+#endif
+ DBG_MSG("\n\nillumos prekernel set: ");
DBG_MSG(cmdline);
DBG_MSG("\n");
+ if (bootloader != NULL && prom_debug) {
+ dboot_printf("Kernel loaded by: %s\n", bootloader);
+#if !defined(__xpv)
+ dboot_printf("Using multiboot %d boot protocol.\n",
+ multiboot_version);
+#endif
+ }
+
if (strstr(cmdline, "multiboot") != NULL) {
dboot_panic(NO_MULTIBOOT);
}
- /*
- * boot info must be 16 byte aligned for 64 bit kernel ABI
- */
- addr = (uintptr_t)boot_info;
- addr = (addr + 0xf) & ~0xf;
- bi = (struct xboot_info *)addr;
DBG((uintptr_t)bi);
- bi->bi_cmdline = (native_ptr_t)(uintptr_t)cmdline;
+#if !defined(__xpv)
+ DBG((uintptr_t)mb_info);
+ DBG((uintptr_t)mb2_info);
+ if (mb2_info != NULL)
+ DBG(mb2_info->mbi_total_size);
+ DBG(bi->bi_acpi_rsdp);
+#endif
/*
* Need correct target_kernel_text value
@@ -1709,7 +2265,8 @@ startup_kernel(void)
ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG);
if (ktext_phys == 0)
dboot_panic("failed to allocate aligned kernel memory");
- if (dboot_elfload64(mb_header.load_addr) != 0)
+ DBG(load_addr);
+ if (dboot_elfload64(load_addr) != 0)
dboot_panic("failed to parse kernel ELF image, rebooting");
#endif
@@ -1757,7 +2314,20 @@ startup_kernel(void)
DBG(bi->bi_next_paddr);
bi->bi_next_vaddr = (uintptr_t)next_avail_addr;
DBG(bi->bi_next_vaddr);
- bi->bi_mb_info = (uintptr_t)mb_info;
+ bi->bi_mb_version = multiboot_version;
+
+ switch (multiboot_version) {
+ case 1:
+ bi->bi_mb_info = (uintptr_t)mb_info;
+ break;
+ case 2:
+ bi->bi_mb_info = (uintptr_t)mb2_info;
+ break;
+ default:
+ dboot_panic("Unknown multiboot version: %d\n",
+ multiboot_version);
+ break;
+ }
bi->bi_top_page_table = (uintptr_t)top_page_table;
#endif /* __xpv */
diff --git a/usr/src/uts/i86pc/os/ddi_impl.c b/usr/src/uts/i86pc/os/ddi_impl.c
index 84b41cfdad..fa07371303 100644
--- a/usr/src/uts/i86pc/os/ddi_impl.c
+++ b/usr/src/uts/i86pc/os/ddi_impl.c
@@ -1903,6 +1903,9 @@ get_boot_properties(void)
copy_boot_str(bop_staging_area, property_val, 50);
(void) ndi_prop_update_string(DDI_DEV_T_NONE, devi,
property_name, property_val);
+ } else if (strcmp(name, "acpi-root-tab") == 0) {
+ (void) ndi_prop_update_int64(DDI_DEV_T_NONE, devi,
+ property_name, *((int64_t *)bop_staging_area));
} else if (strcmp(name, "stdout") == 0) {
(void) ndi_prop_update_int(DDI_DEV_T_NONE, devi,
property_name, *((int *)bop_staging_area));
diff --git a/usr/src/uts/i86pc/os/fakebop.c b/usr/src/uts/i86pc/os/fakebop.c
index 2a1c65d4b6..8616ef9f40 100644
--- a/usr/src/uts/i86pc/os/fakebop.c
+++ b/usr/src/uts/i86pc/os/fakebop.c
@@ -40,6 +40,8 @@
#include <sys/bootsvcs.h>
#include <sys/bootinfo.h>
#include <sys/multiboot.h>
+#include <sys/multiboot2.h>
+#include <sys/multiboot2_impl.h>
#include <sys/bootvfs.h>
#include <sys/bootprops.h>
#include <sys/varargs.h>
@@ -132,7 +134,7 @@ shared_info_t *HYPERVISOR_shared_info;
static ulong_t total_bop_alloc_scratch = 0;
static ulong_t total_bop_alloc_kernel = 0;
-static void build_firmware_properties(void);
+static void build_firmware_properties(struct xboot_info *);
static int early_allocation = 1;
@@ -791,7 +793,7 @@ done:
bcons_init2(inputdev, outputdev, consoledev);
}
- if (strstr((char *)xbootp->bi_cmdline, "prom_debug") || kbm_debug)
+ if (find_boot_prop("prom_debug") || kbm_debug)
boot_prop_display(line);
}
@@ -1130,7 +1132,8 @@ build_panic_cmdline(const char *cmd, int cmdlen)
#ifndef __xpv
/*
- * Construct boot command line for Fast Reboot
+ * Construct boot command line for Fast Reboot. The saved_cmdline
+ * is also reported by "eeprom bootcmd".
*/
static void
build_fastboot_cmdline(struct xboot_info *xbp)
@@ -1199,6 +1202,125 @@ save_boot_info(struct xboot_info *xbi)
}
#endif /* __xpv */
+/*
+ * Import boot environment module variables as properties, applying
+ * blacklist filter for variables we know we will not use.
+ *
+ * Since the environment can be relatively large, containing many variables
+ * used only for boot loader purposes, we will use a blacklist based filter.
+ * To keep the blacklist from growing too large, we use prefix based filtering.
+ * This is possible because in many cases, the loader variable names are
+ * using a structured layout.
+ *
+ * We will not overwrite already set properties.
+ */
+static struct bop_blacklist {
+	const char *bl_name;	/* variable name prefix to filter out */
+	int bl_name_len;	/* sizeof (prefix), i.e. including the NUL */
+} bop_prop_blacklist[] = {
+	{ "ISADIR", sizeof ("ISADIR") },
+	{ "acpi", sizeof ("acpi") },
+	{ "autoboot_delay", sizeof ("autoboot_delay") },
+	{ "beansi_", sizeof ("beansi_") },
+	{ "beastie", sizeof ("beastie") },
+	{ "bemenu", sizeof ("bemenu") },
+	{ "boot.", sizeof ("boot.") },
+	{ "bootenv", sizeof ("bootenv") },
+	{ "currdev", sizeof ("currdev") },
+	{ "dhcp.", sizeof ("dhcp.") },
+	{ "interpret", sizeof ("interpret") },
+	{ "kernel", sizeof ("kernel") },
+	{ "loaddev", sizeof ("loaddev") },
+	{ "loader_", sizeof ("loader_") },
+	{ "module_path", sizeof ("module_path") },
+	{ "nfs.", sizeof ("nfs.") },
+	{ "pcibios", sizeof ("pcibios") },
+	{ "prompt", sizeof ("prompt") },
+	{ "smbios", sizeof ("smbios") },
+	{ "tem", sizeof ("tem") },
+	{ "twiddle_divisor", sizeof ("twiddle_divisor") },
+	{ "zfs_be", sizeof ("zfs_be") },
+};
+
+/*
+ * Check whether the given name starts with one of the prefixes listed in
+ * bop_prop_blacklist.  Returns B_TRUE on the first matching prefix and
+ * B_FALSE when no entry matches.
+ */
+static boolean_t
+name_is_blacklisted(const char *name)
+{
+	const struct bop_blacklist *entry = bop_prop_blacklist;
+	const struct bop_blacklist *last = entry +
+	    sizeof (bop_prop_blacklist) / sizeof (bop_prop_blacklist[0]);
+
+	for (; entry < last; entry++) {
+		/*
+		 * bl_name_len was taken with sizeof, so it counts the
+		 * terminating NUL; drop it to compare the prefix only.
+		 */
+		if (strncmp(entry->bl_name, name,
+		    entry->bl_name_len - 1) == 0)
+			return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Import the variables of the boot environment module as boot properties.
+ *
+ * The module is parsed as a sequence of NUL terminated "name=value"
+ * strings; an empty string (or the end of the module) ends the list.
+ * A variable is skipped when its name is empty or does not fit into the
+ * shared name buffer, when its value is empty, when a property of that
+ * name already exists, or when the name matches the blacklist.  Parsing
+ * stops at the first entry which is not fully contained in the module
+ * (missing '=' or missing terminating NUL).
+ *
+ * Nothing is done when benv is NULL or is not a BMT_ENV module.
+ */
+static void
+process_boot_environment(struct boot_modules *benv)
+{
+	char *env, *end, *ptr, *name, *value;
+	uint32_t name_len;
+
+	if (benv == NULL || benv->bm_type != BMT_ENV)
+		return;
+
+	env = benv->bm_addr;
+	end = env + benv->bm_size;
+	ptr = env;
+
+	/* Every dereference of ptr below is guarded by ptr < end. */
+	while (ptr < end && *ptr != '\0') {
+		/* find '=' */
+		name = ptr;
+		while (ptr < end && *ptr != '=')
+			ptr++;
+		if (ptr >= end)		/* Something is very wrong. */
+			return;
+		name_len = ptr - name;
+
+		/* find the NUL terminating the value */
+		value = ++ptr;
+		while (ptr < end && *ptr != '\0')
+			ptr++;
+
+		/* Did we reach the end of the module? */
+		if (ptr >= end)
+			return;
+
+		/*
+		 * Step over the terminator now, so that every "continue"
+		 * below resumes at the start of the next entry.
+		 */
+		ptr++;
+
+		/* Skip entries we can not represent or have no use for. */
+		if (name_len == 0 || sizeof (buffer) <= name_len)
+			continue;
+		if (*value == '\0')
+			continue;
+
+		/* Copy the name only; never read beyond the '='. */
+		(void) strncpy(buffer, name, name_len);
+		buffer[name_len] = '\0';
+		name = buffer;
+
+		/* Is this property already set? */
+		if (do_bsys_getproplen(NULL, name) >= 0)
+			continue;
+
+		if (name_is_blacklisted(name) == B_TRUE)
+			continue;
+
+		/* Create new property. */
+		bsetprops(name, value);
+	}
+}
/*
* 1st pass at building the table of boot properties. This includes:
@@ -1218,7 +1340,7 @@ build_boot_properties(struct xboot_info *xbp)
int name_len;
char *value;
int value_len;
- struct boot_modules *bm, *rdbm;
+ struct boot_modules *bm, *rdbm, *benv = NULL;
char *propbuf;
int quoted = 0;
int boot_arg_len;
@@ -1228,9 +1350,6 @@ build_boot_properties(struct xboot_info *xbp)
static int stdout_val = 0;
uchar_t boot_device;
char str[3];
- multiboot_info_t *mbi;
- int netboot;
- struct sol_netinfo *sip;
#endif
/*
@@ -1250,6 +1369,13 @@ build_boot_properties(struct xboot_info *xbp)
if (bm[i].bm_type == BMT_HASH || bm[i].bm_name == NULL)
continue;
+ if (bm[i].bm_type == BMT_ENV) {
+ if (benv == NULL)
+ benv = &bm[i];
+ else
+ continue;
+ }
+
(void) snprintf(modid, sizeof (modid),
"module-name-%u", midx);
bsetprops(modid, (char *)bm[i].bm_name);
@@ -1277,6 +1403,19 @@ build_boot_properties(struct xboot_info *xbp)
fastreboot_disable(FBNS_BOOTMOD);
}
+#ifndef __xpv
+ /*
+ * Disable fast reboot if we're using the Multiboot 2 boot protocol,
+ * since we don't currently support MB2 info and module relocation.
+ * Note that fast reboot will have already been disabled if multiple
+ * modules are present, since the current implementation assumes that
+ * we only have a single module, the boot_archive.
+ */
+ if (xbp->bi_mb_version != 1) {
+ fastreboot_disable(FBNS_MULTIBOOT2);
+ }
+#endif
+
DBG_MSG("Parsing command line for boot properties\n");
value = xbp->bi_cmdline;
@@ -1470,48 +1609,83 @@ build_boot_properties(struct xboot_info *xbp)
bsetprops("boot-args", boot_args);
bsetprops("bootargs", boot_args);
-#ifndef __xpv
- /*
- * set the BIOS boot device from GRUB
- */
- netboot = 0;
- mbi = xbp->bi_mb_info;
+ process_boot_environment(benv);
+#ifndef __xpv
/*
* Build boot command line for Fast Reboot
*/
build_fastboot_cmdline(xbp);
- /*
- * Save various boot information for Fast Reboot
- */
- save_boot_info(xbp);
-
- if (mbi != NULL && mbi->flags & MB_INFO_BOOTDEV) {
- boot_device = mbi->boot_device >> 24;
- if (boot_device == 0x20)
- netboot++;
- str[0] = (boot_device >> 4) + '0';
- str[1] = (boot_device & 0xf) + '0';
- str[2] = 0;
- bsetprops("bios-boot-device", str);
- } else {
- netboot = 1;
- }
+ if (xbp->bi_mb_version == 1) {
+ multiboot_info_t *mbi = xbp->bi_mb_info;
+ int netboot;
+ struct sol_netinfo *sip;
- /*
- * In the netboot case, drives_info is overloaded with the dhcp ack.
- * This is not multiboot compliant and requires special pxegrub!
- */
- if (netboot && mbi->drives_length != 0) {
- sip = (struct sol_netinfo *)(uintptr_t)mbi->drives_addr;
- if (sip->sn_infotype == SN_TYPE_BOOTP)
+ /*
+ * set the BIOS boot device from GRUB
+ */
+ netboot = 0;
+
+ /*
+ * Save various boot information for Fast Reboot
+ */
+ save_boot_info(xbp);
+
+ if (mbi != NULL && mbi->flags & MB_INFO_BOOTDEV) {
+ boot_device = mbi->boot_device >> 24;
+ if (boot_device == 0x20)
+ netboot++;
+ str[0] = (boot_device >> 4) + '0';
+ str[1] = (boot_device & 0xf) + '0';
+ str[2] = 0;
+ bsetprops("bios-boot-device", str);
+ } else {
+ netboot = 1;
+ }
+
+ /*
+ * In the netboot case, drives_info is overloaded with the
+ * dhcp ack. This is not multiboot compliant and requires
+ * special pxegrub!
+ */
+ if (netboot && mbi->drives_length != 0) {
+ sip = (struct sol_netinfo *)(uintptr_t)mbi->drives_addr;
+ if (sip->sn_infotype == SN_TYPE_BOOTP)
+ bsetprop("bootp-response",
+ sizeof ("bootp-response"),
+ (void *)(uintptr_t)mbi->drives_addr,
+ mbi->drives_length);
+ else if (sip->sn_infotype == SN_TYPE_RARP)
+ setup_rarp_props(sip);
+ }
+ } else {
+ multiboot2_info_header_t *mbi = xbp->bi_mb_info;
+ multiboot_tag_bootdev_t *bootdev = NULL;
+ multiboot_tag_network_t *netdev = NULL;
+
+ if (mbi != NULL) {
+ bootdev = dboot_multiboot2_find_tag(mbi,
+ MULTIBOOT_TAG_TYPE_BOOTDEV);
+ netdev = dboot_multiboot2_find_tag(mbi,
+ MULTIBOOT_TAG_TYPE_NETWORK);
+ }
+ if (bootdev != NULL) {
+ DBG(bootdev->mb_biosdev);
+ boot_device = bootdev->mb_biosdev;
+ str[0] = (boot_device >> 4) + '0';
+ str[1] = (boot_device & 0xf) + '0';
+ str[2] = 0;
+ bsetprops("bios-boot-device", str);
+ }
+ if (netdev != NULL) {
bsetprop("bootp-response", sizeof ("bootp-response"),
- (void *)(uintptr_t)mbi->drives_addr,
- mbi->drives_length);
- else if (sip->sn_infotype == SN_TYPE_RARP)
- setup_rarp_props(sip);
+ (void *)(uintptr_t)netdev->mb_dhcpack,
+ netdev->mb_size -
+ sizeof (multiboot_tag_network_t));
+ }
}
+
bsetprop("stdout", strlen("stdout"),
&stdout_val, sizeof (stdout_val));
#endif /* __xpv */
@@ -1530,7 +1704,7 @@ build_boot_properties(struct xboot_info *xbp)
/*
* Build firmware-provided system properties
*/
- build_firmware_properties();
+ build_firmware_properties(xbp);
/*
* XXPV
@@ -1812,13 +1986,13 @@ _start(struct xboot_info *xbp)
}
#endif
- bcons_init((void *)xbp->bi_cmdline);
+ bcons_init(xbp);
have_console = 1;
/*
* enable debugging
*/
- if (strstr((char *)xbp->bi_cmdline, "kbm_debug"))
+ if (find_boot_prop("kbm_debug") != NULL)
kbm_debug = 1;
DBG_MSG("\n\n*** Entered Solaris in _start() cmdline is: ");
@@ -1897,7 +2071,7 @@ _start(struct xboot_info *xbp)
DBG_MSG("Initializing boot properties:\n");
build_boot_properties(xbp);
- if (strstr((char *)xbp->bi_cmdline, "prom_debug") || kbm_debug) {
+ if (find_boot_prop("prom_debug") || kbm_debug) {
char *value;
value = do_bsys_alloc(NULL, NULL, MMU_PAGESIZE, MMU_PAGESIZE);
@@ -2019,9 +2193,26 @@ static ACPI_TABLE_RSDP *
find_rsdp()
{
ACPI_TABLE_RSDP *rsdp;
+ uint64_t rsdp_val = 0;
uint16_t *ebda_seg;
paddr_t ebda_addr;
+ /* check for "acpi-root-tab" property */
+ if (do_bsys_getproplen(NULL, "acpi-root-tab") == sizeof (uint64_t)) {
+ (void) do_bsys_getprop(NULL, "acpi-root-tab", &rsdp_val);
+ if (rsdp_val != 0) {
+ rsdp = scan_rsdp(rsdp_val, rsdp_val + sizeof (*rsdp));
+ if (rsdp != NULL) {
+ if (kbm_debug) {
+ bop_printf(NULL,
+ "Using RSDP from bootloader: "
+ "0x%p\n", (void *)rsdp);
+ }
+ return (rsdp);
+ }
+ }
+ }
+
/*
* Get the EBDA segment and scan the first 1K
*/
@@ -2536,12 +2727,18 @@ enumerate_xen_cpus()
}
#endif /* __xpv */
+/*ARGSUSED*/
static void
-build_firmware_properties(void)
+build_firmware_properties(struct xboot_info *xbp)
{
ACPI_TABLE_HEADER *tp = NULL;
#ifndef __xpv
+ if (xbp->bi_acpi_rsdp) {
+ bsetprop64("acpi-root-tab",
+ (uint64_t)(uintptr_t)xbp->bi_acpi_rsdp);
+ }
+
if ((tp = find_fw_table(ACPI_SIG_MSCT)) != NULL)
msct_ptr = process_msct((ACPI_TABLE_MSCT *)tp);
else
diff --git a/usr/src/uts/i86pc/sys/boot_console.h b/usr/src/uts/i86pc/sys/boot_console.h
index b2fcf98f97..187733615c 100644
--- a/usr/src/uts/i86pc/sys/boot_console.h
+++ b/usr/src/uts/i86pc/sys/boot_console.h
@@ -36,6 +36,8 @@
extern "C" {
#endif
+#include <sys/bootinfo.h>
+
#define CONS_INVALID -1
#define CONS_SCREEN_TEXT 0
#define CONS_TTY 1
@@ -53,9 +55,12 @@ extern void kb_init(void);
extern int kb_getchar(void);
extern int kb_ischar(void);
+/* Read property from command line or environment. */
+extern const char *find_boot_prop(const char *);
+
extern int boot_console_type(int *);
-extern void bcons_init(char *);
+extern void bcons_init(struct xboot_info *);
extern void bcons_putchar(int);
extern int bcons_getchar(void);
extern int bcons_ischar(void);
diff --git a/usr/src/uts/i86pc/sys/fastboot_msg.h b/usr/src/uts/i86pc/sys/fastboot_msg.h
index 9a1c9bd878..5643d65b29 100644
--- a/usr/src/uts/i86pc/sys/fastboot_msg.h
+++ b/usr/src/uts/i86pc/sys/fastboot_msg.h
@@ -42,17 +42,20 @@
#define fastboot_nosup_msg_end(id)
#endif /* fastboot_nosup_msg_end */
+/* BEGIN CSTYLED */
fastboot_nosup_msg(FBNS_DEFAULT, "")
fastboot_nosup_msg(FBNS_SUSPEND, " after suspend/resume")
fastboot_nosup_msg(FBNS_FMAHWERR, " due to FMA recovery from hardware error")
fastboot_nosup_msg(FBNS_HOTPLUG, " after DR operations")
fastboot_nosup_msg(FBNS_BOOTMOD, " due to presence of boot-time modules")
+fastboot_nosup_msg(FBNS_MULTIBOOT2, " due to multiboot2 boot protocol")
/*
* Should ALWAYS be the last one.
* No fastboot_nosup_msg() after that line.
*/
fastboot_nosup_msg_end(FBNS_END)
+/* END CSTYLED */
#undef fastboot_nosup_msg
#undef fastboot_nosup_msg_end
diff --git a/usr/src/uts/i86xpv/Makefile.files b/usr/src/uts/i86xpv/Makefile.files
index 1da8a3813c..8fdda3652d 100644
--- a/usr/src/uts/i86xpv/Makefile.files
+++ b/usr/src/uts/i86xpv/Makefile.files
@@ -119,6 +119,7 @@ BOOT_DRIVER_OBJS = \
boot_mmu.o \
boot_vga.o \
boot_xconsole.o \
+ dboot_multiboot2.o \
$(FONT_OBJS)
CORE_OBJS += $(BOOT_DRIVER_OBJS)
diff --git a/usr/src/uts/intel/io/acpica/osl.c b/usr/src/uts/intel/io/acpica/osl.c
index 5bc1b855fd..5c32604088 100644
--- a/usr/src/uts/intel/io/acpica/osl.c
+++ b/usr/src/uts/intel/io/acpica/osl.c
@@ -229,7 +229,7 @@ AcpiOsGetRootPointer()
* The boot code process the table and put the physical address
* in the acpi-root-tab property.
*/
- Address = ddi_prop_get_int(DDI_DEV_T_ANY, ddi_root_node(),
+ Address = ddi_prop_get_int64(DDI_DEV_T_ANY, ddi_root_node(),
DDI_PROP_DONTPASS, "acpi-root-tab", NULL);
if ((Address == NULL) && ACPI_FAILURE(AcpiFindRootPointer(&Address)))
@@ -241,7 +241,7 @@ AcpiOsGetRootPointer()
/*ARGSUSED*/
ACPI_STATUS
AcpiOsPredefinedOverride(const ACPI_PREDEFINED_NAMES *InitVal,
- ACPI_STRING *NewVal)
+ ACPI_STRING *NewVal)
{
*NewVal = 0;
@@ -260,7 +260,7 @@ acpica_strncpy(char *dest, const char *src, int len)
ACPI_STATUS
AcpiOsTableOverride(ACPI_TABLE_HEADER *ExistingTable,
- ACPI_TABLE_HEADER **NewTable)
+ ACPI_TABLE_HEADER **NewTable)
{
char signature[5];
char oemid[7];
@@ -418,7 +418,7 @@ acpi_sema_v(acpi_sema_t *sp, unsigned count)
ACPI_STATUS
AcpiOsCreateSemaphore(UINT32 MaxUnits, UINT32 InitialUnits,
-ACPI_HANDLE *OutHandle)
+ ACPI_HANDLE *OutHandle)
{
acpi_sema_t *sp;
@@ -622,7 +622,7 @@ AcpiOsUnmapMemory(void *LogicalAddress, ACPI_SIZE Size)
/*ARGSUSED*/
ACPI_STATUS
AcpiOsGetPhysicalAddress(void *LogicalAddress,
- ACPI_PHYSICAL_ADDRESS *PhysicalAddress)
+ ACPI_PHYSICAL_ADDRESS *PhysicalAddress)
{
/* UNIMPLEMENTED: not invoked by ACPI CA code */
@@ -653,8 +653,8 @@ static int acpi_intr_hooked = 0;
ACPI_STATUS
AcpiOsInstallInterruptHandler(UINT32 InterruptNumber,
- ACPI_OSD_HANDLER ServiceRoutine,
- void *Context)
+ ACPI_OSD_HANDLER ServiceRoutine,
+ void *Context)
{
_NOTE(ARGUNUSED(InterruptNumber))
@@ -687,7 +687,7 @@ AcpiOsInstallInterruptHandler(UINT32 InterruptNumber,
ACPI_STATUS
AcpiOsRemoveInterruptHandler(UINT32 InterruptNumber,
- ACPI_OSD_HANDLER ServiceRoutine)
+ ACPI_OSD_HANDLER ServiceRoutine)
{
_NOTE(ARGUNUSED(ServiceRoutine))
@@ -931,7 +931,7 @@ osl_rw_memory(ACPI_PHYSICAL_ADDRESS Address, UINT64 *Value,
ACPI_STATUS
AcpiOsReadMemory(ACPI_PHYSICAL_ADDRESS Address,
- UINT64 *Value, UINT32 Width)
+ UINT64 *Value, UINT32 Width)
{
osl_rw_memory(Address, Value, Width, 0);
return (AE_OK);
@@ -939,7 +939,7 @@ AcpiOsReadMemory(ACPI_PHYSICAL_ADDRESS Address,
ACPI_STATUS
AcpiOsWriteMemory(ACPI_PHYSICAL_ADDRESS Address,
- UINT64 Value, UINT32 Width)
+ UINT64 Value, UINT32 Width)
{
osl_rw_memory(Address, &Value, Width, 1);
return (AE_OK);
@@ -948,7 +948,7 @@ AcpiOsWriteMemory(ACPI_PHYSICAL_ADDRESS Address,
ACPI_STATUS
AcpiOsReadPciConfiguration(ACPI_PCI_ID *PciId, UINT32 Reg,
- UINT64 *Value, UINT32 Width)
+ UINT64 *Value, UINT32 Width)
{
switch (Width) {
@@ -980,7 +980,7 @@ int acpica_write_pci_config_ok = 1;
ACPI_STATUS
AcpiOsWritePciConfiguration(ACPI_PCI_ID *PciId, UINT32 Reg,
- UINT64 Value, UINT32 Width)
+ UINT64 Value, UINT32 Width)
{
if (!acpica_write_pci_config_ok) {
@@ -1034,7 +1034,7 @@ AcpiOsWritePciConfiguration(ACPI_PCI_ID *PciId, UINT32 Reg,
*/
void
AcpiOsDerivePciId(ACPI_HANDLE rhandle, ACPI_HANDLE chandle,
- ACPI_PCI_ID **PciId)
+ ACPI_PCI_ID **PciId)
{
ACPI_HANDLE handle;
dev_info_t *dip;
diff --git a/usr/src/uts/intel/sys/bootinfo.h b/usr/src/uts/intel/sys/bootinfo.h
index 3adce64fc4..fa60e6ac41 100644
--- a/usr/src/uts/intel/sys/bootinfo.h
+++ b/usr/src/uts/intel/sys/bootinfo.h
@@ -61,7 +61,8 @@ typedef void *native_ptr_t;
typedef enum boot_module_type {
BMT_ROOTFS,
BMT_FILE,
- BMT_HASH
+ BMT_HASH,
+ BMT_ENV
} boot_module_type_t;
struct boot_memlist {
@@ -107,7 +108,9 @@ struct xboot_info {
native_ptr_t bi_xen_start_info;
native_ptr_t bi_shared_info; /* VA for shared_info */
#else
- native_ptr_t bi_mb_info;
+ native_ptr_t bi_mb_info; /* multiboot 1 or 2 info */
+ int bi_mb_version; /* multiboot version */
+ native_ptr_t bi_acpi_rsdp;
#endif
};
#pragma pack()