summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--usr/src/cmd/zdb/zdb.c12
-rw-r--r--usr/src/common/mpi/mpi-priv.h6
-rw-r--r--usr/src/common/mpi/mpmontg.c6
-rw-r--r--usr/src/lib/libc/port/print/doprnt.c2
-rw-r--r--usr/src/man/man1m/zpool.1m4
-rw-r--r--usr/src/test/zfs-tests/include/libtest.shlib19
-rw-r--r--usr/src/test/zfs-tests/tests/functional/slog/slog.kshlib8
-rw-r--r--usr/src/test/zfs-tests/tests/functional/slog/slog_013_pos.ksh17
-rw-r--r--usr/src/test/zfs-tests/tests/functional/slog/slog_014_pos.ksh15
-rw-r--r--usr/src/uts/common/fs/zfs/dmu.c27
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_object.c14
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_send.c61
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_tx.c7
-rw-r--r--usr/src/uts/common/fs/zfs/space_map.c19
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dmu.h11
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dmu_send.h1
-rw-r--r--usr/src/uts/common/fs/zfs/sys/txg_impl.h3
-rw-r--r--usr/src/uts/common/fs/zfs/txg.c37
-rw-r--r--usr/src/uts/common/fs/zfs/zvol.c26
-rw-r--r--usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c45
-rw-r--r--usr/src/uts/common/io/comstar/lu/stmf_sbd/stmf_sbd.h2
-rw-r--r--usr/src/uts/i86pc/io/pcplusmp/apic_common.c116
-rw-r--r--usr/src/uts/i86pc/io/pcplusmp/apic_timer.c25
-rw-r--r--usr/src/uts/i86pc/sys/apic.h3
-rw-r--r--usr/src/uts/i86pc/sys/apic_common.h3
25 files changed, 340 insertions, 149 deletions
diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c
index bdf197ae23..7ccd124dbe 100644
--- a/usr/src/cmd/zdb/zdb.c
+++ b/usr/src/cmd/zdb/zdb.c
@@ -4909,7 +4909,7 @@ zdb_embedded_block(char *thing)
{
blkptr_t bp;
unsigned long long *words = (void *)&bp;
- char buf[SPA_MAXBLOCKSIZE];
+ char *buf;
int err;
bzero(&bp, sizeof (bp));
@@ -4920,16 +4920,22 @@ zdb_embedded_block(char *thing)
words + 8, words + 9, words + 10, words + 11,
words + 12, words + 13, words + 14, words + 15);
if (err != 16) {
- (void) printf("invalid input format\n");
+ (void) fprintf(stderr, "invalid input format\n");
exit(1);
}
ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE);
+ buf = malloc(SPA_MAXBLOCKSIZE);
+ if (buf == NULL) {
+ (void) fprintf(stderr, "out of memory\n");
+ exit(1);
+ }
err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp));
if (err != 0) {
- (void) printf("decode failed: %u\n", err);
+ (void) fprintf(stderr, "decode failed: %u\n", err);
exit(1);
}
zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0);
+ free(buf);
}
static boolean_t
diff --git a/usr/src/common/mpi/mpi-priv.h b/usr/src/common/mpi/mpi-priv.h
index fa6af6d661..9af654ca1d 100644
--- a/usr/src/common/mpi/mpi-priv.h
+++ b/usr/src/common/mpi/mpi-priv.h
@@ -46,14 +46,14 @@
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
+ * Copyright 2017 RackTop Systems.
+ *
* Sun elects to use this software under the MPL license.
*/
#ifndef _MPI_PRIV_H
#define _MPI_PRIV_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/* $Id: mpi-priv.h,v 1.20 2005/11/22 07:16:43 relyea%netscape.com Exp $ */
#include "mpi.h"
@@ -300,7 +300,7 @@ mp_err MPI_ASM_DECL s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo,
/* c += a * b * (MP_RADIX ** offset); */
#define s_mp_mul_d_add_offset(a, b, c, off) \
-(s_mpv_mul_d_add_prop(MP_DIGITS(a), MP_USED(a), b, MP_DIGITS(c) + off), MP_OKAY)
+ s_mpv_mul_d_add_prop(MP_DIGITS(a), MP_USED(a), b, MP_DIGITS(c) + off)
typedef struct {
mp_int N; /* modulus N */
diff --git a/usr/src/common/mpi/mpmontg.c b/usr/src/common/mpi/mpmontg.c
index 33aea8b0d6..150bd2d37f 100644
--- a/usr/src/common/mpi/mpmontg.c
+++ b/usr/src/common/mpi/mpmontg.c
@@ -40,11 +40,11 @@
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
+ * Copyright 2017 RackTop Systems.
+ *
* Sun elects to use this software under the MPL license.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/* $Id: mpmontg.c,v 1.20 2006/08/29 02:41:38 nelson%bolyard.com Exp $ */
/* This file implements moduluar exponentiation using Montgomery's
@@ -104,7 +104,7 @@ mp_err s_mp_redc(mp_int *T, mp_mont_modulus *mmm)
for (i = 0; i < MP_USED(&mmm->N); ++i ) {
mp_digit m_i = MP_DIGIT(T, i) * mmm->n0prime;
/* T += N * m_i * (MP_RADIX ** i); */
- MP_CHECKOK( s_mp_mul_d_add_offset(&mmm->N, m_i, T, i) );
+ s_mp_mul_d_add_offset(&mmm->N, m_i, T, i);
}
s_mp_clamp(T);
diff --git a/usr/src/lib/libc/port/print/doprnt.c b/usr/src/lib/libc/port/print/doprnt.c
index b6e8ceef0a..e8cfaad6de 100644
--- a/usr/src/lib/libc/port/print/doprnt.c
+++ b/usr/src/lib/libc/port/print/doprnt.c
@@ -1574,7 +1574,7 @@ _ndoprnt(const char *format, va_list in_args, FILE *iop, int prflag)
p = insert_thousands_sep(buf, p);
/* Put in a decimal point if needed */
- if (prec != 0 || (flagword & FSHARP))
+ if (prec > 0 || (flagword & FSHARP))
p = insert_decimal_point(p);
/* Digits (if any) after the decimal point */
diff --git a/usr/src/man/man1m/zpool.1m b/usr/src/man/man1m/zpool.1m
index 728f73de9d..1e7b2b8cfc 100644
--- a/usr/src/man/man1m/zpool.1m
+++ b/usr/src/man/man1m/zpool.1m
@@ -26,7 +26,7 @@
.\" Copyright (c) 2017 Datto Inc.
.\" Copyright (c) 2017 George Melikov. All Rights Reserved.
.\"
-.Dd December 6, 2017
+.Dd April 27, 2018
.Dt ZPOOL 1M
.Os
.Sh NAME
@@ -1450,7 +1450,7 @@ See the
.Sx Properties
section for a list of valid properties.
The default list is
-.Cm name , size , allocated , free , expandsize , fragmentation , capacity ,
+.Cm name , size , allocated , free , checkpoint , expandsize , fragmentation , capacity ,
.Cm dedupratio , health , altroot .
.It Fl p
Display numbers in parsable
diff --git a/usr/src/test/zfs-tests/include/libtest.shlib b/usr/src/test/zfs-tests/include/libtest.shlib
index dc4c4e253b..512bb069f9 100644
--- a/usr/src/test/zfs-tests/include/libtest.shlib
+++ b/usr/src/test/zfs-tests/include/libtest.shlib
@@ -1675,6 +1675,25 @@ function is_pool_removed #pool
return $?
}
+function wait_for_degraded
+{
+ typeset pool=$1
+ typeset timeout=${2:-30}
+ typeset t0=$SECONDS
+
+ while :; do
+ [[ $(get_pool_prop health $pool) == "DEGRADED" ]] && break
+ log_note "$pool is not yet degraded."
+ sleep 1
+ if ((SECONDS - t0 > $timeout)); then
+ log_note "$pool not degraded after $timeout seconds."
+ return 1
+ fi
+ done
+
+ return 0
+}
+
#
# Use create_pool()/destroy_pool() to clean up the infomation in
# in the given disk to avoid slice overlapping.
diff --git a/usr/src/test/zfs-tests/tests/functional/slog/slog.kshlib b/usr/src/test/zfs-tests/tests/functional/slog/slog.kshlib
index a6d82f28d8..493ceda60d 100644
--- a/usr/src/test/zfs-tests/tests/functional/slog/slog.kshlib
+++ b/usr/src/test/zfs-tests/tests/functional/slog/slog.kshlib
@@ -33,12 +33,8 @@
function cleanup
{
- if datasetexists $TESTPOOL ; then
- log_must zpool destroy -f $TESTPOOL
- fi
- if datasetexists $TESTPOOL2 ; then
- log_must zpool destroy -f $TESTPOOL2
- fi
+ poolexists $TESTPOOL && destroy_pool $TESTPOOL
+ poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2
}
#
diff --git a/usr/src/test/zfs-tests/tests/functional/slog/slog_013_pos.ksh b/usr/src/test/zfs-tests/tests/functional/slog/slog_013_pos.ksh
index 1ebb34fdda..3607da7928 100644
--- a/usr/src/test/zfs-tests/tests/functional/slog/slog_013_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/slog/slog_013_pos.ksh
@@ -47,12 +47,7 @@ verify_runnable "global"
function cleanup_testenv
{
cleanup
- if datasetexists $TESTPOOL2 ; then
- log_must zpool destroy -f $TESTPOOL2
- fi
- if [[ -n $lofidev ]]; then
- lofiadm -d $lofidev
- fi
+ [[ -n $lofidev ]] && lofiadm -d $lofidev
}
log_assert "Verify slog device can be disk, file, lofi device or any device " \
@@ -80,13 +75,3 @@ log_must verify_slog_device $TESTPOOL $lofidev 'ONLINE'
log_pass "Verify slog device can be disk, file, lofi device or any device " \
"that presents a block interface."
-
-# Add file which reside in the itself
-mntpnt=$(get_prop mountpoint $TESTPOOL)
-log_must mkfile $MINVDEVSIZE $mntpnt/vdev
-log_must zpool add $TESTPOOL $mntpnt/vdev
-
-# Add ZFS volume
-vol=$TESTPOOL/vol
-log_must zpool create -V $MINVDEVSIZE $vol
-log_must zpool add $TESTPOOL /dev/zvol/dsk/$vol
diff --git a/usr/src/test/zfs-tests/tests/functional/slog/slog_014_pos.ksh b/usr/src/test/zfs-tests/tests/functional/slog/slog_014_pos.ksh
index 621ac23aa9..0190479e44 100644
--- a/usr/src/test/zfs-tests/tests/functional/slog/slog_014_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/slog/slog_014_pos.ksh
@@ -45,11 +45,9 @@ verify_runnable "global"
log_assert "log device can survive when one of the pool device get corrupted."
-for type in "mirror" "raidz" "raidz2"
-do
- for spare in "" "spare"
- do
- log_must zpool create $TESTPOOL $type $VDEV $spare $SDEV \
+for type in "mirror" "raidz" "raidz2"; do
+ for spare in "" "spare"; do
+ log_must zpool create $TESTPOOL $type $VDEV $spare $SDEV \
log $LDEV
# Create a file to be corrupted
@@ -69,13 +67,8 @@ do
conv=notrunc count=50
log_must zpool scrub $TESTPOOL
log_must display_status $TESTPOOL
- log_must zpool status $TESTPOOL 2>&1 >/dev/null
- zpool status -v $TESTPOOL | \
- grep "state: DEGRADED" 2>&1 >/dev/null
- if (( $? != 0 )); then
- log_fail "pool $TESTPOOL status should be DEGRADED"
- fi
+ log_must wait_for_degraded $TESTPOOL
zpool status -v $TESTPOOL | grep logs | \
grep "DEGRADED" 2>&1 >/dev/null
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 6b75289537..dfb78321dc 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -443,7 +443,7 @@ dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
* and can induce severe lock contention when writing to several files
* whose dnodes are in the same block.
*/
-static int
+int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
{
@@ -1302,7 +1302,7 @@ xuio_stat_wbuf_nocopy(void)
}
#ifdef _KERNEL
-static int
+int
dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size)
{
dmu_buf_t **dbp;
@@ -1411,7 +1411,7 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
return (err);
}
-static int
+int
dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
{
dmu_buf_t **dbp;
@@ -1600,22 +1600,17 @@ dmu_return_arcbuf(arc_buf_t *buf)
* dmu_write().
*/
void
-dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
+dmu_assign_arcbuf_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
dmu_tx_t *tx)
{
- dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
- dnode_t *dn;
dmu_buf_impl_t *db;
uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
uint64_t blkid;
- DB_DNODE_ENTER(dbuf);
- dn = DB_DNODE(dbuf);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
blkid = dbuf_whichblock(dn, 0, offset);
VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
rw_exit(&dn->dn_struct_rwlock);
- DB_DNODE_EXIT(dbuf);
/*
* We can only assign if the offset is aligned, the arc buf is the
@@ -1632,11 +1627,8 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF);
ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));
- DB_DNODE_ENTER(dbuf);
- dn = DB_DNODE(dbuf);
os = dn->dn_objset;
object = dn->dn_object;
- DB_DNODE_EXIT(dbuf);
dbuf_rele(db, FTAG);
dmu_write(os, object, offset, blksz, buf->b_data, tx);
@@ -1645,6 +1637,17 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
}
}
+void
+dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
+
+ DB_DNODE_ENTER(dbuf);
+ dmu_assign_arcbuf_dnode(DB_DNODE(dbuf), offset, buf, tx);
+ DB_DNODE_EXIT(dbuf);
+}
+
typedef struct {
dbuf_dirty_record_t *dsa_dr;
dmu_sync_cb_t *dsa_done;
diff --git a/usr/src/uts/common/fs/zfs/dmu_object.c b/usr/src/uts/common/fs/zfs/dmu_object.c
index 9f0f2b437c..aede315502 100644
--- a/usr/src/uts/common/fs/zfs/dmu_object.c
+++ b/usr/src/uts/common/fs/zfs/dmu_object.c
@@ -32,7 +32,8 @@
#include <sys/zfeature.h>
uint64_t
-dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
+dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
+ int indirect_blockshift,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
uint64_t object;
@@ -92,7 +93,8 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
os->os_obj_next = object - 1;
}
- dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
+ dnode_allocate(dn, ot, blocksize, indirect_blockshift,
+ bonustype, bonuslen, tx);
mutex_exit(&os->os_obj_lock);
dmu_tx_add_new_object(tx, dn);
@@ -101,6 +103,14 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
return (object);
}
+uint64_t
+dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ return (dmu_object_alloc_ibs(os, ot, blocksize, 0,
+ bonustype, bonuslen, tx));
+}
+
int
dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c
index 43010e2964..026623f3d5 100644
--- a/usr/src/uts/common/fs/zfs/dmu_send.c
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c
@@ -1757,6 +1757,7 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
drc->drc_force = force;
drc->drc_resumable = resumable;
drc->drc_cred = CRED();
+ drc->drc_clone = (origin != NULL);
if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
drc->drc_byteswap = B_TRUE;
@@ -1817,7 +1818,9 @@ struct receive_writer_arg {
/* A map from guid to dataset to help handle dedup'd streams. */
avl_tree_t *guid_to_ds_map;
boolean_t resumable;
- uint64_t last_object, last_offset;
+ uint64_t last_object;
+ uint64_t last_offset;
+ uint64_t max_object; /* highest object ID referenced in stream */
uint64_t bytes_read; /* bytes read when current record created */
};
@@ -2089,6 +2092,9 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
return (SET_ERROR(EINVAL));
object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT;
+ if (drro->drr_object > rwa->max_object)
+ rwa->max_object = drro->drr_object;
+
/*
* If we are losing blkptrs or changing the block size this must
* be a new file instance. We must clear out the previous file
@@ -2184,6 +2190,9 @@ receive_freeobjects(struct receive_writer_arg *rwa,
err = dmu_free_long_object(rwa->os, obj);
if (err != 0)
return (err);
+
+ if (obj > rwa->max_object)
+ rwa->max_object = obj;
}
if (next_err != ESRCH)
return (next_err);
@@ -2213,6 +2222,9 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
rwa->last_object = drrw->drr_object;
rwa->last_offset = drrw->drr_offset;
+ if (rwa->last_object > rwa->max_object)
+ rwa->max_object = rwa->last_object;
+
if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0)
return (SET_ERROR(EINVAL));
@@ -2289,6 +2301,9 @@ receive_write_byref(struct receive_writer_arg *rwa,
ref_os = rwa->os;
}
+ if (drrwbr->drr_object > rwa->max_object)
+ rwa->max_object = drrwbr->drr_object;
+
err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH);
if (err != 0)
@@ -2331,6 +2346,9 @@ receive_write_embedded(struct receive_writer_arg *rwa,
if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
return (EINVAL);
+ if (drrwe->drr_object > rwa->max_object)
+ rwa->max_object = drrwe->drr_object;
+
tx = dmu_tx_create(rwa->os);
dmu_tx_hold_write(tx, drrwe->drr_object,
@@ -2367,6 +2385,9 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0)
return (SET_ERROR(EINVAL));
+ if (drrs->drr_object > rwa->max_object)
+ rwa->max_object = drrs->drr_object;
+
VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db));
if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
dmu_buf_rele(db, FTAG);
@@ -2411,6 +2432,9 @@ receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf)
if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0)
return (SET_ERROR(EINVAL));
+ if (drrf->drr_object > rwa->max_object)
+ rwa->max_object = drrf->drr_object;
+
err = dmu_free_long_range(rwa->os, drrf->drr_object,
drrf->drr_offset, drrf->drr_length);
@@ -3033,6 +3057,41 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
}
mutex_exit(&rwa.mutex);
+ /*
+ * If we are receiving a full stream as a clone, all object IDs which
+ * are greater than the maximum ID referenced in the stream are
+ * by definition unused and must be freed. Note that it's possible that
+ * we've resumed this send and the first record we received was the END
+ * record. In that case, max_object would be 0, but we shouldn't start
+ * freeing all objects from there; instead we should start from the
+ * resumeobj.
+ */
+ if (drc->drc_clone && drc->drc_drrb->drr_fromguid == 0) {
+ uint64_t obj;
+ if (nvlist_lookup_uint64(begin_nvl, "resume_object", &obj) != 0)
+ obj = 0;
+ if (rwa.max_object > obj)
+ obj = rwa.max_object;
+ obj++;
+ int free_err = 0;
+ int next_err = 0;
+
+ while (next_err == 0) {
+ free_err = dmu_free_long_object(rwa.os, obj);
+ if (free_err != 0 && free_err != ENOENT)
+ break;
+
+ next_err = dmu_object_next(rwa.os, &obj, FALSE, 0);
+ }
+
+ if (err == 0) {
+ if (free_err != 0 && free_err != ENOENT)
+ err = free_err;
+ else if (next_err != ESRCH)
+ err = next_err;
+ }
+ }
+
cv_destroy(&rwa.cv);
mutex_destroy(&rwa.mutex);
bqueue_destroy(&rwa.q);
diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c
index 879a820a92..549c3ca1b0 100644
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c
@@ -1087,7 +1087,12 @@ dmu_tx_wait(dmu_tx_t *tx)
mutex_exit(&dn->dn_mtx);
tx->tx_needassign_txh = NULL;
} else {
- txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
+ /*
+ * If we have a lot of dirty data just wait until we sync
+ * out a TXG at which point we'll hopefully have synced
+ * a portion of the changes.
+ */
+ txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
}
}
diff --git a/usr/src/uts/common/fs/zfs/space_map.c b/usr/src/uts/common/fs/zfs/space_map.c
index b42d449c77..e85a85f913 100644
--- a/usr/src/uts/common/fs/zfs/space_map.c
+++ b/usr/src/uts/common/fs/zfs/space_map.c
@@ -52,6 +52,14 @@
*/
boolean_t zfs_force_some_double_word_sm_entries = B_FALSE;
+/*
+ * Override the default indirect block size of 128K, instead using 16K for
+ * spacemaps (2^14 bytes). This dramatically reduces write inflation since
+ * appending to a spacemap typically has to write one data block (4KB) and one
+ * or two indirect blocks (16K-32K, rather than 128K).
+ */
+int space_map_ibs = 14;
+
boolean_t
sm_entry_is_debug(uint64_t e)
{
@@ -674,8 +682,8 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
*
* [1] The feature is enabled.
* [2] The offset or run is too big for a single-word entry,
- * or the vdev_id is set (meaning not equal to
- * SM_NO_VDEVID).
+ * or the vdev_id is set (meaning not equal to
+ * SM_NO_VDEVID).
*
* Note that for purposes of testing we've added the case that
* we write two-word entries occasionally when the feature is
@@ -834,7 +842,8 @@ space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx)
*/
if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
doi.doi_bonus_size != sizeof (space_map_phys_t)) ||
- doi.doi_data_block_size != blocksize) {
+ doi.doi_data_block_size != blocksize ||
+ doi.doi_metadata_block_size != 1 << space_map_ibs) {
zfs_dbgmsg("txg %llu, spa %s, sm %p, reallocating "
"object[%llu]: old bonus %u, old blocksz %u",
dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object,
@@ -890,8 +899,8 @@ space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
bonuslen = SPACE_MAP_SIZE_V0;
}
- object = dmu_object_alloc(os, DMU_OT_SPACE_MAP, blocksize,
- DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);
+ object = dmu_object_alloc_ibs(os, DMU_OT_SPACE_MAP, blocksize,
+ space_map_ibs, DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);
return (object);
}
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
index 6a33cb7d81..52238bc735 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -354,6 +354,9 @@ typedef struct dmu_buf {
*/
uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
+uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
+ int indirect_blockshift,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
@@ -514,6 +517,9 @@ uint64_t dmu_buf_refcount(dmu_buf_t *db);
int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
uint64_t length, boolean_t read, void *tag,
int *numbufsp, dmu_buf_t ***dbpp);
+int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
+ boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp,
+ uint32_t flags);
void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
typedef void dmu_buf_evict_func_t(void *user_ptr);
@@ -752,14 +758,19 @@ void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_tx_t *tx);
int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size);
+int dmu_read_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size);
int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
dmu_tx_t *tx);
int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
dmu_tx_t *tx);
+int dmu_write_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size,
+ dmu_tx_t *tx);
int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size, struct page *pp, dmu_tx_t *tx);
struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
void dmu_return_arcbuf(struct arc_buf *buf);
+void dmu_assign_arcbuf_dnode(dnode_t *dn, uint64_t offset,
+ struct arc_buf *buf, dmu_tx_t *tx);
void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
dmu_tx_t *tx);
int dmu_xuio_init(struct xuio *uio, int niov);
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_send.h b/usr/src/uts/common/fs/zfs/sys/dmu_send.h
index 38b1b042e5..b8403313e9 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_send.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_send.h
@@ -63,6 +63,7 @@ typedef struct dmu_recv_cookie {
boolean_t drc_byteswap;
boolean_t drc_force;
boolean_t drc_resumable;
+ boolean_t drc_clone;
struct avl_tree *drc_guid_to_ds_map;
zio_cksum_t drc_cksum;
uint64_t drc_newsnapobj;
diff --git a/usr/src/uts/common/fs/zfs/sys/txg_impl.h b/usr/src/uts/common/fs/zfs/sys/txg_impl.h
index e583d61eac..bf3b269d70 100644
--- a/usr/src/uts/common/fs/zfs/sys/txg_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/txg_impl.h
@@ -25,7 +25,7 @@
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_TXG_IMPL_H
@@ -92,6 +92,7 @@ typedef struct tx_state {
kmutex_t tx_sync_lock; /* protects the rest of this struct */
uint64_t tx_open_txg; /* currently open txg id */
+ uint64_t tx_quiescing_txg; /* currently quiescing txg id */
uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */
uint64_t tx_syncing_txg; /* currently syncing txg id */
uint64_t tx_synced_txg; /* last synced txg id */
diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c
index 5578ba5c97..6d2ffe9921 100644
--- a/usr/src/uts/common/fs/zfs/txg.c
+++ b/usr/src/uts/common/fs/zfs/txg.c
@@ -445,6 +445,30 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
}
}
+static boolean_t
+txg_is_syncing(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
+ return (tx->tx_syncing_txg != 0);
+}
+
+static boolean_t
+txg_is_quiescing(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
+ return (tx->tx_quiescing_txg != 0);
+}
+
+static boolean_t
+txg_has_quiesced_to_sync(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
+ return (tx->tx_quiesced_txg != 0);
+}
+
static void
txg_sync_thread(void *arg)
{
@@ -471,7 +495,7 @@ txg_sync_thread(void *arg)
while (!dsl_scan_active(dp->dp_scan) &&
!tx->tx_exiting && timer > 0 &&
tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
- tx->tx_quiesced_txg == 0 &&
+ !txg_has_quiesced_to_sync(dp) &&
dp->dp_dirty_total < zfs_dirty_data_sync) {
dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
@@ -484,7 +508,7 @@ txg_sync_thread(void *arg)
* Wait until the quiesce thread hands off a txg to us,
* prompting it to do so if necessary.
*/
- while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
+ while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) {
if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
cv_broadcast(&tx->tx_quiesce_more_cv);
@@ -499,6 +523,7 @@ txg_sync_thread(void *arg)
* us. This may cause the quiescing thread to now be
* able to quiesce another txg, so we must signal it.
*/
+ ASSERT(tx->tx_quiesced_txg != 0);
txg = tx->tx_quiesced_txg;
tx->tx_quiesced_txg = 0;
tx->tx_syncing_txg = txg;
@@ -549,7 +574,7 @@ txg_quiesce_thread(void *arg)
*/
while (!tx->tx_exiting &&
(tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
- tx->tx_quiesced_txg != 0))
+ txg_has_quiesced_to_sync(dp)))
txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);
if (tx->tx_exiting)
@@ -559,6 +584,8 @@ txg_quiesce_thread(void *arg)
dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
txg, tx->tx_quiesce_txg_waiting,
tx->tx_sync_txg_waiting);
+ tx->tx_quiescing_txg = txg;
+
mutex_exit(&tx->tx_sync_lock);
txg_quiesce(dp, txg);
mutex_enter(&tx->tx_sync_lock);
@@ -567,6 +594,7 @@ txg_quiesce_thread(void *arg)
* Hand this txg off to the sync thread.
*/
dprintf("quiesce done, handing off txg %llu\n", txg);
+ tx->tx_quiescing_txg = 0;
tx->tx_quiesced_txg = txg;
DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
cv_broadcast(&tx->tx_sync_more_cv);
@@ -664,7 +692,8 @@ txg_kick(dsl_pool_t *dp)
ASSERT(!dsl_pool_config_held(dp));
mutex_enter(&tx->tx_sync_lock);
- if (tx->tx_syncing_txg == 0 &&
+ if (!txg_is_syncing(dp) &&
+ !txg_is_quiescing(dp) &&
tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
tx->tx_quiesced_txg <= tx->tx_synced_txg) {
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
index 6be167b7b6..03d711838c 100644
--- a/usr/src/uts/common/fs/zfs/zvol.c
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -129,7 +129,7 @@ typedef struct zvol_state {
zilog_t *zv_zilog; /* ZIL handle */
list_t zv_extents; /* List of extents for dump */
znode_t zv_znode; /* for range locking */
- dmu_buf_t *zv_dbuf; /* bonus handle */
+ dnode_t *zv_dn; /* dnode hold */
} zvol_state_t;
/*
@@ -652,7 +652,7 @@ zvol_first_open(zvol_state_t *zv)
return (error);
}
- error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
+ error = dnode_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dn);
if (error) {
dmu_objset_disown(os, zvol_tag);
return (error);
@@ -677,8 +677,8 @@ zvol_last_close(zvol_state_t *zv)
zil_close(zv->zv_zilog);
zv->zv_zilog = NULL;
- dmu_buf_rele(zv->zv_dbuf, zvol_tag);
- zv->zv_dbuf = NULL;
+ dnode_rele(zv->zv_dn, zvol_tag);
+ zv->zv_dn = NULL;
/*
* Evict cached data
@@ -999,8 +999,6 @@ static int
zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
{
zvol_state_t *zv = arg;
- objset_t *os = zv->zv_objset;
- uint64_t object = ZVOL_OBJ;
uint64_t offset = lr->lr_offset;
uint64_t size = lr->lr_length; /* length of user data */
dmu_buf_t *db;
@@ -1024,7 +1022,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
if (buf != NULL) { /* immediate write */
zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size,
RL_READER);
- error = dmu_read(os, object, offset, size, buf,
+ error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
DMU_READ_NO_PREFETCH);
} else { /* indirect write */
/*
@@ -1037,7 +1035,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
offset = P2ALIGN(offset, size);
zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size,
RL_READER);
- error = dmu_buf_hold(os, object, offset, zgd, &db,
+ error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
DMU_READ_NO_PREFETCH);
if (error == 0) {
blkptr_t *bp = &lr->lr_blkptr;
@@ -1104,8 +1102,8 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
(wr_state == WR_COPIED ? len : 0));
lr = (lr_write_t *)&itx->itx_lr;
- if (wr_state == WR_COPIED && dmu_read(zv->zv_objset,
- ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
+ if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn,
+ off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
zil_itx_destroy(itx);
itx = zil_itx_create(TX_WRITE, sizeof (*lr));
lr = (lr_write_t *)&itx->itx_lr;
@@ -1536,7 +1534,7 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
dmu_tx_abort(tx);
break;
}
- error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx);
+ error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx);
if (error == 0)
zvol_log_write(zv, tx, off, bytes, sync);
dmu_tx_commit(tx);
@@ -1650,7 +1648,7 @@ zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
int
zvol_get_volume_params(minor_t minor, uint64_t *blksize,
uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
- void **rl_hdl, void **bonus_hdl)
+ void **rl_hdl, void **dnode_hdl)
{
zvol_state_t *zv;
@@ -1661,7 +1659,7 @@ zvol_get_volume_params(minor_t minor, uint64_t *blksize,
return (SET_ERROR(ENXIO));
ASSERT(blksize && max_xfer_len && minor_hdl &&
- objset_hdl && zil_hdl && rl_hdl && bonus_hdl);
+ objset_hdl && zil_hdl && rl_hdl && dnode_hdl);
*blksize = zv->zv_volblocksize;
*max_xfer_len = (uint64_t)zvol_maxphys;
@@ -1669,7 +1667,7 @@ zvol_get_volume_params(minor_t minor, uint64_t *blksize,
*objset_hdl = zv->zv_objset;
*zil_hdl = zv->zv_zilog;
*rl_hdl = &zv->zv_znode;
- *bonus_hdl = zv->zv_dbuf;
+ *dnode_hdl = zv->zv_dn;
return (0);
}
diff --git a/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c b/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c
index 0e96e2ec96..bf9a369506 100644
--- a/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c
+++ b/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c
@@ -59,12 +59,12 @@
* zfs internal interfaces referenced here:
*
* FUNCTIONS
- * dmu_buf_hold_array_by_bonus()
+ * dmu_buf_hold_array_by_dnode()
* dmu_buf_rele_array()
*
- * dmu_request_arc_buf()
+ * arc_loan_buf()
* dmu_assign_arcbuf()
- * dmu_return_arc()
+ * dmu_return_arcbuf()
* arc_buf_size()
*
* dmu_tx_create()
@@ -88,7 +88,7 @@
* zv_objset - dmu_tx_create
* zv_zilog - zil_commit
* zv_znode - zfs_range_lock
- * zv_dbuf - dmu_buf_hold_array_by_bonus, dmu_request_arcbuf
+ * zv_dn - dmu_buf_hold_array_by_bonus, dmu_request_arcbuf
* GLOBAL DATA
* zvol_maxphys
*/
@@ -114,7 +114,7 @@ sbd_zvol_get_volume_params(sbd_lu_t *sl)
&sl->sl_zvol_objset_hdl, /* dmu_tx_create */
&sl->sl_zvol_zil_hdl, /* zil_commit */
&sl->sl_zvol_rl_hdl, /* zfs_range_lock */
- &sl->sl_zvol_bonus_hdl); /* dmu_buf_hold_array_by_bonus, */
+ &sl->sl_zvol_dn_hdl); /* dmu_buf_hold_array_by_dnode, */
/* dmu_request_arcbuf, */
/* dmu_assign_arcbuf */
@@ -153,10 +153,10 @@ int
sbd_zvol_alloc_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
{
sbd_zvol_io_t *zvio = dbuf->db_lu_private;
- rl_t *rl;
- int numbufs, error;
- uint64_t len = dbuf->db_data_size;
- uint64_t offset = zvio->zvio_offset;
+ rl_t *rl;
+ int numbufs, error;
+ uint64_t len = dbuf->db_data_size;
+ uint64_t offset = zvio->zvio_offset;
dmu_buf_t **dbpp, *dbp;
/* Make sure request is reasonable */
@@ -171,8 +171,9 @@ sbd_zvol_alloc_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
*/
rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_READER);
- error = dmu_buf_hold_array_by_bonus(sl->sl_zvol_bonus_hdl, offset,
- len, TRUE, RDTAG, &numbufs, &dbpp);
+ error = dmu_buf_hold_array_by_dnode(sl->sl_zvol_dn_hdl,
+ offset, len, TRUE, RDTAG, &numbufs, &dbpp,
+ DMU_READ_PREFETCH);
zfs_range_unlock(rl);
@@ -242,8 +243,8 @@ sbd_zvol_alloc_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
uint64_t blksize;
arc_buf_t **abp;
stmf_sglist_ent_t *sgl;
- uint64_t len = dbuf->db_data_size;
- uint64_t offset = zvio->zvio_offset;
+ uint64_t len = dbuf->db_data_size;
+ uint64_t offset = zvio->zvio_offset;
/* Make sure request is reasonable */
if (len > sl->sl_max_xfer_len)
@@ -293,7 +294,8 @@ sbd_zvol_alloc_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
if (seglen == 0)
seglen = blksize;
seglen = MIN(seglen, len);
- abp[i] = dmu_request_arcbuf(sl->sl_zvol_bonus_hdl, (int)seglen);
+ abp[i] = arc_loan_buf(dmu_objset_spa(sl->sl_zvol_objset_hdl),
+ B_FALSE, (int)seglen);
ASSERT(arc_buf_size(abp[i]) == (int)seglen);
sgl->seg_addr = abp[i]->b_data;
sgl->seg_length = (uint32_t)seglen;
@@ -335,7 +337,7 @@ sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
sbd_zvol_io_t *zvio = dbuf->db_lu_private;
dmu_tx_t *tx;
int sync, i, error;
- rl_t *rl;
+ rl_t *rl;
arc_buf_t **abp = zvio->zvio_abp;
int flags = zvio->zvio_flags;
uint64_t toffset, offset = zvio->zvio_offset;
@@ -364,7 +366,8 @@ sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
abuf = abp[i];
size = arc_buf_size(abuf);
- dmu_assign_arcbuf(sl->sl_zvol_bonus_hdl, toffset, abuf, tx);
+ dmu_assign_arcbuf_dnode(sl->sl_zvol_dn_hdl, toffset, abuf,
+ tx);
toffset += size;
resid -= size;
}
@@ -391,7 +394,7 @@ int
sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio)
{
int error;
- rl_t *rl;
+ rl_t *rl;
uint64_t len = (uint64_t)uio->uio_resid;
uint64_t offset = (uint64_t)uio->uio_loffset;
@@ -403,7 +406,7 @@ sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio)
rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_READER);
- error = dmu_read_uio_dbuf(sl->sl_zvol_bonus_hdl, uio, len);
+ error = dmu_read_uio_dnode(sl->sl_zvol_dn_hdl, uio, len);
zfs_range_unlock(rl);
if (error == ECKSUM)
@@ -418,8 +421,8 @@ sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio)
int
sbd_zvol_copy_write(sbd_lu_t *sl, uio_t *uio, int flags)
{
- rl_t *rl;
- dmu_tx_t *tx;
+ rl_t *rl;
+ dmu_tx_t *tx;
int error, sync;
uint64_t len = (uint64_t)uio->uio_resid;
uint64_t offset = (uint64_t)uio->uio_loffset;
@@ -442,7 +445,7 @@ sbd_zvol_copy_write(sbd_lu_t *sl, uio_t *uio, int flags)
if (error) {
dmu_tx_abort(tx);
} else {
- error = dmu_write_uio_dbuf(sl->sl_zvol_bonus_hdl, uio, len, tx);
+ error = dmu_write_uio_dnode(sl->sl_zvol_dn_hdl, uio, len, tx);
if (error == 0) {
zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset,
(ssize_t)len, sync);
diff --git a/usr/src/uts/common/io/comstar/lu/stmf_sbd/stmf_sbd.h b/usr/src/uts/common/io/comstar/lu/stmf_sbd/stmf_sbd.h
index efbc7268ea..a402ad0ee3 100644
--- a/usr/src/uts/common/io/comstar/lu/stmf_sbd/stmf_sbd.h
+++ b/usr/src/uts/common/io/comstar/lu/stmf_sbd/stmf_sbd.h
@@ -228,7 +228,7 @@ typedef struct sbd_lu {
void *sl_zvol_objset_hdl;
void *sl_zvol_zil_hdl;
void *sl_zvol_rl_hdl;
- void *sl_zvol_bonus_hdl;
+ void *sl_zvol_dn_hdl;
/* Backing store */
char *sl_data_filename;
diff --git a/usr/src/uts/i86pc/io/pcplusmp/apic_common.c b/usr/src/uts/i86pc/io/pcplusmp/apic_common.c
index 2ae8b5cd92..b57f978f3b 100644
--- a/usr/src/uts/i86pc/io/pcplusmp/apic_common.c
+++ b/usr/src/uts/i86pc/io/pcplusmp/apic_common.c
@@ -24,7 +24,7 @@
*/
/*
* Copyright 2018 Joyent, Inc.
- * Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
*/
/*
@@ -1072,19 +1072,20 @@ apic_cpu_remove(psm_cpu_request_t *reqp)
}
/*
- * Return the number of APIC clock ticks elapsed for 8245 to decrement
- * (APIC_TIME_COUNT + pit_ticks_adj) ticks.
+ * Return the number of ticks the APIC decrements in SF nanoseconds.
+ * The fixed-frequency PIT (aka 8254) is used for the measurement.
*/
-uint_t
-apic_calibrate(volatile uint32_t *addr, uint16_t *pit_ticks_adj)
+static uint64_t
+apic_calibrate_impl()
{
uint8_t pit_tick_lo;
- uint16_t pit_tick, target_pit_tick;
- uint32_t start_apic_tick, end_apic_tick;
+ uint16_t pit_tick, target_pit_tick, pit_ticks_adj;
+ uint32_t pit_ticks;
+ uint32_t start_apic_tick, end_apic_tick, apic_ticks;
ulong_t iflag;
- uint32_t reg;
- reg = addr + APIC_CURR_COUNT - apicadr;
+ apic_reg_ops->apic_write(APIC_DIVIDE_REG, apic_divide_reg_init);
+ apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL);
iflag = intr_clear();
@@ -1095,7 +1096,7 @@ apic_calibrate(volatile uint32_t *addr, uint16_t *pit_ticks_adj)
pit_tick_lo <= APIC_LB_MIN || pit_tick_lo >= APIC_LB_MAX);
/*
- * Wait for the 8254 to decrement by 5 ticks to ensure
+ * Wait for the PIT to decrement by 5 ticks to ensure
* we didn't start in the middle of a tick.
* Compare with 0x10 for the wrap around case.
*/
@@ -1105,11 +1106,10 @@ apic_calibrate(volatile uint32_t *addr, uint16_t *pit_ticks_adj)
pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo;
} while (pit_tick > target_pit_tick || pit_tick_lo < 0x10);
- start_apic_tick = apic_reg_ops->apic_read(reg);
+ start_apic_tick = apic_reg_ops->apic_read(APIC_CURR_COUNT);
/*
- * Wait for the 8254 to decrement by
- * (APIC_TIME_COUNT + pit_ticks_adj) ticks
+ * Wait for the PIT to decrement by APIC_TIME_COUNT ticks
*/
target_pit_tick = pit_tick - APIC_TIME_COUNT;
do {
@@ -1117,13 +1117,95 @@ apic_calibrate(volatile uint32_t *addr, uint16_t *pit_ticks_adj)
pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo;
} while (pit_tick > target_pit_tick || pit_tick_lo < 0x10);
- end_apic_tick = apic_reg_ops->apic_read(reg);
-
- *pit_ticks_adj = target_pit_tick - pit_tick;
+ end_apic_tick = apic_reg_ops->apic_read(APIC_CURR_COUNT);
intr_restore(iflag);
- return (start_apic_tick - end_apic_tick);
+ apic_ticks = start_apic_tick - end_apic_tick;
+
+ /* The PIT might have decremented by more ticks than planned */
+ pit_ticks_adj = target_pit_tick - pit_tick;
+ /* total number of PIT ticks corresponding to apic_ticks */
+ pit_ticks = APIC_TIME_COUNT + pit_ticks_adj;
+
+ /*
+ * Determine the number of nanoseconds per APIC clock tick
+ * and then determine how many APIC ticks to interrupt at the
+ * desired frequency
+ * apic_ticks / (pitticks / PIT_HZ) = apic_ticks_per_s
+ * (apic_ticks * PIT_HZ) / pitticks = apic_ticks_per_s
+ * apic_ticks_per_ns = (apic_ticks * PIT_HZ) / (pitticks * 10^9)
+ * apic_ticks_per_SFns =
+ * (SF * apic_ticks * PIT_HZ) / (pitticks * 10^9)
+ */
+ return ((SF * apic_ticks * PIT_HZ) / ((uint64_t)pit_ticks * NANOSEC));
+}
+
+/*
+ * It was found empirically that 5 measurements seem sufficient to give good
+ * accuracy. Most spurious measurements are higher than the target value thus
+ * we eliminate up to 2/5 spurious measurements.
+ */
+#define APIC_CALIBRATE_MEASUREMENTS 5
+
+#define APIC_CALIBRATE_PERCENT_OFF_WARNING 10
+
+/*
+ * Return the number of ticks the APIC decrements in SF nanoseconds.
+ * Several measurements are taken to filter out outliers.
+ */
+uint64_t
+apic_calibrate()
+{
+ uint64_t measurements[APIC_CALIBRATE_MEASUREMENTS];
+ int median_idx;
+ uint64_t median;
+
+ /*
+ * When running under a virtual machine, the emulated PIT and APIC
+ * counters do not always return the right values and can roll over.
+ * Those spurious measurements are relatively rare but could
+ * significantly affect the calibration.
+ * Therefore we take several measurements and then keep the median.
+ * The median is preferred to the average here as we only want to
+ * discard outliers.
+ */
+ for (int i = 0; i < APIC_CALIBRATE_MEASUREMENTS; i++)
+ measurements[i] = apic_calibrate_impl();
+
+ /*
+ * Sort results and retrieve the median.
+ */
+ for (int i = 0; i < APIC_CALIBRATE_MEASUREMENTS; i++) {
+ for (int j = i + 1; j < APIC_CALIBRATE_MEASUREMENTS; j++) {
+ if (measurements[j] < measurements[i]) {
+ uint64_t tmp = measurements[i];
+ measurements[i] = measurements[j];
+ measurements[j] = tmp;
+ }
+ }
+ }
+ median_idx = APIC_CALIBRATE_MEASUREMENTS / 2;
+ median = measurements[median_idx];
+
+#if (APIC_CALIBRATE_MEASUREMENTS >= 3)
+ /*
+ * Check that measurements are consistent. Post a warning
+ * if the three middle values are not close to each other.
+ */
+ uint64_t delta_warn = median *
+ APIC_CALIBRATE_PERCENT_OFF_WARNING / 100;
+ if ((median - measurements[median_idx - 1]) > delta_warn ||
+ (measurements[median_idx + 1] - median) > delta_warn) {
+ cmn_err(CE_WARN, "apic_calibrate measurements lack "
+ "precision: %llu, %llu, %llu.",
+ (u_longlong_t)measurements[median_idx - 1],
+ (u_longlong_t)median,
+ (u_longlong_t)measurements[median_idx + 1]);
+ }
+#endif
+
+ return (median);
}
/*
diff --git a/usr/src/uts/i86pc/io/pcplusmp/apic_timer.c b/usr/src/uts/i86pc/io/pcplusmp/apic_timer.c
index 348f5034fc..bc61c114c2 100644
--- a/usr/src/uts/i86pc/io/pcplusmp/apic_timer.c
+++ b/usr/src/uts/i86pc/io/pcplusmp/apic_timer.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2017 by Delphix. All rights reserved.
*/
/*
* Copyright (c) 2010, Intel Corporation.
@@ -90,34 +91,12 @@ static apic_timer_t apic_timer;
int
apic_timer_init(int hertz)
{
- uint_t apic_ticks = 0;
- uint_t pit_ticks;
int ret, timer_mode;
- uint16_t pit_ticks_adj;
static int firsttime = 1;
if (firsttime) {
/* first time calibrate on CPU0 only */
-
- apic_reg_ops->apic_write(APIC_DIVIDE_REG, apic_divide_reg_init);
- apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL);
- apic_ticks = apic_calibrate(apicadr, &pit_ticks_adj);
-
- /* total number of PIT ticks corresponding to apic_ticks */
- pit_ticks = APIC_TIME_COUNT + pit_ticks_adj;
-
- /*
- * Determine the number of nanoseconds per APIC clock tick
- * and then determine how many APIC ticks to interrupt at the
- * desired frequency
- * apic_ticks / (pitticks / PIT_HZ) = apic_ticks_per_s
- * (apic_ticks * PIT_HZ) / pitticks = apic_ticks_per_s
- * apic_ticks_per_ns = (apic_ticks * PIT_HZ) / (pitticks * 10^9)
- * pic_ticks_per_SFns =
- * (SF * apic_ticks * PIT_HZ) / (pitticks * 10^9)
- */
- apic_ticks_per_SFnsecs = ((SF * apic_ticks * PIT_HZ) /
- ((uint64_t)pit_ticks * NANOSEC));
+ apic_ticks_per_SFnsecs = apic_calibrate();
/* the interval timer initial count is 32 bit max */
apic_nsec_max = APIC_TICKS_TO_NSECS(APIC_MAXVAL);
diff --git a/usr/src/uts/i86pc/sys/apic.h b/usr/src/uts/i86pc/sys/apic.h
index 0352a154af..f2528a632f 100644
--- a/usr/src/uts/i86pc/sys/apic.h
+++ b/usr/src/uts/i86pc/sys/apic.h
@@ -21,6 +21,7 @@
/*
* Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2018 Joyent, Inc.
+ * Copyright (c) 2017 by Delphix. All rights reserved.
*/
/*
* Copyright (c) 2010, Intel Corporation.
@@ -830,7 +831,7 @@ extern int apic_local_mode();
extern void apic_change_eoi();
extern void apic_send_EOI(uint32_t);
extern void apic_send_directed_EOI(uint32_t);
-extern uint_t apic_calibrate(volatile uint32_t *, uint16_t *);
+extern uint64_t apic_calibrate();
extern void x2apic_send_pir_ipi(processorid_t);
extern volatile uint32_t *apicadr; /* virtual addr of local APIC */
diff --git a/usr/src/uts/i86pc/sys/apic_common.h b/usr/src/uts/i86pc/sys/apic_common.h
index 9c08d73798..dc02031ac3 100644
--- a/usr/src/uts/i86pc/sys/apic_common.h
+++ b/usr/src/uts/i86pc/sys/apic_common.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2017 by Delphix. All rights reserved.
*/
/*
* Copyright 2018 Joyent, Inc.
@@ -183,7 +184,7 @@ extern void apic_unset_idlecpu(processorid_t cpun);
extern void apic_shutdown(int cmd, int fcn);
extern void apic_preshutdown(int cmd, int fcn);
extern processorid_t apic_get_next_processorid(processorid_t cpun);
-extern uint_t apic_calibrate(volatile uint32_t *, uint16_t *);
+extern uint64_t apic_calibrate();
extern int apic_get_pir_ipivect(void);
extern void apic_send_pir_ipi(processorid_t);