diff options
25 files changed, 340 insertions, 149 deletions
diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c index bdf197ae23..7ccd124dbe 100644 --- a/usr/src/cmd/zdb/zdb.c +++ b/usr/src/cmd/zdb/zdb.c @@ -4909,7 +4909,7 @@ zdb_embedded_block(char *thing) { blkptr_t bp; unsigned long long *words = (void *)&bp; - char buf[SPA_MAXBLOCKSIZE]; + char *buf; int err; bzero(&bp, sizeof (bp)); @@ -4920,16 +4920,22 @@ zdb_embedded_block(char *thing) words + 8, words + 9, words + 10, words + 11, words + 12, words + 13, words + 14, words + 15); if (err != 16) { - (void) printf("invalid input format\n"); + (void) fprintf(stderr, "invalid input format\n"); exit(1); } ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE); + buf = malloc(SPA_MAXBLOCKSIZE); + if (buf == NULL) { + (void) fprintf(stderr, "out of memory\n"); + exit(1); + } err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp)); if (err != 0) { - (void) printf("decode failed: %u\n", err); + (void) fprintf(stderr, "decode failed: %u\n", err); exit(1); } zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0); + free(buf); } static boolean_t diff --git a/usr/src/common/mpi/mpi-priv.h b/usr/src/common/mpi/mpi-priv.h index fa6af6d661..9af654ca1d 100644 --- a/usr/src/common/mpi/mpi-priv.h +++ b/usr/src/common/mpi/mpi-priv.h @@ -46,14 +46,14 @@ * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * + * Copyright 2017 RackTop Systems. + * * Sun elects to use this software under the MPL license. 
*/ #ifndef _MPI_PRIV_H #define _MPI_PRIV_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* $Id: mpi-priv.h,v 1.20 2005/11/22 07:16:43 relyea%netscape.com Exp $ */ #include "mpi.h" @@ -300,7 +300,7 @@ mp_err MPI_ASM_DECL s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, /* c += a * b * (MP_RADIX ** offset); */ #define s_mp_mul_d_add_offset(a, b, c, off) \ -(s_mpv_mul_d_add_prop(MP_DIGITS(a), MP_USED(a), b, MP_DIGITS(c) + off), MP_OKAY) + s_mpv_mul_d_add_prop(MP_DIGITS(a), MP_USED(a), b, MP_DIGITS(c) + off) typedef struct { mp_int N; /* modulus N */ diff --git a/usr/src/common/mpi/mpmontg.c b/usr/src/common/mpi/mpmontg.c index 33aea8b0d6..150bd2d37f 100644 --- a/usr/src/common/mpi/mpmontg.c +++ b/usr/src/common/mpi/mpmontg.c @@ -40,11 +40,11 @@ * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * + * Copyright 2017 RackTop Systems. + * * Sun elects to use this software under the MPL license. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* $Id: mpmontg.c,v 1.20 2006/08/29 02:41:38 nelson%bolyard.com Exp $ */ /* This file implements moduluar exponentiation using Montgomery's @@ -104,7 +104,7 @@ mp_err s_mp_redc(mp_int *T, mp_mont_modulus *mmm) for (i = 0; i < MP_USED(&mmm->N); ++i ) { mp_digit m_i = MP_DIGIT(T, i) * mmm->n0prime; /* T += N * m_i * (MP_RADIX ** i); */ - MP_CHECKOK( s_mp_mul_d_add_offset(&mmm->N, m_i, T, i) ); + s_mp_mul_d_add_offset(&mmm->N, m_i, T, i); } s_mp_clamp(T); diff --git a/usr/src/lib/libc/port/print/doprnt.c b/usr/src/lib/libc/port/print/doprnt.c index b6e8ceef0a..e8cfaad6de 100644 --- a/usr/src/lib/libc/port/print/doprnt.c +++ b/usr/src/lib/libc/port/print/doprnt.c @@ -1574,7 +1574,7 @@ _ndoprnt(const char *format, va_list in_args, FILE *iop, int prflag) p = insert_thousands_sep(buf, p); /* Put in a decimal point if needed */ - if (prec != 0 || (flagword & FSHARP)) + if (prec > 0 || (flagword & FSHARP)) p = insert_decimal_point(p); /* Digits (if any) after the decimal point */ diff --git 
a/usr/src/man/man1m/zpool.1m b/usr/src/man/man1m/zpool.1m index 728f73de9d..1e7b2b8cfc 100644 --- a/usr/src/man/man1m/zpool.1m +++ b/usr/src/man/man1m/zpool.1m @@ -26,7 +26,7 @@ .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2017 George Melikov. All Rights Reserved. .\" -.Dd December 6, 2017 +.Dd April 27, 2018 .Dt ZPOOL 1M .Os .Sh NAME @@ -1450,7 +1450,7 @@ See the .Sx Properties section for a list of valid properties. The default list is -.Cm name , size , allocated , free , expandsize , fragmentation , capacity , +.Cm name , size , allocated , free , checkpoint , expandsize , fragmentation , capacity , .Cm dedupratio , health , altroot . .It Fl p Display numbers in parsable diff --git a/usr/src/test/zfs-tests/include/libtest.shlib b/usr/src/test/zfs-tests/include/libtest.shlib index dc4c4e253b..512bb069f9 100644 --- a/usr/src/test/zfs-tests/include/libtest.shlib +++ b/usr/src/test/zfs-tests/include/libtest.shlib @@ -1675,6 +1675,25 @@ function is_pool_removed #pool return $? } +function wait_for_degraded +{ + typeset pool=$1 + typeset timeout=${2:-30} + typeset t0=$SECONDS + + while :; do + [[ $(get_pool_prop health $pool) == "DEGRADED" ]] && break + log_note "$pool is not yet degraded." + sleep 1 + if ((SECONDS - t0 > $timeout)); then + log_note "$pool not degraded after $timeout seconds." + return 1 + fi + done + + return 0 +} + # # Use create_pool()/destroy_pool() to clean up the infomation in # in the given disk to avoid slice overlapping.
diff --git a/usr/src/test/zfs-tests/tests/functional/slog/slog.kshlib b/usr/src/test/zfs-tests/tests/functional/slog/slog.kshlib index a6d82f28d8..493ceda60d 100644 --- a/usr/src/test/zfs-tests/tests/functional/slog/slog.kshlib +++ b/usr/src/test/zfs-tests/tests/functional/slog/slog.kshlib @@ -33,12 +33,8 @@ function cleanup { - if datasetexists $TESTPOOL ; then - log_must zpool destroy -f $TESTPOOL - fi - if datasetexists $TESTPOOL2 ; then - log_must zpool destroy -f $TESTPOOL2 - fi + poolexists $TESTPOOL && destroy_pool $TESTPOOL + poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2 } # diff --git a/usr/src/test/zfs-tests/tests/functional/slog/slog_013_pos.ksh b/usr/src/test/zfs-tests/tests/functional/slog/slog_013_pos.ksh index 1ebb34fdda..3607da7928 100644 --- a/usr/src/test/zfs-tests/tests/functional/slog/slog_013_pos.ksh +++ b/usr/src/test/zfs-tests/tests/functional/slog/slog_013_pos.ksh @@ -47,12 +47,7 @@ verify_runnable "global" function cleanup_testenv { cleanup - if datasetexists $TESTPOOL2 ; then - log_must zpool destroy -f $TESTPOOL2 - fi - if [[ -n $lofidev ]]; then - lofiadm -d $lofidev - fi + [[ -n $lofidev ]] && $LOFIADM -d $lofidev } log_assert "Verify slog device can be disk, file, lofi device or any device " \ @@ -80,13 +75,3 @@ log_must verify_slog_device $TESTPOOL $lofidev 'ONLINE' log_pass "Verify slog device can be disk, file, lofi device or any device " \ "that presents a block interface." 
- -# Add file which reside in the itself -mntpnt=$(get_prop mountpoint $TESTPOOL) -log_must mkfile $MINVDEVSIZE $mntpnt/vdev -log_must zpool add $TESTPOOL $mntpnt/vdev - -# Add ZFS volume -vol=$TESTPOOL/vol -log_must zpool create -V $MINVDEVSIZE $vol -log_must zpool add $TESTPOOL /dev/zvol/dsk/$vol diff --git a/usr/src/test/zfs-tests/tests/functional/slog/slog_014_pos.ksh b/usr/src/test/zfs-tests/tests/functional/slog/slog_014_pos.ksh index 621ac23aa9..0190479e44 100644 --- a/usr/src/test/zfs-tests/tests/functional/slog/slog_014_pos.ksh +++ b/usr/src/test/zfs-tests/tests/functional/slog/slog_014_pos.ksh @@ -45,11 +45,9 @@ verify_runnable "global" log_assert "log device can survive when one of the pool device get corrupted." -for type in "mirror" "raidz" "raidz2" -do - for spare in "" "spare" - do - log_must zpool create $TESTPOOL $type $VDEV $spare $SDEV \ +for type in "mirror" "raidz" "raidz2"; do + for spare in "" "spare"; do + log_must $ZPOOL create $TESTPOOL $type $VDEV $spare $SDEV \ log $LDEV # Create a file to be corrupted @@ -69,13 +67,8 @@ do conv=notrunc count=50 log_must zpool scrub $TESTPOOL log_must display_status $TESTPOOL - log_must zpool status $TESTPOOL 2>&1 >/dev/null - zpool status -v $TESTPOOL | \ - grep "state: DEGRADED" 2>&1 >/dev/null - if (( $? != 0 )); then - log_fail "pool $TESTPOOL status should be DEGRADED" - fi + log_must wait_for_degraded $TESTPOOL zpool status -v $TESTPOOL | grep logs | \ grep "DEGRADED" 2>&1 >/dev/null diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index 6b75289537..dfb78321dc 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -443,7 +443,7 @@ dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) * and can induce severe lock contention when writing to several files * whose dnodes are in the same block. 
*/ -static int +int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { @@ -1302,7 +1302,7 @@ xuio_stat_wbuf_nocopy(void) } #ifdef _KERNEL -static int +int dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) { dmu_buf_t **dbp; @@ -1411,7 +1411,7 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) return (err); } -static int +int dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; @@ -1600,22 +1600,17 @@ dmu_return_arcbuf(arc_buf_t *buf) * dmu_write(). */ void -dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, +dmu_assign_arcbuf_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { - dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; - dnode_t *dn; dmu_buf_impl_t *db; uint32_t blksz = (uint32_t)arc_buf_lsize(buf); uint64_t blkid; - DB_DNODE_ENTER(dbuf); - dn = DB_DNODE(dbuf); rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); rw_exit(&dn->dn_struct_rwlock); - DB_DNODE_EXIT(dbuf); /* * We can only assign if the offset is aligned, the arc buf is the @@ -1632,11 +1627,8 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); - DB_DNODE_ENTER(dbuf); - dn = DB_DNODE(dbuf); os = dn->dn_objset; object = dn->dn_object; - DB_DNODE_EXIT(dbuf); dbuf_rele(db, FTAG); dmu_write(os, object, offset, blksz, buf->b_data, tx); @@ -1645,6 +1637,17 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, } } +void +dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, + dmu_tx_t *tx) +{ + dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; + + DB_DNODE_ENTER(dbuf); + dmu_assign_arcbuf_dnode(DB_DNODE(dbuf), offset, buf, tx); + DB_DNODE_EXIT(dbuf); +} + 
typedef struct { dbuf_dirty_record_t *dsa_dr; dmu_sync_cb_t *dsa_done; diff --git a/usr/src/uts/common/fs/zfs/dmu_object.c b/usr/src/uts/common/fs/zfs/dmu_object.c index 9f0f2b437c..aede315502 100644 --- a/usr/src/uts/common/fs/zfs/dmu_object.c +++ b/usr/src/uts/common/fs/zfs/dmu_object.c @@ -32,7 +32,8 @@ #include <sys/zfeature.h> uint64_t -dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, +dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, + int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { uint64_t object; @@ -92,7 +93,8 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, os->os_obj_next = object - 1; } - dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); + dnode_allocate(dn, ot, blocksize, indirect_blockshift, + bonustype, bonuslen, tx); mutex_exit(&os->os_obj_lock); dmu_tx_add_new_object(tx, dn); @@ -101,6 +103,14 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, return (object); } +uint64_t +dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return (dmu_object_alloc_ibs(os, ot, blocksize, 0, + bonustype, bonuslen, tx)); +} + int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c index 43010e2964..026623f3d5 100644 --- a/usr/src/uts/common/fs/zfs/dmu_send.c +++ b/usr/src/uts/common/fs/zfs/dmu_send.c @@ -1757,6 +1757,7 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, drc->drc_force = force; drc->drc_resumable = resumable; drc->drc_cred = CRED(); + drc->drc_clone = (origin != NULL); if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { drc->drc_byteswap = B_TRUE; @@ -1817,7 +1818,9 @@ struct receive_writer_arg { /* A map from guid to dataset to help 
handle dedup'd streams. */ avl_tree_t *guid_to_ds_map; boolean_t resumable; - uint64_t last_object, last_offset; + uint64_t last_object; + uint64_t last_offset; + uint64_t max_object; /* highest object ID referenced in stream */ uint64_t bytes_read; /* bytes read when current record created */ }; @@ -2089,6 +2092,9 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, return (SET_ERROR(EINVAL)); object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT; + if (drro->drr_object > rwa->max_object) + rwa->max_object = drro->drr_object; + /* * If we are losing blkptrs or changing the block size this must * be a new file instance. We must clear out the previous file @@ -2184,6 +2190,9 @@ receive_freeobjects(struct receive_writer_arg *rwa, err = dmu_free_long_object(rwa->os, obj); if (err != 0) return (err); + + if (obj > rwa->max_object) + rwa->max_object = obj; } if (next_err != ESRCH) return (next_err); @@ -2213,6 +2222,9 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, rwa->last_object = drrw->drr_object; rwa->last_offset = drrw->drr_offset; + if (rwa->last_object > rwa->max_object) + rwa->max_object = rwa->last_object; + if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); @@ -2289,6 +2301,9 @@ receive_write_byref(struct receive_writer_arg *rwa, ref_os = rwa->os; } + if (drrwbr->drr_object > rwa->max_object) + rwa->max_object = drrwbr->drr_object; + err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH); if (err != 0) @@ -2331,6 +2346,9 @@ receive_write_embedded(struct receive_writer_arg *rwa, if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS) return (EINVAL); + if (drrwe->drr_object > rwa->max_object) + rwa->max_object = drrwe->drr_object; + tx = dmu_tx_create(rwa->os); dmu_tx_hold_write(tx, drrwe->drr_object, @@ -2367,6 +2385,9 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, if (dmu_object_info(rwa->os, 
drrs->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); + if (drrs->drr_object > rwa->max_object) + rwa->max_object = drrs->drr_object; + VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db)); if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { dmu_buf_rele(db, FTAG); @@ -2411,6 +2432,9 @@ receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); + if (drrf->drr_object > rwa->max_object) + rwa->max_object = drrf->drr_object; + err = dmu_free_long_range(rwa->os, drrf->drr_object, drrf->drr_offset, drrf->drr_length); @@ -3033,6 +3057,41 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, } mutex_exit(&rwa.mutex); + /* + * If we are receiving a full stream as a clone, all object IDs which + * are greater than the maximum ID referenced in the stream are + * by definition unused and must be freed. Note that it's possible that + * we've resumed this send and the first record we received was the END + * record. In that case, max_object would be 0, but we shouldn't start + * freeing all objects from there; instead we should start from the + * resumeobj. 
+ */ + if (drc->drc_clone && drc->drc_drrb->drr_fromguid == 0) { + uint64_t obj; + if (nvlist_lookup_uint64(begin_nvl, "resume_object", &obj) != 0) + obj = 0; + if (rwa.max_object > obj) + obj = rwa.max_object; + obj++; + int free_err = 0; + int next_err = 0; + + while (next_err == 0) { + free_err = dmu_free_long_object(rwa.os, obj); + if (free_err != 0 && free_err != ENOENT) + break; + + next_err = dmu_object_next(rwa.os, &obj, FALSE, 0); + } + + if (err == 0) { + if (free_err != 0 && free_err != ENOENT) + err = free_err; + else if (next_err != ESRCH) + err = next_err; + } + } + cv_destroy(&rwa.cv); mutex_destroy(&rwa.mutex); bqueue_destroy(&rwa.q); diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c index 879a820a92..549c3ca1b0 100644 --- a/usr/src/uts/common/fs/zfs/dmu_tx.c +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c @@ -1087,7 +1087,12 @@ dmu_tx_wait(dmu_tx_t *tx) mutex_exit(&dn->dn_mtx); tx->tx_needassign_txh = NULL; } else { - txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1); + /* + * If we have a lot of dirty data just wait until we sync + * out a TXG at which point we'll hopefully have synced + * a portion of the changes. + */ + txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); } } diff --git a/usr/src/uts/common/fs/zfs/space_map.c b/usr/src/uts/common/fs/zfs/space_map.c index b42d449c77..e85a85f913 100644 --- a/usr/src/uts/common/fs/zfs/space_map.c +++ b/usr/src/uts/common/fs/zfs/space_map.c @@ -52,6 +52,14 @@ */ boolean_t zfs_force_some_double_word_sm_entries = B_FALSE; +/* + * Override the default indirect block size of 128K, instead using 16K for + * spacemaps (2^14 bytes). This dramatically reduces write inflation since + * appending to a spacemap typically has to write one data block (4KB) and one + * or two indirect blocks (16K-32K, rather than 128K). 
+ */ +int space_map_ibs = 14; + boolean_t sm_entry_is_debug(uint64_t e) { @@ -674,8 +682,8 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, * * [1] The feature is enabled. * [2] The offset or run is too big for a single-word entry, - * or the vdev_id is set (meaning not equal to - * SM_NO_VDEVID). + * or the vdev_id is set (meaning not equal to + * SM_NO_VDEVID). * * Note that for purposes of testing we've added the case that * we write two-word entries occasionally when the feature is @@ -834,7 +842,8 @@ space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx) */ if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && doi.doi_bonus_size != sizeof (space_map_phys_t)) || - doi.doi_data_block_size != blocksize) { + doi.doi_data_block_size != blocksize || + doi.doi_metadata_block_size != 1 << space_map_ibs) { zfs_dbgmsg("txg %llu, spa %s, sm %p, reallocating " "object[%llu]: old bonus %u, old blocksz %u", dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object, @@ -890,8 +899,8 @@ space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) bonuslen = SPACE_MAP_SIZE_V0; } - object = dmu_object_alloc(os, DMU_OT_SPACE_MAP, blocksize, - DMU_OT_SPACE_MAP_HEADER, bonuslen, tx); + object = dmu_object_alloc_ibs(os, DMU_OT_SPACE_MAP, blocksize, + space_map_ibs, DMU_OT_SPACE_MAP_HEADER, bonuslen, tx); return (object); } diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h index 6a33cb7d81..52238bc735 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h @@ -354,6 +354,9 @@ typedef struct dmu_buf { */ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); +uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, + int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); int dmu_object_claim(objset_t *os, uint64_t object, 
dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, @@ -514,6 +517,9 @@ uint64_t dmu_buf_refcount(dmu_buf_t *db); int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); +int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, + boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, + uint32_t flags); void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); typedef void dmu_buf_evict_func_t(void *user_ptr); @@ -752,14 +758,19 @@ void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size); +int dmu_read_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size); int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size, dmu_tx_t *tx); +int dmu_write_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size, + dmu_tx_t *tx); int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, struct page *pp, dmu_tx_t *tx); struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); void dmu_return_arcbuf(struct arc_buf *buf); +void dmu_assign_arcbuf_dnode(dnode_t *handle, uint64_t offset, + struct arc_buf *buf, dmu_tx_t *tx); void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); int dmu_xuio_init(struct xuio *uio, int niov); diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_send.h b/usr/src/uts/common/fs/zfs/sys/dmu_send.h index 38b1b042e5..b8403313e9 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_send.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_send.h @@ -63,6 +63,7 @@ typedef struct 
dmu_recv_cookie { boolean_t drc_byteswap; boolean_t drc_force; boolean_t drc_resumable; + boolean_t drc_clone; struct avl_tree *drc_guid_to_ds_map; zio_cksum_t drc_cksum; uint64_t drc_newsnapobj; diff --git a/usr/src/uts/common/fs/zfs/sys/txg_impl.h b/usr/src/uts/common/fs/zfs/sys/txg_impl.h index e583d61eac..bf3b269d70 100644 --- a/usr/src/uts/common/fs/zfs/sys/txg_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/txg_impl.h @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. */ #ifndef _SYS_TXG_IMPL_H @@ -92,6 +92,7 @@ typedef struct tx_state { kmutex_t tx_sync_lock; /* protects the rest of this struct */ uint64_t tx_open_txg; /* currently open txg id */ + uint64_t tx_quiescing_txg; /* currently quiescing txg id */ uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */ uint64_t tx_syncing_txg; /* currently syncing txg id */ uint64_t tx_synced_txg; /* last synced txg id */ diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c index 5578ba5c97..6d2ffe9921 100644 --- a/usr/src/uts/common/fs/zfs/txg.c +++ b/usr/src/uts/common/fs/zfs/txg.c @@ -445,6 +445,30 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) } } +static boolean_t +txg_is_syncing(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_syncing_txg != 0); +} + +static boolean_t +txg_is_quiescing(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_quiescing_txg != 0); +} + +static boolean_t +txg_has_quiesced_to_sync(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_quiesced_txg != 0); +} + static void txg_sync_thread(void *arg) { @@ -471,7 +495,7 @@ txg_sync_thread(void *arg) while (!dsl_scan_active(dp->dp_scan) && !tx->tx_exiting && timer > 0 && tx->tx_synced_txg >= tx->tx_sync_txg_waiting && - tx->tx_quiesced_txg == 0 && + 
!txg_has_quiesced_to_sync(dp) && dp->dp_dirty_total < zfs_dirty_data_sync) { dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); @@ -484,7 +508,7 @@ txg_sync_thread(void *arg) * Wait until the quiesce thread hands off a txg to us, * prompting it to do so if necessary. */ - while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) { + while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) { if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; cv_broadcast(&tx->tx_quiesce_more_cv); @@ -499,6 +523,7 @@ txg_sync_thread(void *arg) * us. This may cause the quiescing thread to now be * able to quiesce another txg, so we must signal it. */ + ASSERT(tx->tx_quiesced_txg != 0); txg = tx->tx_quiesced_txg; tx->tx_quiesced_txg = 0; tx->tx_syncing_txg = txg; @@ -549,7 +574,7 @@ txg_quiesce_thread(void *arg) */ while (!tx->tx_exiting && (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting || - tx->tx_quiesced_txg != 0)) + txg_has_quiesced_to_sync(dp))) txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0); if (tx->tx_exiting) @@ -559,6 +584,8 @@ txg_quiesce_thread(void *arg) dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); + tx->tx_quiescing_txg = txg; + mutex_exit(&tx->tx_sync_lock); txg_quiesce(dp, txg); mutex_enter(&tx->tx_sync_lock); @@ -567,6 +594,7 @@ txg_quiesce_thread(void *arg) * Hand this txg off to the sync thread. 
*/ dprintf("quiesce done, handing off txg %llu\n", txg); + tx->tx_quiescing_txg = 0; tx->tx_quiesced_txg = txg; DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg); cv_broadcast(&tx->tx_sync_more_cv); @@ -664,7 +692,8 @@ txg_kick(dsl_pool_t *dp) ASSERT(!dsl_pool_config_held(dp)); mutex_enter(&tx->tx_sync_lock); - if (tx->tx_syncing_txg == 0 && + if (!txg_is_syncing(dp) && + !txg_is_quiescing(dp) && tx->tx_quiesce_txg_waiting <= tx->tx_open_txg && tx->tx_sync_txg_waiting <= tx->tx_synced_txg && tx->tx_quiesced_txg <= tx->tx_synced_txg) { diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c index 6be167b7b6..03d711838c 100644 --- a/usr/src/uts/common/fs/zfs/zvol.c +++ b/usr/src/uts/common/fs/zfs/zvol.c @@ -129,7 +129,7 @@ typedef struct zvol_state { zilog_t *zv_zilog; /* ZIL handle */ list_t zv_extents; /* List of extents for dump */ znode_t zv_znode; /* for range locking */ - dmu_buf_t *zv_dbuf; /* bonus handle */ + dnode_t *zv_dn; /* dnode hold */ } zvol_state_t; /* @@ -652,7 +652,7 @@ zvol_first_open(zvol_state_t *zv) return (error); } - error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf); + error = dnode_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dn); if (error) { dmu_objset_disown(os, zvol_tag); return (error); @@ -677,8 +677,8 @@ zvol_last_close(zvol_state_t *zv) zil_close(zv->zv_zilog); zv->zv_zilog = NULL; - dmu_buf_rele(zv->zv_dbuf, zvol_tag); - zv->zv_dbuf = NULL; + dnode_rele(zv->zv_dn, zvol_tag); + zv->zv_dn = NULL; /* * Evict cached data @@ -999,8 +999,6 @@ static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zvol_state_t *zv = arg; - objset_t *os = zv->zv_objset; - uint64_t object = ZVOL_OBJ; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; /* length of user data */ dmu_buf_t *db; @@ -1024,7 +1022,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) if (buf != NULL) { /* immediate write */ zgd->zgd_rl = 
zfs_range_lock(&zv->zv_znode, offset, size, RL_READER); - error = dmu_read(os, object, offset, size, buf, + error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, DMU_READ_NO_PREFETCH); } else { /* indirect write */ /* @@ -1037,7 +1035,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) offset = P2ALIGN(offset, size); zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER); - error = dmu_buf_hold(os, object, offset, zgd, &db, + error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; @@ -1104,8 +1102,8 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid, itx = zil_itx_create(TX_WRITE, sizeof (*lr) + (wr_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; - if (wr_state == WR_COPIED && dmu_read(zv->zv_objset, - ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { + if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn, + off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(TX_WRITE, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; @@ -1536,7 +1534,7 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) dmu_tx_abort(tx); break; } - error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx); + error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx); if (error == 0) zvol_log_write(zv, tx, off, bytes, sync); dmu_tx_commit(tx); @@ -1650,7 +1648,7 @@ zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs) int zvol_get_volume_params(minor_t minor, uint64_t *blksize, uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl, - void **rl_hdl, void **bonus_hdl) + void **rl_hdl, void **dnode_hdl) { zvol_state_t *zv; @@ -1661,7 +1659,7 @@ zvol_get_volume_params(minor_t minor, uint64_t *blksize, return (SET_ERROR(ENXIO)); ASSERT(blksize && max_xfer_len && minor_hdl && - objset_hdl && zil_hdl && rl_hdl && bonus_hdl); + objset_hdl && zil_hdl && rl_hdl && dnode_hdl); 
*blksize = zv->zv_volblocksize; *max_xfer_len = (uint64_t)zvol_maxphys; @@ -1669,7 +1667,7 @@ zvol_get_volume_params(minor_t minor, uint64_t *blksize, *objset_hdl = zv->zv_objset; *zil_hdl = zv->zv_zilog; *rl_hdl = &zv->zv_znode; - *bonus_hdl = zv->zv_dbuf; + *dnode_hdl = zv->zv_dn; return (0); } diff --git a/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c b/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c index 0e96e2ec96..bf9a369506 100644 --- a/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c +++ b/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c @@ -59,12 +59,12 @@ * zfs internal interfaces referenced here: * * FUNCTIONS - * dmu_buf_hold_array_by_bonus() + * dmu_buf_hold_array_by_dnode() * dmu_buf_rele_array() * - * dmu_request_arc_buf() + * arc_loan_buf() * dmu_assign_arcbuf() - * dmu_return_arc() + * dmu_return_arcbuf() * arc_buf_size() * * dmu_tx_create() @@ -88,7 +88,7 @@ * zv_objset - dmu_tx_create * zv_zilog - zil_commit * zv_znode - zfs_range_lock - * zv_dbuf - dmu_buf_hold_array_by_bonus, dmu_request_arcbuf + * zv_dn - dmu_buf_hold_array_by_dnode, dmu_assign_arcbuf_dnode * GLOBAL DATA * zvol_maxphys */ @@ -114,7 +114,7 @@ sbd_zvol_get_volume_params(sbd_lu_t *sl) &sl->sl_zvol_objset_hdl, /* dmu_tx_create */ &sl->sl_zvol_zil_hdl, /* zil_commit */ &sl->sl_zvol_rl_hdl, /* zfs_range_lock */ - &sl->sl_zvol_bonus_hdl); /* dmu_buf_hold_array_by_bonus, */ + &sl->sl_zvol_dn_hdl); /* dmu_buf_hold_array_by_dnode, */ /* dmu_request_arcbuf, */ /* dmu_assign_arcbuf */ @@ -153,10 +153,10 @@ int sbd_zvol_alloc_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) { sbd_zvol_io_t *zvio = dbuf->db_lu_private; - rl_t *rl; - int numbufs, error; - uint64_t len = dbuf->db_data_size; - uint64_t offset = zvio->zvio_offset; + rl_t *rl; + int numbufs, error; + uint64_t len = dbuf->db_data_size; + uint64_t offset = zvio->zvio_offset; dmu_buf_t **dbpp, *dbp; /* Make sure request is reasonable */ @@ -171,8 +171,9 @@ sbd_zvol_alloc_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
*/ rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_READER); - error = dmu_buf_hold_array_by_bonus(sl->sl_zvol_bonus_hdl, offset, - len, TRUE, RDTAG, &numbufs, &dbpp); + error = dmu_buf_hold_array_by_dnode(sl->sl_zvol_dn_hdl, + offset, len, TRUE, RDTAG, &numbufs, &dbpp, + DMU_READ_PREFETCH); zfs_range_unlock(rl); @@ -242,8 +243,8 @@ sbd_zvol_alloc_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) uint64_t blksize; arc_buf_t **abp; stmf_sglist_ent_t *sgl; - uint64_t len = dbuf->db_data_size; - uint64_t offset = zvio->zvio_offset; + uint64_t len = dbuf->db_data_size; + uint64_t offset = zvio->zvio_offset; /* Make sure request is reasonable */ if (len > sl->sl_max_xfer_len) @@ -293,7 +294,8 @@ sbd_zvol_alloc_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) if (seglen == 0) seglen = blksize; seglen = MIN(seglen, len); - abp[i] = dmu_request_arcbuf(sl->sl_zvol_bonus_hdl, (int)seglen); + abp[i] = arc_loan_buf(dmu_objset_spa(sl->sl_zvol_objset_hdl), + B_FALSE, (int)seglen); ASSERT(arc_buf_size(abp[i]) == (int)seglen); sgl->seg_addr = abp[i]->b_data; sgl->seg_length = (uint32_t)seglen; @@ -335,7 +337,7 @@ sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) sbd_zvol_io_t *zvio = dbuf->db_lu_private; dmu_tx_t *tx; int sync, i, error; - rl_t *rl; + rl_t *rl; arc_buf_t **abp = zvio->zvio_abp; int flags = zvio->zvio_flags; uint64_t toffset, offset = zvio->zvio_offset; @@ -364,7 +366,8 @@ sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) abuf = abp[i]; size = arc_buf_size(abuf); - dmu_assign_arcbuf(sl->sl_zvol_bonus_hdl, toffset, abuf, tx); + dmu_assign_arcbuf_dnode(sl->sl_zvol_dn_hdl, toffset, abuf, + tx); toffset += size; resid -= size; } @@ -391,7 +394,7 @@ int sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio) { int error; - rl_t *rl; + rl_t *rl; uint64_t len = (uint64_t)uio->uio_resid; uint64_t offset = (uint64_t)uio->uio_loffset; @@ -403,7 +406,7 @@ sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio) rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, 
RL_READER); - error = dmu_read_uio_dbuf(sl->sl_zvol_bonus_hdl, uio, len); + error = dmu_read_uio_dnode(sl->sl_zvol_dn_hdl, uio, len); zfs_range_unlock(rl); if (error == ECKSUM) @@ -418,8 +421,8 @@ sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio) int sbd_zvol_copy_write(sbd_lu_t *sl, uio_t *uio, int flags) { - rl_t *rl; - dmu_tx_t *tx; + rl_t *rl; + dmu_tx_t *tx; int error, sync; uint64_t len = (uint64_t)uio->uio_resid; uint64_t offset = (uint64_t)uio->uio_loffset; @@ -442,7 +445,7 @@ sbd_zvol_copy_write(sbd_lu_t *sl, uio_t *uio, int flags) if (error) { dmu_tx_abort(tx); } else { - error = dmu_write_uio_dbuf(sl->sl_zvol_bonus_hdl, uio, len, tx); + error = dmu_write_uio_dnode(sl->sl_zvol_dn_hdl, uio, len, tx); if (error == 0) { zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset, (ssize_t)len, sync); diff --git a/usr/src/uts/common/io/comstar/lu/stmf_sbd/stmf_sbd.h b/usr/src/uts/common/io/comstar/lu/stmf_sbd/stmf_sbd.h index efbc7268ea..a402ad0ee3 100644 --- a/usr/src/uts/common/io/comstar/lu/stmf_sbd/stmf_sbd.h +++ b/usr/src/uts/common/io/comstar/lu/stmf_sbd/stmf_sbd.h @@ -228,7 +228,7 @@ typedef struct sbd_lu { void *sl_zvol_objset_hdl; void *sl_zvol_zil_hdl; void *sl_zvol_rl_hdl; - void *sl_zvol_bonus_hdl; + void *sl_zvol_dn_hdl; /* Backing store */ char *sl_data_filename; diff --git a/usr/src/uts/i86pc/io/pcplusmp/apic_common.c b/usr/src/uts/i86pc/io/pcplusmp/apic_common.c index 2ae8b5cd92..b57f978f3b 100644 --- a/usr/src/uts/i86pc/io/pcplusmp/apic_common.c +++ b/usr/src/uts/i86pc/io/pcplusmp/apic_common.c @@ -24,7 +24,7 @@ */ /* * Copyright 2018 Joyent, Inc. - * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright (c) 2016, 2017 by Delphix. All rights reserved. */ /* @@ -1072,19 +1072,20 @@ apic_cpu_remove(psm_cpu_request_t *reqp) } /* - * Return the number of APIC clock ticks elapsed for 8245 to decrement - * (APIC_TIME_COUNT + pit_ticks_adj) ticks. + * Return the number of ticks the APIC decrements in SF nanoseconds. 
+ * The fixed-frequency PIT (aka 8254) is used for the measurement. */ -uint_t -apic_calibrate(volatile uint32_t *addr, uint16_t *pit_ticks_adj) +static uint64_t +apic_calibrate_impl() { uint8_t pit_tick_lo; - uint16_t pit_tick, target_pit_tick; - uint32_t start_apic_tick, end_apic_tick; + uint16_t pit_tick, target_pit_tick, pit_ticks_adj; + uint32_t pit_ticks; + uint32_t start_apic_tick, end_apic_tick, apic_ticks; ulong_t iflag; - uint32_t reg; - reg = addr + APIC_CURR_COUNT - apicadr; + apic_reg_ops->apic_write(APIC_DIVIDE_REG, apic_divide_reg_init); + apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL); iflag = intr_clear(); @@ -1095,7 +1096,7 @@ apic_calibrate(volatile uint32_t *addr, uint16_t *pit_ticks_adj) pit_tick_lo <= APIC_LB_MIN || pit_tick_lo >= APIC_LB_MAX); /* - * Wait for the 8254 to decrement by 5 ticks to ensure + * Wait for the PIT to decrement by 5 ticks to ensure * we didn't start in the middle of a tick. * Compare with 0x10 for the wrap around case. */ @@ -1105,11 +1106,10 @@ apic_calibrate(volatile uint32_t *addr, uint16_t *pit_ticks_adj) pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo; } while (pit_tick > target_pit_tick || pit_tick_lo < 0x10); - start_apic_tick = apic_reg_ops->apic_read(reg); + start_apic_tick = apic_reg_ops->apic_read(APIC_CURR_COUNT); /* - * Wait for the 8254 to decrement by - * (APIC_TIME_COUNT + pit_ticks_adj) ticks + * Wait for the PIT to decrement by APIC_TIME_COUNT ticks */ target_pit_tick = pit_tick - APIC_TIME_COUNT; do { @@ -1117,13 +1117,95 @@ apic_calibrate(volatile uint32_t *addr, uint16_t *pit_ticks_adj) pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo; } while (pit_tick > target_pit_tick || pit_tick_lo < 0x10); - end_apic_tick = apic_reg_ops->apic_read(reg); - - *pit_ticks_adj = target_pit_tick - pit_tick; + end_apic_tick = apic_reg_ops->apic_read(APIC_CURR_COUNT); intr_restore(iflag); - return (start_apic_tick - end_apic_tick); + apic_ticks = start_apic_tick - end_apic_tick; + + /* The PIT might have 
decremented by more ticks than planned */ + pit_ticks_adj = target_pit_tick - pit_tick; + /* total number of PIT ticks corresponding to apic_ticks */ + pit_ticks = APIC_TIME_COUNT + pit_ticks_adj; + + /* + * Determine the number of nanoseconds per APIC clock tick + * and then determine how many APIC ticks to interrupt at the + * desired frequency + * apic_ticks / (pitticks / PIT_HZ) = apic_ticks_per_s + * (apic_ticks * PIT_HZ) / pitticks = apic_ticks_per_s + * apic_ticks_per_ns = (apic_ticks * PIT_HZ) / (pitticks * 10^9) + * apic_ticks_per_SFns = + * (SF * apic_ticks * PIT_HZ) / (pitticks * 10^9) + */ + return ((SF * apic_ticks * PIT_HZ) / ((uint64_t)pit_ticks * NANOSEC)); +} + +/* + * It was found empirically that 5 measurements seem sufficient to give a good + * accuracy. Most spurious measurements are higher than the target value thus + * we eliminate up to 2/5 spurious measurements. + */ +#define APIC_CALIBRATE_MEASUREMENTS 5 + +#define APIC_CALIBRATE_PERCENT_OFF_WARNING 10 + +/* + * Return the number of ticks the APIC decrements in SF nanoseconds. + * Several measurements are taken to filter out outliers. + */ +uint64_t +apic_calibrate() +{ + uint64_t measurements[APIC_CALIBRATE_MEASUREMENTS]; + int median_idx; + uint64_t median; + + /* + * When running under a virtual machine, the emulated PIT and APIC + * counters do not always return the right values and can roll over. + * Those spurious measurements are relatively rare but could + * significantly affect the calibration. + * Therefore we take several measurements and then keep the median. + * The median is preferred to the average here as we only want to + * discard outliers. + */ + for (int i = 0; i < APIC_CALIBRATE_MEASUREMENTS; i++) + measurements[i] = apic_calibrate_impl(); + + /* + * sort results and retrieve median. 
+ */ + for (int i = 0; i < APIC_CALIBRATE_MEASUREMENTS; i++) { + for (int j = i + 1; j < APIC_CALIBRATE_MEASUREMENTS; j++) { + if (measurements[j] < measurements[i]) { + uint64_t tmp = measurements[i]; + measurements[i] = measurements[j]; + measurements[j] = tmp; + } + } + } + median_idx = APIC_CALIBRATE_MEASUREMENTS / 2; + median = measurements[median_idx]; + +#if (APIC_CALIBRATE_MEASUREMENTS >= 3) + /* + * Check that measurements are consistent. Post a warning + * if the three middle values are not close to each other. + */ + uint64_t delta_warn = median * + APIC_CALIBRATE_PERCENT_OFF_WARNING / 100; + if ((median - measurements[median_idx - 1]) > delta_warn || + (measurements[median_idx + 1] - median) > delta_warn) { + cmn_err(CE_WARN, "apic_calibrate measurements lack " + "precision: %llu, %llu, %llu.", + (u_longlong_t)measurements[median_idx - 1], + (u_longlong_t)median, + (u_longlong_t)measurements[median_idx + 1]); + } +#endif + + return (median); } /* diff --git a/usr/src/uts/i86pc/io/pcplusmp/apic_timer.c b/usr/src/uts/i86pc/io/pcplusmp/apic_timer.c index 348f5034fc..bc61c114c2 100644 --- a/usr/src/uts/i86pc/io/pcplusmp/apic_timer.c +++ b/usr/src/uts/i86pc/io/pcplusmp/apic_timer.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2017 by Delphix. All rights reserved. */ /* * Copyright (c) 2010, Intel Corporation. 
@@ -90,34 +91,12 @@ static apic_timer_t apic_timer; int apic_timer_init(int hertz) { - uint_t apic_ticks = 0; - uint_t pit_ticks; int ret, timer_mode; - uint16_t pit_ticks_adj; static int firsttime = 1; if (firsttime) { /* first time calibrate on CPU0 only */ - - apic_reg_ops->apic_write(APIC_DIVIDE_REG, apic_divide_reg_init); - apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL); - apic_ticks = apic_calibrate(apicadr, &pit_ticks_adj); - - /* total number of PIT ticks corresponding to apic_ticks */ - pit_ticks = APIC_TIME_COUNT + pit_ticks_adj; - - /* - * Determine the number of nanoseconds per APIC clock tick - * and then determine how many APIC ticks to interrupt at the - * desired frequency - * apic_ticks / (pitticks / PIT_HZ) = apic_ticks_per_s - * (apic_ticks * PIT_HZ) / pitticks = apic_ticks_per_s - * apic_ticks_per_ns = (apic_ticks * PIT_HZ) / (pitticks * 10^9) - * pic_ticks_per_SFns = - * (SF * apic_ticks * PIT_HZ) / (pitticks * 10^9) - */ - apic_ticks_per_SFnsecs = ((SF * apic_ticks * PIT_HZ) / - ((uint64_t)pit_ticks * NANOSEC)); + apic_ticks_per_SFnsecs = apic_calibrate(); /* the interval timer initial count is 32 bit max */ apic_nsec_max = APIC_TICKS_TO_NSECS(APIC_MAXVAL); diff --git a/usr/src/uts/i86pc/sys/apic.h b/usr/src/uts/i86pc/sys/apic.h index 0352a154af..f2528a632f 100644 --- a/usr/src/uts/i86pc/sys/apic.h +++ b/usr/src/uts/i86pc/sys/apic.h @@ -21,6 +21,7 @@ /* * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2018 Joyent, Inc. + * Copyright (c) 2017 by Delphix. All rights reserved. */ /* * Copyright (c) 2010, Intel Corporation. 
@@ -830,7 +831,7 @@ extern int apic_local_mode(); extern void apic_change_eoi(); extern void apic_send_EOI(uint32_t); extern void apic_send_directed_EOI(uint32_t); -extern uint_t apic_calibrate(volatile uint32_t *, uint16_t *); +extern uint64_t apic_calibrate(); extern void x2apic_send_pir_ipi(processorid_t); extern volatile uint32_t *apicadr; /* virtual addr of local APIC */ diff --git a/usr/src/uts/i86pc/sys/apic_common.h b/usr/src/uts/i86pc/sys/apic_common.h index 9c08d73798..dc02031ac3 100644 --- a/usr/src/uts/i86pc/sys/apic_common.h +++ b/usr/src/uts/i86pc/sys/apic_common.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2017 by Delphix. All rights reserved. */ /* * Copyright 2018 Joyent, Inc. @@ -183,7 +184,7 @@ extern void apic_unset_idlecpu(processorid_t cpun); extern void apic_shutdown(int cmd, int fcn); extern void apic_preshutdown(int cmd, int fcn); extern processorid_t apic_get_next_processorid(processorid_t cpun); -extern uint_t apic_calibrate(volatile uint32_t *, uint16_t *); +extern uint64_t apic_calibrate(); extern int apic_get_pir_ipivect(void); extern void apic_send_pir_ipi(processorid_t); |