author     Jerry Jelinek <jerry.jelinek@joyent.com>    2018-05-18 12:15:12 +0000
committer  Jerry Jelinek <jerry.jelinek@joyent.com>    2018-05-18 12:15:12 +0000
commit     49eb54a21b7e8a194ae7ac1204fa1f9b31b9fb78
tree       89b089ff23e0497d39cc5d862562e71fcda9af53 /usr/src
parent     2df613f7b7aa3978e80f8e63f228b7ed64ea5d5a
parent     8dfe5547fbf0979fc1065a8b6fddc1e940a7cf4f
download   illumos-joyent-49eb54a21b7e8a194ae7ac1204fa1f9b31b9fb78.tar.gz
[illumos-gate merge]
commit 8dfe5547fbf0979fc1065a8b6fddc1e940a7cf4f
9539 Make zvol operations use _by_dnode routines
commit f39927996d932d886093624a919a94b0daf5cb83
9511 printf family isn't aware of multibyte decimal point characters (fix regression)
commit 9ee48d48fcedfa1c02bcd16d6abbbfd28b9726c1
8660 mpi code checks return value of void function
commit e87636823fcefbf553fdda979f84ad782e6e2202
9234 reduce apic calibration error by taking multiple measurements
commit 03a4c2f4bfaca30115963b76445279b36468a614
9523 Large alloc in zdb can cause trouble
commit f41179bd376293096297cdc1f32e610d44f65c8b
9521 Add checkpoint field in the default list of the zpool-list man page
commit 7864b8192b8d30471fa2240466d516292e5765b8
9487 Free objects when receiving full stream as clone
commit fa41d87de9ec9000964c605eb01d6dc19e4a1abe
9464 txg_kick() fails to see that we are quiescing, forcing transactions to their next stages without letting them accumulate changes
commit 221813c13b43ef48330b03725e00edee85108cf1
9442 decrease indirect block size of spacemaps
commit 20596fe40e947343459994c3b1bcb68f7c0df52e
9245 zfs-test failures: slog_013_pos and slog_014_pos
Conflicts:
usr/src/uts/i86pc/sys/apic_common.h
usr/src/uts/i86pc/sys/apic.h
Diffstat (limited to 'usr/src')
25 files changed, 340 insertions, 149 deletions
diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c index bdf197ae23..7ccd124dbe 100644 --- a/usr/src/cmd/zdb/zdb.c +++ b/usr/src/cmd/zdb/zdb.c @@ -4909,7 +4909,7 @@ zdb_embedded_block(char *thing) { blkptr_t bp; unsigned long long *words = (void *)&bp; - char buf[SPA_MAXBLOCKSIZE]; + char *buf; int err; bzero(&bp, sizeof (bp)); @@ -4920,16 +4920,22 @@ zdb_embedded_block(char *thing) words + 8, words + 9, words + 10, words + 11, words + 12, words + 13, words + 14, words + 15); if (err != 16) { - (void) printf("invalid input format\n"); + (void) fprintf(stderr, "invalid input format\n"); exit(1); } ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE); + buf = malloc(SPA_MAXBLOCKSIZE); + if (buf == NULL) { + (void) fprintf(stderr, "out of memory\n"); + exit(1); + } err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp)); if (err != 0) { - (void) printf("decode failed: %u\n", err); + (void) fprintf(stderr, "decode failed: %u\n", err); exit(1); } zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0); + free(buf); } static boolean_t diff --git a/usr/src/common/mpi/mpi-priv.h b/usr/src/common/mpi/mpi-priv.h index fa6af6d661..9af654ca1d 100644 --- a/usr/src/common/mpi/mpi-priv.h +++ b/usr/src/common/mpi/mpi-priv.h @@ -46,14 +46,14 @@ * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * + * Copyright 2017 RackTop Systems. + * * Sun elects to use this software under the MPL license. */ #ifndef _MPI_PRIV_H #define _MPI_PRIV_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* $Id: mpi-priv.h,v 1.20 2005/11/22 07:16:43 relyea%netscape.com Exp $ */ #include "mpi.h" @@ -300,7 +300,7 @@ mp_err MPI_ASM_DECL s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, /* c += a * b * (MP_RADIX ** offset); */ #define s_mp_mul_d_add_offset(a, b, c, off) \ -(s_mpv_mul_d_add_prop(MP_DIGITS(a), MP_USED(a), b, MP_DIGITS(c) + off), MP_OKAY) + s_mpv_mul_d_add_prop(MP_DIGITS(a), MP_USED(a), b, MP_DIGITS(c) + off) typedef struct { mp_int N; /* modulus N */ diff --git a/usr/src/common/mpi/mpmontg.c b/usr/src/common/mpi/mpmontg.c index 33aea8b0d6..150bd2d37f 100644 --- a/usr/src/common/mpi/mpmontg.c +++ b/usr/src/common/mpi/mpmontg.c @@ -40,11 +40,11 @@ * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * + * Copyright 2017 RackTop Systems. + * * Sun elects to use this software under the MPL license. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /* $Id: mpmontg.c,v 1.20 2006/08/29 02:41:38 nelson%bolyard.com Exp $ */ /* This file implements moduluar exponentiation using Montgomery's @@ -104,7 +104,7 @@ mp_err s_mp_redc(mp_int *T, mp_mont_modulus *mmm) for (i = 0; i < MP_USED(&mmm->N); ++i ) { mp_digit m_i = MP_DIGIT(T, i) * mmm->n0prime; /* T += N * m_i * (MP_RADIX ** i); */ - MP_CHECKOK( s_mp_mul_d_add_offset(&mmm->N, m_i, T, i) ); + s_mp_mul_d_add_offset(&mmm->N, m_i, T, i); } s_mp_clamp(T); diff --git a/usr/src/lib/libc/port/print/doprnt.c b/usr/src/lib/libc/port/print/doprnt.c index b6e8ceef0a..e8cfaad6de 100644 --- a/usr/src/lib/libc/port/print/doprnt.c +++ b/usr/src/lib/libc/port/print/doprnt.c @@ -1574,7 +1574,7 @@ _ndoprnt(const char *format, va_list in_args, FILE *iop, int prflag) p = insert_thousands_sep(buf, p); /* Put in a decimal point if needed */ - if (prec != 0 || (flagword & FSHARP)) + if (prec > 0 || (flagword & FSHARP)) p = insert_decimal_point(p); /* Digits (if any) after the decimal point */ diff --git a/usr/src/man/man1m/zpool.1m b/usr/src/man/man1m/zpool.1m index 728f73de9d..1e7b2b8cfc 100644 --- a/usr/src/man/man1m/zpool.1m +++ b/usr/src/man/man1m/zpool.1m @@ -26,7 +26,7 @@ .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2017 George Melikov. All Rights Reserved. .\" -.Dd December 6, 2017 +.Dd April 27, 2018 .Dt ZPOOL 1M .Os .Sh NAME @@ -1450,7 +1450,7 @@ See the .Sx Properties section for a list of valid properties. The default list is -.Cm name , size , allocated , free , expandsize , fragmentation , capacity , +.Cm name , size , allocated , free , checkpoint, expandsize , fragmentation , capacity , .Cm dedupratio , health , altroot . .It Fl p Display numbers in parsable diff --git a/usr/src/test/zfs-tests/include/libtest.shlib b/usr/src/test/zfs-tests/include/libtest.shlib index dc4c4e253b..512bb069f9 100644 --- a/usr/src/test/zfs-tests/include/libtest.shlib +++ b/usr/src/test/zfs-tests/include/libtest.shlib @@ -1675,6 +1675,25 @@ function is_pool_removed #pool return $? } +function wait_for_degraded +{ + typeset pool=$1 + typeset timeout=${2:-30} + typeset t0=$SECONDS + + while :; do + [[ $(get_pool_prop health $pool) == "DEGRADED" ]] && break + log_note "$pool is not yet degraded." + sleep 1 + if ((SECONDS - t0 > $timeout)); then + log_note "$pool not degraded after $timeout seconds." + return 1 + fi + done + + return 0 +} + # # Use create_pool()/destroy_pool() to clean up the infomation in # in the given disk to avoid slice overlapping. 
diff --git a/usr/src/test/zfs-tests/tests/functional/slog/slog.kshlib b/usr/src/test/zfs-tests/tests/functional/slog/slog.kshlib index a6d82f28d8..493ceda60d 100644 --- a/usr/src/test/zfs-tests/tests/functional/slog/slog.kshlib +++ b/usr/src/test/zfs-tests/tests/functional/slog/slog.kshlib @@ -33,12 +33,8 @@ function cleanup { - if datasetexists $TESTPOOL ; then - log_must zpool destroy -f $TESTPOOL - fi - if datasetexists $TESTPOOL2 ; then - log_must zpool destroy -f $TESTPOOL2 - fi + poolexists $TESTPOOL && destroy_pool $TESTPOOL + poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2 } # diff --git a/usr/src/test/zfs-tests/tests/functional/slog/slog_013_pos.ksh b/usr/src/test/zfs-tests/tests/functional/slog/slog_013_pos.ksh index 1ebb34fdda..3607da7928 100644 --- a/usr/src/test/zfs-tests/tests/functional/slog/slog_013_pos.ksh +++ b/usr/src/test/zfs-tests/tests/functional/slog/slog_013_pos.ksh @@ -47,12 +47,7 @@ verify_runnable "global" function cleanup_testenv { cleanup - if datasetexists $TESTPOOL2 ; then - log_must zpool destroy -f $TESTPOOL2 - fi - if [[ -n $lofidev ]]; then - lofiadm -d $lofidev - fi + [[ -n $lofidev ]] && $LOFIADM -d $lofidev } log_assert "Verify slog device can be disk, file, lofi device or any device " \ @@ -80,13 +75,3 @@ log_must verify_slog_device $TESTPOOL $lofidev 'ONLINE' log_pass "Verify slog device can be disk, file, lofi device or any device " \ "that presents a block interface." - -# Add file which reside in the itself -mntpnt=$(get_prop mountpoint $TESTPOOL) -log_must mkfile $MINVDEVSIZE $mntpnt/vdev -log_must zpool add $TESTPOOL $mntpnt/vdev - -# Add ZFS volume -vol=$TESTPOOL/vol -log_must zpool create -V $MINVDEVSIZE $vol -log_must zpool add $TESTPOOL /dev/zvol/dsk/$vol diff --git a/usr/src/test/zfs-tests/tests/functional/slog/slog_014_pos.ksh b/usr/src/test/zfs-tests/tests/functional/slog/slog_014_pos.ksh index 621ac23aa9..0190479e44 100644 --- a/usr/src/test/zfs-tests/tests/functional/slog/slog_014_pos.ksh +++ b/usr/src/test/zfs-tests/tests/functional/slog/slog_014_pos.ksh @@ -45,11 +45,9 @@ verify_runnable "global" log_assert "log device can survive when one of the pool device get corrupted." -for type in "mirror" "raidz" "raidz2" -do - for spare in "" "spare" - do - log_must zpool create $TESTPOOL $type $VDEV $spare $SDEV \ +for type in "mirror" "raidz" "raidz2"; do + for spare in "" "spare"; do + log_must $ZPOOL create $TESTPOOL $type $VDEV $spare $SDEV \ log $LDEV # Create a file to be corrupted @@ -69,13 +67,8 @@ do conv=notrunc count=50 log_must zpool scrub $TESTPOOL log_must display_status $TESTPOOL - log_must zpool status $TESTPOOL 2>&1 >/dev/null - zpool status -v $TESTPOOL | \ - grep "state: DEGRADED" 2>&1 >/dev/null - if (( $? != 0 )); then - log_fail "pool $TESTPOOL status should be DEGRADED" - fi + log_must wait_for_degraded $TESTPOOL zpool status -v $TESTPOOL | grep logs | \ grep "DEGRADED" 2>&1 >/dev/null diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index 6b75289537..dfb78321dc 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -443,7 +443,7 @@ dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) * and can induce severe lock contention when writing to several files * whose dnodes are in the same block. 
*/ -static int +int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { @@ -1302,7 +1302,7 @@ xuio_stat_wbuf_nocopy(void) } #ifdef _KERNEL -static int +int dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) { dmu_buf_t **dbp; @@ -1411,7 +1411,7 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) return (err); } -static int +int dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; @@ -1600,22 +1600,17 @@ dmu_return_arcbuf(arc_buf_t *buf) * dmu_write(). */ void -dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, +dmu_assign_arcbuf_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { - dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; - dnode_t *dn; dmu_buf_impl_t *db; uint32_t blksz = (uint32_t)arc_buf_lsize(buf); uint64_t blkid; - DB_DNODE_ENTER(dbuf); - dn = DB_DNODE(dbuf); rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); rw_exit(&dn->dn_struct_rwlock); - DB_DNODE_EXIT(dbuf); /* * We can only assign if the offset is aligned, the arc buf is the @@ -1632,11 +1627,8 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); - DB_DNODE_ENTER(dbuf); - dn = DB_DNODE(dbuf); os = dn->dn_objset; object = dn->dn_object; - DB_DNODE_EXIT(dbuf); dbuf_rele(db, FTAG); dmu_write(os, object, offset, blksz, buf->b_data, tx); @@ -1645,6 +1637,17 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, } } +void +dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, + dmu_tx_t *tx) +{ + dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; + + DB_DNODE_ENTER(dbuf); + dmu_assign_arcbuf_dnode(DB_DNODE(dbuf), offset, buf, tx); + DB_DNODE_EXIT(dbuf); +} + typedef struct { dbuf_dirty_record_t *dsa_dr; dmu_sync_cb_t *dsa_done; diff --git a/usr/src/uts/common/fs/zfs/dmu_object.c b/usr/src/uts/common/fs/zfs/dmu_object.c index 9f0f2b437c..aede315502 100644 --- a/usr/src/uts/common/fs/zfs/dmu_object.c +++ b/usr/src/uts/common/fs/zfs/dmu_object.c @@ -32,7 +32,8 @@ #include <sys/zfeature.h> uint64_t -dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, +dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, + int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { uint64_t object; @@ -92,7 +93,8 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, os->os_obj_next = object - 1; } - dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); + dnode_allocate(dn, ot, blocksize, indirect_blockshift, + bonustype, bonuslen, tx); mutex_exit(&os->os_obj_lock); dmu_tx_add_new_object(tx, dn); @@ -101,6 +103,14 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, return (object); } +uint64_t +dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return (dmu_object_alloc_ibs(os, ot, blocksize, 0, + bonustype, bonuslen, tx)); +} + int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c index 43010e2964..026623f3d5 100644 --- 
a/usr/src/uts/common/fs/zfs/dmu_send.c +++ b/usr/src/uts/common/fs/zfs/dmu_send.c @@ -1757,6 +1757,7 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, drc->drc_force = force; drc->drc_resumable = resumable; drc->drc_cred = CRED(); + drc->drc_clone = (origin != NULL); if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { drc->drc_byteswap = B_TRUE; @@ -1817,7 +1818,9 @@ struct receive_writer_arg { /* A map from guid to dataset to help handle dedup'd streams. */ avl_tree_t *guid_to_ds_map; boolean_t resumable; - uint64_t last_object, last_offset; + uint64_t last_object; + uint64_t last_offset; + uint64_t max_object; /* highest object ID referenced in stream */ uint64_t bytes_read; /* bytes read when current record created */ }; @@ -2089,6 +2092,9 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, return (SET_ERROR(EINVAL)); object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT; + if (drro->drr_object > rwa->max_object) + rwa->max_object = drro->drr_object; + /* * If we are losing blkptrs or changing the block size this must * be a new file instance. We must clear out the previous file @@ -2184,6 +2190,9 @@ receive_freeobjects(struct receive_writer_arg *rwa, err = dmu_free_long_object(rwa->os, obj); if (err != 0) return (err); + + if (obj > rwa->max_object) + rwa->max_object = obj; } if (next_err != ESRCH) return (next_err); @@ -2213,6 +2222,9 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, rwa->last_object = drrw->drr_object; rwa->last_offset = drrw->drr_offset; + if (rwa->last_object > rwa->max_object) + rwa->max_object = rwa->last_object; + if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); @@ -2289,6 +2301,9 @@ receive_write_byref(struct receive_writer_arg *rwa, ref_os = rwa->os; } + if (drrwbr->drr_object > rwa->max_object) + rwa->max_object = drrwbr->drr_object; + err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH); if (err != 0) @@ -2331,6 +2346,9 @@ receive_write_embedded(struct receive_writer_arg *rwa, if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS) return (EINVAL); + if (drrwe->drr_object > rwa->max_object) + rwa->max_object = drrwe->drr_object; + tx = dmu_tx_create(rwa->os); dmu_tx_hold_write(tx, drrwe->drr_object, @@ -2367,6 +2385,9 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); + if (drrs->drr_object > rwa->max_object) + rwa->max_object = drrs->drr_object; + VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db)); if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { dmu_buf_rele(db, FTAG); @@ -2411,6 +2432,9 @@ receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); + if (drrf->drr_object > rwa->max_object) + rwa->max_object = drrf->drr_object; + err = dmu_free_long_range(rwa->os, drrf->drr_object, drrf->drr_offset, drrf->drr_length); @@ -3033,6 +3057,41 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, } mutex_exit(&rwa.mutex); + /* + * If we are receiving a full stream as a clone, all object IDs which + * are greater than the maximum ID referenced in the stream are + * by definition unused and must be freed. Note that it's possible that + * we've resumed this send and the first record we received was the END + * record. 
In that case, max_object would be 0, but we shouldn't start + * freeing all objects from there; instead we should start from the + * resumeobj. + */ + if (drc->drc_clone && drc->drc_drrb->drr_fromguid == 0) { + uint64_t obj; + if (nvlist_lookup_uint64(begin_nvl, "resume_object", &obj) != 0) + obj = 0; + if (rwa.max_object > obj) + obj = rwa.max_object; + obj++; + int free_err = 0; + int next_err = 0; + + while (next_err == 0) { + free_err = dmu_free_long_object(rwa.os, obj); + if (free_err != 0 && free_err != ENOENT) + break; + + next_err = dmu_object_next(rwa.os, &obj, FALSE, 0); + } + + if (err == 0) { + if (free_err != 0 && free_err != ENOENT) + err = free_err; + else if (next_err != ESRCH) + err = next_err; + } + } + cv_destroy(&rwa.cv); mutex_destroy(&rwa.mutex); bqueue_destroy(&rwa.q); diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c index 879a820a92..549c3ca1b0 100644 --- a/usr/src/uts/common/fs/zfs/dmu_tx.c +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c @@ -1087,7 +1087,12 @@ dmu_tx_wait(dmu_tx_t *tx) mutex_exit(&dn->dn_mtx); tx->tx_needassign_txh = NULL; } else { - txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1); + /* + * If we have a lot of dirty data just wait until we sync + * out a TXG at which point we'll hopefully have synced + * a portion of the changes. + */ + txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); } } diff --git a/usr/src/uts/common/fs/zfs/space_map.c b/usr/src/uts/common/fs/zfs/space_map.c index b42d449c77..e85a85f913 100644 --- a/usr/src/uts/common/fs/zfs/space_map.c +++ b/usr/src/uts/common/fs/zfs/space_map.c @@ -52,6 +52,14 @@ */ boolean_t zfs_force_some_double_word_sm_entries = B_FALSE; +/* + * Override the default indirect block size of 128K, instead using 16K for + * spacemaps (2^14 bytes). This dramatically reduces write inflation since + * appending to a spacemap typically has to write one data block (4KB) and one + * or two indirect blocks (16K-32K, rather than 128K). + */ +int space_map_ibs = 14; + boolean_t sm_entry_is_debug(uint64_t e) { @@ -674,8 +682,8 @@ space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, * * [1] The feature is enabled. * [2] The offset or run is too big for a single-word entry, - * or the vdev_id is set (meaning not equal to - * SM_NO_VDEVID). + * or the vdev_id is set (meaning not equal to + * SM_NO_VDEVID). 
* * Note that for purposes of testing we've added the case that * we write two-word entries occasionally when the feature is @@ -834,7 +842,8 @@ space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx) */ if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && doi.doi_bonus_size != sizeof (space_map_phys_t)) || - doi.doi_data_block_size != blocksize) { + doi.doi_data_block_size != blocksize || + doi.doi_metadata_block_size != 1 << space_map_ibs) { zfs_dbgmsg("txg %llu, spa %s, sm %p, reallocating " "object[%llu]: old bonus %u, old blocksz %u", dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object, @@ -890,8 +899,8 @@ space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) bonuslen = SPACE_MAP_SIZE_V0; } - object = dmu_object_alloc(os, DMU_OT_SPACE_MAP, blocksize, - DMU_OT_SPACE_MAP_HEADER, bonuslen, tx); + object = dmu_object_alloc_ibs(os, DMU_OT_SPACE_MAP, blocksize, + space_map_ibs, DMU_OT_SPACE_MAP_HEADER, bonuslen, tx); return (object); } diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h index 6a33cb7d81..52238bc735 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h @@ -354,6 +354,9 @@ typedef struct dmu_buf { */ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); +uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, + int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, @@ -514,6 +517,9 @@ uint64_t dmu_buf_refcount(dmu_buf_t *db); int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); +int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, + boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, + uint32_t flags); void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); typedef void dmu_buf_evict_func_t(void *user_ptr); @@ -752,14 +758,19 @@ void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size); +int dmu_read_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size); int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size, dmu_tx_t *tx); +int dmu_write_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size, + dmu_tx_t *tx); int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, struct page *pp, dmu_tx_t *tx); struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); void dmu_return_arcbuf(struct arc_buf *buf); +void dmu_assign_arcbuf_dnode(dnode_t *handle, uint64_t offset, + struct arc_buf *buf, dmu_tx_t *tx); void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); int dmu_xuio_init(struct xuio *uio, int niov); diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_send.h b/usr/src/uts/common/fs/zfs/sys/dmu_send.h index 38b1b042e5..b8403313e9 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_send.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_send.h @@ 
-63,6 +63,7 @@ typedef struct dmu_recv_cookie { boolean_t drc_byteswap; boolean_t drc_force; boolean_t drc_resumable; + boolean_t drc_clone; struct avl_tree *drc_guid_to_ds_map; zio_cksum_t drc_cksum; uint64_t drc_newsnapobj; diff --git a/usr/src/uts/common/fs/zfs/sys/txg_impl.h b/usr/src/uts/common/fs/zfs/sys/txg_impl.h index e583d61eac..bf3b269d70 100644 --- a/usr/src/uts/common/fs/zfs/sys/txg_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/txg_impl.h @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. */ #ifndef _SYS_TXG_IMPL_H @@ -92,6 +92,7 @@ typedef struct tx_state { kmutex_t tx_sync_lock; /* protects the rest of this struct */ uint64_t tx_open_txg; /* currently open txg id */ + uint64_t tx_quiescing_txg; /* currently quiescing txg id */ uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */ uint64_t tx_syncing_txg; /* currently syncing txg id */ uint64_t tx_synced_txg; /* last synced txg id */ diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c index 5578ba5c97..6d2ffe9921 100644 --- a/usr/src/uts/common/fs/zfs/txg.c +++ b/usr/src/uts/common/fs/zfs/txg.c @@ -445,6 +445,30 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) } } +static boolean_t +txg_is_syncing(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_syncing_txg != 0); +} + +static boolean_t +txg_is_quiescing(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_quiescing_txg != 0); +} + +static boolean_t +txg_has_quiesced_to_sync(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_quiesced_txg != 0); +} + static void txg_sync_thread(void *arg) { @@ -471,7 +495,7 @@ txg_sync_thread(void *arg) while (!dsl_scan_active(dp->dp_scan) && !tx->tx_exiting && timer > 0 && tx->tx_synced_txg >= tx->tx_sync_txg_waiting && - tx->tx_quiesced_txg == 0 && + !txg_has_quiesced_to_sync(dp) && dp->dp_dirty_total < zfs_dirty_data_sync) { dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); @@ -484,7 +508,7 @@ txg_sync_thread(void *arg) * Wait until the quiesce thread hands off a txg to us, * prompting it to do so if necessary. */ - while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) { + while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) { if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; cv_broadcast(&tx->tx_quiesce_more_cv); @@ -499,6 +523,7 @@ txg_sync_thread(void *arg) * us. This may cause the quiescing thread to now be * able to quiesce another txg, so we must signal it. */ + ASSERT(tx->tx_quiesced_txg != 0); txg = tx->tx_quiesced_txg; tx->tx_quiesced_txg = 0; tx->tx_syncing_txg = txg; @@ -549,7 +574,7 @@ txg_quiesce_thread(void *arg) */ while (!tx->tx_exiting && (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting || - tx->tx_quiesced_txg != 0)) + txg_has_quiesced_to_sync(dp))) txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0); if (tx->tx_exiting) @@ -559,6 +584,8 @@ txg_quiesce_thread(void *arg) dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); + tx->tx_quiescing_txg = txg; + mutex_exit(&tx->tx_sync_lock); txg_quiesce(dp, txg); mutex_enter(&tx->tx_sync_lock); @@ -567,6 +594,7 @@ txg_quiesce_thread(void *arg) * Hand this txg off to the sync thread. 
*/ dprintf("quiesce done, handing off txg %llu\n", txg); + tx->tx_quiescing_txg = 0; tx->tx_quiesced_txg = txg; DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg); cv_broadcast(&tx->tx_sync_more_cv); @@ -664,7 +692,8 @@ txg_kick(dsl_pool_t *dp) ASSERT(!dsl_pool_config_held(dp)); mutex_enter(&tx->tx_sync_lock); - if (tx->tx_syncing_txg == 0 && + if (!txg_is_syncing(dp) && + !txg_is_quiescing(dp) && tx->tx_quiesce_txg_waiting <= tx->tx_open_txg && tx->tx_sync_txg_waiting <= tx->tx_synced_txg && tx->tx_quiesced_txg <= tx->tx_synced_txg) { diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c index 6be167b7b6..03d711838c 100644 --- a/usr/src/uts/common/fs/zfs/zvol.c +++ b/usr/src/uts/common/fs/zfs/zvol.c @@ -129,7 +129,7 @@ typedef struct zvol_state { zilog_t *zv_zilog; /* ZIL handle */ list_t zv_extents; /* List of extents for dump */ znode_t zv_znode; /* for range locking */ - dmu_buf_t *zv_dbuf; /* bonus handle */ + dnode_t *zv_dn; /* dnode hold */ } zvol_state_t; /* @@ -652,7 +652,7 @@ zvol_first_open(zvol_state_t *zv) return (error); } - error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf); + error = dnode_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dn); if (error) { dmu_objset_disown(os, zvol_tag); return (error); @@ -677,8 +677,8 @@ zvol_last_close(zvol_state_t *zv) zil_close(zv->zv_zilog); zv->zv_zilog = NULL; - dmu_buf_rele(zv->zv_dbuf, zvol_tag); - zv->zv_dbuf = NULL; + dnode_rele(zv->zv_dn, zvol_tag); + zv->zv_dn = NULL; /* * Evict cached data @@ -999,8 +999,6 @@ static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zvol_state_t *zv = arg; - objset_t *os = zv->zv_objset; - uint64_t object = ZVOL_OBJ; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; /* length of user data */ dmu_buf_t *db; @@ -1024,7 +1022,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) if (buf != NULL) { /* immediate write */ zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER); - error = dmu_read(os, object, offset, size, buf, + error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, DMU_READ_NO_PREFETCH); } else { /* indirect write */ /* @@ -1037,7 +1035,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) offset = P2ALIGN(offset, size); zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER); - error = dmu_buf_hold(os, object, offset, zgd, &db, + error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; @@ -1104,8 +1102,8 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid, itx = zil_itx_create(TX_WRITE, sizeof (*lr) + (wr_state == WR_COPIED ? 
len : 0)); lr = (lr_write_t *)&itx->itx_lr; - if (wr_state == WR_COPIED && dmu_read(zv->zv_objset, - ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { + if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn, + off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(TX_WRITE, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; @@ -1536,7 +1534,7 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) dmu_tx_abort(tx); break; } - error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx); + error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx); if (error == 0) zvol_log_write(zv, tx, off, bytes, sync); dmu_tx_commit(tx); @@ -1650,7 +1648,7 @@ zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs) int zvol_get_volume_params(minor_t minor, uint64_t *blksize, uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl, - void **rl_hdl, void **bonus_hdl) + void **rl_hdl, void **dnode_hdl) { zvol_state_t *zv; @@ -1661,7 +1659,7 @@ zvol_get_volume_params(minor_t minor, uint64_t *blksize, return (SET_ERROR(ENXIO)); ASSERT(blksize && max_xfer_len && minor_hdl && - objset_hdl && zil_hdl && rl_hdl && bonus_hdl); + objset_hdl && zil_hdl && rl_hdl && dnode_hdl); *blksize = zv->zv_volblocksize; *max_xfer_len = (uint64_t)zvol_maxphys; @@ -1669,7 +1667,7 @@ zvol_get_volume_params(minor_t minor, uint64_t *blksize, *objset_hdl = zv->zv_objset; *zil_hdl = zv->zv_zilog; *rl_hdl = &zv->zv_znode; - *bonus_hdl = zv->zv_dbuf; + *dnode_hdl = zv->zv_dn; return (0); } diff --git a/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c b/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c index 0e96e2ec96..bf9a369506 100644 --- a/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c +++ b/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c @@ -59,12 +59,12 @@ * zfs internal interfaces referenced here: * * FUNCTIONS - * dmu_buf_hold_array_by_bonus() + * dmu_buf_hold_array_by_dnode() * dmu_buf_rele_array() * - * dmu_request_arc_buf() + * arc_loan_buf() * dmu_assign_arcbuf() - * dmu_return_arc() + * dmu_return_arcbuf() * arc_buf_size() * * dmu_tx_create() @@ -88,7 +88,7 @@ * zv_objset - dmu_tx_create * zv_zilog - zil_commit * zv_znode - zfs_range_lock - * zv_dbuf - dmu_buf_hold_array_by_bonus, dmu_request_arcbuf + * zv_dn - dmu_buf_hold_array_by_bonus, dmu_request_arcbuf * GLOBAL DATA * zvol_maxphys */ @@ -114,7 +114,7 @@ sbd_zvol_get_volume_params(sbd_lu_t *sl) &sl->sl_zvol_objset_hdl, /* dmu_tx_create */ &sl->sl_zvol_zil_hdl, /* zil_commit */ &sl->sl_zvol_rl_hdl, /* zfs_range_lock */ - &sl->sl_zvol_bonus_hdl); /* dmu_buf_hold_array_by_bonus, */ + &sl->sl_zvol_dn_hdl); /* dmu_buf_hold_array_by_dnode, */ /* dmu_request_arcbuf, */ /* dmu_assign_arcbuf */ @@ -153,10 +153,10 @@ int sbd_zvol_alloc_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) { sbd_zvol_io_t *zvio = dbuf->db_lu_private; - rl_t *rl; - int numbufs, error; - uint64_t len = dbuf->db_data_size; - uint64_t offset = zvio->zvio_offset; + rl_t *rl; + int numbufs, error; + uint64_t len = dbuf->db_data_size; + uint64_t offset = zvio->zvio_offset; dmu_buf_t **dbpp, *dbp; /* Make sure request is reasonable */ @@ -171,8 +171,9 @@ sbd_zvol_alloc_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) */ rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_READER); - error = dmu_buf_hold_array_by_bonus(sl->sl_zvol_bonus_hdl, offset, - len, TRUE, RDTAG, &numbufs, &dbpp); + error = dmu_buf_hold_array_by_dnode(sl->sl_zvol_dn_hdl, + offset, len, TRUE, RDTAG, &numbufs, &dbpp, + DMU_READ_PREFETCH); zfs_range_unlock(rl); @@ -242,8 
+243,8 @@ sbd_zvol_alloc_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) uint64_t blksize; arc_buf_t **abp; stmf_sglist_ent_t *sgl; - uint64_t len = dbuf->db_data_size; - uint64_t offset = zvio->zvio_offset; + uint64_t len = dbuf->db_data_size; + uint64_t offset = zvio->zvio_offset; /* Make sure request is reasonable */ if (len > sl->sl_max_xfer_len) @@ -293,7 +294,8 @@ sbd_zvol_alloc_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) if (seglen == 0) seglen = blksize; seglen = MIN(seglen, len); - abp[i] = dmu_request_arcbuf(sl->sl_zvol_bonus_hdl, (int)seglen); + abp[i] = arc_loan_buf(dmu_objset_spa(sl->sl_zvol_objset_hdl), + B_FALSE, (int)seglen); ASSERT(arc_buf_size(abp[i]) == (int)seglen); sgl->seg_addr = abp[i]->b_data; sgl->seg_length = (uint32_t)seglen; @@ -335,7 +337,7 @@ sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) sbd_zvol_io_t *zvio = dbuf->db_lu_private; dmu_tx_t *tx; int sync, i, error; - rl_t *rl; + rl_t *rl; arc_buf_t **abp = zvio->zvio_abp; int flags = zvio->zvio_flags; uint64_t toffset, offset = zvio->zvio_offset; @@ -364,7 +366,8 @@ sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) abuf = abp[i]; size = arc_buf_size(abuf); - dmu_assign_arcbuf(sl->sl_zvol_bonus_hdl, toffset, abuf, tx); + dmu_assign_arcbuf_dnode(sl->sl_zvol_dn_hdl, toffset, abuf, + tx); toffset += size; resid -= size; } @@ -391,7 +394,7 @@ int sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio) { int error; - rl_t *rl; + rl_t *rl; uint64_t len = (uint64_t)uio->uio_resid; uint64_t offset = (uint64_t)uio->uio_loffset; @@ -403,7 +406,7 @@ sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio) rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_READER); - error = dmu_read_uio_dbuf(sl->sl_zvol_bonus_hdl, uio, len); + error = dmu_read_uio_dnode(sl->sl_zvol_dn_hdl, uio, len); zfs_range_unlock(rl); if (error == ECKSUM) @@ -418,8 +421,8 @@ sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio) int sbd_zvol_copy_write(sbd_lu_t *sl, uio_t *uio, int flags) { - rl_t *rl; - dmu_tx_t *tx; + rl_t *rl; + dmu_tx_t *tx; int error, sync; uint64_t len = (uint64_t)uio->uio_resid; uint64_t offset = (uint64_t)uio->uio_loffset; @@ -442,7 +445,7 @@ sbd_zvol_copy_write(sbd_lu_t *sl, uio_t *uio, int flags) if (error) { dmu_tx_abort(tx); } else { - error = dmu_write_uio_dbuf(sl->sl_zvol_bonus_hdl, uio, len, tx); + error = dmu_write_uio_dnode(sl->sl_zvol_dn_hdl, uio, len, tx); if (error == 0) { zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset, (ssize_t)len, sync); diff --git a/usr/src/uts/common/io/comstar/lu/stmf_sbd/stmf_sbd.h b/usr/src/uts/common/io/comstar/lu/stmf_sbd/stmf_sbd.h index efbc7268ea..a402ad0ee3 100644 --- a/usr/src/uts/common/io/comstar/lu/stmf_sbd/stmf_sbd.h +++ b/usr/src/uts/common/io/comstar/lu/stmf_sbd/stmf_sbd.h @@ -228,7 +228,7 @@ typedef struct sbd_lu { void *sl_zvol_objset_hdl; void *sl_zvol_zil_hdl; void *sl_zvol_rl_hdl; - void *sl_zvol_bonus_hdl; + void *sl_zvol_dn_hdl; /* Backing store */ char *sl_data_filename; diff --git a/usr/src/uts/i86pc/io/pcplusmp/apic_common.c b/usr/src/uts/i86pc/io/pcplusmp/apic_common.c index 2ae8b5cd92..b57f978f3b 100644 --- a/usr/src/uts/i86pc/io/pcplusmp/apic_common.c +++ b/usr/src/uts/i86pc/io/pcplusmp/apic_common.c @@ -24,7 +24,7 @@ */ /* * Copyright 2018 Joyent, Inc. - * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright (c) 2016, 2017 by Delphix. All rights reserved. 
*/ /* @@ -1072,19 +1072,20 @@ apic_cpu_remove(psm_cpu_request_t *reqp) } /* - * Return the number of APIC clock ticks elapsed for 8245 to decrement - * (APIC_TIME_COUNT + pit_ticks_adj) ticks. + * Return the number of ticks the APIC decrements in SF nanoseconds. + * The fixed-frequency PIT (aka 8254) is used for the measurement. */ -uint_t -apic_calibrate(volatile uint32_t *addr, uint16_t *pit_ticks_adj) +static uint64_t +apic_calibrate_impl() { uint8_t pit_tick_lo; - uint16_t pit_tick, target_pit_tick; - uint32_t start_apic_tick, end_apic_tick; + uint16_t pit_tick, target_pit_tick, pit_ticks_adj; + uint32_t pit_ticks; + uint32_t start_apic_tick, end_apic_tick, apic_ticks; ulong_t iflag; - uint32_t reg; - reg = addr + APIC_CURR_COUNT - apicadr; + apic_reg_ops->apic_write(APIC_DIVIDE_REG, apic_divide_reg_init); + apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL); iflag = intr_clear(); @@ -1095,7 +1096,7 @@ apic_calibrate(volatile uint32_t *addr, uint16_t *pit_ticks_adj) pit_tick_lo <= APIC_LB_MIN || pit_tick_lo >= APIC_LB_MAX); /* - * Wait for the 8254 to decrement by 5 ticks to ensure + * Wait for the PIT to decrement by 5 ticks to ensure * we didn't start in the middle of a tick. * Compare with 0x10 for the wrap around case. */ @@ -1105,11 +1106,10 @@ apic_calibrate(volatile uint32_t *addr, uint16_t *pit_ticks_adj) pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo; } while (pit_tick > target_pit_tick || pit_tick_lo < 0x10); - start_apic_tick = apic_reg_ops->apic_read(reg); + start_apic_tick = apic_reg_ops->apic_read(APIC_CURR_COUNT); /* - * Wait for the 8254 to decrement by - * (APIC_TIME_COUNT + pit_ticks_adj) ticks + * Wait for the PIT to decrement by APIC_TIME_COUNT ticks */ target_pit_tick = pit_tick - APIC_TIME_COUNT; do { @@ -1117,13 +1117,95 @@ apic_calibrate(volatile uint32_t *addr, uint16_t *pit_ticks_adj) pit_tick = (inb(PITCTR0_PORT) << 8) | pit_tick_lo; } while (pit_tick > target_pit_tick || pit_tick_lo < 0x10); - end_apic_tick = apic_reg_ops->apic_read(reg); - - *pit_ticks_adj = target_pit_tick - pit_tick; + end_apic_tick = apic_reg_ops->apic_read(APIC_CURR_COUNT); intr_restore(iflag); - return (start_apic_tick - end_apic_tick); + apic_ticks = start_apic_tick - end_apic_tick; + + /* The PIT might have decremented by more ticks than planned */ + pit_ticks_adj = target_pit_tick - pit_tick; + /* total number of PIT ticks corresponding to apic_ticks */ + pit_ticks = APIC_TIME_COUNT + pit_ticks_adj; + + /* + * Determine the number of nanoseconds per APIC clock tick + * and then determine how many APIC ticks to interrupt at the + * desired frequency + * apic_ticks / (pitticks / PIT_HZ) = apic_ticks_per_s + * (apic_ticks * PIT_HZ) / pitticks = apic_ticks_per_s + * apic_ticks_per_ns = (apic_ticks * PIT_HZ) / (pitticks * 10^9) + * apic_ticks_per_SFns = + * (SF * apic_ticks * PIT_HZ) / (pitticks * 10^9) + */ + return ((SF * apic_ticks * PIT_HZ) / ((uint64_t)pit_ticks * NANOSEC)); +} + +/* + * It was found empirically that 5 measurements seem sufficient to give a good + * accuracy. Most spurious measurements are higher than the target value thus + * we eliminate up to 2/5 spurious measurements. + */ +#define APIC_CALIBRATE_MEASUREMENTS 5 + +#define APIC_CALIBRATE_PERCENT_OFF_WARNING 10 + +/* + * Return the number of ticks the APIC decrements in SF nanoseconds. + * Several measurements are taken to filter out outliers. 
+ */ +uint64_t +apic_calibrate() +{ + uint64_t measurements[APIC_CALIBRATE_MEASUREMENTS]; + int median_idx; + uint64_t median; + + /* + * When running under a virtual machine, the emulated PIT and APIC + * counters do not always return the right values and can roll over. + * Those spurious measurements are relatively rare but could + * significantly affect the calibration. + * Therefore we take several measurements and then keep the median. + * The median is preferred to the average here as we only want to + * discard outliers. + */ + for (int i = 0; i < APIC_CALIBRATE_MEASUREMENTS; i++) + measurements[i] = apic_calibrate_impl(); + + /* + * sort results and retrieve median. + */ + for (int i = 0; i < APIC_CALIBRATE_MEASUREMENTS; i++) { + for (int j = i + 1; j < APIC_CALIBRATE_MEASUREMENTS; j++) { + if (measurements[j] < measurements[i]) { + uint64_t tmp = measurements[i]; + measurements[i] = measurements[j]; + measurements[j] = tmp; + } + } + } + median_idx = APIC_CALIBRATE_MEASUREMENTS / 2; + median = measurements[median_idx]; + +#if (APIC_CALIBRATE_MEASUREMENTS >= 3) + /* + * Check that measurements are consistent. Post a warning + * if the three middle values are not close to each other. + */ + uint64_t delta_warn = median * + APIC_CALIBRATE_PERCENT_OFF_WARNING / 100; + if ((median - measurements[median_idx - 1]) > delta_warn || + (measurements[median_idx + 1] - median) > delta_warn) { + cmn_err(CE_WARN, "apic_calibrate measurements lack " + "precision: %llu, %llu, %llu.", + (u_longlong_t)measurements[median_idx - 1], + (u_longlong_t)median, + (u_longlong_t)measurements[median_idx + 1]); + } +#endif + + return (median); } /* diff --git a/usr/src/uts/i86pc/io/pcplusmp/apic_timer.c b/usr/src/uts/i86pc/io/pcplusmp/apic_timer.c index 348f5034fc..bc61c114c2 100644 --- a/usr/src/uts/i86pc/io/pcplusmp/apic_timer.c +++ b/usr/src/uts/i86pc/io/pcplusmp/apic_timer.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2017 by Delphix. All rights reserved. */ /* * Copyright (c) 2010, Intel Corporation. @@ -90,34 +91,12 @@ static apic_timer_t apic_timer; int apic_timer_init(int hertz) { - uint_t apic_ticks = 0; - uint_t pit_ticks; int ret, timer_mode; - uint16_t pit_ticks_adj; static int firsttime = 1; if (firsttime) { /* first time calibrate on CPU0 only */ - - apic_reg_ops->apic_write(APIC_DIVIDE_REG, apic_divide_reg_init); - apic_reg_ops->apic_write(APIC_INIT_COUNT, APIC_MAXVAL); - apic_ticks = apic_calibrate(apicadr, &pit_ticks_adj); - - /* total number of PIT ticks corresponding to apic_ticks */ - pit_ticks = APIC_TIME_COUNT + pit_ticks_adj; - - /* - * Determine the number of nanoseconds per APIC clock tick - * and then determine how many APIC ticks to interrupt at the - * desired frequency - * apic_ticks / (pitticks / PIT_HZ) = apic_ticks_per_s - * (apic_ticks * PIT_HZ) / pitticks = apic_ticks_per_s - * apic_ticks_per_ns = (apic_ticks * PIT_HZ) / (pitticks * 10^9) - * pic_ticks_per_SFns = - * (SF * apic_ticks * PIT_HZ) / (pitticks * 10^9) - */ - apic_ticks_per_SFnsecs = ((SF * apic_ticks * PIT_HZ) / - ((uint64_t)pit_ticks * NANOSEC)); + apic_ticks_per_SFnsecs = apic_calibrate(); /* the interval timer initial count is 32 bit max */ apic_nsec_max = APIC_TICKS_TO_NSECS(APIC_MAXVAL); diff --git a/usr/src/uts/i86pc/sys/apic.h b/usr/src/uts/i86pc/sys/apic.h index 0352a154af..f2528a632f 100644 --- a/usr/src/uts/i86pc/sys/apic.h +++ b/usr/src/uts/i86pc/sys/apic.h @@ -21,6 +21,7 @@ /* * Copyright (c) 1993, 2010, Oracle and/or its affiliates. 
All rights reserved. * Copyright 2018 Joyent, Inc. + * Copyright (c) 2017 by Delphix. All rights reserved. */ /* * Copyright (c) 2010, Intel Corporation. @@ -830,7 +831,7 @@ extern int apic_local_mode(); extern void apic_change_eoi(); extern void apic_send_EOI(uint32_t); extern void apic_send_directed_EOI(uint32_t); -extern uint_t apic_calibrate(volatile uint32_t *, uint16_t *); +extern uint64_t apic_calibrate(); extern void x2apic_send_pir_ipi(processorid_t); extern volatile uint32_t *apicadr; /* virtual addr of local APIC */ diff --git a/usr/src/uts/i86pc/sys/apic_common.h b/usr/src/uts/i86pc/sys/apic_common.h index 9c08d73798..dc02031ac3 100644 --- a/usr/src/uts/i86pc/sys/apic_common.h +++ b/usr/src/uts/i86pc/sys/apic_common.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2017 by Delphix. All rights reserved. */ /* * Copyright 2018 Joyent, Inc. @@ -183,7 +184,7 @@ extern void apic_unset_idlecpu(processorid_t cpun); extern void apic_shutdown(int cmd, int fcn); extern void apic_preshutdown(int cmd, int fcn); extern processorid_t apic_get_next_processorid(processorid_t cpun); -extern uint_t apic_calibrate(volatile uint32_t *, uint16_t *); +extern uint64_t apic_calibrate(); extern int apic_get_pir_ipivect(void); extern void apic_send_pir_ipi(processorid_t); |