author     Jerry Jelinek <jerry.jelinek@joyent.com>    2017-03-01 12:26:59 +0000
committer  Jerry Jelinek <jerry.jelinek@joyent.com>    2017-03-01 12:26:59 +0000
commit     4fa363ca296221e73316b994680e92bca02f9b08 (patch)
tree       1f3fccf9d07f2d8a890a1b6e19cd41106584cb90
parent     56651a474ca10d3fb356c0e36f6ba69d86e4bcab (diff)
parent     c5bde7273ef861a8dc54cfb9abe48d56062177da (diff)
download   illumos-joyent-4fa363ca296221e73316b994680e92bca02f9b08.tar.gz
[illumos-gate merge]
commit c5bde7273ef861a8dc54cfb9abe48d56062177da
7843 get_clones_stat() is suboptimal for lots of clones
commit 61e255ce7267b52208af9daf434b77d37fb75622
7793 ztest fails assertion in dmu_tx_willuse_space
commit 2e972bf18f2d3dc8a060f336db39dc8432ee887c
7816 remove static unused variable in zfs_vfsops.c
commit ed61ec1da9132e570b0853386d0f78a32f852cd2
6410 teach zdb to perform object lookups by path
commit ff157c8690676593df83d0602f60f960862d3492
7896 loader.efi: Don't set *dev in the zfs root case, it may be NULL
commit 04f8e09339a9f05578b71312033d46d49376f828
7891 loader.efi: EFI time setup
Conflicts:
usr/src/uts/common/fs/zfs/dmu_tx.c
34 files changed, 914 insertions, 1593 deletions
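
Among the merged changes, illumos 6410 teaches zdb to perform object lookups by path via a new -O option (zdb -O <dataset> <path>, with <path> relative to the root of the dataset), as documented in the zdb.1m changes below. A minimal usage sketch, using a hypothetical dataset and file path and with output elided:

    # zdb -O rpool/export/home user/notes.txt
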
diff --git a/usr/src/boot/sys/boot/efi/include/efilib.h b/usr/src/boot/sys/boot/efi/include/efilib.h index 4752be19f0..77e5525da3 100644 --- a/usr/src/boot/sys/boot/efi/include/efilib.h +++ b/usr/src/boot/sys/boot/efi/include/efilib.h @@ -23,8 +23,6 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. - * - * $FreeBSD$ */ #include <stand.h> @@ -55,5 +53,9 @@ void efi_free_devpath_name(CHAR16 *); int efi_status_to_errno(EFI_STATUS); +void efi_time_init(void); +void efi_time_fini(void); + EFI_STATUS main(int argc, CHAR16 *argv[]); void exit(EFI_STATUS status); +void delay(int usecs); diff --git a/usr/src/boot/sys/boot/efi/libefi/Makefile b/usr/src/boot/sys/boot/efi/libefi/Makefile index 5d6e05df6e..39923b58b7 100644 --- a/usr/src/boot/sys/boot/efi/libefi/Makefile +++ b/usr/src/boot/sys/boot/efi/libefi/Makefile @@ -27,6 +27,12 @@ install: SRCS= delay.c devpath.c efi_console.c efinet.c efipart.c errno.c handles.c \ libefi.c time.c +#.if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386" +#SRCS += time.c +#.elif ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "arm" +#SRCS+= time_event.c +#.endif + OBJS= $(SRCS:%.c=%.o) CPPFLAGS= -DTERM_EMU diff --git a/usr/src/boot/sys/boot/efi/libefi/time.c b/usr/src/boot/sys/boot/efi/libefi/time.c index 1f9d5daed1..5ab10b6c43 100644 --- a/usr/src/boot/sys/boot/efi/libefi/time.c +++ b/usr/src/boot/sys/boot/efi/libefi/time.c @@ -39,7 +39,6 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); #include <efi.h> #include <efilib.h> @@ -58,6 +57,16 @@ __FBSDID("$FreeBSD$"); #define SECSPERHOUR ( 60*60 ) #define SECSPERDAY (24 * SECSPERHOUR) +void +efi_time_init(void) +{ +} + +void +efi_time_fini(void) +{ +} + static time_t efi_time(EFI_TIME *ETime) { @@ -218,7 +227,7 @@ time(time_t *tloc) } time_t -getsecs() +getsecs(void) { return time(0); } diff --git a/usr/src/boot/sys/boot/efi/libefi/time_event.c b/usr/src/boot/sys/boot/efi/libefi/time_event.c new file mode 100644 index 0000000000..d76af38bf9 --- /dev/null +++ b/usr/src/boot/sys/boot/efi/libefi/time_event.c @@ -0,0 +1,81 @@ +/*- + * Copyright (c) 2016 Andrew Turner + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> + +#include <efi.h> +#include <efilib.h> + +#include <time.h> +#include <sys/time.h> + +static EFI_EVENT time_event; +static uint64_t curtime; + +static void +time_update(EFI_EVENT event, void *context) +{ + + curtime += 10; +} + +void +efi_time_init(void) +{ + + /* Create a timer event */ + BS->CreateEvent(EVT_TIMER | EVT_NOTIFY_SIGNAL, TPL_CALLBACK, + time_update, 0, &time_event); + /* Use a 10ms timer */ + BS->SetTimer(time_event, TimerPeriodic, 100000); +} + +void +efi_time_fini(void) +{ + + /* Cancel the timer */ + BS->SetTimer(time_event, TimerCancel, 0); + BS->CloseEvent(time_event); +} + +time_t +time(time_t *tloc) +{ + time_t t; + + t = curtime / 1000; + if (tloc != NULL) + *tloc = t; + + return (t); +} + +time_t +getsecs(void) +{ + return time(0); +} diff --git a/usr/src/boot/sys/boot/efi/loader/arch/amd64/elf64_freebsd.c b/usr/src/boot/sys/boot/efi/loader/arch/amd64/elf64_freebsd.c index f0cb7e7248..7571eb9561 100644 --- a/usr/src/boot/sys/boot/efi/loader/arch/amd64/elf64_freebsd.c +++ b/usr/src/boot/sys/boot/efi/loader/arch/amd64/elf64_freebsd.c @@ -169,9 +169,12 @@ elf64_exec(struct preloaded_file *fp) printf("Start @ 0x%lx ...\n", ehdr->e_entry); + efi_time_fini(); err = bi_load(fp->f_args, &modulep, &kernend); - if (err != 0) + if (err != 0) { + efi_time_init(); return(err); + } dev_cleanup(); diff --git a/usr/src/boot/sys/boot/efi/loader/arch/arm/exec.c b/usr/src/boot/sys/boot/efi/loader/arch/arm/exec.c index 716c7d300a..83d3f2b114 100644 --- a/usr/src/boot/sys/boot/efi/loader/arch/arm/exec.c +++ b/usr/src/boot/sys/boot/efi/loader/arch/arm/exec.c @@ -73,8 +73,11 @@ __elfN(arm_exec)(struct preloaded_file *fp) e = (Elf_Ehdr *)&fmp->md_data; - if ((error = bi_load(fp->f_args, &modulep, &kernend)) != 0) + efi_time_fini(); + if ((error = bi_load(fp->f_args, &modulep, &kernend)) != 0) { + efi_time_init(); return (error); + } entry = efi_translate(e->e_entry); printf("Kernel entry at 0x%x...\n", (unsigned)entry); diff --git a/usr/src/boot/sys/boot/efi/loader/arch/arm64/exec.c b/usr/src/boot/sys/boot/efi/loader/arch/arm64/exec.c index eb1830c548..a8420b4b64 100644 --- a/usr/src/boot/sys/boot/efi/loader/arch/arm64/exec.c +++ b/usr/src/boot/sys/boot/efi/loader/arch/arm64/exec.c @@ -25,7 +25,6 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); #include <stand.h> #include <string.h> @@ -108,14 +107,17 @@ elf64_exec(struct preloaded_file *fp) } if ((md = file_findmetadata(fp, MODINFOMD_ELFHDR)) == NULL) - return(EFTYPE); + return (EFTYPE); ehdr = (Elf_Ehdr *)&(md->md_data); entry = efi_translate(ehdr->e_entry); + efi_time_fini(); err = bi_load(fp->f_args, &modulep, &kernendp); - if (err != 0) + if (err != 0) { + efi_time_init(); return (err); + } dev_cleanup(); diff --git a/usr/src/boot/sys/boot/efi/loader/arch/i386/elf32_freebsd.c b/usr/src/boot/sys/boot/efi/loader/arch/i386/elf32_freebsd.c index bae8f7bbca..b3a18d27d3 100644 --- a/usr/src/boot/sys/boot/efi/loader/arch/i386/elf32_freebsd.c +++ b/usr/src/boot/sys/boot/efi/loader/arch/i386/elf32_freebsd.c @@ -71,9 +71,12 @@ elf32_exec(struct preloaded_file *fp) return(EFTYPE); ehdr = (Elf_Ehdr *)&(md->md_data); + efi_time_fini(); err = bi_load(fp->f_args, &boothowto, &bootdev, &bootinfop, &modulep, &kernend); - if (err != 0) + if (err != 0) { + efi_time_init(); return(err); + } entry = ehdr->e_entry & 0xffffff; printf("Start @ 0x%lx ...\n", entry); diff --git a/usr/src/boot/sys/boot/efi/loader/devicename.c b/usr/src/boot/sys/boot/efi/loader/devicename.c index 8fc80ebbc5..64a417dacc 100644 --- 
a/usr/src/boot/sys/boot/efi/loader/devicename.c +++ b/usr/src/boot/sys/boot/efi/loader/devicename.c @@ -120,7 +120,6 @@ efi_parsedev(struct devdesc **dev, const char *devspec, const char **path) free(idev); return (err); } - *dev = idev; cp = strchr(np + 1, ':'); } else #endif diff --git a/usr/src/boot/sys/boot/efi/loader/main.c b/usr/src/boot/sys/boot/efi/loader/main.c index 5e333e39d4..cd45451889 100644 --- a/usr/src/boot/sys/boot/efi/loader/main.c +++ b/usr/src/boot/sys/boot/efi/loader/main.c @@ -244,6 +244,9 @@ main(int argc, CHAR16 *argv[]) archsw.arch_zfs_probe = efi_zfs_probe; #endif + /* Init the time source */ + efi_time_init(); + has_kbd = has_keyboard(); /* diff --git a/usr/src/boot/sys/boot/i386/libi386/pxe.c b/usr/src/boot/sys/boot/i386/libi386/pxe.c index f943ef5ec8..9dca8d7dab 100644 --- a/usr/src/boot/sys/boot/i386/libi386/pxe.c +++ b/usr/src/boot/sys/boot/i386/libi386/pxe.c @@ -592,7 +592,7 @@ bangpxe_call(int func) time_t -getsecs() +getsecs(void) { time_t n = 0; time(&n); diff --git a/usr/src/boot/sys/boot/ofw/libofw/ofw_time.c b/usr/src/boot/sys/boot/ofw/libofw/ofw_time.c index f53997dc05..d3f3f9484e 100644 --- a/usr/src/boot/sys/boot/ofw/libofw/ofw_time.c +++ b/usr/src/boot/sys/boot/ofw/libofw/ofw_time.c @@ -25,7 +25,6 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); #include <stand.h> #include "openfirm.h" @@ -41,8 +40,8 @@ time(time_t *tloc) return secs; } -int -getsecs() +time_t +getsecs(void) { time_t n = 0; time(&n); diff --git a/usr/src/boot/sys/boot/uboot/lib/time.c b/usr/src/boot/sys/boot/uboot/lib/time.c index 9083675b75..b50eb112fa 100644 --- a/usr/src/boot/sys/boot/uboot/lib/time.c +++ b/usr/src/boot/sys/boot/uboot/lib/time.c @@ -26,7 +26,6 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD$"); #include <stand.h> @@ -47,7 +46,7 @@ time(time_t *tloc) return (secs); } -int +time_t getsecs(void) { diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c index ecf63568ab..788dec60ab 100644 --- a/usr/src/cmd/zdb/zdb.c +++ b/usr/src/cmd/zdb/zdb.c @@ -23,7 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] - * Copyright 2016 Nexenta Systems, Inc. + * Copyright 2017 Nexenta Systems, Inc. */ #include <stdio.h> @@ -120,19 +120,22 @@ static void usage(void) { (void) fprintf(stderr, - "Usage: %s [-CmMdibcsDvhLXFPAG] [-t txg] [-e [-p path...]] " - "[-U config] [-I inflight I/Os] [-x dumpdir] [-o var=value] " - "poolname [object...]\n" - " %s [-divPA] [-e -p path...] [-U config] dataset " - "[object...]\n" - " %s -mM [-LXFPA] [-t txg] [-e [-p path...]] [-U config] " - "poolname [vdev [metaslab...]]\n" - " %s -R [-A] [-e [-p path...]] poolname " - "vdev:offset:size[:flags]\n" - " %s -S [-PA] [-e [-p path...]] [-U config] poolname\n" - " %s -l [-Aqu] device\n" - " %s -C [-A] [-U config]\n\n", - cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname); + "Usage:\t%s [-AbcdDFGhiLMPsvX] [-e [-p <path> ...]] " + "[-I <inflight I/Os>]\n" + "\t\t[-o <var>=<value>]... 
[-t <txg>] [-U <cache>] [-x <dumpdir>]\n" + "\t\t[<poolname> [<object> ...]]\n" + "\t%s [-AdiPv] [-e [-p <path> ...]] [-U <cache>] <dataset> " + "[<object> ...]\n" + "\t%s -C [-A] [-U <cache>]\n" + "\t%s -l [-Aqu] <device>\n" + "\t%s -m [-AFLPX] [-e [-p <path> ...]] [-t <txg>] [-U <cache>]\n" + "\t\t<poolname> [<vdev> [<metaslab> ...]]\n" + "\t%s -O <dataset> <path>\n" + "\t%s -R [-A] [-e [-p <path> ...]] [-U <cache>]\n" + "\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n" + "\t%s -S [-AP] [-e [-p <path> ...]] [-U <cache>] <poolname>\n\n", + cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, + cmdname); (void) fprintf(stderr, " Dataset name must include at least one " "separator character '/' or '@'\n"); @@ -141,52 +144,54 @@ usage(void) (void) fprintf(stderr, " If object numbers are specified, only " "those objects are dumped\n\n"); (void) fprintf(stderr, " Options to control amount of output:\n"); - (void) fprintf(stderr, " -d dataset(s)\n"); - (void) fprintf(stderr, " -i intent logs\n"); - (void) fprintf(stderr, " -C config (or cachefile if alone)\n"); - (void) fprintf(stderr, " -h pool history\n"); (void) fprintf(stderr, " -b block statistics\n"); - (void) fprintf(stderr, " -m metaslabs\n"); - (void) fprintf(stderr, " -M metaslab groups\n"); (void) fprintf(stderr, " -c checksum all metadata (twice for " "all data) blocks\n"); - (void) fprintf(stderr, " -s report stats on zdb's I/O\n"); + (void) fprintf(stderr, " -C config (or cachefile if alone)\n"); + (void) fprintf(stderr, " -d dataset(s)\n"); (void) fprintf(stderr, " -D dedup statistics\n"); - (void) fprintf(stderr, " -S simulate dedup to measure effect\n"); - (void) fprintf(stderr, " -v verbose (applies to all others)\n"); + (void) fprintf(stderr, " -h pool history\n"); + (void) fprintf(stderr, " -i intent logs\n"); (void) fprintf(stderr, " -l read label contents\n"); (void) fprintf(stderr, " -L disable leak tracking (do not " "load spacemaps)\n"); + (void) fprintf(stderr, " -m metaslabs\n"); + (void) fprintf(stderr, " -M metaslab groups\n"); + (void) fprintf(stderr, " -O perform object lookups by path\n"); (void) fprintf(stderr, " -R read and display block from a " - "device\n\n"); + "device\n"); + (void) fprintf(stderr, " -s report stats on zdb's I/O\n"); + (void) fprintf(stderr, " -S simulate dedup to measure effect\n"); + (void) fprintf(stderr, " -v verbose (applies to all " + "others)\n\n"); (void) fprintf(stderr, " Below options are intended for use " "with other options:\n"); (void) fprintf(stderr, " -A ignore assertions (-A), enable " "panic recovery (-AA) or both (-AAA)\n"); - (void) fprintf(stderr, " -F attempt automatic rewind within " - "safe range of transaction groups\n"); - (void) fprintf(stderr, " -U <cachefile_path> -- use alternate " - "cachefile\n"); - (void) fprintf(stderr, " -X attempt extreme rewind (does not " - "work with dataset)\n"); (void) fprintf(stderr, " -e pool is exported/destroyed/" "has altroot/not in a cachefile\n"); - (void) fprintf(stderr, " -p <path> -- use one or more with " - "-e to specify path to vdev dir\n"); - (void) fprintf(stderr, " -x <dumpdir> -- " - "dump all read blocks into specified directory\n"); - (void) fprintf(stderr, " -P print numbers in parseable form\n"); - (void) fprintf(stderr, " -t <txg> -- highest txg to use when " - "searching for uberblocks\n"); + (void) fprintf(stderr, " -F attempt automatic rewind within " + "safe range of transaction groups\n"); + (void) fprintf(stderr, " -G dump zfs_dbgmsg buffer before " + "exiting\n"); (void) fprintf(stderr, " -I 
<number of inflight I/Os> -- " "specify the maximum number of " "checksumming I/Os [default is 200]\n"); - (void) fprintf(stderr, " -G dump zfs_dbgmsg buffer before " - "exiting\n"); (void) fprintf(stderr, " -o <variable>=<value> set global " "variable to an unsigned 32-bit integer value\n"); + (void) fprintf(stderr, " -p <path> -- use one or more with " + "-e to specify path to vdev dir\n"); + (void) fprintf(stderr, " -P print numbers in parseable form\n"); (void) fprintf(stderr, " -q don't print label contents\n"); + (void) fprintf(stderr, " -t <txg> -- highest txg to use when " + "searching for uberblocks\n"); (void) fprintf(stderr, " -u uberblock\n"); + (void) fprintf(stderr, " -U <cachefile_path> -- use alternate " + "cachefile\n"); + (void) fprintf(stderr, " -x <dumpdir> -- " + "dump all read blocks into specified directory\n"); + (void) fprintf(stderr, " -X attempt extreme rewind (does not " + "work with dataset)\n\n"); (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " "to make only that option verbose\n"); (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); @@ -1578,8 +1583,55 @@ dump_deadlist(dsl_deadlist_t *dl) static avl_tree_t idx_tree; static avl_tree_t domain_tree; static boolean_t fuid_table_loaded; -static boolean_t sa_loaded; -sa_attr_type_t *sa_attr_table; +static objset_t *sa_os = NULL; +static sa_attr_type_t *sa_attr_table = NULL; + +static int +open_objset(const char *path, dmu_objset_type_t type, void *tag, objset_t **osp) +{ + int err; + uint64_t sa_attrs = 0; + uint64_t version = 0; + + VERIFY3P(sa_os, ==, NULL); + err = dmu_objset_own(path, type, B_TRUE, tag, osp); + if (err != 0) { + (void) fprintf(stderr, "failed to own dataset '%s': %s\n", path, + strerror(err)); + return (err); + } + + if (dmu_objset_type(*osp) == DMU_OST_ZFS) { + (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR, + 8, 1, &version); + if (version >= ZPL_VERSION_SA) { + (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, + 8, 1, &sa_attrs); + } + err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END, + &sa_attr_table); + if (err != 0) { + (void) fprintf(stderr, "sa_setup failed: %s\n", + strerror(err)); + dmu_objset_disown(*osp, tag); + *osp = NULL; + } + } + sa_os = *osp; + + return (0); +} + +static void +close_objset(objset_t *os, void *tag) +{ + VERIFY3P(os, ==, sa_os); + if (os->os_sa != NULL) + sa_tear_down(os); + dmu_objset_disown(os, tag); + sa_attr_table = NULL; + sa_os = NULL; +} static void fuid_table_destroy() @@ -1651,25 +1703,7 @@ dump_znode(objset_t *os, uint64_t object, void *data, size_t size) int idx = 0; int error; - if (!sa_loaded) { - uint64_t sa_attrs = 0; - uint64_t version; - - VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, - 8, 1, &version) == 0); - if (version >= ZPL_VERSION_SA) { - VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, - 8, 1, &sa_attrs) == 0); - } - if ((error = sa_setup(os, sa_attrs, zfs_attr_table, - ZPL_END, &sa_attr_table)) != 0) { - (void) printf("sa_setup failed errno %d, can't " - "display znode contents\n", error); - return; - } - sa_loaded = B_TRUE; - } - + VERIFY3P(os, ==, sa_os); if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) { (void) printf("Failed to get handle for SA znode\n"); return; @@ -2141,6 +2175,108 @@ dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift) } } +static char curpath[PATH_MAX]; + +/* + * Iterate through the path components, recursively passing + * current one's obj and remaining path until we find the obj + * for the last one. 
+ */ +static int +dump_path_impl(objset_t *os, uint64_t obj, char *name) +{ + int err; + int header = 1; + uint64_t child_obj; + char *s; + dmu_buf_t *db; + dmu_object_info_t doi; + + if ((s = strchr(name, '/')) != NULL) + *s = '\0'; + err = zap_lookup(os, obj, name, 8, 1, &child_obj); + + (void) strlcat(curpath, name, sizeof (curpath)); + + if (err != 0) { + (void) fprintf(stderr, "failed to lookup %s: %s\n", + curpath, strerror(err)); + return (err); + } + + child_obj = ZFS_DIRENT_OBJ(child_obj); + err = sa_buf_hold(os, child_obj, FTAG, &db); + if (err != 0) { + (void) fprintf(stderr, + "failed to get SA dbuf for obj %llu: %s\n", + (u_longlong_t)child_obj, strerror(err)); + return (EINVAL); + } + dmu_object_info_from_db(db, &doi); + sa_buf_rele(db, FTAG); + + if (doi.doi_bonus_type != DMU_OT_SA && + doi.doi_bonus_type != DMU_OT_ZNODE) { + (void) fprintf(stderr, "invalid bonus type %d for obj %llu\n", + doi.doi_bonus_type, (u_longlong_t)child_obj); + return (EINVAL); + } + + if (dump_opt['v'] > 6) { + (void) printf("obj=%llu %s type=%d bonustype=%d\n", + (u_longlong_t)child_obj, curpath, doi.doi_type, + doi.doi_bonus_type); + } + + (void) strlcat(curpath, "/", sizeof (curpath)); + + switch (doi.doi_type) { + case DMU_OT_DIRECTORY_CONTENTS: + if (s != NULL && *(s + 1) != '\0') + return (dump_path_impl(os, child_obj, s + 1)); + /*FALLTHROUGH*/ + case DMU_OT_PLAIN_FILE_CONTENTS: + dump_object(os, child_obj, dump_opt['v'], &header); + return (0); + default: + (void) fprintf(stderr, "object %llu has non-file/directory " + "type %d\n", (u_longlong_t)obj, doi.doi_type); + break; + } + + return (EINVAL); +} + +/* + * Dump the blocks for the object specified by path inside the dataset. + */ +static int +dump_path(char *ds, char *path) +{ + int err; + objset_t *os; + uint64_t root_obj; + + err = open_objset(ds, DMU_OST_ZFS, FTAG, &os); + if (err != 0) + return (err); + + err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj); + if (err != 0) { + (void) fprintf(stderr, "can't lookup root znode: %s\n", + strerror(err)); + dmu_objset_disown(os, FTAG); + return (EINVAL); + } + + (void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds); + + err = dump_path_impl(os, root_obj, path); + + close_objset(os, FTAG); + return (err); +} + static int dump_label(const char *dev) { @@ -2240,11 +2376,9 @@ dump_one_dir(const char *dsname, void *arg) int error; objset_t *os; - error = dmu_objset_own(dsname, DMU_OST_ANY, B_TRUE, FTAG, &os); - if (error) { - (void) printf("Could not open %s, error %d\n", dsname, error); + error = open_objset(dsname, DMU_OST_ANY, FTAG, &os); + if (error != 0) return (0); - } for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { if (!dmu_objset_ds(os)->ds_feature_inuse[f]) @@ -2255,9 +2389,8 @@ dump_one_dir(const char *dsname, void *arg) } dump_dir(os); - dmu_objset_disown(os, FTAG); + close_objset(os, FTAG); fuid_table_destroy(); - sa_loaded = B_FALSE; return (0); } @@ -3599,35 +3732,37 @@ main(int argc, char **argv) spa_config_path = spa_config_path_env; while ((c = getopt(argc, argv, - "bcdhilmMI:suCDRSAFLXx:evp:t:U:PGo:q")) != -1) { + "AbcCdDeFGhiI:lLmMo:Op:PqRsSt:uU:vx:X")) != -1) { switch (c) { case 'b': case 'c': + case 'C': case 'd': + case 'D': + case 'G': case 'h': case 'i': case 'l': case 'm': - case 's': - case 'u': - case 'C': - case 'D': case 'M': + case 'O': case 'R': + case 's': case 'S': - case 'G': + case 'u': dump_opt[c]++; dump_all = 0; break; case 'A': + case 'e': case 'F': case 'L': - case 'X': - case 'e': case 'P': case 'q': + case 'X': 
dump_opt[c]++; break; + /* NB: Sort single match options below. */ case 'I': max_inflight = strtoull(optarg, NULL, 0); if (max_inflight == 0) { @@ -3637,6 +3772,11 @@ main(int argc, char **argv) usage(); } break; + case 'o': + error = set_global_var(optarg); + if (error != 0) + usage(); + break; case 'p': if (searchdirs == NULL) { searchdirs = umem_alloc(sizeof (char *), @@ -3669,11 +3809,6 @@ main(int argc, char **argv) case 'x': vn_dumpdir = optarg; break; - case 'o': - error = set_global_var(optarg); - if (error != 0) - usage(); - break; default: usage(); break; @@ -3711,7 +3846,7 @@ main(int argc, char **argv) verbose = MAX(verbose, 1); for (c = 0; c < 256; c++) { - if (dump_all && !strchr("elAFLRSXP", c)) + if (dump_all && strchr("AeFlLOPRSX", c) == NULL) dump_opt[c] = 1; if (dump_opt[c]) dump_opt[c] += verbose; @@ -3736,6 +3871,13 @@ main(int argc, char **argv) if (dump_opt['l']) return (dump_label(argv[0])); + if (dump_opt['O']) { + if (argc != 2) + usage(); + dump_opt['v'] = verbose + 3; + return (dump_path(argv[0], argv[1])); + } + if (dump_opt['X'] || dump_opt['F']) rewind = ZPOOL_DO_REWIND | (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0); @@ -3810,8 +3952,7 @@ main(int argc, char **argv) } } } else { - error = dmu_objset_own(target, DMU_OST_ANY, - B_TRUE, FTAG, &os); + error = open_objset(target, DMU_OST_ANY, FTAG, &os); } } nvlist_free(policy); @@ -3854,10 +3995,12 @@ main(int argc, char **argv) zdb_read_block(argv[i], spa); } - (os != NULL) ? dmu_objset_disown(os, FTAG) : spa_close(spa, FTAG); + if (os != NULL) + close_objset(os, FTAG); + else + spa_close(spa, FTAG); fuid_table_destroy(); - sa_loaded = B_FALSE; dump_debug_buffer(); diff --git a/usr/src/man/man1m/zdb.1m b/usr/src/man/man1m/zdb.1m index eaa615a3fc..44c1b5848f 100644 --- a/usr/src/man/man1m/zdb.1m +++ b/usr/src/man/man1m/zdb.1m @@ -1,4 +1,3 @@ -'\" t .\" .\" This file and its contents are supplied under the terms of the .\" Common Development and Distribution License ("CDDL"), version 1.0. @@ -12,545 +11,372 @@ .\" .\" Copyright 2012, Richard Lowe. .\" Copyright (c) 2012, 2016 by Delphix. All rights reserved. -.\" Copyright 2016 Nexenta Systems, Inc. +.\" Copyright 2017 Nexenta Systems, Inc. .\" -.TH "ZDB" "1M" "April 9, 2016" - -.SH "NAME" -\fBzdb\fR - Display zpool debugging and consistency information - -.SH "SYNOPSIS" -\fBzdb\fR [-CmdibcsDvhLMXFPAG] [-e [-p \fIpath\fR...]] [-t \fItxg\fR] - [-U \fIcache\fR] [-I \fIinflight I/Os\fR] [-x \fIdumpdir\fR] - [-o \fIvar\fR=\fIvalue\fR] ... [\fIpoolname\fR [\fIobject\fR ...]] - -.P -\fBzdb\fR [-divPA] [-e [-p \fIpath\fR...]] [-U \fIcache\fR] - \fIdataset\fR [\fIobject\fR ...] - -.P -\fBzdb\fR -m [-MLXFPA] [-t \fItxg\fR] [-e [-p \fIpath\fR...]] [-U \fIcache\fR] - \fIpoolname\fR [\fIvdev\fR [\fImetaslab\fR ...]] - -.P -\fBzdb\fR -R [-A] [-e [-p \fIpath\fR...]] [-U \fIcache\fR] \fIpoolname\fR - \fIvdev\fR:\fIoffset\fR:\fIsize\fR[:\fIflags\fR] - -.P -\fBzdb\fR -S [-AP] [-e [-p \fIpath\fR...]] [-U \fIcache\fR] \fIpoolname\fR - -.P -\fBzdb\fR -l [-Aqu] \fIdevice\fR - -.P -\fBzdb\fR -C [-A] [-U \fIcache\fR] - -.SH "DESCRIPTION" -The \fBzdb\fR utility displays information about a ZFS pool useful for -debugging and performs some amount of consistency checking. It is a not a -general purpose tool and options (and facilities) may change. This is neither -a fsck(1M) nor an fsdb(1M) utility. - -.P +.Dd January 14, 2017 +.Dt ZDB 1M +.Os +.Sh NAME +.Nm zdb +.Nd display zpool debugging and consistency information +.Sh SYNOPSIS +.Nm +.Op Fl AbcdDFGhiLMPsvX +.Op Fl e Op Fl p Ar path ... 
+.Op Fl I Ar inflight I/Os +.Oo Fl o Ar var Ns = Ns Ar value Oc Ns ... +.Op Fl t Ar txg +.Op Fl U Ar cache +.Op Fl x Ar dumpdir +.Op Ar poolname Op Ar object ... +.Nm +.Op Fl AdiPv +.Op Fl e Op Fl p Ar path ... +.Op Fl U Ar cache +.Ar dataset Op Ar object ... +.Nm +.Fl C +.Op Fl A +.Op Fl U Ar cache +.Nm +.Fl l +.Op Fl Aqu +.Ar device +.Nm +.Fl m +.Op Fl AFLPX +.Op Fl t Ar txg +.Op Fl e Op Fl p Ar path ... +.Op Fl U Ar cache +.Ar poolname Op Ar vdev Op Ar metaslab ... +.Nm +.Fl O +.Ar dataset path +.Nm +.Fl R +.Op Fl A +.Op Fl e Op Fl p Ar path ... +.Op Fl U Ar cache +.Ar poolname vdev Ns : Ns Ar offset Ns : Ns Ar size Ns Op : Ns Ar flags +.Nm +.Fl S +.Op Fl AP +.Op Fl e Op Fl p Ar path ... +.Op Fl U Ar cache +.Ar poolname +.Sh DESCRIPTION +The +.Nm +utility displays information about a ZFS pool useful for debugging and performs +some amount of consistency checking. +It is a not a general purpose tool and options +.Pq and facilities +may change. +This is neither a +.Xr fsck 1M +nor an +.Xr fsdb 1M +utility. +.Pp The output of this command in general reflects the on-disk structure of a ZFS -pool, and is inherently unstable. The precise output of most invocations is -not documented, a knowledge of ZFS internals is assumed. - -.P -If the \fIdataset\fR argument does not contain any \fB/\fR or \fB@\fR -characters, it is interpreted as a pool name. The root dataset can be -specified as \fIpool\fB/\fR (pool name followed by a slash). - -.P +pool, and is inherently unstable. +The precise output of most invocations is not documented, a knowledge of ZFS +internals is assumed. +.Pp +If the +.Ar dataset +argument does not contain any +.Qq Sy / +or +.Qq Sy @ +characters, it is interpreted as a pool name. +The root dataset can be specified as +.Ar pool Ns / +.Pq pool name followed by a slash . +.Pp When operating on an imported and active pool it is possible, though unlikely, that zdb may interpret inconsistent pool data and behave erratically. - -.SH "OPTIONS" +.Sh OPTIONS Display options: - -.sp -.ne 2 -.na -\fB-b\fR -.ad -.sp .6 -.RS 4n -Display statistics regarding the number, size (logical, physical and -allocated) and deduplication of blocks. -.RE - -.sp -.ne 2 -.na -\fB-c\fR -.ad -.sp .6 -.RS 4n +.Bl -tag -width Ds +.It Fl b +Display statistics regarding the number, size +.Pq logical, physical and allocated +and deduplication of blocks. +.It Fl c Verify the checksum of all metadata blocks while printing block statistics -(see \fB-b\fR). -.sp +.Po see +.Fl b +.Pc . +.Pp If specified multiple times, verify the checksums of all blocks. -.RE - -.sp -.ne 2 -.na -\fB-C\fR -.ad -.sp .6 -.RS 4n -Display information about the configuration. If specified with no other -options, instead display information about the cache file -(\fB/etc/zfs/zpool.cache\fR). To specify the cache file to display, see -\fB-U\fR. -.P -If specified multiple times, and a pool name is also specified display both -the cached configuration and the on-disk configuration. If specified multiple -times with \fB-e\fR also display the configuration that would be used were the -pool to be imported. -.RE - -.sp -.ne 2 -.na -\fB-d\fR -.ad -.sp .6 -.RS 4n -Display information about datasets. Specified once, displays basic dataset -information: ID, create transaction, size, and object count. -.sp +.It Fl C +Display information about the configuration. +If specified with no other options, instead display information about the cache +file +.Pq Pa /etc/zfs/zpool.cache . +To specify the cache file to display, see +.Fl U . 
+.Pp +If specified multiple times, and a pool name is also specified display both the +cached configuration and the on-disk configuration. +If specified multiple times with +.Fl e +also display the configuration that would be used were the pool to be imported. +.It Fl d +Display information about datasets. +Specified once, displays basic dataset information: ID, create transaction, +size, and object count. +.Pp If specified multiple times provides greater and greater verbosity. -.sp -If object IDs are specified, display information about those specific objects only. -.RE - -.sp -.ne 2 -.na -\fB-D\fR -.ad -.sp .6 -.RS 4n -Display deduplication statistics, including the deduplication ratio (dedup), -compression ratio (compress), inflation due to the zfs copies property -(copies), and an overall effective ratio (dedup * compress / copies). -.sp -If specified twice, display a histogram of deduplication statistics, showing -the allocated (physically present on disk) and referenced (logically -referenced in the pool) block counts and sizes by reference count. -.sp -If specified a third time, display the statistics independently for each deduplication table. -.sp -If specified a fourth time, dump the contents of the deduplication tables describing duplicate blocks. -.sp -If specified a fifth time, also dump the contents of the deduplication tables describing unique blocks. -.RE - -.sp -.ne 2 -.na -\fB-h\fR -.ad -.sp .6 -.RS 4n -Display pool history similar to \fBzpool history\fR, but include internal -changes, transaction, and dataset information. -.RE - -.sp -.ne 2 -.na -\fB-i\fR -.ad -.sp .6 -.RS 4n -Display information about intent log (ZIL) entries relating to each -dataset. If specified multiple times, display counts of each intent log -transaction type. -.RE - -.sp -.ne 2 -.na -\fB-l\fR \fIdevice\fR -.ad -.sp .6 -.RS 4n -Read the vdev labels from the specified device. \fBzdb -l\fR will return 0 if -valid label was found, 1 if error occured, and 2 if no valid labels were found. -.P -If the \fB-u\fR option is also specified, also display the uberblocks on this -device. -.P -If the \fB-q\fR option is also specified, don't print the labels. -.RE - -.sp -.ne 2 -.na -\fB-L\fR -.ad -.sp .6 -.RS 4n -Disable leak tracing and the loading of space maps. By default, \fBzdb\fR +.Pp +If object IDs are specified, display information about those specific objects +only. +.It Fl D +Display deduplication statistics, including the deduplication ratio +.Pq Sy dedup , +compression ratio +.Pq Sy compress , +inflation due to the zfs copies property +.Pq Sy copies , +and an overall effective ratio +.Pq Sy dedup No * Sy compress No / Sy copies . +.It Fl DD +Display a histogram of deduplication statistics, showing the allocated +.Pq physically present on disk +and referenced +.Pq logically referenced in the pool +block counts and sizes by reference count. +.It Fl DDD +Display the statistics independently for each deduplication table. +.It Fl DDDD +Dump the contents of the deduplication tables describing duplicate blocks. +.It Fl DDDDD +Also dump the contents of the deduplication tables describing unique blocks. +.It Fl h +Display pool history similar to +.Nm zpool Cm history , +but include internal changes, transaction, and dataset information. +.It Fl i +Display information about intent log +.Pq ZIL +entries relating to each dataset. +If specified multiple times, display counts of each intent log transaction type. +.It Fl l Ar device +Read the vdev labels from the specified device. 
+.Nm Fl l +will return 0 if valid label was found, 1 if error occurred, and 2 if no valid +labels were found. +.Pp +If the +.Fl q +option is also specified, don't print the labels. +.Pp +If the +.Fl u +option is also specified, also display the uberblocks on this device. +.It Fl L +Disable leak tracing and the loading of space maps. +By default, +.Nm verifies that all non-free blocks are referenced, which can be very expensive. -.RE - -.sp -.ne 2 -.na -\fB-m\fR -.ad -.sp .6 -.RS 4n +.It Fl m Display the offset, spacemap, and free space of each metaslab. -When specified twice, also display information about the on-disk free -space histogram associated with each metaslab. When specified three time, -display the maximum contiguous free space, the in-core free space histogram, -and the percentage of free space in each space map. When specified -four times display every spacemap record. -.RE - -.sp -.ne 2 -.na -\fB-M\fR -.ad -.sp .6 -.RS 4n +.It Fl mm +Also display information about the on-disk free space histogram associated with +each metaslab. +.It Fl mmm +Display the maximum contiguous free space, the in-core free space histogram, and +the percentage of free space in each space map. +.It Fl mmmm +Display every spacemap record. +.It Fl M Display the offset, spacemap, and free space of each metaslab. -When specified twice, also display information about the maximum contiguous -free space and the percentage of free space in each space map. When specified -three times display every spacemap record. -.RE - -.sp -.ne 2 -.na -\fB-R\fR \fIpoolname\fR \fIvdev\fR:\fIoffset\fR:\fIsize\fR[:\fIflags\fR] -.ad -.sp .6 -.RS 4n -Read and display a block from the specified device. By default the block is -displayed as a hex dump, but see the description of the \'r\' flag, below. -.sp -The block is specified in terms of a colon-separated tuple \fIvdev\fR (an -integer vdev identifier) \fIoffset\fR (the offset within the vdev) \fIsize\fR -(the size of the block to read) and, optionally, \fIflags\fR (a set of flags, -described below). - -.sp -.ne 2 -.na -\fBb\fR \fIoffset\fR -.ad -.sp .6 -.RS 4n +.It Fl MM +Also display information about the maximum contiguous free space and the +percentage of free space in each space map. +.It Fl MMM +Display every spacemap record. +.It Fl O Ar dataset path +Look up the specified +.Ar path +inside of the +.Ar dataset +and display its metadata and indirect blocks. +Specified +.Ar path +must be relative to the root of +.Ar dataset . +This option can be combined with +.Fl v +for increasing verbosity. +.It Fl R Ar poolname vdev Ns : Ns Ar offset Ns : Ns Ar size Ns Op : Ns Ar flags +Read and display a block from the specified device. +By default the block is displayed as a hex dump, but see the description of the +.Sy r +flag, below. +.Pp +The block is specified in terms of a colon-separated tuple +.Ar vdev +.Pq an integer vdev identifier +.Ar offset +.Pq the offset within the vdev +.Ar size +.Pq the size of the block to read +and, optionally, +.Ar flags +.Pq a set of flags, described below . 
+.Pp +.Bl -tag -compact -width "b offset" +.It Sy b Ar offset Print block pointer -.RE - -.sp -.ne 2 -.na -\fBd\fR -.ad -.sp .6 -.RS 4n +.It Sy d Decompress the block -.RE - -.sp -.ne 2 -.na -\fBe\fR -.ad -.sp .6 -.RS 4n +.It Sy e Byte swap the block -.RE - -.sp -.ne 2 -.na -\fBg\fR -.ad -.sp .6 -.RS 4n +.It Sy g Dump gang block header -.RE - -.sp -.ne 2 -.na -\fBi\fR -.ad -.sp .6 -.RS 4n +.It Sy i Dump indirect block -.RE - -.sp -.ne 2 -.na -\fBr\fR -.ad -.sp .6 -.RS 4n +.It Sy r Dump raw uninterpreted block data -.RE -.RE - -.sp -.ne 2 -.na -\fB-s\fR -.ad -.sp .6 -.RS 4n -Report statistics on \fBzdb\fR\'s I/O. Display operation counts, bandwidth, -and error counts of I/O to the pool from \fBzdb\fR. -.RE - -.sp -.ne 2 -.na -\fB-S\fR -.ad -.sp .6 -.RS 4n +.El +.It Fl s +Report statistics on +.Nm zdb +I/O. +Display operation counts, bandwidth, and error counts of I/O to the pool from +.Nm . +.It Fl S Simulate the effects of deduplication, constructing a DDT and then display -that DDT as with \fB-DD\fR. -.RE - -.sp -.ne 2 -.na -\fB-u\fR -.ad -.sp .6 -.RS 4n +that DDT as with +.Fl DD . +.It Fl u Display the current uberblock. -.RE - -.P +.El +.Pp Other options: - -.sp -.ne 2 -.na -\fB-A\fR -.ad -.sp .6 -.RS 4n +.Bl -tag -width Ds +.It Fl A Do not abort should any assertion fail. -.RE - -.sp -.ne 2 -.na -\fB-AA\fR -.ad -.sp .6 -.RS 4n +.It Fl AA Enable panic recovery, certain errors which would otherwise be fatal are demoted to warnings. -.RE - -.sp -.ne 2 -.na -\fB-AAA\fR -.ad -.sp .6 -.RS 4n +.It Fl AAA Do not abort if asserts fail and also enable panic recovery. -.RE - -.sp -.ne 2 -.na -\fB-e\fR [-p \fIpath\fR]... -.ad -.sp .6 -.RS 4n -Operate on an exported pool, not present in \fB/etc/zfs/zpool.cache\fR. The -\fB-p\fR flag specifies the path under which devices are to be searched. -.RE - -.sp -.ne 2 -.na -\fB-x\fR \fIdumpdir\fR -.ad -.sp .6 -.RS 4n +.It Fl e Op Fl p Ar path ... +Operate on an exported pool, not present in +.Pa /etc/zfs/zpool.cache . +The +.Fl p +flag specifies the path under which devices are to be searched. +.It Fl x Ar dumpdir All blocks accessed will be copied to files in the specified directory. The blocks will be placed in sparse files whose name is the same as -that of the file or device read. zdb can be then run on the generated files. -Note that the \fB-bbc\fR flags are sufficient to access (and thus copy) +that of the file or device read. +.Nm +can be then run on the generated files. +Note that the +.Fl bbc +flags are sufficient to access +.Pq and thus copy all metadata on the pool. -.RE - -.sp -.ne 2 -.na -\fB-F\fR -.ad -.sp .6 -.RS 4n +.It Fl F Attempt to make an unreadable pool readable by trying progressively older transactions. -.RE - -.sp -.ne 2 -.na -\fB-G\fR -.ad -.sp .6 -.RS 4n -Dump the contents of the zfs_dbgmsg buffer before exiting zdb. zfs_dbgmsg is -a buffer used by ZFS to dump advanced debug information. -.RE - -.sp -.ne 2 -.na -\fB-I \fIinflight I/Os\fR \fR -.ad -.sp .6 -.RS 4n -Limit the number of outstanding checksum I/Os to the specified value. The -default value is 200. This option affects the performance of the \fB-c\fR +.It Fl G +Dump the contents of the zfs_dbgmsg buffer before exiting +.Nm . +zfs_dbgmsg is a buffer used by ZFS to dump advanced debug information. +.It Fl I Ar inflight I/Os +Limit the number of outstanding checksum I/Os to the specified value. +The default value is 200. +This option affects the performance of the +.Fl c option. -.RE - -.sp -.ne 2 -.na -\fB-o \fIvar\fR=\fIvalue\fR ... 
\fR -.ad -.sp .6 -.RS 4n -Set the given global libzpool variable to the provided value. The value must -be an unsigned 32-bit integer. Currently only little-endian systems are -supported to avoid accidentally setting the high 32 bits of 64-bit variables. -.RE - -.sp -.ne 2 -.na -\fB-P\fR -.ad -.sp .6 -.RS 4n +.It Fl o Ar var Ns = Ns Ar value ... +Set the given global libzpool variable to the provided value. +The value must be an unsigned 32-bit integer. +Currently only little-endian systems are supported to avoid accidentally setting +the high 32 bits of 64-bit variables. +.It Fl P Print numbers in an unscaled form more amenable to parsing, eg. 1000000 rather than 1M. -.RE - -.sp -.ne 2 -.na -\fB-t\fR \fItransaction\fR -.ad -.sp .6 -.RS 4n -Specify the highest transaction to use when searching for uberblocks. See also -the \fB-u\fR and \fB-l\fR options for a means to see the available uberblocks -and their associated transaction numbers. -.RE - -.sp -.ne 2 -.na -\fB-U\fR \fIcachefile\fR -.ad -.sp .6 -.RS 4n -Use a cache file other than \fB/etc/zfs/zpool.cache\fR. -.RE - -.sp -.ne 2 -.na -\fB-v\fR -.ad -.sp .6 -.RS 4n -Enable verbosity. Specify multiple times for increased verbosity. -.RE - -.sp -.ne 2 -.na -\fB-X\fR -.ad -.sp .6 -.RS 4n -Attempt \'extreme\' transaction rewind, that is attempt the same recovery as -\fB-F\fR but read transactions otherwise deemed too old. -.RE - -.P +.It Fl t Ar transaction +Specify the highest transaction to use when searching for uberblocks. +See also the +.Fl u +and +.Fl l +options for a means to see the available uberblocks and their associated +transaction numbers. +.It Fl U Ar cachefile +Use a cache file other than +.Pa /etc/zfs/zpool.cache . +.It Fl v +Enable verbosity. +Specify multiple times for increased verbosity. +.It Fl X +Attempt +.Qq extreme +transaction rewind, that is attempt the same recovery as +.Fl F +but read transactions otherwise deemed too old. +.El +.Pp Specifying a display option more than once enables verbosity for only that option, with more occurrences enabling more verbosity. -.P +.Pp If no options are specified, all information about the named pool will be displayed at default verbosity. - -.SH "EXAMPLES" -.LP -\fBExample 1 \fRDisplay the configuration of imported pool 'rpool' -.sp -.in +2 -.nf +.Sh EXAMPLES +.Bl -tag -width Ds +.It Xo +.Sy Example 1 +Display the configuration of imported pool +.Pa rpool +.Xc +.Bd -literal # zdb -C rpool MOS Configuration: version: 28 name: 'rpool' ... -.fi -.in -2 -.sp - -.LP -\fBExample 2 \fRDisplay basic dataset information about 'rpool' -.sp -.in +2 -.nf +.Ed +.It Xo +.Sy Example 2 +Display basic dataset information about +.Pa rpool +.Xc +.Bd -literal # zdb -d rpool Dataset mos [META], ID 0, cr_txg 4, 26.9M, 1051 objects Dataset rpool/swap [ZVOL], ID 59, cr_txg 356, 486M, 2 objects ... 
-.fi -.in -2 -.sp - -.LP -\fBExample 3 \fRDisplay basic information about object 0 in 'rpool/export/home' -.sp -.in +2 -.nf +.Ed +.It Xo +.Sy Example 3 +Display basic information about object 0 in +.Pa rpool/export/home +.Xc +.Bd -literal # zdb -d rpool/export/home 0 Dataset rpool/export/home [ZPL], ID 137, cr_txg 1546, 32K, 8 objects Object lvl iblk dblk dsize lsize %full type 0 7 16K 16K 15.0K 16K 25.00 DMU dnode -.fi -.in -2 -.sp - -.LP -\fBExample 4 \fRDisplay the predicted effect of enabling deduplication on 'rpool' -.sp -.in +2 -.nf +.Ed +.It Xo +.Sy Example 4 +Display the predicted effect of enabling deduplication on +.Pa rpool +.Xc +.Bd -literal # zdb -S rpool Simulated DDT histogram: -bucket allocated referenced +bucket allocated referenced ______ ______________________________ ______________________________ refcnt blocks LSIZE PSIZE DSIZE blocks LSIZE PSIZE DSIZE ------ ------ ----- ----- ----- ------ ----- ----- ----- @@ -558,9 +384,8 @@ refcnt blocks LSIZE PSIZE DSIZE blocks LSIZE PSIZE DSIZE 2 35.0K 1.33G 699M 699M 74.7K 2.79G 1.45G 1.45G ... dedup = 1.11, compress = 1.80, copies = 1.00, dedup * compress / copies = 2.00 -.fi -.in -2 -.sp - -.SH "SEE ALSO" -zfs(1M), zpool(1M) +.Ed +.El +.Sh SEE ALSO +.Xr zfs 1M , +.Xr zpool 1M diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index e626c5e340..c692fef20c 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -1357,41 +1357,6 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, mutex_exit(&dn->dn_dbufs_mtx); } -static int -dbuf_block_freeable(dmu_buf_impl_t *db) -{ - dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; - uint64_t birth_txg = 0; - - /* - * We don't need any locking to protect db_blkptr: - * If it's syncing, then db_last_dirty will be set - * so we'll ignore db_blkptr. - * - * This logic ensures that only block births for - * filled blocks are considered. - */ - ASSERT(MUTEX_HELD(&db->db_mtx)); - if (db->db_last_dirty && (db->db_blkptr == NULL || - !BP_IS_HOLE(db->db_blkptr))) { - birth_txg = db->db_last_dirty->dr_txg; - } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { - birth_txg = db->db_blkptr->blk_birth; - } - - /* - * If this block don't exist or is in a snapshot, it can't be freed. - * Don't pass the bp to dsl_dataset_block_freeable() since we - * are holding the db_mtx lock and might deadlock if we are - * prefetching a dedup-ed block. - */ - if (birth_txg != 0) - return (ds == NULL || - dsl_dataset_block_freeable(ds, NULL, birth_txg)); - else - return (B_FALSE); -} - void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) { @@ -1441,7 +1406,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) } mutex_exit(&db->db_mtx); - dnode_willuse_space(dn, size-osize, tx); + dmu_objset_willuse_space(dn->dn_objset, size - osize, tx); DB_DNODE_EXIT(db); } @@ -1491,7 +1456,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) objset_t *os; dbuf_dirty_record_t **drp, *dr; int drop_struct_lock = FALSE; - boolean_t do_free_accounting = B_FALSE; int txgoff = tx->tx_txg & TXG_MASK; ASSERT(tx->tx_txg != 0); @@ -1613,15 +1577,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); if (db->db_blkid != DMU_BONUS_BLKID) { - /* - * Update the accounting. - * Note: we delay "free accounting" until after we drop - * the db_mtx. This keeps us from grabbing other locks - * (and possibly deadlocking) in bp_get_dsize() while - * also holding the db_mtx. 
- */ - dnode_willuse_space(dn, db->db.db_size, tx); - do_free_accounting = dbuf_block_freeable(db); + dmu_objset_willuse_space(os, db->db.db_size, tx); } /* @@ -1714,21 +1670,13 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) drop_struct_lock = TRUE; } - if (do_free_accounting) { - blkptr_t *bp = db->db_blkptr; - int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? - bp_get_dsize(os->os_spa, bp) : db->db.db_size; - /* - * This is only a guess -- if the dbuf is dirty - * in a previous txg, we don't know how much - * space it will use on disk yet. We should - * really have the struct_rwlock to access - * db_blkptr, but since this is just a guess, - * it's OK if we get an odd answer. - */ - ddt_prefetch(os->os_spa, bp); - dnode_willuse_space(dn, -willfree, tx); - } + /* + * If we are overwriting a dedup BP, then unless it is snapshotted, + * when we get to syncing context we will need to decrement its + * refcount in the DDT. Prefetch the relevant DDT block so that + * syncing context won't have to wait for the i/o. + */ + ddt_prefetch(os->os_spa, db->db_blkptr); if (db->db_level == 0) { dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); @@ -2938,19 +2886,6 @@ dmu_buf_user_evict_wait() taskq_wait(dbu_evict_taskq); } -boolean_t -dmu_buf_freeable(dmu_buf_t *dbuf) -{ - boolean_t res = B_FALSE; - dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; - - if (db->db_blkptr) - res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, - db->db_blkptr, db->db_blkptr->blk_birth); - - return (res); -} - blkptr_t * dmu_buf_get_blkptr(dmu_buf_t *db) { diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c index a2322a90cc..6ef45db8ab 100644 --- a/usr/src/uts/common/fs/zfs/dmu_objset.c +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c @@ -2103,3 +2103,20 @@ dmu_fsname(const char *snapname, char *buf) (void) strlcpy(buf, snapname, atp - snapname + 1); return (0); } + +/* + * Call when we think we're going to write/free space in open context to track + * the amount of dirty data in the open txg, which is also the amount + * of memory that can not be evicted until this txg syncs. 
+ */ +void +dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = os->os_dsl_dataset; + int64_t aspace = spa_get_worst_case_asize(os->os_spa, space); + + if (ds != NULL) { + dsl_dir_willuse_space(ds->ds_dir, aspace, tx); + dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx); + } +} diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c index dadd0083cd..88748595b6 100644 --- a/usr/src/uts/common/fs/zfs/dmu_tx.c +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c @@ -30,10 +30,10 @@ #include <sys/dbuf.h> #include <sys/dmu_tx.h> #include <sys/dmu_objset.h> -#include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */ -#include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */ +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> #include <sys/dsl_pool.h> -#include <sys/zap_impl.h> /* for fzap_default_block_shift */ +#include <sys/zap_impl.h> #include <sys/spa.h> #include <sys/sa.h> #include <sys/sa_impl.h> @@ -56,10 +56,6 @@ dmu_tx_create_dd(dsl_dir_t *dd) list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), offsetof(dmu_tx_callback_t, dcb_node)); tx->tx_start = gethrtime(); -#ifdef ZFS_DEBUG - refcount_create(&tx->tx_space_written); - refcount_create(&tx->tx_space_freed); -#endif return (tx); } @@ -68,7 +64,6 @@ dmu_tx_create(objset_t *os) { dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); tx->tx_objset = os; - tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset); return (tx); } @@ -130,16 +125,10 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, txh->txh_tx = tx; txh->txh_dnode = dn; refcount_create(&txh->txh_space_towrite); - refcount_create(&txh->txh_space_tofree); - refcount_create(&txh->txh_space_tooverwrite); - refcount_create(&txh->txh_space_tounref); refcount_create(&txh->txh_memory_tohold); - refcount_create(&txh->txh_fudge); -#ifdef ZFS_DEBUG txh->txh_type = type; txh->txh_arg1 = arg1; txh->txh_arg2 = arg2; -#endif list_insert_tail(&tx->tx_holds, txh); return (txh); @@ -158,6 +147,34 @@ dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object) } } +/* + * This function reads specified data from disk. The specified data will + * be needed to perform the transaction -- i.e, it will be read after + * we do dmu_tx_assign(). There are two reasons that we read the data now + * (before dmu_tx_assign()): + * + * 1. Reading it now has potentially better performance. The transaction + * has not yet been assigned, so the TXG is not held open, and also the + * caller typically has less locks held when calling dmu_tx_hold_*() than + * after the transaction has been assigned. This reduces the lock (and txg) + * hold times, thus reducing lock contention. + * + * 2. It is easier for callers (primarily the ZPL) to handle i/o errors + * that are detected before they start making changes to the DMU state + * (i.e. now). Once the transaction has been assigned, and some DMU + * state has been changed, it can be difficult to recover from an i/o + * error (e.g. to undo the changes already made in memory at the DMU + * layer). Typically code to do so does not exist in the caller -- it + * assumes that the data has already been cached and thus i/o errors are + * not possible. + * + * It has been observed that the i/o initiated here can be a performance + * problem, and it appears to be optional, because we don't look at the + * data which is read. 
However, removing this read would only serve to + * move the work elsewhere (after the dmu_tx_assign()), where it may + * have a greater impact on performance (in addition to the impact on + * fault tolerance noted above). + */ static int dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) { @@ -174,63 +191,11 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) return (err); } -static void -dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db, - int level, uint64_t blkid, boolean_t freeable, uint64_t *history) -{ - objset_t *os = dn->dn_objset; - dsl_dataset_t *ds = os->os_dsl_dataset; - int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - dmu_buf_impl_t *parent = NULL; - blkptr_t *bp = NULL; - uint64_t space; - - if (level >= dn->dn_nlevels || history[level] == blkid) - return; - - history[level] = blkid; - - space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift); - - if (db == NULL || db == dn->dn_dbuf) { - ASSERT(level != 0); - db = NULL; - } else { - ASSERT(DB_DNODE(db) == dn); - ASSERT(db->db_level == level); - ASSERT(db->db.db_size == space); - ASSERT(db->db_blkid == blkid); - bp = db->db_blkptr; - parent = db->db_parent; - } - - freeable = (bp && (freeable || - dsl_dataset_block_freeable(ds, bp, bp->blk_birth))); - - if (freeable) { - (void) refcount_add_many(&txh->txh_space_tooverwrite, - space, FTAG); - } else { - (void) refcount_add_many(&txh->txh_space_towrite, - space, FTAG); - } - - if (bp) { - (void) refcount_add_many(&txh->txh_space_tounref, - bp_get_dsize(os->os_spa, bp), FTAG); - } - - dmu_tx_count_twig(txh, dn, parent, level + 1, - blkid >> epbs, freeable, history); -} - /* ARGSUSED */ static void dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { dnode_t *dn = txh->txh_dnode; - uint64_t start, end, i; - int min_bs, max_bs, min_ibs, max_ibs, epbs, bits; int err = 0; if (len == 0) @@ -238,197 +203,74 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) zfs_zone_io_throttle(ZFS_ZONE_IOP_LOGICAL_WRITE); - min_bs = SPA_MINBLOCKSHIFT; - max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1; - min_ibs = DN_MIN_INDBLKSHIFT; - max_ibs = DN_MAX_INDBLKSHIFT; - - if (dn) { - uint64_t history[DN_MAX_LEVELS]; - int nlvls = dn->dn_nlevels; - int delta; + (void) refcount_add_many(&txh->txh_space_towrite, len, FTAG); - /* - * For i/o error checking, read the first and last level-0 - * blocks (if they are not aligned), and all the level-1 blocks. - */ - if (dn->dn_maxblkid == 0) { - delta = dn->dn_datablksz; - start = (off < dn->dn_datablksz) ? 0 : 1; - end = (off+len <= dn->dn_datablksz) ? 
0 : 1; - if (start == 0 && (off > 0 || len < dn->dn_datablksz)) { - err = dmu_tx_check_ioerr(NULL, dn, 0, 0); - if (err) - goto out; - delta -= off; - } - } else { - zio_t *zio = zio_root(dn->dn_objset->os_spa, - NULL, NULL, ZIO_FLAG_CANFAIL); - - /* first level-0 block */ - start = off >> dn->dn_datablkshift; - if (P2PHASE(off, dn->dn_datablksz) || - len < dn->dn_datablksz) { - err = dmu_tx_check_ioerr(zio, dn, 0, start); - if (err) - goto out; - } + if (refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS) + err = SET_ERROR(EFBIG); - /* last level-0 block */ - end = (off+len-1) >> dn->dn_datablkshift; - if (end != start && end <= dn->dn_maxblkid && - P2PHASE(off+len, dn->dn_datablksz)) { - err = dmu_tx_check_ioerr(zio, dn, 0, end); - if (err) - goto out; - } + if (dn == NULL) + return; - /* level-1 blocks */ - if (nlvls > 1) { - int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - for (i = (start>>shft)+1; i < end>>shft; i++) { - err = dmu_tx_check_ioerr(zio, dn, 1, i); - if (err) - goto out; - } + /* + * For i/o error checking, read the blocks that will be needed + * to perform the write: the first and last level-0 blocks (if + * they are not aligned, i.e. if they are partial-block writes), + * and all the level-1 blocks. + */ + if (dn->dn_maxblkid == 0) { + if (off < dn->dn_datablksz && + (off > 0 || len < dn->dn_datablksz)) { + err = dmu_tx_check_ioerr(NULL, dn, 0, 0); + if (err != 0) { + txh->txh_tx->tx_err = err; } - - err = zio_wait(zio); - if (err) - goto out; - delta = P2NPHASE(off, dn->dn_datablksz); - } - - min_ibs = max_ibs = dn->dn_indblkshift; - if (dn->dn_maxblkid > 0) { - /* - * The blocksize can't change, - * so we can make a more precise estimate. - */ - ASSERT(dn->dn_datablkshift != 0); - min_bs = max_bs = dn->dn_datablkshift; - } else { - /* - * The blocksize can increase up to the recordsize, - * or if it is already more than the recordsize, - * up to the next power of 2. - */ - min_bs = highbit64(dn->dn_datablksz - 1); - max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1)); } + } else { + zio_t *zio = zio_root(dn->dn_objset->os_spa, + NULL, NULL, ZIO_FLAG_CANFAIL); - /* - * If this write is not off the end of the file - * we need to account for overwrites/unref. - */ - if (start <= dn->dn_maxblkid) { - for (int l = 0; l < DN_MAX_LEVELS; l++) - history[l] = -1ULL; + /* first level-0 block */ + uint64_t start = off >> dn->dn_datablkshift; + if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { + err = dmu_tx_check_ioerr(zio, dn, 0, start); + if (err != 0) { + txh->txh_tx->tx_err = err; + } } - while (start <= dn->dn_maxblkid) { - dmu_buf_impl_t *db; - rw_enter(&dn->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(dn, 0, start, - FALSE, FALSE, FTAG, &db); - rw_exit(&dn->dn_struct_rwlock); - - if (err) { + /* last level-0 block */ + uint64_t end = (off + len - 1) >> dn->dn_datablkshift; + if (end != start && end <= dn->dn_maxblkid && + P2PHASE(off + len, dn->dn_datablksz)) { + err = dmu_tx_check_ioerr(zio, dn, 0, end); + if (err != 0) { txh->txh_tx->tx_err = err; - return; } + } - dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE, - history); - dbuf_rele(db, FTAG); - if (++start > end) { - /* - * Account for new indirects appearing - * before this IO gets assigned into a txg. 
- */ - bits = 64 - min_bs; - epbs = min_ibs - SPA_BLKPTRSHIFT; - for (bits -= epbs * (nlvls - 1); - bits >= 0; bits -= epbs) { - (void) refcount_add_many( - &txh->txh_fudge, - 1ULL << max_ibs, FTAG); + /* level-1 blocks */ + if (dn->dn_nlevels > 1) { + int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + for (uint64_t i = (start >> shft) + 1; + i < end >> shft; i++) { + err = dmu_tx_check_ioerr(zio, dn, 1, i); + if (err != 0) { + txh->txh_tx->tx_err = err; } - goto out; } - off += delta; - if (len >= delta) - len -= delta; - delta = dn->dn_datablksz; } - } - - /* - * 'end' is the last thing we will access, not one past. - * This way we won't overflow when accessing the last byte. - */ - start = P2ALIGN(off, 1ULL << max_bs); - end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1; - (void) refcount_add_many(&txh->txh_space_towrite, - end - start + 1, FTAG); - - start >>= min_bs; - end >>= min_bs; - - epbs = min_ibs - SPA_BLKPTRSHIFT; - /* - * The object contains at most 2^(64 - min_bs) blocks, - * and each indirect level maps 2^epbs. - */ - for (bits = 64 - min_bs; bits >= 0; bits -= epbs) { - start >>= epbs; - end >>= epbs; - ASSERT3U(end, >=, start); - (void) refcount_add_many(&txh->txh_space_towrite, - (end - start + 1) << max_ibs, FTAG); - if (start != 0) { - /* - * We also need a new blkid=0 indirect block - * to reference any existing file data. - */ - (void) refcount_add_many(&txh->txh_space_towrite, - 1ULL << max_ibs, FTAG); + err = zio_wait(zio); + if (err != 0) { + txh->txh_tx->tx_err = err; } } - -out: - if (refcount_count(&txh->txh_space_towrite) + - refcount_count(&txh->txh_space_tooverwrite) > - 2 * DMU_MAX_ACCESS) - err = SET_ERROR(EFBIG); - - if (err) - txh->txh_tx->tx_err = err; } static void dmu_tx_count_dnode(dmu_tx_hold_t *txh) { - dnode_t *dn = txh->txh_dnode; - dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset); - uint64_t space = mdn->dn_datablksz + - ((mdn->dn_nlevels-1) << mdn->dn_indblkshift); - - if (dn && dn->dn_dbuf->db_blkptr && - dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, - dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) { - (void) refcount_add_many(&txh->txh_space_tooverwrite, - space, FTAG); - (void) refcount_add_many(&txh->txh_space_tounref, space, FTAG); - } else { - (void) refcount_add_many(&txh->txh_space_towrite, space, FTAG); - if (dn && dn->dn_dbuf->db_blkptr) { - (void) refcount_add_many(&txh->txh_space_tounref, - space, FTAG); - } - } + (void) refcount_add_many(&txh->txh_space_towrite, DNODE_SIZE, FTAG); } void @@ -436,8 +278,8 @@ dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) { dmu_tx_hold_t *txh; - ASSERT(tx->tx_txg == 0); - ASSERT(len < DMU_MAX_ACCESS); + ASSERT0(tx->tx_txg); + ASSERT3U(len, <=, DMU_MAX_ACCESS); ASSERT(len == 0 || UINT64_MAX - off >= len - 1); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, @@ -449,179 +291,6 @@ dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) dmu_tx_count_dnode(txh); } -static void -dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) -{ - uint64_t blkid, nblks, lastblk; - uint64_t space = 0, unref = 0, skipped = 0; - dnode_t *dn = txh->txh_dnode; - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - spa_t *spa = txh->txh_tx->tx_pool->dp_spa; - int epbs; - uint64_t l0span = 0, nl1blks = 0; - - if (dn->dn_nlevels == 0) - return; - - /* - * The struct_rwlock protects us against dn_nlevels - * changing, in case (against all odds) we manage to dirty & - * sync out the changes after we check for being dirty. 
- * Also, dbuf_hold_impl() wants us to have the struct_rwlock. - */ - rw_enter(&dn->dn_struct_rwlock, RW_READER); - epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - if (dn->dn_maxblkid == 0) { - if (off == 0 && len >= dn->dn_datablksz) { - blkid = 0; - nblks = 1; - } else { - rw_exit(&dn->dn_struct_rwlock); - return; - } - } else { - blkid = off >> dn->dn_datablkshift; - nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift; - - if (blkid > dn->dn_maxblkid) { - rw_exit(&dn->dn_struct_rwlock); - return; - } - if (blkid + nblks > dn->dn_maxblkid) - nblks = dn->dn_maxblkid - blkid + 1; - - } - l0span = nblks; /* save for later use to calc level > 1 overhead */ - if (dn->dn_nlevels == 1) { - int i; - for (i = 0; i < nblks; i++) { - blkptr_t *bp = dn->dn_phys->dn_blkptr; - ASSERT3U(blkid + i, <, dn->dn_nblkptr); - bp += blkid + i; - if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) { - dprintf_bp(bp, "can free old%s", ""); - space += bp_get_dsize(spa, bp); - } - unref += BP_GET_ASIZE(bp); - } - nl1blks = 1; - nblks = 0; - } - - lastblk = blkid + nblks - 1; - while (nblks) { - dmu_buf_impl_t *dbuf; - uint64_t ibyte, new_blkid; - int epb = 1 << epbs; - int err, i, blkoff, tochk; - blkptr_t *bp; - - ibyte = blkid << dn->dn_datablkshift; - err = dnode_next_offset(dn, - DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0); - new_blkid = ibyte >> dn->dn_datablkshift; - if (err == ESRCH) { - skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; - break; - } - if (err) { - txh->txh_tx->tx_err = err; - break; - } - if (new_blkid > lastblk) { - skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; - break; - } - - if (new_blkid > blkid) { - ASSERT((new_blkid >> epbs) > (blkid >> epbs)); - skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1; - nblks -= new_blkid - blkid; - blkid = new_blkid; - } - blkoff = P2PHASE(blkid, epb); - tochk = MIN(epb - blkoff, nblks); - - err = dbuf_hold_impl(dn, 1, blkid >> epbs, - FALSE, FALSE, FTAG, &dbuf); - if (err) { - txh->txh_tx->tx_err = err; - break; - } - - (void) refcount_add_many(&txh->txh_memory_tohold, - dbuf->db.db_size, FTAG); - - /* - * We don't check memory_tohold against DMU_MAX_ACCESS because - * memory_tohold is an over-estimation (especially the >L1 - * indirect blocks), so it could fail. Callers should have - * already verified that they will not be holding too much - * memory. - */ - - err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL); - if (err != 0) { - txh->txh_tx->tx_err = err; - dbuf_rele(dbuf, FTAG); - break; - } - - bp = dbuf->db.db_data; - bp += blkoff; - - for (i = 0; i < tochk; i++) { - if (dsl_dataset_block_freeable(ds, &bp[i], - bp[i].blk_birth)) { - dprintf_bp(&bp[i], "can free old%s", ""); - space += bp_get_dsize(spa, &bp[i]); - } - unref += BP_GET_ASIZE(bp); - } - dbuf_rele(dbuf, FTAG); - - ++nl1blks; - blkid += tochk; - nblks -= tochk; - } - rw_exit(&dn->dn_struct_rwlock); - - /* - * Add in memory requirements of higher-level indirects. - * This assumes a worst-possible scenario for dn_nlevels and a - * worst-possible distribution of l1-blocks over the region to free. - */ - { - uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs); - int level = 2; - /* - * Here we don't use DN_MAX_LEVEL, but calculate it with the - * given datablkshift and indblkshift. This makes the - * difference between 19 and 8 on large files. 
- */ - int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) / - (dn->dn_indblkshift - SPA_BLKPTRSHIFT); - - while (level++ < maxlevel) { - (void) refcount_add_many(&txh->txh_memory_tohold, - MAX(MIN(blkcnt, nl1blks), 1) << dn->dn_indblkshift, - FTAG); - blkcnt = 1 + (blkcnt >> epbs); - } - } - - /* account for new level 1 indirect blocks that might show up */ - if (skipped > 0) { - (void) refcount_add_many(&txh->txh_fudge, - skipped << dn->dn_indblkshift, FTAG); - skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs); - (void) refcount_add_many(&txh->txh_memory_tohold, - skipped << dn->dn_indblkshift, FTAG); - } - (void) refcount_add_many(&txh->txh_space_tofree, space, FTAG); - (void) refcount_add_many(&txh->txh_space_tounref, unref, FTAG); -} - /* * This function marks the transaction as being a "net free". The end * result is that refquotas will be disabled for this transaction, and @@ -633,45 +302,27 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) void dmu_tx_mark_netfree(dmu_tx_t *tx) { - dmu_tx_hold_t *txh; - - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, - DMU_NEW_OBJECT, THT_FREE, 0, 0); - - /* - * Pretend that this operation will free 1GB of space. This - * should be large enough to cancel out the largest write. - * We don't want to use something like UINT64_MAX, because that would - * cause overflows when doing math with these values (e.g. in - * dmu_tx_try_assign()). - */ - (void) refcount_add_many(&txh->txh_space_tofree, - 1024 * 1024 * 1024, FTAG); - (void) refcount_add_many(&txh->txh_space_tounref, - 1024 * 1024 * 1024, FTAG); + tx->tx_netfree = B_TRUE; } void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) { - dmu_tx_hold_t *txh; - dnode_t *dn; int err; - zio_t *zio; ASSERT(tx->tx_txg == 0); - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_FREE, off, len); if (txh == NULL) return; - dn = txh->txh_dnode; + dnode_t *dn = txh->txh_dnode; dmu_tx_count_dnode(txh); - if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) + if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz) return; if (len == DMU_OBJECT_END) - len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off; + len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off; /* * For i/o error checking, we read the first and last level-0 @@ -691,7 +342,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) dmu_tx_count_write(txh, off, 1); /* last block will be modified if it is not aligned */ if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift)) - dmu_tx_count_write(txh, off+len, 1); + dmu_tx_count_write(txh, off + len, 1); } /* @@ -713,7 +364,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) if (dn->dn_datablkshift == 0) start = end = 0; - zio = zio_root(tx->tx_pool->dp_spa, + zio_t *zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); for (uint64_t i = start; i <= end; i++) { uint64_t ibyte = i << shift; @@ -721,127 +372,80 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) i = ibyte >> shift; if (err == ESRCH || i > end) break; - if (err) { + if (err != 0) { tx->tx_err = err; + (void) zio_wait(zio); return; } + (void) refcount_add_many(&txh->txh_memory_tohold, + 1 << dn->dn_indblkshift, FTAG); + err = dmu_tx_check_ioerr(zio, dn, 1, i); - if (err) { + if (err != 0) { tx->tx_err = err; + (void) zio_wait(zio); return; } } err = zio_wait(zio); - if (err) { + if (err != 0) { tx->tx_err = err; return; } } - 
- dmu_tx_count_free(txh, off, len); } void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) { - dmu_tx_hold_t *txh; - dnode_t *dn; int err; ASSERT(tx->tx_txg == 0); - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_ZAP, add, (uintptr_t)name); if (txh == NULL) return; - dn = txh->txh_dnode; + dnode_t *dn = txh->txh_dnode; dmu_tx_count_dnode(txh); - if (dn == NULL) { - /* - * We will be able to fit a new object's entries into one leaf - * block. So there will be at most 2 blocks total, - * including the header block. - */ - dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift); + /* + * Modifying a almost-full microzap is around the worst case (128KB) + * + * If it is a fat zap, the worst case would be 7*16KB=112KB: + * - 3 blocks overwritten: target leaf, ptrtbl block, header block + * - 4 new blocks written if adding: + * - 2 blocks for possibly split leaves, + * - 2 grown ptrtbl blocks + */ + (void) refcount_add_many(&txh->txh_space_towrite, + MZAP_MAX_BLKSZ, FTAG); + + if (dn == NULL) return; - } ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); - if (dn->dn_maxblkid == 0 && !add) { - blkptr_t *bp; - + if (dn->dn_maxblkid == 0 || name == NULL) { /* - * If there is only one block (i.e. this is a micro-zap) - * and we are not adding anything, the accounting is simple. + * This is a microzap (only one block), or we don't know + * the name. Check the first block for i/o errors. */ err = dmu_tx_check_ioerr(NULL, dn, 0, 0); - if (err) { + if (err != 0) { tx->tx_err = err; - return; - } - - /* - * Use max block size here, since we don't know how much - * the size will change between now and the dbuf dirty call. - */ - bp = &dn->dn_phys->dn_blkptr[0]; - if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, - bp, bp->blk_birth)) { - (void) refcount_add_many(&txh->txh_space_tooverwrite, - MZAP_MAX_BLKSZ, FTAG); - } else { - (void) refcount_add_many(&txh->txh_space_towrite, - MZAP_MAX_BLKSZ, FTAG); - } - if (!BP_IS_HOLE(bp)) { - (void) refcount_add_many(&txh->txh_space_tounref, - MZAP_MAX_BLKSZ, FTAG); } - return; - } - - if (dn->dn_maxblkid > 0 && name) { + } else { /* - * access the name in this fat-zap so that we'll check - * for i/o errors to the leaf blocks, etc. + * Access the name so that we'll check for i/o errors to + * the leaf blocks, etc. We ignore ENOENT, as this name + * may not yet exist. */ err = zap_lookup_by_dnode(dn, name, 8, 0, NULL); - if (err == EIO) { + if (err == EIO || err == ECKSUM || err == ENXIO) { tx->tx_err = err; - return; - } - } - - err = zap_count_write_by_dnode(dn, name, add, - &txh->txh_space_towrite, &txh->txh_space_tooverwrite); - - /* - * If the modified blocks are scattered to the four winds, - * we'll have to modify an indirect twig for each. 
We can make - * modifications at up to 3 locations: - * - header block at the beginning of the object - * - target leaf block - * - end of the object, where we might need to write: - * - a new leaf block if the target block needs to be split - * - the new pointer table, if it is growing - * - the new cookie table, if it is growing - */ - int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - dsl_dataset_phys_t *ds_phys = - dsl_dataset_phys(dn->dn_objset->os_dsl_dataset); - for (int lvl = 1; lvl < dn->dn_nlevels; lvl++) { - uint64_t num_indirects = 1 + (dn->dn_maxblkid >> (epbs * lvl)); - uint64_t spc = MIN(3, num_indirects) << dn->dn_indblkshift; - if (ds_phys->ds_prev_snap_obj != 0) { - (void) refcount_add_many(&txh->txh_space_towrite, - spc, FTAG); - } else { - (void) refcount_add_many(&txh->txh_space_tooverwrite, - spc, FTAG); } } } @@ -871,42 +475,15 @@ dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) (void) refcount_add_many(&txh->txh_space_towrite, space, FTAG); } -int -dmu_tx_holds(dmu_tx_t *tx, uint64_t object) -{ - dmu_tx_hold_t *txh; - int holds = 0; - - /* - * By asserting that the tx is assigned, we're counting the - * number of dn_tx_holds, which is the same as the number of - * dn_holds. Otherwise, we'd be counting dn_holds, but - * dn_tx_holds could be 0. - */ - ASSERT(tx->tx_txg != 0); - - /* if (tx->tx_anyobj == TRUE) */ - /* return (0); */ - - for (txh = list_head(&tx->tx_holds); txh; - txh = list_next(&tx->tx_holds, txh)) { - if (txh->txh_dnode && txh->txh_dnode->dn_object == object) - holds++; - } - - return (holds); -} - #ifdef ZFS_DEBUG void dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) { - dmu_tx_hold_t *txh; - int match_object = FALSE, match_offset = FALSE; - dnode_t *dn; + boolean_t match_object = B_FALSE; + boolean_t match_offset = B_FALSE; DB_DNODE_ENTER(db); - dn = DB_DNODE(db); + dnode_t *dn = DB_DNODE(db); ASSERT(tx->tx_txg != 0); ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); ASSERT3U(dn->dn_object, ==, db->db.db_object); @@ -922,7 +499,7 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) return; } - for (txh = list_head(&tx->tx_holds); txh; + for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; txh = list_next(&tx->tx_holds, txh)) { ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg); if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) @@ -1154,13 +731,49 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) #endif } +/* + * This routine attempts to assign the transaction to a transaction group. + * To do so, we must determine if there is sufficient free space on disk. + * + * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree() + * on it), then it is assumed that there is sufficient free space, + * unless there's insufficient slop space in the pool (see the comment + * above spa_slop_shift in spa_misc.c). + * + * If it is not a "netfree" transaction, then if the data already on disk + * is over the allowed usage (e.g. quota), this will fail with EDQUOT or + * ENOSPC. Otherwise, if the current rough estimate of pending changes, + * plus the rough estimate of this transaction's changes, may exceed the + * allowed usage, then this will fail with ERESTART, which will cause the + * caller to wait for the pending changes to be written to disk (by waiting + * for the next TXG to open), and then check the space usage again. 
+ * + * The rough estimate of pending changes is comprised of the sum of: + * + * - this transaction's holds' txh_space_towrite + * + * - dd_tempreserved[], which is the sum of in-flight transactions' + * holds' txh_space_towrite (i.e. those transactions that have called + * dmu_tx_assign() but not yet called dmu_tx_commit()). + * + * - dd_space_towrite[], which is the amount of dirtied dbufs. + * + * Note that all of these values are inflated by spa_get_worst_case_asize(), + * which means that we may get ERESTART well before we are actually in danger + * of running out of space, but this also mitigates any small inaccuracies + * in the rough estimate (e.g. txh_space_towrite doesn't take into account + * indirect blocks, and dd_space_towrite[] doesn't take into account changes + * to the MOS). + * + * Note that due to this algorithm, it is possible to exceed the allowed + * usage by one transaction. Also, as we approach the allowed usage, + * we will allow a very limited amount of changes into each TXG, thus + * decreasing performance. + */ static int dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how) { - dmu_tx_hold_t *txh; spa_t *spa = tx->tx_pool->dp_spa; - uint64_t memory, asize, fsize, usize; - uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge; ASSERT0(tx->tx_txg); @@ -1199,8 +812,9 @@ dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how) * dmu_tx_unassign() logic. */ - towrite = tofree = tooverwrite = tounref = tohold = fudge = 0; - for (txh = list_head(&tx->tx_holds); txh; + uint64_t towrite = 0; + uint64_t tohold = 0; + for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; if (dn != NULL) { @@ -1217,50 +831,18 @@ dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how) mutex_exit(&dn->dn_mtx); } towrite += refcount_count(&txh->txh_space_towrite); - tofree += refcount_count(&txh->txh_space_tofree); - tooverwrite += refcount_count(&txh->txh_space_tooverwrite); - tounref += refcount_count(&txh->txh_space_tounref); tohold += refcount_count(&txh->txh_memory_tohold); - fudge += refcount_count(&txh->txh_fudge); - } - - /* - * If a snapshot has been taken since we made our estimates, - * assume that we won't be able to free or overwrite anything. - */ - if (tx->tx_objset && - dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) > - tx->tx_lastsnap_txg) { - towrite += tooverwrite; - tooverwrite = tofree = 0; } /* needed allocation: worst-case estimate of write space */ - asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite); - /* freed space estimate: worst-case overwrite + free estimate */ - fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree; - /* convert unrefd space to worst-case estimate */ - usize = spa_get_asize(tx->tx_pool->dp_spa, tounref); + uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite); /* calculate memory footprint estimate */ - memory = towrite + tooverwrite + tohold; - -#ifdef ZFS_DEBUG - /* - * Add in 'tohold' to account for our dirty holds on this memory - * XXX - the "fudge" factor is to account for skipped blocks that - * we missed because dnode_next_offset() misses in-core-only blocks. 
- */ - tx->tx_space_towrite = asize + - spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge); - tx->tx_space_tofree = tofree; - tx->tx_space_tooverwrite = tooverwrite; - tx->tx_space_tounref = tounref; -#endif + uint64_t memory = towrite + tohold; - if (tx->tx_dir && asize != 0) { + if (tx->tx_dir != NULL && asize != 0) { int err = dsl_dir_tempreserve_space(tx->tx_dir, memory, - asize, fsize, usize, &tx->tx_tempreserve_cookie, tx); - if (err) + asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx); + if (err != 0) return (err); } @@ -1270,8 +852,6 @@ dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how) static void dmu_tx_unassign(dmu_tx_t *tx) { - dmu_tx_hold_t *txh; - if (tx->tx_txg == 0) return; @@ -1281,7 +861,8 @@ dmu_tx_unassign(dmu_tx_t *tx) * Walk the transaction's hold list, removing the hold on the * associated dnode, and notifying waiters if the refcount drops to 0. */ - for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh; + for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); + txh != tx->tx_needassign_txh; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; @@ -1407,23 +988,6 @@ dmu_tx_wait(dmu_tx_t *tx) } } -void -dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta) -{ -#ifdef ZFS_DEBUG - if (tx->tx_dir == NULL || delta == 0) - return; - - if (delta > 0) { - ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=, - tx->tx_space_towrite); - (void) refcount_add_many(&tx->tx_space_written, delta, NULL); - } else { - (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL); - } -#endif -} - static void dmu_tx_destroy(dmu_tx_t *tx) { @@ -1435,16 +999,8 @@ dmu_tx_destroy(dmu_tx_t *tx) list_remove(&tx->tx_holds, txh); refcount_destroy_many(&txh->txh_space_towrite, refcount_count(&txh->txh_space_towrite)); - refcount_destroy_many(&txh->txh_space_tofree, - refcount_count(&txh->txh_space_tofree)); - refcount_destroy_many(&txh->txh_space_tooverwrite, - refcount_count(&txh->txh_space_tooverwrite)); - refcount_destroy_many(&txh->txh_space_tounref, - refcount_count(&txh->txh_space_tounref)); refcount_destroy_many(&txh->txh_memory_tohold, refcount_count(&txh->txh_memory_tohold)); - refcount_destroy_many(&txh->txh_fudge, - refcount_count(&txh->txh_fudge)); kmem_free(txh, sizeof (dmu_tx_hold_t)); if (dn != NULL) dnode_rele(dn, tx); @@ -1452,12 +1008,6 @@ dmu_tx_destroy(dmu_tx_t *tx) list_destroy(&tx->tx_callbacks); list_destroy(&tx->tx_holds); -#ifdef ZFS_DEBUG - refcount_destroy_many(&tx->tx_space_written, - refcount_count(&tx->tx_space_written)); - refcount_destroy_many(&tx->tx_space_freed, - refcount_count(&tx->tx_space_freed)); -#endif kmem_free(tx, sizeof (dmu_tx_t)); } @@ -1496,11 +1046,6 @@ dmu_tx_commit(dmu_tx_t *tx) if (tx->tx_anyobj == FALSE) txg_rele_to_sync(&tx->tx_txgh); -#ifdef ZFS_DEBUG - dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n", - tx->tx_space_towrite, refcount_count(&tx->tx_space_written), - tx->tx_space_tofree, refcount_count(&tx->tx_space_freed)); -#endif dmu_tx_destroy(tx); } @@ -1532,7 +1077,6 @@ dmu_tx_pool(dmu_tx_t *tx) return (tx->tx_pool); } - void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) { @@ -1578,12 +1122,10 @@ dmu_tx_do_callbacks(list_t *cb_list, int error) static void dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) { - int i; - if (!sa->sa_need_attr_registration) return; - for (i = 0; i != sa->sa_num_attrs; i++) { + for (int i = 0; i != sa->sa_num_attrs; i++) { if (!sa->sa_attr_table[i].sa_registered) { if (sa->sa_reg_attr_obj) dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj, 
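The rewritten dmu_tx_count_write() above drops the indirect-tree walk and only decides which level-0 blocks must be read for i/o error checking: the first and last blocks of the range, and only when they are partially overwritten. As a rough stand-alone illustration (not illumos code: check_write(), the 8K example block size, and the local copy of P2PHASE are assumptions, and the dn_maxblkid clamp in the real code is omitted), the test reduces to:

/*
 * Hypothetical illustration only: which level-0 blocks of an 8K-block
 * object would need a read before a write of [off, off+len)?
 */
#include <stdio.h>
#include <stdint.h>

/* local copy of the sys/sysmacros.h definition */
#define	P2PHASE(x, align)	((x) & ((align) - 1))

static void
check_write(uint64_t off, uint64_t len, uint64_t blksz, int blkshift)
{
	uint64_t start = off >> blkshift;
	uint64_t end = (off + len - 1) >> blkshift;

	/* a block needs a read only if the write partially overwrites it */
	int read_first = (P2PHASE(off, blksz) != 0 || len < blksz);
	int read_last = (end != start && P2PHASE(off + len, blksz) != 0);

	printf("off=%llu len=%llu: read first=%d read last=%d\n",
	    (unsigned long long)off, (unsigned long long)len,
	    read_first, read_last);
}

int
main(void)
{
	check_write(0, 8192, 8192, 13);		/* aligned full block: none */
	check_write(100, 200, 8192, 13);	/* sub-block write: first only */
	check_write(4096, 8192, 8192, 13);	/* straddles two blocks: both */
	return (0);
}

Under this scheme an aligned full-block overwrite triggers no read at hold time; the hold simply adds len to txh_space_towrite.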
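dmu_tx_mark_netfree() above now only sets tx_netfree, which dsl_dir_tempreserve_space() consults later, replacing the old 1GB tofree/tounref fudge. A hedged sketch of a free-only caller (kernel context assumed; example_free_object() and the choice of TXG_WAIT are illustrative and not taken from this patch):

#include <sys/dmu.h>

/* hypothetical example, not part of the patch */
static int
example_free_object(objset_t *os, uint64_t object)
{
	dmu_tx_t *tx = dmu_tx_create(os);

	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
	dmu_tx_mark_netfree(tx);	/* net free: relax refquota checks */

	int err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}
	/* ... free the object's blocks here, then ... */
	dmu_tx_commit(tx);
	return (0);
}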
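The new block comment above dmu_tx_try_assign() explains why assignment can fail with ERESTART and what the caller is expected to do about it. For context, this is the long-standing caller-side idiom, sketched here from the usage notes in sys/dmu.h rather than from this patch; example_write() and the TXG_NOWAIT choice are placeholders, and dmu_write() stands in for whatever modification the holds cover:

#include <sys/dmu.h>

/* hypothetical example, not part of the patch */
static int
example_write(objset_t *os, uint64_t object, uint64_t off, int len,
    const void *buf)
{
	dmu_tx_t *tx;
	int err;

top:
	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, object, off, len);

	err = dmu_tx_assign(tx, TXG_NOWAIT);
	if (err == ERESTART) {
		/* too much pending change; wait for the next open txg */
		dmu_tx_wait(tx);
		dmu_tx_abort(tx);
		goto top;
	} else if (err != 0) {
		dmu_tx_abort(tx);	/* e.g. EDQUOT or ENOSPC */
		return (err);
	}

	dmu_write(os, object, off, len, buf, tx);
	dmu_tx_commit(tx);
	return (0);
}

With TXG_WAIT the same wait-and-retry happens inside dmu_tx_assign() itself, so the caller only has to abort on a real error.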
@@ -1595,42 +1137,14 @@ dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) } } - void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) { - dnode_t *dn; - dmu_tx_hold_t *txh; - - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, - THT_SPILL, 0, 0); - - dn = txh->txh_dnode; - - if (dn == NULL) - return; + dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx, + tx->tx_objset, object, THT_SPILL, 0, 0); - /* If blkptr doesn't exist then add space to towrite */ - if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { - (void) refcount_add_many(&txh->txh_space_towrite, - SPA_OLD_MAXBLOCKSIZE, FTAG); - } else { - blkptr_t *bp; - - bp = &dn->dn_phys->dn_spill; - if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, - bp, bp->blk_birth)) { - (void) refcount_add_many(&txh->txh_space_tooverwrite, - SPA_OLD_MAXBLOCKSIZE, FTAG); - } else { - (void) refcount_add_many(&txh->txh_space_towrite, - SPA_OLD_MAXBLOCKSIZE, FTAG); - } - if (!BP_IS_HOLE(bp)) { - (void) refcount_add_many(&txh->txh_space_tounref, - SPA_OLD_MAXBLOCKSIZE, FTAG); - } - } + (void) refcount_add_many(&txh->txh_space_towrite, + SPA_OLD_MAXBLOCKSIZE, FTAG); } void @@ -1643,9 +1157,9 @@ dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) if (tx->tx_objset->os_sa->sa_master_obj == 0) return; - if (tx->tx_objset->os_sa->sa_layout_attr_obj) + if (tx->tx_objset->os_sa->sa_layout_attr_obj) { dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); - else { + } else { dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c index 74ceb5732c..f123b19e9c 100644 --- a/usr/src/uts/common/fs/zfs/dnode.c +++ b/usr/src/uts/common/fs/zfs/dnode.c @@ -1805,25 +1805,6 @@ dnode_diduse_space(dnode_t *dn, int64_t delta) } /* - * Call when we think we're going to write/free space in open context to track - * the amount of memory in use by the currently open txg. - */ -void -dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) -{ - objset_t *os = dn->dn_objset; - dsl_dataset_t *ds = os->os_dsl_dataset; - int64_t aspace = spa_get_asize(os->os_spa, space); - - if (ds != NULL) { - dsl_dir_willuse_space(ds->ds_dir, aspace, tx); - dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx); - } - - dmu_tx_willuse_space(tx, aspace); -} - -/* * Scans a block at the indicated "level" looking for a hole or data, * depending on 'flags'. * diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c index 155d1cf7f2..78088147a2 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dataset.c +++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c @@ -238,42 +238,6 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, return (used); } -uint64_t -dsl_dataset_prev_snap_txg(dsl_dataset_t *ds) -{ - uint64_t trysnap = 0; - - if (ds == NULL) - return (0); - /* - * The snapshot creation could fail, but that would cause an - * incorrect FALSE return, which would only result in an - * overestimation of the amount of space that an operation would - * consume, which is OK. - * - * There's also a small window where we could miss a pending - * snapshot, because we could set the sync task in the quiescing - * phase. So this should only be used as a guess. 
- */ - if (ds->ds_trysnap_txg > - spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa)) - trysnap = ds->ds_trysnap_txg; - return (MAX(dsl_dataset_phys(ds)->ds_prev_snap_txg, trysnap)); -} - -boolean_t -dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp, - uint64_t blk_birth) -{ - if (blk_birth <= dsl_dataset_prev_snap_txg(ds) || - (bp != NULL && BP_IS_HOLE(bp))) - return (B_FALSE); - - ddt_prefetch(dsl_dataset_get_spa(ds), bp); - - return (B_TRUE); -} - /* * We have to release the fsid syncronously or we risk that a subsequent * mount of the same dataset will fail to unique_insert the fsid. This @@ -1723,11 +1687,22 @@ get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) zap_cursor_t zc; zap_attribute_t za; nvlist_t *propval = fnvlist_alloc(); - nvlist_t *val = fnvlist_alloc(); + nvlist_t *val; ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); /* + * We use nvlist_alloc() instead of fnvlist_alloc() because the + * latter would allocate the list with NV_UNIQUE_NAME flag. + * As a result, every time a clone name is appended to the list + * it would be (linearly) searched for for a duplicate name. + * We already know that all clone names must be unique and we + * want avoid the quadratic complexity of double-checking that + * because we can have a large number of clones. + */ + VERIFY0(nvlist_alloc(&val, 0, KM_SLEEP)); + + /* * There may be missing entries in ds_next_clones_obj * due to a bug in a previous version of the code. * Only trust it if it has the right number of entries. diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c index 60477b20eb..da2c95b1e9 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dir.c +++ b/usr/src/uts/common/fs/zfs/dsl_dir.c @@ -1025,13 +1025,12 @@ static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd) { uint64_t space = 0; - int i; ASSERT(MUTEX_HELD(&dd->dd_lock)); - for (i = 0; i < TXG_SIZE; i++) { - space += dd->dd_space_towrite[i&TXG_MASK]; - ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0); + for (int i = 0; i < TXG_SIZE; i++) { + space += dd->dd_space_towrite[i & TXG_MASK]; + ASSERT3U(dd->dd_space_towrite[i & TXG_MASK], >=, 0); } return (space); } @@ -1111,16 +1110,13 @@ struct tempreserve { static int dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, - boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list, + boolean_t ignorequota, list_t *tr_list, dmu_tx_t *tx, boolean_t first) { uint64_t txg = tx->tx_txg; - uint64_t est_inflight, used_on_disk, quota, parent_rsrv; - uint64_t deferred = 0; + uint64_t quota; struct tempreserve *tr; int retval = EDQUOT; - int txgidx = txg & TXG_MASK; - int i; uint64_t ref_rsrv = 0; ASSERT3U(txg, !=, 0); @@ -1132,10 +1128,10 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, * Check against the dsl_dir's quota. We don't add in the delta * when checking for over-quota because they get one free hit. 
*/ - est_inflight = dsl_dir_space_towrite(dd); - for (i = 0; i < TXG_SIZE; i++) + uint64_t est_inflight = dsl_dir_space_towrite(dd); + for (int i = 0; i < TXG_SIZE; i++) est_inflight += dd->dd_tempreserved[i]; - used_on_disk = dsl_dir_phys(dd)->dd_used_bytes; + uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes; /* * On the first iteration, fetch the dataset's used-on-disk and @@ -1146,9 +1142,9 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, int error; dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset; - error = dsl_dataset_check_quota(ds, checkrefquota, + error = dsl_dataset_check_quota(ds, !netfree, asize, est_inflight, &used_on_disk, &ref_rsrv); - if (error) { + if (error != 0) { mutex_exit(&dd->dd_lock); return (error); } @@ -1173,6 +1169,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, * we're very close to full, this will allow a steady trickle of * removes to get through. */ + uint64_t deferred = 0; if (dd->dd_parent == NULL) { spa_t *spa = dd->dd_pool->dp_spa; uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); @@ -1202,9 +1199,9 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, } /* We need to up our estimated delta before dropping dd_lock */ - dd->dd_tempreserved[txgidx] += asize; + dd->dd_tempreserved[txg & TXG_MASK] += asize; - parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, + uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, asize - ref_rsrv); mutex_exit(&dd->dd_lock); @@ -1214,11 +1211,11 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, list_insert_tail(tr_list, tr); /* see if it's OK with our parent */ - if (dd->dd_parent && parent_rsrv) { + if (dd->dd_parent != NULL && parent_rsrv != 0) { boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0); return (dsl_dir_tempreserve_impl(dd->dd_parent, - parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE)); + parent_rsrv, netfree, ismos, tr_list, tx, B_FALSE)); } else { return (0); } @@ -1232,7 +1229,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, */ int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, - uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx) + boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx) { int err; list_t *tr_list; @@ -1246,7 +1243,6 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, list_create(tr_list, sizeof (struct tempreserve), offsetof(struct tempreserve, tr_node)); ASSERT3S(asize, >, 0); - ASSERT3S(fsize, >=, 0); err = arc_tempreserve_space(lsize, tx->tx_txg); if (err == 0) { @@ -1273,8 +1269,8 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, } if (err == 0) { - err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, - FALSE, asize > usize, tr_list, tx, TRUE); + err = dsl_dir_tempreserve_impl(dd, asize, netfree, + B_FALSE, tr_list, tx, B_TRUE); } if (err != 0) diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index 5882d18f41..d2638470f1 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -1641,7 +1641,7 @@ spa_freeze_txg(spa_t *spa) /* ARGSUSED */ uint64_t -spa_get_asize(spa_t *spa, uint64_t lsize) +spa_get_worst_case_asize(spa_t *spa, uint64_t lsize) { return (lsize * spa_asize_inflation); } diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h index 1e68c8e217..140726af9a 100644 --- 
a/usr/src/uts/common/fs/zfs/sys/dmu.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h @@ -645,11 +645,6 @@ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db); void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); /* - * Tells if the given dbuf is freeable. - */ -boolean_t dmu_buf_freeable(dmu_buf_t *); - -/* * You must create a transaction, then hold the objects which you will * (or might) modify as part of this transaction. Then you must assign * the transaction to a transaction group. Once the transaction has diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h index 3adb914bf6..d0636b7560 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h @@ -86,7 +86,6 @@ extern "C" { * held from: * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz) - * dmu_tx_count_free: * dbuf_read_impl: db_mtx, dmu_zfetch() * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch() * dbuf_new_size: db_mtx @@ -197,7 +196,6 @@ extern "C" { * dsl_prop_changed_notify: none (dd_prop_cbs) * dsl_prop_register: none (dd_prop_cbs) * dsl_prop_unregister: none (dd_prop_cbs) - * dsl_dataset_block_freeable: none (dd_sync_*) * * os_lock (leaf) * protects: diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h index 6fe14ca754..1d14462cea 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Integros [integros.com] @@ -184,6 +184,7 @@ boolean_t dmu_objset_userspace_present(objset_t *os); int dmu_fsname(const char *snapname, char *buf); void dmu_objset_evict_done(objset_t *os); +void dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx); void dmu_objset_init(void); void dmu_objset_fini(void); diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_tx.h b/usr/src/uts/common/fs/zfs/sys/dmu_tx.h index 8d9545449f..d9abdcd879 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_tx.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_tx.h @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. */ #ifndef _SYS_DMU_TX_H @@ -70,6 +70,9 @@ struct dmu_tx { /* has this transaction already been delayed? 
*/ boolean_t tx_waited; + /* transaction is marked as being a "net free" of space */ + boolean_t tx_netfree; + /* time this transaction was created */ hrtime_t tx_start; @@ -77,14 +80,6 @@ struct dmu_tx { boolean_t tx_wait_dirty; int tx_err; -#ifdef ZFS_DEBUG - uint64_t tx_space_towrite; - uint64_t tx_space_tofree; - uint64_t tx_space_tooverwrite; - uint64_t tx_space_tounref; - refcount_t tx_space_written; - refcount_t tx_space_freed; -#endif }; enum dmu_tx_hold_type { @@ -103,16 +98,10 @@ typedef struct dmu_tx_hold { list_node_t txh_node; struct dnode *txh_dnode; refcount_t txh_space_towrite; - refcount_t txh_space_tofree; - refcount_t txh_space_tooverwrite; - refcount_t txh_space_tounref; refcount_t txh_memory_tohold; - refcount_t txh_fudge; -#ifdef ZFS_DEBUG enum dmu_tx_hold_type txh_type; uint64_t txh_arg1; uint64_t txh_arg2; -#endif } dmu_tx_hold_t; typedef struct dmu_tx_callback { @@ -148,9 +137,7 @@ dmu_tx_t *dmu_tx_create_dd(dsl_dir_t *dd); int dmu_tx_is_syncing(dmu_tx_t *tx); int dmu_tx_private_ok(dmu_tx_t *tx); void dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object); -void dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta); void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db); -int dmu_tx_holds(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space); #ifdef ZFS_DEBUG diff --git a/usr/src/uts/common/fs/zfs/sys/dnode.h b/usr/src/uts/common/fs/zfs/sys/dnode.h index 733751f07c..eeb9a83a6a 100644 --- a/usr/src/uts/common/fs/zfs/sys/dnode.h +++ b/usr/src/uts/common/fs/zfs/sys/dnode.h @@ -304,7 +304,6 @@ void dnode_verify(dnode_t *dn); int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx); void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx); void dnode_diduse_space(dnode_t *dn, int64_t space); -void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx); void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t); uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid); void dnode_init(void); diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h index cab7cbb10f..fc5117bc9b 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h @@ -282,9 +282,6 @@ void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx); int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, boolean_t async); -boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp, - uint64_t blk_birth); -uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds); void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx); void dsl_dataset_stats(dsl_dataset_t *os, nvlist_t *nv); diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h index ab2d740741..a6414887c3 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
*/ @@ -137,8 +137,7 @@ uint64_t dsl_dir_space_available(dsl_dir_t *dd, void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx); void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx); int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem, - uint64_t asize, uint64_t fsize, uint64_t usize, void **tr_cookiep, - dmu_tx_t *tx); + uint64_t asize, boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx); void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx); void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx); void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index f7ec240ef3..50ffe676a3 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -775,7 +775,7 @@ extern uint64_t spa_version(spa_t *spa); extern pool_state_t spa_state(spa_t *spa); extern spa_load_state_t spa_load_state(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa); -extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize); +extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize); extern uint64_t spa_get_dspace(spa_t *spa); extern uint64_t spa_get_slop_space(spa_t *spa); extern void spa_update_dspace(spa_t *spa); diff --git a/usr/src/uts/common/fs/zfs/sys/zap_impl.h b/usr/src/uts/common/fs/zfs/sys/zap_impl.h index cece948977..6600318cd9 100644 --- a/usr/src/uts/common/fs/zfs/sys/zap_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/zap_impl.h @@ -217,8 +217,6 @@ int fzap_lookup(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, void *buf, char *realname, int rn_len, boolean_t *normalization_conflictp); void fzap_prefetch(zap_name_t *zn); -int fzap_count_write(zap_name_t *zn, int add, refcount_t *towrite, - refcount_t *tooverwrite); int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, void *tag, dmu_tx_t *tx); int fzap_update(zap_name_t *zn, diff --git a/usr/src/uts/common/fs/zfs/zap.c b/usr/src/uts/common/fs/zfs/zap.c index 0caed0ad75..70bf9d12db 100644 --- a/usr/src/uts/common/fs/zfs/zap.c +++ b/usr/src/uts/common/fs/zfs/zap.c @@ -1331,64 +1331,3 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) } } } - -int -fzap_count_write(zap_name_t *zn, int add, refcount_t *towrite, - refcount_t *tooverwrite) -{ - zap_t *zap = zn->zn_zap; - zap_leaf_t *l; - int err; - - /* - * Account for the header block of the fatzap. - */ - if (!add && dmu_buf_freeable(zap->zap_dbuf)) { - (void) refcount_add_many(tooverwrite, - zap->zap_dbuf->db_size, FTAG); - } else { - (void) refcount_add_many(towrite, - zap->zap_dbuf->db_size, FTAG); - } - - /* - * Account for the pointer table blocks. - * If we are adding we need to account for the following cases : - * - If the pointer table is embedded, this operation could force an - * external pointer table. - * - If this already has an external pointer table this operation - * could extend the table. - */ - if (add) { - if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) { - (void) refcount_add_many(towrite, - zap->zap_dbuf->db_size, FTAG); - } else { - (void) refcount_add_many(towrite, - zap->zap_dbuf->db_size * 3, FTAG); - } - } - - /* - * Now, check if the block containing leaf is freeable - * and account accordingly. - */ - err = zap_deref_leaf(zap, zn->zn_hash, NULL, RW_READER, &l); - if (err != 0) { - return (err); - } - - if (!add && dmu_buf_freeable(l->l_dbuf)) { - (void) refcount_add_many(tooverwrite, l->l_dbuf->db_size, FTAG); - } else { - /* - * If this an add operation, the leaf block could split. 
- * Hence, we need to account for an additional leaf block. - */ - (void) refcount_add_many(towrite, - (add ? 2 : 1) * l->l_dbuf->db_size, FTAG); - } - - zap_put_leaf(l); - return (0); -} diff --git a/usr/src/uts/common/fs/zfs/zap_micro.c b/usr/src/uts/common/fs/zfs/zap_micro.c index 7086b2abad..4632518798 100644 --- a/usr/src/uts/common/fs/zfs/zap_micro.c +++ b/usr/src/uts/common/fs/zfs/zap_micro.c @@ -1480,85 +1480,3 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) zap_unlockdir(zap, FTAG); return (0); } - -int -zap_count_write_by_dnode(dnode_t *dn, const char *name, int add, - refcount_t *towrite, refcount_t *tooverwrite) -{ - zap_t *zap; - int err = 0; - - /* - * Since, we don't have a name, we cannot figure out which blocks will - * be affected in this operation. So, account for the worst case : - * - 3 blocks overwritten: target leaf, ptrtbl block, header block - * - 4 new blocks written if adding: - * - 2 blocks for possibly split leaves, - * - 2 grown ptrtbl blocks - * - * This also accommodates the case where an add operation to a fairly - * large microzap results in a promotion to fatzap. - */ - if (name == NULL) { - (void) refcount_add_many(towrite, - (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE, FTAG); - return (err); - } - - /* - * We lock the zap with adding == FALSE. Because, if we pass - * the actual value of add, it could trigger a mzap_upgrade(). - * At present we are just evaluating the possibility of this operation - * and hence we do not want to trigger an upgrade. - */ - err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, - FTAG, &zap); - if (err != 0) - return (err); - - if (!zap->zap_ismicro) { - zap_name_t *zn = zap_name_alloc(zap, name, 0); - if (zn) { - err = fzap_count_write(zn, add, towrite, - tooverwrite); - zap_name_free(zn); - } else { - /* - * We treat this case as similar to (name == NULL) - */ - (void) refcount_add_many(towrite, - (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE, FTAG); - } - } else { - /* - * We are here if (name != NULL) and this is a micro-zap. - * We account for the header block depending on whether it - * is freeable. - * - * Incase of an add-operation it is hard to find out - * if this add will promote this microzap to fatzap. - * Hence, we consider the worst case and account for the - * blocks assuming this microzap would be promoted to a - * fatzap. - * - * 1 block overwritten : header block - * 4 new blocks written : 2 new split leaf, 2 grown - * ptrtbl blocks - */ - if (dmu_buf_freeable(zap->zap_dbuf)) { - (void) refcount_add_many(tooverwrite, - MZAP_MAX_BLKSZ, FTAG); - } else { - (void) refcount_add_many(towrite, - MZAP_MAX_BLKSZ, FTAG); - } - - if (add) { - (void) refcount_add_many(towrite, - 4 * MZAP_MAX_BLKSZ, FTAG); - } - } - - zap_unlockdir(zap, FTAG); - return (err); -} diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c index 9e0a8c0ed8..8b4cde4bed 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -97,11 +97,6 @@ static const fs_operation_def_t zfs_vfsops_template[] = { NULL, NULL }; -static const fs_operation_def_t zfs_vfsops_eio_template[] = { - VFSNAME_FREEVFS, { .vfs_freevfs = zfs_freevfs }, - NULL, NULL -}; - /* * We need to keep a count of active fs's. * This is necessary to prevent our module |