diff options
Diffstat (limited to 'usr/src/uts/common/fs')
154 files changed, 14192 insertions, 2499 deletions
diff --git a/usr/src/uts/common/fs/bootfs/bootfs_vfsops.c b/usr/src/uts/common/fs/bootfs/bootfs_vfsops.c index e642e86169..5b3171e0d1 100644 --- a/usr/src/uts/common/fs/bootfs/bootfs_vfsops.c +++ b/usr/src/uts/common/fs/bootfs/bootfs_vfsops.c @@ -93,7 +93,7 @@ bootfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) * there's nothing to be done about that. */ vfs_setresource(vfsp, bootfs_name, 0); - bfs = kmem_zalloc(sizeof (bootfs_t), KM_NOSLEEP | KM_NORMALPRI); + bfs = kmem_zalloc(sizeof (bootfs_t), KM_NOSLEEP_LAZY); if (bfs == NULL) return (ENOMEM); diff --git a/usr/src/uts/common/fs/dev/sdev_ptsops.c b/usr/src/uts/common/fs/dev/sdev_ptsops.c index 4d8f47397b..1b3f1561de 100644 --- a/usr/src/uts/common/fs/dev/sdev_ptsops.c +++ b/usr/src/uts/common/fs/dev/sdev_ptsops.c @@ -97,7 +97,6 @@ devpts_strtol(const char *nm, minor_t *mp) * away, we use the validator to do deferred cleanup i.e. when such * nodes are encountered during subsequent lookup() and readdir(). */ -/*ARGSUSED*/ int devpts_validate(struct sdev_node *dv) { @@ -124,8 +123,8 @@ devpts_validate(struct sdev_node *dv) /* * Check if pts driver is attached */ - if (ptms_slave_attached() == (major_t)-1) { - sdcmn_err7(("devpts_validate: slave not attached\n")); + if (ptms_subsidiary_attached() == (major_t)-1) { + sdcmn_err7(("devpts_validate: subsidiary not attached\n")); return (SDEV_VTOR_INVALID); } @@ -159,7 +158,6 @@ devpts_validate(struct sdev_node *dv) * This callback is invoked from devname_lookup_func() to create * a pts entry when the node is not found in the cache. */ -/*ARGSUSED*/ static int devpts_create_rvp(struct sdev_node *ddv, char *nm, void **arg, cred_t *cred, void *whatever, char *whichever) @@ -177,12 +175,11 @@ devpts_create_rvp(struct sdev_node *ddv, char *nm, } /* - * Check if pts driver is attached and if it is - * get the major number. + * Check if pts driver is attached and if it is get the major number. */ - maj = ptms_slave_attached(); + maj = ptms_subsidiary_attached(); if (maj == (major_t)-1) { - sdcmn_err7(("devpts_create_rvp: slave not attached\n")); + sdcmn_err7(("devpts_create_rvp: subsidiary not attached\n")); return (-1); } @@ -286,7 +283,6 @@ devpts_prunedir(struct sdev_node *ddv) * access the realvp of the specfs node directly instead of using * VOP_REALVP(). */ -/*ARGSUSED3*/ static int devpts_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred, @@ -326,7 +322,6 @@ devpts_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, * - creating an existing dir read-only succeeds, otherwise EISDIR * - exclusive creates fail - EEXIST */ -/*ARGSUSED2*/ static int devpts_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl, int mode, struct vnode **vpp, struct cred *cred, int flag, @@ -359,11 +354,10 @@ devpts_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl, } /* - * Display all instantiated pts (slave) device nodes. - * A /dev/pts entry will be created only after the first lookup of the slave - * device succeeds. + * Display all instantiated pts (subsidiary) device nodes. + * A /dev/pts entry will be created only after the first lookup of the + * subsidiary device succeeds. 
*/ -/*ARGSUSED4*/ static int devpts_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, int *eofp, caller_context_t *ct, int flags) @@ -387,7 +381,6 @@ devpts_set_id(struct sdev_node *dv, struct vattr *vap, int protocol) } -/*ARGSUSED4*/ static int devpts_setattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred, caller_context_t *ctp) diff --git a/usr/src/uts/common/fs/dnlc.c b/usr/src/uts/common/fs/dnlc.c index 102375dedd..b0edec758c 100644 --- a/usr/src/uts/common/fs/dnlc.c +++ b/usr/src/uts/common/fs/dnlc.c @@ -25,7 +25,7 @@ */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ -/* All Rights Reserved */ +/* All Rights Reserved */ /* * University Copyright- Copyright (c) 1982, 1986, 1988 @@ -256,7 +256,7 @@ vnode_t negative_cache_vnode; */ #define dnlc_free(ncp) \ { \ - kmem_free((ncp), sizeof (ncache_t) + (ncp)->namlen); \ + kmem_free((ncp), NCACHE_SIZE((ncp)->namlen)); \ atomic_dec_32(&dnlc_nentries); \ } @@ -460,7 +460,7 @@ dnlc_enter(vnode_t *dp, const char *name, vnode_t *vp) VN_HOLD_DNLC(dp); ncp->vp = vp; VN_HOLD_DNLC(vp); - bcopy(name, ncp->name, namlen + 1); /* name and null */ + bcopy(name, ncp->name, namlen); ncp->hash = hash; hp = &nc_hash[hash & nc_hashmask]; @@ -534,7 +534,7 @@ dnlc_update(vnode_t *dp, const char *name, vnode_t *vp) VN_HOLD_DNLC(dp); ncp->vp = vp; VN_HOLD_DNLC(vp); - bcopy(name, ncp->name, namlen + 1); /* name and null */ + bcopy(name, ncp->name, namlen); ncp->hash = hash; hp = &nc_hash[hash & nc_hashmask]; @@ -977,7 +977,7 @@ dnlc_get(uchar_t namlen) dnlc_max_nentries_cnt++; /* keep a statistic */ return (NULL); } - ncp = kmem_alloc(sizeof (ncache_t) + namlen, KM_NOSLEEP); + ncp = kmem_alloc(NCACHE_SIZE(namlen), KM_NOSLEEP); if (ncp == NULL) { return (NULL); } @@ -1257,7 +1257,7 @@ dnlc_dir_add_entry(dcanchor_t *dcap, const char *name, uint64_t handle) * dnlc_dir_reclaim() is called as a result of memory shortage. */ DNLC_DIR_HASH(name, hash, namlen); - dep = kmem_alloc(sizeof (dcentry_t) - 1 + namlen, KM_NOSLEEP); + dep = kmem_alloc(DCENTTRY_SIZE(namlen), KM_NOSLEEP); if (dep == NULL) { #ifdef DEBUG /* @@ -1268,7 +1268,7 @@ dnlc_dir_add_entry(dcanchor_t *dcap, const char *name, uint64_t handle) * performance running a debug kernel. * This random error only occurs in debug mode. */ - dep = kmem_alloc(sizeof (dcentry_t) - 1 + namlen, KM_NOSLEEP); + dep = kmem_alloc(DCENTTRY_SIZE(namlen), KM_NOSLEEP); if (dep != NULL) goto ok; #endif @@ -1278,7 +1278,7 @@ dnlc_dir_add_entry(dcanchor_t *dcap, const char *name, uint64_t handle) * called with. 
*/ dnlc_dir_reclaim(NULL); - dep = kmem_alloc(sizeof (dcentry_t) - 1 + namlen, KM_NOSLEEP); + dep = kmem_alloc(DCENTTRY_SIZE(namlen), KM_NOSLEEP); if (dep == NULL) { /* * still no memory, better delete this cache @@ -1311,7 +1311,7 @@ ok: dnlc_dir_max_size) { mutex_exit(&dcap->dca_lock); dnlc_dir_purge(dcap); - kmem_free(dep, sizeof (dcentry_t) - 1 + namlen); + kmem_free(dep, DCENTTRY_SIZE(namlen)); ncs.ncs_dir_add_max.value.ui64++; return (DTOOBIG); } @@ -1348,7 +1348,7 @@ ok: return (DOK); } else { mutex_exit(&dcap->dca_lock); - kmem_free(dep, sizeof (dcentry_t) - 1 + namlen); + kmem_free(dep, DCENTTRY_SIZE(namlen)); return (DNOCACHE); } } @@ -1481,8 +1481,7 @@ dnlc_dir_abort(dircache_t *dcp) nhp = dcp->dc_namehash[i]; while (nhp != NULL) { /* for each chained entry */ dep = nhp->de_next; - kmem_free(nhp, sizeof (dcentry_t) - 1 + - nhp->de_namelen); + kmem_free(nhp, DCENTTRY_SIZE(nhp->de_namelen)); nhp = dep; } } @@ -1578,8 +1577,7 @@ dnlc_dir_rem_entry(dcanchor_t *dcap, const char *name, uint64_t *handlep) } te = *prevpp; *prevpp = (*prevpp)->de_next; - kmem_free(te, sizeof (dcentry_t) - 1 + - te->de_namelen); + kmem_free(te, DCENTTRY_SIZE(te->de_namelen)); /* * If the total number of entries diff --git a/usr/src/uts/common/fs/doorfs/door_sys.c b/usr/src/uts/common/fs/doorfs/door_sys.c index 68a7a11d82..a2d3812938 100644 --- a/usr/src/uts/common/fs/doorfs/door_sys.c +++ b/usr/src/uts/common/fs/doorfs/door_sys.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2021 Tintri by DDN, Inc. All rights reserved. */ /* @@ -1114,6 +1115,19 @@ door_stack_copyout(const void *kaddr, void *uaddr, size_t count) } /* + * The IA32 ABI supplement 1.0 changed the required stack alignment to + * 16 bytes (from 4 bytes), so that code can make use of SSE instructions. + * This is already done for process entry, thread entry, and makecontext(); + * We need to do this for door_return as well. The stack will be aligned to + * whatever the door_results is aligned. + * See: usr/src/lib/libc/i386/gen/makectxt.c for more details. + */ +#if defined(__amd64) +#undef STACK_ALIGN32 +#define STACK_ALIGN32 16 +#endif + +/* * Writes the stack layout for door_return() into the door_server_t of the * server thread. 
*/ diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c index 05ee2c6e09..cc03f41c8d 100644 --- a/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c @@ -504,7 +504,7 @@ hldiraddentry( /* Alloc and init dir entry */ namelen = strlen(name) + 1; alloc_size = namelen + sizeof (hldirent_t); - hdp = kmem_zalloc(alloc_size, KM_NORMALPRI | KM_NOSLEEP); + hdp = kmem_zalloc(alloc_size, KM_NOSLEEP_LAZY); if (hdp == NULL) return (ENOSPC); diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c index c582a8cac2..bf80da6dbe 100644 --- a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c @@ -271,8 +271,7 @@ hyprlofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) &dpn)) != 0) goto out; - if ((hm = kmem_zalloc(sizeof (hlfsmount_t), - KM_NORMALPRI | KM_NOSLEEP)) == NULL) { + if ((hm = kmem_zalloc(sizeof (hlfsmount_t), KM_NOSLEEP_LAZY)) == NULL) { pn_free(&dpn); error = ENOMEM; goto out; diff --git a/usr/src/uts/common/fs/lookup.c b/usr/src/uts/common/fs/lookup.c index 093db5a4b4..71e2aeb48b 100644 --- a/usr/src/uts/common/fs/lookup.c +++ b/usr/src/uts/common/fs/lookup.c @@ -20,8 +20,7 @@ */ /* - * Copyright 2015 Nexenta Systems, Inc. All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2019 Nexenta by DDN, Inc. All rights reserved. * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2016 Joyent, Inc. */ @@ -246,6 +245,9 @@ lookuppnvp( pp = &presrvd; } + if (flags & __FLXNOAUTO) + lookup_flags |= __FLXNOAUTO; + if (auditing) audit_anchorpath(pnp, vp == rootvp); @@ -433,7 +435,7 @@ checkforroot: * Traverse mount points. * XXX why don't we need to hold a read lock here (call vn_vfsrlock)? * What prevents a concurrent update to v_vfsmountedhere? - * Possible answer: if mounting, we might not see the mount + * Possible answer: if mounting, we might not see the mount * if it is concurrently coming into existence, but that's * really not much different from the thread running a bit slower. * If unmounting, we may get into traverse() when we shouldn't, @@ -1052,7 +1054,26 @@ vnode_valid_pn(vnode_t *vp, vnode_t *vrootp, pathname_t *pn, pathname_t *rpn, VN_HOLD(vrootp); if (vrootp != rootdir) VN_HOLD(vrootp); - if (lookuppnvp(pn, rpn, FOLLOW | flags, NULL, &compvp, vrootp, vrootp, + + /* + * The FOLLOW flag only determines, if the final path component + * is a symlink, whether lookuppnvp will return the symlink, or its + * target. + * + * If the vp is a VLNK, then passing the FOLLOW flag will cause + * lookuppnvp to return the vnode of its target, instead of itself, and + * so vn_compare will fail. Therefore, we do not pass FOLLOW when our vp + * is a symlink. + * + * If the vp is not a VLNK, then we pass FOLLOW on the off-chance that + * the stored v_path ends at a symlink, instead of the symlink's target. 
+ */ + if (vp->v_type != VLNK) + flags |= FOLLOW; + else + flags &= ~FOLLOW; + + if (lookuppnvp(pn, rpn, flags, NULL, &compvp, vrootp, vrootp, cr) == 0) { /* * Check to see if the returned vnode is the same as the one we diff --git a/usr/src/uts/common/fs/mntfs/mntvnops.c b/usr/src/uts/common/fs/mntfs/mntvnops.c index 7374820f95..6bb3b514fb 100644 --- a/usr/src/uts/common/fs/mntfs/mntvnops.c +++ b/usr/src/uts/common/fs/mntfs/mntvnops.c @@ -54,7 +54,7 @@ extern void vfs_mnttab_readop(void); * mntfs provides two methods of reading the in-kernel mnttab, i.e. the state of * the mounted resources: the read-only file /etc/mnttab, and a collection of * ioctl() commands. Most of these interfaces are public and are described in - * mnttab(4). Three private ioctl() commands, MNTIOC_GETMNTENT, + * mnttab(5). Three private ioctl() commands, MNTIOC_GETMNTENT, * MNTIOC_GETEXTMNTENT and MNTIOC_GETMNTANY, provide for the getmntent(3C) * family of functions, allowing them to support white space in mount names. * @@ -1039,7 +1039,7 @@ mntgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, /* * The mntnode already has at least one snapshot from * which to take the size; the user will understand from - * mnttab(4) that the current size of the in-kernel + * mnttab(5) that the current size of the in-kernel * mnttab is irrelevant. */ size = rsnapp->mnts_nmnts ? rsnapp->mnts_text_size : @@ -1186,7 +1186,7 @@ mntinactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) * has a special meaning for /etc/mnttab: it forces mntfs to refresh the * snapshot at the next ioctl(). * - * mnttab(4) explains that "the snapshot...is taken any time a read(2) is + * mnttab(5) explains that "the snapshot...is taken any time a read(2) is * performed at offset 0". We therefore ignore the read snapshot here. */ /* ARGSUSED */ diff --git a/usr/src/uts/common/fs/nfs/nfs4_client.c b/usr/src/uts/common/fs/nfs/nfs4_client.c index 5456fc7c63..856da430ea 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_client.c +++ b/usr/src/uts/common/fs/nfs/nfs4_client.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. + * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. * All Rights Reserved */ @@ -464,33 +464,15 @@ nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp, rp = VTOR4(vp); mutex_enter(&rp->r_statelock); was_serial = (rp->r_serial == curthread); - if (rp->r_serial && !was_serial) { - klwp_t *lwp = ttolwp(curthread); - + if (rp->r_serial != NULL && !was_serial) { /* - * If we're the recovery thread, then purge current attrs - * and bail out to avoid potential deadlock between another - * thread caching attrs (r_serial thread), recov thread, - * and an async writer thread. + * Purge current attrs and bail out to avoid potential deadlock + * between another thread caching attrs (r_serial thread), this + * thread, and a thread trying to read or write pages. 
*/ - if (recov) { - PURGE_ATTRCACHE4_LOCKED(rp); - mutex_exit(&rp->r_statelock); - return; - } - - if (lwp != NULL) - lwp->lwp_nostop++; - while (rp->r_serial != NULL) { - if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { - mutex_exit(&rp->r_statelock); - if (lwp != NULL) - lwp->lwp_nostop--; - return; - } - } - if (lwp != NULL) - lwp->lwp_nostop--; + PURGE_ATTRCACHE4_LOCKED(rp); + mutex_exit(&rp->r_statelock); + return; } /* @@ -3067,7 +3049,7 @@ nfs_free_mi4(mntinfo4_t *mi) nfs4_oo_hash_bucket_t *bucketp; nfs4_debug_msg_t *msgp; int i; - servinfo4_t *svp; + servinfo4_t *svp; /* * Code introduced here should be carefully evaluated to make diff --git a/usr/src/uts/common/fs/nfs/nfs4_idmap.c b/usr/src/uts/common/fs/nfs/nfs4_idmap.c index c0e2492d56..0eb449b5ef 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_idmap.c +++ b/usr/src/uts/common/fs/nfs/nfs4_idmap.c @@ -31,11 +31,11 @@ * mapping code is executing on the client or server. Thus, the following * rules represents the latest incantation of the id mapping policies. * - * 1) For the case in which the nfsmapid(1m) daemon has _never_ been + * 1) For the case in which the nfsmapid(8) daemon has _never_ been * started, the policy is to _always_ work with stringified uid's * and gid's * - * 2) For the case in which the nfsmapid(1m) daemon _was_ started but + * 2) For the case in which the nfsmapid(8) daemon _was_ started but * has either died or become unresponsive, the mapping policies are * as follows: * @@ -72,7 +72,7 @@ * `-------------------------------'---------------------------------' * * 3) Lastly, in order to leverage better cache utilization whenever - * communication with nfsmapid(1m) is currently hindered, cache + * communication with nfsmapid(8) is currently hindered, cache * entry eviction is throttled whenever nfsidmap_daemon_dh == NULL. * * @@ -80,28 +80,28 @@ * ==================================================== * * GETATTR - Server-side GETATTR *id to attr string conversion policies - * for unresponsive/dead nfsmapid(1m) daemon + * for unresponsive/dead nfsmapid(8) daemon * * a) If the *id is *ID_NOBODY, the string "nobody" is returned * - * b) If the *id is not *ID_NOBODY _and_ the nfsmapid(1m) daemon + * b) If the *id is not *ID_NOBODY _and_ the nfsmapid(8) daemon * _is_ operational, the daemon is contacted to convert the * [u/g]id into a string of type "[user/group]@domain" * - * c) If the nfsmapid(1m) daemon has died or has become unresponsive, + * c) If the nfsmapid(8) daemon has died or has become unresponsive, * the server returns status == NFS4_OK for the GETATTR operation, * and returns a strigified [u/g]id to let the client map it into * the appropriate value. * * SETATTR - Server-side SETATTR attr string to *id conversion policies - * for unresponsive/dead nfsmapid(1m) daemon + * for unresponsive/dead nfsmapid(8) daemon * * a) If the otw string is a stringified uid (ie. does _not_ contain * an '@' sign and is of the form "12345") then the literal uid is * decoded and it is used to perform the mapping. * * b) If, on the other hand, the otw string _is_ of the form - * "[user/group]@domain" and problems arise contacting nfsmapid(1m), + * "[user/group]@domain" and problems arise contacting nfsmapid(8), * the SETATTR operation _must_ fail w/NFS4ERR_DELAY, as the server * cannot default to *ID_NOBODY, which would allow a file to be * given away by setting it's owner or owner_group to "nobody". 
@@ -329,7 +329,7 @@ nfs_idmap_str_uid(utf8string *u8s, uid_t *uid, bool_t isserver) } /* - * Start-off with upcalls disabled, and once nfsmapid(1m) is up and + * Start-off with upcalls disabled, and once nfsmapid(8) is up and * running, we'll leverage it's first flush to let the kernel know * when it's up and available to perform mappings. Also, on client * only, be smarter about when to issue upcalls by checking the @@ -399,7 +399,7 @@ retry: /* * string came in as stringified id. Don't cache ! * - * nfsmapid(1m) semantics have changed in order to + * nfsmapid(8) semantics have changed in order to * support diskless clients. Thus, for stringified * id's that have passwd/group entries, we'll go * ahead and map them, returning no error. @@ -538,7 +538,7 @@ nfs_idmap_uid_str(uid_t uid, utf8string *u8s, bool_t isserver) } /* - * Start-off with upcalls disabled, and once nfsmapid(1m) is + * Start-off with upcalls disabled, and once nfsmapid(8) is * up and running, we'll leverage it's first flush to let the * kernel know when it's up and available to perform mappings. * We fall back to answering with stringified uid's. @@ -708,7 +708,7 @@ nfs_idmap_str_gid(utf8string *u8s, gid_t *gid, bool_t isserver) } /* - * Start-off with upcalls disabled, and once nfsmapid(1m) is up and + * Start-off with upcalls disabled, and once nfsmapid(8) is up and * running, we'll leverage it's first flush to let the kernel know * when it's up and available to perform mappings. Also, on client * only, be smarter about when to issue upcalls by checking the @@ -779,7 +779,7 @@ retry: /* * string came in as stringified id. Don't cache ! * - * nfsmapid(1m) semantics have changed in order to + * nfsmapid(8) semantics have changed in order to * support diskless clients. Thus, for stringified * id's that have passwd/group entries, we'll go * ahead and map them, returning no error. @@ -918,7 +918,7 @@ nfs_idmap_gid_str(gid_t gid, utf8string *u8s, bool_t isserver) } /* - * Start-off with upcalls disabled, and once nfsmapid(1m) is + * Start-off with upcalls disabled, and once nfsmapid(8) is * up and running, we'll leverage it's first flush to let the * kernel know when it's up and available to perform mappings. * We fall back to answering with stringified gid's. @@ -1119,7 +1119,7 @@ nfs_idmap_args(struct nfsidmap_args *idmp) nfs_idmap_cache_flush(&nig->s2g_ci); /* - * nfsmapid(1m) up and running; enable upcalls + * nfsmapid(8) up and running; enable upcalls * State: * 0 Just flush caches * 1 Re-establish door knob @@ -1309,7 +1309,7 @@ nfs_idmap_cache_s2i_lkup(idmap_cache_info_t *cip, utf8string *u8s, * Check entry for staleness first, as user's id * may have changed and may need to be remapped. * Note that we don't evict entries from the cache - * if we're having trouble contacting nfsmapid(1m) + * if we're having trouble contacting nfsmapid(8) */ if (TIMEOUT(p->id_time) && (*cip->nfsidmap_daemon_dh) != NULL) { nfs_idmap_cache_rment(p); @@ -1405,7 +1405,7 @@ nfs_idmap_cache_s2i_insert(idmap_cache_info_t *cip, uid_t id, utf8string *u8s, * Check entry for staleness first, as user's id * may have changed and may need to be remapped. 
* Note that we don't evict entries from the cache - * if we're having trouble contacting nfsmapid(1m) + * if we're having trouble contacting nfsmapid(8) */ if (TIMEOUT(p->id_time) && (*cip->nfsidmap_daemon_dh) != NULL) { nfs_idmap_cache_rment(p); @@ -1486,7 +1486,7 @@ nfs_idmap_cache_i2s_lkup(idmap_cache_info_t *cip, uid_t id, uint_t *hashno, * Check entry for staleness first, as user's id * may have changed and may need to be remapped. * Note that we don't evict entries from the cache - * if we're having trouble contacting nfsmapid(1m) + * if we're having trouble contacting nfsmapid(8) */ if (TIMEOUT(p->id_time) && (*cip->nfsidmap_daemon_dh) != NULL) { nfs_idmap_cache_rment(p); @@ -1570,7 +1570,7 @@ nfs_idmap_cache_i2s_insert(idmap_cache_info_t *cip, uid_t id, utf8string *u8s, * Check entry for staleness first, as user's id * may have changed and may need to be remapped. * Note that we don't evict entries from the cache - * if we're having trouble contacting nfsmapid(1m) + * if we're having trouble contacting nfsmapid(8) */ if (TIMEOUT(p->id_time) && (*cip->nfsidmap_daemon_dh) != NULL) { nfs_idmap_cache_rment(p); diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv.c b/usr/src/uts/common/fs/nfs/nfs4_srv.c index 757964eb84..077fc4a25f 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_srv.c +++ b/usr/src/uts/common/fs/nfs/nfs4_srv.c @@ -32,6 +32,7 @@ * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright 2019 Nexenta Systems, Inc. * Copyright 2019 Nexenta by DDN, Inc. + * Copyright 2021 Racktop Systems, Inc. */ #include <sys/param.h> @@ -5840,13 +5841,12 @@ rfs4_compound(COMPOUND4args *args, COMPOUND4res *resp, struct exportinfo *exi, ASSERT(exi == NULL); ASSERT(cr == NULL); - cr = crget(); + cr = svc_xprt_cred(req->rq_xprt); ASSERT(cr != NULL); if (sec_svc_getcred(req, cr, &cs.principal, &cs.nfsflavor) == 0) { DTRACE_NFSV4_2(compound__start, struct compound_state *, &cs, COMPOUND4args *, args); - crfree(cr); DTRACE_NFSV4_2(compound__done, struct compound_state *, &cs, COMPOUND4res *, resp); svcerr_badcred(req->rq_xprt); @@ -5965,8 +5965,6 @@ rfs4_compound(COMPOUND4args *args, COMPOUND4res *resp, struct exportinfo *exi, if (cs.saved_fh.nfs_fh4_val) kmem_free(cs.saved_fh.nfs_fh4_val, NFS4_FHSIZE); - if (cs.basecr) - crfree(cs.basecr); if (cs.cr) crfree(cs.cr); /* diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c b/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c index a9ee217a8b..13e5320752 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c +++ b/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c @@ -2093,7 +2093,7 @@ rfs4_fattr4_owner(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg, * occur. Please refer to nfs4_idmap.c for details. * * Any other errors, such as the mapping not being found by - * nfsmapid(1m), and interrupted clnt_call, etc, will result + * nfsmapid(8), and interrupted clnt_call, etc, will result * in NFS4ERR_BADOWNER. * * XXX need to return consistent errors, perhaps all @@ -2206,7 +2206,7 @@ rfs4_fattr4_owner_group(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg, * cannot occur. Please refer to nfs4_idmap.c for details. * * Any other errors, such as the mapping not being found by - * nfsmapid(1m), and interrupted clnt_call, etc, will result + * nfsmapid(8), and interrupted clnt_call, etc, will result * in NFS4ERR_BADOWNER. 
* * XXX need to return consistent errors, perhaps all diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c b/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c index 920ebeca53..b719b0e2ca 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c +++ b/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c @@ -660,7 +660,6 @@ treeclimb_export(struct exportinfo *exip) if (error) break; - /* XXX KEBE ASKS DO WE NEED THIS?!? */ ASSERT3U(exip->exi_zoneid, ==, curzone->zone_id); /* * The root of the file system, or the zone's root for diff --git a/usr/src/uts/common/fs/nfs/nfs4_state.c b/usr/src/uts/common/fs/nfs/nfs4_state.c index 0c1efb26df..b95dd6fb02 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_state.c +++ b/usr/src/uts/common/fs/nfs/nfs4_state.c @@ -74,6 +74,7 @@ stateid4 special1 = { int rfs4_debug; #endif +rfs4_db_mem_cache_t rfs4_db_mem_cache_table[RFS4_DB_MEM_CACHE_NUM]; static uint32_t rfs4_database_debug = 0x00; /* CSTYLED */ diff --git a/usr/src/uts/common/fs/nfs/nfs4_stub_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_stub_vnops.c index 83c84b7892..d0950dd6f0 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_stub_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs4_stub_vnops.c @@ -1906,7 +1906,7 @@ nfs4_trigger_domount(vnode_t *stubvp, domount_args_t *dma, vfs_t **vfsp, uap->flags = MS_SYSSPACE | MS_DATA; /* fstype-independent mount options not covered elsewhere */ - /* copy parent's mount(1M) "-m" flag */ + /* copy parent's mount(8) "-m" flag */ if (stubvfsp->vfs_flag & VFS_NOMNTTAB) uap->flags |= MS_NOMNTTAB; diff --git a/usr/src/uts/common/fs/nfs/nfs4_subr.c b/usr/src/uts/common/fs/nfs/nfs4_subr.c index ec5fda53a0..aaec5ca976 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_subr.c +++ b/usr/src/uts/common/fs/nfs/nfs4_subr.c @@ -27,7 +27,7 @@ */ /* - * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. + * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. * All Rights Reserved */ @@ -1582,7 +1582,7 @@ rfs4call(mntinfo4_t *mi, COMPOUND4args_clnt *argsp, COMPOUND4res_clnt *resp, cred_t *cr, int *doqueue, int flags, nfs4_error_t *ep) { int i, error; - enum clnt_stat rpc_status = NFS4_OK; + enum clnt_stat rpc_status = RPC_SUCCESS; int num_resops; struct nfs4_clnt *nfscl; diff --git a/usr/src/uts/common/fs/nfs/nfs4_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_vnops.c index 15c6445146..6a3fbff48e 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c @@ -2596,12 +2596,6 @@ nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop, osp->os_ref_count--; if (ep->error == 0) { - /* - * Avoid a deadlock with the r_serial thread waiting for - * os_sync_lock in nfs4_get_otw_cred_by_osp() which might be - * held by us. We will wait in nfs4_attr_cache() for the - * completion of the r_serial thread. - */ mutex_exit(&osp->os_sync_lock); *have_sync_lockp = 0; diff --git a/usr/src/uts/common/fs/nfs/nfs_auth.c b/usr/src/uts/common/fs/nfs/nfs_auth.c index 7ac3c3318b..16979e1422 100644 --- a/usr/src/uts/common/fs/nfs/nfs_auth.c +++ b/usr/src/uts/common/fs/nfs/nfs_auth.c @@ -219,7 +219,7 @@ nfsauth_zone_init(nfs_globals_t *ng) nag = kmem_zalloc(sizeof (*nag), KM_SLEEP); /* - * mountd can be restarted by smf(5). We need to make sure + * mountd can be restarted by smf(7). We need to make sure * the updated door handle will safely make it to mountd_dh. */ mutex_init(&nag->mountd_lock, NULL, MUTEX_DEFAULT, NULL); @@ -462,7 +462,7 @@ retry: if (dh == NULL) { /* * The rendezvous point has not been established yet! 
- * This could mean that either mountd(1m) has not yet + * This could mean that either mountd(8) has not yet * been started or that _this_ routine nuked the door * handle after receiving an EINTR for a REVOKED door. * @@ -523,8 +523,8 @@ retry: /* * The server barfed and revoked * the (existing) door on us; we - * want to wait to give smf(5) a - * chance to restart mountd(1m) + * want to wait to give smf(7) a + * chance to restart mountd(8) * and establish a new door handle. */ mutex_enter(&nag->mountd_lock); @@ -910,9 +910,6 @@ nfsauth_cache_get(struct exportinfo *exi, struct svc_req *req, int flavor, ASSERT(taddrmask != NULL); addrmask(&addr, taddrmask); - ac.auth_flavor = flavor; - ac.auth_clnt_cred = crdup(cr); - acc.authc_addr = addr; tree = exi->exi_cache[hash(&addr)]; @@ -925,7 +922,7 @@ nfsauth_cache_get(struct exportinfo *exi, struct svc_req *req, int flavor, rw_exit(&exi->exi_cache_lock); - nc = kmem_alloc(sizeof (*nc), KM_NOSLEEP | KM_NORMALPRI); + nc = kmem_alloc(sizeof (*nc), KM_NOSLEEP_LAZY); if (nc == NULL) goto retrieve; @@ -933,8 +930,7 @@ nfsauth_cache_get(struct exportinfo *exi, struct svc_req *req, int flavor, * Initialize the new auth_cache_clnt */ nc->authc_addr = addr; - nc->authc_addr.buf = kmem_alloc(addr.maxlen, - KM_NOSLEEP | KM_NORMALPRI); + nc->authc_addr.buf = kmem_alloc(addr.maxlen, KM_NOSLEEP_LAZY); if (addr.maxlen != 0 && nc->authc_addr.buf == NULL) { kmem_free(nc, sizeof (*nc)); goto retrieve; @@ -964,6 +960,10 @@ nfsauth_cache_get(struct exportinfo *exi, struct svc_req *req, int flavor, ASSERT(c != NULL); rw_enter(&c->authc_lock, RW_READER); + + ac.auth_flavor = flavor; + ac.auth_clnt_cred = cr; + p = (struct auth_cache *)avl_find(&c->authc_tree, &ac, NULL); if (p == NULL) { @@ -971,8 +971,7 @@ nfsauth_cache_get(struct exportinfo *exi, struct svc_req *req, int flavor, rw_exit(&c->authc_lock); - np = kmem_cache_alloc(exi_cache_handle, - KM_NOSLEEP | KM_NORMALPRI); + np = kmem_cache_alloc(exi_cache_handle, KM_NOSLEEP_LAZY); if (np == NULL) { rw_exit(&exi->exi_cache_lock); goto retrieve; @@ -983,7 +982,7 @@ nfsauth_cache_get(struct exportinfo *exi, struct svc_req *req, int flavor, */ np->auth_clnt = c; np->auth_flavor = flavor; - np->auth_clnt_cred = ac.auth_clnt_cred; + np->auth_clnt_cred = crdup(cr); np->auth_srv_ngids = 0; np->auth_srv_gids = NULL; np->auth_time = np->auth_freshness = gethrestime_sec(); @@ -1004,12 +1003,11 @@ nfsauth_cache_get(struct exportinfo *exi, struct svc_req *req, int flavor, cv_destroy(&np->auth_cv); mutex_destroy(&np->auth_lock); - crfree(ac.auth_clnt_cred); + crfree(np->auth_clnt_cred); kmem_cache_free(exi_cache_handle, np); } } else { rw_exit(&exi->exi_cache_lock); - crfree(ac.auth_clnt_cred); } mutex_enter(&p->auth_lock); @@ -1071,7 +1069,7 @@ nfsauth_cache_get(struct exportinfo *exi, struct svc_req *req, int flavor, * auth_cache entry */ tmpgids = kmem_alloc(tmpngids * sizeof (gid_t), - KM_NOSLEEP | KM_NORMALPRI); + KM_NOSLEEP_LAZY); if (tmpgids != NULL) bcopy(*gids, tmpgids, tmpngids * sizeof (gid_t)); @@ -1212,7 +1210,6 @@ nfsauth_cache_get(struct exportinfo *exi, struct svc_req *req, int flavor, return (access); retrieve: - crfree(ac.auth_clnt_cred); /* * Retrieve the required data without caching. diff --git a/usr/src/uts/common/fs/nfs/nfs_cmd.c b/usr/src/uts/common/fs/nfs/nfs_cmd.c index 40775bb231..b9d23ba0d6 100644 --- a/usr/src/uts/common/fs/nfs/nfs_cmd.c +++ b/usr/src/uts/common/fs/nfs/nfs_cmd.c @@ -138,7 +138,7 @@ retry: if (dh == NULL) { /* * The rendezvous point has not been established yet ! 
- * This could mean that either mountd(1m) has not yet + * This could mean that either mountd(8) has not yet * been started or that _this_ routine nuked the door * handle after receiving an EINTR for a REVOKED door. * @@ -176,8 +176,8 @@ retry: /* * The server barfed and revoked * the (existing) door on us; we - * want to wait to give smf(5) a - * chance to restart mountd(1m) + * want to wait to give smf(7) a + * chance to restart mountd(8) * and establish a new door handle. */ mutex_enter(&ncg->nfscmd_lock); diff --git a/usr/src/uts/common/fs/nfs/nfs_export.c b/usr/src/uts/common/fs/nfs/nfs_export.c index 080dfe1adf..b18912d154 100644 --- a/usr/src/uts/common/fs/nfs/nfs_export.c +++ b/usr/src/uts/common/fs/nfs/nfs_export.c @@ -85,7 +85,7 @@ static bool_t exi_id_overflow; avl_tree_t exi_id_tree; kmutex_t nfs_exi_id_lock; -static int unexport(nfs_export_t *, exportinfo_t *); +static int unexport(nfs_export_t *, exportinfo_t *, cred_t *); static void exportfree(exportinfo_t *); static int loadindex(exportdata_t *); @@ -975,7 +975,15 @@ nfs_export_zone_shutdown(nfs_globals_t *ng) nfs_export_t *ne = ng->nfs_export; struct exportinfo *exi, *nexi; int i, errors; + zoneid_t zoneid = ng->nfs_zoneid; + cred_t *cr; + /* + * Use the zone's credential. Since this is a zone shutdown method, + * the zone_t should still be around for a zone_get_kcred() call. + */ + cr = zone_get_kcred(zoneid); + VERIFY(cr != NULL); rw_enter(&ne->exported_lock, RW_READER); errors = 0; @@ -986,7 +994,7 @@ nfs_export_zone_shutdown(nfs_globals_t *ng) exi_hold(exi); while (exi != NULL) { - + ASSERT3U(zoneid, ==, exi->exi_zoneid); /* * Get and hold next export before * dropping the rwlock and unexport @@ -1002,7 +1010,7 @@ nfs_export_zone_shutdown(nfs_globals_t *ng) * create/destroy handling. */ if (exi != ne->exi_root && - unexport(ne, exi) != 0) + unexport(ne, exi, cr) != 0) errors++; exi_rele(exi); @@ -1016,6 +1024,7 @@ nfs_export_zone_shutdown(nfs_globals_t *ng) } rw_exit(&ne->exported_lock); + crfree(cr); } void @@ -1286,7 +1295,7 @@ exportfs(struct exportfs_args *args, model_t model, cred_t *cr) pn_free(&lookpn); if (ex1 == NULL) return (EINVAL); - error = unexport(ne, ex1); + error = unexport(ne, ex1, cr); exi_rele(ex1); return (error); } @@ -1886,7 +1895,7 @@ export_unlink(nfs_export_t *ne, struct exportinfo *exi) * Unexport an exported filesystem */ static int -unexport(nfs_export_t *ne, struct exportinfo *exi) +unexport(nfs_export_t *ne, struct exportinfo *exi, cred_t *cr) { struct secinfo cursec[MAX_FLAVORS]; int curcnt; @@ -1954,18 +1963,14 @@ unexport(nfs_export_t *ne, struct exportinfo *exi) * the public filehandle to the root. */ - /* - * XXX KEBE ASKS --> Should CRED() instead be - * exi->exi_zone->zone_kcred? - */ if (exi == ne->exi_public) { ne->exi_public = ne->exi_root; - nfslog_share_record(ne->exi_public, CRED()); + nfslog_share_record(ne->exi_public, cr); } if (exi->exi_export.ex_flags & EX_LOG) - nfslog_unshare_record(exi, CRED()); + nfslog_unshare_record(exi, cr); exi_rele(exi); return (0); diff --git a/usr/src/uts/common/fs/nfs/nfs_server.c b/usr/src/uts/common/fs/nfs/nfs_server.c index 5b7658d048..28c079968f 100644 --- a/usr/src/uts/common/fs/nfs/nfs_server.c +++ b/usr/src/uts/common/fs/nfs/nfs_server.c @@ -24,6 +24,7 @@ * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2017 Joyent Inc * Copyright 2019 Nexenta by DDN, Inc. + * Copyright 2021 Racktop Systems, Inc. 
*/ /* @@ -115,6 +116,13 @@ krwlock_t nfssrv_globals_rwl; kmem_cache_t *nfs_xuio_cache; int nfs_loaned_buffers = 0; +/* array of paths passed-in from nfsd command-line; stored in nvlist */ +char **rfs4_dss_newpaths; +uint_t rfs4_dss_numnewpaths; + +/* nvlists of all DSS paths: current, and before last warmstart */ +nvlist_t *rfs4_dss_paths, *rfs4_dss_oldpaths; + int _init(void) { @@ -1356,11 +1364,6 @@ static struct rpc_disptable rfs_disptable[] = { static int nfs_portmon = 0; #ifdef DEBUG -static int cred_hits = 0; -static int cred_misses = 0; -#endif - -#ifdef DEBUG /* * Debug code to allow disabling of rfs_dispatch() use of * fastxdrargs() and fastxdrres() calls for testing purposes. @@ -1628,25 +1631,7 @@ common_dispatch(struct svc_req *req, SVCXPRT *xprt, rpcvers_t min_vers, else anon_ok = 0; - cr = xprt->xp_cred; - ASSERT(cr != NULL); -#ifdef DEBUG - { - if (crgetref(cr) != 1) { - crfree(cr); - cr = crget(); - xprt->xp_cred = cr; - cred_misses++; - } else - cred_hits++; - } -#else - if (crgetref(cr) != 1) { - crfree(cr); - cr = crget(); - xprt->xp_cred = cr; - } -#endif + cr = svc_xprt_cred(xprt); exi = checkexport(fsid, xfid); diff --git a/usr/src/uts/common/fs/nfs/nfs_stats.c b/usr/src/uts/common/fs/nfs/nfs_stats.c index 97f820d756..13466a4f33 100644 --- a/usr/src/uts/common/fs/nfs/nfs_stats.c +++ b/usr/src/uts/common/fs/nfs/nfs_stats.c @@ -34,7 +34,7 @@ /* * Key to retrieve per-zone data corresponding to NFS kstats consumed by - * nfsstat(1m). + * nfsstat(8). */ zone_key_t nfsstat_zone_key; diff --git a/usr/src/uts/common/fs/pcfs/pc_node.c b/usr/src/uts/common/fs/pcfs/pc_node.c index 84a29f4430..bf01336c6f 100644 --- a/usr/src/uts/common/fs/pcfs/pc_node.c +++ b/usr/src/uts/common/fs/pcfs/pc_node.c @@ -667,7 +667,7 @@ pc_mark_irrecov(struct pcfs *fsp) "an irrecoverable error was encountered.\n" "File damage is possible. To prevent further\n" "damage, this pcfs instance will now be frozen.\n" - "Use umount(1M) to release the instance.\n"); + "Use umount(8) to release the instance.\n"); (void) pc_unlockfs(fsp); } } diff --git a/usr/src/uts/common/fs/pcfs/pc_vfsops.c b/usr/src/uts/common/fs/pcfs/pc_vfsops.c index 7b2205e1d7..60041a3d71 100644 --- a/usr/src/uts/common/fs/pcfs/pc_vfsops.c +++ b/usr/src/uts/common/fs/pcfs/pc_vfsops.c @@ -589,7 +589,7 @@ pcfs_parse_mntopts(struct pcfs *fsp) /* * The "secsize=..." mount option is a workaround for the lack of - * lofi(7d) support for DKIOCGMEDIAINFO. If PCFS wants to parse the + * lofi(4D) support for DKIOCGMEDIAINFO. If PCFS wants to parse the * partition table of a disk image and it has been partitioned with * sector sizes other than 512 bytes, we'd fail on loopback'ed disk * images. @@ -1988,7 +1988,7 @@ parseBPB(struct pcfs *fsp, uchar_t *bpb, int *valid) mediasize = (len_t)totsec * (len_t)secsize; /* * This is not an error because not all devices support the - * dkio(7i) mediasize queries, and/or not all devices are + * dkio(4I) mediasize queries, and/or not all devices are * partitioned. If we have not been able to figure out the * size of the underlaying medium, we have to trust the BPB. 
*/ @@ -2286,7 +2286,7 @@ recheck: * * Test whether the device is: * - a floppy device from a known controller type via DKIOCINFO - * - a real floppy using the fd(7d) driver and capable of fdio(7I) ioctls + * - a real floppy using the fd(4D) driver and capable of fdio(4I) ioctls * - a USB floppy drive (identified by drive geometry) * * Detecting a floppy will make PCFS metadata updates on such media synchronous, @@ -2381,7 +2381,7 @@ pcfs_device_getinfo(struct pcfs *fsp) arg.mi.dki_media_type == DK_JAZ); /* - * if this device understands fdio(7I) requests it's + * if this device understands fdio(4I) requests it's * obviously a floppy drive. */ if (!isfloppy && @@ -2390,7 +2390,7 @@ pcfs_device_getinfo(struct pcfs *fsp) /* * some devices we like to treat as floppies, but they don't - * understand fdio(7I) requests. + * understand fdio(4I) requests. */ if (!isfloppy && !ldi_ioctl(lh, DKIOCINFO, argp, FKIOCTL, cr, NULL) && diff --git a/usr/src/uts/common/fs/pcfs/pc_vnops.c b/usr/src/uts/common/fs/pcfs/pc_vnops.c index b307fe11d7..1965444071 100644 --- a/usr/src/uts/common/fs/pcfs/pc_vnops.c +++ b/usr/src/uts/common/fs/pcfs/pc_vnops.c @@ -1852,8 +1852,8 @@ out: *offp = io_off; if (lenp) *lenp = io_len; - PC_DPRINTF4(4, "pcfs_putapage: vp=%p pp=%p off=%lld len=%lu\n", - (void *)vp, (void *)pp, io_off, io_len); + PC_DPRINTF4(4, "pcfs_putapage: vp=%p pp=%p off=%lld len=%lu\n", + (void *)vp, (void *)pp, io_off, io_len); if (err) { PC_DPRINTF1(1, "pcfs_putapage err=%d", err); } @@ -2093,7 +2093,7 @@ set_long_fn_chunk(struct pcdir_lfn *ep, char *buf, int len) static int get_long_fn_chunk(struct pcdir_lfn *ep, char *buf) { - char *tmp = buf; + char *tmp = buf; int i; /* Copy all the names, no filtering now */ diff --git a/usr/src/uts/common/fs/portfs/port.c b/usr/src/uts/common/fs/portfs/port.c index 91d998b4b5..dd32c82434 100644 --- a/usr/src/uts/common/fs/portfs/port.c +++ b/usr/src/uts/common/fs/portfs/port.c @@ -156,7 +156,7 @@ * interested on. * The internal pollwakeup() function is used by all the file * systems --which are supporting the VOP_POLL() interface- to notify - * the upper layer (poll(2), devpoll(7d) and now event ports) about + * the upper layer (poll(2), devpoll(4D) and now event ports) about * the event triggered (see valid events in poll(2)). * The pollwakeup() function forwards the event to the layer registered * to receive the current event. diff --git a/usr/src/uts/common/fs/portfs/port_fd.c b/usr/src/uts/common/fs/portfs/port_fd.c index a1a1d6fb68..511c15e979 100644 --- a/usr/src/uts/common/fs/portfs/port_fd.c +++ b/usr/src/uts/common/fs/portfs/port_fd.c @@ -230,7 +230,7 @@ port_associate_fd(port_t *pp, int source, uintptr_t object, int events, * Allocate a polldat_t structure per fd * The use of the polldat_t structure to cache file descriptors * is required to be able to share the pollwakeup() function - * with poll(2) and devpoll(7d). + * with poll(2) and devpoll(4D). */ pfd = kmem_zalloc(sizeof (portfd_t), KM_SLEEP); pdp = PFTOD(pfd); diff --git a/usr/src/uts/common/fs/portfs/port_fop.c b/usr/src/uts/common/fs/portfs/port_fop.c index 019de0540a..a6ca583a4d 100644 --- a/usr/src/uts/common/fs/portfs/port_fop.c +++ b/usr/src/uts/common/fs/portfs/port_fop.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2018, Joyent, Inc. + * Copyright 2020 Joyent, Inc. 
*/ /* @@ -257,7 +257,7 @@ const fs_operation_def_t port_vnodesrc_template[] = { VOPNAME_READ, { .femop_read = port_fop_read }, VOPNAME_WRITE, { .femop_write = port_fop_write }, VOPNAME_MAP, { .femop_map = port_fop_map }, - VOPNAME_SETATTR, { .femop_setattr = port_fop_setattr }, + VOPNAME_SETATTR, { .femop_setattr = port_fop_setattr }, VOPNAME_CREATE, { .femop_create = port_fop_create }, VOPNAME_REMOVE, { .femop_remove = port_fop_remove }, VOPNAME_LINK, { .femop_link = port_fop_link }, @@ -266,7 +266,7 @@ const fs_operation_def_t port_vnodesrc_template[] = { VOPNAME_RMDIR, { .femop_rmdir = port_fop_rmdir }, VOPNAME_READDIR, { .femop_readdir = port_fop_readdir }, VOPNAME_SYMLINK, { .femop_symlink = port_fop_symlink }, - VOPNAME_SETSECATTR, { .femop_setsecattr = port_fop_setsecattr }, + VOPNAME_SETSECATTR, { .femop_setsecattr = port_fop_setsecattr }, VOPNAME_VNEVENT, { .femop_vnevent = port_fop_vnevent }, NULL, NULL }; @@ -275,7 +275,7 @@ const fs_operation_def_t port_vnodesrc_template[] = { * Fsem - vfs ops hooks */ const fs_operation_def_t port_vfssrc_template[] = { - VFSNAME_UNMOUNT, { .fsemop_unmount = port_fop_unmount }, + VFSNAME_UNMOUNT, { .fsemop_unmount = port_fop_unmount }, NULL, NULL }; @@ -539,14 +539,14 @@ port_fop_trimpfplist(vnode_t *vp) port_pcache_remove_fop(pfcp, pfp); mutex_exit(&pfcp->pfc_lock); if (tdvp != NULL) - VN_RELE(tdvp); + VN_PHANTOM_RELE(tdvp); } } } /* * This routine returns 1, if the vnode can be rele'ed by the caller. - * The caller has to VN_RELE the vnode with out holding any + * The caller has to VN_PHANTOM_RELE the vnode with out holding any * locks. */ int @@ -616,7 +616,7 @@ port_fop_femuninstall(vnode_t *vp) * able to remove it from the port's queue). * * vpp and dvpp will point to the vnode and directory vnode which the caller - * is required to VN_RELE without holding any locks. + * is required to VN_PHANTOM_RELE without holding any locks. */ int port_remove_fop(portfop_t *pfp, portfop_cache_t *pfcp, int cleanup, @@ -726,12 +726,12 @@ port_cache_lookup_fop(portfop_cache_t *pfcp, pid_t pid, uintptr_t obj) /* * Given the file name, get the vnode and also the directory vnode - * On return, the vnodes are held (VN_HOLD). The caller has to VN_RELE - * the vnode(s). + * On return, the vnodes are held with phantom holds (VN_PHANTOM_HOLD). The + * caller has to VN_PHANTOM_RELE the vnode(s). 
*/ int port_fop_getdvp(void *objptr, vnode_t **vp, vnode_t **dvp, - char **cname, int *len, int follow) + char **cname, int *len, int follow) { int error = 0; struct pathname pn; @@ -777,6 +777,17 @@ port_fop_getdvp(void *objptr, vnode_t **vp, vnode_t **dvp, } } + /* Trade VN_HOLD()s from lookuppn with VN_PHANTOM_HOLD()s */ + if (dvp != NULL && *dvp != NULL) { + VN_PHANTOM_HOLD(*dvp); + VN_RELE(*dvp); + } + + if (vp != NULL && *vp != NULL) { + VN_PHANTOM_HOLD(*vp); + VN_RELE(*vp); + } + pn_free(&pn); return (error); } @@ -815,7 +826,7 @@ port_getsrc(port_t *pp, int source) */ static void port_check_timestamp(portfop_cache_t *pfcp, vnode_t *vp, vnode_t *dvp, - portfop_t *pfp, void *objptr, uintptr_t object) + portfop_t *pfp, void *objptr, uintptr_t object) { vattr_t vatt; portfop_vp_t *pvp = vp->v_fopdata; @@ -1102,8 +1113,8 @@ port_install_fopdata(vnode_t *vp) */ int port_pfp_setup(portfop_t **pfpp, port_t *pp, vnode_t *vp, portfop_cache_t *pfcp, - uintptr_t object, int events, void *user, char *cname, int clen, - vnode_t *dvp) + uintptr_t object, int events, void *user, char *cname, int clen, + vnode_t *dvp) { portfop_t *pfp = NULL; port_kevent_t *pkevp; @@ -1176,7 +1187,7 @@ port_pfp_setup(portfop_t **pfpp, port_t *pp, vnode_t *vp, portfop_cache_t *pfcp, * Hold a reference to the vnode since * we successfully installed the hooks. */ - VN_HOLD(vp); + VN_PHANTOM_HOLD(vp); } else { (void) fem_uninstall(vp, femp, vp); pvp->pvp_femp = NULL; @@ -1209,7 +1220,7 @@ port_pfp_setup(portfop_t **pfpp, port_t *pp, vnode_t *vp, portfop_cache_t *pfcp, * Hold the directory vnode since we have a reference now. */ if (dvp != NULL) - VN_HOLD(dvp); + VN_PHANTOM_HOLD(dvp); *pfpp = pfp; return (0); } @@ -1224,9 +1235,9 @@ port_resolve_vp(vnode_t *vp) */ if (vfs_mntdummyvp && mntfstype != 0 && vp->v_vfsp->vfs_fstype == mntfstype) { - VN_RELE(vp); + VN_PHANTOM_RELE(vp); vp = vfs_mntdummyvp; - VN_HOLD(vfs_mntdummyvp); + VN_PHANTOM_HOLD(vfs_mntdummyvp); } /* @@ -1234,8 +1245,8 @@ port_resolve_vp(vnode_t *vp) * hardlinks. */ if ((VOP_REALVP(vp, &rvp, NULL) == 0) && vp != rvp) { - VN_HOLD(rvp); - VN_RELE(vp); + VN_PHANTOM_HOLD(rvp); + VN_PHANTOM_RELE(vp); vp = rvp; } return (vp); @@ -1247,10 +1258,10 @@ port_resolve_vp(vnode_t *vp) * The association is identified by the object pointer and the pid. * The events argument contains the events to be monitored for. * - * The vnode will have a VN_HOLD once the fem hooks are installed. + * The vnode will have a VN_PHANTOM_HOLD once the fem hooks are installed. * - * Every reference(pfp) to the directory vnode will have a VN_HOLD to ensure - * that the directory vnode pointer does not change. + * Every reference(pfp) to the directory vnode will have a VN_PHANTOM_HOLD to + * ensure that the directory vnode pointer does not change. */ int port_associate_fop(port_t *pp, int source, uintptr_t object, int events, @@ -1330,7 +1341,7 @@ port_associate_fop(port_t *pp, int source, uintptr_t object, int events, */ if (dvp != NULL && dvp->v_vfsp != vp->v_vfsp && !(orig->v_type == VPROC && vp != NULL && vp->v_type != VPROC)) { - VN_RELE(dvp); + VN_PHANTOM_RELE(dvp); dvp = NULL; } @@ -1350,8 +1361,8 @@ port_associate_fop(port_t *pp, int source, uintptr_t object, int events, pfp = port_cache_lookup_fop(pfcp, curproc->p_pid, object); /* - * If it is not the same vnode, just discard it. VN_RELE needs to be - * called with no locks held, therefore save vnode pointers and + * If it is not the same vnode, just discard it. 
VN_PHANTOM_RELE needs + * to be called with no locks held, therefore save vnode pointers and * vn_rele them later. */ if (pfp != NULL && (pfp->pfop_vp != vp || pfp->pfop_dvp != dvp)) { @@ -1404,7 +1415,7 @@ port_associate_fop(port_t *pp, int source, uintptr_t object, int events, * This vnode pointer is just used * for comparison, so rele it */ - VN_RELE(tvp); + VN_PHANTOM_RELE(tvp); } } @@ -1437,8 +1448,8 @@ port_associate_fop(port_t *pp, int source, uintptr_t object, int events, * active and it is not being removed from * the vnode list. This is checked in * port_remove_fop with the vnode lock held. - * The vnode returned is VN_RELE'ed after dropping - * the locks. + * The vnode returned is VN_PHANTOM_RELE'ed after + * dropping the locks. */ tdvp = tvp = NULL; if (port_remove_fop(pfp, pfcp, 0, NULL, &tvp, &tdvp)) { @@ -1451,9 +1462,9 @@ port_associate_fop(port_t *pp, int source, uintptr_t object, int events, } mutex_exit(&pfcp->pfc_lock); if (tvp != NULL) - VN_RELE(tvp); + VN_PHANTOM_RELE(tvp); if (tdvp != NULL) - VN_RELE(tdvp); + VN_PHANTOM_RELE(tdvp); goto errout; } } else { @@ -1519,14 +1530,14 @@ errout: * Release the hold acquired due to the lookup operation. */ if (vp != NULL) - VN_RELE(vp); + VN_PHANTOM_RELE(vp); if (dvp != NULL) - VN_RELE(dvp); + VN_PHANTOM_RELE(dvp); if (oldvp != NULL) - VN_RELE(oldvp); + VN_PHANTOM_RELE(oldvp); if (olddvp != NULL) - VN_RELE(olddvp); + VN_PHANTOM_RELE(olddvp); /* * copied file name not used, free it. @@ -1587,9 +1598,9 @@ port_dissociate_fop(port_t *pp, uintptr_t object) (void) port_remove_fop(pfp, pfcp, 1, &active, &tvp, &tdvp); mutex_exit(&pfcp->pfc_lock); if (tvp != NULL) - VN_RELE(tvp); + VN_PHANTOM_RELE(tvp); if (tdvp != NULL) - VN_RELE(tdvp); + VN_PHANTOM_RELE(tdvp); return (active ? 0 : ENOENT); } @@ -1610,7 +1621,7 @@ port_close_fop(void *arg, int port, pid_t pid, int lastclose) portfop_t *pfpnext; int index, i; port_source_t *pse; - vnode_t *tdvp = NULL; + vnode_t *tdvp = NULL; vnode_t *vpl[PORTFOP_NVP]; pse = port_getsrc(pp, PORT_SOURCE_FILE); @@ -1627,7 +1638,7 @@ port_close_fop(void *arg, int port, pid_t pid, int lastclose) * be possible as the port is being closed. * * The common case is that the port is not shared and all the entries - * are of this pid and have to be freed. Since VN_RELE has to be + * are of this pid and have to be freed. Since VN_PHANTOM_RELE has to be * called outside the lock, we do it in batches. */ hashtbl = (portfop_t **)pfcp->pfc_hash; @@ -1654,14 +1665,14 @@ port_close_fop(void *arg, int port, pid_t pid, int lastclose) if (pfp == NULL) index++; /* - * Now call VN_RELE if we have collected enough vnodes or - * we have reached the end of the hash table. + * Now call VN_PHANTOM_RELE if we have collected enough vnodes + * or we have reached the end of the hash table. */ if (i >= (PORTFOP_NVP - 1) || (i > 0 && index == PORTFOP_HASHSIZE)) { mutex_exit(&pfcp->pfc_lock); while (i > 0) { - VN_RELE(vpl[--i]); + VN_PHANTOM_RELE(vpl[--i]); vpl[i] = NULL; } mutex_enter(&pfcp->pfc_lock); @@ -1769,7 +1780,7 @@ port_fop_excep(list_t *tlist, int op) port_pcache_remove_fop(pfcp, pfp); mutex_exit(&pfcp->pfc_lock); if (tdvp != NULL) - VN_RELE(tdvp); + VN_PHANTOM_RELE(tdvp); } } @@ -1933,7 +1944,7 @@ port_fop_sendevent(vnode_t *vp, int events, vnode_t *dvp, char *cname) * that may be attempting to remove an object from the vnode's. */ if (port_fop_femuninstall(vp)) - VN_RELE(vp); + VN_PHANTOM_RELE(vp); /* * Send exception events and discard the watch entries. 
@@ -1980,7 +1991,7 @@ port_fop(vnode_t *vp, int op, int retval) event |= FILE_TRUNC; } if (event) { - port_fop_sendevent(vp, event, NULL, NULL); + port_fop_sendevent(vp, event, NULL, NULL); } } @@ -2068,7 +2079,7 @@ port_fop_unmount(fsemarg_t *vf, int flag, cred_t *cr) * unmount is in process. */ port_fop_sendevent(pvp->pvp_vp, UNMOUNTED, NULL, NULL); - VN_RELE(pvp->pvp_vp); + VN_PHANTOM_RELE(pvp->pvp_vp); } error = vfsnext_unmount(vf, flag, cr); diff --git a/usr/src/uts/common/fs/proc/prioctl.c b/usr/src/uts/common/fs/proc/prioctl.c index 08c5f6ffc0..d596d06a34 100644 --- a/usr/src/uts/common/fs/proc/prioctl.c +++ b/usr/src/uts/common/fs/proc/prioctl.c @@ -71,7 +71,7 @@ #include <sys/ctfs_impl.h> #include <sys/ctfs.h> -#if defined(__i386) || defined(__i386_COMPAT) +#if defined(__i386_COMPAT) #include <sys/sysi86.h> #endif @@ -133,6 +133,7 @@ prctioctl(prnode_t *pnp, int cmd, intptr_t arg, int flag, cred_t *cr) /* * Control operations (lots). */ +/* BEGIN CSTYLED */ /*ARGSUSED*/ #ifdef _SYSCALL32_IMPL static int @@ -144,6 +145,7 @@ prioctl(struct vnode *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp, caller_context_t *ct) #endif /* _SYSCALL32_IMPL */ { +/* END CSTYLED */ int nsig = PROC_IS_BRANDED(curproc)? BROP(curproc)->b_nsig : NSIG; caddr_t cmaddr = (caddr_t)arg; proc_t *p; @@ -275,11 +277,11 @@ prioctl(struct vnode *vp, int cmd, intptr_t arg, int flag, cred_t *cr, case PIOCAUXV: break; -#if defined(__i386) || defined(__amd64) +#if defined(__x86) case PIOCNLDT: case PIOCLDT: break; -#endif /* __i386 || __amd64 */ +#endif /* __x86 */ #if defined(__sparc) case PIOCGWIN: @@ -1235,7 +1237,7 @@ startover: break; } -#if defined(__i386) || defined(__amd64) +#if defined(__x86) case PIOCNLDT: /* get number of LDT entries */ { int n; @@ -1290,7 +1292,7 @@ startover: kmem_free(ssd, (n+1) * sizeof (*ssd)); break; } -#endif /* __i386 || __amd64 */ +#endif /* __x86 */ #if defined(__sparc) case PIOCGWIN: /* get gwindows_t (see sys/reg.h) */ @@ -1830,11 +1832,11 @@ prioctl32(struct vnode *vp, int cmd, intptr_t arg, int flag, cred_t *cr, case PIOCAUXV: break; -#if defined(__i386) || defined(__i386_COMPAT) +#if defined(__i386_COMPAT) case PIOCNLDT: case PIOCLDT: break; -#endif /* __i386 || __i386_COMPAT */ +#endif /* __i386_COMPAT */ #if defined(__sparc) case PIOCGWIN: @@ -2867,7 +2869,7 @@ startover: break; } -#if defined(__i386) || defined(__i386_COMPAT) +#if defined(__i386_COMPAT) case PIOCNLDT: /* get number of LDT entries */ { int n; @@ -2922,7 +2924,7 @@ startover: kmem_free(ssd, (n+1) * sizeof (*ssd)); break; } -#endif /* __i386 || __i386_COMPAT */ +#endif /* __i386_COMPAT */ #if defined(__sparc) case PIOCGWIN: /* get gwindows_t (see sys/reg.h) */ diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c index d096168b48..2dccbb2f63 100644 --- a/usr/src/uts/common/fs/proc/prvnops.c +++ b/usr/src/uts/common/fs/proc/prvnops.c @@ -954,8 +954,7 @@ pr_read_fdinfo(prnode_t *pnp, uio_t *uiop, cred_t *cr) fdinfo = pr_iol_newbuf(&data, offsetof(prfdinfo_t, pr_misc)); fdinfo->pr_fd = fd; fdinfo->pr_fdflags = ufp_flag; - /* FEPOLLED on f_flag2 should never be user-visible */ - fdinfo->pr_fileflags = (fp->f_flag2 & ~FEPOLLED) << 16 | fp->f_flag; + fdinfo->pr_fileflags = fp->f_flag2 << 16 | fp->f_flag; if ((fdinfo->pr_fileflags & (FSEARCH | FEXEC)) == 0) fdinfo->pr_fileflags += FOPEN; fdinfo->pr_offset = fp->f_offset; @@ -6236,7 +6235,7 @@ prseek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct) /* * We use the p_execdir member of proc_t to expand 
the %d token in core file * paths (the directory path for the executable that dumped core; see - * coreadm(1M) for details). We'd like gcore(1) to be able to expand %d in + * coreadm(8) for details). We'd like gcore(1) to be able to expand %d in * the same way as core dumping from the kernel, but there's no convenient * and comprehensible way to export the path name for p_execdir. To solve * this, we try to find the actual path to the executable that was used. In diff --git a/usr/src/uts/common/fs/smbclnt/netsmb/nsmb_sign_kcf.c b/usr/src/uts/common/fs/smbclnt/netsmb/nsmb_sign_kcf.c index 4235c94a06..f1a24bfeff 100644 --- a/usr/src/uts/common/fs/smbclnt/netsmb/nsmb_sign_kcf.c +++ b/usr/src/uts/common/fs/smbclnt/netsmb/nsmb_sign_kcf.c @@ -32,7 +32,7 @@ * Common function to see if a mech is available. */ static int -find_mech(smb_sign_mech_t *mech, crypto_mech_name_t name) +find_mech(smb_sign_mech_t *mech, const char *name) { crypto_mech_type_t t; diff --git a/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_smb.c b/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_smb.c index 73b5c62225..16b9987972 100644 --- a/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_smb.c +++ b/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_smb.c @@ -34,7 +34,7 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * - * Copyright 2019 Nexenta by DDN, Inc. All rights reserved. + * Copyright 2021 Tintri by DDN, Inc. All rights reserved. */ #include <sys/param.h> @@ -439,7 +439,7 @@ out: if (fhp != NULL) smb_fh_rele(fhp); - return (0); + return (error); } void diff --git a/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vnops.c b/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vnops.c index 3fca806155..c19e92976f 100644 --- a/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vnops.c +++ b/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vnops.c @@ -34,7 +34,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2018 Nexenta Systems, Inc. All rights reserved. + * Copyright 2021 Tintri by DDN, Inc. All rights reserved. */ /* @@ -326,6 +326,7 @@ smbfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) /* * We have a new FID and access rights. 
*/ + VERIFY(fid != NULL); oldfid = np->n_fid; np->n_fid = fid; np->n_fidrefs++; @@ -562,6 +563,10 @@ smbfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, if (smi->smi_flags & SMI_DEAD || vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) return (EIO); + /* Sanity check: should have a valid open */ + if (np->n_fid == NULL) + return (EIO); + ASSERT(smbfs_rw_lock_held(&np->r_rwlock, RW_READER)); if (vp->v_type != VREG) @@ -723,6 +728,10 @@ smbfs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, if (smi->smi_flags & SMI_DEAD || vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) return (EIO); + /* Sanity check: should have a valid open */ + if (np->n_fid == NULL) + return (EIO); + ASSERT(smbfs_rw_lock_held(&np->r_rwlock, RW_WRITER)); if (vp->v_type != VREG) @@ -4427,6 +4436,10 @@ smbfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, if (smi->smi_flags & SMI_DEAD || vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) return (EIO); + /* Sanity check: should have a valid open */ + if (np->n_fid == NULL) + return (EIO); + if (vp->v_flag & VNOMAP) return (ENOSYS); diff --git a/usr/src/uts/common/fs/smbsrv/smb2_dispatch.c b/usr/src/uts/common/fs/smbsrv/smb2_dispatch.c index 9010e3a181..9aafb6e4d7 100644 --- a/usr/src/uts/common/fs/smbsrv/smb2_dispatch.c +++ b/usr/src/uts/common/fs/smbsrv/smb2_dispatch.c @@ -11,7 +11,7 @@ /* * Copyright 2019 Nexenta Systems, Inc. All rights reserved. - * Copyright 2019 RackTop Systems. + * Copyright 2020 RackTop Systems, Inc. */ @@ -973,6 +973,19 @@ cmd_done: */ (void) smb2_encode_header(sr, B_TRUE); + /* + * Cannot move this into smb2_session_setup() - encoded header required. + */ + if (session->dialect >= SMB_VERS_3_11 && + sr->smb2_cmd_code == SMB2_SESSION_SETUP && + sr->smb2_status == NT_STATUS_MORE_PROCESSING_REQUIRED) { + if (smb31_preauth_sha512_calc(sr, &sr->reply, + sr->uid_user->u_preauth_hashval, + sr->uid_user->u_preauth_hashval) != 0) + cmn_err(CE_WARN, "(3) Preauth hash calculation " + "failed"); + } + /* Don't sign if we're going to encrypt */ if (sr->tform_ssn == NULL && (sr->smb2_hdr_flags & SMB2_FLAGS_SIGNED) != 0) @@ -1109,8 +1122,8 @@ cmd_start: disconnect = B_TRUE; goto cleanup; } - sr->smb2_hdr_flags |= (SMB2_FLAGS_SERVER_TO_REDIR | - SMB2_FLAGS_ASYNC_COMMAND); + sr->smb2_hdr_flags |= (SMB2_FLAGS_SERVER_TO_REDIR | + SMB2_FLAGS_ASYNC_COMMAND); sr->smb2_async_id = SMB2_ASYNCID(sr); /* @@ -1479,8 +1492,7 @@ smb2_send_reply(smb_request_t *sr) if ((session->capabilities & SMB2_CAP_ENCRYPTION) == 0 || sr->tform_ssn == NULL) { - if (smb_session_send(sr->session, 0, &sr->reply) == 0) - sr->reply.chain = 0; + (void) smb_session_send(sr->session, 0, &sr->reply); return; } @@ -1505,8 +1517,8 @@ smb2_send_reply(smb_request_t *sr) goto errout; } - if (smb_session_send(sr->session, 0, &enc_reply) == 0) - enc_reply.chain = 0; + (void) smb_session_send(sr->session, 0, &enc_reply); + kmem_free(tmpbuf, buflen); return; errout: @@ -1590,6 +1602,66 @@ smb2sr_put_error_data(smb_request_t *sr, uint32_t status, mbuf_chain_t *mbc) } /* + * Build an SMB2 error context response (dialect 3.1.1). + */ +void +smb2sr_put_error_ctx(smb_request_t *sr, uint32_t status, uint32_t errid, + mbuf_chain_t *mbc) +{ + DWORD len; + + /* + * The common dispatch code writes this when it + * updates the SMB2 header before sending. + */ + sr->smb2_status = status; + + /* Rewind to the end of the SMB header. */ + sr->reply.chain_offset = sr->smb2_reply_hdr + SMB2_HDR_SIZE; + + /* + * Error Context is 8-byte header plus encaps. data (ErrorContextData), + * which can be zero-length. 
+ */ + if (mbc != NULL && (len = MBC_LENGTH(mbc)) != 0) { + (void) smb_mbc_encodef( + &sr->reply, + "wbblllC", + 9, /* StructSize */ /* w */ + 1, /* ErrorContextCount */ /* b */ + 0, /* reserved */ /* b */ + 8+len, /* ByteCount */ /* l */ + len, /* ErrorDataLength */ /* l */ + errid, /* ErrorId */ /* l */ + mbc); /* C */ + } else { + (void) smb_mbc_encodef( + &sr->reply, + "wbblll", + 9, /* StructSize */ /* w */ + 1, /* ErrorContextCount */ /* b */ + 0, /* reserved */ /* b */ + 8, /* ByteCount */ /* l */ + 0, /* ErrorDataLength */ /* l */ + errid); /* ErrorId */ /* l */ + } +} + +/* + * Build an SMB2 error context response with SMB2_ERROR_ID_DEFAULT ErrorId. + * + * This only handles the case we currently need, encapsulating a + * single error data section inside an SMB2_ERROR_ID_DEFAULT + * error context type (which is type zero, and that's what + * the zero on the end of this function name refers to). + */ +void +smb2sr_put_error_ctx0(smb_request_t *sr, uint32_t status, mbuf_chain_t *mbc) +{ + return (smb2sr_put_error_ctx(sr, status, SMB2_ERROR_ID_DEFAULT, mbc)); +} + +/* * smb2sr_lookup_fid * * Setup sr->fid_ofile, either inherited from a related command, diff --git a/usr/src/uts/common/fs/smbsrv/smb2_durable.c b/usr/src/uts/common/fs/smbsrv/smb2_durable.c index 56dda62832..c783cd9659 100644 --- a/usr/src/uts/common/fs/smbsrv/smb2_durable.c +++ b/usr/src/uts/common/fs/smbsrv/smb2_durable.c @@ -10,7 +10,7 @@ */ /* - * Copyright 2018 Nexenta Systems, Inc. All rights reserved. + * Copyright 2020 Tintri by DDN, Inc. All rights reserved. */ /* @@ -179,6 +179,8 @@ preserve_some: /* preserve_opens == SMB2_DH_PRESERVE_SOME */ switch (of->dh_vers) { + uint32_t ol_state; + case SMB2_RESILIENT: return (B_TRUE); @@ -188,7 +190,11 @@ preserve_some: /* FALLTHROUGH */ case SMB2_DURABLE_V1: /* IS durable (v1 or v2) */ - if ((of->f_oplock.og_state & (OPLOCK_LEVEL_BATCH | + if (of->f_lease != NULL) + ol_state = of->f_lease->ls_state; + else + ol_state = of->f_oplock.og_state; + if ((ol_state & (OPLOCK_LEVEL_BATCH | OPLOCK_LEVEL_CACHE_HANDLE)) != 0) return (B_TRUE); /* FALLTHROUGH */ @@ -360,6 +366,12 @@ smb2_dh_import_share(void *arg) break; /* + * If the server's stopping, no point importing. 
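+		 * (Checked once per stream entry so a shutdown can
+		 * interrupt a long-running CA import rather than
+		 * waiting for the whole directory scan.)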
+ */ + if (smb_server_is_stopping(sr->sr_server)) + break; + + /* * Read a stream name and info */ rc = smb_odir_read_streaminfo(sr, od, str_info, &eof); @@ -392,6 +404,7 @@ smb2_dh_import_share(void *arg) of = NULL; } sr->fid_ofile = NULL; + smb_llist_flush(&sr->tid_tree->t_ofile_list); } while (!eof); @@ -813,7 +826,7 @@ smb2_dh_read_nvlist(smb_request_t *sr, smb_node_t *node, smb_attr_t attr; iovec_t iov; uio_t uio; - smb_kshare_t *shr = sr->arg.tcon.si; + smb_tree_t *tree = sr->tid_tree; cred_t *kcr = zone_kcred(); size_t flen; int rc; @@ -823,14 +836,14 @@ smb2_dh_read_nvlist(smb_request_t *sr, smb_node_t *node, rc = smb_node_getattr(NULL, node, kcr, NULL, &attr); if (rc != 0) { cmn_err(CE_NOTE, "CA import (%s/%s) getattr rc=%d", - shr->shr_path, node->od_name, rc); + tree->t_resource, node->od_name, rc); return (rc); } if (attr.sa_vattr.va_size < 4 || attr.sa_vattr.va_size > sr->sr_req_length) { cmn_err(CE_NOTE, "CA import (%s/%s) bad size=%" PRIu64, - shr->shr_path, node->od_name, + tree->t_resource, node->od_name, (uint64_t)attr.sa_vattr.va_size); return (EINVAL); } @@ -847,19 +860,19 @@ smb2_dh_read_nvlist(smb_request_t *sr, smb_node_t *node, rc = smb_fsop_read(sr, kcr, node, NULL, &uio, 0); if (rc != 0) { cmn_err(CE_NOTE, "CA import (%s/%s) read, rc=%d", - shr->shr_path, node->od_name, rc); + tree->t_resource, node->od_name, rc); return (rc); } if (uio.uio_resid != 0) { cmn_err(CE_NOTE, "CA import (%s/%s) short read", - shr->shr_path, node->od_name); + tree->t_resource, node->od_name); return (EIO); } rc = nvlist_unpack(sr->sr_request_buf, flen, nvlpp, KM_SLEEP); if (rc != 0) { cmn_err(CE_NOTE, "CA import (%s/%s) unpack, rc=%d", - shr->shr_path, node->od_name, rc); + tree->t_resource, node->od_name, rc); return (rc); } diff --git a/usr/src/uts/common/fs/smbsrv/smb2_fsctl_copychunk.c b/usr/src/uts/common/fs/smbsrv/smb2_fsctl_copychunk.c index 4240328207..930bd353c4 100644 --- a/usr/src/uts/common/fs/smbsrv/smb2_fsctl_copychunk.c +++ b/usr/src/uts/common/fs/smbsrv/smb2_fsctl_copychunk.c @@ -204,7 +204,7 @@ smb2_fsctl_copychunk(smb_request_t *sr, smb_fsctl_t *fsctl) * The client should then fall back to normal copy. */ args->bufsize = smb2_copychunk_max_seg; - args->buffer = kmem_alloc(args->bufsize, KM_NOSLEEP | KM_NORMALPRI); + args->buffer = kmem_alloc(args->bufsize, KM_NOSLEEP_LAZY); if (args->buffer == NULL) { status = NT_STATUS_INSUFF_SERVER_RESOURCES; goto out; @@ -447,6 +447,8 @@ smb2_fsctl_copychunk_meta(smb_request_t *sr, smb_ofile_t *src_of) * here don't generally have WRITE_DAC access (sigh) so we * have to bypass ofile access checks for this operation. * The file-system level still does its access checking. + * + * TODO: this should really copy the SACL, too. */ smb_fssd_init(&fs_sd, secinfo, sd_flags); sr->fid_ofile = NULL; diff --git a/usr/src/uts/common/fs/smbsrv/smb2_fsctl_fs.c b/usr/src/uts/common/fs/smbsrv/smb2_fsctl_fs.c index 829beda2e4..381fd7663e 100644 --- a/usr/src/uts/common/fs/smbsrv/smb2_fsctl_fs.c +++ b/usr/src/uts/common/fs/smbsrv/smb2_fsctl_fs.c @@ -10,7 +10,7 @@ */ /* - * Copyright 2018 Nexenta Systems, Inc. All rights reserved. + * Copyright 2019 Nexenta by DDN, Inc. All rights reserved. */ /* @@ -23,7 +23,16 @@ #include <smbsrv/smb_fsops.h> #include <smb/winioctl.h> -/* ARGSUSED */ +/* + * XXX: Should use smb2_fsctl_invalid in place of smb2_fsctl_notsup + * but that will require some re-testing. 
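+ * (The difference: smb2_fsctl_notsup returns NT_STATUS_NOT_SUPPORTED,
+ * while smb2_fsctl_invalid below returns
+ * NT_STATUS_INVALID_DEVICE_REQUEST, which appears closer to what
+ * Windows returns for FSCTLs it does not recognize.)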
+ */ +static uint32_t +smb2_fsctl_invalid(smb_request_t *sr, smb_fsctl_t *fsctl) +{ + return (NT_STATUS_INVALID_DEVICE_REQUEST); +} + static uint32_t smb2_fsctl_notsup(smb_request_t *sr, smb_fsctl_t *fsctl) { @@ -52,9 +61,12 @@ smb2_fsctl_get_compression(smb_request_t *sr, smb_fsctl_t *fsctl) { _NOTE(ARGUNUSED(sr)) uint16_t compress_state = 0; + int rc; - (void) smb_mbc_encodef(fsctl->in_mbc, "w", + rc = smb_mbc_encodef(fsctl->in_mbc, "w", compress_state); + if (rc != 0) + return (NT_STATUS_BUFFER_OVERFLOW); return (NT_STATUS_SUCCESS); } @@ -97,6 +109,7 @@ smb2_fsctl_get_resume_key(smb_request_t *sr, smb_fsctl_t *fsctl) { smb_ofile_t *of = sr->fid_ofile; smb2fid_t smb2fid; + int rc; /* Caller makes sure we have of = sr->fid_ofile */ /* Don't insist on a plain file (see above). */ @@ -104,10 +117,12 @@ smb2_fsctl_get_resume_key(smb_request_t *sr, smb_fsctl_t *fsctl) smb2fid.persistent = of->f_persistid; smb2fid.temporal = of->f_fid; - (void) smb_mbc_encodef( + rc = smb_mbc_encodef( fsctl->out_mbc, "qq16.", smb2fid.persistent, smb2fid.temporal); + if (rc != 0) + return (NT_STATUS_BUFFER_OVERFLOW); return (NT_STATUS_SUCCESS); } @@ -130,9 +145,11 @@ smb2_fsctl_fs(smb_request_t *sr, smb_fsctl_t *fsctl) break; case FSCTL_SET_REPARSE_POINT: /* 41 */ case FSCTL_GET_REPARSE_POINT: /* 42 */ - case FSCTL_CREATE_OR_GET_OBJECT_ID: /* 48 */ func = smb2_fsctl_notsup; break; + case FSCTL_CREATE_OR_GET_OBJECT_ID: /* 48 */ + func = smb2_fsctl_invalid; + break; case FSCTL_SET_SPARSE: /* 49 */ func = smb2_fsctl_set_sparse; break; diff --git a/usr/src/uts/common/fs/smbsrv/smb2_fsctl_odx.c b/usr/src/uts/common/fs/smbsrv/smb2_fsctl_odx.c index 0452cddb39..fe748bbd62 100644 --- a/usr/src/uts/common/fs/smbsrv/smb2_fsctl_odx.c +++ b/usr/src/uts/common/fs/smbsrv/smb2_fsctl_odx.c @@ -667,7 +667,7 @@ smb2_fsctl_odx_write_native1(smb_request_t *sr, * allow the allocation to fail and return an error. * The client should then fall back to normal copy. */ - buffer = kmem_alloc(bufsize, KM_NOSLEEP | KM_NORMALPRI); + buffer = kmem_alloc(bufsize, KM_NOSLEEP_LAZY); if (buffer == NULL) { status = NT_STATUS_INSUFF_SERVER_RESOURCES; goto out; diff --git a/usr/src/uts/common/fs/smbsrv/smb2_lease.c b/usr/src/uts/common/fs/smbsrv/smb2_lease.c index 95d7d9c7f1..a23f474cec 100644 --- a/usr/src/uts/common/fs/smbsrv/smb2_lease.c +++ b/usr/src/uts/common/fs/smbsrv/smb2_lease.c @@ -10,7 +10,7 @@ */ /* - * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + * Copyright 2021 Tintri by DDN, Inc. All rights reserved. */ /* @@ -653,7 +653,6 @@ done: ofile->f_oplock.og_state = op->op_oplock_state; mutex_enter(&lease->ls_mutex); lease->ls_state = op->op_oplock_state & CACHE_RWH; - lease->ls_oplock_ofile = ofile; lease->ls_epoch++; mutex_exit(&lease->ls_mutex); } @@ -685,6 +684,9 @@ smb2_lease_ofile_close(smb_ofile_t *ofile) smb_lease_t *lease = ofile->f_lease; smb_ofile_t *o; + ASSERT(RW_READ_HELD(&node->n_ofile_list.ll_lock)); + ASSERT(MUTEX_HELD(&node->n_oplock.ol_mutex)); + /* * If this ofile was not the oplock owner for this lease, * we can leave things as they are. @@ -696,24 +698,22 @@ smb2_lease_ofile_close(smb_ofile_t *ofile) * Find another ofile to which we can move the oplock. * The ofile must be open and allow a new ref. */ - smb_llist_enter(&node->n_ofile_list, RW_READER); FOREACH_NODE_OFILE(node, o) { if (o == ofile) continue; if (o->f_lease != lease) continue; + if (o->f_oplock.og_closing) + continue; /* If we can get a hold, use this ofile. 
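 		 * (og_closing was just checked, so the oplock is never
 		 * moved to an ofile that is already being torn down.)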
 		 */
 		if (smb_ofile_hold(o))
 			break;
 	}
 	if (o == NULL) {
 		/* Normal for last close on a lease. */
-		smb_llist_exit(&node->n_ofile_list);
 		return;
 	}
 
 	smb_oplock_move(node, ofile, o);
-	lease->ls_oplock_ofile = o;
-	smb_llist_exit(&node->n_ofile_list);
 	smb_ofile_release(o);
 }
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_negotiate.c b/usr/src/uts/common/fs/smbsrv/smb2_negotiate.c
index e8d8419f93..7d67247588 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_negotiate.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_negotiate.c
@@ -11,7 +11,7 @@
 
 /*
  * Copyright 2019 Nexenta Systems, Inc. All rights reserved.
- * Copyright 2019 RackTop Systems.
+ * Copyright 2021 RackTop Systems, Inc.
  */
 
 /*
@@ -20,14 +20,7 @@
 
 #include <smbsrv/smb2_kproto.h>
 #include <smbsrv/smb2.h>
-
-/*
- * Note from [MS-SMB2] Sec. 2.2.3: Windows servers return
- * invalid parameter if the dialect count is greater than 64
- * This is here (and not in smb2.h) because this is technically
- * an implementation detail, not protocol specification.
- */
-#define	SMB2_NEGOTIATE_MAX_DIALECTS	64
+#include <sys/random.h>
 
 static int smb2_negotiate_common(smb_request_t *, uint16_t);
@@ -85,6 +78,7 @@ static uint16_t smb2_versions[] = {
 	0x210,	/* SMB 2.1 */
 	0x300,	/* SMB 3.0 */
 	0x302,	/* SMB 3.02 */
+	0x311,	/* SMB 3.11 */
 };
 static uint16_t smb2_nversions =
     sizeof (smb2_versions) / sizeof (smb2_versions[0]);
@@ -210,16 +204,377 @@ smb2_find_best_dialect(smb_session_t *s, uint16_t cl_versions[],
  * Return value is 0 for success, and anything else will
  * terminate the reader thread (drop the connection).
  */
+enum smb2_neg_ctx_type {
+	SMB2_PREAUTH_INTEGRITY_CAPS = 1,
+	SMB2_ENCRYPTION_CAPS = 2,
+	SMB2_COMPRESSION_CAPS = 3,	/* not implemented */
+	SMB2_NETNAME_NEGOTIATE_CONTEXT_ID = 5	/* not implemented */
+};
+
+typedef struct smb2_negotiate_ctx {
+	uint16_t	type;
+	uint16_t	datalen;
+} smb2_neg_ctx_t;
+
+#define	SMB31_PREAUTH_CTX_SALT_LEN	32
+
+/*
+ * SMB 3.1.1 originally specified a single hashing algorithm - SHA-512 - and
+ * two encryption ones - AES-128-CCM and AES-128-GCM.
+ * Windows Server 2022 and Windows 11 introduced two further encryption
+ * algorithms - AES-256-CCM and AES-256-GCM.
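+ * MAX_CIPHER_NUM is sized so all four IDs a newer client may offer
+ * can be decoded, though as of this change only the AES-128 ciphers
+ * can actually be selected (see smb3_encrypt_init_mech).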
+ */
+#define	MAX_HASHID_NUM	(1)
+#define	MAX_CIPHER_NUM	(4)
+
+typedef struct smb2_preauth_integrity_caps {
+	uint16_t	picap_hash_count;
+	uint16_t	picap_salt_len;
+	uint16_t	picap_hash_id;
+	uint8_t		picap_salt[SMB31_PREAUTH_CTX_SALT_LEN];
+} smb2_preauth_caps_t;
+
+typedef struct smb2_encryption_caps {
+	uint16_t	encap_cipher_count;
+	uint16_t	encap_cipher_ids[MAX_CIPHER_NUM];
+} smb2_encrypt_caps_t;
+
+/*
+ * The contexts we support
+ */
+typedef struct smb2_preauth_neg_ctx {
+	smb2_neg_ctx_t		neg_ctx;
+	smb2_preauth_caps_t	preauth_caps;
+} smb2_preauth_neg_ctx_t;
+
+typedef struct smb2_encrypt_neg_ctx {
+	smb2_neg_ctx_t		neg_ctx;
+	smb2_encrypt_caps_t	encrypt_caps;
+} smb2_encrypt_neg_ctx_t;
+
+typedef struct smb2_neg_ctxs {
+	uint32_t		offset;
+	uint16_t		count;
+	smb2_preauth_neg_ctx_t	preauth_ctx;
+	smb2_encrypt_neg_ctx_t	encrypt_ctx;
+} smb2_neg_ctxs_t;
+
+#define	NEG_CTX_INFO_OFFSET	(SMB2_HDR_SIZE + 28)
+#define	NEG_CTX_OFFSET_OFFSET	(SMB2_HDR_SIZE + 64)
+#define	NEG_CTX_MAX_COUNT	(16)
+#define	NEG_CTX_MAX_DATALEN	(256)
+
+#define	STATUS_SMB_NO_PREAUTH_INTEGRITY_HASH_OVERLAP	(0xC05D0000)
+
+#define	STATUS_PREAUTH_HASH_OVERLAP \
+	STATUS_SMB_NO_PREAUTH_INTEGRITY_HASH_OVERLAP
+
+#define	SMB3_CIPHER_ENABLED(c, f)	((c) <= SMB3_CIPHER_MAX && \
+	SMB3_CIPHER_BIT(c) & (f))
+
+/*
+ * This function should be called only for dialect >= 0x311.
+ * The negotiate context list should contain exactly one
+ * SMB2_PREAUTH_INTEGRITY_CAPS context; otherwise
+ * STATUS_INVALID_PARAMETER.
+ * It should contain at least one hash algorithm that the server
+ * supports; otherwise STATUS_SMB_NO_PREAUTH_INTEGRITY_HASH_OVERLAP.
+ */
+static uint32_t
+smb31_decode_neg_ctxs(smb_request_t *sr, smb2_neg_ctxs_t *neg_ctxs)
+{
+	smb_session_t *s = sr->session;
+	smb2_preauth_caps_t *picap = &neg_ctxs->preauth_ctx.preauth_caps;
+	smb2_encrypt_caps_t *encap = &neg_ctxs->encrypt_ctx.encrypt_caps;
+	boolean_t found_sha512 = B_FALSE;
+	boolean_t found_cipher = B_FALSE;
+	uint16_t ciphers = sr->sr_server->sv_cfg.skc_encrypt_cipher;
+	uint32_t status = 0;
+	int32_t skip;
+	int32_t ctx_end_off = 0;
+	int32_t ctx_next_off = 0;
+	int found_preauth_ctx = 0;
+	int found_encrypt_ctx = 0;
+	int cnt, i;
+	int rc;
+
+	sr->command.chain_offset = NEG_CTX_INFO_OFFSET;
+
+	rc = smb_mbc_decodef(&sr->command, "lw2.",
+	    &neg_ctxs->offset,	/* l */
+	    &neg_ctxs->count);	/* w */
+	if (rc != 0) {
+		status = NT_STATUS_INVALID_PARAMETER;
+		goto errout;
+	}
+	/*
+	 * There should be exactly 1 SMB2_PREAUTH_INTEGRITY_CAPS negotiate ctx.
+	 * SMB2_ENCRYPTION_CAPS is optional.
+	 * If there are no contexts, or too many, stop parsing.
+	 */
+	cnt = neg_ctxs->count;
+	if (cnt < 1 || cnt > NEG_CTX_MAX_COUNT) {
+		status = NT_STATUS_INVALID_PARAMETER;
+		goto errout;
+	}
+
+	/*
+	 * Cannot proceed parsing if the first context isn't 8-byte aligned.
+	 */
+	if (neg_ctxs->offset % 8 != 0) {
+		status = NT_STATUS_INVALID_PARAMETER;
+		goto errout;
+	}
+
+	if ((skip = neg_ctxs->offset - sr->command.chain_offset) != 0 &&
+	    smb_mbc_decodef(&sr->command, "#.", skip) != 0) {
+		status = NT_STATUS_INVALID_PARAMETER;
+		goto errout;
+	}
+
+	/*
+	 * Parse negotiate contexts. Ignore non-decoding errors so we
+	 * fill in as much data as possible for the dtrace probe.
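+	 *
+	 * Each pass of the loop below consumes one context of the form
+	 * (a sketch, per [MS-SMB2] 2.2.3.1):
+	 *
+	 *	uint16_t ContextType;
+	 *	uint16_t DataLength;
+	 *	uint32_t Reserved;
+	 *	uint8_t  Data[DataLength];
+	 *
+	 * padded to the next 8-byte boundary before the next context.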
+	 */
+	for (i = 0; i < cnt; i++) {
+		smb2_neg_ctx_t neg_ctx;
+
+		if (i > 0) {
+			if ((skip = ctx_next_off - ctx_end_off) != 0 &&
+			    smb_mbc_decodef(&sr->command, "#.", skip) != 0) {
+				status = NT_STATUS_INVALID_PARAMETER;
+				goto errout;
+			}
+		}
+
+		rc = smb_mbc_decodef(
+		    &sr->command, "ww4.",
+		    &neg_ctx.type,	/* w */
+		    &neg_ctx.datalen);	/* w */
+		if (rc != 0) {
+			status = NT_STATUS_INVALID_PARAMETER;
+			goto errout;
+		}
+
+		/*
+		 * We got something crazy
+		 */
+		if (neg_ctx.datalen > NEG_CTX_MAX_DATALEN) {
+			status = NT_STATUS_INVALID_PARAMETER;
+			goto errout;
+		}
+
+		ctx_end_off = sr->command.chain_offset + neg_ctx.datalen;
+		ctx_next_off = P2ROUNDUP(ctx_end_off, 8);
+
+		switch (neg_ctx.type) {
+		case SMB2_PREAUTH_INTEGRITY_CAPS:
+			memcpy(&neg_ctxs->preauth_ctx.neg_ctx, &neg_ctx,
+			    sizeof (neg_ctx));
+
+			if (found_preauth_ctx++ != 0) {
+				status = NT_STATUS_INVALID_PARAMETER;
+				continue;
+			}
+
+			rc = smb_mbc_decodef(
+			    &sr->command, "ww",
+			    &picap->picap_hash_count,	/* w */
+			    &picap->picap_salt_len);	/* w */
+			if (rc != 0 || picap->picap_hash_count >
+			    MAX_HASHID_NUM) {
+				status = NT_STATUS_INVALID_PARAMETER;
+				goto errout;
+			}
+
+			/*
+			 * Get hash id
+			 */
+			rc = smb_mbc_decodef(
+			    &sr->command, "#w",
+			    picap->picap_hash_count,
+			    &picap->picap_hash_id);	/* w */
+			if (rc != 0) {
+				status = NT_STATUS_INVALID_PARAMETER;
+				goto errout;
+			}
+
+			/*
+			 * Get salt
+			 */
+			rc = smb_mbc_decodef(
+			    &sr->command, "#c",
+			    sizeof (picap->picap_salt),
+			    &picap->picap_salt[0]);	/* #c */
+			if (rc != 0) {
+				status = NT_STATUS_INVALID_PARAMETER;
+				goto errout;
+			}
+
+			/*
+			 * In SMB 0x311 there should be exactly 1 preauth
+			 * negotiate context, and there should be exactly 1
+			 * hash value in the list - SHA512.
+			 */
+			if (picap->picap_hash_count != 1) {
+				status = NT_STATUS_INVALID_PARAMETER;
+				continue;
+			}
+
+			if (picap->picap_hash_id == SMB3_HASH_SHA512)
+				found_sha512 = B_TRUE;
+			break;
+		case SMB2_ENCRYPTION_CAPS:
+			memcpy(&neg_ctxs->encrypt_ctx.neg_ctx, &neg_ctx,
+			    sizeof (neg_ctx));
+
+			if (found_encrypt_ctx++ != 0) {
+				status = NT_STATUS_INVALID_PARAMETER;
+				continue;
+			}
+
+			rc = smb_mbc_decodef(
+			    &sr->command, "w",
+			    &encap->encap_cipher_count);	/* w */
+			if (rc != 0 || encap->encap_cipher_count >
+			    MAX_CIPHER_NUM) {
+				status = NT_STATUS_INVALID_PARAMETER;
+				goto errout;
+			}
+
+			/*
+			 * Get cipher list
+			 */
+			rc = smb_mbc_decodef(
+			    &sr->command, "#w",
+			    encap->encap_cipher_count,
+			    &encap->encap_cipher_ids[0]);	/* w */
+			if (rc != 0) {
+				status = NT_STATUS_INVALID_PARAMETER;
+				goto errout;
+			}
+
+			/*
+			 * Select the first enabled cipher.
+			 * The client lists its preferred ciphers first.
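+			 * E.g. a client offering AES-256-GCM,
+			 * AES-128-GCM, AES-128-CCM (in that order)
+			 * negotiates AES-128-GCM here when only the
+			 * AES-128 ciphers are enabled in
+			 * skc_encrypt_cipher.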
+ */ + for (int k = 0; k < encap->encap_cipher_count; k++) { + uint16_t c = encap->encap_cipher_ids[k]; + + if (SMB3_CIPHER_ENABLED(c, ciphers)) { + s->smb31_enc_cipherid = c; + found_cipher = B_TRUE; + break; + } + } + break; + default: + ; + } + } + + if (status) + goto errout; + + /* Not found mandatory SMB2_PREAUTH_INTEGRITY_CAPS ctx */ + if (found_preauth_ctx != 1 || found_encrypt_ctx > 1) { + status = NT_STATUS_INVALID_PARAMETER; + goto errout; + } + + if (!found_sha512) { + status = STATUS_PREAUTH_HASH_OVERLAP; + goto errout; + } + + s->smb31_preauth_hashid = SMB3_HASH_SHA512; + + if (!found_cipher) + s->smb31_enc_cipherid = 0; + +errout: + return (status); +} + +static int +smb31_encode_neg_ctxs(smb_request_t *sr, smb2_neg_ctxs_t *neg_ctxs) +{ + smb_session_t *s = sr->session; + smb2_preauth_caps_t *picap = &neg_ctxs->preauth_ctx.preauth_caps; + smb2_encrypt_caps_t *encap = &neg_ctxs->encrypt_ctx.encrypt_caps; + uint16_t salt_len = sizeof (picap->picap_salt); + uint32_t preauth_ctx_len = 6 + salt_len; + uint32_t enc_ctx_len = 4; + uint32_t neg_ctx_off = NEG_CTX_OFFSET_OFFSET + + P2ROUNDUP(sr->sr_cfg->skc_negtok_len, 8); + uint32_t rc; + + bzero(neg_ctxs, sizeof (*neg_ctxs)); + + if ((rc = smb_mbc_put_align(&sr->reply, 8)) != 0) + return (rc); + + ASSERT3S(neg_ctx_off, ==, sr->reply.chain_offset); + + encap->encap_cipher_ids[0] = s->smb31_enc_cipherid; + picap->picap_hash_id = s->smb31_preauth_hashid; + picap->picap_salt_len = salt_len; + + (void) random_get_pseudo_bytes(picap->picap_salt, salt_len); + + rc = smb_mbc_encodef( + &sr->reply, "ww4.", + SMB2_PREAUTH_INTEGRITY_CAPS, + preauth_ctx_len + /* 4. */); /* reserved */ + if (rc != 0) + return (rc); + + rc = smb_mbc_encodef( + &sr->reply, "www#c", + 1, /* hash algo count */ + salt_len, /* salt length */ + s->smb31_preauth_hashid, /* hash id */ + salt_len, /* salt length */ + picap->picap_salt); + + /* aligned on 8-bytes boundary */ + if (rc != 0 || s->smb31_enc_cipherid == 0) { + cmn_err(CE_NOTE, "Encryption is not supported"); + return (rc); + } + + if ((rc = smb_mbc_put_align(&sr->reply, 8)) != 0) + return (rc); + + rc = smb_mbc_encodef( + &sr->reply, "ww4.", + SMB2_ENCRYPTION_CAPS, + enc_ctx_len + /* 4. */); /* reserved */ + + rc = smb_mbc_encodef( + &sr->reply, "ww", + 1, /* cipher count */ + s->smb31_enc_cipherid); /* encrypt. cipher id */ + + return (rc); +} + int smb2_newrq_negotiate(smb_request_t *sr) { smb_session_t *s = sr->session; + smb2_neg_ctxs_t neg_in_ctxs; + smb2_neg_ctxs_t neg_out_ctxs; + smb2_arg_negotiate_t *nego2 = &sr->sr_nego2; int rc; uint32_t status = 0; uint16_t struct_size; uint16_t best_version; - uint16_t version_cnt; - uint16_t cl_versions[SMB2_NEGOTIATE_MAX_DIALECTS]; + + bzero(&neg_in_ctxs, sizeof (neg_in_ctxs)); + bzero(&neg_out_ctxs, sizeof (neg_out_ctxs)); sr->smb2_cmd_hdr = sr->command.chain_offset; rc = smb2_decode_header(sr); @@ -239,7 +594,7 @@ smb2_newrq_negotiate(smb_request_t *sr) rc = smb_mbc_decodef( &sr->command, "www..l16c8.", &struct_size, /* w */ - &version_cnt, /* w */ + &s->cli_dialect_cnt, /* w */ &s->cli_secmode, /* w */ /* reserved (..) */ &s->capabilities, /* l */ @@ -255,33 +610,16 @@ smb2_newrq_negotiate(smb_request_t *sr) * * Be somewhat tolerant while decoding the variable part * so we can return errors instead of dropping the client. 
- * Will limit decoding to the size of cl_versions here, - * and do the error checks on version_cnt after the + * Will limit decoding to the size of cli_dialects here, + * and do the error checks on s->cli_dialect_cnt after the * dtrace start probe. */ - if (version_cnt > 0 && - version_cnt <= SMB2_NEGOTIATE_MAX_DIALECTS && - smb_mbc_decodef(&sr->command, "#w", version_cnt, - cl_versions) != 0) { - /* decode error; force an error below */ - version_cnt = 0; - } - - DTRACE_SMB2_START(op__Negotiate, smb_request_t *, sr); - - sr->smb2_hdr_flags |= SMB2_FLAGS_SERVER_TO_REDIR; - (void) smb2_encode_header(sr, B_FALSE); - - /* - * [MS-SMB2] 3.3.5.2.4 Verifying the Signature - * "If the SMB2 header of the SMB2 NEGOTIATE request has the - * SMB2_FLAGS_SIGNED bit set in the Flags field, the server - * MUST fail the request with STATUS_INVALID_PARAMETER." - */ - if ((sr->smb2_hdr_flags & SMB2_FLAGS_SIGNED) != 0) { - sr->smb2_hdr_flags &= ~SMB2_FLAGS_SIGNED; - status = NT_STATUS_INVALID_PARAMETER; - goto errout; + if (s->cli_dialect_cnt > 0 && + s->cli_dialect_cnt <= SMB2_NEGOTIATE_MAX_DIALECTS && + smb_mbc_decodef(&sr->command, "#w", s->cli_dialect_cnt, + s->cli_dialects) != 0) { + /* decode error; force an error below */ + s->cli_dialect_cnt = 0; } /* @@ -289,26 +627,53 @@ smb2_newrq_negotiate(smb_request_t *sr) * "If the DialectCount of the SMB2 NEGOTIATE Request is 0, the * server MUST fail the request with STATUS_INVALID_PARAMETER." */ - if (version_cnt == 0 || - version_cnt > SMB2_NEGOTIATE_MAX_DIALECTS) { + if (s->cli_dialect_cnt == 0 || + s->cli_dialect_cnt > SMB2_NEGOTIATE_MAX_DIALECTS) { status = NT_STATUS_INVALID_PARAMETER; - goto errout; } /* * The client offers an array of protocol versions it - * supports, which we have decoded into cl_versions[]. + * supports, which we have decoded into s->cli_dialects[]. * We walk the array and pick the highest supported. * * [MS-SMB2] 3.3.5.4 Receiving an SMB2 NEGOTIATE Request * "If a common dialect is not found, the server MUST fail * the request with STATUS_NOT_SUPPORTED." */ - best_version = smb2_find_best_dialect(s, cl_versions, version_cnt); - if (best_version == 0) { - status = NT_STATUS_NOT_SUPPORTED; + + if (status == 0) { + best_version = smb2_find_best_dialect(s, s->cli_dialects, + s->cli_dialect_cnt); + if (best_version >= SMB_VERS_3_11) { + status = smb31_decode_neg_ctxs(sr, &neg_in_ctxs); + nego2->neg_in_ctxs = &neg_in_ctxs; + } else if (best_version == 0) { + status = NT_STATUS_NOT_SUPPORTED; + } + } + + DTRACE_SMB2_START(op__Negotiate, smb_request_t *, sr); + nego2->neg_in_ctxs = NULL; + + sr->smb2_hdr_flags |= SMB2_FLAGS_SERVER_TO_REDIR; + (void) smb2_encode_header(sr, B_FALSE); + + if (status != 0) + goto errout; + + /* + * [MS-SMB2] 3.3.5.2.4 Verifying the Signature + * "If the SMB2 header of the SMB2 NEGOTIATE request has the + * SMB2_FLAGS_SIGNED bit set in the Flags field, the server + * MUST fail the request with STATUS_INVALID_PARAMETER." + */ + if ((sr->smb2_hdr_flags & SMB2_FLAGS_SIGNED) != 0) { + sr->smb2_hdr_flags &= ~SMB2_FLAGS_SIGNED; + status = NT_STATUS_INVALID_PARAMETER; goto errout; } + s->dialect = best_version; /* Allow normal SMB2 requests now. 
*/ @@ -318,14 +683,30 @@ smb2_newrq_negotiate(smb_request_t *sr) if (smb2_negotiate_common(sr, best_version) != 0) status = NT_STATUS_INTERNAL_ERROR; + if (s->dialect >= SMB_VERS_3_11 && status == 0) { + if (smb31_encode_neg_ctxs(sr, &neg_out_ctxs) != 0) + status = NT_STATUS_INTERNAL_ERROR; + nego2->neg_out_ctxs = &neg_out_ctxs; + } + errout: sr->smb2_status = status; DTRACE_SMB2_DONE(op__Negotiate, smb_request_t *, sr); + nego2->neg_out_ctxs = NULL; if (sr->smb2_status != 0) smb2sr_put_error(sr, sr->smb2_status); (void) smb2_encode_header(sr, B_TRUE); + if (s->dialect >= SMB_VERS_3_11 && sr->smb2_status == 0) { + ASSERT3U(s->smb31_preauth_hashid, !=, 0); + if (smb31_preauth_sha512_calc(sr, &sr->reply, + s->smb31_preauth_hashval, + s->smb31_preauth_hashval) != 0) + cmn_err(CE_WARN, "(1) Preauth hash calculation " + "failed"); + } + smb2_send_reply(sr); return (rc); @@ -347,6 +728,8 @@ smb2_negotiate_common(smb_request_t *sr, uint16_t version) int rc; uint32_t max_rwsize; uint16_t secmode; + uint16_t neg_ctx_cnt = 0; + uint32_t neg_ctx_off = 0; /* * Negotiation itself. First the Security Mode. @@ -379,6 +762,8 @@ smb2_negotiate_common(smb_request_t *sr, uint16_t version) */ if (version != 0x2FF) smb2_sign_init_mech(s); + if (version >= 0x311) + smb31_preauth_init_mech(s); /* * [MS-SMB2] 3.3.5.4 Receiving an SMB2 NEGOTIATE Request @@ -404,6 +789,21 @@ smb2_negotiate_common(smb_request_t *sr, uint16_t version) if ((s->srv_cap & SMB2_CAP_ENCRYPTION) != 0 && smb3_encrypt_init_mech(s) != 0) { s->srv_cap &= ~SMB2_CAP_ENCRYPTION; + s->smb31_enc_cipherid = 0; + } + + if (s->dialect >= SMB_VERS_3_11) { + neg_ctx_cnt = s->smb31_enc_cipherid == 0 ? 1 : 2; + neg_ctx_off = NEG_CTX_OFFSET_OFFSET + + P2ROUNDUP(sr->sr_cfg->skc_negtok_len, 8); + + ASSERT3U(s->smb31_preauth_hashid, !=, 0); + + if (smb31_preauth_sha512_calc(sr, &sr->command, + s->smb31_preauth_hashval, + s->smb31_preauth_hashval) != 0) + cmn_err(CE_WARN, "(0) Preauth hash calculation " + "failed"); } } @@ -421,7 +821,7 @@ smb2_negotiate_common(smb_request_t *sr, uint16_t version) 65, /* StructSize */ /* w */ s->srv_secmode, /* w */ version, /* w */ - 0, /* reserved */ /* w */ + neg_ctx_cnt, /* w */ UUID_LEN, /* # */ &s->s_cfg.skc_machine_uuid, /* c */ s->srv_cap, /* l */ @@ -432,10 +832,12 @@ smb2_negotiate_common(smb_request_t *sr, uint16_t version) &boot_tv, /* T */ 128, /* SecBufOff */ /* w */ sr->sr_cfg->skc_negtok_len, /* w */ - 0, /* reserved */ /* l */ + neg_ctx_off, /* l */ sr->sr_cfg->skc_negtok_len, /* # */ sr->sr_cfg->skc_negtok); /* c */ + + /* smb2_send_reply(sr); in caller */ (void) ksocket_setsockopt(s->sock, SOL_SOCKET, @@ -467,12 +869,12 @@ uint32_t smb2_nego_validate(smb_request_t *sr, smb_fsctl_t *fsctl) { smb_session_t *s = sr->session; + boolean_t smb311 = s->s_cfg.skc_max_protocol >= SMB_VERS_3_11; int rc; /* * The spec. says to parse the VALIDATE_NEGOTIATE_INFO here * and verify that the original negotiate was not modified. - * The request MUST be signed, and we MUST validate the signature. * * One interesting requirement here is that we MUST reply * with exactly the same information as we returned in our @@ -480,12 +882,22 @@ smb2_nego_validate(smb_request_t *sr, smb_fsctl_t *fsctl) * If we don't the client closes the connection. 
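	 *
	 * The request payload decoded below is, as a sketch:
	 *
	 *	uint32_t Capabilities;
	 *	uint8_t  ClientGuid[16];
	 *	uint16_t SecurityMode;
	 *	uint16_t DialectCount;
	 *	uint16_t Dialects[DialectCount];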
*/ - /* dialects[8] taken from cl_versions[8] in smb2_newrq_negotiate */ uint32_t capabilities; - uint16_t secmode, num_dialects, dialects[8]; + uint16_t secmode; + uint16_t num_dialects; + uint16_t dialects[SMB2_NEGOTIATE_MAX_DIALECTS]; uint8_t clnt_guid[16]; - if ((sr->smb2_hdr_flags & SMB2_FLAGS_SIGNED) == 0) + if (s->dialect >= SMB_VERS_3_11) + goto drop; + + /* + * [MS-SMB2] 3.3.5.2.4 Verifying the Signature + * + * If the dialect is SMB3 and the message was successfully + * decrypted we MUST skip processing of the signature. + */ + if (!sr->encrypted && (sr->smb2_hdr_flags & SMB2_FLAGS_SIGNED) == 0) goto drop; if (fsctl->InputCount < 24) @@ -497,7 +909,9 @@ smb2_nego_validate(smb_request_t *sr, smb_fsctl_t *fsctl) &secmode, /* w */ &num_dialects); /* w */ - if (num_dialects == 0 || num_dialects > 8) + if (num_dialects == 0 || num_dialects > SMB2_NEGOTIATE_MAX_DIALECTS) + goto drop; + if (smb311 && num_dialects != s->cli_dialect_cnt) goto drop; if (secmode != s->cli_secmode) goto drop; @@ -513,8 +927,16 @@ smb2_nego_validate(smb_request_t *sr, smb_fsctl_t *fsctl) if (rc != 0) goto drop; - if (smb2_find_best_dialect(s, dialects, num_dialects) != s->dialect) - goto drop; + if (smb311) { + for (int i = 0; i < num_dialects; i++) { + if (dialects[i] != s->cli_dialects[i]) + goto drop; + } + } else { + if (smb2_find_best_dialect(s, dialects, num_dialects) != + s->dialect) + goto drop; + } rc = smb_mbc_encodef( fsctl->out_mbc, "l#cww", diff --git a/usr/src/uts/common/fs/smbsrv/smb2_oplock.c b/usr/src/uts/common/fs/smbsrv/smb2_oplock.c index 84bd8ccafb..f3f96c2b21 100644 --- a/usr/src/uts/common/fs/smbsrv/smb2_oplock.c +++ b/usr/src/uts/common/fs/smbsrv/smb2_oplock.c @@ -10,7 +10,8 @@ */ /* - * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + * Copyright 2019 Nexenta by DDN, Inc. All rights reserved. + * Copyright 2019 RackTop Systems. */ /* @@ -96,13 +97,35 @@ smb2_oplock_break_ack(smb_request_t *sr) NewLevel = OPLOCK_LEVEL_BATCH; break; case SMB2_OPLOCK_LEVEL_LEASE: /* 0xFF */ - default: NewLevel = OPLOCK_LEVEL_NONE; break; + default: + status = NT_STATUS_INVALID_PARAMETER; + goto errout; } ofile = sr->fid_ofile; + if (ofile->f_oplock.og_breaking == 0) { + /* + * This is an unsolicited Ack. (There is no + * outstanding oplock break in progress now.) + * There are WPTS tests that care which error + * is returned. See [MS-SMB2] 3.3.5.22.1 + */ + if (smbOplockLevel == SMB2_OPLOCK_LEVEL_LEASE) { + status = NT_STATUS_INVALID_PARAMETER; + goto errout; + } + if (NewLevel >= (ofile->f_oplock.og_state & + OPLOCK_LEVEL_TYPE_MASK)) { + status = NT_STATUS_INVALID_OPLOCK_PROTOCOL; + goto errout; + } + status = NT_STATUS_INVALID_DEVICE_STATE; + goto errout; + } ofile->f_oplock.og_breaking = 0; + status = smb_oplock_ack_break(sr, ofile, &NewLevel); if (status == NT_STATUS_OPLOCK_BREAK_IN_PROGRESS) { status = smb2sr_go_async(sr); diff --git a/usr/src/uts/common/fs/smbsrv/smb2_qinfo_file.c b/usr/src/uts/common/fs/smbsrv/smb2_qinfo_file.c index ab682b7966..929f02522b 100644 --- a/usr/src/uts/common/fs/smbsrv/smb2_qinfo_file.c +++ b/usr/src/uts/common/fs/smbsrv/smb2_qinfo_file.c @@ -10,7 +10,7 @@ */ /* - * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright 2019 Nexenta by DDN, Inc. All rights reserved. 
*/ /* @@ -240,10 +240,11 @@ static uint32_t smb2_qif_basic(smb_request_t *sr, smb_queryinfo_t *qi) { smb_attr_t *sa = &qi->qi_attr; + int rc; ASSERT((sa->sa_mask & SMB_AT_BASIC) == SMB_AT_BASIC); - (void) smb_mbc_encodef( + rc = smb_mbc_encodef( &sr->raw_data, "TTTTll", &sa->sa_crtime, /* T */ &sa->sa_vattr.va_atime, /* T */ @@ -251,6 +252,8 @@ smb2_qif_basic(smb_request_t *sr, smb_queryinfo_t *qi) &sa->sa_vattr.va_ctime, /* T */ sa->sa_dosattr, /* l */ 0); /* reserved */ /* l */ + if (rc != 0) + return (NT_STATUS_BUFFER_OVERFLOW); return (0); } @@ -265,10 +268,11 @@ static uint32_t smb2_qif_standard(smb_request_t *sr, smb_queryinfo_t *qi) { smb_attr_t *sa = &qi->qi_attr; + int rc; ASSERT((sa->sa_mask & SMB_AT_STANDARD) == SMB_AT_STANDARD); - (void) smb_mbc_encodef( + rc = smb_mbc_encodef( &sr->raw_data, "qqlbbw", sa->sa_allocsz, /* q */ sa->sa_vattr.va_size, /* q */ @@ -276,6 +280,8 @@ smb2_qif_standard(smb_request_t *sr, smb_queryinfo_t *qi) qi->qi_delete_on_close, /* b */ qi->qi_isdir, /* b */ 0); /* reserved */ /* w */ + if (rc != 0) + return (NT_STATUS_BUFFER_OVERFLOW); return (0); } @@ -290,6 +296,7 @@ smb2_qif_internal(smb_request_t *sr, smb_queryinfo_t *qi) { smb_attr_t *sa = &qi->qi_attr; u_longlong_t nodeid; + int rc; ASSERT((sa->sa_mask & SMB_AT_NODEID) == SMB_AT_NODEID); nodeid = sa->sa_vattr.va_nodeid; @@ -298,9 +305,11 @@ smb2_qif_internal(smb_request_t *sr, smb_queryinfo_t *qi) (sr->session->s_flags & SMB_SSN_AAPL_CCEXT) != 0) nodeid = 0; - (void) smb_mbc_encodef( + rc = smb_mbc_encodef( &sr->raw_data, "q", nodeid); /* q */ + if (rc != 0) + return (NT_STATUS_BUFFER_OVERFLOW); return (0); } @@ -315,9 +324,12 @@ static uint32_t smb2_qif_ea_size(smb_request_t *sr, smb_queryinfo_t *qi) { _NOTE(ARGUNUSED(qi)) + int rc; - (void) smb_mbc_encodef( + rc = smb_mbc_encodef( &sr->raw_data, "l", 0); + if (rc != 0) + return (NT_STATUS_BUFFER_OVERFLOW); return (0); } @@ -341,10 +353,13 @@ smb2_qif_access(smb_request_t *sr, smb_queryinfo_t *qi) { _NOTE(ARGUNUSED(qi)) smb_ofile_t *of = sr->fid_ofile; + int rc; - (void) smb_mbc_encodef( + rc = smb_mbc_encodef( &sr->raw_data, "l", of->f_granted_access); + if (rc != 0) + return (NT_STATUS_BUFFER_OVERFLOW); return (0); } @@ -358,14 +373,17 @@ smb2_qif_access(smb_request_t *sr, smb_queryinfo_t *qi) static uint32_t smb2_qif_name(smb_request_t *sr, smb_queryinfo_t *qi) { + int rc; ASSERT(qi->qi_namelen > 0); - (void) smb_mbc_encodef( + rc = smb_mbc_encodef( &sr->raw_data, "llU", 0, /* FileIndex (l) */ qi->qi_namelen, /* l */ qi->qi_name); /* U */ + if (rc != 0) + return (NT_STATUS_BUFFER_OVERFLOW); return (0); } @@ -379,13 +397,16 @@ smb2_qif_position(smb_request_t *sr, smb_queryinfo_t *qi) _NOTE(ARGUNUSED(qi)) smb_ofile_t *of = sr->fid_ofile; uint64_t pos; + int rc; mutex_enter(&of->f_mutex); pos = of->f_seek_pos; mutex_exit(&of->f_mutex); - (void) smb_mbc_encodef( + rc = smb_mbc_encodef( &sr->raw_data, "q", pos); + if (rc != 0) + return (NT_STATUS_BUFFER_OVERFLOW); return (0); } @@ -399,9 +420,12 @@ static uint32_t smb2_qif_mode(smb_request_t *sr, smb_queryinfo_t *qi) { _NOTE(ARGUNUSED(qi)) + int rc; - (void) smb_mbc_encodef( + rc = smb_mbc_encodef( &sr->raw_data, "l", 0); + if (rc != 0) + return (NT_STATUS_BUFFER_OVERFLOW); return (0); } @@ -413,9 +437,12 @@ static uint32_t smb2_qif_alignment(smb_request_t *sr, smb_queryinfo_t *qi) { _NOTE(ARGUNUSED(qi)) + int rc; - (void) smb_mbc_encodef( + rc = smb_mbc_encodef( &sr->raw_data, "l", 0); + if (rc != 0) + return (NT_STATUS_BUFFER_OVERFLOW); return (0); } @@ -430,6 +457,7 @@ static uint32_t 
smb2_qif_altname(smb_request_t *sr, smb_queryinfo_t *qi) { smb_ofile_t *of = sr->fid_ofile; + int rc; ASSERT(qi->qi_namelen > 0); ASSERT(qi->qi_attr.sa_mask & SMB_AT_NODEID); @@ -442,10 +470,12 @@ smb2_qif_altname(smb_request_t *sr, smb_queryinfo_t *qi) /* fill in qi->qi_shortname */ smb_query_shortname(of->f_node, qi); - (void) smb_mbc_encodef( + rc = smb_mbc_encodef( &sr->raw_data, "%lU", sr, smb_wcequiv_strlen(qi->qi_shortname), qi->qi_shortname); + if (rc != 0) + return (NT_STATUS_BUFFER_OVERFLOW); return (0); } @@ -481,6 +511,7 @@ smb2_qif_pipe(smb_request_t *sr, smb_queryinfo_t *qi) smb_ofile_t *of = sr->fid_ofile; uint32_t pipe_mode; uint32_t nonblock; + int rc; switch (of->f_ftype) { case SMB_FTYPE_BYTE_PIPE: @@ -496,9 +527,11 @@ smb2_qif_pipe(smb_request_t *sr, smb_queryinfo_t *qi) } nonblock = 0; /* XXX todo: Get this from the pipe handle. */ - (void) smb_mbc_encodef( + rc = smb_mbc_encodef( &sr->raw_data, "ll", pipe_mode, nonblock); + if (rc != 0) + return (NT_STATUS_BUFFER_OVERFLOW); return (0); } @@ -532,13 +565,16 @@ smb2_qif_compr(smb_request_t *sr, smb_queryinfo_t *qi) { smb_attr_t *sa = &qi->qi_attr; uint16_t CompressionFormat = 0; /* COMPRESSION_FORMAT_NONE */ + int rc; ASSERT(sa->sa_mask & SMB_AT_SIZE); - (void) smb_mbc_encodef( + rc = smb_mbc_encodef( &sr->raw_data, "qw6.", sa->sa_vattr.va_size, /* q */ CompressionFormat); /* w */ + if (rc != 0) + return (NT_STATUS_BUFFER_OVERFLOW); return (0); } @@ -550,8 +586,9 @@ static uint32_t smb2_qif_opens(smb_request_t *sr, smb_queryinfo_t *qi) { smb_attr_t *sa = &qi->qi_attr; + int rc; - (void) smb_mbc_encodef( + rc = smb_mbc_encodef( &sr->raw_data, "TTTTqqll", &sa->sa_crtime, /* T */ &sa->sa_vattr.va_atime, /* T */ @@ -561,6 +598,8 @@ smb2_qif_opens(smb_request_t *sr, smb_queryinfo_t *qi) sa->sa_vattr.va_size, /* q */ sa->sa_dosattr, /* l */ 0); /* reserved */ /* l */ + if (rc != 0) + return (NT_STATUS_BUFFER_OVERFLOW); return (0); } @@ -578,8 +617,12 @@ static uint32_t smb2_qif_tags(smb_request_t *sr, smb_queryinfo_t *qi) { _NOTE(ARGUNUSED(qi)) - (void) smb_mbc_encodef( + int rc; + + rc = smb_mbc_encodef( &sr->raw_data, "ll", 0, 0); + if (rc != 0) + return (NT_STATUS_BUFFER_OVERFLOW); return (0); } diff --git a/usr/src/uts/common/fs/smbsrv/smb2_qinfo_fs.c b/usr/src/uts/common/fs/smbsrv/smb2_qinfo_fs.c index 856a59e939..7bf3d1339e 100644 --- a/usr/src/uts/common/fs/smbsrv/smb2_qinfo_fs.c +++ b/usr/src/uts/common/fs/smbsrv/smb2_qinfo_fs.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + * Copyright 2019 Nexenta by DDN, Inc. All rights reserved. */ /* @@ -100,6 +100,7 @@ smb2_qfs_volume(smb_request_t *sr) smb_node_t *snode; fsid_t fsid; uint32_t LabelLength; + int rc; if (!STYPE_ISDSK(tree->t_res_type)) return (NT_STATUS_INVALID_PARAMETER); @@ -112,14 +113,16 @@ smb2_qfs_volume(smb_request_t *sr) /* * NT has the "supports objects" flag set to 1. */ - (void) smb_mbc_encodef( - &sr->raw_data, "qllb.U", - 0LL, /* Volume creation time (q) */ + rc = smb_mbc_encodef( + &sr->raw_data, "Tllb.U", + &tree->t_create_time, /* (T) */ fsid.val[0], /* serial no. (l) */ LabelLength, /* (l) */ 0, /* Supports objects (b) */ /* reserved (.) 
*/ tree->t_volume); /* (U) */ + if (rc != 0) + return (NT_STATUS_BUFFER_OVERFLOW); return (0); } @@ -141,12 +144,14 @@ smb2_qfs_size(smb_request_t *sr) if (rc) return (smb_errno2status(rc)); - (void) smb_mbc_encodef( + rc = smb_mbc_encodef( &sr->raw_data, "qqll", fssize.fs_caller_units, fssize.fs_caller_avail, fssize.fs_sectors_per_unit, fssize.fs_bytes_per_sector); + if (rc != 0) + return (NT_STATUS_BUFFER_OVERFLOW); return (0); } @@ -168,13 +173,15 @@ smb2_qfs_fullsize(smb_request_t *sr) if (rc) return (smb_errno2status(rc)); - (void) smb_mbc_encodef( + rc = smb_mbc_encodef( &sr->raw_data, "qqqll", fssize.fs_caller_units, fssize.fs_caller_avail, fssize.fs_volume_avail, fssize.fs_sectors_per_unit, fssize.fs_bytes_per_sector); + if (rc != 0) + return (NT_STATUS_BUFFER_OVERFLOW); return (0); } @@ -188,6 +195,7 @@ smb2_qfs_device(smb_request_t *sr) smb_tree_t *tree = sr->tid_tree; uint32_t DeviceType; uint32_t Characteristics; + int rc; if (!STYPE_ISDSK(tree->t_res_type)) return (NT_STATUS_INVALID_PARAMETER); @@ -195,10 +203,12 @@ smb2_qfs_device(smb_request_t *sr) DeviceType = FILE_DEVICE_DISK; Characteristics = FILE_DEVICE_IS_MOUNTED; - (void) smb_mbc_encodef( + rc = smb_mbc_encodef( &sr->raw_data, "ll", DeviceType, Characteristics); + if (rc != 0) + return (NT_STATUS_BUFFER_OVERFLOW); return (0); } @@ -213,6 +223,7 @@ smb2_qfs_attr(smb_request_t *sr) char *fsname; uint32_t namelen; uint32_t FsAttr; + int rc; /* This call is OK on all tree types. */ switch (tree->t_res_type & STYPE_MASK) { @@ -247,12 +258,14 @@ smb2_qfs_attr(smb_request_t *sr) if (tree->t_flags & SMB_TREE_SPARSE) FsAttr |= FILE_SUPPORTS_SPARSE_FILES; - (void) smb_mbc_encodef( + rc = smb_mbc_encodef( &sr->raw_data, "lllU", FsAttr, MAXNAMELEN-1, namelen, fsname); + if (rc != 0) + return (NT_STATUS_BUFFER_OVERFLOW); return (0); } @@ -264,6 +277,7 @@ uint32_t smb2_qfs_control(smb_request_t *sr) { smb_tree_t *tree = sr->tid_tree; + int rc; if (!STYPE_ISDSK(tree->t_res_type)) return (NT_STATUS_INVALID_PARAMETER); @@ -275,7 +289,7 @@ smb2_qfs_control(smb_request_t *sr) return (NT_STATUS_VOLUME_NOT_UPGRADED); } - (void) smb_mbc_encodef( + rc = smb_mbc_encodef( &sr->raw_data, "qqqqqll", 0, /* free space start filtering - MUST be 0 */ 0, /* free space threshold - MUST be 0 */ @@ -284,6 +298,8 @@ smb2_qfs_control(smb_request_t *sr) SMB_QUOTA_UNLIMITED, /* default quota limit */ FILE_VC_QUOTA_ENFORCE, /* fs control flag */ 0); /* pad bytes */ + if (rc != 0) + return (NT_STATUS_BUFFER_OVERFLOW); return (0); } @@ -364,7 +380,7 @@ smb2_qfs_sectorsize(smb_request_t *sr) smb_fssize_t fssize; smb_tree_t *tree = sr->tid_tree; uint32_t lbps, pbps; - uint32_t flags; + uint32_t flags, unk; int rc; if (!STYPE_ISDSK(tree->t_res_type)) @@ -373,24 +389,15 @@ smb2_qfs_sectorsize(smb_request_t *sr) rc = smb_fssize(sr, &fssize); if (rc) return (smb_errno2status(rc)); + + // PhysicalBytesPerSector pbps = fssize.fs_bytes_per_sector; + + // LogicalBytesPerSector lbps = fssize.fs_sectors_per_unit * pbps; if (lbps > smb2_max_logical_sector_size) lbps = smb2_max_logical_sector_size; - // LogicalBytesPerSector - (void) smb_mbc_encodef(&sr->raw_data, "l", lbps); - - // PhysicalBytesPerSectorForAtomicity - (void) smb_mbc_encodef(&sr->raw_data, "l", pbps); - - // PhysicalBytesPerSectorForPerformance - // Using logical size here. 
-	(void) smb_mbc_encodef(&sr->raw_data, "l", lbps);
-
-	// FileSystemEffectivePhysicalBytesPerSectorForAtomicity
-	(void) smb_mbc_encodef(&sr->raw_data, "l", pbps);
-
 	// Flags
 	// We include "no seek penalty" because our files are
 	// always ZFS-backed, which can reorder things on disk.
@@ -398,15 +405,24 @@
 	flags = SSINFO_FLAGS_ALIGNED_DEVICE |
 	    SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE |
 	    SSINFO_FLAGS_NO_SEEK_PENALTY;
-	(void) smb_mbc_encodef(&sr->raw_data, "l", flags);
 
 	// ByteOffsetForSectorAlignment
 	// ByteOffsetForPartitionAlignment
 	// Just say "unknown" for these two.
-	(void) smb_mbc_encodef(
-	    &sr->raw_data, "l",
-	    SSINFO_OFFSET_UNKNOWN,
-	    SSINFO_OFFSET_UNKNOWN);
+	unk = SSINFO_OFFSET_UNKNOWN;
+
+	rc = smb_mbc_encodef(
+	    &sr->raw_data,
+	    "lllllll",
+	    lbps,	// LogicalBytesPerSector
+	    pbps,	// PhysicalBytesPerSectorForAtomicity
+	    lbps,	// PhysicalBytesPerSectorForPerformance
+	    pbps,	// FileSystemEffectivePhysicalBytesPerSectorForAtomicity
+	    flags,
+	    unk, unk);
+
+	if (rc != 0)
+		return (NT_STATUS_BUFFER_OVERFLOW);
 
 	return (0);
 }
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_query_info.c b/usr/src/uts/common/fs/smbsrv/smb2_query_info.c
index dc59307fc3..61c6cbb97d 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_query_info.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_query_info.c
@@ -14,6 +14,10 @@
  */
 
 /*
+ * Copyright 2020 RackTop Systems, Inc.
+ */
+
+/*
  * Dispatch function for SMB2_QUERY_INFO
  */
 
@@ -107,16 +111,42 @@ errout:
 		/* Not really an error, per se.  Advisory. */
 		break;
 
-	case NT_STATUS_BUFFER_TOO_SMALL:
-	case NT_STATUS_INFO_LENGTH_MISMATCH:
+	case NT_STATUS_BUFFER_TOO_SMALL:	/* only in smb2_qinfo_sec.c */
 		/*
-		 * These are special, per. [MS-SMB2] 3.2.5.17
-		 * The error data is a 4-byte count of the size
-		 * required to successfully query the data.
-		 * That error data is built by the functions
-		 * that returns one of these errors.
+		 * [MS-SMB2] 3.3.5.20.3
+		 * Handling SMB2_0_INFO_SECURITY
+		 * For dialect 3.1.1, return a 4-byte value containing
+		 * the required buffer size, encapsulated in an error
+		 * context: ByteCount==12, ErrorContextCount==1,
+		 * ErrorData: ErrorDataLength==4, ErrorId==0,
+		 * ErrorContextData==<buffer size>.
+		 * Otherwise ByteCount==4.
+		 *
+		 * When returning error data, 3.1.1 encapsulates it the
+		 * same way.
 		 */
-		smb2sr_put_error_data(sr, status, &sr->raw_data);
+		if (sr->session->dialect < SMB_VERS_3_11) {
+			smb2sr_put_error_data(sr, status, &sr->raw_data);
+		} else {
+			smb2sr_put_error_ctx0(sr, status, &sr->raw_data);
+		}
+		return (SDRC_SUCCESS);
+
+	case NT_STATUS_INFO_LENGTH_MISMATCH: /* not generated in smb2_qinfo_*.c */
+		/*
+		 * [MS-SMB2] 3.3.5.20.1
+		 * SMB 3.1.1 Handling SMB2_0_INFO_FILE
+		 * [MS-SMB2] 3.3.5.20.2
+		 * SMB 3.1.1 Handling SMB2_0_INFO_FILESYSTEM
+		 *
+		 * ByteCount==8, ErrorContextCount==1,
+		 * ErrorData: ErrorDataLength==0, ErrorId==0
+		 * Otherwise ByteCount==0
+		 */
+		if (sr->session->dialect < SMB_VERS_3_11) {
+			smb2sr_put_error_data(sr, status, NULL);
+		} else {
+			smb2sr_put_error_ctx0(sr, status, NULL);
+		}
 		return (SDRC_SUCCESS);
 
 	default:
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_read.c b/usr/src/uts/common/fs/smbsrv/smb2_read.c
index f8c91c878f..936fa06f6c 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_read.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_read.c
@@ -10,11 +10,12 @@
  */
 
 /*
- * Copyright 2019 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2020 Tintri by DDN, Inc. All rights reserved.
 */
 
 /*
 * Dispatch function for SMB2_READ
+ * MS-SMB2 sec.
3.3.5.12 */ #include <smbsrv/smb2_kproto.h> @@ -160,6 +161,14 @@ smb2_read(smb_request_t *sr) MBC_ATTACH_MBUF(&sr->raw_data, m); /* + * [MS-SMB2] If the read returns fewer bytes than specified by + * the MinimumCount field of the request, the server MUST fail + * the request with STATUS_END_OF_FILE + */ + if (status == 0 && XferCount < MinCount) + status = NT_STATUS_END_OF_FILE; + + /* * Checking the error return _after_ dealing with * the returned data so that if m was allocated, * it will be free'd via sr->raw_data cleanup. diff --git a/usr/src/uts/common/fs/smbsrv/smb2_session_setup.c b/usr/src/uts/common/fs/smbsrv/smb2_session_setup.c index 0a258f1bf4..9be9630d57 100644 --- a/usr/src/uts/common/fs/smbsrv/smb2_session_setup.c +++ b/usr/src/uts/common/fs/smbsrv/smb2_session_setup.c @@ -11,6 +11,7 @@ /* * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + * Copyright 2020 RackTop Systems, Inc. */ /* diff --git a/usr/src/uts/common/fs/smbsrv/smb2_signing.c b/usr/src/uts/common/fs/smbsrv/smb2_signing.c index 704dfc652a..fd4c4ecfb4 100644 --- a/usr/src/uts/common/fs/smbsrv/smb2_signing.c +++ b/usr/src/uts/common/fs/smbsrv/smb2_signing.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2018 Nexenta Systems, Inc. All rights reserved. + * Copyright 2020 RackTop Systems, Inc. */ /* * These routines provide the SMB MAC signing for the SMB2 server. @@ -118,16 +119,6 @@ smb3_sign_calc(smb_request_t *sr, return (rv); } -/* - * Input to KDF for SigningKey. - * See comment for smb3_do_kdf for content. - */ -static uint8_t sign_kdf_input[29] = { - 0, 0, 0, 1, 'S', 'M', 'B', '2', - 'A', 'E', 'S', 'C', 'M', 'A', 'C', 0, - 0, 'S', 'm', 'b', 'S', 'i', 'g', 'n', - 0, 0, 0, 0, 0x80 }; - void smb2_sign_init_mech(smb_session_t *s) { @@ -196,10 +187,21 @@ smb2_sign_begin(smb_request_t *sr, smb_token_t *token) * For SMB3, the signing key is a "KDF" hash of the * session key. */ - if (smb3_do_kdf(sign_key->key, sign_kdf_input, - sizeof (sign_kdf_input), token->tkn_ssnkey.val, - token->tkn_ssnkey.len) != 0) - return; + if (s->dialect >= SMB_VERS_3_11) { + if (smb3_kdf(sign_key->key, + token->tkn_ssnkey.val, token->tkn_ssnkey.len, + (uint8_t *)"SMBSigningKey", 14, + u->u_preauth_hashval, SHA512_DIGEST_LENGTH) + != 0) + return; + } else { + if (smb3_kdf(sign_key->key, + token->tkn_ssnkey.val, token->tkn_ssnkey.len, + (uint8_t *)"SMB2AESCMAC", 12, + (uint8_t *)"SmbSign", 8) + != 0) + return; + } sign_key->len = SMB3_KEYLEN; } else { /* @@ -430,63 +432,3 @@ smb2_sign_reply(smb_request_t *sr) (void) smb_mbc_poke(&sr->reply, hdr_off, "#c", SMB2_SIG_SIZE, reply_sig); } - -/* - * Derive SMB3 key as described in [MS-SMB2] 3.1.4.2 - * and [NIST SP800-108] - * - * r = 32, L = 128, PRF = HMAC-SHA256, key = (session key) - * - * Note that these describe pre-3.1.1 inputs. - * - * Session.SigningKey for binding a session: - * - Session.SessionKey as K1 - * - label = SMB2AESCMAC (size 12) - * - context = SmbSign (size 8) - * Channel.SigningKey for for all other requests - * - if SMB2_SESSION_FLAG_BINDING, GSS key (in Session.SessionKey?) as K1; - * - otherwise, Session.SessionKey as K1 - * - label = SMB2AESCMAC (size 12) - * - context = SmbSign (size 8) - * Session.ApplicationKey for ... 
(not sure what yet) - * - Session.SessionKey as K1 - * - label = SMB2APP (size 8) - * - context = SmbRpc (size 7) - * Session.EncryptionKey for encrypting server messages - * - Session.SessionKey as K1 - * - label = "SMB2AESCCM" (size 11) - * - context = "ServerOut" (size 10) - * Session.DecryptionKey for decrypting client requests - * - Session.SessionKey as K1 - * - label = "SMB2AESCCM" (size 11) - * - context = "ServerIn " (size 10) (Note the space) - */ - -int -smb3_do_kdf(void *outbuf, void *input, size_t input_len, - uint8_t *key, uint32_t key_len) -{ - uint8_t digest32[SHA256_DIGEST_LENGTH]; - smb_crypto_mech_t mech; - smb_sign_ctx_t hctx = 0; - int rc; - - bzero(&mech, sizeof (mech)); - if ((rc = smb2_hmac_getmech(&mech)) != 0) - return (rc); - - /* Limit the SessionKey input to its maximum size (16 bytes) */ - rc = smb2_hmac_init(&hctx, &mech, key, MIN(key_len, SMB2_KEYLEN)); - if (rc != 0) - return (rc); - - if ((rc = smb2_hmac_update(hctx, input, input_len)) != 0) - return (rc); - - if ((rc = smb2_hmac_final(hctx, digest32)) != 0) - return (rc); - - /* Output is first 16 bytes of digest. */ - bcopy(digest32, outbuf, SMB3_KEYLEN); - return (0); -} diff --git a/usr/src/uts/common/fs/smbsrv/smb2_write.c b/usr/src/uts/common/fs/smbsrv/smb2_write.c index 776ea24ae1..8f10f67d49 100644 --- a/usr/src/uts/common/fs/smbsrv/smb2_write.c +++ b/usr/src/uts/common/fs/smbsrv/smb2_write.c @@ -10,7 +10,7 @@ */ /* - * Copyright 2019 Nexenta Systems, Inc. All rights reserved. + * Copyright 2020 Tintri by DDN, Inc. All rights reserved. */ /* @@ -148,7 +148,6 @@ smb2_write(smb_request_t *sr) &vdb->vdb_uio, &XferCount, stability); if (rc) break; - of->f_written = B_TRUE; /* This revokes read cache delegations. */ (void) smb_oplock_break_WRITE(of->f_node, of); break; diff --git a/usr/src/uts/common/fs/smbsrv/smb31_preauth.c b/usr/src/uts/common/fs/smbsrv/smb31_preauth.c new file mode 100644 index 0000000000..35455b9784 --- /dev/null +++ b/usr/src/uts/common/fs/smbsrv/smb31_preauth.c @@ -0,0 +1,171 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2020 RackTop Systems, Inc. 
+ */
+
+#include <smbsrv/smb2_kproto.h>
+#include <smbsrv/smb2.h>
+#include <sys/crypto/api.h>
+#include <smbsrv/smb_kproto.h>
+#include <smbsrv/smb_kcrypt.h>
+
+/*
+ * SMB 3.1.1 Preauth Integrity
+ */
+int
+smb3_sha512_getmech(smb_crypto_mech_t *mech)
+{
+	crypto_mech_type_t t;
+
+	t = crypto_mech2id(SUN_CKM_SHA512);
+	if (t == CRYPTO_MECH_INVALID) {
+		cmn_err(CE_NOTE, "smb: no kcf mech: %s", SUN_CKM_SHA512);
+		return (-1);
+	}
+	mech->cm_type = t;
+	return (0);
+}
+
+/*
+ * (called from smb2_negotiate_common)
+ */
+void
+smb31_preauth_init_mech(smb_session_t *s)
+{
+	smb_crypto_mech_t *mech;
+	int rc;
+
+	ASSERT3S(s->dialect, >=, SMB_VERS_3_11);
+
+	if (s->preauth_mech != NULL)
+		return;
+
+	mech = kmem_zalloc(sizeof (*mech), KM_SLEEP);
+	rc = smb3_sha512_getmech(mech);
+	if (rc != 0) {
+		kmem_free(mech, sizeof (*mech));
+		return;
+	}
+	s->preauth_mech = mech;
+}
+
+void
+smb31_preauth_fini(smb_session_t *s)
+{
+	smb_crypto_mech_t *mech;
+
+	if ((mech = s->preauth_mech) != NULL) {
+		kmem_free(mech, sizeof (*mech));
+		s->preauth_mech = NULL;
+	}
+}
+
+/*
+ * Start a KCF digest session (a plain hash, so no key to load).
+ */
+int
+smb_sha512_init(smb_sign_ctx_t *ctxp, smb_crypto_mech_t *mech)
+{
+	int rv;
+
+	rv = crypto_digest_init(mech, ctxp, NULL);
+
+	return (rv == CRYPTO_SUCCESS ? 0 : -1);
+}
+
+/*
+ * Digest one segment
+ */
+int
+smb_sha512_update(smb_sign_ctx_t ctx, void *buf, size_t len)
+{
+	crypto_data_t data;
+	int rv;
+
+	bzero(&data, sizeof (data));
+	data.cd_format = CRYPTO_DATA_RAW;
+	data.cd_length = len;
+	data.cd_raw.iov_base = buf;
+	data.cd_raw.iov_len = len;
+
+	rv = crypto_digest_update(ctx, &data, 0);
+
+	if (rv != CRYPTO_SUCCESS) {
+		crypto_cancel_ctx(ctx);
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Get the final digest.
+ */
+int
+smb_sha512_final(smb_sign_ctx_t ctx, uint8_t *digest)
+{
+	crypto_data_t out;
+	int rv;
+
+	bzero(&out, sizeof (out));
+	out.cd_format = CRYPTO_DATA_RAW;
+	out.cd_length = SHA512_DIGEST_LENGTH;
+	out.cd_raw.iov_len = SHA512_DIGEST_LENGTH;
+	out.cd_raw.iov_base = (void *)digest;
+
+	rv = crypto_digest_final(ctx, &out, 0);
+
+	return (rv == CRYPTO_SUCCESS ? 0 : -1);
+}
+
+int
+smb31_preauth_sha512_calc(smb_request_t *sr, struct mbuf_chain *mbc,
+    uint8_t *in_hashval, uint8_t *out_hashval)
+{
+	smb_session_t *s = sr->session;
+	smb_sign_ctx_t ctx = 0;
+	struct mbuf *mbuf = mbc->chain;
+	int rc;
+
+	ASSERT3U(s->smb31_preauth_hashid, !=, 0);
+
+	if (s->preauth_mech == NULL)
+		return (-1);
+
+	if ((rc = smb_sha512_init(&ctx, s->preauth_mech)) != 0)
+		return (rc);
+
+	/* Digest current hashval */
+	rc = smb_sha512_update(ctx, in_hashval, SHA512_DIGEST_LENGTH);
+	if (rc != 0)
+		return (rc);
+
+	while (mbuf != NULL) {
+		rc = smb_sha512_update(ctx, mbuf->m_data, mbuf->m_len);
+		if (rc != 0)
+			return (rc);
+		mbuf = mbuf->m_next;
+	}
+
+	rc = smb_sha512_final(ctx, out_hashval);
+	return (rc);
+}
diff --git a/usr/src/uts/common/fs/smbsrv/smb3_encrypt.c b/usr/src/uts/common/fs/smbsrv/smb3_encrypt.c
index fdbd49ef74..8b2f36f802 100644
--- a/usr/src/uts/common/fs/smbsrv/smb3_encrypt.c
+++ b/usr/src/uts/common/fs/smbsrv/smb3_encrypt.c
@@ -11,6 +11,7 @@
 
 /*
  * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2020 RackTop Systems, Inc.
  */
 
 /*
@@ -24,23 +25,8 @@
 #define	SMB3_NONCE_OFFS		20
 #define	SMB3_SIG_OFFS		4
 
-#define	SMB3_NONCE_SIZE		11 /* 12 for gcm later */
-
-/*
- * Inputs to KDF for EncryptionKey and DecryptionKey.
- * See comment for smb3_do_kdf for content.
- */ -static uint8_t encrypt_kdf_input[30] = { - 0, 0, 0, 1, 'S', 'M', 'B', '2', - 'A', 'E', 'S', 'C', 'C', 'M', 0, 0, - 'S', 'e', 'r', 'v', 'e', 'r', 'O', - 'u', 't', 0, 0, 0, 0, 0x80 }; - -static uint8_t decrypt_kdf_input[30] = { - 0, 0, 0, 1, 'S', 'M', 'B', '2', - 'A', 'E', 'S', 'C', 'C', 'M', 0, 0, - 'S', 'e', 'r', 'v', 'e', 'r', 'I', - 'n', ' ', 0, 0, 0, 0, 0x80 }; +#define SMB3_AES128_CCM_NONCE_SIZE 11 +#define SMB3_AES128_GCM_NONCE_SIZE 12 /* * Arbitrary value used to prevent nonce reuse via overflow. Currently @@ -100,8 +86,23 @@ smb3_encrypt_init_mech(smb_session_t *s) if (s->enc_mech != NULL) return (0); + if (s->dialect < SMB_VERS_3_11) + s->smb31_enc_cipherid = SMB3_CIPHER_AES128_CCM; + mech = kmem_zalloc(sizeof (*mech), KM_SLEEP); - rc = smb3_encrypt_getmech(mech); + + switch (s->smb31_enc_cipherid) { + case SMB3_CIPHER_AES128_GCM: + rc = smb3_aes_gcm_getmech(mech); + break; + case SMB3_CIPHER_AES128_CCM: + rc = smb3_aes_ccm_getmech(mech); + break; + default: + rc = -1; + break; + } + if (rc != 0) { kmem_free(mech, sizeof (*mech)); return (rc); @@ -150,15 +151,31 @@ smb3_encrypt_begin(smb_request_t *sr, smb_token_t *token) * For SMB3, the encrypt/decrypt keys are derived from * the session key using KDF in counter mode. */ - if (smb3_do_kdf(enc_key->key, encrypt_kdf_input, - sizeof (encrypt_kdf_input), token->tkn_ssnkey.val, - token->tkn_ssnkey.len) != 0) - return; - - if (smb3_do_kdf(dec_key->key, decrypt_kdf_input, - sizeof (decrypt_kdf_input), token->tkn_ssnkey.val, - token->tkn_ssnkey.len) != 0) - return; + if (s->dialect >= SMB_VERS_3_11) { + if (smb3_kdf(enc_key->key, + token->tkn_ssnkey.val, token->tkn_ssnkey.len, + (uint8_t *)"SMBS2CCipherKey", 16, + u->u_preauth_hashval, SHA512_DIGEST_LENGTH) != 0) + return; + + if (smb3_kdf(dec_key->key, + token->tkn_ssnkey.val, token->tkn_ssnkey.len, + (uint8_t *)"SMBC2SCipherKey", 16, + u->u_preauth_hashval, SHA512_DIGEST_LENGTH) != 0) + return; + } else { + if (smb3_kdf(enc_key->key, + token->tkn_ssnkey.val, token->tkn_ssnkey.len, + (uint8_t *)"SMB2AESCCM", 11, + (uint8_t *)"ServerOut", 10) != 0) + return; + + if (smb3_kdf(dec_key->key, + token->tkn_ssnkey.val, token->tkn_ssnkey.len, + (uint8_t *)"SMB2AESCCM", 11, + (uint8_t *)"ServerIn ", 10) != 0) + return; + } smb3_encrypt_init_nonce(u); @@ -184,6 +201,10 @@ smb3_decrypt_sr(smb_request_t *sr) int offset, resid, tlen, rc; smb3_crypto_param_t param; smb_crypto_mech_t mech; + boolean_t gcm = sr->session->smb31_enc_cipherid == + SMB3_CIPHER_AES128_GCM; + size_t nonce_size = (gcm ? SMB3_AES128_GCM_NONCE_SIZE : + SMB3_AES128_CCM_NONCE_SIZE); ASSERT(u != NULL); if (s->enc_mech == NULL || dec_key->len != 16) { @@ -210,8 +231,12 @@ smb3_decrypt_sr(smb_request_t *sr) * The transform header, minus the PROTOCOL_ID and the * SIGNATURE, is authenticated but not encrypted. */ - smb3_crypto_init_param(¶m, sr->nonce, SMB3_NONCE_SIZE, - tmp_hdr, tlen, sr->msgsize + SMB2_SIG_SIZE); + if (gcm) + smb3_crypto_init_gcm_param(¶m, sr->nonce, nonce_size, + tmp_hdr, tlen); + else + smb3_crypto_init_ccm_param(¶m, sr->nonce, nonce_size, + tmp_hdr, tlen, sr->msgsize + SMB2_SIG_SIZE); /* * Unlike signing, which uses one global mech struct, @@ -317,13 +342,17 @@ smb3_encrypt_sr(smb_request_t *sr, struct mbuf_chain *in_mbc, int resid, tlen, rc; smb3_crypto_param_t param; smb_crypto_mech_t mech; + boolean_t gcm = sr->session->smb31_enc_cipherid == + SMB3_CIPHER_AES128_GCM; + size_t nonce_size = (gcm ? 
SMB3_AES128_GCM_NONCE_SIZE :
+	    SMB3_AES128_CCM_NONCE_SIZE);
 
 	ASSERT(u != NULL);
 	if (s->enc_mech == NULL || enc_key->len != 16) {
 		return (-1);
 	}
 
-	rc = smb3_encrypt_gen_nonce(u, sr->nonce, SMB3_NONCE_SIZE);
+	rc = smb3_encrypt_gen_nonce(u, sr->nonce, nonce_size);
 
 	if (rc != 0) {
 		cmn_err(CE_WARN, "ran out of nonces");
@@ -331,7 +360,7 @@
 	}
 
 	(void) smb_mbc_poke(out_mbc, SMB3_NONCE_OFFS, "#c",
-	    SMB3_NONCE_SIZE, sr->nonce);
+	    nonce_size, sr->nonce);
 
 	resid = in_mbc->max_bytes;
 
@@ -339,10 +368,14 @@
 	 * The transform header, minus the PROTOCOL_ID and the
 	 * SIGNATURE, is authenticated but not encrypted.
 	 */
-	smb3_crypto_init_param(&param,
-	    sr->nonce, SMB3_NONCE_SIZE,
-	    buf + SMB3_NONCE_OFFS, SMB3_TFORM_HDR_SIZE - SMB3_NONCE_OFFS,
-	    resid);
+	if (gcm)
+		smb3_crypto_init_gcm_param(&param, sr->nonce, nonce_size,
+		    buf + SMB3_NONCE_OFFS,
+		    SMB3_TFORM_HDR_SIZE - SMB3_NONCE_OFFS);
+	else
+		smb3_crypto_init_ccm_param(&param, sr->nonce, nonce_size,
+		    buf + SMB3_NONCE_OFFS,
+		    SMB3_TFORM_HDR_SIZE - SMB3_NONCE_OFFS, resid);
 
 	/*
 	 * Unlike signing, which uses one global mech struct,
diff --git a/usr/src/uts/common/fs/smbsrv/smb3_encrypt_kcf.c b/usr/src/uts/common/fs/smbsrv/smb3_encrypt_kcf.c
index 690a2d792d..c4392feb01 100644
--- a/usr/src/uts/common/fs/smbsrv/smb3_encrypt_kcf.c
+++ b/usr/src/uts/common/fs/smbsrv/smb3_encrypt_kcf.c
@@ -11,6 +11,7 @@
 
 /*
  * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2020 RackTop Systems, Inc.
 */
 
 /*
@@ -28,36 +29,63 @@
 #include <sys/cmn_err.h>
 
 /*
- * SMB3 encryption helpers:
- * (getmech, init, update, final)
+ * Common function to see if a mech is available.
 */
-
-int
-smb3_encrypt_getmech(smb_crypto_mech_t *mech)
+static int
+find_mech(smb_crypto_mech_t *mech, const char *name)
 {
 	crypto_mech_type_t t;
 
-	t = crypto_mech2id(SUN_CKM_AES_CCM);
+	t = crypto_mech2id(name);
 	if (t == CRYPTO_MECH_INVALID) {
-		cmn_err(CE_NOTE, "smb: no kcf mech: %s", SUN_CKM_AES_CCM);
+		cmn_err(CE_NOTE, "smb: no kcf mech: %s", name);
 		return (-1);
 	}
 	mech->cm_type = t;
-
 	return (0);
 }
 
+/*
+ * SMB3 encryption helpers:
+ * (getmech, init, update, final)
+ */
+
+int
+smb3_aes_ccm_getmech(smb_crypto_mech_t *mech)
+{
+	return (find_mech(mech, SUN_CKM_AES_CCM));
+}
+
+int
+smb3_aes_gcm_getmech(smb_crypto_mech_t *mech)
+{
+	return (find_mech(mech, SUN_CKM_AES_GCM));
+}
+
 void
-smb3_crypto_init_param(smb3_crypto_param_t *param,
+smb3_crypto_init_ccm_param(smb3_crypto_param_t *param,
     uint8_t *nonce, size_t noncesize, uint8_t *auth, size_t authsize,
     size_t datasize)
 {
-	param->ulMACSize = SMB2_SIG_SIZE;
-	param->ulNonceSize = noncesize;
-	param->nonce = nonce;
-	param->ulDataSize = datasize;
-	param->ulAuthDataSize = authsize;
-	param->authData = auth;
+	param->ccm.ulMACSize = SMB2_SIG_SIZE;
+	param->ccm.ulNonceSize = noncesize;
+	param->ccm.nonce = nonce;
+	param->ccm.ulDataSize = datasize;
+	param->ccm.ulAuthDataSize = authsize;
+	param->ccm.authData = auth;
+}
+
+void
+smb3_crypto_init_gcm_param(smb3_crypto_param_t *param,
+    uint8_t *nonce, size_t noncesize, uint8_t *auth, size_t authsize)
+{
+	ASSERT3U(noncesize, ==, 12);
+	param->gcm.pIv = nonce;
+	param->gcm.ulIvLen = noncesize;		/* should be 12 bytes */
+	/* tform hdr size - (protocol id + signing) == 32 bytes */
+	param->gcm.ulTagBits = SMB2_SIG_SIZE << 3; /* convert bytes to bits */
+	param->gcm.pAAD = auth;			/* auth data */
+	param->gcm.ulAADLen = authsize;		/* auth data len */
 }
 
 /*
@@ -199,7 +227,22 @@ smb3_encrypt_final(smb3_enc_ctx_t *ctxp, uint8_t *digest16)
smb3_encrypt_final(smb3_enc_ctx_t *ctxp, uint8_t *digest16)
		return (-1);
	}

-	outlen = out.cd_offset - SMB2_SIG_SIZE;
+	/*
+	 * For some reason the AES module processes ccm_encrypt_final and
+	 * gcm_encrypt_final differently.
+	 * For GCM it restores the original offset (which is 0) and updates
+	 * cd_length to the size of the residual data + mac len.
+	 * For CCM it does nothing, which means the offset is updated and
+	 * cd_length is decreased by the size of the residual data + mac len.
+	 */
+	if (out.cd_offset == 0) {
+		/* GCM */
+		outlen = out.cd_length - SMB2_SIG_SIZE;
+	} else {
+		/* CCM */
+		outlen = out.cd_offset - SMB2_SIG_SIZE;
+	}
+
	if (outlen > 0)
		bcopy(buf, ctxp->output.cd_raw.iov_base +
		    ctxp->output.cd_offset, outlen);
diff --git a/usr/src/uts/common/fs/smbsrv/smb3_kdf.c b/usr/src/uts/common/fs/smbsrv/smb3_kdf.c
new file mode 100644
index 0000000000..e62acd8808
--- /dev/null
+++ b/usr/src/uts/common/fs/smbsrv/smb3_kdf.c
@@ -0,0 +1,137 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2020 RackTop Systems, Inc.
+ */
+
+#include <smbsrv/smb_kcrypt.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+
+/*
+ * Derive SMB3 key as described in [MS-SMB2] 3.1.4.2
+ * and [NIST SP800-108]
+ *
+ * r = 32, L = 128, PRF = HMAC-SHA256, key = (session key)
+ */
+
+/*
+ * SMB 3.0.2 KDF Input
+ *
+ * Session.SigningKey for binding a session:
+ * - Session.SessionKey as K1
+ * - label = "SMB2AESCMAC" (size 12)
+ * - context = "SmbSign" (size 8)
+ * Channel.SigningKey for all other requests
+ * - if SMB2_SESSION_FLAG_BINDING, GSS key (in Session.SessionKey?) as K1;
+ * - otherwise, Session.SessionKey as K1
+ * - label = "SMB2AESCMAC" (size 12)
+ * - context = "SmbSign" (size 8)
+ * Session.ApplicationKey for ... (not sure what yet)
+ * - Session.SessionKey as K1
+ * - label = "SMB2APP" (size 8)
+ * - context = "SmbRpc" (size 7)
+ * Session.EncryptionKey for encrypting server messages
+ * - Session.SessionKey as K1
+ * - label = "SMB2AESCCM" (size 11)
+ * - context = "ServerOut" (size 10)
+ * Session.DecryptionKey for decrypting client requests
+ * - Session.SessionKey as K1
+ * - label = "SMB2AESCCM" (size 11)
+ * - context = "ServerIn " (size 10) (Note the space)
+ */
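As a concrete check of the sizes listed above (and matching the two static KDF-input tables this change deletes from the encryption code), the 3.0.2 "ServerOut" encryption-key input works out to 30 bytes:

static const uint8_t serverout_kdf_input[30] = {
	0, 0, 0, 1,			/* counter i = 1 */
	'S', 'M', 'B', '2', 'A', 'E', 'S', 'C', 'C', 'M', 0,
					/* label + NUL (11 bytes) */
	0,				/* SP800-108 separator */
	'S', 'e', 'r', 'v', 'e', 'r', 'O', 'u', 't', 0,
					/* context + NUL (10 bytes) */
	0, 0, 0, 0x80			/* L = 128 bits, big-endian */
};
/* 4 + 11 + 1 + 10 + 4 == 30; the key is the first 16 bytes of HMAC-SHA256 */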
+
+/*
+ * SMB 3.1.1 KDF Input
+ *
+ * Session.SigningKey for binding a session:
+ * - Session.SessionKey as K1
+ * - label = "SMBSigningKey" (size 14)
+ * - context = preauth hashval
+ * Channel.SigningKey for all other requests
+ * - if SMB2_SESSION_FLAG_BINDING, GSS key (in Session.SessionKey?) as K1;
+ * - otherwise, Session.SessionKey as K1
+ * - label = "SMBSigningKey" (size 14)
+ * - context = preauth hashval
+ * Session.EncryptionKey for encrypting server messages
+ * - Session.SessionKey as K1
+ * - label = "SMBS2CCipherKey" (size 16)
+ * - context = preauth hashval
+ * Session.DecryptionKey for decrypting client requests
+ * - Session.SessionKey as K1
+ * - label = "SMBC2SCipherKey" (size 16)
+ * - context = preauth hashval
+ */
+
+/*
+ * SMB3KDF(Ki, Label, Context)
+ * counter || Label || 0x00 || Context || L
+ */
+int
+smb3_kdf(uint8_t *outbuf,
+    uint8_t *key, size_t key_len,
+    uint8_t *label, size_t label_len,
+    uint8_t *context, size_t context_len)
+{
+	static uint8_t L[4] = { 0, 0, 0, 0x80 };
+	uint8_t digest32[SHA256_DIGEST_LENGTH];
+	/* Maximum length of kdf input is 89 for the Encryption/Decryption */
+	/* keys: 4 (counter) + 16 (label) + 1 (sep) + 64 (context) + 4 (L) */
+	uint8_t kdfbuf[89] = { 0, 0, 0, 1 }; /* initialized by counter */
+	smb_crypto_mech_t mech;
+	smb_sign_ctx_t hctx = 0;
+	int pos = 4; /* skip counter */
+	int rc;
+
+	bcopy(label, &kdfbuf[pos], label_len);
+	pos += label_len;
+
+	kdfbuf[pos] = 0;
+	pos++;
+
+	bcopy(context, &kdfbuf[pos], context_len);
+	pos += context_len;
+
+	bcopy(L, &kdfbuf[pos], 4);
+	pos += 4;
+
+	bzero(&mech, sizeof (mech));
+	if ((rc = smb2_hmac_getmech(&mech)) != 0)
+		return (rc);
+
+	/* Limit the SessionKey input to its maximum size (16 bytes) */
+	rc = smb2_hmac_init(&hctx, &mech, key, MIN(key_len, SMB2_KEYLEN));
+	if (rc != 0)
+		return (rc);
+
+	if ((rc = smb2_hmac_update(hctx, kdfbuf, pos)) != 0)
+		return (rc);
+
+	if ((rc = smb2_hmac_final(hctx, digest32)) != 0)
+		return (rc);
+
+	/* Output is first 16 bytes of digest. */
+	bcopy(digest32, outbuf, SMB3_KEYLEN);
+	return (0);
+}
diff --git a/usr/src/uts/common/fs/smbsrv/smb_authenticate.c b/usr/src/uts/common/fs/smbsrv/smb_authenticate.c
index 64f26363a6..c6da5a5158 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_authenticate.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_authenticate.c
@@ -21,6 +21,7 @@
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2019 Nexenta by DDN, Inc. All rights reserved.
+ * Copyright 2020 RackTop Systems, Inc.
 */

/*
@@ -36,6 +37,7 @@
#include <smbsrv/smb_idmap.h>
#include <smbsrv/smb_kproto.h>
#include <smbsrv/smb_token.h>
+#include <smbsrv/smb2_kproto.h>

static uint32_t smb_authsock_open(smb_request_t *);
static int smb_authsock_send(ksocket_t, void *, size_t);
@@ -285,6 +287,14 @@ smb_authenticate_ext(smb_request_t *sr)
			goto errout;
		msg_hdr.lmh_msgtype = LSA_MTYPE_ESFIRST;
+
+		if (sr->session->dialect >= SMB_VERS_3_11) {
+			if (smb31_preauth_sha512_calc(sr, &sr->command,
+			    sr->session->smb31_preauth_hashval,
+			    user->u_preauth_hashval) != 0)
+				cmn_err(CE_WARN, "(2) Preauth hash calculation "
+				    "failed");
+		}
	} else {
		user = smb_session_lookup_uid_st(sr->session,
		    sr->smb2_ssnid, sr->smb_uid, SMB_USER_STATE_LOGGING_ON);
@@ -295,6 +305,14 @@ smb_authenticate_ext(smb_request_t *sr)
		sr->uid_user = user;
		msg_hdr.lmh_msgtype = LSA_MTYPE_ESNEXT;
+
+		if (sr->session->dialect >= SMB_VERS_3_11) {
+			if (smb31_preauth_sha512_calc(sr, &sr->command,
+			    user->u_preauth_hashval,
+			    user->u_preauth_hashval) != 0)
+				cmn_err(CE_WARN, "(4) Preauth hash calculation "
+				    "failed");
+		}
	}

	/*
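Both calls above fold another session-setup message into the SMB 3.1.1 preauth integrity hash, which then serves as the KDF context for the cipher keys. A minimal sketch of that fold, assuming the kernel's generic SHA2 interfaces from <sys/sha2.h> (the actual smb31_preauth_sha512_calc() may be structured differently):

#include <sys/sha2.h>

/*
 * H(0) is 64 zero bytes, set at negotiate time; each negotiate and
 * session-setup packet folds in as H(n) = SHA-512(H(n-1) || packet n)
 * ([MS-SMB2] 3.3.5.5.3).
 */
static void
preauth_fold(uint8_t hashval[SHA512_DIGEST_LENGTH],
    const void *pkt, size_t pktlen)
{
	SHA2_CTX ctx;

	SHA2Init(SHA512, &ctx);
	SHA2Update(&ctx, hashval, SHA512_DIGEST_LENGTH);	/* H(n-1) */
	SHA2Update(&ctx, pkt, pktlen);				/* packet */
	SHA2Final(hashval, &ctx);				/* H(n) */
}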
diff --git a/usr/src/uts/common/fs/smbsrv/smb_cmn_oplock.c b/usr/src/uts/common/fs/smbsrv/smb_cmn_oplock.c
index 39d67dd824..8ec21f5f37 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_cmn_oplock.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_cmn_oplock.c
@@ -10,7 +10,7 @@
 */

/*
- * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2020 Nexenta by DDN, Inc. All rights reserved.
 */

/*
@@ -495,9 +495,20 @@ smb_oplock_request(smb_request_t *sr, smb_ofile_t *ofile, uint32_t *statep)
	}

	/* Give caller back the "Granular" bit. */
-	if (status == NT_STATUS_SUCCESS)
+	if (status == NT_STATUS_SUCCESS) {
		*statep |= LEVEL_GRANULAR;
+
+		/*
+		 * The oplock lease may have moved to this ofile. Update.
+		 * Minor violation of layering here (leases vs oplocks)
+		 * but we want this update covered by the oplock mutex.
+		 */
+#ifndef TESTJIG
+		if (ofile->f_lease != NULL)
+			ofile->f_lease->ls_oplock_ofile = ofile;
+#endif
+	}
+
out:
	mutex_exit(&node->n_oplock.ol_mutex);
	smb_llist_exit(&node->n_ofile_list);
@@ -545,6 +556,12 @@ smb_oplock_req_excl(
	ASSERT(MUTEX_HELD(&node->n_oplock.ol_mutex));

	/*
+	 * Don't allow grants on closing ofiles.
+	 */
+	if (ofile->f_oplock.og_closing)
+		return (status);
+
+	/*
	 * If Open.Stream.Oplock is empty:
	 *   Build a new Oplock object with fields initialized as follows:
	 *     Oplock.State set to NO_OPLOCK.
@@ -1030,6 +1047,12 @@ smb_oplock_req_shared(
	ASSERT(MUTEX_HELD(&node->n_oplock.ol_mutex));

	/*
+	 * Don't allow grants on closing ofiles.
+	 */
+	if (ofile->f_oplock.og_closing)
+		return (status);
+
+	/*
	 * If Open.Stream.Oplock is empty:
	 *   Build a new Oplock object with fields initialized as follows:
	 *     Oplock.State set to NO_OPLOCK.
@@ -2036,6 +2059,20 @@ smb_oplock_ack_break(
	} /* Switch (oplock.state) */

out:
+	if (status == NT_STATUS_INVALID_OPLOCK_PROTOCOL)
+		*rop = LEVEL_NONE;
+
+	if (status == NT_STATUS_SUCCESS &&
+	    type == LEVEL_GRANULAR &&
+	    *rop != LEVEL_NONE) {
+		*rop |= LEVEL_GRANULAR;
+		/* As above, leased oplock may have moved. */
+#ifndef TESTJIG
+		if (ofile->f_lease != NULL)
+			ofile->f_lease->ls_oplock_ofile = ofile;
+#endif
+	}
+
	/*
	 * The spec.
describes waiting for a break here, * but we let the caller do that (when needed) if @@ -2044,14 +2081,6 @@ out: mutex_exit(&node->n_oplock.ol_mutex); smb_llist_exit(&node->n_ofile_list); - if (status == NT_STATUS_INVALID_OPLOCK_PROTOCOL) - *rop = LEVEL_NONE; - - if (status == NT_STATUS_SUCCESS && - type == LEVEL_GRANULAR && - *rop != LEVEL_NONE) - *rop |= LEVEL_GRANULAR; - return (status); } @@ -2257,13 +2286,12 @@ smb_oplock_break_CLOSE(smb_node_t *node, smb_ofile_t *ofile) { smb_ofile_t *o; - if (ofile == NULL) { - ASSERT(0); - return; - } + ASSERT(RW_READ_HELD(&node->n_ofile_list.ll_lock)); + ASSERT(MUTEX_HELD(&node->n_oplock.ol_mutex)); - smb_llist_enter(&node->n_ofile_list, RW_READER); - mutex_enter(&node->n_oplock.ol_mutex); + if (ofile->f_oplock.og_closing) + return; + ofile->f_oplock.og_closing = B_TRUE; /* * If Oplock.IIOplocks is not empty: @@ -2481,8 +2509,6 @@ smb_oplock_break_CLOSE(smb_node_t *node, smb_ofile_t *ofile) if ((node->n_oplock.ol_state & BREAK_ANY) == 0) cv_broadcast(&node->n_oplock.WaitingOpenCV); - mutex_exit(&node->n_oplock.ol_mutex); - smb_llist_exit(&node->n_ofile_list); } /* @@ -3515,8 +3541,7 @@ smb_oplock_move(smb_node_t *node, ASSERT(fr_ofile->f_node == node); ASSERT(to_ofile->f_node == node); - - mutex_enter(&node->n_oplock.ol_mutex); + ASSERT(MUTEX_HELD(&node->n_oplock.ol_mutex)); /* * The ofile to which we're moving the oplock @@ -3541,5 +3566,4 @@ smb_oplock_move(smb_node_t *node, if (node->n_oplock.excl_open == fr_ofile) node->n_oplock.excl_open = to_ofile; - mutex_exit(&node->n_oplock.ol_mutex); } diff --git a/usr/src/uts/common/fs/smbsrv/smb_common_open.c b/usr/src/uts/common/fs/smbsrv/smb_common_open.c index 8007463ba1..fb4d46f599 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_common_open.c +++ b/usr/src/uts/common/fs/smbsrv/smb_common_open.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + * Copyright 2020 Tintri by DDN, Inc. All rights reserved. */ /* @@ -253,6 +253,7 @@ smb_common_open(smb_request_t *sr) smb_node_t *fnode = NULL; smb_node_t *dnode = NULL; smb_node_t *cur_node = NULL; + smb_node_t *tmp_node = NULL; smb_arg_open_t *op = &sr->sr_open; smb_pathname_t *pn = &op->fqi.fq_path; smb_ofile_t *of = NULL; @@ -269,6 +270,7 @@ smb_common_open(smb_request_t *sr) uint16_t tree_fid = 0; boolean_t created = B_FALSE; boolean_t last_comp_found = B_FALSE; + boolean_t stream_found = B_FALSE; boolean_t opening_incr = B_FALSE; boolean_t dnode_held = B_FALSE; boolean_t dnode_wlock = B_FALSE; @@ -278,6 +280,7 @@ smb_common_open(smb_request_t *sr) boolean_t did_open = B_FALSE; boolean_t did_break_handle = B_FALSE; boolean_t did_cleanup_orphans = B_FALSE; + char *sname = NULL; /* Get out now if we've been cancelled. */ mutex_enter(&sr->sr_mutex); @@ -418,9 +421,13 @@ smb_common_open(smb_request_t *sr) if ((op->desired_access & ~FILE_READ_ATTRIBUTES) == DELETE) lookup_flags &= ~SMB_FOLLOW_LINKS; - rc = smb_fsop_lookup_name(sr, zone_kcred(), lookup_flags, + /* + * Lookup *just* the file portion of the name. 
+ * Returns stream name in sname, which this allocates + */ + rc = smb_fsop_lookup_file(sr, zone_kcred(), lookup_flags, sr->tid_tree->t_snode, op->fqi.fq_dnode, op->fqi.fq_last_comp, - &op->fqi.fq_fnode); + &sname, &op->fqi.fq_fnode); if (rc == 0) { last_comp_found = B_TRUE; @@ -449,9 +456,6 @@ smb_common_open(smb_request_t *sr) if (last_comp_found) { - smb_node_unlock(dnode); - dnode_wlock = B_FALSE; - fnode = op->fqi.fq_fnode; dnode = op->fqi.fq_dnode; @@ -468,8 +472,9 @@ smb_common_open(smb_request_t *sr) * it must NOT be (required by Lotus Notes) * - the target is NOT a directory and client requires that * it MUST be. + * Streams are never directories. */ - if (smb_node_is_dir(fnode)) { + if (smb_node_is_dir(fnode) && sname == NULL) { if (op->create_options & FILE_NON_DIRECTORY_FILE) { status = NT_STATUS_FILE_IS_A_DIRECTORY; goto errout; @@ -482,20 +487,81 @@ smb_common_open(smb_request_t *sr) } } - /* - * No more open should be accepted when "Delete on close" - * flag is set. - */ - if (fnode->flags & NODE_FLAGS_DELETE_ON_CLOSE) { - status = NT_STATUS_DELETE_PENDING; - goto errout; + /* If we're given a stream name, look it up now */ + if (sname != NULL) { + tmp_node = fnode; + rc = smb_fsop_lookup_stream(sr, zone_kcred(), + lookup_flags, sr->tid_tree->t_snode, fnode, sname, + &fnode); + } else { + rc = 0; } - /* - * Specified file already exists so the operation should fail. - */ - if (op->create_disposition == FILE_CREATE) { - status = NT_STATUS_OBJECT_NAME_COLLISION; + if (rc == 0) { /* Stream Exists (including unnamed stream) */ + stream_found = B_TRUE; + smb_node_unlock(dnode); + dnode_wlock = B_FALSE; + + if (tmp_node != NULL) + smb_node_release(tmp_node); + + /* + * No more open should be accepted when + * "Delete on close" flag is set. + */ + if (fnode->flags & NODE_FLAGS_DELETE_ON_CLOSE) { + status = NT_STATUS_DELETE_PENDING; + goto errout; + } + + /* + * Specified file already exists + * so the operation should fail. 
+ */ + if (op->create_disposition == FILE_CREATE) { + status = NT_STATUS_OBJECT_NAME_COLLISION; + goto errout; + } + + if ((op->create_disposition == FILE_SUPERSEDE) || + (op->create_disposition == FILE_OVERWRITE_IF) || + (op->create_disposition == FILE_OVERWRITE)) { + + if (sname == NULL) { + if (!smb_sattr_check( + op->fqi.fq_fattr.sa_dosattr, + op->dattr)) { + status = + NT_STATUS_ACCESS_DENIED; + goto errout; + } + op->desired_access |= + FILE_WRITE_ATTRIBUTES; + } + + if (smb_node_is_dir(fnode)) { + status = NT_STATUS_ACCESS_DENIED; + goto errout; + } + } + + /* MS-FSA 2.1.5.1.2 */ + if (op->create_disposition == FILE_SUPERSEDE) + op->desired_access |= DELETE; + if ((op->create_disposition == FILE_OVERWRITE_IF) || + (op->create_disposition == FILE_OVERWRITE)) + op->desired_access |= FILE_WRITE_DATA; + } else if (rc == ENOENT) { /* File Exists, but Stream doesn't */ + if (op->create_disposition == FILE_OPEN || + op->create_disposition == FILE_OVERWRITE) { + status = NT_STATUS_OBJECT_NAME_NOT_FOUND; + goto errout; + } + + op->desired_access |= FILE_WRITE_DATA; + } else { /* Error looking up stream */ + status = smb_errno2status(rc); + fnode = tmp_node; goto errout; } @@ -520,29 +586,6 @@ smb_common_open(smb_request_t *sr) } } - if ((op->create_disposition == FILE_SUPERSEDE) || - (op->create_disposition == FILE_OVERWRITE_IF) || - (op->create_disposition == FILE_OVERWRITE)) { - - if (!smb_sattr_check(op->fqi.fq_fattr.sa_dosattr, - op->dattr)) { - status = NT_STATUS_ACCESS_DENIED; - goto errout; - } - - if (smb_node_is_dir(fnode)) { - status = NT_STATUS_ACCESS_DENIED; - goto errout; - } - } - - /* MS-FSA 2.1.5.1.2 */ - if (op->create_disposition == FILE_SUPERSEDE) - op->desired_access |= DELETE; - if ((op->create_disposition == FILE_OVERWRITE_IF) || - (op->create_disposition == FILE_OVERWRITE)) - op->desired_access |= FILE_WRITE_DATA; - /* Dataset roots can't be deleted, so don't set DOC */ if ((op->create_options & FILE_DELETE_ON_CLOSE) != 0 && (fnode->flags & NODE_FLAGS_VFSROOT) != 0) { @@ -552,6 +595,7 @@ smb_common_open(smb_request_t *sr) status = smb_fsop_access(sr, sr->user_cr, fnode, op->desired_access); + if (status != NT_STATUS_SUCCESS) goto errout; @@ -575,6 +619,31 @@ smb_common_open(smb_request_t *sr) if ((op->desired_access & FILE_DATA_ALL) != 0) op->desired_access |= FILE_READ_ATTRIBUTES; + /* If the stream didn't exist, create it now */ + if (!stream_found) { + smb_node_t *tmp_node = fnode; + + bzero(&new_attr, sizeof (new_attr)); + new_attr.sa_vattr.va_type = VREG; + new_attr.sa_vattr.va_mode = S_IRUSR; + new_attr.sa_mask |= SMB_AT_TYPE | SMB_AT_MODE; + + rc = smb_fsop_create_stream(sr, sr->user_cr, dnode, + fnode, sname, lookup_flags, &new_attr, &fnode); + smb_node_release(tmp_node); + + if (rc != 0) { + status = smb_errno2status(rc); + fnode_held = B_FALSE; + goto errout; + } + op->action_taken = SMB_OACT_CREATED; + created = B_TRUE; + + smb_node_unlock(dnode); + dnode_wlock = B_FALSE; + } + /* * Oplock break is done prior to sharing checks as the break * may cause other clients to close the file which would @@ -593,6 +662,24 @@ smb_common_open(smb_request_t *sr) smb_node_inc_opening_count(fnode); opening_incr = B_TRUE; + if (!stream_found) { + /* + * Stake our Share Access claim. 
+ */ + smb_node_wrlock(fnode); + fnode_wlock = B_TRUE; + + status = smb_fsop_shrlock(sr->user_cr, fnode, uniq_fid, + op->desired_access, op->share_access); + if (status != 0) + goto errout; + + fnode_shrlk = B_TRUE; + smb_node_unlock(fnode); + fnode_wlock = B_FALSE; + goto stream_created; + } + /* * XXX Supposed to do share access checks next. * [MS-FSA] describes that as part of access check: @@ -780,11 +867,20 @@ smb_common_open(smb_request_t *sr) case FILE_SUPERSEDE: case FILE_OVERWRITE_IF: case FILE_OVERWRITE: - op->dattr |= FILE_ATTRIBUTE_ARCHIVE; - /* Don't apply readonly until smb_set_open_attributes */ - if (op->dattr & FILE_ATTRIBUTE_READONLY) { - op->dattr &= ~FILE_ATTRIBUTE_READONLY; - op->created_readonly = B_TRUE; + bzero(&new_attr, sizeof (new_attr)); + if (sname == NULL) { + op->dattr |= FILE_ATTRIBUTE_ARCHIVE; + /* + * Don't apply readonly until + * smb_set_open_attributes + */ + if (op->dattr & FILE_ATTRIBUTE_READONLY) { + op->dattr &= ~FILE_ATTRIBUTE_READONLY; + op->created_readonly = B_TRUE; + } + new_attr.sa_dosattr = op->dattr; + } else { + new_attr.sa_dosattr = FILE_ATTRIBUTE_ARCHIVE; } /* @@ -793,8 +889,6 @@ smb_common_open(smb_request_t *sr) * after we have an ofile. See: * smb_set_open_attributes */ - bzero(&new_attr, sizeof (new_attr)); - new_attr.sa_dosattr = op->dattr; new_attr.sa_vattr.va_size = 0; new_attr.sa_mask = SMB_AT_DOSATTR | SMB_AT_SIZE; rc = smb_fsop_setattr(sr, sr->user_cr, fnode, @@ -844,6 +938,12 @@ create: goto errout; } + if ((op->desired_access & ACCESS_SYSTEM_SECURITY) != 0 && + !smb_user_has_security_priv(sr->uid_user, sr->user_cr)) { + status = NT_STATUS_ACCESS_DENIED; + goto errout; + } + if (pn->pn_fname && smb_is_invalid_filename(pn->pn_fname)) { status = NT_STATUS_OBJECT_NAME_INVALID; goto errout; @@ -982,6 +1082,7 @@ create: (void) smb_oplock_break_PARENT(dnode, of); } +stream_created: /* * We might have blocked in smb_oplock_break_OPEN long enough * so a tree disconnect might have happened. In that case, @@ -1061,6 +1162,8 @@ create: * how that happens is protocol-specific. */ + if (sname != NULL) + kmem_free(sname, MAXNAMELEN); if (fnode_wlock) smb_node_unlock(fnode); if (opening_incr) @@ -1091,6 +1194,8 @@ errout: smb_delete_new_object(sr); } + if (sname != NULL) + kmem_free(sname, MAXNAMELEN); if (fnode_wlock) smb_node_unlock(fnode); if (opening_incr) @@ -1147,22 +1252,6 @@ smb_set_open_attributes(smb_request_t *sr, smb_ofile_t *of) attr.sa_mask |= SMB_AT_MTIME; } - /* - * Used to have code here to set mtime, ctime, atime - * when the open op->create_disposition is any of: - * FILE_SUPERSEDE, FILE_OVERWRITE_IF, FILE_OVERWRITE. - * We know that in those cases we will have set the - * file size, in which case the file system will - * update those times, so we don't have to. - * - * However, keep track of the fact that we modified - * the file via this handle, so we can do the evil, - * gratuitious mtime update on close that Windows - * clients expect. - */ - if (op->action_taken == SMB_OACT_TRUNCATED) - of->f_written = B_TRUE; - if (attr.sa_mask != 0) rc = smb_node_setattr(sr, node, of->f_cr, of, &attr); diff --git a/usr/src/uts/common/fs/smbsrv/smb_fem.c b/usr/src/uts/common/fs/smbsrv/smb_fem.c index c41ddddac8..b68466edaa 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_fem.c +++ b/usr/src/uts/common/fs/smbsrv/smb_fem.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + * Copyright 2020 Tintri by DDN, Inc. 
All rights reserved. * Copyright 2015 Joyent, Inc. */ @@ -170,12 +170,15 @@ smb_fem_fcn_install(smb_node_t *node) return (rc); } -void +int smb_fem_fcn_uninstall(smb_node_t *node) { + int rc; + if (smb_fcn_ops == NULL) - return; - VERIFY0(fem_uninstall(node->vp, smb_fcn_ops, (void *)node)); + return (ENOSYS); + rc = fem_uninstall(node->vp, smb_fcn_ops, (void *)node); + return (rc); } int diff --git a/usr/src/uts/common/fs/smbsrv/smb_fsops.c b/usr/src/uts/common/fs/smbsrv/smb_fsops.c index 8fafac5f60..4d6ffa5754 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_fsops.c +++ b/usr/src/uts/common/fs/smbsrv/smb_fsops.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2018 Nexenta Systems, Inc. All rights reserved. + * Copyright 2020 Nexenta by DDN, Inc. All rights reserved. */ #include <sys/sid.h> @@ -35,8 +35,8 @@ extern caller_context_t smb_ct; -static int smb_fsop_create_stream(smb_request_t *, cred_t *, smb_node_t *, - char *, char *, int, smb_attr_t *, smb_node_t **); +static int smb_fsop_create_file_with_stream(smb_request_t *, cred_t *, + smb_node_t *, char *, char *, int, smb_attr_t *, smb_node_t **); static int smb_fsop_create_file(smb_request_t *, cred_t *, smb_node_t *, char *, int, smb_attr_t *, smb_node_t **); @@ -136,6 +136,7 @@ smb_fsop_create_with_sd(smb_request_t *sr, cred_t *cr, boolean_t is_dir; ASSERT(fs_sd); + ASSERT(ret_snode != NULL); if (SMB_TREE_IS_CASEINSENSITIVE(sr)) flags = SMB_IGNORE_CASE; @@ -147,10 +148,9 @@ smb_fsop_create_with_sd(smb_request_t *sr, cred_t *cr, is_dir = ((fs_sd->sd_flags & SMB_FSSD_FLAGS_DIR) != 0); if (smb_tree_has_feature(sr->tid_tree, SMB_TREE_ACLONCREATE)) { - if (fs_sd->sd_secinfo & SMB_ACL_SECINFO) { - dacl = fs_sd->sd_zdacl; - sacl = fs_sd->sd_zsacl; - ASSERT(dacl || sacl); + dacl = fs_sd->sd_zdacl; + sacl = fs_sd->sd_zsacl; + if (dacl != NULL || sacl != NULL) { if (dacl && sacl) { acl = smb_fsacl_merge(dacl, sacl); } else if (dacl) { @@ -320,7 +320,7 @@ smb_fsop_create(smb_request_t *sr, cred_t *cr, smb_node_t *dnode, sname = kmem_alloc(MAXNAMELEN, KM_SLEEP); smb_stream_parse_name(name, fname, sname); - rc = smb_fsop_create_stream(sr, cr, dnode, + rc = smb_fsop_create_file_with_stream(sr, cr, dnode, fname, sname, flags, attr, ret_snode); kmem_free(fname, MAXNAMELEN); @@ -349,39 +349,31 @@ smb_fsop_create(smb_request_t *sr, cred_t *cr, smb_node_t *dnode, /* - * smb_fsop_create_stream + * smb_fsop_create_file_with_stream * - * Create NTFS named stream file (sname) on unnamed stream - * file (fname), creating the unnamed stream file if it + * Create named stream (sname) on file (fname), creating the file if it * doesn't exist. - * If we created the unnamed stream file and then creation - * of the named stream file fails, we delete the unnamed stream. + * If we created the file and then creation of the named stream fails, + * we delete the file. * Since we use the real file name for the smb_vop_remove we * clear the SMB_IGNORE_CASE flag to ensure a case sensitive * match. * - * The second parameter of smb_vop_setattr() is set to - * NULL, even though an unnamed stream exists. This is - * because we want to set the UID and GID on the named - * stream in this case for consistency with the (unnamed - * stream) file (see comments for smb_vop_setattr()). - * * Note that some stream "types" are "restricted" and only * internal callers (cr == kcred) can create those. 
*/ static int -smb_fsop_create_stream(smb_request_t *sr, cred_t *cr, +smb_fsop_create_file_with_stream(smb_request_t *sr, cred_t *cr, smb_node_t *dnode, char *fname, char *sname, int flags, smb_attr_t *attr, smb_node_t **ret_snode) { - smb_attr_t fattr; smb_node_t *fnode; - vnode_t *xattrdvp; - vnode_t *vp; cred_t *kcr = zone_kcred(); int rc = 0; boolean_t fcreate = B_FALSE; + ASSERT(ret_snode != NULL); + if (cr != kcr && smb_strname_restricted(sname)) return (EACCES); @@ -390,8 +382,10 @@ smb_fsop_create_stream(smb_request_t *sr, cred_t *cr, sr->tid_tree->t_snode, dnode, fname, &fnode); if (rc == 0) { if (smb_fsop_access(sr, sr->user_cr, fnode, - sr->sr_open.desired_access) != 0) + sr->sr_open.desired_access) != 0) { + smb_node_release(fnode); rc = EACCES; + } } else if (rc == ENOENT) { fcreate = B_TRUE; rc = smb_fsop_create_file(sr, cr, dnode, fname, flags, @@ -400,38 +394,77 @@ smb_fsop_create_stream(smb_request_t *sr, cred_t *cr, if (rc != 0) return (rc); - fattr.sa_mask = SMB_AT_UID | SMB_AT_GID; - rc = smb_vop_getattr(fnode->vp, NULL, &fattr, 0, kcr); + rc = smb_fsop_create_stream(sr, cr, dnode, fnode, sname, flags, attr, + ret_snode); - if (rc == 0) { - /* create the named stream, sname */ - rc = smb_vop_stream_create(fnode->vp, sname, attr, - &vp, &xattrdvp, flags, cr); - } if (rc != 0) { if (fcreate) { flags &= ~SMB_IGNORE_CASE; (void) smb_vop_remove(dnode->vp, fnode->od_name, flags, cr); } - smb_node_release(fnode); - return (rc); } + smb_node_release(fnode); + return (rc); +} + +/* + * smb_fsop_create_stream + * + * Create named stream (sname) on existing file (fnode). + * + * The second parameter of smb_vop_setattr() is set to + * NULL, even though an unnamed stream exists. This is + * because we want to set the UID and GID on the named + * stream in this case for consistency with the (unnamed + * stream) file (see comments for smb_vop_setattr()). + * + * Note that some stream "types" are "restricted" and only + * internal callers (cr == kcred) can create those. 
+ */ +int +smb_fsop_create_stream(smb_request_t *sr, cred_t *cr, + smb_node_t *dnode, smb_node_t *fnode, char *sname, int flags, + smb_attr_t *attr, smb_node_t **ret_snode) +{ + smb_attr_t fattr; + vnode_t *xattrdvp; + vnode_t *vp; + cred_t *kcr = zone_kcred(); + int rc = 0; + + ASSERT(ret_snode != NULL); + + if (cr != kcr && smb_strname_restricted(sname)) + return (EACCES); + + bzero(&fattr, sizeof (fattr)); + fattr.sa_mask = SMB_AT_UID | SMB_AT_GID; + rc = smb_vop_getattr(fnode->vp, NULL, &fattr, 0, kcr); + + if (rc == 0) { + /* create the named stream, sname */ + rc = smb_vop_stream_create(fnode->vp, sname, + attr, &vp, &xattrdvp, flags, cr); + } + if (rc != 0) + return (rc); + attr->sa_vattr.va_uid = fattr.sa_vattr.va_uid; attr->sa_vattr.va_gid = fattr.sa_vattr.va_gid; attr->sa_mask = SMB_AT_UID | SMB_AT_GID; rc = smb_vop_setattr(vp, NULL, attr, 0, kcr); if (rc != 0) { - smb_node_release(fnode); + VN_RELE(xattrdvp); + VN_RELE(vp); return (rc); } *ret_snode = smb_stream_node_lookup(sr, cr, fnode, xattrdvp, vp, sname); - smb_node_release(fnode); VN_RELE(xattrdvp); VN_RELE(vp); @@ -441,7 +474,7 @@ smb_fsop_create_stream(smb_request_t *sr, cred_t *cr, /* notify change to the unnamed stream */ if (rc == 0) smb_node_notify_change(dnode, - FILE_ACTION_ADDED_STREAM, fname); + FILE_ACTION_ADDED_STREAM, fnode->od_name); return (rc); } @@ -458,6 +491,8 @@ smb_fsop_create_file(smb_request_t *sr, cred_t *cr, vnode_t *vp; int rc; + ASSERT(ret_snode != NULL); + #ifdef _KERNEL smb_fssd_t fs_sd; uint32_t secinfo; @@ -466,15 +501,24 @@ smb_fsop_create_file(smb_request_t *sr, cred_t *cr, if (op->sd) { /* * SD sent by client in Windows format. Needs to be - * converted to FS format. No inheritance. + * converted to FS format. Inherit DACL/SACL if they're not + * specified. */ secinfo = smb_sd_get_secinfo(op->sd); + + if ((secinfo & SMB_SACL_SECINFO) != 0 && + !smb_user_has_security_priv(sr->uid_user, cr)) + return (EPERM); + smb_fssd_init(&fs_sd, secinfo, 0); status = smb_sd_tofs(op->sd, &fs_sd); if (status == NT_STATUS_SUCCESS) { - rc = smb_fsop_create_with_sd(sr, cr, dnode, - name, attr, ret_snode, &fs_sd); + rc = smb_fsop_sdinherit(sr, dnode, &fs_sd); + if (rc == 0) + rc = smb_fsop_create_with_sd(sr, cr, dnode, + name, attr, ret_snode, &fs_sd); + } else { rc = EINVAL; } @@ -485,7 +529,7 @@ smb_fsop_create_file(smb_request_t *sr, cred_t *cr, * Server applies Windows inheritance rules, * see smb_fsop_sdinherit() comments as to why. */ - smb_fssd_init(&fs_sd, SMB_ACL_SECINFO, 0); + smb_fssd_init(&fs_sd, 0, 0); rc = smb_fsop_sdinherit(sr, dnode, &fs_sd); if (rc == 0) { rc = smb_fsop_create_with_sd(sr, cr, dnode, @@ -607,15 +651,23 @@ smb_fsop_mkdir( if (op->sd) { /* * SD sent by client in Windows format. Needs to be - * converted to FS format. No inheritance. + * converted to FS format. Inherit DACL/SACL if they're not + * specified. */ secinfo = smb_sd_get_secinfo(op->sd); + + if ((secinfo & SMB_SACL_SECINFO) != 0 && + !smb_user_has_security_priv(sr->uid_user, cr)) + return (EPERM); + smb_fssd_init(&fs_sd, secinfo, SMB_FSSD_FLAGS_DIR); status = smb_sd_tofs(op->sd, &fs_sd); if (status == NT_STATUS_SUCCESS) { - rc = smb_fsop_create_with_sd(sr, cr, dnode, - name, attr, ret_snode, &fs_sd); + rc = smb_fsop_sdinherit(sr, dnode, &fs_sd); + if (rc == 0) + rc = smb_fsop_create_with_sd(sr, cr, dnode, + name, attr, ret_snode, &fs_sd); } else rc = EINVAL; @@ -626,7 +678,7 @@ smb_fsop_mkdir( * Server applies Windows inheritance rules, * see smb_fsop_sdinherit() comments as to why. 
 */
-	smb_fssd_init(&fs_sd, SMB_ACL_SECINFO, SMB_FSSD_FLAGS_DIR);
+	smb_fssd_init(&fs_sd, 0, SMB_FSSD_FLAGS_DIR);
	rc = smb_fsop_sdinherit(sr, dnode, &fs_sd);
	if (rc == 0) {
		rc = smb_fsop_create_with_sd(sr, cr, dnode,
@@ -1519,7 +1571,7 @@ smb_fsop_write(
		cr = kcr;
	}

-	smb_node_start_crit(snode, RW_WRITER);
+	smb_node_start_crit(snode, RW_READER);
	rc = nbl_svmand(vp, kcr, &svmand);
	if (rc) {
		smb_node_end_crit(snode);
@@ -1691,10 +1743,7 @@ smb_fsop_access(smb_request_t *sr, cred_t *cr, smb_node_t *snode,
	 * it's not part of DACL. It's only granted via proper
	 * privileges.
	 */
-	if ((sr->uid_user->u_privileges &
-	    (SMB_USER_PRIV_BACKUP |
-	    SMB_USER_PRIV_RESTORE |
-	    SMB_USER_PRIV_SECURITY)) == 0)
+	if (!smb_user_has_security_priv(sr->uid_user, cr))
		return (NT_STATUS_PRIVILEGE_NOT_HELD);

	faccess &= ~ACCESS_SYSTEM_SECURITY;
@@ -1736,9 +1785,13 @@ smb_fsop_access(smb_request_t *sr, cred_t *cr, smb_node_t *snode,
/*
 * smb_fsop_lookup_name()
 *
+ * Look up both the file and stream specified in 'name'.
 * If name indicates that the file is a stream file, perform
 * stream specific lookup, otherwise call smb_fsop_lookup.
 *
+ * On success, returns the found node in *ret_snode. This will be either a named
+ * or unnamed stream node, depending on the name specified.
+ *
 * Return an error if the looked-up file is in outside the tree.
 * (Required when invoked from open path.)
 *
@@ -1760,18 +1813,64 @@ smb_fsop_lookup_name(
    char *name,
    smb_node_t **ret_snode)
{
-	smb_node_t *fnode;
-	vnode_t *xattrdirvp;
-	vnode_t *vp;
-	char *od_name;
+	char *sname = NULL;
+	int rc;
+	smb_node_t *tmp_node;
+
+	ASSERT(ret_snode != NULL);
+
+	rc = smb_fsop_lookup_file(sr, cr, flags, root_node, dnode, name,
+	    &sname, ret_snode);
+
+	if (rc != 0 || sname == NULL)
+		return (rc);
+
+	tmp_node = *ret_snode;
+	rc = smb_fsop_lookup_stream(sr, cr, flags, root_node, tmp_node, sname,
+	    ret_snode);
+	kmem_free(sname, MAXNAMELEN);
+	smb_node_release(tmp_node);
+
+	return (rc);
+}
+
+/*
+ * smb_fsop_lookup_file()
+ *
+ * Look up the file portion of 'name'. If a stream is specified,
+ * return the stream name in 'sname', which this allocates.
+ * The caller must free 'sname'.
+ *
+ * Return an error if the looked-up file is outside the tree.
+ * (Required when invoked from open path.)
+ *
+ * Case sensitivity flags (SMB_IGNORE_CASE, SMB_CASE_SENSITIVE):
+ * if SMB_CASE_SENSITIVE is set, the SMB_IGNORE_CASE flag will NOT be set
+ * based on the tree's case sensitivity. However, if the SMB_IGNORE_CASE
+ * flag is set in the flags value passed as a parameter, a case insensitive
+ * lookup WILL be done (regardless of whether SMB_CASE_SENSITIVE is set
+ * or not).
+ */
+
+int
+smb_fsop_lookup_file(
+    smb_request_t *sr,
+    cred_t *cr,
+    int flags,
+    smb_node_t *root_node,
+    smb_node_t *dnode,
+    char *name,
+    char **sname,
+    smb_node_t **ret_snode)
+{
	char *fname;
-	char *sname;
	int rc;

	ASSERT(cr);
	ASSERT(dnode);
	ASSERT(dnode->n_magic == SMB_NODE_MAGIC);
	ASSERT(dnode->n_state != SMB_NODE_STATE_DESTROYING);
+	ASSERT(ret_snode != NULL);

	/*
	 * The following check is required for streams processing, below
@@ -1782,11 +1881,11 @@ smb_fsop_lookup_name(
		flags |= SMB_IGNORE_CASE;
	}

-	fname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
-	sname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
-
+	*sname = NULL;
	if (smb_is_stream_name(name)) {
-		smb_stream_parse_name(name, fname, sname);
+		*sname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+		fname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+		smb_stream_parse_name(name, fname, *sname);
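An editorial illustration of the split performed by smb_stream_parse_name() above (the path is hypothetical, and exact normalization of the ":$DATA" type suffix is up to that helper):

/*
 * name  = "report.doc:summary:$DATA"
 * fname = "report.doc"	(file portion; the unnamed-stream lookup
 *			below uses this)
 * sname = "summary:$DATA"	(stream portion; resolved later by
 *			smb_fsop_lookup_stream())
 */

		/*
		 * Look up the unnamed stream (i.e. fname).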
@@ -1794,49 +1893,8 @@ smb_fsop_lookup_name( * as well as any link target. */ rc = smb_fsop_lookup(sr, cr, flags, root_node, dnode, - fname, &fnode); - - if (rc != 0) { - kmem_free(fname, MAXNAMELEN); - kmem_free(sname, MAXNAMELEN); - return (rc); - } - - od_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); - - /* - * od_name is the on-disk name of the stream, except - * without the prepended stream prefix (SMB_STREAM_PREFIX) - */ - - /* - * XXX - * What permissions NTFS requires for stream lookup if any? - */ - rc = smb_vop_stream_lookup(fnode->vp, sname, &vp, od_name, - &xattrdirvp, flags, root_node->vp, cr); - - if (rc != 0) { - smb_node_release(fnode); - kmem_free(fname, MAXNAMELEN); - kmem_free(sname, MAXNAMELEN); - kmem_free(od_name, MAXNAMELEN); - return (rc); - } - - *ret_snode = smb_stream_node_lookup(sr, cr, fnode, xattrdirvp, - vp, od_name); - - kmem_free(od_name, MAXNAMELEN); - smb_node_release(fnode); - VN_RELE(xattrdirvp); - VN_RELE(vp); - - if (*ret_snode == NULL) { - kmem_free(fname, MAXNAMELEN); - kmem_free(sname, MAXNAMELEN); - return (ENOMEM); - } + fname, ret_snode); + kmem_free(fname, MAXNAMELEN); } else { rc = smb_fsop_lookup(sr, cr, flags, root_node, dnode, name, ret_snode); @@ -1851,8 +1909,66 @@ smb_fsop_lookup_name( } } - kmem_free(fname, MAXNAMELEN); - kmem_free(sname, MAXNAMELEN); + if (rc != 0 && *sname != NULL) { + kmem_free(*sname, MAXNAMELEN); + *sname = NULL; + } + return (rc); +} + +/* + * smb_fsop_lookup_stream + * + * The file exists, see if the stream exists. + */ +int +smb_fsop_lookup_stream( + smb_request_t *sr, + cred_t *cr, + int flags, + smb_node_t *root_node, + smb_node_t *fnode, + char *sname, + smb_node_t **ret_snode) +{ + char *od_name; + vnode_t *xattrdirvp; + vnode_t *vp; + int rc; + + /* + * The following check is required for streams processing, below + */ + + if (!(flags & SMB_CASE_SENSITIVE)) { + if (SMB_TREE_IS_CASEINSENSITIVE(sr)) + flags |= SMB_IGNORE_CASE; + } + + od_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); + + /* + * od_name is the on-disk name of the stream, except + * without the prepended stream prefix (SMB_STREAM_PREFIX) + */ + + rc = smb_vop_stream_lookup(fnode->vp, sname, &vp, od_name, + &xattrdirvp, flags, root_node->vp, cr); + + if (rc != 0) { + kmem_free(od_name, MAXNAMELEN); + return (rc); + } + + *ret_snode = smb_stream_node_lookup(sr, cr, fnode, xattrdirvp, + vp, od_name); + + kmem_free(od_name, MAXNAMELEN); + VN_RELE(xattrdirvp); + VN_RELE(vp); + + if (*ret_snode == NULL) + return (ENOMEM); return (rc); } @@ -2391,6 +2507,8 @@ smb_fsop_sdmerge(smb_request_t *sr, smb_node_t *snode, smb_fssd_t *fs_sd) * owner has been specified. Callers should translate this to * STATUS_INVALID_OWNER which is not the normal mapping for EPERM * in upper layers, so EPERM is mapped to EBADE. + * + * If 'overwrite' is non-zero, then the existing ACL is ignored. */ int smb_fsop_sdwrite(smb_request_t *sr, cred_t *cr, smb_node_t *snode, @@ -2456,14 +2574,13 @@ smb_fsop_sdwrite(smb_request_t *sr, cred_t *cr, smb_node_t *snode, } if (fs_sd->sd_secinfo & SMB_ACL_SECINFO) { - if (overwrite == 0) { + if (overwrite == 0) error = smb_fsop_sdmerge(sr, snode, fs_sd); - if (error) - return (error); - } - error = smb_fsop_aclwrite(sr, cr, snode, fs_sd); - if (error) { + if (error == 0) + error = smb_fsop_aclwrite(sr, cr, snode, fs_sd); + + if (error != 0) { /* * Revert uid/gid changes if required. 
*/ @@ -2511,39 +2628,46 @@ smb_fsop_sdinherit(smb_request_t *sr, smb_node_t *dnode, smb_fssd_t *fs_sd) acl_t *sacl = NULL; int is_dir; int error; + uint32_t secinfo; + smb_fssd_t pfs_sd; ASSERT(fs_sd); - if (sr->tid_tree->t_acltype != ACE_T) { - /* - * No forced inheritance for non-ZFS filesystems. - */ - fs_sd->sd_secinfo = 0; + secinfo = fs_sd->sd_secinfo; + + /* Anything to do? */ + if ((secinfo & SMB_ACL_SECINFO) == SMB_ACL_SECINFO) + return (0); + + /* + * No forced inheritance for non-ZFS filesystems. + */ + if (sr->tid_tree->t_acltype != ACE_T) return (0); - } + smb_fssd_init(&pfs_sd, SMB_ACL_SECINFO, fs_sd->sd_flags); /* Fetch parent directory's ACL */ - error = smb_fsop_sdread(sr, zone_kcred(), dnode, fs_sd); + error = smb_fsop_sdread(sr, zone_kcred(), dnode, &pfs_sd); if (error) { return (error); } is_dir = (fs_sd->sd_flags & SMB_FSSD_FLAGS_DIR); - dacl = smb_fsacl_inherit(fs_sd->sd_zdacl, is_dir, SMB_DACL_SECINFO, - sr->user_cr); - sacl = smb_fsacl_inherit(fs_sd->sd_zsacl, is_dir, SMB_SACL_SECINFO, - sr->user_cr); - - if (sacl == NULL) - fs_sd->sd_secinfo &= ~SMB_SACL_SECINFO; - - smb_fsacl_free(fs_sd->sd_zdacl); - smb_fsacl_free(fs_sd->sd_zsacl); + if ((secinfo & SMB_DACL_SECINFO) == 0) { + dacl = smb_fsacl_inherit(pfs_sd.sd_zdacl, is_dir, + SMB_DACL_SECINFO, sr->user_cr); + fs_sd->sd_zdacl = dacl; + } - fs_sd->sd_zdacl = dacl; - fs_sd->sd_zsacl = sacl; + if ((secinfo & SMB_SACL_SECINFO) == 0) { + sacl = smb_fsacl_inherit(pfs_sd.sd_zsacl, is_dir, + SMB_SACL_SECINFO, sr->user_cr); + fs_sd->sd_zsacl = sacl; + } + smb_fsacl_free(pfs_sd.sd_zdacl); + smb_fsacl_free(pfs_sd.sd_zsacl); return (0); } #endif /* _KERNEL */ diff --git a/usr/src/uts/common/fs/smbsrv/smb_idmap.c b/usr/src/uts/common/fs/smbsrv/smb_idmap.c index b9bfa991c4..e6c04193b0 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_idmap.c +++ b/usr/src/uts/common/fs/smbsrv/smb_idmap.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2018 Nexenta Systems, Inc. All rights reserved. + * Copyright 2020 Nexenta by DDN, Inc. All rights reserved. 
*/ /* @@ -83,12 +83,12 @@ smb_idmap_getsid(uid_t id, int idtype, smb_sid_t **sid) switch (idtype) { case SMB_IDMAP_USER: - sim.sim_stat = kidmap_getsidbyuid(global_zone, id, + sim.sim_stat = kidmap_getsidbyuid(curzone, id, (const char **)&sim.sim_domsid, &sim.sim_rid); break; case SMB_IDMAP_GROUP: - sim.sim_stat = kidmap_getsidbygid(global_zone, id, + sim.sim_stat = kidmap_getsidbygid(curzone, id, (const char **)&sim.sim_domsid, &sim.sim_rid); break; @@ -150,17 +150,17 @@ smb_idmap_getid(smb_sid_t *sid, uid_t *id, int *idtype) switch (*idtype) { case SMB_IDMAP_USER: - sim.sim_stat = kidmap_getuidbysid(global_zone, sim.sim_domsid, + sim.sim_stat = kidmap_getuidbysid(curzone, sim.sim_domsid, sim.sim_rid, sim.sim_id); break; case SMB_IDMAP_GROUP: - sim.sim_stat = kidmap_getgidbysid(global_zone, sim.sim_domsid, + sim.sim_stat = kidmap_getgidbysid(curzone, sim.sim_domsid, sim.sim_rid, sim.sim_id); break; case SMB_IDMAP_UNKNOWN: - sim.sim_stat = kidmap_getpidbysid(global_zone, sim.sim_domsid, + sim.sim_stat = kidmap_getpidbysid(curzone, sim.sim_domsid, sim.sim_rid, sim.sim_id, &sim.sim_idtype); break; @@ -186,7 +186,7 @@ smb_idmap_batch_create(smb_idmap_batch_t *sib, uint16_t nmap, int flags) bzero(sib, sizeof (smb_idmap_batch_t)); - sib->sib_idmaph = kidmap_get_create(global_zone); + sib->sib_idmaph = kidmap_get_create(curzone); sib->sib_flags = flags; sib->sib_nmap = nmap; diff --git a/usr/src/uts/common/fs/smbsrv/smb_mbuf_marshaling.c b/usr/src/uts/common/fs/smbsrv/smb_mbuf_marshaling.c index 132820a147..1476850683 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_mbuf_marshaling.c +++ b/usr/src/uts/common/fs/smbsrv/smb_mbuf_marshaling.c @@ -22,7 +22,7 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * - * Copyright 2018 Nexenta Systems, Inc. All rights reserved. + * Copyright 2019 Nexenta by DDN, Inc. All rights reserved. */ /* @@ -800,7 +800,7 @@ smb_mbc_poke(mbuf_chain_t *mbc, int offset, const char *fmt, ...) */ int smb_mbc_copy(mbuf_chain_t *dst_mbc, const mbuf_chain_t *src_mbc, - int copy_offset, int copy_len) + int copy_offset, int copy_len) { mbuf_t *src_m; int offset, len; @@ -1109,8 +1109,6 @@ mbc_marshal_put_oem_string(mbuf_chain_t *mbc, char *mbs, int repc) */ if (repc <= 0) repc = oemlen + 1; - if (mbc_marshal_make_room(mbc, repc)) - return (DECODE_NO_MORE_DATA); /* * Convert into a temporary buffer @@ -1133,6 +1131,10 @@ mbc_marshal_put_oem_string(mbuf_chain_t *mbc, char *mbs, int repc) */ s = oembuf; while (repc > 0) { + if (mbc_marshal_make_room(mbc, 1)) { + rc = DECODE_NO_MORE_DATA; + goto out; + } mbc_marshal_store_byte(mbc, *s); if (*s != '\0') s++; @@ -1158,6 +1160,7 @@ mbc_marshal_put_unicode_string(mbuf_chain_t *mbc, char *mbs, int repc) { smb_wchar_t *wcsbuf = NULL; smb_wchar_t *wp; + smb_wchar_t wchar; size_t wcslen, wcsbytes; size_t rlen; int rc; @@ -1183,8 +1186,6 @@ mbc_marshal_put_unicode_string(mbuf_chain_t *mbc, char *mbs, int repc) */ if (repc <= 0) repc = wcsbytes + 2; - if (mbc_marshal_make_room(mbc, repc)) - return (DECODE_NO_MORE_DATA); /* * Convert into a temporary buffer @@ -1208,18 +1209,27 @@ mbc_marshal_put_unicode_string(mbuf_chain_t *mbc, char *mbs, int repc) * little-endian order while copying. 
*/ wp = wcsbuf; - while (repc > 1) { - smb_wchar_t wchar = LE_IN16(wp); + while (repc >= sizeof (smb_wchar_t)) { + if (mbc_marshal_make_room(mbc, sizeof (smb_wchar_t))) { + rc = DECODE_NO_MORE_DATA; + goto out; + } + wchar = LE_IN16(wp); mbc_marshal_store_byte(mbc, wchar); mbc_marshal_store_byte(mbc, wchar >> 8); if (wchar != 0) wp++; repc -= sizeof (smb_wchar_t); } - if (repc > 0) + if (repc > 0) { + if (mbc_marshal_make_room(mbc, 1)) { + rc = DECODE_NO_MORE_DATA; + goto out; + } mbc_marshal_store_byte(mbc, 0); - + } rc = 0; + out: if (wcsbuf != NULL) smb_mem_free(wcsbuf); diff --git a/usr/src/uts/common/fs/smbsrv/smb_node.c b/usr/src/uts/common/fs/smbsrv/smb_node.c index 8ce3e70712..a204326514 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_node.c +++ b/usr/src/uts/common/fs/smbsrv/smb_node.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2019 Nexenta Systems, Inc. All rights reserved. + * Copyright 2020 Tintri by DDN, Inc. All rights reserved. */ /* * SMB Node State Machine @@ -478,6 +478,18 @@ smb_node_release(smb_node_t *node) case SMB_NODE_STATE_AVAILABLE: node->n_state = SMB_NODE_STATE_DESTROYING; + + /* + * While we still hold n_mutex, + * make sure FEM hooks are gone. + */ + if (node->n_fcn_count > 0) { + DTRACE_PROBE1(fem__fcn__dangles, + smb_node_t *, node); + node->n_fcn_count = 0; + (void) smb_fem_fcn_uninstall(node); + } + mutex_exit(&node->n_mutex); smb_llist_enter(node->n_hash_bucket, RW_WRITER); @@ -883,8 +895,9 @@ smb_node_fcn_unsubscribe(smb_node_t *node) mutex_enter(&node->n_mutex); node->n_fcn_count--; - if (node->n_fcn_count == 0) - smb_fem_fcn_uninstall(node); + if (node->n_fcn_count == 0) { + VERIFY0(smb_fem_fcn_uninstall(node)); + } mutex_exit(&node->n_mutex); } @@ -1479,6 +1492,7 @@ smb_node_setattr(smb_request_t *sr, smb_node_t *node, int rc; uint_t times_mask; smb_attr_t tmp_attr; + smb_node_t *unnamed_node; SMB_NODE_VALID(node); @@ -1543,14 +1557,6 @@ smb_node_setattr(smb_request_t *sr, smb_node_t *node, } /* - * If we have an open file, and we set the size, - * then set the "written" flag so that at close, - * we can force an mtime update. - */ - if (of != NULL && (attr->sa_mask & SMB_AT_SIZE) != 0) - of->f_written = B_TRUE; - - /* * When operating on an open file, some settable attributes * become "sticky" in the open file object until close. * (see above re. timestamps) @@ -1615,6 +1621,13 @@ smb_node_setattr(smb_request_t *sr, smb_node_t *node, FILE_ACTION_MODIFIED, node->od_name); } + if ((unnamed_node = SMB_IS_STREAM(node)) != NULL) { + ASSERT(unnamed_node->n_magic == SMB_NODE_MAGIC); + ASSERT(unnamed_node->n_state != SMB_NODE_STATE_DESTROYING); + smb_node_notify_change(node->n_dnode, + FILE_ACTION_MODIFIED_STREAM, node->od_name); + } + return (0); } diff --git a/usr/src/uts/common/fs/smbsrv/smb_notify.c b/usr/src/uts/common/fs/smbsrv/smb_notify.c index fda9197e6e..602fa1db3b 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_notify.c +++ b/usr/src/uts/common/fs/smbsrv/smb_notify.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + * Copyright 2020 Tintri by DDN, Inc. All rights reserved. */ /* @@ -97,8 +97,8 @@ * smb_notify_act1: * Validate parameters, setup ofile buffer. * If data already available, return it, all done. - * (In the "all done" case, skip act2 & act3.) - * If no data available, return a special error + * (In the "all done" case, skip act2 & act3.) 
+ * If no data available, return a special error * ("STATUS_PENDING") to tell the caller they must * proceed with calls to act2 & act3. * @@ -201,6 +201,15 @@ smb_notify_act1(smb_request_t *sr, uint32_t buflen, uint32_t filter) mutex_enter(&of->f_mutex); /* + * It's possible this ofile has started closing, in which case + * we must not subscribe it for events etc. + */ + if (of->f_state != SMB_OFILE_STATE_OPEN) { + mutex_exit(&of->f_mutex); + return (NT_STATUS_FILE_CLOSED); + } + + /* * On the first FCN call with this ofile, subscribe to * events on the node. The corresponding unsubscribe * happens in smb_ofile_delete(). diff --git a/usr/src/uts/common/fs/smbsrv/smb_ofile.c b/usr/src/uts/common/fs/smbsrv/smb_ofile.c index d5388037c3..1d7a5c134f 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_ofile.c +++ b/usr/src/uts/common/fs/smbsrv/smb_ofile.c @@ -22,7 +22,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2016 Syneto S.R.L. All rights reserved. * Copyright (c) 2016 by Delphix. All rights reserved. - * Copyright 2019 Nexenta Systems, Inc. All rights reserved. + * Copyright 2020 Tintri by DDN, Inc. All rights reserved. */ /* @@ -446,10 +446,23 @@ void smb_ofile_close(smb_ofile_t *of, int32_t mtime_sec) { smb_attr_t *pa; - timestruc_t now; SMB_OFILE_VALID(of); + if (of->f_ftype == SMB_FTYPE_DISK) { + smb_node_t *node = of->f_node; + + smb_llist_enter(&node->n_ofile_list, RW_READER); + mutex_enter(&node->n_oplock.ol_mutex); + + if (of->f_lease != NULL) + smb2_lease_ofile_close(of); + smb_oplock_break_CLOSE(node, of); + + mutex_exit(&node->n_oplock.ol_mutex); + smb_llist_exit(&node->n_ofile_list); + } + mutex_enter(&of->f_mutex); ASSERT(of->f_refcnt); @@ -480,9 +493,6 @@ smb_ofile_close(smb_ofile_t *of, int32_t mtime_sec) smb2_dh_close_persistent(of); if (of->f_persistid != 0) smb_ofile_del_persistid(of); - if (of->f_lease != NULL) - smb2_lease_ofile_close(of); - smb_oplock_break_CLOSE(of->f_node, of); /* FALLTHROUGH */ case SMB_FTYPE_PRINTER: /* or FTYPE_DISK */ @@ -498,20 +508,6 @@ smb_ofile_close(smb_ofile_t *of, int32_t mtime_sec) pa->sa_mask |= SMB_AT_MTIME; } - /* - * If we have ever modified data via this handle - * (write or truncate) and if the mtime was not - * set via this handle, update the mtime again - * during the close. Windows expects this. - * [ MS-FSA 2.1.5.4 "Update Timestamps" ] - */ - if (of->f_written && - (pa->sa_mask & SMB_AT_MTIME) == 0) { - pa->sa_mask |= SMB_AT_MTIME; - gethrestime(&now); - pa->sa_vattr.va_mtime = now; - } - if (of->f_flags & SMB_OFLAGS_SET_DELETE_ON_CLOSE) { /* We delete using the on-disk name. */ uint32_t flags = SMB_CASE_SENSITIVE; @@ -1457,11 +1453,18 @@ smb_ofile_delete(void *arg) */ if (of->f_ftype == SMB_FTYPE_DISK || of->f_ftype == SMB_FTYPE_PRINTER) { - ASSERT(of->f_node != NULL); + smb_node_t *node = of->f_node; + + /* + * Oplock cleanup should have made sure that + * excl_open does not point to this ofile. + */ + VERIFY(node->n_oplock.excl_open != of); + /* * Note smb_ofile_close did smb_node_dec_open_ofiles() */ - smb_node_rem_ofile(of->f_node, of); + smb_node_rem_ofile(node, of); } /* diff --git a/usr/src/uts/common/fs/smbsrv/smb_sd.c b/usr/src/uts/common/fs/smbsrv/smb_sd.c index ddbd7b9413..946503fa8f 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_sd.c +++ b/usr/src/uts/common/fs/smbsrv/smb_sd.c @@ -22,7 +22,7 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * - * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 
+ * Copyright 2021 Tintri by DDN, Inc. All rights reserved. */ /* @@ -243,28 +243,52 @@ smb_sd_tofs(smb_sd_t *sd, smb_fssd_t *fs_sd) } } + /* + * In SMB, the 'secinfo' determines which parts of the SD the client + * intends to change. Notably, this includes changing the DACL_PRESENT + * and SACL_PRESENT control bits. The client can specify e.g. + * SACL_SECINFO, but not SACL_PRESENT, and this means the client intends + * to remove the SACL. + * + * Note that Windows behavior differs from that described in [MS-DTYP]. + * MS-DTYP states that the offset is nonzero if-and-only-if the PRESENT + * bit is set. It also states that a DACL that is marked non-present + * is equivalent to 'no security', but one that is marked present and + * provides no ACEs is equivalent to 'no access'. + * + * Windows, on the other hand, allows the offset to be 0 even when + * the PRESENT bit is set, and only provides security when the DACL + * offset is non-zero. It will also convert an SD where the DACL is + * marked not-present to one where the PRESENT bit is set and the + * offset is 0. + * + * If the *_PRESENT bit isn't set, then the respective ACL will be NULL. + * For the fssd, we allow the SACL to be NULL, but we MUST have a DACL. + * If the DACL is NULL, that's equivalent to "everyone:full_set:allow". + * + * The IMPLY's should be enforced by smb_decode_sd(). + */ + /* DACL */ if (fs_sd->sd_secinfo & SMB_DACL_SECINFO) { - if (sd->sd_control & SE_DACL_PRESENT) { - status = smb_acl_to_zfs(sd->sd_dacl, flags, - SMB_DACL_SECINFO, &fs_sd->sd_zdacl); - if (status != NT_STATUS_SUCCESS) - return (status); - } - else - return (NT_STATUS_INVALID_ACL); + IMPLY(sd->sd_dacl != NULL, + (sd->sd_control & SE_DACL_PRESENT) != 0); + status = smb_acl_to_zfs(sd->sd_dacl, flags, + SMB_DACL_SECINFO, &fs_sd->sd_zdacl); + if (status != NT_STATUS_SUCCESS) + return (status); } /* SACL */ if (fs_sd->sd_secinfo & SMB_SACL_SECINFO) { + IMPLY(sd->sd_sacl != NULL, + (sd->sd_control & SE_SACL_PRESENT) != 0); if (sd->sd_control & SE_SACL_PRESENT) { status = smb_acl_to_zfs(sd->sd_sacl, flags, SMB_SACL_SECINFO, &fs_sd->sd_zsacl); if (status != NT_STATUS_SUCCESS) { return (status); } - } else { - return (NT_STATUS_INVALID_ACL); } } diff --git a/usr/src/uts/common/fs/smbsrv/smb_server.c b/usr/src/uts/common/fs/smbsrv/smb_server.c index 13df16f55d..3b69a5699b 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_server.c +++ b/usr/src/uts/common/fs/smbsrv/smb_server.c @@ -22,6 +22,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2017 by Delphix. All rights reserved. * Copyright 2019 Nexenta by DDN, Inc. All rights reserved. + * Copyright 2020 RackTop Systems, Inc. */ /* @@ -2094,6 +2095,7 @@ smb_server_store_cfg(smb_server_t *sv, smb_ioc_cfg_t *ioc) sv->sv_cfg.skc_max_protocol = ioc->max_protocol; sv->sv_cfg.skc_min_protocol = ioc->min_protocol; sv->sv_cfg.skc_encrypt = ioc->encrypt; + sv->sv_cfg.skc_encrypt_cipher = ioc->encrypt_cipher; sv->sv_cfg.skc_execflags = ioc->exec_flags; sv->sv_cfg.skc_negtok_len = ioc->negtok_len; sv->sv_cfg.skc_version = ioc->version; diff --git a/usr/src/uts/common/fs/smbsrv/smb_session.c b/usr/src/uts/common/fs/smbsrv/smb_session.c index 17bbc16e72..6739fee326 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_session.c +++ b/usr/src/uts/common/fs/smbsrv/smb_session.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2019 Nexenta Systems, Inc. All rights reserved. + * Copyright 2020 RackTop Systems, Inc. 
*/ #include <sys/atomic.h> @@ -857,6 +858,9 @@ smb_session_delete(smb_session_t *session) session->signing.mackey_len); } + if (session->preauth_mech != NULL) + smb31_preauth_fini(session); + session->s_magic = 0; smb_rwx_destroy(&session->s_lock); diff --git a/usr/src/uts/common/fs/smbsrv/smb_sign_kcf.c b/usr/src/uts/common/fs/smbsrv/smb_sign_kcf.c index 55f4bc9d0e..44aa6ba117 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_sign_kcf.c +++ b/usr/src/uts/common/fs/smbsrv/smb_sign_kcf.c @@ -32,7 +32,7 @@ * Common function to see if a mech is available. */ static int -find_mech(smb_crypto_mech_t *mech, crypto_mech_name_t name) +find_mech(smb_crypto_mech_t *mech, const char *name) { crypto_mech_type_t t; diff --git a/usr/src/uts/common/fs/smbsrv/smb_srv_oplock.c b/usr/src/uts/common/fs/smbsrv/smb_srv_oplock.c index 12d425d438..d4811f6857 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_srv_oplock.c +++ b/usr/src/uts/common/fs/smbsrv/smb_srv_oplock.c @@ -10,7 +10,7 @@ */ /* - * Copyright 2019 Nexenta by DDN, Inc. All rights reserved. + * Copyright 2021 Tintri by DDN, Inc. All rights reserved. */ /* @@ -173,7 +173,8 @@ smb_oplock_ind_break_in_ack(smb_request_t *ack_sr, smb_ofile_t *ofile, * We're going to schedule a request that will have a * reference to this ofile. Get the hold first. */ - if (!smb_ofile_hold_olbrk(ofile)) { + if (ofile->f_oplock.og_closing || + !smb_ofile_hold_olbrk(ofile)) { /* It's closing (or whatever). Nothing to do. */ return; } @@ -264,7 +265,8 @@ smb_oplock_ind_break(smb_ofile_t *ofile, uint32_t NewLevel, * We're going to schedule a request that will have a * reference to this ofile. Get the hold first. */ - if (!smb_ofile_hold_olbrk(ofile)) { + if (ofile->f_oplock.og_closing || + !smb_ofile_hold_olbrk(ofile)) { /* It's closing (or whatever). Nothing to do. */ return; } diff --git a/usr/src/uts/common/fs/smbsrv/smb_tree.c b/usr/src/uts/common/fs/smbsrv/smb_tree.c index aedacf2123..45f381ffb1 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_tree.c +++ b/usr/src/uts/common/fs/smbsrv/smb_tree.c @@ -21,8 +21,8 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2018 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2021 Tintri by DDN, Inc. All rights reserved. 
*/ /* @@ -188,6 +188,7 @@ static void smb_tree_dealloc(void *); static boolean_t smb_tree_is_connected_locked(smb_tree_t *); static char *smb_tree_get_sharename(char *); static int smb_tree_getattr(const smb_kshare_t *, smb_node_t *, smb_tree_t *); +static void smb_tree_get_creation(smb_node_t *, smb_tree_t *); static void smb_tree_get_volname(vfs_t *, smb_tree_t *); static void smb_tree_get_flags(const smb_kshare_t *, vfs_t *, smb_tree_t *); static void smb_tree_log(smb_request_t *, const char *, const char *, ...); @@ -917,10 +918,6 @@ smb_tree_alloc(smb_request_t *sr, const smb_kshare_t *si, tree->t_session = session; tree->t_server = session->s_server; - /* grab a ref for tree->t_owner */ - smb_user_hold_internal(sr->uid_user); - tree->t_owner = sr->uid_user; - if (STYPE_ISDSK(stype) || STYPE_ISPRN(stype)) { if (smb_tree_getattr(si, snode, tree) != 0) { smb_idpool_free(&session->s_tid_pool, tid); @@ -964,6 +961,10 @@ smb_tree_alloc(smb_request_t *sr, const smb_kshare_t *si, tree->t_connect_time = gethrestime_sec(); tree->t_execflags = execflags; + /* grab a ref for tree->t_owner */ + smb_user_hold_internal(sr->uid_user); + tree->t_owner = sr->uid_user; + /* if FS is readonly, enforce that here */ if (tree->t_flags & SMB_TREE_READONLY) tree->t_access &= ~ACE_ALL_WRITE_PERMS; @@ -1099,15 +1100,29 @@ static int smb_tree_getattr(const smb_kshare_t *si, smb_node_t *node, smb_tree_t *tree) { vfs_t *vfsp = SMB_NODE_VFS(node); + vfs_t *realvfsp; smb_cfg_val_t srv_encrypt; ASSERT(vfsp); - if (getvfs(&vfsp->vfs_fsid) != vfsp) - return (ESTALE); - + smb_tree_get_creation(node, tree); smb_tree_get_volname(vfsp, tree); - smb_tree_get_flags(si, vfsp, tree); + + /* + * In the case of an lofs mount, we need to ask the (real) + * underlying filesystem about capabilities, where the + * passed in vfs_t will be from lofs. + */ + realvfsp = getvfs(&vfsp->vfs_fsid); + if (realvfsp != NULL) { + smb_tree_get_flags(si, realvfsp, tree); + VFS_RELE(realvfsp); + } else { + cmn_err(CE_NOTE, "Failed getting info for share: %s", + si->shr_name); + /* do the best we can without realvfsp */ + smb_tree_get_flags(si, vfsp, tree); + } srv_encrypt = tree->t_session->s_server->sv_cfg.skc_encrypt; if (tree->t_session->dialect >= SMB_VERS_3_0) { @@ -1122,11 +1137,27 @@ smb_tree_getattr(const smb_kshare_t *si, smb_node_t *node, smb_tree_t *tree) } else tree->t_encrypt = SMB_CONFIG_DISABLED; - VFS_RELE(vfsp); return (0); } /* + * File volume creation time + */ +static void +smb_tree_get_creation(smb_node_t *node, smb_tree_t *tree) +{ + smb_attr_t attr; + cred_t *kcr = zone_kcred(); + + bzero(&attr, sizeof (attr)); + attr.sa_mask = SMB_AT_CRTIME; + (void) smb_node_getattr(NULL, node, kcr, NULL, &attr); + /* On failure we'll have time zero, which is OK */ + + tree->t_create_time = attr.sa_crtime; +} + +/* * Extract the volume name. 
*/ static void diff --git a/usr/src/uts/common/fs/smbsrv/smb_user.c b/usr/src/uts/common/fs/smbsrv/smb_user.c index b46cad1b6f..8934a213eb 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_user.c +++ b/usr/src/uts/common/fs/smbsrv/smb_user.c @@ -205,6 +205,8 @@ #include <sys/types.h> #include <sys/sid.h> #include <sys/priv_names.h> +#include <sys/priv.h> +#include <sys/policy.h> #include <smbsrv/smb_kproto.h> #include <smbsrv/smb_door.h> @@ -831,6 +833,45 @@ smb_user_setcred(smb_user_t *user, cred_t *cr, uint32_t privileges) #endif /* _KERNEL */ /* + * Determines whether a user can be granted ACCESS_SYSTEM_SECURITY + */ +boolean_t +smb_user_has_security_priv(smb_user_t *user, cred_t *cr) +{ + /* Need SeSecurityPrivilege to get/set SACL */ + if ((user->u_privileges & SMB_USER_PRIV_SECURITY) != 0) + return (B_TRUE); + +#ifdef _KERNEL + /* + * ACCESS_SYSTEM_SECURITY is also granted if the file is opened with + * BACKUP/RESTORE intent by a user with BACKUP/RESTORE privilege, + * which means we'll be using u_privcred. + * + * We translate BACKUP as DAC_READ and RESTORE as DAC_WRITE, + * to account for our various SMB_USER_* privileges. + */ + if (PRIV_POLICY_ONLY(cr, + priv_getbyname(PRIV_FILE_DAC_READ, 0), B_FALSE) || + PRIV_POLICY_ONLY(cr, + priv_getbyname(PRIV_FILE_DAC_WRITE, 0), B_FALSE)) + return (B_TRUE); +#else + /* + * No "real" privileges in fksmbsrv, so use the SMB privs instead. + */ + if ((user->u_privileges & + (SMB_USER_PRIV_BACKUP | + SMB_USER_PRIV_RESTORE | + SMB_USER_PRIV_READ_FILE | + SMB_USER_PRIV_WRITE_FILE)) != 0) + return (B_TRUE); +#endif + + return (B_FALSE); +} + +/* * Private function to support smb_user_enum. */ static int @@ -959,6 +1000,9 @@ smb_is_same_user(cred_t *cr1, cred_t *cr2) ksid_t *ks1 = crgetsid(cr1, KSID_USER); ksid_t *ks2 = crgetsid(cr2, KSID_USER); + if (ks1 == NULL || ks2 == NULL) { + return (B_FALSE); + } return (ks1->ks_rid == ks2->ks_rid && strcmp(ks1->ks_domain->kd_name, ks2->ks_domain->kd_name) == 0); } diff --git a/usr/src/uts/common/fs/smbsrv/smb_write.c b/usr/src/uts/common/fs/smbsrv/smb_write.c index 6db8cc9e1a..fbf85da282 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_write.c +++ b/usr/src/uts/common/fs/smbsrv/smb_write.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + * Copyright 2020 Tintri by DDN, Inc. All rights reserved. */ #include <sys/sdt.h> @@ -517,17 +517,6 @@ smb_common_write(smb_request_t *sr, smb_rw_param_t *param) if (rc) return (rc); - /* - * Used to have code here to set mtime. - * We have just done a write, so we know - * the file system will update mtime. - * No need to do it again here. - * - * However, keep track of the fact that - * we have written data via this handle. - */ - ofile->f_written = B_TRUE; - /* This revokes read cache delegations. */ (void) smb_oplock_break_WRITE(node, ofile); diff --git a/usr/src/uts/common/fs/sockfs/nl7c.c b/usr/src/uts/common/fs/sockfs/nl7c.c index c76dada8d7..a71572cbd4 100644 --- a/usr/src/uts/common/fs/sockfs/nl7c.c +++ b/usr/src/uts/common/fs/sockfs/nl7c.c @@ -598,7 +598,7 @@ done: /* * Open and read each line from "/etc/nca/ncalogd.conf" and parse for - * the tokens and token text (i.e. key and value ncalogd.conf(4)): + * the tokens and token text (i.e. 
key and value ncalogd.conf(5)): * * status=enabled * diff --git a/usr/src/uts/common/fs/sockfs/nl7clogd.c b/usr/src/uts/common/fs/sockfs/nl7clogd.c index 1580a08c6c..4dd40abf2d 100644 --- a/usr/src/uts/common/fs/sockfs/nl7clogd.c +++ b/usr/src/uts/common/fs/sockfs/nl7clogd.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/sysmacros.h> #include <sys/callb.h> #include <sys/fcntl.h> @@ -53,7 +51,7 @@ static void logit_flush(void *); * NL7C reuses the NCA logging scheme, the directory "/var/nca" contains * the symlink "current" to 1 of up to 16 NCA BLF logging files, by default * a single logging file "log", optionally paths of up to 16 log files can - * be specified via ncalogd.conf(4), note that these log files need not be + * be specified via ncalogd.conf(5), note that these log files need not be * in the "/var/nca" directory. * * NL7C reuses the NCA logging APIs defined in <inet/nca/ncalogd.h>, at diff --git a/usr/src/uts/common/fs/sockfs/sockcommon.c b/usr/src/uts/common/fs/sockfs/sockcommon.c index e7d69f9896..edcb41951c 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon.c @@ -458,16 +458,16 @@ sonode_constructor(void *buf, void *cdrarg, int kmflags) vp->v_data = so; vn_setops(vp, socket_vnodeops); - so->so_priv = NULL; + so->so_priv = NULL; so->so_oobmsg = NULL; so->so_proto_handle = NULL; - so->so_peercred = NULL; + so->so_peercred = NULL; so->so_rcv_queued = 0; - so->so_rcv_q_head = NULL; - so->so_rcv_q_last_head = NULL; + so->so_rcv_q_head = NULL; + so->so_rcv_q_last_head = NULL; so->so_rcv_head = NULL; so->so_rcv_last_head = NULL; so->so_rcv_wanted = 0; diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c index bc2878ccc8..59d052084f 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c @@ -174,7 +174,7 @@ so_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen, /* * Force a zero sa_family to match so_family. * - * Some programs like inetd(1M) don't set the + * Some programs like inetd(8) don't set the * family field. Other programs leave * sin_family set to garbage - SunOS 4.X does * not check the family field on a bind. diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_vnops.c b/usr/src/uts/common/fs/sockfs/sockcommon_vnops.c index d49bdbcc6d..532a24c223 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_vnops.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_vnops.c @@ -25,6 +25,7 @@ */ /* * Copyright (c) 2017 by Delphix. All rights reserved. + * Copyright 2021 Racktop Systems, Inc. 
*/ #include <sys/types.h> @@ -68,19 +69,19 @@ static int socket_vop_ioctl(struct vnode *, int, intptr_t, int, struct cred *, int32_t *, caller_context_t *); static int socket_vop_setfl(struct vnode *, int, int, cred_t *, caller_context_t *); -static int socket_vop_getattr(struct vnode *, struct vattr *, int, +static int socket_vop_getattr(struct vnode *, struct vattr *, int, struct cred *, caller_context_t *); -static int socket_vop_setattr(struct vnode *, struct vattr *, int, +static int socket_vop_setattr(struct vnode *, struct vattr *, int, struct cred *, caller_context_t *); -static int socket_vop_access(struct vnode *, int, int, struct cred *, +static int socket_vop_access(struct vnode *, int, int, struct cred *, caller_context_t *); -static int socket_vop_fsync(struct vnode *, int, struct cred *, +static int socket_vop_fsync(struct vnode *, int, struct cred *, caller_context_t *); static void socket_vop_inactive(struct vnode *, struct cred *, caller_context_t *); -static int socket_vop_fid(struct vnode *, struct fid *, +static int socket_vop_fid(struct vnode *, struct fid *, caller_context_t *); -static int socket_vop_seek(struct vnode *, offset_t, offset_t *, +static int socket_vop_seek(struct vnode *, offset_t, offset_t *, caller_context_t *); static int socket_vop_poll(struct vnode *, short, int, short *, struct pollhead **, caller_context_t *); @@ -282,16 +283,23 @@ socket_vop_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cr, caller_context_t *ct) { dev_t fsid; - struct sonode *so; + struct sonode *so; static int sonode_shift = 0; /* * Calculate the amount of bitshift to a sonode pointer which will - * still keep it unique. See below. + * still keep it unique. See below. Note that highbit() uses + * 1-based indexing for the highest bit set (and 0 for 'no bits set'). + * To use the result of highbit() as a shift value, we must subtract 1 + * from the result. */ - if (sonode_shift == 0) - sonode_shift = highbit(sizeof (struct sonode)); - ASSERT(sonode_shift > 0); + if (sonode_shift == 0) { + int bit = highbit(sizeof (struct sonode)); + + /* Sanity check */ + VERIFY3S(bit, >, 0); + sonode_shift = bit - 1; + } so = VTOSO(vp); fsid = sockdev; @@ -311,11 +319,17 @@ socket_vop_getattr(struct vnode *vp, struct vattr *vap, int flags, vap->va_uid = vap->va_gid = 0; vap->va_fsid = fsid; /* - * If the va_nodeid is > MAX_USHORT, then i386 stats might fail. - * So we shift down the sonode pointer to try and get the most - * uniqueness into 16-bits. + * If the va_nodeid is > UINT32_MAX, then stat(2) might fail in + * unexpected ways inside non-largefile aware 32-bit processes -- + * historically, socket inode values (va_nodeid values) were capped at + * UINT16_MAX (for even more ancient reasons long since unnecessary). + * To avoid the potential of surprise failures, we shift down + * the sonode pointer address to try and get the most + * uniqueness into 32-bits. In practice, this represents the unique + * portion of the kernel address space, so the chance of duplicate + * socket inode values is minimized. 
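The sonode_shift computation described above rewards a worked example. The following is a sketch only, assuming an invented sizeof (struct sonode) of 768 bytes purely for illustration; the actual va_nodeid change follows below.

	/*
	 * Sketch, not from the patch. With sizeof (struct sonode) == 768
	 * (0x300):
	 *
	 *   highbit(768) == 10       1-based index of the highest set bit
	 *   shift == 10 - 1 == 9     i.e. floor(log2(768))
	 *
	 * Distinct sonodes are at least 768 bytes apart, and 768 > 2^9,
	 * so shifting their addresses right by 9 still yields distinct
	 * values going into the final 32-bit mask.
	 */
	int bit = highbit(sizeof (struct sonode));	/* e.g. 10 */
	int shift = bit - 1;				/* e.g. 9 */
	ino_t id = ((ino_t)so >> shift) & 0xFFFFFFFF;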
*/ - vap->va_nodeid = ((ino_t)so >> sonode_shift) & 0xFFFF; + vap->va_nodeid = ((ino_t)so >> sonode_shift) & 0xFFFFFFFF; vap->va_nlink = 0; vap->va_size = 0; diff --git a/usr/src/uts/common/fs/sockfs/sockfilter.c b/usr/src/uts/common/fs/sockfs/sockfilter.c index ea161e30ae..62a079f419 100644 --- a/usr/src/uts/common/fs/sockfs/sockfilter.c +++ b/usr/src/uts/common/fs/sockfs/sockfilter.c @@ -42,7 +42,7 @@ * * Socket filter entry (sof_entry_t): * - * There exists one entry for each configured filter (done via soconfig(1M)), + * There exists one entry for each configured filter (done via soconfig(8)), * and they are all in sof_entry_list. In addition to the global list, each * sockparams entry maintains a list of filters that is interested in that * particular socket type. So the filter entry may be referenced by multiple diff --git a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h index cf2ad8b20d..e63831e172 100644 --- a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h +++ b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h @@ -80,7 +80,7 @@ struct sof_entry_kstat { /* * Socket filter entry - one for each configured filter (added and - * removed by soconfig(1M)). + * removed by soconfig(8)). * * sofe_flags, sofe_refcnt and sofe_mod are protected by sofe_lock, and all * other fields are write once. @@ -106,7 +106,7 @@ struct sof_entry { /* Filter entry flags */ #define SOFEF_AUTO 0x1 /* automatic filter */ #define SOFEF_PROG 0x2 /* programmatic filter */ -#define SOFEF_CONDEMED 0x4 /* removed by soconfig(1M) */ +#define SOFEF_CONDEMED 0x4 /* removed by soconfig(8) */ /* * Socket filter instance - one for each socket using a sof_entry_t diff --git a/usr/src/uts/common/fs/sockfs/sockparams.c b/usr/src/uts/common/fs/sockfs/sockparams.c index 1015decaac..86cbced50c 100644 --- a/usr/src/uts/common/fs/sockfs/sockparams.c +++ b/usr/src/uts/common/fs/sockfs/sockparams.c @@ -64,7 +64,7 @@ static int sockparams_sdev_init(struct sockparams *, char *, int); static void sockparams_sdev_fini(struct sockparams *); /* - * Global sockparams list (populated via soconfig(1M)). + * Global sockparams list (populated via soconfig(8)). */ static list_t sphead; diff --git a/usr/src/uts/common/fs/sockfs/socksubr.c b/usr/src/uts/common/fs/sockfs/socksubr.c index 2c010343bb..e686978fd0 100644 --- a/usr/src/uts/common/fs/sockfs/socksubr.c +++ b/usr/src/uts/common/fs/sockfs/socksubr.c @@ -23,8 +23,8 @@ * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2015, Joyent, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. - * Copyright 2019 OmniOS Community Edition (OmniOSce) Association. * Copyright 2015, Joyent, Inc. All rights reserved. + * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. */ #include <sys/types.h> @@ -962,7 +962,46 @@ so_closefds(void *control, t_uscalar_t controllen, int oldflg, (int)CMSG_CONTENTLEN(cmsg), startoff - (int)sizeof (struct cmsghdr)); } - startoff -= cmsg->cmsg_len; + startoff -= ROUNDUP_cmsglen(cmsg->cmsg_len); + } +} + +/* + * Handle truncation of a cmsg when the receive buffer is not big enough. + * Adjust the cmsg_len header field in the last cmsg that will be included in + * the buffer to reflect the number of bytes included. 
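Both the ROUNDUP_cmsglen() walk above and the truncation helper introduced next depend on the difference between a cmsg's declared length and its padded footprint. A minimal sketch using the standard cmsg macros (not code from this diff):

	#include <sys/types.h>
	#include <sys/socket.h>

	/*
	 * Illustrative only: a one-byte payload (e.g. a uint8_t socket
	 * option) declares cmsg_len == CMSG_LEN(1), yet the next cmsghdr
	 * begins CMSG_SPACE(1) bytes in, after the trailing alignment
	 * padding. Walking or truncating the control buffer by the raw
	 * cmsg_len would drift off the cmsghdr boundaries.
	 */
	static void
	cmsg_len_demo(void)
	{
		size_t declared = CMSG_LEN(sizeof (uint8_t));	/* hdr + 1 */
		size_t occupied = CMSG_SPACE(sizeof (uint8_t));	/* hdr + padded 1 */

		/* occupied >= declared; successive cmsgs sit CMSG_SPACE() apart */
		(void) declared;
		(void) occupied;
	}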
+ */ +void +so_truncatecmsg(void *control, t_uscalar_t controllen, uint_t maxlen) +{ + struct cmsghdr *cmsg; + uint_t len = 0; + + if (control == NULL) + return; + + for (cmsg = control; + CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); + cmsg = CMSG_NEXT(cmsg)) { + + len += ROUNDUP_cmsglen(cmsg->cmsg_len); + + if (len > maxlen) { + /* + * This cmsg is the last one that will be included in + * the truncated buffer. + */ + socklen_t diff = len - maxlen; + + if (diff < CMSG_CONTENTLEN(cmsg)) { + dprint(1, ("so_truncatecmsg: %d -> %d\n", + cmsg->cmsg_len, cmsg->cmsg_len - diff)); + cmsg->cmsg_len -= diff; + } else { + cmsg->cmsg_len = sizeof (struct cmsghdr); + } + break; + } } } @@ -1282,8 +1321,24 @@ so_opt2cmsg(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg, cmsg->cmsg_level = tohp->level; cmsg->cmsg_type = tohp->name; - cmsg->cmsg_len = (socklen_t)(_TPI_TOPT_DATALEN(tohp) + - sizeof (struct cmsghdr)); + cmsg->cmsg_len = (socklen_t)sizeof (struct cmsghdr); + if (tohp->level == IPPROTO_IP && + (tohp->name == IP_RECVTOS || + tohp->name == IP_RECVTTL)) { + /* + * The data for these is a uint8_t but, in + * order to maintain alignment for any + * following TPI primitives in the message, + * there will be some trailing padding bytes + * which are included in the TPI_TOPT_DATALEN. + * For these types, we set the cmsg_len + * explicitly to the correct value. + */ + cmsg->cmsg_len += (socklen_t)sizeof (uint8_t); + } else { + cmsg->cmsg_len += + (socklen_t)(_TPI_TOPT_DATALEN(tohp)); + } /* copy content to control data part */ bcopy(&tohp[1], CMSG_CONTENT(cmsg), diff --git a/usr/src/uts/common/fs/sockfs/socksyscalls.c b/usr/src/uts/common/fs/sockfs/socksyscalls.c index 6a049b1828..30666f73ca 100644 --- a/usr/src/uts/common/fs/sockfs/socksyscalls.c +++ b/usr/src/uts/common/fs/sockfs/socksyscalls.c @@ -24,6 +24,7 @@ * Copyright 2015, Joyent, Inc. All rights reserved. * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. */ #include <sys/types.h> @@ -831,7 +832,7 @@ recvit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags, void *name; socklen_t namelen; void *control; - socklen_t controllen; + socklen_t controllen, free_controllen; ssize_t len; int error; @@ -858,6 +859,8 @@ recvit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags, lwp_stat_update(LWP_STAT_MSGRCV, 1); releasef(sock); + free_controllen = msg->msg_controllen; + error = copyout_name(name, namelen, namelenp, msg->msg_name, msg->msg_namelen); if (error) @@ -887,11 +890,7 @@ recvit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags, goto err; } } - /* - * Note: This MUST be done last. There can be no "goto err" after this - * point since it could make so_closefds run twice on some part - * of the file descriptor array. - */ + if (controllen != 0) { if (!(flags & MSG_XPG4_2)) { /* @@ -900,36 +899,65 @@ recvit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags, */ controllen &= ~((int)sizeof (uint32_t) - 1); } + + if (msg->msg_controllen > controllen || control == NULL) { + /* + * If the truncated part contains file descriptors, + * then they must be closed in the kernel as they + * will not be included in the data returned to + * user space. Close them now so that the header size + * can be safely adjusted prior to copyout. In case of + * an error during copyout, the remaining file + * descriptors will be closed in the error handler + * below. 
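From userland, the recvit() rework above surfaces only as saner truncation behavior: cut-off SCM_RIGHTS descriptors no longer leak, and the last control message that fits carries an accurate length. A minimal sketch of the receive side, assuming only the standard sockets API (socket setup and most error handling elided):

	#include <sys/types.h>
	#include <sys/socket.h>
	#include <sys/uio.h>
	#include <string.h>

	/* Sketch only: receive one descriptor, noticing truncation. */
	static int
	recv_one_fd(int sock)
	{
		char data[1];
		char cbuf[CMSG_SPACE(sizeof (int))];	/* room for one fd */
		struct iovec iov = { .iov_base = data, .iov_len = sizeof (data) };
		struct msghdr msg;
		struct cmsghdr *cm;
		int fd = -1;

		(void) memset(&msg, 0, sizeof (msg));
		msg.msg_iov = &iov;
		msg.msg_iovlen = 1;
		msg.msg_control = cbuf;
		msg.msg_controllen = sizeof (cbuf);

		if (recvmsg(sock, &msg, 0) < 0)
			return (-1);
		if (msg.msg_flags & MSG_CTRUNC) {
			/* more ancillary data was sent than cbuf could hold */
		}
		for (cm = CMSG_FIRSTHDR(&msg); cm != NULL;
		    cm = CMSG_NXTHDR(&msg, cm)) {
			if (cm->cmsg_level == SOL_SOCKET &&
			    cm->cmsg_type == SCM_RIGHTS)
				(void) memcpy(&fd, CMSG_DATA(cm), sizeof (fd));
		}
		return (fd);
	}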
+ */ + so_closefds(msg->msg_control, msg->msg_controllen, + !(flags & MSG_XPG4_2), + control == NULL ? 0 : controllen); + + /* + * In the case of a truncated control message, the last + * cmsg header that fits into the available buffer + * space must be adjusted to reflect the actual amount + * of associated data that will be returned. This only + * needs to be done for XPG4 messages as non-XPG4 + * messages are not structured (they are just a + * buffer and a length - msg_accrights(len)). + */ + if (control != NULL && (flags & MSG_XPG4_2)) { + so_truncatecmsg(msg->msg_control, + msg->msg_controllen, controllen); + msg->msg_controllen = controllen; + } + } + error = copyout_arg(control, controllen, controllenp, msg->msg_control, msg->msg_controllen); + if (error) goto err; - if (msg->msg_controllen > controllen || control == NULL) { - if (control == NULL) - controllen = 0; - so_closefds(msg->msg_control, msg->msg_controllen, - !(flags & MSG_XPG4_2), controllen); - } } if (msg->msg_namelen != 0) kmem_free(msg->msg_name, (size_t)msg->msg_namelen); - if (msg->msg_controllen != 0) - kmem_free(msg->msg_control, (size_t)msg->msg_controllen); + if (free_controllen != 0) + kmem_free(msg->msg_control, (size_t)free_controllen); return (len - uiop->uio_resid); err: /* * If we fail and the control part contains file descriptors - * we have to close the fd's. + * we have to close them. For a truncated control message, the + * descriptors which were cut off have already been closed and the + * length adjusted so that they will not be closed again. */ if (msg->msg_controllen != 0) so_closefds(msg->msg_control, msg->msg_controllen, !(flags & MSG_XPG4_2), 0); if (msg->msg_namelen != 0) kmem_free(msg->msg_name, (size_t)msg->msg_namelen); - if (msg->msg_controllen != 0) - kmem_free(msg->msg_control, (size_t)msg->msg_controllen); + if (free_controllen != 0) + kmem_free(msg->msg_control, (size_t)free_controllen); return (set_errno(error)); } diff --git a/usr/src/uts/common/fs/sockfs/socktpi.c b/usr/src/uts/common/fs/sockfs/socktpi.c index b8d83105e8..0e9883498b 100644 --- a/usr/src/uts/common/fs/sockfs/socktpi.c +++ b/usr/src/uts/common/fs/sockfs/socktpi.c @@ -865,7 +865,7 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name, /* * Force a zero sa_family to match so_family. * - * Some programs like inetd(1M) don't set the + * Some programs like inetd(8) don't set the * family field. Other programs leave * sin_family set to garbage - SunOS 4.X does * not check the family field on a bind. @@ -6518,7 +6518,7 @@ socktpi_init(void) { /* * Create sonode caches. We create a special one for AF_UNIX so - * that we can track them for netstat(1m). + * that we can track them for netstat(8). */ socktpi_cache = kmem_cache_create("socktpi_cache", sizeof (struct sotpi_sonode), 0, socktpi_constructor, diff --git a/usr/src/uts/common/fs/tmpfs/tmp_dir.c b/usr/src/uts/common/fs/tmpfs/tmp_dir.c index 1a620642cc..b28ced7111 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_dir.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_dir.c @@ -54,6 +54,11 @@ static int tdiraddentry(struct tmpnode *, struct tmpnode *, char *, #define T_HASH_SIZE 8192 /* must be power of 2 */ #define T_MUTEX_SIZE 64 +/* Non-static so compilers won't constant-fold these away. 
*/ +clock_t tmpfs_rename_backoff_delay = 1; +unsigned int tmpfs_rename_backoff_tries = 0; +unsigned long tmpfs_rename_loops = 0; + static struct tdirent *t_hashtable[T_HASH_SIZE]; static kmutex_t t_hashmutex[T_MUTEX_SIZE]; @@ -266,8 +271,65 @@ tdirenter( * to see if it has been removed while it was unlocked. */ if (op == DE_LINK || op == DE_RENAME) { - if (tp != dir) - rw_enter(&tp->tn_rwlock, RW_WRITER); + if (tp != dir) { + unsigned int tries = 0; + + /* + * If we are acquiring tp->tn_rwlock (for SOURCE) + * inside here, we must consider the following: + * + * - dir->tn_rwlock (TARGET) is already HELD (see + * above ASSERT()). + * + * - It is possible our SOURCE is a parent of our + * TARGET. Yes it's unusual, but it will return an + * error below via tdircheckpath(). + * + * - It is also possible that another thread, + * concurrent to this one, is performing + * rmdir(TARGET), which means it will first acquire + * SOURCE's lock, THEN acquire TARGET's lock, which + * could result in this thread holding TARGET and + * trying for SOURCE, but the other thread holding + * SOURCE and trying for TARGET. This is deadlock, + * and it's inducible. + * + * To prevent this, we borrow some techniques from UFS + * and rw_tryenter(), delaying if we fail, and + * if someone tweaks the number of backoff tries to be + * nonzero, return EBUSY after that number of tries. + */ + while (!rw_tryenter(&tp->tn_rwlock, RW_WRITER)) { + /* + * Sloppy, but this is a diagnostic so atomic + * increment would be overkill. + */ + tmpfs_rename_loops++; + + if (tmpfs_rename_backoff_tries != 0) { + if (tries > tmpfs_rename_backoff_tries) + return (EBUSY); + tries++; + } + /* + * NOTE: We're still holding dir->tn_rwlock, + * so drop it over the delay, so any other + * thread can get its business done. + * + * No state change or state inspection happens + * prior to here, so it is not wholly dangerous + * to release-and-reacquire dir->tn_rwlock. + * + * Hold the vnode of dir in case it gets + * released by another thread, though. 
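The deadlock narrated above is the classic two-lock cycle: rename holds TARGET and wants SOURCE while a concurrent rmdir holds SOURCE and wants TARGET. A userland analogue of the drop-and-retry escape, as a sketch with hypothetical names (the actual kernel loop follows):

	#include <pthread.h>
	#include <unistd.h>

	/*
	 * Sketch only: the caller already holds `held` and must not block
	 * on `wanted`, or it can deadlock against a thread locking in the
	 * opposite order. Try the lock; on failure drop what we hold,
	 * pause briefly, reacquire, and retry.
	 */
	static void
	lock_second(pthread_mutex_t *held, pthread_mutex_t *wanted)
	{
		while (pthread_mutex_trylock(wanted) != 0) {
			pthread_mutex_unlock(held);	/* let the peer run */
			(void) usleep(1000);		/* brief backoff */
			pthread_mutex_lock(held);	/* reacquire, retry */
		}
	}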
+ */ + VN_HOLD(TNTOV(dir)); + rw_exit(&dir->tn_rwlock); + delay(tmpfs_rename_backoff_delay); + rw_enter(&dir->tn_rwlock, RW_WRITER); + VN_RELE(TNTOV(dir)); + } + } mutex_enter(&tp->tn_tlock); if (tp->tn_nlink == 0) { mutex_exit(&tp->tn_tlock); @@ -928,7 +990,7 @@ tdiraddentry( tm = TNTOTM(dir); namelen = strlen(name) + 1; alloc_size = namelen + sizeof (struct tdirent); - tdp = tmp_kmem_zalloc(tm, alloc_size, KM_NOSLEEP | KM_NORMALPRI); + tdp = tmp_kmem_zalloc(tm, alloc_size, KM_NOSLEEP_LAZY); if (tdp == NULL) return (ENOSPC); diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c index c52a6f7c77..24310fefe5 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c @@ -325,8 +325,8 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) goto out; } - if ((tm = kmem_zalloc(sizeof (struct tmount), - KM_NOSLEEP | KM_NORMALPRI)) == NULL) { + if ((tm = kmem_zalloc(sizeof (struct tmount), KM_NOSLEEP_LAZY)) == + NULL) { pn_free(&dpn); error = ENOMEM; goto out; diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c index a356f22750..cbe19aefea 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c @@ -1645,7 +1645,7 @@ tmp_symlink( return (error); } len = strlen(tnm) + 1; - cp = tmp_kmem_zalloc(tm, len, KM_NOSLEEP | KM_NORMALPRI); + cp = tmp_kmem_zalloc(tm, len, KM_NOSLEEP_LAZY); if (cp == NULL) { tmpnode_rele(self); return (ENOSPC); diff --git a/usr/src/uts/common/fs/ufs/lufs_log.c b/usr/src/uts/common/fs/ufs/lufs_log.c index 2ec3f7907c..052c53d507 100644 --- a/usr/src/uts/common/fs/ufs/lufs_log.c +++ b/usr/src/uts/common/fs/ufs/lufs_log.c @@ -1591,7 +1591,7 @@ ldl_seterror(ml_unit_t *ul, char *why) cmn_err(CE_WARN, "%s", why); cmn_err(CE_WARN, "ufs log for %s changed state to Error", ul->un_ufsvfs->vfs_fs->fs_fsmnt); - cmn_err(CE_WARN, "Please umount(1M) %s and run fsck(1M)", + cmn_err(CE_WARN, "Please umount(8) %s and run fsck(8)", ul->un_ufsvfs->vfs_fs->fs_fsmnt); /* diff --git a/usr/src/uts/common/fs/ufs/ufs_alloc.c b/usr/src/uts/common/fs/ufs/ufs_alloc.c index ee7d99143e..3b052f75c0 100644 --- a/usr/src/uts/common/fs/ufs/ufs_alloc.c +++ b/usr/src/uts/common/fs/ufs/ufs_alloc.c @@ -381,7 +381,7 @@ loop: rw_exit(&ip->i_contents); VN_RELE(ITOV(ip)); cmn_err(CE_WARN, - "%s: unexpected allocated inode %d, run fsck(1M)%s", + "%s: unexpected allocated inode %d, run fsck(8)%s", fs->fs_fsmnt, (int)ino, (TRANS_ISTRANS(ufsvfsp) ? " -o f" : "")); goto loop; @@ -406,7 +406,7 @@ loop: if (ip->i_size) { cmn_err(CE_WARN, - "%s: free inode %d had size 0x%llx, run fsck(1M)%s", + "%s: free inode %d had size 0x%llx, run fsck(8)%s", fs->fs_fsmnt, (int)ino, ip->i_size, (TRANS_ISTRANS(ufsvfsp) ? " -o f" : "")); } diff --git a/usr/src/uts/common/fs/ufs/ufs_dir.c b/usr/src/uts/common/fs/ufs/ufs_dir.c index 8035e76025..02f7e57fcd 100644 --- a/usr/src/uts/common/fs/ufs/ufs_dir.c +++ b/usr/src/uts/common/fs/ufs/ufs_dir.c @@ -2870,9 +2870,6 @@ ufs_dirpurgedotdot( * Scan the directoy. If clr_dotdot is true clear the .. * directory else check to see if the directory is empty. * - * Using a struct dirtemplate here is not precisely - * what we want, but better than using a struct direct. 
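The KM_NOSLEEP_LAZY substitutions in tdiraddentry(), tmp_mount(), and tmp_symlink() above are mechanical; each one-for-one replacement of the KM_NOSLEEP | KM_NORMALPRI pair implies a definition along these lines (presumed, not shown in this diff):

	/*
	 * Presumed definition, inferred from the flag pair it replaces
	 * throughout this diff: a non-blocking allocation made at normal
	 * rather than elevated priority, so it fails fast under memory
	 * pressure instead of digging into reserves.
	 */
	#define	KM_NOSLEEP_LAZY	(KM_NOSLEEP | KM_NORMALPRI)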
- * * clr_dotdot is used as a flag to tell us if we need * to clear the dotdot entry * @@ -2886,20 +2883,19 @@ ufs_dirscan( int clr_dotdot) { offset_t off; - struct dirtemplate dbuf; - struct direct *dp = (struct direct *)&dbuf; + struct tmp_dir dbuf, *dp; int err, count; int empty = 1; /* Assume it's empty */ -#define MINDIRSIZ (sizeof (struct dirtemplate) / 2) + dp = &dbuf; ASSERT(RW_LOCK_HELD(&ip->i_contents)); ASSERT(ip->i_size <= (offset_t)MAXOFF_T); for (off = 0; off < ip->i_size; off += dp->d_reclen) { err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp, - (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr); + sizeof (struct tmp_dir), off, UIO_SYSSPACE, &count, cr); /* - * Since we read MINDIRSIZ, residual must + * Since we read sizeof (struct tmp_dir), residual must * be 0 unless we're at end of file. */ if (err || count != 0 || dp->d_reclen == 0) { @@ -3108,20 +3104,19 @@ int ufs_xattrdirempty(struct inode *ip, ino_t parentino, struct cred *cr) { offset_t off; - struct dirtemplate dbuf; - struct direct *dp = (struct direct *)&dbuf; + struct tmp_dir dbuf, *dp; int err, count; int empty = 1; /* Assume it's empty */ -#define MINDIRSIZ (sizeof (struct dirtemplate) / 2) + dp = &dbuf; ASSERT(RW_LOCK_HELD(&ip->i_contents)); ASSERT(ip->i_size <= (offset_t)MAXOFF_T); for (off = 0; off < ip->i_size; off += dp->d_reclen) { err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp, - (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr); + sizeof (struct tmp_dir), off, UIO_SYSSPACE, &count, cr); /* - * Since we read MINDIRSIZ, residual must + * Since we read sizeof (struct tmp_dir), residual must * be 0 unless we're at end of file. */ diff --git a/usr/src/uts/common/fs/ufs/ufs_inode.c b/usr/src/uts/common/fs/ufs/ufs_inode.c index 05f23a6d29..35b66b203c 100644 --- a/usr/src/uts/common/fs/ufs/ufs_inode.c +++ b/usr/src/uts/common/fs/ufs/ufs_inode.c @@ -24,7 +24,7 @@ */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ -/* All Rights Reserved */ +/* All Rights Reserved */ /* * University Copyright- Copyright (c) 1982, 1986, 1988 @@ -107,6 +107,7 @@ union ihead *ihead; /* inode LRU cache, Chris Maltby */ kmutex_t *ih_lock; /* protect inode cache hash table */ static int ino_hashlen = 4; /* desired average hash chain length */ int inohsz; /* number of buckets in the hash table */ +struct timeval32 iuniqtime; kmutex_t ufs_scan_lock; /* stop racing multiple ufs_scan_inodes() */ kmutex_t ufs_iuniqtime_lock; /* protect iuniqtime */ @@ -611,7 +612,7 @@ again: vp->v_vfsp = &EIO_vfs; VN_RELE(vp); cmn_err(CE_NOTE, - "%s: unexpected free inode %d, run fsck(1M)%s", + "%s: unexpected free inode %d, run fsck(8)%s", fs->fs_fsmnt, (int)ino, (TRANS_ISTRANS(ufsvfsp) ? 
" -o f" : "")); return (EIO); @@ -838,8 +839,8 @@ ufs_iupdat(struct inode *ip, int waitfor) struct buf *bp; struct fs *fp; struct dinode *dp; - struct ufsvfs *ufsvfsp = ip->i_ufsvfs; - int i; + struct ufsvfs *ufsvfsp = ip->i_ufsvfs; + int i; int do_trans_times; ushort_t flag; o_uid_t suid; diff --git a/usr/src/uts/common/fs/ufs/ufs_vfsops.c b/usr/src/uts/common/fs/ufs/ufs_vfsops.c index afd43e7e63..390319bfef 100644 --- a/usr/src/uts/common/fs/ufs/ufs_vfsops.c +++ b/usr/src/uts/common/fs/ufs/ufs_vfsops.c @@ -983,7 +983,7 @@ mountfs(struct vfs *vfsp, enum whymountroot why, struct vnode *devvp, */ if (!(vfsp->vfs_flag & VFS_RDONLY)) { cmn_err(CE_WARN, "Error accessing ufs " - "log for %s; Please run fsck(1M)", path); + "log for %s; Please run fsck(8)", path); goto out; } } diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c index 9e0b071999..953ee80471 100644 --- a/usr/src/uts/common/fs/vnode.c +++ b/usr/src/uts/common/fs/vnode.c @@ -21,9 +21,11 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2018, Joyent, Inc. + * Copyright 2020 Joyent, Inc. + * Copyright 2022 Spencer Evans-Cole. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -51,6 +53,7 @@ #include <sys/vfs.h> #include <sys/vfs_opreg.h> #include <sys/vnode.h> +#include <sys/filio.h> #include <sys/rwstlock.h> #include <sys/fem.h> #include <sys/stat.h> @@ -841,18 +844,48 @@ done: void vn_rele(vnode_t *vp) { + mutex_enter(&vp->v_lock); + if (vp->v_count == 1) { + mutex_exit(&vp->v_lock); + VOP_INACTIVE(vp, CRED(), NULL); + return; + } VERIFY(vp->v_count > 0); + VN_RELE_LOCKED(vp); + mutex_exit(&vp->v_lock); +} + +void +vn_phantom_rele(vnode_t *vp) +{ mutex_enter(&vp->v_lock); + VERIFY3U(vp->v_count, >=, vp->v_phantom_count); + vp->v_phantom_count--; + DTRACE_PROBE1(vn__phantom_rele, vnode_t *, vp); if (vp->v_count == 1) { + ASSERT0(vp->v_phantom_count); mutex_exit(&vp->v_lock); VOP_INACTIVE(vp, CRED(), NULL); return; } + VERIFY(vp->v_count > 0); VN_RELE_LOCKED(vp); mutex_exit(&vp->v_lock); } /* + * Return the number of non-phantom holds. Things such as portfs will use + * phantom holds to prevent it from blocking filesystems from mounting over + * watched directories. + */ +uint_t +vn_count(vnode_t *vp) +{ + ASSERT(MUTEX_HELD(&vp->v_lock)); + return (vp->v_count - vp->v_phantom_count); +} + +/* * Release a vnode referenced by the DNLC. Multiple DNLC references are treated * as a single reference, so v_count is not decremented until the last DNLC hold * is released. 
This makes it possible to distinguish vnodes that are referenced @@ -861,8 +894,8 @@ vn_rele(vnode_t *vp) void vn_rele_dnlc(vnode_t *vp) { - VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0)); mutex_enter(&vp->v_lock); + VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0)); if (--vp->v_count_dnlc == 0) { if (vp->v_count == 1) { mutex_exit(&vp->v_lock); @@ -884,7 +917,6 @@ vn_rele_dnlc(vnode_t *vp) void vn_rele_stream(vnode_t *vp) { - VERIFY(vp->v_count > 0); mutex_enter(&vp->v_lock); vp->v_stream = NULL; if (vp->v_count == 1) { @@ -892,6 +924,7 @@ vn_rele_stream(vnode_t *vp) VOP_INACTIVE(vp, CRED(), NULL); return; } + VERIFY(vp->v_count > 0); VN_RELE_LOCKED(vp); mutex_exit(&vp->v_lock); } @@ -915,7 +948,6 @@ vn_rele_inactive(vnode_t *vp) void vn_rele_async(vnode_t *vp, taskq_t *taskq) { - VERIFY(vp->v_count > 0); mutex_enter(&vp->v_lock); if (vp->v_count == 1) { mutex_exit(&vp->v_lock); @@ -923,6 +955,7 @@ vn_rele_async(vnode_t *vp, taskq_t *taskq) vp, TQ_SLEEP) != TASKQID_INVALID); return; } + VERIFY(vp->v_count > 0); VN_RELE_LOCKED(vp); mutex_exit(&vp->v_lock); } @@ -1133,7 +1166,20 @@ top: * Do remaining checks for FNOFOLLOW and FNOLINKS. */ if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) { - error = ELOOP; + /* + * The __FLXPATH flag is a private interface for use by the lx + * brand in order to emulate open(O_NOFOLLOW|O_PATH) which, + * when a symbolic link is encountered, returns a file + * descriptor which references it. + * See uts/common/brand/lx/syscall/lx_open.c + * + * When this flag is set, VOP_OPEN() is not called (for a + * symlink, most filesystems will return ENOSYS anyway) + * and the link's vnode is returned to be linked to the + * file descriptor. + */ + if ((filemode & __FLXPATH) == 0) + error = ELOOP; goto out; } if (filemode & FNOLINKS) { @@ -1223,6 +1269,22 @@ top: if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0) goto out; } + + /* + * Turn on directio, if requested. + */ + if (filemode & FDIRECT) { + if ((error = VOP_IOCTL(vp, _FIODIRECTIO, DIRECTIO_ON, 0, + CRED(), NULL, NULL)) != 0) { + /* + * On Linux, O_DIRECT returns EINVAL when the file + * system does not support directio, so we'll do the + * same. + */ + error = EINVAL; + goto out; + } + } out: ASSERT(vp->v_count > 0); @@ -2428,6 +2490,7 @@ vn_reinit(vnode_t *vp) { vp->v_count = 1; vp->v_count_dnlc = 0; + vp->v_phantom_count = 0; vp->v_vfsp = NULL; vp->v_stream = NULL; vp->v_vfsmountedhere = NULL; @@ -2484,6 +2547,7 @@ vn_free(vnode_t *vp) */ ASSERT((vp->v_count == 0) || (vp->v_count == 1)); ASSERT(vp->v_count_dnlc == 0); + ASSERT0(vp->v_phantom_count); VERIFY(vp->v_path != NULL); if (vp->v_path != vn_vpath_empty) { kmem_free(vp->v_path, strlen(vp->v_path) + 1); diff --git a/usr/src/uts/common/fs/xattr.c b/usr/src/uts/common/fs/xattr.c index 2326a42747..ffa68a362e 100644 --- a/usr/src/uts/common/fs/xattr.c +++ b/usr/src/uts/common/fs/xattr.c @@ -28,7 +28,7 @@ * * The Solaris VFS layer presents extended file attributes using a special * "XATTR" directory under files or directories that have extended file - * attributes. See fsattr(5) for background. + * attributes. See fsattr(7) for background. * * This design avoids the need for a separate set of VFS or vnode functions * for operating on XATTR objects. File system implementations that support diff --git a/usr/src/uts/common/fs/zfs/abd.c b/usr/src/uts/common/fs/zfs/abd.c index 66a7a49d73..b841a8f38e 100644 --- a/usr/src/uts/common/fs/zfs/abd.c +++ b/usr/src/uts/common/fs/zfs/abd.c @@ -12,6 +12,7 @@ /* * Copyright (c) 2014 by Chunwei Chen. 
All rights reserved. * Copyright (c) 2019 by Delphix. All rights reserved. + * Copyright 2020 Joyent, Inc. */ /* @@ -218,7 +219,7 @@ abd_init(void) * Since ABD chunks do not appear in crash dumps, we pass KMC_NOTOUCH * so that no allocator metadata is stored with the buffers. */ - abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0, + abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 64, NULL, NULL, NULL, NULL, data_alloc_arena, KMC_NOTOUCH); abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, @@ -764,7 +765,8 @@ abd_iter_map(struct abd_iter *aiter) } else { size_t index = abd_iter_scatter_chunk_index(aiter); offset = abd_iter_scatter_chunk_offset(aiter); - aiter->iter_mapsize = zfs_abd_chunk_size - offset; + aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset, + aiter->iter_abd->abd_size - aiter->iter_pos); paddr = aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index]; } aiter->iter_mapaddr = (char *)paddr + offset; @@ -993,3 +995,180 @@ abd_cmp(abd_t *dabd, abd_t *sabd, size_t size) { return (abd_iterate_func2(dabd, sabd, 0, 0, size, abd_cmp_cb, NULL)); } + +/* + * Iterate over code ABDs and a data ABD and call @func_raidz_gen. + * + * @cabds parity ABDs, must have equal size + * @dabd data ABD. Can be NULL (in this case @dsize = 0) + * @func_raidz_gen should be implemented so that its behaviour + * is the same when taking linear and when taking scatter + */ +void +abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, + ssize_t csize, ssize_t dsize, const unsigned parity, + void (*func_raidz_gen)(void **, const void *, size_t, size_t)) +{ + int i; + ssize_t len, dlen; + struct abd_iter caiters[3]; + struct abd_iter daiter = {0}; + void *caddrs[3]; + + ASSERT3U(parity, <=, 3); + + for (i = 0; i < parity; i++) + abd_iter_init(&caiters[i], cabds[i]); + + if (dabd) + abd_iter_init(&daiter, dabd); + + ASSERT3S(dsize, >=, 0); + +#ifdef _KERNEL + kpreempt_disable(); +#endif + while (csize > 0) { + len = csize; + + if (dabd && dsize > 0) + abd_iter_map(&daiter); + + for (i = 0; i < parity; i++) { + abd_iter_map(&caiters[i]); + caddrs[i] = caiters[i].iter_mapaddr; + } + + switch (parity) { + case 3: + len = MIN(caiters[2].iter_mapsize, len); + /* falls through */ + case 2: + len = MIN(caiters[1].iter_mapsize, len); + /* falls through */ + case 1: + len = MIN(caiters[0].iter_mapsize, len); + } + + /* must be progressive */ + ASSERT3S(len, >, 0); + + if (dabd && dsize > 0) { + /* this needs precise iter.length */ + len = MIN(daiter.iter_mapsize, len); + len = MIN(dsize, len); + dlen = len; + } else + dlen = 0; + + /* must be progressive */ + ASSERT3S(len, >, 0); + /* + * The iterated function likely will not do well if each + * segment except the last one is not multiple of 512 (raidz). + */ + ASSERT3U(((uint64_t)len & 511ULL), ==, 0); + + func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen); + + for (i = parity-1; i >= 0; i--) { + abd_iter_unmap(&caiters[i]); + abd_iter_advance(&caiters[i], len); + } + + if (dabd && dsize > 0) { + abd_iter_unmap(&daiter); + abd_iter_advance(&daiter, dlen); + dsize -= dlen; + } + + csize -= len; + + ASSERT3S(dsize, >=, 0); + ASSERT3S(csize, >=, 0); + } +#ifdef _KERNEL + kpreempt_enable(); +#endif +} + +/* + * Iterate over code ABDs and data reconstruction target ABDs and call + * @func_raidz_rec. Function maps at most 6 pages atomically. 
+ * + * @cabds parity ABDs, must have equal size + * @tabds rec target ABDs, at most 3 + * @tsize size of data target columns + * @func_raidz_rec expects syndrome data in target columns. Function + * reconstructs data and overwrites target columns. + */ +void +abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, + ssize_t tsize, const unsigned parity, + void (*func_raidz_rec)(void **t, const size_t tsize, void **c, + const unsigned *mul), + const unsigned *mul) +{ + int i; + ssize_t len; + struct abd_iter citers[3]; + struct abd_iter xiters[3]; + void *caddrs[3], *xaddrs[3]; + + ASSERT3U(parity, <=, 3); + + for (i = 0; i < parity; i++) { + abd_iter_init(&citers[i], cabds[i]); + abd_iter_init(&xiters[i], tabds[i]); + } + +#ifdef _KERNEL + kpreempt_disable(); +#endif + while (tsize > 0) { + + for (i = 0; i < parity; i++) { + abd_iter_map(&citers[i]); + abd_iter_map(&xiters[i]); + caddrs[i] = citers[i].iter_mapaddr; + xaddrs[i] = xiters[i].iter_mapaddr; + } + + len = tsize; + switch (parity) { + case 3: + len = MIN(xiters[2].iter_mapsize, len); + len = MIN(citers[2].iter_mapsize, len); + /* falls through */ + case 2: + len = MIN(xiters[1].iter_mapsize, len); + len = MIN(citers[1].iter_mapsize, len); + /* falls through */ + case 1: + len = MIN(xiters[0].iter_mapsize, len); + len = MIN(citers[0].iter_mapsize, len); + } + /* must be progressive */ + ASSERT3S(len, >, 0); + /* + * The iterated function likely will not do well if each + * segment except the last one is not multiple of 512 (raidz). + */ + ASSERT3U(((uint64_t)len & 511ULL), ==, 0); + + func_raidz_rec(xaddrs, len, caddrs, mul); + + for (i = parity-1; i >= 0; i--) { + abd_iter_unmap(&xiters[i]); + abd_iter_unmap(&citers[i]); + abd_iter_advance(&xiters[i], len); + abd_iter_advance(&citers[i], len); + } + + tsize -= len; + ASSERT3S(tsize, >=, 0); + } +#ifdef _KERNEL + kpreempt_enable(); +#endif +} diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 9e04e5e00d..12b5872cdc 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -24,6 +24,12 @@ * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011, 2019, Delphix. All rights reserved. + * Copyright (c) 2020, George Amanakis. All rights reserved. + * Copyright (c) 2020, The FreeBSD Foundation [1] + * + * [1] Portions of this software were developed by Allan Jude + * under sponsorship from the FreeBSD Foundation. */ /* @@ -294,6 +300,7 @@ #include <sys/kstat.h> #include <sys/zthr.h> #include <zfs_fletcher.h> +#include <sys/arc_impl.h> #include <sys/aggsum.h> #include <sys/cityhash.h> #include <sys/param.h> @@ -408,54 +415,6 @@ uint_t zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */ boolean_t zfs_compressed_arc_enabled = B_TRUE; -/* - * Note that buffers can be in one of 6 states: - * ARC_anon - anonymous (discussed below) - * ARC_mru - recently used, currently cached - * ARC_mru_ghost - recentely used, no longer in cache - * ARC_mfu - frequently used, currently cached - * ARC_mfu_ghost - frequently used, no longer in cache - * ARC_l2c_only - exists in L2ARC but not other states - * When there are no active references to the buffer, they are - * are linked onto a list in one of these arc states. These are - * the only buffers that can be evicted or deleted. Within each - * state there are multiple lists, one for meta-data and one for - * non-meta-data. 
Meta-data (indirect blocks, blocks of dnodes, - * etc.) is tracked separately so that it can be managed more - * explicitly: favored over data, limited explicitly. - * - * Anonymous buffers are buffers that are not associated with - * a DVA. These are buffers that hold dirty block copies - * before they are written to stable storage. By definition, - * they are "ref'd" and are considered part of arc_mru - * that cannot be freed. Generally, they will aquire a DVA - * as they are written and migrate onto the arc_mru list. - * - * The ARC_l2c_only state is for buffers that are in the second - * level ARC but no longer in any of the ARC_m* lists. The second - * level ARC itself may also contain buffers that are in any of - * the ARC_m* states - meaning that a buffer can exist in two - * places. The reason for the ARC_l2c_only state is to keep the - * buffer header in the hash table, so that reads that hit the - * second level ARC benefit from these fast lookups. - */ - -typedef struct arc_state { - /* - * list of evictable buffers - */ - multilist_t *arcs_list[ARC_BUFC_NUMTYPES]; - /* - * total amount of evictable data in this state - */ - zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; - /* - * total amount of data in this state; this includes: evictable, - * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. - */ - zfs_refcount_t arcs_size; -} arc_state_t; - /* The 6 states: */ static arc_state_t ARC_anon; static arc_state_t ARC_mru; @@ -464,263 +423,7 @@ static arc_state_t ARC_mfu; static arc_state_t ARC_mfu_ghost; static arc_state_t ARC_l2c_only; -typedef struct arc_stats { - kstat_named_t arcstat_hits; - kstat_named_t arcstat_misses; - kstat_named_t arcstat_demand_data_hits; - kstat_named_t arcstat_demand_data_misses; - kstat_named_t arcstat_demand_metadata_hits; - kstat_named_t arcstat_demand_metadata_misses; - kstat_named_t arcstat_prefetch_data_hits; - kstat_named_t arcstat_prefetch_data_misses; - kstat_named_t arcstat_prefetch_metadata_hits; - kstat_named_t arcstat_prefetch_metadata_misses; - kstat_named_t arcstat_mru_hits; - kstat_named_t arcstat_mru_ghost_hits; - kstat_named_t arcstat_mfu_hits; - kstat_named_t arcstat_mfu_ghost_hits; - kstat_named_t arcstat_deleted; - /* - * Number of buffers that could not be evicted because the hash lock - * was held by another thread. The lock may not necessarily be held - * by something using the same buffer, since hash locks are shared - * by multiple buffers. - */ - kstat_named_t arcstat_mutex_miss; - /* - * Number of buffers skipped when updating the access state due to the - * header having already been released after acquiring the hash lock. - */ - kstat_named_t arcstat_access_skip; - /* - * Number of buffers skipped because they have I/O in progress, are - * indirect prefetch buffers that have not lived long enough, or are - * not from the spa we're trying to evict from. - */ - kstat_named_t arcstat_evict_skip; - /* - * Number of times arc_evict_state() was unable to evict enough - * buffers to reach its target amount. 
- */ - kstat_named_t arcstat_evict_not_enough; - kstat_named_t arcstat_evict_l2_cached; - kstat_named_t arcstat_evict_l2_eligible; - kstat_named_t arcstat_evict_l2_ineligible; - kstat_named_t arcstat_evict_l2_skip; - kstat_named_t arcstat_hash_elements; - kstat_named_t arcstat_hash_elements_max; - kstat_named_t arcstat_hash_collisions; - kstat_named_t arcstat_hash_chains; - kstat_named_t arcstat_hash_chain_max; - kstat_named_t arcstat_p; - kstat_named_t arcstat_c; - kstat_named_t arcstat_c_min; - kstat_named_t arcstat_c_max; - /* Not updated directly; only synced in arc_kstat_update. */ - kstat_named_t arcstat_size; - /* - * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. - * Note that the compressed bytes may match the uncompressed bytes - * if the block is either not compressed or compressed arc is disabled. - */ - kstat_named_t arcstat_compressed_size; - /* - * Uncompressed size of the data stored in b_pabd. If compressed - * arc is disabled then this value will be identical to the stat - * above. - */ - kstat_named_t arcstat_uncompressed_size; - /* - * Number of bytes stored in all the arc_buf_t's. This is classified - * as "overhead" since this data is typically short-lived and will - * be evicted from the arc when it becomes unreferenced unless the - * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level - * values have been set (see comment in dbuf.c for more information). - */ - kstat_named_t arcstat_overhead_size; - /* - * Number of bytes consumed by internal ARC structures necessary - * for tracking purposes; these structures are not actually - * backed by ARC buffers. This includes arc_buf_hdr_t structures - * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only - * caches), and arc_buf_t structures (allocated via arc_buf_t - * cache). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_hdr_size; - /* - * Number of bytes consumed by ARC buffers of type equal to - * ARC_BUFC_DATA. This is generally consumed by buffers backing - * on disk user data (e.g. plain file contents). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_data_size; - /* - * Number of bytes consumed by ARC buffers of type equal to - * ARC_BUFC_METADATA. This is generally consumed by buffers - * backing on disk data that is used for internal ZFS - * structures (e.g. ZAP, dnode, indirect blocks, etc). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_metadata_size; - /* - * Number of bytes consumed by various buffers and structures - * not actually backed with ARC buffers. This includes bonus - * buffers (allocated directly via zio_buf_* functions), - * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t - * cache), and dnode_t structures (allocated via dnode_t cache). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_other_size; - /* - * Total number of bytes consumed by ARC buffers residing in the - * arc_anon state. This includes *all* buffers in the arc_anon - * state; e.g. data, metadata, evictable, and unevictable buffers - * are all included in this value. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_anon_size; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_DATA, - * residing in the arc_anon state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). 
- * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_anon_evictable_data; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_METADATA, - * residing in the arc_anon state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_anon_evictable_metadata; - /* - * Total number of bytes consumed by ARC buffers residing in the - * arc_mru state. This includes *all* buffers in the arc_mru - * state; e.g. data, metadata, evictable, and unevictable buffers - * are all included in this value. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_size; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_DATA, - * residing in the arc_mru state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_evictable_data; - /* - * Number of bytes consumed by ARC buffers that meet the - * following criteria: backing buffers of type ARC_BUFC_METADATA, - * residing in the arc_mru state, and are eligible for eviction - * (e.g. have no outstanding holds on the buffer). - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_evictable_metadata; - /* - * Total number of bytes that *would have been* consumed by ARC - * buffers in the arc_mru_ghost state. The key thing to note - * here, is the fact that this size doesn't actually indicate - * RAM consumption. The ghost lists only consist of headers and - * don't actually have ARC buffers linked off of these headers. - * Thus, *if* the headers had associated ARC buffers, these - * buffers *would have* consumed this number of bytes. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_ghost_size; - /* - * Number of bytes that *would have been* consumed by ARC - * buffers that are eligible for eviction, of type - * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_ghost_evictable_data; - /* - * Number of bytes that *would have been* consumed by ARC - * buffers that are eligible for eviction, of type - * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mru_ghost_evictable_metadata; - /* - * Total number of bytes consumed by ARC buffers residing in the - * arc_mfu state. This includes *all* buffers in the arc_mfu - * state; e.g. data, metadata, evictable, and unevictable buffers - * are all included in this value. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_size; - /* - * Number of bytes consumed by ARC buffers that are eligible for - * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu - * state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_evictable_data; - /* - * Number of bytes consumed by ARC buffers that are eligible for - * eviction, of type ARC_BUFC_METADATA, and reside in the - * arc_mfu state. - * Not updated directly; only synced in arc_kstat_update. 
- */ - kstat_named_t arcstat_mfu_evictable_metadata; - /* - * Total number of bytes that *would have been* consumed by ARC - * buffers in the arc_mfu_ghost state. See the comment above - * arcstat_mru_ghost_size for more details. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_ghost_size; - /* - * Number of bytes that *would have been* consumed by ARC - * buffers that are eligible for eviction, of type - * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_ghost_evictable_data; - /* - * Number of bytes that *would have been* consumed by ARC - * buffers that are eligible for eviction, of type - * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. - * Not updated directly; only synced in arc_kstat_update. - */ - kstat_named_t arcstat_mfu_ghost_evictable_metadata; - kstat_named_t arcstat_l2_hits; - kstat_named_t arcstat_l2_misses; - kstat_named_t arcstat_l2_feeds; - kstat_named_t arcstat_l2_rw_clash; - kstat_named_t arcstat_l2_read_bytes; - kstat_named_t arcstat_l2_write_bytes; - kstat_named_t arcstat_l2_writes_sent; - kstat_named_t arcstat_l2_writes_done; - kstat_named_t arcstat_l2_writes_error; - kstat_named_t arcstat_l2_writes_lock_retry; - kstat_named_t arcstat_l2_evict_lock_retry; - kstat_named_t arcstat_l2_evict_reading; - kstat_named_t arcstat_l2_evict_l1cached; - kstat_named_t arcstat_l2_free_on_write; - kstat_named_t arcstat_l2_abort_lowmem; - kstat_named_t arcstat_l2_cksum_bad; - kstat_named_t arcstat_l2_io_error; - kstat_named_t arcstat_l2_lsize; - kstat_named_t arcstat_l2_psize; - /* Not updated directly; only synced in arc_kstat_update. */ - kstat_named_t arcstat_l2_hdr_size; - kstat_named_t arcstat_memory_throttle_count; - /* Not updated directly; only synced in arc_kstat_update. 
*/ - kstat_named_t arcstat_meta_used; - kstat_named_t arcstat_meta_limit; - kstat_named_t arcstat_meta_max; - kstat_named_t arcstat_meta_min; - kstat_named_t arcstat_async_upgrade_sync; - kstat_named_t arcstat_demand_hit_predictive_prefetch; - kstat_named_t arcstat_demand_hit_prescient_prefetch; -} arc_stats_t; - -static arc_stats_t arc_stats = { +arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "demand_data_hits", KSTAT_DATA_UINT64 }, @@ -742,6 +445,8 @@ static arc_stats_t arc_stats = { { "evict_not_enough", KSTAT_DATA_UINT64 }, { "evict_l2_cached", KSTAT_DATA_UINT64 }, { "evict_l2_eligible", KSTAT_DATA_UINT64 }, + { "evict_l2_eligible_mfu", KSTAT_DATA_UINT64 }, + { "evict_l2_eligible_mru", KSTAT_DATA_UINT64 }, { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, { "evict_l2_skip", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, @@ -778,6 +483,11 @@ static arc_stats_t arc_stats = { { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, + { "l2_prefetch_asize", KSTAT_DATA_UINT64 }, + { "l2_mru_asize", KSTAT_DATA_UINT64 }, + { "l2_mfu_asize", KSTAT_DATA_UINT64 }, + { "l2_bufc_data_asize", KSTAT_DATA_UINT64 }, + { "l2_bufc_metadata_asize", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, { "l2_rw_clash", KSTAT_DATA_UINT64 }, { "l2_read_bytes", KSTAT_DATA_UINT64 }, @@ -796,6 +506,22 @@ static arc_stats_t arc_stats = { { "l2_size", KSTAT_DATA_UINT64 }, { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, + { "l2_log_blk_writes", KSTAT_DATA_UINT64 }, + { "l2_log_blk_avg_asize", KSTAT_DATA_UINT64 }, + { "l2_log_blk_asize", KSTAT_DATA_UINT64 }, + { "l2_log_blk_count", KSTAT_DATA_UINT64 }, + { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 }, + { "l2_rebuild_success", KSTAT_DATA_UINT64 }, + { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 }, + { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 }, + { "l2_rebuild_dh_errors", KSTAT_DATA_UINT64 }, + { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 }, + { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 }, + { "l2_rebuild_size", KSTAT_DATA_UINT64 }, + { "l2_rebuild_asize", KSTAT_DATA_UINT64 }, + { "l2_rebuild_bufs", KSTAT_DATA_UINT64 }, + { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 }, + { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, { "arc_meta_limit", KSTAT_DATA_UINT64 }, @@ -806,14 +532,6 @@ static arc_stats_t arc_stats = { { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, }; -#define ARCSTAT(stat) (arc_stats.stat.value.ui64) - -#define ARCSTAT_INCR(stat, val) \ - atomic_add_64(&arc_stats.stat.value.ui64, (val)) - -#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) -#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) - #define ARCSTAT_MAX(stat, val) { \ uint64_t m; \ while ((val) > (m = arc_stats.stat.value.ui64) && \ @@ -844,6 +562,24 @@ static arc_stats_t arc_stats = { } \ } +/* + * This macro allows us to use kstats as floating averages. Each time we + * update this kstat, we first factor it and the update value by + * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall + * average. This macro assumes that integer loads and stores are atomic, but + * is not safe for multiple writers updating the kstat in parallel (only the + * last writer's update will remain). 
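Numerically, the floating-average update described above (the macro itself follows) is an integer exponential moving average that weights each new sample by 1/ARCSTAT_F_AVG_FACTOR. A short worked sequence with a factor of 3, as a sketch:

	/* Sketch: repeated samples of 300 against a current average of 900. */
	uint64_t x = 900;
	x = x - x / 3 + 300 / 3;	/* 900 - 300 + 100 == 700 */
	x = x - x / 3 + 300 / 3;	/* 700 - 233 + 100 == 567 (integer div) */
	x = x - x / 3 + 300 / 3;	/* 567 - 189 + 100 == 478 */
	/* ...decaying geometrically toward the steady-state sample, 300 */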
+ */ +#define ARCSTAT_F_AVG_FACTOR 3 +#define ARCSTAT_F_AVG(stat, value) \ + do { \ + uint64_t x = ARCSTAT(stat); \ + x = x - x / ARCSTAT_F_AVG_FACTOR + \ + (value) / ARCSTAT_F_AVG_FACTOR; \ + ARCSTAT(stat) = x; \ + _NOTE(CONSTCOND) \ + } while (0) + kstat_t *arc_ksp; static arc_state_t *arc_anon; static arc_state_t *arc_mru; @@ -853,29 +589,6 @@ static arc_state_t *arc_mfu_ghost; static arc_state_t *arc_l2c_only; /* - * There are several ARC variables that are critical to export as kstats -- - * but we don't want to have to grovel around in the kstat whenever we wish to - * manipulate them. For these variables, we therefore define them to be in - * terms of the statistic variable. This assures that we are not introducing - * the possibility of inconsistency by having shadow copies of the variables, - * while still allowing the code to be readable. - */ -#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ -#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ -#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ -#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ -#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ -#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ -#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ - -/* compressed size of entire arc */ -#define arc_compressed_size ARCSTAT(arcstat_compressed_size) -/* uncompressed size of entire arc */ -#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) -/* number of bytes in the arc from arc_buf_t's */ -#define arc_overhead_size ARCSTAT(arcstat_overhead_size) - -/* * There are also some ARC variables that we want to export, but that are * updated so often that having the canonical representation be the statistic * variable causes a performance bottleneck. 
We want to use aggsum_t's for these @@ -896,182 +609,6 @@ static hrtime_t arc_growtime; static uint64_t arc_tempreserve; static uint64_t arc_loaned_bytes; -typedef struct arc_callback arc_callback_t; - -struct arc_callback { - void *acb_private; - arc_read_done_func_t *acb_done; - arc_buf_t *acb_buf; - boolean_t acb_encrypted; - boolean_t acb_compressed; - boolean_t acb_noauth; - zbookmark_phys_t acb_zb; - zio_t *acb_zio_dummy; - zio_t *acb_zio_head; - arc_callback_t *acb_next; -}; - -typedef struct arc_write_callback arc_write_callback_t; - -struct arc_write_callback { - void *awcb_private; - arc_write_done_func_t *awcb_ready; - arc_write_done_func_t *awcb_children_ready; - arc_write_done_func_t *awcb_physdone; - arc_write_done_func_t *awcb_done; - arc_buf_t *awcb_buf; -}; - -/* - * ARC buffers are separated into multiple structs as a memory saving measure: - * - Common fields struct, always defined, and embedded within it: - * - L2-only fields, always allocated but undefined when not in L2ARC - * - L1-only fields, only allocated when in L1ARC - * - * Buffer in L1 Buffer only in L2 - * +------------------------+ +------------------------+ - * | arc_buf_hdr_t | | arc_buf_hdr_t | - * | | | | - * | | | | - * | | | | - * +------------------------+ +------------------------+ - * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | - * | (undefined if L1-only) | | | - * +------------------------+ +------------------------+ - * | l1arc_buf_hdr_t | - * | | - * | | - * | | - * | | - * +------------------------+ - * - * Because it's possible for the L2ARC to become extremely large, we can wind - * up eating a lot of memory in L2ARC buffer headers, so the size of a header - * is minimized by only allocating the fields necessary for an L1-cached buffer - * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and - * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple - * words in pointers. arc_hdr_realloc() is used to switch a header between - * these two allocation states. - */ -typedef struct l1arc_buf_hdr { - kmutex_t b_freeze_lock; - zio_cksum_t *b_freeze_cksum; -#ifdef ZFS_DEBUG - /* - * Used for debugging with kmem_flags - by allocating and freeing - * b_thawed when the buffer is thawed, we get a record of the stack - * trace that thawed it. - */ - void *b_thawed; -#endif - - arc_buf_t *b_buf; - uint32_t b_bufcnt; - /* for waiting on writes to complete */ - kcondvar_t b_cv; - uint8_t b_byteswap; - - /* protected by arc state mutex */ - arc_state_t *b_state; - multilist_node_t b_arc_node; - - /* updated atomically */ - clock_t b_arc_access; - - /* self protecting */ - zfs_refcount_t b_refcnt; - - arc_callback_t *b_acb; - abd_t *b_pabd; -} l1arc_buf_hdr_t; - -/* - * Encrypted blocks will need to be stored encrypted on the L2ARC - * disk as they appear in the main pool. In order for this to work we - * need to pass around the encryption parameters so they can be used - * to write data to the L2ARC. This struct is only defined in the - * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED - * flag set. 
- */
-typedef struct arc_buf_hdr_crypt {
- abd_t *b_rabd; /* raw encrypted data */
- dmu_object_type_t b_ot; /* object type */
- uint32_t b_ebufcnt; /* number or encryped buffers */
-
- /* dsobj for looking up encryption key for l2arc encryption */
- uint64_t b_dsobj; /* for looking up key */
-
- /* encryption parameters */
- uint8_t b_salt[ZIO_DATA_SALT_LEN];
- uint8_t b_iv[ZIO_DATA_IV_LEN];
-
- /*
- * Technically this could be removed since we will always be able to
- * get the mac from the bp when we need it. However, it is inconvenient
- * for callers of arc code to have to pass a bp in all the time. This
- * also allows us to assert that L2ARC data is properly encrypted to
- * match the data in the main storage pool.
- */
- uint8_t b_mac[ZIO_DATA_MAC_LEN];
-} arc_buf_hdr_crypt_t;
-
-typedef struct l2arc_dev l2arc_dev_t;
-
-typedef struct l2arc_buf_hdr {
- /* protected by arc_buf_hdr mutex */
- l2arc_dev_t *b_dev; /* L2ARC device */
- uint64_t b_daddr; /* disk address, offset byte */
-
- list_node_t b_l2node;
-} l2arc_buf_hdr_t;
-
-struct arc_buf_hdr {
- /* protected by hash lock */
- dva_t b_dva;
- uint64_t b_birth;
-
- arc_buf_contents_t b_type;
- arc_buf_hdr_t *b_hash_next;
- arc_flags_t b_flags;
-
- /*
- * This field stores the size of the data buffer after
- * compression, and is set in the arc's zio completion handlers.
- * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
- *
- * While the block pointers can store up to 32MB in their psize
- * field, we can only store up to 32MB minus 512B. This is due
- * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
- * a field of zeros represents 512B in the bp). We can't use a
- * bias of 1 since we need to reserve a psize of zero, here, to
- * represent holes and embedded blocks.
- *
- * This isn't a problem in practice, since the maximum size of a
- * buffer is limited to 16MB, so we never need to store 32MB in
- * this field. Even in the upstream illumos code base, the
- * maximum size of a buffer is limited to 16MB.
- */
- uint16_t b_psize;
-
- /*
- * This field stores the size of the data buffer before
- * compression, and cannot change once set. It is in units
- * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
- */
- uint16_t b_lsize; /* immutable */
- uint64_t b_spa; /* immutable */
-
- /* L2ARC fields. Undefined when not in L2ARC. */
- l2arc_buf_hdr_t b_l2hdr;
- /* L1ARC fields. Undefined when in l2arc_only state */
- l1arc_buf_hdr_t b_l1hdr;
- /*
- * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED
- * is set and the L1 header exists.
- */
- arc_buf_hdr_crypt_t b_crypt_hdr;
-};
-
 #define GHOST_STATE(state) \
 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
 (state) == arc_l2c_only)
@@ -1176,6 +713,13 @@ uint64_t zfs_crc64_table[256];
 #define L2ARC_FEED_SECS 1 /* caching interval secs */
 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
+/*
+ * We can feed L2ARC from two states of ARC buffers, mru and mfu,
+ * and each of these states has two types: data and metadata.
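+ * The feed order used by l2arc_sublist_lock() is: 0 = MFU metadata,
+ * 1 = MRU metadata, 2 = MFU data, 3 = MRU data.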
+ */ +#define L2ARC_FEED_TYPES 4 + + #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) @@ -1189,24 +733,11 @@ uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ +int l2arc_meta_percent = 33; /* limit on headers size */ /* * L2ARC Internals */ -struct l2arc_dev { - vdev_t *l2ad_vdev; /* vdev */ - spa_t *l2ad_spa; /* spa */ - uint64_t l2ad_hand; /* next write location */ - uint64_t l2ad_start; /* first addr on device */ - uint64_t l2ad_end; /* last addr on device */ - boolean_t l2ad_first; /* first sweep through */ - boolean_t l2ad_writing; /* currently writing */ - kmutex_t l2ad_mtx; /* lock for buffer list */ - list_t l2ad_buflist; /* buffer list */ - list_node_t l2ad_node; /* device list node */ - zfs_refcount_t l2ad_alloc; /* allocated bytes */ -}; - static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ @@ -1224,11 +755,6 @@ typedef struct l2arc_read_callback { abd_t *l2rcb_abd; /* temporary buffer */ } l2arc_read_callback_t; -typedef struct l2arc_write_callback { - l2arc_dev_t *l2wcb_dev; /* device info */ - arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ -} l2arc_write_callback_t; - typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ abd_t *l2df_abd; @@ -1241,7 +767,16 @@ static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; -static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *); +static kmutex_t l2arc_rebuild_thr_lock; +static kcondvar_t l2arc_rebuild_thr_cv; + +enum arc_hdr_alloc_flags { + ARC_HDR_ALLOC_RDATA = 0x1, + ARC_HDR_DO_ADAPT = 0x2, +}; + + +static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, boolean_t); typedef enum arc_fill_flags { ARC_FILL_LOCKED = 1 << 0, /* hdr lock is held */ ARC_FILL_COMPRESSED = 1 << 1, /* fill with compressed data */ @@ -1251,15 +786,16 @@ typedef enum arc_fill_flags { } arc_fill_flags_t; static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); -static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *); +static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, boolean_t); static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); static void arc_hdr_free_pabd(arc_buf_hdr_t *, boolean_t); -static void arc_hdr_alloc_pabd(arc_buf_hdr_t *, boolean_t); +static void arc_hdr_alloc_pabd(arc_buf_hdr_t *, int); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static boolean_t arc_is_overflowing(); static void arc_buf_watch(arc_buf_t *); +static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); @@ -1268,6 +804,18 @@ static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); +static void l2arc_do_free_on_write(void); +static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, + boolean_t state_only); + +#define l2arc_hdr_arcstats_increment(hdr) 
\ + l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE) +#define l2arc_hdr_arcstats_decrement(hdr) \ + l2arc_hdr_arcstats_update((hdr), B_FALSE, B_FALSE) +#define l2arc_hdr_arcstats_increment_state(hdr) \ + l2arc_hdr_arcstats_update((hdr), B_TRUE, B_TRUE) +#define l2arc_hdr_arcstats_decrement_state(hdr) \ + l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE) /* * The arc_all_memory function is a ZoL enhancement that lives in their OSL @@ -1298,6 +846,9 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) ((hdr)->b_dva.dva_word[0] == 0 && \ (hdr)->b_dva.dva_word[1] == 0) +#define HDR_EMPTY_OR_LOCKED(hdr) \ + (HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr))) + #define HDR_EQUAL(spa, dva, birth, hdr) \ ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ @@ -1411,6 +962,12 @@ buf_hash_remove(arc_buf_hdr_t *hdr) } /* + * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU + * metadata and data are cached from ARC into L2ARC. + */ +int l2arc_mfuonly = 0; + +/* * Global data structures and functions for the buf kmem cache. */ @@ -1726,8 +1283,7 @@ arc_cksum_free(arc_buf_hdr_t *hdr) static boolean_t arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) { - ASSERT(hdr->b_l1hdr.b_state == arc_anon || - MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr)); for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { if (!ARC_BUF_COMPRESSED(b)) { @@ -2011,14 +1567,14 @@ arc_buf_freeze(arc_buf_t *buf) static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); hdr->b_flags |= flags; } static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); hdr->b_flags &= ~flags; } @@ -2032,7 +1588,7 @@ arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) static void arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) { - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Holes and embedded blocks will always have a psize = 0 so @@ -2125,7 +1681,7 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) void *tmpbuf = NULL; abd_t *abd = hdr->b_l1hdr.b_pabd; - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_AUTHENTICATED(hdr)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); @@ -2195,10 +1751,10 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) boolean_t no_crypt = B_FALSE; boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_ENCRYPTED(hdr)); - arc_hdr_alloc_pabd(hdr, B_FALSE); + arc_hdr_alloc_pabd(hdr, ARC_HDR_DO_ADAPT); ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot, B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, @@ -2225,7 +1781,7 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) * and then loan a buffer from it, rather than allocating a * linear buffer and wrapping it in an abd later. 
*/ - cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr); + cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, B_TRUE); tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), @@ -2315,7 +1871,7 @@ arc_buf_untransform_in_place(arc_buf_t *buf, kmutex_t *hash_lock) ASSERT(HDR_ENCRYPTED(hdr)); ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data, @@ -2538,7 +2094,7 @@ arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, */ ret = SET_ERROR(EIO); spa_log_error(spa, zb); - zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, + (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0, 0); } @@ -2635,7 +2191,7 @@ static void add_reference(arc_buf_hdr_t *hdr, void *tag) { ASSERT(HDR_HAS_L1HDR(hdr)); - if (!MUTEX_HELD(HDR_LOCK(hdr))) { + if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) { ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); @@ -2652,7 +2208,11 @@ add_reference(arc_buf_hdr_t *hdr, void *tag) arc_evictable_space_decrement(hdr, state); } /* remove the prefetch flag if we get a reference */ + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); } } @@ -2888,9 +2448,16 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, } } - if (HDR_HAS_L1HDR(hdr)) + if (HDR_HAS_L1HDR(hdr)) { hdr->b_l1hdr.b_state = new_state; + if (HDR_HAS_L2HDR(hdr) && new_state != arc_l2c_only) { + l2arc_hdr_arcstats_decrement_state(hdr); + hdr->b_l2hdr.b_arcs_state = new_state->arcs_state; + l2arc_hdr_arcstats_increment_state(hdr); + } + } + /* * L2 headers should never be on the L2 state list since they don't * have L1 headers allocated. @@ -3040,7 +2607,7 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Only honor requests for compressed bufs if the hdr is actually @@ -3160,6 +2727,58 @@ arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder, return (buf); } +/* + * Performance tuning of L2ARC persistence: + * + * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding + * an L2ARC device (either at pool import or later) will attempt + * to rebuild L2ARC buffer contents. + * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls + * whether log blocks are written to the L2ARC device. If the L2ARC + * device is less than 1GB, the amount of data l2arc_evict() + * evicts is significant compared to the amount of restored L2ARC + * data. In this case do not write log blocks in L2ARC in order + * not to waste space. + */ +int l2arc_rebuild_enabled = B_TRUE; +unsigned long l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024; + +/* L2ARC persistence rebuild control routines. */ +void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); +static void l2arc_dev_rebuild_start(l2arc_dev_t *dev); +static int l2arc_rebuild(l2arc_dev_t *dev); + +/* L2ARC persistence read I/O routines. 
*/
+static int l2arc_dev_hdr_read(l2arc_dev_t *dev);
+static int l2arc_log_blk_read(l2arc_dev_t *dev,
+ const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp,
+ l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
+ zio_t *this_io, zio_t **next_io);
+static zio_t *l2arc_log_blk_fetch(vdev_t *vd,
+ const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb);
+static void l2arc_log_blk_fetch_abort(zio_t *zio);
+
+/* L2ARC persistence block restoration routines. */
+static void l2arc_log_blk_restore(l2arc_dev_t *dev,
+ const l2arc_log_blk_phys_t *lb, uint64_t lb_asize);
+static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
+ l2arc_dev_t *dev);
+
+/* L2ARC persistence write I/O routines. */
+static void l2arc_dev_hdr_update(l2arc_dev_t *dev);
+static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
+ l2arc_write_callback_t *cb);
+
+/* L2ARC persistence auxiliary routines. */
+boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
+ const l2arc_log_blkptr_t *lbp);
+static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
+ const arc_buf_hdr_t *ab);
+boolean_t l2arc_range_check_overlap(uint64_t bottom,
+ uint64_t top, uint64_t check);
+static void l2arc_blk_fetch_done(zio_t *zio);
+static inline uint64_t
+ l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev);

 /*
 * Return a loaned arc buffer to the arc.
@@ -3248,7 +2867,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 ASSERT(arc_can_share(hdr, buf));
 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 ASSERT(!ARC_BUF_ENCRYPTED(buf));
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));

 /*
 * Start sharing the data buffer. We transfer the
@@ -3281,7 +2900,7 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 ASSERT(arc_buf_is_shared(buf));
 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));

 /*
 * We are no longer sharing this buffer so we need
@@ -3316,7 +2935,7 @@ arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 arc_buf_t *lastbuf = NULL;

 ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));

 /*
 * Remove the buf from the hdr list and locate the last
@@ -3364,7 +2983,7 @@ arc_buf_destroy_impl(arc_buf_t *buf)
 * We're about to change the hdr's b_flags. We must either
 * hold the hash_lock or be undiscoverable.
*/ - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); arc_cksum_verify(buf); arc_buf_unwatch(buf); @@ -3457,9 +3076,11 @@ arc_buf_destroy_impl(arc_buf_t *buf) } static void -arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr, boolean_t alloc_rdata) +arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr, int alloc_flags) { uint64_t size; + boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0); + boolean_t do_adapt = ((alloc_flags & ARC_HDR_DO_ADAPT) != 0); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); ASSERT(HDR_HAS_L1HDR(hdr)); @@ -3469,12 +3090,14 @@ arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr, boolean_t alloc_rdata) if (alloc_rdata) { size = HDR_GET_PSIZE(hdr); ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL); - hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr); + hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr, + do_adapt); ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL); } else { size = arc_hdr_size(hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr); + hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr, + do_adapt); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); } @@ -3527,6 +3150,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, arc_buf_contents_t type, boolean_t alloc_rdata) { arc_buf_hdr_t *hdr; + int flags = ARC_HDR_DO_ADAPT; VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); if (protected) { @@ -3534,6 +3158,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, } else { hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); } + flags |= alloc_rdata ? ARC_HDR_ALLOC_RDATA : 0; ASSERT(HDR_EMPTY(hdr)); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL); @@ -3557,7 +3182,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, * the compressed or uncompressed data depending on the block * it references and compressed arc enablement. */ - arc_hdr_alloc_pabd(hdr, alloc_rdata); + arc_hdr_alloc_pabd(hdr, flags); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); return (hdr); @@ -3842,7 +3467,6 @@ arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) { arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, B_FALSE, ZIO_COMPRESS_OFF, type, B_FALSE); - ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE, @@ -3853,6 +3477,44 @@ arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) } /* + * Allocates an ARC buf header that's in an evicted & L2-cached state. + * This is used during l2arc reconstruction to make empty ARC buffers + * which circumvent the regular disk->arc->l2arc path and instead come + * into being in the reverse order, i.e. l2arc->arc. 
+ */ +arc_buf_hdr_t * +arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev, + dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth, + enum zio_compress compress, boolean_t protected, + boolean_t prefetch, arc_state_type_t arcs_state) +{ + arc_buf_hdr_t *hdr; + + ASSERT(size != 0); + hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP); + hdr->b_birth = birth; + hdr->b_type = type; + hdr->b_flags = 0; + arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR); + HDR_SET_LSIZE(hdr, size); + HDR_SET_PSIZE(hdr, psize); + arc_hdr_set_compress(hdr, compress); + if (protected) + arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); + if (prefetch) + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa); + + hdr->b_dva = dva; + + hdr->b_l2hdr.b_dev = dev; + hdr->b_l2hdr.b_daddr = daddr; + hdr->b_l2hdr.b_arcs_state = arcs_state; + + return (hdr); +} + +/* * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this * for bufs containing metadata. */ @@ -3867,7 +3529,6 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_FALSE, compression_type, ARC_BUFC_DATA, B_FALSE); - ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, @@ -3908,7 +3569,6 @@ arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE, compression_type, type, B_TRUE); - ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; @@ -3933,6 +3593,76 @@ arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, } static void +l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, + boolean_t state_only) +{ + l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; + l2arc_dev_t *dev = l2hdr->b_dev; + uint64_t lsize = HDR_GET_LSIZE(hdr); + uint64_t psize = HDR_GET_PSIZE(hdr); + uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); + arc_buf_contents_t type = hdr->b_type; + int64_t lsize_s; + int64_t psize_s; + int64_t asize_s; + + if (incr) { + lsize_s = lsize; + psize_s = psize; + asize_s = asize; + } else { + lsize_s = -lsize; + psize_s = -psize; + asize_s = -asize; + } + + /* If the buffer is a prefetch, count it as such. */ + if (HDR_PREFETCH(hdr)) { + ARCSTAT_INCR(arcstat_l2_prefetch_asize, asize_s); + } else { + /* + * We use the value stored in the L2 header upon initial + * caching in L2ARC. This value will be updated in case + * an MRU/MRU_ghost buffer transitions to MFU but the L2ARC + * metadata (log entry) cannot currently be updated. Having + * the ARC state in the L2 header solves the problem of a + * possibly absent L1 header (apparent in buffers restored + * from persistent L2ARC). 
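+ *
+ * The in-memory b_arcs_state copy is refreshed on ARC state changes
+ * (see arc_change_state()); it is only the on-disk log entry that
+ * goes stale.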
+ */ + switch (hdr->b_l2hdr.b_arcs_state) { + case ARC_STATE_MRU_GHOST: + case ARC_STATE_MRU: + ARCSTAT_INCR(arcstat_l2_mru_asize, asize_s); + break; + case ARC_STATE_MFU_GHOST: + case ARC_STATE_MFU: + ARCSTAT_INCR(arcstat_l2_mfu_asize, asize_s); + break; + default: + break; + } + } + + if (state_only) + return; + + ARCSTAT_INCR(arcstat_l2_psize, psize_s); + ARCSTAT_INCR(arcstat_l2_lsize, lsize_s); + + switch (type) { + case ARC_BUFC_DATA: + ARCSTAT_INCR(arcstat_l2_bufc_data_asize, asize_s); + break; + case ARC_BUFC_METADATA: + ARCSTAT_INCR(arcstat_l2_bufc_metadata_asize, asize_s); + break; + default: + break; + } +} + + +static void arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) { l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; @@ -3945,9 +3675,7 @@ arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) list_remove(&dev->l2ad_buflist, hdr); - ARCSTAT_INCR(arcstat_l2_psize, -psize); - ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); - + l2arc_hdr_arcstats_decrement(hdr); vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), @@ -3967,9 +3695,6 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); - if (!HDR_EMPTY(hdr)) - buf_discard_identity(hdr); - if (HDR_HAS_L2HDR(hdr)) { l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); @@ -3993,6 +3718,15 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) mutex_exit(&dev->l2ad_mtx); } + /* + * The header's identity can only be safely discarded once it is no + * longer discoverable. This requires removing it from the hash table + * and the l2arc header list. After this point the hash lock can not + * be used to protect the header. + */ + if (!HDR_EMPTY(hdr)) + buf_discard_identity(hdr); + if (HDR_HAS_L1HDR(hdr)) { arc_cksum_free(hdr); @@ -4006,9 +3740,8 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) } #endif - if (hdr->b_l1hdr.b_pabd != NULL) { + if (hdr->b_l1hdr.b_pabd != NULL) arc_hdr_free_pabd(hdr, B_FALSE); - } if (HDR_HAS_RABD(hdr)) arc_hdr_free_pabd(hdr, B_TRUE); @@ -4033,7 +3766,6 @@ void arc_buf_destroy(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr = buf->b_hdr; - kmutex_t *hash_lock = HDR_LOCK(hdr); if (hdr->b_l1hdr.b_state == arc_anon) { ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); @@ -4043,7 +3775,9 @@ arc_buf_destroy(arc_buf_t *buf, void* tag) return; } + kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); + ASSERT3P(hdr, ==, buf->b_hdr); ASSERT(hdr->b_l1hdr.b_bufcnt > 0); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); @@ -4151,6 +3885,21 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) if (l2arc_write_eligible(hdr->b_spa, hdr)) { ARCSTAT_INCR(arcstat_evict_l2_eligible, HDR_GET_LSIZE(hdr)); + + switch (state->arcs_state) { + case ARC_STATE_MRU: + ARCSTAT_INCR( + arcstat_evict_l2_eligible_mru, + HDR_GET_LSIZE(hdr)); + break; + case ARC_STATE_MFU: + ARCSTAT_INCR( + arcstat_evict_l2_eligible_mfu, + HDR_GET_LSIZE(hdr)); + break; + default: + break; + } } else { ARCSTAT_INCR(arcstat_evict_l2_ineligible, HDR_GET_LSIZE(hdr)); @@ -4873,25 +4622,6 @@ arc_available_memory(void) r = FMR_PAGES_PP_MAXIMUM; } -#if defined(__i386) - /* - * If we're on an i386 platform, it's possible that we'll exhaust the - * kernel heap space before we ever run out of available physical - * memory. Most checks of the size of the heap_area compare against - * tune.t_minarmem, which is the minimum available real memory that we - * can have in the system. However, this is generally fixed at 25 pages - * which is so low that it's useless. 
In this comparison, we seek to - * calculate the total heap-size, and reclaim if more than 3/4ths of the - * heap is allocated. (Or, in the calculation, if less than 1/4th is - * free) - */ - n = (int64_t)vmem_size(heap_arena, VMEM_FREE) - - (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2); - if (n < lowest) { - lowest = n; - r = FMR_HEAP_ARENA; - } -#endif /* * If zio data pages are being allocated out of a separate heap segment, @@ -4954,12 +4684,6 @@ arc_kmem_reap_soon(void) */ dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); } -#if defined(__i386) - /* - * Reclaim unused memory from all kmem caches. - */ - kmem_reap(); -#endif #endif for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { @@ -5154,9 +4878,6 @@ arc_adapt(int bytes, arc_state_t *state) int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size); int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size); - if (state == arc_l2c_only) - return; - ASSERT(bytes > 0); /* * Adapt the target size of the MRU list: @@ -5238,11 +4959,12 @@ arc_is_overflowing(void) } static abd_t * -arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag) +arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag, + boolean_t do_adapt) { arc_buf_contents_t type = arc_buf_type(hdr); - arc_get_data_impl(hdr, size, tag); + arc_get_data_impl(hdr, size, tag, do_adapt); if (type == ARC_BUFC_METADATA) { return (abd_alloc(size, B_TRUE)); } else { @@ -5256,7 +4978,7 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); - arc_get_data_impl(hdr, size, tag); + arc_get_data_impl(hdr, size, tag, B_TRUE); if (type == ARC_BUFC_METADATA) { return (zio_buf_alloc(size)); } else { @@ -5272,12 +4994,14 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) * limit, we'll only signal the reclaim thread and continue on. */ static void -arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) +arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag, + boolean_t do_adapt) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); - arc_adapt(size, state); + if (do_adapt) + arc_adapt(size, state); /* * If arc_size is currently overflowing, and has grown past our @@ -5448,10 +5172,14 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) ASSERT(multilist_link_active( &hdr->b_l1hdr.b_arc_node)); } else { + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); ARCSTAT_BUMP(arcstat_mru_hits); + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); } hdr->b_l1hdr.b_arc_access = now; return; @@ -5480,13 +5208,16 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * was evicted from the cache. Move it to the * MFU state. 
*/ - if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { new_state = arc_mru; if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) { + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); } DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { @@ -5747,8 +5478,6 @@ arc_read_done(zio_t *zio) } arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); - if (l2arc_noprefetch && HDR_PREFETCH(hdr)) - arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); @@ -5801,7 +5530,8 @@ arc_read_done(zio_t *zio) error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(zio->io_spa, &acb->acb_zb); - zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, + (void) zfs_ereport_post( + FM_EREPORT_ZFS_AUTHENTICATION, zio->io_spa, NULL, &acb->acb_zb, zio, 0, 0); } } @@ -6058,7 +5788,7 @@ top: rc = SET_ERROR(EIO); if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, zb); - zfs_ereport_post( + (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0, 0); } @@ -6073,8 +5803,12 @@ top: ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc != EACCES); } else if (*arc_flags & ARC_FLAG_PREFETCH && - zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { + zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); @@ -6099,6 +5833,7 @@ top: boolean_t devw = B_FALSE; uint64_t size; abd_t *hdr_abd; + int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0; if (hdr == NULL) { /* this block is not in the cache */ @@ -6165,8 +5900,9 @@ top: * do this after we've called arc_access() to * avoid hitting an assert in remove_reference(). */ + arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state); arc_access(hdr, hash_lock); - arc_hdr_alloc_pabd(hdr, encrypted_read); + arc_hdr_alloc_pabd(hdr, alloc_flags); } if (encrypted_read) { @@ -6195,8 +5931,13 @@ top: } if (*arc_flags & ARC_FLAG_PREFETCH && - zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) + zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); + } if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); @@ -6266,7 +6007,7 @@ top: * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. - * 5. This isn't prefetch and l2arc_noprefetch is set. + * 5. This isn't prefetch or l2arc_noprefetch is 0. */ if (HDR_HAS_L2HDR(hdr) && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && @@ -6285,6 +6026,17 @@ top: cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; + /* + * When Compressed ARC is disabled, but the + * L2ARC block is compressed, arc_hdr_size() + * will have returned LSIZE rather than PSIZE. 
+ */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + !HDR_COMPRESSION_ENABLED(hdr) && + HDR_GET_PSIZE(hdr) != 0) { + size = HDR_GET_PSIZE(hdr); + } + asize = vdev_psize_to_asize(vd, size); if (asize != size) { abd = abd_alloc_for_io(asize, @@ -6566,7 +6318,7 @@ arc_release(arc_buf_t *buf, void *tag) if (arc_can_share(hdr, lastbuf)) { arc_share_buf(hdr, lastbuf); } else { - arc_hdr_alloc_pabd(hdr, B_FALSE); + arc_hdr_alloc_pabd(hdr, ARC_HDR_DO_ADAPT); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, psize); } @@ -6789,7 +6541,7 @@ arc_write_ready(zio_t *zio) if (ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(psize, >, 0); ASSERT(ARC_BUF_COMPRESSED(buf)); - arc_hdr_alloc_pabd(hdr, B_TRUE); + arc_hdr_alloc_pabd(hdr, ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { /* @@ -6799,16 +6551,17 @@ arc_write_ready(zio_t *zio) */ if (BP_IS_ENCRYPTED(bp)) { ASSERT3U(psize, >, 0); - arc_hdr_alloc_pabd(hdr, B_TRUE); + arc_hdr_alloc_pabd(hdr, + ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && !ARC_BUF_COMPRESSED(buf)) { ASSERT3U(psize, >, 0); - arc_hdr_alloc_pabd(hdr, B_FALSE); + arc_hdr_alloc_pabd(hdr, ARC_HDR_DO_ADAPT); abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); } else { ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); - arc_hdr_alloc_pabd(hdr, B_FALSE); + arc_hdr_alloc_pabd(hdr, ARC_HDR_DO_ADAPT); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); } @@ -6894,8 +6647,8 @@ arc_write_done(zio_t *zio) ASSERT(zfs_refcount_is_zero( &exists->b_l1hdr.b_refcnt)); arc_change_state(arc_anon, exists, hash_lock); - mutex_exit(hash_lock); arc_hdr_destroy(exists); + mutex_exit(hash_lock); exists = buf_hash_insert(hdr, &hash_lock); ASSERT3P(exists, ==, NULL); } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { @@ -7027,10 +6780,6 @@ arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) #ifdef _KERNEL uint64_t available_memory = ptob(freemem); -#if defined(__i386) - available_memory = - MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); -#endif if (freemem > physmem * arc_lotsfree_percent / 100) return (0); @@ -7303,6 +7052,13 @@ arc_state_init(void) aggsum_init(&astat_hdr_size, 0); aggsum_init(&astat_other_size, 0); aggsum_init(&astat_l2_hdr_size, 0); + + arc_anon->arcs_state = ARC_STATE_ANON; + arc_mru->arcs_state = ARC_STATE_MRU; + arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST; + arc_mfu->arcs_state = ARC_STATE_MFU; + arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST; + arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY; } static void @@ -7671,6 +7427,103 @@ arc_fini(void) * * These three functions determine what to write, how much, and how quickly * to send writes. + * + * L2ARC persistence: + * + * When writing buffers to L2ARC, we periodically add some metadata to + * make sure we can pick them up after reboot, thus dramatically reducing + * the impact that any downtime has on the performance of storage systems + * with large caches. + * + * The implementation works fairly simply by integrating the following two + * modifications: + * + * *) When writing to the L2ARC, we occasionally write a "l2arc log block", + * which is an additional piece of metadata which describes what's been + * written. This allows us to rebuild the arc_buf_hdr_t structures of the + * main ARC buffers. There are 2 linked-lists of log blocks headed by + * dh_start_lbps[2]. 
We alternate which chain we append to, so they are
+ * time-wise and offset-wise interleaved, but that is an optimization rather
+ * than for correctness. The log block also includes a pointer to the
+ * previous block in its chain.
+ *
+ * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
+ * for our header bookkeeping purposes. This contains a device header,
+ * which contains our top-level reference structures. We update it each
+ * time we write a new log block, so that we're able to locate it in the
+ * L2ARC device. If this write results in an inconsistent device header
+ * (e.g. due to power failure), we detect this by verifying the header's
+ * checksum and simply fail to reconstruct the L2ARC after reboot.
+ *
+ * Implementation diagram:
+ *
+ * +=== L2ARC device (not to scale) ======================================+
+ * | ___two newest log block pointers__.__________ |
+ * | / \dh_start_lbps[1] |
+ * | / \ \dh_start_lbps[0]|
+ * |.___/__. V V |
+ * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
+ * || hdr| ^ /^ /^ / / |
+ * |+------+ ...--\-------/ \-----/--\------/ / |
+ * | \--------------/ \--------------/ |
+ * +======================================================================+
+ *
+ * As can be seen on the diagram, rather than using a simple linked list,
+ * we use a pair of linked lists with alternating elements. This is a
+ * performance enhancement: with a single list we would only find out the
+ * address of the next log block once the current block had been completely
+ * read in, keeping the device's I/O queue only one operation deep and
+ * incurring a large amount of I/O round-trip latency. Having two lists
+ * allows us to fetch two log blocks ahead of where we are currently
+ * rebuilding L2ARC buffers.
+ *
+ * On-device data structures:
+ *
+ * L2ARC device header: l2arc_dev_hdr_phys_t
+ * L2ARC log block: l2arc_log_blk_phys_t
+ *
+ * L2ARC reconstruction:
+ *
+ * When writing data, we simply write in the standard rotary fashion,
+ * evicting buffers as we go and writing new data over them (writing
+ * a new log block every now and then). This obviously means that once we
+ * loop around the end of the device, we will start cutting into an already
+ * committed log block (and its referenced data buffers), like so:
+ *
+ * current write head__ __old tail
+ * \ /
+ * V V
+ * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |-->
+ * ^ ^^^^^^^^^___________________________________
+ * | \
+ * <<nextwrite>> may overwrite this blk and/or its bufs --'
+ *
+ * When importing the pool, we detect this situation and use it to stop
+ * our scanning process (see l2arc_rebuild).
+ *
+ * There is one significant caveat to consider when rebuilding ARC contents
+ * from an L2ARC device: what about invalidated buffers? Given the above
+ * construction, we cannot update blocks which we've already written to amend
+ * them to remove buffers which were invalidated. Thus, during reconstruction,
+ * we might be populating the cache with buffers for data that's not on the
+ * main pool anymore, or may have been overwritten!
+ *
+ * As it turns out, this isn't a problem. Every arc_read request includes
+ * both the DVA and, crucially, the birth TXG of the BP the caller is
+ * looking for.
So even if the cache were populated by completely rotten
+ * blocks for data that had been long deleted and/or overwritten, we'll
+ * never actually return bad data from the cache, since the DVA together
+ * with the birth TXG uniquely identifies a block in space and time; once
+ * created, a block is immutable on disk. The worst we will have done is
+ * waste some time and memory during l2arc rebuild reconstructing outdated
+ * ARC entries that will get dropped from the l2arc as it is updated with
+ * new blocks.
+ *
+ * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write
+ * hand are not restored. This is done by saving the offset (in bytes)
+ * l2arc_evict() has evicted to in the L2ARC device header and taking it
+ * into account when restoring buffers.
 */

 static boolean_t
@@ -7682,18 +7535,20 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
 /*
 * 2. is already cached on the L2ARC.
 * 3. has an I/O in progress (it may be an incomplete read).
 * 4. is flagged not eligible (zfs property).
+ * 5. is a prefetch and l2arc_noprefetch is set.
 */
 if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) ||
- HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr))
+ HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr) ||
+ (l2arc_noprefetch && HDR_PREFETCH(hdr)))
 return (B_FALSE);

 return (B_TRUE);
 }

 static uint64_t
-l2arc_write_size(void)
+l2arc_write_size(l2arc_dev_t *dev)
 {
- uint64_t size;
+ uint64_t size, dev_size;

 /*
 * Make sure our globals have meaningful values in case the user
@@ -7710,6 +7565,25 @@
 if (arc_warm == B_FALSE)
 size += l2arc_write_boost;

+ /*
+ * Make sure the write size does not exceed the size of the cache
+ * device. This is important in l2arc_evict(), otherwise infinite
+ * iteration can occur.
+ */
+ dev_size = dev->l2ad_end - dev->l2ad_start;
+ if ((size + l2arc_log_blk_overhead(size, dev)) >= dev_size) {
+ cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost "
+ "plus the overhead of log blocks (persistent L2ARC, "
+ "%" PRIu64 " bytes) exceeds the size of the cache device "
+ "(guid %" PRIu64 "), resetting them to the default (%d)",
+ l2arc_log_blk_overhead(size, dev),
+ dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE);
+ size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE;
+
+ if (arc_warm == B_FALSE)
+ size += l2arc_write_boost;
+ }
+
 return (size);
 }

@@ -7775,10 +7649,10 @@ l2arc_dev_get_next(void)
 else if (next == first)
 break;

- } while (vdev_is_dead(next->l2ad_vdev));
+ } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild);

 /* if we were unable to find any usable vdevs, return NULL */
- if (vdev_is_dead(next->l2ad_vdev))
+ if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild)
 next = NULL;

 l2arc_dev_last = next;
@@ -7827,16 +7701,20 @@ static void
 l2arc_write_done(zio_t *zio)
 {
- l2arc_write_callback_t *cb;
- l2arc_dev_t *dev;
- list_t *buflist;
- arc_buf_hdr_t *head, *hdr, *hdr_prev;
- kmutex_t *hash_lock;
- int64_t bytes_dropped = 0;
+ l2arc_write_callback_t *cb;
+ l2arc_lb_abd_buf_t *abd_buf;
+ l2arc_lb_ptr_buf_t *lb_ptr_buf;
+ l2arc_dev_t *dev;
+ l2arc_dev_hdr_phys_t *l2dhdr;
+ list_t *buflist;
+ arc_buf_hdr_t *head, *hdr, *hdr_prev;
+ kmutex_t *hash_lock;
+ int64_t bytes_dropped = 0;

 cb = zio->io_private;
 ASSERT3P(cb, !=, NULL);
 dev = cb->l2wcb_dev;
+ l2dhdr = dev->l2ad_dev_hdr;
 ASSERT3P(dev, !=, NULL);
 head = cb->l2wcb_head;
 ASSERT3P(head, !=, NULL);
@@ -7845,9 +7723,6 @@ l2arc_write_done(zio_t *zio)
 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
 l2arc_write_callback_t *, cb);

- if (zio->io_error
!= 0) - ARCSTAT_BUMP(arcstat_l2_writes_error); - /* * All writes completed, or an error was hit. */ @@ -7907,8 +7782,7 @@ top: arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); uint64_t psize = HDR_GET_PSIZE(hdr); - ARCSTAT_INCR(arcstat_l2_psize, -psize); - ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); + l2arc_hdr_arcstats_decrement(hdr); bytes_dropped += vdev_psize_to_asize(dev->l2ad_vdev, psize); @@ -7925,12 +7799,74 @@ top: mutex_exit(hash_lock); } + /* + * Free the allocated abd buffers for writing the log blocks. + * If the zio failed reclaim the allocated space and remove the + * pointers to these log blocks from the log block pointer list + * of the L2ARC device. + */ + while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) { + abd_free(abd_buf->abd); + zio_buf_free(abd_buf, sizeof (*abd_buf)); + if (zio->io_error != 0) { + lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list); + /* + * L2BLK_GET_PSIZE returns aligned size for log + * blocks. + */ + uint64_t asize = + L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop); + bytes_dropped += asize; + ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); + ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); + zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, + lb_ptr_buf); + zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); + kmem_free(lb_ptr_buf->lb_ptr, + sizeof (l2arc_log_blkptr_t)); + kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); + } + } + list_destroy(&cb->l2wcb_abd_list); + + if (zio->io_error != 0) { + ARCSTAT_BUMP(arcstat_l2_writes_error); + + /* + * Restore the lbps array in the header to its previous state. + * If the list of log block pointers is empty, zero out the + * log block pointers in the device header. + */ + lb_ptr_buf = list_head(&dev->l2ad_lbptr_list); + for (int i = 0; i < 2; i++) { + if (lb_ptr_buf == NULL) { + /* + * If the list is empty zero out the device + * header. Otherwise zero out the second log + * block pointer in the header. + */ + if (i == 0) { + bzero(l2dhdr, dev->l2ad_dev_hdr_asize); + } else { + bzero(&l2dhdr->dh_start_lbps[i], + sizeof (l2arc_log_blkptr_t)); + } + break; + } + bcopy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[i], + sizeof (l2arc_log_blkptr_t)); + lb_ptr_buf = list_next(&dev->l2ad_lbptr_list, + lb_ptr_buf); + } + } + atomic_inc_64(&l2arc_writes_done); list_remove(buflist, head); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); mutex_exit(&dev->l2ad_mtx); + ASSERT(dev->l2ad_vdev != NULL); vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); l2arc_do_free_on_write(); @@ -7965,7 +7901,8 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) * until arc_read_done(). */ if (BP_IS_ENCRYPTED(bp)) { - abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr); + abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, + B_TRUE); zio_crypt_decode_params_bp(bp, salt, iv); zio_crypt_decode_mac_bp(bp, mac); @@ -8001,7 +7938,8 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { - abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr); + abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, + B_TRUE); void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), @@ -8122,7 +8060,6 @@ l2arc_read_done(zio_t *zio) zio->io_private = hdr; arc_read_done(zio); } else { - mutex_exit(hash_lock); /* * Buffer didn't survive caching. Increment stats and * reissue to the original storage device. 
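[Editor's aside, not part of this change: the dh_start_lbps[] rollback above undoes a simple forward step that is performed each time a log block is committed. A minimal user-space C sketch of that forward step follows, consistent with the two-chain scheme described in the persistence comment; log_blkptr_t, log_blk_t, and log_blk_commit() are reduced stand-ins, not the actual ZFS structures.]

#include <stdint.h>

/* Reduced stand-in for l2arc_log_blkptr_t: just a device offset. */
typedef struct log_blkptr {
	uint64_t	lbp_daddr;
} log_blkptr_t;

/* Reduced log block: payload omitted, only the back-pointer kept. */
typedef struct log_blk {
	log_blkptr_t	lb_prev_lbp;	/* previous block in this chain */
} log_blk_t;

/* The two newest chain heads, as in dh_start_lbps[2]. */
static log_blkptr_t dh_start_lbps[2];

/*
 * Commit a new log block at device offset 'daddr': link it behind the
 * older of the two heads, so consecutive blocks alternate between the
 * two chains, then shift the heads down. A rebuild can then keep two
 * reads in flight by walking both chains at once.
 */
static void
log_blk_commit(log_blk_t *lb, uint64_t daddr)
{
	lb->lb_prev_lbp = dh_start_lbps[1];
	dh_start_lbps[1] = dh_start_lbps[0];
	dh_start_lbps[0].lbp_daddr = daddr;
}

int
main(void)
{
	log_blk_t a, b, c;

	log_blk_commit(&a, 1000);
	log_blk_commit(&b, 2000);
	log_blk_commit(&c, 3000);
	/* c chains past b to a: offsets 3000 and 2000 head the two chains. */
	return (c.lb_prev_lbp.lbp_daddr == 1000 ? 0 : 1);
}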
@@ -8147,10 +8084,24 @@ l2arc_read_done(zio_t *zio) ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); - zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, + zio = zio_read(pio, zio->io_spa, zio->io_bp, abd, zio->io_size, arc_read_done, hdr, zio->io_priority, cb->l2rcb_flags, - &cb->l2rcb_zb)); + &cb->l2rcb_zb); + + /* + * Original ZIO will be freed, so we need to update + * ARC header with the new ZIO pointer to be used + * by zio_change_priority() in arc_read(). + */ + for (struct arc_callback *acb = hdr->b_l1hdr.b_acb; + acb != NULL; acb = acb->acb_next) + acb->acb_zio_head = zio; + + mutex_exit(hash_lock); + zio_nowait(zio); + } else { + mutex_exit(hash_lock); } } @@ -8173,7 +8124,7 @@ l2arc_sublist_lock(int list_num) multilist_t *ml = NULL; unsigned int idx; - ASSERT(list_num >= 0 && list_num <= 3); + ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES); switch (list_num) { case 0: @@ -8188,6 +8139,8 @@ l2arc_sublist_lock(int list_num) case 3: ml = arc_mru->arcs_list[ARC_BUFC_DATA]; break; + default: + return (NULL); } /* @@ -8201,8 +8154,31 @@ l2arc_sublist_lock(int list_num) } /* + * Calculates the maximum overhead of L2ARC metadata log blocks for a given + * L2ARC write size. l2arc_evict and l2arc_write_size need to include this + * overhead in processing to make sure there is enough headroom available + * when writing buffers. + */ +static inline uint64_t +l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev) +{ + if (dev->l2ad_log_entries == 0) { + return (0); + } else { + uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT; + + uint64_t log_blocks = (log_entries + + dev->l2ad_log_entries - 1) / + dev->l2ad_log_entries; + + return (vdev_psize_to_asize(dev->l2ad_vdev, + sizeof (l2arc_log_blk_phys_t)) * log_blocks); + } +} + +/* * Evict buffers from the device write hand to the distance specified in - * bytes. This distance may span populated buffers, it may span nothing. + * bytes. This distance may span populated buffers, it may span nothing. * This is clearing a region on the L2ARC device ready for writing. * If the 'all' boolean is set, every buffer is evicted. */ @@ -8213,22 +8189,28 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; + l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev; + boolean_t rerun; buflist = &dev->l2ad_buflist; - if (!all && dev->l2ad_first) { - /* - * This is the first sweep through the device. There is - * nothing to evict. - */ - return; - } + /* + * We need to add in the worst case scenario of log block overhead. + */ + distance += l2arc_log_blk_overhead(distance, dev); - if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { +top: + rerun = B_FALSE; + if (dev->l2ad_hand >= (dev->l2ad_end - distance)) { /* - * When nearing the end of the device, evict to the end - * before the device write hand jumps to the start. + * When there is no space to accommodate upcoming writes, + * evict to the end. Then bump the write and evict hands + * to the start and iterate. This iteration does not + * happen indefinitely as we make sure in + * l2arc_write_size() that when the write hand is reset, + * the write size does not exceed the end of the device. 
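+ * Note that the distance checked here already includes the worst-case
+ * log block overhead added at the top of l2arc_evict(), so the evicted
+ * region leaves room for both the buffers and their log blocks.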
*/ + rerun = B_TRUE; taddr = dev->l2ad_end; } else { taddr = dev->l2ad_hand + distance; @@ -8236,11 +8218,68 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, uint64_t, taddr, boolean_t, all); -top: + /* + * This check has to be placed after deciding whether to iterate + * (rerun). + */ + if (!all && dev->l2ad_first) { + /* + * This is the first sweep through the device. There is + * nothing to evict. + */ + goto out; + } + + /* + * When rebuilding L2ARC we retrieve the evict hand from the header of + * the device. Of note, l2arc_evict() does not actually delete buffers + * from the cache device, but keeping track of the evict hand will be + * useful when TRIM is implemented. + */ + dev->l2ad_evict = MAX(dev->l2ad_evict, taddr); + +retry: mutex_enter(&dev->l2ad_mtx); + /* + * We have to account for evicted log blocks. Run vdev_space_update() + * on log blocks whose offset (in bytes) is before the evicted offset + * (in bytes) by searching in the list of pointers to log blocks + * present in the L2ARC device. + */ + for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf; + lb_ptr_buf = lb_ptr_buf_prev) { + + lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf); + + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + uint64_t asize = L2BLK_GET_PSIZE( + (lb_ptr_buf->lb_ptr)->lbp_prop); + + /* + * We don't worry about log blocks left behind (ie + * lbp_payload_start < l2ad_hand) because l2arc_write_buffers() + * will never write more than l2arc_evict() evicts. + */ + if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) { + break; + } else { + vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); + ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); + ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); + zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, + lb_ptr_buf); + zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); + list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf); + kmem_free(lb_ptr_buf->lb_ptr, + sizeof (l2arc_log_blkptr_t)); + kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); + } + } + for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); + ASSERT(!HDR_EMPTY(hdr)); hash_lock = HDR_LOCK(hdr); /* @@ -8256,7 +8295,7 @@ top: mutex_exit(&dev->l2ad_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); - goto top; + goto retry; } /* @@ -8268,7 +8307,7 @@ top: ASSERT(!HDR_L2_WRITING(hdr)); ASSERT(!HDR_L2_WRITE_HEAD(hdr)); - if (!all && (hdr->b_l2hdr.b_daddr >= taddr || + if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict || hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, @@ -8305,6 +8344,26 @@ top: mutex_exit(hash_lock); } mutex_exit(&dev->l2ad_mtx); + +out: + /* + * We need to check if we evict all buffers, otherwise we may iterate + * unnecessarily. + */ + if (!all && rerun) { + /* + * Bump device hand to the device start if it is approaching the + * end. l2arc_evict() has already evicted ahead for this case. 
+ */ + dev->l2ad_hand = dev->l2ad_start; + dev->l2ad_evict = dev->l2ad_start; + dev->l2ad_first = B_FALSE; + goto top; + } + + ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); + if (!dev->l2ad_first) + ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict); } /* @@ -8424,6 +8483,17 @@ error: return (ret); } +static void +l2arc_blk_fetch_done(zio_t *zio) +{ + l2arc_read_callback_t *cb; + + cb = zio->io_private; + if (cb->l2rcb_abd != NULL) + abd_put(cb->l2rcb_abd); + kmem_free(cb, sizeof (l2arc_read_callback_t)); +} + /* * Find and write ARC buffers to the L2ARC device. * @@ -8433,17 +8503,19 @@ error: * state between calls to this function. * * Returns the number of bytes actually written (which may be smaller than - * the delta by which the device hand has changed due to alignment). + * the delta by which the device hand has changed due to alignment and the + * writing of log blocks). */ static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { - arc_buf_hdr_t *hdr, *hdr_prev, *head; - uint64_t write_asize, write_psize, write_lsize, headroom; - boolean_t full; - l2arc_write_callback_t *cb; - zio_t *pio, *wzio; - uint64_t guid = spa_load_guid(spa); + arc_buf_hdr_t *hdr, *hdr_prev, *head; + uint64_t write_asize, write_psize, write_lsize, headroom; + boolean_t full; + l2arc_write_callback_t *cb = NULL; + zio_t *pio, *wzio; + uint64_t guid = spa_load_guid(spa); + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev->l2ad_vdev, !=, NULL); @@ -8456,7 +8528,16 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) /* * Copy buffers for L2ARC writing. */ - for (int try = 0; try <= 3; try++) { + for (int try = 0; try < L2ARC_FEED_TYPES; try++) { + /* + * If try == 1 or 3, we cache MRU metadata and data + * respectively. + */ + if (l2arc_mfuonly) { + if (try == 1 || try == 3) + continue; + } + multilist_sublist_t *mls = l2arc_sublist_lock(try); uint64_t passed_sz = 0; @@ -8495,7 +8576,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) } passed_sz += HDR_GET_LSIZE(hdr); - if (passed_sz > headroom) { + if (l2arc_headroom != 0 && passed_sz > headroom) { /* * Searched too far. */ @@ -8508,12 +8589,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) continue; } - /* - * We rely on the L1 portion of the header below, so - * it's invalid for this header to have been evicted out - * of the ghost cache, prior to being written out. The - * ARC_FLAG_L2_WRITING bit ensures this won't happen. - */ ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); @@ -8537,12 +8612,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) * ARC_FLAG_L2_WRITING bit ensures this won't happen. */ arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING); - ASSERT(HDR_HAS_L1HDR(hdr)); - - ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); - ASSERT(hdr->b_l1hdr.b_pabd != NULL || - HDR_HAS_RABD(hdr)); - ASSERT3U(arc_hdr_size(hdr), >, 0); /* * If this header has b_rabd, we can use this since it @@ -8595,12 +8664,21 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; + /* + * Create a list to save allocated abd buffers + * for l2arc_log_blk_commit(). 
+ */ + list_create(&cb->l2wcb_abd_list, + sizeof (l2arc_lb_abd_buf_t), + offsetof(l2arc_lb_abd_buf_t, node)); pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); } hdr->b_l2hdr.b_dev = dev; hdr->b_l2hdr.b_daddr = dev->l2ad_hand; + hdr->b_l2hdr.b_arcs_state = + hdr->b_l1hdr.b_state->arcs_state; arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR); @@ -8624,10 +8702,19 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) write_psize += psize; write_asize += asize; dev->l2ad_hand += asize; + l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); mutex_exit(hash_lock); + /* + * Append buf info to current log and commit if full. + * arcstat_l2_{size,asize} kstats are updated + * internally. + */ + if (l2arc_log_blk_insert(dev, hdr)) + l2arc_log_blk_commit(dev, pio, cb); + (void) zio_nowait(wzio); } @@ -8642,31 +8729,47 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ASSERT0(write_lsize); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); + + /* + * Although we did not write any buffers l2ad_evict may + * have advanced. + */ + if (dev->l2ad_evict != l2dhdr->dh_evict) + l2arc_dev_hdr_update(dev); + return (0); } + if (!dev->l2ad_first) + ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); + ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); - ARCSTAT_INCR(arcstat_l2_lsize, write_lsize); - ARCSTAT_INCR(arcstat_l2_psize, write_psize); - - /* - * Bump device hand to the device start if it is approaching the end. - * l2arc_evict() will already have evicted ahead for this case. - */ - if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { - dev->l2ad_hand = dev->l2ad_start; - dev->l2ad_first = B_FALSE; - } dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); dev->l2ad_writing = B_FALSE; + /* + * Update the device header after the zio completes as + * l2arc_write_done() may have updated the memory holding the log block + * pointers in the device header. + */ + l2arc_dev_hdr_update(dev); + return (write_asize); } +static boolean_t +l2arc_hdr_limit_reached(void) +{ + int64_t s = aggsum_upper_bound(&astat_l2_hdr_size); + + return (arc_reclaim_needed() || (s > arc_meta_limit * 3 / 4) || + (s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100)); +} + /* * This thread feeds the L2ARC at regular intervals. This is the beating * heart of the L2ARC. @@ -8732,7 +8835,7 @@ l2arc_feed_thread(void *unused) /* * Avoid contributing to memory pressure. */ - if (arc_reclaim_needed()) { + if (l2arc_hdr_limit_reached()) { ARCSTAT_BUMP(arcstat_l2_abort_lowmem); spa_config_exit(spa, SCL_L2ARC, dev); continue; @@ -8740,7 +8843,7 @@ l2arc_feed_thread(void *unused) ARCSTAT_BUMP(arcstat_l2_feeds); - size = l2arc_write_size(); + size = l2arc_write_size(dev); /* * Evict L2ARC buffers that will be overwritten. @@ -8768,7 +8871,17 @@ l2arc_feed_thread(void *unused) boolean_t l2arc_vdev_present(vdev_t *vd) { - l2arc_dev_t *dev; + return (l2arc_vdev_get(vd) != NULL); +} + +/* + * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if + * the vdev_t isn't an L2ARC device. 
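+ * l2arc_dev_mtx is taken and dropped internally, so callers must not + * already hold it when calling this function.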
+ */ +static l2arc_dev_t * +l2arc_vdev_get(vdev_t *vd) +{ + l2arc_dev_t *dev; + mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev != NULL; @@ -8778,7 +8891,7 @@ l2arc_vdev_present(vdev_t *vd) } mutex_exit(&l2arc_dev_mtx); - return (dev != NULL); + return (dev); } /* @@ -8788,7 +8901,8 @@ void l2arc_add_vdev(spa_t *spa, vdev_t *vd) { - l2arc_dev_t *adddev; + l2arc_dev_t *adddev; + uint64_t l2dhdr_asize; ASSERT(!l2arc_vdev_present(vd)); @@ -8798,11 +8912,17 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; - adddev->l2ad_start = VDEV_LABEL_START_SIZE; + /* leave extra size for an l2arc device header */ + l2dhdr_asize = adddev->l2ad_dev_hdr_asize = + MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift); + adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); + ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end); adddev->l2ad_hand = adddev->l2ad_start; + adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; + adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP); mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); /* @@ -8812,8 +8932,17 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); + /* + * This is a list of pointers to log blocks that are still present + * on the device. + */ + list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t), + offsetof(l2arc_lb_ptr_buf_t, node)); + vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); zfs_refcount_create(&adddev->l2ad_alloc); + zfs_refcount_create(&adddev->l2ad_lb_asize); + zfs_refcount_create(&adddev->l2ad_lb_count); /* * Add device to global list */ @@ -8822,6 +8951,82 @@ list_insert_head(l2arc_dev_list, adddev); atomic_inc_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); + + /* + * Decide if vdev is eligible for L2ARC rebuild + */ + l2arc_rebuild_vdev(adddev->l2ad_vdev, B_FALSE); +} + +void +l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) +{ + l2arc_dev_t *dev = NULL; + l2arc_dev_hdr_phys_t *l2dhdr; + uint64_t l2dhdr_asize; + spa_t *spa; + + dev = l2arc_vdev_get(vd); + ASSERT3P(dev, !=, NULL); + spa = dev->l2ad_spa; + l2dhdr = dev->l2ad_dev_hdr; + l2dhdr_asize = dev->l2ad_dev_hdr_asize; + + /* + * The L2ARC has to hold at least the payload of one log block for + * log blocks to be restored (persistent L2ARC). The payload of a log + * block depends on the number of its log entries. We always write log + * blocks with 1022 entries. How many of them are committed or restored + * depends on the size of the L2ARC device. Thus the maximum payload of + * one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB. If the L2ARC device + * is smaller than that, we reduce the number of committed and restored + * log entries per block to enable persistence. + */ + if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) { + dev->l2ad_log_entries = 0; + } else { + dev->l2ad_log_entries = MIN((dev->l2ad_end - + dev->l2ad_start) >> SPA_MAXBLOCKSHIFT, + L2ARC_LOG_BLK_MAX_ENTRIES); + } + + /* + * Read the device header; if an error is returned, do not rebuild L2ARC.
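+ * If the header cannot be read or fails validation, and the pool is + * writeable, the code below falls through to the spa_writeable() branch, + * which zeroes the in-core header and writes a fresh one to disk.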
+ */ + if (l2arc_dev_hdr_read(dev) == 0 && dev->l2ad_log_entries > 0) { + /* + * If we are onlining a cache device (vdev_reopen) that was + * still present (l2arc_vdev_present()) and rebuild is enabled, + * we should evict all ARC buffers and pointers to log blocks + * and reclaim their space before restoring its contents to + * L2ARC. + */ + if (reopen) { + if (!l2arc_rebuild_enabled) { + return; + } else { + l2arc_evict(dev, 0, B_TRUE); + /* start a new log block */ + dev->l2ad_log_ent_idx = 0; + dev->l2ad_log_blk_payload_asize = 0; + dev->l2ad_log_blk_payload_start = 0; + } + } + /* + * Just mark the device as pending for a rebuild. We won't + * be starting a rebuild in line here as it would block pool + * import. Instead spa_load_impl will hand that off to an + * async task which will call l2arc_spa_rebuild_start. + */ + dev->l2ad_rebuild = B_TRUE; + } else if (spa_writeable(spa)) { + /* + * In this case create a new header. We zero out the memory + * holding the header to reset dh_start_lbps. + */ + bzero(l2dhdr, l2dhdr_asize); + l2arc_dev_hdr_update(dev); + } } /* @@ -8830,24 +9035,29 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) void l2arc_remove_vdev(vdev_t *vd) { - l2arc_dev_t *dev, *nextdev, *remdev = NULL; + l2arc_dev_t *remdev = NULL; /* * Find the device by vdev */ - mutex_enter(&l2arc_dev_mtx); - for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { - nextdev = list_next(l2arc_dev_list, dev); - if (vd == dev->l2ad_vdev) { - remdev = dev; - break; - } - } + remdev = l2arc_vdev_get(vd); ASSERT3P(remdev, !=, NULL); /* + * Cancel any ongoing or scheduled rebuild. + */ + mutex_enter(&l2arc_rebuild_thr_lock); + if (remdev->l2ad_rebuild_began == B_TRUE) { + remdev->l2ad_rebuild_cancel = B_TRUE; + while (remdev->l2ad_rebuild == B_TRUE) + cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock); + } + mutex_exit(&l2arc_rebuild_thr_lock); + + /* * Remove device from global list */ + mutex_enter(&l2arc_dev_mtx); list_remove(l2arc_dev_list, remdev); l2arc_dev_last = NULL; /* may have been invalidated */ atomic_dec_64(&l2arc_ndev); @@ -8858,8 +9068,13 @@ l2arc_remove_vdev(vdev_t *vd) */ l2arc_evict(remdev, 0, B_TRUE); list_destroy(&remdev->l2ad_buflist); + ASSERT(list_is_empty(&remdev->l2ad_lbptr_list)); + list_destroy(&remdev->l2ad_lbptr_list); mutex_destroy(&remdev->l2ad_mtx); zfs_refcount_destroy(&remdev->l2ad_alloc); + zfs_refcount_destroy(&remdev->l2ad_lb_asize); + zfs_refcount_destroy(&remdev->l2ad_lb_count); + kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize); kmem_free(remdev, sizeof (l2arc_dev_t)); } @@ -8873,6 +9088,8 @@ l2arc_init(void) mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); @@ -8897,6 +9114,8 @@ l2arc_fini(void) mutex_destroy(&l2arc_feed_thr_lock); cv_destroy(&l2arc_feed_thr_cv); + mutex_destroy(&l2arc_rebuild_thr_lock); + cv_destroy(&l2arc_rebuild_thr_cv); mutex_destroy(&l2arc_dev_mtx); mutex_destroy(&l2arc_free_on_write_mtx); @@ -8927,3 +9146,916 @@ l2arc_stop(void) cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); mutex_exit(&l2arc_feed_thr_lock); } + +/* + * Punches out rebuild threads for the L2ARC devices in a spa. 
This should + * be called after pool import from the spa async thread, since starting + * these threads directly from spa_import() will make them part of the + * "zpool import" context and delay process exit (and thus pool import). + */ +void +l2arc_spa_rebuild_start(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + /* + * Locate the spa's l2arc devices and kick off rebuild threads. + */ + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + l2arc_dev_t *dev = + l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); + if (dev == NULL) { + /* Don't attempt a rebuild if the vdev is UNAVAIL */ + continue; + } + mutex_enter(&l2arc_rebuild_thr_lock); + if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) { + dev->l2ad_rebuild_began = B_TRUE; + (void) thread_create(NULL, 0, + (void (*)(void *))l2arc_dev_rebuild_start, + dev, 0, &p0, TS_RUN, minclsyspri); + } + mutex_exit(&l2arc_rebuild_thr_lock); + } +} + +/* + * Main entry point for L2ARC rebuilding. + */ +static void +l2arc_dev_rebuild_start(l2arc_dev_t *dev) +{ + VERIFY(!dev->l2ad_rebuild_cancel); + VERIFY(dev->l2ad_rebuild); + (void) l2arc_rebuild(dev); + mutex_enter(&l2arc_rebuild_thr_lock); + dev->l2ad_rebuild_began = B_FALSE; + dev->l2ad_rebuild = B_FALSE; + mutex_exit(&l2arc_rebuild_thr_lock); + + thread_exit(); +} + +/* + * This function implements the actual L2ARC metadata rebuild. It: + * starts reading the log block chain and restores each block's contents + * to memory (reconstructing arc_buf_hdr_t's). + * + * Operation stops under any of the following conditions: + * + * 1) We reach the end of the log block chain. + * 2) We encounter *any* error condition (cksum errors, io errors) + */ +static int +l2arc_rebuild(l2arc_dev_t *dev) +{ + vdev_t *vd = dev->l2ad_vdev; + spa_t *spa = vd->vdev_spa; + int err = 0; + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + l2arc_log_blk_phys_t *this_lb, *next_lb; + zio_t *this_io = NULL, *next_io = NULL; + l2arc_log_blkptr_t lbps[2]; + l2arc_lb_ptr_buf_t *lb_ptr_buf; + boolean_t lock_held; + + this_lb = kmem_zalloc(sizeof (*this_lb), KM_SLEEP); + next_lb = kmem_zalloc(sizeof (*next_lb), KM_SLEEP); + + /* + * We prevent device removal while issuing reads to the device, + * then during the rebuilding phases we drop this lock again so + * that a spa_unload or device remove can be initiated - this is + * safe, because the spa will signal us to stop before removing + * our device and wait for us to stop. + */ + spa_config_enter(spa, SCL_L2ARC, vd, RW_READER); + lock_held = B_TRUE; + + /* + * Retrieve the persistent L2ARC device state. + * L2BLK_GET_PSIZE returns aligned size for log blocks. + */ + dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start); + dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr + + L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop), + dev->l2ad_start); + dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); + + /* + * In case the zfs module parameter l2arc_rebuild_enabled is false + * we do not start the rebuild process. + */ + if (!l2arc_rebuild_enabled) + goto out; + + /* Prepare the rebuild process */ + bcopy(l2dhdr->dh_start_lbps, lbps, sizeof (lbps)); + + /* Start the rebuild process */ + for (;;) { + if (!l2arc_log_blkptr_valid(dev, &lbps[0])) + break; + + if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1], + this_lb, next_lb, this_io, &next_io)) != 0) + goto out; + + /* + * Our memory pressure valve. If the system is running low + * on memory, rather than swamping memory with new ARC buf + * hdrs, we opt not to rebuild the L2ARC. 
At this point, + * however, we have already set up our L2ARC dev to chain in + * new metadata log blocks, so the user may choose to offline/ + * online the L2ARC dev at a later time (or re-import the pool) + * to reconstruct it (when there's less memory pressure). + */ + if (l2arc_hdr_limit_reached()) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem); + cmn_err(CE_NOTE, "System running low on memory, " + "aborting L2ARC rebuild."); + err = SET_ERROR(ENOMEM); + goto out; + } + + spa_config_exit(spa, SCL_L2ARC, vd); + lock_held = B_FALSE; + + /* + * Now that we know that the next_lb checks out alright, we + * can start reconstruction from this log block. + * L2BLK_GET_PSIZE returns aligned size for log blocks. + */ + uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); + l2arc_log_blk_restore(dev, this_lb, asize); + + /* + * Log block restored; include its pointer in the list of + * pointers to log blocks present in the L2ARC device. + */ + lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); + lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), + KM_SLEEP); + bcopy(&lbps[0], lb_ptr_buf->lb_ptr, + sizeof (l2arc_log_blkptr_t)); + mutex_enter(&dev->l2ad_mtx); + list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf); + ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); + ARCSTAT_BUMP(arcstat_l2_log_blk_count); + zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); + zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); + mutex_exit(&dev->l2ad_mtx); + vdev_space_update(vd, asize, 0, 0); + + /* BEGIN CSTYLED */ + /* + * Protection against loops of log blocks: + * + * l2ad_hand l2ad_evict + * V V + * l2ad_start |=======================================| l2ad_end + * -----|||----|||---|||----||| + * (3) (2) (1) (0) + * ---|||---|||----|||---||| + * (7) (6) (5) (4) + * + * In this situation the pointer of log block (4) passes + * l2arc_log_blkptr_valid() but the log block should not be + * restored as it is overwritten by the payload of log block + * (0). Only log blocks (0)-(3) should be restored. We check + * whether l2ad_evict lies in between the payload starting + * offset of the next log block (lbps[1].lbp_payload_start) + * and the payload starting offset of the present log block + * (lbps[0].lbp_payload_start). If true and this isn't the + * first pass, we are looping from the beginning and we should + * stop. + */ + /* END CSTYLED */ + if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, + lbps[0].lbp_payload_start, dev->l2ad_evict) && + !dev->l2ad_first) + goto out; + + for (;;) { + mutex_enter(&l2arc_rebuild_thr_lock); + if (dev->l2ad_rebuild_cancel) { + dev->l2ad_rebuild = B_FALSE; + cv_signal(&l2arc_rebuild_thr_cv); + mutex_exit(&l2arc_rebuild_thr_lock); + err = SET_ERROR(ECANCELED); + goto out; + } + mutex_exit(&l2arc_rebuild_thr_lock); + if (spa_config_tryenter(spa, SCL_L2ARC, vd, + RW_READER)) { + lock_held = B_TRUE; + break; + } + /* + * The L2ARC config lock is held by somebody as writer, + * possibly because they are trying to remove us. They + * will likely want us to shut down, so after a little + * delay we check l2ad_rebuild_cancel and retry + * the lock. + */ + delay(1); + } + + /* + * Continue with the next log block.
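+ * The walk proceeds backwards in time: lbps[0] takes over the pointer + * just processed, lbps[1] becomes its predecessor from lb_prev_lbp, and + * the this/next buffers and I/Os rotate so the prefetch already issued + * for the next block becomes the current read.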
+ */ + lbps[0] = lbps[1]; + lbps[1] = this_lb->lb_prev_lbp; + PTR_SWAP(this_lb, next_lb); + this_io = next_io; + next_io = NULL; + } + + if (this_io != NULL) + l2arc_log_blk_fetch_abort(this_io); +out: + if (next_io != NULL) + l2arc_log_blk_fetch_abort(next_io); + kmem_free(this_lb, sizeof (*this_lb)); + kmem_free(next_lb, sizeof (*next_lb)); + + if (!l2arc_rebuild_enabled) { + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "disabled"); + } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) { + ARCSTAT_BUMP(arcstat_l2_rebuild_success); + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "successful, restored %llu blocks", + (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); + } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) { + /* + * No error but also nothing restored, meaning the lbps array + * in the device header points to invalid/non-present log + * blocks. Reset the header. + */ + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "no valid log blocks"); + bzero(l2dhdr, dev->l2ad_dev_hdr_asize); + l2arc_dev_hdr_update(dev); + } else if (err == ECANCELED) { + /* + * In case the rebuild was canceled, do not log to the spa + * history log, as the pool may be in the process of being + * removed. + */ + zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks", + zfs_refcount_count(&dev->l2ad_lb_count)); + } else if (err != 0) { + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "aborted, restored %llu blocks", + (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); + } + + if (lock_held) + spa_config_exit(spa, SCL_L2ARC, vd); + + return (err); +} + +/* + * Attempts to read the device header on the provided L2ARC device and writes + * it to `dev->l2ad_dev_hdr'. On success, this function returns 0; otherwise, + * the appropriate error code is returned. + */ +static int +l2arc_dev_hdr_read(l2arc_dev_t *dev) +{ + int err; + uint64_t guid; + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; + abd_t *abd; + + guid = spa_guid(dev->l2ad_vdev->vdev_spa); + + abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); + + err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, + VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, + ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_SPECULATIVE, B_FALSE)); + + abd_put(abd); + + if (err != 0) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors); + zfs_dbgmsg("L2ARC IO error (%d) while reading device header, " + "vdev guid: %llu", err, dev->l2ad_vdev->vdev_guid); + return (err); + } + + if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) + byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr)); + + if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC || + l2dhdr->dh_spa_guid != guid || + l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid || + l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION || + l2dhdr->dh_log_entries != dev->l2ad_log_entries || + l2dhdr->dh_end != dev->l2ad_end || + !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end, + l2dhdr->dh_evict)) { + /* + * Attempt to rebuild a device containing no actual dev hdr + * or containing a header from some other pool or from another + * version of persistent L2ARC. + */ + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported); + return (SET_ERROR(ENOTSUP)); + } + + return (0); +} + +/* + * Reads L2ARC log blocks from storage and validates their contents.
+ * + * This function implements a simple fetcher to make sure that while + * we're processing one buffer the L2ARC is already fetching the next + * one in the chain. + * + * The arguments this_lbp and next_lbp point to the current and next log block + * addresses in the block chain. Similarly, this_lb and next_lb hold the + * l2arc_log_blk_phys_t's of the current and next L2ARC log blocks. + * + * The `this_io' and `next_io' arguments are used for block fetching. + * When issuing the first blk IO during rebuild, you should pass NULL for + * `this_io'. This function will then issue a sync IO to read the block and + * also issue an async IO to fetch the next block in the block chain. The + * fetched IO is returned in `next_io'. On subsequent calls to this + * function, pass the value returned in `next_io' from the previous call + * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO. + * Prior to the call, you should initialize your `next_io' pointer to be + * NULL. If no fetch IO was issued, the pointer is left set at NULL. + * + * On success, this function returns 0; otherwise, it returns an appropriate + * error code. On error, the fetch IO is aborted and cleared before + * returning from this function. Therefore, if we return `success', the + * caller can assume that we have taken care of cleanup of fetch IOs. + */ +static int +l2arc_log_blk_read(l2arc_dev_t *dev, + const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp, + l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, + zio_t *this_io, zio_t **next_io) +{ + int err = 0; + zio_cksum_t cksum; + abd_t *abd = NULL; + uint64_t asize; + + ASSERT(this_lbp != NULL && next_lbp != NULL); + ASSERT(this_lb != NULL && next_lb != NULL); + ASSERT(next_io != NULL && *next_io == NULL); + ASSERT(l2arc_log_blkptr_valid(dev, this_lbp)); + + /* + * Check to see if we have issued the IO for this log block in a + * previous run. If not, this is the first call, so issue it now. + */ + if (this_io == NULL) { + this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp, + this_lb); + } + + /* + * Peek to see if we can start issuing the next IO immediately. + */ + if (l2arc_log_blkptr_valid(dev, next_lbp)) { + /* + * Start issuing IO for the next log block early - this + * should help keep the L2ARC device busy while we + * decompress and restore this log block. + */ + *next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp, + next_lb); + } + + /* Wait for the IO to read this log block to complete */ + if ((err = zio_wait(this_io)) != 0) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors); + zfs_dbgmsg("L2ARC IO error (%d) while reading log block, " + "offset: %llu, vdev guid: %llu", err, this_lbp->lbp_daddr, + dev->l2ad_vdev->vdev_guid); + goto cleanup; + } + + /* + * Make sure the buffer checks out. + * L2BLK_GET_PSIZE returns aligned size for log blocks.
+ */ + asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop); + fletcher_4_native(this_lb, asize, NULL, &cksum); + if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors); + zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, " + "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu", + this_lbp->lbp_daddr, dev->l2ad_vdev->vdev_guid, + dev->l2ad_hand, dev->l2ad_evict); + err = SET_ERROR(ECKSUM); + goto cleanup; + } + + /* Now we can take our time decoding this buffer */ + switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) { + case ZIO_COMPRESS_OFF: + break; + case ZIO_COMPRESS_LZ4: + abd = abd_alloc_for_io(asize, B_TRUE); + abd_copy_from_buf_off(abd, this_lb, 0, asize); + if ((err = zio_decompress_data( + L2BLK_GET_COMPRESS((this_lbp)->lbp_prop), + abd, this_lb, asize, sizeof (*this_lb))) != 0) { + err = SET_ERROR(EINVAL); + goto cleanup; + } + break; + default: + err = SET_ERROR(EINVAL); + goto cleanup; + } + if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) + byteswap_uint64_array(this_lb, sizeof (*this_lb)); + if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) { + err = SET_ERROR(EINVAL); + goto cleanup; + } +cleanup: + /* Abort an in-flight fetch I/O in case of error */ + if (err != 0 && *next_io != NULL) { + l2arc_log_blk_fetch_abort(*next_io); + *next_io = NULL; + } + if (abd != NULL) + abd_free(abd); + return (err); +} + +/* + * Restores the payload of a log block to ARC. This creates empty ARC hdr + * entries which only contain an l2arc hdr, essentially restoring the + * buffers to their L2ARC evicted state. This function also updates space + * usage on the L2ARC vdev to make sure it tracks restored buffers. + */ +static void +l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, + uint64_t lb_asize) +{ + uint64_t size = 0, asize = 0; + uint64_t log_entries = dev->l2ad_log_entries; + + /* + * Usually arc_adapt() is called only for data, not headers, but + * since we may allocate significant amount of memory here, let ARC + * grow its arc_c. + */ + arc_adapt(log_entries * HDR_L2ONLY_SIZE, arc_l2c_only); + + for (int i = log_entries - 1; i >= 0; i--) { + /* + * Restore goes in the reverse temporal direction to preserve + * correct temporal ordering of buffers in the l2ad_buflist. + * l2arc_hdr_restore also does a list_insert_tail instead of + * list_insert_head on the l2ad_buflist: + * + * LIST l2ad_buflist LIST + * HEAD <------ (time) ------ TAIL + * direction +-----+-----+-----+-----+-----+ direction + * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild + * fill +-----+-----+-----+-----+-----+ + * ^ ^ + * | | + * | | + * l2arc_feed_thread l2arc_rebuild + * will place new bufs here restores bufs here + * + * During l2arc_rebuild() the device is not used by + * l2arc_feed_thread() as dev->l2ad_rebuild is set to true. 
*/ + size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop); + asize += vdev_psize_to_asize(dev->l2ad_vdev, + L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop)); + l2arc_hdr_restore(&lb->lb_entries[i], dev); + } + + /* + * Record rebuild stats: + * size Logical size of restored buffers in the L2ARC + * asize Aligned size of restored buffers in the L2ARC + */ + ARCSTAT_INCR(arcstat_l2_rebuild_size, size); + ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize); + ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries); + ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize); + ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize); + ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks); +} + +/* + * Restores a single ARC buf hdr from a log entry. The ARC buffer is put + * into a state indicating that it has been evicted to L2ARC. + */ +static void +l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev) +{ + arc_buf_hdr_t *hdr, *exists; + kmutex_t *hash_lock; + arc_buf_contents_t type = L2BLK_GET_TYPE((le)->le_prop); + uint64_t asize; + + /* + * Do all the allocation before grabbing any locks; this lets us + * sleep if memory is full, and we don't have to deal with failed + * allocations. + */ + hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type, + dev, le->le_dva, le->le_daddr, + L2BLK_GET_PSIZE((le)->le_prop), le->le_birth, + L2BLK_GET_COMPRESS((le)->le_prop), + L2BLK_GET_PROTECTED((le)->le_prop), + L2BLK_GET_PREFETCH((le)->le_prop), + L2BLK_GET_STATE((le)->le_prop)); + asize = vdev_psize_to_asize(dev->l2ad_vdev, + L2BLK_GET_PSIZE((le)->le_prop)); + + /* + * vdev_space_update() has to be called before arc_hdr_destroy() to + * avoid underflow since the latter also calls vdev_space_update(). + */ + l2arc_hdr_arcstats_increment(hdr); + vdev_space_update(dev->l2ad_vdev, asize, 0, 0); + + mutex_enter(&dev->l2ad_mtx); + list_insert_tail(&dev->l2ad_buflist, hdr); + (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); + mutex_exit(&dev->l2ad_mtx); + + exists = buf_hash_insert(hdr, &hash_lock); + if (exists) { + /* Buffer was already cached, no need to restore it. */ + arc_hdr_destroy(hdr); + /* + * If the buffer is already cached, check whether it has + * L2ARC metadata. If not, add it and update the flag. + * This is important in case of onlining a cache device, since + * we previously evicted all L2ARC metadata from ARC. + */ + if (!HDR_HAS_L2HDR(exists)) { + arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR); + exists->b_l2hdr.b_dev = dev; + exists->b_l2hdr.b_daddr = le->le_daddr; + exists->b_l2hdr.b_arcs_state = + L2BLK_GET_STATE((le)->le_prop); + mutex_enter(&dev->l2ad_mtx); + list_insert_tail(&dev->l2ad_buflist, exists); + (void) zfs_refcount_add_many(&dev->l2ad_alloc, + arc_hdr_size(exists), exists); + mutex_exit(&dev->l2ad_mtx); + l2arc_hdr_arcstats_increment(exists); + vdev_space_update(dev->l2ad_vdev, asize, 0, 0); + } + ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached); + } + + mutex_exit(hash_lock); +} + +/* + * Starts an asynchronous read IO to read a log block. This is used in log + * block reconstruction to start reading the next block before we are done + * decoding and reconstructing the current block, to keep the l2arc device + * nice and hot with read IO to process. + * The returned zio will contain newly allocated memory buffers for the IO + * data which should then be freed by the caller once the zio is no longer + * needed (i.e. due to it having completed).
If you wish to abort this + * zio, you should do so using l2arc_log_blk_fetch_abort, which takes + * care of disposing of the allocated buffers correctly. + */ +static zio_t * +l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp, + l2arc_log_blk_phys_t *lb) +{ + uint32_t asize; + zio_t *pio; + l2arc_read_callback_t *cb; + + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); + ASSERT(asize <= sizeof (l2arc_log_blk_phys_t)); + + cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); + cb->l2rcb_abd = abd_get_from_buf(lb, asize); + pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY); + (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize, + cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE)); + + return (pio); +} + +/* + * Aborts a zio returned from l2arc_log_blk_fetch and frees the data + * buffers allocated for it. + */ +static void +l2arc_log_blk_fetch_abort(zio_t *zio) +{ + (void) zio_wait(zio); +} + +/* + * Creates a zio to update the device header on an l2arc device. + */ +static void +l2arc_dev_hdr_update(l2arc_dev_t *dev) +{ + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; + abd_t *abd; + int err; + + VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER)); + + l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC; + l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION; + l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa); + l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid; + l2dhdr->dh_log_entries = dev->l2ad_log_entries; + l2dhdr->dh_evict = dev->l2ad_evict; + l2dhdr->dh_start = dev->l2ad_start; + l2dhdr->dh_end = dev->l2ad_end; + l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize); + l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count); + l2dhdr->dh_flags = 0; + if (dev->l2ad_first) + l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST; + + abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); + + err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev, + VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, + NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE)); + + abd_put(abd); + + if (err != 0) { + zfs_dbgmsg("L2ARC IO error (%d) while writing device header, " + "vdev guid: %llu", err, dev->l2ad_vdev->vdev_guid); + } +} + +/* + * Commits a log block to the L2ARC device. This routine is invoked from + * l2arc_write_buffers when the log block fills up. + * This function allocates some memory to temporarily hold the serialized + * buffer to be written. This is then released in l2arc_write_done. 
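+ * + * In outline (a summary of the code below, not additional behavior): the + * block is LZ4-compressed into a temporary buffer, dh_start_lbps[0] in the + * device header is pointed at the write location, the buffer is checksummed + * with fletcher4, the write is issued at l2ad_hand, the hand is advanced by + * the aligned size, and the block's pointer is recorded in l2ad_lbptr_list.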
+ */ +static void +l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) +{ + l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + uint64_t psize, asize; + zio_t *wzio; + l2arc_lb_abd_buf_t *abd_buf; + uint8_t *tmpbuf; + l2arc_lb_ptr_buf_t *lb_ptr_buf; + + VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries); + + tmpbuf = zio_buf_alloc(sizeof (*lb)); + abd_buf = zio_buf_alloc(sizeof (*abd_buf)); + abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb)); + lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); + lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); + + /* link the buffer into the block chain */ + lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1]; + lb->lb_magic = L2ARC_LOG_BLK_MAGIC; + + /* + * l2arc_log_blk_commit() may be called multiple times during a single + * l2arc_write_buffers() call. Save the allocated abd buffers in a list + * so we can free them in l2arc_write_done() later on. + */ + list_insert_tail(&cb->l2wcb_abd_list, abd_buf); + + /* try to compress the buffer */ + psize = zio_compress_data(ZIO_COMPRESS_LZ4, + abd_buf->abd, tmpbuf, sizeof (*lb)); + + /* a log block is never entirely zero */ + ASSERT(psize != 0); + asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); + ASSERT(asize <= sizeof (*lb)); + + /* + * Update the start log block pointer in the device header to point + * to the log block we're about to write. + */ + l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0]; + l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand; + l2dhdr->dh_start_lbps[0].lbp_payload_asize = + dev->l2ad_log_blk_payload_asize; + l2dhdr->dh_start_lbps[0].lbp_payload_start = + dev->l2ad_log_blk_payload_start; + _NOTE(CONSTCOND) + L2BLK_SET_LSIZE( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb)); + L2BLK_SET_PSIZE( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize); + L2BLK_SET_CHECKSUM( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, + ZIO_CHECKSUM_FLETCHER_4); + if (asize < sizeof (*lb)) { + /* compression succeeded */ + bzero(tmpbuf + psize, asize - psize); + L2BLK_SET_COMPRESS( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, + ZIO_COMPRESS_LZ4); + } else { + /* compression failed */ + bcopy(lb, tmpbuf, sizeof (*lb)); + L2BLK_SET_COMPRESS( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, + ZIO_COMPRESS_OFF); + } + + /* checksum what we're about to write */ + fletcher_4_native(tmpbuf, asize, NULL, + &l2dhdr->dh_start_lbps[0].lbp_cksum); + + abd_put(abd_buf->abd); + + /* perform the write itself */ + abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb)); + abd_take_ownership_of_buf(abd_buf->abd, B_TRUE); + wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, + asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); + (void) zio_nowait(wzio); + + dev->l2ad_hand += asize; + /* + * Include the committed log block's pointer in the list of pointers + * to log blocks present in the L2ARC device. 
+ */ + bcopy(&l2dhdr->dh_start_lbps[0], lb_ptr_buf->lb_ptr, + sizeof (l2arc_log_blkptr_t)); + mutex_enter(&dev->l2ad_mtx); + list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf); + ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); + ARCSTAT_BUMP(arcstat_l2_log_blk_count); + zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); + zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); + mutex_exit(&dev->l2ad_mtx); + vdev_space_update(dev->l2ad_vdev, asize, 0, 0); + + /* bump the kstats */ + ARCSTAT_INCR(arcstat_l2_write_bytes, asize); + ARCSTAT_BUMP(arcstat_l2_log_blk_writes); + ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize); + ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, + dev->l2ad_log_blk_payload_asize / asize); + + /* start a new log block */ + dev->l2ad_log_ent_idx = 0; + dev->l2ad_log_blk_payload_asize = 0; + dev->l2ad_log_blk_payload_start = 0; +} + +/* + * Validates an L2ARC log block address to make sure that it can be read + * from the provided L2ARC device. + */ +boolean_t +l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp) +{ + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); + uint64_t end = lbp->lbp_daddr + asize - 1; + uint64_t start = lbp->lbp_payload_start; + boolean_t evicted = B_FALSE; + + /* BEGIN CSTYLED */ + /* + * A log block is valid if all of the following conditions are true: + * - it fits entirely (including its payload) between l2ad_start and + * l2ad_end + * - it has a valid size + * - neither the log block itself nor part of its payload was evicted + * by l2arc_evict(): + * + * l2ad_hand l2ad_evict + * | | lbp_daddr + * | start | | end + * | | | | | + * V V V V V + * l2ad_start ============================================ l2ad_end + * --------------------------|||| + * ^ ^ + * | log block + * payload + */ + /* END CSTYLED */ + evicted = + l2arc_range_check_overlap(start, end, dev->l2ad_hand) || + l2arc_range_check_overlap(start, end, dev->l2ad_evict) || + l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, start) || + l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end); + + return (start >= dev->l2ad_start && end <= dev->l2ad_end && + asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) && + (!evicted || dev->l2ad_first)); +} + +/* + * Inserts ARC buffer header `hdr' into the current L2ARC log block on + * the device. The buffer being inserted must be present in L2ARC. + * Returns B_TRUE if the L2ARC log block is full and needs to be committed + * to L2ARC, or B_FALSE if it still has room for more ARC buffers. 
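+ * + * The caller pattern, as used in l2arc_write_buffers(): + * + *	if (l2arc_log_blk_insert(dev, hdr)) + *		l2arc_log_blk_commit(dev, pio, cb);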
+ */ +static boolean_t +l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) +{ + l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; + l2arc_log_ent_phys_t *le; + + if (dev->l2ad_log_entries == 0) + return (B_FALSE); + + int index = dev->l2ad_log_ent_idx++; + + ASSERT3S(index, <, dev->l2ad_log_entries); + ASSERT(HDR_HAS_L2HDR(hdr)); + + le = &lb->lb_entries[index]; + bzero(le, sizeof (*le)); + le->le_dva = hdr->b_dva; + le->le_birth = hdr->b_birth; + le->le_daddr = hdr->b_l2hdr.b_daddr; + if (index == 0) + dev->l2ad_log_blk_payload_start = le->le_daddr; + L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr)); + L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr)); + L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr)); + L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); + L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); + L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); + L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state); + + dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, + HDR_GET_PSIZE(hdr)); + + return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries); +} + +/* + * Checks whether a given L2ARC device address sits in a time-sequential + * range. The trick here is that the L2ARC is a rotary buffer, so we can't + * just do a range comparison, we need to handle the situation in which the + * range wraps around the end of the L2ARC device. Arguments: + * bottom -- Lower end of the range to check (written to earlier). + * top -- Upper end of the range to check (written to later). + * check -- The address for which we want to determine if it sits in + * between the top and bottom. + * + * The 3-way conditional below represents the following cases: + * + * bottom < top : Sequentially ordered case: + * <check>--------+-------------------+ + * | (overlap here?) | + * L2ARC dev V V + * |---------------<bottom>============<top>--------------| + * + * bottom > top: Looped-around case: + * <check>--------+------------------+ + * | (overlap here?) | + * L2ARC dev V V + * |===============<top>---------------<bottom>===========| + * ^ ^ + * | (or here?) | + * +---------------+---------<check> + * + * top == bottom : Just a single address comparison. + */ +boolean_t +l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check) +{ + if (bottom < top) + return (bottom <= check && check <= top); + else if (bottom > top) + return (check <= top || bottom <= check); + else + return (check == top); +} diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index ae0b1fc878..38c4a83cb1 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
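+/* + * A minimal sketch of the db_blkptr locking pattern introduced below + * (this mirrors what dbuf_read() does; see dmu_buf_lock_parent() for the + * authoritative rules): + * + *	db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); + *	blkptr_t bp = *db->db_blkptr; + *	dmu_buf_unlock_parent(db, dblt, FTAG); + */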
@@ -176,6 +176,7 @@ dbuf_cons(void *vdb, void *unused, int kmflag) bzero(db, sizeof (dmu_buf_impl_t)); mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); + rw_init(&db->db_rwlock, NULL, RW_DEFAULT, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); multilist_link_init(&db->db_cache_link); zfs_refcount_create(&db->db_holds); @@ -189,6 +190,7 @@ dbuf_dest(void *vdb, void *unused) { dmu_buf_impl_t *db = vdb; mutex_destroy(&db->db_mtx); + rw_destroy(&db->db_rwlock); cv_destroy(&db->db_changed); ASSERT(!multilist_link_active(&db->db_cache_link)); zfs_refcount_destroy(&db->db_holds); @@ -789,10 +791,10 @@ dbuf_verify(dmu_buf_impl_t *db) db->db.db_object); /* * dnode_grow_indblksz() can make this fail if we don't - * have the struct_rwlock. XXX indblksz no longer + * have the parent's rwlock. XXX indblksz no longer * grows. safe to do this now? */ - if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { + if (RW_LOCK_HELD(&db->db_parent->db_rwlock)) { ASSERT3P(db->db_blkptr, ==, ((blkptr_t *)db->db_parent->db.db_data + db->db_blkid % epb)); @@ -868,6 +870,44 @@ dbuf_clear_data(dmu_buf_impl_t *db) db->db_state = DB_UNCACHED; } +/* + * This function is used to lock the parent of the provided dbuf. This should be + * used when modifying or reading db_blkptr. + */ +db_lock_type_t +dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag) +{ + enum db_lock_type ret = DLT_NONE; + if (db->db_parent != NULL) { + rw_enter(&db->db_parent->db_rwlock, rw); + ret = DLT_PARENT; + } else if (dmu_objset_ds(db->db_objset) != NULL) { + rrw_enter(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, rw, + tag); + ret = DLT_OBJSET; + } + /* + * We only return a DLT_NONE lock when it's the top-most indirect block + * of the meta-dnode of the MOS. + */ + return (ret); +} + +/* + * We need to pass the lock type in because it's possible that the block will + * move from being the topmost indirect block in a dnode (and thus, have no + * parent) to not the top-most via an indirection increase. This would cause a + * panic if we didn't pass the lock type in. + */ +void +dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, void *tag) +{ + if (type == DLT_PARENT) + rw_exit(&db->db_parent->db_rwlock); + else if (type == DLT_OBJSET) + rrw_exit(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, tag); +} + static void dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) { @@ -1042,8 +1082,13 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) return (err); } +/* + * Drops db_mtx and the parent lock specified by dblt and tag before + * returning. + */ static int -dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) +dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, + db_lock_type_t dblt, void *tag) { dnode_t *dn; zbookmark_phys_t zb; @@ -1053,11 +1098,11 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); - /* We need the struct_rwlock to prevent db_blkptr from changing. 
*/ - ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED); ASSERT(db->db_buf == NULL); + ASSERT(db->db_parent == NULL || + RW_LOCK_HELD(&db->db_parent->db_rwlock)); if (db->db_blkid == DMU_BONUS_BLKID) { /* @@ -1094,6 +1139,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) DB_DNODE_EXIT(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); + dmu_buf_unlock_parent(db, dblt, tag); return (0); } @@ -1134,6 +1180,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) DB_DNODE_EXIT(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); + dmu_buf_unlock_parent(db, dblt, tag); return (0); } @@ -1150,12 +1197,14 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) "object set %llu", dmu_objset_id(db->db_objset)); DB_DNODE_EXIT(db); mutex_exit(&db->db_mtx); + dmu_buf_unlock_parent(db, dblt, tag); return (SET_ERROR(EIO)); } err = dbuf_read_verify_dnode_crypt(db, flags); if (err != 0) { DB_DNODE_EXIT(db); + dmu_buf_unlock_parent(db, dblt, tag); mutex_exit(&db->db_mtx); return (err); } @@ -1175,11 +1224,18 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr)) zio_flags |= ZIO_FLAG_RAW; - - err = arc_read(zio, db->db_objset->os_spa, db->db_blkptr, + /* + * The zio layer will copy the provided blkptr later, but we need to + * do this now so that we can release the parent's rwlock. We have to + * do that now so that if dbuf_read_done is called synchronously (on + * an l1 cache hit) we don't acquire the db_mtx while holding the + * parent's rwlock, which would be a lock ordering violation. + */ + blkptr_t bp = *db->db_blkptr; + dmu_buf_unlock_parent(db, dblt, tag); + (void) arc_read(zio, db->db_objset->os_spa, &bp, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); - return (err); } @@ -1278,8 +1334,6 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) DB_DNODE_ENTER(db); dn = DB_DNODE(db); - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_enter(&dn->dn_struct_rwlock, RW_READER); prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && @@ -1316,29 +1370,32 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) dbuf_set_data(db, db->db_buf); } mutex_exit(&db->db_mtx); - if (err == 0 && prefetch) - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&dn->dn_struct_rwlock); + if (err == 0 && prefetch) { + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, + flags & DB_RF_HAVESTRUCT); + } DB_DNODE_EXIT(db); } else if (db->db_state == DB_UNCACHED) { spa_t *spa = dn->dn_objset->os_spa; boolean_t need_wait = B_FALSE; + db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); + if (zio == NULL && db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); need_wait = B_TRUE; } - err = dbuf_read_impl(db, zio, flags); - - /* dbuf_read_impl has dropped db_mtx for us */ - - if (!err && prefetch) - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); + err = dbuf_read_impl(db, zio, flags, dblt, FTAG); + /* + * dbuf_read_impl has dropped db_mtx and our parent's rwlock + * for us + */ + if (!err && prefetch) { + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, + flags & DB_RF_HAVESTRUCT); + } - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); if (!err && need_wait) @@ -1353,10 +1410,10 @@ 
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) * occurred and the dbuf went to UNCACHED. */ mutex_exit(&db->db_mtx); - if (prefetch) - dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); - if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&dn->dn_struct_rwlock); + if (prefetch) { + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, + flags & DB_RF_HAVESTRUCT); + } DB_DNODE_EXIT(db); /* Skip the wait per the caller's request. */ @@ -1536,7 +1593,9 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); arc_release(db->db_buf, db); + rw_enter(&db->db_rwlock, RW_WRITER); bzero(db->db.db_data, db->db.db_size); + rw_exit(&db->db_rwlock); arc_buf_freeze(db->db_buf); } @@ -1558,15 +1617,6 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) DB_DNODE_ENTER(db); dn = DB_DNODE(db); - /* XXX does *this* func really need the lock? */ - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); - - /* - * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held - * is OK, because there can be no other references to the db - * when we are changing its size, so no concurrent DB_FILL can - * be happening. - */ /* * XXX we should be doing a dbuf_read, checking the return * value and returning that up to our callers @@ -1643,8 +1693,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) dnode_t *dn; objset_t *os; dbuf_dirty_record_t **drp, *dr; - int drop_struct_lock = FALSE; int txgoff = tx->tx_txg & TXG_MASK; + boolean_t drop_struct_rwlock = B_FALSE; ASSERT(tx->tx_txg != 0); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); @@ -1846,15 +1896,21 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) return (dr); } - /* - * The dn_struct_rwlock prevents db_blkptr from changing - * due to a write from syncing context completing - * while we are running, so we want to acquire it before - * looking at db_blkptr. - */ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { rw_enter(&dn->dn_struct_rwlock, RW_READER); - drop_struct_lock = TRUE; + drop_struct_rwlock = B_TRUE; + } + + /* + * If we are overwriting a dedup BP, then unless it is snapshotted, + * when we get to syncing context we will need to decrement its + * refcount in the DDT. Prefetch the relevant DDT block so that + * syncing context won't have to wait for the i/o. + */ + if (db->db_blkptr != NULL) { + db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); + ddt_prefetch(os->os_spa, db->db_blkptr); + dmu_buf_unlock_parent(db, dblt, FTAG); } /* @@ -1867,19 +1923,12 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); - /* - * If we are overwriting a dedup BP, then unless it is snapshotted, - * when we get to syncing context we will need to decrement its - * refcount in the DDT. Prefetch the relevant DDT block so that - * syncing context won't have to wait for the i/o. 
- */ - ddt_prefetch(os->os_spa, db->db_blkptr); if (db->db_level == 0) { ASSERT(!db->db_objset->os_raw_receive || dn->dn_maxblkid >= db->db_blkid); dnode_new_blkid(dn, db->db_blkid, tx, - drop_struct_lock, B_FALSE); + drop_struct_rwlock, B_FALSE); ASSERT(dn->dn_maxblkid >= db->db_blkid); } @@ -1890,15 +1939,14 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - - parent = dbuf_hold_level(dn, db->db_level+1, + parent = dbuf_hold_level(dn, db->db_level + 1, db->db_blkid >> epbs, FTAG); ASSERT(parent != NULL); parent_held = TRUE; } - if (drop_struct_lock) + if (drop_struct_rwlock) rw_exit(&dn->dn_struct_rwlock); - ASSERT3U(db->db_level+1, ==, parent->db_level); + ASSERT3U(db->db_level + 1, ==, parent->db_level); di = dbuf_dirty(parent, tx); if (parent_held) dbuf_rele(parent, FTAG); @@ -1919,14 +1967,14 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) } mutex_exit(&db->db_mtx); } else { - ASSERT(db->db_level+1 == dn->dn_nlevels); + ASSERT(db->db_level + 1 == dn->dn_nlevels); ASSERT(db->db_blkid < dn->dn_nblkptr); ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); - if (drop_struct_lock) + if (drop_struct_rwlock) rw_exit(&dn->dn_struct_rwlock); } @@ -2447,10 +2495,12 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, *parentp = NULL; return (err); } + rw_enter(&(*parentp)->db_rwlock, RW_READER); *bpp = ((blkptr_t *)(*parentp)->db.db_data) + (blkid & ((1ULL << epbs) - 1)); if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs))) ASSERT(BP_IS_HOLE(*bpp)); + rw_exit(&(*parentp)->db_rwlock); return (0); } else { /* the block is referenced from the dnode */ @@ -2559,8 +2609,29 @@ typedef struct dbuf_prefetch_arg { zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ + dbuf_prefetch_fn dpa_cb; /* prefetch completion callback */ + void *dpa_arg; /* prefetch completion arg */ } dbuf_prefetch_arg_t; +static void +dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done) +{ + if (dpa->dpa_cb != NULL) + dpa->dpa_cb(dpa->dpa_arg, io_done); + kmem_free(dpa, sizeof (*dpa)); +} + +static void +dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb, + const blkptr_t *iobp, arc_buf_t *abuf, void *private) +{ + dbuf_prefetch_arg_t *dpa = private; + + dbuf_prefetch_fini(dpa, B_TRUE); + if (abuf != NULL) + arc_buf_destroy(abuf, private); +} + /* * Actually issue the prefetch read for the block given. 
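+ * The dpa is always consumed: either by dbuf_issue_final_prefetch_done() + * after the read completes, or by the early dbuf_prefetch_fini() call when + * the bp is a hole or embedded, so the optional completion callback fires + * exactly once whether or not a read was actually issued.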
*/ @@ -2568,7 +2639,7 @@ static void dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) { if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) - return; + return (dbuf_prefetch_fini(dpa, B_FALSE)); int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; arc_flags_t aflags = @@ -2582,7 +2653,8 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); ASSERT(dpa->dpa_zio != NULL); - (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, + (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, + dbuf_issue_final_prefetch_done, dpa, dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb); } @@ -2603,8 +2675,7 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, if (abuf == NULL) { ASSERT(zio == NULL || zio->io_error != 0); - kmem_free(dpa, sizeof (*dpa)); - return; + return (dbuf_prefetch_fini(dpa, B_TRUE)); } ASSERT(zio == NULL || zio->io_error == 0); @@ -2635,6 +2706,10 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, dpa->dpa_zb.zb_level)); dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, dpa->dpa_curlevel, curblkid, FTAG); + if (db == NULL) { + arc_buf_destroy(abuf, private); + return (dbuf_prefetch_fini(dpa, B_TRUE)); + } (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); dbuf_rele(db, FTAG); @@ -2647,11 +2722,10 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); if (BP_IS_HOLE(bp)) { - kmem_free(dpa, sizeof (*dpa)); + dbuf_prefetch_fini(dpa, B_TRUE); } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); dbuf_issue_final_prefetch(dpa, bp); - kmem_free(dpa, sizeof (*dpa)); } else { arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; zbookmark_phys_t zb; @@ -2681,9 +2755,10 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, * complete. Note that the prefetch might fail if the dataset is encrypted and * the encryption key is unmapped before the IO completes. */ -void -dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, - arc_flags_t aflags) +int +dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid, + zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb, + void *arg) { blkptr_t bp; int epbs, nlevels, curlevel; @@ -2693,10 +2768,10 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); if (blkid > dn->dn_maxblkid) - return; + goto no_issue; - if (dnode_block_freed(dn, blkid)) - return; + if (level == 0 && dnode_block_freed(dn, blkid)) + goto no_issue; /* * This dnode hasn't been written to disk yet, so there's nothing to @@ -2704,11 +2779,11 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, */ nlevels = dn->dn_phys->dn_nlevels; if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) - return; + goto no_issue; epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) - return; + goto no_issue; dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); @@ -2718,7 +2793,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, * This dbuf already exists. It is either CACHED, or * (we assume) about to be read or filled. 
*/ - return; + goto no_issue; } /* @@ -2751,7 +2826,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, bp = dn->dn_phys->dn_blkptr[curblkid]; } if (BP_IS_HOLE(&bp)) - return; + goto no_issue; ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); @@ -2769,6 +2844,8 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, dpa->dpa_dnode = dn; dpa->dpa_epbs = epbs; dpa->dpa_zio = pio; + dpa->dpa_cb = cb; + dpa->dpa_arg = arg; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) @@ -2784,7 +2861,6 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, if (curlevel == level) { ASSERT3U(curblkid, ==, blkid); dbuf_issue_final_prefetch(dpa, &bp); - kmem_free(dpa, sizeof (*dpa)); } else { arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; zbookmark_phys_t zb; @@ -2805,6 +2881,19 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, * dpa may have already been freed. */ zio_nowait(pio); + return (1); +no_issue: + if (cb != NULL) + cb(arg, B_FALSE); + return (0); +} + +int +dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, + arc_flags_t aflags) +{ + + return (dbuf_prefetch_impl(dn, level, blkid, prio, aflags, NULL, NULL)); } /* @@ -2841,7 +2930,9 @@ dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db, dbuf_dirty_record_t *dr) DBUF_GET_BUFC_TYPE(db), db->db.db_size)); } + rw_enter(&db->db_rwlock, RW_WRITER); bcopy(data->b_data, db->db.db_data, arc_buf_size(data)); + rw_exit(&db->db_rwlock); } /* @@ -2967,7 +3058,6 @@ int dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; if (db->db_blkid != DMU_SPILL_BLKID) return (SET_ERROR(ENOTSUP)); @@ -2976,12 +3066,7 @@ dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dbuf_new_size(db, blksz, tx); - rw_exit(&dn->dn_struct_rwlock); - DB_DNODE_EXIT(db); return (0); } @@ -3009,7 +3094,7 @@ dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, dmu_buf_impl_t *found_db; boolean_t result = B_FALSE; - if (db->db_blkid == DMU_BONUS_BLKID) + if (blkid == DMU_BONUS_BLKID) found_db = dbuf_find_bonus(os, obj); else found_db = dbuf_find(os, obj, 0, blkid); @@ -3019,7 +3104,7 @@ dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, (void) zfs_refcount_add(&db->db_holds, tag); result = B_TRUE; } - mutex_exit(&db->db_mtx); + mutex_exit(&found_db->db_mtx); } return (result); } @@ -3697,9 +3782,9 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) mutex_exit(&db->db_mtx); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_WRITER, FTAG); *db->db_blkptr = *bp; - rw_exit(&dn->dn_struct_rwlock); + dmu_buf_unlock_parent(db, dblt, FTAG); } /* ARGSUSED */ @@ -3740,9 +3825,9 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) * anybody from reading the blocks we're about to * zero out. 
*/ - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + rw_enter(&db->db_rwlock, RW_WRITER); bzero(db->db.db_data, db->db.db_size); - rw_exit(&dn->dn_struct_rwlock); + rw_exit(&db->db_rwlock); } DB_DNODE_EXIT(db); } @@ -3932,7 +4017,7 @@ dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size, } static void -dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx) +dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx) { blkptr_t bp_copy = *bp; spa_t *spa = dmu_objset_spa(dn->dn_objset); @@ -3946,14 +4031,16 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx) if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, &drica)) { /* - * The struct_rwlock prevents dbuf_read_impl() from + * The db_rwlock prevents dbuf_read_impl() from * dereferencing the BP while we are changing it. To * avoid lock contention, only grab it when we are actually * changing the BP. */ - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (rw != NULL) + rw_enter(rw, RW_WRITER); *bp = bp_copy; - rw_exit(&dn->dn_struct_rwlock); + if (rw != NULL) + rw_exit(rw); } } @@ -4026,7 +4113,7 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) if (db->db_level > 0) { blkptr_t *bp = db->db.db_data; for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { - dbuf_remap_impl(dn, &bp[i], tx); + dbuf_remap_impl(dn, &bp[i], &db->db_rwlock, tx); } } else if (db->db.db_object == DMU_META_DNODE_OBJECT) { dnode_phys_t *dnp = db->db.db_data; @@ -4034,7 +4121,10 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) DMU_OT_DNODE); for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; i++) { for (int j = 0; j < dnp[i].dn_nblkptr; j++) { - dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], tx); + krwlock_t *lock = (dn->dn_dbuf == NULL ? NULL : + &dn->dn_dbuf->db_rwlock); + dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], lock, + tx); } } } diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index 67ad5d10f6..a79f3f19c3 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -172,8 +172,8 @@ dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, uint64_t blkid; dmu_buf_impl_t *db; - blkid = dbuf_whichblock(dn, 0, offset); rw_enter(&dn->dn_struct_rwlock, RW_READER); + blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); @@ -197,8 +197,8 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); - blkid = dbuf_whichblock(dn, 0, offset); rw_enter(&dn->dn_struct_rwlock, RW_READER); + blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); @@ -605,7 +605,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, if ((flags & DMU_READ_NO_PREFETCH) == 0 && DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) { dmu_zfetch(&dn->dn_zfetch, blkid, nblks, - read && DNODE_IS_CACHEABLE(dn)); + read && DNODE_IS_CACHEABLE(dn), B_TRUE); } rw_exit(&dn->dn_struct_rwlock); @@ -737,7 +737,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, if (err != 0) return; - rw_enter(&dn->dn_struct_rwlock, RW_READER); /* * offset + len - 1 is the last byte we want to prefetch for, and offset * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the @@ -745,6 +744,7 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, * offset) is the first. 
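
Before the dmu_prefetch() comment resumes below, a compilable worked instance of the first/last block arithmetic it is describing, assuming a level-0 prefetch and a 128K data block size (at level 0, dbuf_whichblock() reduces to a shift by the data block shift):

#include <stdint.h>
#include <assert.h>

static uint64_t
whichblock(uint64_t offset, int datablkshift)
{
	return (offset >> datablkshift);
}

int
main(void)
{
	/* 128K (1 << 17) blocks; prefetch bytes [100K, 100K + 300K) */
	uint64_t off = 100 * 1024, len = 300 * 1024;
	uint64_t first = whichblock(off, 17);		/* block 0 */
	uint64_t last = whichblock(off + len - 1, 17);	/* block 3 */
	assert(last - first + 1 == 4);			/* issue 4 blocks */
	return (0);
}
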
Then the number we need to prefetch is the * last - first + 1. */ + rw_enter(&dn->dn_struct_rwlock, RW_READER); if (level > 0 || dn->dn_datablkshift != 0) { nblks = dbuf_whichblock(dn, level, offset + len - 1) - dbuf_whichblock(dn, level, offset) + 1; @@ -757,7 +757,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, for (int i = 0; i < nblks; i++) dbuf_prefetch(dn, level, blkid + i, pri, 0); } - rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); @@ -2341,6 +2340,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) ZCHECKSUM_FLAG_DEDUP)) dedup_verify = B_TRUE; } + /* * Enable nopwrite if we have secure enough checksum * algorithm (see comment in zio_nop_write) and diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c index 6d01fbd2d4..a98097a8ee 100644 --- a/usr/src/uts/common/fs/zfs/dmu_objset.c +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c @@ -28,6 +28,7 @@ * Copyright (c) 2015, STRATO AG, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Nexenta Systems, Inc. + * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -81,6 +82,8 @@ int dmu_find_threads = 0; */ int dmu_rescan_dnode_threshold = 131072; +static char *upgrade_tag = "upgrade_tag"; + static void dmu_objset_find_dp_cb(void *arg); static void dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb); @@ -681,8 +684,9 @@ dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag, dsl_pool_t *dp; dsl_dataset_t *ds; int err; - ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0; + ds_hold_flags_t flags; + flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; err = dsl_pool_hold(name, tag, &dp); if (err != 0) return (err); @@ -755,8 +759,9 @@ dmu_objset_own(const char *name, dmu_objset_type_t type, dsl_pool_t *dp; dsl_dataset_t *ds; int err; - ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0; + ds_hold_flags_t flags; + flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; err = dsl_pool_hold(name, FTAG, &dp); if (err != 0) return (err); @@ -794,8 +799,9 @@ dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type, { dsl_dataset_t *ds; int err; - ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0; + ds_hold_flags_t flags; + flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; err = dsl_dataset_own_obj(dp, obj, flags, tag, &ds); if (err != 0) return (err); @@ -812,9 +818,10 @@ dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type, void dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, void *tag) { - ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0; - + ds_hold_flags_t flags; dsl_pool_t *dp = dmu_objset_pool(os); + + flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; dsl_dataset_rele_flags(os->os_dsl_dataset, flags, tag); dsl_pool_rele(dp, tag); } @@ -842,7 +849,9 @@ dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds, { dsl_pool_t *dp; char name[ZFS_MAX_DATASET_NAME_LEN]; + ds_hold_flags_t flags; + flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; VERIFY3P(ds, !=, NULL); VERIFY3P(ds->ds_owner, ==, tag); VERIFY(dsl_dataset_long_held(ds)); @@ -851,21 +860,22 @@ dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds, dp = ds->ds_dir->dd_pool; dsl_pool_config_enter(dp, FTAG); - dsl_dataset_disown(ds, 0, tag); - VERIFY0(dsl_dataset_own(dp, name, - (decrypt) ? 
DS_HOLD_FLAG_DECRYPT : 0, tag, newds)); + dsl_dataset_disown(ds, flags, tag); + VERIFY0(dsl_dataset_own(dp, name, flags, tag, newds)); dsl_pool_config_exit(dp, FTAG); } void dmu_objset_disown(objset_t *os, boolean_t decrypt, void *tag) { + ds_hold_flags_t flags; + + flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; /* * Stop upgrading thread */ dmu_objset_upgrade_stop(os); - dsl_dataset_disown(os->os_dsl_dataset, - (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0, tag); + dsl_dataset_disown(os->os_dsl_dataset, flags, tag); } void @@ -980,6 +990,7 @@ dmu_objset_evict_done(objset_t *os) mutex_destroy(&os->os_userused_lock); mutex_destroy(&os->os_obj_lock); mutex_destroy(&os->os_user_ptr_lock); + mutex_destroy(&os->os_upgrade_lock); for (int i = 0; i < TXG_SIZE; i++) { multilist_destroy(os->os_dirty_dnodes[i]); } @@ -1476,14 +1487,20 @@ dmu_objset_upgrade_task_cb(void *data) mutex_enter(&os->os_upgrade_lock); os->os_upgrade_status = EINTR; if (!os->os_upgrade_exit) { + int status; + mutex_exit(&os->os_upgrade_lock); - os->os_upgrade_status = os->os_upgrade_cb(os); + status = os->os_upgrade_cb(os); + mutex_enter(&os->os_upgrade_lock); + + os->os_upgrade_status = status; } os->os_upgrade_exit = B_TRUE; os->os_upgrade_id = 0; mutex_exit(&os->os_upgrade_lock); + dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag); } static void @@ -1492,6 +1509,9 @@ dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb) if (os->os_upgrade_id != 0) return; + ASSERT(dsl_pool_config_held(dmu_objset_pool(os))); + dsl_dataset_long_hold(dmu_objset_ds(os), upgrade_tag); + mutex_enter(&os->os_upgrade_lock); if (os->os_upgrade_id == 0 && os->os_upgrade_status == 0) { os->os_upgrade_exit = B_FALSE; @@ -1499,8 +1519,12 @@ dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb) os->os_upgrade_id = taskq_dispatch( os->os_spa->spa_upgrade_taskq, dmu_objset_upgrade_task_cb, os, TQ_SLEEP); - if (os->os_upgrade_id == 0) + if (os->os_upgrade_id == TASKQID_INVALID) { + dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag); os->os_upgrade_status = ENOMEM; + } + } else { + dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag); } mutex_exit(&os->os_upgrade_lock); } @@ -1511,10 +1535,12 @@ dmu_objset_upgrade_stop(objset_t *os) mutex_enter(&os->os_upgrade_lock); os->os_upgrade_exit = B_TRUE; if (os->os_upgrade_id != 0) { - os->os_upgrade_id = 0; + taskqid_t tid = os->os_upgrade_id; + mutex_exit(&os->os_upgrade_lock); - taskq_wait(os->os_spa->spa_upgrade_taskq); + taskq_wait_id(os->os_spa->spa_upgrade_taskq, tid); + txg_wait_synced(os->os_spa->spa_dsl_pool, 0); } else { mutex_exit(&os->os_upgrade_lock); } @@ -2215,7 +2241,7 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) if (flags & DN_ID_OLD_EXIST) { dn->dn_newuid = dn->dn_olduid; dn->dn_newgid = dn->dn_oldgid; - dn->dn_newgid = dn->dn_oldprojid; + dn->dn_newprojid = dn->dn_oldprojid; } else { dn->dn_newuid = 0; dn->dn_newgid = 0; @@ -2306,6 +2332,7 @@ dmu_objset_space_upgrade(objset_t *os) dmu_tx_hold_bonus(tx, obj); objerr = dmu_tx_assign(tx, TXG_WAIT); if (objerr != 0) { + dmu_buf_rele(db, FTAG); dmu_tx_abort(tx); continue; } diff --git a/usr/src/uts/common/fs/zfs/dmu_recv.c b/usr/src/uts/common/fs/zfs/dmu_recv.c index 39f365652e..03e0fee4ff 100644 --- a/usr/src/uts/common/fs/zfs/dmu_recv.c +++ b/usr/src/uts/common/fs/zfs/dmu_recv.c @@ -201,7 +201,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) struct drr_begin *drrb = drba->drba_cookie->drc_drrb; uint64_t fromguid = drrb->drr_fromguid; int flags = drrb->drr_flags; - ds_hold_flags_t 
dsflags = 0; + ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; int error; uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); dsl_dataset_t *ds; @@ -399,7 +399,7 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) dsl_dataset_t *ds, *newds; objset_t *os; uint64_t dsobj; - ds_hold_flags_t dsflags = 0; + ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; int error; uint64_t crflags = 0; dsl_crypto_params_t dummy_dcp = { 0 }; @@ -541,7 +541,7 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) dsl_pool_t *dp = dmu_tx_pool(tx); struct drr_begin *drrb = drba->drba_cookie->drc_drrb; int error; - ds_hold_flags_t dsflags = 0; + ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); dsl_dataset_t *ds; const char *tofs = drba->drba_cookie->drc_tofs; @@ -670,7 +670,7 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); dsl_dataset_t *ds; objset_t *os; - ds_hold_flags_t dsflags = 0; + ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; uint64_t dsobj; /* 6 extra bytes for /%recv */ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; @@ -1824,8 +1824,9 @@ static void dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) { dsl_dataset_t *ds = drc->drc_ds; - ds_hold_flags_t dsflags = (drc->drc_raw) ? 0 : DS_HOLD_FLAG_DECRYPT; + ds_hold_flags_t dsflags; + dsflags = (drc->drc_raw) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT; /* * Wait for the txg sync before cleaning up the receive. For * resumable receives, this ensures that our resume state has @@ -2832,11 +2833,12 @@ add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj, dsl_dataset_t *snapds; guid_map_entry_t *gmep; objset_t *os; - ds_hold_flags_t dsflags = (raw) ? 0 : DS_HOLD_FLAG_DECRYPT; + ds_hold_flags_t dsflags; int err; ASSERT(guid_map != NULL); + dsflags = (raw) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT; err = dsl_pool_hold(name, FTAG, &dp); if (err != 0) return (err); diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c index 34cfa2c011..d91a48e2ca 100644 --- a/usr/src/uts/common/fs/zfs/dmu_send.c +++ b/usr/src/uts/common/fs/zfs/dmu_send.c @@ -1222,9 +1222,10 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, dsl_pool_t *dp; dsl_dataset_t *ds; dsl_dataset_t *fromds = NULL; - ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT; + ds_hold_flags_t dsflags; int err; + dsflags = (rawok) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT; err = dsl_pool_hold(pool, FTAG, &dp); if (err != 0) return (err); @@ -1287,9 +1288,10 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, dsl_pool_t *dp; dsl_dataset_t *ds; int err; - ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT; + ds_hold_flags_t dsflags; boolean_t owned = B_FALSE; + dsflags = (rawok) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT; if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL) return (SET_ERROR(EINVAL)); diff --git a/usr/src/uts/common/fs/zfs/dmu_zfetch.c b/usr/src/uts/common/fs/zfs/dmu_zfetch.c index 5d6f20d072..08af78d620 100644 --- a/usr/src/uts/common/fs/zfs/dmu_zfetch.c +++ b/usr/src/uts/common/fs/zfs/dmu_zfetch.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. 
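
The dsflags conversions running through the dmu_objset.c, dmu_recv.c, and dmu_send.c hunks above all replace the literal 0 with DS_HOLD_FLAG_NONE, so the empty flag set is spelled in the enum's own vocabulary. A sketch of the pattern, assuming a dsl_dataset.h declaration along these lines (the header itself is not part of this diff):

typedef enum ds_hold_flags {
	DS_HOLD_FLAG_NONE	= 0 << 0,	/* deliberately empty set */
	DS_HOLD_FLAG_DECRYPT	= 1 << 0	/* needs decrypted access */
} ds_hold_flags_t;

static ds_hold_flags_t
dsflags_for(int decrypt)
{
	return (decrypt ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE);
}
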
*/ #include <sys/zfs_context.h> @@ -58,16 +58,29 @@ typedef struct zfetch_stats { kstat_named_t zfetchstat_hits; kstat_named_t zfetchstat_misses; kstat_named_t zfetchstat_max_streams; + kstat_named_t zfetchstat_max_completion_us; + kstat_named_t zfetchstat_last_completion_us; + kstat_named_t zfetchstat_io_issued; } zfetch_stats_t; static zfetch_stats_t zfetch_stats = { { "hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "max_streams", KSTAT_DATA_UINT64 }, + { "max_completion_us", KSTAT_DATA_UINT64 }, + { "last_completion_us", KSTAT_DATA_UINT64 }, + { "io_issued", KSTAT_DATA_UINT64 }, }; #define ZFETCHSTAT_BUMP(stat) \ - atomic_inc_64(&zfetch_stats.stat.value.ui64); + atomic_inc_64(&zfetch_stats.stat.value.ui64) +#define ZFETCHSTAT_ADD(stat, val) \ + atomic_add_64(&zfetch_stats.stat.value.ui64, val) +#define ZFETCHSTAT_SET(stat, val) \ + zfetch_stats.stat.value.ui64 = val +#define ZFETCHSTAT_GET(stat) \ + zfetch_stats.stat.value.ui64 + kstat_t *zfetch_ksp; @@ -103,8 +116,8 @@ dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) { if (zf == NULL) return; - zf->zf_dnode = dno; + zf->zf_numstreams = 0; list_create(&zf->zf_stream, sizeof (zstream_t), offsetof(zstream_t, zs_node)); @@ -113,12 +126,28 @@ dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) } static void +dmu_zfetch_stream_fini(zstream_t *zs) +{ + mutex_destroy(&zs->zs_lock); + kmem_free(zs, sizeof (*zs)); +} + +static void dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) { ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); list_remove(&zf->zf_stream, zs); - mutex_destroy(&zs->zs_lock); - kmem_free(zs, sizeof (*zs)); + dmu_zfetch_stream_fini(zs); + zf->zf_numstreams--; +} + +static void +dmu_zfetch_stream_orphan(zfetch_t *zf, zstream_t *zs) +{ + ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); + list_remove(&zf->zf_stream, zs); + zs->zs_fetch = NULL; + zf->zf_numstreams--; } /* @@ -133,8 +162,12 @@ dmu_zfetch_fini(zfetch_t *zf) ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock)); rw_enter(&zf->zf_rwlock, RW_WRITER); - while ((zs = list_head(&zf->zf_stream)) != NULL) - dmu_zfetch_stream_remove(zf, zs); + while ((zs = list_head(&zf->zf_stream)) != NULL) { + if (zfs_refcount_count(&zs->zs_blocks) != 0) + dmu_zfetch_stream_orphan(zf, zs); + else + dmu_zfetch_stream_remove(zf, zs); + } rw_exit(&zf->zf_rwlock); list_destroy(&zf->zf_stream); rw_destroy(&zf->zf_rwlock); @@ -152,7 +185,7 @@ static void dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) { zstream_t *zs_next; - int numstreams = 0; + hrtime_t now = gethrtime(); ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); @@ -162,11 +195,14 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) for (zstream_t *zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) { zs_next = list_next(&zf->zf_stream, zs); - if (((gethrtime() - zs->zs_atime) / NANOSEC) > + /* + * Skip gethrtime() call if there are still references + */ + if (zfs_refcount_count(&zs->zs_blocks) != 0) + continue; + if (((now - zs->zs_atime) / NANOSEC) > zfetch_min_sec_reap) dmu_zfetch_stream_remove(zf, zs); - else - numstreams++; } /* @@ -180,7 +216,7 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) uint32_t max_streams = MAX(1, MIN(zfetch_max_streams, zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz / zfetch_max_distance)); - if (numstreams >= max_streams) { + if (zf->zf_numstreams >= max_streams) { ZFETCHSTAT_BUMP(zfetchstat_max_streams); return; } @@ -189,12 +225,39 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) zs->zs_blkid = blkid; zs->zs_pf_blkid = blkid; zs->zs_ipf_blkid = blkid; - zs->zs_atime = gethrtime(); + zs->zs_atime = 
now; + zs->zs_fetch = zf; + zfs_refcount_create(&zs->zs_blocks); mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL); - + zf->zf_numstreams++; list_insert_head(&zf->zf_stream, zs); } +static void +dmu_zfetch_stream_done(void *arg, boolean_t io_issued) +{ + zstream_t *zs = arg; + + if (zs->zs_start_time && io_issued) { + hrtime_t now = gethrtime(); + hrtime_t delta = NSEC2USEC(now - zs->zs_start_time); + + zs->zs_start_time = 0; + ZFETCHSTAT_SET(zfetchstat_last_completion_us, delta); + if (delta > ZFETCHSTAT_GET(zfetchstat_max_completion_us)) + ZFETCHSTAT_SET(zfetchstat_max_completion_us, delta); + } + + if (zfs_refcount_remove(&zs->zs_blocks, NULL) != 0) + return; + + /* + * The parent fetch structure has gone away + */ + if (zs->zs_fetch == NULL) + dmu_zfetch_stream_fini(zs); +} + /* * This is the predictive prefetch entry point. It associates dnode access * specified with blkid and nblks arguments with prefetch stream, predicts @@ -204,12 +267,13 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) * TRUE -- prefetch predicted data blocks plus following indirect blocks. */ void -dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data) +dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, + boolean_t have_lock) { zstream_t *zs; int64_t pf_start, ipf_start, ipf_istart, ipf_iend; int64_t pf_ahead_blks, max_blks; - int epbs, max_dist_blks, pf_nblks, ipf_nblks; + int epbs, max_dist_blks, pf_nblks, ipf_nblks, issued; uint64_t end_of_access_blkid = blkid + nblks; spa_t *spa = zf->zf_dnode->dn_objset->os_spa; @@ -230,9 +294,22 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data) * As a fast path for small (single-block) files, ignore access * to the first block. */ - if (blkid == 0) + if (!have_lock && blkid == 0) return; + if (!have_lock) + rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER); + + + /* + * A fast path for small files for which no prefetch will + * happen. + */ + if (zf->zf_dnode->dn_maxblkid < 2) { + if (!have_lock) + rw_exit(&zf->zf_dnode->dn_struct_rwlock); + return; + } rw_enter(&zf->zf_rwlock, RW_READER); /* @@ -257,6 +334,10 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data) /* Already prefetched this before. 
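
dmu_zfetch_stream_done() above is the consuming half of the new zs_blocks accounting: each issued prefetch block holds one reference on its stream, and a stream orphaned by dmu_zfetch_fini() is reaped by whichever completion drops the last reference. A compilable userland model of that lifecycle, with illustrative names:

#include <stdlib.h>
#include <stdbool.h>
#include <stdatomic.h>

typedef struct stream {
	atomic_int refs;	/* models the zs_blocks refcount */
	bool orphaned;		/* models zs_fetch == NULL after fini */
} stream_t;

/* invoked once per completed prefetch, like dmu_zfetch_stream_done() */
static void
stream_done(stream_t *zs)
{
	if (atomic_fetch_sub(&zs->refs, 1) != 1)
		return;		/* other prefetch blocks still in flight */
	if (zs->orphaned)
		free(zs);	/* last completion reaps an orphaned stream */
}
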
*/ mutex_exit(&zs->zs_lock); rw_exit(&zf->zf_rwlock); + if (!have_lock) { + rw_exit(&zf->zf_dnode-> + dn_struct_rwlock); + } return; } break; @@ -274,6 +355,8 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data) if (rw_tryupgrade(&zf->zf_rwlock)) dmu_zfetch_stream_create(zf, end_of_access_blkid); rw_exit(&zf->zf_rwlock); + if (!have_lock) + rw_exit(&zf->zf_dnode->dn_struct_rwlock); return; } @@ -335,9 +418,15 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data) ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs; zs->zs_atime = gethrtime(); + /* no prior reads in progress */ + if (zfs_refcount_count(&zs->zs_blocks) == 0) + zs->zs_start_time = zs->zs_atime; zs->zs_blkid = end_of_access_blkid; + zfs_refcount_add_many(&zs->zs_blocks, pf_nblks + ipf_iend - ipf_istart, + NULL); mutex_exit(&zs->zs_lock); rw_exit(&zf->zf_rwlock); + issued = 0; /* * dbuf_prefetch() is asynchronous (even when it needs to read @@ -346,12 +435,19 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data) */ for (int i = 0; i < pf_nblks; i++) { - dbuf_prefetch(zf->zf_dnode, 0, pf_start + i, - ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH); + issued += dbuf_prefetch_impl(zf->zf_dnode, 0, pf_start + i, + ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH, + dmu_zfetch_stream_done, zs); } for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) { - dbuf_prefetch(zf->zf_dnode, 1, iblk, - ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH); + issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk, + ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH, + dmu_zfetch_stream_done, zs); } + if (!have_lock) + rw_exit(&zf->zf_dnode->dn_struct_rwlock); ZFETCHSTAT_BUMP(zfetchstat_hits); + + if (issued) + ZFETCHSTAT_ADD(zfetchstat_io_issued, issued); } diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c index f5ef390896..53aeb42c0e 100644 --- a/usr/src/uts/common/fs/zfs/dnode.c +++ b/usr/src/uts/common/fs/zfs/dnode.c @@ -120,6 +120,7 @@ dnode_cons(void *arg, void *unused, int kmflag) mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL); + cv_init(&dn->dn_nodnholds, NULL, CV_DEFAULT, NULL); /* * Every dbuf has a reference, and dropping a tracked reference is @@ -184,6 +185,7 @@ dnode_dest(void *arg, void *unused) mutex_destroy(&dn->dn_mtx); mutex_destroy(&dn->dn_dbufs_mtx); cv_destroy(&dn->dn_notxholds); + cv_destroy(&dn->dn_nodnholds); zfs_refcount_destroy(&dn->dn_holds); zfs_refcount_destroy(&dn->dn_tx_holds); ASSERT(!list_link_active(&dn->dn_link)); @@ -1175,13 +1177,15 @@ dnode_special_close(dnode_handle_t *dnh) dnode_t *dn = dnh->dnh_dnode; /* - * Wait for final references to the dnode to clear. This can - * only happen if the arc is asynchronously evicting state that - * has a hold on this dnode while we are trying to evict this - * dnode. 
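
Continuing the userland stream model above: the zfs_refcount_add_many() call in the dmu_zfetch() hunk takes all pf_nblks + ipf_iend - ipf_istart references before the first dbuf_prefetch_impl() is issued, which is what keeps a fast completion from driving the count to zero and reaping the stream mid-loop. A sketch under the same assumed types:

/* continues the stream_t / stream_done() model above */
static void
issue_batch(stream_t *zs, int pf_nblks, int ipf_nblks)
{
	/* one reference per block we are about to issue, all up front */
	atomic_fetch_add(&zs->refs, pf_nblks + ipf_nblks);
	for (int i = 0; i < pf_nblks + ipf_nblks; i++) {
		/*
		 * Each dbuf_prefetch_impl(..., stream_done, zs) call lands
		 * here; even a synchronous no-IO path invokes the callback,
		 * so reference adds and removes always balance.
		 */
		stream_done(zs);
	}
}
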
+ * Ensure dnode_rele_and_unlock() has released dn_mtx, after final + * zfs_refcount_remove() */ - while (zfs_refcount_count(&dn->dn_holds) > 0) - delay(1); + mutex_enter(&dn->dn_mtx); + if (zfs_refcount_count(&dn->dn_holds) > 0) + cv_wait(&dn->dn_nodnholds, &dn->dn_mtx); + mutex_exit(&dn->dn_mtx); + ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 0); + ASSERT(dn->dn_dbuf == NULL || dmu_buf_get_user(&dn->dn_dbuf->db) == NULL); zrl_add(&dnh->dnh_zrlock); @@ -1197,7 +1201,7 @@ dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, dnode_t *dn; zrl_init(&dnh->dnh_zrlock); - zrl_tryenter(&dnh->dnh_zrlock); + VERIFY3U(1, ==, zrl_tryenter(&dnh->dnh_zrlock)); dn = dnode_create(os, dnp, NULL, object, dnh); DNODE_VERIFY(dn); @@ -1342,7 +1346,6 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, } blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t)); - db = dbuf_hold(mdn, blk, FTAG); if (drop_struct_lock) rw_exit(&mdn->dn_struct_rwlock); @@ -1614,7 +1617,10 @@ dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting) dnode_handle_t *dnh = dn->dn_handle; refs = zfs_refcount_remove(&dn->dn_holds, tag); + if (refs == 0) + cv_broadcast(&dn->dn_nodnholds); mutex_exit(&dn->dn_mtx); + /* dnode could get destroyed at this point, so don't use it anymore */ /* * It's unsafe to release the last hold on a dnode by dnode_rele() or @@ -1776,10 +1782,11 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) /* resize the old block */ err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); - if (err == 0) + if (err == 0) { dbuf_new_size(db, size, tx); - else if (err != ENOENT) + } else if (err != ENOENT) { goto fail; + } dnode_setdblksz(dn, size); dnode_setdirty(dn, tx); @@ -2014,7 +2021,6 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) int trunc = FALSE; int epbs; - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); blksz = dn->dn_datablksz; blkshift = dn->dn_datablkshift; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; @@ -2031,7 +2037,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) head = P2NPHASE(off, blksz); blkoff = P2PHASE(off, blksz); if ((off >> blkshift) > dn->dn_maxblkid) - goto out; + return; } else { ASSERT(dn->dn_maxblkid == 0); if (off == 0 && len >= blksz) { @@ -2040,12 +2046,15 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) */ blkid = 0; nblks = 1; - if (dn->dn_nlevels > 1) + if (dn->dn_nlevels > 1) { + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_dirty_l1(dn, 0, tx); + rw_exit(&dn->dn_struct_rwlock); + } goto done; } else if (off >= blksz) { /* Freeing past end-of-data */ - goto out; + return; } else { /* Freeing part of the block. 
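
The dnode_special_close()/dnode_rele_and_unlock() hunks above replace a delay(1) polling loop with a condition-variable handshake on dn_nodnholds, taken under the same dn_mtx that guards the hold count so the wakeup cannot be missed. A POSIX-threads model of the same idea; the while loop (rather than the patch's single if) is the conventional guard against spurious wakeups, and all names are illustrative:

#include <pthread.h>

typedef struct dn_model {
	pthread_mutex_t mtx;		/* dn_mtx */
	pthread_cond_t nodnholds;	/* dn_nodnholds */
	int holds;			/* dn_holds */
} dn_model_t;

static void
dn_rele(dn_model_t *dn)
{
	pthread_mutex_lock(&dn->mtx);
	if (--dn->holds == 0)
		pthread_cond_broadcast(&dn->nodnholds);	/* wake the closer */
	pthread_mutex_unlock(&dn->mtx);
}

static void
dn_special_close(dn_model_t *dn)
{
	pthread_mutex_lock(&dn->mtx);
	while (dn->holds > 0)		/* loop guards spurious wakeups */
		pthread_cond_wait(&dn->nodnholds, &dn->mtx);
	pthread_mutex_unlock(&dn->mtx);
}
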
*/ head = blksz - off; @@ -2055,19 +2064,26 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) } /* zero out any partial block data at the start of the range */ if (head) { + int res; ASSERT3U(blkoff + head, ==, blksz); if (len < head) head = len; - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), - TRUE, FALSE, FTAG, &db) == 0) { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), + TRUE, FALSE, FTAG, &db); + rw_exit(&dn->dn_struct_rwlock); + if (res == 0) { caddr_t data; + boolean_t dirty; + db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, + FTAG); /* don't dirty if it isn't on disk and isn't dirty */ - if (db->db_last_dirty || - (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { - rw_exit(&dn->dn_struct_rwlock); + dirty = db->db_last_dirty || + (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr)); + dmu_buf_unlock_parent(db, dblt, FTAG); + if (dirty) { dmu_buf_will_dirty(&db->db, tx); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); data = db->db.db_data; bzero(data + blkoff, head); } @@ -2079,11 +2095,11 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) /* If the range was less than one block, we're done */ if (len == 0) - goto out; + return; /* If the remaining range is past end of file, we're done */ if ((off >> blkshift) > dn->dn_maxblkid) - goto out; + return; ASSERT(ISP2(blksz)); if (trunc) @@ -2094,16 +2110,23 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) ASSERT0(P2PHASE(off, blksz)); /* zero out any partial block data at the end of the range */ if (tail) { + int res; if (len < tail) tail = len; - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len), - TRUE, FALSE, FTAG, &db) == 0) { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len), + TRUE, FALSE, FTAG, &db); + rw_exit(&dn->dn_struct_rwlock); + if (res == 0) { + boolean_t dirty; /* don't dirty if not on disk and not dirty */ - if (db->db_last_dirty || - (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { - rw_exit(&dn->dn_struct_rwlock); + db_lock_type_t type = dmu_buf_lock_parent(db, RW_READER, + FTAG); + dirty = db->db_last_dirty || + (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr)); + dmu_buf_unlock_parent(db, type, FTAG); + if (dirty) { dmu_buf_will_dirty(&db->db, tx); - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); bzero(db->db.db_data, tail); } dbuf_rele(db, FTAG); @@ -2113,7 +2136,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) /* If the range did not include a full block, we are done */ if (len == 0) - goto out; + return; ASSERT(IS_P2ALIGNED(off, blksz)); ASSERT(trunc || IS_P2ALIGNED(len, blksz)); @@ -2143,6 +2166,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) * amount of space if we copy the freed BPs into deadlists. 
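
The head/tail zeroing hunks in dnode_free_range() above now sample dirtiness under the parent lock, drop it, and only then dirty and zero the buffer, instead of juggling dn_struct_rwlock across the dirtying call. A simplified sketch of that order of operations, with assumed stand-in types:

#include <stdbool.h>
#include <string.h>
#include <pthread.h>

typedef struct buf {
	pthread_rwlock_t *parent_rw;	/* stands in for dmu_buf_lock_parent */
	bool dirty, on_disk;
	char *data;
} buf_t;

static void
zero_partial(buf_t *db, size_t off, size_t len)
{
	bool must_dirty;

	pthread_rwlock_rdlock(db->parent_rw);	/* peek at BP/dirty state */
	must_dirty = db->dirty || db->on_disk;
	pthread_rwlock_unlock(db->parent_rw);	/* drop before dirtying */
	if (must_dirty) {
		db->dirty = true;	/* stands in for dmu_buf_will_dirty */
		memset(db->data + off, 0, len);
	}
}
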
*/ if (dn->dn_nlevels > 1) { + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); uint64_t first, last; first = blkid >> epbs; @@ -2187,6 +2211,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) dnode_dirty_l1(dn, i, tx); } + rw_exit(&dn->dn_struct_rwlock); } done: @@ -2208,9 +2233,6 @@ done: dbuf_free_range(dn, blkid, blkid + nblks - 1, tx); dnode_setdirty(dn, tx); -out: - - rw_exit(&dn->dn_struct_rwlock); } static boolean_t @@ -2322,6 +2344,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, dprintf("probing object %llu offset %llx level %d of %u\n", dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + hole = ((flags & DNODE_FIND_HOLE) != 0); inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1; ASSERT(txg == 0 || !hole); @@ -2354,9 +2378,9 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, return (error); } data = db->db.db_data; + rw_enter(&db->db_rwlock, RW_READER); } - if (db != NULL && txg != 0 && (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg || BP_IS_HOLE(db->db_blkptr))) { @@ -2416,8 +2440,10 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, error = SET_ERROR(ESRCH); } - if (db) + if (db != NULL) { + rw_exit(&db->db_rwlock); dbuf_rele(db, FTAG); + } return (error); } diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c index dc7317b411..396d58da17 100644 --- a/usr/src/uts/common/fs/zfs/dnode_sync.c +++ b/usr/src/uts/common/fs/zfs/dnode_sync.c @@ -23,6 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2020 Oxide Computer Company */ #include <sys/zfs_context.h> @@ -51,7 +52,6 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) /* this dnode can't be paged out because it's dirty */ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0); db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG); @@ -61,8 +61,24 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset, dn->dn_object, dn->dn_phys->dn_nlevels); + /* + * Lock ordering requires that we hold the children's db_mutexes (by + * calling dbuf_find()) before holding the parent's db_rwlock. The lock + * order is imposed by dbuf_read's steps of "grab the lock to protect + * db_parent, get db_parent, hold db_parent's db_rwlock". 
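
A threads-level sketch of the ordering rule the dnode_increase_indirection() comment above states: look up (and thereby lock) every child first, and only then take the parent's rwlock as writer, matching the order dbuf_read() imposes. Types and names here are simplified stand-ins for dmu_buf_impl_t and db_rwlock:

#include <stddef.h>
#include <pthread.h>

#define	NCHILD	3	/* stands in for nblkptr <= DN_MAX_NBLKPTR */

typedef struct child { pthread_mutex_t mtx; } child_t;

static void
reparent(child_t *children[NCHILD], pthread_rwlock_t *parent_rw)
{
	/* step 1: take every child's mutex, as dbuf_find() does on lookup */
	for (int i = 0; i < NCHILD; i++)
		if (children[i] != NULL)
			pthread_mutex_lock(&children[i]->mtx);
	/* step 2: only now take the parent's rwlock as writer */
	pthread_rwlock_wrlock(parent_rw);
	/* ... move block pointers and repoint each child's parent ... */
	pthread_rwlock_unlock(parent_rw);
	for (int i = 0; i < NCHILD; i++)
		if (children[i] != NULL)
			pthread_mutex_unlock(&children[i]->mtx);
}
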
+ */ + dmu_buf_impl_t *children[DN_MAX_NBLKPTR]; + ASSERT3U(nblkptr, <=, DN_MAX_NBLKPTR); + for (i = 0; i < nblkptr; i++) { + children[i] = + dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i); + } + /* transfer dnode's block pointers to new indirect block */ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT); + if (dn->dn_dbuf != NULL) + rw_enter(&dn->dn_dbuf->db_rwlock, RW_WRITER); + rw_enter(&db->db_rwlock, RW_WRITER); ASSERT(db->db.db_data); ASSERT(arc_released(db->db_buf)); ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size); @@ -72,8 +88,7 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) /* set dbuf's parent pointers to new indirect buf */ for (i = 0; i < nblkptr; i++) { - dmu_buf_impl_t *child = - dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i); + dmu_buf_impl_t *child = children[i]; if (child == NULL) continue; @@ -106,6 +121,10 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr); + rw_exit(&db->db_rwlock); + if (dn->dn_dbuf != NULL) + rw_exit(&dn->dn_dbuf->db_rwlock); + dbuf_rele(db, FTAG); rw_exit(&dn->dn_struct_rwlock); @@ -182,7 +201,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) ASSERT(db->db_level == 1); rw_enter(&dn->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(dn, db->db_level-1, + err = dbuf_hold_impl(dn, db->db_level - 1, (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child); rw_exit(&dn->dn_struct_rwlock); if (err == ENOENT) @@ -280,7 +299,9 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, * ancestor of the first or last block to be freed. The first and * last L1 indirect blocks are always dirtied by dnode_free_range(). */ + db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0); + dmu_buf_unlock_parent(db, dblt, FTAG); dbuf_release_bp(db); bp = db->db.db_data; @@ -306,7 +327,9 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, if (db->db_level == 1) { FREE_VERIFY(db, start, end, tx); - free_blocks(dn, bp, end-start+1, tx); + rw_enter(&db->db_rwlock, RW_WRITER); + free_blocks(dn, bp, end - start + 1, tx); + rw_exit(&db->db_rwlock); } else { for (uint64_t id = start; id <= end; id++, bp++) { if (BP_IS_HOLE(bp)) @@ -323,10 +346,12 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, } if (free_indirects) { + rw_enter(&db->db_rwlock, RW_WRITER); for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) ASSERT(BP_IS_HOLE(bp)); bzero(db->db.db_data, db->db.db_size); free_blocks(dn, db->db_blkptr, 1, tx); + rw_exit(&db->db_rwlock); } DB_DNODE_EXIT(db); @@ -378,7 +403,6 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i, TRUE, FALSE, FTAG, &db)); rw_exit(&dn->dn_struct_rwlock); - free_children(db, blkid, nblks, free_indirects, tx); dbuf_rele(db, FTAG); } @@ -736,13 +760,22 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dsfra.dsfra_dnode = dn; dsfra.dsfra_tx = tx; dsfra.dsfra_free_indirects = freeing_dnode; + mutex_enter(&dn->dn_mtx); if (freeing_dnode) { ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff], 0, dn->dn_maxblkid + 1)); } - mutex_enter(&dn->dn_mtx); - range_tree_vacate(dn->dn_free_ranges[txgoff], + /* + * Because dnode_sync_free_range() must drop dn_mtx during its + * processing, using it as a callback to range_tree_vacate() is + * not safe. 
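
The range-tree comment above continues below; a minimal illustration of the walk-then-vacate split it argues for, assuming the callback may drop and retake the guarding mutex: run the callback over an intact tree first, then empty the tree with no callback so it is only mutated while continuously held. A list is used as a stand-in for range_tree_t:

#include <stdint.h>
#include <stdlib.h>

typedef struct range {
	uint64_t start, size;
	struct range *next;
} range_t;

static void
walk_then_vacate(range_t **head,
    void (*func)(void *, uint64_t, uint64_t), void *arg)
{
	/* pass 1: tree intact even if the callback drops/retakes dn_mtx */
	for (range_t *r = *head; r != NULL; r = r->next)
		func(arg, r->start, r->size);
	/* pass 2: no callback, so the tree is only mutated while held */
	while (*head != NULL) {
		range_t *r = *head;
		*head = r->next;
		free(r);
	}
}
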
No other operations (besides destroy) are allowed + * once range_tree_vacate() has begun, and dropping dn_mtx + * would leave a window open for another thread to observe that + * invalid (and unsafe) state. + */ + range_tree_walk(dn->dn_free_ranges[txgoff], dnode_sync_free_range, &dsfra); + range_tree_vacate(dn->dn_free_ranges[txgoff], NULL, NULL); range_tree_destroy(dn->dn_free_ranges[txgoff]); dn->dn_free_ranges[txgoff] = NULL; mutex_exit(&dn->dn_mtx); diff --git a/usr/src/uts/common/fs/zfs/dsl_crypt.c b/usr/src/uts/common/fs/zfs/dsl_crypt.c index a092326a9c..ed98740f1d 100644 --- a/usr/src/uts/common/fs/zfs/dsl_crypt.c +++ b/usr/src/uts/common/fs/zfs/dsl_crypt.c @@ -1391,10 +1391,17 @@ error: return (ret); } - +/* + * This function deals with the intricacies of updating wrapping + * key references and encryption roots recursively in the event + * of a call to 'zfs change-key' or 'zfs promote'. The 'skip' + * parameter should always be set to B_FALSE when called + * externally. + */ static void spa_keystore_change_key_sync_impl(uint64_t rddobj, uint64_t ddobj, - uint64_t new_rddobj, dsl_wrapping_key_t *wkey, dmu_tx_t *tx) + uint64_t new_rddobj, dsl_wrapping_key_t *wkey, boolean_t skip, + dmu_tx_t *tx) { int ret; zap_cursor_t *zc; @@ -1409,7 +1416,7 @@ spa_keystore_change_key_sync_impl(uint64_t rddobj, uint64_t ddobj, /* hold the dd */ VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd)); - /* ignore hidden dsl dirs */ + /* ignore special dsl dirs */ if (dd->dd_myname[0] == '$' || dd->dd_myname[0] == '%') { dsl_dir_rele(dd, FTAG); return; @@ -1422,7 +1429,8 @@ spa_keystore_change_key_sync_impl(uint64_t rddobj, uint64_t ddobj, * Stop recursing if this dsl dir didn't inherit from the root * or if this dd is a clone. */ - if (ret == ENOENT || curr_rddobj != rddobj || dsl_dir_is_clone(dd)) { + if (ret == ENOENT || + (!skip && (curr_rddobj != rddobj || dsl_dir_is_clone(dd)))) { dsl_dir_rele(dd, FTAG); return; } @@ -1430,19 +1438,23 @@ spa_keystore_change_key_sync_impl(uint64_t rddobj, uint64_t ddobj, /* * If we don't have a wrapping key just update the dck to reflect the * new encryption root. Otherwise rewrap the entire dck and re-sync it - * to disk. + * to disk. If skip is set, we don't do any of this work. */ - if (wkey == NULL) { - VERIFY0(zap_update(dp->dp_meta_objset, dd->dd_crypto_obj, - DSL_CRYPTO_KEY_ROOT_DDOBJ, 8, 1, &new_rddobj, tx)); - } else { - VERIFY0(spa_keystore_dsl_key_hold_dd(dp->dp_spa, dd, - FTAG, &dck)); - dsl_wrapping_key_hold(wkey, dck); - dsl_wrapping_key_rele(dck->dck_wkey, dck); - dck->dck_wkey = wkey; - dsl_crypto_key_sync(dck, tx); - spa_keystore_dsl_key_rele(dp->dp_spa, dck, FTAG); + if (!skip) { + if (wkey == NULL) { + VERIFY0(zap_update(dp->dp_meta_objset, + dd->dd_crypto_obj, + DSL_CRYPTO_KEY_ROOT_DDOBJ, 8, 1, + &new_rddobj, tx)); + } else { + VERIFY0(spa_keystore_dsl_key_hold_dd(dp->dp_spa, dd, + FTAG, &dck)); + dsl_wrapping_key_hold(wkey, dck); + dsl_wrapping_key_rele(dck->dck_wkey, dck); + dck->dck_wkey = wkey; + dsl_crypto_key_sync(dck, tx); + spa_keystore_dsl_key_rele(dp->dp_spa, dck, FTAG); + } } zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); @@ -1454,7 +1466,27 @@ spa_keystore_change_key_sync_impl(uint64_t rddobj, uint64_t ddobj, zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { spa_keystore_change_key_sync_impl(rddobj, - za->za_first_integer, new_rddobj, wkey, tx); + za->za_first_integer, new_rddobj, wkey, B_FALSE, tx); + } + zap_cursor_fini(zc); + + /* + * Recurse into all dsl dirs of clones. 
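
For the clone handling the spa_keystore_change_key_sync_impl() hunks describe above and below, the control flow reduces to a three-way recursion keyed on the skip flag: rewrap self unless skipped, recurse into children unskipped, recurse into clones skipped (a clone shares the already-updated dck with its origin). An illustrative in-memory structure, not the on-disk ZAP layout:

#include <stdbool.h>

typedef struct dir {
	struct dir **children;	int nchildren;
	struct dir **clones;	int nclones;
} dir_t;

static void
change_key(dir_t *dd, bool skip)
{
	if (!skip) {
		/* rewrap this dir's key / update its encryption root here */
	}
	for (int i = 0; i < dd->nchildren; i++)
		change_key(dd->children[i], false);	/* full processing */
	for (int i = 0; i < dd->nclones; i++)
		change_key(dd->clones[i], true);	/* key already done */
}
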
We utilize the skip parameter + * here so that we don't attempt to process the clones directly. This + * is because the clone and its origin share the same dck, which has + * already been updated. + */ + for (zap_cursor_init(zc, dp->dp_meta_objset, + dsl_dir_phys(dd)->dd_clones); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + dsl_dataset_t *clone; + + VERIFY0(dsl_dataset_hold_obj(dp, za->za_first_integer, + FTAG, &clone)); + spa_keystore_change_key_sync_impl(rddobj, + clone->ds_dir->dd_object, new_rddobj, wkey, B_TRUE, tx); + dsl_dataset_rele(clone, FTAG); } zap_cursor_fini(zc); @@ -1534,7 +1566,7 @@ spa_keystore_change_key_sync(void *arg, dmu_tx_t *tx) /* recurse through all children and rewrap their keys */ spa_keystore_change_key_sync_impl(rddobj, ds->ds_dir->dd_object, - new_rddobj, wkey, tx); + new_rddobj, wkey, B_FALSE, tx); /* * All references to the old wkey should be released now (if it @@ -1708,7 +1740,7 @@ dsl_dataset_promote_crypt_sync(dsl_dir_t *target, dsl_dir_t *origin, rw_enter(&dp->dp_spa->spa_keystore.sk_wkeys_lock, RW_WRITER); spa_keystore_change_key_sync_impl(rddobj, origin->dd_object, - target->dd_object, NULL, tx); + target->dd_object, NULL, B_FALSE, tx); rw_exit(&dp->dp_spa->spa_keystore.sk_wkeys_lock); dsl_dataset_rele(targetds, FTAG); diff --git a/usr/src/uts/common/fs/zfs/dsl_scan.c b/usr/src/uts/common/fs/zfs/dsl_scan.c index b619719ba9..f6e8db4100 100644 --- a/usr/src/uts/common/fs/zfs/dsl_scan.c +++ b/usr/src/uts/common/fs/zfs/dsl_scan.c @@ -24,7 +24,7 @@ * Copyright 2016 Gary Mills * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright 2019 Joyent, Inc. - * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. */ #include <sys/dsl_scan.h> @@ -549,6 +549,22 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) zfs_dbgmsg("new-style scrub was modified " "by old software; restarting in txg %llu", (longlong_t)scn->scn_restart_txg); + } else if (dsl_scan_resilvering(dp)) { + /* + * If a resilver is in progress and there are already + * errors, restart it instead of finishing this scan and + * then restarting it. If there haven't been any errors + * then remember that the incore DTL is valid. + */ + if (scn->scn_phys.scn_errors > 0) { + scn->scn_restart_txg = txg; + zfs_dbgmsg("resilver can't excise DTL_MISSING " + "when finished; restarting in txg %llu", + (u_longlong_t)scn->scn_restart_txg); + } else { + /* it's safe to excise DTL when finished */ + spa->spa_scrub_started = B_TRUE; + } } } @@ -599,6 +615,13 @@ dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx) } boolean_t +dsl_scan_resilver_scheduled(dsl_pool_t *dp) +{ + return ((dp->dp_scan && dp->dp_scan->scn_restart_txg != 0) || + (spa_async_tasks(dp->dp_spa) & SPA_ASYNC_RESILVER)); +} + +boolean_t dsl_scan_scrubbing(const dsl_pool_t *dp) { dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys; @@ -794,7 +817,7 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) (void) spa_vdev_state_exit(spa, NULL, 0); if (func == POOL_SCAN_RESILVER) { - dsl_resilver_restart(spa->spa_dsl_pool, 0); + dsl_scan_restart_resilver(spa->spa_dsl_pool, 0); return (0); } @@ -813,41 +836,6 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } -/* - * Sets the resilver defer flag to B_FALSE on all leaf devs under vd. Returns - * B_TRUE if we have devices that need to be resilvered and are available to - * accept resilver I/Os. 
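
The function removed just below survives, per the dsl_scan_done() hunk further on, as vdev_clear_resilver_deferred() in vdev.c (not shown in this diff): a recursive sweep that clears per-leaf deferred flags and reports whether any leaf still needs resilver IO. A simplified model of the sweep's shape, with illustrative types:

#include <stdbool.h>

typedef struct vdev {
	struct vdev **child;
	int children;
	bool leaf, deferred, healthy, dtl_nonempty;
} vdev_t;

/* clear deferred flags; report whether resilver IO is still needed */
static bool
clear_deferred(vdev_t *vd)
{
	bool needed = false;

	for (int c = 0; c < vd->children; c++)
		needed |= clear_deferred(vd->child[c]);
	if (!vd->leaf)
		return (needed);
	vd->deferred = false;
	return (vd->healthy && vd->dtl_nonempty);
}
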
- */ -static boolean_t -dsl_scan_clear_deferred(vdev_t *vd, dmu_tx_t *tx) -{ - boolean_t resilver_needed = B_FALSE; - spa_t *spa = vd->vdev_spa; - - for (int c = 0; c < vd->vdev_children; c++) { - resilver_needed |= - dsl_scan_clear_deferred(vd->vdev_child[c], tx); - } - - if (vd == spa->spa_root_vdev && - spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) { - spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx); - vdev_config_dirty(vd); - spa->spa_resilver_deferred = B_FALSE; - return (resilver_needed); - } - - if (!vdev_is_concrete(vd) || vd->vdev_aux || - !vd->vdev_ops->vdev_op_leaf) - return (resilver_needed); - - if (vd->vdev_resilver_deferred) - vd->vdev_resilver_deferred = B_FALSE; - - return (!vdev_is_dead(vd) && !vd->vdev_offline && - vdev_resilver_needed(vd, NULL, NULL)); -} - /* ARGSUSED */ static void dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) @@ -915,7 +903,6 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) "errors=%llu", spa_get_errlog_size(spa)); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { - spa->spa_scrub_started = B_FALSE; spa->spa_scrub_active = B_FALSE; /* @@ -943,30 +930,33 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) spa_errlog_rotate(spa); /* + * Don't clear flag until after vdev_dtl_reassess to ensure that + * DTL_MISSING will get updated when possible. + */ + spa->spa_scrub_started = B_FALSE; + + /* * We may have finished replacing a device. * Let the async thread assess this and handle the detach. */ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); /* - * Clear any deferred_resilver flags in the config. + * Clear any resilver_deferred flags in the config. * If there are drives that need resilvering, kick * off an asynchronous request to start resilver. - * dsl_scan_clear_deferred() may update the config + * vdev_clear_resilver_deferred() may update the config * before the resilver can restart. In the event of * a crash during this period, the spa loading code * will find the drives that need to be resilvered - * when the machine reboots and start the resilver then. + * and start the resilver then. */ - if (spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { - boolean_t resilver_needed = - dsl_scan_clear_deferred(spa->spa_root_vdev, tx); - if (resilver_needed) { - spa_history_log_internal(spa, - "starting deferred resilver", tx, - "errors=%llu", spa_get_errlog_size(spa)); - spa_async_request(spa, SPA_ASYNC_RESILVER); - } + if (spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER) && + vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) { + spa_history_log_internal(spa, + "starting deferred resilver", tx, "errors=%llu", + (u_longlong_t)spa_get_errlog_size(spa)); + spa_async_request(spa, SPA_ASYNC_RESILVER); } } @@ -1073,7 +1063,7 @@ dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) /* start a new scan, or restart an existing one. 
*/ void -dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) +dsl_scan_restart_resilver(dsl_pool_t *dp, uint64_t txg) { if (txg == 0) { dmu_tx_t *tx; @@ -1221,10 +1211,13 @@ scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx) static boolean_t dsl_scan_should_clear(dsl_scan_t *scn) { + spa_t *spa = scn->scn_dp->dp_spa; vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; - uint64_t mlim_hard, mlim_soft, mused; - uint64_t alloc = metaslab_class_get_alloc(spa_normal_class( - scn->scn_dp->dp_spa)); + uint64_t alloc, mlim_hard, mlim_soft, mused; + + alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + alloc += metaslab_class_get_alloc(spa_special_class(spa)); + alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE, zfs_scan_mem_lim_min); @@ -3863,7 +3856,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, /* * Keep track of how much data we've examined so that - * zpool(1M) status can make useful progress reports. + * zpool(8) status can make useful progress reports. */ scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva); spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva); @@ -4208,3 +4201,33 @@ dsl_scan_freed(spa_t *spa, const blkptr_t *bp) for (int i = 0; i < BP_GET_NDVAS(bp); i++) dsl_scan_freed_dva(spa, bp, i); } + +/* + * Check if a vdev needs resilvering (non-empty DTL), if so, and resilver has + * not started, start it. Otherwise, only restart if max txg in DTL range is + * greater than the max txg in the current scan. If the DTL max is less than + * the scan max, then the vdev has not missed any new data since the resilver + * started, so a restart is not needed. + */ +void +dsl_scan_assess_vdev(dsl_pool_t *dp, vdev_t *vd) +{ + uint64_t min, max; + + if (!vdev_resilver_needed(vd, &min, &max)) + return; + + if (!dsl_scan_resilvering(dp)) { + spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER); + return; + } + + if (max <= dp->dp_scan->scn_phys.scn_max_txg) + return; + + /* restart is needed, check if it can be deferred */ + if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)) + vdev_defer_resilver(vd); + else + spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER); +} diff --git a/usr/src/uts/common/fs/zfs/lua/README.zfs b/usr/src/uts/common/fs/zfs/lua/README.zfs index 0e22de7a4a..bd0804f99e 100644 --- a/usr/src/uts/common/fs/zfs/lua/README.zfs +++ b/usr/src/uts/common/fs/zfs/lua/README.zfs @@ -26,7 +26,7 @@ maintenance policy, the modifications that have been made to it, and how it should (and should not) be used. For a description of the Lua language and features exposed by ZFS channel -programs, please refer to the zfs-program(1m) man page instead. +programs, please refer to the zfs-program(8) man page instead. 
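
dsl_scan_assess_vdev(), added just above the lua changes, boils down to a small decision table: do nothing without a DTL, start a resilver if none is running, and restart (or defer, with RESILVER_DEFER enabled) only when the vdev missed txgs beyond what the in-flight scan already covers. A compilable restatement with assumed scalar stand-ins for the DTL and scan state:

#include <stdbool.h>
#include <stdint.h>

typedef enum { DO_NOTHING, START_RESILVER, DEFER_OR_RESTART } action_t;

static action_t
assess_vdev(bool needs_resilver, bool resilvering,
    uint64_t dtl_max_txg, uint64_t scan_max_txg)
{
	if (!needs_resilver)
		return (DO_NOTHING);
	if (!resilvering)
		return (START_RESILVER);
	if (dtl_max_txg <= scan_max_txg)
		return (DO_NOTHING);	/* scan already covers the gap */
	return (DEFER_OR_RESTART);	/* defer if RESILVER_DEFER enabled */
}
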
Maintenance policy diff --git a/usr/src/uts/common/fs/zfs/lua/ldebug.c b/usr/src/uts/common/fs/zfs/lua/ldebug.c index b8ddcff3c6..4ed0094bde 100644 --- a/usr/src/uts/common/fs/zfs/lua/ldebug.c +++ b/usr/src/uts/common/fs/zfs/lua/ldebug.c @@ -467,7 +467,7 @@ static const char *getfuncname (lua_State *L, CallInfo *ci, const char **name) { return getobjname(p, pc, GETARG_A(i), name); case OP_TFORCALL: { /* for iterator */ *name = "for iterator"; - return "for iterator"; + return "for iterator"; } /* all other instructions can call only through metamethods */ case OP_SELF: diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 42ba1f9a46..4828824b10 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -2414,7 +2414,7 @@ metaslab_load_impl(metaslab_t *msp) msp->ms_max_size = metaslab_largest_allocatable(msp); ASSERT3U(max_size, <=, msp->ms_max_size); hrtime_t load_end = gethrtime(); - msp->ms_load_time = load_end; + msp->ms_load_time = load_end; if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { zfs_dbgmsg("loading: txg %llu, spa %s, vdev_id %llu, " "ms_id %llu, smp_length %llu, " @@ -5639,7 +5639,7 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, range_tree_remove(msp->ms_allocatable, offset, size); range_tree_clear(msp->ms_trim, offset, size); - if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ + if (spa_writeable(spa)) { /* don't dirty if we're zdb(8) */ metaslab_class_t *mc = msp->ms_group->mg_class; multilist_sublist_t *mls = multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp); @@ -5686,7 +5686,7 @@ metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) metaslab_claim_cb_arg_t arg; /* - * Only zdb(1M) can claim on indirect vdevs. This is used + * Only zdb(8) can claim on indirect vdevs. This is used * to detect leaks of mapped space (that are not accounted * for in the obsolete counts, spacemap, or bpobj). */ diff --git a/usr/src/uts/common/fs/zfs/sa.c b/usr/src/uts/common/fs/zfs/sa.c index 0d2d28e1d3..ad4facaf5b 100644 --- a/usr/src/uts/common/fs/zfs/sa.c +++ b/usr/src/uts/common/fs/zfs/sa.c @@ -24,7 +24,6 @@ * Portions Copyright 2011 iXsystems, Inc * Copyright (c) 2013, 2017 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. - * Copyright (c) 2015 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2019 Joyent, Inc. */ diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index 95c35a0f5f..26cc3b0824 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -27,10 +27,11 @@ * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome <tsoome@me.com> + * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. * Copyright 2019 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. - * Copyright (c) 2017 Datto Inc. - * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. + * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org> + * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. 
*/ /* @@ -1730,13 +1731,15 @@ spa_load_l2cache(spa_t *spa) ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + nl2cache = 0; + newvdevs = NULL; if (sav->sav_config != NULL) { VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); - newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); - } else { - nl2cache = 0; - newvdevs = NULL; + if (nl2cache > 0) { + newvdevs = kmem_alloc( + nl2cache * sizeof (void *), KM_SLEEP); + } } oldvdevs = sav->sav_vdevs; @@ -1828,7 +1831,11 @@ spa_load_l2cache(spa_t *spa) VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); - l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); + l2cache = NULL; + if (sav->sav_count > 0) { + l2cache = kmem_alloc( + sav->sav_count * sizeof (void *), KM_SLEEP); + } for (i = 0; i < sav->sav_count; i++) l2cache[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); @@ -2407,7 +2414,8 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) spa->spa_loaded_ts.tv_nsec = 0; } if (error != EBADF) { - zfs_ereport_post(ereport, spa, NULL, NULL, NULL, 0, 0); + (void) zfs_ereport_post(ereport, spa, + NULL, NULL, NULL, 0, 0); } } spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; @@ -3610,6 +3618,7 @@ spa_ld_get_props(spa_t *spa) spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); + spa_prop_find(spa, ZPOOL_PROP_BOOTSIZE, &spa->spa_bootsize); spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, &spa->spa_dedup_ditto); @@ -4379,6 +4388,8 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) } spa_import_progress_remove(spa); + spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); + spa_load_note(spa, "LOADED"); return (0); @@ -5375,10 +5386,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, * Get the root pool information from the root disk, then import the root pool * during the system boot up time. */ -extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); - static nvlist_t * -spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) +spa_generate_rootconf(const char *devpath, const char *devid, uint64_t *guid, + uint64_t pool_guid) { nvlist_t *config; nvlist_t *nvtop, *nvroot; @@ -5396,6 +5406,19 @@ spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) &pgid) == 0); VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); + if (pool_guid != 0 && pool_guid != pgid) { + /* + * The boot loader provided a pool GUID, but it does not match + * the one we found in the label. Return failure so that we + * can fall back to the full device scan. + */ + zfs_dbgmsg("spa_generate_rootconf: loader pool guid %llu != " + "label pool guid %llu", (u_longlong_t)pool_guid, + (u_longlong_t)pgid); + nvlist_free(config); + return (NULL); + } + /* * Put this pool's top-level vdevs into a root vdev. 
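
The spa_generate_rootconf() hunk above adds a cross-check between the boot-loader-supplied pool GUID and the GUID read from the on-disk label, so a stale devpath cannot import the wrong pool. Its acceptance rule in isolation, simplified from the nvlist-based original:

#include <stdint.h>
#include <stdbool.h>

/*
 * Accept a root-pool label only when the loader supplied no pool GUID
 * (pool_guid == 0) or the supplied GUID matches the label's; a mismatch
 * makes the caller fall back to the full device scan.
 */
static bool
rootconf_guid_ok(uint64_t loader_pool_guid, uint64_t label_pool_guid)
{
	return (loader_pool_guid == 0 ||
	    loader_pool_guid == label_pool_guid);
}
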
*/ @@ -5462,7 +5485,8 @@ spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) * "/pci@1f,0/ide@d/disk@0,0:a" */ int -spa_import_rootpool(char *devpath, char *devid) +spa_import_rootpool(char *devpath, char *devid, uint64_t pool_guid, + uint64_t vdev_guid) { spa_t *spa; vdev_t *rvd, *bvd, *avd = NULL; @@ -5470,20 +5494,43 @@ spa_import_rootpool(char *devpath, char *devid) uint64_t guid, txg; char *pname; int error; + const char *altdevpath = NULL; /* * Read the label from the boot device and generate a configuration. */ - config = spa_generate_rootconf(devpath, devid, &guid); + config = spa_generate_rootconf(devpath, devid, &guid, pool_guid); #if defined(_OBP) && defined(_KERNEL) if (config == NULL) { if (strstr(devpath, "/iscsi/ssd") != NULL) { /* iscsi boot */ get_iscsi_bootpath_phy(devpath); - config = spa_generate_rootconf(devpath, devid, &guid); + config = spa_generate_rootconf(devpath, devid, &guid, + pool_guid); } } #endif + + /* + * We were unable to import the pool using the /devices path or devid + * provided by the boot loader. This may be the case if the boot + * device has been connected to a different location in the system, or + * if a new boot environment has changed the driver used to access the + * boot device. + * + * Attempt an exhaustive scan of all visible block devices to see if we + * can locate an alternative /devices path with a label that matches + * the expected pool and vdev GUID. + */ + if (config == NULL && (altdevpath = + vdev_disk_preroot_lookup(pool_guid, vdev_guid)) != NULL) { + cmn_err(CE_NOTE, "Original /devices path (%s) not available; " + "ZFS is trying an alternate path (%s)", devpath, + altdevpath); + config = spa_generate_rootconf(altdevpath, NULL, &guid, + pool_guid); + } + if (config == NULL) { cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", devpath); @@ -6360,9 +6407,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ if (dsl_scan_resilvering(spa_get_dsl(spa)) && spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) - vdev_set_deferred_resilver(spa, newvd); + vdev_defer_resilver(newvd); else - dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); + dsl_scan_restart_resilver(spa->spa_dsl_pool, dtl_max_txg); if (spa->spa_bootfs) spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); @@ -7600,7 +7647,7 @@ spa_async_thread(void *arg) if (tasks & SPA_ASYNC_RESILVER && (!dsl_scan_resilvering(dp) || !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) - dsl_resilver_restart(dp, 0); + dsl_scan_restart_resilver(dp, 0); if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { mutex_enter(&spa_namespace_lock); @@ -7627,6 +7674,17 @@ spa_async_thread(void *arg) } /* + * Kick off L2 cache rebuilding. + */ + if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { + mutex_enter(&spa_namespace_lock); + spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); + l2arc_spa_rebuild_start(spa); + spa_config_exit(spa, SCL_L2ARC, FTAG); + mutex_exit(&spa_namespace_lock); + } + + /* * Let the world know that we're done. 
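
SPA_ASYNC_L2CACHE_REBUILD above rides the existing spa_async_request()/spa_async_thread() machinery: requesters OR a bit into the task word under a lock, and the async thread consumes the whole word and dispatches each set bit. A userland model of that handoff; the flag values are illustrative:

#include <pthread.h>

#define	ASYNC_RESILVER		0x01
#define	ASYNC_L2CACHE_REBUILD	0x02	/* new in this patch */

static pthread_mutex_t async_lock = PTHREAD_MUTEX_INITIALIZER;
static int async_tasks;

static void
async_request(int task)
{
	pthread_mutex_lock(&async_lock);
	async_tasks |= task;		/* idempotent: bit may be set twice */
	pthread_mutex_unlock(&async_lock);
}

static int
async_consume(void)
{
	pthread_mutex_lock(&async_lock);
	int tasks = async_tasks;
	async_tasks = 0;		/* claim the whole batch */
	pthread_mutex_unlock(&async_lock);
	return (tasks);			/* caller dispatches each set bit */
}
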
*/ mutex_enter(&spa->spa_async_lock); @@ -7716,6 +7774,12 @@ spa_async_request(spa_t *spa, int task) mutex_exit(&spa->spa_async_lock); } +int +spa_async_tasks(spa_t *spa) +{ + return (spa->spa_async_tasks); +} + /* * ========================================================================== * SPA syncing routines diff --git a/usr/src/uts/common/fs/zfs/spa_config.c b/usr/src/uts/common/fs/zfs/spa_config.c index 4719696ca4..ae814208fd 100644 --- a/usr/src/uts/common/fs/zfs/spa_config.c +++ b/usr/src/uts/common/fs/zfs/spa_config.c @@ -280,7 +280,8 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent) * resource issues are resolved. */ if (target->spa_ccw_fail_time == 0) { - zfs_ereport_post(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE, + (void) zfs_ereport_post( + FM_EREPORT_ZFS_CONFIG_CACHE_WRITE, target, NULL, NULL, NULL, 0, 0); } target->spa_ccw_fail_time = gethrtime(); diff --git a/usr/src/uts/common/fs/zfs/spa_history.c b/usr/src/uts/common/fs/zfs/spa_history.c index 897d3c6e9a..44a4ec7ddc 100644 --- a/usr/src/uts/common/fs/zfs/spa_history.c +++ b/usr/src/uts/common/fs/zfs/spa_history.c @@ -334,7 +334,7 @@ spa_history_log_sync(void *arg, dmu_tx_t *tx) * posted as a result of the ZPOOL_HIST_CMD key being present * it would result in only one sysevent being posted with the * full command line arguments, requiring the consumer to know - * how to parse and understand zfs(1M) command invocations. + * how to parse and understand zfs(8) command invocations. */ spa_history_log_notify(spa, nvl); } else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) { diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index 9dac4e2ddc..cb59eef824 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -44,6 +44,7 @@ #include <sys/vdev_impl.h> #include <sys/vdev_initialize.h> #include <sys/vdev_trim.h> +#include <sys/vdev_raidz.h> #include <sys/metaslab.h> #include <sys/uberblock_impl.h> #include <sys/txg.h> @@ -315,6 +316,16 @@ uint64_t zfs_deadman_checktime_ms = 5000ULL; */ int zfs_deadman_enabled = -1; +#if defined(__amd64__) || defined(__i386__) +/* + * Should we allow the use of mechanisms that depend on saving and restoring + * the FPU state? This was disabled initially due to stability issues in + * the kernel FPU routines; see bug 13717. As of the fixes for 13902 and + * 13915, it has once again been enabled. + */ +int zfs_fpu_enabled = 1; +#endif + /* * The worst case is single-sector max-parity RAID-Z blocks, in which * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) @@ -1373,7 +1384,7 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) /* * If anything changed, wait for it to sync. This ensures that, - * from the system administrator's perspective, zpool(1M) commands + * from the system administrator's perspective, zpool(8) commands * are synchronous. This is important for things like zpool offline: * when the command completes, you expect no further I/O from ZFS. 
*/ @@ -2253,6 +2264,7 @@ spa_init(int mode) zil_init(); vdev_cache_stat_init(); vdev_mirror_stat_init(); + vdev_raidz_math_init(); zfs_prop_init(); zpool_prop_init(); zpool_feature_init(); @@ -2271,6 +2283,7 @@ spa_fini(void) vdev_cache_stat_fini(); vdev_mirror_stat_fini(); + vdev_raidz_math_fini(); zil_fini(); dmu_fini(); zio_fini(); diff --git a/usr/src/uts/common/fs/zfs/sys/abd.h b/usr/src/uts/common/fs/zfs/sys/abd.h index 621635933e..23699c0420 100644 --- a/usr/src/uts/common/fs/zfs/sys/abd.h +++ b/usr/src/uts/common/fs/zfs/sys/abd.h @@ -103,6 +103,15 @@ int abd_cmp(abd_t *, abd_t *, size_t); int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t); void abd_zero_off(abd_t *, size_t, size_t); +void abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, + ssize_t csize, ssize_t dsize, const unsigned parity, + void (*func_raidz_gen)(void **, const void *, size_t, size_t)); +void abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, + ssize_t tsize, const unsigned parity, + void (*func_raidz_rec)(void **t, const size_t tsize, void **c, + const unsigned *mul), + const unsigned *mul); + /* * Wrappers for calls with offsets of 0 */ diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h index 1ef3bb79ca..e5c18febe5 100644 --- a/usr/src/uts/common/fs/zfs/sys/arc.h +++ b/usr/src/uts/common/fs/zfs/sys/arc.h @@ -179,6 +179,16 @@ typedef enum arc_space_type { ARC_SPACE_NUMTYPES } arc_space_type_t; +typedef enum arc_state_type { + ARC_STATE_ANON, + ARC_STATE_MRU, + ARC_STATE_MRU_GHOST, + ARC_STATE_MFU, + ARC_STATE_MFU_GHOST, + ARC_STATE_L2C_ONLY, + ARC_STATE_NUMTYPES +} arc_state_type_t; + void arc_space_consume(uint64_t space, arc_space_type_t type); void arc_space_return(uint64_t space, arc_space_type_t type); boolean_t arc_is_metadata(arc_buf_t *buf); @@ -248,10 +258,14 @@ void arc_fini(void); void l2arc_add_vdev(spa_t *spa, vdev_t *vd); void l2arc_remove_vdev(vdev_t *vd); boolean_t l2arc_vdev_present(vdev_t *vd); +void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); +boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, + uint64_t check); void l2arc_init(void); void l2arc_fini(void); void l2arc_start(void); void l2arc_stop(void); +void l2arc_spa_rebuild_start(spa_t *spa); #ifndef _KERNEL extern boolean_t arc_watch; diff --git a/usr/src/uts/common/fs/zfs/sys/arc_impl.h b/usr/src/uts/common/fs/zfs/sys/arc_impl.h new file mode 100644 index 0000000000..d35b7eea2d --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/arc_impl.h @@ -0,0 +1,876 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2019, Joyent, Inc. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 
+ * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2020, George Amanakis. All rights reserved. + */ + +#ifndef _SYS_ARC_IMPL_H +#define _SYS_ARC_IMPL_H + +#include <sys/arc.h> +#include <sys/multilist.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Note that buffers can be in one of 6 states: + * ARC_anon - anonymous (discussed below) + * ARC_mru - recently used, currently cached + * ARC_mru_ghost - recently used, no longer in cache + * ARC_mfu - frequently used, currently cached + * ARC_mfu_ghost - frequently used, no longer in cache + * ARC_l2c_only - exists in L2ARC but not other states + * When there are no active references to the buffer, they are + * linked onto a list in one of these arc states. These are + * the only buffers that can be evicted or deleted. Within each + * state there are multiple lists, one for meta-data and one for + * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, + * etc.) is tracked separately so that it can be managed more + * explicitly: favored over data, limited explicitly. + * + * Anonymous buffers are buffers that are not associated with + * a DVA. These are buffers that hold dirty block copies + * before they are written to stable storage. By definition, + * they are "ref'd" and are considered part of arc_mru + * that cannot be freed. Generally, they will acquire a DVA + * as they are written and migrate onto the arc_mru list. + * + * The ARC_l2c_only state is for buffers that are in the second + * level ARC but no longer in any of the ARC_m* lists. The second + * level ARC itself may also contain buffers that are in any of + * the ARC_m* states - meaning that a buffer can exist in two + * places. The reason for the ARC_l2c_only state is to keep the + * buffer header in the hash table, so that reads that hit the + * second level ARC benefit from these fast lookups. + */ + +typedef struct arc_state { + /* + * list of evictable buffers + */ + multilist_t *arcs_list[ARC_BUFC_NUMTYPES]; + /* + * total amount of evictable data in this state + */ + zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; + /* + * total amount of data in this state; this includes: evictable, + * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
+ */ + zfs_refcount_t arcs_size; + + arc_state_type_t arcs_state; +} arc_state_t; + +typedef struct arc_callback arc_callback_t; + +struct arc_callback { + void *acb_private; + arc_read_done_func_t *acb_done; + arc_buf_t *acb_buf; + boolean_t acb_encrypted; + boolean_t acb_compressed; + boolean_t acb_noauth; + zbookmark_phys_t acb_zb; + zio_t *acb_zio_dummy; + zio_t *acb_zio_head; + arc_callback_t *acb_next; +}; + +typedef struct arc_write_callback arc_write_callback_t; + +struct arc_write_callback { + void *awcb_private; + arc_write_done_func_t *awcb_ready; + arc_write_done_func_t *awcb_children_ready; + arc_write_done_func_t *awcb_physdone; + arc_write_done_func_t *awcb_done; + arc_buf_t *awcb_buf; +}; + +/* + * ARC buffers are separated into multiple structs as a memory saving measure: + * - Common fields struct, always defined, and embedded within it: + * - L2-only fields, always allocated but undefined when not in L2ARC + * - L1-only fields, only allocated when in L1ARC + * + * Buffer in L1 Buffer only in L2 + * +------------------------+ +------------------------+ + * | arc_buf_hdr_t | | arc_buf_hdr_t | + * | | | | + * | | | | + * | | | | + * +------------------------+ +------------------------+ + * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | + * | (undefined if L1-only) | | | + * +------------------------+ +------------------------+ + * | l1arc_buf_hdr_t | + * | | + * | | + * | | + * | | + * +------------------------+ + * + * Because it's possible for the L2ARC to become extremely large, we can wind + * up eating a lot of memory in L2ARC buffer headers, so the size of a header + * is minimized by only allocating the fields necessary for an L1-cached buffer + * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and + * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple + * words in pointers. arc_hdr_realloc() is used to switch a header between + * these two allocation states. + */ +typedef struct l1arc_buf_hdr { + kmutex_t b_freeze_lock; + zio_cksum_t *b_freeze_cksum; +#ifdef ZFS_DEBUG + /* + * Used for debugging with kmem_flags - by allocating and freeing + * b_thawed when the buffer is thawed, we get a record of the stack + * trace that thawed it. + */ + void *b_thawed; +#endif + + arc_buf_t *b_buf; + uint32_t b_bufcnt; + /* for waiting on writes to complete */ + kcondvar_t b_cv; + uint8_t b_byteswap; + + /* protected by arc state mutex */ + arc_state_t *b_state; + multilist_node_t b_arc_node; + + /* updated atomically */ + clock_t b_arc_access; + + /* self protecting */ + zfs_refcount_t b_refcnt; + + arc_callback_t *b_acb; + abd_t *b_pabd; +} l1arc_buf_hdr_t; + +typedef enum l2arc_dev_hdr_flags_t { + L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */ +} l2arc_dev_hdr_flags_t; + +/* + * Pointer used in persistent L2ARC (for pointing to log blocks). + */ +typedef struct l2arc_log_blkptr { + /* + * Offset of log block within the device, in bytes + */ + uint64_t lbp_daddr; + /* + * Aligned payload size (in bytes) of the log block + */ + uint64_t lbp_payload_asize; + /* + * Offset in bytes of the first buffer in the payload + */ + uint64_t lbp_payload_start; + /* + * lbp_prop has the following format: + * * logical size (in bytes) + * * aligned (after compression) size (in bytes) + * * compression algorithm (we always LZ4-compress l2arc logs) + * * checksum algorithm (used for lbp_cksum) + */ + uint64_t lbp_prop; + zio_cksum_t lbp_cksum; /* checksum of log */ +} l2arc_log_blkptr_t; + +/* + * The persistent L2ARC device header. 
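+ * (Editorial note, not part of the original change: a single copy of this header is presumably written at a fixed offset at the front of each cache device and read back by l2arc_rebuild_vdev(), declared in arc.h above, when the device is reopened.)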
+ * Byte order of magic determines whether 64-bit bswap of fields is necessary. + */ +typedef struct l2arc_dev_hdr_phys { + uint64_t dh_magic; /* L2ARC_DEV_HDR_MAGIC */ + uint64_t dh_version; /* Persistent L2ARC version */ + + /* + * Global L2ARC device state and metadata. + */ + uint64_t dh_spa_guid; + uint64_t dh_vdev_guid; + uint64_t dh_log_entries; /* mirror of l2ad_log_entries */ + uint64_t dh_evict; /* evicted offset in bytes */ + uint64_t dh_flags; /* l2arc_dev_hdr_flags_t */ + /* + * Used in zdb.c for determining if a log block is valid, in the same + * way that l2arc_rebuild() does. + */ + uint64_t dh_start; /* mirror of l2ad_start */ + uint64_t dh_end; /* mirror of l2ad_end */ + /* + * Start of log block chain. [0] -> newest log, [1] -> one older (used + * for initiating prefetch). + */ + l2arc_log_blkptr_t dh_start_lbps[2]; + /* + * Aligned size of all log blocks as accounted by vdev_space_update(). + */ + uint64_t dh_lb_asize; /* mirror of l2ad_lb_asize */ + uint64_t dh_lb_count; /* mirror of l2ad_lb_count */ + const uint64_t dh_pad[32]; /* pad to 512 bytes */ + zio_eck_t dh_tail; +} l2arc_dev_hdr_phys_t; +CTASSERT(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE); + +/* + * A single ARC buffer header entry in a l2arc_log_blk_phys_t. + */ +typedef struct l2arc_log_ent_phys { + dva_t le_dva; /* dva of buffer */ + uint64_t le_birth; /* birth txg of buffer */ + /* + * le_prop has the following format: + * * logical size (in bytes) + * * physical (compressed) size (in bytes) + * * compression algorithm + * * object type (used to restore arc_buf_contents_t) + * * protected status (used for encryption) + * * prefetch status (used in l2arc_read_done()) + */ + uint64_t le_prop; + uint64_t le_daddr; /* buf location on l2dev */ + /* + * We pad the size of each entry to a power of 2 so that the size of + * l2arc_log_blk_phys_t is power-of-2 aligned with SPA_MINBLOCKSHIFT, + * because of the L2ARC_SET_*SIZE macros. + */ + const uint64_t le_pad[3]; /* pad to 64 bytes */ +} l2arc_log_ent_phys_t; + +#define L2ARC_LOG_BLK_MAX_ENTRIES (1022) + +/* + * A log block of up to 1022 ARC buffer log entries, chained into the + * persistent L2ARC metadata linked list. Byte order of magic determines + * whether 64-bit bswap of fields is necessary. + */ +typedef struct l2arc_log_blk_phys { + uint64_t lb_magic; /* L2ARC_LOG_BLK_MAGIC */ + /* + * There are 2 chains (headed by dh_start_lbps[2]), and this field + * points back to the previous block in this chain. We alternate + * which chain we append to, so they are time-wise and offset-wise + * interleaved, but that is an optimization rather than for + * correctness. + */ + l2arc_log_blkptr_t lb_prev_lbp; /* pointer to prev log block */ + /* + * Pad header section to 128 bytes + */ + uint64_t lb_pad[7]; + /* Payload */ + l2arc_log_ent_phys_t lb_entries[L2ARC_LOG_BLK_MAX_ENTRIES]; +} l2arc_log_blk_phys_t; /* 64K total */ +/* + * The size of l2arc_log_blk_phys_t has to be power-of-2 aligned with + * SPA_MINBLOCKSHIFT because of L2BLK_SET_*SIZE macros. + */ +CTASSERT(IS_P2ALIGNED(sizeof (l2arc_log_blk_phys_t), + 1ULL << SPA_MINBLOCKSHIFT)); +CTASSERT(sizeof (l2arc_log_blk_phys_t) >= SPA_MINBLOCKSIZE); +CTASSERT(sizeof (l2arc_log_blk_phys_t) <= SPA_MAXBLOCKSIZE); + +/* + * These structures hold in-flight abd buffers for log blocks as they're being + * written to the L2ARC device. + */ +typedef struct l2arc_lb_abd_buf { + abd_t *abd; + list_node_t node; +} l2arc_lb_abd_buf_t; + +/* + * These structures hold pointers to log blocks present on the L2ARC device. 
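+ * (Editorial note, not part of the original change: these presumably live on the owning l2arc_dev_t's l2ad_lbptr_list below, so that eviction and rebuild can walk the chain of log blocks without re-reading it from the device.)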
+ */ +typedef struct l2arc_lb_ptr_buf { + l2arc_log_blkptr_t *lb_ptr; + list_node_t node; +} l2arc_lb_ptr_buf_t; + +/* Macros for setting fields in le_prop and lbp_prop */ +#define L2BLK_GET_LSIZE(field) \ + BF64_GET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1) +#define L2BLK_SET_LSIZE(field, x) \ + BF64_SET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) +#define L2BLK_GET_PSIZE(field) \ + BF64_GET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1) +#define L2BLK_SET_PSIZE(field, x) \ + BF64_SET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) +#define L2BLK_GET_COMPRESS(field) \ + BF64_GET((field), 32, SPA_COMPRESSBITS) +#define L2BLK_SET_COMPRESS(field, x) \ + BF64_SET((field), 32, SPA_COMPRESSBITS, x) +#define L2BLK_GET_PREFETCH(field) BF64_GET((field), 39, 1) +#define L2BLK_SET_PREFETCH(field, x) BF64_SET((field), 39, 1, x) +#define L2BLK_GET_CHECKSUM(field) BF64_GET((field), 40, 8) +#define L2BLK_SET_CHECKSUM(field, x) BF64_SET((field), 40, 8, x) +#define L2BLK_GET_TYPE(field) BF64_GET((field), 48, 8) +#define L2BLK_SET_TYPE(field, x) BF64_SET((field), 48, 8, x) +#define L2BLK_GET_PROTECTED(field) BF64_GET((field), 56, 1) +#define L2BLK_SET_PROTECTED(field, x) BF64_SET((field), 56, 1, x) +#define L2BLK_GET_STATE(field) BF64_GET((field), 57, 4) +#define L2BLK_SET_STATE(field, x) BF64_SET((field), 57, 4, x) + +#define PTR_SWAP(x, y) \ + do { \ + void *tmp = (x);\ + x = y; \ + y = tmp; \ + _NOTE(CONSTCOND)\ + } while (0) + +#define L2ARC_DEV_HDR_MAGIC 0x5a46534341434845LLU /* ASCII: "ZFSCACHE" */ +#define L2ARC_LOG_BLK_MAGIC 0x4c4f47424c4b4844LLU /* ASCII: "LOGBLKHD" */ + +/* + * L2ARC Internals + */ +typedef struct l2arc_dev { + vdev_t *l2ad_vdev; /* vdev */ + spa_t *l2ad_spa; /* spa */ + uint64_t l2ad_hand; /* next write location */ + uint64_t l2ad_start; /* first addr on device */ + uint64_t l2ad_end; /* last addr on device */ + boolean_t l2ad_first; /* first sweep through */ + boolean_t l2ad_writing; /* currently writing */ + kmutex_t l2ad_mtx; /* lock for buffer list */ + list_t l2ad_buflist; /* buffer list */ + list_node_t l2ad_node; /* device list node */ + zfs_refcount_t l2ad_alloc; /* allocated bytes */ + /* + * Persistence-related stuff + */ + l2arc_dev_hdr_phys_t *l2ad_dev_hdr; /* persistent device header */ + uint64_t l2ad_dev_hdr_asize; /* aligned hdr size */ + l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */ + int l2ad_log_ent_idx; /* index into cur log blk */ + /* Number of bytes in current log block's payload */ + uint64_t l2ad_log_blk_payload_asize; + /* + * Offset (in bytes) of the first buffer in current log block's + * payload. + */ + uint64_t l2ad_log_blk_payload_start; + /* Flag indicating whether a rebuild is scheduled or is going on */ + boolean_t l2ad_rebuild; + boolean_t l2ad_rebuild_cancel; + boolean_t l2ad_rebuild_began; + uint64_t l2ad_log_entries; /* entries per log blk */ + uint64_t l2ad_evict; /* evicted offset in bytes */ + /* List of pointers to log blocks present in the L2ARC device */ + list_t l2ad_lbptr_list; + /* + * Aligned size of all log blocks as accounted by vdev_space_update(). + */ + zfs_refcount_t l2ad_lb_asize; + /* + * Number of log blocks present on the device. + */ + zfs_refcount_t l2ad_lb_count; +} l2arc_dev_t; + +/* + * Encrypted blocks will need to be stored encrypted on the L2ARC + * disk as they appear in the main pool. In order for this to work we + * need to pass around the encryption parameters so they can be used + * to write data to the L2ARC. 
This struct is only defined in the + * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED + * flag set. + */ +typedef struct arc_buf_hdr_crypt { + abd_t *b_rabd; /* raw encrypted data */ + dmu_object_type_t b_ot; /* object type */ + uint32_t b_ebufcnt; /* number of encrypted buffers */ + + /* dsobj for looking up encryption key for l2arc encryption */ + uint64_t b_dsobj; /* for looking up key */ + + /* encryption parameters */ + uint8_t b_salt[ZIO_DATA_SALT_LEN]; + uint8_t b_iv[ZIO_DATA_IV_LEN]; + + /* + * Technically this could be removed since we will always be able to + * get the mac from the bp when we need it. However, it is inconvenient + * for callers of arc code to have to pass a bp in all the time. This + * also allows us to assert that L2ARC data is properly encrypted to + * match the data in the main storage pool. + */ + uint8_t b_mac[ZIO_DATA_MAC_LEN]; +} arc_buf_hdr_crypt_t; + +typedef struct l2arc_buf_hdr { + /* protected by arc_buf_hdr mutex */ + l2arc_dev_t *b_dev; /* L2ARC device */ + uint64_t b_daddr; /* disk address, offset byte */ + + arc_state_type_t b_arcs_state; + list_node_t b_l2node; +} l2arc_buf_hdr_t; + +typedef struct l2arc_write_callback { + l2arc_dev_t *l2wcb_dev; /* device info */ + arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ + /* in-flight list of log blocks */ + list_t l2wcb_abd_list; +} l2arc_write_callback_t; + +struct arc_buf_hdr { + /* protected by hash lock */ + dva_t b_dva; + uint64_t b_birth; + + arc_buf_contents_t b_type; + arc_buf_hdr_t *b_hash_next; + arc_flags_t b_flags; + + /* + * This field stores the size of the data buffer after + * compression, and is set in the arc's zio completion handlers. + * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). + * + * While the block pointers can store up to 32MB in their psize + * field, we can only store up to 32MB minus 512B. This is due + * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. + * a field of zeros represents 512B in the bp). We can't use a + * bias of 1 since we need to reserve a psize of zero, here, to + * represent holes and embedded blocks. + * + * This isn't a problem in practice, since the maximum size of a + * buffer is limited to 16MB, so we never need to store 32MB in + * this field. + */ + uint16_t b_psize; + + /* + * This field stores the size of the data buffer before + * compression, and cannot change once set. It is in units + * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) + */ + uint16_t b_lsize; /* immutable */ + uint64_t b_spa; /* immutable */ + + /* L2ARC fields. Undefined when not in L2ARC. */ + l2arc_buf_hdr_t b_l2hdr; + /* L1ARC fields. Undefined when in l2arc_only state */ + l1arc_buf_hdr_t b_l1hdr; + /* + * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED + * is set and the L1 header exists.
+ */ + arc_buf_hdr_crypt_t b_crypt_hdr; +}; + +typedef struct arc_stats { + kstat_named_t arcstat_hits; + kstat_named_t arcstat_misses; + kstat_named_t arcstat_demand_data_hits; + kstat_named_t arcstat_demand_data_misses; + kstat_named_t arcstat_demand_metadata_hits; + kstat_named_t arcstat_demand_metadata_misses; + kstat_named_t arcstat_prefetch_data_hits; + kstat_named_t arcstat_prefetch_data_misses; + kstat_named_t arcstat_prefetch_metadata_hits; + kstat_named_t arcstat_prefetch_metadata_misses; + kstat_named_t arcstat_mru_hits; + kstat_named_t arcstat_mru_ghost_hits; + kstat_named_t arcstat_mfu_hits; + kstat_named_t arcstat_mfu_ghost_hits; + kstat_named_t arcstat_deleted; + /* + * Number of buffers that could not be evicted because the hash lock + * was held by another thread. The lock may not necessarily be held + * by something using the same buffer, since hash locks are shared + * by multiple buffers. + */ + kstat_named_t arcstat_mutex_miss; + /* + * Number of buffers skipped when updating the access state due to the + * header having already been released after acquiring the hash lock. + */ + kstat_named_t arcstat_access_skip; + /* + * Number of buffers skipped because they have I/O in progress, are + * indirect prefetch buffers that have not lived long enough, or are + * not from the spa we're trying to evict from. + */ + kstat_named_t arcstat_evict_skip; + /* + * Number of times arc_evict_state() was unable to evict enough + * buffers to reach its target amount. + */ + kstat_named_t arcstat_evict_not_enough; + kstat_named_t arcstat_evict_l2_cached; + kstat_named_t arcstat_evict_l2_eligible; + kstat_named_t arcstat_evict_l2_eligible_mfu; + kstat_named_t arcstat_evict_l2_eligible_mru; + kstat_named_t arcstat_evict_l2_ineligible; + kstat_named_t arcstat_evict_l2_skip; + kstat_named_t arcstat_hash_elements; + kstat_named_t arcstat_hash_elements_max; + kstat_named_t arcstat_hash_collisions; + kstat_named_t arcstat_hash_chains; + kstat_named_t arcstat_hash_chain_max; + kstat_named_t arcstat_p; + kstat_named_t arcstat_c; + kstat_named_t arcstat_c_min; + kstat_named_t arcstat_c_max; + /* Not updated directly; only synced in arc_kstat_update. */ + kstat_named_t arcstat_size; + /* + * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. + * Note that the compressed bytes may match the uncompressed bytes + * if the block is either not compressed or compressed arc is disabled. + */ + kstat_named_t arcstat_compressed_size; + /* + * Uncompressed size of the data stored in b_pabd. If compressed + * arc is disabled then this value will be identical to the stat + * above. + */ + kstat_named_t arcstat_uncompressed_size; + /* + * Number of bytes stored in all the arc_buf_t's. This is classified + * as "overhead" since this data is typically short-lived and will + * be evicted from the arc when it becomes unreferenced unless the + * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level + * values have been set (see comment in dbuf.c for more information). + */ + kstat_named_t arcstat_overhead_size; + /* + * Number of bytes consumed by internal ARC structures necessary + * for tracking purposes; these structures are not actually + * backed by ARC buffers. This includes arc_buf_hdr_t structures + * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only + * caches), and arc_buf_t structures (allocated via arc_buf_t + * cache). + * Not updated directly; only synced in arc_kstat_update. 
+ */ + kstat_named_t arcstat_hdr_size; + /* + * Number of bytes consumed by ARC buffers of type equal to + * ARC_BUFC_DATA. This is generally consumed by buffers backing + * on disk user data (e.g. plain file contents). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_data_size; + /* + * Number of bytes consumed by ARC buffers of type equal to + * ARC_BUFC_METADATA. This is generally consumed by buffers + * backing on disk data that is used for internal ZFS + * structures (e.g. ZAP, dnode, indirect blocks, etc). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_metadata_size; + /* + * Number of bytes consumed by various buffers and structures + * not actually backed with ARC buffers. This includes bonus + * buffers (allocated directly via zio_buf_* functions), + * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t + * cache), and dnode_t structures (allocated via dnode_t cache). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_other_size; + /* + * Total number of bytes consumed by ARC buffers residing in the + * arc_anon state. This includes *all* buffers in the arc_anon + * state; e.g. data, metadata, evictable, and unevictable buffers + * are all included in this value. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_anon_size; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_DATA, + * residing in the arc_anon state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_anon_evictable_data; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_METADATA, + * residing in the arc_anon state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_anon_evictable_metadata; + /* + * Total number of bytes consumed by ARC buffers residing in the + * arc_mru state. This includes *all* buffers in the arc_mru + * state; e.g. data, metadata, evictable, and unevictable buffers + * are all included in this value. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_size; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_DATA, + * residing in the arc_mru state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_evictable_data; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_METADATA, + * residing in the arc_mru state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_evictable_metadata; + /* + * Total number of bytes that *would have been* consumed by ARC + * buffers in the arc_mru_ghost state. The key thing to note + * here, is the fact that this size doesn't actually indicate + * RAM consumption. The ghost lists only consist of headers and + * don't actually have ARC buffers linked off of these headers. 
+ * Thus, *if* the headers had associated ARC buffers, these + * buffers *would have* consumed this number of bytes. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_ghost_size; + /* + * Number of bytes that *would have been* consumed by ARC + * buffers that are eligible for eviction, of type + * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_ghost_evictable_data; + /* + * Number of bytes that *would have been* consumed by ARC + * buffers that are eligible for eviction, of type + * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_ghost_evictable_metadata; + /* + * Total number of bytes consumed by ARC buffers residing in the + * arc_mfu state. This includes *all* buffers in the arc_mfu + * state; e.g. data, metadata, evictable, and unevictable buffers + * are all included in this value. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mfu_size; + /* + * Number of bytes consumed by ARC buffers that are eligible for + * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu + * state. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mfu_evictable_data; + /* + * Number of bytes consumed by ARC buffers that are eligible for + * eviction, of type ARC_BUFC_METADATA, and reside in the + * arc_mfu state. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mfu_evictable_metadata; + /* + * Total number of bytes that *would have been* consumed by ARC + * buffers in the arc_mfu_ghost state. See the comment above + * arcstat_mru_ghost_size for more details. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mfu_ghost_size; + /* + * Number of bytes that *would have been* consumed by ARC + * buffers that are eligible for eviction, of type + * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mfu_ghost_evictable_data; + /* + * Number of bytes that *would have been* consumed by ARC + * buffers that are eligible for eviction, of type + * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mfu_ghost_evictable_metadata; + kstat_named_t arcstat_l2_hits; + kstat_named_t arcstat_l2_misses; + /* + * Allocated size (in bytes) of L2ARC cached buffers by ARC state. + */ + kstat_named_t arcstat_l2_prefetch_asize; + kstat_named_t arcstat_l2_mru_asize; + kstat_named_t arcstat_l2_mfu_asize; + /* + * Allocated size (in bytes) of L2ARC cached buffers by buffer content + * type.
+ */ + kstat_named_t arcstat_l2_bufc_data_asize; + kstat_named_t arcstat_l2_bufc_metadata_asize; + kstat_named_t arcstat_l2_feeds; + kstat_named_t arcstat_l2_rw_clash; + kstat_named_t arcstat_l2_read_bytes; + kstat_named_t arcstat_l2_write_bytes; + kstat_named_t arcstat_l2_writes_sent; + kstat_named_t arcstat_l2_writes_done; + kstat_named_t arcstat_l2_writes_error; + kstat_named_t arcstat_l2_writes_lock_retry; + kstat_named_t arcstat_l2_evict_lock_retry; + kstat_named_t arcstat_l2_evict_reading; + kstat_named_t arcstat_l2_evict_l1cached; + kstat_named_t arcstat_l2_free_on_write; + kstat_named_t arcstat_l2_abort_lowmem; + kstat_named_t arcstat_l2_cksum_bad; + kstat_named_t arcstat_l2_io_error; + kstat_named_t arcstat_l2_lsize; + kstat_named_t arcstat_l2_psize; + /* Not updated directly; only synced in arc_kstat_update. */ + kstat_named_t arcstat_l2_hdr_size; + /* + * Number of L2ARC log blocks written. These are used for restoring the + * L2ARC. Updated during writing of L2ARC log blocks. + */ + kstat_named_t arcstat_l2_log_blk_writes; + /* + * Moving average of the aligned size of the L2ARC log blocks, in + * bytes. Updated during L2ARC rebuild and during writing of L2ARC + * log blocks. + */ + kstat_named_t arcstat_l2_log_blk_avg_asize; + /* Aligned size of L2ARC log blocks on L2ARC devices. */ + kstat_named_t arcstat_l2_log_blk_asize; + /* Number of L2ARC log blocks present on L2ARC devices. */ + kstat_named_t arcstat_l2_log_blk_count; + /* + * Moving average of the ratio of the aligned size of L2ARC restored + * data, in bytes, to the aligned size of its metadata in L2ARC, in + * bytes. Updated during L2ARC rebuild and during writing of L2ARC log + * blocks. + */ + kstat_named_t arcstat_l2_data_to_meta_ratio; + /* + * Number of times the L2ARC rebuild was successful for an L2ARC device. + */ + kstat_named_t arcstat_l2_rebuild_success; + /* + * Number of times the L2ARC rebuild failed because the device header + * was in an unsupported format or corrupted. + */ + kstat_named_t arcstat_l2_rebuild_abort_unsupported; + /* + * Number of times the L2ARC rebuild failed because of IO errors + * while reading a log block. + */ + kstat_named_t arcstat_l2_rebuild_abort_io_errors; + /* + * Number of times the L2ARC rebuild failed because of IO errors when + * reading the device header. + */ + kstat_named_t arcstat_l2_rebuild_abort_dh_errors; + /* + * Number of L2ARC log blocks which failed to be restored due to + * checksum errors. + */ + kstat_named_t arcstat_l2_rebuild_abort_cksum_lb_errors; + /* + * Number of times the L2ARC rebuild was aborted due to low system + * memory. + */ + kstat_named_t arcstat_l2_rebuild_abort_lowmem; + /* Logical size of L2ARC restored data, in bytes. */ + kstat_named_t arcstat_l2_rebuild_size; + /* Aligned size of L2ARC restored data, in bytes. */ + kstat_named_t arcstat_l2_rebuild_asize; + /* + * Number of L2ARC log entries (buffers) that were successfully + * restored in ARC. + */ + kstat_named_t arcstat_l2_rebuild_bufs; + /* + * Number of L2ARC log entries (buffers) already cached in ARC. These + * were not restored again. + */ + kstat_named_t arcstat_l2_rebuild_bufs_precached; + /* + * Number of L2ARC log blocks that were restored successfully. Each + * log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers. + */ + kstat_named_t arcstat_l2_rebuild_log_blks; + kstat_named_t arcstat_memory_throttle_count; + /* Not updated directly; only synced in arc_kstat_update.
*/ + kstat_named_t arcstat_meta_used; + kstat_named_t arcstat_meta_limit; + kstat_named_t arcstat_meta_max; + kstat_named_t arcstat_meta_min; + kstat_named_t arcstat_async_upgrade_sync; + kstat_named_t arcstat_demand_hit_predictive_prefetch; + kstat_named_t arcstat_demand_hit_prescient_prefetch; +} arc_stats_t; + +#define ARCSTAT(stat) (arc_stats.stat.value.ui64) + +#define ARCSTAT_INCR(stat, val) \ + atomic_add_64(&arc_stats.stat.value.ui64, (val)) + +#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) +#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) + +/* + * There are several ARC variables that are critical to export as kstats -- + * but we don't want to have to grovel around in the kstat whenever we wish to + * manipulate them. For these variables, we therefore define them to be in + * terms of the statistic variable. This assures that we are not introducing + * the possibility of inconsistency by having shadow copies of the variables, + * while still allowing the code to be readable. + */ +#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ +#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ +#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ +#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ +#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ +#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ +#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ + +/* compressed size of entire arc */ +#define arc_compressed_size ARCSTAT(arcstat_compressed_size) +/* uncompressed size of entire arc */ +#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) +/* number of bytes in the arc from arc_buf_t's */ +#define arc_overhead_size ARCSTAT(arcstat_overhead_size) + +extern arc_stats_t arc_stats; + +/* used in zdb.c */ +boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, + const l2arc_log_blkptr_t *lbp); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ARC_IMPL_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h index 271232c61c..e543f6ac09 100644 --- a/usr/src/uts/common/fs/zfs/sys/dbuf.h +++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h @@ -108,6 +108,12 @@ typedef enum override_states { DR_OVERRIDDEN } override_states_t; +typedef enum db_lock_type { + DLT_NONE, + DLT_PARENT, + DLT_OBJSET +} db_lock_type_t; + typedef struct dbuf_dirty_record { /* link on our parents dirty list */ list_node_t dr_dirty_node; @@ -217,6 +223,22 @@ typedef struct dmu_buf_impl { */ uint8_t db_level; + /* + * Protects db_buf's contents if they contain an indirect block or data + * block of the meta-dnode. We use this lock to protect the structure of + * the block tree. This means that when modifying this dbuf's data, we + * grab its rwlock. When modifying its parent's data (including the + * blkptr to this dbuf), we grab the parent's rwlock. The lock ordering + * for this lock is: + * 1) dn_struct_rwlock + * 2) db_rwlock + * We don't currently grab multiple dbufs' db_rwlocks at once. 
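+ * + * (Editorial sketch, not part of the original change: assuming the + * dmu_buf_lock_parent()/dmu_buf_unlock_parent() helpers declared later + * in this header behave as their names suggest, a caller reading + * through this dbuf's blkptr would look roughly like: + * + * db_lock_type_t dlt = dmu_buf_lock_parent(db, RW_READER, FTAG); + * ... read through db->db_blkptr ... + * dmu_buf_unlock_parent(db, dlt, FTAG); + * + * with DLT_PARENT/DLT_OBJSET recording which lock was actually taken + * so that the matching unlock can be issued.)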
+ */ + krwlock_t db_rwlock; + + /* buffer holding our data */ + arc_buf_t *db_buf; + /* db_mtx protects the members below */ kmutex_t db_mtx; @@ -232,9 +254,6 @@ typedef struct dmu_buf_impl { */ zfs_refcount_t db_holds; - /* buffer holding our data */ - arc_buf_t *db_buf; - kcondvar_t db_changed; dbuf_dirty_record_t *db_data_pending; @@ -289,6 +308,8 @@ typedef struct dbuf_hash_table { kmutex_t hash_mutexes[DBUF_MUTEXES]; } dbuf_hash_table_t; +typedef void (*dbuf_prefetch_fn)(void *, boolean_t); + uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset); dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data); @@ -305,7 +326,10 @@ int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse, boolean_t fail_uncached, void *tag, dmu_buf_impl_t **dbp); -void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid, +int dbuf_prefetch_impl(struct dnode *dn, int64_t level, uint64_t blkid, + zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb, + void *arg); +int dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid, zio_priority_t prio, arc_flags_t aflags); void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); @@ -336,6 +360,8 @@ void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); void dbuf_unoverride(dbuf_dirty_record_t *dr); void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx); void dbuf_release_bp(dmu_buf_impl_t *db); +db_lock_type_t dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag); +void dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, void *tag); boolean_t dbuf_can_remap(const dmu_buf_impl_t *buf); diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h index d38914dd1d..be834895c8 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h @@ -963,7 +963,7 @@ void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); /* * Add entries to the nvlist for all the objset's properties. See - * zfs_prop_table[] and zfs(1m) for details on the properties. + * zfs_prop_table[] and zfs(8) for details on the properties. */ void dmu_objset_stats(objset_t *os, struct nvlist *nv); @@ -974,7 +974,7 @@ void dmu_objset_stats(objset_t *os, struct nvlist *nv); * availbytes is the amount of space available to this objset, taking * into account quotas & reservations, assuming that no other objsets * use the space first. These values correspond to the 'referenced' and - * 'available' properties, described in the zfs(1m) manpage. + * 'available' properties, described in the zfs(8) manpage. * * usedobjs and availobjs are the number of objects currently allocated, * and available. diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h index ccb5d7ac51..cc32359653 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h @@ -55,42 +55,42 @@ extern "C" { * XXX try to improve evicting path? 
* * dp_config_rwlock > os_obj_lock > dn_struct_rwlock > - * dn_dbufs_mtx > hash_mutexes > db_mtx > dd_lock > leafs + * dn_dbufs_mtx > hash_mutexes > db_mtx > dd_lock > leafs * * dp_config_rwlock * must be held before: everything * protects dd namespace changes * protects property changes globally * held from: - * dsl_dir_open/r: - * dsl_dir_create_sync/w: - * dsl_dir_sync_destroy/w: - * dsl_dir_rename_sync/w: - * dsl_prop_changed_notify/r: + * dsl_dir_open/r: + * dsl_dir_create_sync/w: + * dsl_dir_sync_destroy/w: + * dsl_dir_rename_sync/w: + * dsl_prop_changed_notify/r: * * os_obj_lock * must be held before: - * everything except dp_config_rwlock + * everything except dp_config_rwlock * protects os_obj_next * held from: - * dmu_object_alloc: dn_dbufs_mtx, db_mtx, hash_mutexes, dn_struct_rwlock + * dmu_object_alloc: dn_dbufs_mtx, db_mtx, hash_mutexes, dn_struct_rwlock * * dn_struct_rwlock * must be held before: - * everything except dp_config_rwlock and os_obj_lock + * everything except dp_config_rwlock and os_obj_lock * protects structure of dnode (eg. nlevels) - * db_blkptr can change when syncing out change to nlevels - * dn_maxblkid - * dn_nlevels - * dn_*blksz* - * phys nlevels, maxblkid, physical blkptr_t's (?) + * db_blkptr can change when syncing out change to nlevels + * dn_maxblkid + * dn_nlevels + * dn_*blksz* + * phys nlevels, maxblkid, physical blkptr_t's (?) * held from: - * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch - * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz) - * dbuf_read_impl: db_mtx, dmu_zfetch() - * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch() - * dbuf_new_size: db_mtx - * dbuf_dirty: db_mtx + * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch + * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz) + * dbuf_read_impl: db_mtx, dmu_zfetch() + * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch() + * dbuf_new_size: db_mtx + * dbuf_dirty: db_mtx * dbuf_findbp: (callers, phys? - the real need) * dbuf_create: dn_dbufs_mtx, hash_mutexes, db_mtx (phys?) * dbuf_prefetch: dn_dirty_mtx, hash_mutexes, db_mtx, dn_dbufs_mtx @@ -103,126 +103,127 @@ extern "C" { * * dn_dbufs_mtx * must be held before: - * db_mtx, hash_mutexes + * db_mtx, hash_mutexes * protects: - * dn_dbufs - * dn_evicted + * dn_dbufs + * dn_evicted * held from: - * dmu_evict_user: db_mtx (dn_dbufs) - * dbuf_free_range: db_mtx (dn_dbufs) - * dbuf_remove_ref: db_mtx, callees: - * dbuf_hash_remove: hash_mutexes, db_mtx - * dbuf_create: hash_mutexes, db_mtx (dn_dbufs) - * dnode_set_blksz: (dn_dbufs) + * dmu_evict_user: db_mtx (dn_dbufs) + * dbuf_free_range: db_mtx (dn_dbufs) + * dbuf_remove_ref: db_mtx, callees: + * dbuf_hash_remove: hash_mutexes, db_mtx + * dbuf_create: hash_mutexes, db_mtx (dn_dbufs) + * dnode_set_blksz: (dn_dbufs) * * hash_mutexes (global) * must be held before: - * db_mtx + * db_mtx * protects dbuf_hash_table (global) and db_hash_next * held from: - * dbuf_find: db_mtx - * dbuf_hash_insert: db_mtx - * dbuf_hash_remove: db_mtx + * dbuf_find: db_mtx + * dbuf_hash_insert: db_mtx + * dbuf_hash_remove: db_mtx * * db_mtx (meta-leaf) * must be held before: - * dn_mtx, dn_dirty_mtx, dd_lock (leaf mutexes) + * dn_mtx, dn_dirty_mtx, dd_lock (leaf mutexes) * protects: - * db_state - * db_holds - * db_buf - * db_changed - * db_data_pending - * db_dirtied - * db_link - * db_dirty_node (??) - * db_dirtycnt - * db_d.* - * db.* + * db_state + * db_holds + * db_buf + * db_changed + * db_data_pending + * db_dirtied + * db_link + * db_dirty_node (??) 
+ * db_dirtycnt + * db_d.* + * db.* * held from: - * dbuf_dirty: dn_mtx, dn_dirty_mtx - * dbuf_dirty->dsl_dir_willuse_space: dd_lock - * dbuf_dirty->dbuf_new_block->dsl_dataset_block_freeable: dd_lock - * dbuf_undirty: dn_dirty_mtx (db_d) - * dbuf_write_done: dn_dirty_mtx (db_state) - * dbuf_* - * dmu_buf_update_user: none (db_d) - * dmu_evict_user: none (db_d) (maybe can eliminate) - * dbuf_find: none (db_holds) - * dbuf_hash_insert: none (db_holds) - * dmu_buf_read_array_impl: none (db_state, db_changed) - * dmu_sync: none (db_dirty_node, db_d) - * dnode_reallocate: none (db) + * dbuf_dirty: dn_mtx, dn_dirty_mtx + * dbuf_dirty->dsl_dir_willuse_space: dd_lock + * dbuf_dirty->dbuf_new_block->dsl_dataset_block_freeable: dd_lock + * dbuf_undirty: dn_dirty_mtx (db_d) + * dbuf_write_done: dn_dirty_mtx (db_state) + * dbuf_* + * dmu_buf_update_user: none (db_d) + * dmu_evict_user: none (db_d) (maybe can eliminate) + * dbuf_find: none (db_holds) + * dbuf_hash_insert: none (db_holds) + * dmu_buf_read_array_impl: none (db_state, db_changed) + * dmu_sync: none (db_dirty_node, db_d) + * dnode_reallocate: none (db) * * dn_mtx (leaf) * protects: - * dn_dirty_dbufs - * dn_ranges - * phys accounting - * dn_allocated_txg - * dn_free_txg - * dn_assigned_txg - * dn_dirty_txg - * dd_assigned_tx - * dn_notxholds - * dn_dirtyctx - * dn_dirtyctx_firstset - * (dn_phys copy fields?) - * (dn_phys contents?) + * dn_dirty_dbufs + * dn_ranges + * phys accounting + * dn_allocated_txg + * dn_free_txg + * dn_assigned_txg + * dn_dirty_txg + * dd_assigned_tx + * dn_notxholds + * dn_nodnholds + * dn_dirtyctx + * dn_dirtyctx_firstset + * (dn_phys copy fields?) + * (dn_phys contents?) * held from: - * dnode_* - * dbuf_dirty: none - * dbuf_sync: none (phys accounting) - * dbuf_undirty: none (dn_ranges, dn_dirty_dbufs) - * dbuf_write_done: none (phys accounting) - * dmu_object_info_from_dnode: none (accounting) - * dmu_tx_commit: none - * dmu_tx_hold_object_impl: none - * dmu_tx_try_assign: dn_notxholds(cv) - * dmu_tx_unassign: none + * dnode_* + * dbuf_dirty: none + * dbuf_sync: none (phys accounting) + * dbuf_undirty: none (dn_ranges, dn_dirty_dbufs) + * dbuf_write_done: none (phys accounting) + * dmu_object_info_from_dnode: none (accounting) + * dmu_tx_commit: none + * dmu_tx_hold_object_impl: none + * dmu_tx_try_assign: dn_notxholds(cv) + * dmu_tx_unassign: none * * dd_lock * must be held before: - * ds_lock - * ancestors' dd_lock + * ds_lock + * ancestors' dd_lock * protects: - * dd_prop_cbs - * dd_sync_* - * dd_used_bytes - * dd_tempreserved - * dd_space_towrite - * dd_myname - * dd_phys accounting? + * dd_prop_cbs + * dd_sync_* + * dd_used_bytes + * dd_tempreserved + * dd_space_towrite + * dd_myname + * dd_phys accounting? 
* held from: - * dsl_dir_* - * dsl_prop_changed_notify: none (dd_prop_cbs) - * dsl_prop_register: none (dd_prop_cbs) - * dsl_prop_unregister: none (dd_prop_cbs) + * dsl_dir_* + * dsl_prop_changed_notify: none (dd_prop_cbs) + * dsl_prop_register: none (dd_prop_cbs) + * dsl_prop_unregister: none (dd_prop_cbs) * * os_lock (leaf) * protects: - * os_dirty_dnodes - * os_free_dnodes - * os_dnodes - * os_downgraded_dbufs - * dn_dirtyblksz - * dn_dirty_link + * os_dirty_dnodes + * os_free_dnodes + * os_dnodes + * os_downgraded_dbufs + * dn_dirtyblksz + * dn_dirty_link * held from: - * dnode_create: none (os_dnodes) - * dnode_destroy: none (os_dnodes) - * dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes) - * dnode_free: none (dn_dirtyblksz, os_*_dnodes) + * dnode_create: none (os_dnodes) + * dnode_destroy: none (os_dnodes) + * dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes) + * dnode_free: none (dn_dirtyblksz, os_*_dnodes) * * ds_lock * protects: - * ds_objset - * ds_open_refcount - * ds_snapname - * ds_phys accounting + * ds_objset + * ds_open_refcount + * ds_snapname + * ds_phys accounting * ds_phys userrefs zapobj * ds_reserved * held from: - * dsl_dataset_* + * dsl_dataset_* * * dr_mtx (leaf) * protects: diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h b/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h index 21a3ff3a20..71f76cc88b 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright (c) 2014, 2017 by Delphix. All rights reserved. */ #ifndef _DMU_ZFETCH_H @@ -40,6 +40,13 @@ extern uint64_t zfetch_array_rd_sz; struct dnode; /* so we can reference dnode */ +typedef struct zfetch { + krwlock_t zf_rwlock; /* protects zfetch structure */ + list_t zf_stream; /* list of zstream_t's */ + struct dnode *zf_dnode; /* dnode that owns this zfetch */ + int zf_numstreams; /* number of zstream_t's */ +} zfetch_t; + typedef struct zstream { uint64_t zs_blkid; /* expect next access at this blkid */ uint64_t zs_pf_blkid; /* next block to prefetch */ @@ -52,21 +59,19 @@ typedef struct zstream { kmutex_t zs_lock; /* protects stream */ hrtime_t zs_atime; /* time last prefetch issued */ + hrtime_t zs_start_time; /* start of last prefetch */ list_node_t zs_node; /* link for zf_stream */ + zfetch_t *zs_fetch; /* parent fetch */ + zfs_refcount_t zs_blocks; /* number of pending blocks in the stream */ } zstream_t; -typedef struct zfetch { - krwlock_t zf_rwlock; /* protects zfetch structure */ - list_t zf_stream; /* list of zstream_t's */ - struct dnode *zf_dnode; /* dnode that owns this zfetch */ -} zfetch_t; - void zfetch_init(void); void zfetch_fini(void); void dmu_zfetch_init(zfetch_t *, struct dnode *); void dmu_zfetch_fini(zfetch_t *); -void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t); +void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, + boolean_t); #ifdef __cplusplus diff --git a/usr/src/uts/common/fs/zfs/sys/dnode.h b/usr/src/uts/common/fs/zfs/sys/dnode.h index 054e467bb7..ca94e5f1c9 100644 --- a/usr/src/uts/common/fs/zfs/sys/dnode.h +++ b/usr/src/uts/common/fs/zfs/sys/dnode.h @@ -310,6 +310,7 @@ struct dnode { uint64_t dn_assigned_txg; uint64_t dn_dirty_txg; /* txg dnode was last dirtied */ kcondvar_t dn_notxholds; + kcondvar_t dn_nodnholds; enum dnode_dirtycontext dn_dirtyctx; uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */ diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h index 
189376eefc..0fd7e1a7e2 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h @@ -306,6 +306,7 @@ typedef struct dsl_dataset_snapshot_arg { /* flags for holding the dataset */ typedef enum ds_hold_flags { + DS_HOLD_FLAG_NONE = 0 << 0, DS_HOLD_FLAG_DECRYPT = 1 << 0 /* needs access to encrypted data */ } ds_hold_flags_t; diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_scan.h b/usr/src/uts/common/fs/zfs/sys/dsl_scan.h index 1b600405ae..4693293290 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_scan.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_scan.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. - * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. */ #ifndef _SYS_DSL_SCAN_H @@ -164,10 +164,12 @@ void dsl_scan_fini(struct dsl_pool *dp); void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *); int dsl_scan_cancel(struct dsl_pool *); int dsl_scan(struct dsl_pool *, pool_scan_func_t); +void dsl_scan_assess_vdev(struct dsl_pool *dp, vdev_t *vd); boolean_t dsl_scan_scrubbing(const struct dsl_pool *dp); int dsl_scrub_set_pause_resume(const struct dsl_pool *dp, pool_scrub_cmd_t cmd); -void dsl_resilver_restart(struct dsl_pool *, uint64_t txg); +void dsl_scan_restart_resilver(struct dsl_pool *, uint64_t txg); boolean_t dsl_scan_resilvering(struct dsl_pool *dp); +boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp); boolean_t dsl_dataset_unstable(struct dsl_dataset *ds); void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, ddt_entry_t *dde, dmu_tx_t *tx); diff --git a/usr/src/uts/common/fs/zfs/sys/simd.h b/usr/src/uts/common/fs/zfs/sys/simd.h new file mode 100644 index 0000000000..1ee17c902d --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/simd.h @@ -0,0 +1,184 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2020 Joyent, Inc. + */ + +#ifndef _SIMD_H +#define _SIMD_H + +#if defined(__amd64__) || defined(__i386__) + +#define kfpu_initialize(tsk) do {} while (0) +#define kfpu_init() (0) +#define kfpu_fini() do {} while (0) + +#ifdef _KERNEL +#include <sys/x86_archext.h> +#include <sys/archsystm.h> +#include <sys/kfpu.h> +#include <sys/proc.h> +#include <sys/disp.h> +#include <sys/cpuvar.h> + +static inline int +kfpu_allowed(void) +{ + extern int zfs_fpu_enabled; + + return (zfs_fpu_enabled != 0 ? 1 : 0); +} + +static inline void +kfpu_begin(void) +{ + if (curthread->t_lwp != NULL && (curthread->t_procp->p_flag & SSYS)) { + kernel_fpu_begin(NULL, KFPU_USE_LWP); + } else { + kpreempt_disable(); + kernel_fpu_begin(NULL, KFPU_NO_STATE); + } +} + +static inline void +kfpu_end(void) +{ + if (curthread->t_lwp != NULL && (curthread->t_procp->p_flag & SSYS)) { + kernel_fpu_end(NULL, KFPU_USE_LWP); + } else { + kernel_fpu_end(NULL, KFPU_NO_STATE); + kpreempt_enable(); + } +} + +/* + * Check if various vector instruction sets are available. 
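+ * + * (Editorial sketch, not part of the original change: a consumer such + * as the RAID-Z math code would be expected to pair these checks with + * the FPU guards above, e.g. + * + * if (kfpu_allowed() && zfs_avx2_available()) { + * kfpu_begin(); + * ... vectorized parity code ... + * kfpu_end(); + * } + * + * so that kernel FPU state is saved and restored around any SIMD use.)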
+ */ + +static inline boolean_t +zfs_sse_available(void) +{ + return (is_x86_feature(x86_featureset, X86FSET_SSE)); +} + +static inline boolean_t +zfs_sse2_available(void) +{ + return (is_x86_feature(x86_featureset, X86FSET_SSE2)); +} + +static inline boolean_t +zfs_sse3_available(void) +{ + return (is_x86_feature(x86_featureset, X86FSET_SSE3)); +} + +static inline boolean_t +zfs_ssse3_available(void) +{ + return (is_x86_feature(x86_featureset, X86FSET_SSSE3)); +} + +static inline boolean_t +zfs_avx_available(void) +{ + return (is_x86_feature(x86_featureset, X86FSET_AVX)); +} + +static inline boolean_t +zfs_avx2_available(void) +{ + return (is_x86_feature(x86_featureset, X86FSET_AVX2)); +} + +#else /* ! _KERNEL */ + +#include <sys/auxv.h> +#include <sys/auxv_386.h> + +#define kfpu_allowed() 1 +#define kfpu_begin() do {} while (0) +#define kfpu_end() do {} while (0) + +/* + * User-level check if various vector instruction sets are available. + */ + +static inline boolean_t +zfs_sse_available(void) +{ + uint32_t u = 0; + + (void) getisax(&u, 1); + return ((u & AV_386_SSE) != 0); +} + +static inline boolean_t +zfs_sse2_available(void) +{ + uint32_t u = 0; + + (void) getisax(&u, 1); + return ((u & AV_386_SSE2) != 0); +} + +static inline boolean_t +zfs_sse3_available(void) +{ + uint32_t u = 0; + + (void) getisax(&u, 1); + return ((u & AV_386_SSE3) != 0); +} + +static inline boolean_t +zfs_ssse3_available(void) +{ + uint32_t u = 0; + + (void) getisax(&u, 1); + return ((u & AV_386_SSSE3) != 0); +} + +static inline boolean_t +zfs_avx_available(void) +{ + uint_t u = 0; + + (void) getisax(&u, 1); + return ((u & AV_386_AVX) != 0); +} + +static inline boolean_t +zfs_avx2_available(void) +{ + uint32_t u[2] = { 0 }; + + (void) getisax((uint32_t *)&u, 2); + return ((u[1] & AV_386_2_AVX2) != 0); +} + +#endif /* _KERNEL */ + + +#else + +/* Non-x86 CPUs currently always disallow kernel FPU support */ +#define kfpu_allowed() 0 +#define kfpu_initialize(tsk) do {} while (0) +#define kfpu_begin() do {} while (0) +#define kfpu_end() do {} while (0) +#define kfpu_init() (0) +#define kfpu_fini() do {} while (0) +#endif + +#endif /* _SIMD_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index e017462613..af8057be8f 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -26,8 +26,9 @@ * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2019 Joyent, Inc. - * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright 2020 Joshua M. 
Clulow <josh@sysmgr.org> */ #ifndef _SYS_SPA_H @@ -759,7 +760,8 @@ extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, size_t buflen); extern int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *zplprops, struct dsl_crypto_params *dcp); -extern int spa_import_rootpool(char *devpath, char *devid); +extern int spa_import_rootpool(char *devpath, char *devid, uint64_t pool_guid, + uint64_t vdev_guid); extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); @@ -773,6 +775,7 @@ extern void spa_async_request(spa_t *spa, int flag); extern void spa_async_unrequest(spa_t *spa, int flag); extern void spa_async_suspend(spa_t *spa); extern void spa_async_resume(spa_t *spa); +extern int spa_async_tasks(spa_t *spa); extern spa_t *spa_inject_addref(char *pool); extern void spa_inject_delref(spa_t *spa); extern void spa_scan_stat_init(spa_t *spa); @@ -789,6 +792,7 @@ extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); #define SPA_ASYNC_INITIALIZE_RESTART 0x100 #define SPA_ASYNC_TRIM_RESTART 0x200 #define SPA_ASYNC_AUTOTRIM_RESTART 0x400 +#define SPA_ASYNC_L2CACHE_REBUILD 0x800 /* * Controls the behavior of spa_vdev_remove(). diff --git a/usr/src/uts/common/fs/zfs/sys/spa_boot.h b/usr/src/uts/common/fs/zfs/sys/spa_boot.h index 8df5072a55..b1b100e17e 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_boot.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_boot.h @@ -25,6 +25,7 @@ /* * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org> */ #ifndef _SYS_SPA_BOOT_H @@ -36,8 +37,8 @@ extern "C" { #endif -extern char *spa_get_bootprop(char *prop); -extern void spa_free_bootprop(char *prop); +extern char *spa_get_bootprop(const char *propname); +extern void spa_free_bootprop(char *propval); extern void spa_arch_init(void); diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h index 88a172eed5..45a78717da 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h @@ -25,7 +25,6 @@ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2017 Datto Inc. - * Copyright 2019 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright 2019 Joyent, Inc. */ diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h index a6de7e6f2c..b839ed2359 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h @@ -21,8 +21,9 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2019, Datto Inc. All rights reserved. 
 */
#ifndef _SYS_VDEV_H
@@ -153,6 +154,8 @@ extern void vdev_state_dirty(vdev_t *vd);
extern void vdev_state_clean(vdev_t *vd);
extern void vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd);
+extern void vdev_defer_resilver(vdev_t *vd);
+extern boolean_t vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx);
typedef enum vdev_config_flag {
VDEV_CONFIG_SPARE = 1 << 0,
@@ -177,6 +180,8 @@ extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **);
extern void vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv);
extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf,
uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags);
+extern int vdev_label_read_bootenv(vdev_t *, nvlist_t *);
+extern int vdev_label_write_bootenv(vdev_t *, nvlist_t *);
typedef enum {
VDEV_LABEL_CREATE, /* create/add a new device */
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 774ed92db5..4e42247345 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -20,10 +20,10 @@
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2017, Intel Corporation.
 * Copyright 2019 Joyent, Inc.
+ * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org>
 */
#ifndef _SYS_VDEV_IMPL_H
@@ -411,7 +411,7 @@ struct vdev {
#define VDEV_RAIDZ_MAXPARITY 3
#define VDEV_PAD_SIZE (8 << 10)
-/* 2 padding areas (vl_pad1 and vl_pad2) to skip */
+/* 2 padding areas (vl_pad1 and vl_be) to skip */
#define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2
#define VDEV_PHYS_SIZE (112 << 10)
#define VDEV_UBERBLOCK_RING (128 << 10)
@@ -438,9 +438,38 @@ typedef struct vdev_phys {
zio_eck_t vp_zbt;
} vdev_phys_t;
+typedef enum vbe_vers {
+ /*
+ * The bootenv file is stored as ASCII text in the envblock.
+ * It is used by the GRUB bootloader on Linux to store the
+ * contents of the grubenv file. The file is stored as raw ASCII,
+ * and is protected by an embedded checksum. By default, GRUB will
+ * check if the boot filesystem supports storing the environment data
+ * in a special location, and if so, will invoke filesystem specific
+ * logic to retrieve it. This can be overridden by a variable, should
+ * the user so desire.
+ */
+ VB_RAW = 0,
+
+ /*
+ * The bootenv file is converted to an nvlist and then packed into the
+ * envblock.
+ */ + VB_NVLIST = 1 +} vbe_vers_t; + +typedef struct vdev_boot_envblock { + uint64_t vbe_version; + char vbe_bootenv[VDEV_PAD_SIZE - sizeof (uint64_t) - + sizeof (zio_eck_t)]; + zio_eck_t vbe_zbt; +} vdev_boot_envblock_t; + +CTASSERT(sizeof (vdev_boot_envblock_t) == VDEV_PAD_SIZE); + typedef struct vdev_label { char vl_pad1[VDEV_PAD_SIZE]; /* 8K */ - char vl_pad2[VDEV_PAD_SIZE]; /* 8K */ + vdev_boot_envblock_t vl_be; /* 8K */ vdev_phys_t vl_vdev_phys; /* 112K */ char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ } vdev_label_t; /* 256K total */ @@ -556,6 +585,14 @@ typedef struct vdev_buf { zio_t *vb_io; /* pointer back to the original zio_t */ } vdev_buf_t; +/* + * Support routines used during boot from a ZFS pool + */ +extern int vdev_disk_read_rootlabel(const char *, const char *, nvlist_t **); +extern void vdev_disk_preroot_init(void); +extern void vdev_disk_preroot_fini(void); +extern const char *vdev_disk_preroot_lookup(uint64_t, uint64_t); + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h b/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h new file mode 100644 index 0000000000..bf5c840139 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h @@ -0,0 +1,65 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2016 Gvozden Neskovic <neskovic@compeng.uni-frankfurt.de>. + * Copyright 2020 Joyent, Inc. + */ + +#ifndef _SYS_VDEV_RAIDZ_H +#define _SYS_VDEV_RAIDZ_H + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct zio; +struct raidz_map; +#if !defined(_KERNEL) +struct kernel_param {}; +#endif + +/* + * vdev_raidz interface + */ +struct raidz_map * vdev_raidz_map_alloc(struct zio *, uint64_t, + uint64_t, uint64_t); +void vdev_raidz_map_free(struct raidz_map *); +void vdev_raidz_generate_parity(struct raidz_map *); +int vdev_raidz_reconstruct(struct raidz_map *, const int *, int); + +/* + * vdev_raidz_math interface + */ +void vdev_raidz_math_init(void); +void vdev_raidz_math_fini(void); +const struct raidz_impl_ops *vdev_raidz_math_get_ops(void); +int vdev_raidz_math_generate(struct raidz_map *); +int vdev_raidz_math_reconstruct(struct raidz_map *, const int *, + const int *, const int); +int vdev_raidz_impl_set(const char *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_RAIDZ_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_raidz_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_raidz_impl.h new file mode 100644 index 0000000000..18771534bf --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/vdev_raidz_impl.h @@ -0,0 +1,360 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
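The comments in vdev_impl.h above pin down the on-disk label geometry: two 8K pad/boot areas, a 112K vdev_phys, and a 128K uberblock ring, for a 256K label. A compile-time restatement of that budget, mirroring the CTASSERT already added for vdev_boot_envblock_t (the EXAMPLE_ names are local restatements, not part of the patch):

    #include <sys/debug.h>

    /* Restate the vdev_label_t budget from the constants quoted above. */
    #define EXAMPLE_PAD_SIZE   (8 << 10)    /* vl_pad1 and vl_be */
    #define EXAMPLE_PHYS_SIZE  (112 << 10)  /* vl_vdev_phys */
    #define EXAMPLE_UB_RING    (128 << 10)  /* vl_uberblock */

    CTASSERT(2 * EXAMPLE_PAD_SIZE + EXAMPLE_PHYS_SIZE + EXAMPLE_UB_RING ==
        (256 << 10));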
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#ifndef _VDEV_RAIDZ_H
+#define _VDEV_RAIDZ_H
+
+#include <sys/types.h>
+#include <sys/debug.h>
+#include <sys/kstat.h>
+#include <sys/abd.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define CODE_P (0U)
+#define CODE_Q (1U)
+#define CODE_R (2U)
+
+#define PARITY_P (1U)
+#define PARITY_PQ (2U)
+#define PARITY_PQR (3U)
+
+#define TARGET_X (0U)
+#define TARGET_Y (1U)
+#define TARGET_Z (2U)
+
+/*
+ * Parity generation method indexes
+ */
+enum raidz_math_gen_op {
+ RAIDZ_GEN_P = 0,
+ RAIDZ_GEN_PQ,
+ RAIDZ_GEN_PQR,
+ RAIDZ_GEN_NUM = 3
+};
+/*
+ * Data reconstruction method indexes
+ */
+enum raidz_rec_op {
+ RAIDZ_REC_P = 0,
+ RAIDZ_REC_Q,
+ RAIDZ_REC_R,
+ RAIDZ_REC_PQ,
+ RAIDZ_REC_PR,
+ RAIDZ_REC_QR,
+ RAIDZ_REC_PQR,
+ RAIDZ_REC_NUM = 7
+};
+
+extern const char *raidz_gen_name[RAIDZ_GEN_NUM];
+extern const char *raidz_rec_name[RAIDZ_REC_NUM];
+
+/*
+ * Methods used to define raidz implementation
+ *
+ * @raidz_gen_f Parity generation function
+ * @par1 pointer to raidz_map
+ * @raidz_rec_f Data reconstruction function
+ * @par1 pointer to raidz_map
+ * @par2 array of reconstruction targets
+ * @will_work_f Function returns TRUE if impl. is supported on the system
+ * @init_impl_f Function is called once on init
+ * @fini_impl_f Function is called once on fini
+ */
+typedef void (*raidz_gen_f)(void *);
+typedef int (*raidz_rec_f)(void *, const int *);
+typedef boolean_t (*will_work_f)(void);
+typedef void (*init_impl_f)(void);
+typedef void (*fini_impl_f)(void);
+
+#define RAIDZ_IMPL_NAME_MAX (20)
+
+typedef struct raidz_impl_ops {
+ init_impl_f init;
+ fini_impl_f fini;
+ raidz_gen_f gen[RAIDZ_GEN_NUM]; /* Parity generate functions */
+ raidz_rec_f rec[RAIDZ_REC_NUM]; /* Data reconstruction functions */
+ will_work_f is_supported; /* Support check function */
+ char name[RAIDZ_IMPL_NAME_MAX]; /* Name of the implementation */
+} raidz_impl_ops_t;
+
+typedef struct raidz_col {
+ size_t rc_devidx; /* child device index for I/O */
+ size_t rc_offset; /* device offset */
+ size_t rc_size; /* I/O size */
+ abd_t *rc_abd; /* I/O data */
+ void *rc_gdata; /* used to store the "good" version */
+ int rc_error; /* I/O error for this device */
+ unsigned int rc_tried; /* Did we attempt this I/O column? */
+ unsigned int rc_skipped; /* Did we skip this I/O column? */
+} raidz_col_t;
+
+typedef struct raidz_map {
+ size_t rm_cols; /* Regular column count */
+ size_t rm_scols; /* Count including skipped columns */
+ size_t rm_bigcols; /* Number of oversized columns */
+ size_t rm_asize; /* Actual total I/O size */
+ size_t rm_missingdata; /* Count of missing data devices */
+ size_t rm_missingparity; /* Count of missing parity devices */
+ size_t rm_firstdatacol; /* First data column/parity count */
+ size_t rm_nskip; /* Skipped sectors for padding */
+ size_t rm_skipstart; /* Column index of padding start */
+ void *rm_abd_copy; /* rm_asize-buffer of copied data */
+ size_t rm_reports; /* # of referencing checksum reports */
+ unsigned int rm_freed; /* map no longer has referencing ZIO */
+ unsigned int rm_ecksuminjected; /* checksum error was injected */
+ const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */
+ raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
+} raidz_map_t;
+
+#define RAIDZ_ORIGINAL_IMPL (INT_MAX)
+
+extern const raidz_impl_ops_t vdev_raidz_scalar_impl;
+#if defined(__x86)
+extern const raidz_impl_ops_t vdev_raidz_sse2_impl;
+extern const raidz_impl_ops_t vdev_raidz_ssse3_impl;
+extern const raidz_impl_ops_t vdev_raidz_avx2_impl;
+#endif
+
+/*
+ * Commonly used raidz_map helpers
+ *
+ * raidz_parity Returns parity of the RAIDZ block
+ * raidz_ncols Returns number of columns the block spans
+ * raidz_nbigcols Returns number of big columns
+ * raidz_col_p Returns pointer to a column
+ * raidz_col_size Returns size of a column
+ * raidz_big_size Returns size of big columns
+ * raidz_short_size Returns size of short columns
+ */
+#define raidz_parity(rm) ((rm)->rm_firstdatacol)
+#define raidz_ncols(rm) ((rm)->rm_cols)
+#define raidz_nbigcols(rm) ((rm)->rm_bigcols)
+#define raidz_col_p(rm, c) ((rm)->rm_col + (c))
+#define raidz_col_size(rm, c) ((rm)->rm_col[c].rc_size)
+#define raidz_big_size(rm) (raidz_col_size(rm, CODE_P))
+#define raidz_short_size(rm) (raidz_col_size(rm, raidz_ncols(rm)-1))
+
+/*
+ * Macro defines a RAIDZ parity generation method
+ *
+ * @code parity the function produces
+ * @impl name of the implementation
+ */
+#define _RAIDZ_GEN_WRAP(code, impl) \
+static void \
+impl ## _gen_ ## code(void *rmp) \
+{ \
+ raidz_map_t *rm = (raidz_map_t *) rmp; \
+ raidz_generate_## code ## _impl(rm); \
+}
+
+/*
+ * Macro defines a RAIDZ data reconstruction method
+ *
+ * @code parity the function produces
+ * @impl name of the implementation
+ */
+#define _RAIDZ_REC_WRAP(code, impl) \
+static int \
+impl ## _rec_ ## code(void *rmp, const int *tgtidx) \
+{ \
+ raidz_map_t *rm = (raidz_map_t *) rmp; \
+ return (raidz_reconstruct_## code ## _impl(rm, tgtidx)); \
+}
+
+/*
+ * Define all gen methods for an implementation
+ *
+ * @impl name of the implementation
+ */
+#define DEFINE_GEN_METHODS(impl) \
+ _RAIDZ_GEN_WRAP(p, impl); \
+ _RAIDZ_GEN_WRAP(pq, impl); \
+ _RAIDZ_GEN_WRAP(pqr, impl)
+
+/*
+ * Define all rec functions for an implementation
+ *
+ * @impl name of the implementation
+ */
+#define DEFINE_REC_METHODS(impl) \
+ _RAIDZ_REC_WRAP(p, impl); \
+ _RAIDZ_REC_WRAP(q, impl); \
+ _RAIDZ_REC_WRAP(r, impl); \
+ _RAIDZ_REC_WRAP(pq, impl); \
+ _RAIDZ_REC_WRAP(pr, impl); \
+ _RAIDZ_REC_WRAP(qr, impl); \
+ _RAIDZ_REC_WRAP(pqr, impl)
+
+#define RAIDZ_GEN_METHODS(impl) \
+{ \
+ [RAIDZ_GEN_P] = & impl ## _gen_p, \
+ [RAIDZ_GEN_PQ] = & impl ## _gen_pq, \
+ [RAIDZ_GEN_PQR] = & impl ## _gen_pqr \
+}
+
+#define RAIDZ_REC_METHODS(impl) \
+{ \
+ [RAIDZ_REC_P] = & impl ## _rec_p, \
+ [RAIDZ_REC_Q] = & impl ## _rec_q, \
+ [RAIDZ_REC_R] = & impl ## _rec_r, \
+ [RAIDZ_REC_PQ] = & impl ## _rec_pq, \
+ [RAIDZ_REC_PR] = & impl ## _rec_pr, \
+ [RAIDZ_REC_QR] = & impl ## _rec_qr, \
+ [RAIDZ_REC_PQR] = & impl ## _rec_pqr \
+}
+
+
+typedef struct raidz_impl_kstat {
+ uint64_t gen[RAIDZ_GEN_NUM]; /* gen method speed B/s */
+ uint64_t rec[RAIDZ_REC_NUM]; /* rec method speed B/s */
+} raidz_impl_kstat_t;
+
+/*
+ * Enumerate various multiplication constants
+ * used in reconstruction methods
+ */
+typedef enum raidz_mul_info {
+ /* Reconstruct Q */
+ MUL_Q_X = 0,
+ /* Reconstruct R */
+ MUL_R_X = 0,
+ /* Reconstruct PQ */
+ MUL_PQ_X = 0,
+ MUL_PQ_Y = 1,
+ /* Reconstruct PR */
+ MUL_PR_X = 0,
+ MUL_PR_Y = 1,
+ /* Reconstruct QR */
+ MUL_QR_XQ = 0,
+ MUL_QR_X = 1,
+ MUL_QR_YQ = 2,
+ MUL_QR_Y = 3,
+ /* Reconstruct PQR */
+ MUL_PQR_XP = 0,
+ MUL_PQR_XQ = 1,
+ MUL_PQR_XR = 2,
+ MUL_PQR_YU = 3,
+ MUL_PQR_YP = 4,
+ MUL_PQR_YQ = 5,
+
+ MUL_CNT = 6
+} raidz_mul_info_t;
+
+/*
+ * Powers of 2 in the Galois field.
+ */
+extern const uint8_t vdev_raidz_pow2[256] __attribute__((aligned(256)));
+/* Logs of 2 in the Galois field defined above. */
+extern const uint8_t vdev_raidz_log2[256] __attribute__((aligned(256)));
+
+/*
+ * Multiply a given number by 2 raised to the given power.
+ */
+static inline uint8_t
+vdev_raidz_exp2(const uint8_t a, const unsigned exp)
+{
+ if (a == 0)
+ return (0);
+
+ return (vdev_raidz_pow2[(exp + (unsigned) vdev_raidz_log2[a]) % 255]);
+}
+
+/*
+ * Galois Field operations.
+ *
+ * gf_exp2 - computes 2 raised to the given power
+ * gf_exp4 - computes 4 raised to the given power
+ * gf_mul - multiplication
+ * gf_div - division
+ * gf_inv - multiplicative inverse
+ */
+typedef unsigned gf_t;
+typedef unsigned gf_log_t;
+
+static inline gf_t
+gf_mul(const gf_t a, const gf_t b)
+{
+ gf_log_t logsum;
+
+ if (a == 0 || b == 0)
+ return (0);
+
+ logsum = (gf_log_t) vdev_raidz_log2[a] + (gf_log_t) vdev_raidz_log2[b];
+
+ return ((gf_t) vdev_raidz_pow2[logsum % 255]);
+}
+
+static inline gf_t
+gf_div(const gf_t a, const gf_t b)
+{
+ gf_log_t logsum;
+
+ ASSERT3U(b, >, 0);
+ if (a == 0)
+ return (0);
+
+ logsum = (gf_log_t) 255 + (gf_log_t) vdev_raidz_log2[a] -
+ (gf_log_t) vdev_raidz_log2[b];
+
+ return ((gf_t) vdev_raidz_pow2[logsum % 255]);
+}
+
+static inline gf_t
+gf_inv(const gf_t a)
+{
+ gf_log_t logsum;
+
+ ASSERT3U(a, >, 0);
+
+ logsum = (gf_log_t) 255 - (gf_log_t) vdev_raidz_log2[a];
+
+ return ((gf_t) vdev_raidz_pow2[logsum]);
+}
+
+static inline gf_t
+gf_exp2(gf_log_t exp)
+{
+ return (vdev_raidz_pow2[exp % 255]);
+}
+
+static inline gf_t
+gf_exp4(gf_log_t exp)
+{
+ ASSERT3U(exp, <=, 255);
+ return ((gf_t) vdev_raidz_pow2[(2 * exp) % 255]);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VDEV_RAIDZ_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_bootenv.h b/usr/src/uts/common/fs/zfs/sys/zfs_bootenv.h
new file mode 100644
index 0000000000..703a1c8fa6
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_bootenv.h
@@ -0,0 +1,52 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
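The wrapper macros above expect each implementation file to provide file-local workers with the fixed names raidz_generate_<code>_impl() and raidz_reconstruct_<code>_impl(), and then to stitch the generated trampolines into a raidz_impl_ops_t. A hypothetical do-nothing implementation (the "noop" names are invented for illustration and are not part of the patch) would be wired up roughly like this:

    /* Hypothetical vdev_raidz_noop.c, an illustration only. */
    #include <sys/vdev_raidz_impl.h>

    /*
     * Workers carry the fixed names the _RAIDZ_GEN_WRAP expansion calls;
     * a real implementation computes P/PQ/PQR parity here.
     */
    static void raidz_generate_p_impl(raidz_map_t *rm) { (void) rm; }
    static void raidz_generate_pq_impl(raidz_map_t *rm) { (void) rm; }
    static void raidz_generate_pqr_impl(raidz_map_t *rm) { (void) rm; }

    /* Expands to noop_gen_p(), noop_gen_pq() and noop_gen_pqr(). */
    DEFINE_GEN_METHODS(noop);

    static boolean_t
    noop_will_work(void)
    {
        return (B_TRUE);
    }

    const raidz_impl_ops_t vdev_raidz_noop_impl = {
        .gen = RAIDZ_GEN_METHODS(noop),
        /* A full implementation also supplies .rec via RAIDZ_REC_METHODS. */
        .is_supported = noop_will_work,
        .name = "noop",
    };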
+ */
+
+/*
+ * Copyright 2020 Toomas Soome <tsoome@me.com>
+ */
+
+#ifndef _ZFS_BOOTENV_H
+#define _ZFS_BOOTENV_H
+
+/*
+ * Define macros for label bootenv nvlist pair keys.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BOOTENV_VERSION "version"
+
+#define BE_ILLUMOS_VENDOR "illumos"
+#define BE_FREEBSD_VENDOR "freebsd"
+#define BE_GRUB_VENDOR "grub"
+
+#define BOOTENV_OS BE_ILLUMOS_VENDOR
+
+#define GRUB_ENVMAP BE_GRUB_VENDOR ":" "envmap"
+
+#define FREEBSD_BOOTONCE BE_FREEBSD_VENDOR ":" "bootonce"
+#define FREEBSD_BOOTONCE_USED BE_FREEBSD_VENDOR ":" "bootonce-used"
+#define ILLUMOS_BOOTONCE BE_ILLUMOS_VENDOR ":" "bootonce"
+#define ILLUMOS_BOOTONCE_USED BE_ILLUMOS_VENDOR ":" "bootonce-used"
+#define FREEBSD_NVSTORE BE_FREEBSD_VENDOR ":" "nvstore"
+#define ILLUMOS_NVSTORE BE_ILLUMOS_VENDOR ":" "nvstore"
+
+#define OS_BOOTONCE BOOTENV_OS ":" "bootonce"
+#define OS_BOOTONCE_USED BOOTENV_OS ":" "bootonce-used"
+#define OS_NVSTORE BOOTENV_OS ":" "nvstore"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFS_BOOTENV_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
index 9947bedf54..5058d48e74 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -24,6 +24,7 @@
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright 2016 RackTop Systems.
 * Copyright (c) 2014 Integros [integros.com]
+ * Copyright (c) 2017, Intel Corporation.
 */
#ifndef _SYS_ZFS_IOCTL_H
@@ -389,6 +390,10 @@ typedef struct zinject_record {
#define ZI_NO_DVA (-1)
+/* scaled frequency ranges */
+#define ZI_PERCENTAGE_MIN 4294UL
+#define ZI_PERCENTAGE_MAX UINT32_MAX
+
typedef enum zinject_type {
ZINJECT_UNINITIALIZED,
ZINJECT_DATA_FAULT,
@@ -450,7 +455,7 @@ typedef struct zfs_cmd {
uint64_t zc_history_len;
uint64_t zc_history_offset;
uint64_t zc_obj;
- uint64_t zc_iflags; /* internal to zfs(7fs) */
+ uint64_t zc_iflags; /* internal to zfs(4FS) */
zfs_share_t zc_share;
dmu_objset_stats_t zc_objset_stats;
dmu_replay_record_t zc_begin_record;
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 01e892f4c4..8e155979e6 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -27,6 +27,7 @@
 * Copyright 2016 Toomas Soome <tsoome@me.com>
 * Copyright 2019 Joyent, Inc.
 * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2019, Datto Inc. All rights reserved.
 */
#include <sys/zfs_context.h>
@@ -98,6 +99,12 @@ boolean_t vdev_validate_skip = B_FALSE;
int zfs_vdev_dtl_sm_blksz = (1 << 12);
/*
+ * Ignore errors during scrub/resilver. This allows working around a
+ * resilver upon import when there are pool errors.
+ */
+int zfs_scan_ignore_errors = 0;
+
+/*
 * vdev-wide space maps that have lots of entries written to them at
 * the end of each transaction can benefit from a higher I/O bandwidth
 * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
@@ -196,7 +203,7 @@ vdev_getops(const char *type)
/*
 * Derive the enumerated alloction bias from string input.
- * String origin is either the per-vdev zap or zpool(1M).
+ * String origin is either the per-vdev zap or zpool(8).
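The zfs_bootenv.h macros above build vendor-prefixed nvlist keys by string pasting, so GRUB_ENVMAP is "grub:envmap" and OS_BOOTONCE is "illumos:bootonce". As a sketch of how a producer could assemble a VB_RAW-style bootenv nvlist with these keys (the helper name is hypothetical; the fnvlist interfaces are the usual libnvpair ones):

    #include <sys/nvpair.h>
    #include <sys/zfs_bootenv.h>

    static nvlist_t *
    example_grub_bootenv(const char *envmap_text)
    {
        nvlist_t *benv = fnvlist_alloc();

        /* Version 0 is VB_RAW, per the vbe_vers enum in vdev_impl.h. */
        fnvlist_add_uint64(benv, BOOTENV_VERSION, 0);
        /* The raw grubenv payload travels under the "grub:envmap" key. */
        fnvlist_add_string(benv, GRUB_ENVMAP, envmap_text);
        return (benv);
    }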
 */
static vdev_alloc_bias_t
vdev_derive_alloc_bias(const char *bias)
@@ -772,7 +779,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
&vd->vdev_resilver_txg);
if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER))
- vdev_set_deferred_resilver(spa, vd);
+ vdev_defer_resilver(vd);
/*
 * When importing a pool, we want to ignore the persistent fault
@@ -1358,7 +1365,7 @@ vdev_probe_done(zio_t *zio)
} else {
ASSERT(zio->io_error != 0);
vdev_dbgmsg(vd, "failed probe");
- zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
spa, vd, NULL, NULL, 0, 0);
zio->io_error = SET_ERROR(ENXIO);
}
@@ -1462,7 +1469,7 @@ vdev_probe(vdev_t *vd, zio_t *zio)
for (int l = 1; l < VDEV_LABELS; l++) {
zio_nowait(zio_read_phys(pio, vd,
vdev_label_offset(vd->vdev_psize, l,
- offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE,
+ offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE,
abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
@@ -1710,7 +1717,8 @@ vdev_open(vdev_t *vd)
 */
if (ashift > vd->vdev_top->vdev_ashift &&
vd->vdev_ops->vdev_op_leaf) {
- zfs_ereport_post(FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
+ (void) zfs_ereport_post(
+ FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
spa, vd, NULL, NULL, 0, 0);
}
@@ -1764,18 +1772,12 @@ vdev_open(vdev_t *vd)
}
/*
- * If a leaf vdev has a DTL, and seems healthy, then kick off a
- * resilver. But don't do this if we are doing a reopen for a scrub,
- * since this would just restart the scrub we are already doing.
+ * If this is a leaf vdev, assess whether a resilver is needed.
+ * But don't do this if we are doing a reopen for a scrub, since
+ * this would just restart the scrub we are already doing.
 */
- if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
- vdev_resilver_needed(vd, NULL, NULL)) {
- if (dsl_scan_resilvering(spa->spa_dsl_pool) &&
- spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
- vdev_set_deferred_resilver(spa, vd);
- else
- spa_async_request(spa, SPA_ASYNC_RESILVER);
- }
+ if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen)
+ dsl_scan_assess_vdev(spa->spa_dsl_pool, vd);
return (0);
}
@@ -2134,7 +2136,7 @@ vdev_hold(vdev_t *vd)
for (int c = 0; c < vd->vdev_children; c++)
vdev_hold(vd->vdev_child[c]);
- if (vd->vdev_ops->vdev_op_leaf)
+ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_hold != NULL)
vd->vdev_ops->vdev_op_hold(vd);
}
@@ -2147,7 +2149,7 @@ vdev_rele(vdev_t *vd)
for (int c = 0; c < vd->vdev_children; c++)
vdev_rele(vd->vdev_child[c]);
- if (vd->vdev_ops->vdev_op_leaf)
+ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_rele != NULL)
vd->vdev_ops->vdev_op_rele(vd);
}
@@ -2177,9 +2179,22 @@ vdev_reopen(vdev_t *vd)
if (vd->vdev_aux) {
(void) vdev_validate_aux(vd);
if (vdev_readable(vd) && vdev_writeable(vd) &&
- vd->vdev_aux == &spa->spa_l2cache &&
- !l2arc_vdev_present(vd))
- l2arc_add_vdev(spa, vd);
+ vd->vdev_aux == &spa->spa_l2cache) {
+ /*
+ * When reopening we can assume the device label already
+ * has the attribute l2cache_persistent, since we've
+ * opened the device in the past and updated the label.
+ * In case the vdev is present, we should evict all ARC
+ * buffers and pointers to log blocks and reclaim their
+ * space before restoring its contents to L2ARC.
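zfs_scan_ignore_errors, introduced near the top of vdev.c above, is a plain module global with no dedicated setter; on illumos such tunables are conventionally set from /etc/system in the module:symbol form, taking effect at the next boot:

    set zfs:zfs_scan_ignore_errors = 1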
+ */ + if (l2arc_vdev_present(vd)) { + l2arc_rebuild_vdev(vd, B_TRUE); + } else { + l2arc_add_vdev(spa, vd); + } + spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); + } } else { (void) vdev_validate(vd); } @@ -2470,7 +2485,6 @@ vdev_dtl_should_excise(vdev_t *vd) spa_t *spa = vd->vdev_spa; dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; - ASSERT0(scn->scn_phys.scn_errors); ASSERT0(vd->vdev_children); if (vd->vdev_state < VDEV_STATE_DEGRADED) @@ -2520,10 +2534,29 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + boolean_t wasempty = B_TRUE; mutex_enter(&vd->vdev_dtl_lock); /* + * If requested, pretend the scan completed cleanly. + */ + if (zfs_scan_ignore_errors && scn) + scn->scn_phys.scn_errors = 0; + + if (scrub_txg != 0 && + !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { + wasempty = B_FALSE; + zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d " + "dtl:%llu/%llu errors:%llu", + (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg, + (u_longlong_t)scrub_txg, spa->spa_scrub_started, + (u_longlong_t)vdev_dtl_min(vd), + (u_longlong_t)vdev_dtl_max(vd), + (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0)); + } + + /* * If we've completed a scan cleanly then determine * if this vdev should remove any DTLs. We only want to * excise regions on vdevs that were available during @@ -2559,6 +2592,14 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) space_reftree_generate_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_destroy(&reftree); + + if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { + zfs_dbgmsg("update DTL_MISSING:%llu/%llu", + (u_longlong_t)vdev_dtl_min(vd), + (u_longlong_t)vdev_dtl_max(vd)); + } else if (!wasempty) { + zfs_dbgmsg("DTL_MISSING is now empty"); + } } range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); range_tree_walk(vd->vdev_dtl[DTL_MISSING], @@ -3543,14 +3584,11 @@ vdev_clear(spa_t *spa, vdev_t *vd) if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); - if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) { - if (dsl_scan_resilvering(spa->spa_dsl_pool) && - spa_feature_is_enabled(spa, - SPA_FEATURE_RESILVER_DEFER)) - vdev_set_deferred_resilver(spa, vd); - else - spa_async_request(spa, SPA_ASYNC_RESILVER); - } + /* If a resilver isn't required, check if vdevs can be culled */ + if (vd->vdev_aux == NULL && !vdev_is_dead(vd) && + !dsl_scan_resilvering(spa->spa_dsl_pool) && + !dsl_scan_resilver_scheduled(spa->spa_dsl_pool)) + spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); } @@ -3749,6 +3787,8 @@ void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { vdev_t *tvd = vd->vdev_top; + spa_t *spa = vd->vdev_spa; + mutex_enter(&vd->vdev_stat_lock); if (vs) { bcopy(&vd->vdev_stat, vs, sizeof (*vs)); @@ -3790,8 +3830,8 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) */ if (vd->vdev_aux == NULL && tvd != NULL) { vs->vs_esize = P2ALIGN( - vd->vdev_max_asize - vd->vdev_asize, - 1ULL << tvd->vdev_ms_shift); + vd->vdev_max_asize - vd->vdev_asize - + spa->spa_bootsize, 1ULL << tvd->vdev_ms_shift); } if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { @@ -4384,7 +4424,7 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } - zfs_ereport_post(class, spa, vd, NULL, NULL, + (void) zfs_ereport_post(class, spa, vd, NULL, 
NULL, save_state, 0); } @@ -4414,7 +4454,6 @@ vdev_children_are_offline(vdev_t *vd) /* * Check the vdev configuration to ensure that it's capable of supporting * a root pool. We do not support partial configuration. - * In addition, only a single top-level vdev is allowed. */ boolean_t vdev_is_bootable(vdev_t *vd) @@ -4422,23 +4461,7 @@ vdev_is_bootable(vdev_t *vd) if (!vd->vdev_ops->vdev_op_leaf) { char *vdev_type = vd->vdev_ops->vdev_op_type; - if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && - vd->vdev_children > 1) { - int non_indirect = 0; - - for (int c = 0; c < vd->vdev_children; c++) { - vdev_type = - vd->vdev_child[c]->vdev_ops->vdev_op_type; - if (strcmp(vdev_type, VDEV_TYPE_INDIRECT) != 0) - non_indirect++; - } - /* - * non_indirect > 1 means we have more than one - * top-level vdev, so we stop here. - */ - if (non_indirect > 1) - return (B_FALSE); - } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { + if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { return (B_FALSE); } } @@ -4559,18 +4582,46 @@ vdev_deadman(vdev_t *vd) } void -vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd) +vdev_defer_resilver(vdev_t *vd) { - for (uint64_t i = 0; i < vd->vdev_children; i++) - vdev_set_deferred_resilver(spa, vd->vdev_child[i]); + ASSERT(vd->vdev_ops->vdev_op_leaf); - if (!vd->vdev_ops->vdev_op_leaf || !vdev_writeable(vd) || - range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { - return; + vd->vdev_resilver_deferred = B_TRUE; + vd->vdev_spa->spa_resilver_deferred = B_TRUE; +} + +/* + * Clears the resilver deferred flag on all leaf devs under vd. Returns + * B_TRUE if we have devices that need to be resilvered and are available to + * accept resilver I/Os. + */ +boolean_t +vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx) +{ + boolean_t resilver_needed = B_FALSE; + spa_t *spa = vd->vdev_spa; + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + resilver_needed |= vdev_clear_resilver_deferred(cvd, tx); } - vd->vdev_resilver_deferred = B_TRUE; - spa->spa_resilver_deferred = B_TRUE; + if (vd == spa->spa_root_vdev && + spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) { + spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx); + vdev_config_dirty(vd); + spa->spa_resilver_deferred = B_FALSE; + return (resilver_needed); + } + + if (!vdev_is_concrete(vd) || vd->vdev_aux || + !vd->vdev_ops->vdev_op_leaf) + return (resilver_needed); + + vd->vdev_resilver_deferred = B_FALSE; + + return (!vdev_is_dead(vd) && !vd->vdev_offline && + vdev_resilver_needed(vd, NULL, NULL)); } /* diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index 9408ec68fb..4be567d551 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -23,6 +23,7 @@ * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright 2020 Joyent, Inc. + * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org> */ #include <sys/zfs_context.h> @@ -365,7 +366,6 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, error = EINVAL; /* presume failure */ if (vd->vdev_path != NULL) { - if (vd->vdev_wholedisk == -1ULL) { size_t len = strlen(vd->vdev_path) + 3; char *buf = kmem_alloc(len, KM_SLEEP); @@ -480,6 +480,28 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, } } + /* + * If this is early in boot, a sweep of available block devices may + * locate an alternative path that we can try. 
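vdev_defer_resilver() and vdev_clear_resilver_deferred() above split the old vdev_set_deferred_resilver() into a leaf-only marker and a root-level sweep intended for syncing context. A hedged sketch of the expected caller shape (the wrapper name is invented here; spa_async_request() and SPA_ASYNC_RESILVER appear elsewhere in this code base):

    #include <sys/spa.h>
    #include <sys/spa_impl.h>
    #include <sys/vdev.h>

    /* Sketch of a syncing-context consumer of the deferred-resilver state. */
    static void
    example_finish_deferred_resilver(spa_t *spa, dmu_tx_t *tx)
    {
        /*
         * Sweep from the root vdev: the return value reports whether some
         * healthy leaf still has a dirty DTL and can accept resilver I/O.
         */
        if (vdev_clear_resilver_deferred(spa->spa_root_vdev, tx))
            spa_async_request(spa, SPA_ASYNC_RESILVER);
    }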
+ */ + if (error != 0) { + const char *altdevpath = vdev_disk_preroot_lookup( + spa_guid(spa), vd->vdev_guid); + + if (altdevpath != NULL) { + vdev_dbgmsg(vd, "Trying alternate preroot path (%s)", + altdevpath); + + validate_devid = B_TRUE; + + if ((error = ldi_open_by_name((char *)altdevpath, + spa_mode(spa), kcred, &dvd->vd_lh, zfs_li)) != 0) { + vdev_dbgmsg(vd, "Failed to open by preroot " + "path (%s)", altdevpath); + } + } + } + if (error != 0) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; vdev_dbgmsg(vd, "vdev_disk_open: failed to open [error=%d]", @@ -1063,7 +1085,8 @@ vdev_ops_t vdev_disk_ops = { * the device, and construct a configuration nvlist. */ int -vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) +vdev_disk_read_rootlabel(const char *devpath, const char *devid, + nvlist_t **config) { ldi_handle_t vd_lh; vdev_label_t *label; @@ -1076,7 +1099,7 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) /* * Read the device label and build the nvlist. */ - if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid, + if (devid != NULL && ddi_devid_str_decode((char *)devid, &tmpdevid, &minor_name) == 0) { error = ldi_open_by_devid(tmpdevid, minor_name, FREAD, kcred, &vd_lh, zfs_li); @@ -1084,9 +1107,10 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) ddi_devid_str_free(minor_name); } - if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh, - zfs_li))) + if (error != 0 && (error = ldi_open_by_name((char *)devpath, FREAD, + kcred, &vd_lh, zfs_li)) != 0) { return (error); + } if (ldi_get_size(vd_lh, &s)) { (void) ldi_close(vd_lh, FREAD, kcred); @@ -1136,3 +1160,150 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) return (error); } + +struct veb { + list_t veb_ents; + boolean_t veb_scanned; +}; + +struct veb_ent { + uint64_t vebe_pool_guid; + uint64_t vebe_vdev_guid; + + char *vebe_devpath; + + list_node_t vebe_link; +}; + +static kmutex_t veb_lock; +static struct veb *veb; + +static int +vdev_disk_preroot_scan_walk(const char *devpath, void *arg) +{ + int r; + nvlist_t *cfg = NULL; + uint64_t pguid = 0, vguid = 0; + + /* + * Attempt to read the label from this block device. + */ + if ((r = vdev_disk_read_rootlabel(devpath, NULL, &cfg)) != 0) { + /* + * Many of the available block devices will represent slices or + * partitions of disks, or may represent disks that are not at + * all initialised with ZFS. As this is a best effort + * mechanism to locate an alternate path to a particular vdev, + * we will ignore any failures and keep scanning. + */ + return (PREROOT_WALK_BLOCK_DEVICES_NEXT); + } + + /* + * Determine the pool and vdev GUID read from the label for this + * device. Both values must be present and have a non-zero value. + */ + if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pguid) != 0 || + nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_GUID, &vguid) != 0 || + pguid == 0 || vguid == 0) { + /* + * This label was not complete. + */ + goto out; + } + + /* + * Keep track of all of the GUID-to-devpath mappings we find so that + * vdev_disk_preroot_lookup() can search them. 
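preroot_walk_block_devices() drives the callback above once per candidate block device. A second, hypothetical walker in the same style, which merely counts devices carrying a readable ZFS label, shows the minimal contract (header locations for the PREROOT_WALK_BLOCK_DEVICES_NEXT constant are assumed):

    #include <sys/nvpair.h>
    #include <sys/vdev_impl.h>

    /* Hypothetical walker: count devices with a readable ZFS label. */
    static int
    example_count_labels_cb(const char *devpath, void *arg)
    {
        uint_t *count = arg;
        nvlist_t *cfg = NULL;

        /* A failure just means "not ours"; keep walking either way. */
        if (vdev_disk_read_rootlabel(devpath, NULL, &cfg) == 0) {
            (*count)++;
            nvlist_free(cfg);
        }
        return (PREROOT_WALK_BLOCK_DEVICES_NEXT);
    }

    /* Usage: uint_t n = 0; preroot_walk_block_devices(example_count_labels_cb, &n); */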
+ */ + struct veb_ent *vebe = kmem_zalloc(sizeof (*vebe), KM_SLEEP); + vebe->vebe_pool_guid = pguid; + vebe->vebe_vdev_guid = vguid; + vebe->vebe_devpath = spa_strdup(devpath); + + list_insert_tail(&veb->veb_ents, vebe); + +out: + nvlist_free(cfg); + return (PREROOT_WALK_BLOCK_DEVICES_NEXT); +} + +const char * +vdev_disk_preroot_lookup(uint64_t pool_guid, uint64_t vdev_guid) +{ + if (pool_guid == 0 || vdev_guid == 0) { + /* + * If we aren't provided both a pool and a vdev GUID, we cannot + * perform a lookup. + */ + return (NULL); + } + + mutex_enter(&veb_lock); + if (veb == NULL) { + /* + * If vdev_disk_preroot_fini() has been called already, there + * is nothing we can do. + */ + mutex_exit(&veb_lock); + return (NULL); + } + + /* + * We want to perform at most one scan of all block devices per boot. + */ + if (!veb->veb_scanned) { + cmn_err(CE_NOTE, "Performing full ZFS device scan!"); + + preroot_walk_block_devices(vdev_disk_preroot_scan_walk, NULL); + + veb->veb_scanned = B_TRUE; + } + + const char *path = NULL; + for (struct veb_ent *vebe = list_head(&veb->veb_ents); vebe != NULL; + vebe = list_next(&veb->veb_ents, vebe)) { + if (vebe->vebe_pool_guid == pool_guid && + vebe->vebe_vdev_guid == vdev_guid) { + path = vebe->vebe_devpath; + break; + } + } + + mutex_exit(&veb_lock); + + return (path); +} + +void +vdev_disk_preroot_init(void) +{ + mutex_init(&veb_lock, NULL, MUTEX_DEFAULT, NULL); + + VERIFY3P(veb, ==, NULL); + veb = kmem_zalloc(sizeof (*veb), KM_SLEEP); + list_create(&veb->veb_ents, sizeof (struct veb_ent), + offsetof(struct veb_ent, vebe_link)); + veb->veb_scanned = B_FALSE; +} + +void +vdev_disk_preroot_fini(void) +{ + mutex_enter(&veb_lock); + + if (veb != NULL) { + while (!list_is_empty(&veb->veb_ents)) { + struct veb_ent *vebe = list_remove_head(&veb->veb_ents); + + spa_strfree(vebe->vebe_devpath); + + kmem_free(vebe, sizeof (*vebe)); + } + + kmem_free(veb, sizeof (*veb)); + veb = NULL; + } + + mutex_exit(&veb_lock); +} diff --git a/usr/src/uts/common/fs/zfs/vdev_indirect.c b/usr/src/uts/common/fs/zfs/vdev_indirect.c index effea61bc6..6c636dd4d2 100644 --- a/usr/src/uts/common/fs/zfs/vdev_indirect.c +++ b/usr/src/uts/common/fs/zfs/vdev_indirect.c @@ -1382,8 +1382,8 @@ vdev_indirect_checksum_error(zio_t *zio, void *bad_buf = abd_borrow_buf_copy(ic->ic_data, is->is_size); abd_t *good_abd = is->is_good_child->ic_data; void *good_buf = abd_borrow_buf_copy(good_abd, is->is_size); - zfs_ereport_post_checksum(zio->io_spa, vd, &zio->io_bookmark, zio, - is->is_target_offset, is->is_size, good_buf, bad_buf, &zbc); + (void) zfs_ereport_post_checksum(zio->io_spa, vd, &zio->io_bookmark, + zio, is->is_target_offset, is->is_size, good_buf, bad_buf, &zbc); abd_return_buf(ic->ic_data, bad_buf, is->is_size); abd_return_buf(good_abd, good_buf, is->is_size); } @@ -1459,7 +1459,7 @@ vdev_indirect_all_checksum_errors(zio_t *zio) vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); - zfs_ereport_post_checksum(zio->io_spa, vd, + (void) zfs_ereport_post_checksum(zio->io_spa, vd, &zio->io_bookmark, zio, is->is_target_offset, is->is_size, NULL, NULL, NULL); } diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c index 6235b06f17..b683c3694b 100644 --- a/usr/src/uts/common/fs/zfs/vdev_label.c +++ b/usr/src/uts/common/fs/zfs/vdev_label.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. 
All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright 2020 Joyent, Inc. */ @@ -150,6 +150,8 @@ #include <sys/dsl_scan.h> #include <sys/abd.h> #include <sys/fs/zfs.h> +#include <sys/byteorder.h> +#include <sys/zfs_bootenv.h> /* * Basic routines to read and write from a vdev label. @@ -940,7 +942,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) nvlist_t *label; vdev_phys_t *vp; abd_t *vp_abd; - abd_t *pad2; + abd_t *bootenv; uberblock_t *ub; abd_t *ub_abd; zio_t *zio; @@ -1101,8 +1103,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) ub->ub_txg = 0; /* Initialize the 2nd padding area. */ - pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); - abd_zero(pad2, VDEV_PAD_SIZE); + bootenv = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); + abd_zero(bootenv, VDEV_PAD_SIZE); /* * Write everything in parallel. @@ -1121,8 +1123,8 @@ retry: * Zero out the 2nd padding area where it might have * left over data from previous filesystem format. */ - vdev_label_write(zio, vd, l, pad2, - offsetof(vdev_label_t, vl_pad2), + vdev_label_write(zio, vd, l, bootenv, + offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, NULL, NULL, flags); vdev_label_write(zio, vd, l, ub_abd, @@ -1138,7 +1140,7 @@ retry: } nvlist_free(label); - abd_free(pad2); + abd_free(bootenv); abd_free(ub_abd); abd_free(vp_abd); @@ -1162,6 +1164,212 @@ retry: } /* + * Done callback for vdev_label_read_bootenv_impl. If this is the first + * callback to finish, store our abd in the callback pointer. Otherwise, we + * just free our abd and return. + */ +static void +vdev_label_read_bootenv_done(zio_t *zio) +{ + zio_t *rio = zio->io_private; + abd_t **cbp = rio->io_private; + + ASSERT3U(zio->io_size, ==, VDEV_PAD_SIZE); + + if (zio->io_error == 0) { + mutex_enter(&rio->io_lock); + if (*cbp == NULL) { + /* Will free this buffer in vdev_label_read_bootenv. */ + *cbp = zio->io_abd; + } else { + abd_free(zio->io_abd); + } + mutex_exit(&rio->io_lock); + } else { + abd_free(zio->io_abd); + } +} + +static void +vdev_label_read_bootenv_impl(zio_t *zio, vdev_t *vd, int flags) +{ + for (int c = 0; c < vd->vdev_children; c++) + vdev_label_read_bootenv_impl(zio, vd->vdev_child[c], flags); + + /* + * We just use the first label that has a correct checksum; the + * bootloader should have rewritten them all to be the same on boot, + * and any changes we made since boot have been the same across all + * labels. + */ + if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + for (int l = 0; l < VDEV_LABELS; l++) { + vdev_label_read(zio, vd, l, + abd_alloc_linear(VDEV_PAD_SIZE, B_FALSE), + offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, + vdev_label_read_bootenv_done, zio, flags); + } + } +} + +int +vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *bootenv) +{ + nvlist_t *config; + spa_t *spa = rvd->vdev_spa; + abd_t *abd = NULL; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; + + ASSERT(bootenv); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + zio_t *zio = zio_root(spa, NULL, &abd, flags); + vdev_label_read_bootenv_impl(zio, rvd, flags); + int err = zio_wait(zio); + + if (abd != NULL) { + char *buf; + vdev_boot_envblock_t *vbe = abd_to_buf(abd); + + vbe->vbe_version = ntohll(vbe->vbe_version); + switch (vbe->vbe_version) { + case VB_RAW: + /* + * if we have textual data in vbe_bootenv, create nvlist + * with key "envmap". 
+ */ + fnvlist_add_uint64(bootenv, BOOTENV_VERSION, VB_RAW); + vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0'; + fnvlist_add_string(bootenv, GRUB_ENVMAP, + vbe->vbe_bootenv); + break; + + case VB_NVLIST: + err = nvlist_unpack(vbe->vbe_bootenv, + sizeof (vbe->vbe_bootenv), &config, 0); + if (err == 0) { + fnvlist_merge(bootenv, config); + nvlist_free(config); + break; + } + /* FALLTHROUGH */ + default: + /* Check for FreeBSD zfs bootonce command string */ + buf = abd_to_buf(abd); + if (*buf == '\0') { + fnvlist_add_uint64(bootenv, BOOTENV_VERSION, + VB_NVLIST); + break; + } + fnvlist_add_string(bootenv, FREEBSD_BOOTONCE, buf); + } + + /* + * abd was allocated in vdev_label_read_bootenv_impl() + */ + abd_free(abd); + /* + * If we managed to read any successfully, + * return success. + */ + return (0); + } + return (err); +} + +int +vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env) +{ + zio_t *zio; + spa_t *spa = vd->vdev_spa; + vdev_boot_envblock_t *bootenv; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + int error; + size_t nvsize; + char *nvbuf; + + error = nvlist_size(env, &nvsize, NV_ENCODE_XDR); + if (error != 0) + return (SET_ERROR(error)); + + if (nvsize >= sizeof (bootenv->vbe_bootenv)) { + return (SET_ERROR(E2BIG)); + } + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + error = ENXIO; + for (int c = 0; c < vd->vdev_children; c++) { + int child_err; + + child_err = vdev_label_write_bootenv(vd->vdev_child[c], env); + /* + * As long as any of the disks managed to write all of their + * labels successfully, return success. + */ + if (child_err == 0) + error = child_err; + } + + if (!vd->vdev_ops->vdev_op_leaf || vdev_is_dead(vd) || + !vdev_writeable(vd)) { + return (error); + } + ASSERT3U(sizeof (*bootenv), ==, VDEV_PAD_SIZE); + abd_t *abd = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); + abd_zero(abd, VDEV_PAD_SIZE); + + bootenv = abd_borrow_buf_copy(abd, VDEV_PAD_SIZE); + nvbuf = bootenv->vbe_bootenv; + nvsize = sizeof (bootenv->vbe_bootenv); + + bootenv->vbe_version = fnvlist_lookup_uint64(env, BOOTENV_VERSION); + switch (bootenv->vbe_version) { + case VB_RAW: + if (nvlist_lookup_string(env, GRUB_ENVMAP, &nvbuf) == 0) { + (void) strlcpy(bootenv->vbe_bootenv, nvbuf, nvsize); + } + error = 0; + break; + + case VB_NVLIST: + error = nvlist_pack(env, &nvbuf, &nvsize, NV_ENCODE_XDR, + KM_SLEEP); + break; + + default: + error = EINVAL; + break; + } + + if (error == 0) { + bootenv->vbe_version = htonll(bootenv->vbe_version); + abd_return_buf_copy(abd, bootenv, VDEV_PAD_SIZE); + } else { + abd_free(abd); + return (SET_ERROR(error)); + } + +retry: + zio = zio_root(spa, NULL, NULL, flags); + for (int l = 0; l < VDEV_LABELS; l++) { + vdev_label_write(zio, vd, l, abd, + offsetof(vdev_label_t, vl_be), + VDEV_PAD_SIZE, NULL, NULL, flags); + } + + error = zio_wait(zio); + if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { + flags |= ZIO_FLAG_TRYHARD; + goto retry; + } + + abd_free(abd); + return (error); +} + +/* * ========================================================================== * uberblock load/sync * ========================================================================== diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c index 10772d5265..524ba25cb2 100644 --- a/usr/src/uts/common/fs/zfs/vdev_raidz.c +++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. 
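vdev_label_write_bootenv() above returns E2BIG when the packed nvlist cannot fit in the envblock payload, whose size is fixed by the structure layout in vdev_impl.h. A caller could pre-check the fit; a sketch under the same headers (the helper name is illustrative):

    #include <sys/nvpair.h>
    #include <sys/vdev_impl.h>

    static boolean_t
    example_bootenv_fits(nvlist_t *env)
    {
        size_t nvsize;
        /* Payload bytes: 8K minus the version word and the checksum tail. */
        const size_t limit = VDEV_PAD_SIZE - sizeof (uint64_t) -
            sizeof (zio_eck_t);

        return ((nvlist_size(env, &nvsize, NV_ENCODE_XDR) == 0 &&
            nvsize < limit) ? B_TRUE : B_FALSE);
    }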
+ * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 * Copyright (c) 2014 Integros [integros.com]
 */
@@ -35,6 +36,8 @@
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
+#include <sys/vdev_raidz.h>
+#include <sys/vdev_raidz_impl.h>
#ifdef ZFS_DEBUG
#include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */
#endif
@@ -98,7 +101,7 @@
 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
- * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
+ * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
@@ -107,34 +110,6 @@
 * or in concert to recover missing data columns.
 */
-typedef struct raidz_col {
- uint64_t rc_devidx; /* child device index for I/O */
- uint64_t rc_offset; /* device offset */
- uint64_t rc_size; /* I/O size */
- abd_t *rc_abd; /* I/O data */
- void *rc_gdata; /* used to store the "good" version */
- int rc_error; /* I/O error for this device */
- uint8_t rc_tried; /* Did we attempt this I/O column? */
- uint8_t rc_skipped; /* Did we skip this I/O column? */
-} raidz_col_t;
-
-typedef struct raidz_map {
- uint64_t rm_cols; /* Regular column count */
- uint64_t rm_scols; /* Count including skipped columns */
- uint64_t rm_bigcols; /* Number of oversized columns */
- uint64_t rm_asize; /* Actual total I/O size */
- uint64_t rm_missingdata; /* Count of missing data devices */
- uint64_t rm_missingparity; /* Count of missing parity devices */
- uint64_t rm_firstdatacol; /* First data column/parity count */
- uint64_t rm_nskip; /* Skipped sectors for padding */
- uint64_t rm_skipstart; /* Column index of padding start */
- abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */
- uintptr_t rm_reports; /* # of referencing checksum reports */
- uint8_t rm_freed; /* map no longer has referencing ZIO */
- uint8_t rm_ecksuminjected; /* checksum error was injected */
- raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
-} raidz_map_t;
-
#define VDEV_RAIDZ_P 0
#define VDEV_RAIDZ_Q 1
#define VDEV_RAIDZ_R 2
@@ -153,7 +128,7 @@ typedef struct raidz_map {
(mask) = (x) & 0x8080808080808080ULL; \
(mask) = ((mask) << 1) - ((mask) >> 7); \
(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
- ((mask) & 0x1d1d1d1d1d1d1d1d); \
+ ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}
#define VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
VDEV_RAIDZ_64MUL_2((x), mask); \
VDEV_RAIDZ_64MUL_2((x), mask); \
}
-#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE)
-
-/*
- * Force reconstruction to use the general purpose method.
- */
-int vdev_raidz_default_to_general;
-
-/* Powers of 2 in the Galois field defined above.
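VDEV_RAIDZ_64MUL_2 above multiplies eight GF(2^8) bytes by 2 in a single 64-bit operation: the mask isolates the bytes whose top bit is set, so the 0x1d reduction polynomial is folded in only where the shift overflowed. A standalone self-check against the byte-at-a-time definition (the macro body is copied from the hunk above; everything else is illustrative):

    #include <stdint.h>
    #include <assert.h>

    #define VDEV_RAIDZ_64MUL_2(x, mask) \
    { \
        (mask) = (x) & 0x8080808080808080ULL; \
        (mask) = ((mask) << 1) - ((mask) >> 7); \
        (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
            ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
    }

    /* Scalar GF(2^8) multiply-by-2 with the same 0x1d polynomial. */
    static uint8_t
    gf_mul2(uint8_t b)
    {
        return ((uint8_t)((b << 1) ^ ((b & 0x80) ? 0x1d : 0)));
    }

    int
    main(void)
    {
        for (unsigned v = 0; v < 256; v++) {
            uint64_t mask, x = v * 0x0101010101010101ULL;

            VDEV_RAIDZ_64MUL_2(x, mask);
            for (int i = 0; i < 8; i++)
                assert(((x >> (8 * i)) & 0xff) == gf_mul2((uint8_t)v));
        }
        return (0);
    }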
*/ -static const uint8_t vdev_raidz_pow2[256] = { - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, - 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, - 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, - 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, - 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, - 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, - 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, - 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, - 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, - 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, - 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, - 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, - 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, - 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, - 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, - 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, - 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, - 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, - 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, - 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, - 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, - 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, - 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, - 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, - 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, - 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, - 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, - 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, - 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, - 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, - 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 -}; -/* Logs of 2 in the Galois field defined above. */ -static const uint8_t vdev_raidz_log2[256] = { - 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, - 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, - 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, - 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, - 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, - 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, - 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, - 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, - 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, - 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, - 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, - 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, - 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, - 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, - 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, - 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, - 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, - 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, - 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, - 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, - 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, - 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, - 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, - 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, - 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, - 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, - 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, - 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, - 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, - 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, - 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, - 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, -}; - -static void vdev_raidz_generate_parity(raidz_map_t *rm); - -/* - * Multiply a given number by 2 raised to the given power. 
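The tables being removed here live on as the vdev_raidz_pow2/vdev_raidz_log2 externs in the new sys/vdev_raidz_impl.h, together with the gf_mul()/gf_div()/gf_inv() inlines built on them. A sketch of the usual field identities those helpers satisfy (assuming that header compiles in context and the table definitions are linked in):

    #include <sys/vdev_raidz_impl.h>

    static void
    example_gf_selfcheck(void)
    {
        for (gf_t a = 1; a < 256; a++) {
            /* a * a^-1 == 1, and division undoes multiplication. */
            VERIFY3U(gf_mul(a, gf_inv(a)), ==, 1);
            for (gf_t b = 1; b < 256; b++)
                VERIFY3U(gf_div(gf_mul(a, b), b), ==, a);
        }
    }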
- */ -static uint8_t -vdev_raidz_exp2(uint_t a, int exp) -{ - if (a == 0) - return (0); - - ASSERT(exp >= 0); - ASSERT(vdev_raidz_log2[a] > 0 || a == 1); - - exp += vdev_raidz_log2[a]; - if (exp > 255) - exp -= 255; - - return (vdev_raidz_pow2[exp]); -} - -static void +void vdev_raidz_map_free(raidz_map_t *rm) { int c; @@ -271,7 +147,6 @@ vdev_raidz_map_free(raidz_map_t *rm) if (rm->rm_col[c].rc_gdata != NULL) abd_free(rm->rm_col[c].rc_gdata); - } for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) @@ -311,7 +186,7 @@ static void vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) { raidz_map_t *rm = zcr->zcr_cbdata; - size_t c = zcr->zcr_cbinfo; + const size_t c = zcr->zcr_cbinfo; size_t x, offset; const abd_t *good = NULL; @@ -459,19 +334,19 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = { * Divides the IO evenly across all child vdevs; usually, dcols is * the number of children in the target vdev. */ -static raidz_map_t * -vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, - uint64_t unit_shift, uint64_t dcols, uint64_t nparity) +raidz_map_t * +vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, + uint64_t nparity) { raidz_map_t *rm; /* The starting RAIDZ (parent) vdev sector of the block. */ - uint64_t b = offset >> unit_shift; + uint64_t b = zio->io_offset >> ashift; /* The zio's size in units of the vdev's minimum sector size. */ - uint64_t s = size >> unit_shift; + uint64_t s = zio->io_size >> ashift; /* The first column for this stripe. */ uint64_t f = b % dcols; /* The starting byte offset on each child vdev. */ - uint64_t o = (b / dcols) << unit_shift; + uint64_t o = (b / dcols) << ashift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; uint64_t off = 0; @@ -530,7 +405,7 @@ vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, coff = o; if (col >= dcols) { col -= dcols; - coff += 1ULL << unit_shift; + coff += 1ULL << ashift; } rm->rm_col[c].rc_devidx = col; rm->rm_col[c].rc_offset = coff; @@ -543,29 +418,29 @@ vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, if (c >= acols) rm->rm_col[c].rc_size = 0; else if (c < bc) - rm->rm_col[c].rc_size = (q + 1) << unit_shift; + rm->rm_col[c].rc_size = (q + 1) << ashift; else - rm->rm_col[c].rc_size = q << unit_shift; + rm->rm_col[c].rc_size = q << ashift; asize += rm->rm_col[c].rc_size; } - ASSERT3U(asize, ==, tot << unit_shift); - rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); + ASSERT3U(asize, ==, tot << ashift); + rm->rm_asize = roundup(asize, (nparity + 1) << ashift); rm->rm_nskip = roundup(tot, nparity + 1) - tot; - ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); + ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << ashift); ASSERT3U(rm->rm_nskip, <=, nparity); for (c = 0; c < rm->rm_firstdatacol; c++) rm->rm_col[c].rc_abd = abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE); - rm->rm_col[c].rc_abd = abd_get_offset_size(abd, 0, + rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0, rm->rm_col[c].rc_size); off = rm->rm_col[c].rc_size; for (c = c + 1; c < acols; c++) { - rm->rm_col[c].rc_abd = abd_get_offset_size(abd, off, + rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, off, rm->rm_col[c].rc_size); off += rm->rm_col[c].rc_size; } @@ -573,7 +448,7 @@ vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, /* * If all data stored spans all columns, there's a danger that parity * will always be on the same device and, since parity isn't read - * during normal operation, that that device's I/O 
bandwidth won't be + * during normal operation, that device's I/O bandwidth won't be * used effectively. We therefore switch the parity every 1MB. * * ... at least that was, ostensibly, the theory. As a practical @@ -593,7 +468,7 @@ vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, ASSERT(rm->rm_cols >= 2); ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); - if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { + if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { devidx = rm->rm_col[0].rc_devidx; o = rm->rm_col[0].rc_offset; rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; @@ -605,6 +480,9 @@ vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, rm->rm_skipstart = 1; } + /* init RAIDZ parity ops */ + rm->rm_ops = vdev_raidz_math_get_ops(); + return (rm); } @@ -681,7 +559,6 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm) p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); if (c == rm->rm_firstdatacol) { - ASSERT3U(src->abd_size, >=, rm->rm_col[c].rc_size); abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size); } else { struct pqr_struct pqr = { p, NULL, NULL }; @@ -793,9 +670,13 @@ vdev_raidz_generate_parity_pqr(raidz_map_t *rm) * Generate RAID parity in the first virtual columns according to the number of * parity columns available. */ -static void +void vdev_raidz_generate_parity(raidz_map_t *rm) { + /* Generate using the new math implementation */ + if (vdev_raidz_math_generate(rm) != RAIDZ_ORIGINAL_IMPL) + return; + switch (rm->rm_firstdatacol) { case 1: vdev_raidz_generate_parity_p(rm); @@ -873,8 +754,8 @@ vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) int cnt = size / sizeof (dst[0]); for (int i = 0; i < cnt; i++, dst++, rq->q++) { - *dst ^= *rq->q; + *dst ^= *rq->q; int j; uint8_t *b; for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { @@ -1159,9 +1040,12 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) * ~~ ~~ * __ __ * | 1 1 1 1 1 1 1 1 | + * | 128 64 32 16 8 4 2 1 | * | 19 205 116 29 64 16 4 1 | * | 1 0 0 0 0 0 0 0 | - * (V|I)' = | 0 0 0 1 0 0 0 0 | + * | 0 1 0 0 0 0 0 0 | + * (V|I)' = | 0 0 1 0 0 0 0 0 | + * | 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 | @@ -1385,8 +1269,8 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, int i, j, x, cc, c; uint8_t *src; uint64_t ccount; - uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; - uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; + uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; + uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; uint8_t log = 0; uint8_t val; int ll; @@ -1595,12 +1479,12 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) return (code); } -static int -vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) +int +vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) { int tgts[VDEV_RAIDZ_MAXPARITY], *dt; int ntgts; - int i, c; + int i, c, ret; int code; int nbadparity, nbaddata; int parity_valid[VDEV_RAIDZ_MAXPARITY]; @@ -1638,34 +1522,37 @@ vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) dt = &tgts[nbadparity]; + /* Reconstruct using the new math implementation */ + ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata); + if (ret != RAIDZ_ORIGINAL_IMPL) + return (ret); + /* * See if we can use any of our optimized reconstruction routines. 
*/ - if (!vdev_raidz_default_to_general) { - switch (nbaddata) { - case 1: - if (parity_valid[VDEV_RAIDZ_P]) - return (vdev_raidz_reconstruct_p(rm, dt, 1)); + switch (nbaddata) { + case 1: + if (parity_valid[VDEV_RAIDZ_P]) + return (vdev_raidz_reconstruct_p(rm, dt, 1)); - ASSERT(rm->rm_firstdatacol > 1); + ASSERT(rm->rm_firstdatacol > 1); - if (parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_q(rm, dt, 1)); + if (parity_valid[VDEV_RAIDZ_Q]) + return (vdev_raidz_reconstruct_q(rm, dt, 1)); - ASSERT(rm->rm_firstdatacol > 2); - break; + ASSERT(rm->rm_firstdatacol > 2); + break; - case 2: - ASSERT(rm->rm_firstdatacol > 1); + case 2: + ASSERT(rm->rm_firstdatacol > 1); - if (parity_valid[VDEV_RAIDZ_P] && - parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_pq(rm, dt, 2)); + if (parity_valid[VDEV_RAIDZ_P] && + parity_valid[VDEV_RAIDZ_Q]) + return (vdev_raidz_reconstruct_pq(rm, dt, 2)); - ASSERT(rm->rm_firstdatacol > 2); + ASSERT(rm->rm_firstdatacol > 2); - break; - } + break; } code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); @@ -1821,11 +1708,16 @@ vdev_raidz_dumpio(vdev_t *vd, caddr_t data, size_t size, * treat the on-disk format as if the only blocks are the complete 128 * KB size. */ - abd_t *abd = abd_get_from_buf(data - (offset - origoffset), + + /* First, fake a zio for vdev_raidz_map_alloc. */ + zio_t *zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); + zio->io_offset = origoffset; + zio->io_size = SPA_OLD_MAXBLOCKSIZE; + zio->io_abd = abd_get_from_buf(data - (offset - origoffset), SPA_OLD_MAXBLOCKSIZE); - rm = vdev_raidz_map_alloc(abd, - SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift, - vd->vdev_children, vd->vdev_nparity); + + rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, + vd->vdev_nparity); coloffset = origoffset; @@ -1860,21 +1752,17 @@ vdev_raidz_dumpio(vdev_t *vd, caddr_t data, size_t size, VERIFY3U(colsize, <=, rc->rc_size); VERIFY3U(colskip, <=, rc->rc_size); - /* - * Note that the child vdev will have a vdev label at the start - * of its range of offsets, hence the need for - * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another - * example of why this calculation is needed. - */ if ((err = cvd->vdev_ops->vdev_op_dumpio(cvd, ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize, - VDEV_LABEL_OFFSET(rc->rc_offset) + colskip, 0, - doread, isdump)) != 0) + rc->rc_offset + colskip, 0, doread, isdump)) != 0) { break; + } } vdev_raidz_map_free(rm); - abd_put(abd); + abd_put(zio->io_abd); + kmem_free(zio, sizeof (zio_t)); + #endif /* KERNEL */ return (err); @@ -1965,8 +1853,7 @@ vdev_raidz_io_start(zio_t *zio) raidz_col_t *rc; int c, i; - rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset, - tvd->vdev_ashift, vd->vdev_children, + rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, vd->vdev_nparity); zio->io_vsd = rm; @@ -2073,7 +1960,7 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; - zfs_ereport_post_checksum(zio->io_spa, vd, + (void) zfs_ereport_post_checksum(zio->io_spa, vd, &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, rc->rc_abd, bad_data, &zbc); } @@ -2141,11 +2028,6 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) return (ret); } -/* - * Keep statistics on all the ways that we used parity to correct data. 
- */
-static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
-
 static int
 vdev_raidz_worst_error(raidz_map_t *rm)
 {
@@ -2251,7 +2133,6 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
 		 */
 		code = vdev_raidz_reconstruct(rm, tgts, n);
 		if (raidz_checksum_verify(zio) == 0) {
-			atomic_inc_64(&raidz_corrected[code]);
 
 			for (i = 0; i < n; i++) {
 				c = tgts[i];
@@ -2466,8 +2347,6 @@ vdev_raidz_io_done(zio_t *zio)
 
 		code = vdev_raidz_reconstruct(rm, tgts, n);
 		if (raidz_checksum_verify(zio) == 0) {
-			atomic_inc_64(&raidz_corrected[code]);
-
 			/*
 			 * If we read more parity disks than were used
 			 * for reconstruction, confirm that the other
@@ -2620,7 +2499,7 @@ vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
 /*
  * Determine if any portion of the provided block resides on a child vdev
  * with a dirty DTL and therefore needs to be resilvered.  The function
- * assumes that at least one DTL is dirty which imples that full stripe
+ * assumes that at least one DTL is dirty which implies that full stripe
  * width blocks must be resilvered.
  */
 static boolean_t
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz_math.c b/usr/src/uts/common/fs/zfs/vdev_raidz_math.c
new file mode 100644
index 0000000000..1591147375
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz_math.c
@@ -0,0 +1,573 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/types.h>
+#include <sys/zio.h>
+#include <sys/debug.h>
+#include <sys/zfs_debug.h>
+#include <sys/vdev_raidz.h>
+#include <sys/vdev_raidz_impl.h>
+#include <sys/simd.h>
+
+#ifndef isspace
+#define	isspace(c)	((c) == ' ' || (c) == '\t' || (c) == '\n' || \
+			(c) == '\r' || (c) == '\f' || (c) == '\013')
+#endif
+
+extern boolean_t raidz_will_scalar_work(void);
+
+/* Opaque implementation with NULL methods to represent original methods */
+static const raidz_impl_ops_t vdev_raidz_original_impl = {
+	.name = "original",
+	.is_supported = raidz_will_scalar_work,
+};
+
+/* RAIDZ parity ops that contain the fastest methods */
+static raidz_impl_ops_t vdev_raidz_fastest_impl = {
+	.name = "fastest"
+};
+
+/* All compiled in implementations */
+const raidz_impl_ops_t *raidz_all_maths[] = {
+	&vdev_raidz_original_impl,
+	&vdev_raidz_scalar_impl,
+#if defined(__amd64)
+	&vdev_raidz_sse2_impl,
+#endif
+#if defined(__amd64)
+	&vdev_raidz_ssse3_impl,
+#endif
+#if defined(__amd64)
+	&vdev_raidz_avx2_impl,
+#endif
+};
+
+/* Indicate that benchmark has been completed */
+static boolean_t raidz_math_initialized = B_FALSE;
+
+/* Select raidz implementation */
+#define	IMPL_FASTEST	(UINT32_MAX)
+#define	IMPL_CYCLE	(UINT32_MAX - 1)
+#define	IMPL_ORIGINAL	(0)
+#define	IMPL_SCALAR	(1)
+
+#define	RAIDZ_IMPL_READ(i)	(*(volatile uint32_t *) &(i))
+
+static uint32_t zfs_vdev_raidz_impl = IMPL_SCALAR;
+static uint32_t user_sel_impl = IMPL_FASTEST;
+
+/* Hold all supported implementations */
+static size_t raidz_supp_impl_cnt = 0;
+static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)];
+
+#if defined(_KERNEL)
+/*
+ * kstats values for supported implementations
+ * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s]
+ *
+ * PORTING NOTE:
+ * On illumos this is not a kstat.  OpenZFS uses its home-grown kstat code
+ * which implements a free-form kstat using additional functionality that does
+ * not exist in illumos.  Because there are no software consumers of this
+ * information, we omit a kstat API.  If an administrator needs to see this
+ * data for some reason, they can use mdb.
+ *
+ * The format of the kstat data on OpenZFS would be a "header" that looks like
+ * this (a column for each entry in the "raidz_gen_name" and "raidz_rec_name"
+ * arrays, starting with the parity function "implementation" name):
+ *	impl gen_p gen_pq gen_pqr rec_p rec_q rec_r rec_pq rec_pr rec_qr rec_pqr
+ * This is followed by a row for each parity function implementation, showing
+ * the "speed" values calculated for that implementation for each of the
+ * parity generation and reconstruction functions in the "raidz_all_maths"
+ * array.
+ */
+static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1];
+
+#endif
+
+/*
+ * Returns the RAIDZ operations for raidz_map() parity calculations.  When
+ * a SIMD implementation is not allowed in the current context, fall back
+ * to the fastest generic implementation.
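RAIDZ_IMPL_READ, defined in the new file above, is worth a note: it forces a single volatile load of the tunable, so each call to the selector dispatches on one consistent snapshot even if vdev_raidz_impl_set() stores a new value concurrently. A minimal userland sketch of the same pattern (illustrative names, not kernel code):

#include <stdint.h>

/* One volatile load per call: a concurrent store cannot split the decision. */
#define	IMPL_READ(i)	(*(volatile uint32_t *)&(i))

static uint32_t impl_tunable;		/* updated by a setter elsewhere */

static uint32_t
impl_snapshot(void)
{
	const uint32_t impl = IMPL_READ(impl_tunable);	/* read exactly once */

	/* ... dispatch on 'impl' without re-reading impl_tunable ... */
	return (impl);
}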
+ */ +const raidz_impl_ops_t * +vdev_raidz_math_get_ops(void) +{ + if (!kfpu_allowed()) + return (&vdev_raidz_scalar_impl); + + raidz_impl_ops_t *ops = NULL; + const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl); + + switch (impl) { + case IMPL_FASTEST: + ASSERT(raidz_math_initialized); + ops = &vdev_raidz_fastest_impl; + break; + case IMPL_CYCLE: + /* Cycle through all supported implementations */ + ASSERT(raidz_math_initialized); + ASSERT3U(raidz_supp_impl_cnt, >, 0); + static size_t cycle_impl_idx = 0; + size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt; + ops = raidz_supp_impl[idx]; + break; + case IMPL_ORIGINAL: + ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl; + break; + case IMPL_SCALAR: + ops = (raidz_impl_ops_t *)&vdev_raidz_scalar_impl; + break; + default: + ASSERT3U(impl, <, raidz_supp_impl_cnt); + ASSERT3U(raidz_supp_impl_cnt, >, 0); + if (impl < ARRAY_SIZE(raidz_all_maths)) + ops = raidz_supp_impl[impl]; + break; + } + + ASSERT3P(ops, !=, NULL); + + return (ops); +} + +/* + * Select parity generation method for raidz_map + */ +int +vdev_raidz_math_generate(raidz_map_t *rm) +{ + raidz_gen_f gen_parity = NULL; + + switch (raidz_parity(rm)) { + case 1: + gen_parity = rm->rm_ops->gen[RAIDZ_GEN_P]; + break; + case 2: + gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQ]; + break; + case 3: + gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQR]; + break; + default: + gen_parity = NULL; + cmn_err(CE_PANIC, "invalid RAID-Z configuration %u", + (uint_t)raidz_parity(rm)); + break; + } + + /* if method is NULL execute the original implementation */ + if (gen_parity == NULL) + return (RAIDZ_ORIGINAL_IMPL); + + gen_parity(rm); + + return (0); +} + +static raidz_rec_f +reconstruct_fun_p_sel(raidz_map_t *rm, const int *parity_valid, + const int nbaddata) +{ + if (nbaddata == 1 && parity_valid[CODE_P]) { + return (rm->rm_ops->rec[RAIDZ_REC_P]); + } + return ((raidz_rec_f) NULL); +} + +static raidz_rec_f +reconstruct_fun_pq_sel(raidz_map_t *rm, const int *parity_valid, + const int nbaddata) +{ + if (nbaddata == 1) { + if (parity_valid[CODE_P]) { + return (rm->rm_ops->rec[RAIDZ_REC_P]); + } else if (parity_valid[CODE_Q]) { + return (rm->rm_ops->rec[RAIDZ_REC_Q]); + } + } else if (nbaddata == 2 && + parity_valid[CODE_P] && parity_valid[CODE_Q]) { + return (rm->rm_ops->rec[RAIDZ_REC_PQ]); + } + return ((raidz_rec_f) NULL); +} + +static raidz_rec_f +reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid, + const int nbaddata) +{ + if (nbaddata == 1) { + if (parity_valid[CODE_P]) { + return (rm->rm_ops->rec[RAIDZ_REC_P]); + } else if (parity_valid[CODE_Q]) { + return (rm->rm_ops->rec[RAIDZ_REC_Q]); + } else if (parity_valid[CODE_R]) { + return (rm->rm_ops->rec[RAIDZ_REC_R]); + } + } else if (nbaddata == 2) { + if (parity_valid[CODE_P] && parity_valid[CODE_Q]) { + return (rm->rm_ops->rec[RAIDZ_REC_PQ]); + } else if (parity_valid[CODE_P] && parity_valid[CODE_R]) { + return (rm->rm_ops->rec[RAIDZ_REC_PR]); + } else if (parity_valid[CODE_Q] && parity_valid[CODE_R]) { + return (rm->rm_ops->rec[RAIDZ_REC_QR]); + } + } else if (nbaddata == 3 && + parity_valid[CODE_P] && parity_valid[CODE_Q] && + parity_valid[CODE_R]) { + return (rm->rm_ops->rec[RAIDZ_REC_PQR]); + } + return ((raidz_rec_f) NULL); +} + +/* + * Select data reconstruction method for raidz_map + * @parity_valid - Parity validity flag + * @dt - Failed data index array + * @nbaddata - Number of failed data columns + */ +int +vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid, + const int *dt, const int nbaddata) +{ + 
raidz_rec_f rec_fn = NULL; + + switch (raidz_parity(rm)) { + case PARITY_P: + rec_fn = reconstruct_fun_p_sel(rm, parity_valid, nbaddata); + break; + case PARITY_PQ: + rec_fn = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata); + break; + case PARITY_PQR: + rec_fn = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata); + break; + default: + cmn_err(CE_PANIC, "invalid RAID-Z configuration %u", + (uint_t)raidz_parity(rm)); + break; + } + + if (rec_fn == NULL) + return (RAIDZ_ORIGINAL_IMPL); + else + return (rec_fn(rm, dt)); +} + +const char *raidz_gen_name[] = { + "gen_p", "gen_pq", "gen_pqr" +}; +const char *raidz_rec_name[] = { + "rec_p", "rec_q", "rec_r", + "rec_pq", "rec_pr", "rec_qr", "rec_pqr" +}; + +#if defined(_KERNEL) + +#define BENCH_D_COLS (8ULL) +#define BENCH_COLS (BENCH_D_COLS + PARITY_PQR) +#define BENCH_ZIO_SIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) /* 128 kiB */ +#define BENCH_NS MSEC2NSEC(25) /* 25ms */ + +typedef void (*benchmark_fn)(raidz_map_t *rm, const int fn); + +static void +benchmark_gen_impl(raidz_map_t *rm, const int fn) +{ + (void) fn; + vdev_raidz_generate_parity(rm); +} + +static void +benchmark_rec_impl(raidz_map_t *rm, const int fn) +{ + static const int rec_tgt[7][3] = { + {1, 2, 3}, /* rec_p: bad QR & D[0] */ + {0, 2, 3}, /* rec_q: bad PR & D[0] */ + {0, 1, 3}, /* rec_r: bad PQ & D[0] */ + {2, 3, 4}, /* rec_pq: bad R & D[0][1] */ + {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */ + {0, 3, 4}, /* rec_qr: bad P & D[0][1] */ + {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */ + }; + + vdev_raidz_reconstruct(rm, rec_tgt[fn], 3); +} + +/* + * Benchmarking of all supported implementations (raidz_supp_impl_cnt) + * is performed by setting the rm_ops pointer and calling the top level + * generate/reconstruct methods of bench_rm. + */ +static void +benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn) +{ + uint64_t run_cnt, speed, best_speed = 0; + hrtime_t t_start, t_diff; + raidz_impl_ops_t *curr_impl; + raidz_impl_kstat_t *fstat = &raidz_impl_kstats[raidz_supp_impl_cnt]; + int impl, i; + + for (impl = 0; impl < raidz_supp_impl_cnt; impl++) { + /* set an implementation to benchmark */ + curr_impl = raidz_supp_impl[impl]; + bench_rm->rm_ops = curr_impl; + + run_cnt = 0; + t_start = gethrtime(); + + do { + for (i = 0; i < 25; i++, run_cnt++) + bench_fn(bench_rm, fn); + + t_diff = gethrtime() - t_start; + } while (t_diff < BENCH_NS); + + speed = run_cnt * BENCH_ZIO_SIZE * NANOSEC; + speed /= (t_diff * BENCH_COLS); + + if (bench_fn == benchmark_gen_impl) + raidz_impl_kstats[impl].gen[fn] = speed; + else + raidz_impl_kstats[impl].rec[fn] = speed; + + /* Update fastest implementation method */ + if (speed > best_speed) { + best_speed = speed; + + if (bench_fn == benchmark_gen_impl) { + fstat->gen[fn] = impl; + vdev_raidz_fastest_impl.gen[fn] = + curr_impl->gen[fn]; + } else { + fstat->rec[fn] = impl; + vdev_raidz_fastest_impl.rec[fn] = + curr_impl->rec[fn]; + } + } + } +} +#endif + +/* + * Initialize and benchmark all supported implementations. 
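The per-disk figure recorded by benchmark_raidz_impl() above is speed = run_cnt * BENCH_ZIO_SIZE * NANOSEC / (t_diff * BENCH_COLS), i.e. bytes processed, normalized to one second and one column. A self-contained arithmetic check of that formula, with made-up sample inputs:

#include <stdint.h>
#include <stdio.h>

#define	NANOSEC		1000000000ULL
#define	BENCH_ZIO_SIZE	(1ULL << 17)	/* 128 KiB, as above */
#define	BENCH_COLS	(8ULL + 3ULL)	/* 8 data + PQR parity columns */

int
main(void)
{
	uint64_t run_cnt = 2000;	/* hypothetical iteration count */
	uint64_t t_diff = 25000000;	/* hypothetical 25ms, in ns */
	uint64_t speed = run_cnt * BENCH_ZIO_SIZE * NANOSEC /
	    (t_diff * BENCH_COLS);

	/* prints 953250909, i.e. ~953 MB/s per disk, for these inputs */
	(void) printf("%llu B/s per disk\n", (unsigned long long)speed);
	return (0);
}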
+ */ +static void +benchmark_raidz(void) +{ + raidz_impl_ops_t *curr_impl; + int i, c; + + /* Move supported impl into raidz_supp_impl */ + for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { + curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i]; + + if (curr_impl->init) + curr_impl->init(); + + if (curr_impl->is_supported()) + raidz_supp_impl[c++] = (raidz_impl_ops_t *)curr_impl; + } + membar_producer(); /* complete raidz_supp_impl[] init */ + raidz_supp_impl_cnt = c; /* number of supported impl */ + +#if defined(_KERNEL) + zio_t *bench_zio = NULL; + raidz_map_t *bench_rm = NULL; + uint64_t bench_parity; + + /* Fake a zio and run the benchmark on a warmed up buffer */ + bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); + bench_zio->io_offset = 0; + bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */ + bench_zio->io_abd = abd_alloc_linear(BENCH_ZIO_SIZE, B_TRUE); + memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE); + + /* Benchmark parity generation methods */ + for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) { + bench_parity = fn + 1; + /* New raidz_map is needed for each generate_p/q/r */ + bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT, + BENCH_D_COLS + bench_parity, bench_parity); + + benchmark_raidz_impl(bench_rm, fn, benchmark_gen_impl); + + vdev_raidz_map_free(bench_rm); + } + + /* Benchmark data reconstruction methods */ + bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT, + BENCH_COLS, PARITY_PQR); + + for (int fn = 0; fn < RAIDZ_REC_NUM; fn++) + benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl); + + vdev_raidz_map_free(bench_rm); + + /* cleanup the bench zio */ + abd_free(bench_zio->io_abd); + kmem_free(bench_zio, sizeof (zio_t)); +#else + /* + * Skip the benchmark in user space to avoid impacting libzpool + * consumers (zdb, zhack, zinject, ztest). The last implementation + * is assumed to be the fastest and used by default. + */ + memcpy(&vdev_raidz_fastest_impl, + raidz_supp_impl[raidz_supp_impl_cnt - 1], + sizeof (vdev_raidz_fastest_impl)); + strcpy(vdev_raidz_fastest_impl.name, "fastest"); +#endif /* _KERNEL */ +} + +void +vdev_raidz_math_init(void) +{ + /* Determine the fastest available implementation. */ + benchmark_raidz(); + + /* Finish initialization */ + atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl); + raidz_math_initialized = B_TRUE; +} + +void +vdev_raidz_math_fini(void) +{ + raidz_impl_ops_t const *curr_impl; + + for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { + curr_impl = raidz_all_maths[i]; + if (curr_impl->fini) + curr_impl->fini(); + } +} + +static const struct { + char *name; + uint32_t sel; +} math_impl_opts[] = { + { "cycle", IMPL_CYCLE }, + { "fastest", IMPL_FASTEST }, + { "original", IMPL_ORIGINAL }, + { "scalar", IMPL_SCALAR } +}; + +/* + * Function sets desired raidz implementation. + * + * If we are called before init(), user preference will be saved in + * user_sel_impl, and applied in later init() call. This occurs when module + * parameter is specified on module load. Otherwise, directly update + * zfs_vdev_raidz_impl. + * + * @val Name of raidz implementation to use + * @param Unused. 
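A hedged usage sketch of the setter defined next may help (illustrative only, not part of the diff). Per the code below, the mandatory names ("cycle", "fastest", "original", "scalar") are always accepted, specific implementation names such as "avx2" only match after vdev_raidz_math_init() has run, and the function returns 0 on success and -EINVAL otherwise:

/*
 * Illustrative caller, not part of the diff.
 */
static void
raidz_impl_try(void)
{
	if (vdev_raidz_impl_set("avx2") != 0) {
		/* unknown or not-yet-supported name: use a mandatory one */
		(void) vdev_raidz_impl_set("fastest");
	}
}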
+ */
+int
+vdev_raidz_impl_set(const char *val)
+{
+	int err = -EINVAL;
+	char req_name[RAIDZ_IMPL_NAME_MAX];
+	uint32_t impl = RAIDZ_IMPL_READ(user_sel_impl);
+	size_t i;
+
+	/* sanitize input */
+	i = strnlen(val, RAIDZ_IMPL_NAME_MAX);
+	if (i == 0 || i == RAIDZ_IMPL_NAME_MAX)
+		return (err);
+
+	strlcpy(req_name, val, RAIDZ_IMPL_NAME_MAX);
+	while (i > 0 && !!isspace(req_name[i-1]))
+		i--;
+	req_name[i] = '\0';
+
+	/* Check mandatory options */
+	for (i = 0; i < ARRAY_SIZE(math_impl_opts); i++) {
+		if (strcmp(req_name, math_impl_opts[i].name) == 0) {
+			impl = math_impl_opts[i].sel;
+			err = 0;
+			break;
+		}
+	}
+
+	/* check all supported impl if init() was already called */
+	if (err != 0 && raidz_math_initialized) {
+		/* check all supported implementations */
+		for (i = 0; i < raidz_supp_impl_cnt; i++) {
+			if (strcmp(req_name, raidz_supp_impl[i]->name) == 0) {
+				impl = i;
+				err = 0;
+				break;
+			}
+		}
+	}
+
+	if (err == 0) {
+		if (raidz_math_initialized)
+			atomic_swap_32(&zfs_vdev_raidz_impl, impl);
+		else
+			atomic_swap_32(&user_sel_impl, impl);
+	}
+
+	return (err);
+}
+
+#if defined(_KERNEL) && defined(__linux__)
+
+static int
+zfs_vdev_raidz_impl_set(const char *val, zfs_kernel_param_t *kp)
+{
+	return (vdev_raidz_impl_set(val));
+}
+
+static int
+zfs_vdev_raidz_impl_get(char *buffer, zfs_kernel_param_t *kp)
+{
+	int i, cnt = 0;
+	char *fmt;
+	const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
+
+	ASSERT(raidz_math_initialized);
+
+	/* list mandatory options */
+	for (i = 0; i < ARRAY_SIZE(math_impl_opts) - 2; i++) {
+		fmt = (impl == math_impl_opts[i].sel) ? "[%s] " : "%s ";
+		cnt += sprintf(buffer + cnt, fmt, math_impl_opts[i].name);
+	}
+
+	/* list all supported implementations */
+	for (i = 0; i < raidz_supp_impl_cnt; i++) {
+		fmt = (i == impl) ? "[%s] " : "%s ";
+		cnt += sprintf(buffer + cnt, fmt, raidz_supp_impl[i]->name);
+	}
+
+	return (cnt);
+}
+
+module_param_call(zfs_vdev_raidz_impl, zfs_vdev_raidz_impl_set,
+    zfs_vdev_raidz_impl_get, NULL, 0644);
+MODULE_PARM_DESC(zfs_vdev_raidz_impl, "Select raidz implementation.");
+#endif
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz_math_avx2.c b/usr/src/uts/common/fs/zfs/vdev_raidz_math_avx2.c
new file mode 100644
index 0000000000..1a0214547b
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz_math_avx2.c
@@ -0,0 +1,424 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+#include <sys/isa_defs.h>
+
+#if defined(__amd64)
+
+#include <sys/types.h>
+#include <sys/simd.h>
+
+#define	__asm __asm__ __volatile__
+
+#define	_REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
+#define	REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
+
+#define	VR0_(REG, ...)
"ymm"#REG +#define VR1_(_1, REG, ...) "ymm"#REG +#define VR2_(_1, _2, REG, ...) "ymm"#REG +#define VR3_(_1, _2, _3, REG, ...) "ymm"#REG +#define VR4_(_1, _2, _3, _4, REG, ...) "ymm"#REG +#define VR5_(_1, _2, _3, _4, _5, REG, ...) "ymm"#REG +#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "ymm"#REG +#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "ymm"#REG + +#define VR0(r...) VR0_(r) +#define VR1(r...) VR1_(r) +#define VR2(r...) VR2_(r, 1) +#define VR3(r...) VR3_(r, 1, 2) +#define VR4(r...) VR4_(r, 1, 2) +#define VR5(r...) VR5_(r, 1, 2, 3) +#define VR6(r...) VR6_(r, 1, 2, 3, 4) +#define VR7(r...) VR7_(r, 1, 2, 3, 4, 5) + +#define R_01(REG1, REG2, ...) REG1, REG2 +#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3 +#define R_23(REG...) _R_23(REG, 1, 2, 3) + +#define ZFS_ASM_BUG() ASSERT(0) + +extern const uint8_t gf_clmul_mod_lt[4*256][16]; + +#define ELEM_SIZE 32 + +typedef struct v { + uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE))); +} v_t; + + +#define XOR_ACC(src, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "vpxor 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n" \ + "vpxor 0x20(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n" \ + "vpxor 0x40(%[SRC]), %%" VR2(r)", %%" VR2(r) "\n" \ + "vpxor 0x60(%[SRC]), %%" VR3(r)", %%" VR3(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + case 2: \ + __asm( \ + "vpxor 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n" \ + "vpxor 0x20(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define XOR(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "vpxor %" VR0(r) ", %" VR4(r)", %" VR4(r) "\n" \ + "vpxor %" VR1(r) ", %" VR5(r)", %" VR5(r) "\n" \ + "vpxor %" VR2(r) ", %" VR6(r)", %" VR6(r) "\n" \ + "vpxor %" VR3(r) ", %" VR7(r)", %" VR7(r)); \ + break; \ + case 4: \ + __asm( \ + "vpxor %" VR0(r) ", %" VR2(r)", %" VR2(r) "\n" \ + "vpxor %" VR1(r) ", %" VR3(r)", %" VR3(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define ZERO(r...) XOR(r, r) + +#define COPY(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "vmovdqa %" VR0(r) ", %" VR4(r) "\n" \ + "vmovdqa %" VR1(r) ", %" VR5(r) "\n" \ + "vmovdqa %" VR2(r) ", %" VR6(r) "\n" \ + "vmovdqa %" VR3(r) ", %" VR7(r)); \ + break; \ + case 4: \ + __asm( \ + "vmovdqa %" VR0(r) ", %" VR2(r) "\n" \ + "vmovdqa %" VR1(r) ", %" VR3(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define LOAD(src, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "vmovdqa 0x00(%[SRC]), %%" VR0(r) "\n" \ + "vmovdqa 0x20(%[SRC]), %%" VR1(r) "\n" \ + "vmovdqa 0x40(%[SRC]), %%" VR2(r) "\n" \ + "vmovdqa 0x60(%[SRC]), %%" VR3(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + case 2: \ + __asm( \ + "vmovdqa 0x00(%[SRC]), %%" VR0(r) "\n" \ + "vmovdqa 0x20(%[SRC]), %%" VR1(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define STORE(dst, r...) 
\ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "vmovdqa %%" VR0(r) ", 0x00(%[DST])\n" \ + "vmovdqa %%" VR1(r) ", 0x20(%[DST])\n" \ + "vmovdqa %%" VR2(r) ", 0x40(%[DST])\n" \ + "vmovdqa %%" VR3(r) ", 0x60(%[DST])\n" \ + : : [DST] "r" (dst)); \ + break; \ + case 2: \ + __asm( \ + "vmovdqa %%" VR0(r) ", 0x00(%[DST])\n" \ + "vmovdqa %%" VR1(r) ", 0x20(%[DST])\n" \ + : : [DST] "r" (dst)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define FLUSH() \ +{ \ + __asm("vzeroupper"); \ +} + +#define MUL2_SETUP() \ +{ \ + __asm("vmovq %0, %%xmm14" :: "r"(0x1d1d1d1d1d1d1d1d)); \ + __asm("vpbroadcastq %xmm14, %ymm14"); \ + __asm("vpxor %ymm15, %ymm15 ,%ymm15"); \ +} + +#define _MUL2(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 2: \ + __asm( \ + "vpcmpgtb %" VR0(r)", %ymm15, %ymm12\n" \ + "vpcmpgtb %" VR1(r)", %ymm15, %ymm13\n" \ + "vpaddb %" VR0(r)", %" VR0(r)", %" VR0(r) "\n" \ + "vpaddb %" VR1(r)", %" VR1(r)", %" VR1(r) "\n" \ + "vpand %ymm14, %ymm12, %ymm12\n" \ + "vpand %ymm14, %ymm13, %ymm13\n" \ + "vpxor %ymm12, %" VR0(r)", %" VR0(r) "\n" \ + "vpxor %ymm13, %" VR1(r)", %" VR1(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define MUL2(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + _MUL2(R_01(r)); \ + _MUL2(R_23(r)); \ + break; \ + case 2: \ + _MUL2(r); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define MUL4(r...) \ +{ \ + MUL2(r); \ + MUL2(r); \ +} + +#define _0f "ymm15" +#define _as "ymm14" +#define _bs "ymm13" +#define _ltmod "ymm12" +#define _ltmul "ymm11" +#define _ta "ymm10" +#define _tb "ymm15" + +static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F; + +#define _MULx2(c, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 2: \ + __asm( \ + "vpbroadcastb (%[mask]), %%" _0f "\n" \ + /* upper bits */ \ + "vbroadcasti128 0x00(%[lt]), %%" _ltmod "\n" \ + "vbroadcasti128 0x10(%[lt]), %%" _ltmul "\n" \ + \ + "vpsraw $0x4, %%" VR0(r) ", %%"_as "\n" \ + "vpsraw $0x4, %%" VR1(r) ", %%"_bs "\n" \ + "vpand %%" _0f ", %%" VR0(r) ", %%" VR0(r) "\n" \ + "vpand %%" _0f ", %%" VR1(r) ", %%" VR1(r) "\n" \ + "vpand %%" _0f ", %%" _as ", %%" _as "\n" \ + "vpand %%" _0f ", %%" _bs ", %%" _bs "\n" \ + \ + "vpshufb %%" _as ", %%" _ltmod ", %%" _ta "\n" \ + "vpshufb %%" _bs ", %%" _ltmod ", %%" _tb "\n" \ + "vpshufb %%" _as ", %%" _ltmul ", %%" _as "\n" \ + "vpshufb %%" _bs ", %%" _ltmul ", %%" _bs "\n" \ + /* lower bits */ \ + "vbroadcasti128 0x20(%[lt]), %%" _ltmod "\n" \ + "vbroadcasti128 0x30(%[lt]), %%" _ltmul "\n" \ + \ + "vpxor %%" _ta ", %%" _as ", %%" _as "\n" \ + "vpxor %%" _tb ", %%" _bs ", %%" _bs "\n" \ + \ + "vpshufb %%" VR0(r) ", %%" _ltmod ", %%" _ta "\n" \ + "vpshufb %%" VR1(r) ", %%" _ltmod ", %%" _tb "\n" \ + "vpshufb %%" VR0(r) ", %%" _ltmul ", %%" VR0(r) "\n"\ + "vpshufb %%" VR1(r) ", %%" _ltmul ", %%" VR1(r) "\n"\ + \ + "vpxor %%" _ta ", %%" VR0(r) ", %%" VR0(r) "\n" \ + "vpxor %%" _as ", %%" VR0(r) ", %%" VR0(r) "\n" \ + "vpxor %%" _tb ", %%" VR1(r) ", %%" VR1(r) "\n" \ + "vpxor %%" _bs ", %%" VR1(r) ", %%" VR1(r) "\n" \ + : : [mask] "r" (&_mul_mask), \ + [lt] "r" (gf_clmul_mod_lt[4*(c)])); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define MUL(c, r...) 
\ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + _MULx2(c, R_01(r)); \ + _MULx2(c, R_23(r)); \ + break; \ + case 2: \ + _MULx2(c, R_01(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define raidz_math_begin() kfpu_begin() +#define raidz_math_end() \ +{ \ + FLUSH(); \ + kfpu_end(); \ +} + + +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() {} +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() {} +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() {} +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() {} +#define MUL_D 0, 1, 2, 3 + +#define GEN_P_STRIDE 4 +#define GEN_P_DEFINE() {} +#define GEN_P_P 0, 1, 2, 3 + +#define GEN_PQ_STRIDE 4 +#define GEN_PQ_DEFINE() {} +#define GEN_PQ_D 0, 1, 2, 3 +#define GEN_PQ_C 4, 5, 6, 7 + +#define GEN_PQR_STRIDE 4 +#define GEN_PQR_DEFINE() {} +#define GEN_PQR_D 0, 1, 2, 3 +#define GEN_PQR_C 4, 5, 6, 7 + +#define SYN_Q_DEFINE() {} +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 + +#define SYN_R_DEFINE() {} +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 + +#define SYN_PQ_DEFINE() {} +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 + +#define REC_PQ_STRIDE 2 +#define REC_PQ_DEFINE() {} +#define REC_PQ_X 0, 1 +#define REC_PQ_Y 2, 3 +#define REC_PQ_T 4, 5 + +#define SYN_PR_DEFINE() {} +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 + +#define REC_PR_STRIDE 2 +#define REC_PR_DEFINE() {} +#define REC_PR_X 0, 1 +#define REC_PR_Y 2, 3 +#define REC_PR_T 4, 5 + +#define SYN_QR_DEFINE() {} +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 + +#define REC_QR_STRIDE 2 +#define REC_QR_DEFINE() {} +#define REC_QR_X 0, 1 +#define REC_QR_Y 2, 3 +#define REC_QR_T 4, 5 + +#define SYN_PQR_DEFINE() {} +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 + +#define REC_PQR_STRIDE 2 +#define REC_PQR_DEFINE() {} +#define REC_PQR_X 0, 1 +#define REC_PQR_Y 2, 3 +#define REC_PQR_Z 4, 5 +#define REC_PQR_XS 6, 7 +#define REC_PQR_YS 8, 9 + + +#include <sys/vdev_raidz_impl.h> +#include "vdev_raidz_math_impl.h" + +DEFINE_GEN_METHODS(avx2); +DEFINE_REC_METHODS(avx2); + +static boolean_t +raidz_will_avx2_work(void) +{ + return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available()); +} + +const raidz_impl_ops_t vdev_raidz_avx2_impl = { + .init = NULL, + .fini = NULL, + .gen = RAIDZ_GEN_METHODS(avx2), + .rec = RAIDZ_REC_METHODS(avx2), + .is_supported = &raidz_will_avx2_work, + .name = "avx2" +}; + +#elif defined(__i386) + +/* 32-bit stub for user-level fakekernel dependencies */ +#include <sys/vdev_raidz_impl.h> +const raidz_impl_ops_t vdev_raidz_avx2_impl = { + .init = NULL, + .fini = NULL, + .gen = NULL, + .rec = NULL, + .is_supported = NULL, + .name = "avx2" +}; + +#endif /* defined(__amd64) */ diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz_math_impl.h b/usr/src/uts/common/fs/zfs/vdev_raidz_math_impl.h new file mode 100644 index 0000000000..89c2082c4a --- /dev/null +++ b/usr/src/uts/common/fs/zfs/vdev_raidz_math_impl.h @@ -0,0 +1,1477 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#ifndef _VDEV_RAIDZ_MATH_IMPL_H
+#define	_VDEV_RAIDZ_MATH_IMPL_H
+
+#include <sys/types.h>
+
+#define	raidz_inline inline __attribute__((always_inline))
+#ifndef noinline
+#define	noinline __attribute__((noinline))
+#endif
+
+/*
+ * Functions calculate multiplication constants for data reconstruction.
+ * Coefficients depend on RAIDZ geometry, indexes of failed child vdevs, and
+ * used parity columns for reconstruction.
+ * @rm		RAIDZ map
+ * @tgtidx	array of missing data indexes
+ * @coeff	output array of coefficients.  Array must be provided by
+ *		the caller and must hold at least MUL_CNT values.
+ */
+static noinline void
+raidz_rec_q_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
+{
+	const unsigned ncols = raidz_ncols(rm);
+	const unsigned x = tgtidx[TARGET_X];
+
+	coeff[MUL_Q_X] = gf_exp2(255 - (ncols - x - 1));
+}
+
+static noinline void
+raidz_rec_r_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
+{
+	const unsigned ncols = raidz_ncols(rm);
+	const unsigned x = tgtidx[TARGET_X];
+
+	coeff[MUL_R_X] = gf_exp4(255 - (ncols - x - 1));
+}
+
+static noinline void
+raidz_rec_pq_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
+{
+	const unsigned ncols = raidz_ncols(rm);
+	const unsigned x = tgtidx[TARGET_X];
+	const unsigned y = tgtidx[TARGET_Y];
+	gf_t a, b, e;
+
+	a = gf_exp2(x + 255 - y);
+	b = gf_exp2(255 - (ncols - x - 1));
+	e = a ^ 0x01;
+
+	coeff[MUL_PQ_X] = gf_div(a, e);
+	coeff[MUL_PQ_Y] = gf_div(b, e);
+}
+
+static noinline void
+raidz_rec_pr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
+{
+	const unsigned ncols = raidz_ncols(rm);
+	const unsigned x = tgtidx[TARGET_X];
+	const unsigned y = tgtidx[TARGET_Y];
+
+	gf_t a, b, e;
+
+	a = gf_exp4(x + 255 - y);
+	b = gf_exp4(255 - (ncols - x - 1));
+	e = a ^ 0x01;
+
+	coeff[MUL_PR_X] = gf_div(a, e);
+	coeff[MUL_PR_Y] = gf_div(b, e);
+}
+
+static noinline void
+raidz_rec_qr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
+{
+	const unsigned ncols = raidz_ncols(rm);
+	const unsigned x = tgtidx[TARGET_X];
+	const unsigned y = tgtidx[TARGET_Y];
+
+	gf_t nx, ny, nxxy, nxyy, d;
+
+	nx = gf_exp2(ncols - x - 1);
+	ny = gf_exp2(ncols - y - 1);
+	nxxy = gf_mul(gf_mul(nx, nx), ny);
+	nxyy = gf_mul(gf_mul(nx, ny), ny);
+	d = nxxy ^ nxyy;
+
+	coeff[MUL_QR_XQ] = ny;
+	coeff[MUL_QR_X] = gf_div(ny, d);
+	coeff[MUL_QR_YQ] = nx;
+	coeff[MUL_QR_Y] = gf_div(nx, d);
+}
+
+static noinline void
+raidz_rec_pqr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
+{
+	const unsigned ncols = raidz_ncols(rm);
+	const unsigned x = tgtidx[TARGET_X];
+	const unsigned y = tgtidx[TARGET_Y];
+	const unsigned z = tgtidx[TARGET_Z];
+
+	gf_t nx, ny, nz, nxx, nyy, nzz, nyyz, nyzz, xd, yd;
+
+	nx = gf_exp2(ncols - x - 1);
+	ny = gf_exp2(ncols - y - 1);
+	nz = gf_exp2(ncols - z - 1);
+
+	nxx = gf_exp4(ncols - x - 1);
+	nyy = gf_exp4(ncols - y - 1);
+	nzz = gf_exp4(ncols - z - 1);
+
+	nyyz = gf_mul(gf_mul(ny, nz), ny);
+	nyzz = gf_mul(nzz, ny);
+
+	xd = gf_mul(nxx, ny) ^ gf_mul(nx, nyy) ^ nyyz ^
+	    gf_mul(nxx, nz) ^ gf_mul(nzz, nx) ^ nyzz;
+
+	yd = gf_inv(ny ^ nz);
+
+	coeff[MUL_PQR_XP] = gf_div(nyyz ^ nyzz, xd);
+	coeff[MUL_PQR_XQ] = gf_div(nyy ^ nzz, xd);
+	coeff[MUL_PQR_XR] = gf_div(ny ^ nz, xd);
+	coeff[MUL_PQR_YU] = nx;
+	coeff[MUL_PQR_YP] = gf_mul(nz, yd);
+	coeff[MUL_PQR_YQ] = yd;
+}
+
+/*
+ * Method for zeroing a buffer (can be implemented using SIMD).
+ * This method is used by multiple gen/rec functions.
+ *
+ * @dc		Destination buffer
+ * @dsize	Destination buffer size
+ * @private	Unused
+ */
+static int
+raidz_zero_abd_cb(void *dc, size_t dsize, void *private)
+{
+	v_t *dst = (v_t *)dc;
+	size_t i;
+
+	ZERO_DEFINE();
+
+	(void) private;			/* unused */
+
+	ZERO(ZERO_D);
+
+	for (i = 0; i < dsize / sizeof (v_t); i += (2 * ZERO_STRIDE)) {
+		STORE(dst + i, ZERO_D);
+		STORE(dst + i + ZERO_STRIDE, ZERO_D);
+	}
+
+	return (0);
+}
+
+#define	raidz_zero(dabd, size)						\
+{									\
+	abd_iterate_func(dabd, 0, size, raidz_zero_abd_cb, NULL);	\
+}
+
+/*
+ * Method for copying a buffer (can be implemented using SIMD).
+ * This method is used by multiple gen/rec functions.
+ *
+ * @dc		Destination buffer
+ * @sc		Source buffer
+ * @size	Buffer size
+ * @private	Unused
+ */
+static int
+raidz_copy_abd_cb(void *dc, void *sc, size_t size, void *private)
+{
+	v_t *dst = (v_t *)dc;
+	const v_t *src = (v_t *)sc;
+	size_t i;
+
+	COPY_DEFINE();
+
+	(void) private;			/* unused */
+
+	for (i = 0; i < size / sizeof (v_t); i += (2 * COPY_STRIDE)) {
+		LOAD(src + i, COPY_D);
+		STORE(dst + i, COPY_D);
+
+		LOAD(src + i + COPY_STRIDE, COPY_D);
+		STORE(dst + i + COPY_STRIDE, COPY_D);
+	}
+
+	return (0);
+}
+
+
+#define	raidz_copy(dabd, sabd, size)					\
+{									\
+	abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_copy_abd_cb, NULL);\
+}
+
+/*
+ * Method for adding (XORing) two buffers.
+ * Source and destination are XORed together and the result is stored in
+ * the destination buffer.  This method is used by multiple gen/rec functions.
+ *
+ * @dc		Destination buffer
+ * @sc		Source buffer
+ * @size	Buffer size
+ * @private	Unused
+ */
+static int
+raidz_add_abd_cb(void *dc, void *sc, size_t size, void *private)
+{
+	v_t *dst = (v_t *)dc;
+	const v_t *src = (v_t *)sc;
+	size_t i;
+
+	ADD_DEFINE();
+
+	(void) private;			/* unused */
+
+	for (i = 0; i < size / sizeof (v_t); i += (2 * ADD_STRIDE)) {
+		LOAD(dst + i, ADD_D);
+		XOR_ACC(src + i, ADD_D);
+		STORE(dst + i, ADD_D);
+
+		LOAD(dst + i + ADD_STRIDE, ADD_D);
+		XOR_ACC(src + i + ADD_STRIDE, ADD_D);
+		STORE(dst + i + ADD_STRIDE, ADD_D);
+	}
+
+	return (0);
+}
+
+#define	raidz_add(dabd, sabd, size)					\
+{									\
+	abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_add_abd_cb, NULL);\
+}
+
+/*
+ * Method for multiplying a buffer with a constant in GF(2^8).
+ * Symbols from the buffer are multiplied by the constant and the result
+ * is stored back in the same buffer.
+ *
+ * @dc		In/Out data buffer.
+ * @size Size of the buffer + * @private pointer to the multiplication constant (unsigned) + */ +static int +raidz_mul_abd_cb(void *dc, size_t size, void *private) +{ + const unsigned mul = *((unsigned *)private); + v_t *d = (v_t *)dc; + size_t i; + + MUL_DEFINE(); + + for (i = 0; i < size / sizeof (v_t); i += (2 * MUL_STRIDE)) { + LOAD(d + i, MUL_D); + MUL(mul, MUL_D); + STORE(d + i, MUL_D); + + LOAD(d + i + MUL_STRIDE, MUL_D); + MUL(mul, MUL_D); + STORE(d + i + MUL_STRIDE, MUL_D); + } + + return (0); +} + + +/* + * Syndrome generation/update macros + * + * Require LOAD(), XOR(), STORE(), MUL2(), and MUL4() macros + */ +#define P_D_SYNDROME(D, T, t) \ +{ \ + LOAD((t), T); \ + XOR(D, T); \ + STORE((t), T); \ +} + +#define Q_D_SYNDROME(D, T, t) \ +{ \ + LOAD((t), T); \ + MUL2(T); \ + XOR(D, T); \ + STORE((t), T); \ +} + +#define Q_SYNDROME(T, t) \ +{ \ + LOAD((t), T); \ + MUL2(T); \ + STORE((t), T); \ +} + +#define R_D_SYNDROME(D, T, t) \ +{ \ + LOAD((t), T); \ + MUL4(T); \ + XOR(D, T); \ + STORE((t), T); \ +} + +#define R_SYNDROME(T, t) \ +{ \ + LOAD((t), T); \ + MUL4(T); \ + STORE((t), T); \ +} + + +/* + * PARITY CALCULATION + * + * Macros *_SYNDROME are used for parity/syndrome calculation. + * *_D_SYNDROME() macros are used to calculate syndrome between 0 and + * length of data column, and *_SYNDROME() macros are only for updating + * the parity/syndrome if data column is shorter. + * + * P parity is calculated using raidz_add_abd(). + */ + +/* + * Generate P parity (RAIDZ1) + * + * @rm RAIDZ map + */ +static raidz_inline void +raidz_generate_p_impl(raidz_map_t * const rm) +{ + size_t c; + const size_t ncols = raidz_ncols(rm); + const size_t psize = rm->rm_col[CODE_P].rc_size; + abd_t *pabd = rm->rm_col[CODE_P].rc_abd; + size_t size; + abd_t *dabd; + + raidz_math_begin(); + + /* start with first data column */ + raidz_copy(pabd, rm->rm_col[1].rc_abd, psize); + + for (c = 2; c < ncols; c++) { + dabd = rm->rm_col[c].rc_abd; + size = rm->rm_col[c].rc_size; + + /* add data column */ + raidz_add(pabd, dabd, size); + } + + raidz_math_end(); +} + + +/* + * Generate PQ parity (RAIDZ2) + * The function is called per data column. 
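Q_D_SYNDROME() above is a GF(2^8) Horner step: the running Q column is multiplied by 2 and the next data column is XORed in, which after all columns yields Q = sum of 2^i * Di, matching the Q equation quoted later in this header. A scalar sketch of the same step (illustrative only; the kernel versions operate on SIMD registers and abd buffers):

#include <stddef.h>
#include <stdint.h>

/* Multiply by 2 in GF(2^8) with the 0x11d polynomial (0x1d below x^8). */
static uint8_t
gf_mul2(uint8_t a)
{
	return ((uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0)));
}

/* One Q_D_SYNDROME step over a whole column: q = 2*q ^ d, per byte. */
static void
q_syndrome_step(uint8_t *q, const uint8_t *d, size_t len)
{
	for (size_t i = 0; i < len; i++)
		q[i] = gf_mul2(q[i]) ^ d[i];
}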
+ *
+ * @c		array of pointers to parity (code) columns
+ * @dc		pointer to data column
+ * @csize	size of parity columns
+ * @dsize	size of data column
+ */
+static void
+raidz_gen_pq_add(void **c, const void *dc, const size_t csize,
+    const size_t dsize)
+{
+	v_t *p = (v_t *)c[0];
+	v_t *q = (v_t *)c[1];
+	const v_t *d = (const v_t *)dc;
+	const v_t * const dend = d + (dsize / sizeof (v_t));
+	const v_t * const qend = q + (csize / sizeof (v_t));
+
+	GEN_PQ_DEFINE();
+
+	MUL2_SETUP();
+
+	for (; d < dend; d += GEN_PQ_STRIDE, p += GEN_PQ_STRIDE,
+	    q += GEN_PQ_STRIDE) {
+		LOAD(d, GEN_PQ_D);
+		P_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, p);
+		Q_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, q);
+	}
+	for (; q < qend; q += GEN_PQ_STRIDE) {
+		Q_SYNDROME(GEN_PQ_C, q);
+	}
+}
+
+
+/*
+ * Generate PQ parity (RAIDZ2)
+ *
+ * @rm	RAIDZ map
+ */
+static raidz_inline void
+raidz_generate_pq_impl(raidz_map_t * const rm)
+{
+	size_t c;
+	const size_t ncols = raidz_ncols(rm);
+	const size_t csize = rm->rm_col[CODE_P].rc_size;
+	size_t dsize;
+	abd_t *dabd;
+	abd_t *cabds[] = {
+		rm->rm_col[CODE_P].rc_abd,
+		rm->rm_col[CODE_Q].rc_abd
+	};
+
+	raidz_math_begin();
+
+	raidz_copy(cabds[CODE_P], rm->rm_col[2].rc_abd, csize);
+	raidz_copy(cabds[CODE_Q], rm->rm_col[2].rc_abd, csize);
+
+	for (c = 3; c < ncols; c++) {
+		dabd = rm->rm_col[c].rc_abd;
+		dsize = rm->rm_col[c].rc_size;
+
+		abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 2,
+		    raidz_gen_pq_add);
+	}
+
+	raidz_math_end();
+}
+
+
+/*
+ * Generate PQR parity (RAIDZ3)
+ * The function is called per data column.
+ *
+ * @c		array of pointers to parity (code) columns
+ * @dc		pointer to data column
+ * @csize	size of parity columns
+ * @dsize	size of data column
+ */
+static void
+raidz_gen_pqr_add(void **c, const void *dc, const size_t csize,
+    const size_t dsize)
+{
+	v_t *p = (v_t *)c[0];
+	v_t *q = (v_t *)c[1];
+	v_t *r = (v_t *)c[CODE_R];
+	const v_t *d = (const v_t *)dc;
+	const v_t * const dend = d + (dsize / sizeof (v_t));
+	const v_t * const qend = q + (csize / sizeof (v_t));
+
+	GEN_PQR_DEFINE();
+
+	MUL2_SETUP();
+
+	for (; d < dend; d += GEN_PQR_STRIDE, p += GEN_PQR_STRIDE,
+	    q += GEN_PQR_STRIDE, r += GEN_PQR_STRIDE) {
+		LOAD(d, GEN_PQR_D);
+		P_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, p);
+		Q_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, q);
+		R_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, r);
+	}
+	for (; q < qend; q += GEN_PQR_STRIDE, r += GEN_PQR_STRIDE) {
+		Q_SYNDROME(GEN_PQR_C, q);
+		R_SYNDROME(GEN_PQR_C, r);
+	}
+}
+
+
+/*
+ * Generate PQR parity (RAIDZ3)
+ *
+ * @rm	RAIDZ map
+ */
+static raidz_inline void
+raidz_generate_pqr_impl(raidz_map_t * const rm)
+{
+	size_t c;
+	const size_t ncols = raidz_ncols(rm);
+	const size_t csize = rm->rm_col[CODE_P].rc_size;
+	size_t dsize;
+	abd_t *dabd;
+	abd_t *cabds[] = {
+		rm->rm_col[CODE_P].rc_abd,
+		rm->rm_col[CODE_Q].rc_abd,
+		rm->rm_col[CODE_R].rc_abd
+	};
+
+	raidz_math_begin();
+
+	raidz_copy(cabds[CODE_P], rm->rm_col[3].rc_abd, csize);
+	raidz_copy(cabds[CODE_Q], rm->rm_col[3].rc_abd, csize);
+	raidz_copy(cabds[CODE_R], rm->rm_col[3].rc_abd, csize);
+
+	for (c = 4; c < ncols; c++) {
+		dabd = rm->rm_col[c].rc_abd;
+		dsize = rm->rm_col[c].rc_size;
+
+		abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 3,
+		    raidz_gen_pqr_add);
+	}
+
+	raidz_math_end();
+}
+
+
+/*
+ * DATA RECONSTRUCTION
+ *
+ * Data reconstruction process consists of two phases:
+ *	- Syndrome calculation
+ *	- Data reconstruction
+ *
+ * Syndrome is calculated by generating parity using available data columns
+ * and zeros in places of erasure.  Existing parity is added to the
+ * corresponding syndrome value to obtain the [P|Q|R]syn values from equation:
+ *	P = Psyn + Dx + Dy + Dz
+ *	Q = Qsyn + 2^x * Dx + 2^y * Dy + 2^z * Dz
+ *	R = Rsyn + 4^x * Dx + 4^y * Dy + 4^z * Dz
+ *
+ * For the data reconstruction phase, the corresponding equations are solved
+ * for the missing data (Dx, Dy, Dz).  This generally involves multiplying
+ * known symbols by a coefficient and adding them together.  The
+ * multiplication constant coefficients are calculated ahead of the operation
+ * in the raidz_rec_[q|r|pq|pr|qr|pqr]_coeff() functions.
+ *
+ * IMPLEMENTATION NOTE: A RAID-Z block can have complex geometry, with "big"
+ * and "short" columns.
+ * For this reason, reconstruction is performed in a minimum of two steps.
+ * First, from offset 0 to short_size, then from short_size to big_size.
+ * Calculation functions REC_[*]_BLOCK() are implemented to work over both
+ * ranges.  The split also enables removal of conditional expressions from
+ * loop bodies, improving throughput of SIMD implementations.
+ * For the best performance, all functions marked with the raidz_inline
+ * attribute must be inlined by the compiler.
+ *
+ *	parity		data
+ *	columns		columns
+ * <----------> <------------------>
+ *		     x       y  <----+ missing columns (x, y)
+ *		     |       |
+ * +---+---+---+---+-v-+---+-v-+---+   ^ 0
+ * |   |   |   |   |   |   |   |   |   |
+ * |   |   |   |   |   |   |   |   |   |
+ * | P | Q | R | D | D | D | D | D |   |
+ * |   |   |   | 0 | 1 | 2 | 3 | 4 |   |
+ * |   |   |   |   |   |   |   |   |   v
+ * |   |   |   |   |   +---+---+---+   ^ short_size
+ * |   |   |   |   |   |               |
+ * +---+---+---+---+---+               v big_size
+ * <------------------> <---------->
+ *	big columns	short columns
+ *
+ */
+
+
+
+
+/*
+ * Reconstruct single data column using P parity
+ *
+ * @syn_method	raidz_add_abd()
+ * @rec_method	not applicable
+ *
+ * @rm		RAIDZ map
+ * @tgtidx	array of missing data indexes
+ */
+static raidz_inline int
+raidz_reconstruct_p_impl(raidz_map_t *rm, const int *tgtidx)
+{
+	size_t c;
+	const size_t firstdc = raidz_parity(rm);
+	const size_t ncols = raidz_ncols(rm);
+	const size_t x = tgtidx[TARGET_X];
+	const size_t xsize = rm->rm_col[x].rc_size;
+	abd_t *xabd = rm->rm_col[x].rc_abd;
+	size_t size;
+	abd_t *dabd;
+
+	raidz_math_begin();
+
+	/* copy P into target */
+	raidz_copy(xabd, rm->rm_col[CODE_P].rc_abd, xsize);
+
+	/* generate p_syndrome */
+	for (c = firstdc; c < ncols; c++) {
+		if (c == x)
+			continue;
+
+		dabd = rm->rm_col[c].rc_abd;
+		size = MIN(rm->rm_col[c].rc_size, xsize);
+
+		raidz_add(xabd, dabd, size);
+	}
+
+	raidz_math_end();
+
+	return (1 << CODE_P);
+}
+
+
+/*
+ * Generate Q syndrome (Qsyn)
+ *
+ * @xc		array of pointers to syndrome columns
+ * @dc		data column (NULL if missing)
+ * @xsize	size of syndrome columns
+ * @dsize	size of data column (0 if missing)
+ */
+static void
+raidz_syn_q_abd(void **xc, const void *dc, const size_t xsize,
+    const size_t dsize)
+{
+	v_t *x = (v_t *)xc[TARGET_X];
+	const v_t *d = (const v_t *)dc;
+	const v_t * const dend = d + (dsize / sizeof (v_t));
+	const v_t * const xend = x + (xsize / sizeof (v_t));
+
+	SYN_Q_DEFINE();
+
+	MUL2_SETUP();
+
+	for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE) {
+		LOAD(d, SYN_Q_D);
+		Q_D_SYNDROME(SYN_Q_D, SYN_Q_X, x);
+	}
+	for (; x < xend; x += SYN_STRIDE) {
+		Q_SYNDROME(SYN_Q_X, x);
+	}
+}
+
+
+/*
+ * Reconstruct single data column using Q parity
+ *
+ * @syn_method	raidz_add_abd()
+ * @rec_method	raidz_mul_abd_cb()
+ *
+ * @rm		RAIDZ map
+ * @tgtidx	array of missing data indexes
+ */
+static raidz_inline int
+raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx)
+{
+
size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + abd_t *xabd = rm->rm_col[x].rc_abd; + const size_t xsize = rm->rm_col[x].rc_size; + abd_t *tabds[] = { xabd }; + + unsigned coeff[MUL_CNT]; + raidz_rec_q_coeff(rm, tgtidx, coeff); + + raidz_math_begin(); + + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + } + + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, + raidz_syn_q_abd); + } + + /* add Q to the syndrome */ + raidz_add(xabd, rm->rm_col[CODE_Q].rc_abd, xsize); + + /* transform the syndrome */ + abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void*) coeff); + + raidz_math_end(); + + return (1 << CODE_Q); +} + + +/* + * Generate R syndrome (Rsyn) + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) + */ +static void +raidz_syn_r_abd(void **xc, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *)xc[TARGET_X]; + const v_t *d = (const v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const xend = x + (tsize / sizeof (v_t)); + + SYN_R_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE) { + LOAD(d, SYN_R_D); + R_D_SYNDROME(SYN_R_D, SYN_R_X, x); + } + for (; x < xend; x += SYN_STRIDE) { + R_SYNDROME(SYN_R_X, x); + } +} + + +/* + * Reconstruct single data column using R parity + * + * @syn_method raidz_add_abd() + * @rec_method raidz_mul_abd_cb() + * + * @rm RAIDZ map + * @tgtidx array of missing data indexes + */ +static raidz_inline int +raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx) +{ + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + const size_t xsize = rm->rm_col[x].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *tabds[] = { xabd }; + + unsigned coeff[MUL_CNT]; + raidz_rec_r_coeff(rm, tgtidx, coeff); + + raidz_math_begin(); + + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + } + + + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, + raidz_syn_r_abd); + } + + /* add R to the syndrome */ + raidz_add(xabd, rm->rm_col[CODE_R].rc_abd, xsize); + + /* transform the syndrome */ + abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff); + + raidz_math_end(); + + return (1 << CODE_R); +} + + +/* + * Generate P and Q syndromes + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) + */ +static void +raidz_syn_pq_abd(void **tc, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *)tc[TARGET_X]; + v_t *y = (v_t *)tc[TARGET_Y]; + const v_t *d = (const v_t *)dc; + const v_t * const dend = d + 
(dsize / sizeof (v_t));
+	const v_t * const yend = y + (tsize / sizeof (v_t));
+
+	SYN_PQ_DEFINE();
+
+	MUL2_SETUP();
+
+	for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) {
+		LOAD(d, SYN_PQ_D);
+		P_D_SYNDROME(SYN_PQ_D, SYN_PQ_X, x);
+		Q_D_SYNDROME(SYN_PQ_D, SYN_PQ_X, y);
+	}
+	for (; y < yend; y += SYN_STRIDE) {
+		Q_SYNDROME(SYN_PQ_X, y);
+	}
+}
+
+/*
+ * Reconstruct data using PQ parity and PQ syndromes
+ *
+ * @tc		syndrome/result columns
+ * @tsize	size of syndrome/result columns
+ * @c		parity columns
+ * @mul		array of multiplication constants
+ */
+static void
+raidz_rec_pq_abd(void **tc, const size_t tsize, void **c,
+    const unsigned *mul)
+{
+	v_t *x = (v_t *)tc[TARGET_X];
+	v_t *y = (v_t *)tc[TARGET_Y];
+	const v_t * const xend = x + (tsize / sizeof (v_t));
+	const v_t *p = (v_t *)c[CODE_P];
+	const v_t *q = (v_t *)c[CODE_Q];
+
+	REC_PQ_DEFINE();
+
+	for (; x < xend; x += REC_PQ_STRIDE, y += REC_PQ_STRIDE,
+	    p += REC_PQ_STRIDE, q += REC_PQ_STRIDE) {
+		LOAD(x, REC_PQ_X);
+		LOAD(y, REC_PQ_Y);
+
+		XOR_ACC(p, REC_PQ_X);
+		XOR_ACC(q, REC_PQ_Y);
+
+		/* Save Pxy */
+		COPY(REC_PQ_X, REC_PQ_T);
+
+		/* Calc X */
+		MUL(mul[MUL_PQ_X], REC_PQ_X);
+		MUL(mul[MUL_PQ_Y], REC_PQ_Y);
+		XOR(REC_PQ_Y, REC_PQ_X);
+		STORE(x, REC_PQ_X);
+
+		/* Calc Y */
+		XOR(REC_PQ_T, REC_PQ_X);
+		STORE(y, REC_PQ_X);
+	}
+}
+
+
+/*
+ * Reconstruct two data columns using PQ parity
+ *
+ * @syn_method	raidz_syn_pq_abd()
+ * @rec_method	raidz_rec_pq_abd()
+ *
+ * @rm		RAIDZ map
+ * @tgtidx	array of missing data indexes
+ */
+static raidz_inline int
+raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx)
+{
+	size_t c;
+	size_t dsize;
+	abd_t *dabd;
+	const size_t firstdc = raidz_parity(rm);
+	const size_t ncols = raidz_ncols(rm);
+	const size_t x = tgtidx[TARGET_X];
+	const size_t y = tgtidx[TARGET_Y];
+	const size_t xsize = rm->rm_col[x].rc_size;
+	const size_t ysize = rm->rm_col[y].rc_size;
+	abd_t *xabd = rm->rm_col[x].rc_abd;
+	abd_t *yabd = rm->rm_col[y].rc_abd;
+	abd_t *tabds[2] = { xabd, yabd };
+	abd_t *cabds[] = {
+		rm->rm_col[CODE_P].rc_abd,
+		rm->rm_col[CODE_Q].rc_abd
+	};
+
+	unsigned coeff[MUL_CNT];
+	raidz_rec_pq_coeff(rm, tgtidx, coeff);
+
+	/*
+	 * Check if some of the targets are shorter than others.  In this
+	 * case, the shorter targets need to be replaced with new buffers
+	 * so that the syndrome can be calculated.
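Before the two-target PQ machinery below, it may help to see the simplest instance of the syndrome scheme described earlier: in the P-only case, a single missing column is just P XORed with every surviving data column. A compact scalar model (illustrative only; raidz_reconstruct_p_impl() above does the equivalent over abd buffers):

#include <stddef.h>
#include <stdint.h>

/*
 * Single-erasure P reconstruction: with P = D0 ^ D1 ^ ... ^ Dn-1,
 * the missing column is P XORed with all surviving data columns.
 */
static void
rec_from_p(uint8_t *x, const uint8_t *p, const uint8_t **surv,
    size_t nsurv, size_t len)
{
	for (size_t i = 0; i < len; i++) {
		uint8_t v = p[i];

		for (size_t c = 0; c < nsurv; c++)
			v ^= surv[c][i];
		x[i] = v;
	}
}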
+ */ + if (ysize < xsize) { + yabd = abd_alloc(xsize, B_FALSE); + tabds[1] = yabd; + } + + raidz_math_begin(); + + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + raidz_zero(yabd, xsize); + } + + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x || c == y) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + raidz_syn_pq_abd); + } + + abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_pq_abd, coeff); + + /* Copy shorter targets back to the original abd buffer */ + if (ysize < xsize) + raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + + raidz_math_end(); + + if (ysize < xsize) + abd_free(yabd); + + return ((1 << CODE_P) | (1 << CODE_Q)); +} + + +/* + * Generate P and R syndromes + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) + */ +static void +raidz_syn_pr_abd(void **c, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *)c[TARGET_X]; + v_t *y = (v_t *)c[TARGET_Y]; + const v_t *d = (const v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const yend = y + (tsize / sizeof (v_t)); + + SYN_PR_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) { + LOAD(d, SYN_PR_D); + P_D_SYNDROME(SYN_PR_D, SYN_PR_X, x); + R_D_SYNDROME(SYN_PR_D, SYN_PR_X, y); + } + for (; y < yend; y += SYN_STRIDE) { + R_SYNDROME(SYN_PR_X, y); + } +} + +/* + * Reconstruct data using PR parity and PR syndromes + * + * @tc syndrome/result columns + * @tsize size of syndrome/result columns + * @c parity columns + * @mul array of multiplication constants + */ +static void +raidz_rec_pr_abd(void **t, const size_t tsize, void **c, + const unsigned *mul) +{ + v_t *x = (v_t *)t[TARGET_X]; + v_t *y = (v_t *)t[TARGET_Y]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *p = (v_t *)c[CODE_P]; + const v_t *q = (v_t *)c[CODE_Q]; + + REC_PR_DEFINE(); + + for (; x < xend; x += REC_PR_STRIDE, y += REC_PR_STRIDE, + p += REC_PR_STRIDE, q += REC_PR_STRIDE) { + LOAD(x, REC_PR_X); + LOAD(y, REC_PR_Y); + XOR_ACC(p, REC_PR_X); + XOR_ACC(q, REC_PR_Y); + + /* Save Pxy */ + COPY(REC_PR_X, REC_PR_T); + + /* Calc X */ + MUL(mul[MUL_PR_X], REC_PR_X); + MUL(mul[MUL_PR_Y], REC_PR_Y); + XOR(REC_PR_Y, REC_PR_X); + STORE(x, REC_PR_X); + + /* Calc Y */ + XOR(REC_PR_T, REC_PR_X); + STORE(y, REC_PR_X); + } +} + + +/* + * Reconstruct two data columns using PR parity + * + * @syn_method raidz_syn_pr_abd() + * @rec_method raidz_rec_pr_abd() + * + * @rm RAIDZ map + * @tgtidx array of missing data indexes + */ +static raidz_inline int +raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx) +{ + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[0]; + const size_t y = tgtidx[1]; + const size_t xsize = rm->rm_col[x].rc_size; + const size_t ysize = rm->rm_col[y].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *yabd = rm->rm_col[y].rc_abd; + abd_t *tabds[2] = { xabd, yabd }; + abd_t *cabds[] = { + rm->rm_col[CODE_P].rc_abd, + rm->rm_col[CODE_R].rc_abd + }; + unsigned coeff[MUL_CNT]; + raidz_rec_pr_coeff(rm, 
tgtidx, coeff); + + /* + * Check if some of targets are shorter then others. + * They need to be replaced with a new buffer so that syndrome can + * be calculated on full length. + */ + if (ysize < xsize) { + yabd = abd_alloc(xsize, B_FALSE); + tabds[1] = yabd; + } + + raidz_math_begin(); + + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + raidz_zero(yabd, xsize); + } + + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x || c == y) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + raidz_syn_pr_abd); + } + + abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_pr_abd, coeff); + + /* + * Copy shorter targets back to the original abd buffer + */ + if (ysize < xsize) + raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + + raidz_math_end(); + + if (ysize < xsize) + abd_free(yabd); + + return ((1 << CODE_P) | (1 << CODE_Q)); +} + + +/* + * Generate Q and R syndromes + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) + */ +static void +raidz_syn_qr_abd(void **c, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *)c[TARGET_X]; + v_t *y = (v_t *)c[TARGET_Y]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *d = (const v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + + SYN_QR_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) { + LOAD(d, SYN_PQ_D); + Q_D_SYNDROME(SYN_QR_D, SYN_QR_X, x); + R_D_SYNDROME(SYN_QR_D, SYN_QR_X, y); + } + for (; x < xend; x += SYN_STRIDE, y += SYN_STRIDE) { + Q_SYNDROME(SYN_QR_X, x); + R_SYNDROME(SYN_QR_X, y); + } +} + + +/* + * Reconstruct data using QR parity and QR syndromes + * + * @tc syndrome/result columns + * @tsize size of syndrome/result columns + * @c parity columns + * @mul array of multiplication constants + */ +static void +raidz_rec_qr_abd(void **t, const size_t tsize, void **c, + const unsigned *mul) +{ + v_t *x = (v_t *)t[TARGET_X]; + v_t *y = (v_t *)t[TARGET_Y]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *p = (v_t *)c[CODE_P]; + const v_t *q = (v_t *)c[CODE_Q]; + + REC_QR_DEFINE(); + + for (; x < xend; x += REC_QR_STRIDE, y += REC_QR_STRIDE, + p += REC_QR_STRIDE, q += REC_QR_STRIDE) { + LOAD(x, REC_QR_X); + LOAD(y, REC_QR_Y); + + XOR_ACC(p, REC_QR_X); + XOR_ACC(q, REC_QR_Y); + + /* Save Pxy */ + COPY(REC_QR_X, REC_QR_T); + + /* Calc X */ + MUL(mul[MUL_QR_XQ], REC_QR_X); /* X = Q * xqm */ + XOR(REC_QR_Y, REC_QR_X); /* X = R ^ X */ + MUL(mul[MUL_QR_X], REC_QR_X); /* X = X * xm */ + STORE(x, REC_QR_X); + + /* Calc Y */ + MUL(mul[MUL_QR_YQ], REC_QR_T); /* X = Q * xqm */ + XOR(REC_QR_Y, REC_QR_T); /* X = R ^ X */ + MUL(mul[MUL_QR_Y], REC_QR_T); /* X = X * xm */ + STORE(y, REC_QR_T); + } +} + + +/* + * Reconstruct two data columns using QR parity + * + * @syn_method raidz_syn_qr_abd() + * @rec_method raidz_rec_qr_abd() + * + * @rm RAIDZ map + * @tgtidx array of missing data indexes + */ +static raidz_inline int +raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx) +{ + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + 
const size_t x = tgtidx[TARGET_X]; + const size_t y = tgtidx[TARGET_Y]; + const size_t xsize = rm->rm_col[x].rc_size; + const size_t ysize = rm->rm_col[y].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *yabd = rm->rm_col[y].rc_abd; + abd_t *tabds[2] = { xabd, yabd }; + abd_t *cabds[] = { + rm->rm_col[CODE_Q].rc_abd, + rm->rm_col[CODE_R].rc_abd + }; + unsigned coeff[MUL_CNT]; + raidz_rec_qr_coeff(rm, tgtidx, coeff); + + /* + * Check if some of targets is shorter then others + * In this case, shorter target needs to be replaced with + * new buffer so that syndrome can be calculated. + */ + if (ysize < xsize) { + yabd = abd_alloc(xsize, B_FALSE); + tabds[1] = yabd; + } + + raidz_math_begin(); + + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + raidz_zero(yabd, xsize); + } + + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x || c == y) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + raidz_syn_qr_abd); + } + + abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_qr_abd, coeff); + + /* + * Copy shorter targets back to the original abd buffer + */ + if (ysize < xsize) + raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + + raidz_math_end(); + + if (ysize < xsize) + abd_free(yabd); + + + return ((1 << CODE_Q) | (1 << CODE_R)); +} + + +/* + * Generate P, Q, and R syndromes + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) + */ +static void +raidz_syn_pqr_abd(void **c, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *)c[TARGET_X]; + v_t *y = (v_t *)c[TARGET_Y]; + v_t *z = (v_t *)c[TARGET_Z]; + const v_t * const yend = y + (tsize / sizeof (v_t)); + const v_t *d = (const v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + + SYN_PQR_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE, + z += SYN_STRIDE) { + LOAD(d, SYN_PQR_D); + P_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, x) + Q_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, y); + R_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, z); + } + for (; y < yend; y += SYN_STRIDE, z += SYN_STRIDE) { + Q_SYNDROME(SYN_PQR_X, y); + R_SYNDROME(SYN_PQR_X, z); + } +} + + +/* + * Reconstruct data using PRQ parity and PQR syndromes + * + * @tc syndrome/result columns + * @tsize size of syndrome/result columns + * @c parity columns + * @mul array of multiplication constants + */ +static void +raidz_rec_pqr_abd(void **t, const size_t tsize, void **c, + const unsigned * const mul) +{ + v_t *x = (v_t *)t[TARGET_X]; + v_t *y = (v_t *)t[TARGET_Y]; + v_t *z = (v_t *)t[TARGET_Z]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *p = (v_t *)c[CODE_P]; + const v_t *q = (v_t *)c[CODE_Q]; + const v_t *r = (v_t *)c[CODE_R]; + + REC_PQR_DEFINE(); + + for (; x < xend; x += REC_PQR_STRIDE, y += REC_PQR_STRIDE, + z += REC_PQR_STRIDE, p += REC_PQR_STRIDE, q += REC_PQR_STRIDE, + r += REC_PQR_STRIDE) { + LOAD(x, REC_PQR_X); + LOAD(y, REC_PQR_Y); + LOAD(z, REC_PQR_Z); + + XOR_ACC(p, REC_PQR_X); + XOR_ACC(q, REC_PQR_Y); + XOR_ACC(r, REC_PQR_Z); + + /* Save Pxyz and Qxyz */ + COPY(REC_PQR_X, REC_PQR_XS); + COPY(REC_PQR_Y, REC_PQR_YS); + + /* Calc X */ + MUL(mul[MUL_PQR_XP], 
+
+
+/*
+ * Reconstruct data using PQR parity and PQR syndromes
+ *
+ * @tc		syndrome/result columns
+ * @tsize	size of syndrome/result columns
+ * @c		parity columns
+ * @mul		array of multiplication constants
+ */
+static void
+raidz_rec_pqr_abd(void **t, const size_t tsize, void **c,
+    const unsigned * const mul)
+{
+	v_t *x = (v_t *)t[TARGET_X];
+	v_t *y = (v_t *)t[TARGET_Y];
+	v_t *z = (v_t *)t[TARGET_Z];
+	const v_t * const xend = x + (tsize / sizeof (v_t));
+	const v_t *p = (v_t *)c[CODE_P];
+	const v_t *q = (v_t *)c[CODE_Q];
+	const v_t *r = (v_t *)c[CODE_R];
+
+	REC_PQR_DEFINE();
+
+	for (; x < xend; x += REC_PQR_STRIDE, y += REC_PQR_STRIDE,
+	    z += REC_PQR_STRIDE, p += REC_PQR_STRIDE, q += REC_PQR_STRIDE,
+	    r += REC_PQR_STRIDE) {
+		LOAD(x, REC_PQR_X);
+		LOAD(y, REC_PQR_Y);
+		LOAD(z, REC_PQR_Z);
+
+		XOR_ACC(p, REC_PQR_X);
+		XOR_ACC(q, REC_PQR_Y);
+		XOR_ACC(r, REC_PQR_Z);
+
+		/* Save Pxyz and Qxyz */
+		COPY(REC_PQR_X, REC_PQR_XS);
+		COPY(REC_PQR_Y, REC_PQR_YS);
+
+		/* Calc X */
+		MUL(mul[MUL_PQR_XP], REC_PQR_X);	/* Xp = Pxyz * xp */
+		MUL(mul[MUL_PQR_XQ], REC_PQR_Y);	/* Xq = Qxyz * xq */
+		XOR(REC_PQR_Y, REC_PQR_X);
+		MUL(mul[MUL_PQR_XR], REC_PQR_Z);	/* Xr = Rxyz * xr */
+		XOR(REC_PQR_Z, REC_PQR_X);	/* X = Xp + Xq + Xr */
+		STORE(x, REC_PQR_X);
+
+		/* Calc Y */
+		XOR(REC_PQR_X, REC_PQR_XS);	/* Pyz = Pxyz + X */
+		MUL(mul[MUL_PQR_YU], REC_PQR_X);	/* Xq = X * upd_q */
+		XOR(REC_PQR_X, REC_PQR_YS);	/* Qyz = Qxyz + Xq */
+		COPY(REC_PQR_XS, REC_PQR_X);	/* restore Pyz */
+		MUL(mul[MUL_PQR_YP], REC_PQR_X);	/* Yp = Pyz * yp */
+		MUL(mul[MUL_PQR_YQ], REC_PQR_YS);	/* Yq = Qyz * yq */
+		XOR(REC_PQR_X, REC_PQR_YS);	/* Y = Yp + Yq */
+		STORE(y, REC_PQR_YS);
+
+		/* Calc Z */
+		XOR(REC_PQR_XS, REC_PQR_YS);	/* Z = Pz = Pyz + Y */
+		STORE(z, REC_PQR_YS);
+	}
+}
+
+
+/*
+ * Reconstruct three data columns using PQR parity
+ *
+ * @syn_method	raidz_syn_pqr_abd()
+ * @rec_method	raidz_rec_pqr_abd()
+ *
+ * @rm		RAIDZ map
+ * @tgtidx	array of missing data indexes
+ */
+static raidz_inline int
+raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx)
+{
+	size_t c;
+	size_t dsize;
+	abd_t *dabd;
+	const size_t firstdc = raidz_parity(rm);
+	const size_t ncols = raidz_ncols(rm);
+	const size_t x = tgtidx[TARGET_X];
+	const size_t y = tgtidx[TARGET_Y];
+	const size_t z = tgtidx[TARGET_Z];
+	const size_t xsize = rm->rm_col[x].rc_size;
+	const size_t ysize = rm->rm_col[y].rc_size;
+	const size_t zsize = rm->rm_col[z].rc_size;
+	abd_t *xabd = rm->rm_col[x].rc_abd;
+	abd_t *yabd = rm->rm_col[y].rc_abd;
+	abd_t *zabd = rm->rm_col[z].rc_abd;
+	abd_t *tabds[] = { xabd, yabd, zabd };
+	abd_t *cabds[] = {
+		rm->rm_col[CODE_P].rc_abd,
+		rm->rm_col[CODE_Q].rc_abd,
+		rm->rm_col[CODE_R].rc_abd
+	};
+	unsigned coeff[MUL_CNT];
+	raidz_rec_pqr_coeff(rm, tgtidx, coeff);
+
+	/*
+	 * Check if any of the targets is shorter than the others. In this
+	 * case the shorter targets need to be replaced with new buffers
+	 * so that the syndrome can be calculated on the full length.
+	 */
+	if (ysize < xsize) {
+		yabd = abd_alloc(xsize, B_FALSE);
+		tabds[1] = yabd;
+	}
+	if (zsize < xsize) {
+		zabd = abd_alloc(xsize, B_FALSE);
+		tabds[2] = zabd;
+	}
+
+	raidz_math_begin();
+
+	/* Start with first data column if present */
+	if (firstdc != x) {
+		raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
+		raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
+		raidz_copy(zabd, rm->rm_col[firstdc].rc_abd, xsize);
+	} else {
+		raidz_zero(xabd, xsize);
+		raidz_zero(yabd, xsize);
+		raidz_zero(zabd, xsize);
+	}
+
+	/* generate p, q and r syndromes */
+	for (c = firstdc + 1; c < ncols; c++) {
+		if (c == x || c == y || c == z) {
+			dabd = NULL;
+			dsize = 0;
+		} else {
+			dabd = rm->rm_col[c].rc_abd;
+			dsize = rm->rm_col[c].rc_size;
+		}
+
+		abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 3,
+		    raidz_syn_pqr_abd);
+	}
+
+	abd_raidz_rec_iterate(cabds, tabds, xsize, 3, raidz_rec_pqr_abd, coeff);
+
+	/*
+	 * Copy shorter targets back to the original abd buffer
+	 */
+	if (ysize < xsize)
+		raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
+	if (zsize < xsize)
+		raidz_copy(rm->rm_col[z].rc_abd, zabd, zsize);
+
+	raidz_math_end();
+
+	if (ysize < xsize)
+		abd_free(yabd);
+	if (zsize < xsize)
+		abd_free(zabd);
+
+	return ((1 << CODE_P) | (1 << CODE_Q) | (1 << CODE_R));
+}
+
+#endif /* _VDEV_RAIDZ_MATH_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz_math_scalar.c b/usr/src/uts/common/fs/zfs/vdev_raidz_math_scalar.c
new file mode 100644
index 0000000000..cd742e146c
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz_math_scalar.c
@@ -0,0 +1,337 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/vdev_raidz_impl.h>
+
+/*
+ * Provide native CPU scalar routines.
+ * Supports 32-bit and 64-bit CPUs.
+ */
+#if ((~(0x0ULL)) >> 24) == 0xffULL
+#define ELEM_SIZE 4
+typedef uint32_t iv_t;
+#elif ((~(0x0ULL)) >> 56) == 0xffULL
+#define ELEM_SIZE 8
+typedef uint64_t iv_t;
+#endif
+
+/*
+ * Vector type used in the scalar implementation
+ *
+ * The union is expected to be of native CPU register size. Since addition
+ * is performed with XOR, it can be applied to all byte elements at once.
+ * Multiplication requires per-byte access.
+ */
+typedef union {
+	iv_t e;
+	uint8_t b[ELEM_SIZE];
+} v_t;
+
+/*
+ * Precomputed lookup tables for multiplication by a constant
+ *
+ * The reconstruction path requires multiplication by constant factors.
+ * Instead of performing a two-step lookup (log & exp tables), a direct
+ * lookup can be used. Multiplication of element 'a' by a constant 'c' is
+ * obtained as:
+ *
+ *	r = vdev_raidz_mul_lt[c_log][a];
+ *
+ * where c_log = vdev_raidz_log2[c]. The log of a coefficient is used
+ * because it is faster to obtain while solving the syndrome equations.
+ *
+ * PERFORMANCE NOTE:
+ * Even though the complete lookup table uses 64kiB, only a relatively
+ * small portion of it is used at any one time. The following shows the
+ * number of bytes accessed in different cases:
+ *	- 1 failed disk: 256B (1 mul. coefficient)
+ *	- 2 failed disks: 512B (2 mul. coefficients)
+ *	- 3 failed disks: 1536B (6 mul. coefficients)
+ *
+ * The accessed lookup table region is larger than with the traditional
+ * log/exp method only when reconstructing 3 failed disks, but since the
+ * result is obtained in a single lookup step, performance is doubled.
+ */
+static uint8_t vdev_raidz_mul_lt[256][256] __attribute__((aligned(256)));
+
+static void
+raidz_init_scalar(void)
+{
+	int c, i;
+	for (c = 0; c < 256; c++)
+		for (i = 0; i < 256; i++)
+			vdev_raidz_mul_lt[c][i] = gf_mul(c, i);
+}
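To make the comment above concrete, here is a minimal stand-alone user-space sketch (a hypothetical test harness, not part of the patch) that rebuilds the same three tables from a bitwise GF(2^8) multiply and checks that the one-step direct lookup, the two-step log/exp lookup, and the packed-word doubling used by the MUL2 macro below all agree; gf_mul() here is a local stand-in for the kernel helper of the same name:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Bitwise multiply in GF(2^8) modulo the RAID-Z polynomial x^8+x^4+x^3+x^2+1 */
static uint8_t
gf_mul(uint8_t a, uint8_t b)
{
	uint8_t r = 0;

	while (b != 0) {
		if (b & 1)
			r ^= a;
		a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
		b >>= 1;
	}
	return (r);
}

static uint8_t gf_pow2[256], gf_log2[256], mul_lt[256][256];

int
main(void)
{
	int a, c, i;
	uint8_t v = 1;

	/* exp/log tables: 2 is a generator of the multiplicative group */
	for (i = 0; i < 255; i++) {
		gf_pow2[i] = v;
		gf_log2[v] = (uint8_t)i;
		v = gf_mul(v, 2);
	}
	/* direct table, filled exactly like raidz_init_scalar() above */
	for (c = 0; c < 256; c++)
		for (i = 0; i < 256; i++)
			mul_lt[c][i] = gf_mul((uint8_t)c, (uint8_t)i);

	/* two-step log/exp lookup == one-step direct lookup */
	for (c = 1; c < 256; c++)
		for (a = 1; a < 256; a++)
			assert(gf_pow2[(gf_log2[a] + gf_log2[c]) % 255] ==
			    mul_lt[c][a]);

	/* branchless x2 on a packed 64-bit word, as in the MUL2 macro */
	uint64_t w = 0x8001aa55ff7f10e0ULL;
	uint64_t m = w & 0x8080808080808080ULL;	/* msb of each byte lane */
	m = (m << 1) - (m >> 7);		/* 0xff in lanes with msb set */
	uint64_t d = ((w << 1) & 0xfefefefefefefefeULL) ^
	    (m & 0x1d1d1d1d1d1d1d1dULL);	/* conditionally fold in 0x1d */
	for (i = 0; i < 8; i++)
		assert(((d >> (8 * i)) & 0xff) ==
		    gf_mul((uint8_t)((w >> (8 * i)) & 0xff), 2));

	(void) printf("direct lookup, log/exp lookup and MUL2 agree\n");
	return (0);
}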
+
+#define PREFETCHNTA(ptr, offset) {}
+#define PREFETCH(ptr, offset) {}
+
+#define XOR_ACC(src, acc) acc.e ^= ((v_t *)src)[0].e
+#define XOR(src, acc) acc.e ^= src.e
+#define ZERO(acc) acc.e = 0
+#define COPY(src, dst) dst = src
+#define LOAD(src, val) val = ((v_t *)src)[0]
+#define STORE(dst, val) ((v_t *)dst)[0] = val
+
+/*
+ * Constants used for optimized multiplication by 2.
+ */
+static const struct {
+	iv_t mod;
+	iv_t mask;
+	iv_t msb;
+} scalar_mul2_consts = {
+#if ELEM_SIZE == 8
+	.mod  = 0x1d1d1d1d1d1d1d1dULL,
+	.mask = 0xfefefefefefefefeULL,
+	.msb  = 0x8080808080808080ULL,
+#else
+	.mod  = 0x1d1d1d1dULL,
+	.mask = 0xfefefefeULL,
+	.msb  = 0x80808080ULL,
+#endif
+};
+
+#define MUL2_SETUP() {}
+
+#define MUL2(a) \
+{ \
+	iv_t _mask; \
+ \
+	_mask = (a).e & scalar_mul2_consts.msb; \
+	_mask = (_mask << 1) - (_mask >> 7); \
+	(a).e = ((a).e << 1) & scalar_mul2_consts.mask; \
+	(a).e = (a).e ^ (_mask & scalar_mul2_consts.mod); \
+}
+
+#define MUL4(a) \
+{ \
+	MUL2(a); \
+	MUL2(a); \
+}
+
+#define MUL(c, a) \
+{ \
+	const uint8_t *mul_lt = vdev_raidz_mul_lt[c]; \
+	switch (ELEM_SIZE) { \
+	case 8: \
+		a.b[7] = mul_lt[a.b[7]]; \
+		a.b[6] = mul_lt[a.b[6]]; \
+		a.b[5] = mul_lt[a.b[5]]; \
+		a.b[4] = mul_lt[a.b[4]]; \
+		/* falls through */ \
+	case 4: \
+		a.b[3] = mul_lt[a.b[3]]; \
+		a.b[2] = mul_lt[a.b[2]]; \
+		a.b[1] = mul_lt[a.b[1]]; \
+		a.b[0] = mul_lt[a.b[0]]; \
+		break; \
+	} \
+}
+
+#define raidz_math_begin() {}
+#define raidz_math_end() {}
+
+#define SYN_STRIDE 1
+
+#define ZERO_DEFINE() v_t d0
+#define ZERO_STRIDE 1
+#define ZERO_D d0
+
+#define COPY_DEFINE() v_t d0
+#define COPY_STRIDE 1
+#define COPY_D d0
+
+#define ADD_DEFINE() v_t d0
+#define ADD_STRIDE 1
+#define ADD_D d0
+
+#define MUL_DEFINE() v_t d0
+#define MUL_STRIDE 1
+#define MUL_D d0
+
+#define GEN_P_STRIDE 1
+#define GEN_P_DEFINE() v_t p0
+#define GEN_P_P p0
+
+#define GEN_PQ_STRIDE 1
+#define GEN_PQ_DEFINE() v_t d0, c0
+#define GEN_PQ_D d0
+#define GEN_PQ_C c0
+
+#define GEN_PQR_STRIDE 1
+#define GEN_PQR_DEFINE() v_t d0, c0
+#define GEN_PQR_D d0
+#define GEN_PQR_C c0
+
+#define SYN_Q_DEFINE() v_t d0, x0
+#define SYN_Q_D d0
+#define SYN_Q_X x0
+
+
+#define SYN_R_DEFINE() v_t d0, x0
+#define SYN_R_D d0
+#define SYN_R_X x0
+
+
+#define SYN_PQ_DEFINE() v_t d0, x0
+#define SYN_PQ_D d0
+#define SYN_PQ_X x0
+
+
+#define REC_PQ_STRIDE 1
+#define REC_PQ_DEFINE() v_t x0, y0, t0
+#define REC_PQ_X x0
+#define REC_PQ_Y y0
+#define REC_PQ_T t0
+
+
+#define SYN_PR_DEFINE() v_t d0, x0
+#define SYN_PR_D d0
+#define SYN_PR_X x0
+
+#define REC_PR_STRIDE 1
+#define REC_PR_DEFINE() v_t x0, y0, t0
+#define
REC_PR_X x0 +#define REC_PR_Y y0 +#define REC_PR_T t0 + + +#define SYN_QR_DEFINE() v_t d0, x0 +#define SYN_QR_D d0 +#define SYN_QR_X x0 + + +#define REC_QR_STRIDE 1 +#define REC_QR_DEFINE() v_t x0, y0, t0 +#define REC_QR_X x0 +#define REC_QR_Y y0 +#define REC_QR_T t0 + + +#define SYN_PQR_DEFINE() v_t d0, x0 +#define SYN_PQR_D d0 +#define SYN_PQR_X x0 + +#define REC_PQR_STRIDE 1 +#define REC_PQR_DEFINE() v_t x0, y0, z0, xs0, ys0 +#define REC_PQR_X x0 +#define REC_PQR_Y y0 +#define REC_PQR_Z z0 +#define REC_PQR_XS xs0 +#define REC_PQR_YS ys0 + +#include "vdev_raidz_math_impl.h" + +DEFINE_GEN_METHODS(scalar); +DEFINE_REC_METHODS(scalar); + +boolean_t +raidz_will_scalar_work(void) +{ + return (B_TRUE); /* always */ +} + +const raidz_impl_ops_t vdev_raidz_scalar_impl = { + .init = raidz_init_scalar, + .fini = NULL, + .gen = RAIDZ_GEN_METHODS(scalar), + .rec = RAIDZ_REC_METHODS(scalar), + .is_supported = &raidz_will_scalar_work, + .name = "scalar" +}; + +/* Powers of 2 in the RAID-Z Galois field. */ +const uint8_t vdev_raidz_pow2[256] __attribute__((aligned(256))) = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, + 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, + 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, + 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, + 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, + 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, + 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, + 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, + 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, + 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, + 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, + 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, + 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, + 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, + 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, + 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, + 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, + 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, + 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, + 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, + 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, + 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, + 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, + 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, + 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, + 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, + 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, + 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, + 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, + 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, + 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 +}; + +/* Logs of 2 in the RAID-Z Galois field. 
 */
+const uint8_t vdev_raidz_log2[256] __attribute__((aligned(256))) = {
+	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
+	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
+	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
+	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
+	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
+	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
+	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
+	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
+	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
+	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
+	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
+	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
+	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
+	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
+	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
+	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
+	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
+	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
+	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
+	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
+	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
+	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
+	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
+	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
+	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
+	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
+	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
+	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
+	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
+	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
+	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
+	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
+};
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz_math_sse2.c b/usr/src/uts/common/fs/zfs/vdev_raidz_math_sse2.c
new file mode 100644
index 0000000000..569f73006b
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz_math_sse2.c
@@ -0,0 +1,642 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/isa_defs.h>
+
+#if defined(__amd64)
+
+#include <sys/types.h>
+#include <sys/simd.h>
+#include <sys/debug.h>
+
+#define __asm __asm__ __volatile__
+
+#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
+#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
+
+#define VR0_(REG, ...) "xmm"#REG
+#define VR1_(_1, REG, ...) "xmm"#REG
+#define VR2_(_1, _2, REG, ...) "xmm"#REG
+#define VR3_(_1, _2, _3, REG, ...) "xmm"#REG
+#define VR4_(_1, _2, _3, _4, REG, ...) "xmm"#REG
+#define VR5_(_1, _2, _3, _4, _5, REG, ...) "xmm"#REG
+#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "xmm"#REG
+#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "xmm"#REG
+
+#define VR0(r...)
VR0_(r, 1, 2, 3, 4, 5, 6) +#define VR1(r...) VR1_(r, 1, 2, 3, 4, 5, 6) +#define VR2(r...) VR2_(r, 1, 2, 3, 4, 5, 6) +#define VR3(r...) VR3_(r, 1, 2, 3, 4, 5, 6) +#define VR4(r...) VR4_(r, 1, 2, 3, 4, 5, 6) +#define VR5(r...) VR5_(r, 1, 2, 3, 4, 5, 6) +#define VR6(r...) VR6_(r, 1, 2, 3, 4, 5, 6) +#define VR7(r...) VR7_(r, 1, 2, 3, 4, 5, 6) + +#define ELEM_SIZE 16 + +typedef struct v { + uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE))); +} v_t; + +#define XOR_ACC(src, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "pxor 0x00(%[SRC]), %%" VR0(r) "\n" \ + "pxor 0x10(%[SRC]), %%" VR1(r) "\n" \ + "pxor 0x20(%[SRC]), %%" VR2(r) "\n" \ + "pxor 0x30(%[SRC]), %%" VR3(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + case 2: \ + __asm( \ + "pxor 0x00(%[SRC]), %%" VR0(r) "\n" \ + "pxor 0x10(%[SRC]), %%" VR1(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + case 1: \ + __asm("pxor 0x00(%[SRC]), %%" VR0(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + } \ +} + +#define XOR(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "pxor %" VR0(r) ", %" VR4(r) "\n" \ + "pxor %" VR1(r) ", %" VR5(r) "\n" \ + "pxor %" VR2(r) ", %" VR6(r) "\n" \ + "pxor %" VR3(r) ", %" VR7(r)); \ + break; \ + case 4: \ + __asm( \ + "pxor %" VR0(r) ", %" VR2(r) "\n" \ + "pxor %" VR1(r) ", %" VR3(r)); \ + break; \ + case 2: \ + __asm( \ + "pxor %" VR0(r) ", %" VR1(r)); \ + break; \ + } \ +} + +#define ZERO(r...) XOR(r, r) + +#define COPY(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "movdqa %" VR0(r) ", %" VR4(r) "\n" \ + "movdqa %" VR1(r) ", %" VR5(r) "\n" \ + "movdqa %" VR2(r) ", %" VR6(r) "\n" \ + "movdqa %" VR3(r) ", %" VR7(r)); \ + break; \ + case 4: \ + __asm( \ + "movdqa %" VR0(r) ", %" VR2(r) "\n" \ + "movdqa %" VR1(r) ", %" VR3(r)); \ + break; \ + case 2: \ + __asm( \ + "movdqa %" VR0(r) ", %" VR1(r)); \ + break; \ + default: \ + VERIFY(0); \ + } \ +} + +#define LOAD(src, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \ + "movdqa 0x10(%[SRC]), %%" VR1(r) "\n" \ + "movdqa 0x20(%[SRC]), %%" VR2(r) "\n" \ + "movdqa 0x30(%[SRC]), %%" VR3(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + case 2: \ + __asm( \ + "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \ + "movdqa 0x10(%[SRC]), %%" VR1(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + case 1: \ + __asm( \ + "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + } \ +} + +#define STORE(dst, r...) 
\ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "movdqa %%" VR0(r)", 0x00(%[DST])\n" \ + "movdqa %%" VR1(r)", 0x10(%[DST])\n" \ + "movdqa %%" VR2(r)", 0x20(%[DST])\n" \ + "movdqa %%" VR3(r)", 0x30(%[DST])\n" \ + : : [DST] "r" (dst)); \ + break; \ + case 2: \ + __asm( \ + "movdqa %%" VR0(r)", 0x00(%[DST])\n" \ + "movdqa %%" VR1(r)", 0x10(%[DST])\n" \ + : : [DST] "r" (dst)); \ + break; \ + case 1: \ + __asm( \ + "movdqa %%" VR0(r)", 0x00(%[DST])\n" \ + : : [DST] "r" (dst)); \ + break; \ + default: \ + VERIFY(0); \ + } \ +} + +#define MUL2_SETUP() \ +{ \ + __asm( \ + "movd %[mask], %%xmm15\n" \ + "pshufd $0x0, %%xmm15, %%xmm15\n" \ + : : [mask] "r" (0x1d1d1d1d)); \ +} + +#define _MUL2_x1(a0) \ +{ \ + __asm( \ + "pxor %xmm14, %xmm14\n" \ + "pcmpgtb %" a0", %xmm14\n" \ + "pand %xmm15, %xmm14\n" \ + "paddb %" a0", %" a0 "\n" \ + "pxor %xmm14, %" a0); \ +} + +#define _MUL2_x2(a0, a1) \ +{ \ + __asm( \ + "pxor %xmm14, %xmm14\n" \ + "pxor %xmm13, %xmm13\n" \ + "pcmpgtb %" a0", %xmm14\n" \ + "pcmpgtb %" a1", %xmm13\n" \ + "pand %xmm15, %xmm14\n" \ + "pand %xmm15, %xmm13\n" \ + "paddb %" a0", %" a0 "\n" \ + "paddb %" a1", %" a1 "\n" \ + "pxor %xmm14, %" a0 "\n" \ + "pxor %xmm13, %" a1); \ +} + +#define MUL2(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + _MUL2_x2(VR0(r), VR1(r)); \ + _MUL2_x2(VR2(r), VR3(r)); \ + break; \ + case 2: \ + _MUL2_x2(VR0(r), VR1(r)); \ + break; \ + case 1: \ + _MUL2_x1(VR0(r)); \ + break; \ + } \ +} + +#define MUL4(r...) \ +{ \ + MUL2(r); \ + MUL2(r); \ +} + +/* General multiplication by adding powers of two */ + +#define _MUL_PARAM(x, in, acc) \ +{ \ + if (x & 0x01) { COPY(in, acc); } else { ZERO(acc); } \ + if (x & 0xfe) { MUL2(in); } \ + if (x & 0x02) { XOR(in, acc); } \ + if (x & 0xfc) { MUL2(in); } \ + if (x & 0x04) { XOR(in, acc); } \ + if (x & 0xf8) { MUL2(in); } \ + if (x & 0x08) { XOR(in, acc); } \ + if (x & 0xf0) { MUL2(in); } \ + if (x & 0x10) { XOR(in, acc); } \ + if (x & 0xe0) { MUL2(in); } \ + if (x & 0x20) { XOR(in, acc); } \ + if (x & 0xc0) { MUL2(in); } \ + if (x & 0x40) { XOR(in, acc); } \ + if (x & 0x80) { MUL2(in); XOR(in, acc); } \ +} + +#define _mul_x1_in 11 +#define _mul_x1_acc 12 + +#define MUL_x1_DEFINE(x) \ +static void \ +mul_x1_ ## x(void) { _MUL_PARAM(x, _mul_x1_in, _mul_x1_acc); } + +#define _mul_x2_in 9, 10 +#define _mul_x2_acc 11, 12 + +#define MUL_x2_DEFINE(x) \ +static void \ +mul_x2_ ## x(void) { _MUL_PARAM(x, _mul_x2_in, _mul_x2_acc); } + +MUL_x1_DEFINE(0); MUL_x1_DEFINE(1); MUL_x1_DEFINE(2); MUL_x1_DEFINE(3); +MUL_x1_DEFINE(4); MUL_x1_DEFINE(5); MUL_x1_DEFINE(6); MUL_x1_DEFINE(7); +MUL_x1_DEFINE(8); MUL_x1_DEFINE(9); MUL_x1_DEFINE(10); MUL_x1_DEFINE(11); +MUL_x1_DEFINE(12); MUL_x1_DEFINE(13); MUL_x1_DEFINE(14); MUL_x1_DEFINE(15); +MUL_x1_DEFINE(16); MUL_x1_DEFINE(17); MUL_x1_DEFINE(18); MUL_x1_DEFINE(19); +MUL_x1_DEFINE(20); MUL_x1_DEFINE(21); MUL_x1_DEFINE(22); MUL_x1_DEFINE(23); +MUL_x1_DEFINE(24); MUL_x1_DEFINE(25); MUL_x1_DEFINE(26); MUL_x1_DEFINE(27); +MUL_x1_DEFINE(28); MUL_x1_DEFINE(29); MUL_x1_DEFINE(30); MUL_x1_DEFINE(31); +MUL_x1_DEFINE(32); MUL_x1_DEFINE(33); MUL_x1_DEFINE(34); MUL_x1_DEFINE(35); +MUL_x1_DEFINE(36); MUL_x1_DEFINE(37); MUL_x1_DEFINE(38); MUL_x1_DEFINE(39); +MUL_x1_DEFINE(40); MUL_x1_DEFINE(41); MUL_x1_DEFINE(42); MUL_x1_DEFINE(43); +MUL_x1_DEFINE(44); MUL_x1_DEFINE(45); MUL_x1_DEFINE(46); MUL_x1_DEFINE(47); +MUL_x1_DEFINE(48); MUL_x1_DEFINE(49); MUL_x1_DEFINE(50); MUL_x1_DEFINE(51); +MUL_x1_DEFINE(52); MUL_x1_DEFINE(53); MUL_x1_DEFINE(54); MUL_x1_DEFINE(55); +MUL_x1_DEFINE(56); 
MUL_x1_DEFINE(57); MUL_x1_DEFINE(58); MUL_x1_DEFINE(59); +MUL_x1_DEFINE(60); MUL_x1_DEFINE(61); MUL_x1_DEFINE(62); MUL_x1_DEFINE(63); +MUL_x1_DEFINE(64); MUL_x1_DEFINE(65); MUL_x1_DEFINE(66); MUL_x1_DEFINE(67); +MUL_x1_DEFINE(68); MUL_x1_DEFINE(69); MUL_x1_DEFINE(70); MUL_x1_DEFINE(71); +MUL_x1_DEFINE(72); MUL_x1_DEFINE(73); MUL_x1_DEFINE(74); MUL_x1_DEFINE(75); +MUL_x1_DEFINE(76); MUL_x1_DEFINE(77); MUL_x1_DEFINE(78); MUL_x1_DEFINE(79); +MUL_x1_DEFINE(80); MUL_x1_DEFINE(81); MUL_x1_DEFINE(82); MUL_x1_DEFINE(83); +MUL_x1_DEFINE(84); MUL_x1_DEFINE(85); MUL_x1_DEFINE(86); MUL_x1_DEFINE(87); +MUL_x1_DEFINE(88); MUL_x1_DEFINE(89); MUL_x1_DEFINE(90); MUL_x1_DEFINE(91); +MUL_x1_DEFINE(92); MUL_x1_DEFINE(93); MUL_x1_DEFINE(94); MUL_x1_DEFINE(95); +MUL_x1_DEFINE(96); MUL_x1_DEFINE(97); MUL_x1_DEFINE(98); MUL_x1_DEFINE(99); +MUL_x1_DEFINE(100); MUL_x1_DEFINE(101); MUL_x1_DEFINE(102); MUL_x1_DEFINE(103); +MUL_x1_DEFINE(104); MUL_x1_DEFINE(105); MUL_x1_DEFINE(106); MUL_x1_DEFINE(107); +MUL_x1_DEFINE(108); MUL_x1_DEFINE(109); MUL_x1_DEFINE(110); MUL_x1_DEFINE(111); +MUL_x1_DEFINE(112); MUL_x1_DEFINE(113); MUL_x1_DEFINE(114); MUL_x1_DEFINE(115); +MUL_x1_DEFINE(116); MUL_x1_DEFINE(117); MUL_x1_DEFINE(118); MUL_x1_DEFINE(119); +MUL_x1_DEFINE(120); MUL_x1_DEFINE(121); MUL_x1_DEFINE(122); MUL_x1_DEFINE(123); +MUL_x1_DEFINE(124); MUL_x1_DEFINE(125); MUL_x1_DEFINE(126); MUL_x1_DEFINE(127); +MUL_x1_DEFINE(128); MUL_x1_DEFINE(129); MUL_x1_DEFINE(130); MUL_x1_DEFINE(131); +MUL_x1_DEFINE(132); MUL_x1_DEFINE(133); MUL_x1_DEFINE(134); MUL_x1_DEFINE(135); +MUL_x1_DEFINE(136); MUL_x1_DEFINE(137); MUL_x1_DEFINE(138); MUL_x1_DEFINE(139); +MUL_x1_DEFINE(140); MUL_x1_DEFINE(141); MUL_x1_DEFINE(142); MUL_x1_DEFINE(143); +MUL_x1_DEFINE(144); MUL_x1_DEFINE(145); MUL_x1_DEFINE(146); MUL_x1_DEFINE(147); +MUL_x1_DEFINE(148); MUL_x1_DEFINE(149); MUL_x1_DEFINE(150); MUL_x1_DEFINE(151); +MUL_x1_DEFINE(152); MUL_x1_DEFINE(153); MUL_x1_DEFINE(154); MUL_x1_DEFINE(155); +MUL_x1_DEFINE(156); MUL_x1_DEFINE(157); MUL_x1_DEFINE(158); MUL_x1_DEFINE(159); +MUL_x1_DEFINE(160); MUL_x1_DEFINE(161); MUL_x1_DEFINE(162); MUL_x1_DEFINE(163); +MUL_x1_DEFINE(164); MUL_x1_DEFINE(165); MUL_x1_DEFINE(166); MUL_x1_DEFINE(167); +MUL_x1_DEFINE(168); MUL_x1_DEFINE(169); MUL_x1_DEFINE(170); MUL_x1_DEFINE(171); +MUL_x1_DEFINE(172); MUL_x1_DEFINE(173); MUL_x1_DEFINE(174); MUL_x1_DEFINE(175); +MUL_x1_DEFINE(176); MUL_x1_DEFINE(177); MUL_x1_DEFINE(178); MUL_x1_DEFINE(179); +MUL_x1_DEFINE(180); MUL_x1_DEFINE(181); MUL_x1_DEFINE(182); MUL_x1_DEFINE(183); +MUL_x1_DEFINE(184); MUL_x1_DEFINE(185); MUL_x1_DEFINE(186); MUL_x1_DEFINE(187); +MUL_x1_DEFINE(188); MUL_x1_DEFINE(189); MUL_x1_DEFINE(190); MUL_x1_DEFINE(191); +MUL_x1_DEFINE(192); MUL_x1_DEFINE(193); MUL_x1_DEFINE(194); MUL_x1_DEFINE(195); +MUL_x1_DEFINE(196); MUL_x1_DEFINE(197); MUL_x1_DEFINE(198); MUL_x1_DEFINE(199); +MUL_x1_DEFINE(200); MUL_x1_DEFINE(201); MUL_x1_DEFINE(202); MUL_x1_DEFINE(203); +MUL_x1_DEFINE(204); MUL_x1_DEFINE(205); MUL_x1_DEFINE(206); MUL_x1_DEFINE(207); +MUL_x1_DEFINE(208); MUL_x1_DEFINE(209); MUL_x1_DEFINE(210); MUL_x1_DEFINE(211); +MUL_x1_DEFINE(212); MUL_x1_DEFINE(213); MUL_x1_DEFINE(214); MUL_x1_DEFINE(215); +MUL_x1_DEFINE(216); MUL_x1_DEFINE(217); MUL_x1_DEFINE(218); MUL_x1_DEFINE(219); +MUL_x1_DEFINE(220); MUL_x1_DEFINE(221); MUL_x1_DEFINE(222); MUL_x1_DEFINE(223); +MUL_x1_DEFINE(224); MUL_x1_DEFINE(225); MUL_x1_DEFINE(226); MUL_x1_DEFINE(227); +MUL_x1_DEFINE(228); MUL_x1_DEFINE(229); MUL_x1_DEFINE(230); MUL_x1_DEFINE(231); +MUL_x1_DEFINE(232); MUL_x1_DEFINE(233); 
MUL_x1_DEFINE(234); MUL_x1_DEFINE(235); +MUL_x1_DEFINE(236); MUL_x1_DEFINE(237); MUL_x1_DEFINE(238); MUL_x1_DEFINE(239); +MUL_x1_DEFINE(240); MUL_x1_DEFINE(241); MUL_x1_DEFINE(242); MUL_x1_DEFINE(243); +MUL_x1_DEFINE(244); MUL_x1_DEFINE(245); MUL_x1_DEFINE(246); MUL_x1_DEFINE(247); +MUL_x1_DEFINE(248); MUL_x1_DEFINE(249); MUL_x1_DEFINE(250); MUL_x1_DEFINE(251); +MUL_x1_DEFINE(252); MUL_x1_DEFINE(253); MUL_x1_DEFINE(254); MUL_x1_DEFINE(255); + +MUL_x2_DEFINE(0); MUL_x2_DEFINE(1); MUL_x2_DEFINE(2); MUL_x2_DEFINE(3); +MUL_x2_DEFINE(4); MUL_x2_DEFINE(5); MUL_x2_DEFINE(6); MUL_x2_DEFINE(7); +MUL_x2_DEFINE(8); MUL_x2_DEFINE(9); MUL_x2_DEFINE(10); MUL_x2_DEFINE(11); +MUL_x2_DEFINE(12); MUL_x2_DEFINE(13); MUL_x2_DEFINE(14); MUL_x2_DEFINE(15); +MUL_x2_DEFINE(16); MUL_x2_DEFINE(17); MUL_x2_DEFINE(18); MUL_x2_DEFINE(19); +MUL_x2_DEFINE(20); MUL_x2_DEFINE(21); MUL_x2_DEFINE(22); MUL_x2_DEFINE(23); +MUL_x2_DEFINE(24); MUL_x2_DEFINE(25); MUL_x2_DEFINE(26); MUL_x2_DEFINE(27); +MUL_x2_DEFINE(28); MUL_x2_DEFINE(29); MUL_x2_DEFINE(30); MUL_x2_DEFINE(31); +MUL_x2_DEFINE(32); MUL_x2_DEFINE(33); MUL_x2_DEFINE(34); MUL_x2_DEFINE(35); +MUL_x2_DEFINE(36); MUL_x2_DEFINE(37); MUL_x2_DEFINE(38); MUL_x2_DEFINE(39); +MUL_x2_DEFINE(40); MUL_x2_DEFINE(41); MUL_x2_DEFINE(42); MUL_x2_DEFINE(43); +MUL_x2_DEFINE(44); MUL_x2_DEFINE(45); MUL_x2_DEFINE(46); MUL_x2_DEFINE(47); +MUL_x2_DEFINE(48); MUL_x2_DEFINE(49); MUL_x2_DEFINE(50); MUL_x2_DEFINE(51); +MUL_x2_DEFINE(52); MUL_x2_DEFINE(53); MUL_x2_DEFINE(54); MUL_x2_DEFINE(55); +MUL_x2_DEFINE(56); MUL_x2_DEFINE(57); MUL_x2_DEFINE(58); MUL_x2_DEFINE(59); +MUL_x2_DEFINE(60); MUL_x2_DEFINE(61); MUL_x2_DEFINE(62); MUL_x2_DEFINE(63); +MUL_x2_DEFINE(64); MUL_x2_DEFINE(65); MUL_x2_DEFINE(66); MUL_x2_DEFINE(67); +MUL_x2_DEFINE(68); MUL_x2_DEFINE(69); MUL_x2_DEFINE(70); MUL_x2_DEFINE(71); +MUL_x2_DEFINE(72); MUL_x2_DEFINE(73); MUL_x2_DEFINE(74); MUL_x2_DEFINE(75); +MUL_x2_DEFINE(76); MUL_x2_DEFINE(77); MUL_x2_DEFINE(78); MUL_x2_DEFINE(79); +MUL_x2_DEFINE(80); MUL_x2_DEFINE(81); MUL_x2_DEFINE(82); MUL_x2_DEFINE(83); +MUL_x2_DEFINE(84); MUL_x2_DEFINE(85); MUL_x2_DEFINE(86); MUL_x2_DEFINE(87); +MUL_x2_DEFINE(88); MUL_x2_DEFINE(89); MUL_x2_DEFINE(90); MUL_x2_DEFINE(91); +MUL_x2_DEFINE(92); MUL_x2_DEFINE(93); MUL_x2_DEFINE(94); MUL_x2_DEFINE(95); +MUL_x2_DEFINE(96); MUL_x2_DEFINE(97); MUL_x2_DEFINE(98); MUL_x2_DEFINE(99); +MUL_x2_DEFINE(100); MUL_x2_DEFINE(101); MUL_x2_DEFINE(102); MUL_x2_DEFINE(103); +MUL_x2_DEFINE(104); MUL_x2_DEFINE(105); MUL_x2_DEFINE(106); MUL_x2_DEFINE(107); +MUL_x2_DEFINE(108); MUL_x2_DEFINE(109); MUL_x2_DEFINE(110); MUL_x2_DEFINE(111); +MUL_x2_DEFINE(112); MUL_x2_DEFINE(113); MUL_x2_DEFINE(114); MUL_x2_DEFINE(115); +MUL_x2_DEFINE(116); MUL_x2_DEFINE(117); MUL_x2_DEFINE(118); MUL_x2_DEFINE(119); +MUL_x2_DEFINE(120); MUL_x2_DEFINE(121); MUL_x2_DEFINE(122); MUL_x2_DEFINE(123); +MUL_x2_DEFINE(124); MUL_x2_DEFINE(125); MUL_x2_DEFINE(126); MUL_x2_DEFINE(127); +MUL_x2_DEFINE(128); MUL_x2_DEFINE(129); MUL_x2_DEFINE(130); MUL_x2_DEFINE(131); +MUL_x2_DEFINE(132); MUL_x2_DEFINE(133); MUL_x2_DEFINE(134); MUL_x2_DEFINE(135); +MUL_x2_DEFINE(136); MUL_x2_DEFINE(137); MUL_x2_DEFINE(138); MUL_x2_DEFINE(139); +MUL_x2_DEFINE(140); MUL_x2_DEFINE(141); MUL_x2_DEFINE(142); MUL_x2_DEFINE(143); +MUL_x2_DEFINE(144); MUL_x2_DEFINE(145); MUL_x2_DEFINE(146); MUL_x2_DEFINE(147); +MUL_x2_DEFINE(148); MUL_x2_DEFINE(149); MUL_x2_DEFINE(150); MUL_x2_DEFINE(151); +MUL_x2_DEFINE(152); MUL_x2_DEFINE(153); MUL_x2_DEFINE(154); MUL_x2_DEFINE(155); +MUL_x2_DEFINE(156); MUL_x2_DEFINE(157); 
MUL_x2_DEFINE(158); MUL_x2_DEFINE(159); +MUL_x2_DEFINE(160); MUL_x2_DEFINE(161); MUL_x2_DEFINE(162); MUL_x2_DEFINE(163); +MUL_x2_DEFINE(164); MUL_x2_DEFINE(165); MUL_x2_DEFINE(166); MUL_x2_DEFINE(167); +MUL_x2_DEFINE(168); MUL_x2_DEFINE(169); MUL_x2_DEFINE(170); MUL_x2_DEFINE(171); +MUL_x2_DEFINE(172); MUL_x2_DEFINE(173); MUL_x2_DEFINE(174); MUL_x2_DEFINE(175); +MUL_x2_DEFINE(176); MUL_x2_DEFINE(177); MUL_x2_DEFINE(178); MUL_x2_DEFINE(179); +MUL_x2_DEFINE(180); MUL_x2_DEFINE(181); MUL_x2_DEFINE(182); MUL_x2_DEFINE(183); +MUL_x2_DEFINE(184); MUL_x2_DEFINE(185); MUL_x2_DEFINE(186); MUL_x2_DEFINE(187); +MUL_x2_DEFINE(188); MUL_x2_DEFINE(189); MUL_x2_DEFINE(190); MUL_x2_DEFINE(191); +MUL_x2_DEFINE(192); MUL_x2_DEFINE(193); MUL_x2_DEFINE(194); MUL_x2_DEFINE(195); +MUL_x2_DEFINE(196); MUL_x2_DEFINE(197); MUL_x2_DEFINE(198); MUL_x2_DEFINE(199); +MUL_x2_DEFINE(200); MUL_x2_DEFINE(201); MUL_x2_DEFINE(202); MUL_x2_DEFINE(203); +MUL_x2_DEFINE(204); MUL_x2_DEFINE(205); MUL_x2_DEFINE(206); MUL_x2_DEFINE(207); +MUL_x2_DEFINE(208); MUL_x2_DEFINE(209); MUL_x2_DEFINE(210); MUL_x2_DEFINE(211); +MUL_x2_DEFINE(212); MUL_x2_DEFINE(213); MUL_x2_DEFINE(214); MUL_x2_DEFINE(215); +MUL_x2_DEFINE(216); MUL_x2_DEFINE(217); MUL_x2_DEFINE(218); MUL_x2_DEFINE(219); +MUL_x2_DEFINE(220); MUL_x2_DEFINE(221); MUL_x2_DEFINE(222); MUL_x2_DEFINE(223); +MUL_x2_DEFINE(224); MUL_x2_DEFINE(225); MUL_x2_DEFINE(226); MUL_x2_DEFINE(227); +MUL_x2_DEFINE(228); MUL_x2_DEFINE(229); MUL_x2_DEFINE(230); MUL_x2_DEFINE(231); +MUL_x2_DEFINE(232); MUL_x2_DEFINE(233); MUL_x2_DEFINE(234); MUL_x2_DEFINE(235); +MUL_x2_DEFINE(236); MUL_x2_DEFINE(237); MUL_x2_DEFINE(238); MUL_x2_DEFINE(239); +MUL_x2_DEFINE(240); MUL_x2_DEFINE(241); MUL_x2_DEFINE(242); MUL_x2_DEFINE(243); +MUL_x2_DEFINE(244); MUL_x2_DEFINE(245); MUL_x2_DEFINE(246); MUL_x2_DEFINE(247); +MUL_x2_DEFINE(248); MUL_x2_DEFINE(249); MUL_x2_DEFINE(250); MUL_x2_DEFINE(251); +MUL_x2_DEFINE(252); MUL_x2_DEFINE(253); MUL_x2_DEFINE(254); MUL_x2_DEFINE(255); + + + +typedef void (*mul_fn_ptr_t)(void); + +static const mul_fn_ptr_t __attribute__((aligned(256))) +gf_x1_mul_fns[256] = { + mul_x1_0, mul_x1_1, mul_x1_2, mul_x1_3, mul_x1_4, mul_x1_5, + mul_x1_6, mul_x1_7, mul_x1_8, mul_x1_9, mul_x1_10, mul_x1_11, + mul_x1_12, mul_x1_13, mul_x1_14, mul_x1_15, mul_x1_16, mul_x1_17, + mul_x1_18, mul_x1_19, mul_x1_20, mul_x1_21, mul_x1_22, mul_x1_23, + mul_x1_24, mul_x1_25, mul_x1_26, mul_x1_27, mul_x1_28, mul_x1_29, + mul_x1_30, mul_x1_31, mul_x1_32, mul_x1_33, mul_x1_34, mul_x1_35, + mul_x1_36, mul_x1_37, mul_x1_38, mul_x1_39, mul_x1_40, mul_x1_41, + mul_x1_42, mul_x1_43, mul_x1_44, mul_x1_45, mul_x1_46, mul_x1_47, + mul_x1_48, mul_x1_49, mul_x1_50, mul_x1_51, mul_x1_52, mul_x1_53, + mul_x1_54, mul_x1_55, mul_x1_56, mul_x1_57, mul_x1_58, mul_x1_59, + mul_x1_60, mul_x1_61, mul_x1_62, mul_x1_63, mul_x1_64, mul_x1_65, + mul_x1_66, mul_x1_67, mul_x1_68, mul_x1_69, mul_x1_70, mul_x1_71, + mul_x1_72, mul_x1_73, mul_x1_74, mul_x1_75, mul_x1_76, mul_x1_77, + mul_x1_78, mul_x1_79, mul_x1_80, mul_x1_81, mul_x1_82, mul_x1_83, + mul_x1_84, mul_x1_85, mul_x1_86, mul_x1_87, mul_x1_88, mul_x1_89, + mul_x1_90, mul_x1_91, mul_x1_92, mul_x1_93, mul_x1_94, mul_x1_95, + mul_x1_96, mul_x1_97, mul_x1_98, mul_x1_99, mul_x1_100, mul_x1_101, + mul_x1_102, mul_x1_103, mul_x1_104, mul_x1_105, mul_x1_106, mul_x1_107, + mul_x1_108, mul_x1_109, mul_x1_110, mul_x1_111, mul_x1_112, mul_x1_113, + mul_x1_114, mul_x1_115, mul_x1_116, mul_x1_117, mul_x1_118, mul_x1_119, + mul_x1_120, mul_x1_121, mul_x1_122, mul_x1_123, mul_x1_124, mul_x1_125, 
+ mul_x1_126, mul_x1_127, mul_x1_128, mul_x1_129, mul_x1_130, mul_x1_131, + mul_x1_132, mul_x1_133, mul_x1_134, mul_x1_135, mul_x1_136, mul_x1_137, + mul_x1_138, mul_x1_139, mul_x1_140, mul_x1_141, mul_x1_142, mul_x1_143, + mul_x1_144, mul_x1_145, mul_x1_146, mul_x1_147, mul_x1_148, mul_x1_149, + mul_x1_150, mul_x1_151, mul_x1_152, mul_x1_153, mul_x1_154, mul_x1_155, + mul_x1_156, mul_x1_157, mul_x1_158, mul_x1_159, mul_x1_160, mul_x1_161, + mul_x1_162, mul_x1_163, mul_x1_164, mul_x1_165, mul_x1_166, mul_x1_167, + mul_x1_168, mul_x1_169, mul_x1_170, mul_x1_171, mul_x1_172, mul_x1_173, + mul_x1_174, mul_x1_175, mul_x1_176, mul_x1_177, mul_x1_178, mul_x1_179, + mul_x1_180, mul_x1_181, mul_x1_182, mul_x1_183, mul_x1_184, mul_x1_185, + mul_x1_186, mul_x1_187, mul_x1_188, mul_x1_189, mul_x1_190, mul_x1_191, + mul_x1_192, mul_x1_193, mul_x1_194, mul_x1_195, mul_x1_196, mul_x1_197, + mul_x1_198, mul_x1_199, mul_x1_200, mul_x1_201, mul_x1_202, mul_x1_203, + mul_x1_204, mul_x1_205, mul_x1_206, mul_x1_207, mul_x1_208, mul_x1_209, + mul_x1_210, mul_x1_211, mul_x1_212, mul_x1_213, mul_x1_214, mul_x1_215, + mul_x1_216, mul_x1_217, mul_x1_218, mul_x1_219, mul_x1_220, mul_x1_221, + mul_x1_222, mul_x1_223, mul_x1_224, mul_x1_225, mul_x1_226, mul_x1_227, + mul_x1_228, mul_x1_229, mul_x1_230, mul_x1_231, mul_x1_232, mul_x1_233, + mul_x1_234, mul_x1_235, mul_x1_236, mul_x1_237, mul_x1_238, mul_x1_239, + mul_x1_240, mul_x1_241, mul_x1_242, mul_x1_243, mul_x1_244, mul_x1_245, + mul_x1_246, mul_x1_247, mul_x1_248, mul_x1_249, mul_x1_250, mul_x1_251, + mul_x1_252, mul_x1_253, mul_x1_254, mul_x1_255 +}; + +static const mul_fn_ptr_t __attribute__((aligned(256))) +gf_x2_mul_fns[256] = { + mul_x2_0, mul_x2_1, mul_x2_2, mul_x2_3, mul_x2_4, mul_x2_5, + mul_x2_6, mul_x2_7, mul_x2_8, mul_x2_9, mul_x2_10, mul_x2_11, + mul_x2_12, mul_x2_13, mul_x2_14, mul_x2_15, mul_x2_16, mul_x2_17, + mul_x2_18, mul_x2_19, mul_x2_20, mul_x2_21, mul_x2_22, mul_x2_23, + mul_x2_24, mul_x2_25, mul_x2_26, mul_x2_27, mul_x2_28, mul_x2_29, + mul_x2_30, mul_x2_31, mul_x2_32, mul_x2_33, mul_x2_34, mul_x2_35, + mul_x2_36, mul_x2_37, mul_x2_38, mul_x2_39, mul_x2_40, mul_x2_41, + mul_x2_42, mul_x2_43, mul_x2_44, mul_x2_45, mul_x2_46, mul_x2_47, + mul_x2_48, mul_x2_49, mul_x2_50, mul_x2_51, mul_x2_52, mul_x2_53, + mul_x2_54, mul_x2_55, mul_x2_56, mul_x2_57, mul_x2_58, mul_x2_59, + mul_x2_60, mul_x2_61, mul_x2_62, mul_x2_63, mul_x2_64, mul_x2_65, + mul_x2_66, mul_x2_67, mul_x2_68, mul_x2_69, mul_x2_70, mul_x2_71, + mul_x2_72, mul_x2_73, mul_x2_74, mul_x2_75, mul_x2_76, mul_x2_77, + mul_x2_78, mul_x2_79, mul_x2_80, mul_x2_81, mul_x2_82, mul_x2_83, + mul_x2_84, mul_x2_85, mul_x2_86, mul_x2_87, mul_x2_88, mul_x2_89, + mul_x2_90, mul_x2_91, mul_x2_92, mul_x2_93, mul_x2_94, mul_x2_95, + mul_x2_96, mul_x2_97, mul_x2_98, mul_x2_99, mul_x2_100, mul_x2_101, + mul_x2_102, mul_x2_103, mul_x2_104, mul_x2_105, mul_x2_106, mul_x2_107, + mul_x2_108, mul_x2_109, mul_x2_110, mul_x2_111, mul_x2_112, mul_x2_113, + mul_x2_114, mul_x2_115, mul_x2_116, mul_x2_117, mul_x2_118, mul_x2_119, + mul_x2_120, mul_x2_121, mul_x2_122, mul_x2_123, mul_x2_124, mul_x2_125, + mul_x2_126, mul_x2_127, mul_x2_128, mul_x2_129, mul_x2_130, mul_x2_131, + mul_x2_132, mul_x2_133, mul_x2_134, mul_x2_135, mul_x2_136, mul_x2_137, + mul_x2_138, mul_x2_139, mul_x2_140, mul_x2_141, mul_x2_142, mul_x2_143, + mul_x2_144, mul_x2_145, mul_x2_146, mul_x2_147, mul_x2_148, mul_x2_149, + mul_x2_150, mul_x2_151, mul_x2_152, mul_x2_153, mul_x2_154, mul_x2_155, + mul_x2_156, mul_x2_157, mul_x2_158, mul_x2_159, 
mul_x2_160, mul_x2_161,
+	mul_x2_162, mul_x2_163, mul_x2_164, mul_x2_165, mul_x2_166, mul_x2_167,
+	mul_x2_168, mul_x2_169, mul_x2_170, mul_x2_171, mul_x2_172, mul_x2_173,
+	mul_x2_174, mul_x2_175, mul_x2_176, mul_x2_177, mul_x2_178, mul_x2_179,
+	mul_x2_180, mul_x2_181, mul_x2_182, mul_x2_183, mul_x2_184, mul_x2_185,
+	mul_x2_186, mul_x2_187, mul_x2_188, mul_x2_189, mul_x2_190, mul_x2_191,
+	mul_x2_192, mul_x2_193, mul_x2_194, mul_x2_195, mul_x2_196, mul_x2_197,
+	mul_x2_198, mul_x2_199, mul_x2_200, mul_x2_201, mul_x2_202, mul_x2_203,
+	mul_x2_204, mul_x2_205, mul_x2_206, mul_x2_207, mul_x2_208, mul_x2_209,
+	mul_x2_210, mul_x2_211, mul_x2_212, mul_x2_213, mul_x2_214, mul_x2_215,
+	mul_x2_216, mul_x2_217, mul_x2_218, mul_x2_219, mul_x2_220, mul_x2_221,
+	mul_x2_222, mul_x2_223, mul_x2_224, mul_x2_225, mul_x2_226, mul_x2_227,
+	mul_x2_228, mul_x2_229, mul_x2_230, mul_x2_231, mul_x2_232, mul_x2_233,
+	mul_x2_234, mul_x2_235, mul_x2_236, mul_x2_237, mul_x2_238, mul_x2_239,
+	mul_x2_240, mul_x2_241, mul_x2_242, mul_x2_243, mul_x2_244, mul_x2_245,
+	mul_x2_246, mul_x2_247, mul_x2_248, mul_x2_249, mul_x2_250, mul_x2_251,
+	mul_x2_252, mul_x2_253, mul_x2_254, mul_x2_255
+};
+
+#define MUL(c, r...) \
+{ \
+	switch (REG_CNT(r)) { \
+	case 2: \
+		COPY(r, _mul_x2_in); \
+		gf_x2_mul_fns[c](); \
+		COPY(_mul_x2_acc, r); \
+		break; \
+	case 1: \
+		COPY(r, _mul_x1_in); \
+		gf_x1_mul_fns[c](); \
+		COPY(_mul_x1_acc, r); \
+		break; \
+	default: \
+		VERIFY(0); \
+	} \
+}
+
+
+#define raidz_math_begin() kfpu_begin()
+#define raidz_math_end() kfpu_end()
+
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 4
+#define ZERO_DEFINE() {}
+#define ZERO_D 0, 1, 2, 3
+
+#define COPY_STRIDE 4
+#define COPY_DEFINE() {}
+#define COPY_D 0, 1, 2, 3
+
+#define ADD_STRIDE 4
+#define ADD_DEFINE() {}
+#define ADD_D 0, 1, 2, 3
+
+#define MUL_STRIDE 2
+#define MUL_DEFINE() MUL2_SETUP()
+#define MUL_D 0, 1
+
+#define GEN_P_STRIDE 4
+#define GEN_P_DEFINE() {}
+#define GEN_P_P 0, 1, 2, 3
+
+#define GEN_PQ_STRIDE 4
+#define GEN_PQ_DEFINE() {}
+#define GEN_PQ_D 0, 1, 2, 3
+#define GEN_PQ_C 4, 5, 6, 7
+
+#define GEN_PQR_STRIDE 4
+#define GEN_PQR_DEFINE() {}
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
+
+#define SYN_Q_DEFINE() {}
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
+
+#define SYN_R_DEFINE() {}
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
+
+#define SYN_PQ_DEFINE() {}
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
+
+#define REC_PQ_STRIDE 2
+#define REC_PQ_DEFINE() MUL2_SETUP()
+#define REC_PQ_X 0, 1
+#define REC_PQ_Y 2, 3
+#define REC_PQ_T 4, 5
+
+#define SYN_PR_DEFINE() {}
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_STRIDE 2
+#define REC_PR_DEFINE() MUL2_SETUP()
+#define REC_PR_X 0, 1
+#define REC_PR_Y 2, 3
+#define REC_PR_T 4, 5
+
+#define SYN_QR_DEFINE() {}
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_STRIDE 2
+#define REC_QR_DEFINE() MUL2_SETUP()
+#define REC_QR_X 0, 1
+#define REC_QR_Y 2, 3
+#define REC_QR_T 4, 5
+
+#define SYN_PQR_DEFINE() {}
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
+#define REC_PQR_STRIDE 1
+#define REC_PQR_DEFINE() MUL2_SETUP()
+#define REC_PQR_X 0
+#define REC_PQR_Y 1
+#define REC_PQR_Z 2
+#define REC_PQR_XS 3
+#define REC_PQR_YS 4
+
+
+#include <sys/vdev_raidz_impl.h>
+#include "vdev_raidz_math_impl.h"
+
+DEFINE_GEN_METHODS(sse2);
+DEFINE_REC_METHODS(sse2);
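A note on the MUL dispatch defined above: each gf_x1_mul_fns[c]/gf_x2_mul_fns[c] entry is the double-and-add ladder from _MUL_PARAM, unrolled at compile time for one fixed constant c and operating on fixed xmm registers. A minimal stand-alone user-space sketch of the same scheme (a hypothetical harness, not part of the patch; gf_mul2() plays the role of the MUL2 macro):

#include <assert.h>
#include <stdint.h>

/* GF(2^8) doubling modulo 0x11d; the scalar analogue of the MUL2 macro */
static uint8_t
gf_mul2(uint8_t a)
{
	return ((uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0)));
}

/*
 * Multiply 'in' by the constant 'c' the way _MUL_PARAM does: double 'in'
 * repeatedly and XOR it into the accumulator for every set bit of 'c'.
 * Each generated mul_x1_<c>/mul_x2_<c> function is this loop unrolled for
 * one fixed 'c' (with the trailing doublings skipped), operating on xmm
 * registers instead of a single byte.
 */
static uint8_t
gf_mul_const(uint8_t c, uint8_t in)
{
	uint8_t acc = (c & 0x01) ? in : 0;
	int bit;

	for (bit = 1; bit < 8; bit++) {
		in = gf_mul2(in);	/* in == original value times 2^bit */
		if (c & (1 << bit))
			acc ^= in;
	}
	return (acc);
}

int
main(void)
{
	/* spot-check via the field identity (2 * a) * c == 2 * (a * c) */
	int a, c;

	for (c = 0; c < 256; c++)
		for (a = 0; a < 256; a++)
			assert(gf_mul_const((uint8_t)c, gf_mul2((uint8_t)a)) ==
			    gf_mul2(gf_mul_const((uint8_t)c, (uint8_t)a)));
	return (0);
}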
+
+static boolean_t
+raidz_will_sse2_work(void)
+{
+	return (kfpu_allowed() && zfs_sse_available() &&
+	    zfs_sse2_available());
+}
+
+const raidz_impl_ops_t vdev_raidz_sse2_impl = {
+	.init = NULL,
+	.fini = NULL,
+	.gen = RAIDZ_GEN_METHODS(sse2),
+	.rec = RAIDZ_REC_METHODS(sse2),
+	.is_supported = &raidz_will_sse2_work,
+	.name = "sse2"
+};
+
+#elif defined(__i386)
+
+/* 32-bit stub for user-level fakekernel dependencies */
+#include <sys/vdev_raidz_impl.h>
+const raidz_impl_ops_t vdev_raidz_sse2_impl = {
+	.init = NULL,
+	.fini = NULL,
+	.gen = NULL,
+	.rec = NULL,
+	.is_supported = NULL,
+	.name = "sse2"
+};
+
+#endif /* defined(__amd64) */
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz_math_ssse3.c b/usr/src/uts/common/fs/zfs/vdev_raidz_math_ssse3.c
new file mode 100644
index 0000000000..03d51901cb
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz_math_ssse3.c
@@ -0,0 +1,2483 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/isa_defs.h>
+
+#if defined(__amd64)
+
+#include <sys/types.h>
+#include <sys/simd.h>
+
+#define __asm __asm__ __volatile__
+
+#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
+#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
+
+#define VR0_(REG, ...) "xmm"#REG
+#define VR1_(_1, REG, ...) "xmm"#REG
+#define VR2_(_1, _2, REG, ...) "xmm"#REG
+#define VR3_(_1, _2, _3, REG, ...) "xmm"#REG
+#define VR4_(_1, _2, _3, _4, REG, ...) "xmm"#REG
+#define VR5_(_1, _2, _3, _4, _5, REG, ...) "xmm"#REG
+#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "xmm"#REG
+#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "xmm"#REG
+
+#define VR0(r...) VR0_(r)
+#define VR1(r...) VR1_(r)
+#define VR2(r...) VR2_(r, 1)
+#define VR3(r...) VR3_(r, 1, 2)
+#define VR4(r...) VR4_(r, 1, 2)
+#define VR5(r...) VR5_(r, 1, 2, 3)
+#define VR6(r...) VR6_(r, 1, 2, 3, 4)
+#define VR7(r...) VR7_(r, 1, 2, 3, 4, 5)
+
+#define R_01(REG1, REG2, ...) REG1, REG2
+#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3
+#define R_23(REG...) _R_23(REG, 1, 2, 3)
+
+#define ZFS_ASM_BUG() ASSERT(0)
+
+const uint8_t gf_clmul_mod_lt[4*256][16];
+
+#define ELEM_SIZE 16
+
+typedef struct v {
+	uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
+} v_t;
+
+
+#define XOR_ACC(src, r...) \
+{ \
+	switch (REG_CNT(r)) { \
+	case 4: \
+		__asm( \
+		    "pxor 0x00(%[SRC]), %%" VR0(r) "\n" \
+		    "pxor 0x10(%[SRC]), %%" VR1(r) "\n" \
+		    "pxor 0x20(%[SRC]), %%" VR2(r) "\n" \
+		    "pxor 0x30(%[SRC]), %%" VR3(r) "\n" \
+		    : : [SRC] "r" (src)); \
+		break; \
+	case 2: \
+		__asm( \
+		    "pxor 0x00(%[SRC]), %%" VR0(r) "\n" \
+		    "pxor 0x10(%[SRC]), %%" VR1(r) "\n" \
+		    : : [SRC] "r" (src)); \
+		break; \
+	default: \
+		ZFS_ASM_BUG(); \
+	} \
+}
+
+#define XOR(r...)
\ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "pxor %" VR0(r) ", %" VR4(r) "\n" \ + "pxor %" VR1(r) ", %" VR5(r) "\n" \ + "pxor %" VR2(r) ", %" VR6(r) "\n" \ + "pxor %" VR3(r) ", %" VR7(r)); \ + break; \ + case 4: \ + __asm( \ + "pxor %" VR0(r) ", %" VR2(r) "\n" \ + "pxor %" VR1(r) ", %" VR3(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define ZERO(r...) XOR(r, r) + +#define COPY(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "movdqa %" VR0(r) ", %" VR4(r) "\n" \ + "movdqa %" VR1(r) ", %" VR5(r) "\n" \ + "movdqa %" VR2(r) ", %" VR6(r) "\n" \ + "movdqa %" VR3(r) ", %" VR7(r)); \ + break; \ + case 4: \ + __asm( \ + "movdqa %" VR0(r) ", %" VR2(r) "\n" \ + "movdqa %" VR1(r) ", %" VR3(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define LOAD(src, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \ + "movdqa 0x10(%[SRC]), %%" VR1(r) "\n" \ + "movdqa 0x20(%[SRC]), %%" VR2(r) "\n" \ + "movdqa 0x30(%[SRC]), %%" VR3(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + case 2: \ + __asm( \ + "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \ + "movdqa 0x10(%[SRC]), %%" VR1(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define STORE(dst, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "movdqa %%" VR0(r)", 0x00(%[DST])\n" \ + "movdqa %%" VR1(r)", 0x10(%[DST])\n" \ + "movdqa %%" VR2(r)", 0x20(%[DST])\n" \ + "movdqa %%" VR3(r)", 0x30(%[DST])\n" \ + : : [DST] "r" (dst)); \ + break; \ + case 2: \ + __asm( \ + "movdqa %%" VR0(r)", 0x00(%[DST])\n" \ + "movdqa %%" VR1(r)", 0x10(%[DST])\n" \ + : : [DST] "r" (dst)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define MUL2_SETUP() \ +{ \ + __asm( \ + "movd %[mask], %%xmm15\n" \ + "pshufd $0x0, %%xmm15, %%xmm15\n" \ + : : [mask] "r" (0x1d1d1d1d)); \ +} + +#define _MUL2_x2(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 2: \ + __asm( \ + "pxor %xmm14, %xmm14\n" \ + "pxor %xmm13, %xmm13\n" \ + "pcmpgtb %" VR0(r)", %xmm14\n" \ + "pcmpgtb %" VR1(r)", %xmm13\n" \ + "pand %xmm15, %xmm14\n" \ + "pand %xmm15, %xmm13\n" \ + "paddb %" VR0(r)", %" VR0(r) "\n" \ + "paddb %" VR1(r)", %" VR1(r) "\n" \ + "pxor %xmm14, %" VR0(r) "\n" \ + "pxor %xmm13, %" VR1(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define MUL2(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + _MUL2_x2(R_01(r)); \ + _MUL2_x2(R_23(r)); \ + break; \ + case 2: \ + _MUL2_x2(r); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define MUL4(r...) \ +{ \ + MUL2(r); \ + MUL2(r); \ +} + +#define _0f "xmm15" +#define _a_save "xmm14" +#define _b_save "xmm13" +#define _lt_mod_a "xmm12" +#define _lt_clmul_a "xmm11" +#define _lt_mod_b "xmm10" +#define _lt_clmul_b "xmm15" + +#define _MULx2(c, r...) 
\ +{ \ + switch (REG_CNT(r)) { \ + case 2: \ + __asm( \ + /* lts for upper part */ \ + "movd %[mask], %%" _0f "\n" \ + "pshufd $0x0, %%" _0f ", %%" _0f "\n" \ + "movdqa 0x00(%[lt]), %%" _lt_mod_a "\n" \ + "movdqa 0x10(%[lt]), %%" _lt_clmul_a "\n" \ + /* upper part */ \ + "movdqa %%" VR0(r) ", %%" _a_save "\n" \ + "movdqa %%" VR1(r) ", %%" _b_save "\n" \ + "psraw $0x4, %%" VR0(r) "\n" \ + "psraw $0x4, %%" VR1(r) "\n" \ + "pand %%" _0f ", %%" _a_save "\n" \ + "pand %%" _0f ", %%" _b_save "\n" \ + "pand %%" _0f ", %%" VR0(r) "\n" \ + "pand %%" _0f ", %%" VR1(r) "\n" \ + \ + "movdqa %%" _lt_mod_a ", %%" _lt_mod_b "\n" \ + "movdqa %%" _lt_clmul_a ", %%" _lt_clmul_b "\n" \ + \ + "pshufb %%" VR0(r) ",%%" _lt_mod_a "\n" \ + "pshufb %%" VR1(r) ",%%" _lt_mod_b "\n" \ + "pshufb %%" VR0(r) ",%%" _lt_clmul_a "\n" \ + "pshufb %%" VR1(r) ",%%" _lt_clmul_b "\n" \ + \ + "pxor %%" _lt_mod_a ",%%" _lt_clmul_a "\n" \ + "pxor %%" _lt_mod_b ",%%" _lt_clmul_b "\n" \ + "movdqa %%" _lt_clmul_a ",%%" VR0(r) "\n" \ + "movdqa %%" _lt_clmul_b ",%%" VR1(r) "\n" \ + /* lts for lower part */ \ + "movdqa 0x20(%[lt]), %%" _lt_mod_a "\n" \ + "movdqa 0x30(%[lt]), %%" _lt_clmul_a "\n" \ + "movdqa %%" _lt_mod_a ", %%" _lt_mod_b "\n" \ + "movdqa %%" _lt_clmul_a ", %%" _lt_clmul_b "\n" \ + /* lower part */ \ + "pshufb %%" _a_save ",%%" _lt_mod_a "\n" \ + "pshufb %%" _b_save ",%%" _lt_mod_b "\n" \ + "pshufb %%" _a_save ",%%" _lt_clmul_a "\n" \ + "pshufb %%" _b_save ",%%" _lt_clmul_b "\n" \ + \ + "pxor %%" _lt_mod_a ",%%" VR0(r) "\n" \ + "pxor %%" _lt_mod_b ",%%" VR1(r) "\n" \ + "pxor %%" _lt_clmul_a ",%%" VR0(r) "\n" \ + "pxor %%" _lt_clmul_b ",%%" VR1(r) "\n" \ + : : [mask] "r" (0x0f0f0f0f), \ + [lt] "r" (gf_clmul_mod_lt[4*(c)])); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define MUL(c, r...) 
\ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + _MULx2(c, R_23(r)); \ + _MULx2(c, R_01(r)); \ + break; \ + case 2: \ + _MULx2(c, R_01(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define raidz_math_begin() kfpu_begin() +#define raidz_math_end() kfpu_end() + + +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() {} +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() {} +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() {} +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() {} +#define MUL_D 0, 1, 2, 3 + +#define GEN_P_STRIDE 4 +#define GEN_P_DEFINE() {} +#define GEN_P_P 0, 1, 2, 3 + +#define GEN_PQ_STRIDE 4 +#define GEN_PQ_DEFINE() {} +#define GEN_PQ_D 0, 1, 2, 3 +#define GEN_PQ_C 4, 5, 6, 7 + +#define GEN_PQR_STRIDE 4 +#define GEN_PQR_DEFINE() {} +#define GEN_PQR_D 0, 1, 2, 3 +#define GEN_PQR_C 4, 5, 6, 7 + +#define SYN_Q_DEFINE() {} +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 + +#define SYN_R_DEFINE() {} +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 + +#define SYN_PQ_DEFINE() {} +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 + +#define REC_PQ_STRIDE 2 +#define REC_PQ_DEFINE() {} +#define REC_PQ_X 0, 1 +#define REC_PQ_Y 2, 3 +#define REC_PQ_T 4, 5 + +#define SYN_PR_DEFINE() {} +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 + +#define REC_PR_STRIDE 2 +#define REC_PR_DEFINE() {} +#define REC_PR_X 0, 1 +#define REC_PR_Y 2, 3 +#define REC_PR_T 4, 5 + +#define SYN_QR_DEFINE() {} +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 + +#define REC_QR_STRIDE 2 +#define REC_QR_DEFINE() {} +#define REC_QR_X 0, 1 +#define REC_QR_Y 2, 3 +#define REC_QR_T 4, 5 + +#define SYN_PQR_DEFINE() {} +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 + +#define REC_PQR_STRIDE 2 +#define REC_PQR_DEFINE() {} +#define REC_PQR_X 0, 1 +#define REC_PQR_Y 2, 3 +#define REC_PQR_Z 4, 5 +#define REC_PQR_XS 6, 7 +#define REC_PQR_YS 8, 9 + + +#include <sys/vdev_raidz_impl.h> +#include "vdev_raidz_math_impl.h" + +DEFINE_GEN_METHODS(ssse3); +DEFINE_REC_METHODS(ssse3); + +static boolean_t +raidz_will_ssse3_work(void) +{ + return (kfpu_allowed() && zfs_sse_available() && + zfs_sse2_available() && zfs_ssse3_available()); +} + +const raidz_impl_ops_t vdev_raidz_ssse3_impl = { + .init = NULL, + .fini = NULL, + .gen = RAIDZ_GEN_METHODS(ssse3), + .rec = RAIDZ_REC_METHODS(ssse3), + .is_supported = &raidz_will_ssse3_work, + .name = "ssse3" +}; + +/* BEGIN CSTYLED */ +const uint8_t +__attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] = +{ + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x20, 0x40, 
0x60, 0x80, 0xa0, 0xc0, 0xe0,
+	    0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+	{ 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e,
+	    0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e },
+	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	    0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
[... several hundred further added rows of this hunk, in the same layout:
for each multiplier constant c, four machine-generated 16-byte rows holding
the nibble products { 0*c .. 15*c } together with shifted copies and
0x1d-based reduction terms ...]
0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb5, 0x6a, 0xdf, 0xd4, 0x61, 0xbe, 0x0b, + 0xa8, 0x1d, 0xc2, 0x77, 0x7c, 0xc9, 0x16, 0xa3 }, + { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a, + 0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb6, 0x6c, 0xda, 0xd8, 0x6e, 0xb4, 0x02, + 0xb0, 0x06, 0xdc, 0x6a, 0x68, 0xde, 0x04, 0xb2 }, + { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a, + 0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb7, 0x6e, 0xd9, 0xdc, 0x6b, 0xb2, 0x05, + 0xb8, 0x0f, 0xd6, 0x61, 0x64, 0xd3, 0x0a, 0xbd }, + { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70, + 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb8, 0x70, 0xc8, 0xe0, 0x58, 0x90, 0x28, + 0xc0, 0x78, 0xb0, 0x08, 0x20, 0x98, 0x50, 0xe8 }, + { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70, + 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb9, 0x72, 0xcb, 0xe4, 0x5d, 0x96, 0x2f, + 0xc8, 0x71, 0xba, 0x03, 0x2c, 0x95, 0x5e, 0xe7 }, + { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70, + 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xba, 0x74, 0xce, 0xe8, 0x52, 0x9c, 0x26, + 0xd0, 0x6a, 0xa4, 0x1e, 0x38, 0x82, 0x4c, 0xf6 }, + { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70, + 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, + 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 }, + { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, + 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xbc, 0x78, 0xc4, 0xf0, 0x4c, 0x88, 0x34, + 0xe0, 0x5c, 0x98, 0x24, 0x10, 0xac, 0x68, 0xd4 }, + { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, + 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xbd, 0x7a, 0xc7, 0xf4, 0x49, 0x8e, 0x33, + 0xe8, 0x55, 0x92, 0x2f, 0x1c, 0xa1, 0x66, 0xdb }, + { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, + 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 
0xc7, 0x08 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xbe, 0x7c, 0xc2, 0xf8, 0x46, 0x84, 0x3a, + 0xf0, 0x4e, 0x8c, 0x32, 0x08, 0xb6, 0x74, 0xca }, + { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, + 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xbf, 0x7e, 0xc1, 0xfc, 0x43, 0x82, 0x3d, + 0xf8, 0x47, 0x86, 0x39, 0x04, 0xbb, 0x7a, 0xc5 }, + { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, + 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, + 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc1, 0x82, 0x43, 0x04, 0xc5, 0x86, 0x47, + 0x08, 0xc9, 0x8a, 0x4b, 0x0c, 0xcd, 0x8e, 0x4f }, + { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, + 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc2, 0x84, 0x46, 0x08, 0xca, 0x8c, 0x4e, + 0x10, 0xd2, 0x94, 0x56, 0x18, 0xda, 0x9c, 0x5e }, + { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, + 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc3, 0x86, 0x45, 0x0c, 0xcf, 0x8a, 0x49, + 0x18, 0xdb, 0x9e, 0x5d, 0x14, 0xd7, 0x92, 0x51 }, + { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee, + 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc4, 0x88, 0x4c, 0x10, 0xd4, 0x98, 0x5c, + 0x20, 0xe4, 0xa8, 0x6c, 0x30, 0xf4, 0xb8, 0x7c }, + { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee, + 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc5, 0x8a, 0x4f, 0x14, 0xd1, 0x9e, 0x5b, + 0x28, 0xed, 0xa2, 0x67, 0x3c, 0xf9, 0xb6, 0x73 }, + { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee, + 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 
0x00, 0xc6, 0x8c, 0x4a, 0x18, 0xde, 0x94, 0x52, + 0x30, 0xf6, 0xbc, 0x7a, 0x28, 0xee, 0xa4, 0x62 }, + { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee, + 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc7, 0x8e, 0x49, 0x1c, 0xdb, 0x92, 0x55, + 0x38, 0xff, 0xb6, 0x71, 0x24, 0xe3, 0xaa, 0x6d }, + { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, + 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc8, 0x90, 0x58, 0x20, 0xe8, 0xb0, 0x78, + 0x40, 0x88, 0xd0, 0x18, 0x60, 0xa8, 0xf0, 0x38 }, + { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, + 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc9, 0x92, 0x5b, 0x24, 0xed, 0xb6, 0x7f, + 0x48, 0x81, 0xda, 0x13, 0x6c, 0xa5, 0xfe, 0x37 }, + { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, + 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xca, 0x94, 0x5e, 0x28, 0xe2, 0xbc, 0x76, + 0x50, 0x9a, 0xc4, 0x0e, 0x78, 0xb2, 0xec, 0x26 }, + { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, + 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xcb, 0x96, 0x5d, 0x2c, 0xe7, 0xba, 0x71, + 0x58, 0x93, 0xce, 0x05, 0x74, 0xbf, 0xe2, 0x29 }, + { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9, + 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xcc, 0x98, 0x54, 0x30, 0xfc, 0xa8, 0x64, + 0x60, 0xac, 0xf8, 0x34, 0x50, 0x9c, 0xc8, 0x04 }, + { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9, + 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xcd, 0x9a, 0x57, 0x34, 0xf9, 0xae, 0x63, + 0x68, 0xa5, 0xf2, 0x3f, 0x5c, 0x91, 0xc6, 0x0b }, + { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9, + 0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xce, 0x9c, 0x52, 0x38, 0xf6, 0xa4, 0x6a, + 0x70, 0xbe, 0xec, 0x22, 0x48, 0x86, 0xd4, 0x1a }, + { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9, + 0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e }, + { 0x00, 0xf0, 0xe0, 
0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, + 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 }, + { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, + 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, + 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd1, 0xa2, 0x73, 0x44, 0x95, 0xe6, 0x37, + 0x88, 0x59, 0x2a, 0xfb, 0xcc, 0x1d, 0x6e, 0xbf }, + { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, + 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, + 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae }, + { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, + 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd3, 0xa6, 0x75, 0x4c, 0x9f, 0xea, 0x39, + 0x98, 0x4b, 0x3e, 0xed, 0xd4, 0x07, 0x72, 0xa1 }, + { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd, + 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd4, 0xa8, 0x7c, 0x50, 0x84, 0xf8, 0x2c, + 0xa0, 0x74, 0x08, 0xdc, 0xf0, 0x24, 0x58, 0x8c }, + { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd, + 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd5, 0xaa, 0x7f, 0x54, 0x81, 0xfe, 0x2b, + 0xa8, 0x7d, 0x02, 0xd7, 0xfc, 0x29, 0x56, 0x83 }, + { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd, + 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd6, 0xac, 0x7a, 0x58, 0x8e, 0xf4, 0x22, + 0xb0, 0x66, 0x1c, 0xca, 0xe8, 0x3e, 0x44, 0x92 }, + { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd, + 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd7, 0xae, 0x79, 0x5c, 0x8b, 
0xf2, 0x25, + 0xb8, 0x6f, 0x16, 0xc1, 0xe4, 0x33, 0x4a, 0x9d }, + { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, + 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd8, 0xb0, 0x68, 0x60, 0xb8, 0xd0, 0x08, + 0xc0, 0x18, 0x70, 0xa8, 0xa0, 0x78, 0x10, 0xc8 }, + { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, + 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd9, 0xb2, 0x6b, 0x64, 0xbd, 0xd6, 0x0f, + 0xc8, 0x11, 0x7a, 0xa3, 0xac, 0x75, 0x1e, 0xc7 }, + { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, + 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xda, 0xb4, 0x6e, 0x68, 0xb2, 0xdc, 0x06, + 0xd0, 0x0a, 0x64, 0xbe, 0xb8, 0x62, 0x0c, 0xd6 }, + { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, + 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xdb, 0xb6, 0x6d, 0x6c, 0xb7, 0xda, 0x01, + 0xd8, 0x03, 0x6e, 0xb5, 0xb4, 0x6f, 0x02, 0xd9 }, + { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, + 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xdc, 0xb8, 0x64, 0x70, 0xac, 0xc8, 0x14, + 0xe0, 0x3c, 0x58, 0x84, 0x90, 0x4c, 0x28, 0xf4 }, + { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, + 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xdd, 0xba, 0x67, 0x74, 0xa9, 0xce, 0x13, + 0xe8, 0x35, 0x52, 0x8f, 0x9c, 0x41, 0x26, 0xfb }, + { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, + 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xde, 0xbc, 0x62, 0x78, 0xa6, 0xc4, 0x1a, + 0xf0, 0x2e, 0x4c, 0x92, 0x88, 0x56, 0x34, 0xea }, + { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, + 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xdf, 0xbe, 0x61, 0x7c, 0xa3, 0xc2, 0x1d, + 0xf8, 0x27, 0x46, 0x99, 0x84, 0x5b, 0x3a, 0xe5 }, + { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, + 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, + 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe1, 0xc2, 0x23, 0x84, 0x65, 0x46, 0xa7, + 0x08, 0xe9, 0xca, 0x2b, 0x8c, 0x6d, 0x4e, 0xaf }, + { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, + 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe2, 0xc4, 0x26, 0x88, 0x6a, 0x4c, 0xae, + 0x10, 0xf2, 0xd4, 0x36, 0x98, 0x7a, 0x5c, 0xbe }, + { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, + 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe3, 0xc6, 0x25, 0x8c, 0x6f, 0x4a, 0xa9, + 0x18, 0xfb, 0xde, 0x3d, 0x94, 0x77, 0x52, 0xb1 }, + { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, + 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe4, 0xc8, 0x2c, 0x90, 0x74, 0x58, 0xbc, + 0x20, 0xc4, 0xe8, 0x0c, 0xb0, 0x54, 0x78, 0x9c }, + { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, + 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe5, 0xca, 0x2f, 0x94, 0x71, 0x5e, 0xbb, + 0x28, 0xcd, 0xe2, 0x07, 0xbc, 0x59, 0x76, 0x93 }, + { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, + 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe6, 0xcc, 0x2a, 0x98, 0x7e, 0x54, 0xb2, + 0x30, 0xd6, 0xfc, 0x1a, 0xa8, 0x4e, 0x64, 0x82 }, + { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, + 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe7, 0xce, 0x29, 0x9c, 0x7b, 0x52, 0xb5, + 0x38, 0xdf, 0xf6, 0x11, 0xa4, 0x43, 0x6a, 0x8d }, + { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, + 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, + 0x40, 0xa8, 0x90, 
0x78, 0xe0, 0x08, 0x30, 0xd8 }, + { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, + 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe9, 0xd2, 0x3b, 0xa4, 0x4d, 0x76, 0x9f, + 0x48, 0xa1, 0x9a, 0x73, 0xec, 0x05, 0x3e, 0xd7 }, + { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, + 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xea, 0xd4, 0x3e, 0xa8, 0x42, 0x7c, 0x96, + 0x50, 0xba, 0x84, 0x6e, 0xf8, 0x12, 0x2c, 0xc6 }, + { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, + 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xeb, 0xd6, 0x3d, 0xac, 0x47, 0x7a, 0x91, + 0x58, 0xb3, 0x8e, 0x65, 0xf4, 0x1f, 0x22, 0xc9 }, + { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, + 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xec, 0xd8, 0x34, 0xb0, 0x5c, 0x68, 0x84, + 0x60, 0x8c, 0xb8, 0x54, 0xd0, 0x3c, 0x08, 0xe4 }, + { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, + 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xed, 0xda, 0x37, 0xb4, 0x59, 0x6e, 0x83, + 0x68, 0x85, 0xb2, 0x5f, 0xdc, 0x31, 0x06, 0xeb }, + { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, + 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xee, 0xdc, 0x32, 0xb8, 0x56, 0x64, 0x8a, + 0x70, 0x9e, 0xac, 0x42, 0xc8, 0x26, 0x14, 0xfa }, + { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, + 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xef, 0xde, 0x31, 0xbc, 0x53, 0x62, 0x8d, + 0x78, 0x97, 0xa6, 0x49, 0xc4, 0x2b, 0x1a, 0xf5 }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, + 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, + 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 
0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf1, 0xe2, 0x13, 0xc4, 0x35, 0x26, 0xd7, + 0x88, 0x79, 0x6a, 0x9b, 0x4c, 0xbd, 0xae, 0x5f }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, + 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf2, 0xe4, 0x16, 0xc8, 0x3a, 0x2c, 0xde, + 0x90, 0x62, 0x74, 0x86, 0x58, 0xaa, 0xbc, 0x4e }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, + 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf3, 0xe6, 0x15, 0xcc, 0x3f, 0x2a, 0xd9, + 0x98, 0x6b, 0x7e, 0x8d, 0x54, 0xa7, 0xb2, 0x41 }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, + 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf4, 0xe8, 0x1c, 0xd0, 0x24, 0x38, 0xcc, + 0xa0, 0x54, 0x48, 0xbc, 0x70, 0x84, 0x98, 0x6c }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, + 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, + 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, + 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf6, 0xec, 0x1a, 0xd8, 0x2e, 0x34, 0xc2, + 0xb0, 0x46, 0x5c, 0xaa, 0x68, 0x9e, 0x84, 0x72 }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, + 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf7, 0xee, 0x19, 0xdc, 0x2b, 0x32, 0xc5, + 0xb8, 0x4f, 0x56, 0xa1, 0x64, 0x93, 0x8a, 0x7d }, + { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, + 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf8, 0xf0, 0x08, 0xe0, 0x18, 0x10, 0xe8, + 0xc0, 0x38, 0x30, 0xc8, 0x20, 0xd8, 0xd0, 0x28 }, + { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, + 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf9, 0xf2, 0x0b, 0xe4, 0x1d, 0x16, 0xef, + 0xc8, 0x31, 0x3a, 0xc3, 0x2c, 0xd5, 0xde, 0x27 }, + { 
0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, + 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xfa, 0xf4, 0x0e, 0xe8, 0x12, 0x1c, 0xe6, + 0xd0, 0x2a, 0x24, 0xde, 0x38, 0xc2, 0xcc, 0x36 }, + { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, + 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xfb, 0xf6, 0x0d, 0xec, 0x17, 0x1a, 0xe1, + 0xd8, 0x23, 0x2e, 0xd5, 0x34, 0xcf, 0xc2, 0x39 }, + { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, + 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xfc, 0xf8, 0x04, 0xf0, 0x0c, 0x08, 0xf4, + 0xe0, 0x1c, 0x18, 0xe4, 0x10, 0xec, 0xe8, 0x14 }, + { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, + 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xfd, 0xfa, 0x07, 0xf4, 0x09, 0x0e, 0xf3, + 0xe8, 0x15, 0x12, 0xef, 0x1c, 0xe1, 0xe6, 0x1b }, + { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, + 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xfe, 0xfc, 0x02, 0xf8, 0x06, 0x04, 0xfa, + 0xf0, 0x0e, 0x0c, 0xf2, 0x08, 0xf6, 0xf4, 0x0a }, + { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, + 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xff, 0xfe, 0x01, 0xfc, 0x03, 0x02, 0xfd, + 0xf8, 0x07, 0x06, 0xf9, 0x04, 0xfb, 0xfa, 0x05 } +}; +/* END CSTYLED */ + +#elif defined(__i386) + +/* 32-bit stub for user-level fakekernel dependencies */ +#include <sys/vdev_raidz_impl.h> +const raidz_impl_ops_t vdev_raidz_ssse3_impl = { + .init = NULL, + .fini = NULL, + .gen = NULL, + .rec = NULL, + .is_supported = NULL, + .name = "sse3" +}; + +#endif /* defined(__amd64) */ diff --git a/usr/src/uts/common/fs/zfs/zcp.c b/usr/src/uts/common/fs/zfs/zcp.c index 61ce60a233..e2db01c5b6 100644 --- a/usr/src/uts/common/fs/zfs/zcp.c +++ b/usr/src/uts/common/fs/zfs/zcp.c @@ -22,7 +22,7 @@ * * The ZCP interface allows various ZFS commands and operations ZFS * administrative operations (e.g. creating and destroying snapshots, typically - * performed via an ioctl to /dev/zfs by the zfs(1M) command and + * performed via an ioctl to /dev/zfs by the zfs(8) command and * libzfs/libzfs_core) to be run * programmatically as a Lua script. A ZCP * script is run as a dsl_sync_task and fully executed during one transaction * group sync. 
This ensures that no other changes can be written concurrently @@ -86,7 +86,7 @@ * longjumps out of the script execution with luaL_error() and returns with the * error. * - * See zfs-program(1M) for more information on high level usage. + * See zfs-program(8) for more information on high level usage. */ #include "lua.h" @@ -718,8 +718,7 @@ static void * zcp_lua_alloc(void *ud, void *ptr, size_t osize, size_t nsize) { zcp_alloc_arg_t *allocargs = ud; - int flags = (allocargs->aa_must_succeed) ? - KM_SLEEP : (KM_NOSLEEP | KM_NORMALPRI); + int flags = (allocargs->aa_must_succeed) ? KM_SLEEP : KM_NOSLEEP_LAZY; if (nsize == 0) { if (ptr != NULL) { diff --git a/usr/src/uts/common/fs/zfs/zfs_dir.c b/usr/src/uts/common/fs/zfs/zfs_dir.c index c5add7b25f..5d377a109e 100644 --- a/usr/src/uts/common/fs/zfs/zfs_dir.c +++ b/usr/src/uts/common/fs/zfs/zfs_dir.c @@ -21,7 +21,6 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2015, Joyent, Inc. * Copyright (c) 2013, 2016 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2015, Joyent, Inc. @@ -718,6 +717,7 @@ zfs_rmnode(znode_t *zp) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); zfs_sa_upgrade_txholds(tx, zp); + dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { /* diff --git a/usr/src/uts/common/fs/zfs/zfs_fm.c b/usr/src/uts/common/fs/zfs/zfs_fm.c index dd854c12e1..2118fd549e 100644 --- a/usr/src/uts/common/fs/zfs/zfs_fm.c +++ b/usr/src/uts/common/fs/zfs/zfs_fm.c @@ -735,7 +735,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, report->zcr_length = length; #ifdef _KERNEL - zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector, + (void) zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector, FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length); if (report->zcr_ereport == NULL) { diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index 153dcf1502..b74baf46ea 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -37,6 +37,7 @@ * Copyright (c) 2017, loli10K <ezomori.nozomu@gmail.com>. All rights reserved. * Copyright 2017 RackTop Systems. * Copyright (c) 2017, Datto, Inc. All rights reserved. + * Copyright 2021 The University of Queensland */ /* @@ -63,8 +64,9 @@ * * zfs_ioc_t ioc * The ioctl request number, which userland will pass to ioctl(2). - * The ioctl numbers can change from release to release, because - * the caller (libzfs) must be matched to the kernel. + * We want newer versions of libzfs and libzfs_core to run against + * existing zfs kernel modules (i.e. a deferred reboot after an update). + * Therefore the ioctl numbers cannot change from release to release. * * zfs_secpolicy_func_t *secpolicy * This function will be called before the zfs_ioc_func_t, to @@ -90,6 +92,10 @@ * Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED | * POOL_CHECK_READONLY). * + * zfs_ioc_key_t *nvl_keys + * The list of expected/allowable innvl input keys. This list is used + * to validate the nvlist input to the ioctl. + * * boolean_t smush_outnvlist * If smush_outnvlist is true, then the output is presumed to be a * list of errors, and it will be "smushed" down to fit into the @@ -138,6 +144,14 @@ * use the outnvl if they succeed, because the caller can not * distinguish between the operation failing, and * deserialization failing. 
+ * + * IOCTL Interface Errors + * + * The following ioctl input errors can be returned: + * ZFS_ERR_IOC_CMD_UNAVAIL the ioctl number is not supported by kernel + * ZFS_ERR_IOC_ARG_UNAVAIL an input argument is not supported by kernel + * ZFS_ERR_IOC_ARG_REQUIRED a required input argument is missing + * ZFS_ERR_IOC_ARG_BADTYPE an input argument has an invalid type */ #include <sys/types.h> @@ -223,6 +237,37 @@ typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *); typedef int zfs_ioc_func_t(const char *, nvlist_t *, nvlist_t *); typedef int zfs_secpolicy_func_t(zfs_cmd_t *, nvlist_t *, cred_t *); +/* + * IOC Keys are used to document and validate user->kernel interface inputs. + * See zfs_keys_recv_new for an example declaration. Any key name that is not + * listed will be rejected as input. + * + * The keyname 'optional' is always allowed, and must be an nvlist if present. + * Arguments which older kernels can safely ignore can be placed under the + * "optional" key. + * + * When adding new keys to an existing ioc for new functionality, consider: + * - adding an entry into zfs_sysfs.c zfs_features[] list + * - updating the libzfs_input_check.c test utility + * + * Note: in the ZK_WILDCARDLIST case, the name serves as documentation + * for the expected name (bookmark, snapshot, property, etc) but there + * is no validation in the preflight zfs_check_input_nvpairs() check. + */ +typedef enum { + ZK_OPTIONAL = 1 << 0, /* pair is optional */ + ZK_WILDCARDLIST = 1 << 1, /* one or more unspecified key names */ +} ioc_key_flag_t; + +/* DATA_TYPE_ANY is used when zkey_type can vary. */ +#define DATA_TYPE_ANY DATA_TYPE_UNKNOWN + +typedef struct zfs_ioc_key { + const char *zkey_name; + data_type_t zkey_type; + ioc_key_flag_t zkey_flags; +} zfs_ioc_key_t; + typedef enum { NO_NAME, POOL_NAME, @@ -244,6 +289,8 @@ typedef struct zfs_ioc_vec { zfs_ioc_poolcheck_t zvec_pool_check; boolean_t zvec_smush_outnvlist; const char *zvec_name; + const zfs_ioc_key_t *zvec_nvl_keys; + size_t zvec_nvl_key_count; } zfs_ioc_vec_t; /* This array is indexed by zfs_userquota_prop_t */ @@ -861,8 +908,8 @@ zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) nvpair_t *pair, *nextpair; int error = 0; - if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) - return (SET_ERROR(EINVAL)); + snaps = fnvlist_lookup_nvlist(innvl, "snaps"); + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nextpair) { nextpair = nvlist_next_nvpair(snaps, pair); @@ -1009,8 +1056,8 @@ zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) int error = 0; nvpair_t *pair; - if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) - return (SET_ERROR(EINVAL)); + snaps = fnvlist_lookup_nvlist(innvl, "snaps"); + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { char *name = nvpair_name(pair); @@ -1030,7 +1077,7 @@ zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) } /* - * Check for permission to create each snapshot in the nvlist. + * Check for permission to create each bookmark in the nvlist. 
*/ /* ARGSUSED */ static int @@ -1265,9 +1312,7 @@ zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) nvlist_t *holds; int error; - error = nvlist_lookup_nvlist(innvl, "holds", &holds); - if (error != 0) - return (SET_ERROR(EINVAL)); + holds = fnvlist_lookup_nvlist(innvl, "holds"); for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { @@ -1338,12 +1383,15 @@ zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) return (0); error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr); - if (error == 0) - error = zfs_secpolicy_hold(zc, innvl, cr); - if (error == 0) - error = zfs_secpolicy_release(zc, innvl, cr); - if (error == 0) - error = zfs_secpolicy_destroy(zc, innvl, cr); + + if (innvl != NULL) { + if (error == 0) + error = zfs_secpolicy_hold(zc, innvl, cr); + if (error == 0) + error = zfs_secpolicy_release(zc, innvl, cr); + if (error == 0) + error = zfs_secpolicy_destroy(zc, innvl, cr); + } return (error); } @@ -1929,8 +1977,7 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) { spa_t *spa; int error; - nvlist_t *config, **l2cache, **spares; - uint_t nl2cache = 0, nspares = 0; + nvlist_t *config; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) @@ -1938,27 +1985,6 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config); - (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE, - &l2cache, &nl2cache); - - (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES, - &spares, &nspares); - - /* - * A root pool with concatenated devices is not supported. - * Thus, can not add a device to a root pool. - * - * Intent log device can not be added to a rootpool because - * during mountroot, zil is replayed, a seperated log device - * can not be accessed during the mountroot time. - * - * l2cache and spare devices are ok to be added to a rootpool. 
- */ - if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) { - nvlist_free(config); - spa_close(spa, FTAG); - return (SET_ERROR(EDOM)); - } if (error == 0) { error = spa_vdev_add(spa, config); @@ -3300,6 +3326,13 @@ zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, * * outnvl: propname -> error code (int32) */ + +static const zfs_ioc_key_t zfs_keys_create[] = { + {"type", DATA_TYPE_INT32, 0}, + {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, + {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, +}; + static int zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { @@ -3308,14 +3341,11 @@ zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) nvlist_t *nvprops = NULL; nvlist_t *hidden_args = NULL; void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); - int32_t type32; dmu_objset_type_t type; boolean_t is_insensitive = B_FALSE; dsl_crypto_params_t *dcp = NULL; - if (nvlist_lookup_int32(innvl, "type", &type32) != 0) - return (SET_ERROR(EINVAL)); - type = type32; + type = (dmu_objset_type_t)fnvlist_lookup_int32(innvl, "type"); (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); @@ -3418,6 +3448,12 @@ zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) * * outnvl: propname -> error code (int32) */ +static const zfs_ioc_key_t zfs_keys_clone[] = { + {"origin", DATA_TYPE_STRING, 0}, + {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, + {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, +}; + static int zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { @@ -3450,6 +3486,10 @@ zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) return (error); } +static const zfs_ioc_key_t zfs_keys_remap[] = { + /* no nvl keys */ +}; + /* ARGSUSED */ static int zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) @@ -3469,6 +3509,11 @@ zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) * * outnvl: snapshot -> error code (int32) */ +static const zfs_ioc_key_t zfs_keys_snapshot[] = { + {"snaps", DATA_TYPE_NVLIST, 0}, + {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, +}; + static int zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { @@ -3485,8 +3530,7 @@ zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS)) return (SET_ERROR(ENOTSUP)); - if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) - return (SET_ERROR(EINVAL)); + snaps = fnvlist_lookup_nvlist(innvl, "snaps"); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { @@ -3525,6 +3569,10 @@ zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) /* * innvl: "message" -> string */ +static const zfs_ioc_key_t zfs_keys_log_history[] = { + {"message", DATA_TYPE_STRING, 0}, +}; + /* ARGSUSED */ static int zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) @@ -3548,10 +3596,7 @@ zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) if (error != 0) return (error); - if (nvlist_lookup_string(innvl, "message", &message) != 0) { - spa_close(spa, FTAG); - return (SET_ERROR(EINVAL)); - } + message = fnvlist_lookup_string(innvl, "message"); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); @@ -3564,6 +3609,58 @@ zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) } /* + * This ioctl is used to set 
the bootenv configuration on the current + pool. This configuration is stored in the second padding area of the label, + and it is used by the bootloader(s) to store bootloader and/or system + specific data. + The data is stored as an nvlist data stream, and is protected by + an embedded checksum. + The version can have two possible values: + VB_RAW: nvlist should have key GRUB_ENVMAP, value DATA_TYPE_STRING. + VB_NVLIST: nvlist with arbitrary <key, value> pairs. + */ +static const zfs_ioc_key_t zfs_keys_set_bootenv[] = { + {"version", DATA_TYPE_UINT64, 0}, + {"<keys>", DATA_TYPE_ANY, ZK_OPTIONAL | ZK_WILDCARDLIST}, +}; + +static int +zfs_ioc_set_bootenv(const char *name, nvlist_t *innvl, + nvlist_t *outnvl __unused) +{ + int error; + spa_t *spa; + + if ((error = spa_open(name, &spa, FTAG)) != 0) + return (error); + spa_vdev_state_enter(spa, SCL_ALL); + error = vdev_label_write_bootenv(spa->spa_root_vdev, innvl); + (void) spa_vdev_state_exit(spa, NULL, 0); + spa_close(spa, FTAG); + return (error); +} + +static const zfs_ioc_key_t zfs_keys_get_bootenv[] = { + /* no nvl keys */ +}; + +static int +zfs_ioc_get_bootenv(const char *name, nvlist_t *innvl __unused, + nvlist_t *outnvl) +{ + spa_t *spa; + int error; + + if ((error = spa_open(name, &spa, FTAG)) != 0) + return (error); + spa_vdev_state_enter(spa, SCL_ALL); + error = vdev_label_read_bootenv(spa->spa_root_vdev, outnvl); + (void) spa_vdev_state_exit(spa, NULL, 0); + spa_close(spa, FTAG); + return (error); +} + +/* * The dp_config_rwlock must not be held when calling this, because the * unmount may need to write out data. * @@ -3645,6 +3742,11 @@ zfs_destroy_unmount_origin(const char *fsname) * outnvl: snapshot -> error code (int32) * */ +static const zfs_ioc_key_t zfs_keys_destroy_snaps[] = { + {"snaps", DATA_TYPE_NVLIST, 0}, + {"defer", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, +}; + /* ARGSUSED */ static int zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) @@ -3676,6 +3778,10 @@ zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) * outnvl: bookmark -> error code (int32) * */ +static const zfs_ioc_key_t zfs_keys_bookmark[] = { + {"<bookmark>...", DATA_TYPE_STRING, ZK_WILDCARDLIST}, +}; + /* ARGSUSED */ static int zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) @@ -3713,6 +3819,10 @@ zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) * } * */ +static const zfs_ioc_key_t zfs_keys_get_bookmarks[] = { + {"<property>...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST | ZK_OPTIONAL}, +}; + static int zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { @@ -3727,6 +3837,10 @@ zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) * outnvl: bookmark -> error code (int32) * */ +static const zfs_ioc_key_t zfs_keys_destroy_bookmarks[] = { + {"<bookmark>...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST}, +}; + static int zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) @@ -3759,6 +3873,15 @@ zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl, return (error); } +static const zfs_ioc_key_t zfs_keys_channel_program[] = { + {"program", DATA_TYPE_STRING, 0}, + {"arg", DATA_TYPE_ANY, 0}, + {"hidden_args", DATA_TYPE_ANY, ZK_OPTIONAL}, + {"sync", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, + {"instrlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, + {"memlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, +}; + static int zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
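The zfs_ioc_key_t lists added throughout this patch feed the preflight validation of each ioctl's innvl that the header comment calls zfs_check_input_nvpairs(). As a rough illustration only of the rules that comment describes (required keys, ZK_OPTIONAL, ZK_WILDCARDLIST, DATA_TYPE_ANY), here is a minimal userland-style sketch: it is not the kernel routine, the helper name check_innvl_keys() is invented, handling of the special 'optional' key is omitted, and the ZFS_ERR_IOC_ARG_* constants are assumed to be visible to the caller.

#include <string.h>
#include <libnvpair.h>
/* assumes the zfs_ioc_key_t/ioc_key_flag_t definitions added above */

static int
check_innvl_keys(nvlist_t *innvl, const zfs_ioc_key_t *keys, size_t nkeys)
{
	nvpair_t *pair;
	size_t i;

	/* every declared non-optional key must be present */
	for (i = 0; i < nkeys; i++) {
		if (keys[i].zkey_flags & (ZK_OPTIONAL | ZK_WILDCARDLIST))
			continue;
		if (!nvlist_exists(innvl, keys[i].zkey_name))
			return (ZFS_ERR_IOC_ARG_REQUIRED);
	}

	/* every supplied pair must match a declared key (or a wildcard) */
	for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL;
	    pair = nvlist_next_nvpair(innvl, pair)) {
		const zfs_ioc_key_t *match = NULL;
		boolean_t wildcard = B_FALSE;

		for (i = 0; i < nkeys; i++) {
			if (keys[i].zkey_flags & ZK_WILDCARDLIST)
				wildcard = B_TRUE;
			else if (strcmp(nvpair_name(pair),
			    keys[i].zkey_name) == 0)
				match = &keys[i];
		}
		if (match == NULL) {
			if (wildcard)
				continue;	/* name not validated */
			return (ZFS_ERR_IOC_ARG_UNAVAIL);
		}
		if (match->zkey_type != DATA_TYPE_ANY &&
		    match->zkey_type != nvpair_type(pair))
			return (ZFS_ERR_IOC_ARG_BADTYPE);
	}
	return (0);
}

This ordering mirrors the error list documented earlier: a missing required key is reported before an unknown or badly-typed one, and wildcard entries deliberately accept any name.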
@@ -3769,9 +3892,7 @@ zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl, nvpair_t *nvarg = NULL; nvlist_t *hidden_args = NULL; - if (0 != nvlist_lookup_string(innvl, ZCP_ARG_PROGRAM, &program)) { - return (EINVAL); - } + program = fnvlist_lookup_string(innvl, ZCP_ARG_PROGRAM); if (0 != nvlist_lookup_boolean_value(innvl, ZCP_ARG_SYNC, &sync_flag)) { sync_flag = B_TRUE; } @@ -3781,9 +3902,7 @@ zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl, if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_MEMLIMIT, &memlimit)) { memlimit = ZCP_DEFAULT_MEMLIMIT; } - if (0 != nvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST, &nvarg)) { - return (EINVAL); - } + nvarg = fnvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST); /* hidden args are optional */ if (nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args) == 0) { @@ -3808,6 +3927,10 @@ zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl, * innvl: unused * outnvl: empty */ +static const zfs_ioc_key_t zfs_keys_pool_checkpoint[] = { + /* no nvl keys */ +}; + /* ARGSUSED */ static int zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) @@ -3819,6 +3942,10 @@ zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) * innvl: unused * outnvl: empty */ +static const zfs_ioc_key_t zfs_keys_pool_discard_checkpoint[] = { + /* no nvl keys */ +}; + /* ARGSUSED */ static int zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl, @@ -3909,6 +4036,11 @@ zfs_ioc_destroy(zfs_cmd_t *zc) * EINVAL is returned for an unknown command or if any of the provided vdev * guids have been specified with a type other than uint64. */ +static const zfs_ioc_key_t zfs_keys_pool_initialize[] = { + {ZPOOL_INITIALIZE_COMMAND, DATA_TYPE_UINT64, 0}, + {ZPOOL_INITIALIZE_VDEVS, DATA_TYPE_NVLIST, 0} +}; + static int zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { @@ -3980,6 +4112,12 @@ zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) * EINVAL is returned for an unknown command or if any of the provided vdev * guids have been specified with a type other than uint64. 
*/ +static const zfs_ioc_key_t zfs_keys_pool_trim[] = { + {ZPOOL_TRIM_COMMAND, DATA_TYPE_UINT64, 0}, + {ZPOOL_TRIM_VDEVS, DATA_TYPE_NVLIST, 0}, + {ZPOOL_TRIM_RATE, DATA_TYPE_UINT64, ZK_OPTIONAL}, + {ZPOOL_TRIM_SECURE, DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, +}; static int zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) @@ -4044,6 +4182,10 @@ zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) * outnvl: "target" -> name of most recent snapshot * } */ +static const zfs_ioc_key_t zfs_keys_rollback[] = { + {"target", DATA_TYPE_STRING, ZK_OPTIONAL}, +}; + /* ARGSUSED */ static int zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) @@ -5536,9 +5678,6 @@ zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc) if (error != 0) return (error); - dsl_dataset_long_hold(dmu_objset_ds(os), FTAG); - dsl_pool_rele(dmu_objset_pool(os), FTAG); - if (dmu_objset_userobjspace_upgradable(os) || dmu_objset_projectquota_upgradable(os)) { mutex_enter(&os->os_upgrade_lock); @@ -5552,11 +5691,14 @@ zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc) mutex_exit(&os->os_upgrade_lock); } + dsl_pool_rele(dmu_objset_pool(os), FTAG); + taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id); error = os->os_upgrade_status; + } else { + dsl_pool_rele(dmu_objset_pool(os), FTAG); } - dsl_dataset_long_rele(dmu_objset_ds(os), FTAG); dsl_dataset_rele(dmu_objset_ds(os), FTAG); return (error); @@ -5952,6 +6094,11 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) * ... * } */ +static const zfs_ioc_key_t zfs_keys_hold[] = { + {"holds", DATA_TYPE_NVLIST, 0}, + {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, +}; + /* ARGSUSED */ static int zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) @@ -5962,9 +6109,7 @@ zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) int error; minor_t minor = 0; - error = nvlist_lookup_nvlist(args, "holds", &holds); - if (error != 0) - return (SET_ERROR(EINVAL)); + holds = fnvlist_lookup_nvlist(args, "holds"); /* make sure the user didn't pass us any invalid (empty) tags */ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; @@ -5999,11 +6144,14 @@ zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) * ... * } */ +static const zfs_ioc_key_t zfs_keys_get_holds[] = { + /* no nvl keys */ +}; + /* ARGSUSED */ static int zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) { - ASSERT3P(args, ==, NULL); return (dsl_dataset_get_holds(snapname, outnvl)); } @@ -6018,6 +6166,10 @@ zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) * ... 
* } */ +static const zfs_ioc_key_t zfs_keys_release[] = { + {"<snapname>...", DATA_TYPE_NVLIST, ZK_WILDCARDLIST}, +}; + /* ARGSUSED */ static int zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) @@ -6076,6 +6228,10 @@ zfs_ioc_space_written(zfs_cmd_t *zc) * "uncompressed" -> uncompressed space in bytes * } */ +static const zfs_ioc_key_t zfs_keys_space_snaps[] = { + {"firstsnap", DATA_TYPE_STRING, 0}, +}; + static int zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) { @@ -6085,8 +6241,7 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) char *firstsnap; uint64_t used, comp, uncomp; - if (nvlist_lookup_string(innvl, "firstsnap", &firstsnap) != 0) - return (SET_ERROR(EINVAL)); + firstsnap = fnvlist_lookup_string(innvl, "firstsnap"); error = dsl_pool_hold(lastsnap, FTAG, &dp); if (error != 0) @@ -6140,6 +6295,17 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) * * outnvl is unused */ +static const zfs_ioc_key_t zfs_keys_send_new[] = { + {"fd", DATA_TYPE_INT32, 0}, + {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, + {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL}, + {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL}, +}; + /* ARGSUSED */ static int zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) @@ -6155,9 +6321,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) uint64_t resumeobj = 0; uint64_t resumeoff = 0; - error = nvlist_lookup_int32(innvl, "fd", &fd); - if (error != 0) - return (SET_ERROR(EINVAL)); + fd = fnvlist_lookup_int32(innvl, "fd"); (void) nvlist_lookup_string(innvl, "fromsnap", &fromname); @@ -6202,6 +6366,15 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) * "space" -> bytes of space (uint64) * } */ +static const zfs_ioc_key_t zfs_keys_send_space[] = { + {"from", DATA_TYPE_STRING, ZK_OPTIONAL}, + {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, + {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, +}; + static int zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { @@ -6294,18 +6467,24 @@ out: * * onvl is unused */ +static const zfs_ioc_key_t zfs_keys_pool_sync[] = { + {"force", DATA_TYPE_BOOLEAN_VALUE, 0}, +}; + /* ARGSUSED */ static int zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl) { int err; - boolean_t force; + boolean_t force = B_FALSE; spa_t *spa; if ((err = spa_open(pool, &spa, FTAG)) != 0) return (err); - force = fnvlist_lookup_boolean_value(innvl, "force"); + if (innvl) + force = fnvlist_lookup_boolean_value(innvl, "force"); + if (force) { spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER); vdev_config_dirty(spa->spa_root_vdev); @@ -6327,6 +6506,11 @@ zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl) * presence indicated key should only be verified, not loaded * } */ +static const zfs_ioc_key_t zfs_keys_load_key[] = { + {"hidden_args", DATA_TYPE_NVLIST, 0}, + {"noop", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, +}; + /* ARGSUSED */ static int zfs_ioc_load_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) @@ -6341,11 +6525,7 @@ zfs_ioc_load_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) 
goto error; } - ret = nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); - if (ret != 0) { - ret = SET_ERROR(EINVAL); - goto error; - } + hidden_args = fnvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS); ret = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, hidden_args, &dcp); @@ -6369,6 +6549,10 @@ error: * Unload a user's wrapping key from the kernel. * Both innvl and outnvl are unused. */ +static const zfs_ioc_key_t zfs_keys_unload_key[] = { + /* no nvl keys */ +}; + /* ARGSUSED */ static int zfs_ioc_unload_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) @@ -6401,6 +6585,12 @@ out: * * outnvl is unused */ +static const zfs_ioc_key_t zfs_keys_change_key[] = { + {"crypt_cmd", DATA_TYPE_UINT64, ZK_OPTIONAL}, + {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, + {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, +}; + /* ARGSUSED */ static int zfs_ioc_change_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) @@ -6467,7 +6657,7 @@ static void zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist, - boolean_t allow_log) + boolean_t allow_log, const zfs_ioc_key_t *nvl_keys, size_t num_keys) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; @@ -6486,6 +6676,8 @@ zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func, vec->zvec_pool_check = pool_check; vec->zvec_smush_outnvlist = smush_outnvlist; vec->zvec_allow_log = allow_log; + vec->zvec_nvl_keys = nvl_keys; + vec->zvec_nvl_key_count = num_keys; } static void @@ -6549,104 +6741,141 @@ zfs_ioctl_init(void) { zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT, zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_snapshot, ARRAY_SIZE(zfs_keys_snapshot)); zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY, zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE); + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, + zfs_keys_log_history, ARRAY_SIZE(zfs_keys_log_history)); zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS, zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME, - POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); + POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, + zfs_keys_space_snaps, ARRAY_SIZE(zfs_keys_space_snaps)); zfs_ioctl_register("send", ZFS_IOC_SEND_NEW, zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME, - POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); + POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, + zfs_keys_send_new, ARRAY_SIZE(zfs_keys_send_new)); zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE, zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME, - POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); + POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, + zfs_keys_send_space, ARRAY_SIZE(zfs_keys_send_space)); zfs_ioctl_register("create", ZFS_IOC_CREATE, zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_create, ARRAY_SIZE(zfs_keys_create)); zfs_ioctl_register("clone", ZFS_IOC_CLONE, zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_clone, ARRAY_SIZE(zfs_keys_clone)); 
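The registration pattern above is easier to see in isolation. The following is a minimal user-space sketch, not the kernel code: data_type_t, the ZK_* flag values, ARRAY_SIZE, and the toy pair_t standing in for nvpair_t are all illustrative stand-ins for the real <sys/zfs_ioctl.h> definitions. It shows how a zfs_ioc_key_t table drives the per-pair check that zfs_check_input_nvpairs() performs (shown later in this diff), which is what makes the fnvlist_lookup_*() conversions above safe: by the time a handler runs, every required pair is known to exist with the documented type.

#include <stdio.h>
#include <string.h>

/*
 * Stand-ins for the kernel types; the enum members and flag values
 * are illustrative, not the real <sys/zfs_ioctl.h> definitions.
 */
typedef enum {
	DATA_TYPE_ANY,
	DATA_TYPE_STRING,
	DATA_TYPE_INT32,
	DATA_TYPE_NVLIST
} data_type_t;

#define	ZK_OPTIONAL	(1 << 0)
#define	ZK_WILDCARDLIST	(1 << 1)

typedef struct zfs_ioc_key {
	const char	*zkey_name;
	data_type_t	zkey_type;
	int		zkey_flags;
} zfs_ioc_key_t;

#define	ARRAY_SIZE(a)	(sizeof (a) / sizeof ((a)[0]))

/* A toy (name, type) pair standing in for an nvpair_t. */
typedef struct {
	const char	*name;
	data_type_t	type;
} pair_t;

/*
 * Check one input pair against a key table, the way
 * zfs_check_input_nvpairs() checks each nvpair in innvl: an
 * unrecognized name is rejected outright, and a recognized name
 * must carry the documented type.
 */
static int
pair_allowed(const pair_t *p, const zfs_ioc_key_t *keys, size_t nkeys)
{
	for (size_t k = 0; k < nkeys; k++) {
		/* if not a wildcard entry, the name must match exactly */
		if ((keys[k].zkey_flags & ZK_WILDCARDLIST) == 0 &&
		    strcmp(keys[k].zkey_name, p->name) != 0)
			continue;
		return (keys[k].zkey_type == DATA_TYPE_ANY ||
		    keys[k].zkey_type == p->type);
	}
	return (0);	/* would map to ZFS_ERR_IOC_ARG_UNAVAIL */
}

int
main(void)
{
	/* Mirrors zfs_keys_hold from the diff above. */
	static const zfs_ioc_key_t zfs_keys_hold[] = {
		{ "holds", DATA_TYPE_NVLIST, 0 },
		{ "cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL },
	};
	pair_t ok = { "holds", DATA_TYPE_NVLIST };
	pair_t bad_type = { "holds", DATA_TYPE_STRING };
	pair_t unknown = { "bogus", DATA_TYPE_STRING };

	printf("%d %d %d\n",	/* prints: 1 0 0 */
	    pair_allowed(&ok, zfs_keys_hold, ARRAY_SIZE(zfs_keys_hold)),
	    pair_allowed(&bad_type, zfs_keys_hold, ARRAY_SIZE(zfs_keys_hold)),
	    pair_allowed(&unknown, zfs_keys_hold, ARRAY_SIZE(zfs_keys_hold)));
	return (0);
}

Compiled and run, this prints "1 0 0": the well-formed pair passes, the mistyped pair fails the type check (ZFS_ERR_IOC_ARG_BADTYPE in the kernel), and the unrecognized name fails outright (ZFS_ERR_IOC_ARG_UNAVAIL).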
zfs_ioctl_register("remap", ZFS_IOC_REMAP, zfs_ioc_remap, zfs_secpolicy_remap, DATASET_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE); + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, + zfs_keys_remap, ARRAY_SIZE(zfs_keys_remap)); zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS, zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_destroy_snaps, ARRAY_SIZE(zfs_keys_destroy_snaps)); zfs_ioctl_register("hold", ZFS_IOC_HOLD, zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_hold, ARRAY_SIZE(zfs_keys_hold)); zfs_ioctl_register("release", ZFS_IOC_RELEASE, zfs_ioc_release, zfs_secpolicy_release, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_release, ARRAY_SIZE(zfs_keys_release)); zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS, zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, - POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); + POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, + zfs_keys_get_holds, ARRAY_SIZE(zfs_keys_get_holds)); zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK, zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE); + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, + zfs_keys_rollback, ARRAY_SIZE(zfs_keys_rollback)); zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK, zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_bookmark, ARRAY_SIZE(zfs_keys_bookmark)); zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS, zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME, - POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); + POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, + zfs_keys_get_bookmarks, ARRAY_SIZE(zfs_keys_get_bookmarks)); zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS, zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_destroy_bookmarks, + ARRAY_SIZE(zfs_keys_destroy_bookmarks)); + + zfs_ioctl_register("load-key", ZFS_IOC_LOAD_KEY, + zfs_ioc_load_key, zfs_secpolicy_load_key, + DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, + zfs_keys_load_key, ARRAY_SIZE(zfs_keys_load_key)); + zfs_ioctl_register("unload-key", ZFS_IOC_UNLOAD_KEY, + zfs_ioc_unload_key, zfs_secpolicy_load_key, + DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, + zfs_keys_unload_key, ARRAY_SIZE(zfs_keys_unload_key)); + zfs_ioctl_register("change-key", ZFS_IOC_CHANGE_KEY, + zfs_ioc_change_key, zfs_secpolicy_change_key, + DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, + B_TRUE, B_TRUE, zfs_keys_change_key, + ARRAY_SIZE(zfs_keys_change_key)); + + zfs_ioctl_register("sync", ZFS_IOC_POOL_SYNC, + zfs_ioc_pool_sync, zfs_secpolicy_none, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, + zfs_keys_pool_sync, ARRAY_SIZE(zfs_keys_pool_sync)); zfs_ioctl_register("channel_program", ZFS_IOC_CHANNEL_PROGRAM, zfs_ioc_channel_program, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | 
POOL_CHECK_READONLY, B_TRUE, - B_TRUE); + B_TRUE, zfs_keys_channel_program, + ARRAY_SIZE(zfs_keys_channel_program)); zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT, zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_pool_checkpoint, ARRAY_SIZE(zfs_keys_pool_checkpoint)); zfs_ioctl_register("zpool_discard_checkpoint", ZFS_IOC_POOL_DISCARD_CHECKPOINT, zfs_ioc_pool_discard_checkpoint, zfs_secpolicy_config, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_pool_discard_checkpoint, + ARRAY_SIZE(zfs_keys_pool_discard_checkpoint)); zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE, zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_pool_initialize, ARRAY_SIZE(zfs_keys_pool_initialize)); zfs_ioctl_register("trim", ZFS_IOC_POOL_TRIM, zfs_ioc_pool_trim, zfs_secpolicy_config, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_pool_trim, ARRAY_SIZE(zfs_keys_pool_trim)); - zfs_ioctl_register("sync", ZFS_IOC_POOL_SYNC, - zfs_ioc_pool_sync, zfs_secpolicy_none, POOL_NAME, - POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE); + zfs_ioctl_register("set_bootenv", ZFS_IOC_SET_BOOTENV, + zfs_ioc_set_bootenv, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, + zfs_keys_set_bootenv, ARRAY_SIZE(zfs_keys_set_bootenv)); - zfs_ioctl_register("load-key", ZFS_IOC_LOAD_KEY, - zfs_ioc_load_key, zfs_secpolicy_load_key, - DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE); - zfs_ioctl_register("unload-key", ZFS_IOC_UNLOAD_KEY, - zfs_ioc_unload_key, zfs_secpolicy_load_key, - DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE); - zfs_ioctl_register("change-key", ZFS_IOC_CHANGE_KEY, - zfs_ioc_change_key, zfs_secpolicy_change_key, - DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, - B_TRUE, B_TRUE); + zfs_ioctl_register("get_bootenv", ZFS_IOC_GET_BOOTENV, + zfs_ioc_get_bootenv, zfs_secpolicy_none, POOL_NAME, + POOL_CHECK_SUSPENDED, B_FALSE, B_TRUE, + zfs_keys_get_bootenv, ARRAY_SIZE(zfs_keys_get_bootenv)); /* IOCTLS that use the legacy function signature */ @@ -6783,6 +7012,80 @@ zfs_ioctl_init(void) POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } +/* + * Verify that for non-legacy ioctls the input nvlist + * pairs match against the expected input. 
+ * + * Possible errors are: + * ZFS_ERR_IOC_ARG_UNAVAIL An unrecognized nvpair was encountered + * ZFS_ERR_IOC_ARG_REQUIRED A required nvpair is missing + * ZFS_ERR_IOC_ARG_BADTYPE Invalid type for nvpair + */ +static int +zfs_check_input_nvpairs(nvlist_t *innvl, const zfs_ioc_vec_t *vec) +{ + const zfs_ioc_key_t *nvl_keys = vec->zvec_nvl_keys; + boolean_t required_keys_found = B_FALSE; + + /* + * examine each input pair + */ + for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); + pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { + char *name = nvpair_name(pair); + data_type_t type = nvpair_type(pair); + boolean_t identified = B_FALSE; + + /* + * check pair against the documented names and type + */ + for (int k = 0; k < vec->zvec_nvl_key_count; k++) { + /* if not a wild card name, check for an exact match */ + if ((nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) == 0 && + strcmp(nvl_keys[k].zkey_name, name) != 0) + continue; + + identified = B_TRUE; + + if (nvl_keys[k].zkey_type != DATA_TYPE_ANY && + nvl_keys[k].zkey_type != type) { + return (SET_ERROR(ZFS_ERR_IOC_ARG_BADTYPE)); + } + + if (nvl_keys[k].zkey_flags & ZK_OPTIONAL) + continue; + + required_keys_found = B_TRUE; + break; + } + + /* allow an 'optional' key, everything else is invalid */ + if (!identified && + (strcmp(name, "optional") != 0 || + type != DATA_TYPE_NVLIST)) { + return (SET_ERROR(ZFS_ERR_IOC_ARG_UNAVAIL)); + } + } + + /* verify that all required keys were found */ + for (int k = 0; k < vec->zvec_nvl_key_count; k++) { + if (nvl_keys[k].zkey_flags & ZK_OPTIONAL) + continue; + + if (nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) { + /* at least one non-optionial key is expected here */ + if (!required_keys_found) + return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED)); + continue; + } + + if (!nvlist_exists(innvl, nvl_keys[k].zkey_name)) + return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED)); + } + + return (0); +} + int pool_status_check(const char *name, zfs_ioc_namecheck_t type, zfs_ioc_poolcheck_t check) @@ -6933,9 +7236,16 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip)); if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) - return (SET_ERROR(EINVAL)); + return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); vec = &zfs_ioc_vec[vecnum]; + /* + * The registered ioctl list may be sparse, verify that either + * a normal or legacy handler are registered. + */ + if (vec->zvec_func == NULL && vec->zvec_legacy_func == NULL) + return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); + zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag); @@ -6978,6 +7288,19 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) break; } + /* + * Ensure that all input pairs are valid before we pass them down + * to the lower layers. + * + * The vectored functions can use fnvlist_lookup_{type} for any + * required pairs since zfs_check_input_nvpairs() confirmed that + * they exist and are of the correct type. 
+ */ + if (error == 0 && vec->zvec_func != NULL) { + error = zfs_check_input_nvpairs(innvl, vec); + if (error != 0) + goto out; + } if (error == 0) error = vec->zvec_secpolicy(zc, innvl, cr); diff --git a/usr/src/uts/common/fs/zfs/zfs_onexit.c b/usr/src/uts/common/fs/zfs/zfs_onexit.c index 4ae8dc29a0..99e530ca3c 100644 --- a/usr/src/uts/common/fs/zfs/zfs_onexit.c +++ b/usr/src/uts/common/fs/zfs/zfs_onexit.c @@ -125,13 +125,18 @@ zfs_onexit_fd_hold(int fd, minor_t *minorp) { file_t *fp; zfs_onexit_t *zo; + int ret; fp = getf(fd); if (fp == NULL) return (SET_ERROR(EBADF)); *minorp = getminor(fp->f_vnode->v_rdev); - return (zfs_onexit_minor_to_state(*minorp, &zo)); + ret = zfs_onexit_minor_to_state(*minorp, &zo); + if (ret != 0) + releasef(fd); + + return (ret); } void diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c index 6b61cd7a84..86d83e7ace 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -20,11 +20,12 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. All rights reserved. - * Copyright 2019 Joyent, Inc. + * Copyright 2020 Joyent, Inc. + * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org> + * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -64,10 +65,12 @@ #include <sys/zfs_ctldir.h> #include <sys/zfs_fuid.h> #include <sys/bootconf.h> +#include <sys/ddi.h> #include <sys/sunddi.h> #include <sys/dnlc.h> #include <sys/dmu_objset.h> #include <sys/spa_boot.h> +#include <sys/vdev_impl.h> #include "zfs_comutil.h" int zfsfstype; @@ -172,7 +175,7 @@ zfs_sync(vfs_t *vfsp, short flag, cred_t *cr) } else { /* * Sync all ZFS filesystems. This is what happens when you - * run sync(1M). Unlike other filesystems, ZFS honors the + * run sync(8). Unlike other filesystems, ZFS honors the * request by waiting for all pools to commit all dirty data. */ spa_sync_allpools(); @@ -921,8 +924,13 @@ zfs_id_overobjquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id) int err; if (!dmu_objset_userobjspace_present(zfsvfs->z_os)) { - if (dmu_objset_userobjspace_upgradable(zfsvfs->z_os)) + if (dmu_objset_userobjspace_upgradable(zfsvfs->z_os)) { + dsl_pool_config_enter( + dmu_objset_pool(zfsvfs->z_os), FTAG); dmu_objset_id_quota_upgrade(zfsvfs->z_os); + dsl_pool_config_exit( + dmu_objset_pool(zfsvfs->z_os), FTAG); + } return (B_FALSE); } @@ -1711,6 +1719,36 @@ zfs_mount_label_policy(vfs_t *vfsp, char *osname) return (retv); } +/* + * Load a string-valued boot property and attempt to convert it to a 64-bit + * unsigned integer. If the value is not present, or the conversion fails, + * return the provided default value. + */ +static uint64_t +spa_get_bootprop_uint64(const char *name, uint64_t defval) +{ + char *propval; + u_longlong_t r; + int e; + + if ((propval = spa_get_bootprop(name)) == NULL) { + /* + * The property does not exist. + */ + return (defval); + } + + e = ddi_strtoull(propval, NULL, 10, &r); + + spa_free_bootprop(propval); + + /* + * If the conversion succeeded, return the value. If there was any + * kind of failure, just return the default value. + */ + return (e == 0 ? 
r : defval); +} + static int zfs_mountroot(vfs_t *vfsp, enum whymountroot why) { @@ -1721,6 +1759,8 @@ zfs_mountroot(vfs_t *vfsp, enum whymountroot why) vnode_t *vp = NULL; char *zfs_bootfs; char *zfs_devid; + uint64_t zfs_bootpool; + uint64_t zfs_bootvdev; ASSERT(vfsp); @@ -1732,6 +1772,7 @@ zfs_mountroot(vfs_t *vfsp, enum whymountroot why) if (why == ROOT_INIT) { if (zfsrootdone++) return (SET_ERROR(EBUSY)); + /* * the process of doing a spa_load will require the * clock to be set before we could (for example) do @@ -1746,23 +1787,47 @@ zfs_mountroot(vfs_t *vfsp, enum whymountroot why) return (SET_ERROR(EINVAL)); } zfs_devid = spa_get_bootprop("diskdevid"); - error = spa_import_rootpool(rootfs.bo_name, zfs_devid); - if (zfs_devid) - spa_free_bootprop(zfs_devid); - if (error) { + + /* + * The boot loader may also provide us with the GUID for both + * the pool and the nominated boot vdev. A GUID value of 0 is + * explicitly invalid (see "spa_change_guid()"), so we use this + * as a sentinel value when no GUID is present. + */ + zfs_bootpool = spa_get_bootprop_uint64("zfs-bootpool", 0); + zfs_bootvdev = spa_get_bootprop_uint64("zfs-bootvdev", 0); + + /* + * Initialise the early boot device rescan mechanism. A scan + * will not actually be performed unless we need to do so in + * order to find the correct /devices path for a relocated + * device. + */ + vdev_disk_preroot_init(); + + error = spa_import_rootpool(rootfs.bo_name, zfs_devid, + zfs_bootpool, zfs_bootvdev); + + spa_free_bootprop(zfs_devid); + + if (error != 0) { spa_free_bootprop(zfs_bootfs); + vdev_disk_preroot_fini(); cmn_err(CE_NOTE, "spa_import_rootpool: error %d", error); return (error); } + if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) { spa_free_bootprop(zfs_bootfs); + vdev_disk_preroot_fini(); cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d", error); return (error); } spa_free_bootprop(zfs_bootfs); + vdev_disk_preroot_fini(); if (error = vfs_lock(vfsp)) return (error); @@ -1832,7 +1897,7 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) mutex_enter(&mvp->v_lock); if ((uap->flags & MS_REMOUNT) == 0 && (uap->flags & MS_OVERLAY) == 0 && - (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { + (vn_count(mvp) != 1 || (mvp->v_flag & VROOT))) { mutex_exit(&mvp->v_lock); return (SET_ERROR(EBUSY)); } @@ -2169,18 +2234,34 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) * Our count is maintained in the vfs structure, but the * number is off by 1 to indicate a hold on the vfs * structure itself. - * - * The '.zfs' directory maintains a reference of its - * own, and any active references underneath are - * reflected in the vnode count. */ - if (zfsvfs->z_ctldir == NULL) { - if (vfsp->vfs_count > 1) - return (SET_ERROR(EBUSY)); - } else { - if (vfsp->vfs_count > 2 || - zfsvfs->z_ctldir->v_count > 1) - return (SET_ERROR(EBUSY)); + boolean_t draining; + uint_t thresh = 1; + + /* + * The '.zfs' directory maintains a reference of its own, and + * any active references underneath are reflected in the vnode + * count. Allow one additional reference for it. + */ + if (zfsvfs->z_ctldir != NULL) + thresh++; + + /* + * If it's running, the asynchronous unlinked drain task needs + * to be stopped before the number of active vnodes can be + * reliably checked. 
+ */ + draining = zfsvfs->z_draining; + if (draining) + zfs_unlinked_drain_stop_wait(zfsvfs); + + if (vfsp->vfs_count > thresh || (zfsvfs->z_ctldir != NULL && + zfsvfs->z_ctldir->v_count > 1)) { + if (draining) { + /* If it was draining, restart the task */ + zfs_unlinked_drain(zfsvfs); + } + return (SET_ERROR(EBUSY)); } } diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c index 99011b83b4..dd58b4a549 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vnops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c @@ -738,6 +738,57 @@ out: return (error); } +static void +zfs_write_clear_setid_bits_if_necessary(zfsvfs_t *zfsvfs, znode_t *zp, + cred_t *cr, boolean_t *did_check, dmu_tx_t *tx) +{ + ASSERT(did_check != NULL); + ASSERT(tx != NULL); + + if (*did_check) + return; + + zilog_t *zilog = zfsvfs->z_log; + + /* + * Clear Set-UID/Set-GID bits on successful write if not + * privileged and at least one of the execute bits is set. + * + * It would be nice to do this after all writes have + * been done, but that would still expose the ISUID/ISGID + * to another app after the partial write is committed. + * + * Note: we don't call zfs_fuid_map_id() here because + * user 0 is not an ephemeral uid. + */ + mutex_enter(&zp->z_acl_lock); + if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && + (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && + secpolicy_vnode_setid_retain(cr, + ((zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0)) != 0) { + uint64_t newmode; + vattr_t va; + + zp->z_mode &= ~(S_ISUID | S_ISGID); + newmode = zp->z_mode; + (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), + (void *)&newmode, sizeof (uint64_t), tx); + + /* + * Make sure SUID/SGID bits will be removed when we replay the + * log. + */ + bzero(&va, sizeof (va)); + va.va_mask = AT_MODE; + va.va_nodeid = zp->z_id; + va.va_mode = newmode; + zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va, AT_MODE, NULL); + } + mutex_exit(&zp->z_acl_lock); + + *did_check = B_TRUE; +} + /* * Write the bytes to a file. * @@ -784,6 +835,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) int count = 0; sa_bulk_attr_t bulk[4]; uint64_t mtime[2], ctime[2]; + boolean_t did_clear_setid_bits = B_FALSE; /* * Fasttrack empty write @@ -973,6 +1025,11 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) } /* + * NB: We must call zfs_write_clear_setid_bits_if_necessary + * before committing the transaction! + */ + + /* * If rangelock_enter() over-locked we grow the blocksize * and then reduce the lock range. This will only happen * on the first iteration since rangelock_reduce() will @@ -1049,30 +1106,8 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) break; } - /* - * Clear Set-UID/Set-GID bits on successful write if not - * privileged and at least one of the excute bits is set. - * - * It would be nice to to this after all writes have - * been done, but that would still expose the ISUID/ISGID - * to another app after the partial write is committed. - * - * Note: we don't call zfs_fuid_map_id() here because - * user 0 is not an ephemeral uid. 
- */ - mutex_enter(&zp->z_acl_lock); - if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | - (S_IXUSR >> 6))) != 0 && - (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && - secpolicy_vnode_setid_retain(cr, - (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { - uint64_t newmode; - zp->z_mode &= ~(S_ISUID | S_ISGID); - newmode = zp->z_mode; - (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), - (void *)&newmode, sizeof (uint64_t), tx); - } - mutex_exit(&zp->z_acl_lock); + zfs_write_clear_setid_bits_if_necessary(zfsvfs, zp, cr, + &did_clear_setid_bits, tx); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); @@ -1100,6 +1135,11 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) prev_error = error; error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + /* + * NB: During replay, the TX_SETATTR record logged by + * zfs_write_clear_setid_bits_if_necessary must precede + * any of the TX_WRITE records logged here. + */ zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); dmu_tx_commit(tx); @@ -4839,7 +4879,7 @@ zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, { if (vp->v_type == VDIR) return (0); - return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); + return ((*noffp < 0) ? EINVAL : 0); } /* @@ -5147,27 +5187,6 @@ zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, return (0); } -/* - * The reason we push dirty pages as part of zfs_delmap() is so that we get a - * more accurate mtime for the associated file. Since we don't have a way of - * detecting when the data was actually modified, we have to resort to - * heuristics. If an explicit msync() is done, then we mark the mtime when the - * last page is pushed. The problem occurs when the msync() call is omitted, - * which by far the most common case: - * - * open() - * mmap() - * <modify memory> - * munmap() - * close() - * <time lapse> - * putpage() via fsflush - * - * If we wait until fsflush to come along, we can have a modification time that - * is some arbitrary point in the future. In order to prevent this in the - * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is - * torn down. 
- */ /* ARGSUSED */ static int zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, @@ -5610,7 +5629,6 @@ zfs_isdir() /* * Directory vnode operations template */ -vnodeops_t *zfs_dvnodeops; const fs_operation_def_t zfs_dvnodeops_template[] = { VOPNAME_OPEN, { .vop_open = zfs_open }, VOPNAME_CLOSE, { .vop_close = zfs_close }, @@ -5643,7 +5661,6 @@ const fs_operation_def_t zfs_dvnodeops_template[] = { /* * Regular file vnode operations template */ -vnodeops_t *zfs_fvnodeops; const fs_operation_def_t zfs_fvnodeops_template[] = { VOPNAME_OPEN, { .vop_open = zfs_open }, VOPNAME_CLOSE, { .vop_close = zfs_close }, @@ -5678,7 +5695,6 @@ const fs_operation_def_t zfs_fvnodeops_template[] = { /* * Symbolic link vnode operations template */ -vnodeops_t *zfs_symvnodeops; const fs_operation_def_t zfs_symvnodeops_template[] = { VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, VOPNAME_SETATTR, { .vop_setattr = zfs_setattr }, @@ -5695,7 +5711,6 @@ const fs_operation_def_t zfs_symvnodeops_template[] = { /* * special share hidden files vnode operations template */ -vnodeops_t *zfs_sharevnodeops; const fs_operation_def_t zfs_sharevnodeops_template[] = { VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, VOPNAME_ACCESS, { .vop_access = zfs_access }, @@ -5721,7 +5736,6 @@ const fs_operation_def_t zfs_sharevnodeops_template[] = { * zfs_link() - no links into/out of attribute space * zfs_rename() - no moves into/out of attribute space */ -vnodeops_t *zfs_xdvnodeops; const fs_operation_def_t zfs_xdvnodeops_template[] = { VOPNAME_OPEN, { .vop_open = zfs_open }, VOPNAME_CLOSE, { .vop_close = zfs_close }, @@ -5752,7 +5766,6 @@ const fs_operation_def_t zfs_xdvnodeops_template[] = { /* * Error vnode operations template */ -vnodeops_t *zfs_evnodeops; const fs_operation_def_t zfs_evnodeops_template[] = { VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c index 257d5b2a35..84ba5947fa 100644 --- a/usr/src/uts/common/fs/zfs/zfs_znode.c +++ b/usr/src/uts/common/fs/zfs/zfs_znode.c @@ -1246,6 +1246,8 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) if (zp == NULL) { err = SET_ERROR(ENOENT); } else { + if (zp->z_links == 0) + zp->z_unlinked = B_TRUE; *zpp = zp; } ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index b02363e7eb..b32dffd79c 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -24,8 +24,8 @@ * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] - * Copyright (c) 2019 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. + * Copyright 2020 Joyent, Inc. 
*/ #include <sys/sysmacros.h> @@ -51,6 +51,7 @@ #include <sys/abd.h> #include <sys/cityhash.h> #include <sys/dsl_crypt.h> +#include <sys/stdbool.h> /* * ========================================================================== @@ -482,7 +483,7 @@ error: zio->io_error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, &zio->io_bookmark); - zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, + (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, &zio->io_bookmark, zio, 0, 0); } } else { @@ -1120,7 +1121,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_min_claim_txg(spa)); ASSERT(txg == spa_min_claim_txg(spa) || txg == 0); - ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ + ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(8) */ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, @@ -1858,10 +1859,36 @@ zio_execute(zio_t *zio) return; } +#ifdef _KERNEL + /* + * The I/O pipeline is a part of the machinery responsible for + * evacuation of memory pages to disk when we are under + * sufficient memory pressure for pageout to run. By setting + * this flag, allocations may dip into pages in the pageout + * reserved pool in order to try to make forward progress. + */ + bool set_pushpage = false; + if (!(curthread->t_flag & T_PUSHPAGE)) { + /* + * We can be called recursively, so we need to remember + * if this frame was the one that first set the flag or + * not. + */ + set_pushpage = true; + curthread->t_flag |= T_PUSHPAGE; + } +#endif + zio->io_stage = stage; zio->io_pipeline_trace |= zio->io_stage; rv = zio_pipeline[highbit64(stage) - 1](zio); +#ifdef _KERNEL + if (set_pushpage) { + curthread->t_flag &= ~T_PUSHPAGE; + } +#endif + if (rv == ZIO_PIPELINE_STOP) return; @@ -1990,7 +2017,11 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) "failure and the failure mode property for this pool " "is set to panic.", spa_name(spa)); - zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, + cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O " + "failure and has been suspended; `zpool clear` will be required " + "before the pool can be written to.", spa_name(spa)); + + (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, NULL, 0, 0); mutex_enter(&spa->spa_suspend_lock); @@ -4260,7 +4291,7 @@ zio_done(zio_t *zio) zio->io_vd->vdev_stat.vs_slow_ios++; mutex_exit(&zio->io_vd->vdev_stat_lock); - zfs_ereport_post(FM_EREPORT_ZFS_DELAY, + (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0, 0); } @@ -4275,7 +4306,7 @@ zio_done(zio_t *zio) * device is currently unavailable. */ if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) - zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, + (void) zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, &zio->io_bookmark, zio, 0, 0); if ((zio->io_error == EIO || !(zio->io_flags & @@ -4286,7 +4317,7 @@ zio_done(zio_t *zio) * error and generate a logical data ereport. 
*/ spa_log_error(spa, &zio->io_bookmark); - zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, + (void) zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, &zio->io_bookmark, zio, 0, 0); } } diff --git a/usr/src/uts/common/fs/zfs/zio_crypt.c b/usr/src/uts/common/fs/zfs/zio_crypt.c index 78c26e3e90..9541a0a734 100644 --- a/usr/src/uts/common/fs/zfs/zio_crypt.c +++ b/usr/src/uts/common/fs/zfs/zio_crypt.c @@ -1061,14 +1061,17 @@ zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version, dnode_phys_t *adnp; boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); crypto_data_t cd; - uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)]; + uint8_t tmp_dncore[sizeof (dnode_phys_t)]; + adnp = (dnode_phys_t *)tmp_dncore; cd.cd_format = CRYPTO_DATA_RAW; cd.cd_offset = 0; + cd.cd_length = offsetof(dnode_phys_t, dn_blkptr); + cd.cd_raw.iov_base = (char *)adnp; + cd.cd_raw.iov_len = cd.cd_length; /* authenticate the core dnode (masking out non-portable bits) */ - bcopy(dnp, tmp_dncore, sizeof (tmp_dncore)); - adnp = (dnode_phys_t *)tmp_dncore; + bcopy(dnp, tmp_dncore, cd.cd_length); if (le_bswap) { adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec); adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen); @@ -1078,10 +1081,6 @@ zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version, adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; adnp->dn_used = 0; - cd.cd_length = sizeof (tmp_dncore); - cd.cd_raw.iov_base = (char *)adnp; - cd.cd_raw.iov_len = cd.cd_length; - ret = crypto_mac_update(ctx, &cd, NULL); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); diff --git a/usr/src/uts/common/fs/zfs/zio_inject.c b/usr/src/uts/common/fs/zfs/zio_inject.c index a65721d175..e332da9672 100644 --- a/usr/src/uts/common/fs/zfs/zio_inject.c +++ b/usr/src/uts/common/fs/zfs/zio_inject.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2017, Intel Corporation. */ /* @@ -100,6 +101,26 @@ static kmutex_t inject_delay_mtx; static int inject_next_id = 1; /* + * Test if the requested frequency was triggered + */ +static boolean_t +freq_triggered(uint32_t frequency) +{ + /* + * zero implies always (100%) + */ + if (frequency == 0) + return (B_TRUE); + + /* + * Note: we still handle legacy (unscaled) frequecy values + */ + uint32_t maximum = (frequency <= 100) ? 100 : ZI_PERCENTAGE_MAX; + + return (spa_get_random(maximum) < frequency); +} + +/* * Returns true if the given record matches the I/O in progress. 
*/ static boolean_t @@ -114,8 +135,7 @@ zio_match_handler(zbookmark_phys_t *zb, uint64_t type, int dva, record->zi_object == DMU_META_DNODE_OBJECT) { if (record->zi_type == DMU_OT_NONE || type == record->zi_type) - return (record->zi_freq == 0 || - spa_get_random(100) < record->zi_freq); + return (freq_triggered(record->zi_freq)); else return (B_FALSE); } @@ -130,8 +150,7 @@ zio_match_handler(zbookmark_phys_t *zb, uint64_t type, int dva, zb->zb_blkid <= record->zi_end && (record->zi_dvas == 0 || (record->zi_dvas & (1ULL << dva))) && error == record->zi_error) { - return (record->zi_freq == 0 || - spa_get_random(100) < record->zi_freq); + return (freq_triggered(record->zi_freq)); } return (B_FALSE); @@ -360,6 +379,12 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error) if (handler->zi_record.zi_error == error) { /* + * limit error injection if requested + */ + if (!freq_triggered(handler->zi_record.zi_freq)) + continue; + + /* * For a failed open, pretend like the device * has gone away. */ @@ -527,6 +552,9 @@ zio_handle_io_delay(zio_t *zio) if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO) continue; + if (!freq_triggered(handler->zi_record.zi_freq)) + continue; + if (vd->vdev_guid != handler->zi_record.zi_guid) continue; diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c index 3ed5977c20..1b3bc07600 100644 --- a/usr/src/uts/common/fs/zfs/zvol.c +++ b/usr/src/uts/common/fs/zfs/zvol.c @@ -24,7 +24,7 @@ * Portions Copyright 2010 Robert Milkowski * * Copyright 2017 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2019 Joyent, Inc. */ @@ -1158,13 +1158,13 @@ zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size, P2BOUNDARY(offset, size, zv->zv_volblocksize)) { return (SET_ERROR(EINVAL)); } - ASSERT(size <= zv->zv_volblocksize); + VERIFY3U(size, <=, zv->zv_volblocksize); /* Locate the extent this belongs to */ - ze = list_head(&zv->zv_extents); - while (offset >= ze->ze_nblks * zv->zv_volblocksize) { + for (ze = list_head(&zv->zv_extents); + ze != NULL && offset >= ze->ze_nblks * zv->zv_volblocksize; + ze = list_next(&zv->zv_extents, ze)) { offset -= ze->ze_nblks * zv->zv_volblocksize; - ze = list_next(&zv->zv_extents, ze); } if (ze == NULL) @@ -1232,7 +1232,7 @@ zvol_strategy(buf_t *bp) addr = bp->b_un.b_addr; resid = bp->b_bcount; - if (resid > 0 && (off < 0 || off >= volsize)) { + if (resid > 0 && off >= volsize) { bioerror(bp, EIO); biodone(bp); return (0); @@ -1499,7 +1499,7 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) bytes = volsize - off; tot_bytes += bytes; - dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); + dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); @@ -1709,7 +1709,7 @@ zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len, } /* - * Dirtbag ioctls to support mkfs(1M) for UFS filesystems. See dkio(7I). + * Dirtbag ioctls to support mkfs(8) for UFS filesystems. See dkio(4I). * Also a dirtbag dkio ioctl for unmap/free-block functionality. 
*/ /*ARGSUSED*/ @@ -1767,6 +1767,7 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) case DKIOCGMEDIAINFOEXT: { struct dk_minfo_ext dkmext; + size_t len; bzero(&dkmext, sizeof (dkmext)); dkmext.dki_lbsize = 1U << zv->zv_min_bs; @@ -1774,7 +1775,17 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) dkmext.dki_capacity = zv->zv_volsize >> zv->zv_min_bs; dkmext.dki_media_type = DK_UNKNOWN; mutex_exit(&zfsdev_state_lock); - if (ddi_copyout(&dkmext, (void *)arg, sizeof (dkmext), flag)) + + switch (ddi_model_convert_from(flag & FMODELS)) { + case DDI_MODEL_ILP32: + len = sizeof (struct dk_minfo_ext32); + break; + default: + len = sizeof (struct dk_minfo_ext); + break; + } + + if (ddi_copyout(&dkmext, (void *)arg, len, flag)) error = SET_ERROR(EFAULT); return (error); } |
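The DKIOCGMEDIAINFOEXT hunk above picks the copyout length based on the caller's data model rather than always using sizeof (struct dk_minfo_ext). The reason is alignment: dki_capacity is an 8-byte diskaddr_t, so an LP64 kernel pads the struct to a multiple of 8, while an ILP32 caller allocated the smaller 4-byte-aligned layout; copying the kernel's size would write past the end of the 32-bit caller's buffer. A small sketch, using simplified layouts rather than the real <sys/dkio.h> definitions, makes the difference visible:

#include <stdio.h>

typedef unsigned int uint_t;
typedef unsigned long long diskaddr_t;

/* Simplified layouts; the real structs live in <sys/dkio.h>. */
struct dk_minfo_ext {		/* natural (LP64) alignment */
	uint_t		dki_media_type;
	uint_t		dki_lbsize;
	diskaddr_t	dki_capacity;
	uint_t		dki_pbsize;
};

#pragma pack(4)
struct dk_minfo_ext32 {		/* ILP32 alignment: no trailing pad */
	uint_t		dki_media_type;
	uint_t		dki_lbsize;
	diskaddr_t	dki_capacity;
	uint_t		dki_pbsize;
};
#pragma pack()

int
main(void)
{
	/* On an LP64 build this prints 24 and 20. */
	printf("%zu %zu\n", sizeof (struct dk_minfo_ext),
	    sizeof (struct dk_minfo_ext32));
	return (0);
}

Since the LP64 struct is four bytes larger than the layout a 32-bit caller allocated, an unconditional copyout of sizeof (dkmext) would overrun the user buffer; hence the ddi_model_convert_from(flag & FMODELS) switch to select len.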

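One earlier hunk deserves a closing note: the T_PUSHPAGE handling added to zio_execute() must tolerate recursive entry, since only the frame that first set the flag may clear it. Below is a hedged user-space sketch of that idiom; the global t_flag stands in for curthread->t_flag, and the T_PUSHPAGE bit value is illustrative, not the real <sys/thread.h> definition.

#include <stdbool.h>
#include <stdio.h>

#define	T_PUSHPAGE	0x1000	/* illustrative bit value */

/* stand-in for curthread->t_flag */
static unsigned int t_flag;

/*
 * Mirrors the recursion-safe flag handling added to zio_execute():
 * only the frame that set T_PUSHPAGE clears it, so a recursive call
 * cannot strip the flag out from under its caller.
 */
static void
do_pipeline_stage(int depth)
{
	bool set_pushpage = false;

	if (!(t_flag & T_PUSHPAGE)) {
		/* remember that this frame was the one that set it */
		set_pushpage = true;
		t_flag |= T_PUSHPAGE;
	}

	printf("depth %d: T_PUSHPAGE %s\n", depth,
	    (t_flag & T_PUSHPAGE) ? "set" : "clear");

	if (depth < 2)
		do_pipeline_stage(depth + 1);	/* recursion keeps the flag */

	if (set_pushpage)
		t_flag &= ~T_PUSHPAGE;
}

int
main(void)
{
	do_pipeline_stage(0);
	printf("after: T_PUSHPAGE %s\n",
	    (t_flag & T_PUSHPAGE) ? "set" : "clear");
	return (0);
}

Each nested call observes the flag already set and leaves set_pushpage false, so the flag survives until the outermost frame unwinds, matching the diff's comment about remembering whether the current frame was the one that first set it.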