Diffstat (limited to 'usr/src/uts/common/fs')
-rw-r--r-- usr/src/uts/common/fs/bootfs/bootfs_vfsops.c | 2
-rw-r--r-- usr/src/uts/common/fs/dev/sdev_ptsops.c | 23
-rw-r--r-- usr/src/uts/common/fs/dnlc.c | 26
-rw-r--r-- usr/src/uts/common/fs/doorfs/door_sys.c | 14
-rw-r--r-- usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c | 2
-rw-r--r-- usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c | 3
-rw-r--r-- usr/src/uts/common/fs/lookup.c | 29
-rw-r--r-- usr/src/uts/common/fs/mntfs/mntvnops.c | 6
-rw-r--r-- usr/src/uts/common/fs/nfs/nfs4_client.c | 36
-rw-r--r-- usr/src/uts/common/fs/nfs/nfs4_idmap.c | 38
-rw-r--r-- usr/src/uts/common/fs/nfs/nfs4_srv.c | 6
-rw-r--r-- usr/src/uts/common/fs/nfs/nfs4_srv_attr.c | 4
-rw-r--r-- usr/src/uts/common/fs/nfs/nfs4_srv_ns.c | 1
-rw-r--r-- usr/src/uts/common/fs/nfs/nfs4_state.c | 1
-rw-r--r-- usr/src/uts/common/fs/nfs/nfs4_stub_vnops.c | 2
-rw-r--r-- usr/src/uts/common/fs/nfs/nfs4_subr.c | 4
-rw-r--r-- usr/src/uts/common/fs/nfs/nfs4_vnops.c | 6
-rw-r--r-- usr/src/uts/common/fs/nfs/nfs_auth.c | 31
-rw-r--r-- usr/src/uts/common/fs/nfs/nfs_cmd.c | 6
-rw-r--r-- usr/src/uts/common/fs/nfs/nfs_export.c | 27
-rw-r--r-- usr/src/uts/common/fs/nfs/nfs_server.c | 33
-rw-r--r-- usr/src/uts/common/fs/nfs/nfs_stats.c | 2
-rw-r--r-- usr/src/uts/common/fs/pcfs/pc_node.c | 2
-rw-r--r-- usr/src/uts/common/fs/pcfs/pc_vfsops.c | 10
-rw-r--r-- usr/src/uts/common/fs/pcfs/pc_vnops.c | 6
-rw-r--r-- usr/src/uts/common/fs/portfs/port.c | 2
-rw-r--r-- usr/src/uts/common/fs/portfs/port_fd.c | 2
-rw-r--r-- usr/src/uts/common/fs/portfs/port_fop.c | 101
-rw-r--r-- usr/src/uts/common/fs/proc/prioctl.c | 20
-rw-r--r-- usr/src/uts/common/fs/proc/prvnops.c | 5
-rw-r--r-- usr/src/uts/common/fs/smbclnt/netsmb/nsmb_sign_kcf.c | 2
-rw-r--r-- usr/src/uts/common/fs/smbclnt/smbfs/smbfs_smb.c | 4
-rw-r--r-- usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vnops.c | 15
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb2_dispatch.c | 86
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb2_durable.c | 29
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb2_fsctl_copychunk.c | 4
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb2_fsctl_fs.c | 27
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb2_fsctl_odx.c | 2
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb2_lease.c | 12
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb2_negotiate.c | 528
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb2_oplock.c | 27
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb2_qinfo_file.c | 73
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb2_qinfo_fs.c | 72
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb2_query_info.c | 46
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb2_read.c | 11
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb2_session_setup.c | 1
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb2_signing.c | 90
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb2_write.c | 3
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb31_preauth.c | 171
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb3_encrypt.c | 103
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb3_encrypt_kcf.c | 75
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb3_kdf.c | 137
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb_authenticate.c | 18
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb_cmn_oplock.c | 66
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb_common_open.c | 219
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb_fem.c | 11
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb_fsops.c | 380
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb_idmap.c | 14
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb_mbuf_marshaling.c | 30
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb_node.c | 35
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb_notify.c | 15
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb_ofile.c | 45
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb_sd.c | 46
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb_server.c | 2
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb_session.c | 4
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb_sign_kcf.c | 2
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb_srv_oplock.c | 8
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb_tree.c | 51
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb_user.c | 44
-rw-r--r-- usr/src/uts/common/fs/smbsrv/smb_write.c | 13
-rw-r--r-- usr/src/uts/common/fs/sockfs/nl7c.c | 2
-rw-r--r-- usr/src/uts/common/fs/sockfs/nl7clogd.c | 4
-rw-r--r-- usr/src/uts/common/fs/sockfs/sockcommon.c | 8
-rw-r--r-- usr/src/uts/common/fs/sockfs/sockcommon_sops.c | 2
-rw-r--r-- usr/src/uts/common/fs/sockfs/sockcommon_vnops.c | 44
-rw-r--r-- usr/src/uts/common/fs/sockfs/sockfilter.c | 2
-rw-r--r-- usr/src/uts/common/fs/sockfs/sockfilter_impl.h | 4
-rw-r--r-- usr/src/uts/common/fs/sockfs/sockparams.c | 2
-rw-r--r-- usr/src/uts/common/fs/sockfs/socksubr.c | 63
-rw-r--r-- usr/src/uts/common/fs/sockfs/socksyscalls.c | 62
-rw-r--r-- usr/src/uts/common/fs/sockfs/socktpi.c | 4
-rw-r--r-- usr/src/uts/common/fs/tmpfs/tmp_dir.c | 68
-rw-r--r-- usr/src/uts/common/fs/tmpfs/tmp_vfsops.c | 4
-rw-r--r-- usr/src/uts/common/fs/tmpfs/tmp_vnops.c | 2
-rw-r--r-- usr/src/uts/common/fs/ufs/lufs_log.c | 2
-rw-r--r-- usr/src/uts/common/fs/ufs/ufs_alloc.c | 4
-rw-r--r-- usr/src/uts/common/fs/ufs/ufs_dir.c | 21
-rw-r--r-- usr/src/uts/common/fs/ufs/ufs_inode.c | 9
-rw-r--r-- usr/src/uts/common/fs/ufs/ufs_vfsops.c | 2
-rw-r--r-- usr/src/uts/common/fs/vnode.c | 74
-rw-r--r-- usr/src/uts/common/fs/xattr.c | 2
-rw-r--r-- usr/src/uts/common/fs/zfs/abd.c | 183
-rw-r--r-- usr/src/uts/common/fs/zfs/arc.c | 2538
-rw-r--r-- usr/src/uts/common/fs/zfs/dbuf.c | 276
-rw-r--r-- usr/src/uts/common/fs/zfs/dmu.c | 10
-rw-r--r-- usr/src/uts/common/fs/zfs/dmu_objset.c | 57
-rw-r--r-- usr/src/uts/common/fs/zfs/dmu_recv.c | 14
-rw-r--r-- usr/src/uts/common/fs/zfs/dmu_send.c | 6
-rw-r--r-- usr/src/uts/common/fs/zfs/dmu_zfetch.c | 138
-rw-r--r-- usr/src/uts/common/fs/zfs/dnode.c | 94
-rw-r--r-- usr/src/uts/common/fs/zfs/dnode_sync.c | 49
-rw-r--r-- usr/src/uts/common/fs/zfs/dsl_crypt.c | 70
-rw-r--r-- usr/src/uts/common/fs/zfs/dsl_scan.c | 133
-rw-r--r-- usr/src/uts/common/fs/zfs/lua/README.zfs | 2
-rw-r--r-- usr/src/uts/common/fs/zfs/lua/ldebug.c | 2
-rw-r--r-- usr/src/uts/common/fs/zfs/metaslab.c | 6
-rw-r--r-- usr/src/uts/common/fs/zfs/sa.c | 1
-rw-r--r-- usr/src/uts/common/fs/zfs/spa.c | 98
-rw-r--r-- usr/src/uts/common/fs/zfs/spa_config.c | 3
-rw-r--r-- usr/src/uts/common/fs/zfs/spa_history.c | 2
-rw-r--r-- usr/src/uts/common/fs/zfs/spa_misc.c | 15
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/abd.h | 9
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/arc.h | 14
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/arc_impl.h | 876
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/dbuf.h | 34
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/dmu.h | 4
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/dmu_impl.h | 219
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h | 21
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/dnode.h | 1
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/dsl_dataset.h | 1
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/dsl_scan.h | 6
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/simd.h | 184
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/spa.h | 8
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/spa_boot.h | 5
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/spa_impl.h | 1
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/vdev.h | 7
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/vdev_impl.h | 45
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/vdev_raidz.h | 65
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/vdev_raidz_impl.h | 360
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/zfs_bootenv.h | 52
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h | 7
-rw-r--r-- usr/src/uts/common/fs/zfs/vdev.c | 169
-rw-r--r-- usr/src/uts/common/fs/zfs/vdev_disk.c | 181
-rw-r--r-- usr/src/uts/common/fs/zfs/vdev_indirect.c | 6
-rw-r--r-- usr/src/uts/common/fs/zfs/vdev_label.c | 222
-rw-r--r-- usr/src/uts/common/fs/zfs/vdev_raidz.c | 279
-rw-r--r-- usr/src/uts/common/fs/zfs/vdev_raidz_math.c | 573
-rw-r--r-- usr/src/uts/common/fs/zfs/vdev_raidz_math_avx2.c | 424
-rw-r--r-- usr/src/uts/common/fs/zfs/vdev_raidz_math_impl.h | 1477
-rw-r--r-- usr/src/uts/common/fs/zfs/vdev_raidz_math_scalar.c | 337
-rw-r--r-- usr/src/uts/common/fs/zfs/vdev_raidz_math_sse2.c | 642
-rw-r--r-- usr/src/uts/common/fs/zfs/vdev_raidz_math_ssse3.c | 2483
-rw-r--r-- usr/src/uts/common/fs/zfs/zcp.c | 7
-rw-r--r-- usr/src/uts/common/fs/zfs/zfs_dir.c | 2
-rw-r--r-- usr/src/uts/common/fs/zfs/zfs_fm.c | 2
-rw-r--r-- usr/src/uts/common/fs/zfs/zfs_ioctl.c | 545
-rw-r--r-- usr/src/uts/common/fs/zfs/zfs_onexit.c | 7
-rw-r--r-- usr/src/uts/common/fs/zfs/zfs_vfsops.c | 121
-rw-r--r-- usr/src/uts/common/fs/zfs/zfs_vnops.c | 117
-rw-r--r-- usr/src/uts/common/fs/zfs/zfs_znode.c | 2
-rw-r--r-- usr/src/uts/common/fs/zfs/zio.c | 45
-rw-r--r-- usr/src/uts/common/fs/zfs/zio_crypt.c | 13
-rw-r--r-- usr/src/uts/common/fs/zfs/zio_inject.c | 36
-rw-r--r-- usr/src/uts/common/fs/zfs/zvol.c | 29
154 files changed, 14192 insertions(+), 2499 deletions(-)
diff --git a/usr/src/uts/common/fs/bootfs/bootfs_vfsops.c b/usr/src/uts/common/fs/bootfs/bootfs_vfsops.c
index e642e86169..5b3171e0d1 100644
--- a/usr/src/uts/common/fs/bootfs/bootfs_vfsops.c
+++ b/usr/src/uts/common/fs/bootfs/bootfs_vfsops.c
@@ -93,7 +93,7 @@ bootfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
* there's nothing to be done about that.
*/
vfs_setresource(vfsp, bootfs_name, 0);
- bfs = kmem_zalloc(sizeof (bootfs_t), KM_NOSLEEP | KM_NORMALPRI);
+ bfs = kmem_zalloc(sizeof (bootfs_t), KM_NOSLEEP_LAZY);
if (bfs == NULL)
return (ENOMEM);
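
The KM_NOSLEEP_LAZY flag above replaces the KM_NOSLEEP | KM_NORMALPRI pair
throughout this patch. A minimal sketch of the resulting allocation pattern,
assuming the new flag is simply shorthand for that combination (every
substitution in this diff is consistent with that reading):

    /*
     * Sketch, not from the patch: KM_NOSLEEP_LAZY is assumed to be
     * (KM_NOSLEEP | KM_NORMALPRI), i.e. a non-blocking, low-priority
     * allocation that can return NULL and must be checked.
     */
    bootfs_t *bfs = kmem_zalloc(sizeof (bootfs_t), KM_NOSLEEP_LAZY);
    if (bfs == NULL)
            return (ENOMEM);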
diff --git a/usr/src/uts/common/fs/dev/sdev_ptsops.c b/usr/src/uts/common/fs/dev/sdev_ptsops.c
index 4d8f47397b..1b3f1561de 100644
--- a/usr/src/uts/common/fs/dev/sdev_ptsops.c
+++ b/usr/src/uts/common/fs/dev/sdev_ptsops.c
@@ -97,7 +97,6 @@ devpts_strtol(const char *nm, minor_t *mp)
* away, we use the validator to do deferred cleanup i.e. when such
* nodes are encountered during subsequent lookup() and readdir().
*/
-/*ARGSUSED*/
int
devpts_validate(struct sdev_node *dv)
{
@@ -124,8 +123,8 @@ devpts_validate(struct sdev_node *dv)
/*
* Check if pts driver is attached
*/
- if (ptms_slave_attached() == (major_t)-1) {
- sdcmn_err7(("devpts_validate: slave not attached\n"));
+ if (ptms_subsidiary_attached() == (major_t)-1) {
+ sdcmn_err7(("devpts_validate: subsidiary not attached\n"));
return (SDEV_VTOR_INVALID);
}
@@ -159,7 +158,6 @@ devpts_validate(struct sdev_node *dv)
* This callback is invoked from devname_lookup_func() to create
* a pts entry when the node is not found in the cache.
*/
-/*ARGSUSED*/
static int
devpts_create_rvp(struct sdev_node *ddv, char *nm,
void **arg, cred_t *cred, void *whatever, char *whichever)
@@ -177,12 +175,11 @@ devpts_create_rvp(struct sdev_node *ddv, char *nm,
}
/*
- * Check if pts driver is attached and if it is
- * get the major number.
+ * Check if pts driver is attached and if it is get the major number.
*/
- maj = ptms_slave_attached();
+ maj = ptms_subsidiary_attached();
if (maj == (major_t)-1) {
- sdcmn_err7(("devpts_create_rvp: slave not attached\n"));
+ sdcmn_err7(("devpts_create_rvp: subsidiary not attached\n"));
return (-1);
}
@@ -286,7 +283,6 @@ devpts_prunedir(struct sdev_node *ddv)
* access the realvp of the specfs node directly instead of using
* VOP_REALVP().
*/
-/*ARGSUSED3*/
static int
devpts_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
@@ -326,7 +322,6 @@ devpts_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
* - creating an existing dir read-only succeeds, otherwise EISDIR
* - exclusive creates fail - EEXIST
*/
-/*ARGSUSED2*/
static int
devpts_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
int mode, struct vnode **vpp, struct cred *cred, int flag,
@@ -359,11 +354,10 @@ devpts_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
}
/*
- * Display all instantiated pts (slave) device nodes.
- * A /dev/pts entry will be created only after the first lookup of the slave
- * device succeeds.
+ * Display all instantiated pts (subsidiary) device nodes.
+ * A /dev/pts entry will be created only after the first lookup of the
+ * subsidiary device succeeds.
*/
-/*ARGSUSED4*/
static int
devpts_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred,
int *eofp, caller_context_t *ct, int flags)
@@ -387,7 +381,6 @@ devpts_set_id(struct sdev_node *dv, struct vattr *vap, int protocol)
}
-/*ARGSUSED4*/
static int
devpts_setattr(struct vnode *vp, struct vattr *vap, int flags,
struct cred *cred, caller_context_t *ctp)
diff --git a/usr/src/uts/common/fs/dnlc.c b/usr/src/uts/common/fs/dnlc.c
index 102375dedd..b0edec758c 100644
--- a/usr/src/uts/common/fs/dnlc.c
+++ b/usr/src/uts/common/fs/dnlc.c
@@ -25,7 +25,7 @@
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
+/* All Rights Reserved */
/*
* University Copyright- Copyright (c) 1982, 1986, 1988
@@ -256,7 +256,7 @@ vnode_t negative_cache_vnode;
*/
#define dnlc_free(ncp) \
{ \
- kmem_free((ncp), sizeof (ncache_t) + (ncp)->namlen); \
+ kmem_free((ncp), NCACHE_SIZE((ncp)->namlen)); \
atomic_dec_32(&dnlc_nentries); \
}
@@ -460,7 +460,7 @@ dnlc_enter(vnode_t *dp, const char *name, vnode_t *vp)
VN_HOLD_DNLC(dp);
ncp->vp = vp;
VN_HOLD_DNLC(vp);
- bcopy(name, ncp->name, namlen + 1); /* name and null */
+ bcopy(name, ncp->name, namlen);
ncp->hash = hash;
hp = &nc_hash[hash & nc_hashmask];
@@ -534,7 +534,7 @@ dnlc_update(vnode_t *dp, const char *name, vnode_t *vp)
VN_HOLD_DNLC(dp);
ncp->vp = vp;
VN_HOLD_DNLC(vp);
- bcopy(name, ncp->name, namlen + 1); /* name and null */
+ bcopy(name, ncp->name, namlen);
ncp->hash = hash;
hp = &nc_hash[hash & nc_hashmask];
@@ -977,7 +977,7 @@ dnlc_get(uchar_t namlen)
dnlc_max_nentries_cnt++; /* keep a statistic */
return (NULL);
}
- ncp = kmem_alloc(sizeof (ncache_t) + namlen, KM_NOSLEEP);
+ ncp = kmem_alloc(NCACHE_SIZE(namlen), KM_NOSLEEP);
if (ncp == NULL) {
return (NULL);
}
@@ -1257,7 +1257,7 @@ dnlc_dir_add_entry(dcanchor_t *dcap, const char *name, uint64_t handle)
* dnlc_dir_reclaim() is called as a result of memory shortage.
*/
DNLC_DIR_HASH(name, hash, namlen);
- dep = kmem_alloc(sizeof (dcentry_t) - 1 + namlen, KM_NOSLEEP);
+ dep = kmem_alloc(DCENTTRY_SIZE(namlen), KM_NOSLEEP);
if (dep == NULL) {
#ifdef DEBUG
/*
@@ -1268,7 +1268,7 @@ dnlc_dir_add_entry(dcanchor_t *dcap, const char *name, uint64_t handle)
* performance running a debug kernel.
* This random error only occurs in debug mode.
*/
- dep = kmem_alloc(sizeof (dcentry_t) - 1 + namlen, KM_NOSLEEP);
+ dep = kmem_alloc(DCENTTRY_SIZE(namlen), KM_NOSLEEP);
if (dep != NULL)
goto ok;
#endif
@@ -1278,7 +1278,7 @@ dnlc_dir_add_entry(dcanchor_t *dcap, const char *name, uint64_t handle)
* called with.
*/
dnlc_dir_reclaim(NULL);
- dep = kmem_alloc(sizeof (dcentry_t) - 1 + namlen, KM_NOSLEEP);
+ dep = kmem_alloc(DCENTTRY_SIZE(namlen), KM_NOSLEEP);
if (dep == NULL) {
/*
* still no memory, better delete this cache
@@ -1311,7 +1311,7 @@ ok:
dnlc_dir_max_size) {
mutex_exit(&dcap->dca_lock);
dnlc_dir_purge(dcap);
- kmem_free(dep, sizeof (dcentry_t) - 1 + namlen);
+ kmem_free(dep, DCENTTRY_SIZE(namlen));
ncs.ncs_dir_add_max.value.ui64++;
return (DTOOBIG);
}
@@ -1348,7 +1348,7 @@ ok:
return (DOK);
} else {
mutex_exit(&dcap->dca_lock);
- kmem_free(dep, sizeof (dcentry_t) - 1 + namlen);
+ kmem_free(dep, DCENTTRY_SIZE(namlen));
return (DNOCACHE);
}
}
@@ -1481,8 +1481,7 @@ dnlc_dir_abort(dircache_t *dcp)
nhp = dcp->dc_namehash[i];
while (nhp != NULL) { /* for each chained entry */
dep = nhp->de_next;
- kmem_free(nhp, sizeof (dcentry_t) - 1 +
- nhp->de_namelen);
+ kmem_free(nhp, DCENTTRY_SIZE(nhp->de_namelen));
nhp = dep;
}
}
@@ -1578,8 +1577,7 @@ dnlc_dir_rem_entry(dcanchor_t *dcap, const char *name, uint64_t *handlep)
}
te = *prevpp;
*prevpp = (*prevpp)->de_next;
- kmem_free(te, sizeof (dcentry_t) - 1 +
- te->de_namelen);
+ kmem_free(te, DCENTTRY_SIZE(te->de_namelen));
/*
* If the total number of entries
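
These dnlc.c hunks replace open-coded size arithmetic with NCACHE_SIZE() and
DCENTTRY_SIZE() so that every kmem_alloc()/kmem_free() pair computes an
object's size the same way. A hedged sketch of what such macros would look
like, inferred from the expressions they replace (the real definitions live
in the dnlc headers):

    /* Assumed definitions, mirroring the arithmetic removed above. */
    #define NCACHE_SIZE(namlen)     (sizeof (ncache_t) + (namlen))
    #define DCENTTRY_SIZE(namlen)   (sizeof (dcentry_t) - 1 + (namlen))

    ncp = kmem_alloc(NCACHE_SIZE(namlen), KM_NOSLEEP);
    /* ... use the entry ... */
    kmem_free(ncp, NCACHE_SIZE(ncp->namlen));   /* same macro on both sides */

Centralizing the size in one macro removes the risk of an alloc/free size
mismatch, which is exactly the class of bug a stray "+ 1" would introduce.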
diff --git a/usr/src/uts/common/fs/doorfs/door_sys.c b/usr/src/uts/common/fs/doorfs/door_sys.c
index 68a7a11d82..a2d3812938 100644
--- a/usr/src/uts/common/fs/doorfs/door_sys.c
+++ b/usr/src/uts/common/fs/doorfs/door_sys.c
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright 2021 Tintri by DDN, Inc. All rights reserved.
*/
/*
@@ -1114,6 +1115,19 @@ door_stack_copyout(const void *kaddr, void *uaddr, size_t count)
}
/*
+ * The IA32 ABI supplement 1.0 changed the required stack alignment to
+ * 16 bytes (from 4 bytes), so that code can make use of SSE instructions.
+ * This is already done for process entry, thread entry, and makecontext();
+ * We need to do this for door_return as well. The stack will be aligned to
+ * whatever the door_results is aligned.
+ * See: usr/src/lib/libc/i386/gen/makectxt.c for more details.
+ */
+#if defined(__amd64)
+#undef STACK_ALIGN32
+#define STACK_ALIGN32 16
+#endif
+
+/*
* Writes the stack layout for door_return() into the door_server_t of the
* server thread.
*/
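
The new comment documents why STACK_ALIGN32 is overridden here: the IA32 ABI
supplement raised the required stack alignment from 4 to 16 bytes so compiled
code may assume SSE-safe stacks, and door_return() must now match what
process entry, thread entry, and makecontext() already do. A small
illustrative helper showing the rounding involved (an assumption for
illustration; the kernel applies this when laying out the door_results):

    #define STACK_ALIGN32   16

    /* Round a 32-bit stack pointer down to a 16-byte boundary. */
    static uintptr_t
    stack_align32(uintptr_t sp)
    {
            return (sp & ~(uintptr_t)(STACK_ALIGN32 - 1));
    }

For example, a candidate stack address of 0xfeedbeef aligns down to
0xfeedbee0.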
diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c
index 05ee2c6e09..cc03f41c8d 100644
--- a/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c
+++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c
@@ -504,7 +504,7 @@ hldiraddentry(
/* Alloc and init dir entry */
namelen = strlen(name) + 1;
alloc_size = namelen + sizeof (hldirent_t);
- hdp = kmem_zalloc(alloc_size, KM_NORMALPRI | KM_NOSLEEP);
+ hdp = kmem_zalloc(alloc_size, KM_NOSLEEP_LAZY);
if (hdp == NULL)
return (ENOSPC);
diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c
index c582a8cac2..bf80da6dbe 100644
--- a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c
+++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c
@@ -271,8 +271,7 @@ hyprlofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
&dpn)) != 0)
goto out;
- if ((hm = kmem_zalloc(sizeof (hlfsmount_t),
- KM_NORMALPRI | KM_NOSLEEP)) == NULL) {
+ if ((hm = kmem_zalloc(sizeof (hlfsmount_t), KM_NOSLEEP_LAZY)) == NULL) {
pn_free(&dpn);
error = ENOMEM;
goto out;
diff --git a/usr/src/uts/common/fs/lookup.c b/usr/src/uts/common/fs/lookup.c
index 093db5a4b4..71e2aeb48b 100644
--- a/usr/src/uts/common/fs/lookup.c
+++ b/usr/src/uts/common/fs/lookup.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2019 Nexenta by DDN, Inc. All rights reserved.
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2016 Joyent, Inc.
*/
@@ -246,6 +245,9 @@ lookuppnvp(
pp = &presrvd;
}
+ if (flags & __FLXNOAUTO)
+ lookup_flags |= __FLXNOAUTO;
+
if (auditing)
audit_anchorpath(pnp, vp == rootvp);
@@ -433,7 +435,7 @@ checkforroot:
* Traverse mount points.
* XXX why don't we need to hold a read lock here (call vn_vfsrlock)?
* What prevents a concurrent update to v_vfsmountedhere?
- * Possible answer: if mounting, we might not see the mount
+ * Possible answer: if mounting, we might not see the mount
* if it is concurrently coming into existence, but that's
* really not much different from the thread running a bit slower.
* If unmounting, we may get into traverse() when we shouldn't,
@@ -1052,7 +1054,26 @@ vnode_valid_pn(vnode_t *vp, vnode_t *vrootp, pathname_t *pn, pathname_t *rpn,
VN_HOLD(vrootp);
if (vrootp != rootdir)
VN_HOLD(vrootp);
- if (lookuppnvp(pn, rpn, FOLLOW | flags, NULL, &compvp, vrootp, vrootp,
+
+ /*
+ * The FOLLOW flag only determines, if the final path component
+ * is a symlink, whether lookuppnvp will return the symlink, or its
+ * target.
+ *
+ * If the vp is a VLNK, then passing the FOLLOW flag will cause
+ * lookuppnvp to return the vnode of its target, instead of itself, and
+ * so vn_compare will fail. Therefore, we do not pass FOLLOW when our vp
+ * is a symlink.
+ *
+ * If the vp is not a VLNK, then we pass FOLLOW on the off-chance that
+ * the stored v_path ends at a symlink, instead of the symlink's target.
+ */
+ if (vp->v_type != VLNK)
+ flags |= FOLLOW;
+ else
+ flags &= ~FOLLOW;
+
+ if (lookuppnvp(pn, rpn, flags, NULL, &compvp, vrootp, vrootp,
cr) == 0) {
/*
* Check to see if the returned vnode is the same as the one we
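
The block above encodes a simple rule: pass FOLLOW to lookuppnvp() only when
vp is not itself a symlink, because following would resolve the stored path
to the link's target and vn_compare() against vp could never match. The same
follow/no-follow distinction exists at user level; a tiny analogy using
stat(2) versus lstat(2) (userland illustration only, with a hypothetical
path, not the kernel code path):

    #include <sys/stat.h>

    struct stat sb;
    (void) stat("/tmp/lnk", &sb);   /* follows: describes the link target */
    (void) lstat("/tmp/lnk", &sb);  /* no follow: describes the link itself */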
diff --git a/usr/src/uts/common/fs/mntfs/mntvnops.c b/usr/src/uts/common/fs/mntfs/mntvnops.c
index 7374820f95..6bb3b514fb 100644
--- a/usr/src/uts/common/fs/mntfs/mntvnops.c
+++ b/usr/src/uts/common/fs/mntfs/mntvnops.c
@@ -54,7 +54,7 @@ extern void vfs_mnttab_readop(void);
* mntfs provides two methods of reading the in-kernel mnttab, i.e. the state of
* the mounted resources: the read-only file /etc/mnttab, and a collection of
* ioctl() commands. Most of these interfaces are public and are described in
- * mnttab(4). Three private ioctl() commands, MNTIOC_GETMNTENT,
+ * mnttab(5). Three private ioctl() commands, MNTIOC_GETMNTENT,
* MNTIOC_GETEXTMNTENT and MNTIOC_GETMNTANY, provide for the getmntent(3C)
* family of functions, allowing them to support white space in mount names.
*
@@ -1039,7 +1039,7 @@ mntgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
/*
* The mntnode already has at least one snapshot from
* which to take the size; the user will understand from
- * mnttab(4) that the current size of the in-kernel
+ * mnttab(5) that the current size of the in-kernel
* mnttab is irrelevant.
*/
size = rsnapp->mnts_nmnts ? rsnapp->mnts_text_size :
@@ -1186,7 +1186,7 @@ mntinactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
* has a special meaning for /etc/mnttab: it forces mntfs to refresh the
* snapshot at the next ioctl().
*
- * mnttab(4) explains that "the snapshot...is taken any time a read(2) is
+ * mnttab(5) explains that "the snapshot...is taken any time a read(2) is
* performed at offset 0". We therefore ignore the read snapshot here.
*/
/* ARGSUSED */
diff --git a/usr/src/uts/common/fs/nfs/nfs4_client.c b/usr/src/uts/common/fs/nfs/nfs4_client.c
index 5456fc7c63..856da430ea 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_client.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_client.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
+ * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
* All Rights Reserved
*/
@@ -464,33 +464,15 @@ nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
rp = VTOR4(vp);
mutex_enter(&rp->r_statelock);
was_serial = (rp->r_serial == curthread);
- if (rp->r_serial && !was_serial) {
- klwp_t *lwp = ttolwp(curthread);
-
+ if (rp->r_serial != NULL && !was_serial) {
/*
- * If we're the recovery thread, then purge current attrs
- * and bail out to avoid potential deadlock between another
- * thread caching attrs (r_serial thread), recov thread,
- * and an async writer thread.
+ * Purge current attrs and bail out to avoid potential deadlock
+ * between another thread caching attrs (r_serial thread), this
+ * thread, and a thread trying to read or write pages.
*/
- if (recov) {
- PURGE_ATTRCACHE4_LOCKED(rp);
- mutex_exit(&rp->r_statelock);
- return;
- }
-
- if (lwp != NULL)
- lwp->lwp_nostop++;
- while (rp->r_serial != NULL) {
- if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
- mutex_exit(&rp->r_statelock);
- if (lwp != NULL)
- lwp->lwp_nostop--;
- return;
- }
- }
- if (lwp != NULL)
- lwp->lwp_nostop--;
+ PURGE_ATTRCACHE4_LOCKED(rp);
+ mutex_exit(&rp->r_statelock);
+ return;
}
/*
@@ -3067,7 +3049,7 @@ nfs_free_mi4(mntinfo4_t *mi)
nfs4_oo_hash_bucket_t *bucketp;
nfs4_debug_msg_t *msgp;
int i;
- servinfo4_t *svp;
+ servinfo4_t *svp;
/*
* Code introduced here should be carefully evaluated to make
diff --git a/usr/src/uts/common/fs/nfs/nfs4_idmap.c b/usr/src/uts/common/fs/nfs/nfs4_idmap.c
index c0e2492d56..0eb449b5ef 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_idmap.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_idmap.c
@@ -31,11 +31,11 @@
* mapping code is executing on the client or server. Thus, the following
* rules represents the latest incantation of the id mapping policies.
*
- * 1) For the case in which the nfsmapid(1m) daemon has _never_ been
+ * 1) For the case in which the nfsmapid(8) daemon has _never_ been
* started, the policy is to _always_ work with stringified uid's
* and gid's
*
- * 2) For the case in which the nfsmapid(1m) daemon _was_ started but
+ * 2) For the case in which the nfsmapid(8) daemon _was_ started but
* has either died or become unresponsive, the mapping policies are
* as follows:
*
@@ -72,7 +72,7 @@
* `-------------------------------'---------------------------------'
*
* 3) Lastly, in order to leverage better cache utilization whenever
- * communication with nfsmapid(1m) is currently hindered, cache
+ * communication with nfsmapid(8) is currently hindered, cache
* entry eviction is throttled whenever nfsidmap_daemon_dh == NULL.
*
*
@@ -80,28 +80,28 @@
* ====================================================
*
* GETATTR - Server-side GETATTR *id to attr string conversion policies
- * for unresponsive/dead nfsmapid(1m) daemon
+ * for unresponsive/dead nfsmapid(8) daemon
*
* a) If the *id is *ID_NOBODY, the string "nobody" is returned
*
- * b) If the *id is not *ID_NOBODY _and_ the nfsmapid(1m) daemon
+ * b) If the *id is not *ID_NOBODY _and_ the nfsmapid(8) daemon
* _is_ operational, the daemon is contacted to convert the
* [u/g]id into a string of type "[user/group]@domain"
*
- * c) If the nfsmapid(1m) daemon has died or has become unresponsive,
+ * c) If the nfsmapid(8) daemon has died or has become unresponsive,
* the server returns status == NFS4_OK for the GETATTR operation,
* and returns a strigified [u/g]id to let the client map it into
* the appropriate value.
*
* SETATTR - Server-side SETATTR attr string to *id conversion policies
- * for unresponsive/dead nfsmapid(1m) daemon
+ * for unresponsive/dead nfsmapid(8) daemon
*
* a) If the otw string is a stringified uid (ie. does _not_ contain
* an '@' sign and is of the form "12345") then the literal uid is
* decoded and it is used to perform the mapping.
*
* b) If, on the other hand, the otw string _is_ of the form
- * "[user/group]@domain" and problems arise contacting nfsmapid(1m),
+ * "[user/group]@domain" and problems arise contacting nfsmapid(8),
* the SETATTR operation _must_ fail w/NFS4ERR_DELAY, as the server
* cannot default to *ID_NOBODY, which would allow a file to be
* given away by setting it's owner or owner_group to "nobody".
@@ -329,7 +329,7 @@ nfs_idmap_str_uid(utf8string *u8s, uid_t *uid, bool_t isserver)
}
/*
- * Start-off with upcalls disabled, and once nfsmapid(1m) is up and
+ * Start-off with upcalls disabled, and once nfsmapid(8) is up and
* running, we'll leverage it's first flush to let the kernel know
* when it's up and available to perform mappings. Also, on client
* only, be smarter about when to issue upcalls by checking the
@@ -399,7 +399,7 @@ retry:
/*
* string came in as stringified id. Don't cache !
*
- * nfsmapid(1m) semantics have changed in order to
+ * nfsmapid(8) semantics have changed in order to
* support diskless clients. Thus, for stringified
* id's that have passwd/group entries, we'll go
* ahead and map them, returning no error.
@@ -538,7 +538,7 @@ nfs_idmap_uid_str(uid_t uid, utf8string *u8s, bool_t isserver)
}
/*
- * Start-off with upcalls disabled, and once nfsmapid(1m) is
+ * Start-off with upcalls disabled, and once nfsmapid(8) is
* up and running, we'll leverage it's first flush to let the
* kernel know when it's up and available to perform mappings.
* We fall back to answering with stringified uid's.
@@ -708,7 +708,7 @@ nfs_idmap_str_gid(utf8string *u8s, gid_t *gid, bool_t isserver)
}
/*
- * Start-off with upcalls disabled, and once nfsmapid(1m) is up and
+ * Start-off with upcalls disabled, and once nfsmapid(8) is up and
* running, we'll leverage it's first flush to let the kernel know
* when it's up and available to perform mappings. Also, on client
* only, be smarter about when to issue upcalls by checking the
@@ -779,7 +779,7 @@ retry:
/*
* string came in as stringified id. Don't cache !
*
- * nfsmapid(1m) semantics have changed in order to
+ * nfsmapid(8) semantics have changed in order to
* support diskless clients. Thus, for stringified
* id's that have passwd/group entries, we'll go
* ahead and map them, returning no error.
@@ -918,7 +918,7 @@ nfs_idmap_gid_str(gid_t gid, utf8string *u8s, bool_t isserver)
}
/*
- * Start-off with upcalls disabled, and once nfsmapid(1m) is
+ * Start-off with upcalls disabled, and once nfsmapid(8) is
* up and running, we'll leverage it's first flush to let the
* kernel know when it's up and available to perform mappings.
* We fall back to answering with stringified gid's.
@@ -1119,7 +1119,7 @@ nfs_idmap_args(struct nfsidmap_args *idmp)
nfs_idmap_cache_flush(&nig->s2g_ci);
/*
- * nfsmapid(1m) up and running; enable upcalls
+ * nfsmapid(8) up and running; enable upcalls
* State:
* 0 Just flush caches
* 1 Re-establish door knob
@@ -1309,7 +1309,7 @@ nfs_idmap_cache_s2i_lkup(idmap_cache_info_t *cip, utf8string *u8s,
* Check entry for staleness first, as user's id
* may have changed and may need to be remapped.
* Note that we don't evict entries from the cache
- * if we're having trouble contacting nfsmapid(1m)
+ * if we're having trouble contacting nfsmapid(8)
*/
if (TIMEOUT(p->id_time) && (*cip->nfsidmap_daemon_dh) != NULL) {
nfs_idmap_cache_rment(p);
@@ -1405,7 +1405,7 @@ nfs_idmap_cache_s2i_insert(idmap_cache_info_t *cip, uid_t id, utf8string *u8s,
* Check entry for staleness first, as user's id
* may have changed and may need to be remapped.
* Note that we don't evict entries from the cache
- * if we're having trouble contacting nfsmapid(1m)
+ * if we're having trouble contacting nfsmapid(8)
*/
if (TIMEOUT(p->id_time) && (*cip->nfsidmap_daemon_dh) != NULL) {
nfs_idmap_cache_rment(p);
@@ -1486,7 +1486,7 @@ nfs_idmap_cache_i2s_lkup(idmap_cache_info_t *cip, uid_t id, uint_t *hashno,
* Check entry for staleness first, as user's id
* may have changed and may need to be remapped.
* Note that we don't evict entries from the cache
- * if we're having trouble contacting nfsmapid(1m)
+ * if we're having trouble contacting nfsmapid(8)
*/
if (TIMEOUT(p->id_time) && (*cip->nfsidmap_daemon_dh) != NULL) {
nfs_idmap_cache_rment(p);
@@ -1570,7 +1570,7 @@ nfs_idmap_cache_i2s_insert(idmap_cache_info_t *cip, uid_t id, utf8string *u8s,
* Check entry for staleness first, as user's id
* may have changed and may need to be remapped.
* Note that we don't evict entries from the cache
- * if we're having trouble contacting nfsmapid(1m)
+ * if we're having trouble contacting nfsmapid(8)
*/
if (TIMEOUT(p->id_time) && (*cip->nfsidmap_daemon_dh) != NULL) {
nfs_idmap_cache_rment(p);
diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv.c b/usr/src/uts/common/fs/nfs/nfs4_srv.c
index 757964eb84..077fc4a25f 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_srv.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_srv.c
@@ -32,6 +32,7 @@
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright 2019 Nexenta Systems, Inc.
* Copyright 2019 Nexenta by DDN, Inc.
+ * Copyright 2021 Racktop Systems, Inc.
*/
#include <sys/param.h>
@@ -5840,13 +5841,12 @@ rfs4_compound(COMPOUND4args *args, COMPOUND4res *resp, struct exportinfo *exi,
ASSERT(exi == NULL);
ASSERT(cr == NULL);
- cr = crget();
+ cr = svc_xprt_cred(req->rq_xprt);
ASSERT(cr != NULL);
if (sec_svc_getcred(req, cr, &cs.principal, &cs.nfsflavor) == 0) {
DTRACE_NFSV4_2(compound__start, struct compound_state *,
&cs, COMPOUND4args *, args);
- crfree(cr);
DTRACE_NFSV4_2(compound__done, struct compound_state *,
&cs, COMPOUND4res *, resp);
svcerr_badcred(req->rq_xprt);
@@ -5965,8 +5965,6 @@ rfs4_compound(COMPOUND4args *args, COMPOUND4res *resp, struct exportinfo *exi,
if (cs.saved_fh.nfs_fh4_val)
kmem_free(cs.saved_fh.nfs_fh4_val, NFS4_FHSIZE);
- if (cs.basecr)
- crfree(cs.basecr);
if (cs.cr)
crfree(cs.cr);
/*
diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c b/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c
index a9ee217a8b..13e5320752 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_srv_attr.c
@@ -2093,7 +2093,7 @@ rfs4_fattr4_owner(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
* occur. Please refer to nfs4_idmap.c for details.
*
* Any other errors, such as the mapping not being found by
- * nfsmapid(1m), and interrupted clnt_call, etc, will result
+ * nfsmapid(8), and interrupted clnt_call, etc, will result
* in NFS4ERR_BADOWNER.
*
* XXX need to return consistent errors, perhaps all
@@ -2206,7 +2206,7 @@ rfs4_fattr4_owner_group(nfs4_attr_cmd_t cmd, struct nfs4_svgetit_arg *sarg,
* cannot occur. Please refer to nfs4_idmap.c for details.
*
* Any other errors, such as the mapping not being found by
- * nfsmapid(1m), and interrupted clnt_call, etc, will result
+ * nfsmapid(8), and interrupted clnt_call, etc, will result
* in NFS4ERR_BADOWNER.
*
* XXX need to return consistent errors, perhaps all
diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c b/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c
index 920ebeca53..b719b0e2ca 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c
@@ -660,7 +660,6 @@ treeclimb_export(struct exportinfo *exip)
if (error)
break;
- /* XXX KEBE ASKS DO WE NEED THIS?!? */
ASSERT3U(exip->exi_zoneid, ==, curzone->zone_id);
/*
* The root of the file system, or the zone's root for
diff --git a/usr/src/uts/common/fs/nfs/nfs4_state.c b/usr/src/uts/common/fs/nfs/nfs4_state.c
index 0c1efb26df..b95dd6fb02 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_state.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_state.c
@@ -74,6 +74,7 @@ stateid4 special1 = {
int rfs4_debug;
#endif
+rfs4_db_mem_cache_t rfs4_db_mem_cache_table[RFS4_DB_MEM_CACHE_NUM];
static uint32_t rfs4_database_debug = 0x00;
/* CSTYLED */
diff --git a/usr/src/uts/common/fs/nfs/nfs4_stub_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_stub_vnops.c
index 83c84b7892..d0950dd6f0 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_stub_vnops.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_stub_vnops.c
@@ -1906,7 +1906,7 @@ nfs4_trigger_domount(vnode_t *stubvp, domount_args_t *dma, vfs_t **vfsp,
uap->flags = MS_SYSSPACE | MS_DATA;
/* fstype-independent mount options not covered elsewhere */
- /* copy parent's mount(1M) "-m" flag */
+ /* copy parent's mount(8) "-m" flag */
if (stubvfsp->vfs_flag & VFS_NOMNTTAB)
uap->flags |= MS_NOMNTTAB;
diff --git a/usr/src/uts/common/fs/nfs/nfs4_subr.c b/usr/src/uts/common/fs/nfs/nfs4_subr.c
index ec5fda53a0..aaec5ca976 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_subr.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_subr.c
@@ -27,7 +27,7 @@
*/
/*
- * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
+ * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
* All Rights Reserved
*/
@@ -1582,7 +1582,7 @@ rfs4call(mntinfo4_t *mi, COMPOUND4args_clnt *argsp, COMPOUND4res_clnt *resp,
cred_t *cr, int *doqueue, int flags, nfs4_error_t *ep)
{
int i, error;
- enum clnt_stat rpc_status = NFS4_OK;
+ enum clnt_stat rpc_status = RPC_SUCCESS;
int num_resops;
struct nfs4_clnt *nfscl;
diff --git a/usr/src/uts/common/fs/nfs/nfs4_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_vnops.c
index 15c6445146..6a3fbff48e 100644
--- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c
+++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c
@@ -2596,12 +2596,6 @@ nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop,
osp->os_ref_count--;
if (ep->error == 0) {
- /*
- * Avoid a deadlock with the r_serial thread waiting for
- * os_sync_lock in nfs4_get_otw_cred_by_osp() which might be
- * held by us. We will wait in nfs4_attr_cache() for the
- * completion of the r_serial thread.
- */
mutex_exit(&osp->os_sync_lock);
*have_sync_lockp = 0;
diff --git a/usr/src/uts/common/fs/nfs/nfs_auth.c b/usr/src/uts/common/fs/nfs/nfs_auth.c
index 7ac3c3318b..16979e1422 100644
--- a/usr/src/uts/common/fs/nfs/nfs_auth.c
+++ b/usr/src/uts/common/fs/nfs/nfs_auth.c
@@ -219,7 +219,7 @@ nfsauth_zone_init(nfs_globals_t *ng)
nag = kmem_zalloc(sizeof (*nag), KM_SLEEP);
/*
- * mountd can be restarted by smf(5). We need to make sure
+ * mountd can be restarted by smf(7). We need to make sure
* the updated door handle will safely make it to mountd_dh.
*/
mutex_init(&nag->mountd_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -462,7 +462,7 @@ retry:
if (dh == NULL) {
/*
* The rendezvous point has not been established yet!
- * This could mean that either mountd(1m) has not yet
+ * This could mean that either mountd(8) has not yet
* been started or that _this_ routine nuked the door
* handle after receiving an EINTR for a REVOKED door.
*
@@ -523,8 +523,8 @@ retry:
/*
* The server barfed and revoked
* the (existing) door on us; we
- * want to wait to give smf(5) a
- * chance to restart mountd(1m)
+ * want to wait to give smf(7) a
+ * chance to restart mountd(8)
* and establish a new door handle.
*/
mutex_enter(&nag->mountd_lock);
@@ -910,9 +910,6 @@ nfsauth_cache_get(struct exportinfo *exi, struct svc_req *req, int flavor,
ASSERT(taddrmask != NULL);
addrmask(&addr, taddrmask);
- ac.auth_flavor = flavor;
- ac.auth_clnt_cred = crdup(cr);
-
acc.authc_addr = addr;
tree = exi->exi_cache[hash(&addr)];
@@ -925,7 +922,7 @@ nfsauth_cache_get(struct exportinfo *exi, struct svc_req *req, int flavor,
rw_exit(&exi->exi_cache_lock);
- nc = kmem_alloc(sizeof (*nc), KM_NOSLEEP | KM_NORMALPRI);
+ nc = kmem_alloc(sizeof (*nc), KM_NOSLEEP_LAZY);
if (nc == NULL)
goto retrieve;
@@ -933,8 +930,7 @@ nfsauth_cache_get(struct exportinfo *exi, struct svc_req *req, int flavor,
* Initialize the new auth_cache_clnt
*/
nc->authc_addr = addr;
- nc->authc_addr.buf = kmem_alloc(addr.maxlen,
- KM_NOSLEEP | KM_NORMALPRI);
+ nc->authc_addr.buf = kmem_alloc(addr.maxlen, KM_NOSLEEP_LAZY);
if (addr.maxlen != 0 && nc->authc_addr.buf == NULL) {
kmem_free(nc, sizeof (*nc));
goto retrieve;
@@ -964,6 +960,10 @@ nfsauth_cache_get(struct exportinfo *exi, struct svc_req *req, int flavor,
ASSERT(c != NULL);
rw_enter(&c->authc_lock, RW_READER);
+
+ ac.auth_flavor = flavor;
+ ac.auth_clnt_cred = cr;
+
p = (struct auth_cache *)avl_find(&c->authc_tree, &ac, NULL);
if (p == NULL) {
@@ -971,8 +971,7 @@ nfsauth_cache_get(struct exportinfo *exi, struct svc_req *req, int flavor,
rw_exit(&c->authc_lock);
- np = kmem_cache_alloc(exi_cache_handle,
- KM_NOSLEEP | KM_NORMALPRI);
+ np = kmem_cache_alloc(exi_cache_handle, KM_NOSLEEP_LAZY);
if (np == NULL) {
rw_exit(&exi->exi_cache_lock);
goto retrieve;
@@ -983,7 +982,7 @@ nfsauth_cache_get(struct exportinfo *exi, struct svc_req *req, int flavor,
*/
np->auth_clnt = c;
np->auth_flavor = flavor;
- np->auth_clnt_cred = ac.auth_clnt_cred;
+ np->auth_clnt_cred = crdup(cr);
np->auth_srv_ngids = 0;
np->auth_srv_gids = NULL;
np->auth_time = np->auth_freshness = gethrestime_sec();
@@ -1004,12 +1003,11 @@ nfsauth_cache_get(struct exportinfo *exi, struct svc_req *req, int flavor,
cv_destroy(&np->auth_cv);
mutex_destroy(&np->auth_lock);
- crfree(ac.auth_clnt_cred);
+ crfree(np->auth_clnt_cred);
kmem_cache_free(exi_cache_handle, np);
}
} else {
rw_exit(&exi->exi_cache_lock);
- crfree(ac.auth_clnt_cred);
}
mutex_enter(&p->auth_lock);
@@ -1071,7 +1069,7 @@ nfsauth_cache_get(struct exportinfo *exi, struct svc_req *req, int flavor,
* auth_cache entry
*/
tmpgids = kmem_alloc(tmpngids * sizeof (gid_t),
- KM_NOSLEEP | KM_NORMALPRI);
+ KM_NOSLEEP_LAZY);
if (tmpgids != NULL)
bcopy(*gids, tmpgids,
tmpngids * sizeof (gid_t));
@@ -1212,7 +1210,6 @@ nfsauth_cache_get(struct exportinfo *exi, struct svc_req *req, int flavor,
return (access);
retrieve:
- crfree(ac.auth_clnt_cred);
/*
* Retrieve the required data without caching.
diff --git a/usr/src/uts/common/fs/nfs/nfs_cmd.c b/usr/src/uts/common/fs/nfs/nfs_cmd.c
index 40775bb231..b9d23ba0d6 100644
--- a/usr/src/uts/common/fs/nfs/nfs_cmd.c
+++ b/usr/src/uts/common/fs/nfs/nfs_cmd.c
@@ -138,7 +138,7 @@ retry:
if (dh == NULL) {
/*
* The rendezvous point has not been established yet !
- * This could mean that either mountd(1m) has not yet
+ * This could mean that either mountd(8) has not yet
* been started or that _this_ routine nuked the door
* handle after receiving an EINTR for a REVOKED door.
*
@@ -176,8 +176,8 @@ retry:
/*
* The server barfed and revoked
* the (existing) door on us; we
- * want to wait to give smf(5) a
- * chance to restart mountd(1m)
+ * want to wait to give smf(7) a
+ * chance to restart mountd(8)
* and establish a new door handle.
*/
mutex_enter(&ncg->nfscmd_lock);
diff --git a/usr/src/uts/common/fs/nfs/nfs_export.c b/usr/src/uts/common/fs/nfs/nfs_export.c
index 080dfe1adf..b18912d154 100644
--- a/usr/src/uts/common/fs/nfs/nfs_export.c
+++ b/usr/src/uts/common/fs/nfs/nfs_export.c
@@ -85,7 +85,7 @@ static bool_t exi_id_overflow;
avl_tree_t exi_id_tree;
kmutex_t nfs_exi_id_lock;
-static int unexport(nfs_export_t *, exportinfo_t *);
+static int unexport(nfs_export_t *, exportinfo_t *, cred_t *);
static void exportfree(exportinfo_t *);
static int loadindex(exportdata_t *);
@@ -975,7 +975,15 @@ nfs_export_zone_shutdown(nfs_globals_t *ng)
nfs_export_t *ne = ng->nfs_export;
struct exportinfo *exi, *nexi;
int i, errors;
+ zoneid_t zoneid = ng->nfs_zoneid;
+ cred_t *cr;
+ /*
+ * Use the zone's credential. Since this is a zone shutdown method,
+ * the zone_t should still be around for a zone_get_kcred() call.
+ */
+ cr = zone_get_kcred(zoneid);
+ VERIFY(cr != NULL);
rw_enter(&ne->exported_lock, RW_READER);
errors = 0;
@@ -986,7 +994,7 @@ nfs_export_zone_shutdown(nfs_globals_t *ng)
exi_hold(exi);
while (exi != NULL) {
-
+ ASSERT3U(zoneid, ==, exi->exi_zoneid);
/*
* Get and hold next export before
* dropping the rwlock and unexport
@@ -1002,7 +1010,7 @@ nfs_export_zone_shutdown(nfs_globals_t *ng)
* create/destroy handling.
*/
if (exi != ne->exi_root &&
- unexport(ne, exi) != 0)
+ unexport(ne, exi, cr) != 0)
errors++;
exi_rele(exi);
@@ -1016,6 +1024,7 @@ nfs_export_zone_shutdown(nfs_globals_t *ng)
}
rw_exit(&ne->exported_lock);
+ crfree(cr);
}
void
@@ -1286,7 +1295,7 @@ exportfs(struct exportfs_args *args, model_t model, cred_t *cr)
pn_free(&lookpn);
if (ex1 == NULL)
return (EINVAL);
- error = unexport(ne, ex1);
+ error = unexport(ne, ex1, cr);
exi_rele(ex1);
return (error);
}
@@ -1886,7 +1895,7 @@ export_unlink(nfs_export_t *ne, struct exportinfo *exi)
* Unexport an exported filesystem
*/
static int
-unexport(nfs_export_t *ne, struct exportinfo *exi)
+unexport(nfs_export_t *ne, struct exportinfo *exi, cred_t *cr)
{
struct secinfo cursec[MAX_FLAVORS];
int curcnt;
@@ -1954,18 +1963,14 @@ unexport(nfs_export_t *ne, struct exportinfo *exi)
* the public filehandle to the root.
*/
- /*
- * XXX KEBE ASKS --> Should CRED() instead be
- * exi->exi_zone->zone_kcred?
- */
if (exi == ne->exi_public) {
ne->exi_public = ne->exi_root;
- nfslog_share_record(ne->exi_public, CRED());
+ nfslog_share_record(ne->exi_public, cr);
}
if (exi->exi_export.ex_flags & EX_LOG)
- nfslog_unshare_record(exi, CRED());
+ nfslog_unshare_record(exi, cr);
exi_rele(exi);
return (0);
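
unexport() now takes the credential to use for share logging instead of
reaching for CRED(), and the zone-shutdown path supplies the zone's kernel
credential, answering the old "XXX KEBE ASKS" question. The pattern,
condensed from the hunks above:

    cred_t *cr = zone_get_kcred(zoneid);  /* zone_t still exists at shutdown */
    VERIFY(cr != NULL);
    /* ... call unexport(ne, exi, cr) for each remaining export ... */
    crfree(cr);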
diff --git a/usr/src/uts/common/fs/nfs/nfs_server.c b/usr/src/uts/common/fs/nfs/nfs_server.c
index 5b7658d048..28c079968f 100644
--- a/usr/src/uts/common/fs/nfs/nfs_server.c
+++ b/usr/src/uts/common/fs/nfs/nfs_server.c
@@ -24,6 +24,7 @@
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2017 Joyent Inc
* Copyright 2019 Nexenta by DDN, Inc.
+ * Copyright 2021 Racktop Systems, Inc.
*/
/*
@@ -115,6 +116,13 @@ krwlock_t nfssrv_globals_rwl;
kmem_cache_t *nfs_xuio_cache;
int nfs_loaned_buffers = 0;
+/* array of paths passed-in from nfsd command-line; stored in nvlist */
+char **rfs4_dss_newpaths;
+uint_t rfs4_dss_numnewpaths;
+
+/* nvlists of all DSS paths: current, and before last warmstart */
+nvlist_t *rfs4_dss_paths, *rfs4_dss_oldpaths;
+
int
_init(void)
{
@@ -1356,11 +1364,6 @@ static struct rpc_disptable rfs_disptable[] = {
static int nfs_portmon = 0;
#ifdef DEBUG
-static int cred_hits = 0;
-static int cred_misses = 0;
-#endif
-
-#ifdef DEBUG
/*
* Debug code to allow disabling of rfs_dispatch() use of
* fastxdrargs() and fastxdrres() calls for testing purposes.
@@ -1628,25 +1631,7 @@ common_dispatch(struct svc_req *req, SVCXPRT *xprt, rpcvers_t min_vers,
else
anon_ok = 0;
- cr = xprt->xp_cred;
- ASSERT(cr != NULL);
-#ifdef DEBUG
- {
- if (crgetref(cr) != 1) {
- crfree(cr);
- cr = crget();
- xprt->xp_cred = cr;
- cred_misses++;
- } else
- cred_hits++;
- }
-#else
- if (crgetref(cr) != 1) {
- crfree(cr);
- cr = crget();
- xprt->xp_cred = cr;
- }
-#endif
+ cr = svc_xprt_cred(xprt);
exi = checkexport(fsid, xfid);
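
Both the dispatch path here and rfs4_compound() above stop managing
per-request credentials by hand and call svc_xprt_cred() instead. Its
definition is not shown in this diff; a plausible sketch, consistent with the
open-coded block it replaces, would centralize the reference-count dance on
the transport's cached cred:

    /* Hypothetical sketch of svc_xprt_cred(), inferred from the removed code. */
    cred_t *
    svc_xprt_cred(SVCXPRT *xprt)
    {
            cred_t *cr = xprt->xp_cred;

            ASSERT(cr != NULL);
            if (crgetref(cr) != 1) {        /* someone else still holds it */
                    crfree(cr);
                    cr = crget();
                    xprt->xp_cred = cr;
            }
            return (cr);
    }

Either way, callers no longer own the credential they get back, which is why
the crfree(cr) calls disappear from rfs4_compound().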
diff --git a/usr/src/uts/common/fs/nfs/nfs_stats.c b/usr/src/uts/common/fs/nfs/nfs_stats.c
index 97f820d756..13466a4f33 100644
--- a/usr/src/uts/common/fs/nfs/nfs_stats.c
+++ b/usr/src/uts/common/fs/nfs/nfs_stats.c
@@ -34,7 +34,7 @@
/*
* Key to retrieve per-zone data corresponding to NFS kstats consumed by
- * nfsstat(1m).
+ * nfsstat(8).
*/
zone_key_t nfsstat_zone_key;
diff --git a/usr/src/uts/common/fs/pcfs/pc_node.c b/usr/src/uts/common/fs/pcfs/pc_node.c
index 84a29f4430..bf01336c6f 100644
--- a/usr/src/uts/common/fs/pcfs/pc_node.c
+++ b/usr/src/uts/common/fs/pcfs/pc_node.c
@@ -667,7 +667,7 @@ pc_mark_irrecov(struct pcfs *fsp)
"an irrecoverable error was encountered.\n"
"File damage is possible. To prevent further\n"
"damage, this pcfs instance will now be frozen.\n"
- "Use umount(1M) to release the instance.\n");
+ "Use umount(8) to release the instance.\n");
(void) pc_unlockfs(fsp);
}
}
diff --git a/usr/src/uts/common/fs/pcfs/pc_vfsops.c b/usr/src/uts/common/fs/pcfs/pc_vfsops.c
index 7b2205e1d7..60041a3d71 100644
--- a/usr/src/uts/common/fs/pcfs/pc_vfsops.c
+++ b/usr/src/uts/common/fs/pcfs/pc_vfsops.c
@@ -589,7 +589,7 @@ pcfs_parse_mntopts(struct pcfs *fsp)
/*
* The "secsize=..." mount option is a workaround for the lack of
- * lofi(7d) support for DKIOCGMEDIAINFO. If PCFS wants to parse the
+ * lofi(4D) support for DKIOCGMEDIAINFO. If PCFS wants to parse the
* partition table of a disk image and it has been partitioned with
* sector sizes other than 512 bytes, we'd fail on loopback'ed disk
* images.
@@ -1988,7 +1988,7 @@ parseBPB(struct pcfs *fsp, uchar_t *bpb, int *valid)
mediasize = (len_t)totsec * (len_t)secsize;
/*
* This is not an error because not all devices support the
- * dkio(7i) mediasize queries, and/or not all devices are
+ * dkio(4I) mediasize queries, and/or not all devices are
* partitioned. If we have not been able to figure out the
* size of the underlaying medium, we have to trust the BPB.
*/
@@ -2286,7 +2286,7 @@ recheck:
*
* Test whether the device is:
* - a floppy device from a known controller type via DKIOCINFO
- * - a real floppy using the fd(7d) driver and capable of fdio(7I) ioctls
+ * - a real floppy using the fd(4D) driver and capable of fdio(4I) ioctls
* - a USB floppy drive (identified by drive geometry)
*
* Detecting a floppy will make PCFS metadata updates on such media synchronous,
@@ -2381,7 +2381,7 @@ pcfs_device_getinfo(struct pcfs *fsp)
arg.mi.dki_media_type == DK_JAZ);
/*
- * if this device understands fdio(7I) requests it's
+ * if this device understands fdio(4I) requests it's
* obviously a floppy drive.
*/
if (!isfloppy &&
@@ -2390,7 +2390,7 @@ pcfs_device_getinfo(struct pcfs *fsp)
/*
* some devices we like to treat as floppies, but they don't
- * understand fdio(7I) requests.
+ * understand fdio(4I) requests.
*/
if (!isfloppy &&
!ldi_ioctl(lh, DKIOCINFO, argp, FKIOCTL, cr, NULL) &&
diff --git a/usr/src/uts/common/fs/pcfs/pc_vnops.c b/usr/src/uts/common/fs/pcfs/pc_vnops.c
index b307fe11d7..1965444071 100644
--- a/usr/src/uts/common/fs/pcfs/pc_vnops.c
+++ b/usr/src/uts/common/fs/pcfs/pc_vnops.c
@@ -1852,8 +1852,8 @@ out:
*offp = io_off;
if (lenp)
*lenp = io_len;
- PC_DPRINTF4(4, "pcfs_putapage: vp=%p pp=%p off=%lld len=%lu\n",
- (void *)vp, (void *)pp, io_off, io_len);
+ PC_DPRINTF4(4, "pcfs_putapage: vp=%p pp=%p off=%lld len=%lu\n",
+ (void *)vp, (void *)pp, io_off, io_len);
if (err) {
PC_DPRINTF1(1, "pcfs_putapage err=%d", err);
}
@@ -2093,7 +2093,7 @@ set_long_fn_chunk(struct pcdir_lfn *ep, char *buf, int len)
static int
get_long_fn_chunk(struct pcdir_lfn *ep, char *buf)
{
- char *tmp = buf;
+ char *tmp = buf;
int i;
/* Copy all the names, no filtering now */
diff --git a/usr/src/uts/common/fs/portfs/port.c b/usr/src/uts/common/fs/portfs/port.c
index 91d998b4b5..dd32c82434 100644
--- a/usr/src/uts/common/fs/portfs/port.c
+++ b/usr/src/uts/common/fs/portfs/port.c
@@ -156,7 +156,7 @@
* interested on.
* The internal pollwakeup() function is used by all the file
* systems --which are supporting the VOP_POLL() interface- to notify
- * the upper layer (poll(2), devpoll(7d) and now event ports) about
+ * the upper layer (poll(2), devpoll(4D) and now event ports) about
* the event triggered (see valid events in poll(2)).
* The pollwakeup() function forwards the event to the layer registered
* to receive the current event.
diff --git a/usr/src/uts/common/fs/portfs/port_fd.c b/usr/src/uts/common/fs/portfs/port_fd.c
index a1a1d6fb68..511c15e979 100644
--- a/usr/src/uts/common/fs/portfs/port_fd.c
+++ b/usr/src/uts/common/fs/portfs/port_fd.c
@@ -230,7 +230,7 @@ port_associate_fd(port_t *pp, int source, uintptr_t object, int events,
* Allocate a polldat_t structure per fd
* The use of the polldat_t structure to cache file descriptors
* is required to be able to share the pollwakeup() function
- * with poll(2) and devpoll(7d).
+ * with poll(2) and devpoll(4D).
*/
pfd = kmem_zalloc(sizeof (portfd_t), KM_SLEEP);
pdp = PFTOD(pfd);
diff --git a/usr/src/uts/common/fs/portfs/port_fop.c b/usr/src/uts/common/fs/portfs/port_fop.c
index 019de0540a..a6ca583a4d 100644
--- a/usr/src/uts/common/fs/portfs/port_fop.c
+++ b/usr/src/uts/common/fs/portfs/port_fop.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2018, Joyent, Inc.
+ * Copyright 2020 Joyent, Inc.
*/
/*
@@ -257,7 +257,7 @@ const fs_operation_def_t port_vnodesrc_template[] = {
VOPNAME_READ, { .femop_read = port_fop_read },
VOPNAME_WRITE, { .femop_write = port_fop_write },
VOPNAME_MAP, { .femop_map = port_fop_map },
- VOPNAME_SETATTR, { .femop_setattr = port_fop_setattr },
+ VOPNAME_SETATTR, { .femop_setattr = port_fop_setattr },
VOPNAME_CREATE, { .femop_create = port_fop_create },
VOPNAME_REMOVE, { .femop_remove = port_fop_remove },
VOPNAME_LINK, { .femop_link = port_fop_link },
@@ -266,7 +266,7 @@ const fs_operation_def_t port_vnodesrc_template[] = {
VOPNAME_RMDIR, { .femop_rmdir = port_fop_rmdir },
VOPNAME_READDIR, { .femop_readdir = port_fop_readdir },
VOPNAME_SYMLINK, { .femop_symlink = port_fop_symlink },
- VOPNAME_SETSECATTR, { .femop_setsecattr = port_fop_setsecattr },
+ VOPNAME_SETSECATTR, { .femop_setsecattr = port_fop_setsecattr },
VOPNAME_VNEVENT, { .femop_vnevent = port_fop_vnevent },
NULL, NULL
};
@@ -275,7 +275,7 @@ const fs_operation_def_t port_vnodesrc_template[] = {
* Fsem - vfs ops hooks
*/
const fs_operation_def_t port_vfssrc_template[] = {
- VFSNAME_UNMOUNT, { .fsemop_unmount = port_fop_unmount },
+ VFSNAME_UNMOUNT, { .fsemop_unmount = port_fop_unmount },
NULL, NULL
};
@@ -539,14 +539,14 @@ port_fop_trimpfplist(vnode_t *vp)
port_pcache_remove_fop(pfcp, pfp);
mutex_exit(&pfcp->pfc_lock);
if (tdvp != NULL)
- VN_RELE(tdvp);
+ VN_PHANTOM_RELE(tdvp);
}
}
}
/*
* This routine returns 1, if the vnode can be rele'ed by the caller.
- * The caller has to VN_RELE the vnode with out holding any
+ * The caller has to VN_PHANTOM_RELE the vnode with out holding any
* locks.
*/
int
@@ -616,7 +616,7 @@ port_fop_femuninstall(vnode_t *vp)
* able to remove it from the port's queue).
*
* vpp and dvpp will point to the vnode and directory vnode which the caller
- * is required to VN_RELE without holding any locks.
+ * is required to VN_PHANTOM_RELE without holding any locks.
*/
int
port_remove_fop(portfop_t *pfp, portfop_cache_t *pfcp, int cleanup,
@@ -726,12 +726,12 @@ port_cache_lookup_fop(portfop_cache_t *pfcp, pid_t pid, uintptr_t obj)
/*
* Given the file name, get the vnode and also the directory vnode
- * On return, the vnodes are held (VN_HOLD). The caller has to VN_RELE
- * the vnode(s).
+ * On return, the vnodes are held with phantom holds (VN_PHANTOM_HOLD). The
+ * caller has to VN_PHANTOM_RELE the vnode(s).
*/
int
port_fop_getdvp(void *objptr, vnode_t **vp, vnode_t **dvp,
- char **cname, int *len, int follow)
+ char **cname, int *len, int follow)
{
int error = 0;
struct pathname pn;
@@ -777,6 +777,17 @@ port_fop_getdvp(void *objptr, vnode_t **vp, vnode_t **dvp,
}
}
+ /* Trade VN_HOLD()s from lookuppn with VN_PHANTOM_HOLD()s */
+ if (dvp != NULL && *dvp != NULL) {
+ VN_PHANTOM_HOLD(*dvp);
+ VN_RELE(*dvp);
+ }
+
+ if (vp != NULL && *vp != NULL) {
+ VN_PHANTOM_HOLD(*vp);
+ VN_RELE(*vp);
+ }
+
pn_free(&pn);
return (error);
}
@@ -815,7 +826,7 @@ port_getsrc(port_t *pp, int source)
*/
static void
port_check_timestamp(portfop_cache_t *pfcp, vnode_t *vp, vnode_t *dvp,
- portfop_t *pfp, void *objptr, uintptr_t object)
+ portfop_t *pfp, void *objptr, uintptr_t object)
{
vattr_t vatt;
portfop_vp_t *pvp = vp->v_fopdata;
@@ -1102,8 +1113,8 @@ port_install_fopdata(vnode_t *vp)
*/
int
port_pfp_setup(portfop_t **pfpp, port_t *pp, vnode_t *vp, portfop_cache_t *pfcp,
- uintptr_t object, int events, void *user, char *cname, int clen,
- vnode_t *dvp)
+ uintptr_t object, int events, void *user, char *cname, int clen,
+ vnode_t *dvp)
{
portfop_t *pfp = NULL;
port_kevent_t *pkevp;
@@ -1176,7 +1187,7 @@ port_pfp_setup(portfop_t **pfpp, port_t *pp, vnode_t *vp, portfop_cache_t *pfcp,
* Hold a reference to the vnode since
* we successfully installed the hooks.
*/
- VN_HOLD(vp);
+ VN_PHANTOM_HOLD(vp);
} else {
(void) fem_uninstall(vp, femp, vp);
pvp->pvp_femp = NULL;
@@ -1209,7 +1220,7 @@ port_pfp_setup(portfop_t **pfpp, port_t *pp, vnode_t *vp, portfop_cache_t *pfcp,
* Hold the directory vnode since we have a reference now.
*/
if (dvp != NULL)
- VN_HOLD(dvp);
+ VN_PHANTOM_HOLD(dvp);
*pfpp = pfp;
return (0);
}
@@ -1224,9 +1235,9 @@ port_resolve_vp(vnode_t *vp)
*/
if (vfs_mntdummyvp && mntfstype != 0 &&
vp->v_vfsp->vfs_fstype == mntfstype) {
- VN_RELE(vp);
+ VN_PHANTOM_RELE(vp);
vp = vfs_mntdummyvp;
- VN_HOLD(vfs_mntdummyvp);
+ VN_PHANTOM_HOLD(vfs_mntdummyvp);
}
/*
@@ -1234,8 +1245,8 @@ port_resolve_vp(vnode_t *vp)
* hardlinks.
*/
if ((VOP_REALVP(vp, &rvp, NULL) == 0) && vp != rvp) {
- VN_HOLD(rvp);
- VN_RELE(vp);
+ VN_PHANTOM_HOLD(rvp);
+ VN_PHANTOM_RELE(vp);
vp = rvp;
}
return (vp);
@@ -1247,10 +1258,10 @@ port_resolve_vp(vnode_t *vp)
* The association is identified by the object pointer and the pid.
* The events argument contains the events to be monitored for.
*
- * The vnode will have a VN_HOLD once the fem hooks are installed.
+ * The vnode will have a VN_PHANTOM_HOLD once the fem hooks are installed.
*
- * Every reference(pfp) to the directory vnode will have a VN_HOLD to ensure
- * that the directory vnode pointer does not change.
+ * Every reference(pfp) to the directory vnode will have a VN_PHANTOM_HOLD to
+ * ensure that the directory vnode pointer does not change.
*/
int
port_associate_fop(port_t *pp, int source, uintptr_t object, int events,
@@ -1330,7 +1341,7 @@ port_associate_fop(port_t *pp, int source, uintptr_t object, int events,
*/
if (dvp != NULL && dvp->v_vfsp != vp->v_vfsp &&
!(orig->v_type == VPROC && vp != NULL && vp->v_type != VPROC)) {
- VN_RELE(dvp);
+ VN_PHANTOM_RELE(dvp);
dvp = NULL;
}
@@ -1350,8 +1361,8 @@ port_associate_fop(port_t *pp, int source, uintptr_t object, int events,
pfp = port_cache_lookup_fop(pfcp, curproc->p_pid, object);
/*
- * If it is not the same vnode, just discard it. VN_RELE needs to be
- * called with no locks held, therefore save vnode pointers and
+ * If it is not the same vnode, just discard it. VN_PHANTOM_RELE needs
+ * to be called with no locks held, therefore save vnode pointers and
* vn_rele them later.
*/
if (pfp != NULL && (pfp->pfop_vp != vp || pfp->pfop_dvp != dvp)) {
@@ -1404,7 +1415,7 @@ port_associate_fop(port_t *pp, int source, uintptr_t object, int events,
* This vnode pointer is just used
* for comparison, so rele it
*/
- VN_RELE(tvp);
+ VN_PHANTOM_RELE(tvp);
}
}
@@ -1437,8 +1448,8 @@ port_associate_fop(port_t *pp, int source, uintptr_t object, int events,
* active and it is not being removed from
* the vnode list. This is checked in
* port_remove_fop with the vnode lock held.
- * The vnode returned is VN_RELE'ed after dropping
- * the locks.
+ * The vnode returned is VN_PHANTOM_RELE'ed after
+ * dropping the locks.
*/
tdvp = tvp = NULL;
if (port_remove_fop(pfp, pfcp, 0, NULL, &tvp, &tdvp)) {
@@ -1451,9 +1462,9 @@ port_associate_fop(port_t *pp, int source, uintptr_t object, int events,
}
mutex_exit(&pfcp->pfc_lock);
if (tvp != NULL)
- VN_RELE(tvp);
+ VN_PHANTOM_RELE(tvp);
if (tdvp != NULL)
- VN_RELE(tdvp);
+ VN_PHANTOM_RELE(tdvp);
goto errout;
}
} else {
@@ -1519,14 +1530,14 @@ errout:
* Release the hold acquired due to the lookup operation.
*/
if (vp != NULL)
- VN_RELE(vp);
+ VN_PHANTOM_RELE(vp);
if (dvp != NULL)
- VN_RELE(dvp);
+ VN_PHANTOM_RELE(dvp);
if (oldvp != NULL)
- VN_RELE(oldvp);
+ VN_PHANTOM_RELE(oldvp);
if (olddvp != NULL)
- VN_RELE(olddvp);
+ VN_PHANTOM_RELE(olddvp);
/*
* copied file name not used, free it.
@@ -1587,9 +1598,9 @@ port_dissociate_fop(port_t *pp, uintptr_t object)
(void) port_remove_fop(pfp, pfcp, 1, &active, &tvp, &tdvp);
mutex_exit(&pfcp->pfc_lock);
if (tvp != NULL)
- VN_RELE(tvp);
+ VN_PHANTOM_RELE(tvp);
if (tdvp != NULL)
- VN_RELE(tdvp);
+ VN_PHANTOM_RELE(tdvp);
return (active ? 0 : ENOENT);
}
@@ -1610,7 +1621,7 @@ port_close_fop(void *arg, int port, pid_t pid, int lastclose)
portfop_t *pfpnext;
int index, i;
port_source_t *pse;
- vnode_t *tdvp = NULL;
+ vnode_t *tdvp = NULL;
vnode_t *vpl[PORTFOP_NVP];
pse = port_getsrc(pp, PORT_SOURCE_FILE);
@@ -1627,7 +1638,7 @@ port_close_fop(void *arg, int port, pid_t pid, int lastclose)
* be possible as the port is being closed.
*
* The common case is that the port is not shared and all the entries
- * are of this pid and have to be freed. Since VN_RELE has to be
+ * are of this pid and have to be freed. Since VN_PHANTOM_RELE has to be
* called outside the lock, we do it in batches.
*/
hashtbl = (portfop_t **)pfcp->pfc_hash;
@@ -1654,14 +1665,14 @@ port_close_fop(void *arg, int port, pid_t pid, int lastclose)
if (pfp == NULL)
index++;
/*
- * Now call VN_RELE if we have collected enough vnodes or
- * we have reached the end of the hash table.
+ * Now call VN_PHANTOM_RELE if we have collected enough vnodes
+ * or we have reached the end of the hash table.
*/
if (i >= (PORTFOP_NVP - 1) ||
(i > 0 && index == PORTFOP_HASHSIZE)) {
mutex_exit(&pfcp->pfc_lock);
while (i > 0) {
- VN_RELE(vpl[--i]);
+ VN_PHANTOM_RELE(vpl[--i]);
vpl[i] = NULL;
}
mutex_enter(&pfcp->pfc_lock);
@@ -1769,7 +1780,7 @@ port_fop_excep(list_t *tlist, int op)
port_pcache_remove_fop(pfcp, pfp);
mutex_exit(&pfcp->pfc_lock);
if (tdvp != NULL)
- VN_RELE(tdvp);
+ VN_PHANTOM_RELE(tdvp);
}
}
@@ -1933,7 +1944,7 @@ port_fop_sendevent(vnode_t *vp, int events, vnode_t *dvp, char *cname)
* that may be attempting to remove an object from the vnode's.
*/
if (port_fop_femuninstall(vp))
- VN_RELE(vp);
+ VN_PHANTOM_RELE(vp);
/*
* Send exception events and discard the watch entries.
@@ -1980,7 +1991,7 @@ port_fop(vnode_t *vp, int op, int retval)
event |= FILE_TRUNC;
}
if (event) {
- port_fop_sendevent(vp, event, NULL, NULL);
+ port_fop_sendevent(vp, event, NULL, NULL);
}
}
@@ -2068,7 +2079,7 @@ port_fop_unmount(fsemarg_t *vf, int flag, cred_t *cr)
* unmount is in process.
*/
port_fop_sendevent(pvp->pvp_vp, UNMOUNTED, NULL, NULL);
- VN_RELE(pvp->pvp_vp);
+ VN_PHANTOM_RELE(pvp->pvp_vp);
}
error = vfsnext_unmount(vf, flag, cr);
diff --git a/usr/src/uts/common/fs/proc/prioctl.c b/usr/src/uts/common/fs/proc/prioctl.c
index 08c5f6ffc0..d596d06a34 100644
--- a/usr/src/uts/common/fs/proc/prioctl.c
+++ b/usr/src/uts/common/fs/proc/prioctl.c
@@ -71,7 +71,7 @@
#include <sys/ctfs_impl.h>
#include <sys/ctfs.h>
-#if defined(__i386) || defined(__i386_COMPAT)
+#if defined(__i386_COMPAT)
#include <sys/sysi86.h>
#endif
@@ -133,6 +133,7 @@ prctioctl(prnode_t *pnp, int cmd, intptr_t arg, int flag, cred_t *cr)
/*
* Control operations (lots).
*/
+/* BEGIN CSTYLED */
/*ARGSUSED*/
#ifdef _SYSCALL32_IMPL
static int
@@ -144,6 +145,7 @@ prioctl(struct vnode *vp, int cmd, intptr_t arg, int flag, cred_t *cr,
int *rvalp, caller_context_t *ct)
#endif /* _SYSCALL32_IMPL */
{
+/* END CSTYLED */
int nsig = PROC_IS_BRANDED(curproc)? BROP(curproc)->b_nsig : NSIG;
caddr_t cmaddr = (caddr_t)arg;
proc_t *p;
@@ -275,11 +277,11 @@ prioctl(struct vnode *vp, int cmd, intptr_t arg, int flag, cred_t *cr,
case PIOCAUXV:
break;
-#if defined(__i386) || defined(__amd64)
+#if defined(__x86)
case PIOCNLDT:
case PIOCLDT:
break;
-#endif /* __i386 || __amd64 */
+#endif /* __x86 */
#if defined(__sparc)
case PIOCGWIN:
@@ -1235,7 +1237,7 @@ startover:
break;
}
-#if defined(__i386) || defined(__amd64)
+#if defined(__x86)
case PIOCNLDT: /* get number of LDT entries */
{
int n;
@@ -1290,7 +1292,7 @@ startover:
kmem_free(ssd, (n+1) * sizeof (*ssd));
break;
}
-#endif /* __i386 || __amd64 */
+#endif /* __x86 */
#if defined(__sparc)
case PIOCGWIN: /* get gwindows_t (see sys/reg.h) */
@@ -1830,11 +1832,11 @@ prioctl32(struct vnode *vp, int cmd, intptr_t arg, int flag, cred_t *cr,
case PIOCAUXV:
break;
-#if defined(__i386) || defined(__i386_COMPAT)
+#if defined(__i386_COMPAT)
case PIOCNLDT:
case PIOCLDT:
break;
-#endif /* __i386 || __i386_COMPAT */
+#endif /* __i386_COMPAT */
#if defined(__sparc)
case PIOCGWIN:
@@ -2867,7 +2869,7 @@ startover:
break;
}
-#if defined(__i386) || defined(__i386_COMPAT)
+#if defined(__i386_COMPAT)
case PIOCNLDT: /* get number of LDT entries */
{
int n;
@@ -2922,7 +2924,7 @@ startover:
kmem_free(ssd, (n+1) * sizeof (*ssd));
break;
}
-#endif /* __i386 || __i386_COMPAT */
+#endif /* __i386_COMPAT */
#if defined(__sparc)
case PIOCGWIN: /* get gwindows_t (see sys/reg.h) */
diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c
index d096168b48..2dccbb2f63 100644
--- a/usr/src/uts/common/fs/proc/prvnops.c
+++ b/usr/src/uts/common/fs/proc/prvnops.c
@@ -954,8 +954,7 @@ pr_read_fdinfo(prnode_t *pnp, uio_t *uiop, cred_t *cr)
fdinfo = pr_iol_newbuf(&data, offsetof(prfdinfo_t, pr_misc));
fdinfo->pr_fd = fd;
fdinfo->pr_fdflags = ufp_flag;
- /* FEPOLLED on f_flag2 should never be user-visible */
- fdinfo->pr_fileflags = (fp->f_flag2 & ~FEPOLLED) << 16 | fp->f_flag;
+ fdinfo->pr_fileflags = fp->f_flag2 << 16 | fp->f_flag;
if ((fdinfo->pr_fileflags & (FSEARCH | FEXEC)) == 0)
fdinfo->pr_fileflags += FOPEN;
fdinfo->pr_offset = fp->f_offset;
@@ -6236,7 +6235,7 @@ prseek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
/*
* We use the p_execdir member of proc_t to expand the %d token in core file
* paths (the directory path for the executable that dumped core; see
- * coreadm(1M) for details). We'd like gcore(1) to be able to expand %d in
+ * coreadm(8) for details). We'd like gcore(1) to be able to expand %d in
* the same way as core dumping from the kernel, but there's no convenient
* and comprehensible way to export the path name for p_execdir. To solve
* this, we try to find the actual path to the executable that was used. In
diff --git a/usr/src/uts/common/fs/smbclnt/netsmb/nsmb_sign_kcf.c b/usr/src/uts/common/fs/smbclnt/netsmb/nsmb_sign_kcf.c
index 4235c94a06..f1a24bfeff 100644
--- a/usr/src/uts/common/fs/smbclnt/netsmb/nsmb_sign_kcf.c
+++ b/usr/src/uts/common/fs/smbclnt/netsmb/nsmb_sign_kcf.c
@@ -32,7 +32,7 @@
* Common function to see if a mech is available.
*/
static int
-find_mech(smb_sign_mech_t *mech, crypto_mech_name_t name)
+find_mech(smb_sign_mech_t *mech, const char *name)
{
crypto_mech_type_t t;
diff --git a/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_smb.c b/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_smb.c
index 73b5c62225..16b9987972 100644
--- a/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_smb.c
+++ b/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_smb.c
@@ -34,7 +34,7 @@
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
- * Copyright 2019 Nexenta by DDN, Inc. All rights reserved.
+ * Copyright 2021 Tintri by DDN, Inc. All rights reserved.
*/
#include <sys/param.h>
@@ -439,7 +439,7 @@ out:
if (fhp != NULL)
smb_fh_rele(fhp);
- return (0);
+ return (error);
}
void
diff --git a/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vnops.c b/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vnops.c
index 3fca806155..c19e92976f 100644
--- a/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vnops.c
+++ b/usr/src/uts/common/fs/smbclnt/smbfs/smbfs_vnops.c
@@ -34,7 +34,7 @@
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2021 Tintri by DDN, Inc. All rights reserved.
*/
/*
@@ -326,6 +326,7 @@ smbfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
/*
* We have a new FID and access rights.
*/
+ VERIFY(fid != NULL);
oldfid = np->n_fid;
np->n_fid = fid;
np->n_fidrefs++;
@@ -562,6 +563,10 @@ smbfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
if (smi->smi_flags & SMI_DEAD || vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
return (EIO);
+ /* Sanity check: should have a valid open */
+ if (np->n_fid == NULL)
+ return (EIO);
+
ASSERT(smbfs_rw_lock_held(&np->r_rwlock, RW_READER));
if (vp->v_type != VREG)
@@ -723,6 +728,10 @@ smbfs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
if (smi->smi_flags & SMI_DEAD || vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
return (EIO);
+ /* Sanity check: should have a valid open */
+ if (np->n_fid == NULL)
+ return (EIO);
+
ASSERT(smbfs_rw_lock_held(&np->r_rwlock, RW_WRITER));
if (vp->v_type != VREG)
@@ -4427,6 +4436,10 @@ smbfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
if (smi->smi_flags & SMI_DEAD || vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
return (EIO);
+ /* Sanity check: should have a valid open */
+ if (np->n_fid == NULL)
+ return (EIO);
+
if (vp->v_flag & VNOMAP)
return (ENOSYS);
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_dispatch.c b/usr/src/uts/common/fs/smbsrv/smb2_dispatch.c
index 9010e3a181..9aafb6e4d7 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_dispatch.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_dispatch.c
@@ -11,7 +11,7 @@
/*
* Copyright 2019 Nexenta Systems, Inc. All rights reserved.
- * Copyright 2019 RackTop Systems.
+ * Copyright 2020 RackTop Systems, Inc.
*/
@@ -973,6 +973,19 @@ cmd_done:
*/
(void) smb2_encode_header(sr, B_TRUE);
+ /*
+	 * The preauth hash must cover the final encoded header, so this
+	 * cannot move into smb2_session_setup().
+ */
+ if (session->dialect >= SMB_VERS_3_11 &&
+ sr->smb2_cmd_code == SMB2_SESSION_SETUP &&
+ sr->smb2_status == NT_STATUS_MORE_PROCESSING_REQUIRED) {
+ if (smb31_preauth_sha512_calc(sr, &sr->reply,
+ sr->uid_user->u_preauth_hashval,
+ sr->uid_user->u_preauth_hashval) != 0)
+ cmn_err(CE_WARN, "(3) Preauth hash calculation "
+ "failed");
+ }
+
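
The call above folds the encoded session-setup reply into the connection's running preauth integrity hash, which [MS-SMB2] 3.3.5.4/3.3.5.5 define as the chain H = SHA-512(H || message) over every NEGOTIATE and SESSION_SETUP message. A minimal user-space sketch of that chaining step, using OpenSSL's SHA-512 purely for illustration (the kernel's smb31_preauth_sha512_calc does the same walk over the mbuf chain via KCF):

#include <stddef.h>
#include <stdint.h>
#include <openssl/sha.h>

/* Fold one whole SMB2 message into the 64-byte running preauth hash. */
static void
preauth_fold(uint8_t hashval[64], const void *msg, size_t len)
{
	SHA512_CTX ctx;

	SHA512_Init(&ctx);
	SHA512_Update(&ctx, hashval, 64);	/* previous hash value */
	SHA512_Update(&ctx, msg, len);		/* full request or reply */
	SHA512_Final(hashval, &ctx);		/* becomes the new value */
}
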
/* Don't sign if we're going to encrypt */
if (sr->tform_ssn == NULL &&
(sr->smb2_hdr_flags & SMB2_FLAGS_SIGNED) != 0)
@@ -1109,8 +1122,8 @@ cmd_start:
disconnect = B_TRUE;
goto cleanup;
}
- sr->smb2_hdr_flags |= (SMB2_FLAGS_SERVER_TO_REDIR |
- SMB2_FLAGS_ASYNC_COMMAND);
+ sr->smb2_hdr_flags |= (SMB2_FLAGS_SERVER_TO_REDIR |
+ SMB2_FLAGS_ASYNC_COMMAND);
sr->smb2_async_id = SMB2_ASYNCID(sr);
/*
@@ -1479,8 +1492,7 @@ smb2_send_reply(smb_request_t *sr)
if ((session->capabilities & SMB2_CAP_ENCRYPTION) == 0 ||
sr->tform_ssn == NULL) {
- if (smb_session_send(sr->session, 0, &sr->reply) == 0)
- sr->reply.chain = 0;
+ (void) smb_session_send(sr->session, 0, &sr->reply);
return;
}
@@ -1505,8 +1517,8 @@ smb2_send_reply(smb_request_t *sr)
goto errout;
}
- if (smb_session_send(sr->session, 0, &enc_reply) == 0)
- enc_reply.chain = 0;
+ (void) smb_session_send(sr->session, 0, &enc_reply);
+ kmem_free(tmpbuf, buflen);
return;
errout:
@@ -1590,6 +1602,66 @@ smb2sr_put_error_data(smb_request_t *sr, uint32_t status, mbuf_chain_t *mbc)
}
/*
+ * Build an SMB2 error context response (dialect 3.1.1).
+ */
+void
+smb2sr_put_error_ctx(smb_request_t *sr, uint32_t status, uint32_t errid,
+ mbuf_chain_t *mbc)
+{
+ DWORD len;
+
+ /*
+ * The common dispatch code writes this when it
+ * updates the SMB2 header before sending.
+ */
+ sr->smb2_status = status;
+
+ /* Rewind to the end of the SMB header. */
+ sr->reply.chain_offset = sr->smb2_reply_hdr + SMB2_HDR_SIZE;
+
+ /*
+	 * An Error Context is an 8-byte header plus encapsulated data
+	 * (ErrorContextData), which can be zero-length.
+ */
+ if (mbc != NULL && (len = MBC_LENGTH(mbc)) != 0) {
+ (void) smb_mbc_encodef(
+ &sr->reply,
+ "wbblllC",
+ 9, /* StructSize */ /* w */
+ 1, /* ErrorContextCount */ /* b */
+ 0, /* reserved */ /* b */
+ 8+len, /* ByteCount */ /* l */
+ len, /* ErrorDataLength */ /* l */
+ errid, /* ErrorId */ /* l */
+ mbc); /* C */
+ } else {
+ (void) smb_mbc_encodef(
+ &sr->reply,
+ "wbblll",
+ 9, /* StructSize */ /* w */
+ 1, /* ErrorContextCount */ /* b */
+ 0, /* reserved */ /* b */
+ 8, /* ByteCount */ /* l */
+ 0, /* ErrorDataLength */ /* l */
+ errid); /* ErrorId */ /* l */
+ }
+}
+
+/*
+ * Build an SMB2 error context response with SMB2_ERROR_ID_DEFAULT ErrorId.
+ *
+ * This only handles the case we currently need, encapsulating a
+ * single error data section inside an SMB2_ERROR_ID_DEFAULT
+ * error context type (which is type zero, and that's what
+ * the zero on the end of this function name refers to).
+ */
+void
+smb2sr_put_error_ctx0(smb_request_t *sr, uint32_t status, mbuf_chain_t *mbc)
+{
+	smb2sr_put_error_ctx(sr, status, SMB2_ERROR_ID_DEFAULT, mbc);
+}
+
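
The "wbblllC" / "wbblll" encodings above produce the SMB2 ERROR response from [MS-SMB2] 2.2.2 carrying one [MS-SMB2] 2.2.2.1 error context. A user-space sketch of the resulting layout, with illustrative struct names (not from the smbsrv headers) and assuming a little-endian host to match the wire format:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#pragma pack(1)
struct smb2_error_rsp {			/* [MS-SMB2] 2.2.2 */
	uint16_t StructureSize;		/* always 9 */
	uint8_t  ErrorContextCount;	/* 1 when a context follows */
	uint8_t  Reserved;
	uint32_t ByteCount;		/* bytes of ErrorData following */
};
struct smb2_error_ctx {			/* [MS-SMB2] 2.2.2.1 */
	uint32_t ErrorDataLength;	/* length of ErrorContextData */
	uint32_t ErrorId;		/* SMB2_ERROR_ID_DEFAULT == 0 */
};
#pragma pack()

int
main(void)
{
	/* e.g. BUFFER_TOO_SMALL carries a 4-byte required-size payload */
	uint32_t required = 1024;
	struct smb2_error_rsp rsp = { 9, 1, 0, 8 + sizeof (required) };
	struct smb2_error_ctx ctx = { sizeof (required), 0 };
	uint8_t buf[8 + 8 + sizeof (required)];

	memcpy(buf, &rsp, sizeof (rsp));
	memcpy(buf + 8, &ctx, sizeof (ctx));
	memcpy(buf + 16, &required, sizeof (required));
	printf("%zu bytes, ByteCount=%u\n", sizeof (buf), rsp.ByteCount);
	return (0);
}

The two branches in smb2sr_put_error_ctx differ only in whether ErrorContextData follows (ByteCount = 8 + len) or ErrorDataLength is zero (ByteCount = 8).
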
+/*
* smb2sr_lookup_fid
*
* Setup sr->fid_ofile, either inherited from a related command,
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_durable.c b/usr/src/uts/common/fs/smbsrv/smb2_durable.c
index 56dda62832..c783cd9659 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_durable.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_durable.c
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2020 Tintri by DDN, Inc. All rights reserved.
*/
/*
@@ -179,6 +179,8 @@ preserve_some:
/* preserve_opens == SMB2_DH_PRESERVE_SOME */
switch (of->dh_vers) {
+ uint32_t ol_state;
+
case SMB2_RESILIENT:
return (B_TRUE);
@@ -188,7 +190,11 @@ preserve_some:
/* FALLTHROUGH */
case SMB2_DURABLE_V1:
/* IS durable (v1 or v2) */
- if ((of->f_oplock.og_state & (OPLOCK_LEVEL_BATCH |
+ if (of->f_lease != NULL)
+ ol_state = of->f_lease->ls_state;
+ else
+ ol_state = of->f_oplock.og_state;
+ if ((ol_state & (OPLOCK_LEVEL_BATCH |
OPLOCK_LEVEL_CACHE_HANDLE)) != 0)
return (B_TRUE);
/* FALLTHROUGH */
@@ -360,6 +366,12 @@ smb2_dh_import_share(void *arg)
break;
/*
+ * If the server's stopping, no point importing.
+ */
+ if (smb_server_is_stopping(sr->sr_server))
+ break;
+
+ /*
* Read a stream name and info
*/
rc = smb_odir_read_streaminfo(sr, od, str_info, &eof);
@@ -392,6 +404,7 @@ smb2_dh_import_share(void *arg)
of = NULL;
}
sr->fid_ofile = NULL;
+ smb_llist_flush(&sr->tid_tree->t_ofile_list);
} while (!eof);
@@ -813,7 +826,7 @@ smb2_dh_read_nvlist(smb_request_t *sr, smb_node_t *node,
smb_attr_t attr;
iovec_t iov;
uio_t uio;
- smb_kshare_t *shr = sr->arg.tcon.si;
+ smb_tree_t *tree = sr->tid_tree;
cred_t *kcr = zone_kcred();
size_t flen;
int rc;
@@ -823,14 +836,14 @@ smb2_dh_read_nvlist(smb_request_t *sr, smb_node_t *node,
rc = smb_node_getattr(NULL, node, kcr, NULL, &attr);
if (rc != 0) {
cmn_err(CE_NOTE, "CA import (%s/%s) getattr rc=%d",
- shr->shr_path, node->od_name, rc);
+ tree->t_resource, node->od_name, rc);
return (rc);
}
if (attr.sa_vattr.va_size < 4 ||
attr.sa_vattr.va_size > sr->sr_req_length) {
cmn_err(CE_NOTE, "CA import (%s/%s) bad size=%" PRIu64,
- shr->shr_path, node->od_name,
+ tree->t_resource, node->od_name,
(uint64_t)attr.sa_vattr.va_size);
return (EINVAL);
}
@@ -847,19 +860,19 @@ smb2_dh_read_nvlist(smb_request_t *sr, smb_node_t *node,
rc = smb_fsop_read(sr, kcr, node, NULL, &uio, 0);
if (rc != 0) {
cmn_err(CE_NOTE, "CA import (%s/%s) read, rc=%d",
- shr->shr_path, node->od_name, rc);
+ tree->t_resource, node->od_name, rc);
return (rc);
}
if (uio.uio_resid != 0) {
cmn_err(CE_NOTE, "CA import (%s/%s) short read",
- shr->shr_path, node->od_name);
+ tree->t_resource, node->od_name);
return (EIO);
}
rc = nvlist_unpack(sr->sr_request_buf, flen, nvlpp, KM_SLEEP);
if (rc != 0) {
cmn_err(CE_NOTE, "CA import (%s/%s) unpack, rc=%d",
- shr->shr_path, node->od_name, rc);
+ tree->t_resource, node->od_name, rc);
return (rc);
}
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_fsctl_copychunk.c b/usr/src/uts/common/fs/smbsrv/smb2_fsctl_copychunk.c
index 4240328207..930bd353c4 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_fsctl_copychunk.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_fsctl_copychunk.c
@@ -204,7 +204,7 @@ smb2_fsctl_copychunk(smb_request_t *sr, smb_fsctl_t *fsctl)
* The client should then fall back to normal copy.
*/
args->bufsize = smb2_copychunk_max_seg;
- args->buffer = kmem_alloc(args->bufsize, KM_NOSLEEP | KM_NORMALPRI);
+ args->buffer = kmem_alloc(args->bufsize, KM_NOSLEEP_LAZY);
if (args->buffer == NULL) {
status = NT_STATUS_INSUFF_SERVER_RESOURCES;
goto out;
@@ -447,6 +447,8 @@ smb2_fsctl_copychunk_meta(smb_request_t *sr, smb_ofile_t *src_of)
* here don't generally have WRITE_DAC access (sigh) so we
* have to bypass ofile access checks for this operation.
* The file-system level still does its access checking.
+ *
+ * TODO: this should really copy the SACL, too.
*/
smb_fssd_init(&fs_sd, secinfo, sd_flags);
sr->fid_ofile = NULL;
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_fsctl_fs.c b/usr/src/uts/common/fs/smbsrv/smb2_fsctl_fs.c
index 829beda2e4..381fd7663e 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_fsctl_fs.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_fsctl_fs.c
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2019 Nexenta by DDN, Inc. All rights reserved.
*/
/*
@@ -23,7 +23,16 @@
#include <smbsrv/smb_fsops.h>
#include <smb/winioctl.h>
-/* ARGSUSED */
+/*
+ * XXX: Should use smb2_fsctl_invalid in place of smb2_fsctl_notsup
+ * but that will require some re-testing.
+ */
+static uint32_t
+smb2_fsctl_invalid(smb_request_t *sr, smb_fsctl_t *fsctl)
+{
+ return (NT_STATUS_INVALID_DEVICE_REQUEST);
+}
+
static uint32_t
smb2_fsctl_notsup(smb_request_t *sr, smb_fsctl_t *fsctl)
{
@@ -52,9 +61,12 @@ smb2_fsctl_get_compression(smb_request_t *sr, smb_fsctl_t *fsctl)
{
_NOTE(ARGUNUSED(sr))
uint16_t compress_state = 0;
+ int rc;
- (void) smb_mbc_encodef(fsctl->in_mbc, "w",
+ rc = smb_mbc_encodef(fsctl->in_mbc, "w",
compress_state);
+ if (rc != 0)
+ return (NT_STATUS_BUFFER_OVERFLOW);
return (NT_STATUS_SUCCESS);
}
@@ -97,6 +109,7 @@ smb2_fsctl_get_resume_key(smb_request_t *sr, smb_fsctl_t *fsctl)
{
smb_ofile_t *of = sr->fid_ofile;
smb2fid_t smb2fid;
+ int rc;
/* Caller makes sure we have of = sr->fid_ofile */
/* Don't insist on a plain file (see above). */
@@ -104,10 +117,12 @@ smb2_fsctl_get_resume_key(smb_request_t *sr, smb_fsctl_t *fsctl)
smb2fid.persistent = of->f_persistid;
smb2fid.temporal = of->f_fid;
- (void) smb_mbc_encodef(
+ rc = smb_mbc_encodef(
fsctl->out_mbc, "qq16.",
smb2fid.persistent,
smb2fid.temporal);
+ if (rc != 0)
+ return (NT_STATUS_BUFFER_OVERFLOW);
return (NT_STATUS_SUCCESS);
}
@@ -130,9 +145,11 @@ smb2_fsctl_fs(smb_request_t *sr, smb_fsctl_t *fsctl)
break;
case FSCTL_SET_REPARSE_POINT: /* 41 */
case FSCTL_GET_REPARSE_POINT: /* 42 */
- case FSCTL_CREATE_OR_GET_OBJECT_ID: /* 48 */
func = smb2_fsctl_notsup;
break;
+ case FSCTL_CREATE_OR_GET_OBJECT_ID: /* 48 */
+ func = smb2_fsctl_invalid;
+ break;
case FSCTL_SET_SPARSE: /* 49 */
func = smb2_fsctl_set_sparse;
break;
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_fsctl_odx.c b/usr/src/uts/common/fs/smbsrv/smb2_fsctl_odx.c
index 0452cddb39..fe748bbd62 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_fsctl_odx.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_fsctl_odx.c
@@ -667,7 +667,7 @@ smb2_fsctl_odx_write_native1(smb_request_t *sr,
* allow the allocation to fail and return an error.
* The client should then fall back to normal copy.
*/
- buffer = kmem_alloc(bufsize, KM_NOSLEEP | KM_NORMALPRI);
+ buffer = kmem_alloc(bufsize, KM_NOSLEEP_LAZY);
if (buffer == NULL) {
status = NT_STATUS_INSUFF_SERVER_RESOURCES;
goto out;
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_lease.c b/usr/src/uts/common/fs/smbsrv/smb2_lease.c
index 95d7d9c7f1..a23f474cec 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_lease.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_lease.c
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2021 Tintri by DDN, Inc. All rights reserved.
*/
/*
@@ -653,7 +653,6 @@ done:
ofile->f_oplock.og_state = op->op_oplock_state;
mutex_enter(&lease->ls_mutex);
lease->ls_state = op->op_oplock_state & CACHE_RWH;
- lease->ls_oplock_ofile = ofile;
lease->ls_epoch++;
mutex_exit(&lease->ls_mutex);
}
@@ -685,6 +684,9 @@ smb2_lease_ofile_close(smb_ofile_t *ofile)
smb_lease_t *lease = ofile->f_lease;
smb_ofile_t *o;
+ ASSERT(RW_READ_HELD(&node->n_ofile_list.ll_lock));
+ ASSERT(MUTEX_HELD(&node->n_oplock.ol_mutex));
+
/*
* If this ofile was not the oplock owner for this lease,
* we can leave things as they are.
@@ -696,24 +698,22 @@ smb2_lease_ofile_close(smb_ofile_t *ofile)
* Find another ofile to which we can move the oplock.
* The ofile must be open and allow a new ref.
*/
- smb_llist_enter(&node->n_ofile_list, RW_READER);
FOREACH_NODE_OFILE(node, o) {
if (o == ofile)
continue;
if (o->f_lease != lease)
continue;
+ if (o->f_oplock.og_closing)
+ continue;
/* If we can get a hold, use this ofile. */
if (smb_ofile_hold(o))
break;
}
if (o == NULL) {
/* Normal for last close on a lease. */
- smb_llist_exit(&node->n_ofile_list);
return;
}
smb_oplock_move(node, ofile, o);
- lease->ls_oplock_ofile = o;
- smb_llist_exit(&node->n_ofile_list);
smb_ofile_release(o);
}
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_negotiate.c b/usr/src/uts/common/fs/smbsrv/smb2_negotiate.c
index e8d8419f93..7d67247588 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_negotiate.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_negotiate.c
@@ -11,7 +11,7 @@
/*
* Copyright 2019 Nexenta Systems, Inc. All rights reserved.
- * Copyright 2019 RackTop Systems.
+ * Copyright 2021 RackTop Systems, Inc.
*/
/*
@@ -20,14 +20,7 @@
#include <smbsrv/smb2_kproto.h>
#include <smbsrv/smb2.h>
-
-/*
- * Note from [MS-SMB2] Sec. 2.2.3: Windows servers return
- * invalid parameter if the dialect count is greater than 64
- * This is here (and not in smb2.h) because this is technically
- * an implementation detail, not protocol specification.
- */
-#define SMB2_NEGOTIATE_MAX_DIALECTS 64
+#include <sys/random.h>
static int smb2_negotiate_common(smb_request_t *, uint16_t);
@@ -85,6 +78,7 @@ static uint16_t smb2_versions[] = {
0x210, /* SMB 2.1 */
0x300, /* SMB 3.0 */
0x302, /* SMB 3.02 */
+ 0x311, /* SMB 3.11 */
};
static uint16_t smb2_nversions =
sizeof (smb2_versions) / sizeof (smb2_versions[0]);
@@ -210,16 +204,377 @@ smb2_find_best_dialect(smb_session_t *s, uint16_t cl_versions[],
* Return value is 0 for success, and anything else will
* terminate the reader thread (drop the connection).
*/
+enum smb2_neg_ctx_type {
+ SMB2_PREAUTH_INTEGRITY_CAPS = 1,
+ SMB2_ENCRYPTION_CAPS = 2,
+	SMB2_COMPRESSION_CAPS = 3,		/* not implemented */
+	SMB2_NETNAME_NEGOTIATE_CONTEXT_ID = 5	/* not implemented */
+};
+
+typedef struct smb2_negotiate_ctx {
+ uint16_t type;
+ uint16_t datalen;
+} smb2_neg_ctx_t;
+
+#define SMB31_PREAUTH_CTX_SALT_LEN 32
+
+/*
+ * SMB 3.1.1 originally specified a single hashing algorithm - SHA-512 - and
+ * two encryption ones - AES-128-CCM and AES-128-GCM.
+ * Windows Server 2022 and Windows 11 introduced two further encryption
+ * algorithms - AES-256-CCM and AES-256-GCM.
+ */
+#define MAX_HASHID_NUM (1)
+#define MAX_CIPHER_NUM (4)
+
+typedef struct smb2_preauth_integrity_caps {
+ uint16_t picap_hash_count;
+ uint16_t picap_salt_len;
+ uint16_t picap_hash_id;
+ uint8_t picap_salt[SMB31_PREAUTH_CTX_SALT_LEN];
+} smb2_preauth_caps_t;
+
+typedef struct smb2_encryption_caps {
+ uint16_t encap_cipher_count;
+ uint16_t encap_cipher_ids[MAX_CIPHER_NUM];
+} smb2_encrypt_caps_t;
+
+/*
+ * The contexts we support
+ */
+typedef struct smb2_preauth_neg_ctx {
+ smb2_neg_ctx_t neg_ctx;
+ smb2_preauth_caps_t preauth_caps;
+} smb2_preauth_neg_ctx_t;
+
+typedef struct smb2_encrypt_neg_ctx {
+ smb2_neg_ctx_t neg_ctx;
+ smb2_encrypt_caps_t encrypt_caps;
+} smb2_encrypt_neg_ctx_t;
+
+typedef struct smb2_neg_ctxs {
+ uint32_t offset;
+ uint16_t count;
+ smb2_preauth_neg_ctx_t preauth_ctx;
+ smb2_encrypt_neg_ctx_t encrypt_ctx;
+} smb2_neg_ctxs_t;
+
+#define NEG_CTX_INFO_OFFSET (SMB2_HDR_SIZE + 28)
+#define NEG_CTX_OFFSET_OFFSET (SMB2_HDR_SIZE + 64)
+#define NEG_CTX_MAX_COUNT (16)
+#define NEG_CTX_MAX_DATALEN (256)
+
+#define	STATUS_SMB_NO_PREAUTH_INTEGRITY_HASH_OVERLAP	(0xC05D0000)
+
+#define	STATUS_PREAUTH_HASH_OVERLAP \
+	STATUS_SMB_NO_PREAUTH_INTEGRITY_HASH_OVERLAP
+
+#define SMB3_CIPHER_ENABLED(c, f) ((c) <= SMB3_CIPHER_MAX && \
+ SMB3_CIPHER_BIT(c) & (f))
+
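
For reference, the CipherId values carried in SMB2_ENCRYPTION_CAPS are fixed by [MS-SMB2] 2.2.3.1.2; the SMB3_CIPHER_* macros used above come from the smbsrv headers, so the sketch below only restates the protocol constants:

/* [MS-SMB2] 2.2.3.1.2 SMB2_ENCRYPTION_CAPABILITIES CipherId values */
enum smb2_cipher_id {
	CIPHER_AES128_CCM = 0x0001,
	CIPHER_AES128_GCM = 0x0002,
	CIPHER_AES256_CCM = 0x0003,	/* Windows Server 2022 / Win 11 */
	CIPHER_AES256_GCM = 0x0004
};
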
+/*
+ * This function should be called only for dialect >= 0x311.
+ * The negotiate context list must contain exactly one
+ * SMB2_PREAUTH_INTEGRITY_CAPS context; otherwise return
+ * STATUS_INVALID_PARAMETER.
+ * It must list at least one hash algorithm the server supports;
+ * otherwise return STATUS_SMB_NO_PREAUTH_INTEGRITY_HASH_OVERLAP.
+ */
+static uint32_t
+smb31_decode_neg_ctxs(smb_request_t *sr, smb2_neg_ctxs_t *neg_ctxs)
+{
+ smb_session_t *s = sr->session;
+ smb2_preauth_caps_t *picap = &neg_ctxs->preauth_ctx.preauth_caps;
+ smb2_encrypt_caps_t *encap = &neg_ctxs->encrypt_ctx.encrypt_caps;
+ boolean_t found_sha512 = B_FALSE;
+ boolean_t found_cipher = B_FALSE;
+ uint16_t ciphers = sr->sr_server->sv_cfg.skc_encrypt_cipher;
+ uint32_t status = 0;
+ int32_t skip;
+ int found_preauth_ctx = 0;
+ int found_encrypt_ctx = 0;
+ int cnt, i;
+ int rc;
+
+ sr->command.chain_offset = NEG_CTX_INFO_OFFSET;
+
+ rc = smb_mbc_decodef(&sr->command, "lw2.",
+ &neg_ctxs->offset, /* l */
+ &neg_ctxs->count); /* w */
+ if (rc != 0) {
+ status = NT_STATUS_INVALID_PARAMETER;
+ goto errout;
+ }
+ /*
+	 * There should be exactly one SMB2_PREAUTH_INTEGRITY_CAPS negotiate
+	 * ctx; an SMB2_ENCRYPTION_CAPS ctx is optional.
+	 * If there are no contexts, or too many, stop parsing.
+ */
+ cnt = neg_ctxs->count;
+ if (cnt < 1 || cnt > NEG_CTX_MAX_COUNT) {
+ status = NT_STATUS_INVALID_PARAMETER;
+ goto errout;
+ }
+
+ /*
+ * Cannot proceed parsing if the first context isn't aligned by 8.
+ */
+ if (neg_ctxs->offset % 8 != 0) {
+ status = NT_STATUS_INVALID_PARAMETER;
+ goto errout;
+ }
+
+ if ((skip = neg_ctxs->offset - sr->command.chain_offset) != 0 &&
+ smb_mbc_decodef(&sr->command, "#.", skip) != 0) {
+ status = NT_STATUS_INVALID_PARAMETER;
+ goto errout;
+ }
+
+ /*
+	 * Parse negotiate contexts.  Ignore non-decoding errors so we
+	 * fill in as much data as possible for the dtrace probe.
+ */
+ for (i = 0; i < cnt; i++) {
+ smb2_neg_ctx_t neg_ctx;
+ int32_t ctx_end_off;
+ int32_t ctx_next_off;
+
+ if (i > 0) {
+ if ((skip = ctx_next_off - ctx_end_off) != 0 &&
+ smb_mbc_decodef(&sr->command, "#.", skip) != 0) {
+ status = NT_STATUS_INVALID_PARAMETER;
+ goto errout;
+ }
+ }
+
+ rc = smb_mbc_decodef(
+ &sr->command, "ww4.",
+ &neg_ctx.type, /* w */
+ &neg_ctx.datalen); /* w */
+ if (rc != 0) {
+ status = NT_STATUS_INVALID_PARAMETER;
+ goto errout;
+ }
+
+ /*
+ * We got something crazy
+ */
+ if (neg_ctx.datalen > NEG_CTX_MAX_DATALEN) {
+ status = NT_STATUS_INVALID_PARAMETER;
+ goto errout;
+ }
+
+ ctx_end_off = sr->command.chain_offset + neg_ctx.datalen;
+ ctx_next_off = P2ROUNDUP(ctx_end_off, 8);
+
+ switch (neg_ctx.type) {
+ case SMB2_PREAUTH_INTEGRITY_CAPS:
+ memcpy(&neg_ctxs->preauth_ctx.neg_ctx, &neg_ctx,
+ sizeof (neg_ctx));
+
+ if (found_preauth_ctx++ != 0) {
+ status = NT_STATUS_INVALID_PARAMETER;
+ continue;
+ }
+
+ rc = smb_mbc_decodef(
+ &sr->command, "ww",
+ &picap->picap_hash_count, /* w */
+ &picap->picap_salt_len); /* w */
+ if (rc != 0 || picap->picap_hash_count >
+ MAX_HASHID_NUM) {
+ status = NT_STATUS_INVALID_PARAMETER;
+ goto errout;
+ }
+
+ /*
+ * Get hash id
+ */
+ rc = smb_mbc_decodef(
+ &sr->command, "#w",
+ picap->picap_hash_count,
+ &picap->picap_hash_id); /* w */
+ if (rc != 0) {
+ status = NT_STATUS_INVALID_PARAMETER;
+ goto errout;
+ }
+
+ /*
+ * Get salt
+ */
+ rc = smb_mbc_decodef(
+ &sr->command, "#c",
+ sizeof (picap->picap_salt),
+		    &picap->picap_salt[0]);	/* c */
+ if (rc != 0) {
+ status = NT_STATUS_INVALID_PARAMETER;
+ goto errout;
+ }
+
+ /*
+ * In SMB 0x311 there should be exactly 1 preauth
+ * negotiate context, and there should be exactly 1
+ * hash value in the list - SHA512.
+ */
+ if (picap->picap_hash_count != 1) {
+ status = NT_STATUS_INVALID_PARAMETER;
+ continue;
+ }
+
+ if (picap->picap_hash_id == SMB3_HASH_SHA512)
+ found_sha512 = B_TRUE;
+ break;
+ case SMB2_ENCRYPTION_CAPS:
+			memcpy(&neg_ctxs->encrypt_ctx.neg_ctx, &neg_ctx,
+			    sizeof (neg_ctx));
+
+ if (found_encrypt_ctx++ != 0) {
+ status = NT_STATUS_INVALID_PARAMETER;
+ continue;
+ }
+
+ rc = smb_mbc_decodef(
+ &sr->command, "w",
+ &encap->encap_cipher_count); /* w */
+ if (rc != 0 || encap->encap_cipher_count >
+ MAX_CIPHER_NUM) {
+ status = NT_STATUS_INVALID_PARAMETER;
+ goto errout;
+ }
+
+ /*
+ * Get cipher list
+ */
+ rc = smb_mbc_decodef(
+ &sr->command, "#w",
+ encap->encap_cipher_count,
+ &encap->encap_cipher_ids[0]); /* w */
+ if (rc != 0) {
+ status = NT_STATUS_INVALID_PARAMETER;
+ goto errout;
+ }
+
+ /*
+ * Select the first enabled cipher.
+ * Client should list more prioritized ciphers first.
+ */
+ for (int k = 0; k < encap->encap_cipher_count; k++) {
+ uint16_t c = encap->encap_cipher_ids[k];
+
+ if (SMB3_CIPHER_ENABLED(c, ciphers)) {
+ s->smb31_enc_cipherid = c;
+ found_cipher = B_TRUE;
+ break;
+ }
+ }
+ break;
+ default:
+ ;
+ }
+ }
+
+ if (status)
+ goto errout;
+
+	/* Require exactly one (mandatory) preauth ctx, at most one enc ctx */
+ if (found_preauth_ctx != 1 || found_encrypt_ctx > 1) {
+ status = NT_STATUS_INVALID_PARAMETER;
+ goto errout;
+ }
+
+ if (!found_sha512) {
+ status = STATUS_PREAUTH_HASH_OVERLAP;
+ goto errout;
+ }
+
+ s->smb31_preauth_hashid = SMB3_HASH_SHA512;
+
+ if (!found_cipher)
+ s->smb31_enc_cipherid = 0;
+
+errout:
+ return (status);
+}
+
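
The ctx_end_off/ctx_next_off bookkeeping above implements the [MS-SMB2] 2.2.3.1 rule that every negotiate context begins on an 8-byte boundary: an 8-byte header (type, datalen, 4 reserved bytes), datalen bytes of data, then padding. A user-space sketch of the same walk, assuming a little-endian host; P2ROUNDUP is restated from sys/sysmacros.h and all other names are illustrative:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	P2ROUNDUP(x, a)	(-(-(x) & -(a)))	/* power-of-2 round up */

static void
walk_neg_ctxs(const uint8_t *buf, size_t len, int count)
{
	size_t off = 0;
	int i;

	for (i = 0; i < count && off + 8 <= len; i++) {
		uint16_t type, datalen;

		memcpy(&type, buf + off, 2);
		memcpy(&datalen, buf + off + 2, 2);
		printf("ctx %d: type=%u datalen=%u at off=%zu\n",
		    i, (unsigned)type, (unsigned)datalen, off);
		/* header + data, then pad to the next 8-byte boundary */
		off = P2ROUNDUP(off + 8 + datalen, 8);
	}
}

int
main(void)
{
	/* two dummy contexts: type 1 with 6 data bytes, type 2 with 2 */
	uint8_t buf[32] = {
		1, 0, 6, 0, 0, 0, 0, 0,  'd', 'a', 't', 'a', '!', '!', 0, 0,
		2, 0, 2, 0, 0, 0, 0, 0,  0xAA, 0xBB,
	};

	walk_neg_ctxs(buf, sizeof (buf), 2);
	return (0);
}
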
+static int
+smb31_encode_neg_ctxs(smb_request_t *sr, smb2_neg_ctxs_t *neg_ctxs)
+{
+ smb_session_t *s = sr->session;
+ smb2_preauth_caps_t *picap = &neg_ctxs->preauth_ctx.preauth_caps;
+ smb2_encrypt_caps_t *encap = &neg_ctxs->encrypt_ctx.encrypt_caps;
+ uint16_t salt_len = sizeof (picap->picap_salt);
+ uint32_t preauth_ctx_len = 6 + salt_len;
+ uint32_t enc_ctx_len = 4;
+ uint32_t neg_ctx_off = NEG_CTX_OFFSET_OFFSET +
+ P2ROUNDUP(sr->sr_cfg->skc_negtok_len, 8);
+ uint32_t rc;
+
+ bzero(neg_ctxs, sizeof (*neg_ctxs));
+
+ if ((rc = smb_mbc_put_align(&sr->reply, 8)) != 0)
+ return (rc);
+
+ ASSERT3S(neg_ctx_off, ==, sr->reply.chain_offset);
+
+ encap->encap_cipher_ids[0] = s->smb31_enc_cipherid;
+ picap->picap_hash_id = s->smb31_preauth_hashid;
+ picap->picap_salt_len = salt_len;
+
+ (void) random_get_pseudo_bytes(picap->picap_salt, salt_len);
+
+ rc = smb_mbc_encodef(
+ &sr->reply, "ww4.",
+ SMB2_PREAUTH_INTEGRITY_CAPS,
+ preauth_ctx_len
+ /* 4. */); /* reserved */
+ if (rc != 0)
+ return (rc);
+
+ rc = smb_mbc_encodef(
+ &sr->reply, "www#c",
+ 1, /* hash algo count */
+ salt_len, /* salt length */
+ s->smb31_preauth_hashid, /* hash id */
+ salt_len, /* salt length */
+ picap->picap_salt);
+
+	if (rc != 0)
+		return (rc);
+
+	/* No common cipher; omit the optional encryption context. */
+	if (s->smb31_enc_cipherid == 0) {
+		cmn_err(CE_NOTE, "Encryption is not supported");
+		return (0);
+	}
+
+ if ((rc = smb_mbc_put_align(&sr->reply, 8)) != 0)
+ return (rc);
+
+ rc = smb_mbc_encodef(
+ &sr->reply, "ww4.",
+ SMB2_ENCRYPTION_CAPS,
+ enc_ctx_len
+	    /* 4. */);			/* reserved */
+	if (rc != 0)
+		return (rc);
+
+ rc = smb_mbc_encodef(
+ &sr->reply, "ww",
+ 1, /* cipher count */
+ s->smb31_enc_cipherid); /* encrypt. cipher id */
+
+ return (rc);
+}
+
int
smb2_newrq_negotiate(smb_request_t *sr)
{
smb_session_t *s = sr->session;
+ smb2_neg_ctxs_t neg_in_ctxs;
+ smb2_neg_ctxs_t neg_out_ctxs;
+ smb2_arg_negotiate_t *nego2 = &sr->sr_nego2;
int rc;
uint32_t status = 0;
uint16_t struct_size;
uint16_t best_version;
- uint16_t version_cnt;
- uint16_t cl_versions[SMB2_NEGOTIATE_MAX_DIALECTS];
+
+ bzero(&neg_in_ctxs, sizeof (neg_in_ctxs));
+ bzero(&neg_out_ctxs, sizeof (neg_out_ctxs));
sr->smb2_cmd_hdr = sr->command.chain_offset;
rc = smb2_decode_header(sr);
@@ -239,7 +594,7 @@ smb2_newrq_negotiate(smb_request_t *sr)
rc = smb_mbc_decodef(
&sr->command, "www..l16c8.",
&struct_size, /* w */
- &version_cnt, /* w */
+ &s->cli_dialect_cnt, /* w */
&s->cli_secmode, /* w */
/* reserved (..) */
&s->capabilities, /* l */
@@ -255,33 +610,16 @@ smb2_newrq_negotiate(smb_request_t *sr)
*
* Be somewhat tolerant while decoding the variable part
* so we can return errors instead of dropping the client.
- * Will limit decoding to the size of cl_versions here,
- * and do the error checks on version_cnt after the
+ * Will limit decoding to the size of cli_dialects here,
+ * and do the error checks on s->cli_dialect_cnt after the
* dtrace start probe.
*/
- if (version_cnt > 0 &&
- version_cnt <= SMB2_NEGOTIATE_MAX_DIALECTS &&
- smb_mbc_decodef(&sr->command, "#w", version_cnt,
- cl_versions) != 0) {
- /* decode error; force an error below */
- version_cnt = 0;
- }
-
- DTRACE_SMB2_START(op__Negotiate, smb_request_t *, sr);
-
- sr->smb2_hdr_flags |= SMB2_FLAGS_SERVER_TO_REDIR;
- (void) smb2_encode_header(sr, B_FALSE);
-
- /*
- * [MS-SMB2] 3.3.5.2.4 Verifying the Signature
- * "If the SMB2 header of the SMB2 NEGOTIATE request has the
- * SMB2_FLAGS_SIGNED bit set in the Flags field, the server
- * MUST fail the request with STATUS_INVALID_PARAMETER."
- */
- if ((sr->smb2_hdr_flags & SMB2_FLAGS_SIGNED) != 0) {
- sr->smb2_hdr_flags &= ~SMB2_FLAGS_SIGNED;
- status = NT_STATUS_INVALID_PARAMETER;
- goto errout;
+ if (s->cli_dialect_cnt > 0 &&
+ s->cli_dialect_cnt <= SMB2_NEGOTIATE_MAX_DIALECTS &&
+ smb_mbc_decodef(&sr->command, "#w", s->cli_dialect_cnt,
+ s->cli_dialects) != 0) {
+ /* decode error; force an error below */
+ s->cli_dialect_cnt = 0;
}
/*
@@ -289,26 +627,53 @@ smb2_newrq_negotiate(smb_request_t *sr)
* "If the DialectCount of the SMB2 NEGOTIATE Request is 0, the
* server MUST fail the request with STATUS_INVALID_PARAMETER."
*/
- if (version_cnt == 0 ||
- version_cnt > SMB2_NEGOTIATE_MAX_DIALECTS) {
+ if (s->cli_dialect_cnt == 0 ||
+ s->cli_dialect_cnt > SMB2_NEGOTIATE_MAX_DIALECTS) {
status = NT_STATUS_INVALID_PARAMETER;
- goto errout;
}
/*
* The client offers an array of protocol versions it
- * supports, which we have decoded into cl_versions[].
+ * supports, which we have decoded into s->cli_dialects[].
* We walk the array and pick the highest supported.
*
* [MS-SMB2] 3.3.5.4 Receiving an SMB2 NEGOTIATE Request
* "If a common dialect is not found, the server MUST fail
* the request with STATUS_NOT_SUPPORTED."
*/
- best_version = smb2_find_best_dialect(s, cl_versions, version_cnt);
- if (best_version == 0) {
- status = NT_STATUS_NOT_SUPPORTED;
+
+ if (status == 0) {
+ best_version = smb2_find_best_dialect(s, s->cli_dialects,
+ s->cli_dialect_cnt);
+ if (best_version >= SMB_VERS_3_11) {
+ status = smb31_decode_neg_ctxs(sr, &neg_in_ctxs);
+ nego2->neg_in_ctxs = &neg_in_ctxs;
+ } else if (best_version == 0) {
+ status = NT_STATUS_NOT_SUPPORTED;
+ }
+ }
+
+ DTRACE_SMB2_START(op__Negotiate, smb_request_t *, sr);
+ nego2->neg_in_ctxs = NULL;
+
+ sr->smb2_hdr_flags |= SMB2_FLAGS_SERVER_TO_REDIR;
+ (void) smb2_encode_header(sr, B_FALSE);
+
+ if (status != 0)
+ goto errout;
+
+ /*
+ * [MS-SMB2] 3.3.5.2.4 Verifying the Signature
+ * "If the SMB2 header of the SMB2 NEGOTIATE request has the
+ * SMB2_FLAGS_SIGNED bit set in the Flags field, the server
+ * MUST fail the request with STATUS_INVALID_PARAMETER."
+ */
+ if ((sr->smb2_hdr_flags & SMB2_FLAGS_SIGNED) != 0) {
+ sr->smb2_hdr_flags &= ~SMB2_FLAGS_SIGNED;
+ status = NT_STATUS_INVALID_PARAMETER;
goto errout;
}
+
s->dialect = best_version;
/* Allow normal SMB2 requests now. */
@@ -318,14 +683,30 @@ smb2_newrq_negotiate(smb_request_t *sr)
if (smb2_negotiate_common(sr, best_version) != 0)
status = NT_STATUS_INTERNAL_ERROR;
+ if (s->dialect >= SMB_VERS_3_11 && status == 0) {
+ if (smb31_encode_neg_ctxs(sr, &neg_out_ctxs) != 0)
+ status = NT_STATUS_INTERNAL_ERROR;
+ nego2->neg_out_ctxs = &neg_out_ctxs;
+ }
+
errout:
sr->smb2_status = status;
DTRACE_SMB2_DONE(op__Negotiate, smb_request_t *, sr);
+ nego2->neg_out_ctxs = NULL;
if (sr->smb2_status != 0)
smb2sr_put_error(sr, sr->smb2_status);
(void) smb2_encode_header(sr, B_TRUE);
+ if (s->dialect >= SMB_VERS_3_11 && sr->smb2_status == 0) {
+ ASSERT3U(s->smb31_preauth_hashid, !=, 0);
+ if (smb31_preauth_sha512_calc(sr, &sr->reply,
+ s->smb31_preauth_hashval,
+ s->smb31_preauth_hashval) != 0)
+ cmn_err(CE_WARN, "(1) Preauth hash calculation "
+ "failed");
+ }
+
smb2_send_reply(sr);
return (rc);
@@ -347,6 +728,8 @@ smb2_negotiate_common(smb_request_t *sr, uint16_t version)
int rc;
uint32_t max_rwsize;
uint16_t secmode;
+ uint16_t neg_ctx_cnt = 0;
+ uint32_t neg_ctx_off = 0;
/*
* Negotiation itself. First the Security Mode.
@@ -379,6 +762,8 @@ smb2_negotiate_common(smb_request_t *sr, uint16_t version)
*/
if (version != 0x2FF)
smb2_sign_init_mech(s);
+ if (version >= 0x311)
+ smb31_preauth_init_mech(s);
/*
* [MS-SMB2] 3.3.5.4 Receiving an SMB2 NEGOTIATE Request
@@ -404,6 +789,21 @@ smb2_negotiate_common(smb_request_t *sr, uint16_t version)
if ((s->srv_cap & SMB2_CAP_ENCRYPTION) != 0 &&
smb3_encrypt_init_mech(s) != 0) {
s->srv_cap &= ~SMB2_CAP_ENCRYPTION;
+ s->smb31_enc_cipherid = 0;
+ }
+
+ if (s->dialect >= SMB_VERS_3_11) {
+ neg_ctx_cnt = s->smb31_enc_cipherid == 0 ? 1 : 2;
+ neg_ctx_off = NEG_CTX_OFFSET_OFFSET +
+ P2ROUNDUP(sr->sr_cfg->skc_negtok_len, 8);
+
+ ASSERT3U(s->smb31_preauth_hashid, !=, 0);
+
+ if (smb31_preauth_sha512_calc(sr, &sr->command,
+ s->smb31_preauth_hashval,
+ s->smb31_preauth_hashval) != 0)
+ cmn_err(CE_WARN, "(0) Preauth hash calculation "
+ "failed");
}
}
@@ -421,7 +821,7 @@ smb2_negotiate_common(smb_request_t *sr, uint16_t version)
65, /* StructSize */ /* w */
s->srv_secmode, /* w */
version, /* w */
- 0, /* reserved */ /* w */
+ neg_ctx_cnt, /* w */
UUID_LEN, /* # */
&s->s_cfg.skc_machine_uuid, /* c */
s->srv_cap, /* l */
@@ -432,10 +832,12 @@ smb2_negotiate_common(smb_request_t *sr, uint16_t version)
&boot_tv, /* T */
128, /* SecBufOff */ /* w */
sr->sr_cfg->skc_negtok_len, /* w */
- 0, /* reserved */ /* l */
+ neg_ctx_off, /* l */
sr->sr_cfg->skc_negtok_len, /* # */
sr->sr_cfg->skc_negtok); /* c */
+
/* smb2_send_reply(sr); in caller */
(void) ksocket_setsockopt(s->sock, SOL_SOCKET,
@@ -467,12 +869,12 @@ uint32_t
smb2_nego_validate(smb_request_t *sr, smb_fsctl_t *fsctl)
{
smb_session_t *s = sr->session;
+ boolean_t smb311 = s->s_cfg.skc_max_protocol >= SMB_VERS_3_11;
int rc;
/*
* The spec. says to parse the VALIDATE_NEGOTIATE_INFO here
* and verify that the original negotiate was not modified.
- * The request MUST be signed, and we MUST validate the signature.
*
* One interesting requirement here is that we MUST reply
* with exactly the same information as we returned in our
@@ -480,12 +882,22 @@ smb2_nego_validate(smb_request_t *sr, smb_fsctl_t *fsctl)
* If we don't the client closes the connection.
*/
- /* dialects[8] taken from cl_versions[8] in smb2_newrq_negotiate */
uint32_t capabilities;
- uint16_t secmode, num_dialects, dialects[8];
+ uint16_t secmode;
+ uint16_t num_dialects;
+ uint16_t dialects[SMB2_NEGOTIATE_MAX_DIALECTS];
uint8_t clnt_guid[16];
- if ((sr->smb2_hdr_flags & SMB2_FLAGS_SIGNED) == 0)
+ if (s->dialect >= SMB_VERS_3_11)
+ goto drop;
+
+ /*
+ * [MS-SMB2] 3.3.5.2.4 Verifying the Signature
+ *
+ * If the dialect is SMB3 and the message was successfully
+ * decrypted we MUST skip processing of the signature.
+ */
+ if (!sr->encrypted && (sr->smb2_hdr_flags & SMB2_FLAGS_SIGNED) == 0)
goto drop;
if (fsctl->InputCount < 24)
@@ -497,7 +909,9 @@ smb2_nego_validate(smb_request_t *sr, smb_fsctl_t *fsctl)
&secmode, /* w */
&num_dialects); /* w */
- if (num_dialects == 0 || num_dialects > 8)
+ if (num_dialects == 0 || num_dialects > SMB2_NEGOTIATE_MAX_DIALECTS)
+ goto drop;
+ if (smb311 && num_dialects != s->cli_dialect_cnt)
goto drop;
if (secmode != s->cli_secmode)
goto drop;
@@ -513,8 +927,16 @@ smb2_nego_validate(smb_request_t *sr, smb_fsctl_t *fsctl)
if (rc != 0)
goto drop;
- if (smb2_find_best_dialect(s, dialects, num_dialects) != s->dialect)
- goto drop;
+ if (smb311) {
+ for (int i = 0; i < num_dialects; i++) {
+ if (dialects[i] != s->cli_dialects[i])
+ goto drop;
+ }
+ } else {
+ if (smb2_find_best_dialect(s, dialects, num_dialects) !=
+ s->dialect)
+ goto drop;
+ }
rc = smb_mbc_encodef(
fsctl->out_mbc, "l#cww",
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_oplock.c b/usr/src/uts/common/fs/smbsrv/smb2_oplock.c
index 84bd8ccafb..f3f96c2b21 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_oplock.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_oplock.c
@@ -10,7 +10,8 @@
*/
/*
- * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2019 Nexenta by DDN, Inc. All rights reserved.
+ * Copyright 2019 RackTop Systems.
*/
/*
@@ -96,13 +97,35 @@ smb2_oplock_break_ack(smb_request_t *sr)
NewLevel = OPLOCK_LEVEL_BATCH;
break;
case SMB2_OPLOCK_LEVEL_LEASE: /* 0xFF */
- default:
NewLevel = OPLOCK_LEVEL_NONE;
break;
+ default:
+ status = NT_STATUS_INVALID_PARAMETER;
+ goto errout;
}
ofile = sr->fid_ofile;
+ if (ofile->f_oplock.og_breaking == 0) {
+ /*
+ * This is an unsolicited Ack. (There is no
+ * outstanding oplock break in progress now.)
+ * There are WPTS tests that care which error
+ * is returned. See [MS-SMB2] 3.3.5.22.1
+ */
+ if (smbOplockLevel == SMB2_OPLOCK_LEVEL_LEASE) {
+ status = NT_STATUS_INVALID_PARAMETER;
+ goto errout;
+ }
+ if (NewLevel >= (ofile->f_oplock.og_state &
+ OPLOCK_LEVEL_TYPE_MASK)) {
+ status = NT_STATUS_INVALID_OPLOCK_PROTOCOL;
+ goto errout;
+ }
+ status = NT_STATUS_INVALID_DEVICE_STATE;
+ goto errout;
+ }
ofile->f_oplock.og_breaking = 0;
+
status = smb_oplock_ack_break(sr, ofile, &NewLevel);
if (status == NT_STATUS_OPLOCK_BREAK_IN_PROGRESS) {
status = smb2sr_go_async(sr);
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_qinfo_file.c b/usr/src/uts/common/fs/smbsrv/smb2_qinfo_file.c
index ab682b7966..929f02522b 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_qinfo_file.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_qinfo_file.c
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2019 Nexenta by DDN, Inc. All rights reserved.
*/
/*
@@ -240,10 +240,11 @@ static uint32_t
smb2_qif_basic(smb_request_t *sr, smb_queryinfo_t *qi)
{
smb_attr_t *sa = &qi->qi_attr;
+ int rc;
ASSERT((sa->sa_mask & SMB_AT_BASIC) == SMB_AT_BASIC);
- (void) smb_mbc_encodef(
+ rc = smb_mbc_encodef(
&sr->raw_data, "TTTTll",
&sa->sa_crtime, /* T */
&sa->sa_vattr.va_atime, /* T */
@@ -251,6 +252,8 @@ smb2_qif_basic(smb_request_t *sr, smb_queryinfo_t *qi)
&sa->sa_vattr.va_ctime, /* T */
sa->sa_dosattr, /* l */
0); /* reserved */ /* l */
+ if (rc != 0)
+ return (NT_STATUS_BUFFER_OVERFLOW);
return (0);
}
@@ -265,10 +268,11 @@ static uint32_t
smb2_qif_standard(smb_request_t *sr, smb_queryinfo_t *qi)
{
smb_attr_t *sa = &qi->qi_attr;
+ int rc;
ASSERT((sa->sa_mask & SMB_AT_STANDARD) == SMB_AT_STANDARD);
- (void) smb_mbc_encodef(
+ rc = smb_mbc_encodef(
&sr->raw_data, "qqlbbw",
sa->sa_allocsz, /* q */
sa->sa_vattr.va_size, /* q */
@@ -276,6 +280,8 @@ smb2_qif_standard(smb_request_t *sr, smb_queryinfo_t *qi)
qi->qi_delete_on_close, /* b */
qi->qi_isdir, /* b */
0); /* reserved */ /* w */
+ if (rc != 0)
+ return (NT_STATUS_BUFFER_OVERFLOW);
return (0);
}
@@ -290,6 +296,7 @@ smb2_qif_internal(smb_request_t *sr, smb_queryinfo_t *qi)
{
smb_attr_t *sa = &qi->qi_attr;
u_longlong_t nodeid;
+ int rc;
ASSERT((sa->sa_mask & SMB_AT_NODEID) == SMB_AT_NODEID);
nodeid = sa->sa_vattr.va_nodeid;
@@ -298,9 +305,11 @@ smb2_qif_internal(smb_request_t *sr, smb_queryinfo_t *qi)
(sr->session->s_flags & SMB_SSN_AAPL_CCEXT) != 0)
nodeid = 0;
- (void) smb_mbc_encodef(
+ rc = smb_mbc_encodef(
&sr->raw_data, "q",
nodeid); /* q */
+ if (rc != 0)
+ return (NT_STATUS_BUFFER_OVERFLOW);
return (0);
}
@@ -315,9 +324,12 @@ static uint32_t
smb2_qif_ea_size(smb_request_t *sr, smb_queryinfo_t *qi)
{
_NOTE(ARGUNUSED(qi))
+ int rc;
- (void) smb_mbc_encodef(
+ rc = smb_mbc_encodef(
&sr->raw_data, "l", 0);
+ if (rc != 0)
+ return (NT_STATUS_BUFFER_OVERFLOW);
return (0);
}
@@ -341,10 +353,13 @@ smb2_qif_access(smb_request_t *sr, smb_queryinfo_t *qi)
{
_NOTE(ARGUNUSED(qi))
smb_ofile_t *of = sr->fid_ofile;
+ int rc;
- (void) smb_mbc_encodef(
+ rc = smb_mbc_encodef(
&sr->raw_data, "l",
of->f_granted_access);
+ if (rc != 0)
+ return (NT_STATUS_BUFFER_OVERFLOW);
return (0);
}
@@ -358,14 +373,17 @@ smb2_qif_access(smb_request_t *sr, smb_queryinfo_t *qi)
static uint32_t
smb2_qif_name(smb_request_t *sr, smb_queryinfo_t *qi)
{
+ int rc;
ASSERT(qi->qi_namelen > 0);
- (void) smb_mbc_encodef(
+ rc = smb_mbc_encodef(
&sr->raw_data, "llU",
0, /* FileIndex (l) */
qi->qi_namelen, /* l */
qi->qi_name); /* U */
+ if (rc != 0)
+ return (NT_STATUS_BUFFER_OVERFLOW);
return (0);
}
@@ -379,13 +397,16 @@ smb2_qif_position(smb_request_t *sr, smb_queryinfo_t *qi)
_NOTE(ARGUNUSED(qi))
smb_ofile_t *of = sr->fid_ofile;
uint64_t pos;
+ int rc;
mutex_enter(&of->f_mutex);
pos = of->f_seek_pos;
mutex_exit(&of->f_mutex);
- (void) smb_mbc_encodef(
+ rc = smb_mbc_encodef(
&sr->raw_data, "q", pos);
+ if (rc != 0)
+ return (NT_STATUS_BUFFER_OVERFLOW);
return (0);
}
@@ -399,9 +420,12 @@ static uint32_t
smb2_qif_mode(smb_request_t *sr, smb_queryinfo_t *qi)
{
_NOTE(ARGUNUSED(qi))
+ int rc;
- (void) smb_mbc_encodef(
+ rc = smb_mbc_encodef(
&sr->raw_data, "l", 0);
+ if (rc != 0)
+ return (NT_STATUS_BUFFER_OVERFLOW);
return (0);
}
@@ -413,9 +437,12 @@ static uint32_t
smb2_qif_alignment(smb_request_t *sr, smb_queryinfo_t *qi)
{
_NOTE(ARGUNUSED(qi))
+ int rc;
- (void) smb_mbc_encodef(
+ rc = smb_mbc_encodef(
&sr->raw_data, "l", 0);
+ if (rc != 0)
+ return (NT_STATUS_BUFFER_OVERFLOW);
return (0);
}
@@ -430,6 +457,7 @@ static uint32_t
smb2_qif_altname(smb_request_t *sr, smb_queryinfo_t *qi)
{
smb_ofile_t *of = sr->fid_ofile;
+ int rc;
ASSERT(qi->qi_namelen > 0);
ASSERT(qi->qi_attr.sa_mask & SMB_AT_NODEID);
@@ -442,10 +470,12 @@ smb2_qif_altname(smb_request_t *sr, smb_queryinfo_t *qi)
/* fill in qi->qi_shortname */
smb_query_shortname(of->f_node, qi);
- (void) smb_mbc_encodef(
+ rc = smb_mbc_encodef(
&sr->raw_data, "%lU", sr,
smb_wcequiv_strlen(qi->qi_shortname),
qi->qi_shortname);
+ if (rc != 0)
+ return (NT_STATUS_BUFFER_OVERFLOW);
return (0);
}
@@ -481,6 +511,7 @@ smb2_qif_pipe(smb_request_t *sr, smb_queryinfo_t *qi)
smb_ofile_t *of = sr->fid_ofile;
uint32_t pipe_mode;
uint32_t nonblock;
+ int rc;
switch (of->f_ftype) {
case SMB_FTYPE_BYTE_PIPE:
@@ -496,9 +527,11 @@ smb2_qif_pipe(smb_request_t *sr, smb_queryinfo_t *qi)
}
nonblock = 0; /* XXX todo: Get this from the pipe handle. */
- (void) smb_mbc_encodef(
+ rc = smb_mbc_encodef(
&sr->raw_data, "ll",
pipe_mode, nonblock);
+ if (rc != 0)
+ return (NT_STATUS_BUFFER_OVERFLOW);
return (0);
}
@@ -532,13 +565,16 @@ smb2_qif_compr(smb_request_t *sr, smb_queryinfo_t *qi)
{
smb_attr_t *sa = &qi->qi_attr;
uint16_t CompressionFormat = 0; /* COMPRESSION_FORMAT_NONE */
+ int rc;
ASSERT(sa->sa_mask & SMB_AT_SIZE);
- (void) smb_mbc_encodef(
+ rc = smb_mbc_encodef(
&sr->raw_data, "qw6.",
sa->sa_vattr.va_size, /* q */
CompressionFormat); /* w */
+ if (rc != 0)
+ return (NT_STATUS_BUFFER_OVERFLOW);
return (0);
}
@@ -550,8 +586,9 @@ static uint32_t
smb2_qif_opens(smb_request_t *sr, smb_queryinfo_t *qi)
{
smb_attr_t *sa = &qi->qi_attr;
+ int rc;
- (void) smb_mbc_encodef(
+ rc = smb_mbc_encodef(
&sr->raw_data, "TTTTqqll",
&sa->sa_crtime, /* T */
&sa->sa_vattr.va_atime, /* T */
@@ -561,6 +598,8 @@ smb2_qif_opens(smb_request_t *sr, smb_queryinfo_t *qi)
sa->sa_vattr.va_size, /* q */
sa->sa_dosattr, /* l */
0); /* reserved */ /* l */
+ if (rc != 0)
+ return (NT_STATUS_BUFFER_OVERFLOW);
return (0);
}
@@ -578,8 +617,12 @@ static uint32_t
smb2_qif_tags(smb_request_t *sr, smb_queryinfo_t *qi)
{
_NOTE(ARGUNUSED(qi))
- (void) smb_mbc_encodef(
+ int rc;
+
+ rc = smb_mbc_encodef(
&sr->raw_data, "ll", 0, 0);
+ if (rc != 0)
+ return (NT_STATUS_BUFFER_OVERFLOW);
return (0);
}
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_qinfo_fs.c b/usr/src/uts/common/fs/smbsrv/smb2_qinfo_fs.c
index 856a59e939..7bf3d1339e 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_qinfo_fs.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_qinfo_fs.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2019 Nexenta by DDN, Inc. All rights reserved.
*/
/*
@@ -100,6 +100,7 @@ smb2_qfs_volume(smb_request_t *sr)
smb_node_t *snode;
fsid_t fsid;
uint32_t LabelLength;
+ int rc;
if (!STYPE_ISDSK(tree->t_res_type))
return (NT_STATUS_INVALID_PARAMETER);
@@ -112,14 +113,16 @@ smb2_qfs_volume(smb_request_t *sr)
/*
* NT has the "supports objects" flag set to 1.
*/
- (void) smb_mbc_encodef(
- &sr->raw_data, "qllb.U",
- 0LL, /* Volume creation time (q) */
+ rc = smb_mbc_encodef(
+ &sr->raw_data, "Tllb.U",
+ &tree->t_create_time, /* (T) */
fsid.val[0], /* serial no. (l) */
LabelLength, /* (l) */
0, /* Supports objects (b) */
/* reserved (.) */
tree->t_volume); /* (U) */
+ if (rc != 0)
+ return (NT_STATUS_BUFFER_OVERFLOW);
return (0);
}
@@ -141,12 +144,14 @@ smb2_qfs_size(smb_request_t *sr)
if (rc)
return (smb_errno2status(rc));
- (void) smb_mbc_encodef(
+ rc = smb_mbc_encodef(
&sr->raw_data, "qqll",
fssize.fs_caller_units,
fssize.fs_caller_avail,
fssize.fs_sectors_per_unit,
fssize.fs_bytes_per_sector);
+ if (rc != 0)
+ return (NT_STATUS_BUFFER_OVERFLOW);
return (0);
}
@@ -168,13 +173,15 @@ smb2_qfs_fullsize(smb_request_t *sr)
if (rc)
return (smb_errno2status(rc));
- (void) smb_mbc_encodef(
+ rc = smb_mbc_encodef(
&sr->raw_data, "qqqll",
fssize.fs_caller_units,
fssize.fs_caller_avail,
fssize.fs_volume_avail,
fssize.fs_sectors_per_unit,
fssize.fs_bytes_per_sector);
+ if (rc != 0)
+ return (NT_STATUS_BUFFER_OVERFLOW);
return (0);
}
@@ -188,6 +195,7 @@ smb2_qfs_device(smb_request_t *sr)
smb_tree_t *tree = sr->tid_tree;
uint32_t DeviceType;
uint32_t Characteristics;
+ int rc;
if (!STYPE_ISDSK(tree->t_res_type))
return (NT_STATUS_INVALID_PARAMETER);
@@ -195,10 +203,12 @@ smb2_qfs_device(smb_request_t *sr)
DeviceType = FILE_DEVICE_DISK;
Characteristics = FILE_DEVICE_IS_MOUNTED;
- (void) smb_mbc_encodef(
+ rc = smb_mbc_encodef(
&sr->raw_data, "ll",
DeviceType,
Characteristics);
+ if (rc != 0)
+ return (NT_STATUS_BUFFER_OVERFLOW);
return (0);
}
@@ -213,6 +223,7 @@ smb2_qfs_attr(smb_request_t *sr)
char *fsname;
uint32_t namelen;
uint32_t FsAttr;
+ int rc;
/* This call is OK on all tree types. */
switch (tree->t_res_type & STYPE_MASK) {
@@ -247,12 +258,14 @@ smb2_qfs_attr(smb_request_t *sr)
if (tree->t_flags & SMB_TREE_SPARSE)
FsAttr |= FILE_SUPPORTS_SPARSE_FILES;
- (void) smb_mbc_encodef(
+ rc = smb_mbc_encodef(
&sr->raw_data, "lllU",
FsAttr,
MAXNAMELEN-1,
namelen,
fsname);
+ if (rc != 0)
+ return (NT_STATUS_BUFFER_OVERFLOW);
return (0);
}
@@ -264,6 +277,7 @@ uint32_t
smb2_qfs_control(smb_request_t *sr)
{
smb_tree_t *tree = sr->tid_tree;
+ int rc;
if (!STYPE_ISDSK(tree->t_res_type))
return (NT_STATUS_INVALID_PARAMETER);
@@ -275,7 +289,7 @@ smb2_qfs_control(smb_request_t *sr)
return (NT_STATUS_VOLUME_NOT_UPGRADED);
}
- (void) smb_mbc_encodef(
+ rc = smb_mbc_encodef(
&sr->raw_data, "qqqqqll",
0, /* free space start filtering - MUST be 0 */
0, /* free space threshold - MUST be 0 */
@@ -284,6 +298,8 @@ smb2_qfs_control(smb_request_t *sr)
SMB_QUOTA_UNLIMITED, /* default quota limit */
FILE_VC_QUOTA_ENFORCE, /* fs control flag */
0); /* pad bytes */
+ if (rc != 0)
+ return (NT_STATUS_BUFFER_OVERFLOW);
return (0);
}
@@ -364,7 +380,7 @@ smb2_qfs_sectorsize(smb_request_t *sr)
smb_fssize_t fssize;
smb_tree_t *tree = sr->tid_tree;
uint32_t lbps, pbps;
- uint32_t flags;
+ uint32_t flags, unk;
int rc;
if (!STYPE_ISDSK(tree->t_res_type))
@@ -373,24 +389,15 @@ smb2_qfs_sectorsize(smb_request_t *sr)
rc = smb_fssize(sr, &fssize);
if (rc)
return (smb_errno2status(rc));
+
+ // PhysicalBytesPerSector
pbps = fssize.fs_bytes_per_sector;
+
+ // LogicalBytesPerSector
lbps = fssize.fs_sectors_per_unit * pbps;
if (lbps > smb2_max_logical_sector_size)
lbps = smb2_max_logical_sector_size;
- // LogicalBytesPerSector
- (void) smb_mbc_encodef(&sr->raw_data, "l", lbps);
-
- // PhysicalBytesPerSectorForAtomicity
- (void) smb_mbc_encodef(&sr->raw_data, "l", pbps);
-
- // PhysicalBytesPerSectorForPerformance
- // Using logical size here.
- (void) smb_mbc_encodef(&sr->raw_data, "l", lbps);
-
- // FileSystemEffectivePhysicalBytesPerSectorForAtomicity
- (void) smb_mbc_encodef(&sr->raw_data, "l", pbps);
-
// Flags
// We include "no seek penalty" because our files are
// always ZFS-backed, which can reorder things on disk.
@@ -398,15 +405,24 @@ smb2_qfs_sectorsize(smb_request_t *sr)
flags = SSINFO_FLAGS_ALIGNED_DEVICE |
SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE |
SSINFO_FLAGS_NO_SEEK_PENALTY;
- (void) smb_mbc_encodef(&sr->raw_data, "l", flags);
// ByteOffsetForSectorAlignment
// ByteOffsetForPartitionAlignment
// Just say "unknown" for these two.
- (void) smb_mbc_encodef(
- &sr->raw_data, "l",
- SSINFO_OFFSET_UNKNOWN,
- SSINFO_OFFSET_UNKNOWN);
+ unk = SSINFO_OFFSET_UNKNOWN;
+
+ rc = smb_mbc_encodef(
+ &sr->raw_data,
+ "lllllll",
+ lbps, // LogicalBytesPerSector
+ pbps, // PhysicalBytesPerSectorForAtomicity
+ lbps, // PhysicalBytesPerSectorForPerformance
+ pbps, // FileSystemEffectivePhysicalBytesPerSectorForAtomicity
+ flags,
+ unk, unk);
+
+ if (rc != 0)
+ return (NT_STATUS_BUFFER_OVERFLOW);
return (0);
}
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_query_info.c b/usr/src/uts/common/fs/smbsrv/smb2_query_info.c
index dc59307fc3..61c6cbb97d 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_query_info.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_query_info.c
@@ -14,6 +14,10 @@
*/
/*
+ * Copyright 2020 RackTop Systems, Inc.
+ */
+
+/*
* Dispatch function for SMB2_QUERY_INFO
*/
@@ -107,16 +111,42 @@ errout:
/* Not really an error, per se. Advisory. */
break;
- case NT_STATUS_BUFFER_TOO_SMALL:
- case NT_STATUS_INFO_LENGTH_MISMATCH:
+ case NT_STATUS_BUFFER_TOO_SMALL: /* only in smb2_qinfo_sec.c */
/*
- * These are special, per. [MS-SMB2] 3.2.5.17
- * The error data is a 4-byte count of the size
- * required to successfully query the data.
- * That error data is built by the functions
- * that returns one of these errors.
+ * [MS-SMB2] 3.3.5.20.3
+ * Handling SMB2_0_INFO_SECURITY
+		 * For dialect 3.1.1 we must return a 4-byte value
+		 * containing the required buffer size.
+ * ByteCount==12, ErrorContextCount==1,
+ * ErrorData: ErrorDataLength==4,ErrorId==0
+ * ErrorContextData==<buffer size>
+ * Otherwise ByteCount==4
+ *
+		 * When returning with data, 3.1.1 encapsulates it.
*/
- smb2sr_put_error_data(sr, status, &sr->raw_data);
+ if (sr->session->dialect < SMB_VERS_3_11) {
+ smb2sr_put_error_data(sr, status, &sr->raw_data);
+ } else {
+ smb2sr_put_error_ctx0(sr, status, &sr->raw_data);
+ }
+ return (SDRC_SUCCESS);
+
+	case NT_STATUS_INFO_LENGTH_MISMATCH: /* not returned by smb2_qinfo_*.c */
+ /*
+ * [MS-SMB2] 3.3.5.20.1
+ * SMB 3.1.1 Handling SMB2_0_INFO_FILE
+ * [MS-SMB2] 3.3.5.20.2
+ * SMB 3.1.1 Handling SMB2_0_INFO_FILESYSTEM
+ *
+ * ByteCount==8, ErrorContextCount==1,
+ * ErrorData: ErrorDataLength==0,ErrorId==0
+ * Otherwise ByteCount==0
+ */
+ if (sr->session->dialect < SMB_VERS_3_11) {
+ smb2sr_put_error_data(sr, status, NULL);
+ } else {
+ smb2sr_put_error_ctx0(sr, status, NULL);
+ }
return (SDRC_SUCCESS);
default:
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_read.c b/usr/src/uts/common/fs/smbsrv/smb2_read.c
index f8c91c878f..936fa06f6c 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_read.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_read.c
@@ -10,11 +10,12 @@
*/
/*
- * Copyright 2019 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2020 Tintri by DDN, Inc. All rights reserved.
*/
/*
* Dispatch function for SMB2_READ
+ * MS-SMB2 sec. 3.3.5.12
*/
#include <smbsrv/smb2_kproto.h>
@@ -160,6 +161,14 @@ smb2_read(smb_request_t *sr)
MBC_ATTACH_MBUF(&sr->raw_data, m);
/*
+ * [MS-SMB2] If the read returns fewer bytes than specified by
+ * the MinimumCount field of the request, the server MUST fail
+ * the request with STATUS_END_OF_FILE
+ */
+ if (status == 0 && XferCount < MinCount)
+ status = NT_STATUS_END_OF_FILE;
+
+ /*
* Checking the error return _after_ dealing with
* the returned data so that if m was allocated,
* it will be free'd via sr->raw_data cleanup.
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_session_setup.c b/usr/src/uts/common/fs/smbsrv/smb2_session_setup.c
index 0a258f1bf4..9be9630d57 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_session_setup.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_session_setup.c
@@ -11,6 +11,7 @@
/*
* Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2020 RackTop Systems, Inc.
*/
/*
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_signing.c b/usr/src/uts/common/fs/smbsrv/smb2_signing.c
index 704dfc652a..fd4c4ecfb4 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_signing.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_signing.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2018 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2020 RackTop Systems, Inc.
*/
/*
* These routines provide the SMB MAC signing for the SMB2 server.
@@ -118,16 +119,6 @@ smb3_sign_calc(smb_request_t *sr,
return (rv);
}
-/*
- * Input to KDF for SigningKey.
- * See comment for smb3_do_kdf for content.
- */
-static uint8_t sign_kdf_input[29] = {
- 0, 0, 0, 1, 'S', 'M', 'B', '2',
- 'A', 'E', 'S', 'C', 'M', 'A', 'C', 0,
- 0, 'S', 'm', 'b', 'S', 'i', 'g', 'n',
- 0, 0, 0, 0, 0x80 };
-
void
smb2_sign_init_mech(smb_session_t *s)
{
@@ -196,10 +187,21 @@ smb2_sign_begin(smb_request_t *sr, smb_token_t *token)
* For SMB3, the signing key is a "KDF" hash of the
* session key.
*/
- if (smb3_do_kdf(sign_key->key, sign_kdf_input,
- sizeof (sign_kdf_input), token->tkn_ssnkey.val,
- token->tkn_ssnkey.len) != 0)
- return;
+ if (s->dialect >= SMB_VERS_3_11) {
+ if (smb3_kdf(sign_key->key,
+ token->tkn_ssnkey.val, token->tkn_ssnkey.len,
+ (uint8_t *)"SMBSigningKey", 14,
+ u->u_preauth_hashval, SHA512_DIGEST_LENGTH)
+ != 0)
+ return;
+ } else {
+ if (smb3_kdf(sign_key->key,
+ token->tkn_ssnkey.val, token->tkn_ssnkey.len,
+ (uint8_t *)"SMB2AESCMAC", 12,
+ (uint8_t *)"SmbSign", 8)
+ != 0)
+ return;
+ }
sign_key->len = SMB3_KEYLEN;
} else {
/*
@@ -430,63 +432,3 @@ smb2_sign_reply(smb_request_t *sr)
(void) smb_mbc_poke(&sr->reply, hdr_off, "#c",
SMB2_SIG_SIZE, reply_sig);
}
-
-/*
- * Derive SMB3 key as described in [MS-SMB2] 3.1.4.2
- * and [NIST SP800-108]
- *
- * r = 32, L = 128, PRF = HMAC-SHA256, key = (session key)
- *
- * Note that these describe pre-3.1.1 inputs.
- *
- * Session.SigningKey for binding a session:
- * - Session.SessionKey as K1
- * - label = SMB2AESCMAC (size 12)
- * - context = SmbSign (size 8)
- * Channel.SigningKey for for all other requests
- * - if SMB2_SESSION_FLAG_BINDING, GSS key (in Session.SessionKey?) as K1;
- * - otherwise, Session.SessionKey as K1
- * - label = SMB2AESCMAC (size 12)
- * - context = SmbSign (size 8)
- * Session.ApplicationKey for ... (not sure what yet)
- * - Session.SessionKey as K1
- * - label = SMB2APP (size 8)
- * - context = SmbRpc (size 7)
- * Session.EncryptionKey for encrypting server messages
- * - Session.SessionKey as K1
- * - label = "SMB2AESCCM" (size 11)
- * - context = "ServerOut" (size 10)
- * Session.DecryptionKey for decrypting client requests
- * - Session.SessionKey as K1
- * - label = "SMB2AESCCM" (size 11)
- * - context = "ServerIn " (size 10) (Note the space)
- */
-
-int
-smb3_do_kdf(void *outbuf, void *input, size_t input_len,
- uint8_t *key, uint32_t key_len)
-{
- uint8_t digest32[SHA256_DIGEST_LENGTH];
- smb_crypto_mech_t mech;
- smb_sign_ctx_t hctx = 0;
- int rc;
-
- bzero(&mech, sizeof (mech));
- if ((rc = smb2_hmac_getmech(&mech)) != 0)
- return (rc);
-
- /* Limit the SessionKey input to its maximum size (16 bytes) */
- rc = smb2_hmac_init(&hctx, &mech, key, MIN(key_len, SMB2_KEYLEN));
- if (rc != 0)
- return (rc);
-
- if ((rc = smb2_hmac_update(hctx, input, input_len)) != 0)
- return (rc);
-
- if ((rc = smb2_hmac_final(hctx, digest32)) != 0)
- return (rc);
-
- /* Output is first 16 bytes of digest. */
- bcopy(digest32, outbuf, SMB3_KEYLEN);
- return (0);
-}
diff --git a/usr/src/uts/common/fs/smbsrv/smb2_write.c b/usr/src/uts/common/fs/smbsrv/smb2_write.c
index 776ea24ae1..8f10f67d49 100644
--- a/usr/src/uts/common/fs/smbsrv/smb2_write.c
+++ b/usr/src/uts/common/fs/smbsrv/smb2_write.c
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2019 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2020 Tintri by DDN, Inc. All rights reserved.
*/
/*
@@ -148,7 +148,6 @@ smb2_write(smb_request_t *sr)
&vdb->vdb_uio, &XferCount, stability);
if (rc)
break;
- of->f_written = B_TRUE;
/* This revokes read cache delegations. */
(void) smb_oplock_break_WRITE(of->f_node, of);
break;
diff --git a/usr/src/uts/common/fs/smbsrv/smb31_preauth.c b/usr/src/uts/common/fs/smbsrv/smb31_preauth.c
new file mode 100644
index 0000000000..35455b9784
--- /dev/null
+++ b/usr/src/uts/common/fs/smbsrv/smb31_preauth.c
@@ -0,0 +1,171 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2020 RackTop Systems, Inc.
+ */
+
+#include <smbsrv/smb2_kproto.h>
+#include <smbsrv/smb2.h>
+#include <sys/crypto/api.h>
+#include <smbsrv/smb_kproto.h>
+#include <smbsrv/smb_kcrypt.h>
+
+/*
+ * SMB 3.1.1 Preauth Integrity
+ */
+int
+smb3_sha512_getmech(smb_crypto_mech_t *mech)
+{
+ crypto_mech_type_t t;
+
+ t = crypto_mech2id(SUN_CKM_SHA512);
+ if (t == CRYPTO_MECH_INVALID) {
+ cmn_err(CE_NOTE, "smb: no kcf mech: %s", SUN_CKM_SHA512);
+ return (-1);
+ }
+ mech->cm_type = t;
+ return (0);
+}
+
+/*
+ * (called from smb2_negotiate_common)
+ */
+void
+smb31_preauth_init_mech(smb_session_t *s)
+{
+ smb_crypto_mech_t *mech;
+ int rc;
+
+ ASSERT3S(s->dialect, >=, SMB_VERS_3_11);
+
+ if (s->preauth_mech != NULL)
+ return;
+
+ mech = kmem_zalloc(sizeof (*mech), KM_SLEEP);
+ rc = smb3_sha512_getmech(mech);
+ if (rc != 0) {
+ kmem_free(mech, sizeof (*mech));
+ return;
+ }
+ s->preauth_mech = mech;
+}
+
+void
+smb31_preauth_fini(smb_session_t *s)
+{
+ smb_crypto_mech_t *mech;
+
+ if ((mech = s->preauth_mech) != NULL) {
+ kmem_free(mech, sizeof (*mech));
+ s->preauth_mech = NULL;
+ }
+}
+
+/*
+ * Start the KCF digest session.
+ */
+int
+smb_sha512_init(smb_sign_ctx_t *ctxp, smb_crypto_mech_t *mech)
+{
+ int rv;
+
+ rv = crypto_digest_init(mech, ctxp, NULL);
+
+ return (rv == CRYPTO_SUCCESS ? 0 : -1);
+}
+
+/*
+ * Digest one segment
+ */
+int
+smb_sha512_update(smb_sign_ctx_t ctx, void *buf, size_t len)
+{
+ crypto_data_t data;
+ int rv;
+
+ bzero(&data, sizeof (data));
+ data.cd_format = CRYPTO_DATA_RAW;
+ data.cd_length = len;
+ data.cd_raw.iov_base = buf;
+ data.cd_raw.iov_len = len;
+
+ rv = crypto_digest_update(ctx, &data, 0);
+
+ if (rv != CRYPTO_SUCCESS) {
+ crypto_cancel_ctx(ctx);
+ return (-1);
+ }
+
+ return (0);
+}
+
+/*
+ * Get the final digest.
+ */
+int
+smb_sha512_final(smb_sign_ctx_t ctx, uint8_t *digest)
+{
+ crypto_data_t out;
+ int rv;
+
+ bzero(&out, sizeof (out));
+ out.cd_format = CRYPTO_DATA_RAW;
+ out.cd_length = SHA512_DIGEST_LENGTH;
+ out.cd_raw.iov_len = SHA512_DIGEST_LENGTH;
+ out.cd_raw.iov_base = (void *)digest;
+
+ rv = crypto_digest_final(ctx, &out, 0);
+
+ return (rv == CRYPTO_SUCCESS ? 0 : -1);
+}
+
+int
+smb31_preauth_sha512_calc(smb_request_t *sr, struct mbuf_chain *mbc,
+ uint8_t *in_hashval, uint8_t *out_hashval)
+{
+ smb_session_t *s = sr->session;
+ smb_sign_ctx_t ctx = 0;
+ struct mbuf *mbuf = mbc->chain;
+ int rc;
+
+ ASSERT3U(s->smb31_preauth_hashid, !=, 0);
+
+ if (s->preauth_mech == NULL)
+ return (-1);
+
+ if ((rc = smb_sha512_init(&ctx, s->preauth_mech)) != 0)
+ return (rc);
+
+ /* Digest current hashval */
+ rc = smb_sha512_update(ctx, in_hashval, SHA512_DIGEST_LENGTH);
+ if (rc != 0)
+ return (rc);
+
+ while (mbuf != NULL) {
+ rc = smb_sha512_update(ctx, mbuf->m_data, mbuf->m_len);
+ if (rc != 0)
+ return (rc);
+ mbuf = mbuf->m_next;
+ }
+
+ rc = smb_sha512_final(ctx, out_hashval);
+ return (rc);
+}
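
As context for the helper above, [MS-SMB2] chains the preauth integrity
hash as H(n) = SHA-512(H(n-1) || message), starting from a zeroed H(0).
A minimal usage sketch, reusing one buffer for input and output as the
callers in this change do:

	uint8_t hashval[SHA512_DIGEST_LENGTH] = { 0 };

	/* For each NEGOTIATE / SESSION_SETUP message in the exchange: */
	rc = smb31_preauth_sha512_calc(sr, &sr->command,
	    hashval, hashval);	/* H = SHA-512(H || msg) */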
diff --git a/usr/src/uts/common/fs/smbsrv/smb3_encrypt.c b/usr/src/uts/common/fs/smbsrv/smb3_encrypt.c
index fdbd49ef74..8b2f36f802 100644
--- a/usr/src/uts/common/fs/smbsrv/smb3_encrypt.c
+++ b/usr/src/uts/common/fs/smbsrv/smb3_encrypt.c
@@ -11,6 +11,7 @@
/*
* Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2020 RackTop Systems, Inc.
*/
/*
@@ -24,23 +25,8 @@
#define SMB3_NONCE_OFFS 20
#define SMB3_SIG_OFFS 4
-#define SMB3_NONCE_SIZE 11 /* 12 for gcm later */
-
-/*
- * Inputs to KDF for EncryptionKey and DecryptionKey.
- * See comment for smb3_do_kdf for content.
- */
-static uint8_t encrypt_kdf_input[30] = {
- 0, 0, 0, 1, 'S', 'M', 'B', '2',
- 'A', 'E', 'S', 'C', 'C', 'M', 0, 0,
- 'S', 'e', 'r', 'v', 'e', 'r', 'O',
- 'u', 't', 0, 0, 0, 0, 0x80 };
-
-static uint8_t decrypt_kdf_input[30] = {
- 0, 0, 0, 1, 'S', 'M', 'B', '2',
- 'A', 'E', 'S', 'C', 'C', 'M', 0, 0,
- 'S', 'e', 'r', 'v', 'e', 'r', 'I',
- 'n', ' ', 0, 0, 0, 0, 0x80 };
+#define SMB3_AES128_CCM_NONCE_SIZE 11
+#define SMB3_AES128_GCM_NONCE_SIZE 12
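(Per the [MS-SMB2] transform header, AES-128-CCM uses an 11-byte nonce
and AES-128-GCM a 12-byte nonce within the 16-byte Nonce field.)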
/*
* Arbitrary value used to prevent nonce reuse via overflow. Currently
@@ -100,8 +86,23 @@ smb3_encrypt_init_mech(smb_session_t *s)
if (s->enc_mech != NULL)
return (0);
+ if (s->dialect < SMB_VERS_3_11)
+ s->smb31_enc_cipherid = SMB3_CIPHER_AES128_CCM;
+
mech = kmem_zalloc(sizeof (*mech), KM_SLEEP);
- rc = smb3_encrypt_getmech(mech);
+
+ switch (s->smb31_enc_cipherid) {
+ case SMB3_CIPHER_AES128_GCM:
+ rc = smb3_aes_gcm_getmech(mech);
+ break;
+ case SMB3_CIPHER_AES128_CCM:
+ rc = smb3_aes_ccm_getmech(mech);
+ break;
+ default:
+ rc = -1;
+ break;
+ }
+
if (rc != 0) {
kmem_free(mech, sizeof (*mech));
return (rc);
@@ -150,15 +151,31 @@ smb3_encrypt_begin(smb_request_t *sr, smb_token_t *token)
* For SMB3, the encrypt/decrypt keys are derived from
* the session key using KDF in counter mode.
*/
- if (smb3_do_kdf(enc_key->key, encrypt_kdf_input,
- sizeof (encrypt_kdf_input), token->tkn_ssnkey.val,
- token->tkn_ssnkey.len) != 0)
- return;
-
- if (smb3_do_kdf(dec_key->key, decrypt_kdf_input,
- sizeof (decrypt_kdf_input), token->tkn_ssnkey.val,
- token->tkn_ssnkey.len) != 0)
- return;
+ if (s->dialect >= SMB_VERS_3_11) {
+ if (smb3_kdf(enc_key->key,
+ token->tkn_ssnkey.val, token->tkn_ssnkey.len,
+ (uint8_t *)"SMBS2CCipherKey", 16,
+ u->u_preauth_hashval, SHA512_DIGEST_LENGTH) != 0)
+ return;
+
+ if (smb3_kdf(dec_key->key,
+ token->tkn_ssnkey.val, token->tkn_ssnkey.len,
+ (uint8_t *)"SMBC2SCipherKey", 16,
+ u->u_preauth_hashval, SHA512_DIGEST_LENGTH) != 0)
+ return;
+ } else {
+ if (smb3_kdf(enc_key->key,
+ token->tkn_ssnkey.val, token->tkn_ssnkey.len,
+ (uint8_t *)"SMB2AESCCM", 11,
+ (uint8_t *)"ServerOut", 10) != 0)
+ return;
+
+ if (smb3_kdf(dec_key->key,
+ token->tkn_ssnkey.val, token->tkn_ssnkey.len,
+ (uint8_t *)"SMB2AESCCM", 11,
+ (uint8_t *)"ServerIn ", 10) != 0)
+ return;
+ }
smb3_encrypt_init_nonce(u);
@@ -184,6 +201,10 @@ smb3_decrypt_sr(smb_request_t *sr)
int offset, resid, tlen, rc;
smb3_crypto_param_t param;
smb_crypto_mech_t mech;
+ boolean_t gcm = sr->session->smb31_enc_cipherid ==
+ SMB3_CIPHER_AES128_GCM;
+ size_t nonce_size = (gcm ? SMB3_AES128_GCM_NONCE_SIZE :
+ SMB3_AES128_CCM_NONCE_SIZE);
ASSERT(u != NULL);
if (s->enc_mech == NULL || dec_key->len != 16) {
@@ -210,8 +231,12 @@ smb3_decrypt_sr(smb_request_t *sr)
* The transform header, minus the PROTOCOL_ID and the
* SIGNATURE, is authenticated but not encrypted.
*/
- smb3_crypto_init_param(&param, sr->nonce, SMB3_NONCE_SIZE,
- tmp_hdr, tlen, sr->msgsize + SMB2_SIG_SIZE);
+ if (gcm)
+ smb3_crypto_init_gcm_param(&param, sr->nonce, nonce_size,
+ tmp_hdr, tlen);
+ else
+ smb3_crypto_init_ccm_param(&param, sr->nonce, nonce_size,
+ tmp_hdr, tlen, sr->msgsize + SMB2_SIG_SIZE);
/*
* Unlike signing, which uses one global mech struct,
@@ -317,13 +342,17 @@ smb3_encrypt_sr(smb_request_t *sr, struct mbuf_chain *in_mbc,
int resid, tlen, rc;
smb3_crypto_param_t param;
smb_crypto_mech_t mech;
+ boolean_t gcm = sr->session->smb31_enc_cipherid ==
+ SMB3_CIPHER_AES128_GCM;
+ size_t nonce_size = (gcm ? SMB3_AES128_GCM_NONCE_SIZE :
+ SMB3_AES128_CCM_NONCE_SIZE);
ASSERT(u != NULL);
if (s->enc_mech == NULL || enc_key->len != 16) {
return (-1);
}
- rc = smb3_encrypt_gen_nonce(u, sr->nonce, SMB3_NONCE_SIZE);
+ rc = smb3_encrypt_gen_nonce(u, sr->nonce, nonce_size);
if (rc != 0) {
cmn_err(CE_WARN, "ran out of nonces");
@@ -331,7 +360,7 @@ smb3_encrypt_sr(smb_request_t *sr, struct mbuf_chain *in_mbc,
}
(void) smb_mbc_poke(out_mbc, SMB3_NONCE_OFFS, "#c",
- SMB3_NONCE_SIZE, sr->nonce);
+ nonce_size, sr->nonce);
resid = in_mbc->max_bytes;
@@ -339,10 +368,14 @@ smb3_encrypt_sr(smb_request_t *sr, struct mbuf_chain *in_mbc,
* The transform header, minus the PROTOCOL_ID and the
* SIGNATURE, is authenticated but not encrypted.
*/
- smb3_crypto_init_param(&param,
- sr->nonce, SMB3_NONCE_SIZE,
- buf + SMB3_NONCE_OFFS, SMB3_TFORM_HDR_SIZE - SMB3_NONCE_OFFS,
- resid);
+ if (gcm)
+ smb3_crypto_init_gcm_param(&param, sr->nonce, nonce_size,
+ buf + SMB3_NONCE_OFFS,
+ SMB3_TFORM_HDR_SIZE - SMB3_NONCE_OFFS);
+ else
+ smb3_crypto_init_ccm_param(&param, sr->nonce, nonce_size,
+ buf + SMB3_NONCE_OFFS,
+ SMB3_TFORM_HDR_SIZE - SMB3_NONCE_OFFS, resid);
/*
* Unlike signing, which uses one global mech struct,
diff --git a/usr/src/uts/common/fs/smbsrv/smb3_encrypt_kcf.c b/usr/src/uts/common/fs/smbsrv/smb3_encrypt_kcf.c
index 690a2d792d..c4392feb01 100644
--- a/usr/src/uts/common/fs/smbsrv/smb3_encrypt_kcf.c
+++ b/usr/src/uts/common/fs/smbsrv/smb3_encrypt_kcf.c
@@ -11,6 +11,7 @@
/*
* Copyright 2018 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2020 RackTop Systems, Inc.
*/
/*
@@ -28,36 +29,63 @@
#include <sys/cmn_err.h>
/*
- * SMB3 encryption helpers:
- * (getmech, init, update, final)
+ * Common function to see if a mech is available.
*/
-
-int
-smb3_encrypt_getmech(smb_crypto_mech_t *mech)
+static int
+find_mech(smb_crypto_mech_t *mech, const char *name)
{
crypto_mech_type_t t;
- t = crypto_mech2id(SUN_CKM_AES_CCM);
+ t = crypto_mech2id(name);
if (t == CRYPTO_MECH_INVALID) {
- cmn_err(CE_NOTE, "smb: no kcf mech: %s", SUN_CKM_AES_CCM);
+ cmn_err(CE_NOTE, "smb: no kcf mech: %s", name);
return (-1);
}
mech->cm_type = t;
-
return (0);
}
+/*
+ * SMB3 encryption helpers:
+ * (getmech, init, update, final)
+ */
+
+int
+smb3_aes_ccm_getmech(smb_crypto_mech_t *mech)
+{
+ return (find_mech(mech, SUN_CKM_AES_CCM));
+}
+
+int
+smb3_aes_gcm_getmech(smb_crypto_mech_t *mech)
+{
+ return (find_mech(mech, SUN_CKM_AES_GCM));
+}
+
void
-smb3_crypto_init_param(smb3_crypto_param_t *param,
+smb3_crypto_init_ccm_param(smb3_crypto_param_t *param,
uint8_t *nonce, size_t noncesize, uint8_t *auth, size_t authsize,
size_t datasize)
{
- param->ulMACSize = SMB2_SIG_SIZE;
- param->ulNonceSize = noncesize;
- param->nonce = nonce;
- param->ulDataSize = datasize;
- param->ulAuthDataSize = authsize;
- param->authData = auth;
+ param->ccm.ulMACSize = SMB2_SIG_SIZE;
+ param->ccm.ulNonceSize = noncesize;
+ param->ccm.nonce = nonce;
+ param->ccm.ulDataSize = datasize;
+ param->ccm.ulAuthDataSize = authsize;
+ param->ccm.authData = auth;
+}
+
+void
+smb3_crypto_init_gcm_param(smb3_crypto_param_t *param,
+ uint8_t *nonce, size_t noncesize, uint8_t *auth, size_t authsize)
+{
+ ASSERT3U(noncesize, ==, 12);
+ param->gcm.pIv = nonce;
+ param->gcm.ulIvLen = noncesize; /* should be 12 bytes */
+ /* tform hdr size - (protocol id + signature) == 32 bytes */
+ param->gcm.ulTagBits = SMB2_SIG_SIZE << 3; /* convert bytes to bits */
+ param->gcm.pAAD = auth; /* auth data */
+ param->gcm.ulAADLen = authsize; /* auth data len */
}
/*
@@ -199,7 +227,22 @@ smb3_encrypt_final(smb3_enc_ctx_t *ctxp, uint8_t *digest16)
return (-1);
}
- outlen = out.cd_offset - SMB2_SIG_SIZE;
+ /*
+ * For some reason the AES module handles ccm_encrypt_final and
+ * gcm_encrypt_final differently.
+ * For GCM it restores the original offset (which is 0) and updates
+ * cd_length to the size of the residual data plus the MAC length.
+ * For CCM it does nothing, which means the offset is updated and
+ * cd_length is decreased by the size of the residual data plus the MAC length.
+ */
+ if (out.cd_offset == 0) {
+ /* GCM */
+ outlen = out.cd_length - SMB2_SIG_SIZE;
+ } else {
+ /* CCM */
+ outlen = out.cd_offset - SMB2_SIG_SIZE;
+ }
+
if (outlen > 0)
bcopy(buf, ctxp->output.cd_raw.iov_base +
ctxp->output.cd_offset, outlen);
diff --git a/usr/src/uts/common/fs/smbsrv/smb3_kdf.c b/usr/src/uts/common/fs/smbsrv/smb3_kdf.c
new file mode 100644
index 0000000000..e62acd8808
--- /dev/null
+++ b/usr/src/uts/common/fs/smbsrv/smb3_kdf.c
@@ -0,0 +1,137 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2020 RackTop Systems, Inc.
+ */
+
+#include <smbsrv/smb_kcrypt.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+
+/*
+ * Derive SMB3 key as described in [MS-SMB2] 3.1.4.2
+ * and [NIST SP800-108]
+ *
+ * r = 32, L = 128, PRF = HMAC-SHA256, key = (session key)
+ */
+
+/*
+ * SMB 3.0.2 KDF Input
+ *
+ * Session.SigningKey for binding a session:
+ * - Session.SessionKey as K1
+ * - label = "SMB2AESCMAC" (size 12)
+ * - context = "SmbSign" (size 8)
+ * Channel.SigningKey for all other requests
+ * - if SMB2_SESSION_FLAG_BINDING, GSS key (in Session.SessionKey?) as K1;
+ * - otherwise, Session.SessionKey as K1
+ * - label = "SMB2AESCMAC" (size 12)
+ * - context = "SmbSign" (size 8)
+ * Session.ApplicationKey for ... (not sure what yet)
+ * - Session.SessionKey as K1
+ * - label = "SMB2APP" (size 8)
+ * - context = "SmbRpc" (size 7)
+ * Session.EncryptionKey for encrypting server messages
+ * - Session.SessionKey as K1
+ * - label = "SMB2AESCCM" (size 11)
+ * - context = "ServerOut" (size 10)
+ * Session.DecryptionKey for decrypting client requests
+ * - Session.SessionKey as K1
+ * - label = "SMB2AESCCM" (size 11)
+ * - context = "ServerIn " (size 10) (Note the space)
+ */
+
+/*
+ * SMB 3.1.1 KDF Input
+ *
+ * Session.SigningKey for binding a session:
+ * - Session.SessionKey as K1
+ * - label = "SMBSigningKey" (size 14)
+ * - context = preauth hashval
+ * Channel.SigningKey for all other requests
+ * - if SMB2_SESSION_FLAG_BINDING, GSS key (in Session.SessionKey?) as K1;
+ * - otherwise, Session.SessionKey as K1
+ * - label = "SMBSigningKey" (size 14)
+ * - context = preauth hashval
+ * Session.EncryptionKey for encrypting server messages
+ * - Session.SessionKey as K1
+ * - label = "SMBS2CCipherKey" (size 16)
+ * - context = preauth hashval
+ * Session.DecryptionKey for decrypting client requests
+ * - Session.SessionKey as K1
+ * - label = "SMBC2SCipherKey" (size 16)
+ * - context = preauth hashval
+ */
+
+/*
+ * SMB3KDF(Ki, Label, Context)
+ * counter || Label || 0x00 || Context || L
+ */
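
Worked example: with label "SMB2AESCMAC" (12 bytes including the
terminating NUL) and context "SmbSign" (8 bytes including the NUL),
the 29-byte KDF input is

	00 00 00 01 || "SMB2AESCMAC\0" || 00 || "SmbSign\0" || 00 00 00 80

i.e. counter = 1 and L = 0x80 = 128 bits; this is byte for byte the
static sign_kdf_input table that this change removes from
smb2_signing.c.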
+int
+smb3_kdf(uint8_t *outbuf,
+ uint8_t *key, size_t key_len,
+ uint8_t *label, size_t label_len,
+ uint8_t *context, size_t context_len)
+{
+ static uint8_t L[4] = { 0, 0, 0, 0x80 };
+ uint8_t digest32[SHA256_DIGEST_LENGTH];
+ /* Maximum length of the KDF input is 89, for the encryption/decryption keys */
+ uint8_t kdfbuf[89] = { 0, 0, 0, 1 }; /* initialized with the counter */
+ smb_crypto_mech_t mech;
+ smb_sign_ctx_t hctx = 0;
+ int pos = 4; /* skip counter */
+ int rc;
+
+ bcopy(label, &kdfbuf[pos], label_len);
+ pos += label_len;
+
+ kdfbuf[pos] = 0;
+ pos++;
+
+ bcopy(context, &kdfbuf[pos], context_len);
+ pos += context_len;
+
+ bcopy(L, &kdfbuf[pos], 4);
+ pos += 4;
+
+ bzero(&mech, sizeof (mech));
+ if ((rc = smb2_hmac_getmech(&mech)) != 0)
+ return (rc);
+
+ /* Limit the SessionKey input to its maximum size (16 bytes) */
+ rc = smb2_hmac_init(&hctx, &mech, key, MIN(key_len, SMB2_KEYLEN));
+ if (rc != 0)
+ return (rc);
+
+ if ((rc = smb2_hmac_update(hctx, kdfbuf, pos)) != 0)
+ return (rc);
+
+ if ((rc = smb2_hmac_final(hctx, digest32)) != 0)
+ return (rc);
+
+ /* Output is first 16 bytes of digest. */
+ bcopy(digest32, outbuf, SMB3_KEYLEN);
+ return (0);
+}
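
For example, smb2_sign_begin() above derives the SMB 3.1.1 signing key
with:

	rc = smb3_kdf(sign_key->key,
	    token->tkn_ssnkey.val, token->tkn_ssnkey.len,
	    (uint8_t *)"SMBSigningKey", 14,
	    u->u_preauth_hashval, SHA512_DIGEST_LENGTH);

Note that the string label length (14 for "SMBSigningKey") includes the
terminating NUL.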
diff --git a/usr/src/uts/common/fs/smbsrv/smb_authenticate.c b/usr/src/uts/common/fs/smbsrv/smb_authenticate.c
index 64f26363a6..c6da5a5158 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_authenticate.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_authenticate.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2019 Nexenta by DDN, Inc. All rights reserved.
+ * Copyright 2020 RackTop Systems, Inc.
*/
/*
@@ -36,6 +37,7 @@
#include <smbsrv/smb_idmap.h>
#include <smbsrv/smb_kproto.h>
#include <smbsrv/smb_token.h>
+#include <smbsrv/smb2_kproto.h>
static uint32_t smb_authsock_open(smb_request_t *);
static int smb_authsock_send(ksocket_t, void *, size_t);
@@ -285,6 +287,14 @@ smb_authenticate_ext(smb_request_t *sr)
goto errout;
msg_hdr.lmh_msgtype = LSA_MTYPE_ESFIRST;
+
+ if (sr->session->dialect >= SMB_VERS_3_11) {
+ if (smb31_preauth_sha512_calc(sr, &sr->command,
+ sr->session->smb31_preauth_hashval,
+ user->u_preauth_hashval) != 0)
+ cmn_err(CE_WARN, "(2) Preauth hash calculation "
+ "failed");
+ }
} else {
user = smb_session_lookup_uid_st(sr->session,
sr->smb2_ssnid, sr->smb_uid, SMB_USER_STATE_LOGGING_ON);
@@ -295,6 +305,14 @@ smb_authenticate_ext(smb_request_t *sr)
sr->uid_user = user;
msg_hdr.lmh_msgtype = LSA_MTYPE_ESNEXT;
+
+ if (sr->session->dialect >= SMB_VERS_3_11) {
+ if (smb31_preauth_sha512_calc(sr, &sr->command,
+ user->u_preauth_hashval,
+ user->u_preauth_hashval) != 0)
+ cmn_err(CE_WARN, "(4) Preauth hash calculation "
+ "failed");
+ }
}
/*
diff --git a/usr/src/uts/common/fs/smbsrv/smb_cmn_oplock.c b/usr/src/uts/common/fs/smbsrv/smb_cmn_oplock.c
index 39d67dd824..8ec21f5f37 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_cmn_oplock.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_cmn_oplock.c
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2020 Nexenta by DDN, Inc. All rights reserved.
*/
/*
@@ -495,9 +495,20 @@ smb_oplock_request(smb_request_t *sr, smb_ofile_t *ofile, uint32_t *statep)
}
/* Give caller back the "Granular" bit. */
- if (status == NT_STATUS_SUCCESS)
+ if (status == NT_STATUS_SUCCESS) {
*statep |= LEVEL_GRANULAR;
+ /*
+ * The oplock lease may have moved to this ofile. Update.
+ * Minor violation of layering here (leases vs oplocks)
+ * but we want this update covered by the oplock mutex.
+ */
+#ifndef TESTJIG
+ if (ofile->f_lease != NULL)
+ ofile->f_lease->ls_oplock_ofile = ofile;
+#endif
+ }
+
out:
mutex_exit(&node->n_oplock.ol_mutex);
smb_llist_exit(&node->n_ofile_list);
@@ -545,6 +556,12 @@ smb_oplock_req_excl(
ASSERT(MUTEX_HELD(&node->n_oplock.ol_mutex));
/*
+ * Don't allow grants on closing ofiles.
+ */
+ if (ofile->f_oplock.og_closing)
+ return (status);
+
+ /*
* If Open.Stream.Oplock is empty:
* Build a new Oplock object with fields initialized as follows:
* Oplock.State set to NO_OPLOCK.
@@ -1030,6 +1047,12 @@ smb_oplock_req_shared(
ASSERT(MUTEX_HELD(&node->n_oplock.ol_mutex));
/*
+ * Don't allow grants on closing ofiles.
+ */
+ if (ofile->f_oplock.og_closing)
+ return (status);
+
+ /*
* If Open.Stream.Oplock is empty:
* Build a new Oplock object with fields initialized as follows:
* Oplock.State set to NO_OPLOCK.
@@ -2036,6 +2059,20 @@ smb_oplock_ack_break(
} /* Switch (oplock.state) */
out:
+ if (status == NT_STATUS_INVALID_OPLOCK_PROTOCOL)
+ *rop = LEVEL_NONE;
+
+ if (status == NT_STATUS_SUCCESS &&
+ type == LEVEL_GRANULAR &&
+ *rop != LEVEL_NONE) {
+ *rop |= LEVEL_GRANULAR;
+ /* As above, leased oplock may have moved. */
+#ifndef TESTJIG
+ if (ofile->f_lease != NULL)
+ ofile->f_lease->ls_oplock_ofile = ofile;
+#endif
+ }
+
/*
* The spec. describes waiting for a break here,
* but we let the caller do that (when needed) if
@@ -2044,14 +2081,6 @@ out:
mutex_exit(&node->n_oplock.ol_mutex);
smb_llist_exit(&node->n_ofile_list);
- if (status == NT_STATUS_INVALID_OPLOCK_PROTOCOL)
- *rop = LEVEL_NONE;
-
- if (status == NT_STATUS_SUCCESS &&
- type == LEVEL_GRANULAR &&
- *rop != LEVEL_NONE)
- *rop |= LEVEL_GRANULAR;
-
return (status);
}
@@ -2257,13 +2286,12 @@ smb_oplock_break_CLOSE(smb_node_t *node, smb_ofile_t *ofile)
{
smb_ofile_t *o;
- if (ofile == NULL) {
- ASSERT(0);
- return;
- }
+ ASSERT(RW_READ_HELD(&node->n_ofile_list.ll_lock));
+ ASSERT(MUTEX_HELD(&node->n_oplock.ol_mutex));
- smb_llist_enter(&node->n_ofile_list, RW_READER);
- mutex_enter(&node->n_oplock.ol_mutex);
+ if (ofile->f_oplock.og_closing)
+ return;
+ ofile->f_oplock.og_closing = B_TRUE;
/*
* If Oplock.IIOplocks is not empty:
@@ -2481,8 +2509,6 @@ smb_oplock_break_CLOSE(smb_node_t *node, smb_ofile_t *ofile)
if ((node->n_oplock.ol_state & BREAK_ANY) == 0)
cv_broadcast(&node->n_oplock.WaitingOpenCV);
- mutex_exit(&node->n_oplock.ol_mutex);
- smb_llist_exit(&node->n_ofile_list);
}
/*
@@ -3515,8 +3541,7 @@ smb_oplock_move(smb_node_t *node,
ASSERT(fr_ofile->f_node == node);
ASSERT(to_ofile->f_node == node);
-
- mutex_enter(&node->n_oplock.ol_mutex);
+ ASSERT(MUTEX_HELD(&node->n_oplock.ol_mutex));
/*
* The ofile to which we're moving the oplock
@@ -3541,5 +3566,4 @@ smb_oplock_move(smb_node_t *node,
if (node->n_oplock.excl_open == fr_ofile)
node->n_oplock.excl_open = to_ofile;
- mutex_exit(&node->n_oplock.ol_mutex);
}
diff --git a/usr/src/uts/common/fs/smbsrv/smb_common_open.c b/usr/src/uts/common/fs/smbsrv/smb_common_open.c
index 8007463ba1..fb4d46f599 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_common_open.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_common_open.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2020 Tintri by DDN, Inc. All rights reserved.
*/
/*
@@ -253,6 +253,7 @@ smb_common_open(smb_request_t *sr)
smb_node_t *fnode = NULL;
smb_node_t *dnode = NULL;
smb_node_t *cur_node = NULL;
+ smb_node_t *tmp_node = NULL;
smb_arg_open_t *op = &sr->sr_open;
smb_pathname_t *pn = &op->fqi.fq_path;
smb_ofile_t *of = NULL;
@@ -269,6 +270,7 @@ smb_common_open(smb_request_t *sr)
uint16_t tree_fid = 0;
boolean_t created = B_FALSE;
boolean_t last_comp_found = B_FALSE;
+ boolean_t stream_found = B_FALSE;
boolean_t opening_incr = B_FALSE;
boolean_t dnode_held = B_FALSE;
boolean_t dnode_wlock = B_FALSE;
@@ -278,6 +280,7 @@ smb_common_open(smb_request_t *sr)
boolean_t did_open = B_FALSE;
boolean_t did_break_handle = B_FALSE;
boolean_t did_cleanup_orphans = B_FALSE;
+ char *sname = NULL;
/* Get out now if we've been cancelled. */
mutex_enter(&sr->sr_mutex);
@@ -418,9 +421,13 @@ smb_common_open(smb_request_t *sr)
if ((op->desired_access & ~FILE_READ_ATTRIBUTES) == DELETE)
lookup_flags &= ~SMB_FOLLOW_LINKS;
- rc = smb_fsop_lookup_name(sr, zone_kcred(), lookup_flags,
+ /*
+ * Look up *just* the file portion of the name.
+ * Returns the stream name in sname, which this allocates.
+ */
+ rc = smb_fsop_lookup_file(sr, zone_kcred(), lookup_flags,
sr->tid_tree->t_snode, op->fqi.fq_dnode, op->fqi.fq_last_comp,
- &op->fqi.fq_fnode);
+ &sname, &op->fqi.fq_fnode);
if (rc == 0) {
last_comp_found = B_TRUE;
@@ -449,9 +456,6 @@ smb_common_open(smb_request_t *sr)
if (last_comp_found) {
- smb_node_unlock(dnode);
- dnode_wlock = B_FALSE;
-
fnode = op->fqi.fq_fnode;
dnode = op->fqi.fq_dnode;
@@ -468,8 +472,9 @@ smb_common_open(smb_request_t *sr)
* it must NOT be (required by Lotus Notes)
* - the target is NOT a directory and client requires that
* it MUST be.
+ * Streams are never directories.
*/
- if (smb_node_is_dir(fnode)) {
+ if (smb_node_is_dir(fnode) && sname == NULL) {
if (op->create_options & FILE_NON_DIRECTORY_FILE) {
status = NT_STATUS_FILE_IS_A_DIRECTORY;
goto errout;
@@ -482,20 +487,81 @@ smb_common_open(smb_request_t *sr)
}
}
- /*
- * No more open should be accepted when "Delete on close"
- * flag is set.
- */
- if (fnode->flags & NODE_FLAGS_DELETE_ON_CLOSE) {
- status = NT_STATUS_DELETE_PENDING;
- goto errout;
+ /* If we're given a stream name, look it up now */
+ if (sname != NULL) {
+ tmp_node = fnode;
+ rc = smb_fsop_lookup_stream(sr, zone_kcred(),
+ lookup_flags, sr->tid_tree->t_snode, fnode, sname,
+ &fnode);
+ } else {
+ rc = 0;
}
- /*
- * Specified file already exists so the operation should fail.
- */
- if (op->create_disposition == FILE_CREATE) {
- status = NT_STATUS_OBJECT_NAME_COLLISION;
+ if (rc == 0) { /* Stream Exists (including unnamed stream) */
+ stream_found = B_TRUE;
+ smb_node_unlock(dnode);
+ dnode_wlock = B_FALSE;
+
+ if (tmp_node != NULL)
+ smb_node_release(tmp_node);
+
+ /*
+ * No more open should be accepted when
+ * "Delete on close" flag is set.
+ */
+ if (fnode->flags & NODE_FLAGS_DELETE_ON_CLOSE) {
+ status = NT_STATUS_DELETE_PENDING;
+ goto errout;
+ }
+
+ /*
+ * Specified file already exists
+ * so the operation should fail.
+ */
+ if (op->create_disposition == FILE_CREATE) {
+ status = NT_STATUS_OBJECT_NAME_COLLISION;
+ goto errout;
+ }
+
+ if ((op->create_disposition == FILE_SUPERSEDE) ||
+ (op->create_disposition == FILE_OVERWRITE_IF) ||
+ (op->create_disposition == FILE_OVERWRITE)) {
+
+ if (sname == NULL) {
+ if (!smb_sattr_check(
+ op->fqi.fq_fattr.sa_dosattr,
+ op->dattr)) {
+ status =
+ NT_STATUS_ACCESS_DENIED;
+ goto errout;
+ }
+ op->desired_access |=
+ FILE_WRITE_ATTRIBUTES;
+ }
+
+ if (smb_node_is_dir(fnode)) {
+ status = NT_STATUS_ACCESS_DENIED;
+ goto errout;
+ }
+ }
+
+ /* MS-FSA 2.1.5.1.2 */
+ if (op->create_disposition == FILE_SUPERSEDE)
+ op->desired_access |= DELETE;
+ if ((op->create_disposition == FILE_OVERWRITE_IF) ||
+ (op->create_disposition == FILE_OVERWRITE))
+ op->desired_access |= FILE_WRITE_DATA;
+ } else if (rc == ENOENT) { /* File Exists, but Stream doesn't */
+ if (op->create_disposition == FILE_OPEN ||
+ op->create_disposition == FILE_OVERWRITE) {
+ status = NT_STATUS_OBJECT_NAME_NOT_FOUND;
+ goto errout;
+ }
+
+ op->desired_access |= FILE_WRITE_DATA;
+ } else { /* Error looking up stream */
+ status = smb_errno2status(rc);
+ fnode = tmp_node;
goto errout;
}
@@ -520,29 +586,6 @@ smb_common_open(smb_request_t *sr)
}
}
- if ((op->create_disposition == FILE_SUPERSEDE) ||
- (op->create_disposition == FILE_OVERWRITE_IF) ||
- (op->create_disposition == FILE_OVERWRITE)) {
-
- if (!smb_sattr_check(op->fqi.fq_fattr.sa_dosattr,
- op->dattr)) {
- status = NT_STATUS_ACCESS_DENIED;
- goto errout;
- }
-
- if (smb_node_is_dir(fnode)) {
- status = NT_STATUS_ACCESS_DENIED;
- goto errout;
- }
- }
-
- /* MS-FSA 2.1.5.1.2 */
- if (op->create_disposition == FILE_SUPERSEDE)
- op->desired_access |= DELETE;
- if ((op->create_disposition == FILE_OVERWRITE_IF) ||
- (op->create_disposition == FILE_OVERWRITE))
- op->desired_access |= FILE_WRITE_DATA;
-
/* Dataset roots can't be deleted, so don't set DOC */
if ((op->create_options & FILE_DELETE_ON_CLOSE) != 0 &&
(fnode->flags & NODE_FLAGS_VFSROOT) != 0) {
@@ -552,6 +595,7 @@ smb_common_open(smb_request_t *sr)
status = smb_fsop_access(sr, sr->user_cr, fnode,
op->desired_access);
+
if (status != NT_STATUS_SUCCESS)
goto errout;
@@ -575,6 +619,31 @@ smb_common_open(smb_request_t *sr)
if ((op->desired_access & FILE_DATA_ALL) != 0)
op->desired_access |= FILE_READ_ATTRIBUTES;
+ /* If the stream didn't exist, create it now */
+ if (!stream_found) {
+ smb_node_t *tmp_node = fnode;
+
+ bzero(&new_attr, sizeof (new_attr));
+ new_attr.sa_vattr.va_type = VREG;
+ new_attr.sa_vattr.va_mode = S_IRUSR;
+ new_attr.sa_mask |= SMB_AT_TYPE | SMB_AT_MODE;
+
+ rc = smb_fsop_create_stream(sr, sr->user_cr, dnode,
+ fnode, sname, lookup_flags, &new_attr, &fnode);
+ smb_node_release(tmp_node);
+
+ if (rc != 0) {
+ status = smb_errno2status(rc);
+ fnode_held = B_FALSE;
+ goto errout;
+ }
+ op->action_taken = SMB_OACT_CREATED;
+ created = B_TRUE;
+
+ smb_node_unlock(dnode);
+ dnode_wlock = B_FALSE;
+ }
+
/*
* Oplock break is done prior to sharing checks as the break
* may cause other clients to close the file which would
@@ -593,6 +662,24 @@ smb_common_open(smb_request_t *sr)
smb_node_inc_opening_count(fnode);
opening_incr = B_TRUE;
+ if (!stream_found) {
+ /*
+ * Stake our Share Access claim.
+ */
+ smb_node_wrlock(fnode);
+ fnode_wlock = B_TRUE;
+
+ status = smb_fsop_shrlock(sr->user_cr, fnode, uniq_fid,
+ op->desired_access, op->share_access);
+ if (status != 0)
+ goto errout;
+
+ fnode_shrlk = B_TRUE;
+ smb_node_unlock(fnode);
+ fnode_wlock = B_FALSE;
+ goto stream_created;
+ }
+
/*
* XXX Supposed to do share access checks next.
* [MS-FSA] describes that as part of access check:
@@ -780,11 +867,20 @@ smb_common_open(smb_request_t *sr)
case FILE_SUPERSEDE:
case FILE_OVERWRITE_IF:
case FILE_OVERWRITE:
- op->dattr |= FILE_ATTRIBUTE_ARCHIVE;
- /* Don't apply readonly until smb_set_open_attributes */
- if (op->dattr & FILE_ATTRIBUTE_READONLY) {
- op->dattr &= ~FILE_ATTRIBUTE_READONLY;
- op->created_readonly = B_TRUE;
+ bzero(&new_attr, sizeof (new_attr));
+ if (sname == NULL) {
+ op->dattr |= FILE_ATTRIBUTE_ARCHIVE;
+ /*
+ * Don't apply readonly until
+ * smb_set_open_attributes
+ */
+ if (op->dattr & FILE_ATTRIBUTE_READONLY) {
+ op->dattr &= ~FILE_ATTRIBUTE_READONLY;
+ op->created_readonly = B_TRUE;
+ }
+ new_attr.sa_dosattr = op->dattr;
+ } else {
+ new_attr.sa_dosattr = FILE_ATTRIBUTE_ARCHIVE;
}
/*
@@ -793,8 +889,6 @@ smb_common_open(smb_request_t *sr)
* after we have an ofile. See:
* smb_set_open_attributes
*/
- bzero(&new_attr, sizeof (new_attr));
- new_attr.sa_dosattr = op->dattr;
new_attr.sa_vattr.va_size = 0;
new_attr.sa_mask = SMB_AT_DOSATTR | SMB_AT_SIZE;
rc = smb_fsop_setattr(sr, sr->user_cr, fnode,
@@ -844,6 +938,12 @@ create:
goto errout;
}
+ if ((op->desired_access & ACCESS_SYSTEM_SECURITY) != 0 &&
+ !smb_user_has_security_priv(sr->uid_user, sr->user_cr)) {
+ status = NT_STATUS_ACCESS_DENIED;
+ goto errout;
+ }
+
if (pn->pn_fname && smb_is_invalid_filename(pn->pn_fname)) {
status = NT_STATUS_OBJECT_NAME_INVALID;
goto errout;
@@ -982,6 +1082,7 @@ create:
(void) smb_oplock_break_PARENT(dnode, of);
}
+stream_created:
/*
* We might have blocked in smb_oplock_break_OPEN long enough
* so a tree disconnect might have happened. In that case,
@@ -1061,6 +1162,8 @@ create:
* how that happens is protocol-specific.
*/
+ if (sname != NULL)
+ kmem_free(sname, MAXNAMELEN);
if (fnode_wlock)
smb_node_unlock(fnode);
if (opening_incr)
@@ -1091,6 +1194,8 @@ errout:
smb_delete_new_object(sr);
}
+ if (sname != NULL)
+ kmem_free(sname, MAXNAMELEN);
if (fnode_wlock)
smb_node_unlock(fnode);
if (opening_incr)
@@ -1147,22 +1252,6 @@ smb_set_open_attributes(smb_request_t *sr, smb_ofile_t *of)
attr.sa_mask |= SMB_AT_MTIME;
}
- /*
- * Used to have code here to set mtime, ctime, atime
- * when the open op->create_disposition is any of:
- * FILE_SUPERSEDE, FILE_OVERWRITE_IF, FILE_OVERWRITE.
- * We know that in those cases we will have set the
- * file size, in which case the file system will
- * update those times, so we don't have to.
- *
- * However, keep track of the fact that we modified
- * the file via this handle, so we can do the evil,
- * gratuitious mtime update on close that Windows
- * clients expect.
- */
- if (op->action_taken == SMB_OACT_TRUNCATED)
- of->f_written = B_TRUE;
-
if (attr.sa_mask != 0)
rc = smb_node_setattr(sr, node, of->f_cr, of, &attr);
diff --git a/usr/src/uts/common/fs/smbsrv/smb_fem.c b/usr/src/uts/common/fs/smbsrv/smb_fem.c
index c41ddddac8..b68466edaa 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_fem.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_fem.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2020 Tintri by DDN, Inc. All rights reserved.
* Copyright 2015 Joyent, Inc.
*/
@@ -170,12 +170,15 @@ smb_fem_fcn_install(smb_node_t *node)
return (rc);
}
-void
+int
smb_fem_fcn_uninstall(smb_node_t *node)
{
+ int rc;
+
if (smb_fcn_ops == NULL)
- return;
- VERIFY0(fem_uninstall(node->vp, smb_fcn_ops, (void *)node));
+ return (ENOSYS);
+ rc = fem_uninstall(node->vp, smb_fcn_ops, (void *)node);
+ return (rc);
}
int
diff --git a/usr/src/uts/common/fs/smbsrv/smb_fsops.c b/usr/src/uts/common/fs/smbsrv/smb_fsops.c
index 8fafac5f60..4d6ffa5754 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_fsops.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_fsops.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2020 Nexenta by DDN, Inc. All rights reserved.
*/
#include <sys/sid.h>
@@ -35,8 +35,8 @@
extern caller_context_t smb_ct;
-static int smb_fsop_create_stream(smb_request_t *, cred_t *, smb_node_t *,
- char *, char *, int, smb_attr_t *, smb_node_t **);
+static int smb_fsop_create_file_with_stream(smb_request_t *, cred_t *,
+ smb_node_t *, char *, char *, int, smb_attr_t *, smb_node_t **);
static int smb_fsop_create_file(smb_request_t *, cred_t *, smb_node_t *,
char *, int, smb_attr_t *, smb_node_t **);
@@ -136,6 +136,7 @@ smb_fsop_create_with_sd(smb_request_t *sr, cred_t *cr,
boolean_t is_dir;
ASSERT(fs_sd);
+ ASSERT(ret_snode != NULL);
if (SMB_TREE_IS_CASEINSENSITIVE(sr))
flags = SMB_IGNORE_CASE;
@@ -147,10 +148,9 @@ smb_fsop_create_with_sd(smb_request_t *sr, cred_t *cr,
is_dir = ((fs_sd->sd_flags & SMB_FSSD_FLAGS_DIR) != 0);
if (smb_tree_has_feature(sr->tid_tree, SMB_TREE_ACLONCREATE)) {
- if (fs_sd->sd_secinfo & SMB_ACL_SECINFO) {
- dacl = fs_sd->sd_zdacl;
- sacl = fs_sd->sd_zsacl;
- ASSERT(dacl || sacl);
+ dacl = fs_sd->sd_zdacl;
+ sacl = fs_sd->sd_zsacl;
+ if (dacl != NULL || sacl != NULL) {
if (dacl && sacl) {
acl = smb_fsacl_merge(dacl, sacl);
} else if (dacl) {
@@ -320,7 +320,7 @@ smb_fsop_create(smb_request_t *sr, cred_t *cr, smb_node_t *dnode,
sname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
smb_stream_parse_name(name, fname, sname);
- rc = smb_fsop_create_stream(sr, cr, dnode,
+ rc = smb_fsop_create_file_with_stream(sr, cr, dnode,
fname, sname, flags, attr, ret_snode);
kmem_free(fname, MAXNAMELEN);
@@ -349,39 +349,31 @@ smb_fsop_create(smb_request_t *sr, cred_t *cr, smb_node_t *dnode,
/*
- * smb_fsop_create_stream
+ * smb_fsop_create_file_with_stream
*
- * Create NTFS named stream file (sname) on unnamed stream
- * file (fname), creating the unnamed stream file if it
+ * Create named stream (sname) on file (fname), creating the file if it
* doesn't exist.
- * If we created the unnamed stream file and then creation
- * of the named stream file fails, we delete the unnamed stream.
+ * If we created the file and then creation of the named stream fails,
+ * we delete the file.
* Since we use the real file name for the smb_vop_remove we
* clear the SMB_IGNORE_CASE flag to ensure a case sensitive
* match.
*
- * The second parameter of smb_vop_setattr() is set to
- * NULL, even though an unnamed stream exists. This is
- * because we want to set the UID and GID on the named
- * stream in this case for consistency with the (unnamed
- * stream) file (see comments for smb_vop_setattr()).
- *
* Note that some stream "types" are "restricted" and only
* internal callers (cr == kcred) can create those.
*/
static int
-smb_fsop_create_stream(smb_request_t *sr, cred_t *cr,
+smb_fsop_create_file_with_stream(smb_request_t *sr, cred_t *cr,
smb_node_t *dnode, char *fname, char *sname, int flags,
smb_attr_t *attr, smb_node_t **ret_snode)
{
- smb_attr_t fattr;
smb_node_t *fnode;
- vnode_t *xattrdvp;
- vnode_t *vp;
cred_t *kcr = zone_kcred();
int rc = 0;
boolean_t fcreate = B_FALSE;
+ ASSERT(ret_snode != NULL);
+
if (cr != kcr && smb_strname_restricted(sname))
return (EACCES);
@@ -390,8 +382,10 @@ smb_fsop_create_stream(smb_request_t *sr, cred_t *cr,
sr->tid_tree->t_snode, dnode, fname, &fnode);
if (rc == 0) {
if (smb_fsop_access(sr, sr->user_cr, fnode,
- sr->sr_open.desired_access) != 0)
+ sr->sr_open.desired_access) != 0) {
+ smb_node_release(fnode);
rc = EACCES;
+ }
} else if (rc == ENOENT) {
fcreate = B_TRUE;
rc = smb_fsop_create_file(sr, cr, dnode, fname, flags,
@@ -400,38 +394,77 @@ smb_fsop_create_stream(smb_request_t *sr, cred_t *cr,
if (rc != 0)
return (rc);
- fattr.sa_mask = SMB_AT_UID | SMB_AT_GID;
- rc = smb_vop_getattr(fnode->vp, NULL, &fattr, 0, kcr);
+ rc = smb_fsop_create_stream(sr, cr, dnode, fnode, sname, flags, attr,
+ ret_snode);
- if (rc == 0) {
- /* create the named stream, sname */
- rc = smb_vop_stream_create(fnode->vp, sname, attr,
- &vp, &xattrdvp, flags, cr);
- }
if (rc != 0) {
if (fcreate) {
flags &= ~SMB_IGNORE_CASE;
(void) smb_vop_remove(dnode->vp,
fnode->od_name, flags, cr);
}
- smb_node_release(fnode);
- return (rc);
}
+ smb_node_release(fnode);
+ return (rc);
+}
+
+/*
+ * smb_fsop_create_stream
+ *
+ * Create named stream (sname) on existing file (fnode).
+ *
+ * The second parameter of smb_vop_setattr() is set to
+ * NULL, even though an unnamed stream exists. This is
+ * because we want to set the UID and GID on the named
+ * stream in this case for consistency with the (unnamed
+ * stream) file (see comments for smb_vop_setattr()).
+ *
+ * Note that some stream "types" are "restricted" and only
+ * internal callers (cr == kcred) can create those.
+ */
+int
+smb_fsop_create_stream(smb_request_t *sr, cred_t *cr,
+ smb_node_t *dnode, smb_node_t *fnode, char *sname, int flags,
+ smb_attr_t *attr, smb_node_t **ret_snode)
+{
+ smb_attr_t fattr;
+ vnode_t *xattrdvp;
+ vnode_t *vp;
+ cred_t *kcr = zone_kcred();
+ int rc = 0;
+
+ ASSERT(ret_snode != NULL);
+
+ if (cr != kcr && smb_strname_restricted(sname))
+ return (EACCES);
+
+ bzero(&fattr, sizeof (fattr));
+ fattr.sa_mask = SMB_AT_UID | SMB_AT_GID;
+ rc = smb_vop_getattr(fnode->vp, NULL, &fattr, 0, kcr);
+
+ if (rc == 0) {
+ /* create the named stream, sname */
+ rc = smb_vop_stream_create(fnode->vp, sname,
+ attr, &vp, &xattrdvp, flags, cr);
+ }
+ if (rc != 0)
+ return (rc);
+
attr->sa_vattr.va_uid = fattr.sa_vattr.va_uid;
attr->sa_vattr.va_gid = fattr.sa_vattr.va_gid;
attr->sa_mask = SMB_AT_UID | SMB_AT_GID;
rc = smb_vop_setattr(vp, NULL, attr, 0, kcr);
if (rc != 0) {
- smb_node_release(fnode);
+ VN_RELE(xattrdvp);
+ VN_RELE(vp);
return (rc);
}
*ret_snode = smb_stream_node_lookup(sr, cr, fnode, xattrdvp,
vp, sname);
- smb_node_release(fnode);
VN_RELE(xattrdvp);
VN_RELE(vp);
@@ -441,7 +474,7 @@ smb_fsop_create_stream(smb_request_t *sr, cred_t *cr,
/* notify change to the unnamed stream */
if (rc == 0)
smb_node_notify_change(dnode,
- FILE_ACTION_ADDED_STREAM, fname);
+ FILE_ACTION_ADDED_STREAM, fnode->od_name);
return (rc);
}
@@ -458,6 +491,8 @@ smb_fsop_create_file(smb_request_t *sr, cred_t *cr,
vnode_t *vp;
int rc;
+ ASSERT(ret_snode != NULL);
+
#ifdef _KERNEL
smb_fssd_t fs_sd;
uint32_t secinfo;
@@ -466,15 +501,24 @@ smb_fsop_create_file(smb_request_t *sr, cred_t *cr,
if (op->sd) {
/*
* SD sent by client in Windows format. Needs to be
- * converted to FS format. No inheritance.
+ * converted to FS format. Inherit DACL/SACL if they're not
+ * specified.
*/
secinfo = smb_sd_get_secinfo(op->sd);
+
+ if ((secinfo & SMB_SACL_SECINFO) != 0 &&
+ !smb_user_has_security_priv(sr->uid_user, cr))
+ return (EPERM);
+
smb_fssd_init(&fs_sd, secinfo, 0);
status = smb_sd_tofs(op->sd, &fs_sd);
if (status == NT_STATUS_SUCCESS) {
- rc = smb_fsop_create_with_sd(sr, cr, dnode,
- name, attr, ret_snode, &fs_sd);
+ rc = smb_fsop_sdinherit(sr, dnode, &fs_sd);
+ if (rc == 0)
+ rc = smb_fsop_create_with_sd(sr, cr, dnode,
+ name, attr, ret_snode, &fs_sd);
+
} else {
rc = EINVAL;
}
@@ -485,7 +529,7 @@ smb_fsop_create_file(smb_request_t *sr, cred_t *cr,
* Server applies Windows inheritance rules,
* see smb_fsop_sdinherit() comments as to why.
*/
- smb_fssd_init(&fs_sd, SMB_ACL_SECINFO, 0);
+ smb_fssd_init(&fs_sd, 0, 0);
rc = smb_fsop_sdinherit(sr, dnode, &fs_sd);
if (rc == 0) {
rc = smb_fsop_create_with_sd(sr, cr, dnode,
@@ -607,15 +651,23 @@ smb_fsop_mkdir(
if (op->sd) {
/*
* SD sent by client in Windows format. Needs to be
- * converted to FS format. No inheritance.
+ * converted to FS format. Inherit DACL/SACL if they're not
+ * specified.
*/
secinfo = smb_sd_get_secinfo(op->sd);
+
+ if ((secinfo & SMB_SACL_SECINFO) != 0 &&
+ !smb_user_has_security_priv(sr->uid_user, cr))
+ return (EPERM);
+
smb_fssd_init(&fs_sd, secinfo, SMB_FSSD_FLAGS_DIR);
status = smb_sd_tofs(op->sd, &fs_sd);
if (status == NT_STATUS_SUCCESS) {
- rc = smb_fsop_create_with_sd(sr, cr, dnode,
- name, attr, ret_snode, &fs_sd);
+ rc = smb_fsop_sdinherit(sr, dnode, &fs_sd);
+ if (rc == 0)
+ rc = smb_fsop_create_with_sd(sr, cr, dnode,
+ name, attr, ret_snode, &fs_sd);
}
else
rc = EINVAL;
@@ -626,7 +678,7 @@ smb_fsop_mkdir(
* Server applies Windows inheritance rules,
* see smb_fsop_sdinherit() comments as to why.
*/
- smb_fssd_init(&fs_sd, SMB_ACL_SECINFO, SMB_FSSD_FLAGS_DIR);
+ smb_fssd_init(&fs_sd, 0, SMB_FSSD_FLAGS_DIR);
rc = smb_fsop_sdinherit(sr, dnode, &fs_sd);
if (rc == 0) {
rc = smb_fsop_create_with_sd(sr, cr, dnode,
@@ -1519,7 +1571,7 @@ smb_fsop_write(
cr = kcr;
}
- smb_node_start_crit(snode, RW_WRITER);
+ smb_node_start_crit(snode, RW_READER);
rc = nbl_svmand(vp, kcr, &svmand);
if (rc) {
smb_node_end_crit(snode);
@@ -1691,10 +1743,7 @@ smb_fsop_access(smb_request_t *sr, cred_t *cr, smb_node_t *snode,
* it's not part of DACL. It's only granted via proper
* privileges.
*/
- if ((sr->uid_user->u_privileges &
- (SMB_USER_PRIV_BACKUP |
- SMB_USER_PRIV_RESTORE |
- SMB_USER_PRIV_SECURITY)) == 0)
+ if (!smb_user_has_security_priv(sr->uid_user, cr))
return (NT_STATUS_PRIVILEGE_NOT_HELD);
faccess &= ~ACCESS_SYSTEM_SECURITY;
@@ -1736,9 +1785,13 @@ smb_fsop_access(smb_request_t *sr, cred_t *cr, smb_node_t *snode,
/*
* smb_fsop_lookup_name()
*
+ * Look up both the file and stream specified in 'name'.
* If name indicates that the file is a stream file, perform
* stream specific lookup, otherwise call smb_fsop_lookup.
*
+ * On success, returns the found node in *ret_snode. This will be either a named
+ * or unnamed stream node, depending on the name specified.
+ *
* Return an error if the looked-up file is outside the tree.
* (Required when invoked from open path.)
*
@@ -1760,18 +1813,64 @@ smb_fsop_lookup_name(
char *name,
smb_node_t **ret_snode)
{
- smb_node_t *fnode;
- vnode_t *xattrdirvp;
- vnode_t *vp;
- char *od_name;
+ char *sname = NULL;
+ int rc;
+ smb_node_t *tmp_node;
+
+ ASSERT(ret_snode != NULL);
+
+ rc = smb_fsop_lookup_file(sr, cr, flags, root_node, dnode, name,
+ &sname, ret_snode);
+
+ if (rc != 0 || sname == NULL)
+ return (rc);
+
+ tmp_node = *ret_snode;
+ rc = smb_fsop_lookup_stream(sr, cr, flags, root_node, tmp_node, sname,
+ ret_snode);
+ kmem_free(sname, MAXNAMELEN);
+ smb_node_release(tmp_node);
+
+ return (rc);
+}
+
+/*
+ * smb_fsop_lookup_file()
+ *
+ * Look up the file portion of 'name'. If a stream is specified,
+ * return the stream name in 'sname', which this allocates.
+ * The caller must free 'sname'.
+ *
+ * Return an error if the looked-up file is outside the tree.
+ * (Required when invoked from open path.)
+ *
+ * Case sensitivity flags (SMB_IGNORE_CASE, SMB_CASE_SENSITIVE):
+ * if SMB_CASE_SENSITIVE is set, the SMB_IGNORE_CASE flag will NOT be set
+ * based on the tree's case sensitivity. However, if the SMB_IGNORE_CASE
+ * flag is set in the flags value passed as a parameter, a case insensitive
+ * lookup WILL be done (regardless of whether SMB_CASE_SENSITIVE is set
+ * or not).
+ */
+
+int
+smb_fsop_lookup_file(
+ smb_request_t *sr,
+ cred_t *cr,
+ int flags,
+ smb_node_t *root_node,
+ smb_node_t *dnode,
+ char *name,
+ char **sname,
+ smb_node_t **ret_snode)
+{
char *fname;
- char *sname;
int rc;
ASSERT(cr);
ASSERT(dnode);
ASSERT(dnode->n_magic == SMB_NODE_MAGIC);
ASSERT(dnode->n_state != SMB_NODE_STATE_DESTROYING);
+ ASSERT(ret_snode != NULL);
/*
* The following check is required for streams processing, below
@@ -1782,11 +1881,11 @@ smb_fsop_lookup_name(
flags |= SMB_IGNORE_CASE;
}
- fname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
- sname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
-
+ *sname = NULL;
if (smb_is_stream_name(name)) {
- smb_stream_parse_name(name, fname, sname);
+ *sname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+ fname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+ smb_stream_parse_name(name, fname, *sname);
/*
* Look up the unnamed stream (i.e. fname).
@@ -1794,49 +1893,8 @@ smb_fsop_lookup_name(
* as well as any link target.
*/
rc = smb_fsop_lookup(sr, cr, flags, root_node, dnode,
- fname, &fnode);
-
- if (rc != 0) {
- kmem_free(fname, MAXNAMELEN);
- kmem_free(sname, MAXNAMELEN);
- return (rc);
- }
-
- od_name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
-
- /*
- * od_name is the on-disk name of the stream, except
- * without the prepended stream prefix (SMB_STREAM_PREFIX)
- */
-
- /*
- * XXX
- * What permissions NTFS requires for stream lookup if any?
- */
- rc = smb_vop_stream_lookup(fnode->vp, sname, &vp, od_name,
- &xattrdirvp, flags, root_node->vp, cr);
-
- if (rc != 0) {
- smb_node_release(fnode);
- kmem_free(fname, MAXNAMELEN);
- kmem_free(sname, MAXNAMELEN);
- kmem_free(od_name, MAXNAMELEN);
- return (rc);
- }
-
- *ret_snode = smb_stream_node_lookup(sr, cr, fnode, xattrdirvp,
- vp, od_name);
-
- kmem_free(od_name, MAXNAMELEN);
- smb_node_release(fnode);
- VN_RELE(xattrdirvp);
- VN_RELE(vp);
-
- if (*ret_snode == NULL) {
- kmem_free(fname, MAXNAMELEN);
- kmem_free(sname, MAXNAMELEN);
- return (ENOMEM);
- }
+ fname, ret_snode);
+ kmem_free(fname, MAXNAMELEN);
} else {
rc = smb_fsop_lookup(sr, cr, flags, root_node, dnode, name,
ret_snode);
@@ -1851,8 +1909,66 @@ smb_fsop_lookup_name(
}
}
- kmem_free(fname, MAXNAMELEN);
- kmem_free(sname, MAXNAMELEN);
+ if (rc != 0 && *sname != NULL) {
+ kmem_free(*sname, MAXNAMELEN);
+ *sname = NULL;
+ }
+ return (rc);
+}
+
+/*
+ * smb_fsop_lookup_stream
+ *
+ * The file exists; see if the stream exists.
+ */
+int
+smb_fsop_lookup_stream(
+ smb_request_t *sr,
+ cred_t *cr,
+ int flags,
+ smb_node_t *root_node,
+ smb_node_t *fnode,
+ char *sname,
+ smb_node_t **ret_snode)
+{
+ char *od_name;
+ vnode_t *xattrdirvp;
+ vnode_t *vp;
+ int rc;
+
+ /*
+ * The following check is required for streams processing, below
+ */
+
+ if (!(flags & SMB_CASE_SENSITIVE)) {
+ if (SMB_TREE_IS_CASEINSENSITIVE(sr))
+ flags |= SMB_IGNORE_CASE;
+ }
+
+ od_name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+
+ /*
+ * od_name is the on-disk name of the stream, without the
+ * prepended stream prefix (SMB_STREAM_PREFIX).
+ */
+
+ rc = smb_vop_stream_lookup(fnode->vp, sname, &vp, od_name,
+ &xattrdirvp, flags, root_node->vp, cr);
+
+ if (rc != 0) {
+ kmem_free(od_name, MAXNAMELEN);
+ return (rc);
+ }
+
+ *ret_snode = smb_stream_node_lookup(sr, cr, fnode, xattrdirvp,
+ vp, od_name);
+
+ kmem_free(od_name, MAXNAMELEN);
+ VN_RELE(xattrdirvp);
+ VN_RELE(vp);
+
+ if (*ret_snode == NULL)
+ return (ENOMEM);
return (rc);
}
@@ -2391,6 +2507,8 @@ smb_fsop_sdmerge(smb_request_t *sr, smb_node_t *snode, smb_fssd_t *fs_sd)
* owner has been specified. Callers should translate this to
* STATUS_INVALID_OWNER which is not the normal mapping for EPERM
* in upper layers, so EPERM is mapped to EBADE.
+ *
+ * If 'overwrite' is non-zero, then the existing ACL is ignored.
*/
int
smb_fsop_sdwrite(smb_request_t *sr, cred_t *cr, smb_node_t *snode,
@@ -2456,14 +2574,13 @@ smb_fsop_sdwrite(smb_request_t *sr, cred_t *cr, smb_node_t *snode,
}
if (fs_sd->sd_secinfo & SMB_ACL_SECINFO) {
- if (overwrite == 0) {
+ if (overwrite == 0)
error = smb_fsop_sdmerge(sr, snode, fs_sd);
- if (error)
- return (error);
- }
- error = smb_fsop_aclwrite(sr, cr, snode, fs_sd);
- if (error) {
+ if (error == 0)
+ error = smb_fsop_aclwrite(sr, cr, snode, fs_sd);
+
+ if (error != 0) {
/*
* Revert uid/gid changes if required.
*/
@@ -2511,39 +2628,46 @@ smb_fsop_sdinherit(smb_request_t *sr, smb_node_t *dnode, smb_fssd_t *fs_sd)
acl_t *sacl = NULL;
int is_dir;
int error;
+ uint32_t secinfo;
+ smb_fssd_t pfs_sd;
ASSERT(fs_sd);
- if (sr->tid_tree->t_acltype != ACE_T) {
- /*
- * No forced inheritance for non-ZFS filesystems.
- */
- fs_sd->sd_secinfo = 0;
+ secinfo = fs_sd->sd_secinfo;
+
+ /* Anything to do? */
+ if ((secinfo & SMB_ACL_SECINFO) == SMB_ACL_SECINFO)
+ return (0);
+
+ /*
+ * No forced inheritance for non-ZFS filesystems.
+ */
+ if (sr->tid_tree->t_acltype != ACE_T)
return (0);
- }
+ smb_fssd_init(&pfs_sd, SMB_ACL_SECINFO, fs_sd->sd_flags);
/* Fetch parent directory's ACL */
- error = smb_fsop_sdread(sr, zone_kcred(), dnode, fs_sd);
+ error = smb_fsop_sdread(sr, zone_kcred(), dnode, &pfs_sd);
if (error) {
return (error);
}
is_dir = (fs_sd->sd_flags & SMB_FSSD_FLAGS_DIR);
- dacl = smb_fsacl_inherit(fs_sd->sd_zdacl, is_dir, SMB_DACL_SECINFO,
- sr->user_cr);
- sacl = smb_fsacl_inherit(fs_sd->sd_zsacl, is_dir, SMB_SACL_SECINFO,
- sr->user_cr);
-
- if (sacl == NULL)
- fs_sd->sd_secinfo &= ~SMB_SACL_SECINFO;
-
- smb_fsacl_free(fs_sd->sd_zdacl);
- smb_fsacl_free(fs_sd->sd_zsacl);
+ if ((secinfo & SMB_DACL_SECINFO) == 0) {
+ dacl = smb_fsacl_inherit(pfs_sd.sd_zdacl, is_dir,
+ SMB_DACL_SECINFO, sr->user_cr);
+ fs_sd->sd_zdacl = dacl;
+ }
- fs_sd->sd_zdacl = dacl;
- fs_sd->sd_zsacl = sacl;
+ if ((secinfo & SMB_SACL_SECINFO) == 0) {
+ sacl = smb_fsacl_inherit(pfs_sd.sd_zsacl, is_dir,
+ SMB_SACL_SECINFO, sr->user_cr);
+ fs_sd->sd_zsacl = sacl;
+ }
+ smb_fsacl_free(pfs_sd.sd_zdacl);
+ smb_fsacl_free(pfs_sd.sd_zsacl);
return (0);
}
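The practical effect: when a client-supplied SD carries, say, only an
owner and a DACL, the SACL is now inherited from the parent directory,
where the old code skipped inheritance entirely whenever the client
sent an SD.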
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/fs/smbsrv/smb_idmap.c b/usr/src/uts/common/fs/smbsrv/smb_idmap.c
index b9bfa991c4..e6c04193b0 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_idmap.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_idmap.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2020 Nexenta by DDN, Inc. All rights reserved.
*/
/*
@@ -83,12 +83,12 @@ smb_idmap_getsid(uid_t id, int idtype, smb_sid_t **sid)
switch (idtype) {
case SMB_IDMAP_USER:
- sim.sim_stat = kidmap_getsidbyuid(global_zone, id,
+ sim.sim_stat = kidmap_getsidbyuid(curzone, id,
(const char **)&sim.sim_domsid, &sim.sim_rid);
break;
case SMB_IDMAP_GROUP:
- sim.sim_stat = kidmap_getsidbygid(global_zone, id,
+ sim.sim_stat = kidmap_getsidbygid(curzone, id,
(const char **)&sim.sim_domsid, &sim.sim_rid);
break;
@@ -150,17 +150,17 @@ smb_idmap_getid(smb_sid_t *sid, uid_t *id, int *idtype)
switch (*idtype) {
case SMB_IDMAP_USER:
- sim.sim_stat = kidmap_getuidbysid(global_zone, sim.sim_domsid,
+ sim.sim_stat = kidmap_getuidbysid(curzone, sim.sim_domsid,
sim.sim_rid, sim.sim_id);
break;
case SMB_IDMAP_GROUP:
- sim.sim_stat = kidmap_getgidbysid(global_zone, sim.sim_domsid,
+ sim.sim_stat = kidmap_getgidbysid(curzone, sim.sim_domsid,
sim.sim_rid, sim.sim_id);
break;
case SMB_IDMAP_UNKNOWN:
- sim.sim_stat = kidmap_getpidbysid(global_zone, sim.sim_domsid,
+ sim.sim_stat = kidmap_getpidbysid(curzone, sim.sim_domsid,
sim.sim_rid, sim.sim_id, &sim.sim_idtype);
break;
@@ -186,7 +186,7 @@ smb_idmap_batch_create(smb_idmap_batch_t *sib, uint16_t nmap, int flags)
bzero(sib, sizeof (smb_idmap_batch_t));
- sib->sib_idmaph = kidmap_get_create(global_zone);
+ sib->sib_idmaph = kidmap_get_create(curzone);
sib->sib_flags = flags;
sib->sib_nmap = nmap;
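Using curzone instead of global_zone makes these idmap lookups
zone-aware, so an SMB server running in a non-global zone consults that
zone's idmap service rather than the global zone's.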
diff --git a/usr/src/uts/common/fs/smbsrv/smb_mbuf_marshaling.c b/usr/src/uts/common/fs/smbsrv/smb_mbuf_marshaling.c
index 132820a147..1476850683 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_mbuf_marshaling.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_mbuf_marshaling.c
@@ -22,7 +22,7 @@
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
- * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2019 Nexenta by DDN, Inc. All rights reserved.
*/
/*
@@ -800,7 +800,7 @@ smb_mbc_poke(mbuf_chain_t *mbc, int offset, const char *fmt, ...)
*/
int
smb_mbc_copy(mbuf_chain_t *dst_mbc, const mbuf_chain_t *src_mbc,
- int copy_offset, int copy_len)
+ int copy_offset, int copy_len)
{
mbuf_t *src_m;
int offset, len;
@@ -1109,8 +1109,6 @@ mbc_marshal_put_oem_string(mbuf_chain_t *mbc, char *mbs, int repc)
*/
if (repc <= 0)
repc = oemlen + 1;
- if (mbc_marshal_make_room(mbc, repc))
- return (DECODE_NO_MORE_DATA);
/*
* Convert into a temporary buffer
@@ -1133,6 +1131,10 @@ mbc_marshal_put_oem_string(mbuf_chain_t *mbc, char *mbs, int repc)
*/
s = oembuf;
while (repc > 0) {
+ if (mbc_marshal_make_room(mbc, 1)) {
+ rc = DECODE_NO_MORE_DATA;
+ goto out;
+ }
mbc_marshal_store_byte(mbc, *s);
if (*s != '\0')
s++;
@@ -1158,6 +1160,7 @@ mbc_marshal_put_unicode_string(mbuf_chain_t *mbc, char *mbs, int repc)
{
smb_wchar_t *wcsbuf = NULL;
smb_wchar_t *wp;
+ smb_wchar_t wchar;
size_t wcslen, wcsbytes;
size_t rlen;
int rc;
@@ -1183,8 +1186,6 @@ mbc_marshal_put_unicode_string(mbuf_chain_t *mbc, char *mbs, int repc)
*/
if (repc <= 0)
repc = wcsbytes + 2;
- if (mbc_marshal_make_room(mbc, repc))
- return (DECODE_NO_MORE_DATA);
/*
* Convert into a temporary buffer
@@ -1208,18 +1209,27 @@ mbc_marshal_put_unicode_string(mbuf_chain_t *mbc, char *mbs, int repc)
* little-endian order while copying.
*/
wp = wcsbuf;
- while (repc > 1) {
- smb_wchar_t wchar = LE_IN16(wp);
+ while (repc >= sizeof (smb_wchar_t)) {
+ if (mbc_marshal_make_room(mbc, sizeof (smb_wchar_t))) {
+ rc = DECODE_NO_MORE_DATA;
+ goto out;
+ }
+ wchar = LE_IN16(wp);
mbc_marshal_store_byte(mbc, wchar);
mbc_marshal_store_byte(mbc, wchar >> 8);
if (wchar != 0)
wp++;
repc -= sizeof (smb_wchar_t);
}
- if (repc > 0)
+ if (repc > 0) {
+ if (mbc_marshal_make_room(mbc, 1)) {
+ rc = DECODE_NO_MORE_DATA;
+ goto out;
+ }
mbc_marshal_store_byte(mbc, 0);
-
+ }
rc = 0;
+
out:
if (wcsbuf != NULL)
smb_mem_free(wcsbuf);
diff --git a/usr/src/uts/common/fs/smbsrv/smb_node.c b/usr/src/uts/common/fs/smbsrv/smb_node.c
index 8ce3e70712..a204326514 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_node.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_node.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2019 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2020 Tintri by DDN, Inc. All rights reserved.
*/
/*
* SMB Node State Machine
@@ -478,6 +478,18 @@ smb_node_release(smb_node_t *node)
case SMB_NODE_STATE_AVAILABLE:
node->n_state = SMB_NODE_STATE_DESTROYING;
+
+ /*
+ * While we still hold n_mutex,
+ * make sure FEM hooks are gone.
+ */
+ if (node->n_fcn_count > 0) {
+ DTRACE_PROBE1(fem__fcn__dangles,
+ smb_node_t *, node);
+ node->n_fcn_count = 0;
+ (void) smb_fem_fcn_uninstall(node);
+ }
+
mutex_exit(&node->n_mutex);
smb_llist_enter(node->n_hash_bucket, RW_WRITER);
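
The release hunk above forces any dangling FCN hooks down while n_mutex is still held, so a racing unsubscribe cannot run the teardown a second time. A hedged pthreads sketch of the same shape (all names hypothetical):

#include <pthread.h>

enum { AVAILABLE, DESTROYING };

struct node {
	pthread_mutex_t	n_mutex;
	int		n_state;
	int		n_fcn_count;	/* outstanding FCN hooks */
};

/* Hypothetical hook teardown; stands in for smb_fem_fcn_uninstall(). */
static void
hooks_uninstall(struct node *n)
{
	(void) n;
}

static void
node_destroy(struct node *n)
{
	pthread_mutex_lock(&n->n_mutex);
	n->n_state = DESTROYING;
	/*
	 * Tear down any dangling hooks while the lock is held, so a
	 * concurrent unsubscribe cannot see a half-destroyed node.
	 */
	if (n->n_fcn_count > 0) {
		n->n_fcn_count = 0;
		hooks_uninstall(n);
	}
	pthread_mutex_unlock(&n->n_mutex);
}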
@@ -883,8 +895,9 @@ smb_node_fcn_unsubscribe(smb_node_t *node)
mutex_enter(&node->n_mutex);
node->n_fcn_count--;
- if (node->n_fcn_count == 0)
- smb_fem_fcn_uninstall(node);
+ if (node->n_fcn_count == 0) {
+ VERIFY0(smb_fem_fcn_uninstall(node));
+ }
mutex_exit(&node->n_mutex);
}
@@ -1479,6 +1492,7 @@ smb_node_setattr(smb_request_t *sr, smb_node_t *node,
int rc;
uint_t times_mask;
smb_attr_t tmp_attr;
+ smb_node_t *unnamed_node;
SMB_NODE_VALID(node);
@@ -1543,14 +1557,6 @@ smb_node_setattr(smb_request_t *sr, smb_node_t *node,
}
/*
- * If we have an open file, and we set the size,
- * then set the "written" flag so that at close,
- * we can force an mtime update.
- */
- if (of != NULL && (attr->sa_mask & SMB_AT_SIZE) != 0)
- of->f_written = B_TRUE;
-
- /*
* When operating on an open file, some settable attributes
* become "sticky" in the open file object until close.
* (see above re. timestamps)
@@ -1615,6 +1621,13 @@ smb_node_setattr(smb_request_t *sr, smb_node_t *node,
FILE_ACTION_MODIFIED, node->od_name);
}
+ if ((unnamed_node = SMB_IS_STREAM(node)) != NULL) {
+ ASSERT(unnamed_node->n_magic == SMB_NODE_MAGIC);
+ ASSERT(unnamed_node->n_state != SMB_NODE_STATE_DESTROYING);
+ smb_node_notify_change(node->n_dnode,
+ FILE_ACTION_MODIFIED_STREAM, node->od_name);
+ }
+
return (0);
}
diff --git a/usr/src/uts/common/fs/smbsrv/smb_notify.c b/usr/src/uts/common/fs/smbsrv/smb_notify.c
index fda9197e6e..602fa1db3b 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_notify.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_notify.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2020 Tintri by DDN, Inc. All rights reserved.
*/
/*
@@ -97,8 +97,8 @@
* smb_notify_act1:
* Validate parameters, setup ofile buffer.
* If data already available, return it, all done.
- * (In the "all done" case, skip act2 & act3.)
- * If no data available, return a special error
+ * (In the "all done" case, skip act2 & act3.)
+ * If no data available, return a special error
* ("STATUS_PENDING") to tell the caller they must
* proceed with calls to act2 & act3.
*
@@ -201,6 +201,15 @@ smb_notify_act1(smb_request_t *sr, uint32_t buflen, uint32_t filter)
mutex_enter(&of->f_mutex);
/*
+ * It's possible this ofile has started closing, in which case
+ * we must not subscribe it for events etc.
+ */
+ if (of->f_state != SMB_OFILE_STATE_OPEN) {
+ mutex_exit(&of->f_mutex);
+ return (NT_STATUS_FILE_CLOSED);
+ }
+
+ /*
* On the first FCN call with this ofile, subscribe to
* events on the node. The corresponding unsubscribe
* happens in smb_ofile_delete().
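
The new check is the usual pattern of validating object state under the same lock the close path uses to change it. A small sketch with assumed names:

#include <pthread.h>

enum of_state { OF_OPEN, OF_CLOSING, OF_CLOSED };

struct ofile {
	pthread_mutex_t	f_mutex;
	enum of_state	f_state;
	int		f_subscribed;
};

#define	EFILECLOSED	1	/* stands in for NT_STATUS_FILE_CLOSED */

static int
notify_subscribe(struct ofile *of)
{
	pthread_mutex_lock(&of->f_mutex);
	if (of->f_state != OF_OPEN) {
		/* Close has started; do not subscribe for events. */
		pthread_mutex_unlock(&of->f_mutex);
		return (EFILECLOSED);
	}
	of->f_subscribed = 1;
	pthread_mutex_unlock(&of->f_mutex);
	return (0);
}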
diff --git a/usr/src/uts/common/fs/smbsrv/smb_ofile.c b/usr/src/uts/common/fs/smbsrv/smb_ofile.c
index d5388037c3..1d7a5c134f 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_ofile.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_ofile.c
@@ -22,7 +22,7 @@
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2016 Syneto S.R.L. All rights reserved.
* Copyright (c) 2016 by Delphix. All rights reserved.
- * Copyright 2019 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2020 Tintri by DDN, Inc. All rights reserved.
*/
/*
@@ -446,10 +446,23 @@ void
smb_ofile_close(smb_ofile_t *of, int32_t mtime_sec)
{
smb_attr_t *pa;
- timestruc_t now;
SMB_OFILE_VALID(of);
+ if (of->f_ftype == SMB_FTYPE_DISK) {
+ smb_node_t *node = of->f_node;
+
+ smb_llist_enter(&node->n_ofile_list, RW_READER);
+ mutex_enter(&node->n_oplock.ol_mutex);
+
+ if (of->f_lease != NULL)
+ smb2_lease_ofile_close(of);
+ smb_oplock_break_CLOSE(node, of);
+
+ mutex_exit(&node->n_oplock.ol_mutex);
+ smb_llist_exit(&node->n_ofile_list);
+ }
+
mutex_enter(&of->f_mutex);
ASSERT(of->f_refcnt);
@@ -480,9 +493,6 @@ smb_ofile_close(smb_ofile_t *of, int32_t mtime_sec)
smb2_dh_close_persistent(of);
if (of->f_persistid != 0)
smb_ofile_del_persistid(of);
- if (of->f_lease != NULL)
- smb2_lease_ofile_close(of);
- smb_oplock_break_CLOSE(of->f_node, of);
/* FALLTHROUGH */
case SMB_FTYPE_PRINTER: /* or FTYPE_DISK */
@@ -498,20 +508,6 @@ smb_ofile_close(smb_ofile_t *of, int32_t mtime_sec)
pa->sa_mask |= SMB_AT_MTIME;
}
- /*
- * If we have ever modified data via this handle
- * (write or truncate) and if the mtime was not
- * set via this handle, update the mtime again
- * during the close. Windows expects this.
- * [ MS-FSA 2.1.5.4 "Update Timestamps" ]
- */
- if (of->f_written &&
- (pa->sa_mask & SMB_AT_MTIME) == 0) {
- pa->sa_mask |= SMB_AT_MTIME;
- gethrestime(&now);
- pa->sa_vattr.va_mtime = now;
- }
-
if (of->f_flags & SMB_OFLAGS_SET_DELETE_ON_CLOSE) {
/* We delete using the on-disk name. */
uint32_t flags = SMB_CASE_SENSITIVE;
@@ -1457,11 +1453,18 @@ smb_ofile_delete(void *arg)
*/
if (of->f_ftype == SMB_FTYPE_DISK ||
of->f_ftype == SMB_FTYPE_PRINTER) {
- ASSERT(of->f_node != NULL);
+ smb_node_t *node = of->f_node;
+
+ /*
+ * Oplock cleanup should have made sure that
+ * excl_open does not point to this ofile.
+ */
+ VERIFY(node->n_oplock.excl_open != of);
+
/*
* Note smb_ofile_close did smb_node_dec_open_ofiles()
*/
- smb_node_rem_ofile(of->f_node, of);
+ smb_node_rem_ofile(node, of);
}
/*
diff --git a/usr/src/uts/common/fs/smbsrv/smb_sd.c b/usr/src/uts/common/fs/smbsrv/smb_sd.c
index ddbd7b9413..946503fa8f 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_sd.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_sd.c
@@ -22,7 +22,7 @@
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
- * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2021 Tintri by DDN, Inc. All rights reserved.
*/
/*
@@ -243,28 +243,52 @@ smb_sd_tofs(smb_sd_t *sd, smb_fssd_t *fs_sd)
}
}
+ /*
+ * In SMB, the 'secinfo' determines which parts of the SD the client
+ * intends to change. Notably, this includes changing the DACL_PRESENT
+ * and SACL_PRESENT control bits. The client can specify e.g.
+ * SACL_SECINFO, but not SACL_PRESENT, and this means the client intends
+ * to remove the SACL.
+ *
+ * Note that Windows behavior differs from that described in [MS-DTYP].
+ * MS-DTYP states that the offset is nonzero if-and-only-if the PRESENT
+ * bit is set. It also states that a DACL that is marked non-present
+ * is equivalent to 'no security', but one that is marked present and
+ * provides no ACEs is equivalent to 'no access'.
+ *
+ * Windows, on the other hand, allows the offset to be 0 even when
+ * the PRESENT bit is set, and only provides security when the DACL
+ * offset is non-zero. It will also convert an SD where the DACL is
+ * marked not-present to one where the PRESENT bit is set and the
+ * offset is 0.
+ *
+ * If the *_PRESENT bit isn't set, then the respective ACL will be NULL.
+ * For the fssd, we allow the SACL to be NULL, but we MUST have a DACL.
+ * If the DACL is NULL, that's equivalent to "everyone:full_set:allow".
+ *
+ * The IMPLY() assertions below should be enforced by smb_decode_sd().
+ */
+
/* DACL */
if (fs_sd->sd_secinfo & SMB_DACL_SECINFO) {
- if (sd->sd_control & SE_DACL_PRESENT) {
- status = smb_acl_to_zfs(sd->sd_dacl, flags,
- SMB_DACL_SECINFO, &fs_sd->sd_zdacl);
- if (status != NT_STATUS_SUCCESS)
- return (status);
- }
- else
- return (NT_STATUS_INVALID_ACL);
+ IMPLY(sd->sd_dacl != NULL,
+ (sd->sd_control & SE_DACL_PRESENT) != 0);
+ status = smb_acl_to_zfs(sd->sd_dacl, flags,
+ SMB_DACL_SECINFO, &fs_sd->sd_zdacl);
+ if (status != NT_STATUS_SUCCESS)
+ return (status);
}
/* SACL */
if (fs_sd->sd_secinfo & SMB_SACL_SECINFO) {
+ IMPLY(sd->sd_sacl != NULL,
+ (sd->sd_control & SE_SACL_PRESENT) != 0);
if (sd->sd_control & SE_SACL_PRESENT) {
status = smb_acl_to_zfs(sd->sd_sacl, flags,
SMB_SACL_SECINFO, &fs_sd->sd_zsacl);
if (status != NT_STATUS_SUCCESS) {
return (status);
}
- } else {
- return (NT_STATUS_INVALID_ACL);
}
}
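
A compact illustration of the invariant those IMPLY() assertions capture: a non-NULL ACL implies its PRESENT control bit, while a present-but-NULL DACL is legal and means "everyone:full_set:allow". The constants shown are the usual Windows values but should be treated as assumptions here:

#include <stddef.h>

#define	SE_DACL_PRESENT	0x0004
#define	SE_SACL_PRESENT	0x0010

struct acl;	/* opaque for this sketch */

struct sd {
	unsigned	sd_control;
	struct acl	*sd_dacl;
	struct acl	*sd_sacl;
};

/*
 * Returns nonzero when a decoded SD obeys the invariant: a non-NULL
 * ACL implies its PRESENT bit.  (A NULL ACL with the bit set remains
 * legal; for the DACL it grants everyone full access.)
 */
static int
sd_invariant_ok(const struct sd *sd)
{
	if (sd->sd_dacl != NULL && (sd->sd_control & SE_DACL_PRESENT) == 0)
		return (0);
	if (sd->sd_sacl != NULL && (sd->sd_control & SE_SACL_PRESENT) == 0)
		return (0);
	return (1);
}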
diff --git a/usr/src/uts/common/fs/smbsrv/smb_server.c b/usr/src/uts/common/fs/smbsrv/smb_server.c
index 13df16f55d..3b69a5699b 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_server.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_server.c
@@ -22,6 +22,7 @@
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2017 by Delphix. All rights reserved.
* Copyright 2019 Nexenta by DDN, Inc. All rights reserved.
+ * Copyright 2020 RackTop Systems, Inc.
*/
/*
@@ -2094,6 +2095,7 @@ smb_server_store_cfg(smb_server_t *sv, smb_ioc_cfg_t *ioc)
sv->sv_cfg.skc_max_protocol = ioc->max_protocol;
sv->sv_cfg.skc_min_protocol = ioc->min_protocol;
sv->sv_cfg.skc_encrypt = ioc->encrypt;
+ sv->sv_cfg.skc_encrypt_cipher = ioc->encrypt_cipher;
sv->sv_cfg.skc_execflags = ioc->exec_flags;
sv->sv_cfg.skc_negtok_len = ioc->negtok_len;
sv->sv_cfg.skc_version = ioc->version;
diff --git a/usr/src/uts/common/fs/smbsrv/smb_session.c b/usr/src/uts/common/fs/smbsrv/smb_session.c
index 17bbc16e72..6739fee326 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_session.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_session.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2019 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2020 RackTop Systems, Inc.
*/
#include <sys/atomic.h>
@@ -857,6 +858,9 @@ smb_session_delete(smb_session_t *session)
session->signing.mackey_len);
}
+ if (session->preauth_mech != NULL)
+ smb31_preauth_fini(session);
+
session->s_magic = 0;
smb_rwx_destroy(&session->s_lock);
diff --git a/usr/src/uts/common/fs/smbsrv/smb_sign_kcf.c b/usr/src/uts/common/fs/smbsrv/smb_sign_kcf.c
index 55f4bc9d0e..44aa6ba117 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_sign_kcf.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_sign_kcf.c
@@ -32,7 +32,7 @@
* Common function to see if a mech is available.
*/
static int
-find_mech(smb_crypto_mech_t *mech, crypto_mech_name_t name)
+find_mech(smb_crypto_mech_t *mech, const char *name)
{
crypto_mech_type_t t;
diff --git a/usr/src/uts/common/fs/smbsrv/smb_srv_oplock.c b/usr/src/uts/common/fs/smbsrv/smb_srv_oplock.c
index 12d425d438..d4811f6857 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_srv_oplock.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_srv_oplock.c
@@ -10,7 +10,7 @@
*/
/*
- * Copyright 2019 Nexenta by DDN, Inc. All rights reserved.
+ * Copyright 2021 Tintri by DDN, Inc. All rights reserved.
*/
/*
@@ -173,7 +173,8 @@ smb_oplock_ind_break_in_ack(smb_request_t *ack_sr, smb_ofile_t *ofile,
* We're going to schedule a request that will have a
* reference to this ofile. Get the hold first.
*/
- if (!smb_ofile_hold_olbrk(ofile)) {
+ if (ofile->f_oplock.og_closing ||
+ !smb_ofile_hold_olbrk(ofile)) {
/* It's closing (or whatever). Nothing to do. */
return;
}
@@ -264,7 +265,8 @@ smb_oplock_ind_break(smb_ofile_t *ofile, uint32_t NewLevel,
* We're going to schedule a request that will have a
* reference to this ofile. Get the hold first.
*/
- if (!smb_ofile_hold_olbrk(ofile)) {
+ if (ofile->f_oplock.og_closing ||
+ !smb_ofile_hold_olbrk(ofile)) {
/* It's closing (or whatever). Nothing to do. */
return;
}
diff --git a/usr/src/uts/common/fs/smbsrv/smb_tree.c b/usr/src/uts/common/fs/smbsrv/smb_tree.c
index aedacf2123..45f381ffb1 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_tree.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_tree.c
@@ -21,8 +21,8 @@
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2018 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright 2021 Tintri by DDN, Inc. All rights reserved.
*/
/*
@@ -188,6 +188,7 @@ static void smb_tree_dealloc(void *);
static boolean_t smb_tree_is_connected_locked(smb_tree_t *);
static char *smb_tree_get_sharename(char *);
static int smb_tree_getattr(const smb_kshare_t *, smb_node_t *, smb_tree_t *);
+static void smb_tree_get_creation(smb_node_t *, smb_tree_t *);
static void smb_tree_get_volname(vfs_t *, smb_tree_t *);
static void smb_tree_get_flags(const smb_kshare_t *, vfs_t *, smb_tree_t *);
static void smb_tree_log(smb_request_t *, const char *, const char *, ...);
@@ -917,10 +918,6 @@ smb_tree_alloc(smb_request_t *sr, const smb_kshare_t *si,
tree->t_session = session;
tree->t_server = session->s_server;
- /* grab a ref for tree->t_owner */
- smb_user_hold_internal(sr->uid_user);
- tree->t_owner = sr->uid_user;
-
if (STYPE_ISDSK(stype) || STYPE_ISPRN(stype)) {
if (smb_tree_getattr(si, snode, tree) != 0) {
smb_idpool_free(&session->s_tid_pool, tid);
@@ -964,6 +961,10 @@ smb_tree_alloc(smb_request_t *sr, const smb_kshare_t *si,
tree->t_connect_time = gethrestime_sec();
tree->t_execflags = execflags;
+ /* grab a ref for tree->t_owner */
+ smb_user_hold_internal(sr->uid_user);
+ tree->t_owner = sr->uid_user;
+
/* if FS is readonly, enforce that here */
if (tree->t_flags & SMB_TREE_READONLY)
tree->t_access &= ~ACE_ALL_WRITE_PERMS;
@@ -1099,15 +1100,29 @@ static int
smb_tree_getattr(const smb_kshare_t *si, smb_node_t *node, smb_tree_t *tree)
{
vfs_t *vfsp = SMB_NODE_VFS(node);
+ vfs_t *realvfsp;
smb_cfg_val_t srv_encrypt;
ASSERT(vfsp);
- if (getvfs(&vfsp->vfs_fsid) != vfsp)
- return (ESTALE);
-
+ smb_tree_get_creation(node, tree);
smb_tree_get_volname(vfsp, tree);
- smb_tree_get_flags(si, vfsp, tree);
+
+ /*
+ * In the case of an lofs mount, we need to ask the (real)
+ * underlying filesystem about capabilities, where the
+ * passed in vfs_t will be from lofs.
+ */
+ realvfsp = getvfs(&vfsp->vfs_fsid);
+ if (realvfsp != NULL) {
+ smb_tree_get_flags(si, realvfsp, tree);
+ VFS_RELE(realvfsp);
+ } else {
+ cmn_err(CE_NOTE, "Failed getting info for share: %s",
+ si->shr_name);
+ /* do the best we can without realvfsp */
+ smb_tree_get_flags(si, vfsp, tree);
+ }
srv_encrypt = tree->t_session->s_server->sv_cfg.skc_encrypt;
if (tree->t_session->dialect >= SMB_VERS_3_0) {
@@ -1122,11 +1137,27 @@ smb_tree_getattr(const smb_kshare_t *si, smb_node_t *node, smb_tree_t *tree)
} else
tree->t_encrypt = SMB_CONFIG_DISABLED;
- VFS_RELE(vfsp);
return (0);
}
/*
+ * File volume creation time
+ */
+static void
+smb_tree_get_creation(smb_node_t *node, smb_tree_t *tree)
+{
+ smb_attr_t attr;
+ cred_t *kcr = zone_kcred();
+
+ bzero(&attr, sizeof (attr));
+ attr.sa_mask = SMB_AT_CRTIME;
+ (void) smb_node_getattr(NULL, node, kcr, NULL, &attr);
+ /* On failure we'll have time zero, which is OK */
+
+ tree->t_create_time = attr.sa_crtime;
+}
+
+/*
* Extract the volume name.
*/
static void
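
The lofs change above follows the standard getvfs() discipline: the returned vfs_t is held and must be dropped with VFS_RELE(), and a NULL return (stale fsid) degrades to best effort. A self-contained sketch with stub stand-ins for the kernel interfaces (all names assumed):

#include <stddef.h>

struct vfs { int v_dummy; };

/* Stub for getvfs(): returns a held vfs or NULL on a stale fsid. */
static struct vfs *
getvfs_byid(int fsid)
{
	(void) fsid;
	return (NULL);
}

static void vfs_release(struct vfs *vfsp) { (void) vfsp; }
static void get_flags(struct vfs *vfsp) { (void) vfsp; }

static void
tree_get_flags(struct vfs *lofs_vfsp, int fsid)
{
	struct vfs *realvfsp = getvfs_byid(fsid);

	if (realvfsp != NULL) {
		get_flags(realvfsp);	/* ask the real filesystem */
		vfs_release(realvfsp);	/* drop the hold the lookup took */
	} else {
		get_flags(lofs_vfsp);	/* best effort with the lofs vfs_t */
	}
}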
diff --git a/usr/src/uts/common/fs/smbsrv/smb_user.c b/usr/src/uts/common/fs/smbsrv/smb_user.c
index b46cad1b6f..8934a213eb 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_user.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_user.c
@@ -205,6 +205,8 @@
#include <sys/types.h>
#include <sys/sid.h>
#include <sys/priv_names.h>
+#include <sys/priv.h>
+#include <sys/policy.h>
#include <smbsrv/smb_kproto.h>
#include <smbsrv/smb_door.h>
@@ -831,6 +833,45 @@ smb_user_setcred(smb_user_t *user, cred_t *cr, uint32_t privileges)
#endif /* _KERNEL */
/*
+ * Determines whether a user can be granted ACCESS_SYSTEM_SECURITY
+ */
+boolean_t
+smb_user_has_security_priv(smb_user_t *user, cred_t *cr)
+{
+ /* Need SeSecurityPrivilege to get/set SACL */
+ if ((user->u_privileges & SMB_USER_PRIV_SECURITY) != 0)
+ return (B_TRUE);
+
+#ifdef _KERNEL
+ /*
+ * ACCESS_SYSTEM_SECURITY is also granted if the file is opened with
+ * BACKUP/RESTORE intent by a user with BACKUP/RESTORE privilege,
+ * which means we'll be using u_privcred.
+ *
+ * We translate BACKUP as DAC_READ and RESTORE as DAC_WRITE,
+ * to account for our various SMB_USER_* privileges.
+ */
+ if (PRIV_POLICY_ONLY(cr,
+ priv_getbyname(PRIV_FILE_DAC_READ, 0), B_FALSE) ||
+ PRIV_POLICY_ONLY(cr,
+ priv_getbyname(PRIV_FILE_DAC_WRITE, 0), B_FALSE))
+ return (B_TRUE);
+#else
+ /*
+ * No "real" privileges in fksmbsrv, so use the SMB privs instead.
+ */
+ if ((user->u_privileges &
+ (SMB_USER_PRIV_BACKUP |
+ SMB_USER_PRIV_RESTORE |
+ SMB_USER_PRIV_READ_FILE |
+ SMB_USER_PRIV_WRITE_FILE)) != 0)
+ return (B_TRUE);
+#endif
+
+ return (B_FALSE);
+}
+
+/*
* Private function to support smb_user_enum.
*/
static int
@@ -959,6 +1000,9 @@ smb_is_same_user(cred_t *cr1, cred_t *cr2)
ksid_t *ks1 = crgetsid(cr1, KSID_USER);
ksid_t *ks2 = crgetsid(cr2, KSID_USER);
+ if (ks1 == NULL || ks2 == NULL) {
+ return (B_FALSE);
+ }
return (ks1->ks_rid == ks2->ks_rid &&
strcmp(ks1->ks_domain->kd_name, ks2->ks_domain->kd_name) == 0);
}
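
A hypothetical caller-side sketch of how a helper like smb_user_has_security_priv() would gate the ACCESS_SYSTEM_SECURITY bit during an open; the constant and names are assumptions, not the illumos code:

#define	ACCESS_SYSTEM_SECURITY	0x01000000	/* assumed Windows access bit */
#define	EACCESS_DENIED		1		/* stands in for an NT status */

struct user;
struct cred;

/* Stub standing in for smb_user_has_security_priv(). */
static int
has_security_priv(struct user *u, struct cred *cr)
{
	(void) u;
	(void) cr;
	return (0);
}

static int
check_desired_access(struct user *u, struct cred *cr, unsigned desired)
{
	/* SACL access requires SeSecurityPrivilege (or backup/restore). */
	if ((desired & ACCESS_SYSTEM_SECURITY) != 0 &&
	    !has_security_priv(u, cr))
		return (EACCESS_DENIED);
	return (0);
}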
diff --git a/usr/src/uts/common/fs/smbsrv/smb_write.c b/usr/src/uts/common/fs/smbsrv/smb_write.c
index 6db8cc9e1a..fbf85da282 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_write.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_write.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2020 Tintri by DDN, Inc. All rights reserved.
*/
#include <sys/sdt.h>
@@ -517,17 +517,6 @@ smb_common_write(smb_request_t *sr, smb_rw_param_t *param)
if (rc)
return (rc);
- /*
- * Used to have code here to set mtime.
- * We have just done a write, so we know
- * the file system will update mtime.
- * No need to do it again here.
- *
- * However, keep track of the fact that
- * we have written data via this handle.
- */
- ofile->f_written = B_TRUE;
-
/* This revokes read cache delegations. */
(void) smb_oplock_break_WRITE(node, ofile);
diff --git a/usr/src/uts/common/fs/sockfs/nl7c.c b/usr/src/uts/common/fs/sockfs/nl7c.c
index c76dada8d7..a71572cbd4 100644
--- a/usr/src/uts/common/fs/sockfs/nl7c.c
+++ b/usr/src/uts/common/fs/sockfs/nl7c.c
@@ -598,7 +598,7 @@ done:
/*
* Open and read each line from "/etc/nca/ncalogd.conf" and parse for
- * the tokens and token text (i.e. key and value ncalogd.conf(4)):
+ * the tokens and token text (i.e. key and value ncalogd.conf(5)):
*
* status=enabled
*
diff --git a/usr/src/uts/common/fs/sockfs/nl7clogd.c b/usr/src/uts/common/fs/sockfs/nl7clogd.c
index 1580a08c6c..4dd40abf2d 100644
--- a/usr/src/uts/common/fs/sockfs/nl7clogd.c
+++ b/usr/src/uts/common/fs/sockfs/nl7clogd.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/fcntl.h>
@@ -53,7 +51,7 @@ static void logit_flush(void *);
* NL7C reuses the NCA logging scheme, the directory "/var/nca" contains
* the symlink "current" to 1 of up to 16 NCA BLF logging files, by default
* a single logging file "log", optionally paths of up to 16 log files can
- * be specified via ncalogd.conf(4), note that these log files need not be
+ * be specified via ncalogd.conf(5), note that these log files need not be
* in the "/var/nca" directory.
*
* NL7C reuses the NCA logging APIs defined in <inet/nca/ncalogd.h>, at
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon.c b/usr/src/uts/common/fs/sockfs/sockcommon.c
index e7d69f9896..edcb41951c 100644
--- a/usr/src/uts/common/fs/sockfs/sockcommon.c
+++ b/usr/src/uts/common/fs/sockfs/sockcommon.c
@@ -458,16 +458,16 @@ sonode_constructor(void *buf, void *cdrarg, int kmflags)
vp->v_data = so;
vn_setops(vp, socket_vnodeops);
- so->so_priv = NULL;
+ so->so_priv = NULL;
so->so_oobmsg = NULL;
so->so_proto_handle = NULL;
- so->so_peercred = NULL;
+ so->so_peercred = NULL;
so->so_rcv_queued = 0;
- so->so_rcv_q_head = NULL;
- so->so_rcv_q_last_head = NULL;
+ so->so_rcv_q_head = NULL;
+ so->so_rcv_q_last_head = NULL;
so->so_rcv_head = NULL;
so->so_rcv_last_head = NULL;
so->so_rcv_wanted = 0;
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
index bc2878ccc8..59d052084f 100644
--- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
+++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
@@ -174,7 +174,7 @@ so_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
/*
* Force a zero sa_family to match so_family.
*
- * Some programs like inetd(1M) don't set the
+ * Some programs like inetd(8) don't set the
* family field. Other programs leave
* sin_family set to garbage - SunOS 4.X does
* not check the family field on a bind.
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_vnops.c b/usr/src/uts/common/fs/sockfs/sockcommon_vnops.c
index d49bdbcc6d..532a24c223 100644
--- a/usr/src/uts/common/fs/sockfs/sockcommon_vnops.c
+++ b/usr/src/uts/common/fs/sockfs/sockcommon_vnops.c
@@ -25,6 +25,7 @@
*/
/*
* Copyright (c) 2017 by Delphix. All rights reserved.
+ * Copyright 2021 Racktop Systems, Inc.
*/
#include <sys/types.h>
@@ -68,19 +69,19 @@ static int socket_vop_ioctl(struct vnode *, int, intptr_t, int,
struct cred *, int32_t *, caller_context_t *);
static int socket_vop_setfl(struct vnode *, int, int, cred_t *,
caller_context_t *);
-static int socket_vop_getattr(struct vnode *, struct vattr *, int,
+static int socket_vop_getattr(struct vnode *, struct vattr *, int,
struct cred *, caller_context_t *);
-static int socket_vop_setattr(struct vnode *, struct vattr *, int,
+static int socket_vop_setattr(struct vnode *, struct vattr *, int,
struct cred *, caller_context_t *);
-static int socket_vop_access(struct vnode *, int, int, struct cred *,
+static int socket_vop_access(struct vnode *, int, int, struct cred *,
caller_context_t *);
-static int socket_vop_fsync(struct vnode *, int, struct cred *,
+static int socket_vop_fsync(struct vnode *, int, struct cred *,
caller_context_t *);
static void socket_vop_inactive(struct vnode *, struct cred *,
caller_context_t *);
-static int socket_vop_fid(struct vnode *, struct fid *,
+static int socket_vop_fid(struct vnode *, struct fid *,
caller_context_t *);
-static int socket_vop_seek(struct vnode *, offset_t, offset_t *,
+static int socket_vop_seek(struct vnode *, offset_t, offset_t *,
caller_context_t *);
static int socket_vop_poll(struct vnode *, short, int, short *,
struct pollhead **, caller_context_t *);
@@ -282,16 +283,23 @@ socket_vop_getattr(struct vnode *vp, struct vattr *vap, int flags,
struct cred *cr, caller_context_t *ct)
{
dev_t fsid;
- struct sonode *so;
+ struct sonode *so;
static int sonode_shift = 0;
/*
* Calculate the amount of bitshift to a sonode pointer which will
- * still keep it unique. See below.
+ * still keep it unique. See below. Note that highbit() uses
+ * 1-based indexing for the highest bit set (and 0 for 'no bits set').
+ * To use the result of highbit() as a shift value, we must subtract 1
+ * from the result.
*/
- if (sonode_shift == 0)
- sonode_shift = highbit(sizeof (struct sonode));
- ASSERT(sonode_shift > 0);
+ if (sonode_shift == 0) {
+ int bit = highbit(sizeof (struct sonode));
+
+ /* Sanity check */
+ VERIFY3S(bit, >, 0);
+ sonode_shift = bit - 1;
+ }
so = VTOSO(vp);
fsid = sockdev;
@@ -311,11 +319,17 @@ socket_vop_getattr(struct vnode *vp, struct vattr *vap, int flags,
vap->va_uid = vap->va_gid = 0;
vap->va_fsid = fsid;
/*
- * If the va_nodeid is > MAX_USHORT, then i386 stats might fail.
- * So we shift down the sonode pointer to try and get the most
- * uniqueness into 16-bits.
+ * If the va_nodeid is > UINT32_MAX, then stat(2) might fail in
+ * unexpected ways inside non-largefile aware 32-bit processes --
+ * historically, socket inode values (va_nodeid values) were capped at
+ * UINT16_MAX (for even more ancient reasons long since unnecessary).
+ * To avoid the potential of surprise failures, we shift down
+	 * the sonode pointer address to try to get the most
+ * uniqueness into 32-bits. In practice, this represents the unique
+ * portion of the kernel address space, so the chance of duplicate
+ * socket inode values is minimized.
*/
- vap->va_nodeid = ((ino_t)so >> sonode_shift) & 0xFFFF;
+ vap->va_nodeid = ((ino_t)so >> sonode_shift) & 0xFFFFFFFF;
vap->va_nlink = 0;
vap->va_size = 0;
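
The corrected shift depends on highbit() being 1-based. A small self-contained illustration (user-space stand-in for the kernel's highbit(); the example size is hypothetical):

#include <assert.h>

/* 1-based index of the highest bit set; 0 if no bits are set. */
static int
highbit(unsigned long v)
{
	int b = 0;

	while (v != 0) {
		b++;
		v >>= 1;
	}
	return (b);
}

int
main(void)
{
	/* hypothetical sizeof (struct sonode): 2^9 < 848 < 2^10 */
	assert(highbit(848) == 10);

	/*
	 * Shifting by highbit() - 1 keeps the top set bit; shifting by
	 * highbit() itself would discard it (the old off-by-one).
	 */
	assert((848UL >> (highbit(848) - 1)) == 1);
	assert((848UL >> highbit(848)) == 0);
	return (0);
}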
diff --git a/usr/src/uts/common/fs/sockfs/sockfilter.c b/usr/src/uts/common/fs/sockfs/sockfilter.c
index ea161e30ae..62a079f419 100644
--- a/usr/src/uts/common/fs/sockfs/sockfilter.c
+++ b/usr/src/uts/common/fs/sockfs/sockfilter.c
@@ -42,7 +42,7 @@
*
* Socket filter entry (sof_entry_t):
*
- * There exists one entry for each configured filter (done via soconfig(1M)),
+ * There exists one entry for each configured filter (done via soconfig(8)),
* and they are all in sof_entry_list. In addition to the global list, each
* sockparams entry maintains a list of filters that is interested in that
* particular socket type. So the filter entry may be referenced by multiple
diff --git a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h
index cf2ad8b20d..e63831e172 100644
--- a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h
+++ b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h
@@ -80,7 +80,7 @@ struct sof_entry_kstat {
/*
* Socket filter entry - one for each configured filter (added and
- * removed by soconfig(1M)).
+ * removed by soconfig(8)).
*
* sofe_flags, sofe_refcnt and sofe_mod are protected by sofe_lock, and all
* other fields are write once.
@@ -106,7 +106,7 @@ struct sof_entry {
/* Filter entry flags */
#define SOFEF_AUTO 0x1 /* automatic filter */
#define SOFEF_PROG 0x2 /* programmatic filter */
-#define SOFEF_CONDEMED 0x4 /* removed by soconfig(1M) */
+#define SOFEF_CONDEMED 0x4 /* removed by soconfig(8) */
/*
* Socket filter instance - one for each socket using a sof_entry_t
diff --git a/usr/src/uts/common/fs/sockfs/sockparams.c b/usr/src/uts/common/fs/sockfs/sockparams.c
index 1015decaac..86cbced50c 100644
--- a/usr/src/uts/common/fs/sockfs/sockparams.c
+++ b/usr/src/uts/common/fs/sockfs/sockparams.c
@@ -64,7 +64,7 @@ static int sockparams_sdev_init(struct sockparams *, char *, int);
static void sockparams_sdev_fini(struct sockparams *);
/*
- * Global sockparams list (populated via soconfig(1M)).
+ * Global sockparams list (populated via soconfig(8)).
*/
static list_t sphead;
diff --git a/usr/src/uts/common/fs/sockfs/socksubr.c b/usr/src/uts/common/fs/sockfs/socksubr.c
index 2c010343bb..e686978fd0 100644
--- a/usr/src/uts/common/fs/sockfs/socksubr.c
+++ b/usr/src/uts/common/fs/sockfs/socksubr.c
@@ -23,8 +23,8 @@
* Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2015, Joyent, Inc. All rights reserved.
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
- * Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
* Copyright 2015, Joyent, Inc. All rights reserved.
+ * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
*/
#include <sys/types.h>
@@ -962,7 +962,46 @@ so_closefds(void *control, t_uscalar_t controllen, int oldflg,
(int)CMSG_CONTENTLEN(cmsg),
startoff - (int)sizeof (struct cmsghdr));
}
- startoff -= cmsg->cmsg_len;
+ startoff -= ROUNDUP_cmsglen(cmsg->cmsg_len);
+ }
+}
+
+/*
+ * Handle truncation of a cmsg when the receive buffer is not big enough.
+ * Adjust the cmsg_len header field in the last cmsg that will be included in
+ * the buffer to reflect the number of bytes included.
+ */
+void
+so_truncatecmsg(void *control, t_uscalar_t controllen, uint_t maxlen)
+{
+ struct cmsghdr *cmsg;
+ uint_t len = 0;
+
+ if (control == NULL)
+ return;
+
+ for (cmsg = control;
+ CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
+ cmsg = CMSG_NEXT(cmsg)) {
+
+ len += ROUNDUP_cmsglen(cmsg->cmsg_len);
+
+ if (len > maxlen) {
+ /*
+ * This cmsg is the last one that will be included in
+ * the truncated buffer.
+ */
+ socklen_t diff = len - maxlen;
+
+ if (diff < CMSG_CONTENTLEN(cmsg)) {
+ dprint(1, ("so_truncatecmsg: %d -> %d\n",
+ cmsg->cmsg_len, cmsg->cmsg_len - diff));
+ cmsg->cmsg_len -= diff;
+ } else {
+ cmsg->cmsg_len = sizeof (struct cmsghdr);
+ }
+ break;
+ }
}
}
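
Both hunks above walk control messages by their rounded-up lengths. A short arithmetic sketch, with an alignment macro standing in for the kernel's ROUNDUP_cmsglen() and an assumed header size:

#include <assert.h>
#include <stddef.h>

#define	CMSG_ALIGN_	8
#define	ROUNDUP_LEN(l)	(((l) + CMSG_ALIGN_ - 1) & ~(size_t)(CMSG_ALIGN_ - 1))

int
main(void)
{
	size_t hdr = 12;		/* assumed sizeof (struct cmsghdr) */
	size_t cmsg_len = hdr + 1;	/* header plus one byte of data */

	/*
	 * Advancing by the raw cmsg_len (13) would misalign the next
	 * header; advancing by the rounded length (16) matches the
	 * layout, which is what the so_closefds() hunk fixes.
	 */
	assert(ROUNDUP_LEN(cmsg_len) == 16);

	/*
	 * Truncation rule from so_truncatecmsg(): if only 14 of the 16
	 * bytes fit, trim the content length; if the cut swallows the
	 * content entirely, keep just the header.
	 */
	size_t maxlen = 14;
	size_t diff = ROUNDUP_LEN(cmsg_len) - maxlen;	/* 2 */

	if (diff < cmsg_len - hdr)	/* 2 < 1 is false */
		cmsg_len -= diff;
	else
		cmsg_len = hdr;
	assert(cmsg_len == hdr);
	return (0);
}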
@@ -1282,8 +1321,24 @@ so_opt2cmsg(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg,
cmsg->cmsg_level = tohp->level;
cmsg->cmsg_type = tohp->name;
- cmsg->cmsg_len = (socklen_t)(_TPI_TOPT_DATALEN(tohp) +
- sizeof (struct cmsghdr));
+ cmsg->cmsg_len = (socklen_t)sizeof (struct cmsghdr);
+ if (tohp->level == IPPROTO_IP &&
+ (tohp->name == IP_RECVTOS ||
+ tohp->name == IP_RECVTTL)) {
+ /*
+ * The data for these is a uint8_t but, in
+ * order to maintain alignment for any
+ * following TPI primitives in the message,
+ * there will be some trailing padding bytes
+ * which are included in the TPI_TOPT_DATALEN.
+ * For these types, we set the cmsg_len
+ * explicitly to the correct value.
+ */
+ cmsg->cmsg_len += (socklen_t)sizeof (uint8_t);
+ } else {
+ cmsg->cmsg_len +=
+ (socklen_t)(_TPI_TOPT_DATALEN(tohp));
+ }
/* copy content to control data part */
bcopy(&tohp[1], CMSG_CONTENT(cmsg),
diff --git a/usr/src/uts/common/fs/sockfs/socksyscalls.c b/usr/src/uts/common/fs/sockfs/socksyscalls.c
index 6a049b1828..30666f73ca 100644
--- a/usr/src/uts/common/fs/sockfs/socksyscalls.c
+++ b/usr/src/uts/common/fs/sockfs/socksyscalls.c
@@ -24,6 +24,7 @@
* Copyright 2015, Joyent, Inc. All rights reserved.
* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
*/
#include <sys/types.h>
@@ -831,7 +832,7 @@ recvit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags,
void *name;
socklen_t namelen;
void *control;
- socklen_t controllen;
+ socklen_t controllen, free_controllen;
ssize_t len;
int error;
@@ -858,6 +859,8 @@ recvit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags,
lwp_stat_update(LWP_STAT_MSGRCV, 1);
releasef(sock);
+ free_controllen = msg->msg_controllen;
+
error = copyout_name(name, namelen, namelenp,
msg->msg_name, msg->msg_namelen);
if (error)
@@ -887,11 +890,7 @@ recvit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags,
goto err;
}
}
- /*
- * Note: This MUST be done last. There can be no "goto err" after this
- * point since it could make so_closefds run twice on some part
- * of the file descriptor array.
- */
+
if (controllen != 0) {
if (!(flags & MSG_XPG4_2)) {
/*
@@ -900,36 +899,65 @@ recvit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags,
*/
controllen &= ~((int)sizeof (uint32_t) - 1);
}
+
+ if (msg->msg_controllen > controllen || control == NULL) {
+ /*
+ * If the truncated part contains file descriptors,
+ * then they must be closed in the kernel as they
+ * will not be included in the data returned to
+ * user space. Close them now so that the header size
+ * can be safely adjusted prior to copyout. In case of
+ * an error during copyout, the remaining file
+ * descriptors will be closed in the error handler
+ * below.
+ */
+ so_closefds(msg->msg_control, msg->msg_controllen,
+ !(flags & MSG_XPG4_2),
+ control == NULL ? 0 : controllen);
+
+ /*
+ * In the case of a truncated control message, the last
+ * cmsg header that fits into the available buffer
+ * space must be adjusted to reflect the actual amount
+ * of associated data that will be returned. This only
+ * needs to be done for XPG4 messages as non-XPG4
+ * messages are not structured (they are just a
+ * buffer and a length - msg_accrights(len)).
+ */
+ if (control != NULL && (flags & MSG_XPG4_2)) {
+ so_truncatecmsg(msg->msg_control,
+ msg->msg_controllen, controllen);
+ msg->msg_controllen = controllen;
+ }
+ }
+
error = copyout_arg(control, controllen, controllenp,
msg->msg_control, msg->msg_controllen);
+
if (error)
goto err;
- if (msg->msg_controllen > controllen || control == NULL) {
- if (control == NULL)
- controllen = 0;
- so_closefds(msg->msg_control, msg->msg_controllen,
- !(flags & MSG_XPG4_2), controllen);
- }
}
if (msg->msg_namelen != 0)
kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
- if (msg->msg_controllen != 0)
- kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
+ if (free_controllen != 0)
+ kmem_free(msg->msg_control, (size_t)free_controllen);
return (len - uiop->uio_resid);
err:
/*
* If we fail and the control part contains file descriptors
- * we have to close the fd's.
+ * we have to close them. For a truncated control message, the
+ * descriptors which were cut off have already been closed and the
+ * length adjusted so that they will not be closed again.
*/
if (msg->msg_controllen != 0)
so_closefds(msg->msg_control, msg->msg_controllen,
!(flags & MSG_XPG4_2), 0);
if (msg->msg_namelen != 0)
kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
- if (msg->msg_controllen != 0)
- kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
+ if (free_controllen != 0)
+ kmem_free(msg->msg_control, (size_t)free_controllen);
return (set_errno(error));
}
diff --git a/usr/src/uts/common/fs/sockfs/socktpi.c b/usr/src/uts/common/fs/sockfs/socktpi.c
index b8d83105e8..0e9883498b 100644
--- a/usr/src/uts/common/fs/sockfs/socktpi.c
+++ b/usr/src/uts/common/fs/sockfs/socktpi.c
@@ -865,7 +865,7 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
/*
* Force a zero sa_family to match so_family.
*
- * Some programs like inetd(1M) don't set the
+ * Some programs like inetd(8) don't set the
* family field. Other programs leave
* sin_family set to garbage - SunOS 4.X does
* not check the family field on a bind.
@@ -6518,7 +6518,7 @@ socktpi_init(void)
{
/*
* Create sonode caches. We create a special one for AF_UNIX so
- * that we can track them for netstat(1m).
+ * that we can track them for netstat(8).
*/
socktpi_cache = kmem_cache_create("socktpi_cache",
sizeof (struct sotpi_sonode), 0, socktpi_constructor,
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_dir.c b/usr/src/uts/common/fs/tmpfs/tmp_dir.c
index 1a620642cc..b28ced7111 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_dir.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_dir.c
@@ -54,6 +54,11 @@ static int tdiraddentry(struct tmpnode *, struct tmpnode *, char *,
#define T_HASH_SIZE 8192 /* must be power of 2 */
#define T_MUTEX_SIZE 64
+/* Non-static so compilers won't constant-fold these away. */
+clock_t tmpfs_rename_backoff_delay = 1;
+unsigned int tmpfs_rename_backoff_tries = 0;
+unsigned long tmpfs_rename_loops = 0;
+
static struct tdirent *t_hashtable[T_HASH_SIZE];
static kmutex_t t_hashmutex[T_MUTEX_SIZE];
@@ -266,8 +271,65 @@ tdirenter(
* to see if it has been removed while it was unlocked.
*/
if (op == DE_LINK || op == DE_RENAME) {
- if (tp != dir)
- rw_enter(&tp->tn_rwlock, RW_WRITER);
+ if (tp != dir) {
+ unsigned int tries = 0;
+
+ /*
+ * If we are acquiring tp->tn_rwlock (for SOURCE)
+ * inside here, we must consider the following:
+ *
+ * - dir->tn_rwlock (TARGET) is already HELD (see
+ * above ASSERT()).
+ *
+ * - It is possible our SOURCE is a parent of our
+ * TARGET. Yes it's unusual, but it will return an
+ * error below via tdircheckpath().
+ *
+ * - It is also possible that another thread,
+ * concurrent to this one, is performing
+ * rmdir(TARGET), which means it will first acquire
+ * SOURCE's lock, THEN acquire TARGET's lock, which
+ * could result in this thread holding TARGET and
+ * trying for SOURCE, but the other thread holding
+ * SOURCE and trying for TARGET. This is deadlock,
+ * and it's inducible.
+ *
+ * To prevent this, we borrow some techniques from UFS
+ * and rw_tryenter(), delaying if we fail, and
+ * if someone tweaks the number of backoff tries to be
+ * nonzero, return EBUSY after that number of tries.
+ */
+ while (!rw_tryenter(&tp->tn_rwlock, RW_WRITER)) {
+ /*
+ * Sloppy, but this is a diagnostic so atomic
+ * increment would be overkill.
+ */
+ tmpfs_rename_loops++;
+
+ if (tmpfs_rename_backoff_tries != 0) {
+ if (tries > tmpfs_rename_backoff_tries)
+ return (EBUSY);
+ tries++;
+ }
+ /*
+ * NOTE: We're still holding dir->tn_rwlock,
+ * so drop it over the delay, so any other
+ * thread can get its business done.
+ *
+ * No state change or state inspection happens
+ * prior to here, so it is not wholly dangerous
+ * to release-and-reacquire dir->tn_rwlock.
+ *
+ * Hold the vnode of dir in case it gets
+ * released by another thread, though.
+ */
+ VN_HOLD(TNTOV(dir));
+ rw_exit(&dir->tn_rwlock);
+ delay(tmpfs_rename_backoff_delay);
+ rw_enter(&dir->tn_rwlock, RW_WRITER);
+ VN_RELE(TNTOV(dir));
+ }
+ }
mutex_enter(&tp->tn_tlock);
if (tp->tn_nlink == 0) {
mutex_exit(&tp->tn_tlock);
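
The loop above never blocks on SOURCE while holding TARGET, which is what breaks the AB/BA deadlock with a concurrent rmdir. A pthreads sketch of the same discipline (names and the fixed delay are simplifications):

#include <pthread.h>
#include <unistd.h>
#include <errno.h>

static pthread_rwlock_t target_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_rwlock_t source_lock = PTHREAD_RWLOCK_INITIALIZER;

/*
 * Caller already holds target_lock (like dir->tn_rwlock above).
 * Returns 0 with both locks held, or EBUSY after max_tries attempts
 * (max_tries == 0 means retry forever, as in the kernel default).
 */
static int
lock_source_no_deadlock(unsigned max_tries)
{
	unsigned tries = 0;

	while (pthread_rwlock_trywrlock(&source_lock) != 0) {
		if (max_tries != 0 && tries++ > max_tries)
			return (EBUSY);
		/*
		 * Drop TARGET over the delay so the thread holding
		 * SOURCE (e.g. a concurrent rmdir) can take TARGET,
		 * finish, and release both.
		 */
		pthread_rwlock_unlock(&target_lock);
		usleep(1000);
		pthread_rwlock_wrlock(&target_lock);
	}
	return (0);
}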
@@ -928,7 +990,7 @@ tdiraddentry(
tm = TNTOTM(dir);
namelen = strlen(name) + 1;
alloc_size = namelen + sizeof (struct tdirent);
- tdp = tmp_kmem_zalloc(tm, alloc_size, KM_NOSLEEP | KM_NORMALPRI);
+ tdp = tmp_kmem_zalloc(tm, alloc_size, KM_NOSLEEP_LAZY);
if (tdp == NULL)
return (ENOSPC);
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c
index c52a6f7c77..24310fefe5 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c
@@ -325,8 +325,8 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
goto out;
}
- if ((tm = kmem_zalloc(sizeof (struct tmount),
- KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
+ if ((tm = kmem_zalloc(sizeof (struct tmount), KM_NOSLEEP_LAZY)) ==
+ NULL) {
pn_free(&dpn);
error = ENOMEM;
goto out;
diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c
index a356f22750..cbe19aefea 100644
--- a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c
+++ b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c
@@ -1645,7 +1645,7 @@ tmp_symlink(
return (error);
}
len = strlen(tnm) + 1;
- cp = tmp_kmem_zalloc(tm, len, KM_NOSLEEP | KM_NORMALPRI);
+ cp = tmp_kmem_zalloc(tm, len, KM_NOSLEEP_LAZY);
if (cp == NULL) {
tmpnode_rele(self);
return (ENOSPC);
diff --git a/usr/src/uts/common/fs/ufs/lufs_log.c b/usr/src/uts/common/fs/ufs/lufs_log.c
index 2ec3f7907c..052c53d507 100644
--- a/usr/src/uts/common/fs/ufs/lufs_log.c
+++ b/usr/src/uts/common/fs/ufs/lufs_log.c
@@ -1591,7 +1591,7 @@ ldl_seterror(ml_unit_t *ul, char *why)
cmn_err(CE_WARN, "%s", why);
cmn_err(CE_WARN, "ufs log for %s changed state to Error",
ul->un_ufsvfs->vfs_fs->fs_fsmnt);
- cmn_err(CE_WARN, "Please umount(1M) %s and run fsck(1M)",
+ cmn_err(CE_WARN, "Please umount(8) %s and run fsck(8)",
ul->un_ufsvfs->vfs_fs->fs_fsmnt);
/*
diff --git a/usr/src/uts/common/fs/ufs/ufs_alloc.c b/usr/src/uts/common/fs/ufs/ufs_alloc.c
index ee7d99143e..3b052f75c0 100644
--- a/usr/src/uts/common/fs/ufs/ufs_alloc.c
+++ b/usr/src/uts/common/fs/ufs/ufs_alloc.c
@@ -381,7 +381,7 @@ loop:
rw_exit(&ip->i_contents);
VN_RELE(ITOV(ip));
cmn_err(CE_WARN,
- "%s: unexpected allocated inode %d, run fsck(1M)%s",
+ "%s: unexpected allocated inode %d, run fsck(8)%s",
fs->fs_fsmnt, (int)ino,
(TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
goto loop;
@@ -406,7 +406,7 @@ loop:
if (ip->i_size) {
cmn_err(CE_WARN,
- "%s: free inode %d had size 0x%llx, run fsck(1M)%s",
+ "%s: free inode %d had size 0x%llx, run fsck(8)%s",
fs->fs_fsmnt, (int)ino, ip->i_size,
(TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
}
diff --git a/usr/src/uts/common/fs/ufs/ufs_dir.c b/usr/src/uts/common/fs/ufs/ufs_dir.c
index 8035e76025..02f7e57fcd 100644
--- a/usr/src/uts/common/fs/ufs/ufs_dir.c
+++ b/usr/src/uts/common/fs/ufs/ufs_dir.c
@@ -2870,9 +2870,6 @@ ufs_dirpurgedotdot(
 * Scan the directory. If clr_dotdot is true, clear the ..
* directory else check to see if the directory is empty.
*
- * Using a struct dirtemplate here is not precisely
- * what we want, but better than using a struct direct.
- *
* clr_dotdot is used as a flag to tell us if we need
* to clear the dotdot entry
*
@@ -2886,20 +2883,19 @@ ufs_dirscan(
int clr_dotdot)
{
offset_t off;
- struct dirtemplate dbuf;
- struct direct *dp = (struct direct *)&dbuf;
+ struct tmp_dir dbuf, *dp;
int err, count;
int empty = 1; /* Assume it's empty */
-#define MINDIRSIZ (sizeof (struct dirtemplate) / 2)
+ dp = &dbuf;
ASSERT(RW_LOCK_HELD(&ip->i_contents));
ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
for (off = 0; off < ip->i_size; off += dp->d_reclen) {
err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
- (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
+ sizeof (struct tmp_dir), off, UIO_SYSSPACE, &count, cr);
/*
- * Since we read MINDIRSIZ, residual must
+ * Since we read sizeof (struct tmp_dir), residual must
* be 0 unless we're at end of file.
*/
if (err || count != 0 || dp->d_reclen == 0) {
@@ -3108,20 +3104,19 @@ int
ufs_xattrdirempty(struct inode *ip, ino_t parentino, struct cred *cr)
{
offset_t off;
- struct dirtemplate dbuf;
- struct direct *dp = (struct direct *)&dbuf;
+ struct tmp_dir dbuf, *dp;
int err, count;
int empty = 1; /* Assume it's empty */
-#define MINDIRSIZ (sizeof (struct dirtemplate) / 2)
+ dp = &dbuf;
ASSERT(RW_LOCK_HELD(&ip->i_contents));
ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
for (off = 0; off < ip->i_size; off += dp->d_reclen) {
err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
- (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
+ sizeof (struct tmp_dir), off, UIO_SYSSPACE, &count, cr);
/*
- * Since we read MINDIRSIZ, residual must
+ * Since we read sizeof (struct tmp_dir), residual must
* be 0 unless we're at end of file.
*/
diff --git a/usr/src/uts/common/fs/ufs/ufs_inode.c b/usr/src/uts/common/fs/ufs/ufs_inode.c
index 05f23a6d29..35b66b203c 100644
--- a/usr/src/uts/common/fs/ufs/ufs_inode.c
+++ b/usr/src/uts/common/fs/ufs/ufs_inode.c
@@ -24,7 +24,7 @@
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
-/* All Rights Reserved */
+/* All Rights Reserved */
/*
* University Copyright- Copyright (c) 1982, 1986, 1988
@@ -107,6 +107,7 @@ union ihead *ihead; /* inode LRU cache, Chris Maltby */
kmutex_t *ih_lock; /* protect inode cache hash table */
static int ino_hashlen = 4; /* desired average hash chain length */
int inohsz; /* number of buckets in the hash table */
+struct timeval32 iuniqtime;
kmutex_t ufs_scan_lock; /* stop racing multiple ufs_scan_inodes() */
kmutex_t ufs_iuniqtime_lock; /* protect iuniqtime */
@@ -611,7 +612,7 @@ again:
vp->v_vfsp = &EIO_vfs;
VN_RELE(vp);
cmn_err(CE_NOTE,
- "%s: unexpected free inode %d, run fsck(1M)%s",
+ "%s: unexpected free inode %d, run fsck(8)%s",
fs->fs_fsmnt, (int)ino,
(TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
return (EIO);
@@ -838,8 +839,8 @@ ufs_iupdat(struct inode *ip, int waitfor)
struct buf *bp;
struct fs *fp;
struct dinode *dp;
- struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
- int i;
+ struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
+ int i;
int do_trans_times;
ushort_t flag;
o_uid_t suid;
diff --git a/usr/src/uts/common/fs/ufs/ufs_vfsops.c b/usr/src/uts/common/fs/ufs/ufs_vfsops.c
index afd43e7e63..390319bfef 100644
--- a/usr/src/uts/common/fs/ufs/ufs_vfsops.c
+++ b/usr/src/uts/common/fs/ufs/ufs_vfsops.c
@@ -983,7 +983,7 @@ mountfs(struct vfs *vfsp, enum whymountroot why, struct vnode *devvp,
*/
if (!(vfsp->vfs_flag & VFS_RDONLY)) {
cmn_err(CE_WARN, "Error accessing ufs "
- "log for %s; Please run fsck(1M)", path);
+ "log for %s; Please run fsck(8)", path);
goto out;
}
}
diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c
index 9e0b071999..953ee80471 100644
--- a/usr/src/uts/common/fs/vnode.c
+++ b/usr/src/uts/common/fs/vnode.c
@@ -21,9 +21,11 @@
/*
* Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2018, Joyent, Inc.
+ * Copyright 2020 Joyent, Inc.
+ * Copyright 2022 Spencer Evans-Cole.
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
*/
/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
@@ -51,6 +53,7 @@
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
+#include <sys/filio.h>
#include <sys/rwstlock.h>
#include <sys/fem.h>
#include <sys/stat.h>
@@ -841,18 +844,48 @@ done:
void
vn_rele(vnode_t *vp)
{
+ mutex_enter(&vp->v_lock);
+ if (vp->v_count == 1) {
+ mutex_exit(&vp->v_lock);
+ VOP_INACTIVE(vp, CRED(), NULL);
+ return;
+ }
VERIFY(vp->v_count > 0);
+ VN_RELE_LOCKED(vp);
+ mutex_exit(&vp->v_lock);
+}
+
+void
+vn_phantom_rele(vnode_t *vp)
+{
mutex_enter(&vp->v_lock);
+ VERIFY3U(vp->v_count, >=, vp->v_phantom_count);
+ vp->v_phantom_count--;
+ DTRACE_PROBE1(vn__phantom_rele, vnode_t *, vp);
if (vp->v_count == 1) {
+ ASSERT0(vp->v_phantom_count);
mutex_exit(&vp->v_lock);
VOP_INACTIVE(vp, CRED(), NULL);
return;
}
+ VERIFY(vp->v_count > 0);
VN_RELE_LOCKED(vp);
mutex_exit(&vp->v_lock);
}
/*
+ * Return the number of non-phantom holds. Subsystems such as portfs use
+ * phantom holds so that their references do not block filesystems from
+ * mounting over watched directories.
+ */
+uint_t
+vn_count(vnode_t *vp)
+{
+ ASSERT(MUTEX_HELD(&vp->v_lock));
+ return (vp->v_count - vp->v_phantom_count);
+}
+
+/*
* Release a vnode referenced by the DNLC. Multiple DNLC references are treated
* as a single reference, so v_count is not decremented until the last DNLC hold
* is released. This makes it possible to distinguish vnodes that are referenced
@@ -861,8 +894,8 @@ vn_rele(vnode_t *vp)
void
vn_rele_dnlc(vnode_t *vp)
{
- VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
mutex_enter(&vp->v_lock);
+ VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0));
if (--vp->v_count_dnlc == 0) {
if (vp->v_count == 1) {
mutex_exit(&vp->v_lock);
@@ -884,7 +917,6 @@ vn_rele_dnlc(vnode_t *vp)
void
vn_rele_stream(vnode_t *vp)
{
- VERIFY(vp->v_count > 0);
mutex_enter(&vp->v_lock);
vp->v_stream = NULL;
if (vp->v_count == 1) {
@@ -892,6 +924,7 @@ vn_rele_stream(vnode_t *vp)
VOP_INACTIVE(vp, CRED(), NULL);
return;
}
+ VERIFY(vp->v_count > 0);
VN_RELE_LOCKED(vp);
mutex_exit(&vp->v_lock);
}
@@ -915,7 +948,6 @@ vn_rele_inactive(vnode_t *vp)
void
vn_rele_async(vnode_t *vp, taskq_t *taskq)
{
- VERIFY(vp->v_count > 0);
mutex_enter(&vp->v_lock);
if (vp->v_count == 1) {
mutex_exit(&vp->v_lock);
@@ -923,6 +955,7 @@ vn_rele_async(vnode_t *vp, taskq_t *taskq)
vp, TQ_SLEEP) != TASKQID_INVALID);
return;
}
+ VERIFY(vp->v_count > 0);
VN_RELE_LOCKED(vp);
mutex_exit(&vp->v_lock);
}
@@ -1133,7 +1166,20 @@ top:
* Do remaining checks for FNOFOLLOW and FNOLINKS.
*/
if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
- error = ELOOP;
+ /*
+ * The __FLXPATH flag is a private interface for use by the lx
+ * brand in order to emulate open(O_NOFOLLOW|O_PATH) which,
+ * when a symbolic link is encountered, returns a file
+ * descriptor which references it.
+ * See uts/common/brand/lx/syscall/lx_open.c
+ *
+ * When this flag is set, VOP_OPEN() is not called (for a
+ * symlink, most filesystems will return ENOSYS anyway)
+ * and the link's vnode is returned to be linked to the
+ * file descriptor.
+ */
+ if ((filemode & __FLXPATH) == 0)
+ error = ELOOP;
goto out;
}
if (filemode & FNOLINKS) {
@@ -1223,6 +1269,22 @@ top:
if ((error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
goto out;
}
+
+ /*
+ * Turn on directio, if requested.
+ */
+ if (filemode & FDIRECT) {
+ if ((error = VOP_IOCTL(vp, _FIODIRECTIO, DIRECTIO_ON, 0,
+ CRED(), NULL, NULL)) != 0) {
+ /*
+ * On Linux, O_DIRECT returns EINVAL when the file
+ * system does not support directio, so we'll do the
+ * same.
+ */
+ error = EINVAL;
+ goto out;
+ }
+ }
out:
ASSERT(vp->v_count > 0);
@@ -2428,6 +2490,7 @@ vn_reinit(vnode_t *vp)
{
vp->v_count = 1;
vp->v_count_dnlc = 0;
+ vp->v_phantom_count = 0;
vp->v_vfsp = NULL;
vp->v_stream = NULL;
vp->v_vfsmountedhere = NULL;
@@ -2484,6 +2547,7 @@ vn_free(vnode_t *vp)
*/
ASSERT((vp->v_count == 0) || (vp->v_count == 1));
ASSERT(vp->v_count_dnlc == 0);
+ ASSERT0(vp->v_phantom_count);
VERIFY(vp->v_path != NULL);
if (vp->v_path != vn_vpath_empty) {
kmem_free(vp->v_path, strlen(vp->v_path) + 1);
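
The phantom-hold changes in this vnode.c diff keep two counters in lockstep: every phantom hold is also a regular hold, so v_count >= v_phantom_count at all times and vn_count() reports the difference. A toy model of the invariant (names assumed):

#include <assert.h>

struct vn {
	unsigned	v_count;		/* all holds */
	unsigned	v_phantom_count;	/* subset: phantom holds */
};

static void vn_hold(struct vn *v)		{ v->v_count++; }
static void vn_phantom_hold(struct vn *v)	{ v->v_count++;
						  v->v_phantom_count++; }

/* Holds that should block a mount over this vnode. */
static unsigned
vn_effective_count(const struct vn *v)
{
	assert(v->v_count >= v->v_phantom_count);
	return (v->v_count - v->v_phantom_count);
}

int
main(void)
{
	struct vn v = { 1, 0 };		/* fresh vnode: one hold */

	vn_phantom_hold(&v);		/* e.g. a portfs watch */
	assert(vn_effective_count(&v) == 1);	/* watch doesn't count */
	vn_hold(&v);
	assert(vn_effective_count(&v) == 2);
	return (0);
}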
diff --git a/usr/src/uts/common/fs/xattr.c b/usr/src/uts/common/fs/xattr.c
index 2326a42747..ffa68a362e 100644
--- a/usr/src/uts/common/fs/xattr.c
+++ b/usr/src/uts/common/fs/xattr.c
@@ -28,7 +28,7 @@
*
* The Solaris VFS layer presents extended file attributes using a special
* "XATTR" directory under files or directories that have extended file
- * attributes. See fsattr(5) for background.
+ * attributes. See fsattr(7) for background.
*
* This design avoids the need for a separate set of VFS or vnode functions
* for operating on XATTR objects. File system implementations that support
diff --git a/usr/src/uts/common/fs/zfs/abd.c b/usr/src/uts/common/fs/zfs/abd.c
index 66a7a49d73..b841a8f38e 100644
--- a/usr/src/uts/common/fs/zfs/abd.c
+++ b/usr/src/uts/common/fs/zfs/abd.c
@@ -12,6 +12,7 @@
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2019 by Delphix. All rights reserved.
+ * Copyright 2020 Joyent, Inc.
*/
/*
@@ -218,7 +219,7 @@ abd_init(void)
* Since ABD chunks do not appear in crash dumps, we pass KMC_NOTOUCH
* so that no allocator metadata is stored with the buffers.
*/
- abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0,
+ abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 64,
NULL, NULL, NULL, NULL, data_alloc_arena, KMC_NOTOUCH);
abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
@@ -764,7 +765,8 @@ abd_iter_map(struct abd_iter *aiter)
} else {
size_t index = abd_iter_scatter_chunk_index(aiter);
offset = abd_iter_scatter_chunk_offset(aiter);
- aiter->iter_mapsize = zfs_abd_chunk_size - offset;
+ aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset,
+ aiter->iter_abd->abd_size - aiter->iter_pos);
paddr = aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index];
}
aiter->iter_mapaddr = (char *)paddr + offset;
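
The one-line abd_iter_map() fix clamps the mapped segment to the smaller of what remains in the current chunk and what remains in the ABD itself. A quick arithmetic check with hypothetical sizes:

#include <assert.h>
#include <stddef.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	size_t chunk_size = 4096;	/* zfs_abd_chunk_size analogue */
	size_t abd_size = 5000;		/* scattered ABD spanning 2 chunks */
	size_t pos = 4608;		/* iterator position in chunk 2 */
	size_t offset = pos % chunk_size;	/* 512 into the chunk */

	/* Old code: rest of the chunk, overshooting the ABD by 3192 bytes. */
	assert(chunk_size - offset == 3584);

	/* Fixed: never map past the end of the ABD itself. */
	size_t mapsize = MIN(chunk_size - offset, abd_size - pos);
	assert(mapsize == 392);
	return (0);
}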
@@ -993,3 +995,180 @@ abd_cmp(abd_t *dabd, abd_t *sabd, size_t size)
{
return (abd_iterate_func2(dabd, sabd, 0, 0, size, abd_cmp_cb, NULL));
}
+
+/*
+ * Iterate over code ABDs and a data ABD and call @func_raidz_gen.
+ *
+ * @cabds parity ABDs, must have equal size
+ * @dabd data ABD. Can be NULL (in this case @dsize = 0)
+ * @func_raidz_gen should be implemented so that it behaves the same
+ * whether it is given linear or scattered buffers
+ */
+void
+abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
+ ssize_t csize, ssize_t dsize, const unsigned parity,
+ void (*func_raidz_gen)(void **, const void *, size_t, size_t))
+{
+ int i;
+ ssize_t len, dlen;
+ struct abd_iter caiters[3];
+ struct abd_iter daiter = {0};
+ void *caddrs[3];
+
+ ASSERT3U(parity, <=, 3);
+
+ for (i = 0; i < parity; i++)
+ abd_iter_init(&caiters[i], cabds[i]);
+
+ if (dabd)
+ abd_iter_init(&daiter, dabd);
+
+ ASSERT3S(dsize, >=, 0);
+
+#ifdef _KERNEL
+ kpreempt_disable();
+#endif
+ while (csize > 0) {
+ len = csize;
+
+ if (dabd && dsize > 0)
+ abd_iter_map(&daiter);
+
+ for (i = 0; i < parity; i++) {
+ abd_iter_map(&caiters[i]);
+ caddrs[i] = caiters[i].iter_mapaddr;
+ }
+
+ switch (parity) {
+ case 3:
+ len = MIN(caiters[2].iter_mapsize, len);
+ /* falls through */
+ case 2:
+ len = MIN(caiters[1].iter_mapsize, len);
+ /* falls through */
+ case 1:
+ len = MIN(caiters[0].iter_mapsize, len);
+ }
+
+ /* must be progressive */
+ ASSERT3S(len, >, 0);
+
+ if (dabd && dsize > 0) {
+ /* this needs precise iter.length */
+ len = MIN(daiter.iter_mapsize, len);
+ len = MIN(dsize, len);
+ dlen = len;
+ } else
+ dlen = 0;
+
+ /* must be progressive */
+ ASSERT3S(len, >, 0);
+ /*
+ * The iterated function likely will not do well if each
+	 * segment except the last one is not a multiple of 512 (raidz).
+ */
+ ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
+
+ func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen);
+
+ for (i = parity-1; i >= 0; i--) {
+ abd_iter_unmap(&caiters[i]);
+ abd_iter_advance(&caiters[i], len);
+ }
+
+ if (dabd && dsize > 0) {
+ abd_iter_unmap(&daiter);
+ abd_iter_advance(&daiter, dlen);
+ dsize -= dlen;
+ }
+
+ csize -= len;
+
+ ASSERT3S(dsize, >=, 0);
+ ASSERT3S(csize, >=, 0);
+ }
+#ifdef _KERNEL
+ kpreempt_enable();
+#endif
+}
+
+/*
+ * Iterate over code ABDs and data reconstruction target ABDs and call
+ * @func_raidz_rec. Function maps at most 6 pages atomically.
+ *
+ * @cabds parity ABDs, must have equal size
+ * @tabds rec target ABDs, at most 3
+ * @tsize size of data target columns
+ * @func_raidz_rec expects syndrome data in target columns. Function
+ * reconstructs data and overwrites target columns.
+ */
+void
+abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
+ ssize_t tsize, const unsigned parity,
+ void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
+ const unsigned *mul),
+ const unsigned *mul)
+{
+ int i;
+ ssize_t len;
+ struct abd_iter citers[3];
+ struct abd_iter xiters[3];
+ void *caddrs[3], *xaddrs[3];
+
+ ASSERT3U(parity, <=, 3);
+
+ for (i = 0; i < parity; i++) {
+ abd_iter_init(&citers[i], cabds[i]);
+ abd_iter_init(&xiters[i], tabds[i]);
+ }
+
+#ifdef _KERNEL
+ kpreempt_disable();
+#endif
+ while (tsize > 0) {
+
+ for (i = 0; i < parity; i++) {
+ abd_iter_map(&citers[i]);
+ abd_iter_map(&xiters[i]);
+ caddrs[i] = citers[i].iter_mapaddr;
+ xaddrs[i] = xiters[i].iter_mapaddr;
+ }
+
+ len = tsize;
+ switch (parity) {
+ case 3:
+ len = MIN(xiters[2].iter_mapsize, len);
+ len = MIN(citers[2].iter_mapsize, len);
+ /* falls through */
+ case 2:
+ len = MIN(xiters[1].iter_mapsize, len);
+ len = MIN(citers[1].iter_mapsize, len);
+ /* falls through */
+ case 1:
+ len = MIN(xiters[0].iter_mapsize, len);
+ len = MIN(citers[0].iter_mapsize, len);
+ }
+ /* must be progressive */
+ ASSERT3S(len, >, 0);
+ /*
+ * The iterated function likely will not do well if each
+	 * segment except the last one is not a multiple of 512 (raidz).
+ */
+ ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
+
+ func_raidz_rec(xaddrs, len, caddrs, mul);
+
+ for (i = parity-1; i >= 0; i--) {
+ abd_iter_unmap(&xiters[i]);
+ abd_iter_unmap(&citers[i]);
+ abd_iter_advance(&xiters[i], len);
+ abd_iter_advance(&citers[i], len);
+ }
+
+ tsize -= len;
+ ASSERT3S(tsize, >=, 0);
+ }
+#ifdef _KERNEL
+ kpreempt_enable();
+#endif
+}
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index 9e04e5e00d..12b5872cdc 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -24,6 +24,12 @@
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2011, 2019, Delphix. All rights reserved.
+ * Copyright (c) 2020, George Amanakis. All rights reserved.
+ * Copyright (c) 2020, The FreeBSD Foundation [1]
+ *
+ * [1] Portions of this software were developed by Allan Jude
+ * under sponsorship from the FreeBSD Foundation.
*/
/*
@@ -294,6 +300,7 @@
#include <sys/kstat.h>
#include <sys/zthr.h>
#include <zfs_fletcher.h>
+#include <sys/arc_impl.h>
#include <sys/aggsum.h>
#include <sys/cityhash.h>
#include <sys/param.h>
@@ -408,54 +415,6 @@ uint_t zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */
boolean_t zfs_compressed_arc_enabled = B_TRUE;
-/*
- * Note that buffers can be in one of 6 states:
- * ARC_anon - anonymous (discussed below)
- * ARC_mru - recently used, currently cached
- * ARC_mru_ghost - recentely used, no longer in cache
- * ARC_mfu - frequently used, currently cached
- * ARC_mfu_ghost - frequently used, no longer in cache
- * ARC_l2c_only - exists in L2ARC but not other states
- * When there are no active references to the buffer, they are
- * are linked onto a list in one of these arc states. These are
- * the only buffers that can be evicted or deleted. Within each
- * state there are multiple lists, one for meta-data and one for
- * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
- * etc.) is tracked separately so that it can be managed more
- * explicitly: favored over data, limited explicitly.
- *
- * Anonymous buffers are buffers that are not associated with
- * a DVA. These are buffers that hold dirty block copies
- * before they are written to stable storage. By definition,
- * they are "ref'd" and are considered part of arc_mru
- * that cannot be freed. Generally, they will aquire a DVA
- * as they are written and migrate onto the arc_mru list.
- *
- * The ARC_l2c_only state is for buffers that are in the second
- * level ARC but no longer in any of the ARC_m* lists. The second
- * level ARC itself may also contain buffers that are in any of
- * the ARC_m* states - meaning that a buffer can exist in two
- * places. The reason for the ARC_l2c_only state is to keep the
- * buffer header in the hash table, so that reads that hit the
- * second level ARC benefit from these fast lookups.
- */
-
-typedef struct arc_state {
- /*
- * list of evictable buffers
- */
- multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
- /*
- * total amount of evictable data in this state
- */
- zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
- /*
- * total amount of data in this state; this includes: evictable,
- * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
- */
- zfs_refcount_t arcs_size;
-} arc_state_t;
-
/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
@@ -464,263 +423,7 @@ static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
static arc_state_t ARC_l2c_only;
-typedef struct arc_stats {
- kstat_named_t arcstat_hits;
- kstat_named_t arcstat_misses;
- kstat_named_t arcstat_demand_data_hits;
- kstat_named_t arcstat_demand_data_misses;
- kstat_named_t arcstat_demand_metadata_hits;
- kstat_named_t arcstat_demand_metadata_misses;
- kstat_named_t arcstat_prefetch_data_hits;
- kstat_named_t arcstat_prefetch_data_misses;
- kstat_named_t arcstat_prefetch_metadata_hits;
- kstat_named_t arcstat_prefetch_metadata_misses;
- kstat_named_t arcstat_mru_hits;
- kstat_named_t arcstat_mru_ghost_hits;
- kstat_named_t arcstat_mfu_hits;
- kstat_named_t arcstat_mfu_ghost_hits;
- kstat_named_t arcstat_deleted;
- /*
- * Number of buffers that could not be evicted because the hash lock
- * was held by another thread. The lock may not necessarily be held
- * by something using the same buffer, since hash locks are shared
- * by multiple buffers.
- */
- kstat_named_t arcstat_mutex_miss;
- /*
- * Number of buffers skipped when updating the access state due to the
- * header having already been released after acquiring the hash lock.
- */
- kstat_named_t arcstat_access_skip;
- /*
- * Number of buffers skipped because they have I/O in progress, are
- * indirect prefetch buffers that have not lived long enough, or are
- * not from the spa we're trying to evict from.
- */
- kstat_named_t arcstat_evict_skip;
- /*
- * Number of times arc_evict_state() was unable to evict enough
- * buffers to reach its target amount.
- */
- kstat_named_t arcstat_evict_not_enough;
- kstat_named_t arcstat_evict_l2_cached;
- kstat_named_t arcstat_evict_l2_eligible;
- kstat_named_t arcstat_evict_l2_ineligible;
- kstat_named_t arcstat_evict_l2_skip;
- kstat_named_t arcstat_hash_elements;
- kstat_named_t arcstat_hash_elements_max;
- kstat_named_t arcstat_hash_collisions;
- kstat_named_t arcstat_hash_chains;
- kstat_named_t arcstat_hash_chain_max;
- kstat_named_t arcstat_p;
- kstat_named_t arcstat_c;
- kstat_named_t arcstat_c_min;
- kstat_named_t arcstat_c_max;
- /* Not updated directly; only synced in arc_kstat_update. */
- kstat_named_t arcstat_size;
- /*
- * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
- * Note that the compressed bytes may match the uncompressed bytes
- * if the block is either not compressed or compressed arc is disabled.
- */
- kstat_named_t arcstat_compressed_size;
- /*
- * Uncompressed size of the data stored in b_pabd. If compressed
- * arc is disabled then this value will be identical to the stat
- * above.
- */
- kstat_named_t arcstat_uncompressed_size;
- /*
- * Number of bytes stored in all the arc_buf_t's. This is classified
- * as "overhead" since this data is typically short-lived and will
- * be evicted from the arc when it becomes unreferenced unless the
- * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
- * values have been set (see comment in dbuf.c for more information).
- */
- kstat_named_t arcstat_overhead_size;
- /*
- * Number of bytes consumed by internal ARC structures necessary
- * for tracking purposes; these structures are not actually
- * backed by ARC buffers. This includes arc_buf_hdr_t structures
- * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
- * caches), and arc_buf_t structures (allocated via arc_buf_t
- * cache).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_hdr_size;
- /*
- * Number of bytes consumed by ARC buffers of type equal to
- * ARC_BUFC_DATA. This is generally consumed by buffers backing
- * on disk user data (e.g. plain file contents).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_data_size;
- /*
- * Number of bytes consumed by ARC buffers of type equal to
- * ARC_BUFC_METADATA. This is generally consumed by buffers
- * backing on disk data that is used for internal ZFS
- * structures (e.g. ZAP, dnode, indirect blocks, etc).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_metadata_size;
- /*
- * Number of bytes consumed by various buffers and structures
- * not actually backed with ARC buffers. This includes bonus
- * buffers (allocated directly via zio_buf_* functions),
- * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
- * cache), and dnode_t structures (allocated via dnode_t cache).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_other_size;
- /*
- * Total number of bytes consumed by ARC buffers residing in the
- * arc_anon state. This includes *all* buffers in the arc_anon
- * state; e.g. data, metadata, evictable, and unevictable buffers
- * are all included in this value.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_anon_size;
- /*
- * Number of bytes consumed by ARC buffers that meet the
- * following criteria: backing buffers of type ARC_BUFC_DATA,
- * residing in the arc_anon state, and are eligible for eviction
- * (e.g. have no outstanding holds on the buffer).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_anon_evictable_data;
- /*
- * Number of bytes consumed by ARC buffers that meet the
- * following criteria: backing buffers of type ARC_BUFC_METADATA,
- * residing in the arc_anon state, and are eligible for eviction
- * (e.g. have no outstanding holds on the buffer).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_anon_evictable_metadata;
- /*
- * Total number of bytes consumed by ARC buffers residing in the
- * arc_mru state. This includes *all* buffers in the arc_mru
- * state; e.g. data, metadata, evictable, and unevictable buffers
- * are all included in this value.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mru_size;
- /*
- * Number of bytes consumed by ARC buffers that meet the
- * following criteria: backing buffers of type ARC_BUFC_DATA,
- * residing in the arc_mru state, and are eligible for eviction
- * (e.g. have no outstanding holds on the buffer).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mru_evictable_data;
- /*
- * Number of bytes consumed by ARC buffers that meet the
- * following criteria: backing buffers of type ARC_BUFC_METADATA,
- * residing in the arc_mru state, and are eligible for eviction
- * (e.g. have no outstanding holds on the buffer).
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mru_evictable_metadata;
- /*
- * Total number of bytes that *would have been* consumed by ARC
- * buffers in the arc_mru_ghost state. The key thing to note
- * here, is the fact that this size doesn't actually indicate
- * RAM consumption. The ghost lists only consist of headers and
- * don't actually have ARC buffers linked off of these headers.
- * Thus, *if* the headers had associated ARC buffers, these
- * buffers *would have* consumed this number of bytes.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mru_ghost_size;
- /*
- * Number of bytes that *would have been* consumed by ARC
- * buffers that are eligible for eviction, of type
- * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mru_ghost_evictable_data;
- /*
- * Number of bytes that *would have been* consumed by ARC
- * buffers that are eligible for eviction, of type
- * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mru_ghost_evictable_metadata;
- /*
- * Total number of bytes consumed by ARC buffers residing in the
- * arc_mfu state. This includes *all* buffers in the arc_mfu
- * state; e.g. data, metadata, evictable, and unevictable buffers
- * are all included in this value.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mfu_size;
- /*
- * Number of bytes consumed by ARC buffers that are eligible for
- * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
- * state.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mfu_evictable_data;
- /*
- * Number of bytes consumed by ARC buffers that are eligible for
- * eviction, of type ARC_BUFC_METADATA, and reside in the
- * arc_mfu state.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mfu_evictable_metadata;
- /*
- * Total number of bytes that *would have been* consumed by ARC
- * buffers in the arc_mfu_ghost state. See the comment above
- * arcstat_mru_ghost_size for more details.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mfu_ghost_size;
- /*
- * Number of bytes that *would have been* consumed by ARC
- * buffers that are eligible for eviction, of type
- * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mfu_ghost_evictable_data;
- /*
- * Number of bytes that *would have been* consumed by ARC
- * buffers that are eligible for eviction, of type
- * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
- * Not updated directly; only synced in arc_kstat_update.
- */
- kstat_named_t arcstat_mfu_ghost_evictable_metadata;
- kstat_named_t arcstat_l2_hits;
- kstat_named_t arcstat_l2_misses;
- kstat_named_t arcstat_l2_feeds;
- kstat_named_t arcstat_l2_rw_clash;
- kstat_named_t arcstat_l2_read_bytes;
- kstat_named_t arcstat_l2_write_bytes;
- kstat_named_t arcstat_l2_writes_sent;
- kstat_named_t arcstat_l2_writes_done;
- kstat_named_t arcstat_l2_writes_error;
- kstat_named_t arcstat_l2_writes_lock_retry;
- kstat_named_t arcstat_l2_evict_lock_retry;
- kstat_named_t arcstat_l2_evict_reading;
- kstat_named_t arcstat_l2_evict_l1cached;
- kstat_named_t arcstat_l2_free_on_write;
- kstat_named_t arcstat_l2_abort_lowmem;
- kstat_named_t arcstat_l2_cksum_bad;
- kstat_named_t arcstat_l2_io_error;
- kstat_named_t arcstat_l2_lsize;
- kstat_named_t arcstat_l2_psize;
- /* Not updated directly; only synced in arc_kstat_update. */
- kstat_named_t arcstat_l2_hdr_size;
- kstat_named_t arcstat_memory_throttle_count;
- /* Not updated directly; only synced in arc_kstat_update. */
- kstat_named_t arcstat_meta_used;
- kstat_named_t arcstat_meta_limit;
- kstat_named_t arcstat_meta_max;
- kstat_named_t arcstat_meta_min;
- kstat_named_t arcstat_async_upgrade_sync;
- kstat_named_t arcstat_demand_hit_predictive_prefetch;
- kstat_named_t arcstat_demand_hit_prescient_prefetch;
-} arc_stats_t;
-
-static arc_stats_t arc_stats = {
+arc_stats_t arc_stats = {
{ "hits", KSTAT_DATA_UINT64 },
{ "misses", KSTAT_DATA_UINT64 },
{ "demand_data_hits", KSTAT_DATA_UINT64 },
@@ -742,6 +445,8 @@ static arc_stats_t arc_stats = {
{ "evict_not_enough", KSTAT_DATA_UINT64 },
{ "evict_l2_cached", KSTAT_DATA_UINT64 },
{ "evict_l2_eligible", KSTAT_DATA_UINT64 },
+ { "evict_l2_eligible_mfu", KSTAT_DATA_UINT64 },
+ { "evict_l2_eligible_mru", KSTAT_DATA_UINT64 },
{ "evict_l2_ineligible", KSTAT_DATA_UINT64 },
{ "evict_l2_skip", KSTAT_DATA_UINT64 },
{ "hash_elements", KSTAT_DATA_UINT64 },
@@ -778,6 +483,11 @@ static arc_stats_t arc_stats = {
{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
{ "l2_hits", KSTAT_DATA_UINT64 },
{ "l2_misses", KSTAT_DATA_UINT64 },
+ { "l2_prefetch_asize", KSTAT_DATA_UINT64 },
+ { "l2_mru_asize", KSTAT_DATA_UINT64 },
+ { "l2_mfu_asize", KSTAT_DATA_UINT64 },
+ { "l2_bufc_data_asize", KSTAT_DATA_UINT64 },
+ { "l2_bufc_metadata_asize", KSTAT_DATA_UINT64 },
{ "l2_feeds", KSTAT_DATA_UINT64 },
{ "l2_rw_clash", KSTAT_DATA_UINT64 },
{ "l2_read_bytes", KSTAT_DATA_UINT64 },
@@ -796,6 +506,22 @@ static arc_stats_t arc_stats = {
{ "l2_size", KSTAT_DATA_UINT64 },
{ "l2_asize", KSTAT_DATA_UINT64 },
{ "l2_hdr_size", KSTAT_DATA_UINT64 },
+ { "l2_log_blk_writes", KSTAT_DATA_UINT64 },
+ { "l2_log_blk_avg_asize", KSTAT_DATA_UINT64 },
+ { "l2_log_blk_asize", KSTAT_DATA_UINT64 },
+ { "l2_log_blk_count", KSTAT_DATA_UINT64 },
+ { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_success", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_dh_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_size", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_asize", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_bufs", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 },
{ "memory_throttle_count", KSTAT_DATA_UINT64 },
{ "arc_meta_used", KSTAT_DATA_UINT64 },
{ "arc_meta_limit", KSTAT_DATA_UINT64 },
@@ -806,14 +532,6 @@ static arc_stats_t arc_stats = {
{ "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
};
-#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
-
-#define ARCSTAT_INCR(stat, val) \
- atomic_add_64(&arc_stats.stat.value.ui64, (val))
-
-#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
-#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
-
#define ARCSTAT_MAX(stat, val) { \
uint64_t m; \
while ((val) > (m = arc_stats.stat.value.ui64) && \
@@ -844,6 +562,24 @@ static arc_stats_t arc_stats = {
} \
}
+/*
+ * This macro allows us to use kstats as floating averages. Each time we
+ * update this kstat, we first factor it and the update value by
+ * ARCSTAT_F_AVG_FACTOR to shrink the new value's contribution to the overall
+ * average. This macro assumes that integer loads and stores are atomic, but
+ * is not safe for multiple writers updating the kstat in parallel (only the
+ * last writer's update will remain).
+ */
+#define ARCSTAT_F_AVG_FACTOR 3
+#define ARCSTAT_F_AVG(stat, value) \
+ do { \
+ uint64_t x = ARCSTAT(stat); \
+ x = x - x / ARCSTAT_F_AVG_FACTOR + \
+ (value) / ARCSTAT_F_AVG_FACTOR; \
+ ARCSTAT(stat) = x; \
+ _NOTE(CONSTCOND) \
+ } while (0)
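For intuition, a minimal standalone sketch (not part of the patch) of the same recurrence in userspace C; with a factor of 3, each update keeps roughly two thirds of the old average and mixes in one third of the new sample:

    #include <stdint.h>
    #include <stdio.h>

    #define F_AVG_FACTOR	3

    int
    main(void)
    {
    	uint64_t avg = 0;
    	uint64_t samples[] = { 900, 900, 900, 300, 300, 300 };

    	/* x = x - x/N + value/N, as in ARCSTAT_F_AVG above. */
    	for (int i = 0; i < 6; i++) {
    		avg = avg - avg / F_AVG_FACTOR +
    		    samples[i] / F_AVG_FACTOR;
    		printf("sample %4llu -> avg %4llu\n",
    		    (unsigned long long)samples[i],
    		    (unsigned long long)avg);
    	}
    	return (0);
    }

The average trails the samples (300, 500, 634, 523, ...), smoothing transient spikes, which is the point of the factoring.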
+
kstat_t *arc_ksp;
static arc_state_t *arc_anon;
static arc_state_t *arc_mru;
@@ -853,29 +589,6 @@ static arc_state_t *arc_mfu_ghost;
static arc_state_t *arc_l2c_only;
/*
- * There are several ARC variables that are critical to export as kstats --
- * but we don't want to have to grovel around in the kstat whenever we wish to
- * manipulate them. For these variables, we therefore define them to be in
- * terms of the statistic variable. This assures that we are not introducing
- * the possibility of inconsistency by having shadow copies of the variables,
- * while still allowing the code to be readable.
- */
-#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
-#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
-#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
-#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
-#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
-#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
-#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
-
-/* compressed size of entire arc */
-#define arc_compressed_size ARCSTAT(arcstat_compressed_size)
-/* uncompressed size of entire arc */
-#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size)
-/* number of bytes in the arc from arc_buf_t's */
-#define arc_overhead_size ARCSTAT(arcstat_overhead_size)
-
-/*
* There are also some ARC variables that we want to export, but that are
* updated so often that having the canonical representation be the statistic
* variable causes a performance bottleneck. We want to use aggsum_t's for these
@@ -896,182 +609,6 @@ static hrtime_t arc_growtime;
static uint64_t arc_tempreserve;
static uint64_t arc_loaned_bytes;
-typedef struct arc_callback arc_callback_t;
-
-struct arc_callback {
- void *acb_private;
- arc_read_done_func_t *acb_done;
- arc_buf_t *acb_buf;
- boolean_t acb_encrypted;
- boolean_t acb_compressed;
- boolean_t acb_noauth;
- zbookmark_phys_t acb_zb;
- zio_t *acb_zio_dummy;
- zio_t *acb_zio_head;
- arc_callback_t *acb_next;
-};
-
-typedef struct arc_write_callback arc_write_callback_t;
-
-struct arc_write_callback {
- void *awcb_private;
- arc_write_done_func_t *awcb_ready;
- arc_write_done_func_t *awcb_children_ready;
- arc_write_done_func_t *awcb_physdone;
- arc_write_done_func_t *awcb_done;
- arc_buf_t *awcb_buf;
-};
-
-/*
- * ARC buffers are separated into multiple structs as a memory saving measure:
- * - Common fields struct, always defined, and embedded within it:
- * - L2-only fields, always allocated but undefined when not in L2ARC
- * - L1-only fields, only allocated when in L1ARC
- *
- * Buffer in L1 Buffer only in L2
- * +------------------------+ +------------------------+
- * | arc_buf_hdr_t | | arc_buf_hdr_t |
- * | | | |
- * | | | |
- * | | | |
- * +------------------------+ +------------------------+
- * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t |
- * | (undefined if L1-only) | | |
- * +------------------------+ +------------------------+
- * | l1arc_buf_hdr_t |
- * | |
- * | |
- * | |
- * | |
- * +------------------------+
- *
- * Because it's possible for the L2ARC to become extremely large, we can wind
- * up eating a lot of memory in L2ARC buffer headers, so the size of a header
- * is minimized by only allocating the fields necessary for an L1-cached buffer
- * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
- * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
- * words in pointers. arc_hdr_realloc() is used to switch a header between
- * these two allocation states.
- */
-typedef struct l1arc_buf_hdr {
- kmutex_t b_freeze_lock;
- zio_cksum_t *b_freeze_cksum;
-#ifdef ZFS_DEBUG
- /*
- * Used for debugging with kmem_flags - by allocating and freeing
- * b_thawed when the buffer is thawed, we get a record of the stack
- * trace that thawed it.
- */
- void *b_thawed;
-#endif
-
- arc_buf_t *b_buf;
- uint32_t b_bufcnt;
- /* for waiting on writes to complete */
- kcondvar_t b_cv;
- uint8_t b_byteswap;
-
- /* protected by arc state mutex */
- arc_state_t *b_state;
- multilist_node_t b_arc_node;
-
- /* updated atomically */
- clock_t b_arc_access;
-
- /* self protecting */
- zfs_refcount_t b_refcnt;
-
- arc_callback_t *b_acb;
- abd_t *b_pabd;
-} l1arc_buf_hdr_t;
-
-/*
- * Encrypted blocks will need to be stored encrypted on the L2ARC
- * disk as they appear in the main pool. In order for this to work we
- * need to pass around the encryption parameters so they can be used
- * to write data to the L2ARC. This struct is only defined in the
- * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED
- * flag set.
- */
-typedef struct arc_buf_hdr_crypt {
- abd_t *b_rabd; /* raw encrypted data */
- dmu_object_type_t b_ot; /* object type */
- uint32_t b_ebufcnt; /* number or encryped buffers */
-
- /* dsobj for looking up encryption key for l2arc encryption */
- uint64_t b_dsobj; /* for looking up key */
-
- /* encryption parameters */
- uint8_t b_salt[ZIO_DATA_SALT_LEN];
- uint8_t b_iv[ZIO_DATA_IV_LEN];
-
- /*
- * Technically this could be removed since we will always be able to
- * get the mac from the bp when we need it. However, it is inconvenient
- * for callers of arc code to have to pass a bp in all the time. This
- * also allows us to assert that L2ARC data is properly encrypted to
- * match the data in the main storage pool.
- */
- uint8_t b_mac[ZIO_DATA_MAC_LEN];
-} arc_buf_hdr_crypt_t;
-
-typedef struct l2arc_dev l2arc_dev_t;
-
-typedef struct l2arc_buf_hdr {
- /* protected by arc_buf_hdr mutex */
- l2arc_dev_t *b_dev; /* L2ARC device */
- uint64_t b_daddr; /* disk address, offset byte */
-
- list_node_t b_l2node;
-} l2arc_buf_hdr_t;
-
-struct arc_buf_hdr {
- /* protected by hash lock */
- dva_t b_dva;
- uint64_t b_birth;
-
- arc_buf_contents_t b_type;
- arc_buf_hdr_t *b_hash_next;
- arc_flags_t b_flags;
-
- /*
- * This field stores the size of the data buffer after
- * compression, and is set in the arc's zio completion handlers.
- * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
- *
- * While the block pointers can store up to 32MB in their psize
- * field, we can only store up to 32MB minus 512B. This is due
- * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
- * a field of zeros represents 512B in the bp). We can't use a
- * bias of 1 since we need to reserve a psize of zero, here, to
- * represent holes and embedded blocks.
- *
- * This isn't a problem in practice, since the maximum size of a
- * buffer is limited to 16MB, so we never need to store 32MB in
- * this field. Even in the upstream illumos code base, the
- * maximum size of a buffer is limited to 16MB.
- */
- uint16_t b_psize;
-
- /*
- * This field stores the size of the data buffer before
- * compression, and cannot change once set. It is in units
- * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
- */
- uint16_t b_lsize; /* immutable */
- uint64_t b_spa; /* immutable */
-
- /* L2ARC fields. Undefined when not in L2ARC. */
- l2arc_buf_hdr_t b_l2hdr;
- /* L1ARC fields. Undefined when in l2arc_only state */
- l1arc_buf_hdr_t b_l1hdr;
- /*
- * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED
- * is set and the L1 header exists.
- */
- arc_buf_hdr_crypt_t b_crypt_hdr;
-};
-
#define GHOST_STATE(state) \
((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
(state) == arc_l2c_only)
@@ -1176,6 +713,13 @@ uint64_t zfs_crc64_table[256];
#define L2ARC_FEED_SECS 1 /* caching interval secs */
#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
+/*
+ * We can feed L2ARC from two states of ARC buffers, mru and mfu,
+ * and each of those states has two types: data and metadata.
+ */
+#define L2ARC_FEED_TYPES 4
+
#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
@@ -1189,24 +733,11 @@ uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
+int l2arc_meta_percent = 33; /* limit on headers size */
/*
* L2ARC Internals
*/
-struct l2arc_dev {
- vdev_t *l2ad_vdev; /* vdev */
- spa_t *l2ad_spa; /* spa */
- uint64_t l2ad_hand; /* next write location */
- uint64_t l2ad_start; /* first addr on device */
- uint64_t l2ad_end; /* last addr on device */
- boolean_t l2ad_first; /* first sweep through */
- boolean_t l2ad_writing; /* currently writing */
- kmutex_t l2ad_mtx; /* lock for buffer list */
- list_t l2ad_buflist; /* buffer list */
- list_node_t l2ad_node; /* device list node */
- zfs_refcount_t l2ad_alloc; /* allocated bytes */
-};
-
static list_t L2ARC_dev_list; /* device list */
static list_t *l2arc_dev_list; /* device list pointer */
static kmutex_t l2arc_dev_mtx; /* device list mutex */
@@ -1224,11 +755,6 @@ typedef struct l2arc_read_callback {
abd_t *l2rcb_abd; /* temporary buffer */
} l2arc_read_callback_t;
-typedef struct l2arc_write_callback {
- l2arc_dev_t *l2wcb_dev; /* device info */
- arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
-} l2arc_write_callback_t;
-
typedef struct l2arc_data_free {
/* protected by l2arc_free_on_write_mtx */
abd_t *l2df_abd;
@@ -1241,7 +767,16 @@ static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;
-static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *);
+static kmutex_t l2arc_rebuild_thr_lock;
+static kcondvar_t l2arc_rebuild_thr_cv;
+
+enum arc_hdr_alloc_flags {
+ ARC_HDR_ALLOC_RDATA = 0x1,
+ ARC_HDR_DO_ADAPT = 0x2,
+};
+
+static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, boolean_t);
typedef enum arc_fill_flags {
ARC_FILL_LOCKED = 1 << 0, /* hdr lock is held */
ARC_FILL_COMPRESSED = 1 << 1, /* fill with compressed data */
@@ -1251,15 +786,16 @@ typedef enum arc_fill_flags {
} arc_fill_flags_t;
static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
-static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *);
+static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, boolean_t);
static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
static void arc_hdr_free_pabd(arc_buf_hdr_t *, boolean_t);
-static void arc_hdr_alloc_pabd(arc_buf_hdr_t *, boolean_t);
+static void arc_hdr_alloc_pabd(arc_buf_hdr_t *, int);
static void arc_access(arc_buf_hdr_t *, kmutex_t *);
static boolean_t arc_is_overflowing();
static void arc_buf_watch(arc_buf_t *);
+static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
@@ -1268,6 +804,18 @@ static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
static void l2arc_read_done(zio_t *);
+static void l2arc_do_free_on_write(void);
+static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
+ boolean_t state_only);
+
+#define l2arc_hdr_arcstats_increment(hdr) \
+ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE)
+#define l2arc_hdr_arcstats_decrement(hdr) \
+ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_FALSE)
+#define l2arc_hdr_arcstats_increment_state(hdr) \
+ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_TRUE)
+#define l2arc_hdr_arcstats_decrement_state(hdr) \
+ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE)
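Throughout this patch, these wrappers appear in a decrement/mutate/increment bracket around any change to the state or flags that the per-state L2ARC kstats are keyed on. The pattern, as it shows up in add_reference() and arc_access() below:

    /* Retire the old state's kstat contribution, mutate, re-add. */
    if (HDR_HAS_L2HDR(hdr))
    	l2arc_hdr_arcstats_decrement_state(hdr);
    arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
    if (HDR_HAS_L2HDR(hdr))
    	l2arc_hdr_arcstats_increment_state(hdr);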
/*
* The arc_all_memory function is a ZoL enhancement that lives in their OSL
@@ -1298,6 +846,9 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
((hdr)->b_dva.dva_word[0] == 0 && \
(hdr)->b_dva.dva_word[1] == 0)
+#define HDR_EMPTY_OR_LOCKED(hdr) \
+ (HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr)))
+
#define HDR_EQUAL(spa, dva, birth, hdr) \
((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
@@ -1411,6 +962,12 @@ buf_hash_remove(arc_buf_hdr_t *hdr)
}
/*
+ * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU
+ * metadata and data are cached from ARC into L2ARC.
+ */
+int l2arc_mfuonly = 0;
+
+/*
* Global data structures and functions for the buf kmem cache.
*/
@@ -1726,8 +1283,7 @@ arc_cksum_free(arc_buf_hdr_t *hdr)
static boolean_t
arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
{
- ASSERT(hdr->b_l1hdr.b_state == arc_anon ||
- MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr));
for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) {
if (!ARC_BUF_COMPRESSED(b)) {
@@ -2011,14 +1567,14 @@ arc_buf_freeze(arc_buf_t *buf)
static inline void
arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
{
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
hdr->b_flags |= flags;
}
static inline void
arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
{
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
hdr->b_flags &= ~flags;
}
@@ -2032,7 +1588,7 @@ arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
static void
arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
{
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
/*
* Holes and embedded blocks will always have a psize = 0 so
@@ -2125,7 +1681,7 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
void *tmpbuf = NULL;
abd_t *abd = hdr->b_l1hdr.b_pabd;
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
ASSERT(HDR_AUTHENTICATED(hdr));
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
@@ -2195,10 +1751,10 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
boolean_t no_crypt = B_FALSE;
boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
ASSERT(HDR_ENCRYPTED(hdr));
- arc_hdr_alloc_pabd(hdr, B_FALSE);
+ arc_hdr_alloc_pabd(hdr, ARC_HDR_DO_ADAPT);
ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot,
B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv,
@@ -2225,7 +1781,7 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
* and then loan a buffer from it, rather than allocating a
* linear buffer and wrapping it in an abd later.
*/
- cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr);
+ cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, B_TRUE);
tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
@@ -2315,7 +1871,7 @@ arc_buf_untransform_in_place(arc_buf_t *buf, kmutex_t *hash_lock)
ASSERT(HDR_ENCRYPTED(hdr));
ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data,
@@ -2538,7 +2094,7 @@ arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
*/
ret = SET_ERROR(EIO);
spa_log_error(spa, zb);
- zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
spa, NULL, zb, NULL, 0, 0);
}
@@ -2635,7 +2191,7 @@ static void
add_reference(arc_buf_hdr_t *hdr, void *tag)
{
ASSERT(HDR_HAS_L1HDR(hdr));
- if (!MUTEX_HELD(HDR_LOCK(hdr))) {
+ if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) {
ASSERT(hdr->b_l1hdr.b_state == arc_anon);
ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
@@ -2652,7 +2208,11 @@ add_reference(arc_buf_hdr_t *hdr, void *tag)
arc_evictable_space_decrement(hdr, state);
}
/* remove the prefetch flag if we get a reference */
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_decrement_state(hdr);
arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_increment_state(hdr);
}
}
@@ -2888,9 +2448,16 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
}
}
- if (HDR_HAS_L1HDR(hdr))
+ if (HDR_HAS_L1HDR(hdr)) {
hdr->b_l1hdr.b_state = new_state;
+ if (HDR_HAS_L2HDR(hdr) && new_state != arc_l2c_only) {
+ l2arc_hdr_arcstats_decrement_state(hdr);
+ hdr->b_l2hdr.b_arcs_state = new_state->arcs_state;
+ l2arc_hdr_arcstats_increment_state(hdr);
+ }
+ }
+
/*
* L2 headers should never be on the L2 state list since they don't
* have L1 headers allocated.
@@ -3040,7 +2607,7 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
* We're about to change the hdr's b_flags. We must either
* hold the hash_lock or be undiscoverable.
*/
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
/*
* Only honor requests for compressed bufs if the hdr is actually
@@ -3160,6 +2727,58 @@ arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder,
return (buf);
}
+/*
+ * Performance tuning of L2ARC persistence:
+ *
+ * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding
+ * an L2ARC device (either at pool import or later) will attempt
+ * to rebuild L2ARC buffer contents.
+ * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls
+ * whether log blocks are written to the L2ARC device. If the L2ARC
+ * device is smaller than 1GB, the amount of data l2arc_evict()
+ * evicts is significant compared to the amount of restored L2ARC
+ * data. In that case, log blocks are not written to the L2ARC
+ * device, so as not to waste space.
+ */
+int l2arc_rebuild_enabled = B_TRUE;
+unsigned long l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024;
+
+/* L2ARC persistence rebuild control routines. */
+void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen);
+static void l2arc_dev_rebuild_start(l2arc_dev_t *dev);
+static int l2arc_rebuild(l2arc_dev_t *dev);
+
+/* L2ARC persistence read I/O routines. */
+static int l2arc_dev_hdr_read(l2arc_dev_t *dev);
+static int l2arc_log_blk_read(l2arc_dev_t *dev,
+ const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp,
+ l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
+ zio_t *this_io, zio_t **next_io);
+static zio_t *l2arc_log_blk_fetch(vdev_t *vd,
+ const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb);
+static void l2arc_log_blk_fetch_abort(zio_t *zio);
+
+/* L2ARC persistence block restoration routines. */
+static void l2arc_log_blk_restore(l2arc_dev_t *dev,
+ const l2arc_log_blk_phys_t *lb, uint64_t lb_asize);
+static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
+ l2arc_dev_t *dev);
+
+/* L2ARC persistence write I/O routines. */
+static void l2arc_dev_hdr_update(l2arc_dev_t *dev);
+static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
+ l2arc_write_callback_t *cb);
+
+/* L2ARC persistence auxiliary routines. */
+boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
+ const l2arc_log_blkptr_t *lbp);
+static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
+ const arc_buf_hdr_t *ab);
+boolean_t l2arc_range_check_overlap(uint64_t bottom,
+ uint64_t top, uint64_t check);
+static void l2arc_blk_fetch_done(zio_t *zio);
+static inline uint64_t
+ l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev);
/*
* Return a loaned arc buffer to the arc.
@@ -3248,7 +2867,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
ASSERT(arc_can_share(hdr, buf));
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(!ARC_BUF_ENCRYPTED(buf));
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
/*
* Start sharing the data buffer. We transfer the
@@ -3281,7 +2900,7 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
ASSERT(arc_buf_is_shared(buf));
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
/*
* We are no longer sharing this buffer so we need
@@ -3316,7 +2935,7 @@ arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
arc_buf_t *lastbuf = NULL;
ASSERT(HDR_HAS_L1HDR(hdr));
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
/*
* Remove the buf from the hdr list and locate the last
@@ -3364,7 +2983,7 @@ arc_buf_destroy_impl(arc_buf_t *buf)
* We're about to change the hdr's b_flags. We must either
* hold the hash_lock or be undiscoverable.
*/
- ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
arc_cksum_verify(buf);
arc_buf_unwatch(buf);
@@ -3457,9 +3076,11 @@ arc_buf_destroy_impl(arc_buf_t *buf)
}
static void
-arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr, boolean_t alloc_rdata)
+arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr, int alloc_flags)
{
uint64_t size;
+ boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0);
+ boolean_t do_adapt = ((alloc_flags & ARC_HDR_DO_ADAPT) != 0);
ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
ASSERT(HDR_HAS_L1HDR(hdr));
@@ -3469,12 +3090,14 @@ arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr, boolean_t alloc_rdata)
if (alloc_rdata) {
size = HDR_GET_PSIZE(hdr);
ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL);
- hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr);
+ hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr,
+ do_adapt);
ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL);
} else {
size = arc_hdr_size(hdr);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
- hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr);
+ hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr,
+ do_adapt);
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
}
@@ -3527,6 +3150,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
arc_buf_contents_t type, boolean_t alloc_rdata)
{
arc_buf_hdr_t *hdr;
+ int flags = ARC_HDR_DO_ADAPT;
VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
if (protected) {
@@ -3534,6 +3158,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
} else {
hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
}
+ flags |= alloc_rdata ? ARC_HDR_ALLOC_RDATA : 0;
ASSERT(HDR_EMPTY(hdr));
ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL);
@@ -3557,7 +3182,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
* the compressed or uncompressed data depending on the block
* it references and compressed arc enablement.
*/
- arc_hdr_alloc_pabd(hdr, alloc_rdata);
+ arc_hdr_alloc_pabd(hdr, flags);
ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
return (hdr);
@@ -3842,7 +3467,6 @@ arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
{
arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
B_FALSE, ZIO_COMPRESS_OFF, type, B_FALSE);
- ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
arc_buf_t *buf = NULL;
VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE,
@@ -3853,6 +3477,44 @@ arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
}
/*
+ * Allocates an ARC buf header that's in an evicted & L2-cached state.
+ * This is used during l2arc reconstruction to make empty ARC buffers
+ * which circumvent the regular disk->arc->l2arc path and instead come
+ * into being in the reverse order, i.e. l2arc->arc.
+ */
+arc_buf_hdr_t *
+arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev,
+ dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth,
+ enum zio_compress compress, boolean_t protected,
+ boolean_t prefetch, arc_state_type_t arcs_state)
+{
+ arc_buf_hdr_t *hdr;
+
+ ASSERT(size != 0);
+ hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP);
+ hdr->b_birth = birth;
+ hdr->b_type = type;
+ hdr->b_flags = 0;
+ arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR);
+ HDR_SET_LSIZE(hdr, size);
+ HDR_SET_PSIZE(hdr, psize);
+ arc_hdr_set_compress(hdr, compress);
+ if (protected)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
+ if (prefetch)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+ hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa);
+
+ hdr->b_dva = dva;
+
+ hdr->b_l2hdr.b_dev = dev;
+ hdr->b_l2hdr.b_daddr = daddr;
+ hdr->b_l2hdr.b_arcs_state = arcs_state;
+
+ return (hdr);
+}
+
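As a hedged sketch of the intended call site: during rebuild, each log entry read back from the device carries enough information to reconstitute a header. The le_* field names below are illustrative assumptions, not the actual l2arc_log_ent_phys_t layout:

    /*
     * Illustrative only: rebuild one L2-only header from a log entry.
     * All le_* accessors are hypothetical stand-ins.
     */
    static void
    example_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev)
    {
    	arc_buf_hdr_t *hdr = arc_buf_alloc_l2only(le->le_lsize,
    	    le->le_type, dev, le->le_dva, le->le_daddr, le->le_psize,
    	    le->le_birth, le->le_compress, le->le_protected,
    	    le->le_prefetch, le->le_arcs_state);

    	/* The real restore path would now hash-insert the header. */
    	(void) hdr;
    }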
+/*
* Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this
* for bufs containing metadata.
*/
@@ -3867,7 +3529,6 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
B_FALSE, compression_type, ARC_BUFC_DATA, B_FALSE);
- ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
arc_buf_t *buf = NULL;
VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE,
@@ -3908,7 +3569,6 @@ arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder,
hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE,
compression_type, type, B_TRUE);
- ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
hdr->b_crypt_hdr.b_dsobj = dsobj;
hdr->b_crypt_hdr.b_ot = ot;
@@ -3933,6 +3593,76 @@ arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder,
}
static void
+l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
+ boolean_t state_only)
+{
+ l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
+ l2arc_dev_t *dev = l2hdr->b_dev;
+ uint64_t lsize = HDR_GET_LSIZE(hdr);
+ uint64_t psize = HDR_GET_PSIZE(hdr);
+ uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
+ arc_buf_contents_t type = hdr->b_type;
+ int64_t lsize_s;
+ int64_t psize_s;
+ int64_t asize_s;
+
+ if (incr) {
+ lsize_s = lsize;
+ psize_s = psize;
+ asize_s = asize;
+ } else {
+ lsize_s = -lsize;
+ psize_s = -psize;
+ asize_s = -asize;
+ }
+
+ /* If the buffer is a prefetch, count it as such. */
+ if (HDR_PREFETCH(hdr)) {
+ ARCSTAT_INCR(arcstat_l2_prefetch_asize, asize_s);
+ } else {
+ /*
+ * We use the value stored in the L2 header upon initial
+ * caching in L2ARC. This value will be updated in case
+ * an MRU/MRU_ghost buffer transitions to MFU but the L2ARC
+ * metadata (log entry) cannot currently be updated. Having
+ * the ARC state in the L2 header solves the problem of a
+ * possibly absent L1 header (apparent in buffers restored
+ * from persistent L2ARC).
+ */
+ switch (hdr->b_l2hdr.b_arcs_state) {
+ case ARC_STATE_MRU_GHOST:
+ case ARC_STATE_MRU:
+ ARCSTAT_INCR(arcstat_l2_mru_asize, asize_s);
+ break;
+ case ARC_STATE_MFU_GHOST:
+ case ARC_STATE_MFU:
+ ARCSTAT_INCR(arcstat_l2_mfu_asize, asize_s);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (state_only)
+ return;
+
+ ARCSTAT_INCR(arcstat_l2_psize, psize_s);
+ ARCSTAT_INCR(arcstat_l2_lsize, lsize_s);
+
+ switch (type) {
+ case ARC_BUFC_DATA:
+ ARCSTAT_INCR(arcstat_l2_bufc_data_asize, asize_s);
+ break;
+ case ARC_BUFC_METADATA:
+ ARCSTAT_INCR(arcstat_l2_bufc_metadata_asize, asize_s);
+ break;
+ default:
+ break;
+ }
+}
+
+static void
arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
{
l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
@@ -3945,9 +3675,7 @@ arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
list_remove(&dev->l2ad_buflist, hdr);
- ARCSTAT_INCR(arcstat_l2_psize, -psize);
- ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));
-
+ l2arc_hdr_arcstats_decrement(hdr);
vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
(void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr),
@@ -3967,9 +3695,6 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
ASSERT(!HDR_IN_HASH_TABLE(hdr));
- if (!HDR_EMPTY(hdr))
- buf_discard_identity(hdr);
-
if (HDR_HAS_L2HDR(hdr)) {
l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
@@ -3993,6 +3718,15 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
mutex_exit(&dev->l2ad_mtx);
}
+ /*
+ * The header's identity can only be safely discarded once it is no
+ * longer discoverable. This requires removing it from the hash table
+ * and the l2arc header list. After this point the hash lock cannot
+ * be used to protect the header.
+ */
+ if (!HDR_EMPTY(hdr))
+ buf_discard_identity(hdr);
+
if (HDR_HAS_L1HDR(hdr)) {
arc_cksum_free(hdr);
@@ -4006,9 +3740,8 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
}
#endif
- if (hdr->b_l1hdr.b_pabd != NULL) {
+ if (hdr->b_l1hdr.b_pabd != NULL)
arc_hdr_free_pabd(hdr, B_FALSE);
- }
if (HDR_HAS_RABD(hdr))
arc_hdr_free_pabd(hdr, B_TRUE);
@@ -4033,7 +3766,6 @@ void
arc_buf_destroy(arc_buf_t *buf, void* tag)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
- kmutex_t *hash_lock = HDR_LOCK(hdr);
if (hdr->b_l1hdr.b_state == arc_anon) {
ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
@@ -4043,7 +3775,9 @@ arc_buf_destroy(arc_buf_t *buf, void* tag)
return;
}
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
+
ASSERT3P(hdr, ==, buf->b_hdr);
ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
@@ -4151,6 +3885,21 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
if (l2arc_write_eligible(hdr->b_spa, hdr)) {
ARCSTAT_INCR(arcstat_evict_l2_eligible,
HDR_GET_LSIZE(hdr));
+
+ switch (state->arcs_state) {
+ case ARC_STATE_MRU:
+ ARCSTAT_INCR(
+ arcstat_evict_l2_eligible_mru,
+ HDR_GET_LSIZE(hdr));
+ break;
+ case ARC_STATE_MFU:
+ ARCSTAT_INCR(
+ arcstat_evict_l2_eligible_mfu,
+ HDR_GET_LSIZE(hdr));
+ break;
+ default:
+ break;
+ }
} else {
ARCSTAT_INCR(arcstat_evict_l2_ineligible,
HDR_GET_LSIZE(hdr));
@@ -4873,25 +4622,6 @@ arc_available_memory(void)
r = FMR_PAGES_PP_MAXIMUM;
}
-#if defined(__i386)
- /*
- * If we're on an i386 platform, it's possible that we'll exhaust the
- * kernel heap space before we ever run out of available physical
- * memory. Most checks of the size of the heap_area compare against
- * tune.t_minarmem, which is the minimum available real memory that we
- * can have in the system. However, this is generally fixed at 25 pages
- * which is so low that it's useless. In this comparison, we seek to
- * calculate the total heap-size, and reclaim if more than 3/4ths of the
- * heap is allocated. (Or, in the calculation, if less than 1/4th is
- * free)
- */
- n = (int64_t)vmem_size(heap_arena, VMEM_FREE) -
- (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
- if (n < lowest) {
- lowest = n;
- r = FMR_HEAP_ARENA;
- }
-#endif
/*
* If zio data pages are being allocated out of a separate heap segment,
@@ -4954,12 +4684,6 @@ arc_kmem_reap_soon(void)
*/
dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
}
-#if defined(__i386)
- /*
- * Reclaim unused memory from all kmem caches.
- */
- kmem_reap();
-#endif
#endif
for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
@@ -5154,9 +4878,6 @@ arc_adapt(int bytes, arc_state_t *state)
int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size);
int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size);
- if (state == arc_l2c_only)
- return;
-
ASSERT(bytes > 0);
/*
* Adapt the target size of the MRU list:
@@ -5238,11 +4959,12 @@ arc_is_overflowing(void)
}
static abd_t *
-arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
+arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
+ boolean_t do_adapt)
{
arc_buf_contents_t type = arc_buf_type(hdr);
- arc_get_data_impl(hdr, size, tag);
+ arc_get_data_impl(hdr, size, tag, do_adapt);
if (type == ARC_BUFC_METADATA) {
return (abd_alloc(size, B_TRUE));
} else {
@@ -5256,7 +4978,7 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
{
arc_buf_contents_t type = arc_buf_type(hdr);
- arc_get_data_impl(hdr, size, tag);
+ arc_get_data_impl(hdr, size, tag, B_TRUE);
if (type == ARC_BUFC_METADATA) {
return (zio_buf_alloc(size));
} else {
@@ -5272,12 +4994,14 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
* limit, we'll only signal the reclaim thread and continue on.
*/
static void
-arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
+arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
+ boolean_t do_adapt)
{
arc_state_t *state = hdr->b_l1hdr.b_state;
arc_buf_contents_t type = arc_buf_type(hdr);
- arc_adapt(size, state);
+ if (do_adapt)
+ arc_adapt(size, state);
/*
* If arc_size is currently overflowing, and has grown past our
@@ -5448,10 +5172,14 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
ASSERT(multilist_link_active(
&hdr->b_l1hdr.b_arc_node));
} else {
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_decrement_state(hdr);
arc_hdr_clear_flags(hdr,
ARC_FLAG_PREFETCH |
ARC_FLAG_PRESCIENT_PREFETCH);
ARCSTAT_BUMP(arcstat_mru_hits);
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_increment_state(hdr);
}
hdr->b_l1hdr.b_arc_access = now;
return;
@@ -5480,13 +5208,16 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* was evicted from the cache. Move it to the
* MFU state.
*/
-
if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
new_state = arc_mru;
if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_decrement_state(hdr);
arc_hdr_clear_flags(hdr,
ARC_FLAG_PREFETCH |
ARC_FLAG_PRESCIENT_PREFETCH);
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_increment_state(hdr);
}
DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
} else {
@@ -5747,8 +5478,6 @@ arc_read_done(zio_t *zio)
}
arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED);
- if (l2arc_noprefetch && HDR_PREFETCH(hdr))
- arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);
callback_list = hdr->b_l1hdr.b_acb;
ASSERT3P(callback_list, !=, NULL);
@@ -5801,7 +5530,8 @@ arc_read_done(zio_t *zio)
error = SET_ERROR(EIO);
if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
spa_log_error(zio->io_spa, &acb->acb_zb);
- zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
+ (void) zfs_ereport_post(
+ FM_EREPORT_ZFS_AUTHENTICATION,
zio->io_spa, NULL, &acb->acb_zb, zio, 0, 0);
}
}
@@ -6058,7 +5788,7 @@ top:
rc = SET_ERROR(EIO);
if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) {
spa_log_error(spa, zb);
- zfs_ereport_post(
+ (void) zfs_ereport_post(
FM_EREPORT_ZFS_AUTHENTICATION,
spa, NULL, zb, NULL, 0, 0);
}
@@ -6073,8 +5803,12 @@ top:
ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
rc != EACCES);
} else if (*arc_flags & ARC_FLAG_PREFETCH &&
- zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
+ zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_decrement_state(hdr);
arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_increment_state(hdr);
}
DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
arc_access(hdr, hash_lock);
@@ -6099,6 +5833,7 @@ top:
boolean_t devw = B_FALSE;
uint64_t size;
abd_t *hdr_abd;
+ int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0;
if (hdr == NULL) {
/* this block is not in the cache */
@@ -6165,8 +5900,9 @@ top:
* do this after we've called arc_access() to
* avoid hitting an assert in remove_reference().
*/
+ arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state);
arc_access(hdr, hash_lock);
- arc_hdr_alloc_pabd(hdr, encrypted_read);
+ arc_hdr_alloc_pabd(hdr, alloc_flags);
}
if (encrypted_read) {
@@ -6195,8 +5931,13 @@ top:
}
if (*arc_flags & ARC_FLAG_PREFETCH &&
- zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))
+ zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_decrement_state(hdr);
arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_increment_state(hdr);
+ }
if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
@@ -6266,7 +6007,7 @@ top:
* 3. This buffer isn't currently writing to the L2ARC.
* 4. The L2ARC entry wasn't evicted, which may
* also have invalidated the vdev.
- * 5. This isn't prefetch and l2arc_noprefetch is set.
+ * 5. This isn't a prefetch or l2arc_noprefetch is not set.
*/
if (HDR_HAS_L2HDR(hdr) &&
!HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
@@ -6285,6 +6026,17 @@ top:
cb->l2rcb_zb = *zb;
cb->l2rcb_flags = zio_flags;
+ /*
+ * When Compressed ARC is disabled, but the
+ * L2ARC block is compressed, arc_hdr_size()
+ * will have returned LSIZE rather than PSIZE.
+ */
+ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
+ !HDR_COMPRESSION_ENABLED(hdr) &&
+ HDR_GET_PSIZE(hdr) != 0) {
+ size = HDR_GET_PSIZE(hdr);
+ }
+
asize = vdev_psize_to_asize(vd, size);
if (asize != size) {
abd = abd_alloc_for_io(asize,
@@ -6566,7 +6318,7 @@ arc_release(arc_buf_t *buf, void *tag)
if (arc_can_share(hdr, lastbuf)) {
arc_share_buf(hdr, lastbuf);
} else {
- arc_hdr_alloc_pabd(hdr, B_FALSE);
+ arc_hdr_alloc_pabd(hdr, ARC_HDR_DO_ADAPT);
abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
buf->b_data, psize);
}
@@ -6789,7 +6541,7 @@ arc_write_ready(zio_t *zio)
if (ARC_BUF_ENCRYPTED(buf)) {
ASSERT3U(psize, >, 0);
ASSERT(ARC_BUF_COMPRESSED(buf));
- arc_hdr_alloc_pabd(hdr, B_TRUE);
+ arc_hdr_alloc_pabd(hdr, ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA);
abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
} else if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) {
/*
@@ -6799,16 +6551,17 @@ arc_write_ready(zio_t *zio)
*/
if (BP_IS_ENCRYPTED(bp)) {
ASSERT3U(psize, >, 0);
- arc_hdr_alloc_pabd(hdr, B_TRUE);
+ arc_hdr_alloc_pabd(hdr,
+ ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA);
abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
} else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
!ARC_BUF_COMPRESSED(buf)) {
ASSERT3U(psize, >, 0);
- arc_hdr_alloc_pabd(hdr, B_FALSE);
+ arc_hdr_alloc_pabd(hdr, ARC_HDR_DO_ADAPT);
abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
} else {
ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
- arc_hdr_alloc_pabd(hdr, B_FALSE);
+ arc_hdr_alloc_pabd(hdr, ARC_HDR_DO_ADAPT);
abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
arc_buf_size(buf));
}
@@ -6894,8 +6647,8 @@ arc_write_done(zio_t *zio)
ASSERT(zfs_refcount_is_zero(
&exists->b_l1hdr.b_refcnt));
arc_change_state(arc_anon, exists, hash_lock);
- mutex_exit(hash_lock);
arc_hdr_destroy(exists);
+ mutex_exit(hash_lock);
exists = buf_hash_insert(hdr, &hash_lock);
ASSERT3P(exists, ==, NULL);
} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
@@ -7027,10 +6780,6 @@ arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
#ifdef _KERNEL
uint64_t available_memory = ptob(freemem);
-#if defined(__i386)
- available_memory =
- MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
-#endif
if (freemem > physmem * arc_lotsfree_percent / 100)
return (0);
@@ -7303,6 +7052,13 @@ arc_state_init(void)
aggsum_init(&astat_hdr_size, 0);
aggsum_init(&astat_other_size, 0);
aggsum_init(&astat_l2_hdr_size, 0);
+
+ arc_anon->arcs_state = ARC_STATE_ANON;
+ arc_mru->arcs_state = ARC_STATE_MRU;
+ arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
+ arc_mfu->arcs_state = ARC_STATE_MFU;
+ arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
+ arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
}
static void
@@ -7671,6 +7427,103 @@ arc_fini(void)
*
* These three functions determine what to write, how much, and how quickly
* to send writes.
+ *
+ * L2ARC persistence:
+ *
+ * When writing buffers to L2ARC, we periodically add some metadata to
+ * make sure we can pick them up after reboot, thus dramatically reducing
+ * the impact that any downtime has on the performance of storage systems
+ * with large caches.
+ *
+ * The implementation works fairly simply by integrating the following two
+ * modifications:
+ *
+ * *) When writing to the L2ARC, we occasionally write an "l2arc log block",
+ * which is an additional piece of metadata which describes what's been
+ * written. This allows us to rebuild the arc_buf_hdr_t structures of the
+ * main ARC buffers. There are two linked lists of log blocks headed by
+ * dh_start_lbps[2]. We alternate which chain we append to, so they are
+ * time-wise and offset-wise interleaved, but that is an optimization rather
+ * than a correctness requirement. The log block also includes a pointer to the
+ * previous block in its chain.
+ *
+ * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
+ * for our header bookkeeping purposes. This contains a device header,
+ * which contains our top-level reference structures. We update it each
+ * time we write a new log block, so that we're able to locate it in the
+ * L2ARC device. If this write results in an inconsistent device header
+ * (e.g. due to power failure), we detect this by verifying the header's
+ * checksum and simply fail to reconstruct the L2ARC after reboot.
+ *
+ * Implementation diagram:
+ *
+ * +=== L2ARC device (not to scale) ======================================+
+ * | ___two newest log block pointers__.__________ |
+ * | / \dh_start_lbps[1] |
+ * | / \ \dh_start_lbps[0]|
+ * |.___/__. V V |
+ * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
+ * || hdr| ^ /^ /^ / / |
+ * |+------+ ...--\-------/ \-----/--\------/ / |
+ * | \--------------/ \--------------/ |
+ * +======================================================================+
+ *
+ * As can be seen on the diagram, rather than using a simple linked list,
+ * we use a pair of linked lists with alternating elements. This is a
+ * performance enhancement: the address of the next log block in a chain
+ * is only known once the current block has been completely read in. With
+ * a single chain, that would keep the device's I/O queue only one
+ * operation deep, incurring a large amount of I/O round-trip latency.
+ * Having two lists allows us to fetch two log blocks ahead of where we
+ * are currently rebuilding L2ARC buffers.
+ *
+ * On-device data structures:
+ *
+ * L2ARC device header: l2arc_dev_hdr_phys_t
+ * L2ARC log block: l2arc_log_blk_phys_t
+ *
+ * L2ARC reconstruction:
+ *
+ * When writing data, we simply write in the standard rotary fashion,
+ * evicting buffers as we go and simply writing new data over them (writing
+ * a new log block every now and then). This obviously means that once we
+ * loop around the end of the device, we will start cutting into an already
+ * committed log block (and its referenced data buffers), like so:
+ *
+ * current write head__ __old tail
+ * \ /
+ * V V
+ * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |-->
+ * ^ ^^^^^^^^^___________________________________
+ * | \
+ * <<nextwrite>> may overwrite this blk and/or its bufs --'
+ *
+ * When importing the pool, we detect this situation and use it to stop
+ * our scanning process (see l2arc_rebuild).
+ *
+ * There is one significant caveat to consider when rebuilding ARC contents
+ * from an L2ARC device: what about invalidated buffers? Given the above
+ * construction, we cannot go back and amend already-written log blocks to
+ * remove entries for buffers that were invalidated. Thus, during reconstruction,
+ * we might be populating the cache with buffers for data that's not on the
+ * main pool anymore, or may have been overwritten!
+ *
+ * As it turns out, this isn't a problem. Every arc_read request includes
+ * both the DVA and, crucially, the birth TXG of the BP the caller is
+ * looking for. So even if the cache were populated by completely rotten
+ * blocks for data that had been long deleted and/or overwritten, we'll
+ * never actually return bad data from the cache, since the DVA with the
+ * birth TXG uniquely identify a block in space and time - once created,
+ * a block is immutable on disk. The worst thing we have done is wasted
+ * some time and memory at l2arc rebuild to reconstruct outdated ARC
+ * entries that will get dropped from the l2arc as it is being updated
+ * with new blocks.
+ *
+ * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write
+ * hand are not restored. This is done by saving, in the L2ARC device
+ * header, the offset (in bytes) up to which l2arc_evict() has evicted,
+ * and taking it into account when restoring buffers.
*/
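To make the chain layout above concrete, here is a minimal user-space model
(an illustrative sketch, not part of this change): every log block records
the block written two steps before it, so following the pair of head
pointers always leaves one block address known ahead of the read in
progress.

	#include <stdio.h>

	#define	NBLKS	8	/* log blocks on our toy "device" */

	int
	main(void)
	{
		long prev[NBLKS];

		/* lb_prev_lbp is saved from dh_start_lbps[1]: block i - 2. */
		for (long i = 0; i < NBLKS; i++)
			prev[i] = i - 2;	/* negative means invalid */

		/* The two newest blocks, as in dh_start_lbps[0] and [1]. */
		long lbps[2] = { NBLKS - 1, NBLKS - 2 };

		while (lbps[0] >= 0) {
			/* lbps[1] can be fetched while lbps[0] is read. */
			printf("read %ld, prefetch %ld\n", lbps[0], lbps[1]);
			long next = prev[lbps[0]];	/* learned from read */
			lbps[0] = lbps[1];
			lbps[1] = next;
		}
		return (0);
	}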
static boolean_t
@@ -7682,18 +7535,20 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
* 2. is already cached on the L2ARC.
* 3. has an I/O in progress (it may be an incomplete read).
* 4. is flagged not eligible (zfs property).
+ * 5. is a prefetch and l2arc_noprefetch is set.
*/
if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) ||
- HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr))
+ HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr) ||
+ (l2arc_noprefetch && HDR_PREFETCH(hdr)))
return (B_FALSE);
return (B_TRUE);
}
static uint64_t
-l2arc_write_size(void)
+l2arc_write_size(l2arc_dev_t *dev)
{
- uint64_t size;
+ uint64_t size, dev_size;
/*
* Make sure our globals have meaningful values in case the user
@@ -7710,6 +7565,25 @@ l2arc_write_size(void)
if (arc_warm == B_FALSE)
size += l2arc_write_boost;
+ /*
+ * Make sure the write size does not exceed the size of the cache
+ * device. This is important in l2arc_evict(), otherwise infinite
+ * iteration can occur.
+ */
+ dev_size = dev->l2ad_end - dev->l2ad_start;
+ if ((size + l2arc_log_blk_overhead(size, dev)) >= dev_size) {
+ cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost "
+ "plus the overhead of log blocks (persistent L2ARC, "
+ "%" PRIu64 " bytes) exceeds the size of the cache device "
+ "(guid %" PRIu64 "), resetting them to the default (%d)",
+ l2arc_log_blk_overhead(size, dev),
+ dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE);
+ size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE;
+
+ if (arc_warm == B_FALSE)
+ size += l2arc_write_boost;
+ }
+
return (size);
}
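As a rough worked example of the clamp above (assuming the default
L2ARC_WRITE_SIZE of 8 MiB, a cold cache so that size doubles to 16 MiB, and
512-byte minimum blocks): the worst case is 16 MiB / 512 B = 32768 log
entries, i.e. 33 log blocks of 1022 entries each. At roughly 64 KiB of
aligned space per log block that is about 2 MiB of overhead, so only a cache
device smaller than ~18 MiB would trip the reset to defaults.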
@@ -7775,10 +7649,10 @@ l2arc_dev_get_next(void)
else if (next == first)
break;
- } while (vdev_is_dead(next->l2ad_vdev));
+ } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild);
/* if we were unable to find any usable vdevs, return NULL */
- if (vdev_is_dead(next->l2ad_vdev))
+ if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild)
next = NULL;
l2arc_dev_last = next;
@@ -7827,16 +7701,20 @@ l2arc_do_free_on_write()
static void
l2arc_write_done(zio_t *zio)
{
- l2arc_write_callback_t *cb;
- l2arc_dev_t *dev;
- list_t *buflist;
- arc_buf_hdr_t *head, *hdr, *hdr_prev;
- kmutex_t *hash_lock;
- int64_t bytes_dropped = 0;
+ l2arc_write_callback_t *cb;
+ l2arc_lb_abd_buf_t *abd_buf;
+ l2arc_lb_ptr_buf_t *lb_ptr_buf;
+ l2arc_dev_t *dev;
+ l2arc_dev_hdr_phys_t *l2dhdr;
+ list_t *buflist;
+ arc_buf_hdr_t *head, *hdr, *hdr_prev;
+ kmutex_t *hash_lock;
+ int64_t bytes_dropped = 0;
cb = zio->io_private;
ASSERT3P(cb, !=, NULL);
dev = cb->l2wcb_dev;
+ l2dhdr = dev->l2ad_dev_hdr;
ASSERT3P(dev, !=, NULL);
head = cb->l2wcb_head;
ASSERT3P(head, !=, NULL);
@@ -7845,9 +7723,6 @@ l2arc_write_done(zio_t *zio)
DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
l2arc_write_callback_t *, cb);
- if (zio->io_error != 0)
- ARCSTAT_BUMP(arcstat_l2_writes_error);
-
/*
* All writes completed, or an error was hit.
*/
@@ -7907,8 +7782,7 @@ top:
arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
uint64_t psize = HDR_GET_PSIZE(hdr);
- ARCSTAT_INCR(arcstat_l2_psize, -psize);
- ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));
+ l2arc_hdr_arcstats_decrement(hdr);
bytes_dropped +=
vdev_psize_to_asize(dev->l2ad_vdev, psize);
@@ -7925,12 +7799,74 @@ top:
mutex_exit(hash_lock);
}
+ /*
+ * Free the allocated abd buffers for writing the log blocks.
+ * If the zio failed reclaim the allocated space and remove the
+ * pointers to these log blocks from the log block pointer list
+ * of the L2ARC device.
+ */
+ while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) {
+ abd_free(abd_buf->abd);
+ zio_buf_free(abd_buf, sizeof (*abd_buf));
+ if (zio->io_error != 0) {
+ lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list);
+ /*
+ * L2BLK_GET_PSIZE returns aligned size for log
+ * blocks.
+ */
+ uint64_t asize =
+ L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop);
+ bytes_dropped += asize;
+ ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
+ ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
+ zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
+ lb_ptr_buf);
+ zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf);
+ kmem_free(lb_ptr_buf->lb_ptr,
+ sizeof (l2arc_log_blkptr_t));
+ kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
+ }
+ }
+ list_destroy(&cb->l2wcb_abd_list);
+
+ if (zio->io_error != 0) {
+ ARCSTAT_BUMP(arcstat_l2_writes_error);
+
+ /*
+ * Restore the lbps array in the header to its previous state.
+ * If the list of log block pointers is empty, zero out the
+ * log block pointers in the device header.
+ */
+ lb_ptr_buf = list_head(&dev->l2ad_lbptr_list);
+ for (int i = 0; i < 2; i++) {
+ if (lb_ptr_buf == NULL) {
+ /*
+				 * If the list is empty, zero out the device
+				 * header. Otherwise, zero out the second log
+				 * block pointer in the header.
+ */
+ if (i == 0) {
+ bzero(l2dhdr, dev->l2ad_dev_hdr_asize);
+ } else {
+ bzero(&l2dhdr->dh_start_lbps[i],
+ sizeof (l2arc_log_blkptr_t));
+ }
+ break;
+ }
+ bcopy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[i],
+ sizeof (l2arc_log_blkptr_t));
+ lb_ptr_buf = list_next(&dev->l2ad_lbptr_list,
+ lb_ptr_buf);
+ }
+ }
+
atomic_inc_64(&l2arc_writes_done);
list_remove(buflist, head);
ASSERT(!HDR_HAS_L1HDR(head));
kmem_cache_free(hdr_l2only_cache, head);
mutex_exit(&dev->l2ad_mtx);
+ ASSERT(dev->l2ad_vdev != NULL);
vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
l2arc_do_free_on_write();
@@ -7965,7 +7901,8 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
* until arc_read_done().
*/
if (BP_IS_ENCRYPTED(bp)) {
- abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr);
+ abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
+ B_TRUE);
zio_crypt_decode_params_bp(bp, salt, iv);
zio_crypt_decode_mac_bp(bp, mac);
@@ -8001,7 +7938,8 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
*/
if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
!HDR_COMPRESSION_ENABLED(hdr)) {
- abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr);
+ abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
+ B_TRUE);
void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
@@ -8122,7 +8060,6 @@ l2arc_read_done(zio_t *zio)
zio->io_private = hdr;
arc_read_done(zio);
} else {
- mutex_exit(hash_lock);
/*
* Buffer didn't survive caching. Increment stats and
* reissue to the original storage device.
@@ -8147,10 +8084,24 @@ l2arc_read_done(zio_t *zio)
ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
- zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp,
+ zio = zio_read(pio, zio->io_spa, zio->io_bp,
abd, zio->io_size, arc_read_done,
hdr, zio->io_priority, cb->l2rcb_flags,
- &cb->l2rcb_zb));
+ &cb->l2rcb_zb);
+
+ /*
+ * Original ZIO will be freed, so we need to update
+ * ARC header with the new ZIO pointer to be used
+ * by zio_change_priority() in arc_read().
+ */
+ for (struct arc_callback *acb = hdr->b_l1hdr.b_acb;
+ acb != NULL; acb = acb->acb_next)
+ acb->acb_zio_head = zio;
+
+ mutex_exit(hash_lock);
+ zio_nowait(zio);
+ } else {
+ mutex_exit(hash_lock);
}
}
@@ -8173,7 +8124,7 @@ l2arc_sublist_lock(int list_num)
multilist_t *ml = NULL;
unsigned int idx;
- ASSERT(list_num >= 0 && list_num <= 3);
+ ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES);
switch (list_num) {
case 0:
@@ -8188,6 +8139,8 @@ l2arc_sublist_lock(int list_num)
case 3:
ml = arc_mru->arcs_list[ARC_BUFC_DATA];
break;
+ default:
+ return (NULL);
}
/*
@@ -8201,8 +8154,31 @@ l2arc_sublist_lock(int list_num)
}
/*
+ * Calculates the maximum overhead of L2ARC metadata log blocks for a given
+ * L2ARC write size. l2arc_evict() and l2arc_write_size() need to account
+ * for this overhead to make sure there is enough headroom available when
+ * writing buffers (a worked example follows the function).
+ */
+static inline uint64_t
+l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev)
+{
+ if (dev->l2ad_log_entries == 0) {
+ return (0);
+ } else {
+ uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT;
+
+ uint64_t log_blocks = (log_entries +
+ dev->l2ad_log_entries - 1) /
+ dev->l2ad_log_entries;
+
+ return (vdev_psize_to_asize(dev->l2ad_vdev,
+ sizeof (l2arc_log_blk_phys_t)) * log_blocks);
+ }
+}
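The same calculation restated as a self-contained user-space sketch; the
64 KiB aligned log block size stands in for vdev_psize_to_asize() of
sizeof (l2arc_log_blk_phys_t) and is an assumption, not a value taken from
this change.

	#include <stdio.h>
	#include <stdint.h>

	#define	SPA_MINBLOCKSHIFT	9		/* 512-byte blocks */
	#define	LOG_ENTRIES		1022		/* entries per log block */
	#define	LOG_BLK_ASIZE		(64ULL << 10)	/* assumed aligned size */

	static uint64_t
	log_blk_overhead(uint64_t write_sz)
	{
		uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT;
		uint64_t log_blocks =
		    (log_entries + LOG_ENTRIES - 1) / LOG_ENTRIES;

		return (log_blocks * LOG_BLK_ASIZE);
	}

	int
	main(void)
	{
		/* 8 MiB of buffers -> 16384 entries -> 17 log blocks. */
		printf("%llu bytes of overhead\n",
		    (unsigned long long)log_blk_overhead(8ULL << 20));
		return (0);
	}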
+
+/*
* Evict buffers from the device write hand to the distance specified in
- * bytes. This distance may span populated buffers, it may span nothing.
+ * bytes. This distance may span populated buffers, or it may span nothing.
* This is clearing a region on the L2ARC device ready for writing.
* If the 'all' boolean is set, every buffer is evicted.
*/
@@ -8213,22 +8189,28 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
arc_buf_hdr_t *hdr, *hdr_prev;
kmutex_t *hash_lock;
uint64_t taddr;
+ l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev;
+ boolean_t rerun;
buflist = &dev->l2ad_buflist;
- if (!all && dev->l2ad_first) {
- /*
- * This is the first sweep through the device. There is
- * nothing to evict.
- */
- return;
- }
+ /*
+ * We need to add in the worst case scenario of log block overhead.
+ */
+ distance += l2arc_log_blk_overhead(distance, dev);
- if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
+top:
+ rerun = B_FALSE;
+ if (dev->l2ad_hand >= (dev->l2ad_end - distance)) {
/*
- * When nearing the end of the device, evict to the end
- * before the device write hand jumps to the start.
+ * When there is no space to accommodate upcoming writes,
+ * evict to the end. Then bump the write and evict hands
+ * to the start and iterate. This iteration does not
+ * happen indefinitely as we make sure in
+ * l2arc_write_size() that when the write hand is reset,
+ * the write size does not exceed the end of the device.
*/
+ rerun = B_TRUE;
taddr = dev->l2ad_end;
} else {
taddr = dev->l2ad_hand + distance;
@@ -8236,11 +8218,68 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
uint64_t, taddr, boolean_t, all);
-top:
+ /*
+ * This check has to be placed after deciding whether to iterate
+ * (rerun).
+ */
+ if (!all && dev->l2ad_first) {
+ /*
+ * This is the first sweep through the device. There is
+ * nothing to evict.
+ */
+ goto out;
+ }
+
+ /*
+ * When rebuilding L2ARC we retrieve the evict hand from the header of
+ * the device. Of note, l2arc_evict() does not actually delete buffers
+ * from the cache device, but keeping track of the evict hand will be
+ * useful when TRIM is implemented.
+ */
+ dev->l2ad_evict = MAX(dev->l2ad_evict, taddr);
+
+retry:
mutex_enter(&dev->l2ad_mtx);
+ /*
+ * We have to account for evicted log blocks. Run vdev_space_update()
+ * on log blocks whose offset (in bytes) is before the evicted offset
+ * (in bytes) by searching in the list of pointers to log blocks
+ * present in the L2ARC device.
+ */
+ for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf;
+ lb_ptr_buf = lb_ptr_buf_prev) {
+
+ lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf);
+
+ /* L2BLK_GET_PSIZE returns aligned size for log blocks */
+ uint64_t asize = L2BLK_GET_PSIZE(
+ (lb_ptr_buf->lb_ptr)->lbp_prop);
+
+ /*
+		 * We don't worry about log blocks left behind (i.e. with
+		 * lbp_payload_start < l2ad_hand) because l2arc_write_buffers()
+		 * will never write more than l2arc_evict() evicts.
+ */
+ if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) {
+ break;
+ } else {
+ vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
+ ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
+ ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
+ zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
+ lb_ptr_buf);
+ zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf);
+ list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf);
+ kmem_free(lb_ptr_buf->lb_ptr,
+ sizeof (l2arc_log_blkptr_t));
+ kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
+ }
+ }
+
for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
hdr_prev = list_prev(buflist, hdr);
+ ASSERT(!HDR_EMPTY(hdr));
hash_lock = HDR_LOCK(hdr);
/*
@@ -8256,7 +8295,7 @@ top:
mutex_exit(&dev->l2ad_mtx);
mutex_enter(hash_lock);
mutex_exit(hash_lock);
- goto top;
+ goto retry;
}
/*
@@ -8268,7 +8307,7 @@ top:
ASSERT(!HDR_L2_WRITING(hdr));
ASSERT(!HDR_L2_WRITE_HEAD(hdr));
- if (!all && (hdr->b_l2hdr.b_daddr >= taddr ||
+ if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict ||
hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
/*
* We've evicted to the target address,
@@ -8305,6 +8344,26 @@ top:
mutex_exit(hash_lock);
}
mutex_exit(&dev->l2ad_mtx);
+
+out:
+ /*
+	 * Only rerun for a normal eviction; when evicting everything ('all')
+	 * another pass would be unnecessary.
+ */
+ if (!all && rerun) {
+ /*
+ * Bump device hand to the device start if it is approaching the
+ * end. l2arc_evict() has already evicted ahead for this case.
+ */
+ dev->l2ad_hand = dev->l2ad_start;
+ dev->l2ad_evict = dev->l2ad_start;
+ dev->l2ad_first = B_FALSE;
+ goto top;
+ }
+
+ ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end);
+ if (!dev->l2ad_first)
+ ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict);
}
/*
@@ -8424,6 +8483,17 @@ error:
return (ret);
}
+static void
+l2arc_blk_fetch_done(zio_t *zio)
+{
+ l2arc_read_callback_t *cb;
+
+ cb = zio->io_private;
+ if (cb->l2rcb_abd != NULL)
+ abd_put(cb->l2rcb_abd);
+ kmem_free(cb, sizeof (l2arc_read_callback_t));
+}
+
/*
* Find and write ARC buffers to the L2ARC device.
*
@@ -8433,17 +8503,19 @@ error:
* state between calls to this function.
*
* Returns the number of bytes actually written (which may be smaller than
- * the delta by which the device hand has changed due to alignment).
+ * the delta by which the device hand has changed due to alignment and the
+ * writing of log blocks).
*/
static uint64_t
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
{
- arc_buf_hdr_t *hdr, *hdr_prev, *head;
- uint64_t write_asize, write_psize, write_lsize, headroom;
- boolean_t full;
- l2arc_write_callback_t *cb;
- zio_t *pio, *wzio;
- uint64_t guid = spa_load_guid(spa);
+ arc_buf_hdr_t *hdr, *hdr_prev, *head;
+ uint64_t write_asize, write_psize, write_lsize, headroom;
+ boolean_t full;
+ l2arc_write_callback_t *cb = NULL;
+ zio_t *pio, *wzio;
+ uint64_t guid = spa_load_guid(spa);
+ l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
ASSERT3P(dev->l2ad_vdev, !=, NULL);
@@ -8456,7 +8528,16 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
/*
* Copy buffers for L2ARC writing.
*/
- for (int try = 0; try <= 3; try++) {
+ for (int try = 0; try < L2ARC_FEED_TYPES; try++) {
+ /*
+		 * Lists 1 and 3 hold MRU metadata and data respectively,
+		 * so skip them when only MFU content should be cached.
+ */
+ if (l2arc_mfuonly) {
+ if (try == 1 || try == 3)
+ continue;
+ }
+
multilist_sublist_t *mls = l2arc_sublist_lock(try);
uint64_t passed_sz = 0;
@@ -8495,7 +8576,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
}
passed_sz += HDR_GET_LSIZE(hdr);
- if (passed_sz > headroom) {
+ if (l2arc_headroom != 0 && passed_sz > headroom) {
/*
* Searched too far.
*/
@@ -8508,12 +8589,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
continue;
}
- /*
- * We rely on the L1 portion of the header below, so
- * it's invalid for this header to have been evicted out
- * of the ghost cache, prior to being written out. The
- * ARC_FLAG_L2_WRITING bit ensures this won't happen.
- */
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
@@ -8537,12 +8612,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
* ARC_FLAG_L2_WRITING bit ensures this won't happen.
*/
arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING);
- ASSERT(HDR_HAS_L1HDR(hdr));
-
- ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
- ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
- HDR_HAS_RABD(hdr));
- ASSERT3U(arc_hdr_size(hdr), >, 0);
/*
* If this header has b_rabd, we can use this since it
@@ -8595,12 +8664,21 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
sizeof (l2arc_write_callback_t), KM_SLEEP);
cb->l2wcb_dev = dev;
cb->l2wcb_head = head;
+ /*
+ * Create a list to save allocated abd buffers
+ * for l2arc_log_blk_commit().
+ */
+ list_create(&cb->l2wcb_abd_list,
+ sizeof (l2arc_lb_abd_buf_t),
+ offsetof(l2arc_lb_abd_buf_t, node));
pio = zio_root(spa, l2arc_write_done, cb,
ZIO_FLAG_CANFAIL);
}
hdr->b_l2hdr.b_dev = dev;
hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
+ hdr->b_l2hdr.b_arcs_state =
+ hdr->b_l1hdr.b_state->arcs_state;
arc_hdr_set_flags(hdr,
ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR);
@@ -8624,10 +8702,19 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
write_psize += psize;
write_asize += asize;
dev->l2ad_hand += asize;
+ l2arc_hdr_arcstats_increment(hdr);
vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
mutex_exit(hash_lock);
+ /*
+ * Append buf info to current log and commit if full.
+ * arcstat_l2_{size,asize} kstats are updated
+ * internally.
+ */
+ if (l2arc_log_blk_insert(dev, hdr))
+ l2arc_log_blk_commit(dev, pio, cb);
+
(void) zio_nowait(wzio);
}
@@ -8642,31 +8729,47 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
ASSERT0(write_lsize);
ASSERT(!HDR_HAS_L1HDR(head));
kmem_cache_free(hdr_l2only_cache, head);
+
+ /*
+ * Although we did not write any buffers l2ad_evict may
+	 * Although we did not write any buffers, l2ad_evict may
+ */
+ if (dev->l2ad_evict != l2dhdr->dh_evict)
+ l2arc_dev_hdr_update(dev);
+
return (0);
}
+ if (!dev->l2ad_first)
+ ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict);
+
ASSERT3U(write_asize, <=, target_sz);
ARCSTAT_BUMP(arcstat_l2_writes_sent);
ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);
- ARCSTAT_INCR(arcstat_l2_lsize, write_lsize);
- ARCSTAT_INCR(arcstat_l2_psize, write_psize);
-
- /*
- * Bump device hand to the device start if it is approaching the end.
- * l2arc_evict() will already have evicted ahead for this case.
- */
- if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
- dev->l2ad_hand = dev->l2ad_start;
- dev->l2ad_first = B_FALSE;
- }
dev->l2ad_writing = B_TRUE;
(void) zio_wait(pio);
dev->l2ad_writing = B_FALSE;
+ /*
+ * Update the device header after the zio completes as
+ * l2arc_write_done() may have updated the memory holding the log block
+ * pointers in the device header.
+ */
+ l2arc_dev_hdr_update(dev);
+
return (write_asize);
}
+static boolean_t
+l2arc_hdr_limit_reached(void)
+{
+ int64_t s = aggsum_upper_bound(&astat_l2_hdr_size);
+
+ return (arc_reclaim_needed() || (s > arc_meta_limit * 3 / 4) ||
+ (s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100));
+}
+
/*
* This thread feeds the L2ARC at regular intervals. This is the beating
* heart of the L2ARC.
@@ -8732,7 +8835,7 @@ l2arc_feed_thread(void *unused)
/*
* Avoid contributing to memory pressure.
*/
- if (arc_reclaim_needed()) {
+ if (l2arc_hdr_limit_reached()) {
ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
spa_config_exit(spa, SCL_L2ARC, dev);
continue;
@@ -8740,7 +8843,7 @@ l2arc_feed_thread(void *unused)
ARCSTAT_BUMP(arcstat_l2_feeds);
- size = l2arc_write_size();
+ size = l2arc_write_size(dev);
/*
* Evict L2ARC buffers that will be overwritten.
@@ -8768,7 +8871,17 @@ l2arc_feed_thread(void *unused)
boolean_t
l2arc_vdev_present(vdev_t *vd)
{
- l2arc_dev_t *dev;
+ return (l2arc_vdev_get(vd) != NULL);
+}
+
+/*
+ * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if
+ * the vdev_t isn't an L2ARC device.
+ */
+static l2arc_dev_t *
+l2arc_vdev_get(vdev_t *vd)
+{
+ l2arc_dev_t *dev;
mutex_enter(&l2arc_dev_mtx);
for (dev = list_head(l2arc_dev_list); dev != NULL;
@@ -8778,7 +8891,7 @@ l2arc_vdev_present(vdev_t *vd)
}
mutex_exit(&l2arc_dev_mtx);
- return (dev != NULL);
+ return (dev);
}
/*
@@ -8788,7 +8901,8 @@ l2arc_vdev_present(vdev_t *vd)
void
l2arc_add_vdev(spa_t *spa, vdev_t *vd)
{
- l2arc_dev_t *adddev;
+ l2arc_dev_t *adddev;
+ uint64_t l2dhdr_asize;
ASSERT(!l2arc_vdev_present(vd));
@@ -8798,11 +8912,17 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
adddev->l2ad_spa = spa;
adddev->l2ad_vdev = vd;
- adddev->l2ad_start = VDEV_LABEL_START_SIZE;
+ /* leave extra size for an l2arc device header */
+ l2dhdr_asize = adddev->l2ad_dev_hdr_asize =
+ MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift);
+ adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize;
adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
+ ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end);
adddev->l2ad_hand = adddev->l2ad_start;
+ adddev->l2ad_evict = adddev->l2ad_start;
adddev->l2ad_first = B_TRUE;
adddev->l2ad_writing = B_FALSE;
+ adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP);
mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
/*
@@ -8812,8 +8932,17 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
+ /*
+ * This is a list of pointers to log blocks that are still present
+ * on the device.
+ */
+ list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t),
+ offsetof(l2arc_lb_ptr_buf_t, node));
+
vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
zfs_refcount_create(&adddev->l2ad_alloc);
+ zfs_refcount_create(&adddev->l2ad_lb_asize);
+ zfs_refcount_create(&adddev->l2ad_lb_count);
/*
* Add device to global list
@@ -8822,6 +8951,82 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
list_insert_head(l2arc_dev_list, adddev);
atomic_inc_64(&l2arc_ndev);
mutex_exit(&l2arc_dev_mtx);
+
+ /*
+ * Decide if vdev is eligible for L2ARC rebuild
+ */
+ l2arc_rebuild_vdev(adddev->l2ad_vdev, B_FALSE);
+}
+
+void
+l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen)
+{
+ l2arc_dev_t *dev = NULL;
+ l2arc_dev_hdr_phys_t *l2dhdr;
+ uint64_t l2dhdr_asize;
+ spa_t *spa;
+
+ dev = l2arc_vdev_get(vd);
+ ASSERT3P(dev, !=, NULL);
+ spa = dev->l2ad_spa;
+ l2dhdr = dev->l2ad_dev_hdr;
+ l2dhdr_asize = dev->l2ad_dev_hdr_asize;
+
+ /*
+	 * The L2ARC has to hold at least the payload of one log block for
+	 * its buffers to be restored (persistent L2ARC). The payload of a
+	 * log block depends on the number of its log entries. We always
+	 * write log blocks with 1022 entries. How many of them are committed
+	 * or restored depends on the size of the L2ARC device. Thus the
+	 * maximum payload of one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB.
+	 * If the L2ARC device is smaller than that, we reduce the number of
+	 * committed and restored log entries per block so as to enable
+	 * persistence.
+ if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) {
+ dev->l2ad_log_entries = 0;
+ } else {
+ dev->l2ad_log_entries = MIN((dev->l2ad_end -
+ dev->l2ad_start) >> SPA_MAXBLOCKSHIFT,
+ L2ARC_LOG_BLK_MAX_ENTRIES);
+ }
+
+ /*
+	 * Read the device header; if an error is returned, do not rebuild L2ARC.
+ */
+ if (l2arc_dev_hdr_read(dev) == 0 && dev->l2ad_log_entries > 0) {
+ /*
+ * If we are onlining a cache device (vdev_reopen) that was
+ * still present (l2arc_vdev_present()) and rebuild is enabled,
+ * we should evict all ARC buffers and pointers to log blocks
+ * and reclaim their space before restoring its contents to
+ * L2ARC.
+ */
+ if (reopen) {
+ if (!l2arc_rebuild_enabled) {
+ return;
+ } else {
+ l2arc_evict(dev, 0, B_TRUE);
+ /* start a new log block */
+ dev->l2ad_log_ent_idx = 0;
+ dev->l2ad_log_blk_payload_asize = 0;
+ dev->l2ad_log_blk_payload_start = 0;
+ }
+ }
+ /*
+ * Just mark the device as pending for a rebuild. We won't
+ * be starting a rebuild in line here as it would block pool
+ * import. Instead spa_load_impl will hand that off to an
+ * async task which will call l2arc_spa_rebuild_start.
+ */
+ dev->l2ad_rebuild = B_TRUE;
+ } else if (spa_writeable(spa)) {
+ /*
+ * In this case create a new header. We zero out the memory
+ * holding the header to reset dh_start_lbps.
+ */
+ bzero(l2dhdr, l2dhdr_asize);
+ l2arc_dev_hdr_update(dev);
+ }
}
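To put numbers on the sizing above (SPA_MAXBLOCKSHIFT is 24, i.e. 16 MiB
maximum blocks): a 4 GiB cache device gets (4 GiB >> 24) = 256 log entries
per block, any device of roughly 16 GiB or more uses the full
L2ARC_LOG_BLK_MAX_ENTRIES (1022), and a device below
l2arc_rebuild_blocks_min_l2size gets zero entries and is not persisted.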
/*
@@ -8830,24 +9035,29 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
void
l2arc_remove_vdev(vdev_t *vd)
{
- l2arc_dev_t *dev, *nextdev, *remdev = NULL;
+ l2arc_dev_t *remdev = NULL;
/*
* Find the device by vdev
*/
- mutex_enter(&l2arc_dev_mtx);
- for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
- nextdev = list_next(l2arc_dev_list, dev);
- if (vd == dev->l2ad_vdev) {
- remdev = dev;
- break;
- }
- }
+ remdev = l2arc_vdev_get(vd);
ASSERT3P(remdev, !=, NULL);
/*
+ * Cancel any ongoing or scheduled rebuild.
+ */
+ mutex_enter(&l2arc_rebuild_thr_lock);
+ if (remdev->l2ad_rebuild_began == B_TRUE) {
+ remdev->l2ad_rebuild_cancel = B_TRUE;
+ while (remdev->l2ad_rebuild == B_TRUE)
+ cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock);
+ }
+ mutex_exit(&l2arc_rebuild_thr_lock);
+
+ /*
* Remove device from global list
*/
+ mutex_enter(&l2arc_dev_mtx);
list_remove(l2arc_dev_list, remdev);
l2arc_dev_last = NULL; /* may have been invalidated */
atomic_dec_64(&l2arc_ndev);
@@ -8858,8 +9068,13 @@ l2arc_remove_vdev(vdev_t *vd)
*/
l2arc_evict(remdev, 0, B_TRUE);
list_destroy(&remdev->l2ad_buflist);
+ ASSERT(list_is_empty(&remdev->l2ad_lbptr_list));
+ list_destroy(&remdev->l2ad_lbptr_list);
mutex_destroy(&remdev->l2ad_mtx);
zfs_refcount_destroy(&remdev->l2ad_alloc);
+ zfs_refcount_destroy(&remdev->l2ad_lb_asize);
+ zfs_refcount_destroy(&remdev->l2ad_lb_count);
+ kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize);
kmem_free(remdev, sizeof (l2arc_dev_t));
}
@@ -8873,6 +9088,8 @@ l2arc_init(void)
mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
@@ -8897,6 +9114,8 @@ l2arc_fini(void)
mutex_destroy(&l2arc_feed_thr_lock);
cv_destroy(&l2arc_feed_thr_cv);
+ mutex_destroy(&l2arc_rebuild_thr_lock);
+ cv_destroy(&l2arc_rebuild_thr_cv);
mutex_destroy(&l2arc_dev_mtx);
mutex_destroy(&l2arc_free_on_write_mtx);
@@ -8927,3 +9146,916 @@ l2arc_stop(void)
cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
mutex_exit(&l2arc_feed_thr_lock);
}
+
+/*
+ * Punches out rebuild threads for the L2ARC devices in a spa. This should
+ * be called after pool import from the spa async thread, since starting
+ * these threads directly from spa_import() will make them part of the
+ * "zpool import" context and delay process exit (and thus pool import).
+ */
+void
+l2arc_spa_rebuild_start(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ /*
+ * Locate the spa's l2arc devices and kick off rebuild threads.
+ */
+ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
+ l2arc_dev_t *dev =
+ l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
+ if (dev == NULL) {
+ /* Don't attempt a rebuild if the vdev is UNAVAIL */
+ continue;
+ }
+ mutex_enter(&l2arc_rebuild_thr_lock);
+ if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) {
+ dev->l2ad_rebuild_began = B_TRUE;
+ (void) thread_create(NULL, 0,
+ (void (*)(void *))l2arc_dev_rebuild_start,
+ dev, 0, &p0, TS_RUN, minclsyspri);
+ }
+ mutex_exit(&l2arc_rebuild_thr_lock);
+ }
+}
+
+/*
+ * Main entry point for L2ARC rebuilding.
+ */
+static void
+l2arc_dev_rebuild_start(l2arc_dev_t *dev)
+{
+ VERIFY(!dev->l2ad_rebuild_cancel);
+ VERIFY(dev->l2ad_rebuild);
+ (void) l2arc_rebuild(dev);
+ mutex_enter(&l2arc_rebuild_thr_lock);
+ dev->l2ad_rebuild_began = B_FALSE;
+ dev->l2ad_rebuild = B_FALSE;
+ mutex_exit(&l2arc_rebuild_thr_lock);
+
+ thread_exit();
+}
+
+/*
+ * This function implements the actual L2ARC metadata rebuild. It:
+ * starts reading the log block chain and restores each block's contents
+ * to memory (reconstructing arc_buf_hdr_t's).
+ *
+ * Operation stops under any of the following conditions:
+ *
+ * 1) We reach the end of the log block chain.
+ * 2) We encounter *any* error condition (cksum errors, io errors)
+ */
+static int
+l2arc_rebuild(l2arc_dev_t *dev)
+{
+ vdev_t *vd = dev->l2ad_vdev;
+ spa_t *spa = vd->vdev_spa;
+ int err = 0;
+ l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
+ l2arc_log_blk_phys_t *this_lb, *next_lb;
+ zio_t *this_io = NULL, *next_io = NULL;
+ l2arc_log_blkptr_t lbps[2];
+ l2arc_lb_ptr_buf_t *lb_ptr_buf;
+ boolean_t lock_held;
+
+ this_lb = kmem_zalloc(sizeof (*this_lb), KM_SLEEP);
+ next_lb = kmem_zalloc(sizeof (*next_lb), KM_SLEEP);
+
+ /*
+ * We prevent device removal while issuing reads to the device,
+ * then during the rebuilding phases we drop this lock again so
+ * that a spa_unload or device remove can be initiated - this is
+ * safe, because the spa will signal us to stop before removing
+ * our device and wait for us to stop.
+ */
+ spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
+ lock_held = B_TRUE;
+
+ /*
+ * Retrieve the persistent L2ARC device state.
+ * L2BLK_GET_PSIZE returns aligned size for log blocks.
+ */
+ dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start);
+ dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr +
+ L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop),
+ dev->l2ad_start);
+ dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);
+
+ /*
+ * In case the zfs module parameter l2arc_rebuild_enabled is false
+ * we do not start the rebuild process.
+ */
+ if (!l2arc_rebuild_enabled)
+ goto out;
+
+ /* Prepare the rebuild process */
+ bcopy(l2dhdr->dh_start_lbps, lbps, sizeof (lbps));
+
+ /* Start the rebuild process */
+ for (;;) {
+ if (!l2arc_log_blkptr_valid(dev, &lbps[0]))
+ break;
+
+ if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1],
+ this_lb, next_lb, this_io, &next_io)) != 0)
+ goto out;
+
+ /*
+ * Our memory pressure valve. If the system is running low
+ * on memory, rather than swamping memory with new ARC buf
+ * hdrs, we opt not to rebuild the L2ARC. At this point,
+ * however, we have already set up our L2ARC dev to chain in
+ * new metadata log blocks, so the user may choose to offline/
+ * online the L2ARC dev at a later time (or re-import the pool)
+ * to reconstruct it (when there's less memory pressure).
+ */
+ if (l2arc_hdr_limit_reached()) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
+ cmn_err(CE_NOTE, "System running low on memory, "
+ "aborting L2ARC rebuild.");
+ err = SET_ERROR(ENOMEM);
+ goto out;
+ }
+
+ spa_config_exit(spa, SCL_L2ARC, vd);
+ lock_held = B_FALSE;
+
+ /*
+ * Now that we know that the next_lb checks out alright, we
+ * can start reconstruction from this log block.
+ * L2BLK_GET_PSIZE returns aligned size for log blocks.
+ */
+ uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
+ l2arc_log_blk_restore(dev, this_lb, asize);
+
+ /*
+ * log block restored, include its pointer in the list of
+ * pointers to log blocks present in the L2ARC device.
+ */
+ lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
+ lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t),
+ KM_SLEEP);
+ bcopy(&lbps[0], lb_ptr_buf->lb_ptr,
+ sizeof (l2arc_log_blkptr_t));
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf);
+ ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
+ ARCSTAT_BUMP(arcstat_l2_log_blk_count);
+ zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
+ zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
+ mutex_exit(&dev->l2ad_mtx);
+ vdev_space_update(vd, asize, 0, 0);
+
+ /* BEGIN CSTYLED */
+ /*
+ * Protection against loops of log blocks:
+ *
+ * l2ad_hand l2ad_evict
+ * V V
+ * l2ad_start |=======================================| l2ad_end
+ * -----|||----|||---|||----|||
+ * (3) (2) (1) (0)
+ * ---|||---|||----|||---|||
+ * (7) (6) (5) (4)
+ *
+ * In this situation the pointer of log block (4) passes
+ * l2arc_log_blkptr_valid() but the log block should not be
+ * restored as it is overwritten by the payload of log block
+ * (0). Only log blocks (0)-(3) should be restored. We check
+ * whether l2ad_evict lies in between the payload starting
+ * offset of the next log block (lbps[1].lbp_payload_start)
+ * and the payload starting offset of the present log block
+ * (lbps[0].lbp_payload_start). If true and this isn't the
+ * first pass, we are looping from the beginning and we should
+ * stop.
+ */
+ /* END CSTYLED */
+ if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,
+ lbps[0].lbp_payload_start, dev->l2ad_evict) &&
+ !dev->l2ad_first)
+ goto out;
+
+ for (;;) {
+ mutex_enter(&l2arc_rebuild_thr_lock);
+ if (dev->l2ad_rebuild_cancel) {
+ dev->l2ad_rebuild = B_FALSE;
+ cv_signal(&l2arc_rebuild_thr_cv);
+ mutex_exit(&l2arc_rebuild_thr_lock);
+ err = SET_ERROR(ECANCELED);
+ goto out;
+ }
+ mutex_exit(&l2arc_rebuild_thr_lock);
+ if (spa_config_tryenter(spa, SCL_L2ARC, vd,
+ RW_READER)) {
+ lock_held = B_TRUE;
+ break;
+ }
+ /*
+			 * The L2ARC config lock is held as writer by somebody
+			 * else, possibly because they are trying to remove us.
+			 * They'll likely want us to shut down, so after a
+			 * little delay, we check l2ad_rebuild_cancel and
+			 * retry the lock.
+ */
+ delay(1);
+ }
+
+ /*
+ * Continue with the next log block.
+ */
+ lbps[0] = lbps[1];
+ lbps[1] = this_lb->lb_prev_lbp;
+ PTR_SWAP(this_lb, next_lb);
+ this_io = next_io;
+ next_io = NULL;
+ }
+
+ if (this_io != NULL)
+ l2arc_log_blk_fetch_abort(this_io);
+out:
+ if (next_io != NULL)
+ l2arc_log_blk_fetch_abort(next_io);
+ kmem_free(this_lb, sizeof (*this_lb));
+ kmem_free(next_lb, sizeof (*next_lb));
+
+ if (!l2arc_rebuild_enabled) {
+ spa_history_log_internal(spa, "L2ARC rebuild", NULL,
+ "disabled");
+ } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_success);
+ spa_history_log_internal(spa, "L2ARC rebuild", NULL,
+ "successful, restored %llu blocks",
+ (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
+ } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) {
+ /*
+ * No error but also nothing restored, meaning the lbps array
+ * in the device header points to invalid/non-present log
+ * blocks. Reset the header.
+ */
+ spa_history_log_internal(spa, "L2ARC rebuild", NULL,
+ "no valid log blocks");
+ bzero(l2dhdr, dev->l2ad_dev_hdr_asize);
+ l2arc_dev_hdr_update(dev);
+ } else if (err == ECANCELED) {
+ /*
+ * In case the rebuild was canceled do not log to spa history
+ * log as the pool may be in the process of being removed.
+ */
+		zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks",
+		    (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
+ } else if (err != 0) {
+ spa_history_log_internal(spa, "L2ARC rebuild", NULL,
+ "aborted, restored %llu blocks",
+ (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
+ }
+
+ if (lock_held)
+ spa_config_exit(spa, SCL_L2ARC, vd);
+
+ return (err);
+}
+
+/*
+ * Attempts to read the device header on the provided L2ARC device and writes
+ * it to the device's in-core copy (l2ad_dev_hdr). On success, this function
+ * returns 0, otherwise the appropriate error code is returned.
+ */
+static int
+l2arc_dev_hdr_read(l2arc_dev_t *dev)
+{
+ int err;
+ uint64_t guid;
+ l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
+ const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize;
+ abd_t *abd;
+
+ guid = spa_guid(dev->l2ad_vdev->vdev_spa);
+
+ abd = abd_get_from_buf(l2dhdr, l2dhdr_asize);
+
+ err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
+ VDEV_LABEL_START_SIZE, l2dhdr_asize, abd,
+ ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
+ ZIO_FLAG_SPECULATIVE, B_FALSE));
+
+ abd_put(abd);
+
+ if (err != 0) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors);
+ zfs_dbgmsg("L2ARC IO error (%d) while reading device header, "
+ "vdev guid: %llu", err, dev->l2ad_vdev->vdev_guid);
+ return (err);
+ }
+
+ if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
+ byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr));
+
+ if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC ||
+ l2dhdr->dh_spa_guid != guid ||
+ l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid ||
+ l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION ||
+ l2dhdr->dh_log_entries != dev->l2ad_log_entries ||
+ l2dhdr->dh_end != dev->l2ad_end ||
+ !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end,
+ l2dhdr->dh_evict)) {
+ /*
+ * Attempt to rebuild a device containing no actual dev hdr
+ * or containing a header from some other pool or from another
+ * version of persistent L2ARC.
+ */
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ return (0);
+}
+
+/*
+ * Reads L2ARC log blocks from storage and validates their contents.
+ *
+ * This function implements a simple fetcher to make sure that while
+ * we're processing one buffer the L2ARC is already fetching the next
+ * one in the chain.
+ *
+ * The arguments this_lbp and next_lbp point to the current and next log
+ * block address in the block chain. Similarly, this_lb and next_lb hold the
+ * l2arc_log_blk_phys_t's of the current and next L2ARC blk.
+ *
+ * The `this_io' and `next_io' arguments are used for block fetching.
+ * When issuing the first blk IO during rebuild, you should pass NULL for
+ * `this_io'. This function will then issue a sync IO to read the block and
+ * also issue an async IO to fetch the next block in the block chain. The
+ * fetched IO is returned in `next_io'. On subsequent calls to this
+ * function, pass the value returned in `next_io' from the previous call
+ * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO.
+ * Prior to the call, you should initialize your `next_io' pointer to be
+ * NULL. If no fetch IO was issued, the pointer is left set at NULL.
+ *
+ * On success, this function returns 0, otherwise it returns an appropriate
+ * error code. On error the fetching IO is aborted and cleared before
+ * returning from this function. Therefore, if we return `success', the
+ * caller can assume that we have taken care of cleanup of fetch IOs.
+ */
+static int
+l2arc_log_blk_read(l2arc_dev_t *dev,
+ const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp,
+ l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
+ zio_t *this_io, zio_t **next_io)
+{
+ int err = 0;
+ zio_cksum_t cksum;
+ abd_t *abd = NULL;
+ uint64_t asize;
+
+ ASSERT(this_lbp != NULL && next_lbp != NULL);
+ ASSERT(this_lb != NULL && next_lb != NULL);
+ ASSERT(next_io != NULL && *next_io == NULL);
+ ASSERT(l2arc_log_blkptr_valid(dev, this_lbp));
+
+ /*
+ * Check to see if we have issued the IO for this log block in a
+ * previous run. If not, this is the first call, so issue it now.
+ */
+ if (this_io == NULL) {
+ this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp,
+ this_lb);
+ }
+
+ /*
+ * Peek to see if we can start issuing the next IO immediately.
+ */
+ if (l2arc_log_blkptr_valid(dev, next_lbp)) {
+ /*
+ * Start issuing IO for the next log block early - this
+ * should help keep the L2ARC device busy while we
+ * decompress and restore this log block.
+ */
+ *next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp,
+ next_lb);
+ }
+
+ /* Wait for the IO to read this log block to complete */
+ if ((err = zio_wait(this_io)) != 0) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
+ zfs_dbgmsg("L2ARC IO error (%d) while reading log block, "
+ "offset: %llu, vdev guid: %llu", err, this_lbp->lbp_daddr,
+ dev->l2ad_vdev->vdev_guid);
+ goto cleanup;
+ }
+
+ /*
+ * Make sure the buffer checks out.
+ * L2BLK_GET_PSIZE returns aligned size for log blocks.
+ */
+ asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop);
+ fletcher_4_native(this_lb, asize, NULL, &cksum);
+ if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors);
+ zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, "
+ "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu",
+ this_lbp->lbp_daddr, dev->l2ad_vdev->vdev_guid,
+ dev->l2ad_hand, dev->l2ad_evict);
+ err = SET_ERROR(ECKSUM);
+ goto cleanup;
+ }
+
+ /* Now we can take our time decoding this buffer */
+ switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) {
+ case ZIO_COMPRESS_OFF:
+ break;
+ case ZIO_COMPRESS_LZ4:
+ abd = abd_alloc_for_io(asize, B_TRUE);
+ abd_copy_from_buf_off(abd, this_lb, 0, asize);
+ if ((err = zio_decompress_data(
+ L2BLK_GET_COMPRESS((this_lbp)->lbp_prop),
+ abd, this_lb, asize, sizeof (*this_lb))) != 0) {
+ err = SET_ERROR(EINVAL);
+ goto cleanup;
+ }
+ break;
+ default:
+ err = SET_ERROR(EINVAL);
+ goto cleanup;
+ }
+ if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
+ byteswap_uint64_array(this_lb, sizeof (*this_lb));
+ if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) {
+ err = SET_ERROR(EINVAL);
+ goto cleanup;
+ }
+cleanup:
+ /* Abort an in-flight fetch I/O in case of error */
+ if (err != 0 && *next_io != NULL) {
+ l2arc_log_blk_fetch_abort(*next_io);
+ *next_io = NULL;
+ }
+ if (abd != NULL)
+ abd_free(abd);
+ return (err);
+}
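A condensed skeleton of the calling convention described above, mirroring
the loop in l2arc_rebuild() earlier in this change (the restore step and
surrounding locking are elided):

	zio_t *this_io = NULL, *next_io = NULL;

	while (l2arc_log_blkptr_valid(dev, &lbps[0])) {
		/* On the first pass this_io is NULL, so the read is issued. */
		if (l2arc_log_blk_read(dev, &lbps[0], &lbps[1],
		    this_lb, next_lb, this_io, &next_io) != 0) {
			/* Both fetch IOs were consumed or aborted for us. */
			this_io = NULL;
			break;
		}
		/* ... restore this_lb's payload ... */
		lbps[0] = lbps[1];
		lbps[1] = this_lb->lb_prev_lbp;
		this_io = next_io;	/* hand the prefetched IO back in */
		next_io = NULL;
	}
	/* Needed when the loop exits early (e.g. cancel, low memory). */
	if (this_io != NULL)
		l2arc_log_blk_fetch_abort(this_io);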
+
+/*
+ * Restores the payload of a log block to ARC. This creates empty ARC hdr
+ * entries which only contain an l2arc hdr, essentially restoring the
+ * buffers to their L2ARC evicted state. This function also updates space
+ * usage on the L2ARC vdev to make sure it tracks restored buffers.
+ */
+static void
+l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb,
+ uint64_t lb_asize)
+{
+ uint64_t size = 0, asize = 0;
+ uint64_t log_entries = dev->l2ad_log_entries;
+
+ /*
+ * Usually arc_adapt() is called only for data, not headers, but
+ * since we may allocate significant amount of memory here, let ARC
+	 * since we may allocate a significant amount of memory here, let ARC
+ */
+ arc_adapt(log_entries * HDR_L2ONLY_SIZE, arc_l2c_only);
+
+ for (int i = log_entries - 1; i >= 0; i--) {
+ /*
+ * Restore goes in the reverse temporal direction to preserve
+ * correct temporal ordering of buffers in the l2ad_buflist.
+ * l2arc_hdr_restore also does a list_insert_tail instead of
+ * list_insert_head on the l2ad_buflist:
+ *
+ * LIST l2ad_buflist LIST
+ * HEAD <------ (time) ------ TAIL
+ * direction +-----+-----+-----+-----+-----+ direction
+ * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild
+ * fill +-----+-----+-----+-----+-----+
+ * ^ ^
+ * | |
+ * | |
+ * l2arc_feed_thread l2arc_rebuild
+ * will place new bufs here restores bufs here
+ *
+ * During l2arc_rebuild() the device is not used by
+ * l2arc_feed_thread() as dev->l2ad_rebuild is set to true.
+ */
+ size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop);
+ asize += vdev_psize_to_asize(dev->l2ad_vdev,
+ L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop));
+ l2arc_hdr_restore(&lb->lb_entries[i], dev);
+ }
+
+ /*
+ * Record rebuild stats:
+ * size Logical size of restored buffers in the L2ARC
+ * asize Aligned size of restored buffers in the L2ARC
+ */
+ ARCSTAT_INCR(arcstat_l2_rebuild_size, size);
+ ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize);
+ ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries);
+ ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize);
+ ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize);
+ ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks);
+}
+
+/*
+ * Restores a single ARC buf hdr from a log entry. The ARC buffer is put
+ * into a state indicating that it has been evicted to L2ARC.
+ */
+static void
+l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev)
+{
+ arc_buf_hdr_t *hdr, *exists;
+ kmutex_t *hash_lock;
+ arc_buf_contents_t type = L2BLK_GET_TYPE((le)->le_prop);
+ uint64_t asize;
+
+ /*
+ * Do all the allocation before grabbing any locks, this lets us
+ * sleep if memory is full and we don't have to deal with failed
+ * allocations.
+ */
+ hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type,
+ dev, le->le_dva, le->le_daddr,
+ L2BLK_GET_PSIZE((le)->le_prop), le->le_birth,
+ L2BLK_GET_COMPRESS((le)->le_prop),
+ L2BLK_GET_PROTECTED((le)->le_prop),
+ L2BLK_GET_PREFETCH((le)->le_prop),
+ L2BLK_GET_STATE((le)->le_prop));
+ asize = vdev_psize_to_asize(dev->l2ad_vdev,
+ L2BLK_GET_PSIZE((le)->le_prop));
+
+ /*
+ * vdev_space_update() has to be called before arc_hdr_destroy() to
+ * avoid underflow since the latter also calls vdev_space_update().
+ */
+ l2arc_hdr_arcstats_increment(hdr);
+ vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_tail(&dev->l2ad_buflist, hdr);
+ (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
+ mutex_exit(&dev->l2ad_mtx);
+
+ exists = buf_hash_insert(hdr, &hash_lock);
+ if (exists) {
+ /* Buffer was already cached, no need to restore it. */
+ arc_hdr_destroy(hdr);
+ /*
+ * If the buffer is already cached, check whether it has
+		 * L2ARC metadata. If not, restore the metadata and set the
+		 * flag. This is important in case of onlining a cache device,
+		 * since we previously evicted all L2ARC metadata from ARC.
+ */
+ if (!HDR_HAS_L2HDR(exists)) {
+ arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR);
+ exists->b_l2hdr.b_dev = dev;
+ exists->b_l2hdr.b_daddr = le->le_daddr;
+ exists->b_l2hdr.b_arcs_state =
+ L2BLK_GET_STATE((le)->le_prop);
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_tail(&dev->l2ad_buflist, exists);
+ (void) zfs_refcount_add_many(&dev->l2ad_alloc,
+ arc_hdr_size(exists), exists);
+ mutex_exit(&dev->l2ad_mtx);
+ l2arc_hdr_arcstats_increment(exists);
+ vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+ }
+ ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
+ }
+
+ mutex_exit(hash_lock);
+}
+
+/*
+ * Starts an asynchronous read IO to read a log block. This is used in log
+ * block reconstruction to start reading the next block before we are done
+ * decoding and reconstructing the current block, to keep the l2arc device
+ * nice and hot with read IO to process.
+ * The returned zio will contain newly allocated memory buffers for the IO
+ * data which should then be freed by the caller once the zio is no longer
+ * needed (i.e. due to it having completed). If you wish to abort this
+ * zio, you should do so using l2arc_log_blk_fetch_abort, which takes
+ * care of disposing of the allocated buffers correctly.
+ */
+static zio_t *
+l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp,
+ l2arc_log_blk_phys_t *lb)
+{
+ uint32_t asize;
+ zio_t *pio;
+ l2arc_read_callback_t *cb;
+
+ /* L2BLK_GET_PSIZE returns aligned size for log blocks */
+ asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
+ ASSERT(asize <= sizeof (l2arc_log_blk_phys_t));
+
+ cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP);
+ cb->l2rcb_abd = abd_get_from_buf(lb, asize);
+ pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY);
+ (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize,
+ cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE));
+
+ return (pio);
+}
+
+/*
+ * Aborts a zio returned from l2arc_log_blk_fetch and frees the data
+ * buffers allocated for it.
+ */
+static void
+l2arc_log_blk_fetch_abort(zio_t *zio)
+{
+ (void) zio_wait(zio);
+}
+
+/*
+ * Creates a zio to update the device header on an l2arc device.
+ */
+static void
+l2arc_dev_hdr_update(l2arc_dev_t *dev)
+{
+ l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
+ const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize;
+ abd_t *abd;
+ int err;
+
+ VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER));
+
+ l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC;
+ l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION;
+ l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
+ l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid;
+ l2dhdr->dh_log_entries = dev->l2ad_log_entries;
+ l2dhdr->dh_evict = dev->l2ad_evict;
+ l2dhdr->dh_start = dev->l2ad_start;
+ l2dhdr->dh_end = dev->l2ad_end;
+ l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize);
+ l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count);
+ l2dhdr->dh_flags = 0;
+ if (dev->l2ad_first)
+ l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
+
+ abd = abd_get_from_buf(l2dhdr, l2dhdr_asize);
+
+ err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev,
+ VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL,
+ NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE));
+
+ abd_put(abd);
+
+ if (err != 0) {
+ zfs_dbgmsg("L2ARC IO error (%d) while writing device header, "
+ "vdev guid: %llu", err, dev->l2ad_vdev->vdev_guid);
+ }
+}
+
+/*
+ * Commits a log block to the L2ARC device. This routine is invoked from
+ * l2arc_write_buffers when the log block fills up.
+ * This function allocates some memory to temporarily hold the serialized
+ * buffer to be written. This is then released in l2arc_write_done.
+ */
+static void
+l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
+{
+ l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
+ l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
+ uint64_t psize, asize;
+ zio_t *wzio;
+ l2arc_lb_abd_buf_t *abd_buf;
+ uint8_t *tmpbuf;
+ l2arc_lb_ptr_buf_t *lb_ptr_buf;
+
+ VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries);
+
+ tmpbuf = zio_buf_alloc(sizeof (*lb));
+ abd_buf = zio_buf_alloc(sizeof (*abd_buf));
+ abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb));
+ lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
+ lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP);
+
+ /* link the buffer into the block chain */
+ lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1];
+ lb->lb_magic = L2ARC_LOG_BLK_MAGIC;
+
+ /*
+ * l2arc_log_blk_commit() may be called multiple times during a single
+ * l2arc_write_buffers() call. Save the allocated abd buffers in a list
+ * so we can free them in l2arc_write_done() later on.
+ */
+ list_insert_tail(&cb->l2wcb_abd_list, abd_buf);
+
+ /* try to compress the buffer */
+ psize = zio_compress_data(ZIO_COMPRESS_LZ4,
+ abd_buf->abd, tmpbuf, sizeof (*lb));
+
+ /* a log block is never entirely zero */
+ ASSERT(psize != 0);
+ asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
+ ASSERT(asize <= sizeof (*lb));
+
+ /*
+ * Update the start log block pointer in the device header to point
+ * to the log block we're about to write.
+ */
+ l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0];
+ l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand;
+ l2dhdr->dh_start_lbps[0].lbp_payload_asize =
+ dev->l2ad_log_blk_payload_asize;
+ l2dhdr->dh_start_lbps[0].lbp_payload_start =
+ dev->l2ad_log_blk_payload_start;
+ _NOTE(CONSTCOND)
+ L2BLK_SET_LSIZE(
+ (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb));
+ L2BLK_SET_PSIZE(
+ (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize);
+ L2BLK_SET_CHECKSUM(
+ (&l2dhdr->dh_start_lbps[0])->lbp_prop,
+ ZIO_CHECKSUM_FLETCHER_4);
+ if (asize < sizeof (*lb)) {
+ /* compression succeeded */
+ bzero(tmpbuf + psize, asize - psize);
+ L2BLK_SET_COMPRESS(
+ (&l2dhdr->dh_start_lbps[0])->lbp_prop,
+ ZIO_COMPRESS_LZ4);
+ } else {
+ /* compression failed */
+ bcopy(lb, tmpbuf, sizeof (*lb));
+ L2BLK_SET_COMPRESS(
+ (&l2dhdr->dh_start_lbps[0])->lbp_prop,
+ ZIO_COMPRESS_OFF);
+ }
+
+ /* checksum what we're about to write */
+ fletcher_4_native(tmpbuf, asize, NULL,
+ &l2dhdr->dh_start_lbps[0].lbp_cksum);
+
+ abd_put(abd_buf->abd);
+
+ /* perform the write itself */
+ abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb));
+ abd_take_ownership_of_buf(abd_buf->abd, B_TRUE);
+ wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
+ asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
+ DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
+ (void) zio_nowait(wzio);
+
+ dev->l2ad_hand += asize;
+ /*
+ * Include the committed log block's pointer in the list of pointers
+ * to log blocks present in the L2ARC device.
+ */
+ bcopy(&l2dhdr->dh_start_lbps[0], lb_ptr_buf->lb_ptr,
+ sizeof (l2arc_log_blkptr_t));
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf);
+ ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
+ ARCSTAT_BUMP(arcstat_l2_log_blk_count);
+ zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
+ zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
+ mutex_exit(&dev->l2ad_mtx);
+ vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+
+ /* bump the kstats */
+ ARCSTAT_INCR(arcstat_l2_write_bytes, asize);
+ ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
+ ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize);
+ ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
+ dev->l2ad_log_blk_payload_asize / asize);
+
+ /* start a new log block */
+ dev->l2ad_log_ent_idx = 0;
+ dev->l2ad_log_blk_payload_asize = 0;
+ dev->l2ad_log_blk_payload_start = 0;
+}
+
+/*
+ * Validates an L2ARC log block address to make sure that it can be read
+ * from the provided L2ARC device.
+ */
+boolean_t
+l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp)
+{
+ /* L2BLK_GET_PSIZE returns aligned size for log blocks */
+ uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
+ uint64_t end = lbp->lbp_daddr + asize - 1;
+ uint64_t start = lbp->lbp_payload_start;
+ boolean_t evicted = B_FALSE;
+
+ /* BEGIN CSTYLED */
+ /*
+ * A log block is valid if all of the following conditions are true:
+ * - it fits entirely (including its payload) between l2ad_start and
+ * l2ad_end
+ * - it has a valid size
+ * - neither the log block itself nor part of its payload was evicted
+ * by l2arc_evict():
+ *
+ * l2ad_hand l2ad_evict
+ * | | lbp_daddr
+ * | start | | end
+ * | | | | |
+ * V V V V V
+ * l2ad_start ============================================ l2ad_end
+ * --------------------------||||
+ * ^ ^
+ * | log block
+ * payload
+ */
+ /* END CSTYLED */
+ evicted =
+ l2arc_range_check_overlap(start, end, dev->l2ad_hand) ||
+ l2arc_range_check_overlap(start, end, dev->l2ad_evict) ||
+ l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, start) ||
+ l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end);
+
+ return (start >= dev->l2ad_start && end <= dev->l2ad_end &&
+ asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) &&
+ (!evicted || dev->l2ad_first));
+}
+
+/*
+ * Inserts ARC buffer header `hdr' into the current L2ARC log block on
+ * the device. The buffer being inserted must be present in L2ARC.
+ * Returns B_TRUE if the L2ARC log block is full and needs to be committed
+ * to L2ARC, or B_FALSE if it still has room for more ARC buffers.
+ */
+static boolean_t
+l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
+{
+ l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
+ l2arc_log_ent_phys_t *le;
+
+ if (dev->l2ad_log_entries == 0)
+ return (B_FALSE);
+
+ int index = dev->l2ad_log_ent_idx++;
+
+ ASSERT3S(index, <, dev->l2ad_log_entries);
+ ASSERT(HDR_HAS_L2HDR(hdr));
+
+ le = &lb->lb_entries[index];
+ bzero(le, sizeof (*le));
+ le->le_dva = hdr->b_dva;
+ le->le_birth = hdr->b_birth;
+ le->le_daddr = hdr->b_l2hdr.b_daddr;
+ if (index == 0)
+ dev->l2ad_log_blk_payload_start = le->le_daddr;
+ L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr));
+ L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr));
+ L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr));
+ L2BLK_SET_TYPE((le)->le_prop, hdr->b_type);
+ L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr)));
+ L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr)));
+ L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state);
+
+ dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev,
+ HDR_GET_PSIZE(hdr));
+
+ return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries);
+}
+
+/*
+ * Checks whether a given L2ARC device address sits in a time-sequential
+ * range. The trick here is that the L2ARC is a rotary buffer, so we can't
+ * just do a range comparison, we need to handle the situation in which the
+ * range wraps around the end of the L2ARC device. Arguments:
+ * bottom -- Lower end of the range to check (written to earlier).
+ * top -- Upper end of the range to check (written to later).
+ * check -- The address for which we want to determine if it sits in
+ * between the top and bottom.
+ *
+ * The 3-way conditional below represents the following cases:
+ *
+ * bottom < top : Sequentially ordered case:
+ * <check>--------+-------------------+
+ * | (overlap here?) |
+ * L2ARC dev V V
+ * |---------------<bottom>============<top>--------------|
+ *
+ * bottom > top: Looped-around case:
+ * <check>--------+------------------+
+ * | (overlap here?) |
+ * L2ARC dev V V
+ * |===============<top>---------------<bottom>===========|
+ * ^ ^
+ * | (or here?) |
+ * +---------------+---------<check>
+ *
+ * top == bottom : Just a single address comparison.
+ */
+boolean_t
+l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
+{
+ if (bottom < top)
+ return (bottom <= check && check <= top);
+ else if (bottom > top)
+ return (check <= top || bottom <= check);
+ else
+ return (check == top);
+}
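A few spot checks of the three cases, written as a hypothetical user-space
test of the function above:

	/* Sequential case: the range [100, 200] written in order. */
	ASSERT(l2arc_range_check_overlap(100, 200, 150));
	ASSERT(!l2arc_range_check_overlap(100, 200, 250));

	/* Looped-around case: bottom (200) > top (100). */
	ASSERT(l2arc_range_check_overlap(200, 100, 50));	/* below top */
	ASSERT(l2arc_range_check_overlap(200, 100, 250));	/* above bottom */
	ASSERT(!l2arc_range_check_overlap(200, 100, 150));	/* in the gap */

	/* Single-address case. */
	ASSERT(l2arc_range_check_overlap(300, 300, 300));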
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index ae0b1fc878..38c4a83cb1 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
@@ -176,6 +176,7 @@ dbuf_cons(void *vdb, void *unused, int kmflag)
bzero(db, sizeof (dmu_buf_impl_t));
mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
+ rw_init(&db->db_rwlock, NULL, RW_DEFAULT, NULL);
cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
multilist_link_init(&db->db_cache_link);
zfs_refcount_create(&db->db_holds);
@@ -189,6 +190,7 @@ dbuf_dest(void *vdb, void *unused)
{
dmu_buf_impl_t *db = vdb;
mutex_destroy(&db->db_mtx);
+ rw_destroy(&db->db_rwlock);
cv_destroy(&db->db_changed);
ASSERT(!multilist_link_active(&db->db_cache_link));
zfs_refcount_destroy(&db->db_holds);
@@ -789,10 +791,10 @@ dbuf_verify(dmu_buf_impl_t *db)
db->db.db_object);
/*
* dnode_grow_indblksz() can make this fail if we don't
- * have the struct_rwlock. XXX indblksz no longer
+ * have the parent's rwlock. XXX indblksz no longer
* grows. safe to do this now?
*/
- if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+ if (RW_LOCK_HELD(&db->db_parent->db_rwlock)) {
ASSERT3P(db->db_blkptr, ==,
((blkptr_t *)db->db_parent->db.db_data +
db->db_blkid % epb));
@@ -868,6 +870,44 @@ dbuf_clear_data(dmu_buf_impl_t *db)
db->db_state = DB_UNCACHED;
}
+/*
+ * This function locks the parent of the provided dbuf. Hold this lock
+ * when reading or modifying db_blkptr.
+ */
+db_lock_type_t
+dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag)
+{
+ enum db_lock_type ret = DLT_NONE;
+ if (db->db_parent != NULL) {
+ rw_enter(&db->db_parent->db_rwlock, rw);
+ ret = DLT_PARENT;
+ } else if (dmu_objset_ds(db->db_objset) != NULL) {
+ rrw_enter(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, rw,
+ tag);
+ ret = DLT_OBJSET;
+ }
+ /*
+ * We only return a DLT_NONE lock when it's the top-most indirect block
+ * of the meta-dnode of the MOS.
+ */
+ return (ret);
+}
+
+/*
+ * The lock type must be passed back in because the block can move from
+ * being the topmost indirect block in a dnode (and thus having no
+ * parent) to no longer being topmost via an indirection increase.
+ * Dropping the wrong lock in that case would cause a panic.
+ */
+void
+dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, void *tag)
+{
+ if (type == DLT_PARENT)
+ rw_exit(&db->db_parent->db_rwlock);
+ else if (type == DLT_OBJSET)
+ rrw_exit(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, tag);
+}
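
A minimal sketch of the intended pairing (FTAG as the tag, error
handling elided; assumes db_blkptr is non-NULL): the returned type must
be threaded back to the unlock so the correct lock is dropped even if
the dbuf's position in the tree has changed in between.

	db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
	blkptr_t bp = *db->db_blkptr;	/* stable while the lock is held */
	dmu_buf_unlock_parent(db, dblt, FTAG);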
+
static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
@@ -1042,8 +1082,13 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
return (err);
}
+/*
+ * Drops db_mtx and the parent lock specified by dblt and tag before
+ * returning.
+ */
static int
-dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
+dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
+ db_lock_type_t dblt, void *tag)
{
dnode_t *dn;
zbookmark_phys_t zb;
@@ -1053,11 +1098,11 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
- /* We need the struct_rwlock to prevent db_blkptr from changing. */
- ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
ASSERT(MUTEX_HELD(&db->db_mtx));
ASSERT(db->db_state == DB_UNCACHED);
ASSERT(db->db_buf == NULL);
+ ASSERT(db->db_parent == NULL ||
+ RW_LOCK_HELD(&db->db_parent->db_rwlock));
if (db->db_blkid == DMU_BONUS_BLKID) {
/*
@@ -1094,6 +1139,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
DB_DNODE_EXIT(db);
db->db_state = DB_CACHED;
mutex_exit(&db->db_mtx);
+ dmu_buf_unlock_parent(db, dblt, tag);
return (0);
}
@@ -1134,6 +1180,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
DB_DNODE_EXIT(db);
db->db_state = DB_CACHED;
mutex_exit(&db->db_mtx);
+ dmu_buf_unlock_parent(db, dblt, tag);
return (0);
}
@@ -1150,12 +1197,14 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
"object set %llu", dmu_objset_id(db->db_objset));
DB_DNODE_EXIT(db);
mutex_exit(&db->db_mtx);
+ dmu_buf_unlock_parent(db, dblt, tag);
return (SET_ERROR(EIO));
}
err = dbuf_read_verify_dnode_crypt(db, flags);
if (err != 0) {
DB_DNODE_EXIT(db);
+ dmu_buf_unlock_parent(db, dblt, tag);
mutex_exit(&db->db_mtx);
return (err);
}
@@ -1175,11 +1224,18 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
zio_flags |= ZIO_FLAG_RAW;
-
- err = arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
+ /*
+ * The zio layer will copy the provided blkptr later, but we copy it now
+ * so that we can release the parent's rwlock before arc_read() runs.
+ * Otherwise, if dbuf_read_done is called synchronously (on an l1 cache
+ * hit), it would acquire db_mtx while we still hold the parent's
+ * rwlock, which is a lock ordering violation.
+ */
+ blkptr_t bp = *db->db_blkptr;
+ dmu_buf_unlock_parent(db, dblt, tag);
+ (void) arc_read(zio, db->db_objset->os_spa, &bp,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
&aflags, &zb);
-
return (err);
}
@@ -1278,8 +1334,6 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
- if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
(flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
@@ -1316,29 +1370,32 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
dbuf_set_data(db, db->db_buf);
}
mutex_exit(&db->db_mtx);
- if (err == 0 && prefetch)
- dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
- if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&dn->dn_struct_rwlock);
+ if (err == 0 && prefetch) {
+ dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
+ flags & DB_RF_HAVESTRUCT);
+ }
DB_DNODE_EXIT(db);
} else if (db->db_state == DB_UNCACHED) {
spa_t *spa = dn->dn_objset->os_spa;
boolean_t need_wait = B_FALSE;
+ db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
+
if (zio == NULL &&
db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
need_wait = B_TRUE;
}
- err = dbuf_read_impl(db, zio, flags);
-
- /* dbuf_read_impl has dropped db_mtx for us */
-
- if (!err && prefetch)
- dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
+ err = dbuf_read_impl(db, zio, flags, dblt, FTAG);
+ /*
+ * dbuf_read_impl has dropped db_mtx and our parent's rwlock
+ * for us
+ */
+ if (!err && prefetch) {
+ dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
+ flags & DB_RF_HAVESTRUCT);
+ }
- if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&dn->dn_struct_rwlock);
DB_DNODE_EXIT(db);
if (!err && need_wait)
@@ -1353,10 +1410,10 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
* occurred and the dbuf went to UNCACHED.
*/
mutex_exit(&db->db_mtx);
- if (prefetch)
- dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
- if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_exit(&dn->dn_struct_rwlock);
+ if (prefetch) {
+ dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
+ flags & DB_RF_HAVESTRUCT);
+ }
DB_DNODE_EXIT(db);
/* Skip the wait per the caller's request. */
@@ -1536,7 +1593,9 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
if (db->db_state == DB_CACHED) {
ASSERT(db->db.db_data != NULL);
arc_release(db->db_buf, db);
+ rw_enter(&db->db_rwlock, RW_WRITER);
bzero(db->db.db_data, db->db.db_size);
+ rw_exit(&db->db_rwlock);
arc_buf_freeze(db->db_buf);
}
@@ -1558,15 +1617,6 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
- /* XXX does *this* func really need the lock? */
- ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
-
- /*
- * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
- * is OK, because there can be no other references to the db
- * when we are changing its size, so no concurrent DB_FILL can
- * be happening.
- */
/*
* XXX we should be doing a dbuf_read, checking the return
* value and returning that up to our callers
@@ -1643,8 +1693,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
dnode_t *dn;
objset_t *os;
dbuf_dirty_record_t **drp, *dr;
- int drop_struct_lock = FALSE;
int txgoff = tx->tx_txg & TXG_MASK;
+ boolean_t drop_struct_rwlock = B_FALSE;
ASSERT(tx->tx_txg != 0);
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
@@ -1846,15 +1896,21 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
return (dr);
}
- /*
- * The dn_struct_rwlock prevents db_blkptr from changing
- * due to a write from syncing context completing
- * while we are running, so we want to acquire it before
- * looking at db_blkptr.
- */
if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- drop_struct_lock = TRUE;
+ drop_struct_rwlock = B_TRUE;
+ }
+
+ /*
+ * If we are overwriting a dedup BP, then unless it is snapshotted,
+ * when we get to syncing context we will need to decrement its
+ * refcount in the DDT. Prefetch the relevant DDT block so that
+ * syncing context won't have to wait for the i/o.
+ */
+ if (db->db_blkptr != NULL) {
+ db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
+ ddt_prefetch(os->os_spa, db->db_blkptr);
+ dmu_buf_unlock_parent(db, dblt, FTAG);
}
/*
@@ -1867,19 +1923,12 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
- /*
- * If we are overwriting a dedup BP, then unless it is snapshotted,
- * when we get to syncing context we will need to decrement its
- * refcount in the DDT. Prefetch the relevant DDT block so that
- * syncing context won't have to wait for the i/o.
- */
- ddt_prefetch(os->os_spa, db->db_blkptr);
if (db->db_level == 0) {
ASSERT(!db->db_objset->os_raw_receive ||
dn->dn_maxblkid >= db->db_blkid);
dnode_new_blkid(dn, db->db_blkid, tx,
- drop_struct_lock, B_FALSE);
+ drop_struct_rwlock, B_FALSE);
ASSERT(dn->dn_maxblkid >= db->db_blkid);
}
@@ -1890,15 +1939,14 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-
- parent = dbuf_hold_level(dn, db->db_level+1,
+ parent = dbuf_hold_level(dn, db->db_level + 1,
db->db_blkid >> epbs, FTAG);
ASSERT(parent != NULL);
parent_held = TRUE;
}
- if (drop_struct_lock)
+ if (drop_struct_rwlock)
rw_exit(&dn->dn_struct_rwlock);
- ASSERT3U(db->db_level+1, ==, parent->db_level);
+ ASSERT3U(db->db_level + 1, ==, parent->db_level);
di = dbuf_dirty(parent, tx);
if (parent_held)
dbuf_rele(parent, FTAG);
@@ -1919,14 +1967,14 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
}
mutex_exit(&db->db_mtx);
} else {
- ASSERT(db->db_level+1 == dn->dn_nlevels);
+ ASSERT(db->db_level + 1 == dn->dn_nlevels);
ASSERT(db->db_blkid < dn->dn_nblkptr);
ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
mutex_enter(&dn->dn_mtx);
ASSERT(!list_link_active(&dr->dr_dirty_node));
list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
mutex_exit(&dn->dn_mtx);
- if (drop_struct_lock)
+ if (drop_struct_rwlock)
rw_exit(&dn->dn_struct_rwlock);
}
@@ -2447,10 +2495,12 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
*parentp = NULL;
return (err);
}
+ rw_enter(&(*parentp)->db_rwlock, RW_READER);
*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
(blkid & ((1ULL << epbs) - 1));
if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))
ASSERT(BP_IS_HOLE(*bpp));
+ rw_exit(&(*parentp)->db_rwlock);
return (0);
} else {
/* the block is referenced from the dnode */
@@ -2559,8 +2609,29 @@ typedef struct dbuf_prefetch_arg {
zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
+ dbuf_prefetch_fn dpa_cb; /* prefetch completion callback */
+ void *dpa_arg; /* prefetch completion arg */
} dbuf_prefetch_arg_t;
+static void
+dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done)
+{
+ if (dpa->dpa_cb != NULL)
+ dpa->dpa_cb(dpa->dpa_arg, io_done);
+ kmem_free(dpa, sizeof (*dpa));
+}
+
+static void
+dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb,
+ const blkptr_t *iobp, arc_buf_t *abuf, void *private)
+{
+ dbuf_prefetch_arg_t *dpa = private;
+
+ dbuf_prefetch_fini(dpa, B_TRUE);
+ if (abuf != NULL)
+ arc_buf_destroy(abuf, private);
+}
+
/*
* Actually issue the prefetch read for the block given.
*/
@@ -2568,7 +2639,7 @@ static void
dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
{
if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
- return;
+ return (dbuf_prefetch_fini(dpa, B_FALSE));
int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
arc_flags_t aflags =
@@ -2582,7 +2653,8 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
ASSERT(dpa->dpa_zio != NULL);
- (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
+ (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp,
+ dbuf_issue_final_prefetch_done, dpa,
dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb);
}
@@ -2603,8 +2675,7 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
if (abuf == NULL) {
ASSERT(zio == NULL || zio->io_error != 0);
- kmem_free(dpa, sizeof (*dpa));
- return;
+ return (dbuf_prefetch_fini(dpa, B_TRUE));
}
ASSERT(zio == NULL || zio->io_error == 0);
@@ -2635,6 +2706,10 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
dpa->dpa_zb.zb_level));
dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
dpa->dpa_curlevel, curblkid, FTAG);
+ if (db == NULL) {
+ arc_buf_destroy(abuf, private);
+ return (dbuf_prefetch_fini(dpa, B_TRUE));
+ }
(void) dbuf_read(db, NULL,
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
dbuf_rele(db, FTAG);
@@ -2647,11 +2722,10 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
if (BP_IS_HOLE(bp)) {
- kmem_free(dpa, sizeof (*dpa));
+ dbuf_prefetch_fini(dpa, B_TRUE);
} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
dbuf_issue_final_prefetch(dpa, bp);
- kmem_free(dpa, sizeof (*dpa));
} else {
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
zbookmark_phys_t zb;
@@ -2681,9 +2755,10 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
* complete. Note that the prefetch might fail if the dataset is encrypted and
* the encryption key is unmapped before the IO completes.
*/
-void
-dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
- arc_flags_t aflags)
+int
+dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,
+ zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb,
+ void *arg)
{
blkptr_t bp;
int epbs, nlevels, curlevel;
@@ -2693,10 +2768,10 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
if (blkid > dn->dn_maxblkid)
- return;
+ goto no_issue;
- if (dnode_block_freed(dn, blkid))
- return;
+ if (level == 0 && dnode_block_freed(dn, blkid))
+ goto no_issue;
/*
* This dnode hasn't been written to disk yet, so there's nothing to
@@ -2704,11 +2779,11 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
*/
nlevels = dn->dn_phys->dn_nlevels;
if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
- return;
+ goto no_issue;
epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
- return;
+ goto no_issue;
dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
level, blkid);
@@ -2718,7 +2793,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
* This dbuf already exists. It is either CACHED, or
* (we assume) about to be read or filled.
*/
- return;
+ goto no_issue;
}
/*
@@ -2751,7 +2826,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
bp = dn->dn_phys->dn_blkptr[curblkid];
}
if (BP_IS_HOLE(&bp))
- return;
+ goto no_issue;
ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
@@ -2769,6 +2844,8 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
dpa->dpa_dnode = dn;
dpa->dpa_epbs = epbs;
dpa->dpa_zio = pio;
+ dpa->dpa_cb = cb;
+ dpa->dpa_arg = arg;
/* flag if L2ARC eligible, l2arc_noprefetch then decides */
if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
@@ -2784,7 +2861,6 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
if (curlevel == level) {
ASSERT3U(curblkid, ==, blkid);
dbuf_issue_final_prefetch(dpa, &bp);
- kmem_free(dpa, sizeof (*dpa));
} else {
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
zbookmark_phys_t zb;
@@ -2805,6 +2881,19 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
* dpa may have already been freed.
*/
zio_nowait(pio);
+ return (1);
+no_issue:
+ if (cb != NULL)
+ cb(arg, B_FALSE);
+ return (0);
+}
+
+int
+dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
+ arc_flags_t aflags)
+{
+ return (dbuf_prefetch_impl(dn, level, blkid, prio, aflags, NULL, NULL));
}
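
A sketch of the callback form as dmu_zfetch() uses it later in this
change (my_done and arg are illustrative names): the int return counts
whether a prefetch was actually issued, and the callback fires on both
the issued and the no_issue paths.

	static void
	my_done(void *arg, boolean_t io_issued)
	{
		/* runs exactly once per dbuf_prefetch_impl() call */
	}

	issued += dbuf_prefetch_impl(dn, 0, blkid,
	    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
	    my_done, arg);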
/*
@@ -2841,7 +2930,9 @@ dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db, dbuf_dirty_record_t *dr)
DBUF_GET_BUFC_TYPE(db), db->db.db_size));
}
+ rw_enter(&db->db_rwlock, RW_WRITER);
bcopy(data->b_data, db->db.db_data, arc_buf_size(data));
+ rw_exit(&db->db_rwlock);
}
/*
@@ -2967,7 +3058,6 @@ int
dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- dnode_t *dn;
if (db->db_blkid != DMU_SPILL_BLKID)
return (SET_ERROR(ENOTSUP));
@@ -2976,12 +3066,7 @@ dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
dbuf_new_size(db, blksz, tx);
- rw_exit(&dn->dn_struct_rwlock);
- DB_DNODE_EXIT(db);
return (0);
}
@@ -3009,7 +3094,7 @@ dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
dmu_buf_impl_t *found_db;
boolean_t result = B_FALSE;
- if (db->db_blkid == DMU_BONUS_BLKID)
+ if (blkid == DMU_BONUS_BLKID)
found_db = dbuf_find_bonus(os, obj);
else
found_db = dbuf_find(os, obj, 0, blkid);
@@ -3019,7 +3104,7 @@ dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
(void) zfs_refcount_add(&db->db_holds, tag);
result = B_TRUE;
}
- mutex_exit(&db->db_mtx);
+ mutex_exit(&found_db->db_mtx);
}
return (result);
}
@@ -3697,9 +3782,9 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
mutex_exit(&db->db_mtx);
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_WRITER, FTAG);
*db->db_blkptr = *bp;
- rw_exit(&dn->dn_struct_rwlock);
+ dmu_buf_unlock_parent(db, dblt, FTAG);
}
/* ARGSUSED */
@@ -3740,9 +3825,9 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
* anybody from reading the blocks we're about to
* zero out.
*/
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ rw_enter(&db->db_rwlock, RW_WRITER);
bzero(db->db.db_data, db->db.db_size);
- rw_exit(&dn->dn_struct_rwlock);
+ rw_exit(&db->db_rwlock);
}
DB_DNODE_EXIT(db);
}
@@ -3932,7 +4017,7 @@ dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size,
}
static void
-dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx)
+dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
{
blkptr_t bp_copy = *bp;
spa_t *spa = dmu_objset_spa(dn->dn_objset);
@@ -3946,14 +4031,16 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx)
if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
&drica)) {
/*
- * The struct_rwlock prevents dbuf_read_impl() from
+ * The db_rwlock prevents dbuf_read_impl() from
* dereferencing the BP while we are changing it. To
* avoid lock contention, only grab it when we are actually
* changing the BP.
*/
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (rw != NULL)
+ rw_enter(rw, RW_WRITER);
*bp = bp_copy;
- rw_exit(&dn->dn_struct_rwlock);
+ if (rw != NULL)
+ rw_exit(rw);
}
}
@@ -4026,7 +4113,7 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
if (db->db_level > 0) {
blkptr_t *bp = db->db.db_data;
for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
- dbuf_remap_impl(dn, &bp[i], tx);
+ dbuf_remap_impl(dn, &bp[i], &db->db_rwlock, tx);
}
} else if (db->db.db_object == DMU_META_DNODE_OBJECT) {
dnode_phys_t *dnp = db->db.db_data;
@@ -4034,7 +4121,10 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
DMU_OT_DNODE);
for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; i++) {
for (int j = 0; j < dnp[i].dn_nblkptr; j++) {
- dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], tx);
+ krwlock_t *lock = (dn->dn_dbuf == NULL ? NULL :
+ &dn->dn_dbuf->db_rwlock);
+ dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], lock,
+ tx);
}
}
}
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 67ad5d10f6..a79f3f19c3 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -172,8 +172,8 @@ dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
uint64_t blkid;
dmu_buf_impl_t *db;
- blkid = dbuf_whichblock(dn, 0, offset);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ blkid = dbuf_whichblock(dn, 0, offset);
db = dbuf_hold(dn, blkid, tag);
rw_exit(&dn->dn_struct_rwlock);
@@ -197,8 +197,8 @@ dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
- blkid = dbuf_whichblock(dn, 0, offset);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ blkid = dbuf_whichblock(dn, 0, offset);
db = dbuf_hold(dn, blkid, tag);
rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
@@ -605,7 +605,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
- read && DNODE_IS_CACHEABLE(dn));
+ read && DNODE_IS_CACHEABLE(dn), B_TRUE);
}
rw_exit(&dn->dn_struct_rwlock);
@@ -737,7 +737,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
if (err != 0)
return;
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
/*
* offset + len - 1 is the last byte we want to prefetch for, and offset
* is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the
@@ -745,6 +744,7 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
* offset) is the first. Then the number we need to prefetch is the
* last - first + 1.
*/
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (level > 0 || dn->dn_datablkshift != 0) {
nblks = dbuf_whichblock(dn, level, offset + len - 1) -
dbuf_whichblock(dn, level, offset) + 1;
@@ -757,7 +757,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
for (int i = 0; i < nblks; i++)
dbuf_prefetch(dn, level, blkid + i, pri, 0);
}
-
rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
@@ -2341,6 +2340,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
ZCHECKSUM_FLAG_DEDUP))
dedup_verify = B_TRUE;
}
+
/*
* Enable nopwrite if we have secure enough checksum
* algorithm (see comment in zio_nop_write) and
diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c
index 6d01fbd2d4..a98097a8ee 100644
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c
@@ -28,6 +28,7 @@
* Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -81,6 +82,8 @@ int dmu_find_threads = 0;
*/
int dmu_rescan_dnode_threshold = 131072;
+static char *upgrade_tag = "upgrade_tag";
+
static void dmu_objset_find_dp_cb(void *arg);
static void dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb);
@@ -681,8 +684,9 @@ dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag,
dsl_pool_t *dp;
dsl_dataset_t *ds;
int err;
- ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0;
+ ds_hold_flags_t flags;
+ flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
err = dsl_pool_hold(name, tag, &dp);
if (err != 0)
return (err);
@@ -755,8 +759,9 @@ dmu_objset_own(const char *name, dmu_objset_type_t type,
dsl_pool_t *dp;
dsl_dataset_t *ds;
int err;
- ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0;
+ ds_hold_flags_t flags;
+ flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
err = dsl_pool_hold(name, FTAG, &dp);
if (err != 0)
return (err);
@@ -794,8 +799,9 @@ dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
{
dsl_dataset_t *ds;
int err;
- ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0;
+ ds_hold_flags_t flags;
+ flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
err = dsl_dataset_own_obj(dp, obj, flags, tag, &ds);
if (err != 0)
return (err);
@@ -812,9 +818,10 @@ dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
void
dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, void *tag)
{
- ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0;
-
+ ds_hold_flags_t flags;
dsl_pool_t *dp = dmu_objset_pool(os);
+
+ flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
dsl_dataset_rele_flags(os->os_dsl_dataset, flags, tag);
dsl_pool_rele(dp, tag);
}
@@ -842,7 +849,9 @@ dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds,
{
dsl_pool_t *dp;
char name[ZFS_MAX_DATASET_NAME_LEN];
+ ds_hold_flags_t flags;
+ flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
VERIFY3P(ds, !=, NULL);
VERIFY3P(ds->ds_owner, ==, tag);
VERIFY(dsl_dataset_long_held(ds));
@@ -851,21 +860,22 @@ dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds,
dp = ds->ds_dir->dd_pool;
dsl_pool_config_enter(dp, FTAG);
- dsl_dataset_disown(ds, 0, tag);
- VERIFY0(dsl_dataset_own(dp, name,
- (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0, tag, newds));
+ dsl_dataset_disown(ds, flags, tag);
+ VERIFY0(dsl_dataset_own(dp, name, flags, tag, newds));
dsl_pool_config_exit(dp, FTAG);
}
void
dmu_objset_disown(objset_t *os, boolean_t decrypt, void *tag)
{
+ ds_hold_flags_t flags;
+
+ flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
/*
* Stop upgrading thread
*/
dmu_objset_upgrade_stop(os);
- dsl_dataset_disown(os->os_dsl_dataset,
- (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0, tag);
+ dsl_dataset_disown(os->os_dsl_dataset, flags, tag);
}
void
@@ -980,6 +990,7 @@ dmu_objset_evict_done(objset_t *os)
mutex_destroy(&os->os_userused_lock);
mutex_destroy(&os->os_obj_lock);
mutex_destroy(&os->os_user_ptr_lock);
+ mutex_destroy(&os->os_upgrade_lock);
for (int i = 0; i < TXG_SIZE; i++) {
multilist_destroy(os->os_dirty_dnodes[i]);
}
@@ -1476,14 +1487,20 @@ dmu_objset_upgrade_task_cb(void *data)
mutex_enter(&os->os_upgrade_lock);
os->os_upgrade_status = EINTR;
if (!os->os_upgrade_exit) {
+ int status;
+
mutex_exit(&os->os_upgrade_lock);
- os->os_upgrade_status = os->os_upgrade_cb(os);
+ status = os->os_upgrade_cb(os);
+
mutex_enter(&os->os_upgrade_lock);
+
+ os->os_upgrade_status = status;
}
os->os_upgrade_exit = B_TRUE;
os->os_upgrade_id = 0;
mutex_exit(&os->os_upgrade_lock);
+ dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
}
static void
@@ -1492,6 +1509,9 @@ dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb)
if (os->os_upgrade_id != 0)
return;
+ ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
+ dsl_dataset_long_hold(dmu_objset_ds(os), upgrade_tag);
+
mutex_enter(&os->os_upgrade_lock);
if (os->os_upgrade_id == 0 && os->os_upgrade_status == 0) {
os->os_upgrade_exit = B_FALSE;
@@ -1499,8 +1519,12 @@ dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb)
os->os_upgrade_id = taskq_dispatch(
os->os_spa->spa_upgrade_taskq,
dmu_objset_upgrade_task_cb, os, TQ_SLEEP);
- if (os->os_upgrade_id == 0)
+ if (os->os_upgrade_id == TASKQID_INVALID) {
+ dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
os->os_upgrade_status = ENOMEM;
+ }
+ } else {
+ dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
}
mutex_exit(&os->os_upgrade_lock);
}
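
The invariant introduced here is that a long hold on the dataset
brackets the lifetime of the dispatched upgrade task. A condensed
sketch of the paths in this hunk (locking elided):

	dsl_dataset_long_hold(ds, upgrade_tag);		/* before dispatch */
	id = taskq_dispatch(tq, dmu_objset_upgrade_task_cb, os, TQ_SLEEP);
	if (id == TASKQID_INVALID)
		dsl_dataset_long_rele(ds, upgrade_tag);	/* dispatch failed */
	/* otherwise dmu_objset_upgrade_task_cb() releases the hold */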
@@ -1511,10 +1535,12 @@ dmu_objset_upgrade_stop(objset_t *os)
mutex_enter(&os->os_upgrade_lock);
os->os_upgrade_exit = B_TRUE;
if (os->os_upgrade_id != 0) {
- os->os_upgrade_id = 0;
+ taskqid_t tid = os->os_upgrade_id;
+
mutex_exit(&os->os_upgrade_lock);
- taskq_wait(os->os_spa->spa_upgrade_taskq);
+ taskq_wait_id(os->os_spa->spa_upgrade_taskq, tid);
+ txg_wait_synced(os->os_spa->spa_dsl_pool, 0);
} else {
mutex_exit(&os->os_upgrade_lock);
}
@@ -2215,7 +2241,7 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
if (flags & DN_ID_OLD_EXIST) {
dn->dn_newuid = dn->dn_olduid;
dn->dn_newgid = dn->dn_oldgid;
- dn->dn_newgid = dn->dn_oldprojid;
+ dn->dn_newprojid = dn->dn_oldprojid;
} else {
dn->dn_newuid = 0;
dn->dn_newgid = 0;
@@ -2306,6 +2332,7 @@ dmu_objset_space_upgrade(objset_t *os)
dmu_tx_hold_bonus(tx, obj);
objerr = dmu_tx_assign(tx, TXG_WAIT);
if (objerr != 0) {
+ dmu_buf_rele(db, FTAG);
dmu_tx_abort(tx);
continue;
}
diff --git a/usr/src/uts/common/fs/zfs/dmu_recv.c b/usr/src/uts/common/fs/zfs/dmu_recv.c
index 39f365652e..03e0fee4ff 100644
--- a/usr/src/uts/common/fs/zfs/dmu_recv.c
+++ b/usr/src/uts/common/fs/zfs/dmu_recv.c
@@ -201,7 +201,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
uint64_t fromguid = drrb->drr_fromguid;
int flags = drrb->drr_flags;
- ds_hold_flags_t dsflags = 0;
+ ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
int error;
uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
dsl_dataset_t *ds;
@@ -399,7 +399,7 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
dsl_dataset_t *ds, *newds;
objset_t *os;
uint64_t dsobj;
- ds_hold_flags_t dsflags = 0;
+ ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
int error;
uint64_t crflags = 0;
dsl_crypto_params_t dummy_dcp = { 0 };
@@ -541,7 +541,7 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
dsl_pool_t *dp = dmu_tx_pool(tx);
struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
int error;
- ds_hold_flags_t dsflags = 0;
+ ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
dsl_dataset_t *ds;
const char *tofs = drba->drba_cookie->drc_tofs;
@@ -670,7 +670,7 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
dsl_dataset_t *ds;
objset_t *os;
- ds_hold_flags_t dsflags = 0;
+ ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
uint64_t dsobj;
/* 6 extra bytes for /%recv */
char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
@@ -1824,8 +1824,9 @@ static void
dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
{
dsl_dataset_t *ds = drc->drc_ds;
- ds_hold_flags_t dsflags = (drc->drc_raw) ? 0 : DS_HOLD_FLAG_DECRYPT;
+ ds_hold_flags_t dsflags;
+ dsflags = (drc->drc_raw) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT;
/*
* Wait for the txg sync before cleaning up the receive. For
* resumable receives, this ensures that our resume state has
@@ -2832,11 +2833,12 @@ add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj,
dsl_dataset_t *snapds;
guid_map_entry_t *gmep;
objset_t *os;
- ds_hold_flags_t dsflags = (raw) ? 0 : DS_HOLD_FLAG_DECRYPT;
+ ds_hold_flags_t dsflags;
int err;
ASSERT(guid_map != NULL);
+ dsflags = (raw) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT;
err = dsl_pool_hold(name, FTAG, &dp);
if (err != 0)
return (err);
diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c
index 34cfa2c011..d91a48e2ca 100644
--- a/usr/src/uts/common/fs/zfs/dmu_send.c
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c
@@ -1222,9 +1222,10 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
dsl_pool_t *dp;
dsl_dataset_t *ds;
dsl_dataset_t *fromds = NULL;
- ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT;
+ ds_hold_flags_t dsflags;
int err;
+ dsflags = (rawok) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT;
err = dsl_pool_hold(pool, FTAG, &dp);
if (err != 0)
return (err);
@@ -1287,9 +1288,10 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
dsl_pool_t *dp;
dsl_dataset_t *ds;
int err;
- ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT;
+ ds_hold_flags_t dsflags;
boolean_t owned = B_FALSE;
+ dsflags = (rawok) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT;
if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
return (SET_ERROR(EINVAL));
diff --git a/usr/src/uts/common/fs/zfs/dmu_zfetch.c b/usr/src/uts/common/fs/zfs/dmu_zfetch.c
index 5d6f20d072..08af78d620 100644
--- a/usr/src/uts/common/fs/zfs/dmu_zfetch.c
+++ b/usr/src/uts/common/fs/zfs/dmu_zfetch.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -58,16 +58,29 @@ typedef struct zfetch_stats {
kstat_named_t zfetchstat_hits;
kstat_named_t zfetchstat_misses;
kstat_named_t zfetchstat_max_streams;
+ kstat_named_t zfetchstat_max_completion_us;
+ kstat_named_t zfetchstat_last_completion_us;
+ kstat_named_t zfetchstat_io_issued;
} zfetch_stats_t;
static zfetch_stats_t zfetch_stats = {
{ "hits", KSTAT_DATA_UINT64 },
{ "misses", KSTAT_DATA_UINT64 },
{ "max_streams", KSTAT_DATA_UINT64 },
+ { "max_completion_us", KSTAT_DATA_UINT64 },
+ { "last_completion_us", KSTAT_DATA_UINT64 },
+ { "io_issued", KSTAT_DATA_UINT64 },
};
#define ZFETCHSTAT_BUMP(stat) \
- atomic_inc_64(&zfetch_stats.stat.value.ui64);
+ atomic_inc_64(&zfetch_stats.stat.value.ui64)
+#define ZFETCHSTAT_ADD(stat, val) \
+ atomic_add_64(&zfetch_stats.stat.value.ui64, val)
+#define ZFETCHSTAT_SET(stat, val) \
+ zfetch_stats.stat.value.ui64 = val
+#define ZFETCHSTAT_GET(stat) \
+ zfetch_stats.stat.value.ui64
+
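
Dropping the trailing semicolon from ZFETCHSTAT_BUMP() is a macro
hygiene fix, not a cosmetic one. With the semicolon baked into the
expansion, the caller's own semicolon yields an extra empty statement,
which detaches a following else (illustrative only):

	if (hit)
		ZFETCHSTAT_BUMP(zfetchstat_hits);	/* old macro: two */
	else						/* statements, so the */
		ZFETCHSTAT_BUMP(zfetchstat_misses);	/* else won't compile */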
kstat_t *zfetch_ksp;
@@ -103,8 +116,8 @@ dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
{
if (zf == NULL)
return;
-
zf->zf_dnode = dno;
+ zf->zf_numstreams = 0;
list_create(&zf->zf_stream, sizeof (zstream_t),
offsetof(zstream_t, zs_node));
@@ -113,12 +126,28 @@ dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
}
static void
+dmu_zfetch_stream_fini(zstream_t *zs)
+{
+ mutex_destroy(&zs->zs_lock);
+ kmem_free(zs, sizeof (*zs));
+}
+
+static void
dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
{
ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
list_remove(&zf->zf_stream, zs);
- mutex_destroy(&zs->zs_lock);
- kmem_free(zs, sizeof (*zs));
+ dmu_zfetch_stream_fini(zs);
+ zf->zf_numstreams--;
+}
+
+static void
+dmu_zfetch_stream_orphan(zfetch_t *zf, zstream_t *zs)
+{
+ ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
+ list_remove(&zf->zf_stream, zs);
+ zs->zs_fetch = NULL;
+ zf->zf_numstreams--;
}
/*
@@ -133,8 +162,12 @@ dmu_zfetch_fini(zfetch_t *zf)
ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
rw_enter(&zf->zf_rwlock, RW_WRITER);
- while ((zs = list_head(&zf->zf_stream)) != NULL)
- dmu_zfetch_stream_remove(zf, zs);
+ while ((zs = list_head(&zf->zf_stream)) != NULL) {
+ if (zfs_refcount_count(&zs->zs_blocks) != 0)
+ dmu_zfetch_stream_orphan(zf, zs);
+ else
+ dmu_zfetch_stream_remove(zf, zs);
+ }
rw_exit(&zf->zf_rwlock);
list_destroy(&zf->zf_stream);
rw_destroy(&zf->zf_rwlock);
@@ -152,7 +185,7 @@ static void
dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
{
zstream_t *zs_next;
- int numstreams = 0;
+ hrtime_t now = gethrtime();
ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
@@ -162,11 +195,14 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
for (zstream_t *zs = list_head(&zf->zf_stream);
zs != NULL; zs = zs_next) {
zs_next = list_next(&zf->zf_stream, zs);
- if (((gethrtime() - zs->zs_atime) / NANOSEC) >
+ /*
+ * Streams with outstanding i/o references cannot be reaped yet;
+ * skip them.
+ */
+ if (zfs_refcount_count(&zs->zs_blocks) != 0)
+ continue;
+ if (((now - zs->zs_atime) / NANOSEC) >
zfetch_min_sec_reap)
dmu_zfetch_stream_remove(zf, zs);
- else
- numstreams++;
}
/*
@@ -180,7 +216,7 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
zfetch_max_distance));
- if (numstreams >= max_streams) {
+ if (zf->zf_numstreams >= max_streams) {
ZFETCHSTAT_BUMP(zfetchstat_max_streams);
return;
}
@@ -189,12 +225,39 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
zs->zs_blkid = blkid;
zs->zs_pf_blkid = blkid;
zs->zs_ipf_blkid = blkid;
- zs->zs_atime = gethrtime();
+ zs->zs_atime = now;
+ zs->zs_fetch = zf;
+ zfs_refcount_create(&zs->zs_blocks);
mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
-
+ zf->zf_numstreams++;
list_insert_head(&zf->zf_stream, zs);
}
+static void
+dmu_zfetch_stream_done(void *arg, boolean_t io_issued)
+{
+ zstream_t *zs = arg;
+
+ if (zs->zs_start_time && io_issued) {
+ hrtime_t now = gethrtime();
+ hrtime_t delta = NSEC2USEC(now - zs->zs_start_time);
+
+ zs->zs_start_time = 0;
+ ZFETCHSTAT_SET(zfetchstat_last_completion_us, delta);
+ if (delta > ZFETCHSTAT_GET(zfetchstat_max_completion_us))
+ ZFETCHSTAT_SET(zfetchstat_max_completion_us, delta);
+ }
+
+ if (zfs_refcount_remove(&zs->zs_blocks, NULL) != 0)
+ return;
+
+ /*
+ * The parent fetch structure has gone away
+ */
+ if (zs->zs_fetch == NULL)
+ dmu_zfetch_stream_fini(zs);
+}
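
Together with dmu_zfetch_stream_orphan() above, this gives each stream
a refcount-managed lifetime. A condensed sketch of the teardown race
(names from this diff): dmu_zfetch_fini() orphans streams that still
have prefetch i/o in flight, and the last completion callback frees
them.

	/* in dmu_zfetch_fini(), for a busy stream */
	zs->zs_fetch = NULL;			/* orphaned */
	/* in dmu_zfetch_stream_done(), on the last reference */
	if (zfs_refcount_remove(&zs->zs_blocks, NULL) == 0 &&
	    zs->zs_fetch == NULL)
		dmu_zfetch_stream_fini(zs);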
+
/*
* This is the predictive prefetch entry point. It associates dnode access
* specified with blkid and nblks arguments with prefetch stream, predicts
@@ -204,12 +267,13 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
* TRUE -- prefetch predicted data blocks plus following indirect blocks.
*/
void
-dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
+dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
+ boolean_t have_lock)
{
zstream_t *zs;
int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
int64_t pf_ahead_blks, max_blks;
- int epbs, max_dist_blks, pf_nblks, ipf_nblks;
+ int epbs, max_dist_blks, pf_nblks, ipf_nblks, issued;
uint64_t end_of_access_blkid = blkid + nblks;
spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
@@ -230,9 +294,22 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
* As a fast path for small (single-block) files, ignore access
* to the first block.
*/
- if (blkid == 0)
+ if (!have_lock && blkid == 0)
return;
+ if (!have_lock)
+ rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
+
+ /*
+ * A fast path for small files for which no prefetch will
+ * happen.
+ */
+ if (zf->zf_dnode->dn_maxblkid < 2) {
+ if (!have_lock)
+ rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+ return;
+ }
rw_enter(&zf->zf_rwlock, RW_READER);
/*
@@ -257,6 +334,10 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
/* Already prefetched this before. */
mutex_exit(&zs->zs_lock);
rw_exit(&zf->zf_rwlock);
+ if (!have_lock) {
+ rw_exit(&zf->zf_dnode->
+ dn_struct_rwlock);
+ }
return;
}
break;
@@ -274,6 +355,8 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
if (rw_tryupgrade(&zf->zf_rwlock))
dmu_zfetch_stream_create(zf, end_of_access_blkid);
rw_exit(&zf->zf_rwlock);
+ if (!have_lock)
+ rw_exit(&zf->zf_dnode->dn_struct_rwlock);
return;
}
@@ -335,9 +418,15 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
zs->zs_atime = gethrtime();
+ /* no prior reads in progress */
+ if (zfs_refcount_count(&zs->zs_blocks) == 0)
+ zs->zs_start_time = zs->zs_atime;
zs->zs_blkid = end_of_access_blkid;
+ zfs_refcount_add_many(&zs->zs_blocks, pf_nblks + ipf_iend - ipf_istart,
+ NULL);
mutex_exit(&zs->zs_lock);
rw_exit(&zf->zf_rwlock);
+ issued = 0;
/*
* dbuf_prefetch() is asynchronous (even when it needs to read
@@ -346,12 +435,19 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
*/
for (int i = 0; i < pf_nblks; i++) {
- dbuf_prefetch(zf->zf_dnode, 0, pf_start + i,
- ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
+ issued += dbuf_prefetch_impl(zf->zf_dnode, 0, pf_start + i,
+ ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
+ dmu_zfetch_stream_done, zs);
}
for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
- dbuf_prefetch(zf->zf_dnode, 1, iblk,
- ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
+ issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
+ ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
+ dmu_zfetch_stream_done, zs);
}
+ if (!have_lock)
+ rw_exit(&zf->zf_dnode->dn_struct_rwlock);
ZFETCHSTAT_BUMP(zfetchstat_hits);
+
+ if (issued)
+ ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
}
diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c
index f5ef390896..53aeb42c0e 100644
--- a/usr/src/uts/common/fs/zfs/dnode.c
+++ b/usr/src/uts/common/fs/zfs/dnode.c
@@ -120,6 +120,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
+ cv_init(&dn->dn_nodnholds, NULL, CV_DEFAULT, NULL);
/*
* Every dbuf has a reference, and dropping a tracked reference is
@@ -184,6 +185,7 @@ dnode_dest(void *arg, void *unused)
mutex_destroy(&dn->dn_mtx);
mutex_destroy(&dn->dn_dbufs_mtx);
cv_destroy(&dn->dn_notxholds);
+ cv_destroy(&dn->dn_nodnholds);
zfs_refcount_destroy(&dn->dn_holds);
zfs_refcount_destroy(&dn->dn_tx_holds);
ASSERT(!list_link_active(&dn->dn_link));
@@ -1175,13 +1177,15 @@ dnode_special_close(dnode_handle_t *dnh)
dnode_t *dn = dnh->dnh_dnode;
/*
- * Wait for final references to the dnode to clear. This can
- * only happen if the arc is asynchronously evicting state that
- * has a hold on this dnode while we are trying to evict this
- * dnode.
+ * Ensure dnode_rele_and_unlock() has released dn_mtx, after final
+ * zfs_refcount_remove()
*/
- while (zfs_refcount_count(&dn->dn_holds) > 0)
- delay(1);
+ mutex_enter(&dn->dn_mtx);
+ if (zfs_refcount_count(&dn->dn_holds) > 0)
+ cv_wait(&dn->dn_nodnholds, &dn->dn_mtx);
+ mutex_exit(&dn->dn_mtx);
+ ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 0);
+
ASSERT(dn->dn_dbuf == NULL ||
dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
zrl_add(&dnh->dnh_zrlock);
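
This replaces a delay(1) polling loop with the standard mutex/condvar
handshake; the signalling side is the cv_broadcast() added to
dnode_rele_and_unlock() below. The generic shape of the pattern (a
sketch, not the dnode code itself):

	/* waiter */
	mutex_enter(&lock);
	while (refs > 0)
		cv_wait(&cv, &lock);
	mutex_exit(&lock);

	/* last releaser */
	mutex_enter(&lock);
	if (--refs == 0)
		cv_broadcast(&cv);
	mutex_exit(&lock);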
@@ -1197,7 +1201,7 @@ dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
dnode_t *dn;
zrl_init(&dnh->dnh_zrlock);
- zrl_tryenter(&dnh->dnh_zrlock);
+ VERIFY3U(1, ==, zrl_tryenter(&dnh->dnh_zrlock));
dn = dnode_create(os, dnp, NULL, object, dnh);
DNODE_VERIFY(dn);
@@ -1342,7 +1346,6 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
}
blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
-
db = dbuf_hold(mdn, blk, FTAG);
if (drop_struct_lock)
rw_exit(&mdn->dn_struct_rwlock);
@@ -1614,7 +1617,10 @@ dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
dnode_handle_t *dnh = dn->dn_handle;
refs = zfs_refcount_remove(&dn->dn_holds, tag);
+ if (refs == 0)
+ cv_broadcast(&dn->dn_nodnholds);
mutex_exit(&dn->dn_mtx);
+ /* dnode could get destroyed at this point, so don't use it anymore */
/*
* It's unsafe to release the last hold on a dnode by dnode_rele() or
@@ -1776,10 +1782,11 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
/* resize the old block */
err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
- if (err == 0)
+ if (err == 0) {
dbuf_new_size(db, size, tx);
- else if (err != ENOENT)
+ } else if (err != ENOENT) {
goto fail;
+ }
dnode_setdblksz(dn, size);
dnode_setdirty(dn, tx);
@@ -2014,7 +2021,6 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
int trunc = FALSE;
int epbs;
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
blksz = dn->dn_datablksz;
blkshift = dn->dn_datablkshift;
epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
@@ -2031,7 +2037,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
head = P2NPHASE(off, blksz);
blkoff = P2PHASE(off, blksz);
if ((off >> blkshift) > dn->dn_maxblkid)
- goto out;
+ return;
} else {
ASSERT(dn->dn_maxblkid == 0);
if (off == 0 && len >= blksz) {
@@ -2040,12 +2046,15 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
*/
blkid = 0;
nblks = 1;
- if (dn->dn_nlevels > 1)
+ if (dn->dn_nlevels > 1) {
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
dnode_dirty_l1(dn, 0, tx);
+ rw_exit(&dn->dn_struct_rwlock);
+ }
goto done;
} else if (off >= blksz) {
/* Freeing past end-of-data */
- goto out;
+ return;
} else {
/* Freeing part of the block. */
head = blksz - off;
@@ -2055,19 +2064,26 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
}
/* zero out any partial block data at the start of the range */
if (head) {
+ int res;
ASSERT3U(blkoff + head, ==, blksz);
if (len < head)
head = len;
- if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
- TRUE, FALSE, FTAG, &db) == 0) {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
+ TRUE, FALSE, FTAG, &db);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (res == 0) {
caddr_t data;
+ boolean_t dirty;
+ db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER,
+ FTAG);
/* don't dirty if it isn't on disk and isn't dirty */
- if (db->db_last_dirty ||
- (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
- rw_exit(&dn->dn_struct_rwlock);
+ dirty = db->db_last_dirty ||
+ (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
+ dmu_buf_unlock_parent(db, dblt, FTAG);
+ if (dirty) {
dmu_buf_will_dirty(&db->db, tx);
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
data = db->db.db_data;
bzero(data + blkoff, head);
}
@@ -2079,11 +2095,11 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
/* If the range was less than one block, we're done */
if (len == 0)
- goto out;
+ return;
/* If the remaining range is past end of file, we're done */
if ((off >> blkshift) > dn->dn_maxblkid)
- goto out;
+ return;
ASSERT(ISP2(blksz));
if (trunc)
@@ -2094,16 +2110,23 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
ASSERT0(P2PHASE(off, blksz));
/* zero out any partial block data at the end of the range */
if (tail) {
+ int res;
if (len < tail)
tail = len;
- if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
- TRUE, FALSE, FTAG, &db) == 0) {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
+ TRUE, FALSE, FTAG, &db);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (res == 0) {
+ boolean_t dirty;
/* don't dirty if not on disk and not dirty */
- if (db->db_last_dirty ||
- (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
- rw_exit(&dn->dn_struct_rwlock);
+ db_lock_type_t type = dmu_buf_lock_parent(db, RW_READER,
+ FTAG);
+ dirty = db->db_last_dirty ||
+ (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
+ dmu_buf_unlock_parent(db, type, FTAG);
+ if (dirty) {
dmu_buf_will_dirty(&db->db, tx);
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
bzero(db->db.db_data, tail);
}
dbuf_rele(db, FTAG);
@@ -2113,7 +2136,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
/* If the range did not include a full block, we are done */
if (len == 0)
- goto out;
+ return;
ASSERT(IS_P2ALIGNED(off, blksz));
ASSERT(trunc || IS_P2ALIGNED(len, blksz));
@@ -2143,6 +2166,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
* amount of space if we copy the freed BPs into deadlists.
*/
if (dn->dn_nlevels > 1) {
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
uint64_t first, last;
first = blkid >> epbs;
@@ -2187,6 +2211,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
dnode_dirty_l1(dn, i, tx);
}
+ rw_exit(&dn->dn_struct_rwlock);
}
done:
@@ -2208,9 +2233,6 @@ done:
dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
dnode_setdirty(dn, tx);
-out:
-
- rw_exit(&dn->dn_struct_rwlock);
}
static boolean_t
@@ -2322,6 +2344,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
dprintf("probing object %llu offset %llx level %d of %u\n",
dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+
hole = ((flags & DNODE_FIND_HOLE) != 0);
inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
ASSERT(txg == 0 || !hole);
@@ -2354,9 +2378,9 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
return (error);
}
data = db->db.db_data;
+ rw_enter(&db->db_rwlock, RW_READER);
}
-
if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
db->db_blkptr->blk_birth <= txg ||
BP_IS_HOLE(db->db_blkptr))) {
@@ -2416,8 +2440,10 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
error = SET_ERROR(ESRCH);
}
- if (db)
+ if (db != NULL) {
+ rw_exit(&db->db_rwlock);
dbuf_rele(db, FTAG);
+ }
return (error);
}
diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c
index dc7317b411..396d58da17 100644
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c
@@ -23,6 +23,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2020 Oxide Computer Company
*/
#include <sys/zfs_context.h>
@@ -51,7 +52,6 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
/* this dnode can't be paged out because it's dirty */
ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
- ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);
db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
@@ -61,8 +61,24 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset,
dn->dn_object, dn->dn_phys->dn_nlevels);
+ /*
+ * Lock ordering requires that we hold the children's db_mutexes (by
+ * calling dbuf_find()) before holding the parent's db_rwlock. The lock
+ * order is imposed by dbuf_read's steps of "grab the lock to protect
+ * db_parent, get db_parent, hold db_parent's db_rwlock".
+ */
+ dmu_buf_impl_t *children[DN_MAX_NBLKPTR];
+ ASSERT3U(nblkptr, <=, DN_MAX_NBLKPTR);
+ for (i = 0; i < nblkptr; i++) {
+ children[i] =
+ dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i);
+ }
+
/* transfer dnode's block pointers to new indirect block */
(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
+ if (dn->dn_dbuf != NULL)
+ rw_enter(&dn->dn_dbuf->db_rwlock, RW_WRITER);
+ rw_enter(&db->db_rwlock, RW_WRITER);
ASSERT(db->db.db_data);
ASSERT(arc_released(db->db_buf));
ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
@@ -72,8 +88,7 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
/* set dbuf's parent pointers to new indirect buf */
for (i = 0; i < nblkptr; i++) {
- dmu_buf_impl_t *child =
- dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i);
+ dmu_buf_impl_t *child = children[i];
if (child == NULL)
continue;
@@ -106,6 +121,10 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr);
+ rw_exit(&db->db_rwlock);
+ if (dn->dn_dbuf != NULL)
+ rw_exit(&dn->dn_dbuf->db_rwlock);
+
dbuf_rele(db, FTAG);
rw_exit(&dn->dn_struct_rwlock);
@@ -182,7 +201,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
ASSERT(db->db_level == 1);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- err = dbuf_hold_impl(dn, db->db_level-1,
+ err = dbuf_hold_impl(dn, db->db_level - 1,
(db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child);
rw_exit(&dn->dn_struct_rwlock);
if (err == ENOENT)
@@ -280,7 +299,9 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
* ancestor of the first or last block to be freed. The first and
* last L1 indirect blocks are always dirtied by dnode_free_range().
*/
+ db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0);
+ dmu_buf_unlock_parent(db, dblt, FTAG);
dbuf_release_bp(db);
bp = db->db.db_data;
@@ -306,7 +327,9 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
if (db->db_level == 1) {
FREE_VERIFY(db, start, end, tx);
- free_blocks(dn, bp, end-start+1, tx);
+ rw_enter(&db->db_rwlock, RW_WRITER);
+ free_blocks(dn, bp, end - start + 1, tx);
+ rw_exit(&db->db_rwlock);
} else {
for (uint64_t id = start; id <= end; id++, bp++) {
if (BP_IS_HOLE(bp))
@@ -323,10 +346,12 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
}
if (free_indirects) {
+ rw_enter(&db->db_rwlock, RW_WRITER);
for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++)
ASSERT(BP_IS_HOLE(bp));
bzero(db->db.db_data, db->db.db_size);
free_blocks(dn, db->db_blkptr, 1, tx);
+ rw_exit(&db->db_rwlock);
}
DB_DNODE_EXIT(db);
@@ -378,7 +403,6 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
TRUE, FALSE, FTAG, &db));
rw_exit(&dn->dn_struct_rwlock);
-
free_children(db, blkid, nblks, free_indirects, tx);
dbuf_rele(db, FTAG);
}
@@ -736,13 +760,22 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
dsfra.dsfra_dnode = dn;
dsfra.dsfra_tx = tx;
dsfra.dsfra_free_indirects = freeing_dnode;
+ mutex_enter(&dn->dn_mtx);
if (freeing_dnode) {
ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff],
0, dn->dn_maxblkid + 1));
}
- mutex_enter(&dn->dn_mtx);
- range_tree_vacate(dn->dn_free_ranges[txgoff],
+ /*
+ * Because dnode_sync_free_range() must drop dn_mtx during its
+ * processing, using it as a callback to range_tree_vacate() is
+ * not safe. No other operations (besides destroy) are allowed
+ * once range_tree_vacate() has begun, and dropping dn_mtx
+ * would leave a window open for another thread to observe that
+ * invalid (and unsafe) state.
+ */
+ range_tree_walk(dn->dn_free_ranges[txgoff],
dnode_sync_free_range, &dsfra);
+ range_tree_vacate(dn->dn_free_ranges[txgoff], NULL, NULL);
range_tree_destroy(dn->dn_free_ranges[txgoff]);
dn->dn_free_ranges[txgoff] = NULL;
mutex_exit(&dn->dn_mtx);
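
The unsafe and safe shapes side by side (a sketch using the names from
this hunk):

	/* unsafe: the callback drops dn_mtx while the vacate is mid-flight */
	range_tree_vacate(rt, dnode_sync_free_range, &dsfra);

	/* safe: walk with the callback, then vacate with none */
	range_tree_walk(rt, dnode_sync_free_range, &dsfra);
	range_tree_vacate(rt, NULL, NULL);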
diff --git a/usr/src/uts/common/fs/zfs/dsl_crypt.c b/usr/src/uts/common/fs/zfs/dsl_crypt.c
index a092326a9c..ed98740f1d 100644
--- a/usr/src/uts/common/fs/zfs/dsl_crypt.c
+++ b/usr/src/uts/common/fs/zfs/dsl_crypt.c
@@ -1391,10 +1391,17 @@ error:
return (ret);
}
-
+/*
+ * This function deals with the intricacies of updating wrapping
+ * key references and encryption roots recursively in the event
+ * of a call to 'zfs change-key' or 'zfs promote'. The 'skip'
+ * parameter should always be set to B_FALSE when called
+ * externally.
+ */
static void
spa_keystore_change_key_sync_impl(uint64_t rddobj, uint64_t ddobj,
- uint64_t new_rddobj, dsl_wrapping_key_t *wkey, dmu_tx_t *tx)
+ uint64_t new_rddobj, dsl_wrapping_key_t *wkey, boolean_t skip,
+ dmu_tx_t *tx)
{
int ret;
zap_cursor_t *zc;
@@ -1409,7 +1416,7 @@ spa_keystore_change_key_sync_impl(uint64_t rddobj, uint64_t ddobj,
/* hold the dd */
VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd));
- /* ignore hidden dsl dirs */
+ /* ignore special dsl dirs */
if (dd->dd_myname[0] == '$' || dd->dd_myname[0] == '%') {
dsl_dir_rele(dd, FTAG);
return;
@@ -1422,7 +1429,8 @@ spa_keystore_change_key_sync_impl(uint64_t rddobj, uint64_t ddobj,
* Stop recursing if this dsl dir didn't inherit from the root
* or if this dd is a clone.
*/
- if (ret == ENOENT || curr_rddobj != rddobj || dsl_dir_is_clone(dd)) {
+ if (ret == ENOENT ||
+ (!skip && (curr_rddobj != rddobj || dsl_dir_is_clone(dd)))) {
dsl_dir_rele(dd, FTAG);
return;
}
@@ -1430,19 +1438,23 @@ spa_keystore_change_key_sync_impl(uint64_t rddobj, uint64_t ddobj,
/*
* If we don't have a wrapping key just update the dck to reflect the
* new encryption root. Otherwise rewrap the entire dck and re-sync it
- * to disk.
+ * to disk. If skip is set, we don't do any of this work.
*/
- if (wkey == NULL) {
- VERIFY0(zap_update(dp->dp_meta_objset, dd->dd_crypto_obj,
- DSL_CRYPTO_KEY_ROOT_DDOBJ, 8, 1, &new_rddobj, tx));
- } else {
- VERIFY0(spa_keystore_dsl_key_hold_dd(dp->dp_spa, dd,
- FTAG, &dck));
- dsl_wrapping_key_hold(wkey, dck);
- dsl_wrapping_key_rele(dck->dck_wkey, dck);
- dck->dck_wkey = wkey;
- dsl_crypto_key_sync(dck, tx);
- spa_keystore_dsl_key_rele(dp->dp_spa, dck, FTAG);
+ if (!skip) {
+ if (wkey == NULL) {
+ VERIFY0(zap_update(dp->dp_meta_objset,
+ dd->dd_crypto_obj,
+ DSL_CRYPTO_KEY_ROOT_DDOBJ, 8, 1,
+ &new_rddobj, tx));
+ } else {
+ VERIFY0(spa_keystore_dsl_key_hold_dd(dp->dp_spa, dd,
+ FTAG, &dck));
+ dsl_wrapping_key_hold(wkey, dck);
+ dsl_wrapping_key_rele(dck->dck_wkey, dck);
+ dck->dck_wkey = wkey;
+ dsl_crypto_key_sync(dck, tx);
+ spa_keystore_dsl_key_rele(dp->dp_spa, dck, FTAG);
+ }
}
zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
@@ -1454,7 +1466,27 @@ spa_keystore_change_key_sync_impl(uint64_t rddobj, uint64_t ddobj,
zap_cursor_retrieve(zc, za) == 0;
zap_cursor_advance(zc)) {
spa_keystore_change_key_sync_impl(rddobj,
- za->za_first_integer, new_rddobj, wkey, tx);
+ za->za_first_integer, new_rddobj, wkey, B_FALSE, tx);
+ }
+ zap_cursor_fini(zc);
+
+ /*
+ * Recurse into all dsl dirs of clones. We utilize the skip parameter
+ * here so that we don't attempt to process the clones directly. This
+ * is because the clone and its origin share the same dck, which has
+ * already been updated.
+ */
+ for (zap_cursor_init(zc, dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_clones);
+ zap_cursor_retrieve(zc, za) == 0;
+ zap_cursor_advance(zc)) {
+ dsl_dataset_t *clone;
+
+ VERIFY0(dsl_dataset_hold_obj(dp, za->za_first_integer,
+ FTAG, &clone));
+ spa_keystore_change_key_sync_impl(rddobj,
+ clone->ds_dir->dd_object, new_rddobj, wkey, B_TRUE, tx);
+ dsl_dataset_rele(clone, FTAG);
}
zap_cursor_fini(zc);
@@ -1534,7 +1566,7 @@ spa_keystore_change_key_sync(void *arg, dmu_tx_t *tx)
/* recurse through all children and rewrap their keys */
spa_keystore_change_key_sync_impl(rddobj, ds->ds_dir->dd_object,
- new_rddobj, wkey, tx);
+ new_rddobj, wkey, B_FALSE, tx);
/*
* All references to the old wkey should be released now (if it
@@ -1708,7 +1740,7 @@ dsl_dataset_promote_crypt_sync(dsl_dir_t *target, dsl_dir_t *origin,
rw_enter(&dp->dp_spa->spa_keystore.sk_wkeys_lock, RW_WRITER);
spa_keystore_change_key_sync_impl(rddobj, origin->dd_object,
- target->dd_object, NULL, tx);
+ target->dd_object, NULL, B_FALSE, tx);
rw_exit(&dp->dp_spa->spa_keystore.sk_wkeys_lock);
dsl_dataset_rele(targetds, FTAG);
diff --git a/usr/src/uts/common/fs/zfs/dsl_scan.c b/usr/src/uts/common/fs/zfs/dsl_scan.c
index b619719ba9..f6e8db4100 100644
--- a/usr/src/uts/common/fs/zfs/dsl_scan.c
+++ b/usr/src/uts/common/fs/zfs/dsl_scan.c
@@ -24,7 +24,7 @@
* Copyright 2016 Gary Mills
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright 2019 Joyent, Inc.
- * Copyright (c) 2017 Datto Inc.
+ * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
*/
#include <sys/dsl_scan.h>
@@ -549,6 +549,22 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
zfs_dbgmsg("new-style scrub was modified "
"by old software; restarting in txg %llu",
(longlong_t)scn->scn_restart_txg);
+ } else if (dsl_scan_resilvering(dp)) {
+ /*
+ * If a resilver is in progress and there are already
+ * errors, restart it instead of finishing this scan and
+ * then restarting it. If there haven't been any errors
+ * then remember that the incore DTL is valid.
+ */
+ if (scn->scn_phys.scn_errors > 0) {
+ scn->scn_restart_txg = txg;
+ zfs_dbgmsg("resilver can't excise DTL_MISSING "
+ "when finished; restarting in txg %llu",
+ (u_longlong_t)scn->scn_restart_txg);
+ } else {
+ /* it's safe to excise DTL when finished */
+ spa->spa_scrub_started = B_TRUE;
+ }
}
}
@@ -599,6 +615,13 @@ dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
}
boolean_t
+dsl_scan_resilver_scheduled(dsl_pool_t *dp)
+{
+ return ((dp->dp_scan && dp->dp_scan->scn_restart_txg != 0) ||
+ (spa_async_tasks(dp->dp_spa) & SPA_ASYNC_RESILVER));
+}
+
+boolean_t
dsl_scan_scrubbing(const dsl_pool_t *dp)
{
dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys;
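dsl_scan_resilver_scheduled() reports a resilver that has not yet begun: either a restart txg is pending or an SPA_ASYNC_RESILVER task is queued. A hypothetical caller might gate new scan work on both the running and the scheduled case (error choice illustrative):

    if (dsl_scan_resilvering(dp) || dsl_scan_resilver_scheduled(dp))
        return (SET_ERROR(EBUSY));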
@@ -794,7 +817,7 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
(void) spa_vdev_state_exit(spa, NULL, 0);
if (func == POOL_SCAN_RESILVER) {
- dsl_resilver_restart(spa->spa_dsl_pool, 0);
+ dsl_scan_restart_resilver(spa->spa_dsl_pool, 0);
return (0);
}
@@ -813,41 +836,6 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED));
}
-/*
- * Sets the resilver defer flag to B_FALSE on all leaf devs under vd. Returns
- * B_TRUE if we have devices that need to be resilvered and are available to
- * accept resilver I/Os.
- */
-static boolean_t
-dsl_scan_clear_deferred(vdev_t *vd, dmu_tx_t *tx)
-{
- boolean_t resilver_needed = B_FALSE;
- spa_t *spa = vd->vdev_spa;
-
- for (int c = 0; c < vd->vdev_children; c++) {
- resilver_needed |=
- dsl_scan_clear_deferred(vd->vdev_child[c], tx);
- }
-
- if (vd == spa->spa_root_vdev &&
- spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) {
- spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
- vdev_config_dirty(vd);
- spa->spa_resilver_deferred = B_FALSE;
- return (resilver_needed);
- }
-
- if (!vdev_is_concrete(vd) || vd->vdev_aux ||
- !vd->vdev_ops->vdev_op_leaf)
- return (resilver_needed);
-
- if (vd->vdev_resilver_deferred)
- vd->vdev_resilver_deferred = B_FALSE;
-
- return (!vdev_is_dead(vd) && !vd->vdev_offline &&
- vdev_resilver_needed(vd, NULL, NULL));
-}
-
/* ARGSUSED */
static void
dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
@@ -915,7 +903,6 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
"errors=%llu", spa_get_errlog_size(spa));
if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
- spa->spa_scrub_started = B_FALSE;
spa->spa_scrub_active = B_FALSE;
/*
@@ -943,30 +930,33 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
spa_errlog_rotate(spa);
/*
+ * Don't clear flag until after vdev_dtl_reassess to ensure that
+ * DTL_MISSING will get updated when possible.
+ */
+ spa->spa_scrub_started = B_FALSE;
+
+ /*
* We may have finished replacing a device.
* Let the async thread assess this and handle the detach.
*/
spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
/*
- * Clear any deferred_resilver flags in the config.
+ * Clear any resilver_deferred flags in the config.
* If there are drives that need resilvering, kick
* off an asynchronous request to start resilver.
- * dsl_scan_clear_deferred() may update the config
+ * vdev_clear_resilver_deferred() may update the config
* before the resilver can restart. In the event of
* a crash during this period, the spa loading code
* will find the drives that need to be resilvered
- * when the machine reboots and start the resilver then.
+ * and start the resilver then.
*/
- if (spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) {
- boolean_t resilver_needed =
- dsl_scan_clear_deferred(spa->spa_root_vdev, tx);
- if (resilver_needed) {
- spa_history_log_internal(spa,
- "starting deferred resilver", tx,
- "errors=%llu", spa_get_errlog_size(spa));
- spa_async_request(spa, SPA_ASYNC_RESILVER);
- }
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER) &&
+ vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) {
+ spa_history_log_internal(spa,
+ "starting deferred resilver", tx, "errors=%llu",
+ (u_longlong_t)spa_get_errlog_size(spa));
+ spa_async_request(spa, SPA_ASYNC_RESILVER);
}
}
@@ -1073,7 +1063,7 @@ dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd)
/* start a new scan, or restart an existing one. */
void
-dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
+dsl_scan_restart_resilver(dsl_pool_t *dp, uint64_t txg)
{
if (txg == 0) {
dmu_tx_t *tx;
@@ -1221,10 +1211,13 @@ scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx)
static boolean_t
dsl_scan_should_clear(dsl_scan_t *scn)
{
+ spa_t *spa = scn->scn_dp->dp_spa;
vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
- uint64_t mlim_hard, mlim_soft, mused;
- uint64_t alloc = metaslab_class_get_alloc(spa_normal_class(
- scn->scn_dp->dp_spa));
+ uint64_t alloc, mlim_hard, mlim_soft, mused;
+
+ alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+ alloc += metaslab_class_get_alloc(spa_special_class(spa));
+ alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE,
zfs_scan_mem_lim_min);
@@ -3863,7 +3856,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
/*
* Keep track of how much data we've examined so that
- * zpool(1M) status can make useful progress reports.
+ * zpool(8) status can make useful progress reports.
*/
scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva);
spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva);
@@ -4208,3 +4201,33 @@ dsl_scan_freed(spa_t *spa, const blkptr_t *bp)
for (int i = 0; i < BP_GET_NDVAS(bp); i++)
dsl_scan_freed_dva(spa, bp, i);
}
+
+/*
+ * Check if a vdev needs resilvering (non-empty DTL), if so, and resilver has
+ * not started, start it. Otherwise, only restart if max txg in DTL range is
+ * greater than the max txg in the current scan. If the DTL max is less than
+ * the scan max, then the vdev has not missed any new data since the resilver
+ * started, so a restart is not needed.
+ */
+void
+dsl_scan_assess_vdev(dsl_pool_t *dp, vdev_t *vd)
+{
+ uint64_t min, max;
+
+ if (!vdev_resilver_needed(vd, &min, &max))
+ return;
+
+ if (!dsl_scan_resilvering(dp)) {
+ spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER);
+ return;
+ }
+
+ if (max <= dp->dp_scan->scn_phys.scn_max_txg)
+ return;
+
+ /* restart is needed, check if it can be deferred */
+ if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))
+ vdev_defer_resilver(vd);
+ else
+ spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER);
+}
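A hypothetical call site (the real callers live in the vdev code): after a device returns to service, hand it to the scan logic and let the decision tree above choose between deferring and restarting:

    /* vd has just come back online; pool assumed writeable. */
    if (vdev_readable(vd))
        dsl_scan_assess_vdev(vd->vdev_spa->spa_dsl_pool, vd);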
diff --git a/usr/src/uts/common/fs/zfs/lua/README.zfs b/usr/src/uts/common/fs/zfs/lua/README.zfs
index 0e22de7a4a..bd0804f99e 100644
--- a/usr/src/uts/common/fs/zfs/lua/README.zfs
+++ b/usr/src/uts/common/fs/zfs/lua/README.zfs
@@ -26,7 +26,7 @@ maintenance policy, the modifications that have been made to it, and how it
should (and should not) be used.
For a description of the Lua language and features exposed by ZFS channel
-programs, please refer to the zfs-program(1m) man page instead.
+programs, please refer to the zfs-program(8) man page instead.
Maintenance policy
diff --git a/usr/src/uts/common/fs/zfs/lua/ldebug.c b/usr/src/uts/common/fs/zfs/lua/ldebug.c
index b8ddcff3c6..4ed0094bde 100644
--- a/usr/src/uts/common/fs/zfs/lua/ldebug.c
+++ b/usr/src/uts/common/fs/zfs/lua/ldebug.c
@@ -467,7 +467,7 @@ static const char *getfuncname (lua_State *L, CallInfo *ci, const char **name) {
return getobjname(p, pc, GETARG_A(i), name);
case OP_TFORCALL: { /* for iterator */
*name = "for iterator";
- return "for iterator";
+ return "for iterator";
}
/* all other instructions can call only through metamethods */
case OP_SELF:
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 42ba1f9a46..4828824b10 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -2414,7 +2414,7 @@ metaslab_load_impl(metaslab_t *msp)
msp->ms_max_size = metaslab_largest_allocatable(msp);
ASSERT3U(max_size, <=, msp->ms_max_size);
hrtime_t load_end = gethrtime();
- msp->ms_load_time = load_end;
+ msp->ms_load_time = load_end;
if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
zfs_dbgmsg("loading: txg %llu, spa %s, vdev_id %llu, "
"ms_id %llu, smp_length %llu, "
@@ -5639,7 +5639,7 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
range_tree_remove(msp->ms_allocatable, offset, size);
range_tree_clear(msp->ms_trim, offset, size);
- if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
+ if (spa_writeable(spa)) { /* don't dirty if we're zdb(8) */
metaslab_class_t *mc = msp->ms_group->mg_class;
multilist_sublist_t *mls =
multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
@@ -5686,7 +5686,7 @@ metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
metaslab_claim_cb_arg_t arg;
/*
- * Only zdb(1M) can claim on indirect vdevs. This is used
+ * Only zdb(8) can claim on indirect vdevs. This is used
* to detect leaks of mapped space (that are not accounted
* for in the obsolete counts, spacemap, or bpobj).
*/
diff --git a/usr/src/uts/common/fs/zfs/sa.c b/usr/src/uts/common/fs/zfs/sa.c
index 0d2d28e1d3..ad4facaf5b 100644
--- a/usr/src/uts/common/fs/zfs/sa.c
+++ b/usr/src/uts/common/fs/zfs/sa.c
@@ -24,7 +24,6 @@
* Portions Copyright 2011 iXsystems, Inc
* Copyright (c) 2013, 2017 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
- * Copyright (c) 2015 Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2019 Joyent, Inc.
*/
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index 95c35a0f5f..26cc3b0824 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -27,10 +27,11 @@
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016 Toomas Soome <tsoome@me.com>
+ * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
* Copyright 2019 Joyent, Inc.
* Copyright (c) 2017, Intel Corporation.
- * Copyright (c) 2017 Datto Inc.
- * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
+ * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org>
+ * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
*/
/*
@@ -1730,13 +1731,15 @@ spa_load_l2cache(spa_t *spa)
ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+ nl2cache = 0;
+ newvdevs = NULL;
if (sav->sav_config != NULL) {
VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
- newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
- } else {
- nl2cache = 0;
- newvdevs = NULL;
+ if (nl2cache > 0) {
+ newvdevs = kmem_alloc(
+ nl2cache * sizeof (void *), KM_SLEEP);
+ }
}
oldvdevs = sav->sav_vdevs;
@@ -1828,7 +1831,11 @@ spa_load_l2cache(spa_t *spa)
VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
DATA_TYPE_NVLIST_ARRAY) == 0);
- l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
+ l2cache = NULL;
+ if (sav->sav_count > 0) {
+ l2cache = kmem_alloc(
+ sav->sav_count * sizeof (void *), KM_SLEEP);
+ }
for (i = 0; i < sav->sav_count; i++)
l2cache[i] = vdev_config_generate(spa,
sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
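Both hunks in spa_load_l2cache() follow the same defensive pattern: never call kmem_alloc() with a zero size, keep the pointer NULL while the count is zero, and free only what was allocated. A generic sketch of that pattern (names illustrative):

    nvlist_t **items = NULL;
    void **arr = NULL;
    uint_t n = 0;

    (void) nvlist_lookup_nvlist_array(cfg, key, &items, &n);
    if (n > 0)
        arr = kmem_alloc(n * sizeof (void *), KM_SLEEP);
    /* ... populate and use arr[0 .. n-1] ... */
    if (arr != NULL)
        kmem_free(arr, n * sizeof (void *));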
@@ -2407,7 +2414,8 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
spa->spa_loaded_ts.tv_nsec = 0;
}
if (error != EBADF) {
- zfs_ereport_post(ereport, spa, NULL, NULL, NULL, 0, 0);
+ (void) zfs_ereport_post(ereport, spa,
+ NULL, NULL, NULL, 0, 0);
}
}
spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
@@ -3610,6 +3618,7 @@ spa_ld_get_props(spa_t *spa)
spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
+ spa_prop_find(spa, ZPOOL_PROP_BOOTSIZE, &spa->spa_bootsize);
spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
&spa->spa_dedup_ditto);
@@ -4379,6 +4388,8 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
}
spa_import_progress_remove(spa);
+ spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
+
spa_load_note(spa, "LOADED");
return (0);
@@ -5375,10 +5386,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
* Get the root pool information from the root disk, then import the root pool
* during the system boot up time.
*/
-extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
-
static nvlist_t *
-spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
+spa_generate_rootconf(const char *devpath, const char *devid, uint64_t *guid,
+ uint64_t pool_guid)
{
nvlist_t *config;
nvlist_t *nvtop, *nvroot;
@@ -5396,6 +5406,19 @@ spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
&pgid) == 0);
VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
+ if (pool_guid != 0 && pool_guid != pgid) {
+ /*
+ * The boot loader provided a pool GUID, but it does not match
+ * the one we found in the label. Return failure so that we
+ * can fall back to the full device scan.
+ */
+ zfs_dbgmsg("spa_generate_rootconf: loader pool guid %llu != "
+ "label pool guid %llu", (u_longlong_t)pool_guid,
+ (u_longlong_t)pgid);
+ nvlist_free(config);
+ return (NULL);
+ }
+
/*
* Put this pool's top-level vdevs into a root vdev.
*/
@@ -5462,7 +5485,8 @@ spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
* "/pci@1f,0/ide@d/disk@0,0:a"
*/
int
-spa_import_rootpool(char *devpath, char *devid)
+spa_import_rootpool(char *devpath, char *devid, uint64_t pool_guid,
+ uint64_t vdev_guid)
{
spa_t *spa;
vdev_t *rvd, *bvd, *avd = NULL;
@@ -5470,20 +5494,43 @@ spa_import_rootpool(char *devpath, char *devid)
uint64_t guid, txg;
char *pname;
int error;
+ const char *altdevpath = NULL;
/*
* Read the label from the boot device and generate a configuration.
*/
- config = spa_generate_rootconf(devpath, devid, &guid);
+ config = spa_generate_rootconf(devpath, devid, &guid, pool_guid);
#if defined(_OBP) && defined(_KERNEL)
if (config == NULL) {
if (strstr(devpath, "/iscsi/ssd") != NULL) {
/* iscsi boot */
get_iscsi_bootpath_phy(devpath);
- config = spa_generate_rootconf(devpath, devid, &guid);
+ config = spa_generate_rootconf(devpath, devid, &guid,
+ pool_guid);
}
}
#endif
+
+ /*
+ * We were unable to import the pool using the /devices path or devid
+ * provided by the boot loader. This may be the case if the boot
+ * device has been connected to a different location in the system, or
+ * if a new boot environment has changed the driver used to access the
+ * boot device.
+ *
+ * Attempt an exhaustive scan of all visible block devices to see if we
+ * can locate an alternative /devices path with a label that matches
+ * the expected pool and vdev GUID.
+ */
+ if (config == NULL && (altdevpath =
+ vdev_disk_preroot_lookup(pool_guid, vdev_guid)) != NULL) {
+ cmn_err(CE_NOTE, "Original /devices path (%s) not available; "
+ "ZFS is trying an alternate path (%s)", devpath,
+ altdevpath);
+ config = spa_generate_rootconf(altdevpath, NULL, &guid,
+ pool_guid);
+ }
+
if (config == NULL) {
cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
devpath);
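Stripped of the preprocessor conditionals, the root-pool import now tries up to three configurations in order; a condensed sketch of the fallback chain using the functions from this hunk:

    /* 1. the /devices path and devid handed over by the boot loader */
    config = spa_generate_rootconf(devpath, devid, &guid, pool_guid);

    /* 2. (OBP kernels) the same lookup after iSCSI boot path rewriting */
    if (config == NULL && strstr(devpath, "/iscsi/ssd") != NULL) {
        get_iscsi_bootpath_phy(devpath);
        config = spa_generate_rootconf(devpath, devid, &guid, pool_guid);
    }

    /* 3. exhaustive scan for a label matching the pool and vdev GUIDs */
    if (config == NULL &&
        (altdevpath = vdev_disk_preroot_lookup(pool_guid, vdev_guid)) != NULL)
        config = spa_generate_rootconf(altdevpath, NULL, &guid, pool_guid);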
@@ -6360,9 +6407,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
*/
if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
- vdev_set_deferred_resilver(spa, newvd);
+ vdev_defer_resilver(newvd);
else
- dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
+ dsl_scan_restart_resilver(spa->spa_dsl_pool, dtl_max_txg);
if (spa->spa_bootfs)
spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
@@ -7600,7 +7647,7 @@ spa_async_thread(void *arg)
if (tasks & SPA_ASYNC_RESILVER &&
(!dsl_scan_resilvering(dp) ||
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)))
- dsl_resilver_restart(dp, 0);
+ dsl_scan_restart_resilver(dp, 0);
if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
mutex_enter(&spa_namespace_lock);
@@ -7627,6 +7674,17 @@ spa_async_thread(void *arg)
}
/*
+ * Kick off L2 cache rebuilding.
+ */
+ if (tasks & SPA_ASYNC_L2CACHE_REBUILD) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER);
+ l2arc_spa_rebuild_start(spa);
+ spa_config_exit(spa, SCL_L2ARC, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ /*
* Let the world know that we're done.
*/
mutex_enter(&spa->spa_async_lock);
@@ -7716,6 +7774,12 @@ spa_async_request(spa_t *spa, int task)
mutex_exit(&spa->spa_async_lock);
}
+int
+spa_async_tasks(spa_t *spa)
+{
+ return (spa->spa_async_tasks);
+}
+
/*
* ==========================================================================
* SPA syncing routines
diff --git a/usr/src/uts/common/fs/zfs/spa_config.c b/usr/src/uts/common/fs/zfs/spa_config.c
index 4719696ca4..ae814208fd 100644
--- a/usr/src/uts/common/fs/zfs/spa_config.c
+++ b/usr/src/uts/common/fs/zfs/spa_config.c
@@ -280,7 +280,8 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent)
* resource issues are resolved.
*/
if (target->spa_ccw_fail_time == 0) {
- zfs_ereport_post(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE,
+ (void) zfs_ereport_post(
+ FM_EREPORT_ZFS_CONFIG_CACHE_WRITE,
target, NULL, NULL, NULL, 0, 0);
}
target->spa_ccw_fail_time = gethrtime();
diff --git a/usr/src/uts/common/fs/zfs/spa_history.c b/usr/src/uts/common/fs/zfs/spa_history.c
index 897d3c6e9a..44a4ec7ddc 100644
--- a/usr/src/uts/common/fs/zfs/spa_history.c
+++ b/usr/src/uts/common/fs/zfs/spa_history.c
@@ -334,7 +334,7 @@ spa_history_log_sync(void *arg, dmu_tx_t *tx)
* posted as a result of the ZPOOL_HIST_CMD key being present
* it would result in only one sysevent being posted with the
* full command line arguments, requiring the consumer to know
- * how to parse and understand zfs(1M) command invocations.
+ * how to parse and understand zfs(8) command invocations.
*/
spa_history_log_notify(spa, nvl);
} else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) {
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index 9dac4e2ddc..cb59eef824 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -44,6 +44,7 @@
#include <sys/vdev_impl.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_trim.h>
+#include <sys/vdev_raidz.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
@@ -315,6 +316,16 @@ uint64_t zfs_deadman_checktime_ms = 5000ULL;
*/
int zfs_deadman_enabled = -1;
+#if defined(__amd64__) || defined(__i386__)
+/*
+ * Should we allow the use of mechanisms that depend on saving and restoring
+ * the FPU state? This was disabled initially due to stability issues in
+ * the kernel FPU routines; see bug 13717. As of the fixes for 13902 and
+ * 13915, it has once again been enabled.
+ */
+int zfs_fpu_enabled = 1;
+#endif
+
/*
* The worst case is single-sector max-parity RAID-Z blocks, in which
* case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
@@ -1373,7 +1384,7 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
/*
* If anything changed, wait for it to sync. This ensures that,
- * from the system administrator's perspective, zpool(1M) commands
+ * from the system administrator's perspective, zpool(8) commands
* are synchronous. This is important for things like zpool offline:
* when the command completes, you expect no further I/O from ZFS.
*/
@@ -2253,6 +2264,7 @@ spa_init(int mode)
zil_init();
vdev_cache_stat_init();
vdev_mirror_stat_init();
+ vdev_raidz_math_init();
zfs_prop_init();
zpool_prop_init();
zpool_feature_init();
@@ -2271,6 +2283,7 @@ spa_fini(void)
vdev_cache_stat_fini();
vdev_mirror_stat_fini();
+ vdev_raidz_math_fini();
zil_fini();
dmu_fini();
zio_fini();
diff --git a/usr/src/uts/common/fs/zfs/sys/abd.h b/usr/src/uts/common/fs/zfs/sys/abd.h
index 621635933e..23699c0420 100644
--- a/usr/src/uts/common/fs/zfs/sys/abd.h
+++ b/usr/src/uts/common/fs/zfs/sys/abd.h
@@ -103,6 +103,15 @@ int abd_cmp(abd_t *, abd_t *, size_t);
int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t);
void abd_zero_off(abd_t *, size_t, size_t);
+void abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
+ ssize_t csize, ssize_t dsize, const unsigned parity,
+ void (*func_raidz_gen)(void **, const void *, size_t, size_t));
+void abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
+ ssize_t tsize, const unsigned parity,
+ void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
+ const unsigned *mul),
+ const unsigned *mul);
+
/*
* Wrappers for calls with offsets of 0
*/
diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h
index 1ef3bb79ca..e5c18febe5 100644
--- a/usr/src/uts/common/fs/zfs/sys/arc.h
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h
@@ -179,6 +179,16 @@ typedef enum arc_space_type {
ARC_SPACE_NUMTYPES
} arc_space_type_t;
+typedef enum arc_state_type {
+ ARC_STATE_ANON,
+ ARC_STATE_MRU,
+ ARC_STATE_MRU_GHOST,
+ ARC_STATE_MFU,
+ ARC_STATE_MFU_GHOST,
+ ARC_STATE_L2C_ONLY,
+ ARC_STATE_NUMTYPES
+} arc_state_type_t;
+
void arc_space_consume(uint64_t space, arc_space_type_t type);
void arc_space_return(uint64_t space, arc_space_type_t type);
boolean_t arc_is_metadata(arc_buf_t *buf);
@@ -248,10 +258,14 @@ void arc_fini(void);
void l2arc_add_vdev(spa_t *spa, vdev_t *vd);
void l2arc_remove_vdev(vdev_t *vd);
boolean_t l2arc_vdev_present(vdev_t *vd);
+void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen);
+boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top,
+ uint64_t check);
void l2arc_init(void);
void l2arc_fini(void);
void l2arc_start(void);
void l2arc_stop(void);
+void l2arc_spa_rebuild_start(spa_t *spa);
#ifndef _KERNEL
extern boolean_t arc_watch;
diff --git a/usr/src/uts/common/fs/zfs/sys/arc_impl.h b/usr/src/uts/common/fs/zfs/sys/arc_impl.h
new file mode 100644
index 0000000000..d35b7eea2d
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/arc_impl.h
@@ -0,0 +1,876 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2019, Joyent, Inc.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2020, George Amanakis. All rights reserved.
+ */
+
+#ifndef _SYS_ARC_IMPL_H
+#define _SYS_ARC_IMPL_H
+
+#include <sys/arc.h>
+#include <sys/multilist.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Note that buffers can be in one of 6 states:
+ * ARC_anon - anonymous (discussed below)
+ * ARC_mru - recently used, currently cached
+ * ARC_mru_ghost - recently used, no longer in cache
+ * ARC_mfu - frequently used, currently cached
+ * ARC_mfu_ghost - frequently used, no longer in cache
+ * ARC_l2c_only - exists in L2ARC but not other states
+ * When there are no active references to the buffer, they are
+ * are linked onto a list in one of these arc states. These are
+ * the only buffers that can be evicted or deleted. Within each
+ * state there are multiple lists, one for meta-data and one for
+ * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
+ * etc.) is tracked separately so that it can be managed more
+ * explicitly: favored over data, limited explicitly.
+ *
+ * Anonymous buffers are buffers that are not associated with
+ * a DVA. These are buffers that hold dirty block copies
+ * before they are written to stable storage. By definition,
+ * they are "ref'd" and are considered part of arc_mru
+ * that cannot be freed. Generally, they will acquire a DVA
+ * as they are written and migrate onto the arc_mru list.
+ *
+ * The ARC_l2c_only state is for buffers that are in the second
+ * level ARC but no longer in any of the ARC_m* lists. The second
+ * level ARC itself may also contain buffers that are in any of
+ * the ARC_m* states - meaning that a buffer can exist in two
+ * places. The reason for the ARC_l2c_only state is to keep the
+ * buffer header in the hash table, so that reads that hit the
+ * second level ARC benefit from these fast lookups.
+ */
+
+typedef struct arc_state {
+ /*
+ * list of evictable buffers
+ */
+ multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
+ /*
+ * total amount of evictable data in this state
+ */
+ zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
+ /*
+ * total amount of data in this state; this includes: evictable,
+ * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
+ */
+ zfs_refcount_t arcs_size;
+
+ arc_state_type_t arcs_state;
+} arc_state_t;
+
+typedef struct arc_callback arc_callback_t;
+
+struct arc_callback {
+ void *acb_private;
+ arc_read_done_func_t *acb_done;
+ arc_buf_t *acb_buf;
+ boolean_t acb_encrypted;
+ boolean_t acb_compressed;
+ boolean_t acb_noauth;
+ zbookmark_phys_t acb_zb;
+ zio_t *acb_zio_dummy;
+ zio_t *acb_zio_head;
+ arc_callback_t *acb_next;
+};
+
+typedef struct arc_write_callback arc_write_callback_t;
+
+struct arc_write_callback {
+ void *awcb_private;
+ arc_write_done_func_t *awcb_ready;
+ arc_write_done_func_t *awcb_children_ready;
+ arc_write_done_func_t *awcb_physdone;
+ arc_write_done_func_t *awcb_done;
+ arc_buf_t *awcb_buf;
+};
+
+/*
+ * ARC buffers are separated into multiple structs as a memory saving measure:
+ * - Common fields struct, always defined, and embedded within it:
+ * - L2-only fields, always allocated but undefined when not in L2ARC
+ * - L1-only fields, only allocated when in L1ARC
+ *
+ * Buffer in L1 Buffer only in L2
+ * +------------------------+ +------------------------+
+ * | arc_buf_hdr_t | | arc_buf_hdr_t |
+ * | | | |
+ * | | | |
+ * | | | |
+ * +------------------------+ +------------------------+
+ * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t |
+ * | (undefined if L1-only) | | |
+ * +------------------------+ +------------------------+
+ * | l1arc_buf_hdr_t |
+ * | |
+ * | |
+ * | |
+ * | |
+ * +------------------------+
+ *
+ * Because it's possible for the L2ARC to become extremely large, we can wind
+ * up eating a lot of memory in L2ARC buffer headers, so the size of a header
+ * is minimized by only allocating the fields necessary for an L1-cached buffer
+ * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
+ * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
+ * words in pointers. arc_hdr_realloc() is used to switch a header between
+ * these two allocation states.
+ */
+typedef struct l1arc_buf_hdr {
+ kmutex_t b_freeze_lock;
+ zio_cksum_t *b_freeze_cksum;
+#ifdef ZFS_DEBUG
+ /*
+ * Used for debugging with kmem_flags - by allocating and freeing
+ * b_thawed when the buffer is thawed, we get a record of the stack
+ * trace that thawed it.
+ */
+ void *b_thawed;
+#endif
+
+ arc_buf_t *b_buf;
+ uint32_t b_bufcnt;
+ /* for waiting on writes to complete */
+ kcondvar_t b_cv;
+ uint8_t b_byteswap;
+
+ /* protected by arc state mutex */
+ arc_state_t *b_state;
+ multilist_node_t b_arc_node;
+
+ /* updated atomically */
+ clock_t b_arc_access;
+
+ /* self protecting */
+ zfs_refcount_t b_refcnt;
+
+ arc_callback_t *b_acb;
+ abd_t *b_pabd;
+} l1arc_buf_hdr_t;
+
+typedef enum l2arc_dev_hdr_flags_t {
+ L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */
+} l2arc_dev_hdr_flags_t;
+
+/*
+ * Pointer used in persistent L2ARC (for pointing to log blocks).
+ */
+typedef struct l2arc_log_blkptr {
+ /*
+ * Offset of log block within the device, in bytes
+ */
+ uint64_t lbp_daddr;
+ /*
+ * Aligned payload size (in bytes) of the log block
+ */
+ uint64_t lbp_payload_asize;
+ /*
+ * Offset in bytes of the first buffer in the payload
+ */
+ uint64_t lbp_payload_start;
+ /*
+ * lbp_prop has the following format:
+ * * logical size (in bytes)
+ * * aligned (after compression) size (in bytes)
+ * * compression algorithm (we always LZ4-compress l2arc logs)
+ * * checksum algorithm (used for lbp_cksum)
+ */
+ uint64_t lbp_prop;
+ zio_cksum_t lbp_cksum; /* checksum of log */
+} l2arc_log_blkptr_t;
+
+/*
+ * The persistent L2ARC device header.
+ * Byte order of magic determines whether 64-bit bswap of fields is necessary.
+ */
+typedef struct l2arc_dev_hdr_phys {
+ uint64_t dh_magic; /* L2ARC_DEV_HDR_MAGIC */
+ uint64_t dh_version; /* Persistent L2ARC version */
+
+ /*
+ * Global L2ARC device state and metadata.
+ */
+ uint64_t dh_spa_guid;
+ uint64_t dh_vdev_guid;
+ uint64_t dh_log_entries; /* mirror of l2ad_log_entries */
+ uint64_t dh_evict; /* evicted offset in bytes */
+ uint64_t dh_flags; /* l2arc_dev_hdr_flags_t */
+ /*
+ * Used in zdb.c for determining if a log block is valid, in the same
+ * way that l2arc_rebuild() does.
+ */
+ uint64_t dh_start; /* mirror of l2ad_start */
+ uint64_t dh_end; /* mirror of l2ad_end */
+ /*
+ * Start of log block chain. [0] -> newest log, [1] -> one older (used
+ * for initiating prefetch).
+ */
+ l2arc_log_blkptr_t dh_start_lbps[2];
+ /*
+ * Aligned size of all log blocks as accounted by vdev_space_update().
+ */
+ uint64_t dh_lb_asize; /* mirror of l2ad_lb_asize */
+ uint64_t dh_lb_count; /* mirror of l2ad_lb_count */
+ const uint64_t dh_pad[32]; /* pad to 512 bytes */
+ zio_eck_t dh_tail;
+} l2arc_dev_hdr_phys_t;
+CTASSERT(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);
+
+/*
+ * A single ARC buffer header entry in a l2arc_log_blk_phys_t.
+ */
+typedef struct l2arc_log_ent_phys {
+ dva_t le_dva; /* dva of buffer */
+ uint64_t le_birth; /* birth txg of buffer */
+ /*
+ * le_prop has the following format:
+ * * logical size (in bytes)
+ * * physical (compressed) size (in bytes)
+ * * compression algorithm
+ * * object type (used to restore arc_buf_contents_t)
+ * * protected status (used for encryption)
+ * * prefetch status (used in l2arc_read_done())
+ */
+ uint64_t le_prop;
+ uint64_t le_daddr; /* buf location on l2dev */
+ /*
+ * We pad the size of each entry to a power of 2 so that the size of
+ * l2arc_log_blk_phys_t is power-of-2 aligned with SPA_MINBLOCKSHIFT,
+ * because of the L2ARC_SET_*SIZE macros.
+ */
+ const uint64_t le_pad[3]; /* pad to 64 bytes */
+} l2arc_log_ent_phys_t;
+
+#define L2ARC_LOG_BLK_MAX_ENTRIES (1022)
+
+/*
+ * A log block of up to 1022 ARC buffer log entries, chained into the
+ * persistent L2ARC metadata linked list. Byte order of magic determines
+ * whether 64-bit bswap of fields is necessary.
+ */
+typedef struct l2arc_log_blk_phys {
+ uint64_t lb_magic; /* L2ARC_LOG_BLK_MAGIC */
+ /*
+ * There are 2 chains (headed by dh_start_lbps[2]), and this field
+ * points back to the previous block in this chain. We alternate
+ * which chain we append to, so they are time-wise and offset-wise
+ * interleaved, but that is an optimization rather than for
+ * correctness.
+ */
+ l2arc_log_blkptr_t lb_prev_lbp; /* pointer to prev log block */
+ /*
+ * Pad header section to 128 bytes
+ */
+ uint64_t lb_pad[7];
+ /* Payload */
+ l2arc_log_ent_phys_t lb_entries[L2ARC_LOG_BLK_MAX_ENTRIES];
+} l2arc_log_blk_phys_t; /* 64K total */
+/*
+ * The size of l2arc_log_blk_phys_t has to be power-of-2 aligned with
+ * SPA_MINBLOCKSHIFT because of L2BLK_SET_*SIZE macros.
+ */
+CTASSERT(IS_P2ALIGNED(sizeof (l2arc_log_blk_phys_t),
+ 1ULL << SPA_MINBLOCKSHIFT));
+CTASSERT(sizeof (l2arc_log_blk_phys_t) >= SPA_MINBLOCKSIZE);
+CTASSERT(sizeof (l2arc_log_blk_phys_t) <= SPA_MAXBLOCKSIZE);
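A sketch of how a rebuild can walk this structure, using only the fields above plus the l2arc_log_blkptr_valid() helper declared at the end of this header (I/O, decompression, and byte swapping omitted):

    l2arc_log_blkptr_t lbp = dev->l2ad_dev_hdr->dh_start_lbps[0];
    l2arc_log_blk_phys_t lb;

    while (l2arc_log_blkptr_valid(dev, &lbp)) {
        /*
         * Read L2BLK_GET_PSIZE(lbp.lbp_prop) bytes at lbp.lbp_daddr
         * into &lb, then restore lb.lb_entries[] into the ARC.
         */
        lbp = lb.lb_prev_lbp;    /* step to the next-older log block */
    }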
+
+/*
+ * These structures hold in-flight abd buffers for log blocks as they're being
+ * written to the L2ARC device.
+ */
+typedef struct l2arc_lb_abd_buf {
+ abd_t *abd;
+ list_node_t node;
+} l2arc_lb_abd_buf_t;
+
+/*
+ * These structures hold pointers to log blocks present on the L2ARC device.
+ */
+typedef struct l2arc_lb_ptr_buf {
+ l2arc_log_blkptr_t *lb_ptr;
+ list_node_t node;
+} l2arc_lb_ptr_buf_t;
+
+/* Macros for setting fields in le_prop and lbp_prop */
+#define L2BLK_GET_LSIZE(field) \
+ BF64_GET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)
+#define L2BLK_SET_LSIZE(field, x) \
+ BF64_SET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
+#define L2BLK_GET_PSIZE(field) \
+ BF64_GET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)
+#define L2BLK_SET_PSIZE(field, x) \
+ BF64_SET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
+#define L2BLK_GET_COMPRESS(field) \
+ BF64_GET((field), 32, SPA_COMPRESSBITS)
+#define L2BLK_SET_COMPRESS(field, x) \
+ BF64_SET((field), 32, SPA_COMPRESSBITS, x)
+#define L2BLK_GET_PREFETCH(field) BF64_GET((field), 39, 1)
+#define L2BLK_SET_PREFETCH(field, x) BF64_SET((field), 39, 1, x)
+#define L2BLK_GET_CHECKSUM(field) BF64_GET((field), 40, 8)
+#define L2BLK_SET_CHECKSUM(field, x) BF64_SET((field), 40, 8, x)
+#define L2BLK_GET_TYPE(field) BF64_GET((field), 48, 8)
+#define L2BLK_SET_TYPE(field, x) BF64_SET((field), 48, 8, x)
+#define L2BLK_GET_PROTECTED(field) BF64_GET((field), 56, 1)
+#define L2BLK_SET_PROTECTED(field, x) BF64_SET((field), 56, 1, x)
+#define L2BLK_GET_STATE(field) BF64_GET((field), 57, 4)
+#define L2BLK_SET_STATE(field, x) BF64_SET((field), 57, 4, x)
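These accessors pack a buffer's attributes into a single 64-bit le_prop/lbp_prop word. A round-trip sketch with illustrative values:

    uint64_t prop = 0;

    L2BLK_SET_LSIZE(prop, 131072);      /* 128K logical size */
    L2BLK_SET_PSIZE(prop, 8192);        /* 8K after compression */
    L2BLK_SET_COMPRESS(prop, ZIO_COMPRESS_LZ4);
    L2BLK_SET_TYPE(prop, ARC_BUFC_DATA);
    L2BLK_SET_PROTECTED(prop, 1);

    ASSERT3U(L2BLK_GET_LSIZE(prop), ==, 131072);
    ASSERT3U(L2BLK_GET_PSIZE(prop), ==, 8192);
    ASSERT3U(L2BLK_GET_COMPRESS(prop), ==, ZIO_COMPRESS_LZ4);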
+
+#define PTR_SWAP(x, y) \
+ do { \
+ void *tmp = (x);\
+ x = y; \
+ y = tmp; \
+ _NOTE(CONSTCOND)\
+ } while (0)
+
+#define L2ARC_DEV_HDR_MAGIC 0x5a46534341434845LLU /* ASCII: "ZFSCACHE" */
+#define L2ARC_LOG_BLK_MAGIC 0x4c4f47424c4b4844LLU /* ASCII: "LOGBLKHD" */
+
+/*
+ * L2ARC Internals
+ */
+typedef struct l2arc_dev {
+ vdev_t *l2ad_vdev; /* vdev */
+ spa_t *l2ad_spa; /* spa */
+ uint64_t l2ad_hand; /* next write location */
+ uint64_t l2ad_start; /* first addr on device */
+ uint64_t l2ad_end; /* last addr on device */
+ boolean_t l2ad_first; /* first sweep through */
+ boolean_t l2ad_writing; /* currently writing */
+ kmutex_t l2ad_mtx; /* lock for buffer list */
+ list_t l2ad_buflist; /* buffer list */
+ list_node_t l2ad_node; /* device list node */
+ zfs_refcount_t l2ad_alloc; /* allocated bytes */
+ /*
+ * Persistence-related stuff
+ */
+ l2arc_dev_hdr_phys_t *l2ad_dev_hdr; /* persistent device header */
+ uint64_t l2ad_dev_hdr_asize; /* aligned hdr size */
+ l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */
+ int l2ad_log_ent_idx; /* index into cur log blk */
+ /* Number of bytes in current log block's payload */
+ uint64_t l2ad_log_blk_payload_asize;
+ /*
+ * Offset (in bytes) of the first buffer in current log block's
+ * payload.
+ */
+ uint64_t l2ad_log_blk_payload_start;
+ /* Flag indicating whether a rebuild is scheduled or is going on */
+ boolean_t l2ad_rebuild;
+ boolean_t l2ad_rebuild_cancel;
+ boolean_t l2ad_rebuild_began;
+ uint64_t l2ad_log_entries; /* entries per log blk */
+ uint64_t l2ad_evict; /* evicted offset in bytes */
+ /* List of pointers to log blocks present in the L2ARC device */
+ list_t l2ad_lbptr_list;
+ /*
+ * Aligned size of all log blocks as accounted by vdev_space_update().
+ */
+ zfs_refcount_t l2ad_lb_asize;
+ /*
+ * Number of log blocks present on the device.
+ */
+ zfs_refcount_t l2ad_lb_count;
+} l2arc_dev_t;
+
+/*
+ * Encrypted blocks will need to be stored encrypted on the L2ARC
+ * disk as they appear in the main pool. In order for this to work we
+ * need to pass around the encryption parameters so they can be used
+ * to write data to the L2ARC. This struct is only defined in the
+ * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED
+ * flag set.
+ */
+typedef struct arc_buf_hdr_crypt {
+ abd_t *b_rabd; /* raw encrypted data */
+ dmu_object_type_t b_ot; /* object type */
+	uint32_t		b_ebufcnt;	/* number of encrypted buffers */
+
+ /* dsobj for looking up encryption key for l2arc encryption */
+ uint64_t b_dsobj; /* for looking up key */
+
+ /* encryption parameters */
+ uint8_t b_salt[ZIO_DATA_SALT_LEN];
+ uint8_t b_iv[ZIO_DATA_IV_LEN];
+
+ /*
+ * Technically this could be removed since we will always be able to
+ * get the mac from the bp when we need it. However, it is inconvenient
+ * for callers of arc code to have to pass a bp in all the time. This
+ * also allows us to assert that L2ARC data is properly encrypted to
+ * match the data in the main storage pool.
+ */
+ uint8_t b_mac[ZIO_DATA_MAC_LEN];
+} arc_buf_hdr_crypt_t;
+
+typedef struct l2arc_buf_hdr {
+ /* protected by arc_buf_hdr mutex */
+ l2arc_dev_t *b_dev; /* L2ARC device */
+ uint64_t b_daddr; /* disk address, offset byte */
+
+ arc_state_type_t b_arcs_state;
+ list_node_t b_l2node;
+} l2arc_buf_hdr_t;
+
+typedef struct l2arc_write_callback {
+ l2arc_dev_t *l2wcb_dev; /* device info */
+ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
+ /* in-flight list of log blocks */
+ list_t l2wcb_abd_list;
+} l2arc_write_callback_t;
+
+struct arc_buf_hdr {
+ /* protected by hash lock */
+ dva_t b_dva;
+ uint64_t b_birth;
+
+ arc_buf_contents_t b_type;
+ arc_buf_hdr_t *b_hash_next;
+ arc_flags_t b_flags;
+
+ /*
+ * This field stores the size of the data buffer after
+ * compression, and is set in the arc's zio completion handlers.
+ * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
+ *
+ * While the block pointers can store up to 32MB in their psize
+ * field, we can only store up to 32MB minus 512B. This is due
+ * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
+ * a field of zeros represents 512B in the bp). We can't use a
+ * bias of 1 since we need to reserve a psize of zero, here, to
+ * represent holes and embedded blocks.
+ *
+ * This isn't a problem in practice, since the maximum size of a
+ * buffer is limited to 16MB, so we never need to store 32MB in
+ * this field.
+ */
+ uint16_t b_psize;
+
+ /*
+ * This field stores the size of the data buffer before
+ * compression, and cannot change once set. It is in units
+ * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
+ */
+ uint16_t b_lsize; /* immutable */
+ uint64_t b_spa; /* immutable */
+
+ /* L2ARC fields. Undefined when not in L2ARC. */
+ l2arc_buf_hdr_t b_l2hdr;
+ /* L1ARC fields. Undefined when in l2arc_only state */
+ l1arc_buf_hdr_t b_l1hdr;
+ /*
+ * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED
+ * is set and the L1 header exists.
+ */
+ arc_buf_hdr_crypt_t b_crypt_hdr;
+};
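Because b_l1hdr and b_crypt_hdr sit at the end, an L2-only header can be cut short at the b_l1hdr offset; this is presumably what the arc_buf_hdr_t_full and arc_buf_hdr_t_l2only kmem caches (mentioned in the kstat comments below) are sized for. A sketch of the two sizes; the authoritative computation lives in arc.c:

    size_t hdr_l2only_size = offsetof(arc_buf_hdr_t, b_l1hdr);
    size_t hdr_full_size = sizeof (arc_buf_hdr_t);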
+
+typedef struct arc_stats {
+ kstat_named_t arcstat_hits;
+ kstat_named_t arcstat_misses;
+ kstat_named_t arcstat_demand_data_hits;
+ kstat_named_t arcstat_demand_data_misses;
+ kstat_named_t arcstat_demand_metadata_hits;
+ kstat_named_t arcstat_demand_metadata_misses;
+ kstat_named_t arcstat_prefetch_data_hits;
+ kstat_named_t arcstat_prefetch_data_misses;
+ kstat_named_t arcstat_prefetch_metadata_hits;
+ kstat_named_t arcstat_prefetch_metadata_misses;
+ kstat_named_t arcstat_mru_hits;
+ kstat_named_t arcstat_mru_ghost_hits;
+ kstat_named_t arcstat_mfu_hits;
+ kstat_named_t arcstat_mfu_ghost_hits;
+ kstat_named_t arcstat_deleted;
+ /*
+ * Number of buffers that could not be evicted because the hash lock
+ * was held by another thread. The lock may not necessarily be held
+ * by something using the same buffer, since hash locks are shared
+ * by multiple buffers.
+ */
+ kstat_named_t arcstat_mutex_miss;
+ /*
+ * Number of buffers skipped when updating the access state due to the
+ * header having already been released after acquiring the hash lock.
+ */
+ kstat_named_t arcstat_access_skip;
+ /*
+ * Number of buffers skipped because they have I/O in progress, are
+ * indirect prefetch buffers that have not lived long enough, or are
+ * not from the spa we're trying to evict from.
+ */
+ kstat_named_t arcstat_evict_skip;
+ /*
+ * Number of times arc_evict_state() was unable to evict enough
+ * buffers to reach its target amount.
+ */
+ kstat_named_t arcstat_evict_not_enough;
+ kstat_named_t arcstat_evict_l2_cached;
+ kstat_named_t arcstat_evict_l2_eligible;
+ kstat_named_t arcstat_evict_l2_eligible_mfu;
+ kstat_named_t arcstat_evict_l2_eligible_mru;
+ kstat_named_t arcstat_evict_l2_ineligible;
+ kstat_named_t arcstat_evict_l2_skip;
+ kstat_named_t arcstat_hash_elements;
+ kstat_named_t arcstat_hash_elements_max;
+ kstat_named_t arcstat_hash_collisions;
+ kstat_named_t arcstat_hash_chains;
+ kstat_named_t arcstat_hash_chain_max;
+ kstat_named_t arcstat_p;
+ kstat_named_t arcstat_c;
+ kstat_named_t arcstat_c_min;
+ kstat_named_t arcstat_c_max;
+ /* Not updated directly; only synced in arc_kstat_update. */
+ kstat_named_t arcstat_size;
+ /*
+ * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
+ * Note that the compressed bytes may match the uncompressed bytes
+ * if the block is either not compressed or compressed arc is disabled.
+ */
+ kstat_named_t arcstat_compressed_size;
+ /*
+ * Uncompressed size of the data stored in b_pabd. If compressed
+ * arc is disabled then this value will be identical to the stat
+ * above.
+ */
+ kstat_named_t arcstat_uncompressed_size;
+ /*
+ * Number of bytes stored in all the arc_buf_t's. This is classified
+ * as "overhead" since this data is typically short-lived and will
+ * be evicted from the arc when it becomes unreferenced unless the
+ * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
+ * values have been set (see comment in dbuf.c for more information).
+ */
+ kstat_named_t arcstat_overhead_size;
+ /*
+ * Number of bytes consumed by internal ARC structures necessary
+ * for tracking purposes; these structures are not actually
+ * backed by ARC buffers. This includes arc_buf_hdr_t structures
+ * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
+ * caches), and arc_buf_t structures (allocated via arc_buf_t
+ * cache).
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_hdr_size;
+ /*
+ * Number of bytes consumed by ARC buffers of type equal to
+ * ARC_BUFC_DATA. This is generally consumed by buffers backing
+ * on disk user data (e.g. plain file contents).
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_data_size;
+ /*
+ * Number of bytes consumed by ARC buffers of type equal to
+ * ARC_BUFC_METADATA. This is generally consumed by buffers
+ * backing on disk data that is used for internal ZFS
+ * structures (e.g. ZAP, dnode, indirect blocks, etc).
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_metadata_size;
+ /*
+ * Number of bytes consumed by various buffers and structures
+ * not actually backed with ARC buffers. This includes bonus
+ * buffers (allocated directly via zio_buf_* functions),
+ * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
+ * cache), and dnode_t structures (allocated via dnode_t cache).
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_other_size;
+ /*
+ * Total number of bytes consumed by ARC buffers residing in the
+ * arc_anon state. This includes *all* buffers in the arc_anon
+ * state; e.g. data, metadata, evictable, and unevictable buffers
+ * are all included in this value.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_anon_size;
+ /*
+ * Number of bytes consumed by ARC buffers that meet the
+ * following criteria: backing buffers of type ARC_BUFC_DATA,
+ * residing in the arc_anon state, and are eligible for eviction
+ * (e.g. have no outstanding holds on the buffer).
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_anon_evictable_data;
+ /*
+ * Number of bytes consumed by ARC buffers that meet the
+ * following criteria: backing buffers of type ARC_BUFC_METADATA,
+ * residing in the arc_anon state, and are eligible for eviction
+ * (e.g. have no outstanding holds on the buffer).
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_anon_evictable_metadata;
+ /*
+ * Total number of bytes consumed by ARC buffers residing in the
+ * arc_mru state. This includes *all* buffers in the arc_mru
+ * state; e.g. data, metadata, evictable, and unevictable buffers
+ * are all included in this value.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mru_size;
+ /*
+ * Number of bytes consumed by ARC buffers that meet the
+ * following criteria: backing buffers of type ARC_BUFC_DATA,
+ * residing in the arc_mru state, and are eligible for eviction
+ * (e.g. have no outstanding holds on the buffer).
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mru_evictable_data;
+ /*
+ * Number of bytes consumed by ARC buffers that meet the
+ * following criteria: backing buffers of type ARC_BUFC_METADATA,
+ * residing in the arc_mru state, and are eligible for eviction
+ * (e.g. have no outstanding holds on the buffer).
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mru_evictable_metadata;
+ /*
+ * Total number of bytes that *would have been* consumed by ARC
+ * buffers in the arc_mru_ghost state. The key thing to note
+ * here, is the fact that this size doesn't actually indicate
+ * RAM consumption. The ghost lists only consist of headers and
+ * don't actually have ARC buffers linked off of these headers.
+ * Thus, *if* the headers had associated ARC buffers, these
+ * buffers *would have* consumed this number of bytes.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mru_ghost_size;
+ /*
+ * Number of bytes that *would have been* consumed by ARC
+ * buffers that are eligible for eviction, of type
+ * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mru_ghost_evictable_data;
+ /*
+ * Number of bytes that *would have been* consumed by ARC
+ * buffers that are eligible for eviction, of type
+ * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mru_ghost_evictable_metadata;
+ /*
+ * Total number of bytes consumed by ARC buffers residing in the
+ * arc_mfu state. This includes *all* buffers in the arc_mfu
+ * state; e.g. data, metadata, evictable, and unevictable buffers
+ * are all included in this value.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mfu_size;
+ /*
+ * Number of bytes consumed by ARC buffers that are eligible for
+ * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
+ * state.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mfu_evictable_data;
+ /*
+ * Number of bytes consumed by ARC buffers that are eligible for
+ * eviction, of type ARC_BUFC_METADATA, and reside in the
+ * arc_mfu state.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mfu_evictable_metadata;
+ /*
+ * Total number of bytes that *would have been* consumed by ARC
+ * buffers in the arc_mfu_ghost state. See the comment above
+ * arcstat_mru_ghost_size for more details.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mfu_ghost_size;
+ /*
+ * Number of bytes that *would have been* consumed by ARC
+ * buffers that are eligible for eviction, of type
+ * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mfu_ghost_evictable_data;
+ /*
+ * Number of bytes that *would have been* consumed by ARC
+ * buffers that are eligible for eviction, of type
+ * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
+ * Not updated directly; only synced in arc_kstat_update.
+ */
+ kstat_named_t arcstat_mfu_ghost_evictable_metadata;
+ kstat_named_t arcstat_l2_hits;
+ kstat_named_t arcstat_l2_misses;
+ /*
+ * Allocated size (in bytes) of L2ARC cached buffers by ARC state.
+ */
+ kstat_named_t arcstat_l2_prefetch_asize;
+ kstat_named_t arcstat_l2_mru_asize;
+ kstat_named_t arcstat_l2_mfu_asize;
+ /*
+ * Allocated size (in bytes) of L2ARC cached buffers by buffer content
+ * type.
+ */
+ kstat_named_t arcstat_l2_bufc_data_asize;
+ kstat_named_t arcstat_l2_bufc_metadata_asize;
+ kstat_named_t arcstat_l2_feeds;
+ kstat_named_t arcstat_l2_rw_clash;
+ kstat_named_t arcstat_l2_read_bytes;
+ kstat_named_t arcstat_l2_write_bytes;
+ kstat_named_t arcstat_l2_writes_sent;
+ kstat_named_t arcstat_l2_writes_done;
+ kstat_named_t arcstat_l2_writes_error;
+ kstat_named_t arcstat_l2_writes_lock_retry;
+ kstat_named_t arcstat_l2_evict_lock_retry;
+ kstat_named_t arcstat_l2_evict_reading;
+ kstat_named_t arcstat_l2_evict_l1cached;
+ kstat_named_t arcstat_l2_free_on_write;
+ kstat_named_t arcstat_l2_abort_lowmem;
+ kstat_named_t arcstat_l2_cksum_bad;
+ kstat_named_t arcstat_l2_io_error;
+ kstat_named_t arcstat_l2_lsize;
+ kstat_named_t arcstat_l2_psize;
+ /* Not updated directly; only synced in arc_kstat_update. */
+ kstat_named_t arcstat_l2_hdr_size;
+ /*
+ * Number of L2ARC log blocks written. These are used for restoring the
+ * L2ARC. Updated during writing of L2ARC log blocks.
+ */
+ kstat_named_t arcstat_l2_log_blk_writes;
+ /*
+ * Moving average of the aligned size of the L2ARC log blocks, in
+ * bytes. Updated during L2ARC rebuild and during writing of L2ARC
+ * log blocks.
+ */
+ kstat_named_t arcstat_l2_log_blk_avg_asize;
+ /* Aligned size of L2ARC log blocks on L2ARC devices. */
+ kstat_named_t arcstat_l2_log_blk_asize;
+ /* Number of L2ARC log blocks present on L2ARC devices. */
+ kstat_named_t arcstat_l2_log_blk_count;
+ /*
+ * Moving average of the ratio of the aligned size of restored L2ARC
+ * data to the aligned size of its metadata on the L2ARC device.
+ * Updated during L2ARC rebuild and during writing of L2ARC log blocks.
+ */
+ kstat_named_t arcstat_l2_data_to_meta_ratio;
+ /*
+ * Number of times the L2ARC rebuild was successful for an L2ARC device.
+ */
+ kstat_named_t arcstat_l2_rebuild_success;
+ /*
+ * Number of times the L2ARC rebuild failed because the device header
+ * was in an unsupported format or corrupted.
+ */
+ kstat_named_t arcstat_l2_rebuild_abort_unsupported;
+ /*
+ * Number of times the L2ARC rebuild failed because of IO errors
+ * while reading a log block.
+ */
+ kstat_named_t arcstat_l2_rebuild_abort_io_errors;
+ /*
+ * Number of times the L2ARC rebuild failed because of IO errors when
+ * reading the device header.
+ */
+ kstat_named_t arcstat_l2_rebuild_abort_dh_errors;
+ /*
+ * Number of L2ARC log blocks which failed to be restored due to
+ * checksum errors.
+ */
+ kstat_named_t arcstat_l2_rebuild_abort_cksum_lb_errors;
+ /*
+ * Number of times the L2ARC rebuild was aborted due to low system
+ * memory.
+ */
+ kstat_named_t arcstat_l2_rebuild_abort_lowmem;
+ /* Logical size of L2ARC restored data, in bytes. */
+ kstat_named_t arcstat_l2_rebuild_size;
+ /* Aligned size of L2ARC restored data, in bytes. */
+ kstat_named_t arcstat_l2_rebuild_asize;
+ /*
+ * Number of L2ARC log entries (buffers) that were successfully
+ * restored in ARC.
+ */
+ kstat_named_t arcstat_l2_rebuild_bufs;
+ /*
+ * Number of L2ARC log entries (buffers) already cached in ARC. These
+ * were not restored again.
+ */
+ kstat_named_t arcstat_l2_rebuild_bufs_precached;
+ /*
+ * Number of L2ARC log blocks that were restored successfully. Each
+ * log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers.
+ */
+ kstat_named_t arcstat_l2_rebuild_log_blks;
+ kstat_named_t arcstat_memory_throttle_count;
+ /* Not updated directly; only synced in arc_kstat_update. */
+ kstat_named_t arcstat_meta_used;
+ kstat_named_t arcstat_meta_limit;
+ kstat_named_t arcstat_meta_max;
+ kstat_named_t arcstat_meta_min;
+ kstat_named_t arcstat_async_upgrade_sync;
+ kstat_named_t arcstat_demand_hit_predictive_prefetch;
+ kstat_named_t arcstat_demand_hit_prescient_prefetch;
+} arc_stats_t;
+
+#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
+
+#define ARCSTAT_INCR(stat, val) \
+ atomic_add_64(&arc_stats.stat.value.ui64, (val))
+
+#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
+#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
+
+/*
+ * There are several ARC variables that are critical to export as kstats --
+ * but we don't want to have to grovel around in the kstat whenever we wish to
+ * manipulate them. For these variables, we therefore define them to be in
+ * terms of the statistic variable. This assures that we are not introducing
+ * the possibility of inconsistency by having shadow copies of the variables,
+ * while still allowing the code to be readable.
+ */
+#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
+#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
+#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
+#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
+#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
+#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
+#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
+
+/* compressed size of entire arc */
+#define arc_compressed_size ARCSTAT(arcstat_compressed_size)
+/* uncompressed size of entire arc */
+#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size)
+/* number of bytes in the arc from arc_buf_t's */
+#define arc_overhead_size ARCSTAT(arcstat_overhead_size)
+
+extern arc_stats_t arc_stats;
+
+/* used in zdb.c */
+boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
+ const l2arc_log_blkptr_t *lbp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ARC_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h
index 271232c61c..e543f6ac09 100644
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h
@@ -108,6 +108,12 @@ typedef enum override_states {
DR_OVERRIDDEN
} override_states_t;
+typedef enum db_lock_type {
+ DLT_NONE,
+ DLT_PARENT,
+ DLT_OBJSET
+} db_lock_type_t;
+
typedef struct dbuf_dirty_record {
/* link on our parents dirty list */
list_node_t dr_dirty_node;
@@ -217,6 +223,22 @@ typedef struct dmu_buf_impl {
*/
uint8_t db_level;
+ /*
+ * Protects db_buf's contents if they contain an indirect block or data
+ * block of the meta-dnode. We use this lock to protect the structure of
+ * the block tree. This means that when modifying this dbuf's data, we
+ * grab its rwlock. When modifying its parent's data (including the
+ * blkptr to this dbuf), we grab the parent's rwlock. The lock ordering
+ * for this lock is:
+ * 1) dn_struct_rwlock
+ * 2) db_rwlock
+ * We don't currently grab multiple dbufs' db_rwlocks at once.
+ */
+ krwlock_t db_rwlock;
+
+ /* buffer holding our data */
+ arc_buf_t *db_buf;
+
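+	/*
+	 * Usage sketch (illustration only): a caller that modifies this
+	 * dbuf's block pointer takes the locks in the documented order,
+	 * e.g.:
+	 *
+	 *	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	 *	dblt = dmu_buf_lock_parent(db, RW_WRITER, FTAG);
+	 *	... update the blkptr in the parent's data ...
+	 *	dmu_buf_unlock_parent(db, dblt, FTAG);
+	 *	rw_exit(&dn->dn_struct_rwlock);
+	 *
+	 * dmu_buf_lock_parent() and dmu_buf_unlock_parent() are declared
+	 * later in this header; the surrounding names (dn, db, dblt) are
+	 * hypothetical.
+	 */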
/* db_mtx protects the members below */
kmutex_t db_mtx;
@@ -232,9 +254,6 @@ typedef struct dmu_buf_impl {
*/
zfs_refcount_t db_holds;
- /* buffer holding our data */
- arc_buf_t *db_buf;
-
kcondvar_t db_changed;
dbuf_dirty_record_t *db_data_pending;
@@ -289,6 +308,8 @@ typedef struct dbuf_hash_table {
kmutex_t hash_mutexes[DBUF_MUTEXES];
} dbuf_hash_table_t;
+typedef void (*dbuf_prefetch_fn)(void *, boolean_t);
+
uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset);
dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
@@ -305,7 +326,10 @@ int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid,
boolean_t fail_sparse, boolean_t fail_uncached,
void *tag, dmu_buf_impl_t **dbp);
-void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid,
+int dbuf_prefetch_impl(struct dnode *dn, int64_t level, uint64_t blkid,
+ zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb,
+ void *arg);
+int dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid,
zio_priority_t prio, arc_flags_t aflags);
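+/*
+ * Illustrative sketch (assumption, not part of this change): a caller
+ * that wants completion notification passes a dbuf_prefetch_fn, e.g.:
+ *
+ *	static void
+ *	my_prefetch_done(void *arg, boolean_t io_issued)
+ *	{
+ *		...
+ *	}
+ *
+ *	(void) dbuf_prefetch_impl(dn, 0, blkid,
+ *	    ZIO_PRIORITY_ASYNC_READ, 0, my_prefetch_done, arg);
+ *
+ * The callback and parameter names here are hypothetical.
+ */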
void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
@@ -336,6 +360,8 @@ void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
void dbuf_unoverride(dbuf_dirty_record_t *dr);
void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx);
void dbuf_release_bp(dmu_buf_impl_t *db);
+db_lock_type_t dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag);
+void dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, void *tag);
boolean_t dbuf_can_remap(const dmu_buf_impl_t *buf);
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
index d38914dd1d..be834895c8 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -963,7 +963,7 @@ void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
/*
* Add entries to the nvlist for all the objset's properties. See
- * zfs_prop_table[] and zfs(1m) for details on the properties.
+ * zfs_prop_table[] and zfs(8) for details on the properties.
*/
void dmu_objset_stats(objset_t *os, struct nvlist *nv);
@@ -974,7 +974,7 @@ void dmu_objset_stats(objset_t *os, struct nvlist *nv);
* availbytes is the amount of space available to this objset, taking
* into account quotas & reservations, assuming that no other objsets
* use the space first. These values correspond to the 'referenced' and
- * 'available' properties, described in the zfs(1m) manpage.
+ * 'available' properties, described in the zfs(8) manpage.
*
* usedobjs and availobjs are the number of objects currently allocated,
* and available.
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h
index ccb5d7ac51..cc32359653 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h
@@ -55,42 +55,42 @@ extern "C" {
* XXX try to improve evicting path?
*
* dp_config_rwlock > os_obj_lock > dn_struct_rwlock >
- * dn_dbufs_mtx > hash_mutexes > db_mtx > dd_lock > leafs
+ * dn_dbufs_mtx > hash_mutexes > db_mtx > dd_lock > leafs
*
* dp_config_rwlock
* must be held before: everything
* protects dd namespace changes
* protects property changes globally
* held from:
- * dsl_dir_open/r:
- * dsl_dir_create_sync/w:
- * dsl_dir_sync_destroy/w:
- * dsl_dir_rename_sync/w:
- * dsl_prop_changed_notify/r:
+ * dsl_dir_open/r:
+ * dsl_dir_create_sync/w:
+ * dsl_dir_sync_destroy/w:
+ * dsl_dir_rename_sync/w:
+ * dsl_prop_changed_notify/r:
*
* os_obj_lock
* must be held before:
- * everything except dp_config_rwlock
+ * everything except dp_config_rwlock
* protects os_obj_next
* held from:
- * dmu_object_alloc: dn_dbufs_mtx, db_mtx, hash_mutexes, dn_struct_rwlock
+ * dmu_object_alloc: dn_dbufs_mtx, db_mtx, hash_mutexes, dn_struct_rwlock
*
* dn_struct_rwlock
* must be held before:
- * everything except dp_config_rwlock and os_obj_lock
+ * everything except dp_config_rwlock and os_obj_lock
* protects structure of dnode (eg. nlevels)
- * db_blkptr can change when syncing out change to nlevels
- * dn_maxblkid
- * dn_nlevels
- * dn_*blksz*
- * phys nlevels, maxblkid, physical blkptr_t's (?)
+ * db_blkptr can change when syncing out change to nlevels
+ * dn_maxblkid
+ * dn_nlevels
+ * dn_*blksz*
+ * phys nlevels, maxblkid, physical blkptr_t's (?)
* held from:
- * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch
- * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz)
- * dbuf_read_impl: db_mtx, dmu_zfetch()
- * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch()
- * dbuf_new_size: db_mtx
- * dbuf_dirty: db_mtx
+ * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch
+ * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz)
+ * dbuf_read_impl: db_mtx, dmu_zfetch()
+ * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch()
+ * dbuf_new_size: db_mtx
+ * dbuf_dirty: db_mtx
* dbuf_findbp: (callers, phys? - the real need)
* dbuf_create: dn_dbufs_mtx, hash_mutexes, db_mtx (phys?)
* dbuf_prefetch: dn_dirty_mtx, hash_mutexes, db_mtx, dn_dbufs_mtx
@@ -103,126 +103,127 @@ extern "C" {
*
* dn_dbufs_mtx
* must be held before:
- * db_mtx, hash_mutexes
+ * db_mtx, hash_mutexes
* protects:
- * dn_dbufs
- * dn_evicted
+ * dn_dbufs
+ * dn_evicted
* held from:
- * dmu_evict_user: db_mtx (dn_dbufs)
- * dbuf_free_range: db_mtx (dn_dbufs)
- * dbuf_remove_ref: db_mtx, callees:
- * dbuf_hash_remove: hash_mutexes, db_mtx
- * dbuf_create: hash_mutexes, db_mtx (dn_dbufs)
- * dnode_set_blksz: (dn_dbufs)
+ * dmu_evict_user: db_mtx (dn_dbufs)
+ * dbuf_free_range: db_mtx (dn_dbufs)
+ * dbuf_remove_ref: db_mtx, callees:
+ * dbuf_hash_remove: hash_mutexes, db_mtx
+ * dbuf_create: hash_mutexes, db_mtx (dn_dbufs)
+ * dnode_set_blksz: (dn_dbufs)
*
* hash_mutexes (global)
* must be held before:
- * db_mtx
+ * db_mtx
* protects dbuf_hash_table (global) and db_hash_next
* held from:
- * dbuf_find: db_mtx
- * dbuf_hash_insert: db_mtx
- * dbuf_hash_remove: db_mtx
+ * dbuf_find: db_mtx
+ * dbuf_hash_insert: db_mtx
+ * dbuf_hash_remove: db_mtx
*
* db_mtx (meta-leaf)
* must be held before:
- * dn_mtx, dn_dirty_mtx, dd_lock (leaf mutexes)
+ * dn_mtx, dn_dirty_mtx, dd_lock (leaf mutexes)
* protects:
- * db_state
- * db_holds
- * db_buf
- * db_changed
- * db_data_pending
- * db_dirtied
- * db_link
- * db_dirty_node (??)
- * db_dirtycnt
- * db_d.*
- * db.*
+ * db_state
+ * db_holds
+ * db_buf
+ * db_changed
+ * db_data_pending
+ * db_dirtied
+ * db_link
+ * db_dirty_node (??)
+ * db_dirtycnt
+ * db_d.*
+ * db.*
* held from:
- * dbuf_dirty: dn_mtx, dn_dirty_mtx
- * dbuf_dirty->dsl_dir_willuse_space: dd_lock
- * dbuf_dirty->dbuf_new_block->dsl_dataset_block_freeable: dd_lock
- * dbuf_undirty: dn_dirty_mtx (db_d)
- * dbuf_write_done: dn_dirty_mtx (db_state)
- * dbuf_*
- * dmu_buf_update_user: none (db_d)
- * dmu_evict_user: none (db_d) (maybe can eliminate)
- * dbuf_find: none (db_holds)
- * dbuf_hash_insert: none (db_holds)
- * dmu_buf_read_array_impl: none (db_state, db_changed)
- * dmu_sync: none (db_dirty_node, db_d)
- * dnode_reallocate: none (db)
+ * dbuf_dirty: dn_mtx, dn_dirty_mtx
+ * dbuf_dirty->dsl_dir_willuse_space: dd_lock
+ * dbuf_dirty->dbuf_new_block->dsl_dataset_block_freeable: dd_lock
+ * dbuf_undirty: dn_dirty_mtx (db_d)
+ * dbuf_write_done: dn_dirty_mtx (db_state)
+ * dbuf_*
+ * dmu_buf_update_user: none (db_d)
+ * dmu_evict_user: none (db_d) (maybe can eliminate)
+ * dbuf_find: none (db_holds)
+ * dbuf_hash_insert: none (db_holds)
+ * dmu_buf_read_array_impl: none (db_state, db_changed)
+ * dmu_sync: none (db_dirty_node, db_d)
+ * dnode_reallocate: none (db)
*
* dn_mtx (leaf)
* protects:
- * dn_dirty_dbufs
- * dn_ranges
- * phys accounting
- * dn_allocated_txg
- * dn_free_txg
- * dn_assigned_txg
- * dn_dirty_txg
- * dd_assigned_tx
- * dn_notxholds
- * dn_dirtyctx
- * dn_dirtyctx_firstset
- * (dn_phys copy fields?)
- * (dn_phys contents?)
+ * dn_dirty_dbufs
+ * dn_ranges
+ * phys accounting
+ * dn_allocated_txg
+ * dn_free_txg
+ * dn_assigned_txg
+ * dn_dirty_txg
+ * dd_assigned_tx
+ * dn_notxholds
+ * dn_nodnholds
+ * dn_dirtyctx
+ * dn_dirtyctx_firstset
+ * (dn_phys copy fields?)
+ * (dn_phys contents?)
* held from:
- * dnode_*
- * dbuf_dirty: none
- * dbuf_sync: none (phys accounting)
- * dbuf_undirty: none (dn_ranges, dn_dirty_dbufs)
- * dbuf_write_done: none (phys accounting)
- * dmu_object_info_from_dnode: none (accounting)
- * dmu_tx_commit: none
- * dmu_tx_hold_object_impl: none
- * dmu_tx_try_assign: dn_notxholds(cv)
- * dmu_tx_unassign: none
+ * dnode_*
+ * dbuf_dirty: none
+ * dbuf_sync: none (phys accounting)
+ * dbuf_undirty: none (dn_ranges, dn_dirty_dbufs)
+ * dbuf_write_done: none (phys accounting)
+ * dmu_object_info_from_dnode: none (accounting)
+ * dmu_tx_commit: none
+ * dmu_tx_hold_object_impl: none
+ * dmu_tx_try_assign: dn_notxholds(cv)
+ * dmu_tx_unassign: none
*
* dd_lock
* must be held before:
- * ds_lock
- * ancestors' dd_lock
+ * ds_lock
+ * ancestors' dd_lock
* protects:
- * dd_prop_cbs
- * dd_sync_*
- * dd_used_bytes
- * dd_tempreserved
- * dd_space_towrite
- * dd_myname
- * dd_phys accounting?
+ * dd_prop_cbs
+ * dd_sync_*
+ * dd_used_bytes
+ * dd_tempreserved
+ * dd_space_towrite
+ * dd_myname
+ * dd_phys accounting?
* held from:
- * dsl_dir_*
- * dsl_prop_changed_notify: none (dd_prop_cbs)
- * dsl_prop_register: none (dd_prop_cbs)
- * dsl_prop_unregister: none (dd_prop_cbs)
+ * dsl_dir_*
+ * dsl_prop_changed_notify: none (dd_prop_cbs)
+ * dsl_prop_register: none (dd_prop_cbs)
+ * dsl_prop_unregister: none (dd_prop_cbs)
*
* os_lock (leaf)
* protects:
- * os_dirty_dnodes
- * os_free_dnodes
- * os_dnodes
- * os_downgraded_dbufs
- * dn_dirtyblksz
- * dn_dirty_link
+ * os_dirty_dnodes
+ * os_free_dnodes
+ * os_dnodes
+ * os_downgraded_dbufs
+ * dn_dirtyblksz
+ * dn_dirty_link
* held from:
- * dnode_create: none (os_dnodes)
- * dnode_destroy: none (os_dnodes)
- * dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes)
- * dnode_free: none (dn_dirtyblksz, os_*_dnodes)
+ * dnode_create: none (os_dnodes)
+ * dnode_destroy: none (os_dnodes)
+ * dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes)
+ * dnode_free: none (dn_dirtyblksz, os_*_dnodes)
*
* ds_lock
* protects:
- * ds_objset
- * ds_open_refcount
- * ds_snapname
- * ds_phys accounting
+ * ds_objset
+ * ds_open_refcount
+ * ds_snapname
+ * ds_phys accounting
* ds_phys userrefs zapobj
* ds_reserved
* held from:
- * dsl_dataset_*
+ * dsl_dataset_*
*
* dr_mtx (leaf)
* protects:
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h b/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h
index 21a3ff3a20..71f76cc88b 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
*/
#ifndef _DMU_ZFETCH_H
@@ -40,6 +40,13 @@ extern uint64_t zfetch_array_rd_sz;
struct dnode; /* so we can reference dnode */
+typedef struct zfetch {
+ krwlock_t zf_rwlock; /* protects zfetch structure */
+ list_t zf_stream; /* list of zstream_t's */
+ struct dnode *zf_dnode; /* dnode that owns this zfetch */
+ int zf_numstreams; /* number of zstream_t's */
+} zfetch_t;
+
typedef struct zstream {
uint64_t zs_blkid; /* expect next access at this blkid */
uint64_t zs_pf_blkid; /* next block to prefetch */
@@ -52,21 +59,19 @@ typedef struct zstream {
kmutex_t zs_lock; /* protects stream */
hrtime_t zs_atime; /* time last prefetch issued */
+ hrtime_t zs_start_time; /* start of last prefetch */
list_node_t zs_node; /* link for zf_stream */
+ zfetch_t *zs_fetch; /* parent fetch */
+ zfs_refcount_t zs_blocks; /* number of pending blocks in the stream */
} zstream_t;
-typedef struct zfetch {
- krwlock_t zf_rwlock; /* protects zfetch structure */
- list_t zf_stream; /* list of zstream_t's */
- struct dnode *zf_dnode; /* dnode that owns this zfetch */
-} zfetch_t;
-
void zfetch_init(void);
void zfetch_fini(void);
void dmu_zfetch_init(zfetch_t *, struct dnode *);
void dmu_zfetch_fini(zfetch_t *);
-void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t);
+void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, boolean_t);
#ifdef __cplusplus
diff --git a/usr/src/uts/common/fs/zfs/sys/dnode.h b/usr/src/uts/common/fs/zfs/sys/dnode.h
index 054e467bb7..ca94e5f1c9 100644
--- a/usr/src/uts/common/fs/zfs/sys/dnode.h
+++ b/usr/src/uts/common/fs/zfs/sys/dnode.h
@@ -310,6 +310,7 @@ struct dnode {
uint64_t dn_assigned_txg;
uint64_t dn_dirty_txg; /* txg dnode was last dirtied */
kcondvar_t dn_notxholds;
+ kcondvar_t dn_nodnholds;
enum dnode_dirtycontext dn_dirtyctx;
uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
index 189376eefc..0fd7e1a7e2 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -306,6 +306,7 @@ typedef struct dsl_dataset_snapshot_arg {
/* flags for holding the dataset */
typedef enum ds_hold_flags {
+ DS_HOLD_FLAG_NONE = 0 << 0,
DS_HOLD_FLAG_DECRYPT = 1 << 0 /* needs access to encrypted data */
} ds_hold_flags_t;
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_scan.h b/usr/src/uts/common/fs/zfs/sys/dsl_scan.h
index 1b600405ae..4693293290 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_scan.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_scan.h
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
- * Copyright (c) 2017 Datto Inc.
+ * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
*/
#ifndef _SYS_DSL_SCAN_H
@@ -164,10 +164,12 @@ void dsl_scan_fini(struct dsl_pool *dp);
void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *);
int dsl_scan_cancel(struct dsl_pool *);
int dsl_scan(struct dsl_pool *, pool_scan_func_t);
+void dsl_scan_assess_vdev(struct dsl_pool *dp, vdev_t *vd);
boolean_t dsl_scan_scrubbing(const struct dsl_pool *dp);
int dsl_scrub_set_pause_resume(const struct dsl_pool *dp, pool_scrub_cmd_t cmd);
-void dsl_resilver_restart(struct dsl_pool *, uint64_t txg);
+void dsl_scan_restart_resilver(struct dsl_pool *, uint64_t txg);
boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
+boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp);
boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
ddt_entry_t *dde, dmu_tx_t *tx);
diff --git a/usr/src/uts/common/fs/zfs/sys/simd.h b/usr/src/uts/common/fs/zfs/sys/simd.h
new file mode 100644
index 0000000000..1ee17c902d
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/simd.h
@@ -0,0 +1,184 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2020 Joyent, Inc.
+ */
+
+#ifndef _SIMD_H
+#define _SIMD_H
+
+#if defined(__amd64__) || defined(__i386__)
+
+#define kfpu_initialize(tsk) do {} while (0)
+#define kfpu_init() (0)
+#define kfpu_fini() do {} while (0)
+
+#ifdef _KERNEL
+#include <sys/x86_archext.h>
+#include <sys/archsystm.h>
+#include <sys/kfpu.h>
+#include <sys/proc.h>
+#include <sys/disp.h>
+#include <sys/cpuvar.h>
+
+static inline int
+kfpu_allowed(void)
+{
+ extern int zfs_fpu_enabled;
+
+ return (zfs_fpu_enabled != 0 ? 1 : 0);
+}
+
+static inline void
+kfpu_begin(void)
+{
+ if (curthread->t_lwp != NULL && (curthread->t_procp->p_flag & SSYS)) {
+ kernel_fpu_begin(NULL, KFPU_USE_LWP);
+ } else {
+ kpreempt_disable();
+ kernel_fpu_begin(NULL, KFPU_NO_STATE);
+ }
+}
+
+static inline void
+kfpu_end(void)
+{
+ if (curthread->t_lwp != NULL && (curthread->t_procp->p_flag & SSYS)) {
+ kernel_fpu_end(NULL, KFPU_USE_LWP);
+ } else {
+ kernel_fpu_end(NULL, KFPU_NO_STATE);
+ kpreempt_enable();
+ }
+}
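+
+/*
+ * Typical usage sketch (illustration only): SIMD consumers bracket
+ * their vector work with these calls, e.g.:
+ *
+ *	if (kfpu_allowed() && zfs_avx2_available()) {
+ *		kfpu_begin();
+ *		... AVX2 parity generation ...
+ *		kfpu_end();
+ *	}
+ */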
+
+/*
+ * Check if various vector instruction sets are available.
+ */
+
+static inline boolean_t
+zfs_sse_available(void)
+{
+ return (is_x86_feature(x86_featureset, X86FSET_SSE));
+}
+
+static inline boolean_t
+zfs_sse2_available(void)
+{
+ return (is_x86_feature(x86_featureset, X86FSET_SSE2));
+}
+
+static inline boolean_t
+zfs_sse3_available(void)
+{
+ return (is_x86_feature(x86_featureset, X86FSET_SSE3));
+}
+
+static inline boolean_t
+zfs_ssse3_available(void)
+{
+ return (is_x86_feature(x86_featureset, X86FSET_SSSE3));
+}
+
+static inline boolean_t
+zfs_avx_available(void)
+{
+ return (is_x86_feature(x86_featureset, X86FSET_AVX));
+}
+
+static inline boolean_t
+zfs_avx2_available(void)
+{
+ return (is_x86_feature(x86_featureset, X86FSET_AVX2));
+}
+
+#else /* ! _KERNEL */
+
+#include <sys/auxv.h>
+#include <sys/auxv_386.h>
+
+#define kfpu_allowed() 1
+#define kfpu_begin() do {} while (0)
+#define kfpu_end() do {} while (0)
+
+/*
+ * User-level check if various vector instruction sets are available.
+ */
+
+static inline boolean_t
+zfs_sse_available(void)
+{
+ uint32_t u = 0;
+
+ (void) getisax(&u, 1);
+ return ((u & AV_386_SSE) != 0);
+}
+
+static inline boolean_t
+zfs_sse2_available(void)
+{
+ uint32_t u = 0;
+
+ (void) getisax(&u, 1);
+ return ((u & AV_386_SSE2) != 0);
+}
+
+static inline boolean_t
+zfs_sse3_available(void)
+{
+ uint32_t u = 0;
+
+ (void) getisax(&u, 1);
+ return ((u & AV_386_SSE3) != 0);
+}
+
+static inline boolean_t
+zfs_ssse3_available(void)
+{
+ uint32_t u = 0;
+
+ (void) getisax(&u, 1);
+ return ((u & AV_386_SSSE3) != 0);
+}
+
+static inline boolean_t
+zfs_avx_available(void)
+{
+ uint_t u = 0;
+
+ (void) getisax(&u, 1);
+ return ((u & AV_386_AVX) != 0);
+}
+
+static inline boolean_t
+zfs_avx2_available(void)
+{
+ uint32_t u[2] = { 0 };
+
+ (void) getisax((uint32_t *)&u, 2);
+ return ((u[1] & AV_386_2_AVX2) != 0);
+}
+
+#endif /* _KERNEL */
+
+#else
+
+/* Non-x86 CPUs currently always disallow kernel FPU support */
+#define kfpu_allowed() 0
+#define kfpu_initialize(tsk) do {} while (0)
+#define kfpu_begin() do {} while (0)
+#define kfpu_end() do {} while (0)
+#define kfpu_init() (0)
+#define kfpu_fini() do {} while (0)
+#endif
+
+#endif /* _SIMD_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index e017462613..af8057be8f 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -26,8 +26,9 @@
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2019 Joyent, Inc.
- * Copyright (c) 2017 Datto Inc.
+ * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
* Copyright (c) 2017, Intel Corporation.
+ * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org>
*/
#ifndef _SYS_SPA_H
@@ -759,7 +760,8 @@ extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot,
size_t buflen);
extern int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
nvlist_t *zplprops, struct dsl_crypto_params *dcp);
-extern int spa_import_rootpool(char *devpath, char *devid);
+extern int spa_import_rootpool(char *devpath, char *devid, uint64_t pool_guid,
+ uint64_t vdev_guid);
extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props,
uint64_t flags);
extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
@@ -773,6 +775,7 @@ extern void spa_async_request(spa_t *spa, int flag);
extern void spa_async_unrequest(spa_t *spa, int flag);
extern void spa_async_suspend(spa_t *spa);
extern void spa_async_resume(spa_t *spa);
+extern int spa_async_tasks(spa_t *spa);
extern spa_t *spa_inject_addref(char *pool);
extern void spa_inject_delref(spa_t *spa);
extern void spa_scan_stat_init(spa_t *spa);
@@ -789,6 +792,7 @@ extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
#define SPA_ASYNC_INITIALIZE_RESTART 0x100
#define SPA_ASYNC_TRIM_RESTART 0x200
#define SPA_ASYNC_AUTOTRIM_RESTART 0x400
+#define SPA_ASYNC_L2CACHE_REBUILD 0x800
/*
* Controls the behavior of spa_vdev_remove().
diff --git a/usr/src/uts/common/fs/zfs/sys/spa_boot.h b/usr/src/uts/common/fs/zfs/sys/spa_boot.h
index 8df5072a55..b1b100e17e 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa_boot.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa_boot.h
@@ -25,6 +25,7 @@
/*
* Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org>
*/
#ifndef _SYS_SPA_BOOT_H
@@ -36,8 +37,8 @@
extern "C" {
#endif
-extern char *spa_get_bootprop(char *prop);
-extern void spa_free_bootprop(char *prop);
+extern char *spa_get_bootprop(const char *propname);
+extern void spa_free_bootprop(char *propval);
extern void spa_arch_init(void);
diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
index 88a172eed5..45a78717da 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
@@ -25,7 +25,6 @@
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2017 Datto Inc.
- * Copyright 2019 Joyent, Inc.
* Copyright (c) 2017, Intel Corporation.
* Copyright 2019 Joyent, Inc.
*/
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h
index a6de7e6f2c..b839ed2359 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h
@@ -21,8 +21,9 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
* Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2019, Datto Inc. All rights reserved.
*/
#ifndef _SYS_VDEV_H
@@ -153,6 +154,8 @@ extern void vdev_state_dirty(vdev_t *vd);
extern void vdev_state_clean(vdev_t *vd);
extern void vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd);
+extern void vdev_defer_resilver(vdev_t *vd);
+extern boolean_t vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx);
typedef enum vdev_config_flag {
VDEV_CONFIG_SPARE = 1 << 0,
@@ -177,6 +180,8 @@ extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **);
extern void vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv);
extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t
offset, uint64_t size, zio_done_func_t *done, void *private, int flags);
+extern int vdev_label_read_bootenv(vdev_t *, nvlist_t *);
+extern int vdev_label_write_bootenv(vdev_t *, nvlist_t *);
typedef enum {
VDEV_LABEL_CREATE, /* create/add a new device */
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 774ed92db5..4e42247345 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -20,10 +20,10 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
- * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
* Copyright (c) 2017, Intel Corporation.
* Copyright 2019 Joyent, Inc.
+ * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org>
*/
#ifndef _SYS_VDEV_IMPL_H
@@ -411,7 +411,7 @@ struct vdev {
#define VDEV_RAIDZ_MAXPARITY 3
#define VDEV_PAD_SIZE (8 << 10)
-/* 2 padding areas (vl_pad1 and vl_pad2) to skip */
+/* 2 padding areas (vl_pad1 and vl_be) to skip */
#define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2
#define VDEV_PHYS_SIZE (112 << 10)
#define VDEV_UBERBLOCK_RING (128 << 10)
@@ -438,9 +438,38 @@ typedef struct vdev_phys {
zio_eck_t vp_zbt;
} vdev_phys_t;
+typedef enum vbe_vers {
+ /*
+	 * The bootenv file is stored as ASCII text in the envblock.
+	 * It is used by the GRUB bootloader on Linux to store the
+	 * contents of the grubenv file. The file is stored as raw
+	 * ASCII and is protected by an embedded checksum. By default,
+	 * GRUB checks whether the boot filesystem supports storing the
+	 * environment data in a special location, and if so, invokes
+	 * filesystem-specific logic to retrieve it. This can be
+	 * overridden by a variable, should the user so desire.
+ */
+ VB_RAW = 0,
+
+ /*
+ * The bootenv file is converted to an nvlist and then packed into the
+ * envblock.
+ */
+ VB_NVLIST = 1
+} vbe_vers_t;
+
+typedef struct vdev_boot_envblock {
+ uint64_t vbe_version;
+ char vbe_bootenv[VDEV_PAD_SIZE - sizeof (uint64_t) -
+ sizeof (zio_eck_t)];
+ zio_eck_t vbe_zbt;
+} vdev_boot_envblock_t;
+
+CTASSERT(sizeof (vdev_boot_envblock_t) == VDEV_PAD_SIZE);
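+
+/*
+ * Layout sketch (illustration only, assuming the conventional zio_eck_t
+ * of a 64-bit magic word plus a 256-bit checksum, 40 bytes in total):
+ * vbe_bootenv then holds 8192 - 8 - 40 = 8144 bytes of environment
+ * data.
+ */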
+
typedef struct vdev_label {
char vl_pad1[VDEV_PAD_SIZE]; /* 8K */
- char vl_pad2[VDEV_PAD_SIZE]; /* 8K */
+ vdev_boot_envblock_t vl_be; /* 8K */
vdev_phys_t vl_vdev_phys; /* 112K */
char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */
} vdev_label_t; /* 256K total */
@@ -556,6 +585,14 @@ typedef struct vdev_buf {
zio_t *vb_io; /* pointer back to the original zio_t */
} vdev_buf_t;
+/*
+ * Support routines used during boot from a ZFS pool
+ */
+extern int vdev_disk_read_rootlabel(const char *, const char *, nvlist_t **);
+extern void vdev_disk_preroot_init(void);
+extern void vdev_disk_preroot_fini(void);
+extern const char *vdev_disk_preroot_lookup(uint64_t, uint64_t);
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h b/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h
new file mode 100644
index 0000000000..bf5c840139
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_raidz.h
@@ -0,0 +1,65 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Neskovic <neskovic@compeng.uni-frankfurt.de>.
+ * Copyright 2020 Joyent, Inc.
+ */
+
+#ifndef _SYS_VDEV_RAIDZ_H
+#define _SYS_VDEV_RAIDZ_H
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct zio;
+struct raidz_map;
+#if !defined(_KERNEL)
+struct kernel_param {};
+#endif
+
+/*
+ * vdev_raidz interface
+ */
+struct raidz_map * vdev_raidz_map_alloc(struct zio *, uint64_t,
+ uint64_t, uint64_t);
+void vdev_raidz_map_free(struct raidz_map *);
+void vdev_raidz_generate_parity(struct raidz_map *);
+int vdev_raidz_reconstruct(struct raidz_map *, const int *, int);
+
+/*
+ * vdev_raidz_math interface
+ */
+void vdev_raidz_math_init(void);
+void vdev_raidz_math_fini(void);
+const struct raidz_impl_ops *vdev_raidz_math_get_ops(void);
+int vdev_raidz_math_generate(struct raidz_map *);
+int vdev_raidz_math_reconstruct(struct raidz_map *, const int *,
+ const int *, const int);
+int vdev_raidz_impl_set(const char *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_RAIDZ_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_raidz_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_raidz_impl.h
new file mode 100644
index 0000000000..18771534bf
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_raidz_impl.h
@@ -0,0 +1,360 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#ifndef _VDEV_RAIDZ_H
+#define _VDEV_RAIDZ_H
+
+#include <sys/types.h>
+#include <sys/debug.h>
+#include <sys/kstat.h>
+#include <sys/abd.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define CODE_P (0U)
+#define CODE_Q (1U)
+#define CODE_R (2U)
+
+#define PARITY_P (1U)
+#define PARITY_PQ (2U)
+#define PARITY_PQR (3U)
+
+#define TARGET_X (0U)
+#define TARGET_Y (1U)
+#define TARGET_Z (2U)
+
+/*
+ * Parity generation method indexes
+ */
+enum raidz_math_gen_op {
+ RAIDZ_GEN_P = 0,
+ RAIDZ_GEN_PQ,
+ RAIDZ_GEN_PQR,
+ RAIDZ_GEN_NUM = 3
+};
+/*
+ * Data reconstruction method indexes
+ */
+enum raidz_rec_op {
+ RAIDZ_REC_P = 0,
+ RAIDZ_REC_Q,
+ RAIDZ_REC_R,
+ RAIDZ_REC_PQ,
+ RAIDZ_REC_PR,
+ RAIDZ_REC_QR,
+ RAIDZ_REC_PQR,
+ RAIDZ_REC_NUM = 7
+};
+
+extern const char *raidz_gen_name[RAIDZ_GEN_NUM];
+extern const char *raidz_rec_name[RAIDZ_REC_NUM];
+
+/*
+ * Methods used to define raidz implementation
+ *
+ * @raidz_gen_f Parity generation function
+ * @par1 pointer to raidz_map
+ * @raidz_rec_f Data reconstruction function
+ * @par1 pointer to raidz_map
+ * @par2 array of reconstruction targets
+ * @will_work_f Function returns TRUE if impl. is supported on the system
+ * @init_impl_f Function is called once on init
+ * @fini_impl_f Function is called once on fini
+ */
+typedef void (*raidz_gen_f)(void *);
+typedef int (*raidz_rec_f)(void *, const int *);
+typedef boolean_t (*will_work_f)(void);
+typedef void (*init_impl_f)(void);
+typedef void (*fini_impl_f)(void);
+
+#define RAIDZ_IMPL_NAME_MAX (20)
+
+typedef struct raidz_impl_ops {
+ init_impl_f init;
+ fini_impl_f fini;
+ raidz_gen_f gen[RAIDZ_GEN_NUM]; /* Parity generate functions */
+ raidz_rec_f rec[RAIDZ_REC_NUM]; /* Data reconstruction functions */
+ will_work_f is_supported; /* Support check function */
+ char name[RAIDZ_IMPL_NAME_MAX]; /* Name of the implementation */
+} raidz_impl_ops_t;
+
+typedef struct raidz_col {
+ size_t rc_devidx; /* child device index for I/O */
+ size_t rc_offset; /* device offset */
+ size_t rc_size; /* I/O size */
+ abd_t *rc_abd; /* I/O data */
+ void *rc_gdata; /* used to store the "good" version */
+ int rc_error; /* I/O error for this device */
+ unsigned int rc_tried; /* Did we attempt this I/O column? */
+ unsigned int rc_skipped; /* Did we skip this I/O column? */
+} raidz_col_t;
+
+typedef struct raidz_map {
+ size_t rm_cols; /* Regular column count */
+ size_t rm_scols; /* Count including skipped columns */
+ size_t rm_bigcols; /* Number of oversized columns */
+ size_t rm_asize; /* Actual total I/O size */
+ size_t rm_missingdata; /* Count of missing data devices */
+ size_t rm_missingparity; /* Count of missing parity devices */
+ size_t rm_firstdatacol; /* First data column/parity count */
+ size_t rm_nskip; /* Skipped sectors for padding */
+ size_t rm_skipstart; /* Column index of padding start */
+ void *rm_abd_copy; /* rm_asize-buffer of copied data */
+ size_t rm_reports; /* # of referencing checksum reports */
+ unsigned int rm_freed; /* map no longer has referencing ZIO */
+ unsigned int rm_ecksuminjected; /* checksum error was injected */
+ const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */
+ raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
+} raidz_map_t;
+
+#define RAIDZ_ORIGINAL_IMPL (INT_MAX)
+
+extern const raidz_impl_ops_t vdev_raidz_scalar_impl;
+#if defined(__x86)
+extern const raidz_impl_ops_t vdev_raidz_sse2_impl;
+extern const raidz_impl_ops_t vdev_raidz_ssse3_impl;
+extern const raidz_impl_ops_t vdev_raidz_avx2_impl;
+#endif
+
+/*
+ * Commonly used raidz_map helpers
+ *
+ * raidz_parity Returns parity of the RAIDZ block
+ * raidz_ncols Returns number of columns the block spans
+ * raidz_nbigcols Returns number of big columns
+ * raidz_col_p Returns pointer to a column
+ * raidz_col_size Returns size of a column
+ * raidz_big_size Returns size of big columns
+ * raidz_short_size Returns size of short columns
+ */
+#define raidz_parity(rm) ((rm)->rm_firstdatacol)
+#define raidz_ncols(rm) ((rm)->rm_cols)
+#define raidz_nbigcols(rm) ((rm)->rm_bigcols)
+#define raidz_col_p(rm, c) ((rm)->rm_col + (c))
+#define raidz_col_size(rm, c) ((rm)->rm_col[c].rc_size)
+#define raidz_big_size(rm) (raidz_col_size(rm, CODE_P))
+#define raidz_short_size(rm) (raidz_col_size(rm, raidz_ncols(rm)-1))
+
+/*
+ * Macro defines a RAIDZ parity generation method
+ *
+ * @code parity code the function produces
+ * @impl name of the implementation
+ */
+#define _RAIDZ_GEN_WRAP(code, impl) \
+static void \
+impl ## _gen_ ## code(void *rmp) \
+{ \
+ raidz_map_t *rm = (raidz_map_t *) rmp; \
+ raidz_generate_## code ## _impl(rm); \
+}
+
+/*
+ * Macro defines a RAIDZ data reconstruction method
+ *
+ * @code parity code the function produces
+ * @impl name of the implementation
+ */
+#define _RAIDZ_REC_WRAP(code, impl) \
+static int \
+impl ## _rec_ ## code(void *rmp, const int *tgtidx) \
+{ \
+ raidz_map_t *rm = (raidz_map_t *) rmp; \
+ return (raidz_reconstruct_## code ## _impl(rm, tgtidx)); \
+}
+
+/*
+ * Define all gen methods for an implementation
+ *
+ * @impl name of the implementation
+ */
+#define DEFINE_GEN_METHODS(impl) \
+ _RAIDZ_GEN_WRAP(p, impl); \
+ _RAIDZ_GEN_WRAP(pq, impl); \
+ _RAIDZ_GEN_WRAP(pqr, impl)
+
+/*
+ * Define all rec functions for an implementation
+ *
+ * @impl name of the implementation
+ */
+#define DEFINE_REC_METHODS(impl) \
+ _RAIDZ_REC_WRAP(p, impl); \
+ _RAIDZ_REC_WRAP(q, impl); \
+ _RAIDZ_REC_WRAP(r, impl); \
+ _RAIDZ_REC_WRAP(pq, impl); \
+ _RAIDZ_REC_WRAP(pr, impl); \
+ _RAIDZ_REC_WRAP(qr, impl); \
+ _RAIDZ_REC_WRAP(pqr, impl)
+
+#define RAIDZ_GEN_METHODS(impl) \
+{ \
+ [RAIDZ_GEN_P] = & impl ## _gen_p, \
+ [RAIDZ_GEN_PQ] = & impl ## _gen_pq, \
+ [RAIDZ_GEN_PQR] = & impl ## _gen_pqr \
+}
+
+#define RAIDZ_REC_METHODS(impl) \
+{ \
+ [RAIDZ_REC_P] = & impl ## _rec_p, \
+ [RAIDZ_REC_Q] = & impl ## _rec_q, \
+ [RAIDZ_REC_R] = & impl ## _rec_r, \
+ [RAIDZ_REC_PQ] = & impl ## _rec_pq, \
+ [RAIDZ_REC_PR] = & impl ## _rec_pr, \
+ [RAIDZ_REC_QR] = & impl ## _rec_qr, \
+ [RAIDZ_REC_PQR] = & impl ## _rec_pqr \
+}
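+
+/*
+ * Expansion sketch (illustration only): DEFINE_GEN_METHODS(scalar)
+ * emits scalar_gen_p(), scalar_gen_pq() and scalar_gen_pqr(), each a
+ * thin void-pointer wrapper around the matching
+ * raidz_generate_*_impl() routine; RAIDZ_GEN_METHODS(scalar) then
+ * builds the gen[] table from those wrappers.
+ */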
+
+
+typedef struct raidz_impl_kstat {
+ uint64_t gen[RAIDZ_GEN_NUM]; /* gen method speed B/s */
+ uint64_t rec[RAIDZ_REC_NUM]; /* rec method speed B/s */
+} raidz_impl_kstat_t;
+
+/*
+ * Enumerate various multiplication constants
+ * used in reconstruction methods
+ */
+typedef enum raidz_mul_info {
+ /* Reconstruct Q */
+ MUL_Q_X = 0,
+ /* Reconstruct R */
+ MUL_R_X = 0,
+ /* Reconstruct PQ */
+ MUL_PQ_X = 0,
+ MUL_PQ_Y = 1,
+ /* Reconstruct PR */
+ MUL_PR_X = 0,
+ MUL_PR_Y = 1,
+ /* Reconstruct QR */
+ MUL_QR_XQ = 0,
+ MUL_QR_X = 1,
+ MUL_QR_YQ = 2,
+ MUL_QR_Y = 3,
+ /* Reconstruct PQR */
+ MUL_PQR_XP = 0,
+ MUL_PQR_XQ = 1,
+ MUL_PQR_XR = 2,
+ MUL_PQR_YU = 3,
+ MUL_PQR_YP = 4,
+ MUL_PQR_YQ = 5,
+
+ MUL_CNT = 6
+} raidz_mul_info_t;
+
+/*
+ * Powers of 2 in the Galois field.
+ */
+extern const uint8_t vdev_raidz_pow2[256] __attribute__((aligned(256)));
+/* Logs of 2 in the Galois field defined above. */
+extern const uint8_t vdev_raidz_log2[256] __attribute__((aligned(256)));
+
+/*
+ * Multiply a given number by 2 raised to the given power.
+ */
+static inline uint8_t
+vdev_raidz_exp2(const uint8_t a, const unsigned exp)
+{
+ if (a == 0)
+ return (0);
+
+ return (vdev_raidz_pow2[(exp + (unsigned) vdev_raidz_log2[a]) % 255]);
+}
+
+/*
+ * Galois Field operations.
+ *
+ * gf_exp2 - computes 2 raised to the given power
+ * gf_exp4 - computes 4 raised to the given power
+ * gf_mul - multiplication
+ * gf_div - division
+ * gf_inv - multiplicative inverse
+ */
+typedef unsigned gf_t;
+typedef unsigned gf_log_t;
+
+static inline gf_t
+gf_mul(const gf_t a, const gf_t b)
+{
+ gf_log_t logsum;
+
+ if (a == 0 || b == 0)
+ return (0);
+
+ logsum = (gf_log_t) vdev_raidz_log2[a] + (gf_log_t) vdev_raidz_log2[b];
+
+ return ((gf_t) vdev_raidz_pow2[logsum % 255]);
+}
+
+static inline gf_t
+gf_div(const gf_t a, const gf_t b)
+{
+ gf_log_t logsum;
+
+ ASSERT3U(b, >, 0);
+ if (a == 0)
+ return (0);
+
+ logsum = (gf_log_t) 255 + (gf_log_t) vdev_raidz_log2[a] -
+ (gf_log_t) vdev_raidz_log2[b];
+
+ return ((gf_t) vdev_raidz_pow2[logsum % 255]);
+}
+
+static inline gf_t
+gf_inv(const gf_t a)
+{
+ gf_log_t logsum;
+
+ ASSERT3U(a, >, 0);
+
+ logsum = (gf_log_t) 255 - (gf_log_t) vdev_raidz_log2[a];
+
+ return ((gf_t) vdev_raidz_pow2[logsum]);
+}
+
+static inline gf_t
+gf_exp2(gf_log_t exp)
+{
+ return (vdev_raidz_pow2[exp % 255]);
+}
+
+static inline gf_t
+gf_exp4(gf_log_t exp)
+{
+ ASSERT3U(exp, <=, 255);
+ return ((gf_t) vdev_raidz_pow2[(2 * exp) % 255]);
+}
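+
+/*
+ * Worked example (illustration only): for any a != 0,
+ * gf_mul(a, gf_inv(a)) == 1, because the log sum is either 0 or 255
+ * and vdev_raidz_pow2[255 % 255] == vdev_raidz_pow2[0] == 1.
+ */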
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VDEV_RAIDZ_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_bootenv.h b/usr/src/uts/common/fs/zfs/sys/zfs_bootenv.h
new file mode 100644
index 0000000000..703a1c8fa6
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_bootenv.h
@@ -0,0 +1,52 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2020 Toomas Soome <tsoome@me.com>
+ */
+
+#ifndef _ZFS_BOOTENV_H
+#define _ZFS_BOOTENV_H
+
+/*
+ * Define macros for label bootenv nvlist pair keys.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BOOTENV_VERSION "version"
+
+#define BE_ILLUMOS_VENDOR "illumos"
+#define BE_FREEBSD_VENDOR "freebsd"
+#define BE_GRUB_VENDOR "grub"
+
+#define BOOTENV_OS BE_ILLUMOS_VENDOR
+
+#define GRUB_ENVMAP BE_GRUB_VENDOR ":" "envmap"
+
+#define FREEBSD_BOOTONCE BE_FREEBSD_VENDOR ":" "bootonce"
+#define FREEBSD_BOOTONCE_USED BE_FREEBSD_VENDOR ":" "bootonce-used"
+#define ILLUMOS_BOOTONCE BE_ILLUMOS_VENDOR ":" "bootonce"
+#define ILLUMOS_BOOTONCE_USED BE_ILLUMOS_VENDOR ":" "bootonce-used"
+#define FREEBSD_NVSTORE BE_FREEBSD_VENDOR ":" "nvstore"
+#define ILLUMOS_NVSTORE BE_ILLUMOS_VENDOR ":" "nvstore"
+
+#define OS_BOOTONCE BOOTENV_OS ":" "bootonce"
+#define OS_BOOTONCE_USED BOOTENV_OS ":" "bootonce-used"
+#define OS_NVSTORE BOOTENV_OS ":" "nvstore"
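+
+/*
+ * For example, with the default BOOTENV_OS above, OS_BOOTONCE expands
+ * to the nvlist key "illumos:bootonce".
+ */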
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFS_BOOTENV_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
index 9947bedf54..5058d48e74 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -24,6 +24,7 @@
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright 2016 RackTop Systems.
* Copyright (c) 2014 Integros [integros.com]
+ * Copyright (c) 2017, Intel Corporation.
*/
#ifndef _SYS_ZFS_IOCTL_H
@@ -389,6 +390,10 @@ typedef struct zinject_record {
#define ZI_NO_DVA (-1)
+/* scaled frequency ranges */
+#define ZI_PERCENTAGE_MIN 4294UL
+#define ZI_PERCENTAGE_MAX UINT32_MAX
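+/*
+ * (Illustrative: a frequency of ZI_PERCENTAGE_MAX means every
+ * opportunity; ZI_PERCENTAGE_MIN is roughly one in a million.)
+ */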
+
typedef enum zinject_type {
ZINJECT_UNINITIALIZED,
ZINJECT_DATA_FAULT,
@@ -450,7 +455,7 @@ typedef struct zfs_cmd {
uint64_t zc_history_len;
uint64_t zc_history_offset;
uint64_t zc_obj;
- uint64_t zc_iflags; /* internal to zfs(7fs) */
+ uint64_t zc_iflags; /* internal to zfs(4FS) */
zfs_share_t zc_share;
dmu_objset_stats_t zc_objset_stats;
dmu_replay_record_t zc_begin_record;
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 01e892f4c4..8e155979e6 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -27,6 +27,7 @@
* Copyright 2016 Toomas Soome <tsoome@me.com>
* Copyright 2019 Joyent, Inc.
* Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2019, Datto Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -98,6 +99,12 @@ boolean_t vdev_validate_skip = B_FALSE;
int zfs_vdev_dtl_sm_blksz = (1 << 12);
/*
+ * Ignore errors during scrub/resilver. This makes it possible to work
+ * around a resilver triggered on import when the pool has errors.
+ */
+int zfs_scan_ignore_errors = 0;
+
+/*
* vdev-wide space maps that have lots of entries written to them at
* the end of each transaction can benefit from a higher I/O bandwidth
* (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
@@ -196,7 +203,7 @@ vdev_getops(const char *type)
/*
 * Derive the enumerated allocation bias from string input.
- * String origin is either the per-vdev zap or zpool(1M).
+ * String origin is either the per-vdev zap or zpool(8).
*/
static vdev_alloc_bias_t
vdev_derive_alloc_bias(const char *bias)
@@ -772,7 +779,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
&vd->vdev_resilver_txg);
if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER))
- vdev_set_deferred_resilver(spa, vd);
+ vdev_defer_resilver(vd);
/*
* When importing a pool, we want to ignore the persistent fault
@@ -1358,7 +1365,7 @@ vdev_probe_done(zio_t *zio)
} else {
ASSERT(zio->io_error != 0);
vdev_dbgmsg(vd, "failed probe");
- zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
spa, vd, NULL, NULL, 0, 0);
zio->io_error = SET_ERROR(ENXIO);
}
@@ -1462,7 +1469,7 @@ vdev_probe(vdev_t *vd, zio_t *zio)
for (int l = 1; l < VDEV_LABELS; l++) {
zio_nowait(zio_read_phys(pio, vd,
vdev_label_offset(vd->vdev_psize, l,
- offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE,
+ offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE,
abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
@@ -1710,7 +1717,8 @@ vdev_open(vdev_t *vd)
*/
if (ashift > vd->vdev_top->vdev_ashift &&
vd->vdev_ops->vdev_op_leaf) {
- zfs_ereport_post(FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
+ (void) zfs_ereport_post(
+ FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
spa, vd, NULL, NULL, 0, 0);
}
@@ -1764,18 +1772,12 @@ vdev_open(vdev_t *vd)
}
/*
- * If a leaf vdev has a DTL, and seems healthy, then kick off a
- * resilver. But don't do this if we are doing a reopen for a scrub,
- * since this would just restart the scrub we are already doing.
+ * If this is a leaf vdev, assess whether a resilver is needed.
+ * But don't do this if we are doing a reopen for a scrub, since
+ * this would just restart the scrub we are already doing.
*/
- if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
- vdev_resilver_needed(vd, NULL, NULL)) {
- if (dsl_scan_resilvering(spa->spa_dsl_pool) &&
- spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
- vdev_set_deferred_resilver(spa, vd);
- else
- spa_async_request(spa, SPA_ASYNC_RESILVER);
- }
+ if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen)
+ dsl_scan_assess_vdev(spa->spa_dsl_pool, vd);
return (0);
}
@@ -2134,7 +2136,7 @@ vdev_hold(vdev_t *vd)
for (int c = 0; c < vd->vdev_children; c++)
vdev_hold(vd->vdev_child[c]);
- if (vd->vdev_ops->vdev_op_leaf)
+ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_hold != NULL)
vd->vdev_ops->vdev_op_hold(vd);
}
@@ -2147,7 +2149,7 @@ vdev_rele(vdev_t *vd)
for (int c = 0; c < vd->vdev_children; c++)
vdev_rele(vd->vdev_child[c]);
- if (vd->vdev_ops->vdev_op_leaf)
+ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_rele != NULL)
vd->vdev_ops->vdev_op_rele(vd);
}
@@ -2177,9 +2179,22 @@ vdev_reopen(vdev_t *vd)
if (vd->vdev_aux) {
(void) vdev_validate_aux(vd);
if (vdev_readable(vd) && vdev_writeable(vd) &&
- vd->vdev_aux == &spa->spa_l2cache &&
- !l2arc_vdev_present(vd))
- l2arc_add_vdev(spa, vd);
+ vd->vdev_aux == &spa->spa_l2cache) {
+ /*
+ * When reopening we can assume the device label already
+ * has the l2cache_persistent attribute, since we've
+ * opened the device in the past and updated the label.
+ * If the vdev is already present we should evict all
+ * ARC buffers and pointers to log blocks, and reclaim
+ * their space, before restoring its contents to L2ARC.
+ */
+ if (l2arc_vdev_present(vd)) {
+ l2arc_rebuild_vdev(vd, B_TRUE);
+ } else {
+ l2arc_add_vdev(spa, vd);
+ }
+ spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
+ }
} else {
(void) vdev_validate(vd);
}
@@ -2470,7 +2485,6 @@ vdev_dtl_should_excise(vdev_t *vd)
spa_t *spa = vd->vdev_spa;
dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
- ASSERT0(scn->scn_phys.scn_errors);
ASSERT0(vd->vdev_children);
if (vd->vdev_state < VDEV_STATE_DEGRADED)
@@ -2520,10 +2534,29 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
if (vd->vdev_ops->vdev_op_leaf) {
dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+ boolean_t wasempty = B_TRUE;
mutex_enter(&vd->vdev_dtl_lock);
/*
+ * If requested, pretend the scan completed cleanly.
+ */
+ if (zfs_scan_ignore_errors && scn)
+ scn->scn_phys.scn_errors = 0;
+
+ if (scrub_txg != 0 &&
+ !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
+ wasempty = B_FALSE;
+ zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d "
+ "dtl:%llu/%llu errors:%llu",
+ (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg,
+ (u_longlong_t)scrub_txg, spa->spa_scrub_started,
+ (u_longlong_t)vdev_dtl_min(vd),
+ (u_longlong_t)vdev_dtl_max(vd),
+ (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0));
+ }
+
+ /*
* If we've completed a scan cleanly then determine
* if this vdev should remove any DTLs. We only want to
* excise regions on vdevs that were available during
@@ -2559,6 +2592,14 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
space_reftree_generate_map(&reftree,
vd->vdev_dtl[DTL_MISSING], 1);
space_reftree_destroy(&reftree);
+
+ if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
+ zfs_dbgmsg("update DTL_MISSING:%llu/%llu",
+ (u_longlong_t)vdev_dtl_min(vd),
+ (u_longlong_t)vdev_dtl_max(vd));
+ } else if (!wasempty) {
+ zfs_dbgmsg("DTL_MISSING is now empty");
+ }
}
range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
range_tree_walk(vd->vdev_dtl[DTL_MISSING],
@@ -3543,14 +3584,11 @@ vdev_clear(spa_t *spa, vdev_t *vd)
if (vd != rvd && vdev_writeable(vd->vdev_top))
vdev_state_dirty(vd->vdev_top);
- if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) {
- if (dsl_scan_resilvering(spa->spa_dsl_pool) &&
- spa_feature_is_enabled(spa,
- SPA_FEATURE_RESILVER_DEFER))
- vdev_set_deferred_resilver(spa, vd);
- else
- spa_async_request(spa, SPA_ASYNC_RESILVER);
- }
+ /* If a resilver isn't required, check if vdevs can be culled */
+ if (vd->vdev_aux == NULL && !vdev_is_dead(vd) &&
+ !dsl_scan_resilvering(spa->spa_dsl_pool) &&
+ !dsl_scan_resilver_scheduled(spa->spa_dsl_pool))
+ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
}
@@ -3749,6 +3787,8 @@ void
vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
{
vdev_t *tvd = vd->vdev_top;
+ spa_t *spa = vd->vdev_spa;
+
mutex_enter(&vd->vdev_stat_lock);
if (vs) {
bcopy(&vd->vdev_stat, vs, sizeof (*vs));
@@ -3790,8 +3830,8 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
*/
if (vd->vdev_aux == NULL && tvd != NULL) {
vs->vs_esize = P2ALIGN(
- vd->vdev_max_asize - vd->vdev_asize,
- 1ULL << tvd->vdev_ms_shift);
+ vd->vdev_max_asize - vd->vdev_asize -
+ spa->spa_bootsize, 1ULL << tvd->vdev_ms_shift);
}
if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
vdev_is_concrete(vd)) {
@@ -4384,7 +4424,7 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
}
- zfs_ereport_post(class, spa, vd, NULL, NULL,
+ (void) zfs_ereport_post(class, spa, vd, NULL, NULL,
save_state, 0);
}
@@ -4414,7 +4454,6 @@ vdev_children_are_offline(vdev_t *vd)
/*
* Check the vdev configuration to ensure that it's capable of supporting
* a root pool. We do not support partial configuration.
- * In addition, only a single top-level vdev is allowed.
*/
boolean_t
vdev_is_bootable(vdev_t *vd)
@@ -4422,23 +4461,7 @@ vdev_is_bootable(vdev_t *vd)
if (!vd->vdev_ops->vdev_op_leaf) {
char *vdev_type = vd->vdev_ops->vdev_op_type;
- if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 &&
- vd->vdev_children > 1) {
- int non_indirect = 0;
-
- for (int c = 0; c < vd->vdev_children; c++) {
- vdev_type =
- vd->vdev_child[c]->vdev_ops->vdev_op_type;
- if (strcmp(vdev_type, VDEV_TYPE_INDIRECT) != 0)
- non_indirect++;
- }
- /*
- * non_indirect > 1 means we have more than one
- * top-level vdev, so we stop here.
- */
- if (non_indirect > 1)
- return (B_FALSE);
- } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
+ if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
return (B_FALSE);
}
}
@@ -4559,18 +4582,46 @@ vdev_deadman(vdev_t *vd)
}
void
-vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd)
+vdev_defer_resilver(vdev_t *vd)
{
- for (uint64_t i = 0; i < vd->vdev_children; i++)
- vdev_set_deferred_resilver(spa, vd->vdev_child[i]);
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
- if (!vd->vdev_ops->vdev_op_leaf || !vdev_writeable(vd) ||
- range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
- return;
+ vd->vdev_resilver_deferred = B_TRUE;
+ vd->vdev_spa->spa_resilver_deferred = B_TRUE;
+}
+
+/*
+ * Clears the resilver deferred flag on all leaf devs under vd. Returns
+ * B_TRUE if we have devices that need to be resilvered and are available to
+ * accept resilver I/Os.
+ */
+boolean_t
+vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx)
+{
+ boolean_t resilver_needed = B_FALSE;
+ spa_t *spa = vd->vdev_spa;
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ resilver_needed |= vdev_clear_resilver_deferred(cvd, tx);
}
- vd->vdev_resilver_deferred = B_TRUE;
- spa->spa_resilver_deferred = B_TRUE;
+ if (vd == spa->spa_root_vdev &&
+ spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) {
+ spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
+ vdev_config_dirty(vd);
+ spa->spa_resilver_deferred = B_FALSE;
+ return (resilver_needed);
+ }
+
+ if (!vdev_is_concrete(vd) || vd->vdev_aux ||
+ !vd->vdev_ops->vdev_op_leaf)
+ return (resilver_needed);
+
+ vd->vdev_resilver_deferred = B_FALSE;
+
+ return (!vdev_is_dead(vd) && !vd->vdev_offline &&
+ vdev_resilver_needed(vd, NULL, NULL));
}
/*
diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c
index 9408ec68fb..4be567d551 100644
--- a/usr/src/uts/common/fs/zfs/vdev_disk.c
+++ b/usr/src/uts/common/fs/zfs/vdev_disk.c
@@ -23,6 +23,7 @@
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
* Copyright 2020 Joyent, Inc.
+ * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org>
*/
#include <sys/zfs_context.h>
@@ -365,7 +366,6 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
error = EINVAL; /* presume failure */
if (vd->vdev_path != NULL) {
-
if (vd->vdev_wholedisk == -1ULL) {
size_t len = strlen(vd->vdev_path) + 3;
char *buf = kmem_alloc(len, KM_SLEEP);
@@ -480,6 +480,28 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
}
}
+ /*
+ * If this is early in boot, a sweep of available block devices may
+ * locate an alternative path that we can try.
+ */
+ if (error != 0) {
+ const char *altdevpath = vdev_disk_preroot_lookup(
+ spa_guid(spa), vd->vdev_guid);
+
+ if (altdevpath != NULL) {
+ vdev_dbgmsg(vd, "Trying alternate preroot path (%s)",
+ altdevpath);
+
+ validate_devid = B_TRUE;
+
+ if ((error = ldi_open_by_name((char *)altdevpath,
+ spa_mode(spa), kcred, &dvd->vd_lh, zfs_li)) != 0) {
+ vdev_dbgmsg(vd, "Failed to open by preroot "
+ "path (%s)", altdevpath);
+ }
+ }
+ }
+
if (error != 0) {
vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
vdev_dbgmsg(vd, "vdev_disk_open: failed to open [error=%d]",
@@ -1063,7 +1085,8 @@ vdev_ops_t vdev_disk_ops = {
* the device, and construct a configuration nvlist.
*/
int
-vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
+vdev_disk_read_rootlabel(const char *devpath, const char *devid,
+ nvlist_t **config)
{
ldi_handle_t vd_lh;
vdev_label_t *label;
@@ -1076,7 +1099,7 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
/*
* Read the device label and build the nvlist.
*/
- if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid,
+ if (devid != NULL && ddi_devid_str_decode((char *)devid, &tmpdevid,
&minor_name) == 0) {
error = ldi_open_by_devid(tmpdevid, minor_name,
FREAD, kcred, &vd_lh, zfs_li);
@@ -1084,9 +1107,10 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
ddi_devid_str_free(minor_name);
}
- if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh,
- zfs_li)))
+ if (error != 0 && (error = ldi_open_by_name((char *)devpath, FREAD,
+ kcred, &vd_lh, zfs_li)) != 0) {
return (error);
+ }
if (ldi_get_size(vd_lh, &s)) {
(void) ldi_close(vd_lh, FREAD, kcred);
@@ -1136,3 +1160,150 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
return (error);
}
+
+struct veb {
+ list_t veb_ents;
+ boolean_t veb_scanned;
+};
+
+struct veb_ent {
+ uint64_t vebe_pool_guid;
+ uint64_t vebe_vdev_guid;
+
+ char *vebe_devpath;
+
+ list_node_t vebe_link;
+};
+
+static kmutex_t veb_lock;
+static struct veb *veb;
+
+static int
+vdev_disk_preroot_scan_walk(const char *devpath, void *arg)
+{
+ int r;
+ nvlist_t *cfg = NULL;
+ uint64_t pguid = 0, vguid = 0;
+
+ /*
+ * Attempt to read the label from this block device.
+ */
+ if ((r = vdev_disk_read_rootlabel(devpath, NULL, &cfg)) != 0) {
+ /*
+ * Many of the available block devices will represent slices or
+ * partitions of disks, or may represent disks that are not at
+ * all initialised with ZFS. As this is a best effort
+ * mechanism to locate an alternate path to a particular vdev,
+ * we will ignore any failures and keep scanning.
+ */
+ return (PREROOT_WALK_BLOCK_DEVICES_NEXT);
+ }
+
+ /*
+ * Determine the pool and vdev GUID read from the label for this
+ * device. Both values must be present and have a non-zero value.
+ */
+ if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pguid) != 0 ||
+ nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_GUID, &vguid) != 0 ||
+ pguid == 0 || vguid == 0) {
+ /*
+ * This label was not complete.
+ */
+ goto out;
+ }
+
+ /*
+ * Keep track of all of the GUID-to-devpath mappings we find so that
+ * vdev_disk_preroot_lookup() can search them.
+ */
+ struct veb_ent *vebe = kmem_zalloc(sizeof (*vebe), KM_SLEEP);
+ vebe->vebe_pool_guid = pguid;
+ vebe->vebe_vdev_guid = vguid;
+ vebe->vebe_devpath = spa_strdup(devpath);
+
+ list_insert_tail(&veb->veb_ents, vebe);
+
+out:
+ nvlist_free(cfg);
+ return (PREROOT_WALK_BLOCK_DEVICES_NEXT);
+}
+
+const char *
+vdev_disk_preroot_lookup(uint64_t pool_guid, uint64_t vdev_guid)
+{
+ if (pool_guid == 0 || vdev_guid == 0) {
+ /*
+ * If we aren't provided both a pool and a vdev GUID, we cannot
+ * perform a lookup.
+ */
+ return (NULL);
+ }
+
+ mutex_enter(&veb_lock);
+ if (veb == NULL) {
+ /*
+ * If vdev_disk_preroot_fini() has been called already, there
+ * is nothing we can do.
+ */
+ mutex_exit(&veb_lock);
+ return (NULL);
+ }
+
+ /*
+ * We want to perform at most one scan of all block devices per boot.
+ */
+ if (!veb->veb_scanned) {
+ cmn_err(CE_NOTE, "Performing full ZFS device scan!");
+
+ preroot_walk_block_devices(vdev_disk_preroot_scan_walk, NULL);
+
+ veb->veb_scanned = B_TRUE;
+ }
+
+ const char *path = NULL;
+ for (struct veb_ent *vebe = list_head(&veb->veb_ents); vebe != NULL;
+ vebe = list_next(&veb->veb_ents, vebe)) {
+ if (vebe->vebe_pool_guid == pool_guid &&
+ vebe->vebe_vdev_guid == vdev_guid) {
+ path = vebe->vebe_devpath;
+ break;
+ }
+ }
+
+ mutex_exit(&veb_lock);
+
+ return (path);
+}
+
+void
+vdev_disk_preroot_init(void)
+{
+ mutex_init(&veb_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ VERIFY3P(veb, ==, NULL);
+ veb = kmem_zalloc(sizeof (*veb), KM_SLEEP);
+ list_create(&veb->veb_ents, sizeof (struct veb_ent),
+ offsetof(struct veb_ent, vebe_link));
+ veb->veb_scanned = B_FALSE;
+}
+
+void
+vdev_disk_preroot_fini(void)
+{
+ mutex_enter(&veb_lock);
+
+ if (veb != NULL) {
+ while (!list_is_empty(&veb->veb_ents)) {
+ struct veb_ent *vebe = list_remove_head(&veb->veb_ents);
+
+ spa_strfree(vebe->vebe_devpath);
+
+ kmem_free(vebe, sizeof (*vebe));
+ }
+
+ kmem_free(veb, sizeof (*veb));
+ veb = NULL;
+ }
+
+ mutex_exit(&veb_lock);
+}
diff --git a/usr/src/uts/common/fs/zfs/vdev_indirect.c b/usr/src/uts/common/fs/zfs/vdev_indirect.c
index effea61bc6..6c636dd4d2 100644
--- a/usr/src/uts/common/fs/zfs/vdev_indirect.c
+++ b/usr/src/uts/common/fs/zfs/vdev_indirect.c
@@ -1382,8 +1382,8 @@ vdev_indirect_checksum_error(zio_t *zio,
void *bad_buf = abd_borrow_buf_copy(ic->ic_data, is->is_size);
abd_t *good_abd = is->is_good_child->ic_data;
void *good_buf = abd_borrow_buf_copy(good_abd, is->is_size);
- zfs_ereport_post_checksum(zio->io_spa, vd, &zio->io_bookmark, zio,
- is->is_target_offset, is->is_size, good_buf, bad_buf, &zbc);
+ (void) zfs_ereport_post_checksum(zio->io_spa, vd, &zio->io_bookmark,
+ zio, is->is_target_offset, is->is_size, good_buf, bad_buf, &zbc);
abd_return_buf(ic->ic_data, bad_buf, is->is_size);
abd_return_buf(good_abd, good_buf, is->is_size);
}
@@ -1459,7 +1459,7 @@ vdev_indirect_all_checksum_errors(zio_t *zio)
vd->vdev_stat.vs_checksum_errors++;
mutex_exit(&vd->vdev_stat_lock);
- zfs_ereport_post_checksum(zio->io_spa, vd,
+ (void) zfs_ereport_post_checksum(zio->io_spa, vd,
&zio->io_bookmark, zio, is->is_target_offset,
is->is_size, NULL, NULL, NULL);
}
diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c
index 6235b06f17..b683c3694b 100644
--- a/usr/src/uts/common/fs/zfs/vdev_label.c
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2017, Intel Corporation.
* Copyright 2020 Joyent, Inc.
*/
@@ -150,6 +150,8 @@
#include <sys/dsl_scan.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
+#include <sys/byteorder.h>
+#include <sys/zfs_bootenv.h>
/*
* Basic routines to read and write from a vdev label.
@@ -940,7 +942,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
nvlist_t *label;
vdev_phys_t *vp;
abd_t *vp_abd;
- abd_t *pad2;
+ abd_t *bootenv;
uberblock_t *ub;
abd_t *ub_abd;
zio_t *zio;
@@ -1101,8 +1103,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
ub->ub_txg = 0;
/* Initialize the 2nd padding area. */
- pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
- abd_zero(pad2, VDEV_PAD_SIZE);
+ bootenv = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
+ abd_zero(bootenv, VDEV_PAD_SIZE);
/*
* Write everything in parallel.
@@ -1121,8 +1123,8 @@ retry:
* Zero out the 2nd padding area where it might have
* left over data from previous filesystem format.
*/
- vdev_label_write(zio, vd, l, pad2,
- offsetof(vdev_label_t, vl_pad2),
+ vdev_label_write(zio, vd, l, bootenv,
+ offsetof(vdev_label_t, vl_be),
VDEV_PAD_SIZE, NULL, NULL, flags);
vdev_label_write(zio, vd, l, ub_abd,
@@ -1138,7 +1140,7 @@ retry:
}
nvlist_free(label);
- abd_free(pad2);
+ abd_free(bootenv);
abd_free(ub_abd);
abd_free(vp_abd);
@@ -1162,6 +1164,212 @@ retry:
}
/*
+ * Done callback for vdev_label_read_bootenv_impl. If this is the first
+ * callback to finish, store our abd in the callback pointer. Otherwise, we
+ * just free our abd and return.
+ */
+static void
+vdev_label_read_bootenv_done(zio_t *zio)
+{
+ zio_t *rio = zio->io_private;
+ abd_t **cbp = rio->io_private;
+
+ ASSERT3U(zio->io_size, ==, VDEV_PAD_SIZE);
+
+ if (zio->io_error == 0) {
+ mutex_enter(&rio->io_lock);
+ if (*cbp == NULL) {
+ /* Will free this buffer in vdev_label_read_bootenv. */
+ *cbp = zio->io_abd;
+ } else {
+ abd_free(zio->io_abd);
+ }
+ mutex_exit(&rio->io_lock);
+ } else {
+ abd_free(zio->io_abd);
+ }
+}
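
The done callback is a first-success-wins pattern: every child read gets its
own buffer, the parent zio's io_private points at a single shared slot, and
only the first successful completion keeps its buffer. A condensed sketch of
the pattern outside the zio framework (mutex and names illustrative):

    #include <stdio.h>
    #include <stdlib.h>
    #include <pthread.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static void *result;            /* slot shared by all callbacks */

    /* Called once per completed child read. */
    static void read_done(void *buf, int error)
    {
        if (error != 0) {
            free(buf);              /* failed read: discard */
            return;
        }
        pthread_mutex_lock(&lock);
        if (result == NULL)
            result = buf;           /* first success claims the slot */
        else
            free(buf);              /* later successes are redundant */
        pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
        read_done(malloc(8), -1);   /* I/O error: dropped */
        read_done(malloc(8), 0);    /* claims the result slot */
        read_done(malloc(8), 0);    /* freed as redundant */
        printf("result %p\n", result);
        free(result);
        return (0);
    }
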
+
+static void
+vdev_label_read_bootenv_impl(zio_t *zio, vdev_t *vd, int flags)
+{
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_label_read_bootenv_impl(zio, vd->vdev_child[c], flags);
+
+ /*
+ * We just use the first label that has a correct checksum; the
+ * bootloader should have rewritten them all to be the same on boot,
+ * and any changes we made since boot have been the same across all
+ * labels.
+ */
+ if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ vdev_label_read(zio, vd, l,
+ abd_alloc_linear(VDEV_PAD_SIZE, B_FALSE),
+ offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE,
+ vdev_label_read_bootenv_done, zio, flags);
+ }
+ }
+}
+
+int
+vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *bootenv)
+{
+ nvlist_t *config;
+ spa_t *spa = rvd->vdev_spa;
+ abd_t *abd = NULL;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
+
+ ASSERT(bootenv);
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ zio_t *zio = zio_root(spa, NULL, &abd, flags);
+ vdev_label_read_bootenv_impl(zio, rvd, flags);
+ int err = zio_wait(zio);
+
+ if (abd != NULL) {
+ char *buf;
+ vdev_boot_envblock_t *vbe = abd_to_buf(abd);
+
+ vbe->vbe_version = ntohll(vbe->vbe_version);
+ switch (vbe->vbe_version) {
+ case VB_RAW:
+ /*
+			 * If we have textual data in vbe_bootenv, create an
+			 * nvlist with the key "envmap".
+ */
+ fnvlist_add_uint64(bootenv, BOOTENV_VERSION, VB_RAW);
+ vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0';
+ fnvlist_add_string(bootenv, GRUB_ENVMAP,
+ vbe->vbe_bootenv);
+ break;
+
+ case VB_NVLIST:
+ err = nvlist_unpack(vbe->vbe_bootenv,
+ sizeof (vbe->vbe_bootenv), &config, 0);
+ if (err == 0) {
+ fnvlist_merge(bootenv, config);
+ nvlist_free(config);
+ break;
+ }
+ /* FALLTHROUGH */
+ default:
+ /* Check for FreeBSD zfs bootonce command string */
+ buf = abd_to_buf(abd);
+ if (*buf == '\0') {
+ fnvlist_add_uint64(bootenv, BOOTENV_VERSION,
+ VB_NVLIST);
+ break;
+ }
+ fnvlist_add_string(bootenv, FREEBSD_BOOTONCE, buf);
+ }
+
+ /*
+ * abd was allocated in vdev_label_read_bootenv_impl()
+ */
+ abd_free(abd);
+ /*
+ * If we managed to read any successfully,
+ * return success.
+ */
+ return (0);
+ }
+ return (err);
+}
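
The on-disk vbe_version field is big-endian, so it is byte-swapped before the
dispatch, and anything that matches neither known version is probed as a
FreeBSD "bootonce" string. A standalone sketch of that decode order; the
VB_RAW/VB_NVLIST values are assumptions here, meant to mirror
sys/zfs_bootenv.h:

    #include <stdio.h>
    #include <stdint.h>

    #define VB_RAW      0ULL        /* assumed value, for illustration */
    #define VB_NVLIST   1ULL        /* assumed value, for illustration */

    /* Portable byte-wise equivalent of ntohll() for a stored u64. */
    static uint64_t be64_decode(const unsigned char *p)
    {
        uint64_t v = 0;

        for (int i = 0; i < 8; i++)
            v = (v << 8) | p[i];
        return (v);
    }

    int main(void)
    {
        unsigned char vbe_version[8] = { 0 };   /* as read from vl_be */

        switch (be64_decode(vbe_version)) {
        case VB_RAW:
            printf("raw GRUB environment text\n");
            break;
        case VB_NVLIST:
            printf("packed XDR nvlist\n");
            break;
        default:
            printf("probe for a FreeBSD bootonce string\n");
            break;
        }
        return (0);
    }
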
+
+int
+vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env)
+{
+ zio_t *zio;
+ spa_t *spa = vd->vdev_spa;
+ vdev_boot_envblock_t *bootenv;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+ int error;
+ size_t nvsize;
+ char *nvbuf;
+
+ error = nvlist_size(env, &nvsize, NV_ENCODE_XDR);
+ if (error != 0)
+ return (SET_ERROR(error));
+
+ if (nvsize >= sizeof (bootenv->vbe_bootenv)) {
+ return (SET_ERROR(E2BIG));
+ }
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ error = ENXIO;
+ for (int c = 0; c < vd->vdev_children; c++) {
+ int child_err;
+
+ child_err = vdev_label_write_bootenv(vd->vdev_child[c], env);
+ /*
+ * As long as any of the disks managed to write all of their
+ * labels successfully, return success.
+ */
+ if (child_err == 0)
+ error = child_err;
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf || vdev_is_dead(vd) ||
+ !vdev_writeable(vd)) {
+ return (error);
+ }
+ ASSERT3U(sizeof (*bootenv), ==, VDEV_PAD_SIZE);
+ abd_t *abd = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
+ abd_zero(abd, VDEV_PAD_SIZE);
+
+ bootenv = abd_borrow_buf_copy(abd, VDEV_PAD_SIZE);
+ nvbuf = bootenv->vbe_bootenv;
+ nvsize = sizeof (bootenv->vbe_bootenv);
+
+ bootenv->vbe_version = fnvlist_lookup_uint64(env, BOOTENV_VERSION);
+ switch (bootenv->vbe_version) {
+ case VB_RAW:
+ if (nvlist_lookup_string(env, GRUB_ENVMAP, &nvbuf) == 0) {
+ (void) strlcpy(bootenv->vbe_bootenv, nvbuf, nvsize);
+ }
+ error = 0;
+ break;
+
+ case VB_NVLIST:
+ error = nvlist_pack(env, &nvbuf, &nvsize, NV_ENCODE_XDR,
+ KM_SLEEP);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ if (error == 0) {
+ bootenv->vbe_version = htonll(bootenv->vbe_version);
+ abd_return_buf_copy(abd, bootenv, VDEV_PAD_SIZE);
+ } else {
+ abd_free(abd);
+ return (SET_ERROR(error));
+ }
+
+retry:
+ zio = zio_root(spa, NULL, NULL, flags);
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ vdev_label_write(zio, vd, l, abd,
+ offsetof(vdev_label_t, vl_be),
+ VDEV_PAD_SIZE, NULL, NULL, flags);
+ }
+
+ error = zio_wait(zio);
+ if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
+ flags |= ZIO_FLAG_TRYHARD;
+ goto retry;
+ }
+
+ abd_free(abd);
+ return (error);
+}
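
The tail of the write path is an escalate-once retry: the label writes are
first issued with ordinary CANFAIL semantics and, only if that fails, retried
with ZIO_FLAG_TRYHARD added. A generic sketch of the shape, with a
hypothetical do_writes() in place of the zio pipeline:

    #include <stdio.h>

    #define FLAG_TRYHARD    0x1

    /* Hypothetical I/O routine: succeeds only when TRYHARD is set. */
    static int do_writes(int flags)
    {
        return ((flags & FLAG_TRYHARD) ? 0 : -1);
    }

    int main(void)
    {
        int flags = 0;
        int error;

    retry:
        error = do_writes(flags);
        if (error != 0 && !(flags & FLAG_TRYHARD)) {
            flags |= FLAG_TRYHARD;  /* escalate exactly once */
            goto retry;
        }
        printf("error = %d\n", error);
        return (0);
    }
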
+
+/*
* ==========================================================================
* uberblock load/sync
* ==========================================================================
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c
index 10772d5265..524ba25cb2 100644
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
* Copyright 2019 Joyent, Inc.
* Copyright (c) 2014 Integros [integros.com]
*/
@@ -35,6 +36,8 @@
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
+#include <sys/vdev_raidz.h>
+#include <sys/vdev_raidz_impl.h>
#ifdef ZFS_DEBUG
#include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */
@@ -98,7 +101,7 @@
* R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
* = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
*
- * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
+ * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
* XOR operation, and 2 and 4 can be computed quickly and generate linearly-
* independent coefficients. (There are no additional coefficients that have
* this property which is why the uncorrected Plank method breaks down.)
@@ -107,34 +110,6 @@
* or in concert to recover missing data columns.
*/
-typedef struct raidz_col {
- uint64_t rc_devidx; /* child device index for I/O */
- uint64_t rc_offset; /* device offset */
- uint64_t rc_size; /* I/O size */
- abd_t *rc_abd; /* I/O data */
- void *rc_gdata; /* used to store the "good" version */
- int rc_error; /* I/O error for this device */
- uint8_t rc_tried; /* Did we attempt this I/O column? */
- uint8_t rc_skipped; /* Did we skip this I/O column? */
-} raidz_col_t;
-
-typedef struct raidz_map {
- uint64_t rm_cols; /* Regular column count */
- uint64_t rm_scols; /* Count including skipped columns */
- uint64_t rm_bigcols; /* Number of oversized columns */
- uint64_t rm_asize; /* Actual total I/O size */
- uint64_t rm_missingdata; /* Count of missing data devices */
- uint64_t rm_missingparity; /* Count of missing parity devices */
- uint64_t rm_firstdatacol; /* First data column/parity count */
- uint64_t rm_nskip; /* Skipped sectors for padding */
- uint64_t rm_skipstart; /* Column index of padding start */
- abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */
- uintptr_t rm_reports; /* # of referencing checksum reports */
- uint8_t rm_freed; /* map no longer has referencing ZIO */
- uint8_t rm_ecksuminjected; /* checksum error was injected */
- raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
-} raidz_map_t;
-
#define VDEV_RAIDZ_P 0
#define VDEV_RAIDZ_Q 1
#define VDEV_RAIDZ_R 2
@@ -153,7 +128,7 @@ typedef struct raidz_map {
(mask) = (x) & 0x8080808080808080ULL; \
(mask) = ((mask) << 1) - ((mask) >> 7); \
(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
- ((mask) & 0x1d1d1d1d1d1d1d1d); \
+ ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}
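
VDEV_RAIDZ_64MUL_2() multiplies eight packed GF(2^8) symbols by 2 in a few
64-bit operations (the ULL suffix added above simply keeps the reduction
constant unsigned). A standalone sketch that checks the packed form against a
byte-at-a-time reference:

    #include <stdio.h>
    #include <stdint.h>

    /* Reference: multiply one symbol by 2 in GF(2^8), polynomial 0x11d. */
    static uint8_t gf_mul2(uint8_t a)
    {
        return ((uint8_t)(a << 1) ^ ((a & 0x80) ? 0x1d : 0x00));
    }

    /* Packed form, as in VDEV_RAIDZ_64MUL_2: eight symbols per step. */
    static uint64_t gf_mul2_64(uint64_t x)
    {
        uint64_t mask = x & 0x8080808080808080ULL;

        mask = (mask << 1) - (mask >> 7);   /* 0xff where high bit set */
        return (((x << 1) & 0xfefefefefefefefeULL) ^
            (mask & 0x1d1d1d1d1d1d1d1dULL));
    }

    int main(void)
    {
        uint64_t x = 0x0102408090ffb37dULL;
        uint64_t p = gf_mul2_64(x);

        for (int i = 0; i < 8; i++) {
            uint8_t b = (uint8_t)(x >> (8 * i));
            uint8_t r = (uint8_t)(p >> (8 * i));
            printf("%02x * 2 = %02x (%s)\n", b, r,
                r == gf_mul2(b) ? "ok" : "MISMATCH");
        }
        return (0);
    }
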
#define VDEV_RAIDZ_64MUL_4(x, mask) \
@@ -162,106 +137,7 @@ typedef struct raidz_map {
VDEV_RAIDZ_64MUL_2((x), mask); \
}
-#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE)
-
-/*
- * Force reconstruction to use the general purpose method.
- */
-int vdev_raidz_default_to_general;
-
-/* Powers of 2 in the Galois field defined above. */
-static const uint8_t vdev_raidz_pow2[256] = {
- 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
- 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
- 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
- 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
- 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
- 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
- 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
- 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
- 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
- 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
- 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
- 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
- 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
- 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
- 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
- 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
- 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
- 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
- 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
- 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
- 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
- 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
- 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
- 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
- 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
- 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
- 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
- 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
- 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
- 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
- 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
- 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
-};
-/* Logs of 2 in the Galois field defined above. */
-static const uint8_t vdev_raidz_log2[256] = {
- 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
- 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
- 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
- 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
- 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
- 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
- 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
- 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
- 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
- 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
- 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
- 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
- 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
- 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
- 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
- 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
- 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
- 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
- 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
- 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
- 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
- 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
- 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
- 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
- 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
- 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
- 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
- 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
- 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
- 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
- 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
- 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
-};
-
-static void vdev_raidz_generate_parity(raidz_map_t *rm);
-
-/*
- * Multiply a given number by 2 raised to the given power.
- */
-static uint8_t
-vdev_raidz_exp2(uint_t a, int exp)
-{
- if (a == 0)
- return (0);
-
- ASSERT(exp >= 0);
- ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
-
- exp += vdev_raidz_log2[a];
- if (exp > 255)
- exp -= 255;
-
- return (vdev_raidz_pow2[exp]);
-}
-
-static void
+void
vdev_raidz_map_free(raidz_map_t *rm)
{
int c;
@@ -271,7 +147,6 @@ vdev_raidz_map_free(raidz_map_t *rm)
if (rm->rm_col[c].rc_gdata != NULL)
abd_free(rm->rm_col[c].rc_gdata);
-
}
for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
@@ -311,7 +186,7 @@ static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data)
{
raidz_map_t *rm = zcr->zcr_cbdata;
- size_t c = zcr->zcr_cbinfo;
+ const size_t c = zcr->zcr_cbinfo;
size_t x, offset;
const abd_t *good = NULL;
@@ -459,19 +334,19 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
* Divides the IO evenly across all child vdevs; usually, dcols is
* the number of children in the target vdev.
*/
-static raidz_map_t *
-vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
- uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
+raidz_map_t *
+vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
+ uint64_t nparity)
{
raidz_map_t *rm;
/* The starting RAIDZ (parent) vdev sector of the block. */
- uint64_t b = offset >> unit_shift;
+ uint64_t b = zio->io_offset >> ashift;
/* The zio's size in units of the vdev's minimum sector size. */
- uint64_t s = size >> unit_shift;
+ uint64_t s = zio->io_size >> ashift;
/* The first column for this stripe. */
uint64_t f = b % dcols;
/* The starting byte offset on each child vdev. */
- uint64_t o = (b / dcols) << unit_shift;
+ uint64_t o = (b / dcols) << ashift;
uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
uint64_t off = 0;
@@ -530,7 +405,7 @@ vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
coff = o;
if (col >= dcols) {
col -= dcols;
- coff += 1ULL << unit_shift;
+ coff += 1ULL << ashift;
}
rm->rm_col[c].rc_devidx = col;
rm->rm_col[c].rc_offset = coff;
@@ -543,29 +418,29 @@ vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
if (c >= acols)
rm->rm_col[c].rc_size = 0;
else if (c < bc)
- rm->rm_col[c].rc_size = (q + 1) << unit_shift;
+ rm->rm_col[c].rc_size = (q + 1) << ashift;
else
- rm->rm_col[c].rc_size = q << unit_shift;
+ rm->rm_col[c].rc_size = q << ashift;
asize += rm->rm_col[c].rc_size;
}
- ASSERT3U(asize, ==, tot << unit_shift);
- rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
+ ASSERT3U(asize, ==, tot << ashift);
+ rm->rm_asize = roundup(asize, (nparity + 1) << ashift);
rm->rm_nskip = roundup(tot, nparity + 1) - tot;
- ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
+ ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << ashift);
ASSERT3U(rm->rm_nskip, <=, nparity);
for (c = 0; c < rm->rm_firstdatacol; c++)
rm->rm_col[c].rc_abd =
abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE);
- rm->rm_col[c].rc_abd = abd_get_offset_size(abd, 0,
+ rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0,
rm->rm_col[c].rc_size);
off = rm->rm_col[c].rc_size;
for (c = c + 1; c < acols; c++) {
- rm->rm_col[c].rc_abd = abd_get_offset_size(abd, off,
+ rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, off,
rm->rm_col[c].rc_size);
off += rm->rm_col[c].rc_size;
}
@@ -573,7 +448,7 @@ vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
/*
* If all data stored spans all columns, there's a danger that parity
* will always be on the same device and, since parity isn't read
- * during normal operation, that that device's I/O bandwidth won't be
+ * during normal operation, that device's I/O bandwidth won't be
* used effectively. We therefore switch the parity every 1MB.
*
* ... at least that was, ostensibly, the theory. As a practical
@@ -593,7 +468,7 @@ vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
ASSERT(rm->rm_cols >= 2);
ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
- if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
+ if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
devidx = rm->rm_col[0].rc_devidx;
o = rm->rm_col[0].rc_offset;
rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
@@ -605,6 +480,9 @@ vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
rm->rm_skipstart = 1;
}
+ /* init RAIDZ parity ops */
+ rm->rm_ops = vdev_raidz_math_get_ops();
+
return (rm);
}
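
A worked instance of the geometry arithmetic at the top of
vdev_raidz_map_alloc(), assuming the usual q/r/bc/tot formulas from the
unchanged body of the function: a 32 KiB I/O at offset 0 on a five-wide
raidz1 with ashift 9.

    #include <stdio.h>
    #include <inttypes.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t io_offset = 0, io_size = 32768;        /* 32 KiB I/O */
        uint64_t ashift = 9, dcols = 5, nparity = 1;    /* raidz1, 5 wide */

        uint64_t b = io_offset >> ashift;   /* starting parent sector */
        uint64_t s = io_size >> ashift;     /* I/O size in sectors */
        uint64_t f = b % dcols;             /* first column */
        uint64_t o = (b / dcols) << ashift; /* child byte offset */

        /* Sectors per data column, plus a remainder spread one per col. */
        uint64_t q = s / (dcols - nparity);
        uint64_t r = s - q * (dcols - nparity);
        uint64_t bc = (r == 0 ? 0 : r + nparity);
        uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

        /* Prints: b=0 s=64 f=0 o=0 q=16 r=0 bc=0 tot=80 */
        printf("b=%" PRIu64 " s=%" PRIu64 " f=%" PRIu64 " o=%" PRIu64
            " q=%" PRIu64 " r=%" PRIu64 " bc=%" PRIu64 " tot=%" PRIu64 "\n",
            b, s, f, o, q, r, bc, tot);
        return (0);
    }

So 64 data sectors are laid out as 16 per data column, with 16 parity
sectors on top: 80 sectors of actual allocation.
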
@@ -681,7 +559,6 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm)
p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
if (c == rm->rm_firstdatacol) {
- ASSERT3U(src->abd_size, >=, rm->rm_col[c].rc_size);
abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size);
} else {
struct pqr_struct pqr = { p, NULL, NULL };
@@ -793,9 +670,13 @@ vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
* Generate RAID parity in the first virtual columns according to the number of
* parity columns available.
*/
-static void
+void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
+ /* Generate using the new math implementation */
+ if (vdev_raidz_math_generate(rm) != RAIDZ_ORIGINAL_IMPL)
+ return;
+
switch (rm->rm_firstdatacol) {
case 1:
vdev_raidz_generate_parity_p(rm);
@@ -873,8 +754,8 @@ vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
int cnt = size / sizeof (dst[0]);
for (int i = 0; i < cnt; i++, dst++, rq->q++) {
- *dst ^= *rq->q;
+ *dst ^= *rq->q;
int j;
uint8_t *b;
for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
@@ -1159,9 +1040,12 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
* ~~ ~~
* __ __
* | 1 1 1 1 1 1 1 1 |
+ * | 128 64 32 16 8 4 2 1 |
* | 19 205 116 29 64 16 4 1 |
* | 1 0 0 0 0 0 0 0 |
- * (V|I)' = | 0 0 0 1 0 0 0 0 |
+ * | 0 1 0 0 0 0 0 0 |
+ * (V|I)' = | 0 0 1 0 0 0 0 0 |
+ * | 0 0 0 1 0 0 0 0 |
* | 0 0 0 0 1 0 0 0 |
* | 0 0 0 0 0 1 0 0 |
* | 0 0 0 0 0 0 1 0 |
@@ -1385,8 +1269,8 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
int i, j, x, cc, c;
uint8_t *src;
uint64_t ccount;
- uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
- uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
+ uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
+ uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
uint8_t log = 0;
uint8_t val;
int ll;
@@ -1595,12 +1479,12 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
return (code);
}
-static int
-vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
+int
+vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
{
int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
int ntgts;
- int i, c;
+ int i, c, ret;
int code;
int nbadparity, nbaddata;
int parity_valid[VDEV_RAIDZ_MAXPARITY];
@@ -1638,34 +1522,37 @@ vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
dt = &tgts[nbadparity];
+ /* Reconstruct using the new math implementation */
+ ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata);
+ if (ret != RAIDZ_ORIGINAL_IMPL)
+ return (ret);
+
/*
* See if we can use any of our optimized reconstruction routines.
*/
- if (!vdev_raidz_default_to_general) {
- switch (nbaddata) {
- case 1:
- if (parity_valid[VDEV_RAIDZ_P])
- return (vdev_raidz_reconstruct_p(rm, dt, 1));
+ switch (nbaddata) {
+ case 1:
+ if (parity_valid[VDEV_RAIDZ_P])
+ return (vdev_raidz_reconstruct_p(rm, dt, 1));
- ASSERT(rm->rm_firstdatacol > 1);
+ ASSERT(rm->rm_firstdatacol > 1);
- if (parity_valid[VDEV_RAIDZ_Q])
- return (vdev_raidz_reconstruct_q(rm, dt, 1));
+ if (parity_valid[VDEV_RAIDZ_Q])
+ return (vdev_raidz_reconstruct_q(rm, dt, 1));
- ASSERT(rm->rm_firstdatacol > 2);
- break;
+ ASSERT(rm->rm_firstdatacol > 2);
+ break;
- case 2:
- ASSERT(rm->rm_firstdatacol > 1);
+ case 2:
+ ASSERT(rm->rm_firstdatacol > 1);
- if (parity_valid[VDEV_RAIDZ_P] &&
- parity_valid[VDEV_RAIDZ_Q])
- return (vdev_raidz_reconstruct_pq(rm, dt, 2));
+ if (parity_valid[VDEV_RAIDZ_P] &&
+ parity_valid[VDEV_RAIDZ_Q])
+ return (vdev_raidz_reconstruct_pq(rm, dt, 2));
- ASSERT(rm->rm_firstdatacol > 2);
+ ASSERT(rm->rm_firstdatacol > 2);
- break;
- }
+ break;
}
code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
@@ -1821,11 +1708,16 @@ vdev_raidz_dumpio(vdev_t *vd, caddr_t data, size_t size,
* treat the on-disk format as if the only blocks are the complete 128
* KB size.
*/
- abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
+
+ /* First, fake a zio for vdev_raidz_map_alloc. */
+ zio_t *zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
+ zio->io_offset = origoffset;
+ zio->io_size = SPA_OLD_MAXBLOCKSIZE;
+ zio->io_abd = abd_get_from_buf(data - (offset - origoffset),
SPA_OLD_MAXBLOCKSIZE);
- rm = vdev_raidz_map_alloc(abd,
- SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
- vd->vdev_children, vd->vdev_nparity);
+
+ rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
+ vd->vdev_nparity);
coloffset = origoffset;
@@ -1860,21 +1752,17 @@ vdev_raidz_dumpio(vdev_t *vd, caddr_t data, size_t size,
VERIFY3U(colsize, <=, rc->rc_size);
VERIFY3U(colskip, <=, rc->rc_size);
- /*
- * Note that the child vdev will have a vdev label at the start
- * of its range of offsets, hence the need for
- * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
- * example of why this calculation is needed.
- */
if ((err = cvd->vdev_ops->vdev_op_dumpio(cvd,
((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
- VDEV_LABEL_OFFSET(rc->rc_offset) + colskip, 0,
- doread, isdump)) != 0)
+ rc->rc_offset + colskip, 0, doread, isdump)) != 0) {
break;
+ }
}
vdev_raidz_map_free(rm);
- abd_put(abd);
+ abd_put(zio->io_abd);
+ kmem_free(zio, sizeof (zio_t));
+
#endif /* KERNEL */
return (err);
@@ -1965,8 +1853,7 @@ vdev_raidz_io_start(zio_t *zio)
raidz_col_t *rc;
int c, i;
- rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
- tvd->vdev_ashift, vd->vdev_children,
+ rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
vd->vdev_nparity);
zio->io_vsd = rm;
@@ -2073,7 +1960,7 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
zbc.zbc_has_cksum = 0;
zbc.zbc_injected = rm->rm_ecksuminjected;
- zfs_ereport_post_checksum(zio->io_spa, vd,
+ (void) zfs_ereport_post_checksum(zio->io_spa, vd,
&zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
rc->rc_abd, bad_data, &zbc);
}
@@ -2141,11 +2028,6 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
return (ret);
}
-/*
- * Keep statistics on all the ways that we used parity to correct data.
- */
-static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
-
static int
vdev_raidz_worst_error(raidz_map_t *rm)
{
@@ -2251,7 +2133,6 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
*/
code = vdev_raidz_reconstruct(rm, tgts, n);
if (raidz_checksum_verify(zio) == 0) {
- atomic_inc_64(&raidz_corrected[code]);
for (i = 0; i < n; i++) {
c = tgts[i];
@@ -2466,8 +2347,6 @@ vdev_raidz_io_done(zio_t *zio)
code = vdev_raidz_reconstruct(rm, tgts, n);
if (raidz_checksum_verify(zio) == 0) {
- atomic_inc_64(&raidz_corrected[code]);
-
/*
* If we read more parity disks than were used
* for reconstruction, confirm that the other
@@ -2620,7 +2499,7 @@ vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
/*
* Determine if any portion of the provided block resides on a child vdev
* with a dirty DTL and therefore needs to be resilvered. The function
- * assumes that at least one DTL is dirty which imples that full stripe
+ * assumes that at least one DTL is dirty which implies that full stripe
* width blocks must be resilvered.
*/
static boolean_t
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz_math.c b/usr/src/uts/common/fs/zfs/vdev_raidz_math.c
new file mode 100644
index 0000000000..1591147375
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz_math.c
@@ -0,0 +1,573 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/types.h>
+#include <sys/zio.h>
+#include <sys/debug.h>
+#include <sys/zfs_debug.h>
+#include <sys/vdev_raidz.h>
+#include <sys/vdev_raidz_impl.h>
+#include <sys/simd.h>
+
+#ifndef isspace
+#define isspace(c) ((c) == ' ' || (c) == '\t' || (c) == '\n' || \
+ (c) == '\r' || (c) == '\f' || (c) == '\013')
+#endif
+
+extern boolean_t raidz_will_scalar_work(void);
+
+/* Opaque implementation with NULL methods to represent original methods */
+static const raidz_impl_ops_t vdev_raidz_original_impl = {
+ .name = "original",
+ .is_supported = raidz_will_scalar_work,
+};
+
+/* RAIDZ parity ops that contain the fastest methods */
+static raidz_impl_ops_t vdev_raidz_fastest_impl = {
+ .name = "fastest"
+};
+
+/* All compiled in implementations */
+const raidz_impl_ops_t *raidz_all_maths[] = {
+ &vdev_raidz_original_impl,
+ &vdev_raidz_scalar_impl,
+#if defined(__amd64)
+	&vdev_raidz_sse2_impl,
+	&vdev_raidz_ssse3_impl,
+	&vdev_raidz_avx2_impl,
+#endif
+};
+
+/* Indicate that benchmark has been completed */
+static boolean_t raidz_math_initialized = B_FALSE;
+
+/* Select raidz implementation */
+#define IMPL_FASTEST (UINT32_MAX)
+#define IMPL_CYCLE (UINT32_MAX - 1)
+#define IMPL_ORIGINAL (0)
+#define IMPL_SCALAR (1)
+
+#define RAIDZ_IMPL_READ(i) (*(volatile uint32_t *) &(i))
+
+static uint32_t zfs_vdev_raidz_impl = IMPL_SCALAR;
+static uint32_t user_sel_impl = IMPL_FASTEST;
+
+/* Hold all supported implementations */
+static size_t raidz_supp_impl_cnt = 0;
+static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)];
+
+#if defined(_KERNEL)
+/*
+ * kstats values for supported implementations
+ * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s]
+ *
+ * PORTING NOTE:
+ * On illumos this is not a kstat. OpenZFS uses its home-grown kstat code
+ * which implements a free-form kstat using additional functionality that does
+ * not exist in illumos. Because there are no software consumers of this
+ * information, we omit a kstat API. If an administrator needs to see this
+ * data for some reason, they can use mdb.
+ *
+ * The format of the kstat data on OpenZFS would be a "header" that looks like
+ * this (a column for each entry in the "raidz_gen_name" and "raidz_rec_name"
+ * arrays, starting with the parity function "implementation" name):
+ * impl gen_p gen_pq gen_pqr rec_p rec_q rec_r rec_pq rec_pr rec_qr rec_pqr
+ * This is followed by a row for each parity function implementation, showing
+ * the "speed" values calculated for that implementation for each of the
+ * parity generation and reconstruction functions in the "raidz_all_maths"
+ * array.
+ */
+static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1];
+
+#endif
+
+/*
+ * Returns the RAIDZ operations for raidz_map() parity calculations.  When
+ * a SIMD implementation is not allowed in the current context, fall back
+ * to the fastest generic implementation.
+ */
+const raidz_impl_ops_t *
+vdev_raidz_math_get_ops(void)
+{
+ if (!kfpu_allowed())
+ return (&vdev_raidz_scalar_impl);
+
+ raidz_impl_ops_t *ops = NULL;
+ const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
+
+ switch (impl) {
+ case IMPL_FASTEST:
+ ASSERT(raidz_math_initialized);
+ ops = &vdev_raidz_fastest_impl;
+ break;
+ case IMPL_CYCLE:
+ /* Cycle through all supported implementations */
+ ASSERT(raidz_math_initialized);
+ ASSERT3U(raidz_supp_impl_cnt, >, 0);
+ static size_t cycle_impl_idx = 0;
+ size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt;
+ ops = raidz_supp_impl[idx];
+ break;
+ case IMPL_ORIGINAL:
+ ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl;
+ break;
+ case IMPL_SCALAR:
+ ops = (raidz_impl_ops_t *)&vdev_raidz_scalar_impl;
+ break;
+ default:
+ ASSERT3U(impl, <, raidz_supp_impl_cnt);
+ ASSERT3U(raidz_supp_impl_cnt, >, 0);
+ if (impl < ARRAY_SIZE(raidz_all_maths))
+ ops = raidz_supp_impl[impl];
+ break;
+ }
+
+ ASSERT3P(ops, !=, NULL);
+
+ return (ops);
+}
+
+/*
+ * Select parity generation method for raidz_map
+ */
+int
+vdev_raidz_math_generate(raidz_map_t *rm)
+{
+ raidz_gen_f gen_parity = NULL;
+
+ switch (raidz_parity(rm)) {
+ case 1:
+ gen_parity = rm->rm_ops->gen[RAIDZ_GEN_P];
+ break;
+ case 2:
+ gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQ];
+ break;
+ case 3:
+ gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQR];
+ break;
+ default:
+ gen_parity = NULL;
+ cmn_err(CE_PANIC, "invalid RAID-Z configuration %u",
+ (uint_t)raidz_parity(rm));
+ break;
+ }
+
+ /* if method is NULL execute the original implementation */
+ if (gen_parity == NULL)
+ return (RAIDZ_ORIGINAL_IMPL);
+
+ gen_parity(rm);
+
+ return (0);
+}
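
The function above is a dispatch-or-fall-back: the selected ops table is
consulted first, and the RAIDZ_ORIGINAL_IMPL sentinel tells the caller to run
the original scalar routines instead. A minimal sketch of that control flow
(sentinel value and table contents illustrative):

    #include <stdio.h>

    #define ORIGINAL_IMPL   (-1)    /* sentinel: caller must fall back */

    typedef void (*gen_f)(void);

    static void gen_p_fast(void) { printf("accelerated P parity\n"); }

    /* One slot per parity level; NULL means no accelerated method. */
    static gen_f gen_table[3] = { gen_p_fast, NULL, NULL };

    static int try_generate(int nparity)
    {
        gen_f fn = gen_table[nparity - 1];

        if (fn == NULL)
            return (ORIGINAL_IMPL);
        fn();
        return (0);
    }

    int main(void)
    {
        if (try_generate(1) == ORIGINAL_IMPL)
            printf("falling back for P\n");
        if (try_generate(2) == ORIGINAL_IMPL)
            printf("falling back to the original PQ implementation\n");
        return (0);
    }
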
+
+static raidz_rec_f
+reconstruct_fun_p_sel(raidz_map_t *rm, const int *parity_valid,
+ const int nbaddata)
+{
+ if (nbaddata == 1 && parity_valid[CODE_P]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_P]);
+ }
+ return ((raidz_rec_f) NULL);
+}
+
+static raidz_rec_f
+reconstruct_fun_pq_sel(raidz_map_t *rm, const int *parity_valid,
+ const int nbaddata)
+{
+ if (nbaddata == 1) {
+ if (parity_valid[CODE_P]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_P]);
+ } else if (parity_valid[CODE_Q]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_Q]);
+ }
+ } else if (nbaddata == 2 &&
+ parity_valid[CODE_P] && parity_valid[CODE_Q]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_PQ]);
+ }
+ return ((raidz_rec_f) NULL);
+}
+
+static raidz_rec_f
+reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid,
+ const int nbaddata)
+{
+ if (nbaddata == 1) {
+ if (parity_valid[CODE_P]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_P]);
+ } else if (parity_valid[CODE_Q]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_Q]);
+ } else if (parity_valid[CODE_R]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_R]);
+ }
+ } else if (nbaddata == 2) {
+ if (parity_valid[CODE_P] && parity_valid[CODE_Q]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_PQ]);
+ } else if (parity_valid[CODE_P] && parity_valid[CODE_R]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_PR]);
+ } else if (parity_valid[CODE_Q] && parity_valid[CODE_R]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_QR]);
+ }
+ } else if (nbaddata == 3 &&
+ parity_valid[CODE_P] && parity_valid[CODE_Q] &&
+ parity_valid[CODE_R]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_PQR]);
+ }
+ return ((raidz_rec_f) NULL);
+}
+
+/*
+ * Select data reconstruction method for raidz_map
+ * @parity_valid - Parity validity flag
+ * @dt - Failed data index array
+ * @nbaddata - Number of failed data columns
+ */
+int
+vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid,
+ const int *dt, const int nbaddata)
+{
+ raidz_rec_f rec_fn = NULL;
+
+ switch (raidz_parity(rm)) {
+ case PARITY_P:
+ rec_fn = reconstruct_fun_p_sel(rm, parity_valid, nbaddata);
+ break;
+ case PARITY_PQ:
+ rec_fn = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata);
+ break;
+ case PARITY_PQR:
+ rec_fn = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata);
+ break;
+ default:
+ cmn_err(CE_PANIC, "invalid RAID-Z configuration %u",
+ (uint_t)raidz_parity(rm));
+ break;
+ }
+
+ if (rec_fn == NULL)
+ return (RAIDZ_ORIGINAL_IMPL);
+ else
+ return (rec_fn(rm, dt));
+}
+
+const char *raidz_gen_name[] = {
+ "gen_p", "gen_pq", "gen_pqr"
+};
+const char *raidz_rec_name[] = {
+ "rec_p", "rec_q", "rec_r",
+ "rec_pq", "rec_pr", "rec_qr", "rec_pqr"
+};
+
+#if defined(_KERNEL)
+
+#define BENCH_D_COLS (8ULL)
+#define BENCH_COLS (BENCH_D_COLS + PARITY_PQR)
+#define BENCH_ZIO_SIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) /* 128 kiB */
+#define BENCH_NS MSEC2NSEC(25) /* 25ms */
+
+typedef void (*benchmark_fn)(raidz_map_t *rm, const int fn);
+
+static void
+benchmark_gen_impl(raidz_map_t *rm, const int fn)
+{
+ (void) fn;
+ vdev_raidz_generate_parity(rm);
+}
+
+static void
+benchmark_rec_impl(raidz_map_t *rm, const int fn)
+{
+ static const int rec_tgt[7][3] = {
+ {1, 2, 3}, /* rec_p: bad QR & D[0] */
+ {0, 2, 3}, /* rec_q: bad PR & D[0] */
+ {0, 1, 3}, /* rec_r: bad PQ & D[0] */
+ {2, 3, 4}, /* rec_pq: bad R & D[0][1] */
+ {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */
+ {0, 3, 4}, /* rec_qr: bad P & D[0][1] */
+ {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */
+ };
+
+ vdev_raidz_reconstruct(rm, rec_tgt[fn], 3);
+}
+
+/*
+ * Benchmarking of all supported implementations (raidz_supp_impl_cnt)
+ * is performed by setting the rm_ops pointer and calling the top level
+ * generate/reconstruct methods of bench_rm.
+ */
+static void
+benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn)
+{
+ uint64_t run_cnt, speed, best_speed = 0;
+ hrtime_t t_start, t_diff;
+ raidz_impl_ops_t *curr_impl;
+ raidz_impl_kstat_t *fstat = &raidz_impl_kstats[raidz_supp_impl_cnt];
+ int impl, i;
+
+ for (impl = 0; impl < raidz_supp_impl_cnt; impl++) {
+ /* set an implementation to benchmark */
+ curr_impl = raidz_supp_impl[impl];
+ bench_rm->rm_ops = curr_impl;
+
+ run_cnt = 0;
+ t_start = gethrtime();
+
+ do {
+ for (i = 0; i < 25; i++, run_cnt++)
+ bench_fn(bench_rm, fn);
+
+ t_diff = gethrtime() - t_start;
+ } while (t_diff < BENCH_NS);
+
+ speed = run_cnt * BENCH_ZIO_SIZE * NANOSEC;
+ speed /= (t_diff * BENCH_COLS);
+
+ if (bench_fn == benchmark_gen_impl)
+ raidz_impl_kstats[impl].gen[fn] = speed;
+ else
+ raidz_impl_kstats[impl].rec[fn] = speed;
+
+ /* Update fastest implementation method */
+ if (speed > best_speed) {
+ best_speed = speed;
+
+ if (bench_fn == benchmark_gen_impl) {
+ fstat->gen[fn] = impl;
+ vdev_raidz_fastest_impl.gen[fn] =
+ curr_impl->gen[fn];
+ } else {
+ fstat->rec[fn] = impl;
+ vdev_raidz_fastest_impl.rec[fn] =
+ curr_impl->rec[fn];
+ }
+ }
+ }
+}
+#endif
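
The speed figure recorded for each implementation is
run_cnt * BENCH_ZIO_SIZE * NANOSEC / (t_diff * BENCH_COLS), i.e. per-disk
bytes per second. A worked instance with made-up run_cnt and t_diff values:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t run_cnt = 1000;            /* iterations completed */
        uint64_t zio_size = 128ULL * 1024;  /* BENCH_ZIO_SIZE: 128 KiB */
        uint64_t nanosec = 1000000000ULL;   /* NANOSEC */
        uint64_t t_diff = 25000000ULL;      /* 25 ms elapsed, in ns */
        uint64_t cols = 8 + 3;              /* BENCH_COLS: 8 data + PQR */

        uint64_t speed = run_cnt * zio_size * nanosec / (t_diff * cols);

        /* 476625454 B/s, i.e. roughly 477 MB/s per disk */
        printf("%llu B/s per disk\n", (unsigned long long)speed);
        return (0);
    }
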
+
+/*
+ * Initialize and benchmark all supported implementations.
+ */
+static void
+benchmark_raidz(void)
+{
+ raidz_impl_ops_t *curr_impl;
+ int i, c;
+
+ /* Move supported impl into raidz_supp_impl */
+ for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
+ curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i];
+
+ if (curr_impl->init)
+ curr_impl->init();
+
+ if (curr_impl->is_supported())
+ raidz_supp_impl[c++] = (raidz_impl_ops_t *)curr_impl;
+ }
+ membar_producer(); /* complete raidz_supp_impl[] init */
+ raidz_supp_impl_cnt = c; /* number of supported impl */
+
+#if defined(_KERNEL)
+ zio_t *bench_zio = NULL;
+ raidz_map_t *bench_rm = NULL;
+ uint64_t bench_parity;
+
+ /* Fake a zio and run the benchmark on a warmed up buffer */
+ bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
+ bench_zio->io_offset = 0;
+ bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */
+ bench_zio->io_abd = abd_alloc_linear(BENCH_ZIO_SIZE, B_TRUE);
+ memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE);
+
+ /* Benchmark parity generation methods */
+ for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
+ bench_parity = fn + 1;
+ /* New raidz_map is needed for each generate_p/q/r */
+ bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
+ BENCH_D_COLS + bench_parity, bench_parity);
+
+ benchmark_raidz_impl(bench_rm, fn, benchmark_gen_impl);
+
+ vdev_raidz_map_free(bench_rm);
+ }
+
+ /* Benchmark data reconstruction methods */
+ bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
+ BENCH_COLS, PARITY_PQR);
+
+ for (int fn = 0; fn < RAIDZ_REC_NUM; fn++)
+ benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl);
+
+ vdev_raidz_map_free(bench_rm);
+
+ /* cleanup the bench zio */
+ abd_free(bench_zio->io_abd);
+ kmem_free(bench_zio, sizeof (zio_t));
+#else
+ /*
+ * Skip the benchmark in user space to avoid impacting libzpool
+ * consumers (zdb, zhack, zinject, ztest). The last implementation
+	 * is assumed to be the fastest and is used by default.
+ */
+ memcpy(&vdev_raidz_fastest_impl,
+ raidz_supp_impl[raidz_supp_impl_cnt - 1],
+ sizeof (vdev_raidz_fastest_impl));
+ strcpy(vdev_raidz_fastest_impl.name, "fastest");
+#endif /* _KERNEL */
+}
+
+void
+vdev_raidz_math_init(void)
+{
+ /* Determine the fastest available implementation. */
+ benchmark_raidz();
+
+ /* Finish initialization */
+ atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl);
+ raidz_math_initialized = B_TRUE;
+}
+
+void
+vdev_raidz_math_fini(void)
+{
+ raidz_impl_ops_t const *curr_impl;
+
+ for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
+ curr_impl = raidz_all_maths[i];
+ if (curr_impl->fini)
+ curr_impl->fini();
+ }
+}
+
+static const struct {
+ char *name;
+ uint32_t sel;
+} math_impl_opts[] = {
+ { "cycle", IMPL_CYCLE },
+ { "fastest", IMPL_FASTEST },
+ { "original", IMPL_ORIGINAL },
+ { "scalar", IMPL_SCALAR }
+};
+
+/*
+ * Set the desired raidz implementation.
+ *
+ * If we are called before init(), the user preference is saved in
+ * user_sel_impl and applied in the later init() call. This occurs when the
+ * module parameter is specified at module load. Otherwise, update
+ * zfs_vdev_raidz_impl directly.
+ *
+ * @val		Name of raidz implementation to use
+ */
+int
+vdev_raidz_impl_set(const char *val)
+{
+ int err = -EINVAL;
+ char req_name[RAIDZ_IMPL_NAME_MAX];
+ uint32_t impl = RAIDZ_IMPL_READ(user_sel_impl);
+ size_t i;
+
+ /* sanitize input */
+ i = strnlen(val, RAIDZ_IMPL_NAME_MAX);
+ if (i == 0 || i == RAIDZ_IMPL_NAME_MAX)
+ return (err);
+
+ strlcpy(req_name, val, RAIDZ_IMPL_NAME_MAX);
+ while (i > 0 && !!isspace(req_name[i-1]))
+ i--;
+ req_name[i] = '\0';
+
+ /* Check mandatory options */
+ for (i = 0; i < ARRAY_SIZE(math_impl_opts); i++) {
+ if (strcmp(req_name, math_impl_opts[i].name) == 0) {
+ impl = math_impl_opts[i].sel;
+ err = 0;
+ break;
+ }
+ }
+
+ /* check all supported impl if init() was already called */
+ if (err != 0 && raidz_math_initialized) {
+ /* check all supported implementations */
+ for (i = 0; i < raidz_supp_impl_cnt; i++) {
+ if (strcmp(req_name, raidz_supp_impl[i]->name) == 0) {
+ impl = i;
+ err = 0;
+ break;
+ }
+ }
+ }
+
+ if (err == 0) {
+ if (raidz_math_initialized)
+ atomic_swap_32(&zfs_vdev_raidz_impl, impl);
+ else
+ atomic_swap_32(&user_sel_impl, impl);
+ }
+
+ return (err);
+}
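
A condensed user-space sketch of the same sanitize-and-match logic: a bounded
length check, a trailing-whitespace trim, then a table lookup (the option
table here holds just the mandatory names):

    #include <stdio.h>
    #include <string.h>
    #include <ctype.h>

    #define NAME_MAX    16

    static const char *opts[] = { "cycle", "fastest", "original", "scalar" };

    /* Returns the matched option index, or -1 on bad input. */
    static int impl_set(const char *val)
    {
        char req[NAME_MAX];
        size_t i = strnlen(val, NAME_MAX);

        if (i == 0 || i == NAME_MAX)    /* empty or unterminated */
            return (-1);

        memcpy(req, val, i);
        while (i > 0 && isspace((unsigned char)req[i - 1]))
            i--;                        /* strip trailing whitespace */
        req[i] = '\0';

        for (size_t j = 0; j < sizeof (opts) / sizeof (opts[0]); j++) {
            if (strcmp(req, opts[j]) == 0)
                return ((int)j);
        }
        return (-1);
    }

    int main(void)
    {
        printf("%d\n", impl_set("fastest\n"));  /* 1 */
        printf("%d\n", impl_set("bogus"));      /* -1 */
        return (0);
    }
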
+
+#if defined(_KERNEL) && defined(__linux__)
+
+static int
+zfs_vdev_raidz_impl_set(const char *val, zfs_kernel_param_t *kp)
+{
+ return (vdev_raidz_impl_set(val));
+}
+
+static int
+zfs_vdev_raidz_impl_get(char *buffer, zfs_kernel_param_t *kp)
+{
+ int i, cnt = 0;
+ char *fmt;
+ const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
+
+ ASSERT(raidz_math_initialized);
+
+ /* list mandatory options */
+ for (i = 0; i < ARRAY_SIZE(math_impl_opts) - 2; i++) {
+ fmt = (impl == math_impl_opts[i].sel) ? "[%s] " : "%s ";
+ cnt += sprintf(buffer + cnt, fmt, math_impl_opts[i].name);
+ }
+
+ /* list all supported implementations */
+ for (i = 0; i < raidz_supp_impl_cnt; i++) {
+ fmt = (i == impl) ? "[%s] " : "%s ";
+ cnt += sprintf(buffer + cnt, fmt, raidz_supp_impl[i]->name);
+ }
+
+ return (cnt);
+}
+
+module_param_call(zfs_vdev_raidz_impl, zfs_vdev_raidz_impl_set,
+ zfs_vdev_raidz_impl_get, NULL, 0644);
+MODULE_PARM_DESC(zfs_vdev_raidz_impl, "Select raidz implementation.");
+#endif
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz_math_avx2.c b/usr/src/uts/common/fs/zfs/vdev_raidz_math_avx2.c
new file mode 100644
index 0000000000..1a0214547b
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz_math_avx2.c
@@ -0,0 +1,424 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+#include <sys/isa_defs.h>
+
+#if defined(__amd64)
+
+#include <sys/types.h>
+#include <sys/simd.h>
+
+#define __asm __asm__ __volatile__
+
+#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
+#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
+
+#define VR0_(REG, ...) "ymm"#REG
+#define VR1_(_1, REG, ...) "ymm"#REG
+#define VR2_(_1, _2, REG, ...) "ymm"#REG
+#define VR3_(_1, _2, _3, REG, ...) "ymm"#REG
+#define VR4_(_1, _2, _3, _4, REG, ...) "ymm"#REG
+#define VR5_(_1, _2, _3, _4, _5, REG, ...) "ymm"#REG
+#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "ymm"#REG
+#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "ymm"#REG
+
+#define VR0(r...) VR0_(r)
+#define VR1(r...) VR1_(r)
+#define VR2(r...) VR2_(r, 1)
+#define VR3(r...) VR3_(r, 1, 2)
+#define VR4(r...) VR4_(r, 1, 2)
+#define VR5(r...) VR5_(r, 1, 2, 3)
+#define VR6(r...) VR6_(r, 1, 2, 3, 4)
+#define VR7(r...) VR7_(r, 1, 2, 3, 4, 5)
+
+#define R_01(REG1, REG2, ...) REG1, REG2
+#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3
+#define R_23(REG...) _R_23(REG, 1, 2, 3)
+
+#define ZFS_ASM_BUG() ASSERT(0)
+
+extern const uint8_t gf_clmul_mod_lt[4*256][16];
+
+#define ELEM_SIZE 32
+
+typedef struct v {
+ uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
+} v_t;
+
+
+#define XOR_ACC(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "vpxor 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n" \
+ "vpxor 0x20(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n" \
+ "vpxor 0x40(%[SRC]), %%" VR2(r)", %%" VR2(r) "\n" \
+ "vpxor 0x60(%[SRC]), %%" VR3(r)", %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 2: \
+ __asm( \
+ "vpxor 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n" \
+ "vpxor 0x20(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define XOR(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "vpxor %" VR0(r) ", %" VR4(r)", %" VR4(r) "\n" \
+ "vpxor %" VR1(r) ", %" VR5(r)", %" VR5(r) "\n" \
+ "vpxor %" VR2(r) ", %" VR6(r)", %" VR6(r) "\n" \
+ "vpxor %" VR3(r) ", %" VR7(r)", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "vpxor %" VR0(r) ", %" VR2(r)", %" VR2(r) "\n" \
+ "vpxor %" VR1(r) ", %" VR3(r)", %" VR3(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define ZERO(r...) XOR(r, r)
+
+#define COPY(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "vmovdqa %" VR0(r) ", %" VR4(r) "\n" \
+ "vmovdqa %" VR1(r) ", %" VR5(r) "\n" \
+ "vmovdqa %" VR2(r) ", %" VR6(r) "\n" \
+ "vmovdqa %" VR3(r) ", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "vmovdqa %" VR0(r) ", %" VR2(r) "\n" \
+ "vmovdqa %" VR1(r) ", %" VR3(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define LOAD(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "vmovdqa 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "vmovdqa 0x20(%[SRC]), %%" VR1(r) "\n" \
+ "vmovdqa 0x40(%[SRC]), %%" VR2(r) "\n" \
+ "vmovdqa 0x60(%[SRC]), %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 2: \
+ __asm( \
+ "vmovdqa 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "vmovdqa 0x20(%[SRC]), %%" VR1(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define STORE(dst, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "vmovdqa %%" VR0(r) ", 0x00(%[DST])\n" \
+ "vmovdqa %%" VR1(r) ", 0x20(%[DST])\n" \
+ "vmovdqa %%" VR2(r) ", 0x40(%[DST])\n" \
+ "vmovdqa %%" VR3(r) ", 0x60(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ case 2: \
+ __asm( \
+ "vmovdqa %%" VR0(r) ", 0x00(%[DST])\n" \
+ "vmovdqa %%" VR1(r) ", 0x20(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define FLUSH() \
+{ \
+ __asm("vzeroupper"); \
+}
+
+#define MUL2_SETUP() \
+{ \
+ __asm("vmovq %0, %%xmm14" :: "r"(0x1d1d1d1d1d1d1d1d)); \
+ __asm("vpbroadcastq %xmm14, %ymm14"); \
+ __asm("vpxor %ymm15, %ymm15 ,%ymm15"); \
+}
+
+#define _MUL2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 2: \
+ __asm( \
+ "vpcmpgtb %" VR0(r)", %ymm15, %ymm12\n" \
+ "vpcmpgtb %" VR1(r)", %ymm15, %ymm13\n" \
+ "vpaddb %" VR0(r)", %" VR0(r)", %" VR0(r) "\n" \
+ "vpaddb %" VR1(r)", %" VR1(r)", %" VR1(r) "\n" \
+ "vpand %ymm14, %ymm12, %ymm12\n" \
+ "vpand %ymm14, %ymm13, %ymm13\n" \
+ "vpxor %ymm12, %" VR0(r)", %" VR0(r) "\n" \
+ "vpxor %ymm13, %" VR1(r)", %" VR1(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ _MUL2(R_01(r)); \
+ _MUL2(R_23(r)); \
+ break; \
+ case 2: \
+ _MUL2(r); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL4(r...) \
+{ \
+ MUL2(r); \
+ MUL2(r); \
+}
+
+#define _0f "ymm15"
+#define _as "ymm14"
+#define _bs "ymm13"
+#define _ltmod "ymm12"
+#define _ltmul "ymm11"
+#define _ta "ymm10"
+#define _tb "ymm15"
+
+static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F;
+
+#define _MULx2(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 2: \
+ __asm( \
+ "vpbroadcastb (%[mask]), %%" _0f "\n" \
+ /* upper bits */ \
+ "vbroadcasti128 0x00(%[lt]), %%" _ltmod "\n" \
+ "vbroadcasti128 0x10(%[lt]), %%" _ltmul "\n" \
+ \
+ "vpsraw $0x4, %%" VR0(r) ", %%"_as "\n" \
+ "vpsraw $0x4, %%" VR1(r) ", %%"_bs "\n" \
+ "vpand %%" _0f ", %%" VR0(r) ", %%" VR0(r) "\n" \
+ "vpand %%" _0f ", %%" VR1(r) ", %%" VR1(r) "\n" \
+ "vpand %%" _0f ", %%" _as ", %%" _as "\n" \
+ "vpand %%" _0f ", %%" _bs ", %%" _bs "\n" \
+ \
+ "vpshufb %%" _as ", %%" _ltmod ", %%" _ta "\n" \
+ "vpshufb %%" _bs ", %%" _ltmod ", %%" _tb "\n" \
+ "vpshufb %%" _as ", %%" _ltmul ", %%" _as "\n" \
+ "vpshufb %%" _bs ", %%" _ltmul ", %%" _bs "\n" \
+ /* lower bits */ \
+ "vbroadcasti128 0x20(%[lt]), %%" _ltmod "\n" \
+ "vbroadcasti128 0x30(%[lt]), %%" _ltmul "\n" \
+ \
+ "vpxor %%" _ta ", %%" _as ", %%" _as "\n" \
+ "vpxor %%" _tb ", %%" _bs ", %%" _bs "\n" \
+ \
+ "vpshufb %%" VR0(r) ", %%" _ltmod ", %%" _ta "\n" \
+ "vpshufb %%" VR1(r) ", %%" _ltmod ", %%" _tb "\n" \
+ "vpshufb %%" VR0(r) ", %%" _ltmul ", %%" VR0(r) "\n"\
+ "vpshufb %%" VR1(r) ", %%" _ltmul ", %%" VR1(r) "\n"\
+ \
+ "vpxor %%" _ta ", %%" VR0(r) ", %%" VR0(r) "\n" \
+ "vpxor %%" _as ", %%" VR0(r) ", %%" VR0(r) "\n" \
+ "vpxor %%" _tb ", %%" VR1(r) ", %%" VR1(r) "\n" \
+ "vpxor %%" _bs ", %%" VR1(r) ", %%" VR1(r) "\n" \
+ : : [mask] "r" (&_mul_mask), \
+ [lt] "r" (gf_clmul_mod_lt[4*(c)])); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ _MULx2(c, R_01(r)); \
+ _MULx2(c, R_23(r)); \
+ break; \
+ case 2: \
+ _MULx2(c, R_01(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define raidz_math_begin() kfpu_begin()
+#define raidz_math_end() \
+{ \
+ FLUSH(); \
+ kfpu_end(); \
+}
+
+
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 4
+#define ZERO_DEFINE() {}
+#define ZERO_D 0, 1, 2, 3
+
+#define COPY_STRIDE 4
+#define COPY_DEFINE() {}
+#define COPY_D 0, 1, 2, 3
+
+#define ADD_STRIDE 4
+#define ADD_DEFINE() {}
+#define ADD_D 0, 1, 2, 3
+
+#define MUL_STRIDE 4
+#define MUL_DEFINE() {}
+#define MUL_D 0, 1, 2, 3
+
+#define GEN_P_STRIDE 4
+#define GEN_P_DEFINE() {}
+#define GEN_P_P 0, 1, 2, 3
+
+#define GEN_PQ_STRIDE 4
+#define GEN_PQ_DEFINE() {}
+#define GEN_PQ_D 0, 1, 2, 3
+#define GEN_PQ_C 4, 5, 6, 7
+
+#define GEN_PQR_STRIDE 4
+#define GEN_PQR_DEFINE() {}
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
+
+#define SYN_Q_DEFINE() {}
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
+
+#define SYN_R_DEFINE() {}
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
+
+#define SYN_PQ_DEFINE() {}
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
+
+#define REC_PQ_STRIDE 2
+#define REC_PQ_DEFINE() {}
+#define REC_PQ_X 0, 1
+#define REC_PQ_Y 2, 3
+#define REC_PQ_T 4, 5
+
+#define SYN_PR_DEFINE() {}
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_STRIDE 2
+#define REC_PR_DEFINE() {}
+#define REC_PR_X 0, 1
+#define REC_PR_Y 2, 3
+#define REC_PR_T 4, 5
+
+#define SYN_QR_DEFINE() {}
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_STRIDE 2
+#define REC_QR_DEFINE() {}
+#define REC_QR_X 0, 1
+#define REC_QR_Y 2, 3
+#define REC_QR_T 4, 5
+
+#define SYN_PQR_DEFINE() {}
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
+#define REC_PQR_STRIDE 2
+#define REC_PQR_DEFINE() {}
+#define REC_PQR_X 0, 1
+#define REC_PQR_Y 2, 3
+#define REC_PQR_Z 4, 5
+#define REC_PQR_XS 6, 7
+#define REC_PQR_YS 8, 9
+
+
+#include <sys/vdev_raidz_impl.h>
+#include "vdev_raidz_math_impl.h"
+
+DEFINE_GEN_METHODS(avx2);
+DEFINE_REC_METHODS(avx2);
+
+static boolean_t
+raidz_will_avx2_work(void)
+{
+ return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
+}
+
+const raidz_impl_ops_t vdev_raidz_avx2_impl = {
+ .init = NULL,
+ .fini = NULL,
+ .gen = RAIDZ_GEN_METHODS(avx2),
+ .rec = RAIDZ_REC_METHODS(avx2),
+ .is_supported = &raidz_will_avx2_work,
+ .name = "avx2"
+};
+
+#elif defined(__i386)
+
+/* 32-bit stub for user-level fakekernel dependencies */
+#include <sys/vdev_raidz_impl.h>
+const raidz_impl_ops_t vdev_raidz_avx2_impl = {
+ .init = NULL,
+ .fini = NULL,
+ .gen = NULL,
+ .rec = NULL,
+ .is_supported = NULL,
+ .name = "avx2"
+};
+
+#endif /* defined(__amd64) */
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz_math_impl.h b/usr/src/uts/common/fs/zfs/vdev_raidz_math_impl.h
new file mode 100644
index 0000000000..89c2082c4a
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz_math_impl.h
@@ -0,0 +1,1477 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#ifndef _VDEV_RAIDZ_MATH_IMPL_H
+#define _VDEV_RAIDZ_MATH_IMPL_H
+
+#include <sys/types.h>
+
+#define raidz_inline inline __attribute__((always_inline))
+#ifndef noinline
+#define noinline __attribute__((noinline))
+#endif
+
+/*
+ * These functions calculate the multiplication constants needed for data
+ * reconstruction. The coefficients depend on the RAIDZ geometry, the indexes
+ * of the failed child vdevs, and the parity columns used for reconstruction.
+ * @rm			RAIDZ map
+ * @tgtidx		array of missing data indexes
+ * @coeff		output array of coefficients. The array must be
+ *			provided by the caller and hold at least MUL_CNT values.
+ */
+static noinline void
+raidz_rec_q_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
+{
+ const unsigned ncols = raidz_ncols(rm);
+ const unsigned x = tgtidx[TARGET_X];
+
+ coeff[MUL_Q_X] = gf_exp2(255 - (ncols - x - 1));
+}
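
gf_exp2() is 2 raised to a power in GF(2^8), where 2 has multiplicative order
255, so the exponent 255 - (ncols - x - 1) yields the inverse of the missing
column's generator power. A sketch deriving such a coefficient from nothing
but the multiply-by-2 primitive (same 0x11d polynomial as the kernel tables):

    #include <stdio.h>
    #include <stdint.h>

    static uint8_t gf_mul2(uint8_t a)
    {
        return ((uint8_t)(a << 1) ^ ((a & 0x80) ? 0x1d : 0x00));
    }

    /* 2^exp in GF(2^8); exponents wrap at 255. */
    static uint8_t gf_exp2(unsigned exp)
    {
        uint8_t v = 1;

        for (unsigned i = 0; i < exp % 255; i++)
            v = gf_mul2(v);
        return (v);
    }

    int main(void)
    {
        unsigned ncols = 5, x = 2;  /* five columns, column 2 missing */
        uint8_t coeff = gf_exp2(255 - (ncols - x - 1));

        /* coeff * 2^(ncols - x - 1) must be 1 in GF(2^8) */
        uint8_t check = coeff;
        for (unsigned i = 0; i < ncols - x - 1; i++)
            check = gf_mul2(check);
        printf("coeff=0x%02x check=%u\n", coeff, check);    /* check=1 */
        return (0);
    }
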
+
+static noinline void
+raidz_rec_r_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
+{
+ const unsigned ncols = raidz_ncols(rm);
+ const unsigned x = tgtidx[TARGET_X];
+
+ coeff[MUL_R_X] = gf_exp4(255 - (ncols - x - 1));
+}
+
+static noinline void
+raidz_rec_pq_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
+{
+ const unsigned ncols = raidz_ncols(rm);
+ const unsigned x = tgtidx[TARGET_X];
+ const unsigned y = tgtidx[TARGET_Y];
+ gf_t a, b, e;
+
+ a = gf_exp2(x + 255 - y);
+ b = gf_exp2(255 - (ncols - x - 1));
+ e = a ^ 0x01;
+
+ coeff[MUL_PQ_X] = gf_div(a, e);
+ coeff[MUL_PQ_Y] = gf_div(b, e);
+}
+
+static noinline void
+raidz_rec_pr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
+{
+ const unsigned ncols = raidz_ncols(rm);
+ const unsigned x = tgtidx[TARGET_X];
+ const unsigned y = tgtidx[TARGET_Y];
+
+ gf_t a, b, e;
+
+ a = gf_exp4(x + 255 - y);
+ b = gf_exp4(255 - (ncols - x - 1));
+ e = a ^ 0x01;
+
+ coeff[MUL_PR_X] = gf_div(a, e);
+ coeff[MUL_PR_Y] = gf_div(b, e);
+}
+
+static noinline void
+raidz_rec_qr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
+{
+ const unsigned ncols = raidz_ncols(rm);
+ const unsigned x = tgtidx[TARGET_X];
+ const unsigned y = tgtidx[TARGET_Y];
+
+ gf_t nx, ny, nxxy, nxyy, d;
+
+ nx = gf_exp2(ncols - x - 1);
+ ny = gf_exp2(ncols - y - 1);
+ nxxy = gf_mul(gf_mul(nx, nx), ny);
+ nxyy = gf_mul(gf_mul(nx, ny), ny);
+ d = nxxy ^ nxyy;
+
+ coeff[MUL_QR_XQ] = ny;
+ coeff[MUL_QR_X] = gf_div(ny, d);
+ coeff[MUL_QR_YQ] = nx;
+ coeff[MUL_QR_Y] = gf_div(nx, d);
+}
+
+static noinline void
+raidz_rec_pqr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff)
+{
+ const unsigned ncols = raidz_ncols(rm);
+ const unsigned x = tgtidx[TARGET_X];
+ const unsigned y = tgtidx[TARGET_Y];
+ const unsigned z = tgtidx[TARGET_Z];
+
+ gf_t nx, ny, nz, nxx, nyy, nzz, nyyz, nyzz, xd, yd;
+
+ nx = gf_exp2(ncols - x - 1);
+ ny = gf_exp2(ncols - y - 1);
+ nz = gf_exp2(ncols - z - 1);
+
+ nxx = gf_exp4(ncols - x - 1);
+ nyy = gf_exp4(ncols - y - 1);
+ nzz = gf_exp4(ncols - z - 1);
+
+ nyyz = gf_mul(gf_mul(ny, nz), ny);
+ nyzz = gf_mul(nzz, ny);
+
+ xd = gf_mul(nxx, ny) ^ gf_mul(nx, nyy) ^ nyyz ^
+ gf_mul(nxx, nz) ^ gf_mul(nzz, nx) ^ nyzz;
+
+ yd = gf_inv(ny ^ nz);
+
+ coeff[MUL_PQR_XP] = gf_div(nyyz ^ nyzz, xd);
+ coeff[MUL_PQR_XQ] = gf_div(nyy ^ nzz, xd);
+ coeff[MUL_PQR_XR] = gf_div(ny ^ nz, xd);
+ coeff[MUL_PQR_YU] = nx;
+ coeff[MUL_PQR_YP] = gf_mul(nz, yd);
+ coeff[MUL_PQR_YQ] = yd;
+}
+
+/*
+ * Method for zeroing a buffer (can be implemented using SIMD).
+ * This method is used by multiple gen/rec functions.
+ *
+ * @dc Destination buffer
+ * @dsize Destination buffer size
+ * @private Unused
+ */
+static int
+raidz_zero_abd_cb(void *dc, size_t dsize, void *private)
+{
+ v_t *dst = (v_t *)dc;
+ size_t i;
+
+ ZERO_DEFINE();
+
+ (void) private; /* unused */
+
+ ZERO(ZERO_D);
+
+ for (i = 0; i < dsize / sizeof (v_t); i += (2 * ZERO_STRIDE)) {
+ STORE(dst + i, ZERO_D);
+ STORE(dst + i + ZERO_STRIDE, ZERO_D);
+ }
+
+ return (0);
+}
+
+#define raidz_zero(dabd, size) \
+{ \
+ abd_iterate_func(dabd, 0, size, raidz_zero_abd_cb, NULL); \
+}
+
+/*
+ * Method for copying a buffer (can be implemented using SIMD).
+ * This method is used by multiple gen/rec functions.
+ *
+ * @dc Destination buffer
+ * @sc Source buffer
+ * @size Buffer size
+ * @private Unused
+ */
+static int
+raidz_copy_abd_cb(void *dc, void *sc, size_t size, void *private)
+{
+ v_t *dst = (v_t *)dc;
+ const v_t *src = (v_t *)sc;
+ size_t i;
+
+ COPY_DEFINE();
+
+ (void) private; /* unused */
+
+ for (i = 0; i < size / sizeof (v_t); i += (2 * COPY_STRIDE)) {
+ LOAD(src + i, COPY_D);
+ STORE(dst + i, COPY_D);
+
+ LOAD(src + i + COPY_STRIDE, COPY_D);
+ STORE(dst + i + COPY_STRIDE, COPY_D);
+ }
+
+ return (0);
+}
+
+#define raidz_copy(dabd, sabd, size) \
+{ \
+ abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_copy_abd_cb, NULL);\
+}
+
+/*
+ * Method for adding (XORing) two buffers.
+ * The source and destination are XORed together and the result is stored in
+ * the destination buffer. This method is used by multiple gen/rec functions.
+ *
+ * @dc Destination buffer
+ * @sc Source buffer
+ * @size Buffer size
+ * @private Unused
+ */
+static int
+raidz_add_abd_cb(void *dc, void *sc, size_t size, void *private)
+{
+ v_t *dst = (v_t *)dc;
+ const v_t *src = (v_t *)sc;
+ size_t i;
+
+ ADD_DEFINE();
+
+ (void) private; /* unused */
+
+ for (i = 0; i < size / sizeof (v_t); i += (2 * ADD_STRIDE)) {
+ LOAD(dst + i, ADD_D);
+ XOR_ACC(src + i, ADD_D);
+ STORE(dst + i, ADD_D);
+
+ LOAD(dst + i + ADD_STRIDE, ADD_D);
+ XOR_ACC(src + i + ADD_STRIDE, ADD_D);
+ STORE(dst + i + ADD_STRIDE, ADD_D);
+ }
+
+ return (0);
+}
+
+#define raidz_add(dabd, sabd, size) \
+{ \
+ abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_add_abd_cb, NULL);\
+}
+
+/*
+ * Method for multiplying a buffer by a constant in GF(2^8).
+ * Symbols from the buffer are multiplied by the constant and the result is
+ * stored back in the same buffer.
+ *
+ * @dc In/Out data buffer.
+ * @size Size of the buffer
+ * @private pointer to the multiplication constant (unsigned)
+ */
+static int
+raidz_mul_abd_cb(void *dc, size_t size, void *private)
+{
+ const unsigned mul = *((unsigned *)private);
+ v_t *d = (v_t *)dc;
+ size_t i;
+
+ MUL_DEFINE();
+
+ for (i = 0; i < size / sizeof (v_t); i += (2 * MUL_STRIDE)) {
+ LOAD(d + i, MUL_D);
+ MUL(mul, MUL_D);
+ STORE(d + i, MUL_D);
+
+ LOAD(d + i + MUL_STRIDE, MUL_D);
+ MUL(mul, MUL_D);
+ STORE(d + i + MUL_STRIDE, MUL_D);
+ }
+
+ return (0);
+}
+
+
+/*
+ * Syndrome generation/update macros
+ *
+ * Require LOAD(), XOR(), STORE(), MUL2(), and MUL4() macros
+ */
+#define P_D_SYNDROME(D, T, t) \
+{ \
+ LOAD((t), T); \
+ XOR(D, T); \
+ STORE((t), T); \
+}
+
+#define Q_D_SYNDROME(D, T, t) \
+{ \
+ LOAD((t), T); \
+ MUL2(T); \
+ XOR(D, T); \
+ STORE((t), T); \
+}
+
+#define Q_SYNDROME(T, t) \
+{ \
+ LOAD((t), T); \
+ MUL2(T); \
+ STORE((t), T); \
+}
+
+#define R_D_SYNDROME(D, T, t) \
+{ \
+ LOAD((t), T); \
+ MUL4(T); \
+ XOR(D, T); \
+ STORE((t), T); \
+}
+
+#define R_SYNDROME(T, t) \
+{ \
+ LOAD((t), T); \
+ MUL4(T); \
+ STORE((t), T); \
+}
+
+
+/*
+ * PARITY CALCULATION
+ *
+ * The *_SYNDROME macros are used for parity/syndrome calculation.
+ * The *_D_SYNDROME() macros calculate the syndrome between 0 and the
+ * length of the data column, while the *_SYNDROME() macros only update
+ * the parity/syndrome where the data column is shorter.
+ *
+ * P parity is calculated using raidz_add_abd().
+ */
+
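+/*
+ * For reference, a minimal per-byte sketch of the scheme implemented below
+ * (assuming gf_mul() from sys/vdev_raidz_impl.h; the real code operates on
+ * whole v_t vectors through the macros above):
+ *
+ *	p = q = d[0];
+ *	for (c = 1; c < ndata; c++) {
+ *		p ^= d[c];
+ *		q = gf_mul(q, 2) ^ d[c];
+ *	}
+ *
+ * The Q update is Horner's scheme, so Q = Sum_c Dc * 2^(ndata - 1 - c).
+ */
+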
+/*
+ * Generate P parity (RAIDZ1)
+ *
+ * @rm RAIDZ map
+ */
+static raidz_inline void
+raidz_generate_p_impl(raidz_map_t * const rm)
+{
+ size_t c;
+ const size_t ncols = raidz_ncols(rm);
+ const size_t psize = rm->rm_col[CODE_P].rc_size;
+ abd_t *pabd = rm->rm_col[CODE_P].rc_abd;
+ size_t size;
+ abd_t *dabd;
+
+ raidz_math_begin();
+
+ /* start with first data column */
+ raidz_copy(pabd, rm->rm_col[1].rc_abd, psize);
+
+ for (c = 2; c < ncols; c++) {
+ dabd = rm->rm_col[c].rc_abd;
+ size = rm->rm_col[c].rc_size;
+
+ /* add data column */
+ raidz_add(pabd, dabd, size);
+ }
+
+ raidz_math_end();
+}
+
+
+/*
+ * Generate PQ parity (RAIDZ2)
+ * The function is called per data column.
+ *
+ * @c array of pointers to parity (code) columns
+ * @dc pointer to data column
+ * @csize size of parity columns
+ * @dsize size of data column
+ */
+static void
+raidz_gen_pq_add(void **c, const void *dc, const size_t csize,
+ const size_t dsize)
+{
+ v_t *p = (v_t *)c[0];
+ v_t *q = (v_t *)c[1];
+ const v_t *d = (const v_t *)dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+ const v_t * const qend = q + (csize / sizeof (v_t));
+
+ GEN_PQ_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += GEN_PQ_STRIDE, p += GEN_PQ_STRIDE,
+ q += GEN_PQ_STRIDE) {
+ LOAD(d, GEN_PQ_D);
+ P_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, p);
+ Q_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, q);
+ }
+ for (; q < qend; q += GEN_PQ_STRIDE) {
+ Q_SYNDROME(GEN_PQ_C, q);
+ }
+}
+
+
+/*
+ * Generate PQ parity (RAIDZ2)
+ *
+ * @rm RAIDZ map
+ */
+static raidz_inline void
+raidz_generate_pq_impl(raidz_map_t * const rm)
+{
+ size_t c;
+ const size_t ncols = raidz_ncols(rm);
+ const size_t csize = rm->rm_col[CODE_P].rc_size;
+ size_t dsize;
+ abd_t *dabd;
+ abd_t *cabds[] = {
+ rm->rm_col[CODE_P].rc_abd,
+ rm->rm_col[CODE_Q].rc_abd
+ };
+
+ raidz_math_begin();
+
+ raidz_copy(cabds[CODE_P], rm->rm_col[2].rc_abd, csize);
+ raidz_copy(cabds[CODE_Q], rm->rm_col[2].rc_abd, csize);
+
+ for (c = 3; c < ncols; c++) {
+ dabd = rm->rm_col[c].rc_abd;
+ dsize = rm->rm_col[c].rc_size;
+
+ abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 2,
+ raidz_gen_pq_add);
+ }
+
+ raidz_math_end();
+}
+
+
+/*
+ * Generate PQR parity (RAIDZ3)
+ * The function is called per data column.
+ *
+ * @c array of pointers to parity (code) columns
+ * @dc pointer to data column
+ * @csize size of parity columns
+ * @dsize size of data column
+ */
+static void
+raidz_gen_pqr_add(void **c, const void *dc, const size_t csize,
+ const size_t dsize)
+{
+ v_t *p = (v_t *)c[0];
+ v_t *q = (v_t *)c[1];
+ v_t *r = (v_t *)c[CODE_R];
+ const v_t *d = (const v_t *)dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+ const v_t * const qend = q + (csize / sizeof (v_t));
+
+ GEN_PQR_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += GEN_PQR_STRIDE, p += GEN_PQR_STRIDE,
+ q += GEN_PQR_STRIDE, r += GEN_PQR_STRIDE) {
+ LOAD(d, GEN_PQR_D);
+ P_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, p);
+ Q_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, q);
+ R_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, r);
+ }
+ for (; q < qend; q += GEN_PQR_STRIDE, r += GEN_PQR_STRIDE) {
+ Q_SYNDROME(GEN_PQR_C, q);
+ R_SYNDROME(GEN_PQR_C, r);
+ }
+}
+
+
+/*
+ * Generate PQR parity (RAIDZ3)
+ *
+ * @rm RAIDZ map
+ */
+static raidz_inline void
+raidz_generate_pqr_impl(raidz_map_t * const rm)
+{
+ size_t c;
+ const size_t ncols = raidz_ncols(rm);
+ const size_t csize = rm->rm_col[CODE_P].rc_size;
+ size_t dsize;
+ abd_t *dabd;
+ abd_t *cabds[] = {
+ rm->rm_col[CODE_P].rc_abd,
+ rm->rm_col[CODE_Q].rc_abd,
+ rm->rm_col[CODE_R].rc_abd
+ };
+
+ raidz_math_begin();
+
+ raidz_copy(cabds[CODE_P], rm->rm_col[3].rc_abd, csize);
+ raidz_copy(cabds[CODE_Q], rm->rm_col[3].rc_abd, csize);
+ raidz_copy(cabds[CODE_R], rm->rm_col[3].rc_abd, csize);
+
+ for (c = 4; c < ncols; c++) {
+ dabd = rm->rm_col[c].rc_abd;
+ dsize = rm->rm_col[c].rc_size;
+
+ abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 3,
+ raidz_gen_pqr_add);
+ }
+
+ raidz_math_end();
+}
+
+
+/*
+ * DATA RECONSTRUCTION
+ *
+ * Data reconstruction process consists of two phases:
+ * - Syndrome calculation
+ * - Data reconstruction
+ *
+ * The syndrome is calculated by generating parity using the available data
+ * columns, with zeros in place of the erasures. The existing parity is then
+ * added to the corresponding syndrome value to obtain the [P|Q|R]syn values
+ * in the equations:
+ * P = Psyn + Dx + Dy + Dz
+ * Q = Qsyn + 2^x * Dx + 2^y * Dy + 2^z * Dz
+ * R = Rsyn + 4^x * Dx + 4^y * Dy + 4^z * Dz
+ *
+ * For the data reconstruction phase, the corresponding equations are solved
+ * for the missing data (Dx, Dy, Dz). This generally involves multiplying known
+ * symbols by a coefficient and adding them together. The multiplication
+ * constant coefficients are calculated ahead of the operation in the
+ * raidz_rec_[q|r|pq|pr|qr|pqr]_coeff() functions.
+ *
+ * IMPLEMENTATION NOTE: a RAID-Z block can have complex geometry, with "big"
+ * and "short" columns.
+ * For this reason, reconstruction is performed in a minimum of two steps:
+ * first from offset 0 to short_size, then from short_size to big_size.
+ * Calculation functions REC_[*]_BLOCK() are implemented to work
+ * over both ranges. The split also enables removal of conditional expressions
+ * from loop bodies, improving throughput of SIMD implementations.
+ * For the best performance, all functions marked with raidz_inline attribute
+ * must be inlined by compiler.
+ *
+ * parity data
+ * columns columns
+ * <----------> <------------------>
+ * x y <----+ missing columns (x, y)
+ * | |
+ * +---+---+---+---+-v-+---+-v-+---+ ^ 0
+ * | | | | | | | | | |
+ * | | | | | | | | | |
+ * | P | Q | R | D | D | D | D | D | |
+ * | | | | 0 | 1 | 2 | 3 | 4 | |
+ * | | | | | | | | | v
+ * | | | | | +---+---+---+ ^ short_size
+ * | | | | | | |
+ * +---+---+---+---+---+ v big_size
+ * <------------------> <---------->
+ * big columns short columns
+ *
+ */
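+
+/*
+ * Worked example (single erasure, Q parity): with data column x missing,
+ * Qsyn is generated with a zero in place of Dx, so
+ *
+ *	Q = Qsyn + 2^(ncols - x - 1) * Dx
+ *	Dx = (Q ^ Qsyn) * 2^(255 - (ncols - x - 1))
+ *
+ * since 2^255 == 1 in GF(2^8), exponents invert mod 255. This is exactly the
+ * constant produced by raidz_rec_q_coeff() above and applied to the syndrome
+ * by raidz_mul_abd_cb() in raidz_reconstruct_q_impl() below.
+ */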
+
+
+/*
+ * Reconstruct single data column using P parity
+ *
+ * @syn_method raidz_add_abd()
+ * @rec_method not applicable
+ *
+ * @rm RAIDZ map
+ * @tgtidx array of missing data indexes
+ */
+static raidz_inline int
+raidz_reconstruct_p_impl(raidz_map_t *rm, const int *tgtidx)
+{
+ size_t c;
+ const size_t firstdc = raidz_parity(rm);
+ const size_t ncols = raidz_ncols(rm);
+ const size_t x = tgtidx[TARGET_X];
+ const size_t xsize = rm->rm_col[x].rc_size;
+ abd_t *xabd = rm->rm_col[x].rc_abd;
+ size_t size;
+ abd_t *dabd;
+
+ raidz_math_begin();
+
+ /* copy P into target */
+ raidz_copy(xabd, rm->rm_col[CODE_P].rc_abd, xsize);
+
+ /* generate p_syndrome */
+ for (c = firstdc; c < ncols; c++) {
+ if (c == x)
+ continue;
+
+ dabd = rm->rm_col[c].rc_abd;
+ size = MIN(rm->rm_col[c].rc_size, xsize);
+
+ raidz_add(xabd, dabd, size);
+ }
+
+ raidz_math_end();
+
+ return (1 << CODE_P);
+}
+
+
+/*
+ * Generate Q syndrome (Qsyn)
+ *
+ * @xc array of pointers to syndrome columns
+ * @dc data column (NULL if missing)
+ * @xsize size of syndrome columns
+ * @dsize size of data column (0 if missing)
+ */
+static void
+raidz_syn_q_abd(void **xc, const void *dc, const size_t xsize,
+ const size_t dsize)
+{
+ v_t *x = (v_t *)xc[TARGET_X];
+ const v_t *d = (const v_t *)dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+ const v_t * const xend = x + (xsize / sizeof (v_t));
+
+ SYN_Q_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE) {
+ LOAD(d, SYN_Q_D);
+ Q_D_SYNDROME(SYN_Q_D, SYN_Q_X, x);
+ }
+ for (; x < xend; x += SYN_STRIDE) {
+ Q_SYNDROME(SYN_Q_X, x);
+ }
+}
+
+
+/*
+ * Reconstruct single data column using Q parity
+ *
+ * @syn_method raidz_add_abd()
+ * @rec_method raidz_mul_abd_cb()
+ *
+ * @rm RAIDZ map
+ * @tgtidx array of missing data indexes
+ */
+static raidz_inline int
+raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx)
+{
+ size_t c;
+ size_t dsize;
+ abd_t *dabd;
+ const size_t firstdc = raidz_parity(rm);
+ const size_t ncols = raidz_ncols(rm);
+ const size_t x = tgtidx[TARGET_X];
+ abd_t *xabd = rm->rm_col[x].rc_abd;
+ const size_t xsize = rm->rm_col[x].rc_size;
+ abd_t *tabds[] = { xabd };
+
+ unsigned coeff[MUL_CNT];
+ raidz_rec_q_coeff(rm, tgtidx, coeff);
+
+ raidz_math_begin();
+
+ /* Start with first data column if present */
+ if (firstdc != x) {
+ raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
+ } else {
+ raidz_zero(xabd, xsize);
+ }
+
+ /* generate q_syndrome */
+ for (c = firstdc+1; c < ncols; c++) {
+ if (c == x) {
+ dabd = NULL;
+ dsize = 0;
+ } else {
+ dabd = rm->rm_col[c].rc_abd;
+ dsize = rm->rm_col[c].rc_size;
+ }
+
+ abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1,
+ raidz_syn_q_abd);
+ }
+
+ /* add Q to the syndrome */
+ raidz_add(xabd, rm->rm_col[CODE_Q].rc_abd, xsize);
+
+ /* transform the syndrome */
+ abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff);
+
+ raidz_math_end();
+
+ return (1 << CODE_Q);
+}
+
+
+/*
+ * Generate R syndrome (Rsyn)
+ *
+ * @xc array of pointers to syndrome columns
+ * @dc data column (NULL if missing)
+ * @tsize size of syndrome columns
+ * @dsize size of data column (0 if missing)
+ */
+static void
+raidz_syn_r_abd(void **xc, const void *dc, const size_t tsize,
+ const size_t dsize)
+{
+ v_t *x = (v_t *)xc[TARGET_X];
+ const v_t *d = (const v_t *)dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+ const v_t * const xend = x + (tsize / sizeof (v_t));
+
+ SYN_R_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE) {
+ LOAD(d, SYN_R_D);
+ R_D_SYNDROME(SYN_R_D, SYN_R_X, x);
+ }
+ for (; x < xend; x += SYN_STRIDE) {
+ R_SYNDROME(SYN_R_X, x);
+ }
+}
+
+
+/*
+ * Reconstruct single data column using R parity
+ *
+ * @syn_method raidz_add_abd()
+ * @rec_method raidz_mul_abd_cb()
+ *
+ * @rm RAIDZ map
+ * @tgtidx array of missing data indexes
+ */
+static raidz_inline int
+raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx)
+{
+ size_t c;
+ size_t dsize;
+ abd_t *dabd;
+ const size_t firstdc = raidz_parity(rm);
+ const size_t ncols = raidz_ncols(rm);
+ const size_t x = tgtidx[TARGET_X];
+ const size_t xsize = rm->rm_col[x].rc_size;
+ abd_t *xabd = rm->rm_col[x].rc_abd;
+ abd_t *tabds[] = { xabd };
+
+ unsigned coeff[MUL_CNT];
+ raidz_rec_r_coeff(rm, tgtidx, coeff);
+
+ raidz_math_begin();
+
+ /* Start with first data column if present */
+ if (firstdc != x) {
+ raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
+ } else {
+ raidz_zero(xabd, xsize);
+ }
+
+ /* generate r_syndrome */
+ for (c = firstdc+1; c < ncols; c++) {
+ if (c == x) {
+ dabd = NULL;
+ dsize = 0;
+ } else {
+ dabd = rm->rm_col[c].rc_abd;
+ dsize = rm->rm_col[c].rc_size;
+ }
+
+ abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1,
+ raidz_syn_r_abd);
+ }
+
+ /* add R to the syndrome */
+ raidz_add(xabd, rm->rm_col[CODE_R].rc_abd, xsize);
+
+ /* transform the syndrome */
+ abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff);
+
+ raidz_math_end();
+
+ return (1 << CODE_R);
+}
+
+
+/*
+ * Generate P and Q syndromes
+ *
+ * @tc array of pointers to syndrome columns
+ * @dc data column (NULL if missing)
+ * @tsize size of syndrome columns
+ * @dsize size of data column (0 if missing)
+ */
+static void
+raidz_syn_pq_abd(void **tc, const void *dc, const size_t tsize,
+ const size_t dsize)
+{
+ v_t *x = (v_t *)tc[TARGET_X];
+ v_t *y = (v_t *)tc[TARGET_Y];
+ const v_t *d = (const v_t *)dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+ const v_t * const yend = y + (tsize / sizeof (v_t));
+
+ SYN_PQ_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) {
+ LOAD(d, SYN_PQ_D);
+ P_D_SYNDROME(SYN_PQ_D, SYN_PQ_X, x);
+ Q_D_SYNDROME(SYN_PQ_D, SYN_PQ_X, y);
+ }
+ for (; y < yend; y += SYN_STRIDE) {
+ Q_SYNDROME(SYN_PQ_X, y);
+ }
+}
+
+/*
+ * Reconstruct data using PQ parity and PQ syndromes
+ *
+ * @tc syndrome/result columns
+ * @tsize size of syndrome/result columns
+ * @c parity columns
+ * @mul array of multiplication constants
+ */
+static void
+raidz_rec_pq_abd(void **tc, const size_t tsize, void **c,
+ const unsigned *mul)
+{
+ v_t *x = (v_t *)tc[TARGET_X];
+ v_t *y = (v_t *)tc[TARGET_Y];
+ const v_t * const xend = x + (tsize / sizeof (v_t));
+ const v_t *p = (v_t *)c[CODE_P];
+ const v_t *q = (v_t *)c[CODE_Q];
+
+ REC_PQ_DEFINE();
+
+ for (; x < xend; x += REC_PQ_STRIDE, y += REC_PQ_STRIDE,
+ p += REC_PQ_STRIDE, q += REC_PQ_STRIDE) {
+ LOAD(x, REC_PQ_X);
+ LOAD(y, REC_PQ_Y);
+
+ XOR_ACC(p, REC_PQ_X);
+ XOR_ACC(q, REC_PQ_Y);
+
+ /* Save Pxy */
+ COPY(REC_PQ_X, REC_PQ_T);
+
+ /* Calc X */
+ MUL(mul[MUL_PQ_X], REC_PQ_X);
+ MUL(mul[MUL_PQ_Y], REC_PQ_Y);
+ XOR(REC_PQ_Y, REC_PQ_X);
+ STORE(x, REC_PQ_X);
+
+ /* Calc Y */
+ XOR(REC_PQ_T, REC_PQ_X);
+ STORE(y, REC_PQ_X);
+ }
+}
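+
+/*
+ * The solve above follows from the syndrome equations: writing
+ * Pxy = P ^ Psyn and Qxy = Q ^ Qsyn leaves
+ *
+ *	Pxy = Dx + Dy
+ *	Qxy = 2^(ncols - x - 1) * Dx + 2^(ncols - y - 1) * Dy
+ *
+ * and eliminating Dy gives Dx = xm * Pxy ^ ym * Qxy, with the constants
+ * xm = mul[MUL_PQ_X] and ym = mul[MUL_PQ_Y] prepared by raidz_rec_pq_coeff().
+ * Dy then falls out of the P equation as Dy = Pxy ^ Dx (the final
+ * XOR/STORE pair).
+ */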
+
+
+/*
+ * Reconstruct two data columns using PQ parity
+ *
+ * @syn_method raidz_syn_pq_abd()
+ * @rec_method raidz_rec_pq_abd()
+ *
+ * @rm RAIDZ map
+ * @tgtidx array of missing data indexes
+ */
+static raidz_inline int
+raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx)
+{
+ size_t c;
+ size_t dsize;
+ abd_t *dabd;
+ const size_t firstdc = raidz_parity(rm);
+ const size_t ncols = raidz_ncols(rm);
+ const size_t x = tgtidx[TARGET_X];
+ const size_t y = tgtidx[TARGET_Y];
+ const size_t xsize = rm->rm_col[x].rc_size;
+ const size_t ysize = rm->rm_col[y].rc_size;
+ abd_t *xabd = rm->rm_col[x].rc_abd;
+ abd_t *yabd = rm->rm_col[y].rc_abd;
+ abd_t *tabds[2] = { xabd, yabd };
+ abd_t *cabds[] = {
+ rm->rm_col[CODE_P].rc_abd,
+ rm->rm_col[CODE_Q].rc_abd
+ };
+
+ unsigned coeff[MUL_CNT];
+ raidz_rec_pq_coeff(rm, tgtidx, coeff);
+
+ /*
+ * Check if one of the targets is shorter than the other.
+ * In this case, the shorter target needs to be replaced with a
+ * new buffer so that the syndrome can be calculated on the full length.
+ */
+ if (ysize < xsize) {
+ yabd = abd_alloc(xsize, B_FALSE);
+ tabds[1] = yabd;
+ }
+
+ raidz_math_begin();
+
+ /* Start with first data column if present */
+ if (firstdc != x) {
+ raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
+ raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
+ } else {
+ raidz_zero(xabd, xsize);
+ raidz_zero(yabd, xsize);
+ }
+
+ /* generate pq_syndrome */
+ for (c = firstdc+1; c < ncols; c++) {
+ if (c == x || c == y) {
+ dabd = NULL;
+ dsize = 0;
+ } else {
+ dabd = rm->rm_col[c].rc_abd;
+ dsize = rm->rm_col[c].rc_size;
+ }
+
+ abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
+ raidz_syn_pq_abd);
+ }
+
+ abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_pq_abd, coeff);
+
+ /* Copy shorter targets back to the original abd buffer */
+ if (ysize < xsize)
+ raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
+
+ raidz_math_end();
+
+ if (ysize < xsize)
+ abd_free(yabd);
+
+ return ((1 << CODE_P) | (1 << CODE_Q));
+}
+
+
+/*
+ * Generate P and R syndromes
+ *
+ * @c array of pointers to syndrome columns
+ * @dc data column (NULL if missing)
+ * @tsize size of syndrome columns
+ * @dsize size of data column (0 if missing)
+ */
+static void
+raidz_syn_pr_abd(void **c, const void *dc, const size_t tsize,
+ const size_t dsize)
+{
+ v_t *x = (v_t *)c[TARGET_X];
+ v_t *y = (v_t *)c[TARGET_Y];
+ const v_t *d = (const v_t *)dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+ const v_t * const yend = y + (tsize / sizeof (v_t));
+
+ SYN_PR_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) {
+ LOAD(d, SYN_PR_D);
+ P_D_SYNDROME(SYN_PR_D, SYN_PR_X, x);
+ R_D_SYNDROME(SYN_PR_D, SYN_PR_X, y);
+ }
+ for (; y < yend; y += SYN_STRIDE) {
+ R_SYNDROME(SYN_PR_X, y);
+ }
+}
+
+/*
+ * Reconstruct data using PR parity and PR syndromes
+ *
+ * @t syndrome/result columns
+ * @tsize size of syndrome/result columns
+ * @c parity columns
+ * @mul array of multiplication constants
+ */
+static void
+raidz_rec_pr_abd(void **t, const size_t tsize, void **c,
+ const unsigned *mul)
+{
+ v_t *x = (v_t *)t[TARGET_X];
+ v_t *y = (v_t *)t[TARGET_Y];
+ const v_t * const xend = x + (tsize / sizeof (v_t));
+ const v_t *p = (v_t *)c[CODE_P];
+ const v_t *q = (v_t *)c[CODE_Q];
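+
+ /*
+ * Note: cabds[] passed in holds { P, R }, so the "q" pointer here
+ * (index CODE_Q == 1) actually walks the R parity column.
+ */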
+
+ REC_PR_DEFINE();
+
+ for (; x < xend; x += REC_PR_STRIDE, y += REC_PR_STRIDE,
+ p += REC_PR_STRIDE, q += REC_PR_STRIDE) {
+ LOAD(x, REC_PR_X);
+ LOAD(y, REC_PR_Y);
+ XOR_ACC(p, REC_PR_X);
+ XOR_ACC(q, REC_PR_Y);
+
+ /* Save Pxy */
+ COPY(REC_PR_X, REC_PR_T);
+
+ /* Calc X */
+ MUL(mul[MUL_PR_X], REC_PR_X);
+ MUL(mul[MUL_PR_Y], REC_PR_Y);
+ XOR(REC_PR_Y, REC_PR_X);
+ STORE(x, REC_PR_X);
+
+ /* Calc Y */
+ XOR(REC_PR_T, REC_PR_X);
+ STORE(y, REC_PR_X);
+ }
+}
+
+
+/*
+ * Reconstruct two data columns using PR parity
+ *
+ * @syn_method raidz_syn_pr_abd()
+ * @rec_method raidz_rec_pr_abd()
+ *
+ * @rm RAIDZ map
+ * @tgtidx array of missing data indexes
+ */
+static raidz_inline int
+raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx)
+{
+ size_t c;
+ size_t dsize;
+ abd_t *dabd;
+ const size_t firstdc = raidz_parity(rm);
+ const size_t ncols = raidz_ncols(rm);
+ const size_t x = tgtidx[0];
+ const size_t y = tgtidx[1];
+ const size_t xsize = rm->rm_col[x].rc_size;
+ const size_t ysize = rm->rm_col[y].rc_size;
+ abd_t *xabd = rm->rm_col[x].rc_abd;
+ abd_t *yabd = rm->rm_col[y].rc_abd;
+ abd_t *tabds[2] = { xabd, yabd };
+ abd_t *cabds[] = {
+ rm->rm_col[CODE_P].rc_abd,
+ rm->rm_col[CODE_R].rc_abd
+ };
+ unsigned coeff[MUL_CNT];
+ raidz_rec_pr_coeff(rm, tgtidx, coeff);
+
+ /*
+ * Check if one of the targets is shorter than the other.
+ * It needs to be replaced with a new buffer so that the syndrome can
+ * be calculated on the full length.
+ */
+ if (ysize < xsize) {
+ yabd = abd_alloc(xsize, B_FALSE);
+ tabds[1] = yabd;
+ }
+
+ raidz_math_begin();
+
+ /* Start with first data column if present */
+ if (firstdc != x) {
+ raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
+ raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
+ } else {
+ raidz_zero(xabd, xsize);
+ raidz_zero(yabd, xsize);
+ }
+
+ /* generate pr_syndrome */
+ for (c = firstdc+1; c < ncols; c++) {
+ if (c == x || c == y) {
+ dabd = NULL;
+ dsize = 0;
+ } else {
+ dabd = rm->rm_col[c].rc_abd;
+ dsize = rm->rm_col[c].rc_size;
+ }
+
+ abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
+ raidz_syn_pr_abd);
+ }
+
+ abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_pr_abd, coeff);
+
+ /*
+ * Copy shorter targets back to the original abd buffer
+ */
+ if (ysize < xsize)
+ raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
+
+ raidz_math_end();
+
+ if (ysize < xsize)
+ abd_free(yabd);
+
+ return ((1 << CODE_P) | (1 << CODE_R));
+}
+
+
+/*
+ * Generate Q and R syndromes
+ *
+ * @c array of pointers to syndrome columns
+ * @dc data column (NULL if missing)
+ * @tsize size of syndrome columns
+ * @dsize size of data column (0 if missing)
+ */
+static void
+raidz_syn_qr_abd(void **c, const void *dc, const size_t tsize,
+ const size_t dsize)
+{
+ v_t *x = (v_t *)c[TARGET_X];
+ v_t *y = (v_t *)c[TARGET_Y];
+ const v_t * const xend = x + (tsize / sizeof (v_t));
+ const v_t *d = (const v_t *)dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+
+ SYN_QR_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) {
+ LOAD(d, SYN_QR_D);
+ Q_D_SYNDROME(SYN_QR_D, SYN_QR_X, x);
+ R_D_SYNDROME(SYN_QR_D, SYN_QR_X, y);
+ }
+ for (; x < xend; x += SYN_STRIDE, y += SYN_STRIDE) {
+ Q_SYNDROME(SYN_QR_X, x);
+ R_SYNDROME(SYN_QR_X, y);
+ }
+}
+
+
+/*
+ * Reconstruct data using QR parity and QR syndromes
+ *
+ * @t syndrome/result columns
+ * @tsize size of syndrome/result columns
+ * @c parity columns
+ * @mul array of multiplication constants
+ */
+static void
+raidz_rec_qr_abd(void **t, const size_t tsize, void **c,
+ const unsigned *mul)
+{
+ v_t *x = (v_t *)t[TARGET_X];
+ v_t *y = (v_t *)t[TARGET_Y];
+ const v_t * const xend = x + (tsize / sizeof (v_t));
+ const v_t *p = (v_t *)c[CODE_P];
+ const v_t *q = (v_t *)c[CODE_Q];
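+
+ /*
+ * Note: cabds[] here holds { Q, R }; the "p"/"q" names are
+ * positional, so p walks the Q column and q walks the R column.
+ */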
+
+ REC_QR_DEFINE();
+
+ for (; x < xend; x += REC_QR_STRIDE, y += REC_QR_STRIDE,
+ p += REC_QR_STRIDE, q += REC_QR_STRIDE) {
+ LOAD(x, REC_QR_X);
+ LOAD(y, REC_QR_Y);
+
+ XOR_ACC(p, REC_QR_X);
+ XOR_ACC(q, REC_QR_Y);
+
+ /* Save Qxy */
+ COPY(REC_QR_X, REC_QR_T);
+
+ /* Calc X */
+ MUL(mul[MUL_QR_XQ], REC_QR_X); /* X = Q * xqm */
+ XOR(REC_QR_Y, REC_QR_X); /* X = R ^ X */
+ MUL(mul[MUL_QR_X], REC_QR_X); /* X = X * xm */
+ STORE(x, REC_QR_X);
+
+ /* Calc Y */
+ MUL(mul[MUL_QR_YQ], REC_QR_T); /* T = Q * yqm */
+ XOR(REC_QR_Y, REC_QR_T); /* T = R ^ T */
+ MUL(mul[MUL_QR_Y], REC_QR_T); /* Y = T * ym */
+ STORE(y, REC_QR_T);
+ }
+}
+
+
+/*
+ * Reconstruct two data columns using QR parity
+ *
+ * @syn_method raidz_syn_qr_abd()
+ * @rec_method raidz_rec_qr_abd()
+ *
+ * @rm RAIDZ map
+ * @tgtidx array of missing data indexes
+ */
+static raidz_inline int
+raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx)
+{
+ size_t c;
+ size_t dsize;
+ abd_t *dabd;
+ const size_t firstdc = raidz_parity(rm);
+ const size_t ncols = raidz_ncols(rm);
+ const size_t x = tgtidx[TARGET_X];
+ const size_t y = tgtidx[TARGET_Y];
+ const size_t xsize = rm->rm_col[x].rc_size;
+ const size_t ysize = rm->rm_col[y].rc_size;
+ abd_t *xabd = rm->rm_col[x].rc_abd;
+ abd_t *yabd = rm->rm_col[y].rc_abd;
+ abd_t *tabds[2] = { xabd, yabd };
+ abd_t *cabds[] = {
+ rm->rm_col[CODE_Q].rc_abd,
+ rm->rm_col[CODE_R].rc_abd
+ };
+ unsigned coeff[MUL_CNT];
+ raidz_rec_qr_coeff(rm, tgtidx, coeff);
+
+ /*
+ * Check if one of the targets is shorter than the other.
+ * In this case, the shorter target needs to be replaced with a
+ * new buffer so that the syndrome can be calculated on the full length.
+ */
+ if (ysize < xsize) {
+ yabd = abd_alloc(xsize, B_FALSE);
+ tabds[1] = yabd;
+ }
+
+ raidz_math_begin();
+
+ /* Start with first data column if present */
+ if (firstdc != x) {
+ raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
+ raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
+ } else {
+ raidz_zero(xabd, xsize);
+ raidz_zero(yabd, xsize);
+ }
+
+ /* generate qr_syndrome */
+ for (c = firstdc+1; c < ncols; c++) {
+ if (c == x || c == y) {
+ dabd = NULL;
+ dsize = 0;
+ } else {
+ dabd = rm->rm_col[c].rc_abd;
+ dsize = rm->rm_col[c].rc_size;
+ }
+
+ abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
+ raidz_syn_qr_abd);
+ }
+
+ abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_qr_abd, coeff);
+
+ /*
+ * Copy shorter targets back to the original abd buffer
+ */
+ if (ysize < xsize)
+ raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
+
+ raidz_math_end();
+
+ if (ysize < xsize)
+ abd_free(yabd);
+
+ return ((1 << CODE_Q) | (1 << CODE_R));
+}
+
+
+/*
+ * Generate P, Q, and R syndromes
+ *
+ * @c array of pointers to syndrome columns
+ * @dc data column (NULL if missing)
+ * @tsize size of syndrome columns
+ * @dsize size of data column (0 if missing)
+ */
+static void
+raidz_syn_pqr_abd(void **c, const void *dc, const size_t tsize,
+ const size_t dsize)
+{
+ v_t *x = (v_t *)c[TARGET_X];
+ v_t *y = (v_t *)c[TARGET_Y];
+ v_t *z = (v_t *)c[TARGET_Z];
+ const v_t * const yend = y + (tsize / sizeof (v_t));
+ const v_t *d = (const v_t *)dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+
+ SYN_PQR_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE,
+ z += SYN_STRIDE) {
+ LOAD(d, SYN_PQR_D);
+ P_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, x);
+ Q_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, y);
+ R_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, z);
+ }
+ for (; y < yend; y += SYN_STRIDE, z += SYN_STRIDE) {
+ Q_SYNDROME(SYN_PQR_X, y);
+ R_SYNDROME(SYN_PQR_X, z);
+ }
+}
+
+
+/*
+ * Reconstruct data using PRQ parity and PQR syndromes
+ *
+ * @t syndrome/result columns
+ * @tsize size of syndrome/result columns
+ * @c parity columns
+ * @mul array of multiplication constants
+ */
+static void
+raidz_rec_pqr_abd(void **t, const size_t tsize, void **c,
+ const unsigned * const mul)
+{
+ v_t *x = (v_t *)t[TARGET_X];
+ v_t *y = (v_t *)t[TARGET_Y];
+ v_t *z = (v_t *)t[TARGET_Z];
+ const v_t * const xend = x + (tsize / sizeof (v_t));
+ const v_t *p = (v_t *)c[CODE_P];
+ const v_t *q = (v_t *)c[CODE_Q];
+ const v_t *r = (v_t *)c[CODE_R];
+
+ REC_PQR_DEFINE();
+
+ for (; x < xend; x += REC_PQR_STRIDE, y += REC_PQR_STRIDE,
+ z += REC_PQR_STRIDE, p += REC_PQR_STRIDE, q += REC_PQR_STRIDE,
+ r += REC_PQR_STRIDE) {
+ LOAD(x, REC_PQR_X);
+ LOAD(y, REC_PQR_Y);
+ LOAD(z, REC_PQR_Z);
+
+ XOR_ACC(p, REC_PQR_X);
+ XOR_ACC(q, REC_PQR_Y);
+ XOR_ACC(r, REC_PQR_Z);
+
+ /* Save Pxyz and Qxyz */
+ COPY(REC_PQR_X, REC_PQR_XS);
+ COPY(REC_PQR_Y, REC_PQR_YS);
+
+ /* Calc X */
+ MUL(mul[MUL_PQR_XP], REC_PQR_X); /* Xp = Pxyz * xp */
+ MUL(mul[MUL_PQR_XQ], REC_PQR_Y); /* Xq = Qxyz * xq */
+ XOR(REC_PQR_Y, REC_PQR_X);
+ MUL(mul[MUL_PQR_XR], REC_PQR_Z); /* Xr = Rxyz * xr */
+ XOR(REC_PQR_Z, REC_PQR_X); /* X = Xp + Xq + Xr */
+ STORE(x, REC_PQR_X);
+
+ /* Calc Y */
+ XOR(REC_PQR_X, REC_PQR_XS); /* Pyz = Pxyz + X */
+ MUL(mul[MUL_PQR_YU], REC_PQR_X); /* Xq = X * upd_q */
+ XOR(REC_PQR_X, REC_PQR_YS); /* Qyz = Qxyz + Xq */
+ COPY(REC_PQR_XS, REC_PQR_X); /* restore Pyz */
+ MUL(mul[MUL_PQR_YP], REC_PQR_X); /* Yp = Pyz * yp */
+ MUL(mul[MUL_PQR_YQ], REC_PQR_YS); /* Yq = Qyz * yq */
+ XOR(REC_PQR_X, REC_PQR_YS); /* Y = Yp + Yq */
+ STORE(y, REC_PQR_YS);
+
+ /* Calc Z */
+ XOR(REC_PQR_XS, REC_PQR_YS); /* Z = Pz = Pyz + Y */
+ STORE(z, REC_PQR_YS);
+ }
+}
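+
+/*
+ * The three-erasure solve above works by elimination: Dx is first obtained
+ * as a linear combination of Pxyz, Qxyz and Rxyz using the xp/xq/xr
+ * constants from raidz_rec_pqr_coeff(). Substituting Dx back (the
+ * MUL_PQR_YU multiply updates the Q syndrome) reduces the system to the
+ * two-erasure PQ case, solved for Dy with yp/yq; Dz finally follows from
+ * the P equation as Dz = Pyz ^ Dy.
+ */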
+
+
+/*
+ * Reconstruct three data columns using PQR parity
+ *
+ * @syn_method raidz_syn_pqr_abd()
+ * @rec_method raidz_rec_pqr_abd()
+ *
+ * @rm RAIDZ map
+ * @tgtidx array of missing data indexes
+ */
+static raidz_inline int
+raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx)
+{
+ size_t c;
+ size_t dsize;
+ abd_t *dabd;
+ const size_t firstdc = raidz_parity(rm);
+ const size_t ncols = raidz_ncols(rm);
+ const size_t x = tgtidx[TARGET_X];
+ const size_t y = tgtidx[TARGET_Y];
+ const size_t z = tgtidx[TARGET_Z];
+ const size_t xsize = rm->rm_col[x].rc_size;
+ const size_t ysize = rm->rm_col[y].rc_size;
+ const size_t zsize = rm->rm_col[z].rc_size;
+ abd_t *xabd = rm->rm_col[x].rc_abd;
+ abd_t *yabd = rm->rm_col[y].rc_abd;
+ abd_t *zabd = rm->rm_col[z].rc_abd;
+ abd_t *tabds[] = { xabd, yabd, zabd };
+ abd_t *cabds[] = {
+ rm->rm_col[CODE_P].rc_abd,
+ rm->rm_col[CODE_Q].rc_abd,
+ rm->rm_col[CODE_R].rc_abd
+ };
+ unsigned coeff[MUL_CNT];
+ raidz_rec_pqr_coeff(rm, tgtidx, coeff);
+
+ /*
+ * Check if any of the targets is shorter than the others.
+ * In this case, the shorter targets need to be replaced with new
+ * buffers so that the syndromes can be calculated on the full length.
+ */
+ if (ysize < xsize) {
+ yabd = abd_alloc(xsize, B_FALSE);
+ tabds[1] = yabd;
+ }
+ if (zsize < xsize) {
+ zabd = abd_alloc(xsize, B_FALSE);
+ tabds[2] = zabd;
+ }
+
+ raidz_math_begin();
+
+ /* Start with first data column if present */
+ if (firstdc != x) {
+ raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize);
+ raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize);
+ raidz_copy(zabd, rm->rm_col[firstdc].rc_abd, xsize);
+ } else {
+ raidz_zero(xabd, xsize);
+ raidz_zero(yabd, xsize);
+ raidz_zero(zabd, xsize);
+ }
+
+ /* generate pqr_syndrome */
+ for (c = firstdc+1; c < ncols; c++) {
+ if (c == x || c == y || c == z) {
+ dabd = NULL;
+ dsize = 0;
+ } else {
+ dabd = rm->rm_col[c].rc_abd;
+ dsize = rm->rm_col[c].rc_size;
+ }
+
+ abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 3,
+ raidz_syn_pqr_abd);
+ }
+
+ abd_raidz_rec_iterate(cabds, tabds, xsize, 3, raidz_rec_pqr_abd, coeff);
+
+ /*
+ * Copy shorter targets back to the original abd buffer
+ */
+ if (ysize < xsize)
+ raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize);
+ if (zsize < xsize)
+ raidz_copy(rm->rm_col[z].rc_abd, zabd, zsize);
+
+ raidz_math_end();
+
+ if (ysize < xsize)
+ abd_free(yabd);
+ if (zsize < xsize)
+ abd_free(zabd);
+
+ return ((1 << CODE_P) | (1 << CODE_Q) | (1 << CODE_R));
+}
+
+#endif /* _VDEV_RAIDZ_MATH_IMPL_H */
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz_math_scalar.c b/usr/src/uts/common/fs/zfs/vdev_raidz_math_scalar.c
new file mode 100644
index 0000000000..cd742e146c
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz_math_scalar.c
@@ -0,0 +1,337 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/vdev_raidz_impl.h>
+
+/*
+ * Provide native CPU scalar routines.
+ * Supports 32-bit and 64-bit CPUs.
+ */
+#if ((~(0x0ULL)) >> 24) == 0xffULL
+#define ELEM_SIZE 4
+typedef uint32_t iv_t;
+#elif ((~(0x0ULL)) >> 56) == 0xffULL
+#define ELEM_SIZE 8
+typedef uint64_t iv_t;
+#endif
+
+/*
+ * Vector type used in scalar implementation
+ *
+ * The union is expected to be of native CPU register size. Since addition
+ * uses the XOR operation, it can be performed on all byte elements at once.
+ * Multiplication requires per-byte access.
+ */
+typedef union {
+ iv_t e;
+ uint8_t b[ELEM_SIZE];
+} v_t;
+
+/*
+ * Precomputed lookup tables for multiplication by a constant
+ *
+ * The reconstruction path requires multiplication by constant factors.
+ * Instead of performing a two-step lookup (log & exp tables), a direct
+ * lookup is used. Multiplication of element 'a' by a constant 'c' is
+ * obtained as:
+ *
+ * r = vdev_raidz_mul_lt[c][a];
+ *
+ * where each row of the table is filled in by raidz_init_scalar() below using
+ * gf_mul(), i.e. row 'c' is the complete multiplication table of constant 'c'.
+ *
+ * PERFORMANCE NOTE:
+ * Even though the complete lookup table uses 64KiB, only a relatively small
+ * portion of it is used at any one time. The following shows the number of
+ * accessed bytes for different cases:
+ * - 1 failed disk: 256B (1 mul. coefficient)
+ * - 2 failed disks: 512B (2 mul. coefficients)
+ * - 3 failed disks: 1536B (6 mul. coefficients)
+ *
+ * The size of the actually accessed lookup table region is only larger for
+ * reconstruction of 3 failed disks, compared to the traditional log/exp
+ * method. But since the result is obtained in a single lookup step,
+ * performance is doubled.
+ */
+static uint8_t vdev_raidz_mul_lt[256][256] __attribute__((aligned(256)));
+
+static void
+raidz_init_scalar(void)
+{
+ int c, i;
+ for (c = 0; c < 256; c++)
+ for (i = 0; i < 256; i++)
+ vdev_raidz_mul_lt[c][i] = gf_mul(c, i);
+}
+
+#define PREFETCHNTA(ptr, offset) {}
+#define PREFETCH(ptr, offset) {}
+
+#define XOR_ACC(src, acc) acc.e ^= ((v_t *)src)[0].e
+#define XOR(src, acc) acc.e ^= src.e
+#define ZERO(acc) acc.e = 0
+#define COPY(src, dst) dst = src
+#define LOAD(src, val) val = ((v_t *)src)[0]
+#define STORE(dst, val) ((v_t *)dst)[0] = val
+
+/*
+ * Constants used for optimized multiplication by 2.
+ */
+static const struct {
+ iv_t mod;
+ iv_t mask;
+ iv_t msb;
+} scalar_mul2_consts = {
+#if ELEM_SIZE == 8
+ .mod = 0x1d1d1d1d1d1d1d1dULL,
+ .mask = 0xfefefefefefefefeULL,
+ .msb = 0x8080808080808080ULL,
+#else
+ .mod = 0x1d1d1d1dULL,
+ .mask = 0xfefefefeULL,
+ .msb = 0x80808080ULL,
+#endif
+};
+
+#define MUL2_SETUP() {}
+
+#define MUL2(a) \
+{ \
+ iv_t _mask; \
+ \
+ _mask = (a).e & scalar_mul2_consts.msb; \
+ _mask = (_mask << 1) - (_mask >> 7); \
+ (a).e = ((a).e << 1) & scalar_mul2_consts.mask; \
+ (a).e = (a).e ^ (_mask & scalar_mul2_consts.mod); \
+}
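+
+/*
+ * MUL2() doubles every byte of the vector in GF(2^8) at once (SWAR):
+ * the first two lines spread each byte's MSB into a 0xff/0x00 byte mask,
+ * the shift-and-mask doubles each byte without carrying into its
+ * neighbour, and the final XOR applies the reduction polynomial 0x1d only
+ * to bytes that overflowed. E.g. a 0x80 byte becomes 0x00 ^ 0x1d == 0x1d,
+ * i.e. 2 * 2^7 == 2^8 == 0x1d modulo the generator polynomial.
+ */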
+
+#define MUL4(a) \
+{ \
+ MUL2(a); \
+ MUL2(a); \
+}
+
+#define MUL(c, a) \
+{ \
+ const uint8_t *mul_lt = vdev_raidz_mul_lt[c]; \
+ switch (ELEM_SIZE) { \
+ case 8: \
+ a.b[7] = mul_lt[a.b[7]]; \
+ a.b[6] = mul_lt[a.b[6]]; \
+ a.b[5] = mul_lt[a.b[5]]; \
+ a.b[4] = mul_lt[a.b[4]]; \
+ /* falls through */ \
+ case 4: \
+ a.b[3] = mul_lt[a.b[3]]; \
+ a.b[2] = mul_lt[a.b[2]]; \
+ a.b[1] = mul_lt[a.b[1]]; \
+ a.b[0] = mul_lt[a.b[0]]; \
+ break; \
+ } \
+}
+
+#define raidz_math_begin() {}
+#define raidz_math_end() {}
+
+#define SYN_STRIDE 1
+
+#define ZERO_DEFINE() v_t d0
+#define ZERO_STRIDE 1
+#define ZERO_D d0
+
+#define COPY_DEFINE() v_t d0
+#define COPY_STRIDE 1
+#define COPY_D d0
+
+#define ADD_DEFINE() v_t d0
+#define ADD_STRIDE 1
+#define ADD_D d0
+
+#define MUL_DEFINE() v_t d0
+#define MUL_STRIDE 1
+#define MUL_D d0
+
+#define GEN_P_STRIDE 1
+#define GEN_P_DEFINE() v_t p0
+#define GEN_P_P p0
+
+#define GEN_PQ_STRIDE 1
+#define GEN_PQ_DEFINE() v_t d0, c0
+#define GEN_PQ_D d0
+#define GEN_PQ_C c0
+
+#define GEN_PQR_STRIDE 1
+#define GEN_PQR_DEFINE() v_t d0, c0
+#define GEN_PQR_D d0
+#define GEN_PQR_C c0
+
+#define SYN_Q_DEFINE() v_t d0, x0
+#define SYN_Q_D d0
+#define SYN_Q_X x0
+
+
+#define SYN_R_DEFINE() v_t d0, x0
+#define SYN_R_D d0
+#define SYN_R_X x0
+
+
+#define SYN_PQ_DEFINE() v_t d0, x0
+#define SYN_PQ_D d0
+#define SYN_PQ_X x0
+
+
+#define REC_PQ_STRIDE 1
+#define REC_PQ_DEFINE() v_t x0, y0, t0
+#define REC_PQ_X x0
+#define REC_PQ_Y y0
+#define REC_PQ_T t0
+
+
+#define SYN_PR_DEFINE() v_t d0, x0
+#define SYN_PR_D d0
+#define SYN_PR_X x0
+
+#define REC_PR_STRIDE 1
+#define REC_PR_DEFINE() v_t x0, y0, t0
+#define REC_PR_X x0
+#define REC_PR_Y y0
+#define REC_PR_T t0
+
+
+#define SYN_QR_DEFINE() v_t d0, x0
+#define SYN_QR_D d0
+#define SYN_QR_X x0
+
+
+#define REC_QR_STRIDE 1
+#define REC_QR_DEFINE() v_t x0, y0, t0
+#define REC_QR_X x0
+#define REC_QR_Y y0
+#define REC_QR_T t0
+
+
+#define SYN_PQR_DEFINE() v_t d0, x0
+#define SYN_PQR_D d0
+#define SYN_PQR_X x0
+
+#define REC_PQR_STRIDE 1
+#define REC_PQR_DEFINE() v_t x0, y0, z0, xs0, ys0
+#define REC_PQR_X x0
+#define REC_PQR_Y y0
+#define REC_PQR_Z z0
+#define REC_PQR_XS xs0
+#define REC_PQR_YS ys0
+
+#include "vdev_raidz_math_impl.h"
+
+DEFINE_GEN_METHODS(scalar);
+DEFINE_REC_METHODS(scalar);
+
+boolean_t
+raidz_will_scalar_work(void)
+{
+ return (B_TRUE); /* always */
+}
+
+const raidz_impl_ops_t vdev_raidz_scalar_impl = {
+ .init = raidz_init_scalar,
+ .fini = NULL,
+ .gen = RAIDZ_GEN_METHODS(scalar),
+ .rec = RAIDZ_REC_METHODS(scalar),
+ .is_supported = &raidz_will_scalar_work,
+ .name = "scalar"
+};
+
+/* Powers of 2 in the RAID-Z Galois field. */
+const uint8_t vdev_raidz_pow2[256] __attribute__((aligned(256))) = {
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
+ 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
+ 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
+ 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
+ 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
+ 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
+ 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
+ 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
+ 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
+ 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
+ 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
+ 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
+ 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
+ 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
+ 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
+ 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
+ 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
+ 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
+ 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
+ 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
+ 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
+ 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
+ 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
+ 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
+ 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
+ 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
+ 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
+ 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
+ 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
+ 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
+ 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
+};
+
+/* Logs of 2 in the RAID-Z Galois field. */
+const uint8_t vdev_raidz_log2[256] __attribute__((aligned(256))) = {
+ 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
+ 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
+ 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
+ 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
+ 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
+ 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
+ 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
+ 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
+ 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
+ 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
+ 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
+ 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
+ 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
+ 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
+ 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
+ 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
+ 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
+ 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
+ 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
+ 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
+ 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
+ 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
+ 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
+ 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
+ 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
+ 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
+ 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
+ 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
+ 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
+ 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
+ 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
+ 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
+};
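+
+/*
+ * These tables realize GF(2^8) with the generator polynomial
+ * x^8 + x^4 + x^3 + x^2 + 1 (0x11d, the 0x1d reduction constant above),
+ * the same field used by the original vdev_raidz code. A product is
+ * obtained (sketch of what gf_mul() in sys/vdev_raidz_impl.h is expected
+ * to do):
+ *
+ *	if (a == 0 || b == 0)
+ *		return (0);
+ *	return (vdev_raidz_pow2[(vdev_raidz_log2[a] +
+ *	    vdev_raidz_log2[b]) % 255]);
+ *
+ * since 2 generates the multiplicative group of order 255.
+ */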
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz_math_sse2.c b/usr/src/uts/common/fs/zfs/vdev_raidz_math_sse2.c
new file mode 100644
index 0000000000..569f73006b
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz_math_sse2.c
@@ -0,0 +1,642 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/isa_defs.h>
+
+#if defined(__amd64)
+
+#include <sys/types.h>
+#include <sys/simd.h>
+#include <sys/debug.h>
+
+#define __asm __asm__ __volatile__
+
+#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
+#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
+
+#define VR0_(REG, ...) "xmm"#REG
+#define VR1_(_1, REG, ...) "xmm"#REG
+#define VR2_(_1, _2, REG, ...) "xmm"#REG
+#define VR3_(_1, _2, _3, REG, ...) "xmm"#REG
+#define VR4_(_1, _2, _3, _4, REG, ...) "xmm"#REG
+#define VR5_(_1, _2, _3, _4, _5, REG, ...) "xmm"#REG
+#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "xmm"#REG
+#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "xmm"#REG
+
+#define VR0(r...) VR0_(r, 1, 2, 3, 4, 5, 6)
+#define VR1(r...) VR1_(r, 1, 2, 3, 4, 5, 6)
+#define VR2(r...) VR2_(r, 1, 2, 3, 4, 5, 6)
+#define VR3(r...) VR3_(r, 1, 2, 3, 4, 5, 6)
+#define VR4(r...) VR4_(r, 1, 2, 3, 4, 5, 6)
+#define VR5(r...) VR5_(r, 1, 2, 3, 4, 5, 6)
+#define VR6(r...) VR6_(r, 1, 2, 3, 4, 5, 6)
+#define VR7(r...) VR7_(r, 1, 2, 3, 4, 5, 6)
+
+#define ELEM_SIZE 16
+
+typedef struct v {
+ uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
+} v_t;
+
+#define XOR_ACC(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "pxor 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "pxor 0x10(%[SRC]), %%" VR1(r) "\n" \
+ "pxor 0x20(%[SRC]), %%" VR2(r) "\n" \
+ "pxor 0x30(%[SRC]), %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 2: \
+ __asm( \
+ "pxor 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "pxor 0x10(%[SRC]), %%" VR1(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 1: \
+ __asm("pxor 0x00(%[SRC]), %%" VR0(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ } \
+}
+
+#define XOR(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "pxor %" VR0(r) ", %" VR4(r) "\n" \
+ "pxor %" VR1(r) ", %" VR5(r) "\n" \
+ "pxor %" VR2(r) ", %" VR6(r) "\n" \
+ "pxor %" VR3(r) ", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "pxor %" VR0(r) ", %" VR2(r) "\n" \
+ "pxor %" VR1(r) ", %" VR3(r)); \
+ break; \
+ case 2: \
+ __asm( \
+ "pxor %" VR0(r) ", %" VR1(r)); \
+ break; \
+ } \
+}
+
+#define ZERO(r...) XOR(r, r)
+
+#define COPY(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "movdqa %" VR0(r) ", %" VR4(r) "\n" \
+ "movdqa %" VR1(r) ", %" VR5(r) "\n" \
+ "movdqa %" VR2(r) ", %" VR6(r) "\n" \
+ "movdqa %" VR3(r) ", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "movdqa %" VR0(r) ", %" VR2(r) "\n" \
+ "movdqa %" VR1(r) ", %" VR3(r)); \
+ break; \
+ case 2: \
+ __asm( \
+ "movdqa %" VR0(r) ", %" VR1(r)); \
+ break; \
+ default: \
+ VERIFY(0); \
+ } \
+}
+
+#define LOAD(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "movdqa 0x10(%[SRC]), %%" VR1(r) "\n" \
+ "movdqa 0x20(%[SRC]), %%" VR2(r) "\n" \
+ "movdqa 0x30(%[SRC]), %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 2: \
+ __asm( \
+ "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "movdqa 0x10(%[SRC]), %%" VR1(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 1: \
+ __asm( \
+ "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ } \
+}
+
+#define STORE(dst, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "movdqa %%" VR0(r)", 0x00(%[DST])\n" \
+ "movdqa %%" VR1(r)", 0x10(%[DST])\n" \
+ "movdqa %%" VR2(r)", 0x20(%[DST])\n" \
+ "movdqa %%" VR3(r)", 0x30(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ case 2: \
+ __asm( \
+ "movdqa %%" VR0(r)", 0x00(%[DST])\n" \
+ "movdqa %%" VR1(r)", 0x10(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ case 1: \
+ __asm( \
+ "movdqa %%" VR0(r)", 0x00(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ default: \
+ VERIFY(0); \
+ } \
+}
+
+#define MUL2_SETUP() \
+{ \
+ __asm( \
+ "movd %[mask], %%xmm15\n" \
+ "pshufd $0x0, %%xmm15, %%xmm15\n" \
+ : : [mask] "r" (0x1d1d1d1d)); \
+}
+
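+/*
+ * SSE2 GF(2^8) doubling (sketch of the trick used below): pcmpgtb against
+ * a zeroed register yields 0xff in every byte whose sign bit (MSB) is set,
+ * pand masks that down to the reduction constant 0x1d loaded into xmm15 by
+ * MUL2_SETUP(), paddb doubles each byte (carries cannot cross byte lanes),
+ * and pxor folds the reduction back in. xmm13/xmm14 serve as scratch.
+ */
+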
+#define _MUL2_x1(a0) \
+{ \
+ __asm( \
+ "pxor %xmm14, %xmm14\n" \
+ "pcmpgtb %" a0", %xmm14\n" \
+ "pand %xmm15, %xmm14\n" \
+ "paddb %" a0", %" a0 "\n" \
+ "pxor %xmm14, %" a0); \
+}
+
+#define _MUL2_x2(a0, a1) \
+{ \
+ __asm( \
+ "pxor %xmm14, %xmm14\n" \
+ "pxor %xmm13, %xmm13\n" \
+ "pcmpgtb %" a0", %xmm14\n" \
+ "pcmpgtb %" a1", %xmm13\n" \
+ "pand %xmm15, %xmm14\n" \
+ "pand %xmm15, %xmm13\n" \
+ "paddb %" a0", %" a0 "\n" \
+ "paddb %" a1", %" a1 "\n" \
+ "pxor %xmm14, %" a0 "\n" \
+ "pxor %xmm13, %" a1); \
+}
+
+#define MUL2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ _MUL2_x2(VR0(r), VR1(r)); \
+ _MUL2_x2(VR2(r), VR3(r)); \
+ break; \
+ case 2: \
+ _MUL2_x2(VR0(r), VR1(r)); \
+ break; \
+ case 1: \
+ _MUL2_x1(VR0(r)); \
+ break; \
+ } \
+}
+
+#define MUL4(r...) \
+{ \
+ MUL2(r); \
+ MUL2(r); \
+}
+
+/* General multiplication by a constant, using the double-and-add method */
+
+#define _MUL_PARAM(x, in, acc) \
+{ \
+ if (x & 0x01) { COPY(in, acc); } else { ZERO(acc); } \
+ if (x & 0xfe) { MUL2(in); } \
+ if (x & 0x02) { XOR(in, acc); } \
+ if (x & 0xfc) { MUL2(in); } \
+ if (x & 0x04) { XOR(in, acc); } \
+ if (x & 0xf8) { MUL2(in); } \
+ if (x & 0x08) { XOR(in, acc); } \
+ if (x & 0xf0) { MUL2(in); } \
+ if (x & 0x10) { XOR(in, acc); } \
+ if (x & 0xe0) { MUL2(in); } \
+ if (x & 0x20) { XOR(in, acc); } \
+ if (x & 0xc0) { MUL2(in); } \
+ if (x & 0x40) { XOR(in, acc); } \
+ if (x & 0x80) { MUL2(in); XOR(in, acc); } \
+}
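+
+/*
+ * _MUL_PARAM() is double-and-add: 'in' is repeatedly doubled with MUL2()
+ * and XORed into 'acc' wherever the constant has a set bit; since x is a
+ * compile-time literal, the "if (x & ...)" tests fold away entirely.
+ * E.g. mul_x1_5 below reduces to roughly:
+ *
+ *	COPY(in, acc);		(bit 0 of 5)
+ *	MUL2(in); MUL2(in);	(advance to bit 2)
+ *	XOR(in, acc);		(acc = a ^ 4a == 5a)
+ */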
+
+#define _mul_x1_in 11
+#define _mul_x1_acc 12
+
+#define MUL_x1_DEFINE(x) \
+static void \
+mul_x1_ ## x(void) { _MUL_PARAM(x, _mul_x1_in, _mul_x1_acc); }
+
+#define _mul_x2_in 9, 10
+#define _mul_x2_acc 11, 12
+
+#define MUL_x2_DEFINE(x) \
+static void \
+mul_x2_ ## x(void) { _MUL_PARAM(x, _mul_x2_in, _mul_x2_acc); }
+
+MUL_x1_DEFINE(0); MUL_x1_DEFINE(1); MUL_x1_DEFINE(2); MUL_x1_DEFINE(3);
+MUL_x1_DEFINE(4); MUL_x1_DEFINE(5); MUL_x1_DEFINE(6); MUL_x1_DEFINE(7);
+MUL_x1_DEFINE(8); MUL_x1_DEFINE(9); MUL_x1_DEFINE(10); MUL_x1_DEFINE(11);
+MUL_x1_DEFINE(12); MUL_x1_DEFINE(13); MUL_x1_DEFINE(14); MUL_x1_DEFINE(15);
+MUL_x1_DEFINE(16); MUL_x1_DEFINE(17); MUL_x1_DEFINE(18); MUL_x1_DEFINE(19);
+MUL_x1_DEFINE(20); MUL_x1_DEFINE(21); MUL_x1_DEFINE(22); MUL_x1_DEFINE(23);
+MUL_x1_DEFINE(24); MUL_x1_DEFINE(25); MUL_x1_DEFINE(26); MUL_x1_DEFINE(27);
+MUL_x1_DEFINE(28); MUL_x1_DEFINE(29); MUL_x1_DEFINE(30); MUL_x1_DEFINE(31);
+MUL_x1_DEFINE(32); MUL_x1_DEFINE(33); MUL_x1_DEFINE(34); MUL_x1_DEFINE(35);
+MUL_x1_DEFINE(36); MUL_x1_DEFINE(37); MUL_x1_DEFINE(38); MUL_x1_DEFINE(39);
+MUL_x1_DEFINE(40); MUL_x1_DEFINE(41); MUL_x1_DEFINE(42); MUL_x1_DEFINE(43);
+MUL_x1_DEFINE(44); MUL_x1_DEFINE(45); MUL_x1_DEFINE(46); MUL_x1_DEFINE(47);
+MUL_x1_DEFINE(48); MUL_x1_DEFINE(49); MUL_x1_DEFINE(50); MUL_x1_DEFINE(51);
+MUL_x1_DEFINE(52); MUL_x1_DEFINE(53); MUL_x1_DEFINE(54); MUL_x1_DEFINE(55);
+MUL_x1_DEFINE(56); MUL_x1_DEFINE(57); MUL_x1_DEFINE(58); MUL_x1_DEFINE(59);
+MUL_x1_DEFINE(60); MUL_x1_DEFINE(61); MUL_x1_DEFINE(62); MUL_x1_DEFINE(63);
+MUL_x1_DEFINE(64); MUL_x1_DEFINE(65); MUL_x1_DEFINE(66); MUL_x1_DEFINE(67);
+MUL_x1_DEFINE(68); MUL_x1_DEFINE(69); MUL_x1_DEFINE(70); MUL_x1_DEFINE(71);
+MUL_x1_DEFINE(72); MUL_x1_DEFINE(73); MUL_x1_DEFINE(74); MUL_x1_DEFINE(75);
+MUL_x1_DEFINE(76); MUL_x1_DEFINE(77); MUL_x1_DEFINE(78); MUL_x1_DEFINE(79);
+MUL_x1_DEFINE(80); MUL_x1_DEFINE(81); MUL_x1_DEFINE(82); MUL_x1_DEFINE(83);
+MUL_x1_DEFINE(84); MUL_x1_DEFINE(85); MUL_x1_DEFINE(86); MUL_x1_DEFINE(87);
+MUL_x1_DEFINE(88); MUL_x1_DEFINE(89); MUL_x1_DEFINE(90); MUL_x1_DEFINE(91);
+MUL_x1_DEFINE(92); MUL_x1_DEFINE(93); MUL_x1_DEFINE(94); MUL_x1_DEFINE(95);
+MUL_x1_DEFINE(96); MUL_x1_DEFINE(97); MUL_x1_DEFINE(98); MUL_x1_DEFINE(99);
+MUL_x1_DEFINE(100); MUL_x1_DEFINE(101); MUL_x1_DEFINE(102); MUL_x1_DEFINE(103);
+MUL_x1_DEFINE(104); MUL_x1_DEFINE(105); MUL_x1_DEFINE(106); MUL_x1_DEFINE(107);
+MUL_x1_DEFINE(108); MUL_x1_DEFINE(109); MUL_x1_DEFINE(110); MUL_x1_DEFINE(111);
+MUL_x1_DEFINE(112); MUL_x1_DEFINE(113); MUL_x1_DEFINE(114); MUL_x1_DEFINE(115);
+MUL_x1_DEFINE(116); MUL_x1_DEFINE(117); MUL_x1_DEFINE(118); MUL_x1_DEFINE(119);
+MUL_x1_DEFINE(120); MUL_x1_DEFINE(121); MUL_x1_DEFINE(122); MUL_x1_DEFINE(123);
+MUL_x1_DEFINE(124); MUL_x1_DEFINE(125); MUL_x1_DEFINE(126); MUL_x1_DEFINE(127);
+MUL_x1_DEFINE(128); MUL_x1_DEFINE(129); MUL_x1_DEFINE(130); MUL_x1_DEFINE(131);
+MUL_x1_DEFINE(132); MUL_x1_DEFINE(133); MUL_x1_DEFINE(134); MUL_x1_DEFINE(135);
+MUL_x1_DEFINE(136); MUL_x1_DEFINE(137); MUL_x1_DEFINE(138); MUL_x1_DEFINE(139);
+MUL_x1_DEFINE(140); MUL_x1_DEFINE(141); MUL_x1_DEFINE(142); MUL_x1_DEFINE(143);
+MUL_x1_DEFINE(144); MUL_x1_DEFINE(145); MUL_x1_DEFINE(146); MUL_x1_DEFINE(147);
+MUL_x1_DEFINE(148); MUL_x1_DEFINE(149); MUL_x1_DEFINE(150); MUL_x1_DEFINE(151);
+MUL_x1_DEFINE(152); MUL_x1_DEFINE(153); MUL_x1_DEFINE(154); MUL_x1_DEFINE(155);
+MUL_x1_DEFINE(156); MUL_x1_DEFINE(157); MUL_x1_DEFINE(158); MUL_x1_DEFINE(159);
+MUL_x1_DEFINE(160); MUL_x1_DEFINE(161); MUL_x1_DEFINE(162); MUL_x1_DEFINE(163);
+MUL_x1_DEFINE(164); MUL_x1_DEFINE(165); MUL_x1_DEFINE(166); MUL_x1_DEFINE(167);
+MUL_x1_DEFINE(168); MUL_x1_DEFINE(169); MUL_x1_DEFINE(170); MUL_x1_DEFINE(171);
+MUL_x1_DEFINE(172); MUL_x1_DEFINE(173); MUL_x1_DEFINE(174); MUL_x1_DEFINE(175);
+MUL_x1_DEFINE(176); MUL_x1_DEFINE(177); MUL_x1_DEFINE(178); MUL_x1_DEFINE(179);
+MUL_x1_DEFINE(180); MUL_x1_DEFINE(181); MUL_x1_DEFINE(182); MUL_x1_DEFINE(183);
+MUL_x1_DEFINE(184); MUL_x1_DEFINE(185); MUL_x1_DEFINE(186); MUL_x1_DEFINE(187);
+MUL_x1_DEFINE(188); MUL_x1_DEFINE(189); MUL_x1_DEFINE(190); MUL_x1_DEFINE(191);
+MUL_x1_DEFINE(192); MUL_x1_DEFINE(193); MUL_x1_DEFINE(194); MUL_x1_DEFINE(195);
+MUL_x1_DEFINE(196); MUL_x1_DEFINE(197); MUL_x1_DEFINE(198); MUL_x1_DEFINE(199);
+MUL_x1_DEFINE(200); MUL_x1_DEFINE(201); MUL_x1_DEFINE(202); MUL_x1_DEFINE(203);
+MUL_x1_DEFINE(204); MUL_x1_DEFINE(205); MUL_x1_DEFINE(206); MUL_x1_DEFINE(207);
+MUL_x1_DEFINE(208); MUL_x1_DEFINE(209); MUL_x1_DEFINE(210); MUL_x1_DEFINE(211);
+MUL_x1_DEFINE(212); MUL_x1_DEFINE(213); MUL_x1_DEFINE(214); MUL_x1_DEFINE(215);
+MUL_x1_DEFINE(216); MUL_x1_DEFINE(217); MUL_x1_DEFINE(218); MUL_x1_DEFINE(219);
+MUL_x1_DEFINE(220); MUL_x1_DEFINE(221); MUL_x1_DEFINE(222); MUL_x1_DEFINE(223);
+MUL_x1_DEFINE(224); MUL_x1_DEFINE(225); MUL_x1_DEFINE(226); MUL_x1_DEFINE(227);
+MUL_x1_DEFINE(228); MUL_x1_DEFINE(229); MUL_x1_DEFINE(230); MUL_x1_DEFINE(231);
+MUL_x1_DEFINE(232); MUL_x1_DEFINE(233); MUL_x1_DEFINE(234); MUL_x1_DEFINE(235);
+MUL_x1_DEFINE(236); MUL_x1_DEFINE(237); MUL_x1_DEFINE(238); MUL_x1_DEFINE(239);
+MUL_x1_DEFINE(240); MUL_x1_DEFINE(241); MUL_x1_DEFINE(242); MUL_x1_DEFINE(243);
+MUL_x1_DEFINE(244); MUL_x1_DEFINE(245); MUL_x1_DEFINE(246); MUL_x1_DEFINE(247);
+MUL_x1_DEFINE(248); MUL_x1_DEFINE(249); MUL_x1_DEFINE(250); MUL_x1_DEFINE(251);
+MUL_x1_DEFINE(252); MUL_x1_DEFINE(253); MUL_x1_DEFINE(254); MUL_x1_DEFINE(255);
+
+MUL_x2_DEFINE(0); MUL_x2_DEFINE(1); MUL_x2_DEFINE(2); MUL_x2_DEFINE(3);
+MUL_x2_DEFINE(4); MUL_x2_DEFINE(5); MUL_x2_DEFINE(6); MUL_x2_DEFINE(7);
+MUL_x2_DEFINE(8); MUL_x2_DEFINE(9); MUL_x2_DEFINE(10); MUL_x2_DEFINE(11);
+MUL_x2_DEFINE(12); MUL_x2_DEFINE(13); MUL_x2_DEFINE(14); MUL_x2_DEFINE(15);
+MUL_x2_DEFINE(16); MUL_x2_DEFINE(17); MUL_x2_DEFINE(18); MUL_x2_DEFINE(19);
+MUL_x2_DEFINE(20); MUL_x2_DEFINE(21); MUL_x2_DEFINE(22); MUL_x2_DEFINE(23);
+MUL_x2_DEFINE(24); MUL_x2_DEFINE(25); MUL_x2_DEFINE(26); MUL_x2_DEFINE(27);
+MUL_x2_DEFINE(28); MUL_x2_DEFINE(29); MUL_x2_DEFINE(30); MUL_x2_DEFINE(31);
+MUL_x2_DEFINE(32); MUL_x2_DEFINE(33); MUL_x2_DEFINE(34); MUL_x2_DEFINE(35);
+MUL_x2_DEFINE(36); MUL_x2_DEFINE(37); MUL_x2_DEFINE(38); MUL_x2_DEFINE(39);
+MUL_x2_DEFINE(40); MUL_x2_DEFINE(41); MUL_x2_DEFINE(42); MUL_x2_DEFINE(43);
+MUL_x2_DEFINE(44); MUL_x2_DEFINE(45); MUL_x2_DEFINE(46); MUL_x2_DEFINE(47);
+MUL_x2_DEFINE(48); MUL_x2_DEFINE(49); MUL_x2_DEFINE(50); MUL_x2_DEFINE(51);
+MUL_x2_DEFINE(52); MUL_x2_DEFINE(53); MUL_x2_DEFINE(54); MUL_x2_DEFINE(55);
+MUL_x2_DEFINE(56); MUL_x2_DEFINE(57); MUL_x2_DEFINE(58); MUL_x2_DEFINE(59);
+MUL_x2_DEFINE(60); MUL_x2_DEFINE(61); MUL_x2_DEFINE(62); MUL_x2_DEFINE(63);
+MUL_x2_DEFINE(64); MUL_x2_DEFINE(65); MUL_x2_DEFINE(66); MUL_x2_DEFINE(67);
+MUL_x2_DEFINE(68); MUL_x2_DEFINE(69); MUL_x2_DEFINE(70); MUL_x2_DEFINE(71);
+MUL_x2_DEFINE(72); MUL_x2_DEFINE(73); MUL_x2_DEFINE(74); MUL_x2_DEFINE(75);
+MUL_x2_DEFINE(76); MUL_x2_DEFINE(77); MUL_x2_DEFINE(78); MUL_x2_DEFINE(79);
+MUL_x2_DEFINE(80); MUL_x2_DEFINE(81); MUL_x2_DEFINE(82); MUL_x2_DEFINE(83);
+MUL_x2_DEFINE(84); MUL_x2_DEFINE(85); MUL_x2_DEFINE(86); MUL_x2_DEFINE(87);
+MUL_x2_DEFINE(88); MUL_x2_DEFINE(89); MUL_x2_DEFINE(90); MUL_x2_DEFINE(91);
+MUL_x2_DEFINE(92); MUL_x2_DEFINE(93); MUL_x2_DEFINE(94); MUL_x2_DEFINE(95);
+MUL_x2_DEFINE(96); MUL_x2_DEFINE(97); MUL_x2_DEFINE(98); MUL_x2_DEFINE(99);
+MUL_x2_DEFINE(100); MUL_x2_DEFINE(101); MUL_x2_DEFINE(102); MUL_x2_DEFINE(103);
+MUL_x2_DEFINE(104); MUL_x2_DEFINE(105); MUL_x2_DEFINE(106); MUL_x2_DEFINE(107);
+MUL_x2_DEFINE(108); MUL_x2_DEFINE(109); MUL_x2_DEFINE(110); MUL_x2_DEFINE(111);
+MUL_x2_DEFINE(112); MUL_x2_DEFINE(113); MUL_x2_DEFINE(114); MUL_x2_DEFINE(115);
+MUL_x2_DEFINE(116); MUL_x2_DEFINE(117); MUL_x2_DEFINE(118); MUL_x2_DEFINE(119);
+MUL_x2_DEFINE(120); MUL_x2_DEFINE(121); MUL_x2_DEFINE(122); MUL_x2_DEFINE(123);
+MUL_x2_DEFINE(124); MUL_x2_DEFINE(125); MUL_x2_DEFINE(126); MUL_x2_DEFINE(127);
+MUL_x2_DEFINE(128); MUL_x2_DEFINE(129); MUL_x2_DEFINE(130); MUL_x2_DEFINE(131);
+MUL_x2_DEFINE(132); MUL_x2_DEFINE(133); MUL_x2_DEFINE(134); MUL_x2_DEFINE(135);
+MUL_x2_DEFINE(136); MUL_x2_DEFINE(137); MUL_x2_DEFINE(138); MUL_x2_DEFINE(139);
+MUL_x2_DEFINE(140); MUL_x2_DEFINE(141); MUL_x2_DEFINE(142); MUL_x2_DEFINE(143);
+MUL_x2_DEFINE(144); MUL_x2_DEFINE(145); MUL_x2_DEFINE(146); MUL_x2_DEFINE(147);
+MUL_x2_DEFINE(148); MUL_x2_DEFINE(149); MUL_x2_DEFINE(150); MUL_x2_DEFINE(151);
+MUL_x2_DEFINE(152); MUL_x2_DEFINE(153); MUL_x2_DEFINE(154); MUL_x2_DEFINE(155);
+MUL_x2_DEFINE(156); MUL_x2_DEFINE(157); MUL_x2_DEFINE(158); MUL_x2_DEFINE(159);
+MUL_x2_DEFINE(160); MUL_x2_DEFINE(161); MUL_x2_DEFINE(162); MUL_x2_DEFINE(163);
+MUL_x2_DEFINE(164); MUL_x2_DEFINE(165); MUL_x2_DEFINE(166); MUL_x2_DEFINE(167);
+MUL_x2_DEFINE(168); MUL_x2_DEFINE(169); MUL_x2_DEFINE(170); MUL_x2_DEFINE(171);
+MUL_x2_DEFINE(172); MUL_x2_DEFINE(173); MUL_x2_DEFINE(174); MUL_x2_DEFINE(175);
+MUL_x2_DEFINE(176); MUL_x2_DEFINE(177); MUL_x2_DEFINE(178); MUL_x2_DEFINE(179);
+MUL_x2_DEFINE(180); MUL_x2_DEFINE(181); MUL_x2_DEFINE(182); MUL_x2_DEFINE(183);
+MUL_x2_DEFINE(184); MUL_x2_DEFINE(185); MUL_x2_DEFINE(186); MUL_x2_DEFINE(187);
+MUL_x2_DEFINE(188); MUL_x2_DEFINE(189); MUL_x2_DEFINE(190); MUL_x2_DEFINE(191);
+MUL_x2_DEFINE(192); MUL_x2_DEFINE(193); MUL_x2_DEFINE(194); MUL_x2_DEFINE(195);
+MUL_x2_DEFINE(196); MUL_x2_DEFINE(197); MUL_x2_DEFINE(198); MUL_x2_DEFINE(199);
+MUL_x2_DEFINE(200); MUL_x2_DEFINE(201); MUL_x2_DEFINE(202); MUL_x2_DEFINE(203);
+MUL_x2_DEFINE(204); MUL_x2_DEFINE(205); MUL_x2_DEFINE(206); MUL_x2_DEFINE(207);
+MUL_x2_DEFINE(208); MUL_x2_DEFINE(209); MUL_x2_DEFINE(210); MUL_x2_DEFINE(211);
+MUL_x2_DEFINE(212); MUL_x2_DEFINE(213); MUL_x2_DEFINE(214); MUL_x2_DEFINE(215);
+MUL_x2_DEFINE(216); MUL_x2_DEFINE(217); MUL_x2_DEFINE(218); MUL_x2_DEFINE(219);
+MUL_x2_DEFINE(220); MUL_x2_DEFINE(221); MUL_x2_DEFINE(222); MUL_x2_DEFINE(223);
+MUL_x2_DEFINE(224); MUL_x2_DEFINE(225); MUL_x2_DEFINE(226); MUL_x2_DEFINE(227);
+MUL_x2_DEFINE(228); MUL_x2_DEFINE(229); MUL_x2_DEFINE(230); MUL_x2_DEFINE(231);
+MUL_x2_DEFINE(232); MUL_x2_DEFINE(233); MUL_x2_DEFINE(234); MUL_x2_DEFINE(235);
+MUL_x2_DEFINE(236); MUL_x2_DEFINE(237); MUL_x2_DEFINE(238); MUL_x2_DEFINE(239);
+MUL_x2_DEFINE(240); MUL_x2_DEFINE(241); MUL_x2_DEFINE(242); MUL_x2_DEFINE(243);
+MUL_x2_DEFINE(244); MUL_x2_DEFINE(245); MUL_x2_DEFINE(246); MUL_x2_DEFINE(247);
+MUL_x2_DEFINE(248); MUL_x2_DEFINE(249); MUL_x2_DEFINE(250); MUL_x2_DEFINE(251);
+MUL_x2_DEFINE(252); MUL_x2_DEFINE(253); MUL_x2_DEFINE(254); MUL_x2_DEFINE(255);
+
+
+
+typedef void (*mul_fn_ptr_t)(void);
+
+static const mul_fn_ptr_t __attribute__((aligned(256)))
+gf_x1_mul_fns[256] = {
+ mul_x1_0, mul_x1_1, mul_x1_2, mul_x1_3, mul_x1_4, mul_x1_5,
+ mul_x1_6, mul_x1_7, mul_x1_8, mul_x1_9, mul_x1_10, mul_x1_11,
+ mul_x1_12, mul_x1_13, mul_x1_14, mul_x1_15, mul_x1_16, mul_x1_17,
+ mul_x1_18, mul_x1_19, mul_x1_20, mul_x1_21, mul_x1_22, mul_x1_23,
+ mul_x1_24, mul_x1_25, mul_x1_26, mul_x1_27, mul_x1_28, mul_x1_29,
+ mul_x1_30, mul_x1_31, mul_x1_32, mul_x1_33, mul_x1_34, mul_x1_35,
+ mul_x1_36, mul_x1_37, mul_x1_38, mul_x1_39, mul_x1_40, mul_x1_41,
+ mul_x1_42, mul_x1_43, mul_x1_44, mul_x1_45, mul_x1_46, mul_x1_47,
+ mul_x1_48, mul_x1_49, mul_x1_50, mul_x1_51, mul_x1_52, mul_x1_53,
+ mul_x1_54, mul_x1_55, mul_x1_56, mul_x1_57, mul_x1_58, mul_x1_59,
+ mul_x1_60, mul_x1_61, mul_x1_62, mul_x1_63, mul_x1_64, mul_x1_65,
+ mul_x1_66, mul_x1_67, mul_x1_68, mul_x1_69, mul_x1_70, mul_x1_71,
+ mul_x1_72, mul_x1_73, mul_x1_74, mul_x1_75, mul_x1_76, mul_x1_77,
+ mul_x1_78, mul_x1_79, mul_x1_80, mul_x1_81, mul_x1_82, mul_x1_83,
+ mul_x1_84, mul_x1_85, mul_x1_86, mul_x1_87, mul_x1_88, mul_x1_89,
+ mul_x1_90, mul_x1_91, mul_x1_92, mul_x1_93, mul_x1_94, mul_x1_95,
+ mul_x1_96, mul_x1_97, mul_x1_98, mul_x1_99, mul_x1_100, mul_x1_101,
+ mul_x1_102, mul_x1_103, mul_x1_104, mul_x1_105, mul_x1_106, mul_x1_107,
+ mul_x1_108, mul_x1_109, mul_x1_110, mul_x1_111, mul_x1_112, mul_x1_113,
+ mul_x1_114, mul_x1_115, mul_x1_116, mul_x1_117, mul_x1_118, mul_x1_119,
+ mul_x1_120, mul_x1_121, mul_x1_122, mul_x1_123, mul_x1_124, mul_x1_125,
+ mul_x1_126, mul_x1_127, mul_x1_128, mul_x1_129, mul_x1_130, mul_x1_131,
+ mul_x1_132, mul_x1_133, mul_x1_134, mul_x1_135, mul_x1_136, mul_x1_137,
+ mul_x1_138, mul_x1_139, mul_x1_140, mul_x1_141, mul_x1_142, mul_x1_143,
+ mul_x1_144, mul_x1_145, mul_x1_146, mul_x1_147, mul_x1_148, mul_x1_149,
+ mul_x1_150, mul_x1_151, mul_x1_152, mul_x1_153, mul_x1_154, mul_x1_155,
+ mul_x1_156, mul_x1_157, mul_x1_158, mul_x1_159, mul_x1_160, mul_x1_161,
+ mul_x1_162, mul_x1_163, mul_x1_164, mul_x1_165, mul_x1_166, mul_x1_167,
+ mul_x1_168, mul_x1_169, mul_x1_170, mul_x1_171, mul_x1_172, mul_x1_173,
+ mul_x1_174, mul_x1_175, mul_x1_176, mul_x1_177, mul_x1_178, mul_x1_179,
+ mul_x1_180, mul_x1_181, mul_x1_182, mul_x1_183, mul_x1_184, mul_x1_185,
+ mul_x1_186, mul_x1_187, mul_x1_188, mul_x1_189, mul_x1_190, mul_x1_191,
+ mul_x1_192, mul_x1_193, mul_x1_194, mul_x1_195, mul_x1_196, mul_x1_197,
+ mul_x1_198, mul_x1_199, mul_x1_200, mul_x1_201, mul_x1_202, mul_x1_203,
+ mul_x1_204, mul_x1_205, mul_x1_206, mul_x1_207, mul_x1_208, mul_x1_209,
+ mul_x1_210, mul_x1_211, mul_x1_212, mul_x1_213, mul_x1_214, mul_x1_215,
+ mul_x1_216, mul_x1_217, mul_x1_218, mul_x1_219, mul_x1_220, mul_x1_221,
+ mul_x1_222, mul_x1_223, mul_x1_224, mul_x1_225, mul_x1_226, mul_x1_227,
+ mul_x1_228, mul_x1_229, mul_x1_230, mul_x1_231, mul_x1_232, mul_x1_233,
+ mul_x1_234, mul_x1_235, mul_x1_236, mul_x1_237, mul_x1_238, mul_x1_239,
+ mul_x1_240, mul_x1_241, mul_x1_242, mul_x1_243, mul_x1_244, mul_x1_245,
+ mul_x1_246, mul_x1_247, mul_x1_248, mul_x1_249, mul_x1_250, mul_x1_251,
+ mul_x1_252, mul_x1_253, mul_x1_254, mul_x1_255
+};
+
+static const mul_fn_ptr_t __attribute__((aligned(256)))
+gf_x2_mul_fns[256] = {
+ mul_x2_0, mul_x2_1, mul_x2_2, mul_x2_3, mul_x2_4, mul_x2_5,
+ mul_x2_6, mul_x2_7, mul_x2_8, mul_x2_9, mul_x2_10, mul_x2_11,
+ mul_x2_12, mul_x2_13, mul_x2_14, mul_x2_15, mul_x2_16, mul_x2_17,
+ mul_x2_18, mul_x2_19, mul_x2_20, mul_x2_21, mul_x2_22, mul_x2_23,
+ mul_x2_24, mul_x2_25, mul_x2_26, mul_x2_27, mul_x2_28, mul_x2_29,
+ mul_x2_30, mul_x2_31, mul_x2_32, mul_x2_33, mul_x2_34, mul_x2_35,
+ mul_x2_36, mul_x2_37, mul_x2_38, mul_x2_39, mul_x2_40, mul_x2_41,
+ mul_x2_42, mul_x2_43, mul_x2_44, mul_x2_45, mul_x2_46, mul_x2_47,
+ mul_x2_48, mul_x2_49, mul_x2_50, mul_x2_51, mul_x2_52, mul_x2_53,
+ mul_x2_54, mul_x2_55, mul_x2_56, mul_x2_57, mul_x2_58, mul_x2_59,
+ mul_x2_60, mul_x2_61, mul_x2_62, mul_x2_63, mul_x2_64, mul_x2_65,
+ mul_x2_66, mul_x2_67, mul_x2_68, mul_x2_69, mul_x2_70, mul_x2_71,
+ mul_x2_72, mul_x2_73, mul_x2_74, mul_x2_75, mul_x2_76, mul_x2_77,
+ mul_x2_78, mul_x2_79, mul_x2_80, mul_x2_81, mul_x2_82, mul_x2_83,
+ mul_x2_84, mul_x2_85, mul_x2_86, mul_x2_87, mul_x2_88, mul_x2_89,
+ mul_x2_90, mul_x2_91, mul_x2_92, mul_x2_93, mul_x2_94, mul_x2_95,
+ mul_x2_96, mul_x2_97, mul_x2_98, mul_x2_99, mul_x2_100, mul_x2_101,
+ mul_x2_102, mul_x2_103, mul_x2_104, mul_x2_105, mul_x2_106, mul_x2_107,
+ mul_x2_108, mul_x2_109, mul_x2_110, mul_x2_111, mul_x2_112, mul_x2_113,
+ mul_x2_114, mul_x2_115, mul_x2_116, mul_x2_117, mul_x2_118, mul_x2_119,
+ mul_x2_120, mul_x2_121, mul_x2_122, mul_x2_123, mul_x2_124, mul_x2_125,
+ mul_x2_126, mul_x2_127, mul_x2_128, mul_x2_129, mul_x2_130, mul_x2_131,
+ mul_x2_132, mul_x2_133, mul_x2_134, mul_x2_135, mul_x2_136, mul_x2_137,
+ mul_x2_138, mul_x2_139, mul_x2_140, mul_x2_141, mul_x2_142, mul_x2_143,
+ mul_x2_144, mul_x2_145, mul_x2_146, mul_x2_147, mul_x2_148, mul_x2_149,
+ mul_x2_150, mul_x2_151, mul_x2_152, mul_x2_153, mul_x2_154, mul_x2_155,
+ mul_x2_156, mul_x2_157, mul_x2_158, mul_x2_159, mul_x2_160, mul_x2_161,
+ mul_x2_162, mul_x2_163, mul_x2_164, mul_x2_165, mul_x2_166, mul_x2_167,
+ mul_x2_168, mul_x2_169, mul_x2_170, mul_x2_171, mul_x2_172, mul_x2_173,
+ mul_x2_174, mul_x2_175, mul_x2_176, mul_x2_177, mul_x2_178, mul_x2_179,
+ mul_x2_180, mul_x2_181, mul_x2_182, mul_x2_183, mul_x2_184, mul_x2_185,
+ mul_x2_186, mul_x2_187, mul_x2_188, mul_x2_189, mul_x2_190, mul_x2_191,
+ mul_x2_192, mul_x2_193, mul_x2_194, mul_x2_195, mul_x2_196, mul_x2_197,
+ mul_x2_198, mul_x2_199, mul_x2_200, mul_x2_201, mul_x2_202, mul_x2_203,
+ mul_x2_204, mul_x2_205, mul_x2_206, mul_x2_207, mul_x2_208, mul_x2_209,
+ mul_x2_210, mul_x2_211, mul_x2_212, mul_x2_213, mul_x2_214, mul_x2_215,
+ mul_x2_216, mul_x2_217, mul_x2_218, mul_x2_219, mul_x2_220, mul_x2_221,
+ mul_x2_222, mul_x2_223, mul_x2_224, mul_x2_225, mul_x2_226, mul_x2_227,
+ mul_x2_228, mul_x2_229, mul_x2_230, mul_x2_231, mul_x2_232, mul_x2_233,
+ mul_x2_234, mul_x2_235, mul_x2_236, mul_x2_237, mul_x2_238, mul_x2_239,
+ mul_x2_240, mul_x2_241, mul_x2_242, mul_x2_243, mul_x2_244, mul_x2_245,
+ mul_x2_246, mul_x2_247, mul_x2_248, mul_x2_249, mul_x2_250, mul_x2_251,
+ mul_x2_252, mul_x2_253, mul_x2_254, mul_x2_255
+};
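+/*
+ * The two arrays above are jump tables indexed by the GF(2^8)
+ * multiplication constant: entry c points at the pre-compiled routine
+ * that multiplies the staged xmm registers by c.  This trades a
+ * data-dependent multiply loop for a single indirect call per
+ * constant; the aligned(256) attribute presumably just keeps the
+ * table start cache-line friendly.
+ */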
+
+#define MUL(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 2: \
+ COPY(r, _mul_x2_in); \
+ gf_x2_mul_fns[c](); \
+ COPY(_mul_x2_acc, r); \
+ break; \
+ case 1: \
+ COPY(r, _mul_x1_in); \
+ gf_x1_mul_fns[c](); \
+ COPY(_mul_x1_acc, r); \
+ break; \
+ default: \
+ VERIFY(0); \
+ } \
+}
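+
+/*
+ * Worked example of the dispatch above (illustrative): MUL(29, 0, 1)
+ * hits the two-register case, so it copies xmm0/xmm1 into the fixed
+ * _mul_x2_in registers, calls mul_x2_29() -- the routine generated by
+ * MUL_x2_DEFINE(29) -- and then copies the _mul_x2_acc result back
+ * into xmm0/xmm1.
+ */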
+
+
+#define raidz_math_begin() kfpu_begin()
+#define raidz_math_end() kfpu_end()
+
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 4
+#define ZERO_DEFINE() {}
+#define ZERO_D 0, 1, 2, 3
+
+#define COPY_STRIDE 4
+#define COPY_DEFINE() {}
+#define COPY_D 0, 1, 2, 3
+
+#define ADD_STRIDE 4
+#define ADD_DEFINE() {}
+#define ADD_D 0, 1, 2, 3
+
+#define MUL_STRIDE 2
+#define MUL_DEFINE() MUL2_SETUP()
+#define MUL_D 0, 1
+
+#define GEN_P_STRIDE 4
+#define GEN_P_DEFINE() {}
+#define GEN_P_P 0, 1, 2, 3
+
+#define GEN_PQ_STRIDE 4
+#define GEN_PQ_DEFINE() {}
+#define GEN_PQ_D 0, 1, 2, 3
+#define GEN_PQ_C 4, 5, 6, 7
+
+#define GEN_PQR_STRIDE 4
+#define GEN_PQR_DEFINE() {}
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
+
+#define SYN_Q_DEFINE() {}
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
+
+#define SYN_R_DEFINE() {}
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
+
+#define SYN_PQ_DEFINE() {}
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
+
+#define REC_PQ_STRIDE 2
+#define REC_PQ_DEFINE() MUL2_SETUP()
+#define REC_PQ_X 0, 1
+#define REC_PQ_Y 2, 3
+#define REC_PQ_T 4, 5
+
+#define SYN_PR_DEFINE() {}
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_STRIDE 2
+#define REC_PR_DEFINE() MUL2_SETUP()
+#define REC_PR_X 0, 1
+#define REC_PR_Y 2, 3
+#define REC_PR_T 4, 5
+
+#define SYN_QR_DEFINE() {}
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_STRIDE 2
+#define REC_QR_DEFINE() MUL2_SETUP()
+#define REC_QR_X 0, 1
+#define REC_QR_Y 2, 3
+#define REC_QR_T 4, 5
+
+#define SYN_PQR_DEFINE() {}
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
+#define REC_PQR_STRIDE 1
+#define REC_PQR_DEFINE() MUL2_SETUP()
+#define REC_PQR_X 0
+#define REC_PQR_Y 1
+#define REC_PQR_Z 2
+#define REC_PQR_XS 3
+#define REC_PQR_YS 4
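+
+/*
+ * The *_STRIDE/*_DEFINE/register-list macros above parameterize the
+ * generic RAIDZ kernels in vdev_raidz_math_impl.h (included below):
+ * a stride is the number of 16-byte vector registers each loop
+ * iteration processes, *_DEFINE() runs per-kernel setup (e.g.
+ * MUL2_SETUP() to load the reduction mask), and the remaining lists
+ * assign xmm register numbers to each operand role.
+ */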
+
+
+#include <sys/vdev_raidz_impl.h>
+#include "vdev_raidz_math_impl.h"
+
+DEFINE_GEN_METHODS(sse2);
+DEFINE_REC_METHODS(sse2);
+
+static boolean_t
+raidz_will_sse2_work(void)
+{
+ return (kfpu_allowed() && zfs_sse_available() && zfs_sse2_available());
+}
+
+const raidz_impl_ops_t vdev_raidz_sse2_impl = {
+ .init = NULL,
+ .fini = NULL,
+ .gen = RAIDZ_GEN_METHODS(sse2),
+ .rec = RAIDZ_REC_METHODS(sse2),
+ .is_supported = &raidz_will_sse2_work,
+ .name = "sse2"
+};
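+
+/*
+ * A minimal sketch of how an ops vector like the one above is meant
+ * to be consumed (assuming the selection framework in
+ * vdev_raidz_math.c; names illustrative):
+ *
+ *	if (vdev_raidz_sse2_impl.is_supported())
+ *		gen_p = vdev_raidz_sse2_impl.gen[RAIDZ_GEN_P];
+ *
+ * i.e. is_supported() gates on CPU features, after which parity
+ * generation and reconstruction call through the gen[]/rec[] arrays.
+ */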
+
+#elif defined(__i386)
+
+/* 32-bit stub for user-level fakekernel dependencies */
+#include <sys/vdev_raidz_impl.h>
+const raidz_impl_ops_t vdev_raidz_sse2_impl = {
+ .init = NULL,
+ .fini = NULL,
+ .gen = NULL,
+ .rec = NULL,
+ .is_supported = NULL,
+	.name = "sse2"
+};
+
+#endif /* defined(__amd64) */
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz_math_ssse3.c b/usr/src/uts/common/fs/zfs/vdev_raidz_math_ssse3.c
new file mode 100644
index 0000000000..03d51901cb
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz_math_ssse3.c
@@ -0,0 +1,2483 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/isa_defs.h>
+
+#if defined(__amd64)
+
+#include <sys/types.h>
+#include <sys/simd.h>
+
+#define __asm __asm__ __volatile__
+
+#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
+#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
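+
+/*
+ * REG_CNT() counts its 1..8 variadic arguments by right-padding:
+ * REG_CNT(a, b) expands to _REG_CNT(a, b, 8, 7, 6, 5, 4, 3, 2, 1),
+ * and _REG_CNT() selects its 9th argument -- here 2 -- as N.
+ */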
+
+#define VR0_(REG, ...) "xmm"#REG
+#define VR1_(_1, REG, ...) "xmm"#REG
+#define VR2_(_1, _2, REG, ...) "xmm"#REG
+#define VR3_(_1, _2, _3, REG, ...) "xmm"#REG
+#define VR4_(_1, _2, _3, _4, REG, ...) "xmm"#REG
+#define VR5_(_1, _2, _3, _4, _5, REG, ...) "xmm"#REG
+#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "xmm"#REG
+#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "xmm"#REG
+
+#define VR0(r...) VR0_(r)
+#define VR1(r...) VR1_(r)
+#define VR2(r...) VR2_(r, 1)
+#define VR3(r...) VR3_(r, 1, 2)
+#define VR4(r...) VR4_(r, 1, 2)
+#define VR5(r...) VR5_(r, 1, 2, 3)
+#define VR6(r...) VR6_(r, 1, 2, 3, 4)
+#define VR7(r...) VR7_(r, 1, 2, 3, 4, 5)
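+
+/*
+ * VRn() yields the assembler name ("xmmN") of the n-th register in
+ * the argument list, e.g. VR1(0, 1) expands to VR1_(0, 1) and then to
+ * "xmm1".  The trailing constants only pad short lists so the inner
+ * macro always receives enough arguments; they are never selected
+ * when enough real register numbers are passed.
+ */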
+
+#define R_01(REG1, REG2, ...) REG1, REG2
+#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3
+#define R_23(REG...) _R_23(REG, 1, 2, 3)
+
+#define ZFS_ASM_BUG() ASSERT(0)
+
+const uint8_t gf_clmul_mod_lt[4*256][16];
+
+#define ELEM_SIZE 16
+
+typedef struct v {
+ uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
+} v_t;
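+
+/*
+ * ELEM_SIZE matches one xmm register, so RAIDZ column buffers are
+ * processed as arrays of 16-byte v_t elements.
+ */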
+
+
+#define XOR_ACC(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "pxor 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "pxor 0x10(%[SRC]), %%" VR1(r) "\n" \
+ "pxor 0x20(%[SRC]), %%" VR2(r) "\n" \
+ "pxor 0x30(%[SRC]), %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 2: \
+ __asm( \
+ "pxor 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "pxor 0x10(%[SRC]), %%" VR1(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define XOR(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "pxor %" VR0(r) ", %" VR4(r) "\n" \
+ "pxor %" VR1(r) ", %" VR5(r) "\n" \
+ "pxor %" VR2(r) ", %" VR6(r) "\n" \
+ "pxor %" VR3(r) ", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "pxor %" VR0(r) ", %" VR2(r) "\n" \
+ "pxor %" VR1(r) ", %" VR3(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define ZERO(r...) XOR(r, r)
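+
+/*
+ * ZERO() clears registers via the x ^ x == 0 identity: ZERO(0, 1, 2, 3)
+ * duplicates the list into XOR's eight-register case, emitting
+ * "pxor %xmmN, %xmmN" for each register in the list.
+ */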
+
+#define COPY(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "movdqa %" VR0(r) ", %" VR4(r) "\n" \
+ "movdqa %" VR1(r) ", %" VR5(r) "\n" \
+ "movdqa %" VR2(r) ", %" VR6(r) "\n" \
+ "movdqa %" VR3(r) ", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "movdqa %" VR0(r) ", %" VR2(r) "\n" \
+ "movdqa %" VR1(r) ", %" VR3(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define LOAD(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "movdqa 0x10(%[SRC]), %%" VR1(r) "\n" \
+ "movdqa 0x20(%[SRC]), %%" VR2(r) "\n" \
+ "movdqa 0x30(%[SRC]), %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 2: \
+ __asm( \
+ "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "movdqa 0x10(%[SRC]), %%" VR1(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define STORE(dst, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "movdqa %%" VR0(r)", 0x00(%[DST])\n" \
+ "movdqa %%" VR1(r)", 0x10(%[DST])\n" \
+ "movdqa %%" VR2(r)", 0x20(%[DST])\n" \
+ "movdqa %%" VR3(r)", 0x30(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ case 2: \
+ __asm( \
+ "movdqa %%" VR0(r)", 0x00(%[DST])\n" \
+ "movdqa %%" VR1(r)", 0x10(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
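+
+/*
+ * LOAD()/STORE() move two or four 16-byte vectors (32 or 64 bytes)
+ * per invocation.  movdqa requires 16-byte-aligned addresses, which
+ * the aligned(ELEM_SIZE) attribute on v_t guarantees for column data.
+ */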
+
+#define MUL2_SETUP() \
+{ \
+ __asm( \
+ "movd %[mask], %%xmm15\n" \
+ "pshufd $0x0, %%xmm15, %%xmm15\n" \
+ : : [mask] "r" (0x1d1d1d1d)); \
+}
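+
+/*
+ * MUL2_SETUP() broadcasts the byte 0x1d into all 16 lanes of xmm15.
+ * 0x1d is the low byte of the RAIDZ generator polynomial
+ * x^8 + x^4 + x^3 + x^2 + 1 (0x11d), i.e. the value XORed in whenever
+ * a doubling overflows out of 8 bits.
+ */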
+
+#define _MUL2_x2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 2: \
+ __asm( \
+ "pxor %xmm14, %xmm14\n" \
+ "pxor %xmm13, %xmm13\n" \
+ "pcmpgtb %" VR0(r)", %xmm14\n" \
+ "pcmpgtb %" VR1(r)", %xmm13\n" \
+ "pand %xmm15, %xmm14\n" \
+ "pand %xmm15, %xmm13\n" \
+ "paddb %" VR0(r)", %" VR0(r) "\n" \
+ "paddb %" VR1(r)", %" VR1(r) "\n" \
+ "pxor %xmm14, %" VR0(r) "\n" \
+ "pxor %xmm13, %" VR1(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ _MUL2_x2(R_01(r)); \
+ _MUL2_x2(R_23(r)); \
+ break; \
+ case 2: \
+ _MUL2_x2(r); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
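+
+/*
+ * Per byte, MUL2() multiplies by x (i.e. by 2) in GF(2^8) mod 0x11d.
+ * A scalar sketch of the same operation (illustrative only):
+ *
+ *	static inline uint8_t
+ *	gf_mul2(uint8_t a)
+ *	{
+ *		return ((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
+ *	}
+ *
+ * The vector code derives the conditional mask branch-free: pcmpgtb
+ * against zero yields 0xff in every lane whose sign bit is set, pand
+ * reduces that to 0x1d, paddb doubles, and pxor applies the
+ * reduction.  MUL4() below is then simply two successive doublings.
+ */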
+
+#define MUL4(r...) \
+{ \
+ MUL2(r); \
+ MUL2(r); \
+}
+
+#define _0f "xmm15"
+#define _a_save "xmm14"
+#define _b_save "xmm13"
+#define _lt_mod_a "xmm12"
+#define _lt_clmul_a "xmm11"
+#define _lt_mod_b "xmm10"
+#define _lt_clmul_b "xmm15"
+
+#define _MULx2(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 2: \
+ __asm( \
+ /* lts for upper part */ \
+ "movd %[mask], %%" _0f "\n" \
+ "pshufd $0x0, %%" _0f ", %%" _0f "\n" \
+ "movdqa 0x00(%[lt]), %%" _lt_mod_a "\n" \
+ "movdqa 0x10(%[lt]), %%" _lt_clmul_a "\n" \
+ /* upper part */ \
+ "movdqa %%" VR0(r) ", %%" _a_save "\n" \
+ "movdqa %%" VR1(r) ", %%" _b_save "\n" \
+ "psraw $0x4, %%" VR0(r) "\n" \
+ "psraw $0x4, %%" VR1(r) "\n" \
+ "pand %%" _0f ", %%" _a_save "\n" \
+ "pand %%" _0f ", %%" _b_save "\n" \
+ "pand %%" _0f ", %%" VR0(r) "\n" \
+ "pand %%" _0f ", %%" VR1(r) "\n" \
+ \
+ "movdqa %%" _lt_mod_a ", %%" _lt_mod_b "\n" \
+ "movdqa %%" _lt_clmul_a ", %%" _lt_clmul_b "\n" \
+ \
+ "pshufb %%" VR0(r) ",%%" _lt_mod_a "\n" \
+ "pshufb %%" VR1(r) ",%%" _lt_mod_b "\n" \
+ "pshufb %%" VR0(r) ",%%" _lt_clmul_a "\n" \
+ "pshufb %%" VR1(r) ",%%" _lt_clmul_b "\n" \
+ \
+ "pxor %%" _lt_mod_a ",%%" _lt_clmul_a "\n" \
+ "pxor %%" _lt_mod_b ",%%" _lt_clmul_b "\n" \
+ "movdqa %%" _lt_clmul_a ",%%" VR0(r) "\n" \
+ "movdqa %%" _lt_clmul_b ",%%" VR1(r) "\n" \
+ /* lts for lower part */ \
+ "movdqa 0x20(%[lt]), %%" _lt_mod_a "\n" \
+ "movdqa 0x30(%[lt]), %%" _lt_clmul_a "\n" \
+ "movdqa %%" _lt_mod_a ", %%" _lt_mod_b "\n" \
+ "movdqa %%" _lt_clmul_a ", %%" _lt_clmul_b "\n" \
+ /* lower part */ \
+ "pshufb %%" _a_save ",%%" _lt_mod_a "\n" \
+ "pshufb %%" _b_save ",%%" _lt_mod_b "\n" \
+ "pshufb %%" _a_save ",%%" _lt_clmul_a "\n" \
+ "pshufb %%" _b_save ",%%" _lt_clmul_b "\n" \
+ \
+ "pxor %%" _lt_mod_a ",%%" VR0(r) "\n" \
+ "pxor %%" _lt_mod_b ",%%" VR1(r) "\n" \
+ "pxor %%" _lt_clmul_a ",%%" VR0(r) "\n" \
+ "pxor %%" _lt_clmul_b ",%%" VR1(r) "\n" \
+ : : [mask] "r" (0x0f0f0f0f), \
+ [lt] "r" (gf_clmul_mod_lt[4*(c)])); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
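+
+/*
+ * _MULx2() exploits the fact that multiplication by a constant c is
+ * linear over GF(2): each input byte is split into its low and high
+ * nibbles, each nibble indexes a 16-entry pshufb lookup, and the four
+ * partial results are XORed together.  Per byte this is roughly
+ * (illustrative; lt == gf_clmul_mod_lt[4 * c]):
+ *
+ *	out = lt[0][a >> 4] ^ lt[1][a >> 4]	(high-nibble tables)
+ *	    ^ lt[2][a & 0xf] ^ lt[3][a & 0xf];	(low-nibble tables)
+ *
+ * where each nibble's pair of tables holds the "mod" (reduction) and
+ * "clmul" (carry-less product) halves of the multiply.
+ */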
+
+#define MUL(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ _MULx2(c, R_23(r)); \
+ _MULx2(c, R_01(r)); \
+ break; \
+ case 2: \
+ _MULx2(c, R_01(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define raidz_math_begin() kfpu_begin()
+#define raidz_math_end() kfpu_end()
+
+
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 4
+#define ZERO_DEFINE() {}
+#define ZERO_D 0, 1, 2, 3
+
+#define COPY_STRIDE 4
+#define COPY_DEFINE() {}
+#define COPY_D 0, 1, 2, 3
+
+#define ADD_STRIDE 4
+#define ADD_DEFINE() {}
+#define ADD_D 0, 1, 2, 3
+
+#define MUL_STRIDE 4
+#define MUL_DEFINE() {}
+#define MUL_D 0, 1, 2, 3
+
+#define GEN_P_STRIDE 4
+#define GEN_P_DEFINE() {}
+#define GEN_P_P 0, 1, 2, 3
+
+#define GEN_PQ_STRIDE 4
+#define GEN_PQ_DEFINE() {}
+#define GEN_PQ_D 0, 1, 2, 3
+#define GEN_PQ_C 4, 5, 6, 7
+
+#define GEN_PQR_STRIDE 4
+#define GEN_PQR_DEFINE() {}
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
+
+#define SYN_Q_DEFINE() {}
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
+
+#define SYN_R_DEFINE() {}
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
+
+#define SYN_PQ_DEFINE() {}
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
+
+#define REC_PQ_STRIDE 2
+#define REC_PQ_DEFINE() {}
+#define REC_PQ_X 0, 1
+#define REC_PQ_Y 2, 3
+#define REC_PQ_T 4, 5
+
+#define SYN_PR_DEFINE() {}
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_STRIDE 2
+#define REC_PR_DEFINE() {}
+#define REC_PR_X 0, 1
+#define REC_PR_Y 2, 3
+#define REC_PR_T 4, 5
+
+#define SYN_QR_DEFINE() {}
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_STRIDE 2
+#define REC_QR_DEFINE() {}
+#define REC_QR_X 0, 1
+#define REC_QR_Y 2, 3
+#define REC_QR_T 4, 5
+
+#define SYN_PQR_DEFINE() {}
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
+#define REC_PQR_STRIDE 2
+#define REC_PQR_DEFINE() {}
+#define REC_PQR_X 0, 1
+#define REC_PQR_Y 2, 3
+#define REC_PQR_Z 4, 5
+#define REC_PQR_XS 6, 7
+#define REC_PQR_YS 8, 9
+
+
+#include <sys/vdev_raidz_impl.h>
+#include "vdev_raidz_math_impl.h"
+
+DEFINE_GEN_METHODS(ssse3);
+DEFINE_REC_METHODS(ssse3);
+
+static boolean_t
+raidz_will_ssse3_work(void)
+{
+ return (kfpu_allowed() && zfs_sse_available() &&
+ zfs_sse2_available() && zfs_ssse3_available());
+}
+
+const raidz_impl_ops_t vdev_raidz_ssse3_impl = {
+ .init = NULL,
+ .fini = NULL,
+ .gen = RAIDZ_GEN_METHODS(ssse3),
+ .rec = RAIDZ_REC_METHODS(ssse3),
+ .is_supported = &raidz_will_ssse3_work,
+ .name = "ssse3"
+};
+
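+/*
+ * Layout of gf_clmul_mod_lt[]: for each constant c, rows 4c .. 4c+3
+ * hold the four 16-byte pshufb tables consumed by _MULx2() above --
+ * the reduction ("mod") and carry-less-product ("clmul") tables for
+ * the high nibble, followed by the same pair for the low nibble.
+ */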
+/* BEGIN CSTYLED */
+const uint8_t
+__attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] =
+{
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e,
+ 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09,
+ 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c,
+ 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x05, 0x0a, 0x0f, 0x14, 0x11, 0x1e, 0x1b,
+ 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x06, 0x0c, 0x0a, 0x18, 0x1e, 0x14, 0x12,
+ 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x07, 0x0e, 0x09, 0x1c, 0x1b, 0x12, 0x15,
+ 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38,
+ 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f,
+ 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0a, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36,
+ 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31,
+ 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0c, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24,
+ 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23,
+ 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a,
+ 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0f, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d,
+ 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
+ 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e,
+ 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79,
+ 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xd2, 0xcf, 0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c,
+ 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xd2, 0xcf, 0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b,
+ 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xcf, 0xd2, 0xf5, 0xe8, 0xa6, 0xbb, 0x9c, 0x81 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62,
+ 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xcf, 0xd2, 0xf5, 0xe8, 0xa6, 0xbb, 0x9c, 0x81 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65,
+ 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48,
+ 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f,
+ 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46,
+ 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41,
+ 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 0xd2, 0xcf },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54,
+ 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 0xd2, 0xcf },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a,
+ 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d,
+ 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7,
+ 0x08, 0x29, 0x4a, 0x6b, 0x8c, 0xad, 0xce, 0xef },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee,
+ 0x10, 0x32, 0x54, 0x76, 0x98, 0xba, 0xdc, 0xfe },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9,
+ 0x18, 0x3b, 0x5e, 0x7d, 0x94, 0xb7, 0xd2, 0xf1 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xf7, 0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc,
+ 0x20, 0x04, 0x68, 0x4c, 0xb0, 0x94, 0xf8, 0xdc },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xf7, 0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb,
+ 0x28, 0x0d, 0x62, 0x47, 0xbc, 0x99, 0xf6, 0xd3 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2,
+ 0x30, 0x16, 0x7c, 0x5a, 0xa8, 0x8e, 0xe4, 0xc2 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8,
+ 0x40, 0x68, 0x10, 0x38, 0xe0, 0xc8, 0xb0, 0x98 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf,
+ 0x48, 0x61, 0x1a, 0x33, 0xec, 0xc5, 0xbe, 0x97 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6,
+ 0x50, 0x7a, 0x04, 0x2e, 0xf8, 0xd2, 0xac, 0x86 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1,
+ 0x58, 0x73, 0x0e, 0x25, 0xf4, 0xdf, 0xa2, 0x89 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4,
+ 0x60, 0x4c, 0x38, 0x14, 0xd0, 0xfc, 0x88, 0xa4 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3,
+ 0x68, 0x45, 0x32, 0x1f, 0xdc, 0xf1, 0x86, 0xab },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca,
+ 0x70, 0x5e, 0x2c, 0x02, 0xc8, 0xe6, 0x94, 0xba },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd,
+ 0x78, 0x57, 0x26, 0x09, 0xc4, 0xeb, 0x9a, 0xb5 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97,
+ 0x88, 0xb9, 0xea, 0xdb, 0x4c, 0x7d, 0x2e, 0x1f },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e,
+ 0x90, 0xa2, 0xf4, 0xc6, 0x58, 0x6a, 0x3c, 0x0e },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99,
+ 0x98, 0xab, 0xfe, 0xcd, 0x54, 0x67, 0x32, 0x01 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 0xf7 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c,
+ 0xa0, 0x94, 0xc8, 0xfc, 0x70, 0x44, 0x18, 0x2c },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 0xf7 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b,
+ 0xa8, 0x9d, 0xc2, 0xf7, 0x7c, 0x49, 0x16, 0x23 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82,
+ 0xb0, 0x86, 0xdc, 0xea, 0x68, 0x5e, 0x04, 0x32 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85,
+ 0xb8, 0x8f, 0xd6, 0xe1, 0x64, 0x53, 0x0a, 0x3d },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8,
+ 0xc0, 0xf8, 0xb0, 0x88, 0x20, 0x18, 0x50, 0x68 },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf,
+ 0xc8, 0xf1, 0xba, 0x83, 0x2c, 0x15, 0x5e, 0x67 },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x4c, 0x6b, 0x1f, 0x38, 0xea, 0xcd, 0xb9, 0x9e },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x4c, 0x6b, 0x1f, 0x38, 0xea, 0xcd, 0xb9, 0x9e },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1,
+ 0xd8, 0xe3, 0xae, 0x95, 0x34, 0x0f, 0x42, 0x79 },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4,
+ 0xe0, 0xdc, 0x98, 0xa4, 0x10, 0x2c, 0x68, 0x54 },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3,
+ 0xe8, 0xd5, 0x92, 0xaf, 0x1c, 0x21, 0x66, 0x5b },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba,
+ 0xf0, 0xce, 0x8c, 0xb2, 0x08, 0x36, 0x74, 0x4a },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd,
+ 0xf8, 0xc7, 0x86, 0xb9, 0x04, 0x3b, 0x7a, 0x45 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x41, 0x82, 0xc3, 0x04, 0x45, 0x86, 0xc7,
+ 0x08, 0x49, 0x8a, 0xcb, 0x0c, 0x4d, 0x8e, 0xcf },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x42, 0x84, 0xc6, 0x08, 0x4a, 0x8c, 0xce,
+ 0x10, 0x52, 0x94, 0xd6, 0x18, 0x5a, 0x9c, 0xde },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x43, 0x86, 0xc5, 0x0c, 0x4f, 0x8a, 0xc9,
+ 0x18, 0x5b, 0x9e, 0xdd, 0x14, 0x57, 0x92, 0xd1 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x44, 0x88, 0xcc, 0x10, 0x54, 0x98, 0xdc,
+ 0x20, 0x64, 0xa8, 0xec, 0x30, 0x74, 0xb8, 0xfc },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x45, 0x8a, 0xcf, 0x14, 0x51, 0x9e, 0xdb,
+ 0x28, 0x6d, 0xa2, 0xe7, 0x3c, 0x79, 0xb6, 0xf3 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x46, 0x8c, 0xca, 0x18, 0x5e, 0x94, 0xd2,
+ 0x30, 0x76, 0xbc, 0xfa, 0x28, 0x6e, 0xa4, 0xe2 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x47, 0x8e, 0xc9, 0x1c, 0x5b, 0x92, 0xd5,
+ 0x38, 0x7f, 0xb6, 0xf1, 0x24, 0x63, 0xaa, 0xed },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x48, 0x90, 0xd8, 0x20, 0x68, 0xb0, 0xf8,
+ 0x40, 0x08, 0xd0, 0x98, 0x60, 0x28, 0xf0, 0xb8 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x49, 0x92, 0xdb, 0x24, 0x6d, 0xb6, 0xff,
+ 0x48, 0x01, 0xda, 0x93, 0x6c, 0x25, 0xfe, 0xb7 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4a, 0x94, 0xde, 0x28, 0x62, 0xbc, 0xf6,
+ 0x50, 0x1a, 0xc4, 0x8e, 0x78, 0x32, 0xec, 0xa6 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4b, 0x96, 0xdd, 0x2c, 0x67, 0xba, 0xf1,
+ 0x58, 0x13, 0xce, 0x85, 0x74, 0x3f, 0xe2, 0xa9 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 0xa2 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4c, 0x98, 0xd4, 0x30, 0x7c, 0xa8, 0xe4,
+ 0x60, 0x2c, 0xf8, 0xb4, 0x50, 0x1c, 0xc8, 0x84 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 0xa2 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4d, 0x9a, 0xd7, 0x34, 0x79, 0xae, 0xe3,
+ 0x68, 0x25, 0xf2, 0xbf, 0x5c, 0x11, 0xc6, 0x8b },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4f, 0x9e, 0xd1, 0x3c, 0x73, 0xa2, 0xed,
+ 0x78, 0x37, 0xe6, 0xa9, 0x44, 0x0b, 0xda, 0x95 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x51, 0xa2, 0xf3, 0x44, 0x15, 0xe6, 0xb7,
+ 0x88, 0xd9, 0x2a, 0x7b, 0xcc, 0x9d, 0x6e, 0x3f },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 0x70 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x52, 0xa4, 0xf6, 0x48, 0x1a, 0xec, 0xbe,
+ 0x90, 0xc2, 0x34, 0x66, 0xd8, 0x8a, 0x7c, 0x2e },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 0x70 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x54, 0xa8, 0xfc, 0x50, 0x04, 0xf8, 0xac,
+ 0xa0, 0xf4, 0x08, 0x5c, 0xf0, 0xa4, 0x58, 0x0c },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x55, 0xaa, 0xff, 0x54, 0x01, 0xfe, 0xab,
+ 0xa8, 0xfd, 0x02, 0x57, 0xfc, 0xa9, 0x56, 0x03 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x56, 0xac, 0xfa, 0x58, 0x0e, 0xf4, 0xa2,
+ 0xb0, 0xe6, 0x1c, 0x4a, 0xe8, 0xbe, 0x44, 0x12 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x57, 0xae, 0xf9, 0x5c, 0x0b, 0xf2, 0xa5,
+ 0xb8, 0xef, 0x16, 0x41, 0xe4, 0xb3, 0x4a, 0x1d },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x58, 0xb0, 0xe8, 0x60, 0x38, 0xd0, 0x88,
+ 0xc0, 0x98, 0x70, 0x28, 0xa0, 0xf8, 0x10, 0x48 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x59, 0xb2, 0xeb, 0x64, 0x3d, 0xd6, 0x8f,
+ 0xc8, 0x91, 0x7a, 0x23, 0xac, 0xf5, 0x1e, 0x47 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 0x4a, 0x23 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5a, 0xb4, 0xee, 0x68, 0x32, 0xdc, 0x86,
+ 0xd0, 0x8a, 0x64, 0x3e, 0xb8, 0xe2, 0x0c, 0x56 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 0x4a, 0x23 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5b, 0xb6, 0xed, 0x6c, 0x37, 0xda, 0x81,
+ 0xd8, 0x83, 0x6e, 0x35, 0xb4, 0xef, 0x02, 0x59 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5c, 0xb8, 0xe4, 0x70, 0x2c, 0xc8, 0x94,
+ 0xe0, 0xbc, 0x58, 0x04, 0x90, 0xcc, 0x28, 0x74 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5d, 0xba, 0xe7, 0x74, 0x29, 0xce, 0x93,
+ 0xe8, 0xb5, 0x52, 0x0f, 0x9c, 0xc1, 0x26, 0x7b },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5e, 0xbc, 0xe2, 0x78, 0x26, 0xc4, 0x9a,
+ 0xf0, 0xae, 0x4c, 0x12, 0x88, 0xd6, 0x34, 0x6a },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5f, 0xbe, 0xe1, 0x7c, 0x23, 0xc2, 0x9d,
+ 0xf8, 0xa7, 0x46, 0x19, 0x84, 0xdb, 0x3a, 0x65 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x61, 0xc2, 0xa3, 0x84, 0xe5, 0x46, 0x27,
+ 0x08, 0x69, 0xca, 0xab, 0x8c, 0xed, 0x4e, 0x2f },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x62, 0xc4, 0xa6, 0x88, 0xea, 0x4c, 0x2e,
+ 0x10, 0x72, 0xd4, 0xb6, 0x98, 0xfa, 0x5c, 0x3e },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x63, 0xc6, 0xa5, 0x8c, 0xef, 0x4a, 0x29,
+ 0x18, 0x7b, 0xde, 0xbd, 0x94, 0xf7, 0x52, 0x31 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x64, 0xc8, 0xac, 0x90, 0xf4, 0x58, 0x3c,
+ 0x20, 0x44, 0xe8, 0x8c, 0xb0, 0xd4, 0x78, 0x1c },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x65, 0xca, 0xaf, 0x94, 0xf1, 0x5e, 0x3b,
+ 0x28, 0x4d, 0xe2, 0x87, 0xbc, 0xd9, 0x76, 0x13 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x66, 0xcc, 0xaa, 0x98, 0xfe, 0x54, 0x32,
+ 0x30, 0x56, 0xfc, 0x9a, 0xa8, 0xce, 0x64, 0x02 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x67, 0xce, 0xa9, 0x9c, 0xfb, 0x52, 0x35,
+ 0x38, 0x5f, 0xf6, 0x91, 0xa4, 0xc3, 0x6a, 0x0d },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x68, 0xd0, 0xb8, 0xa0, 0xc8, 0x70, 0x18,
+ 0x40, 0x28, 0x90, 0xf8, 0xe0, 0x88, 0x30, 0x58 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6a, 0xd4, 0xbe, 0xa8, 0xc2, 0x7c, 0x16,
+ 0x50, 0x3a, 0x84, 0xee, 0xf8, 0x92, 0x2c, 0x46 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6b, 0xd6, 0xbd, 0xac, 0xc7, 0x7a, 0x11,
+ 0x58, 0x33, 0x8e, 0xe5, 0xf4, 0x9f, 0x22, 0x49 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6c, 0xd8, 0xb4, 0xb0, 0xdc, 0x68, 0x04,
+ 0x60, 0x0c, 0xb8, 0xd4, 0xd0, 0xbc, 0x08, 0x64 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6d, 0xda, 0xb7, 0xb4, 0xd9, 0x6e, 0x03,
+ 0x68, 0x05, 0xb2, 0xdf, 0xdc, 0xb1, 0x06, 0x6b },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6e, 0xdc, 0xb2, 0xb8, 0xd6, 0x64, 0x0a,
+ 0x70, 0x1e, 0xac, 0xc2, 0xc8, 0xa6, 0x14, 0x7a },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6f, 0xde, 0xb1, 0xbc, 0xd3, 0x62, 0x0d,
+ 0x78, 0x17, 0xa6, 0xc9, 0xc4, 0xab, 0x1a, 0x75 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x71, 0xe2, 0x93, 0xc4, 0xb5, 0x26, 0x57,
+ 0x88, 0xf9, 0x6a, 0x1b, 0x4c, 0x3d, 0xae, 0xdf },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x72, 0xe4, 0x96, 0xc8, 0xba, 0x2c, 0x5e,
+ 0x90, 0xe2, 0x74, 0x06, 0x58, 0x2a, 0xbc, 0xce },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x73, 0xe6, 0x95, 0xcc, 0xbf, 0x2a, 0x59,
+ 0x98, 0xeb, 0x7e, 0x0d, 0x54, 0x27, 0xb2, 0xc1 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x75, 0xea, 0x9f, 0xd4, 0xa1, 0x3e, 0x4b,
+ 0xa8, 0xdd, 0x42, 0x37, 0x7c, 0x09, 0x96, 0xe3 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x76, 0xec, 0x9a, 0xd8, 0xae, 0x34, 0x42,
+ 0xb0, 0xc6, 0x5c, 0x2a, 0x68, 0x1e, 0x84, 0xf2 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x77, 0xee, 0x99, 0xdc, 0xab, 0x32, 0x45,
+ 0xb8, 0xcf, 0x56, 0x21, 0x64, 0x13, 0x8a, 0xfd },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x78, 0xf0, 0x88, 0xe0, 0x98, 0x10, 0x68,
+ 0xc0, 0xb8, 0x30, 0x48, 0x20, 0x58, 0xd0, 0xa8 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x79, 0xf2, 0x8b, 0xe4, 0x9d, 0x16, 0x6f,
+ 0xc8, 0xb1, 0x3a, 0x43, 0x2c, 0x55, 0xde, 0xa7 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xcb, 0x98, 0x70, 0x23, 0xa0, 0xf3, 0x1b, 0x48 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7a, 0xf4, 0x8e, 0xe8, 0x92, 0x1c, 0x66,
+ 0xd0, 0xaa, 0x24, 0x5e, 0x38, 0x42, 0xcc, 0xb6 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xcb, 0x98, 0x70, 0x23, 0xa0, 0xf3, 0x1b, 0x48 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7b, 0xf6, 0x8d, 0xec, 0x97, 0x1a, 0x61,
+ 0xd8, 0xa3, 0x2e, 0x55, 0x34, 0x4f, 0xc2, 0xb9 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7c, 0xf8, 0x84, 0xf0, 0x8c, 0x08, 0x74,
+ 0xe0, 0x9c, 0x18, 0x64, 0x10, 0x6c, 0xe8, 0x94 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7d, 0xfa, 0x87, 0xf4, 0x89, 0x0e, 0x73,
+ 0xe8, 0x95, 0x12, 0x6f, 0x1c, 0x61, 0xe6, 0x9b },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7e, 0xfc, 0x82, 0xf8, 0x86, 0x04, 0x7a,
+ 0xf0, 0x8e, 0x0c, 0x72, 0x08, 0x76, 0xf4, 0x8a },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7f, 0xfe, 0x81, 0xfc, 0x83, 0x02, 0x7d,
+ 0xf8, 0x87, 0x06, 0x79, 0x04, 0x7b, 0xfa, 0x85 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x82, 0x04, 0x86, 0x08, 0x8a, 0x0c, 0x8e,
+ 0x10, 0x92, 0x14, 0x96, 0x18, 0x9a, 0x1c, 0x9e },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x83, 0x06, 0x85, 0x0c, 0x8f, 0x0a, 0x89,
+ 0x18, 0x9b, 0x1e, 0x9d, 0x14, 0x97, 0x12, 0x91 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x84, 0x08, 0x8c, 0x10, 0x94, 0x18, 0x9c,
+ 0x20, 0xa4, 0x28, 0xac, 0x30, 0xb4, 0x38, 0xbc },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x85, 0x0a, 0x8f, 0x14, 0x91, 0x1e, 0x9b,
+ 0x28, 0xad, 0x22, 0xa7, 0x3c, 0xb9, 0x36, 0xb3 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 0x8b },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x86, 0x0c, 0x8a, 0x18, 0x9e, 0x14, 0x92,
+ 0x30, 0xb6, 0x3c, 0xba, 0x28, 0xae, 0x24, 0xa2 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 0x8b },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x87, 0x0e, 0x89, 0x1c, 0x9b, 0x12, 0x95,
+ 0x38, 0xbf, 0x36, 0xb1, 0x24, 0xa3, 0x2a, 0xad },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x88, 0x10, 0x98, 0x20, 0xa8, 0x30, 0xb8,
+ 0x40, 0xc8, 0x50, 0xd8, 0x60, 0xe8, 0x70, 0xf8 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x89, 0x12, 0x9b, 0x24, 0xad, 0x36, 0xbf,
+ 0x48, 0xc1, 0x5a, 0xd3, 0x6c, 0xe5, 0x7e, 0xf7 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8a, 0x14, 0x9e, 0x28, 0xa2, 0x3c, 0xb6,
+ 0x50, 0xda, 0x44, 0xce, 0x78, 0xf2, 0x6c, 0xe6 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8b, 0x16, 0x9d, 0x2c, 0xa7, 0x3a, 0xb1,
+ 0x58, 0xd3, 0x4e, 0xc5, 0x74, 0xff, 0x62, 0xe9 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8c, 0x18, 0x94, 0x30, 0xbc, 0x28, 0xa4,
+ 0x60, 0xec, 0x78, 0xf4, 0x50, 0xdc, 0x48, 0xc4 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8d, 0x1a, 0x97, 0x34, 0xb9, 0x2e, 0xa3,
+ 0x68, 0xe5, 0x72, 0xff, 0x5c, 0xd1, 0x46, 0xcb },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8e, 0x1c, 0x92, 0x38, 0xb6, 0x24, 0xaa,
+ 0x70, 0xfe, 0x6c, 0xe2, 0x48, 0xc6, 0x54, 0xda },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8f, 0x1e, 0x91, 0x3c, 0xb3, 0x22, 0xad,
+ 0x78, 0xf7, 0x66, 0xe9, 0x44, 0xcb, 0x5a, 0xd5 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x91, 0x22, 0xb3, 0x44, 0xd5, 0x66, 0xf7,
+ 0x88, 0x19, 0xaa, 0x3b, 0xcc, 0x5d, 0xee, 0x7f },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x92, 0x24, 0xb6, 0x48, 0xda, 0x6c, 0xfe,
+ 0x90, 0x02, 0xb4, 0x26, 0xd8, 0x4a, 0xfc, 0x6e },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x93, 0x26, 0xb5, 0x4c, 0xdf, 0x6a, 0xf9,
+ 0x98, 0x0b, 0xbe, 0x2d, 0xd4, 0x47, 0xf2, 0x61 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x94, 0x28, 0xbc, 0x50, 0xc4, 0x78, 0xec,
+ 0xa0, 0x34, 0x88, 0x1c, 0xf0, 0x64, 0xd8, 0x4c },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x95, 0x2a, 0xbf, 0x54, 0xc1, 0x7e, 0xeb,
+ 0xa8, 0x3d, 0x82, 0x17, 0xfc, 0x69, 0xd6, 0x43 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x96, 0x2c, 0xba, 0x58, 0xce, 0x74, 0xe2,
+ 0xb0, 0x26, 0x9c, 0x0a, 0xe8, 0x7e, 0xc4, 0x52 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x97, 0x2e, 0xb9, 0x5c, 0xcb, 0x72, 0xe5,
+ 0xb8, 0x2f, 0x96, 0x01, 0xe4, 0x73, 0xca, 0x5d },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x98, 0x30, 0xa8, 0x60, 0xf8, 0x50, 0xc8,
+ 0xc0, 0x58, 0xf0, 0x68, 0xa0, 0x38, 0x90, 0x08 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x99, 0x32, 0xab, 0x64, 0xfd, 0x56, 0xcf,
+ 0xc8, 0x51, 0xfa, 0x63, 0xac, 0x35, 0x9e, 0x07 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9a, 0x34, 0xae, 0x68, 0xf2, 0x5c, 0xc6,
+ 0xd0, 0x4a, 0xe4, 0x7e, 0xb8, 0x22, 0x8c, 0x16 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9b, 0x36, 0xad, 0x6c, 0xf7, 0x5a, 0xc1,
+ 0xd8, 0x43, 0xee, 0x75, 0xb4, 0x2f, 0x82, 0x19 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xb5, 0x40, 0x5f, 0xaa, 0x61, 0x94, 0x8b, 0x7e },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xb5, 0x40, 0x5f, 0xaa, 0x61, 0x94, 0x8b, 0x7e },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9d, 0x3a, 0xa7, 0x74, 0xe9, 0x4e, 0xd3,
+ 0xe8, 0x75, 0xd2, 0x4f, 0x9c, 0x01, 0xa6, 0x3b },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9e, 0x3c, 0xa2, 0x78, 0xe6, 0x44, 0xda,
+ 0xf0, 0x6e, 0xcc, 0x52, 0x88, 0x16, 0xb4, 0x2a },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9f, 0x3e, 0xa1, 0x7c, 0xe3, 0x42, 0xdd,
+ 0xf8, 0x67, 0xc6, 0x59, 0x84, 0x1b, 0xba, 0x25 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa1, 0x42, 0xe3, 0x84, 0x25, 0xc6, 0x67,
+ 0x08, 0xa9, 0x4a, 0xeb, 0x8c, 0x2d, 0xce, 0x6f },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa2, 0x44, 0xe6, 0x88, 0x2a, 0xcc, 0x6e,
+ 0x10, 0xb2, 0x54, 0xf6, 0x98, 0x3a, 0xdc, 0x7e },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa3, 0x46, 0xe5, 0x8c, 0x2f, 0xca, 0x69,
+ 0x18, 0xbb, 0x5e, 0xfd, 0x94, 0x37, 0xd2, 0x71 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa4, 0x48, 0xec, 0x90, 0x34, 0xd8, 0x7c,
+ 0x20, 0x84, 0x68, 0xcc, 0xb0, 0x14, 0xf8, 0x5c },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa5, 0x4a, 0xef, 0x94, 0x31, 0xde, 0x7b,
+ 0x28, 0x8d, 0x62, 0xc7, 0xbc, 0x19, 0xf6, 0x53 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa7, 0x4e, 0xe9, 0x9c, 0x3b, 0xd2, 0x75,
+ 0x38, 0x9f, 0x76, 0xd1, 0xa4, 0x03, 0xea, 0x4d },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa8, 0x50, 0xf8, 0xa0, 0x08, 0xf0, 0x58,
+ 0x40, 0xe8, 0x10, 0xb8, 0xe0, 0x48, 0xb0, 0x18 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa9, 0x52, 0xfb, 0xa4, 0x0d, 0xf6, 0x5f,
+ 0x48, 0xe1, 0x1a, 0xb3, 0xec, 0x45, 0xbe, 0x17 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xb7, 0x65, 0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xaa, 0x54, 0xfe, 0xa8, 0x02, 0xfc, 0x56,
+ 0x50, 0xfa, 0x04, 0xae, 0xf8, 0x52, 0xac, 0x06 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xb7, 0x65, 0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xab, 0x56, 0xfd, 0xac, 0x07, 0xfa, 0x51,
+ 0x58, 0xf3, 0x0e, 0xa5, 0xf4, 0x5f, 0xa2, 0x09 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xac, 0x58, 0xf4, 0xb0, 0x1c, 0xe8, 0x44,
+ 0x60, 0xcc, 0x38, 0x94, 0xd0, 0x7c, 0x88, 0x24 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xad, 0x5a, 0xf7, 0xb4, 0x19, 0xee, 0x43,
+ 0x68, 0xc5, 0x32, 0x9f, 0xdc, 0x71, 0x86, 0x2b },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xae, 0x5c, 0xf2, 0xb8, 0x16, 0xe4, 0x4a,
+ 0x70, 0xde, 0x2c, 0x82, 0xc8, 0x66, 0x94, 0x3a },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xaf, 0x5e, 0xf1, 0xbc, 0x13, 0xe2, 0x4d,
+ 0x78, 0xd7, 0x26, 0x89, 0xc4, 0x6b, 0x9a, 0x35 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb1, 0x62, 0xd3, 0xc4, 0x75, 0xa6, 0x17,
+ 0x88, 0x39, 0xea, 0x5b, 0x4c, 0xfd, 0x2e, 0x9f },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb2, 0x64, 0xd6, 0xc8, 0x7a, 0xac, 0x1e,
+ 0x90, 0x22, 0xf4, 0x46, 0x58, 0xea, 0x3c, 0x8e },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb3, 0x66, 0xd5, 0xcc, 0x7f, 0xaa, 0x19,
+ 0x98, 0x2b, 0xfe, 0x4d, 0x54, 0xe7, 0x32, 0x81 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb4, 0x68, 0xdc, 0xd0, 0x64, 0xb8, 0x0c,
+ 0xa0, 0x14, 0xc8, 0x7c, 0x70, 0xc4, 0x18, 0xac },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb5, 0x6a, 0xdf, 0xd4, 0x61, 0xbe, 0x0b,
+ 0xa8, 0x1d, 0xc2, 0x77, 0x7c, 0xc9, 0x16, 0xa3 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb6, 0x6c, 0xda, 0xd8, 0x6e, 0xb4, 0x02,
+ 0xb0, 0x06, 0xdc, 0x6a, 0x68, 0xde, 0x04, 0xb2 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb7, 0x6e, 0xd9, 0xdc, 0x6b, 0xb2, 0x05,
+ 0xb8, 0x0f, 0xd6, 0x61, 0x64, 0xd3, 0x0a, 0xbd },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb8, 0x70, 0xc8, 0xe0, 0x58, 0x90, 0x28,
+ 0xc0, 0x78, 0xb0, 0x08, 0x20, 0x98, 0x50, 0xe8 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb9, 0x72, 0xcb, 0xe4, 0x5d, 0x96, 0x2f,
+ 0xc8, 0x71, 0xba, 0x03, 0x2c, 0x95, 0x5e, 0xe7 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xba, 0x74, 0xce, 0xe8, 0x52, 0x9c, 0x26,
+ 0xd0, 0x6a, 0xa4, 0x1e, 0x38, 0x82, 0x4c, 0xf6 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbc, 0x78, 0xc4, 0xf0, 0x4c, 0x88, 0x34,
+ 0xe0, 0x5c, 0x98, 0x24, 0x10, 0xac, 0x68, 0xd4 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbd, 0x7a, 0xc7, 0xf4, 0x49, 0x8e, 0x33,
+ 0xe8, 0x55, 0x92, 0x2f, 0x1c, 0xa1, 0x66, 0xdb },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbe, 0x7c, 0xc2, 0xf8, 0x46, 0x84, 0x3a,
+ 0xf0, 0x4e, 0x8c, 0x32, 0x08, 0xb6, 0x74, 0xca },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbf, 0x7e, 0xc1, 0xfc, 0x43, 0x82, 0x3d,
+ 0xf8, 0x47, 0x86, 0x39, 0x04, 0xbb, 0x7a, 0xc5 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc1, 0x82, 0x43, 0x04, 0xc5, 0x86, 0x47,
+ 0x08, 0xc9, 0x8a, 0x4b, 0x0c, 0xcd, 0x8e, 0x4f },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc2, 0x84, 0x46, 0x08, 0xca, 0x8c, 0x4e,
+ 0x10, 0xd2, 0x94, 0x56, 0x18, 0xda, 0x9c, 0x5e },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc3, 0x86, 0x45, 0x0c, 0xcf, 0x8a, 0x49,
+ 0x18, 0xdb, 0x9e, 0x5d, 0x14, 0xd7, 0x92, 0x51 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc4, 0x88, 0x4c, 0x10, 0xd4, 0x98, 0x5c,
+ 0x20, 0xe4, 0xa8, 0x6c, 0x30, 0xf4, 0xb8, 0x7c },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc5, 0x8a, 0x4f, 0x14, 0xd1, 0x9e, 0x5b,
+ 0x28, 0xed, 0xa2, 0x67, 0x3c, 0xf9, 0xb6, 0x73 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc6, 0x8c, 0x4a, 0x18, 0xde, 0x94, 0x52,
+ 0x30, 0xf6, 0xbc, 0x7a, 0x28, 0xee, 0xa4, 0x62 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc7, 0x8e, 0x49, 0x1c, 0xdb, 0x92, 0x55,
+ 0x38, 0xff, 0xb6, 0x71, 0x24, 0xe3, 0xaa, 0x6d },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc8, 0x90, 0x58, 0x20, 0xe8, 0xb0, 0x78,
+ 0x40, 0x88, 0xd0, 0x18, 0x60, 0xa8, 0xf0, 0x38 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc9, 0x92, 0x5b, 0x24, 0xed, 0xb6, 0x7f,
+ 0x48, 0x81, 0xda, 0x13, 0x6c, 0xa5, 0xfe, 0x37 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xca, 0x94, 0x5e, 0x28, 0xe2, 0xbc, 0x76,
+ 0x50, 0x9a, 0xc4, 0x0e, 0x78, 0xb2, 0xec, 0x26 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcb, 0x96, 0x5d, 0x2c, 0xe7, 0xba, 0x71,
+ 0x58, 0x93, 0xce, 0x05, 0x74, 0xbf, 0xe2, 0x29 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcc, 0x98, 0x54, 0x30, 0xfc, 0xa8, 0x64,
+ 0x60, 0xac, 0xf8, 0x34, 0x50, 0x9c, 0xc8, 0x04 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcd, 0x9a, 0x57, 0x34, 0xf9, 0xae, 0x63,
+ 0x68, 0xa5, 0xf2, 0x3f, 0x5c, 0x91, 0xc6, 0x0b },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xce, 0x9c, 0x52, 0x38, 0xf6, 0xa4, 0x6a,
+ 0x70, 0xbe, 0xec, 0x22, 0x48, 0x86, 0xd4, 0x1a },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd1, 0xa2, 0x73, 0x44, 0x95, 0xe6, 0x37,
+ 0x88, 0x59, 0x2a, 0xfb, 0xcc, 0x1d, 0x6e, 0xbf },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd3, 0xa6, 0x75, 0x4c, 0x9f, 0xea, 0x39,
+ 0x98, 0x4b, 0x3e, 0xed, 0xd4, 0x07, 0x72, 0xa1 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd4, 0xa8, 0x7c, 0x50, 0x84, 0xf8, 0x2c,
+ 0xa0, 0x74, 0x08, 0xdc, 0xf0, 0x24, 0x58, 0x8c },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd5, 0xaa, 0x7f, 0x54, 0x81, 0xfe, 0x2b,
+ 0xa8, 0x7d, 0x02, 0xd7, 0xfc, 0x29, 0x56, 0x83 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd6, 0xac, 0x7a, 0x58, 0x8e, 0xf4, 0x22,
+ 0xb0, 0x66, 0x1c, 0xca, 0xe8, 0x3e, 0x44, 0x92 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd7, 0xae, 0x79, 0x5c, 0x8b, 0xf2, 0x25,
+ 0xb8, 0x6f, 0x16, 0xc1, 0xe4, 0x33, 0x4a, 0x9d },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd8, 0xb0, 0x68, 0x60, 0xb8, 0xd0, 0x08,
+ 0xc0, 0x18, 0x70, 0xa8, 0xa0, 0x78, 0x10, 0xc8 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd9, 0xb2, 0x6b, 0x64, 0xbd, 0xd6, 0x0f,
+ 0xc8, 0x11, 0x7a, 0xa3, 0xac, 0x75, 0x1e, 0xc7 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xda, 0xb4, 0x6e, 0x68, 0xb2, 0xdc, 0x06,
+ 0xd0, 0x0a, 0x64, 0xbe, 0xb8, 0x62, 0x0c, 0xd6 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdb, 0xb6, 0x6d, 0x6c, 0xb7, 0xda, 0x01,
+ 0xd8, 0x03, 0x6e, 0xb5, 0xb4, 0x6f, 0x02, 0xd9 },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdc, 0xb8, 0x64, 0x70, 0xac, 0xc8, 0x14,
+ 0xe0, 0x3c, 0x58, 0x84, 0x90, 0x4c, 0x28, 0xf4 },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdd, 0xba, 0x67, 0x74, 0xa9, 0xce, 0x13,
+ 0xe8, 0x35, 0x52, 0x8f, 0x9c, 0x41, 0x26, 0xfb },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xde, 0xbc, 0x62, 0x78, 0xa6, 0xc4, 0x1a,
+ 0xf0, 0x2e, 0x4c, 0x92, 0x88, 0x56, 0x34, 0xea },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdf, 0xbe, 0x61, 0x7c, 0xa3, 0xc2, 0x1d,
+ 0xf8, 0x27, 0x46, 0x99, 0x84, 0x5b, 0x3a, 0xe5 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe1, 0xc2, 0x23, 0x84, 0x65, 0x46, 0xa7,
+ 0x08, 0xe9, 0xca, 0x2b, 0x8c, 0x6d, 0x4e, 0xaf },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe2, 0xc4, 0x26, 0x88, 0x6a, 0x4c, 0xae,
+ 0x10, 0xf2, 0xd4, 0x36, 0x98, 0x7a, 0x5c, 0xbe },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe3, 0xc6, 0x25, 0x8c, 0x6f, 0x4a, 0xa9,
+ 0x18, 0xfb, 0xde, 0x3d, 0x94, 0x77, 0x52, 0xb1 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe4, 0xc8, 0x2c, 0x90, 0x74, 0x58, 0xbc,
+ 0x20, 0xc4, 0xe8, 0x0c, 0xb0, 0x54, 0x78, 0x9c },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe5, 0xca, 0x2f, 0x94, 0x71, 0x5e, 0xbb,
+ 0x28, 0xcd, 0xe2, 0x07, 0xbc, 0x59, 0x76, 0x93 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe6, 0xcc, 0x2a, 0x98, 0x7e, 0x54, 0xb2,
+ 0x30, 0xd6, 0xfc, 0x1a, 0xa8, 0x4e, 0x64, 0x82 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe7, 0xce, 0x29, 0x9c, 0x7b, 0x52, 0xb5,
+ 0x38, 0xdf, 0xf6, 0x11, 0xa4, 0x43, 0x6a, 0x8d },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe9, 0xd2, 0x3b, 0xa4, 0x4d, 0x76, 0x9f,
+ 0x48, 0xa1, 0x9a, 0x73, 0xec, 0x05, 0x3e, 0xd7 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xea, 0xd4, 0x3e, 0xa8, 0x42, 0x7c, 0x96,
+ 0x50, 0xba, 0x84, 0x6e, 0xf8, 0x12, 0x2c, 0xc6 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xeb, 0xd6, 0x3d, 0xac, 0x47, 0x7a, 0x91,
+ 0x58, 0xb3, 0x8e, 0x65, 0xf4, 0x1f, 0x22, 0xc9 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xec, 0xd8, 0x34, 0xb0, 0x5c, 0x68, 0x84,
+ 0x60, 0x8c, 0xb8, 0x54, 0xd0, 0x3c, 0x08, 0xe4 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xed, 0xda, 0x37, 0xb4, 0x59, 0x6e, 0x83,
+ 0x68, 0x85, 0xb2, 0x5f, 0xdc, 0x31, 0x06, 0xeb },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xee, 0xdc, 0x32, 0xb8, 0x56, 0x64, 0x8a,
+ 0x70, 0x9e, 0xac, 0x42, 0xc8, 0x26, 0x14, 0xfa },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xef, 0xde, 0x31, 0xbc, 0x53, 0x62, 0x8d,
+ 0x78, 0x97, 0xa6, 0x49, 0xc4, 0x2b, 0x1a, 0xf5 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf1, 0xe2, 0x13, 0xc4, 0x35, 0x26, 0xd7,
+ 0x88, 0x79, 0x6a, 0x9b, 0x4c, 0xbd, 0xae, 0x5f },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf2, 0xe4, 0x16, 0xc8, 0x3a, 0x2c, 0xde,
+ 0x90, 0x62, 0x74, 0x86, 0x58, 0xaa, 0xbc, 0x4e },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf3, 0xe6, 0x15, 0xcc, 0x3f, 0x2a, 0xd9,
+ 0x98, 0x6b, 0x7e, 0x8d, 0x54, 0xa7, 0xb2, 0x41 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf4, 0xe8, 0x1c, 0xd0, 0x24, 0x38, 0xcc,
+ 0xa0, 0x54, 0x48, 0xbc, 0x70, 0x84, 0x98, 0x6c },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf6, 0xec, 0x1a, 0xd8, 0x2e, 0x34, 0xc2,
+ 0xb0, 0x46, 0x5c, 0xaa, 0x68, 0x9e, 0x84, 0x72 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf7, 0xee, 0x19, 0xdc, 0x2b, 0x32, 0xc5,
+ 0xb8, 0x4f, 0x56, 0xa1, 0x64, 0x93, 0x8a, 0x7d },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf8, 0xf0, 0x08, 0xe0, 0x18, 0x10, 0xe8,
+ 0xc0, 0x38, 0x30, 0xc8, 0x20, 0xd8, 0xd0, 0x28 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf9, 0xf2, 0x0b, 0xe4, 0x1d, 0x16, 0xef,
+ 0xc8, 0x31, 0x3a, 0xc3, 0x2c, 0xd5, 0xde, 0x27 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfa, 0xf4, 0x0e, 0xe8, 0x12, 0x1c, 0xe6,
+ 0xd0, 0x2a, 0x24, 0xde, 0x38, 0xc2, 0xcc, 0x36 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfb, 0xf6, 0x0d, 0xec, 0x17, 0x1a, 0xe1,
+ 0xd8, 0x23, 0x2e, 0xd5, 0x34, 0xcf, 0xc2, 0x39 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfc, 0xf8, 0x04, 0xf0, 0x0c, 0x08, 0xf4,
+ 0xe0, 0x1c, 0x18, 0xe4, 0x10, 0xec, 0xe8, 0x14 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfd, 0xfa, 0x07, 0xf4, 0x09, 0x0e, 0xf3,
+ 0xe8, 0x15, 0x12, 0xef, 0x1c, 0xe1, 0xe6, 0x1b },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfe, 0xfc, 0x02, 0xf8, 0x06, 0x04, 0xfa,
+ 0xf0, 0x0e, 0x0c, 0xf2, 0x08, 0xf6, 0xf4, 0x0a },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xff, 0xfe, 0x01, 0xfc, 0x03, 0x02, 0xfd,
+ 0xf8, 0x07, 0x06, 0xf9, 0x04, 0xfb, 0xfa, 0x05 }
+};
+/* END CSTYLED */
+
+#elif defined(__i386)
+
+/* 32-bit stub for user-level fakekernel dependencies */
+#include <sys/vdev_raidz_impl.h>
+const raidz_impl_ops_t vdev_raidz_ssse3_impl = {
+ .init = NULL,
+ .fini = NULL,
+ .gen = NULL,
+ .rec = NULL,
+ .is_supported = NULL,
+ .name = "sse3"
+};
+
+#endif /* defined(__amd64) */
diff --git a/usr/src/uts/common/fs/zfs/zcp.c b/usr/src/uts/common/fs/zfs/zcp.c
index 61ce60a233..e2db01c5b6 100644
--- a/usr/src/uts/common/fs/zfs/zcp.c
+++ b/usr/src/uts/common/fs/zfs/zcp.c
@@ -22,7 +22,7 @@
*
* The ZCP interface allows various ZFS administrative operations (e.g.
* creating and destroying snapshots, typically
- * performed via an ioctl to /dev/zfs by the zfs(1M) command and
+ * performed via an ioctl to /dev/zfs by the zfs(8) command and
* libzfs/libzfs_core) to be run programmatically as a Lua script. A ZCP
* script is run as a dsl_sync_task and fully executed during one transaction
* group sync. This ensures that no other changes can be written concurrently
@@ -86,7 +86,7 @@
* longjumps out of the script execution with luaL_error() and returns with the
* error.
*
- * See zfs-program(1M) for more information on high level usage.
+ * See zfs-program(8) for more information on high level usage.
*/
#include "lua.h"
@@ -718,8 +718,7 @@ static void *
zcp_lua_alloc(void *ud, void *ptr, size_t osize, size_t nsize)
{
zcp_alloc_arg_t *allocargs = ud;
- int flags = (allocargs->aa_must_succeed) ?
- KM_SLEEP : (KM_NOSLEEP | KM_NORMALPRI);
+ int flags = (allocargs->aa_must_succeed) ? KM_SLEEP : KM_NOSLEEP_LAZY;
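
(An aside, not part of this patch: KM_NOSLEEP_LAZY is, as far as I can tell, a convenience alias for exactly the flag pair it replaces, so this hunk improves readability rather than behavior. A sketch of the presumed definition:)

	/* presumed alias in sys/kmem.h; verify against the actual header */
	#define	KM_NOSLEEP_LAZY	(KM_NOSLEEP | KM_NORMALPRI)
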
if (nsize == 0) {
if (ptr != NULL) {
diff --git a/usr/src/uts/common/fs/zfs/zfs_dir.c b/usr/src/uts/common/fs/zfs/zfs_dir.c
index c5add7b25f..5d377a109e 100644
--- a/usr/src/uts/common/fs/zfs/zfs_dir.c
+++ b/usr/src/uts/common/fs/zfs/zfs_dir.c
@@ -21,7 +21,6 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2015, Joyent, Inc.
* Copyright (c) 2013, 2016 by Delphix. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
* Copyright (c) 2015, Joyent, Inc.
@@ -718,6 +717,7 @@ zfs_rmnode(znode_t *zp)
dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
zfs_sa_upgrade_txholds(tx, zp);
+ dmu_tx_mark_netfree(tx);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
/*
diff --git a/usr/src/uts/common/fs/zfs/zfs_fm.c b/usr/src/uts/common/fs/zfs/zfs_fm.c
index dd854c12e1..2118fd549e 100644
--- a/usr/src/uts/common/fs/zfs/zfs_fm.c
+++ b/usr/src/uts/common/fs/zfs/zfs_fm.c
@@ -735,7 +735,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
report->zcr_length = length;
#ifdef _KERNEL
- zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector,
+ (void) zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector,
FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length);
if (report->zcr_ereport == NULL) {
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
index 153dcf1502..b74baf46ea 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -37,6 +37,7 @@
* Copyright (c) 2017, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
* Copyright 2017 RackTop Systems.
* Copyright (c) 2017, Datto, Inc. All rights reserved.
+ * Copyright 2021 The University of Queensland
*/
/*
@@ -63,8 +64,9 @@
*
* zfs_ioc_t ioc
* The ioctl request number, which userland will pass to ioctl(2).
- * The ioctl numbers can change from release to release, because
- * the caller (libzfs) must be matched to the kernel.
+ * We want newer versions of libzfs and libzfs_core to run against
+ * existing zfs kernel modules (i.e. a deferred reboot after an update).
+ * Therefore the ioctl numbers cannot change from release to release.
*
* zfs_secpolicy_func_t *secpolicy
* This function will be called before the zfs_ioc_func_t, to
@@ -90,6 +92,10 @@
* Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED |
* POOL_CHECK_READONLY).
*
+ * zfs_ioc_key_t *nvl_keys
+ * The list of expected/allowable innvl input keys. This list is used
+ * to validate the nvlist input to the ioctl.
+ *
* boolean_t smush_outnvlist
* If smush_outnvlist is true, then the output is presumed to be a
* list of errors, and it will be "smushed" down to fit into the
@@ -138,6 +144,14 @@
* use the outnvl if they succeed, because the caller can not
* distinguish between the operation failing, and
* deserialization failing.
+ *
+ * IOCTL Interface Errors
+ *
+ * The following ioctl input errors can be returned:
+ * ZFS_ERR_IOC_CMD_UNAVAIL the ioctl number is not supported by kernel
+ * ZFS_ERR_IOC_ARG_UNAVAIL an input argument is not supported by kernel
+ * ZFS_ERR_IOC_ARG_REQUIRED a required input argument is missing
+ * ZFS_ERR_IOC_ARG_BADTYPE an input argument has an invalid type
*/
#include <sys/types.h>
@@ -223,6 +237,37 @@ typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *);
typedef int zfs_ioc_func_t(const char *, nvlist_t *, nvlist_t *);
typedef int zfs_secpolicy_func_t(zfs_cmd_t *, nvlist_t *, cred_t *);
+/*
+ * IOC Keys are used to document and validate user->kernel interface inputs.
+ * See zfs_keys_recv_new for an example declaration. Any key name that is not
+ * listed will be rejected as input.
+ *
+ * The keyname 'optional' is always allowed, and must be an nvlist if present.
+ * Arguments which older kernels can safely ignore can be placed under the
+ * "optional" key.
+ *
+ * When adding new keys to an existing ioc for new functionality, consider:
+ * - adding an entry into zfs_sysfs.c zfs_features[] list
+ * - updating the libzfs_input_check.c test utility
+ *
+ * Note: in the ZK_WILDCARDLIST case, the name serves as documentation
+ * for the expected name (bookmark, snapshot, property, etc) but there
+ * is no validation in the preflight zfs_check_input_nvpairs() check.
+ */
+typedef enum {
+ ZK_OPTIONAL = 1 << 0, /* pair is optional */
+ ZK_WILDCARDLIST = 1 << 1, /* one or more unspecified key names */
+} ioc_key_flag_t;
+
+/* DATA_TYPE_ANY is used when zkey_type can vary. */
+#define DATA_TYPE_ANY DATA_TYPE_UNKNOWN
+
+typedef struct zfs_ioc_key {
+ const char *zkey_name;
+ data_type_t zkey_type;
+ ioc_key_flag_t zkey_flags;
+} zfs_ioc_key_t;
+
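To make the flag semantics concrete, here is a hypothetical key list in the same style (the names are invented for illustration): the first pair must be present and be a string, the second may be omitted but must be an nvlist when supplied.

	static const zfs_ioc_key_t zfs_keys_example[] = {
		{"target", DATA_TYPE_STRING, 0},		/* required */
		{"props", DATA_TYPE_NVLIST, ZK_OPTIONAL},	/* optional */
	};
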
typedef enum {
NO_NAME,
POOL_NAME,
@@ -244,6 +289,8 @@ typedef struct zfs_ioc_vec {
zfs_ioc_poolcheck_t zvec_pool_check;
boolean_t zvec_smush_outnvlist;
const char *zvec_name;
+ const zfs_ioc_key_t *zvec_nvl_keys;
+ size_t zvec_nvl_key_count;
} zfs_ioc_vec_t;
/* This array is indexed by zfs_userquota_prop_t */
@@ -861,8 +908,8 @@ zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
nvpair_t *pair, *nextpair;
int error = 0;
- if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
- return (SET_ERROR(EINVAL));
+ snaps = fnvlist_lookup_nvlist(innvl, "snaps");
+
for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
pair = nextpair) {
nextpair = nvlist_next_nvpair(snaps, pair);
@@ -1009,8 +1056,8 @@ zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
int error = 0;
nvpair_t *pair;
- if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
- return (SET_ERROR(EINVAL));
+ snaps = fnvlist_lookup_nvlist(innvl, "snaps");
+
for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
pair = nvlist_next_nvpair(snaps, pair)) {
char *name = nvpair_name(pair);
@@ -1030,7 +1077,7 @@ zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
}
/*
- * Check for permission to create each snapshot in the nvlist.
+ * Check for permission to create each bookmark in the nvlist.
*/
/* ARGSUSED */
static int
@@ -1265,9 +1312,7 @@ zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
nvlist_t *holds;
int error;
- error = nvlist_lookup_nvlist(innvl, "holds", &holds);
- if (error != 0)
- return (SET_ERROR(EINVAL));
+ holds = fnvlist_lookup_nvlist(innvl, "holds");
for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
pair = nvlist_next_nvpair(holds, pair)) {
@@ -1338,12 +1383,15 @@ zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
return (0);
error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr);
- if (error == 0)
- error = zfs_secpolicy_hold(zc, innvl, cr);
- if (error == 0)
- error = zfs_secpolicy_release(zc, innvl, cr);
- if (error == 0)
- error = zfs_secpolicy_destroy(zc, innvl, cr);
+
+ if (innvl != NULL) {
+ if (error == 0)
+ error = zfs_secpolicy_hold(zc, innvl, cr);
+ if (error == 0)
+ error = zfs_secpolicy_release(zc, innvl, cr);
+ if (error == 0)
+ error = zfs_secpolicy_destroy(zc, innvl, cr);
+ }
return (error);
}
@@ -1929,8 +1977,7 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc)
{
spa_t *spa;
int error;
- nvlist_t *config, **l2cache, **spares;
- uint_t nl2cache = 0, nspares = 0;
+ nvlist_t *config;
error = spa_open(zc->zc_name, &spa, FTAG);
if (error != 0)
@@ -1938,27 +1985,6 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc)
error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
zc->zc_iflags, &config);
- (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE,
- &l2cache, &nl2cache);
-
- (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES,
- &spares, &nspares);
-
- /*
- * A root pool with concatenated devices is not supported.
- * Thus, can not add a device to a root pool.
- *
- * Intent log device can not be added to a rootpool because
- * during mountroot, zil is replayed, a seperated log device
- * can not be accessed during the mountroot time.
- *
- * l2cache and spare devices are ok to be added to a rootpool.
- */
- if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) {
- nvlist_free(config);
- spa_close(spa, FTAG);
- return (SET_ERROR(EDOM));
- }
if (error == 0) {
error = spa_vdev_add(spa, config);
@@ -3300,6 +3326,13 @@ zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops,
*
* outnvl: propname -> error code (int32)
*/
+
+static const zfs_ioc_key_t zfs_keys_create[] = {
+ {"type", DATA_TYPE_INT32, 0},
+ {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+ {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+};
+
static int
zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
{
@@ -3308,14 +3341,11 @@ zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
nvlist_t *nvprops = NULL;
nvlist_t *hidden_args = NULL;
void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
- int32_t type32;
dmu_objset_type_t type;
boolean_t is_insensitive = B_FALSE;
dsl_crypto_params_t *dcp = NULL;
- if (nvlist_lookup_int32(innvl, "type", &type32) != 0)
- return (SET_ERROR(EINVAL));
- type = type32;
+ type = (dmu_objset_type_t)fnvlist_lookup_int32(innvl, "type");
(void) nvlist_lookup_nvlist(innvl, "props", &nvprops);
(void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args);
@@ -3418,6 +3448,12 @@ zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
*
* outnvl: propname -> error code (int32)
*/
+static const zfs_ioc_key_t zfs_keys_clone[] = {
+ {"origin", DATA_TYPE_STRING, 0},
+ {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+ {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+};
+
static int
zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
{
@@ -3450,6 +3486,10 @@ zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
return (error);
}
+static const zfs_ioc_key_t zfs_keys_remap[] = {
+ /* no nvl keys */
+};
+
/* ARGSUSED */
static int
zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
@@ -3469,6 +3509,11 @@ zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
*
* outnvl: snapshot -> error code (int32)
*/
+static const zfs_ioc_key_t zfs_keys_snapshot[] = {
+ {"snaps", DATA_TYPE_NVLIST, 0},
+ {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+};
+
static int
zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
{
@@ -3485,8 +3530,7 @@ zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS))
return (SET_ERROR(ENOTSUP));
- if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0)
- return (SET_ERROR(EINVAL));
+ snaps = fnvlist_lookup_nvlist(innvl, "snaps");
poollen = strlen(poolname);
for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
pair = nvlist_next_nvpair(snaps, pair)) {
@@ -3525,6 +3569,10 @@ zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
/*
* innvl: "message" -> string
*/
+static const zfs_ioc_key_t zfs_keys_log_history[] = {
+ {"message", DATA_TYPE_STRING, 0},
+};
+
/* ARGSUSED */
static int
zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
@@ -3548,10 +3596,7 @@ zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
if (error != 0)
return (error);
- if (nvlist_lookup_string(innvl, "message", &message) != 0) {
- spa_close(spa, FTAG);
- return (SET_ERROR(EINVAL));
- }
+ message = fnvlist_lookup_string(innvl, "message");
if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) {
spa_close(spa, FTAG);
@@ -3564,6 +3609,58 @@ zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
}
/*
+ * This ioctl is used to set the bootenv configuration on the current
+ * pool. This configuration is stored in the second padding area of the label,
+ * and it is used by the bootloader(s) to store bootloader and/or system
+ * specific data.
+ * The data is stored as nvlist data stream, and is protected by
+ * an embedded checksum.
+ * The version can have two possible values:
+ * VB_RAW: nvlist should have key GRUB_ENVMAP, value DATA_TYPE_STRING.
+ * VB_NVLIST: nvlist with arbitrary <key, value> pairs.
+ */
+static const zfs_ioc_key_t zfs_keys_set_bootenv[] = {
+ {"version", DATA_TYPE_UINT64, 0},
+ {"<keys>", DATA_TYPE_ANY, ZK_OPTIONAL | ZK_WILDCARDLIST},
+};
+
+static int
+zfs_ioc_set_bootenv(const char *name, nvlist_t *innvl,
+ nvlist_t *outnvl __unused)
+{
+ int error;
+ spa_t *spa;
+
+ if ((error = spa_open(name, &spa, FTAG)) != 0)
+ return (error);
+ spa_vdev_state_enter(spa, SCL_ALL);
+ error = vdev_label_write_bootenv(spa->spa_root_vdev, innvl);
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
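A sketch of what a VB_RAW caller might send, assuming the private lzc_ioctl() helper from libzfs_core and the VB_RAW/GRUB_ENVMAP definitions from the bootenv headers (the pool name and payload are placeholders, not taken from this patch):

	nvlist_t *nvl = fnvlist_alloc();
	fnvlist_add_uint64(nvl, "version", VB_RAW);
	fnvlist_add_string(nvl, GRUB_ENVMAP, "console=ttya\n");
	error = lzc_ioctl(ZFS_IOC_SET_BOOTENV, "rpool", nvl, NULL);
	fnvlist_free(nvl);
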
+static const zfs_ioc_key_t zfs_keys_get_bootenv[] = {
+ /* no nvl keys */
+};
+
+static int
+zfs_ioc_get_bootenv(const char *name, nvlist_t *innvl __unused,
+ nvlist_t *outnvl)
+{
+ spa_t *spa;
+ int error;
+
+ if ((error = spa_open(name, &spa, FTAG)) != 0)
+ return (error);
+ spa_vdev_state_enter(spa, SCL_ALL);
+ error = vdev_label_read_bootenv(spa->spa_root_vdev, outnvl);
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+/*
* The dp_config_rwlock must not be held when calling this, because the
* unmount may need to write out data.
*
@@ -3645,6 +3742,11 @@ zfs_destroy_unmount_origin(const char *fsname)
* outnvl: snapshot -> error code (int32)
*
*/
+static const zfs_ioc_key_t zfs_keys_destroy_snaps[] = {
+ {"snaps", DATA_TYPE_NVLIST, 0},
+ {"defer", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+};
+
/* ARGSUSED */
static int
zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
@@ -3676,6 +3778,10 @@ zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
* outnvl: bookmark -> error code (int32)
*
*/
+static const zfs_ioc_key_t zfs_keys_bookmark[] = {
+ {"<bookmark>...", DATA_TYPE_STRING, ZK_WILDCARDLIST},
+};
+
/* ARGSUSED */
static int
zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
@@ -3713,6 +3819,10 @@ zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
* }
*
*/
+static const zfs_ioc_key_t zfs_keys_get_bookmarks[] = {
+ {"<property>...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST | ZK_OPTIONAL},
+};
+
static int
zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
{
@@ -3727,6 +3837,10 @@ zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
* outnvl: bookmark -> error code (int32)
*
*/
+static const zfs_ioc_key_t zfs_keys_destroy_bookmarks[] = {
+ {"<bookmark>...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST},
+};
+
static int
zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl,
nvlist_t *outnvl)
@@ -3759,6 +3873,15 @@ zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl,
return (error);
}
+static const zfs_ioc_key_t zfs_keys_channel_program[] = {
+ {"program", DATA_TYPE_STRING, 0},
+ {"arg", DATA_TYPE_ANY, 0},
+ {"hidden_args", DATA_TYPE_ANY, ZK_OPTIONAL},
+ {"sync", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL},
+ {"instrlimit", DATA_TYPE_UINT64, ZK_OPTIONAL},
+ {"memlimit", DATA_TYPE_UINT64, ZK_OPTIONAL},
+};
+
static int
zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl,
nvlist_t *outnvl)
@@ -3769,9 +3892,7 @@ zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl,
nvpair_t *nvarg = NULL;
nvlist_t *hidden_args = NULL;
- if (0 != nvlist_lookup_string(innvl, ZCP_ARG_PROGRAM, &program)) {
- return (EINVAL);
- }
+ program = fnvlist_lookup_string(innvl, ZCP_ARG_PROGRAM);
if (0 != nvlist_lookup_boolean_value(innvl, ZCP_ARG_SYNC, &sync_flag)) {
sync_flag = B_TRUE;
}
@@ -3781,9 +3902,7 @@ zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl,
if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_MEMLIMIT, &memlimit)) {
memlimit = ZCP_DEFAULT_MEMLIMIT;
}
- if (0 != nvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST, &nvarg)) {
- return (EINVAL);
- }
+ nvarg = fnvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST);
/* hidden args are optional */
if (nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args) == 0) {
@@ -3808,6 +3927,10 @@ zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl,
* innvl: unused
* outnvl: empty
*/
+static const zfs_ioc_key_t zfs_keys_pool_checkpoint[] = {
+ /* no nvl keys */
+};
+
/* ARGSUSED */
static int
zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
@@ -3819,6 +3942,10 @@ zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
* innvl: unused
* outnvl: empty
*/
+static const zfs_ioc_key_t zfs_keys_pool_discard_checkpoint[] = {
+ /* no nvl keys */
+};
+
/* ARGSUSED */
static int
zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl,
@@ -3909,6 +4036,11 @@ zfs_ioc_destroy(zfs_cmd_t *zc)
* EINVAL is returned for an unknown command or if any of the provided vdev
* guids have been specified with a type other than uint64.
*/
+static const zfs_ioc_key_t zfs_keys_pool_initialize[] = {
+ {ZPOOL_INITIALIZE_COMMAND, DATA_TYPE_UINT64, 0},
+ {ZPOOL_INITIALIZE_VDEVS, DATA_TYPE_NVLIST, 0}
+};
+
static int
zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
{
@@ -3980,6 +4112,12 @@ zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
* EINVAL is returned for an unknown command or if any of the provided vdev
* guids have been specified with a type other than uint64.
*/
+static const zfs_ioc_key_t zfs_keys_pool_trim[] = {
+ {ZPOOL_TRIM_COMMAND, DATA_TYPE_UINT64, 0},
+ {ZPOOL_TRIM_VDEVS, DATA_TYPE_NVLIST, 0},
+ {ZPOOL_TRIM_RATE, DATA_TYPE_UINT64, ZK_OPTIONAL},
+ {ZPOOL_TRIM_SECURE, DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL},
+};
static int
zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
@@ -4044,6 +4182,10 @@ zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
* outnvl: "target" -> name of most recent snapshot
* }
*/
+static const zfs_ioc_key_t zfs_keys_rollback[] = {
+ {"target", DATA_TYPE_STRING, ZK_OPTIONAL},
+};
+
/* ARGSUSED */
static int
zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
@@ -5536,9 +5678,6 @@ zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc)
if (error != 0)
return (error);
- dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
- dsl_pool_rele(dmu_objset_pool(os), FTAG);
-
if (dmu_objset_userobjspace_upgradable(os) ||
dmu_objset_projectquota_upgradable(os)) {
mutex_enter(&os->os_upgrade_lock);
@@ -5552,11 +5691,14 @@ zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc)
mutex_exit(&os->os_upgrade_lock);
}
+ dsl_pool_rele(dmu_objset_pool(os), FTAG);
+
taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id);
error = os->os_upgrade_status;
+ } else {
+ dsl_pool_rele(dmu_objset_pool(os), FTAG);
}
- dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
dsl_dataset_rele(dmu_objset_ds(os), FTAG);
return (error);
@@ -5952,6 +6094,11 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc)
* ...
* }
*/
+static const zfs_ioc_key_t zfs_keys_hold[] = {
+ {"holds", DATA_TYPE_NVLIST, 0},
+ {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL},
+};
+
/* ARGSUSED */
static int
zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist)
@@ -5962,9 +6109,7 @@ zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist)
int error;
minor_t minor = 0;
- error = nvlist_lookup_nvlist(args, "holds", &holds);
- if (error != 0)
- return (SET_ERROR(EINVAL));
+ holds = fnvlist_lookup_nvlist(args, "holds");
/* make sure the user didn't pass us any invalid (empty) tags */
for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
@@ -5999,11 +6144,14 @@ zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist)
* ...
* }
*/
+static const zfs_ioc_key_t zfs_keys_get_holds[] = {
+ /* no nvl keys */
+};
+
/* ARGSUSED */
static int
zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl)
{
- ASSERT3P(args, ==, NULL);
return (dsl_dataset_get_holds(snapname, outnvl));
}
@@ -6018,6 +6166,10 @@ zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl)
* ...
* }
*/
+static const zfs_ioc_key_t zfs_keys_release[] = {
+ {"<snapname>...", DATA_TYPE_NVLIST, ZK_WILDCARDLIST},
+};
+
/* ARGSUSED */
static int
zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist)
@@ -6076,6 +6228,10 @@ zfs_ioc_space_written(zfs_cmd_t *zc)
* "uncompressed" -> uncompressed space in bytes
* }
*/
+static const zfs_ioc_key_t zfs_keys_space_snaps[] = {
+ {"firstsnap", DATA_TYPE_STRING, 0},
+};
+
static int
zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
{
@@ -6085,8 +6241,7 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
char *firstsnap;
uint64_t used, comp, uncomp;
- if (nvlist_lookup_string(innvl, "firstsnap", &firstsnap) != 0)
- return (SET_ERROR(EINVAL));
+ firstsnap = fnvlist_lookup_string(innvl, "firstsnap");
error = dsl_pool_hold(lastsnap, FTAG, &dp);
if (error != 0)
@@ -6140,6 +6295,17 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
*
* outnvl is unused
*/
+static const zfs_ioc_key_t zfs_keys_send_new[] = {
+ {"fd", DATA_TYPE_INT32, 0},
+ {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL},
+ {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL},
+ {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL},
+};
+
/* ARGSUSED */
static int
zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
@@ -6155,9 +6321,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
uint64_t resumeobj = 0;
uint64_t resumeoff = 0;
- error = nvlist_lookup_int32(innvl, "fd", &fd);
- if (error != 0)
- return (SET_ERROR(EINVAL));
+ fd = fnvlist_lookup_int32(innvl, "fd");
(void) nvlist_lookup_string(innvl, "fromsnap", &fromname);
@@ -6202,6 +6366,15 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
* "space" -> bytes of space (uint64)
* }
*/
+static const zfs_ioc_key_t zfs_keys_send_space[] = {
+ {"from", DATA_TYPE_STRING, ZK_OPTIONAL},
+ {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL},
+ {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+};
+
static int
zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
{
@@ -6294,18 +6467,24 @@ out:
*
* onvl is unused
*/
+static const zfs_ioc_key_t zfs_keys_pool_sync[] = {
+ {"force", DATA_TYPE_BOOLEAN_VALUE, 0},
+};
+
/* ARGSUSED */
static int
zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl)
{
int err;
- boolean_t force;
+ boolean_t force = B_FALSE;
spa_t *spa;
if ((err = spa_open(pool, &spa, FTAG)) != 0)
return (err);
- force = fnvlist_lookup_boolean_value(innvl, "force");
+ if (innvl)
+ force = fnvlist_lookup_boolean_value(innvl, "force");
+
if (force) {
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER);
vdev_config_dirty(spa->spa_root_vdev);
@@ -6327,6 +6506,11 @@ zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl)
* presence indicates key should only be verified, not loaded
* }
*/
+static const zfs_ioc_key_t zfs_keys_load_key[] = {
+ {"hidden_args", DATA_TYPE_NVLIST, 0},
+ {"noop", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+};
+
/* ARGSUSED */
static int
zfs_ioc_load_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl)
@@ -6341,11 +6525,7 @@ zfs_ioc_load_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl)
goto error;
}
- ret = nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args);
- if (ret != 0) {
- ret = SET_ERROR(EINVAL);
- goto error;
- }
+ hidden_args = fnvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS);
ret = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL,
hidden_args, &dcp);
@@ -6369,6 +6549,10 @@ error:
* Unload a user's wrapping key from the kernel.
* Both innvl and outnvl are unused.
*/
+static const zfs_ioc_key_t zfs_keys_unload_key[] = {
+ /* no nvl keys */
+};
+
/* ARGSUSED */
static int
zfs_ioc_unload_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl)
@@ -6401,6 +6585,12 @@ out:
*
* outnvl is unused
*/
+static const zfs_ioc_key_t zfs_keys_change_key[] = {
+ {"crypt_cmd", DATA_TYPE_UINT64, ZK_OPTIONAL},
+ {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+ {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+};
+
/* ARGSUSED */
static int
zfs_ioc_change_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl)
@@ -6467,7 +6657,7 @@ static void
zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func,
zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck,
zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist,
- boolean_t allow_log)
+ boolean_t allow_log, const zfs_ioc_key_t *nvl_keys, size_t num_keys)
{
zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST];
@@ -6486,6 +6676,8 @@ zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func,
vec->zvec_pool_check = pool_check;
vec->zvec_smush_outnvlist = smush_outnvlist;
vec->zvec_allow_log = allow_log;
+ vec->zvec_nvl_keys = nvl_keys;
+ vec->zvec_nvl_key_count = num_keys;
}
static void
@@ -6549,104 +6741,141 @@ zfs_ioctl_init(void)
{
zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT,
zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_snapshot, ARRAY_SIZE(zfs_keys_snapshot));
zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY,
zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE);
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE,
+ zfs_keys_log_history, ARRAY_SIZE(zfs_keys_log_history));
zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS,
zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME,
- POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
+ zfs_keys_space_snaps, ARRAY_SIZE(zfs_keys_space_snaps));
zfs_ioctl_register("send", ZFS_IOC_SEND_NEW,
zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME,
- POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
+ zfs_keys_send_new, ARRAY_SIZE(zfs_keys_send_new));
zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE,
zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME,
- POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
+ zfs_keys_send_space, ARRAY_SIZE(zfs_keys_send_space));
zfs_ioctl_register("create", ZFS_IOC_CREATE,
zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_create, ARRAY_SIZE(zfs_keys_create));
zfs_ioctl_register("clone", ZFS_IOC_CLONE,
zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_clone, ARRAY_SIZE(zfs_keys_clone));
zfs_ioctl_register("remap", ZFS_IOC_REMAP,
zfs_ioc_remap, zfs_secpolicy_remap, DATASET_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE);
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE,
+ zfs_keys_remap, ARRAY_SIZE(zfs_keys_remap));
zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS,
zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_destroy_snaps, ARRAY_SIZE(zfs_keys_destroy_snaps));
zfs_ioctl_register("hold", ZFS_IOC_HOLD,
zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_hold, ARRAY_SIZE(zfs_keys_hold));
zfs_ioctl_register("release", ZFS_IOC_RELEASE,
zfs_ioc_release, zfs_secpolicy_release, POOL_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_release, ARRAY_SIZE(zfs_keys_release));
zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS,
zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME,
- POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
+ zfs_keys_get_holds, ARRAY_SIZE(zfs_keys_get_holds));
zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK,
zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE);
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE,
+ zfs_keys_rollback, ARRAY_SIZE(zfs_keys_rollback));
zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK,
zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_bookmark, ARRAY_SIZE(zfs_keys_bookmark));
zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS,
zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME,
- POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
+ zfs_keys_get_bookmarks, ARRAY_SIZE(zfs_keys_get_bookmarks));
zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS,
zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks,
POOL_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_destroy_bookmarks,
+ ARRAY_SIZE(zfs_keys_destroy_bookmarks));
+
+ zfs_ioctl_register("load-key", ZFS_IOC_LOAD_KEY,
+ zfs_ioc_load_key, zfs_secpolicy_load_key,
+ DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE,
+ zfs_keys_load_key, ARRAY_SIZE(zfs_keys_load_key));
+ zfs_ioctl_register("unload-key", ZFS_IOC_UNLOAD_KEY,
+ zfs_ioc_unload_key, zfs_secpolicy_load_key,
+ DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE,
+ zfs_keys_unload_key, ARRAY_SIZE(zfs_keys_unload_key));
+ zfs_ioctl_register("change-key", ZFS_IOC_CHANGE_KEY,
+ zfs_ioc_change_key, zfs_secpolicy_change_key,
+ DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY,
+ B_TRUE, B_TRUE, zfs_keys_change_key,
+ ARRAY_SIZE(zfs_keys_change_key));
+
+ zfs_ioctl_register("sync", ZFS_IOC_POOL_SYNC,
+ zfs_ioc_pool_sync, zfs_secpolicy_none, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE,
+ zfs_keys_pool_sync, ARRAY_SIZE(zfs_keys_pool_sync));
zfs_ioctl_register("channel_program", ZFS_IOC_CHANNEL_PROGRAM,
zfs_ioc_channel_program, zfs_secpolicy_config,
POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE,
- B_TRUE);
+ B_TRUE, zfs_keys_channel_program,
+ ARRAY_SIZE(zfs_keys_channel_program));
zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT,
zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_pool_checkpoint, ARRAY_SIZE(zfs_keys_pool_checkpoint));
zfs_ioctl_register("zpool_discard_checkpoint",
ZFS_IOC_POOL_DISCARD_CHECKPOINT, zfs_ioc_pool_discard_checkpoint,
zfs_secpolicy_config, POOL_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_pool_discard_checkpoint,
+ ARRAY_SIZE(zfs_keys_pool_discard_checkpoint));
zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE,
zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_pool_initialize, ARRAY_SIZE(zfs_keys_pool_initialize));
zfs_ioctl_register("trim", ZFS_IOC_POOL_TRIM,
zfs_ioc_pool_trim, zfs_secpolicy_config, POOL_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_pool_trim, ARRAY_SIZE(zfs_keys_pool_trim));
- zfs_ioctl_register("sync", ZFS_IOC_POOL_SYNC,
- zfs_ioc_pool_sync, zfs_secpolicy_none, POOL_NAME,
- POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE);
+ zfs_ioctl_register("set_bootenv", ZFS_IOC_SET_BOOTENV,
+ zfs_ioc_set_bootenv, zfs_secpolicy_config, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE,
+ zfs_keys_set_bootenv, ARRAY_SIZE(zfs_keys_set_bootenv));
- zfs_ioctl_register("load-key", ZFS_IOC_LOAD_KEY,
- zfs_ioc_load_key, zfs_secpolicy_load_key,
- DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE);
- zfs_ioctl_register("unload-key", ZFS_IOC_UNLOAD_KEY,
- zfs_ioc_unload_key, zfs_secpolicy_load_key,
- DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE);
- zfs_ioctl_register("change-key", ZFS_IOC_CHANGE_KEY,
- zfs_ioc_change_key, zfs_secpolicy_change_key,
- DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY,
- B_TRUE, B_TRUE);
+ zfs_ioctl_register("get_bootenv", ZFS_IOC_GET_BOOTENV,
+ zfs_ioc_get_bootenv, zfs_secpolicy_none, POOL_NAME,
+ POOL_CHECK_SUSPENDED, B_FALSE, B_TRUE,
+ zfs_keys_get_bootenv, ARRAY_SIZE(zfs_keys_get_bootenv));
/* IOCTLS that use the legacy function signature */
@@ -6783,6 +7012,80 @@ zfs_ioctl_init(void)
POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
}
+/*
+ * Verify that for non-legacy ioctls the input nvlist
+ * pairs match against the expected input.
+ *
+ * Possible errors are:
+ * ZFS_ERR_IOC_ARG_UNAVAIL An unrecognized nvpair was encountered
+ * ZFS_ERR_IOC_ARG_REQUIRED A required nvpair is missing
+ * ZFS_ERR_IOC_ARG_BADTYPE Invalid type for nvpair
+ */
+static int
+zfs_check_input_nvpairs(nvlist_t *innvl, const zfs_ioc_vec_t *vec)
+{
+ const zfs_ioc_key_t *nvl_keys = vec->zvec_nvl_keys;
+ boolean_t required_keys_found = B_FALSE;
+
+ /*
+ * examine each input pair
+ */
+ for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
+ char *name = nvpair_name(pair);
+ data_type_t type = nvpair_type(pair);
+ boolean_t identified = B_FALSE;
+
+ /*
+ * check pair against the documented names and type
+ */
+ for (int k = 0; k < vec->zvec_nvl_key_count; k++) {
+ /* if not a wild card name, check for an exact match */
+ if ((nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) == 0 &&
+ strcmp(nvl_keys[k].zkey_name, name) != 0)
+ continue;
+
+ identified = B_TRUE;
+
+ if (nvl_keys[k].zkey_type != DATA_TYPE_ANY &&
+ nvl_keys[k].zkey_type != type) {
+ return (SET_ERROR(ZFS_ERR_IOC_ARG_BADTYPE));
+ }
+
+ if (nvl_keys[k].zkey_flags & ZK_OPTIONAL)
+ continue;
+
+ required_keys_found = B_TRUE;
+ break;
+ }
+
+ /* allow an 'optional' key, everything else is invalid */
+ if (!identified &&
+ (strcmp(name, "optional") != 0 ||
+ type != DATA_TYPE_NVLIST)) {
+ return (SET_ERROR(ZFS_ERR_IOC_ARG_UNAVAIL));
+ }
+ }
+
+ /* verify that all required keys were found */
+ for (int k = 0; k < vec->zvec_nvl_key_count; k++) {
+ if (nvl_keys[k].zkey_flags & ZK_OPTIONAL)
+ continue;
+
+ if (nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) {
+ /* at least one non-optional key is expected here */
+ if (!required_keys_found)
+ return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED));
+ continue;
+ }
+
+ if (!nvlist_exists(innvl, nvl_keys[k].zkey_name))
+ return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED));
+ }
+
+ return (0);
+}
+
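Worked against a declaration from earlier in this patch, zfs_keys_log_history ({"message", DATA_TYPE_STRING, 0}), the preflight check behaves as follows (illustrative values only):

	nvlist_t *nvl = fnvlist_alloc();
	fnvlist_add_string(nvl, "message", "hello");	/* accepted */

	nvlist_t *bad = fnvlist_alloc();
	fnvlist_add_uint64(bad, "message", 42);		/* ZFS_ERR_IOC_ARG_BADTYPE */

	nvlist_t *empty = fnvlist_alloc();		/* ZFS_ERR_IOC_ARG_REQUIRED */
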
int
pool_status_check(const char *name, zfs_ioc_namecheck_t type,
zfs_ioc_poolcheck_t check)
@@ -6933,9 +7236,16 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip));
if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
- return (SET_ERROR(EINVAL));
+ return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL));
vec = &zfs_ioc_vec[vecnum];
+ /*
+ * The registered ioctl list may be sparse; verify that either
+ * a normal or a legacy handler is registered.
+ */
+ if (vec->zvec_func == NULL && vec->zvec_legacy_func == NULL)
+ return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL));
+
zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag);
@@ -6978,6 +7288,19 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
break;
}
+ /*
+ * Ensure that all input pairs are valid before we pass them down
+ * to the lower layers.
+ *
+ * The vectored functions can use fnvlist_lookup_{type} for any
+ * required pairs since zfs_check_input_nvpairs() confirmed that
+ * they exist and are of the correct type.
+ */
+ if (error == 0 && vec->zvec_func != NULL) {
+ error = zfs_check_input_nvpairs(innvl, vec);
+ if (error != 0)
+ goto out;
+ }
if (error == 0)
error = vec->zvec_secpolicy(zc, innvl, cr);
diff --git a/usr/src/uts/common/fs/zfs/zfs_onexit.c b/usr/src/uts/common/fs/zfs/zfs_onexit.c
index 4ae8dc29a0..99e530ca3c 100644
--- a/usr/src/uts/common/fs/zfs/zfs_onexit.c
+++ b/usr/src/uts/common/fs/zfs/zfs_onexit.c
@@ -125,13 +125,18 @@ zfs_onexit_fd_hold(int fd, minor_t *minorp)
{
file_t *fp;
zfs_onexit_t *zo;
+ int ret;
fp = getf(fd);
if (fp == NULL)
return (SET_ERROR(EBADF));
*minorp = getminor(fp->f_vnode->v_rdev);
- return (zfs_onexit_minor_to_state(*minorp, &zo));
+ ret = zfs_onexit_minor_to_state(*minorp, &zo);
+ if (ret != 0)
+ releasef(fd);
+
+ return (ret);
}
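
This restores the getf()/releasef() pairing on the failure path; on success the file stays held and the caller owns the release. A hypothetical caller sketch (the cleanup_fd variable and callback step are invented for illustration):

	minor_t minor;
	int err = zfs_onexit_fd_hold(cleanup_fd, &minor);
	if (err == 0) {
		/* ... register on-exit callbacks against 'minor' ... */
		releasef(cleanup_fd);	/* drop the hold taken above */
	}
	/* on failure the function already dropped the hold itself */
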
void
diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
index 6b61cd7a84..86d83e7ace 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
@@ -20,11 +20,12 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
- * Copyright 2019 Joyent, Inc.
+ * Copyright 2020 Joyent, Inc.
+ * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org>
+ * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -64,10 +65,12 @@
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/bootconf.h>
+#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/dnlc.h>
#include <sys/dmu_objset.h>
#include <sys/spa_boot.h>
+#include <sys/vdev_impl.h>
#include "zfs_comutil.h"
int zfsfstype;
@@ -172,7 +175,7 @@ zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
} else {
/*
* Sync all ZFS filesystems. This is what happens when you
- * run sync(1M). Unlike other filesystems, ZFS honors the
+ * run sync(8). Unlike other filesystems, ZFS honors the
* request by waiting for all pools to commit all dirty data.
*/
spa_sync_allpools();
@@ -921,8 +924,13 @@ zfs_id_overobjquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id)
int err;
if (!dmu_objset_userobjspace_present(zfsvfs->z_os)) {
- if (dmu_objset_userobjspace_upgradable(zfsvfs->z_os))
+ if (dmu_objset_userobjspace_upgradable(zfsvfs->z_os)) {
+ dsl_pool_config_enter(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
dmu_objset_id_quota_upgrade(zfsvfs->z_os);
+ dsl_pool_config_exit(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
+ }
return (B_FALSE);
}
@@ -1711,6 +1719,36 @@ zfs_mount_label_policy(vfs_t *vfsp, char *osname)
return (retv);
}
+/*
+ * Load a string-valued boot property and attempt to convert it to a 64-bit
+ * unsigned integer. If the value is not present, or the conversion fails,
+ * return the provided default value.
+ */
+static uint64_t
+spa_get_bootprop_uint64(const char *name, uint64_t defval)
+{
+ char *propval;
+ u_longlong_t r;
+ int e;
+
+ if ((propval = spa_get_bootprop(name)) == NULL) {
+ /*
+ * The property does not exist.
+ */
+ return (defval);
+ }
+
+ e = ddi_strtoull(propval, NULL, 10, &r);
+
+ spa_free_bootprop(propval);
+
+ /*
+ * If the conversion succeeded, return the value. If there was any
+ * kind of failure, just return the default value.
+ */
+ return (e == 0 ? r : defval);
+}
+
static int
zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
{
@@ -1721,6 +1759,8 @@ zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
vnode_t *vp = NULL;
char *zfs_bootfs;
char *zfs_devid;
+ uint64_t zfs_bootpool;
+ uint64_t zfs_bootvdev;
ASSERT(vfsp);
@@ -1732,6 +1772,7 @@ zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
if (why == ROOT_INIT) {
if (zfsrootdone++)
return (SET_ERROR(EBUSY));
+
/*
* the process of doing a spa_load will require the
* clock to be set before we could (for example) do
@@ -1746,23 +1787,47 @@ zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
return (SET_ERROR(EINVAL));
}
zfs_devid = spa_get_bootprop("diskdevid");
- error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
- if (zfs_devid)
- spa_free_bootprop(zfs_devid);
- if (error) {
+
+ /*
+ * The boot loader may also provide us with the GUID for both
+ * the pool and the nominated boot vdev. A GUID value of 0 is
+ * explicitly invalid (see "spa_change_guid()"), so we use this
+ * as a sentinel value when no GUID is present.
+ */
+ zfs_bootpool = spa_get_bootprop_uint64("zfs-bootpool", 0);
+ zfs_bootvdev = spa_get_bootprop_uint64("zfs-bootvdev", 0);
+
+ /*
+ * Initialise the early boot device rescan mechanism. A scan
+ * will not actually be performed unless we need to do so in
+ * order to find the correct /devices path for a relocated
+ * device.
+ */
+ vdev_disk_preroot_init();
+
+ error = spa_import_rootpool(rootfs.bo_name, zfs_devid,
+ zfs_bootpool, zfs_bootvdev);
+
+ spa_free_bootprop(zfs_devid);
+
+ if (error != 0) {
spa_free_bootprop(zfs_bootfs);
+ vdev_disk_preroot_fini();
cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
error);
return (error);
}
+
if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
spa_free_bootprop(zfs_bootfs);
+ vdev_disk_preroot_fini();
cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
error);
return (error);
}
spa_free_bootprop(zfs_bootfs);
+ vdev_disk_preroot_fini();
if (error = vfs_lock(vfsp))
return (error);
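[Once vdev_disk_preroot_init() has run, every return path out of this block, successful or not, is paired with a vdev_disk_preroot_fini() call. A sketch of the same invariant expressed with a single unwind label, so a future early return cannot forget the fini; step_one/step_two are hypothetical stand-ins for the import and bootfs-parse steps:

    extern void vdev_disk_preroot_init(void);
    extern void vdev_disk_preroot_fini(void);
    extern int step_one(void);          /* hypothetical import step */
    extern int step_two(void);          /* hypothetical bootfs-parse step */

    static int
    mountroot_sketch(void)
    {
            int error;

            vdev_disk_preroot_init();

            if ((error = step_one()) != 0)
                    goto out;
            error = step_two();

    out:
            vdev_disk_preroot_fini();   /* every path unwinds here */
            return (error);
    }
]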
@@ -1832,7 +1897,7 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
mutex_enter(&mvp->v_lock);
if ((uap->flags & MS_REMOUNT) == 0 &&
(uap->flags & MS_OVERLAY) == 0 &&
- (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
+ (vn_count(mvp) != 1 || (mvp->v_flag & VROOT))) {
mutex_exit(&mvp->v_lock);
return (SET_ERROR(EBUSY));
}
@@ -2169,18 +2234,34 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
* Our count is maintained in the vfs structure, but the
* number is off by 1 to indicate a hold on the vfs
* structure itself.
- *
- * The '.zfs' directory maintains a reference of its
- * own, and any active references underneath are
- * reflected in the vnode count.
*/
- if (zfsvfs->z_ctldir == NULL) {
- if (vfsp->vfs_count > 1)
- return (SET_ERROR(EBUSY));
- } else {
- if (vfsp->vfs_count > 2 ||
- zfsvfs->z_ctldir->v_count > 1)
- return (SET_ERROR(EBUSY));
+ boolean_t draining;
+ uint_t thresh = 1;
+
+ /*
+ * The '.zfs' directory maintains a reference of its own, and
+ * any active references underneath are reflected in the vnode
+ * count. Allow one additional reference for it.
+ */
+ if (zfsvfs->z_ctldir != NULL)
+ thresh++;
+
+ /*
+ * If it's running, the asynchronous unlinked drain task needs
+ * to be stopped before the number of active vnodes can be
+ * reliably checked.
+ */
+ draining = zfsvfs->z_draining;
+ if (draining)
+ zfs_unlinked_drain_stop_wait(zfsvfs);
+
+ if (vfsp->vfs_count > thresh || (zfsvfs->z_ctldir != NULL &&
+ zfsvfs->z_ctldir->v_count > 1)) {
+ if (draining) {
+ /* If it was draining, restart the task */
+ zfs_unlinked_drain(zfsvfs);
+ }
+ return (SET_ERROR(EBUSY));
}
}
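[The rewritten busy test computes a single hold threshold, one hold for the vfs structure itself plus one more when the '.zfs' control directory exists, rather than duplicating the comparison per case. Its logic isolated as a sketch, with illustrative names and boolean_t/uint_t as in illumos <sys/types.h>:

    #include <sys/types.h>

    static boolean_t
    umount_would_be_busy(uint_t vfs_count, boolean_t have_ctldir,
        uint_t ctldir_vcount)
    {
            uint_t thresh = 1;          /* hold on the vfs itself */

            if (have_ctldir)
                    thresh++;           /* '.zfs' holds one more */

            return ((vfs_count > thresh ||
                (have_ctldir && ctldir_vcount > 1)) ? B_TRUE : B_FALSE);
    }
]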
diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c
index 99011b83b4..dd58b4a549 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c
@@ -738,6 +738,57 @@ out:
return (error);
}
+static void
+zfs_write_clear_setid_bits_if_necessary(zfsvfs_t *zfsvfs, znode_t *zp,
+ cred_t *cr, boolean_t *did_check, dmu_tx_t *tx)
+{
+ ASSERT(did_check != NULL);
+ ASSERT(tx != NULL);
+
+ if (*did_check)
+ return;
+
+ zilog_t *zilog = zfsvfs->z_log;
+
+ /*
+ * Clear Set-UID/Set-GID bits on successful write if not
+ * privileged and at least one of the execute bits is set.
+ *
+ * It would be nice to do this after all writes have
+ * been done, but that would still expose the ISUID/ISGID
+ * to another app after the partial write is committed.
+ *
+ * Note: we don't call zfs_fuid_map_id() here because
+ * user 0 is not an ephemeral uid.
+ */
+ mutex_enter(&zp->z_acl_lock);
+ if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 &&
+ (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
+ secpolicy_vnode_setid_retain(cr,
+ ((zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0)) != 0) {
+ uint64_t newmode;
+ vattr_t va;
+
+ zp->z_mode &= ~(S_ISUID | S_ISGID);
+ newmode = zp->z_mode;
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
+ (void *)&newmode, sizeof (uint64_t), tx);
+
+ /*
+ * Make sure SUID/SGID bits will be removed when we replay the
+ * log.
+ */
+ bzero(&va, sizeof (va));
+ va.va_mask = AT_MODE;
+ va.va_nodeid = zp->z_id;
+ va.va_mode = newmode;
+ zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va, AT_MODE, NULL);
+ }
+ mutex_exit(&zp->z_acl_lock);
+
+ *did_check = B_TRUE;
+}
+
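[Two details of the helper are easy to miss. First, *did_check makes it a run-once guard: the write loop may invoke it on every iteration, but the scrub and its TX_SETATTR record are emitted at most once per zfs_write() call. Second, the shift expressions are simply the three execute bits, since (S_IXUSR >> 3) == S_IXGRP and (S_IXUSR >> 6) == S_IXOTH. The mode predicate restated as a sketch, with the secpolicy_vnode_setid_retain() privilege check elided:

    #include <sys/stat.h>

    /*
     * Sketch: an unprivileged write should clear the setid bits
     * iff any execute bit and any setid bit are both set.
     */
    static int
    want_clear_setid(mode_t mode)
    {
            return ((mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
                (mode & (S_ISUID | S_ISGID)) != 0);
    }
]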
/*
* Write the bytes to a file.
*
@@ -784,6 +835,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
int count = 0;
sa_bulk_attr_t bulk[4];
uint64_t mtime[2], ctime[2];
+ boolean_t did_clear_setid_bits = B_FALSE;
/*
* Fasttrack empty write
@@ -973,6 +1025,11 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
}
/*
+ * NB: We must call zfs_write_clear_setid_bits_if_necessary
+ * before committing the transaction!
+ */
+
+ /*
* If rangelock_enter() over-locked we grow the blocksize
* and then reduce the lock range. This will only happen
* on the first iteration since rangelock_reduce() will
@@ -1049,30 +1106,8 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
break;
}
- /*
- * Clear Set-UID/Set-GID bits on successful write if not
- * privileged and at least one of the excute bits is set.
- *
- * It would be nice to to this after all writes have
- * been done, but that would still expose the ISUID/ISGID
- * to another app after the partial write is committed.
- *
- * Note: we don't call zfs_fuid_map_id() here because
- * user 0 is not an ephemeral uid.
- */
- mutex_enter(&zp->z_acl_lock);
- if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
- (S_IXUSR >> 6))) != 0 &&
- (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
- secpolicy_vnode_setid_retain(cr,
- (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
- uint64_t newmode;
- zp->z_mode &= ~(S_ISUID | S_ISGID);
- newmode = zp->z_mode;
- (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
- (void *)&newmode, sizeof (uint64_t), tx);
- }
- mutex_exit(&zp->z_acl_lock);
+ zfs_write_clear_setid_bits_if_necessary(zfsvfs, zp, cr,
+ &did_clear_setid_bits, tx);
zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
B_TRUE);
@@ -1100,6 +1135,11 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
prev_error = error;
error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ /*
+ * NB: During replay, the TX_SETATTR record logged by
+ * zfs_write_clear_setid_bits_if_necessary must precede
+ * any of the TX_WRITE records logged here.
+ */
zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
dmu_tx_commit(tx);
@@ -4839,7 +4879,7 @@ zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
{
if (vp->v_type == VDIR)
return (0);
- return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
+ return ((*noffp < 0) ? EINVAL : 0);
}
/*
@@ -5147,27 +5187,6 @@ zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
return (0);
}
-/*
- * The reason we push dirty pages as part of zfs_delmap() is so that we get a
- * more accurate mtime for the associated file. Since we don't have a way of
- * detecting when the data was actually modified, we have to resort to
- * heuristics. If an explicit msync() is done, then we mark the mtime when the
- * last page is pushed. The problem occurs when the msync() call is omitted,
- * which by far the most common case:
- *
- * open()
- * mmap()
- * <modify memory>
- * munmap()
- * close()
- * <time lapse>
- * putpage() via fsflush
- *
- * If we wait until fsflush to come along, we can have a modification time that
- * is some arbitrary point in the future. In order to prevent this in the
- * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
- * torn down.
- */
/* ARGSUSED */
static int
zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
@@ -5610,7 +5629,6 @@ zfs_isdir()
/*
* Directory vnode operations template
*/
-vnodeops_t *zfs_dvnodeops;
const fs_operation_def_t zfs_dvnodeops_template[] = {
VOPNAME_OPEN, { .vop_open = zfs_open },
VOPNAME_CLOSE, { .vop_close = zfs_close },
@@ -5643,7 +5661,6 @@ const fs_operation_def_t zfs_dvnodeops_template[] = {
/*
* Regular file vnode operations template
*/
-vnodeops_t *zfs_fvnodeops;
const fs_operation_def_t zfs_fvnodeops_template[] = {
VOPNAME_OPEN, { .vop_open = zfs_open },
VOPNAME_CLOSE, { .vop_close = zfs_close },
@@ -5678,7 +5695,6 @@ const fs_operation_def_t zfs_fvnodeops_template[] = {
/*
* Symbolic link vnode operations template
*/
-vnodeops_t *zfs_symvnodeops;
const fs_operation_def_t zfs_symvnodeops_template[] = {
VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
@@ -5695,7 +5711,6 @@ const fs_operation_def_t zfs_symvnodeops_template[] = {
/*
* special share hidden files vnode operations template
*/
-vnodeops_t *zfs_sharevnodeops;
const fs_operation_def_t zfs_sharevnodeops_template[] = {
VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
VOPNAME_ACCESS, { .vop_access = zfs_access },
@@ -5721,7 +5736,6 @@ const fs_operation_def_t zfs_sharevnodeops_template[] = {
* zfs_link() - no links into/out of attribute space
* zfs_rename() - no moves into/out of attribute space
*/
-vnodeops_t *zfs_xdvnodeops;
const fs_operation_def_t zfs_xdvnodeops_template[] = {
VOPNAME_OPEN, { .vop_open = zfs_open },
VOPNAME_CLOSE, { .vop_close = zfs_close },
@@ -5752,7 +5766,6 @@ const fs_operation_def_t zfs_xdvnodeops_template[] = {
/*
* Error vnode operations template
*/
-vnodeops_t *zfs_evnodeops;
const fs_operation_def_t zfs_evnodeops_template[] = {
VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c
index 257d5b2a35..84ba5947fa 100644
--- a/usr/src/uts/common/fs/zfs/zfs_znode.c
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c
@@ -1246,6 +1246,8 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
if (zp == NULL) {
err = SET_ERROR(ENOENT);
} else {
+ if (zp->z_links == 0)
+ zp->z_unlinked = B_TRUE;
*zpp = zp;
}
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index b02363e7eb..b32dffd79c 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -24,8 +24,8 @@
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
- * Copyright (c) 2019 Joyent, Inc.
* Copyright (c) 2017, Intel Corporation.
+ * Copyright 2020 Joyent, Inc.
*/
#include <sys/sysmacros.h>
@@ -51,6 +51,7 @@
#include <sys/abd.h>
#include <sys/cityhash.h>
#include <sys/dsl_crypt.h>
+#include <sys/stdbool.h>
/*
* ==========================================================================
@@ -482,7 +483,7 @@ error:
zio->io_error = SET_ERROR(EIO);
if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
spa_log_error(spa, &zio->io_bookmark);
- zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
spa, NULL, &zio->io_bookmark, zio, 0, 0);
}
} else {
@@ -1120,7 +1121,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <,
spa_min_claim_txg(spa));
ASSERT(txg == spa_min_claim_txg(spa) || txg == 0);
- ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */
+ ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(8) */
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW,
@@ -1858,10 +1859,36 @@ zio_execute(zio_t *zio)
return;
}
+#ifdef _KERNEL
+ /*
+ * The I/O pipeline is a part of the machinery responsible for
+ * evacuation of memory pages to disk when we are under
+ * sufficient memory pressure for pageout to run. By setting
+ * this flag, allocations may dip into pages in the pageout
+ * reserved pool in order to try to make forward progress.
+ */
+ bool set_pushpage = false;
+ if (!(curthread->t_flag & T_PUSHPAGE)) {
+ /*
+ * We can be called recursively, so we need to remember
+ * if this frame was the one that first set the flag or
+ * not.
+ */
+ set_pushpage = true;
+ curthread->t_flag |= T_PUSHPAGE;
+ }
+#endif
+
zio->io_stage = stage;
zio->io_pipeline_trace |= zio->io_stage;
rv = zio_pipeline[highbit64(stage) - 1](zio);
+#ifdef _KERNEL
+ if (set_pushpage) {
+ curthread->t_flag &= ~T_PUSHPAGE;
+ }
+#endif
+
if (rv == ZIO_PIPELINE_STOP)
return;
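[zio_execute() can re-enter itself through the pipeline stages, so the T_PUSHPAGE bit is cleared only by the frame that actually set it; inner frames observe the bit already set and leave it alone. The save/restore idiom in isolation, where the flag word and bit are illustrative stand-ins for curthread->t_flag and T_PUSHPAGE:

    static unsigned int flags;          /* stands in for curthread->t_flag */
    #define FLAG_PUSH       0x1         /* stands in for T_PUSHPAGE */

    static void
    execute(int depth)
    {
            int set_here = 0;

            if (!(flags & FLAG_PUSH)) {
                    flags |= FLAG_PUSH; /* outermost frame only */
                    set_here = 1;
            }

            if (depth > 0)
                    execute(depth - 1); /* inner frames see the bit set */

            if (set_here)
                    flags &= ~FLAG_PUSH;        /* restored exactly once */
    }
]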
@@ -1990,7 +2017,11 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
"failure and the failure mode property for this pool "
"is set to panic.", spa_name(spa));
- zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
+ cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
+ "failure and has been suspended; `zpool clear` will be required "
+ "before the pool can be written to.", spa_name(spa));
+
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
NULL, NULL, 0, 0);
mutex_enter(&spa->spa_suspend_lock);
@@ -4260,7 +4291,7 @@ zio_done(zio_t *zio)
zio->io_vd->vdev_stat.vs_slow_ios++;
mutex_exit(&zio->io_vd->vdev_stat_lock);
- zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
zio->io_spa, zio->io_vd, &zio->io_bookmark,
zio, 0, 0);
}
@@ -4275,7 +4306,7 @@ zio_done(zio_t *zio)
* device is currently unavailable.
*/
if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
- zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd,
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd,
&zio->io_bookmark, zio, 0, 0);
if ((zio->io_error == EIO || !(zio->io_flags &
@@ -4286,7 +4317,7 @@ zio_done(zio_t *zio)
* error and generate a logical data ereport.
*/
spa_log_error(spa, &zio->io_bookmark);
- zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL,
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL,
&zio->io_bookmark, zio, 0, 0);
}
}
diff --git a/usr/src/uts/common/fs/zfs/zio_crypt.c b/usr/src/uts/common/fs/zfs/zio_crypt.c
index 78c26e3e90..9541a0a734 100644
--- a/usr/src/uts/common/fs/zfs/zio_crypt.c
+++ b/usr/src/uts/common/fs/zfs/zio_crypt.c
@@ -1061,14 +1061,17 @@ zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version,
dnode_phys_t *adnp;
boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
crypto_data_t cd;
- uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)];
+ uint8_t tmp_dncore[sizeof (dnode_phys_t)];
+ adnp = (dnode_phys_t *)tmp_dncore;
cd.cd_format = CRYPTO_DATA_RAW;
cd.cd_offset = 0;
+ cd.cd_length = offsetof(dnode_phys_t, dn_blkptr);
+ cd.cd_raw.iov_base = (char *)adnp;
+ cd.cd_raw.iov_len = cd.cd_length;
/* authenticate the core dnode (masking out non-portable bits) */
- bcopy(dnp, tmp_dncore, sizeof (tmp_dncore));
- adnp = (dnode_phys_t *)tmp_dncore;
+ bcopy(dnp, tmp_dncore, cd.cd_length);
if (le_bswap) {
adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec);
adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen);
@@ -1078,10 +1081,6 @@ zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version,
adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
adnp->dn_used = 0;
- cd.cd_length = sizeof (tmp_dncore);
- cd.cd_raw.iov_base = (char *)adnp;
- cd.cd_raw.iov_len = cd.cd_length;
-
ret = crypto_mac_update(ctx, &cd, NULL);
if (ret != CRYPTO_SUCCESS) {
ret = SET_ERROR(EIO);
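[The restructuring keeps the MAC length at offsetof(dnode_phys_t, dn_blkptr), the portable prefix of the dnode, while growing the scratch buffer to a full sizeof (dnode_phys_t), so that the dnode_phys_t * alias refers to a complete object. The prefix-hash shape, reduced to a sketch over a mock struct with illustrative names:

    #include <stddef.h>
    #include <string.h>

    struct rec {
            unsigned short  a;          /* portable fields ... */
            unsigned short  b;
            unsigned long   ptrs[3];    /* ... non-portable tail */
    };

    /* MAC only the portable prefix, from a full-sized scratch copy. */
    static void
    mac_prefix(const struct rec *src, void (*mac)(const void *, size_t))
    {
            unsigned char buf[sizeof (struct rec)];     /* full object */
            size_t len = offsetof(struct rec, ptrs);    /* prefix only */

            memcpy(buf, src, len);
            mac(buf, len);
    }
]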
diff --git a/usr/src/uts/common/fs/zfs/zio_inject.c b/usr/src/uts/common/fs/zfs/zio_inject.c
index a65721d175..e332da9672 100644
--- a/usr/src/uts/common/fs/zfs/zio_inject.c
+++ b/usr/src/uts/common/fs/zfs/zio_inject.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
*/
/*
@@ -100,6 +101,26 @@ static kmutex_t inject_delay_mtx;
static int inject_next_id = 1;
/*
+ * Test if the requested frequency was triggered
+ */
+static boolean_t
+freq_triggered(uint32_t frequency)
+{
+ /*
+ * zero implies always (100%)
+ */
+ if (frequency == 0)
+ return (B_TRUE);
+
+ /*
+ * Note: we still handle legacy (unscaled) frequency values
+ */
+ uint32_t maximum = (frequency <= 100) ? 100 : ZI_PERCENTAGE_MAX;
+
+ return (spa_get_random(maximum) < frequency);
+}
+
+/*
* Returns true if the given record matches the I/O in progress.
*/
static boolean_t
@@ -114,8 +135,7 @@ zio_match_handler(zbookmark_phys_t *zb, uint64_t type, int dva,
record->zi_object == DMU_META_DNODE_OBJECT) {
if (record->zi_type == DMU_OT_NONE ||
type == record->zi_type)
- return (record->zi_freq == 0 ||
- spa_get_random(100) < record->zi_freq);
+ return (freq_triggered(record->zi_freq));
else
return (B_FALSE);
}
@@ -130,8 +150,7 @@ zio_match_handler(zbookmark_phys_t *zb, uint64_t type, int dva,
zb->zb_blkid <= record->zi_end &&
(record->zi_dvas == 0 || (record->zi_dvas & (1ULL << dva))) &&
error == record->zi_error) {
- return (record->zi_freq == 0 ||
- spa_get_random(100) < record->zi_freq);
+ return (freq_triggered(record->zi_freq));
}
return (B_FALSE);
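[freq_triggered() gives zi_freq a dual encoding: zero fires always, values up to 100 keep the legacy percent-out-of-100 meaning, and anything larger is interpreted against ZI_PERCENTAGE_MAX for finer-grained rates. A userland model follows; rand() stands in for spa_get_random(), the PERCENTAGE_MAX value is a stand-in rather than the header's definition, and modulo bias is ignored:

    #include <stdint.h>
    #include <stdlib.h>

    #define PERCENTAGE_MAX  (1U << 30)  /* stand-in for ZI_PERCENTAGE_MAX */

    static int
    freq_triggered_model(uint32_t frequency)
    {
            uint32_t maximum;

            if (frequency == 0)         /* zero implies always (100%) */
                    return (1);

            /* values <= 100 keep the legacy percent scale */
            maximum = (frequency <= 100) ? 100 : PERCENTAGE_MAX;

            return ((uint32_t)rand() % maximum < frequency);
    }
]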
@@ -360,6 +379,12 @@ zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
if (handler->zi_record.zi_error == error) {
/*
+ * limit error injection if requested
+ */
+ if (!freq_triggered(handler->zi_record.zi_freq))
+ continue;
+
+ /*
* For a failed open, pretend like the device
* has gone away.
*/
@@ -527,6 +552,9 @@ zio_handle_io_delay(zio_t *zio)
if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
continue;
+ if (!freq_triggered(handler->zi_record.zi_freq))
+ continue;
+
if (vd->vdev_guid != handler->zi_record.zi_guid)
continue;
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
index 3ed5977c20..1b3bc07600 100644
--- a/usr/src/uts/common/fs/zfs/zvol.c
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -24,7 +24,7 @@
* Portions Copyright 2010 Robert Milkowski
*
* Copyright 2017 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2019 Joyent, Inc.
*/
@@ -1158,13 +1158,13 @@ zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
return (SET_ERROR(EINVAL));
}
- ASSERT(size <= zv->zv_volblocksize);
+ VERIFY3U(size, <=, zv->zv_volblocksize);
/* Locate the extent this belongs to */
- ze = list_head(&zv->zv_extents);
- while (offset >= ze->ze_nblks * zv->zv_volblocksize) {
+ for (ze = list_head(&zv->zv_extents);
+ ze != NULL && offset >= ze->ze_nblks * zv->zv_volblocksize;
+ ze = list_next(&zv->zv_extents, ze)) {
offset -= ze->ze_nblks * zv->zv_volblocksize;
- ze = list_next(&zv->zv_extents, ze);
}
if (ze == NULL)
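[The extent walk is now a for loop whose condition also tests ze != NULL, so an offset beyond the last extent terminates with ze == NULL (caught just below) instead of dereferencing a NULL list pointer. The same mapping over an array, as a sketch:

    #include <stdint.h>

    /*
     * Map a linear offset to an extent index, updating *offp to the
     * offset within that extent; return -1 if offset is past the end.
     */
    static int
    find_extent(const uint64_t *nblks, int nextents, uint64_t blksz,
        uint64_t *offp)
    {
            int i;

            for (i = 0; i < nextents && *offp >= nblks[i] * blksz; i++)
                    *offp -= nblks[i] * blksz;

            return (i < nextents ? i : -1);
    }
]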
@@ -1232,7 +1232,7 @@ zvol_strategy(buf_t *bp)
addr = bp->b_un.b_addr;
resid = bp->b_bcount;
- if (resid > 0 && (off < 0 || off >= volsize)) {
+ if (resid > 0 && off >= volsize) {
bioerror(bp, EIO);
biodone(bp);
return (0);
@@ -1499,7 +1499,7 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
bytes = volsize - off;
tot_bytes += bytes;
- dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
+ dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
@@ -1709,7 +1709,7 @@ zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
}
/*
- * Dirtbag ioctls to support mkfs(1M) for UFS filesystems. See dkio(7I).
+ * Dirtbag ioctls to support mkfs(8) for UFS filesystems. See dkio(4I).
* Also a dirtbag dkio ioctl for unmap/free-block functionality.
*/
/*ARGSUSED*/
@@ -1767,6 +1767,7 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
case DKIOCGMEDIAINFOEXT:
{
struct dk_minfo_ext dkmext;
+ size_t len;
bzero(&dkmext, sizeof (dkmext));
dkmext.dki_lbsize = 1U << zv->zv_min_bs;
@@ -1774,7 +1775,17 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
dkmext.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
dkmext.dki_media_type = DK_UNKNOWN;
mutex_exit(&zfsdev_state_lock);
- if (ddi_copyout(&dkmext, (void *)arg, sizeof (dkmext), flag))
+
+ switch (ddi_model_convert_from(flag & FMODELS)) {
+ case DDI_MODEL_ILP32:
+ len = sizeof (struct dk_minfo_ext32);
+ break;
+ default:
+ len = sizeof (struct dk_minfo_ext);
+ break;
+ }
+
+ if (ddi_copyout(&dkmext, (void *)arg, len, flag))
error = SET_ERROR(EFAULT);
return (error);
}
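[The DKIOCGMEDIAINFOEXT change matters because struct dk_minfo_ext contains a 64-bit capacity field: under ILP32 that member is 4-byte aligned, so the structure carries less tail padding and is smaller than its LP64 counterpart, and copying the LP64 size out to a 32-bit consumer would overrun the caller's buffer. A sketch of the size divergence; field names are illustrative, not the dkio(4I) definitions, and #pragma pack(4) emulates ILP32 alignment:

    #include <stdint.h>
    #include <stdio.h>

    /* LP64-style layout: 8-byte alignment for the 64-bit member. */
    struct minfo_lp64 {
            uint32_t        media_type;
            uint32_t        lbsize;
            uint64_t        capacity;
            uint32_t        pbsize;
    };

    /* ILP32-style layout: 64-bit members aligned to 4 bytes. */
    #pragma pack(4)
    struct minfo_ilp32 {
            uint32_t        media_type;
            uint32_t        lbsize;
            uint64_t        capacity;
            uint32_t        pbsize;
    };
    #pragma pack()

    int
    main(void)
    {
            /* Typically prints 24 and 20: tail padding differs by model. */
            (void) printf("%zu %zu\n", sizeof (struct minfo_lp64),
                sizeof (struct minfo_ilp32));
            return (0);
    }
]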