Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/boot/sys/boot/zfs/zfs.c                       |    1
-rw-r--r--  usr/src/cmd/boot/Makefile                             |    1
-rw-r--r--  usr/src/cmd/boot/mbr/Makefile                         |   46
-rw-r--r--  usr/src/cmd/boot/mbr/mbr.c                            |  172
-rw-r--r--  usr/src/cmd/rmformat/rmf_slice.c                      |   26
-rw-r--r--  usr/src/cmd/sgs/crle/common/inspect.c                 |   17
-rw-r--r--  usr/src/cmd/ztest/ztest.c                             |   16
-rw-r--r--  usr/src/pkg/manifests/system-boot-real-mode.mf        |    1
-rw-r--r--  usr/src/pkg/manifests/system-kernel.mf                |    1
-rw-r--r--  usr/src/psm/stand/bootblks/ufs/i386/Makefile          |   48
-rw-r--r--  usr/src/psm/stand/bootblks/ufs/i386/mboot.S           |  394
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu.h                   |    3
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_rlock.h             |   69
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_znode.h             |   14
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_rlock.c                 |  568
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_vnops.c                 |   54
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_znode.c                 |   71
-rw-r--r--  usr/src/uts/common/fs/zfs/zvol.c                      |   52
-rw-r--r--  usr/src/uts/common/inet/mi.c                          |   26
-rw-r--r--  usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c  |   43
20 files changed, 507 insertions, 1116 deletions
diff --git a/usr/src/boot/sys/boot/zfs/zfs.c b/usr/src/boot/sys/boot/zfs/zfs.c
index 8139f8fa24..1f68b012b2 100644
--- a/usr/src/boot/sys/boot/zfs/zfs.c
+++ b/usr/src/boot/sys/boot/zfs/zfs.c
@@ -465,6 +465,7 @@ zfs_probe(int fd, uint64_t *pool_guid)
spa_t *spa;
int ret;
+ spa = NULL;
ret = vdev_probe(vdev_read, (void *)(uintptr_t)fd, &spa);
if (ret == 0 && pool_guid != NULL)
*pool_guid = spa->spa_guid;
diff --git a/usr/src/cmd/boot/Makefile b/usr/src/cmd/boot/Makefile
index 32f8c778d6..44f319d263 100644
--- a/usr/src/cmd/boot/Makefile
+++ b/usr/src/cmd/boot/Makefile
@@ -35,7 +35,6 @@ COMMON_SUBDIRS= \
i386_SUBDIRS= \
installgrub \
- mbr \
symdef
sparc_SUBDIRS=
diff --git a/usr/src/cmd/boot/mbr/Makefile b/usr/src/cmd/boot/mbr/Makefile
deleted file mode 100644
index 844ee0d758..0000000000
--- a/usr/src/cmd/boot/mbr/Makefile
+++ /dev/null
@@ -1,46 +0,0 @@
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-#
-#
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
-#
-
-BOOTPROG= mbr
-
-OBJS= $(BOOTPROG).o
-SRCS = $(OBJS:.o=.c)
-
-include ../Makefile.com
-
-CLOBBERFILES = $(BOOTPROG)
-
-.KEEP_STATE:
-
-all: $(BOOTPROG)
-
-install: all $(ROOTBOOTSOLARISBINPROG)
-
-clean:
- -$(RM) $(OBJS)
-
-lint: lint_SRCS
-
-include ../Makefile.targ
diff --git a/usr/src/cmd/boot/mbr/mbr.c b/usr/src/cmd/boot/mbr/mbr.c
deleted file mode 100644
index e4978d224d..0000000000
--- a/usr/src/cmd/boot/mbr/mbr.c
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <stdio.h>
-#include <sys/types.h>
-#include <string.h>
-#include <fcntl.h>
-#include <libgen.h>
-#include <stdlib.h>
-#include <strings.h>
-#include <unistd.h>
-#include <sys/dktp/fdisk.h>
-
-#define SECTOR_SIZE 512
-static char boot_sect[SECTOR_SIZE];
-static char new_mboot[SECTOR_SIZE];
-
-static void
-usage(char *progname)
-{
- fprintf(stderr, "Usage: %s [ -d | -n | -o | -r ] <device> [<mboot>]\n",
- basename(progname));
- fprintf(stderr, "\t-n Set new Solaris partition magic 0xbf\n");
- fprintf(stderr, "\t-o Set old Solaris partition magic 0x82\n");
- fprintf(stderr, "\t-r Replace master boot program "
- "(/usr/lib/fs/ufs/mboot)\n");
- exit(-1);
-}
-
-int
-main(int argc, char *argv[])
-{
- int c, fd, i, sol_part = -1;
- int setold = 0, setnew = 0, write_mboot = 0, list_hd = 0;
- char *device;
- struct mboot *mboot;
- char *mboot_file = "/usr/lib/fs/ufs/mboot";
-
- while ((c = getopt(argc, argv, "dnor")) != EOF) {
- switch (c) {
- case 'd':
- list_hd = 1;
- continue;
- case 'n':
- setnew = 1;
- continue;
- case 'o':
- setold = 1;
- continue;
- case 'r':
- write_mboot = 1;
- continue;
- default:
- usage(argv[0]);
- }
- }
-
- /* check arguments */
- if ((setnew && setold) || argc < optind + 1) {
- usage(argv[0]);
- }
-
- if (write_mboot && argc > optind + 1) {
- mboot_file = strdup(argv[optind + 1]);
- }
- if (!mboot_file) {
- usage(argv[0]);
- }
- fd = open(mboot_file, O_RDONLY);
- if (fd == -1 || read(fd, new_mboot, SECTOR_SIZE) != SECTOR_SIZE) {
- fprintf(stderr, "cannot read file %s\n", mboot_file);
- if (fd == -1)
- perror("open");
- else
- perror("read");
- exit(-1);
- }
- close(fd);
-
- device = strdup(argv[optind]);
- if (!device) {
- usage(argv[0]);
- }
- fd = open(device, O_RDWR);
- if (fd == -1 || read(fd, boot_sect, SECTOR_SIZE) != SECTOR_SIZE) {
- fprintf(stderr, "cannot read MBR on %s\n", device);
- if (fd == -1)
- perror("open");
- else
- perror("read");
- exit(-1);
- }
-
- mboot = (struct mboot *)boot_sect;
- for (i = 0; i < FD_NUMPART; i++) {
- struct ipart *part = (struct ipart *)mboot->parts + i;
- if (!list_hd) {
- if (part->bootid == 128)
- printf("active ");
- else
- printf(" ");
- }
- if (setnew && part->systid == 0x82) {
- part->systid = 0xbf;
- sol_part = i;
- } else if (setold && part->systid == 0xbf) {
- part->systid = 0x82;
- sol_part = i;
- } else if (list_hd &&
- (part->systid == 0x82 || part->systid == 0xbf)) {
- sol_part = i;
- }
- if (!list_hd)
- printf("%d (0x%2x): start_sect %u, size_sect %u\n",
- i + 1, part->systid, part->relsect, part->numsect);
- }
-
- if (list_hd) {
- printf("(hd0,%d,a)\n", sol_part);
- (void) close(fd);
- return (0);
- }
-
- /* write new mboot */
- if (write_mboot || sol_part != -1) {
- if (write_mboot) {
- /* copy over the new boot program */
- bcopy((void *)new_mboot, (void *)boot_sect, BOOTSZ);
- }
-
- if ((lseek(fd, 0, SEEK_SET) < 0) ||
- (write(fd, (void *)boot_sect, SECTOR_SIZE) < 0)) {
- perror("failed to update MBR");
- exit(-1);
- }
- if (sol_part != -1) {
- printf("Changed solaris partition %d", sol_part + 1);
- if (setnew)
- printf("from 0x82 to 0xbf\n");
- else
- printf("from 0xbf to 0x82\n");
- }
- if (write_mboot) {
- printf("Replaced mboot program with %s\n", mboot_file);
- }
- }
-
- (void) close(fd);
- return (0);
-}
diff --git a/usr/src/cmd/rmformat/rmf_slice.c b/usr/src/cmd/rmformat/rmf_slice.c
index dccd09b609..ec8d5209e0 100644
--- a/usr/src/cmd/rmformat/rmf_slice.c
+++ b/usr/src/cmd/rmformat/rmf_slice.c
@@ -25,8 +25,8 @@
/*
* rmf_slice.c :
- * This file contains the functions for parsing a slice file
- * for rmformat.
+ * This file contains the functions for parsing a slice file
+ * for rmformat.
*/
#include <sys/types.h>
@@ -786,7 +786,7 @@ str2sector(char *str)
int32_t
valid_slice_file(smedia_handle_t handle, int32_t fd, char *file_name,
- struct extvtoc *vt)
+ struct extvtoc *vt)
{
struct stat status;
int32_t ret_val;
@@ -852,17 +852,17 @@ valid_slice_file(smedia_handle_t handle, int32_t fd, char *file_name,
* in fdisk table.
* Following table describes how is it handled
* SPARC:
- * SCSI/ATAPI, floppy, pcmcia : don't check for fdisk.
+ * SCSI/ATAPI, floppy, pcmcia : don't check for fdisk.
* DKIOCGGEOM is sufficient.
* x86 : floppy, pcmcia : Don't check for fdisk. DKIOCGGEOM is sufficient.
- * SCSI/ATAPI : Check for fdisk.
+ * SCSI/ATAPI : Check for fdisk.
* if not present, assume that the solaris
* partition covers 100% of the medium
- * (minus one cylinder).
+ * (minus one cylinder).
*
- * if present :
+ * if present :
* check for active solaris partition.
- * if not found, take the first solaris
+ * if not found, take the first solaris
* partition.
* If there are no solaris partitions, its an error, stop.
*/
@@ -1114,7 +1114,7 @@ Solaris partition\n"));
static int32_t
get_fdisk(smedia_handle_t handle, int32_t fd, int32_t offset,
- struct fdisk_info *fdisk)
+ struct fdisk_info *fdisk)
{
struct mboot *boot_sec;
struct ipart *part;
@@ -1220,7 +1220,7 @@ get_fdisk(smedia_handle_t handle, int32_t fd, int32_t offset,
/*
* wrrite_defualt_label(int32_t fd)
- * fd = file descriptor for the device.
+ * fd = file descriptor for the device.
*
* For sparc solaris
* Create a vtoc partition with
@@ -1351,7 +1351,7 @@ write_default_label(smedia_handle_t handle, int32_t fd)
return;
}
- tmp_fd = open("/usr/lib/fs/ufs/mboot", O_RDONLY);
+ tmp_fd = open("/boot/pmbr", O_RDONLY);
if (tmp_fd <= 0) {
return;
}
@@ -1499,9 +1499,9 @@ write_default_label(smedia_handle_t handle, int32_t fd)
* zip/jaz media. So, the meta data on the disk should be erased.
*
* If there is a valid fdisk table,
- * erase first 64K of each partition.
+ * erase first 64K of each partition.
* If there is a valid vtoc,
- * erase first 64k of each slice.
+ * erase first 64k of each slice.
* Then erase the 0th sector (the home for vtoc and fdisk) of the disk.
* Note that teh vtoc on x86 resides in one of the fdisk partition.
* So delay the erasing of the solaris partition until the vtoc is read.
diff --git a/usr/src/cmd/sgs/crle/common/inspect.c b/usr/src/cmd/sgs/crle/common/inspect.c
index 7f68c77a27..38667fccba 100644
--- a/usr/src/cmd/sgs/crle/common/inspect.c
+++ b/usr/src/cmd/sgs/crle/common/inspect.c
@@ -97,8 +97,8 @@ enteralt(Crle_desc *crle, const char *path, const char *file, Half flags,
if (flags & RTC_OBJ_DUMP) {
char _alter[PATH_MAX];
- (void) strcpy(_alter, crle->c_objdir);
- (void) realpath(_alter, _alter);
+ (void) strlcpy(alter, crle->c_objdir, sizeof (alter));
+ (void) realpath(alter, _alter);
(void) snprintf(alter, PATH_MAX, MSG_ORIG(MSG_FMT_PATH),
_alter, file);
if (strcmp(alter, obj->o_path) == 0) {
@@ -107,9 +107,10 @@ enteralt(Crle_desc *crle, const char *path, const char *file, Half flags,
return (0);
}
obj->o_flags |= RTC_OBJ_DUMP;
- } else
+ } else {
(void) snprintf(alter, PATH_MAX, MSG_ORIG(MSG_FMT_PATH),
crle->c_objdir, file);
+ }
obj->o_flags |= RTC_OBJ_ALTER;
/*
@@ -119,8 +120,9 @@ enteralt(Crle_desc *crle, const char *path, const char *file, Half flags,
if (obj->o_alter) {
crle->c_strsize -= strlen(alter) + 1;
fmt = MSG_INTL(MSG_DIA_ALTUPDATE);
- } else
+ } else {
fmt = MSG_INTL(MSG_DIA_ALTCREATE);
+ }
/*
* Allocate the new alternative and update the string table size.
@@ -397,8 +399,9 @@ _enterfile(Crle_desc *crle, const char *file, int off, Hash_ent *fent,
if ((nfile = malloc(size)) == NULL)
return (0);
(void) strcpy(nfile, file);
- } else
+ } else {
nfile = (char *)file;
+ }
fent->e_key = (Addr)nfile;
fent->e_off = off;
@@ -1028,9 +1031,9 @@ inspect(Crle_desc *crle, const char *name, Half flags)
} else {
size_t off = file - name;
- if (file == name)
+ if (file == name) {
dir = MSG_ORIG(MSG_DIR_ROOT);
- else {
+ } else {
(void) strncpy(_dir, name, off);
_dir[off] = '\0';
dir = (const char *)_dir;
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c
index 46a8dd2c4e..8db1103272 100644
--- a/usr/src/cmd/ztest/ztest.c
+++ b/usr/src/cmd/ztest/ztest.c
@@ -237,7 +237,9 @@ typedef struct bufwad {
} bufwad_t;
/*
- * XXX -- fix zfs range locks to be generic so we can use them here.
+ * It would be better to use a rangelock_t per object. Unfortunately
+ * the rangelock_t is not a drop-in replacement for rl_t, because we
+ * still need to map from object ID to rangelock_t.
*/
typedef enum {
RL_READER,
@@ -1845,12 +1847,12 @@ static void
ztest_get_done(zgd_t *zgd, int error)
{
ztest_ds_t *zd = zgd->zgd_private;
- uint64_t object = zgd->zgd_rl->rl_object;
+ uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object;
if (zgd->zgd_db)
dmu_buf_rele(zgd->zgd_db, zgd);
- ztest_range_unlock(zgd->zgd_rl);
+ ztest_range_unlock((rl_t *)zgd->zgd_lr);
ztest_object_unlock(zd, object);
umem_free(zgd, sizeof (*zgd));
@@ -1900,8 +1902,8 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb,
zgd->zgd_private = zd;
if (buf != NULL) { /* immediate write */
- zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
- RL_READER);
+ zgd->zgd_lr = (struct locked_range *)ztest_range_lock(zd,
+ object, offset, size, RL_READER);
error = dmu_read(os, object, offset, size, buf,
DMU_READ_NO_PREFETCH);
@@ -1915,8 +1917,8 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb,
offset = 0;
}
- zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
- RL_READER);
+ zgd->zgd_lr = (struct locked_range *)ztest_range_lock(zd,
+ object, offset, size, RL_READER);
error = dmu_buf_hold(os, object, offset, zgd, &db,
DMU_READ_NO_PREFETCH);
diff --git a/usr/src/pkg/manifests/system-boot-real-mode.mf b/usr/src/pkg/manifests/system-boot-real-mode.mf
index 094432e059..4492c13f0d 100644
--- a/usr/src/pkg/manifests/system-boot-real-mode.mf
+++ b/usr/src/pkg/manifests/system-boot-real-mode.mf
@@ -45,7 +45,6 @@ $(i386_ONLY)file path=boot/solaris/bootenv.rc group=sys \
$(i386_ONLY)file path=boot/solaris/devicedb/master group=sys \
original_name=SUNWrmod:boot/solaris/devicedb/master preserve=true
$(i386_ONLY)file path=boot/splashimage.xpm group=sys
-$(i386_ONLY)file path=usr/lib/fs/ufs/mboot mode=0444
$(i386_ONLY)hardlink path=boot/solaris.xpm target=splashimage.xpm
$(i386_ONLY)legacy pkg=SUNWrmodr desc="Realmode Modules, (Root)" \
name="Realmode Modules, (Root)"
diff --git a/usr/src/pkg/manifests/system-kernel.mf b/usr/src/pkg/manifests/system-kernel.mf
index d9b0c02702..59fe6451a4 100644
--- a/usr/src/pkg/manifests/system-kernel.mf
+++ b/usr/src/pkg/manifests/system-kernel.mf
@@ -295,7 +295,6 @@ driver name=wc perms="* 0600 root sys"
$(i386_ONLY)file path=boot/solaris/bin/create_diskmap group=sys mode=0555
file path=boot/solaris/bin/create_ramdisk group=sys mode=0555
file path=boot/solaris/bin/extract_boot_filelist group=sys mode=0555
-$(i386_ONLY)file path=boot/solaris/bin/mbr group=sys mode=0555
$(i386_ONLY)file path=boot/solaris/bin/symdef group=sys mode=0555
$(i386_ONLY)file path=boot/solaris/bin/update_grub group=sys mode=0555
file path=boot/solaris/filelist.ramdisk group=sys
diff --git a/usr/src/psm/stand/bootblks/ufs/i386/Makefile b/usr/src/psm/stand/bootblks/ufs/i386/Makefile
index 0d5b74fda8..929fde267e 100644
--- a/usr/src/psm/stand/bootblks/ufs/i386/Makefile
+++ b/usr/src/psm/stand/bootblks/ufs/i386/Makefile
@@ -25,51 +25,19 @@
#
# psm/stand/bootblks/ufs/i386/Makefile
#
-.KEEP_STATE:
BASEDIR = ../..
include $(BASEDIR)/ufs/Makefile.ufs
+all := TARGET= all
+install := TARGET= install
+clean := TARGET= clean
+clobber := TARGET= clobber
+lint := TARGET= lint
-CC = $(GNUC_ROOT)/bin/gcc
-ASFLAGS = -B$(GNUC_ROOT)/bin/ -fno-builtin -nostdinc
-CPPFLAGS =
-
-LD = $(GNU_ROOT)/bin/gld
-LDFLAGS = -nostdlib -N -Ttext 600
-
-OBJCOPY = $(GNU_ROOT)/bin/gobjcopy
-
-INSTALL_DIR = $(USR)/lib/fs/ufs
-INSTALL_TARGETS = $(PROGS:%=$(INSTALL_DIR)/%)
-
-$(INSTALL_TARGETS) := FILEMODE = 0444
-
-PROGS = mboot
-
-
-all: $(PROGS)
-
-$(PROGS): $$(@).exec
- $(OBJCOPY) -O binary $@.exec $@
-
-%.exec: %.o
- $(LD) $(LDFLAGS) -o $@ $(@:exec=o)
-
-
-install: all $(INSTALL_TARGETS)
-
-$(INSTALL_DIR)/%: $(INSTALL_DIR) %
- $(INS.file)
-
-$(INSTALL_DIR):
- $(INS.dir)
-
-lint:
+.KEEP_STATE:
-clean:
- $(RM) *.exec *.o
+all install lint clean clobber: FRC
-clobber: clean
- $(RM) $(PROGS) $(INSTALLBOOT)
+FRC:
diff --git a/usr/src/psm/stand/bootblks/ufs/i386/mboot.S b/usr/src/psm/stand/bootblks/ufs/i386/mboot.S
deleted file mode 100644
index 48afbae207..0000000000
--- a/usr/src/psm/stand/bootblks/ufs/i386/mboot.S
+++ /dev/null
@@ -1,394 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2012 OmniTI Computer Consulting, Inc. All rights reserved.
- *
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * SOLARIS MASTER BOOT:
- *
- * PURPOSE: loads the primary boot from the active fdisk partition.
- * in effect, this routine mimics the functionality of INT 0x19.
- *
- * resides on the first physical sector of the hard drive media.
- * loaded by INT 0x19 (ROM bootstrap loader) at address 0x7C00
- * limited to 512 bytes total, including embedded fdisk table.
- *
- * for compatibility with the ROM BIOS, we contain standard DOS structures:
- *
- * the fdisk partition table (at offset 0x1BE-0x1FE)
- * boot signature bytes (0x55, 0xAA at 0x1FE, 0x1FF)
- *
- * the above two entities are required in order to be compatible with
- * the manner in which the DOS BIOS has always performed its boot operation.
- * In the event that our master boot record is inadvertently replaced by
- * a standard DOS boot sector, the booting operation will still succeed!
- *
- * This master boot record uses the relsect/numsect fields of the partition
- * table entry, to compute the start of the active partition; therefore,
- * it is geometry independent. This means that the drive could be "built"
- * on a system with a disk controller that uses a given disk geometry, but
- * would run on any other controller.
- *
- * SYNOPSIS:
- * begins execution at 0:0x7C00
- * relocates to 0:0x600 (to get out of the way!)
- * reads fdisk table to locate bootable partition
- * load boot record from the active fdisk partition at 0x7C00
- * verify boot record signature bytes
- * jump to/execute the SOLARIS PARTITION PRIMARY BOOT
- * error handler - can either reboot, or invoke INT 0x18.
- *
- * interface from DOS INT 0x19: BootDev in DL
- * (this fails sometimes, so we look for a signature to determine whether
- * to rely on DL from the floppy boot, or if we should assume 0x80 from
- * the BIOS)
- *
- * interface to partition boot: BootDev in DL
- *
- *=============================================================================
- * Master boot record: resides on first physical sector of device
- */
-
-/*
- * This file is written in GNU as syntax using Intel assembler syntax. The
- * startup label _start will be executed at address PBOOT_ADDR (0x7C00), but
- * the text section must be set at address RELOC_ADDR (0x600). With GNU ld
- * this can be done using the "-Ttext 600" option.
- */
-
-
-#define PBOOT_ADDR 0x7C00
-#define RELOC_ADDR 0x600
-
-#define FDISK_START 0x1BE
-#define BOOT_SIG 0xAA55
-#define N_RETRIES 5
-
-#define FD_NUMPART 4
-#define FD_PTESIZE 0x10
-#define ACTIVE 0x80
-
-/*
- * A convenience macro for declaring a message string (using .ascii directive--
- * NOT nul-terminated) surrounded by two labels, which can then be used with
- * the SIZEOF() macro to get its length.
- */
-#define MSG(label, string) label: .ascii string; label##_end:
-
-/*
- * Returns the length of some consecutive bytes. These bytes must be placed
- * between two labels. The ending label must be the same as the starting label
- * but with a suffix "_end".
- */
-#define SIZEOF(label) (label##_end - label)
-
-
- .title "Solaris_Master_Boot"
-
- .intel_syntax noprefix /* use Intel syntax */
- .code16 /* 16-bit mode (real mode) */
-
- .text /* code segment begins here */
-
- .global BootDev
- .global _start
-
-_start: /* _start is loaded at PBOOT_ADDR */
- jmp bootrun
-
-Version:
- .ascii "M3.0" /* ident string */
-
-bootrun:
- cli /* don't bother me now! */
-
- /* prepare to relocate ourselves */
- cld /* prepare for relocation */
- mov si, PBOOT_ADDR
- mov di, RELOC_ADDR
-
- /* set up segment registers */
- mov ax, cs /* initialize segment registers */
- mov ss, ax
- mov sp, si /* stack starts down from 7C00 */
- mov es, ax
- mov ds, ax
-
- push cx /* save possible signature on stack */
- mov cx, 0x100
- rep movsw
- pop cx /* restore saved cx */
-
- /* running at PBOOT_ADDR, jump to RELOC_ADDR-rel addr */
- jmp (new_home - PBOOT_ADDR + RELOC_ADDR)
-
-new_home:
- sti /* re-enable interrupts */
-
- /*
- * assuming boot device number is in dl has caused problems in the past
- * since we still don't absolutely have to rely on it, I've just
- * removed the now-pointless code to check for the FACE-CAFE signature
- * from mdexec, which doesn't do anything anymore, but left the
- * assumption that BootDev is 0x80 and nothing but. If we ever need to
- * have BIOS load us from a drive not numbered 0x80, we'll need to
- * uncomment the following line; otherwise, the initialized value of
- * BootDev, namely 0x80, will be used for disk accesses.
- */
- /* mov BootDev, dl */
-
- /* set debug flag based on seeing "both shift down" */
- mov ah, 2 /* get shift state */
- int 0x16
- and al, 3 /* isolate shift-key bits */
- cmp al, 3
- jne nodbg
- mov byte ptr [debugmode], 1 /* set to 1 */
-
-nodbg:
- /*
- * Search the fdisk table sequentially to find a physical partition
- * that is marked as "active" (bootable).
- */
- mov bx, RELOC_ADDR + FDISK_START
- mov cx, FD_NUMPART
-
-nxtpart:
- cmp byte ptr [bx], ACTIVE
- je got_active_part
- add bx, FD_PTESIZE
- loop nxtpart
-
-noparts:
- mov bp, offset NoActiveErrMsg
- mov cx, SIZEOF(NoActiveErrMsg)
- jmp fatal_err
-
-got_active_part:
- mov ah, 0 /* reset disk */
- int 0x13
-
- push bx /* save partition pointer */
-
- /* Check for LBA BIOS */
- mov ah, 0x41 /* chkext function */
- mov bx, 0x55AA /* signature to change */
- mov cx, 0
- int 0x13
- jc noLBA /* carry == failure */
- cmp bx, 0xAA55
- jne noLBA /* bad signature in BX == failure */
- test cx, 1 /* cx & 1 must be true, or... */
- jz noLBA /* ...no LBA */
-
- mov bp, offset lbastring
- mov cx, SIZEOF(lbastring)
- call debugout
-
- /*
- * LBA case: form a packet on the stack and call fn 0x42 to read
- * packet, backwards (from hi to lo addresses):
- * 8-byte LBA
- * seg:ofs buffer address
- * byte reserved
- * byte nblocks
- * byte reserved
- * packet size in bytes (>= 0x10)
- */
-
- pop bx /* restore partition pointer */
- push bx /* and save again */
- mov cx, N_RETRIES /* retry count */
-retryLBA:
- pushd 0 /* hi 32 bits of 64-bit sector number */
- push dword ptr [bx+8] /* relsect (lo 32 of 64-bit number) */
- push dword ptr [solaris_priboot] /* seg:ofs of buffer */
- push 1 /* reserved, one block */
- push 0x10 /* reserved, size (0x10) */
- mov ah, 0x42 /* "read LBA" */
- mov si, sp /* (ds already == ss) */
- int 0x13
- lahf /* save flags */
- add sp, 16 /* restore stack */
- sahf /* restore flags */
- jnc readok /* got it */
- mov ah, 0 /* reset disk */
- int 0x13
- loop retryLBA /* try again */
- jmp readerr /* exhausted retries; give up */
-
-noLBA:
- mov bp, offset chsstring
- mov cx, SIZEOF(chsstring)
- call debugout
-
- pop bx /* restore partition pointer */
- push bx /* and save again */
-
- /* get BIOS disk parameters */
- mov dl, byte ptr [BootDev]
- mov ah, 0x8
- int 0x13
-
- jnc geomok
-
- /* error reading geom; die */
- mov bp, offset GeomErrMsg
- mov cx, SIZEOF(GeomErrMsg)
- jmp fatal_err
-
-geomok:
- /* calculate sectors per track */
- mov al, cl /* ah doesn't matter; mul dh will set it */
- and al, 0x3F
- mov byte ptr [secPerTrk], al
-
- /* calculate sectors per cylinder */
- inc dh
- mul dh
- mov word ptr [secPerCyl], ax
-
- /* calculate cylinder # */
- mov ax, [bx+8] /* ax = loword(relsect) */
- mov dx, [bx+10] /* dx:ax = relsect */
- div word ptr [secPerCyl] /* ax = cyl, */
- /* dx = sect in cyl (0 - cylsize-1) */
- mov bx, ax /* bx = cyl */
-
- /* calculate head/sector # */
- mov ax, dx /* ax = sect in cyl (0 - cylsize-1) */
- div byte ptr [secPerTrk] /* al = head, */
- /* ah = 0-rel sect in track */
- inc ah /* ah = 1-rel sector */
-
- xor cl,cl /* cl = 0 */
- mov ch, bh /* ch = hi bits of cyl (if any) */
- shr cx, 2 /* cl{7:6} = cyl{9:8} (if any) */
- and cl, 0xC0 /* cl = cyl{9:8} to merge with sect (if any) */
-
- or cl, ah /* cl{7:6} = cyl bits, cl{5:0} = sect */
- mov ch, bl /* ch = lo cyl bits */
- mov dh, al /* dh = head */
- mov dl, byte ptr [BootDev] /* dl = drivenum */
- les bx, solaris_priboot /* es:bx points to buffer */
-
- mov si, N_RETRIES
-retry_noLBA:
- mov ax, 0x201 /* 02=read, sector count = 1 */
-
- int 0x13
- jnc readok
- mov ah, 0 /* reset disk */
- int 0x13
- dec si
- cmp si, 0
- jne retry_noLBA /* retry, or fall through to read error */
-
-readerr:
- mov bp, offset ReadErrMsg
- mov cx, SIZEOF(ReadErrMsg)
- jmp fatal_err
-
-readok:
- /* verify boot record signature */
- mov bx, PBOOT_ADDR
- cmp word ptr [bx+0x1FE], BOOT_SIG
- je sigok
-
- mov bp, offset SigErrMsg
- mov cx, SIZEOF(SigErrMsg)
- jmp fatal_err
-
-sigok:
- mov dl, byte ptr [BootDev] /* pass BootDev to next boot phase */
- pop si /* and pass partition pointer ds:si */
- call dword ptr [solaris_priboot] /* call doesn't return! */
-
- mov bp, offset ReturnErrMsg
- mov cx, SIZEOF(ReturnErrMsg)
-
-fatal_err: /* land of no return....... */
- /*
- * bp contains pointer to error message string,
- * cx contains string length
- */
- mov bx, 0x4F /* video page, attribute */
- call msgout
- int 0x18
-
-debugout:
- /* call with string pointer in es:bp, len in cx */
- cmp byte ptr [debugmode], 0
- je debugout_ret /* skip if not in debug mode */
-
- mov bx, 0x1F /* page, attr (white on blue) */
-
- /* alternate entry for fatal_err */
-msgout:
- pusha
- mov ax, 0x1301
- mov dx, 0x1700 /* row, col */
- int 0x10
-
- mov al, 7 /* BEL */
- mov cx, 1
- int 0x10
-
- mov ah, 0 /* get key */
- int 0x16
- popa
-
-debugout_ret:
- ret
-
-secPerTrk:
- .byte 0
-secPerCyl:
- .word 0
-solaris_priboot:
- .long PBOOT_ADDR
-BootDev:
- .byte 0x80 /* assumes drive 80 (see comment above) */
-debugmode:
- .byte 0
-
-MSG(GeomErrMsg, "Can't read geometry")
-MSG(NoActiveErrMsg, "No active partition")
-MSG(ReadErrMsg, "Can't read PBR")
-MSG(SigErrMsg, "Bad PBR sig")
-MSG(ReturnErrMsg, "!!!")
-MSG(lbastring, "LBA")
-MSG(chsstring, "CHS")
-
-/*
- * For debugging: Here's a representative FDISK table entry
- *
- * .org 0x1BE
- * .byte 0x80,1,1,0,0x82,0xfe,0x7f,4,0x3f,0,0,0,0x86,0xfa,0x3f,0
- */
- .org 0x1FE
-
- .word BOOT_SIG
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
index fb3c6b2fbe..cf24fd5c5e 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -75,6 +75,7 @@ struct nvlist;
struct arc_buf;
struct zio_prop;
struct sa_handle;
+struct locked_range;
typedef struct objset objset_t;
typedef struct dmu_tx dmu_tx_t;
@@ -941,7 +942,7 @@ typedef struct zgd {
struct lwb *zgd_lwb;
struct blkptr *zgd_bp;
dmu_buf_t *zgd_db;
- struct rl *zgd_rl;
+ struct locked_range *zgd_lr;
void *zgd_private;
} zgd_t;
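
A note on the zgd_t change above: carrying an opaque struct locked_range pointer lets dmu_sync() callers release the range without knowing about znodes. A minimal, hedged sketch of the "done" side of that pattern follows; the function name is illustrative, and the corresponding zfs_vnops.c/zvol.c consumer hunks are not shown in full in this excerpt -- this only shows how zgd_lr is expected to be consumed.

static void
example_get_done(zgd_t *zgd, int error)
{
        if (zgd->zgd_db != NULL)
                dmu_buf_rele(zgd->zgd_db, zgd);

        /* the range was taken with rangelock_enter() in the get_data path */
        rangelock_exit(zgd->zgd_lr);
        kmem_free(zgd, sizeof (zgd_t));
}
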
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_rlock.h b/usr/src/uts/common/fs/zfs/sys/zfs_rlock.h
index 93733ba8a2..37a5594bbc 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_rlock.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_rlock.h
@@ -22,6 +22,9 @@
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2018 by Delphix. All rights reserved.
+ */
#ifndef _SYS_FS_ZFS_RLOCK_H
#define _SYS_FS_ZFS_RLOCK_H
@@ -30,54 +33,44 @@
extern "C" {
#endif
-#ifdef _KERNEL
-
-#include <sys/zfs_znode.h>
-
typedef enum {
RL_READER,
RL_WRITER,
RL_APPEND
-} rl_type_t;
+} rangelock_type_t;
-typedef struct rl {
- znode_t *r_zp; /* znode this lock applies to */
- avl_node_t r_node; /* avl node link */
- uint64_t r_off; /* file range offset */
- uint64_t r_len; /* file range length */
- uint_t r_cnt; /* range reference count in tree */
- rl_type_t r_type; /* range type */
- kcondvar_t r_wr_cv; /* cv for waiting writers */
- kcondvar_t r_rd_cv; /* cv for waiting readers */
- uint8_t r_proxy; /* acting for original range */
- uint8_t r_write_wanted; /* writer wants to lock this range */
- uint8_t r_read_wanted; /* reader wants to lock this range */
-} rl_t;
+struct locked_range;
-/*
- * Lock a range (offset, length) as either shared (RL_READER)
- * or exclusive (RL_WRITER or RL_APPEND). RL_APPEND is a special type that
- * is converted to RL_WRITER that specified to lock from the start of the
- * end of file. Returns the range lock structure.
- */
-rl_t *zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type);
+typedef void (rangelock_cb_t)(struct locked_range *, void *);
-/* Unlock range and destroy range lock structure. */
-void zfs_range_unlock(rl_t *rl);
+typedef struct rangelock {
+ avl_tree_t rl_tree; /* contains locked_range_t */
+ kmutex_t rl_lock;
+ rangelock_cb_t *rl_cb;
+ void *rl_arg;
+} rangelock_t;
-/*
- * Reduce range locked as RW_WRITER from whole file to specified range.
- * Asserts the whole file was previously locked.
- */
-void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len);
+typedef struct locked_range {
+ rangelock_t *lr_rangelock; /* rangelock that this lock applies to */
+ avl_node_t lr_node; /* avl node link */
+ uint64_t lr_offset; /* file range offset */
+ uint64_t lr_length; /* file range length */
+ uint_t lr_count; /* range reference count in tree */
+ rangelock_type_t lr_type; /* range type */
+ kcondvar_t lr_write_cv; /* cv for waiting writers */
+ kcondvar_t lr_read_cv; /* cv for waiting readers */
+ uint8_t lr_proxy; /* acting for original range */
+ uint8_t lr_write_wanted; /* writer wants to lock this range */
+ uint8_t lr_read_wanted; /* reader wants to lock this range */
+} locked_range_t;
-/*
- * AVL comparison function used to order range locks
- * Locks are ordered on the start offset of the range.
- */
-int zfs_range_compare(const void *arg1, const void *arg2);
+void rangelock_init(rangelock_t *, rangelock_cb_t *, void *);
+void rangelock_fini(rangelock_t *);
-#endif /* _KERNEL */
+locked_range_t *rangelock_enter(rangelock_t *,
+ uint64_t, uint64_t, rangelock_type_t);
+void rangelock_exit(locked_range_t *);
+void rangelock_reduce(locked_range_t *, uint64_t, uint64_t);
#ifdef __cplusplus
}
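
Taken together, the new header boils down to a small, consumer-agnostic API. Below is a minimal usage sketch (the my_object_* names are hypothetical); with a NULL callback the lock should only be requested as RL_READER or RL_WRITER, since only a callback can perform the required RL_APPEND conversion.

#include <sys/zfs_context.h>
#include <sys/zfs_rlock.h>

typedef struct my_object {
        rangelock_t     mo_rangelock;
        /* ... other object state ... */
} my_object_t;

static void
my_object_init(my_object_t *mo)
{
        /* no callback: RL_APPEND will not be used on this lock */
        rangelock_init(&mo->mo_rangelock, NULL, NULL);
}

static void
my_object_fini(my_object_t *mo)
{
        rangelock_fini(&mo->mo_rangelock);
}

static void
my_object_write(my_object_t *mo, uint64_t off, uint64_t len)
{
        locked_range_t *lr = rangelock_enter(&mo->mo_rangelock,
            off, len, RL_WRITER);

        /* ... modify the range while it is exclusively held ... */

        rangelock_exit(lr);
}
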
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
index bebe577d3f..8c4f8f7dc8 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
*/
@@ -39,6 +39,7 @@
#include <sys/rrwlock.h>
#include <sys/zfs_sa.h>
#include <sys/zfs_stat.h>
+#include <sys/zfs_rlock.h>
#endif
#include <sys/zfs_acl.h>
#include <sys/zil.h>
@@ -60,8 +61,8 @@ extern "C" {
#define ZFS_APPENDONLY 0x0000004000000000
#define ZFS_NODUMP 0x0000008000000000
#define ZFS_OPAQUE 0x0000010000000000
-#define ZFS_AV_QUARANTINED 0x0000020000000000
-#define ZFS_AV_MODIFIED 0x0000040000000000
+#define ZFS_AV_QUARANTINED 0x0000020000000000
+#define ZFS_AV_MODIFIED 0x0000040000000000
#define ZFS_REPARSE 0x0000080000000000
#define ZFS_OFFLINE 0x0000100000000000
#define ZFS_SPARSE 0x0000200000000000
@@ -81,8 +82,8 @@ extern "C" {
*/
#define ZFS_XATTR 0x1 /* is an extended attribute */
#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */
-#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */
-#define ZFS_ACL_OBJ_ACE 0x8 /* ACL has CMPLX Object ACE */
+#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */
+#define ZFS_ACL_OBJ_ACE 0x8 /* ACL has CMPLX Object ACE */
#define ZFS_ACL_PROTECTED 0x10 /* ACL protected */
#define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */
#define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be inherited */
@@ -176,8 +177,7 @@ typedef struct znode {
krwlock_t z_parent_lock; /* parent lock for directories */
krwlock_t z_name_lock; /* "master" lock for dirent locks */
zfs_dirlock_t *z_dirlocks; /* directory entry lock list */
- kmutex_t z_range_lock; /* protects changes to z_range_avl */
- avl_tree_t z_range_avl; /* avl tree of file range locks */
+ rangelock_t z_rangelock; /* file range locks */
uint8_t z_unlinked; /* file has been unlinked */
uint8_t z_atime_dirty; /* atime needs to be synced */
uint8_t z_zn_prefetch; /* Prefetch znodes? */
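
With z_rangelock embedded in the znode, the per-znode mutex/AVL pair goes away and the lock's lifecycle moves into the znode constructor/destructor. A hedged sketch of that wiring (the zfs_znode.c hunks are not shown in this excerpt, and zfs_rangelock_cb is an assumed callback name):

static int
zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
{
        znode_t *zp = buf;

        /* assumed: register the ZPL callback handling RL_APPEND/blocksize growth */
        rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
        /* ... other per-znode locks, lists, and caches ... */
        return (0);
}

static void
zfs_znode_cache_destructor(void *buf, void *arg)
{
        znode_t *zp = buf;

        rangelock_fini(&zp->z_rangelock);
        /* ... */
}
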
diff --git a/usr/src/uts/common/fs/zfs/zfs_rlock.c b/usr/src/uts/common/fs/zfs/zfs_rlock.c
index b40bdbea12..4e80ab27cc 100644
--- a/usr/src/uts/common/fs/zfs/zfs_rlock.c
+++ b/usr/src/uts/common/fs/zfs/zfs_rlock.c
@@ -23,7 +23,7 @@
* Use is subject to license terms.
*/
/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
*/
/*
@@ -34,9 +34,9 @@
* Interface
* ---------
* Defined in zfs_rlock.h but essentially:
- * rl = zfs_range_lock(zp, off, len, lock_type);
- * zfs_range_unlock(rl);
- * zfs_range_reduce(rl, off, len);
+ * lr = rangelock_enter(zp, off, len, lock_type);
+ * rangelock_reduce(lr, off, len); // optional
+ * rangelock_exit(lr);
*
* AVL tree
* --------
@@ -46,9 +46,10 @@
*
* Common case
* -----------
- * The (hopefully) usual case is of no overlaps or contention for
- * locks. On entry to zfs_lock_range() a rl_t is allocated; the tree
- * searched that finds no overlap, and *this* rl_t is placed in the tree.
+ * The (hopefully) usual case is of no overlaps or contention for locks. On
+ * entry to rangelock_enter(), a locked_range_t is allocated; the tree
+ * searched that finds no overlap, and *this* locked_range_t is placed in the
+ * tree.
*
* Overlaps/Reference counting/Proxy locks
* ---------------------------------------
@@ -87,67 +88,89 @@
*
* Grow block handling
* -------------------
- * ZFS supports multiple block sizes currently upto 128K. The smallest
+ * ZFS supports multiple block sizes, up to 16MB. The smallest
* block size is used for the file which is grown as needed. During this
* growth all other writers and readers must be excluded.
* So if the block size needs to be grown then the whole file is
* exclusively locked, then later the caller will reduce the lock
- * range to just the range to be written using zfs_reduce_range.
+ * range to just the range to be written using rangelock_reduce().
*/
+#include <sys/zfs_context.h>
#include <sys/zfs_rlock.h>
/*
+ * AVL comparison function used to order range locks
+ * Locks are ordered on the start offset of the range.
+ */
+static int
+rangelock_compare(const void *arg1, const void *arg2)
+{
+ const locked_range_t *rl1 = arg1;
+ const locked_range_t *rl2 = arg2;
+
+ if (rl1->lr_offset > rl2->lr_offset)
+ return (1);
+ if (rl1->lr_offset < rl2->lr_offset)
+ return (-1);
+ return (0);
+}
+
+/*
+ * The callback is invoked when acquiring a RL_WRITER or RL_APPEND lock.
+ * It must convert RL_APPEND to RL_WRITER (starting at the end of the file),
+ * and may increase the range that's locked for RL_WRITER.
+ */
+void
+rangelock_init(rangelock_t *rl, rangelock_cb_t *cb, void *arg)
+{
+ mutex_init(&rl->rl_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&rl->rl_tree, rangelock_compare,
+ sizeof (locked_range_t), offsetof(locked_range_t, lr_node));
+ rl->rl_cb = cb;
+ rl->rl_arg = arg;
+}
+
+void
+rangelock_fini(rangelock_t *rl)
+{
+ mutex_destroy(&rl->rl_lock);
+ avl_destroy(&rl->rl_tree);
+}
+
+/*
* Check if a write lock can be grabbed, or wait and recheck until available.
*/
static void
-zfs_range_lock_writer(znode_t *zp, rl_t *new)
+rangelock_enter_writer(rangelock_t *rl, locked_range_t *new)
{
- avl_tree_t *tree = &zp->z_range_avl;
- rl_t *rl;
+ avl_tree_t *tree = &rl->rl_tree;
+ locked_range_t *lr;
avl_index_t where;
- uint64_t end_size;
- uint64_t off = new->r_off;
- uint64_t len = new->r_len;
+ uint64_t orig_off = new->lr_offset;
+ uint64_t orig_len = new->lr_length;
+ rangelock_type_t orig_type = new->lr_type;
for (;;) {
/*
- * Range locking is also used by zvol and uses a
- * dummied up znode. However, for zvol, we don't need to
- * append or grow blocksize, and besides we don't have
- * a "sa" data or z_zfsvfs - so skip that processing.
- *
- * Yes, this is ugly, and would be solved by not handling
- * grow or append in range lock code. If that was done then
- * we could make the range locking code generically available
- * to other non-zfs consumers.
+ * Call callback which can modify new->r_off,len,type.
+ * Note, the callback is used by the ZPL to handle appending
+ * and changing blocksizes. It isn't needed for zvols.
*/
- if (zp->z_vnode) { /* caller is ZPL */
- /*
- * If in append mode pick up the current end of file.
- * This is done under z_range_lock to avoid races.
- */
- if (new->r_type == RL_APPEND)
- new->r_off = zp->z_size;
-
- /*
- * If we need to grow the block size then grab the whole
- * file range. This is also done under z_range_lock to
- * avoid races.
- */
- end_size = MAX(zp->z_size, new->r_off + len);
- if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
- zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
- new->r_off = 0;
- new->r_len = UINT64_MAX;
- }
+ if (rl->rl_cb != NULL) {
+ rl->rl_cb(new, rl->rl_arg);
}
/*
+ * If the type was APPEND, the callback must convert it to
+ * WRITER.
+ */
+ ASSERT3U(new->lr_type, ==, RL_WRITER);
+
+ /*
* First check for the usual case of no locks
*/
if (avl_numnodes(tree) == 0) {
- new->r_type = RL_WRITER; /* convert to writer */
avl_add(tree, new);
return;
}
@@ -155,31 +178,33 @@ zfs_range_lock_writer(znode_t *zp, rl_t *new)
/*
* Look for any locks in the range.
*/
- rl = avl_find(tree, new, &where);
- if (rl)
+ lr = avl_find(tree, new, &where);
+ if (lr != NULL)
goto wait; /* already locked at same offset */
- rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
- if (rl && (rl->r_off < new->r_off + new->r_len))
+ lr = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER);
+ if (lr != NULL &&
+ lr->lr_offset < new->lr_offset + new->lr_length)
goto wait;
- rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
- if (rl && rl->r_off + rl->r_len > new->r_off)
+ lr = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE);
+ if (lr != NULL &&
+ lr->lr_offset + lr->lr_length > new->lr_offset)
goto wait;
- new->r_type = RL_WRITER; /* convert possible RL_APPEND */
avl_insert(tree, new, where);
return;
wait:
- if (!rl->r_write_wanted) {
- cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL);
- rl->r_write_wanted = B_TRUE;
+ if (!lr->lr_write_wanted) {
+ cv_init(&lr->lr_write_cv, NULL, CV_DEFAULT, NULL);
+ lr->lr_write_wanted = B_TRUE;
}
- cv_wait(&rl->r_wr_cv, &zp->z_range_lock);
+ cv_wait(&lr->lr_write_cv, &rl->rl_lock);
/* reset to original */
- new->r_off = off;
- new->r_len = len;
+ new->lr_offset = orig_off;
+ new->lr_length = orig_len;
+ new->lr_type = orig_type;
}
}
@@ -187,29 +212,29 @@ wait:
* If this is an original (non-proxy) lock then replace it by
* a proxy and return the proxy.
*/
-static rl_t *
-zfs_range_proxify(avl_tree_t *tree, rl_t *rl)
+static locked_range_t *
+rangelock_proxify(avl_tree_t *tree, locked_range_t *lr)
{
- rl_t *proxy;
+ locked_range_t *proxy;
- if (rl->r_proxy)
- return (rl); /* already a proxy */
+ if (lr->lr_proxy)
+ return (lr); /* already a proxy */
- ASSERT3U(rl->r_cnt, ==, 1);
- ASSERT(rl->r_write_wanted == B_FALSE);
- ASSERT(rl->r_read_wanted == B_FALSE);
- avl_remove(tree, rl);
- rl->r_cnt = 0;
+ ASSERT3U(lr->lr_count, ==, 1);
+ ASSERT(lr->lr_write_wanted == B_FALSE);
+ ASSERT(lr->lr_read_wanted == B_FALSE);
+ avl_remove(tree, lr);
+ lr->lr_count = 0;
/* create a proxy range lock */
- proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP);
- proxy->r_off = rl->r_off;
- proxy->r_len = rl->r_len;
- proxy->r_cnt = 1;
- proxy->r_type = RL_READER;
- proxy->r_proxy = B_TRUE;
- proxy->r_write_wanted = B_FALSE;
- proxy->r_read_wanted = B_FALSE;
+ proxy = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
+ proxy->lr_offset = lr->lr_offset;
+ proxy->lr_length = lr->lr_length;
+ proxy->lr_count = 1;
+ proxy->lr_type = RL_READER;
+ proxy->lr_proxy = B_TRUE;
+ proxy->lr_write_wanted = B_FALSE;
+ proxy->lr_read_wanted = B_FALSE;
avl_add(tree, proxy);
return (proxy);
@@ -219,29 +244,27 @@ zfs_range_proxify(avl_tree_t *tree, rl_t *rl)
* Split the range lock at the supplied offset
* returning the *front* proxy.
*/
-static rl_t *
-zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off)
+static locked_range_t *
+rangelock_split(avl_tree_t *tree, locked_range_t *lr, uint64_t off)
{
- rl_t *front, *rear;
-
- ASSERT3U(rl->r_len, >, 1);
- ASSERT3U(off, >, rl->r_off);
- ASSERT3U(off, <, rl->r_off + rl->r_len);
- ASSERT(rl->r_write_wanted == B_FALSE);
- ASSERT(rl->r_read_wanted == B_FALSE);
+ ASSERT3U(lr->lr_length, >, 1);
+ ASSERT3U(off, >, lr->lr_offset);
+ ASSERT3U(off, <, lr->lr_offset + lr->lr_length);
+ ASSERT(lr->lr_write_wanted == B_FALSE);
+ ASSERT(lr->lr_read_wanted == B_FALSE);
/* create the rear proxy range lock */
- rear = kmem_alloc(sizeof (rl_t), KM_SLEEP);
- rear->r_off = off;
- rear->r_len = rl->r_off + rl->r_len - off;
- rear->r_cnt = rl->r_cnt;
- rear->r_type = RL_READER;
- rear->r_proxy = B_TRUE;
- rear->r_write_wanted = B_FALSE;
- rear->r_read_wanted = B_FALSE;
-
- front = zfs_range_proxify(tree, rl);
- front->r_len = off - rl->r_off;
+ locked_range_t *rear = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
+ rear->lr_offset = off;
+ rear->lr_length = lr->lr_offset + lr->lr_length - off;
+ rear->lr_count = lr->lr_count;
+ rear->lr_type = RL_READER;
+ rear->lr_proxy = B_TRUE;
+ rear->lr_write_wanted = B_FALSE;
+ rear->lr_read_wanted = B_FALSE;
+
+ locked_range_t *front = rangelock_proxify(tree, lr);
+ front->lr_length = off - lr->lr_offset;
avl_insert_here(tree, rear, front, AVL_AFTER);
return (front);
@@ -251,28 +274,27 @@ zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off)
* Create and add a new proxy range lock for the supplied range.
*/
static void
-zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
+rangelock_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
{
- rl_t *rl;
-
- ASSERT(len);
- rl = kmem_alloc(sizeof (rl_t), KM_SLEEP);
- rl->r_off = off;
- rl->r_len = len;
- rl->r_cnt = 1;
- rl->r_type = RL_READER;
- rl->r_proxy = B_TRUE;
- rl->r_write_wanted = B_FALSE;
- rl->r_read_wanted = B_FALSE;
- avl_add(tree, rl);
+ ASSERT(len != 0);
+ locked_range_t *lr = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
+ lr->lr_offset = off;
+ lr->lr_length = len;
+ lr->lr_count = 1;
+ lr->lr_type = RL_READER;
+ lr->lr_proxy = B_TRUE;
+ lr->lr_write_wanted = B_FALSE;
+ lr->lr_read_wanted = B_FALSE;
+ avl_add(tree, lr);
}
static void
-zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
+rangelock_add_reader(avl_tree_t *tree, locked_range_t *new,
+ locked_range_t *prev, avl_index_t where)
{
- rl_t *next;
- uint64_t off = new->r_off;
- uint64_t len = new->r_len;
+ locked_range_t *next;
+ uint64_t off = new->lr_offset;
+ uint64_t len = new->lr_length;
/*
* prev arrives either:
@@ -281,37 +303,37 @@ zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
* range may overlap with the new range
* - null, if there were no ranges starting before the new one
*/
- if (prev) {
- if (prev->r_off + prev->r_len <= off) {
+ if (prev != NULL) {
+ if (prev->lr_offset + prev->lr_length <= off) {
prev = NULL;
- } else if (prev->r_off != off) {
+ } else if (prev->lr_offset != off) {
/*
* convert to proxy if needed then
* split this entry and bump ref count
*/
- prev = zfs_range_split(tree, prev, off);
+ prev = rangelock_split(tree, prev, off);
prev = AVL_NEXT(tree, prev); /* move to rear range */
}
}
- ASSERT((prev == NULL) || (prev->r_off == off));
+ ASSERT((prev == NULL) || (prev->lr_offset == off));
- if (prev)
+ if (prev != NULL)
next = prev;
else
- next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
+ next = avl_nearest(tree, where, AVL_AFTER);
- if (next == NULL || off + len <= next->r_off) {
+ if (next == NULL || off + len <= next->lr_offset) {
/* no overlaps, use the original new rl_t in the tree */
avl_insert(tree, new, where);
return;
}
- if (off < next->r_off) {
+ if (off < next->lr_offset) {
/* Add a proxy for initial range before the overlap */
- zfs_range_new_proxy(tree, off, next->r_off - off);
+ rangelock_new_proxy(tree, off, next->lr_offset - off);
}
- new->r_cnt = 0; /* will use proxies in tree */
+ new->lr_count = 0; /* will use proxies in tree */
/*
* We now search forward through the ranges, until we go past the end
* of the new range. For each entry we make it a proxy if it
@@ -319,47 +341,51 @@ zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where)
* gaps between the ranges then we create a new proxy range.
*/
for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) {
- if (off + len <= next->r_off)
+ if (off + len <= next->lr_offset)
break;
- if (prev && prev->r_off + prev->r_len < next->r_off) {
+ if (prev != NULL && prev->lr_offset + prev->lr_length <
+ next->lr_offset) {
/* there's a gap */
- ASSERT3U(next->r_off, >, prev->r_off + prev->r_len);
- zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
- next->r_off - (prev->r_off + prev->r_len));
+ ASSERT3U(next->lr_offset, >,
+ prev->lr_offset + prev->lr_length);
+ rangelock_new_proxy(tree,
+ prev->lr_offset + prev->lr_length,
+ next->lr_offset -
+ (prev->lr_offset + prev->lr_length));
}
- if (off + len == next->r_off + next->r_len) {
+ if (off + len == next->lr_offset + next->lr_length) {
/* exact overlap with end */
- next = zfs_range_proxify(tree, next);
- next->r_cnt++;
+ next = rangelock_proxify(tree, next);
+ next->lr_count++;
return;
}
- if (off + len < next->r_off + next->r_len) {
+ if (off + len < next->lr_offset + next->lr_length) {
/* new range ends in the middle of this block */
- next = zfs_range_split(tree, next, off + len);
- next->r_cnt++;
+ next = rangelock_split(tree, next, off + len);
+ next->lr_count++;
return;
}
- ASSERT3U(off + len, >, next->r_off + next->r_len);
- next = zfs_range_proxify(tree, next);
- next->r_cnt++;
+ ASSERT3U(off + len, >, next->lr_offset + next->lr_length);
+ next = rangelock_proxify(tree, next);
+ next->lr_count++;
}
/* Add the remaining end range. */
- zfs_range_new_proxy(tree, prev->r_off + prev->r_len,
- (off + len) - (prev->r_off + prev->r_len));
+ rangelock_new_proxy(tree, prev->lr_offset + prev->lr_length,
+ (off + len) - (prev->lr_offset + prev->lr_length));
}
/*
* Check if a reader lock can be grabbed, or wait and recheck until available.
*/
static void
-zfs_range_lock_reader(znode_t *zp, rl_t *new)
+rangelock_enter_reader(rangelock_t *rl, locked_range_t *new)
{
- avl_tree_t *tree = &zp->z_range_avl;
- rl_t *prev, *next;
+ avl_tree_t *tree = &rl->rl_tree;
+ locked_range_t *prev, *next;
avl_index_t where;
- uint64_t off = new->r_off;
- uint64_t len = new->r_len;
+ uint64_t off = new->lr_offset;
+ uint64_t len = new->lr_length;
/*
* Look for any writer locks in the range.
@@ -367,21 +393,22 @@ zfs_range_lock_reader(znode_t *zp, rl_t *new)
retry:
prev = avl_find(tree, new, &where);
if (prev == NULL)
- prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE);
+ prev = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE);
/*
* Check the previous range for a writer lock overlap.
*/
- if (prev && (off < prev->r_off + prev->r_len)) {
- if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) {
- if (!prev->r_read_wanted) {
- cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL);
- prev->r_read_wanted = B_TRUE;
+ if (prev && (off < prev->lr_offset + prev->lr_length)) {
+ if ((prev->lr_type == RL_WRITER) || (prev->lr_write_wanted)) {
+ if (!prev->lr_read_wanted) {
+ cv_init(&prev->lr_read_cv,
+ NULL, CV_DEFAULT, NULL);
+ prev->lr_read_wanted = B_TRUE;
}
- cv_wait(&prev->r_rd_cv, &zp->z_range_lock);
+ cv_wait(&prev->lr_read_cv, &rl->rl_lock);
goto retry;
}
- if (off + len < prev->r_off + prev->r_len)
+ if (off + len < prev->lr_offset + prev->lr_length)
goto got_lock;
}
@@ -389,70 +416,71 @@ retry:
* Search through the following ranges to see if there's
* write lock any overlap.
*/
- if (prev)
+ if (prev != NULL)
next = AVL_NEXT(tree, prev);
else
- next = (rl_t *)avl_nearest(tree, where, AVL_AFTER);
- for (; next; next = AVL_NEXT(tree, next)) {
- if (off + len <= next->r_off)
+ next = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER);
+ for (; next != NULL; next = AVL_NEXT(tree, next)) {
+ if (off + len <= next->lr_offset)
goto got_lock;
- if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) {
- if (!next->r_read_wanted) {
- cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL);
- next->r_read_wanted = B_TRUE;
+ if ((next->lr_type == RL_WRITER) || (next->lr_write_wanted)) {
+ if (!next->lr_read_wanted) {
+ cv_init(&next->lr_read_cv,
+ NULL, CV_DEFAULT, NULL);
+ next->lr_read_wanted = B_TRUE;
}
- cv_wait(&next->r_rd_cv, &zp->z_range_lock);
+ cv_wait(&next->lr_read_cv, &rl->rl_lock);
goto retry;
}
- if (off + len <= next->r_off + next->r_len)
+ if (off + len <= next->lr_offset + next->lr_length)
goto got_lock;
}
got_lock:
/*
* Add the read lock, which may involve splitting existing
- * locks and bumping ref counts (r_cnt).
+ * locks and bumping ref counts (r_count).
*/
- zfs_range_add_reader(tree, new, prev, where);
+ rangelock_add_reader(tree, new, prev, where);
}
/*
- * Lock a range (offset, length) as either shared (RL_READER)
- * or exclusive (RL_WRITER). Returns the range lock structure
- * for later unlocking or reduce range (if entire file
- * previously locked as RL_WRITER).
+ * Lock a range (offset, length) as either shared (RL_READER) or exclusive
+ * (RL_WRITER or RL_APPEND). If RL_APPEND is specified, rl_cb() will convert
+ * it to a RL_WRITER lock (with the offset at the end of the file). Returns
+ * the range lock structure for later unlocking (or reduce range if the
+ * entire file is locked as RL_WRITER).
*/
-rl_t *
-zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
+locked_range_t *
+rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len,
+ rangelock_type_t type)
{
- rl_t *new;
-
ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);
- new = kmem_alloc(sizeof (rl_t), KM_SLEEP);
- new->r_zp = zp;
- new->r_off = off;
+ locked_range_t *new = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
+ new->lr_rangelock = rl;
+ new->lr_offset = off;
if (len + off < off) /* overflow */
len = UINT64_MAX - off;
- new->r_len = len;
- new->r_cnt = 1; /* assume it's going to be in the tree */
- new->r_type = type;
- new->r_proxy = B_FALSE;
- new->r_write_wanted = B_FALSE;
- new->r_read_wanted = B_FALSE;
-
- mutex_enter(&zp->z_range_lock);
+ new->lr_length = len;
+ new->lr_count = 1; /* assume it's going to be in the tree */
+ new->lr_type = type;
+ new->lr_proxy = B_FALSE;
+ new->lr_write_wanted = B_FALSE;
+ new->lr_read_wanted = B_FALSE;
+
+ mutex_enter(&rl->rl_lock);
if (type == RL_READER) {
/*
* First check for the usual case of no locks
*/
- if (avl_numnodes(&zp->z_range_avl) == 0)
- avl_add(&zp->z_range_avl, new);
+ if (avl_numnodes(&rl->rl_tree) == 0)
+ avl_add(&rl->rl_tree, new);
else
- zfs_range_lock_reader(zp, new);
+ rangelock_enter_reader(rl, new);
} else
- zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */
- mutex_exit(&zp->z_range_lock);
+ rangelock_enter_writer(rl, new); /* RL_WRITER or RL_APPEND */
+ mutex_exit(&rl->rl_lock);
return (new);
}
@@ -460,10 +488,9 @@ zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type)
* Unlock a reader lock
*/
static void
-zfs_range_unlock_reader(znode_t *zp, rl_t *remove)
+rangelock_exit_reader(rangelock_t *rl, locked_range_t *remove)
{
- avl_tree_t *tree = &zp->z_range_avl;
- rl_t *rl, *next = NULL;
+ avl_tree_t *tree = &rl->rl_tree;
uint64_t len;
/*
@@ -473,133 +500,118 @@ zfs_range_unlock_reader(znode_t *zp, rl_t *remove)
* removed from the tree and replaced by proxies (one or
* more ranges mapping to the entire range).
*/
- if (remove->r_cnt == 1) {
+ if (remove->lr_count == 1) {
avl_remove(tree, remove);
- if (remove->r_write_wanted) {
- cv_broadcast(&remove->r_wr_cv);
- cv_destroy(&remove->r_wr_cv);
+ if (remove->lr_write_wanted) {
+ cv_broadcast(&remove->lr_write_cv);
+ cv_destroy(&remove->lr_write_cv);
}
- if (remove->r_read_wanted) {
- cv_broadcast(&remove->r_rd_cv);
- cv_destroy(&remove->r_rd_cv);
+ if (remove->lr_read_wanted) {
+ cv_broadcast(&remove->lr_read_cv);
+ cv_destroy(&remove->lr_read_cv);
}
} else {
- ASSERT0(remove->r_cnt);
- ASSERT0(remove->r_write_wanted);
- ASSERT0(remove->r_read_wanted);
+ ASSERT0(remove->lr_count);
+ ASSERT0(remove->lr_write_wanted);
+ ASSERT0(remove->lr_read_wanted);
/*
* Find start proxy representing this reader lock,
* then decrement ref count on all proxies
* that make up this range, freeing them as needed.
*/
- rl = avl_find(tree, remove, NULL);
- ASSERT(rl);
- ASSERT(rl->r_cnt);
- ASSERT(rl->r_type == RL_READER);
- for (len = remove->r_len; len != 0; rl = next) {
- len -= rl->r_len;
- if (len) {
- next = AVL_NEXT(tree, rl);
- ASSERT(next);
- ASSERT(rl->r_off + rl->r_len == next->r_off);
- ASSERT(next->r_cnt);
- ASSERT(next->r_type == RL_READER);
+ locked_range_t *lr = avl_find(tree, remove, NULL);
+ ASSERT3P(lr, !=, NULL);
+ ASSERT3U(lr->lr_count, !=, 0);
+ ASSERT3U(lr->lr_type, ==, RL_READER);
+ locked_range_t *next = NULL;
+ for (len = remove->lr_length; len != 0; lr = next) {
+ len -= lr->lr_length;
+ if (len != 0) {
+ next = AVL_NEXT(tree, lr);
+ ASSERT3P(next, !=, NULL);
+ ASSERT3U(lr->lr_offset + lr->lr_length, ==,
+ next->lr_offset);
+ ASSERT3U(next->lr_count, !=, 0);
+ ASSERT3U(next->lr_type, ==, RL_READER);
}
- rl->r_cnt--;
- if (rl->r_cnt == 0) {
- avl_remove(tree, rl);
- if (rl->r_write_wanted) {
- cv_broadcast(&rl->r_wr_cv);
- cv_destroy(&rl->r_wr_cv);
+ lr->lr_count--;
+ if (lr->lr_count == 0) {
+ avl_remove(tree, lr);
+ if (lr->lr_write_wanted) {
+ cv_broadcast(&lr->lr_write_cv);
+ cv_destroy(&lr->lr_write_cv);
}
- if (rl->r_read_wanted) {
- cv_broadcast(&rl->r_rd_cv);
- cv_destroy(&rl->r_rd_cv);
+ if (lr->lr_read_wanted) {
+ cv_broadcast(&lr->lr_read_cv);
+ cv_destroy(&lr->lr_read_cv);
}
- kmem_free(rl, sizeof (rl_t));
+ kmem_free(lr, sizeof (locked_range_t));
}
}
}
- kmem_free(remove, sizeof (rl_t));
+ kmem_free(remove, sizeof (locked_range_t));
}
/*
* Unlock range and destroy range lock structure.
*/
void
-zfs_range_unlock(rl_t *rl)
+rangelock_exit(locked_range_t *lr)
{
- znode_t *zp = rl->r_zp;
+ rangelock_t *rl = lr->lr_rangelock;
- ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER);
- ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0);
- ASSERT(!rl->r_proxy);
+ ASSERT(lr->lr_type == RL_WRITER || lr->lr_type == RL_READER);
+ ASSERT(lr->lr_count == 1 || lr->lr_count == 0);
+ ASSERT(!lr->lr_proxy);
- mutex_enter(&zp->z_range_lock);
- if (rl->r_type == RL_WRITER) {
+ mutex_enter(&rl->rl_lock);
+ if (lr->lr_type == RL_WRITER) {
/* writer locks can't be shared or split */
- avl_remove(&zp->z_range_avl, rl);
- mutex_exit(&zp->z_range_lock);
- if (rl->r_write_wanted) {
- cv_broadcast(&rl->r_wr_cv);
- cv_destroy(&rl->r_wr_cv);
+ avl_remove(&rl->rl_tree, lr);
+ mutex_exit(&rl->rl_lock);
+ if (lr->lr_write_wanted) {
+ cv_broadcast(&lr->lr_write_cv);
+ cv_destroy(&lr->lr_write_cv);
}
- if (rl->r_read_wanted) {
- cv_broadcast(&rl->r_rd_cv);
- cv_destroy(&rl->r_rd_cv);
+ if (lr->lr_read_wanted) {
+ cv_broadcast(&lr->lr_read_cv);
+ cv_destroy(&lr->lr_read_cv);
}
- kmem_free(rl, sizeof (rl_t));
+ kmem_free(lr, sizeof (locked_range_t));
} else {
/*
- * lock may be shared, let zfs_range_unlock_reader()
+ * lock may be shared, let rangelock_exit_reader()
* release the lock and free the rl_t
*/
- zfs_range_unlock_reader(zp, rl);
- mutex_exit(&zp->z_range_lock);
+ rangelock_exit_reader(rl, lr);
+ mutex_exit(&rl->rl_lock);
}
}
/*
* Reduce range locked as RL_WRITER from whole file to specified range.
- * Asserts the whole file is exclusivly locked and so there's only one
+ * Asserts the whole file is exclusively locked and so there's only one
* entry in the tree.
*/
void
-zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len)
+rangelock_reduce(locked_range_t *lr, uint64_t off, uint64_t len)
{
- znode_t *zp = rl->r_zp;
+ rangelock_t *rl = lr->lr_rangelock;
/* Ensure there are no other locks */
- ASSERT(avl_numnodes(&zp->z_range_avl) == 1);
- ASSERT(rl->r_off == 0);
- ASSERT(rl->r_type == RL_WRITER);
- ASSERT(!rl->r_proxy);
- ASSERT3U(rl->r_len, ==, UINT64_MAX);
- ASSERT3U(rl->r_cnt, ==, 1);
-
- mutex_enter(&zp->z_range_lock);
- rl->r_off = off;
- rl->r_len = len;
- mutex_exit(&zp->z_range_lock);
- if (rl->r_write_wanted)
- cv_broadcast(&rl->r_wr_cv);
- if (rl->r_read_wanted)
- cv_broadcast(&rl->r_rd_cv);
-}
-
-/*
- * AVL comparison function used to order range locks
- * Locks are ordered on the start offset of the range.
- */
-int
-zfs_range_compare(const void *arg1, const void *arg2)
-{
- const rl_t *rl1 = arg1;
- const rl_t *rl2 = arg2;
-
- if (rl1->r_off > rl2->r_off)
- return (1);
- if (rl1->r_off < rl2->r_off)
- return (-1);
- return (0);
+ ASSERT3U(avl_numnodes(&rl->rl_tree), ==, 1);
+ ASSERT3U(lr->lr_offset, ==, 0);
+ ASSERT3U(lr->lr_type, ==, RL_WRITER);
+ ASSERT(!lr->lr_proxy);
+ ASSERT3U(lr->lr_length, ==, UINT64_MAX);
+ ASSERT3U(lr->lr_count, ==, 1);
+
+ mutex_enter(&rl->rl_lock);
+ lr->lr_offset = off;
+ lr->lr_length = len;
+ mutex_exit(&rl->rl_lock);
+ if (lr->lr_write_wanted)
+ cv_broadcast(&lr->lr_write_cv);
+ if (lr->lr_read_wanted)
+ cv_broadcast(&lr->lr_read_cv);
}
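
The zfs_rlock.c changes above complete the conversion of the old znode-embedded zfs_range_lock()/zfs_range_unlock() interface into a self-contained rangelock_t. A minimal sketch of how a consumer drives the renamed interface, using only the calls that appear in this patch (rangelock_init(), rangelock_enter(), rangelock_exit(), rangelock_fini()); the my_object_t wrapper is hypothetical:

    #include <sys/zfs_rlock.h>

    /* Hypothetical consumer: one rangelock_t per object, no callback. */
    typedef struct my_object {
    	rangelock_t	mo_rangelock;
    } my_object_t;

    static void
    my_object_init(my_object_t *mo)
    {
    	/* NULL callback: ranges are locked exactly as requested. */
    	rangelock_init(&mo->mo_rangelock, NULL, NULL);
    }

    static void
    my_object_write(my_object_t *mo, uint64_t off, uint64_t len)
    {
    	/* Blocks until no overlapping reader or writer remains. */
    	locked_range_t *lr = rangelock_enter(&mo->mo_rangelock,
    	    off, len, RL_WRITER);

    	/* ... modify [off, off + len) ... */

    	rangelock_exit(lr);	/* drops the lock and frees lr */
    }

    static void
    my_object_fini(my_object_t *mo)
    {
    	rangelock_fini(&mo->mo_rangelock);
    }

zfs_znode.c and zvol.c below each embed one rangelock_t; the znode registers a callback so that file-specific policy (append, block-size growth) stays out of the generic code.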
diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c
index 475020a20a..a7493bba30 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c
@@ -513,7 +513,6 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
ssize_t n, nbytes;
int error = 0;
- rl_t *rl;
xuio_t *xuio = NULL;
ZFS_ENTER(zfsvfs);
@@ -560,7 +559,8 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
/*
* Lock the range against changes.
*/
- rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
+ locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
+ uio->uio_loffset, uio->uio_resid, RL_READER);
/*
* If we are reading past end-of-file we can skip
@@ -623,7 +623,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
n -= nbytes;
}
out:
- zfs_range_unlock(rl);
+ rangelock_exit(lr);
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
ZFS_EXIT(zfsvfs);
@@ -663,7 +663,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
zilog_t *zilog;
offset_t woff;
ssize_t n, nbytes;
- rl_t *rl;
int max_blksz = zfsvfs->z_max_blksz;
int error = 0;
int prev_error;
@@ -743,7 +742,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
}
/*
- * Check for mandatory locks before calling zfs_range_lock()
+ * Check for mandatory locks before calling rangelock_enter()
* in order to prevent a deadlock with locks set via fcntl().
*/
if (MANDMODE((mode_t)zp->z_mode) &&
@@ -755,14 +754,15 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
/*
* If in append mode, set the io offset pointer to eof.
*/
+ locked_range_t *lr;
if (ioflag & FAPPEND) {
/*
* Obtain an appending range lock to guarantee file append
* semantics. We reset the write offset once we have the lock.
*/
- rl = zfs_range_lock(zp, 0, n, RL_APPEND);
- woff = rl->r_off;
- if (rl->r_len == UINT64_MAX) {
+ lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
+ woff = lr->lr_offset;
+ if (lr->lr_length == UINT64_MAX) {
/*
* We overlocked the file because this write will cause
* the file block size to increase.
@@ -777,11 +777,11 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
* this write, then this range lock will lock the entire file
* so that we can re-write the block safely.
*/
- rl = zfs_range_lock(zp, woff, n, RL_WRITER);
+ lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
}
if (woff >= limit) {
- zfs_range_unlock(rl);
+ rangelock_exit(lr);
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EFBIG));
}
@@ -862,12 +862,12 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
}
/*
- * If zfs_range_lock() over-locked we grow the blocksize
+ * If rangelock_enter() over-locked we grow the blocksize
* and then reduce the lock range. This will only happen
- * on the first iteration since zfs_range_reduce() will
- * shrink down r_len to the appropriate size.
+ * on the first iteration since rangelock_reduce() will
+ * shrink down lr_length to the appropriate size.
*/
- if (rl->r_len == UINT64_MAX) {
+ if (lr->lr_length == UINT64_MAX) {
uint64_t new_blksz;
if (zp->z_blksz > max_blksz) {
@@ -883,7 +883,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
new_blksz = MIN(end_size, max_blksz);
}
zfs_grow_blocksize(zp, new_blksz, tx);
- zfs_range_reduce(rl, woff, n);
+ rangelock_reduce(lr, woff, n);
}
/*
@@ -998,7 +998,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
n -= nbytes;
}
- zfs_range_unlock(rl);
+ rangelock_exit(lr);
/*
* If we're in replay mode, or we made no progress, return error.
@@ -1027,7 +1027,7 @@ zfs_get_done(zgd_t *zgd, int error)
if (zgd->zgd_db)
dmu_buf_rele(zgd->zgd_db, zgd);
- zfs_range_unlock(zgd->zgd_rl);
+ rangelock_exit(zgd->zgd_lr);
/*
* Release the vnode asynchronously as we currently have the
@@ -1089,7 +1089,8 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
* we don't have to write the data twice.
*/
if (buf != NULL) { /* immediate write */
- zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
+ zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
+ offset, size, RL_READER);
/* test for truncation needs to be done while range locked */
if (offset >= zp->z_size) {
error = SET_ERROR(ENOENT);
@@ -1110,12 +1111,12 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
size = zp->z_blksz;
blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
offset -= blkoff;
- zgd->zgd_rl = zfs_range_lock(zp, offset, size,
- RL_READER);
+ zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
+ offset, size, RL_READER);
if (zp->z_blksz == size)
break;
offset += blkoff;
- zfs_range_unlock(zgd->zgd_rl);
+ rangelock_exit(zgd->zgd_lr);
}
/* test for truncation needs to be done while range locked */
if (lr->lr_offset >= zp->z_size)
@@ -4317,7 +4318,7 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
size_t io_len;
u_offset_t io_off;
uint_t blksz;
- rl_t *rl;
+ locked_range_t *lr;
int error = 0;
ZFS_ENTER(zfsvfs);
@@ -4352,15 +4353,16 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
/*
* Search the entire vp list for pages >= io_off.
*/
- rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
+ lr = rangelock_enter(&zp->z_rangelock,
+ io_off, UINT64_MAX, RL_WRITER);
error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
goto out;
}
- rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
+ lr = rangelock_enter(&zp->z_rangelock, io_off, io_len, RL_WRITER);
if (off > zp->z_size) {
/* past end of file */
- zfs_range_unlock(rl);
+ rangelock_exit(lr);
ZFS_EXIT(zfsvfs);
return (0);
}
@@ -4390,7 +4392,7 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
}
}
out:
- zfs_range_unlock(rl);
+ rangelock_exit(lr);
if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zfsvfs->z_log, zp->z_id);
ZFS_EXIT(zfsvfs);
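
rangelock_reduce() exists for the zfs_write() path above: when the lock comes back covering the whole file (lr_length == UINT64_MAX), this write is going to grow the block size, so the code grows it and then shrinks the lock to the range actually being written. A sketch condensed from the hunks above; new_blksz and tx come from the transaction setup that is elided here:

    locked_range_t *lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);

    if (lr->lr_length == UINT64_MAX) {
    	/*
    	 * The callback over-locked the whole file because this write
    	 * grows the block size; grow it, then shrink the lock back to
    	 * the range being written.
    	 */
    	zfs_grow_blocksize(zp, new_blksz, tx);
    	rangelock_reduce(lr, woff, n);
    }

    /* ... copy the data under the (now reduced) writer lock ... */

    rangelock_exit(lr);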
diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c
index 93545ee4a1..536216deaf 100644
--- a/usr/src/uts/common/fs/zfs/zfs_znode.c
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
*/
@@ -110,6 +110,37 @@ znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
panic("evicting znode %p\n", user_ptr);
}
+/*
+ * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
+ * z_rangelock. It will modify the offset and length of the lock to reflect
+ * znode-specific information, and convert RL_APPEND to RL_WRITER. This is
+ * called with the rangelock_t's rl_lock held, which avoids races.
+ */
+static void
+zfs_rangelock_cb(locked_range_t *new, void *arg)
+{
+ znode_t *zp = arg;
+
+ /*
+ * If in append mode, convert to writer and lock starting at the
+ * current end of file.
+ */
+ if (new->lr_type == RL_APPEND) {
+ new->lr_offset = zp->z_size;
+ new->lr_type = RL_WRITER;
+ }
+
+ /*
+ * If we need to grow the block size then lock the whole file range.
+ */
+ uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
+ if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
+ zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) {
+ new->lr_offset = 0;
+ new->lr_length = UINT64_MAX;
+ }
+}
+
/*ARGSUSED*/
static int
zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
@@ -131,9 +162,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
- avl_create(&zp->z_range_avl, zfs_range_compare,
- sizeof (rl_t), offsetof(rl_t, r_node));
+ rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
zp->z_dirlocks = NULL;
zp->z_acl_cached = NULL;
@@ -155,8 +184,7 @@ zfs_znode_cache_destructor(void *buf, void *arg)
rw_destroy(&zp->z_parent_lock);
rw_destroy(&zp->z_name_lock);
mutex_destroy(&zp->z_acl_lock);
- avl_destroy(&zp->z_range_avl);
- mutex_destroy(&zp->z_range_lock);
+ rangelock_fini(&zp->z_rangelock);
ASSERT(zp->z_dirlocks == NULL);
ASSERT(zp->z_acl_cached == NULL);
@@ -191,7 +219,6 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
nzp->z_id = ozp->z_id;
ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */
- ASSERT(avl_numnodes(&ozp->z_range_avl) == 0);
nzp->z_unlinked = ozp->z_unlinked;
nzp->z_atime_dirty = ozp->z_atime_dirty;
nzp->z_zn_prefetch = ozp->z_zn_prefetch;
@@ -1470,20 +1497,20 @@ zfs_extend(znode_t *zp, uint64_t end)
{
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
dmu_tx_t *tx;
- rl_t *rl;
+ locked_range_t *lr;
uint64_t newblksz;
int error;
/*
* We will change zp_size, lock the whole file.
*/
- rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
+ lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
/*
* Nothing to do if file already at desired length.
*/
if (end <= zp->z_size) {
- zfs_range_unlock(rl);
+ rangelock_exit(lr);
return (0);
}
tx = dmu_tx_create(zfsvfs->z_os);
@@ -1513,7 +1540,7 @@ zfs_extend(znode_t *zp, uint64_t end)
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
- zfs_range_unlock(rl);
+ rangelock_exit(lr);
return (error);
}
@@ -1525,7 +1552,7 @@ zfs_extend(znode_t *zp, uint64_t end)
VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
&zp->z_size, sizeof (zp->z_size), tx));
- zfs_range_unlock(rl);
+ rangelock_exit(lr);
dmu_tx_commit(tx);
@@ -1545,19 +1572,19 @@ static int
zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
{
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- rl_t *rl;
+ locked_range_t *lr;
int error;
/*
* Lock the range being freed.
*/
- rl = zfs_range_lock(zp, off, len, RL_WRITER);
+ lr = rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
/*
* Nothing to do if file already at desired length.
*/
if (off >= zp->z_size) {
- zfs_range_unlock(rl);
+ rangelock_exit(lr);
return (0);
}
@@ -1566,7 +1593,7 @@ zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
- zfs_range_unlock(rl);
+ rangelock_exit(lr);
return (error);
}
@@ -1585,7 +1612,7 @@ zfs_trunc(znode_t *zp, uint64_t end)
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
vnode_t *vp = ZTOV(zp);
dmu_tx_t *tx;
- rl_t *rl;
+ locked_range_t *lr;
int error;
sa_bulk_attr_t bulk[2];
int count = 0;
@@ -1593,20 +1620,20 @@ zfs_trunc(znode_t *zp, uint64_t end)
/*
* We will change zp_size, lock the whole file.
*/
- rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
+ lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
/*
* Nothing to do if file already at desired length.
*/
if (end >= zp->z_size) {
- zfs_range_unlock(rl);
+ rangelock_exit(lr);
return (0);
}
error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
DMU_OBJECT_END);
if (error) {
- zfs_range_unlock(rl);
+ rangelock_exit(lr);
return (error);
}
tx = dmu_tx_create(zfsvfs->z_os);
@@ -1616,7 +1643,7 @@ zfs_trunc(znode_t *zp, uint64_t end)
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
- zfs_range_unlock(rl);
+ rangelock_exit(lr);
return (error);
}
@@ -1657,7 +1684,7 @@ zfs_trunc(znode_t *zp, uint64_t end)
ASSERT(error == 0);
}
- zfs_range_unlock(rl);
+ rangelock_exit(lr);
return (0);
}
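
zfs_rangelock_cb() above is where the znode-specific policy now lives: rangelock_enter() invokes the registered callback with its internal lock held, so the callback can rewrite the requested offset, length, and type before conflicts are resolved. As a sketch of the hook from a consumer's point of view, here is a stripped-down variant that only performs the append conversion; the my_file_t type and names are hypothetical, and the real znode callback above additionally widens the range when the block size must grow:

    typedef struct my_file {
    	rangelock_t	mf_rangelock;
    	uint64_t	mf_size;	/* current end of file */
    } my_file_t;

    /* Invoked by rangelock_enter() before the new range is inserted. */
    static void
    my_rangelock_cb(locked_range_t *new, void *arg)
    {
    	my_file_t *mf = arg;

    	/* Turn an append request into a writer lock starting at EOF. */
    	if (new->lr_type == RL_APPEND) {
    		new->lr_offset = mf->mf_size;
    		new->lr_type = RL_WRITER;
    	}
    }

    /* Registered once when the object is created: */
    /* rangelock_init(&mf->mf_rangelock, my_rangelock_cb, mf); */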
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
index 1e787f31b2..196c98c602 100644
--- a/usr/src/uts/common/fs/zfs/zvol.c
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -92,6 +92,7 @@
#include <sys/zil_impl.h>
#include <sys/ht.h>
#include <sys/dkioc_free_util.h>
+#include <sys/zfs_rlock.h>
#include "zfs_namecheck.h"
@@ -130,7 +131,7 @@ typedef struct zvol_state {
uint32_t zv_total_opens; /* total open count */
zilog_t *zv_zilog; /* ZIL handle */
list_t zv_extents; /* List of extents for dump */
- znode_t zv_znode; /* for range locking */
+ rangelock_t zv_rangelock;
dnode_t *zv_dn; /* dnode hold */
} zvol_state_t;
@@ -560,9 +561,7 @@ zvol_create_minor(const char *name)
zv->zv_objset = os;
if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
zv->zv_flags |= ZVOL_RDONLY;
- mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
- avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
- sizeof (rl_t), offsetof(rl_t, r_node));
+ rangelock_init(&zv->zv_rangelock, NULL, NULL);
list_create(&zv->zv_extents, sizeof (zvol_extent_t),
offsetof(zvol_extent_t, ze_node));
/* get and cache the blocksize */
@@ -605,8 +604,7 @@ zvol_remove_zv(zvol_state_t *zv)
(void) snprintf(nmbuf, sizeof (nmbuf), "%u", minor);
ddi_remove_minor_node(zfs_dip, nmbuf);
- avl_destroy(&zv->zv_znode.z_range_avl);
- mutex_destroy(&zv->zv_znode.z_range_lock);
+ rangelock_fini(&zv->zv_rangelock);
kmem_free(zv, sizeof (zvol_state_t));
@@ -987,7 +985,7 @@ zvol_get_done(zgd_t *zgd, int error)
if (zgd->zgd_db)
dmu_buf_rele(zgd->zgd_db, zgd);
- zfs_range_unlock(zgd->zgd_rl);
+ rangelock_exit(zgd->zgd_lr);
kmem_free(zgd, sizeof (zgd_t));
}
@@ -1020,7 +1018,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
* we don't have to write the data twice.
*/
if (buf != NULL) { /* immediate write */
- zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size,
+ zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
RL_READER);
error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
DMU_READ_NO_PREFETCH);
@@ -1033,7 +1031,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
*/
size = zv->zv_volblocksize;
offset = P2ALIGN(offset, size);
- zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size,
+ zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size,
RL_READER);
error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
DMU_READ_NO_PREFETCH);
@@ -1229,7 +1227,6 @@ zvol_strategy(buf_t *bp)
size_t resid;
char *addr;
objset_t *os;
- rl_t *rl;
int error = 0;
boolean_t doread = bp->b_flags & B_READ;
boolean_t is_dumpified;
@@ -1287,7 +1284,7 @@ zvol_strategy(buf_t *bp)
* There must be no buffer changes when doing a dmu_sync() because
* we can't change the data whilst calculating the checksum.
*/
- rl = zfs_range_lock(&zv->zv_znode, off, resid,
+ locked_range_t *lr = rangelock_enter(&zv->zv_rangelock, off, resid,
doread ? RL_READER : RL_WRITER);
while (resid != 0 && off < volsize) {
@@ -1321,7 +1318,7 @@ zvol_strategy(buf_t *bp)
addr += size;
resid -= size;
}
- zfs_range_unlock(rl);
+ rangelock_exit(lr);
if ((bp->b_resid = resid) == bp->b_bcount)
bioerror(bp, off > volsize ? EINVAL : error);
@@ -1392,7 +1389,6 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
minor_t minor = getminor(dev);
zvol_state_t *zv;
uint64_t volsize;
- rl_t *rl;
int error = 0;
zone_t *zonep = curzone;
uint64_t tot_bytes;
@@ -1423,8 +1419,8 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
start = gethrtime();
tot_bytes = 0;
- rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
- RL_READER);
+ locked_range_t *lr = rangelock_enter(&zv->zv_rangelock,
+ uio->uio_loffset, uio->uio_resid, RL_READER);
while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
@@ -1441,7 +1437,7 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
break;
}
}
- zfs_range_unlock(rl);
+ rangelock_exit(lr);
mutex_enter(&zonep->zone_vfs_lock);
zonep->zone_vfs_rwstats.reads++;
@@ -1487,7 +1483,6 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
minor_t minor = getminor(dev);
zvol_state_t *zv;
uint64_t volsize;
- rl_t *rl;
int error = 0;
boolean_t sync;
zone_t *zonep = curzone;
@@ -1527,8 +1522,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
sync = !(zv->zv_flags & ZVOL_WCE) ||
(zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
- rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
- RL_WRITER);
+ locked_range_t *lr = rangelock_enter(&zv->zv_rangelock,
+ uio->uio_loffset, uio->uio_resid, RL_WRITER);
while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
uint64_t off = uio->uio_loffset;
@@ -1552,7 +1547,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
if (error)
break;
}
- zfs_range_unlock(rl);
+ rangelock_exit(lr);
+
if (sync)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
@@ -1678,7 +1674,7 @@ zvol_get_volume_params(minor_t minor, uint64_t *blksize,
*minor_hdl = zv;
*objset_hdl = zv->zv_objset;
*zil_hdl = zv->zv_zilog;
- *rl_hdl = &zv->zv_znode;
+ *rl_hdl = &zv->zv_rangelock;
*dnode_hdl = zv->zv_dn;
return (0);
}
@@ -1757,7 +1753,7 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
zvol_state_t *zv;
struct dk_callback *dkc;
int error = 0;
- rl_t *rl;
+ locked_range_t *lr;
mutex_enter(&zfsdev_state_lock);
@@ -1882,19 +1878,19 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
break;
case DKIOCDUMPINIT:
- rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
+ lr = rangelock_enter(&zv->zv_rangelock, 0, zv->zv_volsize,
RL_WRITER);
error = zvol_dumpify(zv);
- zfs_range_unlock(rl);
+ rangelock_exit(lr);
break;
case DKIOCDUMPFINI:
if (!(zv->zv_flags & ZVOL_DUMPIFIED))
break;
- rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
+ lr = rangelock_enter(&zv->zv_rangelock, 0, zv->zv_volsize,
RL_WRITER);
error = zvol_dump_fini(zv);
- zfs_range_unlock(rl);
+ rangelock_exit(lr);
break;
case DKIOCFREE:
@@ -1939,7 +1935,7 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
length = end - start;
}
- rl = zfs_range_lock(&zv->zv_znode, start, length,
+ lr = rangelock_enter(&zv->zv_rangelock, start, length,
RL_WRITER);
tx = dmu_tx_create(zv->zv_objset);
error = dmu_tx_assign(tx, TXG_WAIT);
@@ -1953,7 +1949,7 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
ZVOL_OBJ, start, length);
}
- zfs_range_unlock(rl);
+ rangelock_exit(lr);
if (error != 0)
break;
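
On the zvol side the payoff is that the fake embedded znode_t disappears: zvol_state_t now carries a bare rangelock_t, initialized with no callback because zvols have neither append semantics nor a growing block size. Condensed from the hunks above, the before and after of the setup plus a typical I/O path:

    /* Before: a whole znode_t was embedded just for its range-lock state. */
    mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
    avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
        sizeof (rl_t), offsetof(rl_t, r_node));

    /* After: one embedded rangelock_t, no callback. */
    rangelock_init(&zv->zv_rangelock, NULL, NULL);

    /* I/O paths then lock the volume directly, as in zvol_strategy(): */
    locked_range_t *lr = rangelock_enter(&zv->zv_rangelock, off, resid,
        doread ? RL_READER : RL_WRITER);
    /* ... dmu read/write loop ... */
    rangelock_exit(lr);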
diff --git a/usr/src/uts/common/inet/mi.c b/usr/src/uts/common/inet/mi.c
index 9fe77e88c4..6fdec0d4d2 100644
--- a/usr/src/uts/common/inet/mi.c
+++ b/usr/src/uts/common/inet/mi.c
@@ -86,7 +86,7 @@ typedef struct mtb_s {
clock_t mtb_time_left;
} MTB, *MTBP;
-static int mi_timer_fire(MTBP);
+static void mi_timer_fire(void *);
static int mi_iprintf(char *, va_list, pfi_t, char *);
static void mi_tpi_addr_and_opt(MBLKP, char *, t_scalar_t, char *, t_scalar_t);
static MBLKP mi_tpi_trailer_alloc(MBLKP, size_t, t_scalar_t);
@@ -202,10 +202,10 @@ mi_close_free(IDP ptr)
void
mi_copyin(queue_t *q, MBLKP mp, char *uaddr, size_t len)
{
- struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
- struct copyreq *cq = (struct copyreq *)mp->b_rptr;
- struct copyresp *cp = (struct copyresp *)mp->b_rptr;
- int err;
+ struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
+ struct copyreq *cq = (struct copyreq *)mp->b_rptr;
+ struct copyresp *cp = (struct copyresp *)mp->b_rptr;
+ int err;
MBLKP mp1;
ASSERT(mp->b_datap->db_type == M_IOCTL && !uaddr);
@@ -293,7 +293,7 @@ err_ret:
void
mi_copyin_n(queue_t *q, MBLKP mp, size_t offset, size_t len)
{
- struct copyreq *cq = (struct copyreq *)mp->b_rptr;
+ struct copyreq *cq = (struct copyreq *)mp->b_rptr;
ASSERT(mp->b_datap->db_type == M_IOCDATA);
@@ -1124,7 +1124,7 @@ mi_timer(queue_t *q, MBLKP mp, clock_t tim)
}
}
mtb->mtb_state = TB_RUNNING;
- mtb->mtb_tid = timeout((pfv_t)mi_timer_fire, mtb, tim);
+ mtb->mtb_tid = timeout(mi_timer_fire, mtb, tim);
return;
}
switch (tim) {
@@ -1172,12 +1172,14 @@ mi_timer_alloc(size_t size)
* it has fired then mi_timer() and mi_timer_valid() will clean
* things up.
*/
-static int
-mi_timer_fire(MTBP mtb)
+static void
+mi_timer_fire(void *ptr)
{
+ MTBP mtb = ptr;
+
ASSERT(mtb == (MTBP)mtb->mtb_mp->b_datap->db_base);
ASSERT(mtb->mtb_mp->b_datap->db_type == M_PCSIG);
- return (putq(mtb->mtb_q, mtb->mtb_mp));
+ (void) putq(mtb->mtb_q, mtb->mtb_mp);
}
/*
@@ -1253,7 +1255,7 @@ mi_timer_move(queue_t *q, MBLKP mp)
}
mtb->mtb_q = q;
mtb->mtb_state = TB_RUNNING;
- mtb->mtb_tid = timeout((pfv_t)mi_timer_fire, mtb, tim);
+ mtb->mtb_tid = timeout(mi_timer_fire, mtb, tim);
} else if (mtb->mtb_state != TB_IDLE) {
ASSERT(mtb->mtb_state != TB_TO_BE_FREED);
/*
@@ -1340,7 +1342,7 @@ mi_timer_valid(MBLKP mp)
* the timer was restarted with.
*/
mtb->mtb_state = TB_RUNNING;
- mtb->mtb_tid = timeout((pfv_t)mi_timer_fire,
+ mtb->mtb_tid = timeout(mi_timer_fire,
mtb, mtb->mtb_time_left);
return (B_FALSE);
}
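
The mi.c change is independent of the range-lock work: mi_timer_fire() was declared as returning int and cast to pfv_t at every timeout() call site, hiding a signature mismatch, since timeout(9F) calls a void (*)(void *) handler. Declaring the handler with the shape timeout() actually invokes removes the casts and the discarded return value. A minimal sketch of the corrected pattern in isolation; the state structure and names here are hypothetical:

    #include <sys/types.h>
    #include <sys/conf.h>
    #include <sys/ddi.h>
    #include <sys/sunddi.h>

    typedef struct my_state {
    	timeout_id_t	ms_tid;		/* outstanding timeout, if any */
    } my_state_t;

    /* timeout(9F) invokes exactly this shape: void (*)(void *). */
    static void
    my_timer_fire(void *arg)
    {
    	my_state_t *ms = arg;

    	/* ... handle the expiration; there is nothing to return ... */
    	(void) ms;
    }

    static void
    my_timer_start(my_state_t *ms, clock_t ticks)
    {
    	/* No function-pointer cast needed once the types match. */
    	ms->ms_tid = timeout(my_timer_fire, ms, ticks);
    }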
diff --git a/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c b/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c
index bf9a369506..1f4dd29b18 100644
--- a/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c
+++ b/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2018 by Delphix. All rights reserved.
*/
#include <sys/conf.h>
@@ -40,6 +40,7 @@
#include <sys/arc.h>
#include <sys/zvol.h>
#include <sys/zfs_rlock.h>
+#include <sys/zil.h>
#include <sys/stmf.h>
#include <sys/lpif.h>
@@ -74,8 +75,8 @@
* dmu_tx_abort(tx)
* zil_commit()
*
- * zfs_range_lock()
- * zfs_range_unlock()
+ * rangelock_enter()
+ * rangelock_exit()
*
* zvol_log_write()
*
@@ -87,7 +88,7 @@
* zv_flags - for WCE
* zv_objset - dmu_tx_create
* zv_zilog - zil_commit
- * zv_znode - zfs_range_lock
+ *	zv_rangelock	- rangelock_enter
* zv_dn - dmu_buf_hold_array_by_bonus, dmu_request_arcbuf
* GLOBAL DATA
* zvol_maxphys
@@ -113,7 +114,7 @@ sbd_zvol_get_volume_params(sbd_lu_t *sl)
&sl->sl_zvol_minor_hdl, /* minor soft state */
&sl->sl_zvol_objset_hdl, /* dmu_tx_create */
&sl->sl_zvol_zil_hdl, /* zil_commit */
- &sl->sl_zvol_rl_hdl, /* zfs_range_lock */
+ &sl->sl_zvol_rl_hdl, /* locked_range_t */
&sl->sl_zvol_dn_hdl); /* dmu_buf_hold_array_by_dnode, */
/* dmu_request_arcbuf, */
/* dmu_assign_arcbuf */
@@ -153,7 +154,7 @@ int
sbd_zvol_alloc_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
{
sbd_zvol_io_t *zvio = dbuf->db_lu_private;
- rl_t *rl;
+ locked_range_t *lr;
int numbufs, error;
uint64_t len = dbuf->db_data_size;
uint64_t offset = zvio->zvio_offset;
@@ -169,13 +170,13 @@ sbd_zvol_alloc_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
* The range lock is only held until the dmu buffers read in and
* held; not during the callers use of the data.
*/
- rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_READER);
+ lr = rangelock_enter(sl->sl_zvol_rl_hdl, offset, len, RL_READER);
error = dmu_buf_hold_array_by_dnode(sl->sl_zvol_dn_hdl,
offset, len, TRUE, RDTAG, &numbufs, &dbpp,
DMU_READ_PREFETCH);
- zfs_range_unlock(rl);
+ rangelock_exit(lr);
if (error == ECKSUM)
error = EIO;
@@ -337,7 +338,7 @@ sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
sbd_zvol_io_t *zvio = dbuf->db_lu_private;
dmu_tx_t *tx;
int sync, i, error;
- rl_t *rl;
+ locked_range_t *lr;
arc_buf_t **abp = zvio->zvio_abp;
int flags = zvio->zvio_flags;
uint64_t toffset, offset = zvio->zvio_offset;
@@ -345,7 +346,7 @@ sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
ASSERT(flags == 0 || flags == ZVIO_COMMIT || flags == ZVIO_ABORT);
- rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_WRITER);
+ lr = rangelock_enter(sl->sl_zvol_rl_hdl, offset, len, RL_WRITER);
tx = dmu_tx_create(sl->sl_zvol_objset_hdl);
dmu_tx_hold_write(tx, ZVOL_OBJ, offset, (int)len);
@@ -353,7 +354,7 @@ sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
if (error) {
dmu_tx_abort(tx);
- zfs_range_unlock(rl);
+ rangelock_exit(lr);
sbd_zvol_rele_write_bufs_abort(sl, dbuf);
return (error);
}
@@ -377,7 +378,7 @@ sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset,
(ssize_t)len, sync);
dmu_tx_commit(tx);
- zfs_range_unlock(rl);
+ rangelock_exit(lr);
kmem_free(zvio->zvio_abp,
sizeof (arc_buf_t *) * dbuf->db_sglist_length);
zvio->zvio_abp = NULL;
@@ -393,8 +394,6 @@ sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
int
sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio)
{
- int error;
- rl_t *rl;
uint64_t len = (uint64_t)uio->uio_resid;
uint64_t offset = (uint64_t)uio->uio_loffset;
@@ -404,11 +403,11 @@ sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio)
if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
return (EIO);
- rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_READER);
+ locked_range_t *lr = rangelock_enter(sl->sl_zvol_rl_hdl, offset, len,
+ RL_READER);
+ int error = dmu_read_uio_dnode(sl->sl_zvol_dn_hdl, uio, len);
+ rangelock_exit(lr);
- error = dmu_read_uio_dnode(sl->sl_zvol_dn_hdl, uio, len);
-
- zfs_range_unlock(rl);
if (error == ECKSUM)
error = EIO;
return (error);
@@ -421,7 +420,6 @@ sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio)
int
sbd_zvol_copy_write(sbd_lu_t *sl, uio_t *uio, int flags)
{
- rl_t *rl;
dmu_tx_t *tx;
int error, sync;
uint64_t len = (uint64_t)uio->uio_resid;
@@ -435,8 +433,8 @@ sbd_zvol_copy_write(sbd_lu_t *sl, uio_t *uio, int flags)
if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
return (EIO);
- rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_WRITER);
-
+ locked_range_t *lr = rangelock_enter(sl->sl_zvol_rl_hdl, offset, len,
+ RL_WRITER);
sync = !zvol_get_volume_wce(sl->sl_zvol_minor_hdl);
tx = dmu_tx_create(sl->sl_zvol_objset_hdl);
@@ -452,7 +450,8 @@ sbd_zvol_copy_write(sbd_lu_t *sl, uio_t *uio, int flags)
}
dmu_tx_commit(tx);
}
- zfs_range_unlock(rl);
+ rangelock_exit(lr);
+
if (sync && (flags & ZVIO_COMMIT))
zil_commit(sl->sl_zvol_zil_hdl, ZVOL_OBJ);
if (error == ECKSUM)