diff options
author | Jerry Jelinek <jerry.jelinek@joyent.com> | 2019-01-02 13:20:05 +0000 |
---|---|---|
committer | Jerry Jelinek <jerry.jelinek@joyent.com> | 2019-01-02 13:20:05 +0000 |
commit | e8cde0f3c5e7729d1fff5b7c87826a6650863a18 (patch) | |
tree | 88e4f45c89a249664d35d557dfacd7f1c324b3ca | |
parent | d6c0c21a61cacd3b34ad4c75bc7a2dc58a36a9c7 (diff) | |
parent | 3ab8de02e091bc75fc50aafc5cbe32c053451d88 (diff) | |
download | illumos-joyent-release-20190103.tar.gz |
[illumos-gate merge]release-20190103
commit 3ab8de02e091bc75fc50aafc5cbe32c053451d88
10164 crle: passing argument 2 to restrict-qualified parameter aliases with argument 1
commit cd8e64e261b359d5d54dad750f50660d3b874238
10161 libstand: set spa in zfs_probe
commit e88f57b4cab0ed8f8a773d404d6781e20a4c26d7
10072 ip: timeout takes void (*)(void *) as callback
commit e74ff6533275aaaa8989786dfbba24b3281e3530
10049 mboot: mboot is no longer needed
commit 7931524763ef94dc16989451dddd206563d03bb4
9689 zfs range lock code should not be zpl-specific
Conflicts:
usr/src/uts/common/fs/zfs/zvol.c
-rw-r--r-- | manifest | 1 | ||||
-rw-r--r-- | usr/src/boot/sys/boot/zfs/zfs.c | 1 | ||||
-rw-r--r-- | usr/src/cmd/boot/Makefile | 1 | ||||
-rw-r--r-- | usr/src/cmd/boot/mbr/Makefile | 46 | ||||
-rw-r--r-- | usr/src/cmd/boot/mbr/mbr.c | 172 | ||||
-rw-r--r-- | usr/src/cmd/rmformat/rmf_slice.c | 26 | ||||
-rw-r--r-- | usr/src/cmd/sgs/crle/common/inspect.c | 17 | ||||
-rw-r--r-- | usr/src/cmd/ztest/ztest.c | 16 | ||||
-rw-r--r-- | usr/src/pkg/manifests/system-boot-real-mode.mf | 1 | ||||
-rw-r--r-- | usr/src/pkg/manifests/system-kernel.mf | 1 | ||||
-rw-r--r-- | usr/src/psm/stand/bootblks/ufs/i386/Makefile | 48 | ||||
-rw-r--r-- | usr/src/psm/stand/bootblks/ufs/i386/mboot.S | 394 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/dmu.h | 3 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/zfs_rlock.h | 69 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/sys/zfs_znode.h | 14 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/zfs_rlock.c | 568 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/zfs_vnops.c | 54 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/zfs_znode.c | 71 | ||||
-rw-r--r-- | usr/src/uts/common/fs/zfs/zvol.c | 52 | ||||
-rw-r--r-- | usr/src/uts/common/inet/mi.c | 26 | ||||
-rw-r--r-- | usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c | 43 |
21 files changed, 507 insertions, 1117 deletions
@@ -5770,7 +5770,6 @@ f usr/lib/fs/ufs/fstyp.so.1 0555 root bin h usr/lib/fs/ufs/fstyp=usr/sbin/fstyp f usr/lib/fs/ufs/labelit 0555 root bin f usr/lib/fs/ufs/lockfs 0555 root bin -f usr/lib/fs/ufs/mboot 0444 root sys f usr/lib/fs/ufs/mkfs 0555 root bin s usr/lib/fs/ufs/mount=../../../../etc/fs/ufs/mount f usr/lib/fs/ufs/ncheck 0555 root bin diff --git a/usr/src/boot/sys/boot/zfs/zfs.c b/usr/src/boot/sys/boot/zfs/zfs.c index 8139f8fa24..1f68b012b2 100644 --- a/usr/src/boot/sys/boot/zfs/zfs.c +++ b/usr/src/boot/sys/boot/zfs/zfs.c @@ -465,6 +465,7 @@ zfs_probe(int fd, uint64_t *pool_guid) spa_t *spa; int ret; + spa = NULL; ret = vdev_probe(vdev_read, (void *)(uintptr_t)fd, &spa); if (ret == 0 && pool_guid != NULL) *pool_guid = spa->spa_guid; diff --git a/usr/src/cmd/boot/Makefile b/usr/src/cmd/boot/Makefile index 32f8c778d6..44f319d263 100644 --- a/usr/src/cmd/boot/Makefile +++ b/usr/src/cmd/boot/Makefile @@ -35,7 +35,6 @@ COMMON_SUBDIRS= \ i386_SUBDIRS= \ installgrub \ - mbr \ symdef sparc_SUBDIRS= diff --git a/usr/src/cmd/boot/mbr/Makefile b/usr/src/cmd/boot/mbr/Makefile deleted file mode 100644 index 844ee0d758..0000000000 --- a/usr/src/cmd/boot/mbr/Makefile +++ /dev/null @@ -1,46 +0,0 @@ -# -# CDDL HEADER START -# -# The contents of this file are subject to the terms of the -# Common Development and Distribution License (the "License"). -# You may not use this file except in compliance with the License. -# -# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -# or http://www.opensolaris.org/os/licensing. -# See the License for the specific language governing permissions -# and limitations under the License. -# -# When distributing Covered Code, include this CDDL HEADER in each -# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
-# If applicable, add the following below this CDDL HEADER, with the -# fields enclosed by brackets "[]" replaced with your own identifying -# information: Portions Copyright [yyyy] [name of copyright owner] -# -# CDDL HEADER END -# -# -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. -# - -BOOTPROG= mbr - -OBJS= $(BOOTPROG).o -SRCS = $(OBJS:.o=.c) - -include ../Makefile.com - -CLOBBERFILES = $(BOOTPROG) - -.KEEP_STATE: - -all: $(BOOTPROG) - -install: all $(ROOTBOOTSOLARISBINPROG) - -clean: - -$(RM) $(OBJS) - -lint: lint_SRCS - -include ../Makefile.targ diff --git a/usr/src/cmd/boot/mbr/mbr.c b/usr/src/cmd/boot/mbr/mbr.c deleted file mode 100644 index e4978d224d..0000000000 --- a/usr/src/cmd/boot/mbr/mbr.c +++ /dev/null @@ -1,172 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#include <stdio.h> -#include <sys/types.h> -#include <string.h> -#include <fcntl.h> -#include <libgen.h> -#include <stdlib.h> -#include <strings.h> -#include <unistd.h> -#include <sys/dktp/fdisk.h> - -#define SECTOR_SIZE 512 -static char boot_sect[SECTOR_SIZE]; -static char new_mboot[SECTOR_SIZE]; - -static void -usage(char *progname) -{ - fprintf(stderr, "Usage: %s [ -d | -n | -o | -r ] <device> [<mboot>]\n", - basename(progname)); - fprintf(stderr, "\t-n Set new Solaris partition magic 0xbf\n"); - fprintf(stderr, "\t-o Set old Solaris partition magic 0x82\n"); - fprintf(stderr, "\t-r Replace master boot program " - "(/usr/lib/fs/ufs/mboot)\n"); - exit(-1); -} - -int -main(int argc, char *argv[]) -{ - int c, fd, i, sol_part = -1; - int setold = 0, setnew = 0, write_mboot = 0, list_hd = 0; - char *device; - struct mboot *mboot; - char *mboot_file = "/usr/lib/fs/ufs/mboot"; - - while ((c = getopt(argc, argv, "dnor")) != EOF) { - switch (c) { - case 'd': - list_hd = 1; - continue; - case 'n': - setnew = 1; - continue; - case 'o': - setold = 1; - continue; - case 'r': - write_mboot = 1; - continue; - default: - usage(argv[0]); - } - } - - /* check arguments */ - if ((setnew && setold) || argc < optind + 1) { - usage(argv[0]); - } - - if (write_mboot && argc > optind + 1) { - mboot_file = strdup(argv[optind + 1]); - } - if (!mboot_file) { - usage(argv[0]); - } - fd = open(mboot_file, O_RDONLY); - if (fd == -1 || read(fd, new_mboot, SECTOR_SIZE) != SECTOR_SIZE) { - fprintf(stderr, "cannot read file %s\n", mboot_file); - if (fd == -1) - perror("open"); - else - perror("read"); - exit(-1); - } - close(fd); - - device = strdup(argv[optind]); - if (!device) { - usage(argv[0]); - } - fd = open(device, O_RDWR); - if (fd == -1 || read(fd, boot_sect, SECTOR_SIZE) != SECTOR_SIZE) { - fprintf(stderr, "cannot read MBR on %s\n", device); - if (fd == -1) - perror("open"); - else - perror("read"); - exit(-1); - } - - mboot = (struct mboot *)boot_sect; - for (i = 0; i < 
FD_NUMPART; i++) { - struct ipart *part = (struct ipart *)mboot->parts + i; - if (!list_hd) { - if (part->bootid == 128) - printf("active "); - else - printf(" "); - } - if (setnew && part->systid == 0x82) { - part->systid = 0xbf; - sol_part = i; - } else if (setold && part->systid == 0xbf) { - part->systid = 0x82; - sol_part = i; - } else if (list_hd && - (part->systid == 0x82 || part->systid == 0xbf)) { - sol_part = i; - } - if (!list_hd) - printf("%d (0x%2x): start_sect %u, size_sect %u\n", - i + 1, part->systid, part->relsect, part->numsect); - } - - if (list_hd) { - printf("(hd0,%d,a)\n", sol_part); - (void) close(fd); - return (0); - } - - /* write new mboot */ - if (write_mboot || sol_part != -1) { - if (write_mboot) { - /* copy over the new boot program */ - bcopy((void *)new_mboot, (void *)boot_sect, BOOTSZ); - } - - if ((lseek(fd, 0, SEEK_SET) < 0) || - (write(fd, (void *)boot_sect, SECTOR_SIZE) < 0)) { - perror("failed to update MBR"); - exit(-1); - } - if (sol_part != -1) { - printf("Changed solaris partition %d", sol_part + 1); - if (setnew) - printf("from 0x82 to 0xbf\n"); - else - printf("from 0xbf to 0x82\n"); - } - if (write_mboot) { - printf("Replaced mboot program with %s\n", mboot_file); - } - } - - (void) close(fd); - return (0); -} diff --git a/usr/src/cmd/rmformat/rmf_slice.c b/usr/src/cmd/rmformat/rmf_slice.c index dccd09b609..ec8d5209e0 100644 --- a/usr/src/cmd/rmformat/rmf_slice.c +++ b/usr/src/cmd/rmformat/rmf_slice.c @@ -25,8 +25,8 @@ /* * rmf_slice.c : - * This file contains the functions for parsing a slice file - * for rmformat. + * This file contains the functions for parsing a slice file + * for rmformat. 
*/ #include <sys/types.h> @@ -786,7 +786,7 @@ str2sector(char *str) int32_t valid_slice_file(smedia_handle_t handle, int32_t fd, char *file_name, - struct extvtoc *vt) + struct extvtoc *vt) { struct stat status; int32_t ret_val; @@ -852,17 +852,17 @@ valid_slice_file(smedia_handle_t handle, int32_t fd, char *file_name, * in fdisk table. * Following table describes how is it handled * SPARC: - * SCSI/ATAPI, floppy, pcmcia : don't check for fdisk. + * SCSI/ATAPI, floppy, pcmcia : don't check for fdisk. * DKIOCGGEOM is sufficient. * x86 : floppy, pcmcia : Don't check for fdisk. DKIOCGGEOM is sufficient. - * SCSI/ATAPI : Check for fdisk. + * SCSI/ATAPI : Check for fdisk. * if not present, assume that the solaris * partition covers 100% of the medium - * (minus one cylinder). + * (minus one cylinder). * - * if present : + * if present : * check for active solaris partition. - * if not found, take the first solaris + * if not found, take the first solaris * partition. * If there are no solaris partitions, its an error, stop. */ @@ -1114,7 +1114,7 @@ Solaris partition\n")); static int32_t get_fdisk(smedia_handle_t handle, int32_t fd, int32_t offset, - struct fdisk_info *fdisk) + struct fdisk_info *fdisk) { struct mboot *boot_sec; struct ipart *part; @@ -1220,7 +1220,7 @@ get_fdisk(smedia_handle_t handle, int32_t fd, int32_t offset, /* * wrrite_defualt_label(int32_t fd) - * fd = file descriptor for the device. + * fd = file descriptor for the device. * * For sparc solaris * Create a vtoc partition with @@ -1351,7 +1351,7 @@ write_default_label(smedia_handle_t handle, int32_t fd) return; } - tmp_fd = open("/usr/lib/fs/ufs/mboot", O_RDONLY); + tmp_fd = open("/boot/pmbr", O_RDONLY); if (tmp_fd <= 0) { return; } @@ -1499,9 +1499,9 @@ write_default_label(smedia_handle_t handle, int32_t fd) * zip/jaz media. So, the meta data on the disk should be erased. * * If there is a valid fdisk table, - * erase first 64K of each partition. + * erase first 64K of each partition. 
* If there is a valid vtoc, - * erase first 64k of each slice. + * erase first 64k of each slice. * Then erase the 0th sector (the home for vtoc and fdisk) of the disk. * Note that teh vtoc on x86 resides in one of the fdisk partition. * So delay the erasing of the solaris partition until the vtoc is read. diff --git a/usr/src/cmd/sgs/crle/common/inspect.c b/usr/src/cmd/sgs/crle/common/inspect.c index 7f68c77a27..38667fccba 100644 --- a/usr/src/cmd/sgs/crle/common/inspect.c +++ b/usr/src/cmd/sgs/crle/common/inspect.c @@ -97,8 +97,8 @@ enteralt(Crle_desc *crle, const char *path, const char *file, Half flags, if (flags & RTC_OBJ_DUMP) { char _alter[PATH_MAX]; - (void) strcpy(_alter, crle->c_objdir); - (void) realpath(_alter, _alter); + (void) strlcpy(alter, crle->c_objdir, sizeof (alter)); + (void) realpath(alter, _alter); (void) snprintf(alter, PATH_MAX, MSG_ORIG(MSG_FMT_PATH), _alter, file); if (strcmp(alter, obj->o_path) == 0) { @@ -107,9 +107,10 @@ enteralt(Crle_desc *crle, const char *path, const char *file, Half flags, return (0); } obj->o_flags |= RTC_OBJ_DUMP; - } else + } else { (void) snprintf(alter, PATH_MAX, MSG_ORIG(MSG_FMT_PATH), crle->c_objdir, file); + } obj->o_flags |= RTC_OBJ_ALTER; /* @@ -119,8 +120,9 @@ enteralt(Crle_desc *crle, const char *path, const char *file, Half flags, if (obj->o_alter) { crle->c_strsize -= strlen(alter) + 1; fmt = MSG_INTL(MSG_DIA_ALTUPDATE); - } else + } else { fmt = MSG_INTL(MSG_DIA_ALTCREATE); + } /* * Allocate the new alternative and update the string table size. 
@@ -397,8 +399,9 @@ _enterfile(Crle_desc *crle, const char *file, int off, Hash_ent *fent, if ((nfile = malloc(size)) == NULL) return (0); (void) strcpy(nfile, file); - } else + } else { nfile = (char *)file; + } fent->e_key = (Addr)nfile; fent->e_off = off; @@ -1028,9 +1031,9 @@ inspect(Crle_desc *crle, const char *name, Half flags) } else { size_t off = file - name; - if (file == name) + if (file == name) { dir = MSG_ORIG(MSG_DIR_ROOT); - else { + } else { (void) strncpy(_dir, name, off); _dir[off] = '\0'; dir = (const char *)_dir; diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c index 46a8dd2c4e..8db1103272 100644 --- a/usr/src/cmd/ztest/ztest.c +++ b/usr/src/cmd/ztest/ztest.c @@ -237,7 +237,9 @@ typedef struct bufwad { } bufwad_t; /* - * XXX -- fix zfs range locks to be generic so we can use them here. + * It would be better to use a rangelock_t per object. Unfortunately + * the rangelock_t is not a drop-in replacement for rl_t, because we + * still need to map from object ID to rangelock_t. 
*/ typedef enum { RL_READER, @@ -1845,12 +1847,12 @@ static void ztest_get_done(zgd_t *zgd, int error) { ztest_ds_t *zd = zgd->zgd_private; - uint64_t object = zgd->zgd_rl->rl_object; + uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); - ztest_range_unlock(zgd->zgd_rl); + ztest_range_unlock((rl_t *)zgd->zgd_lr); ztest_object_unlock(zd, object); umem_free(zgd, sizeof (*zgd)); @@ -1900,8 +1902,8 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zgd->zgd_private = zd; if (buf != NULL) { /* immediate write */ - zgd->zgd_rl = ztest_range_lock(zd, object, offset, size, - RL_READER); + zgd->zgd_lr = (struct locked_range *)ztest_range_lock(zd, + object, offset, size, RL_READER); error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); @@ -1915,8 +1917,8 @@ ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, offset = 0; } - zgd->zgd_rl = ztest_range_lock(zd, object, offset, size, - RL_READER); + zgd->zgd_lr = (struct locked_range *)ztest_range_lock(zd, + object, offset, size, RL_READER); error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); diff --git a/usr/src/pkg/manifests/system-boot-real-mode.mf b/usr/src/pkg/manifests/system-boot-real-mode.mf index 094432e059..4492c13f0d 100644 --- a/usr/src/pkg/manifests/system-boot-real-mode.mf +++ b/usr/src/pkg/manifests/system-boot-real-mode.mf @@ -45,7 +45,6 @@ $(i386_ONLY)file path=boot/solaris/bootenv.rc group=sys \ $(i386_ONLY)file path=boot/solaris/devicedb/master group=sys \ original_name=SUNWrmod:boot/solaris/devicedb/master preserve=true $(i386_ONLY)file path=boot/splashimage.xpm group=sys -$(i386_ONLY)file path=usr/lib/fs/ufs/mboot mode=0444 $(i386_ONLY)hardlink path=boot/solaris.xpm target=splashimage.xpm $(i386_ONLY)legacy pkg=SUNWrmodr desc="Realmode Modules, (Root)" \ name="Realmode Modules, (Root)" diff --git a/usr/src/pkg/manifests/system-kernel.mf b/usr/src/pkg/manifests/system-kernel.mf 
index d9b0c02702..59fe6451a4 100644 --- a/usr/src/pkg/manifests/system-kernel.mf +++ b/usr/src/pkg/manifests/system-kernel.mf @@ -295,7 +295,6 @@ driver name=wc perms="* 0600 root sys" $(i386_ONLY)file path=boot/solaris/bin/create_diskmap group=sys mode=0555 file path=boot/solaris/bin/create_ramdisk group=sys mode=0555 file path=boot/solaris/bin/extract_boot_filelist group=sys mode=0555 -$(i386_ONLY)file path=boot/solaris/bin/mbr group=sys mode=0555 $(i386_ONLY)file path=boot/solaris/bin/symdef group=sys mode=0555 $(i386_ONLY)file path=boot/solaris/bin/update_grub group=sys mode=0555 file path=boot/solaris/filelist.ramdisk group=sys diff --git a/usr/src/psm/stand/bootblks/ufs/i386/Makefile b/usr/src/psm/stand/bootblks/ufs/i386/Makefile index 0d5b74fda8..929fde267e 100644 --- a/usr/src/psm/stand/bootblks/ufs/i386/Makefile +++ b/usr/src/psm/stand/bootblks/ufs/i386/Makefile @@ -25,51 +25,19 @@ # # psm/stand/bootblks/ufs/i386/Makefile # -.KEEP_STATE: BASEDIR = ../.. include $(BASEDIR)/ufs/Makefile.ufs +all := TARGET= all +install := TARGET= install +clean := TARGET= clean +clobber := TARGET= clobber +lint := TARGET= lint -CC = $(GNUC_ROOT)/bin/gcc -ASFLAGS = -B$(GNUC_ROOT)/bin/ -fno-builtin -nostdinc -CPPFLAGS = - -LD = $(GNU_ROOT)/bin/gld -LDFLAGS = -nostdlib -N -Ttext 600 - -OBJCOPY = $(GNU_ROOT)/bin/gobjcopy - -INSTALL_DIR = $(USR)/lib/fs/ufs -INSTALL_TARGETS = $(PROGS:%=$(INSTALL_DIR)/%) - -$(INSTALL_TARGETS) := FILEMODE = 0444 - -PROGS = mboot - - -all: $(PROGS) - -$(PROGS): $$(@).exec - $(OBJCOPY) -O binary $@.exec $@ - -%.exec: %.o - $(LD) $(LDFLAGS) -o $@ $(@:exec=o) - - -install: all $(INSTALL_TARGETS) - -$(INSTALL_DIR)/%: $(INSTALL_DIR) % - $(INS.file) - -$(INSTALL_DIR): - $(INS.dir) - -lint: +.KEEP_STATE: -clean: - $(RM) *.exec *.o +all install lint clean clobber: FRC -clobber: clean - $(RM) $(PROGS) $(INSTALLBOOT) +FRC: diff --git a/usr/src/psm/stand/bootblks/ufs/i386/mboot.S b/usr/src/psm/stand/bootblks/ufs/i386/mboot.S deleted file mode 100644 index 
48afbae207..0000000000 --- a/usr/src/psm/stand/bootblks/ufs/i386/mboot.S +++ /dev/null @@ -1,394 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2012 OmniTI Computer Consulting, Inc. All rights reserved. - * - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * SOLARIS MASTER BOOT: - * - * PURPOSE: loads the primary boot from the active fdisk partition. - * in effect, this routine mimics the functionality of INT 0x19. - * - * resides on the first physical sector of the hard drive media. - * loaded by INT 0x19 (ROM bootstrap loader) at address 0x7C00 - * limited to 512 bytes total, including embedded fdisk table. - * - * for compatibility with the ROM BIOS, we contain standard DOS structures: - * - * the fdisk partition table (at offset 0x1BE-0x1FE) - * boot signature bytes (0x55, 0xAA at 0x1FE, 0x1FF) - * - * the above two entities are required in order to be compatible with - * the manner in which the DOS BIOS has always performed its boot operation. 
- * In the event that our master boot record is inadvertently replaced by - * a standard DOS boot sector, the booting operation will still succeed! - * - * This master boot record uses the relsect/numsect fields of the partition - * table entry, to compute the start of the active partition; therefore, - * it is geometry independent. This means that the drive could be "built" - * on a system with a disk controller that uses a given disk geometry, but - * would run on any other controller. - * - * SYNOPSIS: - * begins execution at 0:0x7C00 - * relocates to 0:0x600 (to get out of the way!) - * reads fdisk table to locate bootable partition - * load boot record from the active fdisk partition at 0x7C00 - * verify boot record signature bytes - * jump to/execute the SOLARIS PARTITION PRIMARY BOOT - * error handler - can either reboot, or invoke INT 0x18. - * - * interface from DOS INT 0x19: BootDev in DL - * (this fails sometimes, so we look for a signature to determine whether - * to rely on DL from the floppy boot, or if we should assume 0x80 from - * the BIOS) - * - * interface to partition boot: BootDev in DL - * - *============================================================================= - * Master boot record: resides on first physical sector of device - */ - -/* - * This file is written in GNU as syntax using Intel assembler syntax. The - * startup label _start will be executed at address PBOOT_ADDR (0x7C00), but - * the text section must be set at address RELOC_ADDR (0x600). With GNU ld - * this can be done using the "-Ttext 600" option. - */ - - -#define PBOOT_ADDR 0x7C00 -#define RELOC_ADDR 0x600 - -#define FDISK_START 0x1BE -#define BOOT_SIG 0xAA55 -#define N_RETRIES 5 - -#define FD_NUMPART 4 -#define FD_PTESIZE 0x10 -#define ACTIVE 0x80 - -/* - * A convenience macro for declaring a message string (using .ascii directive-- - * NOT nul-terminated) surrounded by two labels, which can then be used with - * the SIZEOF() macro to get its length. 
- */ -#define MSG(label, string) label: .ascii string; label##_end: - -/* - * Returns the length of some consecutive bytes. These bytes must be placed - * between two labels. The ending label must be the same as the starting label - * but with a suffix "_end". - */ -#define SIZEOF(label) (label##_end - label) - - - .title "Solaris_Master_Boot" - - .intel_syntax noprefix /* use Intel syntax */ - .code16 /* 16-bit mode (real mode) */ - - .text /* code segment begins here */ - - .global BootDev - .global _start - -_start: /* _start is loaded at PBOOT_ADDR */ - jmp bootrun - -Version: - .ascii "M3.0" /* ident string */ - -bootrun: - cli /* don't bother me now! */ - - /* prepare to relocate ourselves */ - cld /* prepare for relocation */ - mov si, PBOOT_ADDR - mov di, RELOC_ADDR - - /* set up segment registers */ - mov ax, cs /* initialize segment registers */ - mov ss, ax - mov sp, si /* stack starts down from 7C00 */ - mov es, ax - mov ds, ax - - push cx /* save possible signature on stack */ - mov cx, 0x100 - rep movsw - pop cx /* restore saved cx */ - - /* running at PBOOT_ADDR, jump to RELOC_ADDR-rel addr */ - jmp (new_home - PBOOT_ADDR + RELOC_ADDR) - -new_home: - sti /* re-enable interrupts */ - - /* - * assuming boot device number is in dl has caused problems in the past - * since we still don't absolutely have to rely on it, I've just - * removed the now-pointless code to check for the FACE-CAFE signature - * from mdexec, which doesn't do anything anymore, but left the - * assumption that BootDev is 0x80 and nothing but. If we ever need to - * have BIOS load us from a drive not numbered 0x80, we'll need to - * uncomment the following line; otherwise, the initialized value of - * BootDev, namely 0x80, will be used for disk accesses. 
- */ - /* mov BootDev, dl */ - - /* set debug flag based on seeing "both shift down" */ - mov ah, 2 /* get shift state */ - int 0x16 - and al, 3 /* isolate shift-key bits */ - cmp al, 3 - jne nodbg - mov byte ptr [debugmode], 1 /* set to 1 */ - -nodbg: - /* - * Search the fdisk table sequentially to find a physical partition - * that is marked as "active" (bootable). - */ - mov bx, RELOC_ADDR + FDISK_START - mov cx, FD_NUMPART - -nxtpart: - cmp byte ptr [bx], ACTIVE - je got_active_part - add bx, FD_PTESIZE - loop nxtpart - -noparts: - mov bp, offset NoActiveErrMsg - mov cx, SIZEOF(NoActiveErrMsg) - jmp fatal_err - -got_active_part: - mov ah, 0 /* reset disk */ - int 0x13 - - push bx /* save partition pointer */ - - /* Check for LBA BIOS */ - mov ah, 0x41 /* chkext function */ - mov bx, 0x55AA /* signature to change */ - mov cx, 0 - int 0x13 - jc noLBA /* carry == failure */ - cmp bx, 0xAA55 - jne noLBA /* bad signature in BX == failure */ - test cx, 1 /* cx & 1 must be true, or... */ - jz noLBA /* ...no LBA */ - - mov bp, offset lbastring - mov cx, SIZEOF(lbastring) - call debugout - - /* - * LBA case: form a packet on the stack and call fn 0x42 to read - * packet, backwards (from hi to lo addresses): - * 8-byte LBA - * seg:ofs buffer address - * byte reserved - * byte nblocks - * byte reserved - * packet size in bytes (>= 0x10) - */ - - pop bx /* restore partition pointer */ - push bx /* and save again */ - mov cx, N_RETRIES /* retry count */ -retryLBA: - pushd 0 /* hi 32 bits of 64-bit sector number */ - push dword ptr [bx+8] /* relsect (lo 32 of 64-bit number) */ - push dword ptr [solaris_priboot] /* seg:ofs of buffer */ - push 1 /* reserved, one block */ - push 0x10 /* reserved, size (0x10) */ - mov ah, 0x42 /* "read LBA" */ - mov si, sp /* (ds already == ss) */ - int 0x13 - lahf /* save flags */ - add sp, 16 /* restore stack */ - sahf /* restore flags */ - jnc readok /* got it */ - mov ah, 0 /* reset disk */ - int 0x13 - loop retryLBA /* try again */ - jmp 
readerr /* exhausted retries; give up */ - -noLBA: - mov bp, offset chsstring - mov cx, SIZEOF(chsstring) - call debugout - - pop bx /* restore partition pointer */ - push bx /* and save again */ - - /* get BIOS disk parameters */ - mov dl, byte ptr [BootDev] - mov ah, 0x8 - int 0x13 - - jnc geomok - - /* error reading geom; die */ - mov bp, offset GeomErrMsg - mov cx, SIZEOF(GeomErrMsg) - jmp fatal_err - -geomok: - /* calculate sectors per track */ - mov al, cl /* ah doesn't matter; mul dh will set it */ - and al, 0x3F - mov byte ptr [secPerTrk], al - - /* calculate sectors per cylinder */ - inc dh - mul dh - mov word ptr [secPerCyl], ax - - /* calculate cylinder # */ - mov ax, [bx+8] /* ax = loword(relsect) */ - mov dx, [bx+10] /* dx:ax = relsect */ - div word ptr [secPerCyl] /* ax = cyl, */ - /* dx = sect in cyl (0 - cylsize-1) */ - mov bx, ax /* bx = cyl */ - - /* calculate head/sector # */ - mov ax, dx /* ax = sect in cyl (0 - cylsize-1) */ - div byte ptr [secPerTrk] /* al = head, */ - /* ah = 0-rel sect in track */ - inc ah /* ah = 1-rel sector */ - - xor cl,cl /* cl = 0 */ - mov ch, bh /* ch = hi bits of cyl (if any) */ - shr cx, 2 /* cl{7:6} = cyl{9:8} (if any) */ - and cl, 0xC0 /* cl = cyl{9:8} to merge with sect (if any) */ - - or cl, ah /* cl{7:6} = cyl bits, cl{5:0} = sect */ - mov ch, bl /* ch = lo cyl bits */ - mov dh, al /* dh = head */ - mov dl, byte ptr [BootDev] /* dl = drivenum */ - les bx, solaris_priboot /* es:bx points to buffer */ - - mov si, N_RETRIES -retry_noLBA: - mov ax, 0x201 /* 02=read, sector count = 1 */ - - int 0x13 - jnc readok - mov ah, 0 /* reset disk */ - int 0x13 - dec si - cmp si, 0 - jne retry_noLBA /* retry, or fall through to read error */ - -readerr: - mov bp, offset ReadErrMsg - mov cx, SIZEOF(ReadErrMsg) - jmp fatal_err - -readok: - /* verify boot record signature */ - mov bx, PBOOT_ADDR - cmp word ptr [bx+0x1FE], BOOT_SIG - je sigok - - mov bp, offset SigErrMsg - mov cx, SIZEOF(SigErrMsg) - jmp fatal_err - -sigok: - mov 
dl, byte ptr [BootDev] /* pass BootDev to next boot phase */ - pop si /* and pass partition pointer ds:si */ - call dword ptr [solaris_priboot] /* call doesn't return! */ - - mov bp, offset ReturnErrMsg - mov cx, SIZEOF(ReturnErrMsg) - -fatal_err: /* land of no return....... */ - /* - * bp contains pointer to error message string, - * cx contains string length - */ - mov bx, 0x4F /* video page, attribute */ - call msgout - int 0x18 - -debugout: - /* call with string pointer in es:bp, len in cx */ - cmp byte ptr [debugmode], 0 - je debugout_ret /* skip if not in debug mode */ - - mov bx, 0x1F /* page, attr (white on blue) */ - - /* alternate entry for fatal_err */ -msgout: - pusha - mov ax, 0x1301 - mov dx, 0x1700 /* row, col */ - int 0x10 - - mov al, 7 /* BEL */ - mov cx, 1 - int 0x10 - - mov ah, 0 /* get key */ - int 0x16 - popa - -debugout_ret: - ret - -secPerTrk: - .byte 0 -secPerCyl: - .word 0 -solaris_priboot: - .long PBOOT_ADDR -BootDev: - .byte 0x80 /* assumes drive 80 (see comment above) */ -debugmode: - .byte 0 - -MSG(GeomErrMsg, "Can't read geometry") -MSG(NoActiveErrMsg, "No active partition") -MSG(ReadErrMsg, "Can't read PBR") -MSG(SigErrMsg, "Bad PBR sig") -MSG(ReturnErrMsg, "!!!") -MSG(lbastring, "LBA") -MSG(chsstring, "CHS") - -/* - * For debugging: Here's a representative FDISK table entry - * - * .org 0x1BE - * .byte 0x80,1,1,0,0x82,0xfe,0x7f,4,0x3f,0,0,0,0x86,0xfa,0x3f,0 - */ - .org 0x1FE - - .word BOOT_SIG diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h index fb3c6b2fbe..cf24fd5c5e 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h @@ -75,6 +75,7 @@ struct nvlist; struct arc_buf; struct zio_prop; struct sa_handle; +struct locked_range; typedef struct objset objset_t; typedef struct dmu_tx dmu_tx_t; @@ -941,7 +942,7 @@ typedef struct zgd { struct lwb *zgd_lwb; struct blkptr *zgd_bp; dmu_buf_t *zgd_db; - struct rl *zgd_rl; + struct locked_range *zgd_lr; void 
*zgd_private; } zgd_t; diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_rlock.h b/usr/src/uts/common/fs/zfs/sys/zfs_rlock.h index 93733ba8a2..37a5594bbc 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_rlock.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_rlock.h @@ -22,6 +22,9 @@ * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2018 by Delphix. All rights reserved. + */ #ifndef _SYS_FS_ZFS_RLOCK_H #define _SYS_FS_ZFS_RLOCK_H @@ -30,54 +33,44 @@ extern "C" { #endif -#ifdef _KERNEL - -#include <sys/zfs_znode.h> - typedef enum { RL_READER, RL_WRITER, RL_APPEND -} rl_type_t; +} rangelock_type_t; -typedef struct rl { - znode_t *r_zp; /* znode this lock applies to */ - avl_node_t r_node; /* avl node link */ - uint64_t r_off; /* file range offset */ - uint64_t r_len; /* file range length */ - uint_t r_cnt; /* range reference count in tree */ - rl_type_t r_type; /* range type */ - kcondvar_t r_wr_cv; /* cv for waiting writers */ - kcondvar_t r_rd_cv; /* cv for waiting readers */ - uint8_t r_proxy; /* acting for original range */ - uint8_t r_write_wanted; /* writer wants to lock this range */ - uint8_t r_read_wanted; /* reader wants to lock this range */ -} rl_t; +struct locked_range; -/* - * Lock a range (offset, length) as either shared (RL_READER) - * or exclusive (RL_WRITER or RL_APPEND). RL_APPEND is a special type that - * is converted to RL_WRITER that specified to lock from the start of the - * end of file. Returns the range lock structure. - */ -rl_t *zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type); +typedef void (rangelock_cb_t)(struct locked_range *, void *); -/* Unlock range and destroy range lock structure. */ -void zfs_range_unlock(rl_t *rl); +typedef struct rangelock { + avl_tree_t rl_tree; /* contains locked_range_t */ + kmutex_t rl_lock; + rangelock_cb_t *rl_cb; + void *rl_arg; +} rangelock_t; -/* - * Reduce range locked as RW_WRITER from whole file to specified range. 
- * Asserts the whole file was previously locked. - */ -void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len); +typedef struct locked_range { + rangelock_t *lr_rangelock; /* rangelock that this lock applies to */ + avl_node_t lr_node; /* avl node link */ + uint64_t lr_offset; /* file range offset */ + uint64_t lr_length; /* file range length */ + uint_t lr_count; /* range reference count in tree */ + rangelock_type_t lr_type; /* range type */ + kcondvar_t lr_write_cv; /* cv for waiting writers */ + kcondvar_t lr_read_cv; /* cv for waiting readers */ + uint8_t lr_proxy; /* acting for original range */ + uint8_t lr_write_wanted; /* writer wants to lock this range */ + uint8_t lr_read_wanted; /* reader wants to lock this range */ +} locked_range_t; -/* - * AVL comparison function used to order range locks - * Locks are ordered on the start offset of the range. - */ -int zfs_range_compare(const void *arg1, const void *arg2); +void rangelock_init(rangelock_t *, rangelock_cb_t *, void *); +void rangelock_fini(rangelock_t *); -#endif /* _KERNEL */ +locked_range_t *rangelock_enter(rangelock_t *, + uint64_t, uint64_t, rangelock_type_t); +void rangelock_exit(locked_range_t *); +void rangelock_reduce(locked_range_t *, uint64_t, uint64_t); #ifdef __cplusplus } diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h index bebe577d3f..8c4f8f7dc8 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 
*/ @@ -39,6 +39,7 @@ #include <sys/rrwlock.h> #include <sys/zfs_sa.h> #include <sys/zfs_stat.h> +#include <sys/zfs_rlock.h> #endif #include <sys/zfs_acl.h> #include <sys/zil.h> @@ -60,8 +61,8 @@ extern "C" { #define ZFS_APPENDONLY 0x0000004000000000 #define ZFS_NODUMP 0x0000008000000000 #define ZFS_OPAQUE 0x0000010000000000 -#define ZFS_AV_QUARANTINED 0x0000020000000000 -#define ZFS_AV_MODIFIED 0x0000040000000000 +#define ZFS_AV_QUARANTINED 0x0000020000000000 +#define ZFS_AV_MODIFIED 0x0000040000000000 #define ZFS_REPARSE 0x0000080000000000 #define ZFS_OFFLINE 0x0000100000000000 #define ZFS_SPARSE 0x0000200000000000 @@ -81,8 +82,8 @@ extern "C" { */ #define ZFS_XATTR 0x1 /* is an extended attribute */ #define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */ -#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */ -#define ZFS_ACL_OBJ_ACE 0x8 /* ACL has CMPLX Object ACE */ +#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */ +#define ZFS_ACL_OBJ_ACE 0x8 /* ACL has CMPLX Object ACE */ #define ZFS_ACL_PROTECTED 0x10 /* ACL protected */ #define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */ #define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be inherited */ @@ -176,8 +177,7 @@ typedef struct znode { krwlock_t z_parent_lock; /* parent lock for directories */ krwlock_t z_name_lock; /* "master" lock for dirent locks */ zfs_dirlock_t *z_dirlocks; /* directory entry lock list */ - kmutex_t z_range_lock; /* protects changes to z_range_avl */ - avl_tree_t z_range_avl; /* avl tree of file range locks */ + rangelock_t z_rangelock; /* file range locks */ uint8_t z_unlinked; /* file has been unlinked */ uint8_t z_atime_dirty; /* atime needs to be synced */ uint8_t z_zn_prefetch; /* Prefetch znodes? */ diff --git a/usr/src/uts/common/fs/zfs/zfs_rlock.c b/usr/src/uts/common/fs/zfs/zfs_rlock.c index b40bdbea12..4e80ab27cc 100644 --- a/usr/src/uts/common/fs/zfs/zfs_rlock.c +++ b/usr/src/uts/common/fs/zfs/zfs_rlock.c @@ -23,7 +23,7 @@ * Use is subject to license terms. 
*/ /* - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ /* @@ -34,9 +34,9 @@ * Interface * --------- * Defined in zfs_rlock.h but essentially: - * rl = zfs_range_lock(zp, off, len, lock_type); - * zfs_range_unlock(rl); - * zfs_range_reduce(rl, off, len); + * lr = rangelock_enter(rl, off, len, lock_type); + * rangelock_reduce(lr, off, len); // optional + * rangelock_exit(lr); * * AVL tree * -------- @@ -46,9 +46,10 @@ * * Common case * ----------- - * The (hopefully) usual case is of no overlaps or contention for - * locks. On entry to zfs_lock_range() a rl_t is allocated; the tree - * searched that finds no overlap, and *this* rl_t is placed in the tree. + * The (hopefully) usual case is of no overlaps or contention for locks. On + * entry to rangelock_enter(), a locked_range_t is allocated; the tree is + * searched, no overlap is found, and *this* locked_range_t is placed in the + * tree. * * Overlaps/Reference counting/Proxy locks * --------------------------------------- @@ -87,67 +88,89 @@ * * Grow block handling * ------------------- - * ZFS supports multiple block sizes currently upto 128K. The smallest + * ZFS supports multiple block sizes, up to 16MB. The smallest * block size is used for the file which is grown as needed. During this * growth all other writers and readers must be excluded. * So if the block size needs to be grown then the whole file is * exclusively locked, then later the caller will reduce the lock - * range to just the range to be written using zfs_reduce_range. + * range to just the range to be written using rangelock_reduce(). */ +#include <sys/zfs_context.h> #include <sys/zfs_rlock.h> /* + * AVL comparison function used to order range locks + * Locks are ordered on the start offset of the range. 
+ */ +static int +rangelock_compare(const void *arg1, const void *arg2) +{ + const locked_range_t *rl1 = arg1; + const locked_range_t *rl2 = arg2; + + if (rl1->lr_offset > rl2->lr_offset) + return (1); + if (rl1->lr_offset < rl2->lr_offset) + return (-1); + return (0); +} + +/* + * The callback is invoked when acquiring a RL_WRITER or RL_APPEND lock. + * It must convert RL_APPEND to RL_WRITER (starting at the end of the file), + * and may increase the range that's locked for RL_WRITER. + */ +void +rangelock_init(rangelock_t *rl, rangelock_cb_t *cb, void *arg) +{ + mutex_init(&rl->rl_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&rl->rl_tree, rangelock_compare, + sizeof (locked_range_t), offsetof(locked_range_t, lr_node)); + rl->rl_cb = cb; + rl->rl_arg = arg; +} + +void +rangelock_fini(rangelock_t *rl) +{ + mutex_destroy(&rl->rl_lock); + avl_destroy(&rl->rl_tree); +} + +/* * Check if a write lock can be grabbed, or wait and recheck until available. */ static void -zfs_range_lock_writer(znode_t *zp, rl_t *new) +rangelock_enter_writer(rangelock_t *rl, locked_range_t *new) { - avl_tree_t *tree = &zp->z_range_avl; - rl_t *rl; + avl_tree_t *tree = &rl->rl_tree; + locked_range_t *lr; avl_index_t where; - uint64_t end_size; - uint64_t off = new->r_off; - uint64_t len = new->r_len; + uint64_t orig_off = new->lr_offset; + uint64_t orig_len = new->lr_length; + rangelock_type_t orig_type = new->lr_type; for (;;) { /* - * Range locking is also used by zvol and uses a - * dummied up znode. However, for zvol, we don't need to - * append or grow blocksize, and besides we don't have - * a "sa" data or z_zfsvfs - so skip that processing. - * - * Yes, this is ugly, and would be solved by not handling - * grow or append in range lock code. If that was done then - * we could make the range locking code generically available - * to other non-zfs consumers. + * Call callback which can modify new->lr_offset, lr_length, lr_type. 
+ * Note, the callback is used by the ZPL to handle appending + * and changing blocksizes. It isn't needed for zvols. */ - if (zp->z_vnode) { /* caller is ZPL */ - /* - * If in append mode pick up the current end of file. - * This is done under z_range_lock to avoid races. - */ - if (new->r_type == RL_APPEND) - new->r_off = zp->z_size; - - /* - * If we need to grow the block size then grab the whole - * file range. This is also done under z_range_lock to - * avoid races. - */ - end_size = MAX(zp->z_size, new->r_off + len); - if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || - zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) { - new->r_off = 0; - new->r_len = UINT64_MAX; - } + if (rl->rl_cb != NULL) { + rl->rl_cb(new, rl->rl_arg); } /* + * If the type was APPEND, the callback must convert it to + * WRITER. + */ + ASSERT3U(new->lr_type, ==, RL_WRITER); + + /* * First check for the usual case of no locks */ if (avl_numnodes(tree) == 0) { - new->r_type = RL_WRITER; /* convert to writer */ avl_add(tree, new); return; } @@ -155,31 +178,33 @@ zfs_range_lock_writer(znode_t *zp, rl_t *new) /* * Look for any locks in the range. 
*/ - rl = avl_find(tree, new, &where); - if (rl) + lr = avl_find(tree, new, &where); + if (lr != NULL) goto wait; /* already locked at same offset */ - rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER); - if (rl && (rl->r_off < new->r_off + new->r_len)) + lr = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER); + if (lr != NULL && + lr->lr_offset < new->lr_offset + new->lr_length) goto wait; - rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); - if (rl && rl->r_off + rl->r_len > new->r_off) + lr = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE); + if (lr != NULL && + lr->lr_offset + lr->lr_length > new->lr_offset) goto wait; - new->r_type = RL_WRITER; /* convert possible RL_APPEND */ avl_insert(tree, new, where); return; wait: - if (!rl->r_write_wanted) { - cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL); - rl->r_write_wanted = B_TRUE; + if (!lr->lr_write_wanted) { + cv_init(&lr->lr_write_cv, NULL, CV_DEFAULT, NULL); + lr->lr_write_wanted = B_TRUE; } - cv_wait(&rl->r_wr_cv, &zp->z_range_lock); + cv_wait(&lr->lr_write_cv, &rl->rl_lock); /* reset to original */ - new->r_off = off; - new->r_len = len; + new->lr_offset = orig_off; + new->lr_length = orig_len; + new->lr_type = orig_type; } } @@ -187,29 +212,29 @@ wait: * If this is an original (non-proxy) lock then replace it by * a proxy and return the proxy. 
*/ -static rl_t * -zfs_range_proxify(avl_tree_t *tree, rl_t *rl) +static locked_range_t * +rangelock_proxify(avl_tree_t *tree, locked_range_t *lr) { - rl_t *proxy; + locked_range_t *proxy; - if (rl->r_proxy) - return (rl); /* already a proxy */ + if (lr->lr_proxy) + return (lr); /* already a proxy */ - ASSERT3U(rl->r_cnt, ==, 1); - ASSERT(rl->r_write_wanted == B_FALSE); - ASSERT(rl->r_read_wanted == B_FALSE); - avl_remove(tree, rl); - rl->r_cnt = 0; + ASSERT3U(lr->lr_count, ==, 1); + ASSERT(lr->lr_write_wanted == B_FALSE); + ASSERT(lr->lr_read_wanted == B_FALSE); + avl_remove(tree, lr); + lr->lr_count = 0; /* create a proxy range lock */ - proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP); - proxy->r_off = rl->r_off; - proxy->r_len = rl->r_len; - proxy->r_cnt = 1; - proxy->r_type = RL_READER; - proxy->r_proxy = B_TRUE; - proxy->r_write_wanted = B_FALSE; - proxy->r_read_wanted = B_FALSE; + proxy = kmem_alloc(sizeof (locked_range_t), KM_SLEEP); + proxy->lr_offset = lr->lr_offset; + proxy->lr_length = lr->lr_length; + proxy->lr_count = 1; + proxy->lr_type = RL_READER; + proxy->lr_proxy = B_TRUE; + proxy->lr_write_wanted = B_FALSE; + proxy->lr_read_wanted = B_FALSE; avl_add(tree, proxy); return (proxy); @@ -219,29 +244,27 @@ zfs_range_proxify(avl_tree_t *tree, rl_t *rl) * Split the range lock at the supplied offset * returning the *front* proxy. 
*/ -static rl_t * -zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off) +static locked_range_t * +rangelock_split(avl_tree_t *tree, locked_range_t *lr, uint64_t off) { - rl_t *front, *rear; - - ASSERT3U(rl->r_len, >, 1); - ASSERT3U(off, >, rl->r_off); - ASSERT3U(off, <, rl->r_off + rl->r_len); - ASSERT(rl->r_write_wanted == B_FALSE); - ASSERT(rl->r_read_wanted == B_FALSE); + ASSERT3U(lr->lr_length, >, 1); + ASSERT3U(off, >, lr->lr_offset); + ASSERT3U(off, <, lr->lr_offset + lr->lr_length); + ASSERT(lr->lr_write_wanted == B_FALSE); + ASSERT(lr->lr_read_wanted == B_FALSE); /* create the rear proxy range lock */ - rear = kmem_alloc(sizeof (rl_t), KM_SLEEP); - rear->r_off = off; - rear->r_len = rl->r_off + rl->r_len - off; - rear->r_cnt = rl->r_cnt; - rear->r_type = RL_READER; - rear->r_proxy = B_TRUE; - rear->r_write_wanted = B_FALSE; - rear->r_read_wanted = B_FALSE; - - front = zfs_range_proxify(tree, rl); - front->r_len = off - rl->r_off; + locked_range_t *rear = kmem_alloc(sizeof (locked_range_t), KM_SLEEP); + rear->lr_offset = off; + rear->lr_length = lr->lr_offset + lr->lr_length - off; + rear->lr_count = lr->lr_count; + rear->lr_type = RL_READER; + rear->lr_proxy = B_TRUE; + rear->lr_write_wanted = B_FALSE; + rear->lr_read_wanted = B_FALSE; + + locked_range_t *front = rangelock_proxify(tree, lr); + front->lr_length = off - lr->lr_offset; avl_insert_here(tree, rear, front, AVL_AFTER); return (front); @@ -251,28 +274,27 @@ zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off) * Create and add a new proxy range lock for the supplied range. 
*/ static void -zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len) +rangelock_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len) { - rl_t *rl; - - ASSERT(len); - rl = kmem_alloc(sizeof (rl_t), KM_SLEEP); - rl->r_off = off; - rl->r_len = len; - rl->r_cnt = 1; - rl->r_type = RL_READER; - rl->r_proxy = B_TRUE; - rl->r_write_wanted = B_FALSE; - rl->r_read_wanted = B_FALSE; - avl_add(tree, rl); + ASSERT(len != 0); + locked_range_t *lr = kmem_alloc(sizeof (locked_range_t), KM_SLEEP); + lr->lr_offset = off; + lr->lr_length = len; + lr->lr_count = 1; + lr->lr_type = RL_READER; + lr->lr_proxy = B_TRUE; + lr->lr_write_wanted = B_FALSE; + lr->lr_read_wanted = B_FALSE; + avl_add(tree, lr); } static void -zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where) +rangelock_add_reader(avl_tree_t *tree, locked_range_t *new, + locked_range_t *prev, avl_index_t where) { - rl_t *next; - uint64_t off = new->r_off; - uint64_t len = new->r_len; + locked_range_t *next; + uint64_t off = new->lr_offset; + uint64_t len = new->lr_length; /* * prev arrives either: @@ -281,37 +303,37 @@ zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where) * range may overlap with the new range * - null, if there were no ranges starting before the new one */ - if (prev) { - if (prev->r_off + prev->r_len <= off) { + if (prev != NULL) { + if (prev->lr_offset + prev->lr_length <= off) { prev = NULL; - } else if (prev->r_off != off) { + } else if (prev->lr_offset != off) { /* * convert to proxy if needed then * split this entry and bump ref count */ - prev = zfs_range_split(tree, prev, off); + prev = rangelock_split(tree, prev, off); prev = AVL_NEXT(tree, prev); /* move to rear range */ } } - ASSERT((prev == NULL) || (prev->r_off == off)); + ASSERT((prev == NULL) || (prev->lr_offset == off)); - if (prev) + if (prev != NULL) next = prev; else - next = (rl_t *)avl_nearest(tree, where, AVL_AFTER); + next = avl_nearest(tree, where, AVL_AFTER); - if 
(next == NULL || off + len <= next->r_off) { + if (next == NULL || off + len <= next->lr_offset) { /* no overlaps, use the original new rl_t in the tree */ avl_insert(tree, new, where); return; } - if (off < next->r_off) { + if (off < next->lr_offset) { /* Add a proxy for initial range before the overlap */ - zfs_range_new_proxy(tree, off, next->r_off - off); + rangelock_new_proxy(tree, off, next->lr_offset - off); } - new->r_cnt = 0; /* will use proxies in tree */ + new->lr_count = 0; /* will use proxies in tree */ /* * We now search forward through the ranges, until we go past the end * of the new range. For each entry we make it a proxy if it @@ -319,47 +341,51 @@ zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where) * gaps between the ranges then we create a new proxy range. */ for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) { - if (off + len <= next->r_off) + if (off + len <= next->lr_offset) break; - if (prev && prev->r_off + prev->r_len < next->r_off) { + if (prev != NULL && prev->lr_offset + prev->lr_length < + next->lr_offset) { /* there's a gap */ - ASSERT3U(next->r_off, >, prev->r_off + prev->r_len); - zfs_range_new_proxy(tree, prev->r_off + prev->r_len, - next->r_off - (prev->r_off + prev->r_len)); + ASSERT3U(next->lr_offset, >, + prev->lr_offset + prev->lr_length); + rangelock_new_proxy(tree, + prev->lr_offset + prev->lr_length, + next->lr_offset - + (prev->lr_offset + prev->lr_length)); } - if (off + len == next->r_off + next->r_len) { + if (off + len == next->lr_offset + next->lr_length) { /* exact overlap with end */ - next = zfs_range_proxify(tree, next); - next->r_cnt++; + next = rangelock_proxify(tree, next); + next->lr_count++; return; } - if (off + len < next->r_off + next->r_len) { + if (off + len < next->lr_offset + next->lr_length) { /* new range ends in the middle of this block */ - next = zfs_range_split(tree, next, off + len); - next->r_cnt++; + next = rangelock_split(tree, next, off + len); + 
next->lr_count++; return; } - ASSERT3U(off + len, >, next->r_off + next->r_len); - next = zfs_range_proxify(tree, next); - next->r_cnt++; + ASSERT3U(off + len, >, next->lr_offset + next->lr_length); + next = rangelock_proxify(tree, next); + next->lr_count++; } /* Add the remaining end range. */ - zfs_range_new_proxy(tree, prev->r_off + prev->r_len, - (off + len) - (prev->r_off + prev->r_len)); + rangelock_new_proxy(tree, prev->lr_offset + prev->lr_length, + (off + len) - (prev->lr_offset + prev->lr_length)); } /* * Check if a reader lock can be grabbed, or wait and recheck until available. */ static void -zfs_range_lock_reader(znode_t *zp, rl_t *new) +rangelock_enter_reader(rangelock_t *rl, locked_range_t *new) { - avl_tree_t *tree = &zp->z_range_avl; - rl_t *prev, *next; + avl_tree_t *tree = &rl->rl_tree; + locked_range_t *prev, *next; avl_index_t where; - uint64_t off = new->r_off; - uint64_t len = new->r_len; + uint64_t off = new->lr_offset; + uint64_t len = new->lr_length; /* * Look for any writer locks in the range. @@ -367,21 +393,22 @@ zfs_range_lock_reader(znode_t *zp, rl_t *new) retry: prev = avl_find(tree, new, &where); if (prev == NULL) - prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); + prev = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE); /* * Check the previous range for a writer lock overlap. 
*/ - if (prev && (off < prev->r_off + prev->r_len)) { - if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) { - if (!prev->r_read_wanted) { - cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL); - prev->r_read_wanted = B_TRUE; + if (prev && (off < prev->lr_offset + prev->lr_length)) { + if ((prev->lr_type == RL_WRITER) || (prev->lr_write_wanted)) { + if (!prev->lr_read_wanted) { + cv_init(&prev->lr_read_cv, + NULL, CV_DEFAULT, NULL); + prev->lr_read_wanted = B_TRUE; } - cv_wait(&prev->r_rd_cv, &zp->z_range_lock); + cv_wait(&prev->lr_read_cv, &rl->rl_lock); goto retry; } - if (off + len < prev->r_off + prev->r_len) + if (off + len < prev->lr_offset + prev->lr_length) goto got_lock; } @@ -389,70 +416,71 @@ retry: * Search through the following ranges to see if there's * write lock any overlap. */ - if (prev) + if (prev != NULL) next = AVL_NEXT(tree, prev); else - next = (rl_t *)avl_nearest(tree, where, AVL_AFTER); - for (; next; next = AVL_NEXT(tree, next)) { - if (off + len <= next->r_off) + next = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER); + for (; next != NULL; next = AVL_NEXT(tree, next)) { + if (off + len <= next->lr_offset) goto got_lock; - if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) { - if (!next->r_read_wanted) { - cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL); - next->r_read_wanted = B_TRUE; + if ((next->lr_type == RL_WRITER) || (next->lr_write_wanted)) { + if (!next->lr_read_wanted) { + cv_init(&next->lr_read_cv, + NULL, CV_DEFAULT, NULL); + next->lr_read_wanted = B_TRUE; } - cv_wait(&next->r_rd_cv, &zp->z_range_lock); + cv_wait(&next->lr_read_cv, &rl->rl_lock); goto retry; } - if (off + len <= next->r_off + next->r_len) + if (off + len <= next->lr_offset + next->lr_length) goto got_lock; } got_lock: /* * Add the read lock, which may involve splitting existing - * locks and bumping ref counts (r_cnt). + * locks and bumping ref counts (lr_count). 
*/ - zfs_range_add_reader(tree, new, prev, where); + rangelock_add_reader(tree, new, prev, where); } /* - * Lock a range (offset, length) as either shared (RL_READER) - * or exclusive (RL_WRITER). Returns the range lock structure - * for later unlocking or reduce range (if entire file - * previously locked as RL_WRITER). + * Lock a range (offset, length) as either shared (RL_READER) or exclusive + * (RL_WRITER or RL_APPEND). If RL_APPEND is specified, rl_cb() will convert + * it to a RL_WRITER lock (with the offset at the end of the file). Returns + * the range lock structure for later unlocking (or reduce range if the + * entire file is locked as RL_WRITER). */ -rl_t * -zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type) +locked_range_t * +rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len, + rangelock_type_t type) { - rl_t *new; - ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND); - new = kmem_alloc(sizeof (rl_t), KM_SLEEP); - new->r_zp = zp; - new->r_off = off; + locked_range_t *new = kmem_alloc(sizeof (locked_range_t), KM_SLEEP); + new->lr_rangelock = rl; + new->lr_offset = off; if (len + off < off) /* overflow */ len = UINT64_MAX - off; - new->r_len = len; - new->r_cnt = 1; /* assume it's going to be in the tree */ - new->r_type = type; - new->r_proxy = B_FALSE; - new->r_write_wanted = B_FALSE; - new->r_read_wanted = B_FALSE; - - mutex_enter(&zp->z_range_lock); + new->lr_length = len; + new->lr_count = 1; /* assume it's going to be in the tree */ + new->lr_type = type; + new->lr_proxy = B_FALSE; + new->lr_write_wanted = B_FALSE; + new->lr_read_wanted = B_FALSE; + + mutex_enter(&rl->rl_lock); if (type == RL_READER) { /* * First check for the usual case of no locks */ - if (avl_numnodes(&zp->z_range_avl) == 0) - avl_add(&zp->z_range_avl, new); + if (avl_numnodes(&rl->rl_tree) == 0) + avl_add(&rl->rl_tree, new); else - zfs_range_lock_reader(zp, new); + rangelock_enter_reader(rl, new); } else - 
zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */ - mutex_exit(&zp->z_range_lock); + rangelock_enter_writer(rl, new); /* RL_WRITER or RL_APPEND */ + mutex_exit(&rl->rl_lock); return (new); } @@ -460,10 +488,9 @@ zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type) * Unlock a reader lock */ static void -zfs_range_unlock_reader(znode_t *zp, rl_t *remove) +rangelock_exit_reader(rangelock_t *rl, locked_range_t *remove) { - avl_tree_t *tree = &zp->z_range_avl; - rl_t *rl, *next = NULL; + avl_tree_t *tree = &rl->rl_tree; uint64_t len; /* @@ -473,133 +500,118 @@ zfs_range_unlock_reader(znode_t *zp, rl_t *remove) * removed from the tree and replaced by proxies (one or * more ranges mapping to the entire range). */ - if (remove->r_cnt == 1) { + if (remove->lr_count == 1) { avl_remove(tree, remove); - if (remove->r_write_wanted) { - cv_broadcast(&remove->r_wr_cv); - cv_destroy(&remove->r_wr_cv); + if (remove->lr_write_wanted) { + cv_broadcast(&remove->lr_write_cv); + cv_destroy(&remove->lr_write_cv); } - if (remove->r_read_wanted) { - cv_broadcast(&remove->r_rd_cv); - cv_destroy(&remove->r_rd_cv); + if (remove->lr_read_wanted) { + cv_broadcast(&remove->lr_read_cv); + cv_destroy(&remove->lr_read_cv); } } else { - ASSERT0(remove->r_cnt); - ASSERT0(remove->r_write_wanted); - ASSERT0(remove->r_read_wanted); + ASSERT0(remove->lr_count); + ASSERT0(remove->lr_write_wanted); + ASSERT0(remove->lr_read_wanted); /* * Find start proxy representing this reader lock, * then decrement ref count on all proxies * that make up this range, freeing them as needed. 
*/ - rl = avl_find(tree, remove, NULL); - ASSERT(rl); - ASSERT(rl->r_cnt); - ASSERT(rl->r_type == RL_READER); - for (len = remove->r_len; len != 0; rl = next) { - len -= rl->r_len; - if (len) { - next = AVL_NEXT(tree, rl); - ASSERT(next); - ASSERT(rl->r_off + rl->r_len == next->r_off); - ASSERT(next->r_cnt); - ASSERT(next->r_type == RL_READER); + locked_range_t *lr = avl_find(tree, remove, NULL); + ASSERT3P(lr, !=, NULL); + ASSERT3U(lr->lr_count, !=, 0); + ASSERT3U(lr->lr_type, ==, RL_READER); + locked_range_t *next = NULL; + for (len = remove->lr_length; len != 0; lr = next) { + len -= lr->lr_length; + if (len != 0) { + next = AVL_NEXT(tree, lr); + ASSERT3P(next, !=, NULL); + ASSERT3U(lr->lr_offset + lr->lr_length, ==, + next->lr_offset); + ASSERT3U(next->lr_count, !=, 0); + ASSERT3U(next->lr_type, ==, RL_READER); } - rl->r_cnt--; - if (rl->r_cnt == 0) { - avl_remove(tree, rl); - if (rl->r_write_wanted) { - cv_broadcast(&rl->r_wr_cv); - cv_destroy(&rl->r_wr_cv); + lr->lr_count--; + if (lr->lr_count == 0) { + avl_remove(tree, lr); + if (lr->lr_write_wanted) { + cv_broadcast(&lr->lr_write_cv); + cv_destroy(&lr->lr_write_cv); } - if (rl->r_read_wanted) { - cv_broadcast(&rl->r_rd_cv); - cv_destroy(&rl->r_rd_cv); + if (lr->lr_read_wanted) { + cv_broadcast(&lr->lr_read_cv); + cv_destroy(&lr->lr_read_cv); } - kmem_free(rl, sizeof (rl_t)); + kmem_free(lr, sizeof (locked_range_t)); } } } - kmem_free(remove, sizeof (rl_t)); + kmem_free(remove, sizeof (locked_range_t)); } /* * Unlock range and destroy range lock structure. 
*/ void -zfs_range_unlock(rl_t *rl) +rangelock_exit(locked_range_t *lr) { - znode_t *zp = rl->r_zp; + rangelock_t *rl = lr->lr_rangelock; - ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER); - ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0); - ASSERT(!rl->r_proxy); + ASSERT(lr->lr_type == RL_WRITER || lr->lr_type == RL_READER); + ASSERT(lr->lr_count == 1 || lr->lr_count == 0); + ASSERT(!lr->lr_proxy); - mutex_enter(&zp->z_range_lock); - if (rl->r_type == RL_WRITER) { + mutex_enter(&rl->rl_lock); + if (lr->lr_type == RL_WRITER) { /* writer locks can't be shared or split */ - avl_remove(&zp->z_range_avl, rl); - mutex_exit(&zp->z_range_lock); - if (rl->r_write_wanted) { - cv_broadcast(&rl->r_wr_cv); - cv_destroy(&rl->r_wr_cv); + avl_remove(&rl->rl_tree, lr); + mutex_exit(&rl->rl_lock); + if (lr->lr_write_wanted) { + cv_broadcast(&lr->lr_write_cv); + cv_destroy(&lr->lr_write_cv); } - if (rl->r_read_wanted) { - cv_broadcast(&rl->r_rd_cv); - cv_destroy(&rl->r_rd_cv); + if (lr->lr_read_wanted) { + cv_broadcast(&lr->lr_read_cv); + cv_destroy(&lr->lr_read_cv); } - kmem_free(rl, sizeof (rl_t)); + kmem_free(lr, sizeof (locked_range_t)); } else { /* - * lock may be shared, let zfs_range_unlock_reader() + * lock may be shared, let rangelock_exit_reader() * release the lock and free the rl_t */ - zfs_range_unlock_reader(zp, rl); - mutex_exit(&zp->z_range_lock); + rangelock_exit_reader(rl, lr); + mutex_exit(&rl->rl_lock); } } /* * Reduce range locked as RL_WRITER from whole file to specified range. - * Asserts the whole file is exclusivly locked and so there's only one + * Asserts the whole file is exclusively locked and so there's only one * entry in the tree. 
*/ void -zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len) +rangelock_reduce(locked_range_t *lr, uint64_t off, uint64_t len) { - znode_t *zp = rl->r_zp; + rangelock_t *rl = lr->lr_rangelock; /* Ensure there are no other locks */ - ASSERT(avl_numnodes(&zp->z_range_avl) == 1); - ASSERT(rl->r_off == 0); - ASSERT(rl->r_type == RL_WRITER); - ASSERT(!rl->r_proxy); - ASSERT3U(rl->r_len, ==, UINT64_MAX); - ASSERT3U(rl->r_cnt, ==, 1); - - mutex_enter(&zp->z_range_lock); - rl->r_off = off; - rl->r_len = len; - mutex_exit(&zp->z_range_lock); - if (rl->r_write_wanted) - cv_broadcast(&rl->r_wr_cv); - if (rl->r_read_wanted) - cv_broadcast(&rl->r_rd_cv); -} - -/* - * AVL comparison function used to order range locks - * Locks are ordered on the start offset of the range. - */ -int -zfs_range_compare(const void *arg1, const void *arg2) -{ - const rl_t *rl1 = arg1; - const rl_t *rl2 = arg2; - - if (rl1->r_off > rl2->r_off) - return (1); - if (rl1->r_off < rl2->r_off) - return (-1); - return (0); + ASSERT3U(avl_numnodes(&rl->rl_tree), ==, 1); + ASSERT3U(lr->lr_offset, ==, 0); + ASSERT3U(lr->lr_type, ==, RL_WRITER); + ASSERT(!lr->lr_proxy); + ASSERT3U(lr->lr_length, ==, UINT64_MAX); + ASSERT3U(lr->lr_count, ==, 1); + + mutex_enter(&rl->rl_lock); + lr->lr_offset = off; + lr->lr_length = len; + mutex_exit(&rl->rl_lock); + if (lr->lr_write_wanted) + cv_broadcast(&lr->lr_write_cv); + if (lr->lr_read_wanted) + cv_broadcast(&lr->lr_read_cv); } diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c index 475020a20a..a7493bba30 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vnops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c @@ -513,7 +513,6 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) zfsvfs_t *zfsvfs = zp->z_zfsvfs; ssize_t n, nbytes; int error = 0; - rl_t *rl; xuio_t *xuio = NULL; ZFS_ENTER(zfsvfs); @@ -560,7 +559,8 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) /* * Lock the 
range against changes. */ - rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); + locked_range_t *lr = rangelock_enter(&zp->z_rangelock, + uio->uio_loffset, uio->uio_resid, RL_READER); /* * If we are reading past end-of-file we can skip @@ -623,7 +623,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) n -= nbytes; } out: - zfs_range_unlock(rl); + rangelock_exit(lr); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); @@ -663,7 +663,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) zilog_t *zilog; offset_t woff; ssize_t n, nbytes; - rl_t *rl; int max_blksz = zfsvfs->z_max_blksz; int error = 0; int prev_error; @@ -743,7 +742,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) } /* - * Check for mandatory locks before calling zfs_range_lock() + * Check for mandatory locks before calling rangelock_enter() * in order to prevent a deadlock with locks set via fcntl(). */ if (MANDMODE((mode_t)zp->z_mode) && @@ -755,14 +754,15 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) /* * If in append mode, set the io offset pointer to eof. */ + locked_range_t *lr; if (ioflag & FAPPEND) { /* * Obtain an appending range lock to guarantee file append * semantics. We reset the write offset once we have the lock. */ - rl = zfs_range_lock(zp, 0, n, RL_APPEND); - woff = rl->r_off; - if (rl->r_len == UINT64_MAX) { + lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); + woff = lr->lr_offset; + if (lr->lr_length == UINT64_MAX) { /* * We overlocked the file because this write will cause * the file block size to increase. @@ -777,11 +777,11 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) * this write, then this range lock will lock the entire file * so that we can re-write the block safely. 
*/ - rl = zfs_range_lock(zp, woff, n, RL_WRITER); + lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); } if (woff >= limit) { - zfs_range_unlock(rl); + rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (SET_ERROR(EFBIG)); } @@ -862,12 +862,12 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) } /* - * If zfs_range_lock() over-locked we grow the blocksize + * If rangelock_enter() over-locked we grow the blocksize * and then reduce the lock range. This will only happen - * on the first iteration since zfs_range_reduce() will - * shrink down r_len to the appropriate size. + * on the first iteration since rangelock_reduce() will + * shrink down lr_length to the appropriate size. */ - if (rl->r_len == UINT64_MAX) { + if (lr->lr_length == UINT64_MAX) { uint64_t new_blksz; if (zp->z_blksz > max_blksz) { @@ -883,7 +883,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) new_blksz = MIN(end_size, max_blksz); } zfs_grow_blocksize(zp, new_blksz, tx); - zfs_range_reduce(rl, woff, n); + rangelock_reduce(lr, woff, n); } /* @@ -998,7 +998,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) n -= nbytes; } - zfs_range_unlock(rl); + rangelock_exit(lr); /* * If we're in replay mode, or we made no progress, return error. @@ -1027,7 +1027,7 @@ zfs_get_done(zgd_t *zgd, int error) if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); - zfs_range_unlock(zgd->zgd_rl); + rangelock_exit(zgd->zgd_lr); /* * Release the vnode asynchronously as we currently have the @@ -1089,7 +1089,8 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) * we don't have to write the data twice. 
*/ if (buf != NULL) { /* immediate write */ - zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); + zgd->zgd_lr = rangelock_enter(&zp->z_rangelock, + offset, size, RL_READER); /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = SET_ERROR(ENOENT); @@ -1110,12 +1111,12 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) size = zp->z_blksz; blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; offset -= blkoff; - zgd->zgd_rl = zfs_range_lock(zp, offset, size, - RL_READER); + zgd->zgd_lr = rangelock_enter(&zp->z_rangelock, + offset, size, RL_READER); if (zp->z_blksz == size) break; offset += blkoff; - zfs_range_unlock(zgd->zgd_rl); + rangelock_exit(zgd->zgd_lr); } /* test for truncation needs to be done while range locked */ if (lr->lr_offset >= zp->z_size) @@ -4317,7 +4318,7 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, size_t io_len; u_offset_t io_off; uint_t blksz; - rl_t *rl; + locked_range_t *lr; int error = 0; ZFS_ENTER(zfsvfs); @@ -4352,15 +4353,16 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, /* * Search the entire vp list for pages >= io_off. 
*/ - rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER); + lr = rangelock_enter(&zp->z_rangelock, + io_off, UINT64_MAX, RL_WRITER); error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr); goto out; } - rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER); + lr = rangelock_enter(&zp->z_rangelock, io_off, io_len, RL_WRITER); if (off > zp->z_size) { /* past end of file */ - zfs_range_unlock(rl); + rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (0); } @@ -4390,7 +4392,7 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, } } out: - zfs_range_unlock(rl); + rangelock_exit(lr); if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c index 93545ee4a1..536216deaf 100644 --- a/usr/src/uts/common/fs/zfs/zfs_znode.c +++ b/usr/src/uts/common/fs/zfs/zfs_znode.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ @@ -110,6 +110,37 @@ znode_evict_error(dmu_buf_t *dbuf, void *user_ptr) panic("evicting znode %p\n", user_ptr); } +/* + * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on + * z_rangelock. It will modify the offset and length of the lock to reflect + * znode-specific information, and convert RL_APPEND to RL_WRITER. This is + * called with the rangelock_t's rl_lock held, which avoids races. + */ +static void +zfs_rangelock_cb(locked_range_t *new, void *arg) +{ + znode_t *zp = arg; + + /* + * If in append mode, convert to writer and lock starting at the + * current end of file. 
+ */ + if (new->lr_type == RL_APPEND) { + new->lr_offset = zp->z_size; + new->lr_type = RL_WRITER; + } + + /* + * If we need to grow the block size then lock the whole file range. + */ + uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length); + if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || + zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) { + new->lr_offset = 0; + new->lr_length = UINT64_MAX; + } +} + /*ARGSUSED*/ static int zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) @@ -131,9 +162,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL); mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL); - avl_create(&zp->z_range_avl, zfs_range_compare, - sizeof (rl_t), offsetof(rl_t, r_node)); + rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp); zp->z_dirlocks = NULL; zp->z_acl_cached = NULL; @@ -155,8 +184,7 @@ zfs_znode_cache_destructor(void *buf, void *arg) rw_destroy(&zp->z_parent_lock); rw_destroy(&zp->z_name_lock); mutex_destroy(&zp->z_acl_lock); - avl_destroy(&zp->z_range_avl); - mutex_destroy(&zp->z_range_lock); + rangelock_fini(&zp->z_rangelock); ASSERT(zp->z_dirlocks == NULL); ASSERT(zp->z_acl_cached == NULL); @@ -191,7 +219,6 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) nzp->z_id = ozp->z_id; ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */ - ASSERT(avl_numnodes(&ozp->z_range_avl) == 0); nzp->z_unlinked = ozp->z_unlinked; nzp->z_atime_dirty = ozp->z_atime_dirty; nzp->z_zn_prefetch = ozp->z_zn_prefetch; @@ -1470,20 +1497,20 @@ zfs_extend(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; dmu_tx_t *tx; - rl_t *rl; + locked_range_t *lr; uint64_t newblksz; int error; /* * We will change zp_size, lock the whole file. 
*/ - rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); + lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end <= zp->z_size) { - zfs_range_unlock(rl); + rangelock_exit(lr); return (0); } tx = dmu_tx_create(zfsvfs->z_os); @@ -1513,7 +1540,7 @@ zfs_extend(znode_t *zp, uint64_t end) error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); - zfs_range_unlock(rl); + rangelock_exit(lr); return (error); } @@ -1525,7 +1552,7 @@ zfs_extend(znode_t *zp, uint64_t end) VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs), &zp->z_size, sizeof (zp->z_size), tx)); - zfs_range_unlock(rl); + rangelock_exit(lr); dmu_tx_commit(tx); @@ -1545,19 +1572,19 @@ static int zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; - rl_t *rl; + locked_range_t *lr; int error; /* * Lock the range being freed. */ - rl = zfs_range_lock(zp, off, len, RL_WRITER); + lr = rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (off >= zp->z_size) { - zfs_range_unlock(rl); + rangelock_exit(lr); return (0); } @@ -1566,7 +1593,7 @@ zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); - zfs_range_unlock(rl); + rangelock_exit(lr); return (error); } @@ -1585,7 +1612,7 @@ zfs_trunc(znode_t *zp, uint64_t end) zfsvfs_t *zfsvfs = zp->z_zfsvfs; vnode_t *vp = ZTOV(zp); dmu_tx_t *tx; - rl_t *rl; + locked_range_t *lr; int error; sa_bulk_attr_t bulk[2]; int count = 0; @@ -1593,20 +1620,20 @@ zfs_trunc(znode_t *zp, uint64_t end) /* * We will change zp_size, lock the whole file. */ - rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); + lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. 
*/ if (end >= zp->z_size) { - zfs_range_unlock(rl); + rangelock_exit(lr); return (0); } error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, DMU_OBJECT_END); if (error) { - zfs_range_unlock(rl); + rangelock_exit(lr); return (error); } tx = dmu_tx_create(zfsvfs->z_os); @@ -1616,7 +1643,7 @@ zfs_trunc(znode_t *zp, uint64_t end) error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); - zfs_range_unlock(rl); + rangelock_exit(lr); return (error); } @@ -1657,7 +1684,7 @@ zfs_trunc(znode_t *zp, uint64_t end) ASSERT(error == 0); } - zfs_range_unlock(rl); + rangelock_exit(lr); return (0); } diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c index 1e787f31b2..196c98c602 100644 --- a/usr/src/uts/common/fs/zfs/zvol.c +++ b/usr/src/uts/common/fs/zfs/zvol.c @@ -92,6 +92,7 @@ #include <sys/zil_impl.h> #include <sys/ht.h> #include <sys/dkioc_free_util.h> +#include <sys/zfs_rlock.h> #include "zfs_namecheck.h" @@ -130,7 +131,7 @@ typedef struct zvol_state { uint32_t zv_total_opens; /* total open count */ zilog_t *zv_zilog; /* ZIL handle */ list_t zv_extents; /* List of extents for dump */ - znode_t zv_znode; /* for range locking */ + rangelock_t zv_rangelock; dnode_t *zv_dn; /* dnode hold */ } zvol_state_t; @@ -560,9 +561,7 @@ zvol_create_minor(const char *name) zv->zv_objset = os; if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) zv->zv_flags |= ZVOL_RDONLY; - mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL); - avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare, - sizeof (rl_t), offsetof(rl_t, r_node)); + rangelock_init(&zv->zv_rangelock, NULL, NULL); list_create(&zv->zv_extents, sizeof (zvol_extent_t), offsetof(zvol_extent_t, ze_node)); /* get and cache the blocksize */ @@ -605,8 +604,7 @@ zvol_remove_zv(zvol_state_t *zv) (void) snprintf(nmbuf, sizeof (nmbuf), "%u", minor); ddi_remove_minor_node(zfs_dip, nmbuf); - avl_destroy(&zv->zv_znode.z_range_avl); - 
mutex_destroy(&zv->zv_znode.z_range_lock); + rangelock_fini(&zv->zv_rangelock); kmem_free(zv, sizeof (zvol_state_t)); @@ -987,7 +985,7 @@ zvol_get_done(zgd_t *zgd, int error) if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); - zfs_range_unlock(zgd->zgd_rl); + rangelock_exit(zgd->zgd_lr); kmem_free(zgd, sizeof (zgd_t)); } @@ -1020,7 +1018,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ - zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, + zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, DMU_READ_NO_PREFETCH); @@ -1033,7 +1031,7 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) */ size = zv->zv_volblocksize; offset = P2ALIGN(offset, size); - zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, + zgd->zgd_lr = rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db, DMU_READ_NO_PREFETCH); @@ -1229,7 +1227,6 @@ zvol_strategy(buf_t *bp) size_t resid; char *addr; objset_t *os; - rl_t *rl; int error = 0; boolean_t doread = bp->b_flags & B_READ; boolean_t is_dumpified; @@ -1287,7 +1284,7 @@ zvol_strategy(buf_t *bp) * There must be no buffer changes when doing a dmu_sync() because * we can't change the data whilst calculating the checksum. */ - rl = zfs_range_lock(&zv->zv_znode, off, resid, + locked_range_t *lr = rangelock_enter(&zv->zv_rangelock, off, resid, doread ? RL_READER : RL_WRITER); while (resid != 0 && off < volsize) { @@ -1321,7 +1318,7 @@ zvol_strategy(buf_t *bp) addr += size; resid -= size; } - zfs_range_unlock(rl); + rangelock_exit(lr); if ((bp->b_resid = resid) == bp->b_bcount) bioerror(bp, off > volsize ? 
EINVAL : error); @@ -1392,7 +1389,6 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) minor_t minor = getminor(dev); zvol_state_t *zv; uint64_t volsize; - rl_t *rl; int error = 0; zone_t *zonep = curzone; uint64_t tot_bytes; @@ -1423,8 +1419,8 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) start = gethrtime(); tot_bytes = 0; - rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, - RL_READER); + locked_range_t *lr = rangelock_enter(&zv->zv_rangelock, + uio->uio_loffset, uio->uio_resid, RL_READER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); @@ -1441,7 +1437,7 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) break; } } - zfs_range_unlock(rl); + rangelock_exit(lr); mutex_enter(&zonep->zone_vfs_lock); zonep->zone_vfs_rwstats.reads++; @@ -1487,7 +1483,6 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) minor_t minor = getminor(dev); zvol_state_t *zv; uint64_t volsize; - rl_t *rl; int error = 0; boolean_t sync; zone_t *zonep = curzone; @@ -1527,8 +1522,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) sync = !(zv->zv_flags & ZVOL_WCE) || (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); - rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, - RL_WRITER); + locked_range_t *lr = rangelock_enter(&zv->zv_rangelock, + uio->uio_loffset, uio->uio_resid, RL_WRITER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); uint64_t off = uio->uio_loffset; @@ -1552,7 +1547,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) if (error) break; } - zfs_range_unlock(rl); + rangelock_exit(lr); + if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); @@ -1678,7 +1674,7 @@ zvol_get_volume_params(minor_t minor, uint64_t *blksize, *minor_hdl = zv; *objset_hdl = zv->zv_objset; *zil_hdl = zv->zv_zilog; - *rl_hdl = &zv->zv_znode; + *rl_hdl = &zv->zv_rangelock; *dnode_hdl = zv->zv_dn; return (0); } @@ -1757,7 +1753,7 @@ zvol_ioctl(dev_t dev, int 
cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) zvol_state_t *zv; struct dk_callback *dkc; int error = 0; - rl_t *rl; + locked_range_t *lr; mutex_enter(&zfsdev_state_lock); @@ -1882,19 +1878,19 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) break; case DKIOCDUMPINIT: - rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize, + lr = rangelock_enter(&zv->zv_rangelock, 0, zv->zv_volsize, RL_WRITER); error = zvol_dumpify(zv); - zfs_range_unlock(rl); + rangelock_exit(lr); break; case DKIOCDUMPFINI: if (!(zv->zv_flags & ZVOL_DUMPIFIED)) break; - rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize, + lr = rangelock_enter(&zv->zv_rangelock, 0, zv->zv_volsize, RL_WRITER); error = zvol_dump_fini(zv); - zfs_range_unlock(rl); + rangelock_exit(lr); break; case DKIOCFREE: @@ -1939,7 +1935,7 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) length = end - start; } - rl = zfs_range_lock(&zv->zv_znode, start, length, + lr = rangelock_enter(&zv->zv_rangelock, start, length, RL_WRITER); tx = dmu_tx_create(zv->zv_objset); error = dmu_tx_assign(tx, TXG_WAIT); @@ -1953,7 +1949,7 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) ZVOL_OBJ, start, length); } - zfs_range_unlock(rl); + rangelock_exit(lr); if (error != 0) break; diff --git a/usr/src/uts/common/inet/mi.c b/usr/src/uts/common/inet/mi.c index 9fe77e88c4..6fdec0d4d2 100644 --- a/usr/src/uts/common/inet/mi.c +++ b/usr/src/uts/common/inet/mi.c @@ -86,7 +86,7 @@ typedef struct mtb_s { clock_t mtb_time_left; } MTB, *MTBP; -static int mi_timer_fire(MTBP); +static void mi_timer_fire(void *); static int mi_iprintf(char *, va_list, pfi_t, char *); static void mi_tpi_addr_and_opt(MBLKP, char *, t_scalar_t, char *, t_scalar_t); static MBLKP mi_tpi_trailer_alloc(MBLKP, size_t, t_scalar_t); @@ -202,10 +202,10 @@ mi_close_free(IDP ptr) void mi_copyin(queue_t *q, MBLKP mp, char *uaddr, size_t len) { - struct iocblk *iocp = (struct iocblk 
*)mp->b_rptr; - struct copyreq *cq = (struct copyreq *)mp->b_rptr; - struct copyresp *cp = (struct copyresp *)mp->b_rptr; - int err; + struct iocblk *iocp = (struct iocblk *)mp->b_rptr; + struct copyreq *cq = (struct copyreq *)mp->b_rptr; + struct copyresp *cp = (struct copyresp *)mp->b_rptr; + int err; MBLKP mp1; ASSERT(mp->b_datap->db_type == M_IOCTL && !uaddr); @@ -293,7 +293,7 @@ err_ret: void mi_copyin_n(queue_t *q, MBLKP mp, size_t offset, size_t len) { - struct copyreq *cq = (struct copyreq *)mp->b_rptr; + struct copyreq *cq = (struct copyreq *)mp->b_rptr; ASSERT(mp->b_datap->db_type == M_IOCDATA); @@ -1124,7 +1124,7 @@ mi_timer(queue_t *q, MBLKP mp, clock_t tim) } } mtb->mtb_state = TB_RUNNING; - mtb->mtb_tid = timeout((pfv_t)mi_timer_fire, mtb, tim); + mtb->mtb_tid = timeout(mi_timer_fire, mtb, tim); return; } switch (tim) { @@ -1172,12 +1172,14 @@ mi_timer_alloc(size_t size) * it has fired then mi_timer() and mi_timer_valid() will clean * things up. */ -static int -mi_timer_fire(MTBP mtb) +static void +mi_timer_fire(void *ptr) { + MTBP mtb = ptr; + ASSERT(mtb == (MTBP)mtb->mtb_mp->b_datap->db_base); ASSERT(mtb->mtb_mp->b_datap->db_type == M_PCSIG); - return (putq(mtb->mtb_q, mtb->mtb_mp)); + (void) putq(mtb->mtb_q, mtb->mtb_mp); } /* @@ -1253,7 +1255,7 @@ mi_timer_move(queue_t *q, MBLKP mp) } mtb->mtb_q = q; mtb->mtb_state = TB_RUNNING; - mtb->mtb_tid = timeout((pfv_t)mi_timer_fire, mtb, tim); + mtb->mtb_tid = timeout(mi_timer_fire, mtb, tim); } else if (mtb->mtb_state != TB_IDLE) { ASSERT(mtb->mtb_state != TB_TO_BE_FREED); /* @@ -1340,7 +1342,7 @@ mi_timer_valid(MBLKP mp) * the timer was restarted with. 
*/ mtb->mtb_state = TB_RUNNING; - mtb->mtb_tid = timeout((pfv_t)mi_timer_fire, + mtb->mtb_tid = timeout(mi_timer_fire, mtb, mtb->mtb_time_left); return (B_FALSE); } diff --git a/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c b/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c index bf9a369506..1f4dd29b18 100644 --- a/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c +++ b/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright (c) 2014, 2018 by Delphix. All rights reserved. */ #include <sys/conf.h> @@ -40,6 +40,7 @@ #include <sys/arc.h> #include <sys/zvol.h> #include <sys/zfs_rlock.h> +#include <sys/zil.h> #include <sys/stmf.h> #include <sys/lpif.h> @@ -74,8 +75,8 @@ * dmu_tx_abort(tx) * zil_commit() * - * zfs_range_lock() - * zfs_range_unlock() + * rangelock_enter() + * rangelock_exit() * * zvol_log_write() * @@ -87,7 +88,7 @@ * zv_flags - for WCE * zv_objset - dmu_tx_create * zv_zilog - zil_commit - * zv_znode - zfs_range_lock + * zv_znode - rangelock_enter * zv_dn - dmu_buf_hold_array_by_bonus, dmu_request_arcbuf * GLOBAL DATA * zvol_maxphys @@ -113,7 +114,7 @@ sbd_zvol_get_volume_params(sbd_lu_t *sl) &sl->sl_zvol_minor_hdl, /* minor soft state */ &sl->sl_zvol_objset_hdl, /* dmu_tx_create */ &sl->sl_zvol_zil_hdl, /* zil_commit */ - &sl->sl_zvol_rl_hdl, /* zfs_range_lock */ + &sl->sl_zvol_rl_hdl, /* locked_range_t */ &sl->sl_zvol_dn_hdl); /* dmu_buf_hold_array_by_dnode, */ /* dmu_request_arcbuf, */ /* dmu_assign_arcbuf */ @@ -153,7 +154,7 @@ int sbd_zvol_alloc_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) { sbd_zvol_io_t *zvio = dbuf->db_lu_private; - rl_t *rl; + locked_range_t *lr; int numbufs, error; uint64_t len = dbuf->db_data_size; uint64_t offset = zvio->zvio_offset; @@ -169,13 +170,13 @@ sbd_zvol_alloc_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) * The range lock is only held 
until the dmu buffers read in and * held; not during the callers use of the data. */ - rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_READER); + lr = rangelock_enter(sl->sl_zvol_rl_hdl, offset, len, RL_READER); error = dmu_buf_hold_array_by_dnode(sl->sl_zvol_dn_hdl, offset, len, TRUE, RDTAG, &numbufs, &dbpp, DMU_READ_PREFETCH); - zfs_range_unlock(rl); + rangelock_exit(lr); if (error == ECKSUM) error = EIO; @@ -337,7 +338,7 @@ sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) sbd_zvol_io_t *zvio = dbuf->db_lu_private; dmu_tx_t *tx; int sync, i, error; - rl_t *rl; + locked_range_t *lr; arc_buf_t **abp = zvio->zvio_abp; int flags = zvio->zvio_flags; uint64_t toffset, offset = zvio->zvio_offset; @@ -345,7 +346,7 @@ sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) ASSERT(flags == 0 || flags == ZVIO_COMMIT || flags == ZVIO_ABORT); - rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_WRITER); + lr = rangelock_enter(sl->sl_zvol_rl_hdl, offset, len, RL_WRITER); tx = dmu_tx_create(sl->sl_zvol_objset_hdl); dmu_tx_hold_write(tx, ZVOL_OBJ, offset, (int)len); @@ -353,7 +354,7 @@ sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) if (error) { dmu_tx_abort(tx); - zfs_range_unlock(rl); + rangelock_exit(lr); sbd_zvol_rele_write_bufs_abort(sl, dbuf); return (error); } @@ -377,7 +378,7 @@ sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset, (ssize_t)len, sync); dmu_tx_commit(tx); - zfs_range_unlock(rl); + rangelock_exit(lr); kmem_free(zvio->zvio_abp, sizeof (arc_buf_t *) * dbuf->db_sglist_length); zvio->zvio_abp = NULL; @@ -393,8 +394,6 @@ sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) int sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio) { - int error; - rl_t *rl; uint64_t len = (uint64_t)uio->uio_resid; uint64_t offset = (uint64_t)uio->uio_loffset; @@ -404,11 +403,11 @@ sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio) if (offset + len > 
zvol_get_volume_size(sl->sl_zvol_minor_hdl)) return (EIO); - rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_READER); + locked_range_t *lr = rangelock_enter(sl->sl_zvol_rl_hdl, offset, len, + RL_READER); + int error = dmu_read_uio_dnode(sl->sl_zvol_dn_hdl, uio, len); + rangelock_exit(lr); - error = dmu_read_uio_dnode(sl->sl_zvol_dn_hdl, uio, len); - - zfs_range_unlock(rl); if (error == ECKSUM) error = EIO; return (error); @@ -421,7 +420,6 @@ sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio) int sbd_zvol_copy_write(sbd_lu_t *sl, uio_t *uio, int flags) { - rl_t *rl; dmu_tx_t *tx; int error, sync; uint64_t len = (uint64_t)uio->uio_resid; @@ -435,8 +433,8 @@ sbd_zvol_copy_write(sbd_lu_t *sl, uio_t *uio, int flags) if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl)) return (EIO); - rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_WRITER); - + locked_range_t *lr = rangelock_enter(sl->sl_zvol_rl_hdl, offset, len, + RL_WRITER); sync = !zvol_get_volume_wce(sl->sl_zvol_minor_hdl); tx = dmu_tx_create(sl->sl_zvol_objset_hdl); @@ -452,7 +450,8 @@ sbd_zvol_copy_write(sbd_lu_t *sl, uio_t *uio, int flags) } dmu_tx_commit(tx); } - zfs_range_unlock(rl); + rangelock_exit(lr); + if (sync && (flags & ZVIO_COMMIT)) zil_commit(sl->sl_zvol_zil_hdl, ZVOL_OBJ); if (error == ECKSUM) |