author     Patrick Mooney <patrick.f.mooney@gmail.com>    2015-09-16 14:55:15 +0000
committer  Patrick Mooney <patrick.f.mooney@gmail.com>    2015-09-16 14:55:15 +0000
commit     31a74c182cf2b5150a704786c53c053fd31b4c6d (patch)
tree       5eb53f7e1729ae4bf03e842ae3333e511b1a588d /usr/src
parent     a845c808b8c12dd241b837bd48ae775b26d458fe (diff)
parent     39fd84a866206a99cbb6b6e63e0c38a367aaa88e (diff)
download   illumos-joyent-31a74c182cf2b5150a704786c53c053fd31b4c6d.tar.gz
[illumos-gate merge]
commit 39fd84a866206a99cbb6b6e63e0c38a367aaa88e
    6119 mptsas doesn't handle timeouts in mptsas_get_sata_guid()
commit 6d532798b6559eb98b586fd17725d8093f3b9ade
    6168 strlcpy() does not return s1
commit 3c9168fa8e9c30d55b3aa2fde74bd7da46df53f5
    4053 Add NVME Driver Support to Illumos
commit b08923d6c9c63a4f4b647b84d9454d8124fcedd7
    6210 ping can misreport ICMP latency
    6211 want warnings in the face of long running name lookups for ping
    6212 Want sub-second ping interval support
    6213 clean up warnings in ping
commit 0d045c0d0cb001d79480ee33be28514e847f8612
    6209 libc mutexes break kernel writers hearts

Conflicts:
    usr/src/cmd/cmd-inet/usr.sbin/ping/Makefile
    usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c
    usr/src/lib/libc/port/threads/assfail.c
    usr/src/uts/common/Makefile.files
    usr/src/uts/common/Makefile.rules
    usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c
    usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h
Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/cmd/cmd-inet/usr.sbin/ping/Makefile                |    3
-rw-r--r--  usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c                  |   12
-rw-r--r--  usr/src/lib/libc/port/threads/assfail.c                    |    5
-rw-r--r--  usr/src/man/man3c/string.3c                                |    9
-rw-r--r--  usr/src/man/man7d/Makefile                                 |    3
-rw-r--r--  usr/src/man/man7d/nvme.7d                                  |   95
-rw-r--r--  usr/src/man/man9f/string.9f                                |   24
-rw-r--r--  usr/src/pkg/manifests/driver-storage-nvme.mf               |   45
-rw-r--r--  usr/src/uts/common/Makefile.files                          |    4
-rw-r--r--  usr/src/uts/common/Makefile.rules                          |    9
-rw-r--r--  usr/src/uts/common/io/nvme/nvme.c                          | 2819
-rw-r--r--  usr/src/uts/common/io/nvme/nvme.conf                       |   40
-rw-r--r--  usr/src/uts/common/io/nvme/nvme_reg.h                      |  692
-rw-r--r--  usr/src/uts/common/io/nvme/nvme_var.h                      |  240
-rw-r--r--  usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c       |  194
-rw-r--r--  usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_raid.c  |    5
-rw-r--r--  usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h  |    5
-rw-r--r--  usr/src/uts/intel/Makefile.intel                           |    3
-rw-r--r--  usr/src/uts/intel/nvme/Makefile                            |   73
19 files changed, 4170 insertions, 110 deletions
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ping/Makefile b/usr/src/cmd/cmd-inet/usr.sbin/ping/Makefile
index de617f7c43..504d64b4ef 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/ping/Makefile
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ping/Makefile
@@ -49,7 +49,8 @@ C99LMODE= -Xc99=%all
# with those in the header files. Since we need these features the best
# course of action is to switch the types of the resulting warnings off
# when running lint.
-LINTFLAGS += -erroff=E_INCONS_VAL_TYPE_DECL2 -erroff=E_INCONS_ARG_DECL2
+LINTFLAGS += -erroff=E_INCONS_VAL_TYPE_DECL2 -erroff=E_INCONS_ARG_DECL2 \
+ -erroff=E_NAME_USED_NOT_DEF2
.KEEP_STATE:
.PARALLEL:
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c b/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c
index 715e6f317a..2d79419245 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c
@@ -176,7 +176,7 @@ static timer_t timer; /* timer for waiting */
static volatile boolean_t timer_done = _B_FALSE; /* timer finished? */
static struct itimerspec interval = { { 0, 0 }, { 1, 0 } }; /* Interval for */
/* -I. The default interval is 1s. */
-static hrtime_t mintime = 500000000; /* minimum time between pings */
+static hrtime_t mintime = MSEC2NSEC(500); /* minimum time between pings */
/*
* Globals for our name services warning. See ns_warning_thr() for more on why
@@ -725,16 +725,12 @@ main(int argc, char *argv[])
}
/*
- * Finally start up the name services warning thread. Note, we don't
- * consider failures of this to be fatal. Importantly, if we're out of
- * memory, than we'd really rather let ping keep working, because the
- * administrator is probably having a bad day -- let's not make it
- * worse.
+ * Finally start up the name services warning thread.
*/
if (thr_create(NULL, 0, ns_warning_thr, NULL,
THR_DETACHED | THR_DAEMON, NULL) != 0) {
- Fprintf(stderr, "%s: %s\n",
- progname, strerror(errno));
+ Fprintf(stderr, "%s: failed to create name services "
+ "thread: %s\n", progname, strerror(errno));
exit(EXIT_FAILURE);
}
diff --git a/usr/src/lib/libc/port/threads/assfail.c b/usr/src/lib/libc/port/threads/assfail.c
index 92dd42693f..b40e6dc029 100644
--- a/usr/src/lib/libc/port/threads/assfail.c
+++ b/usr/src/lib/libc/port/threads/assfail.c
@@ -25,7 +25,7 @@
*/
/*
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
- * Copyright 2015, Joyent, Inc.
+ * Copyright 2015 Joyent, Inc.
*/
#include "lint.h"
@@ -36,7 +36,8 @@ ulwp_t *panic_thread;
static mutex_t assert_lock = DEFAULTMUTEX;
static ulwp_t *assert_thread = NULL;
-static mutex_t *panic_mutex = NULL;
+
+mutex_t *panic_mutex = NULL;
/*
* Called from __assert() to set panicstr and panic_thread.
diff --git a/usr/src/man/man3c/string.3c b/usr/src/man/man3c/string.3c
index cc934221e3..882705284d 100644
--- a/usr/src/man/man3c/string.3c
+++ b/usr/src/man/man3c/string.3c
@@ -13,7 +13,7 @@
.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License.
.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License.
.\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner]
-.TH STRING 3C "Dec 20, 2014"
+.TH STRING 3C "Sep 14, 2015"
.SH NAME
string, strcasecmp, strcasecmp_l, strncasecmp, strncasecmp_l, strcat, strncat,
strlcat, strchr, strchrnul, strrchr, strcmp, strncmp, stpcpy, stpncpy, strcpy,
@@ -285,9 +285,10 @@ The \fBstrcpy()\fR function copies string \fIs2\fR to \fIs1\fR, including the
terminating null character, stopping after the null character has been copied.
The \fBstrncpy()\fR function copies exactly \fIn\fR bytes, truncating \fIs2\fR
or adding null characters to \fIs1\fR if necessary. The result will not be
-null-terminated if the length of \fIs2\fR is \fIn\fR or more. Each function
-returns \fIs1\fR. If copying takes place between objects that overlap, the
-behavior of \fBstrcpy()\fR, \fBstrncpy()\fR, and \fBstrlcpy()\fR is undefined.
+null-terminated if the length of \fIs2\fR is \fIn\fR or more. Both the
+\fBstrcpy()\fR and \fBstrncpy()\fR functions return \fIs1\fR. If copying takes
+place between objects that overlap, the behavior of \fBstrcpy()\fR,
+\fBstrncpy()\fR, and \fBstrlcpy()\fR is undefined.
.LP
The \fBstrlcpy()\fR function copies at most \fIdstsize\fR\(mi1 characters
(\fIdstsize\fR being the size of the string buffer \fIdst\fR) from \fIsrc\fR
diff --git a/usr/src/man/man7d/Makefile b/usr/src/man/man7d/Makefile
index d940a33833..6e8550e309 100644
--- a/usr/src/man/man7d/Makefile
+++ b/usr/src/man/man7d/Makefile
@@ -11,7 +11,7 @@
#
# Copyright 2011, Richard Lowe
-# Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+# Copyright 2015 Nexenta Systems, Inc. All rights reserved.
# Copyright 2014 Garrett D'Amore <garrett@damore.org>
# Copyright 2014 Joyent, Inc. All rights reserved.
#
@@ -222,6 +222,7 @@ i386_MANFILES= ahci.7d \
npe.7d \
ntxn.7d \
nv_sata.7d \
+ nvme.7d \
pcn.7d \
radeon.7d \
ral.7d \
diff --git a/usr/src/man/man7d/nvme.7d b/usr/src/man/man7d/nvme.7d
new file mode 100644
index 0000000000..7742fc22f6
--- /dev/null
+++ b/usr/src/man/man7d/nvme.7d
@@ -0,0 +1,95 @@
+.\"
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source. A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.\"
+.\"
+.\" Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+.\"
+.Dd July 20, 2015
+.Dt NVME 7D
+.Os
+.Sh NAME
+.Nm nvme
+.Nd Intel NVMe compliant storage driver
+.Sh DESCRIPTION
+The
+.Nm
+driver uses the
+.Xr blkdev 7D
+framework to provide access to
+.Tn Intel
+NVMe compliant solid-state storage devices.
+.Lp
+NVMe devices supporting multiple namespaces will present each
+namespace as its own
+.Xr blkdev 7D
+instance in the system.
+.
+.Sh CONFIGURATION
+The
+.Nm
+driver can be configured by defining properties in the \fBnvme.conf\fR
+file. The parameters are considered an unstable interface, subject to
+change without notice. The following properties are currently
+supported:
+.Bl -tag -width Va
+.It Va strict-version
+This can be set to 0 to allow
+.Nm
+to attach to devices supporting newer versions of the NVMe
+specification. The default value is 1, limiting
+.Nm
+to work with devices up to specification version 1.0.
+.It Va ignore-unknown-vendor-status
+This can be set to 1 to allow
+.Nm
+to continue operating even if it receives an unknown vendor command
+status.
+.It Va admin-queue-len
+This is the number of entries in the admin command queue. Legal values
+are between 16 and 4096, the default value is 256.
+.It Va io-queue-len
+This is the number of entries in each I/O command queue. Legal values
+are between 16 and 65536, the default value is 1024.
+.It Va async-event-limit
+This is the maximum number of asynchronous event requests issued by
+the driver. Asynchronous events are used to report error conditions.
+The driver will never use more asynchronous events than this value, or
+what the hardware supports if it is less, or 1/10th of the admin
+queue length if it is less.
+.El
+.
+.Sh FILES
+.Bl -tag -compact -width Pa
+.It Pa /dev/dsk/cntnd0sn
+Block device minor nodes.
+.It Pa /dev/rdsk/cntnd0sn
+Raw block device minor nodes.
+.El
+.Lp
+In the device minor nodes, the following substitutions may occur:
+.Bl -tag -offset indent -width Va
+.It Va cn
+A controller number, typically one for each
+.Nm
+device found. Controller numbers are dynamically assigned by the
+system.
+.It Va tn
+The target number, this corresponds to the namespace ID used by the
+hardware. Namespace ID 0 is reserved, hence target numbers start with
+1.
+.It Va sn
+This is the
+.Em slice
+number, representing a subset of the disk. See
+.Xr dkio 7I .
+.El
+.
+.Sh SEE ALSO
+.Xr blkdev 7D
diff --git a/usr/src/man/man9f/string.9f b/usr/src/man/man9f/string.9f
index 5300eb52d6..fccd40887b 100644
--- a/usr/src/man/man9f/string.9f
+++ b/usr/src/man/man9f/string.9f
@@ -3,7 +3,7 @@
.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License.
.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License.
.\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner]
-.TH STRING 9F "Jun 4, 2014"
+.TH STRING 9F "Sep 14, 2015"
.SH NAME
string, strcasecmp, strncasecmp, strncat, strlcat, strchr, strrchr, strcmp,
strncmp, strcpy, strncpy, strlcpy, strfree, strspn, strdup, ddi_strdup, strlen,
@@ -98,11 +98,9 @@ strnlen \- string operations
.fi
.SH INTERFACE LEVEL
-.sp
.LP
Solaris DDI specific (Solaris DDI).
.SH DESCRIPTION
-.sp
.LP
The arguments \fIs\fR, \fIs1\fR, and \fIs2\fR point to strings (arrays of
characters terminated by a null character). The \fBstrcat()\fR,
@@ -111,14 +109,12 @@ characters terminated by a null character). The \fBstrcat()\fR,
Additionally, the \fBstrcpy()\fR function does not check for overflow of the
array.
.SS "\fBstrcasecmp()\fR, \fBstrncasecmp()\fR"
-.sp
.LP
The \fBstrcasecmp()\fR and \fBstrncasecmp()\fR functions are case-insensitive
versions of \fBstrcmp()\fR and \fBstrncmp()\fR respectively, described below.
They assume the \fBASCII\fR character set and ignore differences in case when
comparing lower and upper case characters.
.SS "\fBstrncat()\fR, \fBstrlcat()\fR"
-.sp
.LP
The \fBstrncat()\fR function appends at most \fIn\fR characters of string
\fIs2\fR, including the terminating null character, to the end of string
@@ -150,7 +146,6 @@ if (strlcat(dst, src, dstsize) >= dstsize)
.in -2
.SS "\fBstrchr()\fR, \fBstrrchr()\fR"
-.sp
.LP
The \fBstrchr()\fR function returns a pointer to the first occurrence of
\fIc\fR (converted to a \fBchar\fR) in string \fIs\fR, or a null pointer if
@@ -158,7 +153,6 @@ The \fBstrchr()\fR function returns a pointer to the first occurrence of
pointer to the last occurrence of \fIc\fR. The null character terminating a
string is considered to be part of the string.
.SS "\fBstrcmp()\fR, \fBstrncmp()\fR"
-.sp
.LP
The \fBstrcmp()\fR function compares two strings byte-by-byte, according to the
ordering of your machine's character set. The function returns an integer
@@ -170,15 +164,15 @@ strings being compared. The \fBstrncmp()\fR function makes the same comparison
but looks at a maximum of \fIn\fR bytes. Bytes following a null byte are not
compared.
.SS "\fBstrcpy()\fR, \fBstrncpy()\fR, \fBstrlcpy()\fR"
-.sp
.LP
The \fBstrcpy()\fR function copies string \fIs2\fR to \fIs1\fR, including the
terminating null character, stopping after the null character has been copied.
The \fBstrncpy()\fR function copies exactly \fIn\fR bytes, truncating \fIs2\fR
or adding null characters to \fIs1\fR if necessary. The result will not be
-null-terminated if the length of \fIs2\fR is \fIn\fR or more. Each function
-returns \fIs1\fR. If copying takes place between objects that overlap, the
-behavior of \fBstrcpy()\fR, \fBstrncpy()\fR, and \fBstrlcpy()\fR is undefined.
+null-terminated if the length of \fIs2\fR is \fIn\fR or more. Both the
+\fBstrcpy()\fR and \fBstrncpy()\fR functions return \fIs1\fR. If copying takes
+place between objects that overlap, the behavior of \fBstrcpy()\fR,
+\fBstrncpy()\fR, and \fBstrlcpy()\fR is undefined.
.sp
.LP
The \fBstrlcpy()\fR function copies at most \fIdstsize\fR\(mi1 characters
@@ -195,19 +189,16 @@ if (strlcpy(dst, src, dstsize) >= dstsize)
.in -2
.SS "\fBstrfree()\fR"
-.sp
.LP
The \fBstrfree()\fR function frees the memory associated with the string
pointed to by \fIs\fR. This memory pointed to by \fIs\fR must be of size
\fBstrlen\fR(\fIs\fR)+1, and must have been allocated (either directly or
indirectly) by \fBkmem_alloc\fR(9F) or \fBkmem_zalloc\fR(9F).
.SS "\fBstrspn()\fR"
-.sp
.LP
The \fBstrspn()\fR function returns the length of the initial segment of string
\fIs1\fR that consists entirely of characters from string \fIs2\fR.
.SS "\fBstrdup()\fR, \fBddi_strdup()\fR"
-.sp
.LP
The \fBddi_strdup()\fR function returns a pointer to a new string that is a
duplicate of the string pointed to by \fIs1\fR. The returned pointer can be
@@ -223,7 +214,6 @@ The \fBstrdup()\fR function behaves the same as the \fBddi_strdup()\fR when
called with the \fBKM_SLEEP\fR flag. This means that \fBstrdup()\fR can sleep
until memory is available and will always succeed.
.SS "\fBstrlen()\fR, \fBstrnlen()\fR"
-.sp
.LP
The \fBstrlen()\fR function returns the number of bytes in \fIs\fR, not
including the terminating null character.
@@ -234,7 +224,6 @@ bytes in \fIs\fR, not including the terminating null character. The
\fBstrnlen()\fR function never examines more than \fIn\fR bytes of the string
pointed to by \fIs\fR.
.SH CONTEXT
-.sp
.LP
The \fBstrdup()\fR and \fBddi_strdup()\fR functions can be called from user or
kernel context.
@@ -247,7 +236,6 @@ the \fBKM_NOSLEEP\fR flag is set.
All the other string manipulation functions can be called from user, interrupt,
or kernel context.
.SH ATTRIBUTES
-.sp
.LP
See \fBattributes\fR(5) for descriptions of the following attributes:
.sp
@@ -263,7 +251,6 @@ Interface Stability Committed
.TE
.SH SEE ALSO
-.sp
.LP
\fBstring\fR(3C), \fBattributes\fR(5), \fBbcopy\fR(9F), \fBddi_copyin\fR(9F),
\fBkmem_alloc\fR(9F)
@@ -271,7 +258,6 @@ Interface Stability Committed
.LP
\fIWriting Device Drivers\fR
.SH NOTES
-.sp
.LP
If copying takes place between objects that overlap, the behavior of
\fBstrlcat()\fR, \fBstrncat()\fR, \fBstrcpy()\fR, \fBstrlcpy()\fR, and
diff --git a/usr/src/pkg/manifests/driver-storage-nvme.mf b/usr/src/pkg/manifests/driver-storage-nvme.mf
new file mode 100644
index 0000000000..3296a3beef
--- /dev/null
+++ b/usr/src/pkg/manifests/driver-storage-nvme.mf
@@ -0,0 +1,45 @@
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+#
+
+#
+# The default for payload-bearing actions in this package is to appear in the
+# global zone only. See the include file for greater detail, as well as
+# information about overriding the defaults.
+#
+<include global_zone_only_component>
+set name=pkg.fmri value=pkg:/driver/storage/nvme@$(PKGVERS)
+set name=pkg.description \
+ value="Driver for Intel NVMe 1.0e compliant storage devices"
+set name=pkg.summary value="NVMe driver"
+set name=info.classification \
+ value=org.opensolaris.category.2008:System/Hardware
+set name=variant.arch value=i386
+dir path=kernel group=sys
+dir path=kernel/drv group=sys
+dir path=kernel/drv/$(ARCH64) group=sys
+dir path=usr group=sys
+dir path=usr/share
+dir path=usr/share/man
+dir path=usr/share/man/man7d
+driver name=nvme alias=pciex8086,953 class=disk perms="* 0600 root sys"
+file path=kernel/drv/$(ARCH64)/nvme group=sys
+file path=kernel/drv/nvme group=sys
+file path=kernel/drv/nvme.conf group=sys
+file path=usr/share/man/man7d/nvme.7d
+license lic_CDDL license=lic_CDDL
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 61f1153e12..3555a86b48 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -24,7 +24,7 @@
# Copyright (c) 2012 Joyent, Inc. All rights reserved.
# Copyright (c) 2011, 2014 by Delphix. All rights reserved.
# Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
-# Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+# Copyright 2015 Nexenta Systems, Inc. All rights reserved.
# Copyright 2015, Joyent, Inc.
#
@@ -1900,6 +1900,8 @@ YGE_OBJS = yge.o
SKD_OBJS = skd.o
+NVME_OBJS = nvme.o
+
#
# Build up defines and paths.
#
diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules
index b6efec8b69..4bf1113182 100644
--- a/usr/src/uts/common/Makefile.rules
+++ b/usr/src/uts/common/Makefile.rules
@@ -22,7 +22,7 @@
#
# Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
# Copyright 2013 Garrett D'Amore <garrett@damore.org>
-# Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+# Copyright 2015 Nexenta Systems, Inc. All rights reserved.
# Copyright 2015 Joyent, Inc.
#
@@ -1019,6 +1019,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nge/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nvme/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nxge/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -2387,6 +2391,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/nfp/%.c
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/nge/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/nvme/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/nxge/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
diff --git a/usr/src/uts/common/io/nvme/nvme.c b/usr/src/uts/common/io/nvme/nvme.c
new file mode 100644
index 0000000000..5914ca0226
--- /dev/null
+++ b/usr/src/uts/common/io/nvme/nvme.c
@@ -0,0 +1,2819 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ */
+
+/*
+ * blkdev driver for NVMe compliant storage devices
+ *
+ * This driver was written to conform to version 1.0e of the NVMe specification.
+ * It may work with newer versions, but that is completely untested and disabled
+ * by default.
+ *
+ * The driver has only been tested on x86 systems and will not work on big-
+ * endian systems without changes to the code accessing registers and data
+ * structures used by the hardware.
+ *
+ *
+ * Interrupt Usage:
+ *
+ * The driver will use a FIXED interrupt while configuring the device as the
+ * specification requires. Later in the attach process it will switch to MSI-X
+ * or MSI if supported. The driver wants to have one interrupt vector per CPU,
+ * but it will work correctly if less are available. Interrupts can be shared
+ * by queues, the interrupt handler will iterate through the I/O queue array by
+ * steps of n_intr_cnt. Usually only the admin queue will share an interrupt
+ * with one I/O queue. The interrupt handler will retrieve completed commands
+ * from all queues sharing an interrupt vector and will post them to a taskq
+ * for completion processing.
+ *
+ *
+ * Command Processing:
+ *
+ * NVMe devices can have up to 65536 I/O queue pairs, with each queue holding up
+ * to 65536 I/O commands. The driver will configure one I/O queue pair per
+ * available interrupt vector, with the queue length usually much smaller than
+ * the maximum of 65536. If the hardware doesn't provide enough queues, fewer
+ * interrupt vectors will be used.
+ *
+ * Additionally the hardware provides a single special admin queue pair that can
+ * hold up to 4096 admin commands.
+ *
+ * From the hardware perspective both queues of a queue pair are independent,
+ * but they share some driver state: the command array (holding pointers to
+ * commands currently being processed by the hardware) and the active command
+ * counter. Access to the submission side of a queue pair and the shared state
+ * is protected by nq_mutex. The completion side of a queue pair does not need
+ * that protection apart from its access to the shared state; it is called only
+ * in the interrupt handler which does not run concurrently for the same
+ * interrupt vector.
+ *
+ * When a command is submitted to a queue pair the active command counter is
+ * incremented and a pointer to the command is stored in the command array. The
+ * array index is used as command identifier (CID) in the submission queue
+ * entry. Some commands may take a very long time to complete, and if the queue
+ * wraps around in that time a submission may find the next array slot to still
+ * be used by a long-running command. In this case the array is sequentially
+ * searched for the next free slot. The length of the command array is the same
+ * as the configured queue length.
+ *
+ *
+ * Namespace Support:
+ *
+ * NVMe devices can have multiple namespaces, each being an independent data
+ * store. The driver supports multiple namespaces and creates a blkdev interface
+ * for each namespace found. Namespaces can have various attributes to support
+ * thin provisioning, extended LBAs, and protection information. This driver
+ * does not support any of this and ignores namespaces that have these
+ * attributes.
+ *
+ *
+ * Blkdev Interface:
+ *
+ * This driver uses blkdev to do all the heavy lifting involved with presenting
+ * a disk device to the system. As a result, the processing of I/O requests is
+ * relatively simple as blkdev takes care of partitioning, boundary checks, DMA
+ * setup, and splitting of transfers into manageable chunks.
+ *
+ * I/O requests coming in from blkdev are turned into NVM commands and posted to
+ * an I/O queue. The queue is selected by taking the CPU id modulo the number of
+ * queues. There is currently no timeout handling of I/O commands.
+ *
+ * Blkdev also supports querying device/media information and generating a
+ * devid. The driver reports the best block size as determined by the namespace
+ * format back to blkdev as physical block size to support partition and block
+ * alignment. The devid is composed using the device vendor ID, model number,
+ * serial number, and the namespace ID.
+ *
+ *
+ * Error Handling:
+ *
+ * Error handling is currently limited to detecting fatal hardware errors,
+ * either by asynchronous events, or synchronously through command status or
+ * admin command timeouts. In case of severe errors the device is fenced off,
+ * all further requests will return EIO. FMA is then called to fault the device.
+ *
+ * The hardware has a limit for outstanding asynchronous event requests. Before
+ * this limit is known the driver assumes it is at least 1 and posts a single
+ * asynchronous request. Later when the limit is known more asynchronous event
+ * requests are posted to allow quicker reception of error information. When an
+ * asynchronous event is posted by the hardware the driver will parse the error
+ * status fields and log information or fault the device, depending on the
+ * severity of the asynchronous event. The asynchronous event request is then
+ * reused and posted to the admin queue again.
+ *
+ * On command completion the command status is checked for errors. In case of
+ * errors indicating a driver bug the driver panics. Almost all other error
+ * status values just cause EIO to be returned.
+ *
+ * Command timeouts are currently detected for all admin commands except
+ * asynchronous event requests. If a command times out and the hardware appears
+ * to be healthy the driver attempts to abort the command. If this fails the
+ * driver assumes the device to be dead, fences it off, and calls FMA to retire
+ * it. In general admin commands are issued at attach time only. No timeout
+ * handling of normal I/O commands is presently done.
+ *
+ * In some cases it may be possible that the ABORT command times out, too. In
+ * that case the device is also declared dead and fenced off.
+ *
+ *
+ * Quiesce / Fast Reboot:
+ *
+ * The driver currently does not support fast reboot. A quiesce(9E) entry point
+ * is still provided which is used to send a shutdown notification to the
+ * device.
+ *
+ *
+ * Driver Configuration:
+ *
+ * The following driver properties can be changed to control some aspects of the
+ * driver's operation:
+ * - strict-version: can be set to 0 to allow devices conforming to newer
+ * versions to be used
+ * - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor
+ * specific command status as a fatal error leading device faulting
+ * - admin-queue-len: the maximum length of the admin queue (16-4096)
+ * - io-queue-len: the maximum length of the I/O queues (16-65536)
+ * - async-event-limit: the maximum number of asynchronous event requests to be
+ * posted by the driver
+ *
+ *
+ * TODO:
+ * - figure out sane default for I/O queue depth reported to blkdev
+ * - polled I/O support to support kernel core dumping
+ * - FMA handling of media errors
+ * - support for the Volatile Write Cache
+ * - support for devices supporting very large I/O requests using chained PRPs
+ * - support for querying log pages from user space
+ * - support for configuring hardware parameters like interrupt coalescing
+ * - support for media formatting and hard partitioning into namespaces
+ * - support for big-endian systems
+ * - support for fast reboot
+ */
+
+#include <sys/byteorder.h>
+#ifdef _BIG_ENDIAN
+#error nvme driver needs porting for big-endian platforms
+#endif
+
+#include <sys/modctl.h>
+#include <sys/conf.h>
+#include <sys/devops.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/bitmap.h>
+#include <sys/sysmacros.h>
+#include <sys/param.h>
+#include <sys/varargs.h>
+#include <sys/cpuvar.h>
+#include <sys/disp.h>
+#include <sys/blkdev.h>
+#include <sys/atomic.h>
+#include <sys/archsystm.h>
+
+#include "nvme_reg.h"
+#include "nvme_var.h"
+
+
+/* NVMe spec version supported */
+static const int nvme_version_major = 1;
+static const int nvme_version_minor = 0;
+
+static int nvme_attach(dev_info_t *, ddi_attach_cmd_t);
+static int nvme_detach(dev_info_t *, ddi_detach_cmd_t);
+static int nvme_quiesce(dev_info_t *);
+static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *);
+static void nvme_disable_interrupts(nvme_t *);
+static int nvme_enable_interrupts(nvme_t *);
+static int nvme_setup_interrupts(nvme_t *, int, int);
+static void nvme_release_interrupts(nvme_t *);
+static uint_t nvme_intr(caddr_t, caddr_t);
+
+static void nvme_shutdown(nvme_t *, int, boolean_t);
+static boolean_t nvme_reset(nvme_t *, boolean_t);
+static int nvme_init(nvme_t *);
+static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
+static void nvme_free_cmd(nvme_cmd_t *);
+static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
+ bd_xfer_t *);
+static int nvme_admin_cmd(nvme_cmd_t *, int);
+static int nvme_submit_cmd(nvme_qpair_t *, nvme_cmd_t *);
+static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
+static boolean_t nvme_wait_cmd(nvme_cmd_t *, uint_t);
+static void nvme_wakeup_cmd(void *);
+static void nvme_async_event_task(void *);
+
+static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
+static int nvme_check_vendor_cmd_status(nvme_cmd_t *);
+static int nvme_check_integrity_cmd_status(nvme_cmd_t *);
+static int nvme_check_specific_cmd_status(nvme_cmd_t *);
+static int nvme_check_generic_cmd_status(nvme_cmd_t *);
+static inline int nvme_check_cmd_status(nvme_cmd_t *);
+
+static void nvme_abort_cmd(nvme_cmd_t *);
+static int nvme_async_event(nvme_t *);
+static void *nvme_get_logpage(nvme_t *, uint8_t, ...);
+static void *nvme_identify(nvme_t *, uint32_t);
+static int nvme_set_nqueues(nvme_t *, uint16_t);
+
+static void nvme_free_dma(nvme_dma_t *);
+static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
+ nvme_dma_t **);
+static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t,
+ nvme_dma_t **);
+static void nvme_free_qpair(nvme_qpair_t *);
+static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, int);
+static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t);
+
+static inline void nvme_put64(nvme_t *, uintptr_t, uint64_t);
+static inline void nvme_put32(nvme_t *, uintptr_t, uint32_t);
+static inline uint64_t nvme_get64(nvme_t *, uintptr_t);
+static inline uint32_t nvme_get32(nvme_t *, uintptr_t);
+
+static boolean_t nvme_check_regs_hdl(nvme_t *);
+static boolean_t nvme_check_dma_hdl(nvme_dma_t *);
+
+static int nvme_fill_prp(nvme_cmd_t *, bd_xfer_t *);
+
+static void nvme_bd_xfer_done(void *);
+static void nvme_bd_driveinfo(void *, bd_drive_t *);
+static int nvme_bd_mediainfo(void *, bd_media_t *);
+static int nvme_bd_cmd(nvme_namespace_t *, bd_xfer_t *, uint8_t);
+static int nvme_bd_read(void *, bd_xfer_t *);
+static int nvme_bd_write(void *, bd_xfer_t *);
+static int nvme_bd_sync(void *, bd_xfer_t *);
+static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *);
+
+static void nvme_prepare_devid(nvme_t *, uint32_t);
+
+static void *nvme_state;
+static kmem_cache_t *nvme_cmd_cache;
+
+/*
+ * DMA attributes for queue DMA memory
+ *
+ * Queue DMA memory must be page aligned. The maximum length of a queue is
+ * 65536 entries, and an entry can be 64 bytes long.
+ */
+static ddi_dma_attr_t nvme_queue_dma_attr = {
+ .dma_attr_version = DMA_ATTR_V0,
+ .dma_attr_addr_lo = 0,
+ .dma_attr_addr_hi = 0xffffffffffffffffULL,
+ .dma_attr_count_max = (UINT16_MAX + 1) * sizeof (nvme_sqe_t),
+ .dma_attr_align = 0x1000,
+ .dma_attr_burstsizes = 0x7ff,
+ .dma_attr_minxfer = 0x1000,
+ .dma_attr_maxxfer = (UINT16_MAX + 1) * sizeof (nvme_sqe_t),
+ .dma_attr_seg = 0xffffffffffffffffULL,
+ .dma_attr_sgllen = 1,
+ .dma_attr_granular = 1,
+ .dma_attr_flags = 0,
+};
+
+/*
+ * DMA attributes for transfers using Physical Region Page (PRP) entries
+ *
+ * A PRP entry describes one page of DMA memory using the page size specified
+ * in the controller configuration's memory page size register (CC.MPS). It uses
+ * a 64bit base address aligned to this page size. There is no limitation on
+ * chaining PRPs together for arbitrarily large DMA transfers.
+ */
+static ddi_dma_attr_t nvme_prp_dma_attr = {
+ .dma_attr_version = DMA_ATTR_V0,
+ .dma_attr_addr_lo = 0,
+ .dma_attr_addr_hi = 0xffffffffffffffffULL,
+ .dma_attr_count_max = 0xfff,
+ .dma_attr_align = 0x1000,
+ .dma_attr_burstsizes = 0x7ff,
+ .dma_attr_minxfer = 0x1000,
+ .dma_attr_maxxfer = 0x1000,
+ .dma_attr_seg = 0xffffffffffffffffULL,
+ .dma_attr_sgllen = -1,
+ .dma_attr_granular = 1,
+ .dma_attr_flags = 0,
+};
+
+/*
+ * DMA attributes for transfers using scatter/gather lists
+ *
+ * A SGL entry describes a chunk of DMA memory using a 64bit base address and a
+ * 32bit length field. SGL Segment and SGL Last Segment entries require the
+ * length to be a multiple of 16 bytes.
+ */
+static ddi_dma_attr_t nvme_sgl_dma_attr = {
+ .dma_attr_version = DMA_ATTR_V0,
+ .dma_attr_addr_lo = 0,
+ .dma_attr_addr_hi = 0xffffffffffffffffULL,
+ .dma_attr_count_max = 0xffffffffUL,
+ .dma_attr_align = 1,
+ .dma_attr_burstsizes = 0x7ff,
+ .dma_attr_minxfer = 0x10,
+ .dma_attr_maxxfer = 0xfffffffffULL,
+ .dma_attr_seg = 0xffffffffffffffffULL,
+ .dma_attr_sgllen = -1,
+ .dma_attr_granular = 0x10,
+ .dma_attr_flags = 0
+};
+
+static ddi_device_acc_attr_t nvme_reg_acc_attr = {
+ .devacc_attr_version = DDI_DEVICE_ATTR_V0,
+ .devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC,
+ .devacc_attr_dataorder = DDI_STRICTORDER_ACC
+};
+
+static struct dev_ops nvme_dev_ops = {
+ .devo_rev = DEVO_REV,
+ .devo_refcnt = 0,
+ .devo_getinfo = ddi_no_info,
+ .devo_identify = nulldev,
+ .devo_probe = nulldev,
+ .devo_attach = nvme_attach,
+ .devo_detach = nvme_detach,
+ .devo_reset = nodev,
+ .devo_cb_ops = NULL,
+ .devo_bus_ops = NULL,
+ .devo_power = NULL,
+ .devo_quiesce = nvme_quiesce,
+};
+
+static struct modldrv nvme_modldrv = {
+ .drv_modops = &mod_driverops,
+ .drv_linkinfo = "NVMe v1.0e",
+ .drv_dev_ops = &nvme_dev_ops
+};
+
+static struct modlinkage nvme_modlinkage = {
+ .ml_rev = MODREV_1,
+ .ml_linkage = { &nvme_modldrv, NULL }
+};
+
+static bd_ops_t nvme_bd_ops = {
+ .o_version = BD_OPS_VERSION_0,
+ .o_drive_info = nvme_bd_driveinfo,
+ .o_media_info = nvme_bd_mediainfo,
+ .o_devid_init = nvme_bd_devid,
+ .o_sync_cache = nvme_bd_sync,
+ .o_read = nvme_bd_read,
+ .o_write = nvme_bd_write,
+};
+
+int
+_init(void)
+{
+ int error;
+
+ error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1);
+ if (error != DDI_SUCCESS)
+ return (error);
+
+ nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache",
+ sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
+
+ bd_mod_init(&nvme_dev_ops);
+
+ error = mod_install(&nvme_modlinkage);
+ if (error != DDI_SUCCESS) {
+ ddi_soft_state_fini(&nvme_state);
+ bd_mod_fini(&nvme_dev_ops);
+ }
+
+ return (error);
+}
+
+int
+_fini(void)
+{
+ int error;
+
+ error = mod_remove(&nvme_modlinkage);
+ if (error == DDI_SUCCESS) {
+ ddi_soft_state_fini(&nvme_state);
+ kmem_cache_destroy(nvme_cmd_cache);
+ bd_mod_fini(&nvme_dev_ops);
+ }
+
+ return (error);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&nvme_modlinkage, modinfop));
+}
+
+static inline void
+nvme_put64(nvme_t *nvme, uintptr_t reg, uint64_t val)
+{
+ ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);
+
+ /*LINTED: E_BAD_PTR_CAST_ALIGN*/
+ ddi_put64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg), val);
+}
+
+static inline void
+nvme_put32(nvme_t *nvme, uintptr_t reg, uint32_t val)
+{
+ ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);
+
+ /*LINTED: E_BAD_PTR_CAST_ALIGN*/
+ ddi_put32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg), val);
+}
+
+static inline uint64_t
+nvme_get64(nvme_t *nvme, uintptr_t reg)
+{
+ uint64_t val;
+
+ ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);
+
+ /*LINTED: E_BAD_PTR_CAST_ALIGN*/
+ val = ddi_get64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg));
+
+ return (val);
+}
+
+static inline uint32_t
+nvme_get32(nvme_t *nvme, uintptr_t reg)
+{
+ uint32_t val;
+
+ ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);
+
+ /*LINTED: E_BAD_PTR_CAST_ALIGN*/
+ val = ddi_get32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg));
+
+ return (val);
+}
+
+static boolean_t
+nvme_check_regs_hdl(nvme_t *nvme)
+{
+ ddi_fm_error_t error;
+
+ ddi_fm_acc_err_get(nvme->n_regh, &error, DDI_FME_VERSION);
+
+ if (error.fme_status != DDI_FM_OK)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+static boolean_t
+nvme_check_dma_hdl(nvme_dma_t *dma)
+{
+ ddi_fm_error_t error;
+
+ if (dma == NULL)
+ return (B_FALSE);
+
+ ddi_fm_dma_err_get(dma->nd_dmah, &error, DDI_FME_VERSION);
+
+ if (error.fme_status != DDI_FM_OK)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+static void
+nvme_free_dma(nvme_dma_t *dma)
+{
+ if (dma->nd_dmah != NULL)
+ (void) ddi_dma_unbind_handle(dma->nd_dmah);
+ if (dma->nd_acch != NULL)
+ ddi_dma_mem_free(&dma->nd_acch);
+ if (dma->nd_dmah != NULL)
+ ddi_dma_free_handle(&dma->nd_dmah);
+ kmem_free(dma, sizeof (nvme_dma_t));
+}
+
+static int
+nvme_zalloc_dma(nvme_t *nvme, size_t len, uint_t flags,
+ ddi_dma_attr_t *dma_attr, nvme_dma_t **ret)
+{
+ nvme_dma_t *dma = kmem_zalloc(sizeof (nvme_dma_t), KM_SLEEP);
+
+ if (ddi_dma_alloc_handle(nvme->n_dip, dma_attr, DDI_DMA_SLEEP, NULL,
+ &dma->nd_dmah) != DDI_SUCCESS) {
+ /*
+ * Due to DDI_DMA_SLEEP this can't be DDI_DMA_NORESOURCES, and
+ * the only other possible error is DDI_DMA_BADATTR which
+ * indicates a driver bug which should cause a panic.
+ */
+ dev_err(nvme->n_dip, CE_PANIC,
+ "!failed to get DMA handle, check DMA attributes");
+ return (DDI_FAILURE);
+ }
+
+ /*
+ * ddi_dma_mem_alloc() can only fail when DDI_DMA_NOSLEEP is specified
+ * or the flags are conflicting, which isn't the case here.
+ */
+ (void) ddi_dma_mem_alloc(dma->nd_dmah, len, &nvme->n_reg_acc_attr,
+ DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, &dma->nd_memp,
+ &dma->nd_len, &dma->nd_acch);
+
+ if (ddi_dma_addr_bind_handle(dma->nd_dmah, NULL, dma->nd_memp,
+ dma->nd_len, flags | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL,
+ &dma->nd_cookie, &dma->nd_ncookie) != DDI_DMA_MAPPED) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!failed to bind DMA memory");
+ atomic_inc_32(&nvme->n_dma_bind_err);
+ *ret = NULL;
+ nvme_free_dma(dma);
+ return (DDI_FAILURE);
+ }
+
+ bzero(dma->nd_memp, dma->nd_len);
+
+ *ret = dma;
+ return (DDI_SUCCESS);
+}
+
+static int
+nvme_zalloc_queue_dma(nvme_t *nvme, uint32_t nentry, uint16_t qe_len,
+ uint_t flags, nvme_dma_t **dma)
+{
+ uint32_t len = nentry * qe_len;
+ ddi_dma_attr_t q_dma_attr = nvme->n_queue_dma_attr;
+
+ len = roundup(len, nvme->n_pagesize);
+
+ q_dma_attr.dma_attr_minxfer = len;
+
+ if (nvme_zalloc_dma(nvme, len, flags, &q_dma_attr, dma)
+ != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!failed to get DMA memory for queue");
+ goto fail;
+ }
+
+ if ((*dma)->nd_ncookie != 1) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!got too many cookies for queue DMA");
+ goto fail;
+ }
+
+ return (DDI_SUCCESS);
+
+fail:
+ if (*dma) {
+ nvme_free_dma(*dma);
+ *dma = NULL;
+ }
+
+ return (DDI_FAILURE);
+}
+
+static void
+nvme_free_qpair(nvme_qpair_t *qp)
+{
+ int i;
+
+ mutex_destroy(&qp->nq_mutex);
+
+ if (qp->nq_sqdma != NULL)
+ nvme_free_dma(qp->nq_sqdma);
+ if (qp->nq_cqdma != NULL)
+ nvme_free_dma(qp->nq_cqdma);
+
+ if (qp->nq_active_cmds > 0)
+ for (i = 0; i != qp->nq_nentry; i++)
+ if (qp->nq_cmd[i] != NULL)
+ nvme_free_cmd(qp->nq_cmd[i]);
+
+ if (qp->nq_cmd != NULL)
+ kmem_free(qp->nq_cmd, sizeof (nvme_cmd_t *) * qp->nq_nentry);
+
+ kmem_free(qp, sizeof (nvme_qpair_t));
+}
+
+static int
+nvme_alloc_qpair(nvme_t *nvme, uint32_t nentry, nvme_qpair_t **nqp,
+ int idx)
+{
+ nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP);
+
+ mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER,
+ DDI_INTR_PRI(nvme->n_intr_pri));
+
+ if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t),
+ DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS)
+ goto fail;
+
+ if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t),
+ DDI_DMA_READ, &qp->nq_cqdma) != DDI_SUCCESS)
+ goto fail;
+
+ qp->nq_sq = (nvme_sqe_t *)qp->nq_sqdma->nd_memp;
+ qp->nq_cq = (nvme_cqe_t *)qp->nq_cqdma->nd_memp;
+ qp->nq_nentry = nentry;
+
+ qp->nq_sqtdbl = NVME_REG_SQTDBL(nvme, idx);
+ qp->nq_cqhdbl = NVME_REG_CQHDBL(nvme, idx);
+
+ qp->nq_cmd = kmem_zalloc(sizeof (nvme_cmd_t *) * nentry, KM_SLEEP);
+ qp->nq_next_cmd = 0;
+
+ *nqp = qp;
+ return (DDI_SUCCESS);
+
+fail:
+ nvme_free_qpair(qp);
+ *nqp = NULL;
+
+ return (DDI_FAILURE);
+}
+
+static nvme_cmd_t *
+nvme_alloc_cmd(nvme_t *nvme, int kmflag)
+{
+ nvme_cmd_t *cmd = kmem_cache_alloc(nvme_cmd_cache, kmflag);
+
+ if (cmd == NULL)
+ return (cmd);
+
+ bzero(cmd, sizeof (nvme_cmd_t));
+
+ cmd->nc_nvme = nvme;
+
+ mutex_init(&cmd->nc_mutex, NULL, MUTEX_DRIVER,
+ DDI_INTR_PRI(nvme->n_intr_pri));
+ cv_init(&cmd->nc_cv, NULL, CV_DRIVER, NULL);
+
+ return (cmd);
+}
+
+static void
+nvme_free_cmd(nvme_cmd_t *cmd)
+{
+ if (cmd->nc_dma) {
+ nvme_free_dma(cmd->nc_dma);
+ cmd->nc_dma = NULL;
+ }
+
+ cv_destroy(&cmd->nc_cv);
+ mutex_destroy(&cmd->nc_mutex);
+
+ kmem_cache_free(nvme_cmd_cache, cmd);
+}
+
+static int
+nvme_submit_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
+{
+ nvme_reg_sqtdbl_t tail = { 0 };
+
+ mutex_enter(&qp->nq_mutex);
+
+ if (qp->nq_active_cmds == qp->nq_nentry) {
+ mutex_exit(&qp->nq_mutex);
+ return (DDI_FAILURE);
+ }
+
+ cmd->nc_completed = B_FALSE;
+
+ /*
+ * Try to insert the cmd into the active cmd array at the nq_next_cmd
+ * slot. If the slot is already occupied advance to the next slot and
+ * try again. This can happen for long running commands like async event
+ * requests.
+ */
+ while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
+ qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
+ qp->nq_cmd[qp->nq_next_cmd] = cmd;
+
+ qp->nq_active_cmds++;
+
+ cmd->nc_sqe.sqe_cid = qp->nq_next_cmd;
+ bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t));
+ (void) ddi_dma_sync(qp->nq_sqdma->nd_dmah,
+ sizeof (nvme_sqe_t) * qp->nq_sqtail,
+ sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV);
+ qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
+
+ tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
+ nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);
+
+ mutex_exit(&qp->nq_mutex);
+ return (DDI_SUCCESS);
+}
+
+static nvme_cmd_t *
+nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
+{
+ nvme_reg_cqhdbl_t head = { 0 };
+
+ nvme_cqe_t *cqe;
+ nvme_cmd_t *cmd;
+
+ (void) ddi_dma_sync(qp->nq_cqdma->nd_dmah, 0,
+ sizeof (nvme_cqe_t) * qp->nq_nentry, DDI_DMA_SYNC_FORKERNEL);
+
+ cqe = &qp->nq_cq[qp->nq_cqhead];
+
+ /* Check phase tag of CQE. Hardware inverts it for new entries. */
+ if (cqe->cqe_sf.sf_p == qp->nq_phase)
+ return (NULL);
+
+ ASSERT(nvme->n_ioq[cqe->cqe_sqid] == qp);
+ ASSERT(cqe->cqe_cid < qp->nq_nentry);
+
+ mutex_enter(&qp->nq_mutex);
+ cmd = qp->nq_cmd[cqe->cqe_cid];
+ qp->nq_cmd[cqe->cqe_cid] = NULL;
+ qp->nq_active_cmds--;
+ mutex_exit(&qp->nq_mutex);
+
+ ASSERT(cmd != NULL);
+ ASSERT(cmd->nc_nvme == nvme);
+ ASSERT(cmd->nc_sqid == cqe->cqe_sqid);
+ ASSERT(cmd->nc_sqe.sqe_cid == cqe->cqe_cid);
+ bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));
+
+ qp->nq_sqhead = cqe->cqe_sqhd;
+
+ head.b.cqhdbl_cqh = qp->nq_cqhead = (qp->nq_cqhead + 1) % qp->nq_nentry;
+
+ /* Toggle phase on wrap-around. */
+ if (qp->nq_cqhead == 0)
+ qp->nq_phase = qp->nq_phase ? 0 : 1;
+
+ nvme_put32(cmd->nc_nvme, qp->nq_cqhdbl, head.r);
+
+ return (cmd);
+}
+
+static int
+nvme_check_unknown_cmd_status(nvme_cmd_t *cmd)
+{
+ nvme_cqe_t *cqe = &cmd->nc_cqe;
+
+ dev_err(cmd->nc_nvme->n_dip, CE_WARN,
+ "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
+ "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
+ cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
+ cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);
+
+ if (cmd->nc_nvme->n_strict_version) {
+ cmd->nc_nvme->n_dead = B_TRUE;
+ ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST);
+ }
+
+ return (EIO);
+}
+
+static int
+nvme_check_vendor_cmd_status(nvme_cmd_t *cmd)
+{
+ nvme_cqe_t *cqe = &cmd->nc_cqe;
+
+ dev_err(cmd->nc_nvme->n_dip, CE_WARN,
+ "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
+ "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
+ cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
+ cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);
+ if (!cmd->nc_nvme->n_ignore_unknown_vendor_status) {
+ cmd->nc_nvme->n_dead = B_TRUE;
+ ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST);
+ }
+
+ return (EIO);
+}
+
+static int
+nvme_check_integrity_cmd_status(nvme_cmd_t *cmd)
+{
+ nvme_cqe_t *cqe = &cmd->nc_cqe;
+
+ switch (cqe->cqe_sf.sf_sc) {
+ case NVME_CQE_SC_INT_NVM_WRITE:
+ /* write fail */
+ /* TODO: post ereport */
+ return (EIO);
+
+ case NVME_CQE_SC_INT_NVM_READ:
+ /* read fail */
+ /* TODO: post ereport */
+ return (EIO);
+
+ default:
+ return (nvme_check_unknown_cmd_status(cmd));
+ }
+}
+
+static int
+nvme_check_generic_cmd_status(nvme_cmd_t *cmd)
+{
+ nvme_cqe_t *cqe = &cmd->nc_cqe;
+
+ switch (cqe->cqe_sf.sf_sc) {
+ case NVME_CQE_SC_GEN_SUCCESS:
+ return (0);
+
+ /*
+ * Errors indicating a bug in the driver should cause a panic.
+ */
+ case NVME_CQE_SC_GEN_INV_OPC:
+ /* Invalid Command Opcode */
+ dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
+ "invalid opcode in cmd %p", (void *)cmd);
+ return (0);
+
+ case NVME_CQE_SC_GEN_INV_FLD:
+ /* Invalid Field in Command */
+ dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
+ "invalid field in cmd %p", (void *)cmd);
+ return (0);
+
+ case NVME_CQE_SC_GEN_ID_CNFL:
+ /* Command ID Conflict */
+ dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
+ "cmd ID conflict in cmd %p", (void *)cmd);
+ return (0);
+
+ case NVME_CQE_SC_GEN_INV_NS:
+ /* Invalid Namespace or Format */
+ dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
+ "invalid NS/format in cmd %p", (void *)cmd);
+ return (0);
+
+ case NVME_CQE_SC_GEN_NVM_LBA_RANGE:
+ /* LBA Out Of Range */
+ dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
+ "LBA out of range in cmd %p", (void *)cmd);
+ return (0);
+
+ /*
+ * Non-fatal errors, handle gracefully.
+ */
+ case NVME_CQE_SC_GEN_DATA_XFR_ERR:
+ /* Data Transfer Error (DMA) */
+ /* TODO: post ereport */
+ atomic_inc_32(&cmd->nc_nvme->n_data_xfr_err);
+ return (EIO);
+
+ case NVME_CQE_SC_GEN_INTERNAL_ERR:
+ /*
+ * Internal Error. The spec (v1.0, section 4.5.1.2) says
+ * detailed error information is returned as async event,
+ * so we pretty much ignore the error here and handle it
+ * in the async event handler.
+ */
+ atomic_inc_32(&cmd->nc_nvme->n_internal_err);
+ return (EIO);
+
+ case NVME_CQE_SC_GEN_ABORT_REQUEST:
+ /*
+ * Command Abort Requested. This normally happens only when a
+ * command times out.
+ */
+ /* TODO: post ereport or change blkdev to handle this? */
+ atomic_inc_32(&cmd->nc_nvme->n_abort_rq_err);
+ return (ECANCELED);
+
+ case NVME_CQE_SC_GEN_ABORT_PWRLOSS:
+ /* Command Aborted due to Power Loss Notification */
+ ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST);
+ cmd->nc_nvme->n_dead = B_TRUE;
+ return (EIO);
+
+ case NVME_CQE_SC_GEN_ABORT_SQ_DEL:
+ /* Command Aborted due to SQ Deletion */
+ atomic_inc_32(&cmd->nc_nvme->n_abort_sq_del);
+ return (EIO);
+
+ case NVME_CQE_SC_GEN_NVM_CAP_EXC:
+ /* Capacity Exceeded */
+ atomic_inc_32(&cmd->nc_nvme->n_nvm_cap_exc);
+ return (EIO);
+
+ case NVME_CQE_SC_GEN_NVM_NS_NOTRDY:
+ /* Namespace Not Ready */
+ atomic_inc_32(&cmd->nc_nvme->n_nvm_ns_notrdy);
+ return (EIO);
+
+ default:
+ return (nvme_check_unknown_cmd_status(cmd));
+ }
+}
+
+static int
+nvme_check_specific_cmd_status(nvme_cmd_t *cmd)
+{
+ nvme_cqe_t *cqe = &cmd->nc_cqe;
+
+ switch (cqe->cqe_sf.sf_sc) {
+ case NVME_CQE_SC_SPC_INV_CQ:
+ /* Completion Queue Invalid */
+ ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE);
+ atomic_inc_32(&cmd->nc_nvme->n_inv_cq_err);
+ return (EINVAL);
+
+ case NVME_CQE_SC_SPC_INV_QID:
+ /* Invalid Queue Identifier */
+ ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE ||
+ cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_SQUEUE ||
+ cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE ||
+ cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE);
+ atomic_inc_32(&cmd->nc_nvme->n_inv_qid_err);
+ return (EINVAL);
+
+ case NVME_CQE_SC_SPC_MAX_QSZ_EXC:
+ /* Max Queue Size Exceeded */
+ ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE ||
+ cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE);
+ atomic_inc_32(&cmd->nc_nvme->n_max_qsz_exc);
+ return (EINVAL);
+
+ case NVME_CQE_SC_SPC_ABRT_CMD_EXC:
+ /* Abort Command Limit Exceeded */
+ ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT);
+ dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
+ "abort command limit exceeded in cmd %p", (void *)cmd);
+ return (0);
+
+ case NVME_CQE_SC_SPC_ASYNC_EVREQ_EXC:
+ /* Async Event Request Limit Exceeded */
+ ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ASYNC_EVENT);
+ dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
+ "async event request limit exceeded in cmd %p",
+ (void *)cmd);
+ return (0);
+
+ case NVME_CQE_SC_SPC_INV_INT_VECT:
+ /* Invalid Interrupt Vector */
+ ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE);
+ atomic_inc_32(&cmd->nc_nvme->n_inv_int_vect);
+ return (EINVAL);
+
+ case NVME_CQE_SC_SPC_INV_LOG_PAGE:
+ /* Invalid Log Page */
+ ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_GET_LOG_PAGE);
+ atomic_inc_32(&cmd->nc_nvme->n_inv_log_page);
+ return (EINVAL);
+
+ case NVME_CQE_SC_SPC_INV_FORMAT:
+ /* Invalid Format */
+ ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_FORMAT);
+ atomic_inc_32(&cmd->nc_nvme->n_inv_format);
+ return (EINVAL);
+
+ case NVME_CQE_SC_SPC_INV_Q_DEL:
+ /* Invalid Queue Deletion */
+ ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE);
+ atomic_inc_32(&cmd->nc_nvme->n_inv_q_del);
+ return (EINVAL);
+
+ case NVME_CQE_SC_SPC_NVM_CNFL_ATTR:
+ /* Conflicting Attributes */
+ ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_DSET_MGMT ||
+ cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ ||
+ cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
+ atomic_inc_32(&cmd->nc_nvme->n_cnfl_attr);
+ return (EINVAL);
+
+ case NVME_CQE_SC_SPC_NVM_INV_PROT:
+ /* Invalid Protection Information */
+ ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_COMPARE ||
+ cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ ||
+ cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
+ atomic_inc_32(&cmd->nc_nvme->n_inv_prot);
+ return (EINVAL);
+
+ case NVME_CQE_SC_SPC_NVM_READONLY:
+ /* Write to Read Only Range */
+ ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
+ atomic_inc_32(&cmd->nc_nvme->n_readonly);
+ return (EROFS);
+
+ default:
+ return (nvme_check_unknown_cmd_status(cmd));
+ }
+}
+
+static inline int
+nvme_check_cmd_status(nvme_cmd_t *cmd)
+{
+ nvme_cqe_t *cqe = &cmd->nc_cqe;
+
+ /* take a shortcut if everything is alright */
+ if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
+ cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS)
+ return (0);
+
+ if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC)
+ return (nvme_check_generic_cmd_status(cmd));
+ else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC)
+ return (nvme_check_specific_cmd_status(cmd));
+ else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY)
+ return (nvme_check_integrity_cmd_status(cmd));
+ else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR)
+ return (nvme_check_vendor_cmd_status(cmd));
+
+ return (nvme_check_unknown_cmd_status(cmd));
+}
+
+/*
+ * nvme_abort_cmd_cb -- replaces nc_callback of aborted commands
+ *
+ * This functions takes care of cleaning up aborted commands. The command
+ * status is checked to catch any fatal errors.
+ */
+static void
+nvme_abort_cmd_cb(void *arg)
+{
+ nvme_cmd_t *cmd = arg;
+
+ /*
+ * Grab the command mutex. Once we have it we hold the last reference
+ * to the command and can safely free it.
+ */
+ mutex_enter(&cmd->nc_mutex);
+ (void) nvme_check_cmd_status(cmd);
+ mutex_exit(&cmd->nc_mutex);
+
+ nvme_free_cmd(cmd);
+}
+
+static void
+nvme_abort_cmd(nvme_cmd_t *abort_cmd)
+{
+ nvme_t *nvme = abort_cmd->nc_nvme;
+ nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
+ nvme_abort_cmd_t ac = { 0 };
+
+ sema_p(&nvme->n_abort_sema);
+
+ ac.b.ac_cid = abort_cmd->nc_sqe.sqe_cid;
+ ac.b.ac_sqid = abort_cmd->nc_sqid;
+
+ /*
+ * Drop the mutex of the aborted command. From this point on
+ * we must assume that the abort callback has freed the command.
+ */
+ mutex_exit(&abort_cmd->nc_mutex);
+
+ cmd->nc_sqid = 0;
+ cmd->nc_sqe.sqe_opc = NVME_OPC_ABORT;
+ cmd->nc_callback = nvme_wakeup_cmd;
+ cmd->nc_sqe.sqe_cdw10 = ac.r;
+
+ /*
+ * Send the ABORT to the hardware. The ABORT command will return _after_
+ * the aborted command has completed (aborted or otherwise).
+ */
+ if (nvme_admin_cmd(cmd, NVME_ADMIN_CMD_TIMEOUT) != DDI_SUCCESS) {
+ sema_v(&nvme->n_abort_sema);
+ dev_err(nvme->n_dip, CE_WARN,
+ "!nvme_admin_cmd failed for ABORT");
+ atomic_inc_32(&nvme->n_abort_failed);
+ return;
+ }
+ sema_v(&nvme->n_abort_sema);
+
+ if (nvme_check_cmd_status(cmd)) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!ABORT failed with sct = %x, sc = %x",
+ cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
+ atomic_inc_32(&nvme->n_abort_failed);
+ } else {
+ atomic_inc_32(&nvme->n_cmd_aborted);
+ }
+
+ nvme_free_cmd(cmd);
+}
+
+/*
+ * nvme_wait_cmd -- wait for command completion or timeout
+ *
+ * Returns B_TRUE if the command completed normally.
+ *
+ * Returns B_FALSE if the command timed out and an abort was attempted. The
+ * command mutex will be dropped and the command must be considered freed. The
+ * freeing of the command is normally done by the abort command callback.
+ *
+ * In case of a serious error or a timeout of the abort command the hardware
+ * will be declared dead and FMA will be notified.
+ */
+static boolean_t
+nvme_wait_cmd(nvme_cmd_t *cmd, uint_t usec)
+{
+ clock_t timeout = ddi_get_lbolt() + drv_usectohz(usec);
+ nvme_t *nvme = cmd->nc_nvme;
+ nvme_reg_csts_t csts;
+
+ ASSERT(mutex_owned(&cmd->nc_mutex));
+
+ while (!cmd->nc_completed) {
+ if (cv_timedwait(&cmd->nc_cv, &cmd->nc_mutex, timeout) == -1)
+ break;
+ }
+
+ if (cmd->nc_completed)
+ return (B_TRUE);
+
+ /*
+ * The command timed out. Change the callback to the cleanup function.
+ */
+ cmd->nc_callback = nvme_abort_cmd_cb;
+
+ /*
+ * Check controller for fatal status, any errors associated with the
+ * register or DMA handle, or for a double timeout (abort command timed
+ * out). If necessary log a warning and call FMA.
+ */
+ csts.r = nvme_get32(nvme, NVME_REG_CSTS);
+ dev_err(nvme->n_dip, CE_WARN, "!command timeout, "
+ "OPC = %x, CFS = %d", cmd->nc_sqe.sqe_opc, csts.b.csts_cfs);
+ atomic_inc_32(&nvme->n_cmd_timeout);
+
+ if (csts.b.csts_cfs ||
+ nvme_check_regs_hdl(nvme) ||
+ nvme_check_dma_hdl(cmd->nc_dma) ||
+ cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT) {
+ ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
+ nvme->n_dead = B_TRUE;
+ mutex_exit(&cmd->nc_mutex);
+ } else {
+ /*
+ * Try to abort the command. The command mutex is released by
+ * nvme_abort_cmd().
+ * If the abort succeeds it will have freed the aborted command.
+ * If the abort fails for other reasons we must assume that the
+ * command may complete at any time, and the callback will free
+ * it for us.
+ */
+ nvme_abort_cmd(cmd);
+ }
+
+ return (B_FALSE);
+}
+
+static void
+nvme_wakeup_cmd(void *arg)
+{
+ nvme_cmd_t *cmd = arg;
+
+ mutex_enter(&cmd->nc_mutex);
+ /*
+ * There is a slight chance that this command completed shortly after
+ * the timeout was hit in nvme_wait_cmd() but before the callback was
+ * changed. Catch that case here and clean up accordingly.
+ */
+ if (cmd->nc_callback == nvme_abort_cmd_cb) {
+ mutex_exit(&cmd->nc_mutex);
+ nvme_abort_cmd_cb(cmd);
+ return;
+ }
+
+ cmd->nc_completed = B_TRUE;
+ cv_signal(&cmd->nc_cv);
+ mutex_exit(&cmd->nc_mutex);
+}
+
+static void
+nvme_async_event_task(void *arg)
+{
+ nvme_cmd_t *cmd = arg;
+ nvme_t *nvme = cmd->nc_nvme;
+ nvme_error_log_entry_t *error_log = NULL;
+ nvme_health_log_t *health_log = NULL;
+ nvme_async_event_t event;
+ int ret;
+
+ /*
+ * Check for errors associated with the async request itself. The only
+ * command-specific error is "async event limit exceeded", which
+ * indicates a programming error in the driver and causes a panic in
+ * nvme_check_cmd_status().
+ *
+ * Other possible errors are various scenarios where the async request
+ * was aborted, or internal errors in the device. Internal errors are
+	 * reported to FMA; the command aborts need no special handling here.
+ */
+ if (nvme_check_cmd_status(cmd)) {
+ dev_err(cmd->nc_nvme->n_dip, CE_WARN,
+ "!async event request returned failure, sct = %x, "
+ "sc = %x, dnr = %d, m = %d", cmd->nc_cqe.cqe_sf.sf_sct,
+ cmd->nc_cqe.cqe_sf.sf_sc, cmd->nc_cqe.cqe_sf.sf_dnr,
+ cmd->nc_cqe.cqe_sf.sf_m);
+
+ if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
+ cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INTERNAL_ERR) {
+ cmd->nc_nvme->n_dead = B_TRUE;
+ ddi_fm_service_impact(cmd->nc_nvme->n_dip,
+ DDI_SERVICE_LOST);
+ }
+ nvme_free_cmd(cmd);
+ return;
+ }
+
+
+ event.r = cmd->nc_cqe.cqe_dw0;
+
+ /* Clear CQE and re-submit the async request. */
+ bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t));
+ ret = nvme_submit_cmd(nvme->n_adminq, cmd);
+
+ if (ret != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!failed to resubmit async event request");
+ atomic_inc_32(&nvme->n_async_resubmit_failed);
+ nvme_free_cmd(cmd);
+ }
+
+ switch (event.b.ae_type) {
+ case NVME_ASYNC_TYPE_ERROR:
+ if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) {
+ error_log = (nvme_error_log_entry_t *)
+ nvme_get_logpage(nvme, event.b.ae_logpage);
+ } else {
+ dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in "
+ "async event reply: %d", event.b.ae_logpage);
+ atomic_inc_32(&nvme->n_wrong_logpage);
+ }
+
+ switch (event.b.ae_info) {
+ case NVME_ASYNC_ERROR_INV_SQ:
+ dev_err(nvme->n_dip, CE_PANIC, "programming error: "
+ "invalid submission queue");
+ return;
+
+ case NVME_ASYNC_ERROR_INV_DBL:
+ dev_err(nvme->n_dip, CE_PANIC, "programming error: "
+ "invalid doorbell write value");
+ return;
+
+ case NVME_ASYNC_ERROR_DIAGFAIL:
+ dev_err(nvme->n_dip, CE_WARN, "!diagnostic failure");
+ ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
+ nvme->n_dead = B_TRUE;
+ atomic_inc_32(&nvme->n_diagfail_event);
+ break;
+
+ case NVME_ASYNC_ERROR_PERSISTENT:
+ dev_err(nvme->n_dip, CE_WARN, "!persistent internal "
+ "device error");
+ ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
+ nvme->n_dead = B_TRUE;
+ atomic_inc_32(&nvme->n_persistent_event);
+ break;
+
+ case NVME_ASYNC_ERROR_TRANSIENT:
+ dev_err(nvme->n_dip, CE_WARN, "!transient internal "
+ "device error");
+ /* TODO: send ereport */
+ atomic_inc_32(&nvme->n_transient_event);
+ break;
+
+ case NVME_ASYNC_ERROR_FW_LOAD:
+ dev_err(nvme->n_dip, CE_WARN,
+ "!firmware image load error");
+ atomic_inc_32(&nvme->n_fw_load_event);
+ break;
+ }
+ break;
+
+ case NVME_ASYNC_TYPE_HEALTH:
+ if (event.b.ae_logpage == NVME_LOGPAGE_HEALTH) {
+ health_log = (nvme_health_log_t *)
+ nvme_get_logpage(nvme, event.b.ae_logpage, -1);
+ } else {
+ dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in "
+ "async event reply: %d", event.b.ae_logpage);
+ atomic_inc_32(&nvme->n_wrong_logpage);
+ }
+
+ switch (event.b.ae_info) {
+ case NVME_ASYNC_HEALTH_RELIABILITY:
+ dev_err(nvme->n_dip, CE_WARN,
+ "!device reliability compromised");
+ /* TODO: send ereport */
+ atomic_inc_32(&nvme->n_reliability_event);
+ break;
+
+ case NVME_ASYNC_HEALTH_TEMPERATURE:
+ dev_err(nvme->n_dip, CE_WARN,
+ "!temperature above threshold");
+ /* TODO: send ereport */
+ atomic_inc_32(&nvme->n_temperature_event);
+ break;
+
+ case NVME_ASYNC_HEALTH_SPARE:
+ dev_err(nvme->n_dip, CE_WARN,
+ "!spare space below threshold");
+ /* TODO: send ereport */
+ atomic_inc_32(&nvme->n_spare_event);
+ break;
+ }
+ break;
+
+ case NVME_ASYNC_TYPE_VENDOR:
+ dev_err(nvme->n_dip, CE_WARN, "!vendor specific async event "
+ "received, info = %x, logpage = %x", event.b.ae_info,
+ event.b.ae_logpage);
+ atomic_inc_32(&nvme->n_vendor_event);
+ break;
+
+ default:
+ dev_err(nvme->n_dip, CE_WARN, "!unknown async event received, "
+ "type = %x, info = %x, logpage = %x", event.b.ae_type,
+ event.b.ae_info, event.b.ae_logpage);
+ atomic_inc_32(&nvme->n_unknown_event);
+ break;
+ }
+
+ if (error_log)
+ kmem_free(error_log, sizeof (nvme_error_log_entry_t) *
+ nvme->n_error_log_len);
+
+ if (health_log)
+ kmem_free(health_log, sizeof (nvme_health_log_t));
+}
+
+static int
+nvme_admin_cmd(nvme_cmd_t *cmd, int usec)
+{
+ int ret;
+
+ mutex_enter(&cmd->nc_mutex);
+ ret = nvme_submit_cmd(cmd->nc_nvme->n_adminq, cmd);
+
+ if (ret != DDI_SUCCESS) {
+ mutex_exit(&cmd->nc_mutex);
+ dev_err(cmd->nc_nvme->n_dip, CE_WARN,
+ "!nvme_submit_cmd failed");
+ atomic_inc_32(&cmd->nc_nvme->n_admin_queue_full);
+ nvme_free_cmd(cmd);
+ return (DDI_FAILURE);
+ }
+
+ if (nvme_wait_cmd(cmd, usec) == B_FALSE) {
+ /*
+ * The command timed out. An abort command was posted that
+ * will take care of the cleanup.
+ */
+ return (DDI_FAILURE);
+ }
+ mutex_exit(&cmd->nc_mutex);
+
+ return (DDI_SUCCESS);
+}
+
+static int
+nvme_async_event(nvme_t *nvme)
+{
+ nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
+ int ret;
+
+ cmd->nc_sqid = 0;
+ cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT;
+ cmd->nc_callback = nvme_async_event_task;
+
+ ret = nvme_submit_cmd(nvme->n_adminq, cmd);
+
+ if (ret != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!nvme_submit_cmd failed for ASYNCHRONOUS EVENT");
+ nvme_free_cmd(cmd);
+ return (DDI_FAILURE);
+ }
+
+ return (DDI_SUCCESS);
+}
+
+static void *
+nvme_get_logpage(nvme_t *nvme, uint8_t logpage, ...)
+{
+ nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
+ void *buf = NULL;
+ nvme_getlogpage_t getlogpage;
+ size_t bufsize;
+ va_list ap;
+
+ va_start(ap, logpage);
+
+ cmd->nc_sqid = 0;
+ cmd->nc_callback = nvme_wakeup_cmd;
+ cmd->nc_sqe.sqe_opc = NVME_OPC_GET_LOG_PAGE;
+
+ getlogpage.b.lp_lid = logpage;
+
+ switch (logpage) {
+ case NVME_LOGPAGE_ERROR:
+ cmd->nc_sqe.sqe_nsid = (uint32_t)-1;
+ bufsize = nvme->n_error_log_len *
+ sizeof (nvme_error_log_entry_t);
+ break;
+
+ case NVME_LOGPAGE_HEALTH:
+ cmd->nc_sqe.sqe_nsid = va_arg(ap, uint32_t);
+ bufsize = sizeof (nvme_health_log_t);
+ break;
+
+ case NVME_LOGPAGE_FWSLOT:
+ cmd->nc_sqe.sqe_nsid = (uint32_t)-1;
+ bufsize = sizeof (nvme_fwslot_log_t);
+ break;
+
+ default:
+ dev_err(nvme->n_dip, CE_WARN, "!unknown log page requested: %d",
+ logpage);
+ atomic_inc_32(&nvme->n_unknown_logpage);
+ goto fail;
+ }
+
+ va_end(ap);
+
+ getlogpage.b.lp_numd = bufsize / sizeof (uint32_t);
+
+ cmd->nc_sqe.sqe_cdw10 = getlogpage.r;
+
+ if (nvme_zalloc_dma(nvme, getlogpage.b.lp_numd * sizeof (uint32_t),
+ DDI_DMA_READ, &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!nvme_zalloc_dma failed for GET LOG PAGE");
+ goto fail;
+ }
+
+ if (cmd->nc_dma->nd_ncookie > 2) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!too many DMA cookies for GET LOG PAGE");
+ atomic_inc_32(&nvme->n_too_many_cookies);
+ goto fail;
+ }
+
+ cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress;
+ if (cmd->nc_dma->nd_ncookie > 1) {
+ ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
+ &cmd->nc_dma->nd_cookie);
+ cmd->nc_sqe.sqe_dptr.d_prp[1] =
+ cmd->nc_dma->nd_cookie.dmac_laddress;
+ }
+
+ if (nvme_admin_cmd(cmd, NVME_ADMIN_CMD_TIMEOUT) != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!nvme_admin_cmd failed for GET LOG PAGE");
+ return (NULL);
+ }
+
+ if (nvme_check_cmd_status(cmd)) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!GET LOG PAGE failed with sct = %x, sc = %x",
+ cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
+ goto fail;
+ }
+
+ buf = kmem_alloc(bufsize, KM_SLEEP);
+ bcopy(cmd->nc_dma->nd_memp, buf, bufsize);
+
+fail:
+ nvme_free_cmd(cmd);
+
+ return (buf);
+}
+
+static void *
+nvme_identify(nvme_t *nvme, uint32_t nsid)
+{
+ nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
+ void *buf = NULL;
+
+ cmd->nc_sqid = 0;
+ cmd->nc_callback = nvme_wakeup_cmd;
+ cmd->nc_sqe.sqe_opc = NVME_OPC_IDENTIFY;
+ cmd->nc_sqe.sqe_nsid = nsid;
+ cmd->nc_sqe.sqe_cdw10 = nsid ? NVME_IDENTIFY_NSID : NVME_IDENTIFY_CTRL;
+
+ if (nvme_zalloc_dma(nvme, NVME_IDENTIFY_BUFSIZE, DDI_DMA_READ,
+ &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!nvme_zalloc_dma failed for IDENTIFY");
+ goto fail;
+ }
+
+ if (cmd->nc_dma->nd_ncookie > 2) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!too many DMA cookies for IDENTIFY");
+ atomic_inc_32(&nvme->n_too_many_cookies);
+ goto fail;
+ }
+
+ cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress;
+ if (cmd->nc_dma->nd_ncookie > 1) {
+ ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
+ &cmd->nc_dma->nd_cookie);
+ cmd->nc_sqe.sqe_dptr.d_prp[1] =
+ cmd->nc_dma->nd_cookie.dmac_laddress;
+ }
+
+ if (nvme_admin_cmd(cmd, NVME_ADMIN_CMD_TIMEOUT) != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!nvme_admin_cmd failed for IDENTIFY");
+ return (NULL);
+ }
+
+ if (nvme_check_cmd_status(cmd)) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!IDENTIFY failed with sct = %x, sc = %x",
+ cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
+ goto fail;
+ }
+
+ buf = kmem_alloc(NVME_IDENTIFY_BUFSIZE, KM_SLEEP);
+ bcopy(cmd->nc_dma->nd_memp, buf, NVME_IDENTIFY_BUFSIZE);
+
+fail:
+ nvme_free_cmd(cmd);
+
+ return (buf);
+}
+
+static int
+nvme_set_nqueues(nvme_t *nvme, uint16_t nqueues)
+{
+ nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
+ nvme_nqueue_t nq = { 0 };
+
+ nq.b.nq_nsq = nq.b.nq_ncq = nqueues;
+
+ cmd->nc_sqid = 0;
+ cmd->nc_callback = nvme_wakeup_cmd;
+ cmd->nc_sqe.sqe_opc = NVME_OPC_SET_FEATURES;
+ cmd->nc_sqe.sqe_cdw10 = NVME_FEAT_NQUEUES;
+ cmd->nc_sqe.sqe_cdw11 = nq.r;
+
+ if (nvme_admin_cmd(cmd, NVME_ADMIN_CMD_TIMEOUT) != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!nvme_admin_cmd failed for SET FEATURES (NQUEUES)");
+ return (0);
+ }
+
+ if (nvme_check_cmd_status(cmd)) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!SET FEATURES (NQUEUES) failed with sct = %x, sc = %x",
+ cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
+ nvme_free_cmd(cmd);
+ return (0);
+ }
+
+ nq.r = cmd->nc_cqe.cqe_dw0;
+ nvme_free_cmd(cmd);
+
+ /*
+ * Always use the same number of submission and completion queues, and
+ * never use more than the requested number of queues.
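+	 *
+	 * E.g. if eight queues were requested and the reply indicates four
+	 * submission and six completion queues, four queue pairs will be used.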
+ */
+ return (MIN(nqueues, MIN(nq.b.nq_nsq, nq.b.nq_ncq)));
+}
+
+static int
+nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx)
+{
+ nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
+ nvme_create_queue_dw10_t dw10 = { 0 };
+ nvme_create_cq_dw11_t c_dw11 = { 0 };
+ nvme_create_sq_dw11_t s_dw11 = { 0 };
+
+ dw10.b.q_qid = idx;
+ dw10.b.q_qsize = qp->nq_nentry - 1;
+
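+	/*
+	 * Request a physically contiguous completion queue with interrupts
+	 * enabled. Completion queues are mapped onto the available interrupt
+	 * vectors round-robin, matching the lookup done in nvme_intr().
+	 */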
+ c_dw11.b.cq_pc = 1;
+ c_dw11.b.cq_ien = 1;
+ c_dw11.b.cq_iv = idx % nvme->n_intr_cnt;
+
+ cmd->nc_sqid = 0;
+ cmd->nc_callback = nvme_wakeup_cmd;
+ cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_CQUEUE;
+ cmd->nc_sqe.sqe_cdw10 = dw10.r;
+ cmd->nc_sqe.sqe_cdw11 = c_dw11.r;
+ cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_cqdma->nd_cookie.dmac_laddress;
+
+ if (nvme_admin_cmd(cmd, NVME_ADMIN_CMD_TIMEOUT) != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!nvme_admin_cmd failed for CREATE CQUEUE");
+ return (DDI_FAILURE);
+ }
+
+ if (nvme_check_cmd_status(cmd)) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!CREATE CQUEUE failed with sct = %x, sc = %x",
+ cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
+ nvme_free_cmd(cmd);
+ return (DDI_FAILURE);
+ }
+
+ nvme_free_cmd(cmd);
+
+ s_dw11.b.sq_pc = 1;
+ s_dw11.b.sq_cqid = idx;
+
+ cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
+ cmd->nc_sqid = 0;
+ cmd->nc_callback = nvme_wakeup_cmd;
+ cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_SQUEUE;
+ cmd->nc_sqe.sqe_cdw10 = dw10.r;
+ cmd->nc_sqe.sqe_cdw11 = s_dw11.r;
+ cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_sqdma->nd_cookie.dmac_laddress;
+
+ if (nvme_admin_cmd(cmd, NVME_ADMIN_CMD_TIMEOUT) != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!nvme_admin_cmd failed for CREATE SQUEUE");
+ return (DDI_FAILURE);
+ }
+
+ if (nvme_check_cmd_status(cmd)) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!CREATE SQUEUE failed with sct = %x, sc = %x",
+ cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
+ nvme_free_cmd(cmd);
+ return (DDI_FAILURE);
+ }
+
+ nvme_free_cmd(cmd);
+
+ return (DDI_SUCCESS);
+}
+
+static boolean_t
+nvme_reset(nvme_t *nvme, boolean_t quiesce)
+{
+ nvme_reg_csts_t csts;
+ int i;
+
+ nvme_put32(nvme, NVME_REG_CC, 0);
+
+ csts.r = nvme_get32(nvme, NVME_REG_CSTS);
+ if (csts.b.csts_rdy == 1) {
+ nvme_put32(nvme, NVME_REG_CC, 0);
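+
+		/*
+		 * n_timeout is CAP.TO, which the spec gives in 500ms units;
+		 * polling every 50ms for n_timeout * 10 iterations covers the
+		 * full timeout advertised by the controller.
+		 */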
+ for (i = 0; i != nvme->n_timeout * 10; i++) {
+ csts.r = nvme_get32(nvme, NVME_REG_CSTS);
+ if (csts.b.csts_rdy == 0)
+ break;
+
+ if (quiesce)
+ drv_usecwait(50000);
+ else
+ delay(drv_usectohz(50000));
+ }
+ }
+
+ nvme_put32(nvme, NVME_REG_AQA, 0);
+ nvme_put32(nvme, NVME_REG_ASQ, 0);
+ nvme_put32(nvme, NVME_REG_ACQ, 0);
+
+ csts.r = nvme_get32(nvme, NVME_REG_CSTS);
+ return (csts.b.csts_rdy == 0 ? B_TRUE : B_FALSE);
+}
+
+static void
+nvme_shutdown(nvme_t *nvme, int mode, boolean_t quiesce)
+{
+ nvme_reg_cc_t cc;
+ nvme_reg_csts_t csts;
+ int i;
+
+ ASSERT(mode == NVME_CC_SHN_NORMAL || mode == NVME_CC_SHN_ABRUPT);
+
+ cc.r = nvme_get32(nvme, NVME_REG_CC);
+ cc.b.cc_shn = mode & 0x3;
+ nvme_put32(nvme, NVME_REG_CC, cc.r);
+
+ for (i = 0; i != 10; i++) {
+ csts.r = nvme_get32(nvme, NVME_REG_CSTS);
+ if (csts.b.csts_shst == NVME_CSTS_SHN_COMPLETE)
+ break;
+
+ if (quiesce)
+ drv_usecwait(100000);
+ else
+ delay(drv_usectohz(100000));
+ }
+}
+
+
+static void
+nvme_prepare_devid(nvme_t *nvme, uint32_t nsid)
+{
+ char model[sizeof (nvme->n_idctl->id_model) + 1];
+ char serial[sizeof (nvme->n_idctl->id_serial) + 1];
+
+ bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model));
+ bcopy(nvme->n_idctl->id_serial, serial,
+ sizeof (nvme->n_idctl->id_serial));
+
+ model[sizeof (nvme->n_idctl->id_model)] = '\0';
+ serial[sizeof (nvme->n_idctl->id_serial)] = '\0';
+
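+	/*
+	 * The devid encodes the PCI vendor ID, model, serial number, and
+	 * namespace ID, yielding a string of the form
+	 * "<vid>-<model>-<serial>-<nsid>".
+	 */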
+ (void) snprintf(nvme->n_ns[nsid - 1].ns_devid,
+ sizeof (nvme->n_ns[0].ns_devid), "%4X-%s-%s-%X",
+ nvme->n_idctl->id_vid, model, serial, nsid);
+}
+
+static int
+nvme_init(nvme_t *nvme)
+{
+ nvme_reg_cc_t cc = { 0 };
+ nvme_reg_aqa_t aqa = { 0 };
+ nvme_reg_asq_t asq = { 0 };
+ nvme_reg_acq_t acq = { 0 };
+ nvme_reg_cap_t cap;
+ nvme_reg_vs_t vs;
+ nvme_reg_csts_t csts;
+ int i = 0;
+ int nqueues;
+
+ /* Setup fixed interrupt for admin queue. */
+ if (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_FIXED, 1)
+ != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!failed to setup fixed interrupt");
+ goto fail;
+ }
+
+ /* Check controller version */
+ vs.r = nvme_get32(nvme, NVME_REG_VS);
+ dev_err(nvme->n_dip, CE_CONT, "?NVMe spec version %d.%d",
+ vs.b.vs_mjr, vs.b.vs_mnr);
+
+ if (nvme_version_major < vs.b.vs_mjr &&
+ nvme_version_minor < vs.b.vs_mnr) {
+ dev_err(nvme->n_dip, CE_WARN, "!no support for version > %d.%d",
+ nvme_version_major, nvme_version_minor);
+ if (nvme->n_strict_version)
+ goto fail;
+ }
+
+ /* retrieve controller configuration */
+ cap.r = nvme_get64(nvme, NVME_REG_CAP);
+
+ if ((cap.b.cap_css & NVME_CAP_CSS_NVM) == 0) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!NVM command set not supported by hardware");
+ goto fail;
+ }
+
+ nvme->n_nssr_supported = cap.b.cap_nssrs;
+ nvme->n_doorbell_stride = 4 << cap.b.cap_dstrd;
+ nvme->n_timeout = cap.b.cap_to;
+ nvme->n_arbitration_mechanisms = cap.b.cap_ams;
+ nvme->n_cont_queues_reqd = cap.b.cap_cqr;
+ nvme->n_max_queue_entries = cap.b.cap_mqes + 1;
+
+ /*
+ * The MPSMIN and MPSMAX fields in the CAP register use 0 to specify
+ * the base page size of 4k (1<<12), so add 12 here to get the real
+ * page size value.
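+	 *
+	 * E.g. a controller reporting MPSMIN == MPSMAX == 0 supports only the
+	 * 4k page size, matching the typical x86 PAGESHIFT of 12.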
+ */
+ nvme->n_pageshift = MIN(MAX(cap.b.cap_mpsmin + 12, PAGESHIFT),
+ cap.b.cap_mpsmax + 12);
+ nvme->n_pagesize = 1UL << (nvme->n_pageshift);
+
+ /*
+ * Set up Queue DMA to transfer at least 1 page-aligned page at a time.
+ */
+ nvme->n_queue_dma_attr.dma_attr_align = nvme->n_pagesize;
+ nvme->n_queue_dma_attr.dma_attr_minxfer = nvme->n_pagesize;
+
+ /*
+ * Set up PRP DMA to transfer 1 page-aligned page at a time.
+	 * Maxxfer may be increased once the controller limits are known.
+ */
+ nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_pagesize;
+ nvme->n_prp_dma_attr.dma_attr_minxfer = nvme->n_pagesize;
+ nvme->n_prp_dma_attr.dma_attr_align = nvme->n_pagesize;
+
+ /*
+ * Reset controller if it's still in ready state.
+ */
+ if (nvme_reset(nvme, B_FALSE) == B_FALSE) {
+ dev_err(nvme->n_dip, CE_WARN, "!unable to reset controller");
+ ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
+ nvme->n_dead = B_TRUE;
+ goto fail;
+ }
+
+ /*
+ * Create the admin queue pair.
+ */
+ if (nvme_alloc_qpair(nvme, nvme->n_admin_queue_len, &nvme->n_adminq, 0)
+ != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!unable to allocate admin qpair");
+ goto fail;
+ }
+ nvme->n_ioq = kmem_alloc(sizeof (nvme_qpair_t *), KM_SLEEP);
+ nvme->n_ioq[0] = nvme->n_adminq;
+
+ nvme->n_progress |= NVME_ADMIN_QUEUE;
+
+ (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
+ "admin-queue-len", nvme->n_admin_queue_len);
+
+ aqa.b.aqa_asqs = aqa.b.aqa_acqs = nvme->n_admin_queue_len - 1;
+ asq = nvme->n_adminq->nq_sqdma->nd_cookie.dmac_laddress;
+ acq = nvme->n_adminq->nq_cqdma->nd_cookie.dmac_laddress;
+
+ ASSERT((asq & (nvme->n_pagesize - 1)) == 0);
+ ASSERT((acq & (nvme->n_pagesize - 1)) == 0);
+
+ nvme_put32(nvme, NVME_REG_AQA, aqa.r);
+ nvme_put64(nvme, NVME_REG_ASQ, asq);
+ nvme_put64(nvme, NVME_REG_ACQ, acq);
+
+ cc.b.cc_ams = 0; /* use Round-Robin arbitration */
+ cc.b.cc_css = 0; /* use NVM command set */
+ cc.b.cc_mps = nvme->n_pageshift - 12;
+ cc.b.cc_shn = 0; /* no shutdown in progress */
+ cc.b.cc_en = 1; /* enable controller */
+
+ nvme_put32(nvme, NVME_REG_CC, cc.r);
+
+ /*
+ * Wait for the controller to become ready.
+ */
+ csts.r = nvme_get32(nvme, NVME_REG_CSTS);
+ if (csts.b.csts_rdy == 0) {
+ for (i = 0; i != nvme->n_timeout * 10; i++) {
+ delay(drv_usectohz(50000));
+ csts.r = nvme_get32(nvme, NVME_REG_CSTS);
+
+ if (csts.b.csts_cfs == 1) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!controller fatal status at init");
+ ddi_fm_service_impact(nvme->n_dip,
+ DDI_SERVICE_LOST);
+ nvme->n_dead = B_TRUE;
+ goto fail;
+ }
+
+ if (csts.b.csts_rdy == 1)
+ break;
+ }
+ }
+
+ if (csts.b.csts_rdy == 0) {
+ dev_err(nvme->n_dip, CE_WARN, "!controller not ready");
+ ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
+ nvme->n_dead = B_TRUE;
+ goto fail;
+ }
+
+ /*
+ * Assume an abort command limit of 1. We'll destroy and re-init
+ * that later when we know the true abort command limit.
+ */
+ sema_init(&nvme->n_abort_sema, 1, NULL, SEMA_DRIVER, NULL);
+
+ /*
+ * Post an asynchronous event command to catch errors.
+ */
+ if (nvme_async_event(nvme) != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!failed to post async event");
+ goto fail;
+ }
+
+ /*
+ * Identify Controller
+ */
+ nvme->n_idctl = nvme_identify(nvme, 0);
+ if (nvme->n_idctl == NULL) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!failed to identify controller");
+ goto fail;
+ }
+
+ /*
+ * Get controller limits.
+ */
+ nvme->n_async_event_limit = MAX(NVME_MIN_ASYNC_EVENT_LIMIT,
+ MIN(nvme->n_admin_queue_len / 10,
+ MIN(nvme->n_idctl->id_aerl + 1, nvme->n_async_event_limit)));
+
+ (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
+ "async-event-limit", nvme->n_async_event_limit);
+
+ nvme->n_abort_command_limit = nvme->n_idctl->id_acl + 1;
+
+ /* disable NVMe interrupts while reinitializing the semaphore */
+ nvme_disable_interrupts(nvme);
+ sema_destroy(&nvme->n_abort_sema);
+ sema_init(&nvme->n_abort_sema, nvme->n_abort_command_limit - 1, NULL,
+ SEMA_DRIVER, NULL);
+ if (nvme_enable_interrupts(nvme) != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!failed to re-enable interrupts");
+ goto fail;
+ }
+
+ nvme->n_progress |= NVME_CTRL_LIMITS;
+
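+	/*
+	 * id_mdts gives the maximum transfer size as a power-of-two multiple
+	 * of the page size, e.g. id_mdts == 5 with 4k pages allows
+	 * 4k << 5 = 128k per transfer; 0 means the controller reports no limit.
+	 */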
+ if (nvme->n_idctl->id_mdts == 0)
+ nvme->n_max_data_transfer_size = nvme->n_pagesize * 65536;
+ else
+ nvme->n_max_data_transfer_size =
+ 1ull << (nvme->n_pageshift + nvme->n_idctl->id_mdts);
+
+ nvme->n_error_log_len = nvme->n_idctl->id_elpe + 1;
+
+ /*
+ * Limit n_max_data_transfer_size to what we can handle in one PRP.
+ * Chained PRPs are currently unsupported.
+ *
+ * This is a no-op on hardware which doesn't support a transfer size
+ * big enough to require chained PRPs.
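+	 *
+	 * With 4k pages this works out to (4096 / 8) * 4096 = 2MB.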
+ */
+ nvme->n_max_data_transfer_size = MIN(nvme->n_max_data_transfer_size,
+ (nvme->n_pagesize / sizeof (uint64_t) * nvme->n_pagesize));
+
+ nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_max_data_transfer_size;
+
+ /*
+	 * Make sure the queue entry sizes we use fall within the minimum and
+	 * maximum entry sizes reported by the controller.
+ */
+
+ if (((1 << nvme->n_idctl->id_sqes.qes_min) > sizeof (nvme_sqe_t)) ||
+ ((1 << nvme->n_idctl->id_sqes.qes_max) < sizeof (nvme_sqe_t)) ||
+ ((1 << nvme->n_idctl->id_cqes.qes_min) > sizeof (nvme_cqe_t)) ||
+ ((1 << nvme->n_idctl->id_cqes.qes_max) < sizeof (nvme_cqe_t)))
+ goto fail;
+
+ /*
+ * Check for the presence of a Volatile Write Cache. If present,
+ * enable it by default.
+ */
+ if (nvme->n_idctl->id_vwc.vwc_present == 0) {
+ nvme->n_volatile_write_cache_enabled = B_FALSE;
+ nvme_bd_ops.o_sync_cache = NULL;
+ } else {
+ /*
+ * TODO: send SET FEATURES to enable VWC
+ * (have no hardware to test this)
+ */
+ nvme->n_volatile_write_cache_enabled = B_FALSE;
+ nvme_bd_ops.o_sync_cache = NULL;
+ }
+
+ /*
+ * Grab a copy of all mandatory log pages.
+ *
+ * TODO: should go away once user space tool exists to print logs
+ */
+ nvme->n_error_log = (nvme_error_log_entry_t *)
+ nvme_get_logpage(nvme, NVME_LOGPAGE_ERROR);
+ nvme->n_health_log = (nvme_health_log_t *)
+ nvme_get_logpage(nvme, NVME_LOGPAGE_HEALTH, -1);
+ nvme->n_fwslot_log = (nvme_fwslot_log_t *)
+ nvme_get_logpage(nvme, NVME_LOGPAGE_FWSLOT);
+
+ /*
+ * Identify Namespaces
+ */
+ nvme->n_namespace_count = nvme->n_idctl->id_nn;
+ nvme->n_ns = kmem_zalloc(sizeof (nvme_namespace_t) *
+ nvme->n_namespace_count, KM_SLEEP);
+
+ for (i = 0; i != nvme->n_namespace_count; i++) {
+ nvme_identify_nsid_t *idns;
+ int last_rp;
+
+ nvme->n_ns[i].ns_nvme = nvme;
+ nvme->n_ns[i].ns_idns = idns = nvme_identify(nvme, i + 1);
+
+ if (idns == NULL) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!failed to identify namespace %d", i + 1);
+ goto fail;
+ }
+
+ nvme->n_ns[i].ns_id = i + 1;
+ nvme->n_ns[i].ns_block_count = idns->id_nsize;
+ nvme->n_ns[i].ns_block_size =
+ 1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads;
+ nvme->n_ns[i].ns_best_block_size = nvme->n_ns[i].ns_block_size;
+
+ nvme_prepare_devid(nvme, nvme->n_ns[i].ns_id);
+
+ /*
+ * Find the LBA format with no metadata and the best relative
+ * performance. A value of 3 means "degraded", 0 is best.
+ */
+ last_rp = 3;
+ for (int j = 0; j != idns->id_nlbaf; j++) {
+ if (idns->id_lbaf[j].lbaf_lbads == 0)
+ break;
+ if (idns->id_lbaf[j].lbaf_ms != 0)
+ continue;
+ if (idns->id_lbaf[j].lbaf_rp >= last_rp)
+ continue;
+ last_rp = idns->id_lbaf[j].lbaf_rp;
+ nvme->n_ns[i].ns_best_block_size =
+ 1 << idns->id_lbaf[j].lbaf_lbads;
+ }
+
+ /*
+ * We currently don't support namespaces that use either:
+ * - thin provisioning
+ * - extended LBAs
+ * - protection information
+ */
+ if (idns->id_nsfeat.f_thin ||
+ idns->id_flbas.lba_extlba ||
+ idns->id_dps.dp_pinfo) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!ignoring namespace %d, unsupported features: "
+ "thin = %d, extlba = %d, pinfo = %d", i + 1,
+ idns->id_nsfeat.f_thin, idns->id_flbas.lba_extlba,
+ idns->id_dps.dp_pinfo);
+ nvme->n_ns[i].ns_ignore = B_TRUE;
+ }
+ }
+
+ /*
+ * Try to set up MSI/MSI-X interrupts.
+ */
+ if ((nvme->n_intr_types & (DDI_INTR_TYPE_MSI | DDI_INTR_TYPE_MSIX))
+ != 0) {
+ nvme_release_interrupts(nvme);
+
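+		/*
+		 * Aim for one I/O queue pair (and interrupt vector) per CPU,
+		 * bounded by the 16-bit queue identifier space.
+		 */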
+ nqueues = MIN(UINT16_MAX, ncpus);
+
+ if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX,
+ nqueues) != DDI_SUCCESS) &&
+ (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI,
+ nqueues) != DDI_SUCCESS)) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!failed to setup MSI/MSI-X interrupts");
+ goto fail;
+ }
+ }
+
+ nqueues = nvme->n_intr_cnt;
+
+ /*
+ * Create I/O queue pairs.
+ */
+ nvme->n_ioq_count = nvme_set_nqueues(nvme, nqueues);
+ if (nvme->n_ioq_count == 0) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!failed to set number of I/O queues to %d", nqueues);
+ goto fail;
+ }
+
+ /*
+ * Reallocate I/O queue array
+ */
+ kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *));
+ nvme->n_ioq = kmem_zalloc(sizeof (nvme_qpair_t *) *
+ (nvme->n_ioq_count + 1), KM_SLEEP);
+ nvme->n_ioq[0] = nvme->n_adminq;
+
+ /*
+	 * If we got fewer queues than we asked for we might as well give
+ * some of the interrupt vectors back to the system.
+ */
+ if (nvme->n_ioq_count < nqueues) {
+ nvme_release_interrupts(nvme);
+
+ if (nvme_setup_interrupts(nvme, nvme->n_intr_type, nqueues)
+ != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!failed to reduce number of interrupts");
+ goto fail;
+ }
+ }
+
+ /*
+ * Alloc & register I/O queue pairs
+ */
+ nvme->n_io_queue_len =
+ MIN(nvme->n_io_queue_len, nvme->n_max_queue_entries);
+ (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-queue-len",
+ nvme->n_io_queue_len);
+
+ for (i = 1; i != nvme->n_ioq_count + 1; i++) {
+ if (nvme_alloc_qpair(nvme, nvme->n_io_queue_len,
+ &nvme->n_ioq[i], i) != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!unable to allocate I/O qpair %d", i);
+ goto fail;
+ }
+
+ if (nvme_create_io_qpair(nvme, nvme->n_ioq[i], i)
+ != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!unable to create I/O qpair %d", i);
+ goto fail;
+ }
+ }
+
+ /*
+	 * Post more asynchronous event commands to reduce event reporting
+ * latency as suggested by the spec.
+ */
+ for (i = 1; i != nvme->n_async_event_limit; i++) {
+ if (nvme_async_event(nvme) != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!failed to post async event %d", i);
+ goto fail;
+ }
+ }
+
+ return (DDI_SUCCESS);
+
+fail:
+ (void) nvme_reset(nvme, B_FALSE);
+ return (DDI_FAILURE);
+}
+
+static uint_t
+nvme_intr(caddr_t arg1, caddr_t arg2)
+{
+ /*LINTED: E_PTR_BAD_CAST_ALIGN*/
+ nvme_t *nvme = (nvme_t *)arg1;
+ int inum = (int)(uintptr_t)arg2;
+ int qnum;
+ nvme_cmd_t *cmd;
+
+ if (inum >= nvme->n_intr_cnt)
+ return (DDI_INTR_UNCLAIMED);
+
+ /*
+ * The interrupt vector a queue uses is calculated as queue_idx %
+ * intr_cnt in nvme_create_io_qpair(). Iterate through the queue array
+ * in steps of n_intr_cnt to process all queues using this vector.
+ */
+ for (qnum = inum;
+ qnum < nvme->n_ioq_count + 1 && nvme->n_ioq[qnum] != NULL;
+ qnum += nvme->n_intr_cnt) {
+ while ((cmd = nvme_retrieve_cmd(nvme, nvme->n_ioq[qnum]))) {
+ taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq,
+ cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
+ }
+ }
+
+ return (DDI_INTR_CLAIMED);
+}
+
+static void
+nvme_disable_interrupts(nvme_t *nvme)
+{
+ int i;
+
+ for (i = 0; i < nvme->n_intr_cnt; i++) {
+ if (nvme->n_inth[i] == NULL)
+ break;
+
+ if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK)
+ (void) ddi_intr_block_disable(&nvme->n_inth[i], 1);
+ else
+ (void) ddi_intr_disable(nvme->n_inth[i]);
+ }
+}
+
+static int
+nvme_enable_interrupts(nvme_t *nvme)
+{
+ int i, fail = 0;
+
+ for (i = 0; i < nvme->n_intr_cnt; i++) {
+ if (nvme->n_inth[i] == NULL)
+ break;
+
+ if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK) {
+ if (ddi_intr_block_enable(&nvme->n_inth[i], 1) !=
+ DDI_SUCCESS)
+ fail++;
+ } else {
+ if (ddi_intr_enable(nvme->n_inth[i]) != DDI_SUCCESS)
+ fail++;
+ }
+ }
+
+ return (fail ? DDI_FAILURE : DDI_SUCCESS);
+}
+
+static void
+nvme_release_interrupts(nvme_t *nvme)
+{
+ int i;
+
+ nvme_disable_interrupts(nvme);
+
+ for (i = 0; i < nvme->n_intr_cnt; i++) {
+ if (nvme->n_inth[i] == NULL)
+ break;
+
+ (void) ddi_intr_remove_handler(nvme->n_inth[i]);
+ (void) ddi_intr_free(nvme->n_inth[i]);
+ }
+
+ kmem_free(nvme->n_inth, nvme->n_inth_sz);
+ nvme->n_inth = NULL;
+ nvme->n_inth_sz = 0;
+
+ nvme->n_progress &= ~NVME_INTERRUPTS;
+}
+
+static int
+nvme_setup_interrupts(nvme_t *nvme, int intr_type, int nqpairs)
+{
+ int nintrs, navail, count;
+ int ret;
+ int i;
+
+ if (nvme->n_intr_types == 0) {
+ ret = ddi_intr_get_supported_types(nvme->n_dip,
+ &nvme->n_intr_types);
+ if (ret != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!%s: ddi_intr_get_supported types failed",
+ __func__);
+ return (ret);
+ }
+ }
+
+ if ((nvme->n_intr_types & intr_type) == 0)
+ return (DDI_FAILURE);
+
+ ret = ddi_intr_get_nintrs(nvme->n_dip, intr_type, &nintrs);
+ if (ret != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_nintrs failed",
+ __func__);
+ return (ret);
+ }
+
+ ret = ddi_intr_get_navail(nvme->n_dip, intr_type, &navail);
+ if (ret != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_navail failed",
+ __func__);
+ return (ret);
+ }
+
+ /* We want at most one interrupt per queue pair. */
+ if (navail > nqpairs)
+ navail = nqpairs;
+
+ nvme->n_inth_sz = sizeof (ddi_intr_handle_t) * navail;
+ nvme->n_inth = kmem_zalloc(nvme->n_inth_sz, KM_SLEEP);
+
+ ret = ddi_intr_alloc(nvme->n_dip, nvme->n_inth, intr_type, 0, navail,
+ &count, 0);
+ if (ret != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_alloc failed",
+ __func__);
+ goto fail;
+ }
+
+ nvme->n_intr_cnt = count;
+
+ ret = ddi_intr_get_pri(nvme->n_inth[0], &nvme->n_intr_pri);
+ if (ret != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_pri failed",
+ __func__);
+ goto fail;
+ }
+
+ for (i = 0; i < count; i++) {
+ ret = ddi_intr_add_handler(nvme->n_inth[i], nvme_intr,
+ (void *)nvme, (void *)(uintptr_t)i);
+ if (ret != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!%s: ddi_intr_add_handler failed", __func__);
+ goto fail;
+ }
+ }
+
+ (void) ddi_intr_get_cap(nvme->n_inth[0], &nvme->n_intr_cap);
+
+ ret = nvme_enable_interrupts(nvme);
+
+ if (ret != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!%s: nvme_enable_interrupts failed", __func__);
+ goto fail;
+ }
+
+ nvme->n_intr_type = intr_type;
+
+ nvme->n_progress |= NVME_INTERRUPTS;
+
+ return (DDI_SUCCESS);
+
+fail:
+ nvme_release_interrupts(nvme);
+
+ return (ret);
+}
+
+static int
+nvme_fm_errcb(dev_info_t *dip, ddi_fm_error_t *fm_error, const void *arg)
+{
+ _NOTE(ARGUNUSED(arg));
+
+ pci_ereport_post(dip, fm_error, NULL);
+ return (fm_error->fme_status);
+}
+
+static int
+nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ nvme_t *nvme;
+ int instance;
+ int nregs;
+ off_t regsize;
+ int i;
+ char name[32];
+
+ if (cmd != DDI_ATTACH)
+ return (DDI_FAILURE);
+
+ instance = ddi_get_instance(dip);
+
+ if (ddi_soft_state_zalloc(nvme_state, instance) != DDI_SUCCESS)
+ return (DDI_FAILURE);
+
+ nvme = ddi_get_soft_state(nvme_state, instance);
+ ddi_set_driver_private(dip, nvme);
+ nvme->n_dip = dip;
+
+ nvme->n_strict_version = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
+ DDI_PROP_DONTPASS, "strict-version", 1) == 1 ? B_TRUE : B_FALSE;
+ nvme->n_ignore_unknown_vendor_status = ddi_prop_get_int(DDI_DEV_T_ANY,
+ dip, DDI_PROP_DONTPASS, "ignore-unknown-vendor-status", 0) == 1 ?
+ B_TRUE : B_FALSE;
+ nvme->n_admin_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
+ DDI_PROP_DONTPASS, "admin-queue-len", NVME_DEFAULT_ADMIN_QUEUE_LEN);
+ nvme->n_io_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
+ DDI_PROP_DONTPASS, "io-queue-len", NVME_DEFAULT_IO_QUEUE_LEN);
+ nvme->n_async_event_limit = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
+ DDI_PROP_DONTPASS, "async-event-limit",
+ NVME_DEFAULT_ASYNC_EVENT_LIMIT);
+
+ if (nvme->n_admin_queue_len < NVME_MIN_ADMIN_QUEUE_LEN)
+ nvme->n_admin_queue_len = NVME_MIN_ADMIN_QUEUE_LEN;
+ else if (nvme->n_admin_queue_len > NVME_MAX_ADMIN_QUEUE_LEN)
+ nvme->n_admin_queue_len = NVME_MAX_ADMIN_QUEUE_LEN;
+
+ if (nvme->n_io_queue_len < NVME_MIN_IO_QUEUE_LEN)
+ nvme->n_io_queue_len = NVME_MIN_IO_QUEUE_LEN;
+
+ if (nvme->n_async_event_limit < 1)
+ nvme->n_async_event_limit = NVME_DEFAULT_ASYNC_EVENT_LIMIT;
+
+ nvme->n_reg_acc_attr = nvme_reg_acc_attr;
+ nvme->n_queue_dma_attr = nvme_queue_dma_attr;
+ nvme->n_prp_dma_attr = nvme_prp_dma_attr;
+ nvme->n_sgl_dma_attr = nvme_sgl_dma_attr;
+
+ /*
+ * Setup FMA support.
+ */
+ nvme->n_fm_cap = ddi_getprop(DDI_DEV_T_ANY, dip,
+ DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "fm-capable",
+ DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE |
+ DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE);
+
+ ddi_fm_init(dip, &nvme->n_fm_cap, &nvme->n_fm_ibc);
+
+ if (nvme->n_fm_cap) {
+ if (nvme->n_fm_cap & DDI_FM_ACCCHK_CAPABLE)
+ nvme->n_reg_acc_attr.devacc_attr_access =
+ DDI_FLAGERR_ACC;
+
+ if (nvme->n_fm_cap & DDI_FM_DMACHK_CAPABLE) {
+ nvme->n_prp_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
+ nvme->n_sgl_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
+ }
+
+ if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) ||
+ DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
+ pci_ereport_setup(dip);
+
+ if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
+ ddi_fm_handler_register(dip, nvme_fm_errcb,
+ (void *)nvme);
+ }
+
+ nvme->n_progress |= NVME_FMA_INIT;
+
+ /*
+ * The spec defines several register sets. Only the controller
+ * registers (set 1) are currently used.
+ */
+ if (ddi_dev_nregs(dip, &nregs) == DDI_FAILURE ||
+ nregs < 2 ||
+ ddi_dev_regsize(dip, 1, &regsize) == DDI_FAILURE)
+ goto fail;
+
+ if (ddi_regs_map_setup(dip, 1, &nvme->n_regs, 0, regsize,
+ &nvme->n_reg_acc_attr, &nvme->n_regh) != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN, "!failed to map regset 1");
+ goto fail;
+ }
+
+ nvme->n_progress |= NVME_REGS_MAPPED;
+
+ /*
+ * Create taskq for command completion.
+ */
+ (void) snprintf(name, sizeof (name), "%s%d_cmd_taskq",
+ ddi_driver_name(dip), ddi_get_instance(dip));
+ nvme->n_cmd_taskq = ddi_taskq_create(dip, name, MIN(UINT16_MAX, ncpus),
+ TASKQ_DEFAULTPRI, 0);
+ if (nvme->n_cmd_taskq == NULL) {
+ dev_err(dip, CE_WARN, "!failed to create cmd taskq");
+ goto fail;
+ }
+
+
+ if (nvme_init(nvme) != DDI_SUCCESS)
+ goto fail;
+
+ /*
+ * Attach the blkdev driver for each namespace.
+ */
+ for (i = 0; i != nvme->n_namespace_count; i++) {
+ if (nvme->n_ns[i].ns_ignore)
+ continue;
+
+ nvme->n_ns[i].ns_bd_hdl = bd_alloc_handle(&nvme->n_ns[i],
+ &nvme_bd_ops, &nvme->n_prp_dma_attr, KM_SLEEP);
+
+ if (nvme->n_ns[i].ns_bd_hdl == NULL) {
+ dev_err(dip, CE_WARN,
+ "!failed to get blkdev handle for namespace %d", i);
+ goto fail;
+ }
+
+ if (bd_attach_handle(dip, nvme->n_ns[i].ns_bd_hdl)
+ != DDI_SUCCESS) {
+ dev_err(dip, CE_WARN,
+ "!failed to attach blkdev handle for namespace %d",
+ i);
+ goto fail;
+ }
+ }
+
+ return (DDI_SUCCESS);
+
+fail:
+ /* attach successful anyway so that FMA can retire the device */
+ if (nvme->n_dead)
+ return (DDI_SUCCESS);
+
+ (void) nvme_detach(dip, DDI_DETACH);
+
+ return (DDI_FAILURE);
+}
+
+static int
+nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ int instance, i;
+ nvme_t *nvme;
+
+ if (cmd != DDI_DETACH)
+ return (DDI_FAILURE);
+
+ instance = ddi_get_instance(dip);
+
+ nvme = ddi_get_soft_state(nvme_state, instance);
+
+ if (nvme == NULL)
+ return (DDI_FAILURE);
+
+ if (nvme->n_ns) {
+ for (i = 0; i != nvme->n_namespace_count; i++) {
+ if (nvme->n_ns[i].ns_bd_hdl) {
+ (void) bd_detach_handle(
+ nvme->n_ns[i].ns_bd_hdl);
+ bd_free_handle(nvme->n_ns[i].ns_bd_hdl);
+ }
+
+ if (nvme->n_ns[i].ns_idns)
+ kmem_free(nvme->n_ns[i].ns_idns,
+ sizeof (nvme_identify_nsid_t));
+ }
+
+ kmem_free(nvme->n_ns, sizeof (nvme_namespace_t) *
+ nvme->n_namespace_count);
+ }
+
+ if (nvme->n_progress & NVME_INTERRUPTS)
+ nvme_release_interrupts(nvme);
+
+ if (nvme->n_cmd_taskq)
+ ddi_taskq_wait(nvme->n_cmd_taskq);
+
+ if (nvme->n_ioq_count > 0) {
+ for (i = 1; i != nvme->n_ioq_count + 1; i++) {
+ if (nvme->n_ioq[i] != NULL) {
+ /* TODO: send destroy queue commands */
+ nvme_free_qpair(nvme->n_ioq[i]);
+ }
+ }
+
+ kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *) *
+ (nvme->n_ioq_count + 1));
+ }
+
+ if (nvme->n_progress & NVME_REGS_MAPPED) {
+ nvme_shutdown(nvme, NVME_CC_SHN_NORMAL, B_FALSE);
+ (void) nvme_reset(nvme, B_FALSE);
+ }
+
+ if (nvme->n_cmd_taskq)
+ ddi_taskq_destroy(nvme->n_cmd_taskq);
+
+ if (nvme->n_progress & NVME_CTRL_LIMITS)
+ sema_destroy(&nvme->n_abort_sema);
+
+ if (nvme->n_progress & NVME_ADMIN_QUEUE)
+ nvme_free_qpair(nvme->n_adminq);
+
+ if (nvme->n_idctl)
+ kmem_free(nvme->n_idctl, sizeof (nvme_identify_ctrl_t));
+
+ if (nvme->n_progress & NVME_REGS_MAPPED)
+ ddi_regs_map_free(&nvme->n_regh);
+
+ if (nvme->n_progress & NVME_FMA_INIT) {
+ if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
+ ddi_fm_handler_unregister(nvme->n_dip);
+
+ if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) ||
+ DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
+ pci_ereport_teardown(nvme->n_dip);
+
+ ddi_fm_fini(nvme->n_dip);
+ }
+
+ ddi_soft_state_free(nvme_state, instance);
+
+ return (DDI_SUCCESS);
+}
+
+static int
+nvme_quiesce(dev_info_t *dip)
+{
+ int instance;
+ nvme_t *nvme;
+
+ instance = ddi_get_instance(dip);
+
+ nvme = ddi_get_soft_state(nvme_state, instance);
+
+ if (nvme == NULL)
+ return (DDI_FAILURE);
+
+ nvme_shutdown(nvme, NVME_CC_SHN_ABRUPT, B_TRUE);
+
+ (void) nvme_reset(nvme, B_TRUE);
+
+ return (DDI_FAILURE);
+}
+
+static int
+nvme_fill_prp(nvme_cmd_t *cmd, bd_xfer_t *xfer)
+{
+ nvme_t *nvme = cmd->nc_nvme;
+ int nprp_page, nprp;
+ uint64_t *prp;
+
+ if (xfer->x_ndmac == 0)
+ return (DDI_FAILURE);
+
+ cmd->nc_sqe.sqe_dptr.d_prp[0] = xfer->x_dmac.dmac_laddress;
+ ddi_dma_nextcookie(xfer->x_dmah, &xfer->x_dmac);
+
+ if (xfer->x_ndmac == 1) {
+ cmd->nc_sqe.sqe_dptr.d_prp[1] = 0;
+ return (DDI_SUCCESS);
+ } else if (xfer->x_ndmac == 2) {
+ cmd->nc_sqe.sqe_dptr.d_prp[1] = xfer->x_dmac.dmac_laddress;
+ return (DDI_SUCCESS);
+ }
+
+ xfer->x_ndmac--;
+
+ nprp_page = nvme->n_pagesize / sizeof (uint64_t) - 1;
+ ASSERT(nprp_page > 0);
+ nprp = (xfer->x_ndmac + nprp_page - 1) / nprp_page;
+
+ /*
+ * We currently don't support chained PRPs and set up our DMA
+ * attributes to reflect that. If we still get an I/O request
+ * that needs a chained PRP something is very wrong.
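+	 *
+	 * (With 4k pages the code treats a PRP list as holding 511 data
+	 * pointers, leaving the last slot for a possible chain pointer, which
+	 * is consistent with the transfer size limit set up in nvme_init().)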
+ */
+ VERIFY(nprp == 1);
+
+ if (nvme_zalloc_dma(nvme, nvme->n_pagesize * nprp, DDI_DMA_READ,
+ &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN, "!%s: nvme_zalloc_dma failed",
+ __func__);
+ return (DDI_FAILURE);
+ }
+
+ cmd->nc_sqe.sqe_dptr.d_prp[1] = cmd->nc_dma->nd_cookie.dmac_laddress;
+ ddi_dma_nextcookie(cmd->nc_dma->nd_dmah, &cmd->nc_dma->nd_cookie);
+
+ /*LINTED: E_PTR_BAD_CAST_ALIGN*/
+ for (prp = (uint64_t *)cmd->nc_dma->nd_memp;
+ xfer->x_ndmac > 0;
+ prp++, xfer->x_ndmac--) {
+ *prp = xfer->x_dmac.dmac_laddress;
+ ddi_dma_nextcookie(xfer->x_dmah, &xfer->x_dmac);
+ }
+
+ (void) ddi_dma_sync(cmd->nc_dma->nd_dmah, 0, cmd->nc_dma->nd_len,
+ DDI_DMA_SYNC_FORDEV);
+ return (DDI_SUCCESS);
+}
+
+static nvme_cmd_t *
+nvme_create_nvm_cmd(nvme_namespace_t *ns, uint8_t opc, bd_xfer_t *xfer)
+{
+ nvme_t *nvme = ns->ns_nvme;
+ nvme_cmd_t *cmd;
+
+ /*
+ * Blkdev only sets BD_XFER_POLL when dumping, so don't sleep.
+ */
+ cmd = nvme_alloc_cmd(nvme, (xfer->x_flags & BD_XFER_POLL) ?
+ KM_NOSLEEP : KM_SLEEP);
+
+ if (cmd == NULL)
+ return (NULL);
+
+ cmd->nc_sqe.sqe_opc = opc;
+ cmd->nc_callback = nvme_bd_xfer_done;
+ cmd->nc_xfer = xfer;
+
+ switch (opc) {
+ case NVME_OPC_NVM_WRITE:
+ case NVME_OPC_NVM_READ:
+ VERIFY(xfer->x_nblks <= 0x10000);
+
+ cmd->nc_sqe.sqe_nsid = ns->ns_id;
+
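+		/*
+		 * CDW10/11 take the 64-bit starting LBA and CDW12 the
+		 * zero-based block count, whose 16-bit width is the reason
+		 * for the 0x10000 limit checked above.
+		 */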
+ cmd->nc_sqe.sqe_cdw10 = xfer->x_blkno & 0xffffffffu;
+ cmd->nc_sqe.sqe_cdw11 = (xfer->x_blkno >> 32);
+ cmd->nc_sqe.sqe_cdw12 = (uint16_t)(xfer->x_nblks - 1);
+
+ if (nvme_fill_prp(cmd, xfer) != DDI_SUCCESS)
+ goto fail;
+ break;
+
+ case NVME_OPC_NVM_FLUSH:
+ cmd->nc_sqe.sqe_nsid = ns->ns_id;
+ break;
+
+ default:
+ goto fail;
+ }
+
+ return (cmd);
+
+fail:
+ nvme_free_cmd(cmd);
+ return (NULL);
+}
+
+static void
+nvme_bd_xfer_done(void *arg)
+{
+ nvme_cmd_t *cmd = arg;
+ bd_xfer_t *xfer = cmd->nc_xfer;
+ int error = 0;
+
+ error = nvme_check_cmd_status(cmd);
+ nvme_free_cmd(cmd);
+
+ bd_xfer_done(xfer, error);
+}
+
+static void
+nvme_bd_driveinfo(void *arg, bd_drive_t *drive)
+{
+ nvme_namespace_t *ns = arg;
+ nvme_t *nvme = ns->ns_nvme;
+
+ /*
+ * blkdev maintains one queue size per instance (namespace),
+	 * but all namespaces share the I/O queues.
+ * TODO: need to figure out a sane default, or use per-NS I/O queues,
+ * or change blkdev to handle EAGAIN
+ */
+ drive->d_qsize = nvme->n_ioq_count * nvme->n_io_queue_len
+ / nvme->n_namespace_count;
+
+ /*
+ * d_maxxfer is not set, which means the value is taken from the DMA
+ * attributes specified to bd_alloc_handle.
+ */
+
+ drive->d_removable = B_FALSE;
+ drive->d_hotpluggable = B_FALSE;
+
+ drive->d_target = ns->ns_id;
+ drive->d_lun = 0;
+}
+
+static int
+nvme_bd_mediainfo(void *arg, bd_media_t *media)
+{
+ nvme_namespace_t *ns = arg;
+
+ media->m_nblks = ns->ns_block_count;
+ media->m_blksize = ns->ns_block_size;
+ media->m_readonly = B_FALSE;
+ media->m_solidstate = B_TRUE;
+
+ media->m_pblksize = ns->ns_best_block_size;
+
+ return (0);
+}
+
+static int
+nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc)
+{
+ nvme_t *nvme = ns->ns_nvme;
+ nvme_cmd_t *cmd;
+
+ if (nvme->n_dead)
+ return (EIO);
+
+ /* No polling for now */
+ if (xfer->x_flags & BD_XFER_POLL)
+ return (EIO);
+
+ cmd = nvme_create_nvm_cmd(ns, opc, xfer);
+ if (cmd == NULL)
+ return (ENOMEM);
+
+ cmd->nc_sqid = (CPU->cpu_id % nvme->n_ioq_count) + 1;
+ ASSERT(cmd->nc_sqid <= nvme->n_ioq_count);
+
+ if (nvme_submit_cmd(nvme->n_ioq[cmd->nc_sqid], cmd)
+ != DDI_SUCCESS)
+ return (EAGAIN);
+
+ return (0);
+}
+
+static int
+nvme_bd_read(void *arg, bd_xfer_t *xfer)
+{
+ nvme_namespace_t *ns = arg;
+
+ return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_READ));
+}
+
+static int
+nvme_bd_write(void *arg, bd_xfer_t *xfer)
+{
+ nvme_namespace_t *ns = arg;
+
+ return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_WRITE));
+}
+
+static int
+nvme_bd_sync(void *arg, bd_xfer_t *xfer)
+{
+ nvme_namespace_t *ns = arg;
+
+ if (ns->ns_nvme->n_dead)
+ return (EIO);
+
+ /*
+ * If the volatile write cache isn't enabled the FLUSH command is a
+ * no-op, so we can take a shortcut here.
+ */
+ if (ns->ns_nvme->n_volatile_write_cache_enabled == B_FALSE) {
+ bd_xfer_done(xfer, ENOTSUP);
+ return (0);
+ }
+
+ return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_FLUSH));
+}
+
+static int
+nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid)
+{
+ nvme_namespace_t *ns = arg;
+
+ return (ddi_devid_init(devinfo, DEVID_ENCAP, strlen(ns->ns_devid),
+ ns->ns_devid, devid));
+}
diff --git a/usr/src/uts/common/io/nvme/nvme.conf b/usr/src/uts/common/io/nvme/nvme.conf
new file mode 100644
index 0000000000..186bd38018
--- /dev/null
+++ b/usr/src/uts/common/io/nvme/nvme.conf
@@ -0,0 +1,40 @@
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+#
+# Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+#
+
+#
+# The driver was tested only against devices supporting v1.0 of the
+# NVMe specification. Uncomment this to allow the use of devices conforming
+# to newer versions of the specification.
+#strict-version=0;
+
+#
+# The driver does not currently support any vendor-specific extensions to the
+# specification. By default it will fault the device if it receives a
+# vendor-specific command status. Uncomment this to disable this behaviour.
+#ignore-unknown-vendor-status=1;
+
+#
+# The maximum length of the admin queue can be overridden here (16-4096).
+#admin-queue-len=256;
+
+#
+# The maximum length of the individual I/O queues can be overridden here
+# (16-65536).
+#io-queue-len=1024;
+
+#
+# The maximum number of outstanding asynchronous event requests can be
+# overridden here.
+#async-event-limit=10;
+
+
diff --git a/usr/src/uts/common/io/nvme/nvme_reg.h b/usr/src/uts/common/io/nvme/nvme_reg.h
new file mode 100644
index 0000000000..8fb44a3730
--- /dev/null
+++ b/usr/src/uts/common/io/nvme/nvme_reg.h
@@ -0,0 +1,692 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ */
+
+/*
+ * NVMe hardware interface
+ */
+
+#ifndef _NVME_REG_H
+#define _NVME_REG_H
+
+#pragma pack(1)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*
+ * NVMe constants
+ */
+#define NVME_MAX_ADMIN_QUEUE_LEN 4096
+
+/*
+ * NVMe registers and register fields
+ */
+#define NVME_REG_CAP 0x0 /* Controller Capabilities */
+#define NVME_REG_VS 0x8 /* Version */
+#define NVME_REG_INTMS 0xc /* Interrupt Mask Set */
+#define NVME_REG_INTMC 0x10 /* Interrupt Mask Clear */
+#define NVME_REG_CC 0x14 /* Controller Configuration */
+#define NVME_REG_CSTS 0x1c /* Controller Status */
+#define NVME_REG_NSSR 0x20 /* NVM Subsystem Reset */
+#define NVME_REG_AQA 0x24 /* Admin Queue Attributes */
+#define NVME_REG_ASQ 0x28 /* Admin Submission Queue */
+#define	NVME_REG_ACQ	0x30	/* Admin Completion Queue */
+#define NVME_REG_SQTDBL(nvme, n) \
+ (0x1000 + ((2 * (n)) * nvme->n_doorbell_stride))
+#define NVME_REG_CQHDBL(nvme, n) \
+ (0x1000 + ((2 * (n) + 1) * nvme->n_doorbell_stride))
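+/*
+ * With the minimum doorbell stride of 4 bytes this places the admin (queue 0)
+ * submission queue tail doorbell at 0x1000 and its completion queue head
+ * doorbell at 0x1004, with each further queue pair following 8 bytes later.
+ */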
+
+#define NVME_CAP_CSS_NVM 1 /* NVM Command Set */
+#define NVME_CAP_AMS_WRR 1 /* Weighted Round-Robin */
+
+/* CAP -- Controller Capabilities */
+typedef union {
+ struct {
+ uint16_t cap_mqes; /* Maximum Queue Entries Supported */
+ uint8_t cap_cqr:1; /* Contiguous Queues Required */
+ uint8_t cap_ams:2; /* Arbitration Mechanisms Supported */
+ uint8_t cap_rsvd1:5;
+ uint8_t cap_to; /* Timeout */
+ uint16_t cap_dstrd:4; /* Doorbell Stride */
+ uint16_t cap_nssrs:1; /* NVM Subsystem Reset Supported */
+ uint16_t cap_css:8; /* Command Sets Supported */
+ uint16_t cap_rsvd2:3;
+ uint8_t cap_mpsmin:4; /* Memory Page Size Minimum */
+ uint8_t cap_mpsmax:4; /* Memory Page Size Maximum */
+ uint8_t cap_rsvd3;
+ } b;
+ uint64_t r;
+} nvme_reg_cap_t;
+
+/* VS -- Version */
+typedef union {
+ struct {
+ uint8_t vs_rsvd;
+ uint8_t vs_mnr; /* Minor Version Number */
+ uint16_t vs_mjr; /* Major Version Number */
+ } b;
+ uint32_t r;
+} nvme_reg_vs_t;
+
+/* CC -- Controller Configuration */
+#define NVME_CC_SHN_NORMAL 1 /* Normal Shutdown Notification */
+#define NVME_CC_SHN_ABRUPT 2 /* Abrupt Shutdown Notification */
+
+typedef union {
+ struct {
+ uint16_t cc_en:1; /* Enable */
+ uint16_t cc_rsvd1:3;
+ uint16_t cc_css:3; /* I/O Command Set Selected */
+ uint16_t cc_mps:4; /* Memory Page Size */
+ uint16_t cc_ams:3; /* Arbitration Mechanism Selected */
+ uint16_t cc_shn:2; /* Shutdown Notification */
+ uint8_t cc_iosqes:4; /* I/O Submission Queue Entry Size */
+ uint8_t cc_iocqes:4; /* I/O Completion Queue Entry Size */
+ uint8_t cc_rsvd2;
+ } b;
+ uint32_t r;
+} nvme_reg_cc_t;
+
+/* CSTS -- Controller Status */
+#define	NVME_CSTS_SHN_OCCURING	1	/* Shutdown Processing Occurring */
+#define NVME_CSTS_SHN_COMPLETE 2 /* Shutdown Processing Complete */
+
+typedef union {
+ struct {
+ uint32_t csts_rdy:1; /* Ready */
+ uint32_t csts_cfs:1; /* Controller Fatal Status */
+ uint32_t csts_shst:2; /* Shutdown Status */
+		uint32_t csts_nssro:1;	/* NVM Subsystem Reset Occurred */
+ uint32_t csts_rsvd:28;
+ } b;
+ uint32_t r;
+} nvme_reg_csts_t;
+
+/* NSSR -- NVM Subsystem Reset */
+#define	NVME_NSSR_NSSRC	0x4e564d65	/* NSSR magic value ("NVMe") */
+typedef uint32_t nvme_reg_nssr_t;
+
+/* AQA -- Admin Queue Attributes */
+typedef union {
+ struct {
+ uint16_t aqa_asqs:12; /* Admin Submission Queue Size */
+ uint16_t aqa_rsvd1:4;
+ uint16_t aqa_acqs:12; /* Admin Completion Queue Size */
+ uint16_t aqa_rsvd2:4;
+ } b;
+ uint32_t r;
+} nvme_reg_aqa_t;
+
+/*
+ * The spec specifies the lower 12 bits of ASQ and ACQ as reserved, which is
+ * probably a specification bug. The full 64bit regs are used as base address,
+ * and the lower bits must be zero to ensure alignment on the page size
+ * specified in CC.MPS.
+ */
+/* ASQ -- Admin Submission Queue Base Address */
+typedef uint64_t nvme_reg_asq_t; /* Admin Submission Queue Base */
+
+/* ACQ -- Admin Completion Queue Base Address */
+typedef uint64_t nvme_reg_acq_t; /* Admin Completion Queue Base */
+
+/* SQyTDBL -- Submission Queue y Tail Doorbell */
+typedef union {
+ struct {
+ uint16_t sqtdbl_sqt; /* Submission Queue Tail */
+ uint16_t sqtdbl_rsvd;
+ } b;
+ uint32_t r;
+} nvme_reg_sqtdbl_t;
+
+/* CQyHDBL -- Completion Queue y Head Doorbell */
+typedef union {
+ struct {
+ uint16_t cqhdbl_cqh; /* Completion Queue Head */
+ uint16_t cqhdbl_rsvd;
+ } b;
+ uint32_t r;
+} nvme_reg_cqhdbl_t;
+
+/*
+ * NVMe submission queue entries
+ */
+
+/* NVMe scatter/gather list descriptor */
+typedef struct {
+ uint64_t sgl_addr; /* Address */
+ uint32_t sgl_len; /* Length */
+ uint8_t sgl_rsvd[3];
+ uint8_t sgl_zero:4;
+ uint8_t sgl_type:4; /* SGL descriptor type */
+} nvme_sgl_t;
+
+/* NVMe SGL descriptor type */
+#define NVME_SGL_DATA_BLOCK 0
+#define NVME_SGL_BIT_BUCKET 1
+#define NVME_SGL_SEGMENT 2
+#define NVME_SGL_LAST_SEGMENT 3
+#define NVME_SGL_VENDOR 0xf
+
+/* NVMe submission queue entry */
+typedef struct {
+ uint8_t sqe_opc; /* Opcode */
+ uint8_t sqe_fuse:2; /* Fused Operation */
+ uint8_t sqe_rsvd:5;
+ uint8_t sqe_psdt:1; /* PRP or SGL for Data Transfer */
+ uint16_t sqe_cid; /* Command Identifier */
+ uint32_t sqe_nsid; /* Namespace Identifier */
+ uint64_t sqe_rsvd1;
+ union {
+ uint64_t m_ptr; /* Metadata Pointer */
+ uint64_t m_sglp; /* Metadata SGL Segment Pointer */
+ } sqe_m;
+ union {
+ uint64_t d_prp[2]; /* Physical Page Region Entries 1 & 2 */
+ nvme_sgl_t d_sgl; /* SGL Entry 1 */
+ } sqe_dptr; /* Data Pointer */
+ uint32_t sqe_cdw10; /* Number of Dwords in Data Transfer */
+ uint32_t sqe_cdw11; /* Number of Dwords in Metadata Xfer */
+ uint32_t sqe_cdw12;
+ uint32_t sqe_cdw13;
+ uint32_t sqe_cdw14;
+ uint32_t sqe_cdw15;
+} nvme_sqe_t;
+
+/* NVMe admin command opcodes */
+#define NVME_OPC_DELETE_SQUEUE 0x0
+#define NVME_OPC_CREATE_SQUEUE 0x1
+#define NVME_OPC_GET_LOG_PAGE 0x2
+#define NVME_OPC_DELETE_CQUEUE 0x4
+#define NVME_OPC_CREATE_CQUEUE 0x5
+#define NVME_OPC_IDENTIFY 0x6
+#define NVME_OPC_ABORT 0x8
+#define NVME_OPC_SET_FEATURES 0x9
+#define NVME_OPC_GET_FEATURES 0xa
+#define NVME_OPC_ASYNC_EVENT 0xc
+#define NVME_OPC_FW_ACTIVATE 0x10
+#define NVME_OPC_FW_IMAGE_LOAD 0x11
+
+/* NVMe NVM command set specific admin command opcodes */
+#define NVME_OPC_NVM_FORMAT 0x80
+#define NVME_OPC_NVM_SEC_SEND 0x81
+#define NVME_OPC_NVM_SEC_RECV 0x82
+
+/* NVMe NVM command opcodes */
+#define NVME_OPC_NVM_FLUSH 0x0
+#define NVME_OPC_NVM_WRITE 0x1
+#define NVME_OPC_NVM_READ 0x2
+#define NVME_OPC_NVM_WRITE_UNC 0x4
+#define NVME_OPC_NVM_COMPARE 0x5
+#define NVME_OPC_NVM_WRITE_ZERO 0x8
+#define NVME_OPC_NVM_DSET_MGMT 0x9
+#define NVME_OPC_NVM_RESV_REG 0xd
+#define NVME_OPC_NVM_RESV_REPRT 0xe
+#define NVME_OPC_NVM_RESV_ACQ 0x11
+#define NVME_OPC_NVM_RESV_REL 0x12
+
+/*
+ * NVMe completion queue entry
+ */
+typedef struct {
+ uint16_t sf_p:1; /* Phase Tag */
+ uint16_t sf_sc:8; /* Status Code */
+ uint16_t sf_sct:3; /* Status Code Type */
+ uint16_t sf_rsvd2:2;
+ uint16_t sf_m:1; /* More */
+ uint16_t sf_dnr:1; /* Do Not Retry */
+} nvme_cqe_sf_t;
+
+typedef struct {
+ uint32_t cqe_dw0; /* Command Specific */
+ uint32_t cqe_rsvd1;
+ uint16_t cqe_sqhd; /* SQ Head Pointer */
+ uint16_t cqe_sqid; /* SQ Identifier */
+ uint16_t cqe_cid; /* Command Identifier */
+ nvme_cqe_sf_t cqe_sf; /* Status Field */
+} nvme_cqe_t;
+
+/* NVMe completion status code type */
+#define NVME_CQE_SCT_GENERIC 0 /* Generic Command Status */
+#define NVME_CQE_SCT_SPECIFIC 1 /* Command Specific Status */
+#define NVME_CQE_SCT_INTEGRITY 2 /* Media and Data Integrity Errors */
+#define NVME_CQE_SCT_VENDOR 7 /* Vendor Specific */
+
+/* NVMe completion status code (generic) */
+#define NVME_CQE_SC_GEN_SUCCESS 0x0 /* Successful Completion */
+#define NVME_CQE_SC_GEN_INV_OPC 0x1 /* Invalid Command Opcode */
+#define NVME_CQE_SC_GEN_INV_FLD 0x2 /* Invalid Field in Command */
+#define NVME_CQE_SC_GEN_ID_CNFL 0x3 /* Command ID Conflict */
+#define NVME_CQE_SC_GEN_DATA_XFR_ERR 0x4 /* Data Transfer Error */
+#define NVME_CQE_SC_GEN_ABORT_PWRLOSS 0x5 /* Cmds Aborted / Pwr Loss */
+#define NVME_CQE_SC_GEN_INTERNAL_ERR 0x6 /* Internal Error */
+#define NVME_CQE_SC_GEN_ABORT_REQUEST 0x7 /* Command Abort Requested */
+#define NVME_CQE_SC_GEN_ABORT_SQ_DEL 0x8 /* Cmd Aborted / SQ deletion */
+#define NVME_CQE_SC_GEN_ABORT_FUSE_FAIL 0x9 /* Cmd Aborted / Failed Fused */
+#define NVME_CQE_SC_GEN_ABORT_FUSE_MISS 0xa /* Cmd Aborted / Missing Fusd */
+#define NVME_CQE_SC_GEN_INV_NS 0xb /* Inval Namespace or Format */
+#define NVME_CQE_SC_GEN_CMD_SEQ_ERR 0xc /* Command Sequence Error */
+#define NVME_CQE_SC_GEN_INV_SGL_LAST 0xd /* Inval SGL Last Seg Desc */
+#define NVME_CQE_SC_GEN_INV_SGL_NUM 0xe /* Inval Number of SGL Desc */
+#define NVME_CQE_SC_GEN_INV_DSGL_LEN 0xf /* Data SGL Length Invalid */
+#define NVME_CQE_SC_GEN_INV_MSGL_LEN 0x10 /* Metadata SGL Length Inval */
+#define NVME_CQE_SC_GEN_INV_SGL_DESC 0x11 /* SGL Descriptor Type Inval */
+
+/* NVMe completion status code (generic NVM commands) */
+#define NVME_CQE_SC_GEN_NVM_LBA_RANGE 0x80 /* LBA Out Of Range */
+#define NVME_CQE_SC_GEN_NVM_CAP_EXC 0x81 /* Capacity Exceeded */
+#define NVME_CQE_SC_GEN_NVM_NS_NOTRDY 0x82 /* Namespace Not Ready */
+#define NVME_CQE_SC_GEN_NVM_RSV_CNFLCT 0x83 /* Reservation Conflict */
+
+/* NVMe completion status code (command specific) */
+#define NVME_CQE_SC_SPC_INV_CQ 0x0 /* Completion Queue Invalid */
+#define NVME_CQE_SC_SPC_INV_QID 0x1 /* Invalid Queue Identifier */
+#define NVME_CQE_SC_SPC_MAX_QSZ_EXC 0x2 /* Max Queue Size Exceeded */
+#define NVME_CQE_SC_SPC_ABRT_CMD_EXC 0x3 /* Abort Cmd Limit Exceeded */
+#define NVME_CQE_SC_SPC_ASYNC_EVREQ_EXC 0x5 /* Async Event Request Limit */
+#define NVME_CQE_SC_SPC_INV_FW_SLOT 0x6 /* Invalid Firmware Slot */
+#define NVME_CQE_SC_SPC_INV_FW_IMG 0x7 /* Invalid Firmware Image */
+#define NVME_CQE_SC_SPC_INV_INT_VECT 0x8 /* Invalid Interrupt Vector */
+#define NVME_CQE_SC_SPC_INV_LOG_PAGE 0x9 /* Invalid Log Page */
+#define NVME_CQE_SC_SPC_INV_FORMAT 0xa /* Invalid Format */
+#define NVME_CQE_SC_SPC_FW_RESET 0xb /* FW Application Reset Reqd */
+#define NVME_CQE_SC_SPC_INV_Q_DEL 0xc /* Invalid Queue Deletion */
+#define NVME_CQE_SC_SPC_FEAT_SAVE 0xd /* Feature Id Not Saveable */
+#define NVME_CQE_SC_SPC_FEAT_CHG 0xe /* Feature Not Changeable */
+#define NVME_CQE_SC_SPC_FEAT_NS_SPEC 0xf /* Feature Not Namespace Spec */
+#define NVME_CQE_SC_SPC_FW_NSSR 0x10 /* FW Application NSSR Reqd */
+
+/* NVMe completion status code (NVM command specific) */
+#define NVME_CQE_SC_SPC_NVM_CNFL_ATTR 0x80 /* Conflicting Attributes */
+#define NVME_CQE_SC_SPC_NVM_INV_PROT 0x81 /* Invalid Protection */
+#define NVME_CQE_SC_SPC_NVM_READONLY 0x82 /* Write to Read Only Range */
+
+/* NVMe completion status code (data / metadata integrity) */
+#define NVME_CQE_SC_INT_NVM_WRITE 0x80 /* Write Fault */
+#define NVME_CQE_SC_INT_NVM_READ 0x81 /* Unrecovered Read Error */
+#define NVME_CQE_SC_INT_NVM_GUARD 0x82 /* Guard Check Error */
+#define NVME_CQE_SC_INT_NVM_APPL_TAG 0x83 /* Application Tag Check Err */
+#define NVME_CQE_SC_INT_NVM_REF_TAG 0x84 /* Reference Tag Check Err */
+#define NVME_CQE_SC_INT_NVM_COMPARE 0x85 /* Compare Failure */
+#define NVME_CQE_SC_INT_NVM_ACCESS 0x86 /* Access Denied */
+
+/*
+ * NVMe Asynchronous Event Request
+ */
+#define NVME_ASYNC_TYPE_ERROR 0x0 /* Error Status */
+#define NVME_ASYNC_TYPE_HEALTH 0x1 /* SMART/Health Status */
+#define NVME_ASYNC_TYPE_VENDOR 0x7 /* vendor specific */
+
+#define NVME_ASYNC_ERROR_INV_SQ 0x0 /* Invalid Submission Queue */
+#define NVME_ASYNC_ERROR_INV_DBL 0x1 /* Invalid Doorbell Write */
+#define NVME_ASYNC_ERROR_DIAGFAIL 0x2 /* Diagnostic Failure */
+#define NVME_ASYNC_ERROR_PERSISTENT 0x3 /* Persistent Internal Error */
+#define NVME_ASYNC_ERROR_TRANSIENT 0x4 /* Transient Internal Error */
+#define NVME_ASYNC_ERROR_FW_LOAD 0x5 /* Firmware Image Load Error */
+
+#define NVME_ASYNC_HEALTH_RELIABILITY 0x0 /* Device Reliability */
+#define NVME_ASYNC_HEALTH_TEMPERATURE 0x1 /* Temp. Above Threshold */
+#define NVME_ASYNC_HEALTH_SPARE 0x2 /* Spare Below Threshold */
+
+typedef union {
+ struct {
+ uint8_t ae_type:3; /* Asynchronous Event Type */
+ uint8_t ae_rsvd1:5;
+ uint8_t ae_info; /* Asynchronous Event Info */
+ uint8_t ae_logpage; /* Associated Log Page */
+ uint8_t ae_rsvd2;
+ } b;
+ uint32_t r;
+} nvme_async_event_t;
+
+/*
+ * NVMe Create Completion/Submission Queue
+ */
+typedef union {
+ struct {
+ uint16_t q_qid; /* Queue Identifier */
+ uint16_t q_qsize; /* Queue Size */
+ } b;
+ uint32_t r;
+} nvme_create_queue_dw10_t;
+
+typedef union {
+ struct {
+ uint16_t cq_pc:1; /* Physically Contiguous */
+ uint16_t cq_ien:1; /* Interrupts Enabled */
+ uint16_t cq_rsvd:14;
+ uint16_t cq_iv; /* Interrupt Vector */
+ } b;
+ uint32_t r;
+} nvme_create_cq_dw11_t;
+
+typedef union {
+ struct {
+ uint16_t sq_pc:1; /* Physically Contiguous */
+ uint16_t sq_qprio:2; /* Queue Priority */
+ uint16_t sq_rsvd:13;
+ uint16_t sq_cqid; /* Completion Queue ID */
+ } b;
+ uint32_t r;
+} nvme_create_sq_dw11_t;
+
+/*
+ * NVMe Identify
+ */
+
+/* NVMe Identify parameters (cdw10) */
+#define NVME_IDENTIFY_NSID 0x0 /* Identify Namespace */
+#define NVME_IDENTIFY_CTRL 0x1 /* Identify Controller */
+#define NVME_IDENTIFY_LIST 0x2 /* Identify List Namespaces */
+
+#define NVME_IDENTIFY_BUFSIZE 4096 /* buffer size for Identify */
+
+/* NVMe Queue Entry Size bitfield */
+typedef struct {
+ uint8_t qes_min:4; /* minimum entry size */
+ uint8_t qes_max:4; /* maximum entry size */
+} nvme_idctl_qes_t;
+
+/* NVMe Power State Descriptor */
+typedef struct {
+ uint16_t psd_mp; /* Maximum Power */
+ uint16_t psd_rsvd1;
+ uint32_t psd_enlat; /* Entry Latency */
+ uint32_t psd_exlat; /* Exit Latency */
+ uint8_t psd_rrt:5; /* Relative Read Throughput */
+ uint8_t psd_rsvd2:3;
+ uint8_t psd_rrl:5; /* Relative Read Latency */
+ uint8_t psd_rsvd3:3;
+ uint8_t psd_rwt:5; /* Relative Write Throughput */
+ uint8_t psd_rsvd4:3;
+ uint8_t psd_rwl:5; /* Relative Write Latency */
+ uint8_t psd_rsvd5:3;
+ uint8_t psd_rsvd6[16];
+} nvme_idctl_psd_t;
+
+/* NVMe Identify Controller Data Structure */
+typedef struct {
+ /* Controller Capabilities & Features */
+ uint16_t id_vid; /* PCI vendor ID */
+ uint16_t id_ssvid; /* PCI subsystem vendor ID */
+ char id_serial[20]; /* Serial Number */
+ char id_model[40]; /* Model Number */
+ char id_fwrev[8]; /* Firmware Revision */
+ uint8_t id_rab; /* Recommended Arbitration Burst */
+ uint8_t id_oui[3]; /* vendor IEEE OUI */
+ struct { /* Multi-Interface Capabilities */
+ uint8_t m_multi:1; /* HW has multiple PCIe interfaces */
+ uint8_t m_rsvd:7;
+ } id_mic;
+ uint8_t id_mdts; /* Maximum Data Transfer Size */
+ uint8_t id_rsvd_cc[256 - 78];
+
+ /* Admin Command Set Attributes */
+ struct { /* Optional Admin Command Support */
+ uint16_t oa_security:1; /* Security Send & Receive */
+ uint16_t oa_format:1; /* Format NVM */
+		uint16_t oa_firmware:1;	/* Firmware Activate & Download */
+ uint16_t oa_rsvd:13;
+ } id_oacs;
+ uint8_t id_acl; /* Abort Command Limit */
+ uint8_t id_aerl; /* Asynchronous Event Request Limit */
+ struct { /* Firmware Updates */
+ uint8_t fw_readonly:1; /* Slot 1 is Read-Only */
+ uint8_t fw_nslot:3; /* number of firmware slots */
+ uint8_t fw_rsvd:4;
+ } id_frmw;
+ struct { /* Log Page Attributes */
+ uint8_t lp_smart:1; /* SMART/Health information per NS */
+ uint8_t lp_rsvd:7;
+ } id_lpa;
+ uint8_t id_elpe; /* Error Log Page Entries */
+ uint8_t id_npss; /* Number of Power States */
+ struct { /* Admin Vendor Specific Command Conf */
+ uint8_t av_spec:1; /* use format from spec */
+ uint8_t av_rsvd:7;
+ } id_avscc;
+ uint8_t id_rsvd_ac[256 - 9];
+
+ /* NVM Command Set Attributes */
+ nvme_idctl_qes_t id_sqes; /* Submission Queue Entry Size */
+ nvme_idctl_qes_t id_cqes; /* Completion Queue Entry Size */
+ uint16_t id_rsvd_nc_1;
+ uint32_t id_nn; /* Number of Namespaces */
+ struct { /* Optional NVM Command Support */
+ uint16_t on_compare:1; /* Compare */
+ uint16_t on_wr_unc:1; /* Write Uncorrectable */
+ uint16_t on_dset_mgmt:1; /* Dataset Management */
+ uint16_t on_rsvd:13;
+ } id_oncs;
+ struct { /* Fused Operation Support */
+ uint16_t f_cmp_wr:1; /* Compare and Write */
+ uint16_t f_rsvd:15;
+ } id_fuses;
+ struct { /* Format NVM Attributes */
+ uint8_t fn_format:1; /* Format applies to all NS */
+ uint8_t fn_sec_erase:1; /* Secure Erase applies to all NS */
+ uint8_t fn_crypt_erase:1; /* Cryptographic Erase supported */
+ uint8_t fn_rsvd:5;
+ } id_fna;
+ struct { /* Volatile Write Cache */
+ uint8_t vwc_present:1; /* Volatile Write Cache present */
+ uint8_t rsvd:7;
+ } id_vwc;
+ uint16_t id_awun; /* Atomic Write Unit Normal */
+ uint16_t id_awupf; /* Atomic Write Unit Power Fail */
+ struct { /* NVM Vendor Specific Command Conf */
+ uint8_t nv_spec:1; /* use format from spec */
+ uint8_t nv_rsvd:7;
+ } id_nvscc;
+ uint8_t id_rsvd_nc_2[192 - 19];
+
+ /* I/O Command Set Attributes */
+ uint8_t id_rsvd_ioc[1344];
+
+ /* Power State Descriptors */
+ nvme_idctl_psd_t id_psd[32];
+
+ /* Vendor Specific */
+ uint8_t id_vs[1024];
+} nvme_identify_ctrl_t;
+
+/* NVMe Identify Namespace LBA Format */
+typedef struct {
+ uint16_t lbaf_ms; /* Metadata Size */
+ uint8_t lbaf_lbads; /* LBA Data Size */
+ uint8_t lbaf_rp:2; /* Relative Performance */
+ uint8_t lbaf_rsvd1:6;
+} nvme_idns_lbaf_t;
+
+/* NVMe Identify Namespace Data Structure */
+typedef struct {
+ uint64_t id_nsize; /* Namespace Size */
+ uint64_t id_ncap; /* Namespace Capacity */
+ uint64_t id_nuse; /* Namespace Utilization */
+ struct { /* Namespace Features */
+ uint8_t f_thin:1; /* Thin Provisioning */
+ uint8_t f_rsvd:7;
+ } id_nsfeat;
+ uint8_t id_nlbaf; /* Number of LBA formats */
+ struct { /* Formatted LBA size */
+ uint8_t lba_format:4; /* LBA format */
+ uint8_t lba_extlba:1; /* extended LBA (includes metadata) */
+ uint8_t lba_rsvd:3;
+ } id_flbas;
+ struct { /* Metadata Capabilities */
+ uint8_t mc_extlba:1; /* extended LBA transfers */
+ uint8_t mc_separate:1; /* separate metadata transfers */
+ uint8_t mc_rsvd:6;
+ } id_mc;
+ struct { /* Data Protection Capabilities */
+ uint8_t dp_type1:1; /* Protection Information Type 1 */
+ uint8_t dp_type2:1; /* Protection Information Type 2 */
+ uint8_t dp_type3:1; /* Protection Information Type 3 */
+ uint8_t dp_first:1; /* first 8 bytes of metadata */
+ uint8_t dp_last:1; /* last 8 bytes of metadata */
+ } id_dpc;
+ struct { /* Data Protection Settings */
+ uint8_t dp_pinfo:3; /* Protection Information enabled */
+ uint8_t dp_first:1; /* first 8 bytes of metadata */
+ } id_dps;
+ uint8_t id_rsvd1[128 - 30];
+ nvme_idns_lbaf_t id_lbaf[16]; /* LBA Formats */
+
+ uint8_t id_rsvd2[192];
+
+ uint8_t id_vs[3712]; /* Vendor Specific */
+} nvme_identify_nsid_t;
+
+
+/*
+ * NVMe Abort Command
+ */
+typedef union {
+ struct {
+ uint16_t ac_sqid; /* Submission Queue ID */
+ uint16_t ac_cid; /* Command ID */
+ } b;
+ uint32_t r;
+} nvme_abort_cmd_t;
+
+
+/*
+ * NVMe Get / Set Features
+ */
+#define NVME_FEAT_ARBITRATION 0x1 /* Command Arbitration */
+#define NVME_FEAT_POWER_MGMT 0x2 /* Power Management */
+#define NVME_FEAT_LBA_RANGE 0x3 /* LBA Range Type */
+#define NVME_FEAT_TEMPERATURE 0x4 /* Temperature Threshold */
+#define NVME_FEAT_ERROR 0x5 /* Error Recovery */
+#define NVME_FEAT_WRITE_CACHE 0x6 /* Volatile Write Cache */
+#define NVME_FEAT_NQUEUES 0x7 /* Number of Queues */
+#define NVME_FEAT_INTR_COAL 0x8 /* Interrupt Coalescing */
+#define NVME_FEAT_INTR_VECT 0x9 /* Interrupt Vector Configuration */
+#define NVME_FEAT_WRITE_ATOM 0xa /* Write Atomicity */
+#define NVME_FEAT_ASYNC_EVENT 0xb /* Asynchronous Event Configuration */
+
+#define NVME_FEAT_PROGRESS 0x80 /* Software Progress Marker */
+
+/* Arbitration Feature */
+typedef struct {
+ uint8_t arb_ab:3; /* Arbitration Burst */
+ uint8_t arb_rsvd:5;
+ uint8_t arb_lpw; /* Low Priority Weight */
+ uint8_t arb_mpw; /* Medium Priority Weight */
+ uint8_t arb_hpw; /* High Priority Weight */
+} nvme_arbitration_dw11_t;
+
+/* LBA Range Type Feature */
+typedef struct {
+ uint32_t lr_num:6; /* Number of LBA ranges */
+ uint32_t lr_rsvd:26;
+} nvme_lba_range_type_dw11_t;
+
+typedef struct {
+ uint8_t lr_type; /* Type */
+ struct { /* Attributes */
+ uint8_t lr_write:1; /* may be overwritten */
+ uint8_t lr_hidden:1; /* hidden from OS/EFI/BIOS */
+ uint8_t lr_rsvd1:6;
+ } lr_attr;
+ uint8_t lr_rsvd2[14];
+ uint64_t lr_slba; /* Starting LBA */
+ uint64_t lr_nlb; /* Number of Logical Blocks */
+ uint8_t lr_guid[16]; /* Unique Identifier */
+ uint8_t lr_rsvd3[16];
+} nvme_lba_range_type_t;
+
+/* Number of Queues */
+typedef union {
+ struct {
+ uint16_t nq_nsq; /* Number of Submission Queues */
+ uint16_t nq_ncq; /* Number of Completion Queues */
+ } b;
+ uint32_t r;
+} nvme_nqueue_t;
+
+
+/*
+ * NVMe Get Log Page
+ */
+#define NVME_LOGPAGE_ERROR 0x1 /* Error Information */
+#define NVME_LOGPAGE_HEALTH 0x2 /* SMART/Health Information */
+#define NVME_LOGPAGE_FWSLOT 0x3 /* Firmware Slot Information */
+
+typedef union {
+ struct {
+ uint8_t lp_lid; /* Log Page Identifier */
+ uint8_t lp_rsvd1;
+ uint16_t lp_numd:12; /* Number of Dwords */
+ uint16_t lp_rsvd2:4;
+ } b;
+ uint32_t r;
+} nvme_getlogpage_t;
+
+typedef struct {
+ uint64_t el_count; /* Error Count */
+ uint16_t el_sqid; /* Submission Queue ID */
+ uint16_t el_cid; /* Command ID */
+ nvme_cqe_sf_t el_sf; /* Status Field */
+ uint8_t el_byte; /* Parameter Error Location byte */
+ uint8_t el_bit:3; /* Parameter Error Location bit */
+ uint8_t el_rsvd1:5;
+ uint64_t el_lba; /* Logical Block Address */
+ uint32_t el_nsid; /* Namespace ID */
+ uint8_t el_vendor; /* Vendor Specific Information avail */
+ uint8_t el_rsvd2[64 - 29];
+} nvme_error_log_entry_t;
+
+typedef struct {
+ uint64_t lo;
+ uint64_t hi;
+} nvme_uint128_t;
+
+typedef struct {
+ uint8_t hl_crit_warn; /* Critical Warning */
+ uint16_t hl_temp; /* Temperature */
+ uint8_t hl_avail_spare; /* Available Spare */
+ uint8_t hl_avail_spare_thr; /* Available Spare Threshold */
+ uint8_t hl_used; /* Percentage Used */
+ uint8_t hl_rsvd1[32 - 6];
+ nvme_uint128_t hl_data_read; /* Data Units Read */
+ nvme_uint128_t hl_data_write; /* Data Units Written */
+ nvme_uint128_t hl_host_read; /* Host Read Commands */
+ nvme_uint128_t hl_host_write; /* Host Write Commands */
+ nvme_uint128_t hl_ctrl_busy; /* Controller Busy Time */
+ nvme_uint128_t hl_power_cycles; /* Power Cycles */
+ nvme_uint128_t hl_power_on_hours; /* Power On Hours */
+ nvme_uint128_t hl_unsafe_shutdn; /* Unsafe Shutdowns */
+ nvme_uint128_t hl_media_errors; /* Media Errors */
+ nvme_uint128_t hl_errors_logged; /* Number of errors logged */
+ uint8_t hl_rsvd2[512 - 192];
+} nvme_health_log_t;
+
+typedef struct {
+ uint8_t fw_afi:3; /* Active Firmware Slot */
+ uint8_t fw_rsvd1:5;
+ uint8_t fw_rsvd2[7];
+ char fw_frs[7][8]; /* Firmware Revision / Slot */
+ uint8_t fw_rsvd3[512 - 64];
+} nvme_fwslot_log_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#pragma pack() /* pack(1) */
+
+#endif /* _NVME_REG_H */
diff --git a/usr/src/uts/common/io/nvme/nvme_var.h b/usr/src/uts/common/io/nvme/nvme_var.h
new file mode 100644
index 0000000000..37f446556d
--- /dev/null
+++ b/usr/src/uts/common/io/nvme/nvme_var.h
@@ -0,0 +1,240 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ */
+
+#ifndef _NVME_VAR_H
+#define _NVME_VAR_H
+
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/blkdev.h>
+#include <sys/taskq_impl.h>
+
+/*
+ * NVMe driver state
+ */
+
+#ifdef __cplusplus
+/* extern "C" { */
+#endif
+
+#define NVME_FMA_INIT 0x1
+#define NVME_REGS_MAPPED 0x2
+#define NVME_ADMIN_QUEUE 0x4
+#define NVME_CTRL_LIMITS 0x8
+#define NVME_INTERRUPTS 0x10
+
+#define NVME_MIN_ADMIN_QUEUE_LEN 16
+#define NVME_MIN_IO_QUEUE_LEN 16
+#define NVME_DEFAULT_ADMIN_QUEUE_LEN 256
+#define NVME_DEFAULT_IO_QUEUE_LEN 1024
+#define NVME_DEFAULT_ASYNC_EVENT_LIMIT 10
+#define NVME_MIN_ASYNC_EVENT_LIMIT 1
+
+#define NVME_ADMIN_CMD_TIMEOUT 100000
+
+typedef struct nvme nvme_t;
+typedef struct nvme_namespace nvme_namespace_t;
+typedef struct nvme_dma nvme_dma_t;
+typedef struct nvme_cmd nvme_cmd_t;
+typedef struct nvme_qpair nvme_qpair_t;
+typedef struct nvme_task_arg nvme_task_arg_t;
+
+struct nvme_dma {
+ ddi_dma_handle_t nd_dmah;
+ ddi_acc_handle_t nd_acch;
+ ddi_dma_cookie_t nd_cookie;
+ uint_t nd_ncookie;
+ caddr_t nd_memp;
+ size_t nd_len;
+};
+
+struct nvme_cmd {
+ nvme_sqe_t nc_sqe;
+ nvme_cqe_t nc_cqe;
+
+ void (*nc_callback)(void *);
+ bd_xfer_t *nc_xfer;
+ boolean_t nc_completed;
+ uint16_t nc_sqid;
+
+ nvme_dma_t *nc_dma;
+
+ kmutex_t nc_mutex;
+ kcondvar_t nc_cv;
+
+ taskq_ent_t nc_tqent;
+ nvme_t *nc_nvme;
+};
+
+struct nvme_qpair {
+ size_t nq_nentry;
+
+ nvme_dma_t *nq_sqdma;
+ nvme_sqe_t *nq_sq;
+ uint_t nq_sqhead;
+ uint_t nq_sqtail;
+ uintptr_t nq_sqtdbl;
+
+ nvme_dma_t *nq_cqdma;
+ nvme_cqe_t *nq_cq;
+ uint_t nq_cqhead;
+ uint_t nq_cqtail;
+ uintptr_t nq_cqhdbl;
+
+ nvme_cmd_t **nq_cmd;
+ uint16_t nq_next_cmd;
+ uint_t nq_active_cmds;
+ int nq_phase;
+
+ kmutex_t nq_mutex;
+};
+
+struct nvme {
+ dev_info_t *n_dip;
+ int n_progress;
+
+ caddr_t n_regs;
+ ddi_acc_handle_t n_regh;
+
+ kmem_cache_t *n_cmd_cache;
+
+ size_t n_inth_sz;
+ ddi_intr_handle_t *n_inth;
+ int n_intr_cnt;
+ uint_t n_intr_pri;
+ int n_intr_cap;
+ int n_intr_type;
+ int n_intr_types;
+
+ boolean_t n_dead;
+ boolean_t n_strict_version;
+ boolean_t n_ignore_unknown_vendor_status;
+ uint32_t n_admin_queue_len;
+ uint32_t n_io_queue_len;
+ uint16_t n_async_event_limit;
+ uint16_t n_abort_command_limit;
+ uint64_t n_max_data_transfer_size;
+ boolean_t n_volatile_write_cache_enabled;
+ int n_error_log_len;
+
+ int n_nssr_supported;
+ int n_doorbell_stride;
+ int n_timeout;
+ int n_arbitration_mechanisms;
+ int n_cont_queues_reqd;
+ int n_max_queue_entries;
+ int n_pageshift;
+ int n_pagesize;
+
+ int n_namespace_count;
+ int n_ioq_count;
+
+ nvme_identify_ctrl_t *n_idctl;
+
+ nvme_qpair_t *n_adminq;
+ nvme_qpair_t **n_ioq;
+
+ nvme_namespace_t *n_ns;
+
+ ddi_dma_attr_t n_queue_dma_attr;
+ ddi_dma_attr_t n_prp_dma_attr;
+ ddi_dma_attr_t n_sgl_dma_attr;
+ ddi_device_acc_attr_t n_reg_acc_attr;
+ ddi_iblock_cookie_t n_fm_ibc;
+ int n_fm_cap;
+
+ ksema_t n_abort_sema;
+
+ ddi_taskq_t *n_cmd_taskq;
+
+ nvme_error_log_entry_t *n_error_log;
+ nvme_health_log_t *n_health_log;
+ nvme_fwslot_log_t *n_fwslot_log;
+
+ /* errors detected by driver */
+ uint32_t n_dma_bind_err;
+ uint32_t n_abort_failed;
+ uint32_t n_cmd_timeout;
+ uint32_t n_cmd_aborted;
+ uint32_t n_async_resubmit_failed;
+ uint32_t n_wrong_logpage;
+ uint32_t n_unknown_logpage;
+ uint32_t n_too_many_cookies;
+ uint32_t n_admin_queue_full;
+
+ /* errors detected by hardware */
+ uint32_t n_data_xfr_err;
+ uint32_t n_internal_err;
+ uint32_t n_abort_rq_err;
+ uint32_t n_abort_sq_del;
+ uint32_t n_nvm_cap_exc;
+ uint32_t n_nvm_ns_notrdy;
+ uint32_t n_inv_cq_err;
+ uint32_t n_inv_qid_err;
+ uint32_t n_max_qsz_exc;
+ uint32_t n_inv_int_vect;
+ uint32_t n_inv_log_page;
+ uint32_t n_inv_format;
+ uint32_t n_inv_q_del;
+ uint32_t n_cnfl_attr;
+ uint32_t n_inv_prot;
+ uint32_t n_readonly;
+
+ /* errors reported by asynchronous events */
+ uint32_t n_diagfail_event;
+ uint32_t n_persistent_event;
+ uint32_t n_transient_event;
+ uint32_t n_fw_load_event;
+ uint32_t n_reliability_event;
+ uint32_t n_temperature_event;
+ uint32_t n_spare_event;
+ uint32_t n_vendor_event;
+ uint32_t n_unknown_event;
+
+};
+
+struct nvme_namespace {
+ nvme_t *ns_nvme;
+ bd_handle_t ns_bd_hdl;
+
+ uint32_t ns_id;
+ size_t ns_block_count;
+ size_t ns_block_size;
+ size_t ns_best_block_size;
+
+ boolean_t ns_ignore;
+
+ nvme_identify_nsid_t *ns_idns;
+
+ /*
+ * Section 7.7 of the spec describes how to get a unique ID for
+ * the controller: the vendor ID, the model name and the serial
+ * number shall be unique when combined.
+ *
+ * We add the hex namespace ID to get a unique ID for the namespace.
+ */
+ char ns_devid[4 + 1 + 20 + 1 + 40 + 1 + 8 + 1];
+};
+
+struct nvme_task_arg {
+ nvme_t *nt_nvme;
+ nvme_cmd_t *nt_cmd;
+};
+
+#ifdef __cplusplus
+/* } */
+#endif
+
+#endif /* _NVME_VAR_H */
diff --git a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c
index db9197ca36..f065e09abf 100644
--- a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c
+++ b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright 2015, Joyent, Inc.
* Copyright 2014 OmniTI Computer Consulting, Inc. All rights reserved.
* Copyright (c) 2014, Tegile Systems Inc. All rights reserved.
@@ -410,7 +410,7 @@ static void mptsas_record_event(void *args);
static int mptsas_reg_access(mptsas_t *mpt, mptsas_reg_access_t *data,
int mode);
-mptsas_target_t *mptsas_tgt_alloc(mptsas_t *, uint16_t, uint64_t,
+mptsas_target_t *mptsas_tgt_alloc(refhash_t *, uint16_t, uint64_t,
uint32_t, mptsas_phymask_t, uint8_t);
static mptsas_smp_t *mptsas_smp_alloc(mptsas_t *, mptsas_smp_t *);
static int mptsas_online_smp(dev_info_t *pdip, mptsas_smp_t *smp_node,
@@ -782,6 +782,23 @@ mptsas_target_addr_cmp(const void *a, const void *b)
return ((int)bap->mta_phymask - (int)aap->mta_phymask);
}
+static uint64_t
+mptsas_tmp_target_hash(const void *tp)
+{
+ return ((uint64_t)(uintptr_t)tp);
+}
+
+static int
+mptsas_tmp_target_cmp(const void *a, const void *b)
+{
+ if (a > b)
+ return (1);
+	if (a < b)
+ return (-1);
+
+ return (0);
+}
+
static void
mptsas_target_free(void *op)
{
@@ -808,6 +825,7 @@ mptsas_destroy_hashes(mptsas_t *mpt)
sp = refhash_next(mpt->m_smp_targets, sp)) {
refhash_remove(mpt->m_smp_targets, sp);
}
+ refhash_destroy(mpt->m_tmp_targets);
refhash_destroy(mpt->m_targets);
refhash_destroy(mpt->m_smp_targets);
mpt->m_targets = NULL;
@@ -1365,6 +1383,16 @@ mptsas_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
offsetof(mptsas_target_t, m_addr), KM_SLEEP);
/*
+ * The refhash for temporary targets uses the address of the target
+ * struct itself as tag, so the tag offset is 0. See the implementation
+ * of mptsas_tmp_target_hash() and mptsas_tmp_target_cmp().
+ */
+ mpt->m_tmp_targets = refhash_create(MPTSAS_TMP_TARGET_BUCKET_COUNT,
+ mptsas_tmp_target_hash, mptsas_tmp_target_cmp,
+ mptsas_target_free, sizeof (mptsas_target_t),
+ offsetof(mptsas_target_t, m_link), 0, KM_SLEEP);
+
+ /*
* Fill in the phy_info structure and get the base WWID
*/
if (mptsas_get_manufacture_page5(mpt) == DDI_FAILURE) {
@@ -1550,6 +1578,8 @@ fail:
mptsas_hba_teardown(mpt);
}
+ if (mpt->m_tmp_targets)
+ refhash_destroy(mpt->m_tmp_targets);
if (mpt->m_targets)
refhash_destroy(mpt->m_targets);
if (mpt->m_smp_targets)
@@ -6375,10 +6405,15 @@ mptsas_handle_topo_change(mptsas_topo_change_list_t *topo_node,
mptsas_log(mpt, CE_NOTE,
"mptsas_handle_topo_change: could not "
"allocate memory. \n");
+ } else if (rval == DEV_INFO_FAIL_GUID) {
+ mptsas_log(mpt, CE_NOTE,
+ "mptsas_handle_topo_change: could not "
+ "get SATA GUID for target %d. \n",
+ topo_node->devhdl);
}
/*
- * If rval is DEV_INFO_PHYS_DISK than there is nothing
- * else to do, just leave.
+ * If rval is DEV_INFO_PHYS_DISK or indicates failure
+ * then there is nothing else to do, just leave.
*/
if (rval != DEV_INFO_SUCCESS) {
return;
@@ -9876,6 +9911,61 @@ mptsas_watch(void *arg)
}
static void
+mptsas_watchsubr_tgt(mptsas_t *mpt, mptsas_target_t *ptgt, hrtime_t timestamp)
+{
+ mptsas_cmd_t *cmd;
+
+ /*
+ * If we were draining due to a qfull condition,
+ * go back to full throttle.
+ */
+ if ((ptgt->m_t_throttle < MAX_THROTTLE) &&
+ (ptgt->m_t_throttle > HOLD_THROTTLE) &&
+ (ptgt->m_t_ncmds < ptgt->m_t_throttle)) {
+ mptsas_set_throttle(mpt, ptgt, MAX_THROTTLE);
+ mptsas_restart_hba(mpt);
+ }
+
+ cmd = TAILQ_LAST(&ptgt->m_active_cmdq, mptsas_active_cmdq);
+ if (cmd == NULL)
+ return;
+
+ if (cmd->cmd_active_expiration <= timestamp) {
+ /*
+ * Earliest command timeout expired. Drain throttle.
+ */
+ mptsas_set_throttle(mpt, ptgt, DRAIN_THROTTLE);
+
+ /*
+ * Check for remaining commands.
+ */
+ cmd = TAILQ_FIRST(&ptgt->m_active_cmdq);
+ if (cmd->cmd_active_expiration > timestamp) {
+ /*
+ * Wait for remaining commands to complete or
+ * time out.
+ */
+ NDBG23(("command timed out, pending drain"));
+ return;
+ }
+
+ /*
+ * All command timeouts expired.
+ */
+ mptsas_log(mpt, CE_NOTE, "Timeout of %d seconds "
+ "expired with %d commands on target %d lun %d.",
+ cmd->cmd_pkt->pkt_time, ptgt->m_t_ncmds,
+ ptgt->m_devhdl, Lun(cmd));
+
+ mptsas_cmd_timeout(mpt, ptgt);
+ } else if (cmd->cmd_active_expiration <=
+ timestamp + (hrtime_t)mptsas_scsi_watchdog_tick * NANOSEC) {
+ NDBG23(("pending timeout"));
+ mptsas_set_throttle(mpt, ptgt, DRAIN_THROTTLE);
+ }
+}
+
+static void
mptsas_watchsubr(mptsas_t *mpt)
{
int i;
@@ -9926,54 +10016,12 @@ mptsas_watchsubr(mptsas_t *mpt)
for (ptgt = refhash_first(mpt->m_targets); ptgt != NULL;
ptgt = refhash_next(mpt->m_targets, ptgt)) {
- /*
- * If we were draining due to a qfull condition,
- * go back to full throttle.
- */
- if ((ptgt->m_t_throttle < MAX_THROTTLE) &&
- (ptgt->m_t_throttle > HOLD_THROTTLE) &&
- (ptgt->m_t_ncmds < ptgt->m_t_throttle)) {
- mptsas_set_throttle(mpt, ptgt, MAX_THROTTLE);
- mptsas_restart_hba(mpt);
- }
-
- cmd = TAILQ_LAST(&ptgt->m_active_cmdq, mptsas_active_cmdq);
- if (cmd == NULL)
- continue;
-
- if (cmd->cmd_active_expiration <= timestamp) {
- /*
- * Earliest command timeout expired. Drain throttle.
- */
- mptsas_set_throttle(mpt, ptgt, DRAIN_THROTTLE);
-
- /*
- * Check for remaining commands.
- */
- cmd = TAILQ_FIRST(&ptgt->m_active_cmdq);
- if (cmd->cmd_active_expiration > timestamp) {
- /*
- * Wait for remaining commands to complete or
- * time out.
- */
- NDBG23(("command timed out, pending drain"));
- continue;
- }
-
- /*
- * All command timeouts expired.
- */
- mptsas_log(mpt, CE_NOTE, "Timeout of %d seconds "
- "expired with %d commands on target %d lun %d.",
- cmd->cmd_pkt->pkt_time, ptgt->m_t_ncmds,
- ptgt->m_devhdl, Lun(cmd));
+ mptsas_watchsubr_tgt(mpt, ptgt, timestamp);
+ }
- mptsas_cmd_timeout(mpt, ptgt);
- } else if (cmd->cmd_active_expiration <=
- timestamp + (hrtime_t)mptsas_scsi_watchdog_tick * NANOSEC) {
- NDBG23(("pending timeout"));
- mptsas_set_throttle(mpt, ptgt, DRAIN_THROTTLE);
- }
+ for (ptgt = refhash_first(mpt->m_tmp_targets); ptgt != NULL;
+ ptgt = refhash_next(mpt->m_tmp_targets, ptgt)) {
+ mptsas_watchsubr_tgt(mpt, ptgt, timestamp);
}
}
@@ -13555,28 +13603,32 @@ mptsas_get_target_device_info(mptsas_t *mpt, uint32_t page_address,
*/
if (dev_info & (MPI2_SAS_DEVICE_INFO_SATA_DEVICE |
MPI2_SAS_DEVICE_INFO_ATAPI_DEVICE)) {
+ /* alloc a temporary target to send the cmd to */
+ tmp_tgt = mptsas_tgt_alloc(mpt->m_tmp_targets, *dev_handle,
+ 0, dev_info, 0, 0);
mutex_exit(&mpt->m_mutex);
- /* alloc a tmp_tgt to send the cmd */
- tmp_tgt = kmem_zalloc(sizeof (struct mptsas_target),
- KM_SLEEP);
- tmp_tgt->m_devhdl = *dev_handle;
- tmp_tgt->m_deviceinfo = dev_info;
- tmp_tgt->m_qfull_retries = QFULL_RETRIES;
- tmp_tgt->m_qfull_retry_interval =
- drv_usectohz(QFULL_RETRY_INTERVAL * 1000);
- tmp_tgt->m_t_throttle = MAX_THROTTLE;
+
devicename = mptsas_get_sata_guid(mpt, tmp_tgt, 0);
- kmem_free(tmp_tgt, sizeof (struct mptsas_target));
- mutex_enter(&mpt->m_mutex);
+
+ if (devicename == -1) {
+ mutex_enter(&mpt->m_mutex);
+ refhash_remove(mpt->m_tmp_targets, tmp_tgt);
+ rval = DEV_INFO_FAIL_GUID;
+ return (rval);
+ }
+
if (devicename != 0 && (((devicename >> 56) & 0xf0) == 0x50)) {
sas_wwn = devicename;
} else if (dev_info & MPI2_SAS_DEVICE_INFO_DIRECT_ATTACH) {
sas_wwn = 0;
}
+
+ mutex_enter(&mpt->m_mutex);
+ refhash_remove(mpt->m_tmp_targets, tmp_tgt);
}
phymask = mptsas_physport_to_phymask(mpt, physport);
- *pptgt = mptsas_tgt_alloc(mpt, *dev_handle, sas_wwn,
+ *pptgt = mptsas_tgt_alloc(mpt->m_targets, *dev_handle, sas_wwn,
dev_info, phymask, phynum);
if (*pptgt == NULL) {
mptsas_log(mpt, CE_WARN, "Failed to allocated target"
@@ -13609,6 +13661,7 @@ inq83_retry:
if (rval != DDI_SUCCESS) {
mptsas_log(mpt, CE_WARN, "!mptsas request inquiry page "
"0x83 for target:%x, lun:%x failed!", target, lun);
+ sata_guid = -1;
goto out;
}
/* According to SAT2, the first descriptor is logic unit name */
@@ -14442,7 +14495,8 @@ mptsas_update_hashtab(struct mptsas *mpt)
rval = mptsas_get_target_device_info(mpt, page_address,
&dev_handle, &ptgt);
if ((rval == DEV_INFO_FAIL_PAGE0) ||
- (rval == DEV_INFO_FAIL_ALLOC)) {
+ (rval == DEV_INFO_FAIL_ALLOC) ||
+ (rval == DEV_INFO_FAIL_GUID)) {
break;
}
@@ -16119,7 +16173,8 @@ mptsas_phy_to_tgt(mptsas_t *mpt, mptsas_phymask_t phymask, uint8_t phy)
rval = mptsas_get_target_device_info(mpt, page_address,
&cur_handle, &ptgt);
if ((rval == DEV_INFO_FAIL_PAGE0) ||
- (rval == DEV_INFO_FAIL_ALLOC)) {
+ (rval == DEV_INFO_FAIL_ALLOC) ||
+ (rval == DEV_INFO_FAIL_GUID)) {
break;
}
if ((rval == DEV_INFO_WRONG_DEVICE_TYPE) ||
@@ -16188,7 +16243,8 @@ mptsas_wwid_to_ptgt(mptsas_t *mpt, mptsas_phymask_t phymask, uint64_t wwid)
rval = mptsas_get_target_device_info(mpt, page_address,
&cur_handle, &tmp_tgt);
if ((rval == DEV_INFO_FAIL_PAGE0) ||
- (rval == DEV_INFO_FAIL_ALLOC)) {
+ (rval == DEV_INFO_FAIL_ALLOC) ||
+ (rval == DEV_INFO_FAIL_GUID)) {
tmp_tgt = NULL;
break;
}
@@ -16256,7 +16312,7 @@ mptsas_wwid_to_psmp(mptsas_t *mpt, mptsas_phymask_t phymask, uint64_t wwid)
}
mptsas_target_t *
-mptsas_tgt_alloc(mptsas_t *mpt, uint16_t devhdl, uint64_t wwid,
+mptsas_tgt_alloc(refhash_t *refhash, uint16_t devhdl, uint64_t wwid,
uint32_t devinfo, mptsas_phymask_t phymask, uint8_t phynum)
{
mptsas_target_t *tmp_tgt = NULL;
@@ -16264,7 +16320,7 @@ mptsas_tgt_alloc(mptsas_t *mpt, uint16_t devhdl, uint64_t wwid,
addr.mta_wwn = wwid;
addr.mta_phymask = phymask;
- tmp_tgt = refhash_lookup(mpt->m_targets, &addr);
+ tmp_tgt = refhash_lookup(refhash, &addr);
if (tmp_tgt != NULL) {
NDBG20(("Hash item already exist"));
tmp_tgt->m_deviceinfo = devinfo;
@@ -16288,7 +16344,7 @@ mptsas_tgt_alloc(mptsas_t *mpt, uint16_t devhdl, uint64_t wwid,
tmp_tgt->m_t_throttle = MAX_THROTTLE;
TAILQ_INIT(&tmp_tgt->m_active_cmdq);
- refhash_insert(mpt->m_targets, tmp_tgt);
+ refhash_insert(refhash, tmp_tgt);
return (tmp_tgt);
}
diff --git a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_raid.c b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_raid.c
index 371db950e5..728730a176 100644
--- a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_raid.c
+++ b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_raid.c
@@ -23,6 +23,7 @@
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright (c) 2014, Tegile Systems Inc. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
/*
@@ -92,7 +93,7 @@ static int mptsas_get_raid_wwid(mptsas_t *mpt, mptsas_raidvol_t *raidvol);
extern int mptsas_check_dma_handle(ddi_dma_handle_t handle);
extern int mptsas_check_acc_handle(ddi_acc_handle_t handle);
-extern mptsas_target_t *mptsas_tgt_alloc(mptsas_t *, uint16_t,
+extern mptsas_target_t *mptsas_tgt_alloc(refhash_t *, uint16_t,
uint64_t, uint32_t, mptsas_phymask_t, uint8_t);
static int
@@ -216,7 +217,7 @@ mptsas_raidconf_page_0_cb(mptsas_t *mpt, caddr_t page_memp,
/*
* RAID uses phymask of 0.
*/
- ptgt = mptsas_tgt_alloc(mpt,
+ ptgt = mptsas_tgt_alloc(mpt->m_targets,
voldevhandle, raidwwn, 0, 0, 0);
raidconfig->m_raidvol[vol].m_raidtgt =
diff --git a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h
index 2689fe27c4..02116b45c4 100644
--- a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h
+++ b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright 2015, Joyent, Inc.
* Copyright (c) 2014, Tegile Systems Inc. All rights reserved.
*/
@@ -100,6 +100,7 @@ typedef uint16_t mptsas_phymask_t;
*/
#define MPTSAS_SMP_BUCKET_COUNT 23
#define MPTSAS_TARGET_BUCKET_COUNT 97
+#define MPTSAS_TMP_TARGET_BUCKET_COUNT 13
/*
* MPT HW defines
@@ -557,6 +558,7 @@ _NOTE(DATA_READABLE_WITHOUT_LOCK(mptsas_topo_change_list_t::flags))
#define DEV_INFO_WRONG_DEVICE_TYPE 0x2
#define DEV_INFO_PHYS_DISK 0x3
#define DEV_INFO_FAIL_ALLOC 0x4
+#define DEV_INFO_FAIL_GUID 0x5
/*
* mpt hotplug event defines
@@ -699,6 +701,7 @@ typedef struct mptsas {
refhash_t *m_targets;
refhash_t *m_smp_targets;
+ refhash_t *m_tmp_targets;
m_raidconfig_t m_raidconfig[MPTSAS_MAX_RAIDCONFIGS];
uint8_t m_num_raid_configs;
diff --git a/usr/src/uts/intel/Makefile.intel b/usr/src/uts/intel/Makefile.intel
index 75ba9472c2..740f0a74a5 100644
--- a/usr/src/uts/intel/Makefile.intel
+++ b/usr/src/uts/intel/Makefile.intel
@@ -19,7 +19,7 @@
#
# Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
-# Copyright (c) 2014 Nexenta Systems, Inc. All rights reserved.
+# Copyright (c) 2015 Nexenta Systems, Inc. All rights reserved.
# Copyright (c) 2013 Andrew Stormont. All rights reserved.
# Copyright (c) 2015, Joyent, Inc. All rights reserved.
@@ -292,6 +292,7 @@ DRV_KMODS += nfp
DRV_KMODS += nsmb
DRV_KMODS += nulldriver
DRV_KMODS += nv_sata
+DRV_KMODS += nvme
DRV_KMODS += nxge
DRV_KMODS += oce
DRV_KMODS += openeepr
diff --git a/usr/src/uts/intel/nvme/Makefile b/usr/src/uts/intel/nvme/Makefile
new file mode 100644
index 0000000000..529d4378a2
--- /dev/null
+++ b/usr/src/uts/intel/nvme/Makefile
@@ -0,0 +1,73 @@
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+#
+
+#
+# Paths to the base of the uts directory trees
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE = nvme
+OBJECTS = $(NVME_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(NVME_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE)
+CONF_SRCDIR = $(UTSBASE)/common/io/nvme
+#
+# Include common rules.
+#
+include $(UTSBASE)/intel/Makefile.intel
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)
+
+#
+# Driver depends on blkdev
+#
+LDFLAGS += -dy -N drv/blkdev
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/intel/Makefile.targ