From 0d045c0d0cb001d79480ee33be28514e847f8612 Mon Sep 17 00:00:00 2001 From: Robert Mustacchi Date: Tue, 2 Jun 2015 17:12:04 +0000 Subject: 6209 libc mutexes break kernel writers hearts Reviewed by: Jerry Jelinek Reviewed by: Josef 'Jeff' Sipek Reviewed by: Dan McDonald Reviewed by: Garrett D'Amore Approved by: Dan McDonald --- usr/src/head/synch.h | 6 ++++ usr/src/lib/libc/inc/thr_uberdata.h | 4 ++- usr/src/lib/libc/port/mapfile-vers | 2 ++ usr/src/lib/libc/port/threads/assfail.c | 10 +++++++ usr/src/lib/libc/port/threads/synch.c | 43 +++++++++++++++++++++++++++ usr/src/lib/libzpool/common/kernel.c | 4 +-- usr/src/lib/libzpool/common/sys/zfs_context.h | 6 ++-- 7 files changed, 70 insertions(+), 5 deletions(-) (limited to 'usr/src') diff --git a/usr/src/head/synch.h b/usr/src/head/synch.h index c0f68f12f0..dda7fa0a3c 100644 --- a/usr/src/head/synch.h +++ b/usr/src/head/synch.h @@ -211,6 +211,12 @@ void smt_pause(void); #endif /* _ASM */ +/* + * Panicking versions of our favorite friends. + */ +void mutex_enter(mutex_t *); +void mutex_exit(mutex_t *); + #ifdef __cplusplus } #endif diff --git a/usr/src/lib/libc/inc/thr_uberdata.h b/usr/src/lib/libc/inc/thr_uberdata.h index de0d4a6b05..4815d11486 100644 --- a/usr/src/lib/libc/inc/thr_uberdata.h +++ b/usr/src/lib/libc/inc/thr_uberdata.h @@ -23,7 +23,7 @@ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. */ /* - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. */ #ifndef _THR_UBERDATA_H @@ -1229,6 +1229,8 @@ extern void getgregs(ulwp_t *, gregset_t); extern void setgregs(ulwp_t *, gregset_t); extern void thr_panic(const char *); #pragma rarely_called(thr_panic) +extern void mutex_panic(mutex_t *, const char *); +#pragma rarely_called(mutex_panic) extern ulwp_t *find_lwp(thread_t); extern void finish_init(void); extern void update_sched(ulwp_t *); diff --git a/usr/src/lib/libc/port/mapfile-vers b/usr/src/lib/libc/port/mapfile-vers index c4571ef2f1..c6967141b6 100644 --- a/usr/src/lib/libc/port/mapfile-vers +++ b/usr/src/lib/libc/port/mapfile-vers @@ -2915,6 +2915,8 @@ $endif msgctl64; __multi_innetgr; _mutex_destroy { FLAGS = NODYNSORT }; + mutex_enter; + mutex_exit; mutex_held; _mutex_init { FLAGS = NODYNSORT }; _mutex_unlock { FLAGS = NODYNSORT }; diff --git a/usr/src/lib/libc/port/threads/assfail.c b/usr/src/lib/libc/port/threads/assfail.c index 8aebefbe4a..b40e6dc029 100644 --- a/usr/src/lib/libc/port/threads/assfail.c +++ b/usr/src/lib/libc/port/threads/assfail.c @@ -25,6 +25,7 @@ */ /* * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include "lint.h" @@ -36,6 +37,8 @@ ulwp_t *panic_thread; static mutex_t assert_lock = DEFAULTMUTEX; static ulwp_t *assert_thread = NULL; +mutex_t *panic_mutex = NULL; + /* * Called from __assert() to set panicstr and panic_thread. */ @@ -129,6 +132,13 @@ aio_panic(const char *why) common_panic("*** libc aio system failure: ", why); } +void +mutex_panic(mutex_t *mp, const char *why) +{ + panic_mutex = mp; + common_panic("*** libc mutex system failure: ", why); +} + /* * Utility function for converting a long integer to a string, avoiding stdio. * 'base' must be one of 10 or 16 diff --git a/usr/src/lib/libc/port/threads/synch.c b/usr/src/lib/libc/port/threads/synch.c index a7c4aed9ef..b4efb58d17 100644 --- a/usr/src/lib/libc/port/threads/synch.c +++ b/usr/src/lib/libc/port/threads/synch.c @@ -22,6 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
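A minimal usage sketch for the new interfaces (illustrative only, not part of this patch; the names count_lock, count, and count_bump are invented for the example): kernel-style code ported to userland can call mutex_enter() and mutex_exit() directly, provided the mutex carries the LOCK_ERRORCHECK attribute, e.g. by initializing it with ERRORCHECKMUTEX. Misuse -- recursive entry, or an exit by a thread that is not the owner -- panics the process via mutex_panic() instead of failing silently:

	#include <synch.h>

	static mutex_t count_lock = ERRORCHECKMUTEX;
	static uint64_t count;

	void
	count_bump(void)
	{
		mutex_enter(&count_lock);	/* EDEADLK here would panic */
		count++;
		mutex_exit(&count_lock);	/* EPERM here would panic */
	}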
* Use is subject to license terms. + * Copyright 2015, Joyent, Inc. */ #include "lint.h" @@ -2314,6 +2315,29 @@ mutex_lock(mutex_t *mp) return (mutex_lock_impl(mp, NULL)); } +void +mutex_enter(mutex_t *mp) +{ + int ret; + int attr = mp->mutex_type & ALL_ATTRIBUTES; + + /* + * Require LOCK_ERRORCHECK, accept LOCK_RECURSIVE. + */ + if (attr != LOCK_ERRORCHECK && + attr != (LOCK_ERRORCHECK | LOCK_RECURSIVE)) { + mutex_panic(mp, "mutex_enter: bad mutex type"); + } + ret = mutex_lock(mp); + if (ret == EDEADLK) { + mutex_panic(mp, "recursive mutex_enter"); + } else if (ret == EAGAIN) { + mutex_panic(mp, "excessive recursive mutex_enter"); + } else if (ret != 0) { + mutex_panic(mp, "unknown mutex_enter failure"); + } +} + int pthread_mutex_timedlock(pthread_mutex_t *_RESTRICT_KYWD mp, const struct timespec *_RESTRICT_KYWD abstime) @@ -2573,6 +2597,25 @@ slow_unlock: return (mutex_unlock_internal(mp, 0)); } +void +mutex_exit(mutex_t *mp) +{ + int ret; + int attr = mp->mutex_type & ALL_ATTRIBUTES; + + if (attr != LOCK_ERRORCHECK && + attr != (LOCK_ERRORCHECK | LOCK_RECURSIVE)) { + mutex_panic(mp, "mutex_exit: bad mutex type"); + } + ret = mutex_unlock(mp); + if (ret == EPERM) { + mutex_panic(mp, "mutex_exit: not owner"); + } else if (ret != 0) { + mutex_panic(mp, "unknown mutex_exit failure"); + } + +} + /* * Internally to the library, almost all mutex lock/unlock actions * go through these lmutex_ functions, to protect critical regions. diff --git a/usr/src/lib/libzpool/common/kernel.c b/usr/src/lib/libzpool/common/kernel.c index dd4221deb5..a74276e95e 100644 --- a/usr/src/lib/libzpool/common/kernel.c +++ b/usr/src/lib/libzpool/common/kernel.c @@ -156,7 +156,7 @@ zmutex_destroy(kmutex_t *mp) } void -mutex_enter(kmutex_t *mp) +zmutex_enter(kmutex_t *mp) { ASSERT(mp->initialized == B_TRUE); ASSERT(mp->m_owner != (void *)-1UL); @@ -181,7 +181,7 @@ mutex_tryenter(kmutex_t *mp) } void -mutex_exit(kmutex_t *mp) +zmutex_exit(kmutex_t *mp) { ASSERT(mp->initialized == B_TRUE); ASSERT(mutex_owner(mp) == curthread); diff --git a/usr/src/lib/libzpool/common/sys/zfs_context.h b/usr/src/lib/libzpool/common/sys/zfs_context.h index 6216006dd4..9e4d8ed0b8 100644 --- a/usr/src/lib/libzpool/common/sys/zfs_context.h +++ b/usr/src/lib/libzpool/common/sys/zfs_context.h @@ -225,11 +225,13 @@ extern int _mutex_destroy(mutex_t *mp); #define mutex_init(mp, b, c, d) zmutex_init((kmutex_t *)(mp)) #define mutex_destroy(mp) zmutex_destroy((kmutex_t *)(mp)) +#define mutex_enter(mp) zmutex_enter(mp) +#define mutex_exit(mp) zmutex_exit(mp) extern void zmutex_init(kmutex_t *mp); extern void zmutex_destroy(kmutex_t *mp); -extern void mutex_enter(kmutex_t *mp); -extern void mutex_exit(kmutex_t *mp); +extern void zmutex_enter(kmutex_t *mp); +extern void zmutex_exit(kmutex_t *mp); extern int mutex_tryenter(kmutex_t *mp); extern void *mutex_owner(kmutex_t *mp); -- cgit v1.2.3 From b08923d6c9c63a4f4b647b84d9454d8124fcedd7 Mon Sep 17 00:00:00 2001 From: Robert Mustacchi Date: Thu, 21 May 2015 14:49:36 +0000 Subject: 6210 ping can misreport ICMP latency 6211 want warnings in the face of long running name lookups for ping 6212 Want sub-second ping interval support 6213 clean up warnings in ping Reviewed by: Jerry Jelinek Reviewed by: Joshua M. 
Clulow Reviewed by: Hans Rosenfeld Reviewed by: Josef 'Jeff' Sipek Approved by: Dan McDonald --- usr/src/cmd/cmd-inet/usr.sbin/ping/Makefile | 11 +- usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c | 224 +++++++++++++++++++++++-- usr/src/cmd/cmd-inet/usr.sbin/ping/ping.h | 6 +- usr/src/cmd/cmd-inet/usr.sbin/ping/ping_aux.c | 27 +-- usr/src/cmd/cmd-inet/usr.sbin/ping/ping_aux6.c | 48 +++--- usr/src/man/man1m/ping.1m | 53 +++++- 6 files changed, 309 insertions(+), 60 deletions(-) (limited to 'usr/src') diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ping/Makefile b/usr/src/cmd/cmd-inet/usr.sbin/ping/Makefile index 88f36be473..9724fff4e5 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ping/Makefile +++ b/usr/src/cmd/cmd-inet/usr.sbin/ping/Makefile @@ -38,17 +38,18 @@ $(ROOTUSRSBIN)/ping := FILEMODE= 04555 # when IPv6 inspired new interfaces are part of standards. LDLIBS += -lxnet -lsocket -lnsl -lm -linetutil -# These #defines are required to use UNIX 98 interfaces -CPPFLAGS += -D_XOPEN_SOURCE=500 -D__EXTENSIONS__ +# These #defines are required to use SUSv3 interfaces +CPPFLAGS += -D_XOPEN_SOURCE=600 -D__EXTENSIONS__ + +C99MODE= -xc99=%all # Setting the above defines to use the UNIX98 ancillary data feature # causes lint to output warnings about lint library declarations conflicting # with those in the header files. Since we need these features the best # course of action is to switch the types of the resulting warnings off # when running lint. -LINTFLAGS += -erroff=E_INCONS_VAL_TYPE_DECL2 -erroff=E_INCONS_ARG_DECL2 - -CERRWARN += -_gcc=-Wno-uninitialized +LINTFLAGS += -erroff=E_INCONS_VAL_TYPE_DECL2 -erroff=E_INCONS_ARG_DECL2 \ + -erroff=E_NAME_USED_NOT_DEF2 .KEEP_STATE: .PARALLEL: diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c b/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c index 2146efd67b..2d79419245 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c @@ -37,6 +37,11 @@ * contributors. */ +/* + * Copyright 2015, Joyent, Inc. + */ + +#include #include #include #include @@ -45,6 +50,9 @@ #include #include #include +#include +#include +#include #include #include @@ -53,6 +61,7 @@ #include #include #include +#include #include #include @@ -153,7 +162,6 @@ static int eff_num_gw; /* effective number of gateways */ static int num_wraps = -1; /* no of times 64K icmp_seq wrapped */ static ushort_t dest_port = 32768 + 666; /* starting port for the UDP probes */ static char *gw_list[MAXMAX_GWS]; /* list of gateways as user enters */ -static int interval = 1; /* interval between transmissions */ static int options; /* socket options */ static int moptions; /* multicast options */ int npackets; /* number of packets to send */ @@ -164,6 +172,22 @@ static int timeout = TIMEOUT; /* timeout value (sec) for probes */ static struct if_entry out_if; /* interface argument */ int ident; /* ID for this ping run */ static hrtime_t t_last_probe_sent; /* the time we sent the last probe */ +static timer_t timer; /* timer for waiting */ +static volatile boolean_t timer_done = _B_FALSE; /* timer finished? */ +static struct itimerspec interval = { { 0, 0 }, { 1, 0 } }; /* Interval for */ + /* -I. The default interval is 1s. */ +static hrtime_t mintime = NSEC2MSEC(500); /* minimum time between pings */ + +/* + * Globals for our name services warning. See ns_warning_thr() for more on why + * this exists. 
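+ * For example (illustrative, not in the original comment): if the
+ * configured DNS server is unreachable, each getnameinfo() call in
+ * pr_name() can block for the full resolver timeout -- often tens of
+ * seconds -- while ICMP replies continue to arrive.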
+ */ +static mutex_t ns_lock = ERRORCHECKMUTEX; /* Protects the following data */ +static boolean_t ns_active = _B_FALSE; /* Lookup is going on */ +static hrtime_t ns_starttime; /* Time the lookup started */ +static int ns_sleeptime = 2; /* Time in seconds between checks */ +static int ns_warntime = 2; /* Time in seconds before warning */ +static int ns_warninter = 60; /* Time in seconds between warnings */ /* * This buffer stores the received packets. Currently it needs to be 32 bit @@ -203,6 +227,8 @@ static ushort_t in_cksum(ushort_t *, int); static int int_arg(char *s, char *what); boolean_t is_a_target(struct addrinfo *, union any_in_addr *); static void mirror_gws(union any_in_addr *, int); +static void *ns_warning_thr(void *); +static void parse_interval(char *s); static void pinger(int, struct sockaddr *, struct msghdr *, int); char *pr_name(char *, int); char *pr_protocol(int); @@ -249,6 +275,8 @@ main(int argc, char *argv[]) progname = argv[0]; + (void) setlocale(LC_ALL, ""); + /* * This program needs the net_icmpaccess privilege for creating * raw ICMP sockets. It needs sys_ip_config for using the @@ -322,7 +350,7 @@ main(int argc, char *argv[]) case 'I': stats = _B_TRUE; - interval = int_arg(optarg, "interval"); + parse_interval(optarg); break; case 'i': @@ -689,6 +717,23 @@ main(int argc, char *argv[]) } } + /* Create our timer for future use */ + if (timer_create(CLOCK_REALTIME, NULL, &timer) != 0) { + Fprintf(stderr, "%s: failed to create timer: %s\n", + progname, strerror(errno)); + exit(EXIT_FAILURE); + } + + /* + * Finally start up the name services warning thread. + */ + if (thr_create(NULL, 0, ns_warning_thr, NULL, + THR_DETACHED | THR_DAEMON, NULL) != 0) { + Fprintf(stderr, "%s: failed to create name services " + "thread: %s\n", progname, strerror(errno)); + exit(EXIT_FAILURE); + } + /* Let's get things going */ send_scheduled_probe(); @@ -1142,8 +1187,8 @@ select_src_addr(union any_in_addr *dst_addr, int family, union any_in_addr *src_addr) { struct sockaddr *sock; - struct sockaddr_in *sin; - struct sockaddr_in6 *sin6; + struct sockaddr_in *sin = NULL; + struct sockaddr_in6 *sin6 = NULL; int tmp_fd; size_t sock_len; @@ -1202,8 +1247,10 @@ select_src_addr(union any_in_addr *dst_addr, int family, } if (family == AF_INET) { + assert(sin != NULL); src_addr->addr = sin->sin_addr; } else { + assert(sin6 != NULL); src_addr->addr6 = sin6->sin6_addr; } @@ -1606,6 +1653,14 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index, } } + /* Ensure that timestamping is requested on the receive socket */ + if (setsockopt(recv_sock, SOL_SOCKET, SO_TIMESTAMP, + &on, sizeof (on)) == -1) { + Fprintf(stderr, "%s: warning: timing accuracy diminished -- " + "setsockopt SO_TIMESTAMP failed %s", progname, + strerror(errno)); + } + *send_sockp = send_sock; *recv_sockp = recv_sock; @@ -1683,14 +1738,21 @@ void sigalrm_handler(void) { /* - * Guard againist denial-of-service attacks. Make sure ping doesn't - * send probes for every SIGALRM it receives. Evil hacker can generate - * SIGALRMs as fast as it can, but ping will ignore those which are - * received too soon (earlier than 0.5 sec) after it sent the last - * probe. We use gethrtime() instead of gettimeofday() because - * the latter is not linear and is prone to resetting or drifting + * If we've been told that we're done, the timer should be cancelled + * and not rescheduled, just return. + */ + if (timer_done == _B_TRUE) + return; + + /* + * Guard against denial-of-service attacks. 
Make sure ping doesn't send + * probes for every SIGALRM it receives in the case of errant SIGALRMs. + * ping will ignore those which are received too soon (the smaller of + * 0.5 sec and the ping interval, if in effect) after it sent the last + * probe. We use gethrtime() instead of gettimeofday() because the + * latter is not linear and is prone to resetting or drifting. */ - if ((gethrtime() - t_last_probe_sent) < 500000000) { + if ((gethrtime() - t_last_probe_sent) < mintime) { return; } send_scheduled_probe(); @@ -1704,10 +1766,12 @@ void schedule_sigalrm(void) { int waittime; + struct itimerspec it; + bzero(&it, sizeof (struct itimerspec)); if (npackets == 0 || current_targetaddr->num_sent < current_targetaddr->num_probes) { - (void) alarm(interval); + it = interval; } else { if (current_targetaddr->got_reply) { waittime = 2 * tmax / MICROSEC; @@ -1716,7 +1780,13 @@ schedule_sigalrm(void) } else { waittime = MAX_WAIT; } - (void) alarm(waittime); + it.it_value.tv_sec = waittime; + } + + if (timer_settime(timer, TIMER_RELTIME, &it, NULL) != 0) { + Fprintf(stderr, "%s: unexpected error updating time: %s\n", + progname, strerror(errno)); + exit(EXIT_FAILURE); } } @@ -1772,7 +1842,7 @@ send_scheduled_probe() * Did we reach the end of road? */ if (current_targetaddr == NULL) { - (void) alarm(0); /* cancel alarm */ + timer_done = _B_TRUE; if (stats) finish(); if (is_alive) @@ -1933,7 +2003,7 @@ ushort_t udp_src_port6, ushort_t udp_src_port) */ if ((npackets > 0) && (current_targetaddr->next == NULL) && (nreceived_last_target == npackets)) { - (void) alarm(0); /* cancel alarm */ + timer_done = _B_TRUE; finish(); } } /* infinite loop */ @@ -2144,6 +2214,10 @@ pr_name(char *addr, int family) /* compare with the buffered (previous) lookup */ if (memcmp(addr, &prev_addr, alen) != 0) { int flags = (nflag) ? NI_NUMERICHOST : NI_NAMEREQD; + mutex_enter(&ns_lock); + ns_active = _B_TRUE; + ns_starttime = gethrtime(); + mutex_exit(&ns_lock); if (getnameinfo(sa, slen, buf, sizeof (buf), NULL, 0, flags) != 0) { /* getnameinfo() failed; return just the address */ @@ -2158,6 +2232,9 @@ pr_name(char *addr, int family) inet_ntop(family, (const void *)addr, abuf, sizeof (abuf))); } + mutex_enter(&ns_lock); + ns_active = _B_FALSE; + mutex_exit(&ns_lock); /* LINTED E_BAD_PTR_CAST_ALIGN */ prev_addr = *(struct in6_addr *)addr; @@ -2419,10 +2496,123 @@ int_arg(char *s, char *what) } if (errno || *ep != '\0' || num < 0) { - (void) Fprintf(stderr, "%s: bad %s: %s\n", - progname, what, s); + Fprintf(stderr, "%s: bad %s: %s\n", progname, what, s); exit(EXIT_FAILURE); } return (num); } + +/* + * Parse the interval into a itimerspec. The interval used to originally be + * parsed as an integer argument. That means that one used to be able to specify + * an interval in hex. The strtod() family honors that at times, with strtod + * sometimes doing so depending on the compilation environment and strtof() and + * srtold() always doing that. To facilitiate that and not worry about a + * careless Makefile change breaking us, we instead just use strtold here, even + * though we really don't need the precision. + */ +static void +parse_interval(char *s) +{ + long double val; + char *end; + + errno = 0; + val = strtold(s, &end); + if (errno != 0 || *end != '\0') { + Fprintf(stderr, "%s: bad interval: %s\n", progname, s); + exit(EXIT_FAILURE); + } + + /* + * Check values that we know are going to be bad. Anything greater than + * INT_MAX, anything less than 0, look for specific NaNs. Also, clamp + * the value at 0.01 seconds. 
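+ *
+ * As a worked example (not in the original comment): "-I 0.25" yields
+ * it_value = { .tv_sec = 0, .tv_nsec = 250000000 }; because the
+ * interval is sub-second, it also lowers the mintime floor used by the
+ * anti-DoS guard in sigalrm_handler() so that legitimate sub-second
+ * probes are not discarded.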
+ */ + if (val == NAN || val <= 0.0 || val >= INT_MAX) { + Fprintf(stderr, "%s: bad interval: %s\n", progname, s); + exit(EXIT_FAILURE); + } + + if (val < 0.01) { + Fprintf(stderr, "%s: interval too small: %Lf\n", progname, val); + exit(EXIT_FAILURE); + } + + interval.it_value.tv_sec = (long)val; + interval.it_value.tv_nsec = (long)((val - interval.it_value.tv_sec) * + NANOSEC); + + if (interval.it_value.tv_sec == 0 && + interval.it_value.tv_nsec < mintime) { + mintime = interval.it_value.tv_nsec; + } +} + +/* + * We should have an SO_TIMESTAMP message for this socket to indicate + * the actual time that the message took. If we don't we'll fall back to + * gettimeofday(); however, that can cause any delays due to DNS + * resolution and the like to end up wreaking havoc on us. + */ +void +ping_gettime(struct msghdr *msg, struct timeval *tv) +{ + struct cmsghdr *cmsg; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; + cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SO_TIMESTAMP && + cmsg->cmsg_len == CMSG_LEN(sizeof (*tv))) { + bcopy(CMSG_DATA(cmsg), tv, sizeof (*tv)); + return; + } + } + + (void) gettimeofday(tv, (struct timezone *)NULL); +} + +/* + * The purpose of this thread is to try and inform a user that we're blocked + * doing name lookups. For various reasons, ping has to try and look up the IP + * addresses it receives via name services unless the -n flag is specified. The + * irony of this is that when trying to use ping to actually diagnose a broken + * network, name services are unlikely to be available and that will result in a + * lot of confusion as to why pings seem like they're not working. As such, we + * basically wake up every 2 seconds and check whether or not we've hit such a + * condition where we should inform the user via stderr. + * + * Once they've been informed, we do not inform them again until approximately a + * minute of time has passed, in case that things are working intermittently. + */ +/*ARGSUSED*/ +static void * +ns_warning_thr(void *unused) +{ + hrtime_t last_warn = 0; + for (;;) { + hrtime_t now; + + (void) sleep(ns_sleeptime); + now = gethrtime(); + mutex_enter(&ns_lock); + if (ns_active == _B_TRUE && + now - ns_starttime >= ns_warntime * NANOSEC) { + if (now - last_warn >= + ns_warninter * NANOSEC) { + last_warn = now; + Fprintf(stderr, "%s: warning: ICMP responses " + "received, but name service lookups are " + "taking a while. 
Use ping -n to disable " + "name service lookups.\n", + progname); + } + } + mutex_exit(&ns_lock); + } + + /* LINTED: E_STMT_NOT_REACHED */ + return (NULL); +} diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.h b/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.h index 65be7a1116..9dd579841e 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.h +++ b/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.h @@ -27,7 +27,9 @@ #ifndef _PING_H #define _PING_H -#pragma ident "%Z%%M% %I% %E% SMI" +#include +#include +#include #ifdef __cplusplus extern "C" { @@ -131,6 +133,8 @@ extern boolean_t use_udp; extern boolean_t verbose; extern boolean_t send_reply; +extern void ping_gettime(struct msghdr *, struct timeval *); + #ifdef __cplusplus } #endif diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ping/ping_aux.c b/usr/src/cmd/cmd-inet/usr.sbin/ping/ping_aux.c index d76fae0aaf..9db03cc7a0 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ping/ping_aux.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/ping/ping_aux.c @@ -27,13 +27,15 @@ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ +/* + * Copyright 2015, Joyent, Inc. + */ + /* * Portions of this source code were derived from Berkeley 4.3 BSD * under license from the Regents of the University of California. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -233,9 +235,10 @@ check_reply(struct addrinfo *ai_dst, struct msghdr *msg, int cc, int hlen, hlen1; int64_t triptime; boolean_t valid_reply = _B_FALSE; - boolean_t reply_matched_current_target; /* Is the source address of */ - /* this reply same as where */ - /* we're sending currently? */ + boolean_t reply_matched_current_target = _B_FALSE; /* Is the source */ + /* address of this reply same */ + /* as where we're sending */ + /* currently? 
*/ boolean_t last_reply_from_targetaddr = _B_FALSE; /* Is this stats, */ /* probe all with npackets>0 */ /* and we received reply for */ @@ -282,7 +285,7 @@ check_reply(struct addrinfo *ai_dst, struct msghdr *msg, int cc, /* LINTED */ intp = (int32_t *)buf; - (void) gettimeofday(&tv, (struct timezone *)NULL); + ping_gettime(msg, &tv); /* LINTED */ ip = (struct ip *)buf; @@ -352,8 +355,8 @@ check_reply(struct addrinfo *ai_dst, struct msghdr *msg, int cc, nreceived++; reply_matched_current_target = seq_match(current_targetaddr->starting_seq_num, - current_targetaddr->num_sent, - ntohs(up->uh_dport)); + current_targetaddr->num_sent, + ntohs(up->uh_dport)); if (reply_matched_current_target) { current_targetaddr->got_reply = _B_TRUE; nreceived_last_target++; @@ -552,8 +555,8 @@ check_reply(struct addrinfo *ai_dst, struct msghdr *msg, int cc, nreceived++; reply_matched_current_target = seq_match(current_targetaddr->starting_seq_num, - current_targetaddr->num_sent, - ntohs(icp->icmp_seq)); + current_targetaddr->num_sent, + ntohs(icp->icmp_seq)); if (reply_matched_current_target) { current_targetaddr->got_reply = _B_TRUE; nreceived_last_target++; @@ -863,8 +866,8 @@ check_reply(struct addrinfo *ai_dst, struct msghdr *msg, int cc, nreceived++; reply_matched_current_target = seq_match(current_targetaddr->starting_seq_num, - current_targetaddr->num_sent, - ntohs(icp->icmp_seq)); + current_targetaddr->num_sent, + ntohs(icp->icmp_seq)); if (reply_matched_current_target) { current_targetaddr->got_reply = _B_TRUE; nreceived_last_target++; diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ping/ping_aux6.c b/usr/src/cmd/cmd-inet/usr.sbin/ping/ping_aux6.c index 374c88fd1e..02008e62aa 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ping/ping_aux6.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/ping/ping_aux6.c @@ -27,13 +27,15 @@ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ +/* + * Copyright 2015, Joyent, Inc. + */ + /* * Portions of this source code were derived from Berkeley 4.3 BSD * under license from the Regents of the University of California. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -236,7 +238,7 @@ set_ancillary_data(struct msghdr *msgp, int hoplimit, * therefore let's use bcopy, instead of assignment. */ (void) bcopy(&in6addr_any, &pktinfop->ipi6_addr, - sizeof (struct in6_addr)); + sizeof (struct in6_addr)); /* * We can assume pktinfop->ipi6_ifindex is 32 bit aligned. @@ -271,9 +273,10 @@ check_reply6(struct addrinfo *ai_dst, struct msghdr *msg, int cc, struct timeval *tp; int64_t triptime; boolean_t valid_reply = _B_FALSE; - boolean_t reply_matched_current_target; /* Is the source address of */ - /* this reply same as where */ - /* we're sending currently? */ + boolean_t reply_matched_current_target = _B_FALSE; /* Is the source */ + /* address of this reply same */ + /* as where we're sending */ + /* currently? 
*/ boolean_t last_reply_from_targetaddr = _B_FALSE; /* Is this stats, */ /* probe all with npackets>0 */ /* and we received reply for */ @@ -309,8 +312,7 @@ check_reply6(struct addrinfo *ai_dst, struct msghdr *msg, int cc, /* LINTED */ intp = (int32_t *)buf; - /* get time now for most accurate time calculation */ - (void) gettimeofday(&tv, (struct timezone *)NULL); + ping_gettime(msg, &tv); /* Ignore packets > 64k or control buffers that don't fit */ if (msg->msg_flags & (MSG_TRUNC|MSG_CTRUNC)) { @@ -341,7 +343,7 @@ check_reply6(struct addrinfo *ai_dst, struct msghdr *msg, int cc, Printf("packet too short (%d bytes) from %s\n", cc, pr_name((char *)&from6->sin6_addr, - AF_INET6)); + AF_INET6)); } return; } @@ -362,7 +364,7 @@ check_reply6(struct addrinfo *ai_dst, struct msghdr *msg, int cc, Printf("packet too short (%d bytes) from %s\n", cc, pr_name((char *)&from6->sin6_addr, - AF_INET6)); + AF_INET6)); } return; } @@ -389,8 +391,8 @@ check_reply6(struct addrinfo *ai_dst, struct msghdr *msg, int cc, nreceived++; reply_matched_current_target = seq_match(current_targetaddr->starting_seq_num, - current_targetaddr->num_sent, - ntohs(up->uh_dport)); + current_targetaddr->num_sent, + ntohs(up->uh_dport)); if (reply_matched_current_target) { current_targetaddr->got_reply = _B_TRUE; nreceived_last_target++; @@ -489,12 +491,12 @@ check_reply6(struct addrinfo *ai_dst, struct msghdr *msg, int cc, Printf("ICMPv6 %d Unreachable from gateway " "%s\n", icmp6->icmp6_code, pr_name((char *)&from6->sin6_addr, - AF_INET6)); + AF_INET6)); } else { Printf("ICMPv6 %s from gateway %s\n", unreach6[icmp6->icmp6_code], pr_name((char *)&from6->sin6_addr, - AF_INET6)); + AF_INET6)); } Printf(" for %s from %s", pr_protocol(last_hdr), pr_name((char *)&ip6h->ip6_src, AF_INET6)); @@ -584,7 +586,7 @@ check_reply6(struct addrinfo *ai_dst, struct msghdr *msg, int cc, Printf("packet too short (%d bytes) from %s\n", cc, pr_name((char *)&from6->sin6_addr, - AF_INET6)); + AF_INET6)); } return; } @@ -596,12 +598,12 @@ check_reply6(struct addrinfo *ai_dst, struct msghdr *msg, int cc, Printf("ICMPv6 %d time exceeded from %s\n", icmp6->icmp6_code, pr_name((char *)&from6->sin6_addr, - AF_INET6)); + AF_INET6)); } else { Printf("ICMPv6 %s from %s\n", timexceed6[icmp6->icmp6_code], pr_name((char *)&from6->sin6_addr, - AF_INET6)); + AF_INET6)); } Printf(" for %s from %s", pr_protocol(last_hdr), pr_name((char *)&ip6h->ip6_src, AF_INET6)); @@ -627,7 +629,7 @@ check_reply6(struct addrinfo *ai_dst, struct msghdr *msg, int cc, Printf("packet too short (%d bytes) from %s\n", cc, pr_name((char *)&from6->sin6_addr, - AF_INET6)); + AF_INET6)); } return; } @@ -639,12 +641,12 @@ check_reply6(struct addrinfo *ai_dst, struct msghdr *msg, int cc, Printf("ICMPv6 %d parameter problem from %s\n", icmp6->icmp6_code, pr_name((char *)&from6->sin6_addr, - AF_INET6)); + AF_INET6)); } else { Printf("ICMPv6 %s from %s\n", param_prob6[icmp6->icmp6_code], pr_name((char *)&from6->sin6_addr, - AF_INET6)); + AF_INET6)); } icmp6->icmp6_pptr = ntohl(icmp6->icmp6_pptr); Printf(" in byte %d", icmp6->icmp6_pptr); @@ -692,8 +694,8 @@ check_reply6(struct addrinfo *ai_dst, struct msghdr *msg, int cc, nreceived++; reply_matched_current_target = seq_match(current_targetaddr->starting_seq_num, - current_targetaddr->num_sent, - ntohs(icmp6->icmp6_seq)); + current_targetaddr->num_sent, + ntohs(icmp6->icmp6_seq)); if (reply_matched_current_target) { current_targetaddr->got_reply = _B_TRUE; nreceived_last_target++; @@ -859,7 +861,7 @@ check_reply6(struct addrinfo *ai_dst, struct 
msghdr *msg, int cc, Printf("packet too short (%d bytes) from %s\n", cc, pr_name((char *)&from6->sin6_addr, - AF_INET6)); + AF_INET6)); } return; } diff --git a/usr/src/man/man1m/ping.1m b/usr/src/man/man1m/ping.1m index 4097ca8bb8..a5a29be2d6 100644 --- a/usr/src/man/man1m/ping.1m +++ b/usr/src/man/man1m/ping.1m @@ -1,10 +1,11 @@ '\" te .\" Copyright (C) 2006, Sun Microsystems, Inc. All Rights Reserved .\" Copyright 1989 AT&T +.\" Copyright 2015, Joyent, Inc. .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License. .\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH PING 1M "Feb 20, 2015" +.TH PING 1M "May 21, 2015" .SH NAME ping \- send ICMP (ICMP6) ECHO_REQUEST packets to network hosts .SH SYNOPSIS @@ -123,7 +124,8 @@ range from 0 to 1048575. This option is valid only on IPv6. .RS 24n Turn on the statistics mode and specify the interval between successive transmissions. The default is one second. See the discussion of the \fB-s\fR -option. +option. The minimum interval is 0.01 seconds. It is an error to specify +a smaller interval. .RE .sp @@ -422,3 +424,50 @@ machine was not alive. \fBifconfig\fR(1M), \fBin.routed\fR(1M), \fBndd\fR(1M), \fBnetstat\fR(1M), \fBrpcinfo\fR(1M), \fBtraceroute\fR(1M), \fBattributes\fR(5), \fBicmp\fR(7P), \fBicmp6\fR(7P) + +.SH DIAGNOSTICS +.ne 2 +.na +\fB\fBping: warning: ICMP responses received, but name service lookups +are taking a while. Use ping -n to disable name service lookups.\fB\fB +.ad +.sp .6 +.RS 4n +When the +.B -n +flag is not specified, +.B ping +tries to lookup the name corresponding to the IP address that it +received via name services. If name services are unavailable, it may +take time before the system properly times out the name service lookups. +As a result, it may appear that no ICMP replies are being received when +they in fact are. This diagnostic indicates that this has occurred and +indicates that there are valid responses and that using the +.B -n +flag will stop this from occurring. +.RE + +.sp +.ne 2 +.na +\fB\fBping: warning: timing accuracy diminished -- setsockopt +SO_TIMESTAMP failed\fR\fR +.ad +.sp .6 +.RS 4n +By default, the system attempts to use the +.B SO_TIMESTAMP +socket option to allow for more accurate time stamps that reflect when +the ICMP echo replies were received by the system as opposed to when +they were received by the +.B ping +command. These differences may occur because an operator stopped the +process or because +.B ping +was blocked up behind a name service look up. When this diagnostic is +emitted, the +.B ping +command will continue to function, but it will be doing the time +stamping itself, which may cause the timing intervals reported to be +longer than they actually are. 
+.RE -- cgit v1.2.3 From 3c9168fa8e9c30d55b3aa2fde74bd7da46df53f5 Mon Sep 17 00:00:00 2001 From: Hans Rosenfeld Date: Wed, 15 Oct 2014 17:53:08 +0200 Subject: 4053 Add NVME Driver Support to Illumos Reviewed by: Dan Fields Reviewed by: Josef 'Jeff' Sipek Reviewed by: Robert Mustacchi Approved by: Dan McDonald --- usr/src/man/man7d/Makefile | 3 +- usr/src/man/man7d/nvme.7d | 95 + usr/src/pkg/manifests/driver-storage-nvme.mf | 45 + usr/src/uts/common/Makefile.files | 4 +- usr/src/uts/common/Makefile.rules | 9 +- usr/src/uts/common/io/nvme/nvme.c | 2819 ++++++++++++++++++++++++++ usr/src/uts/common/io/nvme/nvme.conf | 40 + usr/src/uts/common/io/nvme/nvme_reg.h | 692 +++++++ usr/src/uts/common/io/nvme/nvme_var.h | 240 +++ usr/src/uts/intel/Makefile.intel | 3 +- usr/src/uts/intel/nvme/Makefile | 73 + 11 files changed, 4019 insertions(+), 4 deletions(-) create mode 100644 usr/src/man/man7d/nvme.7d create mode 100644 usr/src/pkg/manifests/driver-storage-nvme.mf create mode 100644 usr/src/uts/common/io/nvme/nvme.c create mode 100644 usr/src/uts/common/io/nvme/nvme.conf create mode 100644 usr/src/uts/common/io/nvme/nvme_reg.h create mode 100644 usr/src/uts/common/io/nvme/nvme_var.h create mode 100644 usr/src/uts/intel/nvme/Makefile (limited to 'usr/src') diff --git a/usr/src/man/man7d/Makefile b/usr/src/man/man7d/Makefile index 5b5210e985..eb75db718a 100644 --- a/usr/src/man/man7d/Makefile +++ b/usr/src/man/man7d/Makefile @@ -11,7 +11,7 @@ # # Copyright 2011, Richard Lowe -# Copyright 2013 Nexenta Systems, Inc. All rights reserved. +# Copyright 2015 Nexenta Systems, Inc. All rights reserved. # Copyright 2014 Garrett D'Amore # @@ -219,6 +219,7 @@ i386_MANFILES= ahci.7d \ npe.7d \ ntxn.7d \ nv_sata.7d \ + nvme.7d \ pcn.7d \ radeon.7d \ ral.7d \ diff --git a/usr/src/man/man7d/nvme.7d b/usr/src/man/man7d/nvme.7d new file mode 100644 index 0000000000..7742fc22f6 --- /dev/null +++ b/usr/src/man/man7d/nvme.7d @@ -0,0 +1,95 @@ +.\" +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. +.\" +.\" +.\" Copyright 2015 Nexenta Systems, Inc. All rights reserved. +.\" +.Dd July 20, 2015 +.Dt NVME 7D +.Os +.Sh NAME +.Nm nvme +.Nd Intel NVMe compliant storage driver +.Sh DESCRIPTION +The +.Nm +driver uses the +.Xr blkdev 7D +framework to provide access to +.Tn Intel +NVMe compliant solid-state storage devices. +.Lp +NVMe devices supporting multiple namespaces will present each +namespace as its own +.Xr blkdev 7D +instance in the system. +. +.Sh CONFIGURATION +The +.Nm +driver can be configured by defining properties in the \fBnvme.conf\fR +file. The parameters are considered an unstable interface, subject to +change without notice. The following properties are currently +supported: +.Bl -tag -width Va +.It Va strict-version +This can be set to 0 to allow +.Nm +to attach to devices supporting newer version of the NVMe +specification. The default value is 1, limiting +.Nm +to work with devices up to specification version 1.0. +.It Va ignore-unknown-vendor-status +This can be set to 1 to allow +.Nm +to continue operating even if it receives an unknown vendor command +status. +.It Va admin-queue-len +This is the number of entries in the admin command queue. 
Legal values +are between 16 and 4096, the default value is 256. +.It Va io-queue-len +This is the number of entries in each I/O command queue. Legal values +are between 16 and 65536, the default value is 1024. +.It Va async-event-limit +This is the maximum number of asynchronous event requests issued by +the driver. Asynchronous events are used to report error conditions. +The driver will never use more asynchronous events than this value, or +what the hardware supports if it is less, or what 1/10th of the admin +queue length if it is less. +.El +. +.Sh FILES +.Bl -tag -compact -width Pa +.It Pa /dev/dsk/cntnd0sn +Block device minor nodes. +.It Pa /dev/rdsk/cntnd0sn +Raw block device minor nodes. +.El +.Lp +In the device minor nodes, the following substitutions may occur: +.Bl -tag -offset indent -width Va +.It Va cn +A controller number, typically one for each +.Nm +device found. Controller numbers are dynamically assigned by the +system. +.It Va tn +The target number, this corresponds to the namespace ID used by the +hardware. Namespace ID 0 is reserved, hence target numbers start with +1. +.It Va sn +This is the +.Em slice +number, representing a subset of the disk. See +.Xr dkio 7I . +.El +. +.Sh SEE ALSO +.Xr blkdev 7D diff --git a/usr/src/pkg/manifests/driver-storage-nvme.mf b/usr/src/pkg/manifests/driver-storage-nvme.mf new file mode 100644 index 0000000000..3296a3beef --- /dev/null +++ b/usr/src/pkg/manifests/driver-storage-nvme.mf @@ -0,0 +1,45 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright 2015 Nexenta Systems, Inc. All rights reserved. +# + +# +# The default for payload-bearing actions in this package is to appear in the +# global zone only. See the include file for greater detail, as well as +# information about overriding the defaults. +# + +set name=pkg.fmri value=pkg:/driver/storage/nvme@$(PKGVERS) +set name=pkg.description \ + value="Driver for Intel NVMe 1.0e compliant storage devices" +set name=pkg.summary value="NVMe driver" +set name=info.classification \ + value=org.opensolaris.category.2008:System/Hardware +set name=variant.arch value=i386 +dir path=kernel group=sys +dir path=kernel/drv group=sys +dir path=kernel/drv/$(ARCH64) group=sys +dir path=usr group=sys +dir path=usr/share +dir path=usr/share/man +dir path=usr/share/man/man7d +driver name=nvme alias=pciex8086,953 class=disk perms="* 0600 root sys" +file path=kernel/drv/$(ARCH64)/nvme group=sys +file path=kernel/drv/nvme group=sys +file path=kernel/drv/nvme.conf group=sys +file path=usr/share/man/man7d/nvme.7d +license lic_CDDL license=lic_CDDL diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 39135c3a6e..d9d0bfa6fd 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -24,7 +24,7 @@ # Copyright (c) 2012 Joyent, Inc. All rights reserved. # Copyright (c) 2011, 2014 by Delphix. All rights reserved. # Copyright (c) 2013 by Saso Kiselkov. All rights reserved. -# Copyright 2014 Nexenta Systems, Inc. All rights reserved. +# Copyright 2015 Nexenta Systems, Inc. All rights reserved. 
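The contents of the newly added nvme.conf do not appear in the excerpt above; a hypothetical configuration exercising the properties documented in the manual page might look like this (the property names come from the man page, the values are examples only):

	#
	# /kernel/drv/nvme.conf (hypothetical example)
	#
	strict-version=1;
	ignore-unknown-vendor-status=0;
	admin-queue-len=256;
	io-queue-len=1024;
	async-event-limit=10;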
# # @@ -1863,6 +1863,8 @@ YGE_OBJS = yge.o SKD_OBJS = skd.o +NVME_OBJS = nvme.o + # # Build up defines and paths. # diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules index f8a10eae5b..cf2e880c5e 100644 --- a/usr/src/uts/common/Makefile.rules +++ b/usr/src/uts/common/Makefile.rules @@ -22,7 +22,7 @@ # # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. # Copyright 2013 Garrett D'Amore -# Copyright 2014 Nexenta Systems, Inc. All rights reserved. +# Copyright 2015 Nexenta Systems, Inc. All rights reserved. # # @@ -990,6 +990,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nge/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nvme/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nxge/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -2288,6 +2292,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/net80211/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/nge/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/nvme/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/nxge/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) diff --git a/usr/src/uts/common/io/nvme/nvme.c b/usr/src/uts/common/io/nvme/nvme.c new file mode 100644 index 0000000000..5914ca0226 --- /dev/null +++ b/usr/src/uts/common/io/nvme/nvme.c @@ -0,0 +1,2819 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + */ + +/* + * blkdev driver for NVMe compliant storage devices + * + * This driver was written to conform to version 1.0e of the NVMe specification. + * It may work with newer versions, but that is completely untested and disabled + * by default. + * + * The driver has only been tested on x86 systems and will not work on big- + * endian systems without changes to the code accessing registers and data + * structures used by the hardware. + * + * + * Interrupt Usage: + * + * The driver will use a FIXED interrupt while configuring the device as the + * specification requires. Later in the attach process it will switch to MSI-X + * or MSI if supported. The driver wants to have one interrupt vector per CPU, + * but it will work correctly if less are available. Interrupts can be shared + * by queues, the interrupt handler will iterate through the I/O queue array by + * steps of n_intr_cnt. Usually only the admin queue will share an interrupt + * with one I/O queue. The interrupt handler will retrieve completed commands + * from all queues sharing an interrupt vector and will post them to a taskq + * for completion processing. + * + * + * Command Processing: + * + * NVMe devices can have up to 65536 I/O queue pairs, with each queue holding up + * to 65536 I/O commands. The driver will configure one I/O queue pair per + * available interrupt vector, with the queue length usually much smaller than + * the maximum of 65536. If the hardware doesn't provide enough queues, fewer + * interrupt vectors will be used. + * + * Additionally the hardware provides a single special admin queue pair that can + * hold up to 4096 admin commands. 
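+ *
+ * To illustrate the sharing scheme (example values, not normative):
+ * with 4 interrupt vectors and 8 I/O queue pairs, the handler for
+ * vector 0 services the admin queue plus I/O queues 4 and 8, vector 1
+ * services I/O queues 1 and 5, and so on -- each handler steps through
+ * the queue array in increments of n_intr_cnt, starting at its own
+ * vector number.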
+ * + * From the hardware perspective both queues of a queue pair are independent, + * but they share some driver state: the command array (holding pointers to + * commands currently being processed by the hardware) and the active command + * counter. Access to the submission side of a queue pair and the shared state + * is protected by nq_mutex. The completion side of a queue pair does not need + * that protection apart from its access to the shared state; it is called only + * in the interrupt handler which does not run concurrently for the same + * interrupt vector. + * + * When a command is submitted to a queue pair the active command counter is + * incremented and a pointer to the command is stored in the command array. The + * array index is used as command identifier (CID) in the submission queue + * entry. Some commands may take a very long time to complete, and if the queue + * wraps around in that time a submission may find the next array slot to still + * be used by a long-running command. In this case the array is sequentially + * searched for the next free slot. The length of the command array is the same + * as the configured queue length. + * + * + * Namespace Support: + * + * NVMe devices can have multiple namespaces, each being a independent data + * store. The driver supports multiple namespaces and creates a blkdev interface + * for each namespace found. Namespaces can have various attributes to support + * thin provisioning, extended LBAs, and protection information. This driver + * does not support any of this and ignores namespaces that have these + * attributes. + * + * + * Blkdev Interface: + * + * This driver uses blkdev to do all the heavy lifting involved with presenting + * a disk device to the system. As a result, the processing of I/O requests is + * relatively simple as blkdev takes care of partitioning, boundary checks, DMA + * setup, and splitting of transfers into manageable chunks. + * + * I/O requests coming in from blkdev are turned into NVM commands and posted to + * an I/O queue. The queue is selected by taking the CPU id modulo the number of + * queues. There is currently no timeout handling of I/O commands. + * + * Blkdev also supports querying device/media information and generating a + * devid. The driver reports the best block size as determined by the namespace + * format back to blkdev as physical block size to support partition and block + * alignment. The devid is composed using the device vendor ID, model number, + * serial number, and the namespace ID. + * + * + * Error Handling: + * + * Error handling is currently limited to detecting fatal hardware errors, + * either by asynchronous events, or synchronously through command status or + * admin command timeouts. In case of severe errors the device is fenced off, + * all further requests will return EIO. FMA is then called to fault the device. + * + * The hardware has a limit for outstanding asynchronous event requests. Before + * this limit is known the driver assumes it is at least 1 and posts a single + * asynchronous request. Later when the limit is known more asynchronous event + * requests are posted to allow quicker reception of error information. When an + * asynchronous event is posted by the hardware the driver will parse the error + * status fields and log information or fault the device, depending on the + * severity of the asynchronous event. The asynchronous event request is then + * reused and posted to the admin queue again. 
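+ *
+ * As a numeric example (assuming an async-event-limit of 10 and the
+ * default admin-queue-len of 256): a device advertising support for
+ * only 4 outstanding asynchronous event requests would be driven with
+ * 4, the smallest of the three limits (10, 4, and 256 / 10 = 25).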
+ * + * On command completion the command status is checked for errors. In case of + * errors indicating a driver bug the driver panics. Almost all other error + * status values just cause EIO to be returned. + * + * Command timeouts are currently detected for all admin commands except + * asynchronous event requests. If a command times out and the hardware appears + * to be healthy the driver attempts to abort the command. If this fails the + * driver assumes the device to be dead, fences it off, and calls FMA to retire + * it. In general admin commands are issued at attach time only. No timeout + * handling of normal I/O commands is presently done. + * + * In some cases it may be possible that the ABORT command times out, too. In + * that case the device is also declared dead and fenced off. + * + * + * Quiesce / Fast Reboot: + * + * The driver currently does not support fast reboot. A quiesce(9E) entry point + * is still provided which is used to send a shutdown notification to the + * device. + * + * + * Driver Configuration: + * + * The following driver properties can be changed to control some aspects of the + * drivers operation: + * - strict-version: can be set to 0 to allow devices conforming to newer + * versions to be used + * - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor + * specific command status as a fatal error leading device faulting + * - admin-queue-len: the maximum length of the admin queue (16-4096) + * - io-queue-len: the maximum length of the I/O queues (16-65536) + * - async-event-limit: the maximum number of asynchronous event requests to be + * posted by the driver + * + * + * TODO: + * - figure out sane default for I/O queue depth reported to blkdev + * - polled I/O support to support kernel core dumping + * - FMA handling of media errors + * - support for the Volatile Write Cache + * - support for devices supporting very large I/O requests using chained PRPs + * - support for querying log pages from user space + * - support for configuring hardware parameters like interrupt coalescing + * - support for media formatting and hard partitioning into namespaces + * - support for big-endian systems + * - support for fast reboot + */ + +#include +#ifdef _BIG_ENDIAN +#error nvme driver needs porting for big-endian platforms +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nvme_reg.h" +#include "nvme_var.h" + + +/* NVMe spec version supported */ +static const int nvme_version_major = 1; +static const int nvme_version_minor = 0; + +static int nvme_attach(dev_info_t *, ddi_attach_cmd_t); +static int nvme_detach(dev_info_t *, ddi_detach_cmd_t); +static int nvme_quiesce(dev_info_t *); +static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *); +static void nvme_disable_interrupts(nvme_t *); +static int nvme_enable_interrupts(nvme_t *); +static int nvme_setup_interrupts(nvme_t *, int, int); +static void nvme_release_interrupts(nvme_t *); +static uint_t nvme_intr(caddr_t, caddr_t); + +static void nvme_shutdown(nvme_t *, int, boolean_t); +static boolean_t nvme_reset(nvme_t *, boolean_t); +static int nvme_init(nvme_t *); +static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int); +static void nvme_free_cmd(nvme_cmd_t *); +static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t, + bd_xfer_t *); +static int nvme_admin_cmd(nvme_cmd_t *, int); +static int nvme_submit_cmd(nvme_qpair_t *, nvme_cmd_t *); +static nvme_cmd_t 
*nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *); +static boolean_t nvme_wait_cmd(nvme_cmd_t *, uint_t); +static void nvme_wakeup_cmd(void *); +static void nvme_async_event_task(void *); + +static int nvme_check_unknown_cmd_status(nvme_cmd_t *); +static int nvme_check_vendor_cmd_status(nvme_cmd_t *); +static int nvme_check_integrity_cmd_status(nvme_cmd_t *); +static int nvme_check_specific_cmd_status(nvme_cmd_t *); +static int nvme_check_generic_cmd_status(nvme_cmd_t *); +static inline int nvme_check_cmd_status(nvme_cmd_t *); + +static void nvme_abort_cmd(nvme_cmd_t *); +static int nvme_async_event(nvme_t *); +static void *nvme_get_logpage(nvme_t *, uint8_t, ...); +static void *nvme_identify(nvme_t *, uint32_t); +static int nvme_set_nqueues(nvme_t *, uint16_t); + +static void nvme_free_dma(nvme_dma_t *); +static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *, + nvme_dma_t **); +static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t, + nvme_dma_t **); +static void nvme_free_qpair(nvme_qpair_t *); +static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, int); +static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t); + +static inline void nvme_put64(nvme_t *, uintptr_t, uint64_t); +static inline void nvme_put32(nvme_t *, uintptr_t, uint32_t); +static inline uint64_t nvme_get64(nvme_t *, uintptr_t); +static inline uint32_t nvme_get32(nvme_t *, uintptr_t); + +static boolean_t nvme_check_regs_hdl(nvme_t *); +static boolean_t nvme_check_dma_hdl(nvme_dma_t *); + +static int nvme_fill_prp(nvme_cmd_t *, bd_xfer_t *); + +static void nvme_bd_xfer_done(void *); +static void nvme_bd_driveinfo(void *, bd_drive_t *); +static int nvme_bd_mediainfo(void *, bd_media_t *); +static int nvme_bd_cmd(nvme_namespace_t *, bd_xfer_t *, uint8_t); +static int nvme_bd_read(void *, bd_xfer_t *); +static int nvme_bd_write(void *, bd_xfer_t *); +static int nvme_bd_sync(void *, bd_xfer_t *); +static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *); + +static void nvme_prepare_devid(nvme_t *, uint32_t); + +static void *nvme_state; +static kmem_cache_t *nvme_cmd_cache; + +/* + * DMA attributes for queue DMA memory + * + * Queue DMA memory must be page aligned. The maximum length of a queue is + * 65536 entries, and an entry can be 64 bytes long. + */ +static ddi_dma_attr_t nvme_queue_dma_attr = { + .dma_attr_version = DMA_ATTR_V0, + .dma_attr_addr_lo = 0, + .dma_attr_addr_hi = 0xffffffffffffffffULL, + .dma_attr_count_max = (UINT16_MAX + 1) * sizeof (nvme_sqe_t), + .dma_attr_align = 0x1000, + .dma_attr_burstsizes = 0x7ff, + .dma_attr_minxfer = 0x1000, + .dma_attr_maxxfer = (UINT16_MAX + 1) * sizeof (nvme_sqe_t), + .dma_attr_seg = 0xffffffffffffffffULL, + .dma_attr_sgllen = 1, + .dma_attr_granular = 1, + .dma_attr_flags = 0, +}; + +/* + * DMA attributes for transfers using Physical Region Page (PRP) entries + * + * A PRP entry describes one page of DMA memory using the page size specified + * in the controller configuration's memory page size register (CC.MPS). It uses + * a 64bit base address aligned to this page size. There is no limitation on + * chaining PRPs together for arbitrarily large DMA transfers. 
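+ *
+ * As a worked example (assuming the minimum memory page size of 4KB):
+ * a 64KB transfer spans 16 pages and thus needs 16 PRP entries. The
+ * first goes into the command's PRP1 field; PRP2 then points to a PRP
+ * list holding the remaining 15 entries.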
+ */ +static ddi_dma_attr_t nvme_prp_dma_attr = { + .dma_attr_version = DMA_ATTR_V0, + .dma_attr_addr_lo = 0, + .dma_attr_addr_hi = 0xffffffffffffffffULL, + .dma_attr_count_max = 0xfff, + .dma_attr_align = 0x1000, + .dma_attr_burstsizes = 0x7ff, + .dma_attr_minxfer = 0x1000, + .dma_attr_maxxfer = 0x1000, + .dma_attr_seg = 0xffffffffffffffffULL, + .dma_attr_sgllen = -1, + .dma_attr_granular = 1, + .dma_attr_flags = 0, +}; + +/* + * DMA attributes for transfers using scatter/gather lists + * + * A SGL entry describes a chunk of DMA memory using a 64bit base address and a + * 32bit length field. SGL Segment and SGL Last Segment entries require the + * length to be a multiple of 16 bytes. + */ +static ddi_dma_attr_t nvme_sgl_dma_attr = { + .dma_attr_version = DMA_ATTR_V0, + .dma_attr_addr_lo = 0, + .dma_attr_addr_hi = 0xffffffffffffffffULL, + .dma_attr_count_max = 0xffffffffUL, + .dma_attr_align = 1, + .dma_attr_burstsizes = 0x7ff, + .dma_attr_minxfer = 0x10, + .dma_attr_maxxfer = 0xfffffffffULL, + .dma_attr_seg = 0xffffffffffffffffULL, + .dma_attr_sgllen = -1, + .dma_attr_granular = 0x10, + .dma_attr_flags = 0 +}; + +static ddi_device_acc_attr_t nvme_reg_acc_attr = { + .devacc_attr_version = DDI_DEVICE_ATTR_V0, + .devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC, + .devacc_attr_dataorder = DDI_STRICTORDER_ACC +}; + +static struct dev_ops nvme_dev_ops = { + .devo_rev = DEVO_REV, + .devo_refcnt = 0, + .devo_getinfo = ddi_no_info, + .devo_identify = nulldev, + .devo_probe = nulldev, + .devo_attach = nvme_attach, + .devo_detach = nvme_detach, + .devo_reset = nodev, + .devo_cb_ops = NULL, + .devo_bus_ops = NULL, + .devo_power = NULL, + .devo_quiesce = nvme_quiesce, +}; + +static struct modldrv nvme_modldrv = { + .drv_modops = &mod_driverops, + .drv_linkinfo = "NVMe v1.0e", + .drv_dev_ops = &nvme_dev_ops +}; + +static struct modlinkage nvme_modlinkage = { + .ml_rev = MODREV_1, + .ml_linkage = { &nvme_modldrv, NULL } +}; + +static bd_ops_t nvme_bd_ops = { + .o_version = BD_OPS_VERSION_0, + .o_drive_info = nvme_bd_driveinfo, + .o_media_info = nvme_bd_mediainfo, + .o_devid_init = nvme_bd_devid, + .o_sync_cache = nvme_bd_sync, + .o_read = nvme_bd_read, + .o_write = nvme_bd_write, +}; + +int +_init(void) +{ + int error; + + error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1); + if (error != DDI_SUCCESS) + return (error); + + nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache", + sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0); + + bd_mod_init(&nvme_dev_ops); + + error = mod_install(&nvme_modlinkage); + if (error != DDI_SUCCESS) { + ddi_soft_state_fini(&nvme_state); + bd_mod_fini(&nvme_dev_ops); + } + + return (error); +} + +int +_fini(void) +{ + int error; + + error = mod_remove(&nvme_modlinkage); + if (error == DDI_SUCCESS) { + ddi_soft_state_fini(&nvme_state); + kmem_cache_destroy(nvme_cmd_cache); + bd_mod_fini(&nvme_dev_ops); + } + + return (error); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&nvme_modlinkage, modinfop)); +} + +static inline void +nvme_put64(nvme_t *nvme, uintptr_t reg, uint64_t val) +{ + ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0); + + /*LINTED: E_BAD_PTR_CAST_ALIGN*/ + ddi_put64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg), val); +} + +static inline void +nvme_put32(nvme_t *nvme, uintptr_t reg, uint32_t val) +{ + ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0); + + /*LINTED: E_BAD_PTR_CAST_ALIGN*/ + ddi_put32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg), val); +} + +static inline uint64_t +nvme_get64(nvme_t *nvme, uintptr_t 
reg) +{ + uint64_t val; + + ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0); + + /*LINTED: E_BAD_PTR_CAST_ALIGN*/ + val = ddi_get64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg)); + + return (val); +} + +static inline uint32_t +nvme_get32(nvme_t *nvme, uintptr_t reg) +{ + uint32_t val; + + ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0); + + /*LINTED: E_BAD_PTR_CAST_ALIGN*/ + val = ddi_get32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg)); + + return (val); +} + +static boolean_t +nvme_check_regs_hdl(nvme_t *nvme) +{ + ddi_fm_error_t error; + + ddi_fm_acc_err_get(nvme->n_regh, &error, DDI_FME_VERSION); + + if (error.fme_status != DDI_FM_OK) + return (B_TRUE); + + return (B_FALSE); +} + +static boolean_t +nvme_check_dma_hdl(nvme_dma_t *dma) +{ + ddi_fm_error_t error; + + if (dma == NULL) + return (B_FALSE); + + ddi_fm_dma_err_get(dma->nd_dmah, &error, DDI_FME_VERSION); + + if (error.fme_status != DDI_FM_OK) + return (B_TRUE); + + return (B_FALSE); +} + +static void +nvme_free_dma(nvme_dma_t *dma) +{ + if (dma->nd_dmah != NULL) + (void) ddi_dma_unbind_handle(dma->nd_dmah); + if (dma->nd_acch != NULL) + ddi_dma_mem_free(&dma->nd_acch); + if (dma->nd_dmah != NULL) + ddi_dma_free_handle(&dma->nd_dmah); + kmem_free(dma, sizeof (nvme_dma_t)); +} + +static int +nvme_zalloc_dma(nvme_t *nvme, size_t len, uint_t flags, + ddi_dma_attr_t *dma_attr, nvme_dma_t **ret) +{ + nvme_dma_t *dma = kmem_zalloc(sizeof (nvme_dma_t), KM_SLEEP); + + if (ddi_dma_alloc_handle(nvme->n_dip, dma_attr, DDI_DMA_SLEEP, NULL, + &dma->nd_dmah) != DDI_SUCCESS) { + /* + * Due to DDI_DMA_SLEEP this can't be DDI_DMA_NORESOURCES, and + * the only other possible error is DDI_DMA_BADATTR which + * indicates a driver bug which should cause a panic. + */ + dev_err(nvme->n_dip, CE_PANIC, + "!failed to get DMA handle, check DMA attributes"); + return (DDI_FAILURE); + } + + /* + * ddi_dma_mem_alloc() can only fail when DDI_DMA_NOSLEEP is specified + * or the flags are conflicting, which isn't the case here. 
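+ *
+ * (With DDI_DMA_SLEEP the call blocks until memory is available
+ * instead of failing, which is why the return value is cast away
+ * below.)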
+ */ + (void) ddi_dma_mem_alloc(dma->nd_dmah, len, &nvme->n_reg_acc_attr, + DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, &dma->nd_memp, + &dma->nd_len, &dma->nd_acch); + + if (ddi_dma_addr_bind_handle(dma->nd_dmah, NULL, dma->nd_memp, + dma->nd_len, flags | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, + &dma->nd_cookie, &dma->nd_ncookie) != DDI_DMA_MAPPED) { + dev_err(nvme->n_dip, CE_WARN, + "!failed to bind DMA memory"); + atomic_inc_32(&nvme->n_dma_bind_err); + *ret = NULL; + nvme_free_dma(dma); + return (DDI_FAILURE); + } + + bzero(dma->nd_memp, dma->nd_len); + + *ret = dma; + return (DDI_SUCCESS); +} + +static int +nvme_zalloc_queue_dma(nvme_t *nvme, uint32_t nentry, uint16_t qe_len, + uint_t flags, nvme_dma_t **dma) +{ + uint32_t len = nentry * qe_len; + ddi_dma_attr_t q_dma_attr = nvme->n_queue_dma_attr; + + len = roundup(len, nvme->n_pagesize); + + q_dma_attr.dma_attr_minxfer = len; + + if (nvme_zalloc_dma(nvme, len, flags, &q_dma_attr, dma) + != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!failed to get DMA memory for queue"); + goto fail; + } + + if ((*dma)->nd_ncookie != 1) { + dev_err(nvme->n_dip, CE_WARN, + "!got too many cookies for queue DMA"); + goto fail; + } + + return (DDI_SUCCESS); + +fail: + if (*dma) { + nvme_free_dma(*dma); + *dma = NULL; + } + + return (DDI_FAILURE); +} + +static void +nvme_free_qpair(nvme_qpair_t *qp) +{ + int i; + + mutex_destroy(&qp->nq_mutex); + + if (qp->nq_sqdma != NULL) + nvme_free_dma(qp->nq_sqdma); + if (qp->nq_cqdma != NULL) + nvme_free_dma(qp->nq_cqdma); + + if (qp->nq_active_cmds > 0) + for (i = 0; i != qp->nq_nentry; i++) + if (qp->nq_cmd[i] != NULL) + nvme_free_cmd(qp->nq_cmd[i]); + + if (qp->nq_cmd != NULL) + kmem_free(qp->nq_cmd, sizeof (nvme_cmd_t *) * qp->nq_nentry); + + kmem_free(qp, sizeof (nvme_qpair_t)); +} + +static int +nvme_alloc_qpair(nvme_t *nvme, uint32_t nentry, nvme_qpair_t **nqp, + int idx) +{ + nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP); + + mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER, + DDI_INTR_PRI(nvme->n_intr_pri)); + + if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t), + DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS) + goto fail; + + if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t), + DDI_DMA_READ, &qp->nq_cqdma) != DDI_SUCCESS) + goto fail; + + qp->nq_sq = (nvme_sqe_t *)qp->nq_sqdma->nd_memp; + qp->nq_cq = (nvme_cqe_t *)qp->nq_cqdma->nd_memp; + qp->nq_nentry = nentry; + + qp->nq_sqtdbl = NVME_REG_SQTDBL(nvme, idx); + qp->nq_cqhdbl = NVME_REG_CQHDBL(nvme, idx); + + qp->nq_cmd = kmem_zalloc(sizeof (nvme_cmd_t *) * nentry, KM_SLEEP); + qp->nq_next_cmd = 0; + + *nqp = qp; + return (DDI_SUCCESS); + +fail: + nvme_free_qpair(qp); + *nqp = NULL; + + return (DDI_FAILURE); +} + +static nvme_cmd_t * +nvme_alloc_cmd(nvme_t *nvme, int kmflag) +{ + nvme_cmd_t *cmd = kmem_cache_alloc(nvme_cmd_cache, kmflag); + + if (cmd == NULL) + return (cmd); + + bzero(cmd, sizeof (nvme_cmd_t)); + + cmd->nc_nvme = nvme; + + mutex_init(&cmd->nc_mutex, NULL, MUTEX_DRIVER, + DDI_INTR_PRI(nvme->n_intr_pri)); + cv_init(&cmd->nc_cv, NULL, CV_DRIVER, NULL); + + return (cmd); +} + +static void +nvme_free_cmd(nvme_cmd_t *cmd) +{ + if (cmd->nc_dma) { + nvme_free_dma(cmd->nc_dma); + cmd->nc_dma = NULL; + } + + cv_destroy(&cmd->nc_cv); + mutex_destroy(&cmd->nc_mutex); + + kmem_cache_free(nvme_cmd_cache, cmd); +} + +static int +nvme_submit_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd) +{ + nvme_reg_sqtdbl_t tail = { 0 }; + + mutex_enter(&qp->nq_mutex); + + if (qp->nq_active_cmds == qp->nq_nentry) { + mutex_exit(&qp->nq_mutex); + 
return (DDI_FAILURE); + } + + cmd->nc_completed = B_FALSE; + + /* + * Try to insert the cmd into the active cmd array at the nq_next_cmd + * slot. If the slot is already occupied advance to the next slot and + * try again. This can happen for long running commands like async event + * requests. + */ + while (qp->nq_cmd[qp->nq_next_cmd] != NULL) + qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry; + qp->nq_cmd[qp->nq_next_cmd] = cmd; + + qp->nq_active_cmds++; + + cmd->nc_sqe.sqe_cid = qp->nq_next_cmd; + bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t)); + (void) ddi_dma_sync(qp->nq_sqdma->nd_dmah, + sizeof (nvme_sqe_t) * qp->nq_sqtail, + sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV); + qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry; + + tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry; + nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r); + + mutex_exit(&qp->nq_mutex); + return (DDI_SUCCESS); +} + +static nvme_cmd_t * +nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp) +{ + nvme_reg_cqhdbl_t head = { 0 }; + + nvme_cqe_t *cqe; + nvme_cmd_t *cmd; + + (void) ddi_dma_sync(qp->nq_cqdma->nd_dmah, 0, + sizeof (nvme_cqe_t) * qp->nq_nentry, DDI_DMA_SYNC_FORKERNEL); + + cqe = &qp->nq_cq[qp->nq_cqhead]; + + /* Check phase tag of CQE. Hardware inverts it for new entries. */ + if (cqe->cqe_sf.sf_p == qp->nq_phase) + return (NULL); + + ASSERT(nvme->n_ioq[cqe->cqe_sqid] == qp); + ASSERT(cqe->cqe_cid < qp->nq_nentry); + + mutex_enter(&qp->nq_mutex); + cmd = qp->nq_cmd[cqe->cqe_cid]; + qp->nq_cmd[cqe->cqe_cid] = NULL; + qp->nq_active_cmds--; + mutex_exit(&qp->nq_mutex); + + ASSERT(cmd != NULL); + ASSERT(cmd->nc_nvme == nvme); + ASSERT(cmd->nc_sqid == cqe->cqe_sqid); + ASSERT(cmd->nc_sqe.sqe_cid == cqe->cqe_cid); + bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t)); + + qp->nq_sqhead = cqe->cqe_sqhd; + + head.b.cqhdbl_cqh = qp->nq_cqhead = (qp->nq_cqhead + 1) % qp->nq_nentry; + + /* Toggle phase on wrap-around. */ + if (qp->nq_cqhead == 0) + qp->nq_phase = qp->nq_phase ? 
0 : 1;
+
+ nvme_put32(cmd->nc_nvme, qp->nq_cqhdbl, head.r);
+
+ return (cmd);
+}
+
+static int
+nvme_check_unknown_cmd_status(nvme_cmd_t *cmd)
+{
+ nvme_cqe_t *cqe = &cmd->nc_cqe;
+
+ dev_err(cmd->nc_nvme->n_dip, CE_WARN,
+ "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
+ "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
+ cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
+ cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);
+
+ if (cmd->nc_nvme->n_strict_version) {
+ cmd->nc_nvme->n_dead = B_TRUE;
+ ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST);
+ }
+
+ return (EIO);
+}
+
+static int
+nvme_check_vendor_cmd_status(nvme_cmd_t *cmd)
+{
+ nvme_cqe_t *cqe = &cmd->nc_cqe;
+
+ dev_err(cmd->nc_nvme->n_dip, CE_WARN,
+ "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
+ "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
+ cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
+ cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);
+ if (!cmd->nc_nvme->n_ignore_unknown_vendor_status) {
+ cmd->nc_nvme->n_dead = B_TRUE;
+ ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST);
+ }
+
+ return (EIO);
+}
+
+static int
+nvme_check_integrity_cmd_status(nvme_cmd_t *cmd)
+{
+ nvme_cqe_t *cqe = &cmd->nc_cqe;
+
+ switch (cqe->cqe_sf.sf_sc) {
+ case NVME_CQE_SC_INT_NVM_WRITE:
+ /* write fail */
+ /* TODO: post ereport */
+ return (EIO);
+
+ case NVME_CQE_SC_INT_NVM_READ:
+ /* read fail */
+ /* TODO: post ereport */
+ return (EIO);
+
+ default:
+ return (nvme_check_unknown_cmd_status(cmd));
+ }
+}
+
+static int
+nvme_check_generic_cmd_status(nvme_cmd_t *cmd)
+{
+ nvme_cqe_t *cqe = &cmd->nc_cqe;
+
+ switch (cqe->cqe_sf.sf_sc) {
+ case NVME_CQE_SC_GEN_SUCCESS:
+ return (0);
+
+ /*
+ * Errors indicating a bug in the driver should cause a panic.
+ */
+ case NVME_CQE_SC_GEN_INV_OPC:
+ /* Invalid Command Opcode */
+ dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
+ "invalid opcode in cmd %p", (void *)cmd);
+ return (0);
+
+ case NVME_CQE_SC_GEN_INV_FLD:
+ /* Invalid Field in Command */
+ dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
+ "invalid field in cmd %p", (void *)cmd);
+ return (0);
+
+ case NVME_CQE_SC_GEN_ID_CNFL:
+ /* Command ID Conflict */
+ dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
+ "cmd ID conflict in cmd %p", (void *)cmd);
+ return (0);
+
+ case NVME_CQE_SC_GEN_INV_NS:
+ /* Invalid Namespace or Format */
+ dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
+ "invalid NS/format in cmd %p", (void *)cmd);
+ return (0);
+
+ case NVME_CQE_SC_GEN_NVM_LBA_RANGE:
+ /* LBA Out Of Range */
+ dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
+ "LBA out of range in cmd %p", (void *)cmd);
+ return (0);
+
+ /*
+ * Non-fatal errors, handle gracefully.
+ */
+ case NVME_CQE_SC_GEN_DATA_XFR_ERR:
+ /* Data Transfer Error (DMA) */
+ /* TODO: post ereport */
+ atomic_inc_32(&cmd->nc_nvme->n_data_xfr_err);
+ return (EIO);
+
+ case NVME_CQE_SC_GEN_INTERNAL_ERR:
+ /*
+ * Internal Error. The spec (v1.0, section 4.5.1.2) says
+ * detailed error information is returned as async event,
+ * so we pretty much ignore the error here and handle it
+ * in the async event handler.
+ */
+ atomic_inc_32(&cmd->nc_nvme->n_internal_err);
+ return (EIO);
+
+ case NVME_CQE_SC_GEN_ABORT_REQUEST:
+ /*
+ * Command Abort Requested. This normally happens only when a
+ * command times out.
+ */
+ /* TODO: post ereport or change blkdev to handle this?
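For now the failure is simply mapped to ECANCELED and passed up.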
*/ + atomic_inc_32(&cmd->nc_nvme->n_abort_rq_err); + return (ECANCELED); + + case NVME_CQE_SC_GEN_ABORT_PWRLOSS: + /* Command Aborted due to Power Loss Notification */ + ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST); + cmd->nc_nvme->n_dead = B_TRUE; + return (EIO); + + case NVME_CQE_SC_GEN_ABORT_SQ_DEL: + /* Command Aborted due to SQ Deletion */ + atomic_inc_32(&cmd->nc_nvme->n_abort_sq_del); + return (EIO); + + case NVME_CQE_SC_GEN_NVM_CAP_EXC: + /* Capacity Exceeded */ + atomic_inc_32(&cmd->nc_nvme->n_nvm_cap_exc); + return (EIO); + + case NVME_CQE_SC_GEN_NVM_NS_NOTRDY: + /* Namespace Not Ready */ + atomic_inc_32(&cmd->nc_nvme->n_nvm_ns_notrdy); + return (EIO); + + default: + return (nvme_check_unknown_cmd_status(cmd)); + } +} + +static int +nvme_check_specific_cmd_status(nvme_cmd_t *cmd) +{ + nvme_cqe_t *cqe = &cmd->nc_cqe; + + switch (cqe->cqe_sf.sf_sc) { + case NVME_CQE_SC_SPC_INV_CQ: + /* Completion Queue Invalid */ + ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE); + atomic_inc_32(&cmd->nc_nvme->n_inv_cq_err); + return (EINVAL); + + case NVME_CQE_SC_SPC_INV_QID: + /* Invalid Queue Identifier */ + ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE || + cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_SQUEUE || + cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE || + cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE); + atomic_inc_32(&cmd->nc_nvme->n_inv_qid_err); + return (EINVAL); + + case NVME_CQE_SC_SPC_MAX_QSZ_EXC: + /* Max Queue Size Exceeded */ + ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE || + cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE); + atomic_inc_32(&cmd->nc_nvme->n_max_qsz_exc); + return (EINVAL); + + case NVME_CQE_SC_SPC_ABRT_CMD_EXC: + /* Abort Command Limit Exceeded */ + ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT); + dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " + "abort command limit exceeded in cmd %p", (void *)cmd); + return (0); + + case NVME_CQE_SC_SPC_ASYNC_EVREQ_EXC: + /* Async Event Request Limit Exceeded */ + ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ASYNC_EVENT); + dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " + "async event request limit exceeded in cmd %p", + (void *)cmd); + return (0); + + case NVME_CQE_SC_SPC_INV_INT_VECT: + /* Invalid Interrupt Vector */ + ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE); + atomic_inc_32(&cmd->nc_nvme->n_inv_int_vect); + return (EINVAL); + + case NVME_CQE_SC_SPC_INV_LOG_PAGE: + /* Invalid Log Page */ + ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_GET_LOG_PAGE); + atomic_inc_32(&cmd->nc_nvme->n_inv_log_page); + return (EINVAL); + + case NVME_CQE_SC_SPC_INV_FORMAT: + /* Invalid Format */ + ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_FORMAT); + atomic_inc_32(&cmd->nc_nvme->n_inv_format); + return (EINVAL); + + case NVME_CQE_SC_SPC_INV_Q_DEL: + /* Invalid Queue Deletion */ + ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE); + atomic_inc_32(&cmd->nc_nvme->n_inv_q_del); + return (EINVAL); + + case NVME_CQE_SC_SPC_NVM_CNFL_ATTR: + /* Conflicting Attributes */ + ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_DSET_MGMT || + cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ || + cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); + atomic_inc_32(&cmd->nc_nvme->n_cnfl_attr); + return (EINVAL); + + case NVME_CQE_SC_SPC_NVM_INV_PROT: + /* Invalid Protection Information */ + ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_COMPARE || + cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ || + cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); + atomic_inc_32(&cmd->nc_nvme->n_inv_prot); + return (EINVAL); + + case 
NVME_CQE_SC_SPC_NVM_READONLY:
+ /* Write to Read Only Range */
+ ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
+ atomic_inc_32(&cmd->nc_nvme->n_readonly);
+ return (EROFS);
+
+ default:
+ return (nvme_check_unknown_cmd_status(cmd));
+ }
+}
+
+static inline int
+nvme_check_cmd_status(nvme_cmd_t *cmd)
+{
+ nvme_cqe_t *cqe = &cmd->nc_cqe;
+
+ /* take a shortcut if everything is alright */
+ if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
+ cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS)
+ return (0);
+
+ if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC)
+ return (nvme_check_generic_cmd_status(cmd));
+ else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC)
+ return (nvme_check_specific_cmd_status(cmd));
+ else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY)
+ return (nvme_check_integrity_cmd_status(cmd));
+ else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR)
+ return (nvme_check_vendor_cmd_status(cmd));
+
+ return (nvme_check_unknown_cmd_status(cmd));
+}
+
+/*
+ * nvme_abort_cmd_cb -- replaces nc_callback of aborted commands
+ *
+ * This function takes care of cleaning up aborted commands. The command
+ * status is checked to catch any fatal errors.
+ */
+static void
+nvme_abort_cmd_cb(void *arg)
+{
+ nvme_cmd_t *cmd = arg;
+
+ /*
+ * Grab the command mutex. Once we have it we hold the last reference
+ * to the command and can safely free it.
+ */
+ mutex_enter(&cmd->nc_mutex);
+ (void) nvme_check_cmd_status(cmd);
+ mutex_exit(&cmd->nc_mutex);
+
+ nvme_free_cmd(cmd);
+}
+
+static void
+nvme_abort_cmd(nvme_cmd_t *abort_cmd)
+{
+ nvme_t *nvme = abort_cmd->nc_nvme;
+ nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
+ nvme_abort_cmd_t ac = { 0 };
+
+ sema_p(&nvme->n_abort_sema);
+
+ ac.b.ac_cid = abort_cmd->nc_sqe.sqe_cid;
+ ac.b.ac_sqid = abort_cmd->nc_sqid;
+
+ /*
+ * Drop the mutex of the aborted command. From this point on
+ * we must assume that the abort callback has freed the command.
+ */
+ mutex_exit(&abort_cmd->nc_mutex);
+
+ cmd->nc_sqid = 0;
+ cmd->nc_sqe.sqe_opc = NVME_OPC_ABORT;
+ cmd->nc_callback = nvme_wakeup_cmd;
+ cmd->nc_sqe.sqe_cdw10 = ac.r;
+
+ /*
+ * Send the ABORT to the hardware. The ABORT command will return _after_
+ * the aborted command has completed (aborted or otherwise).
+ */
+ if (nvme_admin_cmd(cmd, NVME_ADMIN_CMD_TIMEOUT) != DDI_SUCCESS) {
+ sema_v(&nvme->n_abort_sema);
+ dev_err(nvme->n_dip, CE_WARN,
+ "!nvme_admin_cmd failed for ABORT");
+ atomic_inc_32(&nvme->n_abort_failed);
+ return;
+ }
+ sema_v(&nvme->n_abort_sema);
+
+ if (nvme_check_cmd_status(cmd)) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!ABORT failed with sct = %x, sc = %x",
+ cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
+ atomic_inc_32(&nvme->n_abort_failed);
+ } else {
+ atomic_inc_32(&nvme->n_cmd_aborted);
+ }
+
+ nvme_free_cmd(cmd);
+}
+
+/*
+ * nvme_wait_cmd -- wait for command completion or timeout
+ *
+ * Returns B_TRUE if the command completed normally.
+ *
+ * Returns B_FALSE if the command timed out and an abort was attempted. The
+ * command mutex will be dropped and the command must be considered freed. The
+ * freeing of the command is normally done by the abort command callback.
+ *
+ * In case of a serious error or a timeout of the abort command the hardware
+ * will be declared dead and FMA will be notified.
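+ *
+ * The command mutex must be held on entry; on timeout the command's
+ * callback is swapped to nvme_abort_cmd_cb() before the abort is issued.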
+ */ +static boolean_t +nvme_wait_cmd(nvme_cmd_t *cmd, uint_t usec) +{ + clock_t timeout = ddi_get_lbolt() + drv_usectohz(usec); + nvme_t *nvme = cmd->nc_nvme; + nvme_reg_csts_t csts; + + ASSERT(mutex_owned(&cmd->nc_mutex)); + + while (!cmd->nc_completed) { + if (cv_timedwait(&cmd->nc_cv, &cmd->nc_mutex, timeout) == -1) + break; + } + + if (cmd->nc_completed) + return (B_TRUE); + + /* + * The command timed out. Change the callback to the cleanup function. + */ + cmd->nc_callback = nvme_abort_cmd_cb; + + /* + * Check controller for fatal status, any errors associated with the + * register or DMA handle, or for a double timeout (abort command timed + * out). If necessary log a warning and call FMA. + */ + csts.r = nvme_get32(nvme, NVME_REG_CSTS); + dev_err(nvme->n_dip, CE_WARN, "!command timeout, " + "OPC = %x, CFS = %d", cmd->nc_sqe.sqe_opc, csts.b.csts_cfs); + atomic_inc_32(&nvme->n_cmd_timeout); + + if (csts.b.csts_cfs || + nvme_check_regs_hdl(nvme) || + nvme_check_dma_hdl(cmd->nc_dma) || + cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT) { + ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); + nvme->n_dead = B_TRUE; + mutex_exit(&cmd->nc_mutex); + } else { + /* + * Try to abort the command. The command mutex is released by + * nvme_abort_cmd(). + * If the abort succeeds it will have freed the aborted command. + * If the abort fails for other reasons we must assume that the + * command may complete at any time, and the callback will free + * it for us. + */ + nvme_abort_cmd(cmd); + } + + return (B_FALSE); +} + +static void +nvme_wakeup_cmd(void *arg) +{ + nvme_cmd_t *cmd = arg; + + mutex_enter(&cmd->nc_mutex); + /* + * There is a slight chance that this command completed shortly after + * the timeout was hit in nvme_wait_cmd() but before the callback was + * changed. Catch that case here and clean up accordingly. + */ + if (cmd->nc_callback == nvme_abort_cmd_cb) { + mutex_exit(&cmd->nc_mutex); + nvme_abort_cmd_cb(cmd); + return; + } + + cmd->nc_completed = B_TRUE; + cv_signal(&cmd->nc_cv); + mutex_exit(&cmd->nc_mutex); +} + +static void +nvme_async_event_task(void *arg) +{ + nvme_cmd_t *cmd = arg; + nvme_t *nvme = cmd->nc_nvme; + nvme_error_log_entry_t *error_log = NULL; + nvme_health_log_t *health_log = NULL; + nvme_async_event_t event; + int ret; + + /* + * Check for errors associated with the async request itself. The only + * command-specific error is "async event limit exceeded", which + * indicates a programming error in the driver and causes a panic in + * nvme_check_cmd_status(). + * + * Other possible errors are various scenarios where the async request + * was aborted, or internal errors in the device. Internal errors are + * reported to FMA, the command aborts need no special handling here. + */ + if (nvme_check_cmd_status(cmd)) { + dev_err(cmd->nc_nvme->n_dip, CE_WARN, + "!async event request returned failure, sct = %x, " + "sc = %x, dnr = %d, m = %d", cmd->nc_cqe.cqe_sf.sf_sct, + cmd->nc_cqe.cqe_sf.sf_sc, cmd->nc_cqe.cqe_sf.sf_dnr, + cmd->nc_cqe.cqe_sf.sf_m); + + if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC && + cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INTERNAL_ERR) { + cmd->nc_nvme->n_dead = B_TRUE; + ddi_fm_service_impact(cmd->nc_nvme->n_dip, + DDI_SERVICE_LOST); + } + nvme_free_cmd(cmd); + return; + } + + + event.r = cmd->nc_cqe.cqe_dw0; + + /* Clear CQE and re-submit the async request. 
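Events are only delivered while a request is outstanding, so the command is reused to re-arm event reporting.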
*/ + bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t)); + ret = nvme_submit_cmd(nvme->n_adminq, cmd); + + if (ret != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!failed to resubmit async event request"); + atomic_inc_32(&nvme->n_async_resubmit_failed); + nvme_free_cmd(cmd); + } + + switch (event.b.ae_type) { + case NVME_ASYNC_TYPE_ERROR: + if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) { + error_log = (nvme_error_log_entry_t *) + nvme_get_logpage(nvme, event.b.ae_logpage); + } else { + dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in " + "async event reply: %d", event.b.ae_logpage); + atomic_inc_32(&nvme->n_wrong_logpage); + } + + switch (event.b.ae_info) { + case NVME_ASYNC_ERROR_INV_SQ: + dev_err(nvme->n_dip, CE_PANIC, "programming error: " + "invalid submission queue"); + return; + + case NVME_ASYNC_ERROR_INV_DBL: + dev_err(nvme->n_dip, CE_PANIC, "programming error: " + "invalid doorbell write value"); + return; + + case NVME_ASYNC_ERROR_DIAGFAIL: + dev_err(nvme->n_dip, CE_WARN, "!diagnostic failure"); + ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); + nvme->n_dead = B_TRUE; + atomic_inc_32(&nvme->n_diagfail_event); + break; + + case NVME_ASYNC_ERROR_PERSISTENT: + dev_err(nvme->n_dip, CE_WARN, "!persistent internal " + "device error"); + ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); + nvme->n_dead = B_TRUE; + atomic_inc_32(&nvme->n_persistent_event); + break; + + case NVME_ASYNC_ERROR_TRANSIENT: + dev_err(nvme->n_dip, CE_WARN, "!transient internal " + "device error"); + /* TODO: send ereport */ + atomic_inc_32(&nvme->n_transient_event); + break; + + case NVME_ASYNC_ERROR_FW_LOAD: + dev_err(nvme->n_dip, CE_WARN, + "!firmware image load error"); + atomic_inc_32(&nvme->n_fw_load_event); + break; + } + break; + + case NVME_ASYNC_TYPE_HEALTH: + if (event.b.ae_logpage == NVME_LOGPAGE_HEALTH) { + health_log = (nvme_health_log_t *) + nvme_get_logpage(nvme, event.b.ae_logpage, -1); + } else { + dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in " + "async event reply: %d", event.b.ae_logpage); + atomic_inc_32(&nvme->n_wrong_logpage); + } + + switch (event.b.ae_info) { + case NVME_ASYNC_HEALTH_RELIABILITY: + dev_err(nvme->n_dip, CE_WARN, + "!device reliability compromised"); + /* TODO: send ereport */ + atomic_inc_32(&nvme->n_reliability_event); + break; + + case NVME_ASYNC_HEALTH_TEMPERATURE: + dev_err(nvme->n_dip, CE_WARN, + "!temperature above threshold"); + /* TODO: send ereport */ + atomic_inc_32(&nvme->n_temperature_event); + break; + + case NVME_ASYNC_HEALTH_SPARE: + dev_err(nvme->n_dip, CE_WARN, + "!spare space below threshold"); + /* TODO: send ereport */ + atomic_inc_32(&nvme->n_spare_event); + break; + } + break; + + case NVME_ASYNC_TYPE_VENDOR: + dev_err(nvme->n_dip, CE_WARN, "!vendor specific async event " + "received, info = %x, logpage = %x", event.b.ae_info, + event.b.ae_logpage); + atomic_inc_32(&nvme->n_vendor_event); + break; + + default: + dev_err(nvme->n_dip, CE_WARN, "!unknown async event received, " + "type = %x, info = %x, logpage = %x", event.b.ae_type, + event.b.ae_info, event.b.ae_logpage); + atomic_inc_32(&nvme->n_unknown_event); + break; + } + + if (error_log) + kmem_free(error_log, sizeof (nvme_error_log_entry_t) * + nvme->n_error_log_len); + + if (health_log) + kmem_free(health_log, sizeof (nvme_health_log_t)); +} + +static int +nvme_admin_cmd(nvme_cmd_t *cmd, int usec) +{ + int ret; + + mutex_enter(&cmd->nc_mutex); + ret = nvme_submit_cmd(cmd->nc_nvme->n_adminq, cmd); + + if (ret != DDI_SUCCESS) { + mutex_exit(&cmd->nc_mutex); + 
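+ /* nc_mutex must be released first: nvme_free_cmd() below destroys it. */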
dev_err(cmd->nc_nvme->n_dip, CE_WARN, + "!nvme_submit_cmd failed"); + atomic_inc_32(&cmd->nc_nvme->n_admin_queue_full); + nvme_free_cmd(cmd); + return (DDI_FAILURE); + } + + if (nvme_wait_cmd(cmd, usec) == B_FALSE) { + /* + * The command timed out. An abort command was posted that + * will take care of the cleanup. + */ + return (DDI_FAILURE); + } + mutex_exit(&cmd->nc_mutex); + + return (DDI_SUCCESS); +} + +static int +nvme_async_event(nvme_t *nvme) +{ + nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); + int ret; + + cmd->nc_sqid = 0; + cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT; + cmd->nc_callback = nvme_async_event_task; + + ret = nvme_submit_cmd(nvme->n_adminq, cmd); + + if (ret != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!nvme_submit_cmd failed for ASYNCHRONOUS EVENT"); + nvme_free_cmd(cmd); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +static void * +nvme_get_logpage(nvme_t *nvme, uint8_t logpage, ...) +{ + nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); + void *buf = NULL; + nvme_getlogpage_t getlogpage; + size_t bufsize; + va_list ap; + + va_start(ap, logpage); + + cmd->nc_sqid = 0; + cmd->nc_callback = nvme_wakeup_cmd; + cmd->nc_sqe.sqe_opc = NVME_OPC_GET_LOG_PAGE; + + getlogpage.b.lp_lid = logpage; + + switch (logpage) { + case NVME_LOGPAGE_ERROR: + cmd->nc_sqe.sqe_nsid = (uint32_t)-1; + bufsize = nvme->n_error_log_len * + sizeof (nvme_error_log_entry_t); + break; + + case NVME_LOGPAGE_HEALTH: + cmd->nc_sqe.sqe_nsid = va_arg(ap, uint32_t); + bufsize = sizeof (nvme_health_log_t); + break; + + case NVME_LOGPAGE_FWSLOT: + cmd->nc_sqe.sqe_nsid = (uint32_t)-1; + bufsize = sizeof (nvme_fwslot_log_t); + break; + + default: + dev_err(nvme->n_dip, CE_WARN, "!unknown log page requested: %d", + logpage); + atomic_inc_32(&nvme->n_unknown_logpage); + goto fail; + } + + va_end(ap); + + getlogpage.b.lp_numd = bufsize / sizeof (uint32_t); + + cmd->nc_sqe.sqe_cdw10 = getlogpage.r; + + if (nvme_zalloc_dma(nvme, getlogpage.b.lp_numd * sizeof (uint32_t), + DDI_DMA_READ, &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!nvme_zalloc_dma failed for GET LOG PAGE"); + goto fail; + } + + if (cmd->nc_dma->nd_ncookie > 2) { + dev_err(nvme->n_dip, CE_WARN, + "!too many DMA cookies for GET LOG PAGE"); + atomic_inc_32(&nvme->n_too_many_cookies); + goto fail; + } + + cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress; + if (cmd->nc_dma->nd_ncookie > 1) { + ddi_dma_nextcookie(cmd->nc_dma->nd_dmah, + &cmd->nc_dma->nd_cookie); + cmd->nc_sqe.sqe_dptr.d_prp[1] = + cmd->nc_dma->nd_cookie.dmac_laddress; + } + + if (nvme_admin_cmd(cmd, NVME_ADMIN_CMD_TIMEOUT) != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!nvme_admin_cmd failed for GET LOG PAGE"); + return (NULL); + } + + if (nvme_check_cmd_status(cmd)) { + dev_err(nvme->n_dip, CE_WARN, + "!GET LOG PAGE failed with sct = %x, sc = %x", + cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); + goto fail; + } + + buf = kmem_alloc(bufsize, KM_SLEEP); + bcopy(cmd->nc_dma->nd_memp, buf, bufsize); + +fail: + nvme_free_cmd(cmd); + + return (buf); +} + +static void * +nvme_identify(nvme_t *nvme, uint32_t nsid) +{ + nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); + void *buf = NULL; + + cmd->nc_sqid = 0; + cmd->nc_callback = nvme_wakeup_cmd; + cmd->nc_sqe.sqe_opc = NVME_OPC_IDENTIFY; + cmd->nc_sqe.sqe_nsid = nsid; + cmd->nc_sqe.sqe_cdw10 = nsid ? 
NVME_IDENTIFY_NSID : NVME_IDENTIFY_CTRL; + + if (nvme_zalloc_dma(nvme, NVME_IDENTIFY_BUFSIZE, DDI_DMA_READ, + &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!nvme_zalloc_dma failed for IDENTIFY"); + goto fail; + } + + if (cmd->nc_dma->nd_ncookie > 2) { + dev_err(nvme->n_dip, CE_WARN, + "!too many DMA cookies for IDENTIFY"); + atomic_inc_32(&nvme->n_too_many_cookies); + goto fail; + } + + cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress; + if (cmd->nc_dma->nd_ncookie > 1) { + ddi_dma_nextcookie(cmd->nc_dma->nd_dmah, + &cmd->nc_dma->nd_cookie); + cmd->nc_sqe.sqe_dptr.d_prp[1] = + cmd->nc_dma->nd_cookie.dmac_laddress; + } + + if (nvme_admin_cmd(cmd, NVME_ADMIN_CMD_TIMEOUT) != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!nvme_admin_cmd failed for IDENTIFY"); + return (NULL); + } + + if (nvme_check_cmd_status(cmd)) { + dev_err(nvme->n_dip, CE_WARN, + "!IDENTIFY failed with sct = %x, sc = %x", + cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); + goto fail; + } + + buf = kmem_alloc(NVME_IDENTIFY_BUFSIZE, KM_SLEEP); + bcopy(cmd->nc_dma->nd_memp, buf, NVME_IDENTIFY_BUFSIZE); + +fail: + nvme_free_cmd(cmd); + + return (buf); +} + +static int +nvme_set_nqueues(nvme_t *nvme, uint16_t nqueues) +{ + nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); + nvme_nqueue_t nq = { 0 }; + + nq.b.nq_nsq = nq.b.nq_ncq = nqueues; + + cmd->nc_sqid = 0; + cmd->nc_callback = nvme_wakeup_cmd; + cmd->nc_sqe.sqe_opc = NVME_OPC_SET_FEATURES; + cmd->nc_sqe.sqe_cdw10 = NVME_FEAT_NQUEUES; + cmd->nc_sqe.sqe_cdw11 = nq.r; + + if (nvme_admin_cmd(cmd, NVME_ADMIN_CMD_TIMEOUT) != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!nvme_admin_cmd failed for SET FEATURES (NQUEUES)"); + return (0); + } + + if (nvme_check_cmd_status(cmd)) { + dev_err(nvme->n_dip, CE_WARN, + "!SET FEATURES (NQUEUES) failed with sct = %x, sc = %x", + cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); + nvme_free_cmd(cmd); + return (0); + } + + nq.r = cmd->nc_cqe.cqe_dw0; + nvme_free_cmd(cmd); + + /* + * Always use the same number of submission and completion queues, and + * never use more than the requested number of queues. 
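+ * The counts granted by the controller are returned in cdw0 of the
+ * completion and may be smaller than what was asked for.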
+ */ + return (MIN(nqueues, MIN(nq.b.nq_nsq, nq.b.nq_ncq))); +} + +static int +nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx) +{ + nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); + nvme_create_queue_dw10_t dw10 = { 0 }; + nvme_create_cq_dw11_t c_dw11 = { 0 }; + nvme_create_sq_dw11_t s_dw11 = { 0 }; + + dw10.b.q_qid = idx; + dw10.b.q_qsize = qp->nq_nentry - 1; + + c_dw11.b.cq_pc = 1; + c_dw11.b.cq_ien = 1; + c_dw11.b.cq_iv = idx % nvme->n_intr_cnt; + + cmd->nc_sqid = 0; + cmd->nc_callback = nvme_wakeup_cmd; + cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_CQUEUE; + cmd->nc_sqe.sqe_cdw10 = dw10.r; + cmd->nc_sqe.sqe_cdw11 = c_dw11.r; + cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_cqdma->nd_cookie.dmac_laddress; + + if (nvme_admin_cmd(cmd, NVME_ADMIN_CMD_TIMEOUT) != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!nvme_admin_cmd failed for CREATE CQUEUE"); + return (DDI_FAILURE); + } + + if (nvme_check_cmd_status(cmd)) { + dev_err(nvme->n_dip, CE_WARN, + "!CREATE CQUEUE failed with sct = %x, sc = %x", + cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); + nvme_free_cmd(cmd); + return (DDI_FAILURE); + } + + nvme_free_cmd(cmd); + + s_dw11.b.sq_pc = 1; + s_dw11.b.sq_cqid = idx; + + cmd = nvme_alloc_cmd(nvme, KM_SLEEP); + cmd->nc_sqid = 0; + cmd->nc_callback = nvme_wakeup_cmd; + cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_SQUEUE; + cmd->nc_sqe.sqe_cdw10 = dw10.r; + cmd->nc_sqe.sqe_cdw11 = s_dw11.r; + cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_sqdma->nd_cookie.dmac_laddress; + + if (nvme_admin_cmd(cmd, NVME_ADMIN_CMD_TIMEOUT) != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!nvme_admin_cmd failed for CREATE SQUEUE"); + return (DDI_FAILURE); + } + + if (nvme_check_cmd_status(cmd)) { + dev_err(nvme->n_dip, CE_WARN, + "!CREATE SQUEUE failed with sct = %x, sc = %x", + cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); + nvme_free_cmd(cmd); + return (DDI_FAILURE); + } + + nvme_free_cmd(cmd); + + return (DDI_SUCCESS); +} + +static boolean_t +nvme_reset(nvme_t *nvme, boolean_t quiesce) +{ + nvme_reg_csts_t csts; + int i; + + nvme_put32(nvme, NVME_REG_CC, 0); + + csts.r = nvme_get32(nvme, NVME_REG_CSTS); + if (csts.b.csts_rdy == 1) { + nvme_put32(nvme, NVME_REG_CC, 0); + for (i = 0; i != nvme->n_timeout * 10; i++) { + csts.r = nvme_get32(nvme, NVME_REG_CSTS); + if (csts.b.csts_rdy == 0) + break; + + if (quiesce) + drv_usecwait(50000); + else + delay(drv_usectohz(50000)); + } + } + + nvme_put32(nvme, NVME_REG_AQA, 0); + nvme_put32(nvme, NVME_REG_ASQ, 0); + nvme_put32(nvme, NVME_REG_ACQ, 0); + + csts.r = nvme_get32(nvme, NVME_REG_CSTS); + return (csts.b.csts_rdy == 0 ? 
B_TRUE : B_FALSE); +} + +static void +nvme_shutdown(nvme_t *nvme, int mode, boolean_t quiesce) +{ + nvme_reg_cc_t cc; + nvme_reg_csts_t csts; + int i; + + ASSERT(mode == NVME_CC_SHN_NORMAL || mode == NVME_CC_SHN_ABRUPT); + + cc.r = nvme_get32(nvme, NVME_REG_CC); + cc.b.cc_shn = mode & 0x3; + nvme_put32(nvme, NVME_REG_CC, cc.r); + + for (i = 0; i != 10; i++) { + csts.r = nvme_get32(nvme, NVME_REG_CSTS); + if (csts.b.csts_shst == NVME_CSTS_SHN_COMPLETE) + break; + + if (quiesce) + drv_usecwait(100000); + else + delay(drv_usectohz(100000)); + } +} + + +static void +nvme_prepare_devid(nvme_t *nvme, uint32_t nsid) +{ + char model[sizeof (nvme->n_idctl->id_model) + 1]; + char serial[sizeof (nvme->n_idctl->id_serial) + 1]; + + bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model)); + bcopy(nvme->n_idctl->id_serial, serial, + sizeof (nvme->n_idctl->id_serial)); + + model[sizeof (nvme->n_idctl->id_model)] = '\0'; + serial[sizeof (nvme->n_idctl->id_serial)] = '\0'; + + (void) snprintf(nvme->n_ns[nsid - 1].ns_devid, + sizeof (nvme->n_ns[0].ns_devid), "%4X-%s-%s-%X", + nvme->n_idctl->id_vid, model, serial, nsid); +} + +static int +nvme_init(nvme_t *nvme) +{ + nvme_reg_cc_t cc = { 0 }; + nvme_reg_aqa_t aqa = { 0 }; + nvme_reg_asq_t asq = { 0 }; + nvme_reg_acq_t acq = { 0 }; + nvme_reg_cap_t cap; + nvme_reg_vs_t vs; + nvme_reg_csts_t csts; + int i = 0; + int nqueues; + + /* Setup fixed interrupt for admin queue. */ + if (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_FIXED, 1) + != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!failed to setup fixed interrupt"); + goto fail; + } + + /* Check controller version */ + vs.r = nvme_get32(nvme, NVME_REG_VS); + dev_err(nvme->n_dip, CE_CONT, "?NVMe spec version %d.%d", + vs.b.vs_mjr, vs.b.vs_mnr); + + if (nvme_version_major < vs.b.vs_mjr && + nvme_version_minor < vs.b.vs_mnr) { + dev_err(nvme->n_dip, CE_WARN, "!no support for version > %d.%d", + nvme_version_major, nvme_version_minor); + if (nvme->n_strict_version) + goto fail; + } + + /* retrieve controller configuration */ + cap.r = nvme_get64(nvme, NVME_REG_CAP); + + if ((cap.b.cap_css & NVME_CAP_CSS_NVM) == 0) { + dev_err(nvme->n_dip, CE_WARN, + "!NVM command set not supported by hardware"); + goto fail; + } + + nvme->n_nssr_supported = cap.b.cap_nssrs; + nvme->n_doorbell_stride = 4 << cap.b.cap_dstrd; + nvme->n_timeout = cap.b.cap_to; + nvme->n_arbitration_mechanisms = cap.b.cap_ams; + nvme->n_cont_queues_reqd = cap.b.cap_cqr; + nvme->n_max_queue_entries = cap.b.cap_mqes + 1; + + /* + * The MPSMIN and MPSMAX fields in the CAP register use 0 to specify + * the base page size of 4k (1<<12), so add 12 here to get the real + * page size value. + */ + nvme->n_pageshift = MIN(MAX(cap.b.cap_mpsmin + 12, PAGESHIFT), + cap.b.cap_mpsmax + 12); + nvme->n_pagesize = 1UL << (nvme->n_pageshift); + + /* + * Set up Queue DMA to transfer at least 1 page-aligned page at a time. + */ + nvme->n_queue_dma_attr.dma_attr_align = nvme->n_pagesize; + nvme->n_queue_dma_attr.dma_attr_minxfer = nvme->n_pagesize; + + /* + * Set up PRP DMA to transfer 1 page-aligned page at a time. + * Maxxfer may be increased after we identified the controller limits. + */ + nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_pagesize; + nvme->n_prp_dma_attr.dma_attr_minxfer = nvme->n_pagesize; + nvme->n_prp_dma_attr.dma_attr_align = nvme->n_pagesize; + + /* + * Reset controller if it's still in ready state. 
+ */ + if (nvme_reset(nvme, B_FALSE) == B_FALSE) { + dev_err(nvme->n_dip, CE_WARN, "!unable to reset controller"); + ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); + nvme->n_dead = B_TRUE; + goto fail; + } + + /* + * Create the admin queue pair. + */ + if (nvme_alloc_qpair(nvme, nvme->n_admin_queue_len, &nvme->n_adminq, 0) + != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!unable to allocate admin qpair"); + goto fail; + } + nvme->n_ioq = kmem_alloc(sizeof (nvme_qpair_t *), KM_SLEEP); + nvme->n_ioq[0] = nvme->n_adminq; + + nvme->n_progress |= NVME_ADMIN_QUEUE; + + (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, + "admin-queue-len", nvme->n_admin_queue_len); + + aqa.b.aqa_asqs = aqa.b.aqa_acqs = nvme->n_admin_queue_len - 1; + asq = nvme->n_adminq->nq_sqdma->nd_cookie.dmac_laddress; + acq = nvme->n_adminq->nq_cqdma->nd_cookie.dmac_laddress; + + ASSERT((asq & (nvme->n_pagesize - 1)) == 0); + ASSERT((acq & (nvme->n_pagesize - 1)) == 0); + + nvme_put32(nvme, NVME_REG_AQA, aqa.r); + nvme_put64(nvme, NVME_REG_ASQ, asq); + nvme_put64(nvme, NVME_REG_ACQ, acq); + + cc.b.cc_ams = 0; /* use Round-Robin arbitration */ + cc.b.cc_css = 0; /* use NVM command set */ + cc.b.cc_mps = nvme->n_pageshift - 12; + cc.b.cc_shn = 0; /* no shutdown in progress */ + cc.b.cc_en = 1; /* enable controller */ + + nvme_put32(nvme, NVME_REG_CC, cc.r); + + /* + * Wait for the controller to become ready. + */ + csts.r = nvme_get32(nvme, NVME_REG_CSTS); + if (csts.b.csts_rdy == 0) { + for (i = 0; i != nvme->n_timeout * 10; i++) { + delay(drv_usectohz(50000)); + csts.r = nvme_get32(nvme, NVME_REG_CSTS); + + if (csts.b.csts_cfs == 1) { + dev_err(nvme->n_dip, CE_WARN, + "!controller fatal status at init"); + ddi_fm_service_impact(nvme->n_dip, + DDI_SERVICE_LOST); + nvme->n_dead = B_TRUE; + goto fail; + } + + if (csts.b.csts_rdy == 1) + break; + } + } + + if (csts.b.csts_rdy == 0) { + dev_err(nvme->n_dip, CE_WARN, "!controller not ready"); + ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); + nvme->n_dead = B_TRUE; + goto fail; + } + + /* + * Assume an abort command limit of 1. We'll destroy and re-init + * that later when we know the true abort command limit. + */ + sema_init(&nvme->n_abort_sema, 1, NULL, SEMA_DRIVER, NULL); + + /* + * Post an asynchronous event command to catch errors. + */ + if (nvme_async_event(nvme) != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!failed to post async event"); + goto fail; + } + + /* + * Identify Controller + */ + nvme->n_idctl = nvme_identify(nvme, 0); + if (nvme->n_idctl == NULL) { + dev_err(nvme->n_dip, CE_WARN, + "!failed to identify controller"); + goto fail; + } + + /* + * Get controller limits. 
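+ * AERL and ACL in the identify data are zero-based values, hence the +1
+ * adjustments below.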
+ */ + nvme->n_async_event_limit = MAX(NVME_MIN_ASYNC_EVENT_LIMIT, + MIN(nvme->n_admin_queue_len / 10, + MIN(nvme->n_idctl->id_aerl + 1, nvme->n_async_event_limit))); + + (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, + "async-event-limit", nvme->n_async_event_limit); + + nvme->n_abort_command_limit = nvme->n_idctl->id_acl + 1; + + /* disable NVMe interrupts while reinitializing the semaphore */ + nvme_disable_interrupts(nvme); + sema_destroy(&nvme->n_abort_sema); + sema_init(&nvme->n_abort_sema, nvme->n_abort_command_limit - 1, NULL, + SEMA_DRIVER, NULL); + if (nvme_enable_interrupts(nvme) != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!failed to re-enable interrupts"); + goto fail; + } + + nvme->n_progress |= NVME_CTRL_LIMITS; + + if (nvme->n_idctl->id_mdts == 0) + nvme->n_max_data_transfer_size = nvme->n_pagesize * 65536; + else + nvme->n_max_data_transfer_size = + 1ull << (nvme->n_pageshift + nvme->n_idctl->id_mdts); + + nvme->n_error_log_len = nvme->n_idctl->id_elpe + 1; + + /* + * Limit n_max_data_transfer_size to what we can handle in one PRP. + * Chained PRPs are currently unsupported. + * + * This is a no-op on hardware which doesn't support a transfer size + * big enough to require chained PRPs. + */ + nvme->n_max_data_transfer_size = MIN(nvme->n_max_data_transfer_size, + (nvme->n_pagesize / sizeof (uint64_t) * nvme->n_pagesize)); + + nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_max_data_transfer_size; + + /* + * Make sure the minimum/maximum queue entry sizes are not + * larger/smaller than the default. + */ + + if (((1 << nvme->n_idctl->id_sqes.qes_min) > sizeof (nvme_sqe_t)) || + ((1 << nvme->n_idctl->id_sqes.qes_max) < sizeof (nvme_sqe_t)) || + ((1 << nvme->n_idctl->id_cqes.qes_min) > sizeof (nvme_cqe_t)) || + ((1 << nvme->n_idctl->id_cqes.qes_max) < sizeof (nvme_cqe_t))) + goto fail; + + /* + * Check for the presence of a Volatile Write Cache. If present, + * enable it by default. + */ + if (nvme->n_idctl->id_vwc.vwc_present == 0) { + nvme->n_volatile_write_cache_enabled = B_FALSE; + nvme_bd_ops.o_sync_cache = NULL; + } else { + /* + * TODO: send SET FEATURES to enable VWC + * (have no hardware to test this) + */ + nvme->n_volatile_write_cache_enabled = B_FALSE; + nvme_bd_ops.o_sync_cache = NULL; + } + + /* + * Grab a copy of all mandatory log pages. + * + * TODO: should go away once user space tool exists to print logs + */ + nvme->n_error_log = (nvme_error_log_entry_t *) + nvme_get_logpage(nvme, NVME_LOGPAGE_ERROR); + nvme->n_health_log = (nvme_health_log_t *) + nvme_get_logpage(nvme, NVME_LOGPAGE_HEALTH, -1); + nvme->n_fwslot_log = (nvme_fwslot_log_t *) + nvme_get_logpage(nvme, NVME_LOGPAGE_FWSLOT); + + /* + * Identify Namespaces + */ + nvme->n_namespace_count = nvme->n_idctl->id_nn; + nvme->n_ns = kmem_zalloc(sizeof (nvme_namespace_t) * + nvme->n_namespace_count, KM_SLEEP); + + for (i = 0; i != nvme->n_namespace_count; i++) { + nvme_identify_nsid_t *idns; + int last_rp; + + nvme->n_ns[i].ns_nvme = nvme; + nvme->n_ns[i].ns_idns = idns = nvme_identify(nvme, i + 1); + + if (idns == NULL) { + dev_err(nvme->n_dip, CE_WARN, + "!failed to identify namespace %d", i + 1); + goto fail; + } + + nvme->n_ns[i].ns_id = i + 1; + nvme->n_ns[i].ns_block_count = idns->id_nsize; + nvme->n_ns[i].ns_block_size = + 1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads; + nvme->n_ns[i].ns_best_block_size = nvme->n_ns[i].ns_block_size; + + nvme_prepare_devid(nvme, nvme->n_ns[i].ns_id); + + /* + * Find the LBA format with no metadata and the best relative + * performance. 
A value of 3 means "degraded", 0 is best. + */ + last_rp = 3; + for (int j = 0; j != idns->id_nlbaf; j++) { + if (idns->id_lbaf[j].lbaf_lbads == 0) + break; + if (idns->id_lbaf[j].lbaf_ms != 0) + continue; + if (idns->id_lbaf[j].lbaf_rp >= last_rp) + continue; + last_rp = idns->id_lbaf[j].lbaf_rp; + nvme->n_ns[i].ns_best_block_size = + 1 << idns->id_lbaf[j].lbaf_lbads; + } + + /* + * We currently don't support namespaces that use either: + * - thin provisioning + * - extended LBAs + * - protection information + */ + if (idns->id_nsfeat.f_thin || + idns->id_flbas.lba_extlba || + idns->id_dps.dp_pinfo) { + dev_err(nvme->n_dip, CE_WARN, + "!ignoring namespace %d, unsupported features: " + "thin = %d, extlba = %d, pinfo = %d", i + 1, + idns->id_nsfeat.f_thin, idns->id_flbas.lba_extlba, + idns->id_dps.dp_pinfo); + nvme->n_ns[i].ns_ignore = B_TRUE; + } + } + + /* + * Try to set up MSI/MSI-X interrupts. + */ + if ((nvme->n_intr_types & (DDI_INTR_TYPE_MSI | DDI_INTR_TYPE_MSIX)) + != 0) { + nvme_release_interrupts(nvme); + + nqueues = MIN(UINT16_MAX, ncpus); + + if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, + nqueues) != DDI_SUCCESS) && + (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, + nqueues) != DDI_SUCCESS)) { + dev_err(nvme->n_dip, CE_WARN, + "!failed to setup MSI/MSI-X interrupts"); + goto fail; + } + } + + nqueues = nvme->n_intr_cnt; + + /* + * Create I/O queue pairs. + */ + nvme->n_ioq_count = nvme_set_nqueues(nvme, nqueues); + if (nvme->n_ioq_count == 0) { + dev_err(nvme->n_dip, CE_WARN, + "!failed to set number of I/O queues to %d", nqueues); + goto fail; + } + + /* + * Reallocate I/O queue array + */ + kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *)); + nvme->n_ioq = kmem_zalloc(sizeof (nvme_qpair_t *) * + (nvme->n_ioq_count + 1), KM_SLEEP); + nvme->n_ioq[0] = nvme->n_adminq; + + /* + * If we got less queues than we asked for we might as well give + * some of the interrupt vectors back to the system. + */ + if (nvme->n_ioq_count < nqueues) { + nvme_release_interrupts(nvme); + + if (nvme_setup_interrupts(nvme, nvme->n_intr_type, nqueues) + != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!failed to reduce number of interrupts"); + goto fail; + } + } + + /* + * Alloc & register I/O queue pairs + */ + nvme->n_io_queue_len = + MIN(nvme->n_io_queue_len, nvme->n_max_queue_entries); + (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-queue-len", + nvme->n_io_queue_len); + + for (i = 1; i != nvme->n_ioq_count + 1; i++) { + if (nvme_alloc_qpair(nvme, nvme->n_io_queue_len, + &nvme->n_ioq[i], i) != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!unable to allocate I/O qpair %d", i); + goto fail; + } + + if (nvme_create_io_qpair(nvme, nvme->n_ioq[i], i) + != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!unable to create I/O qpair %d", i); + goto fail; + } + } + + /* + * Post more asynchronous events commands to reduce event reporting + * latency as suggested by the spec. 
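+ * One request was already posted earlier in this function, so the loop
+ * below starts at 1 and fills the remaining slots up to the limit.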
+ */ + for (i = 1; i != nvme->n_async_event_limit; i++) { + if (nvme_async_event(nvme) != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!failed to post async event %d", i); + goto fail; + } + } + + return (DDI_SUCCESS); + +fail: + (void) nvme_reset(nvme, B_FALSE); + return (DDI_FAILURE); +} + +static uint_t +nvme_intr(caddr_t arg1, caddr_t arg2) +{ + /*LINTED: E_PTR_BAD_CAST_ALIGN*/ + nvme_t *nvme = (nvme_t *)arg1; + int inum = (int)(uintptr_t)arg2; + int qnum; + nvme_cmd_t *cmd; + + if (inum >= nvme->n_intr_cnt) + return (DDI_INTR_UNCLAIMED); + + /* + * The interrupt vector a queue uses is calculated as queue_idx % + * intr_cnt in nvme_create_io_qpair(). Iterate through the queue array + * in steps of n_intr_cnt to process all queues using this vector. + */ + for (qnum = inum; + qnum < nvme->n_ioq_count + 1 && nvme->n_ioq[qnum] != NULL; + qnum += nvme->n_intr_cnt) { + while ((cmd = nvme_retrieve_cmd(nvme, nvme->n_ioq[qnum]))) { + taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq, + cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent); + } + } + + return (DDI_INTR_CLAIMED); +} + +static void +nvme_disable_interrupts(nvme_t *nvme) +{ + int i; + + for (i = 0; i < nvme->n_intr_cnt; i++) { + if (nvme->n_inth[i] == NULL) + break; + + if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK) + (void) ddi_intr_block_disable(&nvme->n_inth[i], 1); + else + (void) ddi_intr_disable(nvme->n_inth[i]); + } +} + +static int +nvme_enable_interrupts(nvme_t *nvme) +{ + int i, fail = 0; + + for (i = 0; i < nvme->n_intr_cnt; i++) { + if (nvme->n_inth[i] == NULL) + break; + + if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK) { + if (ddi_intr_block_enable(&nvme->n_inth[i], 1) != + DDI_SUCCESS) + fail++; + } else { + if (ddi_intr_enable(nvme->n_inth[i]) != DDI_SUCCESS) + fail++; + } + } + + return (fail ? DDI_FAILURE : DDI_SUCCESS); +} + +static void +nvme_release_interrupts(nvme_t *nvme) +{ + int i; + + nvme_disable_interrupts(nvme); + + for (i = 0; i < nvme->n_intr_cnt; i++) { + if (nvme->n_inth[i] == NULL) + break; + + (void) ddi_intr_remove_handler(nvme->n_inth[i]); + (void) ddi_intr_free(nvme->n_inth[i]); + } + + kmem_free(nvme->n_inth, nvme->n_inth_sz); + nvme->n_inth = NULL; + nvme->n_inth_sz = 0; + + nvme->n_progress &= ~NVME_INTERRUPTS; +} + +static int +nvme_setup_interrupts(nvme_t *nvme, int intr_type, int nqpairs) +{ + int nintrs, navail, count; + int ret; + int i; + + if (nvme->n_intr_types == 0) { + ret = ddi_intr_get_supported_types(nvme->n_dip, + &nvme->n_intr_types); + if (ret != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!%s: ddi_intr_get_supported types failed", + __func__); + return (ret); + } + } + + if ((nvme->n_intr_types & intr_type) == 0) + return (DDI_FAILURE); + + ret = ddi_intr_get_nintrs(nvme->n_dip, intr_type, &nintrs); + if (ret != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_nintrs failed", + __func__); + return (ret); + } + + ret = ddi_intr_get_navail(nvme->n_dip, intr_type, &navail); + if (ret != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_navail failed", + __func__); + return (ret); + } + + /* We want at most one interrupt per queue pair. 
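Any additional vectors would go unused, as queues are mapped onto vectors by index modulo the vector count.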
*/ + if (navail > nqpairs) + navail = nqpairs; + + nvme->n_inth_sz = sizeof (ddi_intr_handle_t) * navail; + nvme->n_inth = kmem_zalloc(nvme->n_inth_sz, KM_SLEEP); + + ret = ddi_intr_alloc(nvme->n_dip, nvme->n_inth, intr_type, 0, navail, + &count, 0); + if (ret != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_alloc failed", + __func__); + goto fail; + } + + nvme->n_intr_cnt = count; + + ret = ddi_intr_get_pri(nvme->n_inth[0], &nvme->n_intr_pri); + if (ret != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_pri failed", + __func__); + goto fail; + } + + for (i = 0; i < count; i++) { + ret = ddi_intr_add_handler(nvme->n_inth[i], nvme_intr, + (void *)nvme, (void *)(uintptr_t)i); + if (ret != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!%s: ddi_intr_add_handler failed", __func__); + goto fail; + } + } + + (void) ddi_intr_get_cap(nvme->n_inth[0], &nvme->n_intr_cap); + + ret = nvme_enable_interrupts(nvme); + + if (ret != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!%s: nvme_enable_interrupts failed", __func__); + goto fail; + } + + nvme->n_intr_type = intr_type; + + nvme->n_progress |= NVME_INTERRUPTS; + + return (DDI_SUCCESS); + +fail: + nvme_release_interrupts(nvme); + + return (ret); +} + +static int +nvme_fm_errcb(dev_info_t *dip, ddi_fm_error_t *fm_error, const void *arg) +{ + _NOTE(ARGUNUSED(arg)); + + pci_ereport_post(dip, fm_error, NULL); + return (fm_error->fme_status); +} + +static int +nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + nvme_t *nvme; + int instance; + int nregs; + off_t regsize; + int i; + char name[32]; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + instance = ddi_get_instance(dip); + + if (ddi_soft_state_zalloc(nvme_state, instance) != DDI_SUCCESS) + return (DDI_FAILURE); + + nvme = ddi_get_soft_state(nvme_state, instance); + ddi_set_driver_private(dip, nvme); + nvme->n_dip = dip; + + nvme->n_strict_version = ddi_prop_get_int(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS, "strict-version", 1) == 1 ? B_TRUE : B_FALSE; + nvme->n_ignore_unknown_vendor_status = ddi_prop_get_int(DDI_DEV_T_ANY, + dip, DDI_PROP_DONTPASS, "ignore-unknown-vendor-status", 0) == 1 ? + B_TRUE : B_FALSE; + nvme->n_admin_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS, "admin-queue-len", NVME_DEFAULT_ADMIN_QUEUE_LEN); + nvme->n_io_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS, "io-queue-len", NVME_DEFAULT_IO_QUEUE_LEN); + nvme->n_async_event_limit = ddi_prop_get_int(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS, "async-event-limit", + NVME_DEFAULT_ASYNC_EVENT_LIMIT); + + if (nvme->n_admin_queue_len < NVME_MIN_ADMIN_QUEUE_LEN) + nvme->n_admin_queue_len = NVME_MIN_ADMIN_QUEUE_LEN; + else if (nvme->n_admin_queue_len > NVME_MAX_ADMIN_QUEUE_LEN) + nvme->n_admin_queue_len = NVME_MAX_ADMIN_QUEUE_LEN; + + if (nvme->n_io_queue_len < NVME_MIN_IO_QUEUE_LEN) + nvme->n_io_queue_len = NVME_MIN_IO_QUEUE_LEN; + + if (nvme->n_async_event_limit < 1) + nvme->n_async_event_limit = NVME_DEFAULT_ASYNC_EVENT_LIMIT; + + nvme->n_reg_acc_attr = nvme_reg_acc_attr; + nvme->n_queue_dma_attr = nvme_queue_dma_attr; + nvme->n_prp_dma_attr = nvme_prp_dma_attr; + nvme->n_sgl_dma_attr = nvme_sgl_dma_attr; + + /* + * Setup FMA support. 
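+ * The default capabilities may be restricted via the "fm-capable"
+ * property; the access and DMA attributes are adjusted to match below.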
+ */ + nvme->n_fm_cap = ddi_getprop(DDI_DEV_T_ANY, dip, + DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "fm-capable", + DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE | + DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE); + + ddi_fm_init(dip, &nvme->n_fm_cap, &nvme->n_fm_ibc); + + if (nvme->n_fm_cap) { + if (nvme->n_fm_cap & DDI_FM_ACCCHK_CAPABLE) + nvme->n_reg_acc_attr.devacc_attr_access = + DDI_FLAGERR_ACC; + + if (nvme->n_fm_cap & DDI_FM_DMACHK_CAPABLE) { + nvme->n_prp_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; + nvme->n_sgl_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; + } + + if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) || + DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) + pci_ereport_setup(dip); + + if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) + ddi_fm_handler_register(dip, nvme_fm_errcb, + (void *)nvme); + } + + nvme->n_progress |= NVME_FMA_INIT; + + /* + * The spec defines several register sets. Only the controller + * registers (set 1) are currently used. + */ + if (ddi_dev_nregs(dip, &nregs) == DDI_FAILURE || + nregs < 2 || + ddi_dev_regsize(dip, 1, ®size) == DDI_FAILURE) + goto fail; + + if (ddi_regs_map_setup(dip, 1, &nvme->n_regs, 0, regsize, + &nvme->n_reg_acc_attr, &nvme->n_regh) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "!failed to map regset 1"); + goto fail; + } + + nvme->n_progress |= NVME_REGS_MAPPED; + + /* + * Create taskq for command completion. + */ + (void) snprintf(name, sizeof (name), "%s%d_cmd_taskq", + ddi_driver_name(dip), ddi_get_instance(dip)); + nvme->n_cmd_taskq = ddi_taskq_create(dip, name, MIN(UINT16_MAX, ncpus), + TASKQ_DEFAULTPRI, 0); + if (nvme->n_cmd_taskq == NULL) { + dev_err(dip, CE_WARN, "!failed to create cmd taskq"); + goto fail; + } + + + if (nvme_init(nvme) != DDI_SUCCESS) + goto fail; + + /* + * Attach the blkdev driver for each namespace. 
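+ * Namespaces flagged as unsupported in nvme_init() are skipped here.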
+ */ + for (i = 0; i != nvme->n_namespace_count; i++) { + if (nvme->n_ns[i].ns_ignore) + continue; + + nvme->n_ns[i].ns_bd_hdl = bd_alloc_handle(&nvme->n_ns[i], + &nvme_bd_ops, &nvme->n_prp_dma_attr, KM_SLEEP); + + if (nvme->n_ns[i].ns_bd_hdl == NULL) { + dev_err(dip, CE_WARN, + "!failed to get blkdev handle for namespace %d", i); + goto fail; + } + + if (bd_attach_handle(dip, nvme->n_ns[i].ns_bd_hdl) + != DDI_SUCCESS) { + dev_err(dip, CE_WARN, + "!failed to attach blkdev handle for namespace %d", + i); + goto fail; + } + } + + return (DDI_SUCCESS); + +fail: + /* attach successful anyway so that FMA can retire the device */ + if (nvme->n_dead) + return (DDI_SUCCESS); + + (void) nvme_detach(dip, DDI_DETACH); + + return (DDI_FAILURE); +} + +static int +nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + int instance, i; + nvme_t *nvme; + + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + instance = ddi_get_instance(dip); + + nvme = ddi_get_soft_state(nvme_state, instance); + + if (nvme == NULL) + return (DDI_FAILURE); + + if (nvme->n_ns) { + for (i = 0; i != nvme->n_namespace_count; i++) { + if (nvme->n_ns[i].ns_bd_hdl) { + (void) bd_detach_handle( + nvme->n_ns[i].ns_bd_hdl); + bd_free_handle(nvme->n_ns[i].ns_bd_hdl); + } + + if (nvme->n_ns[i].ns_idns) + kmem_free(nvme->n_ns[i].ns_idns, + sizeof (nvme_identify_nsid_t)); + } + + kmem_free(nvme->n_ns, sizeof (nvme_namespace_t) * + nvme->n_namespace_count); + } + + if (nvme->n_progress & NVME_INTERRUPTS) + nvme_release_interrupts(nvme); + + if (nvme->n_cmd_taskq) + ddi_taskq_wait(nvme->n_cmd_taskq); + + if (nvme->n_ioq_count > 0) { + for (i = 1; i != nvme->n_ioq_count + 1; i++) { + if (nvme->n_ioq[i] != NULL) { + /* TODO: send destroy queue commands */ + nvme_free_qpair(nvme->n_ioq[i]); + } + } + + kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *) * + (nvme->n_ioq_count + 1)); + } + + if (nvme->n_progress & NVME_REGS_MAPPED) { + nvme_shutdown(nvme, NVME_CC_SHN_NORMAL, B_FALSE); + (void) nvme_reset(nvme, B_FALSE); + } + + if (nvme->n_cmd_taskq) + ddi_taskq_destroy(nvme->n_cmd_taskq); + + if (nvme->n_progress & NVME_CTRL_LIMITS) + sema_destroy(&nvme->n_abort_sema); + + if (nvme->n_progress & NVME_ADMIN_QUEUE) + nvme_free_qpair(nvme->n_adminq); + + if (nvme->n_idctl) + kmem_free(nvme->n_idctl, sizeof (nvme_identify_ctrl_t)); + + if (nvme->n_progress & NVME_REGS_MAPPED) + ddi_regs_map_free(&nvme->n_regh); + + if (nvme->n_progress & NVME_FMA_INIT) { + if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) + ddi_fm_handler_unregister(nvme->n_dip); + + if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) || + DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) + pci_ereport_teardown(nvme->n_dip); + + ddi_fm_fini(nvme->n_dip); + } + + ddi_soft_state_free(nvme_state, instance); + + return (DDI_SUCCESS); +} + +static int +nvme_quiesce(dev_info_t *dip) +{ + int instance; + nvme_t *nvme; + + instance = ddi_get_instance(dip); + + nvme = ddi_get_soft_state(nvme_state, instance); + + if (nvme == NULL) + return (DDI_FAILURE); + + nvme_shutdown(nvme, NVME_CC_SHN_ABRUPT, B_TRUE); + + (void) nvme_reset(nvme, B_TRUE); + + return (DDI_FAILURE); +} + +static int +nvme_fill_prp(nvme_cmd_t *cmd, bd_xfer_t *xfer) +{ + nvme_t *nvme = cmd->nc_nvme; + int nprp_page, nprp; + uint64_t *prp; + + if (xfer->x_ndmac == 0) + return (DDI_FAILURE); + + cmd->nc_sqe.sqe_dptr.d_prp[0] = xfer->x_dmac.dmac_laddress; + ddi_dma_nextcookie(xfer->x_dmah, &xfer->x_dmac); + + if (xfer->x_ndmac == 1) { + cmd->nc_sqe.sqe_dptr.d_prp[1] = 0; + return (DDI_SUCCESS); + } else if (xfer->x_ndmac == 2) { + cmd->nc_sqe.sqe_dptr.d_prp[1] = 
xfer->x_dmac.dmac_laddress; + return (DDI_SUCCESS); + } + + xfer->x_ndmac--; + + nprp_page = nvme->n_pagesize / sizeof (uint64_t) - 1; + ASSERT(nprp_page > 0); + nprp = (xfer->x_ndmac + nprp_page - 1) / nprp_page; + + /* + * We currently don't support chained PRPs and set up our DMA + * attributes to reflect that. If we still get an I/O request + * that needs a chained PRP something is very wrong. + */ + VERIFY(nprp == 1); + + if (nvme_zalloc_dma(nvme, nvme->n_pagesize * nprp, DDI_DMA_READ, + &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, "!%s: nvme_zalloc_dma failed", + __func__); + return (DDI_FAILURE); + } + + cmd->nc_sqe.sqe_dptr.d_prp[1] = cmd->nc_dma->nd_cookie.dmac_laddress; + ddi_dma_nextcookie(cmd->nc_dma->nd_dmah, &cmd->nc_dma->nd_cookie); + + /*LINTED: E_PTR_BAD_CAST_ALIGN*/ + for (prp = (uint64_t *)cmd->nc_dma->nd_memp; + xfer->x_ndmac > 0; + prp++, xfer->x_ndmac--) { + *prp = xfer->x_dmac.dmac_laddress; + ddi_dma_nextcookie(xfer->x_dmah, &xfer->x_dmac); + } + + (void) ddi_dma_sync(cmd->nc_dma->nd_dmah, 0, cmd->nc_dma->nd_len, + DDI_DMA_SYNC_FORDEV); + return (DDI_SUCCESS); +} + +static nvme_cmd_t * +nvme_create_nvm_cmd(nvme_namespace_t *ns, uint8_t opc, bd_xfer_t *xfer) +{ + nvme_t *nvme = ns->ns_nvme; + nvme_cmd_t *cmd; + + /* + * Blkdev only sets BD_XFER_POLL when dumping, so don't sleep. + */ + cmd = nvme_alloc_cmd(nvme, (xfer->x_flags & BD_XFER_POLL) ? + KM_NOSLEEP : KM_SLEEP); + + if (cmd == NULL) + return (NULL); + + cmd->nc_sqe.sqe_opc = opc; + cmd->nc_callback = nvme_bd_xfer_done; + cmd->nc_xfer = xfer; + + switch (opc) { + case NVME_OPC_NVM_WRITE: + case NVME_OPC_NVM_READ: + VERIFY(xfer->x_nblks <= 0x10000); + + cmd->nc_sqe.sqe_nsid = ns->ns_id; + + cmd->nc_sqe.sqe_cdw10 = xfer->x_blkno & 0xffffffffu; + cmd->nc_sqe.sqe_cdw11 = (xfer->x_blkno >> 32); + cmd->nc_sqe.sqe_cdw12 = (uint16_t)(xfer->x_nblks - 1); + + if (nvme_fill_prp(cmd, xfer) != DDI_SUCCESS) + goto fail; + break; + + case NVME_OPC_NVM_FLUSH: + cmd->nc_sqe.sqe_nsid = ns->ns_id; + break; + + default: + goto fail; + } + + return (cmd); + +fail: + nvme_free_cmd(cmd); + return (NULL); +} + +static void +nvme_bd_xfer_done(void *arg) +{ + nvme_cmd_t *cmd = arg; + bd_xfer_t *xfer = cmd->nc_xfer; + int error = 0; + + error = nvme_check_cmd_status(cmd); + nvme_free_cmd(cmd); + + bd_xfer_done(xfer, error); +} + +static void +nvme_bd_driveinfo(void *arg, bd_drive_t *drive) +{ + nvme_namespace_t *ns = arg; + nvme_t *nvme = ns->ns_nvme; + + /* + * blkdev maintains one queue size per instance (namespace), + * but all namespace share the I/O queues. + * TODO: need to figure out a sane default, or use per-NS I/O queues, + * or change blkdev to handle EAGAIN + */ + drive->d_qsize = nvme->n_ioq_count * nvme->n_io_queue_len + / nvme->n_namespace_count; + + /* + * d_maxxfer is not set, which means the value is taken from the DMA + * attributes specified to bd_alloc_handle. 
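+ * In this case that is n_prp_dma_attr, whose maxxfer was limited in
+ * nvme_init() to what a single PRP list can map.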
+ */
+
+ drive->d_removable = B_FALSE;
+ drive->d_hotpluggable = B_FALSE;
+
+ drive->d_target = ns->ns_id;
+ drive->d_lun = 0;
+}
+
+static int
+nvme_bd_mediainfo(void *arg, bd_media_t *media)
+{
+ nvme_namespace_t *ns = arg;
+
+ media->m_nblks = ns->ns_block_count;
+ media->m_blksize = ns->ns_block_size;
+ media->m_readonly = B_FALSE;
+ media->m_solidstate = B_TRUE;
+
+ media->m_pblksize = ns->ns_best_block_size;
+
+ return (0);
+}
+
+static int
+nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc)
+{
+ nvme_t *nvme = ns->ns_nvme;
+ nvme_cmd_t *cmd;
+
+ if (nvme->n_dead)
+ return (EIO);
+
+ /* No polling for now */
+ if (xfer->x_flags & BD_XFER_POLL)
+ return (EIO);
+
+ cmd = nvme_create_nvm_cmd(ns, opc, xfer);
+ if (cmd == NULL)
+ return (ENOMEM);
+
+ cmd->nc_sqid = (CPU->cpu_id % nvme->n_ioq_count) + 1;
+ ASSERT(cmd->nc_sqid <= nvme->n_ioq_count);
+
+ if (nvme_submit_cmd(nvme->n_ioq[cmd->nc_sqid], cmd)
+ != DDI_SUCCESS)
+ return (EAGAIN);
+
+ return (0);
+}
+
+static int
+nvme_bd_read(void *arg, bd_xfer_t *xfer)
+{
+ nvme_namespace_t *ns = arg;
+
+ return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_READ));
+}
+
+static int
+nvme_bd_write(void *arg, bd_xfer_t *xfer)
+{
+ nvme_namespace_t *ns = arg;
+
+ return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_WRITE));
+}
+
+static int
+nvme_bd_sync(void *arg, bd_xfer_t *xfer)
+{
+ nvme_namespace_t *ns = arg;
+
+ if (ns->ns_nvme->n_dead)
+ return (EIO);
+
+ /*
+ * If the volatile write cache isn't enabled the FLUSH command is a
+ * no-op, so we can take a shortcut here.
+ */
+ if (ns->ns_nvme->n_volatile_write_cache_enabled == B_FALSE) {
+ bd_xfer_done(xfer, ENOTSUP);
+ return (0);
+ }
+
+ return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_FLUSH));
+}
+
+static int
+nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid)
+{
+ nvme_namespace_t *ns = arg;
+
+ return (ddi_devid_init(devinfo, DEVID_ENCAP, strlen(ns->ns_devid),
+ ns->ns_devid, devid));
+}
diff --git a/usr/src/uts/common/io/nvme/nvme.conf b/usr/src/uts/common/io/nvme/nvme.conf
new file mode 100644
index 0000000000..186bd38018
--- /dev/null
+++ b/usr/src/uts/common/io/nvme/nvme.conf
@@ -0,0 +1,40 @@
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+#
+# Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+#
+
+#
+# The driver was tested only against devices supporting v1.0 of the
+# NVMe specification. Uncomment this to be able to use devices conforming
+# to newer specifications.
+#strict-version=0;
+
+#
+# The driver currently does not support any vendor specific extension to the
+# specification. By default it will fault the device if it receives a vendor-
+# specific command status. Uncomment this to disable this behaviour.
+#ignore-unknown-vendor-status=1;
+
+#
+# The maximum length of the admin queue can be overridden here (16-4096).
+#admin-queue-len=256;
+
+#
+# The maximum length of the individual I/O queues can be overridden here
+# (16-65536).
+#io-queue-len=1024;
+
+#
+# The maximum number of outstanding asynchronous event requests can be
+# overridden here.
+#async-event-limit=10;
+
+
diff --git a/usr/src/uts/common/io/nvme/nvme_reg.h b/usr/src/uts/common/io/nvme/nvme_reg.h
new file mode 100644
index 0000000000..8fb44a3730
--- /dev/null
+++ b/usr/src/uts/common/io/nvme/nvme_reg.h
@@ -0,0 +1,692 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ */
+
+/*
+ * NVMe hardware interface
+ */
+
+#ifndef _NVME_REG_H
+#define _NVME_REG_H
+
+#pragma pack(1)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*
+ * NVMe constants
+ */
+#define NVME_MAX_ADMIN_QUEUE_LEN 4096
+
+/*
+ * NVMe registers and register fields
+ */
+#define NVME_REG_CAP 0x0 /* Controller Capabilities */
+#define NVME_REG_VS 0x8 /* Version */
+#define NVME_REG_INTMS 0xc /* Interrupt Mask Set */
+#define NVME_REG_INTMC 0x10 /* Interrupt Mask Clear */
+#define NVME_REG_CC 0x14 /* Controller Configuration */
+#define NVME_REG_CSTS 0x1c /* Controller Status */
+#define NVME_REG_NSSR 0x20 /* NVM Subsystem Reset */
+#define NVME_REG_AQA 0x24 /* Admin Queue Attributes */
+#define NVME_REG_ASQ 0x28 /* Admin Submission Queue */
+#define NVME_REG_ACQ 0x30 /* Admin Completion Queue */
+#define NVME_REG_SQTDBL(nvme, n) \
+ (0x1000 + ((2 * (n)) * nvme->n_doorbell_stride))
+#define NVME_REG_CQHDBL(nvme, n) \
+ (0x1000 + ((2 * (n) + 1) * nvme->n_doorbell_stride))
+
+#define NVME_CAP_CSS_NVM 1 /* NVM Command Set */
+#define NVME_CAP_AMS_WRR 1 /* Weighted Round-Robin */
+
+/* CAP -- Controller Capabilities */
+typedef union {
+ struct {
+ uint16_t cap_mqes; /* Maximum Queue Entries Supported */
+ uint8_t cap_cqr:1; /* Contiguous Queues Required */
+ uint8_t cap_ams:2; /* Arbitration Mechanisms Supported */
+ uint8_t cap_rsvd1:5;
+ uint8_t cap_to; /* Timeout */
+ uint16_t cap_dstrd:4; /* Doorbell Stride */
+ uint16_t cap_nssrs:1; /* NVM Subsystem Reset Supported */
+ uint16_t cap_css:8; /* Command Sets Supported */
+ uint16_t cap_rsvd2:3;
+ uint8_t cap_mpsmin:4; /* Memory Page Size Minimum */
+ uint8_t cap_mpsmax:4; /* Memory Page Size Maximum */
+ uint8_t cap_rsvd3;
+ } b;
+ uint64_t r;
+} nvme_reg_cap_t;
+
+/* VS -- Version */
+typedef union {
+ struct {
+ uint8_t vs_rsvd;
+ uint8_t vs_mnr; /* Minor Version Number */
+ uint16_t vs_mjr; /* Major Version Number */
+ } b;
+ uint32_t r;
+} nvme_reg_vs_t;
+
+/* CC -- Controller Configuration */
+#define NVME_CC_SHN_NORMAL 1 /* Normal Shutdown Notification */
+#define NVME_CC_SHN_ABRUPT 2 /* Abrupt Shutdown Notification */
+
+typedef union {
+ struct {
+ uint16_t cc_en:1; /* Enable */
+ uint16_t cc_rsvd1:3;
+ uint16_t cc_css:3; /* I/O Command Set Selected */
+ uint16_t cc_mps:4; /* Memory Page Size */
+ uint16_t cc_ams:3; /* Arbitration Mechanism Selected */
+ uint16_t cc_shn:2; /* Shutdown Notification */
+ uint8_t cc_iosqes:4; /* I/O Submission Queue Entry Size */
+ uint8_t cc_iocqes:4; /* I/O Completion Queue Entry Size */
+ uint8_t cc_rsvd2;
+ } b;
+ uint32_t r;
+} nvme_reg_cc_t;
+
+/* CSTS -- Controller Status */
+#define NVME_CSTS_SHN_OCCURING 1 /* Shutdown Processing Occurring */
+#define NVME_CSTS_SHN_COMPLETE 2 /* Shutdown Processing Complete */
+
+typedef union {
+ struct {
+ uint32_t csts_rdy:1; /* Ready */
+ uint32_t csts_cfs:1; /* Controller Fatal Status */
+ uint32_t csts_shst:2; /* Shutdown Status */
+ uint32_t csts_nssro:1; /* NVM Subsystem Reset Occurred */
+ uint32_t csts_rsvd:28;
+ } b;
+ uint32_t r;
+} nvme_reg_csts_t;
+
+/* NSSR -- NVM Subsystem Reset */
+#define NVME_NSSR_NSSRC 0x4e564d65 /* NSSR magic value */
+typedef uint32_t nvme_reg_nssr_t;
+
+/* AQA -- Admin Queue Attributes */
+typedef union {
+ struct {
+ uint16_t aqa_asqs:12; /* Admin Submission Queue Size */
+ uint16_t aqa_rsvd1:4;
+ uint16_t aqa_acqs:12; /* Admin Completion Queue Size */
+ uint16_t aqa_rsvd2:4;
+ } b;
+ uint32_t r;
+} nvme_reg_aqa_t;
+
+/*
+ * The spec specifies the lower 12 bits of ASQ and ACQ as reserved, which is
+ * probably a specification bug. The full 64bit regs are used as base address,
+ * and the lower bits must be zero to ensure alignment on the page size
+ * specified in CC.MPS.
+ */
+/* ASQ -- Admin Submission Queue Base Address */
+typedef uint64_t nvme_reg_asq_t; /* Admin Submission Queue Base */
+
+/* ACQ -- Admin Completion Queue Base Address */
+typedef uint64_t nvme_reg_acq_t; /* Admin Completion Queue Base */
+
+/* SQyTDBL -- Submission Queue y Tail Doorbell */
+typedef union {
+ struct {
+ uint16_t sqtdbl_sqt; /* Submission Queue Tail */
+ uint16_t sqtdbl_rsvd;
+ } b;
+ uint32_t r;
+} nvme_reg_sqtdbl_t;
+
+/* CQyHDBL -- Completion Queue y Head Doorbell */
+typedef union {
+ struct {
+ uint16_t cqhdbl_cqh; /* Completion Queue Head */
+ uint16_t cqhdbl_rsvd;
+ } b;
+ uint32_t r;
+} nvme_reg_cqhdbl_t;
+
+/*
+ * NVMe submission queue entries
+ */
+
+/* NVMe scatter/gather list descriptor */
+typedef struct {
+ uint64_t sgl_addr; /* Address */
+ uint32_t sgl_len; /* Length */
+ uint8_t sgl_rsvd[3];
+ uint8_t sgl_zero:4;
+ uint8_t sgl_type:4; /* SGL descriptor type */
+} nvme_sgl_t;
+
+/* NVMe SGL descriptor type */
+#define NVME_SGL_DATA_BLOCK 0
+#define NVME_SGL_BIT_BUCKET 1
+#define NVME_SGL_SEGMENT 2
+#define NVME_SGL_LAST_SEGMENT 3
+#define NVME_SGL_VENDOR 0xf
+
+/* NVMe submission queue entry */
+typedef struct {
+ uint8_t sqe_opc; /* Opcode */
+ uint8_t sqe_fuse:2; /* Fused Operation */
+ uint8_t sqe_rsvd:5;
+ uint8_t sqe_psdt:1; /* PRP or SGL for Data Transfer */
+ uint16_t sqe_cid; /* Command Identifier */
+ uint32_t sqe_nsid; /* Namespace Identifier */
+ uint64_t sqe_rsvd1;
+ union {
+ uint64_t m_ptr; /* Metadata Pointer */
+ uint64_t m_sglp; /* Metadata SGL Segment Pointer */
+ } sqe_m;
+ union {
+ uint64_t d_prp[2]; /* Physical Page Region Entries 1 & 2 */
+ nvme_sgl_t d_sgl; /* SGL Entry 1 */
+ } sqe_dptr; /* Data Pointer */
+ uint32_t sqe_cdw10; /* Number of Dwords in Data Transfer */
+ uint32_t sqe_cdw11; /* Number of Dwords in Metadata Xfer */
+ uint32_t sqe_cdw12;
+ uint32_t sqe_cdw13;
+ uint32_t sqe_cdw14;
+ uint32_t sqe_cdw15;
+} nvme_sqe_t;
+
+/* NVMe admin command opcodes */
+#define NVME_OPC_DELETE_SQUEUE 0x0
+#define NVME_OPC_CREATE_SQUEUE 0x1
+#define NVME_OPC_GET_LOG_PAGE 0x2
+#define NVME_OPC_DELETE_CQUEUE 0x4
+#define NVME_OPC_CREATE_CQUEUE 0x5
+#define NVME_OPC_IDENTIFY 0x6
+#define NVME_OPC_ABORT 0x8
+#define NVME_OPC_SET_FEATURES 0x9
+#define NVME_OPC_GET_FEATURES 0xa
+#define NVME_OPC_ASYNC_EVENT 0xc
+#define NVME_OPC_FW_ACTIVATE 0x10
+#define NVME_OPC_FW_IMAGE_LOAD 0x11
+
+/* NVMe NVM command set specific admin command opcodes */
+#define NVME_OPC_NVM_FORMAT 0x80
+#define NVME_OPC_NVM_SEC_SEND 0x81
+#define NVME_OPC_NVM_SEC_RECV 0x82
+
+/* NVMe NVM command opcodes */
+#define NVME_OPC_NVM_FLUSH 0x0
+#define NVME_OPC_NVM_WRITE 0x1
+#define NVME_OPC_NVM_READ 0x2
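+
+/*
+ * Editorial illustration, not part of the original patch: nvme_create_nvm_cmd()
+ * above encodes a read or write with one of these opcodes, putting the low and
+ * high 32 bits of the starting LBA into sqe_cdw10/sqe_cdw11 and the zero-based
+ * block count into sqe_cdw12. Hypothetical example: an 8-block read at LBA
+ * 0x12345 yields sqe_opc = 0x2, cdw10 = 0x12345, cdw11 = 0, cdw12 = 7.
+ */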
+#define NVME_OPC_NVM_WRITE_UNC 0x4 +#define NVME_OPC_NVM_COMPARE 0x5 +#define NVME_OPC_NVM_WRITE_ZERO 0x8 +#define NVME_OPC_NVM_DSET_MGMT 0x9 +#define NVME_OPC_NVM_RESV_REG 0xd +#define NVME_OPC_NVM_RESV_REPRT 0xe +#define NVME_OPC_NVM_RESV_ACQ 0x11 +#define NVME_OPC_NVM_RESV_REL 0x12 + +/* + * NVMe completion queue entry + */ +typedef struct { + uint16_t sf_p:1; /* Phase Tag */ + uint16_t sf_sc:8; /* Status Code */ + uint16_t sf_sct:3; /* Status Code Type */ + uint16_t sf_rsvd2:2; + uint16_t sf_m:1; /* More */ + uint16_t sf_dnr:1; /* Do Not Retry */ +} nvme_cqe_sf_t; + +typedef struct { + uint32_t cqe_dw0; /* Command Specific */ + uint32_t cqe_rsvd1; + uint16_t cqe_sqhd; /* SQ Head Pointer */ + uint16_t cqe_sqid; /* SQ Identifier */ + uint16_t cqe_cid; /* Command Identifier */ + nvme_cqe_sf_t cqe_sf; /* Status Field */ +} nvme_cqe_t; + +/* NVMe completion status code type */ +#define NVME_CQE_SCT_GENERIC 0 /* Generic Command Status */ +#define NVME_CQE_SCT_SPECIFIC 1 /* Command Specific Status */ +#define NVME_CQE_SCT_INTEGRITY 2 /* Media and Data Integrity Errors */ +#define NVME_CQE_SCT_VENDOR 7 /* Vendor Specific */ + +/* NVMe completion status code (generic) */ +#define NVME_CQE_SC_GEN_SUCCESS 0x0 /* Successful Completion */ +#define NVME_CQE_SC_GEN_INV_OPC 0x1 /* Invalid Command Opcode */ +#define NVME_CQE_SC_GEN_INV_FLD 0x2 /* Invalid Field in Command */ +#define NVME_CQE_SC_GEN_ID_CNFL 0x3 /* Command ID Conflict */ +#define NVME_CQE_SC_GEN_DATA_XFR_ERR 0x4 /* Data Transfer Error */ +#define NVME_CQE_SC_GEN_ABORT_PWRLOSS 0x5 /* Cmds Aborted / Pwr Loss */ +#define NVME_CQE_SC_GEN_INTERNAL_ERR 0x6 /* Internal Error */ +#define NVME_CQE_SC_GEN_ABORT_REQUEST 0x7 /* Command Abort Requested */ +#define NVME_CQE_SC_GEN_ABORT_SQ_DEL 0x8 /* Cmd Aborted / SQ deletion */ +#define NVME_CQE_SC_GEN_ABORT_FUSE_FAIL 0x9 /* Cmd Aborted / Failed Fused */ +#define NVME_CQE_SC_GEN_ABORT_FUSE_MISS 0xa /* Cmd Aborted / Missing Fusd */ +#define NVME_CQE_SC_GEN_INV_NS 0xb /* Inval Namespace or Format */ +#define NVME_CQE_SC_GEN_CMD_SEQ_ERR 0xc /* Command Sequence Error */ +#define NVME_CQE_SC_GEN_INV_SGL_LAST 0xd /* Inval SGL Last Seg Desc */ +#define NVME_CQE_SC_GEN_INV_SGL_NUM 0xe /* Inval Number of SGL Desc */ +#define NVME_CQE_SC_GEN_INV_DSGL_LEN 0xf /* Data SGL Length Invalid */ +#define NVME_CQE_SC_GEN_INV_MSGL_LEN 0x10 /* Metadata SGL Length Inval */ +#define NVME_CQE_SC_GEN_INV_SGL_DESC 0x11 /* SGL Descriptor Type Inval */ + +/* NVMe completion status code (generic NVM commands) */ +#define NVME_CQE_SC_GEN_NVM_LBA_RANGE 0x80 /* LBA Out Of Range */ +#define NVME_CQE_SC_GEN_NVM_CAP_EXC 0x81 /* Capacity Exceeded */ +#define NVME_CQE_SC_GEN_NVM_NS_NOTRDY 0x82 /* Namespace Not Ready */ +#define NVME_CQE_SC_GEN_NVM_RSV_CNFLCT 0x83 /* Reservation Conflict */ + +/* NVMe completion status code (command specific) */ +#define NVME_CQE_SC_SPC_INV_CQ 0x0 /* Completion Queue Invalid */ +#define NVME_CQE_SC_SPC_INV_QID 0x1 /* Invalid Queue Identifier */ +#define NVME_CQE_SC_SPC_MAX_QSZ_EXC 0x2 /* Max Queue Size Exceeded */ +#define NVME_CQE_SC_SPC_ABRT_CMD_EXC 0x3 /* Abort Cmd Limit Exceeded */ +#define NVME_CQE_SC_SPC_ASYNC_EVREQ_EXC 0x5 /* Async Event Request Limit */ +#define NVME_CQE_SC_SPC_INV_FW_SLOT 0x6 /* Invalid Firmware Slot */ +#define NVME_CQE_SC_SPC_INV_FW_IMG 0x7 /* Invalid Firmware Image */ +#define NVME_CQE_SC_SPC_INV_INT_VECT 0x8 /* Invalid Interrupt Vector */ +#define NVME_CQE_SC_SPC_INV_LOG_PAGE 0x9 /* Invalid Log Page */ +#define NVME_CQE_SC_SPC_INV_FORMAT 0xa /* Invalid Format */ +#define 
NVME_CQE_SC_SPC_FW_RESET 0xb /* FW Application Reset Reqd */
+#define NVME_CQE_SC_SPC_INV_Q_DEL 0xc /* Invalid Queue Deletion */
+#define NVME_CQE_SC_SPC_FEAT_SAVE 0xd /* Feature Id Not Saveable */
+#define NVME_CQE_SC_SPC_FEAT_CHG 0xe /* Feature Not Changeable */
+#define NVME_CQE_SC_SPC_FEAT_NS_SPEC 0xf /* Feature Not Namespace Spec */
+#define NVME_CQE_SC_SPC_FW_NSSR 0x10 /* FW Application NSSR Reqd */
+
+/* NVMe completion status code (NVM command specific) */
+#define NVME_CQE_SC_SPC_NVM_CNFL_ATTR 0x80 /* Conflicting Attributes */
+#define NVME_CQE_SC_SPC_NVM_INV_PROT 0x81 /* Invalid Protection */
+#define NVME_CQE_SC_SPC_NVM_READONLY 0x82 /* Write to Read Only Range */
+
+/* NVMe completion status code (data / metadata integrity) */
+#define NVME_CQE_SC_INT_NVM_WRITE 0x80 /* Write Fault */
+#define NVME_CQE_SC_INT_NVM_READ 0x81 /* Unrecovered Read Error */
+#define NVME_CQE_SC_INT_NVM_GUARD 0x82 /* Guard Check Error */
+#define NVME_CQE_SC_INT_NVM_APPL_TAG 0x83 /* Application Tag Check Err */
+#define NVME_CQE_SC_INT_NVM_REF_TAG 0x84 /* Reference Tag Check Err */
+#define NVME_CQE_SC_INT_NVM_COMPARE 0x85 /* Compare Failure */
+#define NVME_CQE_SC_INT_NVM_ACCESS 0x86 /* Access Denied */
+
+/*
+ * NVMe Asynchronous Event Request
+ */
+#define NVME_ASYNC_TYPE_ERROR 0x0 /* Error Status */
+#define NVME_ASYNC_TYPE_HEALTH 0x1 /* SMART/Health Status */
+#define NVME_ASYNC_TYPE_VENDOR 0x7 /* vendor specific */
+
+#define NVME_ASYNC_ERROR_INV_SQ 0x0 /* Invalid Submission Queue */
+#define NVME_ASYNC_ERROR_INV_DBL 0x1 /* Invalid Doorbell Write */
+#define NVME_ASYNC_ERROR_DIAGFAIL 0x2 /* Diagnostic Failure */
+#define NVME_ASYNC_ERROR_PERSISTENT 0x3 /* Persistent Internal Error */
+#define NVME_ASYNC_ERROR_TRANSIENT 0x4 /* Transient Internal Error */
+#define NVME_ASYNC_ERROR_FW_LOAD 0x5 /* Firmware Image Load Error */
+
+#define NVME_ASYNC_HEALTH_RELIABILITY 0x0 /* Device Reliability */
+#define NVME_ASYNC_HEALTH_TEMPERATURE 0x1 /* Temp.
Above Threshold */ +#define NVME_ASYNC_HEALTH_SPARE 0x2 /* Spare Below Threshold */ + +typedef union { + struct { + uint8_t ae_type:3; /* Asynchronous Event Type */ + uint8_t ae_rsvd1:5; + uint8_t ae_info; /* Asynchronous Event Info */ + uint8_t ae_logpage; /* Associated Log Page */ + uint8_t ae_rsvd2; + } b; + uint32_t r; +} nvme_async_event_t; + +/* + * NVMe Create Completion/Submission Queue + */ +typedef union { + struct { + uint16_t q_qid; /* Queue Identifier */ + uint16_t q_qsize; /* Queue Size */ + } b; + uint32_t r; +} nvme_create_queue_dw10_t; + +typedef union { + struct { + uint16_t cq_pc:1; /* Physically Contiguous */ + uint16_t cq_ien:1; /* Interrupts Enabled */ + uint16_t cq_rsvd:14; + uint16_t cq_iv; /* Interrupt Vector */ + } b; + uint32_t r; +} nvme_create_cq_dw11_t; + +typedef union { + struct { + uint16_t sq_pc:1; /* Physically Contiguous */ + uint16_t sq_qprio:2; /* Queue Priority */ + uint16_t sq_rsvd:13; + uint16_t sq_cqid; /* Completion Queue ID */ + } b; + uint32_t r; +} nvme_create_sq_dw11_t; + +/* + * NVMe Identify + */ + +/* NVMe Identify parameters (cdw10) */ +#define NVME_IDENTIFY_NSID 0x0 /* Identify Namespace */ +#define NVME_IDENTIFY_CTRL 0x1 /* Identify Controller */ +#define NVME_IDENTIFY_LIST 0x2 /* Identify List Namespaces */ + +#define NVME_IDENTIFY_BUFSIZE 4096 /* buffer size for Identify */ + +/* NVMe Queue Entry Size bitfield */ +typedef struct { + uint8_t qes_min:4; /* minimum entry size */ + uint8_t qes_max:4; /* maximum entry size */ +} nvme_idctl_qes_t; + +/* NVMe Power State Descriptor */ +typedef struct { + uint16_t psd_mp; /* Maximum Power */ + uint16_t psd_rsvd1; + uint32_t psd_enlat; /* Entry Latency */ + uint32_t psd_exlat; /* Exit Latency */ + uint8_t psd_rrt:5; /* Relative Read Throughput */ + uint8_t psd_rsvd2:3; + uint8_t psd_rrl:5; /* Relative Read Latency */ + uint8_t psd_rsvd3:3; + uint8_t psd_rwt:5; /* Relative Write Throughput */ + uint8_t psd_rsvd4:3; + uint8_t psd_rwl:5; /* Relative Write Latency */ + uint8_t psd_rsvd5:3; + uint8_t psd_rsvd6[16]; +} nvme_idctl_psd_t; + +/* NVMe Identify Controller Data Structure */ +typedef struct { + /* Controller Capabilities & Features */ + uint16_t id_vid; /* PCI vendor ID */ + uint16_t id_ssvid; /* PCI subsystem vendor ID */ + char id_serial[20]; /* Serial Number */ + char id_model[40]; /* Model Number */ + char id_fwrev[8]; /* Firmware Revision */ + uint8_t id_rab; /* Recommended Arbitration Burst */ + uint8_t id_oui[3]; /* vendor IEEE OUI */ + struct { /* Multi-Interface Capabilities */ + uint8_t m_multi:1; /* HW has multiple PCIe interfaces */ + uint8_t m_rsvd:7; + } id_mic; + uint8_t id_mdts; /* Maximum Data Transfer Size */ + uint8_t id_rsvd_cc[256 - 78]; + + /* Admin Command Set Attributes */ + struct { /* Optional Admin Command Support */ + uint16_t oa_security:1; /* Security Send & Receive */ + uint16_t oa_format:1; /* Format NVM */ + uint16_t oa_firmare:1; /* Firmware Activate & Download */ + uint16_t oa_rsvd:13; + } id_oacs; + uint8_t id_acl; /* Abort Command Limit */ + uint8_t id_aerl; /* Asynchronous Event Request Limit */ + struct { /* Firmware Updates */ + uint8_t fw_readonly:1; /* Slot 1 is Read-Only */ + uint8_t fw_nslot:3; /* number of firmware slots */ + uint8_t fw_rsvd:4; + } id_frmw; + struct { /* Log Page Attributes */ + uint8_t lp_smart:1; /* SMART/Health information per NS */ + uint8_t lp_rsvd:7; + } id_lpa; + uint8_t id_elpe; /* Error Log Page Entries */ + uint8_t id_npss; /* Number of Power States */ + struct { /* Admin Vendor Specific Command Conf */ + uint8_t 
av_spec:1; /* use format from spec */ + uint8_t av_rsvd:7; + } id_avscc; + uint8_t id_rsvd_ac[256 - 9]; + + /* NVM Command Set Attributes */ + nvme_idctl_qes_t id_sqes; /* Submission Queue Entry Size */ + nvme_idctl_qes_t id_cqes; /* Completion Queue Entry Size */ + uint16_t id_rsvd_nc_1; + uint32_t id_nn; /* Number of Namespaces */ + struct { /* Optional NVM Command Support */ + uint16_t on_compare:1; /* Compare */ + uint16_t on_wr_unc:1; /* Write Uncorrectable */ + uint16_t on_dset_mgmt:1; /* Dataset Management */ + uint16_t on_rsvd:13; + } id_oncs; + struct { /* Fused Operation Support */ + uint16_t f_cmp_wr:1; /* Compare and Write */ + uint16_t f_rsvd:15; + } id_fuses; + struct { /* Format NVM Attributes */ + uint8_t fn_format:1; /* Format applies to all NS */ + uint8_t fn_sec_erase:1; /* Secure Erase applies to all NS */ + uint8_t fn_crypt_erase:1; /* Cryptographic Erase supported */ + uint8_t fn_rsvd:5; + } id_fna; + struct { /* Volatile Write Cache */ + uint8_t vwc_present:1; /* Volatile Write Cache present */ + uint8_t rsvd:7; + } id_vwc; + uint16_t id_awun; /* Atomic Write Unit Normal */ + uint16_t id_awupf; /* Atomic Write Unit Power Fail */ + struct { /* NVM Vendor Specific Command Conf */ + uint8_t nv_spec:1; /* use format from spec */ + uint8_t nv_rsvd:7; + } id_nvscc; + uint8_t id_rsvd_nc_2[192 - 19]; + + /* I/O Command Set Attributes */ + uint8_t id_rsvd_ioc[1344]; + + /* Power State Descriptors */ + nvme_idctl_psd_t id_psd[32]; + + /* Vendor Specific */ + uint8_t id_vs[1024]; +} nvme_identify_ctrl_t; + +/* NVMe Identify Namespace LBA Format */ +typedef struct { + uint16_t lbaf_ms; /* Metadata Size */ + uint8_t lbaf_lbads; /* LBA Data Size */ + uint8_t lbaf_rp:2; /* Relative Performance */ + uint8_t lbaf_rsvd1:6; +} nvme_idns_lbaf_t; + +/* NVMe Identify Namespace Data Structure */ +typedef struct { + uint64_t id_nsize; /* Namespace Size */ + uint64_t id_ncap; /* Namespace Capacity */ + uint64_t id_nuse; /* Namespace Utilization */ + struct { /* Namespace Features */ + uint8_t f_thin:1; /* Thin Provisioning */ + uint8_t f_rsvd:7; + } id_nsfeat; + uint8_t id_nlbaf; /* Number of LBA formats */ + struct { /* Formatted LBA size */ + uint8_t lba_format:4; /* LBA format */ + uint8_t lba_extlba:1; /* extended LBA (includes metadata) */ + uint8_t lba_rsvd:3; + } id_flbas; + struct { /* Metadata Capabilities */ + uint8_t mc_extlba:1; /* extended LBA transfers */ + uint8_t mc_separate:1; /* separate metadata transfers */ + uint8_t mc_rsvd:6; + } id_mc; + struct { /* Data Protection Capabilities */ + uint8_t dp_type1:1; /* Protection Information Type 1 */ + uint8_t dp_type2:1; /* Protection Information Type 2 */ + uint8_t dp_type3:1; /* Protection Information Type 3 */ + uint8_t dp_first:1; /* first 8 bytes of metadata */ + uint8_t dp_last:1; /* last 8 bytes of metadata */ + } id_dpc; + struct { /* Data Protection Settings */ + uint8_t dp_pinfo:3; /* Protection Information enabled */ + uint8_t dp_first:1; /* first 8 bytes of metadata */ + } id_dps; + uint8_t id_rsvd1[128 - 30]; + nvme_idns_lbaf_t id_lbaf[16]; /* LBA Formats */ + + uint8_t id_rsvd2[192]; + + uint8_t id_vs[3712]; /* Vendor Specific */ +} nvme_identify_nsid_t; + + +/* + * NVMe Abort Command + */ +typedef union { + struct { + uint16_t ac_sqid; /* Submission Queue ID */ + uint16_t ac_cid; /* Command ID */ + } b; + uint32_t r; +} nvme_abort_cmd_t; + + +/* + * NVMe Get / Set Features + */ +#define NVME_FEAT_ARBITRATION 0x1 /* Command Arbitration */ +#define NVME_FEAT_POWER_MGMT 0x2 /* Power Management */ +#define 
NVME_FEAT_LBA_RANGE 0x3 /* LBA Range Type */ +#define NVME_FEAT_TEMPERATURE 0x4 /* Temperature Threshold */ +#define NVME_FEAT_ERROR 0x5 /* Error Recovery */ +#define NVME_FEAT_WRITE_CACHE 0x6 /* Volatile Write Cache */ +#define NVME_FEAT_NQUEUES 0x7 /* Number of Queues */ +#define NVME_FEAT_INTR_COAL 0x8 /* Interrupt Coalescing */ +#define NVME_FEAT_INTR_VECT 0x9 /* Interrupt Vector Configuration */ +#define NVME_FEAT_WRITE_ATOM 0xa /* Write Atomicity */ +#define NVME_FEAT_ASYNC_EVENT 0xb /* Asynchronous Event Configuration */ + +#define NVME_FEAT_PROGRESS 0x80 /* Software Progress Marker */ + +/* Arbitration Feature */ +typedef struct { + uint8_t arb_ab:3; /* Arbitration Burst */ + uint8_t arb_rsvd:5; + uint8_t arb_lpw; /* Low Priority Weight */ + uint8_t arb_mpw; /* Medium Priority Weight */ + uint8_t arb_hpw; /* High Priority Weight */ +} nvme_arbitration_dw11_t; + +/* LBA Range Type Feature */ +typedef struct { + uint32_t lr_num:6; /* Number of LBA ranges */ + uint32_t lr_rsvd:26; +} nvme_lba_range_type_dw11_t; + +typedef struct { + uint8_t lr_type; /* Type */ + struct { /* Attributes */ + uint8_t lr_write:1; /* may be overwritten */ + uint8_t lr_hidden:1; /* hidden from OS/EFI/BIOS */ + uint8_t lr_rsvd1:6; + } lr_attr; + uint8_t lr_rsvd2[14]; + uint64_t lr_slba; /* Starting LBA */ + uint64_t lr_nlb; /* Number of Logical Blocks */ + uint8_t lr_guid[16]; /* Unique Identifier */ + uint8_t lr_rsvd3[16]; +} nvme_lba_range_type_t; + +/* Number of Queues */ +typedef union { + struct { + uint16_t nq_nsq; /* Number of Submission Queues */ + uint16_t nq_ncq; /* Number of Completion Queues */ + } b; + uint32_t r; +} nvme_nqueue_t; + + +/* + * NVMe Get Log Page + */ +#define NVME_LOGPAGE_ERROR 0x1 /* Error Information */ +#define NVME_LOGPAGE_HEALTH 0x2 /* SMART/Health Information */ +#define NVME_LOGPAGE_FWSLOT 0x3 /* Firmware Slot Information */ + +typedef union { + struct { + uint8_t lp_lid; /* Log Page Identifier */ + uint8_t lp_rsvd1; + uint16_t lp_numd:12; /* Number of Dwords */ + uint16_t lp_rsvd2:4; + } b; + uint32_t r; +} nvme_getlogpage_t; + +typedef struct { + uint64_t el_count; /* Error Count */ + uint16_t el_sqid; /* Submission Queue ID */ + uint16_t el_cid; /* Command ID */ + nvme_cqe_sf_t el_sf; /* Status Field */ + uint8_t el_byte; /* Parameter Error Location byte */ + uint8_t el_bit:3; /* Parameter Error Location bit */ + uint8_t el_rsvd1:5; + uint64_t el_lba; /* Logical Block Address */ + uint32_t el_nsid; /* Namespace ID */ + uint8_t el_vendor; /* Vendor Specific Information avail */ + uint8_t el_rsvd2[64 - 29]; +} nvme_error_log_entry_t; + +typedef struct { + uint64_t lo; + uint64_t hi; +} nvme_uint128_t; + +typedef struct { + uint8_t hl_crit_warn; /* Critical Warning */ + uint16_t hl_temp; /* Temperature */ + uint8_t hl_avail_spare; /* Available Spare */ + uint8_t hl_avail_spare_thr; /* Available Spare Threshold */ + uint8_t hl_used; /* Percentage Used */ + uint8_t hl_rsvd1[32 - 6]; + nvme_uint128_t hl_data_read; /* Data Units Read */ + nvme_uint128_t hl_data_write; /* Data Units Written */ + nvme_uint128_t hl_host_read; /* Host Read Commands */ + nvme_uint128_t hl_host_write; /* Host Write Commands */ + nvme_uint128_t hl_ctrl_busy; /* Controller Busy Time */ + nvme_uint128_t hl_power_cycles; /* Power Cycles */ + nvme_uint128_t hl_power_on_hours; /* Power On Hours */ + nvme_uint128_t hl_unsafe_shutdn; /* Unsafe Shutdowns */ + nvme_uint128_t hl_media_errors; /* Media Errors */ + nvme_uint128_t hl_errors_logged; /* Number of errors logged */ + uint8_t hl_rsvd2[512 - 192]; +} 
nvme_health_log_t;
+
+typedef struct {
+ uint8_t fw_afi:3; /* Active Firmware Slot */
+ uint8_t fw_rsvd1:5;
+ uint8_t fw_rsvd2[7];
+ char fw_frs[7][8]; /* Firmware Revision / Slot */
+ uint8_t fw_rsvd3[512 - 64];
+} nvme_fwslot_log_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#pragma pack() /* pack(1) */
+
+#endif /* _NVME_REG_H */
diff --git a/usr/src/uts/common/io/nvme/nvme_var.h b/usr/src/uts/common/io/nvme/nvme_var.h
new file mode 100644
index 0000000000..37f446556d
--- /dev/null
+++ b/usr/src/uts/common/io/nvme/nvme_var.h
@@ -0,0 +1,240 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ */
+
+#ifndef _NVME_VAR_H
+#define _NVME_VAR_H
+
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/blkdev.h>
+#include <sys/taskq_impl.h>
+
+/*
+ * NVMe driver state
+ */
+
+#ifdef __cplusplus
+/* extern "C" { */
+#endif
+
+#define NVME_FMA_INIT 0x1
+#define NVME_REGS_MAPPED 0x2
+#define NVME_ADMIN_QUEUE 0x4
+#define NVME_CTRL_LIMITS 0x8
+#define NVME_INTERRUPTS 0x10
+
+#define NVME_MIN_ADMIN_QUEUE_LEN 16
+#define NVME_MIN_IO_QUEUE_LEN 16
+#define NVME_DEFAULT_ADMIN_QUEUE_LEN 256
+#define NVME_DEFAULT_IO_QUEUE_LEN 1024
+#define NVME_DEFAULT_ASYNC_EVENT_LIMIT 10
+#define NVME_MIN_ASYNC_EVENT_LIMIT 1
+
+#define NVME_ADMIN_CMD_TIMEOUT 100000
+
+typedef struct nvme nvme_t;
+typedef struct nvme_namespace nvme_namespace_t;
+typedef struct nvme_dma nvme_dma_t;
+typedef struct nvme_cmd nvme_cmd_t;
+typedef struct nvme_qpair nvme_qpair_t;
+typedef struct nvme_task_arg nvme_task_arg_t;
+
+struct nvme_dma {
+ ddi_dma_handle_t nd_dmah;
+ ddi_acc_handle_t nd_acch;
+ ddi_dma_cookie_t nd_cookie;
+ uint_t nd_ncookie;
+ caddr_t nd_memp;
+ size_t nd_len;
+};
+
+struct nvme_cmd {
+ nvme_sqe_t nc_sqe;
+ nvme_cqe_t nc_cqe;
+
+ void (*nc_callback)(void *);
+ bd_xfer_t *nc_xfer;
+ boolean_t nc_completed;
+ uint16_t nc_sqid;
+
+ nvme_dma_t *nc_dma;
+
+ kmutex_t nc_mutex;
+ kcondvar_t nc_cv;
+
+ taskq_ent_t nc_tqent;
+ nvme_t *nc_nvme;
+};
+
+struct nvme_qpair {
+ size_t nq_nentry;
+
+ nvme_dma_t *nq_sqdma;
+ nvme_sqe_t *nq_sq;
+ uint_t nq_sqhead;
+ uint_t nq_sqtail;
+ uintptr_t nq_sqtdbl;
+
+ nvme_dma_t *nq_cqdma;
+ nvme_cqe_t *nq_cq;
+ uint_t nq_cqhead;
+ uint_t nq_cqtail;
+ uintptr_t nq_cqhdbl;
+
+ nvme_cmd_t **nq_cmd;
+ uint16_t nq_next_cmd;
+ uint_t nq_active_cmds;
+ int nq_phase;
+
+ kmutex_t nq_mutex;
+};
+
+struct nvme {
+ dev_info_t *n_dip;
+ int n_progress;
+
+ caddr_t n_regs;
+ ddi_acc_handle_t n_regh;
+
+ kmem_cache_t *n_cmd_cache;
+
+ size_t n_inth_sz;
+ ddi_intr_handle_t *n_inth;
+ int n_intr_cnt;
+ uint_t n_intr_pri;
+ int n_intr_cap;
+ int n_intr_type;
+ int n_intr_types;
+
+ boolean_t n_dead;
+ boolean_t n_strict_version;
+ boolean_t n_ignore_unknown_vendor_status;
+ uint32_t n_admin_queue_len;
+ uint32_t n_io_queue_len;
+ uint16_t n_async_event_limit;
+ uint16_t n_abort_command_limit;
+ uint64_t n_max_data_transfer_size;
+ boolean_t n_volatile_write_cache_enabled;
+ int n_error_log_len;
+
+ int n_nssr_supported;
+ int n_doorbell_stride;
+ int n_timeout;
+ int n_arbitration_mechanisms;
+ int n_cont_queues_reqd;
+ int n_max_queue_entries;
+ int n_pageshift;
+ int n_pagesize;
+
+ int n_namespace_count;
+ int n_ioq_count;
+
+
nvme_identify_ctrl_t *n_idctl; + + nvme_qpair_t *n_adminq; + nvme_qpair_t **n_ioq; + + nvme_namespace_t *n_ns; + + ddi_dma_attr_t n_queue_dma_attr; + ddi_dma_attr_t n_prp_dma_attr; + ddi_dma_attr_t n_sgl_dma_attr; + ddi_device_acc_attr_t n_reg_acc_attr; + ddi_iblock_cookie_t n_fm_ibc; + int n_fm_cap; + + ksema_t n_abort_sema; + + ddi_taskq_t *n_cmd_taskq; + + nvme_error_log_entry_t *n_error_log; + nvme_health_log_t *n_health_log; + nvme_fwslot_log_t *n_fwslot_log; + + /* errors detected by driver */ + uint32_t n_dma_bind_err; + uint32_t n_abort_failed; + uint32_t n_cmd_timeout; + uint32_t n_cmd_aborted; + uint32_t n_async_resubmit_failed; + uint32_t n_wrong_logpage; + uint32_t n_unknown_logpage; + uint32_t n_too_many_cookies; + uint32_t n_admin_queue_full; + + /* errors detected by hardware */ + uint32_t n_data_xfr_err; + uint32_t n_internal_err; + uint32_t n_abort_rq_err; + uint32_t n_abort_sq_del; + uint32_t n_nvm_cap_exc; + uint32_t n_nvm_ns_notrdy; + uint32_t n_inv_cq_err; + uint32_t n_inv_qid_err; + uint32_t n_max_qsz_exc; + uint32_t n_inv_int_vect; + uint32_t n_inv_log_page; + uint32_t n_inv_format; + uint32_t n_inv_q_del; + uint32_t n_cnfl_attr; + uint32_t n_inv_prot; + uint32_t n_readonly; + + /* errors reported by asynchronous events */ + uint32_t n_diagfail_event; + uint32_t n_persistent_event; + uint32_t n_transient_event; + uint32_t n_fw_load_event; + uint32_t n_reliability_event; + uint32_t n_temperature_event; + uint32_t n_spare_event; + uint32_t n_vendor_event; + uint32_t n_unknown_event; + +}; + +struct nvme_namespace { + nvme_t *ns_nvme; + bd_handle_t ns_bd_hdl; + + uint32_t ns_id; + size_t ns_block_count; + size_t ns_block_size; + size_t ns_best_block_size; + + boolean_t ns_ignore; + + nvme_identify_nsid_t *ns_idns; + + /* + * Section 7.7 of the spec describes how to get a unique ID for + * the controller: the vendor ID, the model name and the serial + * number shall be unique when combined. + * + * We add the hex namespace ID to get a unique ID for the namespace. + */ + char ns_devid[4 + 1 + 20 + 1 + 40 + 1 + 8 + 1]; +}; + +struct nvme_task_arg { + nvme_t *nt_nvme; + nvme_cmd_t *nt_cmd; +}; + +#ifdef __cplusplus +/* } */ +#endif + +#endif /* _NVME_VAR_H */ diff --git a/usr/src/uts/intel/Makefile.intel b/usr/src/uts/intel/Makefile.intel index 3809a6c45e..3b3ee0723d 100644 --- a/usr/src/uts/intel/Makefile.intel +++ b/usr/src/uts/intel/Makefile.intel @@ -19,7 +19,7 @@ # # Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. -# Copyright (c) 2014 Nexenta Systems, Inc. All rights reserved. +# Copyright (c) 2015 Nexenta Systems, Inc. All rights reserved. # Copyright (c) 2013 Andrew Stormont. All rights reserved. # @@ -286,6 +286,7 @@ DRV_KMODS += nca DRV_KMODS += nsmb DRV_KMODS += nulldriver DRV_KMODS += nv_sata +DRV_KMODS += nvme DRV_KMODS += nxge DRV_KMODS += oce DRV_KMODS += openeepr diff --git a/usr/src/uts/intel/nvme/Makefile b/usr/src/uts/intel/nvme/Makefile new file mode 100644 index 0000000000..529d4378a2 --- /dev/null +++ b/usr/src/uts/intel/nvme/Makefile @@ -0,0 +1,73 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. 
+# +# CDDL HEADER END +# +# +# Copyright 2015 Nexenta Systems, Inc. All rights reserved. +# + +# +# Paths to the base of the uts directory trees +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = nvme +OBJECTS = $(NVME_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(NVME_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/common/io/nvme +# +# Include common rules. +# +include $(UTSBASE)/intel/Makefile.intel + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +# +# Driver depends on blkdev +# +LDFLAGS += -dy -N drv/blkdev + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/intel/Makefile.targ -- cgit v1.2.3 From 6d532798b6559eb98b586fd17725d8093f3b9ade Mon Sep 17 00:00:00 2001 From: Damian Wojslaw Date: Mon, 14 Sep 2015 16:35:38 +0200 Subject: 6168 strlcpy() does not return s1 Reviewed by: Marcel Telka Approved by: Robert Mustacchi --- usr/src/man/man3c/string.3c | 9 +++++---- usr/src/man/man9f/string.9f | 24 +++++------------------- 2 files changed, 10 insertions(+), 23 deletions(-) (limited to 'usr/src') diff --git a/usr/src/man/man3c/string.3c b/usr/src/man/man3c/string.3c index cc934221e3..882705284d 100644 --- a/usr/src/man/man3c/string.3c +++ b/usr/src/man/man3c/string.3c @@ -13,7 +13,7 @@ .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License. .\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH STRING 3C "Dec 20, 2014" +.TH STRING 3C "Sep 14, 2015" .SH NAME string, strcasecmp, strcasecmp_l, strncasecmp, strncasecmp_l, strcat, strncat, strlcat, strchr, strchrnul, strrchr, strcmp, strncmp, stpcpy, stpncpy, strcpy, @@ -285,9 +285,10 @@ The \fBstrcpy()\fR function copies string \fIs2\fR to \fIs1\fR, including the terminating null character, stopping after the null character has been copied. The \fBstrncpy()\fR function copies exactly \fIn\fR bytes, truncating \fIs2\fR or adding null characters to \fIs1\fR if necessary. The result will not be -null-terminated if the length of \fIs2\fR is \fIn\fR or more. Each function -returns \fIs1\fR. If copying takes place between objects that overlap, the -behavior of \fBstrcpy()\fR, \fBstrncpy()\fR, and \fBstrlcpy()\fR is undefined. +null-terminated if the length of \fIs2\fR is \fIn\fR or more. Both the +\fBstrcpy()\fR and \fBstrncpy()\fR functions return \fIs1\fR. If copying takes +place between objects that overlap, the behavior of \fBstrcpy()\fR, +\fBstrncpy()\fR, and \fBstrlcpy()\fR is undefined. 
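Editorial aside, not part of this patch: a minimal, hypothetical C snippet of the distinction the corrected wording draws; strcpy() and strncpy() return the destination pointer, while strlcpy() returns a length:

	char buf[8];
	char *p = strcpy(buf, "abc");	/* p == buf */
	size_t n = strlcpy(buf, "abcdefghij", sizeof (buf));
	/* n == 10 (the length of the source string), not a pointer; */
	/* n >= sizeof (buf) indicates the copy was truncated */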
.LP The \fBstrlcpy()\fR function copies at most \fIdstsize\fR\(mi1 characters (\fIdstsize\fR being the size of the string buffer \fIdst\fR) from \fIsrc\fR diff --git a/usr/src/man/man9f/string.9f b/usr/src/man/man9f/string.9f index 5300eb52d6..fccd40887b 100644 --- a/usr/src/man/man9f/string.9f +++ b/usr/src/man/man9f/string.9f @@ -3,7 +3,7 @@ .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License. .\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH STRING 9F "Jun 4, 2014" +.TH STRING 9F "Sep 14, 2015" .SH NAME string, strcasecmp, strncasecmp, strncat, strlcat, strchr, strrchr, strcmp, strncmp, strcpy, strncpy, strlcpy, strfree, strspn, strdup, ddi_strdup, strlen, @@ -98,11 +98,9 @@ strnlen \- string operations .fi .SH INTERFACE LEVEL -.sp .LP Solaris DDI specific (Solaris DDI). .SH DESCRIPTION -.sp .LP The arguments \fIs\fR, \fIs1\fR, and \fIs2\fR point to strings (arrays of characters terminated by a null character). The \fBstrcat()\fR, @@ -111,14 +109,12 @@ characters terminated by a null character). The \fBstrcat()\fR, Additionally, the \fBstrcpy()\fR function does not check for overflow of the array. .SS "\fBstrcasecmp()\fR, \fBstrncasecmp()\fR" -.sp .LP The \fBstrcasecmp()\fR and \fBstrncasecmp()\fR functions are case-insensitive versions of \fBstrcmp()\fR and \fBstrncmp()\fR respectively, described below. They assume the \fBASCII\fR character set and ignore differences in case when comparing lower and upper case characters. .SS "\fBstrncat()\fR, \fBstrlcat()\fR" -.sp .LP The \fBstrncat()\fR function appends at most \fIn\fR characters of string \fIs2\fR, including the terminating null character, to the end of string @@ -150,7 +146,6 @@ if (strlcat(dst, src, dstsize) >= dstsize) .in -2 .SS "\fBstrchr()\fR, \fBstrrchr()\fR" -.sp .LP The \fBstrchr()\fR function returns a pointer to the first occurrence of \fIc\fR (converted to a \fBchar\fR) in string \fIs\fR, or a null pointer if @@ -158,7 +153,6 @@ The \fBstrchr()\fR function returns a pointer to the first occurrence of pointer to the last occurrence of \fIc\fR. The null character terminating a string is considered to be part of the string. .SS "\fBstrcmp()\fR, \fBstrncmp()\fR" -.sp .LP The \fBstrcmp()\fR function compares two strings byte-by-byte, according to the ordering of your machine's character set. The function returns an integer @@ -170,15 +164,15 @@ strings being compared. The \fBstrncmp()\fR function makes the same comparison but looks at a maximum of \fIn\fR bytes. Bytes following a null byte are not compared. .SS "\fBstrcpy()\fR, \fBstrncpy()\fR, \fBstrlcpy()\fR" -.sp .LP The \fBstrcpy()\fR function copies string \fIs2\fR to \fIs1\fR, including the terminating null character, stopping after the null character has been copied. The \fBstrncpy()\fR function copies exactly \fIn\fR bytes, truncating \fIs2\fR or adding null characters to \fIs1\fR if necessary. 
The result will not be -null-terminated if the length of \fIs2\fR is \fIn\fR or more. Each function -returns \fIs1\fR. If copying takes place between objects that overlap, the -behavior of \fBstrcpy()\fR, \fBstrncpy()\fR, and \fBstrlcpy()\fR is undefined. +null-terminated if the length of \fIs2\fR is \fIn\fR or more. Both the +\fBstrcpy()\fR and \fBstrncpy()\fR functions return \fIs1\fR. If copying takes +place between objects that overlap, the behavior of \fBstrcpy()\fR, +\fBstrncpy()\fR, and \fBstrlcpy()\fR is undefined. .sp .LP The \fBstrlcpy()\fR function copies at most \fIdstsize\fR\(mi1 characters @@ -195,19 +189,16 @@ if (strlcpy(dst, src, dstsize) >= dstsize) .in -2 .SS "\fBstrfree()\fR" -.sp .LP The \fBstrfree()\fR function frees the memory associated with the string pointed to by \fIs\fR. This memory pointed to by \fIs\fR must be of size \fBstrlen\fR(\fIs\fR)+1, and must have been allocated (either directly or indirectly) by \fBkmem_alloc\fR(9F) or \fBkmem_zalloc\fR(9F). .SS "\fBstrspn()\fR" -.sp .LP The \fBstrspn()\fR function returns the length of the initial segment of string \fIs1\fR that consists entirely of characters from string \fIs2\fR. .SS "\fBstrdup()\fR, \fBddi_strdup()\fR" -.sp .LP The \fBddi_strdup()\fR function returns a pointer to a new string that is a duplicate of the string pointed to by \fIs1\fR. The returned pointer can be @@ -223,7 +214,6 @@ The \fBstrdup()\fR function behaves the same as the \fBddi_strdup()\fR when called with the \fBKM_SLEEP\fR flag. This means that \fBstrdup()\fR can sleep until memory is available and will always succeed. .SS "\fBstrlen()\fR, \fBstrnlen()\fR" -.sp .LP The \fBstrlen()\fR function returns the number of bytes in \fIs\fR, not including the terminating null character. @@ -234,7 +224,6 @@ bytes in \fIs\fR, not including the terminating null character. The \fBstrnlen()\fR function never examines more than \fIn\fR bytes of the string pointed to by \fIs\fR. .SH CONTEXT -.sp .LP The \fBstrdup()\fR and \fBddi_strdup()\fR functions can be called from user or kernel context. @@ -247,7 +236,6 @@ the \fBKM_NOSLEEP\fR flag is set. All the other string manipulation functions can be called from user, interrupt, or kernel context. 
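Editorial aside, not part of this patch: a minimal, hypothetical sketch of the context rules above; in interrupt context a caller must use KM_NOSLEEP and handle failure (name here is an arbitrary NUL-terminated string):

	char *copy = ddi_strdup(name, KM_NOSLEEP);
	if (copy == NULL)
		return (ENOMEM);	/* can fail, but never sleeps */
	/* ... use copy ... */
	strfree(copy);			/* frees strlen(copy) + 1 bytes */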
.SH ATTRIBUTES -.sp .LP See \fBattributes\fR(5) for descriptions of the following attributes: .sp @@ -263,7 +251,6 @@ Interface Stability Committed .TE .SH SEE ALSO -.sp .LP \fBstring\fR(3C), \fBattributes\fR(5), \fBbcopy\fR(9F), \fBddi_copyin\fR(9F), \fBkmem_alloc\fR(9F) @@ -271,7 +258,6 @@ Interface Stability Committed .LP \fIWriting Device Drivers\fR .SH NOTES -.sp .LP If copying takes place between objects that overlap, the behavior of \fBstrlcat()\fR, \fBstrncat()\fR, \fBstrcpy()\fR, \fBstrlcpy()\fR, and -- cgit v1.2.3 From 6734c4b0468cc77a7871a5dd5c23a5562557d64c Mon Sep 17 00:00:00 2001 From: Robert Mustacchi Date: Sat, 8 Aug 2015 00:24:00 +0000 Subject: 6189 want smbios 3.0 support 6190 libsmbios checks against the wrong version 6191 libsmbios can be a bit more forgiving to the past 6192 smbios misprints extended onboard devices Reviewed by: Igor Kozhukhov Reviewed by: Dan McDonald Reviewed by: Josef 'Jeff' Sipek Approved by: Garrett D'Amore --- usr/src/cmd/smbios/smbios.c | 50 ++++++++++++++++++++------ usr/src/common/smbios/mktables.sh | 1 + usr/src/common/smbios/smb_info.c | 45 ++++++++++++++++++----- usr/src/common/smbios/smb_open.c | 2 ++ usr/src/lib/libsmbios/common/mapfile-vers | 1 + usr/src/uts/common/sys/smbios.h | 45 ++++++++++++++++++++--- usr/src/uts/common/sys/smbios_impl.h | 60 +++++++++++++++++++++++++++++++ 7 files changed, 181 insertions(+), 23 deletions(-) (limited to 'usr/src') diff --git a/usr/src/cmd/smbios/smbios.c b/usr/src/cmd/smbios/smbios.c index 1278548fb7..a0bde99ac3 100644 --- a/usr/src/cmd/smbios/smbios.c +++ b/usr/src/cmd/smbios/smbios.c @@ -21,6 +21,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -439,20 +440,38 @@ print_processor(smbios_hdl_t *shp, id_t id, FILE *fp) (float)SMB_PRV_VOLTAGE(p.smbp_voltage) / 10); } - if (p.smbp_corecount != 0) - oprintf(fp, " Core Count: %u\n", p.smbp_corecount); - else + if (p.smbp_corecount != 0) { + if (p.smbp_corecount != 0xff || p.smbp_corecount2 == 0) + oprintf(fp, " Core Count: %u\n", p.smbp_corecount); + else + oprintf(fp, " Core Count: %u\n", p.smbp_corecount2); + } else { oprintf(fp, " Core Count: Unknown\n"); + } - if (p.smbp_coresenabled != 0) - oprintf(fp, " Cores Enabled: %u\n", p.smbp_coresenabled); - else + if (p.smbp_coresenabled != 0) { + if (p.smbp_coresenabled != 0xff || p.smbp_coresenabled2 == 0) { + oprintf(fp, " Cores Enabled: %u\n", + p.smbp_coresenabled); + } else { + oprintf(fp, " Cores Enabled: %u\n", + p.smbp_coresenabled2); + } + } else { oprintf(fp, " Cores Enabled: Unknown\n"); + } - if (p.smbp_threadcount != 0) - oprintf(fp, " Thread Count: %u\n", p.smbp_threadcount); - else + if (p.smbp_threadcount != 0) { + if (p.smbp_threadcount != 0xff || p.smbp_threadcount2 == 0) { + oprintf(fp, " Thread Count: %u\n", + p.smbp_threadcount); + } else { + oprintf(fp, " Thread Count: %u\n", + p.smbp_threadcount2); + } + } else { oprintf(fp, " Thread Count: Unknown\n"); + } if (p.smbp_cflags) { flag_printf(fp, "Processor Characteristics", @@ -590,12 +609,23 @@ print_slot(smbios_hdl_t *shp, id_t id, FILE *fp) static void print_obdevs_ext(smbios_hdl_t *shp, id_t id, FILE *fp) { + boolean_t enabled; smbios_obdev_ext_t oe; + const char *type; (void) smbios_info_obdevs_ext(shp, id, &oe); + /* + * Bit 7 is always whether or not the device is enabled while bits 0:6 + * are the actual device type. 
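+ * (Editorial illustration with a hypothetical value: smboe_dtype 0x85 has
+ * bit 7 set, so the device is enabled, and 0x85 & 0x7f gives type 0x05,
+ * Ethernet in the SMBIOS onboard device type table.)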
+ */ + enabled = oe.smboe_dtype >> 7; + type = smbios_onboard_type_desc(oe.smboe_dtype & 0x7f); + oprintf(fp, " Reference Designator: %s\n", oe.smboe_name); - oprintf(fp, " Device Type: %u\n", oe.smboe_dtype); + oprintf(fp, " Device Enabled: %s\n", enabled == B_TRUE ? "true" : + "false"); + oprintf(fp, " Device Type: %s\n", type); oprintf(fp, " Device Type Instance: %u\n", oe.smboe_dti); oprintf(fp, " Segment Group Number: %u\n", oe.smboe_sg); oprintf(fp, " Bus Number: %u\n", oe.smboe_bus); diff --git a/usr/src/common/smbios/mktables.sh b/usr/src/common/smbios/mktables.sh index 73537a81f3..0d8f0cf997 100644 --- a/usr/src/common/smbios/mktables.sh +++ b/usr/src/common/smbios/mktables.sh @@ -83,6 +83,7 @@ SMB_MDF_ smbios_memdevice_flag_desc uint_t SMB_MDFF_ smbios_memdevice_form_desc uint_t SMB_MDT_ smbios_memdevice_type_desc uint_t SMB_MDR_ smbios_memdevice_rank_desc uint_t +SMB_OBT_ smbios_onboard_type_desc uint_t SMB_POC_ smbios_port_conn_desc uint_t SMB_POT_ smbios_port_type_desc uint_t SMB_PRC_ smbios_processor_core_flag_desc uint_t diff --git a/usr/src/common/smbios/smb_info.c b/usr/src/common/smbios/smb_info.c index edd4f312b6..2eb9d8351b 100644 --- a/usr/src/common/smbios/smb_info.c +++ b/usr/src/common/smbios/smb_info.c @@ -21,6 +21,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -45,6 +46,23 @@ * also to automatically handle the case of a structure that has been extended. * When necessary, this code can use smb_gteq() to determine whether the SMBIOS * data is of a particular revision that is supposed to contain a new field. + * + * Note, when trying to bzero the caller's struct you have to be careful about + * versions. One can only bzero the initial version that existed in illumos. In + * other words, if someone passes an older library handle that doesn't support a + * version you cannot assume that their structures have those additional members + * in them. Instead, a 'base' version is introduced for such types that have + * differences and instead we only bzero out the base version and then handle + * the additional members. In general, because all additional members will be + * assigned, there's no reason to zero them out unless they are arrays that + * won't be entirely filled in. + * + * Due to history, anything added after the update from version 2.4, in other + * words additions from or after '5094 Update libsmbios with recent items' + * (4e901881) is currently being used for this. While we don't allow software + * compiling against this to get an older form, this was the first major update + * and a good starting point for us to enforce this behavior which is useful for + * moving forward to making this more public. 
*/ #include @@ -423,7 +441,10 @@ smbios_info_chassis(smbios_hdl_t *shp, id_t id, smbios_chassis_t *chp) return (smb_set_errno(shp, ESMB_TYPE)); smb_info_bcopy(stp->smbst_hdr, ch, sizeof (buf)); - bzero(chp, sizeof (smbios_chassis_t)); + bzero(chp, sizeof (smb_base_chassis_t)); + if (shp->sh_libvers >= SMB_VERSION_27) { + bzero(chp->smbc_sku, sizeof (chp->smbc_sku)); + } chp->smbc_oemdata = ch->smbch_oemdata; chp->smbc_lock = (ch->smbch_type & SMB_CHT_LOCK) != 0; @@ -437,7 +458,7 @@ smbios_info_chassis(smbios_hdl_t *shp, id_t id, smbios_chassis_t *chp) chp->smbc_elems = ch->smbch_cn; chp->smbc_elemlen = ch->smbch_cm; - if (shp->sh_smbvers >= SMB_VERSION_27) { + if (shp->sh_libvers >= SMB_VERSION_27) { (void) strlcpy(chp->smbc_sku, SMB_CH_SKU(ch), sizeof (chp->smbc_sku)); } @@ -458,7 +479,7 @@ smbios_info_processor(smbios_hdl_t *shp, id_t id, smbios_processor_t *pp) return (smb_set_errno(shp, ESMB_TYPE)); smb_info_bcopy(stp->smbst_hdr, &p, sizeof (p)); - bzero(pp, sizeof (smbios_processor_t)); + bzero(pp, sizeof (smb_base_processor_t)); pp->smbp_cpuid = p.smbpr_cpuid; pp->smbp_type = p.smbpr_type; @@ -472,16 +493,22 @@ smbios_info_processor(smbios_hdl_t *shp, id_t id, smbios_processor_t *pp) pp->smbp_l2cache = p.smbpr_l2cache; pp->smbp_l3cache = p.smbpr_l3cache; - if (shp->sh_smbvers >= SMB_VERSION_25) { + if (shp->sh_libvers >= SMB_VERSION_25) { pp->smbp_corecount = p.smbpr_corecount; pp->smbp_coresenabled = p.smbpr_coresenabled; pp->smbp_threadcount = p.smbpr_threadcount; pp->smbp_cflags = p.smbpr_cflags; } - if (shp->sh_smbvers >= SMB_VERSION_26) + if (shp->sh_libvers >= SMB_VERSION_26) pp->smbp_family2 = p.smbpr_family2; + if (shp->sh_libvers >= SMB_VERSION_30) { + pp->smbp_corecount2 = p.smbpr_corecount2; + pp->smbp_coresenabled2 = p.smbpr_coresenabled2; + pp->smbp_threadcount2 = p.smbpr_threadcount2; + } + return (0); } @@ -787,7 +814,7 @@ smbios_info_memdevice(smbios_hdl_t *shp, id_t id, smbios_memdevice_t *mdp) return (smb_set_errno(shp, ESMB_TYPE)); smb_info_bcopy(stp->smbst_hdr, &m, sizeof (m)); - bzero(mdp, sizeof (smbios_memdevice_t)); + bzero(mdp, sizeof (smb_base_memdevice_t)); mdp->smbmd_array = m.smbmdev_array; mdp->smbmd_error = m.smbmdev_error; @@ -814,13 +841,13 @@ smbios_info_memdevice(smbios_hdl_t *shp, id_t id, smbios_memdevice_t *mdp) mdp->smbmd_dloc = smb_strptr(stp, m.smbmdev_dloc); mdp->smbmd_bloc = smb_strptr(stp, m.smbmdev_bloc); - if (shp->sh_smbvers >= SMB_VERSION_26) + if (shp->sh_libvers >= SMB_VERSION_26) mdp->smbmd_rank = m.smbmdev_attrs & 0x0F; - if (shp->sh_smbvers >= SMB_VERSION_27) + if (shp->sh_libvers >= SMB_VERSION_27) mdp->smbmd_clkspeed = m.smbmdev_clkspeed; - if (shp->sh_smbvers >= SMB_VERSION_28) { + if (shp->sh_libvers >= SMB_VERSION_28) { mdp->smbmd_minvolt = m.smbmdev_minvolt; mdp->smbmd_maxvolt = m.smbmdev_maxvolt; mdp->smbmd_confvolt = m.smbmdev_confvolt; diff --git a/usr/src/common/smbios/smb_open.c b/usr/src/common/smbios/smb_open.c index 05c9179c64..242facc161 100644 --- a/usr/src/common/smbios/smb_open.c +++ b/usr/src/common/smbios/smb_open.c @@ -21,6 +21,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -87,6 +88,7 @@ smbios_bufopen(const smbios_entry_t *ep, const void *buf, size_t len, case SMB_VERSION_26: case SMB_VERSION_27: case SMB_VERSION_28: + case SMB_VERSION_30: break; default: return (smb_open_error(shp, errp, ESMB_VERSION)); diff --git a/usr/src/lib/libsmbios/common/mapfile-vers b/usr/src/lib/libsmbios/common/mapfile-vers index 046b4e298d..c618572a12 100644 --- a/usr/src/lib/libsmbios/common/mapfile-vers +++ b/usr/src/lib/libsmbios/common/mapfile-vers @@ -120,6 +120,7 @@ SYMBOL_VERSION SUNWprivate_1.1 { smbios_memdevice_type_desc; smbios_memdevice_rank_desc; smbios_open; + smbios_onboard_type_desc; smbios_port_conn_desc; smbios_port_type_desc; smbios_processor_family_desc; diff --git a/usr/src/uts/common/sys/smbios.h b/usr/src/uts/common/sys/smbios.h index 75a9173dbe..7e653cb911 100644 --- a/usr/src/uts/common/sys/smbios.h +++ b/usr/src/uts/common/sys/smbios.h @@ -21,6 +21,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -345,6 +346,9 @@ typedef struct smbios_chassis { #define SMB_CHT_ATCA 0x1B /* advanced TCA */ #define SMB_CHT_BLADE 0x1C /* blade */ #define SMB_CHT_BLADEENC 0x1D /* blade enclosure */ +#define SMB_CHT_TABLET 0x1E /* tablet */ +#define SMB_CHT_CONVERTIBLE 0x1F /* convertible */ +#define SMB_CHT_DETACHABLE 0x20 /* detachable */ #define SMB_CHST_OTHER 0x01 /* other */ #define SMB_CHST_UNKNOWN 0x02 /* unknown */ @@ -382,14 +386,18 @@ typedef struct smbios_processor { id_t smbp_l1cache; /* L1 cache handle */ id_t smbp_l2cache; /* L2 cache handle */ id_t smbp_l3cache; /* L3 cache handle */ - uint8_t smbp_corecount; /* number of cores per processor socket */ - uint8_t smbp_coresenabled; + uint32_t smbp_corecount; + /* number of cores per processor socket */ + uint32_t smbp_coresenabled; /* number of enabled cores per processor socket */ - uint8_t smbp_threadcount; + uint32_t smbp_threadcount; /* number of threads per processor socket */ uint16_t smbp_cflags; /* processor characteristics (SMB_PRC_*) */ uint16_t smbp_family2; /* processor family 2 */ + uint16_t smbp_corecount2; /* core count 2 */ + uint16_t smbp_coresenabled2; /* cores enabled 2 */ + uint16_t smbp_threadcount2; /* thread count 2 */ } smbios_processor_t; #define SMB_PRT_OTHER 0x01 /* other */ @@ -462,6 +470,10 @@ typedef struct smbios_processor { #define SMB_PRU_FM2 0x2A /* socket FM2 */ #define SMB_PRU_LGA20113 0x2B /* LGA2011-3 */ #define SMB_PRU_LGA13563 0x2C /* LGA1356-3 */ +#define SMB_PRU_LGA1150 0x2D /* LGA1150 */ +#define SMB_PRU_BGA1168 0x2E /* BGA1168 */ +#define SMB_PRU_BGA1234 0x2F /* BGA1234 */ +#define SMB_PRU_BGA1364 0x30 /* BGA1364 */ #define SMB_PRC_RESERVED 0x0001 /* reserved */ #define SMB_PRC_UNKNOWN 0x0002 /* unknown */ @@ -513,6 +525,7 @@ typedef struct smbios_processor { #define SMB_PRF_CORE_DUO_M 0x29 /* Core Duo mobile */ #define SMB_PRF_CORE_SOLO_M 0x2A /* Core Solo mobile */ #define SMB_PRF_ATOM 0x2B /* Intel Atom */ +#define SMB_PRF_CORE_M 0x2C /* Intel Core M */ #define SMB_PRF_ALPHA 0x30 /* Alpha */ #define SMB_PRF_ALPHA_21064 0x31 /* Alpha 21064 */ #define SMB_PRF_ALPHA_21066 0x32 /* Alpha 21066 */ @@ -561,6 +574,9 @@ typedef struct smbios_processor { #define SMB_PRF_68010 0x63 /* 68010 */ #define SMB_PRF_68020 0x64 /* 68020 */ #define SMB_PRF_68030 0x65 /* 68030 */ +#define SMB_PRF_ATHLON_X4 0x66 /* AMD Athlon X4 Quad-Core */ +#define SMB_PRF_OPTERON_X1K 0x67 /* AMD Opteron X1000 */ +#define 
SMB_PRF_OPTERON_X2K 0x68 /* AMD Opteron X2000 APU */ #define SMB_PRF_HOBBIT 0x70 /* Hobbit */ #define SMB_PRF_TM5000 0x78 /* Crusoe TM5000 */ #define SMB_PRF_TM3000 0x79 /* Crusoe TM3000 */ @@ -870,6 +886,19 @@ typedef struct smbios_slot { #define SMB_SLT_AGP4X 0x11 /* AGP 4X */ #define SMB_SLT_PCIX 0x12 /* PCI-X */ #define SMB_SLT_AGP8X 0x13 /* AGP 8X */ +#define SMB_SLT_M2_1DP 0x14 /* M.2 Socket 1-DP (Mechanical Key A) */ +#define SMB_SLT_M2_1SD 0x15 /* M.2 Socket 1-SD (Mechanical Key E) */ +#define SMB_SLT_M2_2 0x16 /* M.2 Socket 2 (Mechanical Key B) */ +#define SMB_SLT_M2_3 0x17 /* M.2 Socket 3 (Mechanical Key M) */ +#define SMB_SLT_MXM_I 0x18 /* MXM Type I */ +#define SMB_SLT_MXM_II 0x19 /* MXM Type II */ +#define SMB_SLT_MXM_III 0x1A /* MXM Type III (standard connector) */ +#define SMB_SLT_MXM_III_HE 0x1B /* MXM Type III (HE connector) */ +#define SMB_SLT_MXM_V 0x1C /* MXM Type IV */ +#define SMB_SLT_MXM3_A 0x1D /* MXM 3.0 Type A */ +#define SMB_SLT_MXM3_B 0x1E /* MXM 3.0 Type B */ +#define SMB_SLT_PCIEG2_SFF 0x1F /* PCI Express Gen 2 SFF-8639 */ +#define SMB_SLT_PCIEG3_SFF 0x20 /* PCI Express Gen 3 SFF-8639 */ #define SMB_SLT_PC98_C20 0xA0 /* PC-98/C20 */ #define SMB_SLT_PC98_C24 0xA1 /* PC-98/C24 */ #define SMB_SLT_PC98_E 0xA2 /* PC-98/E */ @@ -1124,6 +1153,11 @@ typedef struct smbios_memdevice { #define SMB_MDT_DDR2FBDIMM 0x14 /* DDR2 FBDIMM */ #define SMB_MDT_DDR3 0x18 /* DDR3 */ #define SMB_MDT_FBD2 0x19 /* FBD2 */ +#define SMB_MDT_DDR4 0x1A /* DDR4 */ +#define SMB_MDT_LPDDR 0x1B /* LPDDR */ +#define SMB_MDT_LPDDR2 0x1C /* LPDDR2 */ +#define SMB_MDT_LPDDR3 0x1D /* LPDDR3 */ +#define SMB_MDT_LPDDR4 0x1E /* LPDDR4 */ #define SMB_MDF_OTHER 0x0002 /* other */ #define SMB_MDF_UNKNOWN 0x0004 /* unknown */ @@ -1320,7 +1354,8 @@ typedef struct smbios_memdevice_ext { #define SMB_VERSION_26 0x0206 /* SMBIOS encoding for DMTF spec 2.6 */ #define SMB_VERSION_27 0x0207 /* SMBIOS encoding for DMTF spec 2.7 */ #define SMB_VERSION_28 0x0208 /* SMBIOS encoding for DMTF spec 2.8 */ -#define SMB_VERSION SMB_VERSION_28 /* SMBIOS latest version definitions */ +#define SMB_VERSION_30 0x0300 /* SMBIOS encoding for DMTF spec 3.0 */ +#define SMB_VERSION SMB_VERSION_30 /* SMBIOS latest version definitions */ #define SMB_O_NOCKSUM 0x1 /* do not verify header checksums */ #define SMB_O_NOVERS 0x2 /* do not verify header versions */ @@ -1453,6 +1488,8 @@ extern const char *smbios_memdevice_flag_name(uint_t); extern const char *smbios_memdevice_flag_desc(uint_t); extern const char *smbios_memdevice_rank_desc(uint_t); +extern const char *smbios_onboard_type_desc(uint_t); + extern const char *smbios_port_conn_desc(uint_t); extern const char *smbios_port_type_desc(uint_t); diff --git a/usr/src/uts/common/sys/smbios_impl.h b/usr/src/uts/common/sys/smbios_impl.h index 0013dcba2b..a668d6678f 100644 --- a/usr/src/uts/common/sys/smbios_impl.h +++ b/usr/src/uts/common/sys/smbios_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
diff --git a/usr/src/uts/common/sys/smbios_impl.h b/usr/src/uts/common/sys/smbios_impl.h
index 0013dcba2b..a668d6678f 100644
--- a/usr/src/uts/common/sys/smbios_impl.h
+++ b/usr/src/uts/common/sys/smbios_impl.h
@@ -21,6 +21,7 @@
 /*
  * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
  * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
@@ -145,6 +146,9 @@ typedef struct smb_processor {
 	uint8_t smbpr_threadcount;	/* number of threads per socket */
 	uint16_t smbpr_cflags;		/* cpu characteristics (see <sys/smbios.h>) */
 	uint16_t smbpr_family2;		/* processor family2 (see <sys/smbios.h>) */
+	uint16_t smbpr_corecount2;	/* second number of cores per socket */
+	uint16_t smbpr_coresenabled2;	/* second number of enabled cores */
+	uint16_t smbpr_threadcount2;	/* second number of enabled threads */
 } smb_processor_t;
 
 typedef struct smb_cache {
@@ -506,6 +510,62 @@ extern void smb_dprintf(smbios_hdl_t *, const char *, ...);
 
 extern int _smb_debug;
 
+/*
+ * The following series of structures represent the base versions of public
+ * structures that are used internally by the smbios routines. This allows the
+ * common code to properly know how much it should or should not bzero and how
+ * to handle additions to the spec. Types should only be added here if we need
+ * to extend the public structures in sys/smbios.h due to a change in the spec.
+ *
+ * Types here have the name smb_base_%s which corresponds to smbios_%s.
+ */
+typedef struct smb_base_chassis {
+	uint32_t smbbc_oemdata;		/* OEM-specific data */
+	uint8_t smbbc_lock;		/* lock present? */
+	uint8_t smbbc_type;		/* type */
+	uint8_t smbbc_bustate;		/* boot-up state */
+	uint8_t smbbc_psstate;		/* power supply state */
+	uint8_t smbbc_thstate;		/* thermal state */
+	uint8_t smbbc_security;		/* security status */
+	uint8_t smbbc_uheight;		/* enclosure height in U's */
+	uint8_t smbbc_cords;		/* number of power cords */
+	uint8_t smbbc_elems;		/* number of element records (n) */
+	uint8_t smbbc_elemlen;		/* length of contained element (m) */
+} smb_base_chassis_t;
+
+typedef struct smb_base_processor {
+	uint64_t smbbp_cpuid;		/* processor cpuid information */
+	uint32_t smbbp_family;		/* processor family */
+	uint8_t smbbp_type;		/* processor type (SMB_PRT_*) */
+	uint8_t smbbp_voltage;		/* voltage (SMB_PRV_*) */
+	uint8_t smbbp_status;		/* status (SMB_PRS_*) */
+	uint8_t smbbp_upgrade;		/* upgrade (SMB_PRU_*) */
+	uint32_t smbbp_clkspeed;	/* external clock speed in MHz */
+	uint32_t smbbp_maxspeed;	/* maximum speed in MHz */
+	uint32_t smbbp_curspeed;	/* current speed in MHz */
+	id_t smbbp_l1cache;		/* L1 cache handle */
+	id_t smbbp_l2cache;		/* L2 cache handle */
+	id_t smbbp_l3cache;		/* L3 cache handle */
+} smb_base_processor_t;
+
+typedef struct smb_base_memdevice {
+	id_t smbbmd_array;		/* handle of physical memory array */
+	id_t smbbmd_error;		/* handle of memory error data */
+	uint32_t smbbmd_twidth;		/* total width in bits including ecc */
+	uint32_t smbbmd_dwidth;		/* data width in bits */
+	uint64_t smbbmd_size;		/* size in bytes (see note above) */
+	uint8_t smbbmd_form;		/* form factor */
+	uint8_t smbbmd_set;		/* set (0x00=none, 0xFF=unknown) */
+	uint8_t smbbmd_type;		/* memory type */
+	uint8_t smbbmd_pad;		/* padding */
+	uint32_t smbbmd_flags;		/* flags (see below) */
+	uint32_t smbbmd_speed;		/* speed in MHz */
+	const char *smbbmd_dloc;	/* physical device locator string */
+	const char *smbbmd_bloc;	/* physical bank locator string */
+	uint8_t smbbmd_rank;		/* rank */
+} smb_base_memdevice_t;
+
 
 #ifdef __cplusplus
 }
 #endif
-- 
cgit v1.2.3
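The block comment above spells out the design: keep a frozen "base" image of each public struct so the common code can zero exactly the members that a pre-3.0 table cannot provide. A sketch of that fill pattern, with hypothetical pub_t/base_t types standing in for the real structures (base_t mirrors the leading members of pub_t):

#include <stdint.h>
#include <string.h>

/* hypothetical public struct and its older "base" prefix */
typedef struct pub { uint32_t p_old1; uint32_t p_old2; uint16_t p_new; } pub_t;
typedef struct base { uint32_t p_old1; uint32_t p_old2; } base_t;

static void
fill_pub(pub_t *out, const base_t *in)
{
	/* zero everything so fields the table lacks read as 0 ... */
	(void) memset(out, 0, sizeof (*out));
	/* ... then overlay only the base-sized portion */
	(void) memcpy(out, in, sizeof (*in));
}

Sizing the copy by the base type rather than the public type is what lets the public structures keep growing with the spec without corrupting callers that hand in older data.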
From 39fd84a866206a99cbb6b6e63e0c38a367aaa88e Mon Sep 17 00:00:00 2001
From: Hans Rosenfeld
Date: Tue, 4 Aug 2015 23:02:41 +0200
Subject: 6119 mptsas doesn't handle timeouts in mptsas_get_sata_guid()

Reviewed by: Josef 'Jeff' Sipek
Reviewed by: Gordon Ross
Reviewed by: Dan Fields
Reviewed by: Yuri Pankov
Reviewed by: Robert Mustacchi
Reviewed by: Albert Lee
Reviewed by: Garrett D'Amore
Approved by: Dan McDonald
---
 .../uts/common/io/scsi/adapters/mpt_sas/mptsas.c   | 194 +++++++++++++--------
 .../common/io/scsi/adapters/mpt_sas/mptsas_raid.c  |   5 +-
 .../common/sys/scsi/adapters/mpt_sas/mptsas_var.h  |   5 +-
 3 files changed, 132 insertions(+), 72 deletions(-)

(limited to 'usr/src')

diff --git a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c
index f094c9a510..9a74c36c28 100644
--- a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c
+++ b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c
@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  * Copyright 2014 OmniTI Computer Consulting, Inc. All rights reserved.
  * Copyright (c) 2014, Tegile Systems Inc. All rights reserved.
@@ -410,7 +410,7 @@ static void mptsas_record_event(void *args);
 static int mptsas_reg_access(mptsas_t *mpt, mptsas_reg_access_t *data,
     int mode);
 
-mptsas_target_t *mptsas_tgt_alloc(mptsas_t *, uint16_t, uint64_t,
+mptsas_target_t *mptsas_tgt_alloc(refhash_t *, uint16_t, uint64_t,
     uint32_t, mptsas_phymask_t, uint8_t);
 static mptsas_smp_t *mptsas_smp_alloc(mptsas_t *, mptsas_smp_t *);
 static int mptsas_online_smp(dev_info_t *pdip, mptsas_smp_t *smp_node,
@@ -782,6 +782,23 @@ mptsas_target_addr_cmp(const void *a, const void *b)
 	return ((int)bap->mta_phymask - (int)aap->mta_phymask);
 }
 
+static uint64_t
+mptsas_tmp_target_hash(const void *tp)
+{
+	return ((uint64_t)(uintptr_t)tp);
+}
+
+static int
+mptsas_tmp_target_cmp(const void *a, const void *b)
+{
+	if (a > b)
+		return (1);
+	if (a < b)
+		return (-1);
+
+	return (0);
+}
+
 static void
 mptsas_target_free(void *op)
 {
@@ -808,6 +825,7 @@
 	    sp = refhash_next(mpt->m_smp_targets, sp)) {
 		refhash_remove(mpt->m_smp_targets, sp);
 	}
+	refhash_destroy(mpt->m_tmp_targets);
 	refhash_destroy(mpt->m_targets);
 	refhash_destroy(mpt->m_smp_targets);
 	mpt->m_targets = NULL;
@@ -1364,6 +1382,16 @@ mptsas_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 	    offsetof(mptsas_target_t, m_link),
 	    offsetof(mptsas_target_t, m_addr), KM_SLEEP);
 
+	/*
+	 * The refhash for temporary targets uses the address of the target
+	 * struct itself as tag, so the tag offset is 0. See the implementation
+	 * of mptsas_tmp_target_hash() and mptsas_tmp_target_cmp().
+	 */
+	mpt->m_tmp_targets = refhash_create(MPTSAS_TMP_TARGET_BUCKET_COUNT,
+	    mptsas_tmp_target_hash, mptsas_tmp_target_cmp,
+	    mptsas_target_free, sizeof (mptsas_target_t),
+	    offsetof(mptsas_target_t, m_link), 0, KM_SLEEP);
+
 	/*
 	 * Fill in the phy_info structure and get the base WWID
 	 */
@@ -1550,6 +1578,8 @@ fail:
 		mptsas_hba_teardown(mpt);
 	}
 
+	if (mpt->m_tmp_targets)
+		refhash_destroy(mpt->m_tmp_targets);
 	if (mpt->m_targets)
 		refhash_destroy(mpt->m_targets);
 	if (mpt->m_smp_targets)
@@ -6375,10 +6405,15 @@ mptsas_handle_topo_change(mptsas_topo_change_list_t *topo_node,
 			mptsas_log(mpt, CE_NOTE,
 			    "mptsas_handle_topo_change: could not "
 			    "allocate memory. \n");
+		} else if (rval == DEV_INFO_FAIL_GUID) {
+			mptsas_log(mpt, CE_NOTE,
+			    "mptsas_handle_topo_change: could not "
+			    "get SATA GUID for target %d. \n",
+			    topo_node->devhdl);
 		}
 		/*
-		 * If rval is DEV_INFO_PHYS_DISK than there is nothing
-		 * else to do, just leave.
+		 * If rval is DEV_INFO_PHYS_DISK or indicates failure
+		 * then there is nothing else to do, just leave.
 		 */
 		if (rval != DEV_INFO_SUCCESS) {
 			return;
 		}
 	}
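The temporary-target refhash registered in mptsas_attach() keys entries by the target structure's own address, so hashing and comparison reduce to pointer identity. A standalone sketch of the same pattern; routing the relational tests through uintptr_t keeps them well defined in portable C even for pointers into unrelated objects:

#include <stdint.h>

static uint64_t
identity_hash(const void *p)
{
	return ((uint64_t)(uintptr_t)p);
}

static int
identity_cmp(const void *a, const void *b)
{
	uintptr_t ua = (uintptr_t)a;
	uintptr_t ub = (uintptr_t)b;

	if (ua > ub)
		return (1);
	if (ua < ub)
		return (-1);
	return (0);	/* equal only when both name the same object */
}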
@@ -9875,6 +9910,61 @@ mptsas_watch(void *arg)
 	mutex_exit(&mptsas_global_mutex);
 }
 
+static void
+mptsas_watchsubr_tgt(mptsas_t *mpt, mptsas_target_t *ptgt, hrtime_t timestamp)
+{
+	mptsas_cmd_t *cmd;
+
+	/*
+	 * If we were draining due to a qfull condition,
+	 * go back to full throttle.
+	 */
+	if ((ptgt->m_t_throttle < MAX_THROTTLE) &&
+	    (ptgt->m_t_throttle > HOLD_THROTTLE) &&
+	    (ptgt->m_t_ncmds < ptgt->m_t_throttle)) {
+		mptsas_set_throttle(mpt, ptgt, MAX_THROTTLE);
+		mptsas_restart_hba(mpt);
+	}
+
+	cmd = TAILQ_LAST(&ptgt->m_active_cmdq, mptsas_active_cmdq);
+	if (cmd == NULL)
+		return;
+
+	if (cmd->cmd_active_expiration <= timestamp) {
+		/*
+		 * Earliest command timeout expired. Drain throttle.
+		 */
+		mptsas_set_throttle(mpt, ptgt, DRAIN_THROTTLE);
+
+		/*
+		 * Check for remaining commands.
+		 */
+		cmd = TAILQ_FIRST(&ptgt->m_active_cmdq);
+		if (cmd->cmd_active_expiration > timestamp) {
+			/*
+			 * Wait for remaining commands to complete or
+			 * time out.
+			 */
+			NDBG23(("command timed out, pending drain"));
+			return;
+		}
+
+		/*
+		 * All command timeouts expired.
+		 */
+		mptsas_log(mpt, CE_NOTE, "Timeout of %d seconds "
+		    "expired with %d commands on target %d lun %d.",
+		    cmd->cmd_pkt->pkt_time, ptgt->m_t_ncmds,
+		    ptgt->m_devhdl, Lun(cmd));
+
+		mptsas_cmd_timeout(mpt, ptgt);
+	} else if (cmd->cmd_active_expiration <=
+	    timestamp + (hrtime_t)mptsas_scsi_watchdog_tick * NANOSEC) {
+		NDBG23(("pending timeout"));
+		mptsas_set_throttle(mpt, ptgt, DRAIN_THROTTLE);
+	}
+}
+
 static void
 mptsas_watchsubr(mptsas_t *mpt)
 {
@@ -9926,54 +10016,12 @@ mptsas_watchsubr(mptsas_t *mpt)
 
 	for (ptgt = refhash_first(mpt->m_targets); ptgt != NULL;
 	    ptgt = refhash_next(mpt->m_targets, ptgt)) {
-		/*
-		 * If we were draining due to a qfull condition,
-		 * go back to full throttle.
-		 */
-		if ((ptgt->m_t_throttle < MAX_THROTTLE) &&
-		    (ptgt->m_t_throttle > HOLD_THROTTLE) &&
-		    (ptgt->m_t_ncmds < ptgt->m_t_throttle)) {
-			mptsas_set_throttle(mpt, ptgt, MAX_THROTTLE);
-			mptsas_restart_hba(mpt);
-		}
-
-		cmd = TAILQ_LAST(&ptgt->m_active_cmdq, mptsas_active_cmdq);
-		if (cmd == NULL)
-			continue;
-
-		if (cmd->cmd_active_expiration <= timestamp) {
-			/*
-			 * Earliest command timeout expired. Drain throttle.
-			 */
-			mptsas_set_throttle(mpt, ptgt, DRAIN_THROTTLE);
-
-			/*
-			 * Check for remaining commands.
-			 */
-			cmd = TAILQ_FIRST(&ptgt->m_active_cmdq);
-			if (cmd->cmd_active_expiration > timestamp) {
-				/*
-				 * Wait for remaining commands to complete or
-				 * time out.
-				 */
-				NDBG23(("command timed out, pending drain"));
-				continue;
-			}
-
-			/*
-			 * All command timeouts expired.
-			 */
-			mptsas_log(mpt, CE_NOTE, "Timeout of %d seconds "
-			    "expired with %d commands on target %d lun %d.",
-			    cmd->cmd_pkt->pkt_time, ptgt->m_t_ncmds,
-			    ptgt->m_devhdl, Lun(cmd));
-
-			mptsas_cmd_timeout(mpt, ptgt);
-		} else if (cmd->cmd_active_expiration <=
-		    timestamp + (hrtime_t)mptsas_scsi_watchdog_tick * NANOSEC) {
-			NDBG23(("pending timeout"));
-			mptsas_set_throttle(mpt, ptgt, DRAIN_THROTTLE);
-		}
+		mptsas_watchsubr_tgt(mpt, ptgt, timestamp);
+	}
+
+	for (ptgt = refhash_first(mpt->m_tmp_targets); ptgt != NULL;
+	    ptgt = refhash_next(mpt->m_tmp_targets, ptgt)) {
+		mptsas_watchsubr_tgt(mpt, ptgt, timestamp);
 	}
 }
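mptsas_watchsubr_tgt() works entirely in absolute gethrtime() timestamps, so "already expired" and "will expire within the next watchdog tick" are plain integer comparisons in nanoseconds. The same arithmetic in isolation (illustrative names; the tick is expressed in seconds, as in the driver):

#include <sys/time.h>	/* hrtime_t, NANOSEC */

/* has the absolute deadline already passed? */
static int
deadline_expired(hrtime_t expiration, hrtime_t now)
{
	return (expiration <= now);
}

/* will it pass within the next tick_sec seconds? */
static int
deadline_pending(hrtime_t expiration, hrtime_t now, int tick_sec)
{
	return (expiration <= now + (hrtime_t)tick_sec * NANOSEC);
}

Factoring the per-target body out this way is what lets the temporary targets in m_tmp_targets share the exact timeout machinery used for real targets.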
@@ -13555,28 +13603,32 @@ mptsas_get_target_device_info(mptsas_t *mpt, uint32_t page_address,
 	 */
 	if (dev_info & (MPI2_SAS_DEVICE_INFO_SATA_DEVICE |
 	    MPI2_SAS_DEVICE_INFO_ATAPI_DEVICE)) {
+		/* alloc a temporary target to send the cmd to */
+		tmp_tgt = mptsas_tgt_alloc(mpt->m_tmp_targets, *dev_handle,
+		    0, dev_info, 0, 0);
 		mutex_exit(&mpt->m_mutex);
-		/* alloc a tmp_tgt to send the cmd */
-		tmp_tgt = kmem_zalloc(sizeof (struct mptsas_target),
-		    KM_SLEEP);
-		tmp_tgt->m_devhdl = *dev_handle;
-		tmp_tgt->m_deviceinfo = dev_info;
-		tmp_tgt->m_qfull_retries = QFULL_RETRIES;
-		tmp_tgt->m_qfull_retry_interval =
-		    drv_usectohz(QFULL_RETRY_INTERVAL * 1000);
-		tmp_tgt->m_t_throttle = MAX_THROTTLE;
+
 		devicename = mptsas_get_sata_guid(mpt, tmp_tgt, 0);
-		kmem_free(tmp_tgt, sizeof (struct mptsas_target));
-		mutex_enter(&mpt->m_mutex);
+
+		if (devicename == -1) {
+			mutex_enter(&mpt->m_mutex);
+			refhash_remove(mpt->m_tmp_targets, tmp_tgt);
+			rval = DEV_INFO_FAIL_GUID;
+			return (rval);
+		}
+
 		if (devicename != 0 && (((devicename >> 56) & 0xf0) == 0x50)) {
 			sas_wwn = devicename;
 		} else if (dev_info & MPI2_SAS_DEVICE_INFO_DIRECT_ATTACH) {
 			sas_wwn = 0;
 		}
+
+		mutex_enter(&mpt->m_mutex);
+		refhash_remove(mpt->m_tmp_targets, tmp_tgt);
 	}
 
 	phymask = mptsas_physport_to_phymask(mpt, physport);
-	*pptgt = mptsas_tgt_alloc(mpt, *dev_handle, sas_wwn,
+	*pptgt = mptsas_tgt_alloc(mpt->m_targets, *dev_handle, sas_wwn,
 	    dev_info, phymask, phynum);
 	if (*pptgt == NULL) {
 		mptsas_log(mpt, CE_WARN, "Failed to allocated target"
@@ -13609,6 +13661,7 @@ inq83_retry:
 	if (rval != DDI_SUCCESS) {
 		mptsas_log(mpt, CE_WARN, "!mptsas request inquiry page "
 		    "0x83 for target:%x, lun:%x failed!", target, lun);
+		sata_guid = -1;
 		goto out;
 	}
 	/* According to SAT2, the first descriptor is logic unit name */
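The reworked code keeps the temporary target visible in m_tmp_targets while m_mutex is dropped for the SATA GUID command, then removes it under the lock afterwards; a timed-out fetch now surfaces as DEV_INFO_FAIL_GUID (with sata_guid set to -1 as the sentinel) rather than wedging. A userland sketch of that insert/unlock/operate/lock/remove shape, with hypothetical tmp_insert(), tmp_remove(), and slow_command() helpers standing in for the driver's refhash and firmware calls:

#include <pthread.h>

extern void tmp_insert(void *);		/* assumed: hash insert */
extern void tmp_remove(void *);		/* assumed: hash remove */
extern int slow_command(void *);	/* assumed: may time out */

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static int
with_tmp_object(void *obj)
{
	int rv;

	(void) pthread_mutex_lock(&lock);
	tmp_insert(obj);	/* visible to watchdog while unlocked */
	(void) pthread_mutex_unlock(&lock);

	rv = slow_command(obj);	/* no lock held across slow I/O */

	(void) pthread_mutex_lock(&lock);
	tmp_remove(obj);
	(void) pthread_mutex_unlock(&lock);
	return (rv);
}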
@@ -14442,7 +14495,8 @@ mptsas_update_hashtab(struct mptsas *mpt)
 		rval = mptsas_get_target_device_info(mpt, page_address,
 		    &dev_handle, &ptgt);
 		if ((rval == DEV_INFO_FAIL_PAGE0) ||
-		    (rval == DEV_INFO_FAIL_ALLOC)) {
+		    (rval == DEV_INFO_FAIL_ALLOC) ||
+		    (rval == DEV_INFO_FAIL_GUID)) {
 			break;
 		}
@@ -16119,7 +16173,8 @@ mptsas_phy_to_tgt(mptsas_t *mpt, mptsas_phymask_t phymask, uint8_t phy)
 		rval = mptsas_get_target_device_info(mpt, page_address,
 		    &cur_handle, &ptgt);
 		if ((rval == DEV_INFO_FAIL_PAGE0) ||
-		    (rval == DEV_INFO_FAIL_ALLOC)) {
+		    (rval == DEV_INFO_FAIL_ALLOC) ||
+		    (rval == DEV_INFO_FAIL_GUID)) {
 			break;
 		}
 		if ((rval == DEV_INFO_WRONG_DEVICE_TYPE) ||
@@ -16188,7 +16243,8 @@ mptsas_wwid_to_ptgt(mptsas_t *mpt, mptsas_phymask_t phymask, uint64_t wwid)
 		rval = mptsas_get_target_device_info(mpt, page_address,
 		    &cur_handle, &tmp_tgt);
 		if ((rval == DEV_INFO_FAIL_PAGE0) ||
-		    (rval == DEV_INFO_FAIL_ALLOC)) {
+		    (rval == DEV_INFO_FAIL_ALLOC) ||
+		    (rval == DEV_INFO_FAIL_GUID)) {
 			tmp_tgt = NULL;
 			break;
 		}
@@ -16256,7 +16312,7 @@ mptsas_wwid_to_psmp(mptsas_t *mpt, mptsas_phymask_t phymask, uint64_t wwid)
 }
 
 mptsas_target_t *
-mptsas_tgt_alloc(mptsas_t *mpt, uint16_t devhdl, uint64_t wwid,
+mptsas_tgt_alloc(refhash_t *refhash, uint16_t devhdl, uint64_t wwid,
     uint32_t devinfo, mptsas_phymask_t phymask, uint8_t phynum)
 {
 	mptsas_target_t *tmp_tgt = NULL;
@@ -16264,7 +16320,7 @@ mptsas_tgt_alloc(mptsas_t *mpt, uint16_t devhdl, uint64_t wwid,
 
 	addr.mta_wwn = wwid;
 	addr.mta_phymask = phymask;
-	tmp_tgt = refhash_lookup(mpt->m_targets, &addr);
+	tmp_tgt = refhash_lookup(refhash, &addr);
 	if (tmp_tgt != NULL) {
 		NDBG20(("Hash item already exist"));
 		tmp_tgt->m_deviceinfo = devinfo;
@@ -16288,7 +16344,7 @@ mptsas_tgt_alloc(mptsas_t *mpt, uint16_t devhdl, uint64_t wwid,
 	tmp_tgt->m_t_throttle = MAX_THROTTLE;
 	TAILQ_INIT(&tmp_tgt->m_active_cmdq);
 
-	refhash_insert(mpt->m_targets, tmp_tgt);
+	refhash_insert(refhash, tmp_tgt);
 
 	return (tmp_tgt);
 }
diff --git a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_raid.c b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_raid.c
index 371db950e5..728730a176 100644
--- a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_raid.c
+++ b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_raid.c
@@ -23,6 +23,7 @@
  * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  * Copyright (c) 2014, Tegile Systems Inc. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
  */
 
 /*
@@ -92,7 +93,7 @@ static int mptsas_get_raid_wwid(mptsas_t *mpt, mptsas_raidvol_t *raidvol);
 extern int mptsas_check_dma_handle(ddi_dma_handle_t handle);
 extern int mptsas_check_acc_handle(ddi_acc_handle_t handle);
 
-extern mptsas_target_t *mptsas_tgt_alloc(mptsas_t *, uint16_t,
+extern mptsas_target_t *mptsas_tgt_alloc(refhash_t *, uint16_t,
     uint64_t, uint32_t, mptsas_phymask_t, uint8_t);
 
 static int
@@ -216,7 +217,7 @@ mptsas_raidconf_page_0_cb(mptsas_t *mpt, caddr_t page_memp,
 		/*
 		 * RAID uses phymask of 0.
 		 */
-		ptgt = mptsas_tgt_alloc(mpt,
+		ptgt = mptsas_tgt_alloc(mpt->m_targets,
 		    voldevhandle, raidwwn, 0, 0, 0);
 
 		raidconfig->m_raidvol[vol].m_raidtgt =
diff --git a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h
index 836548aa30..3983188fce 100644
--- a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h
+++ b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h
@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2014, Tegile Systems Inc. All rights reserved.
  */
@@ -100,6 +100,7 @@ typedef uint16_t mptsas_phymask_t;
  */
#define	MPTSAS_SMP_BUCKET_COUNT		23
 #define	MPTSAS_TARGET_BUCKET_COUNT	97
+#define	MPTSAS_TMP_TARGET_BUCKET_COUNT	13
 
 /*
  * MPT HW defines
@@ -557,6 +558,7 @@ _NOTE(DATA_READABLE_WITHOUT_LOCK(mptsas_topo_change_list_t::flags))
 #define	DEV_INFO_WRONG_DEVICE_TYPE	0x2
 #define	DEV_INFO_PHYS_DISK		0x3
 #define	DEV_INFO_FAIL_ALLOC		0x4
+#define	DEV_INFO_FAIL_GUID		0x5
 
 /*
  * mpt hotplug event defines
@@ -699,6 +701,7 @@ typedef struct mptsas {
 
 	refhash_t *m_targets;
 	refhash_t *m_smp_targets;
+	refhash_t *m_tmp_targets;
 
 	m_raidconfig_t m_raidconfig[MPTSAS_MAX_RAIDCONFIGS];
 	uint8_t m_num_raid_configs;
-- 
cgit v1.2.3